From c223df7d08037115bd9c15ce165ec258553985bd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 9 Nov 2022 22:44:19 +0000 Subject: [PATCH 001/344] added inference example stub --- CMakeLists.txt | 6 ++ config/config.inc | 7 ++ config/config.linux | 1 + .../cpp/inference/MLP_Unify/CMakeLists.txt | 12 +++ examples/cpp/inference/MLP_Unify/Makefile | 39 ++++++++ examples/cpp/inference/MLP_Unify/mlp.cc | 93 +++++++++++++++++++ 6 files changed, 158 insertions(+) create mode 100644 examples/cpp/inference/MLP_Unify/CMakeLists.txt create mode 100644 examples/cpp/inference/MLP_Unify/Makefile create mode 100644 examples/cpp/inference/MLP_Unify/mlp.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index a50b41a59b..4c86199992 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -334,6 +334,8 @@ option(FF_BUILD_MOE "build mixture of experts example" OFF) option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) option(FF_BUILD_SPLIT_TEST "build split test example" OFF) option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) +option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) +option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF) option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) @@ -376,6 +378,10 @@ if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/MLP_Unify) endif() +if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/inference/MLP_Unify) +endif() + if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) endif() diff --git a/config/config.inc b/config/config.inc index da043b2880..58632183b9 100644 --- a/config/config.inc +++ b/config/config.inc @@ -91,6 +91,13 @@ elif [ "$FF_BUILD_ALL_EXAMPLES" = "OFF" ]; then else SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" fi +if [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "ON" ]; then + SET_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" +elif [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "OFF" ]; then + SET_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF" +else + SET_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" +fi # enable C++ unit tests if [ "$FF_BUILD_UNIT_TESTS" = "ON" ]; then diff --git a/config/config.linux b/config/config.linux index 017243408b..28cf7c2fe1 100755 --- a/config/config.linux +++ b/config/config.linux @@ -39,6 +39,7 @@ FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} # build C++ examples FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} +FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-OFF} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} diff --git a/examples/cpp/inference/MLP_Unify/CMakeLists.txt b/examples/cpp/inference/MLP_Unify/CMakeLists.txt new file mode 100644 index 0000000000..e4299dcfb3 --- /dev/null +++ b/examples/cpp/inference/MLP_Unify/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExampleInference_MLPUnify) +set(project_target mlp_inference_unify) + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + mlp.cc) + +cuda_add_executable(${project_target} ${CPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) diff --git 
a/examples/cpp/inference/MLP_Unify/Makefile b/examples/cpp/inference/MLP_Unify/Makefile new file mode 100644 index 0000000000..9798c4f18a --- /dev/null +++ b/examples/cpp/inference/MLP_Unify/Makefile @@ -0,0 +1,39 @@ +# Copyright 2021 CMU, Facebook, LANL, MIT, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 1 # Include debugging symbols +MAX_DIM ?= 5 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 0 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 0 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) +USE_HIP ?= 1 # Include HIP support (requires HIP) +HIP_TARGET ?= ROCM +USE_GPU_REDUCTIONS ?= 0 + +# Put the binary file name here +OUTFILE ?= mlp_inference +# List all the application source files here +GEN_SRC = mlp.cc +GEN_GPU_SRC = +GEN_HIP_SRC = + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc new file mode 100644 index 0000000000..167281b4c9 --- /dev/null +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -0,0 +1,93 @@ +/* Copyright 2021 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/model.h" +#include +#include +#include +using namespace Legion; +using namespace FlexFlow; + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffConfig; + fprintf(stderr, + "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", + ffConfig.batchSize, + ffConfig.workersPerNode, + ffConfig.numNodes); + FFModel ff(ffConfig); + + std::vector hidden_dims = { + 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; + Tensor input1, input2; + { + int const dims[] = {ffConfig.batchSize, 1024}; + input1 = ff.create_tensor<2>(dims, DT_FLOAT); + input2 = ff.create_tensor<2>(dims, DT_FLOAT); + } + Tensor t1 = input1, t2 = input2; + for (size_t i = 0; i < hidden_dims.size(); i++) { + int const dims[] = {hidden_dims[i], t1->dims[0]}; + ActiMode acti_mode = + (i + 1 == hidden_dims.size()) ? 
AC_MODE_NONE : AC_MODE_RELU; + t1 = ff.dense(t1, hidden_dims[i], acti_mode, false); + t2 = ff.dense(t2, hidden_dims[i], acti_mode, false); + } + Tensor t = ff.add(t1, t2); + t = ff.softmax(t); + Optimizer *optimizer = new SGDOptimizer(&ff, 0.001f); + std::vector metrics; + metrics.push_back(METRICS_ACCURACY); + metrics.push_back(METRICS_SPARSE_CATEGORICAL_CROSSENTROPY); + ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics); + ff.init_operators(); + // Start timer + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_start = Realm::Clock::current_time_in_microseconds(); + for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { + ff.reset_metrics(); + int iterations = 128; + for (int iter = 0; iter < iterations; iter++) { + runtime->begin_trace(ctx, 111 /*trace_id*/); + ff.forward(); + ff.zero_gradients(); + // ff.backward(); + // ff.update(); + runtime->end_trace(ctx, 111 /*trace_id*/); + } + } + // End timer + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_end = Realm::Clock::current_time_in_microseconds(); + double run_time = 1e-6 * (ts_end - ts_start); + printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n", + run_time, + ffConfig.batchSize * 128 * ffConfig.epochs / run_time); +} + +void FlexFlow::register_custom_tasks() {} From 0b2f1137eb6b864acbdc7d985e90dfcc42485132 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 9 Nov 2022 17:51:26 -0500 Subject: [PATCH 002/344] turning on inference mode --- examples/cpp/inference/MLP_Unify/mlp.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 167281b4c9..5ea113de33 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -54,7 +54,7 @@ void FlexFlow::top_level_task(Task const *task, std::vector metrics; metrics.push_back(METRICS_ACCURACY); metrics.push_back(METRICS_SPARSE_CATEGORICAL_CROSSENTROPY); - ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics); + ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics, CompMode.COMP_MODE_INFERENCE); ff.init_operators(); // Start timer { @@ -64,18 +64,18 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } double ts_start = Realm::Clock::current_time_in_microseconds(); - for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { - ff.reset_metrics(); - int iterations = 128; - for (int iter = 0; iter < iterations; iter++) { - runtime->begin_trace(ctx, 111 /*trace_id*/); - ff.forward(); - ff.zero_gradients(); - // ff.backward(); - // ff.update(); - runtime->end_trace(ctx, 111 /*trace_id*/); - } + //for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { + ff.reset_metrics(); + int iterations = 128; + for (int iter = 0; iter < iterations; iter++) { + runtime->begin_trace(ctx, 111 /*trace_id*/); + ff.forward(); + ff.zero_gradients(); + // ff.backward(); + // ff.update(); + runtime->end_trace(ctx, 111 /*trace_id*/); } + //} // End timer { runtime->issue_execution_fence(ctx); From 1ed8644c9fbd587fba772962993a67f2f38bb7e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 9 Nov 2022 23:16:09 +0000 Subject: [PATCH 003/344] fix --- 
examples/cpp/inference/MLP_Unify/mlp.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 5ea113de33..434b4f18e5 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -54,7 +54,7 @@ void FlexFlow::top_level_task(Task const *task, std::vector metrics; metrics.push_back(METRICS_ACCURACY); metrics.push_back(METRICS_SPARSE_CATEGORICAL_CROSSENTROPY); - ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics, CompMode.COMP_MODE_INFERENCE); + ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics, CompMode::COMP_MODE_INFERENCE); ff.init_operators(); // Start timer { From a24fd7b83fb7b8cbb6adbf7de16cb2b49610f4ab Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Nov 2022 06:48:36 +0000 Subject: [PATCH 004/344] fix inference case --- examples/cpp/inference/MLP_Unify/mlp.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 434b4f18e5..26623b4a76 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -70,12 +70,8 @@ void FlexFlow::top_level_task(Task const *task, for (int iter = 0; iter < iterations; iter++) { runtime->begin_trace(ctx, 111 /*trace_id*/); ff.forward(); - ff.zero_gradients(); - // ff.backward(); - // ff.update(); runtime->end_trace(ctx, 111 /*trace_id*/); } - //} // End timer { runtime->issue_execution_fence(ctx); From 6f6a281879c4411deddc7f0a1535f6b1df85f98d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 11 Nov 2022 07:19:44 +0000 Subject: [PATCH 005/344] fix --- config/config.inc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/config.inc b/config/config.inc index 58632183b9..2308b8b66b 100644 --- a/config/config.inc +++ b/config/config.inc @@ -92,11 +92,11 @@ else SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" fi if [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "ON" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" + SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" elif [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "OFF" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF" + SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF" else - SET_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" + SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" fi # enable C++ unit tests @@ -189,7 +189,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_GASNET} ${SET_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_GASNET} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} From 9202297d846db45908b090a2bee0e37370780512 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Nov 2022 18:53:37 +0000 Subject: 
[PATCH 006/344] added inference example for transformer --- CMakeLists.txt | 3 + .../cpp/inference/Transformer/CMakeLists.txt | 19 + examples/cpp/inference/Transformer/Makefile | 35 ++ .../cpp/inference/Transformer/transformer.cc | 397 ++++++++++++++++++ .../cpp/inference/Transformer/transformer.cu | 58 +++ .../cpp/inference/Transformer/transformer.h | 54 +++ 6 files changed, 566 insertions(+) create mode 100644 examples/cpp/inference/Transformer/CMakeLists.txt create mode 100644 examples/cpp/inference/Transformer/Makefile create mode 100644 examples/cpp/inference/Transformer/transformer.cc create mode 100644 examples/cpp/inference/Transformer/transformer.cu create mode 100644 examples/cpp/inference/Transformer/transformer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c86199992..5f886fcec0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -419,6 +419,9 @@ endif() if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) + +if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/inference/Transformer) endif() # installation diff --git a/examples/cpp/inference/Transformer/CMakeLists.txt b/examples/cpp/inference/Transformer/CMakeLists.txt new file mode 100644 index 0000000000..ac46d77f32 --- /dev/null +++ b/examples/cpp/inference/Transformer/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExample_Transformer) +set(project_target transformer) + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + transformer.cc + transformer.h) + +set(GPU_SRC +transformer.cu) + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/Transformer/Makefile b/examples/cpp/inference/Transformer/Makefile new file mode 100644 index 0000000000..911f234c45 --- /dev/null +++ b/examples/cpp/inference/Transformer/Makefile @@ -0,0 +1,35 @@ +# Copyright 2021 Facebook, Stanford, LANL +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 1 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= transformer +# List all the application source files here +GEN_SRC = transformer.cc +GEN_GPU_SRC = transformer.cu + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/Transformer/transformer.cc b/examples/cpp/inference/Transformer/transformer.cc new file mode 100644 index 0000000000..38675577cc --- /dev/null +++ b/examples/cpp/inference/Transformer/transformer.cc @@ -0,0 +1,397 @@ +/* Copyright 2021 Facebook + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "transformer.h" + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("Transformer"); + +Tensor create_emb(FFModel *model, + Tensor const &input, + int input_dim, + int output_dim, + int idx) { + float range = sqrt(1.0f / input_dim); + Initializer *embed_init = new UniformInitializer(std::rand(), -range, range); + return model->embedding( + input, input_dim, output_dim, AGGR_MODE_SUM, NULL, embed_init); +} + +Tensor create_attention_encoder(FFModel *model, + Tensor const &input, + int hidden_dim, + int num_heads, + int kdim, + int vdim) { + Tensor t = model->multihead_attention( + input, input, input, hidden_dim, num_heads, kdim, vdim); + return model->dense(model->dense(t, hidden_dim, AC_MODE_RELU, false /*bias*/), + hidden_dim, + AC_MODE_NONE, + false /*bias*/); +} + +void create_attention_encoder_decoder(FFModel *model, + Tensor const &input1, + Tensor const &input2, + Tensor &output1, + Tensor &output2, + int hidden_dim, + int num_heads, + int kdim, + int vdim) { + Tensor t1 = + model->add(model->multihead_attention( + input1, input1, input1, hidden_dim, num_heads, kdim, vdim), + input1); + t1 = model->dense(model->dense(t1, hidden_dim, AC_MODE_RELU, false /*bias*/), + hidden_dim, + AC_MODE_NONE, + false /*bias*/); + Tensor t2 = + model->add(model->multihead_attention( + input2, input2, input2, hidden_dim, num_heads, kdim, vdim), + input2); + t2 = model->add( + model->multihead_attention(t2, t1, t1, hidden_dim, num_heads, kdim, vdim), + t2); + t2 = model->dense(model->dense(t2, hidden_dim, AC_MODE_RELU, false /*bias*/), + hidden_dim, + AC_MODE_NONE, + false /*bias*/); + output1 = t1; + output2 = t2; +} + +TransformerConfig::TransformerConfig(void) { + hidden_size = 1024; + embedding_size = 1024; + num_heads = 16; + num_layers = 12; + sequence_length = 512; +} + +void parse_input_args(char **argv, int argc, TransformerConfig &config) { + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--num-layers")) { + 
config.num_layers = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--embedding-size")) { + config.embedding_size = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--hidden-size")) { + config.hidden_size = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--num-heads")) { + config.num_heads = atoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--sequence-length")) { + config.sequence_length = atoi(argv[++i]); + continue; + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffConfig; + TransformerConfig tfConfig; + { + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, tfConfig); + log_app.print("batchSize(%d) workersPerNodes(%d) numNodes(%d)", + ffConfig.batchSize, + ffConfig.workersPerNode, + ffConfig.numNodes); + log_app.print("Hidden Size(%d)", tfConfig.hidden_size); + log_app.print("Embedding Vocab Size(%d)", tfConfig.embedding_size); + log_app.print("Number of Heads(%d)", tfConfig.num_heads); + log_app.print("Number of Layers(%d)", tfConfig.num_layers); + log_app.print("Sequence Length(%d)", tfConfig.sequence_length); + } + FFModel ff(ffConfig); + Tensor input; + { + int const dims[] = { + ffConfig.batchSize, tfConfig.sequence_length, tfConfig.hidden_size}; + input = ff.create_tensor<3>(dims, DT_FLOAT); + } + // Tensor t = create_emb(&ff, input, tfConfig.embedding_size, + // tfConfig.hidden_size); Tensor input1 = input, input2 = input; Tensor t1, + // t2; + Tensor t = input; + for (int i = 0; i < tfConfig.num_layers; i++) { + t = create_attention_encoder(&ff, + t, + tfConfig.hidden_size, + tfConfig.num_heads, + tfConfig.hidden_size / tfConfig.num_heads, + tfConfig.hidden_size / tfConfig.num_heads); + // create_attention_encoder_decoder(&ff, input1, input2, t1, t2, + // tfConfig.hidden_size, tfConfig.num_heads, + // tfConfig.hidden_size / tfConfig.num_heads, + // tfConfig.hidden_size / tfConfig.num_heads); + // input1 = t1; + // input2 = t2; + } + t = ff.dense(t, 1, AC_MODE_NONE, false /*bias*/); + Optimizer *optimizer = new SGDOptimizer(&ff, 0.01f); + std::vector metrics; + // metrics.push_back(METRICS_ACCURACY); + // metrics.push_back(METRICS_MEAN_SQUARED_ERROR); + ff.compile(optimizer, LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics, CompMode::COMP_MODE_INFERENCE); + // Data Loader + DataLoader loader(ff, tfConfig, input, ff.label_tensor); + loader.next_batch(ff); + loader.reset(); + ff.init_operators(); + + // Start timer + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + log_app.print("Warmup finished...Start timer..."); + log_app.print("Num. epochs = %d", ffConfig.epochs); + log_app.print("Num. 
iterations/epoch = %d", + loader.num_samples / ffConfig.batchSize); + printf("parameters.size() = %lu\n", ff.parameters.size()); + double ts_start = Realm::Clock::current_time_in_microseconds(); + int epoch=0; + //for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { + loader.reset(); + ff.reset_metrics(); + int iterations = loader.num_samples / ffConfig.batchSize; + for (int iter = 0; iter < iterations; iter++) { + // Only load data once for random input + if (iter == 0 && epoch == 0) + loader.next_batch(ff); + runtime->begin_trace(ctx, 111 /*trace_id*/); + ff.forward(); + //ff.zero_gradients(); + //ff.backward(); + //ff.update(); + runtime->end_trace(ctx, 111 /*trace_id*/); + } + //} + // End timer + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_end = Realm::Clock::current_time_in_microseconds(); + double run_time = 1e-6 * (ts_end - ts_start); + printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n", + run_time, + loader.num_samples * ffConfig.epochs / run_time); +} + +DataLoader::DataLoader(FFModel &ff, + TransformerConfig const &tf, + Tensor const &_input, + Tensor const &_label) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + num_samples = 0; + log_app.print("Use random dataset..."); + num_samples = + ff.config.batchSize * ff.config.workersPerNode * ff.config.numNodes; + log_app.print("Number of random samples = %d\n", num_samples); + return; + { + batch_input = _input; + int const dims[] = {num_samples, tf.sequence_length, tf.hidden_size}; + full_input = ff.create_tensor<3>(dims, DT_FLOAT); + } + { + batch_label = _label; + int const dims[] = {num_samples, tf.sequence_length, 1}; + full_label = ff.create_tensor<3>(dims, DT_FLOAT); + } + // Load entire dataset + // TODO: Use index launcher instead of task launcher + TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, TaskArgument(NULL, 0)); + // regions[0]: full_sparse_input + launcher.add_region_requirement( + RegionRequirement(full_input->parallel_tensor->region, + WRITE_ONLY, + EXCLUSIVE, + full_input->parallel_tensor->region, + MAP_TO_FB_MEMORY)); + launcher.add_field(0, FID_DATA); + // regions[1]: full_label + launcher.add_region_requirement( + RegionRequirement(full_label->parallel_tensor->region, + WRITE_ONLY, + EXCLUSIVE, + full_label->parallel_tensor->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(1, FID_DATA); + runtime->execute_task(ctx, launcher); +} + +void DataLoader::load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // Note that these instances are in ZCM, can only use + // TensorAccessorW with readOutput flag + AccessorWO const acc_input(regions[0], FID_DATA); + AccessorWO const acc_label(regions[1], FID_DATA); + Rect<3> rect_input = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Rect<3> rect_label = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + assert(acc_input.accessor.is_dense_arbitrary(rect_input)); + assert(acc_label.accessor.is_dense_arbitrary(rect_label)); + float *input_ptr = acc_input.ptr(rect_input.lo); + float *label_ptr = acc_label.ptr(rect_label.lo); + // assert(rect_input == rect_label); + + for (size_t i = 0; i < rect_input.volume(); i++) + input_ptr[i] = ((float)std::rand()) / RAND_MAX; + for (size_t i = 0; i < 
rect_label.volume(); i++) + label_ptr[i] = std::rand() % 2; +} + +void DataLoader::next_batch(FFModel &ff) { + return; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + // Load Input + { + Domain domain = runtime->get_index_space_domain( + ctx, batch_input->parallel_tensor->parallel_is); + ArgumentMap argmap; + int idx = next_index; + for (Domain::DomainPointIterator it(domain); it; it++) { + SampleIdxs meta; + assert(ff.config.batchSize % batch_input->parallel_tensor->dims[2].size == + 0); + meta.num_samples = + ff.config.batchSize / batch_input->parallel_tensor->dims[2].size; + for (int i = 0; i < meta.num_samples; i++) + meta.idxs[i] = idx++; + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, + batch_input->parallel_tensor->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_input->parallel_tensor->machine_view.hash()); + // Full dataset in ZCM + launcher.add_region_requirement( + RegionRequirement(full_input->parallel_tensor->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_input->parallel_tensor->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_input->parallel_tensor->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input->parallel_tensor->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + // Load Labels + { + Domain domain = runtime->get_index_space_domain( + ctx, batch_label->parallel_tensor->parallel_is); + ArgumentMap argmap; + int idx = next_index; + for (Domain::DomainPointIterator it(domain); it; it++) { + SampleIdxs meta; + assert(ff.config.batchSize % batch_label->parallel_tensor->dims[2].size == + 0); + meta.num_samples = + ff.config.batchSize / batch_label->parallel_tensor->dims[2].size; + for (int i = 0; i < meta.num_samples; i++) + meta.idxs[i] = idx++; + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, + batch_label->parallel_tensor->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_label->parallel_tensor->machine_view.hash()); + // Full dataset in ZCM + launcher.add_region_requirement( + RegionRequirement(full_label->parallel_tensor->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_label->parallel_tensor->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_label->parallel_tensor->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_label->parallel_tensor->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + // progress next_index + next_index += ff.config.batchSize; +} + +void DataLoader::reset() { + next_index = 0; +} + +void FlexFlow::register_custom_tasks() { + // Load entire dataset + { + TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Entire Dataset Task"); + } + // Load input + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_2, "Load Inputs"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Inputs Task"); 
+ } +} diff --git a/examples/cpp/inference/Transformer/transformer.cu b/examples/cpp/inference/Transformer/transformer.cu new file mode 100644 index 0000000000..7da473e54c --- /dev/null +++ b/examples/cpp/inference/Transformer/transformer.cu @@ -0,0 +1,58 @@ +/* Copyright 2021 Stanford, Facebook + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/utils/cuda_helper.h" +#include "transformer.h" + +void DataLoader::load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + SampleIdxs *meta = (SampleIdxs *)task->local_args; + TensorAccessorR acc_full_input( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + TensorAccessorW acc_batch_input(regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); + int batch_size = acc_batch_input.rect.hi[2] - acc_batch_input.rect.lo[2] + 1; + int embed_size = acc_batch_input.rect.hi[0] - acc_batch_input.rect.lo[0] + 1; + int seq_length = acc_batch_input.rect.hi[1] - acc_batch_input.rect.lo[1] + 1; + assert(acc_batch_input.rect.hi[0] == acc_full_input.rect.hi[0]); + assert(acc_batch_input.rect.lo[0] == acc_full_input.rect.lo[0]); + assert(acc_batch_input.rect.hi[1] == acc_full_input.rect.hi[1]); + assert(acc_batch_input.rect.lo[1] == acc_full_input.rect.lo[1]); + + float *input_zc; + checkCUDA(cudaHostAlloc(&input_zc, + sizeof(float) * acc_batch_input.rect.volume(), + cudaHostAllocPortable | cudaHostAllocMapped)); + assert(batch_size == meta->num_samples); + for (int i = 0; i < batch_size; i++) { + int base_offset = meta->idxs[i] * embed_size * seq_length; + for (int j = 0; j < embed_size * seq_length; j++) + input_zc[i * embed_size * seq_length + j] = + acc_full_input.ptr[base_offset + j]; + } + checkCUDA(cudaMemcpy(acc_batch_input.ptr, + input_zc, + sizeof(float) * acc_batch_input.rect.volume(), + cudaMemcpyHostToDevice)); + checkCUDA(cudaFreeHost(input_zc)); +} diff --git a/examples/cpp/inference/Transformer/transformer.h b/examples/cpp/inference/Transformer/transformer.h new file mode 100644 index 0000000000..551a9eff19 --- /dev/null +++ b/examples/cpp/inference/Transformer/transformer.h @@ -0,0 +1,54 @@ +/* Copyright 2021 Facebook, Stanford + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/model.h" +#define MAX_NUM_SAMPLES 65536 + +using namespace Legion; +using namespace FlexFlow; + +struct TransformerConfig { + TransformerConfig(void); + int hidden_size, embedding_size, num_heads, num_layers, sequence_length; +}; + +class DataLoader { +public: + DataLoader(FFModel &ff, + TransformerConfig const &tf, + Tensor const &_input, + Tensor const &_label); + void next_batch(FFModel &ff); + void reset(); + static void load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + static void load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + +public: + int num_samples, next_index; + +private: + Tensor full_input, batch_input, full_label, batch_label; +}; + +struct SampleIdxs { + int num_samples; + int idxs[MAX_NUM_SAMPLES]; +}; From f8644691e1c3e1c8fc4d9bc79efa1097ee366706 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 13 Nov 2022 11:13:47 -0500 Subject: [PATCH 007/344] added g-shard stub --- CMakeLists.txt | 9 +- .../cpp/inference/G-Shard-MoE/CMakeLists.txt | 12 ++ examples/cpp/inference/G-Shard-MoE/Makefile | 39 ++++++ examples/cpp/inference/G-Shard-MoE/g_shard.cc | 121 ++++++++++++++++++ 4 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 examples/cpp/inference/G-Shard-MoE/CMakeLists.txt create mode 100644 examples/cpp/inference/G-Shard-MoE/Makefile create mode 100644 examples/cpp/inference/G-Shard-MoE/g_shard.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f886fcec0..07d77fb129 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -335,6 +335,8 @@ option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) option(FF_BUILD_SPLIT_TEST "build split test example" OFF) option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) +option(FF_BUILD_TRANSFORMER_INFERENCE "build transformer inference example" OFF) +option(FF_BUILD_G_SHARD_INFERENCE "build G-Shard inference example" OFF) option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) option(FF_BUILD_ALL_EXAMPLES "build all examples. 
Overrides others" OFF) option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) @@ -419,11 +421,16 @@ endif() if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) +endif() -if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) +if(FF_BUILD_TRANSFORMER_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/Transformer) endif() +if(FF_BUILD_G_SHARD_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/inference/G-Shard-MoE) +endif() + # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/examples/cpp/inference/G-Shard-MoE/CMakeLists.txt b/examples/cpp/inference/G-Shard-MoE/CMakeLists.txt new file mode 100644 index 0000000000..daab2dc49b --- /dev/null +++ b/examples/cpp/inference/G-Shard-MoE/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExampleInferenceGShard) +set(project_target g_shard) + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + g_shard.cc) + +cuda_add_executable(${project_target} ${CPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) diff --git a/examples/cpp/inference/G-Shard-MoE/Makefile b/examples/cpp/inference/G-Shard-MoE/Makefile new file mode 100644 index 0000000000..f64e670e05 --- /dev/null +++ b/examples/cpp/inference/G-Shard-MoE/Makefile @@ -0,0 +1,39 @@ +# Copyright 2021 CMU, Facebook, LANL, MIT, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 1 # Include debugging symbols +MAX_DIM ?= 5 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 0 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 0 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) +USE_HIP ?= 1 # Include HIP support (requires HIP) +HIP_TARGET ?= ROCM +USE_GPU_REDUCTIONS ?= 0 + +# Put the binary file name here +OUTFILE ?= g_shard +# List all the application source files here +GEN_SRC = g_shard.cc +GEN_GPU_SRC = +GEN_HIP_SRC = + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/G-Shard-MoE/g_shard.cc b/examples/cpp/inference/G-Shard-MoE/g_shard.cc new file mode 100644 index 0000000000..253fe77c93 --- /dev/null +++ b/examples/cpp/inference/G-Shard-MoE/g_shard.cc @@ -0,0 +1,121 @@ +/* Copyright 2021 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/model.h" +#include +#include +#include +using namespace Legion; +using namespace FlexFlow; + +void create_attention_decoder(FFModel *model, + Tensor const &input1, + Tensor const &input2, + Tensor &output1, + Tensor &output2, + int hidden_dim, + int num_heads, + int kdim, + int vdim) { + Tensor t1 = + model->add(model->multihead_attention( + input1, input1, input1, hidden_dim, num_heads, kdim, vdim), + input1); + t1 = model->dense(model->dense(t1, hidden_dim, AC_MODE_RELU, false /*bias*/), + hidden_dim, + AC_MODE_NONE, + false /*bias*/); + Tensor t2 = + model->add(model->multihead_attention( + input2, input2, input2, hidden_dim, num_heads, kdim, vdim), + input2); + t2 = model->add( + model->multihead_attention(t2, t1, t1, hidden_dim, num_heads, kdim, vdim), + t2); + t2 = model->dense(model->dense(t2, hidden_dim, AC_MODE_RELU, false /*bias*/), + hidden_dim, + AC_MODE_NONE, + false /*bias*/); + output1 = t1; + output2 = t2; +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffConfig; + fprintf(stderr, + "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", + ffConfig.batchSize, + ffConfig.workersPerNode, + ffConfig.numNodes); + FFModel ff(ffConfig); + + std::vector hidden_dims = { + 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; + Tensor input1, input2; + { + int const dims[] = {ffConfig.batchSize, 1024}; + input1 = ff.create_tensor<2>(dims, DT_FLOAT); + input2 = ff.create_tensor<2>(dims, DT_FLOAT); + } + Tensor t1 = input1, t2 = input2; + for (size_t i = 0; i < hidden_dims.size(); i++) { + int const dims[] = {hidden_dims[i], t1->dims[0]}; + ActiMode acti_mode = + (i + 1 == hidden_dims.size()) ? 
AC_MODE_NONE : AC_MODE_RELU; + t1 = ff.dense(t1, hidden_dims[i], acti_mode, false); + t2 = ff.dense(t2, hidden_dims[i], acti_mode, false); + } + Tensor t = ff.add(t1, t2); + t = ff.softmax(t); + Optimizer *optimizer = new SGDOptimizer(&ff, 0.001f); + std::vector metrics; + metrics.push_back(METRICS_ACCURACY); + metrics.push_back(METRICS_SPARSE_CATEGORICAL_CROSSENTROPY); + ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics, CompMode::COMP_MODE_INFERENCE); + ff.init_operators(); + // Start timer + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_start = Realm::Clock::current_time_in_microseconds(); + //for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { + ff.reset_metrics(); + int iterations = 128; + for (int iter = 0; iter < iterations; iter++) { + runtime->begin_trace(ctx, 111 /*trace_id*/); + ff.forward(); + runtime->end_trace(ctx, 111 /*trace_id*/); + } + // End timer + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_end = Realm::Clock::current_time_in_microseconds(); + double run_time = 1e-6 * (ts_end - ts_start); + printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n", + run_time, + ffConfig.batchSize * 128 * ffConfig.epochs / run_time); +} + +void FlexFlow::register_custom_tasks() {} From 7eb3a106ad554f90ee1cd900d8c116f45a781a12 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 16 Nov 2022 03:01:29 -0500 Subject: [PATCH 008/344] setting batch_size=1 --- examples/cpp/inference/MLP_Unify/mlp.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 26623b4a76..9479cb3bcf 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -25,6 +25,7 @@ void FlexFlow::top_level_task(Task const *task, Context ctx, Runtime *runtime) { FFConfig ffConfig; + ffConfig.batchSize=1; fprintf(stderr, "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", ffConfig.batchSize, @@ -36,7 +37,7 @@ void FlexFlow::top_level_task(Task const *task, 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; Tensor input1, input2; { - int const dims[] = {ffConfig.batchSize, 1024}; + int const dims[] = {64, 1024}; input1 = ff.create_tensor<2>(dims, DT_FLOAT); input2 = ff.create_tensor<2>(dims, DT_FLOAT); } From aa0385030423797724092decc30d5539f7ec6bce Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 16 Nov 2022 15:51:50 +0000 Subject: [PATCH 009/344] added stub of moe part --- examples/cpp/inference/G-Shard-MoE/g_shard.cc | 76 ++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/examples/cpp/inference/G-Shard-MoE/g_shard.cc b/examples/cpp/inference/G-Shard-MoE/g_shard.cc index 253fe77c93..d8310998f9 100644 --- a/examples/cpp/inference/G-Shard-MoE/g_shard.cc +++ b/examples/cpp/inference/G-Shard-MoE/g_shard.cc @@ -20,15 +20,87 @@ using namespace Legion; using namespace FlexFlow; +// embed_dim=768, +// num_heads=12, +// kdim=None, +// vdim=None, +// dropout=0.1, +// bias=True, +// add_bias_kv=False, +// add_zero_attn=False, +// self_attention=True, +// encoder_decoder_attention=False, +// q_noise=0.0, +// qn_block_size=8, + +// Tensor FFModel::multihead_attention(const Tensor query, +// const Tensor key, +// const Tensor value, 
+// int embed_dim, +// int num_heads, +// int kdim, +// int vdim, +// float dropout, +// bool bias, +// bool add_bias_kv, +// bool add_zero_attn, +// Initializer *kernel_initializer, +// char const *name) { + + void create_attention_decoder(FFModel *model, Tensor const &input1, Tensor const &input2, Tensor &output1, Tensor &output2, - int hidden_dim, + int embed_dim, int num_heads, int kdim, - int vdim) { + int vdim, + float dropout=0.1, + bool normalize_before, + bool is_moe) { + + std::vector axes = {embed_dim}; + Tensor x = normalize_before ? model->LayerNorm(input1 /*const Tensor input*/, &axes /*std::vector const &axes*/, true /*elementwise_affine*/, 1e-05 /*eps*/) : input1; + x = model->add(model->dropout(model->multihead_attention(x, x, x, embed_dim, num_heads, embed_dim, embed_dim, dropout, true /*bias*/, false /*add_bias_kv*/, false /*add_zero_attn*/), dropout), x); + //x = normalize_before ? x : model->LayerNorm(x, &axes, true, 1e-05); + x = model->LayerNorm(x, &axes, true, 1e-05); + + if(!is_moe) { + x = model->dropout(model->dense(model->dropout(model->dense(x, 3072, AC_MODE_GELU, true /*bias*/), dropout), embed_dim, AC_MODE_NONE, true /*bias*/), dropout); + } else { + // x - seq_len, batch_size, model_dim + // x = x.transpose(0, 1) # batch_size, seq_len, model_dim + // x, l_aux = self.moe_layer(x) + // x = x.transpose(0, 1) # seq_len, batch_size, model_dim + //x = self.residual_connection(x, residual) + + //if not self.normalize_before: + // x = self.final_layer_norm(x) + x = normalize_before ? x : model->LayerNorm(x, &axes, true, 1e-05); + float alpha = 2.0f; // factor overhead tensor size for imbalance + float lambda = 0.04f; // multiplier for load balance term + + // MoE model + Tensor gate_preds = ff.dense(x, num_exp, AC_MODE_RELU); + Tensor topK_output[2]; + ff.top_k(gate_preds, topK_output, num_select, false); + + Tensor exp_tensors[num_exp]; + ff.group_by(input, topK_output[1], exp_tensors, num_exp, alpha); + + Tensor agg_inputs[num_exp + 4]; + agg_inputs[0] = ff.softmax(topK_output[0]); // gate preds + agg_inputs[1] = topK_output[1]; // gate assign + agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) + agg_inputs[3] = gate_preds; // full gate preds + for (int i = 0; i < num_exp; i++) { + Tensor exp_pred = ff.dense(exp_tensors[i], OUT_DIM, AC_MODE_RELU); + agg_inputs[i + 4] = ff.softmax(exp_pred); + } + } + Tensor t1 = model->add(model->multihead_attention( input1, input1, input1, hidden_dim, num_heads, kdim, vdim), From b1e1ed43c280d9981bba197ddacef9ad79c78295 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 16 Nov 2022 16:01:57 +0000 Subject: [PATCH 010/344] added moe placeholder --- examples/cpp/inference/G-Shard-MoE/g_shard.cc | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/examples/cpp/inference/G-Shard-MoE/g_shard.cc b/examples/cpp/inference/G-Shard-MoE/g_shard.cc index d8310998f9..95c0b3f1dd 100644 --- a/examples/cpp/inference/G-Shard-MoE/g_shard.cc +++ b/examples/cpp/inference/G-Shard-MoE/g_shard.cc @@ -58,14 +58,14 @@ void create_attention_decoder(FFModel *model, int kdim, int vdim, float dropout=0.1, - bool normalize_before, - bool is_moe) { + bool normalize_before=false, + bool is_moe=false) { std::vector axes = {embed_dim}; - Tensor x = normalize_before ? model->LayerNorm(input1 /*const Tensor input*/, &axes /*std::vector const &axes*/, true /*elementwise_affine*/, 1e-05 /*eps*/) : input1; + Tensor x = normalize_before ? 
model->layer_norm(input1 /*const Tensor input*/, axes /*std::vector const &axes*/, true /*elementwise_affine*/, 1e-05 /*eps*/) : input1; x = model->add(model->dropout(model->multihead_attention(x, x, x, embed_dim, num_heads, embed_dim, embed_dim, dropout, true /*bias*/, false /*add_bias_kv*/, false /*add_zero_attn*/), dropout), x); - //x = normalize_before ? x : model->LayerNorm(x, &axes, true, 1e-05); - x = model->LayerNorm(x, &axes, true, 1e-05); + //x = normalize_before ? x : model->layer_norm(x, axes, true, 1e-05); + x = model->layer_norm(x, axes, true, 1e-05); if(!is_moe) { x = model->dropout(model->dense(model->dropout(model->dense(x, 3072, AC_MODE_GELU, true /*bias*/), dropout), embed_dim, AC_MODE_NONE, true /*bias*/), dropout); @@ -78,50 +78,53 @@ void create_attention_decoder(FFModel *model, //if not self.normalize_before: // x = self.final_layer_norm(x) - x = normalize_before ? x : model->LayerNorm(x, &axes, true, 1e-05); + x = normalize_before ? x : model->layer_norm(x, axes, true, 1e-05); float alpha = 2.0f; // factor overhead tensor size for imbalance float lambda = 0.04f; // multiplier for load balance term + int num_exp = 128; + int num_select = 2; // MoE model - Tensor gate_preds = ff.dense(x, num_exp, AC_MODE_RELU); + Tensor input = x; + Tensor gate_preds = model->dense(x, num_exp, AC_MODE_RELU); Tensor topK_output[2]; - ff.top_k(gate_preds, topK_output, num_select, false); + model->top_k(gate_preds, topK_output, num_select, false); Tensor exp_tensors[num_exp]; - ff.group_by(input, topK_output[1], exp_tensors, num_exp, alpha); + model->group_by(input, topK_output[1], exp_tensors, num_exp, alpha); Tensor agg_inputs[num_exp + 4]; - agg_inputs[0] = ff.softmax(topK_output[0]); // gate preds + agg_inputs[0] = model->softmax(topK_output[0]); // gate preds agg_inputs[1] = topK_output[1]; // gate assign agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) agg_inputs[3] = gate_preds; // full gate preds for (int i = 0; i < num_exp; i++) { - Tensor exp_pred = ff.dense(exp_tensors[i], OUT_DIM, AC_MODE_RELU); - agg_inputs[i + 4] = ff.softmax(exp_pred); + Tensor exp_pred = model->dense(exp_tensors[i], embed_dim, AC_MODE_RELU); + agg_inputs[i + 4] = model->softmax(exp_pred); } } - Tensor t1 = - model->add(model->multihead_attention( - input1, input1, input1, hidden_dim, num_heads, kdim, vdim), - input1); - t1 = model->dense(model->dense(t1, hidden_dim, AC_MODE_RELU, false /*bias*/), - hidden_dim, - AC_MODE_NONE, - false /*bias*/); - Tensor t2 = - model->add(model->multihead_attention( - input2, input2, input2, hidden_dim, num_heads, kdim, vdim), - input2); - t2 = model->add( - model->multihead_attention(t2, t1, t1, hidden_dim, num_heads, kdim, vdim), - t2); - t2 = model->dense(model->dense(t2, hidden_dim, AC_MODE_RELU, false /*bias*/), - hidden_dim, - AC_MODE_NONE, - false /*bias*/); - output1 = t1; - output2 = t2; + // Tensor t1 = + // model->add(model->multihead_attention( + // input1, input1, input1, hidden_dim, num_heads, kdim, vdim), + // input1); + // t1 = model->dense(model->dense(t1, hidden_dim, AC_MODE_RELU, false /*bias*/), + // hidden_dim, + // AC_MODE_NONE, + // false /*bias*/); + // Tensor t2 = + // model->add(model->multihead_attention( + // input2, input2, input2, hidden_dim, num_heads, kdim, vdim), + // input2); + // t2 = model->add( + // model->multihead_attention(t2, t1, t1, hidden_dim, num_heads, kdim, vdim), + // t2); + // t2 = model->dense(model->dense(t2, hidden_dim, AC_MODE_RELU, false /*bias*/), + // hidden_dim, + // AC_MODE_NONE, + // false /*bias*/); 
+  // output1 = t1;
+  // output2 = t2;
 }

From b895fd0373528ac74c68e319b6664219aac1fb00 Mon Sep 17 00:00:00 2001
From: Rae Wong <33883582+yingyee0111@users.noreply.github.com>
Date: Wed, 16 Nov 2022 11:14:03 -0500
Subject: [PATCH 011/344] Data generator script, to refactor interface later

---
 examples/cpp/inference/data_generator.cpp | 140 ++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 examples/cpp/inference/data_generator.cpp

diff --git a/examples/cpp/inference/data_generator.cpp b/examples/cpp/inference/data_generator.cpp
new file mode 100644
index 0000000000..b6b2fc2ede
--- /dev/null
+++ b/examples/cpp/inference/data_generator.cpp
@@ -0,0 +1,140 @@
+//
+//  main.cpp
+//  dataloader
+//
+//  Created by User on 11/15/22.
+//
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <thread>
+#include <vector>
+using namespace std;
+typedef std::chrono::high_resolution_clock Clock;
+typedef std::chrono::milliseconds milliseconds;
+
+class Generator {
+  public:
+    size_t num_requests; // total number of requests
+    size_t tensor_size;  // dimension of one request tensor
+    bool poisson;        // false implies uniform distribution
+    double lambda;       // mean number of arrivals per sec
+
+    Generator(size_t req, size_t tensor, bool poi, double lamb) {
+      num_requests = req;
+      tensor_size = tensor;
+      poisson = poi;
+      lambda = lamb;
+      compute_distribution();
+      arrivals_ptr = arrivals.begin();
+      timer_started = false;
+    }
+
+    vector<vector<double>> get_requests(void); // function to retrieve requests
+
+  private:
+    bool timer_started;           // tracks if start time has been initiated
+    Clock::time_point start_time; // time when get_requests() is called for the first time
+    vector<double> arrivals;      // arrival times (ms) generated based on distribution
+    vector<double>::iterator arrivals_ptr; // next request to output
+
+    void compute_distribution(void);        // populate arrivals
+    vector<double> get_random_tensor(void); // generate a random tensor
+};
+
+void Generator::compute_distribution(void) {
+  // set up uniform number generator [0,1)
+  random_device rnd;
+  mt19937 gen(rnd());
+  uniform_real_distribution<double> dist {0, 1.0};
+  double cur_arrival = 0; // assume first request comes in at time 0
+
+  for (size_t i = 0; i < num_requests; i++) {
+    arrivals.push_back(cur_arrival);
+    cout << "arrival time " << i << ": +" << cur_arrival << "ms \n";
+
+    if (poisson) {
+      double u = dist(gen);
+      double interval = -(1/lambda) * log(1-u) * 1000;
+      cur_arrival += interval;
+    } else {
+      cur_arrival += (1000/lambda);
+    }
+  }
+  return;
+};
+
+vector<vector<double>> Generator::get_requests(void) {
+  Clock::time_point cur_time = Clock::now();
+  vector<vector<double>> requests;
+  if (!timer_started){
+    // simply return one request and start timer for the first call
+    start_time = Clock::now();
+    timer_started = true;
+    arrivals_ptr++;
+    requests.push_back(get_random_tensor());
+    return requests;
+  }
+
+  // output requests till we reach current timestamp
+  milliseconds ms_from_start = chrono::duration_cast<milliseconds>(cur_time - start_time);
+  while (arrivals_ptr < arrivals.end() && ms_from_start.count() >= *arrivals_ptr){
+    cout << "output request at arrival time +" << *arrivals_ptr << "\n";
+    requests.push_back(get_random_tensor());
+    arrivals_ptr++;
+  }
+  return requests;
+};
+
+vector<double> Generator::get_random_tensor(void) {
+  random_device rnd_device;
+  mt19937 mersenne_engine {rnd_device()};
+  uniform_real_distribution<double> dist {0, 1.0}; // state distribution
+
+  auto gen = [&dist, &mersenne_engine](){
+    return dist(mersenne_engine);
+  };
+
+  vector<double> vec(tensor_size);
+  generate(begin(vec), end(vec), gen);
+  return vec;
+};
+
+// for debugging
+void print_requests(vector<vector<double>> req) {
+  cout << "printing requests\n";
+  for (vector<double> v: req){
+    for (double e: v) {
+      cout << e << ",";
+    }
+    cout << "\n";
+  }
+  cout << "\n";
+};
+
+int main(int argc, const char * argv[]) {
+  // insert code here...
+  cout << "Hello, World!\n";
+  Generator data_generator(10, 4, true, 1);
+
+  vector<vector<double>> req0 = data_generator.get_requests();
+  print_requests(req0);
+
+  this_thread::sleep_for(milliseconds(1200));
+  vector<vector<double>> req1200 = data_generator.get_requests();
+  print_requests(req1200);
+
+  this_thread::sleep_for(milliseconds(10));
+  vector<vector<double>> req1210 = data_generator.get_requests();
+  print_requests(req1210);
+
+  this_thread::sleep_for(milliseconds(4000));
+  vector<vector<double>> req5210 = data_generator.get_requests();
+  print_requests(req5210);
+
+
+  return 0;
+}

From dcac31af79db74e05c2bb6a6288005e2cacddaa1 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Wed, 16 Nov 2022 17:53:49 +0000
Subject: [PATCH 012/344] integrate with Rae's generator

---
 .../cpp/inference/MLP_Unify/CMakeLists.txt |  4 ++-
 examples/cpp/inference/MLP_Unify/mlp.cc    | 27 ++++++++++++++-----
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/examples/cpp/inference/MLP_Unify/CMakeLists.txt b/examples/cpp/inference/MLP_Unify/CMakeLists.txt
index e4299dcfb3..5a2402fdd0 100644
--- a/examples/cpp/inference/MLP_Unify/CMakeLists.txt
+++ b/examples/cpp/inference/MLP_Unify/CMakeLists.txt
@@ -5,8 +5,10 @@ set(project_target mlp_inference_unify)
 
 set(CPU_SRC
   ${FLEXFLOW_CPP_DRV_SRC}
-  mlp.cc)
+  mlp.cc
+  ../data_generator.h)
 
 cuda_add_executable(${project_target} ${CPU_SRC})
+target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference)
 target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR})
 target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES})

diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc
index 9479cb3bcf..938cb37d60 100644
--- a/examples/cpp/inference/MLP_Unify/mlp.cc
+++ b/examples/cpp/inference/MLP_Unify/mlp.cc
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include "data_generator.h"
 using namespace Legion;
 using namespace FlexFlow;
 
@@ -26,6 +27,7 @@ void FlexFlow::top_level_task(Task const *task,
                               Runtime *runtime) {
   FFConfig ffConfig;
   ffConfig.batchSize=1;
+  size_t total_requests = 256;
   fprintf(stderr,
           "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n",
           ffConfig.batchSize,
@@ -37,7 +39,7 @@ void FlexFlow::top_level_task(Task const *task,
       8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192};
   Tensor input1, input2;
   {
-    int const dims[] = {64, 1024};
+    int const dims[] = {total_requests, 1024};
     input1 = ff.create_tensor<2>(dims, DT_FLOAT);
     input2 = ff.create_tensor<2>(dims, DT_FLOAT);
   }
@@ -67,12 +69,25 @@ void FlexFlow::top_level_task(Task const *task,
   double ts_start = Realm::Clock::current_time_in_microseconds();
   //for (int epoch = 0; epoch < ffConfig.epochs; epoch++) {
   ff.reset_metrics();
-  int iterations = 128;
-  for (int iter = 0; iter < iterations; iter++) {
-    runtime->begin_trace(ctx, 111 /*trace_id*/);
-    ff.forward();
-    runtime->end_trace(ctx, 111 /*trace_id*/);
+  //int iterations = 128;
+  size_t processed_requests=0;
+  Generator data_generator(total_requests, 4, true, 25);
+  while(processed_requests < total_requests) {
+    vector<vector<double>> req = data_generator.get_requests();
+    size_t iterations = req.size();
+    for (size_t iter = 0; iter < iterations; iter++) {
iter < iterations; iter++) { + runtime->begin_trace(ctx, 111 /*trace_id*/); + ff.forward(); + runtime->end_trace(ctx, 111 /*trace_id*/); + } + processed_requests+= iterations; } + + // for (int iter = 0; iter < iterations; iter++) { + // runtime->begin_trace(ctx, 111 /*trace_id*/); + // ff.forward(); + // runtime->end_trace(ctx, 111 /*trace_id*/); + // } // End timer { runtime->issue_execution_fence(ctx); From 92c269fa71feb833850f78d157600a12c67e20da Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 16 Nov 2022 17:54:11 +0000 Subject: [PATCH 013/344] add missing file --- examples/cpp/inference/data_generator.h | 109 ++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 examples/cpp/inference/data_generator.h diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h new file mode 100644 index 0000000000..936b07727a --- /dev/null +++ b/examples/cpp/inference/data_generator.h @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include +using namespace std; +typedef std::chrono::high_resolution_clock Clock; +typedef std::chrono::milliseconds milliseconds; + +class Generator { + public: + size_t num_requests; // total number of requests + size_t tensor_size; // dimension of one request tensor + bool poisson; // false implied uniform distribution + double lambda; // mean #num of arrivals per sec + + Generator(size_t req, size_t tensor, bool poi, double lamb) { + num_requests = req; + tensor_size = tensor; + poisson = poi; + lambda = lamb; + compute_distribution(); + arrivals_ptr = arrivals.begin(); + timer_started = false; + } + + vector> get_requests(void); // function to retrieve requests + + private: + bool timer_started; // tracks if start time has been initiated + Clock::time_point start_time; // time when get_requests() is called for the first time + vector arrivals; // arrival times (ms) generated based on distribution + vector::iterator arrivals_ptr; // next request to output + + void compute_distribution( void ); // populate arrivals + vector get_random_tensor(void); // generate a random tensor +}; + +void Generator::compute_distribution( void ) { + // set up uniform number generator [0,1) + random_device rnd; + mt19937 gen(rnd()); + uniform_real_distribution dist {0, 1.0}; + double cur_arrival = 0; // assume first request comes in at time 0 + + for (size_t i = 0; i < num_requests; i++) { + arrivals.push_back(cur_arrival); + cout << "arrival time " << i << ": +" << cur_arrival << "ms \n"; + + if (poisson) { + double u = dist(gen); + double interval = -(1/lambda) * log(1-u) * 1000; + cur_arrival += interval; + } else { + cur_arrival += (1000/lambda); + } + } + return; +}; + +vector> Generator::get_requests(void) { + Clock::time_point cur_time = Clock::now(); + vector> requests; + if (!timer_started){ + // simply return one request and start timer for the first call + start_time = Clock::now(); + timer_started = true; + arrivals_ptr++; + requests.push_back(get_random_tensor()); + return requests; + } + + // output requests till we reach current timestamp + milliseconds ms_from_start = chrono::duration_cast(cur_time - start_time); + while (arrivals_ptr < arrivals.end() && ms_from_start.count() >= *arrivals_ptr){ + cout << "output request at arrival time +" << *arrivals_ptr << "\n"; + requests.push_back(get_random_tensor()); + arrivals_ptr++; + } + return requests; +}; + +vector Generator::get_random_tensor(void) { + random_device rnd_device; + mt19937 mersenne_engine {rnd_device()}; + 
uniform_real_distribution dist {0, 1.0}; // state distribution + + auto gen = [&dist, &mersenne_engine](){ + return dist(mersenne_engine); + }; + + vector vec(tensor_size); + generate(begin(vec), end(vec), gen); + return vec; +}; + +// for debugging +void print_requests(vector> req) { + cout << "printing requests\n"; + for (vector v: req){ + for (double e: v) { + cout << e << ","; + } + cout << "\n"; + } + cout << "\n"; +}; From aa3edca13bf2e77a33e12694246efe4603e9cec2 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 17 Nov 2022 05:08:03 +0000 Subject: [PATCH 014/344] [InferenceManager] initial impl --- examples/cpp/inference/MLP_Unify/mlp.cc | 13 ++--- include/flexflow/inference.h | 38 +++++++++++++ include/flexflow/operator.h | 5 ++ include/flexflow/ops/linear.h | 4 ++ src/ops/linear.cc | 45 +++++++++++++++ src/runtime/inference_manager.cc | 76 +++++++++++++++++++++++++ 6 files changed, 174 insertions(+), 7 deletions(-) create mode 100644 include/flexflow/inference.h create mode 100644 src/runtime/inference_manager.cc diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 938cb37d60..0f74a1e974 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -14,6 +14,7 @@ */ #include "flexflow/model.h" +#include "flexflow/inference.h" #include #include #include @@ -53,11 +54,8 @@ void FlexFlow::top_level_task(Task const *task, } Tensor t = ff.add(t1, t2); t = ff.softmax(t); - Optimizer *optimizer = new SGDOptimizer(&ff, 0.001f); - std::vector metrics; - metrics.push_back(METRICS_ACCURACY); - metrics.push_back(METRICS_SPARSE_CATEGORICAL_CROSSENTROPY); - ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics, CompMode::COMP_MODE_INFERENCE); + int num_inflight_batches = 10; + InferenceManager im(&ff, 5/*num_requests_per_batch*/, num_inflight_batches); ff.init_operators(); // Start timer { @@ -67,8 +65,9 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } double ts_start = Realm::Clock::current_time_in_microseconds(); + int index = 0; //for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { - ff.reset_metrics(); + // ff.reset_metrics(); //int iterations = 128; size_t processed_requests=0; Generator data_generator(total_requests, 4, true, 25); @@ -77,7 +76,7 @@ void FlexFlow::top_level_task(Task const *task, size_t iterations = req.size(); for (size_t iter = 0; iter < iterations; iter++) { runtime->begin_trace(ctx, 111 /*trace_id*/); - ff.forward(); + im.inference((index++) % num_inflight_batches); runtime->end_trace(ctx, 111 /*trace_id*/); } processed_requests+= iterations; diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h new file mode 100644 index 0000000000..7991548b44 --- /dev/null +++ b/include/flexflow/inference.h @@ -0,0 +1,38 @@ +/* Copyright 2022 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "flexflow/model.h" + +namespace FlexFlow { + +class FFModel; + +class InferenceManager { +public: + InferenceManager(FFModel* _model, + int max_num_requests_per_batch, + int max_num_inflight_batches); + void compile_model_and_allocate_buffer(void); + void inference(int index); +public: + std::unordered_map > tensor_buffer; + FFModel* model; + int max_num_requests_per_batch; + int max_num_inflight_batches; +}; + +} diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 3fd84ce55b..bdaecac8a2 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -187,6 +187,11 @@ class Op { virtual void init(FFModel const &) = 0; virtual void forward(FFModel const &) = 0; virtual void backward(FFModel const &) = 0; + // Pure virtual functions for inference + virtual void inference(FFModel const &, + std::vector const &, + std::vector const &, + std::vector const &) {assert(false);}; virtual void print_layer(FFModel const &model) = 0; virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index eb414f2ed2..6ad1a0752f 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -37,6 +37,10 @@ class Linear : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + std::vector const &) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 668c8d070a..b47a884c5c 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -367,6 +367,51 @@ void Linear::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void Linear::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_weights, + std::vector const &batch_outputs) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(LINEAR_FWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_weights[0]->region)); + launcher.add_field(2, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(batch_weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_weights[1]->region)); + launcher.add_field(3, FID_DATA); + } + runtime->execute_index_space(ctx, launcher); +} + void Linear::forward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc new file mode 100644 index 0000000000..cd68b5e04d --- /dev/null +++ b/src/runtime/inference_manager.cc @@ -0,0 +1,76 @@ +/* Copyright 2022 CMU, Stanford, Facebook, 
LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" + +namespace FlexFlow { + +using namespace Legion; + +InferenceManager::InferenceManager(FFModel *_model, + int _max_num_requests_per_batch, + int _max_num_inflight_batches) + : model(_model), + max_num_requests_per_batch(_max_num_requests_per_batch), + max_num_inflight_batches(_max_num_inflight_batches) { + +} + +void InferenceManager::compile_model_and_allocate_buffer(void) { + std::vector metrics; + model->config.batchSize = max_num_requests_per_batch; + model->compile(LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics, COMP_MODE_INFERENCE); + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + for (const auto& op : model->operators) { + // Skip weight operators + if (op->op_type == OP_WEIGHT) + continue; + for (int i = 0; i < op->numOutputs; i++) { + ParallelTensor pt_base = op->outputs[i]; + assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); + std::vector list; + for (int j = 0; j < max_num_inflight_batches; j++) { + // Copy the metadata from pt_base to pt + ParallelTensor pt = new ParallelTensorBase(*pt_base); + pt->region = runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part = runtime->get_logical_partition(ctx, pt->region, pt_base->part.get_index_partition()); + list.push_back(pt); + } + tensor_buffer[pt_base] = list; + } + } +} + +void InferenceManager::inference(int index) { + assert(index < max_num_inflight_batches); + for (size_t o = 0; o < model->operators.size(); o++) { + Op* op = model->operators[o]; + std::vector inputs(op->numInputs); + std::vector weights(op->numWeights); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) + inputs[i] = tensor_buffer[op->inputs[i]][index]; + for (int i = 0; i < op->numWeights; i++) + weights[i] = op->weights[i]; + for (int i = 0; i < op->numOutputs; i++) + outputs[i] = tensor_buffer[op->outputs[i]][index]; + op->inference(*model, inputs, weights, outputs); + } +}; + +}; From 17855aeaa071cbb09944947161b704b26911ce62 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 23 Nov 2022 10:04:57 -0500 Subject: [PATCH 015/344] added dataloader --- .../cpp/inference/MLP_Unify/CMakeLists.txt | 1 + examples/cpp/inference/MLP_Unify/mlp.cc | 234 +++++++++++++++--- examples/cpp/inference/MLP_Unify/mlp.h | 62 +++++ 3 files changed, 259 insertions(+), 38 deletions(-) create mode 100644 examples/cpp/inference/MLP_Unify/mlp.h diff --git a/examples/cpp/inference/MLP_Unify/CMakeLists.txt b/examples/cpp/inference/MLP_Unify/CMakeLists.txt index 5a2402fdd0..e83d292efc 100644 --- a/examples/cpp/inference/MLP_Unify/CMakeLists.txt +++ b/examples/cpp/inference/MLP_Unify/CMakeLists.txt @@ -6,6 +6,7 @@ set(project_target mlp_inference_unify) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} mlp.cc + mlp.h ../data_generator.h) cuda_add_executable(${project_target} ${CPU_SRC}) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc 
b/examples/cpp/inference/MLP_Unify/mlp.cc index 0f74a1e974..9298a87b10 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -13,50 +13,190 @@ * limitations under the License. */ -#include "flexflow/model.h" -#include "flexflow/inference.h" -#include -#include -#include -#include "data_generator.h" +#include "mlp.h" + using namespace Legion; using namespace FlexFlow; +DataLoader::DataLoader(FFModel &ff, + MLPConfig const &mlpConfig, + InferenceManager const *im, + Tensor input) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + log_app.print("Use random dataset..."); + + // The number of samples is the total number of request samples that can ever be loaded into memory at the same time. In the case of training, the value is batchSize * workersPerNode * numNodes, since each worker can only process one batch at a time. In inference, batchSize + size_t max_parallel_requests = im->max_num_inflight_batches * (ff.config.batchSize * im->max_num_requests_per_batch); + num_samples = max_parallel_requests * ff.config.workersPerNode * ff.config.numNodes; + log_app.print("Number of random samples = %d\n", num_samples); + + // return; + + // Create full input + { + batch_input = input; + int const dims[] = {num_samples, tf.sequence_length * mlpConfig->embedding_size}; + full_input = ff.create_tensor<2>(dims, DT_FLOAT); + } + + // Load entire dataset + // TODO: Use index launcher instead of task launcher + TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, TaskArgument(NULL, 0)); + launcher.add_region_requirement( + RegionRequirement(full_input.parallel_tensor->region, + WRITE_ONLY, + EXCLUSIVE, + full_input.parallel_tensor->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + runtime->execute_task(ctx, launcher); + reset(); + next_batch(ff); +} + +void DataLoader::load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 1); // no labels + assert(task->regions.size() == 1); + // Note that these instances are in ZCM, can only use + // TensorAccessorW with readOutput flag + AccessorWO const acc_input(regions[0], FID_DATA); + Rect<2> rect_input = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + assert(acc_input.accessor.is_dense_arbitrary(rect_input)); + float *input_ptr = acc_input.ptr(rect_input.lo); + // Fill dataset with random data + for (size_t i = 0; i < rect_input.volume(); i++) { + input_ptr[i] = ((float)std::rand()) / RAND_MAX; + } + log_app.print("finish loading data\n"); +} + +void DataLoader::next_batch(FFModel &ff) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + // Load Input + { + Rect<2> rect = runtime->get_index_space_domain(ctx, batch_input->parallel_tensor->parallel_is); + ArgumentMap argmap; + int idx = next_index; + for (PointInRectIterator<2> it(rect); it(); it++) { + SampleIdxs meta; + assert(ff.config.batchSize % (rect.hi[1] - rect.lo[1] + 1) == 0); + meta.num_samples = ff.config.batchSize / (rect.hi[1] - rect.lo[1] + 1); + for (int i = 0; i < meta.num_samples; i++) + meta.idxs[i] = idx++; + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, + batch_input->parallel_tensor->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_input->parallel_tensor->machine_view.hash()); + launcher.add_region_requirement( + 
RegionRequirement(full_input->parallel_tensor->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_input->parallel_tensor->region, + MAP_TO_FB_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_input->parallel_tensor->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input->parallel_tensor->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + // progress to the next_index + next_index += ff.config.batchSize; +} + +void DataLoader::reset() { + next_index = 0; +} + + +Tensor create_mlp(FFModel *model, + MLPConfig const *mlpConfig, + Tensor const &input1, + Tensor const &input2) { + Tensor t1 = input1, t2 = input2; + for (size_t i = 0; i < mlpConfig->hidden_dims.size(); i++) { + int const dims[] = {mlpConfig->hidden_dims[i], t1->dims[0]}; + ActiMode acti_mode = + (i + 1 == mlpConfig->hidden_dims.size()) ? AC_MODE_NONE : AC_MODE_RELU; + t1 = model->dense(t1, mlpConfig->hidden_dims[i], acti_mode, false); + t2 = model->dense(t2, mlpConfig->hidden_dims[i], acti_mode, false); + } + Tensor t = model->add(t1, t2); + return model->softmax(t); +} + void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + + // Inference parameters + size_t total_requests = 256; // total number of requests processed as part of the simulation + size_t request_tensor_size = 4; // request tensor dimensions + bool poisson_distribution=true; + double lambda = 25; // average number of request arrivals per second + size_t num_requests_per_batch=5; + size_t num_inflight_batches = 10; + + // MLP parameters + size_t embedding_size=1024; + size_t sequence_length=512; + std::vector hidden_dims = {8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; + FFConfig ffConfig; ffConfig.batchSize=1; - size_t total_requests = 256; - fprintf(stderr, - "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", - ffConfig.batchSize, - ffConfig.workersPerNode, - ffConfig.numNodes); + { + fprintf(stderr, "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", + ffConfig.batchSize, + ffConfig.workersPerNode, + ffConfig.numNodes + ); + } FFModel ff(ffConfig); - - std::vector hidden_dims = { - 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; + MLPConfig mlpConfig(embedding_size, sequence_length, hidden_dims); + { + stringstream hd; + hd << '{' + for (size_t i = 0; i < hidden_dims.size(); i++) { + if (i != 0) hd << ","; + hd << hidden_dims[i]; + } + hd << '}' + fprintf(stderr, + "embedding_size(%d) sequence_length(%d) hidden_dims(%s)\n", + mlpConfig.embedding_size, + mlpConfig.sequence_length, + hd.c_str()); + } + Tensor input1, input2; { - int const dims[] = {total_requests, 1024}; + int const dims[] = {total_requests, mlpConfig.sequence_length * mlpConfig.embedding_size}; input1 = ff.create_tensor<2>(dims, DT_FLOAT); input2 = ff.create_tensor<2>(dims, DT_FLOAT); } - Tensor t1 = input1, t2 = input2; - for (size_t i = 0; i < hidden_dims.size(); i++) { - int const dims[] = {hidden_dims[i], t1->dims[0]}; - ActiMode acti_mode = - (i + 1 == hidden_dims.size()) ? 
AC_MODE_NONE : AC_MODE_RELU; - t1 = ff.dense(t1, hidden_dims[i], acti_mode, false); - t2 = ff.dense(t2, hidden_dims[i], acti_mode, false); - } - Tensor t = ff.add(t1, t2); - t = ff.softmax(t); - int num_inflight_batches = 10; - InferenceManager im(&ff, 5/*num_requests_per_batch*/, num_inflight_batches); + Tensor t = create_mlp(&ff, &mlpConfig, input1, input2); + + InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); ff.init_operators(); + // Start timer { runtime->issue_execution_fence(ctx); @@ -65,12 +205,14 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } double ts_start = Realm::Clock::current_time_in_microseconds(); + + + /////////////////////////////////////////////////////////////////////////////////// + + // Main loop, processing requests as they come (from the generator) int index = 0; - //for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { - // ff.reset_metrics(); - //int iterations = 128; size_t processed_requests=0; - Generator data_generator(total_requests, 4, true, 25); + Generator data_generator(total_requests, request_tensor_size, poisson_distribution, lambda); while(processed_requests < total_requests) { vector> req = data_generator.get_requests(); size_t iterations = req.size(); @@ -82,11 +224,9 @@ void FlexFlow::top_level_task(Task const *task, processed_requests+= iterations; } - // for (int iter = 0; iter < iterations; iter++) { - // runtime->begin_trace(ctx, 111 /*trace_id*/); - // ff.forward(); - // runtime->end_trace(ctx, 111 /*trace_id*/); - // } + /////////////////////////////////////////////////////////////////////////////////// + + // End timer { runtime->issue_execution_fence(ctx); @@ -101,4 +241,22 @@ void FlexFlow::top_level_task(Task const *task, ffConfig.batchSize * 128 * ffConfig.epochs / run_time); } -void FlexFlow::register_custom_tasks() {} +void FlexFlow::register_custom_tasks() { + // Load entire dataset + { + TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Entire Dataset Task"); + } + // Load input + { + TaskVariantRegistrar registrar(FlexFlow::CUSTOM_GPU_TASK_ID_1, + "Load Inputs"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Input Task"); + } +} diff --git a/examples/cpp/inference/MLP_Unify/mlp.h b/examples/cpp/inference/MLP_Unify/mlp.h new file mode 100644 index 0000000000..e2dba0f00a --- /dev/null +++ b/examples/cpp/inference/MLP_Unify/mlp.h @@ -0,0 +1,62 @@ +/* Copyright 2022 CMU, Stanford + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/model.h" +#include "flexflow/inference.h" +#include +#include +#include +#include "data_generator.h" + +using namespace Legion; +using namespace FlexFlow; + +struct MLPConfig { + MLPConfig(void); + MLPConfig(int embedding_size, int sequence_length, std::vector hidden_dims) + : embedding_size(embedding_size), + sequence_length(sequence_length), + hidden_dims(hidden_dims) {} + + int embedding_size, sequence_length; + std::vector hidden_dims; +}; + +class DataLoader { +public: + DataLoader(FFModel &ff, + MLPConfig const &mlpConfig, + InferenceManager const *im, + Tensor input); + /*static void load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime);*/ + static void load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + void next_batch(FFModel &); + void reset(void); + +public: + int num_samples, next_index; + Tensor full_input, batch_input; +}; + +struct SampleIdxs { + int num_samples; + int idxs[MAX_NUM_SAMPLES]; +}; \ No newline at end of file From 8de95ec6b9b9d3dab7fdd268ba7262ab04598eb7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 2 Dec 2022 15:01:23 +0000 Subject: [PATCH 016/344] fix data generator to work for -std=c++ < 20 --- examples/cpp/inference/data_generator.cpp | 105 +--------------------- examples/cpp/inference/data_generator.h | 7 ++ 2 files changed, 8 insertions(+), 104 deletions(-) diff --git a/examples/cpp/inference/data_generator.cpp b/examples/cpp/inference/data_generator.cpp index b6b2fc2ede..02b841286a 100644 --- a/examples/cpp/inference/data_generator.cpp +++ b/examples/cpp/inference/data_generator.cpp @@ -9,111 +9,8 @@ #include #include #include -#include -#include -#include +#include "data_generator.h" using namespace std; -typedef std::chrono::high_resolution_clock Clock; -typedef std::chrono::milliseconds milliseconds; - -class Generator { - public: - size_t num_requests; // total number of requests - size_t tensor_size; // dimension of one request tensor - bool poisson; // false implied uniform distribution - double lambda; // mean #num of arrivals per sec - - Generator(size_t req, size_t tensor, bool poi, double lamb) { - num_requests = req; - tensor_size = tensor; - poisson = poi; - lambda = lamb; - compute_distribution(); - arrivals_ptr = arrivals.begin(); - timer_started = false; - } - - vector> get_requests(void); // function to retrieve requests - - private: - bool timer_started; // tracks if start time has been initiated - Clock::time_point start_time; // time when get_requests() is called for the first time - vector arrivals; // arrival times (ms) generated based on distribution - vector::iterator arrivals_ptr; // next request to output - - void compute_distribution( void ); // populate arrivals - vector get_random_tensor(void); // generate a random tensor -}; - -void Generator::compute_distribution( void ) { - // set up uniform number generator [0,1) - random_device rnd; - mt19937 gen(rnd()); - uniform_real_distribution dist {0, 1.0}; - double cur_arrival = 0; // assume first request comes in at time 0 - - for (size_t i = 0; i < num_requests; i++) { - arrivals.push_back(cur_arrival); - cout << "arrival time " << i << ": +" << cur_arrival << "ms \n"; - - if (poisson) { - double u = dist(gen); - double interval = -(1/lambda) * log(1-u) * 1000; - cur_arrival += interval; - } else { - cur_arrival += (1000/lambda); - } - } - return; -}; - -vector> Generator::get_requests(void) { - Clock::time_point cur_time = Clock::now(); - vector> 
requests; - if (!timer_started){ - // simply return one request and start timer for the first call - start_time = Clock::now(); - timer_started = true; - arrivals_ptr++; - requests.push_back(get_random_tensor()); - return requests; - } - - // output requests till we reach current timestamp - milliseconds ms_from_start = chrono::duration_cast(cur_time - start_time); - while (arrivals_ptr < arrivals.end() && ms_from_start.count() >= *arrivals_ptr){ - cout << "output request at arrival time +" << *arrivals_ptr << "\n"; - requests.push_back(get_random_tensor()); - arrivals_ptr++; - } - return requests; -}; - -vector Generator::get_random_tensor(void) { - random_device rnd_device; - mt19937 mersenne_engine {rnd_device()}; - uniform_real_distribution dist {0, 1.0}; // state distribution - - auto gen = [&dist, &mersenne_engine](){ - return dist(mersenne_engine); - }; - - vector vec(tensor_size); - generate(begin(vec), end(vec), gen); - return vec; -}; - -// for debugging -void print_requests(vector> req) { - cout << "printing requests\n"; - for (vector v: req){ - for (double e: v) { - cout << e << ","; - } - cout << "\n"; - } - cout << "\n"; -}; int main(int argc, const char * argv[]) { // insert code here... diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index 936b07727a..14bf91b433 100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -82,6 +82,13 @@ vector> Generator::get_requests(void) { return requests; }; +template< class ForwardIt, class Generator > +void generate( ForwardIt first, ForwardIt last, Generator gen ) { + while (first != last) { + *first++ = gen(); + } +} + vector Generator::get_random_tensor(void) { random_device rnd_device; mt19937 mersenne_engine {rnd_device()}; From 59312b2c47af6719fc505d66d4def14d5246bfc7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 2 Dec 2022 17:13:03 +0000 Subject: [PATCH 017/344] fix bugs --- CMakeLists.txt | 12 ++-- config/config.linux | 4 +- examples/cpp/inference/MLP_Unify/mlp.cc | 78 +++++++++++++---------- examples/cpp/inference/MLP_Unify/mlp.h | 13 ++-- examples/cpp/inference/data_generator.cpp | 44 ++++++------- 5 files changed, 81 insertions(+), 70 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 07d77fb129..e7738cded8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -423,13 +423,13 @@ if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) endif() -if(FF_BUILD_TRANSFORMER_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/Transformer) -endif() +# if(FF_BUILD_TRANSFORMER_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) +# add_subdirectory(examples/cpp/inference/Transformer) +# endif() -if(FF_BUILD_G_SHARD_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/G-Shard-MoE) -endif() +# if(FF_BUILD_G_SHARD_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) +# add_subdirectory(examples/cpp/inference/G-Shard-MoE) +# endif() # installation set(INCLUDE_DEST "include") diff --git a/config/config.linux b/config/config.linux index 28cf7c2fe1..0f819f4031 100755 --- a/config/config.linux +++ b/config/config.linux @@ -38,8 +38,8 @@ FF_USE_GASNET=${FF_USE_GASNET:-OFF} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} # build C++ examples -FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-OFF} 
+FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-ON} +FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 9298a87b10..9fea4e979b 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -24,19 +24,26 @@ DataLoader::DataLoader(FFModel &ff, Tensor input) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - log_app.print("Use random dataset..."); - - // The number of samples is the total number of request samples that can ever be loaded into memory at the same time. In the case of training, the value is batchSize * workersPerNode * numNodes, since each worker can only process one batch at a time. In inference, batchSize - size_t max_parallel_requests = im->max_num_inflight_batches * (ff.config.batchSize * im->max_num_requests_per_batch); - num_samples = max_parallel_requests * ff.config.workersPerNode * ff.config.numNodes; - log_app.print("Number of random samples = %d\n", num_samples); + printf("Use random dataset..."); + + // The number of samples is the total number of request samples that can ever + // be loaded into memory at the same time. In the case of training, the value + // is batchSize * workersPerNode * numNodes, since each worker can only + // process one batch at a time. In inference, batchSize + int max_parallel_requests = + im->max_num_inflight_batches * + (ff.config.batchSize * im->max_num_requests_per_batch); + num_samples = + max_parallel_requests * ff.config.workersPerNode * ff.config.numNodes; + printf("Number of random samples = %d\n", num_samples); // return; // Create full input { batch_input = input; - int const dims[] = {num_samples, tf.sequence_length * mlpConfig->embedding_size}; + int const dims[] = {num_samples, + mlpConfig.sequence_length * mlpConfig.embedding_size}; full_input = ff.create_tensor<2>(dims, DT_FLOAT); } @@ -44,15 +51,15 @@ DataLoader::DataLoader(FFModel &ff, // TODO: Use index launcher instead of task launcher TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, TaskArgument(NULL, 0)); launcher.add_region_requirement( - RegionRequirement(full_input.parallel_tensor->region, + RegionRequirement(full_input->parallel_tensor->region, WRITE_ONLY, EXCLUSIVE, - full_input.parallel_tensor->region, + full_input->parallel_tensor->region, MAP_TO_ZC_MEMORY)); launcher.add_field(0, FID_DATA); runtime->execute_task(ctx, launcher); reset(); - next_batch(ff); + //next_batch(ff); } void DataLoader::load_entire_dataset(Task const *task, @@ -69,10 +76,10 @@ void DataLoader::load_entire_dataset(Task const *task, assert(acc_input.accessor.is_dense_arbitrary(rect_input)); float *input_ptr = acc_input.ptr(rect_input.lo); // Fill dataset with random data - for (size_t i = 0; i < rect_input.volume(); i++) { + for (int i = 0; i < rect_input.volume(); i++) { input_ptr[i] = ((float)std::rand()) / RAND_MAX; } - log_app.print("finish loading data\n"); + printf("finish loading data\n"); } void DataLoader::next_batch(FFModel &ff) { @@ -131,7 +138,7 @@ Tensor create_mlp(FFModel *model, Tensor const &input1, Tensor const &input2) { Tensor t1 = input1, t2 = input2; - for (size_t i = 0; i < mlpConfig->hidden_dims.size(); i++) { + for (int i = 0; i < mlpConfig->hidden_dims.size(); i++) { int const dims[] = {mlpConfig->hidden_dims[i], t1->dims[0]}; ActiMode acti_mode = (i + 1 == mlpConfig->hidden_dims.size()) ? 
AC_MODE_NONE : AC_MODE_RELU; @@ -148,17 +155,19 @@ void FlexFlow::top_level_task(Task const *task, Runtime *runtime) { // Inference parameters - size_t total_requests = 256; // total number of requests processed as part of the simulation - size_t request_tensor_size = 4; // request tensor dimensions - bool poisson_distribution=true; + int total_requests = + 256; // total number of requests processed as part of the simulation + int request_tensor_size = 4; // request tensor dimensions + bool poisson_distribution = true; double lambda = 25; // average number of request arrivals per second - size_t num_requests_per_batch=5; - size_t num_inflight_batches = 10; + int num_requests_per_batch = 5; + int num_inflight_batches = 10; // MLP parameters - size_t embedding_size=1024; - size_t sequence_length=512; - std::vector hidden_dims = {8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; + int embedding_size = 1024; + int sequence_length = 512; + std::vector hidden_dims = { + 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; FFConfig ffConfig; ffConfig.batchSize=1; @@ -173,17 +182,15 @@ void FlexFlow::top_level_task(Task const *task, MLPConfig mlpConfig(embedding_size, sequence_length, hidden_dims); { stringstream hd; - hd << '{' - for (size_t i = 0; i < hidden_dims.size(); i++) { - if (i != 0) hd << ","; + hd << '{'; + for (int i = 0; i < hidden_dims.size(); i++) { + if (i != 0) { + hd << ","; + } hd << hidden_dims[i]; } - hd << '}' - fprintf(stderr, - "embedding_size(%d) sequence_length(%d) hidden_dims(%s)\n", - mlpConfig.embedding_size, - mlpConfig.sequence_length, - hd.c_str()); + hd << '}'; + fprintf(stderr, "embedding_size(%d) sequence_length(%d) hidden_dims(%s)\n", mlpConfig.embedding_size, mlpConfig.sequence_length, hd.str().c_str()); } Tensor input1, input2; @@ -211,12 +218,13 @@ void FlexFlow::top_level_task(Task const *task, // Main loop, processing requests as they come (from the generator) int index = 0; - size_t processed_requests=0; - Generator data_generator(total_requests, request_tensor_size, poisson_distribution, lambda); - while(processed_requests < total_requests) { + int processed_requests = 0; + Generator data_generator( + total_requests, request_tensor_size, poisson_distribution, lambda); + while (processed_requests < total_requests) { vector> req = data_generator.get_requests(); - size_t iterations = req.size(); - for (size_t iter = 0; iter < iterations; iter++) { + int iterations = req.size(); + for (int iter = 0; iter < iterations; iter++) { runtime->begin_trace(ctx, 111 /*trace_id*/); im.inference((index++) % num_inflight_batches); runtime->end_trace(ctx, 111 /*trace_id*/); diff --git a/examples/cpp/inference/MLP_Unify/mlp.h b/examples/cpp/inference/MLP_Unify/mlp.h index e2dba0f00a..8d1cdd27ad 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.h +++ b/examples/cpp/inference/MLP_Unify/mlp.h @@ -23,15 +23,18 @@ using namespace Legion; using namespace FlexFlow; +#define MAX_NUM_SAMPLES 1024000 + struct MLPConfig { MLPConfig(void); - MLPConfig(int embedding_size, int sequence_length, std::vector hidden_dims) - : embedding_size(embedding_size), - sequence_length(sequence_length), - hidden_dims(hidden_dims) {} + MLPConfig(int embedding_size, + int sequence_length, + std::vector hidden_dims) + : embedding_size(embedding_size), sequence_length(sequence_length), + hidden_dims(hidden_dims) {} int embedding_size, sequence_length; - std::vector hidden_dims; + std::vector hidden_dims; }; class DataLoader { diff --git a/examples/cpp/inference/data_generator.cpp 
b/examples/cpp/inference/data_generator.cpp index 02b841286a..bf6456ef84 100644 --- a/examples/cpp/inference/data_generator.cpp +++ b/examples/cpp/inference/data_generator.cpp @@ -12,26 +12,26 @@ #include "data_generator.h" using namespace std; -int main(int argc, const char * argv[]) { - // insert code here... - cout << "Hello, World!\n"; - Generator data_generator(10, 4, true, 1); - - vector> req0 = data_generator.get_requests(); - print_requests(req0); - - this_thread::sleep_for(milliseconds(1200)); - vector> req1200 = data_generator.get_requests(); - print_requests(req1200); - - this_thread::sleep_for(milliseconds(10)); - vector> req1210 = data_generator.get_requests(); - print_requests(req1210); - - this_thread::sleep_for(milliseconds(4000)); - vector> req5210 = data_generator.get_requests(); - print_requests(req5210); - - - return 0; +// This is for running the dataloader standalone +int main(int argc, char const *argv[]) { + // insert code here... + cout << "Hello, World!\n"; + Generator data_generator(10, 4, true, 1); + + vector> req0 = data_generator.get_requests(); + print_requests(req0); + + this_thread::sleep_for(milliseconds(1200)); + vector> req1200 = data_generator.get_requests(); + print_requests(req1200); + + this_thread::sleep_for(milliseconds(10)); + vector> req1210 = data_generator.get_requests(); + print_requests(req1210); + + this_thread::sleep_for(milliseconds(4000)); + vector> req5210 = data_generator.get_requests(); + print_requests(req5210); + + return 0; } From 84a44e0a4de1dea1e05f478cf25c76e61b1bbdf9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 3 Dec 2022 00:24:03 -0500 Subject: [PATCH 018/344] cmake update --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e7738cded8..09856f5c66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -376,9 +376,9 @@ if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/AlexNet) endif() -if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/MLP_Unify) -endif() +# if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) +# add_subdirectory(examples/cpp/MLP_Unify) +# endif() if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/MLP_Unify) From bb44ca14a7d86e76b1287965ea64d905f25c6ca5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 3 Dec 2022 19:48:26 +0000 Subject: [PATCH 019/344] fix filename --- examples/cpp/inference/MLP_Unify/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cpp/inference/MLP_Unify/CMakeLists.txt b/examples/cpp/inference/MLP_Unify/CMakeLists.txt index e83d292efc..57b9ea0835 100644 --- a/examples/cpp/inference/MLP_Unify/CMakeLists.txt +++ b/examples/cpp/inference/MLP_Unify/CMakeLists.txt @@ -9,7 +9,9 @@ set(CPU_SRC mlp.h ../data_generator.h) -cuda_add_executable(${project_target} ${CPU_SRC}) +set(GPU_SRC mlp.cu) + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) From 32eaf3a9faec670543ed393a66215108678c67ce Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 3 Dec 2022 20:07:56 +0000 Subject: [PATCH 020/344] cmake update --- 
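
Note (editorial, not part of this commit): the patches above wire the request
generator into the inference loop. The generator draws exponential inter-arrival
gaps by inverse-transform sampling (-ln(1-u)/lambda seconds), which makes request
arrivals a Poisson process at rate lambda, and the driver replays each burst of
requests as forward passes that round-robin over a fixed pool of in-flight batch
buffers (InferenceManager::inference(index % num_inflight_batches)). Below is a
minimal, self-contained sketch of that scheduling loop, using only standard C++
and illustrative constants; it is not part of any patch in this series.

// Illustrative sketch only; mirrors the request loop in mlp.cc/moe.cc above.
#include <cstdio>
#include <vector>

int main() {
  int const total_requests = 32;
  int const num_inflight_batches = 4; // reusable copies of the activation tensors
  int index = 0;
  int processed_requests = 0;
  while (processed_requests < total_requests) {
    // stand-in for data_generator.get_requests(): pretend three requests arrived
    std::vector<std::vector<double>> req(3, std::vector<double>(4, 0.5));
    int iterations = (int)req.size();
    for (int iter = 0; iter < iterations; iter++) {
      int buffer = index++ % num_inflight_batches; // which buffer this forward pass uses
      std::printf("request %d -> in-flight buffer %d\n", processed_requests + iter, buffer);
    }
    processed_requests += iterations;
  }
  return 0;
}
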
CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 09856f5c66..06dab8812c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -376,14 +376,14 @@ if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/AlexNet) endif() -# if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) -# add_subdirectory(examples/cpp/MLP_Unify) -# endif() - -if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/MLP_Unify) +if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/MLP_Unify) endif() +# if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES) +# add_subdirectory(examples/cpp/inference/MLP_Unify) +# endif() + if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) endif() From da7ba33d8d011e3e0499c6e94ab6530da2fd832f Mon Sep 17 00:00:00 2001 From: Daiyaan Date: Tue, 6 Dec 2022 19:32:23 +0000 Subject: [PATCH 021/344] [Attention] inference impl --- include/flexflow/ops/attention.h | 4 +++ src/ops/attention.cc | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index 2903497af9..d50ba78871 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -66,6 +66,10 @@ class MultiHeadAttention : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + std::vector const &) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 9c9c87bd56..662d4e47e1 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -523,6 +523,56 @@ void MultiHeadAttention::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void MultiHeadAttention::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_weights, + std::vector const &batch_outputs) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + int idx = 0; + IndexLauncher launcher(ATTENTION_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[2]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_weights[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(4, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + /* 
regions[0](I): query regions[1](I): key From c4ba15062af71de65a3b75d286169f563d843040 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Dec 2022 03:04:57 +0000 Subject: [PATCH 022/344] [MOE][INFERENCE] - Added MoE for inference example --- CMakeLists.txt | 4 + examples/cpp/inference/data_generator.h | 12 +- .../mixture_of_experts/CMakeLists.txt | 22 + .../cpp/inference/mixture_of_experts/Makefile | 35 ++ .../cpp/inference/mixture_of_experts/moe.cc | 458 ++++++++++++++++++ .../cpp/inference/mixture_of_experts/moe.cu | 75 +++ .../cpp/inference/mixture_of_experts/moe.h | 60 +++ .../inference/mixture_of_experts/run_moe.sh | 11 + 8 files changed, 671 insertions(+), 6 deletions(-) create mode 100644 examples/cpp/inference/mixture_of_experts/CMakeLists.txt create mode 100644 examples/cpp/inference/mixture_of_experts/Makefile create mode 100644 examples/cpp/inference/mixture_of_experts/moe.cc create mode 100644 examples/cpp/inference/mixture_of_experts/moe.cu create mode 100644 examples/cpp/inference/mixture_of_experts/moe.h create mode 100644 examples/cpp/inference/mixture_of_experts/run_moe.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 06dab8812c..b4eff0574d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -423,6 +423,10 @@ if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) endif() +if(FF_BUILD_MOE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) + add_subdirectory(examples/cpp/inference/mixture_of_experts) +endif() + # if(FF_BUILD_TRANSFORMER_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) # add_subdirectory(examples/cpp/inference/Transformer) # endif() diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index 14bf91b433..199af572f8 100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -82,12 +82,12 @@ vector> Generator::get_requests(void) { return requests; }; -template< class ForwardIt, class Generator > -void generate( ForwardIt first, ForwardIt last, Generator gen ) { - while (first != last) { - *first++ = gen(); - } -} +// template +// void generate(ForwardIt first, ForwardIt last, Generator gen) { +// while (first != last) { +// *first++ = gen(); +// } +// } vector Generator::get_random_tensor(void) { random_device rnd_device; diff --git a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt new file mode 100644 index 0000000000..ee1c063b18 --- /dev/null +++ b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExample_MoE) +set(project_target inference_moe) + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + moe.cc + moe.h + ../data_generator.h) + +set(GPU_SRC + moe.cu) + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) + diff --git a/examples/cpp/inference/mixture_of_experts/Makefile b/examples/cpp/inference/mixture_of_experts/Makefile new file mode 100644 index 0000000000..a9eb401850 --- /dev/null +++ b/examples/cpp/inference/mixture_of_experts/Makefile @@ -0,0 +1,35 @@ +# Copyright 2020 Stanford 
University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 1 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 0 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= inference_moe +# List all the application source files here +GEN_SRC = moe.cc +GEN_GPU_SRC = moe.cu + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc new file mode 100644 index 0000000000..bf7a18bd0a --- /dev/null +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -0,0 +1,458 @@ +/* Copyright 2020 Stanford + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "moe.h" +#include "data_generator.h" +#include "flexflow/inference.h" +#include +#include +#include + +#define NUM_SAMPLES 60000 +#define TRAIN_SAMPLES 60000 +#define TEST_SAMPLES 00000 +#define MNIST_DIMS 28 * 28 +#define CIFAR_DIMS 3 * 32 * 32 +#define DATA_DIMS MNIST_DIMS +#define OUT_DIM 10 + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("MoE"); +int num_exp = 5; +int num_select = 2; + +void parse_input_args(char **argv, int argc, MoeConfig &config) { + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--dataset")) { + config.dataset_path = std::string(argv[++i]); + continue; + } + } +} + +Tensor create_moe(FFModel *model, + MoeConfig const *moeConfig, + Tensor const &input) { + float alpha = 2.0f; // factor overhead tensor size for imbalance + float lambda = 0.04f; // multiplier for load balance term + + // MoE model + Tensor gate_preds = model->dense(input, 64, AC_MODE_RELU); + gate_preds = model->dense(gate_preds, num_exp, AC_MODE_RELU); + Tensor topK_output[2]; + model->top_k(gate_preds, topK_output, num_select, false); + + Tensor exp_tensors[num_exp]; + // printf("num_exp: %i, alpha: %f\n", num_exp); + input->print("input_tensor"); + topK_output[1]->print("topK_output[1]"); + return topK_output[0]; + // exp_tensors->print("exp_tensors"); + // model->group_by(input, topK_output[1], exp_tensors, num_exp, alpha); + + // Tensor agg_inputs[num_exp + 4]; + // agg_inputs[0] = model->softmax(topK_output[0]); // gate preds + // agg_inputs[1] = topK_output[1]; // gate assign + // agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) + // agg_inputs[3] = gate_preds; // full gate preds + // for (int i = 0; i < num_exp; i++) { + // Tensor exp_pred = model->dense(exp_tensors[i], OUT_DIM, AC_MODE_RELU); + // agg_inputs[i + 4] = model->softmax(exp_pred); + // } + + // Tensor coop_output = model->aggregate(agg_inputs, num_exp, lambda); + // model->get_metrics(); + // return coop_output; +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // Inference parameters + int total_requests = + 256; // total number of requests processed as part of the simulation + int request_tensor_size = 4; // request tensor dimensions + bool poisson_distribution = true; + double lambda = 25; // average number of request arrivals per second + int num_requests_per_batch = 5; + int num_inflight_batches = 10; + + //----------------------------------------------------------------- + + FFConfig ffConfig; + MoeConfig moeConfig; + { + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, moeConfig); + log_app.print("batchSize(%d) workersPerNodes(%d) numNodes(%d)", + ffConfig.batchSize, + ffConfig.workersPerNode, + ffConfig.numNodes); + } + FFModel ff(ffConfig); + + Tensor input; + { + int const dims[] = {ffConfig.batchSize, DATA_DIMS}; + input = ff.create_tensor<2>(dims, DT_FLOAT); + } + + //----------------------------------------------------------------- + + Tensor t = create_moe(&ff, &moeConfig, input); + InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); + im.compile_model_and_allocate_buffer(); + ff.init_operators(); + + // Data Loader + DataLoader data_loader(ff, moeConfig, input, ff.label_tensor); + + //----------------------------------------------------------------- + + // Start timer + { + runtime->issue_execution_fence(ctx); + TimingLauncher 
timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_start = Realm::Clock::current_time_in_microseconds(); + + /////////////////////////////////////////////////////////////////////////////////// + + int index = 0; + int processed_requests = 0; + Generator data_generator( + total_requests, request_tensor_size, poisson_distribution, lambda); + while (processed_requests < total_requests) { + vector> req = data_generator.get_requests(); + int iterations = req.size(); + for (int iter = 0; iter < iterations; iter++) { + data_loader.next_batch(ff); + runtime->begin_trace(ctx, 111 /*trace_id*/); + im.inference((index++) % num_inflight_batches); + runtime->end_trace(ctx, 111 /*trace_id*/); + } + processed_requests += iterations; + } + + /////////////////////////////////////////////////////////////////////////////////// + + // End timer + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_end = Realm::Clock::current_time_in_microseconds(); + double run_time = 1e-6 * (ts_end - ts_start); + printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n", + run_time, + TRAIN_SAMPLES * ffConfig.epochs / run_time); +} + +DataLoader::DataLoader(FFModel &ff, + MoeConfig const &moe, + Tensor input, + Tensor label) { + num_samples = NUM_SAMPLES; + + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + + // Create full input + { + batch_input = input; + int const dims[] = {NUM_SAMPLES, DATA_DIMS}; + full_input = ff.create_tensor<2>(dims, DT_FLOAT); + } + // Create full label + { + batch_label = label; + int const dims[] = {NUM_SAMPLES, 1}; + full_label = ff.create_tensor<2>(dims, DT_INT32); + } + + // Load entire dataset + // TODO: Use index launcher instead of task launcher + MoeConfig const *ptr = &moe; + TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, + TaskArgument(&ptr, sizeof(MoeConfig *))); + // regions[0]: full_input + launcher.add_region_requirement( + RegionRequirement(full_input->parallel_tensor->region, + WRITE_ONLY, + EXCLUSIVE, + full_input->parallel_tensor->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + // regions[1]: full_label + launcher.add_region_requirement( + RegionRequirement(full_input->parallel_tensor->region, + WRITE_ONLY, + EXCLUSIVE, + full_input->parallel_tensor->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(1, FID_DATA); + + runtime->execute_task(ctx, launcher); + reset(); + next_batch(ff); +} + +__inline__ int calc_offset(int c, int y, int x, int yscale, int xscale) { + return (c * yscale * xscale + y * xscale + x); +} + +// ================================================= +// Load data +// ================================================= + +/* NOTE: Download files from http://yann.lecun.com/exdb/mnist/, unpack to +this directory (Flexflow/examples/cpp/mixture_of_experts) */ + +void read_cifar100(float *input_ptr, int *label_ptr) { + std::ifstream file; + file.open("train.bin", std::ios::in | std::ios::binary | std::ios::ate); + if (!file) { + std::cout << "Error opening CIFAR100 train data file" << std::endl; + assert(false); + } + + file.seekg(0, std::ios::beg); + + // each sample: <1 x coarse label><1 x fine label><3072 x pixel> + for (std::size_t i = 0; i < NUM_SAMPLES; i++) { + unsigned char temp = 0; + file.read((char *)&temp, sizeof(temp)); // coarse label, skip + file.read((char *)&temp, sizeof(temp)); + 
label_ptr[i] = temp; + for (std::size_t j = 0; j < 3072; ++j) { + file.read((char *)&temp, sizeof(temp)); + input_ptr[i * 3072 + j] = (float)temp / 255.0f; + } + } + + file.close(); +} + +int reverseInt(int i) { + unsigned char c1, c2, c3, c4; + + c1 = i & 255; + c2 = (i >> 8) & 255; + c3 = (i >> 16) & 255; + c4 = (i >> 24) & 255; + + return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; +} + +void read_mnist(float *input_ptr, int *label_ptr) { + // read inputs + std::ifstream input("train-images-idx3-ubyte", std::ios::binary); + if (input.is_open()) { + int magic_number = 0; + int number_of_images = 0; + int n_rows = 0; + int n_cols = 0; + input.read((char *)&magic_number, sizeof(magic_number)); + magic_number = reverseInt(magic_number); + input.read((char *)&number_of_images, sizeof(number_of_images)); + number_of_images = reverseInt(number_of_images); + input.read((char *)&n_rows, sizeof(n_rows)); + n_rows = reverseInt(n_rows); + input.read((char *)&n_cols, sizeof(n_cols)); + n_cols = reverseInt(n_cols); + + for (int i = 0; i < number_of_images; i++) { + for (int r = 0; r < n_rows; r++) { + for (int c = 0; c < n_cols; c++) { + unsigned char temp = 0; + input.read((char *)&temp, sizeof(temp)); + input_ptr[i * n_rows * n_cols + r * n_cols + c] = + (float)temp / 255.0f; + } + } + } + } else { + std::cout << "Error opening MNIST input data file" << std::endl; + assert(false); + } + + // read labels + std::ifstream labels("train-labels-idx1-ubyte", std::ios::binary); + if (labels.is_open()) { + int magic_number = 0; + int number_of_images = 0; + labels.read((char *)&magic_number, sizeof(magic_number)); + magic_number = reverseInt(magic_number); + labels.read((char *)&number_of_images, sizeof(number_of_images)); + number_of_images = reverseInt(number_of_images); + + for (int i = 0; i < number_of_images; i++) { + unsigned char temp = 0; + labels.read((char *)&temp, sizeof(temp)); + label_ptr[i] = temp; + } + } else { + std::cout << "Error opening MNIST label data file" << std::endl; + assert(false); + } +} + +void DataLoader::load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const MoeConfig* conf = *((MoeConfig**)task->args); + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + + // get input and label pointer + AccessorWO const acc_input(regions[0], FID_DATA); + AccessorWO const acc_label(regions[1], FID_DATA); + Rect<2> rect_input = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + assert(acc_input.accessor.is_dense_arbitrary(rect_input)); + Rect<2> rect_label = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + assert(acc_label.accessor.is_dense_arbitrary(rect_label)); + float *input_ptr = acc_input.ptr(rect_input.lo); + int *label_ptr = acc_label.ptr(rect_label.lo); + + read_mnist(input_ptr, label_ptr); + log_app.print("finish loading data\n"); +} + +void DataLoader::next_batch(FFModel &ff) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + // Load input + { + IndexSpace task_is = batch_input->parallel_tensor->parallel_is; + Rect<2> rect = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = next_index; + for (PointInRectIterator<2> it(rect); it(); it++) { + SampleIdxs meta; + assert(ff.config.batchSize % (rect.hi[1] - rect.lo[1] + 1) == 0); + meta.num_samples = ff.config.batchSize / (rect.hi[1] - rect.lo[1] + 1); + for (int i = 0; i < meta.num_samples; i++) + 
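// The loop below hands out consecutive global sample indices, continuing
// across the points of the launch domain, so each shard receives a disjoint,
// contiguous slice of the current batch starting from next_index; load_input
// relies on this contiguity to perform a single bulk copy per shard.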
meta.idxs[i] = idx++; + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, + task_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_input->parallel_tensor->machine_view.hash()); + launcher.add_region_requirement( + RegionRequirement(full_input->parallel_tensor->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_input->parallel_tensor->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_input->parallel_tensor->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input->parallel_tensor->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + // Load label + { + // IndexSpaceT<2> task_is = IndexSpaceT<2>(ff.get_or_create_task_is(2, "")); + IndexSpace task_is = batch_label->parallel_tensor->parallel_is; + Rect<2> rect = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = next_index; + for (PointInRectIterator<2> it(rect); it(); it++) { + SampleIdxs meta; + assert(ff.config.batchSize % (rect.hi[1] - rect.lo[1] + 1) == 0); + meta.num_samples = ff.config.batchSize / (rect.hi[1] - rect.lo[1] + 1); + for (int i = 0; i < meta.num_samples; i++) + meta.idxs[i] = idx++; + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, + task_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_label->parallel_tensor->machine_view.hash()); + launcher.add_region_requirement( + RegionRequirement(full_label->parallel_tensor->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_label->parallel_tensor->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_label->parallel_tensor->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_label->parallel_tensor->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + next_index += ff.config.batchSize; +} + +void DataLoader::reset() { + next_index = 0; +} + +void FlexFlow::register_custom_tasks() { + // Load entire dataset + { + TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Entire Dataset Task"); + } + // Load input + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Input Task"); + } + // Load label + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_2, "Load Labels"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Label Task"); + } +} diff --git a/examples/cpp/inference/mixture_of_experts/moe.cu b/examples/cpp/inference/mixture_of_experts/moe.cu new file mode 100644 index 0000000000..c4224e0a49 --- /dev/null +++ b/examples/cpp/inference/mixture_of_experts/moe.cu @@ -0,0 +1,75 @@ +/* Copyright 2020 Stanford + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/utils/cuda_helper.h" +#include "moe.h" + +void DataLoader::load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + SampleIdxs *meta = (SampleIdxs *)task->local_args; + TensorAccessorR acc_full_input( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + TensorAccessorW acc_batch_input(regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); + + coord_t batch_size = + acc_batch_input.rect.hi[1] - acc_batch_input.rect.lo[1] + 1; + coord_t sample_dim = + acc_batch_input.rect.hi[0] - acc_batch_input.rect.lo[0] + 1; + + // FIXME: currently assume continous indices + assert(batch_size == meta->num_samples); + for (int i = 1; i < batch_size; i++) + assert(meta->idxs[i] == meta->idxs[0] + i); + coord_t start_idx = meta->idxs[0]; + float const *input_zc = acc_full_input.ptr + start_idx * sample_dim; + copy_kernel<<>>( + acc_batch_input.ptr, input_zc, acc_batch_input.rect.volume()); + checkCUDA(cudaDeviceSynchronize()); +} + +void DataLoader::load_label(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + SampleIdxs *meta = (SampleIdxs *)task->local_args; + TensorAccessorR acc_full_label( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + TensorAccessorW acc_batch_label(regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); + int batch_size = acc_batch_label.rect.hi[1] - acc_batch_label.rect.lo[1] + 1; + // FIXME: currently assume continous indices + assert(batch_size == meta->num_samples); + for (int i = 1; i < meta->num_samples; i++) + assert(meta->idxs[i] == meta->idxs[0] + i); + int const *input_zc = acc_full_label.ptr + meta->idxs[0]; + copy_kernel<<>>( + acc_batch_label.ptr, input_zc, acc_batch_label.rect.volume()); + checkCUDA(cudaDeviceSynchronize()); +} diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h new file mode 100644 index 0000000000..5610de4ab7 --- /dev/null +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -0,0 +1,60 @@ +/* Copyright 2017 Stanford, NVIDIA + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/model.h" +#define MAX_NUM_SAMPLES 60000 + +using namespace Legion; +using namespace std; +using namespace FlexFlow; + +struct MoeConfig { + MoeConfig(void) { + // Set default configurations here + } + std::string dataset_path; +}; + +class DataLoader { +public: + DataLoader(FFModel &ff, + MoeConfig const &alexnet, + Tensor _input, + Tensor _label); + static void load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + static void load_label(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + static void load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + void next_batch(FFModel &); + void reset(void); + +public: + int num_samples, next_index; + Tensor full_input, batch_input; + Tensor full_label, batch_label; +}; + +struct SampleIdxs { + int num_samples; + int idxs[MAX_NUM_SAMPLES]; +}; diff --git a/examples/cpp/inference/mixture_of_experts/run_moe.sh b/examples/cpp/inference/mixture_of_experts/run_moe.sh new file mode 100644 index 0000000000..33c6c5f7fb --- /dev/null +++ b/examples/cpp/inference/mixture_of_experts/run_moe.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --job-name=pagerank +#SBATCH --output=slurm.txt +#SBATCH --time=10:00 +#SBATCH --nodes=2 +#SBATCH --cpus-per-task=2 +#SBATCH --mem-per-cpu=6000MB +#SBATCH --nodelist=g0001,g0002 +#SBATCH --partition=gpu + +srun -n 2 ./moe -ll:cpu 4 -ll:gpu 4 -ll:fsize 15000 -ll:zsize 15000 --nodes 2 -ll:util 1 -b 40 -e 1 --search-budget 1 --export strat-tmp.txt From e093a44631bca9f5e863ec72ed4e71e3c25a8c91 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 13 Dec 2022 16:17:28 +0000 Subject: [PATCH 023/344] [Inference][MLP_Unify] - Fixed bug --- CMakeLists.txt | 6 +++--- examples/cpp/inference/MLP_Unify/mlp.cc | 6 ++++++ examples/cpp/inference/MLP_Unify/mlp.h | 7 ++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b4eff0574d..b796308aa3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -380,9 +380,9 @@ if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/MLP_Unify) endif() -# if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES) -# add_subdirectory(examples/cpp/inference/MLP_Unify) -# endif() +if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES) + add_subdirectory(examples/cpp/inference/MLP_Unify) +endif() if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 9fea4e979b..b96cb9b22c 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -14,6 +14,11 @@ */ #include "mlp.h" +#include "data_generator.h" +#include "flexflow/inference.h" +#include +#include +#include using namespace Legion; using namespace FlexFlow; @@ -202,6 +207,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor t = create_mlp(&ff, &mlpConfig, input1, input2); InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); + im.compile_model_and_allocate_buffer(); ff.init_operators(); // Start timer diff --git a/examples/cpp/inference/MLP_Unify/mlp.h b/examples/cpp/inference/MLP_Unify/mlp.h index 8d1cdd27ad..213d77d992 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.h +++ b/examples/cpp/inference/MLP_Unify/mlp.h @@ -13,14 +13,11 @@ * limitations under the License. 
*/ + #include "flexflow/model.h" #include "flexflow/inference.h" -#include -#include -#include -#include "data_generator.h" - using namespace Legion; +using namespace std; using namespace FlexFlow; #define MAX_NUM_SAMPLES 1024000 From 7aa090d042a541d8c16b96324c2ebb7e26304e4c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 13 Dec 2022 16:54:16 +0000 Subject: [PATCH 024/344] [Inference][TopK] - Added inference impl for TopK, removed batched weights from all ops --- examples/cpp/inference/MLP_Unify/mlp.h | 3 +-- include/flexflow/operator.h | 5 ++-- include/flexflow/ops/attention.h | 1 - include/flexflow/ops/linear.h | 1 - include/flexflow/ops/topk.h | 3 +++ src/ops/attention.cc | 12 ++++----- src/ops/linear.cc | 9 +++---- src/ops/topk.cc | 36 ++++++++++++++++++++++++++ src/runtime/inference_manager.cc | 5 +--- 9 files changed, 54 insertions(+), 21 deletions(-) diff --git a/examples/cpp/inference/MLP_Unify/mlp.h b/examples/cpp/inference/MLP_Unify/mlp.h index 213d77d992..7a8cc06955 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.h +++ b/examples/cpp/inference/MLP_Unify/mlp.h @@ -13,9 +13,8 @@ * limitations under the License. */ - -#include "flexflow/model.h" #include "flexflow/inference.h" +#include "flexflow/model.h" using namespace Legion; using namespace std; using namespace FlexFlow; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index bdaecac8a2..2fe689c284 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -190,8 +190,9 @@ class Op { // Pure virtual functions for inference virtual void inference(FFModel const &, std::vector const &, - std::vector const &, - std::vector const &) {assert(false);}; + std::vector const &) { + assert(false); + }; virtual void print_layer(FFModel const &model) = 0; virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index d50ba78871..be54ef21a1 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -67,7 +67,6 @@ class MultiHeadAttention : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, - std::vector const &, std::vector const &, std::vector const &) override; void print_layer(FFModel const &model) override { diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index 6ad1a0752f..510799c43a 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -38,7 +38,6 @@ class Linear : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, - std::vector const &, std::vector const &, std::vector const &) override; void print_layer(FFModel const &model) override; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 6b1613c828..1b31df998d 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -30,6 +30,9 @@ class TopK : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 662d4e47e1..7b9e711b0c 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -523,10 +523,10 @@ void MultiHeadAttention::forward(FFModel const &ff) { 
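// From this patch onward, the inference() overloads no longer receive a
// per-batch copy of the weights: model weights are shared across all
// in-flight inference batches, so each operator reads its own weights[]
// parallel tensors directly and only the activations (batch_inputs and
// batch_outputs) are replicated per in-flight batch.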
runtime->execute_index_space(ctx, launcher); } -void MultiHeadAttention::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_weights, - std::vector const &batch_outputs) { +void MultiHeadAttention::inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -558,11 +558,11 @@ void MultiHeadAttention::inference(FFModel const &ff, EXCLUSIVE, batch_inputs[2]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_weights[0]->part, + launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_weights[0]->region)); + weights[0]->region)); launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index b47a884c5c..9413f5f726 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -369,7 +369,6 @@ void Linear::forward(FFModel const &ff) { void Linear::inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_weights, std::vector const &batch_outputs) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -395,18 +394,18 @@ void Linear::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_weights[0]->part, + launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_weights[0]->region)); + weights[0]->region)); launcher.add_field(2, FID_DATA); if (use_bias) { - launcher.add_region_requirement(RegionRequirement(batch_weights[1]->part, + launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_weights[1]->region)); + weights[1]->region)); launcher.add_field(3, FID_DATA); } runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 1a87c6c80c..ae43038416 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -220,6 +220,42 @@ void TopK::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void TopK::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(TOPK_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + void TopK::forward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/inference_manager.cc 
b/src/runtime/inference_manager.cc index cd68b5e04d..71fe0ec6f9 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -61,15 +61,12 @@ void InferenceManager::inference(int index) { for (size_t o = 0; o < model->operators.size(); o++) { Op* op = model->operators[o]; std::vector inputs(op->numInputs); - std::vector weights(op->numWeights); std::vector outputs(op->numOutputs); for (int i = 0; i < op->numInputs; i++) inputs[i] = tensor_buffer[op->inputs[i]][index]; - for (int i = 0; i < op->numWeights; i++) - weights[i] = op->weights[i]; for (int i = 0; i < op->numOutputs; i++) outputs[i] = tensor_buffer[op->outputs[i]][index]; - op->inference(*model, inputs, weights, outputs); + op->inference(*model, inputs, outputs); } }; From ab40cacab71914ec44a1e94b775b46600212ad55 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 13 Dec 2022 19:16:24 +0000 Subject: [PATCH 025/344] [GroupBy] - Add replica dimension --- include/flexflow/ops/groupby.h | 6 ++++-- src/ops/group_by.cc | 21 +++++++++++++++----- src/ops/group_by.cpp | 35 +++++++++++++++++----------------- src/ops/group_by.cu | 17 +++++++++++------ 4 files changed, 48 insertions(+), 31 deletions(-) diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index 4a15f6f439..a958f3c4ce 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -71,7 +71,8 @@ class Group_by : public Op { int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim); + int data_dim, + int n_replicas); static void backward_kernel_wrapper(GroupByMeta const *m, float *input_grad, @@ -81,7 +82,8 @@ class Group_by : public Op { int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim); + int data_dim, + int n_replicas); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 850a5c4587..80ff3508b4 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -270,6 +270,7 @@ void Group_by::forward_task(Task const *task, int n = gb->n; float alpha = gb->alpha; + // Check that the number of regions is n+2: n outputs and 2 inputs assert((int)regions.size() == n + 2); assert((int)task->regions.size() == n + 2); @@ -287,13 +288,16 @@ void Group_by::forward_task(Task const *task, coord_t input_rows = rect_input.hi[1] - rect_input.lo[1] + 1; coord_t input_cols = rect_input.hi[0] - rect_input.lo[0] + 1; + coord_t input_replicas = rect_input.hi[2] - rect_input.lo[2] + 1; + // Check that dimensions match in the input and assign tensors assert(input_rows == rect_assign.hi[1] - rect_assign.lo[1] + 1); - + assert(input_replicas == rect_assign.hi[2] - rect_assign.lo[2] + 1); // does this need to be true? int k = rect_assign.hi[0] - rect_assign.lo[0] + 1; int batch_size = input_rows; int data_dim = input_cols; + int n_replicas = input_replicas; - // Create a vector of n outputs, where n is the number of experts. + // Create a vector of n outputs, where n is the number of experts. 
// Each entry in the "outputs" vector points to the Legion tensor that will // contain the tockens dispatched to the corresponding expert float *outputs[n]; @@ -310,6 +314,9 @@ void Group_by::forward_task(Task const *task, assert(output_cols == input_cols); } + // Launch the kernel responsible from copying the data from the input tensor + // to each output tensor, according to the input to expert assignments from + // the assign tensor. Group_by::forward_kernel_wrapper(m, acc_input.ptr(rect_input), acc_assign.ptr(rect_assign), @@ -318,7 +325,8 @@ void Group_by::forward_task(Task const *task, k, alpha, batch_size, - data_dim); + data_dim, + n_replicas); } void Group_by::backward(FFModel const &ff) { @@ -388,11 +396,13 @@ void Group_by::backward_task(Task const *task, coord_t input_rows = rect_input_grad.hi[1] - rect_input_grad.lo[1] + 1; coord_t input_cols = rect_input_grad.hi[0] - rect_input_grad.lo[0] + 1; + coord_t input_replicas = rect_input_grad.hi[2] - rect_input_grad.lo[2] + 1; assert(input_rows == rect_assign.hi[1] - rect_assign.lo[1] + 1); - + assert(input_replicas == rect_assign.hi[2] - rect_assign.lo[2] + 1); // does this need to be true? int k = rect_assign.hi[0] - rect_assign.lo[0] + 1; int batch_size = input_rows; int data_dim = input_cols; + int n_replicas = input_replicas; // get output float *output_grads[n]; @@ -417,7 +427,8 @@ void Group_by::backward_task(Task const *task, k, alpha, batch_size, - data_dim); + data_dim, + n_replicas); } void Group_by::serialize(Legion::Serializer &sez) const { diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index f45e9092a5..16c8354bca 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -33,25 +33,19 @@ __global__ void int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim) { - __shared__ float - *chosen_exp_preds[MAX_K * - MAX_BATCH_SIZE]; // one pointer for each exp_assign - // (TopK_output[1]) element + int data_dim, + int n_replicas) { + __shared__ float *chosen_exp_preds[MAX_K * MAX_BATCH_SIZE]; // one pointer for each exp_assign (TopK_output[1]) element // Get pred pointers, single thread per block if (threadIdx.x == 0) { - int exp_tensor_rows = - ceil(alpha * k / n * batch_size); // This is the max expert capacity - int expert_idx[MAX_N] = { - 0}; // This is the number of tokens assigned to each expert + int exp_tensor_rows = ceil(alpha * k / n * batch_size); // This is the max expert capacity + int expert_idx[MAX_N] = {0}; // This is the number of tokens assigned to each expert // Iterate through flattened assign tensor, which has shape (k, batch_size) for (int i = 0; i < k * batch_size; i++) { // Get pointer to chosen expert predictions - int expert = - exp_assign[i]; // index of the expert that is to receive the token i - if (expert_idx[expert] >= - exp_tensor_rows) { // check if the expert is already at capacity + int expert = exp_assign[i]; // index of the expert that is to receive the token i + if (expert_idx[expert] >= exp_tensor_rows) { // check if the expert is already at capacity // dropped sample chosen_exp_preds[i] = 0; continue; @@ -85,7 +79,8 @@ __global__ void int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim) { + int data_dim, + int n_replicas) { __shared__ float *chosen_exp_grads[MAX_K * MAX_BATCH_SIZE]; // Get pred pointers, single thread @@ -127,7 +122,8 @@ void Group_by::forward_kernel_wrapper( int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - 
int data_dim) { + int data_dim, + int n_replicas) { // TODO: why cublas/cudnn stream is needed here? hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -148,7 +144,8 @@ void Group_by::forward_kernel_wrapper( k, alpha, batch_size, - data_dim); + data_dim, + n_replicas); } void Group_by::backward_kernel_wrapper( @@ -160,7 +157,8 @@ void Group_by::backward_kernel_wrapper( int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim) { + int data_dim, + int n_replicas) { // TODO: why cublas/cudnn stream is needed here hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -183,7 +181,8 @@ void Group_by::backward_kernel_wrapper( k, alpha, batch_size, - data_dim); + data_dim, + n_replicas); } GroupByMeta::GroupByMeta(FFHandler handler, int n) : OpMeta(handler) { diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index ee0b18337c..5e5f4a6fb8 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -32,7 +32,8 @@ __global__ void int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim) { + int data_dim, + int n_replicas) { __shared__ float *chosen_exp_preds[MAX_K * MAX_BATCH_SIZE]; // Get pred pointers, single thread per block @@ -71,7 +72,8 @@ __global__ void int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim) { + int data_dim, + int n_replicas) { __shared__ float *chosen_exp_grads[MAX_K * MAX_BATCH_SIZE]; assert(k <= MAX_K); assert(batch_size <= MAX_BATCH_SIZE); @@ -115,7 +117,8 @@ void Group_by::forward_kernel_wrapper( int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim) { + int data_dim, + int n_replicas) { // TODO: why cublas/cudnn stream is needed here? 
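// For intuition: the group-by kernel caps each expert's output buffer at
// ceil(alpha * k / n * batch_size) rows and drops any token routed to an
// expert that is already full (its destination pointer is set to 0). With
// purely illustrative values batch_size = 64, k = 2, n = 16 experts and
// alpha = 2.0f, that capacity works out to ceil(2.0 * 2 / 16 * 64) = 16
// tokens per expert.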
cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -136,7 +139,7 @@ void Group_by::forward_kernel_wrapper( min(CUDA_NUM_THREADS, (int)(batch_size * k * data_dim)), 0, stream>>>( - input, exp_assign, m->dev_region_ptrs, n, k, alpha, batch_size, data_dim); + input, exp_assign, m->dev_region_ptrs, n, k, alpha, batch_size, data_dim, n_replicas); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -157,7 +160,8 @@ void Group_by::backward_kernel_wrapper( int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim) { + int data_dim, + int n_replicas) { // TODO: why cublas/cudnn stream is needed here cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -184,7 +188,8 @@ void Group_by::backward_kernel_wrapper( k, alpha, batch_size, - data_dim); + data_dim, + n_replicas); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); From 49601c572cb195f6d3605c8385c5f9bac6f13937 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 13 Dec 2022 19:27:01 +0000 Subject: [PATCH 026/344] [MoE] - uncommented layers --- .../cpp/inference/mixture_of_experts/moe.cc | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index bf7a18bd0a..d1f47359b3 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -51,31 +51,41 @@ Tensor create_moe(FFModel *model, // MoE model Tensor gate_preds = model->dense(input, 64, AC_MODE_RELU); + // gate_preds->print("gate_preds"); gate_preds = model->dense(gate_preds, num_exp, AC_MODE_RELU); + // gate_preds->print("gate_preds2"); Tensor topK_output[2]; model->top_k(gate_preds, topK_output, num_select, false); - + // topK_output[0]->print("topK_output[0]"); + // topK_output[1]->print("topK_output[1]"); Tensor exp_tensors[num_exp]; // printf("num_exp: %i, alpha: %f\n", num_exp); - input->print("input_tensor"); - topK_output[1]->print("topK_output[1]"); - return topK_output[0]; - // exp_tensors->print("exp_tensors"); - // model->group_by(input, topK_output[1], exp_tensors, num_exp, alpha); - - // Tensor agg_inputs[num_exp + 4]; - // agg_inputs[0] = model->softmax(topK_output[0]); // gate preds - // agg_inputs[1] = topK_output[1]; // gate assign - // agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) - // agg_inputs[3] = gate_preds; // full gate preds - // for (int i = 0; i < num_exp; i++) { - // Tensor exp_pred = model->dense(exp_tensors[i], OUT_DIM, AC_MODE_RELU); - // agg_inputs[i + 4] = model->softmax(exp_pred); + // input->print("input_tensor"); + + // return topK_output[0]; + // exp_tensors[0]->print("exp_tensors[0]"); + // exp_tensors[num_exp-1]->print("exp_tensors[num_exp-1]"); + model->group_by(input, topK_output[1], exp_tensors, num_exp, alpha); + // for (int i=0; idims[2] = 1; + // exp_tensors[i]->print("exp_tensors[i]"); // } - - // Tensor coop_output = model->aggregate(agg_inputs, num_exp, lambda); + Tensor agg_inputs[num_exp + 4]; + agg_inputs[0] = model->softmax(topK_output[0]); // gate preds + agg_inputs[1] = topK_output[1]; // gate assign + agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) + agg_inputs[3] = gate_preds; // full gate preds + for (int i = 0; i < num_exp; i++) { + Tensor exp_pred = model->dense(exp_tensors[i], OUT_DIM, AC_MODE_RELU); + exp_pred->print("exp_pred"); + agg_inputs[i + 4] = 
model->softmax(exp_pred); + } + for (int i = 0; i < num_exp + 4; i++) { + agg_inputs[i]->print("agg_inputs[i]"); + } + Tensor coop_output = model->aggregate(agg_inputs, num_exp, lambda); // model->get_metrics(); - // return coop_output; + return coop_output; } void FlexFlow::top_level_task(Task const *task, From 261d208c7ff2a9cc10891610d5003379e574d024 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 13 Dec 2022 19:33:55 +0000 Subject: [PATCH 027/344] fixes --- examples/cpp/inference/mixture_of_experts/moe.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index d1f47359b3..c1c1958e91 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -66,10 +66,10 @@ Tensor create_moe(FFModel *model, // exp_tensors[0]->print("exp_tensors[0]"); // exp_tensors[num_exp-1]->print("exp_tensors[num_exp-1]"); model->group_by(input, topK_output[1], exp_tensors, num_exp, alpha); - // for (int i=0; idims[2] = 1; - // exp_tensors[i]->print("exp_tensors[i]"); - // } + for (int i=0; idims[2] = 1; // temporary fix to replica dimension being undefined + exp_tensors[i]->print("exp_tensors[i]"); + } Tensor agg_inputs[num_exp + 4]; agg_inputs[0] = model->softmax(topK_output[0]); // gate preds agg_inputs[1] = topK_output[1]; // gate assign From d67bd3b6fd574ac0eab5ba63c0a1380affe71878 Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Tue, 13 Dec 2022 21:05:50 -0500 Subject: [PATCH 028/344] Inference: Implement init version of Group_by::inference (#522) * Implement init Group_by::inference * Remove .DS_Store --- include/flexflow/ops/groupby.h | 3 +++ src/ops/group_by.cc | 43 ++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index a958f3c4ce..f118af0a38 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -35,6 +35,9 @@ class Group_by : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 80ff3508b4..6ea01c83a8 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -261,6 +261,49 @@ void Group_by::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void Group_by::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + IndexLauncher launcher(GROUP_BY_FWD_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Group_by)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // data + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + + // assign + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(1, FID_DATA); + + // output + for (int i = 0; i < n; i++) { + 
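// One output region is added per expert, at region index i + 2 (after the
// data region 0 and the assign region 1), matching the layout that
// Group_by::forward_task expects when it unpacks its n + 2 regions.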
launcher.add_region_requirement(RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); + launcher.add_field(i + 2, FID_DATA); + } + + runtime->execute_index_space(ctx, launcher); +} + void Group_by::forward_task(Task const *task, std::vector const ®ions, Context ctx, From bc09bb990c0ae8c105a1a3a03f86be75e36ec1b7 Mon Sep 17 00:00:00 2001 From: Rae Wong <33883582+yingyee0111@users.noreply.github.com> Date: Tue, 13 Dec 2022 21:06:06 -0500 Subject: [PATCH 029/344] feat: implement inference batch tensors (#521) --- include/flexflow/ops/aggregate.h | 3 ++ include/flexflow/ops/aggregate_spec.h | 3 ++ src/ops/aggregate.cc | 50 +++++++++++++++++++++++++++ src/ops/aggregate_spec.cc | 49 ++++++++++++++++++++++++++ 4 files changed, 105 insertions(+) diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 4eeb695e92..f93e26057d 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -36,6 +36,9 @@ class Aggregate : public Op { char const *name = nullptr); void init(FFModel const &) override; void forward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 8c1966e72a..7cd0ee49c0 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -28,6 +28,9 @@ class AggregateSpec : public Op { char const *name); void init(FFModel const &) override; void forward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 0ad9d91d62..ce8894599f 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -243,6 +243,56 @@ void Aggregate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void Aggregate::inference(FFModel const &ff + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + parallel_is = outputs[0]->parallel_is; + IndexLauncher launcher(AGGREGATE_FWD_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Aggregate)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // gate_preds + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // gate_assign + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // exp_preds + for (int i = 0; i < n; i++) { + launcher.add_region_requirement(RegionRequirement(batch_inputs[i + 4]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[i + 4]->region)); + launcher.add_field(i + 2, FID_DATA); + } + // output + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(n + 2, FID_DATA); + 
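// The region layout mirrors Aggregate::forward(): gate predictions at index
// 0, gate assignments at index 1, the n expert predictions at indices
// 2..n+1, and the aggregated output at index n + 2, so the same forward
// task body can serve both the training and the inference path.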
runtime->execute_index_space(ctx, launcher); +} + + void Aggregate::forward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 749d071310..b0870684ec 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -232,6 +232,55 @@ void AggregateSpec::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void AggregateSpec::inference(FFModel const &ff + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + parallel_is = outputs[0]->parallel_is; + IndexLauncher launcher(AGG_SPEC_FWD_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AggregateSpec)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // gate_preds + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // gate_assign + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // exp_preds + for (int i = 0; i < n; i++) { + launcher.add_region_requirement(RegionRequirement(batch_inputs[i + 4]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[i + 4]->region)); + launcher.add_field(i + 2, FID_DATA); + } + // output + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(n + 2, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + void AggregateSpec::forward_task(Task const *task, std::vector const ®ions, Context ctx, From 5cce0671036d80a4fcae69538101eba5c47e7219 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 14 Dec 2022 02:53:03 +0000 Subject: [PATCH 030/344] fix --- src/ops/aggregate.cc | 14 +++++++------- src/ops/aggregate_spec.cc | 18 ++++++++++-------- src/ops/group_by.cc | 15 ++++++++------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index ce8894599f..16f8d492f9 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -243,7 +243,7 @@ void Aggregate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void Aggregate::inference(FFModel const &ff +void Aggregate::inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs) { ArgumentMap argmap; @@ -275,11 +275,12 @@ void Aggregate::inference(FFModel const &ff launcher.add_field(1, FID_DATA); // exp_preds for (int i = 0; i < n; i++) { - launcher.add_region_requirement(RegionRequirement(batch_inputs[i + 4]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[i + 4]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i + 4]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[i + 4]->region)); launcher.add_field(i + 2, FID_DATA); } // output @@ -292,7 +293,6 @@ void Aggregate::inference(FFModel const &ff runtime->execute_index_space(ctx, launcher); } - void Aggregate::forward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index b0870684ec..8a26b53906 100644 --- 
a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -232,9 +232,10 @@ void AggregateSpec::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void AggregateSpec::inference(FFModel const &ff - std::vector const &batch_inputs, - std::vector const &batch_outputs) { +void AggregateSpec::inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -264,11 +265,12 @@ void AggregateSpec::inference(FFModel const &ff launcher.add_field(1, FID_DATA); // exp_preds for (int i = 0; i < n; i++) { - launcher.add_region_requirement(RegionRequirement(batch_inputs[i + 4]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[i + 4]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i + 4]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[i + 4]->region)); launcher.add_field(i + 2, FID_DATA); } // output diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 6ea01c83a8..79201dc626 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -262,8 +262,8 @@ void Group_by::forward(FFModel const &ff) { } void Group_by::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_inputs, + std::vector const &batch_outputs) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -293,11 +293,12 @@ void Group_by::inference(FFModel const &ff, // output for (int i = 0; i < n; i++) { - launcher.add_region_requirement(RegionRequirement(batch_outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[i]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); launcher.add_field(i + 2, FID_DATA); } From 74ccfaaa5228f29a614eeb434d51c48697a51bfd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 14 Dec 2022 08:06:39 +0000 Subject: [PATCH 031/344] finished debugging moe/mlp --- examples/cpp/inference/MLP_Unify/mlp.cc | 2 +- examples/cpp/inference/mixture_of_experts/moe.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index b96cb9b22c..9da8214c00 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -207,7 +207,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor t = create_mlp(&ff, &mlpConfig, input1, input2); InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); - im.compile_model_and_allocate_buffer(); + // im.compile_model_and_allocate_buffer(); ff.init_operators(); // Start timer diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index c1c1958e91..fe18e705bd 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -127,11 +127,11 @@ void FlexFlow::top_level_task(Task const *task, Tensor t = create_moe(&ff, &moeConfig, input); InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); - im.compile_model_and_allocate_buffer(); + // im.compile_model_and_allocate_buffer(); ff.init_operators(); // Data Loader - DataLoader data_loader(ff, moeConfig, input, ff.label_tensor); + // DataLoader data_loader(ff, moeConfig, input, 
ff.label_tensor); //----------------------------------------------------------------- @@ -154,7 +154,7 @@ void FlexFlow::top_level_task(Task const *task, vector> req = data_generator.get_requests(); int iterations = req.size(); for (int iter = 0; iter < iterations; iter++) { - data_loader.next_batch(ff); + // data_loader.next_batch(ff); runtime->begin_trace(ctx, 111 /*trace_id*/); im.inference((index++) % num_inflight_batches); runtime->end_trace(ctx, 111 /*trace_id*/); From ea29f51b52861c61df9d9e65155970c69d1a4d42 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 14 Dec 2022 17:09:52 +0000 Subject: [PATCH 032/344] [MoE] - Added implementation of encoder --- examples/cpp/inference/mixture_of_experts/moe.cc | 16 ++++++++++++++++ examples/cpp/inference/mixture_of_experts/moe.h | 7 +++++++ 2 files changed, 23 insertions(+) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index fe18e705bd..ef0bf0b207 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -88,6 +88,22 @@ Tensor create_moe(FFModel *model, return coop_output; } +Tensor create_moe_encoder(FFModel *model, + MoeConfig const *moeConfig, + Tensor const &input, + int num_heads, + int kdim, + int vdim) { + Tensor t = model->multihead_attention(input, + input, + input, + moeConfig->hidden_size, + moeConfig->num_attention_heads, + moeConfig->attention_kdim, + moeConfig->attention_vdim); + return create_moe(model, moeConfig, t); +} + void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index 5610de4ab7..d782132bbd 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -23,8 +23,15 @@ using namespace FlexFlow; struct MoeConfig { MoeConfig(void) { // Set default configurations here + hidden_size = 64; + num_attention_heads = 16; + attention_kdim = attention_vdim = hidden_size / num_attention_heads; } std::string dataset_path; + int hidden_size; + int num_attention_heads; + int attention_kdim; + int attention_vdim; }; class DataLoader { From 1cd1fa35d8199efbd501c3b6d0a455d17d8cf0e9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 14 Dec 2022 17:28:14 +0000 Subject: [PATCH 033/344] [MoE][Inference] - Added LayerNorm and residual connection to encoder --- .../cpp/inference/mixture_of_experts/moe.cc | 23 +++--- include/flexflow/ops/element_binary.h | 3 + include/flexflow/ops/layer_norm.h | 3 + src/ops/element_binary.cc | 71 +++++++++++++++++++ src/ops/layer_norm.cc | 44 ++++++++++++ 5 files changed, 136 insertions(+), 8 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index ef0bf0b207..e2e93e0a22 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -94,14 +94,21 @@ Tensor create_moe_encoder(FFModel *model, int num_heads, int kdim, int vdim) { - Tensor t = model->multihead_attention(input, - input, - input, - moeConfig->hidden_size, - moeConfig->num_attention_heads, - moeConfig->attention_kdim, - moeConfig->attention_vdim); - return create_moe(model, moeConfig, t); + std::vector axes = {moeConfig->hidden_size}; + Tensor t = model->layer_norm( + model->add(model->multihead_attention(input, + input, + input, + moeConfig->hidden_size, + 
moeConfig->num_attention_heads, + moeConfig->attention_kdim, + moeConfig->attention_vdim), + input), + axes, + true, + 1e-05); + return model->layer_norm( + model->add(create_moe(model, moeConfig, t), t), axes, true, 1e-05); } void FlexFlow::top_level_task(Task const *task, diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index cfacec50f7..b64539e53c 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -27,6 +27,9 @@ class ElementBinary : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 8273b9ab52..9ab88c1202 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -26,6 +26,9 @@ class LayerNorm : public Op { void init(FFModel const &); void forward(FFModel const &); void backward(FFModel const &); + void inference(FFModel const &, + std::vector const &, + std::vector const &) override; void print_layer(FFModel const &model) { assert(0); } diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 56d132b32c..e41426d039 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -424,6 +424,77 @@ void ElementBinary::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void ElementBinary::inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + if (inplace_a) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + if (has_same_operands) { + // do nothing else + } else { + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + } + } else { + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + if (has_same_operands) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + } else { + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + /* regions[0](I): in1 regions[1](I): in2 diff --git 
a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 5d7fff3410..76d5352944 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -278,6 +278,50 @@ void LayerNorm::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void LayerNorm::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(LAYERNORM_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } + runtime->execute_index_space(ctx, launcher); +} + /* regions[0](I): input regions[1](O): output From c89e3e5880f104d7fb5af60e49baa5cef0b460c9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 15 Dec 2022 14:45:19 -0500 Subject: [PATCH 034/344] [MOE] - Add support for multi-layer encoder --- .../cpp/inference/mixture_of_experts/moe.cc | 55 ++++++++----------- .../cpp/inference/mixture_of_experts/moe.h | 11 +++- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index e2e93e0a22..23dd88506e 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -51,20 +51,10 @@ Tensor create_moe(FFModel *model, // MoE model Tensor gate_preds = model->dense(input, 64, AC_MODE_RELU); - // gate_preds->print("gate_preds"); gate_preds = model->dense(gate_preds, num_exp, AC_MODE_RELU); - // gate_preds->print("gate_preds2"); Tensor topK_output[2]; model->top_k(gate_preds, topK_output, num_select, false); - // topK_output[0]->print("topK_output[0]"); - // topK_output[1]->print("topK_output[1]"); Tensor exp_tensors[num_exp]; - // printf("num_exp: %i, alpha: %f\n", num_exp); - // input->print("input_tensor"); - - // return topK_output[0]; - // exp_tensors[0]->print("exp_tensors[0]"); - // exp_tensors[num_exp-1]->print("exp_tensors[num_exp-1]"); model->group_by(input, topK_output[1], exp_tensors, num_exp, alpha); for (int i=0; idims[2] = 1; // temporary fix to replica dimension being undefined @@ -76,7 +66,8 @@ Tensor create_moe(FFModel *model, agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) agg_inputs[3] = gate_preds; // full gate preds for (int i = 0; i < num_exp; i++) { - Tensor exp_pred = model->dense(exp_tensors[i], OUT_DIM, AC_MODE_RELU); + Tensor exp_pred = + model->dense(exp_tensors[i], moeConfig->hidden_size, AC_MODE_RELU); exp_pred->print("exp_pred"); agg_inputs[i + 4] = model->softmax(exp_pred); } @@ -90,25 +81,26 @@ Tensor create_moe(FFModel *model, Tensor 
create_moe_encoder(FFModel *model, MoeConfig const *moeConfig, - Tensor const &input, - int num_heads, - int kdim, - int vdim) { - std::vector axes = {moeConfig->hidden_size}; - Tensor t = model->layer_norm( - model->add(model->multihead_attention(input, - input, - input, - moeConfig->hidden_size, - moeConfig->num_attention_heads, - moeConfig->attention_kdim, - moeConfig->attention_vdim), - input), - axes, - true, - 1e-05); - return model->layer_norm( - model->add(create_moe(model, moeConfig, t), t), axes, true, 1e-05); + Tensor const &input) { + std::vector axes = {0, 1}; + Tensor x = input; + for (int i = 0; i < moeConfig->num_encoder_layers; i++) { + x = model->layer_norm( + model->add(model->multihead_attention(x, + x, + x, + moeConfig->hidden_size, + moeConfig->num_attention_heads, + moeConfig->attention_kdim, + moeConfig->attention_vdim), + x), + axes, + true, + 1e-05); + x = model->layer_norm( + model->add(create_moe(model, moeConfig, x), x), axes, true, 1e-05); + } + return x; } void FlexFlow::top_level_task(Task const *task, @@ -148,7 +140,8 @@ void FlexFlow::top_level_task(Task const *task, //----------------------------------------------------------------- - Tensor t = create_moe(&ff, &moeConfig, input); + Tensor t = create_moe_encoder(&ff, &moeConfig, input); + t = ff.dense(t, OUT_DIM, AC_MODE_RELU); InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); // im.compile_model_and_allocate_buffer(); ff.init_operators(); diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index d782132bbd..d447c9fd63 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -15,6 +15,13 @@ #include "flexflow/model.h" #define MAX_NUM_SAMPLES 60000 +#define NUM_SAMPLES 60000 +#define TRAIN_SAMPLES 60000 +#define TEST_SAMPLES 00000 +#define MNIST_DIMS 28 * 28 +#define CIFAR_DIMS 3 * 32 * 32 +#define DATA_DIMS MNIST_DIMS +#define OUT_DIM 10 using namespace Legion; using namespace std; @@ -23,15 +30,17 @@ using namespace FlexFlow; struct MoeConfig { MoeConfig(void) { // Set default configurations here - hidden_size = 64; + hidden_size = DATA_DIMS; num_attention_heads = 16; attention_kdim = attention_vdim = hidden_size / num_attention_heads; + num_encoder_layers = 6; } std::string dataset_path; int hidden_size; int num_attention_heads; int attention_kdim; int attention_vdim; + int num_encoder_layers; }; class DataLoader { From b73a8c7e3378eabecce638373dad10e814822382 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Dec 2022 13:06:00 -0500 Subject: [PATCH 035/344] added machine view parameter to inference --- include/flexflow/operator.h | 3 ++- include/flexflow/ops/aggregate.h | 3 ++- include/flexflow/ops/aggregate_spec.h | 3 ++- include/flexflow/ops/attention.h | 3 ++- include/flexflow/ops/element_binary.h | 3 ++- include/flexflow/ops/groupby.h | 3 ++- include/flexflow/ops/layer_norm.h | 3 ++- include/flexflow/ops/linear.h | 3 ++- include/flexflow/ops/topk.h | 3 ++- src/ops/aggregate.cc | 6 ++++-- src/ops/aggregate_spec.cc | 11 ++++++----- src/ops/attention.cc | 6 ++++-- src/ops/element_binary.cc | 11 ++++++----- src/ops/group_by.cc | 6 ++++-- src/ops/layer_norm.cc | 6 ++++-- src/ops/linear.cc | 6 ++++-- src/ops/topk.cc | 6 ++++-- 17 files changed, 54 insertions(+), 31 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 2fe689c284..a276f0fd74 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ 
-190,7 +190,8 @@ class Op { // Pure virtual functions for inference virtual void inference(FFModel const &, std::vector const &, - std::vector const &) { + std::vector const &, + MachineView const *mv = nullptr) { assert(false); }; virtual void print_layer(FFModel const &model) = 0; diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index f93e26057d..6e4dd0b4ac 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -38,7 +38,8 @@ class Aggregate : public Op { void forward(FFModel const &) override; void inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 7cd0ee49c0..816574ced0 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -30,7 +30,8 @@ class AggregateSpec : public Op { void forward(FFModel const &) override; void inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index be54ef21a1..1531708bb7 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -68,7 +68,8 @@ class MultiHeadAttention : public Op { void backward(FFModel const &) override; void inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index b64539e53c..2f081f1b7e 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -29,7 +29,8 @@ class ElementBinary : public Op { void backward(FFModel const &) override; void inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index f118af0a38..44e03ec6e3 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -37,7 +37,8 @@ class Group_by : public Op { void backward(FFModel const &) override; void inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 9ab88c1202..c05461acdf 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -28,7 +28,8 @@ class LayerNorm : public Op { void backward(FFModel const &); void inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) { assert(0); } diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index 510799c43a..ab1c1febc5 100644 --- a/include/flexflow/ops/linear.h +++ 
b/include/flexflow/ops/linear.h @@ -39,7 +39,8 @@ class Linear : public Op { void backward(FFModel const &) override; void inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 1b31df998d..af62f51c93 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -32,7 +32,8 @@ class TopK : public Op { void backward(FFModel const &) override; void inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 16f8d492f9..8c36edf1bc 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -245,12 +245,14 @@ void Aggregate::forward(FFModel const &ff) { void Aggregate::inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_init(ff, argmap); parallel_is = outputs[0]->parallel_is; + size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(AGGREGATE_FWD_TASK_ID, parallel_is, TaskArgument(this, sizeof(Aggregate)), @@ -258,7 +260,7 @@ void Aggregate::inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); // gate_preds launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 8a26b53906..1e3d66fdee 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -232,15 +232,16 @@ void AggregateSpec::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void AggregateSpec::inference( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs) { +void AggregateSpec::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_init(ff, argmap); parallel_is = outputs[0]->parallel_is; + size_t machine_view_hash = mv ? 
mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(AGG_SPEC_FWD_TASK_ID, parallel_is, TaskArgument(this, sizeof(AggregateSpec)), @@ -248,7 +249,7 @@ void AggregateSpec::inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); // gate_preds launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 7b9e711b0c..d9ee14ecb6 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -526,12 +526,14 @@ void MultiHeadAttention::forward(FFModel const &ff) { void MultiHeadAttention::inference( FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); int idx = 0; + size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(ATTENTION_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), @@ -539,7 +541,7 @@ void MultiHeadAttention::inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index e41426d039..feb1862b04 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -424,14 +424,15 @@ void ElementBinary::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void ElementBinary::inference( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs) { +void ElementBinary::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); + size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), @@ -439,7 +440,7 @@ void ElementBinary::inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); if (inplace_a) { assert(batch_outputs[0]->part == batch_inputs[0]->part); assert(batch_outputs[0]->region == batch_inputs[0]->region); diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 79201dc626..b341f004ac 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -263,10 +263,12 @@ void Group_by::forward(FFModel const &ff) { void Group_by::inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + size_t machine_view_hash = mv ? 
mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(GROUP_BY_FWD_TASK_ID, parallel_is, TaskArgument(this, sizeof(Group_by)), @@ -274,7 +276,7 @@ void Group_by::inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); // data launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 76d5352944..bec6f7d651 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -280,11 +280,13 @@ void LayerNorm::forward(FFModel const &ff) { void LayerNorm::inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); + size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(LAYERNORM_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), @@ -292,7 +294,7 @@ void LayerNorm::inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 9413f5f726..435080dbe1 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -369,11 +369,13 @@ void Linear::forward(FFModel const &ff) { void Linear::inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); + size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(LINEAR_FWD_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -381,7 +383,7 @@ void Linear::inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, diff --git a/src/ops/topk.cc b/src/ops/topk.cc index ae43038416..ec6da77a31 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -222,11 +222,13 @@ void TopK::forward(FFModel const &ff) { void TopK::inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); + size_t machine_view_hash = mv ? 
mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(TOPK_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), @@ -234,7 +236,7 @@ void TopK::inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, From 7b1e1624a527717ea27cdb0dadb0a829ff01a95f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Dec 2022 13:13:31 -0500 Subject: [PATCH 036/344] make inference functions return futuremaps --- include/flexflow/operator.h | 10 +++++----- include/flexflow/ops/aggregate.h | 9 +++++---- include/flexflow/ops/aggregate_spec.h | 10 +++++----- include/flexflow/ops/attention.h | 9 +++++---- include/flexflow/ops/element_binary.h | 9 +++++---- include/flexflow/ops/groupby.h | 9 +++++---- include/flexflow/ops/layer_norm.h | 10 +++++----- include/flexflow/ops/linear.h | 10 +++++----- include/flexflow/ops/topk.h | 9 +++++---- src/ops/aggregate.cc | 10 +++++----- src/ops/aggregate_spec.cc | 11 ++++++----- src/ops/attention.cc | 4 ++-- src/ops/element_binary.cc | 11 ++++++----- src/ops/group_by.cc | 10 +++++----- src/ops/layer_norm.cc | 10 +++++----- src/ops/linear.cc | 10 +++++----- src/ops/topk.cc | 10 +++++----- 17 files changed, 84 insertions(+), 77 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index a276f0fd74..94304784c9 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -8,7 +8,7 @@ #include namespace FlexFlow { - +using Legion::FutureMap; extern LegionRuntime::Logger::Category log_measure; class OpMeta; @@ -188,10 +188,10 @@ class Op { virtual void forward(FFModel const &) = 0; virtual void backward(FFModel const &) = 0; // Pure virtual functions for inference - virtual void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) { + virtual FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) { assert(false); }; virtual void print_layer(FFModel const &model) = 0; diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 6e4dd0b4ac..5f73b8f7b3 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -5,6 +5,7 @@ #include "flexflow/ops/aggregate_params.h" namespace FlexFlow { +using Legion::FutureMap; #define AGGREGATE_MAX_K 4 #define AGGREGATE_MAX_BATCH_SIZE 64 @@ -36,10 +37,10 @@ class Aggregate : public Op { char const *name = nullptr); void init(FFModel const &) override; void forward(FFModel const &) override; - void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 816574ced0..21ade888a4 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -5,7 +5,7 @@ #include "flexflow/ops/aggregate_spec_params.h" namespace FlexFlow { - +using Legion::FutureMap; #define AGGREGATE_SPEC_MAX_K 4 #define AGGREGATE_SPEC_MAX_BATCH_SIZE 32 #define AGGREGATE_SPEC_MAX_N 12 @@ -28,10 +28,10 @@ class AggregateSpec : public Op { char const 
*name); void init(FFModel const &) override; void forward(FFModel const &) override; - void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index 1531708bb7..fe870579b3 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -10,6 +10,7 @@ #include "flexflow/ops/attention_params.h" namespace FlexFlow { +using Legion::FutureMap; class MultiHeadAttentionMeta; @@ -66,10 +67,10 @@ class MultiHeadAttention : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 2f081f1b7e..593e229653 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -7,6 +7,7 @@ #include "flexflow/ops/element_binary_params.h" namespace FlexFlow { +using Legion::FutureMap; class ElementBinary : public Op { public: @@ -27,10 +28,10 @@ class ElementBinary : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index 44e03ec6e3..88e1df3ee9 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -6,6 +6,7 @@ #include "flexflow/ops/groupby_params.h" namespace FlexFlow { +using Legion::FutureMap; class GroupByMeta : public OpMeta { public: @@ -35,10 +36,10 @@ class Group_by : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index c05461acdf..9b43a40e3c 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -3,7 +3,7 @@ #include "flexflow/model.h" namespace FlexFlow { - +using Legion::FutureMap; class LayerNormMeta; class LayerNorm : public Op { @@ -26,10 +26,10 @@ class LayerNorm : public Op { void init(FFModel const &); void forward(FFModel const &); void backward(FFModel const &); - void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) 
override; + FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) { assert(0); } diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index ab1c1febc5..6247dc7f76 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -6,7 +6,7 @@ #include "flexflow/ops/linear_params.h" namespace FlexFlow { - +using Legion::FutureMap; class FFModel; class Layer; @@ -37,10 +37,10 @@ class Linear : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index af62f51c93..5c3f12d294 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -6,6 +6,7 @@ #include "flexflow/ops/topk_params.h" namespace FlexFlow { +using Legion::FutureMap; class TopKMeta : public OpMeta { public: @@ -30,10 +31,10 @@ class TopK : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + FutureMap inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 8c36edf1bc..096b483a23 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -243,10 +243,10 @@ void Aggregate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void Aggregate::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap Aggregate::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -292,7 +292,7 @@ void Aggregate::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(n + 2, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Aggregate::forward_task(Task const *task, diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 1e3d66fdee..4ce863cb50 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -232,10 +232,11 @@ void AggregateSpec::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void AggregateSpec::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap + AggregateSpec::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -281,7 +282,7 @@ void AggregateSpec::inference(FFModel const &ff, EXCLUSIVE, 
batch_outputs[0]->region)); launcher.add_field(n + 2, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void AggregateSpec::forward_task(Task const *task, diff --git a/src/ops/attention.cc b/src/ops/attention.cc index d9ee14ecb6..57d945c2b3 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -523,7 +523,7 @@ void MultiHeadAttention::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void MultiHeadAttention::inference( +FutureMap MultiHeadAttention::inference( FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -572,7 +572,7 @@ void MultiHeadAttention::inference( EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(4, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index feb1862b04..9214113ccc 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -424,10 +424,11 @@ void ElementBinary::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void ElementBinary::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap + ElementBinary::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -493,7 +494,7 @@ void ElementBinary::inference(FFModel const &ff, launcher.add_field(2, FID_DATA); } } - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index b341f004ac..33038d69c3 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -261,10 +261,10 @@ void Group_by::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void Group_by::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap Group_by::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -304,7 +304,7 @@ void Group_by::inference(FFModel const &ff, launcher.add_field(i + 2, FID_DATA); } - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Group_by::forward_task(Task const *task, diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index bec6f7d651..02dd519780 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -278,10 +278,10 @@ void LayerNorm::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void LayerNorm::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap LayerNorm::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -321,7 +321,7 @@ void LayerNorm::inference(FFModel const &ff, weights[1]->region)); launcher.add_field(3, FID_DATA); } - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } /* diff --git 
a/src/ops/linear.cc b/src/ops/linear.cc index 435080dbe1..49560fc20f 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -367,10 +367,10 @@ void Linear::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void Linear::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap Linear::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -410,7 +410,7 @@ void Linear::inference(FFModel const &ff, weights[1]->region)); launcher.add_field(3, FID_DATA); } - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Linear::forward_task(Task const *task, diff --git a/src/ops/topk.cc b/src/ops/topk.cc index ec6da77a31..52b54711cb 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -220,10 +220,10 @@ void TopK::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void TopK::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap TopK::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -255,7 +255,7 @@ void TopK::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[1]->region)); launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void TopK::forward_task(Task const *task, From be13e076c9b3fb53ae02359c3e24ba21d6abc520 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 10 Jan 2023 22:59:56 +0000 Subject: [PATCH 037/344] [Experts] initial implementation --- config/config.linux | 2 +- include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 20 +- include/flexflow/operator_params.h | 1 + include/flexflow/ops/experts.h | 79 ++++++ include/flexflow/ops/experts_params.h | 26 ++ src/ops/experts.cc | 365 ++++++++++++++++++++++++++ src/runtime/model.cc | 30 +++ 8 files changed, 522 insertions(+), 2 deletions(-) create mode 100644 include/flexflow/ops/experts.h create mode 100644 include/flexflow/ops/experts_params.h create mode 100644 src/ops/experts.cc diff --git a/config/config.linux b/config/config.linux index 0f819f4031..1cf28a374c 100755 --- a/config/config.linux +++ b/config/config.linux @@ -14,7 +14,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=${BUILD_TYPE:-Release} +BUILD_TYPE=${BUILD_TYPE:-Debug} # set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal). # To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75). 
diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 45e754231d..1a69d43aad 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -143,6 +143,7 @@ enum OperatorType { OP_POW, // https://pytorch.org/docs/stable/generated/torch.pow.html OP_MEAN, // https://pytorch.org/docs/stable/generated/torch.mean.html OP_LAYERNORM, + OP_EXPERTS, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d3e7522b9e..0545cc7fbb 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -54,6 +54,10 @@ enum TaskIDs { ELEMENTUNARY_INIT_TASK_ID, ELEMENTUNARY_FWD_TASK_ID, ELEMENTUNARY_BWD_TASK_ID, + EXPERTS_INIT_TASK_ID, + EXPERTS_FWD_TASK_ID, + EXPERTS_BWD_TASK_ID, + EXPERTS_INF_TASK_ID, CONV2D_INIT_TASK_ID, CONV2D_INIT_PARA_TASK_ID, CONV2D_FWD_TASK_ID, @@ -255,6 +259,7 @@ class Dropout; class ElementBinary; class ElementUnary; class Embedding; +class Experts; class Flat; class Group_by; class LayerNorm; @@ -448,7 +453,7 @@ class FFModel { PoolType type = POOL_MAX, ActiMode activation = AC_MODE_NONE, char const *name = NULL); - // Add a batch_norm layer + // Add a layer_norm layer Tensor layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, @@ -478,6 +483,15 @@ class FFModel { // Add a concat layer Tensor concat(int n, Tensor const *tensors, int axis, char const *name = NULL); + // Add an experts layer + Tensor experts(const Tensor input, + const Tensor indices, + int num_experts, + int experts_start_idx, + int experts_num_layers, + int experts_output_dim_size, + int experts_internal_dim_size, + char const *name = nullptr); // Add a mean layer Tensor mean(const Tensor input, std::vector const &dims, @@ -849,6 +863,10 @@ class FFModel { ElementUnary *>, std::unordered_map, Embedding *>, + std::unordered_map< + std::pair, + ExpertsParams>, + Experts *>, std::unordered_map, Flat *>, std::unordered_map< diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 8bf33a3cfa..8c6a8d0ddf 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -12,6 +12,7 @@ #include "flexflow/ops/element_binary_params.h" #include "flexflow/ops/element_unary_params.h" #include "flexflow/ops/embedding_params.h" +#include "flexflow/ops/experts_params.h" #include "flexflow/ops/flat_params.h" #include "flexflow/ops/groupby_params.h" #include "flexflow/ops/layer_norm_params.h" diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h new file mode 100644 index 0000000000..74e9c0ff02 --- /dev/null +++ b/include/flexflow/ops/experts.h @@ -0,0 +1,79 @@ +#pragma once + +#include "flexflow/model.h" + +namespace FlexFlow { + +class ExpertsMeta : public OpMeta { +public: + ExpertsMeta(FFHandler handler) : OpMeta(handler){}; +}; + +class Experts : public Op { +public: + using Params = ExpertsParams; + using Input = std::pair; + Experts(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name = nullptr); + Experts(FFModel &model, + const ParallelTensor input, + const ParallelTensor indices, + int _num_experts, + int _experts_start_idx, + int _experts_num_layers, + int _experts_output_dim_size, + int _experts_internal_dim_size, + char const *name = nullptr); + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + + void init(FFModel const &) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + FutureMap 
inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + Params get_params() const; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + +public: + int num_experts; + int experts_start_idx; + int experts_num_layers; + int experts_output_dim_size; + int experts_internal_dim_size; +}; + +}; // namespace FlexFlow diff --git a/include/flexflow/ops/experts_params.h b/include/flexflow/ops/experts_params.h new file mode 100644 index 0000000000..8f0cee4959 --- /dev/null +++ b/include/flexflow/ops/experts_params.h @@ -0,0 +1,26 @@ +#pragma once + +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct ExpertsParams { + bool is_valid( + std::pair const &) const; + int num_experts; + int experts_start_idx; + int experts_num_layers; + int experts_output_dim_size; + int experts_internal_dim_size; +}; + +bool operator==(ExpertsParams const &, ExpertsParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::ExpertsParams const &) const; +}; +} // namespace std diff --git a/src/ops/experts.cc b/src/ops/experts.cc new file mode 100644 index 0000000000..438d9179f5 --- /dev/null +++ b/src/ops/experts.cc @@ -0,0 +1,365 @@ +/* Copyright 2022 CMU + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/experts.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +bool operator==(ExpertsParams const &lhs, ExpertsParams const &rhs) { + return lhs.num_experts == rhs.num_experts && + lhs.experts_start_idx == rhs.experts_start_idx && + lhs.experts_num_layers == rhs.experts_num_layers && + lhs.experts_output_dim_size == rhs.experts_output_dim_size && + lhs.experts_internal_dim_size == rhs.experts_internal_dim_size; +} + +bool ExpertsParams::is_valid( + std::pair const &input) const { + if (!input.first.is_valid()) + return false; + if (!input.second.is_valid()) + return false; + if (input.first.num_dims != input.second.num_dims + 1) + return false; + if (input.second.data_type != DT_INT32 && input.second.data_type != DT_INT64) + return false; + for (int i = 0; i < input.second.num_dims; i++) + if (input.second.dims[i] != input.first.dims[i + 1]) + return false; + return true; +} + +ExpertsParams Experts::get_params() const { + ExpertsParams params; + params.num_experts = num_experts; + params.experts_start_idx = experts_start_idx; + params.experts_num_layers = experts_num_layers; + params.experts_output_dim_size = experts_output_dim_size; + params.experts_internal_dim_size = experts_internal_dim_size; + return params; +} + +Tensor FFModel::experts(const Tensor input, + const Tensor indices, + int num_experts, + int experts_start_idx, + int experts_num_layers, + int experts_output_dim_size, + int experts_internal_dim_size, + char const *name) { + Layer *e = new Layer(this, + OP_EXPERTS, + DT_FLOAT, + name, + 2 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + input, + indices); + assert(input->num_dims == indices->num_dims + 1); + for (int i = 0; i < indices->num_dims; i++) + assert(input->dims[i + 1] == indices->dims[i]); + assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); + int dims[MAX_TENSOR_DIM]; + int numdim = input->num_dims; + for (int i = 1; i < input->num_dims; i++) + dims[i] = input->dims[i]; + dims[0] = experts_output_dim_size; + e->outputs[0] = create_tensor_legion_ordering( + numdim, dims, input->data_type, e, 0, true /*create_grad*/); + e->add_int_property("num_experts", num_experts); + e->add_int_property("experts_start_idx", experts_start_idx); + e->add_int_property("experts_num_layers", experts_num_layers); + e->add_int_property("experts_output_dim_size", experts_output_dim_size); + e->add_int_property("experts_internal_dim_size", experts_internal_dim_size); + layers.push_back(e); + return e->outputs[0]; +} + +Op *Experts::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("num_experts", value); + int num_experts = value; + layer->get_int_property("experts_start_idx", value); + int experts_start_idx = value; + layer->get_int_property("experts_num_layers", value); + int experts_num_layers = value; + layer->get_int_property("experts_output_dim_size", value); + int experts_output_dim_size = value; + layer->get_int_property("experts_internal_dim_size", value); + int experts_internal_dim_size = value; + return new 
Experts(model, + inputs[0], + inputs[1], + num_experts, + experts_start_idx, + experts_num_layers, + experts_output_dim_size, + experts_internal_dim_size, + layer->name); +} + +Experts::Experts(FFModel &model, + ExpertsParams const ¶ms, + std::pair const &inputs, + char const *name) + : Experts(model, + inputs.first, + inputs.second, + params.num_experts, + params.experts_start_idx, + params.experts_num_layers, + params.experts_output_dim_size, + params.experts_internal_dim_size, + name) {} + +Experts::Experts(FFModel &model, + const ParallelTensor input, + const ParallelTensor indices, + int _num_experts, + int _experts_start_idx, + int _experts_num_layers, + int _experts_output_dim_size, + int _experts_internal_dim_size, + char const *name) + : Op(model, + OP_EXPERTS, + DT_FLOAT, + name, + 2 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + input, + indices), + num_experts(_num_experts), experts_start_idx(_experts_start_idx), + experts_num_layers(_experts_num_layers), + experts_output_dim_size(_experts_output_dim_size), + experts_internal_dim_size(_experts_internal_dim_size) { + assert(input->num_dims == indices->num_dims + 1); + assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); + for (int i = 0; i < indices->num_dims; i++) + assert(input->dims[i + 1] == indices->dims[i]); + // Assume that we don't parallelize the channel dim + assert(input->dims[0].degree == 1); + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < input->num_dims; i++) + dims[i] = input->dims[i]; + dims[0].size = experts_output_dim_size; + numOutputs = 1; + outputs[0] = model.create_parallel_tensor_legion_ordering( + input->num_dims, dims, input->data_type, this); +} + +void Experts::serialize(Legion::Serializer &sez) const { + ExpertsParams params = get_params(); + sez.serialize(params.num_experts); + sez.serialize(params.experts_start_idx); + sez.serialize(params.experts_num_layers); + sez.serialize(params.experts_output_dim_size); + sez.serialize(params.experts_internal_dim_size); +} + +using PCG::Node; +Node Experts::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + int num_experts, experts_start_idx, experts_num_layers, + experts_output_dim_size, experts_internal_dim_size; + dez.deserialize(num_experts); + dez.deserialize(experts_start_idx); + dez.deserialize(experts_num_layers); + dez.deserialize(experts_output_dim_size); + dez.deserialize(experts_internal_dim_size); + + ExpertsParams params; + params.num_experts = num_experts; + params.experts_start_idx = experts_start_idx; + params.experts_num_layers = experts_num_layers; + params.experts_output_dim_size = experts_output_dim_size; + params.experts_internal_dim_size = experts_internal_dim_size; + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + +Op *Experts::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + ExpertsParams params = get_params(); + return new Experts(ff, params, {inputs[0], inputs[1]}, this->name); +} + +void Experts::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(EXPERTS_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Experts)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + 
launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *Experts::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Experts const *bmm = (Experts *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + ExpertsMeta *m = new ExpertsMeta(handle); + return m; +} + +void Experts::forward(FFModel const &ff) { + assert(false && "Experts is designed for inference only"); +} + +void Experts::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(false && "Experts is designed for inference only"); +} + +void Experts::backward(FFModel const &ff) { + assert(false && "Experts is designed for inference only"); +} + +void Experts::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(false && "Experts is designed for inference only"); +} + +FutureMap Experts::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + size_t machine_view_hash = mv ? 
mv->hash() : outputs[0]->machine_view.hash(); + IndexLauncher launcher(EXPERTS_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Experts::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // TODO: to be implemented +} + +void Experts::print_layer(FFModel const &ff) { + return; +} + +bool Experts::measure_operator_cost(Simulator *sim, + MachineView const &c, + CostMetrics &cost_metrics) const { + // This is an inference only operator + assert(false); + return false; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::ExpertsParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.num_experts); + hash_combine(key, params.experts_start_idx); + hash_combine(key, params.experts_num_layers); + hash_combine(key, params.experts_output_dim_size); + hash_combine(key, params.experts_internal_dim_size); + return key; +} +}; // namespace std diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ad8a01563e..7350735314 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -35,6 +35,7 @@ #include "flexflow/ops/element_binary.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/embedding.h" +#include "flexflow/ops/experts.h" #include "flexflow/ops/flat.h" #include "flexflow/ops/fused.h" #include "flexflow/ops/groupby.h" @@ -3755,6 +3756,35 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "ElementWiseBinary Backward Task"); } + // Experts + { + TaskVariantRegistrar registrar(EXPERTS_INIT_TASK_ID, "Experts Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Experts Init Task"); + } + { + TaskVariantRegistrar registrar(EXPERTS_FWD_TASK_ID, "Experts Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Experts Forward Task"); + } + { + TaskVariantRegistrar registrar(EXPERTS_BWD_TASK_ID, "Experts Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Experts Backward Task"); + } + { + TaskVariantRegistrar registrar(EXPERTS_INF_TASK_ID, "Experts Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Experts Inference Task"); + } // Cast { TaskVariantRegistrar registrar(CAST_INIT_TASK_ID, "Cast Init"); From 9a32a6eed15580106218c583bd2c4ecde0487e74 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 10 Jan 2023 23:01:29 +0000 Subject: [PATCH 038/344] [Experts] undo a change --- config/config.linux | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/config/config.linux b/config/config.linux index 1cf28a374c..0f819f4031 100755 --- a/config/config.linux +++ b/config/config.linux @@ -14,7 +14,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=${BUILD_TYPE:-Debug} +BUILD_TYPE=${BUILD_TYPE:-Release} # set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal). # To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75). From 285b2e971d2364091f2926e053046d4683d89ec5 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 14 Jan 2023 12:19:35 -0600 Subject: [PATCH 039/344] [MOE] update moe cpp example and aggregate implementation (#555) * [MOE] update moe cpp example and aggregate implementation * [MOE] bug fixes to make the MOE example work --- .../cpp/inference/mixture_of_experts/moe.cc | 22 +++++++++---------- src/ops/aggregate.cc | 21 ++++++++++-------- src/ops/experts.cc | 16 ++++++++------ src/runtime/ffconst_utils.cc | 2 ++ 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 23dd88506e..459ffb42b5 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -31,7 +31,7 @@ using namespace Legion; LegionRuntime::Logger::Category log_app("MoE"); -int num_exp = 5; +int num_exp = 4; int num_select = 2; void parse_input_args(char **argv, int argc, MoeConfig &config) { @@ -54,22 +54,20 @@ Tensor create_moe(FFModel *model, gate_preds = model->dense(gate_preds, num_exp, AC_MODE_RELU); Tensor topK_output[2]; model->top_k(gate_preds, topK_output, num_select, false); - Tensor exp_tensors[num_exp]; - model->group_by(input, topK_output[1], exp_tensors, num_exp, alpha); - for (int i=0; idims[2] = 1; // temporary fix to replica dimension being undefined - exp_tensors[i]->print("exp_tensors[i]"); - } Tensor agg_inputs[num_exp + 4]; agg_inputs[0] = model->softmax(topK_output[0]); // gate preds agg_inputs[1] = topK_output[1]; // gate assign agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) agg_inputs[3] = gate_preds; // full gate preds - for (int i = 0; i < num_exp; i++) { - Tensor exp_pred = - model->dense(exp_tensors[i], moeConfig->hidden_size, AC_MODE_RELU); - exp_pred->print("exp_pred"); - agg_inputs[i + 4] = model->softmax(exp_pred); + for (int i = 0; i < num_exp /*number of experts layers*/; i++) { + Tensor exp_pred = model->experts(gate_preds, + topK_output[1], + 32 /*number of experts*/, + 32 * i /*expert start index*/, + 1 /*number of linear layers*/, + moeConfig->hidden_size /*output_size*/, + moeConfig->hidden_size /*internal_size*/); + agg_inputs[i + 4] = exp_pred; } for (int i = 0; i < num_exp + 4; i++) { agg_inputs[i]->print("agg_inputs[i]"); diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 096b483a23..941c6da8cb 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -55,10 +55,8 @@ Tensor FFModel::aggregate( int num_dim = inputs[4]->num_dims; // Set output shape int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < num_dim - 1; i++) { + for (int i = 0; i < num_dim; i++) dims[i] = inputs[4]->dims[i]; - } - dims[num_dim - 1] = inputs[0]->dims[num_dim - 1]; li->outputs[0] = create_tensor_legion_ordering( num_dim, dims, DT_FLOAT, li, 0, true /*create_grad*/); } @@ -143,11 +141,16 @@ Aggregate::Aggregate(FFModel &model, } // Set output shape ParallelDim dims[MAX_TENSOR_DIM]; +<<<<<<< HEAD for (int i = 0; i < num_dim - 1; i++) 
{ dims[i] = inputs[4]->dims[i]; } dims[num_dim - 2] = inputs[0]->dims[num_dim - 2]; dims[num_dim - 1] = inputs[0]->dims[num_dim - 1]; +======= + for (int i = 0; i < num_dim; i++) + dims[i] = inputs[4]->dims[i]; +>>>>>>> 99a89a9b... [MOE] update moe cpp example and aggregate implementation (#555) numOutputs = 1; outputs[0] = model.create_parallel_tensor_legion_ordering( num_dim, dims, DT_FLOAT, this); @@ -204,7 +207,7 @@ void Aggregate::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(AGGREGATE_FWD_TASK_ID, parallel_is, - TaskArgument(this, sizeof(Aggregate)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -255,7 +258,7 @@ FutureMap Aggregate::inference(FFModel const &ff, size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(AGGREGATE_FWD_TASK_ID, parallel_is, - TaskArgument(this, sizeof(Aggregate)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -299,10 +302,10 @@ void Aggregate::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - int n = ((Aggregate *)task->args)->n; - - assert((int)regions.size() == n + 3); - assert((int)task->regions.size() == n + 3); + assert(regions.size() == task->regions.size()); + int n = regions.size() - 3; + // FIXME: skip the aggregate computation for now + return; AggregateMeta const *m = *((AggregateMeta **)task->local_args); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 438d9179f5..28405d3dd7 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -86,9 +86,9 @@ Tensor FFModel::experts(const Tensor input, 1 /*outputs*/, input, indices); - assert(input->num_dims == indices->num_dims + 1); - for (int i = 0; i < indices->num_dims; i++) - assert(input->dims[i + 1] == indices->dims[i]); + assert(input->num_dims == indices->num_dims); + for (int i = 1; i < indices->num_dims; i++) + assert(input->dims[i] == indices->dims[i]); assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); int dims[MAX_TENSOR_DIM]; int numdim = input->num_dims; @@ -168,12 +168,14 @@ Experts::Experts(FFModel &model, experts_num_layers(_experts_num_layers), experts_output_dim_size(_experts_output_dim_size), experts_internal_dim_size(_experts_internal_dim_size) { - assert(input->num_dims == indices->num_dims + 1); + assert(input->num_dims == indices->num_dims); assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); - for (int i = 0; i < indices->num_dims; i++) - assert(input->dims[i + 1] == indices->dims[i]); - // Assume that we don't parallelize the channel dim + for (int i = 1; i < indices->num_dims; i++) + assert(input->dims[i] == indices->dims[i]); + // Assume that we don't parallelize the channel dim of input + // nor the expert_assigned dim of indices assert(input->dims[0].degree == 1); + assert(indices->dims[0].degree == 1); ParallelDim dims[MAX_TENSOR_DIM]; for (int i = 0; i < input->num_dims; i++) dims[i] = input->dims[i]; diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index a604d0bd4b..901f72e816 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -45,6 +45,8 @@ std::string get_operator_type_name(OperatorType type) { return "Split"; case OP_EMBEDDING: return "Embedding"; + case OP_EXPERTS: + return "Experts"; case OP_GROUP_BY: return "Group_by"; case OP_CACHE: From 521363421939092f0e3774d11db29ad069781c40 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 17 Jan 2023 21:00:48 -0500 
Subject: [PATCH 040/344] [inference] removed gshard and transformer files --- CMakeLists.txt | 10 - .../cpp/inference/G-Shard-MoE/CMakeLists.txt | 12 - examples/cpp/inference/G-Shard-MoE/Makefile | 39 -- examples/cpp/inference/G-Shard-MoE/g_shard.cc | 196 --------- .../cpp/inference/Transformer/CMakeLists.txt | 19 - examples/cpp/inference/Transformer/Makefile | 35 -- .../cpp/inference/Transformer/transformer.cc | 397 ------------------ .../cpp/inference/Transformer/transformer.cu | 58 --- .../cpp/inference/Transformer/transformer.h | 54 --- 9 files changed, 820 deletions(-) delete mode 100644 examples/cpp/inference/G-Shard-MoE/CMakeLists.txt delete mode 100644 examples/cpp/inference/G-Shard-MoE/Makefile delete mode 100644 examples/cpp/inference/G-Shard-MoE/g_shard.cc delete mode 100644 examples/cpp/inference/Transformer/CMakeLists.txt delete mode 100644 examples/cpp/inference/Transformer/Makefile delete mode 100644 examples/cpp/inference/Transformer/transformer.cc delete mode 100644 examples/cpp/inference/Transformer/transformer.cu delete mode 100644 examples/cpp/inference/Transformer/transformer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b796308aa3..b6a8fcec4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -335,8 +335,6 @@ option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) option(FF_BUILD_SPLIT_TEST "build split test example" OFF) option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) -option(FF_BUILD_TRANSFORMER_INFERENCE "build transformer inference example" OFF) -option(FF_BUILD_G_SHARD_INFERENCE "build G-Shard inference example" OFF) option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF) option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) @@ -427,14 +425,6 @@ if(FF_BUILD_MOE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) add_subdirectory(examples/cpp/inference/mixture_of_experts) endif() -# if(FF_BUILD_TRANSFORMER_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) -# add_subdirectory(examples/cpp/inference/Transformer) -# endif() - -# if(FF_BUILD_G_SHARD_INFERENCE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) -# add_subdirectory(examples/cpp/inference/G-Shard-MoE) -# endif() - # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/examples/cpp/inference/G-Shard-MoE/CMakeLists.txt b/examples/cpp/inference/G-Shard-MoE/CMakeLists.txt deleted file mode 100644 index daab2dc49b..0000000000 --- a/examples/cpp/inference/G-Shard-MoE/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(FlexFlowExampleInferenceGShard) -set(project_target g_shard) - -set(CPU_SRC - ${FLEXFLOW_CPP_DRV_SRC} - g_shard.cc) - -cuda_add_executable(${project_target} ${CPU_SRC}) -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) diff --git a/examples/cpp/inference/G-Shard-MoE/Makefile b/examples/cpp/inference/G-Shard-MoE/Makefile deleted file mode 100644 index f64e670e05..0000000000 --- a/examples/cpp/inference/G-Shard-MoE/Makefile +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2021 CMU, Facebook, LANL, MIT, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Flags for directing the runtime makefile what to include -DEBUG ?= 1 # Include debugging symbols -MAX_DIM ?= 5 # Maximum number of dimensions -OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level -USE_CUDA ?= 0 # Include CUDA support (requires CUDA) -USE_GASNET ?= 0 # Include GASNet support (requires GASNet) -USE_HDF ?= 0 # Include HDF5 support (requires HDF5) -ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) -USE_HIP ?= 1 # Include HIP support (requires HIP) -HIP_TARGET ?= ROCM -USE_GPU_REDUCTIONS ?= 0 - -# Put the binary file name here -OUTFILE ?= g_shard -# List all the application source files here -GEN_SRC = g_shard.cc -GEN_GPU_SRC = -GEN_HIP_SRC = - -ifndef FF_HOME -$(error FF_HOME variable is not defined, aborting build) -endif - -include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/G-Shard-MoE/g_shard.cc b/examples/cpp/inference/G-Shard-MoE/g_shard.cc deleted file mode 100644 index 95c0b3f1dd..0000000000 --- a/examples/cpp/inference/G-Shard-MoE/g_shard.cc +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright 2021 Stanford University - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/model.h" -#include -#include -#include -using namespace Legion; -using namespace FlexFlow; - -// embed_dim=768, -// num_heads=12, -// kdim=None, -// vdim=None, -// dropout=0.1, -// bias=True, -// add_bias_kv=False, -// add_zero_attn=False, -// self_attention=True, -// encoder_decoder_attention=False, -// q_noise=0.0, -// qn_block_size=8, - -// Tensor FFModel::multihead_attention(const Tensor query, -// const Tensor key, -// const Tensor value, -// int embed_dim, -// int num_heads, -// int kdim, -// int vdim, -// float dropout, -// bool bias, -// bool add_bias_kv, -// bool add_zero_attn, -// Initializer *kernel_initializer, -// char const *name) { - - -void create_attention_decoder(FFModel *model, - Tensor const &input1, - Tensor const &input2, - Tensor &output1, - Tensor &output2, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout=0.1, - bool normalize_before=false, - bool is_moe=false) { - - std::vector axes = {embed_dim}; - Tensor x = normalize_before ? model->layer_norm(input1 /*const Tensor input*/, axes /*std::vector const &axes*/, true /*elementwise_affine*/, 1e-05 /*eps*/) : input1; - x = model->add(model->dropout(model->multihead_attention(x, x, x, embed_dim, num_heads, embed_dim, embed_dim, dropout, true /*bias*/, false /*add_bias_kv*/, false /*add_zero_attn*/), dropout), x); - //x = normalize_before ? 
x : model->layer_norm(x, axes, true, 1e-05); - x = model->layer_norm(x, axes, true, 1e-05); - - if(!is_moe) { - x = model->dropout(model->dense(model->dropout(model->dense(x, 3072, AC_MODE_GELU, true /*bias*/), dropout), embed_dim, AC_MODE_NONE, true /*bias*/), dropout); - } else { - // x - seq_len, batch_size, model_dim - // x = x.transpose(0, 1) # batch_size, seq_len, model_dim - // x, l_aux = self.moe_layer(x) - // x = x.transpose(0, 1) # seq_len, batch_size, model_dim - //x = self.residual_connection(x, residual) - - //if not self.normalize_before: - // x = self.final_layer_norm(x) - x = normalize_before ? x : model->layer_norm(x, axes, true, 1e-05); - float alpha = 2.0f; // factor overhead tensor size for imbalance - float lambda = 0.04f; // multiplier for load balance term - int num_exp = 128; - int num_select = 2; - - // MoE model - Tensor input = x; - Tensor gate_preds = model->dense(x, num_exp, AC_MODE_RELU); - Tensor topK_output[2]; - model->top_k(gate_preds, topK_output, num_select, false); - - Tensor exp_tensors[num_exp]; - model->group_by(input, topK_output[1], exp_tensors, num_exp, alpha); - - Tensor agg_inputs[num_exp + 4]; - agg_inputs[0] = model->softmax(topK_output[0]); // gate preds - agg_inputs[1] = topK_output[1]; // gate assign - agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) - agg_inputs[3] = gate_preds; // full gate preds - for (int i = 0; i < num_exp; i++) { - Tensor exp_pred = model->dense(exp_tensors[i], embed_dim, AC_MODE_RELU); - agg_inputs[i + 4] = model->softmax(exp_pred); - } - } - - // Tensor t1 = - // model->add(model->multihead_attention( - // input1, input1, input1, hidden_dim, num_heads, kdim, vdim), - // input1); - // t1 = model->dense(model->dense(t1, hidden_dim, AC_MODE_RELU, false /*bias*/), - // hidden_dim, - // AC_MODE_NONE, - // false /*bias*/); - // Tensor t2 = - // model->add(model->multihead_attention( - // input2, input2, input2, hidden_dim, num_heads, kdim, vdim), - // input2); - // t2 = model->add( - // model->multihead_attention(t2, t1, t1, hidden_dim, num_heads, kdim, vdim), - // t2); - // t2 = model->dense(model->dense(t2, hidden_dim, AC_MODE_RELU, false /*bias*/), - // hidden_dim, - // AC_MODE_NONE, - // false /*bias*/); - // output1 = t1; - // output2 = t2; -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - FFConfig ffConfig; - fprintf(stderr, - "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", - ffConfig.batchSize, - ffConfig.workersPerNode, - ffConfig.numNodes); - FFModel ff(ffConfig); - - std::vector hidden_dims = { - 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; - Tensor input1, input2; - { - int const dims[] = {ffConfig.batchSize, 1024}; - input1 = ff.create_tensor<2>(dims, DT_FLOAT); - input2 = ff.create_tensor<2>(dims, DT_FLOAT); - } - Tensor t1 = input1, t2 = input2; - for (size_t i = 0; i < hidden_dims.size(); i++) { - int const dims[] = {hidden_dims[i], t1->dims[0]}; - ActiMode acti_mode = - (i + 1 == hidden_dims.size()) ? 
AC_MODE_NONE : AC_MODE_RELU; - t1 = ff.dense(t1, hidden_dims[i], acti_mode, false); - t2 = ff.dense(t2, hidden_dims[i], acti_mode, false); - } - Tensor t = ff.add(t1, t2); - t = ff.softmax(t); - Optimizer *optimizer = new SGDOptimizer(&ff, 0.001f); - std::vector metrics; - metrics.push_back(METRICS_ACCURACY); - metrics.push_back(METRICS_SPARSE_CATEGORICAL_CROSSENTROPY); - ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics, CompMode::COMP_MODE_INFERENCE); - ff.init_operators(); - // Start timer - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_start = Realm::Clock::current_time_in_microseconds(); - //for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { - ff.reset_metrics(); - int iterations = 128; - for (int iter = 0; iter < iterations; iter++) { - runtime->begin_trace(ctx, 111 /*trace_id*/); - ff.forward(); - runtime->end_trace(ctx, 111 /*trace_id*/); - } - // End timer - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_end = Realm::Clock::current_time_in_microseconds(); - double run_time = 1e-6 * (ts_end - ts_start); - printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n", - run_time, - ffConfig.batchSize * 128 * ffConfig.epochs / run_time); -} - -void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/Transformer/CMakeLists.txt b/examples/cpp/inference/Transformer/CMakeLists.txt deleted file mode 100644 index ac46d77f32..0000000000 --- a/examples/cpp/inference/Transformer/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(FlexFlowExample_Transformer) -set(project_target transformer) - -set(CPU_SRC - ${FLEXFLOW_CPP_DRV_SRC} - transformer.cc - transformer.h) - -set(GPU_SRC -transformer.cu) - -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) - -set(BIN_DEST "bin") -install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/Transformer/Makefile b/examples/cpp/inference/Transformer/Makefile deleted file mode 100644 index 911f234c45..0000000000 --- a/examples/cpp/inference/Transformer/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2021 Facebook, Stanford, LANL -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Flags for directing the runtime makefile what to include -DEBUG ?= 1 # Include debugging symbols -MAX_DIM ?= 4 # Maximum number of dimensions -OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level -USE_CUDA ?= 1 # Include CUDA support (requires CUDA) -USE_GASNET ?= 0 # Include GASNet support (requires GASNet) -USE_HDF ?= 1 # Include HDF5 support (requires HDF5) -ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) - -# Put the binary file name here -OUTFILE ?= transformer -# List all the application source files here -GEN_SRC = transformer.cc -GEN_GPU_SRC = transformer.cu - -ifndef FF_HOME -$(error FF_HOME variable is not defined, aborting build) -endif - -include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/Transformer/transformer.cc b/examples/cpp/inference/Transformer/transformer.cc deleted file mode 100644 index 38675577cc..0000000000 --- a/examples/cpp/inference/Transformer/transformer.cc +++ /dev/null @@ -1,397 +0,0 @@ -/* Copyright 2021 Facebook - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "transformer.h" - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("Transformer"); - -Tensor create_emb(FFModel *model, - Tensor const &input, - int input_dim, - int output_dim, - int idx) { - float range = sqrt(1.0f / input_dim); - Initializer *embed_init = new UniformInitializer(std::rand(), -range, range); - return model->embedding( - input, input_dim, output_dim, AGGR_MODE_SUM, NULL, embed_init); -} - -Tensor create_attention_encoder(FFModel *model, - Tensor const &input, - int hidden_dim, - int num_heads, - int kdim, - int vdim) { - Tensor t = model->multihead_attention( - input, input, input, hidden_dim, num_heads, kdim, vdim); - return model->dense(model->dense(t, hidden_dim, AC_MODE_RELU, false /*bias*/), - hidden_dim, - AC_MODE_NONE, - false /*bias*/); -} - -void create_attention_encoder_decoder(FFModel *model, - Tensor const &input1, - Tensor const &input2, - Tensor &output1, - Tensor &output2, - int hidden_dim, - int num_heads, - int kdim, - int vdim) { - Tensor t1 = - model->add(model->multihead_attention( - input1, input1, input1, hidden_dim, num_heads, kdim, vdim), - input1); - t1 = model->dense(model->dense(t1, hidden_dim, AC_MODE_RELU, false /*bias*/), - hidden_dim, - AC_MODE_NONE, - false /*bias*/); - Tensor t2 = - model->add(model->multihead_attention( - input2, input2, input2, hidden_dim, num_heads, kdim, vdim), - input2); - t2 = model->add( - model->multihead_attention(t2, t1, t1, hidden_dim, num_heads, kdim, vdim), - t2); - t2 = model->dense(model->dense(t2, hidden_dim, AC_MODE_RELU, false /*bias*/), - hidden_dim, - AC_MODE_NONE, - false /*bias*/); - output1 = t1; - output2 = t2; -} - -TransformerConfig::TransformerConfig(void) { - hidden_size = 1024; - embedding_size = 1024; - num_heads = 16; - num_layers = 12; - sequence_length = 512; -} - -void parse_input_args(char **argv, int argc, TransformerConfig &config) { - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "--num-layers")) { - 
config.num_layers = atoi(argv[++i]); - continue; - } - if (!strcmp(argv[i], "--embedding-size")) { - config.embedding_size = atoi(argv[++i]); - continue; - } - if (!strcmp(argv[i], "--hidden-size")) { - config.hidden_size = atoi(argv[++i]); - continue; - } - if (!strcmp(argv[i], "--num-heads")) { - config.num_heads = atoi(argv[++i]); - continue; - } - if (!strcmp(argv[i], "--sequence-length")) { - config.sequence_length = atoi(argv[++i]); - continue; - } - } -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - FFConfig ffConfig; - TransformerConfig tfConfig; - { - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, tfConfig); - log_app.print("batchSize(%d) workersPerNodes(%d) numNodes(%d)", - ffConfig.batchSize, - ffConfig.workersPerNode, - ffConfig.numNodes); - log_app.print("Hidden Size(%d)", tfConfig.hidden_size); - log_app.print("Embedding Vocab Size(%d)", tfConfig.embedding_size); - log_app.print("Number of Heads(%d)", tfConfig.num_heads); - log_app.print("Number of Layers(%d)", tfConfig.num_layers); - log_app.print("Sequence Length(%d)", tfConfig.sequence_length); - } - FFModel ff(ffConfig); - Tensor input; - { - int const dims[] = { - ffConfig.batchSize, tfConfig.sequence_length, tfConfig.hidden_size}; - input = ff.create_tensor<3>(dims, DT_FLOAT); - } - // Tensor t = create_emb(&ff, input, tfConfig.embedding_size, - // tfConfig.hidden_size); Tensor input1 = input, input2 = input; Tensor t1, - // t2; - Tensor t = input; - for (int i = 0; i < tfConfig.num_layers; i++) { - t = create_attention_encoder(&ff, - t, - tfConfig.hidden_size, - tfConfig.num_heads, - tfConfig.hidden_size / tfConfig.num_heads, - tfConfig.hidden_size / tfConfig.num_heads); - // create_attention_encoder_decoder(&ff, input1, input2, t1, t2, - // tfConfig.hidden_size, tfConfig.num_heads, - // tfConfig.hidden_size / tfConfig.num_heads, - // tfConfig.hidden_size / tfConfig.num_heads); - // input1 = t1; - // input2 = t2; - } - t = ff.dense(t, 1, AC_MODE_NONE, false /*bias*/); - Optimizer *optimizer = new SGDOptimizer(&ff, 0.01f); - std::vector metrics; - // metrics.push_back(METRICS_ACCURACY); - // metrics.push_back(METRICS_MEAN_SQUARED_ERROR); - ff.compile(optimizer, LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics, CompMode::COMP_MODE_INFERENCE); - // Data Loader - DataLoader loader(ff, tfConfig, input, ff.label_tensor); - loader.next_batch(ff); - loader.reset(); - ff.init_operators(); - - // Start timer - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - log_app.print("Warmup finished...Start timer..."); - log_app.print("Num. epochs = %d", ffConfig.epochs); - log_app.print("Num. 
iterations/epoch = %d", - loader.num_samples / ffConfig.batchSize); - printf("parameters.size() = %lu\n", ff.parameters.size()); - double ts_start = Realm::Clock::current_time_in_microseconds(); - int epoch=0; - //for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { - loader.reset(); - ff.reset_metrics(); - int iterations = loader.num_samples / ffConfig.batchSize; - for (int iter = 0; iter < iterations; iter++) { - // Only load data once for random input - if (iter == 0 && epoch == 0) - loader.next_batch(ff); - runtime->begin_trace(ctx, 111 /*trace_id*/); - ff.forward(); - //ff.zero_gradients(); - //ff.backward(); - //ff.update(); - runtime->end_trace(ctx, 111 /*trace_id*/); - } - //} - // End timer - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_end = Realm::Clock::current_time_in_microseconds(); - double run_time = 1e-6 * (ts_end - ts_start); - printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n", - run_time, - loader.num_samples * ffConfig.epochs / run_time); -} - -DataLoader::DataLoader(FFModel &ff, - TransformerConfig const &tf, - Tensor const &_input, - Tensor const &_label) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - num_samples = 0; - log_app.print("Use random dataset..."); - num_samples = - ff.config.batchSize * ff.config.workersPerNode * ff.config.numNodes; - log_app.print("Number of random samples = %d\n", num_samples); - return; - { - batch_input = _input; - int const dims[] = {num_samples, tf.sequence_length, tf.hidden_size}; - full_input = ff.create_tensor<3>(dims, DT_FLOAT); - } - { - batch_label = _label; - int const dims[] = {num_samples, tf.sequence_length, 1}; - full_label = ff.create_tensor<3>(dims, DT_FLOAT); - } - // Load entire dataset - // TODO: Use index launcher instead of task launcher - TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, TaskArgument(NULL, 0)); - // regions[0]: full_sparse_input - launcher.add_region_requirement( - RegionRequirement(full_input->parallel_tensor->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->parallel_tensor->region, - MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - // regions[1]: full_label - launcher.add_region_requirement( - RegionRequirement(full_label->parallel_tensor->region, - WRITE_ONLY, - EXCLUSIVE, - full_label->parallel_tensor->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(1, FID_DATA); - runtime->execute_task(ctx, launcher); -} - -void DataLoader::load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // Note that these instances are in ZCM, can only use - // TensorAccessorW with readOutput flag - AccessorWO const acc_input(regions[0], FID_DATA); - AccessorWO const acc_label(regions[1], FID_DATA); - Rect<3> rect_input = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<3> rect_label = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(acc_input.accessor.is_dense_arbitrary(rect_input)); - assert(acc_label.accessor.is_dense_arbitrary(rect_label)); - float *input_ptr = acc_input.ptr(rect_input.lo); - float *label_ptr = acc_label.ptr(rect_label.lo); - // assert(rect_input == rect_label); - - for (size_t i = 0; i < rect_input.volume(); i++) - input_ptr[i] = ((float)std::rand()) / RAND_MAX; - for (size_t i = 0; i < 
rect_label.volume(); i++) - label_ptr[i] = std::rand() % 2; -} - -void DataLoader::next_batch(FFModel &ff) { - return; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - // Load Input - { - Domain domain = runtime->get_index_space_domain( - ctx, batch_input->parallel_tensor->parallel_is); - ArgumentMap argmap; - int idx = next_index; - for (Domain::DomainPointIterator it(domain); it; it++) { - SampleIdxs meta; - assert(ff.config.batchSize % batch_input->parallel_tensor->dims[2].size == - 0); - meta.num_samples = - ff.config.batchSize / batch_input->parallel_tensor->dims[2].size; - for (int i = 0; i < meta.num_samples; i++) - meta.idxs[i] = idx++; - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, - batch_input->parallel_tensor->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_input->parallel_tensor->machine_view.hash()); - // Full dataset in ZCM - launcher.add_region_requirement( - RegionRequirement(full_input->parallel_tensor->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_input->parallel_tensor->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_input->parallel_tensor->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input->parallel_tensor->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - // Load Labels - { - Domain domain = runtime->get_index_space_domain( - ctx, batch_label->parallel_tensor->parallel_is); - ArgumentMap argmap; - int idx = next_index; - for (Domain::DomainPointIterator it(domain); it; it++) { - SampleIdxs meta; - assert(ff.config.batchSize % batch_label->parallel_tensor->dims[2].size == - 0); - meta.num_samples = - ff.config.batchSize / batch_label->parallel_tensor->dims[2].size; - for (int i = 0; i < meta.num_samples; i++) - meta.idxs[i] = idx++; - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, - batch_label->parallel_tensor->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_label->parallel_tensor->machine_view.hash()); - // Full dataset in ZCM - launcher.add_region_requirement( - RegionRequirement(full_label->parallel_tensor->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_label->parallel_tensor->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_label->parallel_tensor->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_label->parallel_tensor->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - // progress next_index - next_index += ff.config.batchSize; -} - -void DataLoader::reset() { - next_index = 0; -} - -void FlexFlow::register_custom_tasks() { - // Load entire dataset - { - TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Entire Dataset Task"); - } - // Load input - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_2, "Load Inputs"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Inputs Task"); 
- } -} diff --git a/examples/cpp/inference/Transformer/transformer.cu b/examples/cpp/inference/Transformer/transformer.cu deleted file mode 100644 index 7da473e54c..0000000000 --- a/examples/cpp/inference/Transformer/transformer.cu +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright 2021 Stanford, Facebook - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/utils/cuda_helper.h" -#include "transformer.h" - -void DataLoader::load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - SampleIdxs *meta = (SampleIdxs *)task->local_args; - TensorAccessorR acc_full_input( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_batch_input(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - int batch_size = acc_batch_input.rect.hi[2] - acc_batch_input.rect.lo[2] + 1; - int embed_size = acc_batch_input.rect.hi[0] - acc_batch_input.rect.lo[0] + 1; - int seq_length = acc_batch_input.rect.hi[1] - acc_batch_input.rect.lo[1] + 1; - assert(acc_batch_input.rect.hi[0] == acc_full_input.rect.hi[0]); - assert(acc_batch_input.rect.lo[0] == acc_full_input.rect.lo[0]); - assert(acc_batch_input.rect.hi[1] == acc_full_input.rect.hi[1]); - assert(acc_batch_input.rect.lo[1] == acc_full_input.rect.lo[1]); - - float *input_zc; - checkCUDA(cudaHostAlloc(&input_zc, - sizeof(float) * acc_batch_input.rect.volume(), - cudaHostAllocPortable | cudaHostAllocMapped)); - assert(batch_size == meta->num_samples); - for (int i = 0; i < batch_size; i++) { - int base_offset = meta->idxs[i] * embed_size * seq_length; - for (int j = 0; j < embed_size * seq_length; j++) - input_zc[i * embed_size * seq_length + j] = - acc_full_input.ptr[base_offset + j]; - } - checkCUDA(cudaMemcpy(acc_batch_input.ptr, - input_zc, - sizeof(float) * acc_batch_input.rect.volume(), - cudaMemcpyHostToDevice)); - checkCUDA(cudaFreeHost(input_zc)); -} diff --git a/examples/cpp/inference/Transformer/transformer.h b/examples/cpp/inference/Transformer/transformer.h deleted file mode 100644 index 551a9eff19..0000000000 --- a/examples/cpp/inference/Transformer/transformer.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright 2021 Facebook, Stanford - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/model.h" -#define MAX_NUM_SAMPLES 65536 - -using namespace Legion; -using namespace FlexFlow; - -struct TransformerConfig { - TransformerConfig(void); - int hidden_size, embedding_size, num_heads, num_layers, sequence_length; -}; - -class DataLoader { -public: - DataLoader(FFModel &ff, - TransformerConfig const &tf, - Tensor const &_input, - Tensor const &_label); - void next_batch(FFModel &ff); - void reset(); - static void load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - static void load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - -public: - int num_samples, next_index; - -private: - Tensor full_input, batch_input, full_label, batch_label; -}; - -struct SampleIdxs { - int num_samples; - int idxs[MAX_NUM_SAMPLES]; -}; From 4a0a70ef67f0031a55bcfc4a14258e7d7bcd4b3d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 17 Jan 2023 21:10:16 -0500 Subject: [PATCH 041/344] Revert "make inference functions return futuremaps" This reverts commit ee16d6dd1b14c7ad7fa1681b02b8a1b597cb8751. --- include/flexflow/operator.h | 10 +++++----- include/flexflow/ops/aggregate.h | 9 ++++----- include/flexflow/ops/aggregate_spec.h | 10 +++++----- include/flexflow/ops/attention.h | 9 ++++----- include/flexflow/ops/element_binary.h | 9 ++++----- include/flexflow/ops/groupby.h | 9 ++++----- include/flexflow/ops/layer_norm.h | 10 +++++----- include/flexflow/ops/linear.h | 10 +++++----- include/flexflow/ops/topk.h | 9 ++++----- src/ops/aggregate.cc | 10 +++++----- src/ops/aggregate_spec.cc | 11 +++++------ src/ops/attention.cc | 4 ++-- src/ops/element_binary.cc | 11 +++++------ src/ops/group_by.cc | 10 +++++----- src/ops/layer_norm.cc | 10 +++++----- src/ops/linear.cc | 10 +++++----- src/ops/topk.cc | 10 +++++----- 17 files changed, 77 insertions(+), 84 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 94304784c9..a276f0fd74 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -8,7 +8,7 @@ #include namespace FlexFlow { -using Legion::FutureMap; + extern LegionRuntime::Logger::Category log_measure; class OpMeta; @@ -188,10 +188,10 @@ class Op { virtual void forward(FFModel const &) = 0; virtual void backward(FFModel const &) = 0; // Pure virtual functions for inference - virtual FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) { + virtual void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) { assert(false); }; virtual void print_layer(FFModel const &model) = 0; diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 5f73b8f7b3..6e4dd0b4ac 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -5,7 +5,6 @@ #include "flexflow/ops/aggregate_params.h" namespace FlexFlow { -using Legion::FutureMap; #define AGGREGATE_MAX_K 4 #define AGGREGATE_MAX_BATCH_SIZE 64 @@ -37,10 +36,10 @@ class Aggregate : public Op { char const *name = nullptr); void init(FFModel const &) override; void forward(FFModel const &) override; - FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) 
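  // not implemented for Aggregate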
override { assert(0); diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 21ade888a4..816574ced0 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -5,7 +5,7 @@ #include "flexflow/ops/aggregate_spec_params.h" namespace FlexFlow { -using Legion::FutureMap; + #define AGGREGATE_SPEC_MAX_K 4 #define AGGREGATE_SPEC_MAX_BATCH_SIZE 32 #define AGGREGATE_SPEC_MAX_N 12 @@ -28,10 +28,10 @@ class AggregateSpec : public Op { char const *name); void init(FFModel const &) override; void forward(FFModel const &) override; - FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index fe870579b3..1531708bb7 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -10,7 +10,6 @@ #include "flexflow/ops/attention_params.h" namespace FlexFlow { -using Legion::FutureMap; class MultiHeadAttentionMeta; @@ -67,10 +66,10 @@ class MultiHeadAttention : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 593e229653..2f081f1b7e 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -7,7 +7,6 @@ #include "flexflow/ops/element_binary_params.h" namespace FlexFlow { -using Legion::FutureMap; class ElementBinary : public Op { public: @@ -28,10 +27,10 @@ class ElementBinary : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index 88e1df3ee9..44e03ec6e3 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -6,7 +6,6 @@ #include "flexflow/ops/groupby_params.h" namespace FlexFlow { -using Legion::FutureMap; class GroupByMeta : public OpMeta { public: @@ -36,10 +35,10 @@ class Group_by : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h 
index 9b43a40e3c..c05461acdf 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -3,7 +3,7 @@ #include "flexflow/model.h" namespace FlexFlow { -using Legion::FutureMap; + class LayerNormMeta; class LayerNorm : public Op { @@ -26,10 +26,10 @@ class LayerNorm : public Op { void init(FFModel const &); void forward(FFModel const &); void backward(FFModel const &); - FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) { assert(0); } diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index 6247dc7f76..ab1c1febc5 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -6,7 +6,7 @@ #include "flexflow/ops/linear_params.h" namespace FlexFlow { -using Legion::FutureMap; + class FFModel; class Layer; @@ -37,10 +37,10 @@ class Linear : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 5c3f12d294..af62f51c93 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -6,7 +6,6 @@ #include "flexflow/ops/topk_params.h" namespace FlexFlow { -using Legion::FutureMap; class TopKMeta : public OpMeta { public: @@ -31,10 +30,10 @@ class TopK : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - FutureMap inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 941c6da8cb..19ed275b5c 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -246,10 +246,10 @@ void Aggregate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap Aggregate::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +void Aggregate::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -295,7 +295,7 @@ FutureMap Aggregate::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(n + 2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } void Aggregate::forward_task(Task const *task, diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 4ce863cb50..1e3d66fdee 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -232,11 +232,10 @@ void AggregateSpec::forward(FFModel const &ff) { 
runtime->execute_index_space(ctx, launcher); } -FutureMap - AggregateSpec::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +void AggregateSpec::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -282,7 +281,7 @@ FutureMap EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(n + 2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } void AggregateSpec::forward_task(Task const *task, diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 57d945c2b3..d9ee14ecb6 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -523,7 +523,7 @@ void MultiHeadAttention::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap MultiHeadAttention::inference( +void MultiHeadAttention::inference( FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -572,7 +572,7 @@ FutureMap MultiHeadAttention::inference( EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(4, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 9214113ccc..feb1862b04 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -424,11 +424,10 @@ void ElementBinary::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap - ElementBinary::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +void ElementBinary::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -494,7 +493,7 @@ FutureMap launcher.add_field(2, FID_DATA); } } - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 33038d69c3..b341f004ac 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -261,10 +261,10 @@ void Group_by::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap Group_by::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +void Group_by::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -304,7 +304,7 @@ FutureMap Group_by::inference(FFModel const &ff, launcher.add_field(i + 2, FID_DATA); } - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } void Group_by::forward_task(Task const *task, diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 02dd519780..bec6f7d651 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -278,10 +278,10 @@ void LayerNorm::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap LayerNorm::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +void 
LayerNorm::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -321,7 +321,7 @@ FutureMap LayerNorm::inference(FFModel const &ff, weights[1]->region)); launcher.add_field(3, FID_DATA); } - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 49560fc20f..435080dbe1 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -367,10 +367,10 @@ void Linear::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap Linear::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +void Linear::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -410,7 +410,7 @@ FutureMap Linear::inference(FFModel const &ff, weights[1]->region)); launcher.add_field(3, FID_DATA); } - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } void Linear::forward_task(Task const *task, diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 52b54711cb..ec6da77a31 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -220,10 +220,10 @@ void TopK::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap TopK::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +void TopK::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -255,7 +255,7 @@ FutureMap TopK::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[1]->region)); launcher.add_field(2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } void TopK::forward_task(Task const *task, From 17585d342565da8ba2a2ef3903d6e5ef8f8daaad Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 17 Jan 2023 23:24:57 -0500 Subject: [PATCH 042/344] post-rebase cleanup --- .../cpp/inference/mixture_of_experts/moe.cc | 228 +++++++++++------- .../cpp/inference/mixture_of_experts/moe.cu | 29 ++- .../cpp/inference/mixture_of_experts/moe.h | 35 ++- .../inference/mixture_of_experts/run_moe.sh | 11 - include/flexflow/ops/groupby.h | 6 +- src/ops/aggregate.cc | 9 +- src/ops/group_by.cc | 21 +- src/ops/group_by.cpp | 37 +-- src/ops/group_by.cu | 19 +- 9 files changed, 212 insertions(+), 183 deletions(-) delete mode 100644 examples/cpp/inference/mixture_of_experts/run_moe.sh diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 459ffb42b5..7d6e1fddee 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -1,4 +1,4 @@ -/* Copyright 2020 Stanford +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,19 +20,9 @@ #include #include -#define NUM_SAMPLES 60000 -#define TRAIN_SAMPLES 60000 -#define TEST_SAMPLES 00000 -#define MNIST_DIMS 28 * 28 -#define CIFAR_DIMS 3 * 32 * 32 -#define DATA_DIMS MNIST_DIMS -#define OUT_DIM 10 - using namespace Legion; LegionRuntime::Logger::Category log_app("MoE"); -int num_exp = 4; -int num_select = 2; void parse_input_args(char **argv, int argc, MoeConfig &config) { for (int i = 1; i < argc; i++) { @@ -46,12 +36,8 @@ void parse_input_args(char **argv, int argc, MoeConfig &config) { Tensor create_moe(FFModel *model, MoeConfig const *moeConfig, Tensor const &input) { - float alpha = 2.0f; // factor overhead tensor size for imbalance - float lambda = 0.04f; // multiplier for load balance term - // MoE model - Tensor gate_preds = model->dense(input, 64, AC_MODE_RELU); - gate_preds = model->dense(gate_preds, num_exp, AC_MODE_RELU); + Tensor gate_preds = model->dense(input, num_exp, AC_MODE_RELU); Tensor topK_output[2]; model->top_k(gate_preds, topK_output, num_select, false); Tensor agg_inputs[num_exp + 4]; @@ -72,7 +58,7 @@ Tensor create_moe(FFModel *model, for (int i = 0; i < num_exp + 4; i++) { agg_inputs[i]->print("agg_inputs[i]"); } - Tensor coop_output = model->aggregate(agg_inputs, num_exp, lambda); + Tensor coop_output = model->aggregate(agg_inputs, num_exp, moeConfig->lambda); // model->get_metrics(); return coop_output; } @@ -138,14 +124,18 @@ void FlexFlow::top_level_task(Task const *task, //----------------------------------------------------------------- - Tensor t = create_moe_encoder(&ff, &moeConfig, input); + //Tensor t = create_moe_encoder(&ff, &moeConfig, input); + Tensor t = create_moe(&ff, &moeConfig, input); t = ff.dense(t, OUT_DIM, AC_MODE_RELU); InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); // im.compile_model_and_allocate_buffer(); ff.init_operators(); // Data Loader - // DataLoader data_loader(ff, moeConfig, input, ff.label_tensor); + ParallelTensor input_pt, label_pt; + ff.get_parallel_tensor_from_tensor(input, input_pt); + ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); + DataLoader data_loader(ff, moeConfig, input_pt, label_pt); //----------------------------------------------------------------- @@ -194,8 +184,8 @@ void FlexFlow::top_level_task(Task const *task, DataLoader::DataLoader(FFModel &ff, MoeConfig const &moe, - Tensor input, - Tensor label) { + ParallelTensor input, + ParallelTensor label) { num_samples = NUM_SAMPLES; Context ctx = ff.config.lg_ctx; @@ -203,37 +193,72 @@ DataLoader::DataLoader(FFModel &ff, // Create full input { + // Input has dimensions (batch_size, data_dims), which in legion ordering + // becomes (data_dims, batch_size). The corresponding parallel tensor will + // thus have dimensions (data_dims, batch_size, replica_dim). 
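+  // (For example, with 784-dimensional MNIST samples and the 60000-sample
+  // training set, batch_input is laid out as (784, batch_size, 1) and
+  // full_input as (784, 60000, 1), the trailing 1 being the replica dim.)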
The dimensions + // of the full_input tensor can be obtained by replacing the batch_size with + // the num_samples: (data_dims, num_samples, replica_dim) + assert(input->num_dims == 3); // two dimensions + the replica dimension batch_input = input; - int const dims[] = {NUM_SAMPLES, DATA_DIMS}; - full_input = ff.create_tensor<2>(dims, DT_FLOAT); + + ParallelDim dims[3]; + for (int i = 0; i < 3; i++) { + dims[i].size = input->dims[i].size; + dims[i].degree = 1; + dims[i].parallel_idx = -1; + dims[i].is_replica_dim = input->dims[i].is_replica_dim; + // Assume only the first dim can be the replica dim + assert(i == 2 || (!dims[i].is_replica_dim)); + } + dims[1].size = num_samples; + + full_input = ff.create_parallel_tensor_legion_ordering(3, dims, DT_FLOAT); + ff.map_tensor(full_input, NULL /*parallel_op*/); } + // Create full label { + assert(label->num_dims == LABEL_DIM + 2); batch_label = label; - int const dims[] = {NUM_SAMPLES, 1}; - full_label = ff.create_tensor<2>(dims, DT_INT32); + + ParallelDim dims[LABEL_DIM + 2]; + for (int i = 0; i < LABEL_DIM + 2; i++) { + dims[i].size = label->dims[i].size; + dims[i].degree = 1; + dims[i].parallel_idx = -1; + dims[i].is_replica_dim = label->dims[i].is_replica_dim; + // Assume only the last dim can be the replica dim + assert(i == LABEL_DIM + 1 || (!dims[i].is_replica_dim)); + } + assert(dims[LABEL_DIM].size == ff.config.batchSize); + // replace batch size with number of samples + dims[LABEL_DIM].size = num_samples; + + full_label = ff.create_parallel_tensor_legion_ordering( + LABEL_DIM + 2, dims, DT_INT32); + ff.map_tensor(full_label, NULL /*parallel_op*/); } // Load entire dataset // TODO: Use index launcher instead of task launcher + assert(full_input != nullptr && "full_input is nullptr"); + MoeConfig const *ptr = &moe; TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, TaskArgument(&ptr, sizeof(MoeConfig *))); // regions[0]: full_input - launcher.add_region_requirement( - RegionRequirement(full_input->parallel_tensor->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->parallel_tensor->region, - MAP_TO_ZC_MEMORY)); + launcher.add_region_requirement(RegionRequirement(full_input->region, + WRITE_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); launcher.add_field(0, FID_DATA); // regions[1]: full_label - launcher.add_region_requirement( - RegionRequirement(full_input->parallel_tensor->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->parallel_tensor->region, - MAP_TO_ZC_MEMORY)); + launcher.add_region_requirement(RegionRequirement(full_label->region, + WRITE_ONLY, + EXCLUSIVE, + full_label->region, + MAP_TO_ZC_MEMORY)); launcher.add_field(1, FID_DATA); runtime->execute_task(ctx, launcher); @@ -241,17 +266,10 @@ DataLoader::DataLoader(FFModel &ff, next_batch(ff); } -__inline__ int calc_offset(int c, int y, int x, int yscale, int xscale) { - return (c * yscale * xscale + y * xscale + x); -} - // ================================================= // Load data // ================================================= -/* NOTE: Download files from http://yann.lecun.com/exdb/mnist/, unpack to -this directory (Flexflow/examples/cpp/mixture_of_experts) */ - void read_cifar100(float *input_ptr, int *label_ptr) { std::ifstream file; file.open("train.bin", std::ios::in | std::ios::binary | std::ios::ate); @@ -288,6 +306,8 @@ int reverseInt(int i) { return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; } +/* NOTE: Download files from http://yann.lecun.com/exdb/mnist/ and unpack to +the current working directory */ void read_mnist(float *input_ptr, 
int *label_ptr) { // read inputs std::ifstream input("train-images-idx3-ubyte", std::ios::binary); @@ -350,19 +370,23 @@ void DataLoader::load_entire_dataset(Task const *task, assert(task->regions.size() == regions.size()); // get input and label pointer - AccessorWO const acc_input(regions[0], FID_DATA); - AccessorWO const acc_label(regions[1], FID_DATA); - Rect<2> rect_input = runtime->get_index_space_domain( + AccessorWO const acc_input(regions[0], FID_DATA); + AccessorWO const acc_label(regions[1], FID_DATA); + Rect<3> rect_input = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); assert(acc_input.accessor.is_dense_arbitrary(rect_input)); - Rect<2> rect_label = runtime->get_index_space_domain( + Rect rect_label = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); assert(acc_label.accessor.is_dense_arbitrary(rect_label)); float *input_ptr = acc_input.ptr(rect_input.lo); int *label_ptr = acc_label.ptr(rect_label.lo); + int num_samples = rect_input.hi[1] - rect_input.lo[1] + 1; + assert(rect_label.hi[1] - rect_label.lo[1] + 1 == num_samples); + // here, you can call `read_cifar100(input_ptr, label_ptr);` instead or load + // another dataset using the dataset_path from the MoeConfig object read_mnist(input_ptr, label_ptr); - log_app.print("finish loading data\n"); + log_app.print("finish loading MNIST data\n"); } void DataLoader::next_batch(FFModel &ff) { @@ -370,80 +394,100 @@ void DataLoader::next_batch(FFModel &ff) { Runtime *runtime = ff.config.lg_hlr; // Load input { - IndexSpace task_is = batch_input->parallel_tensor->parallel_is; - Rect<2> rect = runtime->get_index_space_domain(ctx, task_is); + Domain domain = + runtime->get_index_space_domain(ctx, batch_input->parallel_is); ArgumentMap argmap; int idx = next_index; - for (PointInRectIterator<2> it(rect); it(); it++) { + // current limitation of the dataloader: only the batch dimension can be + // partitioned + int input_dims = batch_input->num_dims; + for (int i = 0; i < input_dims; i++) { + if (i != input_dims - 2) { + assert(batch_input->dims[i].degree == 1 && + "Dataloader only supports batch size partitions"); + } + } + int batch_size = batch_input->dims[input_dims - 2].size; + int n_partitions = batch_input->dims[input_dims - 2].degree; + assert(ff.config.batchSize % batch_size == 0); + assert(batch_size % n_partitions == 0); + for (Domain::DomainPointIterator it(domain); it; it++) { SampleIdxs meta; - assert(ff.config.batchSize % (rect.hi[1] - rect.lo[1] + 1) == 0); - meta.num_samples = ff.config.batchSize / (rect.hi[1] - rect.lo[1] + 1); - for (int i = 0; i < meta.num_samples; i++) + meta.num_samples = batch_size / n_partitions; + for (int i = 0; i < meta.num_samples; i++) { meta.idxs[i] = idx++; + } argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); } IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, - task_is, + batch_input->parallel_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_input->parallel_tensor->machine_view.hash()); - launcher.add_region_requirement( - RegionRequirement(full_input->parallel_tensor->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_input->parallel_tensor->region, - MAP_TO_ZC_MEMORY)); + batch_input->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(full_input->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - 
RegionRequirement(batch_input->parallel_tensor->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input->parallel_tensor->region)); + launcher.add_region_requirement(RegionRequirement(batch_input->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input->region)); launcher.add_field(1, FID_DATA); runtime->execute_index_space(ctx, launcher); } // Load label { - // IndexSpaceT<2> task_is = IndexSpaceT<2>(ff.get_or_create_task_is(2, "")); - IndexSpace task_is = batch_label->parallel_tensor->parallel_is; - Rect<2> rect = runtime->get_index_space_domain(ctx, task_is); + Domain domain = + runtime->get_index_space_domain(ctx, batch_label->parallel_is); ArgumentMap argmap; int idx = next_index; - for (PointInRectIterator<2> it(rect); it(); it++) { + // current limitation of the dataloader: only the batch dimension can be + // partitioned + int label_dims = batch_label->num_dims; + assert(batch_label->dims[label_dims - 1].degree == 1); + for (int i = 0; i < LABEL_DIM; i++) { + assert(batch_label->dims[i].degree == 1 && + "Dataloader only supports batch size partitions"); + } + int batch_size = batch_label->dims[label_dims - 2].size; + int n_partitions = batch_label->dims[label_dims - 2].degree; + assert(ff.config.batchSize % batch_size == 0); + assert(batch_size % n_partitions == 0); + for (Domain::DomainPointIterator it(domain); it; it++) { SampleIdxs meta; - assert(ff.config.batchSize % (rect.hi[1] - rect.lo[1] + 1) == 0); - meta.num_samples = ff.config.batchSize / (rect.hi[1] - rect.lo[1] + 1); - for (int i = 0; i < meta.num_samples; i++) + meta.num_samples = batch_size / n_partitions; + for (int i = 0; i < meta.num_samples; i++) { meta.idxs[i] = idx++; + } argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); } IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, - task_is, + batch_label->parallel_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_label->parallel_tensor->machine_view.hash()); - launcher.add_region_requirement( - RegionRequirement(full_label->parallel_tensor->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_label->parallel_tensor->region, - MAP_TO_ZC_MEMORY)); + batch_label->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(full_label->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_label->region, + MAP_TO_ZC_MEMORY)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_label->parallel_tensor->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_label->parallel_tensor->region)); + launcher.add_region_requirement(RegionRequirement(batch_label->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_label->region)); launcher.add_field(1, FID_DATA); runtime->execute_index_space(ctx, launcher); } diff --git a/examples/cpp/inference/mixture_of_experts/moe.cu b/examples/cpp/inference/mixture_of_experts/moe.cu index c4224e0a49..ae1e5aca30 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cu +++ b/examples/cpp/inference/mixture_of_experts/moe.cu @@ -1,4 +1,4 @@ -/* Copyright 2020 Stanford +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
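For orientation before the next hunks: once the sample indices are checked to be contiguous, the load_input task reduces to one offset copy out of the in-memory dataset. A minimal CPU-side sketch of that logic, with names mirroring the accessors in the hunk (the real task launches copy_kernel on a CUDA/HIP stream, and the exact pointer and stride types here are assumptions):

#include <cassert>
#include <cstddef>
#include <cstring>

// Sketch only: copy one contiguous batch of samples from the full dataset.
void load_input_sketch(float *batch_input, float const *full_input,
                       int const *idxs, int batch_size, int sample_dim) {
  for (int i = 1; i < batch_size; i++) {
    assert(idxs[i] == idxs[0] + i); // contiguous-indices assumption
  }
  std::size_t start = (std::size_t)idxs[0] * (std::size_t)sample_dim;
  std::memcpy(batch_input, full_input + start,
              sizeof(float) * (std::size_t)batch_size * (std::size_t)sample_dim);
}

load_label follows the same pattern with int labels and a per-sample width of 1.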
@@ -23,9 +23,9 @@ void DataLoader::load_input(Task const *task, assert(regions.size() == 2); assert(task->regions.size() == 2); SampleIdxs *meta = (SampleIdxs *)task->local_args; - TensorAccessorR acc_full_input( + TensorAccessorR acc_full_input( regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_batch_input(regions[1], + TensorAccessorW acc_batch_input(regions[1], task->regions[1], FID_DATA, ctx, @@ -39,8 +39,9 @@ void DataLoader::load_input(Task const *task, // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); - for (int i = 1; i < batch_size; i++) + for (int i = 1; i < batch_size; i++) { assert(meta->idxs[i] == meta->idxs[0] + i); + } coord_t start_idx = meta->idxs[0]; float const *input_zc = acc_full_input.ptr + start_idx * sample_dim; copy_kernel<<>>( @@ -55,19 +56,21 @@ void DataLoader::load_label(Task const *task, assert(regions.size() == 2); assert(task->regions.size() == 2); SampleIdxs *meta = (SampleIdxs *)task->local_args; - TensorAccessorR acc_full_label( + TensorAccessorR acc_full_label( regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_batch_label(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - int batch_size = acc_batch_label.rect.hi[1] - acc_batch_label.rect.lo[1] + 1; + TensorAccessorW acc_batch_label(regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); + coord_t batch_size = + acc_batch_label.rect.hi[1] - acc_batch_label.rect.lo[1] + 1; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); - for (int i = 1; i < meta->num_samples; i++) + for (int i = 1; i < meta->num_samples; i++) { assert(meta->idxs[i] == meta->idxs[0] + i); + } int const *input_zc = acc_full_label.ptr + meta->idxs[0]; copy_kernel<<>>( acc_batch_label.ptr, input_zc, acc_batch_label.rect.volume()); diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index d447c9fd63..a9fd2d2325 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -1,4 +1,4 @@ -/* Copyright 2017 Stanford, NVIDIA +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
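For reference while reading the next hunk, an informal summary of the tensor shapes the updated dataloader works with, in Legion ordering (innermost dimension first, replica dimension last); DATA_DIMS, LABEL_DIM, and NUM_SAMPLES come from the macros below:

// Informal shape summary (Legion ordering):
//   batch_input / full_input : {DATA_DIMS, batch_size or NUM_SAMPLES, replicas}  -> 3 dims
//   batch_label / full_label : {1,         batch_size or NUM_SAMPLES, replicas}  -> LABEL_DIM + 2 dims
// Only the batch dimension may be partitioned, and only the last (replica)
// dimension may have is_replica_dim set.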
@@ -14,14 +14,15 @@ */ #include "flexflow/model.h" -#define MAX_NUM_SAMPLES 60000 -#define NUM_SAMPLES 60000 -#define TRAIN_SAMPLES 60000 +#define MAX_NUM_SAMPLES 1000 +#define NUM_SAMPLES 1000 +#define TRAIN_SAMPLES 1000 #define TEST_SAMPLES 00000 #define MNIST_DIMS 28 * 28 #define CIFAR_DIMS 3 * 32 * 32 #define DATA_DIMS MNIST_DIMS #define OUT_DIM 10 +#define LABEL_DIM 1 using namespace Legion; using namespace std; @@ -29,26 +30,38 @@ using namespace FlexFlow; struct MoeConfig { MoeConfig(void) { - // Set default configurations here + // MoE layer + num_exp = 5; + num_select = 2; + alpha = 2.0f; + lambda = 0.04f; hidden_size = DATA_DIMS; + // Encoder layer num_attention_heads = 16; attention_kdim = attention_vdim = hidden_size / num_attention_heads; num_encoder_layers = 6; } - std::string dataset_path; + // MoE layer + int num_exp; + int num_select; + float alpha; // factor overhead tensor size for imbalance + float lambda; // multiplier for load balance term int hidden_size; + // Encoder layer int num_attention_heads; int attention_kdim; int attention_vdim; int num_encoder_layers; + // Dataset + std::string dataset_path; }; class DataLoader { public: DataLoader(FFModel &ff, - MoeConfig const &alexnet, - Tensor _input, - Tensor _label); + MoeConfig const &moe, + ParallelTensor input, + ParallelTensor label); static void load_input(Task const *task, std::vector const ®ions, Context ctx, @@ -66,8 +79,8 @@ class DataLoader { public: int num_samples, next_index; - Tensor full_input, batch_input; - Tensor full_label, batch_label; + FlexFlow::ParallelTensor full_input, batch_input; + FlexFlow::ParallelTensor full_label, batch_label; }; struct SampleIdxs { diff --git a/examples/cpp/inference/mixture_of_experts/run_moe.sh b/examples/cpp/inference/mixture_of_experts/run_moe.sh deleted file mode 100644 index 33c6c5f7fb..0000000000 --- a/examples/cpp/inference/mixture_of_experts/run_moe.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=pagerank -#SBATCH --output=slurm.txt -#SBATCH --time=10:00 -#SBATCH --nodes=2 -#SBATCH --cpus-per-task=2 -#SBATCH --mem-per-cpu=6000MB -#SBATCH --nodelist=g0001,g0002 -#SBATCH --partition=gpu - -srun -n 2 ./moe -ll:cpu 4 -ll:gpu 4 -ll:fsize 15000 -ll:zsize 15000 --nodes 2 -ll:util 1 -b 40 -e 1 --search-budget 1 --export strat-tmp.txt diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index 44e03ec6e3..afa69d891c 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -75,8 +75,7 @@ class Group_by : public Op { int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas); + int data_dim); static void backward_kernel_wrapper(GroupByMeta const *m, float *input_grad, @@ -86,8 +85,7 @@ class Group_by : public Op { int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas); + int data_dim); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 19ed275b5c..49e564a702 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -55,8 +55,10 @@ Tensor FFModel::aggregate( int num_dim = inputs[4]->num_dims; // Set output shape int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < num_dim; i++) + for (int i = 0; i < num_dim - 1; i++) { dims[i] = inputs[4]->dims[i]; + } + dims[num_dim - 1] = inputs[0]->dims[num_dim - 1]; li->outputs[0] = create_tensor_legion_ordering( 
num_dim, dims, DT_FLOAT, li, 0, true /*create_grad*/); } @@ -141,16 +143,11 @@ Aggregate::Aggregate(FFModel &model, } // Set output shape ParallelDim dims[MAX_TENSOR_DIM]; -<<<<<<< HEAD for (int i = 0; i < num_dim - 1; i++) { dims[i] = inputs[4]->dims[i]; } dims[num_dim - 2] = inputs[0]->dims[num_dim - 2]; dims[num_dim - 1] = inputs[0]->dims[num_dim - 1]; -======= - for (int i = 0; i < num_dim; i++) - dims[i] = inputs[4]->dims[i]; ->>>>>>> 99a89a9b... [MOE] update moe cpp example and aggregate implementation (#555) numOutputs = 1; outputs[0] = model.create_parallel_tensor_legion_ordering( num_dim, dims, DT_FLOAT, this); diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index b341f004ac..e6dc00f690 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -316,7 +316,6 @@ void Group_by::forward_task(Task const *task, int n = gb->n; float alpha = gb->alpha; - // Check that the number of regions is n+2: n outputs and 2 inputs assert((int)regions.size() == n + 2); assert((int)task->regions.size() == n + 2); @@ -334,16 +333,13 @@ void Group_by::forward_task(Task const *task, coord_t input_rows = rect_input.hi[1] - rect_input.lo[1] + 1; coord_t input_cols = rect_input.hi[0] - rect_input.lo[0] + 1; - coord_t input_replicas = rect_input.hi[2] - rect_input.lo[2] + 1; - // Check that dimensions match in the input and assign tensors assert(input_rows == rect_assign.hi[1] - rect_assign.lo[1] + 1); - assert(input_replicas == rect_assign.hi[2] - rect_assign.lo[2] + 1); // does this need to be true? + int k = rect_assign.hi[0] - rect_assign.lo[0] + 1; int batch_size = input_rows; int data_dim = input_cols; - int n_replicas = input_replicas; - // Create a vector of n outputs, where n is the number of experts. + // Create a vector of n outputs, where n is the number of experts. // Each entry in the "outputs" vector points to the Legion tensor that will // contain the tockens dispatched to the corresponding expert float *outputs[n]; @@ -360,9 +356,6 @@ void Group_by::forward_task(Task const *task, assert(output_cols == input_cols); } - // Launch the kernel responsible from copying the data from the input tensor - // to each output tensor, according to the input to expert assignments from - // the assign tensor. Group_by::forward_kernel_wrapper(m, acc_input.ptr(rect_input), acc_assign.ptr(rect_assign), @@ -371,8 +364,7 @@ void Group_by::forward_task(Task const *task, k, alpha, batch_size, - data_dim, - n_replicas); + data_dim); } void Group_by::backward(FFModel const &ff) { @@ -442,13 +434,11 @@ void Group_by::backward_task(Task const *task, coord_t input_rows = rect_input_grad.hi[1] - rect_input_grad.lo[1] + 1; coord_t input_cols = rect_input_grad.hi[0] - rect_input_grad.lo[0] + 1; - coord_t input_replicas = rect_input_grad.hi[2] - rect_input_grad.lo[2] + 1; assert(input_rows == rect_assign.hi[1] - rect_assign.lo[1] + 1); - assert(input_replicas == rect_assign.hi[2] - rect_assign.lo[2] + 1); // does this need to be true? 
+ int k = rect_assign.hi[0] - rect_assign.lo[0] + 1; int batch_size = input_rows; int data_dim = input_cols; - int n_replicas = input_replicas; // get output float *output_grads[n]; @@ -473,8 +463,7 @@ void Group_by::backward_task(Task const *task, k, alpha, batch_size, - data_dim, - n_replicas); + data_dim); } void Group_by::serialize(Legion::Serializer &sez) const { diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index 16c8354bca..e0b914cf1a 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -33,19 +33,25 @@ __global__ void int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas) { - __shared__ float *chosen_exp_preds[MAX_K * MAX_BATCH_SIZE]; // one pointer for each exp_assign (TopK_output[1]) element + int data_dim) { + __shared__ float + *chosen_exp_preds[MAX_K * + MAX_BATCH_SIZE]; // one pointer for each exp_assign + // (TopK_output[1]) element // Get pred pointers, single thread per block if (threadIdx.x == 0) { - int exp_tensor_rows = ceil(alpha * k / n * batch_size); // This is the max expert capacity - int expert_idx[MAX_N] = {0}; // This is the number of tokens assigned to each expert + int exp_tensor_rows = + ceil(alpha * k / n * batch_size); // This is the max expert capacity + int expert_idx[MAX_N] = { + 0}; // This is the number of tokens assigned to each expert // Iterate through flattened assign tensor, which has shape (k, batch_size) for (int i = 0; i < k * batch_size; i++) { // Get pointer to chosen expert predictions - int expert = exp_assign[i]; // index of the expert that is to receive the token i - if (expert_idx[expert] >= exp_tensor_rows) { // check if the expert is already at capacity + int expert = + exp_assign[i]; // index of the expert that is to receive the token i + if (expert_idx[expert] >= + exp_tensor_rows) { // check if the expert is already at capacity // dropped sample chosen_exp_preds[i] = 0; continue; @@ -79,8 +85,7 @@ __global__ void int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas) { + int data_dim) { __shared__ float *chosen_exp_grads[MAX_K * MAX_BATCH_SIZE]; // Get pred pointers, single thread @@ -122,8 +127,7 @@ void Group_by::forward_kernel_wrapper( int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas) { + int data_dim) { // TODO: why cublas/cudnn stream is needed here? 
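  // A worked example of the capacity rule enforced by the kernels above
  // (the numbers are illustrative assumptions, not values from this file):
  // each expert accepts at most exp_tensor_rows = ceil(alpha * k / n * batch_size)
  // tokens, so with alpha = 2.0, k = 2 selected experts per token, n = 32
  // experts, and batch_size = 64, that is ceil(2.0 * 2 / 32 * 64) = 8 rows
  // per expert; a token routed to an expert that is already full gets
  // chosen_exp_preds[i] = 0 and is dropped.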
hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -144,8 +148,7 @@ void Group_by::forward_kernel_wrapper( k, alpha, batch_size, - data_dim, - n_replicas); + data_dim); } void Group_by::backward_kernel_wrapper( @@ -157,8 +160,7 @@ void Group_by::backward_kernel_wrapper( int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas) { + int data_dim) { // TODO: why cublas/cudnn stream is needed here hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -181,8 +183,7 @@ void Group_by::backward_kernel_wrapper( k, alpha, batch_size, - data_dim, - n_replicas); + data_dim); } GroupByMeta::GroupByMeta(FFHandler handler, int n) : OpMeta(handler) { @@ -192,4 +193,4 @@ GroupByMeta::~GroupByMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index 5e5f4a6fb8..ac29904e9f 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -32,8 +32,7 @@ __global__ void int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas) { + int data_dim) { __shared__ float *chosen_exp_preds[MAX_K * MAX_BATCH_SIZE]; // Get pred pointers, single thread per block @@ -72,8 +71,7 @@ __global__ void int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas) { + int data_dim) { __shared__ float *chosen_exp_grads[MAX_K * MAX_BATCH_SIZE]; assert(k <= MAX_K); assert(batch_size <= MAX_BATCH_SIZE); @@ -117,8 +115,7 @@ void Group_by::forward_kernel_wrapper( int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas) { + int data_dim) { // TODO: why cublas/cudnn stream is needed here? 
cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -139,7 +136,7 @@ void Group_by::forward_kernel_wrapper( min(CUDA_NUM_THREADS, (int)(batch_size * k * data_dim)), 0, stream>>>( - input, exp_assign, m->dev_region_ptrs, n, k, alpha, batch_size, data_dim, n_replicas); + input, exp_assign, m->dev_region_ptrs, n, k, alpha, batch_size, data_dim); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -160,8 +157,7 @@ void Group_by::backward_kernel_wrapper( int k, // chosen experts float alpha, // factor additional memory assigned int batch_size, - int data_dim, - int n_replicas) { + int data_dim) { // TODO: why cublas/cudnn stream is needed here cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -188,8 +184,7 @@ void Group_by::backward_kernel_wrapper( k, alpha, batch_size, - data_dim, - n_replicas); + data_dim); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -208,4 +203,4 @@ GroupByMeta::~GroupByMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file From 1ce534b3f7bd66832e860101722c11d1aaae257a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 17 Jan 2023 23:43:54 -0500 Subject: [PATCH 043/344] bug fix --- examples/cpp/inference/MLP_Unify/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/cpp/inference/MLP_Unify/CMakeLists.txt b/examples/cpp/inference/MLP_Unify/CMakeLists.txt index 57b9ea0835..e83d292efc 100644 --- a/examples/cpp/inference/MLP_Unify/CMakeLists.txt +++ b/examples/cpp/inference/MLP_Unify/CMakeLists.txt @@ -9,9 +9,7 @@ set(CPU_SRC mlp.h ../data_generator.h) -set(GPU_SRC mlp.cu) - -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +cuda_add_executable(${project_target} ${CPU_SRC}) target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) From 12779ffff07cebc2e8921ef9215ca91604d3d770 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 17 Jan 2023 23:48:04 -0500 Subject: [PATCH 044/344] more futuremap purging --- include/flexflow/ops/experts.h | 2 +- src/ops/experts.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 74e9c0ff02..9b05270f79 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -34,7 +34,7 @@ class Experts : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - FutureMap inference(FFModel const &, + void inference(FFModel const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 28405d3dd7..368ed9b93d 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -294,7 +294,7 @@ void Experts::backward_task(Task const *task, assert(false && "Experts is designed for inference only"); } -FutureMap Experts::inference(FFModel const &ff, +void Experts::inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -329,7 +329,7 @@ FutureMap Experts::inference(FFModel const &ff, EXCLUSIVE, 
batch_outputs[0]->region)); launcher.add_field(2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } void Experts::inference_task(Task const *task, From ef5fea026cd2e0f4a378104b7cf94e10f44a0762 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 17 Jan 2023 23:51:45 -0500 Subject: [PATCH 045/344] [Inference][MLP] removed non-working dataloader --- examples/cpp/inference/MLP_Unify/mlp.cc | 136 +----------------------- examples/cpp/inference/MLP_Unify/mlp.h | 2 +- 2 files changed, 3 insertions(+), 135 deletions(-) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 9da8214c00..56833814ba 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 Stanford University +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,120 +23,6 @@ using namespace Legion; using namespace FlexFlow; -DataLoader::DataLoader(FFModel &ff, - MLPConfig const &mlpConfig, - InferenceManager const *im, - Tensor input) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - printf("Use random dataset..."); - - // The number of samples is the total number of request samples that can ever - // be loaded into memory at the same time. In the case of training, the value - // is batchSize * workersPerNode * numNodes, since each worker can only - // process one batch at a time. In inference, batchSize - int max_parallel_requests = - im->max_num_inflight_batches * - (ff.config.batchSize * im->max_num_requests_per_batch); - num_samples = - max_parallel_requests * ff.config.workersPerNode * ff.config.numNodes; - printf("Number of random samples = %d\n", num_samples); - - // return; - - // Create full input - { - batch_input = input; - int const dims[] = {num_samples, - mlpConfig.sequence_length * mlpConfig.embedding_size}; - full_input = ff.create_tensor<2>(dims, DT_FLOAT); - } - - // Load entire dataset - // TODO: Use index launcher instead of task launcher - TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, TaskArgument(NULL, 0)); - launcher.add_region_requirement( - RegionRequirement(full_input->parallel_tensor->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->parallel_tensor->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - reset(); - //next_batch(ff); -} - -void DataLoader::load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 1); // no labels - assert(task->regions.size() == 1); - // Note that these instances are in ZCM, can only use - // TensorAccessorW with readOutput flag - AccessorWO const acc_input(regions[0], FID_DATA); - Rect<2> rect_input = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - assert(acc_input.accessor.is_dense_arbitrary(rect_input)); - float *input_ptr = acc_input.ptr(rect_input.lo); - // Fill dataset with random data - for (int i = 0; i < rect_input.volume(); i++) { - input_ptr[i] = ((float)std::rand()) / RAND_MAX; - } - printf("finish loading data\n"); -} - -void DataLoader::next_batch(FFModel &ff) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - // Load Input - { - Rect<2> rect = runtime->get_index_space_domain(ctx, 
batch_input->parallel_tensor->parallel_is); - ArgumentMap argmap; - int idx = next_index; - for (PointInRectIterator<2> it(rect); it(); it++) { - SampleIdxs meta; - assert(ff.config.batchSize % (rect.hi[1] - rect.lo[1] + 1) == 0); - meta.num_samples = ff.config.batchSize / (rect.hi[1] - rect.lo[1] + 1); - for (int i = 0; i < meta.num_samples; i++) - meta.idxs[i] = idx++; - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, - batch_input->parallel_tensor->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_input->parallel_tensor->machine_view.hash()); - launcher.add_region_requirement( - RegionRequirement(full_input->parallel_tensor->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_input->parallel_tensor->region, - MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_input->parallel_tensor->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input->parallel_tensor->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - // progress to the next_index - next_index += ff.config.batchSize; -} - -void DataLoader::reset() { - next_index = 0; -} - Tensor create_mlp(FFModel *model, MLPConfig const *mlpConfig, @@ -255,22 +141,4 @@ void FlexFlow::top_level_task(Task const *task, ffConfig.batchSize * 128 * ffConfig.epochs / run_time); } -void FlexFlow::register_custom_tasks() { - // Load entire dataset - { - TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Entire Dataset Task"); - } - // Load input - { - TaskVariantRegistrar registrar(FlexFlow::CUSTOM_GPU_TASK_ID_1, - "Load Inputs"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Input Task"); - } -} +void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/MLP_Unify/mlp.h b/examples/cpp/inference/MLP_Unify/mlp.h index 7a8cc06955..7cd2f30430 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.h +++ b/examples/cpp/inference/MLP_Unify/mlp.h @@ -1,4 +1,4 @@ -/* Copyright 2022 CMU, Stanford +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
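The next patch begins deriving create_moe from the MoeConfig fields instead of hard-coded constants; the intent (made explicit by the assert it adds) is to split the experts into fused blocks of fused_exp_block_size, each handled by one experts() call over a contiguous range of expert ids. A rough sketch of that partitioning, assuming the defaults the patch introduces (num_exp = 128, blocks of 32):

#include <cassert>

// Sketch of the intended partitioning: num_exp experts split into
// num_exp / block fused blocks, block i covering expert ids
// [block * i, block * (i + 1)). With num_exp = 128 and block = 32 there
// are 4 blocks, e.g. block 2 covers experts 64..95.
int num_fused_blocks(int num_exp, int block) {
  assert(num_exp % block == 0);
  return num_exp / block;
}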
From a50069e4b3a61c22f8cded8a53c031169e8195a6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 18 Jan 2023 00:12:36 -0500 Subject: [PATCH 046/344] bug fix --- .../cpp/inference/mixture_of_experts/moe.cc | 17 +++++++++-------- examples/cpp/inference/mixture_of_experts/moe.h | 7 ++++++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 7d6e1fddee..99bdec62fb 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -37,28 +37,29 @@ Tensor create_moe(FFModel *model, MoeConfig const *moeConfig, Tensor const &input) { // MoE model - Tensor gate_preds = model->dense(input, num_exp, AC_MODE_RELU); + Tensor gate_preds = model->dense(input, moeConfig->num_exp, AC_MODE_RELU); Tensor topK_output[2]; - model->top_k(gate_preds, topK_output, num_select, false); - Tensor agg_inputs[num_exp + 4]; + model->top_k(gate_preds, topK_output, moeConfig->num_select, false); + Tensor agg_inputs[moeConfig->num_exp + 4]; agg_inputs[0] = model->softmax(topK_output[0]); // gate preds agg_inputs[1] = topK_output[1]; // gate assign agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) agg_inputs[3] = gate_preds; // full gate preds - for (int i = 0; i < num_exp /*number of experts layers*/; i++) { + assert(moeConfig->num_exp % moeConfig->fused_exp_block_size == 0); + for (int i = 0; i < moeConfig->num_exp /*number of experts layers*/; i++) { Tensor exp_pred = model->experts(gate_preds, topK_output[1], - 32 /*number of experts*/, - 32 * i /*expert start index*/, + moeConfig->fused_exp_block_size /*number of experts*/, + moeConfig->fused_exp_block_size * i /*expert start index*/, 1 /*number of linear layers*/, moeConfig->hidden_size /*output_size*/, moeConfig->hidden_size /*internal_size*/); agg_inputs[i + 4] = exp_pred; } - for (int i = 0; i < num_exp + 4; i++) { + for (int i = 0; i < moeConfig->num_exp + 4; i++) { agg_inputs[i]->print("agg_inputs[i]"); } - Tensor coop_output = model->aggregate(agg_inputs, num_exp, moeConfig->lambda); + Tensor coop_output = model->aggregate(agg_inputs, moeConfig->num_exp, moeConfig->lambda); // model->get_metrics(); return coop_output; } diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index a9fd2d2325..ed7b5af483 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -31,7 +31,11 @@ using namespace FlexFlow; struct MoeConfig { MoeConfig(void) { // MoE layer - num_exp = 5; + // total number of experts + num_exp = 128; + // number of experts in each block of fused experts + fused_exp_block_size = 32; + // number of experts to route each token to num_select = 2; alpha = 2.0f; lambda = 0.04f; @@ -44,6 +48,7 @@ struct MoeConfig { // MoE layer int num_exp; int num_select; + int fused_exp_block_size; float alpha; // factor overhead tensor size for imbalance float lambda; // multiplier for load balance term int hidden_size; From 71463cc01092aea2507ce84e41e5588e8e18250d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 18 Jan 2023 00:13:31 -0500 Subject: [PATCH 047/344] formatting --- examples/cpp/inference/MLP_Unify/mlp.cc | 40 +++-- examples/cpp/inference/data_generator.cpp | 4 +- examples/cpp/inference/data_generator.h | 167 +++++++++--------- .../cpp/inference/mixture_of_experts/moe.cc | 20 ++- include/flexflow/inference.h | 9 +- include/flexflow/ops/experts.h | 6 
+- src/ops/experts.cc | 37 ++-- src/runtime/inference_manager.cc | 35 ++-- 8 files changed, 169 insertions(+), 149 deletions(-) diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc index 56833814ba..7631af0445 100644 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ b/examples/cpp/inference/MLP_Unify/mlp.cc @@ -23,7 +23,6 @@ using namespace Legion; using namespace FlexFlow; - Tensor create_mlp(FFModel *model, MLPConfig const *mlpConfig, Tensor const &input1, @@ -44,7 +43,7 @@ void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - + // Inference parameters int total_requests = 256; // total number of requests processed as part of the simulation @@ -61,13 +60,13 @@ void FlexFlow::top_level_task(Task const *task, 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; FFConfig ffConfig; - ffConfig.batchSize=1; + ffConfig.batchSize = 1; { - fprintf(stderr, "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", - ffConfig.batchSize, - ffConfig.workersPerNode, - ffConfig.numNodes - ); + fprintf(stderr, + "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", + ffConfig.batchSize, + ffConfig.workersPerNode, + ffConfig.numNodes); } FFModel ff(ffConfig); MLPConfig mlpConfig(embedding_size, sequence_length, hidden_dims); @@ -81,21 +80,26 @@ void FlexFlow::top_level_task(Task const *task, hd << hidden_dims[i]; } hd << '}'; - fprintf(stderr, "embedding_size(%d) sequence_length(%d) hidden_dims(%s)\n", mlpConfig.embedding_size, mlpConfig.sequence_length, hd.str().c_str()); + fprintf(stderr, + "embedding_size(%d) sequence_length(%d) hidden_dims(%s)\n", + mlpConfig.embedding_size, + mlpConfig.sequence_length, + hd.str().c_str()); } - + Tensor input1, input2; { - int const dims[] = {total_requests, mlpConfig.sequence_length * mlpConfig.embedding_size}; + int const dims[] = {total_requests, + mlpConfig.sequence_length * mlpConfig.embedding_size}; input1 = ff.create_tensor<2>(dims, DT_FLOAT); input2 = ff.create_tensor<2>(dims, DT_FLOAT); } Tensor t = create_mlp(&ff, &mlpConfig, input1, input2); - + InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); // im.compile_model_and_allocate_buffer(); ff.init_operators(); - + // Start timer { runtime->issue_execution_fence(ctx); @@ -104,10 +108,9 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } double ts_start = Realm::Clock::current_time_in_microseconds(); - - + /////////////////////////////////////////////////////////////////////////////////// - + // Main loop, processing requests as they come (from the generator) int index = 0; int processed_requests = 0; @@ -121,12 +124,11 @@ void FlexFlow::top_level_task(Task const *task, im.inference((index++) % num_inflight_batches); runtime->end_trace(ctx, 111 /*trace_id*/); } - processed_requests+= iterations; + processed_requests += iterations; } /////////////////////////////////////////////////////////////////////////////////// - - + // End timer { runtime->issue_execution_fence(ctx); diff --git a/examples/cpp/inference/data_generator.cpp b/examples/cpp/inference/data_generator.cpp index bf6456ef84..7429fdb159 100644 --- a/examples/cpp/inference/data_generator.cpp +++ b/examples/cpp/inference/data_generator.cpp @@ -5,11 +5,11 @@ // Created by User on 11/15/22. 
// -#include +#include "data_generator.h" #include +#include #include #include -#include "data_generator.h" using namespace std; // This is for running the dataloader standalone diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index 199af572f8..8c3a89d2a7 100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -1,85 +1,88 @@ -#include +#include #include +#include +#include #include -#include -#include #include -#include +#include using namespace std; typedef std::chrono::high_resolution_clock Clock; typedef std::chrono::milliseconds milliseconds; class Generator { - public: - size_t num_requests; // total number of requests - size_t tensor_size; // dimension of one request tensor - bool poisson; // false implied uniform distribution - double lambda; // mean #num of arrivals per sec - - Generator(size_t req, size_t tensor, bool poi, double lamb) { - num_requests = req; - tensor_size = tensor; - poisson = poi; - lambda = lamb; - compute_distribution(); - arrivals_ptr = arrivals.begin(); - timer_started = false; - } - - vector> get_requests(void); // function to retrieve requests - - private: - bool timer_started; // tracks if start time has been initiated - Clock::time_point start_time; // time when get_requests() is called for the first time - vector arrivals; // arrival times (ms) generated based on distribution - vector::iterator arrivals_ptr; // next request to output - - void compute_distribution( void ); // populate arrivals - vector get_random_tensor(void); // generate a random tensor +public: + size_t num_requests; // total number of requests + size_t tensor_size; // dimension of one request tensor + bool poisson; // false implied uniform distribution + double lambda; // mean #num of arrivals per sec + + Generator(size_t req, size_t tensor, bool poi, double lamb) { + num_requests = req; + tensor_size = tensor; + poisson = poi; + lambda = lamb; + compute_distribution(); + arrivals_ptr = arrivals.begin(); + timer_started = false; + } + + vector> get_requests(void); // function to retrieve requests + +private: + bool timer_started; // tracks if start time has been initiated + Clock::time_point + start_time; // time when get_requests() is called for the first time + vector arrivals; // arrival times (ms) generated based on distribution + vector::iterator arrivals_ptr; // next request to output + + void compute_distribution(void); // populate arrivals + vector get_random_tensor(void); // generate a random tensor }; -void Generator::compute_distribution( void ) { - // set up uniform number generator [0,1) - random_device rnd; - mt19937 gen(rnd()); - uniform_real_distribution dist {0, 1.0}; - double cur_arrival = 0; // assume first request comes in at time 0 - - for (size_t i = 0; i < num_requests; i++) { - arrivals.push_back(cur_arrival); - cout << "arrival time " << i << ": +" << cur_arrival << "ms \n"; - - if (poisson) { - double u = dist(gen); - double interval = -(1/lambda) * log(1-u) * 1000; - cur_arrival += interval; - } else { - cur_arrival += (1000/lambda); - } +void Generator::compute_distribution(void) { + // set up uniform number generator [0,1) + random_device rnd; + mt19937 gen(rnd()); + uniform_real_distribution dist{0, 1.0}; + double cur_arrival = 0; // assume first request comes in at time 0 + + for (size_t i = 0; i < num_requests; i++) { + arrivals.push_back(cur_arrival); + cout << "arrival time " << i << ": +" << cur_arrival << "ms \n"; + + if (poisson) { + double u = dist(gen); + 
double interval = -(1 / lambda) * log(1 - u) * 1000; + cur_arrival += interval; + } else { + cur_arrival += (1000 / lambda); } - return; + } + return; }; vector> Generator::get_requests(void) { - Clock::time_point cur_time = Clock::now(); - vector> requests; - if (!timer_started){ - // simply return one request and start timer for the first call - start_time = Clock::now(); - timer_started = true; - arrivals_ptr++; - requests.push_back(get_random_tensor()); - return requests; - } - - // output requests till we reach current timestamp - milliseconds ms_from_start = chrono::duration_cast(cur_time - start_time); - while (arrivals_ptr < arrivals.end() && ms_from_start.count() >= *arrivals_ptr){ - cout << "output request at arrival time +" << *arrivals_ptr << "\n"; - requests.push_back(get_random_tensor()); - arrivals_ptr++; - } + Clock::time_point cur_time = Clock::now(); + vector> requests; + if (!timer_started) { + // simply return one request and start timer for the first call + start_time = Clock::now(); + timer_started = true; + arrivals_ptr++; + requests.push_back(get_random_tensor()); return requests; + } + + // output requests till we reach current timestamp + milliseconds ms_from_start = + chrono::duration_cast(cur_time - start_time); + while (arrivals_ptr < arrivals.end() && + ms_from_start.count() >= *arrivals_ptr) { + cout << "output request at arrival time +" << *arrivals_ptr << "\n"; + requests.push_back(get_random_tensor()); + arrivals_ptr++; + } + return requests; }; // template @@ -90,27 +93,25 @@ vector> Generator::get_requests(void) { // } vector Generator::get_random_tensor(void) { - random_device rnd_device; - mt19937 mersenne_engine {rnd_device()}; - uniform_real_distribution dist {0, 1.0}; // state distribution - - auto gen = [&dist, &mersenne_engine](){ - return dist(mersenne_engine); - }; + random_device rnd_device; + mt19937 mersenne_engine{rnd_device()}; + uniform_real_distribution dist{0, 1.0}; // state distribution + + auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); }; - vector vec(tensor_size); - generate(begin(vec), end(vec), gen); - return vec; + vector vec(tensor_size); + generate(begin(vec), end(vec), gen); + return vec; }; // for debugging void print_requests(vector> req) { - cout << "printing requests\n"; - for (vector v: req){ - for (double e: v) { - cout << e << ","; - } - cout << "\n"; + cout << "printing requests\n"; + for (vector v : req) { + for (double e : v) { + cout << e << ","; } cout << "\n"; + } + cout << "\n"; }; diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 99bdec62fb..f0823a9b80 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -47,19 +47,21 @@ Tensor create_moe(FFModel *model, agg_inputs[3] = gate_preds; // full gate preds assert(moeConfig->num_exp % moeConfig->fused_exp_block_size == 0); for (int i = 0; i < moeConfig->num_exp /*number of experts layers*/; i++) { - Tensor exp_pred = model->experts(gate_preds, - topK_output[1], - moeConfig->fused_exp_block_size /*number of experts*/, - moeConfig->fused_exp_block_size * i /*expert start index*/, - 1 /*number of linear layers*/, - moeConfig->hidden_size /*output_size*/, - moeConfig->hidden_size /*internal_size*/); + Tensor exp_pred = model->experts( + gate_preds, + topK_output[1], + moeConfig->fused_exp_block_size /*number of experts*/, + moeConfig->fused_exp_block_size * i /*expert start index*/, + 1 /*number of linear layers*/, + 
moeConfig->hidden_size /*output_size*/, + moeConfig->hidden_size /*internal_size*/); agg_inputs[i + 4] = exp_pred; } for (int i = 0; i < moeConfig->num_exp + 4; i++) { agg_inputs[i]->print("agg_inputs[i]"); } - Tensor coop_output = model->aggregate(agg_inputs, moeConfig->num_exp, moeConfig->lambda); + Tensor coop_output = + model->aggregate(agg_inputs, moeConfig->num_exp, moeConfig->lambda); // model->get_metrics(); return coop_output; } @@ -125,7 +127,7 @@ void FlexFlow::top_level_task(Task const *task, //----------------------------------------------------------------- - //Tensor t = create_moe_encoder(&ff, &moeConfig, input); + // Tensor t = create_moe_encoder(&ff, &moeConfig, input); Tensor t = create_moe(&ff, &moeConfig, input); t = ff.dense(t, OUT_DIM, AC_MODE_RELU); InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 7991548b44..5c9fe5f497 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -23,16 +23,17 @@ class FFModel; class InferenceManager { public: - InferenceManager(FFModel* _model, + InferenceManager(FFModel *_model, int max_num_requests_per_batch, int max_num_inflight_batches); void compile_model_and_allocate_buffer(void); void inference(int index); + public: - std::unordered_map > tensor_buffer; - FFModel* model; + std::unordered_map> tensor_buffer; + FFModel *model; int max_num_requests_per_batch; int max_num_inflight_batches; }; -} +} // namespace FlexFlow diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 9b05270f79..bd27e8be24 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -35,9 +35,9 @@ class Experts : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; void serialize(Legion::Serializer &) const override; static PCG::Node deserialize(FFModel &ff, diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 368ed9b93d..517190f7b8 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -45,17 +45,24 @@ bool operator==(ExpertsParams const &lhs, ExpertsParams const &rhs) { bool ExpertsParams::is_valid( std::pair const &input) const { - if (!input.first.is_valid()) + if (!input.first.is_valid()) { return false; - if (!input.second.is_valid()) + } + if (!input.second.is_valid()) { return false; - if (input.first.num_dims != input.second.num_dims + 1) + } + if (input.first.num_dims != input.second.num_dims + 1) { return false; - if (input.second.data_type != DT_INT32 && input.second.data_type != DT_INT64) + } + if (input.second.data_type != DT_INT32 && + input.second.data_type != DT_INT64) { return false; - for (int i = 0; i < input.second.num_dims; i++) - if (input.second.dims[i] != input.first.dims[i + 1]) + } + for (int i = 0; i < input.second.num_dims; i++) { + if (input.second.dims[i] != input.first.dims[i + 1]) { return false; + } + } return true; } @@ -87,13 +94,15 @@ Tensor FFModel::experts(const Tensor input, input, indices); assert(input->num_dims == indices->num_dims); - for (int i = 1; i < indices->num_dims; i++) + for (int i = 1; i < indices->num_dims; i++) { assert(input->dims[i] == indices->dims[i]); + } assert(indices->data_type == DT_INT32 || indices->data_type == 
DT_INT64); int dims[MAX_TENSOR_DIM]; int numdim = input->num_dims; - for (int i = 1; i < input->num_dims; i++) + for (int i = 1; i < input->num_dims; i++) { dims[i] = input->dims[i]; + } dims[0] = experts_output_dim_size; e->outputs[0] = create_tensor_legion_ordering( numdim, dims, input->data_type, e, 0, true /*create_grad*/); @@ -170,15 +179,17 @@ Experts::Experts(FFModel &model, experts_internal_dim_size(_experts_internal_dim_size) { assert(input->num_dims == indices->num_dims); assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); - for (int i = 1; i < indices->num_dims; i++) + for (int i = 1; i < indices->num_dims; i++) { assert(input->dims[i] == indices->dims[i]); + } // Assume that we don't parallelize the channel dim of input // nor the expert_assigned dim of indices assert(input->dims[0].degree == 1); assert(indices->dims[0].degree == 1); ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < input->num_dims; i++) + for (int i = 0; i < input->num_dims; i++) { dims[i] = input->dims[i]; + } dims[0].size = experts_output_dim_size; numOutputs = 1; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -295,9 +306,9 @@ void Experts::backward_task(Task const *task, } void Experts::inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 71fe0ec6f9..be572848be 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -22,22 +22,21 @@ using namespace Legion; InferenceManager::InferenceManager(FFModel *_model, int _max_num_requests_per_batch, int _max_num_inflight_batches) - : model(_model), - max_num_requests_per_batch(_max_num_requests_per_batch), - max_num_inflight_batches(_max_num_inflight_batches) { - -} + : model(_model), max_num_requests_per_batch(_max_num_requests_per_batch), + max_num_inflight_batches(_max_num_inflight_batches) {} void InferenceManager::compile_model_and_allocate_buffer(void) { std::vector metrics; model->config.batchSize = max_num_requests_per_batch; - model->compile(LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics, COMP_MODE_INFERENCE); + model->compile( + LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics, COMP_MODE_INFERENCE); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; - for (const auto& op : model->operators) { + for (auto const &op : model->operators) { // Skip weight operators - if (op->op_type == OP_WEIGHT) + if (op->op_type == OP_WEIGHT) { continue; + } for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); @@ -45,10 +44,12 @@ void InferenceManager::compile_model_and_allocate_buffer(void) { for (int j = 0; j < max_num_inflight_batches; j++) { // Copy the metadata from pt_base to pt ParallelTensor pt = new ParallelTensorBase(*pt_base); - pt->region = runtime->create_logical_region(ctx, - pt_base->region.get_index_space(), - pt_base->region.get_field_space()); - pt->part = runtime->get_logical_partition(ctx, pt->region, pt_base->part.get_index_partition()); + pt->region = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part = runtime->get_logical_partition( + ctx, pt->region, 
pt_base->part.get_index_partition()); list.push_back(pt); } tensor_buffer[pt_base] = list; @@ -59,15 +60,17 @@ void InferenceManager::compile_model_and_allocate_buffer(void) { void InferenceManager::inference(int index) { assert(index < max_num_inflight_batches); for (size_t o = 0; o < model->operators.size(); o++) { - Op* op = model->operators[o]; + Op *op = model->operators[o]; std::vector inputs(op->numInputs); std::vector outputs(op->numOutputs); - for (int i = 0; i < op->numInputs; i++) + for (int i = 0; i < op->numInputs; i++) { inputs[i] = tensor_buffer[op->inputs[i]][index]; - for (int i = 0; i < op->numOutputs; i++) + } + for (int i = 0; i < op->numOutputs; i++) { outputs[i] = tensor_buffer[op->outputs[i]][index]; + } op->inference(*model, inputs, outputs); } }; -}; +}; // namespace FlexFlow From 748bad7e4bc5d5b9f9f3c0502e51a000cb86aabf Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 19 Jan 2023 13:04:20 -0500 Subject: [PATCH 048/344] [Inference][Experts OP] - Fix several bugs and seg fault --- config/config.linux | 4 ++-- examples/cpp/inference/mixture_of_experts/moe.cc | 8 ++++---- include/flexflow/model.h | 1 + include/flexflow/ops/aggregate.h | 2 +- src/ops/experts.cc | 10 +++++----- src/runtime/graph.cc | 5 +++++ src/runtime/model.cc | 5 +++++ 7 files changed, 23 insertions(+), 12 deletions(-) diff --git a/config/config.linux b/config/config.linux index 0f819f4031..28cf7c2fe1 100755 --- a/config/config.linux +++ b/config/config.linux @@ -38,8 +38,8 @@ FF_USE_GASNET=${FF_USE_GASNET:-OFF} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} # build C++ examples -FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-ON} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} +FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} +FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-OFF} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index f0823a9b80..a72ef584d2 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -57,9 +57,9 @@ Tensor create_moe(FFModel *model, moeConfig->hidden_size /*internal_size*/); agg_inputs[i + 4] = exp_pred; } - for (int i = 0; i < moeConfig->num_exp + 4; i++) { - agg_inputs[i]->print("agg_inputs[i]"); - } + // for (int i = 0; i < moeConfig->num_exp + 4; i++) { + // agg_inputs[i]->print("agg_inputs[i]"); + // } Tensor coop_output = model->aggregate(agg_inputs, moeConfig->num_exp, moeConfig->lambda); // model->get_metrics(); @@ -131,7 +131,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor t = create_moe(&ff, &moeConfig, input); t = ff.dense(t, OUT_DIM, AC_MODE_RELU); InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); - // im.compile_model_and_allocate_buffer(); + im.compile_model_and_allocate_buffer(); ff.init_operators(); // Data Loader diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 0545cc7fbb..b187629483 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -675,6 +675,7 @@ class FFModel { auto input_shapes = get_input_shape(input); if (!params.is_valid(input_shapes)) { + printf("!params.is_valid(input_shapes)\n"); return PCG::Node::INVALID_NODE; } diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 6e4dd0b4ac..bd757c6911 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -8,7 +8,7 @@ 
namespace FlexFlow { #define AGGREGATE_MAX_K 4 #define AGGREGATE_MAX_BATCH_SIZE 64 -#define AGGREGATE_MAX_N 12 +#define AGGREGATE_MAX_N 128 class AggregateMeta : public OpMeta { public: diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 517190f7b8..540c7b2abd 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -51,15 +51,15 @@ bool ExpertsParams::is_valid( if (!input.second.is_valid()) { return false; } - if (input.first.num_dims != input.second.num_dims + 1) { + if (input.first.num_dims != input.second.num_dims) { return false; } if (input.second.data_type != DT_INT32 && input.second.data_type != DT_INT64) { return false; } - for (int i = 0; i < input.second.num_dims; i++) { - if (input.second.dims[i] != input.first.dims[i + 1]) { + for (int i = 1; i < input.second.num_dims; i++) { + if (input.second.dims[i] != input.first.dims[i]) { return false; } } @@ -89,7 +89,7 @@ Tensor FFModel::experts(const Tensor input, DT_FLOAT, name, 2 /*inputs*/, - 1 /*weights*/, + 0 /*weights*/, // to be changed back to 1 1 /*outputs*/, input, indices); @@ -169,7 +169,7 @@ Experts::Experts(FFModel &model, DT_FLOAT, name, 2 /*inputs*/, - 1 /*weights*/, + 0 /*weights*/, // to be changed back to 1 1 /*outputs*/, input, indices), diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 837585ae15..85f5132f6f 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -25,6 +25,7 @@ #include "flexflow/ops/element_binary.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/embedding.h" +#include "flexflow/ops/experts.h" #include "flexflow/ops/flat.h" #include "flexflow/ops/groupby.h" #include "flexflow/ops/layer_norm.h" @@ -2072,6 +2073,10 @@ void FFModel::deserialize_graph_optimal_view( {std::begin(inputs), std::begin(inputs) + num_inputs}, params); break; } + case OP_EXPERTS: { + node = Experts::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_POOL2D: { node = Pool2D::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 7350735314..770292a6f2 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2764,6 +2764,11 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_EXPERTS: { + Op *op = Experts::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } default: assert(false); } From 4e0f568dee141ff5c1ec26c68872ac94981a603c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 19 Jan 2023 21:07:23 -0500 Subject: [PATCH 049/344] removed unnecessary changes --- src/ops/aggregate.cc | 2 -- src/ops/group_by.cpp | 2 +- src/ops/group_by.cu | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 49e564a702..b64238125c 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -301,8 +301,6 @@ void Aggregate::forward_task(Task const *task, Runtime *runtime) { assert(regions.size() == task->regions.size()); int n = regions.size() - 3; - // FIXME: skip the aggregate computation for now - return; AggregateMeta const *m = *((AggregateMeta **)task->local_args); diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index e0b914cf1a..f45e9092a5 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -193,4 +193,4 @@ GroupByMeta::~GroupByMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index ac29904e9f..ee0b18337c 
100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -203,4 +203,4 @@ GroupByMeta::~GroupByMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow From 0733c94a5758ba5567d9bc6eddfbc27f1e1611a4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Jan 2023 11:53:49 -0500 Subject: [PATCH 050/344] Implement Experts operator (#582) * backup * finished all experts layer, except kernels * moved deserializer back into ops, fixed initialization bug in aggregate * save file * bug fixes * comment out assert for now * finished kernel implementation * fix hip build * clang * rename tensor * fix --- .../cpp/inference/mixture_of_experts/moe.cc | 115 ++-- .../cpp/inference/mixture_of_experts/moe.h | 4 +- include/flexflow/model.h | 21 +- include/flexflow/ops/aggregate.h | 6 +- include/flexflow/ops/experts.h | 36 +- include/flexflow/ops/experts_params.h | 6 +- src/ops/aggregate.cc | 16 + src/ops/experts.cc | 522 ++++++++++++------ src/ops/experts.cpp | 148 +++++ src/ops/experts.cu | 150 +++++ src/runtime/graph.cc | 22 +- src/runtime/model.cc | 17 +- src/runtime/substitution.cc | 8 + 13 files changed, 821 insertions(+), 250 deletions(-) create mode 100644 src/ops/experts.cpp create mode 100644 src/ops/experts.cu diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index a72ef584d2..e6f9a51d21 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -40,30 +40,29 @@ Tensor create_moe(FFModel *model, Tensor gate_preds = model->dense(input, moeConfig->num_exp, AC_MODE_RELU); Tensor topK_output[2]; model->top_k(gate_preds, topK_output, moeConfig->num_select, false); - Tensor agg_inputs[moeConfig->num_exp + 4]; - agg_inputs[0] = model->softmax(topK_output[0]); // gate preds - agg_inputs[1] = topK_output[1]; // gate assign - agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) - agg_inputs[3] = gate_preds; // full gate preds - assert(moeConfig->num_exp % moeConfig->fused_exp_block_size == 0); - for (int i = 0; i < moeConfig->num_exp /*number of experts layers*/; i++) { - Tensor exp_pred = model->experts( - gate_preds, - topK_output[1], - moeConfig->fused_exp_block_size /*number of experts*/, - moeConfig->fused_exp_block_size * i /*expert start index*/, - 1 /*number of linear layers*/, - moeConfig->hidden_size /*output_size*/, - moeConfig->hidden_size /*internal_size*/); - agg_inputs[i + 4] = exp_pred; + + assert(moeConfig->num_exp % moeConfig->experts_per_block == 0); + int nblocks = moeConfig->num_exp / moeConfig->experts_per_block; + Tensor exp_preds; + Tensor expert_block_inputs[3] = {input, topK_output[1], topK_output[0]}; + for (int i = 0; i < nblocks /*number of experts layers*/; i++) { + Tensor block_preds = + model->experts(expert_block_inputs, + moeConfig->experts_per_block, /*number of experts*/ + moeConfig->experts_per_block * i, /*expert start index*/ + moeConfig->hidden_size, /*output_size*/ + moeConfig->alpha); + assert(block_preds != nullptr); + if (i == 0) { + exp_preds = block_preds; + } else { + assert(exp_preds != nullptr); + model->add(exp_preds, block_preds, /*inplace_a*/ true); + } } - // for (int i = 0; i < moeConfig->num_exp + 4; i++) { - // agg_inputs[i]->print("agg_inputs[i]"); - // } - Tensor coop_output = - model->aggregate(agg_inputs, moeConfig->num_exp, moeConfig->lambda); + // model->get_metrics(); - return coop_output; + return exp_preds; } Tensor 
create_moe_encoder(FFModel *model, @@ -94,14 +93,14 @@ void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - // Inference parameters + /* // Inference parameters int total_requests = 256; // total number of requests processed as part of the simulation int request_tensor_size = 4; // request tensor dimensions bool poisson_distribution = true; double lambda = 25; // average number of request arrivals per second int num_requests_per_batch = 5; - int num_inflight_batches = 10; + int num_inflight_batches = 10; */ //----------------------------------------------------------------- @@ -130,9 +129,15 @@ void FlexFlow::top_level_task(Task const *task, // Tensor t = create_moe_encoder(&ff, &moeConfig, input); Tensor t = create_moe(&ff, &moeConfig, input); t = ff.dense(t, OUT_DIM, AC_MODE_RELU); - InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); - im.compile_model_and_allocate_buffer(); - ff.init_operators(); + + /* InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); + im.compile_model_and_allocate_buffer(); */ + + Optimizer *optimizer = new SGDOptimizer(&ff, 0.001f); + std::vector metrics; + metrics.push_back(METRICS_ACCURACY); + metrics.push_back(METRICS_SPARSE_CATEGORICAL_CROSSENTROPY); + ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics); // Data Loader ParallelTensor input_pt, label_pt; @@ -140,6 +145,8 @@ void FlexFlow::top_level_task(Task const *task, ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); DataLoader data_loader(ff, moeConfig, input_pt, label_pt); + ff.init_operators(); + //----------------------------------------------------------------- // Start timer @@ -153,20 +160,50 @@ void FlexFlow::top_level_task(Task const *task, /////////////////////////////////////////////////////////////////////////////////// - int index = 0; - int processed_requests = 0; - Generator data_generator( - total_requests, request_tensor_size, poisson_distribution, lambda); - while (processed_requests < total_requests) { - vector> req = data_generator.get_requests(); - int iterations = req.size(); + // int index = 0; + // int processed_requests = 0; + // Generator data_generator( + // total_requests, request_tensor_size, poisson_distribution, lambda); + // while (processed_requests < total_requests) { + // vector> req = data_generator.get_requests(); + // int iterations = req.size(); + // for (int iter = 0; iter < iterations; iter++) { + // // data_loader.next_batch(ff); + // runtime->begin_trace(ctx, 111 /*trace_id*/); + // im.inference((index++) % num_inflight_batches); + // runtime->end_trace(ctx, 111 /*trace_id*/); + // } + // processed_requests += iterations; + // } + + for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { + data_loader.reset(); + ff.reset_metrics(); + int iterations = TRAIN_SAMPLES / ffConfig.batchSize; + for (int iter = 0; iter < iterations; iter++) { - // data_loader.next_batch(ff); - runtime->begin_trace(ctx, 111 /*trace_id*/); - im.inference((index++) % num_inflight_batches); - runtime->end_trace(ctx, 111 /*trace_id*/); + data_loader.next_batch(ff); + if (epoch > 0) { + runtime->begin_trace(ctx, 111 /*trace_id*/); + } + ff.forward(); + ff.zero_gradients(); + // ff.backward(); + ff.update(); + // ff.recompile_on_condition(r); + if (epoch > 0) { + runtime->end_trace(ctx, 111 /*trace_id*/); + } } - processed_requests += iterations; + + // TODO: Do properly + ff.reset_metrics(); + // iterations = TEST_SAMPLES / ffConfig.batchSize; + // for (int iter = 0; iter 
< iterations; iter++) { + // data_loader.next_batch(ff); + // ff.forward(); + // ff.backward(); + // } } /////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index ed7b5af483..80cef3ff87 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -34,7 +34,7 @@ struct MoeConfig { // total number of experts num_exp = 128; // number of experts in each block of fused experts - fused_exp_block_size = 32; + experts_per_block = 32; // number of experts to route each token to num_select = 2; alpha = 2.0f; @@ -48,7 +48,7 @@ struct MoeConfig { // MoE layer int num_exp; int num_select; - int fused_exp_block_size; + int experts_per_block; float alpha; // factor overhead tensor size for imbalance float lambda; // multiplier for load balance term int hidden_size; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index b187629483..9ad9d52eab 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -57,7 +57,6 @@ enum TaskIDs { EXPERTS_INIT_TASK_ID, EXPERTS_FWD_TASK_ID, EXPERTS_BWD_TASK_ID, - EXPERTS_INF_TASK_ID, CONV2D_INIT_TASK_ID, CONV2D_INIT_PARA_TASK_ID, CONV2D_FWD_TASK_ID, @@ -484,14 +483,15 @@ class FFModel { Tensor concat(int n, Tensor const *tensors, int axis, char const *name = NULL); // Add an experts layer - Tensor experts(const Tensor input, - const Tensor indices, - int num_experts, - int experts_start_idx, - int experts_num_layers, - int experts_output_dim_size, - int experts_internal_dim_size, - char const *name = nullptr); + Tensor experts( + Tensor const *inputs, + int num_experts, + int experts_start_idx, + int experts_output_dim_size, + float alpha, + int experts_num_layers = 1, // number of linear layers per expert + int experts_internal_dim_size = 0, // hidden dimension for internal layers + char const *name = NULL); // Add a mean layer Tensor mean(const Tensor input, std::vector const &dims, @@ -865,8 +865,7 @@ class FFModel { std::unordered_map, Embedding *>, std::unordered_map< - std::pair, - ExpertsParams>, + std::pair, ExpertsParams>, Experts *>, std::unordered_map, Flat *>, diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index bd757c6911..ba7240802b 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -26,7 +26,7 @@ class Aggregate : public Op { ParallelTensor const *inputs, int _n, float _lambda_bal, - char const *name); + char const *name = nullptr); Aggregate(FFModel &model, Aggregate const &other, std::vector const &inputs); @@ -85,6 +85,10 @@ class Aggregate : public Op { int const batch_size, int out_dim); void serialize(Legion::Serializer &s) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + Input const &inputs, + int num_inputs); bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index bd27e8be24..6b875a10b9 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -1,29 +1,36 @@ #pragma once #include "flexflow/model.h" +#include "flexflow/ops/experts_params.h" namespace FlexFlow { class ExpertsMeta : public OpMeta { public: - ExpertsMeta(FFHandler handler) : OpMeta(handler){}; + ExpertsMeta(FFHandler handler, int num_experts); + ~ExpertsMeta(void); + float **dev_region_ptrs; }; 
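For context, the reworked experts() entry point is driven as in the moe.cc hunk above: the caller passes the raw input together with the top-k indices and top-k gate values, and one Experts op is created per block of fused experts. A minimal sketch, with names and values mirroring the moe.cc example and the moe.h defaults shown above (only the first expert block, start index 0, is shown):

    // Sketch: gate the tokens, then run one block of fused experts.
    Tensor gate_preds = model->dense(input, moeConfig->num_exp, AC_MODE_RELU);
    Tensor topK_output[2];
    model->top_k(gate_preds, topK_output, moeConfig->num_select, false);
    Tensor expert_block_inputs[3] = {input, topK_output[1], topK_output[0]};
    Tensor block_preds =
        model->experts(expert_block_inputs,
                       moeConfig->experts_per_block, /*number of experts*/
                       0,                            /*expert start index*/
                       moeConfig->hidden_size,       /*output_size*/
                       moeConfig->alpha);

Internally, FFModel::experts() (see the experts.cc hunk below) prepends a dense + softmax pair producing num_experts * experts_output_dim_size gated predictions, so inputs[0] of the resulting Experts op is that fused tensor rather than the raw input.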
+// definitions for the CUDA kernel +#define MAX_BATCH_SIZE 64 +#define MAX_EXPERTS_PER_BLOCK 32 + class Experts : public Op { public: using Params = ExpertsParams; - using Input = std::pair; + using Input = std::vector; Experts(FFModel &model, Params const ¶ms, Input const &inputs, char const *name = nullptr); Experts(FFModel &model, - const ParallelTensor input, - const ParallelTensor indices, + ParallelTensor const *inputs, int _num_experts, int _experts_start_idx, - int _experts_num_layers, int _experts_output_dim_size, + float _alpha, + int _experts_num_layers, int _experts_internal_dim_size, char const *name = nullptr); static Op * @@ -42,11 +49,8 @@ class Experts : public Op { void serialize(Legion::Serializer &) const override; static PCG::Node deserialize(FFModel &ff, Legion::Deserializer &d, - ParallelTensor inputs[], + Input const &inputs, int num_inputs); - Op *materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const override; Params get_params() const; static OpMeta *init_task(Legion::Task const *task, std::vector const ®ions, @@ -56,6 +60,17 @@ class Experts : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void forward_kernel_wrapper(ExpertsMeta const *m, + float const *acc_input_ptr, + int const *acc_indices_ptr, + float const *acc_topk_gate_preds_ptr, + float **outputs, + int num_experts, + int experts_start_idx, + int expert_capacity, + int chosen_experts, + int batch_size, + int out_dim); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -71,8 +86,9 @@ class Experts : public Op { public: int num_experts; int experts_start_idx; - int experts_num_layers; int experts_output_dim_size; + float alpha; + int experts_num_layers; int experts_internal_dim_size; }; diff --git a/include/flexflow/ops/experts_params.h b/include/flexflow/ops/experts_params.h index 8f0cee4959..20a65a06f8 100644 --- a/include/flexflow/ops/experts_params.h +++ b/include/flexflow/ops/experts_params.h @@ -5,12 +5,12 @@ namespace FlexFlow { struct ExpertsParams { - bool is_valid( - std::pair const &) const; + bool is_valid(std::vector const &) const; int num_experts; int experts_start_idx; - int experts_num_layers; int experts_output_dim_size; + float alpha; + int experts_num_layers; int experts_internal_dim_size; }; diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index b64238125c..a1e5fcbbad 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -166,6 +166,22 @@ Aggregate::Aggregate(FFModel &model, char const *name) : Aggregate(model, inputs.data(), params.n, params.lambda_bal, name) {} +using PCG::Node; +Node Aggregate::deserialize(FFModel &ff, + Legion::Deserializer &dez, + std::vector const &inputs, + int num_inputs) { + int n; + float lambda_bal; + dez.deserialize(n); + dez.deserialize(lambda_bal); + assert(num_inputs == n + 4); + AggregateParams params; + params.n = n; + params.lambda_bal = lambda_bal; + return ff.get_or_create_node(inputs, params); +} + void Aggregate::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 540c7b2abd..288507b6d8 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -35,84 +35,78 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -bool operator==(ExpertsParams const &lhs, ExpertsParams const &rhs) { - return lhs.num_experts == rhs.num_experts && - lhs.experts_start_idx == 
rhs.experts_start_idx && - lhs.experts_num_layers == rhs.experts_num_layers && - lhs.experts_output_dim_size == rhs.experts_output_dim_size && - lhs.experts_internal_dim_size == rhs.experts_internal_dim_size; -} - -bool ExpertsParams::is_valid( - std::pair const &input) const { - if (!input.first.is_valid()) { - return false; - } - if (!input.second.is_valid()) { - return false; - } - if (input.first.num_dims != input.second.num_dims) { - return false; - } - if (input.second.data_type != DT_INT32 && - input.second.data_type != DT_INT64) { - return false; - } - for (int i = 1; i < input.second.num_dims; i++) { - if (input.second.dims[i] != input.first.dims[i]) { - return false; - } - } - return true; -} - -ExpertsParams Experts::get_params() const { - ExpertsParams params; - params.num_experts = num_experts; - params.experts_start_idx = experts_start_idx; - params.experts_num_layers = experts_num_layers; - params.experts_output_dim_size = experts_output_dim_size; - params.experts_internal_dim_size = experts_internal_dim_size; - return params; -} - -Tensor FFModel::experts(const Tensor input, - const Tensor indices, +// For now, we use one input and one output per expert +Tensor FFModel::experts(Tensor const *inputs, int num_experts, int experts_start_idx, - int experts_num_layers, int experts_output_dim_size, + float alpha, + int experts_num_layers, int experts_internal_dim_size, char const *name) { + + // Check that there are three inputs: the input tensor, the indices and the + // topk_gate_preds + assert(inputs[0] != nullptr); + int num_dims = inputs[0]->num_dims; + assert(inputs[1]->num_dims == num_dims); + assert(inputs[2]->num_dims == num_dims); + int topk = inputs[1]->dims[0]; + assert(inputs[2]->dims[0] == topk); + for (int i = 1; i < num_dims; i++) { + assert(inputs[0]->dims[i] == inputs[1]->dims[i]); + assert(inputs[1]->dims[i] == inputs[2]->dims[i]); + } + // assert(input->num_dims == indices->num_dims); + // for (int i = 1; i < indices->num_dims; i++) { + // assert(input->dims[i] == indices->dims[i]); + // } + assert(inputs[1]->data_type == DT_INT32 || inputs[1]->data_type == DT_INT64); + + assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); + assert(experts_num_layers == 1 || experts_internal_dim_size > 0); + + Tensor fused_experts = this->dense( + inputs[0], num_experts * experts_output_dim_size, AC_MODE_RELU); + fused_experts = this->softmax(fused_experts); + + Tensor const layer_inputs[3] = {fused_experts, inputs[1], inputs[2]}; + Layer *e = new Layer(this, OP_EXPERTS, DT_FLOAT, name, - 2 /*inputs*/, - 0 /*weights*/, // to be changed back to 1 - 1 /*outputs*/, - input, - indices); - assert(input->num_dims == indices->num_dims); - for (int i = 1; i < indices->num_dims; i++) { - assert(input->dims[i] == indices->dims[i]); - } - assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); - int dims[MAX_TENSOR_DIM]; - int numdim = input->num_dims; - for (int i = 1; i < input->num_dims; i++) { - dims[i] = input->dims[i]; + 3 /*inputs*/, + 0 /*weights*/, + num_experts /*outputs*/, + layer_inputs); + + { + int dims[MAX_TENSOR_DIM]; + for (int i = 1; i < num_dims; i++) { + dims[i] = inputs[0]->dims[i]; + } + dims[0] = experts_output_dim_size; + for (int i = 0; i < num_experts; i++) { + e->outputs[i] = create_tensor_legion_ordering( + num_dims, dims, DT_FLOAT, e, 0, true /*create_grad*/); + assert(e->outputs[i] != nullptr); + } } - dims[0] = experts_output_dim_size; - e->outputs[0] = create_tensor_legion_ordering( - numdim, dims, 
input->data_type, e, 0, true /*create_grad*/); + e->add_int_property("num_experts", num_experts); e->add_int_property("experts_start_idx", experts_start_idx); - e->add_int_property("experts_num_layers", experts_num_layers); e->add_int_property("experts_output_dim_size", experts_output_dim_size); + e->add_float_property("alpha", alpha); + e->add_int_property("experts_num_layers", experts_num_layers); e->add_int_property("experts_internal_dim_size", experts_internal_dim_size); layers.push_back(e); - return e->outputs[0]; + + Tensor ret = e->outputs[0]; + for (int i = 1; i < num_experts; i++) { + this->add(ret, e->outputs[i], /*inplace_a*/ true); + } + return ret; } Op *Experts::create_operator_from_layer( @@ -124,115 +118,228 @@ Op *Experts::create_operator_from_layer( int num_experts = value; layer->get_int_property("experts_start_idx", value); int experts_start_idx = value; - layer->get_int_property("experts_num_layers", value); - int experts_num_layers = value; layer->get_int_property("experts_output_dim_size", value); int experts_output_dim_size = value; + float value2; + layer->get_float_property("alpha", value2); + float alpha = value2; + layer->get_int_property("experts_num_layers", value); + int experts_num_layers = value; layer->get_int_property("experts_internal_dim_size", value); int experts_internal_dim_size = value; return new Experts(model, - inputs[0], - inputs[1], + inputs.data(), num_experts, experts_start_idx, - experts_num_layers, experts_output_dim_size, + alpha, + experts_num_layers, experts_internal_dim_size, layer->name); } +ExpertsParams Experts::get_params() const { + ExpertsParams params; + params.num_experts = num_experts; + params.experts_start_idx = experts_start_idx; + params.experts_output_dim_size = experts_output_dim_size; + params.alpha = alpha; + params.experts_num_layers = experts_num_layers; + params.experts_internal_dim_size = experts_internal_dim_size; + return params; +} + +bool ExpertsParams::is_valid( + std::vector const &inputs) const { + if (inputs.size() != 3) { + printf("Number of inputs to the Experts layer is wrong\n"); + return false; + } + if (!inputs[0].is_valid()) { + printf("The first tensor passed to the Experts layer is not valid\n"); + return false; + } + if (!inputs[1].is_valid()) { + printf("The second tensor passed to the Experts layer is not valid\n"); + return false; + } + if (!inputs[2].is_valid()) { + printf("The third tensor passed to the Experts layer is not valid\n"); + return false; + } + if (inputs[0].num_dims != inputs[1].num_dims || + inputs[1].num_dims != inputs[2].num_dims) { + printf("Mismatch found between the number of dimensions of the three input " + "tensors for the Expert layer\n"); + return false; + } + if (inputs[0].data_type != DT_FLOAT) { + printf("Data type of the first input to the Experts layer is wrong!\n"); + return false; + } + if (inputs[1].data_type != DT_INT32 && inputs[1].data_type != DT_INT64) { + printf("Data type of the second input to the Experts layer is wrong!\n"); + return false; + } + if (inputs[2].data_type != DT_FLOAT) { + printf("Data type of the third input to the Experts layer is wrong!\n"); + return false; + } + if (inputs[0].dims[0].size != num_experts * experts_output_dim_size) { + printf("Dimension 0 of input tensor 1 to the Experts layer is wrong.\n"); + return false; + } + if (inputs[1].dims[0] != inputs[2].dims[0]) { + printf( + "Dimension mismatch between indices and topk_gate_preds tensors passed " + "to the Experts layer.\n"); + return false; + } + for (int i = 1; i < 
inputs[0].num_dims; i++) { + if (inputs[0].dims[i] != inputs[1].dims[i] || + inputs[1].dims[i] != inputs[2].dims[i]) { + printf("Dimension mismatch among the input tensors passed to the Experts " + "layer.\n"); + return false; + } + } + return true; +} + +bool operator==(ExpertsParams const &lhs, ExpertsParams const &rhs) { + return lhs.num_experts == rhs.num_experts && + lhs.experts_start_idx == rhs.experts_start_idx && + lhs.experts_output_dim_size == rhs.experts_output_dim_size && + lhs.alpha == rhs.alpha && + lhs.experts_num_layers == rhs.experts_num_layers && + lhs.experts_internal_dim_size == rhs.experts_internal_dim_size; +} + Experts::Experts(FFModel &model, ExpertsParams const ¶ms, - std::pair const &inputs, + // std::pair const &inputs, + std::vector const &inputs, char const *name) : Experts(model, - inputs.first, - inputs.second, + inputs.data(), params.num_experts, params.experts_start_idx, - params.experts_num_layers, params.experts_output_dim_size, + params.alpha, + params.experts_num_layers, params.experts_internal_dim_size, name) {} Experts::Experts(FFModel &model, - const ParallelTensor input, - const ParallelTensor indices, + ParallelTensor const *inputs, int _num_experts, int _experts_start_idx, - int _experts_num_layers, int _experts_output_dim_size, + float _alpha, + int _experts_num_layers, int _experts_internal_dim_size, char const *name) : Op(model, OP_EXPERTS, DT_FLOAT, name, - 2 /*inputs*/, - 0 /*weights*/, // to be changed back to 1 - 1 /*outputs*/, - input, - indices), + 3 /*inputs*/, + 0 /*weights*/, + _num_experts /*outputs*/, + inputs), num_experts(_num_experts), experts_start_idx(_experts_start_idx), + experts_output_dim_size(_experts_output_dim_size), alpha(_alpha), experts_num_layers(_experts_num_layers), - experts_output_dim_size(_experts_output_dim_size), experts_internal_dim_size(_experts_internal_dim_size) { - assert(input->num_dims == indices->num_dims); - assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); - for (int i = 1; i < indices->num_dims; i++) { - assert(input->dims[i] == indices->dims[i]); + + assert(num_experts > 0); + assert(numInputs == 3); + assert(numOutputs == num_experts); + + assert(inputs[0] != nullptr); + int num_dims = inputs[0]->num_dims; + assert(inputs[1]->num_dims == num_dims); + assert(inputs[2]->num_dims == num_dims); + + int out_dim = num_experts * experts_output_dim_size; + assert(inputs[0]->dims[0].size == out_dim); + int topk = inputs[1]->dims[0].size; + assert(inputs[2]->dims[0].size == topk); + + for (int i = 1; i < num_dims; i++) { + assert(inputs[0]->dims[i] == inputs[1]->dims[i]); + assert(inputs[1]->dims[i] == inputs[2]->dims[i]); } + // assert(input->num_dims == indices->num_dims); + // for (int i = 1; i < indices->num_dims; i++) { + // assert(input->dims[i] == indices->dims[i]); + // } + assert(inputs[1]->data_type == DT_INT32 || inputs[1]->data_type == DT_INT64); + assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); + assert(experts_num_layers == 1 || experts_internal_dim_size > 0); + + // assert(input->num_dims == indices->num_dims); + // assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); + // for (int i = 1; i < indices->num_dims; i++) { + // assert(input->dims[i] == indices->dims[i]); + // } + // Assume that we don't parallelize the channel dim of input // nor the expert_assigned dim of indices - assert(input->dims[0].degree == 1); - assert(indices->dims[0].degree == 1); + assert(inputs[0]->dims[0].degree == 1); + 
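  // Shape recap (matching the asserts above): inputs[0] carries the gated
  // expert predictions with dims[0].size == num_experts * experts_output_dim_size,
  // while inputs[1] (chosen expert indices) and inputs[2] (top-k gate values)
  // both have dims[0].size == topk; all higher dimensions must agree.
  // alpha is the capacity factor consumed later in forward_task: for example,
  // with the moe.h defaults (alpha = 2.0, num_select = 2, experts_per_block = 32)
  // and a batch of 64 tokens, expert_capacity = ceil(2.0 * 2 / 32 * 64) = 8
  // token slots per expert.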
assert(inputs[1]->dims[0].degree == 1); + assert(inputs[2]->dims[0].degree == 1); + ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < input->num_dims; i++) { - dims[i] = input->dims[i]; + for (int i = 0; i < num_dims; i++) { + dims[i] = inputs[0]->dims[i]; } dims[0].size = experts_output_dim_size; - numOutputs = 1; - outputs[0] = model.create_parallel_tensor_legion_ordering( - input->num_dims, dims, input->data_type, this); + // numOutputs = num_experts; + // numWeights = 0; + for (int i = 0; i < num_experts; i++) { + outputs[i] = model.create_parallel_tensor_legion_ordering( + num_dims, dims, inputs[0]->data_type, this, i /*owner_idx*/); + assert(outputs[i] != nullptr); + } } void Experts::serialize(Legion::Serializer &sez) const { ExpertsParams params = get_params(); sez.serialize(params.num_experts); sez.serialize(params.experts_start_idx); - sez.serialize(params.experts_num_layers); sez.serialize(params.experts_output_dim_size); + sez.serialize(params.alpha); + sez.serialize(params.experts_num_layers); sez.serialize(params.experts_internal_dim_size); } using PCG::Node; Node Experts::deserialize(FFModel &ff, Legion::Deserializer &dez, - ParallelTensor inputs[], + std::vector const &inputs, int num_inputs) { - assert(num_inputs == 2); - int num_experts, experts_start_idx, experts_num_layers, - experts_output_dim_size, experts_internal_dim_size; + int num_experts, experts_start_idx, experts_output_dim_size, + experts_num_layers, experts_internal_dim_size; + float alpha; dez.deserialize(num_experts); dez.deserialize(experts_start_idx); - dez.deserialize(experts_num_layers); dez.deserialize(experts_output_dim_size); + dez.deserialize(alpha); + dez.deserialize(experts_num_layers); dez.deserialize(experts_internal_dim_size); + assert(num_inputs == 3); + ExpertsParams params; params.num_experts = num_experts; params.experts_start_idx = experts_start_idx; - params.experts_num_layers = experts_num_layers; params.experts_output_dim_size = experts_output_dim_size; + params.alpha = alpha; + params.experts_num_layers = experts_num_layers; params.experts_internal_dim_size = experts_internal_dim_size; - return ff.get_or_create_node({inputs[0], inputs[1]}, params); -} -Op *Experts::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - ExpertsParams params = get_params(); - return new Experts(ff, params, {inputs[0], inputs[1]}, this->name); + return ff.get_or_create_node(inputs, params); } void Experts::init(FFModel const &ff) { @@ -250,59 +357,68 @@ void Experts::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *Experts::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Experts const *exp = (Experts *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + ExpertsMeta *m = new ExpertsMeta(handle, exp->num_experts); + m->profiling = exp->profiling; + return m; +} + +void Experts::forward(FFModel const &ff) { + // assert(false && "Experts is designed for inference only"); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(EXPERTS_FWD_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Experts)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // expert predictions 
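  // (inputs[0] below is the fused dense+softmax tensor built in FFModel::experts.)
  // Region layout for this launcher: regions 0-2 carry the three inputs --
  // gated expert predictions, chosen expert indices, top-k gate values --
  // and regions 3 .. 2 + num_experts carry one output per expert, matching
  // the field indices (i + 3) that forward_task reads back.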
launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); + // expert assignment indices launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + // topk_gate_preds + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, - WRITE_ONLY, + READ_ONLY, EXCLUSIVE, - outputs[0]->region)); + inputs[2]->region)); launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -OpMeta *Experts::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Experts const *bmm = (Experts *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta(handle); - return m; -} - -void Experts::forward(FFModel const &ff) { - assert(false && "Experts is designed for inference only"); -} - -void Experts::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(false && "Experts is designed for inference only"); -} - -void Experts::backward(FFModel const &ff) { - assert(false && "Experts is designed for inference only"); -} - -void Experts::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(false && "Experts is designed for inference only"); + for (int i = 0; i < num_experts; i++) { + // expert output per token (only the chosen experts have non-zero + // contributions) + launcher.add_region_requirement(RegionRequirement(outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[i]->region)); + launcher.add_field(i + 3, FID_DATA); + } + runtime->execute_index_space(ctx, launcher); } void Experts::inference(FFModel const &ff, @@ -314,40 +430,125 @@ void Experts::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); size_t machine_view_hash = mv ? 
mv->hash() : outputs[0]->machine_view.hash(); - IndexLauncher launcher(EXPERTS_INF_TASK_ID, + IndexLauncher launcher(EXPERTS_FWD_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), + TaskArgument(this, sizeof(Experts)), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + // expert predictions + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_inputs[0]->region)); + inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + // expert assignment indices + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_inputs[1]->region)); + inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + // topk_gate_preds + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, - WRITE_ONLY, + READ_ONLY, EXCLUSIVE, - batch_outputs[0]->region)); + inputs[2]->region)); launcher.add_field(2, FID_DATA); + for (int i = 0; i < num_experts; i++) { + // expert output per token (only the chosen experts have non-zero + // contributions) + launcher.add_region_requirement(RegionRequirement(outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[i]->region)); + launcher.add_field(i + 3, FID_DATA); + } runtime->execute_index_space(ctx, launcher); } -void Experts::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // TODO: to be implemented +void Experts::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == task->regions.size()); + int num_experts = regions.size() - 3; + + Experts const *exp = (Experts *)task->args; + assert(exp != nullptr); + assert(exp->num_experts == num_experts); + float alpha = exp->alpha; + int experts_start_idx = exp->experts_start_idx; + + ExpertsMeta const *m = *((ExpertsMeta **)task->local_args); + + // get input, indices, topk_gate_preds + AccessorRO const acc_input(regions[0], FID_DATA); + AccessorRO const acc_indices(regions[1], FID_DATA); + AccessorRO const acc_topk_gate_pred(regions[2], FID_DATA); + Rect<3> rect_input = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Rect<3> rect_indices = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Rect<3> rect_topk_gate_pred = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + coord_t batch_size = rect_input.hi[1] - rect_input.lo[1] + 1; + assert(batch_size == rect_indices.hi[1] - rect_indices.lo[1] + 1); + assert(batch_size == + rect_topk_gate_pred.hi[1] - rect_topk_gate_pred.lo[1] + 1); + coord_t chosen_experts = rect_indices.hi[0] - rect_indices.lo[0]; + assert(chosen_experts == + rect_topk_gate_pred.hi[0] - rect_topk_gate_pred.lo[0]); + coord_t out_dim = (rect_input.hi[0] - rect_input.lo[0] + 1) / num_experts; + + int expert_capacity = + ceil(alpha * (int)chosen_experts / num_experts * (int)batch_size); + + assert(batch_size <= MAX_BATCH_SIZE && + "batch size exceeds MAX_BATCH_SIZE defined in experts.h"); + assert( + num_experts <= MAX_EXPERTS_PER_BLOCK && + "number of experts exceeds MAX_EXPERTS_PER_BLOCK defined in experts.h"); + + float *outputs[num_experts]; + for (int i = 0; i < 
num_experts; i++) { + Rect<3> rect_output = runtime->get_index_space_domain( + ctx, task->regions[3 + i].region.get_index_space()); + assert((rect_output.hi[0] - rect_output.lo[0] + 1) == out_dim); + assert((rect_output.hi[1] - rect_output.lo[1] + 1) == batch_size); + outputs[i] = helperGetTensorPointerWO( + regions[3 + i], task->regions[3 + i], FID_DATA, ctx, runtime); + assert(outputs[i] != nullptr); + } + + Experts::forward_kernel_wrapper(m, + acc_input.ptr(rect_input), + acc_indices.ptr(rect_indices), + acc_topk_gate_pred.ptr(rect_topk_gate_pred), + outputs, + num_experts, + experts_start_idx, + expert_capacity, + chosen_experts, + batch_size, + out_dim); +} + +void Experts::backward(FFModel const &ff) { + assert(false && "Experts is designed for inference only"); +} + +void Experts::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(false && "Experts is designed for inference only"); } void Experts::print_layer(FFModel const &ff) { @@ -358,7 +559,7 @@ bool Experts::measure_operator_cost(Simulator *sim, MachineView const &c, CostMetrics &cost_metrics) const { // This is an inference only operator - assert(false); + assert(false && "Experts is designed for inference only"); return false; } @@ -370,8 +571,9 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.num_experts); hash_combine(key, params.experts_start_idx); - hash_combine(key, params.experts_num_layers); hash_combine(key, params.experts_output_dim_size); + hash_combine(key, params.alpha); + hash_combine(key, params.experts_num_layers); hash_combine(key, params.experts_internal_dim_size); return key; } diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp new file mode 100644 index 0000000000..a19c7a3a9a --- /dev/null +++ b/src/ops/experts.cpp @@ -0,0 +1,148 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/experts.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +__global__ void experts_forward_kernel(float const *input, + int const *indices, + float const *topk_gate_preds, + float **outputs, + int num_experts, + int experts_start_idx, + int chosen_experts, + int expert_capacity, + int batch_size, + int out_dim) { + // shared at the block level + __shared__ float token_assigned[MAX_BATCH_SIZE][MAX_EXPERTS_PER_BLOCK]; + + // initialize the token assignments to 0 + CUDA_KERNEL_LOOP(i, MAX_BATCH_SIZE * MAX_EXPERTS_PER_BLOCK) { + int token_index = i / MAX_EXPERTS_PER_BLOCK; + int expert_index = i % MAX_EXPERTS_PER_BLOCK; + token_assigned[token_index][expert_index] = 0.0f; + } + + __syncthreads(); + + // Compute token assignments, single thread per block + if (threadIdx.x == 0) { + int token_count[MAX_EXPERTS_PER_BLOCK] = {0}; + for (int i = 0; i < chosen_experts * batch_size; i++) { + // Get the token index, between 0 and batch_size + int token_index = i / chosen_experts; + // Get global index (indices[i]) of expert to which the token is assigned, + // and compute the local index (expert_index) of the expert within the + // block of fused experts + int expert_index = indices[i] - experts_start_idx; + // check if the token is assigned to an expert in this block, and if so, + // whether the expert still has capacity not that since each expert is + // assigned to only one block, it is safe to reason about expert capacity + // locally + if (expert_index >= 0 && expert_index < num_experts && + token_count[expert_index] < expert_capacity) { + token_assigned[token_index][expert_index] = topk_gate_preds[i]; + token_count[expert_index]++; + } else { + } + } + } + + __syncthreads(); + + // compute output + CUDA_KERNEL_LOOP(i, num_experts * batch_size * out_dim) { + // output indexing: + // i = expert_index*(batch_size*out_dim) + token_index*out_dim + dim_index + // input indexing: + // i = token_index * (num_experts * out_dim) + expert_index * out_dim + + // dim_index + int expert_index = i / (batch_size * out_dim); + // int token_index = (i - expert_index*(batch_size*out_dim)) / out_dim; + int token_index = (i % (batch_size * out_dim)) / out_dim; + // int dim_index = i - expert_index*(batch_size*out_dim) - + // token_index*out_dim; + int dim_index = i % out_dim; + outputs[expert_index][token_index * out_dim + dim_index] = + input[i] * token_assigned[token_index][expert_index]; + } +} + +/*static*/ +void Experts::forward_kernel_wrapper(ExpertsMeta const *m, + float const *input, + int const *indices, + float const *topk_gate_preds, + float **outputs, + int num_experts, + int experts_start_idx, + int expert_capacity, + int chosen_experts, + int batch_size, + int out_dim) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // cudaEvent_t t_start, t_end; + // if (m->profiling) { + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + // } + hipMemcpy(m->dev_region_ptrs, + outputs, + num_experts * sizeof(float *), + hipMemcpyHostToDevice); + + hipLaunchKernelGGL( + experts_forward_kernel, + GET_BLOCKS(batch_size * num_experts * out_dim), + min(CUDA_NUM_THREADS, (int)(batch_size * num_experts * out_dim)), + 0, + stream, + input, + indices, + topk_gate_preds, + m->dev_region_ptrs, + num_experts, + experts_start_idx, + chosen_experts, + expert_capacity, + batch_size, + out_dim); + + // if (m->profiling) { + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + 
// float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // printf("[Experts] forward time = %.2lfms\n", elapsed); + // } +} + +ExpertsMeta::ExpertsMeta(FFHandler handler, int num_experts) : OpMeta(handler) { + checkCUDA(hipMalloc(&dev_region_ptrs, num_experts * sizeof(float *))); +} +ExpertsMeta::~ExpertsMeta(void) { + checkCUDA(hipFree(&dev_region_ptrs)); +} + +}; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu new file mode 100644 index 0000000000..b3a7f3d3ca --- /dev/null +++ b/src/ops/experts.cu @@ -0,0 +1,150 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/experts.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +__global__ void experts_forward_kernel(float const *input, + int const *indices, + float const *topk_gate_preds, + float **outputs, + int num_experts, + int experts_start_idx, + int chosen_experts, + int expert_capacity, + int batch_size, + int out_dim) { + // shared at the block level + __shared__ float token_assigned[MAX_BATCH_SIZE][MAX_EXPERTS_PER_BLOCK]; + + // initialize the token assignments to 0 + CUDA_KERNEL_LOOP(i, MAX_BATCH_SIZE * MAX_EXPERTS_PER_BLOCK) { + int token_index = i / MAX_EXPERTS_PER_BLOCK; + int expert_index = i % MAX_EXPERTS_PER_BLOCK; + token_assigned[token_index][expert_index] = 0.0f; + } + + __syncthreads(); + + // Compute token assignments, single thread per block + if (threadIdx.x == 0) { + int token_count[MAX_EXPERTS_PER_BLOCK] = {0}; + for (int i = 0; i < chosen_experts * batch_size; i++) { + // Get the token index, between 0 and batch_size + int token_index = i / chosen_experts; + // Get global index (indices[i]) of expert to which the token is assigned, + // and compute the local index (expert_index) of the expert within the + // block of fused experts + int expert_index = indices[i] - experts_start_idx; + // check if the token is assigned to an expert in this block, and if so, + // whether the expert still has capacity not that since each expert is + // assigned to only one block, it is safe to reason about expert capacity + // locally + if (expert_index >= 0 && expert_index < num_experts && + token_count[expert_index] < expert_capacity) { + token_assigned[token_index][expert_index] = topk_gate_preds[i]; + token_count[expert_index]++; + } else { + } + } + } + + __syncthreads(); + + // compute output + CUDA_KERNEL_LOOP(i, num_experts * batch_size * out_dim) { + // output indexing: + // i = expert_index*(batch_size*out_dim) + token_index*out_dim + dim_index + // input indexing: + // i = token_index * (num_experts * out_dim) + expert_index * out_dim + + // dim_index + int expert_index = i / (batch_size * out_dim); + // int token_index = (i - expert_index*(batch_size*out_dim)) / out_dim; + int token_index = (i % (batch_size * out_dim)) / out_dim; + // int dim_index = i - 
expert_index*(batch_size*out_dim) - + // token_index*out_dim; + int dim_index = i % out_dim; + outputs[expert_index][token_index * out_dim + dim_index] = + input[i] * token_assigned[token_index][expert_index]; + } +} + +/*static*/ +void Experts::forward_kernel_wrapper(ExpertsMeta const *m, + float const *input, + int const *indices, + float const *topk_gate_preds, + float **outputs, + int num_experts, + int experts_start_idx, + int expert_capacity, + int chosen_experts, + int batch_size, + int out_dim) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + // checkCUDA(cublasSetStream(m->handle.blas, stream)); + // checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // call forward_kernel + cudaMemcpyAsync(m->dev_region_ptrs, + outputs, + num_experts * sizeof(float *), + cudaMemcpyHostToDevice, + stream); + + experts_forward_kernel<<>>(input, + indices, + topk_gate_preds, + m->dev_region_ptrs, + num_experts, + experts_start_idx, + chosen_experts, + expert_capacity, + batch_size, + out_dim); + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[Experts] forward time = %.2lfms\n", elapsed); + } +} + +ExpertsMeta::ExpertsMeta(FFHandler handler, int num_experts) : OpMeta(handler) { + checkCUDA(cudaMalloc(&dev_region_ptrs, num_experts * sizeof(float *))); +} +ExpertsMeta::~ExpertsMeta(void) { + checkCUDA(cudaFree(&dev_region_ptrs)); +} + +}; // namespace FlexFlow diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 85f5132f6f..699b98cd5a 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2060,21 +2060,19 @@ void FFModel::deserialize_graph_optimal_view( break; } case OP_AGGREGATE: { - // node = Aggregate::deserialize(*this, dez, inputs, num_inputs); - int n; - float lambda_bal; - dez.deserialize(n); - dez.deserialize(lambda_bal); - assert(num_inputs == n + 4); - AggregateParams params; - params.n = n; - params.lambda_bal = lambda_bal; - node = get_or_create_node( - {std::begin(inputs), std::begin(inputs) + num_inputs}, params); + node = Aggregate::deserialize( + *this, + dez, + {std::begin(inputs), std::begin(inputs) + num_inputs}, + num_inputs); break; } case OP_EXPERTS: { - node = Experts::deserialize(*this, dez, inputs, num_inputs); + node = Experts::deserialize( + *this, + dez, + {std::begin(inputs), std::begin(inputs) + num_inputs}, + num_inputs); break; } case OP_POOL2D: { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 770292a6f2..e0fc25d1ad 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3027,7 +3027,7 @@ void FFModel::compile(LossType loss_type, operators[i]->op_guid); for (int j = 0; j < op->numInputs; j++) { LogicalRegion handle = op->inputs[j]->region; - printf("inputs[%d] region(%d,%d,%d)\n", + printf("\tinputs[%d] region(%d,%d,%d)\n", j, handle.get_index_space().get_id(), handle.get_field_space().get_id(), @@ -3035,7 +3035,7 @@ void FFModel::compile(LossType loss_type, } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = op->outputs[j]->region; - printf("outputs[%d] region(%d,%d,%d)\n", + printf("\toutputs[%d] region(%d,%d,%d)\n", j, handle.get_index_space().get_id(), handle.get_field_space().get_id(), @@ -3043,7 +3043,7 @@ void FFModel::compile(LossType 
loss_type, } for (int j = 0; j < op->numWeights; j++) { LogicalRegion handle = op->weights[j]->region; - printf("weights[%d] region(%d,%d,%d)\n", + printf("\tweights[%d] region(%d,%d,%d)\n", j, handle.get_index_space().get_id(), handle.get_field_space().get_id(), @@ -3059,7 +3059,7 @@ void FFModel::compile(LossType loss_type, printf("operator[%zu]: type(%d)\n", i, operators[i]->op_type); for (int j = 0; j < op->numInputs; j++) { LogicalRegion handle = op->inputs[j]->region; - printf("inputs[%d] region(%d,%d,%d)\n", + printf("\tinputs[%d] region(%d,%d,%d)\n", j, handle.get_index_space().get_id(), handle.get_field_space().get_id(), @@ -3067,7 +3067,7 @@ void FFModel::compile(LossType loss_type, } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = op->outputs[j]->region; - printf("outputs[%d] region(%d,%d,%d)\n", + printf("\toutputs[%d] region(%d,%d,%d)\n", j, handle.get_index_space().get_id(), handle.get_field_space().get_id(), @@ -3783,13 +3783,6 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "Experts Backward Task"); } - { - TaskVariantRegistrar registrar(EXPERTS_INF_TASK_ID, "Experts Inference"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Experts Inference Task"); - } // Cast { TaskVariantRegistrar registrar(CAST_INIT_TASK_ID, "Cast Init"); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index fb66ac6e36..d528fd6345 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -26,6 +26,7 @@ #include "flexflow/ops/element_binary.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/embedding.h" +#include "flexflow/ops/experts.h" #include "flexflow/ops/flat.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -3164,6 +3165,13 @@ bool FFModel::convert_graph_to_operators( new_op = new Aggregate(*this, inputs, aggr->n, aggr->lambda_bal, NULL); break; } + case OP_EXPERTS: { + Experts *exp = (Experts *)node.ptr; + ExpertsParams params = exp->get_params(); + new_op = new Experts( + *this, params, {std::begin(inputs), std::end(inputs)}, NULL); + break; + } case OP_SPLIT: { Split *split = (Split *)node.ptr; std::vector splits; From 07a361744929ccdd3a4e5dedac6c13206e997c41 Mon Sep 17 00:00:00 2001 From: Viren Abhyankar Date: Thu, 26 Jan 2023 11:38:24 -0800 Subject: [PATCH 051/344] Dropout kernels (#591) * Dropout kernels * Include dropout --- include/flexflow/ops/dropout.h | 38 ------ .../flexflow/ops/kernels/dropout_kernels.h | 53 ++++++++ src/ops/dropout.cc | 3 + src/ops/fused.cpp | 11 +- src/ops/fused.cu | 13 +- .../dropout_kernels.cpp} | 114 +++++++++--------- .../dropout_kernels.cu} | 110 +++++++++-------- src/ops/kernels/element_binary_kernels.cu | 68 +++++------ src/ops/kernels/flat_kernels.cu | 3 +- 9 files changed, 223 insertions(+), 190 deletions(-) create mode 100644 include/flexflow/ops/kernels/dropout_kernels.h rename src/ops/{dropout.cpp => kernels/dropout_kernels.cpp} (81%) rename src/ops/{dropout.cu => kernels/dropout_kernels.cu} (80%) diff --git a/include/flexflow/ops/dropout.h b/include/flexflow/ops/dropout.h index e72792725e..37304bdada 100644 --- a/include/flexflow/ops/dropout.h +++ b/include/flexflow/ops/dropout.h @@ -1,18 +1,13 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H -#include "flexflow/device.h" -#include "flexflow/fftype.h" #include "flexflow/layer.h" #include "flexflow/node.h" -#include "flexflow/op_meta.h" #include 
"flexflow/operator.h" #include "flexflow/ops/dropout_params.h" namespace FlexFlow { -class DropoutMeta; - class Dropout : public Op { public: using Params = DropoutParams; @@ -50,20 +45,6 @@ class Dropout : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void forward_kernel(DropoutMeta *m, - float const *input_ptr, - float *output_ptr, - ffStream_t stream); - static void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr); - static void backward_kernel(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr, - ffStream_t stream); - static void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -81,25 +62,6 @@ class Dropout : public Op { unsigned long long seed; }; -class DropoutMeta : public OpMeta { -public: - DropoutMeta(FFHandler handle, - Dropout const *dropout, - Legion::Memory gpu_mem, - Legion::Domain const &output_domain); - ~DropoutMeta(void); - Realm::RegionInstance reserveInst; -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnTensorDescriptor_t inputTensor, outputTensor; - cudnnDropoutDescriptor_t dropoutDesc; -#else - miopenTensorDescriptor_t inputTensor, outputTensor; - miopenDropoutDescriptor_t dropoutDesc; -#endif - void *reserveSpace, *dropoutStates; - size_t reserveSpaceSize, dropoutStateSize; -}; - }; // namespace FlexFlow #endif diff --git a/include/flexflow/ops/kernels/dropout_kernels.h b/include/flexflow/ops/kernels/dropout_kernels.h new file mode 100644 index 0000000000..421974fbaa --- /dev/null +++ b/include/flexflow/ops/kernels/dropout_kernels.h @@ -0,0 +1,53 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/dropout.h" + +namespace FlexFlow { + +class DropoutMeta : public OpMeta { +public: + DropoutMeta(FFHandler handle, + Dropout const *dropout, + Legion::Memory gpu_mem, + Legion::Domain const &output_domain); + ~DropoutMeta(void); + Realm::RegionInstance reserveInst; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cudnnTensorDescriptor_t inputTensor, outputTensor; + cudnnDropoutDescriptor_t dropoutDesc; +#else + miopenTensorDescriptor_t inputTensor, outputTensor; + miopenDropoutDescriptor_t dropoutDesc; +#endif + void *reserveSpace, *dropoutStates; + size_t reserveSpaceSize, dropoutStateSize; +}; + +namespace Kernels { +namespace Dropout { +void forward_kernel_wrapper(DropoutMeta *m, + float const *input_ptr, + float *output_ptr); +void backward_kernel_wrapper(DropoutMeta *m, + float const *output_grad_ptr, + float *input_grad_ptr); + +namespace Internal { +void forward_kernel(DropoutMeta *m, + float const *input_ptr, + float *output_ptr, + ffStream_t stream); +void backward_kernel(DropoutMeta *m, + float const *output_grad_ptr, + float *input_grad_ptr, + ffStream_t stream); +} // namespace Internal +} // namespace Dropout +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 75f8aec4b3..55f6730827 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -1,5 +1,6 @@ #include "flexflow/ops/dropout.h" #include "flexflow/model.h" +#include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/utils/hash_utils.h" 
#include "legion/legion_utilities.h" @@ -25,6 +26,8 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; +using namespace FlexFlow::Kernels::Dropout; + Tensor FFModel::dropout(const Tensor input, float rate, unsigned long long seed, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index e7ab9aea6f..bd1ecdc0ab 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -16,11 +16,11 @@ #include "flexflow/ops/fused.h" #include "flexflow/model.h" #include "flexflow/ops/batch_norm.h" -#include "flexflow/ops/dropout.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" +#include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/ops/kernels/element_binary_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" @@ -199,9 +199,10 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Dropout::forward_kernel_wrapper(m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } case OP_LINEAR: { @@ -586,7 +587,7 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Dropout::backward_kernel_wrapper( + Kernels::Dropout::backward_kernel_wrapper( m, my_output_grad_accessor[0].get_float_ptr(), my_input_grad_accessor[0].get_float_ptr()); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 53ebfd93aa..60d8bc93e8 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -16,18 +16,18 @@ #include "flexflow/accessor.h" #include "flexflow/model.h" #include "flexflow/ops/batch_norm.h" -#include "flexflow/ops/dropout.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/embedding.h" #include "flexflow/ops/fused.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" +#include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/ops/kernels/element_binary_kernels.h" +#include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" -#include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -210,9 +210,10 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Dropout::forward_kernel_wrapper(m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } case OP_LINEAR: { @@ -732,7 +733,7 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Dropout::backward_kernel_wrapper( + Kernels::Dropout::backward_kernel_wrapper( m, 
my_output_grad_accessor[0].get_float_ptr(), my_input_grad_accessor[0].get_float_ptr()); diff --git a/src/ops/dropout.cpp b/src/ops/kernels/dropout_kernels.cpp similarity index 81% rename from src/ops/dropout.cpp rename to src/ops/kernels/dropout_kernels.cpp index 9ec2270491..b0dd4c644e 100644 --- a/src/ops/dropout.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "flexflow/ops/dropout.h" +#include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -24,58 +24,6 @@ using Legion::coord_t; using Legion::Domain; using Legion::Memory; -void Dropout::forward_kernel(DropoutMeta *m, - float const *input_ptr, - float *output_ptr, - hipStream_t stream) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - - checkCUDNN(miopenDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); -} - -/*static*/ -void Dropout::forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - Dropout::forward_kernel(m, input_ptr, output_ptr, stream); -} - -void Dropout::backward_kernel(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr, - hipStream_t stream) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - - checkCUDNN(miopenDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); -} - -/*static*/ -void Dropout::backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - Dropout::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); -} - DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, @@ -127,4 +75,62 @@ DropoutMeta::~DropoutMeta(void) { checkCUDNN(miopenDestroyDropoutDescriptor(dropoutDesc)); } -}; // namespace FlexFlow +namespace Kernels { +namespace Dropout { + +void forward_kernel_wrapper(DropoutMeta *m, + float const *input_ptr, + float *output_ptr) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + Internal::forward_kernel(m, input_ptr, output_ptr, stream); +} + +void backward_kernel_wrapper(DropoutMeta *m, + float const *output_grad_ptr, + float *input_grad_ptr) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); +} + +namespace Internal { + +void forward_kernel(DropoutMeta *m, + float const *input_ptr, + float *output_ptr, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + checkCUDNN(miopenDropoutForward(m->handle.dnn, + m->dropoutDesc, + m->inputTensor /* not used */, + m->inputTensor, + input_ptr, + m->outputTensor, + output_ptr, + m->reserveSpace, + m->reserveSpaceSize)); +} + +void backward_kernel(DropoutMeta *m, + float const *output_grad_ptr, + float *input_grad_ptr, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + checkCUDNN(miopenDropoutBackward(m->handle.dnn, + m->dropoutDesc, + m->inputTensor /* not used */, + m->outputTensor, + output_grad_ptr, + m->inputTensor, + input_grad_ptr, + m->reserveSpace, + m->reserveSpaceSize)); +} + +} // namespace Internal +} // namespace Dropout +} // namespace Kernels +} 
// namespace FlexFlow diff --git a/src/ops/dropout.cu b/src/ops/kernels/dropout_kernels.cu similarity index 80% rename from src/ops/dropout.cu rename to src/ops/kernels/dropout_kernels.cu index 70b0223c7a..4a76301fd6 100644 --- a/src/ops/dropout.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "flexflow/ops/dropout.h" +#include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -23,56 +23,6 @@ using Legion::coord_t; using Legion::Domain; using Legion::Memory; -void Dropout::forward_kernel(DropoutMeta *m, - float const *input_ptr, - float *output_ptr, - cudaStream_t stream) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - - checkCUDNN(cudnnDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); -} - -/*static*/ -void Dropout::forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - Dropout::forward_kernel(m, input_ptr, output_ptr, stream); -} - -void Dropout::backward_kernel(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr, - cudaStream_t stream) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - - checkCUDNN(cudnnDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); -} - -/*static*/ -void Dropout::backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - Dropout::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); -} - DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, @@ -121,4 +71,60 @@ DropoutMeta::~DropoutMeta(void) { checkCUDNN(cudnnDestroyDropoutDescriptor(dropoutDesc)); } -}; // namespace FlexFlow +namespace Kernels { +namespace Dropout { + +void forward_kernel_wrapper(DropoutMeta *m, + float const *input_ptr, + float *output_ptr) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + Internal::forward_kernel(m, input_ptr, output_ptr, stream); +} + +void backward_kernel_wrapper(DropoutMeta *m, + float const *output_grad_ptr, + float *input_grad_ptr) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); +} + +namespace Internal { + +void forward_kernel(DropoutMeta *m, + float const *input_ptr, + float *output_ptr, + cudaStream_t stream) { + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + checkCUDNN(cudnnDropoutForward(m->handle.dnn, + m->dropoutDesc, + m->inputTensor, + input_ptr, + m->outputTensor, + output_ptr, + m->reserveSpace, + m->reserveSpaceSize)); +} + +void backward_kernel(DropoutMeta *m, + float const *output_grad_ptr, + float *input_grad_ptr, + cudaStream_t stream) { + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + checkCUDNN(cudnnDropoutBackward(m->handle.dnn, + m->dropoutDesc, + m->outputTensor, + output_grad_ptr, + m->inputTensor, + input_grad_ptr, + m->reserveSpace, + m->reserveSpaceSize)); +} + +} // namespace Internal +} // namespace Dropout +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index 509946bddf..99db607447 100644 --- 
a/src/ops/kernels/element_binary_kernels.cu +++ b/src/ops/kernels/element_binary_kernels.cu @@ -268,21 +268,21 @@ void forward_kernel(ElementBinaryMeta const *m, // currently only handle add and sub assert(m->op_type == OP_EW_SUB || m->op_type == OP_EW_ADD || m->op_type == OP_EW_MUL); - if(m->op_type == OP_EW_SUB || m->op_type == OP_EW_ADD){ + if (m->op_type == OP_EW_SUB || m->op_type == OP_EW_ADD) { // output = (beta*output + alpha1*input1) + beta*output = input1 checkCUDNN(cudnnOpTensor(m->handle.dnn, - m->opDesc, - &beta, - m->outputTensor, - out_ptr, - &alpha1, - m->input1Tensor, - in1_ptr, - &beta, - m->outputTensor, - out_ptr)); - // output = (beta*output + alpha2*input2) + alpha1*output = alpha2*input2 - // + alpha1*input1 + m->opDesc, + &beta, + m->outputTensor, + out_ptr, + &alpha1, + m->input1Tensor, + in1_ptr, + &beta, + m->outputTensor, + out_ptr)); + // output = (beta*output + alpha2*input2) + alpha1*output = alpha2*input2 + // + alpha1*input1 checkCUDNN(cudnnOpTensor(m->handle.dnn, m->opDesc, &beta, @@ -294,7 +294,7 @@ void forward_kernel(ElementBinaryMeta const *m, &alpha1, m->outputTensor, out_ptr)); - } else if(m->op_type == OP_EW_MUL) { + } else if (m->op_type == OP_EW_MUL) { checkCUDNN(cudnnSetOpTensorDescriptor(m->opDesc, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, @@ -437,16 +437,16 @@ void backward_kernel(ElementBinaryMeta const *m, in1_grad_ptr)); } else { checkCUDNN(cudnnOpTensor(m->handle.dnn, - m->opDesc, - &alpha1, - m->outputTensor, - out_grad_ptr, - &alpha2, - m->input2Tensor, - in2_ptr, - &beta, - m->input1Tensor, - in1_grad_ptr)); + m->opDesc, + &alpha1, + m->outputTensor, + out_grad_ptr, + &alpha2, + m->input2Tensor, + in2_ptr, + &beta, + m->input1Tensor, + in1_grad_ptr)); } } if (in2_grad_ptr != nullptr) { @@ -477,16 +477,16 @@ void backward_kernel(ElementBinaryMeta const *m, in2_grad_ptr)); } else { checkCUDNN(cudnnOpTensor(m->handle.dnn, - m->opDesc, - &alpha1, - m->outputTensor, - out_grad_ptr, - &alpha2, - m->input1Tensor, - in1_ptr, - &beta, - m->input2Tensor, - in2_grad_ptr)); + m->opDesc, + &alpha1, + m->outputTensor, + out_grad_ptr, + &alpha2, + m->input1Tensor, + in1_ptr, + &beta, + m->input2Tensor, + in2_grad_ptr)); } } } else { diff --git a/src/ops/kernels/flat_kernels.cu b/src/ops/kernels/flat_kernels.cu index 0de2c31585..3836c02c94 100644 --- a/src/ops/kernels/flat_kernels.cu +++ b/src/ops/kernels/flat_kernels.cu @@ -35,7 +35,8 @@ void backward_kernel_wrapper(float *input_grad_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(input_grad_ptr, output_grad_ptr, num_elements, stream); + Internal::backward_kernel( + input_grad_ptr, output_grad_ptr, num_elements, stream); // checkCUDA(cudaMemcpyAsync(acc_input_grad.ptr, acc_output_grad.ptr, // acc_input_grad.rect.volume() * sizeof(float), // cudaMemcpyDeviceToDevice)); From eaedc294a3c76a89c459893deba8b2627a11541e Mon Sep 17 00:00:00 2001 From: Viren Abhyankar Date: Thu, 26 Jan 2023 16:17:48 -0800 Subject: [PATCH 052/344] Softmax kernels (#593) --- .../flexflow/ops/kernels/softmax_kernels.h | 52 +++++++++++ include/flexflow/ops/softmax.h | 39 -------- src/ops/{ => kernels}/softmax.cpp | 90 ++++++++++--------- src/ops/{ => kernels}/softmax.cu | 90 ++++++++++--------- src/ops/softmax.cc | 7 +- 5 files changed, 151 insertions(+), 127 deletions(-) create mode 100644 include/flexflow/ops/kernels/softmax_kernels.h rename src/ops/{ => kernels}/softmax.cpp (77%) rename src/ops/{ => kernels}/softmax.cu (77%) diff --git 
a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h new file mode 100644 index 0000000000..81b34d8558 --- /dev/null +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -0,0 +1,52 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/softmax.h" + +namespace FlexFlow { + +class SoftmaxMeta : public OpMeta { +public: + SoftmaxMeta(FFHandler handle, + Softmax const *softmax, + Legion::Domain const &input_domain); +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cudnnTensorDescriptor_t inputTensor; +#else + miopenTensorDescriptor_t inputTensor; +#endif + bool profiling; + int dim; + char op_name[MAX_OPNAME]; +}; + +namespace Kernels { +namespace Softmax { + +void forward_kernel_wrapper(SoftmaxMeta const *m, + float const *input_ptr, + float *output_ptr); + +void backward_kernel_wrapper(SoftmaxMeta const *m, + float *input_grad_ptr, + float const *output_grad_ptr, + size_t num_elements); + +namespace Internal { +void forward_kernel(SoftmaxMeta const *m, + float const *input_ptr, + float *output_ptr, + ffStream_t stream); +void backward_kernel(float *input_grad_ptr, + float const *output_grad_ptr, + size_t num_elements, + ffStream_t stream); +} // namespace Internal +} // namespace Softmax +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index c9d6a1e6a9..25a20315bd 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -1,33 +1,13 @@ #ifndef _FLEXFLOW_SOFTMAX_H #define _FLEXFLOW_SOFTMAX_H -#include "flexflow/device.h" -#include "flexflow/fftype.h" #include "flexflow/layer.h" #include "flexflow/node.h" -#include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/ops/softmax_params.h" namespace FlexFlow { -class Softmax; - -class SoftmaxMeta : public OpMeta { -public: - SoftmaxMeta(FFHandler handle, - Softmax const *softmax, - Legion::Domain const &input_domain); -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnTensorDescriptor_t inputTensor; -#else - miopenTensorDescriptor_t inputTensor; -#endif - bool profiling; - int dim; - char op_name[MAX_OPNAME]; -}; - class Softmax : public Op { public: using Params = SoftmaxParams; @@ -63,28 +43,9 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - void init_meta(SoftmaxMeta *m, - Legion::Rect<2> const &input, - Legion::Rect<2> const &output) const; bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; - static void forward_kernel(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr, - ffStream_t stream); - static void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); - static void backward_kernel(float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements, - ffStream_t stream); - static void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); - Params get_params() const; private: diff --git a/src/ops/softmax.cpp b/src/ops/kernels/softmax.cpp similarity index 77% rename from src/ops/softmax.cpp rename to src/ops/kernels/softmax.cpp index e53b41f4a4..d63bd0edc5 100644 --- a/src/ops/softmax.cpp +++ 
b/src/ops/kernels/softmax.cpp @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "flexflow/ops/softmax.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -33,29 +33,12 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, std::strcpy(op_name, softmax->name); } -/* static */ -void Softmax::forward_kernel(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr, - hipStream_t stream) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); +namespace Kernels { +namespace Softmax { - float alpha = 1.0f, beta = 0.0f; - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &alpha, - m->inputTensor, - input_ptr, - &beta, - m->inputTensor, - output_ptr, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); -} - -/* static */ -void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr) { +void forward_kernel_wrapper(SoftmaxMeta const *m, + float const *input_ptr, + float *output_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -65,7 +48,7 @@ void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m, hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - Softmax::forward_kernel(m, input_ptr, output_ptr, stream); + Internal::forward_kernel(m, input_ptr, output_ptr, stream); if (m->profiling) { hipEventRecord(t_end, stream); checkCUDA(hipEventSynchronize(t_end)); @@ -81,23 +64,10 @@ void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m, } } -/* static */ -void Softmax::backward_kernel(float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements, - hipStream_t stream) { - checkCUDA(hipMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - hipMemcpyDeviceToDevice, - stream)); -} - -/* static */ -void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements) { +void backward_kernel_wrapper(SoftmaxMeta const *m, + float *input_grad_ptr, + float const *output_grad_ptr, + size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -107,7 +77,7 @@ void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m, hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - Softmax::backward_kernel( + Internal::backward_kernel( input_grad_ptr, output_grad_ptr, num_elements, stream); if (m->profiling) { hipEventRecord(t_end, stream); @@ -124,4 +94,38 @@ void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m, } } -}; // namespace FlexFlow +namespace Internal { + +void forward_kernel(SoftmaxMeta const *m, + float const *input_ptr, + float *output_ptr, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &alpha, + m->inputTensor, + input_ptr, + &beta, + m->inputTensor, + output_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); +} + +void backward_kernel(float *input_grad_ptr, + float const *output_grad_ptr, + size_t num_elements, + hipStream_t stream) { + checkCUDA(hipMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); +} + +} // namespace Internal +} // namespace Softmax +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/softmax.cu b/src/ops/kernels/softmax.cu similarity index 77% rename from src/ops/softmax.cu rename to src/ops/kernels/softmax.cu index 7114f06274..d83d9952c9 
100644 --- a/src/ops/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "flexflow/ops/softmax.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/utils/cuda_helper.h" #include "flexflow/utils/hash_utils.h" @@ -32,29 +32,12 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, std::strcpy(op_name, softmax->name); } -/* static */ -void Softmax::forward_kernel(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr, - cudaStream_t stream) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); +namespace Kernels { +namespace Softmax { - float alpha = 1.0f, beta = 0.0f; - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - m->inputTensor, - input_ptr, - &beta, - m->inputTensor, - output_ptr)); -} - -/* static */ -void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr) { +void forward_kernel_wrapper(SoftmaxMeta const *m, + float const *input_ptr, + float *output_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -64,7 +47,7 @@ void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Softmax::forward_kernel(m, input_ptr, output_ptr, stream); + Internal::forward_kernel(m, input_ptr, output_ptr, stream); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -80,23 +63,10 @@ void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m, } } -/* static */ -void Softmax::backward_kernel(float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements, - cudaStream_t stream) { - checkCUDA(cudaMemcpyAsync(input_grad_ptr, - output_grad_ptr, - num_elements * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); -} - -/* static */ -void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements) { +void backward_kernel_wrapper(SoftmaxMeta const *m, + float *input_grad_ptr, + float const *output_grad_ptr, + size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -106,7 +76,7 @@ void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Softmax::backward_kernel( + Internal::backward_kernel( input_grad_ptr, output_grad_ptr, num_elements, stream); if (m->profiling) { cudaEventRecord(t_end, stream); @@ -123,4 +93,38 @@ void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m, } } -}; // namespace FlexFlow +namespace Internal { + +void forward_kernel(SoftmaxMeta const *m, + float const *input_ptr, + float *output_ptr, + cudaStream_t stream) { + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->inputTensor, + input_ptr, + &beta, + m->inputTensor, + output_ptr)); +} + +void backward_kernel(float *input_grad_ptr, + float const *output_grad_ptr, + size_t num_elements, + cudaStream_t stream) { + checkCUDA(cudaMemcpyAsync(input_grad_ptr, + output_grad_ptr, + num_elements * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); +} + +} // namespace Internal +} // namespace Softmax +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 813104292f..029b20afd1 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -15,6 
+15,7 @@ #include "flexflow/ops/softmax.h" #include "flexflow/model.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/utils/hash_utils.h" namespace FlexFlow { @@ -34,6 +35,8 @@ using Legion::Task; using Legion::TaskArgument; using Legion::TaskLauncher; +using namespace FlexFlow::Kernels::Softmax; + /* Params */ bool operator==(SoftmaxParams const &lhs, SoftmaxParams const &rhs) { return lhs.dim == rhs.dim; @@ -252,7 +255,7 @@ void Softmax::forward_task_with_dim(Task const *task, runtime, false /*readOutput*/); - Softmax::forward_kernel_wrapper(m, acc_input.ptr, acc_output.ptr); + forward_kernel_wrapper(m, acc_input.ptr, acc_output.ptr); } void Softmax::backward(FFModel const &ff) { @@ -327,7 +330,7 @@ void Softmax::backward_task_with_dim(Task const *task, // make sure the image indices match! assert(acc_input_grad.rect == acc_output_grad.rect); - Softmax::backward_kernel_wrapper( + backward_kernel_wrapper( m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); } From 3e41717e4a61b3d36c173c15a3ef6ebe12593e3f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 2 Feb 2023 14:29:26 -0500 Subject: [PATCH 053/344] fixed initialization error message --- src/ops/experts.cc | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 288507b6d8..7ad30dac03 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -357,6 +357,35 @@ void Experts::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + // expert predictions + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // expert assignment indices + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // topk_gate_preds + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[2]->region)); + launcher.add_field(2, FID_DATA); + for (int i = 0; i < num_experts; i++) { + launcher.add_region_requirement(RegionRequirement(outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[i]->region)); + launcher.add_field(i + 3, FID_DATA); + } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); From eb7284e68ba765e32f2f244075a96d94544cf2e9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 3 Feb 2023 01:06:27 -0500 Subject: [PATCH 054/344] [MoE][Experts] - Add support for tensors with more than 2 dimensions (#607) * add support for higher dimensions in experts.cc * fix linear layer issue * remove outdated comments * redo change accidentally removed * linting --- .../cpp/inference/mixture_of_experts/moe.cc | 26 +++--- .../cpp/inference/mixture_of_experts/moe.h | 6 +- include/flexflow/ops/experts.h | 2 +- src/ops/experts.cc | 80 +++++++++++-------- src/ops/linear.cc | 2 +- 5 files changed, 68 insertions(+), 48 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index e6f9a51d21..fa01cee517 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -68,7 +68,7 @@ Tensor create_moe(FFModel *model, Tensor create_moe_encoder(FFModel *model, MoeConfig const *moeConfig, Tensor 
const &input) { - std::vector axes = {0, 1}; + std::vector axes = {0, 1, 2}; Tensor x = input; for (int i = 0; i < moeConfig->num_encoder_layers; i++) { x = model->layer_norm( @@ -104,8 +104,9 @@ void FlexFlow::top_level_task(Task const *task, //----------------------------------------------------------------- - FFConfig ffConfig; MoeConfig moeConfig; + FFConfig ffConfig; + ffConfig.batchSize = moeConfig.batch_size; { InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -120,14 +121,15 @@ void FlexFlow::top_level_task(Task const *task, Tensor input; { - int const dims[] = {ffConfig.batchSize, DATA_DIMS}; - input = ff.create_tensor<2>(dims, DT_FLOAT); + int const dims[] = { + ffConfig.batchSize, moeConfig.sequence_length, DATA_DIMS}; + input = ff.create_tensor<3>(dims, DT_FLOAT); } //----------------------------------------------------------------- - // Tensor t = create_moe_encoder(&ff, &moeConfig, input); - Tensor t = create_moe(&ff, &moeConfig, input); + Tensor t = create_moe_encoder(&ff, &moeConfig, input); + // Tensor t = create_moe(&ff, &moeConfig, input); t = ff.dense(t, OUT_DIM, AC_MODE_RELU); /* InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); @@ -140,10 +142,10 @@ void FlexFlow::top_level_task(Task const *task, ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics); // Data Loader - ParallelTensor input_pt, label_pt; - ff.get_parallel_tensor_from_tensor(input, input_pt); - ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); - DataLoader data_loader(ff, moeConfig, input_pt, label_pt); + // ParallelTensor input_pt, label_pt; + // ff.get_parallel_tensor_from_tensor(input, input_pt); + // ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); + // DataLoader data_loader(ff, moeConfig, input_pt, label_pt); ff.init_operators(); @@ -177,12 +179,12 @@ void FlexFlow::top_level_task(Task const *task, // } for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { - data_loader.reset(); + // data_loader.reset(); ff.reset_metrics(); int iterations = TRAIN_SAMPLES / ffConfig.batchSize; for (int iter = 0; iter < iterations; iter++) { - data_loader.next_batch(ff); + // data_loader.next_batch(ff); if (epoch > 0) { runtime->begin_trace(ctx, 111 /*trace_id*/); } diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index 80cef3ff87..2df988f530 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -40,10 +40,12 @@ struct MoeConfig { alpha = 2.0f; lambda = 0.04f; hidden_size = DATA_DIMS; + batch_size = 32; + sequence_length = 10; // Encoder layer num_attention_heads = 16; attention_kdim = attention_vdim = hidden_size / num_attention_heads; - num_encoder_layers = 6; + num_encoder_layers = 1; // } // MoE layer int num_exp; @@ -52,6 +54,8 @@ struct MoeConfig { float alpha; // factor overhead tensor size for imbalance float lambda; // multiplier for load balance term int hidden_size; + int batch_size; + int sequence_length; // Encoder layer int num_attention_heads; int attention_kdim; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 6b875a10b9..a7f0b46e05 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -13,7 +13,7 @@ class ExpertsMeta : public OpMeta { }; // definitions for the CUDA kernel -#define MAX_BATCH_SIZE 64 +#define MAX_BATCH_SIZE 32 * 10 #define MAX_EXPERTS_PER_BLOCK 32 class Experts : public Op { diff 
--git a/src/ops/experts.cc b/src/ops/experts.cc index 7ad30dac03..6450ab5173 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -57,10 +57,7 @@ Tensor FFModel::experts(Tensor const *inputs, assert(inputs[0]->dims[i] == inputs[1]->dims[i]); assert(inputs[1]->dims[i] == inputs[2]->dims[i]); } - // assert(input->num_dims == indices->num_dims); - // for (int i = 1; i < indices->num_dims; i++) { - // assert(input->dims[i] == indices->dims[i]); - // } + assert(inputs[1]->data_type == DT_INT32 || inputs[1]->data_type == DT_INT64); assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); @@ -270,20 +267,11 @@ Experts::Experts(FFModel &model, assert(inputs[0]->dims[i] == inputs[1]->dims[i]); assert(inputs[1]->dims[i] == inputs[2]->dims[i]); } - // assert(input->num_dims == indices->num_dims); - // for (int i = 1; i < indices->num_dims; i++) { - // assert(input->dims[i] == indices->dims[i]); - // } + assert(inputs[1]->data_type == DT_INT32 || inputs[1]->data_type == DT_INT64); assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); assert(experts_num_layers == 1 || experts_internal_dim_size > 0); - // assert(input->num_dims == indices->num_dims); - // assert(indices->data_type == DT_INT32 || indices->data_type == DT_INT64); - // for (int i = 1; i < indices->num_dims; i++) { - // assert(input->dims[i] == indices->dims[i]); - // } - // Assume that we don't parallelize the channel dim of input // nor the expert_assigned dim of indices assert(inputs[0]->dims[0].degree == 1); @@ -517,24 +505,46 @@ void Experts::forward_task(Task const *task, ExpertsMeta const *m = *((ExpertsMeta **)task->local_args); // get input, indices, topk_gate_preds - AccessorRO const acc_input(regions[0], FID_DATA); - AccessorRO const acc_indices(regions[1], FID_DATA); - AccessorRO const acc_topk_gate_pred(regions[2], FID_DATA); - Rect<3> rect_input = runtime->get_index_space_domain( + float const *input_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + int const *indices_ptr = helperGetTensorPointerRO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + float const *topk_gate_pred_ptr = helperGetTensorPointerRO( + regions[2], task->regions[2], FID_DATA, ctx, runtime); + + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Rect<3> rect_indices = runtime->get_index_space_domain( + Domain indices_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - Rect<3> rect_topk_gate_pred = runtime->get_index_space_domain( + Domain topk_gate_pred_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - coord_t batch_size = rect_input.hi[1] - rect_input.lo[1] + 1; - assert(batch_size == rect_indices.hi[1] - rect_indices.lo[1] + 1); - assert(batch_size == - rect_topk_gate_pred.hi[1] - rect_topk_gate_pred.lo[1] + 1); - coord_t chosen_experts = rect_indices.hi[0] - rect_indices.lo[0]; + int input_dims = input_domain.get_dim(); + int indices_dims = indices_domain.get_dim(); + int topk_gate_pred_dims = topk_gate_pred_domain.get_dim(); + assert(input_dims == indices_dims); + assert(indices_dims == topk_gate_pred_dims); + + int replica_dim = input_dims - 1; + int samples_index = input_dims - 2; + + coord_t out_dim = + (input_domain.hi()[0] - input_domain.lo()[0] + 1) / num_experts; + coord_t batch_size = + input_domain.hi()[samples_index] - input_domain.lo()[samples_index] + 1; + coord_t chosen_experts = 
indices_domain.hi()[0] - indices_domain.lo()[0]; assert(chosen_experts == - rect_topk_gate_pred.hi[0] - rect_topk_gate_pred.lo[0]); - coord_t out_dim = (rect_input.hi[0] - rect_input.lo[0] + 1) / num_experts; + topk_gate_pred_domain.hi()[0] - topk_gate_pred_domain.lo()[0]); + + for (int i = 1; i < input_dims; i++) { + int a = input_domain.hi()[i] - input_domain.lo()[i] + 1; + int b = indices_domain.hi()[i] - indices_domain.lo()[i] + 1; + int c = topk_gate_pred_domain.hi()[i] - topk_gate_pred_domain.lo()[i] + 1; + assert(a == b && b == c); + if (i >= 1 && i < samples_index) { + batch_size *= a; + } + } int expert_capacity = ceil(alpha * (int)chosen_experts / num_experts * (int)batch_size); @@ -547,19 +557,23 @@ void Experts::forward_task(Task const *task, float *outputs[num_experts]; for (int i = 0; i < num_experts; i++) { - Rect<3> rect_output = runtime->get_index_space_domain( + Domain output_domain = runtime->get_index_space_domain( ctx, task->regions[3 + i].region.get_index_space()); - assert((rect_output.hi[0] - rect_output.lo[0] + 1) == out_dim); - assert((rect_output.hi[1] - rect_output.lo[1] + 1) == batch_size); + assert((output_domain.hi()[0] - output_domain.lo()[0] + 1) == out_dim); + for (int j = 1; j < input_dims; j++) { + int a = input_domain.hi()[j] - input_domain.lo()[j] + 1; + int b = output_domain.hi()[j] - output_domain.lo()[j] + 1; + assert(a == b); + } outputs[i] = helperGetTensorPointerWO( regions[3 + i], task->regions[3 + i], FID_DATA, ctx, runtime); assert(outputs[i] != nullptr); } Experts::forward_kernel_wrapper(m, - acc_input.ptr(rect_input), - acc_indices.ptr(rect_indices), - acc_topk_gate_pred.ptr(rect_topk_gate_pred), + input_ptr, + indices_ptr, + topk_gate_pred_ptr, outputs, num_experts, experts_start_idx, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 435080dbe1..0e09e20e44 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -464,7 +464,7 @@ void Linear::forward_task_with_dim(Task const *task, assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); float const *acc_bias_ptr = NULL; if (m->use_bias) { - TensorAccessorR acc_bias( + TensorAccessorR acc_bias( regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(acc_bias.rect.volume() == static_cast(out_dim)); acc_bias_ptr = acc_bias.ptr; From 19e3dccf49c92fea9ecbf8513639b49bf8f3beee Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 4 Feb 2023 16:01:59 -0500 Subject: [PATCH 055/344] [Inference] - Bug fixes and code improvements (#605) * fixed compilation bug and others * add inference init functions * fixed init issue * fix bug * further cleanup * fix * fix * bring back EXPERTS_INF_TASK_ID task id * passing configurations via init task only * removed task arg from other operators --- examples/cpp/inference/data_generator.h | 2 +- .../cpp/inference/mixture_of_experts/moe.cc | 81 ++++--------- include/flexflow/inference.h | 1 + include/flexflow/model.h | 4 + include/flexflow/operator.h | 5 + include/flexflow/ops/aggregate.h | 3 + include/flexflow/ops/aggregate_spec.h | 3 + include/flexflow/ops/attention.h | 3 + include/flexflow/ops/element_binary.h | 3 + include/flexflow/ops/experts.h | 14 ++- include/flexflow/ops/groupby.h | 42 +++---- include/flexflow/ops/layer_norm.h | 3 + include/flexflow/ops/linear.h | 3 + include/flexflow/ops/noop.h | 7 ++ include/flexflow/ops/softmax.h | 7 ++ include/flexflow/ops/topk.h | 3 + include/flexflow/parallel_ops/parallel_op.h | 8 ++ include/flexflow/parallel_ops/partition.h | 11 ++ src/ops/aggregate.cc | 23 ++++ src/ops/aggregate_spec.cc | 
31 ++++- src/ops/attention.cc | 56 ++++++++- src/ops/element_binary.cc | 68 ++++++++++- src/ops/experts.cc | 114 +++++++++++++----- src/ops/experts.cpp | 23 ++-- src/ops/experts.cu | 23 ++-- src/ops/group_by.cc | 91 +++++++++----- src/ops/group_by.cpp | 45 +++---- src/ops/group_by.cu | 45 +++---- src/ops/layer_norm.cc | 38 +++++- src/ops/linear.cc | 53 +++++++- src/ops/noop.cc | 88 ++++++++++++++ src/ops/softmax.cc | 67 ++++++++++ src/ops/topk.cc | 43 ++++++- src/parallel_ops/partition.cc | 89 ++++++++++++++ src/runtime/inference_manager.cc | 44 +++++++ src/runtime/model.cc | 21 +++- 36 files changed, 944 insertions(+), 221 deletions(-) diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index 8c3a89d2a7..98af050a98 100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -78,7 +78,7 @@ vector> Generator::get_requests(void) { chrono::duration_cast(cur_time - start_time); while (arrivals_ptr < arrivals.end() && ms_from_start.count() >= *arrivals_ptr) { - cout << "output request at arrival time +" << *arrivals_ptr << "\n"; + cout << "request at arrival time +" << *arrivals_ptr << "\n"; requests.push_back(get_random_tensor()); arrivals_ptr++; } diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index fa01cee517..918f04d0b7 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -16,9 +16,12 @@ #include "moe.h" #include "data_generator.h" #include "flexflow/inference.h" +#include #include +#include #include #include +#include using namespace Legion; @@ -93,14 +96,14 @@ void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - /* // Inference parameters + // Inference parameters int total_requests = 256; // total number of requests processed as part of the simulation int request_tensor_size = 4; // request tensor dimensions bool poisson_distribution = true; double lambda = 25; // average number of request arrivals per second int num_requests_per_batch = 5; - int num_inflight_batches = 10; */ + int num_inflight_batches = 10; //----------------------------------------------------------------- @@ -132,22 +135,15 @@ void FlexFlow::top_level_task(Task const *task, // Tensor t = create_moe(&ff, &moeConfig, input); t = ff.dense(t, OUT_DIM, AC_MODE_RELU); - /* InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); - im.compile_model_and_allocate_buffer(); */ - - Optimizer *optimizer = new SGDOptimizer(&ff, 0.001f); - std::vector metrics; - metrics.push_back(METRICS_ACCURACY); - metrics.push_back(METRICS_SPARSE_CATEGORICAL_CROSSENTROPY); - ff.compile(optimizer, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics); + InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); + im.compile_model_and_allocate_buffer(); + im.init_operators_inference(); // Data Loader - // ParallelTensor input_pt, label_pt; - // ff.get_parallel_tensor_from_tensor(input, input_pt); - // ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); - // DataLoader data_loader(ff, moeConfig, input_pt, label_pt); - - ff.init_operators(); + /* ParallelTensor input_pt, label_pt; + ff.get_parallel_tensor_from_tensor(input, input_pt); + ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); + DataLoader data_loader(ff, moeConfig, input_pt, label_pt); */ //----------------------------------------------------------------- @@ -162,50 +158,21 @@ 
void FlexFlow::top_level_task(Task const *task, /////////////////////////////////////////////////////////////////////////////////// - // int index = 0; - // int processed_requests = 0; - // Generator data_generator( - // total_requests, request_tensor_size, poisson_distribution, lambda); - // while (processed_requests < total_requests) { - // vector> req = data_generator.get_requests(); - // int iterations = req.size(); - // for (int iter = 0; iter < iterations; iter++) { - // // data_loader.next_batch(ff); - // runtime->begin_trace(ctx, 111 /*trace_id*/); - // im.inference((index++) % num_inflight_batches); - // runtime->end_trace(ctx, 111 /*trace_id*/); - // } - // processed_requests += iterations; - // } - - for (int epoch = 0; epoch < ffConfig.epochs; epoch++) { - // data_loader.reset(); - ff.reset_metrics(); - int iterations = TRAIN_SAMPLES / ffConfig.batchSize; - + int index = 0; + int processed_requests = 0; + Generator data_generator( + total_requests, request_tensor_size, poisson_distribution, lambda); + // data_loader.reset(); + while (processed_requests < total_requests) { + vector> req = data_generator.get_requests(); + int iterations = req.size(); for (int iter = 0; iter < iterations; iter++) { // data_loader.next_batch(ff); - if (epoch > 0) { - runtime->begin_trace(ctx, 111 /*trace_id*/); - } - ff.forward(); - ff.zero_gradients(); - // ff.backward(); - ff.update(); - // ff.recompile_on_condition(r); - if (epoch > 0) { - runtime->end_trace(ctx, 111 /*trace_id*/); - } + runtime->begin_trace(ctx, 111 /*trace_id*/); + im.inference((index++) % num_inflight_batches); + runtime->end_trace(ctx, 111 /*trace_id*/); } - - // TODO: Do properly - ff.reset_metrics(); - // iterations = TEST_SAMPLES / ffConfig.batchSize; - // for (int iter = 0; iter < iterations; iter++) { - // data_loader.next_batch(ff); - // ff.forward(); - // ff.backward(); - // } + processed_requests += iterations; } /////////////////////////////////////////////////////////////////////////////////// diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 5c9fe5f497..bb2a70e8a8 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -27,6 +27,7 @@ class InferenceManager { int max_num_requests_per_batch, int max_num_inflight_batches); void compile_model_and_allocate_buffer(void); + void init_operators_inference(); void inference(int index); public: diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 9ad9d52eab..2e13e9f4cf 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -57,6 +57,7 @@ enum TaskIDs { EXPERTS_INIT_TASK_ID, EXPERTS_FWD_TASK_ID, EXPERTS_BWD_TASK_ID, + EXPERTS_INF_TASK_ID, CONV2D_INIT_TASK_ID, CONV2D_INIT_PARA_TASK_ID, CONV2D_FWD_TASK_ID, @@ -760,6 +761,9 @@ class FFModel { Legion::Runtime *runtime); void reset_metrics(); void init_operators(); + void init_operators_inference( + std::vector const &batch_inputs, + std::vector const &batch_outputs); void prefetch(); void forward(int seq_length = -1); void compute_metrics(); diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index a276f0fd74..280df29f83 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -185,6 +185,11 @@ class Op { virtual bool get_weight_parameter(TNParameter, DIMParameter, int *) const; // Pure virtual functions that must be implemented virtual void init(FFModel const &) = 0; + virtual void init_inference(FFModel const &, + std::vector const &, + std::vector const &) { + assert(false); + }; virtual void 
forward(FFModel const &) = 0; virtual void backward(FFModel const &) = 0; // Pure virtual functions for inference diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index ba7240802b..4d6aaeccb9 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -35,6 +35,9 @@ class Aggregate : public Op { Input const &inputs, char const *name = nullptr); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; void inference(FFModel const &, std::vector const &, diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 816574ced0..2792ce58a4 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -27,6 +27,9 @@ class AggregateSpec : public Op { float _lambda_bal, char const *name); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; void inference(FFModel const &, std::vector const &, diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index 1531708bb7..684e29e910 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -64,6 +64,9 @@ class MultiHeadAttention : public Op { Layer const *layer, std::vector const &inputs); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 2f081f1b7e..1116519e8c 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -25,6 +25,9 @@ class ElementBinary : public Op { char const *name = nullptr, bool inplace_a = false); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index a7f0b46e05..190e76d865 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -7,8 +7,14 @@ namespace FlexFlow { class ExpertsMeta : public OpMeta { public: - ExpertsMeta(FFHandler handler, int num_experts); + ExpertsMeta(FFHandler handler, + int _num_experts, + int _experts_start_idx, + float _alpha); ~ExpertsMeta(void); + int num_experts; + int experts_start_idx; + float alpha; float **dev_region_ptrs; }; @@ -39,6 +45,9 @@ class Experts : public Op { std::vector const &inputs); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, @@ -65,9 +74,6 @@ class Experts : public Op { int const *acc_indices_ptr, float const *acc_topk_gate_preds_ptr, float **outputs, - int num_experts, - int experts_start_idx, - int expert_capacity, int chosen_experts, int batch_size, int out_dim); diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index afa69d891c..0b8a001f67 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -9,8 
+9,9 @@ namespace FlexFlow { class GroupByMeta : public OpMeta { public: - GroupByMeta(FFHandler handle, int n); + GroupByMeta(FFHandler handle, int n, float _alpha); ~GroupByMeta(void); + float alpha; float **dev_region_ptrs; }; @@ -33,6 +34,9 @@ class Group_by : public Op { Input const &inputs, char const *name = nullptr); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, @@ -66,26 +70,22 @@ class Group_by : public Op { Op *materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const override; - static void - forward_kernel_wrapper(GroupByMeta const *m, - float const *input, - int const *exp_assign, - float **outputs, - int n, // num experts - int k, // chosen experts - float alpha, // factor additional memory assigned - int batch_size, - int data_dim); - static void - backward_kernel_wrapper(GroupByMeta const *m, - float *input_grad, - int const *exp_assign, - float **output_grads, - int n, // num experts - int k, // chosen experts - float alpha, // factor additional memory assigned - int batch_size, - int data_dim); + static void forward_kernel_wrapper(GroupByMeta const *m, + float const *input, + int const *exp_assign, + float **outputs, + int n, // num experts + int k, // chosen experts + int batch_size, + int data_dim); + static void backward_kernel_wrapper(GroupByMeta const *m, + float *input_grad, + int const *exp_assign, + float **output_grads, + int n, // num experts + int k, // chosen experts + int batch_size, + int data_dim); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index c05461acdf..3a81fe50f1 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -24,6 +24,9 @@ class LayerNorm : public Op { bool allocate_weights, char const *name); void init(FFModel const &); + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &); void backward(FFModel const &); void inference(FFModel const &, diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index ab1c1febc5..fb2767a590 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -35,6 +35,9 @@ class Linear : public Op { bool allocate_weights = false); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/ops/noop.h b/include/flexflow/ops/noop.h index 5f39c999e6..688ab083df 100644 --- a/include/flexflow/ops/noop.h +++ b/include/flexflow/ops/noop.h @@ -17,7 +17,14 @@ class NoOp : public Op { const ParallelTensor output, char const *name = NULL); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 
25a20315bd..c1d3ebdba2 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -21,7 +21,14 @@ class Softmax : public Op { const Input input, char const *name = nullptr); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; void print_layer(FFModel const &model) override { diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index af62f51c93..9c82930822 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -28,6 +28,9 @@ class TopK : public Op { Input const input, char const *name = nullptr); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/parallel_ops/parallel_op.h b/include/flexflow/parallel_ops/parallel_op.h index a374b7ab40..0bf573996c 100644 --- a/include/flexflow/parallel_ops/parallel_op.h +++ b/include/flexflow/parallel_ops/parallel_op.h @@ -24,6 +24,12 @@ class ParallelOp : public Op { virtual void forward(FFModel const &) = 0; virtual void backward(FFModel const &) = 0; virtual void create_input_partition(FFModel &model) = 0; + virtual void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(false); + } void print_layer(FFModel const &model){}; virtual bool measure_operator_cost(Simulator *sim, MachineView const &pc, @@ -34,6 +40,8 @@ class ParallelOp : public Op { public: Legion::LogicalPartition input_lp, output_grad_lp; + std::unordered_map + inference_input_lps; }; }; // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/partition.h b/include/flexflow/parallel_ops/partition.h index 5c2fa9c228..d940841eb4 100644 --- a/include/flexflow/parallel_ops/partition.h +++ b/include/flexflow/parallel_ops/partition.h @@ -24,8 +24,19 @@ class Repartition : public ParallelOp { Input const input, char const *name = nullptr); void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) override; void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &) override; void forward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index a1e5fcbbad..458a58b3fe 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -182,6 +182,29 @@ Node Aggregate::deserialize(FFModel &ff, return ff.get_or_create_node(inputs, params); } +void Aggregate::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + 
Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(AGGREGATE_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Aggregate)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void Aggregate::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 1e3d66fdee..a206610095 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -155,6 +155,29 @@ AggregateSpec::AggregateSpec(FFModel &model, numWeights = 0; } +void AggregateSpec::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(AGG_SPEC_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AggregateSpec)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void AggregateSpec::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -193,7 +216,7 @@ void AggregateSpec::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(AGG_SPEC_FWD_TASK_ID, parallel_is, - TaskArgument(this, sizeof(AggregateSpec)), + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -244,7 +267,7 @@ void AggregateSpec::inference(FFModel const &ff, size_t machine_view_hash = mv ? 
mv->hash() : outputs[0]->machine_view.hash(); IndexLauncher launcher(AGG_SPEC_FWD_TASK_ID, parallel_is, - TaskArgument(this, sizeof(AggregateSpec)), + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -288,9 +311,9 @@ void AggregateSpec::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - int n = ((AggregateSpec *)task->args)->n; + assert(regions.size() == task->regions.size()); + int n = regions.size() - 3; - assert((int)regions.size() == n + 3); assert((int)task->regions.size() == n + 3); AggregateSpecMeta const *m = *((AggregateSpecMeta **)task->local_args); diff --git a/src/ops/attention.cc b/src/ops/attention.cc index d9ee14ecb6..8f5043e49e 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -372,6 +372,59 @@ MultiHeadAttention::MultiHeadAttention( allocate_weights, name) {} +void MultiHeadAttention::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(MultiHeadAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[2]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(4, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void MultiHeadAttention::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -533,7 +586,8 @@ void MultiHeadAttention::inference( Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); int idx = 0; - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); IndexLauncher launcher(ATTENTION_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index feb1862b04..ab4df2826a 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -246,6 +246,71 @@ void ElementBinary::do_inplace_output(void) { inplace_a = true; } +void ElementBinary::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + // Check if we have the same oprands + has_same_operands = (batch_inputs[0]->region == batch_inputs[1]->region); + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ELEMENTBINARY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ElementBinary)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + int rid = 0; + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(rid++, FID_DATA); + if (!has_same_operands) { + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(rid++, FID_DATA); + } else { + assert(batch_inputs[0]->part == batch_inputs[1]->part); + } + if (!inplace_a) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(rid++, FID_DATA); + } else { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + // launcher.add_region_requirement( + // RegionRequirement(input_grad_lps[0], 0/*projection id*/, + // WRITE_ONLY, EXCLUSIVE, inputs[0]->region_grad)); + // launcher.add_field(3, FID_DATA); + // if (inputs[0]->region_grad != inputs[1]->region_grad) { + // regions[4](I/O): input1_grad + // launcher.add_region_requirement( + // RegionRequirement(input_grad_lps[1], 0/*projection id*/, + // WRITE_ONLY, EXCLUSIVE, inputs[1]->region_grad)); + // launcher.add_field(4, FID_DATA); + //} + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void ElementBinary::init(FFModel const &ff) { // Check if we have the same oprands has_same_operands = (inputs[0]->region == inputs[1]->region); @@ -432,7 +497,8 @@ void ElementBinary::inference(FFModel const &ff, Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 6450ab5173..b5dbfae0b1 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -330,6 +330,58 @@ Node Experts::deserialize(FFModel &ff, return ff.get_or_create_node(inputs, params); } +void Experts::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(EXPERTS_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Experts)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + // expert predictions + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // expert assignment indices + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // topk_gate_preds + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[2]->region)); + launcher.add_field(2, FID_DATA); + for (int i = 0; i < num_experts; i++) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); + launcher.add_field(i + 3, FID_DATA); + } + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void Experts::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -385,7 +437,8 @@ OpMeta *Experts::init_task(Task const *task, Runtime *runtime) { Experts const *exp = (Experts *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta(handle, exp->num_experts); + ExpertsMeta *m = new ExpertsMeta( + handle, exp->num_experts, exp->experts_start_idx, exp->alpha); m->profiling = exp->profiling; return m; } @@ -398,7 +451,7 @@ void Experts::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(EXPERTS_FWD_TASK_ID, parallel_is, - TaskArgument(this, sizeof(Experts)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -446,62 +499,58 @@ void Experts::inference(FFModel const &ff, Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); - IndexLauncher launcher(EXPERTS_FWD_TASK_ID, + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, - TaskArgument(this, sizeof(Experts)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); // expert predictions - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[0]->region)); + batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); // expert assignment indices - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); // topk_gate_preds - launcher.add_region_requirement(RegionRequirement(inputs[2]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[2]->region)); + batch_inputs[2]->region)); launcher.add_field(2, FID_DATA); for (int i = 0; i < num_experts; i++) { // expert output per token (only the chosen experts have non-zero // contributions) - launcher.add_region_requirement(RegionRequirement(outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[i]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); launcher.add_field(i + 3, FID_DATA); } runtime->execute_index_space(ctx, launcher); } -void Experts::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +void Experts::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { assert(regions.size() == task->regions.size()); int num_experts = regions.size() - 3; - Experts const *exp = (Experts *)task->args; - assert(exp != nullptr); - assert(exp->num_experts == num_experts); - float alpha = exp->alpha; - int experts_start_idx = exp->experts_start_idx; - ExpertsMeta const *m = *((ExpertsMeta **)task->local_args); // get input, indices, topk_gate_preds @@ -546,9 +595,6 @@ void Experts::forward_task(Task const *task, } } - int expert_capacity = - ceil(alpha * (int)chosen_experts / num_experts * (int)batch_size); - assert(batch_size <= MAX_BATCH_SIZE && "batch size exceeds MAX_BATCH_SIZE defined in experts.h"); assert( @@ -575,14 +621,18 @@ void Experts::forward_task(Task const *task, indices_ptr, topk_gate_pred_ptr, outputs, - num_experts, - experts_start_idx, - expert_capacity, chosen_experts, batch_size, out_dim); } +void Experts::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(false && "Experts is designed for inference only"); +} + void Experts::backward(FFModel const &ff) { assert(false && "Experts is designed for inference only"); } diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index a19c7a3a9a..92fb1e6e13 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -90,15 +90,15 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int const *indices, float const *topk_gate_preds, float **outputs, - int num_experts, - int experts_start_idx, - int expert_capacity, int chosen_experts, int batch_size, int out_dim) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + int expert_capacity = + ceil(m->alpha * chosen_experts / 
m->num_experts * batch_size); + // cudaEvent_t t_start, t_end; // if (m->profiling) { // cudaEventCreate(&t_start); @@ -107,21 +107,21 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, // } hipMemcpy(m->dev_region_ptrs, outputs, - num_experts * sizeof(float *), + m->num_experts * sizeof(float *), hipMemcpyHostToDevice); hipLaunchKernelGGL( experts_forward_kernel, - GET_BLOCKS(batch_size * num_experts * out_dim), - min(CUDA_NUM_THREADS, (int)(batch_size * num_experts * out_dim)), + GET_BLOCKS(batch_size * m->num_experts * out_dim), + min(CUDA_NUM_THREADS, (int)(batch_size * m->num_experts * out_dim)), 0, stream, input, indices, topk_gate_preds, m->dev_region_ptrs, - num_experts, - experts_start_idx, + m->num_experts, + m->experts_start_idx, chosen_experts, expert_capacity, batch_size, @@ -138,7 +138,12 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, // } } -ExpertsMeta::ExpertsMeta(FFHandler handler, int num_experts) : OpMeta(handler) { +ExpertsMeta::ExpertsMeta(FFHandler handler, + int _num_experts, + int _experts_start_idx, + float _alpha) + : OpMeta(handler), num_experts(_num_experts), + experts_start_idx(_experts_start_idx), alpha(_alpha) { checkCUDA(hipMalloc(&dev_region_ptrs, num_experts * sizeof(float *))); } ExpertsMeta::~ExpertsMeta(void) { diff --git a/src/ops/experts.cu b/src/ops/experts.cu index b3a7f3d3ca..769b96e5ae 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -89,9 +89,6 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int const *indices, float const *topk_gate_preds, float **outputs, - int num_experts, - int experts_start_idx, - int expert_capacity, int chosen_experts, int batch_size, int out_dim) { @@ -100,6 +97,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, // checkCUDA(cublasSetStream(m->handle.blas, stream)); // checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + int expert_capacity = + ceil(m->alpha * chosen_experts / m->num_experts * batch_size); + cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -110,20 +110,20 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, // call forward_kernel cudaMemcpyAsync(m->dev_region_ptrs, outputs, - num_experts * sizeof(float *), + m->num_experts * sizeof(float *), cudaMemcpyHostToDevice, stream); - experts_forward_kernel<<num_experts * out_dim), min(CUDA_NUM_THREADS, - (int)(batch_size * num_experts * out_dim)), + (int)(batch_size * m->num_experts * out_dim)), 0, stream>>>(input, indices, topk_gate_preds, m->dev_region_ptrs, - num_experts, - experts_start_idx, + m->num_experts, + m->experts_start_idx, chosen_experts, expert_capacity, batch_size, @@ -140,7 +140,12 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } } -ExpertsMeta::ExpertsMeta(FFHandler handler, int num_experts) : OpMeta(handler) { +ExpertsMeta::ExpertsMeta(FFHandler handler, + int _num_experts, + int _experts_start_idx, + float _alpha) + : OpMeta(handler), num_experts(_num_experts), + experts_start_idx(_experts_start_idx), alpha(_alpha) { checkCUDA(cudaMalloc(&dev_region_ptrs, num_experts * sizeof(float *))); } ExpertsMeta::~ExpertsMeta(void) { diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index e6dc00f690..f6e05945a6 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -164,6 +164,54 @@ Group_by::Group_by(FFModel &model, : Group_by( model, inputs.first, inputs.second, params.n, params.alpha, name) {} +void Group_by::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + 
assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(GROUP_BY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Group_by)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + // data + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // assign + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + + // output + for (int i = 0; i < n; i++) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); + launcher.add_field(i + 2, FID_DATA); + } + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void Group_by::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -214,7 +262,7 @@ OpMeta *Group_by::init_task(Task const *task, Runtime *runtime) { Group_by *gb = (Group_by *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - GroupByMeta *m = new GroupByMeta(handle, gb->n); + GroupByMeta *m = new GroupByMeta(handle, gb->n, gb->alpha); m->profiling = gb->profiling; return m; } @@ -226,7 +274,7 @@ void Group_by::forward(FFModel const &ff) { set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(GROUP_BY_FWD_TASK_ID, parallel_is, - TaskArgument(this, sizeof(Group_by)), + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -268,10 +316,11 @@ void Group_by::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); IndexLauncher launcher(GROUP_BY_FWD_TASK_ID, parallel_is, - TaskArgument(this, sizeof(Group_by)), + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -311,12 +360,7 @@ void Group_by::forward_task(Task const *task, std::vector const &regions, Context ctx, Runtime *runtime) { - // Get n, alpha - Group_by const *gb = (Group_by *)task->args; - int n = gb->n; - float alpha = gb->alpha; - - assert((int)regions.size() == n + 2); + int n = (int)regions.size() - 2; assert((int)task->regions.size() == n + 2); GroupByMeta const *m = *((GroupByMeta **)task->local_args); @@ -343,7 +387,6 @@ void Group_by::forward_task(Task const *task, // Each entry in the "outputs" vector points to the Legion tensor that will // contain the tokens dispatched to the corresponding expert float *outputs[n]; - int exp_output_rows = (int)ceil(alpha * k / n * batch_size); for (int i = 0; i < n; i++) { Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[i + 2].region.get_index_space()); @@ -352,7 +395,6 @@ void Group_by::forward_task(Task const *task, coord_t output_rows = out_domain.hi()[1] - out_domain.lo()[1] + 1; coord_t output_cols = out_domain.hi()[0] - out_domain.lo()[0] + 1; - assert((int)output_rows == exp_output_rows); assert(output_cols == input_cols); } @@ -362,7 +404,6 @@ void Group_by::forward_task(Task const *task, outputs, n, k, - alpha, batch_size, data_dim); } @@ -374,7 +415,7 @@ void Group_by::backward(FFModel const &ff) { set_argumentmap_for_backward(ff, argmap); IndexLauncher launcher(GROUP_BY_BWD_TASK_ID, parallel_is, - TaskArgument(this, sizeof(Group_by)), + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -414,13 +455,9 @@ void Group_by::backward_task(Task const *task, std::vector const &regions, Context ctx, Runtime *runtime) { - // Get n, alpha GroupByMeta const *m = *((GroupByMeta **)task->local_args); - Group_by const *gb = (Group_by *)task->args; - int n = gb->n; - float alpha = gb->alpha; - assert((int)regions.size() == n + 2); + int n = (int)regions.size() - 2; assert((int)task->regions.size() == n + 2); // get input and assign regions @@ -442,7 +479,6 @@ void Group_by::backward_task(Task const *task, // get output float *output_grads[n]; - int exp_output_rows = (int)ceil(alpha * k / n * batch_size); for (int i = 0; i < n; i++) { Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[i + 2].region.get_index_space()); @@ -451,7 +487,6 @@ void Group_by::backward_task(Task const *task, coord_t output_rows = out_domain.hi()[1] - out_domain.lo()[1] + 1; coord_t output_cols = out_domain.hi()[0] - out_domain.lo()[0] + 1; - assert((int)output_rows == exp_output_rows); assert(output_cols == input_cols); } @@ -461,7 +496,6 @@ void Group_by::backward_task(Task const *task, output_grads, n, k, - alpha, batch_size, data_dim); } @@ -512,7 +546,7 @@ bool Group_by::measure_operator_cost(Simulator *sim, } } - GroupByMeta *m = new GroupByMeta(sim->handler, n); + GroupByMeta *m = new GroupByMeta(sim->handler, n, alpha); // allocate sim->free_all(); @@ -546,15 +580,8 @@ bool Group_by::measure_operator_cost(Simulator *sim, int data_dim = in_domain.hi()[0] - in_domain.lo()[0] + 1; forward = [&] { - forward_kernel_wrapper(m, - input_ptr, - assign_ptr, - output_ptrs, - n, - k, - alpha, - batch_size, - data_dim); + forward_kernel_wrapper( + m, input_ptr, assign_ptr, output_ptrs, n, k, batch_size, data_dim); }; inner_measure_operator_cost(sim, forward, backward, cost_metrics); diff --git 
a/src/ops/group_by.cpp b/src/ops/group_by.cpp index f45e9092a5..51bcd7d7b4 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -118,16 +118,17 @@ __global__ void } /*static*/ -void Group_by::forward_kernel_wrapper( - GroupByMeta const *m, - float const *input, - int const *exp_assign, - float **outputs, - int n, // num experts - int k, // chosen experts - float alpha, // factor additional memory assigned - int batch_size, - int data_dim) { +void Group_by::forward_kernel_wrapper(GroupByMeta const *m, + float const *input, + int const *exp_assign, + float **outputs, + int n, // num experts + int k, // chosen experts + int batch_size, + int data_dim) { + + float alpha = m->alpha; + // TODO: why cublas/cudnn stream is needed here? hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -151,16 +152,17 @@ void Group_by::forward_kernel_wrapper( data_dim); } -void Group_by::backward_kernel_wrapper( - GroupByMeta const *m, - float *input_grad, - int const *exp_assign, - float **output_grads, - int n, // num experts - int k, // chosen experts - float alpha, // factor additional memory assigned - int batch_size, - int data_dim) { +void Group_by::backward_kernel_wrapper(GroupByMeta const *m, + float *input_grad, + int const *exp_assign, + float **output_grads, + int n, // num experts + int k, // chosen experts + int batch_size, + int data_dim) { + + float alpha = m->alpha; + // TODO: why cublas/cudnn stream is needed here hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -186,7 +188,8 @@ void Group_by::backward_kernel_wrapper( data_dim); } -GroupByMeta::GroupByMeta(FFHandler handler, int n) : OpMeta(handler) { +GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) + : OpMeta(handler), alpha(_alpha) { checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index ee0b18337c..0ed09e20b3 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -106,17 +106,18 @@ __global__ void } /*static*/ -void Group_by::forward_kernel_wrapper( - GroupByMeta const *m, - float const *input, - int const *exp_assign, - float **outputs, - int n, // num experts - int k, // chosen experts - float alpha, // factor additional memory assigned - int batch_size, - int data_dim) { +void Group_by::forward_kernel_wrapper(GroupByMeta const *m, + float const *input, + int const *exp_assign, + float **outputs, + int n, // num experts + int k, // chosen experts + int batch_size, + int data_dim) { // TODO: why cublas/cudnn stream is needed here? 
+ + float alpha = m->alpha; + cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -148,16 +149,17 @@ void Group_by::forward_kernel_wrapper( } } -void Group_by::backward_kernel_wrapper( - GroupByMeta const *m, - float *input_grad, - int const *exp_assign, - float **output_grads, - int n, // num experts - int k, // chosen experts - float alpha, // factor additional memory assigned - int batch_size, - int data_dim) { +void Group_by::backward_kernel_wrapper(GroupByMeta const *m, + float *input_grad, + int const *exp_assign, + float **output_grads, + int n, // num experts + int k, // chosen experts + int batch_size, + int data_dim) { + + float alpha = m->alpha; + // TODO: why cublas/cudnn stream is needed here cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -196,7 +198,8 @@ void Group_by::backward_kernel_wrapper( } } -GroupByMeta::GroupByMeta(FFHandler handler, int n) : OpMeta(handler) { +GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) + : OpMeta(handler), alpha(_alpha) { checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index bec6f7d651..11b1185b1c 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -194,6 +194,41 @@ LayerNorm::LayerNorm(FFModel &model, return; } +void LayerNorm::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(LAYERNORM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(LayerNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void LayerNorm::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -286,7 +321,8 @@ void LayerNorm::inference(FFModel const &ff, Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); IndexLauncher launcher(LAYERNORM_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 0e09e20e44..fe9a3925a7 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -254,6 +254,56 @@ void Linear::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void Linear::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(check_output_input_weight_same_parallel_is()); + // assert(check_output_input_weight_same_machine_view()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(LINEAR_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Linear)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + // launcher.add_region_requirement( + // RegionRequirement(input_lps[0], 0/*projection id*/, + // READ_ONLY, EXCLUSIVE, inputs[0]->region)); + // launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + // launcher.add_region_requirement( + // RegionRequirement(weights[1]->part, 0/*projection id*/, + // READ_ONLY, EXCLUSIVE, weights[1]->region)); + // launcher.add_field(3, FID_DATA); + if (ff.config.computationMode == COMP_MODE_TRAINING) { + // Add inputs[0].region_grad to avoid Legion warning + // launcher.add_region_requirement( + // RegionRequirement(input_grad_lps[0], 0/*projection id*/, + // WRITE_ONLY, EXCLUSIVE, inputs[0].region_grad)); + // launcher.add_field(2, FID_DATA); + } + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + /* regions[0](O): output regions[1](I): kernel @@ -375,7 +425,8 @@ void Linear::inference(FFModel const &ff, Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); IndexLauncher launcher(LINEAR_FWD_TASK_ID, parallel_is, TaskArgument(nullptr, 0), diff --git a/src/ops/noop.cc b/src/ops/noop.cc index 94fff30553..91e890ed9f 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -24,6 +24,7 @@ using Legion::coord_t; using Legion::Domain; using Legion::FutureMap; using Legion::IndexLauncher; +using Legion::IndexSpace; using Legion::InlineLauncher; using Legion::LogicalPartition; using Legion::LogicalRegion; @@ -94,8 +95,90 @@ OpMeta *NoOp::init_task(Task const *task, return m; } +void NoOp::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + parallel_is = batch_outputs[0]->parallel_is; + assert(parallel_is != IndexSpace::NO_SPACE); + if (op_type == OP_INPUT && batch_outputs[0]->initializer != nullptr) { + ConstantInitializer *initializer = + (ConstantInitializer *)batch_outputs[0]->initializer; + Runtime *runtime = ff.config.lg_hlr; + Context ctx = ff.config.lg_ctx; + ArgumentMap argmap; + IndexLauncher launcher( + CONSTANT_INIT_TASK_ID, + parallel_is, + TaskArgument(initializer, sizeof(ConstantInitializer)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } else if (op_type == OP_INPUT) { + // For OP_INPUT, initialize tensor to zero + assert(batch_outputs[0]->region != LogicalRegion::NO_REGION); + if (batch_outputs[0]->part == LogicalPartition::NO_PART) { + return; + } + ConstantInitializer *initializer = NULL; + if (batch_outputs[0]->data_type == DT_FLOAT) { + initializer = new ConstantInitializer(0.0f); + } else if (batch_outputs[0]->data_type == DT_INT64) { + initializer = new ConstantInitializer((int64_t)0); + } else if (batch_outputs[0]->data_type == DT_INT32) { + initializer = new ConstantInitializer((int)0); + } + Runtime *runtime = ff.config.lg_hlr; + Context ctx = ff.config.lg_ctx; + ArgumentMap argmap; + IndexLauncher launcher( + CONSTANT_INIT_TASK_ID, + parallel_is, + TaskArgument(initializer, sizeof(ConstantInitializer)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } else if (op_type == OP_WEIGHT) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(NOOP_INIT_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); + } +} + void NoOp::init(FFModel const &ff) { parallel_is = outputs[0]->parallel_is; + assert(parallel_is != IndexSpace::NO_SPACE); if (op_type == OP_INPUT && outputs[0]->initializer != nullptr) { ConstantInitializer *initializer = (ConstantInitializer *)outputs[0]->initializer; @@ -172,6 +255,11 @@ void NoOp::init(FFModel const &ff) { void NoOp::forward(FFModel const &ff) {} +void 
NoOp::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) {} + void NoOp::backward(FFModel const &ff) {} bool NoOp::measure_operator_cost(Simulator *sim, diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 029b20afd1..b07ae0ad68 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -115,6 +115,40 @@ Softmax::Softmax(FFModel &model, char const *name) : Softmax(model, input, params.dim, name) {} +void Softmax::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(SOFTMAX_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Softmax)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_DISCARD, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void Softmax::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -188,6 +222,39 @@ OpMeta *Softmax::init_task(Task const *task, return m; } +void Softmax::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + IndexLauncher launcher(SOFTMAX_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + void Softmax::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; diff --git a/src/ops/topk.cc b/src/ops/topk.cc index ec6da77a31..b260902cd7 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -136,6 +136,46 @@ TopK::TopK(FFModel &model, char const *name) : TopK(model, input, params.k, params.sorted, name) {} +void TopK::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(TopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + void TopK::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -228,7 +268,8 @@ void TopK::inference(FFModel const &ff, Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); IndexLauncher launcher(TOPK_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 3ff02db766..6f808a3978 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -101,6 +101,43 @@ OpMeta *Repartition::init_task(Task const *task, return nullptr; } +void Repartition::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + ArgumentMap argmap; + parallel_is = batch_outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(REPARTITION_INIT_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_outputs[0]->machine_view.hash()); + assert(inference_input_lps.find(batch_inputs[0]) != + inference_input_lps.end()); + launcher.add_region_requirement( + RegionRequirement(inference_input_lps[batch_inputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); +} + void Repartition::init(FFModel const &ff) { ArgumentMap argmap; parallel_is = outputs[0]->parallel_is; @@ -130,6 +167,7 @@ void Repartition::init(FFModel const &ff) { } void Repartition::create_input_partition(FFModel &ff) { + assert(ff.config.computationMode == COMP_MODE_TRAINING); assert(outputs[0]->part != LogicalPartition::NO_PART); assert(inputs[0]->part != LogicalPartition::NO_PART); ff.create_disjoint_partition(outputs[0]->num_dims, @@ -144,6 +182,57 @@ void Repartition::create_input_partition(FFModel &ff) { output_grad_lp); } +void Repartition::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + ff.create_disjoint_partition(batch_outputs[0]->num_dims, + batch_outputs[0]->dims, + batch_outputs[0]->parallel_is, + batch_inputs[0]->region, + inference_input_lps[batch_inputs[0]]); +} + +void Repartition::inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + IndexLauncher launcher(REPARTITION_FWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(&data_type, sizeof(DataType)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_input_lps[batch_inputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + void Repartition::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index be572848be..8f926da316 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -14,6 +14,7 @@ */ #include "flexflow/inference.h" +#include "flexflow/parallel_ops/parallel_op.h" namespace FlexFlow { @@ -57,17 +58,60 @@ void InferenceManager::compile_model_and_allocate_buffer(void) { } } +void InferenceManager::init_operators_inference() { + for (int index = 0; index < max_num_inflight_batches; index++) { + for (size_t o = 0; o < model->operators.size(); o++) { + Op *op = model->operators[o]; + if (op->op_type == OP_WEIGHT) { + continue; + } + std::vector inputs(op->numInputs); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > index); + inputs[i] = tensor_buffer[op->inputs[i]][index]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->outputs[i]].size() > index); + outputs[i] = tensor_buffer[op->outputs[i]][index]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + if (op->is_parallel_op()) { + ((ParallelOp *)op) + ->create_input_partition_inference(*model, inputs, outputs); + } + op->init_inference(*model, inputs, outputs); + } + } +} + void InferenceManager::inference(int index) { assert(index < max_num_inflight_batches); for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; + if (op->op_type == OP_WEIGHT) { + continue; + } std::vector inputs(op->numInputs); std::vector outputs(op->numOutputs); for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > index); inputs[i] = tensor_buffer[op->inputs[i]][index]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); } for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->outputs[i]].size() > index); outputs[i] = tensor_buffer[op->outputs[i]][index]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } op->inference(*model, inputs, outputs); } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index e0fc25d1ad..6419a2d61f 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2925,8 +2925,10 @@ void FFModel::compile(LossType loss_type, // // Output tensor // 
map_tensor(op->outputs[i], op); // } - if (op->is_parallel_op()) { - ((ParallelOp *)op)->create_input_partition(*this); + if (config.computationMode == COMP_MODE_TRAINING) { + if (op->is_parallel_op()) { + ((ParallelOp *)op)->create_input_partition(*this); + } } // op->map_output_tensors(*this); } @@ -3114,9 +3116,11 @@ void FFModel::compile(LossType loss_type, assert(false && "Unsupported dim"); } } - // init optimizer - assert(optimizer != NULL); - optimizer->init(); + if (config.computationMode == COMP_MODE_TRAINING) { + // init optimizer + assert(optimizer != NULL); + optimizer->init(); + } #ifdef FF_USE_NCCL if (config.computationMode == COMP_MODE_TRAINING) { @@ -3783,6 +3787,13 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "Experts Backward Task"); } + { + TaskVariantRegistrar registrar(EXPERTS_INF_TASK_ID, "Experts Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Experts Inference Task"); + } // Cast { TaskVariantRegistrar registrar(CAST_INIT_TASK_ID, "Cast Init"); From cf64baa327103504e9d2c30c5b90e76bc9c403c6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 5 Feb 2023 20:45:17 +0000 Subject: [PATCH 056/344] fix LayerNorm dimensions error --- src/ops/layer_norm.cc | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 11b1185b1c..8d8a8c5980 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -177,12 +177,28 @@ LayerNorm::LayerNorm(FFModel &model, _input->num_dims, _input->dims, _input->data_type, this); assert(check_output_input_weight_parallel_dims(allocate_weights)); ParallelDim output_dims[MAX_TENSOR_DIM]; - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; + int degree_product = 1; + effective_num_elements = 1; + effective_batch_size = 1; + for (int i = 0; i < inputs[0]->num_dims; i++) { + degree_product *= inputs[0]->dims[i].degree; + bool found = false; + for (int j = 0; j < axes.size(); j++) { + if (i == inputs[0]->num_dims - 1 - axes[j]) { + found = true; + break; + } + } + if (found) { + effective_num_elements *= + inputs[0]->dims[i].size / inputs[0]->dims[i].degree; + } else { + effective_batch_size *= + inputs[0]->dims[i].size / inputs[0]->dims[i].degree; + } } - effective_num_elements = M; - effective_batch_size = inputs[0]->get_volume() / M; + assert(effective_num_elements * effective_batch_size * degree_product == + inputs[0]->get_volume()); if (numWeights > 0 && allocate_weights) { int kernel_dims = 2; assert(false); From 373da006cf99cdb8e7e36beb3cd1d551c11a0fc6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 5 Feb 2023 16:23:44 -0500 Subject: [PATCH 057/344] Revert "fix LayerNorm dimensions error". The issue will be fixed when we update the mapping to prevent FlexFlow from partitioning the dimensions within each block of requests This reverts commit cf64baa327103504e9d2c30c5b90e76bc9c403c6. 
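The arithmetic at stake in this revert is easy to miss: the restored code derives the effective batch size from global dimension sizes, while the reverted fix used per-shard extents (size divided by the partition degree). Below is a minimal standalone sketch of the two computations, with a hypothetical Dim struct standing in for FlexFlow's ParallelDim and under the assumption that the volume used by the restored code is the global tensor volume.

#include <cstdio>

struct Dim {
  int size;        // global extent of the dimension
  int degree;      // number of shards this dimension is partitioned into
  bool normalized; // true if LayerNorm normalizes over this dimension
};

int main() {
  // Example: hidden size 1024 (normalized, unpartitioned); batch 16 split over 2 shards.
  Dim dims[] = {{1024, 1, true}, {16, 2, false}};

  long global_volume = 1;
  for (Dim const &d : dims) {
    global_volume *= d.size;
  }

  // Restored (pre-fix) computation: product of global sizes over the normalized dims.
  long M = 1;
  for (Dim const &d : dims) {
    if (d.normalized) {
      M *= d.size;
    }
  }
  long batch_from_volume = global_volume / M; // 16, although each shard only sees 8 samples

  // Reverted fix: accumulate per-shard extents (size / degree) instead.
  long per_shard_elements = 1, per_shard_batch = 1;
  for (Dim const &d : dims) {
    long local = d.size / d.degree;
    if (d.normalized) {
      per_shard_elements *= local;
    } else {
      per_shard_batch *= local;
    }
  }

  std::printf("M=%ld batch(from global volume)=%ld | per-shard elements=%ld batch=%ld\n",
              M, batch_from_volume, per_shard_elements, per_shard_batch);
  return 0;
}

With the batch dimension split across two shards, the restored computation reports a batch of 16 while each shard actually holds 8, which is why this commit defers the real fix to the mapping change that keeps these per-request dimensions unpartitioned.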
--- src/ops/layer_norm.cc | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 8d8a8c5980..11b1185b1c 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -177,28 +177,12 @@ LayerNorm::LayerNorm(FFModel &model, _input->num_dims, _input->dims, _input->data_type, this); assert(check_output_input_weight_parallel_dims(allocate_weights)); ParallelDim output_dims[MAX_TENSOR_DIM]; - int degree_product = 1; - effective_num_elements = 1; - effective_batch_size = 1; - for (int i = 0; i < inputs[0]->num_dims; i++) { - degree_product *= inputs[0]->dims[i].degree; - bool found = false; - for (int j = 0; j < axes.size(); j++) { - if (i == inputs[0]->num_dims - 1 - axes[j]) { - found = true; - break; - } - } - if (found) { - effective_num_elements *= - inputs[0]->dims[i].size / inputs[0]->dims[i].degree; - } else { - effective_batch_size *= - inputs[0]->dims[i].size / inputs[0]->dims[i].degree; - } + int M = 1; + for (int i = 0; i < axes.size(); i++) { + M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; } - assert(effective_num_elements * effective_batch_size * degree_product == - inputs[0]->get_volume()); + effective_num_elements = M; + effective_batch_size = inputs[0]->get_volume() / M; if (numWeights > 0 && allocate_weights) { int kernel_dims = 2; assert(false); From 2ed8913887d872216b33b6e44fdfddcc4df7de81 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 11 Feb 2023 00:14:26 -0500 Subject: [PATCH 058/344] [Inference][Experts] - New experts operator (except for the kernel) (#627) * removed obsolete code * add weights * fix * fix * fix * bug fix * added functions to hopefully fix parallel_is * fixed bugs --- include/flexflow/ops/experts.h | 24 +- include/flexflow/ops/experts_params.h | 55 ++- src/ops/experts.cc | 535 ++++++++++++++++++++++---- src/ops/experts.cpp | 124 +----- src/ops/experts.cu | 125 ++---- src/runtime/graph.cc | 2 +- src/runtime/model.cc | 1 + src/runtime/substitution.cc | 2 +- 8 files changed, 579 insertions(+), 289 deletions(-) diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 190e76d865..57d6153d0e 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -10,12 +10,16 @@ class ExpertsMeta : public OpMeta { ExpertsMeta(FFHandler handler, int _num_experts, int _experts_start_idx, - float _alpha); + float _alpha, + bool _use_bias, + ActiMode _activation); ~ExpertsMeta(void); + float const **dev_weights; int num_experts; int experts_start_idx; float alpha; - float **dev_region_ptrs; + bool use_bias; + ActiMode activation; }; // definitions for the CUDA kernel @@ -29,8 +33,10 @@ class Experts : public Op { Experts(FFModel &model, Params const ¶ms, Input const &inputs, + bool allocate_weights = false, char const *name = nullptr); Experts(FFModel &model, + LayerID const &layer_guid, ParallelTensor const *inputs, int _num_experts, int _experts_start_idx, @@ -38,6 +44,9 @@ class Experts : public Op { float _alpha, int _experts_num_layers, int _experts_internal_dim_size, + bool _use_bias, + ActiMode _activation, + bool allocate_weights, char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, @@ -70,10 +79,11 @@ class Experts : public Op { Legion::Context ctx, Legion::Runtime *runtime); static void forward_kernel_wrapper(ExpertsMeta const *m, - float const *acc_input_ptr, - int const *acc_indices_ptr, - float const *acc_topk_gate_preds_ptr, - float **outputs, + float const *input, 
+ int const *indices, + float const *topk_gate_preds, + float *output, + float const **weights, int chosen_experts, int batch_size, int out_dim); @@ -96,6 +106,8 @@ class Experts : public Op { float alpha; int experts_num_layers; int experts_internal_dim_size; + bool use_bias; + ActiMode activation; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/experts_params.h b/include/flexflow/ops/experts_params.h index 20a65a06f8..d5b4676f0e 100644 --- a/include/flexflow/ops/experts_params.h +++ b/include/flexflow/ops/experts_params.h @@ -1,17 +1,70 @@ #pragma once +#include "flexflow/operator.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { struct ExpertsParams { - bool is_valid(std::vector const &) const; + LayerID layer_guid; int num_experts; int experts_start_idx; int experts_output_dim_size; float alpha; int experts_num_layers; int experts_internal_dim_size; + bool use_bias; + ActiMode activation; + + bool is_valid(std::vector const &) const; + void solve_dims(const ParallelTensor input, + ParallelDim output_dims[MAX_TENSOR_DIM], + int *output_ndims, + ParallelDim kernel_dims[MAX_TENSOR_DIM], + int *kernel_ndims, + ParallelDim bias_dims[MAX_TENSOR_DIM], + int *bias_ndims) const; + void solve_dims(ParallelTensorShape const &input_shape, + ParallelTensorShape &output_shape, + ParallelTensorShape &kernel_shape, + ParallelTensorShape &bias_shape) const; + void solve_dims(ParallelTensorShape const &input_shape, + ParallelDim output_dims[MAX_TENSOR_DIM], + int *output_ndims, + ParallelDim kernel_dims[MAX_TENSOR_DIM], + int *kernel_ndims, + ParallelDim bias_dims[MAX_TENSOR_DIM], + int *bias_ndims) const; + void construct_mappings(std::vector &, + ParallelTensorShape const &) const; + + enum NamedDimensions { + INPUT_CHANNEL, + INPUT_SAMPLE, + INPUT_REPLICA, + OUTPUT_CHANNEL, + OUTPUT_SAMPLE, + OUTPUT_REPLICA, + KERNEL_CHANNEL_IN, + KERNEL_CHANNEL_OUT, + BIAS_CHANNEL_OUT + }; + + std::unordered_map + get_dimension_names(ParallelTensorShape const &input_name) const; + +private: + void mark_replica_dims(ParallelTensorShape const &input_shape, + ParallelDim output_dims[MAX_TENSOR_DIM], + ParallelDim kernel_dims[MAX_TENSOR_DIM], + ParallelDim bias_dims[MAX_TENSOR_DIM]) const; + void calculate_nonreplica_dim_sizes(ParallelTensorShape const &input_shape, + ParallelDim output_dims[MAX_TENSOR_DIM], + int *output_ndims, + ParallelDim kernel_dims[MAX_TENSOR_DIM], + int *kernel_ndims, + ParallelDim bias_dims[MAX_TENSOR_DIM], + int *bias_ndims) const; }; bool operator==(ExpertsParams const &, ExpertsParams const &); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index b5dbfae0b1..9c60a5e212 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -35,6 +35,9 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; +static constexpr int KERNEL_IDX = 0; +static constexpr int BIAS_IDX = 1; + // For now, we use one input and one output per expert Tensor FFModel::experts(Tensor const *inputs, int num_experts, @@ -63,31 +66,51 @@ Tensor FFModel::experts(Tensor const *inputs, assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); assert(experts_num_layers == 1 || experts_internal_dim_size > 0); - Tensor fused_experts = this->dense( - inputs[0], num_experts * experts_output_dim_size, AC_MODE_RELU); - fused_experts = this->softmax(fused_experts); - - Tensor const layer_inputs[3] = {fused_experts, inputs[1], inputs[2]}; + // parameters for the FFN implementing the experts. We can make these + // FFModel::experts(...) function parameters if needed. 
+ bool use_bias = false; + ActiMode activation = AC_MODE_RELU; Layer *e = new Layer(this, OP_EXPERTS, DT_FLOAT, name, 3 /*inputs*/, - 0 /*weights*/, - num_experts /*outputs*/, - layer_inputs); - + num_experts * (1 + use_bias) /*weights*/, + 1 /*outputs*/, + inputs); { int dims[MAX_TENSOR_DIM]; for (int i = 1; i < num_dims; i++) { dims[i] = inputs[0]->dims[i]; } dims[0] = experts_output_dim_size; - for (int i = 0; i < num_experts; i++) { - e->outputs[i] = create_tensor_legion_ordering( - num_dims, dims, DT_FLOAT, e, 0, true /*create_grad*/); - assert(e->outputs[i] != nullptr); + e->outputs[0] = create_tensor_legion_ordering( + num_dims, dims, DT_FLOAT, e, 0, true /*create_grad*/); + assert(e->outputs[0] != nullptr); + } + for (int i = 0; i < num_experts; i++) { + { + int dims[2] = {inputs[0]->dims[0], experts_output_dim_size}; + e->weights[i * (1 + use_bias)] = + create_weight_legion_ordering(2, + dims, + DT_FLOAT, + e, + true /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + } + if (use_bias) { + int dims[1] = {experts_output_dim_size}; + e->weights[i * (1 + use_bias) + use_bias] = + create_weight_legion_ordering(1, + dims, + DT_FLOAT, + e, + true /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); } } @@ -97,13 +120,11 @@ Tensor FFModel::experts(Tensor const *inputs, e->add_float_property("alpha", alpha); e->add_int_property("experts_num_layers", experts_num_layers); e->add_int_property("experts_internal_dim_size", experts_internal_dim_size); + e->add_int_property("use_bias", use_bias); + e->add_int_property("activation", activation); layers.push_back(e); - Tensor ret = e->outputs[0]; - for (int i = 1; i < num_experts; i++) { - this->add(ret, e->outputs[i], /*inplace_a*/ true); - } - return ret; + return e->outputs[0]; } Op *Experts::create_operator_from_layer( @@ -124,7 +145,12 @@ Op *Experts::create_operator_from_layer( int experts_num_layers = value; layer->get_int_property("experts_internal_dim_size", value); int experts_internal_dim_size = value; + layer->get_int_property("use_bias", value); + bool use_bias = (bool)value; + layer->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; return new Experts(model, + layer->layer_guid, inputs.data(), num_experts, experts_start_idx, @@ -132,17 +158,23 @@ Op *Experts::create_operator_from_layer( alpha, experts_num_layers, experts_internal_dim_size, + use_bias, + activation, + false /*allocate_weights*/, layer->name); } ExpertsParams Experts::get_params() const { ExpertsParams params; + params.layer_guid = this->layer_guid; params.num_experts = num_experts; params.experts_start_idx = experts_start_idx; params.experts_output_dim_size = experts_output_dim_size; params.alpha = alpha; params.experts_num_layers = experts_num_layers; params.experts_internal_dim_size = experts_internal_dim_size; + params.use_bias = use_bias; + params.activation = activation; return params; } @@ -182,10 +214,6 @@ bool ExpertsParams::is_valid( printf("Data type of the third input to the Experts layer is wrong!\n"); return false; } - if (inputs[0].dims[0].size != num_experts * experts_output_dim_size) { - printf("Dimension 0 of input tensor 1 to the Experts layer is wrong.\n"); - return false; - } if (inputs[1].dims[0] != inputs[2].dims[0]) { printf( "Dimension mismatch between indices and topk_gate_preds tensors passed " @@ -204,20 +232,23 @@ bool ExpertsParams::is_valid( } bool operator==(ExpertsParams const &lhs, ExpertsParams const &rhs) { - return lhs.num_experts == rhs.num_experts && + return lhs.layer_guid == rhs.layer_guid && + 
lhs.num_experts == rhs.num_experts && lhs.experts_start_idx == rhs.experts_start_idx && lhs.experts_output_dim_size == rhs.experts_output_dim_size && lhs.alpha == rhs.alpha && lhs.experts_num_layers == rhs.experts_num_layers && - lhs.experts_internal_dim_size == rhs.experts_internal_dim_size; + lhs.experts_internal_dim_size == rhs.experts_internal_dim_size && + lhs.use_bias == rhs.use_bias && lhs.activation == rhs.activation; } Experts::Experts(FFModel &model, ExpertsParams const ¶ms, - // std::pair const &inputs, std::vector const &inputs, + bool allocate_weights, char const *name) : Experts(model, + params.layer_guid, inputs.data(), params.num_experts, params.experts_start_idx, @@ -225,9 +256,13 @@ Experts::Experts(FFModel &model, params.alpha, params.experts_num_layers, params.experts_internal_dim_size, + params.use_bias, + params.activation, + allocate_weights, name) {} Experts::Experts(FFModel &model, + LayerID const &_layer_guid, ParallelTensor const *inputs, int _num_experts, int _experts_start_idx, @@ -235,31 +270,37 @@ Experts::Experts(FFModel &model, float _alpha, int _experts_num_layers, int _experts_internal_dim_size, + bool _use_bias, + ActiMode _activation, + bool allocate_weights, char const *name) : Op(model, OP_EXPERTS, DT_FLOAT, name, 3 /*inputs*/, - 0 /*weights*/, - _num_experts /*outputs*/, + _num_experts * (1 + _use_bias) /*weights*/, + 1 /*outputs*/, inputs), num_experts(_num_experts), experts_start_idx(_experts_start_idx), experts_output_dim_size(_experts_output_dim_size), alpha(_alpha), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size) { + experts_internal_dim_size(_experts_internal_dim_size), + use_bias(_use_bias), activation(_activation) { + + // overwrite layer_guid + layer_guid = _layer_guid; assert(num_experts > 0); assert(numInputs == 3); - assert(numOutputs == num_experts); + assert(numOutputs == 1); + assert(numWeights == num_experts * (1 + use_bias)); assert(inputs[0] != nullptr); int num_dims = inputs[0]->num_dims; assert(inputs[1]->num_dims == num_dims); assert(inputs[2]->num_dims == num_dims); - int out_dim = num_experts * experts_output_dim_size; - assert(inputs[0]->dims[0].size == out_dim); int topk = inputs[1]->dims[0].size; assert(inputs[2]->dims[0].size == topk); @@ -278,28 +319,74 @@ Experts::Experts(FFModel &model, assert(inputs[1]->dims[0].degree == 1); assert(inputs[2]->dims[0].degree == 1); - ParallelDim dims[MAX_TENSOR_DIM]; + ParallelDim out_dims[MAX_TENSOR_DIM]; for (int i = 0; i < num_dims; i++) { - dims[i] = inputs[0]->dims[i]; + out_dims[i] = inputs[0]->dims[i]; } - dims[0].size = experts_output_dim_size; - // numOutputs = num_experts; - // numWeights = 0; - for (int i = 0; i < num_experts; i++) { - outputs[i] = model.create_parallel_tensor_legion_ordering( - num_dims, dims, inputs[0]->data_type, this, i /*owner_idx*/); - assert(outputs[i] != nullptr); + out_dims[0].size = experts_output_dim_size; + outputs[0] = model.create_parallel_tensor_legion_ordering( + num_dims, out_dims, inputs[0]->data_type, this, 0 /*owner_idx*/); + assert(outputs[0] != nullptr); + + // auto dimension_names = + // this->get_params().get_dimension_names(inputs[0]->get_shape()); + ParallelTensorShape input_shape = inputs[0]->get_shape(); + ParallelTensorShape output_shape, kernel_shape, bias_shape; + ExpertsParams params = this->get_params(); + params.construct_mappings(*this->parallel_dims_mapping, input_shape); + params.solve_dims(input_shape, output_shape, kernel_shape, bias_shape); + + if (allocate_weights) { 
+#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + for (int i = 0; i < num_experts; i++) { + Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); + { + // ParallelDim dims[2] = {inputs[0]->dims[0], out_dims[0]}; + weights[i * (1 + use_bias)] = + model.create_parallel_weight_legion_ordering( + kernel_shape.num_dims, // 2, + kernel_shape.dims, // dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + comm_type); + assert(weights[i * (1 + use_bias)] != nullptr); + } + if (use_bias) { + Initializer *bias_initializer = new ZeroInitializer(); + ParallelDim dims[1] = {out_dims[0]}; + weights[i * (1 + use_bias) + use_bias] = + model.create_parallel_weight_legion_ordering( + bias_shape.num_dims, // 1, + bias_shape.dims, // dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + bias_initializer, + comm_type); + assert(weights[i * (1 + use_bias) + use_bias] != nullptr); + } + } } + assert(check_output_input_weight_parallel_dims(allocate_weights)); } void Experts::serialize(Legion::Serializer &sez) const { ExpertsParams params = get_params(); + sez.serialize(params.layer_guid.id); sez.serialize(params.num_experts); sez.serialize(params.experts_start_idx); sez.serialize(params.experts_output_dim_size); sez.serialize(params.alpha); sez.serialize(params.experts_num_layers); sez.serialize(params.experts_internal_dim_size); + sez.serialize(params.use_bias); + sez.serialize(params.activation); } using PCG::Node; @@ -310,22 +397,32 @@ Node Experts::deserialize(FFModel &ff, int num_experts, experts_start_idx, experts_output_dim_size, experts_num_layers, experts_internal_dim_size; float alpha; + ActiMode activation; + bool use_bias; + size_t id; + dez.deserialize(id); + LayerID layer_guid(id); dez.deserialize(num_experts); dez.deserialize(experts_start_idx); dez.deserialize(experts_output_dim_size); dez.deserialize(alpha); dez.deserialize(experts_num_layers); dez.deserialize(experts_internal_dim_size); + dez.deserialize(use_bias); + dez.deserialize(activation); assert(num_inputs == 3); ExpertsParams params; + params.layer_guid = layer_guid; params.num_experts = num_experts; params.experts_start_idx = experts_start_idx; params.experts_output_dim_size = experts_output_dim_size; params.alpha = alpha; params.experts_num_layers = experts_num_layers; params.experts_internal_dim_size = experts_internal_dim_size; + params.use_bias = use_bias; + params.activation = activation; return ff.get_or_create_node(inputs, params); } @@ -368,14 +465,29 @@ void Experts::init_inference(FFModel const &ff, EXCLUSIVE, batch_inputs[2]->region)); launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(3, FID_DATA); for (int i = 0; i < num_experts; i++) { launcher.add_region_requirement( - RegionRequirement(batch_outputs[i]->part, + RegionRequirement(weights[i * (1 + use_bias)]->part, 0 /*projection id*/, - WRITE_ONLY, + READ_ONLY, EXCLUSIVE, - batch_outputs[i]->region)); - launcher.add_field(i + 3, FID_DATA); + weights[i * (1 + use_bias)]->region)); + launcher.add_field(4 + i * (1 + use_bias), FID_DATA); + if (use_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[i * (1 + use_bias) + use_bias]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i * (1 + use_bias) + use_bias]->region)); + 
launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); + } } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); @@ -418,13 +530,29 @@ void Experts::init(FFModel const &ff) { EXCLUSIVE, inputs[2]->region)); launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(3, FID_DATA); for (int i = 0; i < num_experts; i++) { - launcher.add_region_requirement(RegionRequirement(outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[i]->region)); - launcher.add_field(i + 3, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[i * (1 + use_bias)]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i * (1 + use_bias)]->region)); + launcher.add_field(4 + i * (1 + use_bias), FID_DATA); + if (use_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[i * (1 + use_bias) + use_bias]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i * (1 + use_bias) + use_bias]->region)); + launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); + } } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); @@ -437,8 +565,12 @@ OpMeta *Experts::init_task(Task const *task, Runtime *runtime) { Experts const *exp = (Experts *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta( - handle, exp->num_experts, exp->experts_start_idx, exp->alpha); + ExpertsMeta *m = new ExpertsMeta(handle, + exp->num_experts, + exp->experts_start_idx, + exp->alpha, + exp->use_bias, + exp->activation); m->profiling = exp->profiling; return m; } @@ -478,15 +610,31 @@ void Experts::forward(FFModel const &ff) { EXCLUSIVE, inputs[2]->region)); launcher.add_field(2, FID_DATA); + // expert output per token (only the chosen experts have non-zero + // contributions) + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(3, FID_DATA); for (int i = 0; i < num_experts; i++) { - // expert output per token (only the chosen experts have non-zero - // contributions) - launcher.add_region_requirement(RegionRequirement(outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[i]->region)); - launcher.add_field(i + 3, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[i * (1 + use_bias)]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i * (1 + use_bias)]->region)); + launcher.add_field(4 + i * (1 + use_bias), FID_DATA); + if (use_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[i * (1 + use_bias) + use_bias]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i * (1 + use_bias) + use_bias]->region)); + launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); + } } runtime->execute_index_space(ctx, launcher); } @@ -530,16 +678,31 @@ void Experts::inference(FFModel const &ff, EXCLUSIVE, batch_inputs[2]->region)); launcher.add_field(2, FID_DATA); + // expert output per token (only the chosen experts have non-zero + // contributions) + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(3, FID_DATA); for (int i = 0; i < num_experts; i++) { - // expert output per token (only the chosen 
experts have non-zero - // contributions) launcher.add_region_requirement( - RegionRequirement(batch_outputs[i]->part, + RegionRequirement(weights[i * (1 + use_bias)]->part, 0 /*projection id*/, - WRITE_ONLY, + READ_ONLY, EXCLUSIVE, - batch_outputs[i]->region)); - launcher.add_field(i + 3, FID_DATA); + weights[i * (1 + use_bias)]->region)); + launcher.add_field(4 + i * (1 + use_bias), FID_DATA); + if (use_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[i * (1 + use_bias) + use_bias]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i * (1 + use_bias) + use_bias]->region)); + launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); + } } runtime->execute_index_space(ctx, launcher); } @@ -549,17 +712,24 @@ void Experts::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(regions.size() == task->regions.size()); - int num_experts = regions.size() - 3; ExpertsMeta const *m = *((ExpertsMeta **)task->local_args); - // get input, indices, topk_gate_preds + int num_experts = m->num_experts; + bool use_bias = m->use_bias; + assert(regions.size() - 4 == num_experts * (1 + use_bias)); + + // get input, indices, topk_gate_preds, outputs float const *input_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); int const *indices_ptr = helperGetTensorPointerRO( regions[1], task->regions[1], FID_DATA, ctx, runtime); float const *topk_gate_pred_ptr = helperGetTensorPointerRO( regions[2], task->regions[2], FID_DATA, ctx, runtime); + float *output_ptr = helperGetTensorPointerWO( + regions[3], task->regions[3], FID_DATA, ctx, runtime); + assert(input_ptr != nullptr && indices_ptr != nullptr && + topk_gate_pred_ptr != nullptr && output_ptr != nullptr); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -567,21 +737,25 @@ void Experts::inference_task(Task const *task, ctx, task->regions[1].region.get_index_space()); Domain topk_gate_pred_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); int input_dims = input_domain.get_dim(); int indices_dims = indices_domain.get_dim(); int topk_gate_pred_dims = topk_gate_pred_domain.get_dim(); + int output_dims = output_domain.get_dim(); assert(input_dims == indices_dims); assert(indices_dims == topk_gate_pred_dims); + assert(input_dims == output_dims); int replica_dim = input_dims - 1; int samples_index = input_dims - 2; - coord_t out_dim = - (input_domain.hi()[0] - input_domain.lo()[0] + 1) / num_experts; + coord_t data_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; coord_t batch_size = input_domain.hi()[samples_index] - input_domain.lo()[samples_index] + 1; coord_t chosen_experts = indices_domain.hi()[0] - indices_domain.lo()[0]; + coord_t out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; assert(chosen_experts == topk_gate_pred_domain.hi()[0] - topk_gate_pred_domain.lo()[0]); @@ -601,26 +775,51 @@ void Experts::inference_task(Task const *task, num_experts <= MAX_EXPERTS_PER_BLOCK && "number of experts exceeds MAX_EXPERTS_PER_BLOCK defined in experts.h"); - float *outputs[num_experts]; + for (int j = 1; j < input_dims; j++) { + int a = input_domain.hi()[j] - input_domain.lo()[j] + 1; + int b = output_domain.hi()[j] - output_domain.lo()[j] + 1; + assert(a == b); + } + + // get weights + float const *weights_ptrs[num_experts * (1 + use_bias)]; for (int i = 
0; i < num_experts; i++) { - Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[3 + i].region.get_index_space()); - assert((output_domain.hi()[0] - output_domain.lo()[0] + 1) == out_dim); - for (int j = 1; j < input_dims; j++) { - int a = input_domain.hi()[j] - input_domain.lo()[j] + 1; - int b = output_domain.hi()[j] - output_domain.lo()[j] + 1; - assert(a == b); + weights_ptrs[i * (1 + use_bias)] = + helperGetTensorPointerRO(regions[4 + i * (1 + use_bias)], + task->regions[4 + i * (1 + use_bias)], + FID_DATA, + ctx, + runtime); + Domain weights_domain = runtime->get_index_space_domain( + ctx, task->regions[4 + i * (1 + use_bias)].region.get_index_space()); + int weights_dims = weights_domain.get_dim(); + assert(weights_dims == input_dims); + assert(weights_domain.hi()[0] - weights_domain.lo()[0] + 1 == data_dim); + assert(weights_domain.hi()[1] - weights_domain.lo()[1] + 1 == out_dim); + if (use_bias) { + weights_ptrs[i * (1 + use_bias) + use_bias] = + helperGetTensorPointerRO( + regions[4 + i * (1 + use_bias) + use_bias], + task->regions[4 + i * (1 + use_bias) + use_bias], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, + task->regions[4 + i * (1 + use_bias) + use_bias] + .region.get_index_space()); + int bias_dims = bias_domain.get_dim(); + assert(bias_dims == 1); + assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == out_dim); } - outputs[i] = helperGetTensorPointerWO( - regions[3 + i], task->regions[3 + i], FID_DATA, ctx, runtime); - assert(outputs[i] != nullptr); } Experts::forward_kernel_wrapper(m, input_ptr, indices_ptr, topk_gate_pred_ptr, - outputs, + output_ptr, + weights_ptrs, chosen_experts, batch_size, out_dim); @@ -656,18 +855,190 @@ bool Experts::measure_operator_cost(Simulator *sim, return false; } +void ExpertsParams::solve_dims(const ParallelTensor input, + ParallelDim output_dims[MAX_TENSOR_DIM], + int *output_ndims, + ParallelDim kernel_dims[MAX_TENSOR_DIM], + int *kernel_ndims, + ParallelDim bias_dims[MAX_TENSOR_DIM], + int *bias_ndims) const { + this->solve_dims(input->get_shape(), + output_dims, + output_ndims, + kernel_dims, + kernel_ndims, + bias_dims, + bias_ndims); +} + +void ExpertsParams::solve_dims(ParallelTensorShape const &input_shape, + ParallelTensorShape &output_shape, + ParallelTensorShape &kernel_shape, + ParallelTensorShape &bias_shape) const { + this->solve_dims(input_shape, + output_shape.dims, + &output_shape.num_dims, + kernel_shape.dims, + &kernel_shape.num_dims, + bias_shape.dims, + &bias_shape.num_dims); +} + +void ExpertsParams::solve_dims(ParallelTensorShape const &input_shape, + ParallelDim output_dims[MAX_TENSOR_DIM], + int *output_ndims, + ParallelDim kernel_dims[MAX_TENSOR_DIM], + int *kernel_ndims, + ParallelDim bias_dims[MAX_TENSOR_DIM], + int *bias_ndims) const { + assert((output_dims == nullptr) == (output_ndims == nullptr)); + assert((kernel_dims == nullptr) == (kernel_ndims == nullptr)); + assert((bias_dims == nullptr) == (bias_ndims == nullptr)); + + std::vector mapping; + this->construct_mappings(mapping, input_shape); + this->mark_replica_dims(input_shape, output_dims, kernel_dims, bias_dims); + + solve_parallel_dim_mappings( + mapping, {input_shape.dims}, {kernel_dims, bias_dims}, {output_dims}); + + this->calculate_nonreplica_dim_sizes(input_shape, + output_dims, + output_ndims, + kernel_dims, + kernel_ndims, + bias_dims, + bias_ndims); +} + +std::unordered_map + ExpertsParams::get_dimension_names( + ParallelTensorShape const &input_shape) const { + int 
num_dims = input_shape.num_dims; + + return {{INPUT_CHANNEL, 0}, + {INPUT_SAMPLE, num_dims - 2}, + {INPUT_REPLICA, num_dims - 1}, + {OUTPUT_CHANNEL, 0}, + {OUTPUT_SAMPLE, num_dims - 2}, + {OUTPUT_REPLICA, num_dims - 1}, + {KERNEL_CHANNEL_IN, 0}, + {KERNEL_CHANNEL_OUT, 1}, + {BIAS_CHANNEL_OUT, 0}}; +} + +void ExpertsParams::calculate_nonreplica_dim_sizes( + ParallelTensorShape const &input_shape, + ParallelDim output_dims[MAX_TENSOR_DIM], + int *output_ndims, + ParallelDim kernel_dims[MAX_TENSOR_DIM], + int *kernel_ndims, + ParallelDim bias_dims[MAX_TENSOR_DIM], + int *bias_ndims) const { + auto dimension_names = this->get_dimension_names(input_shape); + int num_dims = input_shape.num_dims; + + if (output_dims != nullptr) { + for (int i = 1; i < input_shape.num_dims - 1; i++) { + output_dims[i].size = input_shape.dims[i].size; + } + output_dims[dimension_names.at(OUTPUT_CHANNEL)].size = + experts_output_dim_size; + *output_ndims = num_dims; + } + if (kernel_dims != nullptr) { + kernel_dims[dimension_names.at(KERNEL_CHANNEL_IN)].size = + input_shape.dims[INPUT_CHANNEL].size / + input_shape.dims[INPUT_CHANNEL].degree; + kernel_dims[dimension_names.at(KERNEL_CHANNEL_OUT)].size = + experts_output_dim_size; + *kernel_ndims = num_dims; + } + if (bias_dims != nullptr) { + bias_dims[dimension_names.at(BIAS_CHANNEL_OUT)].size = + experts_output_dim_size; + *bias_ndims = num_dims; + } +} + +void ExpertsParams::mark_replica_dims( + ParallelTensorShape const &input_shape, + ParallelDim output_dims[MAX_TENSOR_DIM], + ParallelDim kernel_dims[MAX_TENSOR_DIM], + ParallelDim bias_dims[MAX_TENSOR_DIM]) const { + int num_dims = input_shape.num_dims; + auto dimension_names = this->get_dimension_names(input_shape); + if (output_dims != nullptr) { + output_dims[dimension_names.at(OUTPUT_REPLICA)].is_replica_dim = true; + } + if (kernel_dims != nullptr) { + for (int i = 2; i < num_dims; i++) { + kernel_dims[i].is_replica_dim = true; + } + } + if (bias_dims != nullptr) { + for (int i = 1; i < num_dims; i++) { + bias_dims[i].is_replica_dim = true; + } + } +} + +void ExpertsParams::construct_mappings( + std::vector &mappings, + ParallelTensorShape const &input_shape) const { + std::unordered_map dimension_names = + this->get_dimension_names(input_shape); + + Op::construct_output_parallel_dims( + mappings, + {{dimension_names.at(INPUT_CHANNEL), dimension_names.at(OUTPUT_REPLICA)}, + {dimension_names.at(INPUT_REPLICA), + dimension_names.at(OUTPUT_CHANNEL)}}); + for (int i = 1; i < input_shape.num_dims - 1; i++) { + Op::construct_output_parallel_dims(mappings, i, i); + } + + Op::construct_weight_parallel_dims(mappings, + {{dimension_names.at(INPUT_CHANNEL), + dimension_names.at(KERNEL_CHANNEL_IN)}, + {dimension_names.at(INPUT_REPLICA), + dimension_names.at(KERNEL_CHANNEL_OUT)}}, + 0 /*input_idx*/, + KERNEL_IDX); + // map a bunch of replica dimensions for the unnamed dimensions in the input + for (int i = 1; i < input_shape.num_dims - 1; i++) { + Op::construct_weight_parallel_dims( + mappings, i, i + 1, 0 /*input_idx*/, KERNEL_IDX); + } + + Op::construct_weight_parallel_dims(mappings, + { + {dimension_names.at(INPUT_REPLICA), + dimension_names.at(BIAS_CHANNEL_OUT)}, + }, + 0 /*input_idx*/, + BIAS_IDX); + for (int i = 0; i < input_shape.num_dims - 1; i++) { + Op::construct_weight_parallel_dims( + mappings, i, i + 1, 0 /*input_idx*/, BIAS_IDX); + } +} + }; // namespace FlexFlow namespace std { size_t hash::operator()( FlexFlow::ExpertsParams const ¶ms) const { size_t key = 0; + hash_combine(key, params.layer_guid.id); 
hash_combine(key, params.num_experts); hash_combine(key, params.experts_start_idx); hash_combine(key, params.experts_output_dim_size); hash_combine(key, params.alpha); hash_combine(key, params.experts_num_layers); hash_combine(key, params.experts_internal_dim_size); + hash_combine(key, params.use_bias); + hash_combine(key, params.activation); return key; } }; // namespace std diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index 92fb1e6e13..081f814400 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -19,77 +19,13 @@ namespace FlexFlow { -__global__ void experts_forward_kernel(float const *input, - int const *indices, - float const *topk_gate_preds, - float **outputs, - int num_experts, - int experts_start_idx, - int chosen_experts, - int expert_capacity, - int batch_size, - int out_dim) { - // shared at the block level - __shared__ float token_assigned[MAX_BATCH_SIZE][MAX_EXPERTS_PER_BLOCK]; - - // initialize the token assignments to 0 - CUDA_KERNEL_LOOP(i, MAX_BATCH_SIZE * MAX_EXPERTS_PER_BLOCK) { - int token_index = i / MAX_EXPERTS_PER_BLOCK; - int expert_index = i % MAX_EXPERTS_PER_BLOCK; - token_assigned[token_index][expert_index] = 0.0f; - } - - __syncthreads(); - - // Compute token assignments, single thread per block - if (threadIdx.x == 0) { - int token_count[MAX_EXPERTS_PER_BLOCK] = {0}; - for (int i = 0; i < chosen_experts * batch_size; i++) { - // Get the token index, between 0 and batch_size - int token_index = i / chosen_experts; - // Get global index (indices[i]) of expert to which the token is assigned, - // and compute the local index (expert_index) of the expert within the - // block of fused experts - int expert_index = indices[i] - experts_start_idx; - // check if the token is assigned to an expert in this block, and if so, - // whether the expert still has capacity not that since each expert is - // assigned to only one block, it is safe to reason about expert capacity - // locally - if (expert_index >= 0 && expert_index < num_experts && - token_count[expert_index] < expert_capacity) { - token_assigned[token_index][expert_index] = topk_gate_preds[i]; - token_count[expert_index]++; - } else { - } - } - } - - __syncthreads(); - - // compute output - CUDA_KERNEL_LOOP(i, num_experts * batch_size * out_dim) { - // output indexing: - // i = expert_index*(batch_size*out_dim) + token_index*out_dim + dim_index - // input indexing: - // i = token_index * (num_experts * out_dim) + expert_index * out_dim + - // dim_index - int expert_index = i / (batch_size * out_dim); - // int token_index = (i - expert_index*(batch_size*out_dim)) / out_dim; - int token_index = (i % (batch_size * out_dim)) / out_dim; - // int dim_index = i - expert_index*(batch_size*out_dim) - - // token_index*out_dim; - int dim_index = i % out_dim; - outputs[expert_index][token_index * out_dim + dim_index] = - input[i] * token_assigned[token_index][expert_index]; - } -} - /*static*/ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float const *input, int const *indices, float const *topk_gate_preds, - float **outputs, + float *output, + float const **weights, int chosen_experts, int batch_size, int out_dim) { @@ -99,55 +35,33 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int expert_capacity = ceil(m->alpha * chosen_experts / m->num_experts * batch_size); - // cudaEvent_t t_start, t_end; - // if (m->profiling) { - // cudaEventCreate(&t_start); - // cudaEventCreate(&t_end); - // cudaEventRecord(t_start, stream); - // } - hipMemcpy(m->dev_region_ptrs, - outputs, - 
m->num_experts * sizeof(float *), - hipMemcpyHostToDevice); + int num_experts = m->num_experts; + // int expert_start_index = experts_start_idx; + bool use_bias = m->use_bias; + // ActiMode activation = m->activation; - hipLaunchKernelGGL( - experts_forward_kernel, - GET_BLOCKS(batch_size * m->num_experts * out_dim), - min(CUDA_NUM_THREADS, (int)(batch_size * m->num_experts * out_dim)), - 0, - stream, - input, - indices, - topk_gate_preds, - m->dev_region_ptrs, - m->num_experts, - m->experts_start_idx, - chosen_experts, - expert_capacity, - batch_size, - out_dim); + hipMemcpy(m->dev_weights, + weights, + num_experts * (1 + use_bias) * sizeof(float *), + hipMemcpyHostToDevice); - // if (m->profiling) { - // cudaEventRecord(t_end, stream); - // checkCUDA(cudaEventSynchronize(t_end)); - // float elapsed = 0; - // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - // cudaEventDestroy(t_start); - // cudaEventDestroy(t_end); - // printf("[Experts] forward time = %.2lfms\n", elapsed); - // } + // TODO: write the HIP version of the kernel after finishing the CUDA kernel } ExpertsMeta::ExpertsMeta(FFHandler handler, int _num_experts, int _experts_start_idx, - float _alpha) + float _alpha, + bool _use_bias, + ActiMode _activation) : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), alpha(_alpha) { - checkCUDA(hipMalloc(&dev_region_ptrs, num_experts * sizeof(float *))); + experts_start_idx(_experts_start_idx), alpha(_alpha), use_bias(_use_bias), + activation(_activation) { + checkCUDA( + hipMalloc(&dev_weights, num_experts * (1 + use_bias) * sizeof(float *))); } ExpertsMeta::~ExpertsMeta(void) { - checkCUDA(hipFree(&dev_region_ptrs)); + checkCUDA(hipFree(&dev_weights)); } }; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 769b96e5ae..342de3ef65 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -18,88 +18,27 @@ namespace FlexFlow { -__global__ void experts_forward_kernel(float const *input, - int const *indices, - float const *topk_gate_preds, - float **outputs, - int num_experts, - int experts_start_idx, - int chosen_experts, - int expert_capacity, - int batch_size, - int out_dim) { - // shared at the block level - __shared__ float token_assigned[MAX_BATCH_SIZE][MAX_EXPERTS_PER_BLOCK]; - - // initialize the token assignments to 0 - CUDA_KERNEL_LOOP(i, MAX_BATCH_SIZE * MAX_EXPERTS_PER_BLOCK) { - int token_index = i / MAX_EXPERTS_PER_BLOCK; - int expert_index = i % MAX_EXPERTS_PER_BLOCK; - token_assigned[token_index][expert_index] = 0.0f; - } - - __syncthreads(); - - // Compute token assignments, single thread per block - if (threadIdx.x == 0) { - int token_count[MAX_EXPERTS_PER_BLOCK] = {0}; - for (int i = 0; i < chosen_experts * batch_size; i++) { - // Get the token index, between 0 and batch_size - int token_index = i / chosen_experts; - // Get global index (indices[i]) of expert to which the token is assigned, - // and compute the local index (expert_index) of the expert within the - // block of fused experts - int expert_index = indices[i] - experts_start_idx; - // check if the token is assigned to an expert in this block, and if so, - // whether the expert still has capacity not that since each expert is - // assigned to only one block, it is safe to reason about expert capacity - // locally - if (expert_index >= 0 && expert_index < num_experts && - token_count[expert_index] < expert_capacity) { - token_assigned[token_index][expert_index] = topk_gate_preds[i]; - token_count[expert_index]++; - } else { - } - } 
- } - - __syncthreads(); - - // compute output - CUDA_KERNEL_LOOP(i, num_experts * batch_size * out_dim) { - // output indexing: - // i = expert_index*(batch_size*out_dim) + token_index*out_dim + dim_index - // input indexing: - // i = token_index * (num_experts * out_dim) + expert_index * out_dim + - // dim_index - int expert_index = i / (batch_size * out_dim); - // int token_index = (i - expert_index*(batch_size*out_dim)) / out_dim; - int token_index = (i % (batch_size * out_dim)) / out_dim; - // int dim_index = i - expert_index*(batch_size*out_dim) - - // token_index*out_dim; - int dim_index = i % out_dim; - outputs[expert_index][token_index * out_dim + dim_index] = - input[i] * token_assigned[token_index][expert_index]; - } -} - /*static*/ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float const *input, int const *indices, float const *topk_gate_preds, - float **outputs, + float *output, + float const **weights, int chosen_experts, int batch_size, int out_dim) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - // checkCUDA(cublasSetStream(m->handle.blas, stream)); - // checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); int expert_capacity = ceil(m->alpha * chosen_experts / m->num_experts * batch_size); + int num_experts = m->num_experts; + // int expert_start_index = experts_start_idx; + bool use_bias = m->use_bias; + // ActiMode activation = m->activation; + cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -107,27 +46,23 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } - // call forward_kernel - cudaMemcpyAsync(m->dev_region_ptrs, - outputs, - m->num_experts * sizeof(float *), - cudaMemcpyHostToDevice, - stream); - - experts_forward_kernel<<num_experts * out_dim), - min(CUDA_NUM_THREADS, - (int)(batch_size * m->num_experts * out_dim)), - 0, - stream>>>(input, - indices, - topk_gate_preds, - m->dev_region_ptrs, - m->num_experts, - m->experts_start_idx, - chosen_experts, - expert_capacity, - batch_size, - out_dim); + cudaMemcpy(m->dev_weights, + weights, + num_experts * (1 + use_bias) * sizeof(float *), + cudaMemcpyHostToDevice); + + /** TODO: launch one or more kernel(s) to do the following: + * 1. sort the tokens by expert to which they are assigned. This will require + * replicating tokens when chosen_experts > 1 + * 2. matrix multiply (you can use cublasGemmEx) each slice of tokens with the + * corresponding expert's weights tensor. Add the bias. + * - you can obtain the slice by selecting the tokens between the index + * where the expert i starts and min(i+expert_capacity, index where expert i+1 + * starts) + * 3. reorder the outputs by token, and aggregate the outputs of multiple + * experts for the same token by computing an average weighted by the + * appropriate coefficient from the topk_gate_preds matrix. 
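 *
 * As an illustration only of step 2 (a sketch, not code from this patch):
 * assuming the tokens have been gathered into a placeholder device buffer
 * sorted_input (data_dim x tokens, column major), with placeholder arrays
 * expert_offset / num_assigned giving each expert's start offset and token
 * count (already clamped to expert_capacity), a placeholder expert_output
 * buffer, and data_dim standing in for the input feature size, each expert e
 * could then be handled with one cublasGemmEx call:
 *
 *   int n = num_assigned[e];            // tokens routed to expert e
 *   float one = 1.0f, zero = 0.0f;
 *   checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N,
 *                          out_dim, n, data_dim, &one,
 *                          weights[e * (1 + use_bias)], CUDA_R_32F, data_dim,
 *                          sorted_input + expert_offset[e] * data_dim,
 *                          CUDA_R_32F, data_dim, &zero,
 *                          expert_output + expert_offset[e] * out_dim,
 *                          CUDA_R_32F, out_dim,
 *                          CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
 *
 * The bias weights[e * (1 + use_bias) + 1] (when use_bias) and the activation
 * would then be applied to that slice before the weighted re-aggregation of
 * step 3.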
+ */ if (m->profiling) { cudaEventRecord(t_end, stream); @@ -143,13 +78,17 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, ExpertsMeta::ExpertsMeta(FFHandler handler, int _num_experts, int _experts_start_idx, - float _alpha) + float _alpha, + bool _use_bias, + ActiMode _activation) : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), alpha(_alpha) { - checkCUDA(cudaMalloc(&dev_region_ptrs, num_experts * sizeof(float *))); + experts_start_idx(_experts_start_idx), alpha(_alpha), use_bias(_use_bias), + activation(_activation) { + checkCUDA( + cudaMalloc(&dev_weights, num_experts * (1 + use_bias) * sizeof(float *))); } ExpertsMeta::~ExpertsMeta(void) { - checkCUDA(cudaFree(&dev_region_ptrs)); + checkCUDA(cudaFree(&dev_weights)); } }; // namespace FlexFlow diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 699b98cd5a..66accf7195 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1756,7 +1756,7 @@ GraphOptimalViewSerialized } assert(node_idx == best_graph->inEdges.size()); // Second, serialize optimal machine view - printf("opotimal_views.size = %zu\n", optimal_views.size()); + printf("optimal_views.size = %zu\n", optimal_views.size()); sez.serialize(optimal_views.size()); for (auto const &it : optimal_views) { sez.serialize((size_t)98765432); // safe guard diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 6419a2d61f..dc746dd7f4 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1682,6 +1682,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, switch (parallel_op->op_type) { case OP_LINEAR: case OP_EMBEDDING: + case OP_EXPERTS: case OP_MULTIHEAD_ATTENTION: { switch (tdim) { #define DIMFUNC(TDIM) \ diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index d528fd6345..884472d204 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -3169,7 +3169,7 @@ bool FFModel::convert_graph_to_operators( Experts *exp = (Experts *)node.ptr; ExpertsParams params = exp->get_params(); new_op = new Experts( - *this, params, {std::begin(inputs), std::end(inputs)}, NULL); + *this, params, {std::begin(inputs), std::end(inputs)}, true); break; } case OP_SPLIT: { From 4f8f4a9df01f4a038ad5207c11a0576ad13e4b9c Mon Sep 17 00:00:00 2001 From: Rae Wong <33883582+yingyee0111@users.noreply.github.com> Date: Mon, 13 Feb 2023 01:47:13 -0500 Subject: [PATCH 059/344] Explicit operator placement on devices (#619) * first step * linting * Revert "linting" This reverts commit 45c56c2a67d3ced1fd82c286e69fb7fe719809ef. * Revert "first step" This reverts commit badb8793bd47913614d340ba5e6b5f0fc36adaff. 
* use only one device per operator * linting * round robin allocation * debugging * fix device placement for batches in same operator * fixed tracing issue, initializing on right device * fixed CUDNN_STATUS_MAPPING_ERROR for init * fixed remaining bugs --------- Co-authored-by: Gabriele Oliaro Co-authored-by: Rae Wong --- CMakeLists.txt | 6 +- .../cpp/inference/mixture_of_experts/moe.cc | 16 ++- include/flexflow/inference.h | 2 + include/flexflow/operator.h | 13 ++- include/flexflow/ops/aggregate.h | 3 +- include/flexflow/ops/aggregate_spec.h | 3 +- include/flexflow/ops/attention.h | 3 +- include/flexflow/ops/element_binary.h | 3 +- include/flexflow/ops/experts.h | 3 +- include/flexflow/ops/groupby.h | 3 +- include/flexflow/ops/layer_norm.h | 3 +- include/flexflow/ops/linear.h | 3 +- include/flexflow/ops/noop.h | 3 +- include/flexflow/ops/softmax.h | 3 +- include/flexflow/ops/topk.h | 3 +- include/flexflow/parallel_ops/partition.h | 3 +- src/ops/aggregate.cc | 25 +++-- src/ops/aggregate_spec.cc | 20 ++-- src/ops/attention.cc | 21 ++-- src/ops/element_binary.cc | 20 ++-- src/ops/experts.cc | 22 ++-- src/ops/group_by.cc | 18 +-- src/ops/layer_norm.cc | 25 +++-- src/ops/linear.cc | 20 ++-- src/ops/noop.cc | 15 ++- src/ops/softmax.cc | 20 ++-- src/ops/topk.cc | 20 ++-- src/parallel_ops/partition.cc | 9 +- src/runtime/graph.cc | 12 ++ src/runtime/inference_manager.cc | 105 ++++++++++++------ src/runtime/model.cc | 93 +++++++++++++++- 31 files changed, 378 insertions(+), 140 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b6a8fcec4e..8c27008b58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -378,9 +378,9 @@ if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/MLP_Unify) endif() -if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES) - add_subdirectory(examples/cpp/inference/MLP_Unify) -endif() +# if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES) +# add_subdirectory(examples/cpp/inference/MLP_Unify) +# endif() if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 918f04d0b7..8dc0842e98 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -160,19 +160,25 @@ void FlexFlow::top_level_task(Task const *task, int index = 0; int processed_requests = 0; + int num_devices = ffConfig.workersPerNode * ffConfig.numNodes; Generator data_generator( total_requests, request_tensor_size, poisson_distribution, lambda); + // data_loader.reset(); while (processed_requests < total_requests) { vector> req = data_generator.get_requests(); - int iterations = req.size(); + int nreqs = req.size(); + int iterations = (nreqs % num_requests_per_batch == 0) + ? 
(nreqs / num_requests_per_batch) + : (nreqs / num_requests_per_batch) + 1; for (int iter = 0; iter < iterations; iter++) { // data_loader.next_batch(ff); - runtime->begin_trace(ctx, 111 /*trace_id*/); - im.inference((index++) % num_inflight_batches); - runtime->end_trace(ctx, 111 /*trace_id*/); + runtime->begin_trace(ctx, 111 + index % num_devices /*trace_id*/); + im.inference(index); + runtime->end_trace(ctx, 111 + index % num_devices /*trace_id*/); + index++; } - processed_requests += iterations; + processed_requests += nreqs; } /////////////////////////////////////////////////////////////////////////////////// diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index bb2a70e8a8..dacf6b3f28 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -35,6 +35,8 @@ class InferenceManager { FFModel *model; int max_num_requests_per_batch; int max_num_inflight_batches; + int num_devices; + std::vector machine_views; }; } // namespace FlexFlow diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 280df29f83..a76ad9a018 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -187,7 +187,8 @@ class Op { virtual void init(FFModel const &) = 0; virtual void init_inference(FFModel const &, std::vector const &, - std::vector const &) { + std::vector const &, + MachineView const *mv = nullptr) { assert(false); }; virtual void forward(FFModel const &) = 0; @@ -254,12 +255,21 @@ class Op { #endif protected: void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); + void set_argumentmap_for_init_inference(FFModel const &ff, + Legion::ArgumentMap &argmap, + MachineView const *view); void set_argumentmap_for_forward(FFModel const &ff, Legion::ArgumentMap &argmap); + void set_argumentmap_for_inference(FFModel const &ff, + Legion::ArgumentMap &argmap, + MachineView const *view); void set_argumentmap_for_backward(FFModel const &ff, Legion::ArgumentMap &argmap); void set_opmeta_from_futuremap(FFModel const &ff, Legion::FutureMap const &fm); + void set_opmeta_from_futuremap_inference(FFModel const &ff, + Legion::FutureMap const &fm, + MachineView const *view); void solve_parallel_dim_mappings( std::vector const &inputs, std::vector const &weights, @@ -279,6 +289,7 @@ class Op { ParallelParameter weights[MAX_NUM_WEIGHTS]; bool trainableInputs[MAX_NUM_INPUTS]; OpMeta *meta[MAX_NUM_WORKERS]; + std::map inference_meta; int numInputs, numWeights, numOutputs; bool profiling; #ifdef FF_USE_NCCL diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 4d6aaeccb9..098e10d8e8 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -37,7 +37,8 @@ class Aggregate : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void inference(FFModel const &, std::vector const &, diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 2792ce58a4..a80606d761 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -29,7 +29,8 @@ class AggregateSpec : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; 
void inference(FFModel const &, std::vector const &, diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index 684e29e910..baf4c06d48 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -66,7 +66,8 @@ class MultiHeadAttention : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 1116519e8c..6e7edce223 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -27,7 +27,8 @@ class ElementBinary : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 57d6153d0e..cd66618a07 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -56,7 +56,8 @@ class Experts : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index 0b8a001f67..0acc241a9b 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -36,7 +36,8 @@ class Group_by : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 3a81fe50f1..dac230e410 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -26,7 +26,8 @@ class LayerNorm : public Op { void init(FFModel const &); void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &); void backward(FFModel const &); void inference(FFModel const &, diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index fb2767a590..ccd5724dc5 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -37,7 +37,8 @@ class Linear : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/ops/noop.h b/include/flexflow/ops/noop.h index 688ab083df..a38d2945ca 100644 --- a/include/flexflow/ops/noop.h +++ b/include/flexflow/ops/noop.h @@ -19,7 +19,8 @@ class NoOp : public 
Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void inference(FFModel const &, std::vector const &, diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index c1d3ebdba2..de9ad56b45 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -23,7 +23,8 @@ class Softmax : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void inference(FFModel const &, std::vector const &, diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 9c82930822..5e8b515672 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -30,7 +30,8 @@ class TopK : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void inference(FFModel const &, diff --git a/include/flexflow/parallel_ops/partition.h b/include/flexflow/parallel_ops/partition.h index d940841eb4..f25bc83276 100644 --- a/include/flexflow/parallel_ops/partition.h +++ b/include/flexflow/parallel_ops/partition.h @@ -31,7 +31,8 @@ class Repartition : public ParallelOp { void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, - std::vector const &) override; + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void inference(FFModel const &, std::vector const &, diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 458a58b3fe..02fc971768 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -182,16 +182,18 @@ Node Aggregate::deserialize(FFModel &ff, return ff.get_or_create_node(inputs, params); } -void Aggregate::init_inference( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs) { +void Aggregate::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(AGGREGATE_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Aggregate)), @@ -199,10 +201,10 @@ void Aggregate::init_inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void Aggregate::init(FFModel const &ff) { @@ -289,9 +291,12 @@ void Aggregate::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - parallel_is = outputs[0]->parallel_is; - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "Aggregate op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(AGGREGATE_FWD_TASK_ID, parallel_is, TaskArgument(nullptr, 0), diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index a206610095..21d429594e 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -158,13 +158,16 @@ AggregateSpec::AggregateSpec(FFModel &model, void AggregateSpec::init_inference( FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(AGG_SPEC_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(AggregateSpec)), @@ -172,10 +175,10 @@ void AggregateSpec::init_inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void AggregateSpec::init(FFModel const &ff) { @@ -262,9 +265,12 @@ void AggregateSpec::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - parallel_is = outputs[0]->parallel_is; - size_t machine_view_hash = mv ? mv->hash() : outputs[0]->machine_view.hash(); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "AggregateSpec op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(AGG_SPEC_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 8f5043e49e..e04440d77e 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -375,13 +375,16 @@ MultiHeadAttention::MultiHeadAttention( void MultiHeadAttention::init_inference( FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(ATTENTION_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(MultiHeadAttention)), @@ -389,7 +392,7 @@ void MultiHeadAttention::init_inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -422,7 +425,7 @@ void MultiHeadAttention::init_inference( launcher.add_field(4, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void MultiHeadAttention::init(FFModel const &ff) { @@ -584,10 +587,14 @@ void MultiHeadAttention::inference( ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "MultiHeadAttention op machine_view: " << *(MachineView const + *)mv + << std::endl; */ int idx = 0; - size_t machine_view_hash = - mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); IndexLauncher launcher(ATTENTION_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index ab4df2826a..acd7f4fdca 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -249,7 +249,8 @@ void ElementBinary::do_inplace_output(void) { void ElementBinary::init_inference( FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { // Check if we have the same oprands has_same_operands = (batch_inputs[0]->region == batch_inputs[1]->region); assert(check_output_input_weight_same_parallel_is()); @@ -257,7 +258,9 @@ void ElementBinary::init_inference( ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(ELEMENTBINARY_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(ElementBinary)), @@ -265,7 +268,7 @@ void ElementBinary::init_inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); int rid = 0; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, @@ -308,7 +311,7 @@ void ElementBinary::init_inference( //} FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void ElementBinary::init(FFModel const &ff) { @@ -496,9 +499,12 @@ void ElementBinary::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = - mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "ElementBinary op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 9c60a5e212..afd7bff3c9 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -429,13 +429,16 @@ Node Experts::deserialize(FFModel &ff, void Experts::init_inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(EXPERTS_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Experts)), @@ -443,7 +446,7 @@ void Experts::init_inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); // expert predictions launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, @@ -491,7 +494,7 @@ void Experts::init_inference(FFModel const &ff, } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void Experts::init(FFModel const &ff) { @@ -646,9 +649,12 @@ void Experts::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = - mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -813,7 +819,7 @@ void Experts::inference_task(Task const *task, assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == out_dim); } } - + return; Experts::forward_kernel_wrapper(m, input_ptr, indices_ptr, diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index f6e05945a6..e5d720ba31 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -164,16 +164,18 @@ Group_by::Group_by(FFModel &model, : Group_by( model, inputs.first, inputs.second, params.n, params.alpha, name) {} -void Group_by::init_inference( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs) { +void Group_by::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(GROUP_BY_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Group_by)), @@ -181,7 +183,7 @@ void Group_by::init_inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); // data launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, @@ -209,7 +211,7 @@ void Group_by::init_inference( } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void Group_by::init(FFModel const &ff) { @@ -318,6 +320,8 @@ void Group_by::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; size_t machine_view_hash = mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + /* std::cout << "GroupBy op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(GROUP_BY_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 11b1185b1c..56c2bec1fc 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -194,16 +194,18 @@ LayerNorm::LayerNorm(FFModel &model, return; } -void LayerNorm::init_inference( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs) { +void LayerNorm::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(LAYERNORM_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(LayerNorm)), @@ -211,7 +213,7 @@ void LayerNorm::init_inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, @@ -226,7 +228,7 @@ void LayerNorm::init_inference( launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void LayerNorm::init(FFModel const &ff) { @@ -320,9 +322,12 @@ void LayerNorm::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = - mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(LAYERNORM_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/linear.cc b/src/ops/linear.cc index fe9a3925a7..c7308bae15 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -256,14 +256,17 @@ void Linear::init(FFModel const &ff) { void Linear::init_inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); // assert(check_output_input_weight_same_machine_view()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(LINEAR_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Linear)), @@ -271,7 +274,7 @@ void Linear::init_inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); // launcher.add_region_requirement( // RegionRequirement(input_lps[0], 0/*projection id*/, // READ_ONLY, EXCLUSIVE, inputs[0]->region)); @@ -301,7 +304,7 @@ void Linear::init_inference(FFModel const &ff, } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } /* @@ -424,9 +427,12 @@ void Linear::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = - mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(LINEAR_FWD_TASK_ID, parallel_is, TaskArgument(nullptr, 0), diff --git a/src/ops/noop.cc b/src/ops/noop.cc index 91e890ed9f..46968acb03 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -97,9 +97,12 @@ OpMeta *NoOp::init_task(Task const *task, void NoOp::init_inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { parallel_is = batch_outputs[0]->parallel_is; assert(parallel_is != IndexSpace::NO_SPACE); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); if (op_type == OP_INPUT && batch_outputs[0]->initializer != nullptr) { ConstantInitializer *initializer = (ConstantInitializer *)batch_outputs[0]->initializer; @@ -114,7 +117,7 @@ void NoOp::init_inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement( RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -148,7 +151,7 @@ void NoOp::init_inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement( RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -161,7 +164,7 @@ void NoOp::init_inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, TaskArgument(NULL, 0), @@ -169,10 +172,10 @@ void NoOp::init_inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index b07ae0ad68..389cd8a678 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -117,13 +117,16 @@ Softmax::Softmax(FFModel &model, void Softmax::init_inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(SOFTMAX_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Softmax)), @@ -131,7 +134,7 @@ void Softmax::init_inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -146,7 +149,7 @@ void Softmax::init_inference(FFModel const &ff, launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void Softmax::init(FFModel const &ff) { @@ -229,9 +232,12 @@ void Softmax::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = - mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(SOFTMAX_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/ops/topk.cc b/src/ops/topk.cc index b260902cd7..3763514685 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -138,13 +138,16 @@ TopK::TopK(FFModel &model, void TopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); parallel_is = batch_outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); IndexLauncher launcher(TOPK_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(TopK)), @@ -152,7 +155,7 @@ void TopK::init_inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -173,7 +176,7 @@ void TopK::init_inference(FFModel const &ff, launcher.add_field(2, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + set_opmeta_from_futuremap_inference(ff, fm, view); } void TopK::init(FFModel const &ff) { @@ -267,9 +270,12 @@ void TopK::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - size_t machine_view_hash = - mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "TopK op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(TOPK_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 6f808a3978..7fdf9a8e4a 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -104,13 +104,16 @@ OpMeta *Repartition::init_task(Task const *task, void Repartition::init_inference( FFModel const &ff, std::vector const &batch_inputs, - std::vector const &batch_outputs) { + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; parallel_is = batch_outputs[0]->parallel_is; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); IndexLauncher launcher(REPARTITION_INIT_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -118,7 +121,7 @@ void Repartition::init_inference( Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_outputs[0]->machine_view.hash()); + machine_view_hash); assert(inference_input_lps.find(batch_inputs[0]) != inference_input_lps.end()); launcher.add_region_requirement( @@ -209,6 +212,8 @@ void Repartition::inference(FFModel const &ff, DataType data_type = batch_inputs[0]->data_type; size_t machine_view_hash = mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + /* std::cout << "Partition op machine_view: " << *(MachineView const *)mv + << std::endl; */ IndexLauncher launcher(REPARTITION_FWD_TASK_ID, batch_outputs[0]->parallel_is, TaskArgument(&data_type, sizeof(DataType)), diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 66accf7195..47f8ac9d09 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1798,6 +1798,18 @@ void FFModel::register_all_machine_views( valid_views.push_back(view); } } + // No-parallelism views + for (int i = 1; i <= num_nodes * gpus_per_node; i++) { + if (num_nodes * gpus_per_node % i == 0) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = i; + view.stride[0] = 0; + view.start_device_id = 0; + valid_views.push_back(view); + } + } // Two-dimensional views /* for (int i = 1; i <= num_nodes; i++) { */ /* for (int j = 1; j <= gpus_per_node; j++) { */ diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 8f926da316..e3e4bd9b07 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -24,7 +24,19 @@ InferenceManager::InferenceManager(FFModel *_model, int _max_num_requests_per_batch, int _max_num_inflight_batches) : model(_model), max_num_requests_per_batch(_max_num_requests_per_batch), - max_num_inflight_batches(_max_num_inflight_batches) {} + max_num_inflight_batches(_max_num_inflight_batches) { + // populate array of valid single-device machine views + num_devices = model->config.workersPerNode * model->config.numNodes; + for (int i = 0; i < num_devices; i++) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = 1; + view.stride[0] = 0; + view.start_device_id = i; + machine_views.push_back(view); + } +} void InferenceManager::compile_model_and_allocate_buffer(void) { std::vector metrics; @@ -59,61 +71,88 @@ void InferenceManager::compile_model_and_allocate_buffer(void) { } void 
InferenceManager::init_operators_inference() { - for (int index = 0; index < max_num_inflight_batches; index++) { - for (size_t o = 0; o < model->operators.size(); o++) { - Op *op = model->operators[o]; - if (op->op_type == OP_WEIGHT) { - continue; - } - std::vector inputs(op->numInputs); - std::vector outputs(op->numOutputs); - for (int i = 0; i < op->numInputs; i++) { - assert(op->inputs[i] != nullptr); - assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(tensor_buffer[op->inputs[i]].size() > index); - inputs[i] = tensor_buffer[op->inputs[i]][index]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - for (int i = 0; i < op->numOutputs; i++) { - assert(op->outputs[i] != nullptr); - assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(tensor_buffer[op->outputs[i]].size() > index); - outputs[i] = tensor_buffer[op->outputs[i]][index]; - assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + for (int batch_index = 0; batch_index < max_num_inflight_batches; + batch_index++) { + for (int device_index = 0; device_index < num_devices; device_index++) { + // int fused_experts_index = 0; + for (size_t o = 0; o < model->operators.size(); o++) { + Op *op = model->operators[o]; + if (op->op_type == OP_WEIGHT) { + continue; + } + MachineView *view; + // if (op->op_type == OP_EXPERTS) { + // if (fused_experts_index != device_index) { + // fused_experts_index++; + // continue; + // } + // view = &machine_views[fused_experts_index]; + // fused_experts_index++; + // } else { + view = &machine_views[device_index]; + //} + std::vector inputs(op->numInputs); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > batch_index); + inputs[i] = tensor_buffer[op->inputs[i]][batch_index]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->outputs[i]].size() > batch_index); + outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + if (op->is_parallel_op()) { + ((ParallelOp *)op) + ->create_input_partition_inference(*model, inputs, outputs); + } + op->init_inference(*model, inputs, outputs, view); } - if (op->is_parallel_op()) { - ((ParallelOp *)op) - ->create_input_partition_inference(*model, inputs, outputs); - } - op->init_inference(*model, inputs, outputs); } } } void InferenceManager::inference(int index) { - assert(index < max_num_inflight_batches); + int batch_index = index % max_num_inflight_batches; + int device_index = index % num_devices; + int expert_device_index = 0; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { continue; } + + MachineView *view; + if (op->op_type == OP_EXPERTS) { + view = &machine_views[expert_device_index]; + expert_device_index = (expert_device_index + 1) % num_devices; + } else { + // pick mv w startdeviceid = device_index + view = &machine_views[device_index]; + } + std::vector inputs(op->numInputs); std::vector outputs(op->numOutputs); for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i] != nullptr); assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(tensor_buffer[op->inputs[i]].size() > index); - inputs[i] = 
tensor_buffer[op->inputs[i]][index]; + assert(tensor_buffer[op->inputs[i]].size() > batch_index); + inputs[i] = tensor_buffer[op->inputs[i]][batch_index]; assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); } for (int i = 0; i < op->numOutputs; i++) { assert(op->outputs[i] != nullptr); assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(tensor_buffer[op->outputs[i]].size() > index); - outputs[i] = tensor_buffer[op->outputs[i]][index]; + assert(tensor_buffer[op->outputs[i]].size() > batch_index); + outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } - op->inference(*model, inputs, outputs); + op->inference(*model, inputs, outputs, view); } }; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index dc746dd7f4..39368ba9b0 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -975,6 +975,49 @@ void Op::set_argumentmap_for_init(FFModel const &ff, ArgumentMap &argmap) { } } +void Op::set_argumentmap_for_init_inference(FFModel const &ff, + ArgumentMap &argmap, + MachineView const *view) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + Domain domain = runtime->get_index_space_domain(ctx, this->parallel_is); + switch (domain.get_dim()) { +#ifdef FF_USE_NCCL +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + FFHandler handle = ff.handlers[view->get_device_id(*it)]; \ + if (ff.config.computationMode == COMP_MODE_TRAINING && \ + op_type == OP_WEIGHT) { \ + ncclComm_t *nccl_comms = ff.find_nccl_comms(*view); \ + handle.ncclComm = nccl_comms[idx++]; \ + } \ + argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC +#else +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + for (PointInRectIterator it(rect); it(); it++) { \ + FFHandler handle = ff.handlers[view->get_device_id(*it)]; \ + argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC +#endif + default: + assert(false); + } +} + void Op::set_opmeta_from_futuremap(FFModel const &ff, FutureMap const &fm) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -996,6 +1039,29 @@ void Op::set_opmeta_from_futuremap(FFModel const &ff, FutureMap const &fm) { } } +void Op::set_opmeta_from_futuremap_inference(FFModel const &ff, + FutureMap const &fm, + MachineView const *view) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + Domain domain = runtime->get_index_space_domain(ctx, parallel_is); + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + inference_meta[view->hash()][idx++] = fm.get_result(*it); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } +} + void Op::set_argumentmap_for_forward(FFModel const &ff, ArgumentMap &argmap) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -1018,6 +1084,30 @@ void Op::set_argumentmap_for_forward(FFModel const &ff, ArgumentMap &argmap) { } } +void Op::set_argumentmap_for_inference(FFModel const &ff, + ArgumentMap &argmap, + MachineView const *view) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + Domain domain = runtime->get_index_space_domain(ctx, parallel_is); + switch (domain.get_dim()) 
{ +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + OpMeta *mp = inference_meta[view->hash()][idx++]; \ + argmap.set_point(*it, TaskArgument(&mp, sizeof(OpMeta *))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } +} + void Op::set_argumentmap_for_backward(FFModel const &ff, ArgumentMap &argmap) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -2631,7 +2721,8 @@ Op *FFModel::create_operator_from_layer( assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel) { + if (config.only_data_parallel && + config.computationMode == COMP_MODE_TRAINING) { Repartition *part = new Repartition( *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); operators.push_back(part); From b23a1dcba62dc99dd8bc05751fc4f0e8663a5275 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Feb 2023 21:14:38 -0500 Subject: [PATCH 060/344] [Inference] - Updated dataloader & integrated with data generator (#630) * integration of data generator with data loader * bug fixes * fixes * add infernece moe to c++ test * workflow fix --- .github/workflows/gpu-ci.yml | 3 + CMakeLists.txt | 6 +- config/config.linux | 2 +- .../cpp/inference/MLP_Unify/CMakeLists.txt | 15 - examples/cpp/inference/MLP_Unify/Makefile | 39 -- examples/cpp/inference/MLP_Unify/mlp.cc | 146 ------ examples/cpp/inference/MLP_Unify/mlp.h | 61 --- examples/cpp/inference/data_generator.cc | 103 +++++ examples/cpp/inference/data_generator.cpp | 47 +- examples/cpp/inference/data_generator.h | 137 ++---- .../mixture_of_experts/CMakeLists.txt | 6 +- .../cpp/inference/mixture_of_experts/Makefile | 4 +- .../mixture_of_experts/dataloader.cc | 388 ++++++++++++++++ .../mixture_of_experts/dataloader.cu | 115 +++++ .../cpp/inference/mixture_of_experts/moe.cc | 416 ++---------------- .../cpp/inference/mixture_of_experts/moe.cu | 78 ---- .../cpp/inference/mixture_of_experts/moe.h | 71 ++- src/loss_functions/loss_functions.cu | 14 +- src/runtime/inference_manager.cc | 4 + tests/cpp_gpu_tests.sh | 4 + 20 files changed, 778 insertions(+), 881 deletions(-) delete mode 100644 examples/cpp/inference/MLP_Unify/CMakeLists.txt delete mode 100644 examples/cpp/inference/MLP_Unify/Makefile delete mode 100644 examples/cpp/inference/MLP_Unify/mlp.cc delete mode 100644 examples/cpp/inference/MLP_Unify/mlp.h create mode 100644 examples/cpp/inference/data_generator.cc create mode 100644 examples/cpp/inference/mixture_of_experts/dataloader.cc create mode 100644 examples/cpp/inference/mixture_of_experts/dataloader.cu delete mode 100644 examples/cpp/inference/mixture_of_experts/moe.cu diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 13b9fbb6f0..2a46e7d498 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -62,6 +62,7 @@ jobs: run: | export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) + export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion mkdir build cd build ../config/config.linux @@ -77,6 +78,7 @@ jobs: run: | export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) + export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion cd build ../config/config.linux make install @@ -108,6 +110,7 @@ jobs: export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON + export 
FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion pip install . --verbose - name: Check FlexFlow Python interface (pip) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c27008b58..10542011df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -378,10 +378,6 @@ if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/MLP_Unify) endif() -# if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_INFERENCE_EXAMPLES) -# add_subdirectory(examples/cpp/inference/MLP_Unify) -# endif() - if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) endif() @@ -421,7 +417,7 @@ if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) endif() -if(FF_BUILD_MOE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) +if(FF_BUILD_MOE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/mixture_of_experts) endif() diff --git a/config/config.linux b/config/config.linux index 28cf7c2fe1..86e8f8b647 100755 --- a/config/config.linux +++ b/config/config.linux @@ -46,7 +46,7 @@ FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} # use precompiled NCCL and Legion libraries, where available FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL:-ON} -FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-ON} +FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-OFF} # use the flag below to use both the NCCL and Legion pre-built libraries. # when the flag below is set to ON, the two flags above are ignored. FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES:-OFF} diff --git a/examples/cpp/inference/MLP_Unify/CMakeLists.txt b/examples/cpp/inference/MLP_Unify/CMakeLists.txt deleted file mode 100644 index e83d292efc..0000000000 --- a/examples/cpp/inference/MLP_Unify/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(FlexFlowExampleInference_MLPUnify) -set(project_target mlp_inference_unify) - -set(CPU_SRC - ${FLEXFLOW_CPP_DRV_SRC} - mlp.cc - mlp.h - ../data_generator.h) - -cuda_add_executable(${project_target} ${CPU_SRC}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) diff --git a/examples/cpp/inference/MLP_Unify/Makefile b/examples/cpp/inference/MLP_Unify/Makefile deleted file mode 100644 index 9798c4f18a..0000000000 --- a/examples/cpp/inference/MLP_Unify/Makefile +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2021 CMU, Facebook, LANL, MIT, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Flags for directing the runtime makefile what to include -DEBUG ?= 1 # Include debugging symbols -MAX_DIM ?= 5 # Maximum number of dimensions -OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level -USE_CUDA ?= 0 # Include CUDA support (requires CUDA) -USE_GASNET ?= 0 # Include GASNet support (requires GASNet) -USE_HDF ?= 0 # Include HDF5 support (requires HDF5) -ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) -USE_HIP ?= 1 # Include HIP support (requires HIP) -HIP_TARGET ?= ROCM -USE_GPU_REDUCTIONS ?= 0 - -# Put the binary file name here -OUTFILE ?= mlp_inference -# List all the application source files here -GEN_SRC = mlp.cc -GEN_GPU_SRC = -GEN_HIP_SRC = - -ifndef FF_HOME -$(error FF_HOME variable is not defined, aborting build) -endif - -include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/MLP_Unify/mlp.cc b/examples/cpp/inference/MLP_Unify/mlp.cc deleted file mode 100644 index 7631af0445..0000000000 --- a/examples/cpp/inference/MLP_Unify/mlp.cc +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "mlp.h" -#include "data_generator.h" -#include "flexflow/inference.h" -#include -#include -#include - -using namespace Legion; -using namespace FlexFlow; - -Tensor create_mlp(FFModel *model, - MLPConfig const *mlpConfig, - Tensor const &input1, - Tensor const &input2) { - Tensor t1 = input1, t2 = input2; - for (int i = 0; i < mlpConfig->hidden_dims.size(); i++) { - int const dims[] = {mlpConfig->hidden_dims[i], t1->dims[0]}; - ActiMode acti_mode = - (i + 1 == mlpConfig->hidden_dims.size()) ? 
AC_MODE_NONE : AC_MODE_RELU; - t1 = model->dense(t1, mlpConfig->hidden_dims[i], acti_mode, false); - t2 = model->dense(t2, mlpConfig->hidden_dims[i], acti_mode, false); - } - Tensor t = model->add(t1, t2); - return model->softmax(t); -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - - // Inference parameters - int total_requests = - 256; // total number of requests processed as part of the simulation - int request_tensor_size = 4; // request tensor dimensions - bool poisson_distribution = true; - double lambda = 25; // average number of request arrivals per second - int num_requests_per_batch = 5; - int num_inflight_batches = 10; - - // MLP parameters - int embedding_size = 1024; - int sequence_length = 512; - std::vector hidden_dims = { - 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}; - - FFConfig ffConfig; - ffConfig.batchSize = 1; - { - fprintf(stderr, - "batchSize(%d) workersPerNodes(%d) numNodes(%d)\n", - ffConfig.batchSize, - ffConfig.workersPerNode, - ffConfig.numNodes); - } - FFModel ff(ffConfig); - MLPConfig mlpConfig(embedding_size, sequence_length, hidden_dims); - { - stringstream hd; - hd << '{'; - for (int i = 0; i < hidden_dims.size(); i++) { - if (i != 0) { - hd << ","; - } - hd << hidden_dims[i]; - } - hd << '}'; - fprintf(stderr, - "embedding_size(%d) sequence_length(%d) hidden_dims(%s)\n", - mlpConfig.embedding_size, - mlpConfig.sequence_length, - hd.str().c_str()); - } - - Tensor input1, input2; - { - int const dims[] = {total_requests, - mlpConfig.sequence_length * mlpConfig.embedding_size}; - input1 = ff.create_tensor<2>(dims, DT_FLOAT); - input2 = ff.create_tensor<2>(dims, DT_FLOAT); - } - Tensor t = create_mlp(&ff, &mlpConfig, input1, input2); - - InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); - // im.compile_model_and_allocate_buffer(); - ff.init_operators(); - - // Start timer - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_start = Realm::Clock::current_time_in_microseconds(); - - /////////////////////////////////////////////////////////////////////////////////// - - // Main loop, processing requests as they come (from the generator) - int index = 0; - int processed_requests = 0; - Generator data_generator( - total_requests, request_tensor_size, poisson_distribution, lambda); - while (processed_requests < total_requests) { - vector> req = data_generator.get_requests(); - int iterations = req.size(); - for (int iter = 0; iter < iterations; iter++) { - runtime->begin_trace(ctx, 111 /*trace_id*/); - im.inference((index++) % num_inflight_batches); - runtime->end_trace(ctx, 111 /*trace_id*/); - } - processed_requests += iterations; - } - - /////////////////////////////////////////////////////////////////////////////////// - - // End timer - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_end = Realm::Clock::current_time_in_microseconds(); - double run_time = 1e-6 * (ts_end - ts_start); - printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n", - run_time, - ffConfig.batchSize * 128 * ffConfig.epochs / run_time); -} - -void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/MLP_Unify/mlp.h b/examples/cpp/inference/MLP_Unify/mlp.h deleted file mode 100644 
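The inference loop above drains whatever requests the generator reports and rotates over the in-flight batch slots with (index++) % num_inflight_batches; the MoE driver later in this patch additionally ceil-divides the arrivals into fixed-size batches. A minimal sketch of that bookkeeping, using made-up sizes (batch_size, num_inflight_batches, received_requests) rather than values taken from the patch:

    #include <cstddef>
    #include <cstdio>

    // Sketch: ceil-divide the requests received so far into batches and
    // rotate over the in-flight batch slots. All sizes are illustrative.
    int main() {
      const std::size_t batch_size = 8;
      const int num_inflight_batches = 4;
      int index = 0; // monotonically increasing batch counter

      std::size_t received_requests = 19; // e.g. reported by the data generator
      std::size_t iterations = (received_requests + batch_size - 1) / batch_size; // = 3
      for (std::size_t iter = 0; iter < iterations; iter++) {
        int slot = index % num_inflight_batches; // in-flight slot to reuse
        std::printf("batch %d -> in-flight slot %d\n", index, slot);
        index++;
      }
      return 0;
    }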
index 7cd2f30430..0000000000 --- a/examples/cpp/inference/MLP_Unify/mlp.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/inference.h" -#include "flexflow/model.h" -using namespace Legion; -using namespace std; -using namespace FlexFlow; - -#define MAX_NUM_SAMPLES 1024000 - -struct MLPConfig { - MLPConfig(void); - MLPConfig(int embedding_size, - int sequence_length, - std::vector hidden_dims) - : embedding_size(embedding_size), sequence_length(sequence_length), - hidden_dims(hidden_dims) {} - - int embedding_size, sequence_length; - std::vector hidden_dims; -}; - -class DataLoader { -public: - DataLoader(FFModel &ff, - MLPConfig const &mlpConfig, - InferenceManager const *im, - Tensor input); - /*static void load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime);*/ - static void load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - void next_batch(FFModel &); - void reset(void); - -public: - int num_samples, next_index; - Tensor full_input, batch_input; -}; - -struct SampleIdxs { - int num_samples; - int idxs[MAX_NUM_SAMPLES]; -}; \ No newline at end of file diff --git a/examples/cpp/inference/data_generator.cc b/examples/cpp/inference/data_generator.cc new file mode 100644 index 0000000000..ec168f6f51 --- /dev/null +++ b/examples/cpp/inference/data_generator.cc @@ -0,0 +1,103 @@ +#include "data_generator.h" +#include +#include +#include +using namespace std; + +DataGenerator::DataGenerator(size_t _num_requests, + size_t _token_dim, + size_t _sequence_length, + bool _poisson_distr, + double _lambda) + : num_requests(_num_requests), token_dim(_token_dim), + sequence_length(_sequence_length), poisson_distr(_poisson_distr), + lambda(_lambda), timer_started(false) { + generate_arrival_times(); +}; + +void DataGenerator::generate_requests(float *req_ptr, + int *label_ptr, + int num_labels) { + assert(req_ptr != nullptr); + /* for (size_t i=0; i float_dist{0, 1.0}; + auto gen = [&float_dist, &mersenne_engine]() { + return float_dist(mersenne_engine); + }; + std::generate( + req_ptr, req_ptr + token_dim * sequence_length * num_requests, gen); + + if (label_ptr != nullptr) { + assert(num_labels > 0); + /* for (size_t i=0; i int_dist{0, num_labels}; + auto gen_label = [&int_dist, &mersenne_engine]() { + return int_dist(mersenne_engine); + }; + std::generate( + label_ptr, label_ptr + sequence_length * num_requests, gen_label); + } +}; + +void DataGenerator::generate_arrival_times(void) { + // set up a uniform number generator with range [0,1) + random_device rnd; + mt19937 gen(rnd()); + uniform_real_distribution dist{0, 1.0}; + double cur_arrival = 0; // assume first request comes in at time 0 + + for (size_t i = 0; i < num_requests; i++) { + arrivals.push_back(cur_arrival); + if (poisson_distr) { + double u = dist(gen); + double interval = -(1 / lambda) * log(1 - u) * 
1000; + cur_arrival += interval; + } else { + cur_arrival += (1000 / lambda); + } + } + // cout << "Arrivals : ["; + // copy(arrivals.begin(), arrivals.end(), ostream_iterator(cout, " ")); + // cout << "]" << endl; +}; + +void DataGenerator::start_timer(void) { + arrivals_ptr = arrivals.begin(); + start_time = Clock::now(); + timer_started = true; +}; + +size_t DataGenerator::get_requests(void) { + if (!timer_started) { + std::cout << "Warning: tried to get number of requests before the timer " + "was started." + << std::endl; + return 0; + } + Clock::time_point cur_time = Clock::now(); + size_t ms_from_start = + chrono::duration_cast(cur_time - start_time).count(); + vector::iterator new_arrivals_ptr = + upper_bound(arrivals_ptr, arrivals.end(), ms_from_start); + size_t received_requests = new_arrivals_ptr - arrivals_ptr; + arrivals_ptr = new_arrivals_ptr; + if (received_requests > 0) { + std::cout << "received " << received_requests + << " request(s) by arrival time +" << ms_from_start << "ms" + << "\n"; + } + return received_requests; +} diff --git a/examples/cpp/inference/data_generator.cpp b/examples/cpp/inference/data_generator.cpp index 7429fdb159..9aeb9b49a9 100644 --- a/examples/cpp/inference/data_generator.cpp +++ b/examples/cpp/inference/data_generator.cpp @@ -12,26 +12,49 @@ #include using namespace std; -// This is for running the dataloader standalone +// This is for testing the request generator standalone int main(int argc, char const *argv[]) { - // insert code here... - cout << "Hello, World!\n"; - Generator data_generator(10, 4, true, 1); - vector> req0 = data_generator.get_requests(); - print_requests(req0); + cout << "Starting the Data DataGenerator!\n"; + + // DataGenerator parameters + size_t total_requests = 256; + size_t token_dim = 16; + size_t sequence_length = 20; + bool use_poisson_distr = true; + // average number of request arrivals per second + double lambda = 25; + int label_dims = 10; + + float *requests = (float *)calloc( + token_dim * sequence_length * total_requests, sizeof(float)); + int *labels = (int *)calloc(sequence_length * total_requests, sizeof(int)); + + DataGenerator data_generator( + total_requests, token_dim, sequence_length, use_poisson_distr, lambda); + data_generator.generate_requests(requests, labels, label_dims); + data_generator.start_timer(); + + size_t received_requests = data_generator.get_requests(); + std::cout << "t=0ms: received " << received_requests << std::endl; this_thread::sleep_for(milliseconds(1200)); - vector> req1200 = data_generator.get_requests(); - print_requests(req1200); + received_requests = data_generator.get_requests(); + std::cout << "t=1200ms: received " << received_requests << std::endl; this_thread::sleep_for(milliseconds(10)); - vector> req1210 = data_generator.get_requests(); - print_requests(req1210); + received_requests = data_generator.get_requests(); + std::cout << "t=1210ms: received " << received_requests << std::endl; this_thread::sleep_for(milliseconds(4000)); - vector> req5210 = data_generator.get_requests(); - print_requests(req5210); + received_requests = data_generator.get_requests(); + std::cout << "t=5210ms: received " << received_requests << std::endl; + this_thread::sleep_for(milliseconds(5000)); + received_requests = data_generator.get_requests(); + std::cout << "t=10210ms: received " << received_requests << std::endl; + + free(requests); + free(labels); return 0; } diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index 98af050a98..e651881902 
100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -1,6 +1,9 @@ +#pragma once +#include #include #include #include +#include #include #include #include @@ -9,109 +12,39 @@ using namespace std; typedef std::chrono::high_resolution_clock Clock; typedef std::chrono::milliseconds milliseconds; -class Generator { +class DataGenerator { public: - size_t num_requests; // total number of requests - size_t tensor_size; // dimension of one request tensor - bool poisson; // false implied uniform distribution - double lambda; // mean #num of arrivals per sec - - Generator(size_t req, size_t tensor, bool poi, double lamb) { - num_requests = req; - tensor_size = tensor; - poisson = poi; - lambda = lamb; - compute_distribution(); - arrivals_ptr = arrivals.begin(); - timer_started = false; - } - - vector> get_requests(void); // function to retrieve requests + DataGenerator(size_t _num_requests, + size_t _token_dim, + size_t _sequence_length, + bool _poisson_distr, + double _lambda); + + // Generate random requests by filling each token with random data. For now, + // assume all requests have the same sequence length. Also generate random + // labels (if label_ptr != nullptr and num_labels >0). + void generate_requests(float *req_ptr, + int *label_ptr = nullptr, + int num_labels = 0); + void start_timer(void); + // Get number of requests that have arrived since the last time this function + // was called + size_t get_requests(void); private: - bool timer_started; // tracks if start time has been initiated - Clock::time_point - start_time; // time when get_requests() is called for the first time - vector arrivals; // arrival times (ms) generated based on distribution - vector::iterator arrivals_ptr; // next request to output - - void compute_distribution(void); // populate arrivals - vector get_random_tensor(void); // generate a random tensor -}; - -void Generator::compute_distribution(void) { - // set up uniform number generator [0,1) - random_device rnd; - mt19937 gen(rnd()); - uniform_real_distribution dist{0, 1.0}; - double cur_arrival = 0; // assume first request comes in at time 0 - - for (size_t i = 0; i < num_requests; i++) { - arrivals.push_back(cur_arrival); - cout << "arrival time " << i << ": +" << cur_arrival << "ms \n"; - - if (poisson) { - double u = dist(gen); - double interval = -(1 / lambda) * log(1 - u) * 1000; - cur_arrival += interval; - } else { - cur_arrival += (1000 / lambda); - } - } - return; -}; - -vector> Generator::get_requests(void) { - Clock::time_point cur_time = Clock::now(); - vector> requests; - if (!timer_started) { - // simply return one request and start timer for the first call - start_time = Clock::now(); - timer_started = true; - arrivals_ptr++; - requests.push_back(get_random_tensor()); - return requests; - } - - // output requests till we reach current timestamp - milliseconds ms_from_start = - chrono::duration_cast(cur_time - start_time); - while (arrivals_ptr < arrivals.end() && - ms_from_start.count() >= *arrivals_ptr) { - cout << "request at arrival time +" << *arrivals_ptr << "\n"; - requests.push_back(get_random_tensor()); - arrivals_ptr++; - } - return requests; -}; - -// template -// void generate(ForwardIt first, ForwardIt last, Generator gen) { -// while (first != last) { -// *first++ = gen(); -// } -// } - -vector Generator::get_random_tensor(void) { - random_device rnd_device; - mt19937 mersenne_engine{rnd_device()}; - uniform_real_distribution dist{0, 1.0}; // state distribution - - auto gen = [&dist, 
&mersenne_engine]() { return dist(mersenne_engine); }; - - vector vec(tensor_size); - generate(begin(vec), end(vec), gen); - return vec; -}; - -// for debugging -void print_requests(vector> req) { - cout << "printing requests\n"; - for (vector v : req) { - for (double e : v) { - cout << e << ","; - } - cout << "\n"; - } - cout << "\n"; + // Compute the arrival times of each request and save them in the arrivals + // vector. + void generate_arrival_times(void); + + size_t num_requests; // total number of requests + size_t token_dim; // embedding dim of each token + size_t sequence_length; // dimension of one request tensor + bool poisson_distr; // false implies uniform distribution + double lambda; // mean #num of arrivals per sec + bool timer_started; // whether timer was initiated + // time when get_requests() is called for the first time + Clock::time_point start_time; + // arrival times (ms) generated based on distribution + vector arrivals; + vector::iterator arrivals_ptr; }; diff --git a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt index ee1c063b18..81c4c184b4 100644 --- a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt +++ b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt @@ -6,11 +6,11 @@ set(project_target inference_moe) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} moe.cc - moe.h - ../data_generator.h) + dataloader.cc + ../data_generator.cc) set(GPU_SRC - moe.cu) + dataloader.cu) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/examples/cpp/inference/mixture_of_experts/Makefile b/examples/cpp/inference/mixture_of_experts/Makefile index a9eb401850..15fbf25b9a 100644 --- a/examples/cpp/inference/mixture_of_experts/Makefile +++ b/examples/cpp/inference/mixture_of_experts/Makefile @@ -25,8 +25,8 @@ ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) # Put the binary file name here OUTFILE ?= inference_moe # List all the application source files here -GEN_SRC = moe.cc -GEN_GPU_SRC = moe.cu +GEN_SRC = moe.cc dataloader.cc ../data_generator.cc +GEN_GPU_SRC = dataloader.cu ifndef FF_HOME $(error FF_HOME variable is not defined, aborting build) diff --git a/examples/cpp/inference/mixture_of_experts/dataloader.cc b/examples/cpp/inference/mixture_of_experts/dataloader.cc new file mode 100644 index 0000000000..557fe4e095 --- /dev/null +++ b/examples/cpp/inference/mixture_of_experts/dataloader.cc @@ -0,0 +1,388 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
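Earlier in this patch, DataGenerator::generate_arrival_times() spaces requests with exponential inter-arrival gaps, interval = -(1/lambda) * log(1 - u) * 1000, which is the inverse-transform sample of an Exponential(lambda) distribution expressed in milliseconds and therefore yields a Poisson arrival process with rate lambda per second. A self-contained sketch of the same computation (lambda and num_requests are illustrative values):

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <random>
    #include <vector>

    // Sketch: exponential inter-arrival gaps via inverse-transform sampling,
    // accumulated into arrival timestamps in milliseconds.
    int main() {
      const double lambda = 25.0;          // average arrivals per second
      const std::size_t num_requests = 10; // illustrative request count
      std::mt19937 gen(std::random_device{}());
      std::uniform_real_distribution<double> dist(0.0, 1.0);

      std::vector<double> arrivals;
      double cur_arrival = 0.0; // first request arrives at t = 0
      for (std::size_t i = 0; i < num_requests; i++) {
        arrivals.push_back(cur_arrival);
        double u = dist(gen);
        cur_arrival += -(1.0 / lambda) * std::log(1.0 - u) * 1000.0; // ms
      }
      for (double t : arrivals) {
        std::printf("arrival at +%.2f ms\n", t);
      }
      return 0;
    }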
+ */ + +#include "flexflow/inference.h" +#include "moe.h" +#include +#include +#include +#include +#include +#include + +using namespace Legion; + +DataLoader::DataLoader(FFModel &ff, + MoeConfig const &moeConfig, + DataGenerator &data_generator, + ParallelTensor input, + ParallelTensor label) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + + int numdims = input->num_dims; + int replica_idx = numdims - 1; + int batch_idx = numdims - 2; + num_samples = moeConfig.total_requests; + + // Create full input + { + batch_input = input; + + ParallelDim dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i].size = input->dims[i].size; + dims[i].degree = 1; + dims[i].parallel_idx = -1; + dims[i].is_replica_dim = input->dims[i].is_replica_dim; + // Assume only the first dim can be the replica dim + assert(i == replica_idx || (!dims[i].is_replica_dim)); + } + assert(dims[batch_idx].size == ff.config.batchSize); + dims[batch_idx].size = num_samples; + + full_input = + ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_FLOAT); + ff.map_tensor(full_input, NULL /*parallel_op*/); + } + + // Create full label + { + assert(label->num_dims == numdims); + batch_label = label; + + ParallelDim dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i].size = label->dims[i].size; + dims[i].degree = 1; + dims[i].parallel_idx = -1; + dims[i].is_replica_dim = label->dims[i].is_replica_dim; + // Assume only the last dim can be the replica dim + assert(i == replica_idx || (!dims[i].is_replica_dim)); + } + assert(dims[batch_idx].size == ff.config.batchSize); + // replace batch size with number of samples + dims[batch_idx].size = num_samples; + + full_label = + ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_INT32); + ff.map_tensor(full_label, NULL /*parallel_op*/); + } + + // Load entire dataset + // TODO: Use index launcher instead of task launcher + assert(full_input != nullptr && "full_input is nullptr"); + assert(full_label != nullptr && "full_label is nullptr"); + + DataLoaderInput dataloader_input = {moeConfig, data_generator}; + DataLoaderInput const *ptr = &dataloader_input; + + TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, + TaskArgument(ptr, sizeof(DataLoaderInput))); + // regions[0]: full_input + launcher.add_region_requirement(RegionRequirement(full_input->region, + WRITE_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + // regions[1]: full_label + launcher.add_region_requirement(RegionRequirement(full_label->region, + WRITE_ONLY, + EXCLUSIVE, + full_label->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(1, FID_DATA); + + runtime->execute_task(ctx, launcher); + reset(); +} + +// ================================================= +// Load data +// ================================================= + +void read_cifar100(float *input_ptr, int *label_ptr) { + std::ifstream file; + file.open("train.bin", std::ios::in | std::ios::binary | std::ios::ate); + if (!file) { + std::cout << "Error opening CIFAR100 train data file" << std::endl; + assert(false); + } + + file.seekg(0, std::ios::beg); + + // each sample: <1 x coarse label><1 x fine label><3072 x pixel> + for (std::size_t i = 0; i < MAX_NUM_SAMPLES; i++) { + unsigned char temp = 0; + file.read((char *)&temp, sizeof(temp)); // coarse label, skip + file.read((char *)&temp, sizeof(temp)); + label_ptr[i] = temp; + for (std::size_t j = 0; j < 3072; ++j) { + file.read((char *)&temp, sizeof(temp)); + input_ptr[i * 3072 + j] = (float)temp / 255.0f; + } + } 
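Each record in the CIFAR-100 train.bin file read above is 1 coarse-label byte, 1 fine-label byte and 3072 pixel bytes, i.e. 3074 bytes per sample, so sample i starts at byte offset i * 3074. A hedged sketch of random access by offset; read_cifar100_sample and the sample index are illustrative, the loader above reads the file sequentially:

    #include <cstddef>
    #include <cstdio>
    #include <fstream>

    // Sketch: read one CIFAR-100 sample by byte offset.
    // Record layout: <1 x coarse label><1 x fine label><3072 x pixel>.
    bool read_cifar100_sample(char const *path, std::size_t idx,
                              int &fine_label, float pixels[3072]) {
      const std::size_t record_bytes = 1 + 1 + 3072; // 3074 bytes per sample
      std::ifstream file(path, std::ios::in | std::ios::binary);
      if (!file) {
        return false;
      }
      // skip idx full records plus the 1-byte coarse label of this record
      file.seekg(static_cast<std::streamoff>(idx * record_bytes + 1), std::ios::beg);
      unsigned char byte = 0;
      file.read((char *)&byte, sizeof(byte)); // fine label
      fine_label = byte;
      for (std::size_t j = 0; j < 3072; j++) {
        file.read((char *)&byte, sizeof(byte));
        pixels[j] = (float)byte / 255.0f; // same scaling as the loader above
      }
      return static_cast<bool>(file);
    }

    int main() {
      static float pixels[3072];
      int label = -1;
      if (read_cifar100_sample("train.bin", 5, label, pixels)) {
        std::printf("sample 5 fine label: %d\n", label);
      }
      return 0;
    }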
+ + file.close(); +} + +int reverseInt(int i) { + unsigned char c1, c2, c3, c4; + + c1 = i & 255; + c2 = (i >> 8) & 255; + c3 = (i >> 16) & 255; + c4 = (i >> 24) & 255; + + return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; +} + +/* NOTE: Download files from http://yann.lecun.com/exdb/mnist/ and unpack to +the current working directory */ +void read_mnist(float *input_ptr, int *label_ptr) { + // read inputs + std::ifstream input("train-images-idx3-ubyte", std::ios::binary); + if (input.is_open()) { + int magic_number = 0; + int number_of_images = 0; + int n_rows = 0; + int n_cols = 0; + input.read((char *)&magic_number, sizeof(magic_number)); + magic_number = reverseInt(magic_number); + input.read((char *)&number_of_images, sizeof(number_of_images)); + number_of_images = reverseInt(number_of_images); + input.read((char *)&n_rows, sizeof(n_rows)); + n_rows = reverseInt(n_rows); + input.read((char *)&n_cols, sizeof(n_cols)); + n_cols = reverseInt(n_cols); + + for (int i = 0; i < number_of_images; i++) { + for (int r = 0; r < n_rows; r++) { + for (int c = 0; c < n_cols; c++) { + unsigned char temp = 0; + input.read((char *)&temp, sizeof(temp)); + input_ptr[i * n_rows * n_cols + r * n_cols + c] = + (float)temp / 255.0f; + } + } + } + } else { + std::cout << "Error opening MNIST input data file" << std::endl; + assert(false); + } + + // read labels + std::ifstream labels("train-labels-idx1-ubyte", std::ios::binary); + if (labels.is_open()) { + int magic_number = 0; + int number_of_images = 0; + labels.read((char *)&magic_number, sizeof(magic_number)); + magic_number = reverseInt(magic_number); + labels.read((char *)&number_of_images, sizeof(number_of_images)); + number_of_images = reverseInt(number_of_images); + + for (int i = 0; i < number_of_images; i++) { + unsigned char temp = 0; + labels.read((char *)&temp, sizeof(temp)); + label_ptr[i] = temp; + } + } else { + std::cout << "Error opening MNIST label data file" << std::endl; + assert(false); + } +} + +void DataLoader::load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + DataLoaderInput const input_struct = *((DataLoaderInput *)task->args); + MoeConfig const &conf = input_struct._moeConfig; + DataGenerator &datagen = input_struct._data_generator; + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + + // get input and label pointer + float *input_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + int *label_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain label_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + int input_dims = input_domain.get_dim(); + for (int i = 0; i < input_dims; i++) { + int input_dim = input_domain.hi()[i] - input_domain.lo()[i] + 1; + int label_dim = label_domain.hi()[i] - label_domain.lo()[i] + 1; + assert(i == 0 || input_dim == label_dim); + } + + if (conf.dataset_path.length() == 0) { + printf("Input dataset path is empty, using random input samples\n"); + datagen.generate_requests(input_ptr, label_ptr, conf.num_labels); + } else { + // here, you can call `read_cifar100(input_ptr, label_ptr);` instead or load + // another dataset using the dataset_path from the MoeConfig object + // read_mnist(input_ptr, label_ptr); + // log_app.print("finish loading MNIST data\n"); + } +} + +void 
DataLoader::next_batch(FFModel &ff, size_t received_requests) { + if (received_requests == 0) { + return; + } + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + // Load input + { + Domain domain = + runtime->get_index_space_domain(ctx, batch_input->parallel_is); + ArgumentMap argmap; + int counter = 0; + // current limitation of the dataloader: only the batch dimension can be + // partitioned + int input_dims = batch_input->num_dims; + for (int i = 0; i < input_dims; i++) { + if (i != input_dims - 2) { + assert(batch_input->dims[i].degree == 1 && + "Dataloader only supports batch size partitions"); + } + } + int batch_size = batch_input->dims[input_dims - 2].size; + int n_partitions = batch_input->dims[input_dims - 2].degree; + assert(ff.config.batchSize % batch_size == 0); + assert(batch_size % n_partitions == 0); + for (Domain::DomainPointIterator it(domain); it; it++) { + SampleIdxs meta; + int requests_left = received_requests - counter; + meta.num_samples = std::min(batch_size / n_partitions, requests_left); + for (int i = 0; i < meta.num_samples; i++) { + meta.idxs[i] = next_index + counter; + counter++; + } + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + assert(counter == received_requests); + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, + batch_input->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_input->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(full_input->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_input->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + // Load label + { + Domain domain = + runtime->get_index_space_domain(ctx, batch_label->parallel_is); + ArgumentMap argmap; + int counter = 0; + // current limitation of the dataloader: only the batch dimension can be + // partitioned + int label_dims = batch_label->num_dims; + // assert(batch_label->dims[label_dims - 1].degree == 1); + for (int i = 0; i < label_dims; i++) { + assert(batch_label->dims[i].degree == 1 && + "Dataloader only supports batch size partitions"); + } + int batch_size = batch_label->dims[label_dims - 2].size; + int n_partitions = batch_label->dims[label_dims - 2].degree; + assert(ff.config.batchSize % batch_size == 0); + assert(batch_size % n_partitions == 0); + for (Domain::DomainPointIterator it(domain); it; it++) { + SampleIdxs meta; + int requests_left = received_requests - counter; + meta.num_samples = std::min(batch_size / n_partitions, requests_left); + for (int i = 0; i < meta.num_samples; i++) { + meta.idxs[i] = next_index + counter; + counter++; + } + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + assert(counter == received_requests); + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, + batch_label->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_label->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(full_label->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_label->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_label->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + 
batch_label->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + next_index += received_requests; +} + +void DataLoader::reset() { + next_index = 0; +} + +void FlexFlow::register_custom_tasks() { + // Load entire dataset + { + TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Entire Dataset Task"); + } + // Load input + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Input Task"); + } + // Load label + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_2, "Load Labels"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Label Task"); + } +} diff --git a/examples/cpp/inference/mixture_of_experts/dataloader.cu b/examples/cpp/inference/mixture_of_experts/dataloader.cu new file mode 100644 index 0000000000..2e234e9b32 --- /dev/null +++ b/examples/cpp/inference/mixture_of_experts/dataloader.cu @@ -0,0 +1,115 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
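DataLoader::next_batch() above hands each partition of the batch dimension at most batch_size / n_partitions of the received requests and stops assigning once every request has been covered, so a partial final batch simply leaves some partitions empty. A small sketch of that split with illustrative sizes (batch_size, n_partitions, received_requests are not values from the patch):

    #include <algorithm>
    #include <cstdio>

    // Sketch: distribute a possibly partial batch of received requests
    // across the partitions of the batch dimension.
    int main() {
      const int batch_size = 16;        // samples per full batch
      const int n_partitions = 4;       // degree of the batch dimension
      const int received_requests = 10; // may be smaller than batch_size
      const int per_partition = batch_size / n_partitions;

      int counter = 0;
      for (int p = 0; p < n_partitions; p++) {
        int requests_left = received_requests - counter;
        int num_samples = std::min(per_partition, requests_left); // may be 0
        std::printf("partition %d: %d sample(s), starting at request %d\n",
                    p, num_samples, counter);
        counter += num_samples;
      }
      // counter == received_requests: every request was assigned exactly once
      return 0;
    }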
+ */ + +#include "flexflow/utils/cuda_helper.h" +#include "moe.h" + +void DataLoader::load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + SampleIdxs *meta = (SampleIdxs *)task->local_args; + float const *full_input_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *batch_input_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + Domain full_input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain batch_input_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + coord_t token_dim = + batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; + coord_t sequence_length = + batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; + coord_t batch_size = + batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; + + // FIXME: currently assume continous indices + assert(meta->num_samples <= batch_size); + for (int i = 1; i < meta->num_samples; i++) { + assert(meta->idxs[i] == meta->idxs[0] + i); + } + // pad inputs if needed (this is really only useful for debugging) + if (meta->num_samples < batch_size) { + checkCUDA(cudaMemset(batch_input_ptr + + token_dim * sequence_length * meta->num_samples, + 0, + token_dim * sequence_length * + (batch_size - meta->num_samples) * sizeof(float))); + } + coord_t start_idx = meta->idxs[0]; + assert(batch_input_domain.get_volume() % token_dim * sequence_length * + batch_size == + 0); + assert(batch_input_domain.get_volume() % batch_size == 0); + size_t size_to_copy = + (batch_input_domain.get_volume() / batch_size) * meta->num_samples; + float const *input_zc = + full_input_ptr + start_idx * token_dim * sequence_length; + copy_kernel<<>>( + batch_input_ptr, input_zc, size_to_copy); + checkCUDA(cudaDeviceSynchronize()); +} + +void DataLoader::load_label(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + SampleIdxs *meta = (SampleIdxs *)task->local_args; + int const *full_label_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + int *batch_label_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + Domain full_label_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain batch_label_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + coord_t label_dim = + batch_label_domain.hi()[0] - batch_label_domain.lo()[0] + 1; + coord_t sequence_length = + batch_label_domain.hi()[1] - batch_label_domain.lo()[1] + 1; + coord_t batch_size = + batch_label_domain.hi()[2] - batch_label_domain.lo()[2] + 1; + // FIXME: currently assume continous indices + assert(meta->num_samples <= batch_size); + for (int i = 1; i < meta->num_samples; i++) { + assert(meta->idxs[i] == meta->idxs[0] + i); + } + if (meta->num_samples < batch_size) { + checkCUDA(cudaMemset(batch_label_ptr + + label_dim * sequence_length * meta->num_samples, + 0, + label_dim * sequence_length * + (batch_size - meta->num_samples) * sizeof(int))); + } + assert(batch_label_domain.get_volume() % label_dim * sequence_length * + batch_size == + 0); + assert(batch_label_domain.get_volume() % batch_size == 0); + coord_t start_idx = meta->idxs[0]; + size_t size_to_copy = 
+ (batch_label_domain.get_volume() / batch_size) * meta->num_samples; + int const *input_zc = + full_label_ptr + start_idx * label_dim * sequence_length; + copy_kernel<<>>( + batch_label_ptr, input_zc, size_to_copy); + checkCUDA(cudaDeviceSynchronize()); +} diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 8dc0842e98..0fd4b32d26 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -14,7 +14,6 @@ */ #include "moe.h" -#include "data_generator.h" #include "flexflow/inference.h" #include #include @@ -96,17 +95,7 @@ void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - // Inference parameters - int total_requests = - 256; // total number of requests processed as part of the simulation - int request_tensor_size = 4; // request tensor dimensions - bool poisson_distribution = true; - double lambda = 25; // average number of request arrivals per second - int num_requests_per_batch = 5; - int num_inflight_batches = 10; - - //----------------------------------------------------------------- - + //----------------------- Initial configurations ------------------------ MoeConfig moeConfig; FFConfig ffConfig; ffConfig.batchSize = moeConfig.batch_size; @@ -122,32 +111,37 @@ void FlexFlow::top_level_task(Task const *task, } FFModel ff(ffConfig); + //----------------------- Create inputs -------------------------------- Tensor input; { int const dims[] = { - ffConfig.batchSize, moeConfig.sequence_length, DATA_DIMS}; + ffConfig.batchSize, moeConfig.sequence_length, moeConfig.token_dim}; input = ff.create_tensor<3>(dims, DT_FLOAT); } - //----------------------------------------------------------------- - + //----------------------- Define the model ------------------------------ Tensor t = create_moe_encoder(&ff, &moeConfig, input); // Tensor t = create_moe(&ff, &moeConfig, input); - t = ff.dense(t, OUT_DIM, AC_MODE_RELU); + t = ff.dense(t, moeConfig.out_dim, AC_MODE_RELU); - InferenceManager im(&ff, num_requests_per_batch, num_inflight_batches); + //------------------- Initialize the inference manager ------------------ + InferenceManager im( + &ff, moeConfig.batch_size, moeConfig.num_inflight_batches); im.compile_model_and_allocate_buffer(); im.init_operators_inference(); - // Data Loader - /* ParallelTensor input_pt, label_pt; + //------------ Initialize the data loader and data generator ------------ + DataGenerator data_generator(moeConfig.total_requests, + moeConfig.token_dim, + moeConfig.sequence_length, + moeConfig.poisson_distribution, + moeConfig.arrival_rate); + ParallelTensor input_pt, label_pt; ff.get_parallel_tensor_from_tensor(input, input_pt); ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); - DataLoader data_loader(ff, moeConfig, input_pt, label_pt); */ + DataLoader data_loader(ff, moeConfig, data_generator, input_pt, label_pt); - //----------------------------------------------------------------- - - // Start timer + //----------------------- Start timer ----------------------------------- { runtime->issue_execution_fence(ctx); TimingLauncher timer(MEASURE_MICRO_SECONDS); @@ -156,34 +150,29 @@ void FlexFlow::top_level_task(Task const *task, } double ts_start = Realm::Clock::current_time_in_microseconds(); - /////////////////////////////////////////////////////////////////////////////////// - + //----------------------- Begin inference! 
------------------------------- int index = 0; int processed_requests = 0; int num_devices = ffConfig.workersPerNode * ffConfig.numNodes; - Generator data_generator( - total_requests, request_tensor_size, poisson_distribution, lambda); - - // data_loader.reset(); - while (processed_requests < total_requests) { - vector> req = data_generator.get_requests(); - int nreqs = req.size(); - int iterations = (nreqs % num_requests_per_batch == 0) - ? (nreqs / num_requests_per_batch) - : (nreqs / num_requests_per_batch) + 1; + data_loader.reset(); + data_generator.start_timer(); + while (processed_requests < moeConfig.total_requests) { + size_t received_requests = data_generator.get_requests(); + int iterations = (received_requests % moeConfig.batch_size == 0) + ? (received_requests / moeConfig.batch_size) + : (received_requests / moeConfig.batch_size) + 1; for (int iter = 0; iter < iterations; iter++) { - // data_loader.next_batch(ff); + data_loader.next_batch(ff, received_requests); runtime->begin_trace(ctx, 111 + index % num_devices /*trace_id*/); im.inference(index); runtime->end_trace(ctx, 111 + index % num_devices /*trace_id*/); index++; } - processed_requests += nreqs; + processed_requests += received_requests; } + //----------------------- End of inference! ------------------------------ - /////////////////////////////////////////////////////////////////////////////////// - - // End timer + //----------------------- Stop timer ------------------------------------- { runtime->issue_execution_fence(ctx); TimingLauncher timer(MEASURE_MICRO_SECONDS); @@ -192,350 +181,7 @@ void FlexFlow::top_level_task(Task const *task, } double ts_end = Realm::Clock::current_time_in_microseconds(); double run_time = 1e-6 * (ts_end - ts_start); - printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n", + printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f requests/s\n", run_time, - TRAIN_SAMPLES * ffConfig.epochs / run_time); -} - -DataLoader::DataLoader(FFModel &ff, - MoeConfig const &moe, - ParallelTensor input, - ParallelTensor label) { - num_samples = NUM_SAMPLES; - - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - - // Create full input - { - // Input has dimensions (batch_size, data_dims), which in legion ordering - // becomes (data_dims, batch_size). The corresponding parallel tensor will - // thus have dimensions (data_dims, batch_size, replica_dim). 
The dimensions - // of the full_input tensor can be obtained by replacing the batch_size with - // the num_samples: (data_dims, num_samples, replica_dim) - assert(input->num_dims == 3); // two dimensions + the replica dimension - batch_input = input; - - ParallelDim dims[3]; - for (int i = 0; i < 3; i++) { - dims[i].size = input->dims[i].size; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = input->dims[i].is_replica_dim; - // Assume only the first dim can be the replica dim - assert(i == 2 || (!dims[i].is_replica_dim)); - } - dims[1].size = num_samples; - - full_input = ff.create_parallel_tensor_legion_ordering(3, dims, DT_FLOAT); - ff.map_tensor(full_input, NULL /*parallel_op*/); - } - - // Create full label - { - assert(label->num_dims == LABEL_DIM + 2); - batch_label = label; - - ParallelDim dims[LABEL_DIM + 2]; - for (int i = 0; i < LABEL_DIM + 2; i++) { - dims[i].size = label->dims[i].size; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = label->dims[i].is_replica_dim; - // Assume only the last dim can be the replica dim - assert(i == LABEL_DIM + 1 || (!dims[i].is_replica_dim)); - } - assert(dims[LABEL_DIM].size == ff.config.batchSize); - // replace batch size with number of samples - dims[LABEL_DIM].size = num_samples; - - full_label = ff.create_parallel_tensor_legion_ordering( - LABEL_DIM + 2, dims, DT_INT32); - ff.map_tensor(full_label, NULL /*parallel_op*/); - } - - // Load entire dataset - // TODO: Use index launcher instead of task launcher - assert(full_input != nullptr && "full_input is nullptr"); - - MoeConfig const *ptr = &moe; - TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, - TaskArgument(&ptr, sizeof(MoeConfig *))); - // regions[0]: full_input - launcher.add_region_requirement(RegionRequirement(full_input->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - // regions[1]: full_label - launcher.add_region_requirement(RegionRequirement(full_label->region, - WRITE_ONLY, - EXCLUSIVE, - full_label->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(1, FID_DATA); - - runtime->execute_task(ctx, launcher); - reset(); - next_batch(ff); -} - -// ================================================= -// Load data -// ================================================= - -void read_cifar100(float *input_ptr, int *label_ptr) { - std::ifstream file; - file.open("train.bin", std::ios::in | std::ios::binary | std::ios::ate); - if (!file) { - std::cout << "Error opening CIFAR100 train data file" << std::endl; - assert(false); - } - - file.seekg(0, std::ios::beg); - - // each sample: <1 x coarse label><1 x fine label><3072 x pixel> - for (std::size_t i = 0; i < NUM_SAMPLES; i++) { - unsigned char temp = 0; - file.read((char *)&temp, sizeof(temp)); // coarse label, skip - file.read((char *)&temp, sizeof(temp)); - label_ptr[i] = temp; - for (std::size_t j = 0; j < 3072; ++j) { - file.read((char *)&temp, sizeof(temp)); - input_ptr[i * 3072 + j] = (float)temp / 255.0f; - } - } - - file.close(); -} - -int reverseInt(int i) { - unsigned char c1, c2, c3, c4; - - c1 = i & 255; - c2 = (i >> 8) & 255; - c3 = (i >> 16) & 255; - c4 = (i >> 24) & 255; - - return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; -} - -/* NOTE: Download files from http://yann.lecun.com/exdb/mnist/ and unpack to -the current working directory */ -void read_mnist(float *input_ptr, int *label_ptr) { - // read inputs - std::ifstream input("train-images-idx3-ubyte", std::ios::binary); - if (input.is_open()) 
{ - int magic_number = 0; - int number_of_images = 0; - int n_rows = 0; - int n_cols = 0; - input.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - input.read((char *)&number_of_images, sizeof(number_of_images)); - number_of_images = reverseInt(number_of_images); - input.read((char *)&n_rows, sizeof(n_rows)); - n_rows = reverseInt(n_rows); - input.read((char *)&n_cols, sizeof(n_cols)); - n_cols = reverseInt(n_cols); - - for (int i = 0; i < number_of_images; i++) { - for (int r = 0; r < n_rows; r++) { - for (int c = 0; c < n_cols; c++) { - unsigned char temp = 0; - input.read((char *)&temp, sizeof(temp)); - input_ptr[i * n_rows * n_cols + r * n_cols + c] = - (float)temp / 255.0f; - } - } - } - } else { - std::cout << "Error opening MNIST input data file" << std::endl; - assert(false); - } - - // read labels - std::ifstream labels("train-labels-idx1-ubyte", std::ios::binary); - if (labels.is_open()) { - int magic_number = 0; - int number_of_images = 0; - labels.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - labels.read((char *)&number_of_images, sizeof(number_of_images)); - number_of_images = reverseInt(number_of_images); - - for (int i = 0; i < number_of_images; i++) { - unsigned char temp = 0; - labels.read((char *)&temp, sizeof(temp)); - label_ptr[i] = temp; - } - } else { - std::cout << "Error opening MNIST label data file" << std::endl; - assert(false); - } -} - -void DataLoader::load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // const MoeConfig* conf = *((MoeConfig**)task->args); - assert(regions.size() == 2); - assert(task->regions.size() == regions.size()); - - // get input and label pointer - AccessorWO const acc_input(regions[0], FID_DATA); - AccessorWO const acc_label(regions[1], FID_DATA); - Rect<3> rect_input = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - assert(acc_input.accessor.is_dense_arbitrary(rect_input)); - Rect rect_label = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(acc_label.accessor.is_dense_arbitrary(rect_label)); - float *input_ptr = acc_input.ptr(rect_input.lo); - int *label_ptr = acc_label.ptr(rect_label.lo); - int num_samples = rect_input.hi[1] - rect_input.lo[1] + 1; - assert(rect_label.hi[1] - rect_label.lo[1] + 1 == num_samples); - - // here, you can call `read_cifar100(input_ptr, label_ptr);` instead or load - // another dataset using the dataset_path from the MoeConfig object - read_mnist(input_ptr, label_ptr); - log_app.print("finish loading MNIST data\n"); -} - -void DataLoader::next_batch(FFModel &ff) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - // Load input - { - Domain domain = - runtime->get_index_space_domain(ctx, batch_input->parallel_is); - ArgumentMap argmap; - int idx = next_index; - // current limitation of the dataloader: only the batch dimension can be - // partitioned - int input_dims = batch_input->num_dims; - for (int i = 0; i < input_dims; i++) { - if (i != input_dims - 2) { - assert(batch_input->dims[i].degree == 1 && - "Dataloader only supports batch size partitions"); - } - } - int batch_size = batch_input->dims[input_dims - 2].size; - int n_partitions = batch_input->dims[input_dims - 2].degree; - assert(ff.config.batchSize % batch_size == 0); - assert(batch_size % n_partitions == 0); - for (Domain::DomainPointIterator it(domain); it; it++) { - SampleIdxs meta; - 
meta.num_samples = batch_size / n_partitions; - for (int i = 0; i < meta.num_samples; i++) { - meta.idxs[i] = idx++; - } - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, - batch_input->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_input->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(full_input->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_input->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - // Load label - { - Domain domain = - runtime->get_index_space_domain(ctx, batch_label->parallel_is); - ArgumentMap argmap; - int idx = next_index; - // current limitation of the dataloader: only the batch dimension can be - // partitioned - int label_dims = batch_label->num_dims; - assert(batch_label->dims[label_dims - 1].degree == 1); - for (int i = 0; i < LABEL_DIM; i++) { - assert(batch_label->dims[i].degree == 1 && - "Dataloader only supports batch size partitions"); - } - int batch_size = batch_label->dims[label_dims - 2].size; - int n_partitions = batch_label->dims[label_dims - 2].degree; - assert(ff.config.batchSize % batch_size == 0); - assert(batch_size % n_partitions == 0); - for (Domain::DomainPointIterator it(domain); it; it++) { - SampleIdxs meta; - meta.num_samples = batch_size / n_partitions; - for (int i = 0; i < meta.num_samples; i++) { - meta.idxs[i] = idx++; - } - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, - batch_label->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_label->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(full_label->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_label->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_label->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_label->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - next_index += ff.config.batchSize; -} - -void DataLoader::reset() { - next_index = 0; -} - -void FlexFlow::register_custom_tasks() { - // Load entire dataset - { - TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Entire Dataset Task"); - } - // Load input - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Input Task"); - } - // Load label - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_2, "Load Labels"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Label Task"); - } + moeConfig.total_requests / run_time); } diff --git a/examples/cpp/inference/mixture_of_experts/moe.cu b/examples/cpp/inference/mixture_of_experts/moe.cu deleted file mode 
100644 index ae1e5aca30..0000000000 --- a/examples/cpp/inference/mixture_of_experts/moe.cu +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/utils/cuda_helper.h" -#include "moe.h" - -void DataLoader::load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - SampleIdxs *meta = (SampleIdxs *)task->local_args; - TensorAccessorR acc_full_input( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_batch_input(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - - coord_t batch_size = - acc_batch_input.rect.hi[1] - acc_batch_input.rect.lo[1] + 1; - coord_t sample_dim = - acc_batch_input.rect.hi[0] - acc_batch_input.rect.lo[0] + 1; - - // FIXME: currently assume continous indices - assert(batch_size == meta->num_samples); - for (int i = 1; i < batch_size; i++) { - assert(meta->idxs[i] == meta->idxs[0] + i); - } - coord_t start_idx = meta->idxs[0]; - float const *input_zc = acc_full_input.ptr + start_idx * sample_dim; - copy_kernel<<>>( - acc_batch_input.ptr, input_zc, acc_batch_input.rect.volume()); - checkCUDA(cudaDeviceSynchronize()); -} - -void DataLoader::load_label(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - SampleIdxs *meta = (SampleIdxs *)task->local_args; - TensorAccessorR acc_full_label( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_batch_label(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - coord_t batch_size = - acc_batch_label.rect.hi[1] - acc_batch_label.rect.lo[1] + 1; - // FIXME: currently assume continous indices - assert(batch_size == meta->num_samples); - for (int i = 1; i < meta->num_samples; i++) { - assert(meta->idxs[i] == meta->idxs[0] + i); - } - int const *input_zc = acc_full_label.ptr + meta->idxs[0]; - copy_kernel<<>>( - acc_batch_label.ptr, input_zc, acc_batch_label.rect.volume()); - checkCUDA(cudaDeviceSynchronize()); -} diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index 2df988f530..56feb775e8 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -13,16 +13,11 @@ * limitations under the License. 
*/ +#include "data_generator.h" #include "flexflow/model.h" #define MAX_NUM_SAMPLES 1000 -#define NUM_SAMPLES 1000 -#define TRAIN_SAMPLES 1000 -#define TEST_SAMPLES 00000 #define MNIST_DIMS 28 * 28 -#define CIFAR_DIMS 3 * 32 * 32 -#define DATA_DIMS MNIST_DIMS -#define OUT_DIM 10 -#define LABEL_DIM 1 +#define DATA_DIM MNIST_DIMS using namespace Legion; using namespace std; @@ -30,45 +25,69 @@ using namespace FlexFlow; struct MoeConfig { MoeConfig(void) { - // MoE layer + //----------------------- Input/output data ------------------------ + token_dim = DATA_DIM; + sequence_length = 10; + batch_size = 32; + out_dim = 15; + num_labels = out_dim; + //----------------------- Inference parameters --------------------- + // total number of requests processed as part of the simulation + total_requests = 256; + poisson_distribution = true; + // average number of request arrivals per second + arrival_rate = 25; + num_inflight_batches = 10; + //----------------------- MoE layer -------------------------------- // total number of experts num_exp = 128; // number of experts in each block of fused experts experts_per_block = 32; // number of experts to route each token to num_select = 2; - alpha = 2.0f; - lambda = 0.04f; - hidden_size = DATA_DIMS; - batch_size = 32; - sequence_length = 10; + // expert capacity parameters + alpha = 2.0f; // factor overhead tensor size for imbalance + lambda = 0.04f; // multiplier for load balance term + // expert hidden size + hidden_size = DATA_DIM; + //----------------------- Rest of model parameters ------------------ // Encoder layer num_attention_heads = 16; attention_kdim = attention_vdim = hidden_size / num_attention_heads; - num_encoder_layers = 1; // + num_encoder_layers = 1; } + + // Input/output data + int token_dim; + int sequence_length; + int batch_size; + int out_dim; + int num_labels; + std::string dataset_path; + // Inference parameters + int total_requests; + bool poisson_distribution; + double arrival_rate; + int num_inflight_batches; // MoE layer int num_exp; - int num_select; int experts_per_block; - float alpha; // factor overhead tensor size for imbalance - float lambda; // multiplier for load balance term + int num_select; + float alpha; + float lambda; int hidden_size; - int batch_size; - int sequence_length; - // Encoder layer + // Model parameters int num_attention_heads; int attention_kdim; int attention_vdim; int num_encoder_layers; - // Dataset - std::string dataset_path; }; class DataLoader { public: DataLoader(FFModel &ff, - MoeConfig const &moe, + MoeConfig const &moeConfig, + DataGenerator &data_generator, ParallelTensor input, ParallelTensor label); static void load_input(Task const *task, @@ -83,13 +102,17 @@ class DataLoader { std::vector const ®ions, Context ctx, Runtime *runtime); - void next_batch(FFModel &); + void next_batch(FFModel &, size_t); void reset(void); public: int num_samples, next_index; FlexFlow::ParallelTensor full_input, batch_input; FlexFlow::ParallelTensor full_label, batch_label; + struct DataLoaderInput { + MoeConfig const &_moeConfig; + DataGenerator &_data_generator; + }; }; struct SampleIdxs { diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index 01766347b0..f78311980c 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -122,19 +122,17 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_grad_volume, 0, scale_factor); } -void Loss::identity_loss_backward_kernel_wrapper( - float 
*loss_grad_ptr, - float const *loss_ptr, - size_t loss_volume, - size_t loss_grad_volume, - float scale_factor) { +void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, + float const *loss_ptr, + size_t loss_volume, + size_t loss_grad_volume, + float scale_factor) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); identity_loss_backward<<>>( - loss_grad_ptr, loss_ptr, loss_volume); + stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( loss_grad_ptr, loss_grad_volume, 0, scale_factor); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index e3e4bd9b07..541cf34976 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -148,6 +148,10 @@ void InferenceManager::inference(int index) { for (int i = 0; i < op->numOutputs; i++) { assert(op->outputs[i] != nullptr); assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + if (op->op_type == OP_INPUT && + tensor_buffer[op->outputs[i]].size() == 0) { + continue; + } assert(tensor_buffer[op->outputs[i]].size() > batch_index); outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 2c4b189046..3645544b44 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -45,6 +45,8 @@ if [[ -f "$FF_HOME/build/examples/cpp/AlexNet/alexnet" ]]; then # TODO: fix split tests # "$FF_HOME"/build/examples/cpp/split_test/split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # "$FF_HOME"/build/examples/cpp/split_test_2/split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel + # Inference examples + "$FF_HOME"/build/examples/cpp/inference/mixture_of_experts/inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel else python_packages=$(python -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))") OLD_PATH="$PATH" @@ -73,6 +75,8 @@ else # TODO: fix split tests # split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel + # Inference examples + inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel fi done export PATH="$OLD_PATH" From 0315bbfdc79a9cf6870589ea85e60846e67fd11b Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 24 Feb 2023 16:27:29 +0000 Subject: [PATCH 061/344] [IncMultiHeadSelfAttention] initial implementation --- include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 19 + include/flexflow/operator_params.h | 2 + .../ops/inc_multihead_self_attention.h | 129 ++++ .../ops/inc_multihead_self_attention_params.h | 30 + src/ops/inc_multihead_self_attention.cc | 692 ++++++++++++++++++ src/ops/inc_multihead_self_attention.cu | 267 +++++++ src/runtime/model.cc | 26 + 8 files changed, 1166 insertions(+) create mode 100644 include/flexflow/ops/inc_multihead_self_attention.h create mode 100644 include/flexflow/ops/inc_multihead_self_attention_params.h create mode 100644 src/ops/inc_multihead_self_attention.cc create mode 100644 src/ops/inc_multihead_self_attention.cu diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index ee0b007a8e..2ccf4a0eb1 100644 --- 
a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -146,6 +146,7 @@ enum OperatorType { OP_LAYERNORM, OP_EXPERTS, OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html + OP_INC_MULTIHEAD_SELF_ATTENTION, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 2c460e507a..041d8c507d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -136,6 +136,10 @@ enum TaskIDs { ATTENTION_INIT_TASK_ID, ATTENTION_FWD_TASK_ID, ATTENTION_BWD_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, @@ -269,6 +273,7 @@ class Group_by; class LayerNorm; class Linear; class MultiHeadAttention; +class IncMultiHeadSelfAttention; class Pool2D; class Reduce; class Reshape; @@ -564,6 +569,17 @@ class FFModel { bool add_zero_attn = false, Initializer *kernel_initializer = NULL, char const *name = NULL); + Tensor inc_multihead_self_attention(const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = true, + bool add_bias_kv = false, + bool add_zero_attn = false, + Initializer *kernel_initializer = NULL, + char const *name = NULL); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], DataType data_type, @@ -910,6 +926,9 @@ class FFModel { ParallelTensorShape>, MultiHeadAttentionParams>, MultiHeadAttention *>, + std::unordered_map< + std::pair, + IncMultiHeadSelfAttention *>, std::unordered_map, Reduce *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 0297f690f5..899921a758 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -16,6 +16,7 @@ #include "flexflow/ops/flat_params.h" #include "flexflow/ops/gather_params.h" #include "flexflow/ops/groupby_params.h" +#include "flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" #include "flexflow/ops/pool_2d_params.h" @@ -52,6 +53,7 @@ using OperatorParameters = mp::variant const &inputs); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + bool get_int_parameter(PMParameter, int *) const override; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const override; + static void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + ffStream_t stream); + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr); + Params get_params() const; +public: 
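A hedged usage sketch of the new FFModel::inc_multihead_self_attention entry point declared in model.h above. The helper name and dimension choices are illustrative only and not part of this patch; per-head projection sizes follow the MoeConfig convention of kdim = vdim = hidden_size / num_attention_heads.

#include "flexflow/model.h"
using namespace FlexFlow;

// hypothetical helper: one incremental-decoding self-attention layer over a
// (batch, seq_len, hidden) float tensor created elsewhere in top_level_task
Tensor build_self_attention(FFModel &ff, Tensor input,
                            int hidden_size, int num_heads) {
  int head_dim = hidden_size / num_heads; // per-head key/value projection size
  return ff.inc_multihead_self_attention(input,
                                         hidden_size /*embed_dim*/,
                                         num_heads,
                                         head_dim /*kdim*/,
                                         head_dim /*vdim*/);
}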
+ int num_heads; + float dropout; + bool bias; + bool add_bias_kv, add_zero_attn; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int qoSeqLength, kvSeqLength; +}; + +class IncMultiHeadSelfAttentionMeta : public OpMeta { +public: + IncMultiHeadSelfAttentionMeta(FFHandler handler, + IncMultiHeadSelfAttention const *attn, + Legion::Memory gpu_mem, + int num_samples, + int num_heads); + ~IncMultiHeadSelfAttentionMeta(void); + +public: + Realm::RegionInstance reserveInst; + size_t weightSize, reserveSpaceSize; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cudnnAttnDescriptor_t attnDesc; + cudnnSeqDataDescriptor_t qDesc, kDesc, vDesc, oDesc; +#endif + int *devQoSeqArray, *devKvSeqArray, *loWinIdx, *hiWinIdx; + void *reserveSpace; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_ATTENTION_H diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h new file mode 100644 index 0000000000..e7535dc23d --- /dev/null +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H +#define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H + +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct IncMultiHeadSelfAttentionParams { + LayerID layer_guid; + int embed_dim, num_heads, kdim, vdim; + float dropout; + bool bias, add_bias_kv, add_zero_attn; + + bool is_valid(ParallelTensorShape const &) const; +}; + +bool operator==(IncMultiHeadSelfAttentionParams const &, + IncMultiHeadSelfAttentionParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::IncMultiHeadSelfAttentionParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc new file mode 100644 index 0000000000..329254fb74 --- /dev/null +++ b/src/ops/inc_multihead_self_attention.cc @@ -0,0 +1,692 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +bool IncMultiHeadSelfAttentionParams::is_valid(ParallelTensorShape const &input) const { + bool is_valid = input.is_valid(); + return is_valid; +} + +Tensor FFModel::inc_multihead_self_attention(const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + Initializer *kernel_initializer, + char const *name) { + // Currently assume that + Layer *li = new Layer(this, + OP_INC_MULTIHEAD_SELF_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = embed_dim; + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); + } + { + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); + int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; + li->weights[0] = create_weight_legion_ordering(2, + dims, + DT_FLOAT, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->data_type = DT_FLOAT; + li->add_int_property("embed_dim", embed_dim); + li->add_int_property("num_heads", num_heads); + li->add_int_property("kdim", kdim); + li->add_int_property("vdim", vdim); + li->add_int_property("bias", bias); + li->add_int_property("add_bias_kv", add_bias_kv); + li->add_int_property("add_zero_attn", add_zero_attn); + li->add_float_property("dropout", dropout); + layers.push_back(li); + return li->outputs[0]; +} + +Op *IncMultiHeadSelfAttention::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("embed_dim", value); + int embed_dim = value; + layer->get_int_property("num_heads", value); + int num_heads = value; + layer->get_int_property("kdim", value); + int kdim = value; + layer->get_int_property("vdim", value); + int vdim = value; + float dropout; + layer->get_float_property("dropout", dropout); + layer->get_int_property("bias", value); + bool bias = (bool)value; + layer->get_int_property("add_bias_kv", value); + bool add_bias_kv = (bool)value; + layer->get_int_property("add_zero_attn", value); + bool add_zero_attn = (bool)value; + return new IncMultiHeadSelfAttention(model, + layer->layer_guid, + inputs[0], + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + false /*allocate_weights*/, + layer->name); +} + +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_MULTIHEAD_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + _input), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) +{ + // overwrite layer_guid + layer_guid = _layer_guid; + + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_heads; + dims[2].size = qParas + kParas + vParas + oParas; + dims[2].degree = 1; + dims[2].parallel_idx = -1; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, DT_FLOAT, this); + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* // Check correctness */ + /* assert(check_output_input_weight_parallel_dims()); */ +} + +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_INC_MULTIHEAD_SELF_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + _input, + _weight), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) +// bias_initializer(_bias_initializer) +{ + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_heads; + dims[2].size = qParas + kParas + vParas + oParas; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, DT_FLOAT, this); + + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ + /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ + // Check correctness + /* assert(check_output_input_weight_parallel_dims()); */ +} + +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, + IncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights) + : IncMultiHeadSelfAttention(model, + other.layer_guid, + input, + other.oProjSize, + other.num_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.bias, + other.add_bias_kv, + other.add_zero_attn, + allocate_weights, + other.name) {} + +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( + FFModel &model, + IncMultiHeadSelfAttentionParams const ¶ms, + ParallelTensor const &input, + bool allocate_weights, + char const *name) + : IncMultiHeadSelfAttention(model, + params.layer_guid, + input, + params.embed_dim, + params.num_heads, + params.kdim, + params.vdim, + params.dropout, + params.bias, + params.add_bias_kv, + params.add_zero_attn, + allocate_weights, + name) {} + +void IncMultiHeadSelfAttention::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(IncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, view); +} + +void IncMultiHeadSelfAttention::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(IncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): input + regions[1](I): weight + regions[2](O): output +*/ +OpMeta * + IncMultiHeadSelfAttention::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + IncMultiHeadSelfAttention const *attn = (IncMultiHeadSelfAttention *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; + assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + 
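// This Realm MemoryQuery selects the GPU framebuffer memory with the best
// affinity to the processor running init_task; the
// IncMultiHeadSelfAttentionMeta constructor allocates its sequence-length
// arrays and cuDNN reserve space out of this memory.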
.first(); + IncMultiHeadSelfAttentionMeta *m = + new IncMultiHeadSelfAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads); + m->profiling = attn->profiling; + assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); + return m; +} + +void IncMultiHeadSelfAttention::forward(FFModel const &ff) { + // IncMultiHeadSelfAttention doesn't support forward + assert(false); +} + +void IncMultiHeadSelfAttention::inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "IncMultiHeadSelfAttention op machine_view: " << *(MachineView const + *)mv + << std::endl; */ + int idx = 0; + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttention::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 3); + assert(task->regions.size() == regions.size()); + // const IncMultiHeadSelfAttention* attn = (IncMultiHeadSelfAttention*) task->args; + IncMultiHeadSelfAttentionMeta const *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + IncMultiHeadSelfAttention::inference_kernel_wrapper(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr()); +} + +void IncMultiHeadSelfAttention::backward(FFModel const &ff) { + // IncMultiHeadSelfAttention does not support backward + assert(false); +} + +bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_NUM_HEADS: + *value = num_heads; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool IncMultiHeadSelfAttention::measure_operator_cost( + Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { + ParallelTensorBase sub_output, sub_input; + if (!inputs[0]->get_sub_tensor(mv, sub_input)) { + return false; + } + if (!outputs[0]->get_sub_tensor(mv, sub_output)) { + return false; + } 
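A worked numeric example (illustrative values, not taken from the patch) of the per-head weight packing used throughout this operator: each head stores its Q, K, and V projection matrices plus its slice of the output projection in one contiguous block of qParas + kParas + vParas + oParas floats.

#include <cstdio>

int main() {
  // illustrative sizes: 1024-dim tokens, 16 heads, 64-dim per-head projections
  long qSize = 1024, kSize = 1024, vSize = 1024;
  long qProjSize = 64, kProjSize = 64, vProjSize = 64, oProjSize = 1024;
  long num_heads = 16;
  long qParas = qProjSize * qSize;                               // 65536
  long kParas = kProjSize * kSize;                               // 65536
  long vParas = vProjSize * vSize;                               // 65536
  long oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); // 65536
  long per_head = qParas + kParas + vParas + oParas;             // 262144
  long total = per_head * num_heads;                             // 4194304
  std::printf("weight tensor: %ld x %ld = %ld floats (%.1f MiB)\n",
              per_head, num_heads, total, total * 4.0 / (1 << 20));
  return 0;
}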
+ // Currently assume only data parallel + size_t num_weights = 0; + { + // Compute weight size + int qSize = sub_input.dims[0].size; + int kSize = sub_input.dims[0].size; + int vSize = sub_input.dims[0].size; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + num_weights = num_heads * (qParas + kParas + vParas + oParas); + } + assert(sub_input.num_dims == 4); + int num_samples = sub_input.dims[2].size; + + IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( + sim->handler, this, sim->memory, num_samples, num_heads); + + // allocate tensors in simulator + sim->free_all(); + float const *input_ptr = + (float const *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(output_ptr != NULL); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float const *weight_ptr = (float const *)sim->allocate(num_weights, DT_FLOAT); + cost_metrics.weights_memory += cost_metrics.total_mem_diff_from(sim->offset); + + assert(m->profiling == false); + + std::function forward, backward; + forward = [&] { + inference_kernel_wrapper( + m, input_ptr, weight_ptr, output_ptr); + }; + if (sim->computationMode == COMP_MODE_TRAINING) { + // IncMultiHeadSelfAttention does not support training + assert(false); + } + + inner_measure_operator_cost(sim, forward, backward, cost_metrics); + + if (sim->computationMode == COMP_MODE_TRAINING) { + printf("[Measure IncMultiHeadSelfAttention] query(%d %d %d) key(%d %d %d) " + "value(%d %d %d) output(%d %d %d)" + "forward_time(%.4lf) backward_time(%.4lf)\n", + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_output.dims[2].size, + sub_output.dims[1].size, + sub_output.dims[0].size, + cost_metrics.forward_time, + cost_metrics.backward_time); + } else { + printf("[Measure IncMultiHeadSelfAttention] query(%d %d %d) key(%d %d %d) " + "value(%d %d %d) output(%d %d %d)" + "forward_time(%.4lf)\n", + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_output.dims[2].size, + sub_output.dims[1].size, + sub_output.dims[0].size, + cost_metrics.forward_time); + } + // Free multiheadattentionmeta + delete m; + return true; +} + +using PCG::Node; + +bool operator==(IncMultiHeadSelfAttentionParams const &lhs, + IncMultiHeadSelfAttentionParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && + lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && + lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && + lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && + lhs.add_zero_attn == rhs.add_zero_attn; +} + +IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { + IncMultiHeadSelfAttentionParams params; + params.layer_guid = this->layer_guid; + params.embed_dim = this->oProjSize; + params.num_heads = this->num_heads; + params.kdim = this->kProjSize; + params.vdim = this->vProjSize; + params.dropout = this->dropout; + params.bias 
registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); From 8c0fe63cd2c4d0c33ba652b2b5030743a57872c9 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 27 Feb 2023 09:43:46 -0600 Subject: [PATCH 062/344] [IncMultiHeadSelfAttention] initial implementation (#639) --- include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 19 + include/flexflow/operator_params.h | 2 + .../ops/inc_multihead_self_attention.h | 129 ++++ .../ops/inc_multihead_self_attention_params.h | 30 + src/ops/inc_multihead_self_attention.cc | 692 ++++++++++++++++++ src/ops/inc_multihead_self_attention.cu | 267 +++++++ src/runtime/model.cc | 26 + 8 files changed, 1166 insertions(+) create mode 100644 include/flexflow/ops/inc_multihead_self_attention.h create mode 100644 include/flexflow/ops/inc_multihead_self_attention_params.h create mode 100644 src/ops/inc_multihead_self_attention.cc create mode 100644 src/ops/inc_multihead_self_attention.cu diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index ee0b007a8e..2ccf4a0eb1 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -146,6 +146,7 @@ enum OperatorType { OP_LAYERNORM, OP_EXPERTS, OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html + OP_INC_MULTIHEAD_SELF_ATTENTION, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 2c460e507a..041d8c507d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -136,6 +136,10 @@ enum TaskIDs { ATTENTION_INIT_TASK_ID, ATTENTION_FWD_TASK_ID, ATTENTION_BWD_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, @@ -269,6 +273,7 @@ class Group_by; class LayerNorm; class Linear; class MultiHeadAttention; +class IncMultiHeadSelfAttention; class Pool2D; class Reduce; class Reshape; @@ -564,6 +569,17 @@ class FFModel { bool add_zero_attn = false, Initializer *kernel_initializer = NULL, char const *name = NULL); + Tensor inc_multihead_self_attention(const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = true, + bool add_bias_kv = false, + bool add_zero_attn = false, + Initializer *kernel_initializer = NULL, + char const *name = NULL); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], DataType data_type, @@ -910,6 +926,9 @@ class FFModel { ParallelTensorShape>, MultiHeadAttentionParams>, MultiHeadAttention *>, + std::unordered_map< + std::pair, + IncMultiHeadSelfAttention *>, std::unordered_map, Reduce *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 0297f690f5..899921a758 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -16,6 +16,7 @@ #include "flexflow/ops/flat_params.h" #include "flexflow/ops/gather_params.h" #include "flexflow/ops/groupby_params.h" +#include "flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" #include "flexflow/ops/pool_2d_params.h" @@ -52,6 +53,7 @@ using OperatorParameters = mp::variant const &inputs); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void 
backward(FFModel const &) override; + void inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + bool get_int_parameter(PMParameter, int *) const override; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const override; + static void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + ffStream_t stream); + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr); + Params get_params() const; +public: + int num_heads; + float dropout; + bool bias; + bool add_bias_kv, add_zero_attn; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int qoSeqLength, kvSeqLength; +}; + +class IncMultiHeadSelfAttentionMeta : public OpMeta { +public: + IncMultiHeadSelfAttentionMeta(FFHandler handler, + IncMultiHeadSelfAttention const *attn, + Legion::Memory gpu_mem, + int num_samples, + int num_heads); + ~IncMultiHeadSelfAttentionMeta(void); + +public: + Realm::RegionInstance reserveInst; + size_t weightSize, reserveSpaceSize; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cudnnAttnDescriptor_t attnDesc; + cudnnSeqDataDescriptor_t qDesc, kDesc, vDesc, oDesc; +#endif + int *devQoSeqArray, *devKvSeqArray, *loWinIdx, *hiWinIdx; + void *reserveSpace; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_ATTENTION_H diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h new file mode 100644 index 0000000000..e7535dc23d --- /dev/null +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H +#define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H + +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct IncMultiHeadSelfAttentionParams { + LayerID layer_guid; + int embed_dim, num_heads, kdim, vdim; + float dropout; + bool bias, add_bias_kv, add_zero_attn; + + bool is_valid(ParallelTensorShape const &) const; +}; + +bool operator==(IncMultiHeadSelfAttentionParams const &, + IncMultiHeadSelfAttentionParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::IncMultiHeadSelfAttentionParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc new file mode 100644 index 0000000000..329254fb74 --- /dev/null +++ b/src/ops/inc_multihead_self_attention.cc @@ -0,0 +1,692 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +bool IncMultiHeadSelfAttentionParams::is_valid(ParallelTensorShape const &input) const { + bool is_valid = input.is_valid(); + return is_valid; +} + +Tensor FFModel::inc_multihead_self_attention(const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + Initializer *kernel_initializer, + char const *name) { + // Currently assume that + Layer *li = new Layer(this, + OP_INC_MULTIHEAD_SELF_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = embed_dim; + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); + } + { + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); + int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; + li->weights[0] = create_weight_legion_ordering(2, + dims, + DT_FLOAT, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->data_type = DT_FLOAT; + li->add_int_property("embed_dim", embed_dim); + li->add_int_property("num_heads", num_heads); + li->add_int_property("kdim", kdim); + li->add_int_property("vdim", vdim); + li->add_int_property("bias", bias); + li->add_int_property("add_bias_kv", add_bias_kv); + li->add_int_property("add_zero_attn", add_zero_attn); + li->add_float_property("dropout", dropout); + layers.push_back(li); + return li->outputs[0]; +} + +Op *IncMultiHeadSelfAttention::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("embed_dim", value); + int embed_dim = value; + layer->get_int_property("num_heads", value); + int num_heads = value; + layer->get_int_property("kdim", value); + int kdim = value; + layer->get_int_property("vdim", value); + int vdim = value; + float dropout; + layer->get_float_property("dropout", dropout); + layer->get_int_property("bias", value); + bool bias = (bool)value; + layer->get_int_property("add_bias_kv", value); + bool add_bias_kv = (bool)value; + layer->get_int_property("add_zero_attn", value); + bool add_zero_attn = (bool)value; + return new IncMultiHeadSelfAttention(model, + layer->layer_guid, + inputs[0], + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + false /*allocate_weights*/, + layer->name); +} + +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_MULTIHEAD_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + _input), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) +{ + // overwrite layer_guid + layer_guid = _layer_guid; + + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_heads; + dims[2].size = qParas + kParas + vParas + oParas; + dims[2].degree = 1; + dims[2].parallel_idx = -1; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, DT_FLOAT, this); + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* // Check correctness */ + /* assert(check_output_input_weight_parallel_dims()); */ +} + +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_INC_MULTIHEAD_SELF_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + _input, + _weight), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) +// bias_initializer(_bias_initializer) +{ + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_heads; + dims[2].size = qParas + kParas + vParas + oParas; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, DT_FLOAT, this); + + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ + /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ + // Check correctness + /* assert(check_output_input_weight_parallel_dims()); */ +} + +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, + IncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights) + : IncMultiHeadSelfAttention(model, + other.layer_guid, + input, + other.oProjSize, + other.num_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.bias, + other.add_bias_kv, + other.add_zero_attn, + allocate_weights, + other.name) {} + +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( + FFModel &model, + IncMultiHeadSelfAttentionParams const ¶ms, + ParallelTensor const &input, + bool allocate_weights, + char const *name) + : IncMultiHeadSelfAttention(model, + params.layer_guid, + input, + params.embed_dim, + params.num_heads, + params.kdim, + params.vdim, + params.dropout, + params.bias, + params.add_bias_kv, + params.add_zero_attn, + allocate_weights, + name) {} + +void IncMultiHeadSelfAttention::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(IncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, view); +} + +void IncMultiHeadSelfAttention::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(IncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): input + regions[1](I): weight + regions[2](O): output +*/ +OpMeta * + IncMultiHeadSelfAttention::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + IncMultiHeadSelfAttention const *attn = (IncMultiHeadSelfAttention *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; + assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + 
.first(); + IncMultiHeadSelfAttentionMeta *m = + new IncMultiHeadSelfAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads); + m->profiling = attn->profiling; + assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); + return m; +} + +void IncMultiHeadSelfAttention::forward(FFModel const &ff) { + // IncMultiHeadSelfAttention doesn't support forward + assert(false); +} + +void IncMultiHeadSelfAttention::inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "IncMultiHeadSelfAttention op machine_view: " << *(MachineView const + *)mv + << std::endl; */ + int idx = 0; + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttention::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 3); + assert(task->regions.size() == regions.size()); + // const IncMultiHeadSelfAttention* attn = (IncMultiHeadSelfAttention*) task->args; + IncMultiHeadSelfAttentionMeta const *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + IncMultiHeadSelfAttention::inference_kernel_wrapper(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr()); +} + +void IncMultiHeadSelfAttention::backward(FFModel const &ff) { + // IncMultiHeadSelfAttention does not support backward + assert(false); +} + +bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_NUM_HEADS: + *value = num_heads; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool IncMultiHeadSelfAttention::measure_operator_cost( + Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { + ParallelTensorBase sub_output, sub_input; + if (!inputs[0]->get_sub_tensor(mv, sub_input)) { + return false; + } + if (!outputs[0]->get_sub_tensor(mv, sub_output)) { + return false; + } 
+ // Currently assume only data parallel + size_t num_weights = 0; + { + // Compute weight size + int qSize = sub_input.dims[0].size; + int kSize = sub_input.dims[0].size; + int vSize = sub_input.dims[0].size; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + num_weights = num_heads * (qParas + kParas + vParas + oParas); + } + assert(sub_input.num_dims == 4); + int num_samples = sub_input.dims[2].size; + + IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( + sim->handler, this, sim->memory, num_samples, num_heads); + + // allocate tensors in simulator + sim->free_all(); + float const *input_ptr = + (float const *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + assert(output_ptr != NULL); + cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + + float const *weight_ptr = (float const *)sim->allocate(num_weights, DT_FLOAT); + cost_metrics.weights_memory += cost_metrics.total_mem_diff_from(sim->offset); + + assert(m->profiling == false); + + std::function forward, backward; + forward = [&] { + inference_kernel_wrapper( + m, input_ptr, weight_ptr, output_ptr); + }; + if (sim->computationMode == COMP_MODE_TRAINING) { + // IncMultiHeadSelfAttention does not support training + assert(false); + } + + inner_measure_operator_cost(sim, forward, backward, cost_metrics); + + if (sim->computationMode == COMP_MODE_TRAINING) { + printf("[Measure IncMultiHeadSelfAttention] query(%d %d %d) key(%d %d %d) " + "value(%d %d %d) output(%d %d %d)" + "forward_time(%.4lf) backward_time(%.4lf)\n", + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_output.dims[2].size, + sub_output.dims[1].size, + sub_output.dims[0].size, + cost_metrics.forward_time, + cost_metrics.backward_time); + } else { + printf("[Measure IncMultiHeadSelfAttention] query(%d %d %d) key(%d %d %d) " + "value(%d %d %d) output(%d %d %d)" + "forward_time(%.4lf)\n", + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_input.dims[2].size, + sub_input.dims[1].size, + sub_input.dims[0].size, + sub_output.dims[2].size, + sub_output.dims[1].size, + sub_output.dims[0].size, + cost_metrics.forward_time); + } + // Free multiheadattentionmeta + delete m; + return true; +} + +using PCG::Node; + +bool operator==(IncMultiHeadSelfAttentionParams const &lhs, + IncMultiHeadSelfAttentionParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && + lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && + lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && + lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && + lhs.add_zero_attn == rhs.add_zero_attn; +} + +IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { + IncMultiHeadSelfAttentionParams params; + params.layer_guid = this->layer_guid; + params.embed_dim = this->oProjSize; + params.num_heads = this->num_heads; + params.kdim = this->kProjSize; + params.vdim = this->vProjSize; + params.dropout = this->dropout; + params.bias 
= this->bias; + params.add_bias_kv = this->add_bias_kv; + params.add_zero_attn = this->add_zero_attn; + return params; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::IncMultiHeadSelfAttentionParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.embed_dim); + hash_combine(key, params.num_heads); + hash_combine(key, params.kdim); + hash_combine(key, params.vdim); + hash_combine(key, params.dropout); + hash_combine(key, params.bias); + hash_combine(key, params.add_bias_kv); + hash_combine(key, params.add_zero_attn); + return key; +} +}; // namespace std diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu new file mode 100644 index 0000000000..b903a6f073 --- /dev/null +++ b/src/ops/inc_multihead_self_attention.cu @@ -0,0 +1,267 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +/*static*/ +void IncMultiHeadSelfAttention::inference_kernel(IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + cudaStream_t stream) { + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + checkCUDNN(cudnnMultiHeadAttnForward(m->handle.dnn, + m->attnDesc, + -1, + m->loWinIdx, + m->hiWinIdx, + m->devQoSeqArray, + m->devKvSeqArray, + m->qDesc, + input_ptr, + NULL /*residual*/, + m->kDesc, + input_ptr, + m->vDesc, + input_ptr, + m->oDesc, + output_ptr, + m->weightSize, + weight_ptr, + m->handle.workSpaceSize, + m->handle.workSpace, + m->reserveSpaceSize, + m->reserveSpace)); +} + +/*static*/ +void IncMultiHeadSelfAttention::inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + IncMultiHeadSelfAttention::inference_kernel( + m, input_ptr, weight_ptr, output_ptr, stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(FFHandler handler, + IncMultiHeadSelfAttention const *attn, + Memory gpu_mem, + int num_samples, + int 
num_heads) + : OpMeta(handler) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(cudnnSetStream(handler.dnn, stream)); + + checkCUDNN(cudnnCreateAttnDescriptor(&attnDesc)); + checkCUDNN(cudnnCreateSeqDataDescriptor(&qDesc)); + checkCUDNN(cudnnCreateSeqDataDescriptor(&kDesc)); + checkCUDNN(cudnnCreateSeqDataDescriptor(&vDesc)); + checkCUDNN(cudnnCreateSeqDataDescriptor(&oDesc)); + // Currently do not support adding bias to key/value projection + assert(!attn->add_bias_kv); + cudnnAttnQueryMap_t attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; + // Assume no beam search for now + int maxBeamSize = 1; + // printf("batchSize(%d) qSize(%d) kSize(%d) vSize(%d) qProjSize(%d) + // kProjSize(%d)\n", + // num_samples, attn->qSize, attn->kSize, attn->vSize, attn->qProjSize, + // attn->kProjSize); + // printf("vProjSize(%d) oProjSize(%d) qoSeqLength(%d) kvSeqLength(%d)\n", + // attn->vProjSize, attn->oProjSize, attn->qoSeqLength, + // attn->kvSeqLength); + cudnnMathType_t math_type; + if (handle.allowTensorOpMathConversion) { + math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; + } else { + math_type = CUDNN_TENSOR_OP_MATH; + } + checkCUDNN(cudnnSetAttnDescriptor(attnDesc, + attnMode, + num_heads, + 1.0f /*smScalar*/, + CUDNN_DATA_FLOAT, + CUDNN_DATA_FLOAT, + math_type, + NULL /*attnDropoutDesc*/, + NULL /*postDropoutDesc*/, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->qoSeqLength, + attn->kvSeqLength, + num_samples, + maxBeamSize)); + size_t workSpaceSize; + checkCUDNN(cudnnGetMultiHeadAttnBuffers( + handler.dnn, attnDesc, &weightSize, &workSpaceSize, &reserveSpaceSize)); + assert(workSpaceSize <= handler.workSpaceSize); + // printf("weightSize(%zu) workSpaceSize(%zu) reserveSpaceSize(%zu)\n", + // weightSize, workSpaceSize, reserveSpaceSize); + int dimA[CUDNN_SEQDATA_DIM_COUNT]; + cudnnSeqDataAxis_t axes[CUDNN_SEQDATA_DIM_COUNT]; + assert(CUDNN_SEQDATA_DIM_COUNT == 4); + axes[3] = CUDNN_SEQDATA_VECT_DIM; // 3 = nbDims-1 + axes[2] = CUDNN_SEQDATA_BEAM_DIM; + axes[1] = CUDNN_SEQDATA_TIME_DIM; + axes[0] = CUDNN_SEQDATA_BATCH_DIM; + int *qoSeqArray = (int *)malloc(sizeof(int) * num_samples); + int *kvSeqArray = (int *)malloc(sizeof(int) * num_samples); + for (int i = 0; i < num_samples; i++) { + qoSeqArray[i] = attn->qoSeqLength; + kvSeqArray[i] = attn->kvSeqLength; + } + // Set qDesc + { + dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; + dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; + dimA[CUDNN_SEQDATA_TIME_DIM] = attn->qoSeqLength; + dimA[CUDNN_SEQDATA_VECT_DIM] = attn->qSize; + checkCUDNN(cudnnSetSeqDataDescriptor(qDesc, + CUDNN_DATA_FLOAT, + CUDNN_SEQDATA_DIM_COUNT, + dimA, + axes, + num_samples, + qoSeqArray, + NULL)); + } + // Set kDesc + { + dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; + dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; + dimA[CUDNN_SEQDATA_TIME_DIM] = attn->kvSeqLength; + dimA[CUDNN_SEQDATA_VECT_DIM] = attn->kSize; + checkCUDNN(cudnnSetSeqDataDescriptor(kDesc, + CUDNN_DATA_FLOAT, + CUDNN_SEQDATA_DIM_COUNT, + dimA, + axes, + num_samples, + kvSeqArray, + NULL)); + } + // Set vDesc + { + dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; + dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; + dimA[CUDNN_SEQDATA_TIME_DIM] = attn->kvSeqLength; + dimA[CUDNN_SEQDATA_VECT_DIM] = attn->vSize; + checkCUDNN(cudnnSetSeqDataDescriptor(vDesc, + CUDNN_DATA_FLOAT, + CUDNN_SEQDATA_DIM_COUNT, + dimA, + axes, + num_samples, + kvSeqArray, + NULL)); + } + // Set oDesc + { + dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; + dimA[CUDNN_SEQDATA_BATCH_DIM] = 
num_samples; + dimA[CUDNN_SEQDATA_TIME_DIM] = attn->qoSeqLength; + dimA[CUDNN_SEQDATA_VECT_DIM] = attn->oProjSize; + checkCUDNN(cudnnSetSeqDataDescriptor(oDesc, + CUDNN_DATA_FLOAT, + CUDNN_SEQDATA_DIM_COUNT, + dimA, + axes, + num_samples, + qoSeqArray, + NULL)); + } + // allocate memory for the seqArray and reserve space + { + size_t totalSize = reserveSpaceSize + sizeof(int) * num_samples * 2; + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(totalSize - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(reserveInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + devQoSeqArray = (int *)reserveInst.pointer_untyped(0, sizeof(char)); + checkCUDA(cudaMemcpy(devQoSeqArray, + qoSeqArray, + sizeof(int) * num_samples, + cudaMemcpyHostToDevice)); + devKvSeqArray = (int *)devQoSeqArray + num_samples; + checkCUDA(cudaMemcpy(devKvSeqArray, + kvSeqArray, + sizeof(int) * num_samples, + cudaMemcpyHostToDevice)); + reserveSpace = (int *)devKvSeqArray + num_samples; + } + // allocate memory for loWinIdx/hiWinIdx + loWinIdx = (int *)malloc(sizeof(int) * attn->qoSeqLength); + hiWinIdx = (int *)malloc(sizeof(int) * attn->qoSeqLength); + for (int i = 0; i < attn->qoSeqLength; i++) { + loWinIdx[i] = 0; + hiWinIdx[i] = attn->kvSeqLength; + } + free(qoSeqArray); + free(kvSeqArray); +} + +IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { + reserveInst.destroy(); + free(loWinIdx); + free(hiWinIdx); + checkCUDNN(cudnnDestroyAttnDescriptor(attnDesc)); + checkCUDNN(cudnnDestroySeqDataDescriptor(qDesc)); + checkCUDNN(cudnnDestroySeqDataDescriptor(kDesc)); + checkCUDNN(cudnnDestroySeqDataDescriptor(vDesc)); + checkCUDNN(cudnnDestroySeqDataDescriptor(oDesc)); +} + +}; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4fdc1f9819..ee274dc427 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -40,6 +40,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -2736,6 +2737,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + Op *op = IncMultiHeadSelfAttention::create_operator_from_layer( + *this, layer, inputs); + operators.push_back(op); + return op; + } case OP_BATCHMATMUL: { Op *op = BatchMatmul::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -4491,6 +4498,25 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "MultiHeadAttention Backward Task"); } + // MultiHeadAttention task + { + TaskVariantRegistrar registrar(INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + "IncMultiHeadSelfAttention Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "IncMultiHeadSelfAttention Init Task"); + } + { + TaskVariantRegistrar registrar(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + "IncMultiHeadSelfAttention Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant< + IncMultiHeadSelfAttention::inference_task>( + registrar, "IncMultiHeadSelfAttention Inference Task"); + } // NoOp { TaskVariantRegistrar 
registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); From 9e696370cdfc8427309282a51a9b7100c979841d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 27 Feb 2023 16:48:19 +0000 Subject: [PATCH 063/344] interface update --- include/flexflow/operator.h | 3 +++ src/ops/aggregate.cc | 2 ++ src/ops/aggregate_spec.cc | 2 ++ src/ops/attention.cc | 2 ++ src/ops/element_binary.cc | 2 ++ src/ops/experts.cc | 2 ++ src/ops/group_by.cc | 2 ++ src/ops/inc_multihead_self_attention.cc | 7 +++---- 8 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index a76ad9a018..985ef4374f 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -3,6 +3,7 @@ #include "flexflow/fftype.h" #include "flexflow/machine_view.h" +#include "flexflow/batch_config.h" #include "flexflow/parallel_tensor.h" #include "flexflow/utils/dot/record_formatter.h" #include @@ -186,6 +187,7 @@ class Op { // Pure virtual functions that must be implemented virtual void init(FFModel const &) = 0; virtual void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) { @@ -195,6 +197,7 @@ class Op { virtual void backward(FFModel const &) = 0; // Pure virtual functions for inference virtual void inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) { diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 02fc971768..e8bab41ce9 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -183,6 +183,7 @@ Node Aggregate::deserialize(FFModel &ff, } void Aggregate::init_inference(FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -285,6 +286,7 @@ void Aggregate::forward(FFModel const &ff) { } void Aggregate::inference(FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 21d429594e..a4720a669b 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -157,6 +157,7 @@ AggregateSpec::AggregateSpec(FFModel &model, void AggregateSpec::init_inference( FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -259,6 +260,7 @@ void AggregateSpec::forward(FFModel const &ff) { } void AggregateSpec::inference(FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/attention.cc b/src/ops/attention.cc index e04440d77e..ec24eaae94 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -374,6 +374,7 @@ MultiHeadAttention::MultiHeadAttention( void MultiHeadAttention::init_inference( FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -581,6 +582,7 @@ void MultiHeadAttention::forward(FFModel const &ff) { void MultiHeadAttention::inference( FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 2260a1e32a..9758225b3e 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -262,6 +262,7 @@ void ElementBinary::do_inplace_output(void) { 
void ElementBinary::init_inference( FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -507,6 +508,7 @@ void ElementBinary::forward(FFModel const &ff) { } void ElementBinary::inference(FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/experts.cc b/src/ops/experts.cc index afd7bff3c9..d4a72f5dff 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -428,6 +428,7 @@ Node Experts::deserialize(FFModel &ff, } void Experts::init_inference(FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -643,6 +644,7 @@ void Experts::forward(FFModel const &ff) { } void Experts::inference(FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index e5d720ba31..faeafb9f05 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -165,6 +165,7 @@ Group_by::Group_by(FFModel &model, model, inputs.first, inputs.second, params.n, params.alpha, name) {} void Group_by::init_inference(FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -312,6 +313,7 @@ void Group_by::forward(FFModel const &ff) { } void Group_by::inference(FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 329254fb74..6b5910c711 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -342,6 +342,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( void IncMultiHeadSelfAttention::init_inference( FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -465,6 +466,7 @@ void IncMultiHeadSelfAttention::forward(FFModel const &ff) { void IncMultiHeadSelfAttention::inference( FFModel const &ff, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -475,13 +477,10 @@ void IncMultiHeadSelfAttention::inference( MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, view); size_t machine_view_hash = view->hash(); - /* std::cout << "IncMultiHeadSelfAttention op machine_view: " << *(MachineView const - *)mv - << std::endl; */ int idx = 0; IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&bc, sizeof(BatchConfig)), argmap, Predicate::TRUE_PRED, false /*must*/, From 52e257d031b2fd66b2b4cee414d5da8533d1ef5c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 28 Feb 2023 01:31:33 +0000 Subject: [PATCH 064/344] checkpoint --- CMakeLists.txt | 1 + config/config.linux | 2 +- examples/cpp/inference/data_generator.cc | 37 +- examples/cpp/inference/data_generator.h | 4 +- .../cpp/inference/mixture_of_experts/moe.cc | 3 +- .../cpp/inference/transformers/dataloader.cc | 388 ++++++++++++++++++ .../cpp/inference/transformers/dataloader.cu | 115 ++++++ .../inference/transformers/transformers.cc | 185 +++++++++ .../cpp/inference/transformers/transformers.h | 123 ++++++ include/flexflow/batch_config.h | 47 +++ include/flexflow/inference.h | 3 +- include/flexflow/operator.h | 4 +- include/flexflow/ops/aggregate.h | 5 +- include/flexflow/ops/aggregate_spec.h | 5 +- include/flexflow/ops/attention.h | 5 +- include/flexflow/ops/element_binary.h | 5 +- include/flexflow/ops/experts.h | 5 +- include/flexflow/ops/groupby.h | 5 +- .../ops/inc_multihead_self_attention.h | 5 +- include/flexflow/ops/layer_norm.h | 5 +- include/flexflow/ops/linear.h | 5 +- include/flexflow/ops/noop.h | 3 + include/flexflow/ops/softmax.h | 5 +- include/flexflow/ops/topk.h | 5 +- include/flexflow/parallel_ops/partition.h | 3 + src/ops/aggregate.cc | 4 +- src/ops/aggregate_spec.cc | 4 +- src/ops/attention.cc | 4 +- src/ops/element_binary.cc | 4 +- src/ops/experts.cc | 4 +- src/ops/group_by.cc | 4 +- src/ops/inc_multihead_self_attention.cc | 4 +- src/ops/layer_norm.cc | 6 +- src/ops/linear.cc | 6 +- src/ops/noop.cc | 4 +- src/ops/softmax.cc | 6 +- src/ops/topk.cc | 6 +- src/parallel_ops/partition.cc | 2 + src/runtime/inference_manager.cc | 9 +- 39 files changed, 995 insertions(+), 45 deletions(-) create mode 100644 examples/cpp/inference/transformers/dataloader.cc create mode 100644 examples/cpp/inference/transformers/dataloader.cu create mode 100644 examples/cpp/inference/transformers/transformers.cc create mode 100644 examples/cpp/inference/transformers/transformers.h create mode 100644 include/flexflow/batch_config.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 10542011df..8be1c10ce8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -419,6 +419,7 @@ endif() if(FF_BUILD_MOE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/mixture_of_experts) + add_subdirectory(examples/cpp/inference/transformers) endif() # installation diff --git a/config/config.linux b/config/config.linux index 86e8f8b647..940757f9e8 100755 --- a/config/config.linux +++ b/config/config.linux @@ -39,7 +39,7 @@ FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} # build C++ examples FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-OFF} +FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} diff --git a/examples/cpp/inference/data_generator.cc b/examples/cpp/inference/data_generator.cc index ec168f6f51..16efe57e3f 100644 --- a/examples/cpp/inference/data_generator.cc +++ 
b/examples/cpp/inference/data_generator.cc @@ -11,7 +11,7 @@ DataGenerator::DataGenerator(size_t _num_requests, double _lambda) : num_requests(_num_requests), token_dim(_token_dim), sequence_length(_sequence_length), poisson_distr(_poisson_distr), - lambda(_lambda), timer_started(false) { + lambda(_lambda), timer_started(false), global_unique_id(1000000) { generate_arrival_times(); }; @@ -80,7 +80,7 @@ void DataGenerator::start_timer(void) { timer_started = true; }; -size_t DataGenerator::get_requests(void) { +size_t DataGenerator::get_requests(size_t max_num_requests, std::vector > >&prompts) { if (!timer_started) { std::cout << "Warning: tried to get number of requests before the timer " "was started." @@ -99,5 +99,38 @@ size_t DataGenerator::get_requests(void) { << " request(s) by arrival time +" << ms_from_start << "ms" << "\n"; } + + for (size_t i = 0; i < received_requests; i++) { + int length = std::rand() % 10 + 5; + std::vector prompt; + for (int j = 0; j < length; j++) + prompt.push_back(j + 1000); + prompts.push_back(std::make_pair(global_unique_id++, prompt)); + } + assert(prompts.size() == received_requests); return received_requests; } + +size_t DataGenerator::get_requests() { + if (!timer_started) { + std::cout << "Warning: tried to get number of requests before the timer " + "was started." + << std::endl; + return 0; + } + Clock::time_point cur_time = Clock::now(); + size_t ms_from_start = + chrono::duration_cast(cur_time - start_time).count(); + vector::iterator new_arrivals_ptr = + upper_bound(arrivals_ptr, arrivals.end(), ms_from_start); + size_t received_requests = new_arrivals_ptr - arrivals_ptr; + arrivals_ptr = new_arrivals_ptr; + if (received_requests > 0) { + std::cout << "received " << received_requests + << " request(s) by arrival time +" << ms_from_start << "ms" + << "\n"; + } + + return received_requests; +} + diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index e651881902..93bb565c41 100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -29,7 +29,8 @@ class DataGenerator { void start_timer(void); // Get number of requests that have arrived since the last time this function // was called - size_t get_requests(void); + size_t get_requests(size_t max_num_requests, std::vector > > &prompts); + size_t get_requests(); private: // Compute the arrival times of each request and save them in the arrivals @@ -42,6 +43,7 @@ class DataGenerator { bool poisson_distr; // false implies uniform distribution double lambda; // mean #num of arrivals per sec bool timer_started; // whether timer was initiated + size_t global_unique_id; // guid for requests // time when get_requests() is called for the first time Clock::time_point start_time; // arrival times (ms) generated based on distribution diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 0fd4b32d26..726ef5f7ff 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -156,6 +156,7 @@ void FlexFlow::top_level_task(Task const *task, int num_devices = ffConfig.workersPerNode * ffConfig.numNodes; data_loader.reset(); data_generator.start_timer(); + BatchConfig bc; while (processed_requests < moeConfig.total_requests) { size_t received_requests = data_generator.get_requests(); int iterations = (received_requests % moeConfig.batch_size == 0) @@ -164,7 +165,7 @@ void FlexFlow::top_level_task(Task const *task, 
for (int iter = 0; iter < iterations; iter++) { data_loader.next_batch(ff, received_requests); runtime->begin_trace(ctx, 111 + index % num_devices /*trace_id*/); - im.inference(index); + im.inference(index, bc); runtime->end_trace(ctx, 111 + index % num_devices /*trace_id*/); index++; } diff --git a/examples/cpp/inference/transformers/dataloader.cc b/examples/cpp/inference/transformers/dataloader.cc new file mode 100644 index 0000000000..a61598299e --- /dev/null +++ b/examples/cpp/inference/transformers/dataloader.cc @@ -0,0 +1,388 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "transformers.h" +#include +#include +#include +#include +#include +#include + +using namespace Legion; + +DataLoader::DataLoader(FFModel &ff, + MoeConfig const &moeConfig, + DataGenerator &data_generator, + ParallelTensor input, + ParallelTensor label) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + + int numdims = input->num_dims; + int replica_idx = numdims - 1; + int batch_idx = numdims - 2; + num_samples = moeConfig.total_requests; + + // Create full input + { + batch_input = input; + + ParallelDim dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i].size = input->dims[i].size; + dims[i].degree = 1; + dims[i].parallel_idx = -1; + dims[i].is_replica_dim = input->dims[i].is_replica_dim; + // Assume only the first dim can be the replica dim + assert(i == replica_idx || (!dims[i].is_replica_dim)); + } + assert(dims[batch_idx].size == ff.config.batchSize); + dims[batch_idx].size = num_samples; + + full_input = + ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_FLOAT); + ff.map_tensor(full_input, NULL /*parallel_op*/); + } + + // Create full label + { + assert(label->num_dims == numdims); + batch_label = label; + + ParallelDim dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i].size = label->dims[i].size; + dims[i].degree = 1; + dims[i].parallel_idx = -1; + dims[i].is_replica_dim = label->dims[i].is_replica_dim; + // Assume only the last dim can be the replica dim + assert(i == replica_idx || (!dims[i].is_replica_dim)); + } + assert(dims[batch_idx].size == ff.config.batchSize); + // replace batch size with number of samples + dims[batch_idx].size = num_samples; + + full_label = + ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_INT32); + ff.map_tensor(full_label, NULL /*parallel_op*/); + } + + // Load entire dataset + // TODO: Use index launcher instead of task launcher + assert(full_input != nullptr && "full_input is nullptr"); + assert(full_label != nullptr && "full_label is nullptr"); + + DataLoaderInput dataloader_input = {moeConfig, data_generator}; + DataLoaderInput const *ptr = &dataloader_input; + + TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, + TaskArgument(ptr, sizeof(DataLoaderInput))); + // regions[0]: full_input + launcher.add_region_requirement(RegionRequirement(full_input->region, + 
WRITE_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + // regions[1]: full_label + launcher.add_region_requirement(RegionRequirement(full_label->region, + WRITE_ONLY, + EXCLUSIVE, + full_label->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(1, FID_DATA); + + runtime->execute_task(ctx, launcher); + reset(); +} + +// ================================================= +// Load data +// ================================================= + +void read_cifar100(float *input_ptr, int *label_ptr) { + std::ifstream file; + file.open("train.bin", std::ios::in | std::ios::binary | std::ios::ate); + if (!file) { + std::cout << "Error opening CIFAR100 train data file" << std::endl; + assert(false); + } + + file.seekg(0, std::ios::beg); + + // each sample: <1 x coarse label><1 x fine label><3072 x pixel> + for (std::size_t i = 0; i < MAX_NUM_SAMPLES; i++) { + unsigned char temp = 0; + file.read((char *)&temp, sizeof(temp)); // coarse label, skip + file.read((char *)&temp, sizeof(temp)); + label_ptr[i] = temp; + for (std::size_t j = 0; j < 3072; ++j) { + file.read((char *)&temp, sizeof(temp)); + input_ptr[i * 3072 + j] = (float)temp / 255.0f; + } + } + + file.close(); +} + +int reverseInt(int i) { + unsigned char c1, c2, c3, c4; + + c1 = i & 255; + c2 = (i >> 8) & 255; + c3 = (i >> 16) & 255; + c4 = (i >> 24) & 255; + + return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; +} + +/* NOTE: Download files from http://yann.lecun.com/exdb/mnist/ and unpack to +the current working directory */ +void read_mnist(float *input_ptr, int *label_ptr) { + // read inputs + std::ifstream input("train-images-idx3-ubyte", std::ios::binary); + if (input.is_open()) { + int magic_number = 0; + int number_of_images = 0; + int n_rows = 0; + int n_cols = 0; + input.read((char *)&magic_number, sizeof(magic_number)); + magic_number = reverseInt(magic_number); + input.read((char *)&number_of_images, sizeof(number_of_images)); + number_of_images = reverseInt(number_of_images); + input.read((char *)&n_rows, sizeof(n_rows)); + n_rows = reverseInt(n_rows); + input.read((char *)&n_cols, sizeof(n_cols)); + n_cols = reverseInt(n_cols); + + for (int i = 0; i < number_of_images; i++) { + for (int r = 0; r < n_rows; r++) { + for (int c = 0; c < n_cols; c++) { + unsigned char temp = 0; + input.read((char *)&temp, sizeof(temp)); + input_ptr[i * n_rows * n_cols + r * n_cols + c] = + (float)temp / 255.0f; + } + } + } + } else { + std::cout << "Error opening MNIST input data file" << std::endl; + assert(false); + } + + // read labels + std::ifstream labels("train-labels-idx1-ubyte", std::ios::binary); + if (labels.is_open()) { + int magic_number = 0; + int number_of_images = 0; + labels.read((char *)&magic_number, sizeof(magic_number)); + magic_number = reverseInt(magic_number); + labels.read((char *)&number_of_images, sizeof(number_of_images)); + number_of_images = reverseInt(number_of_images); + + for (int i = 0; i < number_of_images; i++) { + unsigned char temp = 0; + labels.read((char *)&temp, sizeof(temp)); + label_ptr[i] = temp; + } + } else { + std::cout << "Error opening MNIST label data file" << std::endl; + assert(false); + } +} + +void DataLoader::load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + DataLoaderInput const input_struct = *((DataLoaderInput *)task->args); + MoeConfig const &conf = input_struct._moeConfig; + DataGenerator &datagen = input_struct._data_generator; + assert(regions.size() == 2); + 
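+  // This CPU task unpacks the MoeConfig and DataGenerator passed in via the
+  // DataLoaderInput struct, obtains write-only pointers to the full
+  // input/label regions, and fills them with randomly generated requests
+  // when no --dataset path is supplied; the read_cifar100()/read_mnist()
+  // helpers above are optional alternatives for loading a real dataset.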
assert(task->regions.size() == regions.size()); + + // get input and label pointer + float *input_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + int *label_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain label_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + int input_dims = input_domain.get_dim(); + for (int i = 0; i < input_dims; i++) { + int input_dim = input_domain.hi()[i] - input_domain.lo()[i] + 1; + int label_dim = label_domain.hi()[i] - label_domain.lo()[i] + 1; + assert(i == 0 || input_dim == label_dim); + } + + if (conf.dataset_path.length() == 0) { + printf("Input dataset path is empty, using random input samples\n"); + datagen.generate_requests(input_ptr, label_ptr, conf.num_labels); + } else { + // here, you can call `read_cifar100(input_ptr, label_ptr);` instead or load + // another dataset using the dataset_path from the MoeConfig object + // read_mnist(input_ptr, label_ptr); + // log_app.print("finish loading MNIST data\n"); + } +} + +void DataLoader::next_batch(FFModel &ff, size_t received_requests) { + if (received_requests == 0) { + return; + } + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + // Load input + { + Domain domain = + runtime->get_index_space_domain(ctx, batch_input->parallel_is); + ArgumentMap argmap; + int counter = 0; + // current limitation of the dataloader: only the batch dimension can be + // partitioned + int input_dims = batch_input->num_dims; + for (int i = 0; i < input_dims; i++) { + if (i != input_dims - 2) { + assert(batch_input->dims[i].degree == 1 && + "Dataloader only supports batch size partitions"); + } + } + int batch_size = batch_input->dims[input_dims - 2].size; + int n_partitions = batch_input->dims[input_dims - 2].degree; + assert(ff.config.batchSize % batch_size == 0); + assert(batch_size % n_partitions == 0); + for (Domain::DomainPointIterator it(domain); it; it++) { + SampleIdxs meta; + int requests_left = received_requests - counter; + meta.num_samples = std::min(batch_size / n_partitions, requests_left); + for (int i = 0; i < meta.num_samples; i++) { + meta.idxs[i] = next_index + counter; + counter++; + } + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + assert(counter == received_requests); + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, + batch_input->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_input->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(full_input->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_input->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + // Load label + { + Domain domain = + runtime->get_index_space_domain(ctx, batch_label->parallel_is); + ArgumentMap argmap; + int counter = 0; + // current limitation of the dataloader: only the batch dimension can be + // partitioned + int label_dims = batch_label->num_dims; + // assert(batch_label->dims[label_dims - 1].degree == 1); + for (int i = 0; i < label_dims; i++) { + assert(batch_label->dims[i].degree == 1 && + 
"Dataloader only supports batch size partitions"); + } + int batch_size = batch_label->dims[label_dims - 2].size; + int n_partitions = batch_label->dims[label_dims - 2].degree; + assert(ff.config.batchSize % batch_size == 0); + assert(batch_size % n_partitions == 0); + for (Domain::DomainPointIterator it(domain); it; it++) { + SampleIdxs meta; + int requests_left = received_requests - counter; + meta.num_samples = std::min(batch_size / n_partitions, requests_left); + for (int i = 0; i < meta.num_samples; i++) { + meta.idxs[i] = next_index + counter; + counter++; + } + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + assert(counter == received_requests); + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, + batch_label->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_label->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(full_label->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_label->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_label->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_label->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } + next_index += received_requests; +} + +void DataLoader::reset() { + next_index = 0; +} + +void FlexFlow::register_custom_tasks() { + // Load entire dataset + { + TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Entire Dataset Task"); + } + // Load input + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Input Task"); + } + // Load label + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_2, "Load Labels"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Label Task"); + } +} diff --git a/examples/cpp/inference/transformers/dataloader.cu b/examples/cpp/inference/transformers/dataloader.cu new file mode 100644 index 0000000000..4624b562e9 --- /dev/null +++ b/examples/cpp/inference/transformers/dataloader.cu @@ -0,0 +1,115 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/utils/cuda_helper.h" +#include "transformers.h" + +void DataLoader::load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + SampleIdxs *meta = (SampleIdxs *)task->local_args; + float const *full_input_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *batch_input_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + Domain full_input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain batch_input_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + coord_t token_dim = + batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; + coord_t sequence_length = + batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; + coord_t batch_size = + batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; + + // FIXME: currently assume continous indices + assert(meta->num_samples <= batch_size); + for (int i = 1; i < meta->num_samples; i++) { + assert(meta->idxs[i] == meta->idxs[0] + i); + } + // pad inputs if needed (this is really only useful for debugging) + if (meta->num_samples < batch_size) { + checkCUDA(cudaMemset(batch_input_ptr + + token_dim * sequence_length * meta->num_samples, + 0, + token_dim * sequence_length * + (batch_size - meta->num_samples) * sizeof(float))); + } + coord_t start_idx = meta->idxs[0]; + assert(batch_input_domain.get_volume() % token_dim * sequence_length * + batch_size == + 0); + assert(batch_input_domain.get_volume() % batch_size == 0); + size_t size_to_copy = + (batch_input_domain.get_volume() / batch_size) * meta->num_samples; + float const *input_zc = + full_input_ptr + start_idx * token_dim * sequence_length; + copy_kernel<<>>( + batch_input_ptr, input_zc, size_to_copy); + checkCUDA(cudaDeviceSynchronize()); +} + +void DataLoader::load_label(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + SampleIdxs *meta = (SampleIdxs *)task->local_args; + int const *full_label_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + int *batch_label_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + Domain full_label_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain batch_label_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + coord_t label_dim = + batch_label_domain.hi()[0] - batch_label_domain.lo()[0] + 1; + coord_t sequence_length = + batch_label_domain.hi()[1] - batch_label_domain.lo()[1] + 1; + coord_t batch_size = + batch_label_domain.hi()[2] - batch_label_domain.lo()[2] + 1; + // FIXME: currently assume continous indices + assert(meta->num_samples <= batch_size); + for (int i = 1; i < meta->num_samples; i++) { + assert(meta->idxs[i] == meta->idxs[0] + i); + } + if (meta->num_samples < batch_size) { + checkCUDA(cudaMemset(batch_label_ptr + + label_dim * sequence_length * meta->num_samples, + 0, + label_dim * sequence_length * + (batch_size - meta->num_samples) * sizeof(int))); + } + assert(batch_label_domain.get_volume() % label_dim * sequence_length * + batch_size == + 0); + assert(batch_label_domain.get_volume() % batch_size == 0); + coord_t start_idx = meta->idxs[0]; + size_t 
size_to_copy = + (batch_label_domain.get_volume() / batch_size) * meta->num_samples; + int const *input_zc = + full_label_ptr + start_idx * label_dim * sequence_length; + copy_kernel<<>>( + batch_label_ptr, input_zc, size_to_copy); + checkCUDA(cudaDeviceSynchronize()); +} diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc new file mode 100644 index 0000000000..27e438037e --- /dev/null +++ b/examples/cpp/inference/transformers/transformers.cc @@ -0,0 +1,185 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "transformers.h" +#include "flexflow/inference.h" +#include +#include +#include +#include +#include +#include + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("Transformers"); + +void parse_input_args(char **argv, int argc, MoeConfig &config) { + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--dataset")) { + config.dataset_path = std::string(argv[++i]); + continue; + } + } +} + +Tensor create_inc_multihead_attention_decoder(FFModel *model, + MoeConfig const *moeConfig, + Tensor const &input) { + std::vector axes{0}; + Tensor t = model->inc_multihead_self_attention(input, + moeConfig->hidden_size, + moeConfig->num_attention_heads, + moeConfig->attention_kdim, + moeConfig->attention_vdim); + + t = model->layer_norm(model->add(t, input), axes, true, 1e-05); + Tensor x = model->dense(model->dense(t, moeConfig->hidden_size, AC_MODE_RELU, false /*bias*/), + moeConfig->hidden_size, + AC_MODE_NONE, + false /*bias*/); + t = model->layer_norm(model->add(x, t), axes, true, 1e-05); + return t; +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + //----------------------- Initial configurations ------------------------ + MoeConfig moeConfig; + FFConfig ffConfig; + ffConfig.batchSize = moeConfig.batch_size; + { + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, moeConfig); + log_app.print("batchSize(%d) workersPerNodes(%d) numNodes(%d)", + ffConfig.batchSize, + ffConfig.workersPerNode, + ffConfig.numNodes); + } + FFModel ff(ffConfig); + + //----------------------- Create inputs -------------------------------- + Tensor input; + { + int const dims[] = {BatchConfig::MAX_NUM_TOKENS, moeConfig.token_dim}; + input = ff.create_tensor<2>(dims, DT_FLOAT); + } + + //----------------------- Define the model ------------------------------ + Tensor t = input; + for (int i = 0; i < moeConfig.num_layers; i++) { + t = create_inc_multihead_attention_decoder(&ff, &moeConfig, input); + } + t = ff.dense(t, moeConfig.out_dim, AC_MODE_RELU); + t = ff.softmax(t); + + //------------------- Initialize the inference manager ------------------ + InferenceManager im( + &ff, moeConfig.batch_size, moeConfig.num_inflight_batches); + 
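+  // The InferenceManager keeps num_inflight_batches batches in flight at a
+  // time: tensor buffers are allocated per batch slot, and the serving loop
+  // below tracks one BatchConfig and one Future per slot (bid), topping a
+  // slot up with newly arrived prompts whenever its previous inference
+  // completes.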
im.compile_model_and_allocate_buffer(); + im.init_operators_inference(); + + //------------ Initialize the data loader and data generator ------------ + DataGenerator data_generator(moeConfig.total_requests, + moeConfig.token_dim, + moeConfig.sequence_length, + moeConfig.poisson_distribution, + moeConfig.arrival_rate); + ParallelTensor input_pt, label_pt; + ff.get_parallel_tensor_from_tensor(input, input_pt); + ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); + DataLoader data_loader(ff, moeConfig, data_generator, input_pt, label_pt); + + //----------------------- Start timer ----------------------------------- + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_start = Realm::Clock::current_time_in_microseconds(); + + //----------------------- Begin inference! ------------------------------- + int index = 0; + int processed_requests = 0; + int num_devices = ffConfig.workersPerNode * ffConfig.numNodes; + data_loader.reset(); + data_generator.start_timer(); + std::map future_handlers; + std::map batch_configs; + while (processed_requests < moeConfig.total_requests) { + for (int bid = 0; bid < im.max_num_inflight_batches; bid++) { + if (future_handlers.find(bid) == future_handlers.end()) { + std::vector > > prompts; + assert(im.max_num_requests_per_batch <= BatchConfig::MAX_NUM_REQUESTS); + data_generator.get_requests(im.max_num_requests_per_batch, prompts); + assert((int)prompts.size() < im.max_num_requests_per_batch); + //TODO: loading data + BatchConfig* bc = new BatchConfig(); + for (const auto & prompt : prompts) { + assert(bc->register_new_request(prompt.first, prompt.second.size())); + } + bc->prepare_next_batch(); + runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + FutureMap fm = im.inference(bid, *bc); + runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + future_handlers[bid] = future; + batch_configs[bid] = bc; + } else { + Future future = future_handlers[bid]; + if (!future.is_ready(true/*subscribe*/)) { + continue; + } + InferenceResult ir = future.get_result(); + BatchConfig* bc = batch_configs[bid]; + processed_requests += bc->update_results(ir); + int available_slots = BatchConfig::MAX_NUM_REQUESTS - bc->num_processing_requests(); + std::vector > > prompts; + data_generator.get_requests(available_slots, prompts); + processed_requests += prompts.size(); + for (const auto& prompt : prompts) { + assert(bc->register_new_request(prompt.first, prompt.second.size())); + } + bc->prepare_next_batch(); + runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + FutureMap fm = im.inference(bid, *bc); + runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; + } + } + } + //----------------------- End of inference! 
------------------------------ + + //----------------------- Stop timer ------------------------------------- + { + runtime->issue_execution_fence(ctx); + TimingLauncher timer(MEASURE_MICRO_SECONDS); + Future future = runtime->issue_timing_measurement(ctx, timer); + future.get_void_result(); + } + double ts_end = Realm::Clock::current_time_in_microseconds(); + double run_time = 1e-6 * (ts_end - ts_start); + printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f requests/s\n", + run_time, + moeConfig.total_requests / run_time); +} diff --git a/examples/cpp/inference/transformers/transformers.h b/examples/cpp/inference/transformers/transformers.h new file mode 100644 index 0000000000..3c09f64055 --- /dev/null +++ b/examples/cpp/inference/transformers/transformers.h @@ -0,0 +1,123 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "data_generator.h" +#include "flexflow/model.h" +#define MAX_NUM_SAMPLES 1000 +#define MNIST_DIMS 28 * 28 +#define DATA_DIM MNIST_DIMS + +using namespace Legion; +using namespace std; +using namespace FlexFlow; + +struct MoeConfig { + MoeConfig(void) { + //----------------------- Input/output data ------------------------ + token_dim = DATA_DIM; + sequence_length = 10; + batch_size = 32; + out_dim = 15; + num_labels = out_dim; + num_layers = 12; + //----------------------- Inference parameters --------------------- + // total number of requests processed as part of the simulation + total_requests = 256; + poisson_distribution = true; + // average number of request arrivals per second + arrival_rate = 25; + num_inflight_batches = 10; + //----------------------- MoE layer -------------------------------- + // total number of experts + num_exp = 128; + // number of experts in each block of fused experts + experts_per_block = 32; + // number of experts to route each token to + num_select = 2; + // expert capacity parameters + alpha = 2.0f; // factor overhead tensor size for imbalance + lambda = 0.04f; // multiplier for load balance term + // expert hidden size + hidden_size = DATA_DIM; + //----------------------- Rest of model parameters ------------------ + // Encoder layer + num_attention_heads = 16; + attention_kdim = attention_vdim = hidden_size / num_attention_heads; + num_encoder_layers = 1; + } + + // Input/output data + int token_dim; + int sequence_length; + int batch_size; + int out_dim; + int num_labels; + int num_layers; + std::string dataset_path; + // Inference parameters + int total_requests; + bool poisson_distribution; + double arrival_rate; + int num_inflight_batches; + // MoE layer + int num_exp; + int experts_per_block; + int num_select; + float alpha; + float lambda; + int hidden_size; + // Model parameters + int num_attention_heads; + int attention_kdim; + int attention_vdim; + int num_encoder_layers; +}; + +class DataLoader { +public: + DataLoader(FFModel &ff, + MoeConfig const &moeConfig, + DataGenerator &data_generator, + ParallelTensor input, + 
ParallelTensor label); + static void load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + static void load_label(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + static void load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + void next_batch(FFModel &, size_t); + void reset(void); + +public: + int num_samples, next_index; + FlexFlow::ParallelTensor full_input, batch_input; + FlexFlow::ParallelTensor full_label, batch_label; + struct DataLoaderInput { + MoeConfig const &_moeConfig; + DataGenerator &_data_generator; + }; +}; + +struct SampleIdxs { + int num_samples; + int idxs[MAX_NUM_SAMPLES]; +}; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h new file mode 100644 index 0000000000..4e781392b1 --- /dev/null +++ b/include/flexflow/batch_config.h @@ -0,0 +1,47 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + + +namespace FlexFlow { + +struct InferenceResult { + static const int MAX_NUM_TOKENS = 1024; + int results[MAX_NUM_TOKENS]; +}; + +class BatchConfig { +public: + BatchConfig(); + bool register_new_request(int guid, int length); + void prepare_next_batch(); + int update_results(InferenceResult const &ir); + int num_processing_requests(); + static const int MAX_NUM_REQUESTS = 256; + static const int MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; + static const int MAX_SEQUENCE_LENGTH = 1024; + // These are set by update + int token_start_idx[MAX_NUM_REQUESTS]; + int token_last_available_idx[MAX_NUM_REQUESTS]; + int num_processing_tokens[MAX_NUM_REQUESTS]; + size_t request_guid[MAX_NUM_REQUESTS]; + // This is set by the app + bool request_completed[MAX_NUM_REQUESTS]; +}; + +}; // namespace FlexFlow diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index dacf6b3f28..9529052ac7 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/model.h" +#include "flexflow/batch_config.h" namespace FlexFlow { @@ -28,7 +29,7 @@ class InferenceManager { int max_num_inflight_batches); void compile_model_and_allocate_buffer(void); void init_operators_inference(); - void inference(int index); + Legion::FutureMap inference(int index, BatchConfig const &bc); public: std::unordered_map> tensor_buffer; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 985ef4374f..94bd19c927 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -3,9 +3,9 @@ #include "flexflow/fftype.h" #include "flexflow/machine_view.h" -#include "flexflow/batch_config.h" #include "flexflow/parallel_tensor.h" #include "flexflow/utils/dot/record_formatter.h" +#include "flexflow/batch_config.h" #include namespace FlexFlow { @@ -196,7 +196,7 @@ class Op { virtual void forward(FFModel const &) = 0; virtual void backward(FFModel const &) = 0; // Pure virtual 
functions for inference - virtual void inference(FFModel const &, + virtual Legion::FutureMap inference(FFModel const &, BatchConfig const &, std::vector const &, std::vector const &, diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 098e10d8e8..56f84308c2 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -3,6 +3,7 @@ #include "flexflow/model.h" #include "flexflow/ops/aggregate_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -36,11 +37,13 @@ class Aggregate : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index a80606d761..f48a5f95ee 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -3,6 +3,7 @@ #include "flexflow/model.h" #include "flexflow/ops/aggregate_spec_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -28,11 +29,13 @@ class AggregateSpec : public Op { char const *name); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index baf4c06d48..88457756c5 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -8,6 +8,7 @@ #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/ops/attention_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -65,12 +66,14 @@ class MultiHeadAttention : public Op { std::vector const &inputs); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 6e7edce223..351685f140 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -5,6 +5,7 @@ #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/element_binary_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -26,12 +27,14 @@ class ElementBinary : public Op { bool inplace_a = false); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap 
inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index cd66618a07..dd15ca5019 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -2,6 +2,7 @@ #include "flexflow/model.h" #include "flexflow/ops/experts_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -55,12 +56,14 @@ class Experts : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index 0acc241a9b..bdfdeea669 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -4,6 +4,7 @@ #include "flexflow/model.h" #include "flexflow/node.h" #include "flexflow/ops/groupby_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -35,12 +36,14 @@ class Group_by : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 0b1572f2ca..7390d127a9 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -9,6 +9,7 @@ #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/ops/inc_multihead_self_attention_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -60,12 +61,14 @@ class IncMultiHeadSelfAttention : public Op { std::vector const &inputs); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index dac230e410..9c7bb6f31a 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -1,6 +1,7 @@ #pragma once #include "flexflow/model.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -25,12 +26,14 @@ class LayerNorm : public Op { char const *name); void init(FFModel const &); void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &); void backward(FFModel const &); - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + 
BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index ccd5724dc5..df4772831b 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -4,6 +4,7 @@ #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/linear_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -36,12 +37,14 @@ class Linear : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/noop.h b/include/flexflow/ops/noop.h index a38d2945ca..7ca55463fd 100644 --- a/include/flexflow/ops/noop.h +++ b/include/flexflow/ops/noop.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_NOOP_H #include "flexflow/model.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -18,11 +19,13 @@ class NoOp : public Op { char const *name = NULL); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index de9ad56b45..7dfd372451 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -5,6 +5,7 @@ #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/softmax_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -22,11 +23,13 @@ class Softmax : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 5e8b515672..8d1bf010c3 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -4,6 +4,7 @@ #include "flexflow/model.h" #include "flexflow/node.h" #include "flexflow/ops/topk_params.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -29,12 +30,14 @@ class TopK : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; - void inference(FFModel const &, + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/parallel_ops/partition.h b/include/flexflow/parallel_ops/partition.h index f25bc83276..919446b0ab 100644 --- 
a/include/flexflow/parallel_ops/partition.h +++ b/include/flexflow/parallel_ops/partition.h @@ -6,6 +6,7 @@ #include "flexflow/operator.h" #include "flexflow/parallel_ops/partition_params.h" #include "parallel_op.h" +#include "flexflow/inference.h" namespace FlexFlow { @@ -30,11 +31,13 @@ class Repartition : public ParallelOp { std::vector const &batch_outputs) override; void init(FFModel const &) override; void init_inference(FFModel const &, + BatchConfig const & bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void inference(FFModel const &, + BatchConfig const & bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index e8bab41ce9..1556a93c78 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -285,7 +285,7 @@ void Aggregate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void Aggregate::inference(FFModel const &ff, +FutureMap Aggregate::inference(FFModel const &ff, BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -338,7 +338,7 @@ void Aggregate::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(n + 2, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Aggregate::forward_task(Task const *task, diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index a4720a669b..a255f2ed5f 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -259,7 +259,7 @@ void AggregateSpec::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void AggregateSpec::inference(FFModel const &ff, +FutureMap AggregateSpec::inference(FFModel const &ff, BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -312,7 +312,7 @@ void AggregateSpec::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(n + 2, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void AggregateSpec::forward_task(Task const *task, diff --git a/src/ops/attention.cc b/src/ops/attention.cc index ec24eaae94..f0195c95d6 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -580,7 +580,7 @@ void MultiHeadAttention::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void MultiHeadAttention::inference( +FutureMap MultiHeadAttention::inference( FFModel const &ff, BatchConfig const &bc, std::vector const &batch_inputs, @@ -635,7 +635,7 @@ void MultiHeadAttention::inference( EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(4, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 9758225b3e..e1cc3d04c4 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -507,7 +507,7 @@ void ElementBinary::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void ElementBinary::inference(FFModel const &ff, +FutureMap ElementBinary::inference(FFModel const &ff, BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -581,7 +581,7 @@ void ElementBinary::inference(FFModel const &ff, launcher.add_field(2, FID_DATA); } } - runtime->execute_index_space(ctx, launcher); + return 
runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/experts.cc b/src/ops/experts.cc index d4a72f5dff..1abf834551 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -643,7 +643,7 @@ void Experts::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void Experts::inference(FFModel const &ff, +FutureMap Experts::inference(FFModel const &ff, BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -712,7 +712,7 @@ void Experts::inference(FFModel const &ff, launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); } } - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Experts::inference_task(Task const *task, diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index faeafb9f05..bfdb05bfe7 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -312,7 +312,7 @@ void Group_by::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void Group_by::inference(FFModel const &ff, +FutureMap Group_by::inference(FFModel const &ff, BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -359,7 +359,7 @@ void Group_by::inference(FFModel const &ff, launcher.add_field(i + 2, FID_DATA); } - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Group_by::forward_task(Task const *task, diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 6b5910c711..d938cd1b4c 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -464,7 +464,7 @@ void IncMultiHeadSelfAttention::forward(FFModel const &ff) { assert(false); } -void IncMultiHeadSelfAttention::inference( +FutureMap IncMultiHeadSelfAttention::inference( FFModel const &ff, BatchConfig const &bc, std::vector const &batch_inputs, @@ -504,7 +504,7 @@ void IncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 56c2bec1fc..b2716cb24d 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -195,6 +195,7 @@ LayerNorm::LayerNorm(FFModel &model, } void LayerNorm::init_inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -315,7 +316,8 @@ void LayerNorm::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void LayerNorm::inference(FFModel const &ff, +FutureMap LayerNorm::inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -362,7 +364,7 @@ void LayerNorm::inference(FFModel const &ff, weights[1]->region)); launcher.add_field(3, FID_DATA); } - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } /* diff --git a/src/ops/linear.cc b/src/ops/linear.cc index c7308bae15..f91da55762 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -255,6 +255,7 @@ void Linear::init(FFModel const &ff) { } void Linear::init_inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -420,7 +421,8 @@ void Linear::forward(FFModel const &ff) { 
runtime->execute_index_space(ctx, launcher); } -void Linear::inference(FFModel const &ff, +FutureMap Linear::inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -467,7 +469,7 @@ void Linear::inference(FFModel const &ff, weights[1]->region)); launcher.add_field(3, FID_DATA); } - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Linear::forward_task(Task const *task, diff --git a/src/ops/noop.cc b/src/ops/noop.cc index 46968acb03..7ad714a4c4 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -96,6 +96,7 @@ OpMeta *NoOp::init_task(Task const *task, } void NoOp::init_inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -258,7 +259,8 @@ void NoOp::init(FFModel const &ff) { void NoOp::forward(FFModel const &ff) {} -void NoOp::inference(FFModel const &ff, +FutureMap NoOp::inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) {} diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 389cd8a678..3bdb8b4f8d 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -116,6 +116,7 @@ Softmax::Softmax(FFModel &model, : Softmax(model, input, params.dim, name) {} void Softmax::init_inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -225,7 +226,8 @@ OpMeta *Softmax::init_task(Task const *task, return m; } -void Softmax::inference(FFModel const &ff, +FutureMap Softmax::inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -286,7 +288,7 @@ void Softmax::forward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Softmax::forward_task(Task const *task, diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 3763514685..26aa6aeeb5 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -137,6 +137,7 @@ TopK::TopK(FFModel &model, : TopK(model, input, params.k, params.sorted, name) {} void TopK::init_inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -263,7 +264,8 @@ void TopK::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -void TopK::inference(FFModel const &ff, +FutureMap TopK::inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -302,7 +304,7 @@ void TopK::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[1]->region)); launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void TopK::forward_task(Task const *task, diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 7fdf9a8e4a..3e9ef7eac6 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -103,6 +103,7 @@ OpMeta *Repartition::init_task(Task const *task, void Repartition::init_inference( FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ 
-200,6 +201,7 @@ void Repartition::create_input_partition_inference( } void Repartition::inference(FFModel const &ff, + BatchConfig const & bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 541cf34976..a5b846dbdb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -111,16 +111,18 @@ void InferenceManager::init_operators_inference() { ((ParallelOp *)op) ->create_input_partition_inference(*model, inputs, outputs); } - op->init_inference(*model, inputs, outputs, view); + BatchConfig bc; + op->init_inference(*model, bc, inputs, outputs, view); } } } } -void InferenceManager::inference(int index) { +FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { int batch_index = index % max_num_inflight_batches; int device_index = index % num_devices; int expert_device_index = 0; + FutureMap fm; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { @@ -156,8 +158,9 @@ void InferenceManager::inference(int index) { outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } - op->inference(*model, inputs, outputs, view); + fm = op->inference(*model, bc, inputs, outputs, view); } + return fm; }; }; // namespace FlexFlow From b6ed7639f6632ad69ff71209cfc89e576424ecd4 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 28 Feb 2023 01:38:17 +0000 Subject: [PATCH 065/344] format --- examples/cpp/inference/data_generator.cc | 8 +- examples/cpp/inference/data_generator.h | 16 +- .../inference/transformers/transformers.cc | 42 ++--- include/flexflow/batch_config.h | 9 +- include/flexflow/inference.h | 2 +- include/flexflow/operator.h | 10 +- include/flexflow/ops/aggregate.h | 12 +- include/flexflow/ops/aggregate_spec.h | 12 +- include/flexflow/ops/attention.h | 12 +- include/flexflow/ops/element_binary.h | 10 +- include/flexflow/ops/experts.h | 12 +- include/flexflow/ops/groupby.h | 12 +- .../ops/inc_multihead_self_attention.h | 90 +++++------ include/flexflow/ops/layer_norm.h | 10 +- include/flexflow/ops/linear.h | 10 +- include/flexflow/ops/noop.h | 12 +- include/flexflow/ops/softmax.h | 12 +- include/flexflow/ops/topk.h | 10 +- include/flexflow/parallel_ops/partition.h | 14 +- src/ops/aggregate.cc | 8 +- src/ops/aggregate_spec.cc | 11 +- src/ops/element_binary.cc | 11 +- src/ops/experts.cc | 10 +- src/ops/group_by.cc | 10 +- src/ops/inc_multihead_self_attention.cc | 151 +++++++++--------- src/ops/inc_multihead_self_attention.cu | 31 ++-- src/ops/layer_norm.cc | 10 +- src/ops/linear.cc | 10 +- src/ops/noop.cc | 13 +- src/ops/softmax.cc | 14 +- src/ops/topk.cc | 10 +- src/parallel_ops/partition.cc | 15 +- 32 files changed, 318 insertions(+), 301 deletions(-) diff --git a/examples/cpp/inference/data_generator.cc b/examples/cpp/inference/data_generator.cc index 16efe57e3f..3e8daa7d41 100644 --- a/examples/cpp/inference/data_generator.cc +++ b/examples/cpp/inference/data_generator.cc @@ -80,7 +80,9 @@ void DataGenerator::start_timer(void) { timer_started = true; }; -size_t DataGenerator::get_requests(size_t max_num_requests, std::vector > >&prompts) { +size_t DataGenerator::get_requests( + size_t max_num_requests, + std::vector>> &prompts) { if (!timer_started) { std::cout << "Warning: tried to get number of requests before the timer " "was started." 
@@ -103,8 +105,9 @@ size_t DataGenerator::get_requests(size_t max_num_requests, std::vector prompt; - for (int j = 0; j < length; j++) + for (int j = 0; j < length; j++) { prompt.push_back(j + 1000); + } prompts.push_back(std::make_pair(global_unique_id++, prompt)); } assert(prompts.size() == received_requests); @@ -133,4 +136,3 @@ size_t DataGenerator::get_requests() { return received_requests; } - diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index 93bb565c41..777c7cadd7 100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -29,7 +29,9 @@ class DataGenerator { void start_timer(void); // Get number of requests that have arrived since the last time this function // was called - size_t get_requests(size_t max_num_requests, std::vector > > &prompts); + size_t + get_requests(size_t max_num_requests, + std::vector>> &prompts); size_t get_requests(); private: @@ -37,12 +39,12 @@ class DataGenerator { // vector. void generate_arrival_times(void); - size_t num_requests; // total number of requests - size_t token_dim; // embedding dim of each token - size_t sequence_length; // dimension of one request tensor - bool poisson_distr; // false implies uniform distribution - double lambda; // mean #num of arrivals per sec - bool timer_started; // whether timer was initiated + size_t num_requests; // total number of requests + size_t token_dim; // embedding dim of each token + size_t sequence_length; // dimension of one request tensor + bool poisson_distr; // false implies uniform distribution + double lambda; // mean #num of arrivals per sec + bool timer_started; // whether timer was initiated size_t global_unique_id; // guid for requests // time when get_requests() is called for the first time Clock::time_point start_time; diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 27e438037e..ab9f485517 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -36,20 +36,21 @@ void parse_input_args(char **argv, int argc, MoeConfig &config) { } Tensor create_inc_multihead_attention_decoder(FFModel *model, - MoeConfig const *moeConfig, - Tensor const &input) { + MoeConfig const *moeConfig, + Tensor const &input) { std::vector axes{0}; Tensor t = model->inc_multihead_self_attention(input, - moeConfig->hidden_size, - moeConfig->num_attention_heads, - moeConfig->attention_kdim, - moeConfig->attention_vdim); + moeConfig->hidden_size, + moeConfig->num_attention_heads, + moeConfig->attention_kdim, + moeConfig->attention_vdim); t = model->layer_norm(model->add(t, input), axes, true, 1e-05); - Tensor x = model->dense(model->dense(t, moeConfig->hidden_size, AC_MODE_RELU, false /*bias*/), - moeConfig->hidden_size, - AC_MODE_NONE, - false /*bias*/); + Tensor x = model->dense( + model->dense(t, moeConfig->hidden_size, AC_MODE_RELU, false /*bias*/), + moeConfig->hidden_size, + AC_MODE_NONE, + false /*bias*/); t = model->layer_norm(model->add(x, t), axes, true, 1e-05); return t; } @@ -122,17 +123,17 @@ void FlexFlow::top_level_task(Task const *task, data_loader.reset(); data_generator.start_timer(); std::map future_handlers; - std::map batch_configs; + std::map batch_configs; while (processed_requests < moeConfig.total_requests) { for (int bid = 0; bid < im.max_num_inflight_batches; bid++) { if (future_handlers.find(bid) == future_handlers.end()) { - std::vector > > prompts; + std::vector>> 
prompts; assert(im.max_num_requests_per_batch <= BatchConfig::MAX_NUM_REQUESTS); data_generator.get_requests(im.max_num_requests_per_batch, prompts); assert((int)prompts.size() < im.max_num_requests_per_batch); - //TODO: loading data - BatchConfig* bc = new BatchConfig(); - for (const auto & prompt : prompts) { + // TODO: loading data + BatchConfig *bc = new BatchConfig(); + for (auto const &prompt : prompts) { assert(bc->register_new_request(prompt.first, prompt.second.size())); } bc->prepare_next_batch(); @@ -145,17 +146,18 @@ void FlexFlow::top_level_task(Task const *task, batch_configs[bid] = bc; } else { Future future = future_handlers[bid]; - if (!future.is_ready(true/*subscribe*/)) { + if (!future.is_ready(true /*subscribe*/)) { continue; } InferenceResult ir = future.get_result(); - BatchConfig* bc = batch_configs[bid]; + BatchConfig *bc = batch_configs[bid]; processed_requests += bc->update_results(ir); - int available_slots = BatchConfig::MAX_NUM_REQUESTS - bc->num_processing_requests(); - std::vector > > prompts; + int available_slots = + BatchConfig::MAX_NUM_REQUESTS - bc->num_processing_requests(); + std::vector>> prompts; data_generator.get_requests(available_slots, prompts); processed_requests += prompts.size(); - for (const auto& prompt : prompts) { + for (auto const &prompt : prompts) { assert(bc->register_new_request(prompt.first, prompt.second.size())); } bc->prepare_next_batch(); diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 4e781392b1..648406f2de 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -17,11 +17,10 @@ #include - namespace FlexFlow { struct InferenceResult { - static const int MAX_NUM_TOKENS = 1024; + static int const MAX_NUM_TOKENS = 1024; int results[MAX_NUM_TOKENS]; }; @@ -32,9 +31,9 @@ class BatchConfig { void prepare_next_batch(); int update_results(InferenceResult const &ir); int num_processing_requests(); - static const int MAX_NUM_REQUESTS = 256; - static const int MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; - static const int MAX_SEQUENCE_LENGTH = 1024; + static int const MAX_NUM_REQUESTS = 256; + static int const MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; + static int const MAX_SEQUENCE_LENGTH = 1024; // These are set by update int token_start_idx[MAX_NUM_REQUESTS]; int token_last_available_idx[MAX_NUM_REQUESTS]; diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 9529052ac7..92aa8f5d21 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -15,8 +15,8 @@ #pragma once -#include "flexflow/model.h" #include "flexflow/batch_config.h" +#include "flexflow/model.h" namespace FlexFlow { diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 94bd19c927..1690e0e3f3 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -1,11 +1,11 @@ #ifndef _OPERATOR_H #define _OPERATOR_H +#include "flexflow/batch_config.h" #include "flexflow/fftype.h" #include "flexflow/machine_view.h" #include "flexflow/parallel_tensor.h" #include "flexflow/utils/dot/record_formatter.h" -#include "flexflow/batch_config.h" #include namespace FlexFlow { @@ -197,10 +197,10 @@ class Op { virtual void backward(FFModel const &) = 0; // Pure virtual functions for inference virtual Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) { + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView 
const *mv = nullptr) { assert(false); }; virtual void print_layer(FFModel const &model) = 0; diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 56f84308c2..27b9981ddd 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_AGGREGATE_H_ #define _FLEXFLOW_AGGREGATE_H_ +#include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/ops/aggregate_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -37,16 +37,16 @@ class Aggregate : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index f48a5f95ee..ce1ceb34d6 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_AGGREGATE_SPEC_H_ #define _FLEXFLOW_AGGREGATE_SPEC_H_ +#include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/ops/aggregate_spec_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -29,16 +29,16 @@ class AggregateSpec : public Op { char const *name); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index 88457756c5..c3146ad38b 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -3,12 +3,12 @@ #include "flexflow/device.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/layer.h" #include "flexflow/node.h" #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/ops/attention_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -66,17 +66,17 @@ class MultiHeadAttention : public Op { std::vector const &inputs); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void 
print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 351685f140..5a14acb80b 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -1,11 +1,11 @@ #ifndef _FLEXFLOW_ELEMENT_BINARY_H #define _FLEXFLOW_ELEMENT_BINARY_H +#include "flexflow/inference.h" #include "flexflow/layer.h" #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/element_binary_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -34,10 +34,10 @@ class ElementBinary : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index dd15ca5019..74c24c5a19 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -1,8 +1,8 @@ #pragma once +#include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/ops/experts_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -56,17 +56,17 @@ class Experts : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; void serialize(Legion::Serializer &) const override; static PCG::Node deserialize(FFModel &ff, diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index bdfdeea669..bef61dc755 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_GROUPBY_H_ #define _FLEXFLOW_GROUPBY_H_ +#include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/node.h" #include "flexflow/ops/groupby_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -36,17 +36,17 @@ class Group_by : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 7390d127a9..4f2dc80635 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ 
b/include/flexflow/ops/inc_multihead_self_attention.h @@ -1,15 +1,14 @@ #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H - #include "flexflow/device.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/layer.h" #include "flexflow/node.h" #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/ops/inc_multihead_self_attention_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -21,57 +20,57 @@ class IncMultiHeadSelfAttention : public Op { using Input = ParallelTensor; IncMultiHeadSelfAttention(FFModel &model, - LayerID const &layer_guid, - const ParallelTensor _input, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name); + LayerID const &layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name); IncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name); + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name); IncMultiHeadSelfAttention(FFModel &model, - IncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); + IncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights); IncMultiHeadSelfAttention(FFModel &model, - Params const ¶ms, - Input const &inputs, - bool allocate_weights = false, - char const *name = nullptr); + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, Layer const *layer, std::vector const &inputs); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -94,10 +93,11 @@ class IncMultiHeadSelfAttention : public Op { float *output_ptr, ffStream_t stream); static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr); + float const *input_ptr, + float const *weight_ptr, + float *output_ptr); Params get_params() const; + public: int num_heads; float dropout; @@ -110,10 +110,10 @@ class IncMultiHeadSelfAttention : public Op { class IncMultiHeadSelfAttentionMeta : public OpMeta { public: IncMultiHeadSelfAttentionMeta(FFHandler handler, - IncMultiHeadSelfAttention const *attn, - Legion::Memory gpu_mem, - int num_samples, - int 
num_heads); + IncMultiHeadSelfAttention const *attn, + Legion::Memory gpu_mem, + int num_samples, + int num_heads); ~IncMultiHeadSelfAttentionMeta(void); public: diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 9c7bb6f31a..284f42a716 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -1,7 +1,7 @@ #pragma once -#include "flexflow/model.h" #include "flexflow/inference.h" +#include "flexflow/model.h" namespace FlexFlow { @@ -33,10 +33,10 @@ class LayerNorm : public Op { void forward(FFModel const &); void backward(FFModel const &); Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) { assert(0); } diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index df4772831b..b0af71e610 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H +#include "flexflow/inference.h" #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/linear_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -44,10 +44,10 @@ class Linear : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * diff --git a/include/flexflow/ops/noop.h b/include/flexflow/ops/noop.h index 7ca55463fd..91ccc15094 100644 --- a/include/flexflow/ops/noop.h +++ b/include/flexflow/ops/noop.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_NOOP_H #define _FLEXFLOW_NOOP_H -#include "flexflow/model.h" #include "flexflow/inference.h" +#include "flexflow/model.h" namespace FlexFlow { @@ -24,11 +24,11 @@ class NoOp : public Op { std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; - void inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 7dfd372451..a8be3b98d5 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -1,11 +1,11 @@ #ifndef _FLEXFLOW_SOFTMAX_H #define _FLEXFLOW_SOFTMAX_H +#include "flexflow/inference.h" #include "flexflow/layer.h" #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/softmax_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -23,16 +23,16 @@ class Softmax : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, + BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = 
nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; void print_layer(FFModel const &model) override { diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 8d1bf010c3..138c11b4d7 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ +#include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/node.h" #include "flexflow/ops/topk_params.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -37,10 +37,10 @@ class TopK : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/parallel_ops/partition.h b/include/flexflow/parallel_ops/partition.h index 919446b0ab..1658759411 100644 --- a/include/flexflow/parallel_ops/partition.h +++ b/include/flexflow/parallel_ops/partition.h @@ -1,12 +1,12 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H +#include "flexflow/inference.h" #include "flexflow/layer.h" #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/parallel_ops/partition_params.h" #include "parallel_op.h" -#include "flexflow/inference.h" namespace FlexFlow { @@ -31,16 +31,16 @@ class Repartition : public ParallelOp { std::vector const &batch_outputs) override; void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const & bc, + BatchConfig const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; - void inference(FFModel const &, - BatchConfig const & bc, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 1556a93c78..12ab38efad 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -286,10 +286,10 @@ void Aggregate::forward(FFModel const &ff) { } FutureMap Aggregate::inference(FFModel const &ff, - BatchConfig const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index a255f2ed5f..b5c3551cb3 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -259,11 +259,12 @@ void AggregateSpec::forward(FFModel 
const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap AggregateSpec::inference(FFModel const &ff, - BatchConfig const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap + AggregateSpec::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index e1cc3d04c4..01f68919bd 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -507,11 +507,12 @@ void ElementBinary::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } -FutureMap ElementBinary::inference(FFModel const &ff, - BatchConfig const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap + ElementBinary::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 1abf834551..37954f67ad 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -428,7 +428,7 @@ Node Experts::deserialize(FFModel &ff, } void Experts::init_inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -644,10 +644,10 @@ void Experts::forward(FFModel const &ff) { } FutureMap Experts::inference(FFModel const &ff, - BatchConfig const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index bfdb05bfe7..ae274a1672 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -165,7 +165,7 @@ Group_by::Group_by(FFModel &model, model, inputs.first, inputs.second, params.n, params.alpha, name) {} void Group_by::init_inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -313,10 +313,10 @@ void Group_by::forward(FFModel const &ff) { } FutureMap Group_by::inference(FFModel const &ff, - BatchConfig const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index d938cd1b4c..1111e35f8e 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -37,7 +37,8 @@ using Legion::Task; using Legion::TaskArgument; using Legion::TaskLauncher; -bool IncMultiHeadSelfAttentionParams::is_valid(ParallelTensorShape const &input) const { +bool IncMultiHeadSelfAttentionParams::is_valid( + ParallelTensorShape const &input) const { bool is_valid = input.is_valid(); return is_valid; } @@ -53,7 
+54,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, bool add_zero_attn, Initializer *kernel_initializer, char const *name) { - // Currently assume that + // Currently assume that Layer *li = new Layer(this, OP_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, @@ -139,19 +140,20 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( layer->name); } -IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor _input, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name) +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( + FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) // Initializer* _bias_initializer) : Op(model, OP_MULTIHEAD_ATTENTION, @@ -166,8 +168,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) -{ + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { // overwrite layer_guid layer_guid = _layer_guid; @@ -221,19 +222,20 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, /* assert(check_output_input_weight_parallel_dims()); */ } -IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name) +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( + FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) // Initializer* _bias_initializer) : Op(model, OP_INC_MULTIHEAD_SELF_ATTENTION, @@ -302,23 +304,24 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, /* assert(check_output_input_weight_parallel_dims()); */ } -IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, - IncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights) +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( + FFModel &model, + IncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights) : IncMultiHeadSelfAttention(model, - other.layer_guid, - input, - other.oProjSize, - other.num_heads, - other.qProjSize, - other.vProjSize, - other.dropout, - other.bias, - other.add_bias_kv, - other.add_zero_attn, - allocate_weights, - other.name) {} + other.layer_guid, + input, + other.oProjSize, + other.num_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.bias, + other.add_bias_kv, + other.add_zero_attn, + allocate_weights, + other.name) {} IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( FFModel &model, @@ -327,18 +330,18 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool allocate_weights, char const *name) : 
IncMultiHeadSelfAttention(model, - params.layer_guid, - input, - params.embed_dim, - params.num_heads, - params.kdim, - params.vdim, - params.dropout, - params.bias, - params.add_bias_kv, - params.add_zero_attn, - allocate_weights, - name) {} + params.layer_guid, + input, + params.embed_dim, + params.num_heads, + params.kdim, + params.vdim, + params.dropout, + params.bias, + params.add_bias_kv, + params.add_zero_attn, + allocate_weights, + name) {} void IncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -428,12 +431,13 @@ void IncMultiHeadSelfAttention::init(FFModel const &ff) { regions[1](I): weight regions[2](O): output */ -OpMeta * - IncMultiHeadSelfAttention::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - IncMultiHeadSelfAttention const *attn = (IncMultiHeadSelfAttention *)task->args; +OpMeta *IncMultiHeadSelfAttention::init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + IncMultiHeadSelfAttention const *attn = + (IncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -452,8 +456,8 @@ OpMeta * .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) .first(); - IncMultiHeadSelfAttentionMeta *m = - new IncMultiHeadSelfAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads); + IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( + handle, attn, gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); return m; @@ -519,7 +523,8 @@ void IncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(regions.size() == 3); assert(task->regions.size() == regions.size()); - // const IncMultiHeadSelfAttention* attn = (IncMultiHeadSelfAttention*) task->args; + // const IncMultiHeadSelfAttention* attn = (IncMultiHeadSelfAttention*) + // task->args; IncMultiHeadSelfAttentionMeta const *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( @@ -529,10 +534,8 @@ void IncMultiHeadSelfAttention::inference_task( GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - IncMultiHeadSelfAttention::inference_kernel_wrapper(m, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr()); + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, input.get_float_ptr(), weight.get_float_ptr(), output.get_float_ptr()); } void IncMultiHeadSelfAttention::backward(FFModel const &ff) { @@ -540,7 +543,8 @@ void IncMultiHeadSelfAttention::backward(FFModel const &ff) { assert(false); } -bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, int *value) const { +bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, + int *value) const { switch (para) { case PM_NUM_HEADS: *value = num_heads; @@ -595,8 +599,7 @@ bool IncMultiHeadSelfAttention::measure_operator_cost( std::function forward, backward; forward = [&] { - inference_kernel_wrapper( - m, input_ptr, weight_ptr, output_ptr); + inference_kernel_wrapper(m, input_ptr, weight_ptr, output_ptr); }; if (sim->computationMode == COMP_MODE_TRAINING) { // IncMultiHeadSelfAttention does not support training diff --git a/src/ops/inc_multihead_self_attention.cu 
b/src/ops/inc_multihead_self_attention.cu index b903a6f073..e7439fa7d2 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -23,11 +23,12 @@ using Legion::coord_t; using Legion::Memory; /*static*/ -void IncMultiHeadSelfAttention::inference_kernel(IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - cudaStream_t stream) { +void IncMultiHeadSelfAttention::inference_kernel( + IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); checkCUDNN(cudnnMultiHeadAttnForward(m->handle.dnn, @@ -55,10 +56,11 @@ void IncMultiHeadSelfAttention::inference_kernel(IncMultiHeadSelfAttentionMeta c } /*static*/ -void IncMultiHeadSelfAttention::inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr) { +void IncMultiHeadSelfAttention::inference_kernel_wrapper( + IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -84,11 +86,12 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(IncMultiHeadSelfAttenti } } -IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(FFHandler handler, - IncMultiHeadSelfAttention const *attn, - Memory gpu_mem, - int num_samples, - int num_heads) +IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( + FFHandler handler, + IncMultiHeadSelfAttention const *attn, + Memory gpu_mem, + int num_samples, + int num_heads) : OpMeta(handler) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index b2716cb24d..949da919af 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -195,7 +195,7 @@ LayerNorm::LayerNorm(FFModel &model, } void LayerNorm::init_inference(FFModel const &ff, - BatchConfig const & bc, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -317,10 +317,10 @@ void LayerNorm::forward(FFModel const &ff) { } FutureMap LayerNorm::inference(FFModel const &ff, - BatchConfig const & bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/ops/linear.cc b/src/ops/linear.cc index f91da55762..84a962c998 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -255,7 +255,7 @@ void Linear::init(FFModel const &ff) { } void Linear::init_inference(FFModel const &ff, - BatchConfig const & bc, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -422,10 +422,10 @@ void Linear::forward(FFModel const &ff) { } FutureMap Linear::inference(FFModel const &ff, - BatchConfig const & bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/ops/noop.cc b/src/ops/noop.cc index 
7ad714a4c4..2a2686cabd 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -96,7 +96,7 @@ OpMeta *NoOp::init_task(Task const *task, } void NoOp::init_inference(FFModel const &ff, - BatchConfig const & bc, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -260,10 +260,13 @@ void NoOp::init(FFModel const &ff) { void NoOp::forward(FFModel const &ff) {} FutureMap NoOp::inference(FFModel const &ff, - BatchConfig const & bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) {} + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + FutureMap empty; + return empty; +} void NoOp::backward(FFModel const &ff) {} diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 3bdb8b4f8d..09eea13492 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -116,7 +116,7 @@ Softmax::Softmax(FFModel &model, : Softmax(model, input, params.dim, name) {} void Softmax::init_inference(FFModel const &ff, - BatchConfig const & bc, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -227,10 +227,10 @@ OpMeta *Softmax::init_task(Task const *task, } FutureMap Softmax::inference(FFModel const &ff, - BatchConfig const & bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -260,7 +260,7 @@ FutureMap Softmax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Softmax::forward(FFModel const &ff) { @@ -288,7 +288,7 @@ void Softmax::forward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region)); launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + runtime->execute_index_space(ctx, launcher); } void Softmax::forward_task(Task const *task, diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 26aa6aeeb5..6192010c51 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -137,7 +137,7 @@ TopK::TopK(FFModel &model, : TopK(model, input, params.k, params.sorted, name) {} void TopK::init_inference(FFModel const &ff, - BatchConfig const & bc, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -265,10 +265,10 @@ void TopK::forward(FFModel const &ff) { } FutureMap TopK::inference(FFModel const &ff, - BatchConfig const & bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 3e9ef7eac6..464b1cb1e6 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -103,7 +103,7 @@ OpMeta *Repartition::init_task(Task const *task, void Repartition::init_inference( FFModel const &ff, - BatchConfig const & bc, + BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ 
-200,11 +200,12 @@ void Repartition::create_input_partition_inference( inference_input_lps[batch_inputs[0]]); } -void Repartition::inference(FFModel const &ff, - BatchConfig const & bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { +FutureMap + Repartition::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -237,7 +238,7 @@ void Repartition::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); } void Repartition::forward(FFModel const &ff) { From 85b1fc5be50376b76059af1e8b2cc5d4fa73dcbb Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 28 Feb 2023 03:59:34 +0000 Subject: [PATCH 066/344] bug fixes --- .../cpp/inference/transformers/dataloader.cc | 4 +- .../inference/transformers/transformers.cc | 4 +- .../cpp/inference/transformers/transformers.h | 2 +- include/flexflow/batch_config.h | 8 ++-- include/flexflow/model.h | 1 + include/flexflow/ops/softmax.h | 5 +++ src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/softmax.cc | 25 ++++++++++- src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 44 +++++++++++++++++++ src/runtime/inference_manager.cc | 2 +- src/runtime/model.cc | 7 +++ src/runtime/operator_params.cc | 3 ++ src/runtime/substitution.cc | 8 ++++ 15 files changed, 107 insertions(+), 12 deletions(-) diff --git a/examples/cpp/inference/transformers/dataloader.cc b/examples/cpp/inference/transformers/dataloader.cc index a61598299e..6a1ccb2338 100644 --- a/examples/cpp/inference/transformers/dataloader.cc +++ b/examples/cpp/inference/transformers/dataloader.cc @@ -50,7 +50,7 @@ DataLoader::DataLoader(FFModel &ff, // Assume only the first dim can be the replica dim assert(i == replica_idx || (!dims[i].is_replica_dim)); } - assert(dims[batch_idx].size == ff.config.batchSize); + assert(dims[batch_idx].size == BatchConfig::MAX_NUM_TOKENS); dims[batch_idx].size = num_samples; full_input = @@ -72,7 +72,7 @@ DataLoader::DataLoader(FFModel &ff, // Assume only the last dim can be the replica dim assert(i == replica_idx || (!dims[i].is_replica_dim)); } - assert(dims[batch_idx].size == ff.config.batchSize); + assert(dims[batch_idx].size == BatchConfig::MAX_NUM_TOKENS); // replace batch size with number of samples dims[batch_idx].size = num_samples; diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index ab9f485517..f67d779e3a 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -38,7 +38,7 @@ void parse_input_args(char **argv, int argc, MoeConfig &config) { Tensor create_inc_multihead_attention_decoder(FFModel *model, MoeConfig const *moeConfig, Tensor const &input) { - std::vector axes{0}; + std::vector axes{1}; Tensor t = model->inc_multihead_self_attention(input, moeConfig->hidden_size, moeConfig->num_attention_heads, @@ -153,7 +153,7 @@ void FlexFlow::top_level_task(Task const *task, BatchConfig *bc = batch_configs[bid]; processed_requests += bc->update_results(ir); int available_slots = - BatchConfig::MAX_NUM_REQUESTS - bc->num_processing_requests(); + BatchConfig::MAX_NUM_REQUESTS - 
bc->num_active_requests(); std::vector>> prompts; data_generator.get_requests(available_slots, prompts); processed_requests += prompts.size(); diff --git a/examples/cpp/inference/transformers/transformers.h b/examples/cpp/inference/transformers/transformers.h index 3c09f64055..207a5de56b 100644 --- a/examples/cpp/inference/transformers/transformers.h +++ b/examples/cpp/inference/transformers/transformers.h @@ -31,7 +31,7 @@ struct MoeConfig { batch_size = 32; out_dim = 15; num_labels = out_dim; - num_layers = 12; + num_layers = 1; //----------------------- Inference parameters --------------------- // total number of requests processed as part of the simulation total_requests = 256; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 648406f2de..558e7841b5 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -27,19 +27,21 @@ struct InferenceResult { class BatchConfig { public: BatchConfig(); - bool register_new_request(int guid, int length); + bool register_new_request(size_t guid, int length); void prepare_next_batch(); int update_results(InferenceResult const &ir); - int num_processing_requests(); + int num_active_requests(); + int num_active_tokens(); static int const MAX_NUM_REQUESTS = 256; static int const MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; static int const MAX_SEQUENCE_LENGTH = 1024; // These are set by update + int num_tokens, num_requests; + bool cached_results; int token_start_idx[MAX_NUM_REQUESTS]; int token_last_available_idx[MAX_NUM_REQUESTS]; int num_processing_tokens[MAX_NUM_REQUESTS]; size_t request_guid[MAX_NUM_REQUESTS]; - // This is set by the app bool request_completed[MAX_NUM_REQUESTS]; }; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 041d8c507d..151aff1eaf 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -112,6 +112,7 @@ enum TaskIDs { SOFTMAX_INIT_TASK_ID, SOFTMAX_FWD_TASK_ID, SOFTMAX_BWD_TASK_ID, + SOFTMAX_INF_TASK_ID, CONCAT_INIT_TASK_ID, CONCAT_FWD_TASK_ID, CONCAT_BWD_TASK_ID, diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index a8be3b98d5..85eecfb744 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -54,6 +54,11 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static InferenceResult + inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 1111e35f8e..efbf8636de 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -156,7 +156,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( char const *name) // Initializer* _bias_initializer) : Op(model, - OP_MULTIHEAD_ATTENTION, + OP_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index e7439fa7d2..048453dd17 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -92,7 +92,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); 
checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 09eea13492..310c835ab8 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -240,7 +240,7 @@ FutureMap Softmax::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(SOFTMAX_FWD_TASK_ID, + IndexLauncher launcher(SOFTMAX_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -409,6 +409,29 @@ void Softmax::backward_task_with_dim(Task const *task, m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); } +InferenceResult + Softmax::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + switch (in_domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + forward_task_with_dim(task, regions, ctx, runtime); \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } + // FIXME: replace this with actual result + InferenceResult ir; + return ir; +} + bool Softmax::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_SOFTMAX_DIM: diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index dcf8d7f882..b762ad0dd5 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -143,6 +143,8 @@ std::string get_operator_type_name(OperatorType type) { return "PReLU"; case OP_MULTIHEAD_ATTENTION: return "MultiHeadAttention"; + case OP_INC_MULTIHEAD_SELF_ATTENTION: + return "IncMultiHeadSelfAttention"; case OP_INPUT: return "Input"; case OP_WEIGHT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 0efa3711e0..6434c86ced 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -29,6 +29,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -1714,6 +1715,19 @@ GraphOptimalViewSerialized sez.serialize(attn->add_zero_attn); break; } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + IncMultiHeadSelfAttention *attn = (IncMultiHeadSelfAttention *)op; + sez.serialize(attn->layer_guid.id); + sez.serialize(attn->oProjSize); + sez.serialize(attn->num_heads); + sez.serialize(attn->qProjSize); + sez.serialize(attn->vProjSize); + sez.serialize(attn->dropout); + sez.serialize(attn->bias); + sez.serialize(attn->add_bias_kv); + sez.serialize(attn->add_zero_attn); + break; + } case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); @@ -2072,6 +2086,36 @@ void FFModel::deserialize_graph_optimal_view( {inputs[0], inputs[1], inputs[2]}, params); break; } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(num_inputs == 1); + int embed_dim, num_heads, k_dim, v_dim; + float dropout; + bool bias, add_bias_kv, add_zero_attn; + size_t id; + dez.deserialize(id); + LayerID layer_guid(id); + dez.deserialize(embed_dim); + dez.deserialize(num_heads); + dez.deserialize(k_dim); + dez.deserialize(v_dim); + dez.deserialize(dropout); + dez.deserialize(bias); + dez.deserialize(add_bias_kv); + dez.deserialize(add_zero_attn); + + IncMultiHeadSelfAttentionParams params; + params.embed_dim = embed_dim; + params.num_heads = num_heads; + params.kdim = k_dim; + params.vdim = v_dim; + params.dropout 
= dropout; + params.bias = bias; + params.add_bias_kv = add_bias_kv; + params.add_zero_attn = add_zero_attn; + params.layer_guid = layer_guid; + node = get_or_create_node(inputs[0], params); + break; + } case OP_TOPK: { node = TopK::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index a5b846dbdb..5c8be135a4 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -125,7 +125,7 @@ FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { FutureMap fm; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; - if (op->op_type == OP_WEIGHT) { + if (op->op_type == OP_WEIGHT || op->op_type == OP_INPUT) { continue; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ee274dc427..6ecac694ee 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4286,6 +4286,13 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "softmax_bwd_task"); } + { + TaskVariantRegistrar registrar(SOFTMAX_INF_TASK_ID, "softmax_inf_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "softmax_inf_task"); + } // compute Loss { TaskVariantRegistrar registrar(LOSS_BWD_TASK_ID, "Loss Backward"); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 41dd37dec7..69f28ca680 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -15,6 +15,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/mean.h" @@ -78,6 +79,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Gather *)op)->get_params(); case OP_MULTIHEAD_ATTENTION: return ((MultiHeadAttention *)op)->get_params(); + case OP_INC_MULTIHEAD_SELF_ATTENTION: + return ((IncMultiHeadSelfAttention *)op)->get_params(); case OP_LAYERNORM: return ((LayerNorm *)op)->get_params(); case OP_REDUCE_SUM: diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 7751bd48de..508697d941 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -28,6 +28,7 @@ #include "flexflow/ops/embedding.h" #include "flexflow/ops/experts.h" #include "flexflow/ops/flat.h" +#include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" @@ -3222,6 +3223,13 @@ bool FFModel::convert_graph_to_operators( break; break; } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(inList.size() == 1); + IncMultiHeadSelfAttention *attn = (IncMultiHeadSelfAttention *)node.ptr; + new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0], true); + break; + break; + } case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; From 579bb0a78b79344d3e138f78489d75e244036118 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 28 Feb 2023 21:20:00 -0500 Subject: [PATCH 067/344] formatting --- .../ops/inc_multihead_self_attention.h | 78 ++++----- src/ops/inc_multihead_self_attention.cc | 154 +++++++++--------- src/ops/inc_multihead_self_attention.cu | 31 ++-- 3 files changed, 135 insertions(+), 128 deletions(-) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h 
b/include/flexflow/ops/inc_multihead_self_attention.h index 0b1572f2ca..07be210cd3 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H - #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/layer.h" @@ -20,40 +19,40 @@ class IncMultiHeadSelfAttention : public Op { using Input = ParallelTensor; IncMultiHeadSelfAttention(FFModel &model, - LayerID const &layer_guid, - const ParallelTensor _input, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name); + LayerID const &layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name); IncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name); + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name); IncMultiHeadSelfAttention(FFModel &model, - IncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); + IncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights); IncMultiHeadSelfAttention(FFModel &model, - Params const ¶ms, - Input const &inputs, - bool allocate_weights = false, - char const *name = nullptr); + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, Layer const *layer, @@ -91,10 +90,11 @@ class IncMultiHeadSelfAttention : public Op { float *output_ptr, ffStream_t stream); static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr); + float const *input_ptr, + float const *weight_ptr, + float *output_ptr); Params get_params() const; + public: int num_heads; float dropout; @@ -107,10 +107,10 @@ class IncMultiHeadSelfAttention : public Op { class IncMultiHeadSelfAttentionMeta : public OpMeta { public: IncMultiHeadSelfAttentionMeta(FFHandler handler, - IncMultiHeadSelfAttention const *attn, - Legion::Memory gpu_mem, - int num_samples, - int num_heads); + IncMultiHeadSelfAttention const *attn, + Legion::Memory gpu_mem, + int num_samples, + int num_heads); ~IncMultiHeadSelfAttentionMeta(void); public: diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 329254fb74..2a45efe735 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -37,7 +37,8 @@ using Legion::Task; using Legion::TaskArgument; using Legion::TaskLauncher; -bool IncMultiHeadSelfAttentionParams::is_valid(ParallelTensorShape const &input) const { +bool IncMultiHeadSelfAttentionParams::is_valid( + ParallelTensorShape const &input) const { bool is_valid = input.is_valid(); return is_valid; } @@ -53,7 +54,7 @@ Tensor 
FFModel::inc_multihead_self_attention(const Tensor input, bool add_zero_attn, Initializer *kernel_initializer, char const *name) { - // Currently assume that + // Currently assume that Layer *li = new Layer(this, OP_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, @@ -139,19 +140,20 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( layer->name); } -IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor _input, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name) +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( + FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) // Initializer* _bias_initializer) : Op(model, OP_MULTIHEAD_ATTENTION, @@ -166,8 +168,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) -{ + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { // overwrite layer_guid layer_guid = _layer_guid; @@ -221,19 +222,20 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, /* assert(check_output_input_weight_parallel_dims()); */ } -IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name) +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( + FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) // Initializer* _bias_initializer) : Op(model, OP_INC_MULTIHEAD_SELF_ATTENTION, @@ -302,23 +304,24 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, /* assert(check_output_input_weight_parallel_dims()); */ } -IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(FFModel &model, - IncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights) +IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( + FFModel &model, + IncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights) : IncMultiHeadSelfAttention(model, - other.layer_guid, - input, - other.oProjSize, - other.num_heads, - other.qProjSize, - other.vProjSize, - other.dropout, - other.bias, - other.add_bias_kv, - other.add_zero_attn, - allocate_weights, - other.name) {} + other.layer_guid, + input, + other.oProjSize, + other.num_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.bias, + other.add_bias_kv, + other.add_zero_attn, + allocate_weights, + other.name) {} IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( FFModel &model, @@ -327,18 +330,18 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool allocate_weights, char const *name) : IncMultiHeadSelfAttention(model, - 
params.layer_guid, - input, - params.embed_dim, - params.num_heads, - params.kdim, - params.vdim, - params.dropout, - params.bias, - params.add_bias_kv, - params.add_zero_attn, - allocate_weights, - name) {} + params.layer_guid, + input, + params.embed_dim, + params.num_heads, + params.kdim, + params.vdim, + params.dropout, + params.bias, + params.add_bias_kv, + params.add_zero_attn, + allocate_weights, + name) {} void IncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -427,12 +430,13 @@ void IncMultiHeadSelfAttention::init(FFModel const &ff) { regions[1](I): weight regions[2](O): output */ -OpMeta * - IncMultiHeadSelfAttention::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - IncMultiHeadSelfAttention const *attn = (IncMultiHeadSelfAttention *)task->args; +OpMeta *IncMultiHeadSelfAttention::init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + IncMultiHeadSelfAttention const *attn = + (IncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -451,8 +455,8 @@ OpMeta * .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) .first(); - IncMultiHeadSelfAttentionMeta *m = - new IncMultiHeadSelfAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads); + IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( + handle, attn, gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); return m; @@ -475,7 +479,8 @@ void IncMultiHeadSelfAttention::inference( MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, view); size_t machine_view_hash = view->hash(); - /* std::cout << "IncMultiHeadSelfAttention op machine_view: " << *(MachineView const + /* std::cout << "IncMultiHeadSelfAttention op machine_view: " << *(MachineView + const *)mv << std::endl; */ int idx = 0; @@ -520,7 +525,8 @@ void IncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(regions.size() == 3); assert(task->regions.size() == regions.size()); - // const IncMultiHeadSelfAttention* attn = (IncMultiHeadSelfAttention*) task->args; + // const IncMultiHeadSelfAttention* attn = (IncMultiHeadSelfAttention*) + // task->args; IncMultiHeadSelfAttentionMeta const *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( @@ -530,10 +536,8 @@ void IncMultiHeadSelfAttention::inference_task( GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - IncMultiHeadSelfAttention::inference_kernel_wrapper(m, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr()); + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, input.get_float_ptr(), weight.get_float_ptr(), output.get_float_ptr()); } void IncMultiHeadSelfAttention::backward(FFModel const &ff) { @@ -541,7 +545,8 @@ void IncMultiHeadSelfAttention::backward(FFModel const &ff) { assert(false); } -bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, int *value) const { +bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, + int *value) const { switch (para) { case PM_NUM_HEADS: *value = num_heads; @@ -596,8 +601,7 @@ bool IncMultiHeadSelfAttention::measure_operator_cost( std::function forward, backward; forward = [&] { - inference_kernel_wrapper( - m, input_ptr, weight_ptr, output_ptr); + inference_kernel_wrapper(m, input_ptr, weight_ptr, output_ptr); }; if (sim->computationMode == COMP_MODE_TRAINING) { // IncMultiHeadSelfAttention does not support training diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b903a6f073..e7439fa7d2 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -23,11 +23,12 @@ using Legion::coord_t; using Legion::Memory; /*static*/ -void IncMultiHeadSelfAttention::inference_kernel(IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - cudaStream_t stream) { +void IncMultiHeadSelfAttention::inference_kernel( + IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); checkCUDNN(cudnnMultiHeadAttnForward(m->handle.dnn, @@ -55,10 +56,11 @@ void IncMultiHeadSelfAttention::inference_kernel(IncMultiHeadSelfAttentionMeta c } /*static*/ -void IncMultiHeadSelfAttention::inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr) { +void IncMultiHeadSelfAttention::inference_kernel_wrapper( + IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -84,11 +86,12 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(IncMultiHeadSelfAttenti } } 
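The wrapper reflowed above profiles the attention kernel with the standard CUDA event pattern: record a start and a stop event on the same stream as the kernel, synchronize on the stop event, then query the elapsed time. A minimal standalone sketch of that pattern (the kernel, sizes, and launch configuration are illustrative placeholders, not FlexFlow code):

    // Sketch: timing asynchronous GPU work with CUDA events.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void scale_kernel(float *x, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        x[i] *= 2.0f;
      }
    }

    int main() {
      int const n = 1 << 20;
      float *x;
      cudaMalloc((void **)&x, n * sizeof(float));
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      cudaEvent_t t_start, t_end;
      cudaEventCreate(&t_start);
      cudaEventCreate(&t_end);

      cudaEventRecord(t_start, stream);                        // enqueue start marker
      scale_kernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n); // work being timed
      cudaEventRecord(t_end, stream);                          // enqueue stop marker
      cudaEventSynchronize(t_end);                             // block until the stop marker completes

      float elapsed_ms = 0.0f;
      cudaEventElapsedTime(&elapsed_ms, t_start, t_end);
      printf("kernel time = %.2f ms\n", elapsed_ms);

      cudaEventDestroy(t_start);
      cudaEventDestroy(t_end);
      cudaStreamDestroy(stream);
      cudaFree(x);
      return 0;
    }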
-IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(FFHandler handler, - IncMultiHeadSelfAttention const *attn, - Memory gpu_mem, - int num_samples, - int num_heads) +IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( + FFHandler handler, + IncMultiHeadSelfAttention const *attn, + Memory gpu_mem, + int num_samples, + int num_heads) : OpMeta(handler) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); From a412cdf334cae5ed320e154bec91c852898aa5a1 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 2 Mar 2023 00:58:25 +0000 Subject: [PATCH 068/344] add missing file --- src/runtime/batch_config.cc | 132 ++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 src/runtime/batch_config.cc diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc new file mode 100644 index 0000000000..c196c09a23 --- /dev/null +++ b/src/runtime/batch_config.cc @@ -0,0 +1,132 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/batch_config.h" +#include +#include "legion.h" + +namespace FlexFlow { + +LegionRuntime::Logger::Category log_bc("BatchConfig"); + +BatchConfig::BatchConfig() { + cached_results = false; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + token_start_idx[i] = 0; + token_last_available_idx[i] = -1; + request_completed[i] = true; + num_processing_tokens[i] = 0; + } +} + +int BatchConfig::update_results(InferenceResult const &ir) { + cached_results = false; + int t = 0; + int completed = 0; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (request_completed[i]) + continue; + if (num_processing_tokens[i] == 0) + continue; + t += num_processing_tokens[i]; + token_start_idx[i] += num_processing_tokens[i]; + if (ir.results[t] == 0) { // TODO: replace this with + log_bc.print("[Done] guid(%zu) final_length(%d)", request_guid[i], token_start_idx[i]); + request_completed[i] = true; + token_start_idx[i] = 0; + token_last_available_idx[i] = -1; + num_processing_tokens[i] = 0; + completed ++; + } else if (token_start_idx[i] >= MAX_SEQUENCE_LENGTH) { + //Reach maximum request length + log_bc.print("[Done] guid(%zu) final_length(%d)", request_guid[i], token_start_idx[i]); + request_completed[i] = true; + token_start_idx[i] = 0; + token_last_available_idx[i] = -1; + num_processing_tokens[i] = 0; + completed ++; + } else { + if (token_start_idx[i] == token_last_available_idx[i] + 1) + token_last_available_idx[i] ++; + assert(token_start_idx[i] <= token_last_available_idx[i]); + } + num_processing_tokens[i] = 0; + } + return completed; +} + +bool BatchConfig::register_new_request(size_t guid, int length) { + cached_results = false; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (request_completed[i]) { + log_bc.print("[NewRequest] guid(%zu) length(%d)", guid, length); + token_start_idx[i] = 0; + token_last_available_idx[i] = length - 1; + request_guid[i] = guid; + num_processing_tokens[i] = 0; + request_completed[i] = false; + return true; + } + 
} + return false; +} + +void BatchConfig::prepare_next_batch() { + cached_results = false; + int num_tokens = 0; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (request_completed[i]) + continue; + if (num_tokens + token_last_available_idx[i] - token_start_idx[i] + 1 <= MAX_NUM_TOKENS) { + num_processing_tokens[i] = token_last_available_idx[i] - token_start_idx[i] + 1; + } else { + num_processing_tokens[i] = MAX_NUM_TOKENS - num_tokens; + } + num_tokens += num_processing_tokens[i]; + } + log_bc.print("[NextBatch] num_tokens(%d)", num_tokens); +} + +int BatchConfig::num_active_requests() { + if (cached_results) + return num_requests; + num_requests = 0; + num_tokens = 0; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (!request_completed[i]) { + num_requests ++; + num_tokens += num_processing_tokens[i]; + } + } + cached_results = true; + return num_requests; +} + +int BatchConfig::num_active_tokens() { + if (cached_results) + return num_tokens; + num_requests = 0; + num_tokens = 0; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (!request_completed[i]) { + num_requests ++; + num_tokens += num_processing_tokens[i]; + } + } + cached_results = true; + return num_tokens; +} + +}; // namespace FlexFlow From 6eb64e67a4a55f11140c4f31af583a7bca3f6dbe Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 2 Mar 2023 01:16:03 -0600 Subject: [PATCH 069/344] [Inference] implementing incremental inference for Transformers (#641) * [IncMultiHeadSelfAttention] initial implementation * interface update * checkpoint * format * bug fixes * added cmake for transformers --------- Co-authored-by: Gabriele Oliaro --- .github/workflows/build.yml | 6 +- .github/workflows/gpu-ci.yml | 1 + config/config.linux | 2 +- .../cpp/inference/transformers/CMakeLists.txt | 22 ++ src/ops/inc_multihead_self_attention.cpp | 252 ++++++++++++++++++ src/runtime/batch_config.cc | 46 ++-- 6 files changed, 309 insertions(+), 20 deletions(-) create mode 100644 examples/cpp/inference/transformers/CMakeLists.txt create mode 100644 src/ops/inc_multihead_self_attention.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 66d59cb95d..2cf1315ea3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -76,7 +76,8 @@ jobs: mkdir build cd build if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON fi ../config/config.linux @@ -91,7 +92,8 @@ jobs: export FF_CUDA_ARCH=70 cd build if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON fi ../config/config.linux diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 2a46e7d498..35397839a6 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -110,6 +110,7 @@ jobs: export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion pip install . 
--verbose diff --git a/config/config.linux b/config/config.linux index 940757f9e8..86e8f8b647 100755 --- a/config/config.linux +++ b/config/config.linux @@ -39,7 +39,7 @@ FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} # build C++ examples FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} +FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-OFF} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} diff --git a/examples/cpp/inference/transformers/CMakeLists.txt b/examples/cpp/inference/transformers/CMakeLists.txt new file mode 100644 index 0000000000..d52beae3ad --- /dev/null +++ b/examples/cpp/inference/transformers/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExample_Transformers) +set(project_target inference_transformers) + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + transformers.cc + dataloader.cc + ../data_generator.cc) + +set(GPU_SRC + dataloader.cu) + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) + diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp new file mode 100644 index 0000000000..0903d7fa5d --- /dev/null +++ b/src/ops/inc_multihead_self_attention.cpp @@ -0,0 +1,252 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +/*static*/ +void IncMultiHeadSelfAttention::inference_kernel( + IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION); +} + +/*static*/ +void IncMultiHeadSelfAttention::inference_kernel_wrapper( + IncMultiHeadSelfAttentionMeta const *m, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + IncMultiHeadSelfAttention::inference_kernel( + m, input_ptr, weight_ptr, output_ptr, stream); + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + printf("IncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( + FFHandler handler, + IncMultiHeadSelfAttention const *attn, + Memory gpu_mem, + int num_samples, + int num_heads) + : OpMeta(handler, attn) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(handler.dnn, stream)); +#if 0 + checkCUDNN(cudnnCreateAttnDescriptor(&attnDesc)); + checkCUDNN(cudnnCreateSeqDataDescriptor(&qDesc)); + checkCUDNN(cudnnCreateSeqDataDescriptor(&kDesc)); + checkCUDNN(cudnnCreateSeqDataDescriptor(&vDesc)); + checkCUDNN(cudnnCreateSeqDataDescriptor(&oDesc)); + // Currently do not support adding bias to key/value projection + assert(!attn->add_bias_kv); + cudnnAttnQueryMap_t attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; + // Assume no beam search for now + int maxBeamSize = 1; + // printf("batchSize(%d) qSize(%d) kSize(%d) vSize(%d) qProjSize(%d) + // kProjSize(%d)\n", + // num_samples, attn->qSize, attn->kSize, attn->vSize, attn->qProjSize, + // attn->kProjSize); + // printf("vProjSize(%d) oProjSize(%d) qoSeqLength(%d) kvSeqLength(%d)\n", + // attn->vProjSize, attn->oProjSize, attn->qoSeqLength, + // attn->kvSeqLength); + cudnnMathType_t math_type; + if (handle.allowTensorOpMathConversion) { + math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; + } else { + math_type = CUDNN_TENSOR_OP_MATH; + } + checkCUDNN(cudnnSetAttnDescriptor(attnDesc, + attnMode, + num_heads, + 1.0f /*smScalar*/, + CUDNN_DATA_FLOAT, + CUDNN_DATA_FLOAT, + math_type, + NULL /*attnDropoutDesc*/, + NULL /*postDropoutDesc*/, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->qoSeqLength, + attn->kvSeqLength, + num_samples, + maxBeamSize)); + size_t workSpaceSize; + checkCUDNN(cudnnGetMultiHeadAttnBuffers( + handler.dnn, attnDesc, &weightSize, &workSpaceSize, &reserveSpaceSize)); + assert(workSpaceSize <= handler.workSpaceSize); + // printf("weightSize(%zu) workSpaceSize(%zu) reserveSpaceSize(%zu)\n", + // weightSize, 
workSpaceSize, reserveSpaceSize); + int dimA[CUDNN_SEQDATA_DIM_COUNT]; + cudnnSeqDataAxis_t axes[CUDNN_SEQDATA_DIM_COUNT]; + assert(CUDNN_SEQDATA_DIM_COUNT == 4); + axes[3] = CUDNN_SEQDATA_VECT_DIM; // 3 = nbDims-1 + axes[2] = CUDNN_SEQDATA_BEAM_DIM; + axes[1] = CUDNN_SEQDATA_TIME_DIM; + axes[0] = CUDNN_SEQDATA_BATCH_DIM; + int *qoSeqArray = (int *)malloc(sizeof(int) * num_samples); + int *kvSeqArray = (int *)malloc(sizeof(int) * num_samples); + for (int i = 0; i < num_samples; i++) { + qoSeqArray[i] = attn->qoSeqLength; + kvSeqArray[i] = attn->kvSeqLength; + } + // Set qDesc + { + dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; + dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; + dimA[CUDNN_SEQDATA_TIME_DIM] = attn->qoSeqLength; + dimA[CUDNN_SEQDATA_VECT_DIM] = attn->qSize; + checkCUDNN(cudnnSetSeqDataDescriptor(qDesc, + CUDNN_DATA_FLOAT, + CUDNN_SEQDATA_DIM_COUNT, + dimA, + axes, + num_samples, + qoSeqArray, + NULL)); + } + // Set kDesc + { + dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; + dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; + dimA[CUDNN_SEQDATA_TIME_DIM] = attn->kvSeqLength; + dimA[CUDNN_SEQDATA_VECT_DIM] = attn->kSize; + checkCUDNN(cudnnSetSeqDataDescriptor(kDesc, + CUDNN_DATA_FLOAT, + CUDNN_SEQDATA_DIM_COUNT, + dimA, + axes, + num_samples, + kvSeqArray, + NULL)); + } + // Set vDesc + { + dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; + dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; + dimA[CUDNN_SEQDATA_TIME_DIM] = attn->kvSeqLength; + dimA[CUDNN_SEQDATA_VECT_DIM] = attn->vSize; + checkCUDNN(cudnnSetSeqDataDescriptor(vDesc, + CUDNN_DATA_FLOAT, + CUDNN_SEQDATA_DIM_COUNT, + dimA, + axes, + num_samples, + kvSeqArray, + NULL)); + } + // Set oDesc + { + dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; + dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; + dimA[CUDNN_SEQDATA_TIME_DIM] = attn->qoSeqLength; + dimA[CUDNN_SEQDATA_VECT_DIM] = attn->oProjSize; + checkCUDNN(cudnnSetSeqDataDescriptor(oDesc, + CUDNN_DATA_FLOAT, + CUDNN_SEQDATA_DIM_COUNT, + dimA, + axes, + num_samples, + qoSeqArray, + NULL)); + } + // allocate memory for the seqArray and reserve space + { + size_t totalSize = reserveSpaceSize + sizeof(int) * num_samples * 2; + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(totalSize - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(reserveInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + devQoSeqArray = (int *)reserveInst.pointer_untyped(0, sizeof(char)); + checkCUDA(cudaMemcpy(devQoSeqArray, + qoSeqArray, + sizeof(int) * num_samples, + cudaMemcpyHostToDevice)); + devKvSeqArray = (int *)devQoSeqArray + num_samples; + checkCUDA(cudaMemcpy(devKvSeqArray, + kvSeqArray, + sizeof(int) * num_samples, + cudaMemcpyHostToDevice)); + reserveSpace = (int *)devKvSeqArray + num_samples; + } + // allocate memory for loWinIdx/hiWinIdx + loWinIdx = (int *)malloc(sizeof(int) * attn->qoSeqLength); + hiWinIdx = (int *)malloc(sizeof(int) * attn->qoSeqLength); + for (int i = 0; i < attn->qoSeqLength; i++) { + loWinIdx[i] = 0; + hiWinIdx[i] = attn->kvSeqLength; + } + free(qoSeqArray); + free(kvSeqArray); +#endif +} + +IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { +#if 0 + reserveInst.destroy(); + free(loWinIdx); + free(hiWinIdx); + checkCUDNN(cudnnDestroyAttnDescriptor(attnDesc)); + checkCUDNN(cudnnDestroySeqDataDescriptor(qDesc)); + checkCUDNN(cudnnDestroySeqDataDescriptor(kDesc)); + checkCUDNN(cudnnDestroySeqDataDescriptor(vDesc)); + checkCUDNN(cudnnDestroySeqDataDescriptor(oDesc)); +#endif +} 
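Note on the ROCm translation unit above: the HIP build compiles and links this operator, but the attention math is not ported yet. inference_kernel() only sets the MIOpen stream and then calls handle_unimplemented_hip_kernel(), and the cuDNN-specific descriptor and workspace setup is kept under #if 0, presumably as a reference for the eventual MIOpen port. A minimal self-contained sketch of this "compile now, fail loudly at runtime" pattern (the names below are hypothetical, not FlexFlow APIs):

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical stand-in for FlexFlow's handle_unimplemented_hip_kernel().
    static void report_unimplemented(char const *op_name) {
      std::fprintf(stderr, "%s: kernel not implemented on this backend yet\n", op_name);
      std::abort();
    }

    // Hypothetical op wrapper: keeps the same signature as the CUDA path so the
    // host-side code stays identical, but aborts instead of computing wrong results.
    static void attention_forward_wrapper(float const *input, float *output, int n) {
      (void)input;
      (void)output;
      (void)n;
      report_unimplemented("IncMultiHeadSelfAttention");
    }

    int main() {
      float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      float out[4] = {0.0f, 0.0f, 0.0f, 0.0f};
      attention_forward_wrapper(in, out, 4); // aborts with a clear message
      return 0;
    }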
+ +}; // namespace FlexFlow diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index c196c09a23..99c3a18b63 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -14,8 +14,8 @@ */ #include "flexflow/batch_config.h" -#include #include "legion.h" +#include namespace FlexFlow { @@ -36,30 +36,37 @@ int BatchConfig::update_results(InferenceResult const &ir) { int t = 0; int completed = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - if (request_completed[i]) + if (request_completed[i]) { continue; - if (num_processing_tokens[i] == 0) + } + if (num_processing_tokens[i] == 0) { continue; + } t += num_processing_tokens[i]; token_start_idx[i] += num_processing_tokens[i]; if (ir.results[t] == 0) { // TODO: replace this with - log_bc.print("[Done] guid(%zu) final_length(%d)", request_guid[i], token_start_idx[i]); + log_bc.print("[Done] guid(%zu) final_length(%d)", + request_guid[i], + token_start_idx[i]); request_completed[i] = true; token_start_idx[i] = 0; token_last_available_idx[i] = -1; num_processing_tokens[i] = 0; - completed ++; + completed++; } else if (token_start_idx[i] >= MAX_SEQUENCE_LENGTH) { - //Reach maximum request length - log_bc.print("[Done] guid(%zu) final_length(%d)", request_guid[i], token_start_idx[i]); + // Reach maximum request length + log_bc.print("[Done] guid(%zu) final_length(%d)", + request_guid[i], + token_start_idx[i]); request_completed[i] = true; token_start_idx[i] = 0; token_last_available_idx[i] = -1; num_processing_tokens[i] = 0; - completed ++; + completed++; } else { - if (token_start_idx[i] == token_last_available_idx[i] + 1) - token_last_available_idx[i] ++; + if (token_start_idx[i] == token_last_available_idx[i] + 1) { + token_last_available_idx[i]++; + } assert(token_start_idx[i] <= token_last_available_idx[i]); } num_processing_tokens[i] = 0; @@ -87,10 +94,13 @@ void BatchConfig::prepare_next_batch() { cached_results = false; int num_tokens = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - if (request_completed[i]) + if (request_completed[i]) { continue; - if (num_tokens + token_last_available_idx[i] - token_start_idx[i] + 1 <= MAX_NUM_TOKENS) { - num_processing_tokens[i] = token_last_available_idx[i] - token_start_idx[i] + 1; + } + if (num_tokens + token_last_available_idx[i] - token_start_idx[i] + 1 <= + MAX_NUM_TOKENS) { + num_processing_tokens[i] = + token_last_available_idx[i] - token_start_idx[i] + 1; } else { num_processing_tokens[i] = MAX_NUM_TOKENS - num_tokens; } @@ -100,13 +110,14 @@ void BatchConfig::prepare_next_batch() { } int BatchConfig::num_active_requests() { - if (cached_results) + if (cached_results) { return num_requests; + } num_requests = 0; num_tokens = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (!request_completed[i]) { - num_requests ++; + num_requests++; num_tokens += num_processing_tokens[i]; } } @@ -115,13 +126,14 @@ int BatchConfig::num_active_requests() { } int BatchConfig::num_active_tokens() { - if (cached_results) + if (cached_results) { return num_tokens; + } num_requests = 0; num_tokens = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (!request_completed[i]) { - num_requests ++; + num_requests++; num_tokens += num_processing_tokens[i]; } } From 252b34ee288317731512f440a60c63b91f8f8850 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Mar 2023 14:09:55 -0500 Subject: [PATCH 070/344] [Inference] - Add GPT tokenizer (#645) * add gpt tokenizer * formatting * bug fix * add test * linting * fix typo * renaming * linting --- .github/workflows/gpu-ci.yml | 2 + 
examples/cpp/inference/gpt_tokenizer.cc | 273 +++++++++++++++++++ examples/cpp/inference/gpt_tokenizer.cpp | 53 ++++ examples/cpp/inference/gpt_tokenizer.h | 210 ++++++++++++++ examples/cpp/inference/gpt_tokenizer_test.sh | 63 +++++ 5 files changed, 601 insertions(+) create mode 100644 examples/cpp/inference/gpt_tokenizer.cc create mode 100644 examples/cpp/inference/gpt_tokenizer.cpp create mode 100644 examples/cpp/inference/gpt_tokenizer.h create mode 100755 examples/cpp/inference/gpt_tokenizer_test.sh diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 35397839a6..c660fb2b97 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -128,5 +128,7 @@ jobs: export FF_HOME=$(pwd) # C++ tests ./tests/cpp_gpu_tests.sh 4 + # GPT tokenizer test + ./examples/cpp/inference/gpt_tokenizer_test.sh # Python tests ./tests/multi_gpu_tests.sh 4 diff --git a/examples/cpp/inference/gpt_tokenizer.cc b/examples/cpp/inference/gpt_tokenizer.cc new file mode 100644 index 0000000000..c349bfacf3 --- /dev/null +++ b/examples/cpp/inference/gpt_tokenizer.cc @@ -0,0 +1,273 @@ +// version 0.1 +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2019-2020 zili wang . + +#include "gpt_tokenizer.h" + +using json = nlohmann::json; + +// codecvt abandoned in c++17 +std::wstring GPT_Tokenizer::utf8_to_wstring(std::string const &src) { + std::wstring_convert, wchar_t> converter; + return converter.from_bytes(src); +}; + +std::u32string GPT_Tokenizer::utf8_to_utf32(std::string const &src) { + std::wstring_convert, char32_t> converter; + return converter.from_bytes(src); +}; + +std::string GPT_Tokenizer::wstring_to_utf8(std::wstring const &src) { + std::wstring_convert, wchar_t> converter; + return converter.to_bytes(src); +}; + +std::string GPT_Tokenizer::utf32_to_utf8(std::u32string const &src) { + std::wstring_convert, char32_t> converter; + return converter.to_bytes(src); +}; + +wchar_t *GPT_Tokenizer::bytes_to_unicode() { + std::vector bs; + for (auto i = uint64_t(L'!'); i < uint64_t(L'~') + 1; ++i) { + bs.push_back(i); + } + for (auto i = uint64_t(L'¡'); i < uint64_t(L'¬') + 1; ++i) { + bs.push_back(i); + } + for (auto i = uint64_t(L'®'); i < uint64_t(L'ÿ') + 1; ++i) { + bs.push_back(i); + } + std::vector cs = bs; + uint64_t n = 0; + for (uint64_t b = 0; b < 256; ++b) { + auto p = find(bs.begin(), bs.end(), b); + if (p == bs.end()) { + bs.push_back(b); + cs.push_back(256 + n); + n++; + } + } + static wchar_t bytes_mapping[256] = {}; + for (size_t i = 0; i < 256; i++) { + bytes_mapping[i] = i; + } + for (size_t i = 0; i < bs.size(); i++) { + bytes_mapping[bs[i]] = cs[i]; + } + return bytes_mapping; +} + +std::vector GPT_Tokenizer::split(std::string const &s, + std::regex rgx) { + std::vector elems; + std::sregex_token_iterator iter(s.begin(), s.end(), rgx, -1); + std::sregex_token_iterator end; + while (iter != end) { + elems.push_back(*iter); + ++iter; + } + return elems; +}; + +std::string GPT_Tokenizer::strip(std::string const &inpt) { + if (inpt.length() == 0) { + return inpt; + } + auto start_it = inpt.begin(); + auto end_it = inpt.rbegin(); + while (std::isspace(*start_it)) { + ++start_it; + } + if (start_it == inpt.end()) { + return ""; + } + while (std::isspace(*end_it)) { + ++end_it; + } + return std::string(start_it, end_it.base()); +} + +std::unordered_set + GPT_Tokenizer::get_pairs(std::vector word) { + std::unordered_set pairs; + std::wstring prev_char = word[0]; + for (size_t i = 1; i < word.size(); ++i) { + 
pairs.insert(wbigram_pair({prev_char, word[i]})); + prev_char = word[i]; + } + return pairs; +}; + +void GPT_Tokenizer::load_vocab(std::string const &vocab_file) { + std::ifstream file_handle(vocab_file); + assert(file_handle.good() && "file not exists"); + bool discard_first_line = false; + if (discard_first_line) { + std::string first_line_discard; + std::getline(file_handle, first_line_discard); // skip the first line + } + json vocab_data_ = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + auto vocab_ = vocab_data_.get>(); + for (auto item : vocab_) { + vocab.insert({item.first, item.second}); + } +}; + +void GPT_Tokenizer::load_merge(std::string const &merge_file) { + bpe_ranks.reserve(60000); + std::ifstream file_handle(merge_file); + assert(file_handle.good() && "file not exists"); + std::string line; + uint32_t curr_idx = 0; + std::string version_substring = "#version:"; + while (getline(file_handle, line)) { + if (line.size() == 0 || line.rfind(version_substring, 0) == 0) { + continue; + } + std::vector bigrams = split(line); + assert(bigrams.size() == 2 && "unk format"); + wbigram_pair curr(utf8_to_wstring(bigrams[0]), utf8_to_wstring(bigrams[1])); + bpe_ranks.insert({curr, curr_idx}); + curr_idx++; + } +}; + +std::vector GPT_Tokenizer::bpe(std::wstring token) { + // bpe use wstring + if (cache.find(token) != cache.end()) { + return cache[token]; + } + std::vector wword; + for (auto c : token) { + wword.push_back(std::wstring(1, c)); + } + std::unordered_set pairs = get_pairs(wword); + if (pairs.empty()) { + return {wstring_to_utf8(token)}; + } + + while (true) { + auto bigram = pairs.begin(); + if (pairs.size() > 1) { + bigram = std::min_element( + pairs.begin(), + pairs.end(), + [this](wbigram_pair const &a, wbigram_pair const &b) -> bool { + if (bpe_ranks.find(a) == bpe_ranks.end()) { + return false; + } + if (bpe_ranks.find(b) == bpe_ranks.end()) { + return true; + } + return bpe_ranks[a] < bpe_ranks[b]; + }); + } + if (bpe_ranks.find(*bigram) == bpe_ranks.end()) { + break; + } + std::wstring first = bigram->first; + std::wstring second = bigram->second; + decltype(wword) new_wword; + + auto i = wword.begin(); + while (i < wword.end()) { + auto j = std::find(i, wword.end(), first); + if (j == wword.end()) { + new_wword.insert(new_wword.end(), i, wword.end()); + break; + } + new_wword.insert(new_wword.end(), i, j); + i = j; + // i <= wword.end + if (*i == first && i < wword.end() - 1 && *(i + 1) == second) { + new_wword.push_back(first + second); + i += 2; + } else { + new_wword.push_back(*i); + i += 1; + } + } + wword = new_wword; + if (wword.size() == 1) { + break; + } else { + pairs = get_pairs(wword); + } + } + std::vector word; + for (auto w : wword) { + word.push_back(wstring_to_utf8(w)); + } + if (token.size() < cache_word_max_length && cache.size() < cache_max_size) { + cache.insert({token, word}); + } + return word; +}; + +std::vector GPT_Tokenizer::tokenize(std::string str) { + std::vector bpe_tokens; + std::wstring wstr = utf8_to_wstring(str); + std::wsregex_iterator iter(wstr.begin(), wstr.end(), pat); + std::wsregex_iterator end; + while (iter != end) { + std::wstring token; + for (char c : wstring_to_utf8(iter->str())) { + if (0 > c) { + token.push_back(*(bytes_encoder + c + 256)); + } else { + token.push_back(*(bytes_encoder + c)); + } + } + if (token.length() > 0) { + decltype(bpe_tokens) curr_bpe_tokens = bpe(token); + bpe_tokens.insert( + bpe_tokens.end(), curr_bpe_tokens.begin(), 
curr_bpe_tokens.end()); + } + ++iter; + } + return bpe_tokens; +} + +int64_t GPT_Tokenizer::convert_token_to_id(std::string token) { + auto p = vocab.find(token); + if (p != vocab.end()) { + return vocab[token]; + } else { + return vocab[unk_token]; + } +} + +void GPT_Tokenizer::encode(std::string str, + size_t max_length, + std::vector *input_ids, + std::vector *mask_ids) { + if (not input_ids->empty()) { + input_ids->clear(); + } + if (not mask_ids->empty()) { + mask_ids->clear(); + } + input_ids->reserve(max_length); + mask_ids->reserve(max_length); + // input_ids->push_back(vocab[bos_token]); + // mask_ids->push_back(1); + auto tokens = tokenize(str); + for (auto t : tokens) { + if (input_ids->size() == max_length - 1) { + break; + } + input_ids->push_back(convert_token_to_id(t)); + mask_ids->push_back(1); + } + // input_ids->push_back(vocab[eos_token]); + // mask_ids->push_back(1); + while (input_ids->size() < max_length) { + input_ids->push_back(vocab[pad_token]); + mask_ids->push_back(0); + } +} diff --git a/examples/cpp/inference/gpt_tokenizer.cpp b/examples/cpp/inference/gpt_tokenizer.cpp new file mode 100644 index 0000000000..cd413e468f --- /dev/null +++ b/examples/cpp/inference/gpt_tokenizer.cpp @@ -0,0 +1,53 @@ +#include "gpt_tokenizer.h" + +#include + +int main(int argc, char *argv[]) { + std::string vocab_file = "./gpt2_bpe/vocab.bpe"; + std::string merge_file = "./gpt2_bpe/encoder.json"; + + GPT_Tokenizer tokenizer(merge_file, vocab_file); + + std::string line; + std::vector lines; + std::ifstream infile("./wikitext-103-raw/wiki.valid.raw"); + if (!infile) { + std::cout << "Error opening input file" << std::endl; + return -1; + } + std::ofstream outfile("./wikitext-103-raw/wiki.valid.bpe.flexflow", + std::ofstream::out); + if (!outfile) { + std::cout << "Error opening output file" << std::endl; + return -1; + } + while (std::getline(infile, line)) { + lines.push_back(line); + } + + std::vector input_ids; + std::vector mask_ids; + for (auto l = lines.begin(); l != lines.end(); ++l) { + std::string stripped_line = tokenizer.strip(*l); + if (stripped_line.length() == 0) { + outfile << *l << std::endl; + } else { + tokenizer.encode( + stripped_line, stripped_line.length(), &input_ids, &mask_ids); + bool first = true; + for (std::size_t i = 0; i < input_ids.size(); ++i) { + if (mask_ids[i]) { + if (!first) { + outfile << " "; + } else { + first = false; + } + outfile << input_ids[i]; + } + } + outfile << std::endl; + input_ids.clear(); + mask_ids.clear(); + } + } +} diff --git a/examples/cpp/inference/gpt_tokenizer.h b/examples/cpp/inference/gpt_tokenizer.h new file mode 100644 index 0000000000..9af722df38 --- /dev/null +++ b/examples/cpp/inference/gpt_tokenizer.h @@ -0,0 +1,210 @@ +// version 0.1 +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2019-2020 zili wang . 
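A note on how the tokenizer pieces added by this commit fit together, before the body of gpt_tokenizer.h below: load_vocab() parses the JSON token-to-id map (GPT-2's encoder.json), while load_merge() reads the plain-text BPE merge ranks (vocab.bpe). In the driver above, the local variable named vocab_file actually holds the merges path and merge_file holds the JSON path, but the constructor call passes them in the order the class expects, so the right file reaches the right loader. A usage sketch of the encode() API (illustrative, not from the patch; it assumes the GPT-2 BPE files fetched by gpt_tokenizer_test.sh and 64-bit id vectors, matching convert_token_to_id()'s int64_t return type):

    #include <cstdio>
    #include <cstdint>
    #include <vector>
    #include "gpt_tokenizer.h"

    int main() {
      // First argument is the JSON vocabulary, second is the merge-rank file.
      GPT_Tokenizer tokenizer("./gpt2_bpe/encoder.json", "./gpt2_bpe/vocab.bpe");

      std::vector<int64_t> input_ids;
      std::vector<int64_t> mask_ids;
      size_t const max_length = 16;
      tokenizer.encode("Hello world", max_length, &input_ids, &mask_ids);

      // input_ids now holds max_length entries: real token ids first, then the
      // pad-token id; mask_ids holds 1 for real tokens and 0 for padding.
      for (size_t i = 0; i < input_ids.size(); i++) {
        std::printf("%lld (mask %lld)\n",
                    (long long)input_ids[i],
                    (long long)mask_ids[i]);
      }
      return 0;
    }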
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using json = nlohmann::json; + +typedef std::pair bigram_pair; +typedef std::pair wbigram_pair; + +struct hash_pair { + template + size_t operator()(std::pair const &p) const { + auto hash1 = std::hash{}(p.first); + auto hash2 = std::hash{}(p.second); + return hash1 ^ hash2; + } +}; + +class GPT_Tokenizer { + +public: + GPT_Tokenizer(std::string const &vocab_file, + std::string const &merge_file, + std::string const &bos_token_str = "", + const std::string eos_token_str = "", + const std::string pad_token_str = "", + const std::string unk_token_str = "", + const std::string mask_token_str = "") { + load_vocab(vocab_file); + load_merge(merge_file); + bos_token = bos_token_str; + eos_token = eos_token_str; + pad_token = pad_token_str; + unk_token = unk_token_str; + mask_token = mask_token_str; + bytes_encoder = bytes_to_unicode(); + }; + // ~GPT_Tokenizer(); + std::vector bpe(std::wstring token); + std::vector tokenize(std::string str); + int64_t convert_token_to_id(std::string token); + void encode(std::string str, + size_t max_length, + std::vector *input_ids, + std::vector *mask_ids); + std::string bos_token; + std::string eos_token; + std::string pad_token; + std::string unk_token; + std::string mask_token; + std::string strip(std::string const &inpt); + +private: + std::unordered_map vocab; + std::unordered_map bpe_ranks; + wchar_t *bytes_to_unicode(); + wchar_t *bytes_encoder; + uint32_t cache_max_size = 500000; + uint32_t cache_word_max_length = 30; + std::string unicode_letter_expr = + "\\u0041-\\u005A\\u0061-\\u007A\\u00AA-\\u00AA\\u00B5-\\u00B5" + "\\u00BA-\\u00BA\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02C1" + "\\u02C6-\\u02D1\\u02E0-\\u02E4\\u02EC-\\u02EC\\u02EE-\\u02EE" + "\\u0370-\\u0374\\u0376-\\u0377\\u037A-\\u037D\\u037F-\\u037F" + "\\u0386-\\u0386\\u0388-\\u038A\\u038C-\\u038C\\u038E-\\u03A1" + "\\u03A3-\\u03F5\\u03F7-\\u0481\\u048A-\\u052F\\u0531-\\u0556" + "\\u0559-\\u0559\\u0560-\\u0588\\u05D0-\\u05EA\\u05EF-\\u05F2" + "\\u0620-\\u064A\\u066E-\\u066F\\u0671-\\u06D3\\u06D5-\\u06D5" + "\\u06E5-\\u06E6\\u06EE-\\u06EF\\u06FA-\\u06FC\\u06FF-\\u06FF" + "\\u0710-\\u0710\\u0712-\\u072F\\u074D-\\u07A5\\u07B1-\\u07B1" + "\\u07CA-\\u07EA\\u07F4-\\u07F5\\u07FA-\\u07FA\\u0800-\\u0815" + "\\u081A-\\u081A\\u0824-\\u0824\\u0828-\\u0828\\u0840-\\u0858" + "\\u0860-\\u086A\\u08A0-\\u08B4\\u08B6-\\u08C7\\u0904-\\u0939" + "\\u093D-\\u093D\\u0950-\\u0950\\u0958-\\u0961\\u0971-\\u0980" + "\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0" + "\\u09B2-\\u09B2\\u09B6-\\u09B9\\u09BD-\\u09BD\\u09CE-\\u09CE" + "\\u09DC-\\u09DD\\u09DF-\\u09E1\\u09F0-\\u09F1\\u09FC-\\u09FC" + "\\u0A05-\\u0A0A\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30" + "\\u0A32-\\u0A33\\u0A35-\\u0A36\\u0A38-\\u0A39\\u0A59-\\u0A5C" + "\\u0A5E-\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8D\\u0A8F-\\u0A91" + "\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9" + "\\u0ABD-\\u0ABD\\u0AD0-\\u0AD0\\u0AE0-\\u0AE1\\u0AF9-\\u0AF9" + "\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30" + "\\u0B32-\\u0B33\\u0B35-\\u0B39\\u0B3D-\\u0B3D\\u0B5C-\\u0B5D" + "\\u0B5F-\\u0B61\\u0B71-\\u0B71\\u0B83-\\u0B83\\u0B85-\\u0B8A" + "\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99-\\u0B9A\\u0B9C-\\u0B9C" + "\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9" + "\\u0BD0-\\u0BD0\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28" + "\\u0C2A-\\u0C39\\u0C3D-\\u0C3D\\u0C58-\\u0C5A\\u0C60-\\u0C61" + 
"\\u0C80-\\u0C80\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8" + "\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBD-\\u0CBD\\u0CDE-\\u0CDE" + "\\u0CE0-\\u0CE1\\u0CF1-\\u0CF2\\u0D04-\\u0D0C\\u0D0E-\\u0D10" + "\\u0D12-\\u0D3A\\u0D3D-\\u0D3D\\u0D4E-\\u0D4E\\u0D54-\\u0D56" + "\\u0D5F-\\u0D61\\u0D7A-\\u0D7F\\u0D85-\\u0D96\\u0D9A-\\u0DB1" + "\\u0DB3-\\u0DBB\\u0DBD-\\u0DBD\\u0DC0-\\u0DC6\\u0E01-\\u0E30" + "\\u0E32-\\u0E33\\u0E40-\\u0E46\\u0E81-\\u0E82\\u0E84-\\u0E84" + "\\u0E86-\\u0E8A\\u0E8C-\\u0EA3\\u0EA5-\\u0EA5\\u0EA7-\\u0EB0" + "\\u0EB2-\\u0EB3\\u0EBD-\\u0EBD\\u0EC0-\\u0EC4\\u0EC6-\\u0EC6" + "\\u0EDC-\\u0EDF\\u0F00-\\u0F00\\u0F40-\\u0F47\\u0F49-\\u0F6C" + "\\u0F88-\\u0F8C\\u1000-\\u102A\\u103F-\\u103F\\u1050-\\u1055" + "\\u105A-\\u105D\\u1061-\\u1061\\u1065-\\u1066\\u106E-\\u1070" + "\\u1075-\\u1081\\u108E-\\u108E\\u10A0-\\u10C5\\u10C7-\\u10C7" + "\\u10CD-\\u10CD\\u10D0-\\u10FA\\u10FC-\\u1248\\u124A-\\u124D" + "\\u1250-\\u1256\\u1258-\\u1258\\u125A-\\u125D\\u1260-\\u1288" + "\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE" + "\\u12C0-\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310" + "\\u1312-\\u1315\\u1318-\\u135A\\u1380-\\u138F\\u13A0-\\u13F5" + "\\u13F8-\\u13FD\\u1401-\\u166C\\u166F-\\u167F\\u1681-\\u169A" + "\\u16A0-\\u16EA\\u16F1-\\u16F8\\u1700-\\u170C\\u170E-\\u1711" + "\\u1720-\\u1731\\u1740-\\u1751\\u1760-\\u176C\\u176E-\\u1770" + "\\u1780-\\u17B3\\u17D7-\\u17D7\\u17DC-\\u17DC\\u1820-\\u1878" + "\\u1880-\\u1884\\u1887-\\u18A8\\u18AA-\\u18AA\\u18B0-\\u18F5" + "\\u1900-\\u191E\\u1950-\\u196D\\u1970-\\u1974\\u1980-\\u19AB" + "\\u19B0-\\u19C9\\u1A00-\\u1A16\\u1A20-\\u1A54\\u1AA7-\\u1AA7" + "\\u1B05-\\u1B33\\u1B45-\\u1B4B\\u1B83-\\u1BA0\\u1BAE-\\u1BAF" + "\\u1BBA-\\u1BE5\\u1C00-\\u1C23\\u1C4D-\\u1C4F\\u1C5A-\\u1C7D" + "\\u1C80-\\u1C88\\u1C90-\\u1CBA\\u1CBD-\\u1CBF\\u1CE9-\\u1CEC" + "\\u1CEE-\\u1CF3\\u1CF5-\\u1CF6\\u1CFA-\\u1CFA\\u1D00-\\u1DBF" + "\\u1E00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D" + "\\u1F50-\\u1F57\\u1F59-\\u1F59\\u1F5B-\\u1F5B\\u1F5D-\\u1F5D" + "\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE-\\u1FBE" + "\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB" + "\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2071-\\u2071" + "\\u207F-\\u207F\\u2090-\\u209C\\u2102-\\u2102\\u2107-\\u2107" + "\\u210A-\\u2113\\u2115-\\u2115\\u2119-\\u211D\\u2124-\\u2124" + "\\u2126-\\u2126\\u2128-\\u2128\\u212A-\\u212D\\u212F-\\u2139" + "\\u213C-\\u213F\\u2145-\\u2149\\u214E-\\u214E\\u2183-\\u2184" + "\\u2C00-\\u2C2E\\u2C30-\\u2C5E\\u2C60-\\u2CE4\\u2CEB-\\u2CEE" + "\\u2CF2-\\u2CF3\\u2D00-\\u2D25\\u2D27-\\u2D27\\u2D2D-\\u2D2D" + "\\u2D30-\\u2D67\\u2D6F-\\u2D6F\\u2D80-\\u2D96\\u2DA0-\\u2DA6" + "\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6" + "\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u2E2F-\\u2E2F" + "\\u3005-\\u3006\\u3031-\\u3035\\u303B-\\u303C\\u3041-\\u3096" + "\\u309D-\\u309F\\u30A1-\\u30FA\\u30FC-\\u30FF\\u3105-\\u312F" + "\\u3131-\\u318E\\u31A0-\\u31BF\\u31F0-\\u31FF\\u3400-\\u4DBF" + "\\u4E00-\\u9FFC\\uA000-\\uA48C\\uA4D0-\\uA4FD\\uA500-\\uA60C" + "\\uA610-\\uA61F\\uA62A-\\uA62B\\uA640-\\uA66E\\uA67F-\\uA69D" + "\\uA6A0-\\uA6E5\\uA717-\\uA71F\\uA722-\\uA788\\uA78B-\\uA7BF" + "\\uA7C2-\\uA7CA\\uA7F5-\\uA801\\uA803-\\uA805\\uA807-\\uA80A" + "\\uA80C-\\uA822\\uA840-\\uA873\\uA882-\\uA8B3\\uA8F2-\\uA8F7" + "\\uA8FB-\\uA8FB\\uA8FD-\\uA8FE\\uA90A-\\uA925\\uA930-\\uA946" + "\\uA960-\\uA97C\\uA984-\\uA9B2\\uA9CF-\\uA9CF\\uA9E0-\\uA9E4" + "\\uA9E6-\\uA9EF\\uA9FA-\\uA9FE\\uAA00-\\uAA28\\uAA40-\\uAA42" + 
"\\uAA44-\\uAA4B\\uAA60-\\uAA76\\uAA7A-\\uAA7A\\uAA7E-\\uAAAF" + "\\uAAB1-\\uAAB1\\uAAB5-\\uAAB6\\uAAB9-\\uAABD\\uAAC0-\\uAAC0" + "\\uAAC2-\\uAAC2\\uAADB-\\uAADD\\uAAE0-\\uAAEA\\uAAF2-\\uAAF4" + "\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26" + "\\uAB28-\\uAB2E\\uAB30-\\uAB5A\\uAB5C-\\uAB69\\uAB70-\\uABE2" + "\\uAC00-\\uD7A3\\uD7B0-\\uD7C6\\uD7CB-\\uD7FB\\uF900-\\uFA6D" + "\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB1D" + "\\uFB1F-\\uFB28\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E-\\uFB3E" + "\\uFB40-\\uFB41\\uFB43-\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D" + "\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFB\\uFE70-\\uFE74" + "\\uFE76-\\uFEFC\\uFF21-\\uFF3A\\uFF41-\\uFF5A\\uFF66-\\uFFBE" + "\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC"; + + std::string unicode_number_expr = + "\\u0030-\\u0039\\u00B2-\\u00B3\\u00B9-\\u00B9\\u00BC-\\u00BE" + "\\u0660-\\u0669\\u06F0-\\u06F9\\u07C0-\\u07C9\\u0966-\\u096F" + "\\u09E6-\\u09EF\\u09F4-\\u09F9\\u0A66-\\u0A6F\\u0AE6-\\u0AEF" + "\\u0B66-\\u0B6F\\u0B72-\\u0B77\\u0BE6-\\u0BF2\\u0C66-\\u0C6F" + "\\u0C78-\\u0C7E\\u0CE6-\\u0CEF\\u0D58-\\u0D5E\\u0D66-\\u0D78" + "\\u0DE6-\\u0DEF\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F33" + "\\u1040-\\u1049\\u1090-\\u1099\\u1369-\\u137C\\u16EE-\\u16F0" + "\\u17E0-\\u17E9\\u17F0-\\u17F9\\u1810-\\u1819\\u1946-\\u194F" + "\\u19D0-\\u19DA\\u1A80-\\u1A89\\u1A90-\\u1A99\\u1B50-\\u1B59" + "\\u1BB0-\\u1BB9\\u1C40-\\u1C49\\u1C50-\\u1C59\\u2070-\\u2070" + "\\u2074-\\u2079\\u2080-\\u2089\\u2150-\\u2182\\u2185-\\u2189" + "\\u2460-\\u249B\\u24EA-\\u24FF\\u2776-\\u2793\\u2CFD-\\u2CFD" + "\\u3007-\\u3007\\u3021-\\u3029\\u3038-\\u303A\\u3192-\\u3195" + "\\u3220-\\u3229\\u3248-\\u324F\\u3251-\\u325F\\u3280-\\u3289" + "\\u32B1-\\u32BF\\uA620-\\uA629\\uA6E6-\\uA6EF\\uA830-\\uA835" + "\\uA8D0-\\uA8D9\\uA900-\\uA909\\uA9D0-\\uA9D9\\uA9F0-\\uA9F9" + "\\uAA50-\\uAA59\\uABF0-\\uABF9\\uFF10-\\uFF19"; + + std::wstring wpat_expr = utf8_to_wstring( + "'s|'t|'re|'ve|'m|'ll|'d| ?[" + unicode_letter_expr + "]+| ?[" + + unicode_number_expr + "]+| ?[^\\s" + unicode_letter_expr + + unicode_number_expr + "]+|\\s+(?!\\S)|\\s+"); + + const std::wregex pat = std::wregex(wpat_expr); + std::unordered_map> cache; + void load_vocab(std::string const &vocab_file); + void load_merge(std::string const &merge_file); + + std::unordered_set + get_pairs(std::vector word); + std::wstring utf8_to_wstring(std::string const &src); + std::u32string utf8_to_utf32(std::string const &src); + std::string wstring_to_utf8(std::wstring const &src); + std::string utf32_to_utf8(std::u32string const &src); + + std::vector split(std::string const &s, + std::regex rgx = std::regex("\\s+")); +}; diff --git a/examples/cpp/inference/gpt_tokenizer_test.sh b/examples/cpp/inference/gpt_tokenizer_test.sh new file mode 100755 index 0000000000..8f7660bfe4 --- /dev/null +++ b/examples/cpp/inference/gpt_tokenizer_test.sh @@ -0,0 +1,63 @@ +#! 
/usr/bin/env bash +set -x +set -e + +cleanup() { + rm -rf wikitext-103-raw-v1.zip wikitext-103-raw gpt2_bpe gpt_tokenizer pytokenizer.py bpe.py +} + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Clean up before test (just in case) +cleanup + +# Compile the FlexFlow C++ tokenizer stand-alone +g++ -std=c++11 -I../../../deps/json/include -o gpt_tokenizer gpt_tokenizer.cpp gpt_tokenizer.cc +chmod +x gpt_tokenizer + +# Download and inflate wikitext dataset +wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip +unzip wikitext-103-raw-v1.zip +rm wikitext-103-raw-v1.zip + +# Download GPT-2 BPE vocab and merges files +mkdir -p gpt2_bpe +wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json +wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe + +# Download minGPT bpe tokenizer for comparison +wget -O bpe.py https://raw.githubusercontent.com/karpathy/minGPT/master/mingpt/bpe.py +chmod +x bpe.py + +# Run the FlexFlow C++ tokenizer +./gpt_tokenizer + +# Run the minGPT tokenizer +cat << EOF > pytokenizer.py +#!/usr/bin/env python +from bpe import BPETokenizer + +tokenizer = BPETokenizer() +inp="./wikitext-103-raw/wiki.valid.raw" +outp="./wikitext-103-raw/wiki.valid.bpe.minGPT" +with open(inp, "r") as infile: + with open(outp, "w+") as outfile: + for l in infile.readlines(): + if len(l.strip()) == 0: + outfile.write(l) + else: + out = tokenizer(l.strip()).tolist()[0] + out = [str(x) for x in out] + out = " ".join(out) + outfile.write(out) + outfile.write("\n") +EOF +chmod +x pytokenizer.py +./pytokenizer.py + +# Check that the outputs match +diff ./wikitext-103-raw/wiki.valid.bpe.flexflow ./wikitext-103-raw/wiki.valid.bpe.minGPT + +# Clean up after test +cleanup From 01417084227f21c99b88b86f78ea733848ecaa0f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Mar 2023 18:20:29 -0500 Subject: [PATCH 071/344] [MoE][Experts] - New `Experts` CUDA kernel implementation (#612) * backup of initial kernel code * Fix some thrust related compiling error. * bug fixing, added more stuff to ExpertsMeta * added cuda kernel for token replication * added notes * debug * updates * Fixed bugs. * bug fix * linting * Add CublasGemmBatched skeleton. * add comments * Update experts kernel to Cublas GemmBatchedEx solution. * Add out_dim to Experts. * Update experts.cu with aggreation results. * parallelized preparation of gemm arrays with exp capacity * fix typo, remove unused functions * added weights pointer copy * Pushed debugging codes. * fixed several bugs * bug fix * more fixes * fixed batch gemm * finished basic version of aggregation * Apply bias to GemmBatchedEx. * Apply activation to experts kernel. 
* Reformat code in experts.cu * moved thrust functions on legion stream * fixed bug in activation computation * hip rocm build fix * linting * fix * computing all activations at once --------- Co-authored-by: Zeyu Wang Co-authored-by: Zeyu Wang --- .../mixture_of_experts/dataloader.cc | 91 --- include/flexflow/ops/experts.h | 44 ++ include/flexflow/utils/cuda_helper.h | 4 +- src/ops/experts.cc | 46 +- src/ops/experts.cpp | 34 +- src/ops/experts.cu | 613 +++++++++++++++++- src/runtime/cuda_helper.cu | 28 + 7 files changed, 706 insertions(+), 154 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/dataloader.cc b/examples/cpp/inference/mixture_of_experts/dataloader.cc index 557fe4e095..af32cfe98b 100644 --- a/examples/cpp/inference/mixture_of_experts/dataloader.cc +++ b/examples/cpp/inference/mixture_of_experts/dataloader.cc @@ -114,97 +114,6 @@ DataLoader::DataLoader(FFModel &ff, // Load data // ================================================= -void read_cifar100(float *input_ptr, int *label_ptr) { - std::ifstream file; - file.open("train.bin", std::ios::in | std::ios::binary | std::ios::ate); - if (!file) { - std::cout << "Error opening CIFAR100 train data file" << std::endl; - assert(false); - } - - file.seekg(0, std::ios::beg); - - // each sample: <1 x coarse label><1 x fine label><3072 x pixel> - for (std::size_t i = 0; i < MAX_NUM_SAMPLES; i++) { - unsigned char temp = 0; - file.read((char *)&temp, sizeof(temp)); // coarse label, skip - file.read((char *)&temp, sizeof(temp)); - label_ptr[i] = temp; - for (std::size_t j = 0; j < 3072; ++j) { - file.read((char *)&temp, sizeof(temp)); - input_ptr[i * 3072 + j] = (float)temp / 255.0f; - } - } - - file.close(); -} - -int reverseInt(int i) { - unsigned char c1, c2, c3, c4; - - c1 = i & 255; - c2 = (i >> 8) & 255; - c3 = (i >> 16) & 255; - c4 = (i >> 24) & 255; - - return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; -} - -/* NOTE: Download files from http://yann.lecun.com/exdb/mnist/ and unpack to -the current working directory */ -void read_mnist(float *input_ptr, int *label_ptr) { - // read inputs - std::ifstream input("train-images-idx3-ubyte", std::ios::binary); - if (input.is_open()) { - int magic_number = 0; - int number_of_images = 0; - int n_rows = 0; - int n_cols = 0; - input.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - input.read((char *)&number_of_images, sizeof(number_of_images)); - number_of_images = reverseInt(number_of_images); - input.read((char *)&n_rows, sizeof(n_rows)); - n_rows = reverseInt(n_rows); - input.read((char *)&n_cols, sizeof(n_cols)); - n_cols = reverseInt(n_cols); - - for (int i = 0; i < number_of_images; i++) { - for (int r = 0; r < n_rows; r++) { - for (int c = 0; c < n_cols; c++) { - unsigned char temp = 0; - input.read((char *)&temp, sizeof(temp)); - input_ptr[i * n_rows * n_cols + r * n_cols + c] = - (float)temp / 255.0f; - } - } - } - } else { - std::cout << "Error opening MNIST input data file" << std::endl; - assert(false); - } - - // read labels - std::ifstream labels("train-labels-idx1-ubyte", std::ios::binary); - if (labels.is_open()) { - int magic_number = 0; - int number_of_images = 0; - labels.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - labels.read((char *)&number_of_images, sizeof(number_of_images)); - number_of_images = reverseInt(number_of_images); - - for (int i = 0; i < number_of_images; i++) { - unsigned char temp = 0; - labels.read((char *)&temp, sizeof(temp)); 
- label_ptr[i] = temp; - } - } else { - std::cout << "Error opening MNIST label data file" << std::endl; - assert(false); - } -} - void DataLoader::load_entire_dataset(Task const *task, std::vector const ®ions, Context ctx, diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 74c24c5a19..58640ec085 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -11,16 +11,56 @@ class ExpertsMeta : public OpMeta { ExpertsMeta(FFHandler handler, int _num_experts, int _experts_start_idx, + int _data_dim, + int _out_dim, + int _effective_batch_size, + int _num_chosen_experts, float _alpha, bool _use_bias, ActiMode _activation); ~ExpertsMeta(void); + + // Thrust helper arrays + int *sorted_indices; + int *original_indices; + int *non_zero_expert_labels; + int *temp_sequence; + int *exp_local_label_to_index; + int *expert_start_indexes; + int *num_assignments_per_expert; // numbers of tokes assigned to each expert. + // Values may exceed the expert capacity + int *capped_num_assignments_per_expert; + int *destination_start_indices; + float const **token_idx_array; float const **dev_weights; + float const **weight_idx_array; + float const **coefficient_idx_array; + float **output_idx_array; + float const **bias_idx_array; + float const *one_ptr; + float const **one_ptr_array; + + // array of arrays to store cublasGemmBatchedEx outputs before aggregation + float **batch_outputs; + float **dev_batch_outputs; + int num_experts; int experts_start_idx; + int data_dim; + int out_dim; + int effective_batch_size; + int num_chosen_experts; + int expert_capacity; float alpha; bool use_bias; ActiMode activation; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cudnnTensorDescriptor_t resultTensorDesc; + cudnnActivationDescriptor_t actiDesc; +#else + miopenTensorDescriptor_t resultTensorDesc; + miopenActivationDescriptor_t actiDesc; +#endif }; // definitions for the CUDA kernel @@ -107,6 +147,10 @@ class Experts : public Op { int num_experts; int experts_start_idx; int experts_output_dim_size; + int data_dim; + int out_dim; + int effective_batch_size; + int num_chosen_experts; float alpha; int experts_num_layers; int experts_internal_dim_size; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 46e323b186..78e21ccd9f 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -137,6 +137,8 @@ cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); cudaDataType_t ff_to_cuda_datatype(DataType type); - cudnnDataType_t ff_to_cudnn_datatype(DataType type); + +cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); +cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif \ No newline at end of file diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 37954f67ad..b57874ac35 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -68,7 +68,7 @@ Tensor FFModel::experts(Tensor const *inputs, // parameters for the FFN implementing the experts. We can make these // FFModel::experts(...) function parameters if needed. 
- bool use_bias = false; + bool use_bias = true; ActiMode activation = AC_MODE_RELU; Layer *e = new Layer(this, @@ -291,34 +291,44 @@ Experts::Experts(FFModel &model, // overwrite layer_guid layer_guid = _layer_guid; + // Check number of inputs, output, weights assert(num_experts > 0); assert(numInputs == 3); assert(numOutputs == 1); assert(numWeights == num_experts * (1 + use_bias)); - assert(inputs[0] != nullptr); + // Check input dimensions int num_dims = inputs[0]->num_dims; + int topk = inputs[1]->dims[0].size; + assert(inputs[0] != nullptr); assert(inputs[1]->num_dims == num_dims); assert(inputs[2]->num_dims == num_dims); - - int topk = inputs[1]->dims[0].size; assert(inputs[2]->dims[0].size == topk); - for (int i = 1; i < num_dims; i++) { assert(inputs[0]->dims[i] == inputs[1]->dims[i]); assert(inputs[1]->dims[i] == inputs[2]->dims[i]); } - - assert(inputs[1]->data_type == DT_INT32 || inputs[1]->data_type == DT_INT64); - assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); - assert(experts_num_layers == 1 || experts_internal_dim_size > 0); - // Assume that we don't parallelize the channel dim of input // nor the expert_assigned dim of indices assert(inputs[0]->dims[0].degree == 1); assert(inputs[1]->dims[0].degree == 1); assert(inputs[2]->dims[0].degree == 1); + // check data type of indices input + assert(inputs[1]->data_type == DT_INT32 || inputs[1]->data_type == DT_INT64); + assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); + assert(experts_num_layers == 1 || experts_internal_dim_size > 0); + // save the token embedding dimension (data_dim) and the effective batch size + data_dim = inputs[0]->dims[0].size; + effective_batch_size = 1; + for (int i = 1; i <= num_dims - 2; i++) { + effective_batch_size *= inputs[0]->dims[i].size; + } + num_chosen_experts = topk; + + out_dim = _experts_output_dim_size; + + // Create the parallel tensor for the output ParallelDim out_dims[MAX_TENSOR_DIM]; for (int i = 0; i < num_dims; i++) { out_dims[i] = inputs[0]->dims[i]; @@ -572,6 +582,10 @@ OpMeta *Experts::init_task(Task const *task, ExpertsMeta *m = new ExpertsMeta(handle, exp->num_experts, exp->experts_start_idx, + exp->data_dim, + exp->out_dim, + exp->effective_batch_size, + exp->num_chosen_experts, exp->alpha, exp->use_bias, exp->activation); @@ -762,10 +776,13 @@ void Experts::inference_task(Task const *task, coord_t data_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; coord_t batch_size = input_domain.hi()[samples_index] - input_domain.lo()[samples_index] + 1; - coord_t chosen_experts = indices_domain.hi()[0] - indices_domain.lo()[0]; + coord_t chosen_experts = indices_domain.hi()[0] - indices_domain.lo()[0] + 1; coord_t out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + assert(data_dim == m->data_dim); + assert(out_dim == m->out_dim); + assert(chosen_experts == m->num_chosen_experts); assert(chosen_experts == - topk_gate_pred_domain.hi()[0] - topk_gate_pred_domain.lo()[0]); + topk_gate_pred_domain.hi()[0] - topk_gate_pred_domain.lo()[0] + 1); for (int i = 1; i < input_dims; i++) { int a = input_domain.hi()[i] - input_domain.lo()[i] + 1; @@ -776,6 +793,7 @@ void Experts::inference_task(Task const *task, batch_size *= a; } } + assert(batch_size == m->effective_batch_size); assert(batch_size <= MAX_BATCH_SIZE && "batch size exceeds MAX_BATCH_SIZE defined in experts.h"); @@ -817,11 +835,11 @@ void Experts::inference_task(Task const *task, task->regions[4 + i * (1 + use_bias) + use_bias] .region.get_index_space()); int 
bias_dims = bias_domain.get_dim(); - assert(bias_dims == 1); + assert(bias_dims == 4); assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == out_dim); } } - return; + Experts::forward_kernel_wrapper(m, input_ptr, indices_ptr, diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index 081f814400..3e8bebb4f0 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -29,39 +29,25 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int chosen_experts, int batch_size, int out_dim) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - int expert_capacity = - ceil(m->alpha * chosen_experts / m->num_experts * batch_size); - - int num_experts = m->num_experts; - // int expert_start_index = experts_start_idx; - bool use_bias = m->use_bias; - // ActiMode activation = m->activation; - - hipMemcpy(m->dev_weights, - weights, - num_experts * (1 + use_bias) * sizeof(float *), - hipMemcpyHostToDevice); - // TODO: write the HIP version of the kernel after finishing the CUDA kernel + handle_unimplemented_hip_kernel(OP_EXPERTS); } ExpertsMeta::ExpertsMeta(FFHandler handler, int _num_experts, int _experts_start_idx, + int _data_dim, + int _out_dim, + int _effective_batch_size, + int _num_chosen_experts, float _alpha, bool _use_bias, ActiMode _activation) : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), alpha(_alpha), use_bias(_use_bias), - activation(_activation) { - checkCUDA( - hipMalloc(&dev_weights, num_experts * (1 + use_bias) * sizeof(float *))); -} -ExpertsMeta::~ExpertsMeta(void) { - checkCUDA(hipFree(&dev_weights)); -} + experts_start_idx(_experts_start_idx), data_dim(_data_dim), + out_dim(_out_dim), effective_batch_size(_effective_batch_size), + num_chosen_experts(_num_chosen_experts), alpha(_alpha), + use_bias(_use_bias), activation(_activation) {} +ExpertsMeta::~ExpertsMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 342de3ef65..c74c77b399 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -15,9 +15,360 @@ #include "flexflow/ops/experts.h" #include "flexflow/utils/cuda_helper.h" +#include +#include + +// Thrust-related headers +#define THRUST_IGNORE_DEPRECATED_CPP_DIALECT 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include namespace FlexFlow { +struct exceeds_expert_capacity { + int _expert_capacity; + exceeds_expert_capacity(int expert_capacity) + : _expert_capacity(expert_capacity){}; + __host__ __device__ bool operator()(int x) { + return x > _expert_capacity; + } +}; + +void experts_forward_thrust_wrapper(ExpertsMeta const *m, + int const *indices, + int num_indices, + int experts_start_idx, + int num_experts_per_block, + int expert_capacity, + int *lb_index, + int *ub_index, + int *num_valid_assignments, + int *non_zero_experts_count, + int *start_indexes, + int *gemm_batch_count, + ffStream_t stream) { + // sort the indices and coefficients by expert. 
Keep track of the original + // position of each index/coefficient using the original_indices array + thrust::device_ptr thrust_indices = + thrust::device_pointer_cast(indices); + thrust::device_ptr sorted_indices = + thrust::device_pointer_cast(m->sorted_indices); + thrust::copy(thrust::cuda::par.on(stream), + thrust_indices, + thrust_indices + num_indices, + sorted_indices); + + thrust::device_ptr original_indices = + thrust::device_pointer_cast(m->original_indices); + thrust::sequence(thrust::cuda::par.on(stream), + original_indices, + original_indices + num_indices); + + thrust::stable_sort_by_key(thrust::cuda::par.on(stream), + sorted_indices, + sorted_indices + num_indices, + original_indices); + + // get lower and upper bound of indices corresponding to experts in the block + thrust::device_ptr lb = thrust::lower_bound(thrust::cuda::par.on(stream), + sorted_indices, + sorted_indices + num_indices, + experts_start_idx); + thrust::device_ptr ub = + thrust::upper_bound(thrust::cuda::par.on(stream), + sorted_indices, + sorted_indices + num_indices, + experts_start_idx + num_experts_per_block); + + *lb_index = lb - sorted_indices; + *ub_index = ub - sorted_indices; + *num_valid_assignments = (*ub_index) - (*lb_index); + if ((*num_valid_assignments) == 0) { + return; + } + + // create "exp_local_label_to_index", a mapping from local expert label to its + // non-zero expert index + thrust::device_ptr non_zero_expert_labels = + thrust::device_pointer_cast(m->non_zero_expert_labels); + thrust::device_ptr non_zero_expert_labels_end = thrust::unique_copy( + thrust::cuda::par.on(stream), lb, ub, non_zero_expert_labels); + *non_zero_experts_count = non_zero_expert_labels_end - non_zero_expert_labels; + + using namespace thrust::placeholders; + thrust::for_each(thrust::cuda::par.on(stream), + non_zero_expert_labels, + non_zero_expert_labels + (*non_zero_experts_count), + _1 -= + experts_start_idx); // convert global indexes to local ones + + thrust::device_ptr temp_sequence = + thrust::device_pointer_cast(m->temp_sequence); + thrust::sequence(thrust::cuda::par.on(stream), + temp_sequence, + temp_sequence + (*non_zero_experts_count)); + + thrust::device_ptr exp_local_label_to_index = + thrust::device_pointer_cast(m->exp_local_label_to_index); + thrust::scatter(thrust::cuda::par.on(stream), + temp_sequence, + temp_sequence + (*non_zero_experts_count), + non_zero_expert_labels, + exp_local_label_to_index); + + // get local start index (within lower/upper bound) for each expert receiving + // non-zero tokens + thrust::device_ptr expert_start_indexes = + thrust::device_pointer_cast(m->expert_start_indexes); + thrust::sequence(thrust::cuda::par.on(stream), + expert_start_indexes, + expert_start_indexes + (*num_valid_assignments)); + *start_indexes = (thrust::unique_by_key_copy(thrust::cuda::par.on(stream), + lb, + ub, + expert_start_indexes, + temp_sequence, + expert_start_indexes)) + .first - + temp_sequence; + assert((*start_indexes) == (*non_zero_experts_count)); + + // append ub_index + expert_start_indexes[(*start_indexes)] = (*ub_index); + + // get number of token assignment to each expert + thrust::device_ptr num_assignments_per_expert = + thrust::device_pointer_cast(m->num_assignments_per_expert); + thrust::transform(thrust::cuda::par.on(stream), + expert_start_indexes + 1, + expert_start_indexes + (*non_zero_experts_count) + 1, + expert_start_indexes, + num_assignments_per_expert, + thrust::minus()); + + // build destination_start_index array, telling us the first slot that belongs + // to 
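The Thrust calls above implement a standard group-by-expert pattern: sort the flat (token, expert) assignments by expert id while carrying the original positions along as values, then binary-search the sorted keys for the expert range owned by this block. A self-contained sketch of that pattern, with illustrative sizes and the default stream, compiled as a .cu file:

    #include <thrust/device_vector.h>
    #include <thrust/sequence.h>
    #include <thrust/sort.h>
    #include <thrust/binary_search.h>
    #include <cstdio>

    int main() {
      // hypothetical expert assignments for 8 (token, k) slots
      int h_indices[8] = {3, 1, 3, 0, 2, 1, 3, 2};
      thrust::device_vector<int> keys(h_indices, h_indices + 8); // expert ids
      thrust::device_vector<int> vals(8);
      thrust::sequence(vals.begin(), vals.end());                // original positions
      thrust::stable_sort_by_key(keys.begin(), keys.end(), vals.begin());

      int experts_start_idx = 1, num_experts_per_block = 2;      // this block owns experts {1, 2}
      auto lb = thrust::lower_bound(keys.begin(), keys.end(), experts_start_idx);
      auto ub = thrust::upper_bound(keys.begin(), keys.end(),
                                    experts_start_idx + num_experts_per_block - 1);
      int num_in_block = (int)(ub - lb);                         // assignments routed to this block
      printf("assignments for this block: %d\n", num_in_block);
      return 0;
    }

stable_sort_by_key keeps ties in arrival order, which is what lets the original (token, k) position be recovered later through the values array.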
each expert in the destination array (after factoring in expert + // capacity) + thrust::device_ptr destination_start_indices = + thrust::device_pointer_cast(m->destination_start_indices); + thrust::replace_copy_if(thrust::cuda::par.on(stream), + num_assignments_per_expert, + num_assignments_per_expert + + (*non_zero_experts_count), + destination_start_indices, + exceeds_expert_capacity(expert_capacity), + expert_capacity); + + cudaMemcpyAsync(m->capped_num_assignments_per_expert, + m->destination_start_indices, + (*non_zero_experts_count) * sizeof(int), + cudaMemcpyDeviceToHost, + stream); + + *gemm_batch_count = + thrust::reduce(thrust::cuda::par.on(stream), + destination_start_indices, + destination_start_indices + (*non_zero_experts_count)); + + thrust::exclusive_scan(thrust::cuda::par.on(stream), + destination_start_indices, + destination_start_indices + (*non_zero_experts_count), + destination_start_indices, + 0); +} + +__global__ void experts_forward_prepare_kernel( + int num_valid_assignments, + int expert_capacity, + int lb_index, + int experts_start_idx, + int num_experts_per_block, + int num_chosen_experts, + int data_dim, + int out_dim, + bool use_bias, + int *sorted_indices, + int *expert_start_indexes, + int *exp_local_label_to_index, + int *destination_start_indices, + int *original_indices, + float const *input, // @In: Tokens' values (in_dim, batch_size) + float *output, + float const **token_idx_array, // @Out: Barray for GemmBatchedEx + float const **weights, // @In: Experts' weights + float const **weight_idx_array, // @Out: Aarray for GemmBatchedEx + float const **bias_idx_array, // @Out: Experts' bias + float const *coefficients, // @In: topk_gate_predss coefficients tensor + // (num_chosen_experts, batch_size) + float const **coefficient_idx_array, // @Out: Barray for Aggregation + float **output_idx_array) { + + CUDA_KERNEL_LOOP(i, num_valid_assignments) { + int global_expert_label = sorted_indices[lb_index + i]; + assert(global_expert_label >= experts_start_idx && + global_expert_label < experts_start_idx + num_experts_per_block); + int local_expert_label = global_expert_label - experts_start_idx; + int expert_index = exp_local_label_to_index[local_expert_label]; + int within_expert_offset = i - expert_start_indexes[expert_index]; + if (within_expert_offset < expert_capacity) { + int rev_idx = original_indices[i + lb_index]; + int token_idx = (rev_idx / num_chosen_experts); + + token_idx_array[destination_start_indices[expert_index] + + within_expert_offset] = &input[token_idx * data_dim]; + weight_idx_array[destination_start_indices[expert_index] + + within_expert_offset] = + weights[local_expert_label * (1 + use_bias)]; + if (use_bias) { + bias_idx_array[destination_start_indices[expert_index] + + within_expert_offset] = + weights[local_expert_label * (1 + use_bias) + use_bias]; + } + coefficient_idx_array[destination_start_indices[expert_index] + + within_expert_offset] = &coefficients[rev_idx]; + output_idx_array[destination_start_indices[expert_index] + + within_expert_offset] = &output[token_idx * out_dim]; + } + } +} + +bool use_activation(ActiMode mode) { + switch (mode) { + case AC_MODE_RELU: + case AC_MODE_SIGMOID: + case AC_MODE_TANH: + return true; + case AC_MODE_NONE: + return false; + default: + assert(0); + break; + } + return false; +} + +void experts_forward_GemmBatched_kernel(ExpertsMeta const *m, + void const **weights_ptr, + void const **input_ptr, + void **results_ptr, + void const **bias_ptr, + ActiMode activation, + int in_dim, + int out_dim, 
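The replace_copy_if / reduce / exclusive_scan sequence above amounts to: cap each expert's token count at expert_capacity, sum the capped counts to get the batched-GEMM count, and prefix-sum them to get each expert's start offset in the packed destination array. A host-side sketch of the same arithmetic (C++17, illustrative numbers):

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      int expert_capacity = 4;
      std::vector<int> tokens_per_expert = {6, 2, 5, 1};   // hypothetical per-expert counts
      std::vector<int> capped(tokens_per_expert.size());
      std::transform(tokens_per_expert.begin(), tokens_per_expert.end(), capped.begin(),
                     [&](int n) { return std::min(n, expert_capacity); });
      int gemm_batch_count = std::accumulate(capped.begin(), capped.end(), 0);
      std::vector<int> dest_start(capped.size());
      std::exclusive_scan(capped.begin(), capped.end(), dest_start.begin(), 0);
      // capped = {4, 2, 4, 1}; dest_start = {0, 4, 6, 10}; gemm_batch_count = 11
      printf("gemm_batch_count=%d offsets: %d %d %d %d\n", gemm_batch_count,
             dest_start[0], dest_start[1], dest_start[2], dest_start[3]);
      return 0;
    }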
+ int num_tokens, + int num_chosen_experts, + int gemm_batch_count, + int non_zero_experts_count, + ffStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + + // cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type); + // cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type); + // cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type); + cudaDataType_t input_type = CUDA_R_32F; + cudaDataType_t weight_type = CUDA_R_32F; + cudaDataType_t output_type = CUDA_R_32F; + + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + + checkCUDA(cublasGemmBatchedEx( + m->handle.blas, + CUBLAS_OP_T, // Tranpose Weight, shape (in_dim, out_dim) => (out_dim, + // in_dim) + CUBLAS_OP_N, // Input_token, shape (in_dim, 1) + out_dim, // num_row of (A, C) = out_dim + 1, // num_col of (B, C) = 1 + in_dim, // num_col of A and num_rows of B = in_dim + &alpha, + weights_ptr, // Aarray (num_tokens * chosen_experts, in_dim, out_dim) + weight_type, + in_dim, // Leading Dimension of weight before transpose + input_ptr, // Barray (num_tokens * chosen_experts, in_dim, 1) + input_type, + in_dim, // Leading Dimension of input_token + &beta, + results_ptr, // Carray (num_tokens * chosen_experts, out_dim, 1) + output_type, + out_dim, // Leading Dimension of output + gemm_batch_count, // Total submatrixes + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // TODO 2: bias and activations + if (m->use_bias) { + checkCUDA(cublasGemmBatchedEx( + m->handle.blas, + CUBLAS_OP_T, // Bias, shape (out_dim, 1) + CUBLAS_OP_N, // Coefficient, shape (1, 1) + out_dim, // num_row of (A, C) = out_dim + 1, // num_col of (B, C) = 1 + 1, // num_col of A and num_rows of B = 1 + &alpha, + bias_ptr, // bias tensor (out_dim, 1) + weight_type, + out_dim, // Leading Dimension of bias tensor + (void const **)m->one_ptr_array, // all-one tensor (1, 1) + CUDA_R_32F, + 1, // Leading Dimension of all-one tensor + &alpha, + results_ptr, // Carray (num_tokens * chosen_experts, out_dim, 1) + output_type, + out_dim, // Leading Dimension of output + gemm_batch_count, // Total submatrixs + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + if (use_activation(activation)) { + checkCUDNN(cudnnActivationForward(m->handle.dnn, + m->actiDesc, + &alpha, + m->resultTensorDesc, + m->batch_outputs[0], + &beta, + m->resultTensorDesc, + m->batch_outputs[0])); + } +} + +__global__ void experts_forward_aggregate_kernel(int num_tokens, + int gemm_batch_count, + int out_dim, + float *output, + float **results_ptr, + float const **coefficient_ptr, + float **output_ptr) { + + CUDA_KERNEL_LOOP(i, num_tokens * out_dim) { + output[i] = 0.0f; + } + + __syncthreads(); + + CUDA_KERNEL_LOOP(i, gemm_batch_count * out_dim) { + int token_index = i / out_dim; + int emb_index = i % out_dim; + float res = + results_ptr[token_index][emb_index] * (*coefficient_ptr[token_index]); + atomicAdd(output_ptr[token_index] + emb_index, res); + } +} + /*static*/ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float const *input, @@ -31,14 +382,6 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - int expert_capacity = - ceil(m->alpha * chosen_experts / m->num_experts * batch_size); - - int num_experts = m->num_experts; - // int expert_start_index = experts_start_idx; - bool use_bias = m->use_bias; - // ActiMode activation = m->activation; - cudaEvent_t t_start, t_end; if 
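Functionally, each entry of the batched GEMM above computes one token's output under one chosen expert, and the aggregate kernel then mixes the up-to-k expert outputs per token using the top-k gating coefficients (in the real kernels the bias is added by a second batched GEMM against an all-ones vector, and the coefficient is applied only in the aggregation step). A scalar reference of the combined effect, with made-up dimensions and values:

    #include <cstdio>
    #include <vector>

    // out[token] += coeff * (W_e^T x_token + b_e), accumulated over the experts
    // chosen for that token. Weights are stored (in_dim, out_dim) and read transposed,
    // matching CUBLAS_OP_T with lda = in_dim.
    int main() {
      int in_dim = 3, out_dim = 2;
      std::vector<float> x = {1, 2, 3};                  // one token
      std::vector<float> W = {0.1f, 0.2f, 0.3f,          // column for output unit 0
                              0.4f, 0.5f, 0.6f};         // column for output unit 1
      std::vector<float> bias = {0.01f, 0.02f};
      float coeff = 0.7f;                                // gating weight for this expert
      std::vector<float> out(out_dim, 0.0f);
      for (int o = 0; o < out_dim; o++) {
        float acc = bias[o];
        for (int i = 0; i < in_dim; i++) {
          acc += W[o * in_dim + i] * x[i];
        }
        out[o] += coeff * acc;                           // aggregation across chosen experts
      }
      printf("out = (%f, %f)\n", out[0], out[1]);
      return 0;
    }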
(m->profiling) { cudaEventCreate(&t_start); @@ -46,23 +389,116 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } + int num_experts_per_block = m->num_experts; + int experts_start_idx = m->experts_start_idx; + bool use_bias = m->use_bias; + ActiMode activation = m->activation; + int data_dim = m->data_dim; + int num_chosen_experts = m->num_chosen_experts; + int num_tokens = m->effective_batch_size; + int expert_capacity = m->expert_capacity; + + assert(chosen_experts == num_chosen_experts); + assert(num_tokens == batch_size); + assert(out_dim == m->out_dim); + + // TODO: remove this once we condense all weights in a single tensor + // currently each weight matrix is placed on GPU by Legion, but the array + // holding the pointers to each weight matrix is on CPU cudaMemcpy(m->dev_weights, weights, - num_experts * (1 + use_bias) * sizeof(float *), + num_experts_per_block * (1 + use_bias) * sizeof(float *), cudaMemcpyHostToDevice); - /** TODO: launch one or more kernel(s) to do the following: - * 1. sort the tokens by expert to which they are assigned. This will require - * replicating tokens when chosen_experts > 1 - * 2. matrix multiply (you can use cublasGemmEx) each slice of tokens with the - * corresponding expert's weights tensor. Add the bias. - * - you can obtain the slice by selecting the tokens between the index - * where the expert i starts and min(i+expert_capacity, index where expert i+1 - * starts) - * 3. reorder the outputs by token, and aggregate the outputs of multiple - * experts for the same token by computing an average weighted by the - * appropriate coefficient from the topk_gate_preds matrix. - */ + int num_indices = num_tokens * num_chosen_experts; + // values below are set by Thrust in the experts_forward_thrust_wrapper + // function + int lb_index = 0; + int ub_index = 0; + int num_valid_assignments = 0; + int non_zero_experts_count = 0; + int start_indexes = 0; + int gemm_batch_count = 0; + + experts_forward_thrust_wrapper(m, + indices, + num_indices, + experts_start_idx, + num_experts_per_block, + expert_capacity, + &lb_index, + &ub_index, + &num_valid_assignments, + &non_zero_experts_count, + &start_indexes, + &gemm_batch_count, + stream); + + cudaStreamSynchronize(stream); + + if (num_valid_assignments == 0) { + return; + } + + experts_forward_prepare_kernel<<>>(num_valid_assignments, + expert_capacity, + lb_index, + experts_start_idx, + num_experts_per_block, + num_chosen_experts, + data_dim, + out_dim, + use_bias, + m->sorted_indices, + m->expert_start_indexes, + m->exp_local_label_to_index, + m->destination_start_indices, + m->original_indices, + input, + output, + m->token_idx_array, + m->dev_weights, + m->weight_idx_array, + m->bias_idx_array, + topk_gate_preds, + m->coefficient_idx_array, + m->output_idx_array); + + cudaStreamSynchronize(stream); + + experts_forward_GemmBatched_kernel(m, + (void const **)m->weight_idx_array, + (void const **)m->token_idx_array, + (void **)m->dev_batch_outputs, + (void const **)m->bias_idx_array, + activation, + data_dim, + out_dim, + num_tokens, + num_chosen_experts, + gemm_batch_count, + non_zero_experts_count, + stream); + + cudaStreamSynchronize(stream); + + int aggregation_parallelism = + std::max(num_tokens, gemm_batch_count) * out_dim; + experts_forward_aggregate_kernel<<>>(num_tokens, + gemm_batch_count, + out_dim, + output, + m->dev_batch_outputs, + m->coefficient_idx_array, + m->output_idx_array); if (m->profiling) { cudaEventRecord(t_end, stream); @@ -78,17 +514,146 
@@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, ExpertsMeta::ExpertsMeta(FFHandler handler, int _num_experts, int _experts_start_idx, + int _data_dim, + int _out_dim, + int _effective_batch_size, + int _num_chosen_experts, float _alpha, bool _use_bias, ActiMode _activation) : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), alpha(_alpha), use_bias(_use_bias), - activation(_activation) { + experts_start_idx(_experts_start_idx), data_dim(_data_dim), + out_dim(_out_dim), effective_batch_size(_effective_batch_size), + num_chosen_experts(_num_chosen_experts), alpha(_alpha), + use_bias(_use_bias), activation(_activation) { + expert_capacity = + ceil(alpha * num_chosen_experts / num_experts * effective_batch_size); + + checkCUDA( + cudaMalloc(&sorted_indices, + num_chosen_experts * effective_batch_size * sizeof(int))); + checkCUDA( + cudaMalloc(&original_indices, + num_chosen_experts * effective_batch_size * sizeof(int))); + checkCUDA(cudaMalloc(&non_zero_expert_labels, num_experts * sizeof(int))); + checkCUDA(cudaMalloc( + &temp_sequence, + std::max(num_experts, num_chosen_experts * effective_batch_size) * + sizeof(int))); + checkCUDA(cudaMalloc(&exp_local_label_to_index, num_experts * sizeof(int))); + // expert_start_indexes needs one more slot to save the upper bound index + checkCUDA(cudaMalloc(&expert_start_indexes, (num_experts + 1) * sizeof(int))); + checkCUDA(cudaMalloc(&num_assignments_per_expert, num_experts * sizeof(int))); + capped_num_assignments_per_expert = (int *)malloc(num_experts * sizeof(int)); + checkCUDA(cudaMalloc(&destination_start_indices, num_experts * sizeof(int))); + + checkCUDA( + cudaMalloc(&token_idx_array, + num_chosen_experts * effective_batch_size * sizeof(float *))); checkCUDA( cudaMalloc(&dev_weights, num_experts * (1 + use_bias) * sizeof(float *))); + checkCUDA( + cudaMalloc(&weight_idx_array, + num_chosen_experts * effective_batch_size * sizeof(float *))); + checkCUDA( + cudaMalloc(&bias_idx_array, + num_chosen_experts * effective_batch_size * sizeof(float *))); + checkCUDA( + cudaMalloc(&coefficient_idx_array, + num_chosen_experts * effective_batch_size * sizeof(float *))); + checkCUDA( + cudaMalloc(&output_idx_array, + num_chosen_experts * effective_batch_size * sizeof(float *))); + batch_outputs = new float *[num_chosen_experts * effective_batch_size]; + checkCUDA(cudaMalloc(&batch_outputs[0], + out_dim * num_chosen_experts * effective_batch_size * + sizeof(float))); + for (int i = 1; i < num_chosen_experts * effective_batch_size; i++) { + batch_outputs[i] = batch_outputs[i - 1] + out_dim * sizeof(float); + } + checkCUDA( + cudaMalloc(&dev_batch_outputs, + num_chosen_experts * effective_batch_size * sizeof(float *))); + checkCUDA( + cudaMemcpy(dev_batch_outputs, + batch_outputs, + num_chosen_experts * effective_batch_size * sizeof(float *), + cudaMemcpyHostToDevice)); + // Bias + float *dram_one_ptr = (float *)malloc(sizeof(float) * 1); + for (int i = 0; i < 1; i++) { + dram_one_ptr[i] = 1.0f; + } + float *fb_one_ptr; + checkCUDA(cudaMalloc(&fb_one_ptr, sizeof(float) * 1)); + checkCUDA(cudaMemcpy( + fb_one_ptr, dram_one_ptr, sizeof(float) * 1, cudaMemcpyHostToDevice)); + one_ptr = (float const *)fb_one_ptr; + free((void *)dram_one_ptr); + checkCUDA( + cudaMalloc(&one_ptr_array, + num_chosen_experts * effective_batch_size * sizeof(float *))); + for (int i = 0; i < num_chosen_experts * effective_batch_size; i++) { + checkCUDA(cudaMemcpy(&one_ptr_array[i], + &fb_one_ptr, + sizeof(float *), + 
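For reference, the expert_capacity formula computed in the constructor above bounds how many tokens any single expert will process per forward pass; assignments beyond the cap are dropped by the prepare kernel. A quick worked instance with hypothetical numbers:

    #include <cmath>
    #include <cstdio>

    int main() {
      float alpha = 2.0f;              // capacity factor (hypothetical)
      int num_chosen_experts = 2;      // k
      int num_experts = 64;            // experts handled per block
      int effective_batch_size = 512;  // tokens per forward pass
      int expert_capacity =
          (int)ceil(alpha * num_chosen_experts / num_experts * effective_batch_size);
      // 2 * 2 / 64 * 512 = 32 tokens per expert
      printf("expert_capacity = %d\n", expert_capacity);
      return 0;
    }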
cudaMemcpyHostToDevice)); + } + // Activation + checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); + checkCUDNN(cudnnCreateTensorDescriptor(&resultTensorDesc)); + if (use_activation(activation)) { + cudnnActivationMode_t mode; + switch (activation) { + case AC_MODE_RELU: + mode = CUDNN_ACTIVATION_RELU; + break; + case AC_MODE_SIGMOID: + mode = CUDNN_ACTIVATION_SIGMOID; + break; + default: + // Unsupported activation mode + assert(false); + } + checkCUDNN( + cudnnSetActivationDescriptor(actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); + checkCUDNN( + cudnnSetTensor4dDescriptor(resultTensorDesc, + CUDNN_TENSOR_NCHW, + // CUDNN_DATA_FLOAT, + cuda_to_cudnn_datatype(CUDA_R_32F), + num_chosen_experts * effective_batch_size, + out_dim, + 1, + 1)); + } } ExpertsMeta::~ExpertsMeta(void) { - checkCUDA(cudaFree(&dev_weights)); + + checkCUDA(cudaFree(sorted_indices)); + checkCUDA(cudaFree(original_indices)); + checkCUDA(cudaFree(non_zero_expert_labels)); + checkCUDA(cudaFree(temp_sequence)); + checkCUDA(cudaFree(exp_local_label_to_index)); + checkCUDA(cudaFree(expert_start_indexes)); + checkCUDA(cudaFree(num_assignments_per_expert)); + free(capped_num_assignments_per_expert); + checkCUDA(cudaFree(destination_start_indices)); + checkCUDA(cudaFree(token_idx_array)); + checkCUDA(cudaFree(dev_weights)); + checkCUDA(cudaFree(weight_idx_array)); + checkCUDA(cudaFree(coefficient_idx_array)); + checkCUDA(cudaFree(output_idx_array)); + checkCUDA(cudaFree(dev_batch_outputs)); + checkCUDA(cudaFree(bias_idx_array)); + checkCUDA(cudaFree(batch_outputs[0])); + delete[] batch_outputs; + // Bias + checkCUDA(cudaFree((void *)one_ptr)); + checkCUDA(cudaFree((void *)one_ptr_array)); + // Activation + checkCUDNN(cudnnDestroyActivationDescriptor(actiDesc)); + checkCUDNN(cudnnDestroyTensorDescriptor(resultTensorDesc)); } }; // namespace FlexFlow diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 53e61b90d9..1da2e492ed 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -318,6 +318,34 @@ cudaDataType_t ff_to_cuda_datatype(DataType type) { return CUDA_R_32F; } +cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type) { + switch (type) { + case CUDNN_DATA_FLOAT: + return CUDA_R_32F; + case CUDNN_DATA_DOUBLE: + return CUDA_R_64F; + case CUDNN_DATA_INT32: + return CUDA_R_32I; + default: + assert(false && "Unsupported cuda data type"); + } + return CUDA_R_32F; +} + +cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { + switch (type) { + case CUDA_R_32F: + return CUDNN_DATA_FLOAT; + case CUDA_R_64F: + return CUDNN_DATA_DOUBLE; + case CUDA_R_32I: + return CUDNN_DATA_INT32; + default: + assert(false && "Unsupported cudnn data type"); + } + return CUDNN_DATA_FLOAT; +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void From 9c879159090b77e3cd19f7f493733e7ac08ae6f1 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 10 Mar 2023 23:32:19 +0000 Subject: [PATCH 072/344] [BatchConfig] make num_active_requests and num_activate_tokens const functions --- include/flexflow/batch_config.h | 5 ++-- src/runtime/batch_config.cc | 44 +++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 558e7841b5..c2a77cf3d6 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -30,8 +30,9 @@ class BatchConfig { bool register_new_request(size_t guid, int length); void prepare_next_batch(); int 
update_results(InferenceResult const &ir); - int num_active_requests(); - int num_active_tokens(); + bool update_num_active_requests_tokens(); + int num_active_requests() const; + int num_active_tokens() const; static int const MAX_NUM_REQUESTS = 256; static int const MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; static int const MAX_SEQUENCE_LENGTH = 1024; diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 99c3a18b63..936590905b 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -29,6 +29,7 @@ BatchConfig::BatchConfig() { request_completed[i] = true; num_processing_tokens[i] = 0; } + update_num_active_requests_tokens(); } int BatchConfig::update_results(InferenceResult const &ir) { @@ -71,6 +72,7 @@ int BatchConfig::update_results(InferenceResult const &ir) { } num_processing_tokens[i] = 0; } + update_num_active_requests_tokens(); return completed; } @@ -84,15 +86,17 @@ bool BatchConfig::register_new_request(size_t guid, int length) { request_guid[i] = guid; num_processing_tokens[i] = 0; request_completed[i] = false; + update_num_active_requests_tokens(); return true; } } + update_num_active_requests_tokens(); return false; } void BatchConfig::prepare_next_batch() { cached_results = false; - int num_tokens = 0; + int count = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (request_completed[i]) { continue; @@ -104,15 +108,13 @@ void BatchConfig::prepare_next_batch() { } else { num_processing_tokens[i] = MAX_NUM_TOKENS - num_tokens; } - num_tokens += num_processing_tokens[i]; + count += num_processing_tokens[i]; } - log_bc.print("[NextBatch] num_tokens(%d)", num_tokens); + update_num_active_requests_tokens(); + log_bc.print("[NextBatch] num_tokens(%d)", count); } -int BatchConfig::num_active_requests() { - if (cached_results) { - return num_requests; - } +bool BatchConfig::update_num_active_requests_tokens() { num_requests = 0; num_tokens = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { @@ -122,23 +124,27 @@ int BatchConfig::num_active_requests() { } } cached_results = true; - return num_requests; + return true; } -int BatchConfig::num_active_tokens() { +int BatchConfig::num_active_requests() const { if (cached_results) { - return num_tokens; + return num_requests; + } else { + assert(false && + "some BatchConfig functions updated requests but didn't call " + "update_num_active_requests_tokens() before exit"); } - num_requests = 0; - num_tokens = 0; - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - if (!request_completed[i]) { - num_requests++; - num_tokens += num_processing_tokens[i]; - } +} + +int BatchConfig::num_active_tokens() const { + if (cached_results) { + return num_tokens; + } else { + assert(false && + "some BatchConfig functions updated requests but didn't call " + "update_num_active_requests_tokens() before exit"); } - cached_results = true; - return num_tokens; } }; // namespace FlexFlow From 0f8ce7682eeeb1e5b970c30d2896fbec5edada08 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Mar 2023 19:00:04 -0500 Subject: [PATCH 073/344] removed unnecessary array --- src/ops/experts.cu | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/ops/experts.cu b/src/ops/experts.cu index c74c77b399..803d6da6c4 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -170,12 +170,6 @@ void experts_forward_thrust_wrapper(ExpertsMeta const *m, exceeds_expert_capacity(expert_capacity), expert_capacity); - cudaMemcpyAsync(m->capped_num_assignments_per_expert, - m->destination_start_indices, - 
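The BatchConfig refactor above turns the request/token counters into cached values that every mutating method must refresh, so the const getters stay cheap and any mutator that forgets to refresh trips an assertion. A stripped-down sketch of that invariant (hypothetical class, not the actual BatchConfig interface):

    #include <cassert>

    class CountedBatch {               // stand-in for BatchConfig
    public:
      void add_request(int tokens) {   // every mutator ends by refreshing the cache
        num_requests_++;
        num_tokens_ += tokens;
        cached_ = true;
      }
      int num_active_requests() const {
        assert(cached_ && "a mutator forgot to refresh the cached counts");
        return num_requests_;
      }
      int num_active_tokens() const {
        assert(cached_ && "a mutator forgot to refresh the cached counts");
        return num_tokens_;
      }
    private:
      bool cached_ = false;
      int num_requests_ = 0;
      int num_tokens_ = 0;
    };

    int main() {
      CountedBatch bc;
      bc.add_request(17);
      assert(bc.num_active_requests() == 1 && bc.num_active_tokens() == 17);
      return 0;
    }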
(*non_zero_experts_count) * sizeof(int), - cudaMemcpyDeviceToHost, - stream); - *gemm_batch_count = thrust::reduce(thrust::cuda::par.on(stream), destination_start_indices, @@ -269,7 +263,6 @@ void experts_forward_GemmBatched_kernel(ExpertsMeta const *m, int num_tokens, int num_chosen_experts, int gemm_batch_count, - int non_zero_experts_count, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); @@ -481,7 +474,6 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, num_tokens, num_chosen_experts, gemm_batch_count, - non_zero_experts_count, stream); cudaStreamSynchronize(stream); @@ -544,7 +536,6 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, // expert_start_indexes needs one more slot to save the upper bound index checkCUDA(cudaMalloc(&expert_start_indexes, (num_experts + 1) * sizeof(int))); checkCUDA(cudaMalloc(&num_assignments_per_expert, num_experts * sizeof(int))); - capped_num_assignments_per_expert = (int *)malloc(num_experts * sizeof(int)); checkCUDA(cudaMalloc(&destination_start_indices, num_experts * sizeof(int))); checkCUDA( @@ -637,7 +628,6 @@ ExpertsMeta::~ExpertsMeta(void) { checkCUDA(cudaFree(exp_local_label_to_index)); checkCUDA(cudaFree(expert_start_indexes)); checkCUDA(cudaFree(num_assignments_per_expert)); - free(capped_num_assignments_per_expert); checkCUDA(cudaFree(destination_start_indices)); checkCUDA(cudaFree(token_idx_array)); checkCUDA(cudaFree(dev_weights)); From 41343bb00afee0672b1d1d8798581c96facb99d2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 18 Mar 2023 04:06:13 -0400 Subject: [PATCH 074/344] [Inference] - Load data using BatchConfig (#656) * update dataloader, inference loop * fix * linting * fixes * update dataloader, remove duplicate files * fix bugs * fix * backup * revert inf manager changes * backup * bakcup * bug fix --- examples/cpp/inference/data_generator.cc | 187 +++++---- examples/cpp/inference/data_generator.cpp | 50 ++- examples/cpp/inference/data_generator.h | 62 ++- examples/cpp/inference/dataloader.cc | 187 +++++++++ examples/cpp/inference/dataloader.cu | 108 +++++ examples/cpp/inference/dataloader.h | 61 +++ examples/cpp/inference/gpt_tokenizer.cpp | 15 + examples/cpp/inference/inference_config.h | 69 ++++ .../mixture_of_experts/CMakeLists.txt | 7 +- .../mixture_of_experts/dataloader.cc | 297 -------------- .../mixture_of_experts/dataloader.cu | 115 ------ .../cpp/inference/mixture_of_experts/moe.cc | 104 +++-- .../cpp/inference/mixture_of_experts/moe.h | 90 +--- .../cpp/inference/transformers/CMakeLists.txt | 7 +- .../cpp/inference/transformers/dataloader.cc | 388 ------------------ .../cpp/inference/transformers/dataloader.cu | 115 ------ .../inference/transformers/transformers.cc | 145 ++++--- .../cpp/inference/transformers/transformers.h | 110 +---- include/flexflow/batch_config.h | 12 +- include/flexflow/operator.h | 1 - include/flexflow/ops/aggregate.h | 1 - include/flexflow/ops/aggregate_spec.h | 1 - include/flexflow/ops/attention.h | 1 - include/flexflow/ops/element_binary.h | 1 - include/flexflow/ops/experts.h | 3 +- include/flexflow/ops/groupby.h | 1 - .../ops/inc_multihead_self_attention.h | 1 - include/flexflow/ops/layer_norm.h | 1 - include/flexflow/ops/linear.h | 1 - include/flexflow/ops/noop.h | 1 - include/flexflow/ops/softmax.h | 1 - include/flexflow/ops/topk.h | 1 - include/flexflow/parallel_ops/partition.h | 1 - src/ops/aggregate.cc | 1 - src/ops/aggregate_spec.cc | 1 - src/ops/attention.cc | 1 - src/ops/element_binary.cc | 1 - src/ops/experts.cc | 1 - src/ops/group_by.cc | 1 - 
src/ops/inc_multihead_self_attention.cc | 1 - src/ops/layer_norm.cc | 1 - src/ops/linear.cc | 1 - src/ops/noop.cc | 1 - src/ops/softmax.cc | 1 - src/ops/topk.cc | 1 - src/parallel_ops/partition.cc | 1 - src/runtime/batch_config.cc | 3 +- src/runtime/inference_manager.cc | 3 +- 48 files changed, 805 insertions(+), 1359 deletions(-) create mode 100644 examples/cpp/inference/dataloader.cc create mode 100644 examples/cpp/inference/dataloader.cu create mode 100644 examples/cpp/inference/dataloader.h create mode 100644 examples/cpp/inference/inference_config.h delete mode 100644 examples/cpp/inference/mixture_of_experts/dataloader.cc delete mode 100644 examples/cpp/inference/mixture_of_experts/dataloader.cu delete mode 100644 examples/cpp/inference/transformers/dataloader.cc delete mode 100644 examples/cpp/inference/transformers/dataloader.cu diff --git a/examples/cpp/inference/data_generator.cc b/examples/cpp/inference/data_generator.cc index 3e8daa7d41..961052537e 100644 --- a/examples/cpp/inference/data_generator.cc +++ b/examples/cpp/inference/data_generator.cc @@ -1,3 +1,18 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include "data_generator.h" #include #include @@ -6,133 +21,133 @@ using namespace std; DataGenerator::DataGenerator(size_t _num_requests, size_t _token_dim, - size_t _sequence_length, + size_t _min_input_tokens, + size_t _max_input_tokens, + size_t _min_tokens_to_generate, + size_t _max_tokens_to_generate, bool _poisson_distr, double _lambda) : num_requests(_num_requests), token_dim(_token_dim), - sequence_length(_sequence_length), poisson_distr(_poisson_distr), - lambda(_lambda), timer_started(false), global_unique_id(1000000) { - generate_arrival_times(); -}; - -void DataGenerator::generate_requests(float *req_ptr, - int *label_ptr, - int num_labels) { - assert(req_ptr != nullptr); - /* for (size_t i=0; i float_dist{0, 1.0}; - auto gen = [&float_dist, &mersenne_engine]() { - return float_dist(mersenne_engine); - }; - std::generate( - req_ptr, req_ptr + token_dim * sequence_length * num_requests, gen); - - if (label_ptr != nullptr) { - assert(num_labels > 0); - /* for (size_t i=0; i int_dist{0, num_labels}; - auto gen_label = [&int_dist, &mersenne_engine]() { - return int_dist(mersenne_engine); - }; - std::generate( - label_ptr, label_ptr + sequence_length * num_requests, gen_label); - } + min_input_tokens(_min_input_tokens), max_input_tokens(_max_input_tokens), + min_tokens_to_generate(_min_tokens_to_generate), + max_tokens_to_generate(_max_tokens_to_generate), + poisson_distr(_poisson_distr), lambda(_lambda), timer_started(false) { + generate_requests_meta(); }; -void DataGenerator::generate_arrival_times(void) { - // set up a uniform number generator with range [0,1) - random_device rnd; - mt19937 gen(rnd()); - uniform_real_distribution dist{0, 1.0}; +// generate each request's arrival time and sequence length +void DataGenerator::generate_requests_meta() { + 
random_device rnd1, rnd2, rnd3; + mt19937 gen1(rnd1()), gen2(rnd2()), gen3(rnd3()); + // set up a uniform number generator with range [0,1) (in seconds) for the + // arrival times + uniform_real_distribution dist1{0, 1.0}; double cur_arrival = 0; // assume first request comes in at time 0 + // set up a uniform number generator for the initial/generated sequence length + uniform_int_distribution dist2{min_input_tokens, + max_input_tokens}; + uniform_int_distribution dist3{min_tokens_to_generate, + max_tokens_to_generate}; + size_t cur_seq_len = dist2(gen2); + size_t tokens_to_generate = dist3(gen3); for (size_t i = 0; i < num_requests; i++) { arrivals.push_back(cur_arrival); if (poisson_distr) { - double u = dist(gen); + double u = dist1(gen1); double interval = -(1 / lambda) * log(1 - u) * 1000; cur_arrival += interval; } else { cur_arrival += (1000 / lambda); } + seq_lengths.push_back(std::make_pair(cur_seq_len, tokens_to_generate)); + cur_seq_len = dist2(gen2); + tokens_to_generate = dist3(gen3); } // cout << "Arrivals : ["; // copy(arrivals.begin(), arrivals.end(), ostream_iterator(cout, " ")); // cout << "]" << endl; }; +void DataGenerator::generate_requests(float *req_ptr) { + assert(req_ptr != nullptr); + /* for (size_t i=0; i float_dist{0, 1.0}; + auto gen = [&float_dist, &mersenne_engine]() { + return float_dist(mersenne_engine); + }; + std::generate( + req_ptr, req_ptr + token_dim * max_input_tokens * num_requests, gen); +}; + void DataGenerator::start_timer(void) { arrivals_ptr = arrivals.begin(); start_time = Clock::now(); timer_started = true; }; -size_t DataGenerator::get_requests( - size_t max_num_requests, - std::vector>> &prompts) { +// In non-incremental mode, the number of requests we want is limited by the +// tensor's batch size. As long as each request has a length that is shorter +// than the tensor's max sequence length, we do not need to impose any +// additional requirement on the max number of tokens across requests. We can +// thus pass max_tokens = max_requests * tensor max sequence length as a +// placeholder. In incremental mode, the max number of requests is only limited +// by the BatchConfig request capacity (for storing each request's metadata), +// whereas the total number number of tokens across requests will be limited by +// the tensor's batch_size * sequence length. +std::pair DataGenerator::get_requests(size_t max_requests, + size_t max_tokens) { if (!timer_started) { std::cout << "Warning: tried to get number of requests before the timer " "was started." 
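The arrival-time loop above draws exponential inter-arrival gaps by inverse-CDF sampling, which is what makes the request stream Poisson with rate lambda; the non-Poisson branch simply spaces requests 1000/lambda milliseconds apart. A minimal standalone sketch of the Poisson case:

    #include <cmath>
    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
      double lambda = 250.0;   // mean arrivals per second
      size_t num_requests = 5;
      std::mt19937 gen(std::random_device{}());
      std::uniform_real_distribution<double> u01(0.0, 1.0);
      std::vector<double> arrivals_ms;
      double t = 0.0;          // first request arrives at time 0
      for (size_t i = 0; i < num_requests; i++) {
        arrivals_ms.push_back(t);
        double u = u01(gen);
        t += -(1.0 / lambda) * std::log(1.0 - u) * 1000.0;  // exponential gap, in ms
      }
      for (double a : arrivals_ms) {
        printf("%.2f ms\n", a);
      }
      return 0;
    }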
<< std::endl; - return 0; + return std::make_pair(0, 0); } Clock::time_point cur_time = Clock::now(); size_t ms_from_start = chrono::duration_cast(cur_time - start_time).count(); - vector::iterator new_arrivals_ptr = + std::vector::iterator new_arrivals_ptr = upper_bound(arrivals_ptr, arrivals.end(), ms_from_start); - size_t received_requests = new_arrivals_ptr - arrivals_ptr; - arrivals_ptr = new_arrivals_ptr; - if (received_requests > 0) { - std::cout << "received " << received_requests - << " request(s) by arrival time +" << ms_from_start << "ms" - << "\n"; - } - - for (size_t i = 0; i < received_requests; i++) { - int length = std::rand() % 10 + 5; - std::vector prompt; - for (int j = 0; j < length; j++) { - prompt.push_back(j + 1000); + // number of new requests received + size_t received_requests = 0; + // id of first received request + size_t first_request_guid = arrivals_ptr - arrivals.begin(); + size_t new_tokens = 0; + for (size_t j = 0; + j < std::min((size_t)(new_arrivals_ptr - arrivals_ptr), max_requests) && + new_tokens < max_tokens; + j++) { + if (seq_lengths[first_request_guid + j].first <= max_tokens - new_tokens) { + received_requests++; + new_tokens += seq_lengths[first_request_guid + j].first; } - prompts.push_back(std::make_pair(global_unique_id++, prompt)); } - assert(prompts.size() == received_requests); - return received_requests; -} + std::advance(arrivals_ptr, received_requests); -size_t DataGenerator::get_requests() { - if (!timer_started) { - std::cout << "Warning: tried to get number of requests before the timer " - "was started." - << std::endl; - return 0; - } - Clock::time_point cur_time = Clock::now(); - size_t ms_from_start = - chrono::duration_cast(cur_time - start_time).count(); - vector::iterator new_arrivals_ptr = - upper_bound(arrivals_ptr, arrivals.end(), ms_from_start); - size_t received_requests = new_arrivals_ptr - arrivals_ptr; - arrivals_ptr = new_arrivals_ptr; - if (received_requests > 0) { + /* if (received_requests > 0) { std::cout << "received " << received_requests << " request(s) by arrival time +" << ms_from_start << "ms" << "\n"; - } + } */ + + return std::make_pair(first_request_guid, received_requests); +} - return received_requests; +std::pair DataGenerator::get_request_length(size_t guid) { + assert(seq_lengths.size() > + guid); // make sure the guid is valid (seq_lengths has an entry for the + // sequence with given guid) + return seq_lengths[guid]; } diff --git a/examples/cpp/inference/data_generator.cpp b/examples/cpp/inference/data_generator.cpp index 9aeb9b49a9..4201e36915 100644 --- a/examples/cpp/inference/data_generator.cpp +++ b/examples/cpp/inference/data_generator.cpp @@ -18,43 +18,63 @@ int main(int argc, char const *argv[]) { cout << "Starting the Data DataGenerator!\n"; // DataGenerator parameters - size_t total_requests = 256; + size_t total_requests = 2560; size_t token_dim = 16; - size_t sequence_length = 20; + size_t max_sequence_length = 512 + 128; bool use_poisson_distr = true; // average number of request arrivals per second - double lambda = 25; - int label_dims = 10; + double lambda = 250; + + size_t min_input_tokens = 32, max_input_tokens = 512, + min_tokens_to_generate = 1, max_tokens_to_generate = 128; float *requests = (float *)calloc( - token_dim * sequence_length * total_requests, sizeof(float)); - int *labels = (int *)calloc(sequence_length * total_requests, sizeof(int)); + token_dim * max_sequence_length * total_requests, sizeof(float)); - DataGenerator data_generator( - total_requests, token_dim, 
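get_requests() above is essentially: binary-search the sorted arrival times for everything that has arrived by now, then admit requests in order until either the request budget or the token budget is exhausted. A host-side sketch of that admission logic, with illustrative names and numbers:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> arrivals_ms = {0, 3, 9, 14, 30};           // sorted arrival times
      std::vector<size_t> prompt_lengths = {40, 300, 120, 500, 64};  // tokens per request
      double ms_from_start = 15.0;
      size_t max_requests = 4, max_tokens = 480;

      auto cursor = arrivals_ms.begin();                             // next unserved request
      auto arrived_end = std::upper_bound(cursor, arrivals_ms.end(), ms_from_start);
      size_t first_guid = cursor - arrivals_ms.begin();
      size_t admitted = 0, tokens = 0;
      for (size_t j = 0; j < std::min<size_t>(arrived_end - cursor, max_requests) &&
                         tokens < max_tokens; j++) {
        if (prompt_lengths[first_guid + j] <= max_tokens - tokens) {
          admitted++;
          tokens += prompt_lengths[first_guid + j];
        }
      }
      printf("admitted %zu request(s), %zu prompt tokens\n", admitted, tokens);
      return 0;
    }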
sequence_length, use_poisson_distr, lambda); - data_generator.generate_requests(requests, labels, label_dims); + DataGenerator data_generator(total_requests, + token_dim, + min_input_tokens, + max_input_tokens, + min_tokens_to_generate, + max_tokens_to_generate, + use_poisson_distr, + lambda); + data_generator.generate_requests(requests); data_generator.start_timer(); - size_t received_requests = data_generator.get_requests(); + size_t received_requests = 0; + std::pair reqs = data_generator.get_requests(0, 0); + size_t guid = reqs.first; + assert(reqs.second == 0); + this_thread::sleep_for(milliseconds(50)); + + reqs = data_generator.get_requests(2560, 2560 * (512)); + received_requests += reqs.second; std::cout << "t=0ms: received " << received_requests << std::endl; this_thread::sleep_for(milliseconds(1200)); - received_requests = data_generator.get_requests(); + reqs = data_generator.get_requests(2560, 2560 * (512)); + received_requests += reqs.second; std::cout << "t=1200ms: received " << received_requests << std::endl; this_thread::sleep_for(milliseconds(10)); - received_requests = data_generator.get_requests(); + reqs = data_generator.get_requests(2560, 2560 * (512)); + received_requests += reqs.second; std::cout << "t=1210ms: received " << received_requests << std::endl; this_thread::sleep_for(milliseconds(4000)); - received_requests = data_generator.get_requests(); + reqs = data_generator.get_requests(2560, 2560 * (512)); + received_requests += reqs.second; std::cout << "t=5210ms: received " << received_requests << std::endl; this_thread::sleep_for(milliseconds(5000)); - received_requests = data_generator.get_requests(); + + reqs = data_generator.get_requests(2560, 2560 * (512)); + received_requests += reqs.second; std::cout << "t=10210ms: received " << received_requests << std::endl; free(requests); - free(labels); + + assert(received_requests == total_requests); return 0; } diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index 777c7cadd7..d83df8e1fe 100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -1,4 +1,20 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once + #include #include #include @@ -8,7 +24,9 @@ #include #include #include + using namespace std; + typedef std::chrono::high_resolution_clock Clock; typedef std::chrono::milliseconds milliseconds; @@ -16,39 +34,45 @@ class DataGenerator { public: DataGenerator(size_t _num_requests, size_t _token_dim, - size_t _sequence_length, + size_t _min_input_tokens, + size_t _max_input_tokens, + size_t _min_tokens_to_generate, + size_t _max_tokens_to_generate, bool _poisson_distr, double _lambda); // Generate random requests by filling each token with random data. For now, // assume all requests have the same sequence length. Also generate random // labels (if label_ptr != nullptr and num_labels >0). 
- void generate_requests(float *req_ptr, - int *label_ptr = nullptr, - int num_labels = 0); + void generate_requests(float *req_ptr); void start_timer(void); // Get number of requests that have arrived since the last time this function // was called - size_t - get_requests(size_t max_num_requests, - std::vector>> &prompts); - size_t get_requests(); + std::pair get_requests(size_t max_requests, + size_t max_tokens); + std::pair get_request_length(size_t guid); + // size_t max_sequence_length; // dimension of one request tensor private: // Compute the arrival times of each request and save them in the arrivals // vector. - void generate_arrival_times(void); - - size_t num_requests; // total number of requests - size_t token_dim; // embedding dim of each token - size_t sequence_length; // dimension of one request tensor - bool poisson_distr; // false implies uniform distribution - double lambda; // mean #num of arrivals per sec - bool timer_started; // whether timer was initiated - size_t global_unique_id; // guid for requests + // void generate_arrival_times(void); + void generate_requests_meta(); + + size_t num_requests; // total number of requests + size_t token_dim; // embedding dim of each token + size_t min_input_tokens; + size_t max_input_tokens; + size_t min_tokens_to_generate; + size_t max_tokens_to_generate; + bool poisson_distr; // false implies uniform distribution + double lambda; // mean #num of arrivals per sec + bool timer_started; // whether timer was initiated // time when get_requests() is called for the first time Clock::time_point start_time; // arrival times (ms) generated based on distribution - vector arrivals; - vector::iterator arrivals_ptr; + std::vector arrivals; + std::vector::iterator arrivals_ptr; + // sequence lengths generated based on uniform distribution + std::vector> seq_lengths; }; diff --git a/examples/cpp/inference/dataloader.cc b/examples/cpp/inference/dataloader.cc new file mode 100644 index 0000000000..ae7cb5ccd0 --- /dev/null +++ b/examples/cpp/inference/dataloader.cc @@ -0,0 +1,187 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dataloader.h" +#include "flexflow/inference.h" +#include "inference_config.h" + +using namespace Legion; + +DataLoader::DataLoader(FFModel &ff, + InferenceConfig const &inferenceConfig, + DataGenerator &data_generator, + ParallelTensor input) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + + int numdims = input->num_dims; + int replica_idx = numdims - 1; + int batch_idx = numdims - 2; + num_samples = inferenceConfig.total_requests; + + // Create full input + { + batch_input = input; + + ParallelDim dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i].size = input->dims[i].size; + dims[i].degree = 1; + dims[i].parallel_idx = -1; + dims[i].is_replica_dim = input->dims[i].is_replica_dim; + // Assume only the first dim can be the replica dim + assert(i == replica_idx || (!dims[i].is_replica_dim)); + } + assert(dims[batch_idx].size == inferenceConfig.batch_size); + dims[batch_idx].size = num_samples; + + full_input = + ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_FLOAT); + ff.map_tensor(full_input, NULL /*parallel_op*/); + } + + // Load entire dataset + // TODO: Use index launcher instead of task launcher + assert(full_input != nullptr && "full_input is nullptr"); + + DataLoaderInput dataloader_input = {inferenceConfig, data_generator}; + DataLoaderInput const *ptr = &dataloader_input; + + TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, + TaskArgument(ptr, sizeof(DataLoaderInput))); + // regions[0]: full_input + launcher.add_region_requirement(RegionRequirement(full_input->region, + WRITE_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + + runtime->execute_task(ctx, launcher); +} + +void DataLoader::load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + DataLoaderInput const input_struct = *((DataLoaderInput *)task->args); + InferenceConfig const &conf = input_struct._inferenceConfig; + DataGenerator &datagen = input_struct._data_generator; + assert(regions.size() == 1); + assert(task->regions.size() == regions.size()); + + // get input pointer + float *input_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + int input_dims = input_domain.get_dim(); + for (int i = 0; i < input_dims; i++) { + int input_dim = input_domain.hi()[i] - input_domain.lo()[i] + 1; + } + + if (conf.dataset_path.length() == 0) { + printf("Input dataset path is empty, using random input samples\n"); + datagen.generate_requests(input_ptr); + } else { + // Load specific dataset + } +} + +void DataLoader::next_batch(FFModel &ff, BatchConfig *bc) { + size_t num_active_tokens = bc->num_active_tokens(); + if (num_active_tokens == 0) { + return; + } + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + // Load input + { + Domain domain = + runtime->get_index_space_domain(ctx, batch_input->parallel_is); + ArgumentMap argmap; + // No partitioning of the batch input token in inference mode + int input_dims = batch_input->num_dims; + for (int i = 0; i < input_dims; i++) { + assert(batch_input->dims[i].degree == 1 && + "Dataloader does not support input token partitioning in " + "inference mode"); + } + int batch_size = batch_input->dims[input_dims - 2].size; + int seq_len = batch_input->dims[input_dims - 3].size; + assert(ff.config.batchSize == batch_size && + batch_size * seq_len >= num_active_tokens); 
+ for (Domain::DomainPointIterator it(domain); it; it++) { + SampleIdxs meta; + meta.num_samples = num_active_tokens; + meta.incremental_mode = bc->incremental_mode; + int token_index = 0; + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } else { + for (int j = 0; j < bc->num_processing_tokens[i]; j++) { + meta.guids[token_index] = bc->request_guid[i]; + meta.idxs[token_index] = bc->token_start_idx[i] + j; + token_index++; + } + } + } + assert(token_index == num_active_tokens); + argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + } + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, + batch_input->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_input->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(full_input->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_input->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); + } +} + +void FlexFlow::register_custom_tasks() { + // Load entire dataset + { + TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Entire Dataset Task"); + } + // Load input + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Input Task"); + } +} diff --git a/examples/cpp/inference/dataloader.cu b/examples/cpp/inference/dataloader.cu new file mode 100644 index 0000000000..6f8c6c19ad --- /dev/null +++ b/examples/cpp/inference/dataloader.cu @@ -0,0 +1,108 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
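The SampleIdxs metadata built above flattens the BatchConfig's per-request bookkeeping into one (guid, position) pair per active token, which is the form the GPU loader task consumes. A host-side sketch of that flattening with a hypothetical request state:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      int const MAX_NUM_REQUESTS = 4;
      bool request_completed[MAX_NUM_REQUESTS]    = {false, true, false, true};
      size_t request_guid[MAX_NUM_REQUESTS]       = {1000000, 0, 1000002, 0};
      int token_start_idx[MAX_NUM_REQUESTS]       = {5, 0, 0, 0};  // next position per request
      int num_processing_tokens[MAX_NUM_REQUESTS] = {3, 0, 2, 0};  // tokens in this batch

      std::vector<size_t> guids, idxs;                             // one entry per token
      for (int i = 0; i < MAX_NUM_REQUESTS; i++) {
        if (request_completed[i]) {
          continue;
        }
        for (int j = 0; j < num_processing_tokens[i]; j++) {
          guids.push_back(request_guid[i]);
          idxs.push_back(token_start_idx[i] + j);
        }
      }
      assert(guids.size() == 5);  // 3 + 2 active tokens
      for (size_t t = 0; t < guids.size(); t++) {
        printf("token %zu -> request %zu, position %zu\n", t, guids[t], idxs[t]);
      }
      return 0;
    }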
+ */ + +#include "dataloader.h" +#include "flexflow/utils/cuda_helper.h" + +void DataLoader::load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + SampleIdxs *meta = (SampleIdxs *)task->local_args; + if (meta->num_samples == 0) { + return; + } + float const *full_input_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *batch_input_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + Domain full_input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain batch_input_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + coord_t token_dim = + batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; + coord_t sequence_length = + batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; + coord_t batch_size = + batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; + coord_t full_input_sequence_length = + full_input_domain.hi()[1] - full_input_domain.lo()[1] + 1; + assert(sequence_length == full_input_sequence_length); + + // Currently assume continous indices + assert(meta->num_samples <= batch_size * sequence_length); + for (int i = 1; i < meta->num_samples; i++) { + if (meta->guids[i] == meta->guids[i - 1]) { + assert(meta->idxs[i] == meta->idxs[i - 1] + 1); + } + } + // keep things simple for now + assert(batch_input_domain.get_volume() == + batch_size * sequence_length * token_dim); + + // pad inputs if needed (this is really only useful for debugging) + checkCUDA(cudaMemset( + batch_input_ptr, 0, batch_input_domain.get_volume() * sizeof(float))); + + if (!meta->incremental_mode) { + size_t num_requests = 0; + size_t guid; + for (size_t i = 0; i < meta->num_samples; i++) { + if (i == 0 || meta->guids[i] != guid) { + guid = meta->guids[0]; + num_requests++; + } + } + + coord_t start_idx = meta->guids[0]; + assert(batch_input_domain.get_volume() % batch_size == 0); + size_t size_to_copy = token_dim * sequence_length * num_requests; + float const *input_zc = + full_input_ptr + start_idx * token_dim * sequence_length; + copy_kernel<<>>( + batch_input_ptr, input_zc, size_to_copy); + checkCUDA(cudaDeviceSynchronize()); + return; + } + + size_t guid = meta->guids[0]; + size_t start_idx = meta->idxs[0]; + size_t dst_idx = 0; + size_t total_tokens = 0; + for (size_t i = 1; i <= meta->num_samples; i++) { + if (i == meta->num_samples || meta->guids[i] != guid) { + size_t size_to_copy = token_dim * (meta->idxs[i - 1] - start_idx + 1); + total_tokens += size_to_copy / token_dim; + float const *input_zc = full_input_ptr + + (guid * token_dim * sequence_length) + + start_idx * token_dim; + float *dst_ptr = batch_input_ptr + dst_idx * token_dim; + copy_kernel<<>>( + dst_ptr, input_zc, size_to_copy); + if (i < meta->num_samples) { + guid = meta->guids[i]; + start_idx = meta->idxs[i]; + } + dst_idx = i; + } + } + assert(total_tokens == meta->num_samples); + checkCUDA(cudaDeviceSynchronize()); +} diff --git a/examples/cpp/inference/dataloader.h b/examples/cpp/inference/dataloader.h new file mode 100644 index 0000000000..e67176d801 --- /dev/null +++ b/examples/cpp/inference/dataloader.h @@ -0,0 +1,61 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
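In incremental mode, the copy loop in load_input above walks the flattened (guid, position) list, detects request boundaries, and issues one contiguous copy per request slice. The offset arithmetic reduces to the following, counted in token_dim-sized float elements (illustrative values):

    #include <cstdio>

    int main() {
      size_t token_dim = 16, sequence_length = 640;
      size_t guid = 7;        // request id, also the row of the full input tensor
      size_t start_idx = 5;   // first token position of this request in the batch
      size_t last_idx = 9;    // last token position of this request in the batch
      size_t dst_token = 12;  // slot where this slice starts in the batch tensor

      size_t src_offset = guid * token_dim * sequence_length + start_idx * token_dim;
      size_t dst_offset = dst_token * token_dim;
      size_t size_to_copy = token_dim * (last_idx - start_idx + 1);
      printf("copy %zu floats from +%zu to +%zu\n", size_to_copy, src_offset, dst_offset);
      return 0;
    }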
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "data_generator.h" +#include "flexflow/model.h" +#include "inference_config.h" + +#include +#include +#include +#include +#include +#include + +using namespace Legion; +using namespace FlexFlow; + +class DataLoader { +public: + DataLoader(FFModel &ff, + InferenceConfig const &inferenceConfig, + DataGenerator &data_generator, + ParallelTensor input); + static void load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + static void load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + void next_batch(FFModel &, BatchConfig *); + +public: + size_t num_samples; + FlexFlow::ParallelTensor full_input, batch_input; + struct DataLoaderInput { + InferenceConfig const &_inferenceConfig; + DataGenerator &_data_generator; + }; +}; + +struct SampleIdxs { + bool incremental_mode; + size_t num_samples; + size_t idxs[MAX_SEQ_LEN]; // the id of each token within its request + size_t guids[MAX_SEQ_LEN]; // the guid of the request each token belongs to +}; \ No newline at end of file diff --git a/examples/cpp/inference/gpt_tokenizer.cpp b/examples/cpp/inference/gpt_tokenizer.cpp index cd413e468f..2d9b521fca 100644 --- a/examples/cpp/inference/gpt_tokenizer.cpp +++ b/examples/cpp/inference/gpt_tokenizer.cpp @@ -1,3 +1,18 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include "gpt_tokenizer.h" #include diff --git a/examples/cpp/inference/inference_config.h b/examples/cpp/inference/inference_config.h new file mode 100644 index 0000000000..cbd0a082e2 --- /dev/null +++ b/examples/cpp/inference/inference_config.h @@ -0,0 +1,69 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#define MAX_SEQ_LEN 1024 +#define BATCH_SIZE 2 +#define MNIST_DIMS 28 * 28 +#define DATA_DIM MNIST_DIMS + +struct InferenceConfig { + InferenceConfig(void) { + //----------------------- Input/output data ------------------------ + token_dim = DATA_DIM; + sequence_length = MAX_SEQ_LEN; + batch_size = BATCH_SIZE; + out_dim = 15; + num_labels = out_dim; + num_layers = 1; + //----------------------- Inference parameters --------------------- + // total number of requests processed as part of the simulation + total_requests = 2560; + poisson_distribution = true; + // average number of request arrivals per second + arrival_rate = 250; + num_inflight_batches = 5; + incremental_mode = false; + //----------------------- Rest of model parameters ------------------ + hidden_size = DATA_DIM; + // Encoder layer + num_attention_heads = 16; + attention_kdim = attention_vdim = hidden_size / num_attention_heads; + num_encoder_layers = 1; + } + + // Input/output data + int token_dim; + int sequence_length; + int batch_size; + int out_dim; + int num_labels; + int num_layers; + std::string dataset_path; + // Inference parameters + int total_requests; + bool poisson_distribution; + double arrival_rate; + int num_inflight_batches; + bool incremental_mode; + // Model parameters + int hidden_size; + int num_attention_heads; + int attention_kdim; + int attention_vdim; + int num_encoder_layers; +}; diff --git a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt index 81c4c184b4..ecfe29b793 100644 --- a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt +++ b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt @@ -6,11 +6,12 @@ set(project_target inference_moe) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} moe.cc - dataloader.cc - ../data_generator.cc) + ../dataloader.cc + ../data_generator.cc + ../gpt_tokenizer.cc) set(GPU_SRC - dataloader.cu) + ../dataloader.cu) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/examples/cpp/inference/mixture_of_experts/dataloader.cc b/examples/cpp/inference/mixture_of_experts/dataloader.cc deleted file mode 100644 index af32cfe98b..0000000000 --- a/examples/cpp/inference/mixture_of_experts/dataloader.cc +++ /dev/null @@ -1,297 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
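The InferenceConfig above simulates request traffic with poisson_distribution enabled and an arrival_rate of 250 requests per second. For a Poisson arrival process, inter-arrival gaps are independent exponential random variables with mean 1 / arrival_rate; the sketch below only illustrates that relationship and is not taken from the DataGenerator implementation.

    #include <iostream>
    #include <random>

    int main() {
      // Values mirroring InferenceConfig: 250 expected arrivals per second.
      double const arrival_rate = 250.0;
      int const requests_to_print = 10; // small count just for the demo

      std::mt19937 gen(42);
      // Poisson process <=> i.i.d. exponential gaps with mean 1 / arrival_rate.
      std::exponential_distribution<double> gap(arrival_rate);

      double t = 0.0;
      for (int i = 0; i < requests_to_print; i++) {
        t += gap(gen);
        std::cout << "request " << i << " arrives at t = " << t << " s\n";
      }
      return 0;
    }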
- */ - -#include "flexflow/inference.h" -#include "moe.h" -#include -#include -#include -#include -#include -#include - -using namespace Legion; - -DataLoader::DataLoader(FFModel &ff, - MoeConfig const &moeConfig, - DataGenerator &data_generator, - ParallelTensor input, - ParallelTensor label) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - - int numdims = input->num_dims; - int replica_idx = numdims - 1; - int batch_idx = numdims - 2; - num_samples = moeConfig.total_requests; - - // Create full input - { - batch_input = input; - - ParallelDim dims[numdims]; - for (int i = 0; i < numdims; i++) { - dims[i].size = input->dims[i].size; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = input->dims[i].is_replica_dim; - // Assume only the first dim can be the replica dim - assert(i == replica_idx || (!dims[i].is_replica_dim)); - } - assert(dims[batch_idx].size == ff.config.batchSize); - dims[batch_idx].size = num_samples; - - full_input = - ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_FLOAT); - ff.map_tensor(full_input, NULL /*parallel_op*/); - } - - // Create full label - { - assert(label->num_dims == numdims); - batch_label = label; - - ParallelDim dims[numdims]; - for (int i = 0; i < numdims; i++) { - dims[i].size = label->dims[i].size; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = label->dims[i].is_replica_dim; - // Assume only the last dim can be the replica dim - assert(i == replica_idx || (!dims[i].is_replica_dim)); - } - assert(dims[batch_idx].size == ff.config.batchSize); - // replace batch size with number of samples - dims[batch_idx].size = num_samples; - - full_label = - ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_INT32); - ff.map_tensor(full_label, NULL /*parallel_op*/); - } - - // Load entire dataset - // TODO: Use index launcher instead of task launcher - assert(full_input != nullptr && "full_input is nullptr"); - assert(full_label != nullptr && "full_label is nullptr"); - - DataLoaderInput dataloader_input = {moeConfig, data_generator}; - DataLoaderInput const *ptr = &dataloader_input; - - TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, - TaskArgument(ptr, sizeof(DataLoaderInput))); - // regions[0]: full_input - launcher.add_region_requirement(RegionRequirement(full_input->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - // regions[1]: full_label - launcher.add_region_requirement(RegionRequirement(full_label->region, - WRITE_ONLY, - EXCLUSIVE, - full_label->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(1, FID_DATA); - - runtime->execute_task(ctx, launcher); - reset(); -} - -// ================================================= -// Load data -// ================================================= - -void DataLoader::load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - DataLoaderInput const input_struct = *((DataLoaderInput *)task->args); - MoeConfig const &conf = input_struct._moeConfig; - DataGenerator &datagen = input_struct._data_generator; - assert(regions.size() == 2); - assert(task->regions.size() == regions.size()); - - // get input and label pointer - float *input_ptr = helperGetTensorPointerWO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int *label_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - Domain input_domain = runtime->get_index_space_domain( - ctx, 
task->regions[0].region.get_index_space()); - Domain label_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - int input_dims = input_domain.get_dim(); - for (int i = 0; i < input_dims; i++) { - int input_dim = input_domain.hi()[i] - input_domain.lo()[i] + 1; - int label_dim = label_domain.hi()[i] - label_domain.lo()[i] + 1; - assert(i == 0 || input_dim == label_dim); - } - - if (conf.dataset_path.length() == 0) { - printf("Input dataset path is empty, using random input samples\n"); - datagen.generate_requests(input_ptr, label_ptr, conf.num_labels); - } else { - // here, you can call `read_cifar100(input_ptr, label_ptr);` instead or load - // another dataset using the dataset_path from the MoeConfig object - // read_mnist(input_ptr, label_ptr); - // log_app.print("finish loading MNIST data\n"); - } -} - -void DataLoader::next_batch(FFModel &ff, size_t received_requests) { - if (received_requests == 0) { - return; - } - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - // Load input - { - Domain domain = - runtime->get_index_space_domain(ctx, batch_input->parallel_is); - ArgumentMap argmap; - int counter = 0; - // current limitation of the dataloader: only the batch dimension can be - // partitioned - int input_dims = batch_input->num_dims; - for (int i = 0; i < input_dims; i++) { - if (i != input_dims - 2) { - assert(batch_input->dims[i].degree == 1 && - "Dataloader only supports batch size partitions"); - } - } - int batch_size = batch_input->dims[input_dims - 2].size; - int n_partitions = batch_input->dims[input_dims - 2].degree; - assert(ff.config.batchSize % batch_size == 0); - assert(batch_size % n_partitions == 0); - for (Domain::DomainPointIterator it(domain); it; it++) { - SampleIdxs meta; - int requests_left = received_requests - counter; - meta.num_samples = std::min(batch_size / n_partitions, requests_left); - for (int i = 0; i < meta.num_samples; i++) { - meta.idxs[i] = next_index + counter; - counter++; - } - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - assert(counter == received_requests); - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, - batch_input->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_input->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(full_input->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_input->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - // Load label - { - Domain domain = - runtime->get_index_space_domain(ctx, batch_label->parallel_is); - ArgumentMap argmap; - int counter = 0; - // current limitation of the dataloader: only the batch dimension can be - // partitioned - int label_dims = batch_label->num_dims; - // assert(batch_label->dims[label_dims - 1].degree == 1); - for (int i = 0; i < label_dims; i++) { - assert(batch_label->dims[i].degree == 1 && - "Dataloader only supports batch size partitions"); - } - int batch_size = batch_label->dims[label_dims - 2].size; - int n_partitions = batch_label->dims[label_dims - 2].degree; - assert(ff.config.batchSize % batch_size == 0); - assert(batch_size % n_partitions == 0); - for (Domain::DomainPointIterator it(domain); it; it++) { - SampleIdxs meta; - int 
requests_left = received_requests - counter; - meta.num_samples = std::min(batch_size / n_partitions, requests_left); - for (int i = 0; i < meta.num_samples; i++) { - meta.idxs[i] = next_index + counter; - counter++; - } - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - assert(counter == received_requests); - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, - batch_label->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_label->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(full_label->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_label->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_label->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_label->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - next_index += received_requests; -} - -void DataLoader::reset() { - next_index = 0; -} - -void FlexFlow::register_custom_tasks() { - // Load entire dataset - { - TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Entire Dataset Task"); - } - // Load input - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Input Task"); - } - // Load label - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_2, "Load Labels"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Label Task"); - } -} diff --git a/examples/cpp/inference/mixture_of_experts/dataloader.cu b/examples/cpp/inference/mixture_of_experts/dataloader.cu deleted file mode 100644 index 2e234e9b32..0000000000 --- a/examples/cpp/inference/mixture_of_experts/dataloader.cu +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/utils/cuda_helper.h" -#include "moe.h" - -void DataLoader::load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - SampleIdxs *meta = (SampleIdxs *)task->local_args; - float const *full_input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *batch_input_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - - Domain full_input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain batch_input_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - - coord_t token_dim = - batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; - coord_t sequence_length = - batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; - coord_t batch_size = - batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; - - // FIXME: currently assume continous indices - assert(meta->num_samples <= batch_size); - for (int i = 1; i < meta->num_samples; i++) { - assert(meta->idxs[i] == meta->idxs[0] + i); - } - // pad inputs if needed (this is really only useful for debugging) - if (meta->num_samples < batch_size) { - checkCUDA(cudaMemset(batch_input_ptr + - token_dim * sequence_length * meta->num_samples, - 0, - token_dim * sequence_length * - (batch_size - meta->num_samples) * sizeof(float))); - } - coord_t start_idx = meta->idxs[0]; - assert(batch_input_domain.get_volume() % token_dim * sequence_length * - batch_size == - 0); - assert(batch_input_domain.get_volume() % batch_size == 0); - size_t size_to_copy = - (batch_input_domain.get_volume() / batch_size) * meta->num_samples; - float const *input_zc = - full_input_ptr + start_idx * token_dim * sequence_length; - copy_kernel<<>>( - batch_input_ptr, input_zc, size_to_copy); - checkCUDA(cudaDeviceSynchronize()); -} - -void DataLoader::load_label(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - SampleIdxs *meta = (SampleIdxs *)task->local_args; - int const *full_label_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int *batch_label_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - Domain full_label_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain batch_label_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - coord_t label_dim = - batch_label_domain.hi()[0] - batch_label_domain.lo()[0] + 1; - coord_t sequence_length = - batch_label_domain.hi()[1] - batch_label_domain.lo()[1] + 1; - coord_t batch_size = - batch_label_domain.hi()[2] - batch_label_domain.lo()[2] + 1; - // FIXME: currently assume continous indices - assert(meta->num_samples <= batch_size); - for (int i = 1; i < meta->num_samples; i++) { - assert(meta->idxs[i] == meta->idxs[0] + i); - } - if (meta->num_samples < batch_size) { - checkCUDA(cudaMemset(batch_label_ptr + - label_dim * sequence_length * meta->num_samples, - 0, - label_dim * sequence_length * - (batch_size - meta->num_samples) * sizeof(int))); - } - assert(batch_label_domain.get_volume() % label_dim * sequence_length * - batch_size == - 0); - assert(batch_label_domain.get_volume() % batch_size == 0); - coord_t start_idx = meta->idxs[0]; - size_t size_to_copy = 
- (batch_label_domain.get_volume() / batch_size) * meta->num_samples; - int const *input_zc = - full_label_ptr + start_idx * label_dim * sequence_length; - copy_kernel<<>>( - batch_label_ptr, input_zc, size_to_copy); - checkCUDA(cudaDeviceSynchronize()); -} diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 726ef5f7ff..a7e96da03a 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -73,18 +73,21 @@ Tensor create_moe_encoder(FFModel *model, std::vector axes = {0, 1, 2}; Tensor x = input; for (int i = 0; i < moeConfig->num_encoder_layers; i++) { - x = model->layer_norm( - model->add(model->multihead_attention(x, - x, - x, - moeConfig->hidden_size, - moeConfig->num_attention_heads, - moeConfig->attention_kdim, - moeConfig->attention_vdim), - x), - axes, - true, - 1e-05); + Tensor t = moeConfig->incremental_mode + ? model->inc_multihead_self_attention( + x, + moeConfig->hidden_size, + moeConfig->num_attention_heads, + moeConfig->attention_kdim, + moeConfig->attention_vdim) + : model->multihead_attention(x, + x, + x, + moeConfig->hidden_size, + moeConfig->num_attention_heads, + moeConfig->attention_kdim, + moeConfig->attention_vdim); + x = model->layer_norm(model->add(t, x), axes, true, 1e-05); x = model->layer_norm( model->add(create_moe(model, moeConfig, x), x), axes, true, 1e-05); } @@ -123,6 +126,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor t = create_moe_encoder(&ff, &moeConfig, input); // Tensor t = create_moe(&ff, &moeConfig, input); t = ff.dense(t, moeConfig.out_dim, AC_MODE_RELU); + t = ff.softmax(t); //------------------- Initialize the inference manager ------------------ InferenceManager im( @@ -131,15 +135,19 @@ void FlexFlow::top_level_task(Task const *task, im.init_operators_inference(); //------------ Initialize the data loader and data generator ------------ + size_t min_input_tokens = 32, max_input_tokens = 512, + min_tokens_to_generate = 1, max_tokens_to_generate = 128; DataGenerator data_generator(moeConfig.total_requests, moeConfig.token_dim, - moeConfig.sequence_length, + min_input_tokens, + max_input_tokens, + min_tokens_to_generate, + max_tokens_to_generate, moeConfig.poisson_distribution, moeConfig.arrival_rate); - ParallelTensor input_pt, label_pt; + ParallelTensor input_pt; ff.get_parallel_tensor_from_tensor(input, input_pt); - ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); - DataLoader data_loader(ff, moeConfig, data_generator, input_pt, label_pt); + DataLoader data_loader(ff, moeConfig, data_generator, input_pt); //----------------------- Start timer ----------------------------------- { @@ -154,22 +162,60 @@ void FlexFlow::top_level_task(Task const *task, int index = 0; int processed_requests = 0; int num_devices = ffConfig.workersPerNode * ffConfig.numNodes; - data_loader.reset(); data_generator.start_timer(); - BatchConfig bc; + std::map future_handlers; + std::map batch_configs; + std::pair new_prompts; + BatchConfig *bc = nullptr; + + // simulation loop. For deployment, we will use a while(true) while (processed_requests < moeConfig.total_requests) { - size_t received_requests = data_generator.get_requests(); - int iterations = (received_requests % moeConfig.batch_size == 0) - ? 
(received_requests / moeConfig.batch_size) - : (received_requests / moeConfig.batch_size) + 1; - for (int iter = 0; iter < iterations; iter++) { - data_loader.next_batch(ff, received_requests); - runtime->begin_trace(ctx, 111 + index % num_devices /*trace_id*/); - im.inference(index, bc); - runtime->end_trace(ctx, 111 + index % num_devices /*trace_id*/); - index++; + for (int bid = 0; bid < im.max_num_requests_per_batch; bid++) { + if (future_handlers.find(bid) == future_handlers.end()) { + size_t max_reqs = moeConfig.incremental_mode + ? bc->MAX_NUM_REQUESTS + : im.max_num_requests_per_batch; + size_t max_tkns = moeConfig.sequence_length * moeConfig.batch_size; + new_prompts = data_generator.get_requests(max_reqs, max_tkns); + assert(new_prompts.second <= BatchConfig::MAX_NUM_REQUESTS); + bc = new BatchConfig(moeConfig.incremental_mode); + } else { + Future future = future_handlers[bid]; + if (!future.is_ready(true /*subscribe*/)) { + continue; + } + InferenceResult ir = future.get_result(); + bc = batch_configs[bid]; + processed_requests += bc->update_results(ir); + size_t max_reqs = moeConfig.incremental_mode + ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() + : im.max_num_requests_per_batch; + size_t max_tkns = + moeConfig.sequence_length * moeConfig.batch_size - + (moeConfig.incremental_mode ? bc->num_active_tokens() : 0); + new_prompts = data_generator.get_requests(max_reqs, max_tkns); + } + for (size_t i = 0; i < new_prompts.second; i++) { + size_t guid = new_prompts.first + i; + std::pair seq_lens = + data_generator.get_request_length(guid); + assert(seq_lens.first >= min_input_tokens && + seq_lens.first <= max_input_tokens && + seq_lens.second >= min_tokens_to_generate && + seq_lens.second <= max_tokens_to_generate); + assert(bc->register_new_request(guid, seq_lens.first)); + } + bc->prepare_next_batch(); + // TODO: loading data + data_loader.next_batch(ff, bc); + + runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + FutureMap fm = im.inference(bid, *bc); + runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; } - processed_requests += received_requests; } //----------------------- End of inference! ------------------------------ diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index 56feb775e8..183229bc07 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -13,31 +13,13 @@ * limitations under the License. 
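The scheduling loop above keeps several batches in flight, polling one Legion future per batch and, before refilling a batch, bounding both the number of new requests and the number of new prompt tokens it asks the generator for. The worked example below uses the MAX_SEQ_LEN, BATCH_SIZE, and MAX_REQUESTS values this patch introduces, with hypothetical occupancy figures for one batch running in incremental mode.

    #include <cstdio>

    int main() {
      // Constants defined by this patch (inference_config.h / batch_config.h).
      int const MAX_SEQ_LEN = 1024, BATCH_SIZE = 2, MAX_REQUESTS = 256;

      // Hypothetical state of one in-flight batch in incremental mode.
      int active_requests = 3;
      int active_tokens = 1500;

      int max_new_requests = MAX_REQUESTS - active_requests;         // 253
      int max_new_tokens = MAX_SEQ_LEN * BATCH_SIZE - active_tokens; // 2048 - 1500 = 548
      std::printf("can admit up to %d requests and %d prompt tokens\n",
                  max_new_requests, max_new_tokens);
      return 0;
    }

In non-incremental mode the batch is rebuilt from scratch each step, so the budget is the full MAX_SEQ_LEN * BATCH_SIZE tokens and up to im.max_num_requests_per_batch requests.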
*/ -#include "data_generator.h" -#include "flexflow/model.h" -#define MAX_NUM_SAMPLES 1000 -#define MNIST_DIMS 28 * 28 -#define DATA_DIM MNIST_DIMS +#pragma once -using namespace Legion; -using namespace std; -using namespace FlexFlow; +#include "dataloader.h" +#include "inference_config.h" -struct MoeConfig { - MoeConfig(void) { - //----------------------- Input/output data ------------------------ - token_dim = DATA_DIM; - sequence_length = 10; - batch_size = 32; - out_dim = 15; - num_labels = out_dim; - //----------------------- Inference parameters --------------------- - // total number of requests processed as part of the simulation - total_requests = 256; - poisson_distribution = true; - // average number of request arrivals per second - arrival_rate = 25; - num_inflight_batches = 10; +struct MoeConfig : InferenceConfig { + MoeConfig(void) : InferenceConfig() { //----------------------- MoE layer -------------------------------- // total number of experts num_exp = 128; @@ -50,72 +32,12 @@ struct MoeConfig { lambda = 0.04f; // multiplier for load balance term // expert hidden size hidden_size = DATA_DIM; - //----------------------- Rest of model parameters ------------------ - // Encoder layer - num_attention_heads = 16; - attention_kdim = attention_vdim = hidden_size / num_attention_heads; - num_encoder_layers = 1; } - // Input/output data - int token_dim; - int sequence_length; - int batch_size; - int out_dim; - int num_labels; - std::string dataset_path; - // Inference parameters - int total_requests; - bool poisson_distribution; - double arrival_rate; - int num_inflight_batches; // MoE layer int num_exp; int experts_per_block; int num_select; float alpha; float lambda; - int hidden_size; - // Model parameters - int num_attention_heads; - int attention_kdim; - int attention_vdim; - int num_encoder_layers; -}; - -class DataLoader { -public: - DataLoader(FFModel &ff, - MoeConfig const &moeConfig, - DataGenerator &data_generator, - ParallelTensor input, - ParallelTensor label); - static void load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - static void load_label(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - static void load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - void next_batch(FFModel &, size_t); - void reset(void); - -public: - int num_samples, next_index; - FlexFlow::ParallelTensor full_input, batch_input; - FlexFlow::ParallelTensor full_label, batch_label; - struct DataLoaderInput { - MoeConfig const &_moeConfig; - DataGenerator &_data_generator; - }; -}; - -struct SampleIdxs { - int num_samples; - int idxs[MAX_NUM_SAMPLES]; -}; +}; \ No newline at end of file diff --git a/examples/cpp/inference/transformers/CMakeLists.txt b/examples/cpp/inference/transformers/CMakeLists.txt index d52beae3ad..e3cabdc324 100644 --- a/examples/cpp/inference/transformers/CMakeLists.txt +++ b/examples/cpp/inference/transformers/CMakeLists.txt @@ -6,11 +6,12 @@ set(project_target inference_transformers) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} transformers.cc - dataloader.cc - ../data_generator.cc) + ../dataloader.cc + ../data_generator.cc + ../gpt_tokenizer.cc) set(GPU_SRC - dataloader.cu) + ../dataloader.cu) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/examples/cpp/inference/transformers/dataloader.cc 
b/examples/cpp/inference/transformers/dataloader.cc deleted file mode 100644 index 6a1ccb2338..0000000000 --- a/examples/cpp/inference/transformers/dataloader.cc +++ /dev/null @@ -1,388 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/inference.h" -#include "transformers.h" -#include -#include -#include -#include -#include -#include - -using namespace Legion; - -DataLoader::DataLoader(FFModel &ff, - MoeConfig const &moeConfig, - DataGenerator &data_generator, - ParallelTensor input, - ParallelTensor label) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - - int numdims = input->num_dims; - int replica_idx = numdims - 1; - int batch_idx = numdims - 2; - num_samples = moeConfig.total_requests; - - // Create full input - { - batch_input = input; - - ParallelDim dims[numdims]; - for (int i = 0; i < numdims; i++) { - dims[i].size = input->dims[i].size; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = input->dims[i].is_replica_dim; - // Assume only the first dim can be the replica dim - assert(i == replica_idx || (!dims[i].is_replica_dim)); - } - assert(dims[batch_idx].size == BatchConfig::MAX_NUM_TOKENS); - dims[batch_idx].size = num_samples; - - full_input = - ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_FLOAT); - ff.map_tensor(full_input, NULL /*parallel_op*/); - } - - // Create full label - { - assert(label->num_dims == numdims); - batch_label = label; - - ParallelDim dims[numdims]; - for (int i = 0; i < numdims; i++) { - dims[i].size = label->dims[i].size; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = label->dims[i].is_replica_dim; - // Assume only the last dim can be the replica dim - assert(i == replica_idx || (!dims[i].is_replica_dim)); - } - assert(dims[batch_idx].size == BatchConfig::MAX_NUM_TOKENS); - // replace batch size with number of samples - dims[batch_idx].size = num_samples; - - full_label = - ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_INT32); - ff.map_tensor(full_label, NULL /*parallel_op*/); - } - - // Load entire dataset - // TODO: Use index launcher instead of task launcher - assert(full_input != nullptr && "full_input is nullptr"); - assert(full_label != nullptr && "full_label is nullptr"); - - DataLoaderInput dataloader_input = {moeConfig, data_generator}; - DataLoaderInput const *ptr = &dataloader_input; - - TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, - TaskArgument(ptr, sizeof(DataLoaderInput))); - // regions[0]: full_input - launcher.add_region_requirement(RegionRequirement(full_input->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - // regions[1]: full_label - launcher.add_region_requirement(RegionRequirement(full_label->region, - WRITE_ONLY, - EXCLUSIVE, - full_label->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(1, FID_DATA); - - runtime->execute_task(ctx, 
launcher); - reset(); -} - -// ================================================= -// Load data -// ================================================= - -void read_cifar100(float *input_ptr, int *label_ptr) { - std::ifstream file; - file.open("train.bin", std::ios::in | std::ios::binary | std::ios::ate); - if (!file) { - std::cout << "Error opening CIFAR100 train data file" << std::endl; - assert(false); - } - - file.seekg(0, std::ios::beg); - - // each sample: <1 x coarse label><1 x fine label><3072 x pixel> - for (std::size_t i = 0; i < MAX_NUM_SAMPLES; i++) { - unsigned char temp = 0; - file.read((char *)&temp, sizeof(temp)); // coarse label, skip - file.read((char *)&temp, sizeof(temp)); - label_ptr[i] = temp; - for (std::size_t j = 0; j < 3072; ++j) { - file.read((char *)&temp, sizeof(temp)); - input_ptr[i * 3072 + j] = (float)temp / 255.0f; - } - } - - file.close(); -} - -int reverseInt(int i) { - unsigned char c1, c2, c3, c4; - - c1 = i & 255; - c2 = (i >> 8) & 255; - c3 = (i >> 16) & 255; - c4 = (i >> 24) & 255; - - return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; -} - -/* NOTE: Download files from http://yann.lecun.com/exdb/mnist/ and unpack to -the current working directory */ -void read_mnist(float *input_ptr, int *label_ptr) { - // read inputs - std::ifstream input("train-images-idx3-ubyte", std::ios::binary); - if (input.is_open()) { - int magic_number = 0; - int number_of_images = 0; - int n_rows = 0; - int n_cols = 0; - input.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - input.read((char *)&number_of_images, sizeof(number_of_images)); - number_of_images = reverseInt(number_of_images); - input.read((char *)&n_rows, sizeof(n_rows)); - n_rows = reverseInt(n_rows); - input.read((char *)&n_cols, sizeof(n_cols)); - n_cols = reverseInt(n_cols); - - for (int i = 0; i < number_of_images; i++) { - for (int r = 0; r < n_rows; r++) { - for (int c = 0; c < n_cols; c++) { - unsigned char temp = 0; - input.read((char *)&temp, sizeof(temp)); - input_ptr[i * n_rows * n_cols + r * n_cols + c] = - (float)temp / 255.0f; - } - } - } - } else { - std::cout << "Error opening MNIST input data file" << std::endl; - assert(false); - } - - // read labels - std::ifstream labels("train-labels-idx1-ubyte", std::ios::binary); - if (labels.is_open()) { - int magic_number = 0; - int number_of_images = 0; - labels.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - labels.read((char *)&number_of_images, sizeof(number_of_images)); - number_of_images = reverseInt(number_of_images); - - for (int i = 0; i < number_of_images; i++) { - unsigned char temp = 0; - labels.read((char *)&temp, sizeof(temp)); - label_ptr[i] = temp; - } - } else { - std::cout << "Error opening MNIST label data file" << std::endl; - assert(false); - } -} - -void DataLoader::load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - DataLoaderInput const input_struct = *((DataLoaderInput *)task->args); - MoeConfig const &conf = input_struct._moeConfig; - DataGenerator &datagen = input_struct._data_generator; - assert(regions.size() == 2); - assert(task->regions.size() == regions.size()); - - // get input and label pointer - float *input_ptr = helperGetTensorPointerWO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int *label_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - Domain input_domain = runtime->get_index_space_domain( - ctx, 
task->regions[0].region.get_index_space()); - Domain label_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - int input_dims = input_domain.get_dim(); - for (int i = 0; i < input_dims; i++) { - int input_dim = input_domain.hi()[i] - input_domain.lo()[i] + 1; - int label_dim = label_domain.hi()[i] - label_domain.lo()[i] + 1; - assert(i == 0 || input_dim == label_dim); - } - - if (conf.dataset_path.length() == 0) { - printf("Input dataset path is empty, using random input samples\n"); - datagen.generate_requests(input_ptr, label_ptr, conf.num_labels); - } else { - // here, you can call `read_cifar100(input_ptr, label_ptr);` instead or load - // another dataset using the dataset_path from the MoeConfig object - // read_mnist(input_ptr, label_ptr); - // log_app.print("finish loading MNIST data\n"); - } -} - -void DataLoader::next_batch(FFModel &ff, size_t received_requests) { - if (received_requests == 0) { - return; - } - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - // Load input - { - Domain domain = - runtime->get_index_space_domain(ctx, batch_input->parallel_is); - ArgumentMap argmap; - int counter = 0; - // current limitation of the dataloader: only the batch dimension can be - // partitioned - int input_dims = batch_input->num_dims; - for (int i = 0; i < input_dims; i++) { - if (i != input_dims - 2) { - assert(batch_input->dims[i].degree == 1 && - "Dataloader only supports batch size partitions"); - } - } - int batch_size = batch_input->dims[input_dims - 2].size; - int n_partitions = batch_input->dims[input_dims - 2].degree; - assert(ff.config.batchSize % batch_size == 0); - assert(batch_size % n_partitions == 0); - for (Domain::DomainPointIterator it(domain); it; it++) { - SampleIdxs meta; - int requests_left = received_requests - counter; - meta.num_samples = std::min(batch_size / n_partitions, requests_left); - for (int i = 0; i < meta.num_samples; i++) { - meta.idxs[i] = next_index + counter; - counter++; - } - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - assert(counter == received_requests); - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, - batch_input->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_input->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(full_input->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_input->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - // Load label - { - Domain domain = - runtime->get_index_space_domain(ctx, batch_label->parallel_is); - ArgumentMap argmap; - int counter = 0; - // current limitation of the dataloader: only the batch dimension can be - // partitioned - int label_dims = batch_label->num_dims; - // assert(batch_label->dims[label_dims - 1].degree == 1); - for (int i = 0; i < label_dims; i++) { - assert(batch_label->dims[i].degree == 1 && - "Dataloader only supports batch size partitions"); - } - int batch_size = batch_label->dims[label_dims - 2].size; - int n_partitions = batch_label->dims[label_dims - 2].degree; - assert(ff.config.batchSize % batch_size == 0); - assert(batch_size % n_partitions == 0); - for (Domain::DomainPointIterator it(domain); it; it++) { - SampleIdxs meta; - int 
requests_left = received_requests - counter; - meta.num_samples = std::min(batch_size / n_partitions, requests_left); - for (int i = 0; i < meta.num_samples; i++) { - meta.idxs[i] = next_index + counter; - counter++; - } - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - } - assert(counter == received_requests); - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_2, - batch_label->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_label->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(full_label->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_label->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_label->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_label->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } - next_index += received_requests; -} - -void DataLoader::reset() { - next_index = 0; -} - -void FlexFlow::register_custom_tasks() { - // Load entire dataset - { - TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Entire Dataset Task"); - } - // Load input - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Input Task"); - } - // Load label - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_2, "Load Labels"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Label Task"); - } -} diff --git a/examples/cpp/inference/transformers/dataloader.cu b/examples/cpp/inference/transformers/dataloader.cu deleted file mode 100644 index 4624b562e9..0000000000 --- a/examples/cpp/inference/transformers/dataloader.cu +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/utils/cuda_helper.h" -#include "transformers.h" - -void DataLoader::load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - SampleIdxs *meta = (SampleIdxs *)task->local_args; - float const *full_input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *batch_input_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - - Domain full_input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain batch_input_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - - coord_t token_dim = - batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; - coord_t sequence_length = - batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; - coord_t batch_size = - batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; - - // FIXME: currently assume continous indices - assert(meta->num_samples <= batch_size); - for (int i = 1; i < meta->num_samples; i++) { - assert(meta->idxs[i] == meta->idxs[0] + i); - } - // pad inputs if needed (this is really only useful for debugging) - if (meta->num_samples < batch_size) { - checkCUDA(cudaMemset(batch_input_ptr + - token_dim * sequence_length * meta->num_samples, - 0, - token_dim * sequence_length * - (batch_size - meta->num_samples) * sizeof(float))); - } - coord_t start_idx = meta->idxs[0]; - assert(batch_input_domain.get_volume() % token_dim * sequence_length * - batch_size == - 0); - assert(batch_input_domain.get_volume() % batch_size == 0); - size_t size_to_copy = - (batch_input_domain.get_volume() / batch_size) * meta->num_samples; - float const *input_zc = - full_input_ptr + start_idx * token_dim * sequence_length; - copy_kernel<<>>( - batch_input_ptr, input_zc, size_to_copy); - checkCUDA(cudaDeviceSynchronize()); -} - -void DataLoader::load_label(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - SampleIdxs *meta = (SampleIdxs *)task->local_args; - int const *full_label_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int *batch_label_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - Domain full_label_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain batch_label_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - coord_t label_dim = - batch_label_domain.hi()[0] - batch_label_domain.lo()[0] + 1; - coord_t sequence_length = - batch_label_domain.hi()[1] - batch_label_domain.lo()[1] + 1; - coord_t batch_size = - batch_label_domain.hi()[2] - batch_label_domain.lo()[2] + 1; - // FIXME: currently assume continous indices - assert(meta->num_samples <= batch_size); - for (int i = 1; i < meta->num_samples; i++) { - assert(meta->idxs[i] == meta->idxs[0] + i); - } - if (meta->num_samples < batch_size) { - checkCUDA(cudaMemset(batch_label_ptr + - label_dim * sequence_length * meta->num_samples, - 0, - label_dim * sequence_length * - (batch_size - meta->num_samples) * sizeof(int))); - } - assert(batch_label_domain.get_volume() % label_dim * sequence_length * - batch_size == - 0); - assert(batch_label_domain.get_volume() % batch_size == 0); - coord_t start_idx = meta->idxs[0]; - size_t 
size_to_copy = - (batch_label_domain.get_volume() / batch_size) * meta->num_samples; - int const *input_zc = - full_label_ptr + start_idx * label_dim * sequence_length; - copy_kernel<<>>( - batch_label_ptr, input_zc, size_to_copy); - checkCUDA(cudaDeviceSynchronize()); -} diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index f67d779e3a..45d676fe3c 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -26,7 +26,7 @@ using namespace Legion; LegionRuntime::Logger::Category log_app("Transformers"); -void parse_input_args(char **argv, int argc, MoeConfig &config) { +void parse_input_args(char **argv, int argc, TransformerConfig &config) { for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--dataset")) { config.dataset_path = std::string(argv[++i]); @@ -35,20 +35,23 @@ void parse_input_args(char **argv, int argc, MoeConfig &config) { } } -Tensor create_inc_multihead_attention_decoder(FFModel *model, - MoeConfig const *moeConfig, - Tensor const &input) { +Tensor create_inc_multihead_attention_decoder( + FFModel *model, + TransformerConfig const *transformerConfig, + Tensor const &input) { std::vector axes{1}; - Tensor t = model->inc_multihead_self_attention(input, - moeConfig->hidden_size, - moeConfig->num_attention_heads, - moeConfig->attention_kdim, - moeConfig->attention_vdim); + Tensor t = model->inc_multihead_self_attention( + input, + transformerConfig->hidden_size, + transformerConfig->num_attention_heads, + transformerConfig->attention_kdim, + transformerConfig->attention_vdim); t = model->layer_norm(model->add(t, input), axes, true, 1e-05); Tensor x = model->dense( - model->dense(t, moeConfig->hidden_size, AC_MODE_RELU, false /*bias*/), - moeConfig->hidden_size, + model->dense( + t, transformerConfig->hidden_size, AC_MODE_RELU, false /*bias*/), + transformerConfig->hidden_size, AC_MODE_NONE, false /*bias*/); t = model->layer_norm(model->add(x, t), axes, true, 1e-05); @@ -60,14 +63,14 @@ void FlexFlow::top_level_task(Task const *task, Context ctx, Runtime *runtime) { //----------------------- Initial configurations ------------------------ - MoeConfig moeConfig; + TransformerConfig transformerConfig; FFConfig ffConfig; - ffConfig.batchSize = moeConfig.batch_size; + ffConfig.batchSize = transformerConfig.batch_size; { InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args(argv, argc, moeConfig); + parse_input_args(argv, argc, transformerConfig); log_app.print("batchSize(%d) workersPerNodes(%d) numNodes(%d)", ffConfig.batchSize, ffConfig.workersPerNode, @@ -78,34 +81,40 @@ void FlexFlow::top_level_task(Task const *task, //----------------------- Create inputs -------------------------------- Tensor input; { - int const dims[] = {BatchConfig::MAX_NUM_TOKENS, moeConfig.token_dim}; + int const dims[] = {BatchConfig::MAX_NUM_TOKENS, + transformerConfig.token_dim}; input = ff.create_tensor<2>(dims, DT_FLOAT); } //----------------------- Define the model ------------------------------ Tensor t = input; - for (int i = 0; i < moeConfig.num_layers; i++) { - t = create_inc_multihead_attention_decoder(&ff, &moeConfig, input); + for (int i = 0; i < transformerConfig.num_layers; i++) { + t = create_inc_multihead_attention_decoder(&ff, &transformerConfig, input); } - t = ff.dense(t, moeConfig.out_dim, AC_MODE_RELU); + t = ff.dense(t, transformerConfig.out_dim, 
AC_MODE_RELU); t = ff.softmax(t); //------------------- Initialize the inference manager ------------------ - InferenceManager im( - &ff, moeConfig.batch_size, moeConfig.num_inflight_batches); + InferenceManager im(&ff, + transformerConfig.batch_size, + transformerConfig.num_inflight_batches); im.compile_model_and_allocate_buffer(); im.init_operators_inference(); //------------ Initialize the data loader and data generator ------------ - DataGenerator data_generator(moeConfig.total_requests, - moeConfig.token_dim, - moeConfig.sequence_length, - moeConfig.poisson_distribution, - moeConfig.arrival_rate); - ParallelTensor input_pt, label_pt; + size_t min_input_tokens = 32, max_input_tokens = 512, + min_tokens_to_generate = 1, max_tokens_to_generate = 128; + DataGenerator data_generator(transformerConfig.total_requests, + transformerConfig.token_dim, + min_input_tokens, + max_input_tokens, + min_tokens_to_generate, + max_tokens_to_generate, + transformerConfig.poisson_distribution, + transformerConfig.arrival_rate); + ParallelTensor input_pt; ff.get_parallel_tensor_from_tensor(input, input_pt); - ff.get_parallel_tensor_from_tensor(ff.label_tensor, label_pt); - DataLoader data_loader(ff, moeConfig, data_generator, input_pt, label_pt); + DataLoader data_loader(ff, transformerConfig, data_generator, input_pt); //----------------------- Start timer ----------------------------------- { @@ -120,54 +129,60 @@ void FlexFlow::top_level_task(Task const *task, int index = 0; int processed_requests = 0; int num_devices = ffConfig.workersPerNode * ffConfig.numNodes; - data_loader.reset(); data_generator.start_timer(); std::map future_handlers; std::map batch_configs; - while (processed_requests < moeConfig.total_requests) { - for (int bid = 0; bid < im.max_num_inflight_batches; bid++) { + std::pair new_prompts; + BatchConfig *bc = nullptr; + + // simulation loop. For deployment, we will use a while(true) + while (processed_requests < transformerConfig.total_requests) { + for (int bid = 0; bid < im.max_num_requests_per_batch; bid++) { if (future_handlers.find(bid) == future_handlers.end()) { - std::vector>> prompts; - assert(im.max_num_requests_per_batch <= BatchConfig::MAX_NUM_REQUESTS); - data_generator.get_requests(im.max_num_requests_per_batch, prompts); - assert((int)prompts.size() < im.max_num_requests_per_batch); - // TODO: loading data - BatchConfig *bc = new BatchConfig(); - for (auto const &prompt : prompts) { - assert(bc->register_new_request(prompt.first, prompt.second.size())); - } - bc->prepare_next_batch(); - runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); - FutureMap fm = im.inference(bid, *bc); - runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - future_handlers[bid] = future; - batch_configs[bid] = bc; + size_t max_reqs = transformerConfig.incremental_mode + ? 
bc->MAX_NUM_REQUESTS + : im.max_num_requests_per_batch; + size_t max_tkns = + transformerConfig.sequence_length * transformerConfig.batch_size; + new_prompts = data_generator.get_requests(max_reqs, max_tkns); + assert(new_prompts.second <= BatchConfig::MAX_NUM_REQUESTS); + bc = new BatchConfig(transformerConfig.incremental_mode); } else { Future future = future_handlers[bid]; if (!future.is_ready(true /*subscribe*/)) { continue; } InferenceResult ir = future.get_result(); - BatchConfig *bc = batch_configs[bid]; + bc = batch_configs[bid]; processed_requests += bc->update_results(ir); - int available_slots = - BatchConfig::MAX_NUM_REQUESTS - bc->num_active_requests(); - std::vector>> prompts; - data_generator.get_requests(available_slots, prompts); - processed_requests += prompts.size(); - for (auto const &prompt : prompts) { - assert(bc->register_new_request(prompt.first, prompt.second.size())); - } - bc->prepare_next_batch(); - runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); - FutureMap fm = im.inference(bid, *bc); - runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); - assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; + size_t max_reqs = transformerConfig.incremental_mode + ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() + : im.max_num_requests_per_batch; + size_t max_tkns = + transformerConfig.sequence_length * transformerConfig.batch_size - + (transformerConfig.incremental_mode ? bc->num_active_tokens() : 0); + new_prompts = data_generator.get_requests(max_reqs, max_tkns); } + for (size_t i = 0; i < new_prompts.second; i++) { + size_t guid = new_prompts.first + i; + std::pair seq_lens = + data_generator.get_request_length(guid); + assert(seq_lens.first >= min_input_tokens && + seq_lens.first <= max_input_tokens && + seq_lens.second >= min_tokens_to_generate && + seq_lens.second <= max_tokens_to_generate); + assert(bc->register_new_request(guid, seq_lens.first)); + } + bc->prepare_next_batch(); + // TODO: loading data + data_loader.next_batch(ff, bc); + + runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + FutureMap fm = im.inference(bid, *bc); + runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; } } //----------------------- End of inference! ------------------------------ @@ -183,5 +198,5 @@ void FlexFlow::top_level_task(Task const *task, double run_time = 1e-6 * (ts_end - ts_start); printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f requests/s\n", run_time, - moeConfig.total_requests / run_time); + transformerConfig.total_requests / run_time); } diff --git a/examples/cpp/inference/transformers/transformers.h b/examples/cpp/inference/transformers/transformers.h index 207a5de56b..0957bd33bb 100644 --- a/examples/cpp/inference/transformers/transformers.h +++ b/examples/cpp/inference/transformers/transformers.h @@ -13,111 +13,11 @@ * limitations under the License. 
*/ -#include "data_generator.h" -#include "flexflow/model.h" -#define MAX_NUM_SAMPLES 1000 -#define MNIST_DIMS 28 * 28 -#define DATA_DIM MNIST_DIMS +#pragma once -using namespace Legion; -using namespace std; -using namespace FlexFlow; +#include "dataloader.h" +#include "inference_config.h" -struct MoeConfig { - MoeConfig(void) { - //----------------------- Input/output data ------------------------ - token_dim = DATA_DIM; - sequence_length = 10; - batch_size = 32; - out_dim = 15; - num_labels = out_dim; - num_layers = 1; - //----------------------- Inference parameters --------------------- - // total number of requests processed as part of the simulation - total_requests = 256; - poisson_distribution = true; - // average number of request arrivals per second - arrival_rate = 25; - num_inflight_batches = 10; - //----------------------- MoE layer -------------------------------- - // total number of experts - num_exp = 128; - // number of experts in each block of fused experts - experts_per_block = 32; - // number of experts to route each token to - num_select = 2; - // expert capacity parameters - alpha = 2.0f; // factor overhead tensor size for imbalance - lambda = 0.04f; // multiplier for load balance term - // expert hidden size - hidden_size = DATA_DIM; - //----------------------- Rest of model parameters ------------------ - // Encoder layer - num_attention_heads = 16; - attention_kdim = attention_vdim = hidden_size / num_attention_heads; - num_encoder_layers = 1; - } - - // Input/output data - int token_dim; - int sequence_length; - int batch_size; - int out_dim; - int num_labels; - int num_layers; - std::string dataset_path; - // Inference parameters - int total_requests; - bool poisson_distribution; - double arrival_rate; - int num_inflight_batches; - // MoE layer - int num_exp; - int experts_per_block; - int num_select; - float alpha; - float lambda; - int hidden_size; - // Model parameters - int num_attention_heads; - int attention_kdim; - int attention_vdim; - int num_encoder_layers; -}; - -class DataLoader { -public: - DataLoader(FFModel &ff, - MoeConfig const &moeConfig, - DataGenerator &data_generator, - ParallelTensor input, - ParallelTensor label); - static void load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - static void load_label(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - static void load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - void next_batch(FFModel &, size_t); - void reset(void); - -public: - int num_samples, next_index; - FlexFlow::ParallelTensor full_input, batch_input; - FlexFlow::ParallelTensor full_label, batch_label; - struct DataLoaderInput { - MoeConfig const &_moeConfig; - DataGenerator &_data_generator; - }; -}; - -struct SampleIdxs { - int num_samples; - int idxs[MAX_NUM_SAMPLES]; +struct TransformerConfig : InferenceConfig { + TransformerConfig(void) : InferenceConfig() {} }; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index c2a77cf3d6..c5dd2ac90f 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,26 +16,29 @@ #pragma once #include +#define MAX_SEQ_LEN 1024 +#define BATCH_SIZE 2 +#define MAX_REQUESTS 256 namespace FlexFlow { struct InferenceResult { - static int const MAX_NUM_TOKENS = 1024; + static int const MAX_NUM_TOKENS = MAX_SEQ_LEN * BATCH_SIZE; int results[MAX_NUM_TOKENS]; }; class BatchConfig { public: - BatchConfig(); + 
BatchConfig(bool _incremental_mode); bool register_new_request(size_t guid, int length); void prepare_next_batch(); int update_results(InferenceResult const &ir); bool update_num_active_requests_tokens(); int num_active_requests() const; int num_active_tokens() const; - static int const MAX_NUM_REQUESTS = 256; + static int const MAX_NUM_REQUESTS = MAX_REQUESTS; static int const MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; - static int const MAX_SEQUENCE_LENGTH = 1024; + static int const MAX_SEQUENCE_LENGTH = MAX_SEQ_LEN; // These are set by update int num_tokens, num_requests; bool cached_results; @@ -44,6 +47,7 @@ class BatchConfig { int num_processing_tokens[MAX_NUM_REQUESTS]; size_t request_guid[MAX_NUM_REQUESTS]; bool request_completed[MAX_NUM_REQUESTS]; + bool incremental_mode; }; }; // namespace FlexFlow diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1690e0e3f3..56d4176e10 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -187,7 +187,6 @@ class Op { // Pure virtual functions that must be implemented virtual void init(FFModel const &) = 0; virtual void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) { diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 27b9981ddd..9200c4b123 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -37,7 +37,6 @@ class Aggregate : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index ce1ceb34d6..2c17674181 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -29,7 +29,6 @@ class AggregateSpec : public Op { char const *name); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index c3146ad38b..3f4c14593f 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -66,7 +66,6 @@ class MultiHeadAttention : public Op { std::vector const &inputs); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 5a14acb80b..9c2e6c1252 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -27,7 +27,6 @@ class ElementBinary : public Op { bool inplace_a = false); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 58640ec085..e75a14089c 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -64,7 +64,7 @@ class ExpertsMeta : public OpMeta { }; // definitions for the CUDA kernel -#define MAX_BATCH_SIZE 32 * 10 +#define MAX_BATCH_SIZE 1024 * 2 // 32 * 10 #define MAX_EXPERTS_PER_BLOCK 32 class Experts : public Op { @@ -96,7 
+96,6 @@ class Experts : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index bef61dc755..ae421751c3 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -36,7 +36,6 @@ class Group_by : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 4f2dc80635..c60f2089cc 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -60,7 +60,6 @@ class IncMultiHeadSelfAttention : public Op { std::vector const &inputs); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 284f42a716..60987471b2 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -26,7 +26,6 @@ class LayerNorm : public Op { char const *name); void init(FFModel const &); void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index b0af71e610..666d9228ab 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -37,7 +37,6 @@ class Linear : public Op { void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/noop.h b/include/flexflow/ops/noop.h index 91ccc15094..add4150e85 100644 --- a/include/flexflow/ops/noop.h +++ b/include/flexflow/ops/noop.h @@ -19,7 +19,6 @@ class NoOp : public Op { char const *name = NULL); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 85eecfb744..04f1283f89 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -23,7 +23,6 @@ class Softmax : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 138c11b4d7..ec3691ea11 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -30,7 +30,6 @@ class TopK : public Op { char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/parallel_ops/partition.h b/include/flexflow/parallel_ops/partition.h index 1658759411..21eda315ed 100644 --- a/include/flexflow/parallel_ops/partition.h +++ 
b/include/flexflow/parallel_ops/partition.h @@ -31,7 +31,6 @@ class Repartition : public ParallelOp { std::vector const &batch_outputs) override; void init(FFModel const &) override; void init_inference(FFModel const &, - BatchConfig const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 12ab38efad..304331f485 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -183,7 +183,6 @@ Node Aggregate::deserialize(FFModel &ff, } void Aggregate::init_inference(FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index b5c3551cb3..e076695a2f 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -157,7 +157,6 @@ AggregateSpec::AggregateSpec(FFModel &model, void AggregateSpec::init_inference( FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/attention.cc b/src/ops/attention.cc index f0195c95d6..75923e8da2 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -374,7 +374,6 @@ MultiHeadAttention::MultiHeadAttention( void MultiHeadAttention::init_inference( FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 01f68919bd..11d5ff6012 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -262,7 +262,6 @@ void ElementBinary::do_inplace_output(void) { void ElementBinary::init_inference( FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/experts.cc b/src/ops/experts.cc index b57874ac35..60a9948ec0 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -438,7 +438,6 @@ Node Experts::deserialize(FFModel &ff, } void Experts::init_inference(FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index ae274a1672..53c78538e2 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -165,7 +165,6 @@ Group_by::Group_by(FFModel &model, model, inputs.first, inputs.second, params.n, params.alpha, name) {} void Group_by::init_inference(FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index efbf8636de..fc423caea3 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -345,7 +345,6 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( void IncMultiHeadSelfAttention::init_inference( FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index dfd89c1fe6..4f0703dcc6 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -195,7 +195,6 @@ LayerNorm::LayerNorm(FFModel &model, } void LayerNorm::init_inference(FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) 
{ diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 84a962c998..381110a4d3 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -255,7 +255,6 @@ void Linear::init(FFModel const &ff) { } void Linear::init_inference(FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/noop.cc b/src/ops/noop.cc index 2a2686cabd..d35d5d48b7 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -96,7 +96,6 @@ OpMeta *NoOp::init_task(Task const *task, } void NoOp::init_inference(FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 310c835ab8..9543e34a90 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -116,7 +116,6 @@ Softmax::Softmax(FFModel &model, : Softmax(model, input, params.dim, name) {} void Softmax::init_inference(FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 6192010c51..5cefe955b1 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -137,7 +137,6 @@ TopK::TopK(FFModel &model, : TopK(model, input, params.k, params.sorted, name) {} void TopK::init_inference(FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 49ae3222f0..aaa28b7576 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -103,7 +103,6 @@ OpMeta *Repartition::init_task(Task const *task, void Repartition::init_inference( FFModel const &ff, - BatchConfig const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 936590905b..d58204b7c3 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -21,7 +21,8 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_bc("BatchConfig"); -BatchConfig::BatchConfig() { +BatchConfig::BatchConfig(bool _incremental_mode) + : incremental_mode(_incremental_mode) { cached_results = false; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { token_start_idx[i] = 0; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 5c8be135a4..dcb6e9e67d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -111,8 +111,7 @@ void InferenceManager::init_operators_inference() { ((ParallelOp *)op) ->create_input_partition_inference(*model, inputs, outputs); } - BatchConfig bc; - op->init_inference(*model, bc, inputs, outputs, view); + op->init_inference(*model, inputs, outputs, view); } } } From 61f53ba5d8bc1b4729f0981a811b712baf753df6 Mon Sep 17 00:00:00 2001 From: daiyaanarfeen Date: Sun, 19 Mar 2023 10:02:14 -0700 Subject: [PATCH 075/344] [Tensor Equal] ParallelTensor comparison (#613) * [Tensor Equal] Task signature * Tensor Equal task implementation * linting * Revert "linting" This reverts commit b9762f65697e6490386cd6cca5071b8d4afad1b4. 
* linting --------- Co-authored-by: Gabriele Oliaro --- include/flexflow/model.h | 2 + include/flexflow/parallel_tensor.h | 14 +++++++ scripts/format.sh | 2 +- src/mapper/mapper.cc | 5 +++ src/runtime/model.cc | 9 +++++ src/runtime/parallel_tensor.cc | 62 ++++++++++++++++++++++++++++++ 6 files changed, 93 insertions(+), 1 deletion(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4d55f574b2..35b2c13d40 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -225,6 +225,8 @@ enum TaskIDs { // Make sure PYTHON_TOP_LEVEL_TASK_ID is // consistent with python/main.cc PYTHON_TOP_LEVEL_TASK_ID = 11111, + // Tensor Equal Task + TENSOR_EQUAL_TASK_ID, }; enum ShardingID { diff --git a/include/flexflow/parallel_tensor.h b/include/flexflow/parallel_tensor.h index db77b49030..d06ecd7bac 100644 --- a/include/flexflow/parallel_tensor.h +++ b/include/flexflow/parallel_tensor.h @@ -169,6 +169,20 @@ struct ParallelTensorBase { bool get_tensor(FFModel const *model, T *data, bool get_parameters); ParallelTensorShape get_shape() const; + template + bool tensor_equal(FFConfig &config, ParallelTensorBase &tensor); + static bool + tensor_equal_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + template + static bool tensor_equal_task_with_dim( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + private: template bool get_input_sub_tensor_via_mappings(ParallelConfig const &pc, diff --git a/scripts/format.sh b/scripts/format.sh index 6340a33864..bf13948955 100755 --- a/scripts/format.sh +++ b/scripts/format.sh @@ -67,5 +67,5 @@ if [[ ! -e $CLANG_FORMAT_PATH ]]; then chmod u+x "$CLANG_FORMAT_PATH" fi -mapfile -t FILES < <(git ls-files | grep -E '\.(h|cc|cpp|cu)$') +mapfile -t FILES < <(git ls-files | grep -E '\.(h|cc|cpp|cu)$' | grep -v '^triton') "$CLANG_FORMAT_PATH" -i "${FILES[@]}" diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index f89c9b4e63..a0b3dba3ff 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -362,6 +362,11 @@ void FFMapper::select_task_options(const MapperContext ctx, } } + if (task.task_id == TENSOR_EQUAL_TASK_ID) { + output.initial_proc = all_cpus[0]; + return; + } + // Assert that all single tasks should be handled and returned before // So task must be an indextask if (!task.is_index_space) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5086d7bb3f..7fcf1ef61f 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4792,6 +4792,15 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "Weights Prefetch Task"); } + // Tensor Equal task + { + TaskVariantRegistrar registrar(TENSOR_EQUAL_TASK_ID, "Tensor Equal"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Tensor Equal Task"); + } } // template instantiations diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 963ad8af73..a64d118fbc 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -747,6 +747,64 @@ bool ParallelTensorBase::get_tensor(FFModel const *ff, return true; } +template +bool ParallelTensorBase::tensor_equal(FFConfig &config, + ParallelTensorBase &tensor) { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + TaskLauncher launcher(TENSOR_EQUAL_TASK_ID, + TaskArgument(&num_dims, sizeof(num_dims))); + 
launcher.add_region_requirement(
+      RegionRequirement(region, READ_ONLY, EXCLUSIVE, region));
+  launcher.add_field(0, FID_DATA);
+  launcher.add_region_requirement(
+      RegionRequirement(tensor.region, READ_ONLY, EXCLUSIVE, tensor.region));
+  launcher.add_field(1, FID_DATA);
+  Future result = runtime->execute_task(ctx, launcher);
+  bool equals = result.get_result<bool>();
+  return equals;
+}
+
+bool ParallelTensorBase::tensor_equal_task(
+    Task const *task,
+    std::vector<PhysicalRegion> const &regions,
+    Context ctx,
+    Runtime *runtime) {
+  assert(regions.size() == 2);
+  int dim = *(int const *)task->args;
+  switch (dim) {
+#define DIMFUNC(DIM)                                                          \
+  case DIM:                                                                   \
+    return tensor_equal_task_with_dim<DIM>(task, regions, ctx, runtime);
+    LEGION_FOREACH_N(DIMFUNC)
+#undef DIMFUNC
+    default:
+      assert(false);
+  }
+  assert(false);
+}
+
+template <int DIM>
+bool ParallelTensorBase::tensor_equal_task_with_dim(
+    Task const *task,
+    std::vector<PhysicalRegion> const &regions,
+    Context ctx,
+    Runtime *runtime) {
+  TensorAccessorR<float, DIM> acc1(
+      regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  TensorAccessorR<float, DIM> acc2(
+      regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  float const *data1 = acc1.ptr;
+  float const *data2 = acc2.ptr;
+  bool equal = true;
+  for (size_t i = 0; i < acc1.rect.volume(); i++) {
+    if (data1[i] != data2[i]) {
+      equal = false;
+      break;
+    }
+  }
+  return equal;
+}
+
 template float *ParallelTensorBase::get_raw_ptr<float>(FFConfig &config);
 template int32_t *ParallelTensorBase::get_raw_ptr<int32_t>(FFConfig &config);
@@ -796,6 +854,10 @@ template bool ParallelTensorBase::get_tensor<int64_t>(FFModel const *ff, int64_t *data, bool get_gradients);
+template bool
+    ParallelTensorBase::tensor_equal<float>(FFConfig &config,
+                                            ParallelTensorBase &tensor);
+
 template bool TensorBase::get_output_parallel_tensor<float>(FFModel const *ff, float *data, bool get_gradients);

From 5b21ae8f167c9bde5f564d39a4dcde2824d7b0ea Mon Sep 17 00:00:00 2001
From: zwang86 <46699021+zwang86@users.noreply.github.com>
Date: Fri, 24 Mar 2023 16:19:44 -0400
Subject: [PATCH 076/344] [MoE][Experts] Fixed bug and verified experts kernel
 (#661)

* Fixed bug with bias in experts kernel.
* Removed debug output and formatted code.
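With the fix, each expert's bias is treated as an out_dim x 1 column vector (hence no transpose on the bias operand of the batched GEMM), and a token's aggregated output is the gate-weighted sum over its chosen experts of W^T x + b. A plain CPU loop of that expected result is sketched below for reference; the function name, std::vector buffers, and column-major data_dim x out_dim weight layout are assumptions of this sketch, not the kernel's actual device buffers.

#include <cstddef>
#include <vector>

// Reference for one token: out = sum_k coeff[k] * (W_{e_k}^T x + b_{e_k}).
std::vector<float> expert_output_reference(
    std::vector<float> const &x,                // token activations, size data_dim
    std::vector<std::vector<float>> const &W,   // per-expert weights, size data_dim * out_dim
    std::vector<std::vector<float>> const &b,   // per-expert bias, size out_dim
    std::vector<int> const &chosen_experts,     // top-k expert indices for this token
    std::vector<float> const &coeff,            // top-k gate coefficients
    size_t data_dim,
    size_t out_dim,
    bool use_bias) {
  std::vector<float> out(out_dim, 0.0f);
  for (size_t k = 0; k < chosen_experts.size(); k++) {
    int e = chosen_experts[k];
    for (size_t j = 0; j < out_dim; j++) {
      float acc = use_bias ? b[e][j] : 0.0f;    // bias enters untransposed
      for (size_t i = 0; i < data_dim; i++) {
        acc += W[e][i + j * data_dim] * x[i];   // (W^T x)_j
      }
      out[j] += coeff[k] * acc;                 // gate-weighted aggregation
    }
  }
  return out;
}

The DEBUG_MODE block added in this patch fills the inputs, routing indices, coefficients, and expert weights with known patterns, then compares the sums printed by the GPU kernel against values precomputed with a loop of roughly this form.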
--- src/ops/experts.cc | 180 +++++++++++++++++++++++++++++++++++++++++++++ src/ops/experts.cu | 6 +- 2 files changed, 185 insertions(+), 1 deletion(-) diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 60a9948ec0..bfe3e73ee1 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/experts.h" +#include "flexflow/utils/cuda_helper.h" #include "legion/legion_utilities.h" namespace FlexFlow { @@ -37,6 +38,7 @@ using PCG::Node; static constexpr int KERNEL_IDX = 0; static constexpr int BIAS_IDX = 1; +static bool DEBUG_MODE = false; // For now, we use one input and one output per expert Tensor FFModel::experts(Tensor const *inputs, @@ -839,6 +841,140 @@ void Experts::inference_task(Task const *task, } } + if (DEBUG_MODE) { + std::cout << "forward_kernel_wrapper" << std::endl + << "-------------------------------" << std::endl; + std::cout << m->data_dim << std::endl; + std::cout << m->out_dim << std::endl; + std::cout << m->num_chosen_experts << std::endl; + std::cout << m->effective_batch_size << std::endl; + std::cout << m->num_experts << std::endl; + std::cout << m->use_bias << std::endl; + + /* ----------------Input Token--------------*/ + float *cpu_input_ptr = new float[data_dim]; + checkCUDA(cudaMemcpy(cpu_input_ptr, + input_ptr, + data_dim * sizeof(float), + cudaMemcpyDeviceToHost)); + + srand(42); + float cpu_sum = 0; + for (int i = 0; i < data_dim; i++) { + // cpu_input_ptr[i] = (float)rand() / (float)RAND_MAX; + cpu_input_ptr[i] = float(i) / (float)data_dim; + cpu_sum += cpu_input_ptr[i]; + } + std::cout << "[CPU] Token 0 sum = " << cpu_sum << std::endl; + std::cout << "Total token number = " << batch_size << std::endl; + for (int i = 0; i < batch_size; i++) { + checkCUDA(cudaMemcpy((float *)(input_ptr + i * data_dim), + cpu_input_ptr, + data_dim * sizeof(float), + cudaMemcpyHostToDevice)); + } + free(cpu_input_ptr); + + /* ----------------indices--------------*/ + int *cpu_indices_ptr = new int[chosen_experts * batch_size]; + checkCUDA(cudaMemcpy(cpu_indices_ptr, + indices_ptr, + chosen_experts * batch_size * sizeof(int), + cudaMemcpyDeviceToHost)); + for (int i = 0; i < chosen_experts * 10; i++) { + if (i % 2 == 1) { + cpu_indices_ptr[i] += chosen_experts; + } + } + checkCUDA(cudaMemcpy((int *)indices_ptr, + cpu_indices_ptr, + chosen_experts * batch_size * sizeof(int), + cudaMemcpyHostToDevice)); + free(cpu_indices_ptr); + + /* ----------------coefficient--------------*/ + float *cpu_topk_gate_pred_ptr = new float[chosen_experts * batch_size]; + checkCUDA(cudaMemcpy(cpu_topk_gate_pred_ptr, + topk_gate_pred_ptr, + chosen_experts * batch_size * sizeof(float), + cudaMemcpyDeviceToHost)); + for (int i = 0; i < chosen_experts * batch_size; i++) { + if (i % 2 == 0) { + cpu_topk_gate_pred_ptr[i] = 0.5; + } else { + cpu_topk_gate_pred_ptr[i] = 0.1; + } + } + checkCUDA(cudaMemcpy((float *)topk_gate_pred_ptr, + cpu_topk_gate_pred_ptr, + chosen_experts * batch_size * sizeof(float), + cudaMemcpyHostToDevice)); + free(cpu_topk_gate_pred_ptr); + + /* ----------------Expert Weights--------------*/ + float *cpu_experts_1 = new float[data_dim * out_dim]; + float *cpu_experts_2 = new float[data_dim * out_dim]; + checkCUDA(cudaMemcpy(cpu_experts_1, + weights_ptrs[0], + data_dim * out_dim * sizeof(float), + cudaMemcpyDeviceToHost)); + checkCUDA(cudaMemcpy(cpu_experts_2, + weights_ptrs[2], + data_dim * out_dim * sizeof(float), + cudaMemcpyDeviceToHost)); + cpu_sum = 0; + for (int i = 0; i < data_dim * out_dim; i++) { + cpu_experts_1[i] = float(i) / 
float(data_dim * out_dim); + cpu_sum += cpu_experts_1[i]; + } + std::cout << "[CPU] Experts 0 weights sum = " << cpu_sum << std::endl; + + for (int i = 0; i < data_dim * out_dim; i++) { + cpu_experts_2[i] = + float(data_dim * out_dim - i) / float(data_dim * out_dim); + cpu_sum += cpu_experts_2[i]; + } + std::cout << "[CPU] Experts 1 weights sum = " << cpu_sum << std::endl; + + for (int i = 0; i < num_experts; i++) { + if (i % 2 == 0) { + checkCUDA(cudaMemcpy((float *)weights_ptrs[i * (1 + use_bias)], + cpu_experts_1, + data_dim * out_dim * sizeof(float), + cudaMemcpyHostToDevice)); + } else { + checkCUDA(cudaMemcpy((float *)weights_ptrs[i * (1 + use_bias)], + cpu_experts_2, + data_dim * out_dim * sizeof(float), + cudaMemcpyHostToDevice)); + } + } + free(cpu_experts_1); + free(cpu_experts_2); + + /* ----------------Expert Bias--------------*/ + if (use_bias) { + float *bias_experts_1 = new float[out_dim]; + checkCUDA(cudaMemcpy(bias_experts_1, + weights_ptrs[1], + out_dim * sizeof(float), + cudaMemcpyDeviceToHost)); + cpu_sum = 0; + for (int i = 0; i < out_dim; i++) { + cpu_sum += bias_experts_1[i]; + // bias_experts_1[i] = 1.0f; + } + std::cout << "[CPU] Bias 0 sum = " << cpu_sum << std::endl; + for (int i = 0; i < num_experts; i++) { + checkCUDA(cudaMemcpy((float *)weights_ptrs[i * (1 + use_bias) + 1], + bias_experts_1, + out_dim * sizeof(float), + cudaMemcpyHostToDevice)); + } + free(bias_experts_1); + } + } + Experts::forward_kernel_wrapper(m, input_ptr, indices_ptr, @@ -848,6 +984,50 @@ void Experts::inference_task(Task const *task, chosen_experts, batch_size, out_dim); + + if (DEBUG_MODE) { + /* ----------------Output after computation--------------*/ + float *cpu_output_ptr = new float[batch_size * out_dim]; + float cpu_sum = 0; + checkCUDA(cudaMemcpy(cpu_output_ptr, + output_ptr, + batch_size * out_dim * sizeof(float), + cudaMemcpyDeviceToHost)); + for (int j = 0; j < batch_size * out_dim; j += out_dim) { + cpu_sum = 0; + for (int i = 0; i < out_dim; i++) { + cpu_sum += cpu_output_ptr[j + i]; + } + // if ((j/out_dim) < 50) std::cout << "[CPU] output " << (j/out_dim) << " + // sum = " << cpu_sum << std::endl; + if (cpu_sum > 0.0f) { + std::cout << "[CPU] output " << (j / out_dim) << " sum = " << cpu_sum + << std::endl; + } + } + std::cout << "[CPU] output 0's 10th element = " << cpu_output_ptr[10] + << std::endl; + std::cout << "[CPU] output 0's 99th element = " << cpu_output_ptr[99] + << std::endl; + std::cout << "[CPU] output 0's 123th element = " << cpu_output_ptr[123] + << std::endl; + + /* refrence output */ + /* + * Input token sum = 391.5 + * Expert 0 weights sum = 307327.5 + * Expert 1 weights sum = 307328.47 + * ------------------ + * experts 0's reulst = 153533.1 + * experts 1's reulst = 153402.9 + * Aggreated Result = 92106.836 + * 10th element = 41.28053 + * 99th element = 59.057823 + * 123th element = 63.8517 + */ + + free(cpu_output_ptr); + } } void Experts::forward_task(Task const *task, diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 803d6da6c4..3cd86ed56b 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -306,7 +306,7 @@ void experts_forward_GemmBatched_kernel(ExpertsMeta const *m, if (m->use_bias) { checkCUDA(cublasGemmBatchedEx( m->handle.blas, - CUBLAS_OP_T, // Bias, shape (out_dim, 1) + CUBLAS_OP_N, // Bias, shape (out_dim, 1) CUBLAS_OP_N, // Coefficient, shape (1, 1) out_dim, // num_row of (A, C) = out_dim 1, // num_col of (B, C) = 1 @@ -559,6 +559,10 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, checkCUDA(cudaMalloc(&batch_outputs[0], 
out_dim * num_chosen_experts * effective_batch_size * sizeof(float))); + checkCUDA(cudaMemset(batch_outputs[0], + 0, + out_dim * num_chosen_experts * effective_batch_size * + sizeof(float))); for (int i = 1; i < num_chosen_experts * effective_batch_size; i++) { batch_outputs[i] = batch_outputs[i - 1] + out_dim * sizeof(float); } From c1d52bc908e411eaec8232e53fa2959dc699f658 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 24 Mar 2023 16:26:18 -0400 Subject: [PATCH 077/344] maybe fix --- conda/flexflow-cpu.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/conda/flexflow-cpu.yml b/conda/flexflow-cpu.yml index e4e614b265..6bd9d6d663 100644 --- a/conda/flexflow-cpu.yml +++ b/conda/flexflow-cpu.yml @@ -12,9 +12,8 @@ dependencies: - pytest - pip - pip: - - --extra-index-url https://download.pytorch.org/whl/cpu - qualname>=0.1.0 - keras_preprocessing>=1.1.2 - - torch==1.13.1+cpu - - torchaudio==0.13.1+cpu - - torchvision + - torch --index-url https://download.pytorch.org/whl/cpu + - torchaudio --index-url https://download.pytorch.org/whl/cpu + - torchvision --index-url https://download.pytorch.org/whl/cpu From d2b97a199cd630f6afa11b1bc461b30c619d75d3 Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Sun, 26 Mar 2023 04:55:46 -0400 Subject: [PATCH 078/344] [Inference] - Incremental MHA implementation for autoregressive inference demo (#647) * Initial commit. * kvCache member of IncMultiHeadSelfAttentionMeta allocated after cudnn buffers / before reserveSpace; comment out measure_operator_cost * add qkv projection operation * bug fixing and formatting * Add request_token_id struct. * store output of QKV into key and value caches * [kvStore] bug fix * Fix bugs with BatchConfig prepare_next_batch. * Update fixation on BatchConfig. * remove clutter * linting * Update request_id assignment logic. * illegal memory address fix * minor bug fixes * copy rid/tid stuct to device * added support for multi-heads in kv caching * linting * reduced num_inflight_batches * iter loop fix * implemented attention first matmul and softmax * finished all kernels * linting * fix * fix compilation bugs * fix hip build error * linting * bug fix * further debugging * fix dataloader-inference bug. now tensors are no longer filled with 0. * add function to download tensors to gpu. start implementing mha ops check * add templates decl * typo * link with c++ torch for debugging * backup * load weights * linting * fixed bugs, more progress in C++ mha check impl * finished validating all the way to k/v caching. 
fixed several bugs * update * fixed config * fix * fix bug, implement q*t matmul in c++ * fixed several bugs, finished verifying up to right before softmax * memory release * fix softmax * verified matmul by v * fixed bugs in output projection * ptr check/free * synch thread, add flag to turn on/off checking * fix libtorch settings * ci fix * fix transformer example * fix * bug fix * fix dimension in c++ k/v cache * fix cache * cleanup * fix comments * bug fixes for incremental phase * more bug fixing * fix bugs * add transformer test to CI * nit * fixed bugs in expert layer --------- Co-authored-by: Daiyaan Co-authored-by: Gabriele Oliaro Co-authored-by: Daiyaan Arfeen --- CMakeLists.txt | 26 +- config/config.inc | 15 +- config/config.linux | 13 +- examples/cpp/inference/data_generator.cc | 7 + examples/cpp/inference/dataloader.cc | 69 +- examples/cpp/inference/dataloader.cu | 49 +- examples/cpp/inference/dataloader.h | 15 +- examples/cpp/inference/inference_config.h | 18 +- .../cpp/inference/mixture_of_experts/moe.cc | 47 +- .../inference/transformers/transformers.cc | 76 +- .../cpp/inference/transformers/transformers.h | 4 +- include/flexflow/batch_config.h | 48 +- include/flexflow/model.h | 2 +- include/flexflow/ops/experts.h | 1 + .../ops/inc_multihead_self_attention.h | 33 +- include/flexflow/utils/cuda_helper.h | 3 + src/ops/experts.cc | 8 +- src/ops/experts.cpp | 1 + src/ops/experts.cu | 62 +- src/ops/inc_multihead_self_attention.cc | 841 ++++++++++++++++-- src/ops/inc_multihead_self_attention.cpp | 192 +--- src/ops/inc_multihead_self_attention.cu | 722 +++++++++++---- src/runtime/batch_config.cc | 128 ++- src/runtime/cuda_helper.cu | 23 + tests/cpp_gpu_tests.sh | 2 + 25 files changed, 1739 insertions(+), 666 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8be1c10ce8..392377bf68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,15 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake) set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UNDEBUG") +option(INFERENCE_TESTS "Run inference tests" OFF) +set(LIBTORCH_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../libtorch" CACHE STRING "LibTorch Path") +if (INFERENCE_TESTS) + find_package(Torch REQUIRED PATHS ${LIBTORCH_PATH} NO_DEFAULT_PATH) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") + message(STATUS "LIBTORCH_PATH: ${LIBTORCH_PATH}") + message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}") +endif() + # Set a default build type if none was specified set(default_build_type "Debug") if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) @@ -85,10 +94,10 @@ set(LD_FLAGS $ENV{LD_FLAGS}) # Set global FLAGS list(APPEND CC_FLAGS - -std=c++11) + -std=c++14) list(APPEND NVCC_FLAGS - -std=c++11) + -std=c++14) add_compile_options(${CC_FLAGS}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS}) @@ -135,6 +144,14 @@ if(FF_USE_NCCL) -DFF_USE_NCCL) endif() +# Inference tests +if(INFERENCE_TESTS) + list(APPEND FF_CC_FLAGS + -DINFERENCE_TESTS) + list(APPEND FF_NVCC_FLAGS + -DINFERENCE_TESTS) +endif() + # Legion include(legion) @@ -321,6 +338,11 @@ if(FF_USE_NCCL) add_dependencies(flexflow ${NCCL_NAME}) endif() +if (INFERENCE_TESTS) + target_link_libraries(flexflow "${TORCH_LIBRARIES}") + set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) +endif() + # build binary option(FF_BUILD_RESNET "build resnet example" OFF) option(FF_BUILD_RESNEXT "build resnext example" OFF) diff --git a/config/config.inc b/config/config.inc index 2308b8b66b..c9eb554cb4 100644 
--- a/config/config.inc +++ b/config/config.inc @@ -27,6 +27,19 @@ if [ -n "$INSTALL_DIR" ]; then SET_INSTALL_DIR="-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}" fi +if [ "$INFERENCE_TESTS" = "ON" ]; then + SET_INFERENCE_TESTS="-DINFERENCE_TESTS=ON" +else + SET_INFERENCE_TESTS="-DINFERENCE_TESTS=OFF" +fi + +#set cmake prefix path dir +if [ -n "$LIBTORCH_PATH" ]; then + SET_LIBTORCH_PATH="-DLIBTORCH_PATH=${LIBTORCH_PATH}" +else + SET_LIBTORCH_PATH="" +fi + # set build type if [ -n "$BUILD_TYPE" ]; then SET_BUILD="-DCMAKE_BUILD_TYPE=${BUILD_TYPE}" @@ -189,7 +202,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_GASNET} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_GASNET} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 86e8f8b647..614a671e44 100755 --- a/config/config.linux +++ b/config/config.linux @@ -1,5 +1,4 @@ #!/bin/bash - # set the CC and CXX, usually it is not needed as cmake can detect it # set CC and CXX to mpicc and mpic++ when enable gasnet # CC=mpicc @@ -16,6 +15,18 @@ # set build type BUILD_TYPE=${BUILD_TYPE:-Release} +INFERENCE_TESTS=${INFERENCE_TESTS:-OFF} +LIBTORCH_PATH=${LIBTORCH_PATH:-"$(realpath ../..)/libtorch"} +if [[ "$INFERENCE_TESTS" == "ON" && ! -d "$LIBTORCH_PATH" ]]; then + cwd="$(pwd)" + cd ../.. + wget https://download.pytorch.org/libtorch/nightly/cpu/libtorch-shared-with-deps-latest.zip + unzip libtorch-shared-with-deps-latest.zip + rm libtorch-shared-with-deps-latest.zip + LIBTORCH_PATH="$(pwd)/libtorch" + cd "$cwd" +fi + # set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal). # To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75). # Alternatively, set "FF_CUDA_ARCH=autodetect" to build FlexFlow for all architectures detected on the machine, diff --git a/examples/cpp/inference/data_generator.cc b/examples/cpp/inference/data_generator.cc index 961052537e..f691247485 100644 --- a/examples/cpp/inference/data_generator.cc +++ b/examples/cpp/inference/data_generator.cc @@ -14,6 +14,7 @@ */ #include "data_generator.h" +#include "flexflow/batch_config.h" #include #include #include @@ -32,6 +33,9 @@ DataGenerator::DataGenerator(size_t _num_requests, min_tokens_to_generate(_min_tokens_to_generate), max_tokens_to_generate(_max_tokens_to_generate), poisson_distr(_poisson_distr), lambda(_lambda), timer_started(false) { + assert(max_input_tokens >= min_input_tokens); + assert(max_tokens_to_generate >= min_tokens_to_generate); + assert(max_input_tokens + max_tokens_to_generate <= MAX_SEQ_LEN); generate_requests_meta(); }; @@ -109,6 +113,7 @@ void DataGenerator::start_timer(void) { // the tensor's batch_size * sequence length. 
std::pair DataGenerator::get_requests(size_t max_requests, size_t max_tokens) { + // printf("\nget_requests(%lu, %lu)\n\n", max_requests, max_tokens); if (!timer_started) { std::cout << "Warning: tried to get number of requests before the timer " "was started." @@ -132,6 +137,8 @@ std::pair DataGenerator::get_requests(size_t max_requests, if (seq_lengths[first_request_guid + j].first <= max_tokens - new_tokens) { received_requests++; new_tokens += seq_lengths[first_request_guid + j].first; + } else { + break; } } std::advance(arrivals_ptr, received_requests); diff --git a/examples/cpp/inference/dataloader.cc b/examples/cpp/inference/dataloader.cc index ae7cb5ccd0..fba9e24129 100644 --- a/examples/cpp/inference/dataloader.cc +++ b/examples/cpp/inference/dataloader.cc @@ -22,11 +22,21 @@ using namespace Legion; DataLoader::DataLoader(FFModel &ff, InferenceConfig const &inferenceConfig, DataGenerator &data_generator, - ParallelTensor input) { + std::vector input) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - int numdims = input->num_dims; + assert(input.size() > 0); + int numdims = input[0]->num_dims; + for (int i = 1; i < input.size(); i++) { + assert(input[i]->num_dims == numdims); + for (int j = 0; j < numdims; j++) { + assert(input[i]->dims[j].size == input[0]->dims[j].size); + assert(input[i]->dims[j].degree == input[0]->dims[j].degree); + assert(input[i]->dims[j].parallel_idx == input[0]->dims[j].parallel_idx); + } + } + int replica_idx = numdims - 1; int batch_idx = numdims - 2; num_samples = inferenceConfig.total_requests; @@ -37,10 +47,10 @@ DataLoader::DataLoader(FFModel &ff, ParallelDim dims[numdims]; for (int i = 0; i < numdims; i++) { - dims[i].size = input->dims[i].size; + dims[i].size = input[0]->dims[i].size; dims[i].degree = 1; dims[i].parallel_idx = -1; - dims[i].is_replica_dim = input->dims[i].is_replica_dim; + dims[i].is_replica_dim = input[0]->dims[i].is_replica_dim; // Assume only the first dim can be the replica dim assert(i == replica_idx || (!dims[i].is_replica_dim)); } @@ -100,56 +110,46 @@ void DataLoader::load_entire_dataset(Task const *task, } } -void DataLoader::next_batch(FFModel &ff, BatchConfig *bc) { +void DataLoader::next_batch(FFModel &ff, int bid, BatchConfig *bc) { size_t num_active_tokens = bc->num_active_tokens(); if (num_active_tokens == 0) { return; } + assert(bid < batch_input.size()); Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; // Load input { Domain domain = - runtime->get_index_space_domain(ctx, batch_input->parallel_is); + runtime->get_index_space_domain(ctx, batch_input[bid]->parallel_is); ArgumentMap argmap; // No partitioning of the batch input token in inference mode - int input_dims = batch_input->num_dims; + int input_dims = batch_input[bid]->num_dims; for (int i = 0; i < input_dims; i++) { - assert(batch_input->dims[i].degree == 1 && + assert(batch_input[bid]->dims[i].degree == 1 && "Dataloader does not support input token partitioning in " "inference mode"); } - int batch_size = batch_input->dims[input_dims - 2].size; - int seq_len = batch_input->dims[input_dims - 3].size; + int batch_size = batch_input[bid]->dims[input_dims - 2].size; + int seq_len = batch_input[bid]->dims[input_dims - 3].size; + /* printf("ff.config.batchSize: %i, batch_size: %i, seq_len: %i, + num_active_tokens: %i\n", ff.config.batchSize, batch_size, seq_len, + num_active_tokens); */ assert(ff.config.batchSize == batch_size && batch_size * seq_len >= num_active_tokens); for (Domain::DomainPointIterator it(domain); 
it; it++) { - SampleIdxs meta; - meta.num_samples = num_active_tokens; - meta.incremental_mode = bc->incremental_mode; - int token_index = 0; - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { - if (bc->request_completed[i]) { - continue; - } else { - for (int j = 0; j < bc->num_processing_tokens[i]; j++) { - meta.guids[token_index] = bc->request_guid[i]; - meta.idxs[token_index] = bc->token_start_idx[i] + j; - token_index++; - } - } - } - assert(token_index == num_active_tokens); - argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + // SampleIdxs meta = bc->token2ids; + argmap.set_point( + *it, TaskArgument(&bc->token2ids, sizeof(BatchConfig::SampleIdxs))); } IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, - batch_input->parallel_is, + batch_input[bid]->parallel_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_input->machine_view.hash()); + batch_input[bid]->machine_view.hash()); launcher.add_region_requirement(RegionRequirement(full_input->region, 0 /*projection id*/, READ_ONLY, @@ -157,11 +157,12 @@ void DataLoader::next_batch(FFModel &ff, BatchConfig *bc) { full_input->region, MAP_TO_ZC_MEMORY)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_input->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input->region)); + launcher.add_region_requirement( + RegionRequirement(batch_input[bid]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input[bid]->region)); launcher.add_field(1, FID_DATA); runtime->execute_index_space(ctx, launcher); } diff --git a/examples/cpp/inference/dataloader.cu b/examples/cpp/inference/dataloader.cu index 6f8c6c19ad..0668fd949d 100644 --- a/examples/cpp/inference/dataloader.cu +++ b/examples/cpp/inference/dataloader.cu @@ -22,7 +22,7 @@ void DataLoader::load_input(Task const *task, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); - SampleIdxs *meta = (SampleIdxs *)task->local_args; + BatchConfig::SampleIdxs *meta = (BatchConfig::SampleIdxs *)task->local_args; if (meta->num_samples == 0) { return; } @@ -42,15 +42,23 @@ void DataLoader::load_input(Task const *task, batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; coord_t batch_size = batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; + + coord_t full_input_token_dim = + batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; coord_t full_input_sequence_length = - full_input_domain.hi()[1] - full_input_domain.lo()[1] + 1; + batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; + coord_t full_input_batch_size = + batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; + assert(token_dim == full_input_token_dim); assert(sequence_length == full_input_sequence_length); + assert(batch_size <= full_input_batch_size); // Currently assume continous indices assert(meta->num_samples <= batch_size * sequence_length); for (int i = 1; i < meta->num_samples; i++) { if (meta->guids[i] == meta->guids[i - 1]) { - assert(meta->idxs[i] == meta->idxs[i - 1] + 1); + assert(meta->token_indexes[i].token_position == + meta->token_indexes[i - 1].token_position + 1); } } // keep things simple for now @@ -61,34 +69,15 @@ void DataLoader::load_input(Task const *task, checkCUDA(cudaMemset( batch_input_ptr, 0, batch_input_domain.get_volume() * sizeof(float))); - if (!meta->incremental_mode) { - size_t num_requests = 0; - size_t guid; - for (size_t i = 0; i < meta->num_samples; i++) { - if (i == 0 || meta->guids[i] != guid) { - guid = 
meta->guids[0]; - num_requests++; - } - } - - coord_t start_idx = meta->guids[0]; - assert(batch_input_domain.get_volume() % batch_size == 0); - size_t size_to_copy = token_dim * sequence_length * num_requests; - float const *input_zc = - full_input_ptr + start_idx * token_dim * sequence_length; - copy_kernel<<>>( - batch_input_ptr, input_zc, size_to_copy); - checkCUDA(cudaDeviceSynchronize()); - return; - } - size_t guid = meta->guids[0]; - size_t start_idx = meta->idxs[0]; + size_t start_idx = meta->token_indexes[0].token_position; size_t dst_idx = 0; size_t total_tokens = 0; for (size_t i = 1; i <= meta->num_samples; i++) { if (i == meta->num_samples || meta->guids[i] != guid) { - size_t size_to_copy = token_dim * (meta->idxs[i - 1] - start_idx + 1); + size_t size_to_copy = + token_dim * + (meta->token_indexes[i - 1].token_position - start_idx + 1); total_tokens += size_to_copy / token_dim; float const *input_zc = full_input_ptr + (guid * token_dim * sequence_length) + @@ -98,11 +87,17 @@ void DataLoader::load_input(Task const *task, dst_ptr, input_zc, size_to_copy); if (i < meta->num_samples) { guid = meta->guids[i]; - start_idx = meta->idxs[i]; + start_idx = meta->token_indexes[i].token_position; } dst_idx = i; } } assert(total_tokens == meta->num_samples); + /*printf("token_dim: %lli, sequence_length: %lli, batch_size: %lli\n", + token_dim, sequence_length, batch_size); printf("total_tokens: %lu\n", + total_tokens); printf("guid: %lu\n", guid); + print_tensor(batch_input_ptr, + batch_input_domain.get_volume(), + "[BatchInput]");*/ checkCUDA(cudaDeviceSynchronize()); } diff --git a/examples/cpp/inference/dataloader.h b/examples/cpp/inference/dataloader.h index e67176d801..b95108aa35 100644 --- a/examples/cpp/inference/dataloader.h +++ b/examples/cpp/inference/dataloader.h @@ -15,6 +15,7 @@ #pragma once #include "data_generator.h" +#include "flexflow/batch_config.h" #include "flexflow/model.h" #include "inference_config.h" @@ -33,7 +34,7 @@ class DataLoader { DataLoader(FFModel &ff, InferenceConfig const &inferenceConfig, DataGenerator &data_generator, - ParallelTensor input); + std::vector input); static void load_input(Task const *task, std::vector const ®ions, Context ctx, @@ -42,20 +43,14 @@ class DataLoader { std::vector const ®ions, Context ctx, Runtime *runtime); - void next_batch(FFModel &, BatchConfig *); + void next_batch(FFModel &, int, BatchConfig *); public: size_t num_samples; - FlexFlow::ParallelTensor full_input, batch_input; + ParallelTensor full_input; + std::vector batch_input; struct DataLoaderInput { InferenceConfig const &_inferenceConfig; DataGenerator &_data_generator; }; }; - -struct SampleIdxs { - bool incremental_mode; - size_t num_samples; - size_t idxs[MAX_SEQ_LEN]; // the id of each token within its request - size_t guids[MAX_SEQ_LEN]; // the guid of the request each token belongs to -}; \ No newline at end of file diff --git a/examples/cpp/inference/inference_config.h b/examples/cpp/inference/inference_config.h index cbd0a082e2..c96d5b9b54 100644 --- a/examples/cpp/inference/inference_config.h +++ b/examples/cpp/inference/inference_config.h @@ -16,10 +16,12 @@ #include -#define MAX_SEQ_LEN 1024 +// #define MAX_SEQ_LEN 1024 +#define MAX_SEQ_LEN 20 #define BATCH_SIZE 2 -#define MNIST_DIMS 28 * 28 -#define DATA_DIM MNIST_DIMS +// #define MNIST_DIMS 28 * 28 +// #define DATA_DIM MNIST_DIMS +#define DATA_DIM 3 struct InferenceConfig { InferenceConfig(void) { @@ -27,7 +29,7 @@ struct InferenceConfig { token_dim = DATA_DIM; sequence_length = MAX_SEQ_LEN; 
batch_size = BATCH_SIZE; - out_dim = 15; + out_dim = 3; num_labels = out_dim; num_layers = 1; //----------------------- Inference parameters --------------------- @@ -36,12 +38,12 @@ struct InferenceConfig { poisson_distribution = true; // average number of request arrivals per second arrival_rate = 250; - num_inflight_batches = 5; - incremental_mode = false; + num_inflight_batches = 4; + incremental_mode = true; //----------------------- Rest of model parameters ------------------ - hidden_size = DATA_DIM; + hidden_size = 12; // Encoder layer - num_attention_heads = 16; + num_attention_heads = 3; attention_kdim = attention_vdim = hidden_size / num_attention_heads; num_encoder_layers = 1; } diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index a7e96da03a..c923013a88 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -135,8 +135,11 @@ void FlexFlow::top_level_task(Task const *task, im.init_operators_inference(); //------------ Initialize the data loader and data generator ------------ - size_t min_input_tokens = 32, max_input_tokens = 512, - min_tokens_to_generate = 1, max_tokens_to_generate = 128; + /*size_t min_input_tokens = 32, max_input_tokens = 512, + min_tokens_to_generate = 1, max_tokens_to_generate = 128;*/ + size_t min_input_tokens = 5, max_input_tokens = 10, + min_tokens_to_generate = 1, + max_tokens_to_generate = MAX_SEQ_LEN - max_input_tokens; DataGenerator data_generator(moeConfig.total_requests, moeConfig.token_dim, min_input_tokens, @@ -147,7 +150,10 @@ void FlexFlow::top_level_task(Task const *task, moeConfig.arrival_rate); ParallelTensor input_pt; ff.get_parallel_tensor_from_tensor(input, input_pt); - DataLoader data_loader(ff, moeConfig, data_generator, input_pt); + assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); + assert(im.tensor_buffer[input_pt].size() == im.max_num_inflight_batches); + DataLoader data_loader( + ff, moeConfig, data_generator, im.tensor_buffer[input_pt]); //----------------------- Start timer ----------------------------------- { @@ -168,17 +174,18 @@ void FlexFlow::top_level_task(Task const *task, std::pair new_prompts; BatchConfig *bc = nullptr; + assert(im.max_num_requests_per_batch == moeConfig.batch_size); + // simulation loop. For deployment, we will use a while(true) while (processed_requests < moeConfig.total_requests) { - for (int bid = 0; bid < im.max_num_requests_per_batch; bid++) { + for (int bid = 0; bid < im.max_num_inflight_batches; bid++) { + size_t max_reqs, max_tkns; if (future_handlers.find(bid) == future_handlers.end()) { - size_t max_reqs = moeConfig.incremental_mode - ? bc->MAX_NUM_REQUESTS - : im.max_num_requests_per_batch; - size_t max_tkns = moeConfig.sequence_length * moeConfig.batch_size; + max_reqs = moeConfig.incremental_mode ? bc->MAX_NUM_REQUESTS + : im.max_num_requests_per_batch; + max_tkns = moeConfig.sequence_length * moeConfig.batch_size; new_prompts = data_generator.get_requests(max_reqs, max_tkns); - assert(new_prompts.second <= BatchConfig::MAX_NUM_REQUESTS); - bc = new BatchConfig(moeConfig.incremental_mode); + bc = new BatchConfig(); } else { Future future = future_handlers[bid]; if (!future.is_ready(true /*subscribe*/)) { @@ -187,14 +194,17 @@ void FlexFlow::top_level_task(Task const *task, InferenceResult ir = future.get_result(); bc = batch_configs[bid]; processed_requests += bc->update_results(ir); - size_t max_reqs = moeConfig.incremental_mode - ? 
bc->MAX_NUM_REQUESTS - bc->num_active_requests() - : im.max_num_requests_per_batch; - size_t max_tkns = - moeConfig.sequence_length * moeConfig.batch_size - - (moeConfig.incremental_mode ? bc->num_active_tokens() : 0); + max_reqs = moeConfig.incremental_mode + ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() + : im.max_num_requests_per_batch; + max_tkns = moeConfig.sequence_length * moeConfig.batch_size - + (moeConfig.incremental_mode ? bc->num_active_tokens() : 0); new_prompts = data_generator.get_requests(max_reqs, max_tkns); } + assert(new_prompts.second <= max_reqs); + if (bc->num_active_tokens() == 0 && new_prompts.second == 0) { + continue; + } for (size_t i = 0; i < new_prompts.second; i++) { size_t guid = new_prompts.first + i; std::pair seq_lens = @@ -203,11 +213,10 @@ void FlexFlow::top_level_task(Task const *task, seq_lens.first <= max_input_tokens && seq_lens.second >= min_tokens_to_generate && seq_lens.second <= max_tokens_to_generate); - assert(bc->register_new_request(guid, seq_lens.first)); + assert(bc->register_new_request(guid, seq_lens.first, seq_lens.second)); } bc->prepare_next_batch(); - // TODO: loading data - data_loader.next_batch(ff, bc); + data_loader.next_batch(ff, bid, bc); runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); FutureMap fm = im.inference(bid, *bc); diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 45d676fe3c..da8b872387 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -39,14 +39,22 @@ Tensor create_inc_multihead_attention_decoder( FFModel *model, TransformerConfig const *transformerConfig, Tensor const &input) { - std::vector axes{1}; - Tensor t = model->inc_multihead_self_attention( - input, - transformerConfig->hidden_size, - transformerConfig->num_attention_heads, - transformerConfig->attention_kdim, - transformerConfig->attention_vdim); - + std::vector axes{2}; + Tensor t = + transformerConfig->incremental_mode + ? 
model->inc_multihead_self_attention( + input, + transformerConfig->hidden_size, + transformerConfig->num_attention_heads, + transformerConfig->attention_kdim, + transformerConfig->attention_vdim) + : model->multihead_attention(input, + input, + input, + transformerConfig->hidden_size, + transformerConfig->num_attention_heads, + transformerConfig->attention_kdim, + transformerConfig->attention_vdim); t = model->layer_norm(model->add(t, input), axes, true, 1e-05); Tensor x = model->dense( model->dense( @@ -81,9 +89,10 @@ void FlexFlow::top_level_task(Task const *task, //----------------------- Create inputs -------------------------------- Tensor input; { - int const dims[] = {BatchConfig::MAX_NUM_TOKENS, + int const dims[] = {ffConfig.batchSize, + transformerConfig.sequence_length, transformerConfig.token_dim}; - input = ff.create_tensor<2>(dims, DT_FLOAT); + input = ff.create_tensor<3>(dims, DT_FLOAT); } //----------------------- Define the model ------------------------------ @@ -102,8 +111,11 @@ void FlexFlow::top_level_task(Task const *task, im.init_operators_inference(); //------------ Initialize the data loader and data generator ------------ - size_t min_input_tokens = 32, max_input_tokens = 512, - min_tokens_to_generate = 1, max_tokens_to_generate = 128; + /* size_t min_input_tokens = 32, max_input_tokens = 512, + min_tokens_to_generate = 1, max_tokens_to_generate = 128; */ + size_t min_input_tokens = 5, max_input_tokens = 10, + min_tokens_to_generate = 1, + max_tokens_to_generate = MAX_SEQ_LEN - max_input_tokens; DataGenerator data_generator(transformerConfig.total_requests, transformerConfig.token_dim, min_input_tokens, @@ -114,7 +126,10 @@ void FlexFlow::top_level_task(Task const *task, transformerConfig.arrival_rate); ParallelTensor input_pt; ff.get_parallel_tensor_from_tensor(input, input_pt); - DataLoader data_loader(ff, transformerConfig, data_generator, input_pt); + assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); + assert(im.tensor_buffer[input_pt].size() == im.max_num_inflight_batches); + DataLoader data_loader( + ff, transformerConfig, data_generator, im.tensor_buffer[input_pt]); //----------------------- Start timer ----------------------------------- { @@ -125,6 +140,7 @@ void FlexFlow::top_level_task(Task const *task, } double ts_start = Realm::Clock::current_time_in_microseconds(); + //----------------------- Begin inference! ------------------------------- //----------------------- Begin inference! ------------------------------- int index = 0; int processed_requests = 0; @@ -135,18 +151,21 @@ void FlexFlow::top_level_task(Task const *task, std::pair new_prompts; BatchConfig *bc = nullptr; + assert(im.max_num_requests_per_batch == transformerConfig.batch_size); + // assert(transformerConfig.batch_size <= BatchConfig::MAX_NUM_REQUESTS); + // simulation loop. For deployment, we will use a while(true) while (processed_requests < transformerConfig.total_requests) { - for (int bid = 0; bid < im.max_num_requests_per_batch; bid++) { + for (int bid = 0; bid < im.max_num_inflight_batches; bid++) { + size_t max_reqs, max_tkns; if (future_handlers.find(bid) == future_handlers.end()) { - size_t max_reqs = transformerConfig.incremental_mode - ? bc->MAX_NUM_REQUESTS - : im.max_num_requests_per_batch; - size_t max_tkns = + max_reqs = transformerConfig.incremental_mode + ? 
bc->MAX_NUM_REQUESTS + : im.max_num_requests_per_batch; + max_tkns = transformerConfig.sequence_length * transformerConfig.batch_size; new_prompts = data_generator.get_requests(max_reqs, max_tkns); - assert(new_prompts.second <= BatchConfig::MAX_NUM_REQUESTS); - bc = new BatchConfig(transformerConfig.incremental_mode); + bc = new BatchConfig(); } else { Future future = future_handlers[bid]; if (!future.is_ready(true /*subscribe*/)) { @@ -155,14 +174,18 @@ void FlexFlow::top_level_task(Task const *task, InferenceResult ir = future.get_result(); bc = batch_configs[bid]; processed_requests += bc->update_results(ir); - size_t max_reqs = transformerConfig.incremental_mode - ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() - : im.max_num_requests_per_batch; - size_t max_tkns = + max_reqs = transformerConfig.incremental_mode + ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() + : im.max_num_requests_per_batch; + max_tkns = transformerConfig.sequence_length * transformerConfig.batch_size - (transformerConfig.incremental_mode ? bc->num_active_tokens() : 0); new_prompts = data_generator.get_requests(max_reqs, max_tkns); } + assert(new_prompts.second <= max_reqs); + if (bc->num_active_tokens() == 0 && new_prompts.second == 0) { + continue; + } for (size_t i = 0; i < new_prompts.second; i++) { size_t guid = new_prompts.first + i; std::pair seq_lens = @@ -171,11 +194,10 @@ void FlexFlow::top_level_task(Task const *task, seq_lens.first <= max_input_tokens && seq_lens.second >= min_tokens_to_generate && seq_lens.second <= max_tokens_to_generate); - assert(bc->register_new_request(guid, seq_lens.first)); + assert(bc->register_new_request(guid, seq_lens.first, seq_lens.second)); } bc->prepare_next_batch(); - // TODO: loading data - data_loader.next_batch(ff, bc); + data_loader.next_batch(ff, bid, bc); runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); FutureMap fm = im.inference(bid, *bc); diff --git a/examples/cpp/inference/transformers/transformers.h b/examples/cpp/inference/transformers/transformers.h index 0957bd33bb..fe474e7949 100644 --- a/examples/cpp/inference/transformers/transformers.h +++ b/examples/cpp/inference/transformers/transformers.h @@ -19,5 +19,7 @@ #include "inference_config.h" struct TransformerConfig : InferenceConfig { - TransformerConfig(void) : InferenceConfig() {} + TransformerConfig(void) : InferenceConfig() { + hidden_size = DATA_DIM; + } }; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index c5dd2ac90f..4d4aec7054 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,7 +16,10 @@ #pragma once #include -#define MAX_SEQ_LEN 1024 + +// #define MAX_SEQ_LEN 1024 +// #define BATCH_SIZE 2 +#define MAX_SEQ_LEN 20 #define BATCH_SIZE 2 #define MAX_REQUESTS 256 @@ -29,25 +32,50 @@ struct InferenceResult { class BatchConfig { public: - BatchConfig(bool _incremental_mode); - bool register_new_request(size_t guid, int length); + BatchConfig(); + bool register_new_request(size_t guid, + int initial_length, + int tokens_to_generate); void prepare_next_batch(); int update_results(InferenceResult const &ir); - bool update_num_active_requests_tokens(); + void update_num_active_requests_tokens(); int num_active_requests() const; int num_active_tokens() const; + void print() const; static int const MAX_NUM_REQUESTS = MAX_REQUESTS; static int const MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; - static int const MAX_SEQUENCE_LENGTH = MAX_SEQ_LEN; - // These are set by update + // static int const 
MAX_SEQUENCE_LENGTH = MAX_SEQ_LEN; + // These are set by update int num_tokens, num_requests; bool cached_results; - int token_start_idx[MAX_NUM_REQUESTS]; - int token_last_available_idx[MAX_NUM_REQUESTS]; - int num_processing_tokens[MAX_NUM_REQUESTS]; + int token_start_idx[MAX_NUM_REQUESTS]; // index of first token in a request + // that should be processed in the + // current batch/iteration + int token_last_available_idx + [MAX_NUM_REQUESTS]; // last valid token index in a request. This includes + // both the prompt and generated tokens + int num_processing_tokens[MAX_NUM_REQUESTS]; // a request's number of tokens + // being processed in the current + // batch/iteration + size_t max_sequence_length[MAX_NUM_REQUESTS]; + + struct token_idxs { + size_t request_index; // the index within the BatchConfig of the request + // that the token belongs to + size_t token_position; // the index indicating the position of each token + // within its request + }; + + struct SampleIdxs { + size_t num_samples; + size_t guids[InferenceResult::MAX_NUM_TOKENS]; // the guid of the request + // each token belongs to + token_idxs token_indexes[InferenceResult::MAX_NUM_TOKENS]; + }; + + SampleIdxs token2ids; size_t request_guid[MAX_NUM_REQUESTS]; bool request_completed[MAX_NUM_REQUESTS]; - bool incremental_mode; }; }; // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 35b2c13d40..6e8effcb27 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -723,7 +723,7 @@ class FFModel { std::pair::type, Params> key{ input_shapes, params}; - auto &cache = get::type, Params>, T *>>(this->cached_ops); auto const &it = cache.find(key); diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index e75a14089c..0f51187c78 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -127,6 +127,7 @@ class Experts : public Op { float const *topk_gate_preds, float *output, float const **weights, + int num_active_tokens, int chosen_experts, int batch_size, int out_dim); diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index c60f2089cc..cfdb415354 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -9,6 +9,8 @@ #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/ops/inc_multihead_self_attention_params.h" +#include "math.h" +#include namespace FlexFlow { @@ -86,12 +88,8 @@ class IncMultiHeadSelfAttention : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; - static void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - ffStream_t stream); static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, float const *input_ptr, float const *weight_ptr, float *output_ptr); @@ -110,20 +108,31 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { public: IncMultiHeadSelfAttentionMeta(FFHandler handler, IncMultiHeadSelfAttention const *attn, + float const *weight_ptr, Legion::Memory gpu_mem, int num_samples, - int num_heads); + int _num_heads); ~IncMultiHeadSelfAttentionMeta(void); public: Realm::RegionInstance reserveInst; - size_t weightSize, reserveSpaceSize; -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnAttnDescriptor_t attnDesc; - cudnnSeqDataDescriptor_t 
qDesc, kDesc, vDesc, oDesc; + size_t weights_params, weightSize, reserveSpaceSize; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int num_heads; +#ifdef INFERENCE_TESTS + float *kcache, *vcache; #endif - int *devQoSeqArray, *devKvSeqArray, *loWinIdx, *hiWinIdx; - void *reserveSpace; + /*#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cudnnAttnDescriptor_t attnDesc; + cudnnSeqDataDescriptor_t qDesc, kDesc, vDesc, oDesc; + #endif*/ + // int *devQoSeqArray, *devKvSeqArray, *loWinIdx, *hiWinIdx, *kvCache; + float *devQKVProjArray, *keyCache, *valueCache; + float *qk_prods, *qk_prods_softmax; + float *attn_heads, *W_out_contiguous; + // void *reserveSpace; + + BatchConfig::token_idxs *dev_token2ids; }; }; // namespace FlexFlow diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 78e21ccd9f..a2e3e4fcdc 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -133,6 +133,9 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +template +T *download_tensor(T const *ptr, size_t num_elements); + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index bfe3e73ee1..6d6c55d07b 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -672,9 +672,10 @@ FutureMap Experts::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ + int num_active_tokens = bc.num_active_tokens(); IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), + TaskArgument(&num_active_tokens, sizeof(int)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -737,6 +738,10 @@ void Experts::inference_task(Task const *task, assert(regions.size() == task->regions.size()); ExpertsMeta const *m = *((ExpertsMeta **)task->local_args); + int num_active_tokens = *(int *)task->args; + if (num_active_tokens == 0) { + return; + } int num_experts = m->num_experts; bool use_bias = m->use_bias; @@ -981,6 +986,7 @@ void Experts::inference_task(Task const *task, topk_gate_pred_ptr, output_ptr, weights_ptrs, + num_active_tokens, chosen_experts, batch_size, out_dim); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index 3e8bebb4f0..787c6e2d88 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -26,6 +26,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float const *topk_gate_preds, float *output, float const **weights, + int num_active_tokens, int chosen_experts, int batch_size, int out_dim) { diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 3cd86ed56b..67b9d875c7 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -89,7 +89,7 @@ void experts_forward_thrust_wrapper(ExpertsMeta const *m, thrust::upper_bound(thrust::cuda::par.on(stream), sorted_indices, sorted_indices + num_indices, - experts_start_idx + num_experts_per_block); + experts_start_idx + num_experts_per_block - 1); *lb_index = lb - sorted_indices; *ub_index = ub - sorted_indices; @@ -369,6 +369,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float const *topk_gate_preds, float *output, float const **weights, + int num_active_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -382,17 +383,22 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } + 
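The "- 1" added to the thrust::upper_bound call above makes the search bound inclusive: paired with lower_bound(experts_start_idx), it selects exactly the sorted entries routed to the experts owned by this block, whereas the old exclusive bound also captured entries destined for the first expert of the next block. Below is a minimal host-side sketch of that bound arithmetic, using std::lower_bound / std::upper_bound on made-up routing data purely for illustration; the toy values and the use of host iterators are assumptions, not part of this patch.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Sorted expert ids chosen by top-k routing, one entry per (token, k) pair.
  std::vector<int> sorted_indices = {0, 0, 1, 2, 2, 3, 3, 3, 4, 5};
  int experts_start_idx = 1;      // this expert block owns experts 1, 2, 3
  int num_experts_per_block = 3;

  auto lb = std::lower_bound(sorted_indices.begin(), sorted_indices.end(),
                             experts_start_idx);
  // Inclusive id of the last expert owned by this block: start + n - 1 = 3.
  auto ub = std::upper_bound(sorted_indices.begin(), sorted_indices.end(),
                             experts_start_idx + num_experts_per_block - 1);
  // [lb, ub) covers exactly the six entries routed to experts 1, 2 and 3.
  assert(ub - lb == 6);
  // The old exclusive bound (start + n = 4) would also have captured the
  // entry routed to expert 4, which belongs to the next expert block.
  assert(std::upper_bound(sorted_indices.begin(), sorted_indices.end(),
                          experts_start_idx + num_experts_per_block) - lb == 7);
  return 0;
}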
assert(num_active_tokens > 0); + assert(num_active_tokens <= m->effective_batch_size); + assert(m->effective_batch_size == batch_size); + int num_experts_per_block = m->num_experts; int experts_start_idx = m->experts_start_idx; bool use_bias = m->use_bias; ActiMode activation = m->activation; int data_dim = m->data_dim; int num_chosen_experts = m->num_chosen_experts; - int num_tokens = m->effective_batch_size; + // int num_tokens = m->effective_batch_size; + int num_tokens = num_active_tokens; int expert_capacity = m->expert_capacity; assert(chosen_experts == num_chosen_experts); - assert(num_tokens == batch_size); + // assert(num_tokens == batch_size); assert(out_dim == m->out_dim); // TODO: remove this once we condense all weights in a single tensor @@ -463,34 +469,34 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaStreamSynchronize(stream); - experts_forward_GemmBatched_kernel(m, - (void const **)m->weight_idx_array, - (void const **)m->token_idx_array, - (void **)m->dev_batch_outputs, - (void const **)m->bias_idx_array, - activation, - data_dim, - out_dim, - num_tokens, - num_chosen_experts, - gemm_batch_count, - stream); + // experts_forward_GemmBatched_kernel(m, + // (void const **)m->weight_idx_array, + // (void const **)m->token_idx_array, + // (void **)m->dev_batch_outputs, + // (void const **)m->bias_idx_array, + // activation, + // data_dim, + // out_dim, + // num_tokens, + // num_chosen_experts, + // gemm_batch_count, + // stream); cudaStreamSynchronize(stream); - int aggregation_parallelism = - std::max(num_tokens, gemm_batch_count) * out_dim; - experts_forward_aggregate_kernel<<>>(num_tokens, - gemm_batch_count, - out_dim, - output, - m->dev_batch_outputs, - m->coefficient_idx_array, - m->output_idx_array); + // int aggregation_parallelism = + // std::max(num_tokens, gemm_batch_count) * out_dim; + // experts_forward_aggregate_kernel<<>>(num_tokens, + // gemm_batch_count, + // out_dim, + // output, + // m->dev_batch_outputs, + // m->coefficient_idx_array, + // m->output_idx_array); if (m->profiling) { cudaEventRecord(t_end, stream); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index fc423caea3..1b91d3b6a0 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -15,7 +15,12 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/model.h" +#include "flexflow/utils/cuda_helper.h" #include "flexflow/utils/hash_utils.h" +#ifdef INFERENCE_TESTS +#include +using namespace at::indexing; +#endif namespace FlexFlow { @@ -438,6 +443,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( IncMultiHeadSelfAttention const *attn = (IncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( @@ -456,7 +462,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( .best_affinity_to(task->target_proc) .first(); IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( - handle, attn, gpu_mem, num_samples, num_heads); + handle, attn, weight.get_float_ptr(), gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); return m; @@ -481,6 +487,10 @@ FutureMap IncMultiHeadSelfAttention::inference( set_argumentmap_for_inference(ff, argmap, view); size_t 
machine_view_hash = view->hash(); int idx = 0; + + printf("BatchConfig, num_tokens: %d, num_requests: %d\n", + bc.num_tokens, + bc.num_requests); IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, TaskArgument(&bc, sizeof(BatchConfig)), @@ -522,10 +532,11 @@ void IncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(regions.size() == 3); assert(task->regions.size() == regions.size()); - // const IncMultiHeadSelfAttention* attn = (IncMultiHeadSelfAttention*) - // task->args; + + BatchConfig const *bc = (BatchConfig *)task->args; IncMultiHeadSelfAttentionMeta const *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( @@ -533,8 +544,736 @@ void IncMultiHeadSelfAttention::inference_task( GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, input.get_float_ptr(), weight.get_float_ptr(), output.get_float_ptr()); + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 3); + assert(output_domain.get_dim() == 4); + /* print_tensor(input.get_float_ptr(), + input_domain.get_volume(), + "[Attention:forward:query]"); */ + + IncMultiHeadSelfAttention::inference_kernel_wrapper(m, + bc, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr()); +#ifdef INFERENCE_TESTS + printf("Checking IncMultiHeadSelfAttention computations...\n"); + + // ============================================================================= + // Load input/output/weights and parse general configs + // ============================================================================= + + float *input_cpu = + download_tensor(input.get_float_ptr(), input_domain.get_volume()); + assert(input_cpu != nullptr); + float *weight_cpu = download_tensor(weight.get_float_ptr(), + weight_domain.get_volume()); + assert(weight_cpu != nullptr); + float *output_cpu = download_tensor(output.get_float_ptr(), + output_domain.get_volume()); + assert(output_cpu != nullptr); + + // Input tensor dimensions + coord_t data_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + coord_t max_sequence_length = input_domain.hi()[1] - input_domain.lo()[1] + 1; + coord_t batch_size = input_domain.hi()[2] - input_domain.lo()[2] + 1; + coord_t replica_dim = input_domain.hi()[3] - input_domain.lo()[3] + 1; + assert(replica_dim == 1); + + size_t effective_batch_size = max_sequence_length * batch_size; + float inputs_arr[data_dim][effective_batch_size] = {0}; + for (size_t i = 0; i < data_dim * bc->num_active_tokens(); i++) { + size_t data_index = i % data_dim; + size_t token_index = i / data_dim; + assert(data_index < data_dim); + assert(token_index < effective_batch_size); + inputs_arr[data_index][token_index] = input_cpu[i]; + } + torch::Tensor torch_input = torch::from_blob( + inputs_arr, {data_dim, (long int)effective_batch_size}, torch::kFloat32); + + // Weight tensor dimensions + coord_t all_weight_params = 
weight_domain.hi()[0] - weight_domain.lo()[0] + 1; + coord_t num_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; + replica_dim = weight_domain.hi()[2] - weight_domain.lo()[2] + 1; + size_t qParas = m->qProjSize * m->qSize; + size_t kParas = m->kProjSize * m->kSize; + size_t vParas = m->vProjSize * m->vSize; + size_t oParas = m->oProjSize * (m->vProjSize > 0 ? m->vProjSize : m->vSize); + + assert(all_weight_params == qParas + kParas + vParas + oParas); + assert(num_heads == m->num_heads); + assert(replica_dim == 1); + + assert(m->qSize == m->kSize && m->kSize == m->vSize); + // printf("m->qSize: %i\n", m->qSize); + // keep things simple for now + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + long int proj_sum = m->qProjSize + m->kProjSize + m->vProjSize; + // load weight manually because Torch can't easily read a tensor serialized in + // column-major order. + + // printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " + // "bc->num_active_tokens(): %i, num_heads: %lli, + // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", + // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), + // num_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); + // for (int t=0; t < bc->num_active_tokens(); t++) { + // printf("token %i has request_index: %li and token_position: %li\n", + // t, bc->token2ids.token_indexes[t].request_index, + // bc->token2ids.token_indexes[t].token_position); + // } + + // ============================================================================= + // Load the output tensor (with CUDA results), and create a Torch tensor + // ============================================================================= + + float output_cuda[m->oProjSize][effective_batch_size] = {0}; + for (int i = 0; i < m->oProjSize * effective_batch_size; i++) { + int row_idx = i % m->oProjSize; + int col_idx = i / m->oProjSize; + assert(row_idx < m->oProjSize && col_idx < effective_batch_size); + output_cuda[row_idx][col_idx] = output_cpu[i]; + } + torch::Tensor torch_out_cuda = + torch::from_blob(output_cuda, + {m->oProjSize, (int64_t)effective_batch_size}, + torch::kFloat32); + + // ============================================================================= + // Load the Q/K/V projection weights, and create a Torch tensor + // ============================================================================= + + float w_qkv[m->qSize][m->qProjSize][3][num_heads]; + memset(&w_qkv, + 0, + m->qSize * m->qProjSize * 3 * num_heads * + sizeof(float)); // assuming that 0.0f is encoded as all zero bytes + assert(w_qkv[0][0][0][0] == 0.0f); + + for (int h = 0; h < num_heads; h++) { + for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { + size_t row_index = i % m->qSize; + size_t column_index = i / m->qSize; + // Q + w_qkv[row_index][column_index][0][h] = + weight_cpu[all_weight_params * h + m->qSize * column_index + + row_index]; + // K + w_qkv[row_index][column_index][1][h] = + weight_cpu[all_weight_params * h + m->qProjSize * m->qSize + + m->qSize * column_index + row_index]; + // V + w_qkv[row_index][column_index][2][h] = + weight_cpu[all_weight_params * h + 2 * m->qProjSize * m->qSize + + m->qSize * column_index + row_index]; + } + } + // convert weights to torch tensor + torch::Tensor torch_w_qkv = torch::from_blob( + w_qkv, {m->qSize, m->qProjSize, 3, num_heads}, torch::kFloat32); + + /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() + << std::endl; + std::cout << "Torch input size: " << 
torch_input.sizes() << std::endl; + std::cout << "Number of active tokens: " << bc->num_active_tokens() + << std::endl; */ + // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; + + // ============================================================================= + // Compute the Q/K/V projections, and compare the results with CUDA + // ============================================================================= + + // ----------------------- C++ computations & checks ------------------------ + torch::Tensor qkv_projs = torch::einsum( + "ijkl,im->jmkl", + {torch_w_qkv, + torch_input.index({Slice(), Slice(0, bc->num_active_tokens())})}); + // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; + assert(qkv_projs.sizes()[0] == m->qProjSize); + assert(qkv_projs.sizes()[1] == bc->num_active_tokens() && + qkv_projs.sizes()[1] <= effective_batch_size); + assert(qkv_projs.sizes()[2] == 3); + assert(qkv_projs.sizes()[3] == num_heads); + + // ----------------------- Loading CUDA results for this step --------------- + float *QKVProjArray_cpu = download_tensor(m->devQKVProjArray, + BatchConfig::MAX_NUM_TOKENS * + proj_sum * m->num_heads); + assert(QKVProjArray_cpu != nullptr); + + float QKVProjArray_converted[m->qProjSize][bc->num_active_tokens()][3] + [num_heads]; + memset(&QKVProjArray_converted, + 0, + m->qProjSize * bc->num_active_tokens() * 3 * num_heads * + sizeof(float)); // assuming that 0.0f is encoded as all zero bytes + + // skip over padding at the end of QKVProjArray_cpu + // convert from column order to 3D matrix because torch cannot automatically + // import matrices flattened in column order + for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_heads; i++) { + size_t proj_size_index = i % m->qProjSize; + size_t head_index = i / (proj_sum * bc->num_active_tokens()); + size_t token_index = + ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % + bc->num_active_tokens(); + size_t qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / + (m->qProjSize * bc->num_active_tokens()); + assert(proj_size_index < proj_sum); + assert(head_index < num_heads); + assert(token_index < bc->num_active_tokens()); + assert(qkv_offset < 3); + QKVProjArray_converted[proj_size_index][token_index][qkv_offset] + [head_index] = QKVProjArray_cpu[i]; + } + torch::Tensor QKVProjArray_torch = + torch::from_blob(QKVProjArray_converted, + {m->qProjSize, bc->num_active_tokens(), 3, num_heads}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + // std::cout << "QKVProjArray_torch" << std::endl; + // for (int i=0; inum_active_tokens(); t++) { + for (size_t d = 0; d < m->kProjSize; d++) { + size_t kcache_idx = + d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + + bc->token2ids.token_indexes[t].token_position * m->num_heads * + BatchConfig::MAX_NUM_REQUESTS + + h * BatchConfig::MAX_NUM_REQUESTS + + bc->token2ids.token_indexes[t].request_index; + m->kcache[kcache_idx] = + qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) + .item(); + } + for (size_t d = 0; d < m->vProjSize; d++) { + size_t vcache_idx = + d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + + bc->token2ids.token_indexes[t].token_position * m->num_heads * + BatchConfig::MAX_NUM_REQUESTS + + h * BatchConfig::MAX_NUM_REQUESTS + + bc->token2ids.token_indexes[t].request_index; + m->vcache[vcache_idx] = + qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) + .item(); + } + } + } + // Create 
torch tensors from the arrays + torch::Tensor K_t = torch::from_blob( + m->kcache, + {m->kProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + torch::Tensor V_t = torch::from_blob( + m->vcache, + {m->vProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + + // Compute useful indices + std::vector req_idxs; + std::vector r_first_idx; + std::vector r_num_tokens; + for (size_t t = 0; t < bc->num_active_tokens(); t++) { + size_t rid = bc->token2ids.token_indexes[t].request_index; + if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { + req_idxs.push_back(rid); + r_first_idx.push_back(t); + r_num_tokens.push_back(1); + } else { + r_num_tokens[r_num_tokens.size() - 1]++; + } + assert(req_idxs.size() == r_first_idx.size() && + r_first_idx.size() == r_num_tokens.size()); + } + assert(req_idxs.size() == bc->num_active_requests()); + assert(std::accumulate(r_num_tokens.begin(), + r_num_tokens.end(), + decltype(r_num_tokens)::value_type(0)) == + bc->num_active_tokens()); + + // ----------------------- Loading CUDA results for this step --------------- + float *keyCache_cpu = + download_tensor(m->keyCache, + m->num_heads * m->kProjSize * + BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); + float *valueCache_cpu = + download_tensor(m->valueCache, + m->num_heads * m->vProjSize * + BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); + assert(keyCache_cpu != nullptr); + assert(valueCache_cpu != nullptr); + + float *kcache_cuda = (float *)calloc( + m->kProjSize * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + float *vcache_cuda = (float *)calloc( + m->vProjSize * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + int index = 0; + for (int i = 0; i < m->kProjSize; i++) { + for (int j = 0; j < MAX_SEQ_LEN; j++) { + for (int k = 0; k < m->num_heads; k++) { + for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { + int col_major_index = l * m->kProjSize * MAX_SEQ_LEN * m->num_heads + + k * m->kProjSize * MAX_SEQ_LEN + + j * m->kProjSize + i; + kcache_cuda[index++] = keyCache_cpu[col_major_index]; + } + } + } + } + index = 0; + for (int i = 0; i < m->vProjSize; i++) { + for (int j = 0; j < MAX_SEQ_LEN; j++) { + for (int k = 0; k < m->num_heads; k++) { + for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { + int col_major_index = l * m->vProjSize * MAX_SEQ_LEN * m->num_heads + + k * m->vProjSize * MAX_SEQ_LEN + + j * m->vProjSize + i; + vcache_cuda[index++] = valueCache_cpu[col_major_index]; + } + } + } + } + torch::Tensor K_t_cuda = torch::from_blob( + kcache_cuda, + {m->kProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + torch::Tensor V_t_cuda = torch::from_blob( + vcache_cuda, + {m->vProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + + // std::cout << "kcache differences:" << std::endl; + // for (int i=0; i < bc->num_active_requests() + 1; i++) { + // for (int j=0; j < num_heads; j++) { + // for (int l=0; l < m->kProjSize; l++) { + // for (int k=0; k < MAX_SEQ_LEN; k++) { + // size_t kcache_idx = + // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // j * BatchConfig::MAX_NUM_REQUESTS + + // i; + // if ( abs(m->kcache[kcache_idx] - keyCache_cpu[ + // i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // j * m->kProjSize * MAX_SEQ_LEN + + // k * 
m->kProjSize + + // l + // ]) > 0.00001) { + // printf("req: %i (rid: %i), head: %i, data_dim: %i, token_pos: + // %i\n", + // i, req_idxs[i], j, l, k); + // } + // } + // } + // } + // } + + // std::cout << "keyCache from CUDA:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jkProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // printf("%f ", + // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // j * m->kProjSize * MAX_SEQ_LEN + + // k * m->kProjSize + + // l + // ]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // std::cout << "valueCache from CUDA:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jvProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // printf("%f ", + // valueCache_cpu[ + // i * m->vProjSize * MAX_SEQ_LEN * num_heads + + // j * m->vProjSize * MAX_SEQ_LEN + + // k * m->vProjSize + + // l]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // printf("\n"); + + // std::cout << "C++ kcache:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; j < num_heads; j++) { + // for (int l=0; l < m->kProjSize; l++) { + // for (int k=0; k < MAX_SEQ_LEN; k++) { + // size_t kcache_idx = + // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // j * BatchConfig::MAX_NUM_REQUESTS + + // i; + // printf("%f ", m->kcache[kcache_idx]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // std::cout << "C++ vcache:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jvProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // size_t vcache_idx = + // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // j * BatchConfig::MAX_NUM_REQUESTS + + // i; + // printf("%f ", m->vcache[vcache_idx]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + assert(torch::allclose(K_t_cuda, K_t, 1e-05, 1e-05)); + assert(torch::allclose(V_t_cuda, V_t, 1e-05, 1e-05)); + free(kcache_cuda); + free(vcache_cuda); + + // ============================================================================= + // Load the W_out projection weights + // ============================================================================= + + // ----------------------- C++ operations & checks -------------------------- + float w_out[m->vProjSize][m->num_heads][m->oProjSize] = {0}; + for (int h = 0; h < num_heads; h++) { + for (int v = 0; v < m->vProjSize; v++) { + for (int o = 0; o < m->oProjSize; o++) { + w_out[v][h][o] = + weight_cpu[all_weight_params * h + 3 * m->qProjSize * m->qSize + + m->vProjSize * o + v]; + } + } + } + // convert weights to torch tensor + torch::Tensor torch_w_out = torch::from_blob( + w_out, {m->vProjSize, m->num_heads, m->oProjSize}, torch::kFloat32); + + // ----------------------- Loading CUDA results for this step --------------- + float *w_out_cuda = download_tensor( + m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_heads); + assert(w_out_cuda != nullptr); + float converted_wout_tensor[m->vProjSize][m->num_heads][m->oProjSize] = {0}; + for (int i = 0; i < m->vProjSize * m->num_heads * m->oProjSize; i++) { + int row_index = i % m->vProjSize; + int col_index = (i / m->vProjSize) % m->num_heads; + int depth_index = i / (m->vProjSize * 
m->num_heads); + assert(row_index < m->vProjSize && col_index < m->num_heads && + depth_index < m->oProjSize); + converted_wout_tensor[row_index][col_index][depth_index] = w_out_cuda[i]; + } + torch::Tensor w_out_cuda_tensor = + torch::from_blob(converted_wout_tensor, + {m->vProjSize, m->num_heads, m->oProjSize}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + assert(torch::allclose(w_out_cuda_tensor, torch_w_out, 1e-05, 1e-05)); + + // ============================================================================= + // Compute the softmax(QK^T/sqrt(d_k))V product, request by request + // ============================================================================= + + // ----------------------- C++ initialization steps ------------------------- + torch::Tensor Q_projs = qkv_projs.index({Slice(), Slice(), 0, Slice()}) + .reshape({qkv_projs.sizes()[0], + qkv_projs.sizes()[1], + qkv_projs.sizes()[3]}); + + torch::Tensor qk_products[bc->num_active_requests()]; + torch::Tensor qk_softmax[bc->num_active_requests()]; + torch::Tensor attn_heads[bc->num_active_requests()]; + + torch::Tensor cpp_output = + torch::zeros({m->oProjSize, bc->num_active_tokens()}); + + // ----------------------- Loading CUDA results for this step --------------- + float *qk_prods_cpu = download_tensor( + m->qk_prods, + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads); + assert(qk_prods_cpu != nullptr); + + float *qk_prods_softmax_cpu = download_tensor( + m->qk_prods_softmax, + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads); + assert(qk_prods_softmax_cpu != nullptr); + + float *attn_heads_cpu = download_tensor( + m->attn_heads, BatchConfig::MAX_NUM_TOKENS * m->num_heads * m->vProjSize); + assert(attn_heads_cpu != nullptr); + + // ----------------------- Main loop (request by request) ------------------- + size_t qk_prods_cpu_offset = 0; + + for (size_t r = 0; r < bc->num_active_requests(); r++) { + // Compute pre-request parameters + size_t num_new_tokens = r_num_tokens[r]; + int64_t rid = (int64_t)(req_idxs[r]); + int64_t num_tokens_received_so_far = + (int64_t)(bc->token_last_available_idx[rid] + 1); + // printf("num_new_tokens: %lu, bc->num_processing_tokens[rid]: %i, rid: + // %li\n", + // num_new_tokens, bc->num_processing_tokens[rid], rid); + assert(num_new_tokens == bc->num_processing_tokens[rid]); + assert(num_tokens_received_so_far >= (int64_t)num_new_tokens); + + // ----------------------- C++ computations ------------------------------- + // Get the slice of the Q projection tensor with the tokens in the current + // request + torch::Tensor Q_req = + Q_projs.index({Slice(), + Slice(r_first_idx[r], r_first_idx[r] + num_new_tokens), + Slice()}); + // std::cout << "Q_req.sizes(): " << Q_req.sizes() << std::endl; + assert(Q_req.sizes()[0] == m->qProjSize); + assert(Q_req.sizes()[1] == num_new_tokens); + assert(Q_req.sizes()[2] == num_heads); + + /*printf("\n------------ QK multiplication (C++) -------------\n"); + printf("Request r=%lu. 
num_new_tokens: %lu, num_tokens_received_so_far: %li, + rid: %li, Qproj slice: (%i, %i)\n", r, num_new_tokens, + num_tokens_received_so_far, rid, r_first_idx[r], r_first_idx[r] + + num_new_tokens); + + std::cout << "Q_req matrix (idk dims):" << std::endl << + Q_req.index({Slice(), Slice(), 0}) << std::endl << std::endl; std::cout << + "K_t matrix (ilk dims):" << std::endl << K_t.index({Slice(), Slice(0, + num_tokens_received_so_far), 0, rid}) << std::endl << std::endl; std::cout + << "C++ alpha: " << (1.0f / sqrt(m->kProjSize)) << std::endl;*/ + + // Compute (Q*K^T)/sqrt(d_k) matmul + qk_products[r] = + torch::einsum("ijk,ilk->jlk", + {Q_req, + K_t.index({Slice(), + Slice(0, num_tokens_received_so_far), + Slice(), + rid})}) * + (1.0f / sqrt(m->kProjSize)); + + // Set entries above diagonal to -inf to make attention causal. + for (int h = 0; h < num_heads; h++) { + qk_products[r].index( + {Slice(), Slice(num_tokens_received_so_far - num_new_tokens), h}) = + qk_products[r] + .index({Slice(), + Slice(num_tokens_received_so_far - num_new_tokens), + h}) + .tril() + + torch::full({(int64_t)num_new_tokens, (int64_t)num_new_tokens}, + -INFINITY) + .triu() + .fill_diagonal_(0); + } + // Compute softmax for each request block + qk_softmax[r] = torch::softmax(qk_products[r], -2); + assert(qk_softmax[r].sizes()[0] == num_new_tokens); + assert(qk_softmax[r].sizes()[1] == num_tokens_received_so_far); + assert(qk_softmax[r].sizes()[2] == m->num_heads); + + // ------------------- Loading CUDA results for this step --------------- + float converted_qk_prod[num_new_tokens][num_tokens_received_so_far] + [num_heads] = {0}; + float converted_qk_prod_softmax[num_new_tokens][num_tokens_received_so_far] + [num_heads] = {0}; + for (size_t i = 0; + i < num_new_tokens * num_tokens_received_so_far * num_heads; + i++) { + size_t new_t_idx = i % num_new_tokens; + size_t all_t_idx = (i / num_new_tokens) % num_tokens_received_so_far; + size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); + assert(new_t_idx < num_new_tokens && + all_t_idx < num_tokens_received_so_far && head_idx < num_heads); + converted_qk_prod[new_t_idx][all_t_idx][head_idx] = + qk_prods_cpu[i + qk_prods_cpu_offset]; + converted_qk_prod_softmax[new_t_idx][all_t_idx][head_idx] = + qk_prods_softmax_cpu[i + qk_prods_cpu_offset]; + } + torch::Tensor qk_prods_cuda = torch::from_blob( + converted_qk_prod, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + torch::kFloat32); + torch::Tensor qk_prods_softmax_cuda = torch::from_blob( + converted_qk_prod_softmax, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + torch::kFloat32); + + // ------------------- Comparing C++ & CUDA results ------------------ + /* std::cout << "C++:" <vProjSize); + assert( + V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) + .sizes()[1] == num_tokens_received_so_far); + assert( + V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) + .sizes()[2] == m->num_heads); + attn_heads[r] = torch::einsum( + "ijk,ljk->ilk", + {qk_softmax[r], + V_t.index( + {Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid})}); + assert(attn_heads[r].sizes()[0] == num_new_tokens); + assert(attn_heads[r].sizes()[1] == m->vProjSize); + assert(attn_heads[r].sizes()[2] == m->num_heads); + + // ------------------- Loading CUDA results for this step --------------- + float converted_attn_heads_cpu[num_new_tokens][m->vProjSize][m->num_heads] = + {0}; + for (int i = 0; i < num_new_tokens * m->vProjSize * 
m->num_heads; i++) { + int token_ix = i % num_new_tokens; + int vproj_idx = (i / num_new_tokens) % m->vProjSize; + int head_idx = i / (num_new_tokens * m->vProjSize); + assert(token_ix < num_new_tokens && vproj_idx < m->vProjSize && + head_idx < m->num_heads); + converted_attn_heads_cpu[token_ix][vproj_idx][head_idx] = + attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_heads + i]; + } + torch::Tensor converted_attn_heads_cuda = + torch::from_blob(converted_attn_heads_cpu, + {(int64_t)num_new_tokens, m->vProjSize, m->num_heads}, + torch::kFloat32); + + // -------------------- Comparing C++ & CUDA results ------------------- + /* std::cout << "CUDA attn head for req " << r << ":" <num_heads; h++) { + std::cout << converted_attn_heads_cuda.index({Slice(), Slice(), h}) << + std::endl; + } + std::cout << "C++ attn head for req " << r << ":" <num_heads; h++) { + std::cout << attn_heads[r].index({Slice(), Slice(), h}) << std::endl; + } */ + assert(torch::allclose( + converted_attn_heads_cuda, attn_heads[r], 1e-05, 1e-05)); + + // ----------------------- C++ computations ---------------------------- + // Compute output values by projecting all heads to output space + cpp_output.index( + {Slice(), + Slice(r_first_idx[r], r_first_idx[r] + (int64_t)num_new_tokens)}) = + torch::einsum("jkl,ijk->li", {torch_w_out, attn_heads[r]}); + + // increment main loop's auxiliary index + qk_prods_cpu_offset += + num_new_tokens * num_tokens_received_so_far * num_heads; + } + + // ----------------------- Comparing C++ & CUDA results --------------------- + /* std::cout << "C++:" <oProjSize; i++) { + std::cout << cpp_output.index({i, Slice()}) << std::endl; + } + std::cout << "CUDA:" <oProjSize; i++) { + std::cout << torch_out_cuda.index({i, Slice(0, + (int64_t)bc->num_active_tokens())}) << std::endl; + } */ + + assert(torch::allclose( + torch_out_cuda.index( + {Slice(), Slice(0, (int64_t)bc->num_active_tokens())}), + cpp_output, + 1e-05, + 1e-05)); + + // ============================================================================= + // Cleanup + // ============================================================================= + + checkCUDA(cudaFreeHost(input_cpu)); + checkCUDA(cudaFreeHost(weight_cpu)); + checkCUDA(cudaFreeHost(output_cpu)); + checkCUDA(cudaFreeHost(QKVProjArray_cpu)); + checkCUDA(cudaFreeHost(keyCache_cpu)); + checkCUDA(cudaFreeHost(valueCache_cpu)); + checkCUDA(cudaFreeHost(qk_prods_cpu)); + checkCUDA(cudaFreeHost(qk_prods_softmax_cpu)); + checkCUDA(cudaFreeHost(attn_heads_cpu)); + checkCUDA(cudaFreeHost(w_out_cuda)); + // assert(false && "All good if you see this assert failure! :)"); +#endif + // Done with INFERENCE_TESTS block } void IncMultiHeadSelfAttention::backward(FFModel const &ff) { @@ -555,97 +1294,7 @@ bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, bool IncMultiHeadSelfAttention::measure_operator_cost( Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - ParallelTensorBase sub_output, sub_input; - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - // Currently assume only data parallel - size_t num_weights = 0; - { - // Compute weight size - int qSize = sub_input.dims[0].size; - int kSize = sub_input.dims[0].size; - int vSize = sub_input.dims[0].size; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - num_weights = num_heads * (qParas + kParas + vParas + oParas); - } - assert(sub_input.num_dims == 4); - int num_samples = sub_input.dims[2].size; - - IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( - sim->handler, this, sim->memory, num_samples, num_heads); - - // allocate tensors in simulator - sim->free_all(); - float const *input_ptr = - (float const *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float const *weight_ptr = (float const *)sim->allocate(num_weights, DT_FLOAT); - cost_metrics.weights_memory += cost_metrics.total_mem_diff_from(sim->offset); - - assert(m->profiling == false); - - std::function forward, backward; - forward = [&] { - inference_kernel_wrapper(m, input_ptr, weight_ptr, output_ptr); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - // IncMultiHeadSelfAttention does not support training - assert(false); - } - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf("[Measure IncMultiHeadSelfAttention] query(%d %d %d) key(%d %d %d) " - "value(%d %d %d) output(%d %d %d)" - "forward_time(%.4lf) backward_time(%.4lf)\n", - sub_input.dims[2].size, - sub_input.dims[1].size, - sub_input.dims[0].size, - sub_input.dims[2].size, - sub_input.dims[1].size, - sub_input.dims[0].size, - sub_input.dims[2].size, - sub_input.dims[1].size, - sub_input.dims[0].size, - sub_output.dims[2].size, - sub_output.dims[1].size, - sub_output.dims[0].size, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure IncMultiHeadSelfAttention] query(%d %d %d) key(%d %d %d) " - "value(%d %d %d) output(%d %d %d)" - "forward_time(%.4lf)\n", - sub_input.dims[2].size, - sub_input.dims[1].size, - sub_input.dims[0].size, - sub_input.dims[2].size, - sub_input.dims[1].size, - sub_input.dims[0].size, - sub_input.dims[2].size, - sub_input.dims[1].size, - sub_input.dims[0].size, - sub_output.dims[2].size, - sub_output.dims[1].size, - sub_output.dims[0].size, - cost_metrics.forward_time); - } - // Free multiheadattentionmeta - delete m; - return true; + return false; } using PCG::Node; diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 0903d7fa5d..12ab8ae30c 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -23,20 +23,10 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; -/*static*/ -void IncMultiHeadSelfAttention::inference_kernel( - IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - hipStream_t stream) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION); -} - /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, float const *input_ptr, float const *weight_ptr, float *output_ptr) { @@ -49,8 +39,9 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - IncMultiHeadSelfAttention::inference_kernel( - m, input_ptr, weight_ptr, output_ptr, stream); + + 
handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION); + if (m->profiling) { hipEventRecord(t_end, stream); checkCUDA(hipEventSynchronize(t_end)); @@ -68,185 +59,16 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, + float const *weight_ptr, Memory gpu_mem, int num_samples, - int num_heads) + int _num_heads) : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); -#if 0 - checkCUDNN(cudnnCreateAttnDescriptor(&attnDesc)); - checkCUDNN(cudnnCreateSeqDataDescriptor(&qDesc)); - checkCUDNN(cudnnCreateSeqDataDescriptor(&kDesc)); - checkCUDNN(cudnnCreateSeqDataDescriptor(&vDesc)); - checkCUDNN(cudnnCreateSeqDataDescriptor(&oDesc)); - // Currently do not support adding bias to key/value projection - assert(!attn->add_bias_kv); - cudnnAttnQueryMap_t attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; - // Assume no beam search for now - int maxBeamSize = 1; - // printf("batchSize(%d) qSize(%d) kSize(%d) vSize(%d) qProjSize(%d) - // kProjSize(%d)\n", - // num_samples, attn->qSize, attn->kSize, attn->vSize, attn->qProjSize, - // attn->kProjSize); - // printf("vProjSize(%d) oProjSize(%d) qoSeqLength(%d) kvSeqLength(%d)\n", - // attn->vProjSize, attn->oProjSize, attn->qoSeqLength, - // attn->kvSeqLength); - cudnnMathType_t math_type; - if (handle.allowTensorOpMathConversion) { - math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; - } else { - math_type = CUDNN_TENSOR_OP_MATH; - } - checkCUDNN(cudnnSetAttnDescriptor(attnDesc, - attnMode, - num_heads, - 1.0f /*smScalar*/, - CUDNN_DATA_FLOAT, - CUDNN_DATA_FLOAT, - math_type, - NULL /*attnDropoutDesc*/, - NULL /*postDropoutDesc*/, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->qoSeqLength, - attn->kvSeqLength, - num_samples, - maxBeamSize)); - size_t workSpaceSize; - checkCUDNN(cudnnGetMultiHeadAttnBuffers( - handler.dnn, attnDesc, &weightSize, &workSpaceSize, &reserveSpaceSize)); - assert(workSpaceSize <= handler.workSpaceSize); - // printf("weightSize(%zu) workSpaceSize(%zu) reserveSpaceSize(%zu)\n", - // weightSize, workSpaceSize, reserveSpaceSize); - int dimA[CUDNN_SEQDATA_DIM_COUNT]; - cudnnSeqDataAxis_t axes[CUDNN_SEQDATA_DIM_COUNT]; - assert(CUDNN_SEQDATA_DIM_COUNT == 4); - axes[3] = CUDNN_SEQDATA_VECT_DIM; // 3 = nbDims-1 - axes[2] = CUDNN_SEQDATA_BEAM_DIM; - axes[1] = CUDNN_SEQDATA_TIME_DIM; - axes[0] = CUDNN_SEQDATA_BATCH_DIM; - int *qoSeqArray = (int *)malloc(sizeof(int) * num_samples); - int *kvSeqArray = (int *)malloc(sizeof(int) * num_samples); - for (int i = 0; i < num_samples; i++) { - qoSeqArray[i] = attn->qoSeqLength; - kvSeqArray[i] = attn->kvSeqLength; - } - // Set qDesc - { - dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; - dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; - dimA[CUDNN_SEQDATA_TIME_DIM] = attn->qoSeqLength; - dimA[CUDNN_SEQDATA_VECT_DIM] = attn->qSize; - checkCUDNN(cudnnSetSeqDataDescriptor(qDesc, - CUDNN_DATA_FLOAT, - CUDNN_SEQDATA_DIM_COUNT, - dimA, - axes, - num_samples, - qoSeqArray, - NULL)); - } - // Set kDesc - { - dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; - dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; - dimA[CUDNN_SEQDATA_TIME_DIM] = attn->kvSeqLength; - dimA[CUDNN_SEQDATA_VECT_DIM] = attn->kSize; - checkCUDNN(cudnnSetSeqDataDescriptor(kDesc, - CUDNN_DATA_FLOAT, - CUDNN_SEQDATA_DIM_COUNT, - dimA, - axes, - num_samples, - kvSeqArray, - NULL)); - } - 
// Set vDesc - { - dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; - dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; - dimA[CUDNN_SEQDATA_TIME_DIM] = attn->kvSeqLength; - dimA[CUDNN_SEQDATA_VECT_DIM] = attn->vSize; - checkCUDNN(cudnnSetSeqDataDescriptor(vDesc, - CUDNN_DATA_FLOAT, - CUDNN_SEQDATA_DIM_COUNT, - dimA, - axes, - num_samples, - kvSeqArray, - NULL)); - } - // Set oDesc - { - dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; - dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; - dimA[CUDNN_SEQDATA_TIME_DIM] = attn->qoSeqLength; - dimA[CUDNN_SEQDATA_VECT_DIM] = attn->oProjSize; - checkCUDNN(cudnnSetSeqDataDescriptor(oDesc, - CUDNN_DATA_FLOAT, - CUDNN_SEQDATA_DIM_COUNT, - dimA, - axes, - num_samples, - qoSeqArray, - NULL)); - } - // allocate memory for the seqArray and reserve space - { - size_t totalSize = reserveSpaceSize + sizeof(int) * num_samples * 2; - Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(totalSize - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(reserveInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - devQoSeqArray = (int *)reserveInst.pointer_untyped(0, sizeof(char)); - checkCUDA(cudaMemcpy(devQoSeqArray, - qoSeqArray, - sizeof(int) * num_samples, - cudaMemcpyHostToDevice)); - devKvSeqArray = (int *)devQoSeqArray + num_samples; - checkCUDA(cudaMemcpy(devKvSeqArray, - kvSeqArray, - sizeof(int) * num_samples, - cudaMemcpyHostToDevice)); - reserveSpace = (int *)devKvSeqArray + num_samples; - } - // allocate memory for loWinIdx/hiWinIdx - loWinIdx = (int *)malloc(sizeof(int) * attn->qoSeqLength); - hiWinIdx = (int *)malloc(sizeof(int) * attn->qoSeqLength); - for (int i = 0; i < attn->qoSeqLength; i++) { - loWinIdx[i] = 0; - hiWinIdx[i] = attn->kvSeqLength; - } - free(qoSeqArray); - free(kvSeqArray); -#endif } -IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { -#if 0 - reserveInst.destroy(); - free(loWinIdx); - free(hiWinIdx); - checkCUDNN(cudnnDestroyAttnDescriptor(attnDesc)); - checkCUDNN(cudnnDestroySeqDataDescriptor(qDesc)); - checkCUDNN(cudnnDestroySeqDataDescriptor(kDesc)); - checkCUDNN(cudnnDestroySeqDataDescriptor(vDesc)); - checkCUDNN(cudnnDestroySeqDataDescriptor(oDesc)); -#endif -} +IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 048453dd17..4c3e123aa3 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -22,42 +22,467 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; -/*static*/ -void IncMultiHeadSelfAttention::inference_kernel( - IncMultiHeadSelfAttentionMeta const *m, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - cudaStream_t stream) { +__global__ void build_w_out_tensor(float const *weight_ptr, + float *contiguous_weight_ptr, + int vProjSize, + int oProjSize, + int num_heads, + int qkv_weight_block_size) { + CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { + int row_idx = i % vProjSize; + int col_idx = (i / vProjSize) % oProjSize; + int head_idx = i / (vProjSize * oProjSize); + contiguous_weight_ptr[col_idx * vProjSize * num_heads + + head_idx * vProjSize + row_idx] = + weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + + qkv_weight_block_size + col_idx * vProjSize + row_idx]; + } +} + +void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, + 
BatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + cudaStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - checkCUDNN(cudnnMultiHeadAttnForward(m->handle.dnn, - m->attnDesc, - -1, - m->loWinIdx, - m->hiWinIdx, - m->devQoSeqArray, - m->devKvSeqArray, - m->qDesc, + float alpha = 1.0f, beta = 0.0f; + assert(m->qSize == m->vSize && m->qSize == m->kSize); + cudaDataType_t data_type = ff_to_cuda_datatype(DT_FLOAT); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) + // Weights: qSize x qProjSize x 3 x num_heads + // Input: qSize x num_tokens + // Output >>> qProjSize x num_tokens x 3 x num_heads + int m_q = m->qProjSize; + int m_k = m->kProjSize; + int m_v = m->vProjSize; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_tokens(); + int k = m->qSize; + int lda = k, ldb = k, ldc_q = m_q, ldc_k = m_k, ldc_v = m_v; + size_t strideA = + m->weights_params; // need to also skip over all the parameters for each + // head, plus the unused W_o weights + size_t strideB = 0; // input stays the same for all heads. + size_t strideC = + (m_q + m_k + m_v) * n; // size of the output block for each head. + // Q + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_q, + n, + k, + &alpha, + weight_ptr, + data_type, + lda, + strideA, input_ptr, - NULL /*residual*/, - m->kDesc, + data_type, + ldb, + strideB, + &beta, + output_ptr, + data_type, + ldc_q, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // K + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_k, + n, + k, + &alpha, + weight_ptr + m_q * k, + data_type, + lda, + strideA, input_ptr, - m->vDesc, + data_type, + ldb, + strideB, + &beta, + output_ptr + m_q * n, + data_type, + ldc_k, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // V + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_v, + n, + k, + &alpha, + weight_ptr + (m_q + m_k) * k, + data_type, + lda, + strideA, input_ptr, - m->oDesc, - output_ptr, - m->weightSize, - weight_ptr, - m->handle.workSpaceSize, - m->handle.workSpace, - m->reserveSpaceSize, - m->reserveSpace)); + data_type, + ldb, + strideB, + &beta, + output_ptr + (m_q + m_k) * n, + data_type, + ldc_v, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +__global__ void store_kv_cache(float const *devQKVProjArray, + float *cache_ptr, + BatchConfig::token_idxs const *id_map, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int num_heads, + int max_seq_len, + bool k_cache) { + CUDA_KERNEL_LOOP(i, + num_tokens * (k_cache ? kProjSize : vProjSize) * num_heads) { + int proj_size = k_cache ? kProjSize : vProjSize; + int head_idx = i / (num_tokens * proj_size); + int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; + int data_idx = i % proj_size; + + int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + int current_head_block_size = + num_tokens * (k_cache ? 
qProjSize : qProjSize + kProjSize); + float val = + devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + + token_idx * proj_size + data_idx]; + + int const req_id = id_map[token_idx].request_index; + int const tok_id = id_map[token_idx].token_position; + + cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } +} + +void inference_kernel2(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_tokens(); + if (num_tokens > 0) { + int parallelism = m->kProjSize * num_tokens * m->num_heads; + store_kv_cache<<>>(m->devQKVProjArray, + m->keyCache, + m->dev_token2ids, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + MAX_SEQ_LEN, + /* k_cache = */ true); + parallelism = m->vProjSize * num_tokens * m->num_heads; + store_kv_cache<<>>(m->devQKVProjArray, + m->valueCache, + m->dev_token2ids, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + MAX_SEQ_LEN, + /* k_cache = */ false); + } +} + +__global__ void fill_entries_above_diagonal(float *matrix, + size_t num_rows, + size_t num_cols, + size_t num_heads, + size_t entries_above_diagonal, + float value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + +void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int tokens_prev_requests_squares = 0; + int qkv_block_size = + (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + int kt_block_size = m->kProjSize * MAX_SEQ_LEN; + int kt_req_block_size = kt_block_size * m->num_heads; + int vt_block_size = m->vProjSize * MAX_SEQ_LEN; + int vt_req_block_size = vt_block_size * m->num_heads; + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + int num_new_tokens = bc->num_processing_tokens[i]; + int total_tokens = bc->token_last_available_idx[i] + 1; + + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k, ldb = k, ldc = m_; + int strideA = qkv_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + float alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + // To get A, skip over Q entries from previous requests (same head) + void const *A = (void const *)(m->devQKVProjArray + + tokens_previous_requests * m->qProjSize); + // To get B, skip over K entries from previous requests (all heads + + // padding) + void const *B = (void const *)(m->keyCache + i * kt_req_block_size); + 
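+ // Illustrative layout sketch for the batched GEMM below, using assumed sizes
+ // that are not specified by this patch (qProjSize = kProjSize = 64,
+ // num_heads = 12, a batch with num_tokens = 10 active tokens, and a request
+ // with num_new_tokens = 3 after tokens_previous_requests = 4 and
+ // total_tokens = 9; MAX_SEQ_LEN = 20 comes from the batch_config.h hunk above):
+ //   qkv_block_size = (64 + 64 + 64) * 10 = 1920 floats per head, so
+ //   A + h * strideA points at head h's Q/K/V block, and the
+ //   tokens_previous_requests * qProjSize = 4 * 64 offset inside A skips the
+ //   Q columns of the 4 tokens that belong to earlier requests.
+ //   kt_block_size = 64 * 20 = 1280 floats per head and kt_req_block_size =
+ //   1280 * 12 = 15360 floats per request, so B + h * strideB is head h's key
+ //   cache for request i, of which only the first n = total_tokens = 9
+ //   columns are read.
+ //   Each head then produces a column-major num_new_tokens x total_tokens
+ //   (3 x 9 = 27-float) block of QK^T / sqrt(d_k), and the 12 per-head blocks
+ //   are packed back to back in qk_prods starting at offset
+ //   num_heads * tokens_prev_requests_squares.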
// To get C, skip over QK^T products from previous requests + void *C = + (void *)(m->qk_prods + m->num_heads * tokens_prev_requests_squares); + + /*printf("\n------------ QK multiplication (CUDA) -------------\n"); + printf("req: %i, num_new_tokens: %i, total_tokens: %i, + tokens_previous_requests: %i, tokens_prev_requests_squares: %i\n", i, + num_new_tokens, total_tokens, tokens_previous_requests, + tokens_prev_requests_squares); printf("About to multiply the following + matrices (printing only first head):\n"); printf("A:\n"); float + *QKVProjArray_cpu = download_tensor(m->devQKVProjArray, + BatchConfig::MAX_NUM_TOKENS * (m->qProjSize + m->kProjSize + m->vProjSize) * + m->num_heads); assert(QKVProjArray_cpu != nullptr); float *keyCache_cpu = + download_tensor(m->keyCache, + m->num_heads * m->kProjSize * + BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); + assert(keyCache_cpu != nullptr); + for (int aaa=0; aaa < m->qProjSize; aaa++) { + for (int bbb=0; bbbqProjSize + aaa]); + } + printf("\n"); + } + printf("B:\n"); + for (int aaa=0; aaa < m->kProjSize; aaa++) { + for (int bbb=0; bbb < total_tokens; bbb++) { + printf("%f ", keyCache_cpu[i * kt_req_block_size + bbb*m->kProjSize + + aaa]); + } + printf("\n"); + } + checkCUDA(cudaFreeHost(QKVProjArray_cpu)); + checkCUDA(cudaFreeHost(keyCache_cpu)); + printf("------------------------------------------------------------\n"); + printf("CUDA alpha: %f", alpha);*/ + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>((float *)C, + num_new_tokens, + total_tokens, + m->num_heads, + entries_above_diagonal, + -INFINITY); + } + // Compute Softmax(QK^T/sqrt(d_k)) + cudnnTensorDescriptor_t qk_tensor; + checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, + CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, + n_param, + c_param, + h_param, + w_param)); + alpha = 1.0f, beta = 0.0f; + void *C_softmax = (void *)(m->qk_prods_softmax + + m->num_heads * tokens_prev_requests_squares); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. 
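+ // Concretely, with n_param = num_heads, c_param = total_tokens, h_param = 1
+ // and w_param = num_new_tokens, every (head, new-token) pair is one (N, W)
+ // location, and CUDNN_SOFTMAX_MODE_CHANNEL normalizes across C, i.e. across
+ // all total_tokens key positions visible to that query. A small worked
+ // example with assumed sizes (illustration only): num_heads = 2,
+ // total_tokens = 4, num_new_tokens = 3. The per-request qk_prods block holds
+ // 2 * 4 * 3 = 24 floats, and element (query w, key c, head n) sits at offset
+ // n * 12 + c * 3 + w, which matches the implicit NCHW strides
+ // (w stride 1, c stride h_param * w_param = 3, n stride
+ // c_param * h_param * w_param = 12). After the call below, each of the
+ // 2 * 3 = 6 rows of attention scores (one per head and new token, each of
+ // length 4) sums to 1.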
+ checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + qk_tensor, + (void *)((float *)C), + &beta, + qk_tensor, + (void *)((float *)C_softmax))); + + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = num_new_tokens; + n = m->vProjSize; + k = total_tokens; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = vt_block_size; + strideC = num_new_tokens * m->vProjSize; + // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + A = (void const *)C_softmax; + // To get B, skip over V^T entries from previous requests (all heads + + // padding) + B = (void const *)(m->valueCache + i * vt_req_block_size); + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + C = (void *)(m->attn_heads + + tokens_previous_requests * m->num_heads * m->vProjSize); + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // Project to output, save result directly on output tensor + alpha = 1.0f, beta = 0.0f; + m_ = m->oProjSize; + k = m->vProjSize * m->num_heads; + n = num_new_tokens; + lda = k, ldb = n, ldc = m_; + A = (void const *)m->W_out_contiguous; + B = (void const *)C; + C = (void *)(output_ptr + tokens_previous_requests * m->oProjSize); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; + } + + assert(tokens_previous_requests == num_tokens); } /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, float const *input_ptr, float const *weight_ptr, float *output_ptr) { @@ -70,8 +495,24 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - IncMultiHeadSelfAttention::inference_kernel( - m, input_ptr, weight_ptr, output_ptr, stream); + cudaDeviceSynchronize(); + // phase 1: Implement kernel to compute KQV for input tokens + inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); + cudaDeviceSynchronize(); + // phase 2: Update key/val cache + cudaMemcpyAsync(m->dev_token2ids, + &(bc->token2ids.token_indexes), + bc->MAX_NUM_TOKENS * sizeof(BatchConfig::token_idxs), + cudaMemcpyHostToDevice, + stream); + cudaDeviceSynchronize(); + inference_kernel2(m, bc, stream); + cudaDeviceSynchronize(); + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + inference_kernel3(m, bc, output_ptr, stream); + cudaDeviceSynchronize(); + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -89,139 +530,67 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, + float const *weight_ptr, Memory gpu_mem, int num_samples, - int num_heads) + int _num_heads) : OpMeta(handler, attn) { cudaStream_t stream; 
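+ // Rough scratch-memory budget for the buffers reserved further down in this
+ // constructor, under assumed dimensions used only for illustration
+ // (num_heads = 12, qProjSize = kProjSize = vProjSize = oProjSize = 64 and
+ // BatchConfig::MAX_NUM_TOKENS = 64 are assumptions; MAX_SEQ_LEN = 20 and
+ // MAX_NUM_REQUESTS = 256 come from the batch_config.h hunk above):
+ //   keyCache:  12 * 64 * 256 * 20 = 3,932,160 floats (~15 MiB), and the
+ //              same again for valueCache;
+ //   devQKVProjArray: 64 * (64 + 64 + 64) * 12 = 147,456 floats (~576 KiB);
+ //   qk_prods and qk_prods_softmax: 64 * 64 * 12 = 49,152 floats (~192 KiB)
+ //   each;
+ //   attn_heads and W_out_contiguous: 49,152 floats (~192 KiB) each.
+ // The K/V caches dominate because they are sized for MAX_NUM_REQUESTS full
+ // sequences of MAX_SEQ_LEN tokens, independent of how many tokens are
+ // active in any one batch.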
checkCUDA(get_legion_stream(&stream)); - checkCUDNN(cudnnSetStream(handler.dnn, stream)); + // checkCUDNN(cudnnSetStream(handler.dnn, stream)); + + qSize = attn->qSize; + kSize = attn->kSize; + vSize = attn->vSize; + // assume dimensions match for now + assert(qSize == kSize); + assert(kSize == vSize); + qProjSize = attn->qProjSize; + kProjSize = attn->kProjSize; + assert(qProjSize == kProjSize); // required for attention QK^T matmul + vProjSize = attn->vProjSize; + oProjSize = attn->oProjSize; + num_heads = _num_heads; + weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + + oProjSize * (vProjSize > 0 ? vProjSize : vSize)); + weightSize = weights_params * num_heads * sizeof(float); - checkCUDNN(cudnnCreateAttnDescriptor(&attnDesc)); - checkCUDNN(cudnnCreateSeqDataDescriptor(&qDesc)); - checkCUDNN(cudnnCreateSeqDataDescriptor(&kDesc)); - checkCUDNN(cudnnCreateSeqDataDescriptor(&vDesc)); - checkCUDNN(cudnnCreateSeqDataDescriptor(&oDesc)); // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); - cudnnAttnQueryMap_t attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; - // Assume no beam search for now - int maxBeamSize = 1; - // printf("batchSize(%d) qSize(%d) kSize(%d) vSize(%d) qProjSize(%d) - // kProjSize(%d)\n", - // num_samples, attn->qSize, attn->kSize, attn->vSize, attn->qProjSize, - // attn->kProjSize); - // printf("vProjSize(%d) oProjSize(%d) qoSeqLength(%d) kvSeqLength(%d)\n", - // attn->vProjSize, attn->oProjSize, attn->qoSeqLength, - // attn->kvSeqLength); - cudnnMathType_t math_type; - if (handle.allowTensorOpMathConversion) { - math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; - } else { - math_type = CUDNN_TENSOR_OP_MATH; - } - checkCUDNN(cudnnSetAttnDescriptor(attnDesc, - attnMode, - num_heads, - 1.0f /*smScalar*/, - CUDNN_DATA_FLOAT, - CUDNN_DATA_FLOAT, - math_type, - NULL /*attnDropoutDesc*/, - NULL /*postDropoutDesc*/, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->qoSeqLength, - attn->kvSeqLength, - num_samples, - maxBeamSize)); - size_t workSpaceSize; - checkCUDNN(cudnnGetMultiHeadAttnBuffers( - handler.dnn, attnDesc, &weightSize, &workSpaceSize, &reserveSpaceSize)); - assert(workSpaceSize <= handler.workSpaceSize); - // printf("weightSize(%zu) workSpaceSize(%zu) reserveSpaceSize(%zu)\n", - // weightSize, workSpaceSize, reserveSpaceSize); - int dimA[CUDNN_SEQDATA_DIM_COUNT]; - cudnnSeqDataAxis_t axes[CUDNN_SEQDATA_DIM_COUNT]; - assert(CUDNN_SEQDATA_DIM_COUNT == 4); - axes[3] = CUDNN_SEQDATA_VECT_DIM; // 3 = nbDims-1 - axes[2] = CUDNN_SEQDATA_BEAM_DIM; - axes[1] = CUDNN_SEQDATA_TIME_DIM; - axes[0] = CUDNN_SEQDATA_BATCH_DIM; - int *qoSeqArray = (int *)malloc(sizeof(int) * num_samples); - int *kvSeqArray = (int *)malloc(sizeof(int) * num_samples); - for (int i = 0; i < num_samples; i++) { - qoSeqArray[i] = attn->qoSeqLength; - kvSeqArray[i] = attn->kvSeqLength; - } - // Set qDesc - { - dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; - dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; - dimA[CUDNN_SEQDATA_TIME_DIM] = attn->qoSeqLength; - dimA[CUDNN_SEQDATA_VECT_DIM] = attn->qSize; - checkCUDNN(cudnnSetSeqDataDescriptor(qDesc, - CUDNN_DATA_FLOAT, - CUDNN_SEQDATA_DIM_COUNT, - dimA, - axes, - num_samples, - qoSeqArray, - NULL)); - } - // Set kDesc - { - dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; - dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; - dimA[CUDNN_SEQDATA_TIME_DIM] = attn->kvSeqLength; - dimA[CUDNN_SEQDATA_VECT_DIM] = attn->kSize; - 
checkCUDNN(cudnnSetSeqDataDescriptor(kDesc, - CUDNN_DATA_FLOAT, - CUDNN_SEQDATA_DIM_COUNT, - dimA, - axes, - num_samples, - kvSeqArray, - NULL)); - } - // Set vDesc - { - dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; - dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; - dimA[CUDNN_SEQDATA_TIME_DIM] = attn->kvSeqLength; - dimA[CUDNN_SEQDATA_VECT_DIM] = attn->vSize; - checkCUDNN(cudnnSetSeqDataDescriptor(vDesc, - CUDNN_DATA_FLOAT, - CUDNN_SEQDATA_DIM_COUNT, - dimA, - axes, - num_samples, - kvSeqArray, - NULL)); - } - // Set oDesc - { - dimA[CUDNN_SEQDATA_BEAM_DIM] = 1; - dimA[CUDNN_SEQDATA_BATCH_DIM] = num_samples; - dimA[CUDNN_SEQDATA_TIME_DIM] = attn->qoSeqLength; - dimA[CUDNN_SEQDATA_VECT_DIM] = attn->oProjSize; - checkCUDNN(cudnnSetSeqDataDescriptor(oDesc, - CUDNN_DATA_FLOAT, - CUDNN_SEQDATA_DIM_COUNT, - dimA, - axes, - num_samples, - qoSeqArray, - NULL)); - } + +#ifdef INFERENCE_TESTS + kcache = (float *)calloc(kProjSize * MAX_SEQ_LEN * num_heads * + BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + vcache = (float *)calloc(vProjSize * MAX_SEQ_LEN * num_heads * + BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); +#endif + // allocate memory for the seqArray and reserve space { - size_t totalSize = reserveSpaceSize + sizeof(int) * num_samples * 2; + size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; + size_t qkv_max_proj_size = + BatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; + size_t key_cache_size = + num_heads * kProjSize * BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN; + size_t value_cache_size = + num_heads * vProjSize * BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN; + size_t token2ids_size = BatchConfig::MAX_NUM_TOKENS; + size_t qk_prod_size = + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads; + size_t attn_heads_size = + BatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; + size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); + size_t W_out_contiguous_size = W_out_block_size * num_heads; + size_t totalSize = + (qkv_max_proj_size + key_cache_size + value_cache_size + + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * + sizeof(float) + + token2ids_size * + sizeof(BatchConfig::token_idxs); // more components will + // be added here later + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(totalSize - 1)); std::vector field_sizes; @@ -233,38 +602,35 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 0, Realm::ProfilingRequestSet()) .wait(); - devQoSeqArray = (int *)reserveInst.pointer_untyped(0, sizeof(char)); - checkCUDA(cudaMemcpy(devQoSeqArray, - qoSeqArray, - sizeof(int) * num_samples, - cudaMemcpyHostToDevice)); - devKvSeqArray = (int *)devQoSeqArray + num_samples; - checkCUDA(cudaMemcpy(devKvSeqArray, - kvSeqArray, - sizeof(int) * num_samples, - cudaMemcpyHostToDevice)); - reserveSpace = (int *)devKvSeqArray + num_samples; - } - // allocate memory for loWinIdx/hiWinIdx - loWinIdx = (int *)malloc(sizeof(int) * attn->qoSeqLength); - hiWinIdx = (int *)malloc(sizeof(int) * attn->qoSeqLength); - for (int i = 0; i < attn->qoSeqLength; i++) { - loWinIdx[i] = 0; - hiWinIdx[i] = attn->kvSeqLength; + devQKVProjArray = (float *)reserveInst.pointer_untyped(0, sizeof(char)); + keyCache = (float *)devQKVProjArray + qkv_max_proj_size; + valueCache = (float *)keyCache + key_cache_size; + dev_token2ids = (BatchConfig::token_idxs *)(valueCache + value_cache_size); + qk_prods = (float *)(dev_token2ids + token2ids_size); + qk_prods_softmax = (float *)(qk_prods + qk_prod_size); + attn_heads = (float *)qk_prods_softmax + qk_prod_size; + W_out_contiguous = (float *)attn_heads + attn_heads_size; + int parallelism = vProjSize * oProjSize * num_heads; + build_w_out_tensor<<>>( + weight_ptr, + W_out_contiguous, + vProjSize, + oProjSize, + num_heads, + (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); } - free(qoSeqArray); - free(kvSeqArray); + cudaStreamSynchronize(stream); } IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { reserveInst.destroy(); - free(loWinIdx); - free(hiWinIdx); - checkCUDNN(cudnnDestroyAttnDescriptor(attnDesc)); - checkCUDNN(cudnnDestroySeqDataDescriptor(qDesc)); - checkCUDNN(cudnnDestroySeqDataDescriptor(kDesc)); - checkCUDNN(cudnnDestroySeqDataDescriptor(vDesc)); - checkCUDNN(cudnnDestroySeqDataDescriptor(oDesc)); +#ifdef INFERENCE_TESTS + free(kcache); + free(vcache); +#endif } }; // namespace FlexFlow diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index d58204b7c3..c1c9ca8f40 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -16,47 +16,47 @@ #include "flexflow/batch_config.h" #include "legion.h" #include +#include namespace FlexFlow { LegionRuntime::Logger::Category log_bc("BatchConfig"); -BatchConfig::BatchConfig(bool _incremental_mode) - : incremental_mode(_incremental_mode) { +BatchConfig::BatchConfig() { cached_results = false; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { token_start_idx[i] = 0; token_last_available_idx[i] = -1; request_completed[i] = true; num_processing_tokens[i] = 0; + max_sequence_length[i] = 0; + } + token2ids.num_samples = 0; + for (int i = 0; i < MAX_NUM_TOKENS; i++) { + token2ids.guids[i] = SIZE_MAX; + token2ids.token_indexes[i].request_index = SIZE_MAX; + token2ids.token_indexes[i].token_position = SIZE_MAX; } update_num_active_requests_tokens(); } int BatchConfig::update_results(InferenceResult const &ir) { 
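  // [Editor's note] Sketch of the control flow below, as I read this hunk:
  // for every request that is still active, token_start_idx is advanced by
  // the number of tokens just processed; a request is retired once it reaches
  // its per-request max_sequence_length (the EOS check on ir.results is still
  // a TODO), otherwise it switches to the incremental phase and schedules
  // exactly one token (num_processing_tokens[i] = 1) for the next batch. The
  // function returns how many requests completed in this step.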
cached_results = false; - int t = 0; + // int tokens_processed = 0; int completed = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (request_completed[i]) { continue; } - if (num_processing_tokens[i] == 0) { - continue; - } - t += num_processing_tokens[i]; + assert(num_processing_tokens[i] > 0); + // if (num_processing_tokens[i] == 0) { + // continue; + // } + // tokens_processed += num_processing_tokens[i]; token_start_idx[i] += num_processing_tokens[i]; - if (ir.results[t] == 0) { // TODO: replace this with - log_bc.print("[Done] guid(%zu) final_length(%d)", - request_guid[i], - token_start_idx[i]); - request_completed[i] = true; - token_start_idx[i] = 0; - token_last_available_idx[i] = -1; - num_processing_tokens[i] = 0; - completed++; - } else if (token_start_idx[i] >= MAX_SEQUENCE_LENGTH) { - // Reach maximum request length + if (token_start_idx[i] >= max_sequence_length[i] + // || ir.results[t] == 0 TODO: replace this with + ) { log_bc.print("[Done] guid(%zu) final_length(%d)", request_guid[i], token_start_idx[i]); @@ -68,22 +68,28 @@ int BatchConfig::update_results(InferenceResult const &ir) { } else { if (token_start_idx[i] == token_last_available_idx[i] + 1) { token_last_available_idx[i]++; + num_processing_tokens[i] = 1; // incremental phase + } else { + assert(false); } assert(token_start_idx[i] <= token_last_available_idx[i]); } - num_processing_tokens[i] = 0; } update_num_active_requests_tokens(); return completed; } -bool BatchConfig::register_new_request(size_t guid, int length) { +bool BatchConfig::register_new_request(size_t guid, + int initial_length, + int tokens_to_generate) { cached_results = false; + assert(initial_length > 0 && tokens_to_generate > 0); for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (request_completed[i]) { - log_bc.print("[NewRequest] guid(%zu) length(%d)", guid, length); + log_bc.print("[NewRequest] guid(%zu) length(%d)", guid, initial_length); token_start_idx[i] = 0; - token_last_available_idx[i] = length - 1; + token_last_available_idx[i] = initial_length - 1; + max_sequence_length[i] = initial_length + tokens_to_generate; request_guid[i] = guid; num_processing_tokens[i] = 0; request_completed[i] = false; @@ -115,17 +121,23 @@ void BatchConfig::prepare_next_batch() { log_bc.print("[NextBatch] num_tokens(%d)", count); } -bool BatchConfig::update_num_active_requests_tokens() { +void BatchConfig::update_num_active_requests_tokens() { num_requests = 0; num_tokens = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (!request_completed[i]) { num_requests++; - num_tokens += num_processing_tokens[i]; + for (int j = 0; j < num_processing_tokens[i]; j++) { + token2ids.guids[num_tokens] = request_guid[i]; + token2ids.token_indexes[num_tokens].token_position = + token_start_idx[i] + j; + token2ids.token_indexes[num_tokens].request_index = i; + num_tokens++; + } } } + token2ids.num_samples = num_tokens; cached_results = true; - return true; } int BatchConfig::num_active_requests() const { @@ -148,4 +160,70 @@ int BatchConfig::num_active_tokens() const { } } +void BatchConfig::print() const { + printf("--------------------------BatchConfig--------------------------\n"); + printf("num_tokens: %i, num_requests: %i, cached_results: %i\n", + num_tokens, + num_requests, + cached_results); + + printf("requests_completed: "); + for (int i = 0; i < num_requests; i++) { + printf("%i ", request_completed[i]); + } + printf("\n"); + + printf("token_start_idx: "); + for (int i = 0; i < num_requests; i++) { + printf("%i ", token_start_idx[i]); + } + printf("\n"); + + 
printf("token_last_available_idx: "); + for (int i = 0; i < num_requests; i++) { + printf("%i ", token_last_available_idx[i]); + } + printf("\n"); + + printf("num_processing_tokens: "); + for (int i = 0; i < num_requests; i++) { + printf("%i ", num_processing_tokens[i]); + } + printf("\n"); + + printf("max_sequence_length: "); + for (int i = 0; i < num_requests; i++) { + printf("%lu ", max_sequence_length[i]); + } + printf("\n"); + + printf("request_guid: "); + for (int i = 0; i < num_requests; i++) { + printf("%lu ", request_guid[i]); + } + printf("\n"); + + printf("token2ids.num_samples:%lu\n", token2ids.num_samples); + + printf("token2ids.guids: "); + for (int i = 0; i < num_tokens; i++) { + printf("%lu ", token2ids.guids[i]); + } + printf("\n"); + + printf("token2ids.token_indexes[i].request_index: "); + for (int i = 0; i < num_tokens; i++) { + printf("%lu ", token2ids.token_indexes[i].request_index); + } + printf("\n"); + + printf("token2ids.token_indexes[i].token_position: "); + for (int i = 0; i < num_tokens; i++) { + printf("%lu ", token2ids.token_indexes[i].token_position); + } + printf("\n"); + printf("---------------------------------------------------------------------" + "---------\n"); +} + }; // namespace FlexFlow diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 1da2e492ed..47b0ff74b4 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -224,6 +224,20 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template +__host__ T *download_tensor(T const *ptr, size_t num_elements) { + // device synchronize to make sure the data are ready + // checkCUDA(cudaDeviceSynchronize()); + T *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(T) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + // checkCUDA(cudaDeviceSynchronize()); + return host_ptr; +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -398,3 +412,12 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); + +template __host__ float *download_tensor(float const *ptr, + size_t num_elements); +template __host__ double *download_tensor(double const *ptr, + size_t num_elements); +template __host__ int32_t *download_tensor(int32_t const *ptr, + size_t num_elements); +template __host__ int64_t *download_tensor(int64_t const *ptr, + size_t num_elements); \ No newline at end of file diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 3645544b44..ea4dc6b5b9 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -47,6 +47,7 @@ if [[ -f "$FF_HOME/build/examples/cpp/AlexNet/alexnet" ]]; then # "$FF_HOME"/build/examples/cpp/split_test_2/split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # Inference examples "$FF_HOME"/build/examples/cpp/inference/mixture_of_experts/inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + "$FF_HOME"/build/examples/cpp/inference/transformers/inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel else python_packages=$(python -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))") OLD_PATH="$PATH" @@ -77,6 
+78,7 @@ else # split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # Inference examples inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel fi done export PATH="$OLD_PATH" From 96cf5f371795830b00001b1f84ec877e26d3445e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 26 Mar 2023 13:34:13 -0400 Subject: [PATCH 079/344] hip_rocm fix --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 392377bf68..ab8bb471b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,11 +93,19 @@ set(NVCC_FLAGS $ENV{NVCC_FLAGS}) set(LD_FLAGS $ENV{LD_FLAGS}) # Set global FLAGS +if(INFERENCE_TESTS) list(APPEND CC_FLAGS -std=c++14) list(APPEND NVCC_FLAGS -std=c++14) +else() + list(APPEND CC_FLAGS + -std=c++11) + +list(APPEND NVCC_FLAGS + -std=c++11) +endif() add_compile_options(${CC_FLAGS}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS}) From 193a7ae511f9f85175a28b7b024547e8e0bc6a70 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 26 Mar 2023 13:39:01 -0400 Subject: [PATCH 080/344] hip rocm fix 2 --- src/ops/experts.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 6d6c55d07b..2186f18370 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -14,7 +14,9 @@ */ #include "flexflow/ops/experts.h" +#ifdef INFERENCE_TESTS #include "flexflow/utils/cuda_helper.h" +#endif #include "legion/legion_utilities.h" namespace FlexFlow { @@ -38,7 +40,9 @@ using PCG::Node; static constexpr int KERNEL_IDX = 0; static constexpr int BIAS_IDX = 1; +#ifdef INFERENCE_TESTS static bool DEBUG_MODE = false; +#endif // For now, we use one input and one output per expert Tensor FFModel::experts(Tensor const *inputs, @@ -845,7 +849,7 @@ void Experts::inference_task(Task const *task, assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == out_dim); } } - +#ifdef INFERENCE_TESTS if (DEBUG_MODE) { std::cout << "forward_kernel_wrapper" << std::endl << "-------------------------------" << std::endl; @@ -979,7 +983,7 @@ void Experts::inference_task(Task const *task, free(bias_experts_1); } } - +#endif Experts::forward_kernel_wrapper(m, input_ptr, indices_ptr, @@ -990,7 +994,7 @@ void Experts::inference_task(Task const *task, chosen_experts, batch_size, out_dim); - +#ifdef INFERENCE_TESTS if (DEBUG_MODE) { /* ----------------Output after computation--------------*/ float *cpu_output_ptr = new float[batch_size * out_dim]; @@ -1034,6 +1038,7 @@ void Experts::inference_task(Task const *task, free(cpu_output_ptr); } +#endif } void Experts::forward_task(Task const *task, From d4a41586f059fb29551bac5e4805ad47d0969eb9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 28 Mar 2023 01:05:45 -0400 Subject: [PATCH 081/344] [Inference] Fused experts kernel bug fix (#663) * fix bug in thrust kernel * fix * commenting everything back in * cleanup * increase num_inflight_batches * restore transformers file --- examples/cpp/inference/inference_config.h | 18 ++--- include/flexflow/batch_config.h | 2 +- src/ops/experts.cu | 85 ++++++++++++++--------- 3 files changed, 63 insertions(+), 42 deletions(-) diff --git a/examples/cpp/inference/inference_config.h b/examples/cpp/inference/inference_config.h index c96d5b9b54..7214cf42a2 100644 --- a/examples/cpp/inference/inference_config.h +++ 
b/examples/cpp/inference/inference_config.h @@ -18,10 +18,10 @@ // #define MAX_SEQ_LEN 1024 #define MAX_SEQ_LEN 20 -#define BATCH_SIZE 2 -// #define MNIST_DIMS 28 * 28 -// #define DATA_DIM MNIST_DIMS -#define DATA_DIM 3 +#define BATCH_SIZE 32 +#define MNIST_DIMS 28 * 28 +#define DATA_DIM MNIST_DIMS +// #define DATA_DIM 3 struct InferenceConfig { InferenceConfig(void) { @@ -29,9 +29,9 @@ struct InferenceConfig { token_dim = DATA_DIM; sequence_length = MAX_SEQ_LEN; batch_size = BATCH_SIZE; - out_dim = 3; + out_dim = DATA_DIM; num_labels = out_dim; - num_layers = 1; + num_layers = 3; //----------------------- Inference parameters --------------------- // total number of requests processed as part of the simulation total_requests = 2560; @@ -41,11 +41,11 @@ struct InferenceConfig { num_inflight_batches = 4; incremental_mode = true; //----------------------- Rest of model parameters ------------------ - hidden_size = 12; + hidden_size = DATA_DIM; // Encoder layer - num_attention_heads = 3; + num_attention_heads = 16; attention_kdim = attention_vdim = hidden_size / num_attention_heads; - num_encoder_layers = 1; + num_encoder_layers = 3; } // Input/output data diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 4d4aec7054..ac12b11dd0 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,7 +20,7 @@ // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 #define MAX_SEQ_LEN 20 -#define BATCH_SIZE 2 +#define BATCH_SIZE 32 #define MAX_REQUESTS 256 namespace FlexFlow { diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 67b9d875c7..e51545ffdb 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -80,7 +80,8 @@ void experts_forward_thrust_wrapper(ExpertsMeta const *m, sorted_indices + num_indices, original_indices); - // get lower and upper bound of indices corresponding to experts in the block + // get lower and upper bound of token->expert assignments corresponding to + // experts in the block thrust::device_ptr lb = thrust::lower_bound(thrust::cuda::par.on(stream), sorted_indices, sorted_indices + num_indices, @@ -90,28 +91,34 @@ void experts_forward_thrust_wrapper(ExpertsMeta const *m, sorted_indices, sorted_indices + num_indices, experts_start_idx + num_experts_per_block - 1); - + // lowest index in the sorted indices array corresponding to an expert within + // the block *lb_index = lb - sorted_indices; + // 1 + largest index in the sorted indices array corresponding to an expert + // within the block *ub_index = ub - sorted_indices; *num_valid_assignments = (*ub_index) - (*lb_index); if ((*num_valid_assignments) == 0) { return; } - // create "exp_local_label_to_index", a mapping from local expert label to its - // non-zero expert index thrust::device_ptr non_zero_expert_labels = thrust::device_pointer_cast(m->non_zero_expert_labels); + // non_zero_expert_labels: a list of global labels of the experts in this + // block receiving nonzero tokens thrust::device_ptr non_zero_expert_labels_end = thrust::unique_copy( thrust::cuda::par.on(stream), lb, ub, non_zero_expert_labels); + // number of experts in this block receiving at least one token *non_zero_experts_count = non_zero_expert_labels_end - non_zero_expert_labels; using namespace thrust::placeholders; + // convert global labels to local labelling (e.g. expert 65->index 65-64=1 in + // block containing experts 64-96) by substracting the experts_start_idx, + // inplace. 
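      // [Editor's note] A concrete, made-up example of the bookkeeping in this
      // hunk: suppose this block owns experts 64..95 and the sorted
      // token->expert assignments are [12, 64, 64, 70, 70, 70, 97]. Then lb
      // points at the first 64 and ub just past the last 70, so
      // num_valid_assignments = 5; unique_copy produces the labels {64, 70},
      // so non_zero_experts_count = 2; and the for_each below rewrites those
      // labels in place to block-local indices {0, 6} by subtracting
      // experts_start_idx = 64.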
thrust::for_each(thrust::cuda::par.on(stream), non_zero_expert_labels, non_zero_expert_labels + (*non_zero_experts_count), - _1 -= - experts_start_idx); // convert global indexes to local ones + _1 -= experts_start_idx); thrust::device_ptr temp_sequence = thrust::device_pointer_cast(m->temp_sequence); @@ -119,6 +126,9 @@ void experts_forward_thrust_wrapper(ExpertsMeta const *m, temp_sequence, temp_sequence + (*non_zero_experts_count)); + // create "exp_local_label_to_index", a mapping from local expert label to its + // non-zero expert index (i.e. expert with index i is the i-th expert in the + // block to receive at least 1 token) thrust::device_ptr exp_local_label_to_index = thrust::device_pointer_cast(m->exp_local_label_to_index); thrust::scatter(thrust::cuda::par.on(stream), @@ -145,7 +155,7 @@ void experts_forward_thrust_wrapper(ExpertsMeta const *m, assert((*start_indexes) == (*non_zero_experts_count)); // append ub_index - expert_start_indexes[(*start_indexes)] = (*ub_index); + expert_start_indexes[(*start_indexes)] = (*ub_index) - (*lb_index); // get number of token assignment to each expert thrust::device_ptr num_assignments_per_expert = @@ -435,6 +445,17 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaStreamSynchronize(stream); + assert(ub_index - lb_index == num_valid_assignments); + assert(num_valid_assignments >= non_zero_experts_count); + assert(non_zero_experts_count <= num_experts_per_block); + if (non_zero_experts_count == 0) { + assert(num_valid_assignments == 0 && gemm_batch_count == 0); + } else { + assert(num_valid_assignments > 0 && gemm_batch_count > 0); + } + assert(num_valid_assignments <= num_indices); + assert(gemm_batch_count <= num_valid_assignments); + if (num_valid_assignments == 0) { return; } @@ -469,34 +490,34 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaStreamSynchronize(stream); - // experts_forward_GemmBatched_kernel(m, - // (void const **)m->weight_idx_array, - // (void const **)m->token_idx_array, - // (void **)m->dev_batch_outputs, - // (void const **)m->bias_idx_array, - // activation, - // data_dim, - // out_dim, - // num_tokens, - // num_chosen_experts, - // gemm_batch_count, - // stream); + experts_forward_GemmBatched_kernel(m, + (void const **)m->weight_idx_array, + (void const **)m->token_idx_array, + (void **)m->dev_batch_outputs, + (void const **)m->bias_idx_array, + activation, + data_dim, + out_dim, + num_tokens, + num_chosen_experts, + gemm_batch_count, + stream); cudaStreamSynchronize(stream); - // int aggregation_parallelism = - // std::max(num_tokens, gemm_batch_count) * out_dim; - // experts_forward_aggregate_kernel<<>>(num_tokens, - // gemm_batch_count, - // out_dim, - // output, - // m->dev_batch_outputs, - // m->coefficient_idx_array, - // m->output_idx_array); + int aggregation_parallelism = + std::max(num_tokens, gemm_batch_count) * out_dim; + experts_forward_aggregate_kernel<<>>(num_tokens, + gemm_batch_count, + out_dim, + output, + m->dev_batch_outputs, + m->coefficient_idx_array, + m->output_idx_array); if (m->profiling) { cudaEventRecord(t_end, stream); From ad4a60ff5c3f35fec35a0797bf1f4fd6d7721d7b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 28 Mar 2023 14:14:12 -0400 Subject: [PATCH 082/344] fix --- examples/cpp/inference/transformers/transformers.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index da8b872387..217e5583fb 100644 --- 
a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -98,7 +98,7 @@ void FlexFlow::top_level_task(Task const *task, //----------------------- Define the model ------------------------------ Tensor t = input; for (int i = 0; i < transformerConfig.num_layers; i++) { - t = create_inc_multihead_attention_decoder(&ff, &transformerConfig, input); + t = create_inc_multihead_attention_decoder(&ff, &transformerConfig, t); } t = ff.dense(t, transformerConfig.out_dim, AC_MODE_RELU); t = ff.softmax(t); From bf34ba9abc4d5c5e74faac28fa66aaf48175e7fb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 28 Mar 2023 20:20:24 +0000 Subject: [PATCH 083/344] fixed legion warning --- examples/cpp/inference/mixture_of_experts/moe.cc | 3 ++- examples/cpp/inference/transformers/transformers.cc | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index c923013a88..f6d483cffa 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -216,11 +216,12 @@ void FlexFlow::top_level_task(Task const *task, assert(bc->register_new_request(guid, seq_lens.first, seq_lens.second)); } bc->prepare_next_batch(); - data_loader.next_batch(ff, bid, bc); runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + data_loader.next_batch(ff, bid, bc); FutureMap fm = im.inference(bid, *bc); runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); batch_configs[bid] = bc; diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 217e5583fb..797a2c1958 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -197,11 +197,12 @@ void FlexFlow::top_level_task(Task const *task, assert(bc->register_new_request(guid, seq_lens.first, seq_lens.second)); } bc->prepare_next_batch(); - data_loader.next_batch(ff, bid, bc); runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + data_loader.next_batch(ff, bid, bc); FutureMap fm = im.inference(bid, *bc); runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); batch_configs[bid] = bc; From 75a5b9036fa73dbcec5f379a25935417276154c6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 29 Mar 2023 04:10:33 +0000 Subject: [PATCH 084/344] various updates --- examples/cpp/inference/dataloader.cc | 6 ++++-- examples/cpp/inference/dataloader.h | 2 +- examples/cpp/inference/mixture_of_experts/moe.cc | 7 ++++--- examples/cpp/inference/transformers/transformers.cc | 7 ++++--- include/flexflow/inference.h | 1 + src/ops/inc_multihead_self_attention.cu | 9 ++++----- src/runtime/inference_manager.cc | 11 +++++++++-- 7 files changed, 27 insertions(+), 16 deletions(-) diff --git a/examples/cpp/inference/dataloader.cc b/examples/cpp/inference/dataloader.cc index fba9e24129..80ce078508 100644 --- a/examples/cpp/inference/dataloader.cc +++ b/examples/cpp/inference/dataloader.cc @@ -110,7 +110,7 @@ void DataLoader::load_entire_dataset(Task const *task, } } -void DataLoader::next_batch(FFModel &ff, int bid, BatchConfig *bc) { +void DataLoader::next_batch(FFModel &ff, int bid, BatchConfig *bc, MachineView const *mv) { size_t 
num_active_tokens = bc->num_active_tokens(); if (num_active_tokens == 0) { return; @@ -142,6 +142,8 @@ void DataLoader::next_batch(FFModel &ff, int bid, BatchConfig *bc) { argmap.set_point( *it, TaskArgument(&bc->token2ids, sizeof(BatchConfig::SampleIdxs))); } + MachineView const *view = mv ? mv : &batch_input[bid]->machine_view; + size_t machine_view_hash = view->hash(); IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, batch_input[bid]->parallel_is, TaskArgument(NULL, 0), @@ -149,7 +151,7 @@ void DataLoader::next_batch(FFModel &ff, int bid, BatchConfig *bc) { Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - batch_input[bid]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(full_input->region, 0 /*projection id*/, READ_ONLY, diff --git a/examples/cpp/inference/dataloader.h b/examples/cpp/inference/dataloader.h index b95108aa35..f3e9a989de 100644 --- a/examples/cpp/inference/dataloader.h +++ b/examples/cpp/inference/dataloader.h @@ -43,7 +43,7 @@ class DataLoader { std::vector const ®ions, Context ctx, Runtime *runtime); - void next_batch(FFModel &, int, BatchConfig *); + void next_batch(FFModel &ff, int bid, BatchConfig *bc, MachineView const *mv = nullptr); public: size_t num_samples; diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index f6d483cffa..d96ab74295 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -216,11 +216,12 @@ void FlexFlow::top_level_task(Task const *task, assert(bc->register_new_request(guid, seq_lens.first, seq_lens.second)); } bc->prepare_next_batch(); + MachineView *view = im.get_machine_view(bid % im.num_devices); - runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); - data_loader.next_batch(ff, bid, bc); + //runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + data_loader.next_batch(ff, bid, bc, view); FutureMap fm = im.inference(bid, *bc); - runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + //runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 797a2c1958..a85ca02b10 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -197,11 +197,12 @@ void FlexFlow::top_level_task(Task const *task, assert(bc->register_new_request(guid, seq_lens.first, seq_lens.second)); } bc->prepare_next_batch(); + MachineView *view = im.get_machine_view(bid % im.num_devices); - runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); - data_loader.next_batch(ff, bid, bc); + //runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + data_loader.next_batch(ff, bid, bc, view); FutureMap fm = im.inference(bid, *bc); - runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + //runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 92aa8f5d21..87cc80e055 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -29,6 +29,7 @@ class InferenceManager { int max_num_inflight_batches); void compile_model_and_allocate_buffer(void); void 
init_operators_inference(); + MachineView *get_machine_view(int mv_id); Legion::FutureMap inference(int index, BatchConfig const &bc); public: diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 4c3e123aa3..e802647db5 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -495,23 +495,22 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - cudaDeviceSynchronize(); + // phase 1: Implement kernel to compute KQV for input tokens inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); - cudaDeviceSynchronize(); + // phase 2: Update key/val cache cudaMemcpyAsync(m->dev_token2ids, &(bc->token2ids.token_indexes), bc->MAX_NUM_TOKENS * sizeof(BatchConfig::token_idxs), cudaMemcpyHostToDevice, stream); - cudaDeviceSynchronize(); + inference_kernel2(m, bc, stream); - cudaDeviceSynchronize(); + // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 inference_kernel3(m, bc, output_ptr, stream); - cudaDeviceSynchronize(); if (m->profiling) { cudaEventRecord(t_end, stream); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index dcb6e9e67d..33707ea83d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -117,6 +117,11 @@ void InferenceManager::init_operators_inference() { } } +MachineView *InferenceManager::get_machine_view(int mv_id) { + assert(mv_id >= 0 && mv_id < machine_views.size()); + return &machine_views[mv_id]; +} + FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { int batch_index = index % max_num_inflight_batches; int device_index = index % num_devices; @@ -130,11 +135,13 @@ FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { MachineView *view; if (op->op_type == OP_EXPERTS) { - view = &machine_views[expert_device_index]; + view = get_machine_view(expert_device_index); + //view = &machine_views[expert_device_index]; expert_device_index = (expert_device_index + 1) % num_devices; } else { // pick mv w startdeviceid = device_index - view = &machine_views[device_index]; + //view = &machine_views[device_index]; + view = get_machine_view(device_index); } std::vector inputs(op->numInputs); From a0bbfcb754de2d38a6ec57a02ecda08e6434421c Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Wed, 29 Mar 2023 21:59:37 -0400 Subject: [PATCH 085/344] Fixed parallel tensor equals. 
(#665) --- examples/cpp/inference/dataloader.cc | 5 ++++- examples/cpp/inference/dataloader.h | 5 ++++- examples/cpp/inference/mixture_of_experts/moe.cc | 4 ++-- examples/cpp/inference/transformers/transformers.cc | 4 ++-- src/ops/experts.cu | 7 +++++++ src/runtime/inference_manager.cc | 4 ++-- src/runtime/parallel_tensor.cc | 1 + 7 files changed, 22 insertions(+), 8 deletions(-) diff --git a/examples/cpp/inference/dataloader.cc b/examples/cpp/inference/dataloader.cc index 80ce078508..2de6648b12 100644 --- a/examples/cpp/inference/dataloader.cc +++ b/examples/cpp/inference/dataloader.cc @@ -110,7 +110,10 @@ void DataLoader::load_entire_dataset(Task const *task, } } -void DataLoader::next_batch(FFModel &ff, int bid, BatchConfig *bc, MachineView const *mv) { +void DataLoader::next_batch(FFModel &ff, + int bid, + BatchConfig *bc, + MachineView const *mv) { size_t num_active_tokens = bc->num_active_tokens(); if (num_active_tokens == 0) { return; diff --git a/examples/cpp/inference/dataloader.h b/examples/cpp/inference/dataloader.h index f3e9a989de..59c5e8b119 100644 --- a/examples/cpp/inference/dataloader.h +++ b/examples/cpp/inference/dataloader.h @@ -43,7 +43,10 @@ class DataLoader { std::vector const ®ions, Context ctx, Runtime *runtime); - void next_batch(FFModel &ff, int bid, BatchConfig *bc, MachineView const *mv = nullptr); + void next_batch(FFModel &ff, + int bid, + BatchConfig *bc, + MachineView const *mv = nullptr); public: size_t num_samples; diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index d96ab74295..e35f998be3 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -218,10 +218,10 @@ void FlexFlow::top_level_task(Task const *task, bc->prepare_next_batch(); MachineView *view = im.get_machine_view(bid % im.num_devices); - //runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + // runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); data_loader.next_batch(ff, bid, bc, view); FutureMap fm = im.inference(bid, *bc); - //runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + // runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index a85ca02b10..98a0bc5be4 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -199,10 +199,10 @@ void FlexFlow::top_level_task(Task const *task, bc->prepare_next_batch(); MachineView *view = im.get_machine_view(bid % im.num_devices); - //runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); + // runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); data_loader.next_batch(ff, bid, bc, view); FutureMap fm = im.inference(bid, *bc); - //runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); + // runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); diff --git a/src/ops/experts.cu b/src/ops/experts.cu index e51545ffdb..c4c5620cff 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -457,6 +457,13 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(gemm_batch_count <= num_valid_assignments); if (num_valid_assignments == 0) { + if (m->profiling) { + 
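      // [Editor's note] Early-exit path added by this patch: when no tokens
      // were routed to any expert in this block, the wrapper still records
      // t_end, computes the elapsed time against the t_start event recorded
      // earlier in the function, and prints it before the early return a few
      // lines below, so profiling output stays balanced.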
cudaEventRecord(t_end, stream); + cudaEventSynchronize(t_end); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, t_start, t_end); + printf("forward_kernel_wrapper: %f ms\n", milliseconds); + } return; } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 33707ea83d..ba62357411 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -136,11 +136,11 @@ FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { MachineView *view; if (op->op_type == OP_EXPERTS) { view = get_machine_view(expert_device_index); - //view = &machine_views[expert_device_index]; + // view = &machine_views[expert_device_index]; expert_device_index = (expert_device_index + 1) % num_devices; } else { // pick mv w startdeviceid = device_index - //view = &machine_views[device_index]; + // view = &machine_views[device_index]; view = get_machine_view(device_index); } diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index a64d118fbc..3ad2f17f0c 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -762,6 +762,7 @@ bool ParallelTensorBase::tensor_equal(FFConfig &config, launcher.add_field(1, FID_DATA); Future result = runtime->execute_task(ctx, launcher); bool equals = result.get_result(); + return equals; } bool ParallelTensorBase::tensor_equal_task( From ccc06ec6f5a6e1a92879e28a9d2c0b2590b5827d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 31 Mar 2023 01:03:52 -0400 Subject: [PATCH 086/344] [Inference] - Pass generated tokens to next batch (#666) * add single output topk operator * saving results in InferenceResult struct * storing output * bug fixing * undo triton file change * fix --- examples/cpp/inference/data_generator.cc | 19 +- examples/cpp/inference/data_generator.cpp | 8 +- examples/cpp/inference/data_generator.h | 12 +- examples/cpp/inference/dataloader.cc | 67 ++- examples/cpp/inference/dataloader.cu | 108 +++-- examples/cpp/inference/dataloader.h | 9 + examples/cpp/inference/inference_config.h | 8 + .../cpp/inference/mixture_of_experts/moe.cc | 24 +- .../inference/transformers/transformers.cc | 25 +- include/flexflow/batch_config.h | 4 +- include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 10 + include/flexflow/operator_params.h | 2 + include/flexflow/ops/arg_topk.h | 97 ++++ include/flexflow/ops/arg_topk_params.h | 25 + include/flexflow/ops/embedding.h | 9 + include/flexflow/utils/cuda_helper.h | 3 + src/ops/arg_topk.cc | 386 +++++++++++++++ src/ops/arg_topk.cpp | 450 ++++++++++++++++++ src/ops/arg_topk.cu | 446 +++++++++++++++++ src/ops/embedding.cc | 87 ++++ src/runtime/batch_config.cc | 19 +- src/runtime/cuda_helper.cu | 24 +- src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 5 + src/runtime/hip_helper.cpp | 45 ++ src/runtime/model.cc | 21 + 27 files changed, 1829 insertions(+), 87 deletions(-) create mode 100644 include/flexflow/ops/arg_topk.h create mode 100644 include/flexflow/ops/arg_topk_params.h create mode 100644 src/ops/arg_topk.cc create mode 100644 src/ops/arg_topk.cpp create mode 100644 src/ops/arg_topk.cu diff --git a/examples/cpp/inference/data_generator.cc b/examples/cpp/inference/data_generator.cc index f691247485..69ed577822 100644 --- a/examples/cpp/inference/data_generator.cc +++ b/examples/cpp/inference/data_generator.cc @@ -21,14 +21,14 @@ using namespace std; DataGenerator::DataGenerator(size_t _num_requests, - size_t _token_dim, + size_t _vocab_size, size_t _min_input_tokens, size_t _max_input_tokens, size_t 
_min_tokens_to_generate, size_t _max_tokens_to_generate, bool _poisson_distr, double _lambda) - : num_requests(_num_requests), token_dim(_token_dim), + : num_requests(_num_requests), vocab_size(_vocab_size), min_input_tokens(_min_input_tokens), max_input_tokens(_max_input_tokens), min_tokens_to_generate(_min_tokens_to_generate), max_tokens_to_generate(_max_tokens_to_generate), @@ -73,7 +73,7 @@ void DataGenerator::generate_requests_meta() { // cout << "]" << endl; }; -void DataGenerator::generate_requests(float *req_ptr) { +void DataGenerator::generate_requests(int *req_ptr) { assert(req_ptr != nullptr); /* for (size_t i=0; i float_dist{0, 1.0}; - auto gen = [&float_dist, &mersenne_engine]() { - return float_dist(mersenne_engine); + // uniform_real_distribution float_dist{0, 1.0}; + // auto gen = [&float_dist, &mersenne_engine]() { + // return float_dist(mersenne_engine); + // }; + std::uniform_int_distribution int_dist(0, vocab_size - 1); + auto gen = [&int_dist, &mersenne_engine]() { + return int_dist(mersenne_engine); }; - std::generate( - req_ptr, req_ptr + token_dim * max_input_tokens * num_requests, gen); + std::generate(req_ptr, req_ptr + max_input_tokens * num_requests, gen); }; void DataGenerator::start_timer(void) { diff --git a/examples/cpp/inference/data_generator.cpp b/examples/cpp/inference/data_generator.cpp index 4201e36915..765e9813b9 100644 --- a/examples/cpp/inference/data_generator.cpp +++ b/examples/cpp/inference/data_generator.cpp @@ -19,7 +19,7 @@ int main(int argc, char const *argv[]) { // DataGenerator parameters size_t total_requests = 2560; - size_t token_dim = 16; + size_t vocab_size = 50257; size_t max_sequence_length = 512 + 128; bool use_poisson_distr = true; // average number of request arrivals per second @@ -28,11 +28,11 @@ int main(int argc, char const *argv[]) { size_t min_input_tokens = 32, max_input_tokens = 512, min_tokens_to_generate = 1, max_tokens_to_generate = 128; - float *requests = (float *)calloc( - token_dim * max_sequence_length * total_requests, sizeof(float)); + int *requests = + (int *)calloc(max_sequence_length * total_requests, sizeof(int)); DataGenerator data_generator(total_requests, - token_dim, + vocab_size, min_input_tokens, max_input_tokens, min_tokens_to_generate, diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h index d83df8e1fe..3ba3007123 100644 --- a/examples/cpp/inference/data_generator.h +++ b/examples/cpp/inference/data_generator.h @@ -33,7 +33,7 @@ typedef std::chrono::milliseconds milliseconds; class DataGenerator { public: DataGenerator(size_t _num_requests, - size_t _token_dim, + size_t _vocab_size, size_t _min_input_tokens, size_t _max_input_tokens, size_t _min_tokens_to_generate, @@ -41,17 +41,15 @@ class DataGenerator { bool _poisson_distr, double _lambda); - // Generate random requests by filling each token with random data. For now, - // assume all requests have the same sequence length. Also generate random - // labels (if label_ptr != nullptr and num_labels >0). - void generate_requests(float *req_ptr); + // Generate random requests by filling each tensor with random tokens. For + // now, assume all requests have the same sequence length. 
+ void generate_requests(int *req_ptr); void start_timer(void); // Get number of requests that have arrived since the last time this function // was called std::pair get_requests(size_t max_requests, size_t max_tokens); std::pair get_request_length(size_t guid); - // size_t max_sequence_length; // dimension of one request tensor private: // Compute the arrival times of each request and save them in the arrivals @@ -60,7 +58,7 @@ class DataGenerator { void generate_requests_meta(); size_t num_requests; // total number of requests - size_t token_dim; // embedding dim of each token + size_t vocab_size; // number of words in the vocab size_t min_input_tokens; size_t max_input_tokens; size_t min_tokens_to_generate; diff --git a/examples/cpp/inference/dataloader.cc b/examples/cpp/inference/dataloader.cc index 2de6648b12..36f99718c3 100644 --- a/examples/cpp/inference/dataloader.cc +++ b/examples/cpp/inference/dataloader.cc @@ -58,7 +58,7 @@ DataLoader::DataLoader(FFModel &ff, dims[batch_idx].size = num_samples; full_input = - ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_FLOAT); + ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_INT32); ff.map_tensor(full_input, NULL /*parallel_op*/); } @@ -93,7 +93,7 @@ void DataLoader::load_entire_dataset(Task const *task, assert(task->regions.size() == regions.size()); // get input pointer - float *input_ptr = helperGetTensorPointerWO( + int *input_ptr = helperGetTensorPointerWO( regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -113,6 +113,7 @@ void DataLoader::load_entire_dataset(Task const *task, void DataLoader::next_batch(FFModel &ff, int bid, BatchConfig *bc, + std::map &batch_predictions, MachineView const *mv) { size_t num_active_tokens = bc->num_active_tokens(); if (num_active_tokens == 0) { @@ -140,16 +141,23 @@ void DataLoader::next_batch(FFModel &ff, num_active_tokens); */ assert(ff.config.batchSize == batch_size && batch_size * seq_len >= num_active_tokens); - for (Domain::DomainPointIterator it(domain); it; it++) { - // SampleIdxs meta = bc->token2ids; - argmap.set_point( - *it, TaskArgument(&bc->token2ids, sizeof(BatchConfig::SampleIdxs))); - } + + /* std::cout << "About to call next_batch function..." << std::endl; + bc->print(); + std::cout << "batch_predictions: "; + for (const auto& elem : batch_predictions){ + std::cout << elem.first << ":" << elem.second << ", "; + } */ + DataLoaderNextBatchInput next_batch_input = {bc->token2ids, + batch_predictions}; + DataLoaderNextBatchInput const *ptr = &next_batch_input; + size_t next_batch_input_sz = sizeof(next_batch_input); + assert(ptr->prev_batch_preds.size() == batch_predictions.size()); MachineView const *view = mv ? 
mv : &batch_input[bid]->machine_view; size_t machine_view_hash = view->hash(); IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, batch_input[bid]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(ptr, next_batch_input_sz), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -173,6 +181,49 @@ void DataLoader::next_batch(FFModel &ff, } } +void DataLoader::store_outputs(BatchConfig *bc, + InferenceResult const &ir, + std::map &batch_predictions) { + assert(bc->token2ids.num_samples == bc->num_active_tokens() && + bc->token2ids.num_samples <= bc->MAX_NUM_TOKENS); + batch_predictions.clear(); + // bc->print(); + for (size_t i = 0; i < bc->token2ids.num_samples; i++) { + if (i == bc->token2ids.num_samples - 1 || + bc->token2ids.guids[i] != bc->token2ids.guids[i + 1]) { + assert(bc->token2ids.token_indexes[i].token_position == + bc->token_last_available_idx[bc->token2ids.token_indexes[i] + .request_index]); + if (outputs.find(bc->token2ids.guids[i]) == outputs.end()) { + std::vector v{ir.results[i]}; + outputs[bc->token2ids.guids[i]] = v; + } else { + outputs[bc->token2ids.guids[i]].push_back(ir.results[i]); + } + /* std::cout << "outputs: "; + for(const auto& elem : outputs){ + std::cout << elem.first << ": ["; + for (const auto &vel : elem.second) { + std::cout << vel << " "; + } + std::cout << "]" << std::endl; + } */ + // std::cout << "outputs[bc->token2ids.guids[i]].size(): " << + // outputs[bc->token2ids.guids[i]].size() << std::endl; std::cout << "i: " + // << i << std::endl; std::cout << + // "bc->token2ids.token_indexes[i].token_position: " << + // bc->token2ids.token_indexes[i].token_position << std::endl; std::cout + // << "bc->token2ids.token_indexes[i].initial_length: " << + // bc->token2ids.token_indexes[i].initial_length << std::endl; + assert(outputs[bc->token2ids.guids[i]].size() == + (bc->token2ids.token_indexes[i].token_position + 1) - + (bc->token2ids.token_indexes[i].initial_length - 1)); + batch_predictions[bc->token2ids.guids[i]] = ir.results[i]; + } + } + assert(batch_predictions.size() == bc->num_active_requests()); +} + void FlexFlow::register_custom_tasks() { // Load entire dataset { diff --git a/examples/cpp/inference/dataloader.cu b/examples/cpp/inference/dataloader.cu index 0668fd949d..71dc14db49 100644 --- a/examples/cpp/inference/dataloader.cu +++ b/examples/cpp/inference/dataloader.cu @@ -22,13 +22,18 @@ void DataLoader::load_input(Task const *task, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); - BatchConfig::SampleIdxs *meta = (BatchConfig::SampleIdxs *)task->local_args; - if (meta->num_samples == 0) { + + DataLoaderNextBatchInput const input_struct = + *((DataLoaderNextBatchInput *)task->args); + BatchConfig::SampleIdxs const &meta = input_struct.meta; + std::map const &prev_batch_preds = input_struct.prev_batch_preds; + + if (meta.num_samples == 0) { return; } - float const *full_input_ptr = helperGetTensorPointerRO( + int const *full_input_ptr = helperGetTensorPointerRO( regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *batch_input_ptr = helperGetTensorPointerWO( + int *batch_input_ptr = helperGetTensorPointerWO( regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain full_input_domain = runtime->get_index_space_domain( @@ -36,67 +41,90 @@ void DataLoader::load_input(Task const *task, Domain batch_input_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - coord_t token_dim = - batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; coord_t sequence_length = - 
batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; + batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; coord_t batch_size = - batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; + batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; - coord_t full_input_token_dim = - batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; coord_t full_input_sequence_length = - batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; + batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; coord_t full_input_batch_size = - batch_input_domain.hi()[2] - batch_input_domain.lo()[2] + 1; - assert(token_dim == full_input_token_dim); + batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; + assert(sequence_length == full_input_sequence_length); assert(batch_size <= full_input_batch_size); // Currently assume continous indices - assert(meta->num_samples <= batch_size * sequence_length); - for (int i = 1; i < meta->num_samples; i++) { - if (meta->guids[i] == meta->guids[i - 1]) { - assert(meta->token_indexes[i].token_position == - meta->token_indexes[i - 1].token_position + 1); + assert(meta.num_samples <= batch_size * sequence_length); + for (int i = 1; i < meta.num_samples; i++) { + if (meta.guids[i] == meta.guids[i - 1]) { + assert(meta.token_indexes[i].token_position == + meta.token_indexes[i - 1].token_position + 1); } } // keep things simple for now - assert(batch_input_domain.get_volume() == - batch_size * sequence_length * token_dim); + assert(batch_input_domain.get_volume() == batch_size * sequence_length); // pad inputs if needed (this is really only useful for debugging) checkCUDA(cudaMemset( - batch_input_ptr, 0, batch_input_domain.get_volume() * sizeof(float))); + batch_input_ptr, 0, batch_input_domain.get_volume() * sizeof(int))); - size_t guid = meta->guids[0]; - size_t start_idx = meta->token_indexes[0].token_position; + size_t guid = meta.guids[0]; + size_t start_idx = meta.token_indexes[0].token_position; size_t dst_idx = 0; size_t total_tokens = 0; - for (size_t i = 1; i <= meta->num_samples; i++) { - if (i == meta->num_samples || meta->guids[i] != guid) { - size_t size_to_copy = - token_dim * - (meta->token_indexes[i - 1].token_position - start_idx + 1); - total_tokens += size_to_copy / token_dim; - float const *input_zc = full_input_ptr + - (guid * token_dim * sequence_length) + - start_idx * token_dim; - float *dst_ptr = batch_input_ptr + dst_idx * token_dim; - copy_kernel<<>>( - dst_ptr, input_zc, size_to_copy); - if (i < meta->num_samples) { - guid = meta->guids[i]; - start_idx = meta->token_indexes[i].token_position; + for (size_t i = 1; i <= meta.num_samples; i++) { + if (i == meta.num_samples || meta.guids[i] != guid) { + + size_t tokens_to_copy = + (meta.token_indexes[i - 1].token_position - start_idx + 1); + // size_t size_to_copy = token_dim * tokens_to_copy; + assert(tokens_to_copy > 0); + if (tokens_to_copy > 1 || meta.token_indexes[i - 1].token_position < + meta.token_indexes[i - 1].initial_length) { + // initialization phase + assert(meta.token_indexes[i - 1].token_position < + meta.token_indexes[i - 1].initial_length); + int const *input_zc = + full_input_ptr + (guid * sequence_length) + start_idx; + int *dst_ptr = batch_input_ptr + dst_idx; + copy_kernel<<>>( + dst_ptr, input_zc, tokens_to_copy); + } else { + // incremental phase + assert(meta.token_indexes[i - 1].token_position >= + meta.token_indexes[i - 1].initial_length); + assert(tokens_to_copy == 1); + + /* std::cout << "Looking for guid: " << guid << std::endl; + 
std::cout << "prev_batch_preds: "; + for (const auto& elem : prev_batch_preds){ + std::cout << elem.first << ":" << elem.second << ", "; + } + std::cout << std::endl; */ + assert(prev_batch_preds.find(guid) != prev_batch_preds.end()); + int token = prev_batch_preds.at(guid); + int *dst_ptr = batch_input_ptr + dst_idx; + cudaMemcpy(dst_ptr, &token, 1, cudaMemcpyHostToDevice); + // copy_kernel<<>>(dst_ptr, &token, tokens_to_copy); + // cudaMemcpyAsync(batch_input_ptr + dst_idx * token_dim, &token, 1, + // cudaMemcpyHostToDevice); + } + total_tokens += tokens_to_copy; + + if (i < meta.num_samples) { + guid = meta.guids[i]; + start_idx = meta.token_indexes[i].token_position; } dst_idx = i; } } - assert(total_tokens == meta->num_samples); + assert(total_tokens == meta.num_samples); /*printf("token_dim: %lli, sequence_length: %lli, batch_size: %lli\n", token_dim, sequence_length, batch_size); printf("total_tokens: %lu\n", total_tokens); printf("guid: %lu\n", guid); - print_tensor(batch_input_ptr, + print_tensor(batch_input_ptr, batch_input_domain.get_volume(), "[BatchInput]");*/ checkCUDA(cudaDeviceSynchronize()); diff --git a/examples/cpp/inference/dataloader.h b/examples/cpp/inference/dataloader.h index 59c5e8b119..afb45801d1 100644 --- a/examples/cpp/inference/dataloader.h +++ b/examples/cpp/inference/dataloader.h @@ -46,14 +46,23 @@ class DataLoader { void next_batch(FFModel &ff, int bid, BatchConfig *bc, + std::map &batch_predictions, MachineView const *mv = nullptr); + void store_outputs(BatchConfig *bc, + InferenceResult const &ir, + std::map &batch_predictions); public: size_t num_samples; ParallelTensor full_input; std::vector batch_input; + std::map> outputs; struct DataLoaderInput { InferenceConfig const &_inferenceConfig; DataGenerator &_data_generator; }; + struct DataLoaderNextBatchInput { + BatchConfig::SampleIdxs const &meta; + std::map const &prev_batch_preds; + }; }; diff --git a/examples/cpp/inference/inference_config.h b/examples/cpp/inference/inference_config.h index 7214cf42a2..8301640e45 100644 --- a/examples/cpp/inference/inference_config.h +++ b/examples/cpp/inference/inference_config.h @@ -32,6 +32,10 @@ struct InferenceConfig { out_dim = DATA_DIM; num_labels = out_dim; num_layers = 3; + + vocab_size = 50257; + block_size = 1024; + //----------------------- Inference parameters --------------------- // total number of requests processed as part of the simulation total_requests = 2560; @@ -55,6 +59,10 @@ struct InferenceConfig { int out_dim; int num_labels; int num_layers; + + int vocab_size; + int block_size; + std::string dataset_path; // Inference parameters int total_requests; diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index e35f998be3..5ebd23a4c7 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -117,16 +117,26 @@ void FlexFlow::top_level_task(Task const *task, //----------------------- Create inputs -------------------------------- Tensor input; { - int const dims[] = { - ffConfig.batchSize, moeConfig.sequence_length, moeConfig.token_dim}; - input = ff.create_tensor<3>(dims, DT_FLOAT); + int const dims[] = {ffConfig.batchSize, moeConfig.sequence_length}; + input = ff.create_tensor<2>(dims, DT_INT32); } + Tensor t = input; + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + t = ff.embedding(t, + moeConfig.vocab_size, + moeConfig.token_dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); 
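  // [Editor's note] After this change the MoE example feeds token ids rather
  // than dense features: input is an int32 tensor of shape
  // [batch_size, sequence_length], and the embedding layer above maps it to
  // float activations of shape [batch_size, sequence_length, token_dim]
  // before the encoder stack defined below.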
//----------------------- Define the model ------------------------------ - Tensor t = create_moe_encoder(&ff, &moeConfig, input); + t = create_moe_encoder(&ff, &moeConfig, t); // Tensor t = create_moe(&ff, &moeConfig, input); t = ff.dense(t, moeConfig.out_dim, AC_MODE_RELU); t = ff.softmax(t); + // select most likely next token + Tensor output = ff.arg_top_k(t, /*k=*/1, /*sorted=*/false); //------------------- Initialize the inference manager ------------------ InferenceManager im( @@ -141,7 +151,7 @@ void FlexFlow::top_level_task(Task const *task, min_tokens_to_generate = 1, max_tokens_to_generate = MAX_SEQ_LEN - max_input_tokens; DataGenerator data_generator(moeConfig.total_requests, - moeConfig.token_dim, + moeConfig.vocab_size, min_input_tokens, max_input_tokens, min_tokens_to_generate, @@ -173,6 +183,7 @@ void FlexFlow::top_level_task(Task const *task, std::map batch_configs; std::pair new_prompts; BatchConfig *bc = nullptr; + std::map batch_predictions[im.max_num_inflight_batches]; assert(im.max_num_requests_per_batch == moeConfig.batch_size); @@ -193,6 +204,7 @@ void FlexFlow::top_level_task(Task const *task, } InferenceResult ir = future.get_result(); bc = batch_configs[bid]; + data_loader.store_outputs(bc, ir, batch_predictions[bid]); processed_requests += bc->update_results(ir); max_reqs = moeConfig.incremental_mode ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() @@ -219,7 +231,7 @@ void FlexFlow::top_level_task(Task const *task, MachineView *view = im.get_machine_view(bid % im.num_devices); // runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); - data_loader.next_batch(ff, bid, bc, view); + data_loader.next_batch(ff, bid, bc, batch_predictions[bid], view); FutureMap fm = im.inference(bid, *bc); // runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 98a0bc5be4..14414bb8f1 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -89,19 +89,29 @@ void FlexFlow::top_level_task(Task const *task, //----------------------- Create inputs -------------------------------- Tensor input; { - int const dims[] = {ffConfig.batchSize, - transformerConfig.sequence_length, - transformerConfig.token_dim}; - input = ff.create_tensor<3>(dims, DT_FLOAT); + int const dims[] = {ffConfig.batchSize, transformerConfig.sequence_length}; + input = ff.create_tensor<2>(dims, DT_INT32); } //----------------------- Define the model ------------------------------ Tensor t = input; + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + t = ff.embedding(t, + transformerConfig.vocab_size, + transformerConfig.token_dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + for (int i = 0; i < transformerConfig.num_layers; i++) { t = create_inc_multihead_attention_decoder(&ff, &transformerConfig, t); } t = ff.dense(t, transformerConfig.out_dim, AC_MODE_RELU); t = ff.softmax(t); + // select most likely next token + Tensor output = ff.arg_top_k(t, /*k=*/1, false); //------------------- Initialize the inference manager ------------------ InferenceManager im(&ff, @@ -117,7 +127,7 @@ void FlexFlow::top_level_task(Task const *task, min_tokens_to_generate = 1, max_tokens_to_generate = MAX_SEQ_LEN - max_input_tokens; DataGenerator data_generator(transformerConfig.total_requests, - transformerConfig.token_dim, + transformerConfig.vocab_size, min_input_tokens, max_input_tokens, 
min_tokens_to_generate, @@ -140,7 +150,6 @@ void FlexFlow::top_level_task(Task const *task, } double ts_start = Realm::Clock::current_time_in_microseconds(); - //----------------------- Begin inference! ------------------------------- //----------------------- Begin inference! ------------------------------- int index = 0; int processed_requests = 0; @@ -150,6 +159,7 @@ void FlexFlow::top_level_task(Task const *task, std::map batch_configs; std::pair new_prompts; BatchConfig *bc = nullptr; + std::map batch_predictions[im.max_num_inflight_batches]; assert(im.max_num_requests_per_batch == transformerConfig.batch_size); // assert(transformerConfig.batch_size <= BatchConfig::MAX_NUM_REQUESTS); @@ -173,6 +183,7 @@ void FlexFlow::top_level_task(Task const *task, } InferenceResult ir = future.get_result(); bc = batch_configs[bid]; + data_loader.store_outputs(bc, ir, batch_predictions[bid]); processed_requests += bc->update_results(ir); max_reqs = transformerConfig.incremental_mode ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() @@ -200,7 +211,7 @@ void FlexFlow::top_level_task(Task const *task, MachineView *view = im.get_machine_view(bid % im.num_devices); // runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); - data_loader.next_batch(ff, bid, bc, view); + data_loader.next_batch(ff, bid, bc, batch_predictions[bid], view); FutureMap fm = im.inference(bid, *bc); // runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ac12b11dd0..05f6e062d6 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -34,7 +34,7 @@ class BatchConfig { public: BatchConfig(); bool register_new_request(size_t guid, - int initial_length, + int initial_len, int tokens_to_generate); void prepare_next_batch(); int update_results(InferenceResult const &ir); @@ -57,6 +57,7 @@ class BatchConfig { int num_processing_tokens[MAX_NUM_REQUESTS]; // a request's number of tokens // being processed in the current // batch/iteration + size_t initial_length[MAX_NUM_REQUESTS]; size_t max_sequence_length[MAX_NUM_REQUESTS]; struct token_idxs { @@ -64,6 +65,7 @@ class BatchConfig { // that the token belongs to size_t token_position; // the index indicating the position of each token // within its request + size_t initial_length; }; struct SampleIdxs { diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 2ccf4a0eb1..c698191811 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -123,6 +123,7 @@ enum OperatorType { OP_SHAPE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Shape OP_SIZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Size OP_TOPK, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#TopK + OP_ARG_TOPK, OP_WHERE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Where OP_CEIL, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Ceil OP_CAST, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Cast diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 6e8effcb27..da26d54af2 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -132,6 +132,8 @@ enum TaskIDs { TOPK_INIT_TASK_ID, TOPK_FWD_TASK_ID, TOPK_BWD_TASK_ID, + ARG_TOPK_INIT_TASK_ID, + ARG_TOPK_INF_TASK_ID, TRANSPOSE_INIT_TASK_ID, TRANSPOSE_FWD_TASK_ID, TRANSPOSE_BWD_TASK_ID, @@ -284,6 +286,7 @@ class Reshape; class Softmax; class Split; class TopK; +class ArgTopK; class Transpose; class 
Combine; class Repartition; @@ -560,6 +563,11 @@ class FFModel { int k, bool sorted, char const *name = NULL); + Tensor arg_top_k(const Tensor input, + // Tensor *outputs, + int k, + bool sorted, + char const *name = NULL); Tensor multihead_attention(const Tensor query, const Tensor key, const Tensor value, @@ -953,6 +961,8 @@ class FFModel { std::unordered_map, Softmax *>, std::unordered_map, TopK *>, + std::unordered_map, + ArgTopK *>, std::unordered_map, Transpose *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 899921a758..340cc38659 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -3,6 +3,7 @@ #include "flexflow/ops/aggregate_params.h" #include "flexflow/ops/aggregate_spec_params.h" +#include "flexflow/ops/arg_topk_params.h" #include "flexflow/ops/attention_params.h" #include "flexflow/ops/batch_matmul_params.h" #include "flexflow/ops/cast_params.h" @@ -59,6 +60,7 @@ using OperatorParameters = mp::variant const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static InferenceResult + inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void serialize(Legion::Serializer &s) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + static void forward_kernel(ArgTopKMeta const *m, + float const *input_ptr, + // float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted, + ffStream_t stream); + static void forward_kernel_wrapper(ArgTopKMeta const *m, + float const *input_ptr, + // float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted); + Params get_params() const; + +public: + int k; + bool sorted; +}; + +}; // namespace FlexFlow + +#endif diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h new file mode 100644 index 0000000000..ca88a5b9be --- /dev/null +++ b/include/flexflow/ops/arg_topk_params.h @@ -0,0 +1,25 @@ +#ifndef _FLEXFLOW_ARG_TOPK_PARAMS_H +#define _FLEXFLOW_ARG_TOPK_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct ArgTopKParams { + int k; + bool sorted; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(ArgTopKParams const &, ArgTopKParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::ArgTopKParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_ARG_TOPK_PARAMS_H diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index 
91caf06af0..bd7c15b2fe 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -49,8 +49,17 @@ class Embedding : public Op { bool allocate_weights = false, char const *name = nullptr); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; // void update(const FFModel&); void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index a2e3e4fcdc..4271919911 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -136,6 +136,9 @@ void print_tensor(T const *ptr, size_t num_elements, char const *prefix); template T *download_tensor(T const *ptr, size_t num_elements); +template +bool download_tensor(T const *ptr, T *dst, size_t num_elements); + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc new file mode 100644 index 0000000000..35bb80a2b9 --- /dev/null +++ b/src/ops/arg_topk.cc @@ -0,0 +1,386 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/arg_topk.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +// For an input tensor, computes the top k entries in each row +// (resp. vector along the last dimension). 
Thus, +// values.shape = indices.shape = input.shape[:-1] + [k] +Tensor FFModel::arg_top_k(const Tensor input, + int k, + bool sorted, + char const *name) { + Layer *li = new Layer(this, + OP_ARG_TOPK, + input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = k; + // li->outputs[0] = create_tensor_legion_ordering( + // numdims, dims, input->data_type, li, 0, true /*create_grad*/); + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + } + li->add_int_property("k", k); + li->add_int_property("sorted", sorted); + layers.push_back(li); + // outputs[0] = li->outputs[0]; + // outputs[1] = li->outputs[1]; + return li->outputs[0]; +} + +Op *ArgTopK::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("k", value); + int k = value; + layer->get_int_property("sorted", value); + bool sorted = (bool)value; + return new ArgTopK(model, inputs[0], k, sorted, layer->name); +} + +ArgTopKParams ArgTopK::get_params() const { + ArgTopKParams params; + params.k = this->k; + params.sorted = this->sorted; + return params; +} + +bool ArgTopKParams::is_valid(ParallelTensorShape const &) const { + // topk is always valid + return true; +} + +bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) { + return lhs.k == rhs.k && lhs.sorted == rhs.sorted; +} + +ArgTopK::ArgTopK(FFModel &model, + const ParallelTensor _input, + int _k, + bool _sorted, + char const *name) + : Op(model, + OP_ARG_TOPK, + _input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + _input), + k(_k), sorted(_sorted) { + int numdim = inputs[0]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[0]->dims[i]; + } + dims[0].size = k; + assert(inputs[0]->dims[0].degree == 1); + assert(inputs[0]->dims[0].parallel_idx == -1); + // outputs[0] = model.create_parallel_tensor_legion_ordering( + // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_INT32, this, 0 /*owner_idx*/); +} + +ArgTopK::ArgTopK(FFModel &model, + ArgTopK const &other, + const ParallelTensor input) + : ArgTopK(model, input, other.k, other.sorted, other.name) {} + +ArgTopK::ArgTopK(FFModel &model, + ArgTopKParams const ¶ms, + const ParallelTensor input, + char const *name) + : ArgTopK(model, input, params.k, params.sorted, name) {} + +void ArgTopK::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); + IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ArgTopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + // 0 /*projection id*/, + // WRITE_ONLY, + // EXCLUSIVE, + // batch_outputs[1]->region)); + // launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, view); +} + +void ArgTopK::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ArgTopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // launcher.add_region_requirement(RegionRequirement(outputs[1]->part, + // 0 /*projection id*/, + // WRITE_ONLY, + // EXCLUSIVE, + // outputs[1]->region)); + // launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *ArgTopK::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ArgTopK *topk = (ArgTopK *)task->args; + FFHandler handle = *((FFHandler *)task->local_args); + ArgTopKMeta *m = new ArgTopKMeta(handle); + m->profiling = topk->profiling; + m->sorted = topk->sorted; + return m; +} + +void ArgTopK::forward(FFModel const &ff) { + // ArgTopK does not support forward + assert(false); +} + +FutureMap ArgTopK::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + // 0 /*projection id*/, + // WRITE_ONLY, + // EXCLUSIVE, + // batch_outputs[1]->region)); + // launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +InferenceResult + ArgTopK::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // const ArgTopK* topk = (const ArgTopK*) task->args; + ArgTopKMeta const *m = *((ArgTopKMeta **)task->local_args); + Domain in1_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + // Domain out1_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + Domain out2_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + int numdims = in1_domain.get_dim(); + assert(out2_domain.get_dim() == numdims); + + int in_cols = in1_domain.hi()[0] - in1_domain.lo()[0] + 1; + // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1; + int out2_cols = out2_domain.hi()[0] - out2_domain.lo()[0] + 1; + + // assert(out1_domain == out2_domain); + for (int i = 1; i < in1_domain.get_dim(); i++) { + assert(in1_domain.lo()[i] == out2_domain.lo()[i]); + assert(in1_domain.hi()[i] == out2_domain.hi()[i]); + } + float const *in_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + // float *value_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + int *index_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int length = in1_domain.hi()[0] - in1_domain.lo()[0] + 1; + int k = + out2_domain.hi()[0] - out2_domain.lo()[0] + 1; /*TODO: This prints to 5*/ + size_t batch_size = in1_domain.get_volume() / length; + assert(out2_domain.get_volume() / k == batch_size); + + ArgTopK::forward_kernel_wrapper( + m, in_ptr, index_ptr, batch_size, length, k, m->sorted); + + InferenceResult ir; + download_tensor(index_ptr, ir.results, batch_size); + return ir; +} + +void ArgTopK::backward(FFModel const &ff) { + // ArgTopK does not support backward + assert(false); +} + +void ArgTopK::serialize(Legion::Serializer &sez) const { + sez.serialize(this->k); + sez.serialize(this->sorted); +} + +Node ArgTopK::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 1); + int k; + bool sorted; + dez.deserialize(k); + dez.deserialize(sorted); + ArgTopKParams params; + params.k = k; + params.sorted = sorted; + return ff.get_or_create_node(inputs[0], params); +} + +Op *ArgTopK::materialize(FFModel &ff, + 
ParallelTensor inputs[], + int num_inputs) const { + ArgTopKParams params = get_params(); + return new ArgTopK(ff, params, inputs[0], this->name); +} + +bool ArgTopK::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::ArgTopKParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.k); + hash_combine(key, params.sorted); + return key; +} +}; // namespace std diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp new file mode 100644 index 0000000000..cc43967894 --- /dev/null +++ b/src/ops/arg_topk.cpp @@ -0,0 +1,450 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/arg_topk.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +enum class HeapType { kMinHeap, kMaxHeap }; +enum class PreferIndices { kLower, kHigher }; + +template +struct Entry { + int index; + T value; +}; + +template +struct LinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return data[i].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; +}; + +template +struct IndirectLinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return backing_data[data[i].index].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; + Entry *const backing_data; +}; + +template +struct StridedData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index * blockDim.x + threadIdx.x]; + } + + __device__ int get_index(int i) const { + return (*this)[i].index; + } + __device__ T get_value(int i) const { + return (*this)[i].value; + } + + Entry *const data; +}; + +// A heap of Entry that can either work as a min-heap or as a max-heap. 
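// A minimal CPU reference (not taken from this patch) for what arg_top_k produces per row:
// the indices of the k largest entries, with ties resolved toward the lower index, as the
// kernel comments below state ("We prefer elements with lower indices"). The operator in
// this patch materializes only the indices (DT_INT32); the value output stays commented
// out. Assumes 0 < k <= row size; this is a readable baseline, not the GPU algorithm.
#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

std::vector<int> arg_top_k_row(std::vector<float> const &row, int k) {
  assert(k > 0 && static_cast<size_t>(k) <= row.size());
  std::vector<int> idx(row.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(), [&](int a, int b) {
    // Larger value first; on ties prefer the smaller index.
    return row[a] > row[b] || (row[a] == row[b] && a < b);
  });
  idx.resize(k);
  return idx;  // sorted by descending value, like the kernel with sorted == true
}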
+template + class Data, + typename T> +struct IndexedHeap { + typedef typename Data::Entry Entry; + Data const data; + __device__ IndexedHeap(Data const &d) : data(d) {} + + __device__ bool is_above(int left, int right) { + T left_value = data.get_value(left); + T right_value = data.get_value(right); + if (left_value == right_value) { + if (preferIndices == PreferIndices::kLower) { + return data.get_index(left) < data.get_index(right); + } else { + return data.get_index(left) > data.get_index(right); + } + } + if (heapType == HeapType::kMinHeap) { + return left_value < right_value; + } else { + return left_value > right_value; + } + } + + __device__ void assign(int i, Entry const &entry) { + data[i] = entry; + } + + __device__ void push_up(int i) { + int child = i; + int parent; + for (; child > 0; child = parent) { + parent = (child - 1) / 2; + if (!is_above(child, parent)) { + // Heap property satisfied. + break; + } + swap(child, parent); + } + } + + __device__ void swap(int a, int b) { + auto tmp = data[b]; + data[b] = data[a]; + data[a] = tmp; + } + + __device__ void push_root_down(int k) { + push_down(0, k); + } + + // MAX-HEAPIFY in Cormen + __device__ void push_down(int node, int k) { + while (true) { + int const left = 2 * node + 1; + int const right = left + 1; + int smallest = node; + if (left < k && is_above(left, smallest)) { + smallest = left; + } + if (right < k && is_above(right, smallest)) { + smallest = right; + } + if (smallest == node) { + break; + } + swap(smallest, node); + node = smallest; + } + } + + // BUILD-MAX-HEAPIFY in Cormen + __device__ void build(int k) { + for (int node = (k - 1) / 2; node >= 0; node--) { + push_down(node, k); + } + } + + // HEAP-EXTRACT-MAX in Cormen + __device__ void remove_root(int k) { + data[0] = data[k - 1]; + push_root_down(k - 1); + } + + // in-place HEAPSORT in Cormen + // This method destroys the heap property. + __device__ void sort(int k) { + for (int slot = k - 1; slot > 0; slot--) { + // This is like remove_root but we insert the element at the end. + swap(slot, 0); + // Heap is now an element smaller. + push_root_down(/*k=*/slot); + } + } + + __device__ void replace_root(Entry const &entry, int k) { + data[0] = entry; + push_root_down(k); + } + + __device__ Entry const &root() { + return data[0]; + } +}; + +template + class Data, + typename T> +__device__ IndexedHeap + make_indexed_heap(typename Data::Entry *data) { + return IndexedHeap{Data{data}}; +} + +// heapArgTopK walks over [input, input+length) with `step_size` stride starting +// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries` +// using `Accessor` to access elements in `heap_entries`. If sorted=true, the +// elements will be sorted at the end. +template class Data = LinearData> +__device__ void heapArgTopK(T const *__restrict__ input, + int length, + int k, + Entry *__restrict__ heap_entries, + bool sorted = false, + int start_index = 0, + int step_size = 1) { + assert(k <= length); + + auto heap = + make_indexed_heap( + heap_entries); + + int heap_end_index = start_index + k * step_size; + if (heap_end_index > length) { + heap_end_index = length; + } + // Initialize the min-heap. + for (int index = start_index, slot = 0; index < heap_end_index; + index += step_size, slot++) { + heap.assign(slot, {index, input[index]}); + } + + heap.build(k); + + // Now iterate over the remaining items. + // If an item is smaller than the min element, it is not amongst the top k. + // Otherwise, replace the min element with it and push upwards. 
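// A worked illustration (not taken from this patch) of the strided scan above: heapArgTopK
// is called with start_index = threadIdx.x and step_size = blockDim.x, so for a row of
// length 10 scanned by 4 threads, thread 0 visits elements 0, 4, 8, thread 1 visits
// 1, 5, 9, and so on. Each thread keeps its own k-entry min-heap over the elements it
// visits (the per-thread "shard" that mergeShards later combines), and the loop below
// replaces the heap root only for strictly larger values, which keeps the earlier
// (lower-index) element on ties.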
+ for (int index = heap_end_index; index < length; index += step_size) { + // We prefer elements with lower indices. This is given here. + // Later elements automatically have higher indices, so can be discarded. + if (input[index] > heap.root().value) { + // This element should replace the min. + heap.replace_root({index, input[index]}, k); + } + } + + // Sort if wanted. + if (sorted) { + heap.sort(k); + } +} + +// mergeShards performs a top-k merge on `num_shards` many sorted streams that +// are sorted and stored in `entries` in a strided way: +// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|... +// The overall top k elements are written to `top_k_values` and their indices +// to top_k_indices. +// `top_k_heap` is used as temporary storage for the merge heap. +template +__device__ void mergeShards(int num_shards, + int k, + Entry *__restrict__ entries, + Entry *__restrict__ top_k_heap, + // T *top_k_values, + int *top_k_indices) { + // If k < num_shards, we can use a min-heap with k elements to get the top k + // of the sorted blocks. + // If k > num_shards, we can initialize a min-heap with the top element from + // each sorted block. + int const heap_size = k < num_shards ? k : num_shards; + + // Min-heap part. + { + auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Initialize the heap as a min-heap. + for (int slot = 0; slot < heap_size; slot++) { + min_heap.assign(slot, {slot, entries[slot].value}); + } + min_heap.build(heap_size); + + // Now perform top k with the remaining shards (if num_shards > heap_size). + for (int shard = heap_size; shard < num_shards; shard++) { + auto const entry = entries[shard]; + auto const root = min_heap.root(); + if (entry.value < root.value) { + continue; + } + if (entry.value == root.value && + entry.index > entries[root.index].index) { + continue; + } + // This element should replace the min. + min_heap.replace_root({shard, entry.value}, heap_size); + } + } + + // Max-part. + { + // Turn the min-heap into a max-heap in-place. + auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Heapify into a max heap. + max_heap.build(heap_size); + + // Now extract the minimum k-1 times. + // k is treated specially. + int const last_k = k - 1; + for (int rank = 0; rank < last_k; rank++) { + Entry const &max_element = max_heap.root(); + // top_k_values[rank] = max_element.value; + int shard_index = max_element.index; + top_k_indices[rank] = entries[shard_index].index; + int next_shard_index = shard_index + num_shards; + // For rank < k-1, each top k heap still contains at least 1 element, + // so we can draw a replacement. + max_heap.replace_root({next_shard_index, entries[next_shard_index].value}, + heap_size); + } + + // rank == last_k. 
+ Entry const &max_element = max_heap.root(); + // top_k_values[last_k] = max_element.value; + int shard_index = max_element.index; + top_k_indices[last_k] = entries[shard_index].index; + } +} + +template +__global__ void arg_topk_forward_kernel(T const *__restrict__ input, + size_t shared_memory_size, + int length, + int k, + bool sorted, + // T *__restrict__ output, + int *__restrict__ indices) { + __shared__ char shared_memory[48 << 10]; + int const batch_index = blockIdx.x; + T const *batch_input = input + batch_index * length; + int const thread_index = threadIdx.x; + int const thread_count = blockDim.x; + Entry *shared_entries = (Entry *)shared_memory; + heapArgTopK( + batch_input, length, k, shared_entries, true, thread_index, thread_count); + __syncthreads(); + if (thread_index == 0) { + int const offset = batch_index * k; + // auto batch_output = output + offset; + auto batch_indices = indices + offset; + Entry *top_k_heap = shared_entries + thread_count * k; + mergeShards(thread_count, + k, + shared_entries, + top_k_heap, + // batch_output, + batch_indices); + } +} + +/*static*/ +void ArgTopK::forward_kernel(ArgTopKMeta const *m, + float const *input_ptr, + // float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted, + hipStream_t stream) { + // Adopted from TensorFlow's ArgTopK implementation + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h + int num_shards = 0; + { + constexpr auto shared_memory_size = 48 << 10; + auto const heap_size = k * sizeof(Entry); + // shared_memory_size = (num_shards + 1) * heap_size <=> + num_shards = shared_memory_size / heap_size - 1; + assert(num_shards > 0); + if (num_shards > CUDA_NUM_THREADS) { + num_shards = CUDA_NUM_THREADS; + } + } + // We are limited by the amount of shared memory we have per block. + size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry); + // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; + size_t num_blocks = batch_size; + assert(num_shards >= (size_t)k); + num_shards = k; + hipLaunchKernelGGL(arg_topk_forward_kernel, + num_blocks, + num_shards, + 0, + stream, + input_ptr, + shared_memory_size, + length, + k, + sorted, + // output_ptr, + indices_ptr); +} + +/*static*/ +void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, + float const *input_ptr, + // float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + + ArgTopK::forward_kernel(m, + input_ptr, + // output_ptr, + indices_ptr, + batch_size, + length, + k, + sorted, + stream); + + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + } +} + +ArgTopKMeta::ArgTopKMeta(FFHandler handler) : OpMeta(handler) {} + +}; // namespace FlexFlow diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu new file mode 100644 index 0000000000..82fc113d4f --- /dev/null +++ b/src/ops/arg_topk.cu @@ -0,0 +1,446 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/arg_topk.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +enum class HeapType { kMinHeap, kMaxHeap }; +enum class PreferIndices { kLower, kHigher }; + +template +struct Entry { + int index; + T value; +}; + +template +struct LinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return data[i].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; +}; + +template +struct IndirectLinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return backing_data[data[i].index].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; + Entry *const backing_data; +}; + +template +struct StridedData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index * blockDim.x + threadIdx.x]; + } + + __device__ int get_index(int i) const { + return (*this)[i].index; + } + __device__ T get_value(int i) const { + return (*this)[i].value; + } + + Entry *const data; +}; + +// A heap of Entry that can either work as a min-heap or as a max-heap. +template + class Data, + typename T> +struct IndexedHeap { + typedef typename Data::Entry Entry; + Data const data; + __device__ IndexedHeap(Data const &d) : data(d) {} + + __device__ bool is_above(int left, int right) { + T left_value = data.get_value(left); + T right_value = data.get_value(right); + if (left_value == right_value) { + if (preferIndices == PreferIndices::kLower) { + return data.get_index(left) < data.get_index(right); + } else { + return data.get_index(left) > data.get_index(right); + } + } + if (heapType == HeapType::kMinHeap) { + return left_value < right_value; + } else { + return left_value > right_value; + } + } + + __device__ void assign(int i, Entry const &entry) { + data[i] = entry; + } + + __device__ void push_up(int i) { + int child = i; + int parent; + for (; child > 0; child = parent) { + parent = (child - 1) / 2; + if (!is_above(child, parent)) { + // Heap property satisfied. 
+ break; + } + swap(child, parent); + } + } + + __device__ void swap(int a, int b) { + auto tmp = data[b]; + data[b] = data[a]; + data[a] = tmp; + } + + __device__ void push_root_down(int k) { + push_down(0, k); + } + + // MAX-HEAPIFY in Cormen + __device__ void push_down(int node, int k) { + while (true) { + int const left = 2 * node + 1; + int const right = left + 1; + int smallest = node; + if (left < k && is_above(left, smallest)) { + smallest = left; + } + if (right < k && is_above(right, smallest)) { + smallest = right; + } + if (smallest == node) { + break; + } + swap(smallest, node); + node = smallest; + } + } + + // BUILD-MAX-HEAPIFY in Cormen + __device__ void build(int k) { + for (int node = (k - 1) / 2; node >= 0; node--) { + push_down(node, k); + } + } + + // HEAP-EXTRACT-MAX in Cormen + __device__ void remove_root(int k) { + data[0] = data[k - 1]; + push_root_down(k - 1); + } + + // in-place HEAPSORT in Cormen + // This method destroys the heap property. + __device__ void sort(int k) { + for (int slot = k - 1; slot > 0; slot--) { + // This is like remove_root but we insert the element at the end. + swap(slot, 0); + // Heap is now an element smaller. + push_root_down(/*k=*/slot); + } + } + + __device__ void replace_root(Entry const &entry, int k) { + data[0] = entry; + push_root_down(k); + } + + __device__ Entry const &root() { + return data[0]; + } +}; + +template + class Data, + typename T> +__device__ IndexedHeap + make_indexed_heap(typename Data::Entry *data) { + return IndexedHeap{Data{data}}; +} + +// heapArgTopK walks over [input, input+length) with `step_size` stride starting +// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries` +// using `Accessor` to access elements in `heap_entries`. If sorted=true, the +// elements will be sorted at the end. +template class Data = LinearData> +__device__ void heapArgTopK(T const *__restrict__ input, + int length, + int k, + Entry *__restrict__ heap_entries, + bool sorted = false, + int start_index = 0, + int step_size = 1) { + assert(k <= length); + + auto heap = + make_indexed_heap( + heap_entries); + + int heap_end_index = start_index + k * step_size; + if (heap_end_index > length) { + heap_end_index = length; + } + // Initialize the min-heap. + for (int index = start_index, slot = 0; index < heap_end_index; + index += step_size, slot++) { + heap.assign(slot, {index, input[index]}); + } + + heap.build(k); + + // Now iterate over the remaining items. + // If an item is smaller than the min element, it is not amongst the top k. + // Otherwise, replace the min element with it and push upwards. + for (int index = heap_end_index; index < length; index += step_size) { + // We prefer elements with lower indices. This is given here. + // Later elements automatically have higher indices, so can be discarded. + if (input[index] > heap.root().value) { + // This element should replace the min. + heap.replace_root({index, input[index]}, k); + } + } + + // Sort if wanted. + if (sorted) { + heap.sort(k); + } +} + +// mergeShards performs a top-k merge on `num_shards` many sorted streams that +// are sorted and stored in `entries` in a strided way: +// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|... +// The overall top k elements are written to `top_k_values` and their indices +// to top_k_indices. +// `top_k_heap` is used as temporary storage for the merge heap. 
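// A minimal host-side analogue (not taken from this patch) of what mergeShards achieves:
// each shard (one per thread) has already produced a sorted list of k candidate
// (index, value) entries, and the merge keeps the overall best k indices. The device code
// below does this with a small heap in shared memory; the version here uses a plain sort
// purely for readability and assumes every shard holds exactly k entries.
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

std::vector<int> merge_shards_reference(
    std::vector<std::vector<std::pair<int, float>>> const &shards, int k) {
  std::vector<std::pair<int, float>> all;  // (original index, value) from every shard
  for (auto const &shard : shards) {
    all.insert(all.end(), shard.begin(), shard.end());
  }
  assert(static_cast<int>(all.size()) >= k);
  std::partial_sort(all.begin(), all.begin() + k, all.end(),
                    [](std::pair<int, float> const &a, std::pair<int, float> const &b) {
                      return a.second > b.second ||
                             (a.second == b.second && a.first < b.first);
                    });
  std::vector<int> top_k_indices(k);
  for (int i = 0; i < k; i++) {
    top_k_indices[i] = all[i].first;
  }
  return top_k_indices;
}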
+template +__device__ void mergeShards(int num_shards, + int k, + Entry *__restrict__ entries, + Entry *__restrict__ top_k_heap, + // T *top_k_values, + int *top_k_indices) { + // If k < num_shards, we can use a min-heap with k elements to get the top k + // of the sorted blocks. + // If k > num_shards, we can initialize a min-heap with the top element from + // each sorted block. + int const heap_size = k < num_shards ? k : num_shards; + + // Min-heap part. + { + auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Initialize the heap as a min-heap. + for (int slot = 0; slot < heap_size; slot++) { + min_heap.assign(slot, {slot, entries[slot].value}); + } + min_heap.build(heap_size); + + // Now perform top k with the remaining shards (if num_shards > heap_size). + for (int shard = heap_size; shard < num_shards; shard++) { + auto const entry = entries[shard]; + auto const root = min_heap.root(); + if (entry.value < root.value) { + continue; + } + if (entry.value == root.value && + entry.index > entries[root.index].index) { + continue; + } + // This element should replace the min. + min_heap.replace_root({shard, entry.value}, heap_size); + } + } + + // Max-part. + { + // Turn the min-heap into a max-heap in-place. + auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Heapify into a max heap. + max_heap.build(heap_size); + + // Now extract the minimum k-1 times. + // k is treated specially. + int const last_k = k - 1; + for (int rank = 0; rank < last_k; rank++) { + Entry const &max_element = max_heap.root(); + // top_k_values[rank] = max_element.value; + int shard_index = max_element.index; + top_k_indices[rank] = entries[shard_index].index; + int next_shard_index = shard_index + num_shards; + // For rank < k-1, each top k heap still contains at least 1 element, + // so we can draw a replacement. + max_heap.replace_root({next_shard_index, entries[next_shard_index].value}, + heap_size); + } + + // rank == last_k. 
+ Entry const &max_element = max_heap.root(); + // top_k_values[last_k] = max_element.value; + int shard_index = max_element.index; + top_k_indices[last_k] = entries[shard_index].index; + } +} + +template +__global__ void arg_topk_forward_kernel(T const *__restrict__ input, + size_t shared_memory_size, + int length, + int k, + bool sorted, + // T *__restrict__ output, + int *__restrict__ indices) { + __shared__ char shared_memory[48 << 10]; + int const batch_index = blockIdx.x; + T const *batch_input = input + batch_index * length; + int const thread_index = threadIdx.x; + int const thread_count = blockDim.x; + Entry *shared_entries = (Entry *)shared_memory; + heapArgTopK( + batch_input, length, k, shared_entries, true, thread_index, thread_count); + __syncthreads(); + if (thread_index == 0) { + int const offset = batch_index * k; + // auto batch_output = output + offset; + auto batch_indices = indices + offset; + Entry *top_k_heap = shared_entries + thread_count * k; + mergeShards(thread_count, + k, + shared_entries, + top_k_heap, + // batch_output, + batch_indices); + } +} + +/*static*/ +void ArgTopK::forward_kernel(ArgTopKMeta const *m, + float const *input_ptr, + // float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted, + cudaStream_t stream) { + // Adopted from TensorFlow's ArgTopK implementation + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h + int num_shards = 0; + { + constexpr auto shared_memory_size = 48 << 10; + auto const heap_size = k * sizeof(Entry); + // shared_memory_size = (num_shards + 1) * heap_size <=> + num_shards = shared_memory_size / heap_size - 1; + assert(num_shards > 0); + if (num_shards > CUDA_NUM_THREADS) { + num_shards = CUDA_NUM_THREADS; + } + } + // We are limited by the amount of shared memory we have per block. 
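// A worked example (not taken from this patch) of the sizing performed just above and
// below: the kernel's static __shared__ buffer is 48 KB (48 << 10 bytes) and each shard
// keeps a k-entry heap of Entry<float> (an int index plus a float value, 8 bytes), with
// one extra heap reserved for the merge step, so
//     num_shards = 48 * 1024 / (k * 8) - 1.
// For k = 1 this gives 6143, which is then clamped to CUDA_NUM_THREADS (FlexFlow's usual
// launch constant, assumed here rather than restated by the patch); the statements below
// additionally force num_shards = k, so greedy decoding with k = 1 ends up using a single
// shard per batch row.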
+ size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry); + // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; + size_t num_blocks = batch_size; + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + // output_ptr, + indices_ptr); +} + +/*static*/ +void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, + float const *input_ptr, + // float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + ArgTopK::forward_kernel(m, + input_ptr, + // output_ptr, + indices_ptr, + batch_size, + length, + k, + sorted, + stream); + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ArgTopK] forward time = %.2lfms\n", elapsed); + } +} + +ArgTopKMeta::ArgTopKMeta(FFHandler handler) : OpMeta(handler) {} + +}; // namespace FlexFlow diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 3b53213b91..8dca314c2e 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -369,6 +369,46 @@ void Embedding::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void Embedding::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, view); + + IndexLauncher launcher(EMBED_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Embedding)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // regions[2]: weight + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, view); +} + OpMeta *Embedding::init_task(Task const *task, std::vector const ®ions, Context ctx, @@ -419,6 +459,53 @@ void Embedding::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Embedding::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, view); + size_t machine_view_hash = view->hash(); + + IndexLauncher launcher(EMBED_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + // regions[0]: input + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // regions[1]: output + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(1, FID_DATA); + // regions[2]: weight + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + /* regions[0](I): input regions[1](O): output diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index c1c9ca8f40..093e7d6de3 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -30,12 +30,14 @@ BatchConfig::BatchConfig() { request_completed[i] = true; num_processing_tokens[i] = 0; max_sequence_length[i] = 0; + initial_length[i] = 0; } token2ids.num_samples = 0; for (int i = 0; i < MAX_NUM_TOKENS; i++) { token2ids.guids[i] = SIZE_MAX; token2ids.token_indexes[i].request_index = SIZE_MAX; token2ids.token_indexes[i].token_position = SIZE_MAX; + token2ids.token_indexes[i].initial_length = SIZE_MAX; } update_num_active_requests_tokens(); } @@ -80,16 +82,17 @@ int BatchConfig::update_results(InferenceResult const &ir) { } bool BatchConfig::register_new_request(size_t guid, - int initial_length, + int initial_len, int tokens_to_generate) { cached_results = false; - assert(initial_length > 0 && tokens_to_generate > 0); + assert(initial_len > 0 && tokens_to_generate > 0); for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (request_completed[i]) { - log_bc.print("[NewRequest] guid(%zu) length(%d)", guid, initial_length); + log_bc.print("[NewRequest] guid(%zu) length(%d)", guid, initial_len); token_start_idx[i] = 0; - token_last_available_idx[i] = initial_length - 1; - max_sequence_length[i] = initial_length + tokens_to_generate; + token_last_available_idx[i] = initial_len - 1; + max_sequence_length[i] = initial_len + tokens_to_generate; + initial_length[i] = initial_len; request_guid[i] = guid; num_processing_tokens[i] = 0; request_completed[i] = false; @@ -132,6 +135,7 @@ void BatchConfig::update_num_active_requests_tokens() { token2ids.token_indexes[num_tokens].token_position = token_start_idx[i] + j; token2ids.token_indexes[num_tokens].request_index = i; + token2ids.token_indexes[num_tokens].initial_length = initial_length[i]; num_tokens++; } } @@ -221,6 +225,11 @@ void BatchConfig::print() const { for (int i = 0; i < num_tokens; i++) { printf("%lu ", token2ids.token_indexes[i].token_position); } + + printf("token2ids.token_indexes[i].initial_length: "); + for (int i = 0; i < num_tokens; i++) { + printf("%lu ", token2ids.token_indexes[i].initial_length); + } printf("\n"); printf("---------------------------------------------------------------------" "---------\n"); diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 47b0ff74b4..edd5b18e0f 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -238,6 +238,17 @@ __host__ T 
*download_tensor(T const *ptr, size_t num_elements) { return host_ptr; } +template +__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { + // device synchronize to make sure the data are ready + // checkCUDA(cudaDeviceSynchronize()); + assert(dst != nullptr); + checkCUDA( + cudaMemcpy(dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + // checkCUDA(cudaDeviceSynchronize()); + return true; +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { int dims[MAX_TENSOR_DIM]; @@ -420,4 +431,15 @@ template __host__ double *download_tensor(double const *ptr, template __host__ int32_t *download_tensor(int32_t const *ptr, size_t num_elements); template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); \ No newline at end of file + size_t num_elements); +template __host__ bool + download_tensor(float const *ptr, float *dst, size_t num_elements); +template __host__ bool download_tensor(double const *ptr, + double *dst, + size_t num_elements); +template __host__ bool download_tensor(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ bool download_tensor(int64_t const *ptr, + int64_t *dst, + size_t num_elements); \ No newline at end of file diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index b762ad0dd5..7c9a68f3b5 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -113,6 +113,8 @@ std::string get_operator_type_name(OperatorType type) { return "Size"; case OP_TOPK: return "TopK"; + case OP_ARG_TOPK: + return "ArgTopK"; case OP_WHERE: return "Where"; case OP_CEIL: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 2ac815be6c..700da55eda 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -16,6 +16,7 @@ #include "flexflow/dominators.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/aggregate.h" +#include "flexflow/ops/arg_topk.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/cast.h" @@ -2664,6 +2665,10 @@ void FFModel::deserialize_graph_optimal_view( node = TopK::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_ARG_TOPK: { + node = ArgTopK::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_GROUP_BY: { node = Group_by::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 215b635291..d6355def9a 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -248,6 +248,31 @@ __host__ void checkCUDA(hipHostFree(host_ptr)); } +template +__host__ T *download_tensor(T const *ptr, size_t num_elements) { + // device synchronize to make sure the data are ready + // checkCUDA(hipDeviceSynchronize()); + T *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(T) * num_elements, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpy( + host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost)); + // checkCUDA(hipDeviceSynchronize()); + return host_ptr; +} + +template +__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { + // device synchronize to make sure the data are ready + // checkCUDA(hipDeviceSynchronize()); + assert(dst != nullptr); + checkCUDA( + hipMemcpy(dst, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost)); + // checkCUDA(hipDeviceSynchronize()); + return true; +} + miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, Domain domain) { @@ 
-382,3 +407,23 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); + +template __host__ float *download_tensor(float const *ptr, + size_t num_elements); +template __host__ double *download_tensor(double const *ptr, + size_t num_elements); +template __host__ int32_t *download_tensor(int32_t const *ptr, + size_t num_elements); +template __host__ int64_t *download_tensor(int64_t const *ptr, + size_t num_elements); +template __host__ bool + download_tensor(float const *ptr, float *dst, size_t num_elements); +template __host__ bool download_tensor(double const *ptr, + double *dst, + size_t num_elements); +template __host__ bool download_tensor(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ bool download_tensor(int64_t const *ptr, + int64_t *dst, + size_t num_elements); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 7fcf1ef61f..30fa45e251 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -24,6 +24,7 @@ #include "flexflow/mapper.h" #include "flexflow/ops/aggregate.h" #include "flexflow/ops/aggregate_spec.h" +#include "flexflow/ops/arg_topk.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -2862,6 +2863,11 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_ARG_TOPK: { + Op *op = ArgTopK::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_GROUP_BY: { Op *op = Group_by::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -4473,6 +4479,21 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "TopK Backward Task"); } + // ArgTopk task + { + TaskVariantRegistrar registrar(ARG_TOPK_INIT_TASK_ID, "ArgTopK Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "ArgTopK Init Task"); + } + { + TaskVariantRegistrar registrar(ARG_TOPK_INF_TASK_ID, "ArgTopK Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "ArgTopK Inference Task"); + } // Transpose task { TaskVariantRegistrar registrar(TRANSPOSE_INIT_TASK_ID, "Transpose Init"); From 81fa7a81688da03693aefcf1269bd20636fdb3da Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 2 Apr 2023 11:40:02 -0500 Subject: [PATCH 087/344] [Inference init] create multiple OpMeta, one for each batch (#668) * [Inference init] create multiple OpMeta, one for each batch * minor fix * linting --------- Co-authored-by: Gabriele Oliaro --- include/flexflow/operator.h | 8 +- src/ops/aggregate.cc | 6 +- src/ops/aggregate_spec.cc | 6 +- src/ops/arg_topk.cc | 6 +- src/ops/attention.cc | 6 +- src/ops/element_binary.cc | 6 +- src/ops/embedding.cc | 6 +- src/ops/experts.cc | 6 +- src/ops/group_by.cc | 5 +- src/ops/inc_multihead_self_attention.cc | 6 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/layer_norm.cc | 6 +- src/ops/linear.cc | 6 +- src/ops/noop.cc | 4 +- src/ops/softmax.cc | 6 +- src/ops/topk.cc | 6 +- src/runtime/inference_manager.cc | 113 +++++++++++++----------- src/runtime/model.cc | 17 ++-- 18 files changed, 116 insertions(+), 105 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 56d4176e10..122850ec05 100644 --- 
a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -259,19 +259,19 @@ class Op { void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); void set_argumentmap_for_init_inference(FFModel const &ff, Legion::ArgumentMap &argmap, - MachineView const *view); + ParallelTensor const output0); void set_argumentmap_for_forward(FFModel const &ff, Legion::ArgumentMap &argmap); void set_argumentmap_for_inference(FFModel const &ff, Legion::ArgumentMap &argmap, - MachineView const *view); + ParallelTensor const output0); void set_argumentmap_for_backward(FFModel const &ff, Legion::ArgumentMap &argmap); void set_opmeta_from_futuremap(FFModel const &ff, Legion::FutureMap const &fm); void set_opmeta_from_futuremap_inference(FFModel const &ff, Legion::FutureMap const &fm, - MachineView const *view); + ParallelTensor const output0); void solve_parallel_dim_mappings( std::vector const &inputs, std::vector const &weights, @@ -291,7 +291,7 @@ class Op { ParallelParameter weights[MAX_NUM_WEIGHTS]; bool trainableInputs[MAX_NUM_INPUTS]; OpMeta *meta[MAX_NUM_WORKERS]; - std::map inference_meta; + std::map inference_meta; int numInputs, numWeights, numOutputs; bool profiling; #ifdef FF_USE_NCCL diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 304331f485..bb3eaf8f52 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -193,7 +193,7 @@ void Aggregate::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(AGGREGATE_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Aggregate)), @@ -204,7 +204,7 @@ void Aggregate::init_inference(FFModel const &ff, machine_view_hash); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void Aggregate::init(FFModel const &ff) { @@ -294,7 +294,7 @@ FutureMap Aggregate::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "Aggregate op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index e076695a2f..5ec8ab6857 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -167,7 +167,7 @@ void AggregateSpec::init_inference( Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(AGG_SPEC_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(AggregateSpec)), @@ -178,7 +178,7 @@ void AggregateSpec::init_inference( machine_view_hash); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void AggregateSpec::init(FFModel const &ff) { @@ -269,7 +269,7 @@ FutureMap Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "AggregateSpec op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 35bb80a2b9..8cfc4c38d4 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -157,7 +157,7 @@ void ArgTopK::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(ArgTopK)), @@ -186,7 +186,7 @@ void ArgTopK::init_inference(FFModel const &ff, // launcher.add_field(2, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void ArgTopK::init(FFModel const &ff) { @@ -254,7 +254,7 @@ FutureMap ArgTopK::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 75923e8da2..7af6cb8697 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -384,7 +384,7 @@ void MultiHeadAttention::init_inference( Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(ATTENTION_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(MultiHeadAttention)), @@ -425,7 +425,7 @@ void MultiHeadAttention::init_inference( launcher.add_field(4, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void MultiHeadAttention::init(FFModel const &ff) { @@ -590,7 +590,7 @@ FutureMap MultiHeadAttention::inference( Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "MultiHeadAttention op machine_view: " << *(MachineView const *)mv diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 11d5ff6012..4e5d640c08 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -274,7 +274,7 @@ void ElementBinary::init_inference( Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(ELEMENTBINARY_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(ElementBinary)), @@ -325,7 +325,7 @@ void ElementBinary::init_inference( //} FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void ElementBinary::init(FFModel const &ff) { @@ -517,7 +517,7 @@ FutureMap Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "ElementBinary op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 8dca314c2e..06186d969f 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -380,7 +380,7 @@ void Embedding::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(EMBED_INIT_TASK_ID, parallel_is, @@ -406,7 +406,7 @@ void Embedding::init_inference(FFModel const &ff, launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } OpMeta *Embedding::init_task(Task const *task, @@ -470,7 +470,7 @@ FutureMap Embedding::inference(FFModel const &ff, parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(EMBED_FWD_TASK_ID, diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 2186f18370..983c682bd6 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -454,7 +454,7 @@ void Experts::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(EXPERTS_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Experts)), @@ -510,7 +510,7 @@ void Experts::init_inference(FFModel const &ff, } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void Experts::init(FFModel const &ff) { @@ -672,7 +672,7 @@ FutureMap Experts::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 53c78538e2..c805b5fb29 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -175,7 +175,7 @@ void Group_by::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(GROUP_BY_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Group_by)), @@ -211,7 +211,7 @@ void Group_by::init_inference(FFModel const &ff, } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void Group_by::init(FFModel const &ff) { @@ -319,6 +319,7 @@ FutureMap Group_by::inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); /* std::cout << "GroupBy op machine_view: " << *(MachineView const *)mv diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 1b91d3b6a0..a46ad1e6a6 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -360,7 +360,7 @@ void IncMultiHeadSelfAttention::init_inference( Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(IncMultiHeadSelfAttention)), @@ -389,7 +389,7 @@ void IncMultiHeadSelfAttention::init_inference( launcher.add_field(2, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void IncMultiHeadSelfAttention::init(FFModel const &ff) { @@ -484,7 +484,7 @@ FutureMap IncMultiHeadSelfAttention::inference( Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); int idx = 0; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index e802647db5..fda69bb3b9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -536,7 +536,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - // checkCUDNN(cudnnSetStream(handler.dnn, stream)); + checkCUDNN(cudnnSetStream(handler.dnn, stream)); qSize = attn->qSize; kSize = attn->kSize; diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 4f0703dcc6..de511812bc 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -205,7 +205,7 @@ void LayerNorm::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(LAYERNORM_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(LayerNorm)), @@ -228,7 +228,7 @@ void LayerNorm::init_inference(FFModel const &ff, launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void LayerNorm::init(FFModel const &ff) { @@ -325,7 +325,7 @@ FutureMap LayerNorm::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 381110a4d3..3b19bf6586 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -266,7 +266,7 @@ void Linear::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(LINEAR_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Linear)), @@ -304,7 +304,7 @@ void Linear::init_inference(FFModel const &ff, } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } /* @@ -430,7 +430,7 @@ FutureMap Linear::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/ops/noop.cc b/src/ops/noop.cc index d35d5d48b7..2b54bdf302 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -164,7 +164,7 @@ void NoOp::init_inference(FFModel const &ff, ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, TaskArgument(NULL, 0), @@ -175,7 +175,7 @@ void NoOp::init_inference(FFModel const &ff, machine_view_hash); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 9543e34a90..304fa7b418 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -126,7 +126,7 @@ void Softmax::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(SOFTMAX_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Softmax)), @@ -149,7 +149,7 @@ void Softmax::init_inference(FFModel const &ff, launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void Softmax::init(FFModel const &ff) { @@ -235,7 +235,7 @@ FutureMap Softmax::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 5cefe955b1..45fdb7a3db 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -147,7 +147,7 @@ void TopK::init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, view); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(TOPK_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(TopK)), @@ -176,7 +176,7 @@ void TopK::init_inference(FFModel const &ff, launcher.add_field(2, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, view); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void TopK::init(FFModel const &ff) { @@ -273,7 +273,7 @@ FutureMap TopK::inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; parallel_is = batch_outputs[0]->parallel_is; MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, view); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); /* std::cout << "TopK op machine_view: " << *(MachineView const *)mv << std::endl; */ diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index ba62357411..09bbbefbe0 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -68,51 +68,71 @@ void InferenceManager::compile_model_and_allocate_buffer(void) { tensor_buffer[pt_base] = list; } } + // Set machine_view for batch_tensors in the tensor_buffer + for (int batch_index = 0; batch_index < max_num_inflight_batches; + batch_index++) { + int expert_device_index = 0; + int device_index = batch_index % num_devices; + for (size_t o = 0; o < model->operators.size(); o++) { + Op *op = model->operators[o]; + if (op->op_type == OP_WEIGHT) { + continue; + } + MachineView *view; + if (op->op_type == OP_EXPERTS) { + view = get_machine_view(expert_device_index); + // view = &machine_views[expert_device_index]; + expert_device_index = (expert_device_index + 1) % num_devices; + } else { + // pick mv w startdeviceid = device_index + // view = &machine_views[device_index]; + view = get_machine_view(device_index); + } + for (int i = 0; i < op->numOutputs; i++) { + tensor_buffer[op->outputs[i]][batch_index]->machine_view = *view; + Domain part_domain = + runtime->get_index_space_domain(ctx, op->outputs[i]->parallel_is); + assert(view->get_domain() == part_domain); + } + } + } } void InferenceManager::init_operators_inference() { for (int batch_index = 0; batch_index < max_num_inflight_batches; batch_index++) { - for (int device_index = 0; device_index < num_devices; device_index++) { - // int fused_experts_index = 0; - for (size_t o = 0; o < model->operators.size(); o++) { - Op *op = model->operators[o]; - if (op->op_type == OP_WEIGHT) { - continue; - } - MachineView *view; - // if (op->op_type == OP_EXPERTS) { - // if (fused_experts_index != device_index) { - // fused_experts_index++; - // continue; - // } - // view = &machine_views[fused_experts_index]; - // fused_experts_index++; - // } else { - view = &machine_views[device_index]; - //} - std::vector inputs(op->numInputs); - std::vector outputs(op->numOutputs); - for (int i = 0; i < op->numInputs; i++) { - assert(op->inputs[i] != nullptr); - assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(tensor_buffer[op->inputs[i]].size() > batch_index); - inputs[i] = tensor_buffer[op->inputs[i]][batch_index]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - for (int i = 0; i < op->numOutputs; i++) { - assert(op->outputs[i] != nullptr); - 
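A minimal standalone sketch of the placement rule the hunks above introduce: for a given in-flight batch, non-Experts operators get the machine view whose start device is batch_index % num_devices, while Experts operators rotate over devices through a per-batch counter, and init_operators_inference then reads the view back from each operator's batch output. Everything below is illustrative only, with hypothetical names and plain ints standing in for MachineView; it is a sketch of the rule, not part of the patch.

#include <cassert>
#include <cstdio>

// Stand-in for get_machine_view(): returns only a start-device id.
static int place_operator(bool is_experts,
                          int batch_index,
                          int num_devices,
                          int &expert_device_index) {
  assert(num_devices > 0);
  if (is_experts) {
    int device = expert_device_index;
    expert_device_index = (expert_device_index + 1) % num_devices;
    return device;
  }
  return batch_index % num_devices;
}

int main() {
  int const num_devices = 2;
  for (int batch_index = 0; batch_index < 4; batch_index++) {
    int expert_device_index = 0; // reset per batch, as in the hunk above
    int dense_dev =
        place_operator(false, batch_index, num_devices, expert_device_index);
    int experts_dev =
        place_operator(true, batch_index, num_devices, expert_device_index);
    std::printf("batch %d -> dense ops on device %d, first Experts op on device %d\n",
                batch_index, dense_dev, experts_dev);
  }
  return 0;
}

Successive Experts operators within the same batch keep advancing the expert counter, so they spread across devices even though every other operator of that batch stays on a single device.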
assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(tensor_buffer[op->outputs[i]].size() > batch_index); - outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; - assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - if (op->is_parallel_op()) { - ((ParallelOp *)op) - ->create_input_partition_inference(*model, inputs, outputs); + int expert_device_index = 0; + int device_index = batch_index % num_devices; + for (size_t o = 0; o < model->operators.size(); o++) { + Op *op = model->operators[o]; + if (op->op_type == OP_WEIGHT) { + continue; + } + std::vector inputs(op->numInputs); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > batch_index); + inputs[i] = tensor_buffer[op->inputs[i]][batch_index]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + assert(op->numOutputs > 0); + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->outputs[i]].size() > batch_index); + outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; + if (i > 0) { + assert(outputs[0]->machine_view == outputs[i]->machine_view); } - op->init_inference(*model, inputs, outputs, view); + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + if (op->is_parallel_op()) { + ((ParallelOp *)op) + ->create_input_partition_inference(*model, inputs, outputs); } + op->init_inference(*model, inputs, outputs); } } } @@ -123,9 +143,9 @@ MachineView *InferenceManager::get_machine_view(int mv_id) { } FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { + // We currently assume that the index-th batch will be placed + // on the device_index-th device (except for the experts layers) int batch_index = index % max_num_inflight_batches; - int device_index = index % num_devices; - int expert_device_index = 0; FutureMap fm; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; @@ -133,17 +153,6 @@ FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { continue; } - MachineView *view; - if (op->op_type == OP_EXPERTS) { - view = get_machine_view(expert_device_index); - // view = &machine_views[expert_device_index]; - expert_device_index = (expert_device_index + 1) % num_devices; - } else { - // pick mv w startdeviceid = device_index - // view = &machine_views[device_index]; - view = get_machine_view(device_index); - } - std::vector inputs(op->numInputs); std::vector outputs(op->numOutputs); for (int i = 0; i < op->numInputs; i++) { @@ -164,7 +173,7 @@ FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } - fm = op->inference(*model, bc, inputs, outputs, view); + fm = op->inference(*model, bc, inputs, outputs); } return fm; }; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 30fa45e251..12a14d808d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -980,10 +980,11 @@ void Op::set_argumentmap_for_init(FFModel const &ff, ArgumentMap &argmap) { void Op::set_argumentmap_for_init_inference(FFModel const &ff, ArgumentMap &argmap, - MachineView const *view) { + ParallelTensor const output0) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; Domain domain = 
runtime->get_index_space_domain(ctx, this->parallel_is); + MachineView const view = output0->machine_view; switch (domain.get_dim()) { #ifdef FF_USE_NCCL #define DIMFUNC(DIM) \ @@ -991,10 +992,10 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, Rect rect = domain; \ int idx = 0; \ for (PointInRectIterator it(rect); it(); it++) { \ - FFHandler handle = ff.handlers[view->get_device_id(*it)]; \ + FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ if (ff.config.computationMode == COMP_MODE_TRAINING && \ op_type == OP_WEIGHT) { \ - ncclComm_t *nccl_comms = ff.find_nccl_comms(*view); \ + ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); \ @@ -1008,7 +1009,7 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, case DIM: { \ Rect rect = domain; \ for (PointInRectIterator it(rect); it(); it++) { \ - FFHandler handle = ff.handlers[view->get_device_id(*it)]; \ + FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); \ } \ break; \ @@ -1044,7 +1045,7 @@ void Op::set_opmeta_from_futuremap(FFModel const &ff, FutureMap const &fm) { void Op::set_opmeta_from_futuremap_inference(FFModel const &ff, FutureMap const &fm, - MachineView const *view) { + ParallelTensor const output) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; Domain domain = runtime->get_index_space_domain(ctx, parallel_is); @@ -1054,7 +1055,7 @@ void Op::set_opmeta_from_futuremap_inference(FFModel const &ff, Rect rect = domain; \ int idx = 0; \ for (PointInRectIterator it(rect); it(); it++) { \ - inference_meta[view->hash()][idx++] = fm.get_result(*it); \ + inference_meta[output][idx++] = fm.get_result(*it); \ } \ break; \ } @@ -1089,7 +1090,7 @@ void Op::set_argumentmap_for_forward(FFModel const &ff, ArgumentMap &argmap) { void Op::set_argumentmap_for_inference(FFModel const &ff, ArgumentMap &argmap, - MachineView const *view) { + ParallelTensor const output) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; Domain domain = runtime->get_index_space_domain(ctx, parallel_is); @@ -1099,7 +1100,7 @@ void Op::set_argumentmap_for_inference(FFModel const &ff, Rect rect = domain; \ int idx = 0; \ for (PointInRectIterator it(rect); it(); it++) { \ - OpMeta *mp = inference_meta[view->hash()][idx++]; \ + OpMeta *mp = inference_meta[output][idx++]; \ argmap.set_point(*it, TaskArgument(&mp, sizeof(OpMeta *))); \ } \ break; \ From 71072dde57aa2edaf1c39421780b5a6798928ec4 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Mon, 3 Apr 2023 11:05:52 -0400 Subject: [PATCH 088/344] [IncMHA] - Reload the output projection weight after setting it, fix the index (#671) * reload the weight if set_tensor, fix the index * add flag for just loading once --- .../ops/inc_multihead_self_attention.h | 1 + src/ops/inc_multihead_self_attention.cu | 22 ++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index cfdb415354..e21c741196 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -119,6 +119,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { size_t weights_params, weightSize, reserveSpaceSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; 
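The commit above ("reload the weight if set_tensor, fix the index; add flag for just loading once") introduces a has_load_weights guard so that the contiguous output-projection buffer is rebuilt on the first inference call and reused afterwards. The host-side fragment below is only a sketch of that build-once pattern under assumed names: the real repack happens in the build_w_out_tensor CUDA kernel, and a later commit in this series turns the flag into a heap-allocated bool *, while a plain member is enough for the sketch.

#include <cstddef>
#include <vector>

struct AttentionScratch {
  std::vector<float> w_out_contiguous; // repacked output-projection weights
  bool has_load_weights = false;       // flipped after the first repack
};

void ensure_w_out_loaded(AttentionScratch &m,
                         std::vector<float> const &weight,
                         int vProjSize,
                         int oProjSize,
                         int num_heads) {
  if (m.has_load_weights) {
    return; // later decoding steps skip the repack entirely
  }
  m.w_out_contiguous.assign(
      static_cast<std::size_t>(vProjSize) * oProjSize * num_heads, 0.0f);
  // A real implementation would gather each head's output-projection block
  // out of `weight` here; the exact layout is omitted in this sketch.
  (void)weight;
  m.has_load_weights = true;
}

Deferring the repack to the first inference call (rather than doing it only when the operator metadata is constructed) means weights written after initialization, e.g. via set_tensor when loading a checkpoint, are the ones that get repacked, which appears to be what the commit title refers to.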
int num_heads; + bool has_load_weights; #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index fda69bb3b9..61d2b06b93 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -32,8 +32,7 @@ __global__ void build_w_out_tensor(float const *weight_ptr, int row_idx = i % vProjSize; int col_idx = (i / vProjSize) % oProjSize; int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[col_idx * vProjSize * num_heads + - head_idx * vProjSize + row_idx] = + contiguous_weight_ptr[i] = weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + qkv_weight_block_size + col_idx * vProjSize + row_idx]; } @@ -495,6 +494,23 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // reload the weight_o + + if (!m->has_load_weights) { + int parallelism = m->vProjSize * m->oProjSize * m->num_heads; + build_w_out_tensor<<>>(weight_ptr, + m->W_out_contiguous, + m->vProjSize, + m->oProjSize, + m->num_heads, + (m->qSize * m->qProjSize + + m->kSize * m->kProjSize + + m->vSize * m->vProjSize)); + m->has_load_weights = true; + } // phase 1: Implement kernel to compute KQV for input tokens inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); @@ -553,7 +569,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)); weightSize = weights_params * num_heads * sizeof(float); - + has_load_weights = false; // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); From ea4c6f455bfaf15918ae5d314b1e44c56badeca0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 3 Apr 2023 11:24:43 -0400 Subject: [PATCH 089/344] fix --- include/flexflow/ops/inc_multihead_self_attention.h | 2 +- src/ops/inc_multihead_self_attention.cu | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index e21c741196..716a2563cd 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -119,7 +119,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { size_t weights_params, weightSize, reserveSpaceSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int num_heads; - bool has_load_weights; + bool *has_load_weights; #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 61d2b06b93..7c422a8a0e 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -496,7 +496,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( } // reload the weight_o - if (!m->has_load_weights) { + if (!(*m->has_load_weights)) { int parallelism = m->vProjSize * m->oProjSize * m->num_heads; build_w_out_tensor<<qSize * m->qProjSize + m->kSize * m->kProjSize + m->vSize * m->vProjSize)); - m->has_load_weights = true; + *m->has_load_weights = true; } // phase 1: Implement kernel to compute KQV for input tokens @@ -569,7 +569,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)); weightSize = weights_params * num_heads * sizeof(float); - has_load_weights = false; + has_load_weights = (bool *)calloc(1, sizeof(bool)); + *has_load_weights = false; // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); From 9efeb0199a7baf115e244f6375feb69751e61e9d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 5 Apr 2023 21:18:07 +0000 Subject: [PATCH 090/344] fix graph.cc issue with large number of layers --- CMakeLists.txt | 8 ++++++-- config/config.inc | 7 ++++++- config/config.linux | 3 +++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ab8bb471b8..d61798f4e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,7 @@ set(FF_MAX_DIM "4" CACHE STRING "Maximum dimention of tensors") # option for legion option(FF_USE_EXTERNAL_LEGION "Use pre-installed Legion" OFF) +set(LEGION_MAX_RETURN_SIZE "32768" CACHE STRING "Maximum Legion return size") set(FLEXFLOW_EXT_LIBRARIES "") set(FLEXFLOW_INCLUDE_DIRS "") @@ -206,9 +207,11 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") endif() message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}") +message(STATUS "LEGION_MAX_RETURN_SIZE: ${LEGION_MAX_RETURN_SIZE}") list(APPEND FF_CC_FLAGS - -DMAX_TENSOR_DIM=${FF_MAX_DIM}) + -DMAX_TENSOR_DIM=${FF_MAX_DIM} + -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}) if(FF_USE_AVX2) list(APPEND FF_CC_FLAGS @@ -218,7 +221,8 @@ endif() list(APPEND FF_NVCC_FLAGS -Wno-deprecated-gpu-targets - -DMAX_TENSOR_DIM=${FF_MAX_DIM}) + -DMAX_TENSOR_DIM=${FF_MAX_DIM} + -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}) list(APPEND FF_LD_FLAGS -lrt diff --git a/config/config.inc b/config/config.inc index c9eb554cb4..0384b87b20 100644 --- a/config/config.inc +++ b/config/config.inc @@ -159,6 +159,11 @@ if [ -n "$FF_MAX_DIM" ]; then SET_MAX_DIM="-DFF_MAX_DIM=${FF_MAX_DIM}" fi +#set LEGION_MAX_RETURN_SIZE +if [ -n "$LEGION_MAX_RETURN_SIZE" ]; then + SET_LEGION_MAX_RETURN_SIZE="-DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}" +fi + # set ROCM path if [ -n "$ROCM_PATH" ]; then SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}" @@ -202,7 +207,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_GASNET} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_GASNET} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 614a671e44..3c029b85ee 100755 --- a/config/config.linux +++ b/config/config.linux @@ -68,6 +68,9 @@ FF_USE_AVX2=${FF_USE_AVX2:-OFF} # set MAX_DIM FF_MAX_DIM=${FF_MAX_DIM:-5} +# set LEGION_MAX_RETURN_SIZE +LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-65536} + # set ROCM path 
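The LEGION_MAX_RETURN_SIZE knob added above flows from config.linux (default 65536) through config.inc into a -DLEGION_MAX_RETURN_SIZE definition on both the C++ and NVCC flag lists; the commit title ties it to graph.cc failing with a large number of layers, which is consistent with a serialized graph returned from a Legion task outgrowing the previous cap. The fragment below is only an illustrative sketch of how such a compile-time cap might be checked: the fallback value and the helper are assumptions, and only the macro name comes from the patch.

#include <cassert>
#include <cstddef>

#ifndef LEGION_MAX_RETURN_SIZE
#define LEGION_MAX_RETURN_SIZE 65536 // normally injected by the build via -D
#endif

// Hypothetical guard: a payload returned by value from a task has to fit
// under the compile-time cap, otherwise rebuild with a larger value.
inline void check_return_payload(std::size_t serialized_bytes) {
  assert(serialized_bytes <= static_cast<std::size_t>(LEGION_MAX_RETURN_SIZE));
}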
ROCM_PATH=${ROCM_PATH:-"/opt/rocm"} From 0f64663bec21317c5eb7259630dac6c4d3a3c088 Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Thu, 6 Apr 2023 16:25:39 -0400 Subject: [PATCH 091/344] [Missing Operators] Add inference rms norm op (#672) * init * fix * fix * fix * Add norm helper and update RMSNormMeta. * Finish basic logic of rms norm. * fix dimensions for rms input and weights. * Format code. * Fix rms logic. * Minor fixation. * Remove deprecated Legion names. * Logic fixation. --------- Co-authored-by: xinhaoc --- include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 8 + include/flexflow/operator_params.h | 2 + .../flexflow/ops/kernels/rms_norm_kernels.h | 49 +++ include/flexflow/ops/rms_norm.h | 74 ++++ include/flexflow/ops/rms_norm_params.h | 26 ++ src/ops/kernels/rms_norm_kernels.cu | 167 ++++++++ src/ops/rms_norm.cc | 377 ++++++++++++++++++ src/runtime/graph.cc | 5 + src/runtime/operator_params.cc | 3 + 10 files changed, 712 insertions(+) create mode 100644 include/flexflow/ops/kernels/rms_norm_kernels.h create mode 100644 include/flexflow/ops/rms_norm.h create mode 100644 include/flexflow/ops/rms_norm_params.h create mode 100644 src/ops/kernels/rms_norm_kernels.cu create mode 100644 src/ops/rms_norm.cc diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index c698191811..d244fb3ac6 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -147,6 +147,7 @@ enum OperatorType { OP_LAYERNORM, OP_EXPERTS, OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html + OP_RMS_NORM, OP_INC_MULTIHEAD_SELF_ATTENTION, // Parallel Ops OP_REPARTITION, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index da26d54af2..6873ce5e43 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -140,6 +140,8 @@ enum TaskIDs { ATTENTION_INIT_TASK_ID, ATTENTION_FWD_TASK_ID, ATTENTION_BWD_TASK_ID, + RMSNROM_INIT_TASK_ID, + RMSNROM_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, @@ -288,6 +290,7 @@ class Split; class TopK; class ArgTopK; class Transpose; +class RMSNorm; class Combine; class Repartition; class Reduction; @@ -499,6 +502,9 @@ class FFModel { int a_seq_length_dim = -1, int b_seq_length_dim = -1, char const *name = nullptr); + // Add a root mean square layer + Tensor + rms_norm(const Tensor input, float eps, int dim, char const *name = NULL); // Add a dense layer Tensor dense(const Tensor input, int outDim, @@ -965,6 +971,8 @@ class FFModel { ArgTopK *>, std::unordered_map, Transpose *>, + std::unordered_map, + RMSNorm *>, std::unordered_map, Repartition *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 340cc38659..f949fe3e4c 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -23,6 +23,7 @@ #include "flexflow/ops/pool_2d_params.h" #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" +#include "flexflow/ops/rms_norm_params.h" #include "flexflow/ops/softmax_params.h" #include "flexflow/ops/split_params.h" #include "flexflow/ops/topk_params.h" @@ -55,6 +56,7 @@ using OperatorParameters = mp::variant const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void 
print_layer(FFModel const &model) { + assert(0); + } + + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + RMSNormParams get_params() const; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const; + +public: + float eps; + char op_name[MAX_OPNAME]; + int effective_batch_size; + int data_dim; +}; +} // namespace FlexFlow +#endif // _FLEXFLOW_RMS_NORM_H \ No newline at end of file diff --git a/include/flexflow/ops/rms_norm_params.h b/include/flexflow/ops/rms_norm_params.h new file mode 100644 index 0000000000..c5d71f71ce --- /dev/null +++ b/include/flexflow/ops/rms_norm_params.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_RMSNORM_PARAMS_H +#define _FLEXFLOW_RMSNORM_PARAMS_H + +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct RMSNormParams { + LayerID layer_guid; + float eps; + + bool is_valid(ParallelTensorShape const &) const; +}; + +bool operator==(RMSNormParams const &, RMSNormParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::RMSNormParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_RMSNORM_PARAMS_H \ No newline at end of file diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu new file mode 100644 index 0000000000..5fa13d064a --- /dev/null +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -0,0 +1,167 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/kernels/rms_norm_kernels.h" +#include "flexflow/ops/rms_norm.h" +#include "flexflow/utils/cuda_helper.h" +#include + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +#define C10_WARP_SIZE 32 +constexpr int kCUDABlockReduceNumThreads = 512; +constexpr int kCUDANumThreads = 256; + +RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms) + : OpMeta(handler, rms) { + eps = rms->eps; + alpha = 1.0f; + beta = 0.0f; + + in_dim = rms->data_dim; + batch_size = rms->effective_batch_size; + num_elements = in_dim * batch_size; + + checkCUDA(cudaMalloc(&rms_ptr, batch_size * sizeof(float))); + checkCUDA(cudaMalloc(&norm_ptr, num_elements * sizeof(float))); +} + +namespace Kernels { +namespace RMSNorm { + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, + unsigned int delta, + int width = warpSize, + unsigned int mask = 0xffffffff) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < blockDim.x / C10_WARP_SIZE) ? shared[lid] : 0; + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +template +__global__ void + RowwiseRootMeanSquareKernel(int64_t N, T eps, T const *X, T *rms) { + __shared__ T v_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T sum = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + sum += static_cast(X[index]) * static_cast(X[index]); + } + sum = BlockReduceSum(sum, v_shared); // use BlockReduceSum() to sum X_ij^2 + if (threadIdx.x == 0) { + rms[i] = sqrt((static_cast(N) / sum) + static_cast(eps)); + } +} + +template +__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { + using T_ACC = T; + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + Y[index] = static_cast(X[index]) * static_cast(rstd[i]); + } +} + +void forward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + RowwiseRootMeanSquareKernel + <<batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( + m->in_dim, m->eps, input.get_float_ptr(), m->rms_ptr); + NormKernel<<batch_size, kCUDANumThreads, 0, stream>>>( + m->in_dim, input.get_float_ptr(), m->rms_ptr, m->norm_ptr); + + checkCUDA(cublasGemmEx( + m->handle.blas, + CUBLAS_OP_T, // transpose weight (column major) + CUBLAS_OP_N, + m->in_dim, + m->batch_size, + m->in_dim, + &(m->alpha), + weight.get_float_ptr(), // weight, shape (in_dim, in_dim) + CUDA_R_32F, + m->in_dim, + m->norm_ptr, // norm, shape (in_dim, batch_size) + CUDA_R_32F, + m->in_dim, + &(m->beta), + output + .get_float_ptr(), // output, shape (in_dim, batch_size), same as 
norm + CUDA_R_32F, + m->in_dim, + CUDA_R_32F, + CUBLAS_GEMM_DFALT_TENSOR_OP)); + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + print_tensor(input.get_float_ptr(), 32, "[RMSNorm:forward:input]"); + print_tensor(output.get_float_ptr(), 32, "[RMSNorm:forward:output]"); + } +} + +} // namespace RMSNorm +} // namespace Kernels +} // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc new file mode 100644 index 0000000000..3d8daa4389 --- /dev/null +++ b/src/ops/rms_norm.cc @@ -0,0 +1,377 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/rms_norm.h" +#include "flexflow/model.h" +#include "flexflow/ops/kernels/rms_norm_kernels.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::RMSNorm; + +bool operator==(RMSNormParams const &lhs, RMSNormParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps; +} + +bool RMSNormParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +RMSNormParams RMSNorm::get_params() const { + RMSNormParams params; + params.layer_guid = this->layer_guid; + params.eps = this->eps; + return params; +} + +Tensor FFModel::rms_norm(const Tensor input, + float eps, + int dim, + char const *name) { + Layer *rm = new Layer(this, + OP_RMS_NORM, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + input); + rm->outputs[0] = create_tensor_legion_ordering( + input->num_dims, input->dims, DT_FLOAT, rm, 0, true /*create_grad*/); + + // weights + int weight_dims[1] = {input->dims[input->num_dims - 1]}; + rm->weights[0] = create_weight_legion_ordering(1, + weight_dims, + DT_FLOAT, + rm, + true /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + rm->add_float_property("eps", eps); + layers.push_back(rm); + return rm->outputs[0]; +} + +Op *RMSNorm::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + float eps; + layer->get_float_property("eps", eps); + return new RMSNorm(model, layer->layer_guid, inputs[0], eps, layer->name); +} + +RMSNorm::RMSNorm(FFModel &model, + RMSNormParams const ¶ms, + ParallelTensor const input, + char const *name) + : RMSNorm(model, params.layer_guid, input, params.eps, 
name) {} + +RMSNorm::RMSNorm(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + float _eps, + char const *name) + : Op(model, + OP_RMS_NORM, + _input->data_type, + name, + 1 /*num of inputs tensor */, + 1 /*num of weights tensor */, + 1 /*onum of utputs tensor */, + _input) { + + inputs[0] = _input; + + int num_dims = _input->num_dims; + data_dim = _input->dims[0].size; + effective_batch_size = 1; + for (int i = 1; i <= num_dims - 2; i++) { + effective_batch_size *= _input->dims[i].size; + } + + // output has the same parallel dims as input + ParallelDim output_dims[MAX_TENSOR_DIM]; + ParallelDim weight_dims[MAX_TENSOR_DIM]; + for (int i = 0; i < _input->num_dims; i++) { + output_dims[i] = _input->dims[i]; + weight_dims[i] = _input->dims[i]; + weight_dims[i].size = 1; + } + + // weights should have the shape of (data_dim, data_dim) + weight_dims[0].size = _input->dims[0].size; + weight_dims[1].size = _input->dims[0].size; + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, output_dims, _input->data_type, this); + + // weights + Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); + + // TODO: weight dims check + weights[0] = + model.create_parallel_weight_legion_ordering(_input->num_dims, + weight_dims, + _input->data_type, + this /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); +} + +void RMSNorm::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(RMSNROM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(RMSNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void RMSNorm::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + + IndexLauncher launcher(RMSNROM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(RMSNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +OpMeta *RMSNorm::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RMSNorm *rn = (RMSNorm *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + RMSNormMeta *meta = new RMSNormMeta(handle, rn); + return meta; +} + +void RMSNorm::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(RMSNROM_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +FutureMap RMSNorm::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + + IndexLauncher launcher(RMSNROM_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[1](O): output + regions[2](I/O): weight +*/ +void RMSNorm::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 3); + assert(regions.size() == 3); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + forward_kernel_wrapper(m, input, weight, output); +} + +void RMSNorm::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->eps); +} + +using PCG::Node; +/*static*/ +Node RMSNorm::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 1); + float eps; + size_t id; + dez.deserialize(id); + LayerID layer_guid(id); + dez.deserialize(eps); + + RMSNormParams params; + params.layer_guid = layer_guid; + params.eps = eps; + return ff.get_or_create_node(inputs[0], params); +} + +Op *RMSNorm::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + RMSNormParams params = get_params(); + return new RMSNorm(ff, params, inputs[0], this->name); +} + +void RMSNorm::backward(FFModel const &ff) {} + +bool RMSNorm::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +} // namespace FlexFlow +namespace std { +size_t hash::operator()( + FlexFlow::RMSNormParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.eps); + return key; +} +}; // namespace std \ No newline at end of file diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 700da55eda..432467bbcf 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -37,6 +37,7 @@ #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" +#include "flexflow/ops/rms_norm.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" @@ -2712,6 +2713,10 @@ void FFModel::deserialize_graph_optimal_view( node = Transpose::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_RMS_NORM: { + node = RMSNorm::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_COMBINE: { 
assert(num_inputs == 1); int combine_dim, combine_degree; diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 69f28ca680..201a6449c2 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -24,6 +24,7 @@ #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" #include "flexflow/ops/reverse.h" +#include "flexflow/ops/rms_norm.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" @@ -113,6 +114,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Aggregate *)op)->get_params(); case OP_AGG_SPEC: return ((AggregateSpec *)op)->get_params(); + case OP_RMS_NORM: + return ((RMSNorm *)op)->get_params(); // TODO: implement the get_params() function for the operators below and // uncomment the lines below From 26249657aeb1ba9441038a9e247109c8a028dc22 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 6 Apr 2023 17:11:04 -0400 Subject: [PATCH 092/344] [Inference][Experts] - Add comprehensive testing to fused Experts operator, and fix bugs (#678) * added checks, fixed bugs * linting * bug fix * finished bug fixing, commented out tests * cleanup --- examples/cpp/inference/inference_config.h | 4 +- src/ops/experts.cu | 570 +++++++++++++++++++++- 2 files changed, 566 insertions(+), 8 deletions(-) diff --git a/examples/cpp/inference/inference_config.h b/examples/cpp/inference/inference_config.h index 8301640e45..53811dd99f 100644 --- a/examples/cpp/inference/inference_config.h +++ b/examples/cpp/inference/inference_config.h @@ -31,7 +31,7 @@ struct InferenceConfig { batch_size = BATCH_SIZE; out_dim = DATA_DIM; num_labels = out_dim; - num_layers = 3; + num_layers = 12; vocab_size = 50257; block_size = 1024; @@ -49,7 +49,7 @@ struct InferenceConfig { // Encoder layer num_attention_heads = 16; attention_kdim = attention_vdim = hidden_size / num_attention_heads; - num_encoder_layers = 3; + num_encoder_layers = 12; } // Input/output data diff --git a/src/ops/experts.cu b/src/ops/experts.cu index c4c5620cff..1a81d9118c 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -443,7 +443,244 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, &gemm_batch_count, stream); - cudaStreamSynchronize(stream); + checkCUDA(cudaStreamSynchronize(stream)); + +#ifdef INFERENCE_TESTS + // Checking + // 1. 
check that m->sorted_indices contains indices sorted + int *indices_cpu = download_tensor(indices, num_indices); + // assert(indices_cpu != nullptr); + std::vector indices_vec(indices_cpu, indices_cpu + num_indices); + std::vector indices_vec_sorted(indices_vec.size()); + std::copy(indices_vec.begin(), indices_vec.end(), indices_vec_sorted.begin()); + std::stable_sort(indices_vec_sorted.begin(), indices_vec_sorted.end()); + + int *thrust_sorted_indices_cpu = download_tensor( + m->sorted_indices, m->num_chosen_experts * m->effective_batch_size); + // assert(thrust_sorted_indices_cpu != nullptr); + std::vector thrust_sorted_indices_vec( + thrust_sorted_indices_cpu, thrust_sorted_indices_cpu + num_indices); + for (int i = 0; i < num_indices; i++) { + if (indices_vec_sorted[i] != thrust_sorted_indices_vec[i]) { + printf("i=%i\n", i); + printf("indices: "); + std::copy(indices_vec.begin(), + indices_vec.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; + printf("indices_vec_sorted: "); + std::copy(indices_vec_sorted.begin(), + indices_vec_sorted.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; + printf("thrust_sorted_indices_vec: "); + std::copy(thrust_sorted_indices_vec.begin(), + thrust_sorted_indices_vec.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; + } + assert(indices_vec_sorted[i] == thrust_sorted_indices_vec[i]); + } + // 2. check that indices[m->original_indices[i]] = i + int *thrust_original_indices_cpu = download_tensor( + m->original_indices, m->num_chosen_experts * m->effective_batch_size); + // assert(thrust_original_indices_cpu != nullptr); + std::vector thrust_original_indices_vec( + thrust_original_indices_cpu, thrust_original_indices_cpu + num_indices); + for (int i = 0; i < num_indices; i++) { + assert(indices_vec[thrust_original_indices_vec[i]] == + thrust_sorted_indices_vec[i]); + } + + // 3. check that lb_index is the index of the first element greater or equal + // to expert_start_idx + // 4. check that ub_index is greater than last, or outside array + std::vector::iterator low, up; + low = std::lower_bound( + indices_vec_sorted.begin(), indices_vec_sorted.end(), experts_start_idx); + up = std::upper_bound(indices_vec_sorted.begin(), + indices_vec_sorted.end(), + experts_start_idx + num_experts_per_block - 1); + int lb_index_check = low - indices_vec_sorted.begin(), + ub_index_check = up - indices_vec_sorted.begin(); + + if (lb_index_check != lb_index || ub_index_check != ub_index) { + printf("experts_start_idx: %i, num_experts_per_block: %i, lb_index: %i, " + "lb_index_check: %i, ub_index: %i, ub_index_check: %i\n", + experts_start_idx, + num_experts_per_block, + lb_index, + lb_index_check, + ub_index, + ub_index_check); + printf("indices_vec_sorted: "); + std::copy(indices_vec_sorted.begin(), + indices_vec_sorted.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; + } + assert(lb_index_check == lb_index); + assert(ub_index_check == ub_index); + + // 5. compute num_valid_assignments manually, and check that is equal to value + // computed in thrust + int num_valid_assignments_manual = ub_index_check - lb_index_check; + assert(num_valid_assignments_manual == num_valid_assignments); + + // 6. 
check m->non_zero_expert_labels, *non_zero_experts_count + std::set non_zero_experts_check; + for (int i = 0; i < num_indices; i++) { + if (indices_vec_sorted[i] >= experts_start_idx && + indices_vec_sorted[i] < experts_start_idx + num_experts_per_block) { + non_zero_experts_check.insert(indices_vec_sorted[i]); + } + } + assert(non_zero_experts_count == non_zero_experts_check.size()); + // 7. check exp_local_label_to_index + int *non_zero_expert_labels_cpu = + download_tensor(m->non_zero_expert_labels, non_zero_experts_count); + // assert(non_zero_expert_labels_cpu != nullptr); + std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, + non_zero_expert_labels_cpu + + non_zero_experts_count); + assert(std::is_sorted(non_zero_expert_labels_vec.begin(), + non_zero_expert_labels_vec.end())); + std::vector non_zero_experts_check_vec; + for (auto el : non_zero_experts_check) { + non_zero_experts_check_vec.push_back(el - experts_start_idx); + } + assert(std::is_sorted(non_zero_experts_check_vec.begin(), + non_zero_experts_check_vec.end())); + assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); + + int *exp_local_label_to_index = + download_tensor(m->exp_local_label_to_index, non_zero_experts_count); + // assert(exp_local_label_to_index != nullptr); + std::vector exp_local_label_to_index_vec(exp_local_label_to_index, + exp_local_label_to_index + + non_zero_experts_count); + int z = 0; + for (int i = 0; i < non_zero_experts_count; i++) { + if (non_zero_experts_check.find(i) != non_zero_experts_check.end()) { + assert(exp_local_label_to_index_vec[i] == z); + z++; + } + } + + // 8. Check expert_start_indexes + int *expert_start_indices_thrust = + download_tensor(m->expert_start_indexes, non_zero_experts_count + 1); + // assert(expert_start_indices_thrust != nullptr); + std::vector expert_start_indices_thrust_vec( + expert_start_indices_thrust, + expert_start_indices_thrust + non_zero_experts_count + 1); + std::vector expert_start_indices_cpu; + std::set exp_label; + + std::vector num_assignments_per_expert_cpu; + + for (int i = lb_index; i < ub_index; i++) { + assert(indices_vec_sorted[i] >= experts_start_idx && + indices_vec_sorted[i] < experts_start_idx + num_experts_per_block); + if (exp_label.find(indices_vec_sorted[i]) == exp_label.end()) { + exp_label.insert(indices_vec_sorted[i]); + expert_start_indices_cpu.push_back(i - lb_index); + + num_assignments_per_expert_cpu.push_back(1); + } else { + num_assignments_per_expert_cpu[num_assignments_per_expert_cpu.size() - + 1] += 1; + } + } + expert_start_indices_cpu.push_back(ub_index - lb_index); + assert(num_assignments_per_expert_cpu.size() == non_zero_experts_count); + /* std::cout << "indices_vec_sorted: "; + for (int i=lb_index; i(m->num_assignments_per_expert, + num_assignments_per_expert_thrust, + non_zero_experts_count)); + assert(num_assignments_per_expert_thrust != nullptr); + std::vector num_assignments_per_expert_thrust_vec( + num_assignments_per_expert_thrust, + num_assignments_per_expert_thrust + non_zero_experts_count); + assert(num_assignments_per_expert_cpu == + num_assignments_per_expert_thrust_vec); + + int *destination_start_indices_thrust = + (int *)calloc(non_zero_experts_count, sizeof(int)); + assert(destination_start_indices_thrust != nullptr); + assert(download_tensor(m->destination_start_indices, + destination_start_indices_thrust, + non_zero_experts_count)); + assert(destination_start_indices_thrust != nullptr); + std::vector destination_start_indices_thrust_vec( + destination_start_indices_thrust, + 
destination_start_indices_thrust + non_zero_experts_count); + std::vector destination_start_indices_cpu; + int gemm_batch_count_cpu = 0; + for (int i = 0; i < num_assignments_per_expert_cpu.size(); i++) { + if (i == 0) { + destination_start_indices_cpu.push_back(0); + } else { + destination_start_indices_cpu.push_back( + std::min(expert_capacity, num_assignments_per_expert_cpu[i - 1])); + } + } + for (int i = 0; i < num_assignments_per_expert_cpu.size(); i++) { + gemm_batch_count_cpu += + std::min(expert_capacity, num_assignments_per_expert_cpu[i]); + } + for (int i = 1; i < destination_start_indices_cpu.size(); i++) { + destination_start_indices_cpu[i] += destination_start_indices_cpu[i - 1]; + } + /* + std::cout << "destination_start_indices_cpu: "; + for (int i=0; i= non_zero_experts_count); @@ -495,7 +732,324 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, m->coefficient_idx_array, m->output_idx_array); - cudaStreamSynchronize(stream); + checkCUDA(cudaStreamSynchronize(stream)); + +#ifdef INFERENCE_TESTS + std::vector token_ptrs, weight_ptrs, bias_ptrs, + coefficient_ptrs; + std::vector output_ptrs; + std::map num_t_per_exp; + for (int i = 0; i < num_indices; i++) { + int global_exp_label = indices_vec[i]; + + if (global_exp_label >= experts_start_idx && + global_exp_label < experts_start_idx + num_experts_per_block && + (num_t_per_exp.find(global_exp_label) == num_t_per_exp.end() || + num_t_per_exp[global_exp_label] < expert_capacity)) { + if (num_t_per_exp.find(global_exp_label) == num_t_per_exp.end()) { + num_t_per_exp[global_exp_label] = 1; + } else { + num_t_per_exp[global_exp_label] = num_t_per_exp[global_exp_label] + 1; + } + int token_idx = i / num_chosen_experts; + // std::cout << "Push back token_idx (" << token_idx << ") * data_dim (" + // << data_dim << "): " << token_idx*data_dim << std::endl; + + token_ptrs.push_back(&input[token_idx * data_dim]); + coefficient_ptrs.push_back(&topk_gate_preds[i]); + int local_exp_label = global_exp_label - experts_start_idx; + weight_ptrs.push_back(weights[local_exp_label * (1 + use_bias)]); + output_ptrs.push_back(&output[token_idx * out_dim]); + if (use_bias) { + bias_ptrs.push_back( + weights[local_exp_label * (1 + use_bias) + use_bias]); + } + } + } + + int i = 0, s = 0; + for (auto it : num_t_per_exp) { + int num_t = it.second; + s += num_t; + /* if (num_assignments_per_expert_cpu[i] != num_t) { + std::cout << "num_assignments_per_expert_cpu: "; + for (int j=0; j token_ptrs_sorted(token_ptrs.size()), + weight_ptrs_sorted(weight_ptrs.size()), + bias_ptrs_sorted(bias_ptrs.size()), + coefficient_ptrs_sorted(coefficient_ptrs.size()); + std::vector output_ptrs_sorted(output_ptrs.size()); + std::copy(token_ptrs.begin(), token_ptrs.end(), token_ptrs_sorted.begin()); + std::sort(token_ptrs_sorted.begin(), token_ptrs_sorted.end()); + std::copy(weight_ptrs.begin(), weight_ptrs.end(), weight_ptrs_sorted.begin()); + std::sort(weight_ptrs_sorted.begin(), weight_ptrs_sorted.end()); + std::copy(bias_ptrs.begin(), bias_ptrs.end(), bias_ptrs_sorted.begin()); + std::sort(bias_ptrs_sorted.begin(), bias_ptrs_sorted.end()); + std::copy(coefficient_ptrs.begin(), + coefficient_ptrs.end(), + coefficient_ptrs_sorted.begin()); + std::sort(coefficient_ptrs_sorted.begin(), coefficient_ptrs_sorted.end()); + std::copy(output_ptrs.begin(), output_ptrs.end(), output_ptrs_sorted.begin()); + std::sort(output_ptrs_sorted.begin(), output_ptrs_sorted.end()); + + // Download + float const **token_idx_array_thrust = + (float const **)calloc(gemm_batch_count, 
sizeof(float const *)); + assert(token_idx_array_thrust); + checkCUDA(cudaMemcpy(token_idx_array_thrust, + m->token_idx_array, + sizeof(float const *) * gemm_batch_count, + cudaMemcpyDeviceToHost)); + std::vector token_idx_array_thrust_vec( + token_idx_array_thrust, token_idx_array_thrust + gemm_batch_count); + float const **weight_idx_array_thrust = + (float const **)calloc(gemm_batch_count, sizeof(float const *)); + assert(weight_idx_array_thrust); + checkCUDA(cudaMemcpy(weight_idx_array_thrust, + m->weight_idx_array, + sizeof(float const *) * gemm_batch_count, + cudaMemcpyDeviceToHost)); + std::vector weight_idx_array_thrust_vec( + weight_idx_array_thrust, weight_idx_array_thrust + gemm_batch_count); + float const **coefficient_idx_array_thrust = + (float const **)calloc(gemm_batch_count, sizeof(float const *)); + assert(coefficient_idx_array_thrust); + checkCUDA(cudaMemcpy(coefficient_idx_array_thrust, + m->coefficient_idx_array, + sizeof(float const *) * gemm_batch_count, + cudaMemcpyDeviceToHost)); + std::vector coefficient_idx_array_thrust_vec( + coefficient_idx_array_thrust, + coefficient_idx_array_thrust + gemm_batch_count); + float const **bias_idx_array_thrust = + (float const **)calloc(gemm_batch_count, sizeof(float const *)); + assert(bias_idx_array_thrust); + if (use_bias) { + checkCUDA(cudaMemcpy(bias_idx_array_thrust, + m->bias_idx_array, + sizeof(float const *) * gemm_batch_count, + cudaMemcpyDeviceToHost)); + } + std::vector bias_idx_array_thrust_vec( + bias_idx_array_thrust, bias_idx_array_thrust + gemm_batch_count); + float **output_idx_array_thrust = + (float **)calloc(gemm_batch_count, sizeof(float *)); + assert(output_idx_array_thrust); + checkCUDA(cudaMemcpy(output_idx_array_thrust, + m->output_idx_array, + sizeof(float *) * gemm_batch_count, + cudaMemcpyDeviceToHost)); + std::vector output_idx_array_thrust_vec( + output_idx_array_thrust, output_idx_array_thrust + gemm_batch_count); + + std::vector token_idx_array_thrust_vec_sorted( + token_idx_array_thrust_vec.size()), + weight_idx_array_thrust_vec_sorted(weight_idx_array_thrust_vec.size()), + coefficient_idx_array_thrust_vec_sorted( + coefficient_idx_array_thrust_vec.size()), + bias_idx_array_thrust_vec_sorted(bias_idx_array_thrust_vec.size()); + std::vector output_idx_array_thrust_vec_sorted( + output_idx_array_thrust_vec.size()); + std::copy(token_idx_array_thrust_vec.begin(), + token_idx_array_thrust_vec.end(), + token_idx_array_thrust_vec_sorted.begin()); + std::sort(token_idx_array_thrust_vec_sorted.begin(), + token_idx_array_thrust_vec_sorted.end()); + std::copy(weight_idx_array_thrust_vec.begin(), + weight_idx_array_thrust_vec.end(), + weight_idx_array_thrust_vec_sorted.begin()); + std::sort(weight_idx_array_thrust_vec_sorted.begin(), + weight_idx_array_thrust_vec_sorted.end()); + std::copy(coefficient_idx_array_thrust_vec.begin(), + coefficient_idx_array_thrust_vec.end(), + coefficient_idx_array_thrust_vec_sorted.begin()); + std::sort(coefficient_idx_array_thrust_vec_sorted.begin(), + coefficient_idx_array_thrust_vec_sorted.end()); + std::copy(bias_idx_array_thrust_vec.begin(), + bias_idx_array_thrust_vec.end(), + bias_idx_array_thrust_vec_sorted.begin()); + std::sort(bias_idx_array_thrust_vec_sorted.begin(), + bias_idx_array_thrust_vec_sorted.end()); + std::copy(output_idx_array_thrust_vec.begin(), + output_idx_array_thrust_vec.end(), + output_idx_array_thrust_vec_sorted.begin()); + std::sort(output_idx_array_thrust_vec_sorted.begin(), + output_idx_array_thrust_vec_sorted.end()); + + if (token_ptrs_sorted 
!= token_idx_array_thrust_vec_sorted) { + std::cout << "token_ptrs: "; + for (int i = 0; i < token_ptrs_sorted.size(); i++) { + std::cout << token_ptrs_sorted[i] << " "; + } + std::cout << std::endl; + std::cout << "token_idx_array_thrust_vec: "; + for (int i = 0; i < token_idx_array_thrust_vec_sorted.size(); i++) { + std::cout << token_idx_array_thrust_vec_sorted[i] << " "; + } + std::cout << std::endl; + std::cout << "Input: " << input << std::endl; + std::cout << "data_dim: " << data_dim << std::endl; + std::cout << "out_dim: " << out_dim << std::endl; + std::cout << "expert_start_idx: " << experts_start_idx << std::endl; + std::cout << "indices: "; + for (int i = 0; i < indices_vec.size(); i++) { + std::cout << indices_vec[i] << " "; + } + std::cout << std::endl; + std::cout << "indices_vec_sorted: "; + for (int i = 0; i < indices_vec_sorted.size(); i++) { + std::cout << indices_vec_sorted[i] << " "; + } + std::cout << std::endl; + } + assert(token_ptrs_sorted == token_idx_array_thrust_vec_sorted); + assert(weight_ptrs_sorted == weight_idx_array_thrust_vec_sorted); + if (coefficient_ptrs_sorted != coefficient_idx_array_thrust_vec_sorted) { + std::cout << "coefficient_ptrs_sorted: "; + for (int i = 0; i < coefficient_ptrs_sorted.size(); i++) { + std::cout << coefficient_ptrs_sorted[i] << " "; + } + std::cout << std::endl; + std::cout << "coefficient_idx_array_thrust_vec_sorted: "; + for (int i = 0; i < coefficient_idx_array_thrust_vec_sorted.size(); i++) { + std::cout << coefficient_idx_array_thrust_vec_sorted[i] << " "; + } + std::cout << std::endl; + std::cout << "topk_gate_preds: " << topk_gate_preds << std::endl; + std::cout << "data_dim: " << data_dim << std::endl; + std::cout << "out_dim: " << out_dim << std::endl; + std::cout << "expert_start_idx: " << experts_start_idx << std::endl; + std::cout << "indices: "; + for (int i = 0; i < indices_vec.size(); i++) { + std::cout << indices_vec[i] << " "; + } + std::cout << std::endl; + std::cout << "indices_vec_sorted: "; + for (int i = 0; i < indices_vec_sorted.size(); i++) { + std::cout << indices_vec_sorted[i] << " "; + } + std::cout << std::endl; + } + assert(coefficient_ptrs_sorted == coefficient_idx_array_thrust_vec_sorted); + if (use_bias) { + assert(bias_ptrs_sorted == bias_idx_array_thrust_vec_sorted); + } + assert(output_ptrs_sorted == output_idx_array_thrust_vec_sorted); + + assert(token_ptrs_sorted.size() == gemm_batch_count && + weight_ptrs_sorted.size() == gemm_batch_count && + coefficient_ptrs_sorted.size() == gemm_batch_count && + (!use_bias || bias_ptrs_sorted.size() == gemm_batch_count) && + output_ptrs_sorted.size() == gemm_batch_count); + + for (int i = 0; i < token_ptrs_sorted.size(); i++) { + assert(token_ptrs_sorted[i]); + assert(weight_ptrs_sorted[i]); + assert(coefficient_ptrs_sorted[i]); + if (use_bias) { + assert(bias_ptrs_sorted[i]); + } + assert(output_ptrs_sorted[i]); + } + + free(token_idx_array_thrust); + free(weight_idx_array_thrust); + free(coefficient_idx_array_thrust); + free(bias_idx_array_thrust); + free(output_idx_array_thrust); + + checkCUDA(cudaFreeHost(indices_cpu)); + indices_vec.clear(); + indices_vec.shrink_to_fit(); + indices_vec_sorted.clear(); + indices_vec_sorted.shrink_to_fit(); + num_assignments_per_expert_cpu.clear(); + num_assignments_per_expert_cpu.shrink_to_fit(); + + token_ptrs.clear(); + token_ptrs.shrink_to_fit(); + token_ptrs_sorted.clear(); + token_ptrs_sorted.shrink_to_fit(); + weight_ptrs.clear(); + weight_ptrs.shrink_to_fit(); + weight_ptrs_sorted.clear(); + 
weight_ptrs_sorted.shrink_to_fit(); + bias_ptrs.clear(); + bias_ptrs.shrink_to_fit(); + bias_ptrs_sorted.clear(); + bias_ptrs_sorted.shrink_to_fit(); + coefficient_ptrs.clear(); + coefficient_ptrs.shrink_to_fit(); + output_ptrs.clear(); + output_ptrs.shrink_to_fit(); + output_ptrs_sorted.clear(); + output_ptrs_sorted.shrink_to_fit(); + + token_idx_array_thrust_vec_sorted.clear(); + token_idx_array_thrust_vec_sorted.shrink_to_fit(); + weight_idx_array_thrust_vec_sorted.clear(); + weight_idx_array_thrust_vec_sorted.shrink_to_fit(); + coefficient_idx_array_thrust_vec_sorted.clear(); + coefficient_idx_array_thrust_vec_sorted.shrink_to_fit(); + bias_idx_array_thrust_vec_sorted.clear(); + bias_idx_array_thrust_vec_sorted.shrink_to_fit(); + output_idx_array_thrust_vec_sorted.clear(); + output_idx_array_thrust_vec_sorted.shrink_to_fit(); + + // Check batch output pointers + assert(gemm_batch_count <= m->effective_batch_size); + float **dev_batch_outputs_cuda = (float **)calloc( + num_chosen_experts * m->effective_batch_size, sizeof(float *)); + assert(dev_batch_outputs_cuda); + checkCUDA( + cudaMemcpy(dev_batch_outputs_cuda, + m->dev_batch_outputs, + sizeof(float *) * num_chosen_experts * m->effective_batch_size, + cudaMemcpyDeviceToHost)); + std::vector dev_batch_outputs_cuda_vec( + dev_batch_outputs_cuda, + dev_batch_outputs_cuda + num_chosen_experts * m->effective_batch_size); + + std::vector batch_outputs_host_vec( + m->batch_outputs, + m->batch_outputs + num_chosen_experts * m->effective_batch_size); + assert(batch_outputs_host_vec == dev_batch_outputs_cuda_vec); + + /* std::cout << "dev_batch_outputs_cuda_vec[i]: "; + for (int i=0; i0) { + assert(dev_batch_outputs_cuda_vec[i] == dev_batch_outputs_cuda_vec[i-1] + + out_dim); + } + std::cout << dev_batch_outputs_cuda_vec[i] << " "; + } + std::cout << std::endl; */ + + free(dev_batch_outputs_cuda); +#endif experts_forward_GemmBatched_kernel(m, (void const **)m->weight_idx_array, @@ -510,7 +1064,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, gemm_batch_count, stream); - cudaStreamSynchronize(stream); + checkCUDA(cudaStreamSynchronize(stream)); int aggregation_parallelism = std::max(num_tokens, gemm_batch_count) * out_dim; @@ -567,8 +1121,12 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, std::max(num_experts, num_chosen_experts * effective_batch_size) * sizeof(int))); checkCUDA(cudaMalloc(&exp_local_label_to_index, num_experts * sizeof(int))); - // expert_start_indexes needs one more slot to save the upper bound index - checkCUDA(cudaMalloc(&expert_start_indexes, (num_experts + 1) * sizeof(int))); + // expert_start_indexes needs one more slot to save the upper bound index. + // Initial sequence can require more space, though. 
+ checkCUDA(cudaMalloc( + &expert_start_indexes, + std::max(num_experts + 1, num_chosen_experts * effective_batch_size) * + sizeof(int))); checkCUDA(cudaMalloc(&num_assignments_per_expert, num_experts * sizeof(int))); checkCUDA(cudaMalloc(&destination_start_indices, num_experts * sizeof(int))); @@ -598,7 +1156,7 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, out_dim * num_chosen_experts * effective_batch_size * sizeof(float))); for (int i = 1; i < num_chosen_experts * effective_batch_size; i++) { - batch_outputs[i] = batch_outputs[i - 1] + out_dim * sizeof(float); + batch_outputs[i] = batch_outputs[i - 1] + out_dim; } checkCUDA( cudaMalloc(&dev_batch_outputs, From aa8175e84ef58af1d570c29d86e995cd8742b6a0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 7 Apr 2023 14:05:48 +0000 Subject: [PATCH 093/344] comment out stream sync --- src/ops/experts.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 1a81d9118c..4b16e176d8 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -443,7 +443,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, &gemm_batch_count, stream); - checkCUDA(cudaStreamSynchronize(stream)); + //checkCUDA(cudaStreamSynchronize(stream)); #ifdef INFERENCE_TESTS // Checking @@ -732,7 +732,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, m->coefficient_idx_array, m->output_idx_array); - checkCUDA(cudaStreamSynchronize(stream)); + //checkCUDA(cudaStreamSynchronize(stream)); #ifdef INFERENCE_TESTS std::vector token_ptrs, weight_ptrs, bias_ptrs, @@ -1064,7 +1064,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, gemm_batch_count, stream); - checkCUDA(cudaStreamSynchronize(stream)); + //checkCUDA(cudaStreamSynchronize(stream)); int aggregation_parallelism = std::max(num_tokens, gemm_batch_count) * out_dim; From f842844f07b1d5e2b7537ab0eff0242db713f69f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 7 Apr 2023 13:56:24 -0400 Subject: [PATCH 094/344] [Inference][MoE] - Use single weight for all fused experts (#679) * use one weight for all experts * bug fix * bug fix --- include/flexflow/ops/experts.h | 3 +- include/flexflow/ops/experts_params.h | 4 +- src/ops/experts.cc | 273 +++++++++++--------------- src/ops/experts.cpp | 3 +- src/ops/experts.cu | 36 ++-- 5 files changed, 140 insertions(+), 179 deletions(-) diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 0f51187c78..7f110c79b6 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -126,7 +126,8 @@ class Experts : public Op { int const *indices, float const *topk_gate_preds, float *output, - float const **weights, + float const *weights, + float const *biases, int num_active_tokens, int chosen_experts, int batch_size, diff --git a/include/flexflow/ops/experts_params.h b/include/flexflow/ops/experts_params.h index d5b4676f0e..e5aa2f1ebb 100644 --- a/include/flexflow/ops/experts_params.h +++ b/include/flexflow/ops/experts_params.h @@ -47,7 +47,9 @@ struct ExpertsParams { OUTPUT_REPLICA, KERNEL_CHANNEL_IN, KERNEL_CHANNEL_OUT, - BIAS_CHANNEL_OUT + KERNEL_NUM_EXPERTS, + BIAS_CHANNEL_OUT, + BIAS_NUM_EXPERTS, }; std::unordered_map diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 983c682bd6..6eaa3be943 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -82,7 +82,7 @@ Tensor FFModel::experts(Tensor const *inputs, DT_FLOAT, name, 3 /*inputs*/, - num_experts * (1 + use_bias) /*weights*/, + (1 + use_bias) /*weights*/, 1 /*outputs*/, 
inputs); { @@ -95,29 +95,15 @@ Tensor FFModel::experts(Tensor const *inputs, num_dims, dims, DT_FLOAT, e, 0, true /*create_grad*/); assert(e->outputs[0] != nullptr); } - for (int i = 0; i < num_experts; i++) { - { - int dims[2] = {inputs[0]->dims[0], experts_output_dim_size}; - e->weights[i * (1 + use_bias)] = - create_weight_legion_ordering(2, - dims, - DT_FLOAT, - e, - true /*create_grad*/, - nullptr, - CHOSEN_SYNC_TYPE); - } - if (use_bias) { - int dims[1] = {experts_output_dim_size}; - e->weights[i * (1 + use_bias) + use_bias] = - create_weight_legion_ordering(1, - dims, - DT_FLOAT, - e, - true /*create_grad*/, - nullptr, - CHOSEN_SYNC_TYPE); - } + { + int dims[3] = {inputs[0]->dims[0], experts_output_dim_size, num_experts}; + e->weights[0] = create_weight_legion_ordering( + 3, dims, DT_FLOAT, e, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); + } + if (use_bias) { + int dims[2] = {experts_output_dim_size, num_experts}; + e->weights[1] = create_weight_legion_ordering( + 2, dims, DT_FLOAT, e, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); } e->add_int_property("num_experts", num_experts); @@ -285,7 +271,7 @@ Experts::Experts(FFModel &model, DT_FLOAT, name, 3 /*inputs*/, - _num_experts * (1 + _use_bias) /*weights*/, + (1 + _use_bias) /*weights*/, 1 /*outputs*/, inputs), num_experts(_num_experts), experts_start_idx(_experts_start_idx), @@ -301,7 +287,7 @@ Experts::Experts(FFModel &model, assert(num_experts > 0); assert(numInputs == 3); assert(numOutputs == 1); - assert(numWeights == num_experts * (1 + use_bias)); + assert(numWeights == (1 + use_bias)); // Check input dimensions int num_dims = inputs[0]->num_dims; @@ -358,35 +344,31 @@ Experts::Experts(FFModel &model, #else ParameterSyncType comm_type = ParameterSyncType::PS; #endif - for (int i = 0; i < num_experts; i++) { + { Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); - { - // ParallelDim dims[2] = {inputs[0]->dims[0], out_dims[0]}; - weights[i * (1 + use_bias)] = - model.create_parallel_weight_legion_ordering( - kernel_shape.num_dims, // 2, - kernel_shape.dims, // dims, - DT_FLOAT, - NULL /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - comm_type); - assert(weights[i * (1 + use_bias)] != nullptr); - } - if (use_bias) { - Initializer *bias_initializer = new ZeroInitializer(); - ParallelDim dims[1] = {out_dims[0]}; - weights[i * (1 + use_bias) + use_bias] = - model.create_parallel_weight_legion_ordering( - bias_shape.num_dims, // 1, - bias_shape.dims, // dims, - DT_FLOAT, - NULL /*owner_op*/, - true /*create_grad*/, - bias_initializer, - comm_type); - assert(weights[i * (1 + use_bias) + use_bias] != nullptr); - } + assert(kernel_shape.dims[2].size == num_experts); + weights[0] = model.create_parallel_weight_legion_ordering( + kernel_shape.num_dims, // 3, + kernel_shape.dims, // dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + comm_type); + assert(weights[0] != nullptr); + } + if (use_bias) { + Initializer *bias_initializer = new ZeroInitializer(); + assert(bias_shape.dims[1].size == num_experts); + weights[1] = model.create_parallel_weight_legion_ordering( + bias_shape.num_dims, // 1, + bias_shape.dims, // dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + bias_initializer, + comm_type); + assert(weights[1] != nullptr); } } assert(check_output_input_weight_parallel_dims(allocate_weights)); @@ -490,23 +472,19 @@ void Experts::init_inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(3, FID_DATA); - for 
(int i = 0; i < num_experts; i++) { - launcher.add_region_requirement( - RegionRequirement(weights[i * (1 + use_bias)]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i * (1 + use_bias)]->region)); - launcher.add_field(4 + i * (1 + use_bias), FID_DATA); - if (use_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[i * (1 + use_bias) + use_bias]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i * (1 + use_bias) + use_bias]->region)); - launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); - } + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(5, FID_DATA); } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); @@ -555,23 +533,19 @@ void Experts::init(FFModel const &ff) { EXCLUSIVE, outputs[0]->region)); launcher.add_field(3, FID_DATA); - for (int i = 0; i < num_experts; i++) { - launcher.add_region_requirement( - RegionRequirement(weights[i * (1 + use_bias)]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i * (1 + use_bias)]->region)); - launcher.add_field(4 + i * (1 + use_bias), FID_DATA); - if (use_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[i * (1 + use_bias) + use_bias]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i * (1 + use_bias) + use_bias]->region)); - launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); - } + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(5, FID_DATA); } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); @@ -641,23 +615,19 @@ void Experts::forward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region)); launcher.add_field(3, FID_DATA); - for (int i = 0; i < num_experts; i++) { - launcher.add_region_requirement( - RegionRequirement(weights[i * (1 + use_bias)]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i * (1 + use_bias)]->region)); - launcher.add_field(4 + i * (1 + use_bias), FID_DATA); - if (use_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[i * (1 + use_bias) + use_bias]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i * (1 + use_bias) + use_bias]->region)); - launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); - } + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(5, FID_DATA); } runtime->execute_index_space(ctx, launcher); } @@ -714,23 +684,19 @@ FutureMap Experts::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(3, FID_DATA); - for (int i = 0; i < num_experts; i++) { - launcher.add_region_requirement( - RegionRequirement(weights[i * (1 
+ use_bias)]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i * (1 + use_bias)]->region)); - launcher.add_field(4 + i * (1 + use_bias), FID_DATA); - if (use_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[i * (1 + use_bias) + use_bias]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i * (1 + use_bias) + use_bias]->region)); - launcher.add_field(4 + i * (1 + use_bias) + use_bias, FID_DATA); - } + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(5, FID_DATA); } return runtime->execute_index_space(ctx, launcher); } @@ -749,7 +715,7 @@ void Experts::inference_task(Task const *task, int num_experts = m->num_experts; bool use_bias = m->use_bias; - assert(regions.size() - 4 == num_experts * (1 + use_bias)); + assert(regions.size() - 4 == (1 + use_bias)); // get input, indices, topk_gate_preds, outputs float const *input_ptr = helperGetTensorPointerRO( @@ -818,37 +784,29 @@ void Experts::inference_task(Task const *task, } // get weights - float const *weights_ptrs[num_experts * (1 + use_bias)]; - for (int i = 0; i < num_experts; i++) { - weights_ptrs[i * (1 + use_bias)] = - helperGetTensorPointerRO(regions[4 + i * (1 + use_bias)], - task->regions[4 + i * (1 + use_bias)], - FID_DATA, - ctx, - runtime); - Domain weights_domain = runtime->get_index_space_domain( - ctx, task->regions[4 + i * (1 + use_bias)].region.get_index_space()); - int weights_dims = weights_domain.get_dim(); - assert(weights_dims == input_dims); - assert(weights_domain.hi()[0] - weights_domain.lo()[0] + 1 == data_dim); - assert(weights_domain.hi()[1] - weights_domain.lo()[1] + 1 == out_dim); - if (use_bias) { - weights_ptrs[i * (1 + use_bias) + use_bias] = - helperGetTensorPointerRO( - regions[4 + i * (1 + use_bias) + use_bias], - task->regions[4 + i * (1 + use_bias) + use_bias], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, - task->regions[4 + i * (1 + use_bias) + use_bias] - .region.get_index_space()); - int bias_dims = bias_domain.get_dim(); - assert(bias_dims == 4); - assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == out_dim); - } + float const *weights_ptr = helperGetTensorPointerRO( + regions[4], task->regions[4], FID_DATA, ctx, runtime); + assert(weights_ptr != nullptr); + Domain weights_domain = runtime->get_index_space_domain( + ctx, task->regions[4].region.get_index_space()); + int weights_dims = weights_domain.get_dim(); + assert(weights_dims == input_dims); + assert(weights_domain.hi()[0] - weights_domain.lo()[0] + 1 == data_dim); + assert(weights_domain.hi()[1] - weights_domain.lo()[1] + 1 == out_dim); + assert(weights_domain.hi()[2] - weights_domain.lo()[2] + 1 == num_experts); + + float const *bias_ptr = nullptr; + if (use_bias) { + bias_ptr = helperGetTensorPointerRO( + regions[5], task->regions[5], FID_DATA, ctx, runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[5].region.get_index_space()); + int bias_dims = bias_domain.get_dim(); + assert(bias_dims == 4); + assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == out_dim); + assert(bias_domain.hi()[1] - bias_domain.lo()[1] + 1 == num_experts); } + #ifdef INFERENCE_TESTS if (DEBUG_MODE) { std::cout << 
"forward_kernel_wrapper" << std::endl @@ -989,7 +947,8 @@ void Experts::inference_task(Task const *task, indices_ptr, topk_gate_pred_ptr, output_ptr, - weights_ptrs, + weights_ptr, + bias_ptr, num_active_tokens, chosen_experts, batch_size, @@ -1140,7 +1099,9 @@ std::unordered_map {OUTPUT_REPLICA, num_dims - 1}, {KERNEL_CHANNEL_IN, 0}, {KERNEL_CHANNEL_OUT, 1}, - {BIAS_CHANNEL_OUT, 0}}; + {KERNEL_NUM_EXPERTS, 2}, + {BIAS_CHANNEL_OUT, 0}, + {BIAS_NUM_EXPERTS, 1}}; } void ExpertsParams::calculate_nonreplica_dim_sizes( @@ -1168,11 +1129,13 @@ void ExpertsParams::calculate_nonreplica_dim_sizes( input_shape.dims[INPUT_CHANNEL].degree; kernel_dims[dimension_names.at(KERNEL_CHANNEL_OUT)].size = experts_output_dim_size; + kernel_dims[dimension_names.at(KERNEL_NUM_EXPERTS)].size = num_experts; *kernel_ndims = num_dims; } if (bias_dims != nullptr) { bias_dims[dimension_names.at(BIAS_CHANNEL_OUT)].size = experts_output_dim_size; + bias_dims[dimension_names.at(BIAS_NUM_EXPERTS)].size = num_experts; *bias_ndims = num_dims; } } diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index 787c6e2d88..88456ac66c 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -25,7 +25,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int const *indices, float const *topk_gate_preds, float *output, - float const **weights, + float const *weights, + float const *biases, int num_active_tokens, int chosen_experts, int batch_size, diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 4b16e176d8..82f128fd1b 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -210,7 +210,8 @@ __global__ void experts_forward_prepare_kernel( float const *input, // @In: Tokens' values (in_dim, batch_size) float *output, float const **token_idx_array, // @Out: Barray for GemmBatchedEx - float const **weights, // @In: Experts' weights + float const *weights, // @In: Experts' weights + float const *biases, // @In: Experts' biases float const **weight_idx_array, // @Out: Aarray for GemmBatchedEx float const **bias_idx_array, // @Out: Experts' bias float const *coefficients, // @In: topk_gate_predss coefficients tensor @@ -233,11 +234,11 @@ __global__ void experts_forward_prepare_kernel( within_expert_offset] = &input[token_idx * data_dim]; weight_idx_array[destination_start_indices[expert_index] + within_expert_offset] = - weights[local_expert_label * (1 + use_bias)]; + &weights[local_expert_label * data_dim * out_dim]; if (use_bias) { bias_idx_array[destination_start_indices[expert_index] + within_expert_offset] = - weights[local_expert_label * (1 + use_bias) + use_bias]; + &biases[local_expert_label * out_dim]; } coefficient_idx_array[destination_start_indices[expert_index] + within_expert_offset] = &coefficients[rev_idx]; @@ -378,7 +379,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int const *indices, float const *topk_gate_preds, float *output, - float const **weights, + float const *weights, + float const *biases, int num_active_tokens, int chosen_experts, int batch_size, @@ -411,13 +413,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, // assert(num_tokens == batch_size); assert(out_dim == m->out_dim); - // TODO: remove this once we condense all weights in a single tensor - // currently each weight matrix is placed on GPU by Legion, but the array - // holding the pointers to each weight matrix is on CPU - cudaMemcpy(m->dev_weights, - weights, - num_experts_per_block * (1 + use_bias) * sizeof(float *), - cudaMemcpyHostToDevice); + assert(weights != nullptr); + assert(use_bias 
== (biases != nullptr)); int num_indices = num_tokens * num_chosen_experts; // values below are set by Thrust in the experts_forward_thrust_wrapper @@ -443,7 +440,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, &gemm_batch_count, stream); - //checkCUDA(cudaStreamSynchronize(stream)); + // checkCUDA(cudaStreamSynchronize(stream)); #ifdef INFERENCE_TESTS // Checking @@ -725,14 +722,15 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, input, output, m->token_idx_array, - m->dev_weights, + weights, + biases, m->weight_idx_array, m->bias_idx_array, topk_gate_preds, m->coefficient_idx_array, m->output_idx_array); - //checkCUDA(cudaStreamSynchronize(stream)); + // checkCUDA(cudaStreamSynchronize(stream)); #ifdef INFERENCE_TESTS std::vector token_ptrs, weight_ptrs, bias_ptrs, @@ -758,11 +756,10 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, token_ptrs.push_back(&input[token_idx * data_dim]); coefficient_ptrs.push_back(&topk_gate_preds[i]); int local_exp_label = global_exp_label - experts_start_idx; - weight_ptrs.push_back(weights[local_exp_label * (1 + use_bias)]); + weight_ptrs.push_back(&weights[local_exp_label * (out_dim * data_dim)]); output_ptrs.push_back(&output[token_idx * out_dim]); if (use_bias) { - bias_ptrs.push_back( - weights[local_exp_label * (1 + use_bias) + use_bias]); + bias_ptrs.push_back(&biases[local_exp_label * out_dim]); } } } @@ -1064,7 +1061,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, gemm_batch_count, stream); - //checkCUDA(cudaStreamSynchronize(stream)); + // checkCUDA(cudaStreamSynchronize(stream)); int aggregation_parallelism = std::max(num_tokens, gemm_batch_count) * out_dim; @@ -1133,8 +1130,6 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, checkCUDA( cudaMalloc(&token_idx_array, num_chosen_experts * effective_batch_size * sizeof(float *))); - checkCUDA( - cudaMalloc(&dev_weights, num_experts * (1 + use_bias) * sizeof(float *))); checkCUDA( cudaMalloc(&weight_idx_array, num_chosen_experts * effective_batch_size * sizeof(float *))); @@ -1226,7 +1221,6 @@ ExpertsMeta::~ExpertsMeta(void) { checkCUDA(cudaFree(num_assignments_per_expert)); checkCUDA(cudaFree(destination_start_indices)); checkCUDA(cudaFree(token_idx_array)); - checkCUDA(cudaFree(dev_weights)); checkCUDA(cudaFree(weight_idx_array)); checkCUDA(cudaFree(coefficient_idx_array)); checkCUDA(cudaFree(output_idx_array)); From b74fa4f473b4eb9a8f437d725ea523c0d5846549 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Sat, 15 Apr 2023 23:52:52 -0400 Subject: [PATCH 095/344] LLaMA model (#662) * llama init * fix * load input seperately * fix load input index * fix code format * fix * fix * inference for element_unary and embedding * inference * fix inference * try to run inference * add inference cast * fix rms norm, inc_mha * remove debug print * c format * format * format * fix format * del * fix * split inference * fix * format * fix * fix * fix rocm * rocm * fix --------- Co-authored-by: Gabriele Oliaro --- CMakeLists.txt | 4 + examples/cpp/inference/LLAMA/CMakeLists.txt | 21 + examples/cpp/inference/LLAMA/Makefile | 39 ++ examples/cpp/inference/LLAMA/dataloader.cc | 301 +++++++++++++++ examples/cpp/inference/LLAMA/dataloader.cu | 112 ++++++ examples/cpp/inference/LLAMA/llama.cc | 259 +++++++++++++ examples/cpp/inference/LLAMA/llama.h | 107 ++++++ examples/cpp/inference/dataloader.cu | 2 +- include/flexflow/model.h | 1 + include/flexflow/ops/cast.h | 9 + include/flexflow/ops/element_unary.h | 9 + 
.../ops/inc_multihead_self_attention.h | 7 +- .../ops/inc_multihead_self_attention_params.h | 2 +- include/flexflow/ops/rms_norm.h | 10 +- include/flexflow/ops/rms_norm_params.h | 2 +- include/flexflow/ops/split.h | 9 + include/flexflow/substitution_loader.h | 180 ++++----- include/flexflow/utils/cuda_helper.h | 9 + include/flexflow/utils/hip_helper.h | 6 + python/flexflow/core/flexflow_cffi.py | 59 ++- python/flexflow/type.py | 1 + python/flexflow_c.cc | 31 ++ python/flexflow_c.h | 14 + src/ops/cast.cc | 74 ++++ src/ops/element_unary.cc | 100 +++++ src/ops/embedding.cc | 1 - src/ops/inc_multihead_self_attention.cc | 26 +- src/ops/inc_multihead_self_attention.cu | 157 +++++--- src/ops/kernels/rms_norm_kernels.cpp | 59 +++ src/ops/kernels/rms_norm_kernels.cu | 61 +-- src/ops/rms_norm.cc | 109 ++++-- src/ops/split.cc | 80 ++++ src/runtime/cuda_helper.cu | 50 ++- src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 5 +- src/runtime/model.cc | 24 ++ src/runtime/substitution.cc | 8 + triton/src/model.cc | 358 +++++++++--------- triton/src/types.h | 1 + 39 files changed, 1930 insertions(+), 379 deletions(-) create mode 100644 examples/cpp/inference/LLAMA/CMakeLists.txt create mode 100644 examples/cpp/inference/LLAMA/Makefile create mode 100644 examples/cpp/inference/LLAMA/dataloader.cc create mode 100644 examples/cpp/inference/LLAMA/dataloader.cu create mode 100644 examples/cpp/inference/LLAMA/llama.cc create mode 100644 examples/cpp/inference/LLAMA/llama.h create mode 100644 src/ops/kernels/rms_norm_kernels.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d61798f4e8..6c73eed895 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -456,6 +456,10 @@ if(FF_BUILD_MOE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/transformers) endif() +if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/inference/LLAMA) +endif() + # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/examples/cpp/inference/LLAMA/CMakeLists.txt b/examples/cpp/inference/LLAMA/CMakeLists.txt new file mode 100644 index 0000000000..48e9322af8 --- /dev/null +++ b/examples/cpp/inference/LLAMA/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExample_LLAMA) +set(project_target LLAMA) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + llama.cc + llama.h + dataloader.cc) + +set(GPU_SRC +dataloader.cu) + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/LLAMA/Makefile b/examples/cpp/inference/LLAMA/Makefile new file mode 100644 index 0000000000..4249443f7d --- /dev/null +++ b/examples/cpp/inference/LLAMA/Makefile @@ -0,0 +1,39 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama +# List all the application source files here +GEN_SRC = llama.cc dataloader.cc +GEN_GPU_SRC = dataloader.cu +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc new file mode 100644 index 0000000000..a09230029f --- /dev/null +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -0,0 +1,301 @@ +#include "llama.h" +#include + +using namespace Legion; + +DataLoader::DataLoader(FFModel &ff, + LLAMAConfig const *llamaconfig, + ParallelTensor const &input) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + num_samples = llamaconfig->sentence_len; + + { + batch_input = input; + int num_dims = input->num_dims; + + ParallelDim dims[num_dims]; + for (int i = 0; i < num_dims; i++) { + if (i == 0) { + dims[i].size = 1; + } else { + dims[i].size = input->dims[i].size; + } + + dims[i].degree = 1; + dims[i].parallel_idx = -1; + dims[i].is_replica_dim = input->dims[i].is_replica_dim; + // Assume only the first dim can be the replica dim + assert(i == num_dims - 1 || (!dims[i].is_replica_dim)); + } + dims[num_dims - 1].size = num_samples; + full_input = + ff.create_parallel_tensor_legion_ordering(num_dims, dims, DT_INT64); + assert(full_input != nullptr && "full_input is nullptr"); + ff.map_tensor(full_input, NULL /*parallel_op*/); + } + + size_t llamaconfig_size = sizeof(llamaconfig); + std::cout << "llama config dataloader: " << llamaconfig->input_path; + + // Load entire dataset + TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, + TaskArgument(llamaconfig, llamaconfig_size)); + // regions[1]: full_input + launcher.add_region_requirement(RegionRequirement(full_input->region, + WRITE_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_FB_MEMORY)); + launcher.add_field(0, FID_DATA); + runtime->execute_task(ctx, launcher); +} + +void DataLoader::load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 1); + assert(task->regions.size() == 1); + LLAMAConfig const *llamaconfig = (LLAMAConfig *)task->args; + + AccessorWO const acc_input(regions[0], FID_DATA); + Rect<3> rect_input = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + assert(acc_input.accessor.is_dense_arbitrary(rect_input)); + + long *input_ptr = acc_input.ptr(rect_input.lo); + std::cout << "load entire dataset" << rect_input.volume(); + + // load from file + 
load_from_file(input_ptr, + rect_input.volume(), + "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/tokens/" + "llama_demo_tokens"); +} + +void DataLoader::next_batch(FFModel &ff, + BatchConfig *bc, + std::map &batch_predictions) { + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + // Load Input + { + Domain domain = + runtime->get_index_space_domain(ctx, batch_input->parallel_is); + ArgumentMap argmap; + // int idx = next_index; + // for (Domain::DomainPointIterator it(domain); it; it++) { + // SampleIdxs meta; + // assert(ff.config.batchSize % batch_input->dims[1].size == 0); + // meta.num_samples = ff.config.batchSize / batch_input->dims[2].size; + // for (int i = 0; i < meta.num_samples; i++) { + // meta.idxs[i] = idx++; + // meta.token_idx = next_token_idx; + // meta.batch_idx = next_batch_index; + // } + + // argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); + // } + + DataLoaderNextBatchInput next_batch_input = {bc->token2ids, + batch_predictions}; + DataLoaderNextBatchInput const *ptr = &next_batch_input; + size_t next_batch_input_sz = sizeof(next_batch_input); + assert(ptr->prev_batch_preds.size() == batch_predictions.size()); + + std::cout << "next batch internal" << std::endl; + IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, + batch_input->parallel_is, + TaskArgument(ptr, next_batch_input_sz), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + batch_input->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(full_input->region, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + full_input->region, + MAP_TO_ZC_MEMORY)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_input->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_input->region)); + launcher.add_field(1, FID_DATA); + + runtime->execute_index_space(ctx, launcher); + } + // progress next_index + next_index += ff.config.batchSize; + next_token_idx += 1; +} + +void DataLoader::reset() { + next_index = 0; + next_token_idx = 0; + next_batch_index = 0; +} + +template +void DataLoader::load_from_file(T *ptr, size_t size, std::string filename) { + + std::cout << "load from file: " << filename << std::endl; + std::ifstream in(filename, std::ios::in | std::ios::binary); + std::vector host_array(size); + size_t loaded_data_size = sizeof(T) * size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + // std::cout << "size seee" << std::endl; + // std::cout << loaded_data_size << std::endl; + // std::cout << in_get_size << std::endl; + if (in_get_size != loaded_data_size) { + std::cout << "load data error"; + return; + } + + // std::cout << "finish loading input"; + assert(size == host_array.size()); + + // normal + long data_index = 0; + for (auto v : host_array) { + ptr[data_index++] = v; + } + in.close(); +} + +template +void DataLoader::load_attention_weights(T *ptr, + size_t size, + std::string layer_name, + std::string weight_path) { + + std::string q_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wq_weight"; + std::string k_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wk_weight"; + std::string v_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wv_weight"; + std::string o_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wo_weight"; + 
std::vector weight_files = {q_file, k_file, v_file, o_file}; + + size_t index = 0; + int file_index = 0; + + // q, k, v, o -> 0, 1, 2, 3 + for (auto file : weight_files) { + std::cout << "file name and index: " << file << "->" << file_index << "\n"; + size_t partial_size = size / 4; + std::ifstream in(file, std::ios::in | std::ios::binary); + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(T) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error"; + return; + } + assert(partial_size == host_array.size()); + + size_t one_head_size = 4096 * 128; + size_t data_index = 0; + + for (int i = 0; i < 32; i++) { + size_t start_index = i * one_head_size * 4 + file_index * one_head_size; + // if (file_index == 3) { + // printf("print wo start index %d, data %.10f\n", + // start_index, + // host_array.at(data_index)); + // } + for (size_t j = start_index; j < start_index + one_head_size; j++) { + ptr[j] = host_array.at(data_index); + data_index += 1; + } + } + file_index++; + + in.close(); + index++; + } +} + +void DataLoader::store_outputs(BatchConfig *bc, + InferenceResult const &ir, + std::map &batch_predictions) { + assert(bc->token2ids.num_samples == bc->num_active_tokens() && + bc->token2ids.num_samples <= bc->MAX_NUM_TOKENS); + + std::cout << "store outputs...." << std::endl; + batch_predictions.clear(); + size_t guid = bc->token2ids.guids[0]; + size_t start_idx = bc->token2ids.token_indexes[0].token_position; + + for (size_t i = 0; i <= bc->token2ids.num_samples; i++) { + if (i == bc->token2ids.num_samples || bc->token2ids.guids[i] != guid) { + // see how many tokens has been put to model in this req + // to get the index of the final token + int result_index = + bc->token2ids.token_indexes[i - 1].token_position - start_idx; + batch_predictions[guid] = ir.results[i - 1]; + std::cout << "i: " << i << ", dds-" << guid << ", result index" + << result_index << ", result value: " << batch_predictions[guid] + << "\n"; + + if (i < bc->token2ids.num_samples) { + guid = bc->token2ids.guids[i]; + start_idx = bc->token2ids.token_indexes[i].token_position; + } + } + } + // bc->print(); + // for (size_t i = 0; i < bc->num_active_requests(); i++) { + // batch_predictions[i] = ir.results[i]; + // std::cout << "i: " << i << ", ith pred: " << i + // << ", value: " << batch_predictions[i] + // << std::endl; + // } + assert(batch_predictions.size() == bc->num_active_requests()); +} + +template void DataLoader::load_attention_weights( + float *ptr, size_t size, std::string layer_name, std::string weight_path); +template void DataLoader::load_from_file(long *ptr, + size_t size, + std::string filename); +template void DataLoader::load_from_file(float *ptr, + size_t size, + std::string filename); + +void FlexFlow::register_custom_tasks() { + // Load entire dataset + { + TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Entire Dataset Task"); + } + // Load input + { + TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Load Inputs Task"); + } +} diff --git 
a/examples/cpp/inference/LLAMA/dataloader.cu b/examples/cpp/inference/LLAMA/dataloader.cu new file mode 100644 index 0000000000..f2480c8592 --- /dev/null +++ b/examples/cpp/inference/LLAMA/dataloader.cu @@ -0,0 +1,112 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/utils/cuda_helper.h" +#include "llama.h" + +void DataLoader::load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + LLAMAConfig llamaconfig; + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // SampleIdxs *meta = (SampleIdxs *)task->local_args; + + DataLoaderNextBatchInput const input_struct = + *((DataLoaderNextBatchInput *)task->args); + BatchConfig::SampleIdxs const &meta = input_struct.meta; + std::map const &prev_batch_preds = + input_struct.prev_batch_preds; + + TensorAccessorR full_input( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + TensorAccessorW batch_input(regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); + Domain full_input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain batch_input_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + coord_t sequence_length = + batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; + coord_t batch_size = + batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; + + // copy 1 token from each batch + // FIXME: currently assume continous indices + size_t guid = meta.guids[0]; + size_t start_idx = meta.token_indexes[0].token_position; + size_t dst_idx = 0; + + std::cout << "num samples " << meta.num_samples << "\n"; + + for (size_t i = 0; i <= meta.num_samples; i++) { + + // if the first token in one request + if (i == meta.num_samples || meta.guids[i] != guid) { + size_t tokens_to_copy = + (meta.token_indexes[i - 1].token_position - start_idx + 1); + std::cout << "size to copy: " << tokens_to_copy << "\n"; + + if (tokens_to_copy > 1 || meta.token_indexes[i - 1].token_position < + meta.token_indexes[i - 1].initial_length) { + // token pos < init length, the init length is the input sentence length + // so this is the initial input, load from file. + + size_t copy_start_index = guid * llamaconfig.sentence_len; + std::cout << "copy index: " << copy_start_index << "\n"; + copy_kernel<<>>( + batch_input.ptr + dst_idx, + full_input.ptr + copy_start_index, + tokens_to_copy); + + std::cout << "------------req---------------: " << guid << "\n"; + if (guid == 0) { + std::cout << "guid: " << meta.guids[i] << ", i: " << i << std::endl; + } + for (int i = 0; i < 8; i++) { + std::cout << "value: " << full_input.ptr[copy_start_index + i] + << std::endl; + } + std::cout << "dst index: " << dst_idx << "\n"; + + } else { + // for token by token generating, get token from the previous inference. 
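+        // The token fetched below is the previous step's prediction for this
+        // request: store_outputs() records one result per guid in
+        // batch_predictions, which next_batch() forwards here as
+        // prev_batch_preds, so a single new token is copied host-to-device
+        // for this position.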
+ + long token = prev_batch_preds.at(guid); + std::cout << "next iter " << meta.token_indexes[i - 1].token_position + << ", dst_idx: " << dst_idx << ", token:" << token << "\n"; + long *dst_ptr = batch_input.ptr + dst_idx; + + cudaMemcpy(dst_ptr, &token, sizeof(long), cudaMemcpyHostToDevice); + } + + // update for next req + if (i < meta.num_samples) { + guid = meta.guids[i]; + start_idx = meta.token_indexes[i].token_position; + } + dst_idx = i; + } + } + + std::cout << "load input finished....." << std::endl; +} diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc new file mode 100644 index 0000000000..3d745d8bd5 --- /dev/null +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -0,0 +1,259 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "llama.h" +#include "flexflow/inference.h" + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("llama"); + +void parse_input_args(char **argv, int argc, LLAMAConfig &config) { + for (int i = 1; i < argc; i++) { + + // input + if (!strcmp(argv[i], "--dataset")) { + config.input_path = std::string(argv[++i]); + continue; + } + + // weights + if (!strcmp(argv[i], "--weights")) { + config.weight_file_path = std::string(argv[++i]); + continue; + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + LLAMAConfig llamaConfig; + FFModel ff(ffconfig); + std::unordered_map weights_layers; + + // InputArgs const &command_args = HighLevelRuntime::get_input_args(); + // char **argv = command_args.argv; + // int argc = command_args.argc; + // parse_input_args(argv, argc, llamaConfig); + + std::cout << "print llama config: " << llamaConfig.input_path << "-->" + << llamaConfig.batchSize; + + //------------------------------ build the model -------------------------- + Tensor input; + { + int const token_dims[] = {llamaConfig.batchSize, llamaConfig.max_seq_len}; + input = ff.create_tensor<2>(token_dims, DT_INT64); + } + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + Tensor token = ff.embedding(input, + llamaConfig.vocab_size, + llamaConfig.dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + Layer *embedding = ff.layers.back(); + weights_layers.emplace("tok_embeddings_weight", embedding); + + // std::cout << "------token shape"; + // std::cout << token->num_dims << "------\n"; + // for (int i = 0; i < token->num_dims; i++) { + // std::cout << token->dims[i] << "------\n"; + // } + + // n transformer blocks impl + for (int i = 0; i < 1; i++) { + // step 1: attention + std::vector axes = {2}; + Tensor att_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); + Layer *attention_norm = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + + "_attention_norm_weight", + attention_norm); + Tensor mha = + ff.inc_multihead_self_attention(att_norm, + llamaConfig.dim, + 
llamaConfig.n_heads, + llamaConfig.dim / llamaConfig.n_heads, + llamaConfig.dim / llamaConfig.n_heads, + 0.0f, + true, + false, + false, + NULL, + true); + Layer *attention_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", + attention_layer); + token = ff.add(token, mha); + + // step 2: SiLU activation + Tensor ff_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); + Layer *ffn_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_ffn_norm_weight", + ffn_layer); + + Tensor w1 = ff.dense(ff_norm, llamaConfig.hidden_dim, AC_MODE_NONE, false); + Layer *w1_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w1_weight", w1_layer); + + Tensor w3 = ff.dense(ff_norm, llamaConfig.hidden_dim, AC_MODE_NONE, false); + Layer *w3_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w3_weight", w3_layer); + + Tensor sigmoid = ff.sigmoid(w1); + Tensor silu = ff.multiply(w1, sigmoid); + Tensor multi = ff.multiply(silu, w3); + + Tensor w2 = ff.dense(multi, llamaConfig.dim, AC_MODE_NONE, false); + Layer *w2_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w2_weight", w2_layer); + token = ff.add(token, w2); + } + + // final normalization and linear + std::vector axes = {2}; + token = ff.rms_norm(token, 1e-6, 4096); + Layer *final_norm = ff.layers.back(); + weights_layers.emplace("norm_weight", final_norm); + Tensor dense = ff.dense(token, llamaConfig.vocab_size, AC_MODE_NONE, false); + Layer *final_linear = ff.layers.back(); + weights_layers.emplace("output_weight", final_linear); + Tensor output = ff.arg_top_k(dense, /*k=*/1, false); + + //------------------- compile the model -------------------------------- + std::cout << "------start compile ----------" << std::endl; + InferenceManager im(&ff, llamaConfig.batchSize, 1); + im.compile_model_and_allocate_buffer(); + + std::cout << "------init ops----------" << std::endl; + im.init_operators_inference(); + std::cout << "------model compiled and init ----------" << std::endl; + + //------------------------------ load inputs -------------------------- + std::cout << "------create dataloaders ----------" << std::endl; + // read prompt into input + ParallelTensor input_pt; + ff.get_parallel_tensor_from_tensor(input, input_pt); + assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); + std::cout << im.tensor_buffer[input_pt].size() << std::endl; + DataLoader loader(ff, &llamaConfig, im.tensor_buffer[input_pt].at(0)); + + //------------------------------ load weights--------------------------- + for (auto &v : weights_layers) { + Tensor weight = v.second->weights[0]; + std::cout << "weights layer: " << v.first << "\n"; + + if (weight == NULL) { + std::cout << "op no weights : " << v.first << "\n"; + continue; + } + + size_t volume = 1; + std::vector dims_vec; + for (int i = 0; i < weight->num_dims; i++) { + dims_vec.push_back(weight->dims[i]); + volume *= weight->dims[i]; + } + + assert(weight->data_type == DT_FLOAT); + float *data = (float *)malloc(sizeof(float) * volume); + + if (v.first.find("attention_w") != std::string::npos) { + loader.load_attention_weights( + data, volume, v.first, llamaConfig.weight_file_path); + + } else { + loader.load_from_file( + data, volume, llamaConfig.weight_file_path + v.first); + if (v.first.find("attention_norm") != std::string::npos) { + // std::cout << "norm weight data" 
<< std::endl; + // for (int i = 0; i < 100; i++) { + // std::cout << data[i] << ", "; + // } + } + } + + ParallelTensor weight_pt; + ff.get_parallel_tensor_from_tensor(weight, weight_pt); + weight_pt->set_tensor(&ff, dims_vec, data); + } + std::cout << "------load weight finished----------" << std::endl; + + //------------------------------ do inference--------------------------- + int processed_requests = 0; + std::map future_handlers; + std::map batch_configs; + BatchConfig *bc = nullptr; + std::map batch_predictions[1]; + loader.reset(); + + bool new_req = true; + + while (processed_requests < llamaConfig.sentence_len) { + int bid = 0; + size_t max_reqs, max_tkns; + if (future_handlers.find(bid) == future_handlers.end()) { + bc = new BatchConfig(); + } else { + // have already launched this bid + Future future = future_handlers[bid]; + if (!future.is_ready(true /*subscribe*/)) { + continue; + } else { + std::cout << "future is ready...." << std::endl; + } + // process end + InferenceResult ir = future.get_result(); + bc = batch_configs[bid]; + + std::cout << "store outputs start...." << std::endl; + loader.store_outputs(bc, ir, batch_predictions[bid]); + processed_requests += bc->update_results(ir); + + if (!new_req) { + break; + } + new_req = false; + } + // batch config registers 5 reqs + // init length relates to the min_prompt_size for llama + if (new_req) { + for (int i = 0; i < llamaConfig.batchSize; i++) { + assert(bc->register_new_request(i, llamaConfig.max_seq_len, 347)); + } + } + + bc->prepare_next_batch(); + std::cout << "new tokens: " << bc->num_active_tokens(); + loader.next_batch(ff, bc, batch_predictions[bid]); + + FutureMap fm = im.inference(bid, *bc); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h new file mode 100644 index 0000000000..17300f6d0d --- /dev/null +++ b/examples/cpp/inference/LLAMA/llama.h @@ -0,0 +1,107 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/model.h" +#define MAX_NUM_SAMPLES 65536 +#define MAX_TOKEN_LEN 32000 + +using namespace Legion; +using namespace FlexFlow; + +struct LLAMAConfig { + LLAMAConfig(void) { + // todo read from config/param file + n_layers = 32; + vocab_size = 32000; + n_heads = 32; + dim = 4096; + multiple_of = 256; + norm_eps = 1e-6; + total_sentence = 5; + sentence_len = 347; + max_gen_length = 256; + batchSize = 5; + total_requests = 2560; + incremental_mode = true; + sequence_length = MAX_SEQ_LEN; + max_seq_len = 8; + + // todo from args + weight_file_path = + "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/weights/"; + input_path = "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/tokens/" + "llama_demo_tokens"; + + // hidden dim + hidden_dim = 4 * dim; + hidden_dim = int(2 * hidden_dim / 3); + hidden_dim = + multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); + } + int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, + total_sentence, sentence_len, batchSize, total_requests, incremental_mode, + sequence_length, max_gen_length, max_seq_len; + float norm_eps; + std::string weight_file_path; + std::string input_path; +}; + +class DataLoader { +public: + DataLoader(FFModel &ff, + LLAMAConfig const *llamaconfig, + ParallelTensor const &input); + void next_batch(FFModel &ff, + BatchConfig *bc, + std::map &batch_predictions); + void reset(); + static void load_entire_dataset(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + static void load_input(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime); + + template + static void load_from_file(T *ptr, size_t size, std::string filename); + + template + static void load_attention_weights(T *ptr, + size_t size, + std::string layer_name, + std::string weight_path); + void store_outputs(BatchConfig *bc, + InferenceResult const &ir, + std::map &batch_predictions); + +public: + int num_samples, next_index, next_token_idx, next_batch_index; + std::map> outputs; + FlexFlow::ParallelTensor full_input, batch_input; +}; + +struct SampleIdxs { + int num_samples; + int idxs[MAX_NUM_SAMPLES]; + int token_idx; + int batch_idx; +}; + +struct DataLoaderNextBatchInput { + BatchConfig::SampleIdxs const &meta; + std::map const &prev_batch_preds; +}; diff --git a/examples/cpp/inference/dataloader.cu b/examples/cpp/inference/dataloader.cu index 71dc14db49..8dcb8c3ab7 100644 --- a/examples/cpp/inference/dataloader.cu +++ b/examples/cpp/inference/dataloader.cu @@ -105,7 +105,7 @@ void DataLoader::load_input(Task const *task, assert(prev_batch_preds.find(guid) != prev_batch_preds.end()); int token = prev_batch_preds.at(guid); int *dst_ptr = batch_input_ptr + dst_idx; - cudaMemcpy(dst_ptr, &token, 1, cudaMemcpyHostToDevice); + cudaMemcpy(dst_ptr, &token, sizeof(int), cudaMemcpyHostToDevice); // copy_kernel<<>>(dst_ptr, &token, tokens_to_copy); // cudaMemcpyAsync(batch_input_ptr + dst_idx * token_dim, &token, 1, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 6873ce5e43..40080c9840 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -597,6 +597,7 @@ class FFModel { bool add_bias_kv = false, bool add_zero_attn = false, Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, char const *name = NULL); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/ops/cast.h b/include/flexflow/ops/cast.h index 2d69b9469e..b1e078f60e 100644 --- a/include/flexflow/ops/cast.h +++ 
b/include/flexflow/ops/cast.h @@ -35,8 +35,17 @@ class Cast : public Op { Input const &input, char const *name = nullptr); void init(FFModel const &); + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &); void backward(FFModel const &); + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) { assert(0); } diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index 42ab25aaf8..e60084fc78 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -44,8 +44,17 @@ class ElementUnary : public Op { Input const x, char const *name = nullptr); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 716a2563cd..8daafd8565 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -11,6 +11,7 @@ #include "flexflow/ops/inc_multihead_self_attention_params.h" #include "math.h" #include +#include namespace FlexFlow { @@ -32,6 +33,7 @@ class IncMultiHeadSelfAttention : public Op { bool _bias, bool _add_bias_kv, bool _add_zero_attn, + bool _apply_rotary_embedding, bool allocate_weights, char const *name); IncMultiHeadSelfAttention(FFModel &model, @@ -45,6 +47,7 @@ class IncMultiHeadSelfAttention : public Op { bool _bias, bool _add_bias_kv, bool _add_zero_attn, + bool _apply_rotary_embedding, bool allocate_weights, char const *name); IncMultiHeadSelfAttention(FFModel &model, @@ -88,6 +91,7 @@ class IncMultiHeadSelfAttention : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, float const *input_ptr, @@ -99,7 +103,7 @@ class IncMultiHeadSelfAttention : public Op { int num_heads; float dropout; bool bias; - bool add_bias_kv, add_zero_attn; + bool add_bias_kv, add_zero_attn, apply_rotary_embedding; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; }; @@ -120,6 +124,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int num_heads; bool *has_load_weights; + bool *apply_rotary_embedding; #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index e7535dc23d..d263bc741a 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -10,7 +10,7 @@ struct IncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_heads, kdim, vdim; float dropout; - bool bias, add_bias_kv, add_zero_attn; + 
bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 4100baadcf..db18ebdd39 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -17,11 +17,19 @@ class RMSNorm : public Op { LayerID const &_layer_guid, const ParallelTensor _input, float _eps, + int dim, + bool allocate_weights, char const *name); RMSNorm(FFModel &model, RMSNormParams const ¶ms, ParallelTensor input, + bool allocate_weights, char const *name = nullptr); + + RMSNorm(FFModel &model, + RMSNorm const &other, + const ParallelTensor input, + bool allocate_weights); void init(FFModel const &); void forward(FFModel const &); void backward(FFModel const &); @@ -68,7 +76,7 @@ class RMSNorm : public Op { float eps; char op_name[MAX_OPNAME]; int effective_batch_size; - int data_dim; + int dim, data_dim; }; } // namespace FlexFlow #endif // _FLEXFLOW_RMS_NORM_H \ No newline at end of file diff --git a/include/flexflow/ops/rms_norm_params.h b/include/flexflow/ops/rms_norm_params.h index c5d71f71ce..82a459009a 100644 --- a/include/flexflow/ops/rms_norm_params.h +++ b/include/flexflow/ops/rms_norm_params.h @@ -8,7 +8,7 @@ namespace FlexFlow { struct RMSNormParams { LayerID layer_guid; float eps; - + int dim; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/split.h b/include/flexflow/ops/split.h index 633268ffbf..cd40d73e18 100644 --- a/include/flexflow/ops/split.h +++ b/include/flexflow/ops/split.h @@ -22,6 +22,15 @@ class Split : public Op { const Input input, char const *name = nullptr); void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; void print_layer(FFModel const &model) override { diff --git a/include/flexflow/substitution_loader.h b/include/flexflow/substitution_loader.h index 9f9db223f2..776fe2c78e 100644 --- a/include/flexflow/substitution_loader.h +++ b/include/flexflow/substitution_loader.h @@ -41,95 +41,97 @@ NLOHMANN_JSON_SERIALIZE_ENUM(PMParameter, {PM_PARALLEL_DEGREE, "PM_PARALLEL_DEGREE"}, {PM_PAD, "PM_PAD"}}) -NLOHMANN_JSON_SERIALIZE_ENUM(OperatorType, - {{OP_INVALID, nullptr}, - {OP_NOOP, "OP_NOOP"}, - {OP_CONV2D, "OP_CONV2D"}, - {OP_DROPOUT, "OP_DROPOUT"}, - {OP_LINEAR, "OP_LINEAR"}, - {OP_BATCHMATMUL, "OP_BATCHMATMUL"}, - {OP_POOL2D, "OP_POOL2D_MAX"}, - {OP_SCALAR_MULTIPLY, "OP_SCALAR_MULTIPLY"}, - {OP_SCALAR_ADD, "OP_SCALAR_ADD"}, - {OP_SCALAR_FLOOR_DIV, "OP_SCALAR_FLOOR_DIV"}, - {OP_SCALAR_TRUE_DIV, "OP_SCALAR_TRUE_DIV"}, - {OP_SCALAR_SUB, "OP_SCALAR_SUB"}, - {OP_RELU, "OP_RELU"}, - {OP_IDENTITY, "OP_IDENTITY"}, - {OP_SIGMOID, "OP_SIGMOID"}, - {OP_TANH, "OP_TANH"}, - {OP_ELU, "OP_ELU"}, - {OP_FLAT, "OP_FLAT"}, - {OP_SOFTMAX, "OP_SOFTMAX"}, - {OP_BATCHNORM, "OP_BATCHNORM"}, - {OP_CONCAT, "OP_CONCAT"}, - {OP_SPLIT, "OP_SPLIT"}, - {OP_EMBEDDING, "OP_EMBEDDING"}, - {OP_GROUP_BY, "OP_GROUP_BY"}, - {OP_CACHE, "OP_CACHE"}, - {OP_AGGREGATE, "OP_AGGREGATE"}, - {OP_AGG_SPEC, "OP_AGG_SPEC"}, - {OP_RESHAPE, "OP_RESHAPE"}, - {OP_REVERSE, "OP_REVERSE"}, - {OP_TRANSPOSE, "OP_TRANSPOSE"}, - {OP_EW_ADD, "OP_EW_ADD"}, - {OP_EW_MUL, "OP_EW_MUL"}, - {OP_MATMUL, 
"OP_MATMUL"}, - {OP_MUL, "OP_MUL"}, - {OP_ENLARGE, "OP_ENLARGE"}, - {OP_MERGE_GCONV, "OP_MERGE_GCONV"}, - {OP_CONSTANT_IMM, "OP_CONSTANT_IMM"}, - {OP_CONSTANT_ICONV, "OP_CONSTANT_ICONV"}, - {OP_CONSTANT_ONE, "OP_CONSTANT_ONE"}, - {OP_CONSTANT_POOL, "OP_CONSTANT_POOL"}, - {OP_SQUEEZE, "OP_SQUEEZE"}, - {OP_UNSQUEEZE, "OP_UNSQUEEZE"}, - {OP_EW_SUB, "OP_EW_SUB"}, - {OP_EW_DIV, "OP_EW_DIV"}, - {OP_EW_EQUAL, "OP_EW_EQUAL"}, - {OP_EW_GREATER, "OP_EW_GREATER"}, - {OP_EW_LESS, "OP_EW_LESS"}, - {OP_EW_MAX, "OP_EW_MAX"}, - {OP_EW_MIN, "OP_EW_MIN"}, - {OP_REDUCE_ARGMAX, "OP_REDUCE_ARGMAX"}, - {OP_REDUCE_ARGMIN, "OP_REDUCE_ARGMIN"}, - {OP_REDUCE_MAX, "OP_REDUCE_MAX"}, - {OP_REDUCE_MEAN, "OP_REDUCE_MEAN"}, - {OP_REDUCE_MIN, "OP_REDUCE_MIN"}, - {OP_REDUCE_PROD, "OP_REDUCE_PROD"}, - {OP_REDUCE_SUM, "OP_REDUCE_SUM"}, - {OP_PAD, "OP_PAD"}, - {OP_SHAPE, "OP_SHAPE"}, - {OP_SIZE, "OP_SIZE"}, - {OP_TOPK, "OP_TOPK"}, - {OP_WHERE, "OP_WHERE"}, - {OP_CEIL, "OP_CEIL"}, - {OP_CAST, "OP_CAST"}, - {OP_EXP, "OP_EXP"}, - {OP_ROUND, "OP_ROUND"}, - {OP_LOG, "OP_LOG"}, - {OP_LOGICAL_NOT, "OP_LOGICAL_NOT"}, - {OP_SQRT, "OP_SQRT"}, - {OP_SIN, "OP_SIN"}, - {OP_COS, "OP_COS"}, - {OP_LEAKYRELU, "OP_LEAKYRELU"}, - {OP_SLICE, "OP_SLICE"}, - {OP_RESIZE, "OP_RESIZE"}, - {OP_PRELU, "OP_PRELU"}, - {OP_GELU, "OP_GELU"}, - {OP_MULTIHEAD_ATTENTION, - "OP_MULTIHEAD_ATTENTION"}, - {OP_FUSED, "OP_FUSED"}, - {OP_RSQRT, "OP_RSQRT"}, - {OP_POW, "OP_POW"}, - {OP_MEAN, "OP_MEAN"}, - {OP_LAYERNORM, "OP_LAYERNORM"}, - {OP_REPARTITION, "OP_PARTITION"}, - {OP_COMBINE, "OP_COMBINE"}, - {OP_REPLICATE, "OP_REPLICATE"}, - {OP_REDUCTION, "OP_REDUCE"}, - {OP_PIPELINE, "OP_PIPELINE"}, - {OP_FUSED_PARALLEL, "OP_FUSED_PARALLEL"}}) +NLOHMANN_JSON_SERIALIZE_ENUM( + OperatorType, + {{OP_INVALID, nullptr}, + {OP_NOOP, "OP_NOOP"}, + {OP_CONV2D, "OP_CONV2D"}, + {OP_DROPOUT, "OP_DROPOUT"}, + {OP_LINEAR, "OP_LINEAR"}, + {OP_BATCHMATMUL, "OP_BATCHMATMUL"}, + {OP_POOL2D, "OP_POOL2D_MAX"}, + {OP_SCALAR_MULTIPLY, "OP_SCALAR_MULTIPLY"}, + {OP_SCALAR_ADD, "OP_SCALAR_ADD"}, + {OP_SCALAR_FLOOR_DIV, "OP_SCALAR_FLOOR_DIV"}, + {OP_SCALAR_TRUE_DIV, "OP_SCALAR_TRUE_DIV"}, + {OP_SCALAR_SUB, "OP_SCALAR_SUB"}, + {OP_RELU, "OP_RELU"}, + {OP_IDENTITY, "OP_IDENTITY"}, + {OP_SIGMOID, "OP_SIGMOID"}, + {OP_TANH, "OP_TANH"}, + {OP_ELU, "OP_ELU"}, + {OP_FLAT, "OP_FLAT"}, + {OP_SOFTMAX, "OP_SOFTMAX"}, + {OP_BATCHNORM, "OP_BATCHNORM"}, + {OP_CONCAT, "OP_CONCAT"}, + {OP_SPLIT, "OP_SPLIT"}, + {OP_EMBEDDING, "OP_EMBEDDING"}, + {OP_GROUP_BY, "OP_GROUP_BY"}, + {OP_CACHE, "OP_CACHE"}, + {OP_AGGREGATE, "OP_AGGREGATE"}, + {OP_AGG_SPEC, "OP_AGG_SPEC"}, + {OP_RESHAPE, "OP_RESHAPE"}, + {OP_REVERSE, "OP_REVERSE"}, + {OP_TRANSPOSE, "OP_TRANSPOSE"}, + {OP_EW_ADD, "OP_EW_ADD"}, + {OP_EW_MUL, "OP_EW_MUL"}, + {OP_MATMUL, "OP_MATMUL"}, + {OP_MUL, "OP_MUL"}, + {OP_ENLARGE, "OP_ENLARGE"}, + {OP_MERGE_GCONV, "OP_MERGE_GCONV"}, + {OP_CONSTANT_IMM, "OP_CONSTANT_IMM"}, + {OP_CONSTANT_ICONV, "OP_CONSTANT_ICONV"}, + {OP_CONSTANT_ONE, "OP_CONSTANT_ONE"}, + {OP_CONSTANT_POOL, "OP_CONSTANT_POOL"}, + {OP_SQUEEZE, "OP_SQUEEZE"}, + {OP_UNSQUEEZE, "OP_UNSQUEEZE"}, + {OP_EW_SUB, "OP_EW_SUB"}, + {OP_EW_DIV, "OP_EW_DIV"}, + {OP_EW_EQUAL, "OP_EW_EQUAL"}, + {OP_EW_GREATER, "OP_EW_GREATER"}, + {OP_EW_LESS, "OP_EW_LESS"}, + {OP_EW_MAX, "OP_EW_MAX"}, + {OP_EW_MIN, "OP_EW_MIN"}, + {OP_REDUCE_ARGMAX, "OP_REDUCE_ARGMAX"}, + {OP_REDUCE_ARGMIN, "OP_REDUCE_ARGMIN"}, + {OP_REDUCE_MAX, "OP_REDUCE_MAX"}, + {OP_REDUCE_MEAN, "OP_REDUCE_MEAN"}, + {OP_REDUCE_MIN, "OP_REDUCE_MIN"}, + {OP_REDUCE_PROD, "OP_REDUCE_PROD"}, + {OP_REDUCE_SUM, 
"OP_REDUCE_SUM"}, + {OP_PAD, "OP_PAD"}, + {OP_SHAPE, "OP_SHAPE"}, + {OP_SIZE, "OP_SIZE"}, + {OP_TOPK, "OP_TOPK"}, + {OP_WHERE, "OP_WHERE"}, + {OP_CEIL, "OP_CEIL"}, + {OP_CAST, "OP_CAST"}, + {OP_EXP, "OP_EXP"}, + {OP_ROUND, "OP_ROUND"}, + {OP_LOG, "OP_LOG"}, + {OP_LOGICAL_NOT, "OP_LOGICAL_NOT"}, + {OP_SQRT, "OP_SQRT"}, + {OP_SIN, "OP_SIN"}, + {OP_COS, "OP_COS"}, + {OP_LEAKYRELU, "OP_LEAKYRELU"}, + {OP_SLICE, "OP_SLICE"}, + {OP_RESIZE, "OP_RESIZE"}, + {OP_PRELU, "OP_PRELU"}, + {OP_GELU, "OP_GELU"}, + {OP_MULTIHEAD_ATTENTION, "OP_MULTIHEAD_ATTENTION"}, + {OP_INC_MULTIHEAD_SELF_ATTENTION, "OP_INC_MULTIHEAD_SELF_ATTENTION"}, + {OP_FUSED, "OP_FUSED"}, + {OP_RSQRT, "OP_RSQRT"}, + {OP_POW, "OP_POW"}, + {OP_MEAN, "OP_MEAN"}, + {OP_LAYERNORM, "OP_LAYERNORM"}, + {OP_RMS_NORM, "OP_RMS_NORM"}, + {OP_REPARTITION, "OP_PARTITION"}, + {OP_COMBINE, "OP_COMBINE"}, + {OP_REPLICATE, "OP_REPLICATE"}, + {OP_REDUCTION, "OP_REDUCE"}, + {OP_PIPELINE, "OP_PIPELINE"}, + {OP_FUSED_PARALLEL, "OP_FUSED_PARALLEL"}}) namespace FlexFlow { namespace substitution_loader { diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 4271919911..2ea7227879 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -82,6 +82,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_discrete(DT *dst, + const DT *src, + Legion::coord_t size, + size_t *index); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); @@ -133,6 +139,9 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +template +void save_tensor(T const *ptr, size_t num_elements, char const *file_name); + template T *download_tensor(T const *ptr, size_t num_elements); diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 6970832231..f78102c0fe 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -133,6 +133,12 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +template +T *download_tensor(T const *ptr, size_t num_elements); + +template +bool download_tensor(T const *ptr, T *dst, size_t num_elements); + miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, Legion::Domain domain); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 42339d781c..3c7ea11160 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -427,6 +427,13 @@ class MultiHeadAttention(Op): def __init__(self, handle, idx=None, name=None): super(MultiHeadAttention, self).__init__(handle, idx, name) +# ----------------------------------------------------------------------- +# Increamental MultiHeadAttention +# ----------------------------------------------------------------------- +class IncMultiHeadAttention(Op): + def __init__(self, handle, idx=None, name=None): + super(IncMultiHeadAttention, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # flexflow_op_t handle to Op # ----------------------------------------------------------------------- @@ -506,7 +513,9 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): elif op_type == OpType.REVERSE: 
return Reverse(handle, idx, name) elif op_type == OpType.MULTIHEAD_ATTENTION: - return Reverse(handle, idx, name) + return MultiHeadAttention(handle, idx, name) + elif op_type == OpType.INC_MULTIHEAD_ATTENTION: + return IncMultiHeadAttention(handle, idx, name) elif op_type == OpType.RSQRT: return Rsqrt(handle, idx, name) elif op_type == OpType.POW: @@ -1950,7 +1959,55 @@ def multihead_attention(self, query, key, value, handle = ffc.flexflow_model_add_multihead_attention(self.handle, query.handle, key.handle, value.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, c_name) self.add_layer(OpType.MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) + def inc_multihead_attention(self, input, + embed_dim, num_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + kernel_initializer=None, name=None): + """Defines the incremental MultiHead Self-Attention operation as described in Attention Is All You Need, + which takes in a single tensor :attr:`input` and computes the dot-product self-attention between its tokens, + decoding one token position at a time. + + :param input: the input Tensor. + :type input: Tensor + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + handle = ffc.flexflow_model_add_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, c_name) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) def reset_metrics(self): """Reset performance metrics. 
diff --git a/python/flexflow/type.py b/python/flexflow/type.py index c647f9fdb7..51e93f15e3 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -87,6 +87,7 @@ class OpType(Enum): MAX = 2053 MIN = 2054 MULTIHEAD_ATTENTION = 2060 + INC_MULTIHEAD_ATTENTION = 2061 GETITEM = 2070 GETATTR = 2080 EXPAND = 2081 diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index 74a5da6ce1..0535aeba85 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -997,6 +997,37 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( return FFCObjectWrapper::wrap(tensor); } +flexflow_tensor_t flexflow_model_add_inc_multihead_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + flexflow_initializer_t kernel_initializer_, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Initializer *kernel_initializer = + FFCObjectWrapper::unwrap(kernel_initializer_); + Tensor tensor = handle->inc_multihead_self_attention(input, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + kernel_initializer, + name); + return FFCObjectWrapper::wrap(tensor); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); diff --git a/python/flexflow_c.h b/python/flexflow_c.h index fb64c78fd2..26a184b2c2 100644 --- a/python/flexflow_c.h +++ b/python/flexflow_c.h @@ -369,6 +369,20 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_initializer_t kernel_initializer, char const *name); +flexflow_tensor_t flexflow_model_add_inc_multihead_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + flexflow_initializer_t kernel_initializer_, + char const *name); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); diff --git a/src/ops/cast.cc b/src/ops/cast.cc index 25f8e168b1..3adf85a435 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -146,6 +146,44 @@ void Cast::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void Cast::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + + IndexLauncher launcher(CAST_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Cast)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + OpMeta *Cast::init_task(Task const *task, std::vector const ®ions, Context ctx, @@ -186,6 +224,42 @@ void Cast::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Cast::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + + IndexLauncher launcher(CAST_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, false), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + template void Cast::forward_task_with_1_type(Task const *task, std::vector const ®ions, diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 60112bfdc9..6326feb7db 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -285,6 +285,56 @@ void ElementUnary::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void ElementUnary::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher init_launcher(ELEMENTUNARY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ElementUnary)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + if (!inplace) { + init_launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + init_launcher.add_field(0, FID_DATA); + init_launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + init_launcher.add_field(1, FID_DATA); + } else { + init_launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + init_launcher.add_field(0, FID_DATA); + } + FutureMap fm = runtime->execute_index_space(ctx, init_launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + OpMeta *ElementUnary::init_task(Task const *task, std::vector const ®ions, Context ctx, @@ -355,6 +405,56 @@ void ElementUnary::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap + ElementUnary::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + + IndexLauncher launcher(ELEMENTUNARY_FWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + if (inplace) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + } else { + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + void ElementUnary::forward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 06186d969f..832e3e3deb 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -381,7 +381,6 @@ void Embedding::init_inference(FFModel const &ff, MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(EMBED_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Embedding)), diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index a46ad1e6a6..8b2850a91c 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -15,8 +15,13 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/model.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif #include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" #ifdef INFERENCE_TESTS #include using namespace at::indexing; @@ -41,6 +46,7 @@ using Legion::Runtime; using Legion::Task; using Legion::TaskArgument; using Legion::TaskLauncher; +using PCG::Node; bool IncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { @@ -58,6 +64,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, bool add_bias_kv, bool add_zero_attn, Initializer *kernel_initializer, + bool apply_rotary_embedding, char const *name) { // Currently assume that Layer *li = new Layer(this, @@ -105,6 +112,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, li->add_int_property("add_bias_kv", add_bias_kv); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); + li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); layers.push_back(li); return li->outputs[0]; } @@ -130,6 +138,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( bool add_bias_kv = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + layer->get_int_property("apply_rotary_embedding", value); + bool apply_rotary_embedding = (bool)value; return new IncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], @@ -141,6 +151,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( bias, add_bias_kv, add_zero_attn, + apply_rotary_embedding, false /*allocate_weights*/, layer->name); } @@ -157,6 +168,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _bias, bool _add_bias_kv, bool _add_zero_attn, + bool _apply_rotary_embedding, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -170,6 +182,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input), num_heads(_num_heads), dropout(_dropout), bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -239,6 +252,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _bias, bool _add_bias_kv, bool _add_zero_attn, + bool _apply_rotary_embedding, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -253,6 +267,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _weight), num_heads(_num_heads), dropout(_dropout), bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -325,6 
+340,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.bias, other.add_bias_kv, other.add_zero_attn, + other.apply_rotary_embedding, allocate_weights, other.name) {} @@ -345,6 +361,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.bias, params.add_bias_kv, params.add_zero_attn, + params.apply_rotary_embedding, allocate_weights, name) {} @@ -487,7 +504,6 @@ FutureMap IncMultiHeadSelfAttention::inference( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); int idx = 0; - printf("BatchConfig, num_tokens: %d, num_requests: %d\n", bc.num_tokens, bc.num_requests); @@ -554,6 +570,7 @@ void IncMultiHeadSelfAttention::inference_task( assert(input_domain.get_dim() == 4); assert(weight_domain.get_dim() == 3); assert(output_domain.get_dim() == 4); + /* print_tensor(input.get_float_ptr(), input_domain.get_volume(), "[Attention:forward:query]"); */ @@ -1297,15 +1314,14 @@ bool IncMultiHeadSelfAttention::measure_operator_cost( return false; } -using PCG::Node; - bool operator==(IncMultiHeadSelfAttentionParams const &lhs, IncMultiHeadSelfAttentionParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && - lhs.add_zero_attn == rhs.add_zero_attn; + lhs.add_zero_attn == rhs.add_zero_attn && + lhs.apply_rotary_embedding == rhs.apply_rotary_embedding; } IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { @@ -1319,6 +1335,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.bias = this->bias; params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; + params.apply_rotary_embedding = this->apply_rotary_embedding; return params; } @@ -1337,6 +1354,7 @@ size_t hash::operator()( hash_combine(key, params.bias); hash_combine(key, params.add_bias_kv); hash_combine(key, params.add_zero_attn); + hash_combine(key, params.apply_rotary_embedding); return key; } }; // namespace std diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7c422a8a0e..8b53f047c0 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -12,7 +12,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" @@ -38,6 +40,65 @@ __global__ void build_w_out_tensor(float const *weight_ptr, } } +__global__ void apply_rotary_embedding(float *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::token_idxs const *id_map, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int q_block_size, + int k_block_size, + int v_block_size, + bool q_tensor) { + int proj_size = q_tensor ? qProjSize : kProjSize; + CUDA_KERNEL_LOOP(i, num_tokens * proj_size * num_heads / 2) { + // create complex number + int head_idx = i / (num_tokens * proj_size / 2); + int idx = i % (num_tokens * proj_size / 2); + int real_part_index = + idx * 2 + head_idx * (q_block_size + k_block_size + v_block_size) + + (q_tensor ? 
0 : q_block_size); + int complex_part_index = real_part_index + 1; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + // int head_idx = i / (num_tokens * proj_size); + int token_idx = + (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + size_t pos = id_map[token_idx].token_position; + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + + int pos_i = i % (proj_size / 2); + float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[real_part_index + 1] = complex_input[i].y; + + // if (i % 64 == 1 && head_idx == 0) { + // printf("head id: %d, tokenid: %d, pospospos:-> %d, before real part + // %f, " + // "before complex part: %f, real part: %f," + // "complext part: %f, freq_cis real: %f, freq_cis commplexx + // %f\n", head_idx, token_idx, pos, before_real, before_complex, + // complex_input[i].x, + // complex_input[i].y, + // complex_pos.x, + // complex_pos.y); + // } + } +} + void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, float const *input_ptr, @@ -47,7 +108,6 @@ void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t data_type = ff_to_cuda_datatype(DT_FLOAT); @@ -98,7 +158,7 @@ void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, m->num_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // K + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -146,6 +206,50 @@ void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, m->num_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // apply rotary emmmbedding for k and v + // step1 change the k, v to complex tensor + int num_tokens = bc->num_active_tokens(); + int parallelism = m->kProjSize * num_tokens * m->num_heads; + int q_block_size = m->qProjSize * num_tokens; + int k_block_size = m->kProjSize * num_tokens; + int v_block_size = m->vProjSize * num_tokens; + cuFloatComplex *complex_input; + if (*m->apply_rotary_embedding) { + checkCUDA(cudaMalloc(&complex_input, + num_tokens * m->qProjSize * m->num_heads * + sizeof(cuFloatComplex *) / 2)); + /*q*/ + apply_rotary_embedding<<>>(output_ptr, + complex_input, + m->dev_token2ids, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + true); + /*k*/ + apply_rotary_embedding<<>>(output_ptr, + complex_input, + m->dev_token2ids, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + false); + } } __global__ void store_kv_cache(float const *devQKVProjArray, @@ -171,7 +275,6 @@ __global__ void store_kv_cache(float const *devQKVProjArray, float val = devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + token_idx * proj_size + data_idx]; - int const req_id = id_map[token_idx].request_index; int const tok_id = id_map[token_idx].token_position; @@ -200,6 +303,7 @@ void inference_kernel2(IncMultiHeadSelfAttentionMeta 
const *m, m->num_heads, MAX_SEQ_LEN, /* k_cache = */ true); + parallelism = m->vProjSize * num_tokens * m->num_heads; store_kv_cache<<num_processing_tokens[i]; int total_tokens = bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; int n = total_tokens; @@ -273,6 +376,7 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, int strideA = qkv_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; + float alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; // To get A, skip over Q entries from previous requests (same head) void const *A = (void const *)(m->devQKVProjArray + @@ -284,39 +388,6 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, void *C = (void *)(m->qk_prods + m->num_heads * tokens_prev_requests_squares); - /*printf("\n------------ QK multiplication (CUDA) -------------\n"); - printf("req: %i, num_new_tokens: %i, total_tokens: %i, - tokens_previous_requests: %i, tokens_prev_requests_squares: %i\n", i, - num_new_tokens, total_tokens, tokens_previous_requests, - tokens_prev_requests_squares); printf("About to multiply the following - matrices (printing only first head):\n"); printf("A:\n"); float - *QKVProjArray_cpu = download_tensor(m->devQKVProjArray, - BatchConfig::MAX_NUM_TOKENS * (m->qProjSize + m->kProjSize + m->vProjSize) * - m->num_heads); assert(QKVProjArray_cpu != nullptr); float *keyCache_cpu = - download_tensor(m->keyCache, - m->num_heads * m->kProjSize * - BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); - assert(keyCache_cpu != nullptr); - for (int aaa=0; aaa < m->qProjSize; aaa++) { - for (int bbb=0; bbbqProjSize + aaa]); - } - printf("\n"); - } - printf("B:\n"); - for (int aaa=0; aaa < m->kProjSize; aaa++) { - for (int bbb=0; bbb < total_tokens; bbb++) { - printf("%f ", keyCache_cpu[i * kt_req_block_size + bbb*m->kProjSize + - aaa]); - } - printf("\n"); - } - checkCUDA(cudaFreeHost(QKVProjArray_cpu)); - checkCUDA(cudaFreeHost(keyCache_cpu)); - printf("------------------------------------------------------------\n"); - printf("CUDA alpha: %f", alpha);*/ - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -340,7 +411,6 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, m->num_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Fill all elements above diagonal in qk prods with -inf to force // causal attention. 
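+  // Entries filled with -inf above the diagonal receive zero weight after the
+  // softmax below, so each newly generated token attends only to cache
+  // positions at or before its own position.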
assert(num_new_tokens <= total_tokens); @@ -396,7 +466,6 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, &beta, qk_tensor, (void *)((float *)C_softmax))); - // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; m_ = num_new_tokens; @@ -440,7 +509,6 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, m->num_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; @@ -470,7 +538,6 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - tokens_previous_requests += num_new_tokens; tokens_prev_requests_squares += num_new_tokens * total_tokens; } @@ -494,6 +561,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // reload the weight_o if (!(*m->has_load_weights)) { @@ -511,7 +579,6 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( m->vSize * m->vProjSize)); *m->has_load_weights = true; } - // phase 1: Implement kernel to compute KQV for input tokens inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); @@ -565,12 +632,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(qProjSize == kProjSize); // required for attention QK^T matmul vProjSize = attn->vProjSize; oProjSize = attn->oProjSize; + num_heads = _num_heads; weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)); weightSize = weights_params * num_heads * sizeof(float); has_load_weights = (bool *)calloc(1, sizeof(bool)); *has_load_weights = false; + apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); + *apply_rotary_embedding = attn->apply_rotary_embedding; // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); @@ -638,6 +708,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( num_heads, (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); } + cudaStreamSynchronize(stream); } diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp new file mode 100644 index 0000000000..5de12b3f1f --- /dev/null +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -0,0 +1,59 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/kernels/rms_norm_kernels.h" +#include "flexflow/ops/rms_norm.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms) + : OpMeta(handler, rms) {} + +namespace Kernels { +namespace RMSNorm { + +void forward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + + handle_unimplemented_hip_kernel(OP_RMS_NORM); + + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + } +} + +} // namespace RMSNorm +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 5fa13d064a..f7945b316d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -74,7 +74,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < blockDim.x / C10_WARP_SIZE) ? shared[lid] : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -92,8 +92,18 @@ __global__ void sum += static_cast(X[index]) * static_cast(X[index]); } sum = BlockReduceSum(sum, v_shared); // use BlockReduceSum() to sum X_ij^2 + if (threadIdx.x == 0) { - rms[i] = sqrt((static_cast(N) / sum) + static_cast(eps)); + rms[i] = rsqrt((sum / static_cast(N)) + static_cast(eps)); + // printf("index: %d, rms norm mean value: %.15f, rms norm sum value: " + // "%.20f, eps: %f, value: %.20f, num:%d, num2: %d\n", + // i, + // sum / static_cast(N), + // sum, + // static_cast(eps), + // rms[i], + // blockDim.x, + // warpSize); } } @@ -107,13 +117,23 @@ __global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { } } +__global__ void elewise_apply_weights(int64_t batch_size, + int64_t in_dim, + float const *norm, + float const *weights, + float *output) { + CUDA_KERNEL_LOOP(i, batch_size * in_dim) { + output[i] = norm[i] * weights[i % in_dim]; + } +} + void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - + int parallelism = m->batch_size * m->in_dim; cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -124,31 +144,18 @@ void forward_kernel_wrapper(RMSNormMeta const *m, RowwiseRootMeanSquareKernel <<batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( m->in_dim, m->eps, input.get_float_ptr(), m->rms_ptr); + NormKernel<<batch_size, kCUDANumThreads, 0, stream>>>( m->in_dim, input.get_float_ptr(), m->rms_ptr, m->norm_ptr); - checkCUDA(cublasGemmEx( - m->handle.blas, - CUBLAS_OP_T, // transpose weight (column major) - CUBLAS_OP_N, - m->in_dim, - m->batch_size, - m->in_dim, - &(m->alpha), - weight.get_float_ptr(), // weight, shape (in_dim, in_dim) - CUDA_R_32F, - m->in_dim, - m->norm_ptr, // norm, shape (in_dim, batch_size) - CUDA_R_32F, - m->in_dim, - &(m->beta), - output - 
.get_float_ptr(), // output, shape (in_dim, batch_size), same as norm - CUDA_R_32F, - m->in_dim, - CUDA_R_32F, - CUBLAS_GEMM_DFALT_TENSOR_OP)); - + elewise_apply_weights<<>>(m->batch_size, + m->in_dim, + m->norm_ptr, + weight.get_float_ptr(), + output.get_float_ptr()); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -157,11 +164,9 @@ void forward_kernel_wrapper(RMSNormMeta const *m, cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); - print_tensor(input.get_float_ptr(), 32, "[RMSNorm:forward:input]"); - print_tensor(output.get_float_ptr(), 32, "[RMSNorm:forward:output]"); } } } // namespace RMSNorm } // namespace Kernels -} // namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 3d8daa4389..5e02160b7c 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -50,6 +50,7 @@ RMSNormParams RMSNorm::get_params() const { RMSNormParams params; params.layer_guid = this->layer_guid; params.eps = this->eps; + params.dim = this->dim; return params; } @@ -69,7 +70,7 @@ Tensor FFModel::rms_norm(const Tensor input, input->num_dims, input->dims, DT_FLOAT, rm, 0, true /*create_grad*/); // weights - int weight_dims[1] = {input->dims[input->num_dims - 1]}; + int weight_dims[1] = {dim}; rm->weights[0] = create_weight_legion_ordering(1, weight_dims, DT_FLOAT, @@ -77,7 +78,9 @@ Tensor FFModel::rms_norm(const Tensor input, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); + rm->add_float_property("eps", eps); + rm->add_int_property("dim", dim); layers.push_back(rm); return rm->outputs[0]; } @@ -88,19 +91,44 @@ Op *RMSNorm::create_operator_from_layer( std::vector const &inputs) { float eps; layer->get_float_property("eps", eps); - return new RMSNorm(model, layer->layer_guid, inputs[0], eps, layer->name); + long long value; + layer->get_int_property("dim", value); + int dim = value; + + return new RMSNorm( + model, layer->layer_guid, inputs[0], eps, dim, false, layer->name); } RMSNorm::RMSNorm(FFModel &model, RMSNormParams const ¶ms, ParallelTensor const input, + bool allocate_weights = false, char const *name) - : RMSNorm(model, params.layer_guid, input, params.eps, name) {} + : RMSNorm(model, + params.layer_guid, + input, + params.eps, + params.dim, + allocate_weights, + name) {} +RMSNorm::RMSNorm(FFModel &model, + RMSNorm const &other, + const ParallelTensor input, + bool allocate_weights) + : RMSNorm(model, + other.layer_guid, + input, + other.eps, + other.dim, + allocate_weights, + other.name) {} RMSNorm::RMSNorm(FFModel &model, LayerID const &_layer_guid, const ParallelTensor _input, float _eps, + int dim, + bool allocate_weights, char const *name) : Op(model, OP_RMS_NORM, @@ -110,10 +138,11 @@ RMSNorm::RMSNorm(FFModel &model, 1 /*num of weights tensor */, 1 /*onum of utputs tensor */, _input) { - + eps = _eps; inputs[0] = _input; - + layer_guid = _layer_guid; int num_dims = _input->num_dims; + this->dim = dim; data_dim = _input->dims[0].size; effective_batch_size = 1; for (int i = 1; i <= num_dims - 2; i++) { @@ -122,32 +151,32 @@ RMSNorm::RMSNorm(FFModel &model, // output has the same parallel dims as input ParallelDim output_dims[MAX_TENSOR_DIM]; - ParallelDim weight_dims[MAX_TENSOR_DIM]; for (int i = 0; i < _input->num_dims; i++) { output_dims[i] = _input->dims[i]; - weight_dims[i] = _input->dims[i]; - weight_dims[i].size = 1; } - - // weights should have the shape of (data_dim, data_dim) - weight_dims[0].size = 
_input->dims[0].size; - weight_dims[1].size = _input->dims[0].size; - outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, output_dims, _input->data_type, this); - // weights - Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); - - // TODO: weight dims check - weights[0] = - model.create_parallel_weight_legion_ordering(_input->num_dims, - weight_dims, - _input->data_type, - this /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); + if (allocate_weights) { + // weights should have the shape of (data_dim, data_dim) + ParallelDim new_weight_dims[MAX_TENSOR_DIM]; + + new_weight_dims[0] = _input->dims[_input->num_dims - 1]; + new_weight_dims[1].size = dim; + new_weight_dims[1].degree = 1; + new_weight_dims[1].parallel_idx = -1; + + // weights + Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); + weights[0] = + model.create_parallel_weight_legion_ordering(2, + new_weight_dims, + _input->data_type, + NULL /*owner_op*/, + false /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } } void RMSNorm::init(FFModel const &ff) { @@ -177,6 +206,13 @@ void RMSNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -215,6 +251,13 @@ void RMSNorm::init_inference(FFModel const &ff, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -319,17 +362,18 @@ void RMSNorm::forward_task(Task const *task, assert(regions.size() == 3); RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); forward_kernel_wrapper(m, input, weight, output); } void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->eps); + sez.serialize(this->dim); } using PCG::Node; @@ -341,13 +385,16 @@ Node RMSNorm::deserialize(FFModel &ff, assert(num_inputs == 1); float eps; size_t id; + int dim; dez.deserialize(id); + LayerID layer_guid(id); dez.deserialize(eps); - + dez.deserialize(dim); RMSNormParams params; params.layer_guid = layer_guid; params.eps = eps; + params.dim = dim; return ff.get_or_create_node(inputs[0], params); } @@ -372,6 +419,8 @@ size_t hash::operator()( FlexFlow::RMSNormParams const ¶ms) const { size_t 
key = 0; hash_combine(key, params.eps); + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.dim); return key; } }; // namespace std \ No newline at end of file diff --git a/src/ops/split.cc b/src/ops/split.cc index 4f60cb96f0..a9a5000f3d 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -170,6 +170,47 @@ void Split::init(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +void Split::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + + IndexLauncher launcher(SPLIT_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Split)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + for (int i = 0; i < numOutputs; i++) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); + launcher.add_field(i + 1, FID_DATA); + } + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + OpMeta *Split::init_task(Task const *task, std::vector const ®ions, Context ctx, @@ -205,6 +246,45 @@ void Split::forward(FFModel const &ff) { } runtime->execute_index_space(ctx, launcher); } +FutureMap Split::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + + IndexLauncher launcher(SPLIT_FWD_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Split)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + for (int i = 0; i < numOutputs; i++) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); + launcher.add_field(i + 1, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} void calc_block_size(coord_t &num_blks, coord_t &blk_size, diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index edd5b18e0f..7dc0adeb38 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -62,6 +62,14 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void + copy_kernel_discrete(DT *dst, const DT *src, coord_t size, size_t *index) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[index[i]]; + } +} + template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { CUDA_KERNEL_LOOP(i, n) { @@ -215,8 +223,8 @@ __host__ void int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); - if (idx >= 16) { + printf(" %.20lf", (float)host_ptr[idx]); + if (idx >= 50) { break; } } @@ -224,6 +232,29 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template +__host__ void + save_tensor(T const *ptr, size_t num_elements, char const *file_name) { + // device synchronize to make sure the data are ready + // checkCUDA(cudaDeviceSynchronize()); + T *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(T) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + // checkCUDA(cudaDeviceSynchronize()); + + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.20f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + template __host__ T *download_tensor(T const *ptr, size_t num_elements) { // device synchronize to make sure the data are ready @@ -398,6 +429,15 @@ template __global__ void template __global__ void copy_kernel(int64_t *dst, int64_t const *src, coord_t size); +template __global__ void copy_kernel_discrete(float *dst, + float const *src, + coord_t size, + size_t *index); +template __global__ void copy_kernel_discrete(int64_t *dst, + int64_t const *src, + coord_t size, + size_t *index); + template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, size_t size, @@ -424,6 +464,12 @@ template __host__ void template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void + save_tensor(float const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor(int64_t const *ptr, + size_t rect, + char const *file_name); + template __host__ float *download_tensor(float const *ptr, size_t num_elements); template __host__ double *download_tensor(double const *ptr, diff --git a/src/runtime/ffconst_utils.cc 
b/src/runtime/ffconst_utils.cc index 7c9a68f3b5..b02150d153 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -163,6 +163,8 @@ std::string get_operator_type_name(OperatorType type) { return "Mean"; case OP_LAYERNORM: return "LayerNorm"; + case OP_RMS_NORM: + return "RMSNorm"; case OP_IDENTITY: return "Identity"; // Parallel Ops diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 432467bbcf..866dcd3505 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2271,6 +2271,7 @@ GraphOptimalViewSerialized sez.serialize(attn->bias); sez.serialize(attn->add_bias_kv); sez.serialize(attn->add_zero_attn); + sez.serialize(attn->apply_rotary_embedding); break; } case OP_SOFTMAX: { @@ -2636,7 +2637,7 @@ void FFModel::deserialize_graph_optimal_view( assert(num_inputs == 1); int embed_dim, num_heads, k_dim, v_dim; float dropout; - bool bias, add_bias_kv, add_zero_attn; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; size_t id; dez.deserialize(id); LayerID layer_guid(id); @@ -2648,6 +2649,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(bias); dez.deserialize(add_bias_kv); dez.deserialize(add_zero_attn); + dez.deserialize(apply_rotary_embedding); IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2659,6 +2661,7 @@ void FFModel::deserialize_graph_optimal_view( params.add_bias_kv = add_bias_kv; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; + params.apply_rotary_embedding = apply_rotary_embedding; node = get_or_create_node(inputs[0], params); break; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 12a14d808d..915688f3c8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -49,6 +49,7 @@ #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" #include "flexflow/ops/reverse.h" +#include "flexflow/ops/rms_norm.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" @@ -1581,6 +1582,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], for (int i = 0; i < NDIM; i++) { p->dims[i] = dims[NDIM - 1 - i]; } + assert(p->get_volume() > 0); assert(p->check_valid()); return p; @@ -2824,6 +2826,11 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_RMS_NORM: { + Op *op = RMSNorm::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_LINEAR: { Op *op = Linear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3032,6 +3039,7 @@ void FFModel::compile(LossType loss_type, for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; + for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->owner_op != NULL); } @@ -3040,6 +3048,7 @@ void FFModel::compile(LossType loss_type, assert(op->weights[i]->region != LogicalRegion::NO_REGION); parameters.push_back(op->weights[i]); } + op->map_output_tensors(*this); // for (int i = 0; i < op->numOutputs; i++) { // // Output tensor @@ -4235,6 +4244,21 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "layernorm_fwd_task"); } + // rms norm task + { + TaskVariantRegistrar registrar(RMSNROM_INIT_TASK_ID, "rmsnorm_init_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "rmsnorm_init_task"); + } + { + TaskVariantRegistrar registrar(RMSNROM_FWD_TASK_ID, "rmsnorm_fwd_task"); + 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "rmsnorm_fwd_task"); + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 0cdb8be30b..df2f8b05b3 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -32,6 +32,7 @@ #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" +#include "flexflow/ops/rms_norm.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/split.h" #include "flexflow/parallel_ops/combine.h" @@ -3715,6 +3716,13 @@ bool FFModel::convert_graph_to_operators( break; break; } + case OP_RMS_NORM: { + assert(inList.size() == 1); + RMSNorm *rms = (RMSNorm *)node.ptr; + new_op = new RMSNorm(*this, *rms, inputs[0], true); + break; + break; + } case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; diff --git a/triton/src/model.cc b/triton/src/model.cc index a61b207bdd..6d5da30bea 100644 --- a/triton/src/model.cc +++ b/triton/src/model.cc @@ -22,20 +22,22 @@ using namespace Legion; -namespace triton { namespace backend { namespace legion { - -TRITONSERVER_Error* -LegionModelState::Create( - TRITONBACKEND_Model* triton_model, const std::string& name, - uint64_t version, LegionTritonRuntime* runtime, LegionModelState** state) -{ +namespace triton { +namespace backend { +namespace legion { + +TRITONSERVER_Error *LegionModelState::Create(TRITONBACKEND_Model *triton_model, + std::string const &name, + uint64_t version, + LegionTritonRuntime *runtime, + LegionModelState **state) { std::unique_ptr lstate; try { lstate.reset(new LegionModelState(triton_model, runtime, name, version)); - } - catch (const BackendModelException& ex) { + } catch (BackendModelException const &ex) { RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + ex.err_ == nullptr, + TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelException")); RETURN_IF_ERROR(ex.err_); } @@ -45,15 +47,15 @@ LegionModelState::Create( // Auto-complete the configuration if requested... 
bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( - triton_model, &auto_complete_config)); + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig(triton_model, + &auto_complete_config)); if (auto_complete_config) { RETURN_IF_ERROR(lstate->AutoCompleteConfig()); triton::common::TritonJson::WriteBuffer json_buffer; lstate->ModelConfig().Write(&json_buffer); - TRITONSERVER_Message* message; + TRITONSERVER_Message *message; RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( &message, json_buffer.Base(), json_buffer.Size())); RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( @@ -62,21 +64,21 @@ LegionModelState::Create( RETURN_IF_ERROR(lstate->ValidateModelConfig()); *state = lstate.release(); runtime->RecordModel(*state); - return nullptr; // success + return nullptr; // success } -LegionModelState::~LegionModelState(void) -{ +LegionModelState::~LegionModelState(void) { FreeLayers(); - for (auto& input : inputs_) delete input.second; - if (strategy_) + for (auto &input : inputs_) { + delete input.second; + } + if (strategy_) { delete strategy_; + } runtime_->RemoveModel(this); } -TRITONSERVER_Error* -LegionModelState::LoadModel() -{ +TRITONSERVER_Error *LegionModelState::LoadModel() { // TODO: load files based on the default / cc file name that may be set // in model config auto model_path = JoinPath({RepositoryPath(), std::to_string(Version())}); @@ -87,12 +89,16 @@ LegionModelState::LoadModel() // load the ONNX model description as a list of layers // with tensor dependences between then and put them in layers_ RETURN_IF_ERROR(OnnxParser::LoadModel( - [this]( - Realm::Processor::Kind kind) -> const std::vector& { + [this](Realm::Processor::Kind kind) + -> std::vector const & { return runtime_->FindLocalProcessors(kind); }, - this, strategy_, JoinPath({model_path, "model.onnx"}), &inputs_, - &outputs_, &layers_)); + this, + strategy_, + JoinPath({model_path, "model.onnx"}), + &inputs_, + &outputs_, + &layers_)); RETURN_IF_ERROR(SetOutputInfos()); // Should have the same number of layers in both cases @@ -107,18 +113,14 @@ LegionModelState::LoadModel() return nullptr; } -unsigned -LegionModelState::ReserveInstance(void) -{ +unsigned LegionModelState::ReserveInstance(void) { AutoLock lock(lock_); unsigned result = instances_.size(); instances_.resize(result + 1, nullptr); return result; } -void -LegionModelState::RecordInstance(LegionModelInstance* instance) -{ +void LegionModelState::RecordInstance(LegionModelInstance *instance) { assert(instance->model_state_ == this); AutoLock lock(lock_, false /*exclusive*/); assert(instance->index_ < instances_.size()); @@ -126,27 +128,30 @@ LegionModelState::RecordInstance(LegionModelInstance* instance) instances_[instance->index_] = instance; } -void -LegionModelState::initialize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ +void LegionModelState::initialize(LegionModelInstance *instance, + unsigned const instance_index, + Runtime *runtime, + Context ctx, + MapperID mapper) { // First create logical regions for all the input tensors - for (auto& input : inputs_) instance->create_tensor_region(input.second); + for (auto &input : inputs_) { + instance->create_tensor_region(input.second); + } - for (auto layer : layers_) + for (auto layer : layers_) { layer->initialize(instance, instance_index, runtime, ctx, mapper); + } } -void -LegionModelState::forward( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, 
Context ctx, MapperID mapper, - const std::vector& inputs, - const std::vector& outputs, - std::vector& compute_input_end_ns, - std::vector& compute_output_start_ns) -{ +void LegionModelState::forward(LegionModelInstance *instance, + unsigned const instance_index, + Runtime *runtime, + Context ctx, + MapperID mapper, + std::vector const &inputs, + std::vector const &outputs, + std::vector &compute_input_end_ns, + std::vector &compute_output_start_ns) { assert(inputs.size() == inputs_.size()); assert(outputs.size() == outputs_.size()); // Attach the external memory allocations to the logical regions for the @@ -154,34 +159,40 @@ LegionModelState::forward( const std::vector fields(1, FID_DATA); std::vector input_regions(inputs.size()); for (unsigned idx = 0; idx < inputs.size(); idx++) { - const InputTensor& input = inputs[idx]; + InputTensor const &input = inputs[idx]; assert(input.buffers_.size() == 1); assert(input.buffer_locations_.size() == 1); assert(input.buffer_memories_.size() == 1); assert(input.strides_.size() == inputs_[idx].second->bounds.size()); LogicalRegion region = inputs_[idx].second->region[instance_index]; - AttachLauncher launcher( - LEGION_EXTERNAL_INSTANCE, region, region, false /*restricted*/, - false /*mapped*/); - launcher.attach_array_soa( - const_cast(input.buffers_[0]), false /*not column major*/, - fields, input.buffer_memories_[0]); + AttachLauncher launcher(LEGION_EXTERNAL_INSTANCE, + region, + region, + false /*restricted*/, + false /*mapped*/); + launcher.attach_array_soa(const_cast(input.buffers_[0]), + false /*not column major*/, + fields, + input.buffer_memories_[0]); input_regions[idx] = runtime->attach_external_resource(ctx, launcher); } std::vector output_regions(outputs.size()); for (unsigned idx = 0; idx < outputs.size(); idx++) { - const OutputTensor& output = outputs[idx]; + OutputTensor const &output = outputs[idx]; assert(output.buffers_.size() == 1); assert(output.buffer_locations_.size() == 1); assert(output.buffer_memories_.size() == 1); assert(output.strides_.size() == outputs_[idx].second->bounds.size()); LogicalRegion region = outputs_[idx].second->region[instance_index]; - AttachLauncher launcher( - LEGION_EXTERNAL_INSTANCE, region, region, false /*restricted*/, - false /*mapped*/); - launcher.attach_array_soa( - output.buffers_[0], false /*not column major*/, fields, - output.buffer_memories_[0]); + AttachLauncher launcher(LEGION_EXTERNAL_INSTANCE, + region, + region, + false /*restricted*/, + false /*mapped*/); + launcher.attach_array_soa(output.buffers_[0], + false /*not column major*/, + fields, + output.buffer_memories_[0]); output_regions[idx] = runtime->attach_external_resource(ctx, launcher); } // Execution fence for timing operation @@ -191,45 +202,50 @@ LegionModelState::forward( // We can trace the execution of this model since it should be the same runtime->begin_trace(ctx, 0 /*only ever have one trace*/); - for (auto layer : layers_) + for (auto layer : layers_) { layer->forward(instance, instance_index, runtime, ctx, mapper); + } runtime->end_trace(ctx, 0 /*only ever have one trace*/); // Execution fence for timing operation runtime->issue_execution_fence(ctx); Future stop = runtime->issue_timing_measurement(ctx, timing_launcher); // Detach the external memory allocations - for (unsigned idx = 0; idx < input_regions.size(); idx++) + for (unsigned idx = 0; idx < input_regions.size(); idx++) { runtime->detach_external_resource(ctx, input_regions[idx], false /*flush*/); - for (unsigned idx = 0; idx < output_regions.size(); 
idx++) + } + for (unsigned idx = 0; idx < output_regions.size(); idx++) { runtime->detach_external_resource(ctx, output_regions[idx], true /*flush*/); + } const uint64_t start_time = start.get_result(); - for (unsigned idx = 0; idx < compute_input_end_ns.size(); idx++) + for (unsigned idx = 0; idx < compute_input_end_ns.size(); idx++) { compute_input_end_ns[idx] = start_time; + } const uint64_t stop_time = stop.get_result(); - for (unsigned idx = 0; idx < compute_output_start_ns.size(); idx++) + for (unsigned idx = 0; idx < compute_output_start_ns.size(); idx++) { compute_output_start_ns[idx] = stop_time; + } // Wait for everything to be done before we return Future done = runtime->issue_execution_fence(ctx); done.wait(); } -void -LegionModelState::finalize( - LegionModelInstance* instance, const unsigned instance_index, - Runtime* runtime, Context ctx, MapperID mapper) -{ - for (auto layer : layers_) +void LegionModelState::finalize(LegionModelInstance *instance, + unsigned const instance_index, + Runtime *runtime, + Context ctx, + MapperID mapper) { + for (auto layer : layers_) { layer->finalize(instance, instance_index, runtime, ctx, mapper); + } } -LegionModelInstance* -LegionModelState::FindInstance( - unsigned instance_index, bool external, bool need_lock) -{ +LegionModelInstance *LegionModelState::FindInstance(unsigned instance_index, + bool external, + bool need_lock) { if (need_lock) { if (external) { AutoLock lock(lock_, false /*exclusive*/); @@ -243,23 +259,17 @@ LegionModelState::FindInstance( return instances_[instance_index]; } -const PartitionStrategy* -LegionModelState::GetStrategy(void) const -{ +PartitionStrategy const *LegionModelState::GetStrategy(void) const { assert(strategy_ != nullptr); return strategy_; } -TRITONSERVER_Error* -LegionModelState::AutoCompleteConfig() -{ +TRITONSERVER_Error *LegionModelState::AutoCompleteConfig() { // FIXME: Check with the FFModel - return nullptr; // success + return nullptr; // success } -TRITONSERVER_Error* -LegionModelState::ValidateModelConfig() -{ +TRITONSERVER_Error *LegionModelState::ValidateModelConfig() { // Constraints that apply to models in general { triton::common::TritonJson::Value igs; @@ -295,8 +305,8 @@ LegionModelState::ValidateModelConfig() { // Build a map from name to tensors of the model for easy lookup - std::map tensors; - for (const auto& io : inputs_) { + std::map tensors; + for (auto const &io : inputs_) { tensors.emplace(io.first, io.second); } @@ -306,10 +316,10 @@ LegionModelState::ValidateModelConfig() if (ios.ArraySize() != tensors.size()) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "configuration for model '" + Name() + "' specifies " + - std::to_string(ios.ArraySize()) + " inputs, the model has " + - std::to_string(tensors.size())) + (std::string("configuration for model '" + Name() + "' specifies " + + std::to_string(ios.ArraySize()) + + " inputs, the model has " + + std::to_string(tensors.size())) .c_str())); } @@ -322,10 +332,11 @@ LegionModelState::ValidateModelConfig() // Check datatypes std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - RETURN_ERROR_IF_TRUE( - (io_dtype == "TYPE_STRING"), TRITONSERVER_ERROR_INVALID_ARG, - std::string("unsupported datatype '") + io_dtype + "' for tensor '" + - io_name + "' for model '" + Name() + "'"); + RETURN_ERROR_IF_TRUE((io_dtype == "TYPE_STRING"), + TRITONSERVER_ERROR_INVALID_ARG, + std::string("unsupported datatype '") + io_dtype + + "' for tensor '" + io_name + "' for model '" + + 
Name() + "'"); // If a reshape is provided for the input then use that when // validating that the model matches what is expected. std::vector dims; @@ -335,11 +346,12 @@ LegionModelState::ValidateModelConfig() } else { RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); } - for (const auto dim : dims) { + for (auto const dim : dims) { RETURN_ERROR_IF_TRUE( - (dim == WILDCARD_DIM), TRITONSERVER_ERROR_INVALID_ARG, - std::string( - "dynamic tensor is not supported for model '" + Name() + "'")); + (dim == WILDCARD_DIM), + TRITONSERVER_ERROR_INVALID_ARG, + std::string("dynamic tensor is not supported for model '" + Name() + + "'")); } // Check the properties against the corresponding tensor @@ -347,28 +359,26 @@ LegionModelState::ValidateModelConfig() if (it == tensors.end()) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "configuration for model '" + Name() + "' specifies tensor '" + - io_name + "' which is not found in the model") + (std::string("configuration for model '" + Name() + + "' specifies tensor '" + io_name + + "' which is not found in the model") .c_str())); } - const auto& tensor = it->second; + auto const &tensor = it->second; if (ToDataType(ModelConfigDataTypeToTritonServerDataType(io_dtype)) != tensor->type) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "configuration for model '" + Name() + "' specifies tensor '" + - io_name + "' with type '" + io_dtype + - "', the tensor in the model has type '" + - DataTypeString(tensor->type) + "'") + (std::string("configuration for model '" + Name() + + "' specifies tensor '" + io_name + "' with type '" + + io_dtype + "', the tensor in the model has type '" + + DataTypeString(tensor->type) + "'") .c_str())); } else if (tensor->type == DT_NONE) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "tensor '" + io_name + "' in the model '" + Name() + - "' has unknown type") + (std::string("tensor '" + io_name + "' in the model '" + Name() + + "' has unknown type") .c_str())); } if (max_batch_size_ != 0) { @@ -376,17 +386,17 @@ LegionModelState::ValidateModelConfig() } // put tensor's bound in int64_t to utilize backend common utilities std::vector tensor_bounds; - for (const auto bound : tensor->bounds) { + for (auto const bound : tensor->bounds) { tensor_bounds.emplace_back(bound); } if (dims != tensor_bounds) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "configuration for model '" + Name() + "' specifies tensor '" + - io_name + "' with full shape " + ShapeToString(dims) + - ", the tensor in the model has shape " + - ShapeToString(tensor_bounds)) + (std::string("configuration for model '" + Name() + + "' specifies tensor '" + io_name + + "' with full shape " + ShapeToString(dims) + + ", the tensor in the model has shape " + + ShapeToString(tensor_bounds)) .c_str())); } } @@ -395,8 +405,8 @@ LegionModelState::ValidateModelConfig() // Outputs { // Build a map from name to tensors of the model for easy lookup - std::map tensors; - for (const auto& io : outputs_) { + std::map tensors; + for (auto const &io : outputs_) { tensors.emplace(io.first, io.second); } @@ -407,10 +417,10 @@ LegionModelState::ValidateModelConfig() if (ios.ArraySize() > tensors.size()) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "configuration for model '" + Name() + "' specifies " + - std::to_string(ios.ArraySize()) + " outputs, the model has " + - std::to_string(tensors.size())) + 
(std::string("configuration for model '" + Name() + "' specifies " + + std::to_string(ios.ArraySize()) + + " outputs, the model has " + + std::to_string(tensors.size())) .c_str())); } @@ -422,10 +432,11 @@ LegionModelState::ValidateModelConfig() // Check datatypes std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - RETURN_ERROR_IF_TRUE( - (io_dtype == "TYPE_STRING"), TRITONSERVER_ERROR_INVALID_ARG, - std::string("unsupported datatype '") + io_dtype + "' for tensor '" + - io_name + "' for model '" + Name() + "'"); + RETURN_ERROR_IF_TRUE((io_dtype == "TYPE_STRING"), + TRITONSERVER_ERROR_INVALID_ARG, + std::string("unsupported datatype '") + io_dtype + + "' for tensor '" + io_name + "' for model '" + + Name() + "'"); // If a reshape is provided for the input then use that when // validating that the model matches what is expected. std::vector dims; @@ -435,11 +446,12 @@ LegionModelState::ValidateModelConfig() } else { RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); } - for (const auto dim : dims) { + for (auto const dim : dims) { RETURN_ERROR_IF_TRUE( - (dim == WILDCARD_DIM), TRITONSERVER_ERROR_INVALID_ARG, - std::string( - "dynamic tensor is not supported for model '" + Name() + "'")); + (dim == WILDCARD_DIM), + TRITONSERVER_ERROR_INVALID_ARG, + std::string("dynamic tensor is not supported for model '" + Name() + + "'")); } // Check the properties against the corresponding tensor @@ -447,28 +459,26 @@ LegionModelState::ValidateModelConfig() if (it == tensors.end()) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "configuration for model '" + Name() + "' specifies tensor '" + - io_name + "' which is not found in the model") + (std::string("configuration for model '" + Name() + + "' specifies tensor '" + io_name + + "' which is not found in the model") .c_str())); } - const auto& tensor = it->second; + auto const &tensor = it->second; if (ToDataType(ModelConfigDataTypeToTritonServerDataType(io_dtype)) != tensor->type) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "configuration for model '" + Name() + "' specifies tensor '" + - io_name + "' with type '" + io_dtype + - "', the tensor in the model has type '" + - DataTypeString(tensor->type) + "'") + (std::string("configuration for model '" + Name() + + "' specifies tensor '" + io_name + "' with type '" + + io_dtype + "', the tensor in the model has type '" + + DataTypeString(tensor->type) + "'") .c_str())); } else if (tensor->type == DT_NONE) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "tensor '" + io_name + "' in the model '" + Name() + - "' has unknown type") + (std::string("tensor '" + io_name + "' in the model '" + Name() + + "' has unknown type") .c_str())); } if (max_batch_size_ != 0) { @@ -476,80 +486,78 @@ LegionModelState::ValidateModelConfig() } // put tensor's bound in int64_t to utilize backend common utilities std::vector tensor_bounds; - for (const auto bound : tensor->bounds) { + for (auto const bound : tensor->bounds) { tensor_bounds.emplace_back(bound); } if (dims != tensor_bounds) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "configuration for model '" + Name() + "' specifies tensor '" + - io_name + "' with full shape " + ShapeToString(dims) + - ", the tensor in the model has shape " + - ShapeToString(tensor_bounds)) + (std::string("configuration for model '" + Name() + + "' specifies tensor '" + io_name + + "' with full shape " + ShapeToString(dims) + 
+ ", the tensor in the model has shape " + + ShapeToString(tensor_bounds)) .c_str())); } } } - return nullptr; // success + return nullptr; // success } -TRITONSERVER_Error* -LegionModelState::SetOutputInfos() -{ - for (const auto& output : outputs_) { +TRITONSERVER_Error *LegionModelState::SetOutputInfos() { + for (auto const &output : outputs_) { std::vector tensor_bounds; - for (const auto bound : output.second->bounds) { + for (auto const bound : output.second->bounds) { tensor_bounds.emplace_back(bound); } auto triton_dtype = ToTritonDataType(output.second->type); output_infos_.emplace_back(output.first, triton_dtype, tensor_bounds); } - return nullptr; // success + return nullptr; // success } -void -LegionModelState::LoadLayers(void) const -{ +void LegionModelState::LoadLayers(void) const { std::vector loaded_events; for (unsigned idx1 = 0; idx1 < layers_.size(); idx1++) { - Operator* op = layers_[idx1]; - const LayerStrategy* config = strategy_->layers[idx1]; + Operator *op = layers_[idx1]; + LayerStrategy const *config = strategy_->layers[idx1]; for (unsigned idx2 = 0; idx2 < config->nProcs; idx2++) { Realm::Processor proc = config->local_processors[idx2]; loaded_events.push_back(runtime_->LoadLayer(proc, op)); } } const Realm::Event wait_on = Realm::Event::merge_events(loaded_events); - if (wait_on.exists() && !wait_on.has_triggered()) + if (wait_on.exists() && !wait_on.has_triggered()) { wait_on.external_wait(); + } } -void -LegionModelState::FuseLayers(void) -{ +void LegionModelState::FuseLayers(void) { // FIXME: add support for layer fusion } -void -LegionModelState::FreeLayers(void) const -{ +void LegionModelState::FreeLayers(void) const { std::vector freed_events; for (unsigned idx1 = 0; idx1 < layers_.size(); idx1++) { - Operator* op = layers_[idx1]; - const LayerStrategy* config = strategy_->layers[idx1]; + Operator *op = layers_[idx1]; + LayerStrategy const *config = strategy_->layers[idx1]; for (unsigned idx2 = 0; idx2 < config->nProcs; idx2++) { Realm::Processor proc = config->local_processors[idx2]; freed_events.push_back(runtime_->FreeLayer(proc, op)); } } const Realm::Event wait_on = Realm::Event::merge_events(freed_events); - if (wait_on.exists() && !wait_on.has_triggered()) + if (wait_on.exists() && !wait_on.has_triggered()) { wait_on.external_wait(); + } // Delete layers back to front - for (std::vector::const_reverse_iterator it = layers_.rbegin(); - it != layers_.rend(); it++) + for (std::vector::const_reverse_iterator it = layers_.rbegin(); + it != layers_.rend(); + it++) { delete (*it); + } } -}}} // namespace triton::backend::legion +} // namespace legion +} // namespace backend +} // namespace triton diff --git a/triton/src/types.h b/triton/src/types.h index a034d5f685..b964f3455c 100644 --- a/triton/src/types.h +++ b/triton/src/types.h @@ -151,6 +151,7 @@ enum OperatorType { OP_PRELU, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#PRelu OP_GELU, OP_MULTIHEAD_ATTENTION, + OP_INC_MULTIHEAD_SELF_ATTENTION, OP_FUSED, // Fused operator type for internal fusion optimizations // Parallel Ops OP_REPARTITION, From 6f64c76c0c20f7c1554974450c42c63f099c62d4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 16 Apr 2023 21:26:25 -0400 Subject: [PATCH 096/344] [LayerNorm] - Add support for element-wise affine mode (#683) * impl elementwise affine * fix * fix2 --- examples/cpp/inference/inference_config.h | 2 +- .../inference/transformers/transformers.cc | 2 +- include/flexflow/batch_config.h | 2 +- src/ops/layer_norm.cc | 107 +++++++++++++++--- 4 
files changed, 93 insertions(+), 20 deletions(-) diff --git a/examples/cpp/inference/inference_config.h b/examples/cpp/inference/inference_config.h index 53811dd99f..8b393336cc 100644 --- a/examples/cpp/inference/inference_config.h +++ b/examples/cpp/inference/inference_config.h @@ -18,7 +18,7 @@ // #define MAX_SEQ_LEN 1024 #define MAX_SEQ_LEN 20 -#define BATCH_SIZE 32 +#define BATCH_SIZE 16 #define MNIST_DIMS 28 * 28 #define DATA_DIM MNIST_DIMS // #define DATA_DIM 3 diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 14414bb8f1..233b1dcaa1 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -39,7 +39,7 @@ Tensor create_inc_multihead_attention_decoder( FFModel *model, TransformerConfig const *transformerConfig, Tensor const &input) { - std::vector axes{2}; + std::vector axes{0}; Tensor t = transformerConfig->incremental_mode ? model->inc_multihead_self_attention( diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 05f6e062d6..17ed9d18e8 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,7 +20,7 @@ // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 #define MAX_SEQ_LEN 20 -#define BATCH_SIZE 32 +#define BATCH_SIZE 16 #define MAX_REQUESTS 256 namespace FlexFlow { diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index de511812bc..5103920413 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -62,9 +62,25 @@ Tensor FFModel::layer_norm(const Tensor input, bool elementwise_affine, float eps, char const *name) { - // FIXME: currently disable elementwise_affine - elementwise_affine = false; - // axes must be the last axes.size() dimensions + // In PyTorch, axes must be the sizes of the last axes.size() dimensions of + // the input tensor. However, since the tensor dimensions are reversed in + // FlexFlow (batch size is the last dimension), we require that axes must be + // the sizes of the FIRST axes.size() dimensions of the input tensor. + + // Another difference is that in PyTorch, the axes vector should contain the + // sizes of the dimensions with respect to which you want to compute the + // layernorm. In FlexFlow, instead, axes should contain the INDICES of the + // dimensions in question. We do this because the size of a dimension might be + // different when splitting a tensor in model parallelism. + assert( + axes.size() <= input->num_dims && + "number of axes must be less than tensor dimensions"); // input does not + // have replica + // dimension here + for (int i = 0; i < axes.size(); i++) { + assert(axes[i] == i && "axes must be the first axes.size() dimensions"); + } +#ifdef DEADCODE for (int i = 0; i < axes.size(); i++) { bool found = false; for (int j = 0; j < axes.size(); j++) { @@ -76,6 +92,7 @@ Tensor FFModel::layer_norm(const Tensor input, assert(false && "axes must be the last axes.size() dimensions"); } } +#endif int num_weights = elementwise_affine ? 
2 : 0; Layer *ln = new Layer(this, OP_LAYERNORM, @@ -92,19 +109,19 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights == 2) { - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= input->dims[input->num_dims - 1 - axes[i]]; + int numdims = axes.size(); + int dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[axes[i]]; } - int dims[1] = {M}; - ln->weights[0] = create_weight_legion_ordering(1, + ln->weights[0] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(1, + ln->weights[1] = create_weight_legion_ordering(numdims, dims, input->data_type, ln, @@ -179,19 +196,41 @@ LayerNorm::LayerNorm(FFModel &model, ParallelDim output_dims[MAX_TENSOR_DIM]; int M = 1; for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; + M *= inputs[0]->dims[axes[i]].size; } effective_num_elements = M; effective_batch_size = inputs[0]->get_volume() / M; + assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - int kernel_dims = 2; - assert(false); - // weights[0] = model.create_parallel_weight_legion_ordering( - // kernel_dims, - } else { - // do nothing + ParallelDim dims[axes.size()]; + for (int i = 0; i < axes.size(); i++) { + dims[i] = inputs[0]->dims[i]; + } + int seed = std::rand(); + Initializer *gamma_initializer = new UniformInitializer(seed, 1.0f, 1.0f); + Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 0.0f); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + gamma_initializer, + comm_type); + weights[1] = + model.create_parallel_weight_legion_ordering(axes.size(), + dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + comm_type); } - return; } void LayerNorm::init_inference(FFModel const &ff, @@ -226,6 +265,20 @@ void LayerNorm::init_inference(FFModel const &ff, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(1, FID_DATA); + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -258,6 +311,20 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(1, FID_DATA); + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -403,6 +470,12 @@ void LayerNorm::forward_task(Task const *task, 
regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim(); + for (int i = 0; i < numdims; i++) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + } } else { assert(regions.size() == 2); } From d48763a0fb36c970b2e95ac2623645288650f41d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 17 Apr 2023 18:16:16 -0400 Subject: [PATCH 097/344] [MoE-GPT] - Implement two-layer experts (#684) * first commit * removed unnecessary stuff * update pointers * more progress * bug fix * bug fix * hip_rocm bug fix --- include/flexflow/ops/experts.h | 22 +- include/flexflow/ops/experts_params.h | 50 ---- src/ops/experts.cc | 283 ++++++-------------- src/ops/experts.cpp | 6 +- src/ops/experts.cu | 358 ++++++++++++++++++++------ 5 files changed, 377 insertions(+), 342 deletions(-) diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index 7f110c79b6..c0a6c107aa 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -13,6 +13,8 @@ class ExpertsMeta : public OpMeta { int _experts_start_idx, int _data_dim, int _out_dim, + int _experts_num_layers, + int _experts_internal_dim_size, int _effective_batch_size, int _num_chosen_experts, float _alpha, @@ -33,21 +35,27 @@ class ExpertsMeta : public OpMeta { int *destination_start_indices; float const **token_idx_array; float const **dev_weights; - float const **weight_idx_array; + float const **weight_idx_array1; + float const **weight_idx_array2; float const **coefficient_idx_array; float **output_idx_array; - float const **bias_idx_array; + float const **bias_idx_array1; + float const **bias_idx_array2; float const *one_ptr; float const **one_ptr_array; // array of arrays to store cublasGemmBatchedEx outputs before aggregation - float **batch_outputs; - float **dev_batch_outputs; + float **batch_outputs1; + float **batch_outputs2; + float **dev_batch_outputs1; + float **dev_batch_outputs2; int num_experts; int experts_start_idx; int data_dim; int out_dim; + int experts_num_layers; + int experts_internal_dim_size; int effective_batch_size; int num_chosen_experts; int expert_capacity; @@ -55,11 +63,13 @@ class ExpertsMeta : public OpMeta { bool use_bias; ActiMode activation; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnTensorDescriptor_t resultTensorDesc; cudnnActivationDescriptor_t actiDesc; + cudnnTensorDescriptor_t resultTensorDesc1; + cudnnTensorDescriptor_t resultTensorDesc2; #else - miopenTensorDescriptor_t resultTensorDesc; miopenActivationDescriptor_t actiDesc; + miopenTensorDescriptor_t resultTensorDesc1; + miopenTensorDescriptor_t resultTensorDesc2; #endif }; diff --git a/include/flexflow/ops/experts_params.h b/include/flexflow/ops/experts_params.h index e5aa2f1ebb..b6ba88a96e 100644 --- a/include/flexflow/ops/experts_params.h +++ b/include/flexflow/ops/experts_params.h @@ -17,56 +17,6 @@ struct ExpertsParams { ActiMode activation; bool is_valid(std::vector const &) const; - void solve_dims(const ParallelTensor input, - ParallelDim output_dims[MAX_TENSOR_DIM], - int *output_ndims, - ParallelDim kernel_dims[MAX_TENSOR_DIM], - int *kernel_ndims, - ParallelDim bias_dims[MAX_TENSOR_DIM], - int *bias_ndims) const; - void solve_dims(ParallelTensorShape const &input_shape, - ParallelTensorShape &output_shape, - ParallelTensorShape &kernel_shape, - ParallelTensorShape 
&bias_shape) const; - void solve_dims(ParallelTensorShape const &input_shape, - ParallelDim output_dims[MAX_TENSOR_DIM], - int *output_ndims, - ParallelDim kernel_dims[MAX_TENSOR_DIM], - int *kernel_ndims, - ParallelDim bias_dims[MAX_TENSOR_DIM], - int *bias_ndims) const; - void construct_mappings(std::vector &, - ParallelTensorShape const &) const; - - enum NamedDimensions { - INPUT_CHANNEL, - INPUT_SAMPLE, - INPUT_REPLICA, - OUTPUT_CHANNEL, - OUTPUT_SAMPLE, - OUTPUT_REPLICA, - KERNEL_CHANNEL_IN, - KERNEL_CHANNEL_OUT, - KERNEL_NUM_EXPERTS, - BIAS_CHANNEL_OUT, - BIAS_NUM_EXPERTS, - }; - - std::unordered_map - get_dimension_names(ParallelTensorShape const &input_name) const; - -private: - void mark_replica_dims(ParallelTensorShape const &input_shape, - ParallelDim output_dims[MAX_TENSOR_DIM], - ParallelDim kernel_dims[MAX_TENSOR_DIM], - ParallelDim bias_dims[MAX_TENSOR_DIM]) const; - void calculate_nonreplica_dim_sizes(ParallelTensorShape const &input_shape, - ParallelDim output_dims[MAX_TENSOR_DIM], - int *output_ndims, - ParallelDim kernel_dims[MAX_TENSOR_DIM], - int *kernel_ndims, - ParallelDim bias_dims[MAX_TENSOR_DIM], - int *bias_ndims) const; }; bool operator==(ExpertsParams const &, ExpertsParams const &); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 6eaa3be943..5fa6404ff0 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -69,7 +69,8 @@ Tensor FFModel::experts(Tensor const *inputs, assert(inputs[1]->data_type == DT_INT32 || inputs[1]->data_type == DT_INT64); - assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); + assert(experts_num_layers >= 1); + assert(experts_num_layers <= 2 && "Multi-layer experts not implemented yet."); assert(experts_num_layers == 1 || experts_internal_dim_size > 0); // parameters for the FFN implementing the experts. We can make these @@ -96,12 +97,19 @@ Tensor FFModel::experts(Tensor const *inputs, assert(e->outputs[0] != nullptr); } { - int dims[3] = {inputs[0]->dims[0], experts_output_dim_size, num_experts}; + int nparams = (experts_num_layers == 1) + ? (inputs[0]->dims[0] * experts_output_dim_size) + : experts_internal_dim_size * + (inputs[0]->dims[0] + experts_output_dim_size); + int dims[2] = {nparams, num_experts}; e->weights[0] = create_weight_legion_ordering( - 3, dims, DT_FLOAT, e, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); + 2, dims, DT_FLOAT, e, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); } if (use_bias) { - int dims[2] = {experts_output_dim_size, num_experts}; + int nparams = (experts_num_layers == 1) + ? 
experts_output_dim_size + : (experts_internal_dim_size + experts_output_dim_size); + int dims[2] = {nparams, num_experts}; e->weights[1] = create_weight_legion_ordering( 2, dims, DT_FLOAT, e, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); } @@ -307,7 +315,8 @@ Experts::Experts(FFModel &model, assert(inputs[2]->dims[0].degree == 1); // check data type of indices input assert(inputs[1]->data_type == DT_INT32 || inputs[1]->data_type == DT_INT64); - assert(experts_num_layers == 1 && "Multi-layer experts not implemented yet."); + assert(experts_num_layers >= 1); + assert(experts_num_layers <= 2 && "Multi-layer experts not implemented yet."); assert(experts_num_layers == 1 || experts_internal_dim_size > 0); // save the token embedding dimension (data_dim) and the effective batch size @@ -330,14 +339,6 @@ Experts::Experts(FFModel &model, num_dims, out_dims, inputs[0]->data_type, this, 0 /*owner_idx*/); assert(outputs[0] != nullptr); - // auto dimension_names = - // this->get_params().get_dimension_names(inputs[0]->get_shape()); - ParallelTensorShape input_shape = inputs[0]->get_shape(); - ParallelTensorShape output_shape, kernel_shape, bias_shape; - ExpertsParams params = this->get_params(); - params.construct_mappings(*this->parallel_dims_mapping, input_shape); - params.solve_dims(input_shape, output_shape, kernel_shape, bias_shape); - if (allocate_weights) { #ifdef USE_NCCL ParameterSyncType comm_type = ParameterSyncType::NCCL; @@ -345,29 +346,52 @@ Experts::Experts(FFModel &model, ParameterSyncType comm_type = ParameterSyncType::PS; #endif { + ParallelDim dims[3]; + int nparams = (experts_num_layers == 1) + ? (data_dim * experts_output_dim_size) + : experts_internal_dim_size * + (data_dim + experts_output_dim_size); + dims[0].size = nparams; + dims[0].degree = 1; + dims[0].parallel_idx = -1; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = num_experts; + dims[2] = inputs[0]->dims[num_dims - 2]; + dims[2].size = dims[0].degree; Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); - assert(kernel_shape.dims[2].size == num_experts); - weights[0] = model.create_parallel_weight_legion_ordering( - kernel_shape.num_dims, // 3, - kernel_shape.dims, // dims, - DT_FLOAT, - NULL /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - comm_type); + // assert(kernel_shape.dims[2].size == num_experts); + weights[0] = + model.create_parallel_weight_legion_ordering(3, + dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + comm_type); assert(weights[0] != nullptr); } if (use_bias) { Initializer *bias_initializer = new ZeroInitializer(); - assert(bias_shape.dims[1].size == num_experts); - weights[1] = model.create_parallel_weight_legion_ordering( - bias_shape.num_dims, // 1, - bias_shape.dims, // dims, - DT_FLOAT, - NULL /*owner_op*/, - true /*create_grad*/, - bias_initializer, - comm_type); + // assert(bias_shape.dims[1].size == num_experts); + ParallelDim dims[3]; + int nparams = (experts_num_layers == 1) + ? 
experts_output_dim_size + : (experts_internal_dim_size + experts_output_dim_size); + dims[0].size = nparams; + dims[0].degree = 1; + dims[0].parallel_idx = -1; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = num_experts; + dims[2] = inputs[0]->dims[num_dims - 2]; + dims[2].size = dims[0].degree; + weights[1] = + model.create_parallel_weight_legion_ordering(3, + dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + bias_initializer, + comm_type); assert(weights[1] != nullptr); } } @@ -563,6 +587,8 @@ OpMeta *Experts::init_task(Task const *task, exp->experts_start_idx, exp->data_dim, exp->out_dim, + exp->experts_num_layers, + exp->experts_internal_dim_size, exp->effective_batch_size, exp->num_chosen_experts, exp->alpha, @@ -754,6 +780,8 @@ void Experts::inference_task(Task const *task, input_domain.hi()[samples_index] - input_domain.lo()[samples_index] + 1; coord_t chosen_experts = indices_domain.hi()[0] - indices_domain.lo()[0] + 1; coord_t out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + coord_t num_replicas = + input_domain.hi()[replica_dim] - input_domain.lo()[replica_dim] + 1; assert(data_dim == m->data_dim); assert(out_dim == m->out_dim); assert(chosen_experts == m->num_chosen_experts); @@ -790,10 +818,14 @@ void Experts::inference_task(Task const *task, Domain weights_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); int weights_dims = weights_domain.get_dim(); - assert(weights_dims == input_dims); - assert(weights_domain.hi()[0] - weights_domain.lo()[0] + 1 == data_dim); - assert(weights_domain.hi()[1] - weights_domain.lo()[1] + 1 == out_dim); - assert(weights_domain.hi()[2] - weights_domain.lo()[2] + 1 == num_experts); + assert(weights_dims == 3); + int nparams_weight = + (m->experts_num_layers == 1) + ? (data_dim * out_dim) + : m->experts_internal_dim_size * (data_dim + out_dim); + assert(weights_domain.hi()[0] - weights_domain.lo()[0] + 1 == nparams_weight); + assert(weights_domain.hi()[1] - weights_domain.lo()[1] + 1 == num_experts); + assert(weights_domain.hi()[2] - weights_domain.lo()[2] + 1 == num_replicas); float const *bias_ptr = nullptr; if (use_bias) { @@ -802,9 +834,13 @@ void Experts::inference_task(Task const *task, Domain bias_domain = runtime->get_index_space_domain( ctx, task->regions[5].region.get_index_space()); int bias_dims = bias_domain.get_dim(); - assert(bias_dims == 4); - assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == out_dim); + assert(bias_dims == 3); + int nparams_bias = (m->experts_num_layers == 1) + ? 
out_dim + : (m->experts_internal_dim_size + out_dim); + assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == nparams_bias); assert(bias_domain.hi()[1] - bias_domain.lo()[1] + 1 == num_experts); + assert(bias_domain.hi()[2] - bias_domain.lo()[2] + 1 == num_replicas); } #ifdef INFERENCE_TESTS @@ -1030,179 +1066,6 @@ bool Experts::measure_operator_cost(Simulator *sim, return false; } -void ExpertsParams::solve_dims(const ParallelTensor input, - ParallelDim output_dims[MAX_TENSOR_DIM], - int *output_ndims, - ParallelDim kernel_dims[MAX_TENSOR_DIM], - int *kernel_ndims, - ParallelDim bias_dims[MAX_TENSOR_DIM], - int *bias_ndims) const { - this->solve_dims(input->get_shape(), - output_dims, - output_ndims, - kernel_dims, - kernel_ndims, - bias_dims, - bias_ndims); -} - -void ExpertsParams::solve_dims(ParallelTensorShape const &input_shape, - ParallelTensorShape &output_shape, - ParallelTensorShape &kernel_shape, - ParallelTensorShape &bias_shape) const { - this->solve_dims(input_shape, - output_shape.dims, - &output_shape.num_dims, - kernel_shape.dims, - &kernel_shape.num_dims, - bias_shape.dims, - &bias_shape.num_dims); -} - -void ExpertsParams::solve_dims(ParallelTensorShape const &input_shape, - ParallelDim output_dims[MAX_TENSOR_DIM], - int *output_ndims, - ParallelDim kernel_dims[MAX_TENSOR_DIM], - int *kernel_ndims, - ParallelDim bias_dims[MAX_TENSOR_DIM], - int *bias_ndims) const { - assert((output_dims == nullptr) == (output_ndims == nullptr)); - assert((kernel_dims == nullptr) == (kernel_ndims == nullptr)); - assert((bias_dims == nullptr) == (bias_ndims == nullptr)); - - std::vector mapping; - this->construct_mappings(mapping, input_shape); - this->mark_replica_dims(input_shape, output_dims, kernel_dims, bias_dims); - - solve_parallel_dim_mappings( - mapping, {input_shape.dims}, {kernel_dims, bias_dims}, {output_dims}); - - this->calculate_nonreplica_dim_sizes(input_shape, - output_dims, - output_ndims, - kernel_dims, - kernel_ndims, - bias_dims, - bias_ndims); -} - -std::unordered_map - ExpertsParams::get_dimension_names( - ParallelTensorShape const &input_shape) const { - int num_dims = input_shape.num_dims; - - return {{INPUT_CHANNEL, 0}, - {INPUT_SAMPLE, num_dims - 2}, - {INPUT_REPLICA, num_dims - 1}, - {OUTPUT_CHANNEL, 0}, - {OUTPUT_SAMPLE, num_dims - 2}, - {OUTPUT_REPLICA, num_dims - 1}, - {KERNEL_CHANNEL_IN, 0}, - {KERNEL_CHANNEL_OUT, 1}, - {KERNEL_NUM_EXPERTS, 2}, - {BIAS_CHANNEL_OUT, 0}, - {BIAS_NUM_EXPERTS, 1}}; -} - -void ExpertsParams::calculate_nonreplica_dim_sizes( - ParallelTensorShape const &input_shape, - ParallelDim output_dims[MAX_TENSOR_DIM], - int *output_ndims, - ParallelDim kernel_dims[MAX_TENSOR_DIM], - int *kernel_ndims, - ParallelDim bias_dims[MAX_TENSOR_DIM], - int *bias_ndims) const { - auto dimension_names = this->get_dimension_names(input_shape); - int num_dims = input_shape.num_dims; - - if (output_dims != nullptr) { - for (int i = 1; i < input_shape.num_dims - 1; i++) { - output_dims[i].size = input_shape.dims[i].size; - } - output_dims[dimension_names.at(OUTPUT_CHANNEL)].size = - experts_output_dim_size; - *output_ndims = num_dims; - } - if (kernel_dims != nullptr) { - kernel_dims[dimension_names.at(KERNEL_CHANNEL_IN)].size = - input_shape.dims[INPUT_CHANNEL].size / - input_shape.dims[INPUT_CHANNEL].degree; - kernel_dims[dimension_names.at(KERNEL_CHANNEL_OUT)].size = - experts_output_dim_size; - kernel_dims[dimension_names.at(KERNEL_NUM_EXPERTS)].size = num_experts; - *kernel_ndims = num_dims; - } - if (bias_dims != nullptr) { - 
bias_dims[dimension_names.at(BIAS_CHANNEL_OUT)].size = - experts_output_dim_size; - bias_dims[dimension_names.at(BIAS_NUM_EXPERTS)].size = num_experts; - *bias_ndims = num_dims; - } -} - -void ExpertsParams::mark_replica_dims( - ParallelTensorShape const &input_shape, - ParallelDim output_dims[MAX_TENSOR_DIM], - ParallelDim kernel_dims[MAX_TENSOR_DIM], - ParallelDim bias_dims[MAX_TENSOR_DIM]) const { - int num_dims = input_shape.num_dims; - auto dimension_names = this->get_dimension_names(input_shape); - if (output_dims != nullptr) { - output_dims[dimension_names.at(OUTPUT_REPLICA)].is_replica_dim = true; - } - if (kernel_dims != nullptr) { - for (int i = 2; i < num_dims; i++) { - kernel_dims[i].is_replica_dim = true; - } - } - if (bias_dims != nullptr) { - for (int i = 1; i < num_dims; i++) { - bias_dims[i].is_replica_dim = true; - } - } -} - -void ExpertsParams::construct_mappings( - std::vector &mappings, - ParallelTensorShape const &input_shape) const { - std::unordered_map dimension_names = - this->get_dimension_names(input_shape); - - Op::construct_output_parallel_dims( - mappings, - {{dimension_names.at(INPUT_CHANNEL), dimension_names.at(OUTPUT_REPLICA)}, - {dimension_names.at(INPUT_REPLICA), - dimension_names.at(OUTPUT_CHANNEL)}}); - for (int i = 1; i < input_shape.num_dims - 1; i++) { - Op::construct_output_parallel_dims(mappings, i, i); - } - - Op::construct_weight_parallel_dims(mappings, - {{dimension_names.at(INPUT_CHANNEL), - dimension_names.at(KERNEL_CHANNEL_IN)}, - {dimension_names.at(INPUT_REPLICA), - dimension_names.at(KERNEL_CHANNEL_OUT)}}, - 0 /*input_idx*/, - KERNEL_IDX); - // map a bunch of replica dimensions for the unnamed dimensions in the input - for (int i = 1; i < input_shape.num_dims - 1; i++) { - Op::construct_weight_parallel_dims( - mappings, i, i + 1, 0 /*input_idx*/, KERNEL_IDX); - } - - Op::construct_weight_parallel_dims(mappings, - { - {dimension_names.at(INPUT_REPLICA), - dimension_names.at(BIAS_CHANNEL_OUT)}, - }, - 0 /*input_idx*/, - BIAS_IDX); - for (int i = 0; i < input_shape.num_dims - 1; i++) { - Op::construct_weight_parallel_dims( - mappings, i, i + 1, 0 /*input_idx*/, BIAS_IDX); - } -} - }; // namespace FlexFlow namespace std { diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index 88456ac66c..c06f02a647 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -40,6 +40,8 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, int _experts_start_idx, int _data_dim, int _out_dim, + int _experts_num_layers, + int _experts_internal_dim_size, int _effective_batch_size, int _num_chosen_experts, float _alpha, @@ -47,7 +49,9 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, ActiMode _activation) : OpMeta(handler), num_experts(_num_experts), experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), effective_batch_size(_effective_batch_size), + out_dim(_out_dim), experts_num_layers(_experts_num_layers), + experts_internal_dim_size(_experts_internal_dim_size), + effective_batch_size(_effective_batch_size), num_chosen_experts(_num_chosen_experts), alpha(_alpha), use_bias(_use_bias), activation(_activation) {} ExpertsMeta::~ExpertsMeta(void) {} diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 82f128fd1b..c5f79446a1 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -201,6 +201,8 @@ __global__ void experts_forward_prepare_kernel( int num_chosen_experts, int data_dim, int out_dim, + int experts_num_layers, + int experts_internal_dim_size, bool use_bias, int *sorted_indices, int *expert_start_indexes, @@ -209,13 +211,15 
@@ __global__ void experts_forward_prepare_kernel( int *original_indices, float const *input, // @In: Tokens' values (in_dim, batch_size) float *output, - float const **token_idx_array, // @Out: Barray for GemmBatchedEx - float const *weights, // @In: Experts' weights - float const *biases, // @In: Experts' biases - float const **weight_idx_array, // @Out: Aarray for GemmBatchedEx - float const **bias_idx_array, // @Out: Experts' bias - float const *coefficients, // @In: topk_gate_predss coefficients tensor - // (num_chosen_experts, batch_size) + float const **token_idx_array, // @Out: Barray for GemmBatchedEx + float const *weights, // @In: Experts' weights + float const *biases, // @In: Experts' biases + float const **weight_idx_array1, // @Out: Aarray for GemmBatchedEx + float const **weight_idx_array2, + float const **bias_idx_array1, // @Out: Experts' bias + float const **bias_idx_array2, + float const *coefficients, // @In: topk_gate_predss coefficients tensor + // (num_chosen_experts, batch_size) float const **coefficient_idx_array, // @Out: Barray for Aggregation float **output_idx_array) { @@ -226,19 +230,38 @@ __global__ void experts_forward_prepare_kernel( int local_expert_label = global_expert_label - experts_start_idx; int expert_index = exp_local_label_to_index[local_expert_label]; int within_expert_offset = i - expert_start_indexes[expert_index]; + int weight_params_count = + experts_num_layers == 1 + ? data_dim * out_dim + : experts_internal_dim_size * (data_dim + out_dim); if (within_expert_offset < expert_capacity) { int rev_idx = original_indices[i + lb_index]; int token_idx = (rev_idx / num_chosen_experts); token_idx_array[destination_start_indices[expert_index] + within_expert_offset] = &input[token_idx * data_dim]; - weight_idx_array[destination_start_indices[expert_index] + - within_expert_offset] = - &weights[local_expert_label * data_dim * out_dim]; + weight_idx_array1[destination_start_indices[expert_index] + + within_expert_offset] = + &weights[local_expert_label * weight_params_count]; + if (experts_num_layers == 2) { + weight_idx_array2[destination_start_indices[expert_index] + + within_expert_offset] = + &weights[local_expert_label * weight_params_count + + (data_dim * experts_internal_dim_size)]; + } if (use_bias) { - bias_idx_array[destination_start_indices[expert_index] + - within_expert_offset] = - &biases[local_expert_label * out_dim]; + int bias_params_count = (experts_num_layers == 1) + ? 
out_dim + : (experts_internal_dim_size + out_dim); + bias_idx_array1[destination_start_indices[expert_index] + + within_expert_offset] = + &biases[local_expert_label * bias_params_count]; + if (experts_num_layers == 2) { + bias_idx_array2[destination_start_indices[expert_index] + + within_expert_offset] = + &biases[local_expert_label * bias_params_count + + experts_internal_dim_size]; + } } coefficient_idx_array[destination_start_indices[expert_index] + within_expert_offset] = &coefficients[rev_idx]; @@ -264,13 +287,18 @@ bool use_activation(ActiMode mode) { } void experts_forward_GemmBatched_kernel(ExpertsMeta const *m, - void const **weights_ptr, + void const **weights_ptr1, + void const **weights_ptr2, void const **input_ptr, - void **results_ptr, - void const **bias_ptr, + void **results_ptr1, + void **results_ptr2, + void const **bias_ptr1, + void const **bias_ptr2, ActiMode activation, int in_dim, int out_dim, + int experts_num_layers, + int experts_internal_dim_size, int num_tokens, int num_chosen_experts, int gemm_batch_count, @@ -290,63 +318,169 @@ void experts_forward_GemmBatched_kernel(ExpertsMeta const *m, cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + int m_ = out_dim; + int n = 1; + int k = in_dim; + void const **A = weights_ptr1; + void const **B = input_ptr; + void **C = results_ptr1; + int lda = in_dim; + int ldb = in_dim; + int ldc = out_dim; + if (experts_num_layers == 2) { + m_ = ldc = experts_internal_dim_size; + } checkCUDA(cublasGemmBatchedEx( m->handle.blas, CUBLAS_OP_T, // Tranpose Weight, shape (in_dim, out_dim) => (out_dim, // in_dim) CUBLAS_OP_N, // Input_token, shape (in_dim, 1) - out_dim, // num_row of (A, C) = out_dim - 1, // num_col of (B, C) = 1 - in_dim, // num_col of A and num_rows of B = in_dim + m_, // num_row of (A, C) = out_dim + n, // num_col of (B, C) = 1 + k, // num_col of A and num_rows of B = in_dim &alpha, - weights_ptr, // Aarray (num_tokens * chosen_experts, in_dim, out_dim) + A, // Aarray (num_tokens * chosen_experts, in_dim, out_dim) weight_type, - in_dim, // Leading Dimension of weight before transpose - input_ptr, // Barray (num_tokens * chosen_experts, in_dim, 1) + lda, // Leading Dimension of weight before transpose + B, // Barray (num_tokens * chosen_experts, in_dim, 1) input_type, - in_dim, // Leading Dimension of input_token + ldb, // Leading Dimension of input_token &beta, - results_ptr, // Carray (num_tokens * chosen_experts, out_dim, 1) + C, // Carray (num_tokens * chosen_experts, out_dim, 1) output_type, - out_dim, // Leading Dimension of output + ldc, // Leading Dimension of output gemm_batch_count, // Total submatrixes compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // TODO 2: bias and activations if (m->use_bias) { + m_ = out_dim; + n = 1; + k = 1; + A = bias_ptr1; + B = (void const **)m->one_ptr_array; + C = results_ptr1; + lda = out_dim; + ldb = 1; + ldc = out_dim; + if (experts_num_layers == 2) { + m_ = lda = ldc = experts_internal_dim_size; + } + alpha = 1.0f, beta = 0.0f; checkCUDA(cublasGemmBatchedEx( m->handle.blas, CUBLAS_OP_N, // Bias, shape (out_dim, 1) CUBLAS_OP_N, // Coefficient, shape (1, 1) - out_dim, // num_row of (A, C) = out_dim - 1, // num_col of (B, C) = 1 - 1, // num_col of A and num_rows of B = 1 + m_, // num_row of (A, C) = out_dim + n, // num_col of (B, C) = 1 + k, // num_col of A and num_rows of B = 1 &alpha, - bias_ptr, // bias tensor (out_dim, 1) + A, // bias tensor (out_dim, 1) weight_type, - out_dim, // Leading Dimension of bias tensor - (void const **)m->one_ptr_array, // all-one tensor 
(1, 1) + lda, // Leading Dimension of bias tensor + B, // all-one tensor (1, 1) CUDA_R_32F, - 1, // Leading Dimension of all-one tensor + ldb, // Leading Dimension of all-one tensor &alpha, - results_ptr, // Carray (num_tokens * chosen_experts, out_dim, 1) + C, // Carray (num_tokens * chosen_experts, out_dim, 1) output_type, - out_dim, // Leading Dimension of output + ldc, // Leading Dimension of output gemm_batch_count, // Total submatrixs compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } if (use_activation(activation)) { + alpha = 1.0f, beta = 0.0f; checkCUDNN(cudnnActivationForward(m->handle.dnn, m->actiDesc, &alpha, - m->resultTensorDesc, - m->batch_outputs[0], + m->resultTensorDesc1, + m->batch_outputs1[0], &beta, - m->resultTensorDesc, - m->batch_outputs[0])); + m->resultTensorDesc1, + m->batch_outputs1[0])); + } + + if (experts_num_layers == 2) { + m_ = out_dim; + n = 1; + k = experts_internal_dim_size; + A = weights_ptr2; + B = (void const **)results_ptr1; + C = results_ptr2; + lda = experts_internal_dim_size; + ldb = experts_internal_dim_size; + ldc = out_dim; + alpha = 1.0f, beta = 0.0f; + checkCUDA(cublasGemmBatchedEx( + m->handle.blas, + CUBLAS_OP_T, // Tranpose Weight, shape (in_dim, out_dim) => (out_dim, + // in_dim) + CUBLAS_OP_N, // Input_token, shape (in_dim, 1) + m_, // num_row of (A, C) = out_dim + n, // num_col of (B, C) = 1 + k, // num_col of A and num_rows of B = in_dim + &alpha, + A, // Aarray (num_tokens * chosen_experts, in_dim, out_dim) + weight_type, + lda, // Leading Dimension of weight before transpose + B, // Barray (num_tokens * chosen_experts, in_dim, 1) + input_type, + ldb, // Leading Dimension of input_token + &beta, + C, // Carray (num_tokens * chosen_experts, out_dim, 1) + output_type, + ldc, // Leading Dimension of output + gemm_batch_count, // Total submatrixes + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + if (m->use_bias) { + m_ = out_dim; + n = 1; + k = 1; + A = bias_ptr2; + B = (void const **)m->one_ptr_array; + C = results_ptr2; + lda = out_dim; + ldb = 1; + ldc = out_dim; + alpha = 1.0f, beta = 0.0f; + checkCUDA(cublasGemmBatchedEx( + m->handle.blas, + CUBLAS_OP_N, // Bias, shape (out_dim, 1) + CUBLAS_OP_N, // Coefficient, shape (1, 1) + m_, // num_row of (A, C) = out_dim + n, // num_col of (B, C) = 1 + k, // num_col of A and num_rows of B = 1 + &alpha, + A, // bias tensor (out_dim, 1) + weight_type, + lda, // Leading Dimension of bias tensor + B, // all-one tensor (1, 1) + CUDA_R_32F, + ldb, // Leading Dimension of all-one tensor + &alpha, + C, // Carray (num_tokens * chosen_experts, out_dim, 1) + output_type, + ldc, // Leading Dimension of output + gemm_batch_count, // Total submatrixs + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + if (use_activation(activation)) { + alpha = 1.0f, beta = 0.0f; + checkCUDNN(cudnnActivationForward(m->handle.dnn, + m->actiDesc, + &alpha, + m->resultTensorDesc2, + m->batch_outputs2[0], + &beta, + m->resultTensorDesc2, + m->batch_outputs2[0])); + } } } @@ -713,6 +847,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, num_chosen_experts, data_dim, out_dim, + m->experts_num_layers, + m->experts_internal_dim_size, use_bias, m->sorted_indices, m->expert_start_indexes, @@ -724,8 +860,10 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, m->token_idx_array, weights, biases, - m->weight_idx_array, - m->bias_idx_array, + m->weight_idx_array1, + m->weight_idx_array2, + m->bias_idx_array1, + m->bias_idx_array2, topk_gate_preds, m->coefficient_idx_array, m->output_idx_array); @@ 
-1049,13 +1187,18 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, #endif experts_forward_GemmBatched_kernel(m, - (void const **)m->weight_idx_array, + (void const **)m->weight_idx_array1, + (void const **)m->weight_idx_array2, (void const **)m->token_idx_array, - (void **)m->dev_batch_outputs, - (void const **)m->bias_idx_array, + (void **)m->dev_batch_outputs1, + (void **)m->dev_batch_outputs2, + (void const **)m->bias_idx_array1, + (void const **)m->bias_idx_array2, activation, data_dim, out_dim, + m->experts_num_layers, + m->experts_internal_dim_size, num_tokens, num_chosen_experts, gemm_batch_count, @@ -1073,7 +1216,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, gemm_batch_count, out_dim, output, - m->dev_batch_outputs, + m->experts_num_layers == 1 + ? m->dev_batch_outputs1 + : m->dev_batch_outputs2, m->coefficient_idx_array, m->output_idx_array); @@ -1093,6 +1238,8 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, int _experts_start_idx, int _data_dim, int _out_dim, + int _experts_num_layers, + int _experts_internal_dim_size, int _effective_batch_size, int _num_chosen_experts, float _alpha, @@ -1100,7 +1247,9 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, ActiMode _activation) : OpMeta(handler), num_experts(_num_experts), experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), effective_batch_size(_effective_batch_size), + out_dim(_out_dim), experts_num_layers(_experts_num_layers), + experts_internal_dim_size(_experts_internal_dim_size), + effective_batch_size(_effective_batch_size), num_chosen_experts(_num_chosen_experts), alpha(_alpha), use_bias(_use_bias), activation(_activation) { expert_capacity = @@ -1131,10 +1280,10 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, cudaMalloc(&token_idx_array, num_chosen_experts * effective_batch_size * sizeof(float *))); checkCUDA( - cudaMalloc(&weight_idx_array, + cudaMalloc(&weight_idx_array1, num_chosen_experts * effective_batch_size * sizeof(float *))); checkCUDA( - cudaMalloc(&bias_idx_array, + cudaMalloc(&bias_idx_array1, num_chosen_experts * effective_batch_size * sizeof(float *))); checkCUDA( cudaMalloc(&coefficient_idx_array, @@ -1142,25 +1291,54 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, checkCUDA( cudaMalloc(&output_idx_array, num_chosen_experts * effective_batch_size * sizeof(float *))); - batch_outputs = new float *[num_chosen_experts * effective_batch_size]; - checkCUDA(cudaMalloc(&batch_outputs[0], - out_dim * num_chosen_experts * effective_batch_size * - sizeof(float))); - checkCUDA(cudaMemset(batch_outputs[0], + batch_outputs1 = new float *[num_chosen_experts * effective_batch_size]; + int batch_outputs1_dim = + (experts_num_layers == 1) ? 
out_dim : experts_internal_dim_size; + checkCUDA(cudaMalloc(&batch_outputs1[0], + batch_outputs1_dim * num_chosen_experts * + effective_batch_size * sizeof(float))); + checkCUDA(cudaMemset(batch_outputs1[0], 0, - out_dim * num_chosen_experts * effective_batch_size * - sizeof(float))); + batch_outputs1_dim * num_chosen_experts * + effective_batch_size * sizeof(float))); for (int i = 1; i < num_chosen_experts * effective_batch_size; i++) { - batch_outputs[i] = batch_outputs[i - 1] + out_dim; + batch_outputs1[i] = batch_outputs1[i - 1] + batch_outputs1_dim; } checkCUDA( - cudaMalloc(&dev_batch_outputs, + cudaMalloc(&dev_batch_outputs1, num_chosen_experts * effective_batch_size * sizeof(float *))); checkCUDA( - cudaMemcpy(dev_batch_outputs, - batch_outputs, + cudaMemcpy(dev_batch_outputs1, + batch_outputs1, num_chosen_experts * effective_batch_size * sizeof(float *), cudaMemcpyHostToDevice)); + if (experts_num_layers == 2) { + checkCUDA(cudaMalloc(&weight_idx_array2, + num_chosen_experts * effective_batch_size * + sizeof(float *))); + checkCUDA(cudaMalloc(&bias_idx_array2, + num_chosen_experts * effective_batch_size * + sizeof(float *))); + batch_outputs2 = new float *[num_chosen_experts * effective_batch_size]; + checkCUDA(cudaMalloc(&batch_outputs2[0], + out_dim * num_chosen_experts * effective_batch_size * + sizeof(float))); + checkCUDA(cudaMemset(batch_outputs2[0], + 0, + out_dim * num_chosen_experts * effective_batch_size * + sizeof(float))); + for (int i = 1; i < num_chosen_experts * effective_batch_size; i++) { + batch_outputs2[i] = batch_outputs2[i - 1] + out_dim; + } + checkCUDA(cudaMalloc(&dev_batch_outputs2, + num_chosen_experts * effective_batch_size * + sizeof(float *))); + checkCUDA( + cudaMemcpy(dev_batch_outputs2, + batch_outputs2, + num_chosen_experts * effective_batch_size * sizeof(float *), + cudaMemcpyHostToDevice)); + } // Bias float *dram_one_ptr = (float *)malloc(sizeof(float) * 1); for (int i = 0; i < 1; i++) { @@ -1183,7 +1361,10 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, } // Activation checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); - checkCUDNN(cudnnCreateTensorDescriptor(&resultTensorDesc)); + checkCUDNN(cudnnCreateTensorDescriptor(&resultTensorDesc1)); + if (experts_num_layers == 2) { + checkCUDNN(cudnnCreateTensorDescriptor(&resultTensorDesc2)); + } if (use_activation(activation)) { cudnnActivationMode_t mode; switch (activation) { @@ -1199,15 +1380,36 @@ ExpertsMeta::ExpertsMeta(FFHandler handler, } checkCUDNN( cudnnSetActivationDescriptor(actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); - checkCUDNN( - cudnnSetTensor4dDescriptor(resultTensorDesc, - CUDNN_TENSOR_NCHW, - // CUDNN_DATA_FLOAT, - cuda_to_cudnn_datatype(CUDA_R_32F), - num_chosen_experts * effective_batch_size, - out_dim, - 1, - 1)); + if (experts_num_layers == 1) { + checkCUDNN( + cudnnSetTensor4dDescriptor(resultTensorDesc1, + CUDNN_TENSOR_NCHW, + // CUDNN_DATA_FLOAT, + cuda_to_cudnn_datatype(CUDA_R_32F), + num_chosen_experts * effective_batch_size, + out_dim, + 1, + 1)); + } else { + checkCUDNN( + cudnnSetTensor4dDescriptor(resultTensorDesc1, + CUDNN_TENSOR_NCHW, + // CUDNN_DATA_FLOAT, + cuda_to_cudnn_datatype(CUDA_R_32F), + num_chosen_experts * effective_batch_size, + experts_internal_dim_size, + 1, + 1)); + checkCUDNN( + cudnnSetTensor4dDescriptor(resultTensorDesc2, + CUDNN_TENSOR_NCHW, + // CUDNN_DATA_FLOAT, + cuda_to_cudnn_datatype(CUDA_R_32F), + num_chosen_experts * effective_batch_size, + out_dim, + 1, + 1)); + } } } ExpertsMeta::~ExpertsMeta(void) { @@ -1221,19 +1423,25 @@ 
ExpertsMeta::~ExpertsMeta(void) { checkCUDA(cudaFree(num_assignments_per_expert)); checkCUDA(cudaFree(destination_start_indices)); checkCUDA(cudaFree(token_idx_array)); - checkCUDA(cudaFree(weight_idx_array)); + checkCUDA(cudaFree(weight_idx_array1)); + checkCUDA(cudaFree(weight_idx_array2)); checkCUDA(cudaFree(coefficient_idx_array)); checkCUDA(cudaFree(output_idx_array)); - checkCUDA(cudaFree(dev_batch_outputs)); - checkCUDA(cudaFree(bias_idx_array)); - checkCUDA(cudaFree(batch_outputs[0])); - delete[] batch_outputs; + checkCUDA(cudaFree(dev_batch_outputs1)); + checkCUDA(cudaFree(dev_batch_outputs2)); + checkCUDA(cudaFree(bias_idx_array1)); + checkCUDA(cudaFree(bias_idx_array2)); + checkCUDA(cudaFree(batch_outputs1[0])); + checkCUDA(cudaFree(batch_outputs2[0])); + delete[] batch_outputs1; + delete[] batch_outputs2; // Bias checkCUDA(cudaFree((void *)one_ptr)); checkCUDA(cudaFree((void *)one_ptr_array)); // Activation checkCUDNN(cudnnDestroyActivationDescriptor(actiDesc)); - checkCUDNN(cudnnDestroyTensorDescriptor(resultTensorDesc)); + checkCUDNN(cudnnDestroyTensorDescriptor(resultTensorDesc1)); + checkCUDNN(cudnnDestroyTensorDescriptor(resultTensorDesc2)); } }; // namespace FlexFlow From 1d6d03a94d484a5005bec2534c075d4c6f2634c5 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Sat, 29 Apr 2023 21:58:08 -0400 Subject: [PATCH 098/344] new batch (#695) * new batch * v2 dataloader for testing new batchconfig in llama * finish the change and test * finish impl + debugging * value field in PerTokenInfo * fix * cleanup * fix bug * Update cpp_gpu_tests.sh --------- Co-authored-by: Gabriele Oliaro Co-authored-by: Gabriele Oliaro --- examples/cpp/inference/LLAMA/dataloader.cc | 63 +++---- examples/cpp/inference/LLAMA/dataloader.cu | 47 +++-- examples/cpp/inference/LLAMA/llama.h | 4 +- examples/cpp/inference/dataloader.cc | 57 ++---- examples/cpp/inference/dataloader.cu | 66 +++---- examples/cpp/inference/dataloader.h | 2 +- include/flexflow/batch_config.h | 38 ++-- .../ops/inc_multihead_self_attention.h | 4 +- src/ops/inc_multihead_self_attention.cu | 72 ++++---- src/runtime/batch_config.cc | 174 +++++------------- src/runtime/inference_manager.cc | 1 + tests/cpp_gpu_tests.sh | 1 + 12 files changed, 201 insertions(+), 328 deletions(-) diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index a09230029f..fa69324a96 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -1,3 +1,4 @@ + #include "llama.h" #include @@ -84,22 +85,8 @@ void DataLoader::next_batch(FFModel &ff, Domain domain = runtime->get_index_space_domain(ctx, batch_input->parallel_is); ArgumentMap argmap; - // int idx = next_index; - // for (Domain::DomainPointIterator it(domain); it; it++) { - // SampleIdxs meta; - // assert(ff.config.batchSize % batch_input->dims[1].size == 0); - // meta.num_samples = ff.config.batchSize / batch_input->dims[2].size; - // for (int i = 0; i < meta.num_samples; i++) { - // meta.idxs[i] = idx++; - // meta.token_idx = next_token_idx; - // meta.batch_idx = next_batch_index; - // } - - // argmap.set_point(*it, TaskArgument(&meta, sizeof(SampleIdxs))); - // } - - DataLoaderNextBatchInput next_batch_input = {bc->token2ids, - batch_predictions}; + + DataLoaderNextBatchInput next_batch_input = {bc, batch_predictions}; DataLoaderNextBatchInput const *ptr = &next_batch_input; size_t next_batch_input_sz = sizeof(next_batch_input); assert(ptr->prev_batch_preds.size() == 
batch_predictions.size()); @@ -217,11 +204,6 @@ void DataLoader::load_attention_weights(T *ptr, for (int i = 0; i < 32; i++) { size_t start_index = i * one_head_size * 4 + file_index * one_head_size; - // if (file_index == 3) { - // printf("print wo start index %d, data %.10f\n", - // start_index, - // host_array.at(data_index)); - // } for (size_t j = start_index; j < start_index + one_head_size; j++) { ptr[j] = host_array.at(data_index); data_index += 1; @@ -237,38 +219,35 @@ void DataLoader::load_attention_weights(T *ptr, void DataLoader::store_outputs(BatchConfig *bc, InferenceResult const &ir, std::map &batch_predictions) { - assert(bc->token2ids.num_samples == bc->num_active_tokens() && - bc->token2ids.num_samples <= bc->MAX_NUM_TOKENS); std::cout << "store outputs...." << std::endl; batch_predictions.clear(); - size_t guid = bc->token2ids.guids[0]; - size_t start_idx = bc->token2ids.token_indexes[0].token_position; - - for (size_t i = 0; i <= bc->token2ids.num_samples; i++) { - if (i == bc->token2ids.num_samples || bc->token2ids.guids[i] != guid) { - // see how many tokens has been put to model in this req - // to get the index of the final token - int result_index = - bc->token2ids.token_indexes[i - 1].token_position - start_idx; + + // size_t guid = bc->tokensInfo[0].guid; + size_t guid = bc->requestsInfo[bc->tokensInfo[0].request_index].guid; + + size_t start_idx = bc->tokensInfo[0].abs_depth_in_request; + + // only store the last token of each req + for (size_t i = 0; i <= bc->num_active_tokens(); i++) { + size_t current_guid = + bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + if (i == bc->num_active_tokens() || current_guid != guid) { + + int result_index = bc->tokensInfo[i - 1].abs_depth_in_request - start_idx; batch_predictions[guid] = ir.results[i - 1]; + std::cout << "i: " << i << ", dds-" << guid << ", result index" << result_index << ", result value: " << batch_predictions[guid] << "\n"; - if (i < bc->token2ids.num_samples) { - guid = bc->token2ids.guids[i]; - start_idx = bc->token2ids.token_indexes[i].token_position; + if (i < bc->num_active_tokens()) { + guid = bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + start_idx = bc->tokensInfo[i].abs_depth_in_request; } } } - // bc->print(); - // for (size_t i = 0; i < bc->num_active_requests(); i++) { - // batch_predictions[i] = ir.results[i]; - // std::cout << "i: " << i << ", ith pred: " << i - // << ", value: " << batch_predictions[i] - // << std::endl; - // } + assert(batch_predictions.size() == bc->num_active_requests()); } diff --git a/examples/cpp/inference/LLAMA/dataloader.cu b/examples/cpp/inference/LLAMA/dataloader.cu index f2480c8592..4fea090b63 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cu +++ b/examples/cpp/inference/LLAMA/dataloader.cu @@ -24,11 +24,11 @@ void DataLoader::load_input(Task const *task, LLAMAConfig llamaconfig; assert(regions.size() == 2); assert(task->regions.size() == 2); - // SampleIdxs *meta = (SampleIdxs *)task->local_args; DataLoaderNextBatchInput const input_struct = *((DataLoaderNextBatchInput *)task->args); - BatchConfig::SampleIdxs const &meta = input_struct.meta; + BatchConfig *bc = input_struct.bc; + std::map const &prev_batch_preds = input_struct.prev_batch_preds; @@ -50,38 +50,33 @@ void DataLoader::load_input(Task const *task, coord_t batch_size = batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; - // copy 1 token from each batch - // FIXME: currently assume continous indices - size_t guid = meta.guids[0]; - size_t start_idx = 
meta.token_indexes[0].token_position; + size_t guid = bc->requestsInfo[bc->tokensInfo[0].request_index].guid; + size_t start_idx = bc->tokensInfo[0].abs_depth_in_request; size_t dst_idx = 0; - std::cout << "num samples " << meta.num_samples << "\n"; - - for (size_t i = 0; i <= meta.num_samples; i++) { - - // if the first token in one request - if (i == meta.num_samples || meta.guids[i] != guid) { + for (int i = 0; i <= bc->num_active_tokens(); i++) { + size_t current_guid = + bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + if (i == bc->num_active_tokens() || current_guid != guid) { size_t tokens_to_copy = - (meta.token_indexes[i - 1].token_position - start_idx + 1); - std::cout << "size to copy: " << tokens_to_copy << "\n"; + (bc->tokensInfo[i - 1].abs_depth_in_request - start_idx + 1); + + size_t request_index = bc->tokensInfo[i - 1].request_index; + size_t token_start_offset = + bc->requestsInfo[request_index].token_start_offset; - if (tokens_to_copy > 1 || meta.token_indexes[i - 1].token_position < - meta.token_indexes[i - 1].initial_length) { + std::cout << "size to copy: " << tokens_to_copy + << ", start offset: " << token_start_offset << "\n"; + if (tokens_to_copy > 1 || token_start_offset == 0) { // token pos < init length, the init length is the input sentence length // so this is the initial input, load from file. - size_t copy_start_index = guid * llamaconfig.sentence_len; std::cout << "copy index: " << copy_start_index << "\n"; copy_kernel<<>>( batch_input.ptr + dst_idx, full_input.ptr + copy_start_index, tokens_to_copy); - std::cout << "------------req---------------: " << guid << "\n"; - if (guid == 0) { - std::cout << "guid: " << meta.guids[i] << ", i: " << i << std::endl; - } for (int i = 0; i < 8; i++) { std::cout << "value: " << full_input.ptr[copy_start_index + i] << std::endl; @@ -92,17 +87,17 @@ void DataLoader::load_input(Task const *task, // for token by token generating, get token from the previous inference. long token = prev_batch_preds.at(guid); - std::cout << "next iter " << meta.token_indexes[i - 1].token_position + + std::cout << "next iter " << bc->tokensInfo[i - 1].abs_depth_in_request << ", dst_idx: " << dst_idx << ", token:" << token << "\n"; long *dst_ptr = batch_input.ptr + dst_idx; cudaMemcpy(dst_ptr, &token, sizeof(long), cudaMemcpyHostToDevice); } - // update for next req - if (i < meta.num_samples) { - guid = meta.guids[i]; - start_idx = meta.token_indexes[i].token_position; + if (i < bc->num_active_tokens()) { + guid = bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + start_idx = bc->tokensInfo[i].abs_depth_in_request; } dst_idx = i; } diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h index 17300f6d0d..978eb2bf10 100644 --- a/examples/cpp/inference/LLAMA/llama.h +++ b/examples/cpp/inference/LLAMA/llama.h @@ -13,6 +13,7 @@ * limitations under the License. 
*/ +#include "flexflow/inference.h" #include "flexflow/model.h" #define MAX_NUM_SAMPLES 65536 #define MAX_TOKEN_LEN 32000 @@ -102,6 +103,7 @@ struct SampleIdxs { }; struct DataLoaderNextBatchInput { - BatchConfig::SampleIdxs const &meta; + // BatchConfig::SampleIdxs const &meta; + BatchConfig *bc; std::map const &prev_batch_preds; }; diff --git a/examples/cpp/inference/dataloader.cc b/examples/cpp/inference/dataloader.cc index 36f99718c3..67bcbdb648 100644 --- a/examples/cpp/inference/dataloader.cc +++ b/examples/cpp/inference/dataloader.cc @@ -136,20 +136,11 @@ void DataLoader::next_batch(FFModel &ff, } int batch_size = batch_input[bid]->dims[input_dims - 2].size; int seq_len = batch_input[bid]->dims[input_dims - 3].size; - /* printf("ff.config.batchSize: %i, batch_size: %i, seq_len: %i, - num_active_tokens: %i\n", ff.config.batchSize, batch_size, seq_len, - num_active_tokens); */ + assert(ff.config.batchSize == batch_size && batch_size * seq_len >= num_active_tokens); - /* std::cout << "About to call next_batch function..." << std::endl; - bc->print(); - std::cout << "batch_predictions: "; - for (const auto& elem : batch_predictions){ - std::cout << elem.first << ":" << elem.second << ", "; - } */ - DataLoaderNextBatchInput next_batch_input = {bc->token2ids, - batch_predictions}; + DataLoaderNextBatchInput next_batch_input = {bc, batch_predictions}; DataLoaderNextBatchInput const *ptr = &next_batch_input; size_t next_batch_input_sz = sizeof(next_batch_input); assert(ptr->prev_batch_preds.size() == batch_predictions.size()); @@ -184,41 +175,23 @@ void DataLoader::next_batch(FFModel &ff, void DataLoader::store_outputs(BatchConfig *bc, InferenceResult const &ir, std::map &batch_predictions) { - assert(bc->token2ids.num_samples == bc->num_active_tokens() && - bc->token2ids.num_samples <= bc->MAX_NUM_TOKENS); + assert((bc->num_active_tokens() == 0) == (bc->num_active_requests() == 0)); + if (bc->num_active_tokens() == 0) { + return; + } + // there is no num_samples, replace it with num_active_tokens batch_predictions.clear(); - // bc->print(); - for (size_t i = 0; i < bc->token2ids.num_samples; i++) { - if (i == bc->token2ids.num_samples - 1 || - bc->token2ids.guids[i] != bc->token2ids.guids[i + 1]) { - assert(bc->token2ids.token_indexes[i].token_position == - bc->token_last_available_idx[bc->token2ids.token_indexes[i] - .request_index]); - if (outputs.find(bc->token2ids.guids[i]) == outputs.end()) { + for (size_t i = 0; i < bc->num_active_tokens(); i++) { + size_t guid = bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + if (i == bc->num_active_tokens() - 1 || + guid != bc->requestsInfo[bc->tokensInfo[i + 1].request_index].guid) { + if (outputs.find(guid) == outputs.end()) { std::vector v{ir.results[i]}; - outputs[bc->token2ids.guids[i]] = v; + outputs[guid] = v; } else { - outputs[bc->token2ids.guids[i]].push_back(ir.results[i]); + outputs[guid].push_back(ir.results[i]); } - /* std::cout << "outputs: "; - for(const auto& elem : outputs){ - std::cout << elem.first << ": ["; - for (const auto &vel : elem.second) { - std::cout << vel << " "; - } - std::cout << "]" << std::endl; - } */ - // std::cout << "outputs[bc->token2ids.guids[i]].size(): " << - // outputs[bc->token2ids.guids[i]].size() << std::endl; std::cout << "i: " - // << i << std::endl; std::cout << - // "bc->token2ids.token_indexes[i].token_position: " << - // bc->token2ids.token_indexes[i].token_position << std::endl; std::cout - // << "bc->token2ids.token_indexes[i].initial_length: " << - // 
bc->token2ids.token_indexes[i].initial_length << std::endl; - assert(outputs[bc->token2ids.guids[i]].size() == - (bc->token2ids.token_indexes[i].token_position + 1) - - (bc->token2ids.token_indexes[i].initial_length - 1)); - batch_predictions[bc->token2ids.guids[i]] = ir.results[i]; + batch_predictions[guid] = ir.results[i]; } } assert(batch_predictions.size() == bc->num_active_requests()); diff --git a/examples/cpp/inference/dataloader.cu b/examples/cpp/inference/dataloader.cu index 8dcb8c3ab7..80c53f175a 100644 --- a/examples/cpp/inference/dataloader.cu +++ b/examples/cpp/inference/dataloader.cu @@ -25,10 +25,13 @@ void DataLoader::load_input(Task const *task, DataLoaderNextBatchInput const input_struct = *((DataLoaderNextBatchInput *)task->args); - BatchConfig::SampleIdxs const &meta = input_struct.meta; + + BatchConfig *bc = input_struct.bc; + BatchConfig::PerRequestInfo *requestInfo = bc->requestsInfo; + BatchConfig::PerTokenInfo *tokensInfo = bc->tokensInfo; std::map const &prev_batch_preds = input_struct.prev_batch_preds; - if (meta.num_samples == 0) { + if (bc->num_active_tokens() == 0) { return; } int const *full_input_ptr = helperGetTensorPointerRO( @@ -55,11 +58,13 @@ void DataLoader::load_input(Task const *task, assert(batch_size <= full_input_batch_size); // Currently assume continous indices - assert(meta.num_samples <= batch_size * sequence_length); - for (int i = 1; i < meta.num_samples; i++) { - if (meta.guids[i] == meta.guids[i - 1]) { - assert(meta.token_indexes[i].token_position == - meta.token_indexes[i - 1].token_position + 1); + assert(bc->num_active_tokens() <= batch_size * sequence_length); + for (int i = 1; i < bc->num_active_tokens(); i++) { + size_t prev_guid = requestInfo[tokensInfo[i - 1].request_index].guid; + size_t guid = requestInfo[tokensInfo[i].request_index].guid; + if (guid == prev_guid) { + assert(tokensInfo[i].abs_depth_in_request == + tokensInfo[i - 1].abs_depth_in_request + 1); } } // keep things simple for now @@ -69,22 +74,28 @@ void DataLoader::load_input(Task const *task, checkCUDA(cudaMemset( batch_input_ptr, 0, batch_input_domain.get_volume() * sizeof(int))); - size_t guid = meta.guids[0]; - size_t start_idx = meta.token_indexes[0].token_position; + size_t guid = requestInfo[tokensInfo[0].request_index].guid; + size_t start_idx = tokensInfo[0].abs_depth_in_request; size_t dst_idx = 0; size_t total_tokens = 0; - for (size_t i = 1; i <= meta.num_samples; i++) { - if (i == meta.num_samples || meta.guids[i] != guid) { + + for (size_t i = 1; i <= bc->num_active_tokens(); i++) { + size_t current_guid = requestInfo[tokensInfo[i].request_index].guid; + if (i == bc->num_active_tokens() || current_guid != guid) { size_t tokens_to_copy = - (meta.token_indexes[i - 1].token_position - start_idx + 1); - // size_t size_to_copy = token_dim * tokens_to_copy; + (tokensInfo[i - 1].abs_depth_in_request - start_idx + 1); assert(tokens_to_copy > 0); - if (tokens_to_copy > 1 || meta.token_indexes[i - 1].token_position < - meta.token_indexes[i - 1].initial_length) { + + size_t request_index = tokensInfo[i - 1].request_index; + size_t token_start_offset = + bc->requestsInfo[request_index].token_start_offset; + size_t num_processing_tokens = + bc->requestsInfo[request_index].num_tokens_in_batch; + if (tokens_to_copy > 1 || token_start_offset == 0) { // initialization phase - assert(meta.token_indexes[i - 1].token_position < - meta.token_indexes[i - 1].initial_length); + assert(tokensInfo[i - 1].abs_depth_in_request < + (token_start_offset + num_processing_tokens)); 
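For reference, the prompt-vs-decode decision this hunk implements can be captured in a small host-side sketch. It is illustrative only: the helper name plan_copy is made up, and it assumes the BatchConfig::requestsInfo / tokensInfo fields introduced in this patch. A request whose token_start_offset is still 0, or that contributes more than one token to the batch, is in its prompt phase and is loaded from the full input buffer; otherwise only the single token predicted in the previous batch is copied.

// Illustrative sketch of the copy decision made in DataLoader::load_input.
// Assumes the PerRequestInfo / PerTokenInfo layout defined in batch_config.h.
struct CopyPlan {
  size_t tokens_to_copy; // contiguous tokens to copy for this request
  bool from_prompt;      // true: read from the prompt buffer; false: append last prediction
};
static CopyPlan plan_copy(BatchConfig const *bc, int last_token, size_t start_depth) {
  size_t req = bc->tokensInfo[last_token].request_index;
  size_t tokens_to_copy =
      bc->tokensInfo[last_token].abs_depth_in_request - start_depth + 1;
  bool from_prompt =
      tokens_to_copy > 1 || bc->requestsInfo[req].token_start_offset == 0;
  return CopyPlan{tokens_to_copy, from_prompt};
}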
int const *input_zc = full_input_ptr + (guid * sequence_length) + start_idx; int *dst_ptr = batch_input_ptr + dst_idx; @@ -92,35 +103,24 @@ void DataLoader::load_input(Task const *task, dst_ptr, input_zc, tokens_to_copy); } else { // incremental phase - assert(meta.token_indexes[i - 1].token_position >= - meta.token_indexes[i - 1].initial_length); + assert(tokensInfo[i - 1].abs_depth_in_request >= token_start_offset); assert(tokens_to_copy == 1); - /* std::cout << "Looking for guid: " << guid << std::endl; - std::cout << "prev_batch_preds: "; - for (const auto& elem : prev_batch_preds){ - std::cout << elem.first << ":" << elem.second << ", "; - } - std::cout << std::endl; */ assert(prev_batch_preds.find(guid) != prev_batch_preds.end()); int token = prev_batch_preds.at(guid); int *dst_ptr = batch_input_ptr + dst_idx; cudaMemcpy(dst_ptr, &token, sizeof(int), cudaMemcpyHostToDevice); - // copy_kernel<<>>(dst_ptr, &token, tokens_to_copy); - // cudaMemcpyAsync(batch_input_ptr + dst_idx * token_dim, &token, 1, - // cudaMemcpyHostToDevice); } total_tokens += tokens_to_copy; - if (i < meta.num_samples) { - guid = meta.guids[i]; - start_idx = meta.token_indexes[i].token_position; + if (i < bc->num_active_tokens()) { + guid = bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + start_idx = tokensInfo[i].abs_depth_in_request; } dst_idx = i; } } - assert(total_tokens == meta.num_samples); + assert(total_tokens == bc->num_active_tokens()); /*printf("token_dim: %lli, sequence_length: %lli, batch_size: %lli\n", token_dim, sequence_length, batch_size); printf("total_tokens: %lu\n", total_tokens); printf("guid: %lu\n", guid); diff --git a/examples/cpp/inference/dataloader.h b/examples/cpp/inference/dataloader.h index afb45801d1..c77c70502a 100644 --- a/examples/cpp/inference/dataloader.h +++ b/examples/cpp/inference/dataloader.h @@ -62,7 +62,7 @@ class DataLoader { DataGenerator &_data_generator; }; struct DataLoaderNextBatchInput { - BatchConfig::SampleIdxs const &meta; + BatchConfig *bc; std::map const &prev_batch_preds; }; }; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 17ed9d18e8..674fac4ced 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -44,39 +44,25 @@ class BatchConfig { void print() const; static int const MAX_NUM_REQUESTS = MAX_REQUESTS; static int const MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; - // static int const MAX_SEQUENCE_LENGTH = MAX_SEQ_LEN; + // These are set by update int num_tokens, num_requests; bool cached_results; - int token_start_idx[MAX_NUM_REQUESTS]; // index of first token in a request - // that should be processed in the - // current batch/iteration - int token_last_available_idx - [MAX_NUM_REQUESTS]; // last valid token index in a request. 
This includes - // both the prompt and generated tokens - int num_processing_tokens[MAX_NUM_REQUESTS]; // a request's number of tokens - // being processed in the current - // batch/iteration - size_t initial_length[MAX_NUM_REQUESTS]; - size_t max_sequence_length[MAX_NUM_REQUESTS]; - struct token_idxs { - size_t request_index; // the index within the BatchConfig of the request - // that the token belongs to - size_t token_position; // the index indicating the position of each token - // within its request - size_t initial_length; + struct PerRequestInfo { + size_t token_start_offset; + size_t num_tokens_in_batch; + size_t guid; }; - - struct SampleIdxs { - size_t num_samples; - size_t guids[InferenceResult::MAX_NUM_TOKENS]; // the guid of the request - // each token belongs to - token_idxs token_indexes[InferenceResult::MAX_NUM_TOKENS]; + struct PerTokenInfo { + size_t abs_depth_in_request; + size_t request_index; + size_t value; }; + PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; + PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; - SampleIdxs token2ids; - size_t request_guid[MAX_NUM_REQUESTS]; + size_t max_sequence_length[MAX_NUM_REQUESTS]; bool request_completed[MAX_NUM_REQUESTS]; }; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 8daafd8565..e781da9cf5 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -138,7 +138,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { float *attn_heads, *W_out_contiguous; // void *reserveSpace; - BatchConfig::token_idxs *dev_token2ids; + // BatchConfig::token_idxs *dev_token2ids; + + BatchConfig::PerTokenInfo *token_infos; }; }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 8b53f047c0..0fdecfe6d6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -40,17 +40,18 @@ __global__ void build_w_out_tensor(float const *weight_ptr, } } -__global__ void apply_rotary_embedding(float *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::token_idxs const *id_map, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int q_block_size, - int k_block_size, - int v_block_size, - bool q_tensor) { +__global__ void + apply_rotary_embedding(float *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int q_block_size, + int k_block_size, + int v_block_size, + bool q_tensor) { int proj_size = q_tensor ? 
qProjSize : kProjSize; CUDA_KERNEL_LOOP(i, num_tokens * proj_size * num_heads / 2) { // create complex number @@ -72,7 +73,8 @@ __global__ void apply_rotary_embedding(float *input_ptr, // int head_idx = i / (num_tokens * proj_size); int token_idx = (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = id_map[token_idx].token_position; + // size_t pos = id_map[token_idx].token_position; + size_t pos = tokenInfos[token_idx].abs_depth_in_request; // float before_real = complex_input[i].x, before_complex = // complex_input[i].y; @@ -225,7 +227,7 @@ void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, 0, stream>>>(output_ptr, complex_input, - m->dev_token2ids, + m->token_infos, m->qProjSize, m->kProjSize, m->num_heads, @@ -240,7 +242,7 @@ void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, 0, stream>>>(output_ptr, complex_input, - m->dev_token2ids, + m->token_infos, m->qProjSize, m->kProjSize, m->num_heads, @@ -254,7 +256,7 @@ void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, __global__ void store_kv_cache(float const *devQKVProjArray, float *cache_ptr, - BatchConfig::token_idxs const *id_map, + BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, int vProjSize, @@ -275,8 +277,10 @@ __global__ void store_kv_cache(float const *devQKVProjArray, float val = devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + token_idx * proj_size + data_idx]; - int const req_id = id_map[token_idx].request_index; - int const tok_id = id_map[token_idx].token_position; + // int const req_id = id_map[token_idx].request_index; + // int const tok_id = id_map[token_idx].token_position; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + @@ -295,7 +299,7 @@ void inference_kernel2(IncMultiHeadSelfAttentionMeta const *m, 0, stream>>>(m->devQKVProjArray, m->keyCache, - m->dev_token2ids, + m->token_infos, m->qProjSize, m->kProjSize, m->vProjSize, @@ -310,7 +314,7 @@ void inference_kernel2(IncMultiHeadSelfAttentionMeta const *m, 0, stream>>>(m->devQKVProjArray, m->valueCache, - m->dev_token2ids, + m->token_infos, m->qProjSize, m->kProjSize, m->vProjSize, @@ -366,8 +370,10 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, if (bc->request_completed[i]) { continue; } - int num_new_tokens = bc->num_processing_tokens[i]; - int total_tokens = bc->token_last_available_idx[i] + 1; + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].token_start_offset + + bc->requestsInfo[i].num_tokens_in_batch; + // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; int n = total_tokens; @@ -579,16 +585,16 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( m->vSize * m->vProjSize)); *m->has_load_weights = true; } + // here because we need postion info in infernece 1 + cudaMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); // phase 1: Implement kernel to compute KQV for input tokens inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); // phase 2: Update key/val cache - cudaMemcpyAsync(m->dev_token2ids, - &(bc->token2ids.token_indexes), - bc->MAX_NUM_TOKENS * sizeof(BatchConfig::token_idxs), - cudaMemcpyHostToDevice, - stream); - 
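For context, bc->tokensInfo is staged on the device at this point because both the rotary-embedding kernel and store_kv_cache index it per token: each token's abs_depth_in_request supplies the position used for the rotary rotation and the slot the token occupies in the per-request key/value cache. A minimal sketch of that cache addressing, mirroring the arithmetic in store_kv_cache above (the helper name kv_cache_offset is illustrative):

// Illustrative: offset of one element of a token's K or V projection in the cache.
// Layout assumed (from store_kv_cache): [request][head][token position][proj dim].
inline size_t kv_cache_offset(size_t req_id, size_t head_idx, size_t tok_id,
                              size_t data_idx, size_t num_heads,
                              size_t max_seq_len, size_t proj_size) {
  return req_id * (num_heads * max_seq_len * proj_size) +
         head_idx * (max_seq_len * proj_size) + tok_id * proj_size + data_idx;
}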
inference_kernel2(m, bc, stream); // phase 3: Compute attention score @@ -662,7 +668,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( num_heads * kProjSize * BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN; size_t value_cache_size = num_heads * vProjSize * BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN; - size_t token2ids_size = BatchConfig::MAX_NUM_TOKENS; + size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; size_t qk_prod_size = BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads; size_t attn_heads_size = @@ -673,9 +679,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * sizeof(float) + - token2ids_size * - sizeof(BatchConfig::token_idxs); // more components will - // be added here later + tokeninfo_size * + sizeof(BatchConfig::PerTokenInfo); // more components will + // be added here later Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(totalSize - 1)); @@ -691,8 +697,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( devQKVProjArray = (float *)reserveInst.pointer_untyped(0, sizeof(char)); keyCache = (float *)devQKVProjArray + qkv_max_proj_size; valueCache = (float *)keyCache + key_cache_size; - dev_token2ids = (BatchConfig::token_idxs *)(valueCache + value_cache_size); - qk_prods = (float *)(dev_token2ids + token2ids_size); + token_infos = (BatchConfig::PerTokenInfo *)(valueCache + value_cache_size); + qk_prods = (float *)(token_infos + tokeninfo_size); qk_prods_softmax = (float *)(qk_prods + qk_prod_size); attn_heads = (float *)qk_prods_softmax + qk_prod_size; W_out_contiguous = (float *)attn_heads + attn_heads_size; diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 093e7d6de3..01c1df551c 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -25,56 +25,41 @@ LegionRuntime::Logger::Category log_bc("BatchConfig"); BatchConfig::BatchConfig() { cached_results = false; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - token_start_idx[i] = 0; - token_last_available_idx[i] = -1; + requestsInfo[i].token_start_offset = 0; + requestsInfo[i].num_tokens_in_batch = 0; request_completed[i] = true; - num_processing_tokens[i] = 0; - max_sequence_length[i] = 0; - initial_length[i] = 0; } - token2ids.num_samples = 0; for (int i = 0; i < MAX_NUM_TOKENS; i++) { - token2ids.guids[i] = SIZE_MAX; - token2ids.token_indexes[i].request_index = SIZE_MAX; - token2ids.token_indexes[i].token_position = SIZE_MAX; - token2ids.token_indexes[i].initial_length = SIZE_MAX; + tokensInfo[i].abs_depth_in_request = SIZE_MAX; + tokensInfo[i].request_index = SIZE_MAX; + tokensInfo[i].value = SIZE_MAX; } update_num_active_requests_tokens(); } int BatchConfig::update_results(InferenceResult const &ir) { cached_results = false; - // int tokens_processed = 0; int completed = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (request_completed[i]) { continue; } - assert(num_processing_tokens[i] > 0); - // if (num_processing_tokens[i] == 0) { - // continue; - // } - // tokens_processed += num_processing_tokens[i]; - token_start_idx[i] += num_processing_tokens[i]; - if (token_start_idx[i] >= max_sequence_length[i] + assert(requestsInfo[i].num_tokens_in_batch > 0); + int processed_tokens = requestsInfo[i].token_start_offset + + requestsInfo[i].num_tokens_in_batch; + if (processed_tokens >= max_sequence_length[i] // || ir.results[t] == 0 TODO: replace this with ) { 
log_bc.print("[Done] guid(%zu) final_length(%d)", - request_guid[i], - token_start_idx[i]); + requestsInfo[i].guid, + processed_tokens); request_completed[i] = true; - token_start_idx[i] = 0; - token_last_available_idx[i] = -1; - num_processing_tokens[i] = 0; + requestsInfo[i].num_tokens_in_batch = 0; + requestsInfo[i].token_start_offset = 0; completed++; } else { - if (token_start_idx[i] == token_last_available_idx[i] + 1) { - token_last_available_idx[i]++; - num_processing_tokens[i] = 1; // incremental phase - } else { - assert(false); - } - assert(token_start_idx[i] <= token_last_available_idx[i]); + requestsInfo[i].token_start_offset += requestsInfo[i].num_tokens_in_batch; + requestsInfo[i].num_tokens_in_batch = 1; } } update_num_active_requests_tokens(); @@ -89,12 +74,10 @@ bool BatchConfig::register_new_request(size_t guid, for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (request_completed[i]) { log_bc.print("[NewRequest] guid(%zu) length(%d)", guid, initial_len); - token_start_idx[i] = 0; - token_last_available_idx[i] = initial_len - 1; + requestsInfo[i].token_start_offset = 0; + requestsInfo[i].num_tokens_in_batch = initial_len; + requestsInfo[i].guid = guid; max_sequence_length[i] = initial_len + tokens_to_generate; - initial_length[i] = initial_len; - request_guid[i] = guid; - num_processing_tokens[i] = 0; request_completed[i] = false; update_num_active_requests_tokens(); return true; @@ -105,23 +88,9 @@ bool BatchConfig::register_new_request(size_t guid, } void BatchConfig::prepare_next_batch() { - cached_results = false; - int count = 0; - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - if (request_completed[i]) { - continue; - } - if (num_tokens + token_last_available_idx[i] - token_start_idx[i] + 1 <= - MAX_NUM_TOKENS) { - num_processing_tokens[i] = - token_last_available_idx[i] - token_start_idx[i] + 1; - } else { - num_processing_tokens[i] = MAX_NUM_TOKENS - num_tokens; - } - count += num_processing_tokens[i]; - } - update_num_active_requests_tokens(); - log_bc.print("[NextBatch] num_tokens(%d)", count); + assert(cached_results); + assert(num_requests > 0 && num_tokens > 0); + log_bc.print("[NextBatch] num_tokens(%d)", num_tokens); } void BatchConfig::update_num_active_requests_tokens() { @@ -130,17 +99,14 @@ void BatchConfig::update_num_active_requests_tokens() { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (!request_completed[i]) { num_requests++; - for (int j = 0; j < num_processing_tokens[i]; j++) { - token2ids.guids[num_tokens] = request_guid[i]; - token2ids.token_indexes[num_tokens].token_position = - token_start_idx[i] + j; - token2ids.token_indexes[num_tokens].request_index = i; - token2ids.token_indexes[num_tokens].initial_length = initial_length[i]; + for (int j = 0; j < requestsInfo[i].num_tokens_in_batch; j++) { + int start_idx = requestsInfo[i].token_start_offset; + tokensInfo[num_tokens].abs_depth_in_request = start_idx + j; + tokensInfo[num_tokens].request_index = i; num_tokens++; } } } - token2ids.num_samples = num_tokens; cached_results = true; } @@ -150,7 +116,7 @@ int BatchConfig::num_active_requests() const { } else { assert(false && "some BatchConfig functions updated requests but didn't call " - "update_num_active_requests_tokens() before exit"); + "() before exit"); } } @@ -165,74 +131,36 @@ int BatchConfig::num_active_tokens() const { } void BatchConfig::print() const { - printf("--------------------------BatchConfig--------------------------\n"); - printf("num_tokens: %i, num_requests: %i, cached_results: %i\n", - num_tokens, - num_requests, - 
cached_results); - - printf("requests_completed: "); - for (int i = 0; i < num_requests; i++) { - printf("%i ", request_completed[i]); - } - printf("\n"); - - printf("token_start_idx: "); - for (int i = 0; i < num_requests; i++) { - printf("%i ", token_start_idx[i]); - } - printf("\n"); - - printf("token_last_available_idx: "); - for (int i = 0; i < num_requests; i++) { - printf("%i ", token_last_available_idx[i]); - } - printf("\n"); + std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; + std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; + std::cout << "Number of tokens: " << num_tokens << std::endl; + std::cout << "Number of requests: " << num_requests << std::endl; + std::cout << "Cached results: " << cached_results << std::endl; - printf("num_processing_tokens: "); - for (int i = 0; i < num_requests; i++) { - printf("%i ", num_processing_tokens[i]); - } - printf("\n"); - - printf("max_sequence_length: "); - for (int i = 0; i < num_requests; i++) { - printf("%lu ", max_sequence_length[i]); - } - printf("\n"); - - printf("request_guid: "); - for (int i = 0; i < num_requests; i++) { - printf("%lu ", request_guid[i]); - } - printf("\n"); - - printf("token2ids.num_samples:%lu\n", token2ids.num_samples); - - printf("token2ids.guids: "); - for (int i = 0; i < num_tokens; i++) { - printf("%lu ", token2ids.guids[i]); - } - printf("\n"); - - printf("token2ids.token_indexes[i].request_index: "); - for (int i = 0; i < num_tokens; i++) { - printf("%lu ", token2ids.token_indexes[i].request_index); - } - printf("\n"); - - printf("token2ids.token_indexes[i].token_position: "); - for (int i = 0; i < num_tokens; i++) { - printf("%lu ", token2ids.token_indexes[i].token_position); + std::cout << "Per-request info:\n"; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (!request_completed[i]) { + std::cout << " Request " << i << ":\n"; + std::cout << " Token start offset: " + << requestsInfo[i].token_start_offset << std::endl; + std::cout << " Number of tokens in batch: " + << requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << " GUID: " << requestsInfo[i].guid << std::endl; + std::cout << " Max sequence length: " << max_sequence_length[i] + << std::endl; + std::cout << " Request completed: " << request_completed[i] + << std::endl; + } } - printf("token2ids.token_indexes[i].initial_length: "); + std::cout << "Per-token info:\n"; for (int i = 0; i < num_tokens; i++) { - printf("%lu ", token2ids.token_indexes[i].initial_length); + std::cout << " Token " << i << ":\n"; + std::cout << " Absolute depth in request: " + << tokensInfo[i].abs_depth_in_request << std::endl; + std::cout << " Request index: " << tokensInfo[i].request_index + << std::endl; } - printf("\n"); - printf("---------------------------------------------------------------------" - "---------\n"); } }; // namespace FlexFlow diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 09bbbefbe0..60294d4a75 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -143,6 +143,7 @@ MachineView *InferenceManager::get_machine_view(int mv_id) { } FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { + assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) int batch_index = index % max_num_inflight_batches; diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 
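For orientation (not part of the patch), a minimal standalone sketch of the bookkeeping the BatchConfig refactor above introduces: per-request progress lives in `requestsInfo`-style entries, and each batch flattens them into per-token entries, the same walk `update_num_active_requests_tokens()` performs. The struct and constant names mirror the new fields; `build_token_table` and the driver in `main` are hypothetical helpers for illustration only.

```cpp
// Illustrative sketch only: simplified stand-ins for BatchConfig's
// requestsInfo / tokensInfo bookkeeping; not code from the patch.
#include <cassert>
#include <cstdio>

struct PerRequestInfo {
  int token_start_offset; // tokens of this request already processed
  int num_tokens_in_batch; // tokens of this request packed into the current batch
};

struct PerTokenInfo {
  int request_index;        // which request this token belongs to
  int abs_depth_in_request; // absolute position of the token within its request
};

constexpr int MAX_NUM_REQUESTS = 8;
constexpr int MAX_NUM_TOKENS = 64;

// Flatten per-request counts into one per-token table (hypothetical helper).
int build_token_table(PerRequestInfo const (&reqs)[MAX_NUM_REQUESTS],
                      bool const (&completed)[MAX_NUM_REQUESTS],
                      PerTokenInfo (&tokens)[MAX_NUM_TOKENS]) {
  int num_tokens = 0;
  for (int i = 0; i < MAX_NUM_REQUESTS; i++) {
    if (completed[i]) {
      continue;
    }
    for (int j = 0; j < reqs[i].num_tokens_in_batch; j++) {
      assert(num_tokens < MAX_NUM_TOKENS);
      tokens[num_tokens].request_index = i;
      tokens[num_tokens].abs_depth_in_request = reqs[i].token_start_offset + j;
      num_tokens++;
    }
  }
  return num_tokens;
}

int main() {
  PerRequestInfo reqs[MAX_NUM_REQUESTS] = {};
  bool completed[MAX_NUM_REQUESTS];
  PerTokenInfo tokens[MAX_NUM_TOKENS];
  for (int i = 0; i < MAX_NUM_REQUESTS; i++) {
    completed[i] = true;
  }
  // Request 0: prompt phase, 4 tokens in this batch; request 1: decode phase, 1 token.
  completed[0] = false;
  reqs[0].token_start_offset = 0;
  reqs[0].num_tokens_in_batch = 4;
  completed[1] = false;
  reqs[1].token_start_offset = 7;
  reqs[1].num_tokens_in_batch = 1;
  int n = build_token_table(reqs, completed, tokens);
  printf("num_tokens = %d\n", n); // prints 5
  for (int t = 0; t < n; t++) {
    printf("token %d -> request %d, depth %d\n",
           t, tokens[t].request_index, tokens[t].abs_depth_in_request);
  }
  return 0;
}
```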
ea4dc6b5b9..591b0e82be 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -1,5 +1,6 @@ #! /usr/bin/env bash set -e +set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" From 089aaf5cf38499c200818f3fbe9e90ffaeb525fd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 30 Apr 2023 05:13:41 -0400 Subject: [PATCH 099/344] [Inference] - Update tests and Fix bugs discovered by Inference tests (#698) --- src/ops/experts.cc | 156 ++++++++++++++++------ src/ops/experts.cu | 10 +- src/ops/inc_multihead_self_attention.cc | 168 +++++++++++++++--------- src/ops/inc_multihead_self_attention.cu | 9 +- 4 files changed, 232 insertions(+), 111 deletions(-) diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 5fa6404ff0..8ec77131a9 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -828,6 +828,7 @@ void Experts::inference_task(Task const *task, assert(weights_domain.hi()[2] - weights_domain.lo()[2] + 1 == num_replicas); float const *bias_ptr = nullptr; + int nparams_bias = -1; if (use_bias) { bias_ptr = helperGetTensorPointerRO( regions[5], task->regions[5], FID_DATA, ctx, runtime); @@ -835,9 +836,9 @@ void Experts::inference_task(Task const *task, ctx, task->regions[5].region.get_index_space()); int bias_dims = bias_domain.get_dim(); assert(bias_dims == 3); - int nparams_bias = (m->experts_num_layers == 1) - ? out_dim - : (m->experts_internal_dim_size + out_dim); + nparams_bias = (m->experts_num_layers == 1) + ? out_dim + : (m->experts_internal_dim_size + out_dim); assert(bias_domain.hi()[0] - bias_domain.lo()[0] + 1 == nparams_bias); assert(bias_domain.hi()[1] - bias_domain.lo()[1] + 1 == num_experts); assert(bias_domain.hi()[2] - bias_domain.lo()[2] + 1 == num_replicas); @@ -851,6 +852,8 @@ void Experts::inference_task(Task const *task, std::cout << m->out_dim << std::endl; std::cout << m->num_chosen_experts << std::endl; std::cout << m->effective_batch_size << std::endl; + std::cout << m->experts_num_layers << std::endl; + std::cout << m->experts_internal_dim_size << std::endl; std::cout << m->num_experts << std::endl; std::cout << m->use_bias << std::endl; @@ -915,66 +918,137 @@ void Experts::inference_task(Task const *task, free(cpu_topk_gate_pred_ptr); /* ----------------Expert Weights--------------*/ - float *cpu_experts_1 = new float[data_dim * out_dim]; - float *cpu_experts_2 = new float[data_dim * out_dim]; - checkCUDA(cudaMemcpy(cpu_experts_1, - weights_ptrs[0], - data_dim * out_dim * sizeof(float), + assert(m->experts_num_layers == 2 || m->experts_num_layers == 1); + size_t layer0_size = m->experts_num_layers == 1 + ? data_dim * out_dim + : data_dim * m->experts_internal_dim_size; + size_t layer1_size = m->experts_internal_dim_size * out_dim; + float *cpu_experts_0_layer0 = new float[layer0_size]; + float *cpu_experts_1_layer0 = new float[layer0_size]; + float *cpu_experts_0_layer1 = + m->experts_num_layers == 1 ? nullptr : new float[layer1_size]; + float *cpu_experts_1_layer1 = + m->experts_num_layers == 1 ? 
nullptr : new float[layer1_size]; + /*checkCUDA(cudaMemcpy(cpu_experts_0_layer0, + weights_ptr, + layer0_size * sizeof(float), cudaMemcpyDeviceToHost)); - checkCUDA(cudaMemcpy(cpu_experts_2, - weights_ptrs[2], - data_dim * out_dim * sizeof(float), + checkCUDA(cudaMemcpy(cpu_experts_1_layer0, + weights_ptr[nparams_weight], + layer0_size * sizeof(float), cudaMemcpyDeviceToHost)); + if (m->experts_num_layers == 2) { + checkCUDA(cudaMemcpy(cpu_experts_0_layer1, + weights_ptr[layer0_size], + layer1_size * sizeof(float), + cudaMemcpyDeviceToHost)); + checkCUDA(cudaMemcpy(cpu_experts_1_layer1, + weights_ptr[nparams_weight + layer0_size], + layer1_size * sizeof(float), + cudaMemcpyDeviceToHost)); + }*/ cpu_sum = 0; - for (int i = 0; i < data_dim * out_dim; i++) { - cpu_experts_1[i] = float(i) / float(data_dim * out_dim); - cpu_sum += cpu_experts_1[i]; + for (int i = 0; i < layer0_size; i++) { + cpu_experts_0_layer0[i] = float(i) / float(nparams_weight); + cpu_sum += cpu_experts_0_layer0[i]; + } + if (m->experts_num_layers == 2) { + for (int i = 0; i < layer1_size; i++) { + cpu_experts_0_layer1[i] = + float(layer0_size + i) / float(nparams_weight); + cpu_sum += cpu_experts_0_layer1[i]; + } } std::cout << "[CPU] Experts 0 weights sum = " << cpu_sum << std::endl; - for (int i = 0; i < data_dim * out_dim; i++) { - cpu_experts_2[i] = - float(data_dim * out_dim - i) / float(data_dim * out_dim); - cpu_sum += cpu_experts_2[i]; + cpu_sum = 0; + for (int i = 0; i < layer0_size; i++) { + cpu_experts_1_layer0[i] = + float(nparams_weight - i) / float(nparams_weight); + assert(cpu_experts_1_layer0[i] > 0); + cpu_sum += cpu_experts_1_layer0[i]; + } + if (m->experts_num_layers == 2) { + for (int i = 0; i < layer1_size; i++) { + cpu_experts_1_layer1[i] = + float(nparams_weight - layer0_size + i) / float(nparams_weight); + assert(cpu_experts_1_layer1[i] > 0); + cpu_sum += cpu_experts_1_layer1[i]; + } } std::cout << "[CPU] Experts 1 weights sum = " << cpu_sum << std::endl; for (int i = 0; i < num_experts; i++) { - if (i % 2 == 0) { - checkCUDA(cudaMemcpy((float *)weights_ptrs[i * (1 + use_bias)], - cpu_experts_1, - data_dim * out_dim * sizeof(float), - cudaMemcpyHostToDevice)); - } else { - checkCUDA(cudaMemcpy((float *)weights_ptrs[i * (1 + use_bias)], - cpu_experts_2, - data_dim * out_dim * sizeof(float), - cudaMemcpyHostToDevice)); + // first layer + checkCUDA( + cudaMemcpy((float *)&weights_ptr[nparams_weight * i], + i % 2 == 0 ? cpu_experts_0_layer0 : cpu_experts_1_layer0, + layer0_size * sizeof(float), + cudaMemcpyHostToDevice)); + // second layer + if (m->experts_num_layers == 2) { + checkCUDA( + cudaMemcpy((float *)&weights_ptr[nparams_weight * i + layer0_size], + i % 2 == 0 ? cpu_experts_0_layer1 : cpu_experts_1_layer1, + layer1_size * sizeof(float), + cudaMemcpyHostToDevice)); } } - free(cpu_experts_1); - free(cpu_experts_2); + free(cpu_experts_0_layer0); + free(cpu_experts_1_layer0); + free(cpu_experts_0_layer1); + free(cpu_experts_1_layer1); /* ----------------Expert Bias--------------*/ if (use_bias) { - float *bias_experts_1 = new float[out_dim]; - checkCUDA(cudaMemcpy(bias_experts_1, - weights_ptrs[1], - out_dim * sizeof(float), + size_t layer0_size = + m->experts_num_layers == 1 ? out_dim : m->experts_internal_dim_size; + size_t layer1_size = out_dim; + float *bias_experts_0_layer0 = new float[layer0_size]; + float *bias_experts_0_layer1 = + m->experts_num_layers == 1 ? 
nullptr : new float[layer1_size]; + + checkCUDA(cudaMemcpy(bias_experts_0_layer0, + bias_ptr, + layer0_size * sizeof(float), cudaMemcpyDeviceToHost)); cpu_sum = 0; - for (int i = 0; i < out_dim; i++) { - cpu_sum += bias_experts_1[i]; + for (int i = 0; i < layer0_size; i++) { + cpu_sum += bias_experts_0_layer0[i]; // bias_experts_1[i] = 1.0f; } - std::cout << "[CPU] Bias 0 sum = " << cpu_sum << std::endl; + std::cout << "[CPU] Bias expert 0 (layer 0) sum = " << cpu_sum + << std::endl; + + if (m->experts_num_layers == 2) { + checkCUDA(cudaMemcpy(bias_experts_0_layer1, + (float *)&bias_ptr[layer0_size], + layer1_size * sizeof(float), + cudaMemcpyDeviceToHost)); + cpu_sum = 0; + for (int i = 0; i < layer1_size; i++) { + cpu_sum += bias_experts_0_layer1[i]; + // bias_experts_1[i] = 1.0f; + } + std::cout << "[CPU] Bias expert 0 (layer 1) sum = " << cpu_sum + << std::endl; + } + for (int i = 0; i < num_experts; i++) { - checkCUDA(cudaMemcpy((float *)weights_ptrs[i * (1 + use_bias) + 1], - bias_experts_1, - out_dim * sizeof(float), + checkCUDA(cudaMemcpy((float *)&bias_ptr[nparams_bias * i], + bias_experts_0_layer0, + layer0_size * sizeof(float), cudaMemcpyHostToDevice)); + if (m->experts_num_layers == 2) { + checkCUDA( + cudaMemcpy((float *)&bias_ptr[nparams_bias * i + layer0_size], + bias_experts_0_layer1, + layer1_size * sizeof(float), + cudaMemcpyHostToDevice)); + } } - free(bias_experts_1); + free(bias_experts_0_layer0); + free(bias_experts_0_layer1); } } #endif diff --git a/src/ops/experts.cu b/src/ops/experts.cu index c5f79446a1..ce15cdff55 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -963,7 +963,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, (float const **)calloc(gemm_batch_count, sizeof(float const *)); assert(weight_idx_array_thrust); checkCUDA(cudaMemcpy(weight_idx_array_thrust, - m->weight_idx_array, + m->weight_idx_array1, sizeof(float const *) * gemm_batch_count, cudaMemcpyDeviceToHost)); std::vector weight_idx_array_thrust_vec( @@ -983,7 +983,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(bias_idx_array_thrust); if (use_bias) { checkCUDA(cudaMemcpy(bias_idx_array_thrust, - m->bias_idx_array, + m->bias_idx_array1, sizeof(float const *) * gemm_batch_count, cudaMemcpyDeviceToHost)); } @@ -1160,7 +1160,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(dev_batch_outputs_cuda); checkCUDA( cudaMemcpy(dev_batch_outputs_cuda, - m->dev_batch_outputs, + m->dev_batch_outputs1, sizeof(float *) * num_chosen_experts * m->effective_batch_size, cudaMemcpyDeviceToHost)); std::vector dev_batch_outputs_cuda_vec( @@ -1168,8 +1168,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, dev_batch_outputs_cuda + num_chosen_experts * m->effective_batch_size); std::vector batch_outputs_host_vec( - m->batch_outputs, - m->batch_outputs + num_chosen_experts * m->effective_batch_size); + m->batch_outputs1, + m->batch_outputs1 + num_chosen_experts * m->effective_batch_size); assert(batch_outputs_host_vec == dev_batch_outputs_cuda_vec); /* std::cout << "dev_batch_outputs_cuda_vec[i]: "; diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8b2850a91c..a0674eddb6 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -583,6 +583,26 @@ void IncMultiHeadSelfAttention::inference_task( #ifdef INFERENCE_TESTS printf("Checking IncMultiHeadSelfAttention computations...\n"); + // 
============================================================================= + // Define helper functions to handle row-major arrays + // ============================================================================= + + auto set_value_row_major = [](float *arr, + std::vector const &shape, + std::vector const &indices, + float value) -> void { + int offset = 0; + for (int i = 0; i < shape.size(); i++) { + int index = indices[i]; + int stride = 1; + for (int j = i + 1; j < shape.size(); j++) { + stride *= shape[j]; + } + offset += index * stride; + } + *(arr + offset) = value; + }; + // ============================================================================= // Load input/output/weights and parse general configs // ============================================================================= @@ -667,35 +687,40 @@ void IncMultiHeadSelfAttention::inference_task( // ============================================================================= // Load the Q/K/V projection weights, and create a Torch tensor // ============================================================================= - - float w_qkv[m->qSize][m->qProjSize][3][num_heads]; - memset(&w_qkv, - 0, - m->qSize * m->qProjSize * 3 * num_heads * - sizeof(float)); // assuming that 0.0f is encoded as all zero bytes - assert(w_qkv[0][0][0][0] == 0.0f); + std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_heads}; + float *w_qkv = + (float *)calloc(m->qSize * m->qProjSize * 3 * num_heads, sizeof(float)); + assert(w_qkv[0] == 0.0f); for (int h = 0; h < num_heads; h++) { for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { - size_t row_index = i % m->qSize; - size_t column_index = i / m->qSize; + int row_index = i % m->qSize; + int column_index = i / m->qSize; // Q - w_qkv[row_index][column_index][0][h] = - weight_cpu[all_weight_params * h + m->qSize * column_index + - row_index]; + set_value_row_major(w_qkv, + w_qkv_shape, + {row_index, column_index, 0, h}, + weight_cpu[all_weight_params * h + + m->qSize * column_index + row_index]); // K - w_qkv[row_index][column_index][1][h] = + set_value_row_major( + w_qkv, + w_qkv_shape, + {row_index, column_index, 1, h}, weight_cpu[all_weight_params * h + m->qProjSize * m->qSize + - m->qSize * column_index + row_index]; + m->qSize * column_index + row_index]); // V - w_qkv[row_index][column_index][2][h] = + set_value_row_major( + w_qkv, + w_qkv_shape, + {row_index, column_index, 2, h}, weight_cpu[all_weight_params * h + 2 * m->qProjSize * m->qSize + - m->qSize * column_index + row_index]; + m->qSize * column_index + row_index]); } } // convert weights to torch tensor torch::Tensor torch_w_qkv = torch::from_blob( - w_qkv, {m->qSize, m->qProjSize, 3, num_heads}, torch::kFloat32); + w_qkv, {m->qSize, m->qProjSize, 3, (int)num_heads}, torch::kFloat32); /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() << std::endl; @@ -719,6 +744,7 @@ void IncMultiHeadSelfAttention::inference_task( qkv_projs.sizes()[1] <= effective_batch_size); assert(qkv_projs.sizes()[2] == 3); assert(qkv_projs.sizes()[3] == num_heads); + free(w_qkv); // ----------------------- Loading CUDA results for this step --------------- float *QKVProjArray_cpu = download_tensor(m->devQKVProjArray, @@ -726,30 +752,30 @@ void IncMultiHeadSelfAttention::inference_task( proj_sum * m->num_heads); assert(QKVProjArray_cpu != nullptr); - float QKVProjArray_converted[m->qProjSize][bc->num_active_tokens()][3] - [num_heads]; - memset(&QKVProjArray_converted, - 0, - m->qProjSize * bc->num_active_tokens() * 3 * 
num_heads * - sizeof(float)); // assuming that 0.0f is encoded as all zero bytes + std::vector QKVProjArray_converted_shape = { + m->qProjSize, bc->num_active_tokens(), 3, (int)num_heads}; + float *QKVProjArray_converted = (float *)calloc( + m->qProjSize * bc->num_active_tokens() * 3 * num_heads, sizeof(float)); // skip over padding at the end of QKVProjArray_cpu // convert from column order to 3D matrix because torch cannot automatically // import matrices flattened in column order for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_heads; i++) { - size_t proj_size_index = i % m->qProjSize; - size_t head_index = i / (proj_sum * bc->num_active_tokens()); - size_t token_index = + int proj_size_index = i % m->qProjSize; + int head_index = i / (proj_sum * bc->num_active_tokens()); + int token_index = ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % bc->num_active_tokens(); - size_t qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / - (m->qProjSize * bc->num_active_tokens()); + int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / + (m->qProjSize * bc->num_active_tokens()); assert(proj_size_index < proj_sum); assert(head_index < num_heads); assert(token_index < bc->num_active_tokens()); assert(qkv_offset < 3); - QKVProjArray_converted[proj_size_index][token_index][qkv_offset] - [head_index] = QKVProjArray_cpu[i]; + set_value_row_major(QKVProjArray_converted, + QKVProjArray_converted_shape, + {proj_size_index, token_index, qkv_offset, head_index}, + QKVProjArray_cpu[i]); } torch::Tensor QKVProjArray_torch = torch::from_blob(QKVProjArray_converted, @@ -771,6 +797,7 @@ void IncMultiHeadSelfAttention::inference_task( // } // } assert(torch::allclose(QKVProjArray_torch, qkv_projs, 1e-05, 1e-05)); + free(QKVProjArray_converted); // ============================================================================= // Store the K/V projections into the cache @@ -783,10 +810,9 @@ void IncMultiHeadSelfAttention::inference_task( for (size_t d = 0; d < m->kProjSize; d++) { size_t kcache_idx = d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + - bc->token2ids.token_indexes[t].token_position * m->num_heads * + bc->tokensInfo[t].abs_depth_in_request * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + - h * BatchConfig::MAX_NUM_REQUESTS + - bc->token2ids.token_indexes[t].request_index; + h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; m->kcache[kcache_idx] = qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) .item(); @@ -794,10 +820,9 @@ void IncMultiHeadSelfAttention::inference_task( for (size_t d = 0; d < m->vProjSize; d++) { size_t vcache_idx = d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + - bc->token2ids.token_indexes[t].token_position * m->num_heads * + bc->tokensInfo[t].abs_depth_in_request * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + - h * BatchConfig::MAX_NUM_REQUESTS + - bc->token2ids.token_indexes[t].request_index; + h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; m->vcache[vcache_idx] = qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) .item(); @@ -819,7 +844,7 @@ void IncMultiHeadSelfAttention::inference_task( std::vector r_first_idx; std::vector r_num_tokens; for (size_t t = 0; t < bc->num_active_tokens(); t++) { - size_t rid = bc->token2ids.token_indexes[t].request_index; + size_t rid = bc->tokensInfo[t].request_index; if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { req_idxs.push_back(rid); r_first_idx.push_back(t); @@ 
-1004,13 +1029,19 @@ void IncMultiHeadSelfAttention::inference_task( // ============================================================================= // ----------------------- C++ operations & checks -------------------------- - float w_out[m->vProjSize][m->num_heads][m->oProjSize] = {0}; + float *w_out = (float *)calloc(m->vProjSize * m->num_heads * m->oProjSize, + sizeof(float)); + std::vector w_out_shape = {m->vProjSize, m->num_heads, m->oProjSize}; + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); for (int h = 0; h < num_heads; h++) { for (int v = 0; v < m->vProjSize; v++) { for (int o = 0; o < m->oProjSize; o++) { - w_out[v][h][o] = + set_value_row_major( + w_out, + w_out_shape, + {v, h, o}, weight_cpu[all_weight_params * h + 3 * m->qProjSize * m->qSize + - m->vProjSize * o + v]; + m->vProjSize * o + v]); } } } @@ -1022,14 +1053,21 @@ void IncMultiHeadSelfAttention::inference_task( float *w_out_cuda = download_tensor( m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_heads); assert(w_out_cuda != nullptr); - float converted_wout_tensor[m->vProjSize][m->num_heads][m->oProjSize] = {0}; + float *converted_wout_tensor = (float *)calloc( + m->vProjSize * m->num_heads * m->oProjSize, sizeof(float)); + std::vector converted_wout_tensor_shape = { + m->vProjSize, m->num_heads, m->oProjSize}; + for (int i = 0; i < m->vProjSize * m->num_heads * m->oProjSize; i++) { - int row_index = i % m->vProjSize; - int col_index = (i / m->vProjSize) % m->num_heads; - int depth_index = i / (m->vProjSize * m->num_heads); - assert(row_index < m->vProjSize && col_index < m->num_heads && - depth_index < m->oProjSize); - converted_wout_tensor[row_index][col_index][depth_index] = w_out_cuda[i]; + int v_idx = i % m->vProjSize; + int h_idx = (i / m->vProjSize) % m->num_heads; + int o_idx = i / (m->vProjSize * m->num_heads); + assert(v_idx < m->vProjSize && h_idx < m->num_heads && + o_idx < m->oProjSize); + set_value_row_major(converted_wout_tensor, + converted_wout_tensor_shape, + {v_idx, h_idx, o_idx}, + w_out_cuda[i]); } torch::Tensor w_out_cuda_tensor = torch::from_blob(converted_wout_tensor, @@ -1038,6 +1076,7 @@ void IncMultiHeadSelfAttention::inference_task( // ----------------------- Comparing C++ & CUDA results --------------------- assert(torch::allclose(w_out_cuda_tensor, torch_w_out, 1e-05, 1e-05)); + free(converted_wout_tensor); // ============================================================================= // Compute the softmax(QK^T/sqrt(d_k))V product, request by request @@ -1079,11 +1118,9 @@ void IncMultiHeadSelfAttention::inference_task( size_t num_new_tokens = r_num_tokens[r]; int64_t rid = (int64_t)(req_idxs[r]); int64_t num_tokens_received_so_far = - (int64_t)(bc->token_last_available_idx[rid] + 1); - // printf("num_new_tokens: %lu, bc->num_processing_tokens[rid]: %i, rid: - // %li\n", - // num_new_tokens, bc->num_processing_tokens[rid], rid); - assert(num_new_tokens == bc->num_processing_tokens[rid]); + (int64_t)(bc->requestsInfo[rid].token_start_offset + + bc->requestsInfo[rid].num_tokens_in_batch); + assert(num_new_tokens == bc->requestsInfo[rid].num_tokens_in_batch); assert(num_tokens_received_so_far >= (int64_t)num_new_tokens); // ----------------------- C++ computations ------------------------------- @@ -1141,10 +1178,13 @@ void IncMultiHeadSelfAttention::inference_task( assert(qk_softmax[r].sizes()[2] == m->num_heads); // ------------------- Loading CUDA results for this step --------------- - float 
converted_qk_prod[num_new_tokens][num_tokens_received_so_far] - [num_heads] = {0}; - float converted_qk_prod_softmax[num_new_tokens][num_tokens_received_so_far] - [num_heads] = {0}; + float *converted_qk_prod = (float *)calloc( + num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + float *converted_qk_prod_softmax = (float *)calloc( + num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + std::vector converted_qk_prod_shape = { + (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_heads}; + for (size_t i = 0; i < num_new_tokens * num_tokens_received_so_far * num_heads; i++) { @@ -1153,10 +1193,14 @@ void IncMultiHeadSelfAttention::inference_task( size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); assert(new_t_idx < num_new_tokens && all_t_idx < num_tokens_received_so_far && head_idx < num_heads); - converted_qk_prod[new_t_idx][all_t_idx][head_idx] = - qk_prods_cpu[i + qk_prods_cpu_offset]; - converted_qk_prod_softmax[new_t_idx][all_t_idx][head_idx] = - qk_prods_softmax_cpu[i + qk_prods_cpu_offset]; + set_value_row_major(converted_qk_prod, + converted_qk_prod_shape, + {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, + qk_prods_cpu[i + qk_prods_cpu_offset]); + set_value_row_major(converted_qk_prod_softmax, + converted_qk_prod_shape, + {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, + qk_prods_softmax_cpu[i + qk_prods_cpu_offset]); } torch::Tensor qk_prods_cuda = torch::from_blob( converted_qk_prod, @@ -1193,6 +1237,8 @@ void IncMultiHeadSelfAttention::inference_task( // } assert(torch::allclose(qk_prods_cuda, qk_products[r], 1e-05, 1e-05)); assert(torch::allclose(qk_prods_softmax_cuda, qk_softmax[r], 1e-05, 1e-05)); + free(converted_qk_prod); + free(converted_qk_prod_softmax); // --------------------- C++ computations -------------------------- // Multiply softmax results by V @@ -1277,7 +1323,7 @@ void IncMultiHeadSelfAttention::inference_task( // ============================================================================= // Cleanup // ============================================================================= - + free(w_out); checkCUDA(cudaFreeHost(input_cpu)); checkCUDA(cudaFreeHost(weight_cpu)); checkCUDA(cudaFreeHost(output_cpu)); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 0fdecfe6d6..d2af3fa8e1 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -31,12 +31,13 @@ __global__ void build_w_out_tensor(float const *weight_ptr, int num_heads, int qkv_weight_block_size) { CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { - int row_idx = i % vProjSize; - int col_idx = (i / vProjSize) % oProjSize; + int v_idx = i % vProjSize; + int o_idx = (i / vProjSize) % oProjSize; int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[i] = + contiguous_weight_ptr[o_idx * vProjSize * num_heads + head_idx * vProjSize + + v_idx] = weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + - qkv_weight_block_size + col_idx * vProjSize + row_idx]; + qkv_weight_block_size + o_idx * vProjSize + v_idx]; } } From 41aa96a7f8bfa496a3b37d2de92da3fcd79c1e6f Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 30 Apr 2023 14:53:19 -0500 Subject: [PATCH 100/344] Add a RequestManager to manage all requests and set BatchConfig for each inference batch (#697) * Support multiple FFModels in a single top_level_task * initial implementation * initial implementation * [RequestManager] more work * [RequestManager] 
initial implementation done * bug fix --- examples/cpp/inference/LLAMA/dataloader.cc | 12 +- examples/cpp/inference/LLAMA/dataloader.cu | 25 ++-- examples/cpp/inference/LLAMA/llama.cc | 65 ++++---- examples/cpp/inference/dataloader.cc | 11 +- examples/cpp/inference/dataloader.cu | 28 ++-- .../cpp/inference/mixture_of_experts/moe.cc | 2 +- .../inference/transformers/transformers.cc | 2 +- include/flexflow/batch_config.h | 40 ++--- include/flexflow/inference.h | 31 ++++ include/flexflow/model.h | 3 + include/flexflow/runtime.h | 31 ++++ src/ops/arg_topk.cc | 2 +- src/ops/inc_multihead_self_attention.cc | 2 +- src/runtime/batch_config.cc | 84 ++++++----- src/runtime/inference_manager.cc | 34 ++++- src/runtime/model.cc | 81 ++++++---- src/runtime/request_manager.cc | 139 ++++++++++++++++++ src/runtime/request_manager.cu | 50 +++++++ 18 files changed, 486 insertions(+), 156 deletions(-) create mode 100644 include/flexflow/runtime.h create mode 100644 src/runtime/request_manager.cc create mode 100644 src/runtime/request_manager.cu diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index fa69324a96..1370d3f724 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -224,25 +224,25 @@ void DataLoader::store_outputs(BatchConfig *bc, batch_predictions.clear(); // size_t guid = bc->tokensInfo[0].guid; - size_t guid = bc->requestsInfo[bc->tokensInfo[0].request_index].guid; + auto guid = bc->requestsInfo[bc->tokensInfo[0].request_index].request_guid; - size_t start_idx = bc->tokensInfo[0].abs_depth_in_request; + int start_idx = bc->tokensInfo[0].abs_depth_in_request; // only store the last token of each req for (size_t i = 0; i <= bc->num_active_tokens(); i++) { - size_t current_guid = - bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + auto current_guid = + bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; if (i == bc->num_active_tokens() || current_guid != guid) { int result_index = bc->tokensInfo[i - 1].abs_depth_in_request - start_idx; - batch_predictions[guid] = ir.results[i - 1]; + batch_predictions[guid] = ir.token_ids[i - 1]; std::cout << "i: " << i << ", dds-" << guid << ", result index" << result_index << ", result value: " << batch_predictions[guid] << "\n"; if (i < bc->num_active_tokens()) { - guid = bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + guid = bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; start_idx = bc->tokensInfo[i].abs_depth_in_request; } } diff --git a/examples/cpp/inference/LLAMA/dataloader.cu b/examples/cpp/inference/LLAMA/dataloader.cu index 4fea090b63..e32e3ddc33 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cu +++ b/examples/cpp/inference/LLAMA/dataloader.cu @@ -50,19 +50,19 @@ void DataLoader::load_input(Task const *task, coord_t batch_size = batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; - size_t guid = bc->requestsInfo[bc->tokensInfo[0].request_index].guid; - size_t start_idx = bc->tokensInfo[0].abs_depth_in_request; - size_t dst_idx = 0; + auto guid = bc->requestsInfo[bc->tokensInfo[0].request_index].request_guid; + int start_idx = bc->tokensInfo[0].abs_depth_in_request; + int dst_idx = 0; for (int i = 0; i <= bc->num_active_tokens(); i++) { - size_t current_guid = - bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + auto current_guid = + bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; if (i == bc->num_active_tokens() || current_guid != guid) { - size_t tokens_to_copy = + 
int tokens_to_copy = (bc->tokensInfo[i - 1].abs_depth_in_request - start_idx + 1); - size_t request_index = bc->tokensInfo[i - 1].request_index; - size_t token_start_offset = + int request_index = bc->tokensInfo[i - 1].request_index; + int token_start_offset = bc->requestsInfo[request_index].token_start_offset; std::cout << "size to copy: " << tokens_to_copy @@ -70,7 +70,7 @@ void DataLoader::load_input(Task const *task, if (tokens_to_copy > 1 || token_start_offset == 0) { // token pos < init length, the init length is the input sentence length // so this is the initial input, load from file. - size_t copy_start_index = guid * llamaconfig.sentence_len; + int copy_start_index = guid * llamaconfig.sentence_len; std::cout << "copy index: " << copy_start_index << "\n"; copy_kernel<<>>( batch_input.ptr + dst_idx, @@ -92,11 +92,14 @@ void DataLoader::load_input(Task const *task, << ", dst_idx: " << dst_idx << ", token:" << token << "\n"; long *dst_ptr = batch_input.ptr + dst_idx; - cudaMemcpy(dst_ptr, &token, sizeof(long), cudaMemcpyHostToDevice); + cudaMemcpy(dst_ptr, + &token, + sizeof(FlexFlow::RequestManager::TokenId), + cudaMemcpyHostToDevice); } if (i < bc->num_active_tokens()) { - guid = bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + guid = bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; start_idx = bc->tokensInfo[i].abs_depth_in_request; } dst_idx = i; diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index 3d745d8bd5..f05526ce9b 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -58,7 +58,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor input; { int const token_dims[] = {llamaConfig.batchSize, llamaConfig.max_seq_len}; - input = ff.create_tensor<2>(token_dims, DT_INT64); + input = ff.create_tensor<2>(token_dims, DT_INT32); } Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); @@ -79,7 +79,7 @@ void FlexFlow::top_level_task(Task const *task, // } // n transformer blocks impl - for (int i = 0; i < 1; i++) { + for (int i = 0; i < 10; i++) { // step 1: attention std::vector axes = {2}; Tensor att_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); @@ -145,6 +145,7 @@ void FlexFlow::top_level_task(Task const *task, std::cout << "------start compile ----------" << std::endl; InferenceManager im(&ff, llamaConfig.batchSize, 1); im.compile_model_and_allocate_buffer(); + RequestManager rm; std::cout << "------init ops----------" << std::endl; im.init_operators_inference(); @@ -203,18 +204,36 @@ void FlexFlow::top_level_task(Task const *task, //------------------------------ do inference--------------------------- int processed_requests = 0; std::map future_handlers; - std::map batch_configs; - BatchConfig *bc = nullptr; + std::map batch_configs; std::map batch_predictions[1]; loader.reset(); - bool new_req = true; + for (int i = 0; i < llamaConfig.batchSize; i++) { + std::vector tokens{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + }; + rm.register_new_request(tokens, 347); + } while (processed_requests < llamaConfig.sentence_len) { int bid = 0; size_t max_reqs, max_tkns; if (future_handlers.find(bid) == future_handlers.end()) { - bc = new BatchConfig(); + BatchConfig bc; + InferenceResult ir; + bc = rm.prepare_next_batch(bc, ir); + std::cout << "new tokens: " << bc.num_tokens; + FutureMap fm = im.inference(bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; } else 
{ // have luanched this bid Future future = future_handlers[bid]; @@ -225,33 +244,15 @@ void FlexFlow::top_level_task(Task const *task, } // process end InferenceResult ir = future.get_result(); - bc = batch_configs[bid]; - - std::cout << "store outputs start...." << std::endl; - loader.store_outputs(bc, ir, batch_predictions[bid]); - processed_requests += bc->update_results(ir); - - if (!new_req) { - break; - } - new_req = false; + BatchConfig bc = batch_configs[bid]; + processed_requests += bc.num_tokens; + bc = rm.prepare_next_batch(bc, ir); + std::cout << "new tokens: " << bc.num_tokens; + FutureMap fm = im.inference(bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; } - // batch cofig register 5 reqs - // init length relate to the min_prompt_size for llama - if (new_req) { - for (int i = 0; i < llamaConfig.batchSize; i++) { - assert(bc->register_new_request(i, llamaConfig.max_seq_len, 347)); - } - } - - bc->prepare_next_batch(); - std::cout << "new tokens: " << bc->num_active_tokens(); - loader.next_batch(ff, bc, batch_predictions[bid]); - - FutureMap fm = im.inference(bid, *bc); - assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; } // float* data diff --git a/examples/cpp/inference/dataloader.cc b/examples/cpp/inference/dataloader.cc index 67bcbdb648..ce49086a92 100644 --- a/examples/cpp/inference/dataloader.cc +++ b/examples/cpp/inference/dataloader.cc @@ -182,16 +182,17 @@ void DataLoader::store_outputs(BatchConfig *bc, // there is no num_samples, replace it with num_active_tokens batch_predictions.clear(); for (size_t i = 0; i < bc->num_active_tokens(); i++) { - size_t guid = bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + auto guid = bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; if (i == bc->num_active_tokens() - 1 || - guid != bc->requestsInfo[bc->tokensInfo[i + 1].request_index].guid) { + guid != bc->requestsInfo[bc->tokensInfo[i + 1].request_index] + .request_guid) { if (outputs.find(guid) == outputs.end()) { - std::vector v{ir.results[i]}; + std::vector v{ir.token_ids[i]}; outputs[guid] = v; } else { - outputs[guid].push_back(ir.results[i]); + outputs[guid].push_back(ir.token_ids[i]); } - batch_predictions[guid] = ir.results[i]; + batch_predictions[guid] = ir.token_ids[i]; } } assert(batch_predictions.size() == bc->num_active_requests()); diff --git a/examples/cpp/inference/dataloader.cu b/examples/cpp/inference/dataloader.cu index 80c53f175a..7fb3478020 100644 --- a/examples/cpp/inference/dataloader.cu +++ b/examples/cpp/inference/dataloader.cu @@ -14,6 +14,7 @@ */ #include "dataloader.h" +#include "flexflow/inference.h" #include "flexflow/utils/cuda_helper.h" void DataLoader::load_input(Task const *task, @@ -60,8 +61,8 @@ void DataLoader::load_input(Task const *task, // Currently assume continous indices assert(bc->num_active_tokens() <= batch_size * sequence_length); for (int i = 1; i < bc->num_active_tokens(); i++) { - size_t prev_guid = requestInfo[tokensInfo[i - 1].request_index].guid; - size_t guid = requestInfo[tokensInfo[i].request_index].guid; + auto prev_guid = requestInfo[tokensInfo[i - 1].request_index].request_guid; + auto guid = requestInfo[tokensInfo[i].request_index].request_guid; if (guid == prev_guid) { assert(tokensInfo[i].abs_depth_in_request == tokensInfo[i - 1].abs_depth_in_request + 1); @@ -74,23 +75,23 @@ void DataLoader::load_input(Task const *task, 
checkCUDA(cudaMemset( batch_input_ptr, 0, batch_input_domain.get_volume() * sizeof(int))); - size_t guid = requestInfo[tokensInfo[0].request_index].guid; - size_t start_idx = tokensInfo[0].abs_depth_in_request; - size_t dst_idx = 0; - size_t total_tokens = 0; + auto guid = requestInfo[tokensInfo[0].request_index].request_guid; + int start_idx = tokensInfo[0].abs_depth_in_request; + int dst_idx = 0; + int total_tokens = 0; for (size_t i = 1; i <= bc->num_active_tokens(); i++) { - size_t current_guid = requestInfo[tokensInfo[i].request_index].guid; + auto current_guid = requestInfo[tokensInfo[i].request_index].request_guid; if (i == bc->num_active_tokens() || current_guid != guid) { size_t tokens_to_copy = (tokensInfo[i - 1].abs_depth_in_request - start_idx + 1); assert(tokens_to_copy > 0); - size_t request_index = tokensInfo[i - 1].request_index; - size_t token_start_offset = + int request_index = tokensInfo[i - 1].request_index; + int token_start_offset = bc->requestsInfo[request_index].token_start_offset; - size_t num_processing_tokens = + int num_processing_tokens = bc->requestsInfo[request_index].num_tokens_in_batch; if (tokens_to_copy > 1 || token_start_offset == 0) { // initialization phase @@ -109,12 +110,15 @@ void DataLoader::load_input(Task const *task, assert(prev_batch_preds.find(guid) != prev_batch_preds.end()); int token = prev_batch_preds.at(guid); int *dst_ptr = batch_input_ptr + dst_idx; - cudaMemcpy(dst_ptr, &token, sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dst_ptr, + &token, + sizeof(FlexFlow::RequestManager::TokenId), + cudaMemcpyHostToDevice); } total_tokens += tokens_to_copy; if (i < bc->num_active_tokens()) { - guid = bc->requestsInfo[bc->tokensInfo[i].request_index].guid; + guid = bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; start_idx = tokensInfo[i].abs_depth_in_request; } dst_idx = i; diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 5ebd23a4c7..b3a9b4941a 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -205,7 +205,7 @@ void FlexFlow::top_level_task(Task const *task, InferenceResult ir = future.get_result(); bc = batch_configs[bid]; data_loader.store_outputs(bc, ir, batch_predictions[bid]); - processed_requests += bc->update_results(ir); + processed_requests += bc->update_results(&ir); max_reqs = moeConfig.incremental_mode ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() : im.max_num_requests_per_batch; diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 233b1dcaa1..8a8c0cb53e 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -184,7 +184,7 @@ void FlexFlow::top_level_task(Task const *task, InferenceResult ir = future.get_result(); bc = batch_configs[bid]; data_loader.store_outputs(bc, ir, batch_predictions[bid]); - processed_requests += bc->update_results(ir); + processed_requests += bc->update_results(&ir); max_reqs = transformerConfig.incremental_mode ? 
bc->MAX_NUM_REQUESTS - bc->num_active_requests() : im.max_num_requests_per_batch; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 674fac4ced..a397be28c3 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,50 +20,54 @@ // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 #define MAX_SEQ_LEN 20 -#define BATCH_SIZE 16 -#define MAX_REQUESTS 256 +// #define BATCH_SIZE 16 +// #define MAX_REQUESTS 256 namespace FlexFlow { -struct InferenceResult { - static int const MAX_NUM_TOKENS = MAX_SEQ_LEN * BATCH_SIZE; - int results[MAX_NUM_TOKENS]; -}; +class InferenceResult; class BatchConfig { public: + using RequestGuid = size_t; + using TokenId = int; BatchConfig(); bool register_new_request(size_t guid, int initial_len, int tokens_to_generate); void prepare_next_batch(); - int update_results(InferenceResult const &ir); + int update_results(InferenceResult const *ir); void update_num_active_requests_tokens(); int num_active_requests() const; int num_active_tokens() const; void print() const; - static int const MAX_NUM_REQUESTS = MAX_REQUESTS; - static int const MAX_NUM_TOKENS = InferenceResult::MAX_NUM_TOKENS; + static int const MAX_NUM_REQUESTS = 8; + static int const MAX_NUM_TOKENS = 64; // These are set by update - int num_tokens, num_requests; - bool cached_results; + int num_tokens; struct PerRequestInfo { - size_t token_start_offset; - size_t num_tokens_in_batch; - size_t guid; + int token_start_offset; + int num_tokens_in_batch; + int max_sequence_length; + RequestGuid request_guid; }; struct PerTokenInfo { - size_t abs_depth_in_request; - size_t request_index; - size_t value; + int abs_depth_in_request; + int request_index; + TokenId token_id; }; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; - size_t max_sequence_length[MAX_NUM_REQUESTS]; + // size_t max_sequence_length[MAX_NUM_REQUESTS]; bool request_completed[MAX_NUM_REQUESTS]; }; +struct InferenceResult { + static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; + BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; +}; + }; // namespace FlexFlow diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 87cc80e055..0415d85d11 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -17,6 +17,7 @@ #include "flexflow/batch_config.h" #include "flexflow/model.h" +#include namespace FlexFlow { @@ -31,6 +32,8 @@ class InferenceManager { void init_operators_inference(); MachineView *get_machine_view(int mv_id); Legion::FutureMap inference(int index, BatchConfig const &bc); + void load_input_tokens_from_batch_config(BatchConfig const &bc, + ParallelTensor const input); public: std::unordered_map> tensor_buffer; @@ -41,4 +44,32 @@ class InferenceManager { std::vector machine_views; }; +struct Request { + BatchConfig::RequestGuid guid; + int max_sequence_length; + std::vector tokens; +}; + +class RequestManager { +public: + using RequestGuid = BatchConfig::RequestGuid; + using TokenId = BatchConfig::TokenId; + RequestManager(); + RequestGuid register_new_request(std::vector const &prompt, + int max_sequence_length); + BatchConfig prepare_next_batch(BatchConfig const &bc, + InferenceResult const &result); + static void + load_tokens_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + +private: + std::queue pending_request_queue; + std::unordered_map running_request_queue; + std::mutex request_queue_mutex; + RequestGuid 
next_available_guid; +}; + } // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 40080c9840..d797158530 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -30,6 +30,7 @@ #include "optimizer.h" #include "parallel_tensor.h" #include "recompile.h" +#include "runtime.h" #include "simulator.h" #include "tensor.h" #include "tl/optional.hpp" @@ -206,6 +207,8 @@ enum TaskIDs { FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, + // InferenceManager & RequestManager + RM_LOAD_TOKENS_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, diff --git a/include/flexflow/runtime.h b/include/flexflow/runtime.h new file mode 100644 index 0000000000..e1371300ec --- /dev/null +++ b/include/flexflow/runtime.h @@ -0,0 +1,31 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_RUNTIME_H_ +#define _FLEXFLOW_RUNTIME_H_ + +#include "config.h" + +namespace FlexFlow { + +class FFRuntime { +public: + FFRuntime(FFConfig &config); + FFHandler handlers[MAX_NUM_WORKERS]; +}; + +} // namespace FlexFlow + +#endif // _FLEXFLOW_RUNTIME_H_ diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 8cfc4c38d4..5636b7b924 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -331,7 +331,7 @@ InferenceResult m, in_ptr, index_ptr, batch_size, length, k, m->sorted); InferenceResult ir; - download_tensor(index_ptr, ir.results, batch_size); + download_tensor(index_ptr, ir.token_ids, batch_size); return ir; } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index a0674eddb6..8976703c6f 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -506,7 +506,7 @@ FutureMap IncMultiHeadSelfAttention::inference( int idx = 0; printf("BatchConfig, num_tokens: %d, num_requests: %d\n", bc.num_tokens, - bc.num_requests); + bc.num_active_requests()); IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, TaskArgument(&bc, sizeof(BatchConfig)), diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 01c1df551c..7412a184c6 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -23,22 +23,22 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_bc("BatchConfig"); BatchConfig::BatchConfig() { - cached_results = false; + num_tokens = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].token_start_offset = 0; requestsInfo[i].num_tokens_in_batch = 0; request_completed[i] = true; } for (int i = 0; i < MAX_NUM_TOKENS; i++) { - tokensInfo[i].abs_depth_in_request = SIZE_MAX; - tokensInfo[i].request_index = SIZE_MAX; - tokensInfo[i].value = SIZE_MAX; + tokensInfo[i].abs_depth_in_request = -1; + tokensInfo[i].request_index = -1; + tokensInfo[i].token_id = -1; } - update_num_active_requests_tokens(); } -int 
BatchConfig::update_results(InferenceResult const &ir) { - cached_results = false; +// Deprecated API; should use RequestManager::update_batch +int BatchConfig::update_results(InferenceResult const *ir) { + assert(false); int completed = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (request_completed[i]) { @@ -47,11 +47,11 @@ int BatchConfig::update_results(InferenceResult const &ir) { assert(requestsInfo[i].num_tokens_in_batch > 0); int processed_tokens = requestsInfo[i].token_start_offset + requestsInfo[i].num_tokens_in_batch; - if (processed_tokens >= max_sequence_length[i] + if (processed_tokens >= requestsInfo[i].max_sequence_length // || ir.results[t] == 0 TODO: replace this with ) { log_bc.print("[Done] guid(%zu) final_length(%d)", - requestsInfo[i].guid, + requestsInfo[i].request_guid, processed_tokens); request_completed[i] = true; requestsInfo[i].num_tokens_in_batch = 0; @@ -62,22 +62,23 @@ int BatchConfig::update_results(InferenceResult const &ir) { requestsInfo[i].num_tokens_in_batch = 1; } } - update_num_active_requests_tokens(); return completed; } +// Deprecated API; RequestManager::new_batch and RequestManager::update_batch +// automatically register new requests. bool BatchConfig::register_new_request(size_t guid, int initial_len, int tokens_to_generate) { - cached_results = false; + assert(false); assert(initial_len > 0 && tokens_to_generate > 0); for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (request_completed[i]) { log_bc.print("[NewRequest] guid(%zu) length(%d)", guid, initial_len); requestsInfo[i].token_start_offset = 0; requestsInfo[i].num_tokens_in_batch = initial_len; - requestsInfo[i].guid = guid; - max_sequence_length[i] = initial_len + tokens_to_generate; + requestsInfo[i].request_guid = guid; + requestsInfo[i].max_sequence_length = initial_len + tokens_to_generate; request_completed[i] = false; update_num_active_requests_tokens(); return true; @@ -87,55 +88,63 @@ bool BatchConfig::register_new_request(size_t guid, return false; } +// Deprecated API void BatchConfig::prepare_next_batch() { - assert(cached_results); - assert(num_requests > 0 && num_tokens > 0); + assert(false); + assert(num_tokens > 0); log_bc.print("[NextBatch] num_tokens(%d)", num_tokens); } +// Deprecated API; cannot use this since we need to +// add token_id, which is missing in this API void BatchConfig::update_num_active_requests_tokens() { - num_requests = 0; + assert(false); num_tokens = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (!request_completed[i]) { - num_requests++; + int start_idx = requestsInfo[i].token_start_offset; for (int j = 0; j < requestsInfo[i].num_tokens_in_batch; j++) { - int start_idx = requestsInfo[i].token_start_offset; tokensInfo[num_tokens].abs_depth_in_request = start_idx + j; tokensInfo[num_tokens].request_index = i; num_tokens++; } } } - cached_results = true; } int BatchConfig::num_active_requests() const { - if (cached_results) { - return num_requests; - } else { - assert(false && - "some BatchConfig functions updated requests but didn't call " - "() before exit"); + int num_requests = 0; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (!request_completed[i]) { + num_requests++; + } } + return num_requests; + // if (cached_results) { + // return num_requests; + // } else { + // assert(false && + // "some BatchConfig functions updated requests but didn't call " + // "() before exit"); + // } } int BatchConfig::num_active_tokens() const { - if (cached_results) { - return num_tokens; - } else { - assert(false && - "some BatchConfig functions 
updated requests but didn't call " - "update_num_active_requests_tokens() before exit"); - } + // if (cached_results) { + return num_tokens; + //} else { + // assert(false && + // "some BatchConfig functions updated requests but didn't call " + // "update_num_active_requests_tokens() before exit"); + //} } void BatchConfig::print() const { std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; std::cout << "Number of tokens: " << num_tokens << std::endl; - std::cout << "Number of requests: " << num_requests << std::endl; - std::cout << "Cached results: " << cached_results << std::endl; + std::cout << "Number of requests: " << num_active_requests() << std::endl; + // std::cout << "Cached results: " << cached_results << std::endl; std::cout << "Per-request info:\n"; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { @@ -145,9 +154,9 @@ void BatchConfig::print() const { << requestsInfo[i].token_start_offset << std::endl; std::cout << " Number of tokens in batch: " << requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << " GUID: " << requestsInfo[i].guid << std::endl; - std::cout << " Max sequence length: " << max_sequence_length[i] - << std::endl; + std::cout << " GUID: " << requestsInfo[i].request_guid << std::endl; + std::cout << " Max sequence length: " + << requestsInfo[i].max_sequence_length << std::endl; std::cout << " Request completed: " << request_completed[i] << std::endl; } @@ -160,6 +169,7 @@ void BatchConfig::print() const { << tokensInfo[i].abs_depth_in_request << std::endl; std::cout << " Request index: " << tokensInfo[i].request_index << std::endl; + std::cout << " Token id: " << tokensInfo[i].token_id << std::endl; } } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 60294d4a75..a65fa184f8 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 CMU, Stanford, Facebook, LANL +/* Copyright 2023 CMU, Stanford, Facebook, LANL * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -148,11 +148,21 @@ FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { // on the device_index-th device (except for the experts layers) int batch_index = index % max_num_inflight_batches; FutureMap fm; + bool found_input_operator = false; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; - if (op->op_type == OP_WEIGHT || op->op_type == OP_INPUT) { + if (op->op_type == OP_WEIGHT) { continue; } + if (op->op_type == OP_INPUT) { + //FIXME: this is a hack, should be replace with an input ParallelTensor + if (found_input_operator) + continue; + found_input_operator = true; + assert(op->numOutputs == 1); + ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; + load_input_tokens_from_batch_config(bc, pt); + } std::vector inputs(op->numInputs); std::vector outputs(op->numOutputs); @@ -179,4 +189,24 @@ FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { return fm; }; +void InferenceManager::load_input_tokens_from_batch_config( + BatchConfig const &bc, ParallelTensor const input) { + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + size_t machine_view_hash = input->machine_view.hash(); + ArgumentMap argmap; + IndexLauncher launcher(RM_LOAD_TOKENS_TASK_ID, + input->parallel_is, + TaskArgument(&bc, sizeof(BatchConfig)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement( + input->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, input->region)); + launcher.add_field(0, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 915688f3c8..328d4cc3af 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1252,37 +1252,9 @@ OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { } } -FFModel::FFModel(FFConfig &_config) - : op_global_guid(OP_GUID_FIRST_VALID), - layer_global_guid(LAYER_GUID_FIRST_VALID), - tensor_global_guid(TENSOR_GUID_FIRST_VALID), - parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), - node_global_guid(NODE_GUID_FIRST_VALID), config(_config), optimizer(NULL), - loss_op(NULL), metrics_op(NULL), simulator(NULL) { - this->search = new PCG::SearchHelper(this); - this->graph_search = new PCG::GraphSearchHelper(this); - +FFRuntime::FFRuntime(FFConfig &config) { Runtime *runtime = config.lg_hlr; Context ctx = config.lg_ctx; - // Register machine views - register_all_machine_views(config.numNodes, - config.workersPerNode, - config.cpusPerNode, - all_valid_views); - metrics_input = -1; - // Load strategy file - // Create field space - { - FieldAllocator allocator = - runtime->create_field_allocator(ctx, config.field_space); - allocator.allocate_field(sizeof(float), FID_DATA); - } - // Build training dataset - // if (config.datasetPath.length() == 0) { - // dataLoader = NULL; - //} else { - // dataLoader = new DataLoader(config.datasetPath); - //} ArgumentMap argmap; Rect<1> task_rect(Point<1>(0), @@ -1316,6 +1288,48 @@ FFModel::FFModel(FFConfig &_config) } } +FFRuntime *ffruntime_singleton = nullptr; + +FFModel::FFModel(FFConfig &_config) + : op_global_guid(OP_GUID_FIRST_VALID), + layer_global_guid(LAYER_GUID_FIRST_VALID), + tensor_global_guid(TENSOR_GUID_FIRST_VALID), + parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), + node_global_guid(NODE_GUID_FIRST_VALID), config(_config), optimizer(NULL), + loss_op(NULL), metrics_op(NULL), 
simulator(NULL) { + this->search = new PCG::SearchHelper(this); + this->graph_search = new PCG::GraphSearchHelper(this); + + if (ffruntime_singleton == nullptr) { + ffruntime_singleton = new FFRuntime(_config); + } + + Runtime *runtime = config.lg_hlr; + Context ctx = config.lg_ctx; + // Register machine views + register_all_machine_views(config.numNodes, + config.workersPerNode, + config.cpusPerNode, + all_valid_views); + metrics_input = -1; + // Load strategy file + // Create field space + { + FieldAllocator allocator = + runtime->create_field_allocator(ctx, config.field_space); + allocator.allocate_field(sizeof(float), FID_DATA); + } + // Build training dataset + // if (config.datasetPath.length() == 0) { + // dataLoader = NULL; + //} else { + // dataLoader = new DataLoader(config.datasetPath); + //} + for (int idx = 0; idx < config.workersPerNode * config.numNodes; idx++) { + handlers[idx] = ffruntime_singleton->handlers[idx]; + } +} + void FFModel::clear_graph_search_cache() { this->graph_search->clear_cache(); this->search->clear_cache(); @@ -3853,6 +3867,15 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "cuda_init_task"); } + // RequestManager load_tokens + { + TaskVariantRegistrar registrar(RM_LOAD_TOKENS_TASK_ID, + "RequestManager Load Tokens"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "RequestManager Load Tokens Task"); + } // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc new file mode 100644 index 0000000000..7374d36bfa --- /dev/null +++ b/src/runtime/request_manager.cc @@ -0,0 +1,139 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/parallel_ops/parallel_op.h" + +namespace FlexFlow { + +using namespace Legion; + +LegionRuntime::Logger::Category log_req_mgr("RequestManager"); + +RequestManager::RequestManager() : next_available_guid(1000000) {} + +RequestManager::RequestGuid + RequestManager::register_new_request(std::vector const &prompt, + int max_sequence_length) { + const std::lock_guard lock(request_queue_mutex); + + // Add a new request + Request request; + request.guid = next_available_guid++; + request.max_sequence_length = max_sequence_length; + request.tokens = prompt; + + pending_request_queue.push(request); + return request.guid; +} + +BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, + InferenceResult const &result) { + const std::lock_guard lock(request_queue_mutex); + // Step 1: use result to update requests + for (int i = 0; i < old_bc.num_tokens; i++) { + size_t guid = + old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; + Request &request = running_request_queue[guid]; + if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { + // This is a prompt token + continue; + } else { + assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == + request.tokens.size()); + // This is a decoding token + request.tokens.push_back(result.token_ids[i]); + } + } + // Step 2: preparing the next batch for existing requests + BatchConfig new_bc; + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_bc.request_completed[i]) { + continue; + } + assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); + Request &request = + running_request_queue[old_bc.requestsInfo[i].request_guid]; + int processed_tokens = old_bc.requestsInfo[i].token_start_offset + + old_bc.requestsInfo[i].num_tokens_in_batch; + assert(processed_tokens < request.tokens.size()); + if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length + // || ir.results[t] == 0 TODO: replace this with + ) { + log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", + old_bc.requestsInfo[i].request_guid, + request.tokens.size()); + } else { + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + if (new_bc.requestsInfo[i].token_start_offset + 1 == + request.tokens.size()) { + // Incremental phase + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + } else { + // Prompt phase + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].token_start_offset); + } + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].token_start_offset + j; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + assert(depth < request.tokens.size()); + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.num_tokens++; + } + } + } + // Step 3: add new requests to the next batch + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (new_bc.request_completed[i]) { + if (!pending_request_queue.empty() && + new_bc.num_tokens < BatchConfig::MAX_NUM_TOKENS) { + Request const &new_request = pending_request_queue.front(); + pending_request_queue.pop(); + 
running_request_queue[new_request.guid] = new_request; + new_bc.requestsInfo[i].token_start_offset = 0; + new_bc.requestsInfo[i].request_guid = new_request.guid; + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)new_request.tokens.size()); + new_bc.requestsInfo[i].max_sequence_length = + new_request.max_sequence_length; + new_bc.request_completed[i] = false; + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].token_start_offset + j; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + assert(depth < new_request.tokens.size()); + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_request.tokens[depth]; + new_bc.num_tokens++; + } + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } + } + } + } + return new_bc; +} + +}; // namespace FlexFlow diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu new file mode 100644 index 0000000000..32e872125c --- /dev/null +++ b/src/runtime/request_manager.cu @@ -0,0 +1,50 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +using namespace Legion; + +void RequestManager::load_tokens_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 1); + assert(task->regions.size() == 1); + + BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; + for (int i = 0; i < batch_config.num_tokens; i++) { + dram_copy[i] = batch_config.tokensInfo[i].token_id; + } + TokenId *fb_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + assert(batch_config.num_tokens <= domain.get_volume()); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDA(cudaMemcpyAsync(fb_ptr, + dram_copy, + sizeof(TokenId) * batch_config.num_tokens, + cudaMemcpyHostToDevice, + stream)); +} + +}; // namespace FlexFlow From dbaa1f0e999f5dd59b38255d3c8d4d09a28d3c09 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 30 Apr 2023 22:27:19 -0400 Subject: [PATCH 101/344] add llama to CI, comment out other tests --- examples/cpp/inference/LLAMA/llama.cc | 11 +---------- src/runtime/inference_manager.cc | 5 +++-- tests/cpp_gpu_tests.sh | 10 ++++++---- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index f05526ce9b..910d3e84b3 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -209,16 +209,7 @@ void FlexFlow::top_level_task(Task const *task, loader.reset(); for (int i = 0; i < llamaConfig.batchSize; i++) { - std::vector tokens{ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 - }; + std::vector tokens{0, 0, 0, 0, 0, 0, 0, 0}; rm.register_new_request(tokens, 347); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index a65fa184f8..5aeee2609c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -155,9 +155,10 @@ FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { continue; } if (op->op_type == OP_INPUT) { - //FIXME: this is a hack, should be replace with an input ParallelTensor - if (found_input_operator) + // FIXME: this is a hack, should be replace with an input ParallelTensor + if (found_input_operator) { continue; + } found_input_operator = true; assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 591b0e82be..fbf33cc04d 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -47,8 +47,9 @@ if [[ -f "$FF_HOME/build/examples/cpp/AlexNet/alexnet" ]]; then # "$FF_HOME"/build/examples/cpp/split_test/split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # "$FF_HOME"/build/examples/cpp/split_test_2/split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # Inference examples - "$FF_HOME"/build/examples/cpp/inference/mixture_of_experts/inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel - "$FF_HOME"/build/examples/cpp/inference/transformers/inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + 
"$FF_HOME"/build/examples/cpp/inference/LLAMA/LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + #"$FF_HOME"/build/examples/cpp/inference/mixture_of_experts/inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + #"$FF_HOME"/build/examples/cpp/inference/transformers/inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel else python_packages=$(python -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))") OLD_PATH="$PATH" @@ -78,8 +79,9 @@ else # split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # Inference examples - inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel - inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + #inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + #inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel fi done export PATH="$OLD_PATH" From 91750d8d79a49aaaaff42841b0e778f6242dccd0 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 1 May 2023 20:11:52 -0500 Subject: [PATCH 102/344] Support multi-GPU inference (#699) * Support multiple FFModels in a single top_level_task * initial implementation * initial implementation * [RequestManager] more work * [RequestManager] initial implementation done * bug fix * Support multi-GPU inference * . 
* fix machine view (#702) --------- Co-authored-by: xinhaoc <99570243+xinhaoc@users.noreply.github.com> --- examples/cpp/inference/LLAMA/dataloader.cc | 38 +-- examples/cpp/inference/LLAMA/llama.cc | 40 ++- examples/cpp/inference/LLAMA/llama.h | 2 +- .../cpp/inference/mixture_of_experts/moe.cc | 13 +- .../inference/transformers/transformers.cc | 13 +- include/flexflow/inference.h | 12 +- include/flexflow/model.h | 1 + src/ops/kernels/embedding_kernels.cu | 2 +- src/ops/rms_norm.cc | 2 +- src/runtime/batch_config.cc | 6 +- src/runtime/inference_manager.cc | 314 +++++++++++++++++- src/runtime/model.cc | 6 +- src/runtime/parallel_tensor.cc | 4 +- src/runtime/request_manager.cc | 2 +- 14 files changed, 391 insertions(+), 64 deletions(-) diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index 1370d3f724..1bfadde559 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -29,27 +29,27 @@ DataLoader::DataLoader(FFModel &ff, // Assume only the first dim can be the replica dim assert(i == num_dims - 1 || (!dims[i].is_replica_dim)); } - dims[num_dims - 1].size = num_samples; - full_input = - ff.create_parallel_tensor_legion_ordering(num_dims, dims, DT_INT64); - assert(full_input != nullptr && "full_input is nullptr"); - ff.map_tensor(full_input, NULL /*parallel_op*/); + // dims[num_dims - 1].size = num_samples; + // full_input = + // ff.create_parallel_tensor_legion_ordering(num_dims, dims, DT_INT64); + // assert(full_input != nullptr && "full_input is nullptr"); + // ff.map_tensor(full_input, NULL /*parallel_op*/); } - size_t llamaconfig_size = sizeof(llamaconfig); - std::cout << "llama config dataloader: " << llamaconfig->input_path; - - // Load entire dataset - TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, - TaskArgument(llamaconfig, llamaconfig_size)); - // regions[1]: full_input - launcher.add_region_requirement(RegionRequirement(full_input->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_FB_MEMORY)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); + // size_t llamaconfig_size = sizeof(llamaconfig); + // std::cout << "llama config dataloader: " << llamaconfig->input_path; + + // // Load entire dataset + // TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, + // TaskArgument(llamaconfig, llamaconfig_size)); + // // regions[1]: full_input + // launcher.add_region_requirement(RegionRequirement(full_input->region, + // WRITE_ONLY, + // EXCLUSIVE, + // full_input->region, + // MAP_TO_FB_MEMORY)); + // launcher.add_field(0, FID_DATA); + // runtime->execute_task(ctx, launcher); } void DataLoader::load_entire_dataset(Task const *task, diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index 910d3e84b3..321f870a8f 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -44,6 +44,20 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; LLAMAConfig llamaConfig; FFModel ff(ffconfig); + //------------------------------compute machine views ------------------ + int num_devices = ffconfig.workersPerNode * ffconfig.numNodes; + std::vector machine_views; + for (int i = 0; i < num_devices; i++) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = 1; + view.stride[0] = 0; + view.start_device_id = i; + machine_views.push_back(view); + } + + std::unordered_map> mapping; std::unordered_map weights_layers; // InputArgs const &command_args = 
HighLevelRuntime::get_input_args(); @@ -60,6 +74,7 @@ void FlexFlow::top_level_task(Task const *task, int const token_dims[] = {llamaConfig.batchSize, llamaConfig.max_seq_len}; input = ff.create_tensor<2>(token_dims, DT_INT32); } + mapping[input].push_back(machine_views[0]); Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); Tensor token = ff.embedding(input, @@ -79,11 +94,19 @@ void FlexFlow::top_level_task(Task const *task, // } // n transformer blocks impl - for (int i = 0; i < 10; i++) { + int num_transformer_layers_per_gpu = (32 + num_devices - 1) / num_devices; + for (int i = 0; i < 32; i++) { // step 1: attention std::vector axes = {2}; Tensor att_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); Layer *attention_norm = ff.layers.back(); + if (i % num_transformer_layers_per_gpu == 0) { + // Map att_norm to the next GPU + // since the size of att_norm is minimum across + // all tensors + mapping[att_norm].push_back( + machine_views[i / num_transformer_layers_per_gpu]); + } weights_layers.emplace("layers_" + std::to_string(i) + "_attention_norm_weight", attention_norm); @@ -144,13 +167,9 @@ void FlexFlow::top_level_task(Task const *task, //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; InferenceManager im(&ff, llamaConfig.batchSize, 1); - im.compile_model_and_allocate_buffer(); + im.compile_model_and_allocate_buffer(&ff, mapping); RequestManager rm; - std::cout << "------init ops----------" << std::endl; - im.init_operators_inference(); - std::cout << "------model compiled and init ----------" << std::endl; - //------------------------------ load inputs -------------------------- std::cout << "------create dataloaders ----------" << std::endl; // read prompt into input @@ -198,8 +217,13 @@ void FlexFlow::top_level_task(Task const *task, ParallelTensor weight_pt; ff.get_parallel_tensor_from_tensor(weight, weight_pt); weight_pt->set_tensor(&ff, dims_vec, data); + delete data; } std::cout << "------load wieght finished----------" << std::endl; + //------------------------------ init operators ------------------------ + std::cout << "------init ops----------" << std::endl; + im.init_operators_inference(&ff); + std::cout << "------model compiled and init ----------" << std::endl; //------------------------------ do inference--------------------------- int processed_requests = 0; @@ -221,7 +245,7 @@ void FlexFlow::top_level_task(Task const *task, InferenceResult ir; bc = rm.prepare_next_batch(bc, ir); std::cout << "new tokens: " << bc.num_tokens; - FutureMap fm = im.inference(bid, bc); + FutureMap fm = im.inference(&ff, bid, bc); assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); batch_configs[bid] = bc; @@ -239,7 +263,7 @@ void FlexFlow::top_level_task(Task const *task, processed_requests += bc.num_tokens; bc = rm.prepare_next_batch(bc, ir); std::cout << "new tokens: " << bc.num_tokens; - FutureMap fm = im.inference(bid, bc); + FutureMap fm = im.inference(&ff, bid, bc); assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); batch_configs[bid] = bc; diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h index 978eb2bf10..75b7d3ff1b 100644 --- a/examples/cpp/inference/LLAMA/llama.h +++ b/examples/cpp/inference/LLAMA/llama.h @@ -41,7 +41,7 @@ struct LLAMAConfig { // todo from args weight_file_path = - 
"/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/weights/"; + "/home/ubuntu/FlexFlow_Inference/examples/cpp/inference/LLAMA/weights/"; input_path = "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/tokens/" "llama_demo_tokens"; diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index b3a9b4941a..22752db39a 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -141,8 +141,9 @@ void FlexFlow::top_level_task(Task const *task, //------------------- Initialize the inference manager ------------------ InferenceManager im( &ff, moeConfig.batch_size, moeConfig.num_inflight_batches); - im.compile_model_and_allocate_buffer(); - im.init_operators_inference(); + std::unordered_map> mapping; + im.compile_model_and_allocate_buffer(&ff, mapping); + im.init_operators_inference(&ff); //------------ Initialize the data loader and data generator ------------ /*size_t min_input_tokens = 32, max_input_tokens = 512, @@ -185,7 +186,7 @@ void FlexFlow::top_level_task(Task const *task, BatchConfig *bc = nullptr; std::map batch_predictions[im.max_num_inflight_batches]; - assert(im.max_num_requests_per_batch == moeConfig.batch_size); + assert(im.max_num_tokens_per_batch == moeConfig.batch_size); // simulation loop. For deployment, we will use a while(true) while (processed_requests < moeConfig.total_requests) { @@ -193,7 +194,7 @@ void FlexFlow::top_level_task(Task const *task, size_t max_reqs, max_tkns; if (future_handlers.find(bid) == future_handlers.end()) { max_reqs = moeConfig.incremental_mode ? bc->MAX_NUM_REQUESTS - : im.max_num_requests_per_batch; + : im.max_num_tokens_per_batch; max_tkns = moeConfig.sequence_length * moeConfig.batch_size; new_prompts = data_generator.get_requests(max_reqs, max_tkns); bc = new BatchConfig(); @@ -208,7 +209,7 @@ void FlexFlow::top_level_task(Task const *task, processed_requests += bc->update_results(&ir); max_reqs = moeConfig.incremental_mode ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() - : im.max_num_requests_per_batch; + : im.max_num_tokens_per_batch; max_tkns = moeConfig.sequence_length * moeConfig.batch_size - (moeConfig.incremental_mode ? 
bc->num_active_tokens() : 0); new_prompts = data_generator.get_requests(max_reqs, max_tkns); @@ -232,7 +233,7 @@ void FlexFlow::top_level_task(Task const *task, // runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); data_loader.next_batch(ff, bid, bc, batch_predictions[bid], view); - FutureMap fm = im.inference(bid, *bc); + FutureMap fm = im.inference(&ff, bid, *bc); // runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); assert(fm.get_future_map_domain().get_volume() == 1); diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 8a8c0cb53e..860fd23fe4 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -117,8 +117,9 @@ void FlexFlow::top_level_task(Task const *task, InferenceManager im(&ff, transformerConfig.batch_size, transformerConfig.num_inflight_batches); - im.compile_model_and_allocate_buffer(); - im.init_operators_inference(); + std::unordered_map> mapping; + im.compile_model_and_allocate_buffer(&ff, mapping); + im.init_operators_inference(&ff); //------------ Initialize the data loader and data generator ------------ /* size_t min_input_tokens = 32, max_input_tokens = 512, @@ -161,7 +162,7 @@ void FlexFlow::top_level_task(Task const *task, BatchConfig *bc = nullptr; std::map batch_predictions[im.max_num_inflight_batches]; - assert(im.max_num_requests_per_batch == transformerConfig.batch_size); + assert(im.max_num_tokens_per_batch == transformerConfig.batch_size); // assert(transformerConfig.batch_size <= BatchConfig::MAX_NUM_REQUESTS); // simulation loop. For deployment, we will use a while(true) @@ -171,7 +172,7 @@ void FlexFlow::top_level_task(Task const *task, if (future_handlers.find(bid) == future_handlers.end()) { max_reqs = transformerConfig.incremental_mode ? bc->MAX_NUM_REQUESTS - : im.max_num_requests_per_batch; + : im.max_num_tokens_per_batch; max_tkns = transformerConfig.sequence_length * transformerConfig.batch_size; new_prompts = data_generator.get_requests(max_reqs, max_tkns); @@ -187,7 +188,7 @@ void FlexFlow::top_level_task(Task const *task, processed_requests += bc->update_results(&ir); max_reqs = transformerConfig.incremental_mode ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() - : im.max_num_requests_per_batch; + : im.max_num_tokens_per_batch; max_tkns = transformerConfig.sequence_length * transformerConfig.batch_size - (transformerConfig.incremental_mode ? 
bc->num_active_tokens() : 0); @@ -212,7 +213,7 @@ void FlexFlow::top_level_task(Task const *task, // runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); data_loader.next_batch(ff, bid, bc, batch_predictions[bid], view); - FutureMap fm = im.inference(bid, *bc); + FutureMap fm = im.inference(&ff, bid, *bc); // runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); assert(fm.get_future_map_domain().get_volume() == 1); diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 0415d85d11..0079a570b7 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -26,19 +26,21 @@ class FFModel; class InferenceManager { public: InferenceManager(FFModel *_model, - int max_num_requests_per_batch, + int max_num_tokens_per_batch, int max_num_inflight_batches); - void compile_model_and_allocate_buffer(void); - void init_operators_inference(); + void compile_model_and_allocate_buffer( + FFModel *model, + std::unordered_map> const &mapping); + void init_operators_inference(FFModel *model); MachineView *get_machine_view(int mv_id); - Legion::FutureMap inference(int index, BatchConfig const &bc); + Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); void load_input_tokens_from_batch_config(BatchConfig const &bc, ParallelTensor const input); public: std::unordered_map> tensor_buffer; FFModel *model; - int max_num_requests_per_batch; + int max_num_tokens_per_batch; int max_num_inflight_batches; int num_devices; std::vector machine_views; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d797158530..3cd5e2036e 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,6 +837,7 @@ class FFModel { LossType loss_type, std::vector const &metrics, CompMode comp_mode = COMP_MODE_TRAINING); + void compile_inference(); void graph_optimize(size_t budget, bool only_data_parallel, std::unique_ptr &best_graph, diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 65f3089409..22d8161ff1 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -60,7 +60,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, m->aggr, output.domain.get_volume(), stream); - } else if (weight.data_type == DT_HALF) { + } else if (weight.data_type == DT_DOUBLE) { Internal::forward_kernel(input.get_int32_ptr(), output.get_double_ptr(), weight.get_double_ptr(), diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 5e02160b7c..57578f5793 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -238,7 +238,7 @@ void RMSNorm::init_inference(FFModel const &ff, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 7412a184c6..5a41962a13 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -30,9 +30,9 @@ BatchConfig::BatchConfig() { request_completed[i] = true; } for (int i = 0; i < MAX_NUM_TOKENS; i++) { - tokensInfo[i].abs_depth_in_request = -1; - tokensInfo[i].request_index = -1; - tokensInfo[i].token_id = -1; + tokensInfo[i].abs_depth_in_request = 0; + tokensInfo[i].request_index = 0; + tokensInfo[i].token_id = 0; } } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 5aeee2609c..a7da765391 100644 --- a/src/runtime/inference_manager.cc 
+++ b/src/runtime/inference_manager.cc @@ -13,7 +13,12 @@ * limitations under the License. */ +#include "flexflow/ffconst_utils.h" +#include "flexflow/graph.h" #include "flexflow/inference.h" +#include "flexflow/model.h" +#include "flexflow/ops/fused.h" +#include "flexflow/ops/noop.h" #include "flexflow/parallel_ops/parallel_op.h" namespace FlexFlow { @@ -21,9 +26,9 @@ namespace FlexFlow { using namespace Legion; InferenceManager::InferenceManager(FFModel *_model, - int _max_num_requests_per_batch, + int _max_num_tokens_per_batch, int _max_num_inflight_batches) - : model(_model), max_num_requests_per_batch(_max_num_requests_per_batch), + : model(_model), max_num_tokens_per_batch(_max_num_tokens_per_batch), max_num_inflight_batches(_max_num_inflight_batches) { // populate array of valid single-device machine views num_devices = model->config.workersPerNode * model->config.numNodes; @@ -38,18 +43,56 @@ InferenceManager::InferenceManager(FFModel *_model, } } -void InferenceManager::compile_model_and_allocate_buffer(void) { - std::vector metrics; - model->config.batchSize = max_num_requests_per_batch; - model->compile( - LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE, metrics, COMP_MODE_INFERENCE); +void InferenceManager::compile_model_and_allocate_buffer( + FFModel *model, + std::unordered_map> const + &tensor_mapping) { + model->config.batchSize = max_num_tokens_per_batch; + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; + + std::unordered_map> mapping; + for (auto const &it : tensor_mapping) { + ParallelTensor pt; + model->get_parallel_tensor_from_tensor(it.first, pt); + assert(pt->owner_op != nullptr); + mapping[pt->owner_op] = it.second; + } for (auto const &op : model->operators) { // Skip weight operators if (op->op_type == OP_WEIGHT) { continue; } + // Get machine views + std::vector machine_views; + if (mapping.find(op) != mapping.end()) { + machine_views = mapping[op]; + assert(machine_views.size() == max_num_inflight_batches); + } else { + // Mapping the current operator using the same machine + // view as the inputs + assert(op->numInputs > 0); + for (int j = 0; j < max_num_inflight_batches; j++) { + MachineView mv = tensor_buffer[op->inputs[0]][j]->machine_view; + for (int k = 1; k < op->numInputs; k++) { + if (mv != tensor_buffer[op->inputs[k]][j]->machine_view) { + fprintf(stderr, + "[Warning] a potentially unnecessary " + " inter-GPU copy of size %zu\n", + op->inputs[k]->get_volume()); + // Heuristics: we use the mv with a larger start_device_id + // to promote load balancing + if (mv.start_device_id < + tensor_buffer[op->inputs[k]][j]->machine_view.start_device_id) { + mv = tensor_buffer[op->inputs[k]][j]->machine_view; + } + } + } + machine_views.push_back(mv); + } + assert(machine_views.size() == max_num_inflight_batches); + } for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); @@ -63,11 +106,17 @@ void InferenceManager::compile_model_and_allocate_buffer(void) { pt_base->region.get_field_space()); pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); + pt->machine_view = machine_views[j]; + Domain part_domain = + runtime->get_index_space_domain(ctx, pt_base->parallel_is); + assert(pt->machine_view.get_domain() == part_domain); list.push_back(pt); } + assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); tensor_buffer[pt_base] = list; } } +#ifdef DEADCODE // Set machine_view for 
batch_tensors in the tensor_buffer for (int batch_index = 0; batch_index < max_num_inflight_batches; batch_index++) { @@ -96,9 +145,10 @@ void InferenceManager::compile_model_and_allocate_buffer(void) { } } } +#endif } -void InferenceManager::init_operators_inference() { +void InferenceManager::init_operators_inference(FFModel *model) { for (int batch_index = 0; batch_index < max_num_inflight_batches; batch_index++) { int expert_device_index = 0; @@ -137,12 +187,16 @@ void InferenceManager::init_operators_inference() { } } +// Deprecated API MachineView *InferenceManager::get_machine_view(int mv_id) { + assert(false); assert(mv_id >= 0 && mv_id < machine_views.size()); return &machine_views[mv_id]; } -FutureMap InferenceManager::inference(int index, BatchConfig const &bc) { +FutureMap InferenceManager::inference(FFModel *model, + int index, + BatchConfig const &bc) { assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) @@ -210,4 +264,246 @@ void InferenceManager::load_input_tokens_from_batch_config( runtime->execute_index_space(ctx, launcher); } +void FFModel::compile_inference() { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + config.computationMode = COMP_MODE_INFERENCE; + { + fprintf( + stderr, + "Note: inference currently only supports data/pipeline parallel.\n"); + } + create_operators_from_layers(); + // Launch the graph optimize task + { + FFModel *model = this; + TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, + TaskArgument(&model, sizeof(FFModel *))); + Future future = runtime->execute_task(ctx, launcher); + + PCG::GraphOptimalViewSerialized ret = + future.get_result(); + Deserializer dez(ret.data, ret.total_bytes); + // Reconstruct operators + PCG::Graph *best_graph = new PCG::Graph(this); + std::unordered_map optimal_views; + deserialize_graph_optimal_view(dez, best_graph, optimal_views); + operators.clear(); + convert_graph_to_operators(best_graph, optimal_views); + best_graph->print_dot(); + delete best_graph; + for (auto const &layer : layers) { + // map inputs to parallel tensor + if (layer->op_type == OP_INPUT) { + Tensor tensor = layer->outputs[0]; + ParallelTensor parallel_tensor = nullptr; + for (auto const &op : operators) { + if (op->op_type == OP_INPUT) { + NoOp *noop = (NoOp *)op; + if (noop->input_tensor_guid == tensor->tensor_guid) { + parallel_tensor = op->outputs[0]; + } + } + } + assert(parallel_tensor != nullptr); + tensor->parallel_tensor = parallel_tensor; + } + // map weights to parallel_tensor + for (int i = 0; i < layer->numWeights; i++) { + assert(layer->weights[i] != nullptr); + Tensor weight = layer->weights[i]; + ParallelTensor parallel_weight = nullptr; + for (auto const &op : operators) { + if (op->layer_guid == layer->layer_guid) { + assert(op->op_type == layer->op_type); + assert(op->numWeights == layer->numWeights); + parallel_weight = op->weights[i]; + } + } + assert(parallel_weight != nullptr); + weight->parallel_tensor = parallel_weight; + } + } + } + loss_op = nullptr; + metrics_op = nullptr; + // Perform inplace optimizations + if (config.enable_inplace_optimizations) { + for (size_t l = 1; l < operators.size(); l++) { + if (operators[l]->can_inplace_output()) { + // Assume outputs[0] is inplace with inputs[0] + assert(operators[l]->numOutputs == 1); + if (operators[l]->inputs[0]->owner_op != NULL) { + // int dim1 = operators[l]->outputs[0]->num_dims; + // int dim2 = 
operators[l]->inputs[0]->num_dims; + MachineView view1 = operators[l]->outputs[0]->machine_view; + MachineView view2 = operators[l]->inputs[0]->machine_view; + if (view1 == view2) { + // Check no others also need operators[l]->inputs[0] + bool found = false; + for (size_t i = 0; i < operators.size(); i++) { + if (i == l) { + continue; + } + for (int j = 0; j < operators[i]->numInputs; j++) { + if ((operators[i]->inputs[j]->owner_op == + operators[l]->inputs[0]->owner_op) && + (operators[i]->inputs[j]->owner_idx == + operators[l]->inputs[0]->owner_idx)) { + found = true; + } + } + } + if (!found) { + // Perform inplace + operators[l]->do_inplace_output(); + } + } + } + } + } + } + + for (size_t l = 0; l < operators.size(); l++) { + Op *op = operators[l]; + + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i]->owner_op != NULL); + } + for (int i = 0; i < op->numWeights; i++) { + assert(op->weights[i]->owner_op != NULL); + assert(op->weights[i]->region != LogicalRegion::NO_REGION); + parameters.push_back(op->weights[i]); + } + op->map_output_tensors(*this); + } + + // Check correctness + for (size_t l = 0; l < operators.size(); l++) { + Op *op = operators[l]; + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i]->owner_op == op); + assert(op->outputs[i]->owner_idx == i); + assert(op->outputs[i]->parallel_tensor_guid != 0); + } + } + // Perform fusion optimizations + if (config.perform_fusion) { + fprintf(stderr, "Applying fusion optimizations during compilation...\n"); + fprintf(stderr, "%zu operators before fusion...\n", operators.size()); + std::vector new_operators; + std::vector old_operators = operators; + while (apply_fusion(operators, new_operators)) { + for (size_t i = 0; i < new_operators.size(); i++) { + for (int idx = 0; idx < new_operators[i]->numInputs; idx++) { + for (size_t j = i + 1; j < new_operators.size(); j++) { + if (new_operators[i]->inputs[idx]->owner_op == new_operators[j]) { + assert(false); + } + } + } + } + operators = new_operators; + } + // Check integrity + for (size_t l = 0; l < operators.size(); l++) { + if (operators[l]->op_type == OP_FUSED) { + FusedOp *fused = (FusedOp *)operators[l]; + int ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + Op *old_op = fused->operators[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == FusedOp::SOURCE_INPUT) { + assert(fused->inputs[my_off]->region == + old_op->inputs[i]->region); + } else if (fused->op_input_source[i + ioff] == + FusedOp::SOURCE_OUTPUT) { + assert(fused->outputs[my_off]->region == + old_op->inputs[i]->region); + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + int my_off = fused->op_weight_idx[i + woff]; + assert(fused->op_weight_source[i + woff] == FusedOp::SOURCE_WEIGHT); + assert(fused->weights[my_off]->region == + old_op->weights[i]->region); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); + assert(fused->outputs[my_off]->region == + old_op->outputs[i]->region); + } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + } else { + bool found = false; + for (size_t i = 0; i < old_operators.size(); i++) { + if (old_operators[i] == operators[l]) { + assert(!found); + found = true; + } + } + assert(found); + } + } + 
fprintf(stderr, "%zu operators after fusion...\n", operators.size()); + for (size_t i = 0; i < operators.size(); i++) { + Op *op = operators[i]; + printf("operator[%zu]: type(%s) guid(%lu)\n", + i, + get_operator_type_name(operators[i]->op_type).c_str(), + operators[i]->op_guid); + for (int j = 0; j < op->numInputs; j++) { + LogicalRegion handle = op->inputs[j]->region; + printf("\tinputs[%d] region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + for (int j = 0; j < op->numOutputs; j++) { + LogicalRegion handle = op->outputs[j]->region; + printf("\toutputs[%d] region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + for (int j = 0; j < op->numWeights; j++) { + LogicalRegion handle = op->weights[j]->region; + printf("\tweights[%d] region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + } + } + for (size_t i = 0; i < operators.size(); i++) { + Op *op = operators[i]; + printf("operator[%zu]: type(%d)\n", i, operators[i]->op_type); + for (int j = 0; j < op->numInputs; j++) { + LogicalRegion handle = op->inputs[j]->region; + printf("\tinputs[%d] region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + for (int j = 0; j < op->numOutputs; j++) { + LogicalRegion handle = op->outputs[j]->region; + printf("\toutputs[%d] region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + } +} }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 328d4cc3af..12d3bbb18f 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1759,8 +1759,10 @@ void FFModel::map_tensor_with_dim2(ParallelTensor tensor, runtime->get_logical_partition(ctx, tensor->region_grad, ip); } } - // Step 3: initialize the tensor - if (tensor->initializer != NULL) { + // Step 3: initialize the tensor; don't randomly initialize weights + // for inference + if (tensor->initializer != NULL && + config.computationMode == COMP_MODE_TRAINING) { tensor->initializer->init(this, tensor); } } diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 3ad2f17f0c..2147ac69b3 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -667,7 +667,7 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, for (size_t i = 0; i < dim_sizes.size(); i++) { volume = volume * dim_sizes[i]; } - RegionRequirement req(region, READ_WRITE, EXCLUSIVE, region); + RegionRequirement req(region, WRITE_ONLY, EXCLUSIVE, region); req.add_field(FID_DATA); InlineLauncher launcher(req); PhysicalRegion pr = runtime->map_region(ctx, launcher); @@ -675,7 +675,7 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, switch (num_dims) { #define DIMFUNC(DIM) \ case DIM: { \ - TensorAccessorW acc(pr, req, FID_DATA, ctx, runtime, true); \ + TensorAccessorW acc(pr, req, FID_DATA, ctx, runtime, false); \ assert(acc.rect.volume() == volume * num_replicas); \ T *ptr = acc.ptr; \ for (size_t i = 0; i < num_replicas; i++) { \ diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7374d36bfa..7f5bc89648 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -107,7 +107,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (new_bc.request_completed[i]) 
{ if (!pending_request_queue.empty() && new_bc.num_tokens < BatchConfig::MAX_NUM_TOKENS) { - Request const &new_request = pending_request_queue.front(); + Request new_request = pending_request_queue.front(); pending_request_queue.pop(); running_request_queue[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; From 49bdead9c551ef8848a90e2c2bfbdecc9cea53cf Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 4 May 2023 18:06:29 -0400 Subject: [PATCH 103/344] [Inference] - Fix small issues (#705) * add check for num cpus in CI * fix hip rocm compilation issue --- src/runtime/request_manager.cpp | 51 +++++++++++++++++++++++++++++++++ tests/cpp_gpu_tests.sh | 9 ++++-- 2 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 src/runtime/request_manager.cpp diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp new file mode 100644 index 0000000000..87e86087fe --- /dev/null +++ b/src/runtime/request_manager.cpp @@ -0,0 +1,51 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +using namespace Legion; + +void RequestManager::load_tokens_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 1); + assert(task->regions.size() == 1); + + BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; + for (int i = 0; i < batch_config.num_tokens; i++) { + dram_copy[i] = batch_config.tokensInfo[i].token_id; + } + TokenId *fb_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + assert(batch_config.num_tokens <= domain.get_volume()); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDA(hipMemcpyAsync(fb_ptr, + dram_copy, + sizeof(TokenId) * batch_config.num_tokens, + hipMemcpyHostToDevice, + stream)); +} + +}; // namespace FlexFlow diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index fbf33cc04d..634a199ea7 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -11,6 +11,9 @@ BATCHSIZE=$((GPUS * 64)) FSIZE=14048 ZSIZE=12192 +GPU_AVAILABLE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +if [ $(( GPUS )) -gt $(( GPU_AVAILABLE )) ]; then echo "The test requires $GPUS GPUs, but only $GPU_AVAILABLE are available. Try reducing the number of nodes, or the number of gpus/node." 
; exit; fi + remove_mnist() { rm -f train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz train-images-idx3-ubyte train-labels-idx1-ubyte } @@ -47,7 +50,8 @@ if [[ -f "$FF_HOME/build/examples/cpp/AlexNet/alexnet" ]]; then # "$FF_HOME"/build/examples/cpp/split_test/split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # "$FF_HOME"/build/examples/cpp/split_test_2/split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # Inference examples - "$FF_HOME"/build/examples/cpp/inference/LLAMA/LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + if [ $(( GPU_AVAILABLE )) -lt $(( 4 )) ]; then echo "Skipping LLAMA test because it requires 4 GPUs, but only $GPU_AVAILABLE are available. " ; exit 1; fi + "$FF_HOME"/build/examples/cpp/inference/LLAMA/LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize 30000 --only-data-parallel #"$FF_HOME"/build/examples/cpp/inference/mixture_of_experts/inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel #"$FF_HOME"/build/examples/cpp/inference/transformers/inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel else @@ -79,7 +83,8 @@ else # split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # Inference examples - LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel + if [ $(( GPU_AVAILABLE )) -lt $(( 4 )) ]; then echo "Skipping LLAMA test because it requires 4 GPUs, but only $GPU_AVAILABLE are available. 
" ; exit 1; fi + LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize 30000 --only-data-parallel #inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel #inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel fi From 527d6d285bdbc41c2974e9b6e581e0c4de74a6ee Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 4 May 2023 21:42:35 -0400 Subject: [PATCH 104/344] [Inference] - Token Verification (#703) * newlines * initial commit * implement kernel to update key/value cache for each tree branch * fix indexing of requests slots * phase out num_tree_branches * commit mechanism --- examples/cpp/inference/LLAMA/dataloader.cc | 2 +- examples/cpp/inference/LLAMA/llama.cc | 4 +- include/flexflow/batch_config.h | 18 + include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 21 + include/flexflow/operator_params.h | 2 + include/flexflow/ops/inc_mha_verify.h | 143 ++ include/flexflow/ops/inc_mha_verify_params.h | 31 + .../ops/inc_multihead_self_attention.h | 9 - src/ops/inc_mha_verify.cc | 1439 +++++++++++++++++ src/ops/inc_mha_verify.cpp | 75 + src/ops/inc_mha_verify.cu | 942 +++++++++++ src/runtime/batch_config.cc | 18 + src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 49 + src/runtime/model.cc | 28 + src/runtime/operator_params.cc | 3 + src/runtime/substitution.cc | 10 +- 18 files changed, 2783 insertions(+), 14 deletions(-) create mode 100644 include/flexflow/ops/inc_mha_verify.h create mode 100644 include/flexflow/ops/inc_mha_verify_params.h create mode 100644 src/ops/inc_mha_verify.cc create mode 100644 src/ops/inc_mha_verify.cpp create mode 100644 src/ops/inc_mha_verify.cu diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index 1bfadde559..13de5b5b64 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -143,7 +143,7 @@ void DataLoader::load_from_file(T *ptr, size_t size, std::string filename) { // std::cout << loaded_data_size << std::endl; // std::cout << in_get_size << std::endl; if (in_get_size != loaded_data_size) { - std::cout << "load data error"; + std::cout << "load data error" << std::endl; return; } diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index 321f870a8f..a4924f5406 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -244,7 +244,7 @@ void FlexFlow::top_level_task(Task const *task, BatchConfig bc; InferenceResult ir; bc = rm.prepare_next_batch(bc, ir); - std::cout << "new tokens: " << bc.num_tokens; + std::cout << "new tokens: " << bc.num_tokens << std::endl; FutureMap fm = im.inference(&ff, bid, bc); assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); @@ -262,7 +262,7 @@ void FlexFlow::top_level_task(Task const *task, BatchConfig bc = batch_configs[bid]; processed_requests += bc.num_tokens; bc = rm.prepare_next_batch(bc, ir); - std::cout << "new tokens: " << bc.num_tokens; + std::cout << "new tokens: " << bc.num_tokens << std::endl; FutureMap fm = im.inference(&ff, bid, bc); assert(fm.get_future_map_domain().get_volume() == 1); future_handlers[bid] = fm.get_future(0); diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a397be28c3..d4150d6c51 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -65,6 +65,24 @@ class BatchConfig { bool 
request_completed[MAX_NUM_REQUESTS]; }; +class TreeVerifyBatchConfig : public BatchConfig { +public: + struct PerTokenInfo : BatchConfig::PerTokenInfo { + int tree_branch_idx; + }; + struct CommittedTokensInfo { + int token_index; // the index of the token in the previous batch + int request_index; // request index in the batch + int token_depth; // position of the token in the request's sequence + }; + + void compute_tree_branch_indexes(); + + int num_tokens_to_commit; + CommittedTokensInfo commited_tokens[MAX_NUM_TOKENS]; + PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; +}; + struct InferenceResult { static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index d244fb3ac6..a64944ab30 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -149,6 +149,7 @@ enum OperatorType { OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html OP_RMS_NORM, OP_INC_MULTIHEAD_SELF_ATTENTION, + OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 3cd5e2036e..c213af9ac8 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -147,6 +147,10 @@ enum TaskIDs { INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_VERIFY_FWD_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_VERIFY_BWD_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, @@ -285,6 +289,7 @@ class LayerNorm; class Linear; class MultiHeadAttention; class IncMultiHeadSelfAttention; +class IncMultiHeadSelfAttentionVerify; class Pool2D; class Reduce; class Reshape; @@ -602,6 +607,19 @@ class FFModel { Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, char const *name = NULL); + Tensor inc_multihead_self_attention_verify( + const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = true, + bool add_bias_kv = false, + bool add_zero_attn = false, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + char const *name = NULL); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], DataType data_type, @@ -964,6 +982,9 @@ class FFModel { std::unordered_map< std::pair, IncMultiHeadSelfAttention *>, + std::unordered_map< + std::pair, + IncMultiHeadSelfAttentionVerify *>, std::unordered_map, Reduce *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index f949fe3e4c..ad0e15ce46 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -17,6 +17,7 @@ #include "flexflow/ops/flat_params.h" #include "flexflow/ops/gather_params.h" #include "flexflow/ops/groupby_params.h" +#include "flexflow/ops/inc_mha_verify_params.h" #include "flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" @@ -56,6 +57,7 @@ using OperatorParameters = mp::variant +#include + +namespace FlexFlow { + +class IncMultiHeadSelfAttentionVerifyMeta; + +class IncMultiHeadSelfAttentionVerify : public Op { +public: + using Params = IncMultiHeadSelfAttentionVerifyParams; + using Input = ParallelTensor; + + 
IncMultiHeadSelfAttentionVerify(FFModel &model, + LayerID const &layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool allocate_weights, + char const *name); + IncMultiHeadSelfAttentionVerify(FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool allocate_weights, + char const *name); + IncMultiHeadSelfAttentionVerify(FFModel &model, + IncMultiHeadSelfAttentionVerify const &other, + const ParallelTensor input, + bool allocate_weights); + IncMultiHeadSelfAttentionVerify(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + bool get_int_parameter(PMParameter, int *) const override; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const override; + + static void + inference_kernel_wrapper(IncMultiHeadSelfAttentionVerifyMeta const *m, + TreeVerifyBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr); + Params get_params() const; + +public: + int num_heads; + float dropout; + bool bias; + bool add_bias_kv, add_zero_attn, apply_rotary_embedding; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int qoSeqLength, kvSeqLength; +}; + +class IncMultiHeadSelfAttentionVerifyMeta : public OpMeta { +public: + IncMultiHeadSelfAttentionVerifyMeta( + FFHandler handler, + IncMultiHeadSelfAttentionVerify const *attn, + float const *weight_ptr, + Legion::Memory gpu_mem, + int num_samples, + int _num_heads); + ~IncMultiHeadSelfAttentionVerifyMeta(void); + +public: + Realm::RegionInstance reserveInst; + size_t weights_params, weightSize, reserveSpaceSize; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int num_heads; + bool *has_load_weights; + bool *apply_rotary_embedding; +#ifdef INFERENCE_TESTS + float *kcache, *vcache; +#endif + float *devQKVProjArray, *keyCache, *valueCache; + float *qk_prods, *qk_prods_softmax; + float *attn_heads, *W_out_contiguous; + + TreeVerifyBatchConfig::PerTokenInfo *token_infos; + TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H diff --git a/include/flexflow/ops/inc_mha_verify_params.h b/include/flexflow/ops/inc_mha_verify_params.h new file mode 100644 index 
0000000000..51ead7b283 --- /dev/null +++ b/include/flexflow/ops/inc_mha_verify_params.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_PARAMS_H +#define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_PARAMS_H + +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct IncMultiHeadSelfAttentionVerifyParams { + LayerID layer_guid; + int embed_dim, num_heads, kdim, vdim; + float dropout; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + + bool is_valid(ParallelTensorShape const &) const; +}; + +bool operator==(IncMultiHeadSelfAttentionVerifyParams const &, + IncMultiHeadSelfAttentionVerifyParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t + operator()(FlexFlow::IncMultiHeadSelfAttentionVerifyParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_PARAMS_H diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index e781da9cf5..5db5e0c3c1 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -128,18 +128,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif - /*#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnAttnDescriptor_t attnDesc; - cudnnSeqDataDescriptor_t qDesc, kDesc, vDesc, oDesc; - #endif*/ - // int *devQoSeqArray, *devKvSeqArray, *loWinIdx, *hiWinIdx, *kvCache; float *devQKVProjArray, *keyCache, *valueCache; float *qk_prods, *qk_prods_softmax; float *attn_heads, *W_out_contiguous; - // void *reserveSpace; - - // BatchConfig::token_idxs *dev_token2ids; - BatchConfig::PerTokenInfo *token_infos; }; diff --git a/src/ops/inc_mha_verify.cc b/src/ops/inc_mha_verify.cc new file mode 100644 index 0000000000..228c0d224a --- /dev/null +++ b/src/ops/inc_mha_verify.cc @@ -0,0 +1,1439 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/inc_mha_verify.h" +#include "flexflow/model.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#ifdef INFERENCE_TESTS +#include +using namespace at::indexing; +#endif + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +bool IncMultiHeadSelfAttentionVerifyParams::is_valid( + ParallelTensorShape const &input) const { + bool is_valid = input.is_valid(); + return is_valid; +} + +Tensor FFModel::inc_multihead_self_attention_verify( + const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + char const *name) { + // Currently assume that + Layer *li = new Layer(this, + OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = embed_dim; + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); + } + { + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); + int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; + li->weights[0] = create_weight_legion_ordering(2, + dims, + DT_FLOAT, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->data_type = DT_FLOAT; + li->add_int_property("embed_dim", embed_dim); + li->add_int_property("num_heads", num_heads); + li->add_int_property("kdim", kdim); + li->add_int_property("vdim", vdim); + li->add_int_property("bias", bias); + li->add_int_property("add_bias_kv", add_bias_kv); + li->add_int_property("add_zero_attn", add_zero_attn); + li->add_float_property("dropout", dropout); + li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + layers.push_back(li); + return li->outputs[0]; +} + +Op *IncMultiHeadSelfAttentionVerify::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("embed_dim", value); + int embed_dim = value; + layer->get_int_property("num_heads", value); + int num_heads = value; + layer->get_int_property("kdim", value); + int kdim = value; + layer->get_int_property("vdim", value); + int vdim = value; + float dropout; + layer->get_float_property("dropout", dropout); + layer->get_int_property("bias", value); + bool bias = (bool)value; + layer->get_int_property("add_bias_kv", value); + bool add_bias_kv = (bool)value; + layer->get_int_property("add_zero_attn", value); + bool add_zero_attn = (bool)value; + layer->get_int_property("apply_rotary_embedding", value); + bool apply_rotary_embedding = (bool)value; + return new IncMultiHeadSelfAttentionVerify(model, + layer->layer_guid, + inputs[0], + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + apply_rotary_embedding, + false /*allocate_weights*/, + layer->name); +} + +IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( + FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + _input), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { + // overwrite layer_guid + layer_guid = _layer_guid; + + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_heads; + dims[2].size = qParas + kParas + vParas + oParas; + dims[2].degree = 1; + dims[2].parallel_idx = -1; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, DT_FLOAT, this); + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* // Check correctness */ + /* assert(check_output_input_weight_parallel_dims()); */ +} + +IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( + FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + _input, + _weight), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) +// bias_initializer(_bias_initializer) +{ + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_heads; + dims[2].size = qParas + kParas + vParas + oParas; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, DT_FLOAT, this); + + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ + /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ + // Check correctness + /* assert(check_output_input_weight_parallel_dims()); */ +} + +IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( + FFModel &model, + IncMultiHeadSelfAttentionVerify const &other, + const ParallelTensor input, + bool allocate_weights) + : IncMultiHeadSelfAttentionVerify(model, + other.layer_guid, + input, + other.oProjSize, + other.num_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.bias, + other.add_bias_kv, + other.add_zero_attn, + other.apply_rotary_embedding, + allocate_weights, + other.name) {} + +IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( + FFModel &model, + IncMultiHeadSelfAttentionVerifyParams const ¶ms, + ParallelTensor const &input, + bool allocate_weights, + char const *name) + : IncMultiHeadSelfAttentionVerify(model, + params.layer_guid, + input, + params.embed_dim, + params.num_heads, + params.kdim, + params.vdim, + params.dropout, + params.bias, + params.add_bias_kv, + params.add_zero_attn, + params.apply_rotary_embedding, + allocate_weights, + name) {} + +void IncMultiHeadSelfAttentionVerify::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher( + INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(IncMultiHeadSelfAttentionVerify)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void IncMultiHeadSelfAttentionVerify::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher( + INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(IncMultiHeadSelfAttentionVerify)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): input + regions[1](I): weight + regions[2](O): output +*/ +OpMeta *IncMultiHeadSelfAttentionVerify::init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + IncMultiHeadSelfAttentionVerify const *attn = + (IncMultiHeadSelfAttentionVerify *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; + assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + + Memory gpu_mem = 
Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + IncMultiHeadSelfAttentionVerifyMeta *m = + new IncMultiHeadSelfAttentionVerifyMeta(handle, + attn, + weight.get_float_ptr(), + gpu_mem, + num_samples, + num_heads); + m->profiling = attn->profiling; + assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); + return m; +} + +void IncMultiHeadSelfAttentionVerify::forward(FFModel const &ff) { + // IncMultiHeadSelfAttentionVerify doesn't support forward + assert(false); +} + +FutureMap IncMultiHeadSelfAttentionVerify::inference( + FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + printf("TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d\n", + bc.num_tokens, + bc.num_active_requests()); + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INF_TASK_ID, + parallel_is, + TaskArgument(&bc, sizeof(TreeVerifyBatchConfig)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttentionVerify::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 3); + assert(task->regions.size() == regions.size()); + + TreeVerifyBatchConfig const *bc = (TreeVerifyBatchConfig *)task->args; + IncMultiHeadSelfAttentionVerifyMeta const *m = + *((IncMultiHeadSelfAttentionVerifyMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 3); + assert(output_domain.get_dim() == 4); + + /* print_tensor(input.get_float_ptr(), + input_domain.get_volume(), + "[Attention:forward:query]"); */ 
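  /* Descriptive note on the block below: when INFERENCE_TESTS is defined, the
     rest of this task body is a CPU-side verification path. It downloads the
     input, weight and output tensors, plus the intermediate buffers
     (devQKVProjArray, keyCache/valueCache, W_out_contiguous, qk_prods,
     qk_prods_softmax, attn_heads), recomputes the same attention pipeline with
     libtorch, and compares the two stage by stage via torch::allclose:
       1. QKV projections      (qkv_projs     vs. devQKVProjArray)
       2. K/V cache contents   (K_t / V_t     vs. keyCache / valueCache)
       3. Output projection W  (torch_w_out   vs. W_out_contiguous)
       4. softmax(QK^T/sqrt(d_k))V and the final output, request by request. */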
+ + IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( + m, + bc, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr()); +#ifdef INFERENCE_TESTS + printf("Checking IncMultiHeadSelfAttentionVerify computations...\n"); + + // ============================================================================= + // Define helper functions to handle row-major arrays + // ============================================================================= + + auto set_value_row_major = [](float *arr, + std::vector const &shape, + std::vector const &indices, + float value) -> void { + int offset = 0; + for (int i = 0; i < shape.size(); i++) { + int index = indices[i]; + int stride = 1; + for (int j = i + 1; j < shape.size(); j++) { + stride *= shape[j]; + } + offset += index * stride; + } + *(arr + offset) = value; + }; + + // ============================================================================= + // Load input/output/weights and parse general configs + // ============================================================================= + + float *input_cpu = + download_tensor(input.get_float_ptr(), input_domain.get_volume()); + assert(input_cpu != nullptr); + float *weight_cpu = download_tensor(weight.get_float_ptr(), + weight_domain.get_volume()); + assert(weight_cpu != nullptr); + float *output_cpu = download_tensor(output.get_float_ptr(), + output_domain.get_volume()); + assert(output_cpu != nullptr); + + // Input tensor dimensions + coord_t data_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + coord_t max_sequence_length = input_domain.hi()[1] - input_domain.lo()[1] + 1; + coord_t batch_size = input_domain.hi()[2] - input_domain.lo()[2] + 1; + coord_t replica_dim = input_domain.hi()[3] - input_domain.lo()[3] + 1; + assert(replica_dim == 1); + + size_t effective_batch_size = max_sequence_length * batch_size; + float inputs_arr[data_dim][effective_batch_size] = {0}; + for (size_t i = 0; i < data_dim * bc->num_active_tokens(); i++) { + size_t data_index = i % data_dim; + size_t token_index = i / data_dim; + assert(data_index < data_dim); + assert(token_index < effective_batch_size); + inputs_arr[data_index][token_index] = input_cpu[i]; + } + torch::Tensor torch_input = torch::from_blob( + inputs_arr, {data_dim, (long int)effective_batch_size}, torch::kFloat32); + + // Weight tensor dimensions + coord_t all_weight_params = weight_domain.hi()[0] - weight_domain.lo()[0] + 1; + coord_t num_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; + replica_dim = weight_domain.hi()[2] - weight_domain.lo()[2] + 1; + size_t qParas = m->qProjSize * m->qSize; + size_t kParas = m->kProjSize * m->kSize; + size_t vParas = m->vProjSize * m->vSize; + size_t oParas = m->oProjSize * (m->vProjSize > 0 ? m->vProjSize : m->vSize); + + assert(all_weight_params == qParas + kParas + vParas + oParas); + assert(num_heads == m->num_heads); + assert(replica_dim == 1); + + assert(m->qSize == m->kSize && m->kSize == m->vSize); + // printf("m->qSize: %i\n", m->qSize); + // keep things simple for now + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + long int proj_sum = m->qProjSize + m->kProjSize + m->vProjSize; + // load weight manually because Torch can't easily read a tensor serialized in + // column-major order. 
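  // Layout of one attention head inside the flat weight tensor (column-major,
  // all_weight_params = qParas + kParas + vParas + oParas floats per head),
  // as assumed by the loading loops below:
  //
  //   [ W_q (qSize x qProjSize) | W_k (kSize x kProjSize) |
  //     W_v (vSize x vProjSize) | W_o (vProjSize x oProjSize) ]
  //
  // so the Q entry (row, col) of head h lives at
  //   weight_cpu[all_weight_params * h + qSize * col + row]
  // while the K and V blocks start at offsets qProjSize * qSize and
  // 2 * qProjSize * qSize, and W_o at 3 * qProjSize * qSize (the code below
  // relies on qProjSize == kProjSize == vProjSize).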
+ + // printf("m->kProjSize: %i, TreeVerifyBatchConfig::MAX_NUM_TOKENS: %i, " + // "bc->num_active_tokens(): %i, num_heads: %lli, + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS: %i, " + // "bc->num_active_requests(): %i\n", m->kProjSize, + // TreeVerifyBatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), + // num_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS, + // bc->num_active_requests()); + // for (int t=0; t < bc->num_active_tokens(); t++) { + // printf("token %i has request_index: %li and token_position: %li\n", + // t, bc->token2ids.token_indexes[t].request_index, + // bc->token2ids.token_indexes[t].token_position); + // } + + // ============================================================================= + // Load the output tensor (with CUDA results), and create a Torch tensor + // ============================================================================= + + float output_cuda[m->oProjSize][effective_batch_size] = {0}; + for (int i = 0; i < m->oProjSize * effective_batch_size; i++) { + int row_idx = i % m->oProjSize; + int col_idx = i / m->oProjSize; + assert(row_idx < m->oProjSize && col_idx < effective_batch_size); + output_cuda[row_idx][col_idx] = output_cpu[i]; + } + torch::Tensor torch_out_cuda = + torch::from_blob(output_cuda, + {m->oProjSize, (int64_t)effective_batch_size}, + torch::kFloat32); + + // ============================================================================= + // Load the Q/K/V projection weights, and create a Torch tensor + // ============================================================================= + std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_heads}; + float *w_qkv = + (float *)calloc(m->qSize * m->qProjSize * 3 * num_heads, sizeof(float)); + assert(w_qkv[0] == 0.0f); + + for (int h = 0; h < num_heads; h++) { + for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { + int row_index = i % m->qSize; + int column_index = i / m->qSize; + // Q + set_value_row_major(w_qkv, + w_qkv_shape, + {row_index, column_index, 0, h}, + weight_cpu[all_weight_params * h + + m->qSize * column_index + row_index]); + // K + set_value_row_major( + w_qkv, + w_qkv_shape, + {row_index, column_index, 1, h}, + weight_cpu[all_weight_params * h + m->qProjSize * m->qSize + + m->qSize * column_index + row_index]); + // V + set_value_row_major( + w_qkv, + w_qkv_shape, + {row_index, column_index, 2, h}, + weight_cpu[all_weight_params * h + 2 * m->qProjSize * m->qSize + + m->qSize * column_index + row_index]); + } + } + // convert weights to torch tensor + torch::Tensor torch_w_qkv = torch::from_blob( + w_qkv, {m->qSize, m->qProjSize, 3, (int)num_heads}, torch::kFloat32); + + /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() + << std::endl; + std::cout << "Torch input size: " << torch_input.sizes() << std::endl; + std::cout << "Number of active tokens: " << bc->num_active_tokens() + << std::endl; */ + // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; + + // ============================================================================= + // Compute the Q/K/V projections, and compare the results with CUDA + // ============================================================================= + + // ----------------------- C++ computations & checks ------------------------ + torch::Tensor qkv_projs = torch::einsum( + "ijkl,im->jmkl", + {torch_w_qkv, + torch_input.index({Slice(), Slice(0, bc->num_active_tokens())})}); + // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; + assert(qkv_projs.sizes()[0] == 
m->qProjSize); + assert(qkv_projs.sizes()[1] == bc->num_active_tokens() && + qkv_projs.sizes()[1] <= effective_batch_size); + assert(qkv_projs.sizes()[2] == 3); + assert(qkv_projs.sizes()[3] == num_heads); + free(w_qkv); + + // ----------------------- Loading CUDA results for this step --------------- + float *QKVProjArray_cpu = download_tensor( + m->devQKVProjArray, + TreeVerifyBatchConfig::MAX_NUM_TOKENS * proj_sum * m->num_heads); + assert(QKVProjArray_cpu != nullptr); + + std::vector QKVProjArray_converted_shape = { + m->qProjSize, bc->num_active_tokens(), 3, (int)num_heads}; + float *QKVProjArray_converted = (float *)calloc( + m->qProjSize * bc->num_active_tokens() * 3 * num_heads, sizeof(float)); + + // skip over padding at the end of QKVProjArray_cpu + // convert from column order to 3D matrix because torch cannot automatically + // import matrices flattened in column order + for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_heads; i++) { + int proj_size_index = i % m->qProjSize; + int head_index = i / (proj_sum * bc->num_active_tokens()); + int token_index = + ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % + bc->num_active_tokens(); + int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / + (m->qProjSize * bc->num_active_tokens()); + assert(proj_size_index < proj_sum); + assert(head_index < num_heads); + assert(token_index < bc->num_active_tokens()); + assert(qkv_offset < 3); + set_value_row_major(QKVProjArray_converted, + QKVProjArray_converted_shape, + {proj_size_index, token_index, qkv_offset, head_index}, + QKVProjArray_cpu[i]); + } + torch::Tensor QKVProjArray_torch = + torch::from_blob(QKVProjArray_converted, + {m->qProjSize, bc->num_active_tokens(), 3, num_heads}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + // std::cout << "QKVProjArray_torch" << std::endl; + // for (int i=0; inum_active_tokens(); t++) { + for (size_t d = 0; d < m->kProjSize; d++) { + size_t kcache_idx = d * MAX_SEQ_LEN * m->num_heads * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS + + bc->tokensInfo[t].abs_depth_in_request * + m->num_heads * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS + + h * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + + bc->tokensInfo[t].request_index; + m->kcache[kcache_idx] = + qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) + .item(); + } + for (size_t d = 0; d < m->vProjSize; d++) { + size_t vcache_idx = d * MAX_SEQ_LEN * m->num_heads * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS + + bc->tokensInfo[t].abs_depth_in_request * + m->num_heads * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS + + h * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + + bc->tokensInfo[t].request_index; + m->vcache[vcache_idx] = + qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) + .item(); + } + } + } + // Create torch tensors from the arrays + torch::Tensor K_t = + torch::from_blob(m->kcache, + {m->kProjSize, + MAX_SEQ_LEN, + num_heads, + TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + torch::Tensor V_t = + torch::from_blob(m->vcache, + {m->vProjSize, + MAX_SEQ_LEN, + num_heads, + TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + + // Compute useful indices + std::vector req_idxs; + std::vector r_first_idx; + std::vector r_num_tokens; + for (size_t t = 0; t < bc->num_active_tokens(); t++) { + size_t rid = bc->tokensInfo[t].request_index; + if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { + req_idxs.push_back(rid); + r_first_idx.push_back(t); + 
r_num_tokens.push_back(1); + } else { + r_num_tokens[r_num_tokens.size() - 1]++; + } + assert(req_idxs.size() == r_first_idx.size() && + r_first_idx.size() == r_num_tokens.size()); + } + assert(req_idxs.size() == bc->num_active_requests()); + assert(std::accumulate(r_num_tokens.begin(), + r_num_tokens.end(), + decltype(r_num_tokens)::value_type(0)) == + bc->num_active_tokens()); + + // ----------------------- Loading CUDA results for this step --------------- + float *keyCache_cpu = download_tensor( + m->keyCache, + m->num_heads * m->kProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * + MAX_SEQ_LEN); + float *valueCache_cpu = download_tensor( + m->valueCache, + m->num_heads * m->vProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * + MAX_SEQ_LEN); + assert(keyCache_cpu != nullptr); + assert(valueCache_cpu != nullptr); + + float *kcache_cuda = + (float *)calloc(m->kProjSize * MAX_SEQ_LEN * m->num_heads * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + float *vcache_cuda = + (float *)calloc(m->vProjSize * MAX_SEQ_LEN * m->num_heads * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + int index = 0; + for (int i = 0; i < m->kProjSize; i++) { + for (int j = 0; j < MAX_SEQ_LEN; j++) { + for (int k = 0; k < m->num_heads; k++) { + for (int l = 0; l < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; l++) { + int col_major_index = l * m->kProjSize * MAX_SEQ_LEN * m->num_heads + + k * m->kProjSize * MAX_SEQ_LEN + + j * m->kProjSize + i; + kcache_cuda[index++] = keyCache_cpu[col_major_index]; + } + } + } + } + index = 0; + for (int i = 0; i < m->vProjSize; i++) { + for (int j = 0; j < MAX_SEQ_LEN; j++) { + for (int k = 0; k < m->num_heads; k++) { + for (int l = 0; l < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; l++) { + int col_major_index = l * m->vProjSize * MAX_SEQ_LEN * m->num_heads + + k * m->vProjSize * MAX_SEQ_LEN + + j * m->vProjSize + i; + vcache_cuda[index++] = valueCache_cpu[col_major_index]; + } + } + } + } + torch::Tensor K_t_cuda = + torch::from_blob(kcache_cuda, + {m->kProjSize, + MAX_SEQ_LEN, + num_heads, + TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + torch::Tensor V_t_cuda = + torch::from_blob(vcache_cuda, + {m->vProjSize, + MAX_SEQ_LEN, + num_heads, + TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + + // std::cout << "kcache differences:" << std::endl; + // for (int i=0; i < bc->num_active_requests() + 1; i++) { + // for (int j=0; j < num_heads; j++) { + // for (int l=0; l < m->kProjSize; l++) { + // for (int k=0; k < MAX_SEQ_LEN; k++) { + // size_t kcache_idx = + // l * MAX_SEQ_LEN * num_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; if ( + // abs(m->kcache[kcache_idx] - keyCache_cpu[ + // i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // j * m->kProjSize * MAX_SEQ_LEN + + // k * m->kProjSize + + // l + // ]) > 0.00001) { + // printf("req: %i (rid: %i), head: %i, data_dim: %i, token_pos: + // %i\n", + // i, req_idxs[i], j, l, k); + // } + // } + // } + // } + // } + + // std::cout << "keyCache from CUDA:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jkProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // printf("%f ", + // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // j * m->kProjSize * MAX_SEQ_LEN + + // k * m->kProjSize + + // l + // ]); + // } + // 
printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // std::cout << "valueCache from CUDA:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jvProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // printf("%f ", + // valueCache_cpu[ + // i * m->vProjSize * MAX_SEQ_LEN * num_heads + + // j * m->vProjSize * MAX_SEQ_LEN + + // k * m->vProjSize + + // l]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // printf("\n"); + + // std::cout << "C++ kcache:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; j < num_heads; j++) { + // for (int l=0; l < m->kProjSize; l++) { + // for (int k=0; k < MAX_SEQ_LEN; k++) { + // size_t kcache_idx = + // l * MAX_SEQ_LEN * num_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; + // printf("%f ", m->kcache[kcache_idx]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // std::cout << "C++ vcache:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jvProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // size_t vcache_idx = + // l * MAX_SEQ_LEN * num_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; + // printf("%f ", m->vcache[vcache_idx]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + assert(torch::allclose(K_t_cuda, K_t, 1e-05, 1e-05)); + assert(torch::allclose(V_t_cuda, V_t, 1e-05, 1e-05)); + free(kcache_cuda); + free(vcache_cuda); + + // ============================================================================= + // Load the W_out projection weights + // ============================================================================= + + // ----------------------- C++ operations & checks -------------------------- + float *w_out = (float *)calloc(m->vProjSize * m->num_heads * m->oProjSize, + sizeof(float)); + std::vector w_out_shape = {m->vProjSize, m->num_heads, m->oProjSize}; + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + for (int h = 0; h < num_heads; h++) { + for (int v = 0; v < m->vProjSize; v++) { + for (int o = 0; o < m->oProjSize; o++) { + set_value_row_major( + w_out, + w_out_shape, + {v, h, o}, + weight_cpu[all_weight_params * h + 3 * m->qProjSize * m->qSize + + m->vProjSize * o + v]); + } + } + } + // convert weights to torch tensor + torch::Tensor torch_w_out = torch::from_blob( + w_out, {m->vProjSize, m->num_heads, m->oProjSize}, torch::kFloat32); + + // ----------------------- Loading CUDA results for this step --------------- + float *w_out_cuda = download_tensor( + m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_heads); + assert(w_out_cuda != nullptr); + float *converted_wout_tensor = (float *)calloc( + m->vProjSize * m->num_heads * m->oProjSize, sizeof(float)); + std::vector converted_wout_tensor_shape = { + m->vProjSize, m->num_heads, m->oProjSize}; + + for (int i = 0; i < m->vProjSize * m->num_heads * m->oProjSize; i++) { + int v_idx = i % m->vProjSize; + int h_idx = (i / m->vProjSize) % m->num_heads; + int o_idx = i / (m->vProjSize * m->num_heads); + assert(v_idx < m->vProjSize && h_idx < m->num_heads && + o_idx < m->oProjSize); + set_value_row_major(converted_wout_tensor, + 
converted_wout_tensor_shape, + {v_idx, h_idx, o_idx}, + w_out_cuda[i]); + } + torch::Tensor w_out_cuda_tensor = + torch::from_blob(converted_wout_tensor, + {m->vProjSize, m->num_heads, m->oProjSize}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + assert(torch::allclose(w_out_cuda_tensor, torch_w_out, 1e-05, 1e-05)); + free(converted_wout_tensor); + + // ============================================================================= + // Compute the softmax(QK^T/sqrt(d_k))V product, request by request + // ============================================================================= + + // ----------------------- C++ initialization steps ------------------------- + torch::Tensor Q_projs = qkv_projs.index({Slice(), Slice(), 0, Slice()}) + .reshape({qkv_projs.sizes()[0], + qkv_projs.sizes()[1], + qkv_projs.sizes()[3]}); + + torch::Tensor qk_products[bc->num_active_requests()]; + torch::Tensor qk_softmax[bc->num_active_requests()]; + torch::Tensor attn_heads[bc->num_active_requests()]; + + torch::Tensor cpp_output = + torch::zeros({m->oProjSize, bc->num_active_tokens()}); + + // ----------------------- Loading CUDA results for this step --------------- + float *qk_prods_cpu = download_tensor( + m->qk_prods, + TreeVerifyBatchConfig::MAX_NUM_TOKENS * + TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads); + assert(qk_prods_cpu != nullptr); + + float *qk_prods_softmax_cpu = download_tensor( + m->qk_prods_softmax, + TreeVerifyBatchConfig::MAX_NUM_TOKENS * + TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads); + assert(qk_prods_softmax_cpu != nullptr); + + float *attn_heads_cpu = download_tensor( + m->attn_heads, + TreeVerifyBatchConfig::MAX_NUM_TOKENS * m->num_heads * m->vProjSize); + assert(attn_heads_cpu != nullptr); + + // ----------------------- Main loop (request by request) ------------------- + size_t qk_prods_cpu_offset = 0; + + for (size_t r = 0; r < bc->num_active_requests(); r++) { + // Compute pre-request parameters + size_t num_new_tokens = r_num_tokens[r]; + int64_t rid = (int64_t)(req_idxs[r]); + int64_t num_tokens_received_so_far = + (int64_t)(bc->requestsInfo[rid].token_start_offset + + bc->requestsInfo[rid].num_tokens_in_batch); + assert(num_new_tokens == bc->requestsInfo[rid].num_tokens_in_batch); + assert(num_tokens_received_so_far >= (int64_t)num_new_tokens); + + // ----------------------- C++ computations ------------------------------- + // Get the slice of the Q projection tensor with the tokens in the current + // request + torch::Tensor Q_req = + Q_projs.index({Slice(), + Slice(r_first_idx[r], r_first_idx[r] + num_new_tokens), + Slice()}); + // std::cout << "Q_req.sizes(): " << Q_req.sizes() << std::endl; + assert(Q_req.sizes()[0] == m->qProjSize); + assert(Q_req.sizes()[1] == num_new_tokens); + assert(Q_req.sizes()[2] == num_heads); + + /*printf("\n------------ QK multiplication (C++) -------------\n"); + printf("Request r=%lu. 
num_new_tokens: %lu, num_tokens_received_so_far: %li, + rid: %li, Qproj slice: (%i, %i)\n", r, num_new_tokens, + num_tokens_received_so_far, rid, r_first_idx[r], r_first_idx[r] + + num_new_tokens); + + std::cout << "Q_req matrix (idk dims):" << std::endl << + Q_req.index({Slice(), Slice(), 0}) << std::endl << std::endl; std::cout << + "K_t matrix (ilk dims):" << std::endl << K_t.index({Slice(), Slice(0, + num_tokens_received_so_far), 0, rid}) << std::endl << std::endl; std::cout + << "C++ alpha: " << (1.0f / sqrt(m->kProjSize)) << std::endl;*/ + + // Compute (Q*K^T)/sqrt(d_k) matmul + qk_products[r] = + torch::einsum("ijk,ilk->jlk", + {Q_req, + K_t.index({Slice(), + Slice(0, num_tokens_received_so_far), + Slice(), + rid})}) * + (1.0f / sqrt(m->kProjSize)); + + // Set entries above diagonal to -inf to make attention causal. + for (int h = 0; h < num_heads; h++) { + qk_products[r].index( + {Slice(), Slice(num_tokens_received_so_far - num_new_tokens), h}) = + qk_products[r] + .index({Slice(), + Slice(num_tokens_received_so_far - num_new_tokens), + h}) + .tril() + + torch::full({(int64_t)num_new_tokens, (int64_t)num_new_tokens}, + -INFINITY) + .triu() + .fill_diagonal_(0); + } + // Compute softmax for each request block + qk_softmax[r] = torch::softmax(qk_products[r], -2); + assert(qk_softmax[r].sizes()[0] == num_new_tokens); + assert(qk_softmax[r].sizes()[1] == num_tokens_received_so_far); + assert(qk_softmax[r].sizes()[2] == m->num_heads); + + // ------------------- Loading CUDA results for this step --------------- + float *converted_qk_prod = (float *)calloc( + num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + float *converted_qk_prod_softmax = (float *)calloc( + num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + std::vector converted_qk_prod_shape = { + (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_heads}; + + for (size_t i = 0; + i < num_new_tokens * num_tokens_received_so_far * num_heads; + i++) { + size_t new_t_idx = i % num_new_tokens; + size_t all_t_idx = (i / num_new_tokens) % num_tokens_received_so_far; + size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); + assert(new_t_idx < num_new_tokens && + all_t_idx < num_tokens_received_so_far && head_idx < num_heads); + set_value_row_major(converted_qk_prod, + converted_qk_prod_shape, + {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, + qk_prods_cpu[i + qk_prods_cpu_offset]); + set_value_row_major(converted_qk_prod_softmax, + converted_qk_prod_shape, + {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, + qk_prods_softmax_cpu[i + qk_prods_cpu_offset]); + } + torch::Tensor qk_prods_cuda = torch::from_blob( + converted_qk_prod, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + torch::kFloat32); + torch::Tensor qk_prods_softmax_cuda = torch::from_blob( + converted_qk_prod_softmax, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + torch::kFloat32); + + // ------------------- Comparing C++ & CUDA results ------------------ + /* std::cout << "C++:" <vProjSize); + assert( + V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) + .sizes()[1] == num_tokens_received_so_far); + assert( + V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) + .sizes()[2] == m->num_heads); + attn_heads[r] = torch::einsum( + "ijk,ljk->ilk", + {qk_softmax[r], + V_t.index( + {Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid})}); + assert(attn_heads[r].sizes()[0] == num_new_tokens); + 
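  // Illustration of the causal mask applied above (rows = new tokens of this
  // request, columns = all tokens received so far). With, e.g., 2 previously
  // committed tokens and 3 new tokens, the additive mask on the 3 x 5 score
  // block is
  //
  //   [ 0  0 |  0   -inf  -inf ]
  //   [ 0  0 |  0    0    -inf ]
  //   [ 0  0 |  0    0     0   ]
  //
  // i.e. each new token attends to every previously received token plus the
  // new tokens up to and including itself; the softmax then zeroes out the
  // -inf entries.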
assert(attn_heads[r].sizes()[1] == m->vProjSize); + assert(attn_heads[r].sizes()[2] == m->num_heads); + + // ------------------- Loading CUDA results for this step --------------- + float converted_attn_heads_cpu[num_new_tokens][m->vProjSize][m->num_heads] = + {0}; + for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_heads; i++) { + int token_ix = i % num_new_tokens; + int vproj_idx = (i / num_new_tokens) % m->vProjSize; + int head_idx = i / (num_new_tokens * m->vProjSize); + assert(token_ix < num_new_tokens && vproj_idx < m->vProjSize && + head_idx < m->num_heads); + converted_attn_heads_cpu[token_ix][vproj_idx][head_idx] = + attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_heads + i]; + } + torch::Tensor converted_attn_heads_cuda = + torch::from_blob(converted_attn_heads_cpu, + {(int64_t)num_new_tokens, m->vProjSize, m->num_heads}, + torch::kFloat32); + + // -------------------- Comparing C++ & CUDA results ------------------- + /* std::cout << "CUDA attn head for req " << r << ":" <num_heads; h++) { + std::cout << converted_attn_heads_cuda.index({Slice(), Slice(), h}) << + std::endl; + } + std::cout << "C++ attn head for req " << r << ":" <num_heads; h++) { + std::cout << attn_heads[r].index({Slice(), Slice(), h}) << std::endl; + } */ + assert(torch::allclose( + converted_attn_heads_cuda, attn_heads[r], 1e-05, 1e-05)); + + // ----------------------- C++ computations ---------------------------- + // Compute output values by projecting all heads to output space + cpp_output.index( + {Slice(), + Slice(r_first_idx[r], r_first_idx[r] + (int64_t)num_new_tokens)}) = + torch::einsum("jkl,ijk->li", {torch_w_out, attn_heads[r]}); + + // increment main loop's auxiliary index + qk_prods_cpu_offset += + num_new_tokens * num_tokens_received_so_far * num_heads; + } + + // ----------------------- Comparing C++ & CUDA results --------------------- + /* std::cout << "C++:" <oProjSize; i++) { + std::cout << cpp_output.index({i, Slice()}) << std::endl; + } + std::cout << "CUDA:" <oProjSize; i++) { + std::cout << torch_out_cuda.index({i, Slice(0, + (int64_t)bc->num_active_tokens())}) << std::endl; + } */ + + assert(torch::allclose( + torch_out_cuda.index( + {Slice(), Slice(0, (int64_t)bc->num_active_tokens())}), + cpp_output, + 1e-05, + 1e-05)); + + // ============================================================================= + // Cleanup + // ============================================================================= + free(w_out); + checkCUDA(cudaFreeHost(input_cpu)); + checkCUDA(cudaFreeHost(weight_cpu)); + checkCUDA(cudaFreeHost(output_cpu)); + checkCUDA(cudaFreeHost(QKVProjArray_cpu)); + checkCUDA(cudaFreeHost(keyCache_cpu)); + checkCUDA(cudaFreeHost(valueCache_cpu)); + checkCUDA(cudaFreeHost(qk_prods_cpu)); + checkCUDA(cudaFreeHost(qk_prods_softmax_cpu)); + checkCUDA(cudaFreeHost(attn_heads_cpu)); + checkCUDA(cudaFreeHost(w_out_cuda)); + // assert(false && "All good if you see this assert failure! 
:)"); +#endif + // Done with INFERENCE_TESTS block +} + +void IncMultiHeadSelfAttentionVerify::backward(FFModel const &ff) { + // IncMultiHeadSelfAttentionVerify does not support backward + assert(false); +} + +bool IncMultiHeadSelfAttentionVerify::get_int_parameter(PMParameter para, + int *value) const { + switch (para) { + case PM_NUM_HEADS: + *value = num_heads; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool IncMultiHeadSelfAttentionVerify::measure_operator_cost( + Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(IncMultiHeadSelfAttentionVerifyParams const &lhs, + IncMultiHeadSelfAttentionVerifyParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && + lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && + lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && + lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && + lhs.add_zero_attn == rhs.add_zero_attn && + lhs.apply_rotary_embedding == rhs.apply_rotary_embedding; +} + +IncMultiHeadSelfAttentionVerifyParams + IncMultiHeadSelfAttentionVerify::get_params() const { + IncMultiHeadSelfAttentionVerifyParams params; + params.layer_guid = this->layer_guid; + params.embed_dim = this->oProjSize; + params.num_heads = this->num_heads; + params.kdim = this->kProjSize; + params.vdim = this->vProjSize; + params.dropout = this->dropout; + params.bias = this->bias; + params.add_bias_kv = this->add_bias_kv; + params.add_zero_attn = this->add_zero_attn; + params.apply_rotary_embedding = this->apply_rotary_embedding; + return params; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::IncMultiHeadSelfAttentionVerifyParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.embed_dim); + hash_combine(key, params.num_heads); + hash_combine(key, params.kdim); + hash_combine(key, params.vdim); + hash_combine(key, params.dropout); + hash_combine(key, params.bias); + hash_combine(key, params.add_bias_kv); + hash_combine(key, params.add_zero_attn); + hash_combine(key, params.apply_rotary_embedding); + return key; +} +}; // namespace std diff --git a/src/ops/inc_mha_verify.cpp b/src/ops/inc_mha_verify.cpp new file mode 100644 index 0000000000..42dccfd6cc --- /dev/null +++ b/src/ops/inc_mha_verify.cpp @@ -0,0 +1,75 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/inc_mha_verify.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +/*static*/ +void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( + IncMultiHeadSelfAttentionVerifyMeta const *m, + TreeVerifyBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + + handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY); + + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + printf("IncMultiHeadSelfAttentionVerify forward time = %.2fms\n", elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( + FFHandler handler, + IncMultiHeadSelfAttentionVerify const *attn, + float const *weight_ptr, + Memory gpu_mem, + int num_samples, + int _num_heads) + : OpMeta(handler, attn) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(handler.dnn, stream)); +} + +IncMultiHeadSelfAttentionVerifyMeta::~IncMultiHeadSelfAttentionVerifyMeta( + void) {} + +}; // namespace FlexFlow diff --git a/src/ops/inc_mha_verify.cu b/src/ops/inc_mha_verify.cu new file mode 100644 index 0000000000..c841addde9 --- /dev/null +++ b/src/ops/inc_mha_verify.cu @@ -0,0 +1,942 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flexflow/ops/inc_mha_verify.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +__global__ void mha_verify_build_w_out_tensor(float const *weight_ptr, + float *contiguous_weight_ptr, + int vProjSize, + int oProjSize, + int num_heads, + int qkv_weight_block_size) { + CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { + int v_idx = i % vProjSize; + int o_idx = (i / vProjSize) % oProjSize; + int head_idx = i / (vProjSize * oProjSize); + contiguous_weight_ptr[o_idx * vProjSize * num_heads + head_idx * vProjSize + + v_idx] = + weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + + qkv_weight_block_size + o_idx * vProjSize + v_idx]; + } +} + +__global__ void commit_tokens_kernel( + float const *devQKVProjArray, + float *cache_ptr, + TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int num_heads, + int max_seq_len, + bool k_cache) { + + CUDA_KERNEL_LOOP(i, + num_tokens * (k_cache ? kProjSize : vProjSize) * num_heads) { + int proj_size = k_cache ? kProjSize : vProjSize; + int data_idx = i % proj_size; + int head_idx = i / (num_tokens * proj_size); + int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; + token_idx = committedTokenInfos[token_idx].token_index; + + int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + int current_head_block_size = + num_tokens * (k_cache ? qProjSize : qProjSize + kProjSize); + float val = + devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + + token_idx * proj_size + data_idx]; + // int const req_id = id_map[token_idx].request_index; + // int const tok_id = id_map[token_idx].token_position; + int const req_id = committedTokenInfos[token_idx].request_index; + int const tok_id = committedTokenInfos[token_idx].token_depth; + + cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } +} + +void commit_tokens(IncMultiHeadSelfAttentionVerifyMeta const *m, + TreeVerifyBatchConfig const *bc, + cudaStream_t stream) { + int num_tokens_to_commit = bc->num_tokens_to_commit; + if (num_tokens_to_commit > 0) { + int parallelism = m->kProjSize * num_tokens_to_commit * m->num_heads; + commit_tokens_kernel<<>>(m->devQKVProjArray, + m->keyCache, + m->committed_token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens_to_commit, + m->num_heads, + MAX_SEQ_LEN, + /* k_cache = */ true); + + parallelism = m->vProjSize * num_tokens_to_commit * m->num_heads; + commit_tokens_kernel<<>>(m->devQKVProjArray, + m->valueCache, + m->committed_token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens_to_commit, + m->num_heads, + MAX_SEQ_LEN, + /* k_cache = */ false); + } +} + +__global__ void mha_verify_apply_rotary_embedding( + float *input_ptr, + cuFloatComplex *complex_input, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int q_block_size, + int k_block_size, + int v_block_size, + bool q_tensor) { + int proj_size = q_tensor ? 
qProjSize : kProjSize; + CUDA_KERNEL_LOOP(i, num_tokens * proj_size * num_heads / 2) { + // create complex number + int head_idx = i / (num_tokens * proj_size / 2); + int idx = i % (num_tokens * proj_size / 2); + int real_part_index = + idx * 2 + head_idx * (q_block_size + k_block_size + v_block_size) + + (q_tensor ? 0 : q_block_size); + int complex_part_index = real_part_index + 1; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + // int head_idx = i / (num_tokens * proj_size); + int token_idx = + (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + // size_t pos = id_map[token_idx].token_position; + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + + int pos_i = i % (proj_size / 2); + float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[real_part_index + 1] = complex_input[i].y; + + // if (i % 64 == 1 && head_idx == 0) { + // printf("head id: %d, tokenid: %d, pospospos:-> %d, before real part + // %f, " + // "before complex part: %f, real part: %f," + // "complext part: %f, freq_cis real: %f, freq_cis commplexx + // %f\n", head_idx, token_idx, pos, before_real, before_complex, + // complex_input[i].x, + // complex_input[i].y, + // complex_pos.x, + // complex_pos.y); + // } + } +} + +void inference_kernel1(IncMultiHeadSelfAttentionVerifyMeta const *m, + TreeVerifyBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + cudaStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + float alpha = 1.0f, beta = 0.0f; + assert(m->qSize == m->vSize && m->qSize == m->kSize); + cudaDataType_t data_type = ff_to_cuda_datatype(DT_FLOAT); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) + // Weights: qSize x qProjSize x 3 x num_heads + // Input: qSize x num_tokens + // Output >>> qProjSize x num_tokens x 3 x num_heads + int m_q = m->qProjSize; + int m_k = m->kProjSize; + int m_v = m->vProjSize; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_tokens(); + int k = m->qSize; + int lda = k, ldb = k, ldc_q = m_q, ldc_k = m_k, ldc_v = m_v; + size_t strideA = + m->weights_params; // need to also skip over all the parameters for each + // head, plus the unused W_o weights + size_t strideB = 0; // input stays the same for all heads. + size_t strideC = + (m_q + m_k + m_v) * n; // size of the output block for each head. 
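  // What the three strided-batched GEMMs below compute, per head h. This is a
  // plain single-precision reference sketch for the Q projection only (the
  // loop variables are illustrative, not part of the patch; the actual calls
  // may accumulate in FP16 per the compute_type chosen above):
  //
  //   for (int col = 0; col < n; col++)        // one column per active token
  //     for (int row = 0; row < m_q; row++) {  // one row per Q projection dim
  //       float acc = 0.0f;
  //       for (int kk = 0; kk < k; kk++)
  //         acc += weight_ptr[h * strideA + row * k + kk]   // W_q of head h, transposed
  //              * input_ptr[col * k + kk];                  // shared input (strideB == 0)
  //       output_ptr[h * strideC + col * m_q + row] = acc;   // head h's Q output block
  //     }
  //
  // The K and V projections are identical except that the weight pointer is
  // offset by m_q * k and (m_q + m_k) * k, and the output pointer by m_q * n
  // and (m_q + m_k) * n, respectively.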
+ // Q + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_q, + n, + k, + &alpha, + weight_ptr, + data_type, + lda, + strideA, + input_ptr, + data_type, + ldb, + strideB, + &beta, + output_ptr, + data_type, + ldc_q, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_k, + n, + k, + &alpha, + weight_ptr + m_q * k, + data_type, + lda, + strideA, + input_ptr, + data_type, + ldb, + strideB, + &beta, + output_ptr + m_q * n, + data_type, + ldc_k, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // V + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_v, + n, + k, + &alpha, + weight_ptr + (m_q + m_k) * k, + data_type, + lda, + strideA, + input_ptr, + data_type, + ldb, + strideB, + &beta, + output_ptr + (m_q + m_k) * n, + data_type, + ldc_v, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // apply rotary emmmbedding for k and v + // step1 change the k, v to complex tensor + int num_tokens = bc->num_active_tokens(); + int parallelism = m->kProjSize * num_tokens * m->num_heads; + int q_block_size = m->qProjSize * num_tokens; + int k_block_size = m->kProjSize * num_tokens; + int v_block_size = m->vProjSize * num_tokens; + cuFloatComplex *complex_input; + if (*m->apply_rotary_embedding) { + checkCUDA(cudaMalloc(&complex_input, + num_tokens * m->qProjSize * m->num_heads * + sizeof(cuFloatComplex *) / 2)); + /*q*/ + mha_verify_apply_rotary_embedding<<>>(output_ptr, + complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + true); + /*k*/ + mha_verify_apply_rotary_embedding<<>>(output_ptr, + complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + false); + } +} + +__global__ void initial_store_kv_cache( + float const *devQKVProjArray, + float *cache_ptr, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int num_heads, + int max_seq_len, + bool k_cache) { + CUDA_KERNEL_LOOP(i, + num_tokens * (k_cache ? kProjSize : vProjSize) * num_heads) { + int proj_size = k_cache ? kProjSize : vProjSize; + int head_idx = i / (num_tokens * proj_size); + int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; + int data_idx = i % proj_size; + + // only store the first branch initially, to avoid overwriting + if (tokenInfos[token_idx].tree_branch_idx == 0) { + int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + int current_head_block_size = + num_tokens * (k_cache ? 
qProjSize : qProjSize + kProjSize); + float val = + devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + + token_idx * proj_size + data_idx]; + // int const req_id = id_map[token_idx].request_index; + // int const tok_id = id_map[token_idx].token_position; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } + } +} + +__global__ void update_tree_branch_kv_cache( + float const *devQKVProjArray, + float *cache_ptr, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens_in_branch, + int num_tokens_previous_branches, + int num_tokens_previous_requests, + int total_tokens_in_batch, + int num_heads, + int max_seq_len, + bool k_cache) { + CUDA_KERNEL_LOOP( + i, num_tokens_in_branch * (k_cache ? kProjSize : vProjSize) * num_heads) { + int proj_size = k_cache ? kProjSize : vProjSize; + int data_idx = i % proj_size; + int token_idx = + (i / proj_size) % num_tokens_in_branch; // index in the tree branch + int head_idx = i / (proj_size * num_tokens_in_branch); + + token_idx += num_tokens_previous_branches; // get index in the whole request + token_idx += num_tokens_previous_requests; // get index in the whole batch + int qkv_block_size = (qProjSize + kProjSize + vProjSize) * + total_tokens_in_batch; // skip over previous heads + int current_head_block_size = + total_tokens_in_batch * + (k_cache ? qProjSize + : qProjSize + kProjSize); // skip over Q entries (and K entries + // if we are working on the V cache) + float val = + devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + + token_idx * proj_size + data_idx]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } +} + +void inference_kernel2(IncMultiHeadSelfAttentionVerifyMeta const *m, + TreeVerifyBatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_tokens(); + if (num_tokens > 0) { + int parallelism = m->kProjSize * num_tokens * m->num_heads; + initial_store_kv_cache<<>>(m->devQKVProjArray, + m->keyCache, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + MAX_SEQ_LEN, + /* k_cache = */ true); + + parallelism = m->vProjSize * num_tokens * m->num_heads; + initial_store_kv_cache<<>>(m->devQKVProjArray, + m->valueCache, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + MAX_SEQ_LEN, + /* k_cache = */ false); + } +} + +__global__ void + mha_verify_fill_entries_above_diagonal(float *matrix, + size_t num_rows, + size_t num_cols, + size_t num_heads, + size_t entries_above_diagonal, + float value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + +void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, + TreeVerifyBatchConfig const *bc, + float *output_ptr, + cudaStream_t stream) 
{ + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int tokens_prev_requests_squares = 0; + int qkv_block_size = + (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + int kt_block_size = m->kProjSize * MAX_SEQ_LEN; + int kt_req_block_size = kt_block_size * m->num_heads; + int vt_block_size = m->vProjSize * MAX_SEQ_LEN; + int vt_req_block_size = vt_block_size * m->num_heads; + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + + int tokens_previous_tree_branches = 0; + + while (tokens_previous_tree_branches < + bc->requestsInfo[i].num_tokens_in_batch) { + int tree_branch_idx = bc->tokensInfo[tokens_previous_requests + + tokens_previous_tree_branches] + .tree_branch_idx; + int num_new_tokens = 1; + for (int j = tokens_previous_requests + tokens_previous_tree_branches + 1; + j < + tokens_previous_requests + bc->requestsInfo[i].num_tokens_in_batch; + j++) { + if (bc->tokensInfo[j].tree_branch_idx != tree_branch_idx) { + break; + } else { + num_new_tokens++; + } + } + int total_tokens = bc->tokensInfo[tokens_previous_requests + + tokens_previous_tree_branches] + .abs_depth_in_request + + num_new_tokens; + assert(num_new_tokens >= 1 && total_tokens >= num_new_tokens); + + if (tree_branch_idx == 0) { + assert(bc->tokensInfo[tokens_previous_requests].abs_depth_in_request == + bc->requestsInfo[i].token_start_offset); + } else { + // update K-V cache + int parallelism = m->kProjSize * num_new_tokens * m->num_heads; + update_tree_branch_kv_cache<<>>( + m->devQKVProjArray, + m->keyCache, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_new_tokens, // num_tokens_in_branch + tokens_previous_tree_branches, // num_tokens_previous_branches + tokens_previous_requests, // num_tokens_previous_requests + bc->requestsInfo[i].num_tokens_in_batch, // total_tokens_in_batch + m->num_heads, + MAX_SEQ_LEN, + /* k_cache = */ true); + + parallelism = m->vProjSize * num_new_tokens * m->num_heads; + update_tree_branch_kv_cache<<>>( + m->devQKVProjArray, + m->valueCache, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_new_tokens, // num_tokens_in_branch + tokens_previous_tree_branches, // num_tokens_previous_branches + tokens_previous_requests, // num_tokens_previous_requests + bc->requestsInfo[i].num_tokens_in_batch, // total_tokens_in_batch + m->num_heads, + MAX_SEQ_LEN, + /* k_cache = */ false); + } + + // int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + /* int total_tokens = bc->requestsInfo[i].token_start_offset + + bc->requestsInfo[i].num_tokens_in_batch; */ + // bc->token_last_available_idx[i] + 1; + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k, ldb = k, ldc = m_; + int strideA = qkv_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + float alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + // To get A, skip over Q entries from previous requests (same head) + void 
const *A = (void const *)(m->devQKVProjArray + + tokens_previous_requests * m->qProjSize); + // To get B, skip over K entries from previous requests (all heads + + // padding) + void const *B = (void const *)(m->keyCache + i * kt_req_block_size); + // To get C, skip over QK^T products from previous requests + void *C = + (void *)(m->qk_prods + m->num_heads * tokens_prev_requests_squares); + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_heads * entries_above_diagonal; + mha_verify_fill_entries_above_diagonal<<>>( + (float *)C, + num_new_tokens, + total_tokens, + m->num_heads, + entries_above_diagonal, + -INFINITY); + } + // Compute Softmax(QK^T/sqrt(d_k)) + cudnnTensorDescriptor_t qk_tensor; + checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, + CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, + n_param, + c_param, + h_param, + w_param)); + alpha = 1.0f, beta = 0.0f; + void *C_softmax = (void *)(m->qk_prods_softmax + + m->num_heads * tokens_prev_requests_squares); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. 
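+  // Given the descriptor set above (N = num_heads, C = total_tokens, H = 1,
+  // W = num_new_tokens), the per-channel softmax normalizes across all
+  // total_tokens keys for each (head, query position) pair; the entries set
+  // to -INFINITY above the diagonal therefore come out as zero probabilities,
+  // enforcing the causal mask.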
+ checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + qk_tensor, + (void *)((float *)C), + &beta, + qk_tensor, + (void *)((float *)C_softmax))); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = num_new_tokens; + n = m->vProjSize; + k = total_tokens; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = vt_block_size; + strideC = num_new_tokens * m->vProjSize; + // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + A = (void const *)C_softmax; + // To get B, skip over V^T entries from previous requests (all heads + + // padding) + B = (void const *)(m->valueCache + i * vt_req_block_size); + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + C = (void *)(m->attn_heads + + tokens_previous_requests * m->num_heads * m->vProjSize); + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Project to output, save result directly on output tensor + alpha = 1.0f, beta = 0.0f; + m_ = m->oProjSize; + k = m->vProjSize * m->num_heads; + n = num_new_tokens; + lda = k, ldb = n, ldc = m_; + A = (void const *)m->W_out_contiguous; + B = (void const *)C; + C = (void *)(output_ptr + tokens_previous_requests * m->oProjSize); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + tokens_previous_tree_branches += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; + } + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + + assert(tokens_previous_requests == num_tokens); +} + +/*static*/ +void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( + IncMultiHeadSelfAttentionVerifyMeta const *m, + TreeVerifyBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // copy committed tokens info to GPU for the commit_tokens kernel + cudaMemcpyAsync(m->committed_token_infos, + &(bc->commited_tokens), + bc->MAX_NUM_TOKENS * + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), + cudaMemcpyHostToDevice, + stream); + + // reload the weight_o + + if (!(*m->has_load_weights)) { + int parallelism = m->vProjSize * m->oProjSize * m->num_heads; + mha_verify_build_w_out_tensor<<>>(weight_ptr, + m->W_out_contiguous, + m->vProjSize, + m->oProjSize, + m->num_heads, + (m->qSize * m->qProjSize + + m->kSize * m->kProjSize + + m->vSize * m->vProjSize)); + *m->has_load_weights = true; + } + // here because we need postion info in infernece 1 + cudaMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * + sizeof(TreeVerifyBatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); + + // phase 2: Update key/val 
cache + inference_kernel2(m, bc, stream); + + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + inference_kernel3(m, bc, output_ptr, stream); + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiHeadSelfAttentionVerify forward time = %.2fms\n", elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( + FFHandler handler, + IncMultiHeadSelfAttentionVerify const *attn, + float const *weight_ptr, + Memory gpu_mem, + int num_samples, + int _num_heads) + : OpMeta(handler, attn) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(cudnnSetStream(handler.dnn, stream)); + + qSize = attn->qSize; + kSize = attn->kSize; + vSize = attn->vSize; + // assume dimensions match for now + assert(qSize == kSize); + assert(kSize == vSize); + qProjSize = attn->qProjSize; + kProjSize = attn->kProjSize; + assert(qProjSize == kProjSize); // required for attention QK^T matmul + vProjSize = attn->vProjSize; + oProjSize = attn->oProjSize; + + num_heads = _num_heads; + weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + + oProjSize * (vProjSize > 0 ? vProjSize : vSize)); + weightSize = weights_params * num_heads * sizeof(float); + has_load_weights = (bool *)calloc(1, sizeof(bool)); + *has_load_weights = false; + apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); + *apply_rotary_embedding = attn->apply_rotary_embedding; + // Currently do not support adding bias to key/value projection + assert(!attn->add_bias_kv); + +#ifdef INFERENCE_TESTS + kcache = (float *)calloc(kProjSize * MAX_SEQ_LEN * num_heads * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + vcache = (float *)calloc(vProjSize * MAX_SEQ_LEN * num_heads * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); +#endif + + // allocate memory for the seqArray and reserve space + { + size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; + size_t qkv_max_proj_size = + TreeVerifyBatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; + size_t committed_tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; + size_t key_cache_size = num_heads * kProjSize * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS * + MAX_SEQ_LEN; + size_t value_cache_size = num_heads * vProjSize * + TreeVerifyBatchConfig::MAX_NUM_REQUESTS * + MAX_SEQ_LEN; + size_t tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; + size_t qk_prod_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS * + TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads; + size_t attn_heads_size = + TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; + size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); + size_t W_out_contiguous_size = W_out_block_size * num_heads; + size_t totalSize = + (qkv_max_proj_size + key_cache_size + value_cache_size + + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * + sizeof(float) + + tokeninfo_size * sizeof(TreeVerifyBatchConfig::PerTokenInfo) + + committed_tokeninfo_size * + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); + + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(totalSize - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(reserveInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + devQKVProjArray = (float *)reserveInst.pointer_untyped(0, sizeof(char)); + committed_token_infos = + (TreeVerifyBatchConfig::CommittedTokensInfo *)(devQKVProjArray + + qkv_max_proj_size); + keyCache = (float *)(committed_token_infos + committed_tokeninfo_size); + valueCache = (float *)keyCache + key_cache_size; + token_infos = + (TreeVerifyBatchConfig::PerTokenInfo *)(valueCache + value_cache_size); + qk_prods = (float *)(token_infos + tokeninfo_size); + qk_prods_softmax = (float *)(qk_prods + qk_prod_size); + attn_heads = (float *)qk_prods_softmax + qk_prod_size; + W_out_contiguous = (float *)attn_heads + attn_heads_size; + int parallelism = vProjSize * oProjSize * num_heads; + mha_verify_build_w_out_tensor<<>>( + weight_ptr, + W_out_contiguous, + vProjSize, + oProjSize, + num_heads, + (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); + } + + cudaStreamSynchronize(stream); +} + +IncMultiHeadSelfAttentionVerifyMeta::~IncMultiHeadSelfAttentionVerifyMeta( + void) { + reserveInst.destroy(); +#ifdef INFERENCE_TESTS + free(kcache); + free(vcache); +#endif +} + +}; // namespace FlexFlow diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 5a41962a13..2f2655f589 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -173,4 +173,22 @@ void BatchConfig::print() const { } } +void TreeVerifyBatchConfig::compute_tree_branch_indexes() { + // Must be called only after setting num_tokens! + auto is_first_token_in_request = [&](int token_index) -> bool { + if (token_index == 0) { + return true; // First entry in tokensInfo is the first in a request. 
+ } + return tokensInfo[token_index].request_index != + tokensInfo[token_index - 1].request_index; + }; + for (int i = 0; i < num_tokens; i++) { + if (is_first_token_in_request(i)) { + tokensInfo[i].tree_branch_idx = 0; + } else { + tokensInfo[i].tree_branch_idx = tokensInfo[i - 1].tree_branch_idx + 1; + } + } +} + }; // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index b02150d153..01b116c853 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -147,6 +147,8 @@ std::string get_operator_type_name(OperatorType type) { return "MultiHeadAttention"; case OP_INC_MULTIHEAD_SELF_ATTENTION: return "IncMultiHeadSelfAttention"; + case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: + return "IncMultiHeadSelfAttentionVerify"; case OP_INPUT: return "Input"; case OP_WEIGHT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 866dcd3505..64ced31579 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -30,6 +30,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/inc_mha_verify.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -2274,6 +2275,21 @@ GraphOptimalViewSerialized sez.serialize(attn->apply_rotary_embedding); break; } + case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { + IncMultiHeadSelfAttentionVerify *attn = + (IncMultiHeadSelfAttentionVerify *)op; + sez.serialize(attn->layer_guid.id); + sez.serialize(attn->oProjSize); + sez.serialize(attn->num_heads); + sez.serialize(attn->qProjSize); + sez.serialize(attn->vProjSize); + sez.serialize(attn->dropout); + sez.serialize(attn->bias); + sez.serialize(attn->add_bias_kv); + sez.serialize(attn->add_zero_attn); + sez.serialize(attn->apply_rotary_embedding); + break; + } case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); @@ -2665,6 +2681,39 @@ void FFModel::deserialize_graph_optimal_view( node = get_or_create_node(inputs[0], params); break; } + case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { + assert(num_inputs == 1); + int embed_dim, num_heads, k_dim, v_dim; + float dropout; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + size_t id; + dez.deserialize(id); + LayerID layer_guid(id); + dez.deserialize(embed_dim); + dez.deserialize(num_heads); + dez.deserialize(k_dim); + dez.deserialize(v_dim); + dez.deserialize(dropout); + dez.deserialize(bias); + dez.deserialize(add_bias_kv); + dez.deserialize(add_zero_attn); + dez.deserialize(apply_rotary_embedding); + + IncMultiHeadSelfAttentionVerifyParams params; + params.embed_dim = embed_dim; + params.num_heads = num_heads; + params.kdim = k_dim; + params.vdim = v_dim; + params.dropout = dropout; + params.bias = bias; + params.add_bias_kv = add_bias_kv; + params.add_zero_attn = add_zero_attn; + params.layer_guid = layer_guid; + params.apply_rotary_embedding = apply_rotary_embedding; + node = get_or_create_node(inputs[0], + params); + break; + } case OP_TOPK: { node = TopK::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 12d3bbb18f..d5c3f05851 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -41,6 +41,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/inc_mha_verify.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include 
"flexflow/ops/linear.h" @@ -2768,6 +2769,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { + Op *op = IncMultiHeadSelfAttentionVerify::create_operator_from_layer( + *this, layer, inputs); + operators.push_back(op); + return op; + } case OP_BATCHMATMUL: { Op *op = BatchMatmul::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -4610,6 +4617,27 @@ void register_flexflow_internal_tasks() { IncMultiHeadSelfAttention::inference_task>( registrar, "IncMultiHeadSelfAttention Inference Task"); } + { + TaskVariantRegistrar registrar( + INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, + "IncMultiHeadSelfAttentionVerify Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant< + OpMeta *, + IncMultiHeadSelfAttentionVerify::init_task>( + registrar, "IncMultiHeadSelfAttentionVerify Init Task"); + } + { + TaskVariantRegistrar registrar( + INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INF_TASK_ID, + "IncMultiHeadSelfAttentionVerify Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant< + IncMultiHeadSelfAttentionVerify::inference_task>( + registrar, "IncMultiHeadSelfAttentionVerify Inference Task"); + } // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 201a6449c2..a968e10c60 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -15,6 +15,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/inc_mha_verify.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -82,6 +83,8 @@ tl::optional get_op_parameters(Op const *op) { return ((MultiHeadAttention *)op)->get_params(); case OP_INC_MULTIHEAD_SELF_ATTENTION: return ((IncMultiHeadSelfAttention *)op)->get_params(); + case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: + return ((IncMultiHeadSelfAttentionVerify *)op)->get_params(); case OP_LAYERNORM: return ((LayerNorm *)op)->get_params(); case OP_REDUCE_SUM: diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index df2f8b05b3..c353d8fa16 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -28,6 +28,7 @@ #include "flexflow/ops/embedding.h" #include "flexflow/ops/experts.h" #include "flexflow/ops/flat.h" +#include "flexflow/ops/inc_mha_verify.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -3707,13 +3708,19 @@ bool FFModel::convert_graph_to_operators( new_op = new MultiHeadAttention( *this, *attn, inputs[0], inputs[1], inputs[2], true); break; - break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(inList.size() == 1); IncMultiHeadSelfAttention *attn = (IncMultiHeadSelfAttention *)node.ptr; new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0], true); break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { + assert(inList.size() == 1); + IncMultiHeadSelfAttentionVerify *attn = + (IncMultiHeadSelfAttentionVerify *)node.ptr; + new_op = + new IncMultiHeadSelfAttentionVerify(*this, *attn, inputs[0], true); break; } case OP_RMS_NORM: { @@ -3721,7 +3728,6 @@ bool FFModel::convert_graph_to_operators( RMSNorm *rms = (RMSNorm *)node.ptr; 
new_op = new RMSNorm(*this, *rms, inputs[0], true); break; - break; } case OP_SOFTMAX: { assert(inList.size() == 1); From 7a0ff7aa968f8cf9e3c73b97cec85a656b0b767f Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Fri, 5 May 2023 20:45:03 -0400 Subject: [PATCH 105/344] [Inference] Implement Beam Search for small model speculative inference (#681) * Add beam_topK and specu_inc_multihead_self_attention ops to includes. * topk init not test * add * fix beam topk * fix attention * fix softmax dim * Revert "Beam search xinhao" * fix softmax dimension * fix beam topk, beam slot * fix, add placeholder * fix * parent id * add depth * fix: spec_inc_attn to deal with sub req, naive stealing * minor fix * minor fix * fix * fix * move location * manually change * manual * fix * fix * fix * Add BeamSearchBatchConfig draft. * Add beam_search_batch_config.cc file. * Update BeamSearchBatchConfig * 1 * Update. * Add BeamSearchPerRequestInfo. * fix bug in build_w_out_tensor kernel * change kernels * change request mmanager * fix everything, add tree * fix fix fix fix fix * linting and bug fix * nit * removed placeholder (still works) * restored normal LLAMA * moved spec_llama back to inference folder * hip fix * saved file * md * fix --------- Co-authored-by: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Co-authored-by: xinhaoc Co-authored-by: Rae Wong Co-authored-by: Gabriele Oliaro --- CMakeLists.txt | 4 + examples/cpp/inference/LLAMA/llama.cc | 2 +- .../cpp/inference/SPEC_LLAMA/CMakeLists.txt | 20 + examples/cpp/inference/SPEC_LLAMA/Makefile | 39 + examples/cpp/inference/SPEC_LLAMA/README.md | 14 + examples/cpp/inference/SPEC_LLAMA/llama.cc | 308 ++++++ examples/cpp/inference/SPEC_LLAMA/llama.h | 111 +++ examples/cpp/inference/file_loader.cc | 182 ++++ examples/cpp/inference/file_loader.h | 36 + include/flexflow/batch_config.h | 61 +- include/flexflow/ffconst.h | 2 + include/flexflow/inference.h | 25 + include/flexflow/model.h | 31 + include/flexflow/operator_params.h | 4 + include/flexflow/ops/beam_topk.h | 101 ++ include/flexflow/ops/beam_topk_params.h | 26 + .../ops/spec_inc_multihead_self_attention.h | 154 +++ ...spec_inc_multihead_self_attention_params.h | 31 + include/flexflow/utils/cuda_helper.h | 10 + src/ops/beam_topk.cc | 469 +++++++++ src/ops/beam_topk.cpp | 716 ++++++++++++++ src/ops/beam_topk.cu | 715 ++++++++++++++ src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/kernels/softmax.cu | 3 +- src/ops/spec_inc_multihead_self_attention.cc | 670 +++++++++++++ src/ops/spec_inc_multihead_self_attention.cpp | 74 ++ src/ops/spec_inc_multihead_self_attention.cu | 904 ++++++++++++++++++ src/runtime/beam_search_batch_config.cc | 88 ++ src/runtime/cuda_helper.cu | 99 +- src/runtime/ffconst_utils.cc | 4 + src/runtime/graph.cc | 54 ++ src/runtime/inference_manager.cc | 18 +- src/runtime/model.cc | 51 + src/runtime/operator_params.cc | 7 + src/runtime/request_manager.cc | 383 ++++++++ 35 files changed, 5404 insertions(+), 14 deletions(-) create mode 100644 examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt create mode 100644 examples/cpp/inference/SPEC_LLAMA/Makefile create mode 100644 examples/cpp/inference/SPEC_LLAMA/README.md create mode 100644 examples/cpp/inference/SPEC_LLAMA/llama.cc create mode 100644 examples/cpp/inference/SPEC_LLAMA/llama.h create mode 100644 examples/cpp/inference/file_loader.cc create mode 100644 examples/cpp/inference/file_loader.h create mode 100644 include/flexflow/ops/beam_topk.h create mode 100644 
include/flexflow/ops/beam_topk_params.h create mode 100644 include/flexflow/ops/spec_inc_multihead_self_attention.h create mode 100644 include/flexflow/ops/spec_inc_multihead_self_attention_params.h create mode 100644 src/ops/beam_topk.cc create mode 100644 src/ops/beam_topk.cpp create mode 100644 src/ops/beam_topk.cu create mode 100644 src/ops/spec_inc_multihead_self_attention.cc create mode 100644 src/ops/spec_inc_multihead_self_attention.cpp create mode 100644 src/ops/spec_inc_multihead_self_attention.cu create mode 100644 src/runtime/beam_search_batch_config.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b1f963093..8edad77124 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -544,6 +544,10 @@ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/LLAMA) endif() +if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/inference/SPEC_LLAMA) +endif() + # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index a4924f5406..ac25f70467 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -66,7 +66,7 @@ void FlexFlow::top_level_task(Task const *task, // parse_input_args(argv, argc, llamaConfig); std::cout << "print llama config: " << llamaConfig.input_path << "-->" - << llamaConfig.batchSize; + << llamaConfig.batchSize << std::endl; //------------------------------ build the model -------------------------- Tensor input; diff --git a/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt b/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt new file mode 100644 index 0000000000..d6ceb38ff4 --- /dev/null +++ b/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExample_SPEC_LLAMA) +set(project_target SPEC_LLAMA) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + llama.cc + llama.h + ../file_loader.cc) + + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/SPEC_LLAMA/Makefile b/examples/cpp/inference/SPEC_LLAMA/Makefile new file mode 100644 index 0000000000..32e8e1cf3d --- /dev/null +++ b/examples/cpp/inference/SPEC_LLAMA/Makefile @@ -0,0 +1,39 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= spec_llama +# List all the application source files here +GEN_SRC = llama.cc dataloader.cc +GEN_GPU_SRC = dataloader.cu +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/SPEC_LLAMA/README.md b/examples/cpp/inference/SPEC_LLAMA/README.md new file mode 100644 index 0000000000..daccccc249 --- /dev/null +++ b/examples/cpp/inference/SPEC_LLAMA/README.md @@ -0,0 +1,14 @@ +# an example of running llama model with beam search + +## how to run it? +1. build the flexflow with FF_BUILD_ALL_INFERENCE_EXAMPLES or FF_BUILD_ALL_EXAMPLES +2. download the weight and token file from aws s3. +```bash +aws s3 cp s3://catalyst-llama/7B_weights_float.tar.gz FF_HOME/examples/cpp/inference/SPEC_LLAMA/weights +tar -zxvf 7B_weights_float.tar.gz +aws s3 cp s3://catalyst-llama/tokens.tar FF_HOME/examples/cpp/inference/SPEC_LLAMA/tokens +tar -zxvf tokens.tar +``` +3. run *SPEC_LLAMA* with `--weights` `--dataset` `-b 5` `--only-data-parallel` +4. [expected results](https://github.com/flexflow/FlexFlow/pull/681#issuecomment-1534264054) + diff --git a/examples/cpp/inference/SPEC_LLAMA/llama.cc b/examples/cpp/inference/SPEC_LLAMA/llama.cc new file mode 100644 index 0000000000..0f2095c1ae --- /dev/null +++ b/examples/cpp/inference/SPEC_LLAMA/llama.cc @@ -0,0 +1,308 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "llama.h" +#include "flexflow/inference.h" + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("llama"); + +void parse_input_args(char **argv, int argc, LLAMAConfig &config) { + for (int i = 1; i < argc; i++) { + // input + if (!strcmp(argv[i], "--dataset")) { + config.input_path = std::string(argv[++i]); + continue; + } + + // weights + if (!strcmp(argv[i], "--weights")) { + config.weight_file_path = std::string(argv[++i]); + continue; + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + LLAMAConfig llamaConfig; + FFModel ff(ffconfig); + //------------------------------compute machine views ------------------ + int num_devices = ffconfig.workersPerNode * ffconfig.numNodes; + std::vector machine_views; + for (int i = 0; i < num_devices; i++) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = 1; + view.stride[0] = 0; + view.start_device_id = i; + machine_views.push_back(view); + } + + std::unordered_map> mapping; + std::unordered_map weights_layers; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, llamaConfig); + + std::cout << "print llama config: " << llamaConfig.input_path << "-->" + << llamaConfig.batchSize << std::endl; + + //------------------------------ build the model -------------------------- + Tensor input; + { + int const token_dims[] = {llamaConfig.batchSize, llamaConfig.max_seq_len}; + input = ff.create_tensor<2>(token_dims, DT_INT32); + } + + mapping[input].push_back(machine_views[0]); + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + Tensor token = ff.embedding(input, + llamaConfig.vocab_size, + llamaConfig.dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + Layer *embedding = ff.layers.back(); + weights_layers.emplace("tok_embeddings_weight", embedding); + + // std::cout << "------token shape"; + // std::cout << token->num_dims << "------\n"; + // for (int i = 0; i < token->num_dims; i++) { + // std::cout << token->dims[i] << "------\n"; + // } + + // n transformer blocks impl + int num_transformer_layers_per_gpu = (32 + num_devices - 1) / num_devices; + + for (int i = 0; i < 1; i++) { + // step 1: attention + std::vector axes = {2}; + Tensor att_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); + Layer *attention_norm = ff.layers.back(); + + if (i % num_transformer_layers_per_gpu == 0) { + // Map att_norm to the next GPU + // since the size of att_norm is minimum across + // all tensors + mapping[att_norm].push_back( + machine_views[i / num_transformer_layers_per_gpu]); + } + + weights_layers.emplace("layers_" + std::to_string(i) + + "_attention_norm_weight", + attention_norm); + + // std::cout << "------before att shape"; + // std::cout << att_norm->num_dims << "------\n"; + // for (int i = 0; i < att_norm->num_dims; i++) { + // std::cout << att_norm->dims[i] << "------\n"; + // } + Tensor mha = ff.spec_inc_multihead_self_attention( + att_norm, + llamaConfig.dim, + llamaConfig.n_heads, + llamaConfig.dim / llamaConfig.n_heads, + llamaConfig.dim / llamaConfig.n_heads, + 0.0f, + true, + false, + false, + NULL, + true); + Layer *attention_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", + attention_layer); + token = ff.add(token, mha); + + // step 2: SILU activaion + Tensor ff_norm 
= ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); + Layer *ffn_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_ffn_norm_weight", + ffn_layer); + + Tensor w1 = ff.dense(ff_norm, llamaConfig.hidden_dim, AC_MODE_NONE, false); + Layer *w1_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w1_weight", w1_layer); + + Tensor w3 = ff.dense(ff_norm, llamaConfig.hidden_dim, AC_MODE_NONE, false); + Layer *w3_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w3_weight", w3_layer); + + Tensor sigmoid = ff.sigmoid(w1); + Tensor silu = ff.multiply(w1, sigmoid); + Tensor multi = ff.multiply(silu, w3); + + Tensor w2 = ff.dense(multi, llamaConfig.dim, AC_MODE_NONE, false); + Layer *w2_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w2_weight", w2_layer); + token = ff.add(token, w2); + } + + // final normalization and linear + std::vector axes = {2}; + token = ff.rms_norm(token, 1e-6, 4096); + Layer *final_norm = ff.layers.back(); + weights_layers.emplace("norm_weight", final_norm); + + Tensor dense = ff.dense(token, llamaConfig.vocab_size, AC_MODE_NONE, false); + Layer *final_linear = ff.layers.back(); + weights_layers.emplace("output_weight", final_linear); + + Tensor softmax = ff.softmax(dense, -1); + Tensor output = ff.beam_top_k(softmax, llamaConfig.max_beam_width, false); + + //------------------- compile the model -------------------------------- + std::cout << "------start compile ----------" << std::endl; + InferenceManager im(&ff, llamaConfig.batchSize, 1); + im.compile_model_and_allocate_buffer(&ff, mapping); + RequestManager rm; + + // std::cout << "------init ops----------" << std::endl; + // im.init_operators_inference(); + // std::cout << "------model compiled and init ----------" << std::endl; + + //------------------------------ load inputs -------------------------- + std::cout << "------create dataloaders ----------" << std::endl; + // read prompt into input + ParallelTensor input_pt; + ff.get_parallel_tensor_from_tensor(input, input_pt); + assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); + std::cout << im.tensor_buffer[input_pt].size() << std::endl; + // DataLoader loader(ff, &llamaConfig, im.tensor_buffer[input_pt].at(0)); + + //------------------------------ load weights--------------------------- + // for (auto &v : weights_layers) { + // Tensor weight = v.second->weights[0]; + // std::cout << "weights layer: " << v.first << "\n"; + + // if (weight == NULL) { + // std::cout << "op no weights : " << v.first << "\n"; + // continue; + // } + + // size_t volume = 1; + // std::vector dims_vec; + // for (int i = 0; i < weight->num_dims; i++) { + // dims_vec.push_back(weight->dims[i]); + // volume *= weight->dims[i]; + // } + + // assert(weight->data_type == DT_FLOAT); + // float *data = (float *)malloc(sizeof(float) * volume); + + // if (v.first.find("attention_w") != std::string::npos) { + // loader.load_attention_weights( + // data, volume, v.first, llamaConfig.weight_file_path); + + // } else { + // loader.load_from_file( + // data, volume, llamaConfig.weight_file_path + v.first); + // } + + // ParallelTensor weight_pt; + // ff.get_parallel_tensor_from_tensor(weight, weight_pt); + // weight_pt->set_tensor(&ff, dims_vec, data); + // } + + FileDataLoader fileloader(llamaConfig.input_path, + llamaConfig.weight_file_path); + BatchConfig::TokenId *tokens = 
fileloader.generate_requests( + llamaConfig.batchSize, llamaConfig.max_seq_len); + + for (int i = 0; i < 40; i++) { + std::cout << tokens[i] << ", "; + } + for (int i = 0; i < llamaConfig.batchSize; i++) { + std::cout << "-------" << std::endl; + std::vector prompt( + tokens + i * llamaConfig.max_seq_len, + tokens + (i + 1) * llamaConfig.max_seq_len); + rm.register_new_request(prompt, llamaConfig.sentence_len); + } + + fileloader.load_weights(&ff, weights_layers); + + std::cout << "------load wieght finished----------" << std::endl; + + //------------------------------ do inference, we only have 5 prompts for the + // test case, so simplify the batch_configs with 1 + im.init_operators_inference(&ff); + // entry--------------------------- + int depth = 0; + std::map future_handlers; + std::map batch_configs; + + bool new_req = true; + + while (depth < llamaConfig.max_beam_depth) { + int bid = 0; + if (future_handlers.find(bid) == future_handlers.end()) { + BeamSearchBatchConfig bc; + BeamInferenceResult ir; + bc = rm.prepare_next_batch_beam(bc, ir); + + std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; + FutureMap fm = im.inference(&ff, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; + } else { + // have luanched this bid + Future future = future_handlers[bid]; + if (!future.is_ready(true /*subscribe*/)) { + continue; + } else { + std::cout << "future is ready...." << std::endl; + } + // process end + BeamInferenceResult ir = future.get_result(); + BeamSearchBatchConfig bc = batch_configs[bid]; + depth = bc.beamRequestsInfo[0].current_depth; + bc = rm.prepare_next_batch_beam(bc, ir); + + std::cout << "llama current depth: " << depth << std::endl; + std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; + FutureMap fm = im.inference(&ff, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; + + // tranverse the tree in dfs order; + if (depth >= llamaConfig.max_beam_depth) { + std::cout << "tranverse the tree" + << "\n"; + rm.tranverse_beam_tree(bc); + } + } + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/SPEC_LLAMA/llama.h b/examples/cpp/inference/SPEC_LLAMA/llama.h new file mode 100644 index 0000000000..7fce809073 --- /dev/null +++ b/examples/cpp/inference/SPEC_LLAMA/llama.h @@ -0,0 +1,111 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "file_loader.h" + +using namespace Legion; +using namespace FlexFlow; + +struct LLAMAConfig { + LLAMAConfig(void) { + // todo read from config/param file + n_layers = 32; + vocab_size = 32000; + n_heads = 32; + dim = 4096; + multiple_of = 256; + norm_eps = 1e-6; + total_sentence = 5; + sentence_len = 347; + max_gen_length = 256; + batchSize = 5; + total_requests = 2560; + incremental_mode = true; + sequence_length = MAX_SEQ_LEN; + max_seq_len = 8; + max_beam_width = 3; + max_beam_depth = 3; + + // hidden dim + hidden_dim = 4 * dim; + hidden_dim = int(2 * hidden_dim / 3); + hidden_dim = + multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); + } + int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, + total_sentence, sentence_len, batchSize, total_requests, incremental_mode, + sequence_length, max_gen_length, max_seq_len, max_beam_width, + max_beam_depth; + float norm_eps; + std::string weight_file_path; + std::string input_path; +}; + +// struct Prediction_result{ +// long tokens[MAX_]; +// float probs[MAX_BEAM_SIZE]; +// int parent_ids[MAX_BEAM_SIZE]; +// }; + +// class DataLoader { +// public: +// DataLoader(FFModel &ff, +// LLAMAConfig const *llamaconfig, +// ParallelTensor const &input); +// void next_batch(FFModel &ff, +// BatchConfig *bc, +// std::map &batch_predictions); +// void reset(); +// static void load_entire_dataset(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime); +// static void load_input(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime); + +// template +// static void load_from_file(T *ptr, size_t size, std::string filename); + +// template +// static void load_attention_weights(T *ptr, +// size_t size, +// std::string layer_name, +// std::string weight_path); +// void store_outputs(BatchConfig *bc, +// InferenceResult const &ir, +// std::map &batch_predictions); +// void update_beam_slots(BatchConfig *bc, std::map +// batch_predictions); void update_beam_tree(); + +// public: +// int num_samples, next_index, next_token_idx, next_batch_index; +// std::map> outputs; +// FlexFlow::ParallelTensor full_input, batch_input; +// }; + +// struct SampleIdxs { +// int num_samples; +// int idxs[MAX_NUM_SAMPLES]; +// int token_idx; +// int batch_idx; +// }; + +// struct DataLoaderNextBatchInput { +// BatchConfig const &bc; +// std::map const &prev_batch_preds; +// }; diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc new file mode 100644 index 0000000000..f9f399b464 --- /dev/null +++ b/examples/cpp/inference/file_loader.cc @@ -0,0 +1,182 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "file_loader.h" +#include "flexflow/inference.h" + +#include +using namespace std; + +FileDataLoader::FileDataLoader(std::string _input_path, + std::string _weight_file_path) + : input_path(_input_path), weight_file_path(_weight_file_path){}; + +BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { + + BatchConfig::TokenId *prompts = + (BatchConfig::TokenId *)malloc(sizeof(BatchConfig::TokenId) * 40); + std::cout << "load input from file: " << input_path << std::endl; + std::ifstream in(input_path, std::ios::in | std::ios::binary); + int size = num * length; + std::vector host_array(size); + size_t loaded_data_size = sizeof(long) * size; + + std::cout << "loaded_data_size: " << loaded_data_size << std::endl; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + std::cout << "loaded_data_size: " << loaded_data_size << std::endl; + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load data error"; + return prompts; + } + + assert(size == host_array.size()); + + int index = 0; + int data_index = 0; + + std::cout << "loaded_data_size: " << loaded_data_size << std::endl; + std::cout << host_array.size() << "\n"; + for (auto v : host_array) { + prompts[data_index++] = v; + std::cout << data_index << ", " << (int)v << "\n"; + } + in.close(); + return prompts; +}; + +void load_attention_weights(float *ptr, + size_t size, + std::string layer_name, + std::string weight_path) { + std::string q_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wq_weight"; + std::string k_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wk_weight"; + std::string v_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wv_weight"; + std::string o_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wo_weight"; + std::vector weight_files = {q_file, k_file, v_file, o_file}; + + size_t index = 0; + int file_index = 0; + + // q, k, v, o -> 0, 1, 2, 3 + for (auto file : weight_files) { + std::cout << "file name and index: " << file << "->" << file_index << "\n"; + size_t partial_size = size / 4; + std::ifstream in(file, std::ios::in | std::ios::binary); + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error"; + return; + } + assert(partial_size == host_array.size()); + + size_t one_head_size = 4096 * 128; + size_t data_index = 0; + + for (int i = 0; i < 32; i++) { + size_t start_index = i * one_head_size * 4 + file_index * one_head_size; + for (size_t j = start_index; j < start_index + one_head_size; j++) { + ptr[j] = host_array.at(data_index); + data_index += 1; + } + } + file_index++; + + in.close(); + index++; + } +} + +void load_from_file(float *ptr, size_t size, std::string filename) { + std::cout << "load from file: " << filename << std::endl; + std::ifstream in(filename, std::ios::in | std::ios::binary); + std::vector host_array(size); + size_t loaded_data_size = sizeof(float) * size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + // std::cout << "size seee" << std::endl; + // std::cout << 
loaded_data_size << std::endl; + // std::cout << in_get_size << std::endl; + if (in_get_size != loaded_data_size) { + std::cout << "load data error"; + return; + } + + // std::cout << "finish loading input"; + assert(size == host_array.size()); + + // normal + long data_index = 0; + for (auto v : host_array) { + ptr[data_index++] = v; + } + in.close(); +} + +void FileDataLoader::load_weights( + FFModel *ff, std::unordered_map weights_layers) { + + for (auto &v : weights_layers) { + Tensor weight = v.second->weights[0]; + std::cout << "weights layer: " << v.first << "\n"; + + if (weight == NULL) { + std::cout << "op no weights : " << v.first << "\n"; + continue; + } + + size_t volume = 1; + std::vector dims_vec; + for (int i = 0; i < weight->num_dims; i++) { + dims_vec.push_back(weight->dims[i]); + volume *= weight->dims[i]; + } + + assert(weight->data_type == DT_FLOAT); + float *data = (float *)malloc(sizeof(float) * volume); + + if (v.first.find("attention_w") != std::string::npos) { + load_attention_weights(data, volume, v.first, weight_file_path); + + } else { + load_from_file(data, volume, weight_file_path + v.first); + } + + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + weight_pt->set_tensor(ff, dims_vec, data); + } +} diff --git a/examples/cpp/inference/file_loader.h b/examples/cpp/inference/file_loader.h new file mode 100644 index 0000000000..e1edc3f1a9 --- /dev/null +++ b/examples/cpp/inference/file_loader.h @@ -0,0 +1,36 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "flexflow/batch_config.h" +#include "flexflow/model.h" + +using namespace std; +using namespace FlexFlow; + +class FileDataLoader { +public: + FileDataLoader(std::string _input_path, std::string _weight_file_path); + + BatchConfig::TokenId *generate_requests(int num, int length); + + void load_weights(FFModel *ff, + std::unordered_map weights_layers); + +private: + std::string input_path; + std::string weight_file_path; +}; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index d4150d6c51..db5ff3d485 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -15,6 +15,7 @@ #pragma once +#include #include // #define MAX_SEQ_LEN 1024 @@ -26,6 +27,7 @@ namespace FlexFlow { class InferenceResult; +class BeamInferenceResult; class BatchConfig { public: @@ -48,8 +50,11 @@ class BatchConfig { int num_tokens; struct PerRequestInfo { - int token_start_offset; - int num_tokens_in_batch; + int token_start_offset; // input[token_start_offset * data_dim] is the first + // token + int num_tokens_in_batch; // tokens from input[token_start_offset * data_dim + // : (token_start_offset + num_token_in_batch) * + // data_dim] int max_sequence_length; RequestGuid request_guid; }; @@ -88,4 +93,56 @@ struct InferenceResult { BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; }; +class BeamSearchBatchConfig : public BatchConfig { +public: + BeamSearchBatchConfig(); + BeamSearchBatchConfig(size_t beam_width, size_t target_iterations); + + ~BeamSearchBatchConfig(); + + void print() const; + bool done() const; + + size_t beam_width; + size_t target_iterations; + static int const MAX_BEAM_WIDTH = 3; + static int const MAX_BEAM_DEPTH = 8; + + struct BeamSearchPerRequestInfo { + // int token_start_offset; // input[token_start_offset * data_dim] is the + // first token int num_tokens_in_batch; // tokens from + // input[token_start_offset * data_dim : (token_start_offset + + // num_token_in_batch) * data_dim] int max_sequence_length; RequestGuid + // request_guid; + bool request_completed; + int beam_size; // + int current_depth = -1; + // int global_depth = -1; + int max_depth = MAX_BEAM_DEPTH; + + BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int parent_id[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + }; + + struct BeamSearchPerTokenInfo { + int sub_request_index; + }; + + BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; + BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; + int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; + // BeamSlot beam_slots[MAX_NUM_REQUESTS]; + +private: + size_t current_iteration; +}; + +struct BeamInferenceResult : public InferenceResult { + BatchConfig::TokenId + token_ids[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + float probs[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int parent_id[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; +}; + }; // namespace FlexFlow diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index a64944ab30..86898a1a9b 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -148,6 +148,8 @@ enum OperatorType { OP_EXPERTS, OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html OP_RMS_NORM, + OP_BEAM_TOPK, + OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION, OP_INC_MULTIHEAD_SELF_ATTENTION, OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, // Parallel Ops diff --git a/include/flexflow/inference.h 
b/include/flexflow/inference.h index 0079a570b7..3025d8a748 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -22,6 +22,7 @@ namespace FlexFlow { class FFModel; +class BeamTree; class InferenceManager { public: @@ -52,6 +53,17 @@ struct Request { std::vector tokens; }; +// store the result of beam search +struct BeamTree { + struct treeLayer { + BeamSearchBatchConfig::TokenId + tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + }; + treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH]; +}; + class RequestManager { public: using RequestGuid = BatchConfig::RequestGuid; @@ -61,6 +73,18 @@ class RequestManager { int max_sequence_length); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); + + BeamSearchBatchConfig + prepare_next_batch_beam(BeamSearchBatchConfig const &bc, + BeamInferenceResult const &result); + + void store_beam_metadata(BeamSearchBatchConfig const &old_bc, + BeamInferenceResult const &result); + void update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamTree &tree, + int request_index); + void tranverse_beam_tree(BeamSearchBatchConfig const &old_bc); + static void load_tokens_task(Legion::Task const *task, std::vector const ®ions, @@ -72,6 +96,7 @@ class RequestManager { std::unordered_map running_request_queue; std::mutex request_queue_mutex; RequestGuid next_available_guid; + struct BeamTree beam_trees[BatchConfig::MAX_NUM_REQUESTS]; }; } // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index c213af9ac8..70a631f9a8 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -143,6 +143,10 @@ enum TaskIDs { ATTENTION_BWD_TASK_ID, RMSNROM_INIT_TASK_ID, RMSNROM_FWD_TASK_ID, + BEAM_TOPK_INIT_TASK_ID, + BEAM_TOPK_INF_TASK_ID, + SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, @@ -299,6 +303,8 @@ class TopK; class ArgTopK; class Transpose; class RMSNorm; +class BeamTopK; +class SpecIncMultiHeadSelfAttention; class Combine; class Repartition; class Reduction; @@ -513,6 +519,12 @@ class FFModel { // Add a root mean square layer Tensor rms_norm(const Tensor input, float eps, int dim, char const *name = NULL); + // Add a beam search top k layer + Tensor beam_top_k(const Tensor input, + int max_beam_size, + bool sorted, + char const *name = NULL); + // Add a dense layer Tensor dense(const Tensor input, int outDim, @@ -607,6 +619,19 @@ class FFModel { Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, char const *name = NULL); + Tensor + spec_inc_multihead_self_attention(const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = true, + bool add_bias_kv = false, + bool add_zero_attn = false, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + char const *name = NULL); Tensor inc_multihead_self_attention_verify( const Tensor input, int embed_dim, @@ -620,6 +645,7 @@ class FFModel { Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, char const *name = NULL); + Tensor create_tensor_legion_ordering(int num_dim, int const dims[], DataType data_type, @@ -982,6 +1008,11 @@ class FFModel { std::unordered_map< std::pair, 
IncMultiHeadSelfAttention *>, + std::unordered_map, + BeamTopK *>, + std::unordered_map< + std::pair, + SpecIncMultiHeadSelfAttention *>, std::unordered_map< std::pair, IncMultiHeadSelfAttentionVerify *>, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index ad0e15ce46..a417f6579f 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -6,6 +6,7 @@ #include "flexflow/ops/arg_topk_params.h" #include "flexflow/ops/attention_params.h" #include "flexflow/ops/batch_matmul_params.h" +#include "flexflow/ops/beam_topk_params.h" #include "flexflow/ops/cast_params.h" #include "flexflow/ops/concat_params.h" #include "flexflow/ops/conv_2d_params.h" @@ -26,6 +27,7 @@ #include "flexflow/ops/reshape_params.h" #include "flexflow/ops/rms_norm_params.h" #include "flexflow/ops/softmax_params.h" +#include "flexflow/ops/spec_inc_multihead_self_attention_params.h" #include "flexflow/ops/split_params.h" #include "flexflow/ops/topk_params.h" #include "flexflow/ops/transpose_params.h" @@ -57,6 +59,8 @@ using OperatorParameters = mp::variant const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static BeamInferenceResult + inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void serialize(Legion::Serializer &s) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + static void forward_kernel(BeamTopKMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + int *parent_ptr, + size_t batch_size, + int length, + bool sorted, + ffStream_t stream); + static void forward_kernel_wrapper(BeamTopKMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + int *parent_ptr, + size_t batch_size, + int length, + bool sorted); + Params get_params() const; + +public: + bool sorted; + int max_beam_width; +}; + +}; // namespace FlexFlow + +#endif diff --git a/include/flexflow/ops/beam_topk_params.h b/include/flexflow/ops/beam_topk_params.h new file mode 100644 index 0000000000..c217b0f671 --- /dev/null +++ b/include/flexflow/ops/beam_topk_params.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_BEAM_TOPK_PARAMS_H +#define _FLEXFLOW_BEAM_TOPK_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct BeamTopKParams { + LayerID layer_guid; + bool sorted; + int max_beam_width; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(BeamTopKParams const &, BeamTopKParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t 
operator()(FlexFlow::BeamTopKParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_BEAM_TOPK_PARAMS_H diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h new file mode 100644 index 0000000000..4df85cd04e --- /dev/null +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -0,0 +1,154 @@ +#ifndef _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H +#define _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/inference.h" +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/ops/spec_inc_multihead_self_attention_params.h" +#include "math.h" +#include +#include + +namespace FlexFlow { + +class SpecIncMultiHeadSelfAttentionMeta; + +class SpecIncMultiHeadSelfAttention : public Op { +public: + using Params = SpecIncMultiHeadSelfAttentionParams; + using Input = ParallelTensor; + + SpecIncMultiHeadSelfAttention(FFModel &model, + LayerID const &layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool allocate_weights, + char const *name); + SpecIncMultiHeadSelfAttention(FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool allocate_weights, + char const *name); + SpecIncMultiHeadSelfAttention(FFModel &model, + SpecIncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights); + SpecIncMultiHeadSelfAttention(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + bool get_int_parameter(PMParameter, int *) const override; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + bool measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const override; + + static void + inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr); + Params get_params() const; + +public: + int num_heads; + float dropout; + bool bias; + bool add_bias_kv, add_zero_attn, apply_rotary_embedding; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int 
qoSeqLength, kvSeqLength; +}; + +class SpecIncMultiHeadSelfAttentionMeta : public OpMeta { +public: + SpecIncMultiHeadSelfAttentionMeta(FFHandler handler, + SpecIncMultiHeadSelfAttention const *attn, + float const *weight_ptr, + Legion::Memory gpu_mem, + int num_samples, + int _num_heads); + ~SpecIncMultiHeadSelfAttentionMeta(void); + +public: + Realm::RegionInstance reserveInst; + size_t weights_params, weightSize, reserveSpaceSize; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int num_heads; + bool *has_load_weights; + bool *apply_rotary_embedding; +#ifdef INFERENCE_TESTS + float *kcache, *vcache; +#endif + /*#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cudnnAttnDescriptor_t attnDesc; + cudnnSeqDataDescriptor_t qDesc, kDesc, vDesc, oDesc; + #endif*/ + // int *devQoSeqArray, *devKvSeqArray, *loWinIdx, *hiWinIdx, *kvCache; + float *devQKVProjArray, *keyCache, *valueCache; + float *qk_prods, *qk_prods_softmax; + float *attn_heads, *W_out_contiguous; + // void *reserveSpace; + + // BatchConfig::token_idxs *dev_token2ids; + BatchConfig::PerTokenInfo *tokenInfos; + BatchConfig::PerRequestInfo *requestInfos; + BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos; + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h new file mode 100644 index 0000000000..00e1179a14 --- /dev/null +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H +#define _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H + +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct SpecIncMultiHeadSelfAttentionParams { + LayerID layer_guid; + int embed_dim, num_heads, kdim, vdim; + float dropout; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + + bool is_valid(ParallelTensorShape const &) const; +}; + +bool operator==(SpecIncMultiHeadSelfAttentionParams const &, + SpecIncMultiHeadSelfAttentionParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t + operator()(FlexFlow::SpecIncMultiHeadSelfAttentionParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 2ea7227879..b82426ac59 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -138,6 +138,12 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +template +void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); @@ -151,6 +157,10 @@ bool download_tensor(T const *ptr, T *dst, size_t num_elements); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain); +cudnnStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, + Legion::Domain domain); + cudaDataType_t ff_to_cuda_datatype(DataType type); cudnnDataType_t ff_to_cudnn_datatype(DataType type); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc new file mode 
100644 index 0000000000..6a49573538 --- /dev/null +++ b/src/ops/beam_topk.cc @@ -0,0 +1,469 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/beam_topk.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +// For an input tensor, computes the top k entries in each row +// (resp. vector along the last dimension). Thus, +// values.shape = indices.shape = input.shape[:-1] + [k] +Tensor FFModel::beam_top_k(const Tensor input, + int max_beam_width, + bool sorted, + char const *name) { + Layer *li = new Layer(this, + OP_BEAM_TOPK, + input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + 3 /*outputs*/, + input); + { + int numdims = input->num_dims; + + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = max_beam_width; + + std::cout << "beam input dimen:" << numdims << "\n"; + for (int i = 0; i < numdims; i++) { + std::cout << input->dims[i] << ", "; + } + + // beam width is dynamic + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); + li->outputs[2] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 1, false /*create_grad*/); + } + li->add_int_property("sorted", sorted); + li->add_int_property("max_beam_width", max_beam_width); + layers.push_back(li); + // outputs[0] = li->outputs[0]; + // outputs[1] = li->outputs[1]; + return li->outputs[1]; +} + +Op *BeamTopK::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("sorted", value); + bool sorted = (bool)value; + layer->get_int_property("max_beam_width", value); + int max_beam_width = value; + return new BeamTopK( + model, inputs[0], layer->layer_guid, max_beam_width, sorted, layer->name); +} + +BeamTopKParams BeamTopK::get_params() const { + BeamTopKParams params; + params.layer_guid = this->layer_guid; + params.sorted = this->sorted; + params.max_beam_width = this->max_beam_width; + return params; +} + +bool BeamTopKParams::is_valid(ParallelTensorShape const &) const { + // topk is always 
valid + return true; +} + +bool operator==(BeamTopKParams const &lhs, BeamTopKParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.sorted == rhs.sorted && + lhs.max_beam_width == rhs.max_beam_width; +} + +BeamTopK::BeamTopK(FFModel &model, + const ParallelTensor _input, + LayerID const &_layer_guid, + int _max_beam_width, + bool _sorted, + char const *name) + : Op(model, + OP_BEAM_TOPK, + _input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + 3 /*outputs*/, + _input) { + sorted = _sorted; + max_beam_width = _max_beam_width; + layer_guid = _layer_guid; + int numdim = inputs[0]->num_dims; + assert(inputs[0]->dims[0].degree == 1); + assert(inputs[0]->dims[0].parallel_idx == -1); + // outputs[0] = model.create_parallel_tensor_legion_ordering( + // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, inputs[0]->dims, DT_INT32, this, 0 /*owner_idx*/); + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, inputs[0]->dims, _input->data_type, this, 1 /*owner_idx*/); + outputs[2] = model.create_parallel_tensor_legion_ordering( + numdim, inputs[0]->dims, DT_INT32, this, 2 /*owner_idx*/); +} + +BeamTopK::BeamTopK(FFModel &model, + BeamTopK const &other, + const ParallelTensor input) + : BeamTopK(model, + input, + other.layer_guid, + other.max_beam_width, + other.sorted, + other.name) {} + +BeamTopK::BeamTopK(FFModel &model, + BeamTopKParams const ¶ms, + const ParallelTensor input, + char const *name) + : BeamTopK(model, + input, + params.layer_guid, + params.max_beam_width, + params.sorted, + name) {} + +void BeamTopK::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(BEAM_TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(BeamTopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[2]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[2]->region)); + launcher.add_field(3, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void BeamTopK::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(BEAM_TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(BeamTopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[1]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[2]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[2]->region)); + launcher.add_field(3, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *BeamTopK::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BeamTopK *topk = (BeamTopK *)task->args; + FFHandler handle = *((FFHandler *)task->local_args); + BeamTopKMeta *m = new BeamTopKMeta(handle); + m->profiling = topk->profiling; + m->sorted = topk->sorted; + m->max_beam_width = topk->max_beam_width; + return m; +} + +void BeamTopK::forward(FFModel const &ff) { + assert(false); +} + +FutureMap BeamTopK::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + + IndexLauncher launcher( + BEAM_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument( + &bc, std::max(sizeof(BatchConfig), sizeof(BeamSearchBatchConfig))), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[2]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[2]->region)); + launcher.add_field(3, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); +} + +BeamInferenceResult + BeamTopK::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(regions.size() == 4); + assert(task->regions.size() == 4); + + BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; + + std::cout << "beam search topk inference: " + << "\n"; + + BeamTopKMeta const *m = *((BeamTopKMeta **)task->local_args); + Domain in1_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + // Domain out1_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + Domain out2_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + int numdims = in1_domain.get_dim(); + + float const *in_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + // float *value_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + int *index_ptr = helperGetTensorPointerWO( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + float *value_ptr = helperGetTensorPointerWO( + regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int *parent_ptr = helperGetTensorPointerWO( + regions[3], task->regions[3], FID_DATA, ctx, runtime); + // embedding size: eg. 
4096 + int length = in1_domain.hi()[0] - in1_domain.lo()[0] + 1; + + int k = + out2_domain.hi()[0] - out2_domain.lo()[0] + 1; /*TODO: This prints to 5*/ + + // total token nums + size_t tokens_per_request = in1_domain.hi()[1] - in1_domain.lo()[1] + 1; + size_t batch_size = in1_domain.get_volume() / length; + + std::cout << "beam search topk params: " << length << ", " << k << ", " + << batch_size << "\n"; + assert(out2_domain.get_volume() / k == batch_size); + + // std::vector beam_width; + // std::unordered_map sub_requests = bc->sub_requests; + // for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + // if (bc->request_completed[i]) { + // continue; + // } + // // add beam width for each main request + // beam_width.push_back(sub_requests[i]); + // std::cout << "sub req num: " <sorted); + + BeamInferenceResult ir; + + download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); + download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); + download_tensor( + parent_ptr, ir.parent_id, batch_size * m->max_beam_width); + return ir; +} + +void BeamTopK::backward(FFModel const &ff) { + assert(false); +} + +void BeamTopK::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->sorted); + sez.serialize(this->max_beam_width); +} + +Node BeamTopK::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 1); + bool sorted; + size_t id; + int max_beam_width; + dez.deserialize(id); + LayerID layer_guid(id); + dez.deserialize(sorted); + dez.deserialize(max_beam_width); + BeamTopKParams params; + params.layer_guid = layer_guid; + params.sorted = sorted; + params.max_beam_width = max_beam_width; + return ff.get_or_create_node(inputs[0], params); +} + +Op *BeamTopK::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + BeamTopKParams params = get_params(); + return new BeamTopK(ff, params, inputs[0], this->name); +} + +bool BeamTopK::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::BeamTopKParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.sorted); + hash_combine(key, params.max_beam_width); + return key; +} +}; // namespace std diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp new file mode 100644 index 0000000000..7e9421f299 --- /dev/null +++ b/src/ops/beam_topk.cpp @@ -0,0 +1,716 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/beam_topk.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +enum class HeapType { kMinHeap, kMaxHeap }; +enum class PreferIndices { kLower, kHigher }; + +template +struct Entry { + int index; + T value; +}; + +template +struct LinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return data[i].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; +}; + +template +struct IndirectLinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return backing_data[data[i].index].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; + Entry *const backing_data; +}; + +template +struct StridedData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index * blockDim.x + threadIdx.x]; + } + + __device__ int get_index(int i) const { + return (*this)[i].index; + } + __device__ T get_value(int i) const { + return (*this)[i].value; + } + + Entry *const data; +}; + +// A heap of Entry that can either work as a min-heap or as a max-heap. +template + class Data, + typename T> +struct IndexedHeap { + typedef typename Data::Entry Entry; + Data const data; + __device__ IndexedHeap(Data const &d) : data(d) {} + + __device__ bool is_above(int left, int right) { + T left_value = data.get_value(left); + T right_value = data.get_value(right); + if (left_value == right_value) { + if (preferIndices == PreferIndices::kLower) { + return data.get_index(left) < data.get_index(right); + } else { + return data.get_index(left) > data.get_index(right); + } + } + if (heapType == HeapType::kMinHeap) { + return left_value < right_value; + } else { + return left_value > right_value; + } + } + + __device__ void assign(int i, Entry const &entry) { + data[i] = entry; + } + + __device__ void push_up(int i) { + int child = i; + int parent; + for (; child > 0; child = parent) { + parent = (child - 1) / 2; + if (!is_above(child, parent)) { + // Heap property satisfied. + break; + } + swap(child, parent); + } + } + + __device__ void swap(int a, int b) { + auto tmp = data[b]; + data[b] = data[a]; + data[a] = tmp; + } + + __device__ void push_root_down(int k) { + push_down(0, k); + } + + // MAX-HEAPIFY in Cormen + __device__ void push_down(int node, int k) { + while (true) { + int const left = 2 * node + 1; + int const right = left + 1; + int smallest = node; + if (left < k && is_above(left, smallest)) { + smallest = left; + } + if (right < k && is_above(right, smallest)) { + smallest = right; + } + if (smallest == node) { + break; + } + swap(smallest, node); + node = smallest; + } + } + + // BUILD-MAX-HEAPIFY in Cormen + __device__ void build(int k) { + for (int node = (k - 1) / 2; node >= 0; node--) { + push_down(node, k); + } + } + + // HEAP-EXTRACT-MAX in Cormen + __device__ void remove_root(int k) { + data[0] = data[k - 1]; + push_root_down(k - 1); + } + + // in-place HEAPSORT in Cormen + // This method destroys the heap property. + __device__ void sort(int k) { + for (int slot = k - 1; slot > 0; slot--) { + // This is like remove_root but we insert the element at the end. + swap(slot, 0); + // Heap is now an element smaller. 
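+      // Sift the new root down over the remaining slot entries to restore the heap property.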
+ push_root_down(/*k=*/slot); + } + } + + __device__ void replace_root(Entry const &entry, int k) { + data[0] = entry; + push_root_down(k); + } + + __device__ Entry const &root() { + return data[0]; + } +}; + +template + class Data, + typename T> +__device__ IndexedHeap + make_indexed_heap(typename Data::Entry *data) { + return IndexedHeap{Data{data}}; +} + +// heapBeamTopK walks over [input, input+length) with `step_size` stride +// starting at `start_index`. It builds a top-`k` heap that is stored in +// `heap_entries` using `Accessor` to access elements in `heap_entries`. If +// sorted=true, the elements will be sorted at the end. +template class Data = LinearData> +__device__ void heapBeamTopK(T const *__restrict__ input, + int batch_index, + int length, + int k, + Entry *__restrict__ heap_entries, + bool sorted = false, + int start_index = 0, + int step_size = 1) { + assert(k <= length); + auto heap = + make_indexed_heap( + heap_entries); + + int heap_end_index = start_index + k * step_size; + if (heap_end_index > length) { + heap_end_index = length; + } + // Initialize the min-heap. + for (int index = start_index, slot = 0; index < heap_end_index; + index += step_size, slot++) { + heap.assign(slot, {index, input[index]}); + } + + heap.build(k); + + // Now iterate over the remaining items. + // If an item is smaller than the min element, it is not amongst the top k. + // Otherwise, replace the min element with it and push upwards. + for (int index = heap_end_index; index < length; index += step_size) { + // We prefer elements with lower indices. This is given here. + // Later elements automatically have higher indices, so can be discarded. + if (input[index] > heap.root().value) { + // This element should replace the min. + heap.replace_root({index, input[index]}, k); + } + } + + // Sort if wanted. + if (sorted) { + heap.sort(k); + } + + // if(batch_index == 0){ + // printf("top elemmments: %d, value %.15f\n", start_index, + // heap.root().value); + // } +} + +template +__device__ void mergeBeamShards(int num_shards, + int batch_index, + int k, + int max_heap_size, + int request_id, + int *parent_id, + float *probs, + Entry *__restrict__ entries, + Entry *__restrict__ top_k_heap, + T *top_k_values, + int *top_k_indices, + int *top_k_parents) { + // If k < num_shards, we can use a min-heap with k elements to get the top k + // of the sorted blocks. + // If k > num_shards, we can initialize a min-heap with the top element from + // each sorted block. + int const heap_size = k < num_shards ? k : num_shards; + // printf("see value: %f", entries[0].value); + // Min-heap part. + + { + auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Initialize the heap as a min-heap. + for (int slot = 0; slot < heap_size; slot++) { + // int beam = (slot % max_heap_size) / k; + float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((slot % max_heap_size) / k)]; + min_heap.assign(slot, {slot, (entries[slot].value * prob)}); + if (batch_index == 0) { + printf("slot %d, value %.15f, prob %15f\n", + slot, + entries[slot].value, + prob); + } + } + min_heap.build(heap_size); + + // Now perform top k with the remaining shards (if num_shards > heap_size). 
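+    // Weight each shard's candidate by its sub-request's accumulated beam probability before comparing it against the current heap minimum.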
+ for (int shard = heap_size; shard < num_shards; shard++) { + auto const entry = entries[shard]; + auto const root = min_heap.root(); + + float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((shard % max_heap_size) / k)]; + if (batch_index == 0) { + printf("shard %d, index %d, value %.15f, prob %.15f\n", + shard, + entry.index, + entry.value, + prob); + } + if (entry.value * prob < root.value) { + continue; + } + if (entry.value * prob == root.value && + entry.index > entries[root.index].index) { + continue; + } + // This element should replace the min. + min_heap.replace_root({shard, entry.value * prob}, heap_size); + } + } + + // Max-part. + { + // Turn the min-heap into a max-heap in-place. + auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Heapify into a max heap. + max_heap.build(heap_size); + + // Now extract the minimum k-1 times. + // k is treated specially. + int const last_k = k - 1; + for (int rank = 0; rank < last_k; rank++) { + Entry const &max_element = max_heap.root(); + top_k_values[rank] = max_element.value; + int shard_index = max_element.index; + top_k_indices[rank] = entries[shard_index].index; + top_k_parents[rank] = + parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((shard_index % max_heap_size) / k)]; + int next_shard_index = shard_index + num_shards; + + float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((next_shard_index % max_heap_size) / k)]; + if (batch_index == 0) { + printf("next_shard_index %d, value %.15f, prob %.15f\n", + next_shard_index, + entries[next_shard_index].value, + prob); + } + + max_heap.replace_root( + {next_shard_index, entries[next_shard_index].value * prob}, + heap_size); + } + + // rank == last_k. + Entry const &max_element = max_heap.root(); + top_k_values[last_k] = max_element.value; + int shard_index = max_element.index; + top_k_indices[last_k] = entries[shard_index].index; + top_k_parents[last_k] = + parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((shard_index % max_heap_size) / k)]; + } +} + +template +__global__ void + mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { + using T_ACC = T; + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + Y[index] = static_cast(X[index]) * static_cast(rstd[i]); + } +} + +template +__global__ void beam_topk_forward_kernel(T const *__restrict__ input, + size_t shared_memory_size, + int length, + int k, + int max_heap_size, + int *parent_ids, + float *acc_probs, + int *gpu_block_start_index, + int *gpu_request_id, + int *tokens_per_request, + bool sorted, + T *__restrict__ output, + int *__restrict__ indices, + int *__restrict__ parents, + bool is_print) { + __shared__ char shared_memory[48 << 10]; + int const batch_index = blockIdx.x; + // T const *batch_input = input + batch_index * length; + int const thread_index = threadIdx.x; + int const thread_count = blockDim.x; + int const request_id = gpu_request_id[batch_index]; + int const token_nums = tokens_per_request[batch_index]; + Entry *shared_entries = (Entry *)shared_memory; + + int sub_request_id = thread_index / k; + // if (is_print) { + // printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d, + // " + // "request_id %d, token_nums %d\n", + // batch_index, + // thread_index, + // sub_request_id, + // request_id, + // token_nums); + // } + + T const *batch_input = input + gpu_block_start_index[batch_index] + + (sub_request_id * 
token_nums * length); + + if (batch_index == 0) { + printf("request 0 start index: thread index %d, offset %d, batch_input %p, " + "acc index %d acc " + "prob %f, thread_count %d, request_id %d\n", + thread_index, + gpu_block_start_index[batch_index] + + (sub_request_id * token_nums * length), + batch_input, + request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id, + acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + sub_request_id], + thread_count, + request_id); + } + // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index, + // thread_count, batch_index); + heapBeamTopK(batch_input, + batch_index, + length, + k, + shared_entries, + true, + thread_index % k, + k); + __syncthreads(); + // printf("beam thread index %d, thread_count %d, thread index %d, batch_index + // " + // "%d, k %d, parent_id %d, acc_prob: %f, sub id: %d, request_id: %d, + // offset: %d, offset2 %d, sub_request_id %d\n", thread_index, + // thread_count, + // thread_index, + // batch_index, + // k, + // parent_ids[request_id * BatchConfig::MAX_NUM_BEAMS + + // sub_request_id], acc_probs[request_id * BatchConfig::MAX_NUM_BEAMS + + // sub_request_id], sub_request_id, request_id, + // gpu_block_start_index[batch_index], + // batch_index * length, + // sub_request_id); + + if (thread_index == 0) { + // merge beam_width heaps and store the parent + // find which req it belongs to, replace the offset + printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", + batch_index, + sub_request_id, + acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + sub_request_id]); + int const offset = batch_index * k; + auto batch_output = output + offset; + auto batch_indices = indices + offset; + auto batch_parents = parents + offset; + Entry *top_k_heap = shared_entries + thread_count * k; + + // if(batch_index == 0 && is_print){ + // for(int i = 0; i < 18; i++){ + // printf("see value: %.15f\n", shared_entries[i].value); + // } + // } + + // get parent/acc based on the sub request and main request + mergeBeamShards(thread_count, + batch_index, + k, + max_heap_size, + request_id, + parent_ids, + acc_probs, + shared_entries, + top_k_heap, + batch_output, + batch_indices, + batch_parents); + } +} + +/*static*/ +void BeamTopK::forward_kernel(BeamTopKMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + int *parent_ptr, + size_t batch_size, + int length, + bool sorted, + hipStream_t stream) { + // Adopted from TensorFlow's BeamTopK implementation + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h + + int num_shards = 0; + int max_heap_size = 0; + int max_beam_width = 0; + int req_index = 0; + + // sub request + int const *sub_requests = bc->sub_requests; + + // std::vector beam_slots = bc->beam_slots; + // assert(bc->beam_slots.size() > 0); + + int beam_num_blocks = 0; + std::vector beam_block_start_index; + std::vector request_id; + std::vector tokens_per_request; + + int block_start_index = 0; + int depth = + bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; + + // a data structure for prob, parent_id, + int max_total_requests = + BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests(); + int parent_ids[max_total_requests]; + float acc_probs[max_total_requests]; + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + assert(bc->beamRequestsInfo[i].beam_size > 0); + + // int 
num_new_tokens = bc->num_processing_tokens[i]; + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + + // get beam size; + int beam_size = bc->beamRequestsInfo[i].beam_size; + + // initial request + std::cout << "sub_requests: " << i << ", " << sub_requests[i] << "\n"; + assert(sub_requests[i] > 0); + // process sub requests + for (int j = 0; j < sub_requests[i]; j++) { + parent_ids[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = j; + // beam_slots[i].parent_id[j]; + acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = + bc->beamRequestsInfo[i].probs[j]; + std::cout << "probbbb req: " << i << ", sub req probability : " + << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << j + << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] + << ", data inddd" + << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j + << "\n"; + } + + // process tokens + for (int k = 0; k < num_new_tokens; k++) { + beam_block_start_index.push_back(block_start_index); + request_id.push_back(i); + tokens_per_request.push_back(num_new_tokens); + block_start_index += length; + beam_num_blocks++; + } + + max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); + max_beam_width = std::max(max_beam_width, beam_size); + req_index += 1; + block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; + } + std::cout << "what index: " << block_start_index + << ", block num: " << beam_num_blocks << "\n"; + + assert(batch_size >= beam_num_blocks); + assert(bc->num_active_requests() == req_index); + + { + constexpr auto shared_memory_size = 48 << 10; + auto const heap_size = max_heap_size * sizeof(Entry); + // shared_memory_size = (num_shards + 1) * heap_size <=> + num_shards = shared_memory_size / heap_size - 1; + assert(num_shards > 0); + if (num_shards > CUDA_NUM_THREADS) { + num_shards = CUDA_NUM_THREADS; + } + std::cout << "maxheap size: " << max_heap_size << "\n"; + std::cout << "maxbeam width: " << max_beam_width + << ", heap size: " << heap_size << "\n"; + } + // We are limited by the amount of shared memory we have per block. 
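+  // (num_shards + 1) heaps of max_heap_size entries must fit in the kernel's static 48 KiB shared-memory buffer.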
+ size_t shared_memory_size = + (num_shards + 1) * max_heap_size * sizeof(Entry); + + assert(num_shards >= (size_t)max_heap_size); + num_shards = max_heap_size; + + // parent_id, per token + int *gpu_parents; + // acc_porbs, per token + float *gpu_probs; + // each block's start index; + // one block means the single token in different requests; + int *gpu_block_start_index; + int *gpu_request_id; + int *gpu_tokens_per_request; + + checkCUDA(hipMalloc(&gpu_parents, sizeof(int) * max_total_requests)); + checkCUDA(hipMalloc(&gpu_probs, sizeof(float) * max_total_requests)); + checkCUDA(hipMalloc(&gpu_block_start_index, sizeof(int) * beam_num_blocks)); + checkCUDA(hipMalloc(&gpu_request_id, sizeof(int) * beam_num_blocks)); + checkCUDA(hipMalloc(&gpu_tokens_per_request, sizeof(int) * beam_num_blocks)); + checkCUDA(hipMemcpy(gpu_parents, + parent_ids, + sizeof(int) * max_total_requests, + hipMemcpyHostToDevice)); + checkCUDA(hipMemcpy(gpu_probs, + acc_probs, + sizeof(float) * max_total_requests, + hipMemcpyHostToDevice)); + checkCUDA(hipMemcpy(gpu_block_start_index, + beam_block_start_index.data(), + sizeof(int) * beam_num_blocks, + hipMemcpyHostToDevice)); + checkCUDA(hipMemcpy(gpu_request_id, + request_id.data(), + sizeof(int) * beam_num_blocks, + hipMemcpyHostToDevice)); + checkCUDA(hipMemcpy(gpu_tokens_per_request, + tokens_per_request.data(), + sizeof(int) * beam_num_blocks, + hipMemcpyHostToDevice)); + + beam_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + max_beam_width, + max_heap_size, + gpu_parents, + gpu_probs, + gpu_block_start_index, + gpu_request_id, + gpu_tokens_per_request, + sorted, + output_ptr, + indices_ptr, + parent_ptr, + depth == 1); + + // merge sub +} + +/*static*/ +void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + int *parent_ptr, + size_t batch_size, + int length, + bool sorted) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + + BeamTopK::forward_kernel(m, + bc, + input_ptr, + output_ptr, + indices_ptr, + parent_ptr, + batch_size, + length, + sorted, + stream); + + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + printf("[BeamTopK] forward time = %.2lfms\n", elapsed); + } + // if(bc->beam_slots.at(0).current_depth == 1){ + // print_beam_tensor((float *)input_ptr, 50, 32000, 15, "beam topk + // input"); print_tensor((float *)output_ptr, 50, "beam topk + // output"); + // } +} + +BeamTopKMeta::BeamTopKMeta(FFHandler handler) : OpMeta(handler) {} + +}; // namespace FlexFlow diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu new file mode 100644 index 0000000000..b41b4c2ba4 --- /dev/null +++ b/src/ops/beam_topk.cu @@ -0,0 +1,715 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/beam_topk.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +enum class HeapType { kMinHeap, kMaxHeap }; +enum class PreferIndices { kLower, kHigher }; + +template +struct Entry { + int index; + T value; +}; + +template +struct LinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return data[i].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; +}; + +template +struct IndirectLinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return backing_data[data[i].index].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; + Entry *const backing_data; +}; + +template +struct StridedData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index * blockDim.x + threadIdx.x]; + } + + __device__ int get_index(int i) const { + return (*this)[i].index; + } + __device__ T get_value(int i) const { + return (*this)[i].value; + } + + Entry *const data; +}; + +// A heap of Entry that can either work as a min-heap or as a max-heap. +template + class Data, + typename T> +struct IndexedHeap { + typedef typename Data::Entry Entry; + Data const data; + __device__ IndexedHeap(Data const &d) : data(d) {} + + __device__ bool is_above(int left, int right) { + T left_value = data.get_value(left); + T right_value = data.get_value(right); + if (left_value == right_value) { + if (preferIndices == PreferIndices::kLower) { + return data.get_index(left) < data.get_index(right); + } else { + return data.get_index(left) > data.get_index(right); + } + } + if (heapType == HeapType::kMinHeap) { + return left_value < right_value; + } else { + return left_value > right_value; + } + } + + __device__ void assign(int i, Entry const &entry) { + data[i] = entry; + } + + __device__ void push_up(int i) { + int child = i; + int parent; + for (; child > 0; child = parent) { + parent = (child - 1) / 2; + if (!is_above(child, parent)) { + // Heap property satisfied. 
+ break; + } + swap(child, parent); + } + } + + __device__ void swap(int a, int b) { + auto tmp = data[b]; + data[b] = data[a]; + data[a] = tmp; + } + + __device__ void push_root_down(int k) { + push_down(0, k); + } + + // MAX-HEAPIFY in Cormen + __device__ void push_down(int node, int k) { + while (true) { + int const left = 2 * node + 1; + int const right = left + 1; + int smallest = node; + if (left < k && is_above(left, smallest)) { + smallest = left; + } + if (right < k && is_above(right, smallest)) { + smallest = right; + } + if (smallest == node) { + break; + } + swap(smallest, node); + node = smallest; + } + } + + // BUILD-MAX-HEAPIFY in Cormen + __device__ void build(int k) { + for (int node = (k - 1) / 2; node >= 0; node--) { + push_down(node, k); + } + } + + // HEAP-EXTRACT-MAX in Cormen + __device__ void remove_root(int k) { + data[0] = data[k - 1]; + push_root_down(k - 1); + } + + // in-place HEAPSORT in Cormen + // This method destroys the heap property. + __device__ void sort(int k) { + for (int slot = k - 1; slot > 0; slot--) { + // This is like remove_root but we insert the element at the end. + swap(slot, 0); + // Heap is now an element smaller. + push_root_down(/*k=*/slot); + } + } + + __device__ void replace_root(Entry const &entry, int k) { + data[0] = entry; + push_root_down(k); + } + + __device__ Entry const &root() { + return data[0]; + } +}; + +template + class Data, + typename T> +__device__ IndexedHeap + make_indexed_heap(typename Data::Entry *data) { + return IndexedHeap{Data{data}}; +} + +// heapBeamTopK walks over [input, input+length) with `step_size` stride +// starting at `start_index`. It builds a top-`k` heap that is stored in +// `heap_entries` using `Accessor` to access elements in `heap_entries`. If +// sorted=true, the elements will be sorted at the end. +template class Data = LinearData> +__device__ void heapBeamTopK(T const *__restrict__ input, + int batch_index, + int length, + int k, + Entry *__restrict__ heap_entries, + bool sorted = false, + int start_index = 0, + int step_size = 1) { + assert(k <= length); + auto heap = + make_indexed_heap( + heap_entries); + + int heap_end_index = start_index + k * step_size; + if (heap_end_index > length) { + heap_end_index = length; + } + // Initialize the min-heap. + for (int index = start_index, slot = 0; index < heap_end_index; + index += step_size, slot++) { + heap.assign(slot, {index, input[index]}); + } + + heap.build(k); + + // Now iterate over the remaining items. + // If an item is smaller than the min element, it is not amongst the top k. + // Otherwise, replace the min element with it and push upwards. + for (int index = heap_end_index; index < length; index += step_size) { + // We prefer elements with lower indices. This is given here. + // Later elements automatically have higher indices, so can be discarded. + if (input[index] > heap.root().value) { + // This element should replace the min. + heap.replace_root({index, input[index]}, k); + } + } + + // Sort if wanted. 
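+  // With a min-heap, the in-place heapsort below leaves the k entries sorted with the largest value first.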
+ if (sorted) { + heap.sort(k); + } + + // if(batch_index == 0){ + // printf("top elemmments: %d, value %.15f\n", start_index, + // heap.root().value); + // } +} + +template +__device__ void mergeBeamShards(int num_shards, + int batch_index, + int k, + int max_heap_size, + int request_id, + int *parent_id, + float *probs, + Entry *__restrict__ entries, + Entry *__restrict__ top_k_heap, + T *top_k_values, + int *top_k_indices, + int *top_k_parents) { + // If k < num_shards, we can use a min-heap with k elements to get the top k + // of the sorted blocks. + // If k > num_shards, we can initialize a min-heap with the top element from + // each sorted block. + int const heap_size = k < num_shards ? k : num_shards; + // printf("see value: %f", entries[0].value); + // Min-heap part. + + { + auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Initialize the heap as a min-heap. + for (int slot = 0; slot < heap_size; slot++) { + // int beam = (slot % max_heap_size) / k; + float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((slot % max_heap_size) / k)]; + min_heap.assign(slot, {slot, (entries[slot].value * prob)}); + if (batch_index == 0) { + printf("slot %d, value %.15f, prob %15f\n", + slot, + entries[slot].value, + prob); + } + } + min_heap.build(heap_size); + + // Now perform top k with the remaining shards (if num_shards > heap_size). + for (int shard = heap_size; shard < num_shards; shard++) { + auto const entry = entries[shard]; + auto const root = min_heap.root(); + + float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((shard % max_heap_size) / k)]; + if (batch_index == 0) { + printf("shard %d, index %d, value %.15f, prob %.15f\n", + shard, + entry.index, + entry.value, + prob); + } + if (entry.value * prob < root.value) { + continue; + } + if (entry.value * prob == root.value && + entry.index > entries[root.index].index) { + continue; + } + // This element should replace the min. + min_heap.replace_root({shard, entry.value * prob}, heap_size); + } + } + + // Max-part. + { + // Turn the min-heap into a max-heap in-place. + auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Heapify into a max heap. + max_heap.build(heap_size); + + // Now extract the minimum k-1 times. + // k is treated specially. + int const last_k = k - 1; + for (int rank = 0; rank < last_k; rank++) { + Entry const &max_element = max_heap.root(); + top_k_values[rank] = max_element.value; + int shard_index = max_element.index; + top_k_indices[rank] = entries[shard_index].index; + top_k_parents[rank] = + parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((shard_index % max_heap_size) / k)]; + int next_shard_index = shard_index + num_shards; + + float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((next_shard_index % max_heap_size) / k)]; + if (batch_index == 0) { + printf("next_shard_index %d, value %.15f, prob %.15f\n", + next_shard_index, + entries[next_shard_index].value, + prob); + } + + max_heap.replace_root( + {next_shard_index, entries[next_shard_index].value * prob}, + heap_size); + } + + // rank == last_k. 
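+    // For the final (k-th) slot we only read the current root; nothing else
+    // will be extracted afterwards, so no replace_root() call is needed here.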
+ Entry const &max_element = max_heap.root(); + top_k_values[last_k] = max_element.value; + int shard_index = max_element.index; + top_k_indices[last_k] = entries[shard_index].index; + top_k_parents[last_k] = + parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((shard_index % max_heap_size) / k)]; + } +} + +template +__global__ void + mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { + using T_ACC = T; + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + Y[index] = static_cast(X[index]) * static_cast(rstd[i]); + } +} + +template +__global__ void beam_topk_forward_kernel(T const *__restrict__ input, + size_t shared_memory_size, + int length, + int k, + int max_heap_size, + int *parent_ids, + float *acc_probs, + int *gpu_block_start_index, + int *gpu_request_id, + int *tokens_per_request, + bool sorted, + T *__restrict__ output, + int *__restrict__ indices, + int *__restrict__ parents, + bool is_print) { + __shared__ char shared_memory[48 << 10]; + int const batch_index = blockIdx.x; + // T const *batch_input = input + batch_index * length; + int const thread_index = threadIdx.x; + int const thread_count = blockDim.x; + int const request_id = gpu_request_id[batch_index]; + int const token_nums = tokens_per_request[batch_index]; + Entry *shared_entries = (Entry *)shared_memory; + + int sub_request_id = thread_index / k; + // if (is_print) { + // printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d, + // " + // "request_id %d, token_nums %d\n", + // batch_index, + // thread_index, + // sub_request_id, + // request_id, + // token_nums); + // } + + T const *batch_input = input + gpu_block_start_index[batch_index] + + (sub_request_id * token_nums * length); + + if (batch_index == 0) { + printf("request 0 start index: thread index %d, offset %d, batch_input %p, " + "acc index %d acc " + "prob %f, thread_count %d, request_id %d\n", + thread_index, + gpu_block_start_index[batch_index] + + (sub_request_id * token_nums * length), + batch_input, + request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id, + acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + sub_request_id], + thread_count, + request_id); + } + // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index, + // thread_count, batch_index); + heapBeamTopK(batch_input, + batch_index, + length, + k, + shared_entries, + true, + thread_index % k, + k); + __syncthreads(); + // printf("beam thread index %d, thread_count %d, thread index %d, batch_index + // " + // "%d, k %d, parent_id %d, acc_prob: %f, sub id: %d, request_id: %d, + // offset: %d, offset2 %d, sub_request_id %d\n", thread_index, + // thread_count, + // thread_index, + // batch_index, + // k, + // parent_ids[request_id * BatchConfig::MAX_NUM_BEAMS + + // sub_request_id], acc_probs[request_id * BatchConfig::MAX_NUM_BEAMS + + // sub_request_id], sub_request_id, request_id, + // gpu_block_start_index[batch_index], + // batch_index * length, + // sub_request_id); + + if (thread_index == 0) { + // merge beam_width heaps and store the parent + // find which req it belongs to, replace the offset + printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", + batch_index, + sub_request_id, + acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + sub_request_id]); + int const offset = batch_index * k; + auto batch_output = output + offset; + auto batch_indices = indices + offset; + auto batch_parents = 
parents + offset; + Entry *top_k_heap = shared_entries + thread_count * k; + + // if(batch_index == 0 && is_print){ + // for(int i = 0; i < 18; i++){ + // printf("see value: %.15f\n", shared_entries[i].value); + // } + // } + + // get parent/acc based on the sub request and main request + mergeBeamShards(thread_count, + batch_index, + k, + max_heap_size, + request_id, + parent_ids, + acc_probs, + shared_entries, + top_k_heap, + batch_output, + batch_indices, + batch_parents); + } +} + +/*static*/ +void BeamTopK::forward_kernel(BeamTopKMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + int *parent_ptr, + size_t batch_size, + int length, + bool sorted, + cudaStream_t stream) { + // Adopted from TensorFlow's BeamTopK implementation + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h + + int num_shards = 0; + int max_heap_size = 0; + int max_beam_width = 0; + int req_index = 0; + + // sub request + int const *sub_requests = bc->sub_requests; + + // std::vector beam_slots = bc->beam_slots; + // assert(bc->beam_slots.size() > 0); + + int beam_num_blocks = 0; + std::vector beam_block_start_index; + std::vector request_id; + std::vector tokens_per_request; + + int block_start_index = 0; + int depth = + bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; + + // a data structure for prob, parent_id, + int max_total_requests = + BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests(); + int parent_ids[max_total_requests]; + float acc_probs[max_total_requests]; + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + assert(bc->beamRequestsInfo[i].beam_size > 0); + + // int num_new_tokens = bc->num_processing_tokens[i]; + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + + // get beam size; + int beam_size = bc->beamRequestsInfo[i].beam_size; + + // initial request + std::cout << "sub_requests: " << i << ", " << sub_requests[i] << "\n"; + assert(sub_requests[i] > 0); + // process sub requests + for (int j = 0; j < sub_requests[i]; j++) { + parent_ids[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = j; + // beam_slots[i].parent_id[j]; + acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = + bc->beamRequestsInfo[i].probs[j]; + std::cout << "probbbb req: " << i << ", sub req probability : " + << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << j + << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] + << ", data inddd" + << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j + << "\n"; + } + + // process tokens + for (int k = 0; k < num_new_tokens; k++) { + beam_block_start_index.push_back(block_start_index); + request_id.push_back(i); + tokens_per_request.push_back(num_new_tokens); + block_start_index += length; + beam_num_blocks++; + } + + max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); + max_beam_width = std::max(max_beam_width, beam_size); + req_index += 1; + block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; + } + std::cout << "what index: " << block_start_index + << ", block num: " << beam_num_blocks << "\n"; + + assert(batch_size >= beam_num_blocks); + assert(bc->num_active_requests() == req_index); + + { + constexpr auto shared_memory_size = 48 << 10; + auto const heap_size = max_heap_size * sizeof(Entry); + // shared_memory_size = (num_shards + 1) * heap_size <=> + num_shards = shared_memory_size / heap_size - 1; + 
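+    // Each of the num_shards per-thread heaps, plus one extra heap used for
+    // the final merge, must fit in the kernel's 48 KB static shared-memory
+    // buffer, hence the (num_shards + 1) * heap_size budget above.
+    // (For example, with max_heap_size = 6 and 8-byte Entry<float> records,
+    // heap_size = 48 and num_shards = 49152 / 48 - 1 = 1023 before the
+    // clamping below.)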
assert(num_shards > 0); + if (num_shards > CUDA_NUM_THREADS) { + num_shards = CUDA_NUM_THREADS; + } + std::cout << "maxheap size: " << max_heap_size << "\n"; + std::cout << "maxbeam width: " << max_beam_width + << ", heap size: " << heap_size << "\n"; + } + // We are limited by the amount of shared memory we have per block. + size_t shared_memory_size = + (num_shards + 1) * max_heap_size * sizeof(Entry); + + assert(num_shards >= (size_t)max_heap_size); + num_shards = max_heap_size; + + // parent_id, per token + int *gpu_parents; + // acc_porbs, per token + float *gpu_probs; + // each block's start index; + // one block means the single token in different requests; + int *gpu_block_start_index; + int *gpu_request_id; + int *gpu_tokens_per_request; + + checkCUDA(cudaMalloc(&gpu_parents, sizeof(int) * max_total_requests)); + checkCUDA(cudaMalloc(&gpu_probs, sizeof(float) * max_total_requests)); + checkCUDA(cudaMalloc(&gpu_block_start_index, sizeof(int) * beam_num_blocks)); + checkCUDA(cudaMalloc(&gpu_request_id, sizeof(int) * beam_num_blocks)); + checkCUDA(cudaMalloc(&gpu_tokens_per_request, sizeof(int) * beam_num_blocks)); + checkCUDA(cudaMemcpy(gpu_parents, + parent_ids, + sizeof(int) * max_total_requests, + cudaMemcpyHostToDevice)); + checkCUDA(cudaMemcpy(gpu_probs, + acc_probs, + sizeof(float) * max_total_requests, + cudaMemcpyHostToDevice)); + checkCUDA(cudaMemcpy(gpu_block_start_index, + beam_block_start_index.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice)); + checkCUDA(cudaMemcpy(gpu_request_id, + request_id.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice)); + checkCUDA(cudaMemcpy(gpu_tokens_per_request, + tokens_per_request.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice)); + + beam_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + max_beam_width, + max_heap_size, + gpu_parents, + gpu_probs, + gpu_block_start_index, + gpu_request_id, + gpu_tokens_per_request, + sorted, + output_ptr, + indices_ptr, + parent_ptr, + depth == 1); + + // merge sub +} + +/*static*/ +void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + int *parent_ptr, + size_t batch_size, + int length, + bool sorted) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + BeamTopK::forward_kernel(m, + bc, + input_ptr, + output_ptr, + indices_ptr, + parent_ptr, + batch_size, + length, + sorted, + stream); + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[BeamTopK] forward time = %.2lfms\n", elapsed); + } + // if(bc->beam_slots.at(0).current_depth == 1){ + // print_beam_tensor((float *)input_ptr, 50, 32000, 15, "beam topk + // input"); print_tensor((float *)output_ptr, 50, "beam topk + // output"); + // } +} + +BeamTopKMeta::BeamTopKMeta(FFHandler handler) : OpMeta(handler) {} + +}; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index d2af3fa8e1..e2e0e0bc82 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -727,4 +727,4 @@ 
IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { #endif } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index d83d9952c9..c9415a89a2 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,7 +26,8 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc new file mode 100644 index 0000000000..e365082002 --- /dev/null +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -0,0 +1,670 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/model.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#ifdef INFERENCE_TESTS +#include +using namespace at::indexing; +#endif + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +bool SpecIncMultiHeadSelfAttentionParams::is_valid( + ParallelTensorShape const &input) const { + bool is_valid = input.is_valid(); + return is_valid; +} + +Tensor + FFModel::spec_inc_multihead_self_attention(const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + char const *name) { + // Currently assume that + Layer *li = new Layer(this, + OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = embed_dim; + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); + } + { + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = 
input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; + li->weights[0] = create_weight_legion_ordering(2, + dims, + DT_FLOAT, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->data_type = DT_FLOAT; + li->add_int_property("embed_dim", embed_dim); + li->add_int_property("num_heads", num_heads); + li->add_int_property("kdim", kdim); + li->add_int_property("vdim", vdim); + li->add_int_property("bias", bias); + li->add_int_property("add_bias_kv", add_bias_kv); + li->add_int_property("add_zero_attn", add_zero_attn); + li->add_float_property("dropout", dropout); + li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + layers.push_back(li); + return li->outputs[0]; +} + +Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + + std::cout << "spec create operator: " << layer->name << "\n"; + long long value; + layer->get_int_property("embed_dim", value); + int embed_dim = value; + layer->get_int_property("num_heads", value); + int num_heads = value; + layer->get_int_property("kdim", value); + int kdim = value; + layer->get_int_property("vdim", value); + int vdim = value; + float dropout; + layer->get_float_property("dropout", dropout); + layer->get_int_property("bias", value); + bool bias = (bool)value; + layer->get_int_property("add_bias_kv", value); + bool add_bias_kv = (bool)value; + layer->get_int_property("add_zero_attn", value); + bool add_zero_attn = (bool)value; + layer->get_int_property("apply_rotary_embedding", value); + bool apply_rotary_embedding = (bool)value; + return new SpecIncMultiHeadSelfAttention(model, + layer->layer_guid, + inputs[0], + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + apply_rotary_embedding, + false /*allocate_weights*/, + layer->name); +} + +SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( + FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + _input), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { + // overwrite layer_guid + layer_guid = _layer_guid; + + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int 
kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_heads; + dims[2].size = qParas + kParas + vParas + oParas; + dims[2].degree = 1; + dims[2].parallel_idx = -1; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, DT_FLOAT, this); + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* // Check correctness */ + /* assert(check_output_input_weight_parallel_dims()); */ +} + +SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( + FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION, + DT_FLOAT, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + _input, + _weight), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) +// bias_initializer(_bias_initializer) +{ + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_heads; + dims[2].size = qParas + kParas + vParas + oParas; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, DT_FLOAT, this); + + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ + /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ + // Check correctness + /* assert(check_output_input_weight_parallel_dims()); */ +} + +SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( + FFModel &model, + SpecIncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights) + : SpecIncMultiHeadSelfAttention(model, + other.layer_guid, + input, + other.oProjSize, + other.num_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.bias, + other.add_bias_kv, + other.add_zero_attn, + other.apply_rotary_embedding, + allocate_weights, + other.name) {} + +SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( + FFModel &model, + SpecIncMultiHeadSelfAttentionParams const ¶ms, + ParallelTensor const &input, + bool allocate_weights, + char const *name) + : SpecIncMultiHeadSelfAttention(model, + params.layer_guid, + input, + params.embed_dim, + params.num_heads, + params.kdim, + params.vdim, + params.dropout, + params.bias, + params.add_bias_kv, + params.add_zero_attn, + params.apply_rotary_embedding, + allocate_weights, + name) {} + +void SpecIncMultiHeadSelfAttention::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher( + SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(SpecIncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void SpecIncMultiHeadSelfAttention::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher( + SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(SpecIncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): input + regions[1](I): weight + regions[2](O): output +*/ +OpMeta *SpecIncMultiHeadSelfAttention::init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + SpecIncMultiHeadSelfAttention const *attn = + (SpecIncMultiHeadSelfAttention *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; + assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + + Memory gpu_mem = 
Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + SpecIncMultiHeadSelfAttentionMeta *m = new SpecIncMultiHeadSelfAttentionMeta( + handle, attn, weight.get_float_ptr(), gpu_mem, num_samples, num_heads); + m->profiling = attn->profiling; + assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); + return m; +} + +void SpecIncMultiHeadSelfAttention::forward(FFModel const &ff) { + // SpecIncMultiHeadSelfAttention doesn't support forward + assert(false); +} + +FutureMap SpecIncMultiHeadSelfAttention::inference( + FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher( + SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + parallel_is, + TaskArgument( + &bc, std::max(sizeof(BatchConfig), sizeof(BeamSearchBatchConfig))), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void SpecIncMultiHeadSelfAttention::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 3); + assert(task->regions.size() == regions.size()); + + BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; + SpecIncMultiHeadSelfAttentionMeta const *m = + *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 3); + assert(output_domain.get_dim() == 4); + + /* print_tensor(input.get_float_ptr(), + input_domain.get_volume(), + "[Attention:forward:query]"); */ + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + input.get_float_ptr(), + 
weight.get_float_ptr(), + output.get_float_ptr()); + + // print_tensor(input.get_float_ptr(), 20, "attention input"); + // print_tensor(output.get_float_ptr(), 20, "attention output"); + // if(bc->beam_slots.at(0).current_depth == 1){ + // print_beam_tensor(input.get_float_ptr(), 50, 4096, 40, "mha topk + // input"); print_beam_tensor(output.get_float_ptr(), 50, 4096, 40, + // "mha topk output"); + // } +} + +void SpecIncMultiHeadSelfAttention::backward(FFModel const &ff) { + // SpecIncMultiHeadSelfAttention does not support backward + assert(false); +} + +bool SpecIncMultiHeadSelfAttention::get_int_parameter(PMParameter para, + int *value) const { + switch (para) { + case PM_NUM_HEADS: + *value = num_heads; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +Op *SpecIncMultiHeadSelfAttention::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + SpecIncMultiHeadSelfAttentionParams params = get_params(); + return new SpecIncMultiHeadSelfAttention( + ff, params, inputs[0], true, this->name); +} + +bool SpecIncMultiHeadSelfAttention::measure_operator_cost( + Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, + SpecIncMultiHeadSelfAttentionParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && + lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && + lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && + lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && + lhs.add_zero_attn == rhs.add_zero_attn && + lhs.apply_rotary_embedding == rhs.apply_rotary_embedding; +} + +SpecIncMultiHeadSelfAttentionParams + SpecIncMultiHeadSelfAttention::get_params() const { + SpecIncMultiHeadSelfAttentionParams params; + params.layer_guid = this->layer_guid; + params.embed_dim = this->oProjSize; + params.num_heads = this->num_heads; + params.kdim = this->kProjSize; + params.vdim = this->vProjSize; + params.dropout = this->dropout; + params.bias = this->bias; + params.add_bias_kv = this->add_bias_kv; + params.add_zero_attn = this->add_zero_attn; + params.apply_rotary_embedding = this->apply_rotary_embedding; + return params; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::SpecIncMultiHeadSelfAttentionParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.embed_dim); + hash_combine(key, params.num_heads); + hash_combine(key, params.kdim); + hash_combine(key, params.vdim); + hash_combine(key, params.dropout); + hash_combine(key, params.bias); + hash_combine(key, params.add_bias_kv); + hash_combine(key, params.add_zero_attn); + hash_combine(key, params.apply_rotary_embedding); + return key; +} +}; // namespace std diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp new file mode 100644 index 0000000000..00335d82fa --- /dev/null +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -0,0 +1,74 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +/*static*/ +void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + + handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION); + + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( + FFHandler handler, + SpecIncMultiHeadSelfAttention const *attn, + float const *weight_ptr, + Memory gpu_mem, + int num_samples, + int _num_heads) + : OpMeta(handler, attn) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(handler.dnn, stream)); +} + +SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) {} + +}; // namespace FlexFlow diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu new file mode 100644 index 0000000000..5c2a90e538 --- /dev/null +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -0,0 +1,904 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +__global__ void spec_build_w_out_tensor(float const *weight_ptr, + float *contiguous_weight_ptr, + int vProjSize, + int oProjSize, + int num_heads, + int qkv_weight_block_size) { + CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { + int v_idx = i % vProjSize; + int o_idx = (i / vProjSize) % oProjSize; + int head_idx = i / (vProjSize * oProjSize); + contiguous_weight_ptr[i] = + weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + + qkv_weight_block_size + o_idx * vProjSize + v_idx]; + } +} + +__global__ void + spec_apply_rotary_embedding(float *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo *tokenInfos, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int q_block_size, + int k_block_size, + int v_block_size, + bool q_tensor) { + int proj_size = q_tensor ? qProjSize : kProjSize; + CUDA_KERNEL_LOOP(i, num_tokens * proj_size * num_heads / 2) { + // create complex number + int head_idx = i / (num_tokens * proj_size / 2); + int idx = i % (num_tokens * proj_size / 2); + int real_part_index = + idx * 2 + head_idx * (q_block_size + k_block_size + v_block_size) + + (q_tensor ? 0 : q_block_size); + int complex_part_index = real_part_index + 1; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + // int head_idx = i / (num_tokens * proj_size); + int token_idx = + (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + int pos_i = i % (proj_size / 2); + + float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[real_part_index + 1] = complex_input[i].y; + + // if (i % 64 == 1 && head_idx == 0) { + // printf("head id: %d, tokenid: %d, pospospos:-> %d, before real part + // %f, " + // "before complex part: %f, real part: %f," + // "complext part: %f, freq_cis real: %f, freq_cis commplexx + // %f\n", head_idx, token_idx, pos, before_real, before_complex, + // complex_input[i].x, + // complex_input[i].y, + // complex_pos.x, + // complex_pos.y); + // } + } +} + +void inference_kernel1(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + cudaStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + float alpha = 1.0f, beta = 0.0f; + assert(m->qSize == m->vSize && m->qSize == m->kSize); + cudaDataType_t data_type = ff_to_cuda_datatype(DT_FLOAT); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) + // 
Weights: qSize x qProjSize x 3 x num_heads + // Input: qSize x num_tokens + // Output >>> qProjSize x num_tokens x 3 x num_heads + int m_q = m->qProjSize; + int m_k = m->kProjSize; + int m_v = m->vProjSize; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_tokens(); + int k = m->qSize; + int lda = k, ldb = k, ldc_q = m_q, ldc_k = m_k, ldc_v = m_v; + size_t strideA = + m->weights_params; // need to also skip over all the parameters for each + // head, plus the unused W_o weights + size_t strideB = 0; // input stays the same for all heads. + size_t strideC = + (m_q + m_k + m_v) * n; // size of the output block for each head. + // Q + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_q, + n, + k, + &alpha, + weight_ptr, + data_type, + lda, + strideA, + input_ptr, + data_type, + ldb, + strideB, + &beta, + output_ptr, + data_type, + ldc_q, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_k, + n, + k, + &alpha, + weight_ptr + m_q * k, + data_type, + lda, + strideA, + input_ptr, + data_type, + ldb, + strideB, + &beta, + output_ptr + m_q * n, + data_type, + ldc_k, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // V + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_v, + n, + k, + &alpha, + weight_ptr + (m_q + m_k) * k, + data_type, + lda, + strideA, + input_ptr, + data_type, + ldb, + strideB, + &beta, + output_ptr + (m_q + m_k) * n, + data_type, + ldc_v, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // apply rotary emmmbedding for k and v + // step1 change the k, v to complex tensor + int num_tokens = bc->num_active_tokens(); + + int parallelism = m->kProjSize * num_tokens * m->num_heads; + int q_block_size = m->qProjSize * num_tokens; + int k_block_size = m->kProjSize * num_tokens; + int v_block_size = m->vProjSize * num_tokens; + cuFloatComplex *complex_input; + + // todo xinhao remember to set token index for each beam + + if (*m->apply_rotary_embedding) { + checkCUDA(cudaMalloc(&complex_input, + num_tokens * m->qProjSize * m->num_heads * + sizeof(cuFloatComplex *) / 2)); + /*q*/ + spec_apply_rotary_embedding<<>>(output_ptr, + complex_input, + m->tokenInfos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + true); + /*k*/ + spec_apply_rotary_embedding<<>>(output_ptr, + complex_input, + m->tokenInfos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + false); + } + checkCUDA(cudaDeviceSynchronize()); +} + +__global__ void spec_store_kv_cache( + float const *devQKVProjArray, + float *cache_ptr, + BatchConfig::PerTokenInfo *tokenInfos, + BatchConfig::PerRequestInfo *requestInfo, + BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int num_heads, + int max_seq_len, + int max_beam_width, + bool k_cache, + bool is_root) { + CUDA_KERNEL_LOOP(i, + num_tokens * (k_cache ? kProjSize : vProjSize) * num_heads) { + int proj_size = k_cache ? 
kProjSize : vProjSize; + int head_idx = i / (num_tokens * proj_size); + int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; + int data_idx = i % proj_size; + + int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + int current_head_block_size = + num_tokens * (k_cache ? qProjSize : qProjSize + kProjSize); + float val = + devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + + token_idx * proj_size + data_idx]; + + // above no need to be changed + // int const req_id = id_map[token_idx].request_index; + // int const tok_id = id_map[token_idx].token_position; + // int const sub_req_id = id_map[token_idx].sub_request_index; + // int const parent_id = id_map[token_idx].parent_id; + // int const beam_depth = id_map[token_idx].beam_depth; + // int const beam_width = id_map[token_idx].beam_width; + + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; + int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; + int const beam_depth = beamRequestInfos[req_id].current_depth; + int const beam_width = beamRequestInfos[req_id].beam_size; + + // new token + int new_token_cache_idx = (req_id * max_beam_width + sub_req_id) * + (num_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + + tok_id * proj_size + data_idx; + cache_ptr[new_token_cache_idx] = val; + + // replica in the root iteration + if (beam_depth == 1) { + for (int i = 1; i < beam_width; i++) { + cache_ptr[(req_id * max_beam_width + i) * + (num_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } + } + + // if (head_idx == 0 && beam_depth == 0 && token_idx == 8 && k_cache) { + // // printf("token idx %d\n", token_idx); + // printf("data idx: %d, tok_id %d, new_token_cache_idx %d, parent_id %d, + // " + // "sub_req_id %d, num_tokens %d, kProjSize %d, num_heads %d, val " + // "%f, beam_width %d\n", + // data_idx, + // tok_id, + // new_token_cache_idx, + // parent_id, + // sub_req_id, + // num_tokens, + // kProjSize, + // num_heads, + // val, + // beam_width); + // } + + // naive cache stealing + if (sub_req_id != parent_id) { + if (data_idx == 0 && head_idx == 0 && k_cache) { + printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " + "%d, tok_id %d\n", + beam_depth, + req_id, + sub_req_id, + parent_id, + tok_id); + } + + for (int depth = 0; depth < beam_depth; depth++) { + int steal_token_idx = tok_id - beam_depth + depth; + int steal_from_idx = (req_id * max_beam_width + parent_id) * + (num_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + + steal_token_idx * proj_size + data_idx; + int steal_to_idx = (req_id * max_beam_width + sub_req_id) * + (num_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + + steal_token_idx * proj_size + data_idx; + cache_ptr[steal_to_idx] = cache_ptr[steal_from_idx]; + + // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ + // printf("cache stealing kernel!, steal_token_idx %d\n", + // steal_token_idx); + // } + } + } + + // parallel cache stealing not yet implemented + // logic shld be + // launch spec_store_kv_cache with parallelism * current depth + // from the i here, get depth index + // if depth index not the current one, check if we need to steal + // steal if needed + + // cache stealing theory + // identify which sub request does this token 
come from + // for initial token, 0 + // for other, may 0,0,1/ 0,1,2/ 1,1,1 to get which cache to be reuse and + // which to be delete copy beam_size bunch of blocks when sub_req_id == + // parent_id : like 0 -> 0, 1->1, 2->2, do nothing, just append the new k/v + } +} + +void inference_kernel2(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_tokens(); + int curr_depth = bc->beamRequestsInfo[0].current_depth; + printf("curr depth: %d\n", curr_depth); + // assert(curr_depth < 3); + if (num_tokens > 0) { + int parallelism = m->kProjSize * num_tokens * m->num_heads; + spec_store_kv_cache<<>>(m->devQKVProjArray, + m->keyCache, + m->tokenInfos, + m->requestInfos, + m->beamTokenInfos, + m->beamRequestInfos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + MAX_SEQ_LEN, + BeamSearchBatchConfig::MAX_BEAM_WIDTH, + /* k_cache = */ true, + /*root*/ curr_depth == 0); + + parallelism = m->vProjSize * num_tokens * m->num_heads; + spec_store_kv_cache<<>>(m->devQKVProjArray, + m->valueCache, + m->tokenInfos, + m->requestInfos, + m->beamTokenInfos, + m->beamRequestInfos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + MAX_SEQ_LEN, + BeamSearchBatchConfig::MAX_BEAM_WIDTH, + /* k_cache = */ false, + /*root*/ curr_depth == 0); + } +} + +__global__ void spec_fill_entries_above_diagonal(float *matrix, + size_t num_rows, + size_t num_cols, + size_t num_heads, + size_t entries_above_diagonal, + float value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + +void inference_kernel3(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + float *output_ptr, + cudaStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int tokens_prev_requests_squares = 0; + int qkv_block_size = + (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + int kt_block_size = m->kProjSize * MAX_SEQ_LEN; + int kt_req_block_size = kt_block_size * m->num_heads; + int vt_block_size = m->vProjSize * MAX_SEQ_LEN; + int vt_req_block_size = vt_block_size * m->num_heads; + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { + + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; + + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].token_start_offset + + bc->requestsInfo[i].num_tokens_in_batch; + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; 
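+      // Strided-batched GEMM over heads: A is this head's new Q block
+      // (qProjSize x num_new_tokens) and B is its cached K block
+      // (kProjSize x total_tokens); with A transposed the product is the
+      // (num_new_tokens x total_tokens) score matrix, and alpha folds in
+      // the 1/sqrt(d_k) scaling.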
+ int lda = k, ldb = k, ldc = m_; + int strideA = qkv_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + float alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + // To get A, skip over Q entries from previous requests (same head) + void const *A = (void const *)(m->devQKVProjArray + + tokens_previous_requests * m->qProjSize); + // To get B, skip over K entries from previous requests (all heads + + // padding) + void const *B = + (void const *)(m->keyCache + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * + kt_req_block_size); + + // if (i == 0 && sub_req_id == 0 && + // bc->beam_slots.at(0).current_depth == 1) { + // int offset = (float *)B - m->keyCache; + // printf("key cache offset %d\n", kt_req_block_size); + // } + // To get C, skip over QK^T products from previous requests + void *C = + (void *)(m->qk_prods + m->num_heads * tokens_prev_requests_squares); + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_heads * entries_above_diagonal; + spec_fill_entries_above_diagonal<<>>((float *)C, + num_new_tokens, + total_tokens, + m->num_heads, + entries_above_diagonal, + -INFINITY); + } + // Compute Softmax(QK^T/sqrt(d_k)) + cudnnTensorDescriptor_t qk_tensor; + checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, + CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, + n_param, + c_param, + h_param, + w_param)); + alpha = 1.0f, beta = 0.0f; + void *C_softmax = (void *)(m->qk_prods_softmax + + m->num_heads * tokens_prev_requests_squares); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. 
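+      // Here N = num_heads, C = total_tokens, H = 1 and W = num_new_tokens,
+      // so CUDNN_SOFTMAX_MODE_CHANNEL normalizes over the keys (total_tokens)
+      // independently for every (head, query token) pair, which is exactly
+      // the attention softmax.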
+ checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + qk_tensor, + (void *)((float *)C), + &beta, + qk_tensor, + (void *)((float *)C_softmax))); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = num_new_tokens; + n = m->vProjSize; + k = total_tokens; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = vt_block_size; + strideC = num_new_tokens * m->vProjSize; + // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + A = (void const *)C_softmax; + // To get B, skip over V^T entries from previous requests (all heads + + // padding) + B = (void const *)(m->valueCache + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * + vt_req_block_size); + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + C = (void *)(m->attn_heads + + tokens_previous_requests * m->num_heads * m->vProjSize); + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Project to output, save result directly on output tensor + alpha = 1.0f, beta = 0.0f; + m_ = m->oProjSize; + k = m->vProjSize * m->num_heads; + n = num_new_tokens; + lda = k, ldb = n, ldc = m_; + A = (void const *)m->W_out_contiguous; + B = (void const *)C; + C = (void *)(output_ptr + tokens_previous_requests * m->oProjSize); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; + } + } + + assert(tokens_previous_requests == num_tokens); +} + +/*static*/ +void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // reload the weight_o + + if (!(*m->has_load_weights)) { + int parallelism = m->vProjSize * m->oProjSize * m->num_heads; + spec_build_w_out_tensor<<>>(weight_ptr, + m->W_out_contiguous, + m->vProjSize, + m->oProjSize, + m->num_heads, + (m->qSize * m->qProjSize + + m->kSize * m->kProjSize + + m->vSize * m->vProjSize)); + *m->has_load_weights = true; + } + + // here because we need postion info in infernece 1 + cudaMemcpyAsync(m->tokenInfos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * + sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(m->requestInfos, + &(bc->requestsInfo), + bc->MAX_NUM_REQUESTS * sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(m->beamTokenInfos, + &(bc->beamTokenInfo), + bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * + sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(m->beamRequestInfos, + &(bc->beamRequestsInfo), + bc->MAX_NUM_REQUESTS * + 
sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), + cudaMemcpyHostToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); + // phase 2: Update key/val cache + inference_kernel2(m, bc, stream); + + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + inference_kernel3(m, bc, output_ptr, stream); + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( + FFHandler handler, + SpecIncMultiHeadSelfAttention const *attn, + float const *weight_ptr, + Memory gpu_mem, + int num_samples, + int _num_heads) + : OpMeta(handler, attn) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(cudnnSetStream(handler.dnn, stream)); + + qSize = attn->qSize; + kSize = attn->kSize; + vSize = attn->vSize; + // assume dimensions match for now + assert(qSize == kSize); + assert(kSize == vSize); + qProjSize = attn->qProjSize; + kProjSize = attn->kProjSize; + assert(qProjSize == kProjSize); // required for attention QK^T matmul + vProjSize = attn->vProjSize; + oProjSize = attn->oProjSize; + + // print params; + + num_heads = _num_heads; + weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)); + weightSize = weights_params * num_heads * sizeof(float); + has_load_weights = (bool *)calloc(1, sizeof(bool)); + *has_load_weights = false; + apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); + *apply_rotary_embedding = attn->apply_rotary_embedding; + // Currently do not support adding bias to key/value projection + assert(!attn->add_bias_kv); + +#ifdef INFERENCE_TESTS + kcache = (float *)calloc(kProjSize * MAX_SEQ_LEN * num_heads * + BeamSearchBatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + vcache = (float *)calloc(vProjSize * MAX_SEQ_LEN * num_heads * + BeamSearchBatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); +#endif + + // allocate memory for the seqArray and reserve space + { + size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; + size_t qkv_max_proj_size = + BeamSearchBatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; + size_t key_cache_size = num_heads * kProjSize * + BeamSearchBatchConfig::MAX_NUM_REQUESTS * + MAX_SEQ_LEN * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + size_t value_cache_size = + num_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + MAX_SEQ_LEN * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + + // size_t token2ids_size = BatchConfig::MAX_NUM_TOKENS; + size_t tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + + size_t beam_tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + + size_t requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t beam_requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; + + size_t qk_prod_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_TOKENS * num_heads; + size_t attn_heads_size = + BeamSearchBatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; + size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); + size_t W_out_contiguous_size = W_out_block_size * num_heads; + size_t totalSize = + (qkv_max_proj_size + key_cache_size + value_cache_size + + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * + sizeof(float) + + tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + + requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + + beam_tokeninfo_size * + sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + + beam_requestinfo_size * + sizeof(BeamSearchBatchConfig:: + BeamSearchPerRequestInfo); // more components will + // be added here later + + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(totalSize - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(reserveInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + devQKVProjArray = (float *)reserveInst.pointer_untyped(0, sizeof(char)); + keyCache = (float *)devQKVProjArray + qkv_max_proj_size; + valueCache = (float *)keyCache + key_cache_size; + // dev_token2ids = (BatchConfig::token_idxs *)(valueCache + + // value_cache_size); + + tokenInfos = (BatchConfig::PerTokenInfo *)(valueCache + value_cache_size); + beamTokenInfos = + (BeamSearchBatchConfig::BeamSearchPerTokenInfo *)(tokenInfos + + tokeninfo_size); + requestInfos = + (BatchConfig::PerRequestInfo *)(beamTokenInfos + beam_tokeninfo_size); + beamRequestInfos = + (BeamSearchBatchConfig::BeamSearchPerRequestInfo *)(requestInfos + + requestinfo_size); + + qk_prods = (float *)(beamRequestInfos + beam_requestinfo_size); + qk_prods_softmax = (float *)(qk_prods + qk_prod_size); + attn_heads = (float *)qk_prods_softmax + qk_prod_size; + W_out_contiguous = (float *)attn_heads + attn_heads_size; + int parallelism = vProjSize * oProjSize * num_heads; + spec_build_w_out_tensor<<>>( + weight_ptr, + W_out_contiguous, + vProjSize, + oProjSize, + num_heads, + (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); + } + + cudaStreamSynchronize(stream); +} + +SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { + reserveInst.destroy(); +#ifdef INFERENCE_TESTS + free(kcache); + free(vcache); +#endif +} + +}; // namespace FlexFlow diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc new file mode 100644 index 0000000000..0bb5084d83 --- /dev/null +++ b/src/runtime/beam_search_batch_config.cc @@ -0,0 +1,88 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/batch_config.h" +#include "legion.h" +#include +#include + +#define DEFAULT_BEAM_WIDTH 1 +#define DEFAULT_TARGET_ITERATIONS 3 + +namespace FlexFlow { + +LegionRuntime::Logger::Category log_beam_bc("BeamSearchBatchConfig"); + +BeamSearchBatchConfig::BeamSearchBatchConfig() : BatchConfig() { + this->beam_width = DEFAULT_BEAM_WIDTH; + this->target_iterations = DEFAULT_TARGET_ITERATIONS; + current_iteration = 0; +} + +BeamSearchBatchConfig::BeamSearchBatchConfig(size_t beam_width, + size_t target_iterations) + : BatchConfig() { + this->beam_width = beam_width; + this->target_iterations = target_iterations; + current_iteration = 0; +} + +BeamSearchBatchConfig::~BeamSearchBatchConfig() {} + +bool BeamSearchBatchConfig::done() const { + assert(current_iteration <= target_iterations); + return current_iteration == target_iterations; +} + +void BeamSearchBatchConfig::print() const { + std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; + std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; + std::cout << "Number of tokens: " << num_tokens << std::endl; + std::cout << "Number of requests: " << num_active_requests() << std::endl; + std::cout << "Beam width: " << beam_width << std::endl; + std::cout << "Target Iterations" << target_iterations << std::endl; + std::cout << "Current Iterations" << current_iteration << std::endl; + + std::cout << "Per-request info:\n"; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (!request_completed[i]) { + std::cout << " Request " << i << ":\n"; + std::cout << " Token start offset: " + << requestsInfo[i].token_start_offset << std::endl; + std::cout << " Number of tokens in batch: " + << requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << " GUID: " << requestsInfo[i].request_guid << std::endl; + std::cout << " Max sequence length: " + << requestsInfo[i].max_sequence_length << std::endl; + std::cout << " Request completed: " << request_completed[i] + << std::endl; + } + } + + std::cout << "Per-token info:\n"; + for (int i = 0; i < num_tokens; i++) { + std::cout << " Token " << i << ":\n"; + std::cout << " Absolute depth in request: " + << tokensInfo[i].abs_depth_in_request << std::endl; + std::cout << " Request index: " << tokensInfo[i].request_index + << std::endl; + std::cout << " Token id: " << tokensInfo[i].token_id << std::endl; + // std::cout << " Parent token id: " << tokensInfo[i].parent_token_id << + // std::endl; std::cout << " Accumulated log prob: " + // << tokensInfo[i].cum_log_prob << std::endl; + } +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 7dc0adeb38..2fcf0e096a 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -224,7 +224,7 @@ __host__ void printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { printf(" %.20lf", (float)host_ptr[idx]); - if (idx >= 50) { + if (idx >= 100) { break; } } @@ -232,6 +232,37 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } +template +__host__ void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix) { + // device synchronize to make sure the data are ready + // checkCUDA(cudaDeviceSynchronize()); + T *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(T) * channel * skip, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(T) * channel * skip, cudaMemcpyDeviceToHost)); + // checkCUDA(cudaDeviceSynchronize()); + int 
idx = 0; + printf("%s", prefix); + + for (int i = 0; i < channel; i += 1) { + for (idx = 0; idx < num_elements; idx++) { + printf(" %.20lf", (float)host_ptr[idx + i * skip]); + if (idx >= 100) { + break; + } + } + printf("\n-----***********------\n"); + } + + checkCUDA(cudaFreeHost(host_ptr)); +} + template __host__ void save_tensor(T const *ptr, size_t num_elements, char const *file_name) { @@ -279,6 +310,56 @@ __host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { // checkCUDA(cudaDeviceSynchronize()); return true; } +cudnnStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, + Domain domain) { + int dims[MAX_TENSOR_DIM]; + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return cudnnSetTensor4dDescriptor( + tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, + dims[2] * dims[1], + dims[0], + 1, + 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return cudnnSetTensor4dDescriptor(tensor, + CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, + dims[3] * dims[2] * dims[1], + dims[0], + 1, + 1); + } + default: + assert(false && "Unsupported dim number"); + } + return CUDNN_STATUS_BAD_PARAM; +} cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Domain domain) { @@ -464,6 +545,22 @@ template __host__ void template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void print_beam_tensor(float const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int32_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int64_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); + template __host__ void save_tensor(float const *ptr, size_t rect, char const *file_name); template __host__ void save_tensor(int64_t const *ptr, diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 01b116c853..8c2096b694 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -115,6 +115,8 @@ std::string get_operator_type_name(OperatorType type) { return "TopK"; case OP_ARG_TOPK: return "ArgTopK"; + case OP_BEAM_TOPK: + return "BeamTopK"; case OP_WHERE: return "Where"; case OP_CEIL: @@ -147,6 +149,8 @@ std::string get_operator_type_name(OperatorType type) { return "MultiHeadAttention"; case OP_INC_MULTIHEAD_SELF_ATTENTION: return "IncMultiHeadSelfAttention"; + case OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION: + return "SpeculativeIncMultiHeadSelfAttention"; case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: return "IncMultiHeadSelfAttentionVerify"; case OP_INPUT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 64ced31579..81fa9e4595 100644 --- a/src/runtime/graph.cc +++ 
b/src/runtime/graph.cc @@ -19,6 +19,7 @@ #include "flexflow/ops/arg_topk.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" +#include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" #include "flexflow/ops/conv_2d.h" @@ -40,6 +41,7 @@ #include "flexflow/ops/reshape.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/softmax.h" +#include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" @@ -2275,6 +2277,21 @@ GraphOptimalViewSerialized sez.serialize(attn->apply_rotary_embedding); break; } + case OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION: { + SpecIncMultiHeadSelfAttention *attn = + (SpecIncMultiHeadSelfAttention *)op; + sez.serialize(attn->layer_guid.id); + sez.serialize(attn->oProjSize); + sez.serialize(attn->num_heads); + sez.serialize(attn->qProjSize); + sez.serialize(attn->vProjSize); + sez.serialize(attn->dropout); + sez.serialize(attn->bias); + sez.serialize(attn->add_bias_kv); + sez.serialize(attn->add_zero_attn); + sez.serialize(attn->apply_rotary_embedding); + break; + } case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { IncMultiHeadSelfAttentionVerify *attn = (IncMultiHeadSelfAttentionVerify *)op; @@ -2681,6 +2698,39 @@ void FFModel::deserialize_graph_optimal_view( node = get_or_create_node(inputs[0], params); break; } + case OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION: { + assert(num_inputs == 1); + int embed_dim, num_heads, k_dim, v_dim; + float dropout; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + size_t id; + dez.deserialize(id); + LayerID layer_guid(id); + dez.deserialize(embed_dim); + dez.deserialize(num_heads); + dez.deserialize(k_dim); + dez.deserialize(v_dim); + dez.deserialize(dropout); + dez.deserialize(bias); + dez.deserialize(add_bias_kv); + dez.deserialize(add_zero_attn); + dez.deserialize(apply_rotary_embedding); + + SpecIncMultiHeadSelfAttentionParams params; + params.embed_dim = embed_dim; + params.num_heads = num_heads; + params.kdim = k_dim; + params.vdim = v_dim; + params.dropout = dropout; + params.bias = bias; + params.add_bias_kv = add_bias_kv; + params.add_zero_attn = add_zero_attn; + params.layer_guid = layer_guid; + params.apply_rotary_embedding = apply_rotary_embedding; + node = get_or_create_node(inputs[0], + params); + break; + } case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { assert(num_inputs == 1); int embed_dim, num_heads, k_dim, v_dim; @@ -2722,6 +2772,10 @@ void FFModel::deserialize_graph_optimal_view( node = ArgTopK::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_BEAM_TOPK: { + node = BeamTopK::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_GROUP_BY: { node = Group_by::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index a7da765391..e9fe33f22e 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -250,14 +250,16 @@ void InferenceManager::load_input_tokens_from_batch_config( Runtime *runtime = model->config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; - IndexLauncher launcher(RM_LOAD_TOKENS_TASK_ID, - input->parallel_is, - TaskArgument(&bc, sizeof(BatchConfig)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher( + RM_LOAD_TOKENS_TASK_ID, + input->parallel_is, + TaskArgument( + &bc, 
std::max(sizeof(BeamSearchBatchConfig), sizeof(BatchConfig))), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_region_requirement(RegionRequirement( input->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, input->region)); launcher.add_field(0, FID_DATA); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index d5c3f05851..4b8bb032d8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -28,6 +28,7 @@ #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" +#include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cache.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" @@ -52,6 +53,7 @@ #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/softmax.h" +#include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" @@ -2763,6 +2765,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION: { + Op *op = SpecIncMultiHeadSelfAttention::create_operator_from_layer( + *this, layer, inputs); + operators.push_back(op); + return op; + } case OP_INC_MULTIHEAD_SELF_ATTENTION: { Op *op = IncMultiHeadSelfAttention::create_operator_from_layer( *this, layer, inputs); @@ -2899,6 +2907,11 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_BEAM_TOPK: { + Op *op = BeamTopK::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_GROUP_BY: { Op *op = Group_by::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3003,6 +3016,7 @@ void FFModel::compile(LossType loss_type, ParallelTensor parallel_weight = nullptr; for (auto const &op : operators) { if (op->layer_guid == layer->layer_guid) { + std::cout << "opopop: " << op->name << "\n"; assert(op->op_type == layer->op_type); assert(op->numWeights == layer->numWeights); parallel_weight = op->weights[i]; @@ -4551,6 +4565,22 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "ArgTopK Inference Task"); } + // BeamTopk task + { + TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "BeamTopK Init Task"); + } + { + TaskVariantRegistrar registrar(BEAM_TOPK_INF_TASK_ID, "BeamTopK Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "BeamTopK Inference Task"); + } // Transpose task { TaskVariantRegistrar registrar(TRANSPOSE_INIT_TASK_ID, "Transpose Init"); @@ -4617,6 +4647,27 @@ void register_flexflow_internal_tasks() { IncMultiHeadSelfAttention::inference_task>( registrar, "IncMultiHeadSelfAttention Inference Task"); } + // speculative MultiHeadAttention task + { + TaskVariantRegistrar registrar( + SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + "Speculative IncMultiHeadSelfAttention Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Speculative IncMultiHeadSelfAttention Init Task"); + } + { + TaskVariantRegistrar registrar( + SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + "Speculative 
IncMultiHeadSelfAttention Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant< + SpecIncMultiHeadSelfAttention::inference_task>( + registrar, "Speculative IncMultiHeadSelfAttention Inference Task"); + } { TaskVariantRegistrar registrar( INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index a968e10c60..19b6ac6b04 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -1,9 +1,11 @@ #include "flexflow/operator_params.h" #include "flexflow/ops/aggregate.h" #include "flexflow/ops/aggregate_spec.h" +#include "flexflow/ops/arg_topk.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" +#include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cache.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" @@ -27,6 +29,7 @@ #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/softmax.h" +#include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" @@ -119,6 +122,10 @@ tl::optional get_op_parameters(Op const *op) { return ((AggregateSpec *)op)->get_params(); case OP_RMS_NORM: return ((RMSNorm *)op)->get_params(); + case OP_ARG_TOPK: + return ((ArgTopK *)op)->get_params(); + case OP_BEAM_TOPK: + return ((BeamTopK *)op)->get_params(); // TODO: implement the get_params() function for the operators below and // uncomment the lines below diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7f5bc89648..810be3df1f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -36,6 +36,8 @@ RequestManager::RequestGuid request.tokens = prompt; pending_request_queue.push(request); + + std::cout << "new req: " << request.tokens.size() << std::endl; return request.guid; } @@ -136,4 +138,385 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, return new_bc; } +//-------beam search specific functions + +// update beam search metadata +BeamSearchBatchConfig + RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, + BeamInferenceResult const &result) { + const std::lock_guard lock(request_queue_mutex); + std::cout << "print all results" + << "\n"; + for (int i = 0; i < 40; i++) { + std::cout << result.token_ids[i] << ", "; + } + // Step 1: register first batch + BeamSearchBatchConfig new_bc; + // Step 2: preparing the next batch for existing requests + + // store results + std::cout << "depthhhhhhh: " << old_bc.beamRequestsInfo[0].current_depth + << "\n"; + store_beam_metadata(old_bc, result); + + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_bc.request_completed[i]) { + continue; + } + assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); + Request &request = + running_request_queue[old_bc.requestsInfo[i].request_guid]; + int processed_tokens = old_bc.requestsInfo[i].token_start_offset + + old_bc.requestsInfo[i].num_tokens_in_batch; + + // std::cout << "processed tokens" << processed_tokens << ", " + // << request.tokens.size() << "\n"; + // assert(processed_tokens < request.tokens.size()); + if (processed_tokens > + old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() + // || ir.results[t] == 0 TODO: replace this with + // std::cout<<"aaaaaaa"<<"\n"; + ) { + log_req_mgr.print("[Done] 
guid(%zu) final_length(%i) request_length(%zu)", + old_bc.requestsInfo[i].request_guid, + processed_tokens, + request.tokens.size()); + } else { + + std::cout << "num tokens: " << old_bc.num_tokens << ", " + << new_bc.num_tokens; + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + + // update the beam search metadata + // how many sub request in current request + new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + // update the parentid, accumalated_probs, depth, and token_ids + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth + 1; + new_bc.beamRequestsInfo[i].beam_size = + old_bc.beamRequestsInfo[i].beam_size; + + // do the slot exchange to minimize the cache exchange in kernel. + std::cout << "update metadata" << std::endl; + update_beam_metadata(new_bc, beam_trees[i], i); + + if (new_bc.requestsInfo[i].token_start_offset + 1 >= + request.tokens.size()) { + // Incremental phase + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + } else { + // Prompt phase + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].token_start_offset); + } + + // register more tokens due to the beam width + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].token_start_offset + j; + for (int k = 0; k < new_bc.sub_requests[i]; k++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + + // get value from requestinfo + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.beamRequestsInfo[i].tokens[k]; + // request.tokens[depth]; + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; + new_bc.num_tokens++; + } + } + } + } + + for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { + if (new_bc.request_completed[i]) { + if (!pending_request_queue.empty() && + new_bc.num_tokens < BeamSearchBatchConfig::MAX_NUM_TOKENS) { + Request new_request = pending_request_queue.front(); + pending_request_queue.pop(); + running_request_queue[new_request.guid] = new_request; + new_bc.requestsInfo[i].token_start_offset = 0; + new_bc.requestsInfo[i].request_guid = new_request.guid; + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(BeamSearchBatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)new_request.tokens.size()); + new_bc.requestsInfo[i].max_sequence_length = + new_request.max_sequence_length; + + // init the beam search metadata per request + new_bc.beamRequestsInfo[i].beam_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + new_bc.beamRequestsInfo[i].current_depth = 1; + for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + new_bc.beamRequestsInfo[i].parent_id[j] = 0; + new_bc.beamRequestsInfo[i].probs[j] = 1; + } + + new_bc.request_completed[i] = false; + new_bc.sub_requests[i] = 1; + + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].token_start_offset + j; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + assert(depth < new_request.tokens.size()); + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_request.tokens[depth]; + + // 
beam search meta data, indicate which sub request this token + // belongs to, init to 0; + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; + new_bc.num_tokens++; + } + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } + } + } + } + return new_bc; +} + +void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, + BeamInferenceResult const &result) { + // step1 store the outputs + if (old_bc.num_tokens <= 0) { + return; + } + auto guid = + old_bc.requestsInfo[old_bc.tokensInfo[0].request_index].request_guid; + auto start_idx = old_bc.tokensInfo[0].abs_depth_in_request; + int result_index = 0; + std::cout << "store num tokens" << old_bc.num_tokens << "\n"; + for (int i = 0; i <= old_bc.num_tokens; i++) { + int request_index = old_bc.tokensInfo[i].request_index; + if (i == old_bc.num_tokens || + old_bc.requestsInfo[request_index].request_guid != guid) { + // see how many tokens has been put to model in this req + // to get the index of the final token + // every token will get (beam_width) results + int beam_width = + old_bc.beamRequestsInfo[old_bc.tokensInfo[i].request_index].beam_size; + + result_index += + (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_idx) * + beam_width; + + std::cout << "i = " << i << ", result index = " << result_index + << ",value: " << result.token_ids[result_index] << "\n"; + + int index = old_bc.tokensInfo[i - 1].request_index; + int beam_size = old_bc.beamRequestsInfo[index].beam_size; + int depth = old_bc.beamRequestsInfo[index].current_depth; + + if (depth == 1) { + // store the last input into the tree; + std::cout << "try to store the input" + << "\n"; + Request &request = + running_request_queue[old_bc.requestsInfo[index].request_guid]; + beam_trees[index].treeLayers[depth - 1].tokens[0] = + request.tokens.at(request.tokens.size() - 1); + beam_trees[index].treeLayers[depth - 1].probs[0] = 1; + beam_trees[index].treeLayers[depth - 1].parent_ids[0] = -1; + std::cout << "store the previous last token to the tree root" + << request.tokens.at(request.tokens.size() - 1) << "\n"; + } + + for (int beam_id = 0; beam_id < beam_width; beam_id++) { + beam_trees[index].treeLayers[depth].tokens[beam_id] = + result.token_ids[result_index]; + beam_trees[index].treeLayers[depth].probs[beam_id] = + result.probs[result_index]; + beam_trees[index].treeLayers[depth].parent_ids[beam_id] = + result.parent_id[result_index]; + + std::cout << "tree value: " << depth << "token: " + << beam_trees[index].treeLayers[depth].tokens[beam_id] + << "result tokens: " << result.token_ids[result_index]; + result_index += 1; + } + + if (i < old_bc.num_tokens) { + guid = old_bc.requestsInfo[request_index].request_guid; + start_idx = old_bc.tokensInfo[i].abs_depth_in_request; + } + } + } +} + +// for updating the beam search metadata in requests in incremental phase +void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamTree &tree, + int request_index) { + + // do the exchange + if (new_bc.request_completed[request_index]) { + assert(false); + } + int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1; + int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; + + // std::cout << "-----------before parent id exchange-----------" << + // std::endl; for (int j = 0; j < beam_size; j++) { + // std::cout << "after request id: " << request_index << "beam id = " << j + // << "parnt: " + // << new_bc.beamRequestsInfo[request_index].parent_id[j] + // << "token: " << + // 
new_bc.beamRequestsInfo[request_index].tokens[j] + // << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j] + // << std::endl; + // // std::fixed << std::setprecision(15)<< + // } + if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { + for (int j = 0; j < beam_size; j++) { + new_bc.beamRequestsInfo[request_index].parent_id[j] = j; + new_bc.beamRequestsInfo[request_index].probs[j] = + tree.treeLayers[depth].probs[j]; + new_bc.beamRequestsInfo[request_index].tokens[j] = + tree.treeLayers[depth].tokens[j]; + } + } else { + std::set parents; + std::set childs; + // cache stealing + for (int j = 0; j < beam_size; j++) { + int parent_id = tree.treeLayers[depth].parent_ids[j]; + if (childs.find(parent_id) == childs.end()) { + // copy beam slot + new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = + tree.treeLayers[depth].parent_ids[j]; + new_bc.beamRequestsInfo[request_index].probs[parent_id] = + tree.treeLayers[depth].probs[j]; + new_bc.beamRequestsInfo[request_index].tokens[parent_id] = + tree.treeLayers[depth].tokens[j]; + parents.emplace(j); + childs.emplace(parent_id); + } + } + if (parents.size() < beam_size) { + for (int j = 0; j < beam_size; j++) { + if (parents.find(j) == parents.end()) { + // this slot has not been assigned + // find the smallest not assigned child and put in + std::cout << "request_index" << request_index << ", miss slot: " << j + << "\n"; + for (int k = 0; k < beam_size; k++) { + if (childs.find(k) == childs.end()) { + // parent -> j to child k; + new_bc.beamRequestsInfo[request_index].parent_id[k] = + tree.treeLayers[depth].parent_ids[j]; + new_bc.beamRequestsInfo[request_index].probs[k] = + tree.treeLayers[depth].probs[j]; + new_bc.beamRequestsInfo[request_index].tokens[k] = + tree.treeLayers[depth].tokens[j]; + parents.emplace(j); + childs.emplace(k); + break; + } + } + } + } + } + } + std::cout << "-----------after parent id exchange-----------" << std::endl; + for (int j = 0; j < beam_size; j++) { + std::cout << "after request id: " << request_index << "beam id = " << j + << "parnt: " + << new_bc.beamRequestsInfo[request_index].parent_id[j] + << "token: " << new_bc.beamRequestsInfo[request_index].tokens[j] + << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j] + << std::endl; + } +} + +bool PreOrder(BeamTree tree, + int max_depth, + int current_depth, + int beam_width, + int id, + std::vector> + &serializedTree) { + // terminate + if (current_depth >= max_depth) { + serializedTree.push_back(std::make_pair( + tree.treeLayers[current_depth].tokens[id], current_depth)); + std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id] + << "\n"; + std::cout << "return true" + << "\n"; + return true; + } + + // add to tree; + // std::cout<<"node: " << current_depth << ", id: " << + serializedTree.push_back( + std::make_pair(tree.treeLayers[current_depth].tokens[id], current_depth)); + std::cout << "push something: " << tree.treeLayers[current_depth].tokens[id] + << ", " << current_depth << std::endl; + int index = serializedTree.size() - 1; + int next_layers = current_depth + 1; + + bool flag = false; + // recursion + for (int i = 0; i < beam_width; i++) { + int child_id = i; + int child_parent = tree.treeLayers[next_layers].parent_ids[i]; + + // for all childs, do preOrder + if (child_parent == id) { + std::cout << "current depth: " << current_depth << ", child_parent, " + << child_parent << ", child_id, " << child_id << "\n"; + bool res = PreOrder(tree, + max_depth, + current_depth + 1, + beam_width, + child_id, 
+ serializedTree); + flag = flag || res; + } + } + if (!flag) { + // no child for this token, delete it + std::cout << "delete a node: " << tree.treeLayers[current_depth].tokens[id] + << ", " << current_depth << std::endl; + serializedTree.erase(serializedTree.begin() + index); + } + return flag; +} + +void RequestManager::tranverse_beam_tree(BeamSearchBatchConfig const &old_bc) { + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_bc.request_completed[i]) { + continue; + } + // if(i != 0){ + // continue; + // } + + int depth = old_bc.beamRequestsInfo[i].current_depth; + int beam_width = old_bc.beamRequestsInfo[i].beam_size; + BeamTree tree = beam_trees[i]; + + // token, index + // todo make this one global for different stages + std::vector> serializedTree; + PreOrder( + tree, 3, 0, old_bc.beamRequestsInfo[i].beam_size, 0, serializedTree); + + // print it + std::cout << "print tree, " << i << "\n"; + for (int k = 0; k < serializedTree.size(); k++) { + std::cout << "token id: " << serializedTree.at(k).first + << ", depth: " << serializedTree.at(k).second << "\n"; + } + } +} + }; // namespace FlexFlow From 82a44a9fdead80b46979c635bb78c51811b36243 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 7 May 2023 18:57:15 -0500 Subject: [PATCH 106/344] Fix bugs in tree-based incremental multi-head attention (#710) * Support multiple FFModels in a single top_level_task * rename operators and bug fixes * format * add missing files --------- Co-authored-by: Gabriele Oliaro --- examples/cpp/inference/LLAMA/llama.h | 3 +- examples/cpp/inference/SPEC_LLAMA/llama.cc | 2 +- examples/cpp/inference/SPEC_LLAMA/llama.h | 3 +- examples/cpp/inference/data_generator.cc | 4 +- examples/cpp/inference/inference_config.h | 4 +- include/flexflow/batch_config.h | 2 +- include/flexflow/ffconst.h | 4 +- include/flexflow/model.h | 16 +- include/flexflow/operator_params.h | 4 +- ....h => tree_inc_multihead_self_attention.h} | 99 +++-- ...ree_inc_multihead_self_attention_params.h} | 10 +- src/ops/inc_multihead_self_attention.cu | 66 ++- src/ops/spec_inc_multihead_self_attention.cc | 12 +- src/ops/spec_inc_multihead_self_attention.cpp | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 20 +- ...c => tree_inc_multihead_self_attention.cc} | 181 ++++----- ... => tree_inc_multihead_self_attention.cpp} | 17 +- ...u => tree_inc_multihead_self_attention.cu} | 376 +++++++----------- src/runtime/ffconst_utils.cc | 8 +- src/runtime/graph.cc | 20 +- src/runtime/model.cc | 31 +- src/runtime/operator_params.cc | 6 +- src/runtime/substitution.cc | 10 +- 23 files changed, 394 insertions(+), 506 deletions(-) rename include/flexflow/ops/{inc_mha_verify.h => tree_inc_multihead_self_attention.h} (55%) rename include/flexflow/ops/{inc_mha_verify_params.h => tree_inc_multihead_self_attention_params.h} (64%) rename src/ops/{inc_mha_verify.cc => tree_inc_multihead_self_attention.cc} (91%) rename src/ops/{inc_mha_verify.cpp => tree_inc_multihead_self_attention.cpp} (78%) rename src/ops/{inc_mha_verify.cu => tree_inc_multihead_self_attention.cu} (70%) diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h index 75b7d3ff1b..0ae8d57d5b 100644 --- a/examples/cpp/inference/LLAMA/llama.h +++ b/examples/cpp/inference/LLAMA/llama.h @@ -13,6 +13,7 @@ * limitations under the License. 
*/ +#include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" #define MAX_NUM_SAMPLES 65536 @@ -36,7 +37,7 @@ struct LLAMAConfig { batchSize = 5; total_requests = 2560; incremental_mode = true; - sequence_length = MAX_SEQ_LEN; + sequence_length = BatchConfig::MAX_SEQ_LENGTH; max_seq_len = 8; // todo from args diff --git a/examples/cpp/inference/SPEC_LLAMA/llama.cc b/examples/cpp/inference/SPEC_LLAMA/llama.cc index 0f2095c1ae..ae5c4948e7 100644 --- a/examples/cpp/inference/SPEC_LLAMA/llama.cc +++ b/examples/cpp/inference/SPEC_LLAMA/llama.cc @@ -96,7 +96,7 @@ void FlexFlow::top_level_task(Task const *task, // n transformer blocks impl int num_transformer_layers_per_gpu = (32 + num_devices - 1) / num_devices; - for (int i = 0; i < 1; i++) { + for (int i = 0; i < 32; i++) { // step 1: attention std::vector axes = {2}; Tensor att_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); diff --git a/examples/cpp/inference/SPEC_LLAMA/llama.h b/examples/cpp/inference/SPEC_LLAMA/llama.h index 7fce809073..d2a96e70a5 100644 --- a/examples/cpp/inference/SPEC_LLAMA/llama.h +++ b/examples/cpp/inference/SPEC_LLAMA/llama.h @@ -15,6 +15,7 @@ #pragma once #include "file_loader.h" +#include "flexflow/batch_config.h" using namespace Legion; using namespace FlexFlow; @@ -34,7 +35,7 @@ struct LLAMAConfig { batchSize = 5; total_requests = 2560; incremental_mode = true; - sequence_length = MAX_SEQ_LEN; + sequence_length = BatchConfig::MAX_SEQ_LENGTH; max_seq_len = 8; max_beam_width = 3; max_beam_depth = 3; diff --git a/examples/cpp/inference/data_generator.cc b/examples/cpp/inference/data_generator.cc index 69ed577822..9d8fe1b7be 100644 --- a/examples/cpp/inference/data_generator.cc +++ b/examples/cpp/inference/data_generator.cc @@ -19,6 +19,7 @@ #include #include using namespace std; +using namespace FlexFlow; DataGenerator::DataGenerator(size_t _num_requests, size_t _vocab_size, @@ -35,7 +36,8 @@ DataGenerator::DataGenerator(size_t _num_requests, poisson_distr(_poisson_distr), lambda(_lambda), timer_started(false) { assert(max_input_tokens >= min_input_tokens); assert(max_tokens_to_generate >= min_tokens_to_generate); - assert(max_input_tokens + max_tokens_to_generate <= MAX_SEQ_LEN); + assert(max_input_tokens + max_tokens_to_generate <= + BatchConfig::MAX_SEQ_LENGTH); generate_requests_meta(); }; diff --git a/examples/cpp/inference/inference_config.h b/examples/cpp/inference/inference_config.h index 8b393336cc..c6cdd5da6c 100644 --- a/examples/cpp/inference/inference_config.h +++ b/examples/cpp/inference/inference_config.h @@ -14,10 +14,10 @@ */ #pragma once +#include "flexflow/batch_config.h" #include - // #define MAX_SEQ_LEN 1024 -#define MAX_SEQ_LEN 20 +static int const MAX_SEQ_LEN = FlexFlow::BatchConfig::MAX_SEQ_LENGTH; #define BATCH_SIZE 16 #define MNIST_DIMS 28 * 28 #define DATA_DIM MNIST_DIMS diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index db5ff3d485..461643e755 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,7 +20,6 @@ // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 -#define MAX_SEQ_LEN 20 // #define BATCH_SIZE 16 // #define MAX_REQUESTS 256 @@ -45,6 +44,7 @@ class BatchConfig { void print() const; static int const MAX_NUM_REQUESTS = 8; static int const MAX_NUM_TOKENS = 64; + static int const MAX_SEQ_LENGTH = 512; // These are set by update int num_tokens; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 86898a1a9b..e6a4eb6f3c 100644 --- 
a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -149,9 +149,9 @@ enum OperatorType { OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html OP_RMS_NORM, OP_BEAM_TOPK, - OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION, OP_INC_MULTIHEAD_SELF_ATTENTION, - OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 70a631f9a8..caf6229300 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -145,16 +145,14 @@ enum TaskIDs { RMSNROM_FWD_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, - SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, - INC_MULTIHEAD_SELF_ATTENTION_VERIFY_FWD_TASK_ID, - INC_MULTIHEAD_SELF_ATTENTION_VERIFY_BWD_TASK_ID, - INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INF_TASK_ID, + SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, @@ -293,7 +291,7 @@ class LayerNorm; class Linear; class MultiHeadAttention; class IncMultiHeadSelfAttention; -class IncMultiHeadSelfAttentionVerify; +class TreeIncMultiHeadSelfAttention; class Pool2D; class Reduce; class Reshape; @@ -1014,8 +1012,8 @@ class FFModel { std::pair, SpecIncMultiHeadSelfAttention *>, std::unordered_map< - std::pair, - IncMultiHeadSelfAttentionVerify *>, + std::pair, + TreeIncMultiHeadSelfAttention *>, std::unordered_map, Reduce *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index a417f6579f..9549ffc084 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -18,7 +18,6 @@ #include "flexflow/ops/flat_params.h" #include "flexflow/ops/gather_params.h" #include "flexflow/ops/groupby_params.h" -#include "flexflow/ops/inc_mha_verify_params.h" #include "flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" @@ -31,6 +30,7 @@ #include "flexflow/ops/split_params.h" #include "flexflow/ops/topk_params.h" #include "flexflow/ops/transpose_params.h" +#include "flexflow/ops/tree_inc_multihead_self_attention_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" #include "flexflow/parallel_ops/partition_params.h" @@ -61,7 +61,7 @@ using OperatorParameters = mp::variant -struct hash { +struct hash { size_t - operator()(FlexFlow::IncMultiHeadSelfAttentionVerifyParams const &) const; + operator()(FlexFlow::TreeIncMultiHeadSelfAttentionParams const &) const; }; } // namespace std diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index e2e0e0bc82..f68bec459b 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -102,12 +102,12 @@ __global__ void } } -void inference_kernel1(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - 
cudaStream_t stream) { +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -289,9 +289,9 @@ __global__ void store_kv_cache(float const *devQKVProjArray, } } -void inference_kernel2(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - cudaStream_t stream) { +void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); if (num_tokens > 0) { int parallelism = m->kProjSize * num_tokens * m->num_heads; @@ -306,7 +306,7 @@ void inference_kernel2(IncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, m->num_heads, - MAX_SEQ_LEN, + BatchConfig::MAX_SEQ_LENGTH, /* k_cache = */ true); parallelism = m->vProjSize * num_tokens * m->num_heads; @@ -321,7 +321,7 @@ void inference_kernel2(IncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, m->num_heads, - MAX_SEQ_LEN, + BatchConfig::MAX_SEQ_LENGTH, /* k_cache = */ false); } } @@ -342,10 +342,10 @@ __global__ void fill_entries_above_diagonal(float *matrix, } } -void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - float *output_ptr, - cudaStream_t stream) { +void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); @@ -358,12 +358,11 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; - int tokens_prev_requests_squares = 0; int qkv_block_size = (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int kt_block_size = m->kProjSize * MAX_SEQ_LEN; + int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_heads; - int vt_block_size = m->vProjSize * MAX_SEQ_LEN; + int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; int vt_req_block_size = vt_block_size * m->num_heads; assert(m->qProjSize == m->kProjSize); @@ -392,8 +391,7 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, // padding) void const *B = (void const *)(m->keyCache + i * kt_req_block_size); // To get C, skip over QK^T products from previous requests - void *C = - (void *)(m->qk_prods + m->num_heads * tokens_prev_requests_squares); + void *C = (void *)(m->qk_prods); checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, @@ -458,8 +456,7 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, h_param, w_param)); alpha = 1.0f, beta = 0.0f; - void *C_softmax = (void *)(m->qk_prods_softmax + - m->num_heads * tokens_prev_requests_squares); + void *C_softmax = (void *)(m->qk_prods_softmax); // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The // softmax operation is computed per spatial location (H,W) per image (N) @@ -546,7 +543,6 @@ void inference_kernel3(IncMultiHeadSelfAttentionMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += 
num_new_tokens * total_tokens; } assert(tokens_previous_requests == num_tokens); @@ -593,14 +589,14 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); + compute_qkv_kernel(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); // phase 2: Update key/val cache - inference_kernel2(m, bc, stream); + update_kv_cache_kernel(m, bc, stream); // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - inference_kernel3(m, bc, output_ptr, stream); + compute_attention_kernel(m, bc, output_ptr, stream); if (m->profiling) { cudaEventRecord(t_end, stream); @@ -652,10 +648,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(!attn->add_bias_kv); #ifdef INFERENCE_TESTS - kcache = (float *)calloc(kProjSize * MAX_SEQ_LEN * num_heads * + kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * BatchConfig::MAX_NUM_REQUESTS, sizeof(float)); - vcache = (float *)calloc(vProjSize * MAX_SEQ_LEN * num_heads * + vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * BatchConfig::MAX_NUM_REQUESTS, sizeof(float)); #endif @@ -665,13 +661,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; size_t qkv_max_proj_size = BatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; - size_t key_cache_size = - num_heads * kProjSize * BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN; - size_t value_cache_size = - num_heads * vProjSize * BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN; + size_t key_cache_size = num_heads * kProjSize * + BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH; + size_t value_cache_size = num_heads * vProjSize * + BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH; size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; size_t qk_prod_size = - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads; + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_heads; size_t attn_heads_size = BatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); @@ -727,4 +725,4 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { #endif } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index e365082002..11911a9ba3 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -69,7 +69,7 @@ Tensor char const *name) { // Currently assume that Layer *li = new Layer(this, - OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION, + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, @@ -176,7 +176,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( char const *name) // Initializer* _bias_initializer) : Op(model, - OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION, + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, @@ -260,7 +260,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( char const *name) // Initializer* _bias_initializer) : Op(model, - OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION, + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, @@ -382,7 +382,7 @@ void SpecIncMultiHeadSelfAttention::init_inference( size_t machine_view_hash = view->hash(); set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher( - SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(SpecIncMultiHeadSelfAttention)), argmap, @@ -421,7 +421,7 @@ void SpecIncMultiHeadSelfAttention::init(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_init(ff, argmap); IndexLauncher launcher( - SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(SpecIncMultiHeadSelfAttention)), argmap, @@ -510,7 +510,7 @@ FutureMap SpecIncMultiHeadSelfAttention::inference( size_t machine_view_hash = view->hash(); int idx = 0; IndexLauncher launcher( - SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, TaskArgument( &bc, std::max(sizeof(BatchConfig), sizeof(BeamSearchBatchConfig))), diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 00335d82fa..37305a83b0 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -40,7 +40,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( hipEventRecord(t_start, stream); } - handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION); + handle_unimplemented_hip_kernel(OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION); if (m->profiling) { hipEventRecord(t_end, stream); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 5c2a90e538..dc1d861b08 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -409,7 +409,7 @@ void inference_kernel2(SpecIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, m->num_heads, - MAX_SEQ_LEN, + BatchConfig::MAX_SEQ_LENGTH, BeamSearchBatchConfig::MAX_BEAM_WIDTH, /* k_cache = */ true, /*root*/ curr_depth == 0); @@ -429,7 +429,7 @@ void inference_kernel2(SpecIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, m->num_heads, - MAX_SEQ_LEN, + BatchConfig::MAX_SEQ_LENGTH, 
BeamSearchBatchConfig::MAX_BEAM_WIDTH, /* k_cache = */ false, /*root*/ curr_depth == 0); @@ -471,9 +471,9 @@ void inference_kernel3(SpecIncMultiHeadSelfAttentionMeta const *m, int tokens_prev_requests_squares = 0; int qkv_block_size = (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int kt_block_size = m->kProjSize * MAX_SEQ_LEN; + int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_heads; - int vt_block_size = m->vProjSize * MAX_SEQ_LEN; + int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; int vt_req_block_size = vt_block_size * m->num_heads; assert(m->qProjSize == m->kProjSize); @@ -797,10 +797,10 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( assert(!attn->add_bias_kv); #ifdef INFERENCE_TESTS - kcache = (float *)calloc(kProjSize * MAX_SEQ_LEN * num_heads * + kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * BeamSearchBatchConfig::MAX_NUM_REQUESTS, sizeof(float)); - vcache = (float *)calloc(vProjSize * MAX_SEQ_LEN * num_heads * + vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * BeamSearchBatchConfig::MAX_NUM_REQUESTS, sizeof(float)); #endif @@ -810,12 +810,12 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; size_t qkv_max_proj_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; - size_t key_cache_size = num_heads * kProjSize * - BeamSearchBatchConfig::MAX_NUM_REQUESTS * - MAX_SEQ_LEN * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + size_t key_cache_size = + num_heads * kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; size_t value_cache_size = num_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - MAX_SEQ_LEN * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; // size_t token2ids_size = BatchConfig::MAX_NUM_TOKENS; size_t tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * diff --git a/src/ops/inc_mha_verify.cc b/src/ops/tree_inc_multihead_self_attention.cc similarity index 91% rename from src/ops/inc_mha_verify.cc rename to src/ops/tree_inc_multihead_self_attention.cc index 228c0d224a..eec59c9247 100644 --- a/src/ops/inc_mha_verify.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "flexflow/ops/inc_mha_verify.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/model.h" #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "flexflow/utils/cuda_helper.h" @@ -48,7 +48,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -bool IncMultiHeadSelfAttentionVerifyParams::is_valid( +bool TreeIncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { bool is_valid = input.is_valid(); return is_valid; @@ -69,7 +69,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( char const *name) { // Currently assume that Layer *li = new Layer(this, - OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, @@ -118,7 +118,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( return li->outputs[0]; } -Op *IncMultiHeadSelfAttentionVerify::create_operator_from_layer( +Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { @@ -141,23 +141,23 @@ Op *IncMultiHeadSelfAttentionVerify::create_operator_from_layer( bool add_zero_attn = (bool)value; layer->get_int_property("apply_rotary_embedding", value); bool apply_rotary_embedding = (bool)value; - return new IncMultiHeadSelfAttentionVerify(model, - layer->layer_guid, - inputs[0], - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - apply_rotary_embedding, - false /*allocate_weights*/, - layer->name); + return new TreeIncMultiHeadSelfAttention(model, + layer->layer_guid, + inputs[0], + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + apply_rotary_embedding, + false /*allocate_weights*/, + layer->name); } -IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( +TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, LayerID const &_layer_guid, const ParallelTensor _input, @@ -174,7 +174,7 @@ IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( char const *name) // Initializer* _bias_initializer) : Op(model, - OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, @@ -241,7 +241,7 @@ IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( /* assert(check_output_input_weight_parallel_dims()); */ } -IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( +TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, const ParallelTensor _input, const ParallelTensor _weight, @@ -258,7 +258,7 @@ IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( char const *name) // Initializer* _bias_initializer) : Op(model, - OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY, + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, @@ -325,48 +325,48 @@ IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( /* assert(check_output_input_weight_parallel_dims()); */ } -IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( +TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, - IncMultiHeadSelfAttentionVerify const &other, + TreeIncMultiHeadSelfAttention const &other, const ParallelTensor input, bool allocate_weights) - : IncMultiHeadSelfAttentionVerify(model, - other.layer_guid, - input, - other.oProjSize, - other.num_heads, - other.qProjSize, - other.vProjSize, - other.dropout, - other.bias, - other.add_bias_kv, - other.add_zero_attn, - 
other.apply_rotary_embedding, - allocate_weights, - other.name) {} - -IncMultiHeadSelfAttentionVerify::IncMultiHeadSelfAttentionVerify( + : TreeIncMultiHeadSelfAttention(model, + other.layer_guid, + input, + other.oProjSize, + other.num_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.bias, + other.add_bias_kv, + other.add_zero_attn, + other.apply_rotary_embedding, + allocate_weights, + other.name) {} + +TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, - IncMultiHeadSelfAttentionVerifyParams const ¶ms, + TreeIncMultiHeadSelfAttentionParams const ¶ms, ParallelTensor const &input, bool allocate_weights, char const *name) - : IncMultiHeadSelfAttentionVerify(model, - params.layer_guid, - input, - params.embed_dim, - params.num_heads, - params.kdim, - params.vdim, - params.dropout, - params.bias, - params.add_bias_kv, - params.add_zero_attn, - params.apply_rotary_embedding, - allocate_weights, - name) {} - -void IncMultiHeadSelfAttentionVerify::init_inference( + : TreeIncMultiHeadSelfAttention(model, + params.layer_guid, + input, + params.embed_dim, + params.num_heads, + params.kdim, + params.vdim, + params.dropout, + params.bias, + params.add_bias_kv, + params.add_zero_attn, + params.apply_rotary_embedding, + allocate_weights, + name) {} + +void TreeIncMultiHeadSelfAttention::init_inference( FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -380,9 +380,9 @@ void IncMultiHeadSelfAttentionVerify::init_inference( size_t machine_view_hash = view->hash(); set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher( - INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, + TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, parallel_is, - TaskArgument(this, sizeof(IncMultiHeadSelfAttentionVerify)), + TaskArgument(this, sizeof(TreeIncMultiHeadSelfAttention)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -411,7 +411,7 @@ void IncMultiHeadSelfAttentionVerify::init_inference( set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } -void IncMultiHeadSelfAttentionVerify::init(FFModel const &ff) { +void TreeIncMultiHeadSelfAttention::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; ArgumentMap argmap; @@ -419,9 +419,9 @@ void IncMultiHeadSelfAttentionVerify::init(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_init(ff, argmap); IndexLauncher launcher( - INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, + TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, parallel_is, - TaskArgument(this, sizeof(IncMultiHeadSelfAttentionVerify)), + TaskArgument(this, sizeof(TreeIncMultiHeadSelfAttention)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -455,13 +455,13 @@ void IncMultiHeadSelfAttentionVerify::init(FFModel const &ff) { regions[1](I): weight regions[2](O): output */ -OpMeta *IncMultiHeadSelfAttentionVerify::init_task( +OpMeta *TreeIncMultiHeadSelfAttention::init_task( Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - IncMultiHeadSelfAttentionVerify const *attn = - (IncMultiHeadSelfAttentionVerify *)task->args; + TreeIncMultiHeadSelfAttention const *attn = + (TreeIncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( @@ -481,24 +481,19 @@ OpMeta *IncMultiHeadSelfAttentionVerify::init_task( .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) 
.first(); - IncMultiHeadSelfAttentionVerifyMeta *m = - new IncMultiHeadSelfAttentionVerifyMeta(handle, - attn, - weight.get_float_ptr(), - gpu_mem, - num_samples, - num_heads); + TreeIncMultiHeadSelfAttentionMeta *m = new TreeIncMultiHeadSelfAttentionMeta( + handle, attn, weight.get_float_ptr(), gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); return m; } -void IncMultiHeadSelfAttentionVerify::forward(FFModel const &ff) { - // IncMultiHeadSelfAttentionVerify doesn't support forward +void TreeIncMultiHeadSelfAttention::forward(FFModel const &ff) { + // TreeIncMultiHeadSelfAttention doesn't support forward assert(false); } -FutureMap IncMultiHeadSelfAttentionVerify::inference( +FutureMap TreeIncMultiHeadSelfAttention::inference( FFModel const &ff, BatchConfig const &bc, std::vector const &batch_inputs, @@ -515,7 +510,7 @@ FutureMap IncMultiHeadSelfAttentionVerify::inference( printf("TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d\n", bc.num_tokens, bc.num_active_requests()); - IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INF_TASK_ID, + IndexLauncher launcher(TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, TaskArgument(&bc, sizeof(TreeVerifyBatchConfig)), argmap, @@ -549,7 +544,7 @@ FutureMap IncMultiHeadSelfAttentionVerify::inference( regions[3](I): weight regions[4](O): output */ -void IncMultiHeadSelfAttentionVerify::inference_task( +void TreeIncMultiHeadSelfAttention::inference_task( Task const *task, std::vector const ®ions, Context ctx, @@ -558,8 +553,8 @@ void IncMultiHeadSelfAttentionVerify::inference_task( assert(task->regions.size() == regions.size()); TreeVerifyBatchConfig const *bc = (TreeVerifyBatchConfig *)task->args; - IncMultiHeadSelfAttentionVerifyMeta const *m = - *((IncMultiHeadSelfAttentionVerifyMeta **)task->local_args); + TreeIncMultiHeadSelfAttentionMeta const *m = + *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -583,14 +578,14 @@ void IncMultiHeadSelfAttentionVerify::inference_task( input_domain.get_volume(), "[Attention:forward:query]"); */ - IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, input.get_float_ptr(), weight.get_float_ptr(), output.get_float_ptr()); #ifdef INFERENCE_TESTS - printf("Checking IncMultiHeadSelfAttentionVerify computations...\n"); + printf("Checking TreeIncMultiHeadSelfAttention computations...\n"); // ============================================================================= // Define helper functions to handle row-major arrays @@ -1371,13 +1366,13 @@ void IncMultiHeadSelfAttentionVerify::inference_task( // Done with INFERENCE_TESTS block } -void IncMultiHeadSelfAttentionVerify::backward(FFModel const &ff) { - // IncMultiHeadSelfAttentionVerify does not support backward +void TreeIncMultiHeadSelfAttention::backward(FFModel const &ff) { + // TreeIncMultiHeadSelfAttention does not support backward assert(false); } -bool IncMultiHeadSelfAttentionVerify::get_int_parameter(PMParameter para, - int *value) const { +bool TreeIncMultiHeadSelfAttention::get_int_parameter(PMParameter para, + int *value) const { switch (para) { case PM_NUM_HEADS: *value = num_heads; @@ -1387,13 +1382,13 @@ bool IncMultiHeadSelfAttentionVerify::get_int_parameter(PMParameter para, } } -bool 
IncMultiHeadSelfAttentionVerify::measure_operator_cost( +bool TreeIncMultiHeadSelfAttention::measure_operator_cost( Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { return false; } -bool operator==(IncMultiHeadSelfAttentionVerifyParams const &lhs, - IncMultiHeadSelfAttentionVerifyParams const &rhs) { +bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs, + TreeIncMultiHeadSelfAttentionParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && @@ -1402,9 +1397,9 @@ bool operator==(IncMultiHeadSelfAttentionVerifyParams const &lhs, lhs.apply_rotary_embedding == rhs.apply_rotary_embedding; } -IncMultiHeadSelfAttentionVerifyParams - IncMultiHeadSelfAttentionVerify::get_params() const { - IncMultiHeadSelfAttentionVerifyParams params; +TreeIncMultiHeadSelfAttentionParams + TreeIncMultiHeadSelfAttention::get_params() const { + TreeIncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; params.embed_dim = this->oProjSize; params.num_heads = this->num_heads; @@ -1421,8 +1416,8 @@ IncMultiHeadSelfAttentionVerifyParams }; // namespace FlexFlow namespace std { -size_t hash::operator()( - FlexFlow::IncMultiHeadSelfAttentionVerifyParams const ¶ms) const { +size_t hash::operator()( + FlexFlow::TreeIncMultiHeadSelfAttentionParams const ¶ms) const { size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.embed_dim); diff --git a/src/ops/inc_mha_verify.cpp b/src/ops/tree_inc_multihead_self_attention.cpp similarity index 78% rename from src/ops/inc_mha_verify.cpp rename to src/ops/tree_inc_multihead_self_attention.cpp index 42dccfd6cc..5e3fc240d5 100644 --- a/src/ops/inc_mha_verify.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -13,7 +13,7 @@ * limitations under the License. 
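The std::hash specialization above (hash_combine over layer_guid.id, embed_dim, and the remaining attention parameters), together with operator==, is presumably what lets get_or_create_node cache and deduplicate TreeIncMultiHeadSelfAttention nodes by their parameters. A generic sketch of the same boost-style combiner; the helper below is a stand-in, not FlexFlow's own hash_combine.

#include <cstddef>
#include <functional>
#include <iostream>

// Boost-style combiner; FlexFlow's hash_combine is assumed to behave similarly.
template <typename T>
void hash_combine(std::size_t &seed, T const &v) {
  seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// Stand-in for TreeIncMultiHeadSelfAttentionParams (only a few fields shown).
struct AttentionParams {
  std::size_t layer_guid_id;
  int embed_dim, num_heads, kdim, vdim;
};

std::size_t hash_params(AttentionParams const &p) {
  std::size_t key = 0;
  hash_combine(key, p.layer_guid_id);
  hash_combine(key, p.embed_dim);
  hash_combine(key, p.num_heads);
  hash_combine(key, p.kdim);
  hash_combine(key, p.vdim);
  return key;
}

int main() {
  std::cout << hash_params({7, 4096, 32, 128, 128}) << "\n";
}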
*/ -#include "flexflow/ops/inc_mha_verify.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/hip_helper.h" #include @@ -24,8 +24,8 @@ using Legion::coord_t; using Legion::Memory; /*static*/ -void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( - IncMultiHeadSelfAttentionVerifyMeta const *m, +void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, @@ -40,7 +40,7 @@ void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( hipEventRecord(t_start, stream); } - handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY); + handle_unimplemented_hip_kernel(OP_TREE_INC_MULTIHEAD_SELF_ATTENTION); if (m->profiling) { hipEventRecord(t_end, stream); @@ -49,16 +49,16 @@ void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); hipEventDestroy(t_start); hipEventDestroy(t_end); - printf("IncMultiHeadSelfAttentionVerify forward time = %.2fms\n", elapsed); + printf("TreeIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } } -IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( +TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, - IncMultiHeadSelfAttentionVerify const *attn, + TreeIncMultiHeadSelfAttention const *attn, float const *weight_ptr, Memory gpu_mem, int num_samples, @@ -69,7 +69,6 @@ IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( checkCUDNN(miopenSetStream(handler.dnn, stream)); } -IncMultiHeadSelfAttentionVerifyMeta::~IncMultiHeadSelfAttentionVerifyMeta( - void) {} +TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/inc_mha_verify.cu b/src/ops/tree_inc_multihead_self_attention.cu similarity index 70% rename from src/ops/inc_mha_verify.cu rename to src/ops/tree_inc_multihead_self_attention.cu index c841addde9..c9b85f96b8 100644 --- a/src/ops/inc_mha_verify.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -15,7 +15,7 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif -#include "flexflow/ops/inc_mha_verify.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -24,12 +24,12 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; -__global__ void mha_verify_build_w_out_tensor(float const *weight_ptr, - float *contiguous_weight_ptr, - int vProjSize, - int oProjSize, - int num_heads, - int qkv_weight_block_size) { +__global__ void tree_build_w_out_tensor(float const *weight_ptr, + float *contiguous_weight_ptr, + int vProjSize, + int oProjSize, + int num_heads, + int qkv_weight_block_size) { CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { int v_idx = i % vProjSize; int o_idx = (i / vProjSize) % oProjSize; @@ -78,7 +78,7 @@ __global__ void commit_tokens_kernel( } } -void commit_tokens(IncMultiHeadSelfAttentionVerifyMeta const *m, +void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, cudaStream_t stream) { int num_tokens_to_commit = bc->num_tokens_to_commit; @@ -95,7 +95,7 @@ void 
commit_tokens(IncMultiHeadSelfAttentionVerifyMeta const *m, m->vProjSize, num_tokens_to_commit, m->num_heads, - MAX_SEQ_LEN, + BatchConfig::MAX_SEQ_LENGTH, /* k_cache = */ true); parallelism = m->vProjSize * num_tokens_to_commit * m->num_heads; @@ -110,12 +110,12 @@ void commit_tokens(IncMultiHeadSelfAttentionVerifyMeta const *m, m->vProjSize, num_tokens_to_commit, m->num_heads, - MAX_SEQ_LEN, + BatchConfig::MAX_SEQ_LENGTH, /* k_cache = */ false); } } -__global__ void mha_verify_apply_rotary_embedding( +__global__ void tree_apply_rotary_embedding( float *input_ptr, cuFloatComplex *complex_input, TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, @@ -176,13 +176,12 @@ __global__ void mha_verify_apply_rotary_embedding( } } -void inference_kernel1(IncMultiHeadSelfAttentionVerifyMeta const *m, - TreeVerifyBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - cudaStream_t stream) { - +void compute_qkv_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; @@ -235,7 +234,6 @@ void inference_kernel1(IncMultiHeadSelfAttentionVerifyMeta const *m, m->num_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -283,7 +281,6 @@ void inference_kernel1(IncMultiHeadSelfAttentionVerifyMeta const *m, m->num_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // apply rotary emmmbedding for k and v // step1 change the k, v to complex tensor int num_tokens = bc->num_active_tokens(); @@ -297,73 +294,35 @@ void inference_kernel1(IncMultiHeadSelfAttentionVerifyMeta const *m, num_tokens * m->qProjSize * m->num_heads * sizeof(cuFloatComplex *) / 2)); /*q*/ - mha_verify_apply_rotary_embedding<<>>(output_ptr, - complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - true); + tree_apply_rotary_embedding<<>>(output_ptr, + complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + true); /*k*/ - mha_verify_apply_rotary_embedding<<>>(output_ptr, - complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - false); - } -} - -__global__ void initial_store_kv_cache( - float const *devQKVProjArray, - float *cache_ptr, - TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens, - int num_heads, - int max_seq_len, - bool k_cache) { - CUDA_KERNEL_LOOP(i, - num_tokens * (k_cache ? kProjSize : vProjSize) * num_heads) { - int proj_size = k_cache ? kProjSize : vProjSize; - int head_idx = i / (num_tokens * proj_size); - int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; - int data_idx = i % proj_size; - - // only store the first branch initially, to avoid overwriting - if (tokenInfos[token_idx].tree_branch_idx == 0) { - int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int current_head_block_size = - num_tokens * (k_cache ? 
qProjSize : qProjSize + kProjSize); - float val = - devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + - token_idx * proj_size + data_idx]; - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; - } + tree_apply_rotary_embedding<<>>(output_ptr, + complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + false); } } @@ -375,8 +334,7 @@ __global__ void update_tree_branch_kv_cache( int kProjSize, int vProjSize, int num_tokens_in_branch, - int num_tokens_previous_branches, - int num_tokens_previous_requests, + int processed_tokens_in_batch, int total_tokens_in_batch, int num_heads, int max_seq_len, @@ -389,8 +347,7 @@ __global__ void update_tree_branch_kv_cache( (i / proj_size) % num_tokens_in_branch; // index in the tree branch int head_idx = i / (proj_size * num_tokens_in_branch); - token_idx += num_tokens_previous_branches; // get index in the whole request - token_idx += num_tokens_previous_requests; // get index in the whole batch + token_idx += processed_tokens_in_batch; // get index in the whole batch int qkv_block_size = (qProjSize + kProjSize + vProjSize) * total_tokens_in_batch; // skip over previous heads int current_head_block_size = @@ -410,50 +367,12 @@ __global__ void update_tree_branch_kv_cache( } } -void inference_kernel2(IncMultiHeadSelfAttentionVerifyMeta const *m, - TreeVerifyBatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - if (num_tokens > 0) { - int parallelism = m->kProjSize * num_tokens * m->num_heads; - initial_store_kv_cache<<>>(m->devQKVProjArray, - m->keyCache, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - m->num_heads, - MAX_SEQ_LEN, - /* k_cache = */ true); - - parallelism = m->vProjSize * num_tokens * m->num_heads; - initial_store_kv_cache<<>>(m->devQKVProjArray, - m->valueCache, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - m->num_heads, - MAX_SEQ_LEN, - /* k_cache = */ false); - } -} - -__global__ void - mha_verify_fill_entries_above_diagonal(float *matrix, - size_t num_rows, - size_t num_cols, - size_t num_heads, - size_t entries_above_diagonal, - float value) { +__global__ void tree_fill_entries_above_diagonal(float *matrix, + size_t num_rows, + size_t num_cols, + size_t num_heads, + size_t entries_above_diagonal, + float value) { CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { size_t head_idx = i / entries_above_diagonal; size_t entry_idx = i % entries_above_diagonal; @@ -464,10 +383,10 @@ __global__ void } } -void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, - TreeVerifyBatchConfig const *bc, - float *output_ptr, - cudaStream_t stream) { +void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + float *output_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); @@ -478,14 +397,12 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, cudaDataType_t compute_type = CUDA_R_32F; 
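Both the removed initial_store_kv_cache kernel and the surviving update_tree_branch_kv_cache kernel write into a cache laid out as [request][head][position][projection dim]. A host-side sketch of that flat offset computation (names and sizes here are illustrative, not the kernels' own):

#include <cassert>
#include <cstddef>
#include <cstdio>

// Flat offset into a K/V cache laid out as [request][head][position][proj dim],
// mirroring cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + ...] above.
std::size_t kv_cache_offset(std::size_t req_id, std::size_t head_idx,
                            std::size_t tok_pos, std::size_t dim_idx,
                            std::size_t num_heads, std::size_t max_seq_len,
                            std::size_t proj_size) {
  assert(tok_pos < max_seq_len && dim_idx < proj_size);
  return req_id * (num_heads * max_seq_len * proj_size) +
         head_idx * (max_seq_len * proj_size) +
         tok_pos * proj_size +
         dim_idx;
}

int main() {
  // Request 2, head 5, position 17, dim 3, with illustrative cache dimensions.
  std::printf("offset = %zu\n",
              kv_cache_offset(2, 5, 17, 3, /*num_heads=*/32,
                              /*max_seq_len=*/1024, /*proj_size=*/128));
}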
#endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int tokens_prev_requests_squares = 0; + int processed_tokens_in_batch = 0; int qkv_block_size = - (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int kt_block_size = m->kProjSize * MAX_SEQ_LEN; + (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); + int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_heads; - int vt_block_size = m->vProjSize * MAX_SEQ_LEN; + int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; int vt_req_block_size = vt_block_size * m->num_heads; assert(m->qProjSize == m->kProjSize); @@ -493,35 +410,21 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, if (bc->request_completed[i]) { continue; } - - int tokens_previous_tree_branches = 0; - - while (tokens_previous_tree_branches < - bc->requestsInfo[i].num_tokens_in_batch) { - int tree_branch_idx = bc->tokensInfo[tokens_previous_requests + - tokens_previous_tree_branches] - .tree_branch_idx; + int last_token_idx_of_the_request = + processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; + while (processed_tokens_in_batch <= last_token_idx_of_the_request) { int num_new_tokens = 1; - for (int j = tokens_previous_requests + tokens_previous_tree_branches + 1; - j < - tokens_previous_requests + bc->requestsInfo[i].num_tokens_in_batch; - j++) { - if (bc->tokensInfo[j].tree_branch_idx != tree_branch_idx) { - break; - } else { - num_new_tokens++; - } + int j = num_new_tokens + processed_tokens_in_batch; + while ((j + 1 <= last_token_idx_of_the_request) && + (bc->tokensInfo[j].abs_depth_in_request + 1 == + bc->tokensInfo[j + 1].abs_depth_in_request)) { + j++; + num_new_tokens++; } - int total_tokens = bc->tokensInfo[tokens_previous_requests + - tokens_previous_tree_branches] - .abs_depth_in_request + - num_new_tokens; - assert(num_new_tokens >= 1 && total_tokens >= num_new_tokens); - - if (tree_branch_idx == 0) { - assert(bc->tokensInfo[tokens_previous_requests].abs_depth_in_request == - bc->requestsInfo[i].token_start_offset); - } else { + + int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; + assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); + { // update K-V cache int parallelism = m->kProjSize * num_new_tokens * m->num_heads; update_tree_branch_kv_cache<<qProjSize, m->kProjSize, m->vProjSize, - num_new_tokens, // num_tokens_in_branch - tokens_previous_tree_branches, // num_tokens_previous_branches - tokens_previous_requests, // num_tokens_previous_requests + num_new_tokens, // num_tokens_in_branch + processed_tokens_in_batch, // num_processed_tokens_in_batch bc->requestsInfo[i].num_tokens_in_batch, // total_tokens_in_batch m->num_heads, - MAX_SEQ_LEN, + BatchConfig::MAX_SEQ_LENGTH, /* k_cache = */ true); parallelism = m->vProjSize * num_new_tokens * m->num_heads; @@ -553,38 +455,33 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, m->qProjSize, m->kProjSize, m->vProjSize, - num_new_tokens, // num_tokens_in_branch - tokens_previous_tree_branches, // num_tokens_previous_branches - tokens_previous_requests, // num_tokens_previous_requests + num_new_tokens, // num_tokens_in_branch + processed_tokens_in_batch, // num_processed_tokens_in_batch bc->requestsInfo[i].num_tokens_in_batch, // total_tokens_in_batch m->num_heads, - MAX_SEQ_LEN, + BatchConfig::MAX_SEQ_LENGTH, /* k_cache = */ false); } - 
// int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - /* int total_tokens = bc->requestsInfo[i].token_start_offset + - bc->requestsInfo[i].num_tokens_in_batch; */ // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; - int n = total_tokens; + int n = total_tokens_in_request; int k = m->qProjSize; int lda = k, ldb = k, ldc = m_; int strideA = qkv_block_size; int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; + int strideC = num_new_tokens * total_tokens_in_request; float alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; // To get A, skip over Q entries from previous requests (same head) void const *A = (void const *)(m->devQKVProjArray + - tokens_previous_requests * m->qProjSize); + processed_tokens_in_batch * m->qProjSize); // To get B, skip over K entries from previous requests (all heads + // padding) void const *B = (void const *)(m->keyCache + i * kt_req_block_size); // To get C, skip over QK^T products from previous requests - void *C = - (void *)(m->qk_prods + m->num_heads * tokens_prev_requests_squares); + void *C = (void *)(m->qk_prods); checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, @@ -611,21 +508,20 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Fill all elements above diagonal in qk prods with -inf to force // causal attention. - assert(num_new_tokens <= total_tokens); + assert(num_new_tokens <= total_tokens_in_request); size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; if (entries_above_diagonal > 0) { size_t parallelism = m->num_heads * entries_above_diagonal; - mha_verify_fill_entries_above_diagonal<<>>( - (float *)C, - num_new_tokens, - total_tokens, - m->num_heads, - entries_above_diagonal, - -INFINITY); + tree_fill_entries_above_diagonal<<>>((float *)C, + num_new_tokens, + total_tokens_in_request, + m->num_heads, + entries_above_diagonal, + -INFINITY); } // Compute Softmax(QK^T/sqrt(d_k)) cudnnTensorDescriptor_t qk_tensor; @@ -640,7 +536,7 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, // columns are the inner dimension and the images are the outermost // dimension. 
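For the causal mask a few lines above: each head's score block has num_new_tokens query rows against total_tokens_in_request key columns, and only the trailing num_new_tokens columns (the new tokens themselves) can violate causality, which is why entries_above_diagonal is num_new_tokens * (num_new_tokens - 1) / 2. A host-side sketch of the same rule, written row-major for readability rather than in the kernel's column-major layout:

#include <cmath>
#include <cstdio>
#include <vector>

// Mask scores so query row r attends to all cached tokens plus new tokens 0..r only.
void fill_above_diagonal(std::vector<float> &scores, int num_new_tokens, int total_tokens) {
  int cached = total_tokens - num_new_tokens;
  int masked = 0;
  for (int r = 0; r < num_new_tokens; r++) {
    for (int c = 0; c < total_tokens; c++) {
      if (c > cached + r) {  // strictly "above the diagonal" of the new-token block
        scores[r * total_tokens + c] = -INFINITY;
        masked++;
      }
    }
  }
  // Should match the kernel's per-head count: n * (n - 1) / 2.
  std::printf("masked %d (expected %d)\n", masked,
              num_new_tokens * (num_new_tokens - 1) / 2);
}

int main() {
  int num_new = 4, total = 10;
  std::vector<float> scores(num_new * total, 0.0f);
  fill_above_diagonal(scores, num_new, total);
}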
int n_param = m->num_heads; - int c_param = total_tokens; + int c_param = total_tokens_in_request; int h_param = 1; int w_param = num_new_tokens; checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, @@ -651,8 +547,7 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, h_param, w_param)); alpha = 1.0f, beta = 0.0f; - void *C_softmax = (void *)(m->qk_prods_softmax + - m->num_heads * tokens_prev_requests_squares); + void *C_softmax = (void *)(m->qk_prods_softmax); // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The // softmax operation is computed per spatial location (H,W) per image (N) @@ -670,9 +565,9 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, alpha = 1.0f, beta = 0.0f; m_ = num_new_tokens; n = m->vProjSize; - k = total_tokens; + k = total_tokens_in_request; lda = m_, ldb = n, ldc = m_; - strideA = num_new_tokens * total_tokens; + strideA = num_new_tokens * total_tokens_in_request; strideB = vt_block_size; strideC = num_new_tokens * m->vProjSize; // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous @@ -684,7 +579,7 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = (void *)(m->attn_heads + - tokens_previous_requests * m->num_heads * m->vProjSize); + processed_tokens_in_batch * m->num_heads * m->vProjSize); checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, @@ -717,7 +612,7 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, lda = k, ldb = n, ldc = m_; A = (void const *)m->W_out_contiguous; B = (void const *)C; - C = (void *)(output_ptr + tokens_previous_requests * m->oProjSize); + C = (void *)(output_ptr + processed_tokens_in_batch * m->oProjSize); checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -738,18 +633,19 @@ void inference_kernel3(IncMultiHeadSelfAttentionVerifyMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - tokens_previous_tree_branches += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; + processed_tokens_in_batch += num_new_tokens; } - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // Before moving to the next request + // check that we have finished all tokens of the request + assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } - assert(tokens_previous_requests == num_tokens); + assert(processed_tokens_in_batch == bc->num_active_tokens()); } /*static*/ -void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( - IncMultiHeadSelfAttentionVerifyMeta const *m, +void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, @@ -776,17 +672,17 @@ void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( if (!(*m->has_load_weights)) { int parallelism = m->vProjSize * m->oProjSize * m->num_heads; - mha_verify_build_w_out_tensor<<>>(weight_ptr, - m->W_out_contiguous, - m->vProjSize, - m->oProjSize, - m->num_heads, - (m->qSize * m->qProjSize + - m->kSize * m->kProjSize + - m->vSize * m->vProjSize)); + tree_build_w_out_tensor<<>>(weight_ptr, + m->W_out_contiguous, + m->vProjSize, + m->oProjSize, + m->num_heads, + (m->qSize * m->qProjSize + + m->kSize * m->kProjSize + + m->vSize * m->vProjSize)); *m->has_load_weights = true; } // here because we need postion info in 
infernece 1 @@ -797,14 +693,15 @@ void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); + compute_qkv_kernel(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); - // phase 2: Update key/val cache - inference_kernel2(m, bc, stream); + // phase 2: No need to update key/val cache + // IncMultiHeadSelfAttention::update_kv_cache_kernel( + // m, bc, stream); // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - inference_kernel3(m, bc, output_ptr, stream); + compute_attention_kernel(m, bc, output_ptr, stream); if (m->profiling) { cudaEventRecord(t_end, stream); @@ -813,16 +710,16 @@ void IncMultiHeadSelfAttentionVerify::inference_kernel_wrapper( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); - printf("IncMultiHeadSelfAttentionVerify forward time = %.2fms\n", elapsed); + printf("TreeIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } } -IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( +TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, - IncMultiHeadSelfAttentionVerify const *attn, + TreeIncMultiHeadSelfAttention const *attn, float const *weight_ptr, Memory gpu_mem, int num_samples, @@ -856,10 +753,10 @@ IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( assert(!attn->add_bias_kv); #ifdef INFERENCE_TESTS - kcache = (float *)calloc(kProjSize * MAX_SEQ_LEN * num_heads * + kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS, sizeof(float)); - vcache = (float *)calloc(vProjSize * MAX_SEQ_LEN * num_heads * + vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS, sizeof(float)); #endif @@ -872,10 +769,10 @@ IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( size_t committed_tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; size_t key_cache_size = num_heads * kProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * - MAX_SEQ_LEN; + BatchConfig::MAX_SEQ_LENGTH; size_t value_cache_size = num_heads * vProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * - MAX_SEQ_LEN; + BatchConfig::MAX_SEQ_LENGTH; size_t tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; size_t qk_prod_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS * TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads; @@ -915,10 +812,10 @@ IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( attn_heads = (float *)qk_prods_softmax + qk_prod_size; W_out_contiguous = (float *)attn_heads + attn_heads_size; int parallelism = vProjSize * oProjSize * num_heads; - mha_verify_build_w_out_tensor<<>>( + tree_build_w_out_tensor<<>>( weight_ptr, W_out_contiguous, vProjSize, @@ -930,8 +827,7 @@ IncMultiHeadSelfAttentionVerifyMeta::IncMultiHeadSelfAttentionVerifyMeta( cudaStreamSynchronize(stream); } -IncMultiHeadSelfAttentionVerifyMeta::~IncMultiHeadSelfAttentionVerifyMeta( - void) { +TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) { reserveInst.destroy(); #ifdef INFERENCE_TESTS free(kcache); diff --git 
a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 8c2096b694..39e797ea42 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -149,10 +149,10 @@ std::string get_operator_type_name(OperatorType type) { return "MultiHeadAttention"; case OP_INC_MULTIHEAD_SELF_ATTENTION: return "IncMultiHeadSelfAttention"; - case OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION: - return "SpeculativeIncMultiHeadSelfAttention"; - case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: - return "IncMultiHeadSelfAttentionVerify"; + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: + return "SpecIncMultiHeadSelfAttention"; + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + return "TreeIncMultiHeadSelfAttention"; case OP_INPUT: return "Input"; case OP_WEIGHT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 81fa9e4595..2a382f0d71 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -31,7 +31,6 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" -#include "flexflow/ops/inc_mha_verify.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -45,6 +44,7 @@ #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -2277,7 +2277,7 @@ GraphOptimalViewSerialized sez.serialize(attn->apply_rotary_embedding); break; } - case OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { SpecIncMultiHeadSelfAttention *attn = (SpecIncMultiHeadSelfAttention *)op; sez.serialize(attn->layer_guid.id); @@ -2292,9 +2292,9 @@ GraphOptimalViewSerialized sez.serialize(attn->apply_rotary_embedding); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { - IncMultiHeadSelfAttentionVerify *attn = - (IncMultiHeadSelfAttentionVerify *)op; + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + TreeIncMultiHeadSelfAttention *attn = + (TreeIncMultiHeadSelfAttention *)op; sez.serialize(attn->layer_guid.id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_heads); @@ -2698,7 +2698,7 @@ void FFModel::deserialize_graph_optimal_view( node = get_or_create_node(inputs[0], params); break; } - case OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); int embed_dim, num_heads, k_dim, v_dim; float dropout; @@ -2731,7 +2731,7 @@ void FFModel::deserialize_graph_optimal_view( params); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); int embed_dim, num_heads, k_dim, v_dim; float dropout; @@ -2749,7 +2749,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(add_zero_attn); dez.deserialize(apply_rotary_embedding); - IncMultiHeadSelfAttentionVerifyParams params; + TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; params.num_heads = num_heads; params.kdim = k_dim; @@ -2760,8 +2760,8 @@ void FFModel::deserialize_graph_optimal_view( params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; params.apply_rotary_embedding = apply_rotary_embedding; - node = get_or_create_node(inputs[0], - params); + node = get_or_create_node(inputs[0], + params); break; } case OP_TOPK: { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 
4b8bb032d8..f3e8664161 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -42,7 +42,6 @@ #include "flexflow/ops/fused.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" -#include "flexflow/ops/inc_mha_verify.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -57,6 +56,7 @@ #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -2765,7 +2765,7 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } - case OP_SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { Op *op = SpecIncMultiHeadSelfAttention::create_operator_from_layer( *this, layer, inputs); operators.push_back(op); @@ -2777,8 +2777,8 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } - case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { - Op *op = IncMultiHeadSelfAttentionVerify::create_operator_from_layer( + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + Op *op = TreeIncMultiHeadSelfAttention::create_operator_from_layer( *this, layer, inputs); operators.push_back(op); return op; @@ -4650,7 +4650,7 @@ void register_flexflow_internal_tasks() { // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( - SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, "Speculative IncMultiHeadSelfAttention Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); @@ -4660,7 +4660,7 @@ void register_flexflow_internal_tasks() { } { TaskVariantRegistrar registrar( - SPECULATIVE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, "Speculative IncMultiHeadSelfAttention Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); @@ -4670,24 +4670,23 @@ void register_flexflow_internal_tasks() { } { TaskVariantRegistrar registrar( - INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INIT_TASK_ID, - "IncMultiHeadSelfAttentionVerify Init"); + TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + "TreeIncMultiHeadSelfAttention Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant< - OpMeta *, - IncMultiHeadSelfAttentionVerify::init_task>( - registrar, "IncMultiHeadSelfAttentionVerify Init Task"); + Runtime::preregister_task_variant( + registrar, "TreeIncMultiHeadSelfAttention Init Task"); } { TaskVariantRegistrar registrar( - INC_MULTIHEAD_SELF_ATTENTION_VERIFY_INF_TASK_ID, - "IncMultiHeadSelfAttentionVerify Inference"); + TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + "TreeIncMultiHeadSelfAttention Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); Runtime::preregister_task_variant< - IncMultiHeadSelfAttentionVerify::inference_task>( - registrar, "IncMultiHeadSelfAttentionVerify Inference Task"); + TreeIncMultiHeadSelfAttention::inference_task>( + registrar, "TreeIncMultiHeadSelfAttention Inference Task"); } // NoOp { diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 19b6ac6b04..cd5e68d750 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -17,7 +17,6 
@@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" -#include "flexflow/ops/inc_mha_verify.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -33,6 +32,7 @@ #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -86,8 +86,8 @@ tl::optional get_op_parameters(Op const *op) { return ((MultiHeadAttention *)op)->get_params(); case OP_INC_MULTIHEAD_SELF_ATTENTION: return ((IncMultiHeadSelfAttention *)op)->get_params(); - case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: - return ((IncMultiHeadSelfAttentionVerify *)op)->get_params(); + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + return ((TreeIncMultiHeadSelfAttention *)op)->get_params(); case OP_LAYERNORM: return ((LayerNorm *)op)->get_params(); case OP_REDUCE_SUM: diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index c353d8fa16..0c2a2e3f84 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -28,7 +28,6 @@ #include "flexflow/ops/embedding.h" #include "flexflow/ops/experts.h" #include "flexflow/ops/flat.h" -#include "flexflow/ops/inc_mha_verify.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -36,6 +35,7 @@ #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/split.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -3715,12 +3715,12 @@ bool FFModel::convert_graph_to_operators( new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0], true); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION_VERIFY: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(inList.size() == 1); - IncMultiHeadSelfAttentionVerify *attn = - (IncMultiHeadSelfAttentionVerify *)node.ptr; + TreeIncMultiHeadSelfAttention *attn = + (TreeIncMultiHeadSelfAttention *)node.ptr; new_op = - new IncMultiHeadSelfAttentionVerify(*this, *attn, inputs[0], true); + new TreeIncMultiHeadSelfAttention(*this, *attn, inputs[0], true); break; } case OP_RMS_NORM: { From 577cec87a1d9ce782583a693fcd58187feed5679 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 8 May 2023 13:23:59 -0500 Subject: [PATCH 107/344] Support multiple FFModels in inference (#713) * Support multiple FFModels in a single top_level_task * rename operators and bug fixes * format * add missing files * rocm fix * Support multiple FFModels for speculative inference * add missing files * format --------- Co-authored-by: Gabriele Oliaro --- examples/cpp/inference/LLAMA/llama.cc | 2 +- .../cpp/inference/SPEC_LLAMA/CMakeLists.txt | 4 +- examples/cpp/inference/SPEC_LLAMA/llama.cc | 333 ++++++------------ examples/cpp/inference/SPEC_LLAMA/llama.h | 112 ------ .../cpp/inference/mixture_of_experts/moe.cc | 2 +- examples/cpp/inference/models/llama.cc | 206 +++++++++++ examples/cpp/inference/models/llama.h | 67 ++++ .../inference/transformers/transformers.cc | 2 +- include/flexflow/batch_config.h | 11 +- include/flexflow/config.h | 2 +- include/flexflow/ffconst.h | 6 + include/flexflow/inference.h | 9 +- src/ops/tree_inc_multihead_self_attention.cu | 
3 +- src/runtime/batch_config.cc | 2 + src/runtime/cuda_helper.cu | 2 +- src/runtime/inference_manager.cc | 10 +- src/runtime/model.cc | 12 +- src/runtime/request_manager.cc | 48 ++- 18 files changed, 468 insertions(+), 365 deletions(-) delete mode 100644 examples/cpp/inference/SPEC_LLAMA/llama.h create mode 100644 examples/cpp/inference/models/llama.cc create mode 100644 examples/cpp/inference/models/llama.h diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index ac25f70467..d4f17b3d9b 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -166,7 +166,7 @@ void FlexFlow::top_level_task(Task const *task, //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; - InferenceManager im(&ff, llamaConfig.batchSize, 1); + InferenceManager im(ff.config, llamaConfig.batchSize, 1); im.compile_model_and_allocate_buffer(&ff, mapping); RequestManager rm; diff --git a/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt b/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt index d6ceb38ff4..f273a385ea 100644 --- a/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt +++ b/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt @@ -7,8 +7,8 @@ set(project_target SPEC_LLAMA) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} llama.cc - llama.h - ../file_loader.cc) + ../file_loader.cc + ../models/llama.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) diff --git a/examples/cpp/inference/SPEC_LLAMA/llama.cc b/examples/cpp/inference/SPEC_LLAMA/llama.cc index ae5c4948e7..6cdc44f339 100644 --- a/examples/cpp/inference/SPEC_LLAMA/llama.cc +++ b/examples/cpp/inference/SPEC_LLAMA/llama.cc @@ -13,14 +13,14 @@ * limitations under the License. 
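The constructor change above (InferenceManager taking ff.config instead of a pointer to one FFModel) is what the commit message means by supporting multiple FFModels in a single top_level_task: the manager only needs the shared machine configuration, and the model to run is supplied per inference call. A schematic of that decoupling, with made-up types standing in for FlexFlow's:

#include <cstdio>

struct Config { int batch_size; };
struct Model  { char const *name; };

// Schematic only: the manager owns shared configuration; the model is a per-call
// argument, so one manager instance can drive several models.
class Manager {
  Config cfg;
public:
  explicit Manager(Config const &c) : cfg(c) {}
  void inference(Model *m, int batch_id) {
    std::printf("running %s on batch %d (batch size %d)\n",
                m->name, batch_id, cfg.batch_size);
  }
};

int main() {
  Config cfg{8};
  Manager im(cfg);                        // one manager...
  Model beam{"beam"}, tree{"tree"}, inc{"incremental"};
  im.inference(&beam, 0);                 // ...driving three different models
  im.inference(&tree, 0);
  im.inference(&inc, 0);
}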
*/ -#include "llama.h" +#include "models/llama.h" #include "flexflow/inference.h" using namespace Legion; LegionRuntime::Logger::Category log_app("llama"); -void parse_input_args(char **argv, int argc, LLAMAConfig &config) { +void parse_input_args(char **argv, int argc, LLAMA::Config &config) { for (int i = 1; i < argc; i++) { // input if (!strcmp(argv[i], "--dataset")) { @@ -41,239 +41,48 @@ void FlexFlow::top_level_task(Task const *task, Context ctx, Runtime *runtime) { FFConfig ffconfig; - LLAMAConfig llamaConfig; - FFModel ff(ffconfig); - //------------------------------compute machine views ------------------ - int num_devices = ffconfig.workersPerNode * ffconfig.numNodes; - std::vector machine_views; - for (int i = 0; i < num_devices; i++) { - MachineView view; - view.device_type = MachineView::GPU; - view.ndims = 1; - view.dim[0] = 1; - view.stride[0] = 0; - view.start_device_id = i; - machine_views.push_back(view); - } - - std::unordered_map> mapping; - std::unordered_map weights_layers; + LLAMA::Config llama_config; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args(argv, argc, llamaConfig); - - std::cout << "print llama config: " << llamaConfig.input_path << "-->" - << llamaConfig.batchSize << std::endl; - - //------------------------------ build the model -------------------------- - Tensor input; - { - int const token_dims[] = {llamaConfig.batchSize, llamaConfig.max_seq_len}; - input = ff.create_tensor<2>(token_dims, DT_INT32); - } - - mapping[input].push_back(machine_views[0]); - - Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - Tensor token = ff.embedding(input, - llamaConfig.vocab_size, - llamaConfig.dim, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - Layer *embedding = ff.layers.back(); - weights_layers.emplace("tok_embeddings_weight", embedding); - - // std::cout << "------token shape"; - // std::cout << token->num_dims << "------\n"; - // for (int i = 0; i < token->num_dims; i++) { - // std::cout << token->dims[i] << "------\n"; - // } - - // n transformer blocks impl - int num_transformer_layers_per_gpu = (32 + num_devices - 1) / num_devices; - - for (int i = 0; i < 32; i++) { - // step 1: attention - std::vector axes = {2}; - Tensor att_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); - Layer *attention_norm = ff.layers.back(); - - if (i % num_transformer_layers_per_gpu == 0) { - // Map att_norm to the next GPU - // since the size of att_norm is minimum across - // all tensors - mapping[att_norm].push_back( - machine_views[i / num_transformer_layers_per_gpu]); - } - - weights_layers.emplace("layers_" + std::to_string(i) + - "_attention_norm_weight", - attention_norm); - - // std::cout << "------before att shape"; - // std::cout << att_norm->num_dims << "------\n"; - // for (int i = 0; i < att_norm->num_dims; i++) { - // std::cout << att_norm->dims[i] << "------\n"; - // } - Tensor mha = ff.spec_inc_multihead_self_attention( - att_norm, - llamaConfig.dim, - llamaConfig.n_heads, - llamaConfig.dim / llamaConfig.n_heads, - llamaConfig.dim / llamaConfig.n_heads, - 0.0f, - true, - false, - false, - NULL, - true); - Layer *attention_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", - attention_layer); - token = ff.add(token, mha); - - // step 2: SILU activaion - Tensor ff_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); - Layer *ffn_layer = ff.layers.back(); - 
weights_layers.emplace("layers_" + std::to_string(i) + "_ffn_norm_weight", - ffn_layer); - - Tensor w1 = ff.dense(ff_norm, llamaConfig.hidden_dim, AC_MODE_NONE, false); - Layer *w1_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w1_weight", w1_layer); - - Tensor w3 = ff.dense(ff_norm, llamaConfig.hidden_dim, AC_MODE_NONE, false); - Layer *w3_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w3_weight", w3_layer); - - Tensor sigmoid = ff.sigmoid(w1); - Tensor silu = ff.multiply(w1, sigmoid); - Tensor multi = ff.multiply(silu, w3); - - Tensor w2 = ff.dense(multi, llamaConfig.dim, AC_MODE_NONE, false); - Layer *w2_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w2_weight", w2_layer); - token = ff.add(token, w2); - } - - // final normalization and linear - std::vector axes = {2}; - token = ff.rms_norm(token, 1e-6, 4096); - Layer *final_norm = ff.layers.back(); - weights_layers.emplace("norm_weight", final_norm); - - Tensor dense = ff.dense(token, llamaConfig.vocab_size, AC_MODE_NONE, false); - Layer *final_linear = ff.layers.back(); - weights_layers.emplace("output_weight", final_linear); - - Tensor softmax = ff.softmax(dense, -1); - Tensor output = ff.beam_top_k(softmax, llamaConfig.max_beam_width, false); - - //------------------- compile the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager im(&ff, llamaConfig.batchSize, 1); - im.compile_model_and_allocate_buffer(&ff, mapping); + parse_input_args(argv, argc, llama_config); + InferenceManager im(ffconfig, llama_config.batchSize, 1); RequestManager rm; + // Add a single request + std::vector prompt{ + 1, 306, 4658, 278, 6593, 310, 2834, 338}; + rm.register_new_request(prompt, llama_config.sentence_len); - // std::cout << "------init ops----------" << std::endl; - // im.init_operators_inference(); - // std::cout << "------model compiled and init ----------" << std::endl; - - //------------------------------ load inputs -------------------------- - std::cout << "------create dataloaders ----------" << std::endl; - // read prompt into input - ParallelTensor input_pt; - ff.get_parallel_tensor_from_tensor(input, input_pt); - assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); - std::cout << im.tensor_buffer[input_pt].size() << std::endl; - // DataLoader loader(ff, &llamaConfig, im.tensor_buffer[input_pt].at(0)); + FFModel beam_model(ffconfig), tree_model(ffconfig), inc_model(ffconfig); + LLAMA::create_llama_model(beam_model, im, llama_config, 1, BEAM_SEARCH_MODE); + LLAMA::create_llama_model(tree_model, im, llama_config, 1, TREE_VERIFY_MODE); + LLAMA::create_llama_model(inc_model, im, llama_config, 1, INC_DECODING_MODE); - //------------------------------ load weights--------------------------- - // for (auto &v : weights_layers) { - // Tensor weight = v.second->weights[0]; - // std::cout << "weights layer: " << v.first << "\n"; - - // if (weight == NULL) { - // std::cout << "op no weights : " << v.first << "\n"; - // continue; - // } - - // size_t volume = 1; - // std::vector dims_vec; - // for (int i = 0; i < weight->num_dims; i++) { - // dims_vec.push_back(weight->dims[i]); - // volume *= weight->dims[i]; - // } - - // assert(weight->data_type == DT_FLOAT); - // float *data = (float *)malloc(sizeof(float) * volume); - - // if (v.first.find("attention_w") != std::string::npos) { - // 
loader.load_attention_weights( - // data, volume, v.first, llamaConfig.weight_file_path); - - // } else { - // loader.load_from_file( - // data, volume, llamaConfig.weight_file_path + v.first); - // } - - // ParallelTensor weight_pt; - // ff.get_parallel_tensor_from_tensor(weight, weight_pt); - // weight_pt->set_tensor(&ff, dims_vec, data); - // } - - FileDataLoader fileloader(llamaConfig.input_path, - llamaConfig.weight_file_path); - BatchConfig::TokenId *tokens = fileloader.generate_requests( - llamaConfig.batchSize, llamaConfig.max_seq_len); - - for (int i = 0; i < 40; i++) { - std::cout << tokens[i] << ", "; - } - for (int i = 0; i < llamaConfig.batchSize; i++) { - std::cout << "-------" << std::endl; - std::vector prompt( - tokens + i * llamaConfig.max_seq_len, - tokens + (i + 1) * llamaConfig.max_seq_len); - rm.register_new_request(prompt, llamaConfig.sentence_len); - } - - fileloader.load_weights(&ff, weights_layers); - - std::cout << "------load wieght finished----------" << std::endl; - - //------------------------------ do inference, we only have 5 prompts for the - // test case, so simplify the batch_configs with 1 - im.init_operators_inference(&ff); // entry--------------------------- int depth = 0; - std::map future_handlers; - std::map batch_configs; + std::map beam_future_handlers, tree_future_handler; + std::map beam_batch_configs; + std::map tree_batch_configs; bool new_req = true; + TreeVerifyBatchConfig tree_bc; - while (depth < llamaConfig.max_beam_depth) { + while (depth < llama_config.max_beam_depth) { int bid = 0; - if (future_handlers.find(bid) == future_handlers.end()) { + if (beam_future_handlers.find(bid) == beam_future_handlers.end()) { BeamSearchBatchConfig bc; BeamInferenceResult ir; bc = rm.prepare_next_batch_beam(bc, ir); std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&ff, bid, bc); + FutureMap fm = im.inference(&beam_model, bid, bc); assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; } else { // have luanched this bid - Future future = future_handlers[bid]; + Future future = beam_future_handlers[bid]; if (!future.is_ready(true /*subscribe*/)) { continue; } else { @@ -281,26 +90,106 @@ void FlexFlow::top_level_task(Task const *task, } // process end BeamInferenceResult ir = future.get_result(); - BeamSearchBatchConfig bc = batch_configs[bid]; + BeamSearchBatchConfig bc = beam_batch_configs[bid]; depth = bc.beamRequestsInfo[0].current_depth; bc = rm.prepare_next_batch_beam(bc, ir); std::cout << "llama current depth: " << depth << std::endl; std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&ff, bid, bc); + FutureMap fm = im.inference(&beam_model, bid, bc); assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; // tranverse the tree in dfs order; - if (depth >= llamaConfig.max_beam_depth) { - std::cout << "tranverse the tree" - << "\n"; - rm.tranverse_beam_tree(bc); + if (depth >= llama_config.max_beam_depth) { + // std::cout << "tranverse the tree" + // << "\n"; + // rm.tranverse_beam_tree(bc); + tree_bc = rm.convert_beam_to_tree_batch_config(bc); } } } + // original + { + std::vector tokens{1, + 306, + 4658, + 278, + 6593, + 310, + 2834, + 338, + 593, + 595, + 17252, + 5031, + 
993, + 616, + 368, + 2302}; + BatchConfig bc; + bc.num_tokens = 16; + bc.requestsInfo[0].num_tokens_in_batch = bc.num_tokens; + bc.requestsInfo[0].token_start_offset = 0; + bc.requestsInfo[0].max_sequence_length = 347; + bc.requestsInfo[0].request_guid = 1000000; + bc.request_completed[0] = false; + for (int i = 0; i < bc.num_tokens; i++) { + bc.tokensInfo[i].token_id = tokens[i]; + bc.tokensInfo[i].abs_depth_in_request = i; + bc.tokensInfo[i].request_index = 0; + } + FutureMap fm = im.inference(&inc_model, 0, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + InferenceResult ir = future.get_result(); + for (int i = 0; i < bc.num_tokens; i++) { + printf("decoding_tokens[%d] = %d\n", i, ir.token_ids[i]); + } + } + + // verification + { + std::vector tokens{1, + 306, + 4658, + 278, + 6593, + 310, + 2834, + 338, + 593, + 595, + 17252, + 5031, + 993, + 616, + 368, + 2302}; + tree_bc.num_tokens = 16; + tree_bc.requestsInfo[0].num_tokens_in_batch = tree_bc.num_tokens; + for (int i = 0; i < tree_bc.num_tokens; i++) { + tree_bc.tokensInfo[i].token_id = tokens[i]; + tree_bc.tokensInfo[i].abs_depth_in_request = i; + tree_bc.tokensInfo[i].request_index = 0; + } + FutureMap fm = im.inference(&tree_model, 0, tree_bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + InferenceResult ir = future.get_result(); + for (int i = 0; i < tree_bc.num_tokens; i++) { + printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); + } + } + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + // float* data std::cout << "----------inference finished--------------" << std::endl; } diff --git a/examples/cpp/inference/SPEC_LLAMA/llama.h b/examples/cpp/inference/SPEC_LLAMA/llama.h deleted file mode 100644 index d2a96e70a5..0000000000 --- a/examples/cpp/inference/SPEC_LLAMA/llama.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "file_loader.h" -#include "flexflow/batch_config.h" - -using namespace Legion; -using namespace FlexFlow; - -struct LLAMAConfig { - LLAMAConfig(void) { - // todo read from config/param file - n_layers = 32; - vocab_size = 32000; - n_heads = 32; - dim = 4096; - multiple_of = 256; - norm_eps = 1e-6; - total_sentence = 5; - sentence_len = 347; - max_gen_length = 256; - batchSize = 5; - total_requests = 2560; - incremental_mode = true; - sequence_length = BatchConfig::MAX_SEQ_LENGTH; - max_seq_len = 8; - max_beam_width = 3; - max_beam_depth = 3; - - // hidden dim - hidden_dim = 4 * dim; - hidden_dim = int(2 * hidden_dim / 3); - hidden_dim = - multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); - } - int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, - total_sentence, sentence_len, batchSize, total_requests, incremental_mode, - sequence_length, max_gen_length, max_seq_len, max_beam_width, - max_beam_depth; - float norm_eps; - std::string weight_file_path; - std::string input_path; -}; - -// struct Prediction_result{ -// long tokens[MAX_]; -// float probs[MAX_BEAM_SIZE]; -// int parent_ids[MAX_BEAM_SIZE]; -// }; - -// class DataLoader { -// public: -// DataLoader(FFModel &ff, -// LLAMAConfig const *llamaconfig, -// ParallelTensor const &input); -// void next_batch(FFModel &ff, -// BatchConfig *bc, -// std::map &batch_predictions); -// void reset(); -// static void load_entire_dataset(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime); -// static void load_input(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime); - -// template -// static void load_from_file(T *ptr, size_t size, std::string filename); - -// template -// static void load_attention_weights(T *ptr, -// size_t size, -// std::string layer_name, -// std::string weight_path); -// void store_outputs(BatchConfig *bc, -// InferenceResult const &ir, -// std::map &batch_predictions); -// void update_beam_slots(BatchConfig *bc, std::map -// batch_predictions); void update_beam_tree(); - -// public: -// int num_samples, next_index, next_token_idx, next_batch_index; -// std::map> outputs; -// FlexFlow::ParallelTensor full_input, batch_input; -// }; - -// struct SampleIdxs { -// int num_samples; -// int idxs[MAX_NUM_SAMPLES]; -// int token_idx; -// int batch_idx; -// }; - -// struct DataLoaderNextBatchInput { -// BatchConfig const &bc; -// std::map const &prev_batch_preds; -// }; diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 22752db39a..0c94452ec1 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -140,7 +140,7 @@ void FlexFlow::top_level_task(Task const *task, //------------------- Initialize the inference manager ------------------ InferenceManager im( - &ff, moeConfig.batch_size, moeConfig.num_inflight_batches); + ff.config, moeConfig.batch_size, moeConfig.num_inflight_batches); std::unordered_map> mapping; im.compile_model_and_allocate_buffer(&ff, mapping); im.init_operators_inference(&ff); diff --git a/examples/cpp/inference/models/llama.cc b/examples/cpp/inference/models/llama.cc new file mode 100644 index 0000000000..ad1743125d --- /dev/null +++ b/examples/cpp/inference/models/llama.cc @@ -0,0 +1,206 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "llama.h" + +namespace FlexFlow { + +using namespace Legion; + +void LLAMA::create_llama_model(FFModel &ff, + InferenceManager &im, + Config const &llama_config, + int num_pipeline_stages, + InferenceMode mode) { + //------------------------------compute machine views ------------------ + int num_devices = ff.config.workersPerNode * ff.config.numNodes; + std::vector machine_views; + for (int i = 0; i < num_devices; i++) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = 1; + view.stride[0] = 0; + view.start_device_id = i; + machine_views.push_back(view); + } + + std::unordered_map> mapping; + std::unordered_map weights_layers; + + std::cout << "print llama config: " << llama_config.input_path << "-->" + << llama_config.batchSize << std::endl; + + Tensor input; + { + int const token_dims[] = {llama_config.batchSize, llama_config.max_seq_len}; + input = ff.create_tensor<2>(token_dims, DT_INT32); + } + mapping[input].push_back(machine_views[0]); + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + Tensor token = ff.embedding(input, + llama_config.vocab_size, + llama_config.dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + Layer *embedding = ff.layers.back(); + weights_layers.emplace("tok_embeddings_weight", embedding); + + int num_transformer_layers_per_stage = + (32 + num_pipeline_stages - 1) / num_pipeline_stages; + + for (int i = 0; i < 1; i++) { + // step 1: attention + std::vector axes = {2}; + Tensor att_norm = + ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); + Layer *attention_norm = ff.layers.back(); + + if (i % num_transformer_layers_per_stage == 0) { + // Map att_norm to the next GPU + // since the size of att_norm is minimum across + // all tensors + mapping[att_norm].push_back( + machine_views[i / num_transformer_layers_per_stage]); + } + + weights_layers.emplace("layers_" + std::to_string(i) + + "_attention_norm_weight", + attention_norm); + + // std::cout << "------before att shape"; + // std::cout << att_norm->num_dims << "------\n"; + // for (int i = 0; i < att_norm->num_dims; i++) { + // std::cout << att_norm->dims[i] << "------\n"; + // } + Tensor mha; + switch (mode) { + case BEAM_SEARCH_MODE: { + mha = ff.spec_inc_multihead_self_attention( + att_norm, + llama_config.dim, + llama_config.n_heads, + llama_config.dim / llama_config.n_heads, + llama_config.dim / llama_config.n_heads, + 0.0f, + true, + false, + false, + NULL, + true); + break; + } + case TREE_VERIFY_MODE: { + mha = ff.inc_multihead_self_attention_verify( + att_norm, + llama_config.dim, + llama_config.n_heads, + llama_config.dim / llama_config.n_heads, + llama_config.dim / llama_config.n_heads, + 0.0f, /*dropout*/ + true, /*bias*/ + false, /*add_bias_kv*/ + false, /*add_zero_attn*/ + nullptr, /*kernel_initializer*/ + true /*apply_rotary_embedding*/ + ); + break; + } + case INC_DECODING_MODE: { + mha = ff.inc_multihead_self_attention( + att_norm, + llama_config.dim, + llama_config.n_heads, + llama_config.dim / llama_config.n_heads, + 
llama_config.dim / llama_config.n_heads, + 0.0f, /*dropout*/ + true, /*bias*/ + false, /*add_bias_kv*/ + false, /*add_zero_attn*/ + nullptr, /*kernel_initializer*/ + true /*apply_rotary_embedding*/ + ); + break; + } + default: { + assert(false); + } + } + Layer *attention_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", + attention_layer); + token = ff.add(token, mha); + + // step 2: SILU activaion + Tensor ff_norm = + ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); + Layer *ffn_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_ffn_norm_weight", + ffn_layer); + + Tensor w1 = ff.dense(ff_norm, llama_config.hidden_dim, AC_MODE_NONE, false); + Layer *w1_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w1_weight", w1_layer); + + Tensor w3 = ff.dense(ff_norm, llama_config.hidden_dim, AC_MODE_NONE, false); + Layer *w3_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w3_weight", w3_layer); + + Tensor sigmoid = ff.sigmoid(w1); + Tensor silu = ff.multiply(w1, sigmoid); + Tensor multi = ff.multiply(silu, w3); + + Tensor w2 = ff.dense(multi, llama_config.dim, AC_MODE_NONE, false); + Layer *w2_layer = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_feed_forward_w2_weight", w2_layer); + token = ff.add(token, w2); + } + // final normalization and linear + std::vector axes = {2}; + token = ff.rms_norm(token, 1e-6, 4096); + Layer *final_norm = ff.layers.back(); + weights_layers.emplace("norm_weight", final_norm); + + Tensor dense = ff.dense(token, llama_config.vocab_size, AC_MODE_NONE, false); + Layer *final_linear = ff.layers.back(); + weights_layers.emplace("output_weight", final_linear); + + Tensor output; + if (mode == BEAM_SEARCH_MODE) { + Tensor softmax = ff.softmax(dense, -1); + output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + } else { + output = ff.arg_top_k(dense, /*k=*/1, false); + } + + // Compile the model + std::cout << "------start compile ----------" << std::endl; + im.compile_model_and_allocate_buffer(&ff, mapping); + FileDataLoader fileloader(llama_config.input_path, + llama_config.weight_file_path); + fileloader.load_weights(&ff, weights_layers); + std::cout << "------load wieght finished----------" << std::endl; + + // init operators + im.init_operators_inference(&ff); +} + +}; // namespace FlexFlow diff --git a/examples/cpp/inference/models/llama.h b/examples/cpp/inference/models/llama.h new file mode 100644 index 0000000000..612a28967e --- /dev/null +++ b/examples/cpp/inference/models/llama.h @@ -0,0 +1,67 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "file_loader.h" +#include "flexflow/batch_config.h" +#include "flexflow/inference.h" + +namespace FlexFlow { + +class LLAMA { +public: + struct Config { + Config(void) { + // todo read from config/param file + n_layers = 32; + vocab_size = 32000; + n_heads = 32; + dim = 4096; + multiple_of = 256; + norm_eps = 1e-6; + total_sentence = 5; + sentence_len = 347; + max_gen_length = 256; + batchSize = 5; + total_requests = 2560; + incremental_mode = true; + sequence_length = BatchConfig::MAX_SEQ_LENGTH; + max_seq_len = 8; + max_beam_width = 1; + max_beam_depth = 8; + + // hidden dim + hidden_dim = 4 * dim; + hidden_dim = int(2 * hidden_dim / 3); + hidden_dim = + multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); + } + int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, + total_sentence, sentence_len, batchSize, total_requests, + incremental_mode, sequence_length, max_gen_length, max_seq_len, + max_beam_width, max_beam_depth; + float norm_eps; + std::string weight_file_path; + std::string input_path; + }; + + static void create_llama_model(FFModel &ff, + InferenceManager &im, + Config const &llama_config, + int num_pipeline_stages, + InferenceMode mode); +}; + +}; // namespace FlexFlow diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 860fd23fe4..d416fdca3c 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -114,7 +114,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor output = ff.arg_top_k(t, /*k=*/1, false); //------------------- Initialize the inference manager ------------------ - InferenceManager im(&ff, + InferenceManager im(ff.config, transformerConfig.batch_size, transformerConfig.num_inflight_batches); std::unordered_map> mapping; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 461643e755..55bad9237a 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -72,20 +72,19 @@ class BatchConfig { class TreeVerifyBatchConfig : public BatchConfig { public: - struct PerTokenInfo : BatchConfig::PerTokenInfo { - int tree_branch_idx; - }; + // struct PerTokenInfo : BatchConfig::PerTokenInfo { + // int tree_branch_idx; + // }; struct CommittedTokensInfo { int token_index; // the index of the token in the previous batch int request_index; // request index in the batch int token_depth; // position of the token in the request's sequence }; - void compute_tree_branch_indexes(); + // void compute_tree_branch_indexes(); int num_tokens_to_commit; CommittedTokensInfo commited_tokens[MAX_NUM_TOKENS]; - PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; }; struct InferenceResult { @@ -105,7 +104,7 @@ class BeamSearchBatchConfig : public BatchConfig { size_t beam_width; size_t target_iterations; - static int const MAX_BEAM_WIDTH = 3; + static int const MAX_BEAM_WIDTH = 1; static int const MAX_BEAM_DEPTH = 8; struct BeamSearchPerRequestInfo { diff --git a/include/flexflow/config.h b/include/flexflow/config.h index d82b1377c7..c8a9f50aa2 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -122,7 +122,7 @@ class FFConfig { size_t workSpaceSize; Legion::Context lg_ctx; Legion::Runtime *lg_hlr; - Legion::FieldSpace field_space; + // Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; size_t simulator_work_space_size; size_t search_budget; diff --git a/include/flexflow/ffconst.h 
b/include/flexflow/ffconst.h index e6a4eb6f3c..6bcfb66927 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -58,6 +58,12 @@ enum MetricsType { METRICS_MEAN_ABSOLUTE_ERROR = 1032, }; +enum InferenceMode { + INC_DECODING_MODE = 2001, + BEAM_SEARCH_MODE = 2002, + TREE_VERIFY_MODE = 2003, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 3025d8a748..623b8ffd32 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -26,7 +26,7 @@ class BeamTree; class InferenceManager { public: - InferenceManager(FFModel *_model, + InferenceManager(FFConfig const &config, int max_num_tokens_per_batch, int max_num_inflight_batches); void compile_model_and_allocate_buffer( @@ -39,8 +39,8 @@ class InferenceManager { ParallelTensor const input); public: + FFConfig ff_config; std::unordered_map> tensor_buffer; - FFModel *model; int max_num_tokens_per_batch; int max_num_inflight_batches; int num_devices; @@ -69,6 +69,7 @@ class RequestManager { using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; RequestManager(); + size_t get_num_processed_requests(); RequestGuid register_new_request(std::vector const &prompt, int max_sequence_length); BatchConfig prepare_next_batch(BatchConfig const &bc, @@ -84,7 +85,8 @@ class RequestManager { BeamTree &tree, int request_index); void tranverse_beam_tree(BeamSearchBatchConfig const &old_bc); - + TreeVerifyBatchConfig + convert_beam_to_tree_batch_config(BeamSearchBatchConfig const &beam_bc); static void load_tokens_task(Legion::Task const *task, std::vector const ®ions, @@ -97,6 +99,7 @@ class RequestManager { std::mutex request_queue_mutex; RequestGuid next_available_guid; struct BeamTree beam_trees[BatchConfig::MAX_NUM_REQUESTS]; + size_t num_processed_requests; }; } // namespace FlexFlow diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index c9b85f96b8..d2b9f63401 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -414,7 +414,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; while (processed_tokens_in_batch <= last_token_idx_of_the_request) { int num_new_tokens = 1; - int j = num_new_tokens + processed_tokens_in_batch; + int j = processed_tokens_in_batch; while ((j + 1 <= last_token_idx_of_the_request) && (bc->tokensInfo[j].abs_depth_in_request + 1 == bc->tokensInfo[j + 1].abs_depth_in_request)) { @@ -506,6 +506,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens_in_request); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 2f2655f589..2bed8f14c0 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -173,6 +173,7 @@ void BatchConfig::print() const { } } +#ifdef DEADCODE void TreeVerifyBatchConfig::compute_tree_branch_indexes() { // Must be called only after setting num_tokens! 
auto is_first_token_in_request = [&](int token_index) -> bool { @@ -190,5 +191,6 @@ void TreeVerifyBatchConfig::compute_tree_branch_indexes() { } } } +#endif }; // namespace FlexFlow diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 2fcf0e096a..f4c39c6b0b 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -585,4 +585,4 @@ template __host__ bool download_tensor(int32_t const *ptr, size_t num_elements); template __host__ bool download_tensor(int64_t const *ptr, int64_t *dst, - size_t num_elements); \ No newline at end of file + size_t num_elements); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index e9fe33f22e..25d6e2a00e 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -25,13 +25,13 @@ namespace FlexFlow { using namespace Legion; -InferenceManager::InferenceManager(FFModel *_model, +InferenceManager::InferenceManager(FFConfig const &_config, int _max_num_tokens_per_batch, int _max_num_inflight_batches) - : model(_model), max_num_tokens_per_batch(_max_num_tokens_per_batch), + : ff_config(_config), max_num_tokens_per_batch(_max_num_tokens_per_batch), max_num_inflight_batches(_max_num_inflight_batches) { // populate array of valid single-device machine views - num_devices = model->config.workersPerNode * model->config.numNodes; + num_devices = ff_config.workersPerNode * ff_config.numNodes; for (int i = 0; i < num_devices; i++) { MachineView view; view.device_type = MachineView::GPU; @@ -246,8 +246,8 @@ FutureMap InferenceManager::inference(FFModel *model, void InferenceManager::load_input_tokens_from_batch_config( BatchConfig const &bc, ParallelTensor const input) { - Context ctx = model->config.lg_ctx; - Runtime *runtime = model->config.lg_hlr; + Context ctx = ff_config.lg_ctx; + Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; IndexLauncher launcher( diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f3e8664161..38bb7b8333 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1317,11 +1317,11 @@ FFModel::FFModel(FFConfig &_config) metrics_input = -1; // Load strategy file // Create field space - { - FieldAllocator allocator = - runtime->create_field_allocator(ctx, config.field_space); - allocator.allocate_field(sizeof(float), FID_DATA); - } + //{ + // FieldAllocator allocator = + // runtime->create_field_allocator(ctx, config.field_space); + // allocator.allocate_field(sizeof(float), FID_DATA); + //} // Build training dataset // if (config.datasetPath.length() == 0) { // dataLoader = NULL; @@ -3712,7 +3712,7 @@ FFConfig::FFConfig() { Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; lg_ctx = Runtime::get_context(); - field_space = runtime->create_field_space(lg_ctx); + // field_space = runtime->create_field_space(lg_ctx); } void FFConfig::parse_args(char **argv, int argc) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 810be3df1f..149bc18ec7 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -22,7 +22,8 @@ using namespace Legion; LegionRuntime::Logger::Category log_req_mgr("RequestManager"); -RequestManager::RequestManager() : next_available_guid(1000000) {} +RequestManager::RequestManager() + : next_available_guid(1000000), num_processed_requests(0) {} RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, @@ -41,6 +42,10 @@ RequestManager::RequestGuid return 
request.guid; } +size_t RequestManager::get_num_processed_requests() { + return num_processed_requests; +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); @@ -436,7 +441,7 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, } } -bool PreOrder(BeamTree tree, +bool PreOrder(BeamTree const &tree, int max_depth, int current_depth, int beam_width, @@ -491,6 +496,43 @@ bool PreOrder(BeamTree tree, return flag; } +TreeVerifyBatchConfig RequestManager::convert_beam_to_tree_batch_config( + BeamSearchBatchConfig const &beam_bc) { + TreeVerifyBatchConfig tree_bc; + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (beam_bc.request_completed[i]) { + continue; + } + // We don't modify requests during the conversion + tree_bc.request_completed[i] = beam_bc.request_completed[i]; + BeamTree const &tree = beam_trees[i]; + // token, index + // todo make this one global for different stages + std::vector> serializedTree; + PreOrder(tree, + beam_bc.beamRequestsInfo[i].max_depth, + 0, + beam_bc.beamRequestsInfo[i].beam_size, + 0, + serializedTree); + tree_bc.requestsInfo[i].request_guid = beam_bc.requestsInfo[i].request_guid; + tree_bc.requestsInfo[i].max_sequence_length = + beam_bc.requestsInfo[i].max_sequence_length; + tree_bc.requestsInfo[i].token_start_offset = serializedTree[0].second; + tree_bc.requestsInfo[i].num_tokens_in_batch = 0; + for (int k = 0; k < serializedTree.size(); k++) { + assert(tree_bc.num_tokens < BatchConfig::MAX_NUM_TOKENS); + tree_bc.tokensInfo[tree_bc.num_tokens].request_index = i; + tree_bc.tokensInfo[tree_bc.num_tokens].abs_depth_in_request = + serializedTree[k].second; + tree_bc.tokensInfo[tree_bc.num_tokens].token_id = serializedTree[k].first; + tree_bc.num_tokens++; + tree_bc.requestsInfo[i].num_tokens_in_batch++; + } + } + return tree_bc; +} + void RequestManager::tranverse_beam_tree(BeamSearchBatchConfig const &old_bc) { for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { if (old_bc.request_completed[i]) { @@ -502,7 +544,7 @@ void RequestManager::tranverse_beam_tree(BeamSearchBatchConfig const &old_bc) { int depth = old_bc.beamRequestsInfo[i].current_depth; int beam_width = old_bc.beamRequestsInfo[i].beam_size; - BeamTree tree = beam_trees[i]; + BeamTree const &tree = beam_trees[i]; // token, index // todo make this one global for different stages From 21a2dd7c763f7efab3b63b7554fbafba5175a157 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 May 2023 21:17:04 +0000 Subject: [PATCH 108/344] fix --- examples/cpp/inference/file_loader.cc | 58 ++++++++++++++++++--------- examples/cpp/inference/file_loader.h | 5 ++- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc index f9f399b464..57baf05694 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/examples/cpp/inference/file_loader.cc @@ -20,8 +20,10 @@ using namespace std; FileDataLoader::FileDataLoader(std::string _input_path, - std::string _weight_file_path) - : input_path(_input_path), weight_file_path(_weight_file_path){}; + std::string _weight_file_path, + int _num_heads) + : input_path(_input_path), weight_file_path(_weight_file_path), + num_heads(_num_heads){}; BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { @@ -61,8 +63,9 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { return prompts; }; -void 
load_attention_weights(float *ptr, - size_t size, +void load_attention_weights(float *dst_ptr, + size_t total_weights_size, + int num_heads, std::string layer_name, std::string weight_path) { std::string q_file = weight_path + @@ -80,12 +83,12 @@ void load_attention_weights(float *ptr, std::vector weight_files = {q_file, k_file, v_file, o_file}; size_t index = 0; - int file_index = 0; + int weight_index = 0; // {q, k, v, o} -> {0, 1, 2, 3} - // q, k, v, o -> 0, 1, 2, 3 for (auto file : weight_files) { - std::cout << "file name and index: " << file << "->" << file_index << "\n"; - size_t partial_size = size / 4; + std::cout << "file name and index: " << file << "->" << weight_index + << "\n"; + size_t partial_size = total_weights_size / 4; std::ifstream in(file, std::ios::in | std::ios::binary); std::vector host_array(partial_size); size_t loaded_data_size = sizeof(float) * partial_size; @@ -100,17 +103,35 @@ void load_attention_weights(float *ptr, } assert(partial_size == host_array.size()); - size_t one_head_size = 4096 * 128; - size_t data_index = 0; - - for (int i = 0; i < 32; i++) { - size_t start_index = i * one_head_size * 4 + file_index * one_head_size; - for (size_t j = start_index; j < start_index + one_head_size; j++) { - ptr[j] = host_array.at(data_index); - data_index += 1; + size_t hidden_dim = 4096; + size_t qkv_inner_dim = 128; + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_head_size = + single_proj_size * 4; // size of Q+K+V+O weights for a single head + size_t checkpoint_idx, flexflow_idx; + + for (int i = 0; i < num_heads * single_proj_size; i++) { + int checkpoint_row_idx = i % hidden_dim; + int checkpoint_column_idx = (i / hidden_dim) % qkv_inner_dim; + int head_idx = i / single_proj_size; + checkpoint_idx = head_idx * one_head_size + + weight_index * single_proj_size + + checkpoint_column_idx * hidden_dim + checkpoint_row_idx; + if (weight_index < 3) { + // if this is the Q,K or V weight + flexflow_idx = checkpoint_idx; + } else { + // if this is the output projection weight + flexflow_idx = + head_idx * one_head_size + weight_index * single_proj_size + + checkpoint_row_idx * qkv_inner_dim + checkpoint_column_idx; } + dst_ptr[flexflow_idx] = host_array.at(checkpoint_idx); } - file_index++; + + weight_index++; in.close(); index++; @@ -169,7 +190,8 @@ void FileDataLoader::load_weights( float *data = (float *)malloc(sizeof(float) * volume); if (v.first.find("attention_w") != std::string::npos) { - load_attention_weights(data, volume, v.first, weight_file_path); + load_attention_weights( + data, volume, num_heads, v.first, weight_file_path); } else { load_from_file(data, volume, weight_file_path + v.first); diff --git a/examples/cpp/inference/file_loader.h b/examples/cpp/inference/file_loader.h index e1edc3f1a9..1005729ddd 100644 --- a/examples/cpp/inference/file_loader.h +++ b/examples/cpp/inference/file_loader.h @@ -23,7 +23,9 @@ using namespace FlexFlow; class FileDataLoader { public: - FileDataLoader(std::string _input_path, std::string _weight_file_path); + FileDataLoader(std::string _input_path, + std::string _weight_file_path, + int _num_heads = 32); BatchConfig::TokenId *generate_requests(int num, int length); @@ -31,6 +33,7 @@ class FileDataLoader { std::unordered_map weights_layers); private: + int num_heads; std::string input_path; std::string weight_file_path; }; From a7cc9b19615c2c2f385ed9f4b6ef52b67a421061 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 May 2023 
21:34:49 +0000 Subject: [PATCH 109/344] more fixes --- examples/cpp/inference/LLAMA/dataloader.cc | 73 ++++++++++++++-------- examples/cpp/inference/LLAMA/llama.cc | 9 ++- examples/cpp/inference/LLAMA/llama.h | 7 ++- examples/cpp/inference/file_loader.cc | 28 ++++++--- examples/cpp/inference/file_loader.h | 5 +- examples/cpp/inference/models/llama.cc | 5 +- 6 files changed, 86 insertions(+), 41 deletions(-) diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index 13de5b5b64..1a99ac0099 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -37,7 +37,8 @@ DataLoader::DataLoader(FFModel &ff, } // size_t llamaconfig_size = sizeof(llamaconfig); - // std::cout << "llama config dataloader: " << llamaconfig->input_path; + // std::cout << "llama config dataloader: " << llamaconfig->input_path << + // std::endl; // // Load entire dataset // TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, @@ -66,7 +67,7 @@ void DataLoader::load_entire_dataset(Task const *task, assert(acc_input.accessor.is_dense_arbitrary(rect_input)); long *input_ptr = acc_input.ptr(rect_input.lo); - std::cout << "load entire dataset" << rect_input.volume(); + std::cout << "load entire dataset" << rect_input.volume() << std::endl; // load from file load_from_file(input_ptr, @@ -129,7 +130,6 @@ void DataLoader::reset() { template void DataLoader::load_from_file(T *ptr, size_t size, std::string filename) { - std::cout << "load from file: " << filename << std::endl; std::ifstream in(filename, std::ios::in | std::ios::binary); std::vector host_array(size); @@ -159,8 +159,11 @@ void DataLoader::load_from_file(T *ptr, size_t size, std::string filename) { } template -void DataLoader::load_attention_weights(T *ptr, - size_t size, +void DataLoader::load_attention_weights(T *dst_ptr, + size_t total_weights_size, + int num_heads, + size_t hidden_dim, + size_t qkv_inner_dim, std::string layer_name, std::string weight_path) { @@ -178,41 +181,55 @@ void DataLoader::load_attention_weights(T *ptr, "attention_wo_weight"; std::vector weight_files = {q_file, k_file, v_file, o_file}; - size_t index = 0; - int file_index = 0; + int weight_index = 0; // {q, k, v, o} -> {0, 1, 2, 3} - // q, k, v, o -> 0, 1, 2, 3 for (auto file : weight_files) { - std::cout << "file name and index: " << file << "->" << file_index << "\n"; - size_t partial_size = size / 4; + std::cout << "file name and index: " << file << "->" << weight_index + << "\n"; + size_t partial_size = total_weights_size / 4; std::ifstream in(file, std::ios::in | std::ios::binary); - std::vector host_array(partial_size); - size_t loaded_data_size = sizeof(T) * partial_size; + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { - std::cout << "load data error"; + std::cout << "load data error" << std::endl; return; } assert(partial_size == host_array.size()); - size_t one_head_size = 4096 * 128; - size_t data_index = 0; - - for (int i = 0; i < 32; i++) { - size_t start_index = i * one_head_size * 4 + file_index * one_head_size; - for (size_t j = start_index; j < start_index + one_head_size; j++) { - ptr[j] = host_array.at(data_index); - data_index += 1; + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_head_size = + 
single_proj_size * 4; // size of Q+K+V+O weights for a single head + size_t checkpoint_idx, flexflow_idx; + + for (int i = 0; i < num_heads * single_proj_size; i++) { + int checkpoint_row_idx = i % hidden_dim; + int checkpoint_column_idx = (i / hidden_dim) % qkv_inner_dim; + int head_idx = i / single_proj_size; + checkpoint_idx = head_idx * one_head_size + + weight_index * single_proj_size + + checkpoint_column_idx * hidden_dim + checkpoint_row_idx; + if (weight_index < 3) { + // if this is the Q,K or V weight + flexflow_idx = checkpoint_idx; + } else { + // if this is the output projection weight + flexflow_idx = + head_idx * one_head_size + weight_index * single_proj_size + + checkpoint_row_idx * qkv_inner_dim + checkpoint_column_idx; } + dst_ptr[flexflow_idx] = host_array.at(checkpoint_idx); } - file_index++; + + weight_index++; in.close(); - index++; } } @@ -251,8 +268,14 @@ void DataLoader::store_outputs(BatchConfig *bc, assert(batch_predictions.size() == bc->num_active_requests()); } -template void DataLoader::load_attention_weights( - float *ptr, size_t size, std::string layer_name, std::string weight_path); +template void + DataLoader::load_attention_weights(float *dst_ptr, + size_t total_weights_size, + int num_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weight_path); template void DataLoader::load_from_file(long *ptr, size_t size, std::string filename); diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index d4f17b3d9b..6d714ba2fa 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -200,8 +200,13 @@ void FlexFlow::top_level_task(Task const *task, float *data = (float *)malloc(sizeof(float) * volume); if (v.first.find("attention_w") != std::string::npos) { - loader.load_attention_weights( - data, volume, v.first, llamaConfig.weight_file_path); + loader.load_attention_weights(data, + volume, + llamaConfig.n_heads, + llamaConfig.dim, + llamaConfig.dim / llamaConfig.n_heads, + v.first, + llamaConfig.weight_file_path); } else { loader.load_from_file( diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h index 0ae8d57d5b..04cde4a73c 100644 --- a/examples/cpp/inference/LLAMA/llama.h +++ b/examples/cpp/inference/LLAMA/llama.h @@ -82,8 +82,11 @@ class DataLoader { static void load_from_file(T *ptr, size_t size, std::string filename); template - static void load_attention_weights(T *ptr, - size_t size, + static void load_attention_weights(T *dst_ptr, + size_t total_weights_size, + int num_heads, + size_t hidden_dim, + size_t qkv_inner_dim, std::string layer_name, std::string weight_path); void store_outputs(BatchConfig *bc, diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc index 57baf05694..8241b920ad 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/examples/cpp/inference/file_loader.cc @@ -21,9 +21,12 @@ using namespace std; FileDataLoader::FileDataLoader(std::string _input_path, std::string _weight_file_path, - int _num_heads) + int _num_heads, + size_t _hidden_dim, + size_t _qkv_inner_dim) : input_path(_input_path), weight_file_path(_weight_file_path), - num_heads(_num_heads){}; + num_heads(_num_heads), hidden_dim(_hidden_dim), + qkv_inner_dim(_qkv_inner_dim){}; BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { @@ -44,7 +47,7 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { size_t in_get_size = in.gcount(); if 
(in_get_size != loaded_data_size) { - std::cout << "load data error"; + std::cout << "load data error" << std::endl; return prompts; } @@ -66,6 +69,8 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { void load_attention_weights(float *dst_ptr, size_t total_weights_size, int num_heads, + size_t hidden_dim, + size_t qkv_inner_dim, std::string layer_name, std::string weight_path) { std::string q_file = weight_path + @@ -98,13 +103,11 @@ void load_attention_weights(float *dst_ptr, size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { - std::cout << "load data error"; + std::cout << "load data error" << std::endl; return; } assert(partial_size == host_array.size()); - size_t hidden_dim = 4096; - size_t qkv_inner_dim = 128; size_t single_proj_size = hidden_dim * qkv_inner_dim; // size of each of Q,K,V,O weights for a single head @@ -152,11 +155,11 @@ void load_from_file(float *ptr, size_t size, std::string filename) { // std::cout << loaded_data_size << std::endl; // std::cout << in_get_size << std::endl; if (in_get_size != loaded_data_size) { - std::cout << "load data error"; + std::cout << "load data error" << std::endl; return; } - // std::cout << "finish loading input"; + // std::cout << "finish loading input" << std::endl; assert(size == host_array.size()); // normal @@ -190,8 +193,13 @@ void FileDataLoader::load_weights( float *data = (float *)malloc(sizeof(float) * volume); if (v.first.find("attention_w") != std::string::npos) { - load_attention_weights( - data, volume, num_heads, v.first, weight_file_path); + load_attention_weights(data, + volume, + num_heads, + hidden_dim, + qkv_inner_dim, + v.first, + weight_file_path); } else { load_from_file(data, volume, weight_file_path + v.first); diff --git a/examples/cpp/inference/file_loader.h b/examples/cpp/inference/file_loader.h index 1005729ddd..7d03b3ac82 100644 --- a/examples/cpp/inference/file_loader.h +++ b/examples/cpp/inference/file_loader.h @@ -25,7 +25,9 @@ class FileDataLoader { public: FileDataLoader(std::string _input_path, std::string _weight_file_path, - int _num_heads = 32); + int _num_heads, + size_t _hidden_dim, + size_t _qkv_inner_dim); BatchConfig::TokenId *generate_requests(int num, int length); @@ -34,6 +36,7 @@ class FileDataLoader { private: int num_heads; + size_t hidden_dim, qkv_inner_dim; std::string input_path; std::string weight_file_path; }; diff --git a/examples/cpp/inference/models/llama.cc b/examples/cpp/inference/models/llama.cc index ad1743125d..efd5c18b6e 100644 --- a/examples/cpp/inference/models/llama.cc +++ b/examples/cpp/inference/models/llama.cc @@ -195,7 +195,10 @@ void LLAMA::create_llama_model(FFModel &ff, std::cout << "------start compile ----------" << std::endl; im.compile_model_and_allocate_buffer(&ff, mapping); FileDataLoader fileloader(llama_config.input_path, - llama_config.weight_file_path); + llama_config.weight_file_path, + llama_config.n_heads, + llama_config.dim, + llama_config.dim / llama_config.n_heads); fileloader.load_weights(&ff, weights_layers); std::cout << "------load wieght finished----------" << std::endl; From 77ee93ad26c06b738a7e6b0b41a58d27622f7ff7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 May 2023 21:38:26 +0000 Subject: [PATCH 110/344] fix --- examples/cpp/inference/LLAMA/dataloader.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index 1a99ac0099..a2a0e48561 100644 --- 
a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -188,8 +188,8 @@ void DataLoader::load_attention_weights(T *dst_ptr, << "\n"; size_t partial_size = total_weights_size / 4; std::ifstream in(file, std::ios::in | std::ios::binary); - std::vector host_array(partial_size); - size_t loaded_data_size = sizeof(float) * partial_size; + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(T) * partial_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); From b70b5b6c9999674c4e1c28c010a37ddd7492eb85 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 May 2023 23:27:04 +0000 Subject: [PATCH 111/344] fix --- examples/cpp/inference/LLAMA/dataloader.cc | 6 ++++ examples/cpp/inference/LLAMA/llama.h | 2 +- examples/cpp/inference/file_loader.cc | 40 ++++++++++------------ 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index a2a0e48561..eab168a452 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -208,6 +208,12 @@ void DataLoader::load_attention_weights(T *dst_ptr, single_proj_size * 4; // size of Q+K+V+O weights for a single head size_t checkpoint_idx, flexflow_idx; + assert(total_weights_size == one_head_size); + assert(partial_size == single_proj_size); + + std::cout << "host_array.size(): " << host_array.size() << std::endl; + std::cout << "single_proj_size: " << single_proj_size << std::endl; + for (int i = 0; i < num_heads * single_proj_size; i++) { int checkpoint_row_idx = i % hidden_dim; int checkpoint_column_idx = (i / hidden_dim) % qkv_inner_dim; diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h index 04cde4a73c..7584c26aa4 100644 --- a/examples/cpp/inference/LLAMA/llama.h +++ b/examples/cpp/inference/LLAMA/llama.h @@ -42,7 +42,7 @@ struct LLAMAConfig { // todo from args weight_file_path = - "/home/ubuntu/FlexFlow_Inference/examples/cpp/inference/LLAMA/weights/"; + "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/weights/"; input_path = "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/tokens/" "llama_demo_tokens"; diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc index 8241b920ad..820ffb17df 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/examples/cpp/inference/file_loader.cc @@ -67,7 +67,6 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { }; void load_attention_weights(float *dst_ptr, - size_t total_weights_size, int num_heads, size_t hidden_dim, size_t qkv_inner_dim, @@ -87,16 +86,21 @@ void load_attention_weights(float *dst_ptr, "attention_wo_weight"; std::vector weight_files = {q_file, k_file, v_file, o_file}; - size_t index = 0; int weight_index = 0; // {q, k, v, o} -> {0, 1, 2, 3} + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_head_size = + single_proj_size * 4; // size of Q+K+V+O weights for a single head + size_t one_weight_file_size = + num_heads * single_proj_size; // size of each of Q/K/V/O for all heads for (auto file : weight_files) { std::cout << "file name and index: " << file << "->" << weight_index << "\n"; - size_t partial_size = total_weights_size / 4; std::ifstream in(file, std::ios::in | std::ios::binary); - std::vector host_array(partial_size); - size_t loaded_data_size = sizeof(float) * 
partial_size; + std::vector host_array(one_weight_file_size); + size_t loaded_data_size = sizeof(float) * one_weight_file_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); @@ -106,38 +110,30 @@ void load_attention_weights(float *dst_ptr, std::cout << "load data error" << std::endl; return; } - assert(partial_size == host_array.size()); + assert(one_weight_file_size == host_array.size()); - size_t single_proj_size = - hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single head - size_t one_head_size = - single_proj_size * 4; // size of Q+K+V+O weights for a single head - size_t checkpoint_idx, flexflow_idx; - - for (int i = 0; i < num_heads * single_proj_size; i++) { + size_t flexflow_idx; + for (int i = 0; i < one_weight_file_size; i++) { int checkpoint_row_idx = i % hidden_dim; int checkpoint_column_idx = (i / hidden_dim) % qkv_inner_dim; int head_idx = i / single_proj_size; - checkpoint_idx = head_idx * one_head_size + - weight_index * single_proj_size + - checkpoint_column_idx * hidden_dim + checkpoint_row_idx; if (weight_index < 3) { // if this is the Q,K or V weight - flexflow_idx = checkpoint_idx; + flexflow_idx = + head_idx * one_head_size + weight_index * single_proj_size + + checkpoint_column_idx * qkv_inner_dim + checkpoint_row_idx; } else { // if this is the output projection weight flexflow_idx = head_idx * one_head_size + weight_index * single_proj_size + checkpoint_row_idx * qkv_inner_dim + checkpoint_column_idx; } - dst_ptr[flexflow_idx] = host_array.at(checkpoint_idx); + dst_ptr[flexflow_idx] = host_array.at(i); } weight_index++; in.close(); - index++; } } @@ -193,8 +189,10 @@ void FileDataLoader::load_weights( float *data = (float *)malloc(sizeof(float) * volume); if (v.first.find("attention_w") != std::string::npos) { + assert(dims_vec[0] = hidden_dim * qkv_inner_dim * 4); + assert(dims_vec[1] = num_heads); + assert(volume == dims_vec[0] * dims_vec[1]); load_attention_weights(data, - volume, num_heads, hidden_dim, qkv_inner_dim, From c19244b539df13e0bcbe2937dee0c901d8e3424d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 May 2023 23:43:21 +0000 Subject: [PATCH 112/344] update --- examples/cpp/inference/.gitignore | 1 + examples/cpp/inference/LLAMA/dataloader.cc | 41 +++++++++------------- examples/cpp/inference/LLAMA/llama.cc | 5 ++- examples/cpp/inference/LLAMA/llama.h | 1 - 4 files changed, 21 insertions(+), 27 deletions(-) create mode 100644 examples/cpp/inference/.gitignore diff --git a/examples/cpp/inference/.gitignore b/examples/cpp/inference/.gitignore new file mode 100644 index 0000000000..05424f2a4c --- /dev/null +++ b/examples/cpp/inference/.gitignore @@ -0,0 +1 @@ +weights diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index eab168a452..8eabc0a464 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -160,7 +160,6 @@ void DataLoader::load_from_file(T *ptr, size_t size, std::string filename) { template void DataLoader::load_attention_weights(T *dst_ptr, - size_t total_weights_size, int num_heads, size_t hidden_dim, size_t qkv_inner_dim, @@ -182,14 +181,20 @@ void DataLoader::load_attention_weights(T *dst_ptr, std::vector weight_files = {q_file, k_file, v_file, o_file}; int weight_index = 0; // {q, k, v, o} -> {0, 1, 2, 3} + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_head_size = + 
single_proj_size * 4; // size of Q+K+V+O weights for a single head + size_t one_weight_file_size = + num_heads * single_proj_size; // size of each of Q/K/V/O for all heads for (auto file : weight_files) { std::cout << "file name and index: " << file << "->" << weight_index << "\n"; - size_t partial_size = total_weights_size / 4; std::ifstream in(file, std::ios::in | std::ios::binary); - std::vector host_array(partial_size); - size_t loaded_data_size = sizeof(T) * partial_size; + std::vector host_array(one_weight_file_size); + size_t loaded_data_size = sizeof(T) * one_weight_file_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); @@ -199,38 +204,25 @@ void DataLoader::load_attention_weights(T *dst_ptr, std::cout << "load data error" << std::endl; return; } - assert(partial_size == host_array.size()); + assert(one_weight_file_size == host_array.size()); - size_t single_proj_size = - hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single head - size_t one_head_size = - single_proj_size * 4; // size of Q+K+V+O weights for a single head - size_t checkpoint_idx, flexflow_idx; - - assert(total_weights_size == one_head_size); - assert(partial_size == single_proj_size); - - std::cout << "host_array.size(): " << host_array.size() << std::endl; - std::cout << "single_proj_size: " << single_proj_size << std::endl; - - for (int i = 0; i < num_heads * single_proj_size; i++) { + size_t flexflow_idx; + for (int i = 0; i < one_weight_file_size; i++) { int checkpoint_row_idx = i % hidden_dim; int checkpoint_column_idx = (i / hidden_dim) % qkv_inner_dim; int head_idx = i / single_proj_size; - checkpoint_idx = head_idx * one_head_size + - weight_index * single_proj_size + - checkpoint_column_idx * hidden_dim + checkpoint_row_idx; if (weight_index < 3) { // if this is the Q,K or V weight - flexflow_idx = checkpoint_idx; + flexflow_idx = + head_idx * one_head_size + weight_index * single_proj_size + + checkpoint_column_idx * qkv_inner_dim + checkpoint_row_idx; } else { // if this is the output projection weight flexflow_idx = head_idx * one_head_size + weight_index * single_proj_size + checkpoint_row_idx * qkv_inner_dim + checkpoint_column_idx; } - dst_ptr[flexflow_idx] = host_array.at(checkpoint_idx); + dst_ptr[flexflow_idx] = host_array.at(i); } weight_index++; @@ -276,7 +268,6 @@ void DataLoader::store_outputs(BatchConfig *bc, template void DataLoader::load_attention_weights(float *dst_ptr, - size_t total_weights_size, int num_heads, size_t hidden_dim, size_t qkv_inner_dim, diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index 6d714ba2fa..5724a91859 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -200,8 +200,11 @@ void FlexFlow::top_level_task(Task const *task, float *data = (float *)malloc(sizeof(float) * volume); if (v.first.find("attention_w") != std::string::npos) { + assert(dims_vec[0] = + llamaConfig.dim * (llamaConfig.dim / llamaConfig.n_heads) * 4); + assert(dims_vec[1] = llamaConfig.n_heads); + assert(volume == dims_vec[0] * dims_vec[1]); loader.load_attention_weights(data, - volume, llamaConfig.n_heads, llamaConfig.dim, llamaConfig.dim / llamaConfig.n_heads, diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h index 7584c26aa4..cfa9345c04 100644 --- a/examples/cpp/inference/LLAMA/llama.h +++ b/examples/cpp/inference/LLAMA/llama.h @@ -83,7 +83,6 @@ class DataLoader { template static void 
load_attention_weights(T *dst_ptr, - size_t total_weights_size, int num_heads, size_t hidden_dim, size_t qkv_inner_dim, From 7ef5bd7a49d057bfc989ab66d387a0eb69378eaa Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 May 2023 23:47:46 +0000 Subject: [PATCH 113/344] fix --- src/ops/spec_inc_multihead_self_attention.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index dc1d861b08..97e91460a6 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -34,7 +34,8 @@ __global__ void spec_build_w_out_tensor(float const *weight_ptr, int v_idx = i % vProjSize; int o_idx = (i / vProjSize) % oProjSize; int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[i] = + contiguous_weight_ptr[o_idx * vProjSize * num_heads + head_idx * vProjSize + + v_idx] = weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + qkv_weight_block_size + o_idx * vProjSize + v_idx]; } From 7ae43ee5bac448ba6d44135fca4d4336de701480 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 9 May 2023 01:17:27 +0000 Subject: [PATCH 114/344] fix --- examples/cpp/inference/file_loader.cc | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc index 820ffb17df..277d7fd2f6 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/examples/cpp/inference/file_loader.cc @@ -121,7 +121,7 @@ void load_attention_weights(float *dst_ptr, // if this is the Q,K or V weight flexflow_idx = head_idx * one_head_size + weight_index * single_proj_size + - checkpoint_column_idx * qkv_inner_dim + checkpoint_row_idx; + checkpoint_column_idx * hidden_dim + checkpoint_row_idx; } else { // if this is the output projection weight flexflow_idx = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 97e91460a6..6efd8b8874 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -31,11 +31,12 @@ __global__ void spec_build_w_out_tensor(float const *weight_ptr, int num_heads, int qkv_weight_block_size) { CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { + // vProjSize = 128, oProjSize = 4096 int v_idx = i % vProjSize; int o_idx = (i / vProjSize) % oProjSize; int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[o_idx * vProjSize * num_heads + head_idx * vProjSize + - v_idx] = + contiguous_weight_ptr[head_idx * vProjSize * oProjSize + v_idx * oProjSize + + o_idx] = weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + qkv_weight_block_size + o_idx * vProjSize + v_idx]; } From d045e3e5cddb7fe97f2299ac22ac54266e36fd39 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 9 May 2023 01:20:02 +0000 Subject: [PATCH 115/344] format --- examples/cpp/inference/file_loader.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc index 277d7fd2f6..07cfd0dccf 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/examples/cpp/inference/file_loader.cc @@ -89,7 +89,7 @@ void load_attention_weights(float *dst_ptr, int weight_index = 0; // {q, k, v, o} -> {0, 1, 2, 3} size_t single_proj_size = hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + qkv_inner_dim; // size 
of each of Q,K,V,O weights for a single head size_t one_head_size = single_proj_size * 4; // size of Q+K+V+O weights for a single head size_t one_weight_file_size = @@ -119,9 +119,9 @@ void load_attention_weights(float *dst_ptr, int head_idx = i / single_proj_size; if (weight_index < 3) { // if this is the Q,K or V weight - flexflow_idx = - head_idx * one_head_size + weight_index * single_proj_size + - checkpoint_column_idx * hidden_dim + checkpoint_row_idx; + flexflow_idx = head_idx * one_head_size + + weight_index * single_proj_size + + checkpoint_column_idx * hidden_dim + checkpoint_row_idx; } else { // if this is the output projection weight flexflow_idx = From 2d54c32a9acdbf07fc86e097f8109821d1025f25 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 May 2023 06:15:53 +0000 Subject: [PATCH 116/344] cleanup --- examples/cpp/inference/LLAMA/dataloader.cc | 66 +++++++------------- examples/cpp/inference/LLAMA/llama.cc | 8 +-- examples/cpp/inference/LLAMA/llama.h | 6 +- examples/cpp/inference/file_loader.cc | 65 +++++++------------ src/ops/inc_multihead_self_attention.cu | 9 ++- src/ops/spec_inc_multihead_self_attention.cu | 10 ++- src/ops/tree_inc_multihead_self_attention.cu | 9 ++- 7 files changed, 60 insertions(+), 113 deletions(-) diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index 8eabc0a464..21557b9c94 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -159,10 +159,8 @@ void DataLoader::load_from_file(T *ptr, size_t size, std::string filename) { } template -void DataLoader::load_attention_weights(T *dst_ptr, - int num_heads, - size_t hidden_dim, - size_t qkv_inner_dim, +void DataLoader::load_attention_weights(T *ptr, + size_t size, std::string layer_name, std::string weight_path) { @@ -180,52 +178,37 @@ void DataLoader::load_attention_weights(T *dst_ptr, "attention_wo_weight"; std::vector weight_files = {q_file, k_file, v_file, o_file}; - int weight_index = 0; // {q, k, v, o} -> {0, 1, 2, 3} - size_t single_proj_size = - hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single head - size_t one_head_size = - single_proj_size * 4; // size of Q+K+V+O weights for a single head - size_t one_weight_file_size = - num_heads * single_proj_size; // size of each of Q/K/V/O for all heads + int file_index = 0; + // q, k, v, o -> 0, 1, 2, 3 for (auto file : weight_files) { - std::cout << "file name and index: " << file << "->" << weight_index - << "\n"; + std::cout << "file name and index: " << file << "->" << file_index << "\n"; + size_t partial_size = size / 4; std::ifstream in(file, std::ios::in | std::ios::binary); - std::vector host_array(one_weight_file_size); - size_t loaded_data_size = sizeof(T) * one_weight_file_size; + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(T) * partial_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { - std::cout << "load data error" << std::endl; + std::cout << "load data error"; return; } - assert(one_weight_file_size == host_array.size()); - - size_t flexflow_idx; - for (int i = 0; i < one_weight_file_size; i++) { - int checkpoint_row_idx = i % hidden_dim; - int checkpoint_column_idx = (i / hidden_dim) % qkv_inner_dim; - int head_idx = i / single_proj_size; - if (weight_index < 3) { - // if this is the Q,K or V weight - flexflow_idx = - head_idx * one_head_size + 
weight_index * single_proj_size + - checkpoint_column_idx * qkv_inner_dim + checkpoint_row_idx; - } else { - // if this is the output projection weight - flexflow_idx = - head_idx * one_head_size + weight_index * single_proj_size + - checkpoint_row_idx * qkv_inner_dim + checkpoint_column_idx; + assert(partial_size == host_array.size()); + + size_t one_head_size = 4096 * 128; + size_t data_index = 0; + + for (int i = 0; i < 32; i++) { + size_t start_index = i * one_head_size * 4 + file_index * one_head_size; + for (size_t j = start_index; j < start_index + one_head_size; j++) { + ptr[j] = host_array.at(data_index); + data_index += 1; } - dst_ptr[flexflow_idx] = host_array.at(i); } - - weight_index++; + file_index++; in.close(); } @@ -266,13 +249,8 @@ void DataLoader::store_outputs(BatchConfig *bc, assert(batch_predictions.size() == bc->num_active_requests()); } -template void - DataLoader::load_attention_weights(float *dst_ptr, - int num_heads, - size_t hidden_dim, - size_t qkv_inner_dim, - std::string layer_name, - std::string weight_path); +template void DataLoader::load_attention_weights( + float *ptr, size_t size, std::string layer_name, std::string weight_path); template void DataLoader::load_from_file(long *ptr, size_t size, std::string filename); diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index 5724a91859..383a93397b 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -204,12 +204,8 @@ void FlexFlow::top_level_task(Task const *task, llamaConfig.dim * (llamaConfig.dim / llamaConfig.n_heads) * 4); assert(dims_vec[1] = llamaConfig.n_heads); assert(volume == dims_vec[0] * dims_vec[1]); - loader.load_attention_weights(data, - llamaConfig.n_heads, - llamaConfig.dim, - llamaConfig.dim / llamaConfig.n_heads, - v.first, - llamaConfig.weight_file_path); + loader.load_attention_weights( + data, volume, v.first, llamaConfig.weight_file_path); } else { loader.load_from_file( diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h index cfa9345c04..f2302b25ae 100644 --- a/examples/cpp/inference/LLAMA/llama.h +++ b/examples/cpp/inference/LLAMA/llama.h @@ -82,10 +82,8 @@ class DataLoader { static void load_from_file(T *ptr, size_t size, std::string filename); template - static void load_attention_weights(T *dst_ptr, - int num_heads, - size_t hidden_dim, - size_t qkv_inner_dim, + static void load_attention_weights(T *ptr, + size_t size, std::string layer_name, std::string weight_path); void store_outputs(BatchConfig *bc, diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc index 07cfd0dccf..e0ca8cc105 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/examples/cpp/inference/file_loader.cc @@ -66,12 +66,11 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { return prompts; }; -void load_attention_weights(float *dst_ptr, - int num_heads, - size_t hidden_dim, - size_t qkv_inner_dim, +void load_attention_weights(float *ptr, + size_t size, std::string layer_name, std::string weight_path) { + std::string q_file = weight_path + layer_name.substr(0, layer_name.find("attention")) + "attention_wq_weight"; @@ -86,52 +85,37 @@ void load_attention_weights(float *dst_ptr, "attention_wo_weight"; std::vector weight_files = {q_file, k_file, v_file, o_file}; - int weight_index = 0; // {q, k, v, o} -> {0, 1, 2, 3} - size_t single_proj_size = - hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single 
head - size_t one_head_size = - single_proj_size * 4; // size of Q+K+V+O weights for a single head - size_t one_weight_file_size = - num_heads * single_proj_size; // size of each of Q/K/V/O for all heads + int file_index = 0; + // q, k, v, o -> 0, 1, 2, 3 for (auto file : weight_files) { - std::cout << "file name and index: " << file << "->" << weight_index - << "\n"; + std::cout << "file name and index: " << file << "->" << file_index << "\n"; + size_t partial_size = size / 4; std::ifstream in(file, std::ios::in | std::ios::binary); - std::vector host_array(one_weight_file_size); - size_t loaded_data_size = sizeof(float) * one_weight_file_size; + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { - std::cout << "load data error" << std::endl; + std::cout << "load data error"; return; } - assert(one_weight_file_size == host_array.size()); - - size_t flexflow_idx; - for (int i = 0; i < one_weight_file_size; i++) { - int checkpoint_row_idx = i % hidden_dim; - int checkpoint_column_idx = (i / hidden_dim) % qkv_inner_dim; - int head_idx = i / single_proj_size; - if (weight_index < 3) { - // if this is the Q,K or V weight - flexflow_idx = head_idx * one_head_size + - weight_index * single_proj_size + - checkpoint_column_idx * hidden_dim + checkpoint_row_idx; - } else { - // if this is the output projection weight - flexflow_idx = - head_idx * one_head_size + weight_index * single_proj_size + - checkpoint_row_idx * qkv_inner_dim + checkpoint_column_idx; + assert(partial_size == host_array.size()); + + size_t one_head_size = 4096 * 128; + size_t data_index = 0; + + for (int i = 0; i < 32; i++) { + size_t start_index = i * one_head_size * 4 + file_index * one_head_size; + for (size_t j = start_index; j < start_index + one_head_size; j++) { + ptr[j] = host_array.at(data_index); + data_index += 1; } - dst_ptr[flexflow_idx] = host_array.at(i); } - - weight_index++; + file_index++; in.close(); } @@ -192,12 +176,7 @@ void FileDataLoader::load_weights( assert(dims_vec[0] = hidden_dim * qkv_inner_dim * 4); assert(dims_vec[1] = num_heads); assert(volume == dims_vec[0] * dims_vec[1]); - load_attention_weights(data, - num_heads, - hidden_dim, - qkv_inner_dim, - v.first, - weight_file_path); + load_attention_weights(data, volume, v.first, weight_file_path); } else { load_from_file(data, volume, weight_file_path + v.first); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index f68bec459b..5997a3d48d 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -31,13 +31,12 @@ __global__ void build_w_out_tensor(float const *weight_ptr, int num_heads, int qkv_weight_block_size) { CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { - int v_idx = i % vProjSize; - int o_idx = (i / vProjSize) % oProjSize; + int row_idx = i % vProjSize; + int col_idx = (i / vProjSize) % oProjSize; int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[o_idx * vProjSize * num_heads + head_idx * vProjSize + - v_idx] = + contiguous_weight_ptr[i] = weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + - qkv_weight_block_size + o_idx * vProjSize + v_idx]; + qkv_weight_block_size + col_idx * vProjSize + row_idx]; } } diff --git a/src/ops/spec_inc_multihead_self_attention.cu 
b/src/ops/spec_inc_multihead_self_attention.cu index 6efd8b8874..42f647f670 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -31,14 +31,12 @@ __global__ void spec_build_w_out_tensor(float const *weight_ptr, int num_heads, int qkv_weight_block_size) { CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { - // vProjSize = 128, oProjSize = 4096 - int v_idx = i % vProjSize; - int o_idx = (i / vProjSize) % oProjSize; + int row_idx = i % vProjSize; + int col_idx = (i / vProjSize) % oProjSize; int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[head_idx * vProjSize * oProjSize + v_idx * oProjSize + - o_idx] = + contiguous_weight_ptr[i] = weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + - qkv_weight_block_size + o_idx * vProjSize + v_idx]; + qkv_weight_block_size + col_idx * vProjSize + row_idx]; } } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index d2b9f63401..1d45ecaf14 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -31,13 +31,12 @@ __global__ void tree_build_w_out_tensor(float const *weight_ptr, int num_heads, int qkv_weight_block_size) { CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { - int v_idx = i % vProjSize; - int o_idx = (i / vProjSize) % oProjSize; + int row_idx = i % vProjSize; + int col_idx = (i / vProjSize) % oProjSize; int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[o_idx * vProjSize * num_heads + head_idx * vProjSize + - v_idx] = + contiguous_weight_ptr[i] = weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + - qkv_weight_block_size + o_idx * vProjSize + v_idx]; + qkv_weight_block_size + col_idx * vProjSize + row_idx]; } } From 2c88a9a5c51d9d32fc606a049064608b8f90cd21 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 May 2023 06:21:34 +0000 Subject: [PATCH 117/344] removed magic numbers --- examples/cpp/inference/LLAMA/dataloader.cc | 15 +++++++++++---- examples/cpp/inference/LLAMA/llama.cc | 8 ++++++-- examples/cpp/inference/LLAMA/llama.h | 2 ++ examples/cpp/inference/file_loader.cc | 9 ++++++--- 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc index 21557b9c94..7f2cfe3577 100644 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ b/examples/cpp/inference/LLAMA/dataloader.cc @@ -161,6 +161,8 @@ void DataLoader::load_from_file(T *ptr, size_t size, std::string filename) { template void DataLoader::load_attention_weights(T *ptr, size_t size, + int hidden_dim, + int num_heads, std::string layer_name, std::string weight_path) { @@ -198,10 +200,10 @@ void DataLoader::load_attention_weights(T *ptr, } assert(partial_size == host_array.size()); - size_t one_head_size = 4096 * 128; + size_t one_head_size = hidden_dim * (hidden_dim / num_heads); size_t data_index = 0; - for (int i = 0; i < 32; i++) { + for (int i = 0; i < num_heads; i++) { size_t start_index = i * one_head_size * 4 + file_index * one_head_size; for (size_t j = start_index; j < start_index + one_head_size; j++) { ptr[j] = host_array.at(data_index); @@ -249,8 +251,13 @@ void DataLoader::store_outputs(BatchConfig *bc, assert(batch_predictions.size() == bc->num_active_requests()); } -template void DataLoader::load_attention_weights( - float *ptr, size_t size, std::string layer_name, std::string weight_path); +template void + 
DataLoader::load_attention_weights(float *ptr, + size_t size, + int hidden_dim, + int num_heads, + std::string layer_name, + std::string weight_path); template void DataLoader::load_from_file(long *ptr, size_t size, std::string filename); diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index 383a93397b..de374459cb 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -204,8 +204,12 @@ void FlexFlow::top_level_task(Task const *task, llamaConfig.dim * (llamaConfig.dim / llamaConfig.n_heads) * 4); assert(dims_vec[1] = llamaConfig.n_heads); assert(volume == dims_vec[0] * dims_vec[1]); - loader.load_attention_weights( - data, volume, v.first, llamaConfig.weight_file_path); + loader.load_attention_weights(data, + volume, + llamaConfig.dim, + llamaConfig.n_heads, + v.first, + llamaConfig.weight_file_path); } else { loader.load_from_file( diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h index f2302b25ae..6bf30cb19c 100644 --- a/examples/cpp/inference/LLAMA/llama.h +++ b/examples/cpp/inference/LLAMA/llama.h @@ -84,6 +84,8 @@ class DataLoader { template static void load_attention_weights(T *ptr, size_t size, + int hidden_dim, + int num_heads, std::string layer_name, std::string weight_path); void store_outputs(BatchConfig *bc, diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc index e0ca8cc105..deed6ba985 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/examples/cpp/inference/file_loader.cc @@ -68,6 +68,8 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { void load_attention_weights(float *ptr, size_t size, + int hidden_dim, + int num_heads, std::string layer_name, std::string weight_path) { @@ -105,10 +107,10 @@ void load_attention_weights(float *ptr, } assert(partial_size == host_array.size()); - size_t one_head_size = 4096 * 128; + size_t one_head_size = hidden_dim * (hidden_dim / num_heads); size_t data_index = 0; - for (int i = 0; i < 32; i++) { + for (int i = 0; i < num_heads; i++) { size_t start_index = i * one_head_size * 4 + file_index * one_head_size; for (size_t j = start_index; j < start_index + one_head_size; j++) { ptr[j] = host_array.at(data_index); @@ -176,7 +178,8 @@ void FileDataLoader::load_weights( assert(dims_vec[0] = hidden_dim * qkv_inner_dim * 4); assert(dims_vec[1] = num_heads); assert(volume == dims_vec[0] * dims_vec[1]); - load_attention_weights(data, volume, v.first, weight_file_path); + load_attention_weights( + data, volume, hidden_dim, num_heads, v.first, weight_file_path); } else { load_from_file(data, volume, weight_file_path + v.first); From 4efea836dfe6142db9ad292c0627d599fa71d006 Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Wed, 10 May 2023 11:13:26 -0400 Subject: [PATCH 118/344] [Inference] Prepare batch for speculative inference (#714) * Modify * Verify results tree with input tree . * Beam_init * VerifyTreeBatchConfig init. * Fix * Add committed_tokens, results splits. * Fix, pass compiling. * Duplicate spec_llama example for testing pipeline. * Add prompt token in the first verify iteration. * Update BeamInferenceResult. * edited example files * added debugging prints * Match one iterations. * Adjust tree depth. * refactor example for iteration looping * Fix, now run with multipile iterations. * Merge conflict. * Fix num_tokens_to_commit. * Format code. 
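Pipeline overview: a condensed sketch of the speculate-then-verify round this patch wires up, following the llama.cc driver added below; it assumes the `rm` (RequestManager), `im` (InferenceManager), `beam_model`, `tree_model`, `bid`, and `llama_config` objects set up in that driver, with logging and error handling omitted.

    // one speculate-then-verify round for batch id `bid`
    BeamSearchBatchConfig bc = rm.prepare_next_batch_init(tree_bc, ir);
    for (int d = 0; d < llama_config.max_beam_depth; d++) {
      Future f = im.inference(&beam_model, bid, bc).get_future(0);
      BeamInferenceResult beam_ir = f.get_result<BeamInferenceResult>();
      bc = rm.prepare_next_batch_beam(bc, beam_ir); // append one level to the beam tree
    }
    tree_bc = rm.prepare_next_batch_verify(bc);     // dfs-flatten the speculated tree
    ir = im.inference(&tree_model, bid, tree_bc)
             .get_future(0)
             .get_result<InferenceResult>();
    // verified tokens are committed and seed the next prepare_next_batch_init()
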
--------- Co-authored-by: User Co-authored-by: Rae Wong --- CMakeLists.txt | 4 + .../spec_verify_pipeline/CMakeLists.txt | 20 + .../inference/spec_verify_pipeline/Makefile | 39 ++ .../inference/spec_verify_pipeline/README.md | 14 + .../inference/spec_verify_pipeline/llama.cc | 287 ++++++++++ .../spec_verify_pipeline/llama_rae.cc | 246 ++++++++ include/flexflow/batch_config.h | 19 +- include/flexflow/inference.h | 45 +- src/runtime/batch_config.cc | 2 + src/runtime/inference_manager.cc | 5 + src/runtime/request_manager.cc | 529 ++++++++++++++++-- 11 files changed, 1134 insertions(+), 76 deletions(-) create mode 100644 examples/cpp/inference/spec_verify_pipeline/CMakeLists.txt create mode 100644 examples/cpp/inference/spec_verify_pipeline/Makefile create mode 100644 examples/cpp/inference/spec_verify_pipeline/README.md create mode 100644 examples/cpp/inference/spec_verify_pipeline/llama.cc create mode 100644 examples/cpp/inference/spec_verify_pipeline/llama_rae.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 8edad77124..59f3453f1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,6 +548,10 @@ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/SPEC_LLAMA) endif() +if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/inference/spec_verify_pipeline) +endif() + # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/examples/cpp/inference/spec_verify_pipeline/CMakeLists.txt b/examples/cpp/inference/spec_verify_pipeline/CMakeLists.txt new file mode 100644 index 0000000000..f93189af36 --- /dev/null +++ b/examples/cpp/inference/spec_verify_pipeline/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExample_spec_verify_pipeline) +set(project_target spec_verify_pipeline) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + llama.cc + ../file_loader.cc + ../models/llama.cc) + + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/spec_verify_pipeline/Makefile b/examples/cpp/inference/spec_verify_pipeline/Makefile new file mode 100644 index 0000000000..130d52a7ee --- /dev/null +++ b/examples/cpp/inference/spec_verify_pipeline/Makefile @@ -0,0 +1,39 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= spec_verify_pipeline +# List all the application source files here +GEN_SRC = llama.cc dataloader.cc +GEN_GPU_SRC = dataloader.cu +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/spec_verify_pipeline/README.md b/examples/cpp/inference/spec_verify_pipeline/README.md new file mode 100644 index 0000000000..4a112ba45f --- /dev/null +++ b/examples/cpp/inference/spec_verify_pipeline/README.md @@ -0,0 +1,14 @@ +# an example of running llama model with beam search + +## how to run it? +1. build the flexflow with FF_BUILD_ALL_INFERENCE_EXAMPLES or FF_BUILD_ALL_EXAMPLES +2. download the weight and token file from aws s3. +```bash +aws s3 cp s3://catalyst-llama/7B_weights_float.tar.gz FF_HOME/examples/cpp/inference/spec_verify_pipeline/weights +tar -zxvf 7B_weights_float.tar.gz +aws s3 cp s3://catalyst-llama/tokens.tar FF_HOME/examples/cpp/inference/spec_verify_pipeline/tokens +tar -zxvf tokens.tar +``` +3. run *spec_verify_pipeline* with `--weights` `--dataset` `-b 5` `--only-data-parallel` +4. [expected results](https://github.com/flexflow/FlexFlow/pull/681#issuecomment-1534264054) + diff --git a/examples/cpp/inference/spec_verify_pipeline/llama.cc b/examples/cpp/inference/spec_verify_pipeline/llama.cc new file mode 100644 index 0000000000..b00fc522f1 --- /dev/null +++ b/examples/cpp/inference/spec_verify_pipeline/llama.cc @@ -0,0 +1,287 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "models/llama.h" +#include "flexflow/inference.h" + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("llama"); + +void parse_input_args(char **argv, int argc, LLAMA::Config &config) { + for (int i = 1; i < argc; i++) { + // input + if (!strcmp(argv[i], "--dataset")) { + config.input_path = std::string(argv[++i]); + continue; + } + + // weights + if (!strcmp(argv[i], "--weights")) { + config.weight_file_path = std::string(argv[++i]); + continue; + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + LLAMA::Config llama_config; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, llama_config); + InferenceManager im(ffconfig, llama_config.batchSize, 1); + RequestManager rm; + // Add a single request + std::vector prompt{ + 1, 306, 4658, 278, 6593, 310, 2834, 338}; + rm.register_new_request(prompt, llama_config.sentence_len); + + FFModel beam_model(ffconfig), tree_model(ffconfig), inc_model(ffconfig); + LLAMA::create_llama_model(beam_model, im, llama_config, 1, BEAM_SEARCH_MODE); + LLAMA::create_llama_model(tree_model, im, llama_config, 1, TREE_VERIFY_MODE); + LLAMA::create_llama_model(inc_model, im, llama_config, 1, INC_DECODING_MODE); + + // entry--------------------------- + int depth = 0; + std::map beam_future_handlers, tree_future_handler; + std::map beam_batch_configs; + std::map tree_batch_configs; + + bool new_req = true; + TreeVerifyBatchConfig tree_bc; + + int iteration = 0; + + while (depth < llama_config.max_beam_depth) { + int bid = 0; + if (beam_future_handlers.find(bid) == beam_future_handlers.end()) { + BeamSearchBatchConfig bc; + InferenceResult ir; + bc = rm.prepare_next_batch_init(tree_bc, ir); + + std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; + FutureMap fm = im.inference(&beam_model, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; + } else { + // have luanched this bid + Future future = beam_future_handlers[bid]; + if (!future.is_ready(true /*subscribe*/)) { + continue; + } else { + std::cout << "future is ready...." 
<< std::endl; + } + // process end + BeamInferenceResult ir = future.get_result(); + BeamSearchBatchConfig bc = beam_batch_configs[bid]; + depth = bc.beamRequestsInfo[0].current_depth; + bc = rm.prepare_next_batch_beam(bc, ir); + + std::cout << "llama current depth: " << depth << std::endl; + std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; + FutureMap fm = im.inference(&beam_model, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; + + // tranverse the tree in dfs order; + if (depth >= llama_config.max_beam_depth) { + + printf("\n\n ------Final Beam Search Batch------\n"); + printf("[Beam] num_tokens: %d\n", bc.num_tokens); + for (int i = 0; i < bc.num_tokens; i++) { + std::cout << "[Token] Request Index: " + << bc.tokensInfo[i].request_index + << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request + << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; + } + + // printf("\n\n prepare tree_bc from final beam search bc\n"); + tree_bc = rm.prepare_next_batch_verify(bc); + + printf("\n\n\n ------Tree Verify Batch-------\n"); + // should have the same content as the hardcoded verification block + // below right now, it only contains the prompt need to add in the beam + // search result + + printf("[Verify] num_tokens : %d\n", tree_bc.num_tokens); + printf("[Verify] num_tokens_in_batch: %d\n", + tree_bc.requestsInfo[0].num_tokens_in_batch); + printf("------------------------------\n"); + + for (int i = 0; i < tree_bc.num_tokens; i++) { + std::cout << "[Token] Request Index: " + << tree_bc.tokensInfo[i].request_index << ", Abs Depth: " + << tree_bc.tokensInfo[i].abs_depth_in_request + << ", Token Id: " << tree_bc.tokensInfo[i].token_id << "\n"; + } + + printf("\n\n ------Commit Verified Tokens-------\n"); + for (int i = 0; i < tree_bc.num_tokens_to_commit; i++) { + std::cout << "[Commit] Request Index: " + << tree_bc.commited_tokens[i].request_index + << ", Abs Depth: " << tree_bc.commited_tokens[i].token_depth + << ", Token Index in batch: " + << tree_bc.commited_tokens[i].token_index << "\n"; + } + + FutureMap fm = im.inference(&tree_model, 0, tree_bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + InferenceResult ir = future.get_result(); + for (int i = 0; i < tree_bc.num_tokens; i++) { + if (i == 7) { + std::cout << "------------------\n"; + } + printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); + } + + std::cout << "------Init New Beam Search Batch------\n"; + bc = rm.prepare_next_batch_init(tree_bc, ir); + std::cout << "[Init] num_tokens: " << bc.num_tokens << "\n"; + for (int i = 0; i < bc.num_tokens; i++) { + std::cout << "[Token] Request Index: " + << bc.tokensInfo[i].request_index + << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request + << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; + } + std::cout << "Batch Depth: " << bc.beamRequestsInfo[0].current_depth + << "\n"; + + iteration++; + + if (iteration < 4) { + std::cout << "\n\n~~~~~~~~~~teration " << iteration << "~~~~~~~~~~\n"; + depth = bc.beamRequestsInfo[0].current_depth; + fm = im.inference(&beam_model, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; + } else { + break; + } + } + } + } + + // // original + // { + // std::vector tokens{1, + // 306, + // 4658, + // 278, + // 6593, + // 310, + // 2834, + // 338, + // 593, + // 595, + // 17252, + // 5031, + 
// 993, + // 616, + // 368, + // 2302, + // 3204, + // 29131, + // 2976, + // 11285, + // 8930, + // 635, + // 8519, + // 593, + // 595}; + // BatchConfig bc; + // bc.num_tokens = 25; + // bc.requestsInfo[0].num_tokens_in_batch = bc.num_tokens; + // bc.requestsInfo[0].token_start_offset = 0; + // bc.requestsInfo[0].max_sequence_length = 347; + // bc.requestsInfo[0].request_guid = 1000000; + // bc.request_completed[0] = false; + // for (int i = 0; i < bc.num_tokens; i++) { + // bc.tokensInfo[i].token_id = tokens[i]; + // bc.tokensInfo[i].abs_depth_in_request = i; + // bc.tokensInfo[i].request_index = 0; + // } + // FutureMap fm = im.inference(&inc_model, 0, bc); + // assert(fm.get_future_map_domain().get_volume() == 1); + // Future future = fm.get_future(0); + // InferenceResult ir = future.get_result(); + // for (int i = 0; i < bc.num_tokens; i++) { + // printf("decoding_tokens[%d] = %d\n", i, ir.token_ids[i]); + // } + // } + + // // verification + // { + // std::vector tokens{1, + // 306, + // 4658, + // 278, + // 6593, + // 310, + // 2834, + // 338, + // 593, + // 595, + // 17252, + // 5031, + // 993, + // 616, + // 368, + // 2302, + // 3204, + // 29131, + // 2976, + // 11285, + // 8930, + // 635, + // 8519, + // 593, + // 595}; + // tree_bc.num_tokens = 25; + // tree_bc.requestsInfo[0].num_tokens_in_batch = tree_bc.num_tokens; + // for (int i = 0; i < tree_bc.num_tokens; i++) { + // tree_bc.tokensInfo[i].token_id = tokens[i]; + // tree_bc.tokensInfo[i].abs_depth_in_request = i; + // tree_bc.tokensInfo[i].request_index = 0; + // } + // FutureMap fm = im.inference(&tree_model, 0, tree_bc); + // assert(fm.get_future_map_domain().get_volume() == 1); + // Future future = fm.get_future(0); + // InferenceResult ir = future.get_result(); + // for (int i = 0; i < tree_bc.num_tokens; i++) { + // printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); + // } + // } + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/spec_verify_pipeline/llama_rae.cc b/examples/cpp/inference/spec_verify_pipeline/llama_rae.cc new file mode 100644 index 0000000000..7116c4bf21 --- /dev/null +++ b/examples/cpp/inference/spec_verify_pipeline/llama_rae.cc @@ -0,0 +1,246 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "models/llama.h" + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("llama"); + +void parse_input_args(char **argv, int argc, LLAMA::Config &config) { + for (int i = 1; i < argc; i++) { + // input + if (!strcmp(argv[i], "--dataset")) { + config.input_path = std::string(argv[++i]); + continue; + } + + // weights + if (!strcmp(argv[i], "--weights")) { + config.weight_file_path = std::string(argv[++i]); + continue; + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + LLAMA::Config llama_config; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, llama_config); + InferenceManager im(ffconfig, llama_config.batchSize, 1); + RequestManager rm; + // Add a single request + std::vector prompt{ + 1, 306, 4658, 278, 6593, 310, 2834, 338}; + rm.register_new_request(prompt, llama_config.sentence_len); + + FFModel beam_model(ffconfig), tree_model(ffconfig), inc_model(ffconfig); + LLAMA::create_llama_model(beam_model, im, llama_config, 1, BEAM_SEARCH_MODE); + LLAMA::create_llama_model(tree_model, im, llama_config, 1, TREE_VERIFY_MODE); + // LLAMA::create_llama_model(inc_model, im, llama_config, 1, + // INC_DECODING_MODE); + + // entry--------------------------- + int abs_depth = 0; + std::map beam_future_handlers, tree_future_handler; + std::map beam_batch_configs; + std::map tree_batch_configs; + + bool new_req = true; + TreeVerifyBatchConfig tree_bc; + InferenceResult ir; + int num_iterations = 2; + + for (int itr = 0; itr < num_iterations; itr++) { + printf("\n\n ITERATION %d \n\n", itr); + + // first iteration of beam search, calling prepare_next_batch_init + int beam_search_depth = 0; + int bid = 0; + BeamSearchBatchConfig bc; + bc = rm.prepare_next_batch_init(tree_bc, ir); + + printf("\n\n init beam search bc\n"); + printf("bc.num_tokens: %d\n", bc.num_tokens); + for (int i = 0; i < bc.num_tokens; i++) { + printf("bc.tokensInfo[%d].token_id: %d\n", i, bc.tokensInfo[i].token_id); + printf("bc.tokensInfo[%d].abs_depth_in_request: %d\n", + i, + bc.tokensInfo[i].abs_depth_in_request); + printf("bc.tokensInfo[%d].request_index: %d\n", + i, + bc.tokensInfo[i].request_index); + } + + std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; + FutureMap fm = im.inference(&beam_model, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; + + // subsequent iterations of beam search + while (beam_search_depth < llama_config.max_beam_depth) { + // have luanched this bid + Future future = beam_future_handlers[bid]; + if (!future.is_ready(true /*subscribe*/)) { + continue; + } else { + std::cout << "future is ready...." 
<< std::endl; + } + // process end + BeamInferenceResult ir_beam = future.get_result(); + BeamSearchBatchConfig bc = beam_batch_configs[bid]; + abs_depth = bc.beamRequestsInfo[0].current_depth; + bc = rm.prepare_next_batch_beam(bc, ir_beam); + + std::cout << "llama current depth: " << abs_depth << std::endl; + std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; + FutureMap fm = im.inference(&beam_model, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; + beam_search_depth++; + } + + // verify + printf("\n\n ------Final Beam Search Batch------\n"); + printf("[Beam] num_tokens: %d\n", bc.num_tokens); + for (int i = 0; i < bc.num_tokens; i++) { + std::cout << "[Token] Request Index: " << bc.tokensInfo[i].request_index + << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request + << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; + } + + printf("\n\n prepare tree_bc from final beam search bc\n"); + tree_bc = rm.prepare_next_batch_verify(bc); + + printf("\n\n\n t------Tree Verify Batch-------\n"); + // should have the same content as the hardcoded verification block below + // right now, it only contains the prompt + // need to add in the beam search result + + printf("[Verify] num_tokens : %d\n", tree_bc.num_tokens); + printf("[Verify] num_tokens_in_batch: %d\n", + tree_bc.requestsInfo[0].num_tokens_in_batch); + printf("------------------------------\n"); + + for (int i = 0; i < tree_bc.num_tokens; i++) { + std::cout << "[Token] Request Index: " + << tree_bc.tokensInfo[i].request_index + << ", Abs Depth: " << tree_bc.tokensInfo[i].abs_depth_in_request + << ", Token Id: " << tree_bc.tokensInfo[i].token_id << "\n"; + } + + fm = im.inference(&tree_model, 0, tree_bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + ir = future.get_result(); + for (int i = 0; i < tree_bc.num_tokens; i++) { + printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); + } + } + + // // original + // { + // std::vector tokens{1, + // 306, + // 4658, + // 278, + // 6593, + // 310, + // 2834, + // 338, + // 593, + // 595, + // 17252, + // 5031, + // 993, + // 616, + // 368, + // 2302}; + // BatchConfig bc; + // bc.num_tokens = 16; + // bc.requestsInfo[0].num_tokens_in_batch = bc.num_tokens; + // bc.requestsInfo[0].token_start_offset = 0; + // bc.requestsInfo[0].max_sequence_length = 347; + // bc.requestsInfo[0].request_guid = 1000000; + // bc.request_completed[0] = false; + // for (int i = 0; i < bc.num_tokens; i++) { + // bc.tokensInfo[i].token_id = tokens[i]; + // bc.tokensInfo[i].abs_depth_in_request = i; + // bc.tokensInfo[i].request_index = 0; + // } + // FutureMap fm = im.inference(&inc_model, 0, bc); + // assert(fm.get_future_map_domain().get_volume() == 1); + // Future future = fm.get_future(0); + // InferenceResult ir = future.get_result(); + // for (int i = 0; i < bc.num_tokens; i++) { + // printf("decoding_tokens[%d] = %d\n", i, ir.token_ids[i]); + // } + // } + + // // verification + // { + // std::vector tokens{1, + // 306, + // 4658, + // 278, + // 6593, + // 310, + // 2834, + // 338, + // 593, + // 595, + // 17252, + // 5031, + // 993, + // 616, + // 368, + // 2302}; + // tree_bc.num_tokens = 16; + // tree_bc.requestsInfo[0].num_tokens_in_batch = tree_bc.num_tokens; + // for (int i = 0; i < tree_bc.num_tokens; i++) { + // tree_bc.tokensInfo[i].token_id = tokens[i]; + // tree_bc.tokensInfo[i].abs_depth_in_request = i; + // 
tree_bc.tokensInfo[i].request_index = 0; + // } + // FutureMap fm = im.inference(&tree_model, 0, tree_bc); + // assert(fm.get_future_map_domain().get_volume() == 1); + // Future future = fm.get_future(0); + // InferenceResult ir = future.get_result(); + // for (int i = 0; i < tree_bc.num_tokens; i++) { + // printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); + // } + // } + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 55bad9237a..8c6fa41f2e 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -50,11 +50,8 @@ class BatchConfig { int num_tokens; struct PerRequestInfo { - int token_start_offset; // input[token_start_offset * data_dim] is the first - // token - int num_tokens_in_batch; // tokens from input[token_start_offset * data_dim - // : (token_start_offset + num_token_in_batch) * - // data_dim] + int token_start_offset; + int num_tokens_in_batch; int max_sequence_length; RequestGuid request_guid; }; @@ -108,15 +105,9 @@ class BeamSearchBatchConfig : public BatchConfig { static int const MAX_BEAM_DEPTH = 8; struct BeamSearchPerRequestInfo { - // int token_start_offset; // input[token_start_offset * data_dim] is the - // first token int num_tokens_in_batch; // tokens from - // input[token_start_offset * data_dim : (token_start_offset + - // num_token_in_batch) * data_dim] int max_sequence_length; RequestGuid - // request_guid; bool request_completed; - int beam_size; // + int beam_size; int current_depth = -1; - // int global_depth = -1; int max_depth = MAX_BEAM_DEPTH; BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; @@ -130,6 +121,7 @@ class BeamSearchBatchConfig : public BatchConfig { BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; + // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? 
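  // Usage sketch (as seen in RequestManager::prepare_next_batch_beam /
  // prepare_next_batch_init later in this series):
  //   new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; // during beam expansion
  //   new_bc.sub_requests[i] = 1;                                    // right after a verify step
  // i.e. entry i records how many beam sub-requests request i currently
  // expands into; only the first MAX_NUM_REQUESTS entries appear to be
  // written in the code shown here.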
int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; // BeamSlot beam_slots[MAX_NUM_REQUESTS]; @@ -137,7 +129,8 @@ class BeamSearchBatchConfig : public BatchConfig { size_t current_iteration; }; -struct BeamInferenceResult : public InferenceResult { +struct BeamInferenceResult { + static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; BatchConfig::TokenId token_ids[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; float probs[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 623b8ffd32..bbeaf67821 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -50,6 +50,7 @@ class InferenceManager { struct Request { BatchConfig::RequestGuid guid; int max_sequence_length; + int initial_len; std::vector tokens; }; @@ -61,9 +62,15 @@ struct BeamTree { int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; }; - treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH]; + treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; }; +// struct BeamTree_v2 { +// std::vector tokens; +// std::vector parent_ids; +// std::vector probs; +// }; + class RequestManager { public: using RequestGuid = BatchConfig::RequestGuid; @@ -74,19 +81,38 @@ class RequestManager { int max_sequence_length); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); - BeamSearchBatchConfig - prepare_next_batch_beam(BeamSearchBatchConfig const &bc, + prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); + BeamSearchBatchConfig + prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, + InferenceResult const &result); + + TreeVerifyBatchConfig + prepare_next_batch_verify(BeamSearchBatchConfig const &old_bc); + void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); void update_beam_metadata(BeamSearchBatchConfig &new_bc, BeamTree &tree, int request_index); - void tranverse_beam_tree(BeamSearchBatchConfig const &old_bc); + + std::vector> + traverse_beam_tree(BeamSearchBatchConfig const &old_bc, + int request_index, + int token_start_offset); + + std::vector> traverse_verify_tree( + size_t guid, + std::vector> const + &inputSerializedTree, + std::vector> const + &outputSerializedTree); + TreeVerifyBatchConfig convert_beam_to_tree_batch_config(BeamSearchBatchConfig const &beam_bc); + static void load_tokens_task(Legion::Task const *task, std::vector const ®ions, @@ -98,7 +124,18 @@ class RequestManager { std::unordered_map running_request_queue; std::mutex request_queue_mutex; RequestGuid next_available_guid; + struct BeamTree beam_trees[BatchConfig::MAX_NUM_REQUESTS]; + + std::unordered_map>> + dfs_tree_inputs; + + // std::unordered_map beam_trees_v2; + // TODO: cache config info for Verify/Beam exchange: Beam Width, Beam Depth, + // Commited Tokens + std::unordered_map>> + committed_tokens; size_t num_processed_requests; }; diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 2bed8f14c0..d6b1c5bed9 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -117,6 +117,8 @@ int BatchConfig::num_active_requests() const { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (!request_completed[i]) { num_requests++; + // } else { + // std::cout << "request " << i << " is completed" << std::endl; } } return num_requests; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc 
index 25d6e2a00e..15ae8e3aa3 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -197,6 +197,11 @@ MachineView *InferenceManager::get_machine_view(int mv_id) { FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfig const &bc) { + std::cout << "InferenceManager::inference" << index << std::endl; + std::cout << "num_active_tokens = " << bc.num_active_tokens() + << ", num_active_requests = " << bc.num_active_requests() + << std::endl; + assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 149bc18ec7..fb445fbec6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -15,6 +15,8 @@ #include "flexflow/inference.h" #include "flexflow/parallel_ops/parallel_op.h" +#include +#include namespace FlexFlow { @@ -34,6 +36,7 @@ RequestManager::RequestGuid Request request; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; + request.initial_len = prompt.size(); request.tokens = prompt; pending_request_queue.push(request); @@ -143,27 +146,28 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, return new_bc; } -//-------beam search specific functions +/* ----- Speculative Inference Specific functions ----- */ // update beam search metadata BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); + std::cout << "print all results" << "\n"; for (int i = 0; i < 40; i++) { std::cout << result.token_ids[i] << ", "; } - // Step 1: register first batch - BeamSearchBatchConfig new_bc; - // Step 2: preparing the next batch for existing requests + std::cout << "Current Beam Depth: " + << old_bc.beamRequestsInfo[0].current_depth << "\n"; - // store results - std::cout << "depthhhhhhh: " << old_bc.beamRequestsInfo[0].current_depth - << "\n"; + // Step 1: Store result to the beam tree struct store_beam_metadata(old_bc, result); + // Step 2: preparing the next batch for existing requests + BeamSearchBatchConfig new_bc; + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { if (old_bc.request_completed[i]) { continue; @@ -174,20 +178,22 @@ BeamSearchBatchConfig int processed_tokens = old_bc.requestsInfo[i].token_start_offset + old_bc.requestsInfo[i].num_tokens_in_batch; - // std::cout << "processed tokens" << processed_tokens << ", " - // << request.tokens.size() << "\n"; // assert(processed_tokens < request.tokens.size()); + std::cout << "\nprocessed_tokens: " << processed_tokens << "\n"; if (processed_tokens > old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() // || ir.results[t] == 0 TODO: replace this with - // std::cout<<"aaaaaaa"<<"\n"; ) { - log_req_mgr.print("[Done] guid(%zu) final_length(%i) request_length(%zu)", + log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", old_bc.requestsInfo[i].request_guid, - processed_tokens, - request.tokens.size()); + old_bc.beamRequestsInfo[i].max_depth); + // new_bc.request_completed[i] = true; + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; 
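      // Note: the request stays active (request_completed = false) even
      // though its speculation tree has reached max_depth;
      // prepare_next_batch_verify() below skips completed requests, so
      // keeping it active lets the speculated tokens reach the
      // verification pass.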
} else { - std::cout << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; new_bc.request_completed[i] = false; @@ -198,6 +204,7 @@ BeamSearchBatchConfig // update the beam search metadata // how many sub request in current request + // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; // update the parentid, accumalated_probs, depth, and token_ids new_bc.beamRequestsInfo[i].current_depth = @@ -238,7 +245,146 @@ BeamSearchBatchConfig } } } + return new_bc; +} + +BeamSearchBatchConfig + RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, + InferenceResult const &result) { + const std::lock_guard lock(request_queue_mutex); + + // Step 1: use result to update requests + BeamSearchBatchConfig new_bc; + new_bc.num_tokens = 0; + int result_index = 0; + + std::cout << "11111111" << std::endl; + + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_bc.request_completed[i]) { + continue; + } + size_t guid = old_bc.requestsInfo[i].request_guid; + Request &request = running_request_queue[guid]; + + printf("req %d\n", i); + + // Verify this: get verified tokens from result + std::vector> tree_outputs = + std::vector>(); + + assert(old_bc.num_tokens > 0); + + std::cout << "222222222" << std::endl; + + int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; + if (committed_tokens.find(guid) == committed_tokens.end()) { + committed_tokens[guid] = std::vector>(); + } else { + committed_tokens.at(guid).clear(); + } + while (result_index < old_bc.num_tokens && + old_bc.tokensInfo[result_index].request_index == i) { + int root_abs_depth = request.tokens.size() - 1; + if (old_bc.tokensInfo[result_index].abs_depth_in_request >= + root_abs_depth) { + tree_outputs.push_back(std::make_pair( + result.token_ids[result_index], + old_bc.tokensInfo[result_index].abs_depth_in_request + 1)); + + committed_tokens.at(guid).push_back( + std::make_pair(old_bc.tokensInfo[result_index].abs_depth_in_request, + result_index)); + + std::cout << "Index with old_bacth: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + old_bc.tokensInfo[result_index].abs_depth_in_request, + old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, + tree_outputs.back().first); + // std::cout << " Input: " << old_bc.tokensInfo[result_index].token_id + // << "" + // << old_bc.tokensInfo[result_index].abs_depth_in_request << + // std::endl; + // std::cout << " Result: " << result.token_ids[result_index] << ", + // depth: " + // << old_bc.tokensInfo[result_index].abs_depth_in_request + 1 << + // std::endl; + } + result_index++; + } + + std::cout << "333333333333" << std::endl; + + std::vector> verified_tokens = + traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + + // check if the request is finished + if (verified_tokens.size() + request.tokens.size() >= + request.max_sequence_length) { + // Append all verified tokens to the request + for (int j = 0; j < verified_tokens.size(); j++) { + request.tokens.push_back(verified_tokens[j].first); + } + + log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", + request.guid, + request.tokens.size()); + + new_bc.request_completed[i] = true; + + beam_trees[i] = BeamTree{}; + dfs_tree_inputs.erase( + request.guid); // delete the old input tree from cache + continue; + } + + new_bc.request_completed[i] = false; + + // Normal Reuqest Info + new_bc.requestsInfo[i].token_start_offset = 
verified_tokens.front().second; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + + // TODO: Beam Request Info, missing from VerifyTreeBatchConfig + new_bc.beamRequestsInfo[i].current_depth = 1; + new_bc.beamRequestsInfo[i].beam_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + new_bc.beamRequestsInfo[i].max_depth = + BeamSearchBatchConfig::MAX_BEAM_DEPTH; + new_bc.beamRequestsInfo[i].request_completed = false; + for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + new_bc.beamRequestsInfo[i].parent_id[j] = 0; + new_bc.beamRequestsInfo[i].probs[j] = 1; + } + + new_bc.sub_requests[i] = 1; + + // Token Info + for (int j = 0; j < verified_tokens.size(); j++) { + auto token = verified_tokens.at(j); + + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + + // Beam Token Info + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; + new_bc.num_tokens++; + + // Add verified token to request's token list + request.tokens.push_back(token.first); + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } + } + } + // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { if (!pending_request_queue.empty() && @@ -288,6 +434,171 @@ BeamSearchBatchConfig return new_bc; } +TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( + BeamSearchBatchConfig const &old_bc) { + const std::lock_guard lock(request_queue_mutex); + + TreeVerifyBatchConfig new_bc; + new_bc.num_tokens_to_commit = 0; + new_bc.num_tokens = 0; + + for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_bc.request_completed[i]) { + continue; + } + size_t guid = old_bc.requestsInfo[i].request_guid; + Request &request = running_request_queue[guid]; + + // Get the dfs tree + std::vector> dfs_tree_inputs = + traverse_beam_tree(old_bc, i, request.tokens.size() - 1); + + std::cout << "11111" << std::endl; + std::cout << "Request Tokens Size: " << request.tokens.size() << std::endl; + for (int k = 0; k < request.tokens.size(); k++) { + std::cout << k << ": " << request.tokens[k] << std::endl; + } + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + // TODO: Check this + new_bc.requestsInfo[i].num_tokens_in_batch = 0; + + new_bc.request_completed[i] = false; + + // TODO: Add prompt token first in first verify iteration + if (request.tokens.size() == request.initial_len) { + for (int j = 0; j < request.initial_len; j++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + } + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + assert(false); + break; + } + + new_bc.requestsInfo[i].token_start_offset = 0; + } else { + // Only add the last committed token + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; 
+ new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.tokens.size() - 1; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + assert(false); + break; + } + + new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; + } + + std::cout << "dfs_tree_inputs.size(): " << dfs_tree_inputs.size() + << std::endl; + + // add prompt to the dfs tree + if (committed_tokens.find(guid) != committed_tokens.end()) { + // std::cout << "committed_tokens.size(): " << + // committed_tokens.at(guid).size() << std::endl; std::cout << + // "dfs_tree_inputs.at(0).second: " << dfs_tree_inputs.at(0).second << + // std::endl; std::cout << "request.initial_len: " << request.initial_len + // << std::endl; + if (dfs_tree_inputs.at(0).second == + request.initial_len + committed_tokens.at(guid).size() - 1) { + for (int j = 0; j < request.initial_len; j++) { + new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = j; + new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; + new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = j; + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " << j + << ", token_index: " << j << std::endl; + new_bc.num_tokens_to_commit++; + } + } else { + // only add the root token + auto committed_token = committed_tokens.at(guid).at(0); + new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; + new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " << committed_token.first + << ", token_index: " << committed_token.second << std::endl; + new_bc.num_tokens_to_commit++; + } + + std::cout << "new_bc.num_tokens_to_commit: " + << new_bc.num_tokens_to_commit << std::endl; + } + + // Token Info + for (int j = 1; j < dfs_tree_inputs.size(); j++) { + auto token = dfs_tree_inputs.at(j); + + std::cout << "[" << j << "] Token: " << token.first + << ", Depth:" << token.second << std::endl; + + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + + // TODO: Add committed token info + std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit + << std::endl; + + if (committed_tokens.find(guid) != committed_tokens.end()) { + // if (j == 1) { + // auto committed_token = committed_tokens.at(guid).at(0); + // new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = + // committed_token.second; + // new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = + // i; new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth + // = committed_token.first; std:: cout << new_bc.num_tokens_to_commit + // << "- committed_token.token_depth: " << committed_token.first << + // ", token_index: " << committed_token.second << std::endl; + // new_bc.num_tokens_to_commit++; + // } + if (j < committed_tokens.at(guid).size()) { + auto committed_token = committed_tokens.at(guid).at(j); + new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; + 
new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second << std::endl; + new_bc.num_tokens_to_commit++; + } + } + std::cout << "new_bc.num_tokens_to_commit: " + << new_bc.num_tokens_to_commit << std::endl; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } + } + } + + return new_bc; +} + void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { // step1 store the outputs @@ -296,25 +607,30 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } auto guid = old_bc.requestsInfo[old_bc.tokensInfo[0].request_index].request_guid; - auto start_idx = old_bc.tokensInfo[0].abs_depth_in_request; + auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request; int result_index = 0; - std::cout << "store num tokens" << old_bc.num_tokens << "\n"; + + std::cout << "Store total of " << old_bc.num_tokens + << " tokens in the current batch.\n"; + for (int i = 0; i <= old_bc.num_tokens; i++) { int request_index = old_bc.tokensInfo[i].request_index; + + // End of the request if (i == old_bc.num_tokens || old_bc.requestsInfo[request_index].request_guid != guid) { - // see how many tokens has been put to model in this req - // to get the index of the final token - // every token will get (beam_width) results - int beam_width = - old_bc.beamRequestsInfo[old_bc.tokensInfo[i].request_index].beam_size; + // Each token yields (beam_width) results + int beam_width = old_bc.beamRequestsInfo[request_index].beam_size; + + // Count tokens sent to model in this request to find the final token's + // index result_index += - (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_idx) * + (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_width; std::cout << "i = " << i << ", result index = " << result_index - << ",value: " << result.token_ids[result_index] << "\n"; + << ", value: " << result.token_ids[result_index] << "\n"; int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; @@ -326,12 +642,11 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, << "\n"; Request &request = running_request_queue[old_bc.requestsInfo[index].request_guid]; - beam_trees[index].treeLayers[depth - 1].tokens[0] = - request.tokens.at(request.tokens.size() - 1); - beam_trees[index].treeLayers[depth - 1].probs[0] = 1; - beam_trees[index].treeLayers[depth - 1].parent_ids[0] = -1; - std::cout << "store the previous last token to the tree root" - << request.tokens.at(request.tokens.size() - 1) << "\n"; + beam_trees[index].treeLayers[0].tokens[0] = request.tokens.back(); + beam_trees[index].treeLayers[0].probs[0] = 1; + beam_trees[index].treeLayers[0].parent_ids[0] = -1; + std::cout << "Store the previous last token to the tree root: " + << request.tokens.back() << "\n"; } for (int beam_id = 0; beam_id < beam_width; beam_id++) { @@ -348,9 +663,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, result_index += 1; } + // update the guid and start_depth for current request if (i < old_bc.num_tokens) { guid = old_bc.requestsInfo[request_index].request_guid; - start_idx = old_bc.tokensInfo[i].abs_depth_in_request; + start_depth = 
old_bc.tokensInfo[i].abs_depth_in_request; } } } @@ -379,13 +695,15 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // << std::endl; // // std::fixed << std::setprecision(15)<< // } - if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { + + if (new_bc.beamRequestsInfo[request_index].current_depth == + 1) { // TODO: check if this is correct for (int j = 0; j < beam_size; j++) { new_bc.beamRequestsInfo[request_index].parent_id[j] = j; new_bc.beamRequestsInfo[request_index].probs[j] = - tree.treeLayers[depth].probs[j]; + tree.treeLayers[depth].probs[j]; // ? new_bc.beamRequestsInfo[request_index].tokens[j] = - tree.treeLayers[depth].tokens[j]; + tree.treeLayers[depth].tokens[j]; // ? } } else { std::set parents; @@ -487,12 +805,13 @@ bool PreOrder(BeamTree const &tree, flag = flag || res; } } - if (!flag) { - // no child for this token, delete it - std::cout << "delete a node: " << tree.treeLayers[current_depth].tokens[id] - << ", " << current_depth << std::endl; - serializedTree.erase(serializedTree.begin() + index); - } + // if (!flag) { + // // no child for this token, delete it + // std::cout << "delete a node: " << + // tree.treeLayers[current_depth].tokens[id] + // << ", " << current_depth << std::endl; + // serializedTree.erase(serializedTree.begin() + index); + // } return flag; } @@ -533,32 +852,124 @@ TreeVerifyBatchConfig RequestManager::convert_beam_to_tree_batch_config( return tree_bc; } -void RequestManager::tranverse_beam_tree(BeamSearchBatchConfig const &old_bc) { - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { - if (old_bc.request_completed[i]) { +std::vector> + RequestManager::traverse_verify_tree( + size_t guid, + std::vector> const + &inputSerializedTree, + std::vector> const + &outputSerializedTree) { + std::vector> verifiedTree; + // verifiedTree.push_back(inputSerializedTree.at(0)); + std::vector> new_committed_tokens = + std::vector>(); + + std::cout << "Input size: " << inputSerializedTree.size() << std::endl; + std::cout << "Output size: " << outputSerializedTree.size() << std::endl; + + std::cout << "========Input============" << std::endl; + for (auto const &pair : inputSerializedTree) { + std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + } + std::cout << "========Output============" << std::endl; + for (auto const &pair : outputSerializedTree) { + std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + } + std::cout << "========Committed============" << std::endl; + for (auto const &pair : committed_tokens.at(guid)) { + std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + } + + assert(inputSerializedTree.size() == outputSerializedTree.size()); + + for (int i = 0; i < inputSerializedTree.size(); i++) { + auto input = inputSerializedTree.at(i); + auto output = outputSerializedTree.at(i); + + if (i == 0) { + verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( + input.second, + committed_tokens.at(guid).at(i).second)); // + std::cout << committed_tokens.at(guid).at(i).first << ", " + << committed_tokens.at(guid).at(i).second << std::endl; + std::cout << input.first << ", " << input.second << std::endl; + + assert(committed_tokens.at(guid).at(i).first == input.second); continue; } - // if(i != 0){ - // continue; - // } - int depth = old_bc.beamRequestsInfo[i].current_depth; - int beam_width = old_bc.beamRequestsInfo[i].beam_size; - BeamTree const &tree = beam_trees[i]; + if (input.first == verifiedTree.back().first && 
+ input.second == verifiedTree.back().second) { + verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( + input.second, + committed_tokens.at(guid).at(i).second)); // + assert(committed_tokens.at(guid).at(i).first == input.second); + } + } + committed_tokens[guid] = new_committed_tokens; + std::cout << "========Verified============" << std::endl; + for (auto const &pair : verifiedTree) { + std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + } - // token, index - // todo make this one global for different stages - std::vector> serializedTree; - PreOrder( - tree, 3, 0, old_bc.beamRequestsInfo[i].beam_size, 0, serializedTree); + std::cout << "========New Committed============" << std::endl; + for (auto const &pair : committed_tokens.at(guid)) { + std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + } - // print it - std::cout << "print tree, " << i << "\n"; - for (int k = 0; k < serializedTree.size(); k++) { - std::cout << "token id: " << serializedTree.at(k).first - << ", depth: " << serializedTree.at(k).second << "\n"; - } + return verifiedTree; +} + +std::vector> + RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, + int request_index, + int token_start_offset) { + + std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; + std::cout << "[Traverse Beam Tree] max_depth: " + << old_bc.beamRequestsInfo[request_index].max_depth << "\n"; + std::cout << "[Traverse Beam Tree] current_depth: " + << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; + std::cout << "[Traverse Beam Tree] beam_width: " + << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; + BeamTree tree = beam_trees[request_index]; + + // token, index + // todo make this one global for different stages + std::vector> serializedTree; + PreOrder(tree, + old_bc.beamRequestsInfo[request_index].max_depth, + 0, + old_bc.beamRequestsInfo[request_index].beam_size, + 0, + serializedTree); + + // print it + std::cout << "Print serialized tree, " << request_index << "\n"; + std::cout << serializedTree.size() << "\n"; + for (int k = 0; k < serializedTree.size(); k++) { + serializedTree.at(k).second += token_start_offset; + std::cout << "token id: " << serializedTree.at(k).first + << ", depth: " << serializedTree.at(k).second << "\n"; + } + std::cout << "Done printing serialized tree, " + << old_bc.requestsInfo[request_index].request_guid << "\n"; + + if (dfs_tree_inputs.find(old_bc.requestsInfo[request_index].request_guid) != + dfs_tree_inputs.end()) { + dfs_tree_inputs[old_bc.requestsInfo[request_index].request_guid] = + serializedTree; + } else { + dfs_tree_inputs.insert(std::make_pair( + old_bc.requestsInfo[request_index].request_guid, serializedTree)); } + + return serializedTree; + // } } }; // namespace FlexFlow From 0bf4fa9ac0d9ceb045063a8b696ae0dc195a2802 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 10 May 2023 19:57:14 -0500 Subject: [PATCH 119/344] Tree verify bug fix (#719) * Support multiple FFModels in a single top_level_task * [TreeVerifyMHA] bug fixes --- .../ops/tree_inc_multihead_self_attention.h | 12 +-- src/ops/tree_inc_multihead_self_attention.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cu | 91 +++++++++++-------- 3 files changed, 60 insertions(+), 45 deletions(-) diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index c1af4c0086..61e7b69fe9 100644 --- 
a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -92,12 +92,11 @@ class TreeIncMultiHeadSelfAttention : public Op { MachineView const &mv, CostMetrics &cost_metrics) const override; - static void - inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr); + static void inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta *m, + TreeVerifyBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr); Params get_params() const; public: @@ -124,6 +123,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public OpMeta { size_t weights_params, weightSize, reserveSpaceSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int num_heads; + int num_active_tokens; bool *has_load_weights; bool *apply_rotary_embedding; #ifdef INFERENCE_TESTS diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index eec59c9247..96e2541872 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -553,7 +553,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(task->regions.size() == regions.size()); TreeVerifyBatchConfig const *bc = (TreeVerifyBatchConfig *)task->args; - TreeIncMultiHeadSelfAttentionMeta const *m = + TreeIncMultiHeadSelfAttentionMeta *m = *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 1d45ecaf14..39a7ceaca3 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -47,29 +47,33 @@ __global__ void commit_tokens_kernel( int qProjSize, int kProjSize, int vProjSize, - int num_tokens, + int num_tokens_to_commit, + int num_active_tokens_in_last_batch, int num_heads, int max_seq_len, bool k_cache) { - CUDA_KERNEL_LOOP(i, - num_tokens * (k_cache ? kProjSize : vProjSize) * num_heads) { + CUDA_KERNEL_LOOP( + i, num_tokens_to_commit * (k_cache ? kProjSize : vProjSize) * num_heads) { int proj_size = k_cache ? kProjSize : vProjSize; int data_idx = i % proj_size; - int head_idx = i / (num_tokens * proj_size); - int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; - token_idx = committedTokenInfos[token_idx].token_index; - - int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int current_head_block_size = - num_tokens * (k_cache ? qProjSize : qProjSize + kProjSize); + int head_idx = i / (num_tokens_to_commit * proj_size); + int token_pos = + (i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; + int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; + assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); + + int qkv_block_size = + (qProjSize + kProjSize + vProjSize) * num_active_tokens_in_last_batch; + int current_head_block_size = num_active_tokens_in_last_batch * + (k_cache ? 
qProjSize : qProjSize + kProjSize); float val = devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + - token_idx * proj_size + data_idx]; + token_idx_in_last_batch * proj_size + data_idx]; // int const req_id = id_map[token_idx].request_index; // int const tok_id = id_map[token_idx].token_position; - int const req_id = committedTokenInfos[token_idx].request_index; - int const tok_id = committedTokenInfos[token_idx].token_depth; + int const req_id = committedTokenInfos[token_pos].request_index; + int const tok_id = committedTokenInfos[token_pos].token_depth; cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + @@ -86,31 +90,35 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, commit_tokens_kernel<<>>(m->devQKVProjArray, - m->keyCache, - m->committed_token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens_to_commit, - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ true); + stream>>>( + m->devQKVProjArray, + m->keyCache, + m->committed_token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens_to_commit, + m->num_active_tokens, // number of active tokens in previous batch + m->num_heads, + BatchConfig::MAX_SEQ_LENGTH, + /* k_cache = */ true); parallelism = m->vProjSize * num_tokens_to_commit * m->num_heads; commit_tokens_kernel<<>>(m->devQKVProjArray, - m->valueCache, - m->committed_token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens_to_commit, - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ false); + stream>>>( + m->devQKVProjArray, + m->valueCache, + m->committed_token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens_to_commit, + m->num_active_tokens, // number of active tokens in previous batch + m->num_heads, + BatchConfig::MAX_SEQ_LENGTH, + /* k_cache = */ false); } } @@ -438,7 +446,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - bc->requestsInfo[i].num_tokens_in_batch, // total_tokens_in_batch + m->num_active_tokens, // total_tokens_in_batch m->num_heads, BatchConfig::MAX_SEQ_LENGTH, /* k_cache = */ true); @@ -456,7 +464,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - bc->requestsInfo[i].num_tokens_in_batch, // total_tokens_in_batch + m->num_active_tokens, // total_tokens_in_batch m->num_heads, BatchConfig::MAX_SEQ_LENGTH, /* k_cache = */ false); @@ -645,7 +653,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, /*static*/ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - TreeIncMultiHeadSelfAttentionMeta const *m, + TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, @@ -661,15 +669,22 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( } // copy committed tokens info to GPU for the commit_tokens kernel + // Note that m->num_active_tokens stores the number of active + // tokens in the previous batch, which is needed for committing + // keys/values to the key-value cache cudaMemcpyAsync(m->committed_token_infos, &(bc->commited_tokens), bc->MAX_NUM_TOKENS * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream); + commit_tokens(m, bc, stream); - // 
reload the weight_o + // After commit we update m->num_active_tokens to be the number of active + // tokens for the current batch + m->num_active_tokens = bc->num_active_tokens(); + // reload the weight_o if (!(*m->has_load_weights)) { int parallelism = m->vProjSize * m->oProjSize * m->num_heads; tree_build_w_out_tensor<< Date: Wed, 10 May 2023 23:07:32 -0400 Subject: [PATCH 120/344] [Inference] opt model (#717) * init * fix * code * clean up * fix * fix, add md * format * hip_roc * add comment --- CMakeLists.txt | 4 + examples/cpp/inference/LLAMA/llama.cc | 2 +- examples/cpp/inference/file_loader.cc | 184 ++++++++++++--- examples/cpp/inference/file_loader.h | 7 + examples/cpp/inference/models/llama.cc | 6 +- examples/cpp/inference/opt/CMakeLists.txt | 21 ++ examples/cpp/inference/opt/Makefile | 38 +++ examples/cpp/inference/opt/README.md | 45 ++++ examples/cpp/inference/opt/opt.cc | 222 ++++++++++++++++++ examples/cpp/inference/opt/opt.h | 66 ++++++ examples/cpp/inference/opt/opt_baseline.py | 23 ++ include/flexflow/inference.h | 6 + include/flexflow/model.h | 6 +- .../ops/inc_multihead_self_attention.h | 18 +- .../ops/inc_multihead_self_attention_params.h | 5 +- src/ops/inc_multihead_self_attention.cc | 143 ++++++++++- src/ops/inc_multihead_self_attention.cpp | 3 +- src/ops/inc_multihead_self_attention.cu | 94 +++++++- src/runtime/graph.cc | 14 +- src/runtime/inference_manager.cc | 41 +++- src/runtime/model.cc | 9 + src/runtime/request_manager.cc | 6 + src/runtime/request_manager.cpp | 27 +++ src/runtime/request_manager.cu | 29 +++ 24 files changed, 953 insertions(+), 66 deletions(-) create mode 100644 examples/cpp/inference/opt/CMakeLists.txt create mode 100644 examples/cpp/inference/opt/Makefile create mode 100644 examples/cpp/inference/opt/README.md create mode 100644 examples/cpp/inference/opt/opt.cc create mode 100644 examples/cpp/inference/opt/opt.h create mode 100644 examples/cpp/inference/opt/opt_baseline.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 59f3453f1e..ff6e90f200 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,6 +548,10 @@ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/SPEC_LLAMA) endif() +if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/inference/opt) +endif() + if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/spec_verify_pipeline) endif() diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index de374459cb..8ca5cfe98e 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -117,7 +117,7 @@ void FlexFlow::top_level_task(Task const *task, llamaConfig.dim / llamaConfig.n_heads, llamaConfig.dim / llamaConfig.n_heads, 0.0f, - true, + false, false, false, NULL, diff --git a/examples/cpp/inference/file_loader.cc b/examples/cpp/inference/file_loader.cc index deed6ba985..15b88455e5 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/examples/cpp/inference/file_loader.cc @@ -19,6 +19,8 @@ #include using namespace std; +using namespace Legion; + FileDataLoader::FileDataLoader(std::string _input_path, std::string _weight_file_path, int _num_heads, @@ -66,13 +68,68 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { return prompts; }; +void load_attention_bias(float *ptr, + int num_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weight_path) { + std::string 
q_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wq_bias"; + std::string k_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wk_bias"; + std::string v_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wv_bias"; + std::string o_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wo_bias"; + std::vector bias_files = {q_file, k_file, v_file, o_file}; + + int file_index = 0; + + for (auto file : bias_files) { + size_t partial_size = hidden_dim; + std::cout << "partial_size in bias" << partial_size << ", file: " << file + << "\n"; + std::ifstream in(file, std::ios::in | std::ios::binary); + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load bias data error"; + return; + } + assert(partial_size == host_array.size()); + + size_t data_index = 0; + + for (int i = 0; i < hidden_dim; i++) { + ptr[file_index * hidden_dim + i] = host_array.at(data_index); + data_index++; + } + + file_index++; + + in.close(); + } +} + void load_attention_weights(float *ptr, - size_t size, - int hidden_dim, int num_heads, + size_t hidden_dim, + size_t qkv_inner_dim, std::string layer_name, - std::string weight_path) { - + std::string weight_path, + size_t volume) { + // layers_0_attention_wq_weight + // layers_0_self_attn_q_proj_weight std::string q_file = weight_path + layer_name.substr(0, layer_name.find("attention")) + "attention_wq_weight"; @@ -89,10 +146,20 @@ void load_attention_weights(float *ptr, int file_index = 0; + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_weight_file_size = + num_heads * single_proj_size; // size of each of Q/K/V/O for all heads + // q, k, v, o -> 0, 1, 2, 3 for (auto file : weight_files) { std::cout << "file name and index: " << file << "->" << file_index << "\n"; - size_t partial_size = size / 4; + size_t partial_size = one_weight_file_size; + + std::cout << "partial_size weight " << partial_size << ", " << volume + << ", " << hidden_dim << ", " << qkv_inner_dim << ", " + << num_heads << "\n"; std::ifstream in(file, std::ios::in | std::ios::binary); std::vector host_array(partial_size); size_t loaded_data_size = sizeof(float) * partial_size; @@ -137,7 +204,8 @@ void load_from_file(float *ptr, size_t size, std::string filename) { // std::cout << loaded_data_size << std::endl; // std::cout << in_get_size << std::endl; if (in_get_size != loaded_data_size) { - std::cout << "load data error" << std::endl; + std::cout << "load weight data error " << in_get_size << ", " + << loaded_data_size << ", " << sizeof(float) << std::endl; return; } @@ -152,41 +220,93 @@ void load_from_file(float *ptr, size_t size, std::string filename) { in.close(); } +void FileDataLoader::load_positions(FFModel *ff, + Tensor pt, + ParallelTensor position_pt, + int max_seq_length, + int offset) { + std::cout << "load positions" << std::endl; + size_t volume = 1; + std::vector dims_vec; + for (int i = 0; i < pt->num_dims; i++) { + // std::cout<< pt->dims[i] << "\n"; + volume *= pt->dims[i]; + dims_vec.push_back(pt->dims[i]); + std::cout << dims_vec.at(dims_vec.size() - 1) << ", "; + } + + // load data; + int *data = (int *)malloc(sizeof(int) * 
volume); + for (int i = 0; i < volume; i++) { + data[i] = i % max_seq_length + offset; + std::cout << data[i] << ", "; + } + // set tensor + + // ParallelTensor position_pt; + + // ff->get_parallel_tensor_from_tensor(pt, position_pt); + position_pt->set_tensor(ff, dims_vec, data); +} + void FileDataLoader::load_weights( FFModel *ff, std::unordered_map weights_layers) { for (auto &v : weights_layers) { - Tensor weight = v.second->weights[0]; - std::cout << "weights layer: " << v.first << "\n"; - if (weight == NULL) { - std::cout << "op no weights : " << v.first << "\n"; - continue; - } + int weights_num = v.second->numWeights; + std::cout << "weight layer: " << v.first << ", num" << weights_num << "\n"; - size_t volume = 1; - std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; - } + for (int i = 0; i < weights_num; i++) { + Tensor weight = v.second->weights[i]; + if (weight == NULL) { + std::cout << "op no weights : " << v.first << "\n"; + continue; + } - assert(weight->data_type == DT_FLOAT); - float *data = (float *)malloc(sizeof(float) * volume); + size_t volume = 1; + std::vector dims_vec; + for (int i = 0; i < weight->num_dims; i++) { + dims_vec.push_back(weight->dims[i]); + volume *= weight->dims[i]; + } + std::cout << "load weights volume: " << volume << std::endl; - if (v.first.find("attention_w") != std::string::npos) { - assert(dims_vec[0] = hidden_dim * qkv_inner_dim * 4); - assert(dims_vec[1] = num_heads); - assert(volume == dims_vec[0] * dims_vec[1]); - load_attention_weights( - data, volume, hidden_dim, num_heads, v.first, weight_file_path); + assert(weight->data_type == DT_FLOAT); + float *data = (float *)malloc(sizeof(float) * volume); - } else { - load_from_file(data, volume, weight_file_path + v.first); - } + if (v.first.find("attention_w") != std::string::npos) { + std::cout << "load weights bias: " << volume << "\n"; + if (i == 0) { + load_attention_weights(data, + num_heads, + hidden_dim, + qkv_inner_dim, + v.first, + weight_file_path, + volume); + } else { + load_attention_bias(data, + num_heads, + hidden_dim, + qkv_inner_dim, + v.first, + weight_file_path); + } - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor(ff, dims_vec, data); + } else { + std::string file_path = v.first; + if (i > 0) { + int index = v.first.find("_weight"); + assert(index != std::string::npos); + file_path = v.first.substr(0, index) + "_bias"; + } + load_from_file(data, volume, weight_file_path + file_path); + } + + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + weight_pt->set_tensor(ff, dims_vec, data); + } } } diff --git a/examples/cpp/inference/file_loader.h b/examples/cpp/inference/file_loader.h index 7d03b3ac82..06714293da 100644 --- a/examples/cpp/inference/file_loader.h +++ b/examples/cpp/inference/file_loader.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/batch_config.h" +#include "flexflow/inference.h" #include "flexflow/model.h" using namespace std; @@ -34,6 +35,12 @@ class FileDataLoader { void load_weights(FFModel *ff, std::unordered_map weights_layers); + void load_positions(FFModel *ff, + Tensor pt, + ParallelTensor position_pt, + int max_seq_length, + int offset); + private: int num_heads; size_t hidden_dim, qkv_inner_dim; diff --git a/examples/cpp/inference/models/llama.cc b/examples/cpp/inference/models/llama.cc index efd5c18b6e..d2374c8c8f 100644 --- a/examples/cpp/inference/models/llama.cc 
+++ b/examples/cpp/inference/models/llama.cc @@ -98,7 +98,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, - true, + false, false, false, NULL, @@ -113,7 +113,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, /*dropout*/ - true, /*bias*/ + false, /*bias*/ false, /*add_bias_kv*/ false, /*add_zero_attn*/ nullptr, /*kernel_initializer*/ @@ -129,7 +129,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, /*dropout*/ - true, /*bias*/ + false, /*bias*/ false, /*add_bias_kv*/ false, /*add_zero_attn*/ nullptr, /*kernel_initializer*/ diff --git a/examples/cpp/inference/opt/CMakeLists.txt b/examples/cpp/inference/opt/CMakeLists.txt new file mode 100644 index 0000000000..2a392dce35 --- /dev/null +++ b/examples/cpp/inference/opt/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExample_OPT) +set(project_target OPT) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + opt.cc + opt.h + ../file_loader.cc) + + + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/opt/Makefile b/examples/cpp/inference/opt/Makefile new file mode 100644 index 0000000000..afe13d305a --- /dev/null +++ b/examples/cpp/inference/opt/Makefile @@ -0,0 +1,38 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= opt +# List all the application source files here +GEN_SRC = opt.cc +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/opt/README.md b/examples/cpp/inference/opt/README.md new file mode 100644 index 0000000000..8a7bd10ee5 --- /dev/null +++ b/examples/cpp/inference/opt/README.md @@ -0,0 +1,45 @@ +# an example of running opt model +## how to run? +1. 
build the flexflow with FF_BUILD_ALL_INFERENCE_EXAMPLES or FF_BUILD_ALL_EXAMPLES +2. download the weight and token file from aws s3. +```bash +aws s3 cp s3://catalyst-llama/opt_125m_native.tar.gz FF_HOME/examples/cpp/inference/opt/weights + +tar -zxvf opt_125m_native.tar.gz +``` +3. run *OPT* with `--weights` `--dataset` `--only-data-parallel` +4. run examples/cpp/inference/opt/opt_baseline.py +5. if get same result, it should be fine + +## code structure: +1. use two inputs, token & position, the position input should be after the token input +2. for the attention model, set scaling_query = true, scaling_factor = 0.125 and qk_prod_scaling = false, +all other models should set scaling_query = false and qk_prod_scaling = true +## opt default configuration from huggingface opt-125m +```python +OPTConfig { + "_remove_final_layer_norm": false, + "activation_function": "relu", + "attention_dropout": 0.0, + "bos_token_id": 2, + "do_layer_norm_before": true, + "dropout": 0.1, + "enable_bias": true, + "eos_token_id": 2, + "ffn_dim": 3072, + "hidden_size": 768, + "init_std": 0.02, + "layer_norm_elementwise_affine": true, + "layerdrop": 0.0, + "max_position_embeddings": 2048, + "model_type": "opt", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 1, + "transformers_version": "4.27.2", + "use_cache": true, + "vocab_size": 50272, + "word_embed_proj_dim": 768 +} +``` + diff --git a/examples/cpp/inference/opt/opt.cc b/examples/cpp/inference/opt/opt.cc new file mode 100644 index 0000000000..c2932df926 --- /dev/null +++ b/examples/cpp/inference/opt/opt.cc @@ -0,0 +1,222 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "opt.h" +#include "flexflow/inference.h" +#include + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("opt"); + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + OptConfig optConfig; + FFModel ff(ffconfig); + //------------------------------compute machine views ------------------ + int num_devices = ffconfig.workersPerNode * ffconfig.numNodes; + std::vector machine_views; + for (int i = 0; i < num_devices; i++) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = 1; + view.stride[0] = 0; + view.start_device_id = i; + machine_views.push_back(view); + } + + std::unordered_map> mapping; + std::unordered_map weights_layers; + + //------------------------------ build the model -------------------------- + Tensor input; + Tensor position_input; + { + int const token_dims[] = {1, 9}; + input = ff.create_tensor<2>(token_dims, DT_INT32); + position_input = ff.create_tensor<2>(token_dims, DT_INT32); + } + + mapping[input].push_back(machine_views[0]); + mapping[position_input].push_back(machine_views[0]); + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + std::vector axes = {0}; + + Tensor token = ff.embedding(input, + optConfig.vocab_size, + optConfig.word_embed_proj_dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + Layer *embedding = ff.layers.back(); + + weights_layers.emplace("embed_tokens_weight", embedding); + + Tensor positional_embedding = ff.embedding(position_input, + optConfig.max_position_embeddings, + optConfig.hidden_size, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + Layer *pos_embedding = ff.layers.back(); + weights_layers.emplace("embed_positions_weight", pos_embedding); + + Tensor residual = ff.add(token, positional_embedding); + + int num_transformer_layers_per_gpu = (32 + num_devices - 1) / num_devices; + + for (int i = 0; i < optConfig.num_hidden_layers; i++) { + // 125m, 1.7B, ..., 175B applies layer norm BEFORE attention, + // 350m applies layer norm AFTER attention + // https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#LL324C1-L325C1 + // this version is before normalization + + Tensor hidden_states = ff.layer_norm( + residual, axes, optConfig.layer_norm_elementwise_affine, 1e-05); + Layer *self_attn_layer_norm = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + + "_self_attn_layer_norm_weight", + self_attn_layer_norm); + if (i % num_transformer_layers_per_gpu == 0) { + mapping[hidden_states].push_back( + machine_views[i / num_transformer_layers_per_gpu]); + } + + Tensor mha = ff.inc_multihead_self_attention( + hidden_states, + optConfig.hidden_size, + optConfig.num_attention_heads, + optConfig.hidden_size / optConfig.num_attention_heads, + optConfig.hidden_size / optConfig.num_attention_heads, + 0.0f, + true, + false, + false, + NULL, + false, + /*scaling query*/ true, + /*sacling factor*/ + pow((optConfig.hidden_size / optConfig.num_attention_heads), -0.5), + /*qk_prod_scaling*/ false); + + Layer *attention_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", + attention_layer); + Tensor added = ff.add(mha, residual); + + Tensor final_norm = ff.layer_norm( + added, axes, optConfig.layer_norm_elementwise_affine, 1e-05); + Layer *final_layer_norm = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + + "_final_layer_norm_weight", 
+ final_layer_norm); + + //--------linear fc1 fc2 ---------- + Tensor fc1 = ff.dense(final_norm, optConfig.ffn_dim, AC_MODE_NONE, true); + Layer *fc1_linear = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_fc1_weight", + fc1_linear); + Tensor activation = ff.relu(fc1, false); + + Tensor fc2 = + ff.dense(activation, optConfig.hidden_size, AC_MODE_NONE, true); + Layer *fc2_linear = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_fc2_weight", + fc2_linear); + residual = ff.add(added, fc2); + } + + // final + Tensor all_final_norm = ff.layer_norm( + residual, axes, optConfig.layer_norm_elementwise_affine, 1e-05); + Layer *all_final_norm_layer = ff.layers.back(); + weights_layers.emplace("final_layer_norm_weight", all_final_norm_layer); + + Tensor lm_head = + ff.dense(all_final_norm, optConfig.vocab_size, AC_MODE_NONE, false); + Layer *lm_head_layer = ff.layers.back(); + weights_layers.emplace("embed_tokens_weight_lm_head", lm_head_layer); + + Tensor output = ff.arg_top_k(lm_head, /*k=*/1, false); + //------------------- compile the model -------------------------------- + std::cout << "------start compile ----------" << std::endl; + InferenceManager im(ffconfig, 1, 1); + im.compile_model_and_allocate_buffer(&ff, mapping); + RequestManager rm; + + ParallelTensor input_pt; + ff.get_parallel_tensor_from_tensor(input, input_pt); + assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); + + ParallelTensor pos_pt; + ff.get_parallel_tensor_from_tensor(position_input, pos_pt); + assert(im.tensor_buffer.find(pos_pt) != im.tensor_buffer.end()); + + //-------------------load weights and inputs------------------ + FileDataLoader fileloader(optConfig.input_path, + optConfig.weight_file_path, + optConfig.num_attention_heads, + optConfig.hidden_size, + optConfig.hidden_size / + optConfig.num_attention_heads); + //"Today is a beautiful day and I want" + std::vector prompt = {2, 5625, 16, 10, 2721, 183, 8, 38, 236}; + rm.register_new_request(prompt, 20); + fileloader.load_weights(&ff, weights_layers); + + im.init_operators_inference(&ff); + int depth = 0; + std::map future_handlers; + std::map batch_configs; + int sentence_length = 9; + while (true) { + int bid = 0; + if (future_handlers.find(bid) == future_handlers.end()) { + BatchConfig bc; + InferenceResult ir; + bc = rm.prepare_next_batch(bc, ir); + FutureMap fm = im.inference(&ff, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; + } else { + Future future = future_handlers[bid]; + if (!future.is_ready(true /*subscribe*/)) { + continue; + } else { + std::cout << "future is ready...." 
<< std::endl; + } + // process end + InferenceResult ir = future.get_result(); + BatchConfig bc = batch_configs[bid]; + bc = rm.prepare_next_batch(bc, ir); + sentence_length += bc.num_tokens; + FutureMap fm = im.inference(&ff, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + future_handlers[bid] = fm.get_future(0); + batch_configs[bid] = bc; + } + } + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/opt/opt.h b/examples/cpp/inference/opt/opt.h new file mode 100644 index 0000000000..d581b73df9 --- /dev/null +++ b/examples/cpp/inference/opt/opt.h @@ -0,0 +1,66 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "file_loader.h" +#include "inference_config.h" +// # OPTConfig { +// # "_remove_final_layer_norm": false, +// # "activation_function": "relu", +// # "attention_dropout": 0.0, +// # "bos_token_id": 2, +// # "do_layer_norm_before": true, +// # "dropout": 0.1, +// # "enable_bias": true, +// # "eos_token_id": 2, +// # "ffn_dim": 3072, +// # "hidden_size": 768, +// # "init_std": 0.02, +// # "layer_norm_elementwise_affine": true, +// # "layerdrop": 0.0, +// # "max_position_embeddings": 2048, +// # "model_type": "opt", +// # "num_attention_heads": 12, +// # "num_hidden_layers": 12, +// # "pad_token_id": 1, +// # "transformers_version": "4.27.2", +// # "use_cache": true, +// # "vocab_size": 50272, +// # "word_embed_proj_dim": 768 +// # } +struct OptConfig : InferenceConfig { + OptConfig(void) : InferenceConfig() { + vocab_size = 50272, word_embed_proj_dim = 768, hidden_size = 768; + max_position_embeddings = 2048; + layer_norm_elementwise_affine = true; + num_attention_heads = 12; + dropout = 0.1; + seed = 3; + ffn_dim = 3072; + num_hidden_layers = 12; + weight_file_path = + "/home/ubuntu/FlexFlow/examples/cpp/inference/opt/weights/"; + } + int word_embed_proj_dim; + std::string input_path; + std::string weight_file_path; + int max_position_embeddings; + bool layer_norm_elementwise_affine; + float dropout; + unsigned long long seed; + int ffn_dim; + int num_hidden_layers; +}; diff --git a/examples/cpp/inference/opt/opt_baseline.py b/examples/cpp/inference/opt/opt_baseline.py new file mode 100644 index 0000000000..5574af259a --- /dev/null +++ b/examples/cpp/inference/opt/opt_baseline.py @@ -0,0 +1,23 @@ +from transformers import OPTConfig, OPTForCausalLM, GPT2Tokenizer + +model_id = "facebook/opt-125m" +tokenizer = GPT2Tokenizer.from_pretrained(model_id) +model = OPTForCausalLM.from_pretrained(model_id) + +prompts = [ + "Today is a beautiful day and I want", + ] + +for prompt in prompts: + input_ids = tokenizer(prompt, return_tensors="pt", padding=True).input_ids + print(input_ids) + generated_ids = model.generate(input_ids, max_length=20) + generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + print(generated_ids) + 
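    # generated_ids begins with the 9 prompt ids (2, 5625, 16, 10, 2721, 183,
    # 8, 38, 236), which is exactly the prompt vector hard-coded in opt.cc;
    # the remaining 11 ids are the newly decoded tokens up to max_length=20.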
print(generated_string) + +#get same results with this and opt.cc + +# tensor([[ 2, 5625, 16, 10, 2721, 183, 8, 38, 236, 7, 458, 19, + # 47, 5, 2770, 527, 9, 127, 78, 655]]) +# 2, 5625, 16, 10, 2721, 183, 8, 38, 236, 7, 458, 19, 47, 5, 2770, 527, 9, 127, 78, 655 \ No newline at end of file diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index bbeaf67821..ed5c6c3aa0 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -37,6 +37,7 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); void load_input_tokens_from_batch_config(BatchConfig const &bc, ParallelTensor const input); + void load_positions(BatchConfig const &bc, ParallelTensor position_input); public: FFConfig ff_config; @@ -118,6 +119,11 @@ class RequestManager { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + load_positions_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); private: std::queue pending_request_queue; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index caf6229300..a246cf37c3 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -215,6 +215,7 @@ enum TaskIDs { FUSED_PARALLELOP_BWD_TASK_ID, // InferenceManager & RequestManager RM_LOAD_TOKENS_TASK_ID, + RM_LOAD_POSITION_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, @@ -611,11 +612,14 @@ class FFModel { int kdim = 0, int vdim = 0, float dropout = 0.0f, - bool bias = true, + bool bias = false, bool add_bias_kv = false, bool add_zero_attn = false, Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, char const *name = NULL); Tensor spec_inc_multihead_self_attention(const Tensor input, diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 5db5e0c3c1..410c30abd9 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -34,6 +34,9 @@ class IncMultiHeadSelfAttention : public Op { bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name); IncMultiHeadSelfAttention(FFModel &model, @@ -48,6 +51,9 @@ class IncMultiHeadSelfAttention : public Op { bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name); IncMultiHeadSelfAttention(FFModel &model, @@ -96,14 +102,16 @@ class IncMultiHeadSelfAttention : public Op { BatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr); + float *output_ptr, + float const *bias_ptr); Params get_params() const; public: int num_heads; - float dropout; + float dropout, scaling_factor; bool bias; - bool add_bias_kv, add_zero_attn, apply_rotary_embedding; + bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + qk_prod_scaling; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; }; @@ -125,6 +133,10 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { int num_heads; bool *has_load_weights; bool *apply_rotary_embedding; + bool *bias; + bool *scaling_query; + bool *qk_prod_scaling; + float 
scaling_factor; #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index d263bc741a..66aed3bf3b 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -9,8 +9,9 @@ namespace FlexFlow { struct IncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_heads, kdim, vdim; - float dropout; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + float dropout, scaling_factor; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + qk_prod_scaling; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8976703c6f..bc96e2a587 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -65,14 +65,18 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, bool add_zero_attn, Initializer *kernel_initializer, bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, char const *name) { + int weight_num = bias ? 2 : 1; // Currently assume that Layer *li = new Layer(this, OP_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + weight_num /*weights*/, 1 /*outputs*/, input); { @@ -103,6 +107,17 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, kernel_initializer, CHOSEN_SYNC_TYPE); } + if (bias) { + // q, k, v, o + int dims[1] = {embed_dim * 4}; + li->weights[1] = create_weight_legion_ordering(1, + dims, + DT_FLOAT, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } li->data_type = DT_FLOAT; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); @@ -113,7 +128,11 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("scaling_query", scaling_query); + li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); layers.push_back(li); + return li->outputs[0]; } @@ -140,6 +159,13 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( bool add_zero_attn = (bool)value; layer->get_int_property("apply_rotary_embedding", value); bool apply_rotary_embedding = (bool)value; + layer->get_int_property("scaling_query", value); + bool scaling_query = (bool)value; + float scaling_factor; + layer->get_float_property("scaling_factor", scaling_factor); + layer->get_int_property("qk_prod_scaling", value); + bool qk_prod_scaling = (bool)value; + return new IncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], @@ -152,6 +178,9 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( add_bias_kv, add_zero_attn, apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, false /*allocate_weights*/, layer->name); } @@ -169,6 +198,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -177,7 +209,7 @@ 
IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + (_bias ? 2 : 1), /*weights*/ 1 /*outputs*/, _input), num_heads(_num_heads), dropout(_dropout), bias(_bias), @@ -186,7 +218,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling) { // overwrite layer_guid layer_guid = _layer_guid; @@ -230,6 +264,26 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( initializer, comm_type); } + if (bias) { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[0] = inputs[0]->dims[num_dims - 1]; + dims[0].size = dims[0].degree; + dims[1].size = oProjSize * 4; + dims[1].degree = 1; + dims[1].parallel_idx = -1; +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[1] = model.create_parallel_weight<2>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + NULL, + comm_type); + } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, DT_FLOAT, this); @@ -253,6 +307,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -261,7 +318,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + (_bias ? 
2 : 1), /*weights*/ 1 /*outputs*/, _input, _weight), @@ -271,7 +328,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -312,6 +371,24 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( initializer, comm_type); } + if (bias) { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[0] = inputs[0]->dims[num_dims - 1]; + dims[0].size = dims[0].degree; + dims[1].size = oProjSize * 4; +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[1] = model.create_parallel_weight<2>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + NULL, + comm_type); + } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, DT_FLOAT, this); @@ -341,6 +418,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.add_bias_kv, other.add_zero_attn, other.apply_rotary_embedding, + other.scaling_query, + other.scaling_factor, + other.qk_prod_scaling, allocate_weights, other.name) {} @@ -362,6 +442,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.add_bias_kv, params.add_zero_attn, params.apply_rotary_embedding, + params.scaling_query, + params.scaling_factor, + params.qk_prod_scaling, allocate_weights, name) {} @@ -457,6 +540,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + IncMultiHeadSelfAttention const *attn = (IncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); @@ -480,6 +564,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( .first(); IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( handle, attn, weight.get_float_ptr(), gpu_mem, num_samples, num_heads); + m->profiling = attn->profiling; assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); return m; @@ -533,6 +618,15 @@ FutureMap IncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); + + if (bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(idx++, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } @@ -546,13 +640,17 @@ void IncMultiHeadSelfAttention::inference_task( std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 3); + assert(task->regions.size() == regions.size()); + float const *bias_ptr = NULL; + BatchConfig const *bc = (BatchConfig *)task->args; IncMultiHeadSelfAttentionMeta const *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); + assert((*m->bias ? 
regions.size() == 4 : regions.size() == 3)); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( @@ -560,6 +658,20 @@ void IncMultiHeadSelfAttention::inference_task( GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + if (*m->bias) { + GenericTensorAccessorR biases = + helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 2); + bias_ptr = biases.get_float_ptr(); + } + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain weight_domain = runtime->get_index_space_domain( @@ -571,15 +683,12 @@ void IncMultiHeadSelfAttention::inference_task( assert(weight_domain.get_dim() == 3); assert(output_domain.get_dim() == 4); - /* print_tensor(input.get_float_ptr(), - input_domain.get_volume(), - "[Attention:forward:query]"); */ - IncMultiHeadSelfAttention::inference_kernel_wrapper(m, bc, input.get_float_ptr(), weight.get_float_ptr(), - output.get_float_ptr()); + output.get_float_ptr(), + bias_ptr); #ifdef INFERENCE_TESTS printf("Checking IncMultiHeadSelfAttention computations...\n"); @@ -1367,7 +1476,10 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding; + lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.scaling_query == rhs.scaling_query && + lhs.scaling_factor == rhs.scaling_factor && + lhs.qk_prod_scaling == rhs.qk_prod_scaling; } IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { @@ -1382,6 +1494,10 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; params.apply_rotary_embedding = this->apply_rotary_embedding; + params.scaling_query = this->scaling_query; + params.scaling_factor = this->scaling_factor; + params.qk_prod_scaling = this->qk_prod_scaling; + return params; } @@ -1401,6 +1517,9 @@ size_t hash::operator()( hash_combine(key, params.add_bias_kv); hash_combine(key, params.add_zero_attn); hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.scaling_query); + hash_combine(key, params.scaling_factor); + hash_combine(key, params.qk_prod_scaling); return key; } }; // namespace std diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 12ab8ae30c..c56e73a266 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -29,7 +29,8 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( BatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr) { + float *output_ptr, + float const *bias_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 5997a3d48d..be86f55de4 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu 
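The hunks below add per-head Q/K/V bias and optional query scaling to the
incremental attention kernels. For orientation, a minimal host-side sketch of
the indexing they use (the helper name is illustrative, not part of the patch),
assuming qProjSize == kProjSize == vProjSize as the kernel itself does:

// devQKVProjArray is laid out per head as [Q | K | V], each block holding
// proj_size values for every token; the bias tensor is laid out as
// [q biases for all heads | k | v | o].
void apply_qkv_bias_reference(float *qkv, float const *bias, int num_tokens,
                              int proj_size, int num_heads, bool scaling_query,
                              float scaling_factor) {
  int block = proj_size * num_tokens; // one of Q/K/V for one head
  for (int head = 0; head < num_heads; head++) {
    for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) { // 0->Q, 1->K, 2->V
      for (int t = 0; t < num_tokens; t++) {
        for (int d = 0; d < proj_size; d++) {
          int out = head * 3 * block + qkv_idx * block + t * proj_size + d;
          int b = qkv_idx * proj_size * num_heads + head * proj_size + d;
          qkv[out] += bias[b];
          if (scaling_query && qkv_idx == 0) {
            qkv[out] *= scaling_factor; // OPT folds 1/sqrt(head_dim) into Q
          }
        }
      }
    }
  }
}

With scaling_query enabled and qk_prod_scaling disabled (as the OPT example
configures it), the 1/sqrt(head_dim) factor is applied exactly once here, and
the QK^T GEMM further down runs with alpha = 1.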
@@ -40,6 +40,49 @@ __global__ void build_w_out_tensor(float const *weight_ptr, } } +__global__ void apply_proj_bias_w(float *input_ptr, + float const *bias_ptr, + int num_tokens, + int oProjSize) { + CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { + int bias_idx = 3 * oProjSize + i % oProjSize; + input_ptr[i] += bias_ptr[bias_idx]; + } +} + +__global__ void apply_proj_bias_qkv(float *input_ptr, + float const *bias_ptr, + int num_tokens, + int qProjSize, + int kProjSize, + int vProjSize, + int num_heads, + bool scaling_query, + float scaling_factor) { + CUDA_KERNEL_LOOP( + i, num_tokens * (qProjSize + kProjSize + vProjSize) * num_heads) { + // for simplicity, assume q, k, v is in same shape + // 0->q, 1->k, 2->v + int qkv_index = i / (num_tokens * qProjSize) % 3; + + int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); + int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + int q_block_size = qProjSize * num_tokens; + + int idx = i % (num_tokens * (qProjSize)); + + int real_part_index = + head_idx * qkv_block_size + qkv_index * q_block_size + idx; + int bias_idx = qkv_index * qProjSize * num_heads + head_idx * qProjSize + + (idx % qProjSize); + input_ptr[real_part_index] += bias_ptr[bias_idx]; + + if (scaling_query && qkv_index == 0) { + input_ptr[real_part_index] *= scaling_factor; + } + } +} + __global__ void apply_rotary_embedding(float *input_ptr, cuFloatComplex *complex_input, @@ -106,6 +149,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, float const *input_ptr, float const *weight_ptr, float *output_ptr, + float const *bias_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); @@ -217,6 +261,23 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int k_block_size = m->kProjSize * num_tokens; int v_block_size = m->vProjSize * num_tokens; cuFloatComplex *complex_input; + + // apply bias for q, k, v + if (*m->bias) { + apply_proj_bias_qkv<<>>(output_ptr, + bias_ptr, + num_tokens, + m->qProjSize, + m->kProjSize, + m->vProjSize, + m->num_heads, + *m->scaling_query, + m->scaling_factor); + } + if (*m->apply_rotary_embedding) { checkCUDA(cudaMalloc(&complex_input, num_tokens * m->qProjSize * m->num_heads * @@ -344,6 +405,7 @@ __global__ void fill_entries_above_diagonal(float *matrix, void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, float *output_ptr, + float const *bias_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -382,7 +444,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; - float alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + // a flag of using this scaling alpha + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + } // To get A, skip over Q entries from previous requests (same head) void const *A = (void const *)(m->devQKVProjArray + tokens_previous_requests * m->qProjSize); @@ -415,6 +481,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Fill all elements above diagonal in qk prods with -inf to force // causal attention. 
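      // (Each of the num_new_tokens query rows may only attend to key
      //  positions up to its own position in the request, so entries of the
      //  num_new_tokens x total_tokens qk_prods buffer whose key position
      //  exceeds the query's position are set to -inf before the softmax.)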
assert(num_new_tokens <= total_tokens); @@ -541,9 +608,19 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + tokens_previous_requests += num_new_tokens; } + if (*m->bias) { + int parallelism = m->oProjSize * num_tokens; + apply_proj_bias_w<<>>( + output_ptr, bias_ptr, num_tokens, m->oProjSize); + } + assert(tokens_previous_requests == num_tokens); } @@ -553,7 +630,8 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( BatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr) { + float *output_ptr, + float const *bias_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -588,14 +666,15 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); + compute_qkv_kernel( + m, bc, input_ptr, weight_ptr, m->devQKVProjArray, bias_ptr, stream); // phase 2: Update key/val cache update_kv_cache_kernel(m, bc, stream); // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, output_ptr, stream); + compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); if (m->profiling) { cudaEventRecord(t_end, stream); @@ -643,6 +722,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( *has_load_weights = false; apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); *apply_rotary_embedding = attn->apply_rotary_embedding; + bias = (bool *)calloc(1, sizeof(bool)); + *bias = attn->bias; + scaling_query = (bool *)calloc(1, sizeof(bool)); + *scaling_query = attn->scaling_query; + scaling_factor = attn->scaling_factor; + qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); + *qk_prod_scaling = attn->qk_prod_scaling; // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 2a382f0d71..baf1b24f5d 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2275,6 +2275,9 @@ GraphOptimalViewSerialized sez.serialize(attn->add_bias_kv); sez.serialize(attn->add_zero_attn); sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->scaling_query); + sez.serialize(attn->scaling_factor); + sez.serialize(attn->qk_prod_scaling); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2669,8 +2672,9 @@ void FFModel::deserialize_graph_optimal_view( case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); int embed_dim, num_heads, k_dim, v_dim; - float dropout; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + float dropout, scaling_factor; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling; size_t id; dez.deserialize(id); LayerID layer_guid(id); @@ -2683,6 +2687,9 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(add_bias_kv); dez.deserialize(add_zero_attn); dez.deserialize(apply_rotary_embedding); + dez.deserialize(scaling_query); + dez.deserialize(scaling_factor); + dez.deserialize(qk_prod_scaling); IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2695,6 +2702,9 @@ void FFModel::deserialize_graph_optimal_view( params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; params.apply_rotary_embedding = apply_rotary_embedding; + params.scaling_query = scaling_query; + params.scaling_factor = scaling_factor; + 
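graph.cc now appends the three new attention fields to the serialized operator description, and deserialize_graph_optimal_view must read them back in exactly the same order and with the same types, otherwise every field after the first mismatch is reinterpreted. A minimal sketch of that invariant, with hypothetical Writer/Reader templates standing in for the sez/dez objects used above:

    // Sketch only: Writer/Reader are stand-ins for FlexFlow's serializer objects.
    struct AttnExtras {
      bool scaling_query;
      float scaling_factor;
      bool qk_prod_scaling;
    };

    template <typename Writer>
    void write_extras(Writer &sez, AttnExtras const &e) {
      sez.serialize(e.scaling_query);
      sez.serialize(e.scaling_factor);
      sez.serialize(e.qk_prod_scaling);
    }

    template <typename Reader>
    void read_extras(Reader &dez, AttnExtras &e) {
      // Field order and types must mirror write_extras exactly; reading a bool
      // where a float was written silently corrupts everything that follows.
      dez.deserialize(e.scaling_query);
      dez.deserialize(e.scaling_factor);
      dez.deserialize(e.qk_prod_scaling);
    }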
params.qk_prod_scaling = qk_prod_scaling; node = get_or_create_node(inputs[0], params); break; } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 15ae8e3aa3..63a5bb6540 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -216,12 +216,18 @@ FutureMap InferenceManager::inference(FFModel *model, if (op->op_type == OP_INPUT) { // FIXME: this is a hack, should be replace with an input ParallelTensor if (found_input_operator) { - continue; + // there is another input for position embedding; + // now only used in opt model, this input should be init after token + // input. + assert(op->numOutputs == 1); + ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; + load_positions(bc, pt); + } else { + found_input_operator = true; + assert(op->numOutputs == 1); + ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; + load_input_tokens_from_batch_config(bc, pt); } - found_input_operator = true; - assert(op->numOutputs == 1); - ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_input_tokens_from_batch_config(bc, pt); } std::vector inputs(op->numInputs); @@ -271,6 +277,31 @@ void InferenceManager::load_input_tokens_from_batch_config( runtime->execute_index_space(ctx, launcher); } +void InferenceManager::load_positions(BatchConfig const &bc, + ParallelTensor position_input) { + Context ctx = ff_config.lg_ctx; + Runtime *runtime = ff_config.lg_hlr; + size_t machine_view_hash = position_input->machine_view.hash(); + ArgumentMap argmap; + IndexLauncher launcher( + RM_LOAD_POSITION_TASK_ID, + position_input->parallel_is, + TaskArgument( + &bc, std::max(sizeof(BeamSearchBatchConfig), sizeof(BatchConfig))), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(position_input->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + position_input->region)); + launcher.add_field(0, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + void FFModel::compile_inference() { Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 38bb7b8333..562b09e411 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3899,6 +3899,15 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "RequestManager Load Tokens Task"); } + // RequestManager load position tokens + { + TaskVariantRegistrar registrar(RM_LOAD_POSITION_TASK_ID, + "RequestManager Load Position tokens"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "RequestManager Load Position Tokens Task"); + } // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index fb445fbec6..5b256d5bb7 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -64,6 +64,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == request.tokens.size()); // This is a decoding token + std::cout << "token is: " << result.token_ids[i]; request.tokens.push_back(result.token_ids[i]); } } @@ -85,6 +86,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", old_bc.requestsInfo[i].request_guid, 
request.tokens.size()); + std::cout << "print results: " << std::endl; + for (int i = 0; i < request.tokens.size(); i++) { + std::cout << request.tokens.at(i) << ", "; + } } else { new_bc.request_completed[i] = false; new_bc.requestsInfo[i].token_start_offset = processed_tokens; @@ -839,6 +844,7 @@ TreeVerifyBatchConfig RequestManager::convert_beam_to_tree_batch_config( beam_bc.requestsInfo[i].max_sequence_length; tree_bc.requestsInfo[i].token_start_offset = serializedTree[0].second; tree_bc.requestsInfo[i].num_tokens_in_batch = 0; + for (int k = 0; k < serializedTree.size(); k++) { assert(tree_bc.num_tokens < BatchConfig::MAX_NUM_TOKENS); tree_bc.tokensInfo[tree_bc.num_tokens].request_index = i; diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index 87e86087fe..ffbdac68cd 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -48,4 +48,31 @@ void RequestManager::load_tokens_task( stream)); } +void RequestManager::load_positions_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 1); + assert(task->regions.size() == 1); + BatchConfig const batch_config = *((BatchConfig *)task->args); + int offset = 2; + int *pos_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + int dram_copy[BatchConfig::MAX_NUM_TOKENS]; + + for (int i = 0; i < batch_config.num_tokens; i++) { + dram_copy[i] = batch_config.tokensInfo[i].abs_depth_in_request + offset; + } + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDA(hipMemcpyAsync(pos_ptr, + dram_copy, + sizeof(int) * batch_config.num_tokens, + hipMemcpyHostToDevice, + stream)); +} + }; // namespace FlexFlow diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 32e872125c..c1bd02494f 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -47,4 +47,33 @@ void RequestManager::load_tokens_task( stream)); } +void RequestManager::load_positions_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 1); + assert(task->regions.size() == 1); + + BatchConfig const batch_config = *((BatchConfig *)task->args); + int offset = 2; + int *pos_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + int dram_copy[BatchConfig::MAX_NUM_TOKENS]; + + for (int i = 0; i < batch_config.num_tokens; i++) { + dram_copy[i] = batch_config.tokensInfo[i].abs_depth_in_request + offset; + } + + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDA(cudaMemcpyAsync(pos_ptr, + dram_copy, + sizeof(int) * batch_config.num_tokens, + cudaMemcpyHostToDevice, + stream)); +} + }; // namespace FlexFlow From b2d6d9a9221a40c305eb570a06f22ead239a734f Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 11 May 2023 11:01:06 -0500 Subject: [PATCH 121/344] TreeIncMHA and SpecIncMHA bug fixes (#720) * Support multiple FFModels in a single top_level_task * [TreeVerifyMHA] bug fixes * bug fixes * TreeIncMHA and SpecIncMHA bug fixes * fomat. 
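Both the HIP and CUDA variants of load_positions_task fill the position-input tensor with each token's depth in its request plus a fixed offset of 2; that offset follows the convention of OPT's learned positional embedding, where (as in the HuggingFace implementation) position ids are shifted by two. The host-side computation, isolated for clarity (a sketch; build_position_ids is not a FlexFlow function):

    #include <vector>
    #include "flexflow/batch_config.h"

    std::vector<int> build_position_ids(FlexFlow::BatchConfig const &bc,
                                        int offset = 2) {
      std::vector<int> positions(bc.num_tokens);
      for (int i = 0; i < bc.num_tokens; i++) {
        positions[i] = bc.tokensInfo[i].abs_depth_in_request + offset;
      }
      return positions;  // the task copies this array to the device with *MemcpyAsync
    }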
--------- Co-authored-by: xinhaoc --- examples/cpp/inference/models/llama.cc | 2 +- examples/cpp/inference/models/llama.h | 11 +-- .../inference/spec_verify_pipeline/llama.cc | 96 +------------------ include/flexflow/batch_config.h | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 25 +++-- src/ops/tree_inc_multihead_self_attention.cu | 25 +++-- src/runtime/request_manager.cu | 1 + 7 files changed, 32 insertions(+), 130 deletions(-) diff --git a/examples/cpp/inference/models/llama.cc b/examples/cpp/inference/models/llama.cc index d2374c8c8f..46f83f8198 100644 --- a/examples/cpp/inference/models/llama.cc +++ b/examples/cpp/inference/models/llama.cc @@ -45,7 +45,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {llama_config.batchSize, llama_config.max_seq_len}; + int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } mapping[input].push_back(machine_views[0]); diff --git a/examples/cpp/inference/models/llama.h b/examples/cpp/inference/models/llama.h index 612a28967e..8c7d464936 100644 --- a/examples/cpp/inference/models/llama.h +++ b/examples/cpp/inference/models/llama.h @@ -31,16 +31,14 @@ class LLAMA { dim = 4096; multiple_of = 256; norm_eps = 1e-6; - total_sentence = 5; sentence_len = 347; - max_gen_length = 256; batchSize = 5; total_requests = 2560; incremental_mode = true; sequence_length = BatchConfig::MAX_SEQ_LENGTH; - max_seq_len = 8; + max_seq_len = BatchConfig::MAX_NUM_TOKENS; max_beam_width = 1; - max_beam_depth = 8; + max_beam_depth = 4; // hidden dim hidden_dim = 4 * dim; @@ -49,9 +47,8 @@ class LLAMA { multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); } int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, - total_sentence, sentence_len, batchSize, total_requests, - incremental_mode, sequence_length, max_gen_length, max_seq_len, - max_beam_width, max_beam_depth; + sentence_len, batchSize, total_requests, incremental_mode, + sequence_length, max_seq_len, max_beam_width, max_beam_depth; float norm_eps; std::string weight_file_path; std::string input_path; diff --git a/examples/cpp/inference/spec_verify_pipeline/llama.cc b/examples/cpp/inference/spec_verify_pipeline/llama.cc index b00fc522f1..a2a8e1ea2f 100644 --- a/examples/cpp/inference/spec_verify_pipeline/llama.cc +++ b/examples/cpp/inference/spec_verify_pipeline/llama.cc @@ -54,10 +54,11 @@ void FlexFlow::top_level_task(Task const *task, 1, 306, 4658, 278, 6593, 310, 2834, 338}; rm.register_new_request(prompt, llama_config.sentence_len); - FFModel beam_model(ffconfig), tree_model(ffconfig), inc_model(ffconfig); + FFModel beam_model(ffconfig), tree_model(ffconfig); LLAMA::create_llama_model(beam_model, im, llama_config, 1, BEAM_SEARCH_MODE); LLAMA::create_llama_model(tree_model, im, llama_config, 1, TREE_VERIFY_MODE); - LLAMA::create_llama_model(inc_model, im, llama_config, 1, INC_DECODING_MODE); + // LLAMA::create_llama_model(inc_model, im, llama_config, 1, + // INC_DECODING_MODE); // entry--------------------------- int depth = 0; @@ -183,97 +184,6 @@ void FlexFlow::top_level_task(Task const *task, } } - // // original - // { - // std::vector tokens{1, - // 306, - // 4658, - // 278, - // 6593, - // 310, - // 2834, - // 338, - // 593, - // 595, - // 17252, - // 5031, - // 993, - // 616, - // 368, - // 2302, - // 3204, - // 29131, - // 2976, - // 11285, - // 8930, - // 635, - // 8519, - // 593, - // 595}; - // BatchConfig bc; - // bc.num_tokens = 25; - // bc.requestsInfo[0].num_tokens_in_batch = 
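The token input tensor of the model builders changes from {batchSize, max_seq_len} to {BatchConfig::MAX_NUM_TOKENS, 1}: instead of one padded sequence per row, the runtime feeds a single flat buffer of up to MAX_NUM_TOKENS tokens drawn from however many requests are currently active, with per-slot metadata carried in BatchConfig. A sketch of how a consumer walks that flattened layout (field names are the ones used throughout this patch):

    #include "flexflow/batch_config.h"

    // Iterate the flattened token buffer; slot i can belong to any active request.
    void walk_batch(FlexFlow::BatchConfig const &bc) {
      for (int i = 0; i < bc.num_tokens; i++) {
        int token_id = bc.tokensInfo[i].token_id;           // value stored in the input tensor
        int request = bc.tokensInfo[i].request_index;       // which request owns slot i
        int depth = bc.tokensInfo[i].abs_depth_in_request;  // token's position in that request
        (void)token_id; (void)request; (void)depth;
      }
    }

This is also why the LLaMA config now ties max_seq_len to BatchConfig::MAX_NUM_TOKENS instead of keeping an independent constant.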
bc.num_tokens; - // bc.requestsInfo[0].token_start_offset = 0; - // bc.requestsInfo[0].max_sequence_length = 347; - // bc.requestsInfo[0].request_guid = 1000000; - // bc.request_completed[0] = false; - // for (int i = 0; i < bc.num_tokens; i++) { - // bc.tokensInfo[i].token_id = tokens[i]; - // bc.tokensInfo[i].abs_depth_in_request = i; - // bc.tokensInfo[i].request_index = 0; - // } - // FutureMap fm = im.inference(&inc_model, 0, bc); - // assert(fm.get_future_map_domain().get_volume() == 1); - // Future future = fm.get_future(0); - // InferenceResult ir = future.get_result(); - // for (int i = 0; i < bc.num_tokens; i++) { - // printf("decoding_tokens[%d] = %d\n", i, ir.token_ids[i]); - // } - // } - - // // verification - // { - // std::vector tokens{1, - // 306, - // 4658, - // 278, - // 6593, - // 310, - // 2834, - // 338, - // 593, - // 595, - // 17252, - // 5031, - // 993, - // 616, - // 368, - // 2302, - // 3204, - // 29131, - // 2976, - // 11285, - // 8930, - // 635, - // 8519, - // 593, - // 595}; - // tree_bc.num_tokens = 25; - // tree_bc.requestsInfo[0].num_tokens_in_batch = tree_bc.num_tokens; - // for (int i = 0; i < tree_bc.num_tokens; i++) { - // tree_bc.tokensInfo[i].token_id = tokens[i]; - // tree_bc.tokensInfo[i].abs_depth_in_request = i; - // tree_bc.tokensInfo[i].request_index = 0; - // } - // FutureMap fm = im.inference(&tree_model, 0, tree_bc); - // assert(fm.get_future_map_domain().get_volume() == 1); - // Future future = fm.get_future(0); - // InferenceResult ir = future.get_result(); - // for (int i = 0; i < tree_bc.num_tokens; i++) { - // printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); - // } - // } - // Execution fence { Future future = runtime->issue_execution_fence(ctx); diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 8c6fa41f2e..bd109eecd0 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -102,7 +102,7 @@ class BeamSearchBatchConfig : public BatchConfig { size_t beam_width; size_t target_iterations; static int const MAX_BEAM_WIDTH = 1; - static int const MAX_BEAM_DEPTH = 8; + static int const MAX_BEAM_DEPTH = 4; struct BeamSearchPerRequestInfo { bool request_completed; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 42f647f670..ee1c6e389b 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -437,18 +437,17 @@ void inference_kernel2(SpecIncMultiHeadSelfAttentionMeta const *m, } __global__ void spec_fill_entries_above_diagonal(float *matrix, - size_t num_rows, - size_t num_cols, + size_t new_tokens, + size_t total_tokens_in_request, size_t num_heads, - size_t entries_above_diagonal, float value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_heads) { + //size_t head_idx = i / (new_tokens * total_tokens_in_request); + size_t src_idx = (i / new_tokens) % total_tokens_in_request; + size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; + // Casual Mask + if (src_idx > dst_idx) + matrix[i] = value; } } @@ -543,9 +542,8 @@ void inference_kernel3(SpecIncMultiHeadSelfAttentionMeta 
const *m, // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_heads * entries_above_diagonal; + if (num_new_tokens > 1) { + size_t parallelism = m->num_heads * num_new_tokens * total_tokens; spec_fill_entries_above_diagonal<<num_heads, - entries_above_diagonal, -INFINITY); } // Compute Softmax(QK^T/sqrt(d_k)) diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 39a7ceaca3..452a1be7b2 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -375,18 +375,17 @@ __global__ void update_tree_branch_kv_cache( } __global__ void tree_fill_entries_above_diagonal(float *matrix, - size_t num_rows, - size_t num_cols, + size_t new_tokens, + size_t total_tokens_in_request, size_t num_heads, - size_t entries_above_diagonal, float value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_heads) { + //size_t head_idx = i / (new_tokens * total_tokens_in_request); + size_t src_idx = (i / new_tokens) % total_tokens_in_request; + size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; + // Casual Mask + if (src_idx > dst_idx) + matrix[i] = value; } } @@ -517,9 +516,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // Fill all elements above diagonal in qk prods with -inf to force // causal attention. 
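Both spec_fill_entries_above_diagonal and tree_fill_entries_above_diagonal drop the closed-form triangular indexing (the sqrt-based decoding of an entry index into row/column) in favor of scanning the full num_new_tokens x total_tokens product per head and masking by position. A CPU reference of the new rule for a single head (a sketch; the matrix is the column-major QK^T block written by the preceding GEMM):

    #include <limits>

    void fill_causal_mask(float *qk_prods,        // one head, column-major
                          int num_new_tokens,     // rows: newly added (query) tokens
                          int total_tokens,       // cols: key positions in the request
                          float value = -std::numeric_limits<float>::infinity()) {
      for (int k = 0; k < total_tokens; k++) {          // key position
        for (int q = 0; q < num_new_tokens; q++) {      // which new token
          int q_abs = q + total_tokens - num_new_tokens;  // absolute query position
          if (k > q_abs) {
            qk_prods[k * num_new_tokens + q] = value;     // future key: mask it out
          }
        }
      }
    }

The trade-off versus the old version: every entry of the product is visited rather than only those above the diagonal, but the index arithmetic needs no triangular decoding, and the launch size (num_heads * new_tokens * total_tokens) is straightforward to reason about.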
assert(num_new_tokens <= total_tokens_in_request); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_heads * entries_above_diagonal; + if (num_new_tokens > 1) { + size_t parallelism = m->num_heads * num_new_tokens * total_tokens_in_request; tree_fill_entries_above_diagonal<<num_heads, - entries_above_diagonal, -INFINITY); } // Compute Softmax(QK^T/sqrt(d_k)) diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index c1bd02494f..a50ca5ad95 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -30,6 +30,7 @@ void RequestManager::load_tokens_task( BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; + assert(batch_config.num_tokens <= BatchConfig::MAX_NUM_TOKENS); for (int i = 0; i < batch_config.num_tokens; i++) { dram_copy[i] = batch_config.tokensInfo[i].token_id; } From f779d894d6f45641c7fd53e444d32a96bd7c5954 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Thu, 11 May 2023 23:33:27 -0400 Subject: [PATCH 122/344] [inference] serving opt pipeline (#722) * serving opt pipeline * format --- CMakeLists.txt | 6 +- .../CMakeLists.txt | 4 +- .../Makefile | 4 +- .../README.md | 0 .../llama.cc | 0 .../llama_rae.cc | 0 examples/cpp/inference/models/llama.cc | 5 - examples/cpp/inference/models/opt.cc | 229 ++++++++++++++++++ examples/cpp/inference/models/opt.h | 65 +++++ examples/cpp/inference/opt/opt.cc | 20 +- examples/cpp/inference/opt/opt.h | 26 -- examples/cpp/inference/opt/opt_baseline.py | 10 +- .../opt_spec_pipeline/CMakeLists.txt | 20 ++ .../cpp/inference/opt_spec_pipeline/Makefile | 37 +++ .../opt_spec_pipeline/opt_pipeline.cc | 189 +++++++++++++++ include/flexflow/model.h | 10 +- .../ops/spec_inc_multihead_self_attention.h | 18 +- ...spec_inc_multihead_self_attention_params.h | 5 +- .../ops/tree_inc_multihead_self_attention.h | 18 +- ...tree_inc_multihead_self_attention_params.h | 5 +- src/ops/spec_inc_multihead_self_attention.cc | 134 +++++++++- src/ops/spec_inc_multihead_self_attention.cpp | 3 +- src/ops/spec_inc_multihead_self_attention.cu | 132 +++++++--- src/ops/tree_inc_multihead_self_attention.cc | 129 +++++++++- src/ops/tree_inc_multihead_self_attention.cpp | 3 +- src/ops/tree_inc_multihead_self_attention.cu | 100 +++++++- src/runtime/graph.cc | 28 ++- 27 files changed, 1083 insertions(+), 117 deletions(-) rename examples/cpp/inference/{spec_verify_pipeline => llama_spec_pipeline}/CMakeLists.txt (87%) rename examples/cpp/inference/{spec_verify_pipeline => llama_spec_pipeline}/Makefile (93%) rename examples/cpp/inference/{spec_verify_pipeline => llama_spec_pipeline}/README.md (100%) rename examples/cpp/inference/{spec_verify_pipeline => llama_spec_pipeline}/llama.cc (100%) rename examples/cpp/inference/{spec_verify_pipeline => llama_spec_pipeline}/llama_rae.cc (100%) create mode 100644 examples/cpp/inference/models/opt.cc create mode 100644 examples/cpp/inference/models/opt.h create mode 100644 examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt create mode 100644 examples/cpp/inference/opt_spec_pipeline/Makefile create mode 100644 examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index ff6e90f200..45e4dfb328 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -553,7 +553,11 @@ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) endif() 
if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/spec_verify_pipeline) + add_subdirectory(examples/cpp/inference/llama_spec_pipeline) +endif() + +if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/inference/opt_spec_pipeline) endif() # installation diff --git a/examples/cpp/inference/spec_verify_pipeline/CMakeLists.txt b/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt similarity index 87% rename from examples/cpp/inference/spec_verify_pipeline/CMakeLists.txt rename to examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt index f93189af36..aeb3d8891b 100644 --- a/examples/cpp/inference/spec_verify_pipeline/CMakeLists.txt +++ b/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.10) -project(FlexFlowExample_spec_verify_pipeline) -set(project_target spec_verify_pipeline) +project(FlexFlowExample_llama_pipeline) +set(project_target llama_pipeline) set(CPU_SRC diff --git a/examples/cpp/inference/spec_verify_pipeline/Makefile b/examples/cpp/inference/llama_spec_pipeline/Makefile similarity index 93% rename from examples/cpp/inference/spec_verify_pipeline/Makefile rename to examples/cpp/inference/llama_spec_pipeline/Makefile index 130d52a7ee..0e4b79f51f 100644 --- a/examples/cpp/inference/spec_verify_pipeline/Makefile +++ b/examples/cpp/inference/llama_spec_pipeline/Makefile @@ -23,10 +23,8 @@ USE_HDF ?= 1 # Include HDF5 support (requires HDF5) ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) # Put the binary file name here -OUTFILE ?= spec_verify_pipeline +OUTFILE ?= llama_pipeline # List all the application source files here -GEN_SRC = llama.cc dataloader.cc -GEN_GPU_SRC = dataloader.cu ifndef CUDA_HOME CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) endif diff --git a/examples/cpp/inference/spec_verify_pipeline/README.md b/examples/cpp/inference/llama_spec_pipeline/README.md similarity index 100% rename from examples/cpp/inference/spec_verify_pipeline/README.md rename to examples/cpp/inference/llama_spec_pipeline/README.md diff --git a/examples/cpp/inference/spec_verify_pipeline/llama.cc b/examples/cpp/inference/llama_spec_pipeline/llama.cc similarity index 100% rename from examples/cpp/inference/spec_verify_pipeline/llama.cc rename to examples/cpp/inference/llama_spec_pipeline/llama.cc diff --git a/examples/cpp/inference/spec_verify_pipeline/llama_rae.cc b/examples/cpp/inference/llama_spec_pipeline/llama_rae.cc similarity index 100% rename from examples/cpp/inference/spec_verify_pipeline/llama_rae.cc rename to examples/cpp/inference/llama_spec_pipeline/llama_rae.cc diff --git a/examples/cpp/inference/models/llama.cc b/examples/cpp/inference/models/llama.cc index 46f83f8198..7686ba746d 100644 --- a/examples/cpp/inference/models/llama.cc +++ b/examples/cpp/inference/models/llama.cc @@ -83,11 +83,6 @@ void LLAMA::create_llama_model(FFModel &ff, "_attention_norm_weight", attention_norm); - // std::cout << "------before att shape"; - // std::cout << att_norm->num_dims << "------\n"; - // for (int i = 0; i < att_norm->num_dims; i++) { - // std::cout << att_norm->dims[i] << "------\n"; - // } Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { diff --git a/examples/cpp/inference/models/opt.cc b/examples/cpp/inference/models/opt.cc new file mode 100644 index 0000000000..52d1ed6a84 --- /dev/null +++ b/examples/cpp/inference/models/opt.cc @@ -0,0 +1,229 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, 
NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "opt.h" + +namespace FlexFlow { + +using namespace Legion; + +void OPT::create_opt_model(FFModel &ff, + InferenceManager &im, + Config const &opt_config, + int num_pipeline_stages, + InferenceMode mode) { + //------------------------------compute machine views ------------------ + int num_devices = ff.config.workersPerNode * ff.config.numNodes; + std::vector machine_views; + for (int i = 0; i < num_devices; i++) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = 1; + view.stride[0] = 0; + view.start_device_id = i; + machine_views.push_back(view); + } + + std::unordered_map> mapping; + std::unordered_map weights_layers; + + //------------------------------ build the model -------------------------- + Tensor input; + Tensor position_input; + { + int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + input = ff.create_tensor<2>(token_dims, DT_INT32); + position_input = ff.create_tensor<2>(token_dims, DT_INT32); + } + mapping[input].push_back(machine_views[0]); + mapping[position_input].push_back(machine_views[0]); + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + std::vector axes = {0}; + + Tensor token = ff.embedding(input, + opt_config.vocab_size, + opt_config.word_embed_proj_dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + Layer *embedding = ff.layers.back(); + + weights_layers.emplace("embed_tokens_weight", embedding); + + Tensor positional_embedding = ff.embedding(position_input, + opt_config.max_position_embeddings, + opt_config.hidden_size, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + Layer *pos_embedding = ff.layers.back(); + weights_layers.emplace("embed_positions_weight", pos_embedding); + + Tensor residual = ff.add(token, positional_embedding); + + int num_transformer_layers_per_stage = + (32 + num_pipeline_stages - 1) / num_pipeline_stages; + + for (int i = 0; i < opt_config.num_hidden_layers; i++) { + // 125m, 1.7B, ..., 175B applies layer norm BEFORE attention, + // 350m applies layer norm AFTER attention + // https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#LL324C1-L325C1 + // this version is before normalization + + Tensor hidden_states = ff.layer_norm( + residual, axes, opt_config.layer_norm_elementwise_affine, 1e-05); + Layer *self_attn_layer_norm = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + + "_self_attn_layer_norm_weight", + self_attn_layer_norm); + + if (i % num_transformer_layers_per_stage == 0) { + mapping[hidden_states].push_back( + machine_views[i / num_transformer_layers_per_stage]); + } + + Tensor mha; + switch (mode) { + case BEAM_SEARCH_MODE: { + mha = ff.spec_inc_multihead_self_attention( + hidden_states, + opt_config.hidden_size, + opt_config.num_attention_heads, + opt_config.hidden_size / opt_config.num_attention_heads, + opt_config.hidden_size / opt_config.num_attention_heads, + 
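The pipeline-placement rule used by create_opt_model is a simple ceiling division: each stage owns a block of consecutive decoder layers, and the first layer of each block pins its tensors to that stage's MachineView. Factored out as a sketch (note that the builder above hardcodes 32 where num_layers appears here, rather than using opt_config.num_hidden_layers; with a single pipeline stage the difference has no effect):

    int layers_per_stage(int num_layers, int num_stages) {
      return (num_layers + num_stages - 1) / num_stages;  // ceiling division
    }

    int stage_for_layer(int layer_idx, int num_layers, int num_stages) {
      return layer_idx / layers_per_stage(num_layers, num_stages);
    }

    // Example: 12 decoder layers on 4 stages -> layers {0,1,2} on stage 0,
    // {3,4,5} on stage 1, and so on.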
0.0f, + true, + false, + false, + NULL, + false, + /*scaling query*/ true, + /*sacling factor*/ + pow((opt_config.hidden_size / opt_config.num_attention_heads), + -0.5), + /*qk_prod_scaling*/ false); + break; + } + case TREE_VERIFY_MODE: { + mha = ff.inc_multihead_self_attention_verify( + hidden_states, + opt_config.hidden_size, + opt_config.num_attention_heads, + opt_config.hidden_size / opt_config.num_attention_heads, + opt_config.hidden_size / opt_config.num_attention_heads, + 0.0f, + true, + false, + false, + NULL, + false, + /*scaling query*/ true, + /*sacling factor*/ + pow((opt_config.hidden_size / opt_config.num_attention_heads), + -0.5), + /*qk_prod_scaling*/ false); + break; + } + case INC_DECODING_MODE: { + mha = ff.inc_multihead_self_attention( + hidden_states, + opt_config.hidden_size, + opt_config.num_attention_heads, + opt_config.hidden_size / opt_config.num_attention_heads, + opt_config.hidden_size / opt_config.num_attention_heads, + 0.0f, + true, + false, + false, + NULL, + false, + /*scaling query*/ true, + /*sacling factor*/ + pow((opt_config.hidden_size / opt_config.num_attention_heads), + -0.5), + /*qk_prod_scaling*/ false); + break; + } + default: { + assert(false); + } + } + + Layer *attention_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", + attention_layer); + + Tensor added = ff.add(mha, residual); + + Tensor final_norm = ff.layer_norm( + added, axes, opt_config.layer_norm_elementwise_affine, 1e-05); + Layer *final_layer_norm = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + + "_final_layer_norm_weight", + final_layer_norm); + + //--------linear fc1 fc2 ---------- + Tensor fc1 = ff.dense(final_norm, opt_config.ffn_dim, AC_MODE_NONE, true); + Layer *fc1_linear = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_fc1_weight", + fc1_linear); + Tensor activation = ff.relu(fc1, false); + + Tensor fc2 = + ff.dense(activation, opt_config.hidden_size, AC_MODE_NONE, true); + Layer *fc2_linear = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_fc2_weight", + fc2_linear); + residual = ff.add(added, fc2); + } + + // final + Tensor all_final_norm = ff.layer_norm( + residual, axes, opt_config.layer_norm_elementwise_affine, 1e-05); + Layer *all_final_norm_layer = ff.layers.back(); + weights_layers.emplace("final_layer_norm_weight", all_final_norm_layer); + + Tensor lm_head = + ff.dense(all_final_norm, opt_config.vocab_size, AC_MODE_NONE, false); + Layer *lm_head_layer = ff.layers.back(); + weights_layers.emplace("embed_tokens_weight_lm_head", lm_head_layer); + + Tensor output; + if (mode == BEAM_SEARCH_MODE) { + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.beam_top_k(softmax, opt_config.max_beam_width, false); + } else { + output = ff.arg_top_k(lm_head, /*k=*/1, false); + } + + //------------------- compile the model -------------------------------- + std::cout << "------start compile ----------" << std::endl; + im.compile_model_and_allocate_buffer(&ff, mapping); + FileDataLoader fileloader(opt_config.input_path, + opt_config.weight_file_path, + opt_config.num_attention_heads, + opt_config.hidden_size, + opt_config.hidden_size / + opt_config.num_attention_heads); + fileloader.load_weights(&ff, weights_layers); + std::cout << "------load wieght finished----------" << std::endl; + im.init_operators_inference(&ff); +} + +}; // namespace FlexFlow diff --git a/examples/cpp/inference/models/opt.h b/examples/cpp/inference/models/opt.h new 
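For OPT the attention is constructed with scaling_query = true, scaling_factor = head_dim^-0.5, and qk_prod_scaling = false: the 1/sqrt(d_k) factor is folded into Q before the first GEMM instead of being applied to the QK^T product, which matches how the HuggingFace OPT attention applies its scale. The two placements are mathematically identical, since (sQ)K^T = s(QK^T); a scalar-sized sanity check (illustrative only):

    #include <cassert>
    #include <cmath>

    int main() {
      float q = 0.8f, k = -1.3f;
      int head_dim = 64;  // hidden_size / num_attention_heads
      float s = std::pow((float)head_dim, -0.5f);
      float scale_query_first = (q * s) * k;    // scaling_query=true, qk_prod_scaling=false
      float scale_product_after = (q * k) * s;  // scaling_query=false, qk_prod_scaling=true
      assert(std::fabs(scale_query_first - scale_product_after) < 1e-6f);
      return 0;
    }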
file mode 100644 index 0000000000..11ae888eba --- /dev/null +++ b/examples/cpp/inference/models/opt.h @@ -0,0 +1,65 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "file_loader.h" +#include "flexflow/batch_config.h" +#include "flexflow/inference.h" + +namespace FlexFlow { + +class OPT { +public: + struct Config { + Config(void) { + vocab_size = 50272, word_embed_proj_dim = 768, hidden_size = 768; + max_position_embeddings = 2048; + layer_norm_elementwise_affine = true; + num_attention_heads = 12; + dropout = 0.1; + seed = 3; + ffn_dim = 3072; + num_hidden_layers = 12; + max_beam_width = 1; + batchSize = 8; + sentence_len = 100; + max_beam_depth = 4; + } + int vocab_size; + int word_embed_proj_dim; + int hidden_size; + int num_attention_heads; + std::string input_path; + std::string weight_file_path; + int max_position_embeddings; + bool layer_norm_elementwise_affine; + float dropout; + unsigned long long seed; + int ffn_dim; + int num_hidden_layers; + int max_beam_width; + int batchSize; + int sentence_len; + int max_beam_depth; + }; + + static void create_opt_model(FFModel &ff, + InferenceManager &im, + Config const &opt_config, + int num_pipeline_stages, + InferenceMode mode); +}; + +}; // namespace FlexFlow diff --git a/examples/cpp/inference/opt/opt.cc b/examples/cpp/inference/opt/opt.cc index c2932df926..453633c383 100644 --- a/examples/cpp/inference/opt/opt.cc +++ b/examples/cpp/inference/opt/opt.cc @@ -21,6 +21,16 @@ using namespace Legion; LegionRuntime::Logger::Category log_app("opt"); +void parse_input_args(char **argv, int argc, OptConfig &config) { + for (int i = 1; i < argc; i++) { + // weights + if (!strcmp(argv[i], "--weights")) { + config.weight_file_path = std::string(argv[++i]); + continue; + } + } +} + void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, Context ctx, @@ -28,6 +38,12 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; OptConfig optConfig; FFModel ff(ffconfig); + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, optConfig); + //------------------------------compute machine views ------------------ int num_devices = ffconfig.workersPerNode * ffconfig.numNodes; std::vector machine_views; @@ -48,7 +64,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor input; Tensor position_input; { - int const token_dims[] = {1, 9}; + int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -180,7 +196,7 @@ void FlexFlow::top_level_task(Task const *task, optConfig.num_attention_heads); //"Today is a beautiful day and I want" std::vector prompt = {2, 5625, 16, 10, 2721, 183, 8, 38, 236}; - rm.register_new_request(prompt, 20); + rm.register_new_request(prompt, 30); 
fileloader.load_weights(&ff, weights_layers); im.init_operators_inference(&ff); diff --git a/examples/cpp/inference/opt/opt.h b/examples/cpp/inference/opt/opt.h index d581b73df9..6b9a45f2d5 100644 --- a/examples/cpp/inference/opt/opt.h +++ b/examples/cpp/inference/opt/opt.h @@ -17,30 +17,6 @@ #include "file_loader.h" #include "inference_config.h" -// # OPTConfig { -// # "_remove_final_layer_norm": false, -// # "activation_function": "relu", -// # "attention_dropout": 0.0, -// # "bos_token_id": 2, -// # "do_layer_norm_before": true, -// # "dropout": 0.1, -// # "enable_bias": true, -// # "eos_token_id": 2, -// # "ffn_dim": 3072, -// # "hidden_size": 768, -// # "init_std": 0.02, -// # "layer_norm_elementwise_affine": true, -// # "layerdrop": 0.0, -// # "max_position_embeddings": 2048, -// # "model_type": "opt", -// # "num_attention_heads": 12, -// # "num_hidden_layers": 12, -// # "pad_token_id": 1, -// # "transformers_version": "4.27.2", -// # "use_cache": true, -// # "vocab_size": 50272, -// # "word_embed_proj_dim": 768 -// # } struct OptConfig : InferenceConfig { OptConfig(void) : InferenceConfig() { vocab_size = 50272, word_embed_proj_dim = 768, hidden_size = 768; @@ -51,8 +27,6 @@ struct OptConfig : InferenceConfig { seed = 3; ffn_dim = 3072; num_hidden_layers = 12; - weight_file_path = - "/home/ubuntu/FlexFlow/examples/cpp/inference/opt/weights/"; } int word_embed_proj_dim; std::string input_path; diff --git a/examples/cpp/inference/opt/opt_baseline.py b/examples/cpp/inference/opt/opt_baseline.py index 5574af259a..3e8d7499f0 100644 --- a/examples/cpp/inference/opt/opt_baseline.py +++ b/examples/cpp/inference/opt/opt_baseline.py @@ -11,13 +11,13 @@ for prompt in prompts: input_ids = tokenizer(prompt, return_tensors="pt", padding=True).input_ids print(input_ids) - generated_ids = model.generate(input_ids, max_length=20) + generated_ids = model.generate(input_ids, max_length=30) generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) print(generated_ids) print(generated_string) #get same results with this and opt.cc - -# tensor([[ 2, 5625, 16, 10, 2721, 183, 8, 38, 236, 7, 458, 19, - # 47, 5, 2770, 527, 9, 127, 78, 655]]) -# 2, 5625, 16, 10, 2721, 183, 8, 38, 236, 7, 458, 19, 47, 5, 2770, 527, 9, 127, 78, 655 \ No newline at end of file +# tensor([[ 2, 5625, 16, 10, 2721, 183, 8, 38, 236, 7, +# 458, 19, 47, 5, 2770, 527, 9, 127, 78, 655, +# 1805, 7, 5, 4105, 4, 50118, 100, 21, 98, 2283]]) +# 2, 5625, 16, 10, 2721, 183, 8, 38, 236, 7, 458, 19, 47, 5, 2770, 527, 9, 127, 78, 655, 1805, 7, 5, 4105, 4, 50118, 100, 21, 98, 2283, \ No newline at end of file diff --git a/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt b/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt new file mode 100644 index 0000000000..7bab587713 --- /dev/null +++ b/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlowExample_opt_pipeline) +set(project_target opt_pipeline) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + opt_pipeline.cc + ../file_loader.cc + ../models/opt.cc) + + +cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS 
${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/opt_spec_pipeline/Makefile b/examples/cpp/inference/opt_spec_pipeline/Makefile new file mode 100644 index 0000000000..b4a7866073 --- /dev/null +++ b/examples/cpp/inference/opt_spec_pipeline/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= opt_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc b/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc new file mode 100644 index 0000000000..a1f50e230d --- /dev/null +++ b/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc @@ -0,0 +1,189 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "models/opt.h" + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("opt"); + +void parse_input_args(char **argv, int argc, OPT::Config &config) { + for (int i = 1; i < argc; i++) { + // weights + if (!strcmp(argv[i], "--weights")) { + config.weight_file_path = std::string(argv[++i]); + continue; + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + OPT::Config opt_config; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, opt_config); + InferenceManager im(ffconfig, opt_config.batchSize, 1); + RequestManager rm; + // Add a single request + std::vector prompt = { + 2, 5625, 16, 10, 2721, 183, 8, 38, 236}; + rm.register_new_request(prompt, opt_config.sentence_len); + + FFModel beam_model(ffconfig), tree_model(ffconfig); + OPT::create_opt_model(beam_model, im, opt_config, 1, BEAM_SEARCH_MODE); + OPT::create_opt_model(tree_model, im, opt_config, 1, TREE_VERIFY_MODE); + + // entry--------------------------- + int depth = 0; + std::map beam_future_handlers, tree_future_handler; + std::map beam_batch_configs; + std::map tree_batch_configs; + + bool new_req = true; + TreeVerifyBatchConfig tree_bc; + + int iteration = 0; + + while (depth < opt_config.max_beam_depth) { + int bid = 0; + if (beam_future_handlers.find(bid) == beam_future_handlers.end()) { + BeamSearchBatchConfig bc; + InferenceResult ir; + bc = rm.prepare_next_batch_init(tree_bc, ir); + + std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; + FutureMap fm = im.inference(&beam_model, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; + } else { + // have luanched this bid + Future future = beam_future_handlers[bid]; + if (!future.is_ready(true /*subscribe*/)) { + continue; + } else { + std::cout << "future is ready...." 
<< std::endl; + } + // process end + BeamInferenceResult ir = future.get_result(); + BeamSearchBatchConfig bc = beam_batch_configs[bid]; + depth = bc.beamRequestsInfo[0].current_depth; + bc = rm.prepare_next_batch_beam(bc, ir); + + std::cout << "opt current depth: " << depth << std::endl; + std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; + FutureMap fm = im.inference(&beam_model, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; + + // tranverse the tree in dfs order; + if (depth >= opt_config.max_beam_depth) { + + printf("\n\n ------Final Beam Search Batch------\n"); + printf("[Beam] num_tokens: %d\n", bc.num_tokens); + for (int i = 0; i < bc.num_tokens; i++) { + std::cout << "[Token] Request Index: " + << bc.tokensInfo[i].request_index + << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request + << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; + } + + // printf("\n\n prepare tree_bc from final beam search bc\n"); + tree_bc = rm.prepare_next_batch_verify(bc); + + printf("\n\n\n ------Tree Verify Batch-------\n"); + // should have the same content as the hardcoded verification block + // below right now, it only contains the prompt need to add in the beam + // search result + + printf("[Verify] num_tokens : %d\n", tree_bc.num_tokens); + printf("[Verify] num_tokens_in_batch: %d\n", + tree_bc.requestsInfo[0].num_tokens_in_batch); + printf("------------------------------\n"); + + for (int i = 0; i < tree_bc.num_tokens; i++) { + std::cout << "[Token] Request Index: " + << tree_bc.tokensInfo[i].request_index << ", Abs Depth: " + << tree_bc.tokensInfo[i].abs_depth_in_request + << ", Token Id: " << tree_bc.tokensInfo[i].token_id << "\n"; + } + + printf("\n\n ------Commit Verified Tokens-------\n"); + for (int i = 0; i < tree_bc.num_tokens_to_commit; i++) { + std::cout << "[Commit] Request Index: " + << tree_bc.commited_tokens[i].request_index + << ", Abs Depth: " << tree_bc.commited_tokens[i].token_depth + << ", Token Index in batch: " + << tree_bc.commited_tokens[i].token_index << "\n"; + } + + FutureMap fm = im.inference(&tree_model, 0, tree_bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + InferenceResult ir = future.get_result(); + for (int i = 0; i < tree_bc.num_tokens; i++) { + if (i == 7) { + std::cout << "------------------\n"; + } + printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); + } + + std::cout << "------Init New Beam Search Batch------\n"; + bc = rm.prepare_next_batch_init(tree_bc, ir); + std::cout << "[Init] num_tokens: " << bc.num_tokens << "\n"; + for (int i = 0; i < bc.num_tokens; i++) { + std::cout << "[Token] Request Index: " + << bc.tokensInfo[i].request_index + << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request + << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; + } + std::cout << "Batch Depth: " << bc.beamRequestsInfo[0].current_depth + << "\n"; + + iteration++; + + if (iteration < 4) { + std::cout << "\n\n~~~~~~~~~~teration " << iteration << "~~~~~~~~~~\n"; + depth = bc.beamRequestsInfo[0].current_depth; + fm = im.inference(&beam_model, bid, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + beam_future_handlers[bid] = fm.get_future(0); + beam_batch_configs[bid] = bc; + } else { + break; + } + } + } + } + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference 
finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/include/flexflow/model.h b/include/flexflow/model.h index a246cf37c3..e406b1af29 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -628,11 +628,14 @@ class FFModel { int kdim = 0, int vdim = 0, float dropout = 0.0f, - bool bias = true, + bool bias = false, bool add_bias_kv = false, bool add_zero_attn = false, Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, char const *name = NULL); Tensor inc_multihead_self_attention_verify( const Tensor input, @@ -641,11 +644,14 @@ class FFModel { int kdim = 0, int vdim = 0, float dropout = 0.0f, - bool bias = true, + bool bias = false, bool add_bias_kv = false, bool add_zero_attn = false, Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, char const *name = NULL); Tensor create_tensor_legion_ordering(int num_dim, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 4df85cd04e..4acab0a6ab 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -34,6 +34,9 @@ class SpecIncMultiHeadSelfAttention : public Op { bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, @@ -48,6 +51,9 @@ class SpecIncMultiHeadSelfAttention : public Op { bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, @@ -100,14 +106,16 @@ class SpecIncMultiHeadSelfAttention : public Op { BeamSearchBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr); + float *output_ptr, + float const *bias_ptr); Params get_params() const; public: int num_heads; - float dropout; + float dropout, scaling_factor; bool bias; - bool add_bias_kv, add_zero_attn, apply_rotary_embedding; + bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + qk_prod_scaling; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; }; @@ -129,6 +137,10 @@ class SpecIncMultiHeadSelfAttentionMeta : public OpMeta { int num_heads; bool *has_load_weights; bool *apply_rotary_embedding; + bool *bias; + bool *scaling_query; + bool *qk_prod_scaling; + float scaling_factor; #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 00e1179a14..5995e95fe1 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -9,8 +9,9 @@ namespace FlexFlow { struct SpecIncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_heads, kdim, vdim; - float dropout; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + float dropout, scaling_factor; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + 
qk_prod_scaling; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 61e7b69fe9..0e54bd50d0 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -34,6 +34,9 @@ class TreeIncMultiHeadSelfAttention : public Op { bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, @@ -48,6 +51,9 @@ class TreeIncMultiHeadSelfAttention : public Op { bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, @@ -96,14 +102,16 @@ class TreeIncMultiHeadSelfAttention : public Op { TreeVerifyBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr); + float *output_ptr, + float const *bias_ptr); Params get_params() const; public: int num_heads; - float dropout; + float dropout, scaling_factor; bool bias; - bool add_bias_kv, add_zero_attn, apply_rotary_embedding; + bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + qk_prod_scaling; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; }; @@ -126,6 +134,10 @@ class TreeIncMultiHeadSelfAttentionMeta : public OpMeta { int num_active_tokens; bool *has_load_weights; bool *apply_rotary_embedding; + bool *bias; + bool *scaling_query; + bool *qk_prod_scaling; + float scaling_factor; #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index 7a4a258850..f8fbac7e8e 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -9,8 +9,9 @@ namespace FlexFlow { struct TreeIncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_heads, kdim, vdim; - float dropout; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + float dropout, scaling_factor; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + qk_prod_scaling; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 11911a9ba3..a764fbe8fa 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -66,14 +66,18 @@ Tensor bool add_zero_attn, Initializer *kernel_initializer, bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, char const *name) { // Currently assume that + int weight_num = bias ? 
2 : 1; Layer *li = new Layer(this, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + weight_num /*weights*/, 1 /*outputs*/, input); { @@ -104,6 +108,17 @@ Tensor kernel_initializer, CHOSEN_SYNC_TYPE); } + if (bias) { + // q, k, v, o + int dims[1] = {embed_dim * 4}; + li->weights[1] = create_weight_legion_ordering(1, + dims, + DT_FLOAT, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } li->data_type = DT_FLOAT; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); @@ -114,6 +129,9 @@ Tensor li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("scaling_query", scaling_query); + li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); layers.push_back(li); return li->outputs[0]; } @@ -143,6 +161,12 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( bool add_zero_attn = (bool)value; layer->get_int_property("apply_rotary_embedding", value); bool apply_rotary_embedding = (bool)value; + layer->get_int_property("scaling_query", value); + bool scaling_query = (bool)value; + float scaling_factor; + layer->get_float_property("scaling_factor", scaling_factor); + layer->get_int_property("qk_prod_scaling", value); + bool qk_prod_scaling = (bool)value; return new SpecIncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], @@ -155,6 +179,9 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( add_bias_kv, add_zero_attn, apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, false /*allocate_weights*/, layer->name); } @@ -172,6 +199,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -180,7 +210,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + (_bias ? 
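With bias enabled the attention layer carries a second weight: the 1-D tensor of length 4 * embed_dim created above, holding the q, k, v, and output-projection biases back to back (assuming kdim and vdim equal embed_dim / num_heads, as in the OPT builder). Launchers and tasks then have to agree on the extra region. A compact summary of that bookkeeping (a sketch, not FlexFlow API):

    // regions[0] = input, regions[1] = weights[0] (projection matrices),
    // regions[2] = output, regions[3] = weights[1] (packed biases, only when bias is set)
    int attention_num_weights(bool bias) { return bias ? 2 : 1; }
    int attention_num_inference_regions(bool bias) { return bias ? 4 : 3; }

    // Offsets of each section inside the packed bias vector of length 4 * embed_dim:
    int q_bias_offset(int embed_dim) { return 0 * embed_dim; }
    int k_bias_offset(int embed_dim) { return 1 * embed_dim; }
    int v_bias_offset(int embed_dim) { return 2 * embed_dim; }
    int o_bias_offset(int embed_dim) { return 3 * embed_dim; }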
2 : 1) /*weights*/, 1 /*outputs*/, _input), num_heads(_num_heads), dropout(_dropout), bias(_bias), @@ -189,7 +219,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling) { // overwrite layer_guid layer_guid = _layer_guid; @@ -233,7 +265,26 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( initializer, comm_type); } - + if (bias) { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[0] = inputs[0]->dims[num_dims - 1]; + dims[0].size = dims[0].degree; + dims[1].size = oProjSize * 4; + dims[1].degree = 1; + dims[1].parallel_idx = -1; +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[1] = model.create_parallel_weight<2>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + NULL, + comm_type); + } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, DT_FLOAT, this); /* for (int i = 0; i < numdim; i++) { */ @@ -256,6 +307,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -264,7 +318,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + (_bias ? 
2 : 1) /*weights*/, 1 /*outputs*/, _input, _weight), @@ -274,7 +328,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -315,6 +371,24 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( initializer, comm_type); } + if (bias) { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[0] = inputs[0]->dims[num_dims - 1]; + dims[0].size = dims[0].degree; + dims[1].size = oProjSize * 4; +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[1] = model.create_parallel_weight<2>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + NULL, + comm_type); + } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, DT_FLOAT, this); @@ -344,6 +418,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( other.add_bias_kv, other.add_zero_attn, other.apply_rotary_embedding, + other.scaling_query, + other.scaling_factor, + other.qk_prod_scaling, allocate_weights, other.name) {} @@ -365,6 +442,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.add_bias_kv, params.add_zero_attn, params.apply_rotary_embedding, + params.scaling_query, + params.scaling_factor, + params.qk_prod_scaling, allocate_weights, name) {} @@ -537,6 +617,15 @@ FutureMap SpecIncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); + + if (bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(idx++, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } @@ -550,12 +639,13 @@ void SpecIncMultiHeadSelfAttention::inference_task( std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 3); assert(task->regions.size() == regions.size()); + float const *bias_ptr = NULL; BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; SpecIncMultiHeadSelfAttentionMeta const *m = *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args); + assert((*m->bias ? 
regions.size() == 4 : regions.size() == 3)); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -564,6 +654,19 @@ void SpecIncMultiHeadSelfAttention::inference_task( GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + if (*m->bias) { + GenericTensorAccessorR biases = + helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 2); + bias_ptr = biases.get_float_ptr(); + } Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain weight_domain = runtime->get_index_space_domain( @@ -575,15 +678,13 @@ void SpecIncMultiHeadSelfAttention::inference_task( assert(weight_domain.get_dim() == 3); assert(output_domain.get_dim() == 4); - /* print_tensor(input.get_float_ptr(), - input_domain.get_volume(), - "[Attention:forward:query]"); */ SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, input.get_float_ptr(), weight.get_float_ptr(), - output.get_float_ptr()); + output.get_float_ptr(), + bias_ptr); // print_tensor(input.get_float_ptr(), 20, "attention input"); // print_tensor(output.get_float_ptr(), 20, "attention output"); @@ -630,7 +731,10 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding; + lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.scaling_query == rhs.scaling_query && + lhs.scaling_factor == rhs.scaling_factor && + lhs.qk_prod_scaling == rhs.qk_prod_scaling; } SpecIncMultiHeadSelfAttentionParams @@ -646,6 +750,9 @@ SpecIncMultiHeadSelfAttentionParams params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; params.apply_rotary_embedding = this->apply_rotary_embedding; + params.scaling_query = this->scaling_query; + params.scaling_factor = this->scaling_factor; + params.qk_prod_scaling = this->qk_prod_scaling; return params; } @@ -665,6 +772,9 @@ size_t hash::operator()( hash_combine(key, params.add_bias_kv); hash_combine(key, params.add_zero_attn); hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.scaling_query); + hash_combine(key, params.scaling_factor); + hash_combine(key, params.qk_prod_scaling); return key; } }; // namespace std diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 37305a83b0..765891ed53 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -29,7 +29,8 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( BeamSearchBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr) { + float *output_ptr, + float const *bias_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index ee1c6e389b..062ef4af03 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -40,6 +40,49 @@ __global__ void 
spec_build_w_out_tensor(float const *weight_ptr, } } +__global__ void spec_apply_proj_bias_w(float *input_ptr, + float const *bias_ptr, + int num_tokens, + int oProjSize) { + CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { + int bias_idx = 3 * oProjSize + i % oProjSize; + input_ptr[i] += bias_ptr[bias_idx]; + } +} + +__global__ void spec_apply_proj_bias_qkv(float *input_ptr, + float const *bias_ptr, + int num_tokens, + int qProjSize, + int kProjSize, + int vProjSize, + int num_heads, + bool scaling_query, + float scaling_factor) { + CUDA_KERNEL_LOOP( + i, num_tokens * (qProjSize + kProjSize + vProjSize) * num_heads) { + // for simplicity, assume q, k, v is in same shape + // 0->q, 1->k, 2->v + int qkv_index = i / (num_tokens * qProjSize) % 3; + + int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); + int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + int q_block_size = qProjSize * num_tokens; + + int idx = i % (num_tokens * (qProjSize)); + + int real_part_index = + head_idx * qkv_block_size + qkv_index * q_block_size + idx; + int bias_idx = qkv_index * qProjSize * num_heads + head_idx * qProjSize + + (idx % qProjSize); + input_ptr[real_part_index] += bias_ptr[bias_idx]; + + if (scaling_query && qkv_index == 0) { + input_ptr[real_part_index] *= scaling_factor; + } + } +} + __global__ void spec_apply_rotary_embedding(float *input_ptr, cuFloatComplex *complex_input, @@ -101,12 +144,13 @@ __global__ void } } -void inference_kernel1(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - cudaStream_t stream) { +void compute_qkv_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + float const *bias_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -219,7 +263,21 @@ void inference_kernel1(SpecIncMultiHeadSelfAttentionMeta const *m, int v_block_size = m->vProjSize * num_tokens; cuFloatComplex *complex_input; - // todo xinhao remember to set token index for each beam + // apply bias for q, k, v + if (*m->bias) { + spec_apply_proj_bias_qkv<<>>(output_ptr, + bias_ptr, + num_tokens, + m->qProjSize, + m->kProjSize, + m->vProjSize, + m->num_heads, + *m->scaling_query, + m->scaling_factor); + } if (*m->apply_rotary_embedding) { checkCUDA(cudaMalloc(&complex_input, @@ -386,12 +444,12 @@ __global__ void spec_store_kv_cache( } } -void inference_kernel2(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - cudaStream_t stream) { +void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; - printf("curr depth: %d\n", curr_depth); + // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->kProjSize * num_tokens * m->num_heads; @@ -442,19 +500,21 @@ __global__ void spec_fill_entries_above_diagonal(float *matrix, size_t num_heads, float value) { CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_heads) { - //size_t head_idx = i / (new_tokens * total_tokens_in_request); + // size_t head_idx = i / (new_tokens * total_tokens_in_request); size_t src_idx = (i / new_tokens) % total_tokens_in_request; size_t dst_idx = i % new_tokens 
+ total_tokens_in_request - new_tokens; // Casual Mask - if (src_idx > dst_idx) + if (src_idx > dst_idx) { matrix[i] = value; + } } } -void inference_kernel3(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - float *output_ptr, - cudaStream_t stream) { +void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + float *output_ptr, + float const *bias_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); @@ -497,7 +557,11 @@ void inference_kernel3(SpecIncMultiHeadSelfAttentionMeta const *m, int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; - float alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + // a flag of using this scaling alpha + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + } // To get A, skip over Q entries from previous requests (same head) void const *A = (void const *)(m->devQKVProjArray + tokens_previous_requests * m->qProjSize); @@ -548,11 +612,8 @@ void inference_kernel3(SpecIncMultiHeadSelfAttentionMeta const *m, min((size_t)CUDA_NUM_THREADS, parallelism), 0, - stream>>>((float *)C, - num_new_tokens, - total_tokens, - m->num_heads, - -INFINITY); + stream>>>( + (float *)C, num_new_tokens, total_tokens, m->num_heads, -INFINITY); } // Compute Softmax(QK^T/sqrt(d_k)) cudnnTensorDescriptor_t qk_tensor; @@ -669,6 +730,14 @@ void inference_kernel3(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_previous_requests += num_new_tokens; tokens_prev_requests_squares += num_new_tokens * total_tokens; } + if (*m->bias) { + int parallelism = m->oProjSize * num_tokens; + spec_apply_proj_bias_w<<>>( + output_ptr, bias_ptr, num_tokens, m->oProjSize); + } } assert(tokens_previous_requests == num_tokens); @@ -680,7 +749,8 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( BeamSearchBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr) { + float *output_ptr, + float const *bias_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -734,13 +804,14 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - inference_kernel1(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); + compute_qkv_kernel( + m, bc, input_ptr, weight_ptr, m->devQKVProjArray, bias_ptr, stream); // phase 2: Update key/val cache - inference_kernel2(m, bc, stream); + update_kv_cache_kernel(m, bc, stream); // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - inference_kernel3(m, bc, output_ptr, stream); + compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); if (m->profiling) { cudaEventRecord(t_end, stream); @@ -790,6 +861,13 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( *has_load_weights = false; apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); *apply_rotary_embedding = attn->apply_rotary_embedding; + bias = (bool *)calloc(1, sizeof(bool)); + *bias = attn->bias; + scaling_query = (bool *)calloc(1, sizeof(bool)); + *scaling_query = attn->scaling_query; + scaling_factor = attn->scaling_factor; + qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); + *qk_prod_scaling = attn->qk_prod_scaling; // Currently do not support adding bias to key/value 
projection assert(!attn->add_bias_kv); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 96e2541872..11c8e22e86 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -66,14 +66,18 @@ Tensor FFModel::inc_multihead_self_attention_verify( bool add_zero_attn, Initializer *kernel_initializer, bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, char const *name) { + int weight_num = bias ? 2 : 1; // Currently assume that Layer *li = new Layer(this, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + weight_num /*weights*/, 1 /*outputs*/, input); { @@ -104,6 +108,17 @@ Tensor FFModel::inc_multihead_self_attention_verify( kernel_initializer, CHOSEN_SYNC_TYPE); } + if (bias) { + // q, k, v, o + int dims[1] = {embed_dim * 4}; + li->weights[1] = create_weight_legion_ordering(1, + dims, + DT_FLOAT, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } li->data_type = DT_FLOAT; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); @@ -114,6 +129,9 @@ Tensor FFModel::inc_multihead_self_attention_verify( li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("scaling_query", scaling_query); + li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); layers.push_back(li); return li->outputs[0]; } @@ -141,6 +159,12 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( bool add_zero_attn = (bool)value; layer->get_int_property("apply_rotary_embedding", value); bool apply_rotary_embedding = (bool)value; + layer->get_int_property("scaling_query", value); + bool scaling_query = (bool)value; + float scaling_factor; + layer->get_float_property("scaling_factor", scaling_factor); + layer->get_int_property("qk_prod_scaling", value); + bool qk_prod_scaling = (bool)value; return new TreeIncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], @@ -153,6 +177,9 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( add_bias_kv, add_zero_attn, apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, false /*allocate_weights*/, layer->name); } @@ -170,6 +197,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -178,7 +208,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + (_bias ? 
2 : 1) /*weights*/, 1 /*outputs*/, _input), num_heads(_num_heads), dropout(_dropout), bias(_bias), @@ -187,7 +217,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling) { // overwrite layer_guid layer_guid = _layer_guid; @@ -231,6 +263,26 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( initializer, comm_type); } + if (bias) { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[0] = inputs[0]->dims[num_dims - 1]; + dims[0].size = dims[0].degree; + dims[1].size = oProjSize * 4; + dims[1].degree = 1; + dims[1].parallel_idx = -1; +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[1] = model.create_parallel_weight<2>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + NULL, + comm_type); + } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, DT_FLOAT, this); @@ -254,6 +306,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool _add_bias_kv, bool _add_zero_attn, bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -262,7 +317,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( DT_FLOAT, name, 1 /*inputs*/, - 1 /*weights*/, + (_bias ? 
2 : 1) /*weights*/, 1 /*outputs*/, _input, _weight), @@ -272,7 +327,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -313,6 +370,24 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( initializer, comm_type); } + if (bias) { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[0] = inputs[0]->dims[num_dims - 1]; + dims[0].size = dims[0].degree; + dims[1].size = oProjSize * 4; +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[1] = model.create_parallel_weight<2>(dims, + DT_FLOAT, + NULL /*owner_op*/, + true /*create_grad*/, + NULL, + comm_type); + } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, DT_FLOAT, this); @@ -342,6 +417,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( other.add_bias_kv, other.add_zero_attn, other.apply_rotary_embedding, + other.scaling_query, + other.scaling_factor, + other.qk_prod_scaling, allocate_weights, other.name) {} @@ -363,6 +441,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.add_bias_kv, params.add_zero_attn, params.apply_rotary_embedding, + params.scaling_query, + params.scaling_factor, + params.qk_prod_scaling, allocate_weights, name) {} @@ -536,6 +617,14 @@ FutureMap TreeIncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); + if (bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(idx++, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } @@ -549,12 +638,13 @@ void TreeIncMultiHeadSelfAttention::inference_task( std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 3); assert(task->regions.size() == regions.size()); + float const *bias_ptr = NULL; TreeVerifyBatchConfig const *bc = (TreeVerifyBatchConfig *)task->args; TreeIncMultiHeadSelfAttentionMeta *m = *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args); + assert((*m->bias ? 
regions.size() == 4 : regions.size() == 3)); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -562,6 +652,19 @@ void TreeIncMultiHeadSelfAttention::inference_task( m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + if (*m->bias) { + GenericTensorAccessorR biases = + helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 2); + bias_ptr = biases.get_float_ptr(); + } Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -583,7 +686,8 @@ void TreeIncMultiHeadSelfAttention::inference_task( bc, input.get_float_ptr(), weight.get_float_ptr(), - output.get_float_ptr()); + output.get_float_ptr(), + bias_ptr); #ifdef INFERENCE_TESTS printf("Checking TreeIncMultiHeadSelfAttention computations...\n"); @@ -1394,7 +1498,10 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding; + lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.scaling_query == rhs.scaling_query && + lhs.scaling_factor == rhs.scaling_factor && + lhs.qk_prod_scaling == rhs.qk_prod_scaling; } TreeIncMultiHeadSelfAttentionParams @@ -1410,6 +1517,9 @@ TreeIncMultiHeadSelfAttentionParams params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; params.apply_rotary_embedding = this->apply_rotary_embedding; + params.scaling_query = this->scaling_query; + params.scaling_factor = this->scaling_factor; + params.qk_prod_scaling = this->qk_prod_scaling; return params; } @@ -1429,6 +1539,9 @@ size_t hash::operator()( hash_combine(key, params.add_bias_kv); hash_combine(key, params.add_zero_attn); hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.scaling_query); + hash_combine(key, params.scaling_factor); + hash_combine(key, params.qk_prod_scaling); return key; } }; // namespace std diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 5e3fc240d5..f478e13660 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -29,7 +29,8 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeVerifyBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr) { + float *output_ptr, + float const *bias_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 452a1be7b2..86b857a7f9 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -40,6 +40,49 @@ __global__ void tree_build_w_out_tensor(float const *weight_ptr, } } +__global__ void tree_apply_proj_bias_w(float *input_ptr, + float const *bias_ptr, + int num_tokens, + int oProjSize) { + CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { + int bias_idx = 3 * oProjSize + i % oProjSize; + 
input_ptr[i] += bias_ptr[bias_idx]; + } +} + +__global__ void tree_apply_proj_bias_qkv(float *input_ptr, + float const *bias_ptr, + int num_tokens, + int qProjSize, + int kProjSize, + int vProjSize, + int num_heads, + bool scaling_query, + float scaling_factor) { + CUDA_KERNEL_LOOP( + i, num_tokens * (qProjSize + kProjSize + vProjSize) * num_heads) { + // for simplicity, assume q, k, v is in same shape + // 0->q, 1->k, 2->v + int qkv_index = i / (num_tokens * qProjSize) % 3; + + int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); + int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + int q_block_size = qProjSize * num_tokens; + + int idx = i % (num_tokens * (qProjSize)); + + int real_part_index = + head_idx * qkv_block_size + qkv_index * q_block_size + idx; + int bias_idx = qkv_index * qProjSize * num_heads + head_idx * qProjSize + + (idx % qProjSize); + input_ptr[real_part_index] += bias_ptr[bias_idx]; + + if (scaling_query && qkv_index == 0) { + input_ptr[real_part_index] *= scaling_factor; + } + } +} + __global__ void commit_tokens_kernel( float const *devQKVProjArray, float *cache_ptr, @@ -188,6 +231,7 @@ void compute_qkv_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, float const *input_ptr, float const *weight_ptr, float *output_ptr, + float const *bias_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -296,6 +340,22 @@ void compute_qkv_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int k_block_size = m->kProjSize * num_tokens; int v_block_size = m->vProjSize * num_tokens; cuFloatComplex *complex_input; + + // apply bias for q, k, v + if (*m->bias) { + tree_apply_proj_bias_qkv<<>>(output_ptr, + bias_ptr, + num_tokens, + m->qProjSize, + m->kProjSize, + m->vProjSize, + m->num_heads, + *m->scaling_query, + m->scaling_factor); + } if (*m->apply_rotary_embedding) { checkCUDA(cudaMalloc(&complex_input, num_tokens * m->qProjSize * m->num_heads * @@ -354,7 +414,7 @@ __global__ void update_tree_branch_kv_cache( (i / proj_size) % num_tokens_in_branch; // index in the tree branch int head_idx = i / (proj_size * num_tokens_in_branch); - token_idx += processed_tokens_in_batch; // get index in the whole batch + token_idx += processed_tokens_in_batch; // get index in the whole batch int qkv_block_size = (qProjSize + kProjSize + vProjSize) * total_tokens_in_batch; // skip over previous heads int current_head_block_size = @@ -380,18 +440,20 @@ __global__ void tree_fill_entries_above_diagonal(float *matrix, size_t num_heads, float value) { CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_heads) { - //size_t head_idx = i / (new_tokens * total_tokens_in_request); + // size_t head_idx = i / (new_tokens * total_tokens_in_request); size_t src_idx = (i / new_tokens) % total_tokens_in_request; size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; // Casual Mask - if (src_idx > dst_idx) + if (src_idx > dst_idx) { matrix[i] = value; + } } } void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, float *output_ptr, + float const *bias_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -479,7 +541,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens_in_request; - float alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; 
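// --- Editorial note: illustrative sketch only, not part of the original patch ---
// The two new flags split the usual 1/sqrt(d_k) attention scaling between two
// alternative places: `scaling_query` multiplies the projected Q activations by
// `scaling_factor` right after the QKV projection (see the *_apply_proj_bias_qkv
// kernels above), while `qk_prod_scaling` keeps the factor folded into the alpha
// of the QK^T GEMM, as in the hunk below. A minimal host-side helper capturing
// the alpha choice (the helper name is hypothetical):
#include <cmath>

inline float qk_gemm_alpha(bool qk_prod_scaling, int kProjSize) {
  // alpha multiplies the QK^T product inside the batched GEMM; when the model
  // already scaled Q at projection time, the product is left unscaled (alpha = 1).
  return qk_prod_scaling ? 1.0f / sqrtf((float)kProjSize) : 1.0f;
}
// --------------------------------------------------------------------------------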
+ // a flag of using this scaling alpha + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + } // To get A, skip over Q entries from previous requests (same head) void const *A = (void const *)(m->devQKVProjArray + processed_tokens_in_batch * m->qProjSize); @@ -517,7 +583,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // causal attention. assert(num_new_tokens <= total_tokens_in_request); if (num_new_tokens > 1) { - size_t parallelism = m->num_heads * num_new_tokens * total_tokens_in_request; + size_t parallelism = + m->num_heads * num_new_tokens * total_tokens_in_request; tree_fill_entries_above_diagonal<<bias) { + int parallelism = m->oProjSize * processed_tokens_in_batch; + tree_apply_proj_bias_w<<>>( + output_ptr, bias_ptr, processed_tokens_in_batch, m->oProjSize); + } assert(processed_tokens_in_batch == bc->num_active_tokens()); } @@ -654,7 +729,8 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeVerifyBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, - float *output_ptr) { + float *output_ptr, + float const *bias_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -705,7 +781,8 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, bc, input_ptr, weight_ptr, m->devQKVProjArray, stream); + compute_qkv_kernel( + m, bc, input_ptr, weight_ptr, m->devQKVProjArray, bias_ptr, stream); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( @@ -713,7 +790,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, output_ptr, stream); + compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); if (m->profiling) { cudaEventRecord(t_end, stream); @@ -761,6 +838,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( *has_load_weights = false; apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); *apply_rotary_embedding = attn->apply_rotary_embedding; + bias = (bool *)calloc(1, sizeof(bool)); + *bias = attn->bias; + scaling_query = (bool *)calloc(1, sizeof(bool)); + *scaling_query = attn->scaling_query; + scaling_factor = attn->scaling_factor; + qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); + *qk_prod_scaling = attn->qk_prod_scaling; // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index baf1b24f5d..bcead0bfaf 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2293,6 +2293,9 @@ GraphOptimalViewSerialized sez.serialize(attn->add_bias_kv); sez.serialize(attn->add_zero_attn); sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->scaling_query); + sez.serialize(attn->scaling_factor); + sez.serialize(attn->qk_prod_scaling); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2308,6 +2311,9 @@ GraphOptimalViewSerialized sez.serialize(attn->add_bias_kv); sez.serialize(attn->add_zero_attn); sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->scaling_query); + sez.serialize(attn->scaling_factor); + sez.serialize(attn->qk_prod_scaling); break; } case OP_SOFTMAX: { @@ -2711,8 +2717,9 @@ void FFModel::deserialize_graph_optimal_view( case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: 
{ assert(num_inputs == 1); int embed_dim, num_heads, k_dim, v_dim; - float dropout; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + float dropout, scaling_factor; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling; size_t id; dez.deserialize(id); LayerID layer_guid(id); @@ -2725,6 +2732,9 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(add_bias_kv); dez.deserialize(add_zero_attn); dez.deserialize(apply_rotary_embedding); + dez.deserialize(scaling_query); + dez.deserialize(scaling_factor); + dez.deserialize(qk_prod_scaling); SpecIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2737,6 +2747,9 @@ void FFModel::deserialize_graph_optimal_view( params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; params.apply_rotary_embedding = apply_rotary_embedding; + params.scaling_query = scaling_query; + params.scaling_factor = scaling_factor; + params.qk_prod_scaling = qk_prod_scaling; node = get_or_create_node(inputs[0], params); break; @@ -2744,8 +2757,9 @@ void FFModel::deserialize_graph_optimal_view( case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); int embed_dim, num_heads, k_dim, v_dim; - float dropout; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding; + float dropout, scaling_factor; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling; size_t id; dez.deserialize(id); LayerID layer_guid(id); @@ -2758,6 +2772,9 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(add_bias_kv); dez.deserialize(add_zero_attn); dez.deserialize(apply_rotary_embedding); + dez.deserialize(scaling_query); + dez.deserialize(scaling_factor); + dez.deserialize(qk_prod_scaling); TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2770,6 +2787,9 @@ void FFModel::deserialize_graph_optimal_view( params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; params.apply_rotary_embedding = apply_rotary_embedding; + params.scaling_query = scaling_query; + params.scaling_factor = scaling_factor; + params.qk_prod_scaling = qk_prod_scaling; node = get_or_create_node(inputs[0], params); break; From c08b1a067dee6974894153a0e15f3f9ed0a06f7e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 15 May 2023 02:58:54 +0800 Subject: [PATCH 123/344] add opt tokenizer functionality in C++ tokenizer (#727) Co-authored-by: Zhihao Jia --- examples/cpp/inference/gpt_tokenizer.cc | 4 ++ examples/cpp/inference/gpt_tokenizer.cpp | 13 +++-- examples/cpp/inference/gpt_tokenizer.h | 7 ++- examples/cpp/inference/gpt_tokenizer_test.sh | 52 ++++++++++++++++++-- 4 files changed, 67 insertions(+), 9 deletions(-) diff --git a/examples/cpp/inference/gpt_tokenizer.cc b/examples/cpp/inference/gpt_tokenizer.cc index c349bfacf3..437c4aa551 100644 --- a/examples/cpp/inference/gpt_tokenizer.cc +++ b/examples/cpp/inference/gpt_tokenizer.cc @@ -270,4 +270,8 @@ void GPT_Tokenizer::encode(std::string str, input_ids->push_back(vocab[pad_token]); mask_ids->push_back(0); } + if (mode == OPT) { + mask_ids->insert(mask_ids->begin(), 1); + input_ids->insert(input_ids->begin(), 2); + } } diff --git a/examples/cpp/inference/gpt_tokenizer.cpp b/examples/cpp/inference/gpt_tokenizer.cpp index 2d9b521fca..0691145113 100644 --- a/examples/cpp/inference/gpt_tokenizer.cpp +++ b/examples/cpp/inference/gpt_tokenizer.cpp @@ -18,10 +18,15 @@ #include int main(int argc, char *argv[]) { - std::string vocab_file = "./gpt2_bpe/vocab.bpe"; - 
std::string merge_file = "./gpt2_bpe/encoder.json"; + if (argc != 2 || (strcmp(argv[1], "gpt-2") && strcmp(argv[1], "opt") )) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + tokenizer_mode mode = strcmp(argv[1], "gpt-2") == 0 ? GPT2 : OPT; + std::string vocab_file = mode == GPT2 ? "./gpt2_bpe/vocab.bpe" : "opt_bpe/vocab.bpe"; + std::string merge_file = mode == GPT2 ? "./gpt2_bpe/encoder.json" : "opt_bpe/encoder.json"; - GPT_Tokenizer tokenizer(merge_file, vocab_file); + GPT_Tokenizer tokenizer(mode, merge_file, vocab_file); std::string line; std::vector lines; @@ -30,7 +35,7 @@ int main(int argc, char *argv[]) { std::cout << "Error opening input file" << std::endl; return -1; } - std::ofstream outfile("./wikitext-103-raw/wiki.valid.bpe.flexflow", + std::ofstream outfile(mode == GPT2 ? "./wikitext-103-raw/wiki.valid.bpe.flexflow.gpt2" : "./wikitext-103-raw/wiki.valid.bpe.flexflow.opt", std::ofstream::out); if (!outfile) { std::cout << "Error opening output file" << std::endl; diff --git a/examples/cpp/inference/gpt_tokenizer.h b/examples/cpp/inference/gpt_tokenizer.h index 9af722df38..476b47fee1 100644 --- a/examples/cpp/inference/gpt_tokenizer.h +++ b/examples/cpp/inference/gpt_tokenizer.h @@ -31,16 +31,20 @@ struct hash_pair { } }; +enum tokenizer_mode { GPT2, OPT }; + class GPT_Tokenizer { public: - GPT_Tokenizer(std::string const &vocab_file, + GPT_Tokenizer(tokenizer_mode mode_, + std::string const &vocab_file, std::string const &merge_file, std::string const &bos_token_str = "", const std::string eos_token_str = "", const std::string pad_token_str = "", const std::string unk_token_str = "", const std::string mask_token_str = "") { + mode = mode_; load_vocab(vocab_file); load_merge(merge_file); bos_token = bos_token_str; @@ -58,6 +62,7 @@ class GPT_Tokenizer { size_t max_length, std::vector *input_ids, std::vector *mask_ids); + tokenizer_mode mode; std::string bos_token; std::string eos_token; std::string pad_token; diff --git a/examples/cpp/inference/gpt_tokenizer_test.sh b/examples/cpp/inference/gpt_tokenizer_test.sh index 8f7660bfe4..26f3f71a05 100755 --- a/examples/cpp/inference/gpt_tokenizer_test.sh +++ b/examples/cpp/inference/gpt_tokenizer_test.sh @@ -3,7 +3,7 @@ set -x set -e cleanup() { - rm -rf wikitext-103-raw-v1.zip wikitext-103-raw gpt2_bpe gpt_tokenizer pytokenizer.py bpe.py + rm -rf wikitext-103-raw-v1.zip wikitext-103-raw gpt2_bpe opt_bpe gpt_tokenizer pytokenizer.py bpe.py hf_tokenizer.py } # Cd into directory holding this script @@ -21,6 +21,10 @@ wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1. 
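# --- Editorial note: illustrative sketch only, not part of the original patch ---
# Per the GPT_Tokenizer::encode change above, OPT mode prepends BOS id 2 (with
# attention-mask value 1) to every encoded line, which is what the Huggingface
# tokenizer for the facebook/opt checkpoints emits. Once the OPT output file
# produced later in this script exists, a guarded spot check that every
# non-empty line starts with id 2 could look like:
if [ -f ./wikitext-103-raw/wiki.valid.bpe.flexflow.opt ]; then
  awk 'NF > 0 && $1 != 2 { bad++ } END { exit bad > 0 }' \
    ./wikitext-103-raw/wiki.valid.bpe.flexflow.opt \
    || echo "warning: found OPT-encoded lines without a leading BOS id 2"
fi
# --------------------------------------------------------------------------------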
unzip wikitext-103-raw-v1.zip rm wikitext-103-raw-v1.zip +############################################################################################### +##################################### GPT-2 tests ############################################# +############################################################################################### + # Download GPT-2 BPE vocab and merges files mkdir -p gpt2_bpe wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json @@ -30,8 +34,8 @@ wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab wget -O bpe.py https://raw.githubusercontent.com/karpathy/minGPT/master/mingpt/bpe.py chmod +x bpe.py -# Run the FlexFlow C++ tokenizer -./gpt_tokenizer +# Run the FlexFlow C++ tokenizer (standard GPT-2) +./gpt_tokenizer gpt-2 # Run the minGPT tokenizer cat << EOF > pytokenizer.py @@ -57,7 +61,47 @@ chmod +x pytokenizer.py ./pytokenizer.py # Check that the outputs match -diff ./wikitext-103-raw/wiki.valid.bpe.flexflow ./wikitext-103-raw/wiki.valid.bpe.minGPT +diff ./wikitext-103-raw/wiki.valid.bpe.flexflow.gpt2 ./wikitext-103-raw/wiki.valid.bpe.minGPT + +############################################################################################### +##################################### OPT tests ############################################### +############################################################################################### + +# Download OPT vocab and merge files +mkdir -p opt_bpe +wget -O opt_bpe/encoder.json https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json +wget -O opt_bpe/vocab.bpe https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt + +# Run the FlexFlow C++ tokenizer (OPT) +./gpt_tokenizer opt + +# Run the Huggingface tokenizer +pip3 install transformers +cat << EOF > hf_tokenizer.py +#!/usr/bin/env python +from transformers import GPT2Tokenizer +model_id = "facebook/opt-125m" +tokenizer = GPT2Tokenizer.from_pretrained(model_id) +inp="./wikitext-103-raw/wiki.valid.raw" +outp="./wikitext-103-raw/wiki.valid.bpe.OPT" +with open(inp, "r") as infile: + with open(outp, "w+") as outfile: + for l in infile.readlines(): + if len(l.strip()) == 0: + outfile.write(l) + else: + input_ids = tokenizer(l.strip(), return_tensors="pt", padding=False).input_ids + out = input_ids.tolist()[0] + out = [str(x) for x in out] + out = " ".join(out) + outfile.write(out) + outfile.write("\n") +EOF +chmod +x hf_tokenizer.py +./hf_tokenizer.py + +# Check that the outputs match +diff ./wikitext-103-raw/wiki.valid.bpe.flexflow.opt ./wikitext-103-raw/wiki.valid.bpe.OPT # Clean up after test cleanup From ebb5f8ea6e8eff1e97f98f727f79541c7298e2ee Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Sun, 14 May 2023 15:06:09 -0400 Subject: [PATCH 124/344] code (#726) Co-authored-by: Zhihao Jia --- src/ops/inc_multihead_self_attention.cu | 26 ++++++----------- src/ops/spec_inc_multihead_self_attention.cu | 30 +++++++------------- src/ops/tree_inc_multihead_self_attention.cu | 26 ++++++----------- 3 files changed, 29 insertions(+), 53 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index be86f55de4..fc7c8a7446 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -100,10 +100,14 @@ __global__ void // create complex number int head_idx = i / (num_tokens * proj_size / 2); int idx = 
i % (num_tokens * proj_size / 2); + int token_idx = + (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + int real_part_index = - idx * 2 + head_idx * (q_block_size + k_block_size + v_block_size) + + idx + token_idx * (proj_size / 2) + + head_idx * (q_block_size + k_block_size + v_block_size) + (q_tensor ? 0 : q_block_size); - int complex_part_index = real_part_index + 1; + int complex_part_index = real_part_index + (proj_size / 2); complex_input[i] = {input_ptr[real_part_index], input_ptr[complex_part_index]}; @@ -114,8 +118,7 @@ __global__ void // get position of token // int head_idx = i / (num_tokens * proj_size); - int token_idx = - (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + // size_t pos = id_map[token_idx].token_position; size_t pos = tokenInfos[token_idx].abs_depth_in_request; @@ -127,20 +130,9 @@ __global__ void cuFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; - input_ptr[real_part_index + 1] = complex_input[i].y; - - // if (i % 64 == 1 && head_idx == 0) { - // printf("head id: %d, tokenid: %d, pospospos:-> %d, before real part - // %f, " - // "before complex part: %f, real part: %f," - // "complext part: %f, freq_cis real: %f, freq_cis commplexx - // %f\n", head_idx, token_idx, pos, before_real, before_complex, - // complex_input[i].x, - // complex_input[i].y, - // complex_pos.x, - // complex_pos.y); - // } + input_ptr[complex_part_index] = complex_input[i].y; } } diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 062ef4af03..5850666f56 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -100,10 +100,14 @@ __global__ void // create complex number int head_idx = i / (num_tokens * proj_size / 2); int idx = i % (num_tokens * proj_size / 2); + int token_idx = + (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + int real_part_index = - idx * 2 + head_idx * (q_block_size + k_block_size + v_block_size) + + idx + token_idx * (proj_size / 2) + + head_idx * (q_block_size + k_block_size + v_block_size) + (q_tensor ? 
0 : q_block_size); - int complex_part_index = real_part_index + 1; + int complex_part_index = real_part_index + (proj_size / 2); complex_input[i] = {input_ptr[real_part_index], input_ptr[complex_part_index]}; @@ -114,33 +118,21 @@ __global__ void // get position of token // int head_idx = i / (num_tokens * proj_size); - int token_idx = - (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + + // size_t pos = id_map[token_idx].token_position; + size_t pos = tokenInfos[token_idx].abs_depth_in_request; // float before_real = complex_input[i].x, before_complex = // complex_input[i].y; - size_t pos = tokenInfos[token_idx].abs_depth_in_request; int pos_i = i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); cuFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; - input_ptr[real_part_index + 1] = complex_input[i].y; - - // if (i % 64 == 1 && head_idx == 0) { - // printf("head id: %d, tokenid: %d, pospospos:-> %d, before real part - // %f, " - // "before complex part: %f, real part: %f," - // "complext part: %f, freq_cis real: %f, freq_cis commplexx - // %f\n", head_idx, token_idx, pos, before_real, before_complex, - // complex_input[i].x, - // complex_input[i].y, - // complex_pos.x, - // complex_pos.y); - // } + input_ptr[complex_part_index] = complex_input[i].y; } } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 86b857a7f9..eb586d858d 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -182,10 +182,14 @@ __global__ void tree_apply_rotary_embedding( // create complex number int head_idx = i / (num_tokens * proj_size / 2); int idx = i % (num_tokens * proj_size / 2); + int token_idx = + (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + int real_part_index = - idx * 2 + head_idx * (q_block_size + k_block_size + v_block_size) + + idx + token_idx * (proj_size / 2) + + head_idx * (q_block_size + k_block_size + v_block_size) + (q_tensor ? 
0 : q_block_size); - int complex_part_index = real_part_index + 1; + int complex_part_index = real_part_index + (proj_size / 2); complex_input[i] = {input_ptr[real_part_index], input_ptr[complex_part_index]}; @@ -196,8 +200,7 @@ __global__ void tree_apply_rotary_embedding( // get position of token // int head_idx = i / (num_tokens * proj_size); - int token_idx = - (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + // size_t pos = id_map[token_idx].token_position; size_t pos = tokenInfos[token_idx].abs_depth_in_request; @@ -209,20 +212,9 @@ __global__ void tree_apply_rotary_embedding( cuFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; - input_ptr[real_part_index + 1] = complex_input[i].y; - - // if (i % 64 == 1 && head_idx == 0) { - // printf("head id: %d, tokenid: %d, pospospos:-> %d, before real part - // %f, " - // "before complex part: %f, real part: %f," - // "complext part: %f, freq_cis real: %f, freq_cis commplexx - // %f\n", head_idx, token_idx, pos, before_real, before_complex, - // complex_input[i].x, - // complex_input[i].y, - // complex_pos.x, - // complex_pos.y); - // } + input_ptr[complex_part_index] = complex_input[i].y; } } From 86ec73a6e6ec3ccfffd5d3ba53c9a86dd13af1e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 14 May 2023 20:02:39 -0400 Subject: [PATCH 125/344] fix CI --- examples/cpp/inference/gpt_tokenizer.cpp | 12 ++++++++---- examples/cpp/inference/gpt_tokenizer_test.sh | 2 +- src/ops/tree_inc_multihead_self_attention.cpp | 2 +- src/ops/tree_inc_multihead_self_attention.cu | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/cpp/inference/gpt_tokenizer.cpp b/examples/cpp/inference/gpt_tokenizer.cpp index 0691145113..a8f188e171 100644 --- a/examples/cpp/inference/gpt_tokenizer.cpp +++ b/examples/cpp/inference/gpt_tokenizer.cpp @@ -18,13 +18,15 @@ #include int main(int argc, char *argv[]) { - if (argc != 2 || (strcmp(argv[1], "gpt-2") && strcmp(argv[1], "opt") )) { + if (argc != 2 || (strcmp(argv[1], "gpt-2") && strcmp(argv[1], "opt"))) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } tokenizer_mode mode = strcmp(argv[1], "gpt-2") == 0 ? GPT2 : OPT; - std::string vocab_file = mode == GPT2 ? "./gpt2_bpe/vocab.bpe" : "opt_bpe/vocab.bpe"; - std::string merge_file = mode == GPT2 ? "./gpt2_bpe/encoder.json" : "opt_bpe/encoder.json"; + std::string vocab_file = + mode == GPT2 ? "./gpt2_bpe/vocab.bpe" : "opt_bpe/vocab.bpe"; + std::string merge_file = + mode == GPT2 ? "./gpt2_bpe/encoder.json" : "opt_bpe/encoder.json"; GPT_Tokenizer tokenizer(mode, merge_file, vocab_file); @@ -35,7 +37,9 @@ int main(int argc, char *argv[]) { std::cout << "Error opening input file" << std::endl; return -1; } - std::ofstream outfile(mode == GPT2 ? "./wikitext-103-raw/wiki.valid.bpe.flexflow.gpt2" : "./wikitext-103-raw/wiki.valid.bpe.flexflow.opt", + std::ofstream outfile(mode == GPT2 + ? 
"./wikitext-103-raw/wiki.valid.bpe.flexflow.gpt2" + : "./wikitext-103-raw/wiki.valid.bpe.flexflow.opt", std::ofstream::out); if (!outfile) { std::cout << "Error opening output file" << std::endl; diff --git a/examples/cpp/inference/gpt_tokenizer_test.sh b/examples/cpp/inference/gpt_tokenizer_test.sh index 26f3f71a05..b336dd05ff 100755 --- a/examples/cpp/inference/gpt_tokenizer_test.sh +++ b/examples/cpp/inference/gpt_tokenizer_test.sh @@ -80,7 +80,7 @@ pip3 install transformers cat << EOF > hf_tokenizer.py #!/usr/bin/env python from transformers import GPT2Tokenizer -model_id = "facebook/opt-125m" +model_id = "facebook/opt-6.7b" tokenizer = GPT2Tokenizer.from_pretrained(model_id) inp="./wikitext-103-raw/wiki.valid.raw" outp="./wikitext-103-raw/wiki.valid.bpe.OPT" diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index f478e13660..28285b72ce 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -25,7 +25,7 @@ using Legion::Memory; /*static*/ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - TreeIncMultiHeadSelfAttentionMeta const *m, + TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, float const *input_ptr, float const *weight_ptr, diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index eb586d858d..ef474193ef 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -406,7 +406,7 @@ __global__ void update_tree_branch_kv_cache( (i / proj_size) % num_tokens_in_branch; // index in the tree branch int head_idx = i / (proj_size * num_tokens_in_branch); - token_idx += processed_tokens_in_batch; // get index in the whole batch + token_idx += processed_tokens_in_batch; // get index in the whole batch int qkv_block_size = (qProjSize + kProjSize + vProjSize) * total_tokens_in_batch; // skip over previous heads int current_head_block_size = From 8423610f42653acfaddbf37bc7f0f4321ce544bb Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Sun, 14 May 2023 21:07:11 -0400 Subject: [PATCH 126/344] Kernel bug fix (#728) * complex into metadata * topk * format --------- Co-authored-by: Zhihao Jia --- include/flexflow/ops/beam_topk.h | 5 ++ .../ops/inc_multihead_self_attention.h | 1 + .../ops/spec_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + src/ops/beam_topk.cu | 56 ++++++++----------- src/ops/inc_multihead_self_attention.cu | 18 +++--- src/ops/spec_inc_multihead_self_attention.cu | 19 +++---- src/ops/tree_inc_multihead_self_attention.cu | 16 +++--- 8 files changed, 57 insertions(+), 60 deletions(-) diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h index 7f8c28d9f8..9fc0be22f4 100644 --- a/include/flexflow/ops/beam_topk.h +++ b/include/flexflow/ops/beam_topk.h @@ -13,6 +13,11 @@ class BeamTopKMeta : public OpMeta { BeamTopKMeta(FFHandler handle); bool sorted; int max_beam_width; + int *parent_ids; + float *acc_probs; + int *block_start_index; + int *request_id; + int *tokens_per_request; }; class BeamTopK : public Op { diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 410c30abd9..4721086ec0 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -144,6 +144,7 @@ class IncMultiHeadSelfAttentionMeta : 
public OpMeta { float *qk_prods, *qk_prods_softmax; float *attn_heads, *W_out_contiguous; BatchConfig::PerTokenInfo *token_infos; + cuFloatComplex *complex_input; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 4acab0a6ab..30e122278a 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -152,6 +152,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public OpMeta { float *devQKVProjArray, *keyCache, *valueCache; float *qk_prods, *qk_prods_softmax; float *attn_heads, *W_out_contiguous; + cuFloatComplex *complex_input; // void *reserveSpace; // BatchConfig::token_idxs *dev_token2ids; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 0e54bd50d0..fce4998e5d 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -144,6 +144,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public OpMeta { float *devQKVProjArray, *keyCache, *valueCache; float *qk_prods, *qk_prods_softmax; float *attn_heads, *W_out_contiguous; + cuFloatComplex *complex_input; TreeVerifyBatchConfig::PerTokenInfo *token_infos; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index b41b4c2ba4..b6fccb68ba 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -607,38 +607,23 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, assert(num_shards >= (size_t)max_heap_size); num_shards = max_heap_size; - // parent_id, per token - int *gpu_parents; - // acc_porbs, per token - float *gpu_probs; - // each block's start index; - // one block means the single token in different requests; - int *gpu_block_start_index; - int *gpu_request_id; - int *gpu_tokens_per_request; - - checkCUDA(cudaMalloc(&gpu_parents, sizeof(int) * max_total_requests)); - checkCUDA(cudaMalloc(&gpu_probs, sizeof(float) * max_total_requests)); - checkCUDA(cudaMalloc(&gpu_block_start_index, sizeof(int) * beam_num_blocks)); - checkCUDA(cudaMalloc(&gpu_request_id, sizeof(int) * beam_num_blocks)); - checkCUDA(cudaMalloc(&gpu_tokens_per_request, sizeof(int) * beam_num_blocks)); - checkCUDA(cudaMemcpy(gpu_parents, + checkCUDA(cudaMemcpy(m->parent_ids, parent_ids, sizeof(int) * max_total_requests, cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(gpu_probs, + checkCUDA(cudaMemcpy(m->acc_probs, acc_probs, sizeof(float) * max_total_requests, cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(gpu_block_start_index, + checkCUDA(cudaMemcpy(m->block_start_index, beam_block_start_index.data(), sizeof(int) * beam_num_blocks, cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(gpu_request_id, + checkCUDA(cudaMemcpy(m->request_id, request_id.data(), sizeof(int) * beam_num_blocks, cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(gpu_tokens_per_request, + checkCUDA(cudaMemcpy(m->tokens_per_request, tokens_per_request.data(), sizeof(int) * beam_num_blocks, cudaMemcpyHostToDevice)); @@ -649,11 +634,11 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, length, max_beam_width, max_heap_size, - gpu_parents, - gpu_probs, - gpu_block_start_index, - gpu_request_id, - gpu_tokens_per_request, + m->parent_ids, + m->acc_probs, + m->block_start_index, + m->request_id, + m->tokens_per_request, sorted, output_ptr, indices_ptr, @@ -703,13 +688,20 @@ void 
BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, cudaEventDestroy(t_end); printf("[BeamTopK] forward time = %.2lfms\n", elapsed); } - // if(bc->beam_slots.at(0).current_depth == 1){ - // print_beam_tensor((float *)input_ptr, 50, 32000, 15, "beam topk - // input"); print_tensor((float *)output_ptr, 50, "beam topk - // output"); - // } } -BeamTopKMeta::BeamTopKMeta(FFHandler handler) : OpMeta(handler) {} - +BeamTopKMeta::BeamTopKMeta(FFHandler handler) : OpMeta(handler) { + checkCUDA(cudaMalloc(&parent_ids, + sizeof(int) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + checkCUDA(cudaMalloc(&acc_probs, + sizeof(float) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + checkCUDA(cudaMalloc(&block_start_index, + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + checkCUDA(cudaMalloc(&request_id, + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + checkCUDA(cudaMalloc(&tokens_per_request, + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); +} }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index fc7c8a7446..099e54305c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -124,7 +124,6 @@ __global__ void // float before_real = complex_input[i].x, before_complex = // complex_input[i].y; - int pos_i = i % (proj_size / 2); float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); cuFloatComplex complex_pos = {cos(freq), sin(freq)}; @@ -252,8 +251,6 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; int v_block_size = m->vProjSize * num_tokens; - cuFloatComplex *complex_input; - // apply bias for q, k, v if (*m->bias) { apply_proj_bias_qkv<<apply_rotary_embedding) { - checkCUDA(cudaMalloc(&complex_input, - num_tokens * m->qProjSize * m->num_heads * - sizeof(cuFloatComplex *) / 2)); /*q*/ apply_rotary_embedding<<>>(output_ptr, - complex_input, + m->complex_input, m->token_infos, m->qProjSize, m->kProjSize, @@ -294,7 +288,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(output_ptr, - complex_input, + m->complex_input, m->token_infos, m->qProjSize, m->kProjSize, @@ -751,12 +745,14 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); size_t W_out_contiguous_size = W_out_block_size * num_heads; + size_t complex_size = + (BatchConfig::MAX_NUM_TOKENS * qProjSize * num_heads) / 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * sizeof(float) + - tokeninfo_size * - sizeof(BatchConfig::PerTokenInfo); // more components will + tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + + complex_size * sizeof(cuFloatComplex); // more components will // be added here later Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), @@ -778,6 +774,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( qk_prods_softmax = (float *)(qk_prods + qk_prod_size); attn_heads = (float *)qk_prods_softmax + qk_prod_size; W_out_contiguous = (float *)attn_heads + attn_heads_size; + complex_input = + (cuFloatComplex *)(W_out_contiguous + W_out_contiguous_size); int parallelism = vProjSize * oProjSize * num_heads; build_w_out_tensor<<qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; int v_block_size = m->vProjSize * num_tokens; - cuFloatComplex *complex_input; - // apply bias for q, k, v if (*m->bias) { spec_apply_proj_bias_qkv<<apply_rotary_embedding) { - checkCUDA(cudaMalloc(&complex_input, - num_tokens * m->qProjSize * m->num_heads * - sizeof(cuFloatComplex *) / 2)); /*q*/ spec_apply_rotary_embedding<<>>(output_ptr, - complex_input, + m->complex_input, m->tokenInfos, m->qProjSize, m->kProjSize, @@ -295,7 +290,7 @@ void compute_qkv_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(output_ptr, - complex_input, + m->complex_input, m->tokenInfos, m->qProjSize, m->kProjSize, @@ -900,6 +895,8 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( BeamSearchBatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); size_t W_out_contiguous_size = W_out_block_size * num_heads; + size_t complex_size = + (BeamSearchBatchConfig::MAX_NUM_TOKENS * qProjSize * num_heads) / 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * @@ -909,9 +906,9 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( beam_tokeninfo_size * sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: - BeamSearchPerRequestInfo); // more components will - // be added here later + sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo) + + complex_size * sizeof(cuFloatComplex); // more components will + // be added here later Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(totalSize - 1)); @@ -944,6 +941,8 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( qk_prods_softmax = (float *)(qk_prods + qk_prod_size); attn_heads = (float *)qk_prods_softmax + qk_prod_size; W_out_contiguous = (float *)attn_heads + attn_heads_size; + complex_input = + (cuFloatComplex *)(W_out_contiguous + W_out_contiguous_size); int parallelism = vProjSize * oProjSize * num_heads; spec_build_w_out_tensor<<qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; int v_block_size = m->vProjSize * num_tokens; - cuFloatComplex *complex_input; - // apply bias for q, k, v if (*m->bias) { tree_apply_proj_bias_qkv<<scaling_factor); } if (*m->apply_rotary_embedding) { - checkCUDA(cudaMalloc(&complex_input, - num_tokens * m->qProjSize * m->num_heads * - sizeof(cuFloatComplex *) / 2)); /*q*/ tree_apply_rotary_embedding<<>>(output_ptr, - complex_input, + m->complex_input, m->token_infos, m->qProjSize, m->kProjSize, @@ -372,7 +367,7 @@ void compute_qkv_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(output_ptr, - complex_input, + m->complex_input, m->token_infos, m->qProjSize, m->kProjSize, @@ -868,13 +863,16 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); size_t W_out_contiguous_size = W_out_block_size * num_heads; + size_t complex_size = + (TreeVerifyBatchConfig::MAX_NUM_TOKENS * qProjSize * num_heads) / 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * sizeof(float) + tokeninfo_size * sizeof(TreeVerifyBatchConfig::PerTokenInfo) + committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo) + + complex_size * sizeof(cuFloatComplex); Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(totalSize - 1)); @@ -899,6 +897,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( qk_prods_softmax = (float *)(qk_prods + qk_prod_size); attn_heads = (float *)qk_prods_softmax + qk_prod_size; W_out_contiguous = (float *)attn_heads + attn_heads_size; + complex_input = + (cuFloatComplex *)(W_out_contiguous + W_out_contiguous_size); int parallelism = vProjSize * oProjSize * num_heads; tree_build_w_out_tensor<< Date: Mon, 15 May 2023 11:24:29 -0400 Subject: [PATCH 127/344] add decoder for gpt tokenizer --- examples/cpp/inference/gpt_tokenizer.cc | 46 ++++++++++++++++++++++++ examples/cpp/inference/gpt_tokenizer.cpp | 2 ++ examples/cpp/inference/gpt_tokenizer.h | 6 ++++ 3 files changed, 54 insertions(+) diff --git a/examples/cpp/inference/gpt_tokenizer.cc b/examples/cpp/inference/gpt_tokenizer.cc index 437c4aa551..1cb2bc1111 100644 --- a/examples/cpp/inference/gpt_tokenizer.cc +++ b/examples/cpp/inference/gpt_tokenizer.cc @@ -59,6 +59,12 @@ wchar_t *GPT_Tokenizer::bytes_to_unicode() { return bytes_mapping; } +void GPT_Tokenizer::unicode_to_bytes() { + for (int i = 0; i < 256; i++) { + bytes_decoder[bytes_encoder[i]] = (char)i; + } +} + std::vector GPT_Tokenizer::split(std::string const &s, std::regex rgx) { std::vector elems; @@ -115,6 +121,7 @@ void GPT_Tokenizer::load_vocab(std::string const &vocab_file) { auto vocab_ = vocab_data_.get>(); for (auto item : vocab_) { vocab.insert({item.first, item.second}); + inverse_vocab.insert({item.second, item.first}); } }; @@ -275,3 +282,42 @@ void GPT_Tokenizer::encode(std::string str, input_ids->insert(input_ids->begin(), 2); } } + +std::string GPT_Tokenizer::decode(std::vector input_ids, + std::vector mask_ids) { + // look up each number in encoder.json dictionary + std::ostringstream oss; + int index = 0; + for (auto const &id : input_ids) { + if (index == 0) { + if (mode == OPT) { + assert(id == 2); + index++; + continue; + } + } + if (!mask_ids[index]) { + index++; + continue; + } + auto it = inverse_vocab.find(id); + if (it != inverse_vocab.end()) { + oss << it->second; + } else { + // Handle the case when the integer is not found in the inverse_vocab map. + // You can choose to ignore it, skip it, or handle it differently based on + // your requirements. 
+ assert(false); + } + index++; + } + std::string concatenated_tokens = oss.str(); + // apply byte_decoder to each character in the input_ids string, then decode + // as utf-8 + std::wstring wstr = utf8_to_wstring(concatenated_tokens); + std::string result; + for (wchar_t ch : wstr) { + result += bytes_decoder[ch]; + } + return result; +} diff --git a/examples/cpp/inference/gpt_tokenizer.cpp b/examples/cpp/inference/gpt_tokenizer.cpp index a8f188e171..8712f0e6b6 100644 --- a/examples/cpp/inference/gpt_tokenizer.cpp +++ b/examples/cpp/inference/gpt_tokenizer.cpp @@ -70,6 +70,8 @@ int main(int argc, char *argv[]) { } } outfile << std::endl; + std::string decoded_line = tokenizer.decode(input_ids, mask_ids); + assert(decoded_line == stripped_line); input_ids.clear(); mask_ids.clear(); } diff --git a/examples/cpp/inference/gpt_tokenizer.h b/examples/cpp/inference/gpt_tokenizer.h index 476b47fee1..701436076a 100644 --- a/examples/cpp/inference/gpt_tokenizer.h +++ b/examples/cpp/inference/gpt_tokenizer.h @@ -53,6 +53,7 @@ class GPT_Tokenizer { unk_token = unk_token_str; mask_token = mask_token_str; bytes_encoder = bytes_to_unicode(); + unicode_to_bytes(); }; // ~GPT_Tokenizer(); std::vector bpe(std::wstring token); @@ -62,6 +63,8 @@ class GPT_Tokenizer { size_t max_length, std::vector *input_ids, std::vector *mask_ids); + std::string decode(std::vector input_ids, + std::vector mask_ids); tokenizer_mode mode; std::string bos_token; std::string eos_token; @@ -72,9 +75,12 @@ class GPT_Tokenizer { private: std::unordered_map vocab; + std::unordered_map inverse_vocab; std::unordered_map bpe_ranks; wchar_t *bytes_to_unicode(); + void unicode_to_bytes(); wchar_t *bytes_encoder; + std::unordered_map bytes_decoder; uint32_t cache_max_size = 500000; uint32_t cache_word_max_length = 30; std::string unicode_letter_expr = From 555aa33837593cb0a725771b3f988f3ecc6d3fcb Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 15 May 2023 23:03:39 -0500 Subject: [PATCH 128/344] Spec infer demo (#724) * Support multiple FFModels in a single top_level_task * [TreeVerifyMHA] bug fixes * bug fixes * TreeIncMHA and SpecIncMHA bug fixes * fomat. * . * add sentence piece tokenizer * format * prepare spec_infer demo * prettier prints * make the llama model work * add small model config * enable speculative inference for spec_infer * fix * rename * fix one of the bugs * fix * del * attempt to fix ci * integrated gpt/opt tokenizer * integrate opt tokenizer with pipeline * . * format * move files * Update README.md * add an overview figure * update images * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * add tokenizer in readme * fix * fix * fix * Update README.md * Update README.md * add gif * add weights to readme, clean some print * Update README.md * update demo * Update README.md * Update README.md * remove outdate file * Update README.md * Update README.md * . 
--------- Co-authored-by: xinhaoc Co-authored-by: Gabriele Oliaro Co-authored-by: xinhaoc <99570243+xinhaoc@users.noreply.github.com> --- .github/README.md | 98 +++++ .github/workflows/build.yml | 4 + .gitmodules | 3 + CMakeLists.txt | 22 +- config/config.linux | 2 +- deps/sentencepiece | 1 + examples/cpp/inference/LLAMA/CMakeLists.txt | 8 +- examples/cpp/inference/LLAMA/llama.cc | 325 ++++----------- examples/cpp/inference/LLAMA/llama.h | 112 ----- examples/cpp/inference/SPEC_LLAMA/README.md | 14 - examples/cpp/inference/SPEC_LLAMA/llama.cc | 197 --------- .../llama_spec_pipeline/CMakeLists.txt | 6 +- .../inference/llama_spec_pipeline/llama.cc | 69 +++- .../mixture_of_experts/CMakeLists.txt | 2 +- examples/cpp/inference/opt/CMakeLists.txt | 3 +- .../opt_spec_pipeline/CMakeLists.txt | 6 +- .../opt_spec_pipeline/opt_pipeline.cc | 24 +- .../cpp/inference/transformers/CMakeLists.txt | 2 +- img/overview.png | Bin 0 -> 209735 bytes img/performance.png | Bin 0 -> 18951 bytes img/spec_infer_demo.gif | Bin 0 -> 4302606 bytes include/flexflow/batch_config.h | 16 +- include/flexflow/inference.h | 21 +- include/flexflow/tokenizers.h | 103 +++++ .../cpp/inference => include}/gpt_tokenizer.h | 14 +- .../inference => inference}/file_loader.cc | 29 -- .../cpp/inference => inference}/file_loader.h | 0 .../inference => inference}/models/llama.cc | 49 ++- .../inference => inference}/models/llama.h | 25 +- .../cpp/inference => inference}/models/opt.cc | 0 .../cpp/inference => inference}/models/opt.h | 22 +- .../spec_infer}/CMakeLists.txt | 12 +- inference/spec_infer/MODEL_WEIGHTS.md | 27 ++ .../spec_infer}/Makefile | 4 +- inference/spec_infer/spec_infer.cc | 151 +++++++ src/ops/beam_topk.cc | 8 +- src/ops/beam_topk.cu | 72 ++-- src/ops/inc_multihead_self_attention.cc | 8 +- src/ops/spec_inc_multihead_self_attention.cu | 16 +- src/ops/tree_inc_multihead_self_attention.cc | 9 +- src/ops/tree_inc_multihead_self_attention.cu | 4 +- src/runtime/batch_config.cc | 7 +- src/runtime/beam_search_batch_config.cc | 6 +- .../runtime}/gpt_tokenizer.cc | 26 +- src/runtime/inference_manager.cc | 10 +- src/runtime/model.cc | 22 +- src/runtime/request_manager.cc | 388 ++++++++++++------ src/runtime/tree_verify_batch_config.cc | 33 ++ .../cpp/inference => tests}/gpt_tokenizer.cpp | 4 +- .../inference => tests}/gpt_tokenizer_test.sh | 2 +- 50 files changed, 1104 insertions(+), 882 deletions(-) create mode 100644 .github/README.md create mode 160000 deps/sentencepiece delete mode 100644 examples/cpp/inference/LLAMA/llama.h delete mode 100644 examples/cpp/inference/SPEC_LLAMA/README.md delete mode 100644 examples/cpp/inference/SPEC_LLAMA/llama.cc create mode 100644 img/overview.png create mode 100644 img/performance.png create mode 100644 img/spec_infer_demo.gif create mode 100644 include/flexflow/tokenizers.h rename {examples/cpp/inference => include}/gpt_tokenizer.h (96%) rename {examples/cpp/inference => inference}/file_loader.cc (86%) rename {examples/cpp/inference => inference}/file_loader.h (100%) rename {examples/cpp/inference => inference}/models/llama.cc (83%) rename {examples/cpp/inference => inference}/models/llama.h (65%) rename {examples/cpp/inference => inference}/models/opt.cc (100%) rename {examples/cpp/inference => inference}/models/opt.h (85%) rename {examples/cpp/inference/SPEC_LLAMA => inference/spec_infer}/CMakeLists.txt (79%) create mode 100644 inference/spec_infer/MODEL_WEIGHTS.md rename {examples/cpp/inference/SPEC_LLAMA => inference/spec_infer}/Makefile (94%) create mode 100644 
inference/spec_infer/spec_infer.cc rename {examples/cpp/inference => src/runtime}/gpt_tokenizer.cc (93%) create mode 100644 src/runtime/tree_verify_batch_config.cc rename {examples/cpp/inference => tests}/gpt_tokenizer.cpp (97%) rename {examples/cpp/inference => tests}/gpt_tokenizer_test.sh (96%) diff --git a/.github/README.md b/.github/README.md new file mode 100644 index 0000000000..fdc52c53c4 --- /dev/null +++ b/.github/README.md @@ -0,0 +1,98 @@ +# SpecInfer +![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) + +

+A SpecInfer Demo

+ +## What is SpecInfer + +

+An overview of SpecInfer

+ +The high computational and memory requirements of generative large language +models (LLMs) make it challenging to serve them quickly and cheaply. +SpecInfer is an open-source distributed multi-GPU system that accelerates generative LLM +inference with __speculative inference__ and __token tree verification__. A key insight +behind SpecInfer is to combine various collectively boost-tuned small speculative +models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a +token tree, whose nodes each represent a candidate token sequence. The correctness +of all candidate token sequences represented by a token tree is verified against the +LLM’s output in parallel using a novel tree-based parallel decoding mechanism. +SpecInfer uses an LLM as a token tree verifier instead of an incremental decoder, +which largely reduces the end-to-end inference latency and computational requirement +for serving generative LLMs while provably preserving model quality. + +
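The paragraph above describes SpecInfer's core loop: the SSMs speculate a token tree, and the LLM verifies the whole tree in a single pass. Below is a minimal editorial sketch of that flow (not part of the patch itself); `Token`, `TokenTree`, `ssm_propose`, and `llm_verify` are hypothetical stand-ins with toy return values, and the real system drives this loop through the `RequestManager`/`InferenceManager` APIs that appear later in this patch series.

```c++
#include <vector>

using Token = int;

// A token tree, flattened here into its root-to-leaf candidate sequences.
struct TokenTree {
  std::vector<std::vector<Token>> candidates;
};

// Stand-in for the SSMs, which jointly speculate a few future tokens and
// organize the predictions as a token tree.
TokenTree ssm_propose(std::vector<Token> const & /*prefix*/) {
  return TokenTree{{{42, 7}, {42, 8, 3}}}; // toy candidate sequences
}

// Stand-in for the LLM verifier: all candidates are checked against the LLM's
// own output in one tree-based decoding pass, and the verified candidate is
// returned.
std::vector<Token> llm_verify(std::vector<Token> const & /*prefix*/,
                              TokenTree const &tree) {
  return tree.candidates.front(); // pretend the first candidate was accepted
}

int main() {
  std::vector<Token> output = {1, 306, 4658}; // tokens generated so far
  // One serving iteration: speculate, verify, commit only verified tokens.
  TokenTree tree = ssm_propose(output);
  std::vector<Token> accepted = llm_verify(output, tree);
  output.insert(output.end(), accepted.begin(), accepted.end());
  return 0;
}
```

Because only tokens the verifier accepts are committed, the output matches what incremental decoding with the LLM alone would produce, while several tokens can be accepted per LLM pass.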

+Performance comparison

+ +## Install SpecInfer +SpecInfer is built on top of FlexFlow. You can install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](INSTALL.md) for installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. + +## Run SpecInfer +The source code of the SpecInfer pipeline is available at [this folder](../inference/spec_infer/). The SpecInfer executable will be available at `/build_dir/inference/spec_infer/spec_infer` after compilation. You can use the following command-line arguments to run SpecInfer: + +* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) +* `-ll:fsize`: size of device memory on each GPU in MB +* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. SpecInfer keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. +* `-llm-weight`: path to the folder that stores the LLM weights +* `-ssm-weight`: path to the folder that stores the small speculative models' weights. You can use multiple `-ssm-weight`s in the command line to launch multiple SSMs. +* `-tokenizer`: path to the tokenizer file (see [Tokenizers](#tokenizers) for preparing a tokenizer for SpecInfer). +* `-prompt`: (optional) path to the prompt file. SpecInfer expects a JSON file with a list of prompts, all of which will be served by SpecInfer. In addition, users can also use the following API for registering requests: + +```c++ +class RequestManager { + RequestGuid register_new_request(std::string const &prompt, int max_sequence_length); +}; +``` +For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. + +```bash +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-weight /path/to/llm/weights -ssm-weight /path/to/ssm1/weights -ssm-weight /path/to/ssm2/weights -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json +``` + +### Tokenizers +SpecInfer supports two tokenizers: + +* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-7B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained SentencePiece tokenizer from Hugging Face (model id: [decapoda-research/llama-7b-hf](https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/tokenizer.model)). +* The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. + +### LLM Weights +The weight files used in our demo are extracted from HuggingFace and stored in our AWS S3 bucket.
+ +| Model | Model id on Hugging Face | Storage Location | +| :---- | :---- | :---- | +| LLaMA-7B | decapoda-research/llama-7b-hf | s3://catalyst-llama/Flexflow_LLM_weights/LLAMA/llama_7B_weights.tar.gz | +| LLaMA-190M | Bingsu/llama-190m-arch | s3://catalyst-llama/Flexflow_LLM_weights/LLAMA/llama_190m_weights.tar.gz | +| OPT-6.7B | facebook/opt-6.7b | s3://catalyst-llama/Flexflow_LLM_weights/OPT/opt_6B_weights.tar.gz | +| OPT-125M | facebook/opt-125m | s3://catalyst-llama/Flexflow_LLM_weights/OPT/opt_125m_native.tar.gz | + +You can use [this script](../inference/spec_infer/MODEL_WEIGHTS.md) to convert the weights of a HuggingFace LLM to the SpecInfer weight format. + +### Prompt Datasets +We have evaluated SpecInfer on the following prompt datasets: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). + +## Difference between SpecInfer and HuggingFace Assistant Model + +There are two major differences between the two systems. + +* First, the HuggingFace assistant model produces a single candidate token sequence during speculation, while SpecInfer generates and verifies a speculated token tree, whose nodes each represent a candidate token sequence. To deal with the more complex verification task, SpecInfer includes a number of systems and algorithmic optimizations to quickly and efficiently verify all tokens of a token tree in parallel. + +* Second, instead of considering a single assistant model, SpecInfer combines a variety of collectively boost-tuned small speculative models (SSMs) to jointly predict the LLM's outputs. We observe that using multiple boost-tuned SSMs is critical for improving speculative performance. + +## TODOs + +SpecInfer is under active development. We currently focus on the following tasks and strongly welcome all contributions to SpecInfer from bug fixes to new features and extensions. + +* Low-precision and mixed-precision support. The current version uses single-precision floating points for computing tree attention. We are actively working on supporting half-precision floating points, and int4 and int8 quantizations. +* Offloading-based generative LLM inference. Another promising avenue for future work is using speculative inference and token tree verification to reduce the end-to-end inference latency for offloading-based generative LLM inference. A potential application of this technique is enabling a single commodity GPU to serve LLMs for latency-critical tasks. + +## Acknowledgements +This project was initiated by members from CMU, Stanford, and UCSD. We will continue developing and supporting SpecInfer and the underlying FlexFlow runtime system. The following paper describes the design, implementation, and key optimizations of SpecInfer. + +* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](). + +## License +Both SpecInfer and FlexFlow use Apache License 2.0.
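To illustrate the `-prompt` flag and the `register_new_request` API documented in the README above, here is a small editorial sketch (not part of the patch) that mirrors the prompt-loading loop used by the LLAMA example later in this patch series; the `prompts.json` path, the `RequestManagerStub` type, and the 128-token generation limit are illustrative assumptions.

```c++
#include <cassert>
#include <fstream>
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

// Stand-in for FlexFlow::RequestManager so the sketch is self-contained.
struct RequestManagerStub {
  void register_new_request(std::string const &prompt, int max_sequence_length) {
    std::cout << "registered (max " << max_sequence_length
              << " tokens): " << prompt << std::endl;
  }
};

int main() {
  using json = nlohmann::json;
  RequestManagerStub rm;
  // Illustrative path; the real file is passed to SpecInfer via -prompt.
  std::ifstream file_handle("prompts.json");
  assert(file_handle.good() && "Prompt file does not exist.");
  json prompt_json = json::parse(file_handle,
                                 /*parser_callback_t*/ nullptr,
                                 /*allow_exceptions*/ true,
                                 /*ignore_comments*/ true);
  // Register every prompt in the JSON array as an independent request.
  for (auto &prompt : prompt_json) {
    std::string text = prompt.get<std::string>();
    rm.register_new_request(text, 128 /*max_sequence_length*/);
  }
  return 0;
}
```

The prompt file itself is simply a JSON array of strings, e.g. `["I believe the meaning of life is"]`, and each entry is served as a separate request.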
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fe77492f19..9abd9c9a78 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -79,6 +79,10 @@ jobs: export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON + else + export FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF + export FF_BUILD_UNIT_TESTS=OFF fi ../config/config.linux make -j $n_build_cores diff --git a/.gitmodules b/.gitmodules index b8419fda94..82a77864f2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,3 +19,6 @@ [submodule "deps/json"] path = deps/json url = https://github.com/nlohmann/json.git +[submodule "deps/sentencepiece"] + path = deps/sentencepiece + url = https://github.com/google/sentencepiece.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 45e4dfb328..d48fe22846 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,16 +180,16 @@ set(LD_FLAGS $ENV{LD_FLAGS}) # Set global FLAGS if(INFERENCE_TESTS) list(APPEND CC_FLAGS - -std=c++14) + -std=c++17) list(APPEND NVCC_FLAGS - -std=c++14) + -std=c++17) else() - list(APPEND CC_FLAGS - -std=c++11) +list(APPEND CC_FLAGS + -std=c++17) list(APPEND NVCC_FLAGS - -std=c++11) + -std=c++17) endif() add_compile_options(${CC_FLAGS}) @@ -440,6 +440,7 @@ if (INFERENCE_TESTS) endif() # build binary +option(FF_BUILD_SENTENCEPIECE "build sentencepiece for LLM serving" ON) option(FF_BUILD_RESNET "build resnet example" OFF) option(FF_BUILD_RESNEXT "build resnext example" OFF) option(FF_BUILD_ALEXNET "build alexnet example" OFF) @@ -474,6 +475,12 @@ if(FF_BUILD_VISUALIZATION_TOOL) add_subdirectory(tools/substitutions_to_dot) endif() +if(FF_BUILD_SENTENCEPIECE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) + add_subdirectory(deps/sentencepiece sentencepiece EXCLUDE_FROM_ALL) + target_include_directories(flexflow PUBLIC deps/sentencepiece/src) + target_link_libraries(flexflow sentencepiece) +endif() + # Python if(FF_USE_PYTHON) add_subdirectory(deps/pybind11) @@ -544,16 +551,13 @@ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/LLAMA) endif() -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/SPEC_LLAMA) -endif() - if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/opt) endif() if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/inference/llama_spec_pipeline) + add_subdirectory(inference/spec_infer) endif() if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) diff --git a/config/config.linux b/config/config.linux index 93bce07657..5de72ed06f 100755 --- a/config/config.linux +++ b/config/config.linux @@ -53,7 +53,7 @@ FF_UCX_URL=${FF_UCX_URL:-""} # build C++ examples FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-OFF} +FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} diff --git a/deps/sentencepiece b/deps/sentencepiece new file mode 160000 index 0000000000..3863f7648e --- /dev/null +++ b/deps/sentencepiece @@ -0,0 +1 @@ +Subproject commit 3863f7648e5d8edb571ac592f3ac4f5f0695275a diff --git a/examples/cpp/inference/LLAMA/CMakeLists.txt b/examples/cpp/inference/LLAMA/CMakeLists.txt index 48e9322af8..b31e04b0a5 100644 --- a/examples/cpp/inference/LLAMA/CMakeLists.txt +++ b/examples/cpp/inference/LLAMA/CMakeLists.txt @@ 
-7,14 +7,12 @@ set(project_target LLAMA) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} llama.cc - llama.h - dataloader.cc) - -set(GPU_SRC -dataloader.cu) + ${CMAKE_SOURCE_DIR}/inference/file_loader.cc + ${CMAKE_SOURCE_DIR}/inference/models/llama.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc index 8ca5cfe98e..8d25eb3a3a 100644 --- a/examples/cpp/inference/LLAMA/llama.cc +++ b/examples/cpp/inference/LLAMA/llama.cc @@ -13,25 +13,54 @@ * limitations under the License. */ -#include "llama.h" +#include "models/llama.h" #include "flexflow/inference.h" +#include "flexflow/tokenizers.h" +#include using namespace Legion; LegionRuntime::Logger::Category log_app("llama"); -void parse_input_args(char **argv, int argc, LLAMAConfig &config) { - for (int i = 1; i < argc; i++) { +struct FilePaths { + std::string weight1_file_path; + std::string weight2_file_path; + std::string weight3_file_path; + std::string weight4_file_path; + std::string prompt_file_path; + std::string tokenizer_file_path; +}; - // input - if (!strcmp(argv[i], "--dataset")) { - config.input_path = std::string(argv[++i]); +void parse_input_args(char **argv, int argc, FilePaths &paths) { + for (int i = 1; i < argc; i++) { + // weights + if (!strcmp(argv[i], "--weight1")) { + paths.weight1_file_path = std::string(argv[++i]); + continue; + } + // weights + if (!strcmp(argv[i], "--weight2")) { + paths.weight2_file_path = std::string(argv[++i]); + continue; + } + // weights + if (!strcmp(argv[i], "--weight3")) { + paths.weight3_file_path = std::string(argv[++i]); continue; } - // weights - if (!strcmp(argv[i], "--weights")) { - config.weight_file_path = std::string(argv[++i]); + if (!strcmp(argv[i], "--weight4")) { + paths.weight4_file_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "--prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // tokenizer + if (!strcmp(argv[i], "--tokenizer")) { + paths.tokenizer_file_path = std::string(argv[++i]); continue; } } @@ -42,242 +71,70 @@ void FlexFlow::top_level_task(Task const *task, Context ctx, Runtime *runtime) { FFConfig ffconfig; - LLAMAConfig llamaConfig; + FilePaths file_paths; FFModel ff(ffconfig); - //------------------------------compute machine views ------------------ - int num_devices = ffconfig.workersPerNode * ffconfig.numNodes; - std::vector machine_views; - for (int i = 0; i < num_devices; i++) { - MachineView view; - view.device_type = MachineView::GPU; - view.ndims = 1; - view.dim[0] = 1; - view.stride[0] = 0; - view.start_device_id = i; - machine_views.push_back(view); - } - - std::unordered_map> mapping; - std::unordered_map weights_layers; - - // InputArgs const &command_args = HighLevelRuntime::get_input_args(); - // char **argv = command_args.argv; - // int argc = command_args.argc; - // parse_input_args(argv, argc, llamaConfig); - - std::cout << "print llama config: " << llamaConfig.input_path << "-->" - << llamaConfig.batchSize << std::endl; - //------------------------------ build the model -------------------------- - Tensor input; + InputArgs const &command_args = 
HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, file_paths); + SentencePieceTokenizer tokenizer(file_paths.tokenizer_file_path); + InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); + RequestManager rm(&tokenizer); + std::string text2 = "I believe the meaning of life is"; + std::string text3 = "Talk to me as if you are python programming language " + "and want to sell me yourself"; + std::string text4 = "Write podcast about importance to include ChatGPT into " + "the evening routine."; + int total_num_requests = 0; { - int const token_dims[] = {llamaConfig.batchSize, llamaConfig.max_seq_len}; - input = ff.create_tensor<2>(token_dims, DT_INT32); - } - mapping[input].push_back(machine_views[0]); - - Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - Tensor token = ff.embedding(input, - llamaConfig.vocab_size, - llamaConfig.dim, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - Layer *embedding = ff.layers.back(); - weights_layers.emplace("tok_embeddings_weight", embedding); - - // std::cout << "------token shape"; - // std::cout << token->num_dims << "------\n"; - // for (int i = 0; i < token->num_dims; i++) { - // std::cout << token->dims[i] << "------\n"; - // } - - // n transformer blocks impl - int num_transformer_layers_per_gpu = (32 + num_devices - 1) / num_devices; - for (int i = 0; i < 32; i++) { - // step 1: attention - std::vector axes = {2}; - Tensor att_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); - Layer *attention_norm = ff.layers.back(); - if (i % num_transformer_layers_per_gpu == 0) { - // Map att_norm to the next GPU - // since the size of att_norm is minimum across - // all tensors - mapping[att_norm].push_back( - machine_views[i / num_transformer_layers_per_gpu]); - } - weights_layers.emplace("layers_" + std::to_string(i) + - "_attention_norm_weight", - attention_norm); - Tensor mha = - ff.inc_multihead_self_attention(att_norm, - llamaConfig.dim, - llamaConfig.n_heads, - llamaConfig.dim / llamaConfig.n_heads, - llamaConfig.dim / llamaConfig.n_heads, - 0.0f, - false, - false, - false, - NULL, - true); - Layer *attention_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", - attention_layer); - token = ff.add(token, mha); - - // step 2: SILU activaion - Tensor ff_norm = ff.rms_norm(token, llamaConfig.norm_eps, llamaConfig.dim); - Layer *ffn_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_ffn_norm_weight", - ffn_layer); - - Tensor w1 = ff.dense(ff_norm, llamaConfig.hidden_dim, AC_MODE_NONE, false); - Layer *w1_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w1_weight", w1_layer); - - Tensor w3 = ff.dense(ff_norm, llamaConfig.hidden_dim, AC_MODE_NONE, false); - Layer *w3_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w3_weight", w3_layer); - - Tensor sigmoid = ff.sigmoid(w1); - Tensor silu = ff.multiply(w1, sigmoid); - Tensor multi = ff.multiply(silu, w3); - - Tensor w2 = ff.dense(multi, llamaConfig.dim, AC_MODE_NONE, false); - Layer *w2_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w2_weight", w2_layer); - token = ff.add(token, w2); - } - - // final normalization and linear - std::vector axes = {2}; - token = ff.rms_norm(token, 1e-6, 4096); - Layer *final_norm = ff.layers.back(); - 
weights_layers.emplace("norm_weight", final_norm); - Tensor dense = ff.dense(token, llamaConfig.vocab_size, AC_MODE_NONE, false); - Layer *final_linear = ff.layers.back(); - weights_layers.emplace("output_weight", final_linear); - Tensor output = ff.arg_top_k(dense, /*k=*/1, false); - - //------------------- compile the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager im(ff.config, llamaConfig.batchSize, 1); - im.compile_model_and_allocate_buffer(&ff, mapping); - RequestManager rm; - - //------------------------------ load inputs -------------------------- - std::cout << "------create dataloaders ----------" << std::endl; - // read prompt into input - ParallelTensor input_pt; - ff.get_parallel_tensor_from_tensor(input, input_pt); - assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); - std::cout << im.tensor_buffer[input_pt].size() << std::endl; - DataLoader loader(ff, &llamaConfig, im.tensor_buffer[input_pt].at(0)); - - //------------------------------ load weights--------------------------- - for (auto &v : weights_layers) { - Tensor weight = v.second->weights[0]; - std::cout << "weights layer: " << v.first << "\n"; - - if (weight == NULL) { - std::cout << "op no weights : " << v.first << "\n"; - continue; - } - - size_t volume = 1; - std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; - } - - assert(weight->data_type == DT_FLOAT); - float *data = (float *)malloc(sizeof(float) * volume); - - if (v.first.find("attention_w") != std::string::npos) { - assert(dims_vec[0] = - llamaConfig.dim * (llamaConfig.dim / llamaConfig.n_heads) * 4); - assert(dims_vec[1] = llamaConfig.n_heads); - assert(volume == dims_vec[0] * dims_vec[1]); - loader.load_attention_weights(data, - volume, - llamaConfig.dim, - llamaConfig.n_heads, - v.first, - llamaConfig.weight_file_path); - - } else { - loader.load_from_file( - data, volume, llamaConfig.weight_file_path + v.first); - if (v.first.find("attention_norm") != std::string::npos) { - // std::cout << "norm weight data" << std::endl; - // for (int i = 0; i < 100; i++) { - // std::cout << data[i] << ", "; - // } + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + rm.register_new_request(text, 128 /*max_sequence_length*/); + if (total_num_requests == 10) { + break; } } - - ParallelTensor weight_pt; - ff.get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor(&ff, dims_vec, data); - delete data; } - std::cout << "------load wieght finished----------" << std::endl; - //------------------------------ init operators ------------------------ - std::cout << "------init ops----------" << std::endl; - im.init_operators_inference(&ff); - std::cout << "------model compiled and init ----------" << std::endl; - //------------------------------ do inference--------------------------- - int processed_requests = 0; - std::map future_handlers; - std::map batch_configs; - std::map batch_predictions[1]; - loader.reset(); - - for (int i = 0; i < llamaConfig.batchSize; i++) { - std::vector tokens{0, 0, 
0, 0, 0, 0, 0, 0}; - rm.register_new_request(tokens, 347); + FFModel model(ffconfig); + LLAMA::create_llama_model(model, + im, + "7b", + file_paths.weight1_file_path, + ffconfig.workersPerNode * ffconfig.numNodes, + INC_DECODING_MODE); + + BatchConfig bc; + InferenceResult ir; + while (rm.get_num_processed_requests() < total_num_requests) { + bc = rm.prepare_next_batch(bc, ir); + if (rm.get_num_processed_requests() >= total_num_requests) { + break; + } + FutureMap fm = im.inference(&model, 0, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + ir = future.get_result(); } - while (processed_requests < llamaConfig.sentence_len) { - int bid = 0; - size_t max_reqs, max_tkns; - if (future_handlers.find(bid) == future_handlers.end()) { - BatchConfig bc; - InferenceResult ir; - bc = rm.prepare_next_batch(bc, ir); - std::cout << "new tokens: " << bc.num_tokens << std::endl; - FutureMap fm = im.inference(&ff, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; - } else { - // have luanched this bid - Future future = future_handlers[bid]; - if (!future.is_ready(true /*subscribe*/)) { - continue; - } else { - std::cout << "future is ready...." << std::endl; - } - // process end - InferenceResult ir = future.get_result(); - BatchConfig bc = batch_configs[bid]; - processed_requests += bc.num_tokens; - bc = rm.prepare_next_batch(bc, ir); - std::cout << "new tokens: " << bc.num_tokens << std::endl; - FutureMap fm = im.inference(&ff, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; - } + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); } // float* data std::cout << "----------inference finished--------------" << std::endl; } + +void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/LLAMA/llama.h b/examples/cpp/inference/LLAMA/llama.h deleted file mode 100644 index 6bf30cb19c..0000000000 --- a/examples/cpp/inference/LLAMA/llama.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/batch_config.h" -#include "flexflow/inference.h" -#include "flexflow/model.h" -#define MAX_NUM_SAMPLES 65536 -#define MAX_TOKEN_LEN 32000 - -using namespace Legion; -using namespace FlexFlow; - -struct LLAMAConfig { - LLAMAConfig(void) { - // todo read from config/param file - n_layers = 32; - vocab_size = 32000; - n_heads = 32; - dim = 4096; - multiple_of = 256; - norm_eps = 1e-6; - total_sentence = 5; - sentence_len = 347; - max_gen_length = 256; - batchSize = 5; - total_requests = 2560; - incremental_mode = true; - sequence_length = BatchConfig::MAX_SEQ_LENGTH; - max_seq_len = 8; - - // todo from args - weight_file_path = - "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/weights/"; - input_path = "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/tokens/" - "llama_demo_tokens"; - - // hidden dim - hidden_dim = 4 * dim; - hidden_dim = int(2 * hidden_dim / 3); - hidden_dim = - multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); - } - int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, - total_sentence, sentence_len, batchSize, total_requests, incremental_mode, - sequence_length, max_gen_length, max_seq_len; - float norm_eps; - std::string weight_file_path; - std::string input_path; -}; - -class DataLoader { -public: - DataLoader(FFModel &ff, - LLAMAConfig const *llamaconfig, - ParallelTensor const &input); - void next_batch(FFModel &ff, - BatchConfig *bc, - std::map &batch_predictions); - void reset(); - static void load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - static void load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - template - static void load_from_file(T *ptr, size_t size, std::string filename); - - template - static void load_attention_weights(T *ptr, - size_t size, - int hidden_dim, - int num_heads, - std::string layer_name, - std::string weight_path); - void store_outputs(BatchConfig *bc, - InferenceResult const &ir, - std::map &batch_predictions); - -public: - int num_samples, next_index, next_token_idx, next_batch_index; - std::map> outputs; - FlexFlow::ParallelTensor full_input, batch_input; -}; - -struct SampleIdxs { - int num_samples; - int idxs[MAX_NUM_SAMPLES]; - int token_idx; - int batch_idx; -}; - -struct DataLoaderNextBatchInput { - // BatchConfig::SampleIdxs const &meta; - BatchConfig *bc; - std::map const &prev_batch_preds; -}; diff --git a/examples/cpp/inference/SPEC_LLAMA/README.md b/examples/cpp/inference/SPEC_LLAMA/README.md deleted file mode 100644 index daccccc249..0000000000 --- a/examples/cpp/inference/SPEC_LLAMA/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# an example of running llama model with beam search - -## how to run it? -1. build the flexflow with FF_BUILD_ALL_INFERENCE_EXAMPLES or FF_BUILD_ALL_EXAMPLES -2. download the weight and token file from aws s3. -```bash -aws s3 cp s3://catalyst-llama/7B_weights_float.tar.gz FF_HOME/examples/cpp/inference/SPEC_LLAMA/weights -tar -zxvf 7B_weights_float.tar.gz -aws s3 cp s3://catalyst-llama/tokens.tar FF_HOME/examples/cpp/inference/SPEC_LLAMA/tokens -tar -zxvf tokens.tar -``` -3. run *SPEC_LLAMA* with `--weights` `--dataset` `-b 5` `--only-data-parallel` -4. 
[expected results](https://github.com/flexflow/FlexFlow/pull/681#issuecomment-1534264054) - diff --git a/examples/cpp/inference/SPEC_LLAMA/llama.cc b/examples/cpp/inference/SPEC_LLAMA/llama.cc deleted file mode 100644 index 6cdc44f339..0000000000 --- a/examples/cpp/inference/SPEC_LLAMA/llama.cc +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "models/llama.h" -#include "flexflow/inference.h" - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("llama"); - -void parse_input_args(char **argv, int argc, LLAMA::Config &config) { - for (int i = 1; i < argc; i++) { - // input - if (!strcmp(argv[i], "--dataset")) { - config.input_path = std::string(argv[++i]); - continue; - } - - // weights - if (!strcmp(argv[i], "--weights")) { - config.weight_file_path = std::string(argv[++i]); - continue; - } - } -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - FFConfig ffconfig; - LLAMA::Config llama_config; - - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, llama_config); - InferenceManager im(ffconfig, llama_config.batchSize, 1); - RequestManager rm; - // Add a single request - std::vector prompt{ - 1, 306, 4658, 278, 6593, 310, 2834, 338}; - rm.register_new_request(prompt, llama_config.sentence_len); - - FFModel beam_model(ffconfig), tree_model(ffconfig), inc_model(ffconfig); - LLAMA::create_llama_model(beam_model, im, llama_config, 1, BEAM_SEARCH_MODE); - LLAMA::create_llama_model(tree_model, im, llama_config, 1, TREE_VERIFY_MODE); - LLAMA::create_llama_model(inc_model, im, llama_config, 1, INC_DECODING_MODE); - - // entry--------------------------- - int depth = 0; - std::map beam_future_handlers, tree_future_handler; - std::map beam_batch_configs; - std::map tree_batch_configs; - - bool new_req = true; - TreeVerifyBatchConfig tree_bc; - - while (depth < llama_config.max_beam_depth) { - int bid = 0; - if (beam_future_handlers.find(bid) == beam_future_handlers.end()) { - BeamSearchBatchConfig bc; - BeamInferenceResult ir; - bc = rm.prepare_next_batch_beam(bc, ir); - - std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - } else { - // have luanched this bid - Future future = beam_future_handlers[bid]; - if (!future.is_ready(true /*subscribe*/)) { - continue; - } else { - std::cout << "future is ready...." 
<< std::endl; - } - // process end - BeamInferenceResult ir = future.get_result(); - BeamSearchBatchConfig bc = beam_batch_configs[bid]; - depth = bc.beamRequestsInfo[0].current_depth; - bc = rm.prepare_next_batch_beam(bc, ir); - - std::cout << "llama current depth: " << depth << std::endl; - std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - - // tranverse the tree in dfs order; - if (depth >= llama_config.max_beam_depth) { - // std::cout << "tranverse the tree" - // << "\n"; - // rm.tranverse_beam_tree(bc); - tree_bc = rm.convert_beam_to_tree_batch_config(bc); - } - } - } - - // original - { - std::vector tokens{1, - 306, - 4658, - 278, - 6593, - 310, - 2834, - 338, - 593, - 595, - 17252, - 5031, - 993, - 616, - 368, - 2302}; - BatchConfig bc; - bc.num_tokens = 16; - bc.requestsInfo[0].num_tokens_in_batch = bc.num_tokens; - bc.requestsInfo[0].token_start_offset = 0; - bc.requestsInfo[0].max_sequence_length = 347; - bc.requestsInfo[0].request_guid = 1000000; - bc.request_completed[0] = false; - for (int i = 0; i < bc.num_tokens; i++) { - bc.tokensInfo[i].token_id = tokens[i]; - bc.tokensInfo[i].abs_depth_in_request = i; - bc.tokensInfo[i].request_index = 0; - } - FutureMap fm = im.inference(&inc_model, 0, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - InferenceResult ir = future.get_result(); - for (int i = 0; i < bc.num_tokens; i++) { - printf("decoding_tokens[%d] = %d\n", i, ir.token_ids[i]); - } - } - - // verification - { - std::vector tokens{1, - 306, - 4658, - 278, - 6593, - 310, - 2834, - 338, - 593, - 595, - 17252, - 5031, - 993, - 616, - 368, - 2302}; - tree_bc.num_tokens = 16; - tree_bc.requestsInfo[0].num_tokens_in_batch = tree_bc.num_tokens; - for (int i = 0; i < tree_bc.num_tokens; i++) { - tree_bc.tokensInfo[i].token_id = tokens[i]; - tree_bc.tokensInfo[i].abs_depth_in_request = i; - tree_bc.tokensInfo[i].request_index = 0; - } - FutureMap fm = im.inference(&tree_model, 0, tree_bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - InferenceResult ir = future.get_result(); - for (int i = 0; i < tree_bc.num_tokens; i++) { - printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); - } - } - - // Execution fence - { - Future future = runtime->issue_execution_fence(ctx); - future.get_void_result(); - } - - // float* data - std::cout << "----------inference finished--------------" << std::endl; -} - -void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt b/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt index aeb3d8891b..4c8b147e10 100644 --- a/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt +++ b/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt @@ -7,13 +7,13 @@ set(project_target llama_pipeline) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} llama.cc - ../file_loader.cc - ../models/llama.cc) + ${CMAKE_SOURCE_DIR}/inference/file_loader.cc + ${CMAKE_SOURCE_DIR}/inference/models/llama.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_include_directories(${project_target} PRIVATE 
${CMAKE_SOURCE_DIR}/inference) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") diff --git a/examples/cpp/inference/llama_spec_pipeline/llama.cc b/examples/cpp/inference/llama_spec_pipeline/llama.cc index a2a8e1ea2f..f149b6c9d6 100644 --- a/examples/cpp/inference/llama_spec_pipeline/llama.cc +++ b/examples/cpp/inference/llama_spec_pipeline/llama.cc @@ -15,6 +15,7 @@ #include "models/llama.h" #include "flexflow/inference.h" +#include "flexflow/tokenizers.h" using namespace Legion; @@ -33,6 +34,12 @@ void parse_input_args(char **argv, int argc, LLAMA::Config &config) { config.weight_file_path = std::string(argv[++i]); continue; } + + // weights + if (!strcmp(argv[i], "--tokenizer")) { + config.tokenizer_file_path = std::string(argv[++i]); + continue; + } } } @@ -47,12 +54,14 @@ void FlexFlow::top_level_task(Task const *task, char **argv = command_args.argv; int argc = command_args.argc; parse_input_args(argv, argc, llama_config); - InferenceManager im(ffconfig, llama_config.batchSize, 1); - RequestManager rm; + SentencePieceTokenizer tokenizer(llama_config.tokenizer_file_path); + InferenceManager im(ffconfig, llama_config.max_num_tokens, 1); + RequestManager rm(&tokenizer); // Add a single request - std::vector prompt{ - 1, 306, 4658, 278, 6593, 310, 2834, 338}; - rm.register_new_request(prompt, llama_config.sentence_len); + // std::vector prompt{ + // 1, 306, 4658, 278, 6593, 310, 2834, 338}; + std::string text2 = "I believe the meaning of life is"; + rm.register_new_request(text2, llama_config.max_seq_len); FFModel beam_model(ffconfig), tree_model(ffconfig); LLAMA::create_llama_model(beam_model, im, llama_config, 1, BEAM_SEARCH_MODE); @@ -184,6 +193,56 @@ void FlexFlow::top_level_task(Task const *task, } } +#ifdef DEADCODE + { + std::vector prompt{1, + 306, + 4658, + 278, + 6593, + 310, + 2834, + 338, + 593, + 595, + 17252, + 5031, + 993, + 616}; + BatchConfig bc; + bc.request_completed[0] = false; + bc.num_tokens = prompt.size(); + bc.requestsInfo[0].token_start_offset = 0; + bc.requestsInfo[0].num_tokens_in_batch = prompt.size(); + bc.requestsInfo[0].max_sequence_length = 347; + bc.requestsInfo[0].request_guid = 1234; + for (size_t i = 0; i < prompt.size(); i++) { + bc.tokensInfo[i].abs_depth_in_request = i; + bc.tokensInfo[i].request_index = 0; + bc.tokensInfo[i].token_id = prompt[i]; + } + FutureMap fm = im.inference(&inc_model, 0, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + InferenceResult ir = future.get_result(); + for (int i = 0; i < bc.num_tokens; i++) { + printf("decoding_tokens[%d] = %d\n", i, ir.token_ids[i]); + } + bc.num_tokens = 1; + bc.requestsInfo[0].token_start_offset = prompt.size(); + bc.requestsInfo[0].num_tokens_in_batch = 1; + bc.tokensInfo[0].abs_depth_in_request = prompt.size(); + bc.tokensInfo[0].request_index = 0; + bc.tokensInfo[0].token_id = ir.token_ids[prompt.size() - 1]; + fm = im.inference(&inc_model, 0, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + future = fm.get_future(0); + ir = future.get_result(); + printf("decoding_tokens[%d] = %d\n", + bc.tokensInfo[0].abs_depth_in_request, + ir.token_ids[0]); + } +#endif // Execution fence { Future future = runtime->issue_execution_fence(ctx); diff --git a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt index ecfe29b793..b943623857 100644 --- 
a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt +++ b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt @@ -8,7 +8,7 @@ set(CPU_SRC moe.cc ../dataloader.cc ../data_generator.cc - ../gpt_tokenizer.cc) + ${FLEXFLOW_ROOT}/src/runtime/gpt_tokenizer.cc) set(GPU_SRC ../dataloader.cu) diff --git a/examples/cpp/inference/opt/CMakeLists.txt b/examples/cpp/inference/opt/CMakeLists.txt index 2a392dce35..3156e71f75 100644 --- a/examples/cpp/inference/opt/CMakeLists.txt +++ b/examples/cpp/inference/opt/CMakeLists.txt @@ -8,12 +8,13 @@ set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} opt.cc opt.h - ../file_loader.cc) + ${CMAKE_SOURCE_DIR}/inference/file_loader.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) diff --git a/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt b/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt index 7bab587713..d7937d7595 100644 --- a/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt +++ b/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt @@ -7,13 +7,13 @@ set(project_target opt_pipeline) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} opt_pipeline.cc - ../file_loader.cc - ../models/opt.cc) + ${CMAKE_SOURCE_DIR}/inference/file_loader.cc + ${CMAKE_SOURCE_DIR}/inference/models/opt.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") diff --git a/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc b/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc index a1f50e230d..1229ad13c3 100644 --- a/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc +++ b/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc @@ -14,6 +14,7 @@ */ #include "flexflow/inference.h" +#include "flexflow/tokenizers.h" #include "models/opt.h" using namespace Legion; @@ -27,6 +28,11 @@ void parse_input_args(char **argv, int argc, OPT::Config &config) { config.weight_file_path = std::string(argv[++i]); continue; } + // tokenizer + if (!strcmp(argv[i], "--tokenizer")) { + config.tokenizer_assets_folder = std::string(argv[++i]); + continue; + } } } @@ -35,18 +41,26 @@ void FlexFlow::top_level_task(Task const *task, Context ctx, Runtime *runtime) { FFConfig ffconfig; - OPT::Config opt_config; + OPT::Small_Config opt_config; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; parse_input_args(argv, argc, opt_config); + std::string const vocab_filepath = + opt_config.tokenizer_assets_folder + "/gpt2-vocab.json"; + std::string const merges_filepath = + opt_config.tokenizer_assets_folder + "/gpt2-merges.txt"; + OptTokenizer opt_tokenizer(vocab_filepath, merges_filepath); InferenceManager im(ffconfig, opt_config.batchSize, 1); - RequestManager rm; + 
+  RequestManager rm(&opt_tokenizer);
   // Add a single request
-  std::vector prompt = {
-      2, 5625, 16, 10, 2721, 183, 8, 38, 236};
-  rm.register_new_request(prompt, opt_config.sentence_len);
+  // std::vector prompt = {
+  //     2, 5625, 16, 10, 2721, 183, 8, 38, 236};
+  // rm.register_new_request(prompt, opt_config.sentence_len);
+  std::string text = "I believe the meaning of life is";
+  rm.register_new_request(text,
+                          opt_config.sentence_len /*max_sequence_length*/);
 
   FFModel beam_model(ffconfig), tree_model(ffconfig);
   OPT::create_opt_model(beam_model, im, opt_config, 1, BEAM_SEARCH_MODE);
diff --git a/examples/cpp/inference/transformers/CMakeLists.txt b/examples/cpp/inference/transformers/CMakeLists.txt
index e3cabdc324..0aa95f1058 100644
--- a/examples/cpp/inference/transformers/CMakeLists.txt
+++ b/examples/cpp/inference/transformers/CMakeLists.txt
@@ -8,7 +8,7 @@ set(CPU_SRC
   transformers.cc
   ../dataloader.cc
   ../data_generator.cc
-  ../gpt_tokenizer.cc)
+  ${FLEXFLOW_ROOT}/src/runtime/gpt_tokenizer.cc)
 
 set(GPU_SRC
   ../dataloader.cu)
diff --git a/img/overview.png b/img/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..5264e2d41abfccc0a621028b9ba8cc6746cc479e
GIT binary patch
literal 209735
[base85-encoded binary PNG data omitted]
zk7{e$3AjV!wSRp$EsmV^@m&cSpgSmXkPoL2t|sku=w~AH?1?iAB}QndVLI^7h4irNuv(m`k#0FJi3f(caVB$*o26c8;bQW*TU2WU(-KM~8RR!y zm&gsP>gJJCLzAoDuW)8E+AAw0rX6)ZGuOh>Zh(t45nnYE68w{Q8^%*5JH3e1 zk<`fri#g>z9rHF;2>aA87O%6#_{8nn7Ml zqorMbYAzzf1@mhfmk`MfZ(w>GmOv1#(7U|xDbe$#9JG$te+R>Wu%qc1>dxj@7K0}w zMmSyU9K8E&kbLi2pgv(#5L#iCanxk9`s)+?g^}<5hu`9TYq}D^Wl2QG3UhL3W4DY2 zCmx>zbGekO+GR`>WRM)OI84s=rd(4b+tXSlCN6iwV1KR zOM5_X`J92o%mmq=m4{gBT?!oR=h@STpag19^D}Gld->&qX&$5OgSz^8(uM~4g&M9b z4-@}PxW6Wa7E<-6gi{-bQRo(eFXlpB;0cVmknXi{Bi zR`xPiWx$g>C{IT7{lpjNjMcKIG@yPNLoMdAZlN=f63oAHbO4N@6~Ur8@qhx z5i>M_oZ`Nos%Z@67jVksU5TRR%%HmpQ=3ERyYNPe1(3gSTk+WR&CIA!P*7fv1Y$Iw z*mOo_@#a0TloVTNmbzS||9G=2gq-unfw| zn#|cRnF7uckh|UMZqjGjZA=W=L$%*w)t_mA+X6)N@-YHV7)e9)CI2@IVB4qq@$kuP za)^PO7#5!$S0hnf(=>*JhAPAW?|x_YCwS8wGNB8B0Xha`Z9tKT2pRu#u33mNAl8+sAXjsZo>BB37#{kdupi5E<54EQ$4nqFHP9r20O2}uH5fM*9 zTd^5E4~2ok!gsm>aKqUCt=Mg~585bZ$9P1<+PzNEWQ0>-ZnC;w$dn=KgFR2Rih$tCHg-{KI!r!bK8Jc7~>! z9Uh!vkiUt3<1$(9*v<&XgJjG}ocVB4b3I|}{KBuR-$eCh8INl^=;*keB2WLiOVTEs z!cqD;jV3~J1Ro+sFX}Zy`JA9_>2a#YYq7&Z7nz(~bZc**0e)eNi=*t9O}1Q%>b)}p zbJ(PWeu>*G{vEpQQ#%MPC=->fn9ae(LD)PcQH$>{|JOL^Xf8bvX2 zM5q~dG$CvNW&b0)QG6(J^jIVrIldS*j9T;j^u@AYSm?)i@N%GBGPipl#J1bS;vIB! zn~HG{C2%w5kwJWVv63!cvs3%U1pI(vUYI6uKg%B-pBDV#X$mrMO*0VKL0vww2$E0K zTAx2@>P`@L@(Au@?%^1ObCu`6ZtAgE-6zuGrK#|eMuPLkolj0^U=nb#g47Gie}+cx zx%7$!{Ja$YB=-(gbc z27yrisYfK>BWd`{nBtc*rTl>WV@5J1+l|&Mtj!cAu-SY~0ysyWB$tHzvVh`lvZ}Hv zcMGhBSilZV*FIM5t8{Ha3!FyBi=#;71agi(ojQ1fTvOMolR*(jcvC_+t4_)W_foeW zj(beJ$^Kq8SDA$Te~XfCrr@`U{t*4tITKy>$C)Zp*SC@ZOO`5r#*a~LH-w18 zZr@^&Yc8+pY&PBG>D595L^R({yzHpwcB`OSS71}0N9TPN4Maq%1=cl;Fq1Y=K^O^P zC;z;cU1|N0RO~X#V87H)Mz9VLijCC46WHN1%dMZY*m%t z_lf-??lWt(B{D--Lr)UMdM1wSTDVzG$=kTsOU$RM=#qYLGam>g1|sgwKB8Z2qP}vr zxS*TnNQq!#$K!^S`ycqgNPM$3HDXj-#82wgDMgD?46jw8bf;EIoUcnRhHq?Kv5)9J z4Rg(-dY>v|JQWJlJoP17_zg(8MjR|B^1=iq#vNPTF~yBM^_RQIG<2>F%*yvzV@7Af z$a4x6X5@ec)e^y3siET4^emqdG0g_%MI^%#|G0D$Te4mu1#SdUepg5Z11rMx-vi%q zq@g*P%d5Ip{Y9eS3`sLeL$Jdo;UD}Ows zfQ_oBDXDPE3 zzdWzzr75GFI@JsztRtW9Sip1#=QD09JJ@qmco5Vf?!FhVk2h|0c0HHml8W*#?KWx7 z9d?H7t(ETOjf)b>pvdVD#p@M&$EaStq7vnR818fn=KuKv&3+uoVYi`R^BkZq42QCV zXd2Az{hq|FHoPt!ufFuHCJGEe|9uWGG`%9j_zQZw#n&%+tU#n3X!YHAuzNal~P zr$e_9PgUg*vAC}QMz=UkD-v|AmF2;4b=6YLbzb}DR^GQYFWStA9QN6emT>xCU0OG` z_Fs0y^4Yu(REwAbtoK2yO(QxYc_>nhcvUtRI&h;(A!9d4>UkGdOUYs)IpVNOsYRf3 zwJaECRg8pQnz)Er#9iRo7*EAI=+ufE)ft)^Q6d*&382{#1GRz}eF|x9hKd}Z13q#Z zvm8+7%4@<#zrUylv{*np3Q25aEv4ED{h7Hi@_>4b3qxqg`J^*l;=hm@BS!S%Jz+=2 zV25DFWRy#;sw~l*pc0|qX3=b3P9LPq-uvSy9K{rI7iucDMz;sz$Zw|=aF?DGcdqwV z=P`(-KG7K}uLIt$lm(GQI0T;Yk?tXdr;(D@Xn|Z~<`qBTvhpg`3$A)A^GBF7VqyQ8_} z$gsdXEjwB48hD;td0t+_a2b(e+G0YYn}NIAcjee#ym4JeTp+-C5Wh(8Cs&8OIsPGJk6BLOe!vu zg6>fwoYBQLulms|=E*?Vjhczzyp06a>z{0&0yJB#lVLLqA4{TIF=I1-6itzVY_tsX zm3v!!7+{tPA@LO{S+EwypH#^@2t8b_%6hD9oS)I&KV;U{2=(?V)x-$t?ar#B`b2~0 z@}J+BB~#5`3q6&~xQh~SKD-|3`2}e_^WIm?Tg7zcBRn=Kx2r||&L**fk2+;NSL6z> zWuICN(b1r>+}7Bt*zZL3aLq|0WkdO`7*ZPOgwG2aZ@b)}BOo^t@}tCElpX|==dC$u z1rEdnxlUZzvD-v|8r9{LoMx}hQuhJZublI&aAYWI8E`{i09jrQX&6)aO%RPP<|0h` zJVG`h832uhN5H9}C@y;iiu(Xy!wwZKHb85P#kk~SAQ6k1_SOD{+noCS>U*ieN5i8v zd3LNX#WGfB#hmv#=b3W#M}tcR1rTVKUWvGvJ53)oJUlMW)){Y7PA+u-(>25ND+8%n zSdl$D5$fvfsoJmb*^QpQeVTc8-j*d!JO<^ub7C*#v zda89?{*gS_Qh4dZZwGQThjZtv7U3()w+s_B$<6RY?=%q+qp!?yXS-SIIm%@v;MKZ` zpk%{iIjr649ZY56gVPLJ;c;r}EmH*O?zg*G+hwY&M?&c3e70Vs>oDpCP!CsC9c<&qe$rrb;Dx4?GN`=onrq%)Rh@E>Oo@3G;Q1<6=&(4P8>Z+ z{U{c}Xel9cdmJ`BLxxg>Y94od!@LsADdrhO_Tkk5ZNUA(i3T*4r|k>SDirgiwWW zc|1<1PrVBHM`7+ZR%TgpYitwjAZ^ikjDrhGfO4VrMQN+hscM`ugwXkHm_sRMi~x7`WpGJSfdqcQ&O6SCPsACBNKaSQc>Se0iql~l-;3= ze~OpJTyG7 dH{L!59!`IwbULM(F>mmN^+taf9=6-Bp+DstupoCC2|2 
z#V@_hiMgACytw*HyQ)H2>*5Gt8c1WAVE4trAEplzL2}i=sp+ftj1S7)R53Y8gZ2F$ zVKt(b*DvCL(WTtvZd$&g)KJ_miJ1_8y>$9aT>xj`b0Ngzk_mE)nL-1V=0SZpWyN3`xz@%eVC_Oso8 zCY!84b~gi#BzuzMP5fPC?!&PJeaD)y|DGgfn&cr=}L;_c0 zk`xf(Cy)iDxXS$>0Qx`$ze&EES=vag&I2Vqs-2`W76(Ebg1WK>zW{j^iYo+e?pR>S z)N0cU*IKOHIKE9TZCW7iZF-8MX9I*g*`j(A#S6h~3t=aBNpk3AHIA&v>qv*aLtUHp z@D$=hPe~e!cGc@hhN+cUnjx$y8|b>@v7!PRoGA(Ma(?YhcswB;9=;3{rct-D6B~RV zRYC3pAaiibDp|aIjq-j;0U}QT_Cd@Y_utz`KSW+jM|Z53{fCYU`J%1@oF%C>(?s{^ zauayfPGGpzag-2E(NU_k?gwK|IX{_?X=h?Z8H zb&$-Mb7F&fb;So6<8ll8dTjgc`>LHsa`)}!4@HL3!=I zWnV#c+)}s%cQ9jDtua#o+PtN*gK7X<2{uvas-i9&foEe*`y(#s0#d5)mkfA0GHbBC zmyVLbuu?VW9{Cr~!#Z%#@Yvn5ogd(p5)&cer*=w8Ot?7KX)bQfZuL~YarS@;+apls zShxAzqB7$+3!woMOtxyaC078A+_IKU6FMrurtM49rDnT(OQrcEKff-)2iJ>t=RwkJ zM8$R4ES14m(n??Z-zV@68VM5QCOH6tzQZ3wsMT@t_N;--Hoqf`;O4`5SqwE9?v2Oh zeIyB3(KN5-BrP6&Ph9G?P)(DhH!p7(lIAiTsyLDgfaUt3O>*&{xnkkyF7^67CSDzi zf1Qw^GZ>yg;fPC)4ZQ}irkYsUyQr?E0yyQ%1_rWI16-F#FCKnshopfx!WLy}bnGwQ z?e7C8jBAeXq&VX;JSW2ch+J4V7X-(buyT%&2LN7~n3f9n7-(2i&TUf|oLq57=5VaQ zo8t!AOnmPf3%-uVISU$my0~?T_-~jePW4+$*Y}qg94Y!{G}4Lh{!AB^)c^Cp^L@Z9 zddf{i4xr=$G|Iz zL^a_0N&p|m$%S7_dVB=9Sw@LhrvV6k3Kk2HBNO}%*!j1F9^Hg_$V_VYd`N1w<043= zq{KxdW0cN_U?#=5;45+pJOb&+9%l-#OP9u7rSaf#cLbm-Wn};H-}t*++_G3)n{<)R zZ_P72RA$>dJ{l%*foG-Z=>LgN|FLD(uCbxYse!0{e@QCtwI0LXRRf?2a3_l}Jb(Q& zjU~U3D00S}UH+X!o;!eR+e~@rH+%GbT!WUX{mk@KJm1WLhw^_Ci{CLrAWZ?rbb03w zv2pd3;6oeb%wJzi$Jb}c`IXa@!At&}218$xhQnWxlfQq6?2u97bMKQv&Jgmh*w<<( z?I-*MAlDmNBvWO_heNPPp|fvlbr~v+9(Xm!mv_55SePD{mp9K-TrSaQgCyTvsA~t? z#j#G)9QEDlhqTBb30m-`ICXwloH~?TL_=Zw#qV^(3$0a%Tm6CK)M*6C)<4@^rA<$Z zkCyYRrc1x6yH!}MR`0dlE3yAa6*bX+>K<+PUHQIu@?+{*nHkJ9+4uDm;FD~sfUqM! zzb5y9zts>F3puh`F0P-2-}MAJ`s-WjH}B-5voEwdjPG}c2IAu4#2wKgQ>ILjH{N(d z4O)5F#K5h6?%cW3w{Kr*)F}VeCu3UgyW@va5k^Vwo3$Ias%%#D=CMNP#+nWPBK$xX z4&aMbZDzWvD8f~R&@=lb6&@++cYri-P1^z?0Crx!;!+9|H6Jj%+TSbAc)$oJl7H=j zN`=jvu{F|YS8v)`TpP4eMh{o8V*(N46@EpMW5OhG+ftC~WQcc{K_v#-B!mZ{FU_Un zgsBqdw-2w~5*2V7X}QDUbps=j3V}Z9w1j5}Y~o+Ges$~r_^bP`$`bJ5X97OyVve-n zHN?rMxp=hbCNA|`BKuUTBu0gbEx3e?V$^_91GuJO$T0jUBRxejLF(-TAjYF*cL_he zSK_W;kmUGSbxqkI8wNdS=DTlNy49uLB!1YAw#)>8YPAut+jBpF$**e?p_ zvqW%K(b!823(^7Ui3!SkC<61ebAxtbVQr&~IeHzMz(BMF=ao^68mP(|pdlU$9-Hbz z-_Z~APQV&{CHZV5MqHBv2-!0S`G3>V?}Jj|v|RmfF?jvzNWB41tFuo1@v#K%Tqd>d z8K#J{No33gRt;cD1=yN|diH<&lmc3P2S1Mmo|%fviWMXxG?E}q_~||3;N4gnKJc3O zubv^N!6DNAm7kPHPu+fFR9k14O;Mpf>Vv_b+ttCM=T>|VWBOOAfrVuPPSF=npx;Z? z9AZ#aiELW9)PkqY_Y(ByL=_(8+*!Bq6Mt@(*fr`_B5f2@NqSPeWa62Y;CT3{T@tu! 
zxwJ(%S-nl7@AprsAYfa2;?4;wQZ!%}7U(i%0Uxw#DLwuvUf}x;N`8*AJ}2^Q#fCxhsD}c??-qqkZ~c-jyUI=|)@WW{<;YP9U8UgXh<54#3q@iHF9z-ts8 zE{Wi<$i%8z8Xh9lgB;U#1Gr&0h6%#&ERovN%+?X9eBH#-rMfs^WzDvFU9oekiPbtw zHO?5U-d)AYd`)->837yZmtcIqTQ$JznI3m>p@9;G@T2I<1WK>N&AqWAA@=Sv1P|=y z07$bvY~anpYcC<>id|Cib$Jtnb-{?lDw14^&X+CPA^7`RK5O(f}T;4cOvfu^e;N3_) z2uH$mFB$U&y@yp*AcN;v&#d}c8H{#b^(E;BxZ404+aViO19;zAJGmFN>&Eg%4OAKp zBn2GB{nApHnW!uz;NL$a6+mjcmw!~@h-{MpK+wK53a|;s0u8~MI=vs2=3_omU1Hc2 z0h`=ng8f}TTqc%wPI7tE?>P>qfjHT#+k?{T=`YlNhVvwZ1*mNr#dZ5WuD1PG|ES*c z`cqIHB#k9lQ?va5Y5U?db-J|^xE}%7l$KaH)ex^Ire=dSsp8W0F=2);+jeWj>Xa(357GY;v zZ=)DBP!<|kG5;48`a}+m^A|3wj8t9-gX-U_yPov4Ec#K{Hn_rD6QN2e5lz9FqRfNB zo_b3;3oC1}uifO9{iU&kD<}o=>(uM{fTYFUkmw7?K%mNW+5l~^+5(G6>U19ta49Eu zM^XDoz$M|*5jimZRXiYc7LVp#!7pT^8^x%ByHx`WCrFG814wfM>9F@o)RpsMl98mYDe^s~zymE5 zJXW>$X;*%N@9Lv}TMD$U?x4zCu+wnNISzmfijRa>VQ>VOn}NP}k{GrdumX^K*_5Zp3t zo|~e;8kX<&(TW^AH8!rb0QgMeAtmJ6ix0k}eJs>_L3L>3C`>;K)mR}5nLD^kt!&UH z6LVH-=tadpWY@G;A!{gpu}*K!!*>-6@Ww%32^aQG{Pwt{we&&Nr{r0 z0;3LwT}o^OR?R~t9weM`AR0}&5h7VQM#ABYOcP1NBxn?)2JTi3Bu9rKbl{ML9Nz+< z<}7p2DUgIfi6-0z1-T^Q8cMlwP1~*Bak=Tk6S;!`55+Vy1>p{UsPipb5?6fi^?PwW9-W zfY?IFw?$)uea}R;L)K(4BvnTDN4Gy6nawirVmV9>%5yIj^T(aeWd;qE*(QyrG!Uti zQ^P1_qXCBcoSgT8E*L_CI+ouN%HJ}YqV)$a9hjxRz8$63>Vx23bnVt&_%hW_&$39+qUSgJssiQkhxdl0 z%ZJ6rr|r#W$5QS`UVs-pYe;;}7x|bHLa(UsTup!W$jC|_C(lmzOI(n@k&~>HzoW`l zeUSj0(n#yw;k8+~z|&!z)vIi}Gxpc00lfw?Qd8u{g`-G68>Wm7{edEG@Wx^RS2$)p zae}8wMG!j$+B^f$W)5P-j{_h<76uFR>=^UB&7z|An}$|Ow)JP4jMeYJGm?aqqv5EC z9!ppRS6(!9m&OLXY7h#=V+?nI$4w^Y5EjCMyo%gp`&^A@z3d?kO`U1SBStZ5;O^5v zG7OB%TNg|4k^k_}o$XPReN`x~s9Y)`IKs%oyd-ydUSs6$GQrgrhcL&u-~fq7=#=k( zCjhkGT%pZfmo6LBQ3>3;7T$X)$U1{`;niHkchEE9-VhJh2xlM>`kB9flJJu|)ioV? z@hHZyMXJLa&zd2S2xD@p-4Gr_M=>YeYAS&9otTXB2vTza|Dd1h4O@hpt*hB7UN7@v zgmbaB@{~>!=agsjcU{}p_pejLsvI~&MrcrXd}DTn2h3uRtyhM=sR8FmX@7YHVy^D^ zOFUZlg;$2lEzOa~gFGn7Q6Y+8IQH5(@&9|eDyMlG+;UPm>|j4BXkz-MZHpujo@We$JN(ltDs0Lb2^`8>xB#4y7%hRnjuV%jPfE2$-EmJS{U^fQfLF-j zjVc6?TtB1~sMUE0_?fQj_19q71Ub9xTctbd*QH_m@>ZR({R*T31=!F?ER2*!T5sq2 ztqRk$u}nE?V8@;Vu!Ym5Pxo6>(3NAiDpb>H@Zz`vuM-J50}h*fteishn8t+}!UODE z_Ny>}4ZCfJ&&rs8w>#Nd{`JJZl(U z#Ouf0!bZ$3ElTXcmCq6MT1iVy6cenhhaB6C>?uu@L#3C%fJ-iZtf}C)YKBd5)7a+c!WJ zPx0C~*J~{yAO=4<|089bUf%q>n1ers+*$;2N@F262_(hrCp|-)d|DPtAMEEtLzR- zTf=`ckL&Xz&q@zNJiQ?nsTiCK;I5al_Dk7o5ZK%%=buZi9P` z+g3Fif?VEOLY7SuOAlXh?e&yYYoPm`mS+D9>+bN{ulQ{zt6dt`+qFHb^!l^+tS=p2 z`5C0d@dzm{TR_gRpT_Nqp#dg@a&T|}N2ktvPysehPEImm!USpDxZoB2>MeVg_$V%{ z{Y6dAo}Gpu6`ya3*I$t;T(y3aoV##Y-z5upDL6YhBHIzUS-R=>SEP*%Tei!CV_#5R z`u_6|W&GH(os27zE(+?4Ic@}Q&ALbuvJ%m=qxb@&#l}Wl>NY8U6{U10J#89~cnPaJ ziDC&*iDbWJB}${&YAuX*X{<41qQSr?#TKbSPcEGzX)(B|S0VZS=Q;B8+y!Dzx1wB< zoRT8F?&&OZfBZVvk@EexefNH`v9^|u?Ft6!gG0j=Si|1FF!2Kw8d=-xrb~SJj-v9l zPOg`Ta|iVVyw(V#al#y9Vdnw>lvhjfh0nwUL+P`Ug7ms1=;R7(7QmaZ(+9+}O)ow8 zlDicXAk5WiQcv9Cy{|Ve2 zoAs9BzA`+oG`86n+q$jQ%l*Cyl93Qy8qE}|PKH9+)oM_zb4%i+g!6k9h(!R6z@)kv z;eMyv=4WnWmdCz~#3gp=rfB!wm_ZZWizm!H=?@Jr!N*Y|A9Wb$1b--r)>+ef^LP8D&6Zy5Zu@-MH zPua0{iBz+_nIbPo!P|V;weOI;_P7A(qJ%3FWsJ$_^Rh1WpQta=p;ZZy^luil&` zU0s|MU_(%)Q~TBuAD;+=ZdXEp z&07=3%i-gvBr!2bmjC{9vHF%%3sv(|20{R0ula-gGE#3i6nCTyQ?PcfiIAf@#jT>; z&Pzm^+^EZ^5W>($@yy(=Bi4v(Cr6DLtSpVd^mgYI<~R+|WDt=aU-UIx*R3Qy9xGC4 zBNJ}3Obc3-#L3Z7euF2CMXqbUzE8#a{UjwPEAZFW)&?2Jkm;)cp$q8VjVol+ww=6TS*MNCNbA9Dy-EKPb?1J z4aLQ$nOxhqQLY_Y2V&$fES}~ZfrUNz0kA5bgFgrTv%uKxk;1!X+ddMzK0tZ%@Jzl4 zd<1O%Ln7gc37= zOo?vatdBT#8YO17l?RZo_?k8Lu&gvtB*3Pk+b>lvuxf^C0S*atScQHHWxPiE?L)4>&bJAGf;kemPlvo)cRg& zIPmG>R8g+y*t*t~%l|Ev3;!;X`u#>D#Gqi{#@VNZG#vV(1mK*^DW%~kBo|zv;G&QafABeg8ELM^Or+MPpJ8+U8@$-wpH^S 
z=W8EW?g)V0(C`~lqq>`T*YcEht(zCwV~~c=-h)T9YQ?Uat;EGAsJ`fT=Gq7e3zt0y z4$IYR!SFV+h38H~Y1^`yKJJ&r1Fl|^>mi{kw2DO4yY?TFGv_Xd1H5;7V?OrsteLCU zs`W9E+0xQe#3T#X2ttKOV3`~ff$&YbifDvOpM){^8Q6w?g>Y1FCQQ%w-~mPw}0AP6pZtJH1De!UEwTbQ5smtiddD zu#TO~!*92ax}_o(O+ zCk0N$q6)(bNk&Y*C!-iOV5os20XD|8-iE3xnQeDh7a7>Em;4QW8@d1(0xk@n`f=8L zS+RD5(CFfmZ0GiEs!o<37fo<2wC0~PkP6`=|=>Rx>FL%A4mMb*JSiD-Z6Z!=(w<-MAq^_5{$$yiCx z-PiqxkIDO=eT$WHQ&}48|h|MvkBr^MBUsOgpb6-%ZUd z#JOHeNq{k6rda^RXn@~?72Jj^`ifIQLu+*ZI0^sm1v&NCcXIJIzzq$&S{?2aS7d|< zIf__z5E3RJ1i~yU6+daF;)XGXxP?qjr}-I|RGen;;WuWES5_9xJ4>vn*QhOSND=JO zqAN1cI4WbXs(INI^nHB%^)y+yY?WA9S&ILm|MW$pqGP1jkVn+D`ootW$peG$ld&&O zRKQD9f*!4!$;!2xBsdhA&!B(KBeSM{AtQ$l($~Q-+Bg623)#4J2LO>U6<)(Rhv1R5 zwUzANupHibiL!O)UfHr^kE$mP+$MVfm>fTKlfZD~(1GguT()Yxyz$}ZDr}04uxzs? z4P@i8`T8~uH)UEAhSap@!W>ynZ6Wl8}s_bjY@LrR9usp`^TobJ<_u-IM zg?3o~o&`XNLhyxRD(OP6yW5GSor3~(GC*!`W^N^M@Twv4B5MJRBlgv6;Zk;hQ5-JS zK!8qXUxGHKCb_}f?3UR{7Rp2D)njIYN1vO+%Pz$oI43IupfO^ljAGP49vX0M+zG~% zlWcppKfJhxBK&Ev@(!~Ai7mz21>i5PlhoLVatCZ^s4_cLt;eV2F%sjh!fJp9G+!i) zC17U@uRx>R)Bw4BNXDICDB8`E<+DZasFDI}ID$(XW`lES54f6kl<0s+!0op{M+{!K z@8cy}x3RLyuK@+v1Ox;~>(;Gh#*7&TGvp^M2TFmvqWX}J*(Ac6Rfy1I8;3g8` zQ!!W0NWFfKiBIo`#0M*s@!(^KMF>Pf@FhvW*piV+hO3oXcvwn9=$S4%gU;Gg85$7h z!HgZudShYlAS6_+=H>}P$hz>RsV}xJ9yldac@+3>_(Oq1ZjE#vB_%W3@~wr}AAj?4 zaXOu|H&)>AsOh1*RvBi(d7xh1+BrH(UMlj#TnP-4D6r8hfuYN5haM~BucA0RIUv*$ zSvv@Ld^A~l+|x;MWznb|JbIk$M>r;d*jfO3n>T4Fp-te zyba*kQ0VQ#a8QE91bp29WXEDW(J`^GyXjYPs~k9TT!BSwgVL>Y2ecWXuJr=|h#wjE ziY%EkU4>KGf#-)hnDo^&HOG>di(olD8wuFF{LV+RcF~+$8>u3D65xqaSTc0pgP7(z z89*BiK7O`fT3fuQ!4t+3yfUeYaruHahAQJ?Lc)z8!(Dl9J0LR@xhfPOWNs7za6Y;)h5D1c;l(FQ5w7Mn$$ZG(A4TSNO9vg;53thEpR=|9p46mM~NPs); zp0~6|P95@TWRc|90JI^#!+FHEW<#-u2aZvU8puln4DIPRbvHaMQpG$wMYaieq4a5} zF)#A>seqfjDk#H!3Fmf;g^QP1y45MeR;zAP)oP#L@$0`yJRB^{#l=EtK)Gdb9>@c7xry0vuK@IS^iy5N$EU$?I zZ1~{K^ie6`J}F2V41HOxP8U?hczoS%%oc%itv^W*A(7j6s6~Zv;sTWmCb-ZiI^shb^dw3PS_|}KY`1@+=FUYLb zUEExBt6Btl$YXE`fDSn_S~YJ11FpTyU%V2$J9)3P<{HKwedNPiT<~RFAF=31@vFi$ zs&nagt`5*4Z zF?+sz^wm`8*+c@bT~~n3-vC%OFPVY;ddr+2ChNnH2+W>3b542jJ^J8K1pw0+_d9!D zmGN+Q6L&Wq;a@sH-huslvD?zMQ+o+Mb3knme|)@b+qF+^=l+w1-@pqK-&a6YeE?C5 z0nBJ~+&u#xRP*bBBgd3i&%M37>rZEhXT7@KGH2RXD!hvxMh6k{6@Yo8s!ArjRuT~6 zqq}Ad5Q;noOj2J9M^~xdteXTM+XgQe0A@I)dh@Q5g1L?SRMnAPg)Z>P080!3k36b_ zQJW!2496nyQ^12hiGiyl%bq3`;0JTi7&634pD6nC0-yu99Hj;t0E ztQzI+)BwFePR^eM9)|3CfyinAv)jHn9if2rRdzat@uVlmtIU9r@Q~KTrNPZbv6z5Ua(4N5 z;0MFK9rppRdxlH1N8iWXY$-cG8is6Pk4eDR1(J*fKBm4thq6%@j!4U=C&RO8uvTS- zf9Eq!CV?g{aO5FeI80AY(35NDE<)06hEhGK z59rcfzWwxlwZBEvM)LQ%jdB6J6C3{Bt^k|e|14GE06)T@S_vS8tKoBg_(~4~F(XU+ zHZ9fphJP5Y@c!#B$!jk>tvDY$ukI*)rKmSfT+6%P#=Q6Z8T6)wpUAqqf?z3#6< zGE6g)L28T?!U%~f*krw>`bx1w7pekFRTJoy*H3Z91zDOHCN-dMPgMmuZs=u~ig_Rw zo_>@if3H>77DI`CnEa6*tl=?wuk`MwGr-??X`Jvy0^uiLdQ*iV&6+k@+O%lOqPK{a z;jW-;bmuW&XaMt=hezna8dkP?;cV#&(B|6p5Lvr%ix>hnbuibioIg_^{>0Eq0yb4y z(lIwB;hJE0VL`tL9rJYVDC(;Ec7m$v^U7VxBUZPXU!sw38Z zjU+@y#qygL^fW*9WOamTv{6EDv!HigT)x9u3BuJ$HQ3>3)R_9@(O#~RYIaYP;k#(wi ztM2e7vMo#(D@>VA6RWOgScOsYZ8}2Zwb28w$kCZ^;P)OTzJp)5%^9VCI5qN`q=sD< zyT&&G>(X!4*g{3oKqf*)PyF`2@>*;)4mJViwYb2u0B77(7nxf9^2Xm37}I#un(zLkT;vUA@!?{24U+v;yfF zj(cG0cmT?F(scCu>U{D6@j0*FfTtxs z7aT8dny1E>jLd{wfUwLCf#i4zj`%J73~ArDLPt?*EUm|V4REWDQU~`PojFl%oY|+& zHOIZsrC~ei@ajx;JQykC&n)>$ZSzl?=cb5r{nq-=+D3%m9=W=6DZ;V#Dvv4FbYzFg(nX6#H?52UVB&HZfq=!MHr$CZ@7+>#)d)z z^>KIu5UineM~KAJ*mi{%4!v|rQ_RdQ)QW=%QrTK&^EeczJ&wWB+D2>(>0QG%Nc7w5 ze|d9!6sHe&dQKL+x(?6%KdfY5RX1!sP=gih{DclOjA}5+Oh$H?bCMK%POihCVyJ_u z6$pbZZ4rZ94c802X%Nmr9ttHeLNnn;n+{^(v;+V)AXa6l1MEum+s0wVf0K^73Nlvn z>sMviPauUJaRgR1p(h=|&f*G*do{3zb(lh-oLe&Sppud8-E`s76;(Ysq8J8QmO@?~ 
z0ygv#)PQcSeR|zlkysbP@J~Z|^3jocu!j4|>qEiMqsmGmJYg~bW~xCJa1of}H{;#) z0dcI;SkC-4MXi{c;;L~3Hy55VCHnjktR5!HmCb)j!{IL}eO(5?nwHI%mX`uK7o;KG zbV^KQkpLUkWsi`?+X?3ZYscyUky=am2?8`cgHVwP0GcET#lBF#y^3>$^F&5+3_Oj% zQG=KEopX%V2)A-;(5BLXHMi4`@p##3p#GroQnw!#oN{^JlwA*Uwv_0H5J^RmOkyPZHs`TN*y_ zD(gvQ?x_+_oxd;(U0+BjnUR?;=g)^D#MiYMydQ7xcP)%n6WUlBp*J3!knI0nG0)w0jin=;VbgHHlR6HDJMe5^-*54j&<2#xyD7$Pt%Vrn zQ>H9;U*54o62gPTtK$HjM@<1hP40ftTZZ6@0BF&KE~cq?nIuH2UAfh(56i%LJe73- zSAZFoB}|ddl;K(Q5aYorC^R&_4wKDw7-r_#OkSNb9IfpJ%*y4tFO=SU=`UWv9}3A19gyPE`?-| zQQg{MXxf0#Hz7P&d8}muc;?*V-LAic;vq0A9eiN?>H$h8MTDx5O=X;#q~;$Qg*r?C zq9)_v)~-gK($OeEq4?aKIy%rM10lkKR|c~9QAmo+dBIQ%Y8xJU3}>y3auBQ6TnYCb z@{)LW8&MgDP-XgDkvKOp;BjV(H1suNiuDUYm+Vt@L@<)(B4&2>z-jMoXQsW35phmNiIQ#o-(8- zw`gHfrx$*qKa>UzG-mtL#HZgF;OlteWRP=42 z>PHkpzZh;tiN1USo;6#fZvU~Oc`A_{|I)^}k`Q(kU{`Gw>P6d`UNg)%mOEO7AP~dU zZk*jO1Z?t>W3%5D6L^(&`*5k4XM376OP!&lCB}+Zw+BJ~tkX5a$`Zl?42u}IE3pO? zU{lzK)zV1o%^x*Wcsc-eyjuVprmE$t@T7NMlb7E9P^FuF=i{%`Dmhnt3CdJM8S>FE zWJwDxF#Mfq{14hbLu1TvTRUD_?}mMS{e@?8cnWE48uoSg{rAeZGiIq!DQ3W8*cH?5 zGOXg^5jSI1wH14kqAs32U8^d`JI}390t2`tbYBv6l$Tx-BSV!-c#n7HAZ#LkkD8+N zk060Estm2H+f+c{!5p4SwK@(|{kXV!0m3_I)D@wU+q6+x4qE^yrI7cL@vPwiFG5>IO6-3*AV*aMv$ZUF zzV+(nICeLFu!Lutg)Oq9rD8=-p`98mcT*b#Ju~nC?T?i~-@(s{2Oijy5aMYC?>*ZZ z-g0)`EX5IHg=@;A{U9*~5bFQWZ1vEs2TX5Phb_*9+<_2JCl{6ZDaYXw`oP~&(v9m9 zg_Yr3e^Lg(jQ+AW*r)XY#}WbfL|*m-$Q*}kRJR%(TG5L5o)m=)X&`D($NXlLve7^? zbRG|4eCLMk^(3kJwR~;QD&>9a2(s+56lBAuM2azqkT!Ig7 zP}$7t0vu`f=m)B&1P`3xK^gGxe5u{@A?5W#&{qRuxDeU><5dz2UK@Iq5d68aZIRFx zbZ_3RP@|=1Bh#Sg=4R16a9D|@y{o=1Tld=f?TWBLz{!jpYXH3Rk|_TpYIo?7&7y5H z1d76Y69Jn#eIHX^MFhtjYBv?{?hmR^CNm4&qPZ$uz#no{CIbAa*ateR&!3Xl-gAYp2o;_7ZRa}9OkB=g^88u25N|N6YnAZE+ zp1<=uY}{W&4Gg}oj{-OtYH$*nqJEvfNY&7`RdaCw;Sj^7X3YLW$X#RjYSy@c-XPJM zr13&w`frku8j5DT815^nZNt8jN5`4Dwhrc07xD?5bjps}*}zVho+dF|H#Q-%>S;~F!keen4dS&vL`nukeo`o`sQC&w|J^A~ebDXaOqTmXSIdRBU$)?!_&iKJXVo9DG&)_pOySAH6B@@XlppZ!L9f4v1E zP#qNDLQf1$$hSm7;eB5{4o}2H5CA`_9a8Tk*QUqKJ?}4;F7GbDd}*dUatsR*7+_|; zyoBV#1TZ{W_m$e!DK#Bc z=6Bld)2C0D!-o&+hoP2U2^cszIpugD)|W8YFsAi3P(}%De*XTOa^$$5N_kB#mqGn{ zsW71-1N+J^zb#OilStJ0{8Nvq8}G4Gr)BP+e%HRP<|e^mVJhvt1@vwRgcL0r|jDPpYg>TX*giN3_8UhVw|6=M^4bCH-a2_ApBJ%G5X>EPK= zoB$>zf=n|0nm=Ox8;S3|;2*HXZ^tM`4U~`uXvnsC{(F#*)-9nL3tx>HKv%xfBCd#~ zd)c>N_?{X|mWBpGkN>BLa_ayWHxNhuf*4KB( zZ_yVD6rh)mCCG~DS;o}dhJ?E;%=t3}nfF4EZb3O$MfmOoPaS%fc=s47fxA}-LpK|b zdQ*9QFrz6S1GCgA{VnN^VD(xAP1x5hn>U{ro$1-&#JK0$#w z7uNj(GUp$KWW$94u1vE%aT1Z?kfQPC;KrO^bH@EuPy;$&`9coNnAW?Hipr;$wwg_Y zMiYh=Onh;i{O_}G8_c0n3WhHEi>G`gJuO-CMg)?ZWG);oc z`0&6=1K1Q*2`=@TiANi7WPn|t+2Yb*z>A)=xw!BRc`6_(T;dvjgII9~A+MR0lX7cy zKt_+MAXW%VaYgEFf+NfZm!A*>Ca}qDZUl5g!y}|NuE7r`zAUeQ@R{O`a&mA`r)R>t zCGeX;5+DPywXsp2dn6qu=t3c9)E_^8Cr?j!8+9L%u<&pR1PIG(o*r;k;F;1w8(E&- zI17HAD((9Wg%O=4JN6tFerFZu(ERsVQIz>R3oL#);87>-l{kpzPADr{+_g}~x z7z4o^W}#x+!{&f%&uJ^-x2HNZkfsWKYKKsbgO zejk(!@SqikV}{Qa{EcW~B6%r??eH4QYr%rXZeiqhX?RI~YF@+)W>&SDN^;O?BVbdQ zFQAH*u>}uICP4S&bKb*mU*~mC?voZ{Kgsbs;ysh$P<{(O1rN)Z99kgY!Vsvd$PlPK ze8Gz@n<4RDM1B6>8%hg_1t9ZLaF zy{+!KJ0}fCwyUAw1nQ)~veY!oo*DCycDi~e!R|kQ{iXmPrn~0KHI1dN z9ox!Rlit;reQ@YNIRt{^+yn|6y!i08HMjJ&NrSxm%5%T^ z=v~Eq!G2Cd`ry2To5Xx#%wWOHTIm&xEtxsgxadqW<8mkZ|# zM~9ma5P3Db^tTzZ8JXA4`d?Ha2mu@drwnN%fWx%il=(kTQDqwA+BFn<$!uQnhvLD} zt~(wlATn>px9VCZc{;P$5oGbIRYN+oX{A6`!+MwrF7V_YmDMibYM=sMTQ+Sh9on`s zET*=#z8D_p)ULJRN0n8M{n-n^=*qS0XwMd5m<^>Z_AyhICOg;trMR+kpFeD1e_69= z_8n!^(%PuV?=(6IFq4R~M(dtPd!3*_GhQ#*SWJ%YYx9i;)EFMrjv4OLwn^wq1Hb_Z z1n7B44?!B4+*g`HqukWVL*I^>Scn}`lY4f&U&2lv6pNHd11MaL; zx?A?GwEI|^<2TlQQv(b&sMF^$mHj3e9#kD(`&G^@|4vB2+4AYh`ZU2INB@gK9=zp9z&j(jyLr9HULA`%7cKb 
z>@<{jd^kk?{#hAAN=WKzD1}YKOYRUF%{0)+1=4i%|Fd@$@KK!2d*Z~6;3RIONy-=gH6fN%V7F+V?)~xQX>bh+2U3D#6(Mm6*)J`9+Nez5Wnb3*x6t3eC^1!(dSLb-q zP|nC*Z4`eBIVW5(X7^#u8D{dE`^vi+2tw2i1+a*Qpj=$G`T6znVtD}noN^Ae$3h1e z_P)Z0x}3y##bK=E^EA9>?(gdF=~PqQdbr@ydB5QfEQ*Y5huW zssBRt_`omC-IU806D~jn-UUsX?t5`ZE5Il{X@4qTK zH4xI3fO}X&@Cu$8qis58St9AHUI4wqbN1R#Hv8U2$8$c| z*{z&&bN_rJl6vL2d1X6^*(N1J2w3EEt}~gpbU(fpbI@jaq?hWW(T~as<%pi4AVspz zc6%L`?DLL>E22}$#h9G(%BIini}KGiIh*O$*#AVBC{s6<XDU!2V?+Ijm8<5@2?`x|AeI#9Jj3XFcoGao!| z_<4|xm8e}89hrJ2g+6;**8D)-$%7Wgx&YfG30I~ z-w$%3%gO|qL)xs2*IxA`rOyTuM)%~0c+>?x=R}XI*<+tK-(huv)=D$t4^n|rNff`k13!)v zVD}DSpBf8d_8ifI%3z(dAiYj25fXf zr76n-9jW^!b~A2@D)eMVr`bDqkotmy8fofk74{e0Gy5m9DtN}G~1Rk+rJ&nobC zRla;*;7FVxY_2~Z+{fKJ(mzN&J7I5pOGN9&GhgxF)KlqN-WB#rKRe{wA`e!iu(0|3 z#6HJHLAXMZiF<=@*YRV>Yp~WHUMo0;a7c70)!>4FgogZuh8~GQ8FD*@x4~Jx(CS1JnC?s7ctAog+~5Zt^F47OMYvDD2C!cl^;zS`$xN8#`6i=%H#nQ zf{hcMK*CWWX9UAlxYTIsS@A6T3pC?3Z)d?j7Sh2dWBsdWGlYW*QTwwi52mlw&ZuZ< zf!TEJVi_o=AV807*N3)hHOgd&%`JTjDkzPhePV_S4h|GfX8@R62(%oO2@xy<;F%ZR z@-w=Bvsi$tNo)~7z~t^jbqC0Bc`DL-Nbp5w{l>0TK~sb(XMh~`i3@miM0D~*qQ57k zoetw5xrA+sLbdihp54ZT5X*tIaTFDVHYWbt*hK+#bzZo zocT^wmVTxIg%M~K`yod!;it^$mZ%^A@8!aw-^xnc9x&chmpG|B{ksDLL*G@P3cp8d zloB#haF;Zoc~?rpO{UM|hZY*vToZ%P)cNdO5yqmuiI{4c>)-#d2?95}+0o)fHGrRg zsI#B~B5l4bsv=LkYxjpF(7|gH){H%s&&CAkw8Yrd!ce_Rp_J*9l9m36*s`> z;TGwLQdT>`L_ zp)erC1kp;JpyE-ffxzyp%UMi5czF2lm)nCXqp5UJr@(11Qa&TO`VZv@(>8~^DJYh8oHs^+_T3TB(RaS9ZO@^vKHi{V$C1Ne;gtUCQFQM($9N!V;KO&1r+QbcZ z?Nuo2WiUPlsCW3W*zPoRIPz$W_!ZD=Q9bN$nF){`jWrd4rr2gpE$40xSEqDEf0}6s zNXpT7B*oIdr^HwJjAu4G?v)cHRu0tVY^AQgTG#K1(T&6k=<71AmH@z$9zdcfhsszd z>ib}nZ_!bBEB+qu`fG^LsJe@dt&(HF*{P5BvGjET8yW%_7k=K2+9>z*7xSy7zL%I{N^-8ZFf9?U^OiRO^i=3E?%b1M1l}C-5n!> zMVN-xolv2HJfExJ^kb@6``au7L!$k){?58x={VDoIQR=Hf|uk`aDdQ)X7Mc+3SL;- z!HA~|4QFL@pKV&b;VXSwoku1WtJNrqeNE1IpwnDw7!71(cSwMsxG=5l%M&!Qb)GvW z(?c;lfqG_U4hhG^O~SC#5a;Wz^p8kyv`-JQ7apsDTpnQ_?gv^{t*+l0yHM!Vz)9uTDfKE8FFTm2#U+YD?0(qG#xLldNk_n0_y zGWkWz-LWdVb!(so zCyn4WBEF&P$8TChp^R&8$X)NuMzCHJ;503lk#5y|D3g8euhAf%wSrWsk4DJ>xzC9{ z7dT8>Hr&6iIXUoS<<=4!ypr?4x0w*>R+mgRiQslQKy^J?NZ@}uAjReLYFH)YB&~FJ zP=#$f)qQyz2X9-R1!(qUNh2h0S3Vqc#kc(l;3ImEa7qDfH>q_!AQa`GO z0l*Fp7n|govSD+j=PK}fV}FG7^Yg>N#+D!?B;4DCH!{|zF@mt{pjkp5%NFON?ZrX; zT|5oIGe$JBeL^iB1I;AY_>&f8^UMCkDRYV4p7}%155wN- z3Z({@c}sWV%F*R^eR^4P;`=9z#nhkz9pB--uI#xf@xsxWyV&7wggk#Da2gx~z(!krS zWVJsoDgMX+WbwGi_T|d|UVPOV`xuVz!%`w1!&t3x3+dUgYgIO&rojvreTCkkT=D1eHs29a!>mF{w!I7@>^!QFfI1}PJQ^}swK zm8*TKJRuBd)@YJ-?#-uoHDCFZ8?s!Y02O(H{ABcJs>gmI=}pwY9duaAxDM-Cu#XKD zT*}yP*ei%2`GAYR_$?^2pg6kqi_0P6a8t|GAG{D3Ti-ta?8U-4 zJw!+^{zwQ3cPdEmqyzBUBi#Dq=XB)IS`^X#R(P8qt4hGd8`bWEKC;sACtoycM@?)$ z(z8K*TnPNR4FkcghwD1nk`Q%gen^kceIGTiE?;>CDEno#PtYdV&3Tf4kB^4EJPL8f z_tEpc24$@a*$Sc61PiVr2a|=oEUsEp#<0ye#Bq&MtudYEN+gysQhp~lN48cVRyw7k zY!5MGXb3La-Lsk4Gg&DSni7ngSP}$5vs)@O>f9!?F3df<>Y=qvp)npVoRlVjV6~hU z?YHIS zNX03(RI5XYeWU^SmqSMO_&MIN^X(j310LmG^u%%%lkq+)AU0mJ0BH6lx#RE17@$ zpmGw*E-?o7?Z`1Eaca74E0#yC)+v92=e79!1&%^BSJcZAA5FMj$4mh$Z~ zd4P$z`3x+Pk6=PVf*2fL_zxIpXeB#4yV_S*@CM+oc(d7&f|$e`TSpWm3qwaUHTw?E z3_!(f6AA}eQF(4}_io@@IF;L=w6kYBG;@e*1^^$V_E={YaipG3oCwYP_sDdhDI-S+ zD`xRJy%Q8d(9>Lu$=70X=x=b}=5{bqwF}d$C?Wq~LR5Dz&E`EbW=u%{I0vv%InD1Y zie|T;)4PY6ykk*)CFt#d4%#)a@@Y825IGLc2%kVV|(w4OnlRuVI zhST$k3DWZp<@4!URTNcAqJmZ+rLxg=<4`+)6az5Az&1TW|4msp!Kw`Hv$iDCyzkt*grG5M$}RSl?(kl-@-=MtaHMCuvKK zcD*Y43L(^cXf-mc5vShi0JPRs_tRu){qowZK0`W-L} zgt{W2m%%`CO}-%CCe-dckx1>Rr2R$W$p?$<0<57?NFFk{Q^lg4zXo#+jObXLn*oG zvP?$+`O;XBU(!CIoW>t^e6e9bEC4+MRK`((tlGc7aQ(a-z0WF)*iFw%}w1Ezf9Oe@i5dHSN>N1 z8)DCgI(cANp7#Tcvb?r(LMwPu3v0j@5h;iU1*(yfmi}%zQ~W!gSfo@e3QG)LvkJW2 
z?%x)3;d$2b1|8spYHbwKFay6u22d52XB5WTE*x83Gsc@~OX4HgX5Fw*-20vMxp;XS zNn(PcqYk!!GHRcN%U8x+)x)4cc7;wB>EuiwJ2F;ey@}=Mi{mLgMjbTK+sz^SxC$fs zL}$TS^@a}=FkP?uS}~)B;|pJgNU1CovS{??F6?*C%M1F9gZ12y1H_G+KU3XkpT5)~ zg-twU8F-T04@-t)p5vgwn~f8$+*gdpL^xdLQ2`v`cVxA3rLAH<{cd$XH+?Pm97*^F zWIN+IdVy@+{Vvg+5^}$dCbKln=ynGUyU-;^VGD|>CVtDNH8TQu;Ux<$|o^m!aMXjoFTZw--?b5P+@{H#q zTKH-!D7N}drTwKoX)@7WlN1guwsnh;`1go@y>ndje*B=XfBGjp*Op_o2or|}F^i}z zX!TR|4t&0Tv!58qBI}Ra$KO2{_x9c|S)@VZPHK%xfXtQ_&TpKKBy!o@jheloS@Cnv z-7A*8y)X_i*asnCxCEOS^5T1sYO`_v0a0%;S>Xu2`Y&R&?*<8ST%VMGm05D}RT_nF zHzAgNiou^yR@1_T;liIz9t#=+nji+jJh6CaHkbnR02{P&7hugZ8{1ZVG493Q`kaSL zCW#^e>cs^d@Wo;PD#;rfDq8CuMxRqE%w00wX!t;g+`Xg^>Dp@eYZra~LAx@7A(|3{$a7JVX*{)fSoxml99NdaAo*o*Ol1YAWE^HW3Wi2fCONggN@G@(nQ@?6EkZAw0GN0qjGDzbBWP1BWvk zg}2BY4=^PEn9R-hx>y`CY_%d2yRZb#6w;w{?kc2Z`rSb#jiJJ^hj1>!2eYoJ-BB6M+_)CK9|s%l)n)rh>xvXfS5je3DyMTDhhwpr zSI=0&znbn0DEGfp4+d70!@gxu=v^39e;arm43`+c#O%6`N|Z}>dqO49eZ}l)JA*@~hz|=3dvt;Or@#=J@k;v;f2Iya4S;CZx-+lFkQF5zYc%W6rJem1 z$d8=F<tC069qc zJQqQElWxQ$=rK79Kh3%aBg(A-RsIR$B|Y>kE1xfHlG%?jcNGaGtMSTssj$S9#2Yi?~okq@=I?gwIVb!jv3qZ_s=VkF-H{^09q{H)U_?lEW|bWT#_MWw1aF;QDVy< zx10KVnmrk~Uxo>G)HuEg!>;#TQ;x2tw6=pQPpT9)1Z@5#gG4)E1r%s}Z<45pUttpH z{Y=xCgl%E0RdEWXN33Z(F@ecvlSMmkIiIWE&MV%YyoQDH)Hk<8N7lGP)1Ye+L&ek{RHt6Q|8I$HRe!z<@1E#2G~zGJnbT>B95; zmlkeE|EN2PclXY&iPCFTT84ovk=|XDDAvQQ-CrE4K+-pr23*iw+o!V0a@(e^Bk;eh z1Wt01V19jj%i@cY%PE3;W}l8dH^|2-AWX3MDDf1-v`H9P_lD0M+YkL`QCe)lLi_y4 zhq}=oTy3rq-#&_~<*t86VMoEvN2S7-H*h)uv|q_c4}3!;T(iNM-VNgXM7$AHL*YLmyFF1gZ1p!kVnG!gLYuC@EmOa)V}9tY z`HdkAZvEE)j1Pe?q$}{WD_6llmxY6hP=|`Wv*p!US z@KhC^+;bR%0DzJzaZD;g(bRvQ&>S&l9;`7Q^*4#x#3mO=9Dqv?;{1!q{&OiR)Lo7hiigg)x4J9w(IJnZUh(JfPd zxDjMe8D}8c6EB9Ya-()XEOVOplwISGUg>x2HeyZ!X9f};mS%hX0z%R)!)F4c8g7Mp zbOm77bB87f2(FB$1}J?~iNW^R28yFY_6!f+4V&owDDihts>GqNc+n&pyfU-*jjjU2 zc7o#~A8?h8UMLXJ5y5SZvqNE#T^v54*d%z1fngBP;YFmDGID<$-6RjngcWAwl)QW` zQsO|D(KU(v(T?HkU${vvHo#!?O>GU<-1S4gqTfr?i!f7My!g)grW1Es=D13%|&FSq)C`UxsBfqo+HLSI9c24 zZ#wX7Oo5ca2gxGRMeAN9}O** z#q>O@GfaXw(0J{pN>*7Yq$E{8ct3J{{q5^+|NiMYhX3^GQI6Oem3j#b2#783cV{t> zWr%1!>--)hCRP;`{6NeLPXUrsR78y-63EeRveB~f4z+fz4RI*}18J&UkbtsYJX_h> zwi+Is3Oa~SgHr>JyU}Q7#D<-lf8eWhG&R%yW?tWgx?_Iz5HFbn?t4OTzJ`bcFjKD| z^Lv2S@t1vMs4FpW-w1c+f!c82s9Ut#A+1#89Xh1Zv#W!Mt&$=G`T*&@Lj?5trwCq9 zb^A~XceHVQ)QKFE#FN}|R4wyJJM(6Wjb1&W@z{+cLV3gp=TEg1dWhx*q3)5JIKhf# zl?MFdP-5(2h@E9MEV5TF*%wTkZM-0<1WX2lFVCn!P;sTI$T&f42Y%$=o>xuRXwX*{5ydP4Fa=GEtu6ZzS=g#?(4BH6%)Y8Synr68@3cGmu_ z{g*c=CJlISMrBGj^n5W!(dRhC=f~;iMtytfQ>;8fJNQeGo`ded)CDAzGUp}aS`!`? 
z#_I2~{G~zlH{$z_?x7~}BsG48MgH6)ME^vef;J6LgA?##Z`PNEf0&DD(CBaPsGBVN zxry8P!SS>rxqheDKl{vIgADn?*Z1q`x(n!6sE}KC)IefQphk3p6~R0!bL(DxiS5AR zHpTbC4@tq-onLDYsw08PREuzN-HZ?z8khz!#x?0>W zGJ}%rAGhG(1kP{$*B7ak$(Mt?(%VVO3mswc^jHq%??PJ=wAU~E$qAJb_8a;K=<^_9 z-yVb9CRr^fT|5s&%I7^xwWuL*zHf1=XfRv12P`nrCM6{UF>uKch6v-?7*V=_YwD#+ zI7@Y3LYkO6T9{%(BpH~Ma8on&N}ZjPMT$2nAiz7up;Ha|u zKIKMM)CnJ3xi2HQJeup2^{wur7%%YdI-- z;-nJ9MBL;ZwxkhL!MW~lH3NvOHmESBpP!JOT{#}J1${n8VE7z^RHR-*kqzjONkf0x z>CwK5hw6Eds+rTIgvEfBGDPV-F=~}snqR)lWn^3)A=cde51;X$X)y=J6IUG$lEtp$ z+*)Tx1{44`V%jbp_70fDxyS?p#j%8$Y;o$0TI&^CO(CIZ+b%({BO{C!p2;S$h5;AK z2Z-AIylJZTre}j6+Z_oJQ4o*s2yjYnndXiy20_7@o*JgPD$aduJ`jgT$8Pw-B^(T7 z2OTyqrHr)0pS6uAR!nSx?sqRE#H#qr(81=ieK~!4<Vw{Cy@!DrJ_;au$YDl)3ztIzkxHQDXIzi#)HcqsA{YN zzT>yg$9^YNu(ie@hMuyI!UIDlmYV8UT3VuJwA{|J(~-HF*VsrNyY^V3^quRRj7KB2 zS!QqK$gkW3mIts7&DW=9BWz0*M?X>7!;lHS#}e>NOAz^r7D!;LB{ZGn72h_Aill)( z(SK(0KbJrRScrBi2};$cjDjme%t;iT(8oX4B(2yU&mAX^)}>rgF=kgHe-g+ARK!=9 z(142D7xZFcZ%l58D^U7ES8Yp~!QU!MRv$HYcaxPePVxgbzjw znVr0Di_A?vDEBg4?5#B0fx9bg1S?xw7}17tu5y4hnbfj?xM!vx4pX2#^@lAG@r zUr;T$)DLtF5oO*vvS!dTS@{zli+{19;5iK8o1im~;b*!B!hW^4@_kMEi_3}~m6xZicD2)?40glpLkd@1d|d!j$*R?(MY9zcqd^7{Y$e zcqg|sn2Q}xE8c1~0R_a#(vE(L3y0N-U2N7@xq=6v#o^e5h05$duVL1~9NXIYiKn8} z!yr$GT>qlgDguWn#Ub3}L(V(8rOVgR6S?cO6`GX=Z}JLP`^#|0-J9#0JNI_k<23@B z9l(uad?5Ti+6s$JMigFBjjTn}&r7PM=Rw%44r=k6Yt+4zkxe~Rk=W&*%HW&;`KK6E zO2~(;T5vmWu37tdsfUVzi+%O*uV?NoZ@PF!rk}0VaO?btMUvArhy&?Ba{Pl0#J;NP zsYRSqk9vO^*7l9R>wul|e=iRR2+;g#&Sk1679(Q<4m|$U$5D?Yy z0;AhxA;sVXqnG?DQO6XWa8EI}F~4(_2}1**5~i>=f=aFScPHJGwe1jopj#C{7PC!sY>5D!rUc zp`h1Hi&gJ?r_}s`S(}H~pmq5|Kl&P%0VBFF$JY6)qzmROh48H>r^tiiv~^RA8BZKA zBw{A3^`-Pr`j>Q3KDuvAjlxG69YqyCZs05EcBlXvsD$8BCkg{%aUoLbSE?@2kdDG! zlh8LenU6iN)5Uz=D|v{Z=*kF=t|vLmDLe%7VDzVOW5ZKB|6yIV4tf#Gg$BMoSK8Tg zw-DLG+7+fi?A|7sBaSNDc+KB^L>tcwjub(pJKACgYOag|0s@gy zQKTp+=!YTvDb!@rGhP~DGo`-&aM%CqJ5v(|4$0URfb&>{45#SfOk#Tm77oZjWtreR zk!A`067JKd>&!ci2ium28m>N)lEjqIb(7|fTP`jr`*AQukK3-jV)#rlU_Zu$!^>I%mp1tMVx$7d>zQ*Ku z(f{E--LGof%+(fNBPENYs%!&=<@?+SY~cU3K^a6nMQuy zYoUZvyG0z}$54}b+Fly-W4R}JuFqm9P(V)fv zS1T+zd#UD+%Ihka11`VGTzRe-{LW!02zogY3);rFLfsMSCV-%VmfEHA4;gN#gz9PQ zS3267U^-aVl;Jo6pV|Iwh?oXwZAj=x!4UisRQnA4b=z%`ta^x%&m>_83eTwE>t`4o&8 zdxV=@paWI!m#2;k*lZJ|^uDOhFCG;@XH4d~AXKX1joKu)qYrF`Xlm`Y9AoMN(Y{65 zq&moBsE2H^g9}EFp2arons-%t;VDs3!``rj+<$(xLkf@UK@9Jn?HcO~q~+3#{)%Ba zyg;T>0k`hDb_CXsc~Q5sL*cAcjT%;u$ zrj?0-V2Kw8;tCnfQY#;G$SRui;mrVaaxd3bWu>|y!8NS+yFqS*PEIP7sNf~j>do!a z5Pdr(eCo~8l(mQe_S4UrcBTWq&EB>zv*tom7ZN4fIc9obfC~x&W&MMEN~ZkK6x3BA zYiO7dZ>b0#bfGNsd*}N7GpsVSusuE-NJ;1b8@=5vet>YRGZvLy_*>0BVuKT6)Z??W z6jk&XTPloR%os>>R1TTWd)^M%k0IPmdpd@EC%f51Tm|2=`^}Csth>I&k%CzWMuq`k zDN?#vJW0rAj@-+S>LJM%6D^L~uJ*$R%8JQIJbkvD?!T!@7)gPU8q4Xzi11|(FRbdR z(|_cP1cjv5>o_t)1$MR$zgf~Xd3`K0z5}TELj<~S!K3aI6#P5xsdI!^ezr5%bOXeD zsL{)#Iub>FC`fq8nAduNO+=PHN+z><(3Rm&G6cINrRfK|%s2cH)^lpID4sBNFvA0g z2u|)GUyLQv$DD>eY~Wi=qD=}BOMu6PENr6nJLjGQaZTjV#*jr}4Edd|8~%T10p#RS z*m4nExJl_{rC>;a>m_=ea?}wMONbbqnwB#k?hZ@hW=!c?9V{>gQZO-d79dCkiuer+ zxawtrGBN@NvHQM#Hs{6d;pTHDub^yy|J;@@lxjN_ULdlsryvL_w4Qqyaj-<`(J5C(3qm=vujR8OP@hDhtobz)LUUkz^r@VvK(CPP{ z^LKp(!7R%a3F?4wFOqj$a`GQZotUt@)OkMoKhg)E7>J>(>7qXSBo$Wk_mwqVZXlBs zFrj~@F~A}1moY@wk-%Jq$yWghN;=9nD4E3PP*DbLFV z=O^kz5b^RrE$)@qZyxQ2yq;P(do}TUq!fVK`nQisk~OgAGFDUD;AAp+*8;@o_mNh&>$-4>C5YZM|kvwmdJ*`KY;q6g0(0wg^?GC4%D zCw`dLC!>$*TwmyOs} z1U3VM5&4>wQPYa6j&z#}9Z6oj||re9bqo5^uoEB;REf0S4M z`=W{jDaX*3G`$do7>Z~M4&U*+;XNEO>hu+YlVz>TAZaE117>HqJo1rJdzrV11R_*yN9y~U32$B`Z-Ld}P>1CJC;`wuE$fGN^cUL1qU zc$FE{vaN}Mi-o@$0X4v_%zQq#mR># zOmQy1hTT^yt{j!9K}UJr_O_C_+4EoeXU{AIO#cwhg6ZUOzTeRQPniF|iBu^gl;B*2 
z@4bHHdi|X47pMl5@mzUZyEh^+#Ma;KtC}B_>NmH;4t4co!zCpQT)R#nae~JzLbDe; zqrtFlEcfa?@}FZ~Rd&4Rxax1@-^9filaAktb?1HsF?wpza*(EgWmptpgxZX&x?;8z zX5p^&WXG)5m!I8i9;>rr{kXs^BHQAQ`iGFOC_Rh@k2k$^VD^&#Y*JyCs)Ju;Jm4wmXf^Jr$+qO&)}o>`v0to3WAY` z$vVlOshh+YV;RNAZk?be$Iv=n3hVZuBgN%qeZ%pb<3hW5cDQGbl-BdzBTZ-`-G6kI z;H?{7@Ml5l|08bw$LRlS0GKg?Cv(4)O%+uoq@?~PRt|; z-7@*>xOmul7WAsP+CBf$>0a;m-XohQCS~|A&>{=!;5=)(O6U_YMn!`!gT_AfgCSf5 z_0Wml*FWP`ev;E(X_|@%|E~D|dgn;N6yo>{hi`RBlhV)m<82ww5OvTX7(ix&3hdZ& zCH9tE1<-%4EwHFwJle>bV%3uN+rC8kjbf;7`~8;g z2VrUDd9!93VUu*|h@5|x8f_1a*Qa_CAowADkE&>E#XPEMX~BZJYiYrtQklcN*Y4aC zS>;7|#{36oJ2F}egMJSkr}SHgxn`GmvZL1>M))tR)j~fW0ST$dea$nh)#XsxCM|x> z`_i9*rYZWbF7m%i)&KbwST6QKpEWR{xM6LofUl2`Ws&N#;e9f0oB$#A$V7iY_}ZGV zc;dcGCMzhyX*$XB-2Qy@EkK-ZY`0zHNTq%gMe^tZ-P+8VbJnh2KIUiqZtu^al%RO( zQ9V;F_TFrV3L@{T7L0h2D#D!En=5a!k8|K0-ATRTw&GWk#O{%|7Pn^LV6t@Ip0M0; zhyT0ZH;!tHlN`F}I07DVf%g}d@pPtOx03~>LO`qpAEr@RSy_YS48_RMP-*y4LoB>F zkcm9jjc7@dYfLrG#M+%UkF?}2RdtN&Sz&-6)$e;at_jvIr+}0J%N=dm9OsF!~q$bUHqFyHa3ps3`pS14!BaC|15wFTt zOC|YI8n-s_Bsa4L@Iy~D5Wd_Mm`=vTD^?cicX0d7))PTFZ$6Hroz!k0rZl0He3A>7z6}#RZ{n>?nE6ATTy%8 z4-XGnopvO>ZhlLQFOSsM_`P}^{rQ1?x@`VQfm%VYh>M+$E+&=1Kt4RvIaN&@Z{ny; z>3rhj@j+j1PT}m0sBzLb7ol>=q+hU~)%`i_`IA9W?g35l!;YpohgQ{-Ib8;| z2YHEF^k=P~+}r)ZlVtcMr%cva3&OjqF~{D=ohix6V~<~ZZ&@5GhMwKZeePIs=G zL!9ML^}1#kH-i5fQ41xSgDd(jIu4JCTr1xT7c{6Ds~X1!6ej%;a9nl_iW~k#EjwJa zv9L!E z@+Z5$kI*?EJo~Q|r=7jW27Nrj&{-YOi=v_}L}RPcVJ)s-2&T3$=Px-L-L~95Bfl9p zu%DiB_ZPagmQYnZ#WWbNUHHt6=?-dbB2>Lh(I~+6*Sn#jTYtgs;Jci(n*A~DzPoXm z&HA_;b{Kf9Uo=RILjO{u65>avbZ=OLb@#|@Z+lf1R$?1 z{=b}JU?OV3gPqwl&hOG%iO^(K*t`8*6Dc#wTy8Yg94~y!AFQT=wuQM|Bj0$0b~e!s zucwZd++F6XLdWw(!azRhT+YSgLO6IdG?n)kn|(#YTJ>f)a_E4fuD7SCs`jf9cyjA0 zLGTQnCbBJb`98Z@Ugvn9^_0`OS+|kcWKA&-O9P zvDMmF|42Y$UFXbfOF+Lo6}EM^%rljOV}pW)b6tb_Qm28nEQ4$AIH-LlVsN?{SgVHTc7!v-q24@|GxhdqVobVOqr;QuqQ+*l zy)O{jp26T~*-^9BB*t{LX<+lfbyLuNM6EwzvX0ba#Iw=4i&6vj*6?3D087vXyH}Uv z*bKfUXPN{y#~8epI?byHyUTRotqFI!_JT)STGM(*;4AW;Ks+)!ityv>77cV98mPy_ z6!TqCM+4vsRR@ZF&DJ)hLG`oH;5MH&0Ozq)I&zY+WF zAN($8aNHIzS1Caz;53x>+&ep?-&CTUuQm{k8g)8|GeW}hK+VrZ= zI}g0E*m5fJ?B^brv%BKnyuU!n>9a=C7j#lmnAoB>tM2BEw@R5ub@5Q9%1)A;l)67CWU7e6j z>uLEHD%uvKgw~O^I-8kX&|!Z@FvbR+@I3O*+*ZclCJToT#trFxKTzRQdyNb`*C1vY zE2vgEp2(@LPS=wtqn_bc-ZyF}qJPBbbuhRVFnbvBdJYbX&oGCXe?01lo7`Qe>15g*H~2J=!9 zB;9zq2=VO?lBKL4e_$Q*c!8Eo@$a_+o!zCA-)@vKojVXt=3C>xkC%08W1xNRuqz%j zP?^)Rz(*Uc1OAJ`h)-E~?YF%Txqdju(#30e^J;2~wrhDoqnLCSOM}Ce zs!4JT8ycO_Np-Z{$gJt;4emH1d%uGk^4Xju;pZhqxcw->cp= zP$|KBJ(&2BOMls_*Fd8I`Uryx;nptC(?``-lw zHp;(yMnZ6!z%n_Epb^<)q@d0EN&W)x=47P~uUpXA#ob*ou}Elo8$x9!kw5q#T)zjS za=kY8j<=^{R<{#Pswm;w+FFt|@6*P4c*{Tsiozjo=e_7zh+at7_q%2LEivP&7DJ=k zqd%-R3yjx$U~+532^^xvAp5{=vI}g2i;;#1tUqpCbV3Y&>@`_#vSqekX<=IRdp=dh zFN6(GI<9Jq&GFpBAr9w%Z@(}aY?VL0weNfsA6Nyq)h4b0M!yV^#wp6n$D&GRWMn9t zfBl-j2T#Z>C<wbW>h zttN^mo>@wS-k&Bk0>L9emr4}w|Lq+j39pj!3vm?x%i}wl4ID}$kY8}&BwR~I#m_ef z_fJeB>;?W^=cDf>M!g2Ka75mt)6I=vYr{SghEi8ZJZi}VA=PwDoH;K!EoFF)oOkMU zob!5-JpR+>u96Jll+tI>kKrGzc`{wJk5+swKK2BE08M0JuER^Qo-JYj64{lZK&c@r zjAal|oz%JI^S#|azIm`E($vt{bB$LA$}WlBC%UED=g1knKYokG=|_8dyPSM4Vx^#f zac@h7%$U+@v7-=}&Eyxc;LI6x$|5>(uc4Qj3AHU;>1||r=;05O`k_nA@MbcNs80i| zc5-&MSe*#@Bp!S_C)YMA!Y`^PWTvS@5k;OW$4$NuRZ$d0o^Hg!UudV z>@l3sN0I|2o zC8#*-6B)G2_IP(to>#68ymtZdhQ)QxyWB~TFDQlKMnF~v+pk%3Y@Aeob@=G z*gx2a&}&E>W{jt&BW=StY$Ioy$=4#n<$=$wwD^fE&zMmk)Ys+opk2eND{$=oO0z=* zAVg*3G*7tIxiYuJS?#$mNvuLZEp4Gd;+X(#kI?xeNbRYkdh4y26F`02<*H#rzJW8r z8(TB~Nyba?(=UeUK=*$oAb*RR8qw~{FGl?EmK`bz15@4ed(&Wr7UeFdx3jJz%C5Qj zbbiRX(@2e*YV$TQ?EYsxWjtT-7bHODV!b)KZ~`N&*2ifWnAHrhraJ<5!g(^Iolr1V 
z@Okg%aK?<`|Iqc7VR0;5*TEfz;I6^lox$DRgS$Hf*Wd{R3GVLh?ykYzCAdTQ=A3)Z zxi|NI@2`29X?nW5s&=i~d#zP?2_g4`Bj0HIbs$}Fq4jXF`J@rg-y0L{Un~mgf#xG} zp+RU!Z5*V=`R?O98)vhWs)usN=)h(CrPoi+N+p(7gMeq{{35Cw(hFKYz3l3rr$F zY`_y3^H7-StG%ED;x6T--AY?7Gr71NgJxu!ngNU14a#te)r$fTfi7!ETs)5lHeuI} zj8`70r?3bZSmDqr0+3&TwH7gk{rXlnb6%?yLAwtt8lgDO*qZLz5!@a0r7;UtaJ@~( zPM9m-VW2-V;T5oFu{Nqq^-|ma0OhpnlE7>-s+ZD@wYyGQGNBY^mHv4PDIlzu6^?Qj zOrIn1nkXFo@S@8ZR-gIe%G3HuB*z<~20pM7do~!4QtA2vd1hJtr^b74Lrq-R^z~Cy zr&$cNqpoU#8HLf;i?M)4mDD0aVOakvuxT)eB`^pQv&gD1=`WL*>j33Y29&<^%eY^l zVac*&*i!k@@qn1wpbOf=e|5R*JMf|g>7w~?@f|AMF=(~>5Vl`!#ttytO|dldGw8pidN4<~?|WJ; z7e$|LcslTxZ!}B7;@MFBeB#isFbDI=Xg!9ZV7HphZTzmxmZMN9-VDenj-n-P-D}tVQX9h3^M1%YrSFK29-I<;NFd7n{0qJC;*fZ$ zAF=qkkpga^!CD?sAx-x4T+pgL4)*)D4t!<@zmik^XPpMF#h2xvq=Ak>!pP;#&XIe;=Wz&{}1WwFh(mM%y)fPmX657zK{ z&3!{kZ?Vy0n)K@Sw%_P8v=Idplj0DZadTf#89Z^q==gZ_ulvrolzVbY%2X9MV|>^R ze|I@8L`e8HB>=dorAQbu1}T(iLTs#$(HIwWwnFm9E|{GL(~EUxN!B3$z<0G|`xbab zrF=Pq9$mr@u!Nhu)trz+=zNl-2|MUvj$J3<>xjhGHcP0;M*&))9g8&X&lhiKHPFwS z-4G&pG1`(qy_R1yaQoY^Fg!jByZKO8&<&pFPR@sj%H+N8AB1oTvG&;c;&z0vP@3aA znQtLbvY0%^yNqJ&&``dZof;~qZGpYEWTQx-dKk711&(<D!cGYNY!=WW+Ji29-J1$&Wg{@*2psCgVT)x0h}j&c=U+i&GqM4Fuo<94^F# zh86`V$c&Z{4vv4hOR|SLS!5ho%lAPNQvGPo%3o8p1Z$aO|{A z>S|(RhQ_toL?)c}>uoIDYE=d$fGaI#duEchYO^2FkT_Pu4QgHw-847l+|vs$WD$p> zrq#L#V1{(?)3F*mn{l&l)Q$^Lw%#vdHvlrnTk zvAJ$%%fpY|j(fJf`YZWM$S9D#i2T^buS3N?H5ML_AN%!jkH$`2j5cs~rmpvi0}+GF z#VzT(82e>o=zhL)4kKit1+xn-~u<2e8Xwo-gHQLO5W`SNW+MyD>=mnG*W8Z{0X+1QuEs*A* zwKLiA(3_591}_j3+w=Zm^`g2w_15qoxl zO#vU9e}07_IEKFnj8WW((ZHZaX+xw1g_n`V)ZS7kvozXVd{}vGSc#yj>b41>5osIJ zve<(9^0fv2x%nkGRI~ZvCiXKyiZP$cn@37kJ%j@L=%lNqz==gE-HrpKf-(MGg#pNH z>30$ngB|3B4a#S92YiQ*0dR)1c~WSojn&A0)uFUZIrX0LenC|F1CZBQQq2ZF*(oZneI@oN+#AzpprK3j!ms4q}Nw&2}d5|#7fdIPR{qkSaba6H|rN_+@ zLR?=QK5T8 zT&J%U$&O9uaV3r8+8K<+BF&RYVyJUJ*GKIUgDwaU4_8KlvRU}fZoQjK$S<3XmtW$K zAku~98XO!9`#E4In-Zc2db`h*HMEk zoIKk1Nz)=>PGK>PLmCnI1jUi@X}?p)%o%GzYlUrV#X|SAP3Z<8Z2|~kJ4o>`di*xk zASMBa%fKlNAK=hPrc5Wg4`w*JF+LnV;Spj`7)(_li7d=Rk_pl=o9X+WNLJBTk@YlJ zSd%`f1qK$aY;#4a+qMau0Z_bo#R;CpF`5Q^;y~0VhR}l47IkX2_u)bJdbK_9Wz@cp z%&QjHM6O7oyPE;0HnwQZA|x|`306!}4>|-GXb?LB%tCBb04$*wOcSg%RQ(Pf_;tnu z+9p)2Pb$mY7ou+mte4QrrQWGBeL9Cj!|ijeJKwgYOaM$5C18e4_d{k+NrO;^#|==1 zwia0Wnq5BqiJe6xnGnH0wzP33QtHZ(jK>VWmVkL50(lGS%uZUm9`z?ouq8QeoVg?M z@D>8vGyd#bT5gaExe6d9(C>8v=}2HPn~k)Nr{}21HRRa)HnB1|Jdar7dNyUt zSbT0}!VVG}gfq00i>$9tU_J0Dxrmsw{QT{(mf*rYEaZLqKE%>YB(!r1g0JNLppq(C zN*RzYKsAs=k9`OSyDdmOmQaq>d?L^9*P}$84KdJ$U!U6MfG8Z-#7kJ(naSypHXPKo znWE0eIX4n)wBLR`Gm->bPNZ=McWB{>rhIYZqZ!%dhoZ+1stPcl0QpMo0rK=Eya%{w zN|02MFD;^AiU(v;KS=XvT%e$+B!3i#MH0><-hPHbm?2d`h72TXwq$94>)kPX500`3 z&v?ig(OpOc}G+-=*Wc2;eyG2kKL6t-Ea3y|$$z~>wByK^#>uq7n ztP+47f~xZyts4A5hCU6M?j{H-HWIaAb~ znDOV4nms|CAB}sy-lLKm>D?s8L+g;t#@IdU;?+Y9`6OqY6u~}1lLyP)_ZZD6IUAg{ zLpSbeKkN?W82xx&RdxHN{DZwob4?i%EUP=w6@I&2Dw;} zbM168_r6}xoxS#0$o!!Q_^ot_AQ&vO3e?xvufMKxytyIPV~52G-d8mba$u=5jnf-m zCPqd)$Mo1tm6X^o#ni9g3#zILkQa1F6(9d5SLAst0^K30Yrb&c8B(#20|^(& z041}U-#Kip8hNL-sVhnF(A(TkafV1mq89Z*G~N5JQ40AXWkYe~=O;h8ukT6X@=z1waPFvxvbXNKUhNMyTXHOMEX9!?}G zU-=P?uf|fK1N+yprz*n2G(MXkjdZUJ!(R7zxw1+Kp6n4wNO`Dp>edPV-hbwaz({NW zf#A?E+O+Ins)~Ox{dge^>?>9qMgOIIEs;ys6bJCXM{5Ia-hxkJ17zhi-4E4~I|J?| z#TIkwSuHO^H~Loci4ag?sbHx~!&^#q2#C#4abdQle&gEznbN0$0Xv)&o?VC|5h{KE zK14CSsECThCMz@N50PmEP)_7swh}C}Fjf4Qf|i^Cf3_{a&bKP24=gnB>g5vumB~$~JA-wW8;-nYP#avq+hx3Au?A7$rEtWMbWu_vdAQA1y#rk}QGqZy|LJf2hlT ze--~61%YtAN*vWlw@6*`?_=$^BcK?mf3PBmp0!UCQY93c|CcS;IIqheeoMbk4gWss zdW6NcY!`gF5OFrnF+{_~l|FB~M&8@s|L%VH2Ql*hx+~y3aTYFzT2O04$^UtdzaC^l zRW(BN&;R@9v#bD$Jq1<{Kh%E*zW=^KP?jjt|4-UK>EYs78uzT7&|3EY{gb4Mm#|Gu 
zO^P7d7>MPPLW?!M_44vb_^GI@jP4p)@Bh9NAgY7Fzw^C>`vYv`BZwgk|JU<_^qBkr z07@#TG$Uo-V7DgS8;X>hl|}4)-gRNKSk>KTM>kiYmkO%w8DsK*_ zMNa76iCxZF4_{qpXOgvtVgCqP7mP=w8gqXAIdyKNi}=)Dypg(}3eI?VU*Kzx`yaW-5?!}?HPmfp5+}zyzmLU1kz7Hrn z#MpkL?2ME9>Hg=MZQ?u+hJr4c7k!F=@e?=8R@Qx7&oj6Hk=GmF+jaH^LCBBrb-~F_ zfg3_)51&JRQ9nQJUkzO~>w*Y^pmXM)3-=C0I(gv*@t)R{_&a$)sZgQm4`vWCo?mt(hKx6>121BtERX#_8!}`2T zGF_O^BYx5I%z^r>4T7h3OI?KI6MN{_asLybAM2q#^dH~^agmWS(Re{_n*TMDil|}xK6%jY zkq>^2EYcJ-_Q;<(Issq9yK68DXsQ^`9XqDBv%JyUX8Lt_-; zHT`l_hw_E?*$Ha!dF+6wGB&S+6^WxXYk}zID}k1??{8B}2`V?!3JPf9`2Quq`un|P zfjDu7D(bs8O_8_(33Z}gC#9F@_^PQD?WcrJuC5L7Up2P7~ju>IBt&T93cZG z{x_okc{@}{;3ZMoxOH`>V$ogMXst)>=R}1v&oKqa zC#{$qhQ>`-bvm2%FkA7(1nOHpuyZP+v<;EH{r(lr<)^N|?U4tRt2JvLBKx8MdFIJ# zlQp%gE&r5mmzHe`UX|ves{@1&$F&wN_l>;OdR}{f#x|C>3%2aKzQE1?j~&aNXnV!@ z(DFz@;QfE+7$bo7Kw`r;3Z9x%tRtBu$M^_I5=}RzqYDNKw?t)ir`xCU^DoY0*u3_^XrMY^)Do?%_Hd*+oIYfwlt{gcO zGf*L&N=ZIA6O&v&qwgj^Z%!d>09<54I9D{p?y5C?%CuJ&zDgktj%Ho1nLmndO`N(} zvY)=~{aF|uI&OZ&q16#p!W=3Z?&Aic-8WUm+0?f*$Nr_mIsLf#1p+j zvQ{nUIVpWMd)L%Vn-N=}iTX-Lz>HZOQGi_S^3$%yn z(!kQqNzUZaf;Hh*T!F*M8ggYXEllvFc_*Jlquwo~tKIy@mhpdzLjRmU9I8nj7z8tG ztJK?Wtn|%jnjQN@Ly~W%qcekK>+{pXvmp>VK4`eRN3`4^YsvG~iNK~8h}h?LT_o>{dtpdTYs7rWoO)8GGh26& z=68(~)^W)P`MJQl&Yv;G@p*h=NJH;G{83CFd32NbAyF+g$-41a2&q+19^~ifVl~?7 zlDHx(P@T!w_DWsnsS(lfB43P~`+8%<>9l#k$W-N{bj3F?p&hR=uJj*=ZGW z@vwZ8Jpw-8FpM<6=RnT5 z6QR7990qm*aM^eD<+o17+Ie+J0&6IZP~NxcfKm#Bod$~PZUq1yA0B8w z=~LotO)FGSrd#F-y@NOVQ`tUYjZ+!DsYB!QK|BqIBK*rp>_jkE#`6n)7Wdt9312gY zGHB0=(0nau(ET9AV8T4oHi%)-g*+TO^1QqV* zI9du5x-Y$vZFACL7G@0z&8z&d>rEKr2yGdeOA0Vj-zYqA#5ioEzy!MM><&5;lHW^w zS&bd_+K{a0^U0&z*IL36UjEoi6j`V&UT}7^AmSBOP_+r|La`O>0S6W~mWEhDgxl?N zB1E6+QXtowB(A-l8mT;zKudA#&%hLprQ z#wc@w^YW;6BkwT0q0VI7MwM~LO0yB}_FUh?&NM9AkINlF{YBaw4M8mIWWi1pR38j2 z;A@>5Ky8uwpOHg{y7;Io=(J!lB4}F$6Wi@QO%WVcE9Nl_G4rU8$g~8JLG<5W}~{#H4F?D zy94^a5?{f!jI{<@ti|Jfchp$|YwB60ex6kEKa@kx&mX_oeQRv_d<*7fU%sIA~|t?*wrT? 
z+C&1+=^CHclT6=8848HTXH(yQD|?XNFKryBeQ#FpdqDaJ^(sJ50-pP#JN8Xt<*xi| z%|oNlo7I!;G`iPAL1UiJHL||P!OG`^)FiT0w~M!LvNvs4Q};R6K~;otEC@d)*oXpA zm?hJjY5lTN5CoX*2Yar@{icu0uMe~F^uICN?6`5B>dOAK%SafsP1_N~eyTkmQujJ- zxHs`Zjzu6*`qj}r*J7!0sVH=qOn6zTQ6~e+&=gAX0DK&4+EGhvBO6BD&~htPg-*R2aY$OUf3@su^6K$eJmUe4hB+V zxuZDU-Zc35&o4{~a4HIPDss?88wA&u!ak~hO$$ja#B%jM^odxhPef*TNxXW@e=Qz= zQ;zTEoZe^5M6D^n^j@fIGXwuXxF{Wl`6e2cQui7E6GRBdiSSB?k5&sZW758cU>70F z+f2aYhN4-I}ODM%P$joSHlrfnUIx5@vwA{$jb@`UnvyAEfTWW?d^K}Qu zdBs{yt49mIzLl504acc|Ps3Vgy8_&5<&NyKf0dA+`^!V9$C7_m#G#;PpvcnXM%D)$8RS2wugry~+4iY)!yBN{i3)|~K`PR)XUB^dfxhH18N=UvL`844w z;vz5+8~Xj=%nYmY;WEknG=at&5gG(xjdgEDKZ`-FZAVI30Oy#k2LA*h8EVZr-hwKa zkfEi}uX@YXLo7>+N#+`ybxNK4iaI*dQKje=<*o(H4DOt(w@4 z-x45jSncN(#9}7LM+;);pW)!9o9yaNAy7?<&x}30K-sVY+UIrY-lsM3aWbA{cOiV7 zvw&nUt)0CsIb#Wa3HRtv+OVVm;fso+#hl7VE-a_)lTZO*d8!Nh598*G-Zb}d6Qm2_ zR4((!gR8^BW}@)*IMc`NDhcjzX%VhwwAdfX%cAa-3TA!;>;Sic>VZ`*!I*{Yz2B#N z^ngf=fQ!@(V!^v#3d;|T9>YzKkX*70?^xq_oe;zB^3Dt-JS{(?-){cco>sz1c56C= zf^)R;8)eNLO2vNa3kAgt#%i=I zk9)N&h|YZoJt8HO$V8P8gauf(&|qK6x7AmT*0js!7p&eU8tUq`CFXf()M>WY1G9*Uz0Ywt zq$Gv!)`#C-evbNhp6`{Pc5^>HEWG|~-#Ev6{CUoM;Ep+c6-_kFv7kCUBBSE1u2zc$ z<(+G?o1vyp(YtsJ4dD|!Yr|U=0o@m?oIy>sTKLQaj}*aVjk={!%h0A(4pCj3)NTnM zc|CXaYe#1ey@mT;?>-G4EaG$c0m!-@GL|R~tiWxp4o0;JAQ^*^iezWg!k{-13G%yJ zy$L)fTkyH*rS(F4{duQBOs!8Z`BrVosv)1_)*kW;gd1Y?2hl31vt(pNm_p^3z z6yXI?&03NjBdRE|dx|%Yz^`vE3c~EXGKLdAQiT6BV$8Hw)#*C*m=U0}Vo-MME zMJVz%5~MJ?D|#}IYE_yyJR|xDbArJd#~-J!c-^JzO$cTX$F+QFSUEbR3IIStf(Zft z{jrQ0V5jjbU`C@&WC>3&ceDCOT~GuJ;GRg2=VDmXX>AfnRspqKRBfIXVau%`Or#b2wo3WAX63vem)kE;7UkITk z$RoM<%8rNKdaPOrMu~7r=&(!Br%jp(Yzt<7ad0^8&xcAdNRX@5tlV2JzE{##{|XPI z-QZiTjlzg#Q4NeYWwY)C%9v4F7wd8yx$$JQ8O#g#9+r8Piv|zi{s=$}AUTB&efmgj zStpJ-BfKJwRDR-gpb7LiA?7QY*#-{_Lx%leYd#ZGqIHT;1HC`OsqA-WB;=v4ZMGH! zxgs3cLQ?~MU6B2;Wm41La_Xz+*^Z#1rDwI1aohd!L6Ntrff}Md(4fpCez<&oRU<}= zjd_+!Eq-3zJbMFTl2Vf>9#M_hh7EJ*(9{Z-v8=Xly)JDG0X)#3m>3rJpAX;)DS$a0 z%HlD|et;2y+GA{fmk@BQ8fHa9OXWWJ{_60(ST*Mar&kqMy)!HN3?XX~{PytaOXb+w z=pa+?TPObJs3Mr*^nG&ck&r-fxX#(A$S&2VO3vZ>a$ zoz7abG39ynhb)Anz*a<}&1!H%S#L~%+y+-uqYsI_@x&iLxeW7K>524Z`iSJJ4Pgp5 zX36C`Qpxfc9|v|-6(5~x!Wc=6`TGZpayx$T=kk0A89hwlw9k#@I)%m@h8(P8{xPej z@{?mBPF2R&qS`QyFVfFL%l6*n%QosOCEn;U4gqwCsm2b{V27Q}vT^<@`3>Bb2;iMS zic?%~)fgNy$Q>>vJEq07H;uyg#tXp2G*^0SUzL`T9{hPGM~SumE&2Y(C5V%Pb2wrV z{XNx;e{4(B6(u(|=`B+1o1e*AD$R!hL|Z}vt9d{V^mp#59|1z9Zm%fQ3r?-i3S=|?Z0zeOG%&b$?=0my&4&x@RPJ8KeXfSCu6@EBdR z2nBpW>5yKd)R+1Gv76@u{Fn;XgfCR-F6`$K7I`k?_XYVJ7lI`1VWnEI=tRHnHp)Cz zvSR4Z>qyWMw0VtUPyr}-gV@xSDIf0yh7A=^gGFu+7h*$i2qg|`RH6(8iV7sk?+_Z? 
z5zdQQn1#qdR5UF64(d~!)mcQdWN~K#0KWVvkH`sE`GY2M=9AmF9~|`nDl)s@x{>$1 z*4n;VPD~8=C3BGKAts-muV!O-L2PTl9l6~mfFkGssey9=z;JhWFj-WlR%8r|#x8ju zEvw38k{kGu8PQOdQ}0S=gmH``G*xXuz~3cvRErXp9HYk3(^2zVad&BvccTE!{L^}$ zoXf=nM@00o+|Wcmea1?aY2xAv4dn^cZ{ZCT?6S@Pu!J0u?5f~YkDxL);PS&R2BCR9 ztK>?0?VdIKt|+CF>=oO*x}d9PJB_eb!hKF5k}~=&2S^iiD$lP)2~94|%s%ew0Rmb*ShSkaH^(TXS9ANfL~Q>aafbI)K7%Bzq&aQ?oSYvl4bZ{|CCYC*`$oc zEN?uZCr_zCUl5pJ?2;@Nd^x*6ur3y4_eqy=WDXg8xx7mrbqNwOZUhgSd3Dm)D=6dX zo}Ok5hPux1m7DAoXnGnVz=761# zT)k3I0d7cCr$K9+?{%!qVSD94sZ}<+^I6?9?wY96vZFjk$9R(Gi7J*QTOzebB&s!44tE2S z)WNLV93f@Qa!_{KoqKJOsU%nLEKAN()?;kitFoOlbJvsBa@kztkctDvg11swyw*=> z&NxRuO`ql#O}dIh}zmrLlP%nDGaUKpD z&f9jDQvDiV@Ko5CxzZ^=o)g7v^Z8rcwEJj7TTqY1`*UuMDu~e zf{C{+^8)y?5p$_xnQ_cVx^654Jk>$HZOBTH<}tXd-gH8z54%gf5p6ikA8Vu!qN_@1 z7VZNuV~UR&sy0M8UxPYyA2~kc6R8~aqu#Ok9)56nje^k<99$)uzR|pC+jC5AAiMtG zKq&wKYX2m3?+`D<639_DeICFeg`l}&p>jXroO2y`MQ6Dhp>TZwuXg?03&3rtTmj8D z#%2zp65zXRG@vWa9LYEvIdFt=K5uH)BOON=WbgYGdhNzfwU4wfqmPw0 z5Z3eI0q?BL5B3-!Zgy-zsOq|8 zx8B!u>0hX8YDGBV*eZyM>ygiqQEC7m23g+xwh+2!!S5;=JLXoPrk-JsR7k1du|Lq` zLKhv}N^2x$yXpGU(*iUVgI5fWu75@2j-QQuV{KZOBk~sE7am*|xLd$*EF)b$!+mdC zsTtoO#D#6BKNP-4^{G?}36w<9Dga!W9Os*8BW|hTa2Lbd3!=?C%(4_*=bvsT=_L1W z*_3hEeUTfj(-k=o!hDTkwcvlu3W%z06cldxb+;gLbEi^ziV<)7zb{=9a)c%RaL(`G z3kn)XMmBRouZdhEZDv-M5Ef^R_>vsJLJ&4QOpa^AulGM9)dTa!^d!RFOi=0g!Uznf)6%foe+D%Z^07 ztg&(_@zls4x5Ns8tcMxQWZC*@hQOtNm-jYDmKO~CImV8oz{=f)3pPZ=%8~g$G3%cQ z)`A<7Mysrm{P7*s4bBgzXvy(FAT1ejFz9kIQyG1;-l`P2$>9&so*`_`8hJeb^Z$PV z;lnIY4RTgUA~4CednI#qrU~oo2b~?>-CqY zRSFbURY6R!VwDTFcFA4sug{VOC|8JI4hlOknZ9)rT)CtD2xwy+;)SSyu)UffK1Y1g zj&q0hG@tqbUhwB}6Ee>;uZXZPcr*@cXk5v%1s;bDaG=ObkU>*LNy+domBUuM-D%Ms zM0GOT1Bvyl>(l0_J&vuVEApT2(^e&1@5A^+jtz*eUdmbpF7UsYOTJo*W{|l42>;ig z5sO;|c5z70uM>@lJbf3|Kqn>~6dt{j`0*b9j9z$Har~j@PMtA8M_$s@lq{dYH+;~) z+ScAqO+hhWvj?K3!|lonT!o}5_g>S=3B4-XFynDUdk!e@XU#aikonGgL(nn{4C z?>0}|Uk}XTX%2cwANnEQpf=V+YcQ2Rk9A@Gdf+2#bm0~{Y zxB*2R-npiWK-y-d#)}L#a$i2AaH3pTd{7nauCF++=j$h$Z38(qjAl=dxK0h?c%A^^ zV+%x6n!jJppX-sSvOfk&nS-?HUoBG}4b(TpB;c-(M^0ZI$Y3=e4yknzXFg&nd1H%! 
z{hY00DRG#TzA=+hCqjciGCrUF@~I`RvXHTv6$b)>L`7R)xc>uGLV{z|*O{1^=>#8_ z%W&%=^Bdp@{ZL+dn8xhou8*p&x-ghPz#OgDJESw#fR@m{FRiIB+_q=e3}^q@5K~L2 zOy0exAo*%YSeW~rapDfzzu?6oIc!(0*;>W9C8)koRK+#vpn6IluaO}1C423N&U@Bo zu{D9k>1$CazeOMfSrJ}kY~-U2eJry@mVWpPEeYD@x;lNV*WG{0~y<+{}L7QZTgI;1)`;%{*$fJ1om z5Ap67GKrZs&+DP6D$u|}QBzf|Q)L?s=xnJUzVzn6FMbG1-EKgpv@M)!M|C+0 zaURYTsp|VZ2to!F5LY?vN#B8UDzcje%c;fO&vXIZq~G z`wExd4Y!lBdekpE9_DiRpguArSD?n*ktZ{-CQgkrc;qqTTT)h{NXX0njlJ&RP`t%q z(FVB|{^NTI(FmPu+_;ASyywF)*r8NYbAeVTn?HN3ixK3ArZYt&lgQCbQLjlXmV!HD zOV&9#&v>?UG5$7ataOvhfsf<6quuT@73JOR)VZv^S*&DdQu7L?(`dCqKe2n3Db?-_ zS*_m@qVZ5t@sWRL$Ye@tk+?H`KNKBP^J9Ft7RFLD^YiY>j=gC8_nk8M#a5U;g?gt; zX1lLAe>d-6$(JmtKgJXJgNgaSz6fX#?h0wQ%o5z$9+5>##q(}-*{fUaNNYcGIX0wF zz8YFt$pqEIsN*@>i}Ko%IZm~Nr4HT~x3#5}lxogFzg<^;ceN~`DWy)~xA-H~yWRVU zPKYn3B5p9arA&fvxhD+TvH0V8dItJ??BO%qWz{n$BED}Pj%z&gB##pRHTPM7gPR49 zuQod$)lLdn3%k~o{+*VX;(_(>epLFgiG8(YqcBymmA0Ay;0>=qm?_Spi%piu)?b4z zp;K7b@4y#ujZ*Q-%&WO#5u4LI`2ZPm?{s|LCX8Hb6q_R8*t}xsM2uXCpMGflCohtY0>+Az3Zic&kdl%8 zLA*>qm)r~gtMPd-!0pC{Za$WOKdj(C6dggmOD-;$>Y@@G18tH;rI*^_h@}WZ;;ySH zX;-afC36hW7j@J|lWC_fY!Qe%E^Bn^tBuc43CV5o{4QB3@kK5Ue42|*ZDmoPd|oY2 zyKvACC-u1$NiabnCP19TpUz%pfj*D_^GR_QSJF!x1ZaKg%NB5(4O&cZ6!Dl+{@v_Z zWTY2=*bjpU>BlWNhO?+B;!kw{im(p9NT=w_V4`t8;dswBshGLpo{T~&3P~}jsVBr2 zhM|PGe%>!rT8Ah07;AEPh!-(l&li83#FoC>70L}u<@`2>uB<&^84*h;`)f6(Q{&xj zG@cY0v@8iDiU=O^NrMV8e0BP4yTwPVj*S%s9O z{j{iaHP!IbKnOfVrPw;jE+M%t72HQe zE3+U9%~d0wC`|UzfeKwT$Z(5K^VaA+%u$yuPMzh8PZ0GYdEao-tgCmkCv|!1b`AK| z9^>2Vj<|}fC?C>RlrrqN6*4W|a4tcZI&FXtDx@oT$rSRHLoL4NGo z`^bIWijoSJ?FcVMd^a3xg8m=YMtcjBYPeoD*lA1^>vsbcR3`T5K5H&WpPAot0D660 zFUdf7z@&_a{sgh#c`>X0YCxEtFX+?;wCfI%Ja%aTF=+2Qh{^A8Si15b@RX|LwE=vncxJ`IN&$W$;YYVL#$dlIb2OCuE`Se?fjg%Lhg<4Y zCXHxlr1=2>!`s6{84?9?&Es7-+eUFpAyq8Qpjz5-EBo0eLjQ(BW1~mITMgD# z=tXOiG{Diqs%}on+4}{RWJUD_jx^ZJ2SjdGUn4E9hDxJX=*&CxUx~=HjGJvq;{})i z-|+6~YFFt-{|1b}Fd&f6hNsW+t33R_mLnFBz`0!vGo_K{9Tz@JA_AWFpjjt#CPQTr zXET8@oK)a)laiJe97e6X5W6?9$#E#5Qvnc-=LLtMv3uzoR-_D<8u*zRQJY`0vVazi z8B5Or3Qf?pxK&0vvM8q|irp$aP_vYR@*balXDXj`f*i^|&2G!_RB1YO+ZejE+bqt& zfnX_@_pA?{5bTA;B9|l9Q4A*IXlU?fFQIzSqKe7= z7uRFN(~mvShFk>=Cl}f~Ih>IuoP}X>x~fDVdXHc6laCz8Jp5`V+V(D-5a_OVTc=%N zVl=JoX}pD*wiMqa6k?wJxjM)UTPLtFxxrY<`)I8)BUgrEwSe!NeMY4=p_fY2|6D=tnmL-i<)-k#T)^z{Ui`M4!q5*cPA}apOdZZF&O9v;g1Q zEsMUw}wnN zQ~6;x=N+SlY4cbmaIft2yr*|jHHyJX$KkCxKQeif?qyYDvuO&0x0aa30aEW@C5I2> zsTkh}2SnrXMM4k6awHCpHJ-AA#Ui_FxVJbi*jt`WoMXomBy>MWfNLxOy8V|D*>4t^ ziwvs!BvF0uvRsw;VU@_W;&)R@VoD!teA8tq-UGePR;Uu2Y}i?v zIM(fZF*My5z-r6Gqi#aq%N@M+)MUADfAo6(Yvf=jK|9RPfvD}aUnT91RBe9AlryQ;?luxZ$fOaVG{#*R{zBQAk zh9$>5uCyjxJC1ENDqKl=e3>=GinB}56BsRiY^FjMLrFnDK1037l8o6&89m~WPc{!0 zu?Oc*J#j3g&KYfH&;UfWgdVR~TH7`J*(I4NA+wf3~vs2K^d4IVd5BJ?{;Px|F8z!5mnrB)tMV z2pc4h*v9*jcn+9nIlHAe(QtIADWq(Pm}LmT>tVzjCW3KHL7h1}or~Bu?!^RVtVIZ= z@~H2zGdnn5k>9h+$J3qD!F6zL=aJD9l&h95p>ST{{0XIp_@l0pR>8hIf!1iwC`z&C zsGsy`#5{l4t>LP<9#$d=KH}HWnJI>$f{pbT7Vq{)Tu#4+ZC7JKks#8?4h2>_*7R=A zN8dBQ`I6UF1ri(C=`WstVb2(Upq_)D#9{f3(!Y@4W}Dc%ysm{HTB;x^h`bFBGHYy% z3rg&wP}k3!28hj9oS%CO-Di9Wivqj4h;I*UL0b})t`~eb*)SJ2hDzsa&8!!TnX2eB zIDEc+%dgbv95)pxY<~%P!#Cepn#iCHQvkJ<%3AJIaa>U@ot>_>vhq1!mV#QE_KORT zx$*KHMr6Ul=k^8KRYck=EzW~_1wvM2U4CvAjG>54?ek=-JIS7px3J!Cnyle%foxOI z*%|~R@g8QeI9g}~>ych8)A>k*1k|Ul#^XRHe=dIwBuJC`?vsKKNZ@l-N@<=&B(L_8 zS~{;V%weZ8%Wj0fLMkPyJVgt(6oV&IiYxG(d}w6{(kaQJs7niq*}76n0})v z*>lpW{Lpse@ciV8KO;t&i%PHV3>?!_nh0}UHfAGAB z%&h9Ci#F-&`Ew^;EX2{bg9~DPW36vqk1?ZfMTwo7j}+xgm-XKbhDoHH8U1=R z!?N-SY*0pTKkali1OjbMwX?+I4&N{cx|%n5$#nZ(mnQtcfGgx#Nl$6-xCFpH0G^Z9i4`6qI%klNj14tQ`nbK)bLlzeXH_9)*UTexVc+&=4&%kKFuW*m``rpo 
z5Td~%1RiuRjiK>n?Rm*8?F<6flGR{@3-jseOBpa&hQL*V9!fb56BlP5rc4tn-iX$m zhR^4WFp?(6C#O;qeBtUu>BbWWW&J;ImYI4)6;p%A*R{EVII5Y)U*`3G~{QXuW0P9DFG!-k8ZxnoG5KLS#CsH_*1;mF!B2M*?F(nK0 z_wkb2BCx_^#NR*ij5j;Nq@ex--blbfUJ6-FO;aR@y&PPG_-O1rB^ZLV!9I&;A{Cf((GNTDHNDJB&?j1G5zPi3D*wJF##Lq z(zsWDT5~sDu2aRQ?W>Of1fGXtH!0Z-;ToETbPd`a{3Ia0DUjm zSiozuu~;}f^RkkjQLRY^Z4wyoUEwQ#L_xpukPUA*vm$YmIYtu2kak->(TcFrK&V+9 ziCsu+c7fNow9ICh+t+O^)3qWl`*%{r;vAMvt0ow<2e29^$YuN7T;}O7RFnni@@FJ* z!xp;)E)wE^$6{HuK*G4@55*DU_ppWZ+d~`yBG1hZ3agtezmZhOJLMNcC4I&@wC`eu zF1=Lia32HE(9&l2A46*g zLbp(CWzq#ao{3+XZ8oOm%7m`tre{qQat!$!NFtG4_+*hxW_R4%qztMS5QN?Rm#_po zrDE5wB4|RZPzk=yM7FzEbUf(S6BhU0S0%zJX9WSQi|T{JqN#=Qo@f6bS8o{>*S189 z;&gC=JA?$c;NCa{cXtgA!Rg@c5Zt|Sw*(DNaCdii*W20W?DOt@Klwgrdd)Rw&8jg* zjf#EcDk!g9n`IAg`x-l`hh+)g$uY>UsIk@NQ`fUl0S~Yjjzp={8Ms^s(DnQw*L`W1 zUPGNk>$k8xJss8CJ7<|RmvieM8Oa1krRI0_3F^PnftK(CgSn1h{n!@@@7&nM^%kYA z*xWW=(U3Xu_-4&a|96J#HQ)|k^BU$zr91vT_0SUJNo_zFSP^a$l^|Gt-GKlX?$|}p zReNH&PF$~%Y>n)?ST(uh`=U$1UJ8)|Rn-iI#AD!A7fKXS9YYpryDn8^UiTJl_`7!r zcQ;5mgK5&Ehjr<6bSV1JfPlsRT%vJP0yYOK)HAM$a^N`vl+pH%k^GMo9IIe>Q=-Y> z`jILI??8#$4o$O}g<+AQMQ~)Z`WRu+2EP#c`AJ@_h~Kg>J_=;AN~@nF>_E_&w}(Qa zK?mp^mI^6d1|m!!Z>!Y@i^NHNl)msK!#*5_^q_yzMC*yjncfN5qRF|2EQMyMT7p!h z6V66C>;BTEuY>hIF_?7u8kq4!KP%TBE(vhd1FQ3AmO7NxC|gh7PM;M7DhNR$-C!@) zPc}K9OsuS3zvqjj=7hi+LrQ6KfA$l_+ZKpRFkx zpxwC{FI$2N-?*M8Hx2i|(9~5P5ECarSF|3OT1h|JjbvpoJlZ%Fet_qxXLIC;I_3*{ z>_gQkX{M*AOQ1&LcbG2>?yhvL=8{L1GxOWhuYtuW*|1ql%O@WAp~LJIiOUQMds3?C zv4ybw6B0nuP|>mOq|PUVM~Z=aI9Q4R=TUN#Mw0tf<d(H9x%i*_8pCZT1|j|PsJ7p`rd zkrdR*S~3VNPP14Bu#_b(37VZHzz)iY+uK)`SpE!aa7d{Muo&0V5U7}xOt+Kl%c5s$ zjv#G)t&)C&eQ~t-Af0a61}SW8K6b@!07yB;*&kkPt4dU7z#FRPgMRYJ%3>x^6(jYw zQyzGvIVQ1aY=Idwcfk`bn(}f_U$vHm7Cw1AI!CK9A0yb0#!Hj5#_klEmx`Q zG`^S+3=z~9ayVeHn$wBcXyeY2^S2vL$X5F=f18z!L43-P<@vhYzy;b5{U6Um0__Jn zYfD~r-$I>HyM@zOLz;n^j8#9!6Z;=IoRbeRdLOsaZr<==!2ho9(MA3ZVM7{&G}yh2 z>y2O2qAv*XS&hM8TUEJ7q~#L=2@z93tsiu*^NhHgin$%3ErboKqc$sTX6G!GU+o~X zFiRWlk*q3yEl-`|94*87j&dIQ?i|&eIcWFk-hVD|c)5=56PL5BuZ(2vKi5<`azNwP zDWx@;p>VZ(a%J~)h%j$)Y~r@!Nn;#Q#z3+6wEaf@%i;k8lTvdqN^nBs@=i4yjJgjmkn>Fi0SEn zt0Vux@CyS?Rm!40wpqq*OKa_PIaNd0TM1)cXzFtp!(#OrA{wYUs}C*hOZwfLl!xr% z{xBO&gz9F9!iixOI`d)IW~-5~lC6l9P&Q>W;;#|g-(N_N?ft+}xwiDp|b`a>H zF;s8@Fe{d%|+oF`p9#bUbiF+Ltd zgcrR`wEqDGP87bwa+P+(KLXg-UA|=4s8I{YoMe@wbxq{0%50)eFTo7;vMMaIx*6zMtv)TRT+_`y1NLu7|M4 za#+(J0KdB{3(BQtydDY+8l;FS^k`dathjZ{ga6QUC+wBEXLSo!R8UjJTXWNA8RyRM z0|7Wgy1iKD=^iAc(c`~abz;oWLSR#+ApdYcC9vNpGi`4#$B5L-U7v9-X~XHbYs3Dq zUo6AO>X%I|=~Kji-^IL`5P&Fsd;sC!+B(vg%z9cZc#*@Yux#H ze8Eb$&T@?vht%x4egmVFZj?t2GW)$$C|@U@1m15n>NoyumsaD3D1;ML+}@L=a?tKl zw`VhL^50dKanD*fVgtlNkm)ie$h;zN{;|y!j_21V*;^>==Aa za)B)Gy2fZ#`~w%Ps8mq`k);xOCkvAzOFrqN9HDMU`V|mf6fpuB8mYX zMyxNE=ySe<*~5W37X)o5C6`ol$9x%I$K|_SGJo?cEv4h(Q0|9p3Gw0p6R z{YN^@Wt}*JLF?Y*hPH|RnJm}x4JXr&ZYk3fJvd+vSznFhxgR=KS#ybrisD9%bQa`o zDaT!nlX=?TP^OGwAG+^b{CX}n$dGGV9^j_4AK@+@g}!ZF`I0%{{X ze1X`szZ=B2n+m`uc%Ehqh1e{ANE<~tL8Htp`anF$HkoC-|+X< z2jGieyxr-I`}_WpO&A1HCa z^m}%4!-OK~r-1d={Wv5NldhsP(aln!ywq4L@&)zs-0~3eqs6EC@+&X1P+|Z!gb-5~H{2OagqVWMLAu7VCN|W?^$Gvw z;;{k0*$nxMgrg%4MSJ;a0PB9bO_gmaxD{xU^WurQ&oF`vh-TS zt+lbqsTmU=QCm4#23$@2)lgfl5jRMM_5Sdc+MoDlAZ~@{82t!tJM7^lg&`l(3d+PDKb{ppz@ReU43<3TTm;t!IfK**{|kq68(;`gk(f4O{^kCXM4?vS zp9hx=%V>0jKcjT>gCAO>ivTLhVzMXX3oM#w)MJ$bM+g)Av93Ggco*L-e>u$%SRs7b zBOC}sej_HR9I?9XgVFj*j(|FaMCB?if5wRsLXMd_FU*|;QVcAe6JYTddf$wAiLO#^ zajE#9YQ!2s8k?Zv{C}wIf3z8N0M#09d+|}N(rp8AJG!^Yl!Px^F8pT7eKYX_pPI#H zQC(ZaVilG#x>?a(F-!9TopcJDCPT>$g`l>xXWJ=mt0BYV>y!&j7#B3-eB|}EK3Uve zx{O|ZuMoWh%>9(ac=LbI?yr}@_6T&|+W9&$TLWJS;Rr@y{|Zyk_E{s&s6_ 
[GIT binary patch payload omitted: base85-encoded binary image data, including the newly added img/spec_infer_demo.gif (new file mode 100644, GIT binary patch, literal 4302606).]
z^<(PXPBHG=Y@lnpumY+X1wDFmLNC|IqEcB z-B|DJEN2asa5EVlYBk>b#HRQoM_GCmr4u*JpFp%gW|h|SfGvLaTYBdi^%YU-CLgk7 zyh!p|bY1!RGXgl_xVKWI@M*c6n=N$nUKr^#DSfK zl9&EthQ2yUlRP<>S&M3@B}ocLa(~c9n8vG)2t<3qV*Iw+EjfJb*)K2x^aI=+4<+dg zy}Hl&!63xlIM62?aB4jX&sD_UijtMaK9*;iFh%a40y*u_Hn(U4qA^R^D6=mDf_GLt#KM2R7tzHssjUNaD{SnFQ>}uw!zfjI^lJf3R zb#xMtJOQT)YVcTxF)<9()Bkhku3frunI{%Br%;rVoxqHl@68&F z#$yRWvysAKGbH9Wi1er=B7u}F1YQ_^opwsxBA2IZJl3IOS0u7UJ%4JHtajL!cB1~5 zkN&YPd&3c@BN5Ab8W1GS?k?828ZAcA}Qv3q;Ktx=}PfgzE4lchUa;ZZQZ|f zDY2`;GquJ9`r{ss3DkOrtGB^JlCuF2{?<1qsS*9#{U_V6hp#+u2?7>N< zL1m(tpxf+bp%8vk5?3tgfONnUJFlmNMH=+y^T&f&n50Te^mS!&MIo_Hw)cEvSLDFm z-9}9Oyl`W4>#)*sTPBrpKr(VPINqq%%urP-l0&MDM-a~K%i^MOj|GQ5c0JN1-$htX zL`BGavXiWgJ=T|4k$!*YS*02(($J&zu6fEnTYAe`J3HuS;tbkOqCI4bhK1y4Zs`TZ zTctKnX=d^aJ*M;+p5oT2O!x5Busex&*Ck)J{r-+MijfR_I-35ntUy)z{5r`b)N<1) z3fuxa(+BH@OvXui&e>v0jK}cTP`;{OVkM{^DV9VM1+Fp-k+A3c(EYwi?HHwp>>K^+ zDBq%EetNG>g){{#odxg2sH14;3r$>)p_AMT7_&vFl`Ka-Owj4`CJL zez1;&sGtzL$V!V>*|DN0Ghv$5W>?g+>V(K|Fye-%2|Jjww-^`4*GJG|1V5~uX8R1x zELn~=4&9O8i+3tY?G20mns8z5!(2wk8q=y{Umr-yu9?_lyUa9$I*4h<=zZM7X{A0`sBTC(6KlxJJ3H2|{SaBqEG+(t!BTM4gGSfv$w?-NHk0 zXv|G=T9=2k)_x@Y{Q=43+M3Xr`)L+C@X0<8>Et?G78j~UMI@}4jRzjtL4_V&fd|u} zy+u$plxi0Bv?!DroD5B(?zszN9pjp~rJm(r)dSdCI-@#|tM#_6a|(0WA{>N|S0>6LNf~AjY;X^_erOU|kAUU1Sn82(5u(WJh2*0mqB0wT=-D z_(~(1Y;V49gdd>Mjm;#n(_38$s#KA z?P}Ql;EZ>roDzGnRLNfRyY*gvY-s=SXM`a_0gL|vLCP7-Qmp#~@ zcU(;2MpO@>paJj{q6rv8fu9g(8v1WM+CT-c; zePh{)+#Dwjj#(PT^5msa2~O4}n2!d>TEO0K$O&L`oOqYih?kW&BgD^Liu{T5`Uz1T z%kekleACM|qF+|2N1L#-qsFduFJD&nL?1edBp9N}RFnl34#|NZ#loE`a|!jx03!N$ zrO|OLI;9e6T?um{aZZGBHlOrJGQ9c>lB-;gG~wr^V9~>E2txr}WDM$ozm`aay5_+n zxkd(b7!{9J<{@G#vy^}4sTrE|T3_wVQ)oGf&N|3(;$AKLiAi5UsdiZEX7UjtBh$) z25?pGDWEkuEllWHk`ti|@dBf+PC*cCUCFfTDHO!1OYq}BRg(=XOuKU1f~&gsR?YI| zi!&&F&uf2dIb<&mi+?KxkG8*eBZ&g`$y4nv^g#JTm*@CxSR?JcL!-K(tD&g8umJ_Z|w3J7J! z-TeaF3$41FvV3uE_Mq0o`0BPYj-c#!sAIB!Ib`r6i&OS$JZtLI-KXQYt)7?8-n%n> zi4Fa72QtIPVy+0}D}JfqWi*U*Xz0az`iC$?1Ue5&3hj5F}&7!r$;TZ*mhIW`e;b`3Rq(rPU1an_hxLY|zP z5hpdP*4?P~_U3AX2zZgLoISWq(Rz5!`Jv~9hkg-N)K}Hpg{wWaYEDCH)a@TdT~j#z z>Op+zrqex-Vuh<>gdd%>$DI+rlXz`Y%7r?Hu-pZ$;xzBNnq}nSCFEdQ^(n7%+8E0I zTD{h3biQ{@p;mqC0O3Vil{FvAUQ$n4MrIE}?`TyevLBaTD5g(8KEA0z(Wv2G&*NJP zxFaK8HX6mr<0VN0A(gKlEQU8GFd7eQ75DsZU~VN=?A$hTy6NZ)`^N8_ z3e7WG&20*oCyknCuQks#G)K5JPxLe|eZuQh!tJRIgQumkG%3P+^sIg(lqP=6Y z-z)EShl%!GW2k+N?b|%>JNb0DC2%McUM}-he@=JM#!-y6BW5_MhhOjer9UuDxcRq% zgRxBuS$pa(@>C3iB(EW5!r+FL9kj-0+S zN7b04{}Oeow=+k(^^|-=wY^EUPgnj#Jz#d}Hg**xaGD08-!F8k`x~hlcWZifHh)15 zDR-6`_Y~N-I+IFD9eN&X7n^7|);0F%=V_*-cWd0Bv|@TS^AcKYO)jMO_OE5m8oY9!Nhh~j(hy9tncka_7&m&clrH5lQTQt`(Uh3XSr_9`1y*@ z^Y01IcWtg;>XrDC{`~LSbD+qtDSQ6AOyZ9=pEZvpl2E3$aIp-evMox?5l+Wf`_o|l z9}!}r2>&!BuDsv75|KiJ`_uW)9r%b02~^ZzNEu2R08t*_Bn%?Gjt6=~jz0;41t6Mu z#cpXpr5*|;UNY5yNwa}C_Oh-3*bBo<9JBX{)^6rQQg~3n29~_RhV(&=C<(3d7Y8HJ zQaqrlcv+Y>q>_d-Ho0;jk#h!Ltj!+YUq5VRj|N~^b8N)E^kqUN6yOJ`ece8jFQk`W zF#6h6i6Dx~@yEkm@k1#BggF~=j>HM^<%ItkR_;RWa~w_~q3pQuaLQN;_a)&j%1?j@ z#0~@}j{D&`=PD7FSd_}}7lbr#KOPLNJlSMT%q~YZef8OCy!*IvH_5 z0LFifBXQ_NASe3z(`4PkABoQk@3rO;fI3OYgTgAuwJQXGhDf0!Vf1$d4S{mrnDY8& z6ktP#MZs4^Bml5GN`?)R>8MOy4#o+-YYLJ#HLmLU@k-%GPsev|neTiG zAvu4iZ$;0QRJ=m5Ax?&qDO89Y=|g(&RP0~uQQbwK3!mzBIA{P>4TFdfIdWV8$s9>> z5-5v;Jp#UT5?Yl6wtZa4cjD~p=WCHZ-*1M9IljHuJi`zJHZ=5JD){8Y(WQML*bd#- z{g?vKpb}~hKzqUGAR1bT1hlYI&rRl1{HA64mv8${5!Nvagp==XE#{nA`ha<|1lXSw z78i7hV7C9+Tfaro*hLHR+I@>-DcFyyKR+sK41dcUTU~sKqkY5C2XNTWY8p$%{ErT4 zUzj9dOhX3(AWi?KNJ10p=almx!c?>n_^M|6lJN7RP6b@qc6yi7c*y6s@iZt507c>@ zz9UCS6n6f}$05hBDR{`B(=0#nd)S$85nG_tix9o-998h$_H&0-%!)8$<@&|y@IT`! 
zn;}>VXEPPJ`Mo zzI*Wg;mPrD5u*=+pBJzDzWN?DQGe{_RQ}kH7(vTf@t8RnkSX3PBCw>+@A0ZPixRyUs$aXzqt6(NaN;#i-{-);Y1e>^X zZ;7#;_|4qMZKAJ1TA-fzQ%?FtEuEP_hvHctOT&CTzFK2XQTlvOM{%};L-7({sz^s0 zPj^%k-nBwm?>eG?#MxTm*ore*`BI$AlqqS&#bJjk{2UTcl!qK9mk0>Z3cd|L(+2|R z0q0}i4hp1i$GLe$)P8gGIf{_9^*H^lR&Q6$+6(K=LbKEh{?6ZudV;P+xw-k?ymZ_v zytMWq*{|TmaqsAcZ{NLX&S$@S9qHNi!zb>3-)^7dua5ojjeqm)*hJi?+8=%iUth%f zp8Q>#E`*S!0Qd!x2TAb7FGD~@<~H@R3Vmwn5TlDO1S*30MN|llj!(rZY3Fibf5mRx zOvY%EA>!%@B^ZKg0(qzijU5FJXWra)KU8@7`_GWVpM581pXI0v03cMNSui0Wzh+)QZ?P|#@3qSn_pHi{ zW{GeAI|qR9N}Wa8?!8u^NCpr|F~uSb1|90~?T@RU{-9KX84Hqs;G88!IFfvlg(6m!hC zt(se*`Ob^#(W3w+4( z9fcv@zXTPvmOz-Sv}6!ytbN$|f_Is#OoJ8UrwTRljE7Sklr3ef9c$>iQurp9VA>hl z0RnKmtsB$aPTdcH$!9SucGI8oArL?!L6Ibg9N?vj;(2m-0x31w-_X?64$fBtH3*)l zT<$AS1tb(zLA3M(xVoNa4H2S9RG=Rx{C4+adNM-Mc zIyXP%z38#!!YpLF7?$sYqyX{GS!}wiki6#lN-Z4DWy9&OoF(~2+oQGOeN;jW`+n|x zdv<3yk_t_0B|T-hrXpaYBo|G9)|aDaygvPAh;Wk*F>c2eKYl~{OWdU%Pdym4m3~T- zE1lZT5K8k2g5XY~$pNlF72htAW6ly;(uHlNY(1|c=yLvrbr9_Wims>7ND+FRLzt>b zlnnzi5dvWn;io(R;+`!(A}*KYBnE+lTug3?983H@s=Zj94z97#GHMv++UP(SD%BR2 z5Kn*MTz+?-ar+jPcOr9iH6rxemt#}1WSO=>l$ioKQ3$fVxZf9wNXyvSO>G7-wX6-jGHu)S-qYq-B zq#dtaVZKNj@R5D5^P`?I>!D9C>IR2FJdpde$JHlObUS8M^^-OsRW$&s=MH+ zF86#qM{9L`DN$dM$DROwRbJ1?&q!_Y>gyLdsDeZMFg zOR9L4q*-0+`}gbq?XO;+Q8ueH6McC2Jbs+F{u-ZbK!SxmR^E)2O(nwjgjN_-N?r@C zc(lseU!96v5j6>ARW^w1S*_Wd_GmXx@tJ_NJC|3?h3$hvh!V-fI;b9B%6N&XRf2oA z%96+zQ>Rxvs^1rSbDz?@Sy=TxMF%BQVROp?n23j1{LL&RSH4rZ)gf))mUMFs6(xL} zIZp!muBz4IC+twQ#CAE>z5#O!MyrOl#m<#HO=*i=J{Ie{9hCHpEez5v;%wV?msuD! zw$V;n7*1H2=3G`4K0ZEDsoWc!nkS(Jh@Lt=tnjx(wRIL*cGvR^E|07Lg27x|QF& zl}9AQrP0bC_Y@!7E=#pKhUboPV1-%>*#swm^Evf zh)pcc=7_eed2b9%*^NM!cWo`%lboVg|HIU&Np zk(t_LmN7Zmfqd47d@holl0Z&PC#U6;&zF(cFEo&hvpfsMwZIbSGqWsCSCEtbu)kwy-sNAUyMMPFt06 z&ug-MMS^`@y2ZZHbX;9}C9cQRr{{jJ$Rk(#zzO@_*`7L_L+gA?{Tlu8t@MW8cHL1C zURio&Z?|b=uXwye$An0iwnN{9Lrnyn+RN}wPwyCRZ*%DFk?wKOcI=UJ?2mMO3E%l# zzlR?u;xZ3uoUq&ntbOxEMuOTWavayECemLeIL>JI*M%qVo9HXXIvhyvlSQ{b3w=7> zYxyDB=^5MUO|R2}@ts;r>r`t0f%*PV6E@z`&mV#33l96g;gr4_!&gL}Kk;$;(rDF` zv)|HZ|61?G{8XmaQ7XfEbHTtw?o(O5ox*Kr z#YtzSU{SA$?kx+>TNPahIxfm4E-H>Ls=h92Q7+pOUDPvNGzwfa%U!gZT(tXKbS7PN z7hJagaUqHx*r9kpPv^i+lLPvW2iAA_9@rgqU{B%!gNy@)1qb$)A24bTrC}4t$barqg-tgUC9})wgs+s<*xQkt`2>!j+3rV3$FYB zxKcz9Ix8M@(K&d)vlNGEhNz`G{Y^dz%9JoEuzUSvd=AQ(k*(yE#{9KP1HSB z(fx>y`%x43I7jzmzV64P+)pIB$7i^oEO0+n?tZ$-J)zG%ane0$!Tro1ce+5kY$|EJwBQ?V#t-xdbe7VPkCXe(!kBmu=%mt5&e>~Wto>_{X939VW z6VDt+&r80Zm!mwdBzop%c;*#&UM=^$*5sMr=UFi6S-9YN{f{SC)a!<#SCNj_O%tzM zj$XHYy^5o}?j(AZWO&^z@VZy-RodiL*5_3|=~c1db^ni7rKtAoUCS3%nnfdp9(BH}-iqO?o#kct82$%@g%$QS@om@o6*hX?OJL@b!5b zr?=0iZ_=lK!RPrOAHJyXfTHi9j_;6(@35oq3t!)nDBqWf zzM~nwV+FqB<-V_)d?)&RCntTU7JS!V|M3-w`b{hP&FJ{OG4Xrr==aXo?|qctheW^G z48M;Beskr1pPKyU`}`Ir{T3JeKL7Ds5d=qGL#r8w zeij`1RetDq)1kG#Lw_a@{arY;{^t-7qe7IZP+cm_lnQsEBK)YxXeug+iq52B3aLUB zRN-c-NIz9{iYm5975_`ciup?@`Ah2hOPTshJNe7_`O8N8Z%Xo)%k-Sfj@>g2)-}2WVFBY&>DS)6GpllkT;uN6j7oZj$uq`P-Ju^U~FhH{+K&v@GyFWl@ zDnNHJVEf+yqFCS#r9eI1z@4Un>-tWCyZi!oM+fdn3N*+JG%O63Gi zZxBT+c%yf^=msA!4R&=3KIj+h79H%K6zq{1>{%G>RT1pn9PHB{>^l|gw-|irZ!lHt zu)osb0NulZriX)^4hQ=kJ{)~GBn(P#M)-Uv2bZAOaXliC?XIf$C`HIjB&7tZ2p&3)5nTw$p|Aw-~!m^aYIJ#lk zreQfwVVC^EE=Px5Neau&49hDFyIK);tvM{eKdfLXtZ*^x`rk0FSojU4@FLyto2KEn zoWgJWg%?MM-$@ED$qc_+7=EuJytFyItUtVbD!gJb{QlqYO0kFsN)c7M5!I#Px8`)+W+3pnC;TQQd zI`UakWM^h%S7Bs#<~04kaGTA4$89MaCyW2D6rimC5}?li5uk4WCjvC+KLRxD|GEHW z{Z9(ePya&!`ssftK)+27|3`qn`acq&3;z+I-~Sb$8}0W0mh>;~{1lT-h0Z>$8F#7vXbkT$(1Yordk?^a+ABiz7kNQswdTMGPcHrAI7XVT4;DOwqW zh@o@P%3=4r)kLU2V)++D{1f~^0OE#C?61cU(w*T?RD#a&Y|)S%OuDu*8}&{Zqdsa4 
z(Xvu_VN|F5VC_p)02_XOy>ed|-VE1JeIW~GMMw7xK?_{AL0?@{^VYg~<$~D~Omb$Y zzCyKBkN@Q%Xv}b_@#$LD^qv;=8V=Uj%DXDcm9>QkJ6i0)U_qTor2#Y#0~YkARdwm0 zX1$fChxenTmKfVg6q}wRaU=F3wNz}m{J2rwiD}fkD&OR5b>|!E(qGkO{;p#y)N_pL zbG+*>pRQlF{%eZmxpj{~pFx88!TG zFqT)kzx9(=y8`~{@2Xz=66vrk7NH5c4n@A$kg->rnuHbqk+JoStgPm_3!Hfcxy9ej zYMY;YxQ6>`|0M2iQ*Y^$rJg6utS5havOvOQT{)gCU+xFu1d?Xy<6uJFTj-Dxg&ffk=GLGBe(VEQ(4@z;7FW~OTn)QxDLQv zJR%oE)X?ERTX}^_n12>3B*$2fJdwQCd~V{|nS^H@TSLl^zX+w$Ju=HI?FeHg0BWe0pjkyZkXd4;y_h-WNA}O?R9Fl8C8^R@wY49c=GEHa7ep+_|2wnXlc~Cd@}B@-f={ zqB73ok>}|0=Kr#>ON~z|>l=1be6iniXbbO(fudAO+ki(@_NlASm2HnEE)Dw^jC?PYC#&~Jd^ZJsp9Rq}T-|@JKvHJzDOzu9d zEq~Q4$|a1wx}|unN>R#h?Nw*NM0e9fuSs^0&jdbpqR(V<*mrV7XY{$~q|(;Om!dDC z*8wa}kZnnv3MF#lh*OpT4Z|V#U@!X-rzqIVq1e|?nh@SJv?Uff={r7FK8b6Y{M|Ho ziUa|CFg=+qOM*c0Fy+liX#m-{YC5b@Lcw$oel zA&NBgMr1LD1cxY%{aJV|4-srv8azjYh;U!~*8>qJPAGQzXxhxSpQwOtsM6mYd~{1P z05TOE^Zu6o{U|K}e)zr_6@98_Dybj+PTUs?<0I8JBh5(&R1(?`5BH>^Rl44&{(Pfi z%1IziZ%#tTap87cWOx#Kzb@yL0O3bOzAt~1u!tapp}k4)*mV-e+LV)6_#uV=K|Swn zG#9@4FN#8fhm$z5G`Is7rE(HM;=@Hl=XZ zR16`(o>A_CASmyB3P*MnzT5QR-4ijseFZEwZ6m50vf)OPsp#&C4JZpqAaWuX-;hi> zj=UK}#Z(G!10@1c0PIKz#SN0fhbRi5DKt2W2++h0nhTiy5NR4Zo(*6$0HwUsOnRFl zKmx%GVGJUL{U`=Dm?S{rr^DG`q8Uo~2ou={;FNa?#06o&d`JrOFJlzEC zTHn8_K%CmV{sY8(12lLF7ls6DDj%T^+c$LXYRDNh9RMx-h93RczaN>x1@KDfLosZM zm;@A9aY_1$(#PqsM2i$w3RU00;70omj3`|37fc|u1t_Fq$AADU0RrKs>G7Q%^vgSf z`)_#;y?4wvJhu3@{%ae`*6gg9s@L$W^DWy;y20ba9}ko`-*&p~^g6XbsqZp{p4v12AFsaLg)qoGk8|6WJK6mXtW$)TfiYaRGwahM*29Y zW2p_m^&C-{?R(#fJ^Gvd8LRY5ip-W~7|iPDi=~s#!Qqdg;EiRwqq&f>=5(_KF~^CE zF^K<6R>^*w*06Fl?u)JZ&7324nx*hhFxBeeBlZj=b{W5QqKK8LEXIHW*(I`s-XE7# zWl}O2C1hr-k^szj5tyf1%&60$`YUxsqKqJ-z;S&TJ~Pc1MHImRPJE6>C&C#s_Oi06 ztjttx^j)=0vRqgOMq2QOZJU?MfBqme2(-e#+XD89Xz33OT9ow z!SGhxS_OQ^gI#~{RzxSVlobm?R$SH5L*W&9z9x9OD}&_%r%$u&G_f0N9|InX--?$k+#<1^?(MCo&J8;!J{xf{5ZXiT3KhgkYUa||+}!T!p( zW-f^DNPYoHHi*Qd<>@rv^?RicyS$H9kDQ9AX!tH~5?3yJI+8(>BcTk$&%YUFp<{2p zFPxnF?f6#hdSU+*CCmH!xP)GgyK7oI1<cU)&!10~``i<~&DEf!j2Goc+b&20wDNbAT|mHm75l@ahrjTB!bT=i$!k~a-M{+?cHyP|S(p=Zx}!pe~QmOFxu z6YGhqZ|-o`zI}SMp0xJ$^~1!UzkjTs0pVwpe%Wu4k*)*?x&V*&0MNo>0134?i?p1>##O!%Q66Nd%Hy4FRwl&L;>nh3U*cgZ zw|+h(($%b!4<5MXC1qK{X~Y+bIJi2qY3li45S>Jw+lac*tYPPQ~5$=V)~^%$zhCLtyW* zw{90zppsyC`tH3!M;tgT2po@v=;w$+pA19bA~P@~o(H{fq07{r3&hm7LWI^}MpBiG znx6nFTt~PwL4kyb>E%dlwSd30 zo^Or;v)LQLUN~@ec4HXF;~_9E5ILg8Mq){-Dd-YXD|Y*O;W-s(gfvecCP0de#sW`l zC!L^?>M(Z$4kZHW3wbG7Ic+~UHc&~q6rgXU161for;QjEBflGg6~?I^V+Nzq-N_Yl zuiieuQL3^E@8z=KtDI6-sl#qfzkQdIf=gdHN_ylG*~HWr zIyL?+V7K6|5P*szDWCSd(zLp!P6g~8?F+MWm-l%{2q}&FmZN!X5COSV{RH?v>4*!0wV7qpu?MPJ*p7a+Ww_L8OA>NZatq)Am9lq(EV8@>c1m zP<5XjMgF(i-~)r4PC+s!@j83msq@FUaaA$n?3=-9EfBdPQEHug^jAnK-!JeZ*8 zI4RICE}L5bV_xaX+Z|DT@n!OPX^*tb?^#6@b?kVm23s!TnKg3*py(Uj+D!zHkAMi% z-=+3#{L0nZUGH~>c)uD`x?ell2AeRQZx`jY0Zrg5oa2V~Q>PcUXkZ_WB%_5>H$j(3 z?x%L+0Yz>eid*`uc9?T>UD8lHP_@rPd+wBeiX5eQm+dX5&sldYS6LSk+4UB0C3+J|uAt?_=&U`vpY1vVixLubdsvwAJ0{#Dw|C}51!67g?@ z9um2cOgOp&C|p@T#mW5}CqS*Uoj;p+&Ef$flIj8vJu>p#gdPtCWsj0am3Jgfwvb;# zR?^e((2bou-whMzNP=hH-r4TGrw~Duf}r_82rq!T)&(dnD@{Z61rTq3+XFeQ4JZK& zN-k0W%m|#6*ljOU30#+4#4JF|fhnoffA;-7^={R4ts^iOJ-TIs>Rt*{Yn(1@KyT&m zO3xNXG3a8WIQwfDjk@xy493QwAj>Dg0Q?`F?g61cpliaC_o3OuFOc+4bPP?7@>I@@OE?z+w#7?G!M z9k`9*o5R=_davp5b1(1Gqer_yn`hcA$d90@K*n1eDbTBftJJM~wYEdk0Jscdb1x=1C#sUD(ZOn2{jU{$jjF>2{@ z8lOkW6`~3U-VohgR47cOSKX^iKFMbBw`+1AncPy@Ljn+ziIy?v^)>VGvZtT#wuY(* zo6%sq)SEg&VKF#i^Hi9AuKZ&TQF?&a~umWK-G)oK5>S}>)uVdPuPuqtluO~2={@FJpS4`AvtS_#S zms_4@=wGiH=5iKK2?Y|BK}5%|Qe@443OeA}xN@>$7Ua}v-)ZMF-mEJJ_s?Q*c* z>8^RU?U6KwO^NO8mCkD_8@SW1#9`A-=_@LDh6*WQA6-v21tMOJ&nCMO3$<{Q*IV&r4F^t-FWt=uPi+II_T|jaD1`WUAIQJ 
zCKi&>P+PX+V3}j!`5xI>){6{Ae>L8}vVl8&_J5Du|07}lYWn`4 z`TKvB?f>2Q5AOUk@q8H$Q(nc);tKYnDK|nP29-|mgSST3FvuthDv^TDpkN9pLgf_U z4cysB5uK!nEl|Y&P_UxT5{k|nxKql+S-O_Lhgbk9=!YjdTZaFy-d3T>8Q1q8+_~Vq zAv>j4F;c6}TSNypye;9s;m&Qr6qU1s3cCg+xc`Ma2X!vF=+wICiVkh@g|>@q+EKzH zV3XGgB{192WL}q?zIJv(n-idw7)m~ND;l)FF#~u}RC^?sbFJWyq;47h_`=~5Z2~0AV zv4_GirX}x9WtgCa+|M4A8HJHUAzE~pBt6+m(XDxfb-fc(NO!d@w{<+``e1h0!OAIQ z0TbHyP>Bp9sxwKe5Cal}I}0%&CW~X;e5@ERMVK;|l5uqhZA)NB3;Onyu*B6_`k@SC zyj#L9`@laSs1FnTqFZ-0E$jt`tTS{~WK>+vzk_?fbGC0Z>eOiSBW?pV3T8I z&dXj(CQw;i4|kul{h7fmSu!jeF`VG&c{?)sbuVPHChg3Nmw}UMxxU?r3op-R4Bz?0 zNXhVswzAXWJ4jVR{S%Wj7Dg`yzq}a?v&2J4R`5Aj&%1rYuY3EhcJ?ZHj8w0%?rjLs zKbZ19yNYtx`%PYzh~_O=*XxPNn^);7WN7b?!LBHE{4Q^zWnm(G*!A5;gv~p9 zez^}&>>Iq4Ht@}7Bx+L6e$v9jckq&1yNT~$QsDE%L!Q7)Dap2mYzR$J&OD5in8@Llu zROjF>=j&6SJTVvdy3FxExN}l|K9b?z_xksLaHr;gkChA+{EIvPABa2G|G$7cDP8KjMh!bfz3__~iH>@i6g8R|HC7lkUJ>=GIclOm zYH})SeQGi4_1`FgSoE|~^o(xw8`J2wPSNlDqTfeHe@Kd+&5Zt77(G`J{i!*6zCU_l zDtd7-`t#rDC9#-grI;_eF<(t%R-9tK`Ne#Xj`@)kvzi(6voPjYMa=K!n6>_xKT|P( z7h~4{#sF~|WD54YAtp?3(?&@|En zRE#1DVxbENdL;ySI*29b0@QJwnftzfXYI50 zS!?eHz#l9i3CZXE`iAlx2*qTCVoO4CXF_=|gz^oC@;?j}m<|=>hT;Xogyh47SB8mL zhl#EU6WbIfz9($OfiQ`TFv*fIsWV~H7s6zQ!(<wqVKE$G`7pw2I>MS8K^Ba(k&m=p8EI!7X}>1YVN;~z zp2$@PA}JY>P9>4fXCk4`NL+^_-5y4+o{n6@jid@jt(A{*Um4|L9p$+u%4<`U_nxSA z2cmp3qI^rD{LVzJzYw)yIBMg=DF5lGP24D&VDx7B=q)Rw1FWOBu89uZ6uoUv^!5YM zK^f7(CDA+1M2B36-Z>n->tXcn>F7P&=upAEVe)&!SMH6l-W$1QZ`7u}(R=ppJ+L<> zV{dH9-ncV+_g&ZW1S^J&Zd&9e0Lj9{PuL+1fnyr?vV2=Q*<3FCgvp1iRl8bbpw%Z|KH16J6dg zEtX~4H;2f}IPG+b{f~9>^kL_piCSo#JeITd?$O}cqkca&%CiLw`ME@wjbKFe;BqO z&gVmn?W^t+kYxMvQtcnT<&b0x?U>*7*ZnNn&R;#VjM>f)H!OR$AFsE(x^)R6Z9m>> zUsi2D{?-M-wsT`w7RD}qw{5R~oVfPOvKfMHKfio1|LO^3+kTmOzVLQw-2CzX=y5YN zNA~@{*6h#LteryP&(>`5cWZY3n>9;avStgvS+hcaw`R-m|Ft!1`(InLU%y+k4x+H$ zzge@|e_FFG{eQD&y}L)$N^(4%R3vt}r)Vf1cXujBA7)VGN^`aje79!7WU^~KK8akk zXwB9D+}3_dCD1wmS+gU_6mZd+^;)uKyJu~Lu*cm|GT*FO17}_%#kY&rY|g-UYnF)t zT|9qSvn@YcvvOV_WX)D%!h0LedY3sW1%g7(@;g}YRbsoSHK|gNHG4%Z$WusLv%*u0 zqO@btnoaP$e+9B;d9R$RV=~a9>qKoAt=SDpQjj%^mw42`z~HnivQnjDcf!#t3Kp%| znni22plaEg#Vczby{@j@_LZckSyWZOQv-gy%uycnT(V|=bCfS9A$g>-m#o<@1ezRV z%}xl{vAGHps!P_aEeEn@YgtRytZV8wYu5HBYt~rvyETiDVDY`fE?cwJd{_dZt_red z>n3LCgt{ebw&G&aRNbOAdmMvt{MnkVTC`@daH2A|fbFbmG>IQrv}V~-K^(}MJ-%$s zQXPm()+}q;n$_u_F8XH87TDsyTeH=0VPL>c>1S&eg>7#J#lBgyjz6qfLxr3fk)#@X zB79xaYUU4XR;B8jHOnLroG5VW{uctmjYG?3Gk;U0yzy8vE0AS5kknX1y)e z@-JGmXCQ0Vlfj$w2~=OQX31iU*6hHdHGA#5HEZybHCz1`Yc>h8W?TPe&1PYe{M2XsYyfJ>n*I8dHJkavn)P0^X44pdTC*@sIaQ2T z&OckTWHG70JE~IwsXwe)4EAI~89%T9S+g|<|6q5BH*0n-<-0Yz>Q8IdI1{&M&DIj+B@X7`maN%8)Uq|3EtdO>HJg)a zCGip$zN1^?7i+fWCu_ED*_xHIS+Zu=E?Kj1V%1-)*;1;Vq}if1dq;Fv6=cmer2fU4 z1%!UFX5T{AEP2_QJ^dGJR&mLi6;)ocX0yc-{-ycArD*(W%{qyFvu25l)~q!tNoqfbxhC?T zt=TD_!9;`zlY>+vD2lFKwq{KrYt|OBW?g?+vqFwA)S@-Z==-ZROG{m}W*L84v#vj^ zSzPR2tXZ*Ntl9iBp+##}n*OiWtcs}Dc%R0SHR~q!7i*R(FQLZ#X3Z-7vo-4tS+kSB zShHA^3^YeJ`MWiXFHN=hX3cg%)@*~K$ZA5jV5%w;k-2QmLUj3(H5-6}tXaJMcWd_L zk~LesY|VN#(XijFS$Q@=6kN1so3KBuS%hs$v@w+v<0mUVheY0lQFI%&X|7y)DB7>H! 
z*`V*%Z0?dZdlIr{kyyx@T?g=F{neWFTDE3mP`_HU1Z>@3t=W@oCu@+g?k8)u7P4jm z^q!c(NYp_p(hIU?`*}jUuqhn# z6c(1<#pBwA@cPA?h1|iB0yYbKD-mrgh29N)nHCqf>vnl9BYj$GMnq3`Q!k$3Ip-?G*w`Se3KdjkfBNc287P4k@ zf3ju^ezsq${*UL# zYnSK9LO6XYlJ(a)vP&omnj;@3V5lI5*5w$xI7jB$qKHX&kNr@9tA*ysWjL>{Ix1jP z1EL*%nIns!^a0G`92vkubL2>9j@-C7N4D;wxKNSF%X4HHkI;4;m9scU-T|PBmgmT2 z=s)MkF!bUac|EEgj{Y`Bu3wxZmn_YZi=a932K}9Dwb=i5jtnk4z5g^vuKRy6M+R{JGDp7g zzip1}3eAy)Q2(pu$RL#m{60s{{`(wx5UI_^L33nqd5&z3{r`N9ycUo8WsdAh27fW+wJSO7fe>X=4a9YqDne~@Ba>-9~`dI2@m-uF1Gw~`=E!V3=I?XleamxX zN#xISWcK$t^2T50$N=tdbL1D_=g2A8U*^amf1M+525>Py&5@zG`0^YX5`>{Sa>m3z z%#jZ*&5`qGuz${xb+AiwWS&Fc=g1+@9GQd7g67C0*z~12veW-$jx2!tkLJki|7wmr zf@E|4I!6}7bc zPT%Ip(3&iJagIE%ObuT_P5*O_jOd==S4#OG&5=)4Yy+3)$S-d1<*$A4W#9bt`_sXa zu#4LeV(Pe#qP^4IYCufI30Z)ktSX?FtX`kzO`JFgBf$bi00v9FG4FmAKj07~>_)g5 z&7M)DiiT;D7sxLjS<{VtGm{fn_o#O#Z>5mgB!G0p{ywpZR3`iu;b8XySqLW1XF=9% zaqIeNC*m1Jl|$les*1DL;v8A0d}oFesYf&d5EeAMc*;&77@ngA(CC?B3Qty|Y|-`@ zP#-f!5k`_|_CktBfFu^2*!3inOZtkZpb;m2wo!uPID|hA_J-f}d6GR2kT2T{c`(!wg;68bJmpe+pPY6sE)ObOjbcRm= zf&$@>9u6XR==T;lqR9ZCD9S;YPfMF%Y%I(%fUkX!KY9Z@3RmH~3;2|(rl_M)*(unV z$0j&-(kYP)8}w5Vt}l)n|v>q(DBRRtqEowB-7S(CH7C4;N8 ztqn5N@#jPQEXwTQ=lf^#By=~j^|c7VTzkMDW;@$Ljyj96&1@<#HcSo0T$$5N36M2= zC2RLs_R+K1DZr@R$5(J)B@{GPEOz1tTDulrwXevIe$r!HgOZlotdcRF*g(%E-Vk4$ zBVWM9PQO|cF4*(?w9`SjnTg$j`8tI#K$#ugrFLPybPds(mXwR1Cu1C0#CMuBly-~_ z-!^LE7HE!)O!*C~prs&6BRI&7P|<>w)TBq{6X=cewi1`A2OALsxJS^TvS-voVIhzB z3`bEYEi#N?g29LrF73On3179f3y6ITbGteS=flW?j4DtsMTHkn0JfT4NAS9`5$b04 zSXC-rl%&XOlP^jv#xn=7%5cmm@FYH_Q{W9XW8Y+eeB+HCbN?AE$^@c-L-BkvYyj}_ z_)QoDVFoW%F?e@-84_?X#RY~AeQd*^fwq32mW)Ifgut^>_@U1=ySmsoC*W>mDeNe` zSAjNmIXRNeLdSR#vX8E^&RQjQ@#X+2G1hhKeiZ9`(}W~Cv@f}@Ce#0(^6RbKaL`$9 zQiym{SuOa6>e|#3Ncm3>Ph&pvIMsC%WC0K*SDLK$d2`;dFAU%V2b1IBXe1H%BR|%d zrg~GeD5x7V!O10h+OT;8r#$mMt_k&JAaIl}xHAWf34}9$rSa&rUxdp)&1Ry^| zV|G?nN0LAQsR%+FMF9TwKD2liMlLR%i_=o5i;HIq1(}%!Rp^qkJnu}R2JiO>i9+S{TaWrBL^W|7^v0&u@({& zEvmP*m3EbZF!D2zJG`gYYr*=pV#xd270JCC)Eb+p&F9i~kzt(L2d{B#K%+dv0N6Z;CAZV;B!sPf@ygQYqN%+#idl;lhBHsGD=cE81ow2BG= zgtpFe2RU-h0j{ECU^kjHT3(mrxOAb_6;NkRk063=g=yLNQ63PF3>-&cFd z(x^tS9Ctd0rV(D6pQd!4RiWWxfe3O)22_;D>|Bu19fCHrSzaHYsGI*s!40LVZQI zf#)Jxci^#19)X!XyZbAe2ah;OU_x%z3CE10-VUmYZB7wJ#>{D#7ALB4JFhW1px3vu z0Q@X(>e_k0q%YATi0KHe3R~fwyr+65c@`}Yo z+!HI>oHul@<666^D+^g9*{N5gS}>2)ASa_n+wP>L!%2V(&9Iv$OU+l=o{c;pM4vza z<2O%xv}cmphtoo|Tjr&Yx}3Qbmyd`*z$svsDbr6x0LZrrthX}dRf~QJGCv;XPCQmV zH*k0jC37|cv3889M5r?nPBxdebyk-w7;L&y-Bhm+I0lIuk5-T-($>TbnO&`M;h zaPyoN)dwjr!SG_G?g&T{ryTX1XP)Cl+027Pj4>|$EW4F_QQyMTwIlo$({Uce%h>oC z6bq+cYBWK@s33ozD15Y8e6j=9r49sX?;DWVzQambI(|m` zA9k>J114rdB+i49&;_2mm%N#Y?#dm4WV%=DfO`i#qMcD!=g=Tiyq?dw zfrW&=Hwq5sO0{yNsaQZ;@leE0J!qE|l@8zD&}eR=MB zKvo=MGTfCq(*aGzU3+xUggS-=_M$V`vEdw<+(>yWHyk3K15L%}#hsw3I8}uQyf7tg z_j1nIf`*vVJE5J=YKErbpz_)_7`Md&WSci%`z(nHaw~FCEXZHnif=3B_gE<5B>`jZ zRw*g0L~^4JrStN!N;0$+pRjB4%qrbU)M{m(B0HB-8tN#hKenQxHC;Z|9dDEwt{-KR zM-weJq+5>;*H0mByAY?bI+A_z^m!&Z-^><&YSk<%En?T|h#t{Z@_p zHZypYh{qosrbSo=*DyKpwyp=_#@Z(-bOjYbCF=0XPLBo)cjF~%R*E4{B)|l;1fA#M z7M8dCZh2TQTC=zQXm#H^+cF0|&yzh}#Jc3?0dnY)HR~lzo(IrzB^rH+>?x!nspt(U zNv_j#UGG`wH^OM%bJ3cW<<*`C3_*FBDyADPNgj}!9!WHh@e0^jPYq2PF_lK?sj8sVkG2F|wmuQ*Vpsd&NW6?3WASvR;q-?aNGldqYm*DiD0q z_*FfUZwu1k+5ppO{y(v1RmO~HjFQmNM>fb({#0>_{PYaMwAH8b_ww*B*#E$qz3oTF z$d_s3Qoi6+M(5r>9Z2#M<=nrp>%P#) zvNii)q<6!}(+$@xH})K6Yz$ms;&ax`2d=1(GW1~(=e@juMENR)8{zJQ;IrJ-_=fg? 
zB)6akilh!|E1cNJtQkRScCCM&FkVJnr;X+0`1H=}2mTFH{vSWn z#rr&oK5(ym*v=yQ_(!^871OE@X-}Z<_MkhOB<;Ls*!V=n5_(TWL zA9--xc|JvEBIRYLf6SNh7h=4#@BI^xPZU>8G<>F81SP8UAr0atlVnC7iu!LRLsmK)dJ=)n6{N1rzlnVFC~f0}V9>;|n29v6#F8%B@v8CLhu0~6 z1(f9;m-sxMeZVvV6W#D|QmDm;2TT$1)Yhb7)qRiH^N&@gAbAzWMGqxTSWI$215`caheNM+J9ALh$s zZ1f6*mgkZOC&jP9geT1P_TRJUyDz~))+8hKN4EwZnQ%3Hnm;np{`;i(sApl&=-Tr> z!IjSnINJiAZuaI(o}AfMMM`Q}zb(S(&a}g5BDk%??fEVNzp1G01+?c8=TjB%&*QFb zSIpf`Wj~KTKe7Gbc4XcJDDIUop1P7eoajHL7`6St{&7{uEnN2P+r~$xlJkSqw@uZ| zAhd4>W#|m5o*plI|0rB+%evJQS>nmrU#F@@kamH=`QqKG&z=NcgTA+5x0V+rt{_yBU80x< z)C{t$XYZml`&w3e`#YBBz~UVFbrKvN|KW(c1Nn=k^dFCQF3yqn%a^-m_&;E}CBtQ; zo=CVX&5^~WEoTznfedc2*!ZXb4s~wV;vD&4mP^cw_oK{*R`M0vOLOG$RL`o>_oO6^ zKDsj=X%8p}r7q2pqj4(uS8sz7hXwZB^h3cf&8yB^?@hZBA)s) zHaR3}1R+X+a1uUTC7|I16(usxq*5BB2X)PW-BKb*rBAxv5~cS8hxtqDr6S%6g5d-a zJ+2~eDiwq)m{e)4(v$e(jqwQV)3#K3les!pSjAN6iuKdRYZNT3OC+W?dQTxlgA$Eq z_Q0;eOsZZ=QYJO~VBS?o1MnR?D3TNt>79QzuBf*hWV&93roZdMl#F-JwP?nijc_sc z#={?=kSkAi5vN-A%)KKBy#I1|n4L`5#52ns-p9O}o4U;4TaikozN1dvke#0G?g=9X z87r>~_o5;cz}ME+C7LTD^aL9el%fr{&&k~kr>!WlzCu^*L%3R>rL1}<9y**@2$Mat z!#fD+eBsXCO&05R;w%M?&(F1=Yj5wcv*->X7^qh6c(u0!VH{I%lTXxNgwp%)BgpOA zHyf(9+sZiAa~>v~OqZX4dsE&y$iK6-eisrINhx{fN@Pxane#D@^5c1bcB64m0BYm& zD4UrmuacN2ALqo0w=Jd+_uJZS$&t7cddmxB%+bB2goX1Vbb-(0)x&cN4=(YfcX>P{ zt2Dwy=e<1mQmeP4uzacUn|gIWOdlzs*T%;kE15Q1f%D(Kx8TfM=b1fwSB%%ZK?su* z*V6Vy8_sti_9Yp2xCPB$+BcuP=F{Vk5mEt3Uh}slrV`im-9I9~P-eXlyDqN$fU!?b zFS`nfGr);a$CdA0qRR9VO+HZsaasbY3@)%%xtBfvxu=KRvmPZp%JlMq8%8XWTqDnf)x$I7Hu9UGuEceyvw#gplhm~joNb0Z8xl;2kM zu{$+aN<`zd3~~CxH134P&8g{;Tt1^_Y zxBW(4@SN(kW5m*`&AF+$Y(=?%cORK$s9MQfpD5eH7qlW7ojo98zpuB;;-$n`k0`-|C!*LTkjD;tLE_Vp=9r*Sa_ z<_qs${&wZ}$<6@#4{x8}JhO*u_4V_@A9K^wTx$@9Ga=AXQZWPuR-=`W$Zs}JUAz2~SL@`CBFh(&|y{1($P5a!uV!G~tiPB-iyD>^fOkcJt zWmqoEE3s^FrplQPQnAWeP8w~>*=}YZm2=iQo2uk``NgW_`R-^_Il3{9`%xu-bE>Io zL11C5YGH6qn`+Unb01ZYg$4IH!K{c-(?|RcWf3M)irpIE|X>nsXYpr_L>C)YT7|X`X7n z8>e}??d3VmGo1?ynw)N&xmJCz)IP0-evNjm#;azZw3@Cvn`<}U_S>i3GPa{#`|N|b zPui`IQq6VRo)+%YIrpNbU8nu`bDwlNW(UkycD}#6Z{_)qFWXmM__FY6WfzRsg4B(Y zjwfBjYIcx%_{~3)E(y6<==O@OkJr5{8PcJ9MRwn3-9Cjh3%!1oqIkUl^;&L+-k^5- zXT2faK@0t>hWFz2ubIB;&>yz^^jUwzhS$>Ix`T9r!3`(PPJ^3n=3fkMt#z?9yzR9< z!SIf6NT=a%8~1%N9NnB|X*3pClwfo>xVF>i-mdm9M)$)8EsY;U-AgbYkA2l?{4n9u z7vl*!ua(K8B;ye{?%kM)5Xg4Y3};{rq2pO&YM0zzVEB)RB4)(*^3iJ z`^{cf*Pb_fb*lZV+3Wg2EA!u*@9j5#)As7T`E=)}ujbGbg0;nLuk-f_U*16B(!YA;xQ`n{cN^?7#C+WO1; zd)x!oUq8OOV9ovViE9lYh+QyY4v4lOz(uOMPWZ!ZQS$$T(ClW6y z2-hb&S||!{WNKPxbCl6mP8R)ef&u^Z%?U(L=nfn^m8_%|c1) zX4PfGhWf+1>!5|QqDXb0gI+`C zk$7cgCG~zM_lB&z4rNvS>VCKIhV0_c%0xT$fwfr;Ih7VF8XnaHUJVVor{h($xB=>e zzBd~3S~^s8BB}>BzHK;q{<8`xQGIB$aAW>u3st@B>Y+fr#)50{ss`oiSA*Rf3-5HO z8Z}m5-4)(gH2zuDq(}W)SXSe)XBKK^H>fwa9 zjU`_{tC0~JBXr>=HrkSCD^fF(q}Nm`kU+Fo(zu@D-c%;uNp#e&xqdjjsa)<0kz%KD zBQvY1Le)~;*`wx0ZbQ=vodk8)0F9dkH<~JqI@MQ4)Z9G&w&|qR7je5&Czb*7S2IStt}$8qrG}9^+yu4 zwkm0k^}Dw;`~L$aB%ouz4jwW=f8!pY7m!%ooE(Pjw2%cjcWB9 zRk!>51a0!YvPtsFX6c;)8~Os|Lbq+|-=-M0oi-4x8nq)JX4kf%UD|Pbg0Ai%T@BMu z2;V`E-aQ=suk}q97;zI^b(kLWmjLIG`}Y9H``-hcO=H=M7XtpH0H>`i=f7@n`fKz4 z!wjeB?-|a^EK673rSE@gmn2A~W&7^TxvC@JYPi8dQB zeLZ!GPj%rU*zSuCa!_kiDR>fhX@Jkj(H#M`Hp*#2JdbNM&zu(*#e(jk3McrLB-6BP z!|#4?ZItlqS%eL2sI_U(dJMHTae9YTcCkxS6(X9lcveJH%?4%OSZQH2LPLdiX`k}! 
zNLGnhY;6vD?(BnFo1!bHnycW4#QaT`TAQI>nZ?#d>d_tH6kh#4#X%Vci*Z?^Y_YXz zTx@O1hL&3!s;cJM>%ppRQo7;V6+`uPTJUbYLkcOOORdfJLke}I93GitORbHh0-FGz zC9p-CyP)^l)UDH5wxH83_g9#vrlr>A{`c0#@2A#gul90lbA*PFD&S+{pw`9;KHk8G zr71QK=`x-+&yy}HHZQg|I(>Qvr>$|79ne&;txJumgK97B7iVc zOGBw~=P##s^S{q}y`p~!5 zCX2vU_@%X}OVSd4^rN-e_ET%KE)8mJ_+8#YtYFYjYY0_uV{w#L&=WxwXMitMq}Zf3`ODWZNTsxxANuYHbYRD@Jdi zilA{ZaNBh5(dq;x2}!NR*5*!`pc3)e_$Wnn+s4qnZ3`O=qWWD-@2rkKQ}$Fl?1gdB z*l9*{&&4qcjP3BfwP{;!ZEBZW8>v!iz6`zn{1}%4G=Fbxy8V#h$fh0!xB)-F=yNCB zePW|6*OKhpv>C~x_#6{&%IcckoqtHH(va(6a_kFDTq;mpeQNH{)+Qf&vh<<=VC!J7 zW_Bp`Ln?~aC~fllHUuZwK=^xWa}8>3+#PT+0B?Y7kobWzH4%fmP-}Bq{99`yc1tl< z9be_wlbEVdPuqG|1h4En+aaWtkpgV#HgbYaKtruf6G2|;pfc}LYjf&bYttxR{7Y-2 z%xfnxz#Co&)Be`lXt8BXuAm5KuYGTA)=6_m7|r^lC!T68w>CbDt&IUfb4YQWI{VoB ztnGK2&CZ(F(xd@aMh*O##1%Ann9yCiXZ&5wP{{#ZL$toSCetet<70eN-)~~{;m+QSI%Ob zu&ApEr_;~qqm_mTJl0){t<43$QK+?PfLa?k?^0{?B2nz8*2e#PYm=(K#KGfPQz#c>j`@VF-R#}V@ig-4v_zCo31_ygYvj|BV_O@a^ORY_| z#SFOJ32JTpzqdAc5ueRaYcs`@hEJ-VyF0R`ec`q<))1u4K&=f0mG)EgECB3`RCRq zD3c+HVPxFp-bofz^v64ETzZUH>mV0umT2*7Yx9y%QJ2?dv9^j5wW)C6DssNH zHrdOq&2RtO+GHU^q1Gl45&FHgDP|$E%FQ`YYm>9s+W510ib(JmBILL4Z2E7;$}6jF zSZ-~merjz_!ryH-Q{rHoe{OAz;T*P;BgpU%wKF0xH?)A1R+%OMp#^?xZH&6$5tFEd z3HT5kp~|DuwA|WE!i$$$o6M!wrWsF+{odLrbvdtw^J=gVUNK3rKeaZFcr1x#+O$2| zbYhQFYw{H+rsW188p|xV>LHr+dBwLObGAfRHfm=6y|roh*4mu<-r6vhTN`)Y<<=%* zxwU~KntGBi=gYu(*0z@?Blm6<$i=75C&I6%BTAETy1f5UYg7DBt_WZ>z-{P4<-6x?cXDi?@%XppI`(I| z#vY9^&E6bpb)1fV;4b7Hyf=u0(EvhCyO5bIM2CJ--d0Rz=%Eyr4F``k0&+465Sazo zrWu6e1jd?U7*)WEU}aVlu$Agqk`u0ib#!k(_GDWjsqHAHR}#zAyrid-K+DSGV4fn( z(t<5BIqrO3ysp%AYg$2%@3A~K{H!91Em=4-pOyI@aSNO6LCCC=%#IDs-atOC#W}7k zi5BEWOFlQmR}xSkpJBaesNl*?m$qP3)i5H7=+^Rlbl}P(MEtcBpS&^cYaauW-ya)! zapXoyj$U=)m}K_*tQ1a4e4HpD5p z-J9g8V8KIK7JsO7J&)c~?zN!Gdf)oz|pIRRD+;IdOD za5W?Q_R?|r5@oxnD4!%8%$yTS;AqfrJT5CQMB-V|=9Oezk`l%nL}BNJo4VFLS~y~8 zR(n9Iid1kq?HopHg69Nv<^E7}(hK-m4O}r;c%loQL_4E5rg?x{uwu+vD1{w~*X6l! zuAXczEW7@6b0lsOnH|cm1s}r$foyv(lsAwa2tD!>uv9q0nsh?I&+K#RiUx8Pzs=D~ zsiv(i!l(seF*_&K?~GnSi@q;zcM-0lt3~G()*2|@aBoE&M zpN=_{YO*vdJFGRo9dqyjPCIdDo8=y57fT88j|vfY;GetTThcm?)Zz{mV1Ey(k$y^@ z@)6y4PiU1hwr>G^7NSn~oh#p|<5MM1?rDlk zkr+C)q2w&j(C4v%m%XR}+ECv8v+gr4wg+*?n0UM$t}hFoH&O&8V8wSkI}^=M53PJ( z1Rc-7eNrmvsug)WC?vY0G!raEZbrFM2a%;Xtx(MzM%jo4uj-DWGb8Fv;J^T%t%_OE zSH6*9K3_`_^J4PV+paCv1XK;jLVrX(DhvPM^VMd(xVAo|6G6eM@!FKu6`5_O`ZJCy z$3#Bf6Y}Mz>Xn+|W4nft-ALEg{14%kXB)SM6=Eu9ln?lBnu6a7*1%g_h1-2V?9ssU zXTezpw^q2d>=#Lw-N@UGxRI5rwe2>+l(aon%tWUP_3RkWp(K4BlHaQijZ<0cb8^Ml z1A}I=ftT9x4WSo4Z^2Uz;twR;E*{+D&BCtI5|TRmo4D=;ozg3cHm90tr(~jkV{9BV ze6(1ig@z5v&vLx+^&+p7-)tGHcpx>mLD*{j-I^~_`WuC-&fVq6-fP&HbHwsq%i;9d z_BGL6bc@Ue7g5+Hml*qxpHZLesw~XG(p%YlhXF!{;4(atjq>|L^W%C zp=td5jd9r9@%QT=o=VG^bI)&-#z|xHY+DB``s?l<^&O8t4GGZkLa0? 
zr2MmG1Rjk)dL*xttzad3?C>MB-(!Q=$NOy_Ysu9ibRJK@RCKMfb>u`lPdzq1@reFh3$9%~YXWO?9M3Vaphd6T7b>>bpg@_&<(1y(0GOqkVGEuU~&I z^7Xkz{gW80=aJm6Ph^iw8LyrK-^eglw;eexGJN%kyVZ**p%+=Fr!scSrcCgpgw`5c zojWv}b&NJy;V)5nXJ1K; zzq;_|aA0N$V;&IoX4ASohZ22pk`CxwKH;2`i9W@R4E2(OuqiJg^vIQz-p5@szHfFW$A z&ie0vB5R76-<>VXm289VGaFraT5bC~c6i+Q;hdHAvs>lAn^cDwaOZLcvC)L0Tj6i1 z3Z7bf-ichp-kf_^DCPf5;Qe}?_da`CkFnpa+mk)h`F@M_2g9-Vr{2FO2oP-7e27(l z-+lF+8+7R6>IaeHchyuhrX-u+DJAmZ0E7Dc3|Q3!^!gY zSGCrNQ``lkF-?77CSP8(MP#leBYXF>rg_Nw3-a$Bsh6S+VJ~<;m^Zo za;n2WwyzY$MOEL-$nNKEufEmz@t*wGT_qaho8CP>@HM7w;c3QKy^^%w_K0e`e|o;> z&E4UzF4gZ}uYCD-5BIZm_JSb38Y!UcGcsYr#ER)`^6u|v9~;qJ-N z>rYm(6daBy``(yz%2D?^y>;xy)4?N3js}0y|E3(z(&C?58}VH1&A-t9)&m>v-hQ?2 zOvP7lk^c8>3cfA8v*z~i>s!N~UOTb=x7|YpA@AQjsGdSAA3T}I!+>~wxAcL6p zVFBdKYnpl3%Xl(QO@J4tI9UOMy*T`kFH!AxykagNfMd5l+wa@7W10T9(tP#i#r3k9 z33i0tYHN&9pBss_=lE^G{dd&^Os_pVqK5(=iQW*bMi{5SfUCD6ICaxQ*`c;L&888H zcXGEvtHsddOKkIFgv1ulsnExKY;@Tf_NOxGpzMisyVt3j7#jyR0(FroVD^su+EVrf z9+PX#4q_6x=)1F*2|_q(rsh`Hry?Xsie)=ijle}1H+&wa_8_G=UCeOTz$}u70+K^M zQt<@$^`O>U%BQaUu5)xU3B1Jr$w1>GFG4JmYRem?VaK%=;os3kz$peI0A(n3@SxHo z>44s-eeVK#MXEN06@BpohbUdC)VB&))NjM~$LSq{MfH0ERz1p4*!{gcA zufDoC2mvi3&`ZW+Oz1oZU@~EBD~dNn(^;%&5H9r@0I^bRgIsAcz=r$ulasyHZO(mv zkOy5uOY+npl)?ct*wwlzQP!JnE+9dJ0b~L(2PWqc`KTB$e78ASDQ|wWy<|(Psu@7> zV6Q;W%#OOlC`w|cChT==vEO^vupNPzN3MWWCjyj?yfi5wBLxCHGCA7s@6hGu$pi|f z?hiVZ+D2p0vxXw*(#C6V_j5fEbmd3ouvaj^%+HFRN&OdJU_la#AGgimF@Fdx=5l@C(QHlIIM)y?@sd>kTU6xim^|(jLI4DGw#~EZY zNTD<~zxYH>Z>}+dedwaUTVN7gK=|`^MiWX4J$<o3p3L$SC(y6-6Z(p|LU z;k}=s*5)WzrEl!$hV|;GvNl2~>hUEcEPi#MjvpSwhhba+X%T2&Ss_pridI%K>?uid z(9aWbuuBvkc9%;w8v#7E2>Z}j7MXdlPLpraYh+6n>b6&kI3M(hA?tcR=J8SK$&u~% z(vFge)U(-#`xz1jZ1kF$F8+yNf@Tc+%j@UoKR!4iW}3W3?3BdO1=H)He+UWNs1oRm z$O!gUSR~{9SulVKmn{!mH>R`K_K7W);YP3 z@NRZ%Hd|7}12F=jJnmrkQN19t(mWIDZLYHT zL3@G*2Xt%m;5|%Qr(_!j@ZuT#{6XTfab2p7SR@?usItB6yZX9=Bdx?WiT?q1O>E_3 ze%*FB9LE`mIud>~D^XGB0Rv^%>RGApmZGz+hPbQx)!}S78NR#85WusMxgsk7y^ljd z1^Re_WN~E92_B4n2w>K0zh|`}-eY4F;bCF1`gs(9Or{aK((a&9wnIAZ`~ZBEY+?*k z2PC@m_Hx8qfe4=3Vi)INmcm<6_uE0SI-;HGU1A;E=>#}=qyJ%V9}hZ?8^Pa&HPuT( zvZ6im&Sh0b)YCPTZE+s1lVZ^`m$td>yPGm9+PGh_8(>Vh=Iz=+sHEC|-3`Z%2_r?~ zh5M8@L}pX_bQKhX#O;hW-}BsQ+gc6UtKhca>`i_nq=LI_<6ZES`PB;D+7y+hPiQ_- zT3hW#AD6Y#xxTG8L^Ej@6;&@HeLu7Fo}XB$LQ*`{z2xPZf3nTB$H6Px`sgxsdIaY`ah7BXG8EEMo zLa`3qyQgyUOVH|T3rUwQdtLa-MXo6q1Aw}TbvFucy>v)@!>HRw@LYYqNtD3xWI-8U zi71>x$A(;w0}e3l;c+-EUh?3aAhGMpIRTE-MynDLLJN>A3png#k0EG@Q&$Sm&G(L(t==aFb0WX0HYRPm$U6D>6Zxo}MHYT#H>fMFu zbTIU8I*bGe5yajOo+Psr{5?Q`0%Eje7*rswr!&2_GY#xa$w!$4ok_tXH97Dy(?lEA1a{F9RdK7D6lBO`l5J@TJX|+I_wEEn082ZFK z?385nu0ux@_gh4@Cjl#ygHfk3C>*+LD@TTc)SO}XO~A!XuBgkvq^A%{K@AiZ+=PYk z4P6zpvq6^wvzdTvj@W`jCpuakOCe}h0zkFWsV2l^>3q@qz?YumMxx2ao@cow0fvcF z`1P@Lw@j;qRDNYb{w9%p-9lqiy-S`kfc*x0hpl!YlUcW5&!O2ewFOs|Us}pxzl@&MTE{CeaHIWdvGu z`0TTy&}tASo&CBRj7yG(?{QWo!*{25zOzNxkl+R}NkZf#zkEdGBty9hTP9zRqeLVzqp(T7tZk)X9 zuTAMxD~uFx$~`z8gA98C`>l?EFag(etw<88&bfbe@~)I9{@S?31)oV}2bMxYoRwte zsV9>b)lvoKM6g~#YSXQ&FDIp5zqZ?N-ioKyaDQi6`I}n|l9O~ahCSf_4}15))YRVa zdw-=t5|Ca3DuiAGp@^WA(2I!lstAY(iU<}2L@ry%SZI~<(J z+^wYJS9?#TA=z-}6-K_r@Z>eAQ%1pTaQ19ZlYh~mQd&-6hDi3MulQU?(V9-W#28?e#NJI>du zwm?FN$Pkz%7lu`q=-U89O((JfQ82trxC@Zai_|fzxxa%l;!9lBuOaqPG^;lqJ?dQP z=vJb$9XxV7z$7ILnAMYbQTJH;N@bC0D7u7;sFb@lTFLLR)X7XrV|sz}^j zQCc*Wo%iT)MzLk@!)2UMEZHBd8eV|T6VLHzR5$|mu zykD=47<9ARc0ntl=F_relzA~y7C1jdM&p5iC}8=j6@anZhVKGX3?3EyDrN}Vq%{*(G1~C2kNpW($1Gm z!1B-BjN;nJW9(63LdSJ<)sdFT_{aHZAEYILOuKYvt>7=wt0?ER`Ukznm6q8swO_vix-lK*m_%}=i5K`nov1#?%h;NRU;1024FV2?NV7SJMCl6fxq6UXTMwj9 zlarBZ)ccY36xz?@F`Z)gHf$5x>k?zpNEM 
zb4GNQ$iWFuF8q0oIn;RpCnr9ES_`ES(l?W{`|7w^#8lN0*3^x-*v6L*b9gA`@H24wzxx)+ODR!1-PAUTBoj{2N`fkyO}P+Y5@S$X#=)f`Q#)US>m>+V zWPr4$rNbOlfeVp}p(t7V&j%NVaztMpoZ^*xGB_fgsMb(lyxms zE}gOoxV zCEqQV2}eE(@Hy1K?n^qd*c1EZ?2$ODpsUV7GHOtL2275NBI!(Kb(k_HP`Kx6%>twH z_LEx6D5Ab|qMQpxR5JH8Og}veDFMF8`#7?Wex9G>2IeVT^9GkFGERwFNhrZ`)Q6SB zUEda%+{i_)izwB)BeslaZ7fBS5PkdG5!>ncSj^kco$cZmk=hrbhIq1&v$Z~bHhG$4 zM>}eimE}a37R5{}UX0}y<#P#XpJN!gq0N%e=M;EWih%PR}zx zuwU;YRp(@&!&bP1HRQTIrG15}<3TohaAMizI5qG%dC*6K`a$0lak}g{SztP5`uI-0 z;Jr@B{qHL!>u0SmQZ`W?6l8sNr9=2Vq9mO@iZLPl)Aw{a{q=nhZyY42^uo9Ho(Mj2 z6?}#FRwTyM%{?p1#)^7YAeg)K(r%#0@6R~>KGe=HJ$%atgFB2M;?Ylw@#*Z!%fn8Dz%ZW9Rh9%4b7!1o_lVkMKrfyb6*+m&y-(dXXx- z6Ed0862_*oXnA}84%6w#vbY>X7CK{>(c>Q-22PpzT$m1hlo6be=s6W;A>4iCSIZ`Af=Kx$A6Mjo?BJW6e*SR?RE>c%{_`Q~I z9ZILVN57@)JJrJsPw65ZhM(?xRjZ_Yx_=a@-Fl*=>qUR$X*Rj=!L09b^<>Rfw6Tvj zb7=JYTuAsx)@dg~NOAAvsB**$y_JWW(6fS6e5=+QBTh?@3A6^AE1{CsGkZVS4>f( ztW;MhE03b@9JZZ|7F6}0o)hb!^2peh_|A*pN)X<9^sSlA1)g)9dxJiY``Jj^ndj1Z zez!K;PAfzTP3XNT>v_h(v3L5A0D zi{sZ~6^E)1J~^8)PD0%_($G{IGCODzr;UEY)6s7HN^yrjE3cA2b9t`aDPivIL%A(E zjvb)28L3Aq-7XR~m3`i2e8iv!F7YCbzfkpcf05IpTs{Oc^UM|XeaGX4#73hcToz~d zd{))hw_NZ@?C3;abvnX(|rA*XI@_Ly|DvxhL3a7$Y33BC^X#POuq zI*F9;cHR-Qacca|;pSM-+BAOK0jgH3Ddz{&P z442c+8QN{VgF}%v-pN@L0yRyAN;P<=?i<8*1+nk<>OksYn8{|i-3$+kpkqJDm?Jhx z?jBO(gQwBP8p$?7qEF6xUr+N~v%6x8J)`24s^PcoT*_&N1jTri5AtXAt*CCO~M{4~iI8kB5=%hz1E zi=_9fo)Pr-X&>L~*o{RxdXi$w8sDP%$_Tzl3p5+_WgEF^j^h+)ZHgV<8&qPlNgGTO zZ0;o)i_AzQ9ou4tUrn_S{+V30kt?76vEu1%w=9mSE@`peJqIR8Qa>-m_h6 zq4yXEark@bsI6L?{y3P>mlF4TVL3sy&kKo#1f_|lGQoYk_O(YL$IRxC@2BRUJWD(h z_TpqsuR+F!7}`X8a+L_~c+Hc5*|*4iXVvz)%&|}|_MHA4`~DhQ@m+BVkL=nyD|8xM zcw%Hb^lc?u+Lzbg(H8OO++Oj{P1Uc)t6zZD=K7AHPp6;t)O<+r?lnBn-hm`pDsOol z^`!j>j%<2#N!)Mmp6wsrEebTTdE8xycJo0qRY`0yyGujs=0*IOBlxyKhcMm(yCsqQYzFao4sx8kB@)++>D*Gu9d^m1jr}NB<=EJ+NSGo`P$Pui zJkDt;QB5NVFSkY%CQ+2g<6jm$Iy6)lJWi@WIm3H}9JZ`k8!Iykmnx4@;{LZeGE#yT zmMJMF`Mur(JN}=w`DtyK{|Bs%4(^~`@zCSS+M2S{IRAYL&z|swiL z@g&}oF@#!u8%G45ETK4#va4?wXvW8=m5gsVR)14u1y40poZx1r)^|wjs>IoqOz>CL zcPd4wB=-wJR6~&D}kFaw`=UX(5#-XRywou zSmUG26?M8H?zK&7V}HKxmLj{-*A7*UkIN#ql{#iWJ&TP`npd_k zqH%9Md7B1r>1tG_mcHGu*7UR|LZkW`?wy}q)6hV(Mr~E;yTiwtn9oL5H0p2SW`k0j zhNpD5Ha;qyJyF&4d^Te1^%uDJp^utImYPAbT>Ad>V$J0xg=i)XL@)k6nKu!qn_CR9Z+*y*??er*+q^Y~g&>^_NPK zTHX7U7B4-zKBdv3)pM+DF=O%iD_y2mZ?w{fY~CBwCVJcYQp-N%soj{dirm(JP3dEy z-Hq3dE!zgF%08AJyYa@Ixoz;K(o#k0jki8}+lL;NE!9-rcoz`4efWjar-ny2W`kR{ zk1Upby0LiUeFSs+7)*J&jkkF&Mo)W!w|u!%t$F@zr1qqQ^5;8t%?lT9w`fnPm4Ci} zta&k$sXc9|{N-V4^M`yro!55dUmjOAe=LjCdAm>f>(fWgOSLUJv&YK6K3{D9)XW5_ zzVf$m-j?NCdb$g#<=vuxF5gsMnNMx` zHl?Tc zi%b38wz)k$>zbK#W-w{hw#}v5|4ZIBe|0r3HJo{DEbCY~?fi3Ea7FgTvFsC|ZCl9s zk(gS@3;cUx3bbvD#iyFfawf{+TFbBfk1JE_yzNta=4xoFBWFD{^|7n?x6o8~*^kgv zPsP6pO)U)G23u2Kp59yyP5o844Sk&&{4_oCZRR=Hnp*zGkFtp;HF-ZiTLd{=E7G1e?@)wb|%1Z|tunr$=s!?vk|w(a;I zwr%7W+t!QSlMC9mYIUAlm31UrR_TD;w!2~{T60VVPg=X~*gUJNO_dKi&ixL@updTB zpIEhRXY9jQZCml}(mDpu8GcW~nr$04-VE9{YvJ(`RnWHe8tm8{ou0xXbZynP)vVgK zYyIoCtyfy1>EX<(ZCk+Q_g5(5pm%I{VA^0Om# z@E6;560~g!8+JgA%SulEYTMkjE7ok=P4H$ukp$W{eZ@7~Mq9IO!@t_LnqO_(l!-BSrzHZyl z@v5`4f7rI+AGYn7XuWFNGz3mggSO4!)o);{Iye3V+zP zjCI>aHQo%`HV4{2+O|jQw#|~n@3P*J()nTAy#BOpyVq@7BIbA7#udM6+Y z{?)cwWBy{>I-tC(woT=qZ5su&ZPaz!R{Fbb>;2QVA*KJdZSx0h8~a~unb6`TgCt73d?P z85O|05BFUG2W0d3n! z#!uU(^1E%T{EKZ1Wi4N`ZDIdx+vpx!J=bkp8`^Wtww=HtT<{73plt)t`83v4(6&XR z_w@d-ZPkIl*)}iGwk7|tZGOE!Y+LHOZ99%k4LlQ? 
zw(TfVdMlDQ^}lS}KL4<7iL@WKtrEXx+wQE|wvxZuw$xv2TVY@Z5lAhtMx?FVHZGK9 z@2YK!p|KVaP)&xYYYcP@Xxn=Kt!?W?RsYSl#reYwgs~0CqaW~|H`)I{%PB)J$~4>;-9uHj)C@Hvu!@V+P1mhZQH<4+vc@u+iHma zCEMmTCRMOz+v4WXC7^A~UbSrjpl!=rwQWJ5Z6p3^+vpx>iW<@}Y1OvX{>8QpArv$Vdwgs)(HlLri%?G^l6MnO8)xX;|+8?$p z6dlln&IfH98+zTgmHo7B*M7BaO@Fg(g-D-YZQB7BXwIr_D_ys3PthxbtG4ZA@2YJp z{hMv8{%_hgmw&Wv_t0k={)KI;q#-q0(aXVXo|k^HZIAwD+oFEAZCottw$1OSZA|m{l&H=uiCafTnc`_+qSTOZ`&Rty#KIm`D?aq->Pj(|BG!aW_29< z&9eZRy@Oa*&Q&IINO4|?Jstven&!n<8*`pn3S z^JfK9t!60hhwn&-C*`@9wQlX{esxJ{GE0{a^lx^}GbbO;q_MqejzL#oo_1AEGV+m; zG-O@ai)*Ja$}=cak+&3;x1CmRZur0~JzzB+T7H47&2JCka*7X}Ej^dRcgUSL4lk$X z@lF(cu!#6BferIK;N)1@_=MH$tRtJZ(cz;5uDO2orS}H4xTm*1JTO`|K6CheaQ6Fv zfco!NFLK53Wok0Wthr<2$sL&)h_X2@(Rm)dc|N`Q#o0Ojta)M3ww2EbKDf*|Ixns# z>ym_cIV;kjz`dd*a(DZJ5=i5cW_K{P5X8Va<;X?NtVPhqotT5Q^rXe%5n#e%+ujc* zk&D{z7ejdUY?7Hj>*X?`C3_cbfR}s+3dCe_)t~oi;BdME(8umsm`M!Ty zSovPf=sSmbf$O#K7QM|T*Tnz#}K{oOuSA^dExlu zxijxy#xj><-k`$YFKeJiQ?fo^-kX{>>Ux250xZT59*-$Wqs00ygGB~ z46SpwqL^_w(uqCIz8hS_W)WrKiZ7e){c77Pyh^ww_)5k9 zv~87n1xachY1h@fVq|tHotT|`T2&xN(4}zjqmOl@T`kt$S#c?C+Kz6MXO(!6Pwr0M z&cc98@0BN~xPI$BbuXQf#K1#_QqBMxO@G z?PS+~>4CY`)y~H-VOUHQ^mtTX+Hz!SIP_Qsw_3y5h}ScwU7~ST=a$FQKD;S1@pxdx zoGHxTsZwY7XzKa1Pu9;=j-46){!G(6KV3)pz{qgul#>Oqq$vH?-S$mEu#L^B-s9&C z{TdbD8u&CBcEkxOvqxW|6!X=kr#D{v_WAU;&mVVmt4{O;0q@iGlP|Nt=Lqn`dvSyT zp5t@+eF7yjtjN$}x%ls@{ed^T4~`qMNvRgnxi|Wn8%j2?U-1UjD`uR_+BtVW3uDSa zxq{;C2`P6Sa+|)fJXL(oP&{6}@|uuIU}-zj{L49WU}tnnQF7Yl1q;PJMMv)_=8^nj zB*2bTx2z*~vbgI}OdDDOU!{-O7*O$2T{^*fQb57PEIW?xpBo_n68_RA|w8oOMeTic=8}Y&B`v$im zJ^6*egjihkA(NSrsAEgj7ix|#eSRguR@>m?ARp-Ao{_6q2#MXvGwGGH+e$8tS>TOz zkxex>L=Y2QQohubZ;yo#L(bF+9iTm$N=n_*4|b$pku{n|_*u;0QrcH#uZ=sbTEra} z0~&J+9Fz)q6ZKXE*c8z=l-zRjKF}gAEerVFnZcU#YtlkGB#U=k{x;w3a5s!wk#DaQ zAD$-Lk!wmHG~i9`cZRYZF&}Dtb6$LJUdphO+@!0%ZUKy~XzTlL#|qlg$6lro@Wl5G zGsYVO-d?$KGv#g5rwGM{;)}s=uvV%+D5AZ+ek4c{r;X-P(OYBLE(wa93YU=3pt*rsfGl5e#~I&#^)z=G-A{wit9<5_Zv*MhAlc) zN$xS{A&^o%S|L(TTcOjhs`Ufd0G+W55T^vUq6OG+rKjBX`nUoc zPkwz(!uDMTFs+nH&m#Xxc{2zRqX_}Z9~}^NtRO-=h@p*(lH(j9t!K;HPu`P^<l}yl%m9FrDLWWZVKX2NI zypDz*i}J{n6_Vw~oAcP^s+Sr$cdoL<@)&?$Qf0q6s1_X*vH7xhgw54I>no;p5M?AY z+@eF))uQ$Atfg%!bEDBfikh*wxomH@-fT~4;K!ofy8;I`5XvPLG~d{raInumiWGPw z*2hmzbsb#kF3i8t|498n3d(Fuax=H`|;?N$WjD^i*x@W?0M_eo*=hMz4x2+W@z<*}%=xJYbmkZOy9nnCru zEBV-;+%GeC?0x2-Xj{GIo%rEhJ8lgMqXhKQqxcWDiMh&(wegA!`fmU~#@{L=#cM7| zN}^eSt}z}155NubnPiF;%<(?eaeG@bal`H$o*jYh@*Hvx0n*>c*~oh9YXTH5j5zptfc<&gi5}($x>!n_sdCnBNzTI%Mdkw+ zn9RcIKo^^#5vNtbNb@r2DVL->lkHE7qhhZZ@%BrYOR>M=wZ)&5631rm*ycI!aNJZT%H%Y}4mo2mdc+8#Cq{Q^@NFC>31qI_vwsZD5+#^DyVJLs92Pt^cFkV!9 zP|Gyw%fnJ#X$o(KIbCzYtim|k9Y^9&rcPz+0Kf)5yEiGSUmkB!gNb-d2t)^N;|xfq zL~bqM+Z%XlG+Lmv=1>ur&|dSra0(1Y_J|hDhuwzlJs$q`yVmgoUHdk@-K2{*Sh(`U zdO$R*aQA8V57%TYo@pHsu$R-M?0n_=W)K;i-1xM;X)T)JR>AzFZ0DRlBogvJ<3LCc(TaBjS+pI5gnHlb+ z!xhcoKznY57JyYpIn+mixvH8g?b({`LuwQuGe|2wUeV*ieFh~n%E00SQcm3{6^Y<- z1`xhPemq4~Rd^e%?$ARxPYA%81WXsVD& z0faSiVWyrFc?;!nb+=?Je>!k}zWw$WfHegCplF&S4bAIs5z+uuI&oMGf@-~rO#)8n zLs`uL_9U&eRw#-f%hd`+5DTmajS+pK%l`Osk95W;4W{c+q(uc|$@t374#My910qASM%3jt_%KI#K@y<`Ds z8>CCg+>E%h3CpILeI}d;t(Q1(J?Eo-m;4m~!v|p4iN=G(O`C`sSk%i#13RSQPG<;2 z6DaIAPooOSx@JKM`oO|n1V-?K9N6M(Gwu>cbhYWYQX&;&)BR~|q~q zHUXhweqL#s#58wCKA+sF5P06uBh};uRf`IL0Q5YrZan|CD|?uveKR`UTe5QvL&`S+ zwp?Ubuel?;1?HOm2~8j11}xvC6{7FD4110iM2LfV7R z&^`hcR0qd25az!XP1sRwV3shG#Q}D-_J9_W*r^~TTPdcx5lA=)jw>MIlOgzN!V#65 zGOjsNb7Jc7E*(ObEjuBVAg8*UqB?ECgVo@n6X3`aR)QIvev>2oA(@?`$kv3Vw6_Bt2?!;=QDz~!jX6dHwz-~4D$0=5xD zR;uSXOC_m2WsmY8O@{Y`$6Hx`TPKkaTa21se@@RZP)7#ui^R@%v^#YW(&$%M!lFK_ zSSZ+GBnNZ&yK_d+hEshH972GVZ@WM<3&*0MHI;aySae|T^otw^{Cxu=RMr4sIan1x 
zB%_f~2MNi^S1ChUr+cd3Y-Kkj9JX=3Y8K>;;yQkP$5?5dJZgJ=FDN2KJ8yGrvM{%< zfwh{YrRUkY^<^w~^~aHI#;LMWRCNqo!VH#VO>v{cl-@z#6A)(Bk2-o`KISlW&cLMVowLqAQ`)rZ&&7Vd)#Du zDv`wEoCU9kU)ywx6$#+$vF0QIiOs_s-QGM=n=1S`bzv9R_Divi4;-8yr}{pwo|Gh$ zNs!*sm00m~3Lvf?%R2$p$c^UEbbTvd3xp6Cuj6*z&tY{Q`fOqOkQ)MDwlgq{&DQQ? zH84ww(=5xxxKFJe@W7CZiskTlShz^FV-wL-3jCVsu zhw__m*@GnQK}PJfgG)HuQO~(iqe)ww$f zj17!KDhL^Z*@_$DtG)XLiCX8*-t#%V0XoDTUEgnNbv*>Le6jy*f z=Kipz%AS0K=PYS!Skk%@TK_Qa#$XTJOFQ(M zJrUKFBM2RZ4gwzP0JGx{Eh!t#0jC{!baz!t5@9>L#93tlzD6L0k-n-WxhiE`iSClI zI&?zJZbZD_nwCuRNC_FDXSiLz;hZO!r9hdtRFD7--sH_QQ@)+9Xg5nlGdIn6<`PP~2~M4^?G=5t%`S24wa&uC99!44zOzR1|)h zV=2~}P)OM5JAD&cpbdodIm6P4Y&RrW30-C*0%UuDTC2>t?E#08aN_-(eepynzS@co zz|zH#&Ta(=U@;G-#}D6LPh$~|cEBS(Y7sfc2rjRE;l7u7GG-jCqY;>c>_yFKjC3~Z zO-~$3SmFdXXabkDhg^!wQZyf^%07)6Xg4+kAakRUD>}Zq^~zk7D!NaM*)@jmd0VYV z6-ftr2$XyKsG?)99({au!v|&OOg0$Y>ud?{_Ce)^HkfjezpGem!d}1c0>6EV9QXBt zns%%;cJIbxhcEV$jhub5oXJ40RvMmSw0x79Hcd7nzB}S)!PAPdYlUr@jt*`Fe+xt_ zcXy0RZ^jcHC6^8U)qA@dioyf}V)P5vt2{r!-gA7>A&1SdIrqhy>15C;xx z`UcSHn=<0(Qs`qQa>=iJp=}xxuw8R9-bdJ9p%j&Z__Fvmi1I2}Llp1{FY$K*Q(oCE z2XgD_^6U!QJR0Qg5~T95@6|GC7gs^FG0e1=-GMm2kyA#33nlU@u_xlF+)1wTkc6u< zP>OZq#@Pht)_@>+$`$~s+?$|@KgX03ELaIik&{XMYr!5W$n4unVgNl>|ZUMD0Bj50?wQLGqHRn7DGSG{ zZUQXYP}7wjEz}jox)ILY*b8%NK{!rB1@}@l$NDG?5*feS70L4@eKGvOrImEZ7Uri6 z>d~~lpD)bCaWjsFKOKviUb>%-KpECMk8O1egfvtkFw0Ix%KUa1YQy4L*6r|xW`rFB zktCXuoVB!~iB_?OU!{i?w}h2E2rC^8D|;VS&J1I4o~jT%Rrv{SQ6F8o>r~C&Q?*A< z)kU7FKXr`XQm+c;L7dg|-iJ>uOpI#E+`}8&K8IX+xWDGr8M3*ci`)$05;w&33 zpm|;?5Q15TxDxopT+#}d$0w%4A27pvIZr92#`jOu0{#~aZ?>#+mbS_h&Oc+f=HA=eZ*T%l;Bvz9B1UbDD$-wWI-?TLqWvE^2&$EmA6k)m(E3g&Wij} z!2FoS{Io0T%P8~B186cU)$I@4hWu!m;$?9+*RA5U*&VOfj-~1*jp>)Yy?IhK>#R@A z%i!jiCAc#F9)hcO<1=d$;{)YjX{p8iE!Bz8*h*O1F9sy}^rW>aBZ74YuM zmbZf{`ug8(J{=t3cH1wrpK@`lbA?*&h{h>F&4u!yGlE+=KAv2d)<|S;l-O^dVxj~W z_RBrkG2N}?sAeNM$2#SfdTUf(&ySp?uZBk%8}8Wlqa$^B30b@4;1NYjYlTR zAYK~F3$J&ix?C02;L~>w!&j(WIBp>%jR-q1;kbMt zh$h*cq)sEC5_lj>0 z^kat}%8O;H$r?P_6EPdNx4b2X|1G2Zxl_^=1Dt}rqK)KQM@k_%@R-esO(o4U^;c^h zgH9bjK9;#fsbN5V-oJ6RBUS!1;Oo^ATHM>BK*eO(<0 z?uIKU>UDbvNr4?MTgg z*wDw_X9+h<{VL-0DvjSKkva$a#S=8-Bn}SLrMY$l%I^Cx!8h89((Id;RtwapCFLMD zO1>l+tALwBG*-&;ynxFz^dKLdPoWiyFnlVZzwKekChz+SDJ2qPUTs@-X)Fob^!H^D zV()xY7eD7^c^6r$nWn_9Dpn{-m1TZ3FsRq;ON4*!BFGjB?mlaKA$200+y5gKpk z+=lM9Gs89SLpu(ea;MiD@_fT4=~q{A*Wk^!zrTr2JnrdI;_a#TLW>mI*DfGs(}#2R zq)4(aWPzv7`X*87+dB-d)s4~RxUQXXCVvo+z!i!Rg1Ytcv(bAnI~5PgaXI8x(WG~h z_?u1R@w9fdB@Ll<|4xeNbQC`W`%tn^YnM)LHpg4-eVYTfJ;1Z)a^%yE?GFpuDZH1-X%EBzY#VZS!pF z?Ov|O{#t+?CL)l{x&s_n(DE1NImMh1G_Ys^J+ z4+q9e1{!g*9M1Di7L4g;p;o<4CYiy#qVeipEQzupusXzbE8!7;M`Co43wij3bX~o< z4TY_JF7s>0acoJTzM#m>xDgF;_V5IXpG_zF*0u`*Ce~3K5(q3jVoR*ofNobGk5;mJ zFkg~p?1pX_8$a;)#-}aHVULTQnqww!I8xt-zrS3i-0>J&GWOc0?&(gWBGxP7`URR5 z@(xjx4&|uxGCG?t?|H+ts?Gsw45GRFOh|(hZ24^{X<<)Fu8CO=-)7|YPyqY~6pK7T zDJAVsbrQvUa;50J^J3i&bM5s?((270!jC)i=wFI-yYyzKyIhh|5}H2=cyIoiE2huO zgp*p`o)C7x3I8UcsIIq?(3j+92TsU^EzcnDT6)fq_0iDK{U=@|2nsMru~5$_xzbc3?uk!b(aMkcU?V8Q2&QsYudG(Tt7-022 z3{}ip^?eH?oLu7l=G0z}E2?bQGD>|8KT8@O@v0j}C>?Z&=QZ7wjU;Hcb6@77D9A#P zPQ6!m3LN)P@W0I*Y|ilCudG06-|cgc;QsN@!Zz5O4PM0a%?L@BQZZY8Y)G3V-hjW{e0y^2_zVtt~3 zJF)*V{MdjvqGT*;Wm$MSN+kmc}Rf10$-TNoFjarP=HUtrlIe{sL7UD1&*kCr@)j2CA$N>c=}@pFq1V!zuo( z(QdiKo?su+dG4MTb@p~7A3UUQPwJE4k#+&WiZ(IYp<0d?ZdjZQclSkYsJ#ai4~29# zut8zz!>nUt%;&SnN?H(E*%2$ITKCwL_*oI@y-?>oFmN~Htya!S1+%R4c&Sse*GB0r{s{337n{$U)S$cEyzK>kkDDBaTjuy_*@0s5o zi|f4iaBGzNd@?0uW6KEi^XJclUjj)@LOpeZjr%pP&PU&lkS*4vSiKHpwS&te=37EI zrmqm@t|%lBMS9IhV;P^aO(YVCZ0W4H>9cG=CsuPz>;vccVKB9E304`WbQs%q$q|z@ zf;-;UBxbak0kB*1k8F14=#S!V6`7eQ7EKZpkL#oBTR+uf+0jPmGLNfT9_;3#-W??N 
zsoS!xOd_PG)sZL|ym=Ja5ru9wP@E*9F*GAOL0>(SjS=^tqLn?jhgCf5Vd;Re`q*cT z5r%%Z?qdde`hoBeVPgOklWzqddAK9ZGgoo!>F&|a&AekM_s^Ghesst%qPoLUXfxUF z0fi!O8bsa(U3HokWmaTJ)Q-HtxxavPvNMplVUNlA>_dzlaWnAL=d1xJGLF%@(X(UY z`5avCpupqyjo&k4LU;_q&l%L%D3%QF+{AC?ea2w$K68W72FNw`*qo7GPt1xKn~MiC zL9DGJdqsT=#xaV#j&=V=yK*9yjeQ)+H_knakYb4w;PK=rF~Ot@2muJ}eSp21hvPMq zraFYe3PCuRTPwXEje)y4^#tLtPAoCW($atj@2L~Op(<=7XF)dT_QKVQ`wvbK{F9BO zC$G}l4Zci__O`zZG2ZxDTuFqi7%pj&DQS_Pd!I`OIqH_?PBEWA5XU&k+hnc zyh?Pi({V5v+!Nw&AozjeG@~Cfm}@1SXR9O?iNTOIpzo>42&HfLZAyOZi7uqyt$B z0y$)Y1PX#gWR6M~995DzrcrQASLV1$!Eq~@6OIKZ++~7&3W5VvT^zJ zxH8%JT6%o5Y{D&iLXT|X06lR;_Ush>?5ym$CHlD)*(8?2Bo4V`fx=`Frd*12VTzJm zszza|u3VZ)VVafPdB?)@?s6A=3NHl6T?{U~7$KJ)Q<#2M?$U+AOPO+)^9wJR$z7=} zywWUp^;Y549=VKx!i*8Q%&Ef6S-GsG!mJfJ8cPw4Lq1!eC|g86N4h9SN&cEf(KTK9 zT$7?)EBQRfqC9u`e4nEH0QrL8qJju{dQ1`htbE~xqQXr1qWq$wGWp`#qT*)xl3PV3 zJ@TakMWrM1Wm83Ev-0IjMdd5<43=UBheCxwafOINrF3znl0ub6ah0w@wMlWcl|qeU zagDn|txs`nfI?kxab1K$eN1uvS%roR#SNJXjrqlmWeQET#ZAo$*KaY4ulFe27%09m zqR>25+&rt$vQ*r%qR`4x(#oOOCQ#BQqS!87(ypX<(gml@2lF|!}vKP8alO|=8R!T1&%U-%GP5G2f1u&If1(&^wP@0Y@ zn?9>FbD?Y|Q|Wbn+3PZ;H??JNnw8$(Dtp_b^lqT+-H6icRN3sT()*>d_bW?|c z%0Fi+f5|WZQl|X1w)|_e^0!;%-+Gk450rl&QC^uUUzt^AE|oJ^lmS)-z=?+lG9aRO zs0;(DjE8MyfLRH+DFeO>k8olj_TpI%Fj$V@Sx+)pBk{;H4CFaH>LLS`g+~`K(B*ix zItE({p8Yn1{Q-W%6UK&7JjW{r$9p{ICk7`I&&687#i_zASY5#_s=^~v!K19gyS0K> zPleC4f^U}!zf%SOUKN1@6#_?81W#57Myg=WRAA1j2wkiY%2E+7s1Pn!*;rSxu|;Lm z?TSqgR5m}U*gUEt@~T4Qy^83k3Q?vCmbDVgsVXK|DJH5aE>kJ4tSYg!QbJEv(zH@? zm#UOgrPN+k=>wJ0M^t4_R?0-G%ATo|J*O&nu~IHeRlcB7zFbwIu2P{zRq=ME;saIO zlSc<=tZ@ov2k z^7o2q{pBLjm*mmwXGrll z*n_ije~pwL1|y{^>yc88crNQDAw#qp0gRNY<#HKjoz86Q#nq+tTPR@e_r>AGzxRQV z`yCW5*^oB*19F>w1L6p~bMY~Rs<}Y2Q9HoYyl2)7Xm%Uxxf|`q5NWF0v#Wi zVq=PCxqu`@golS?jTTJt3QY837(Avle5OVuLW#yUovFwV**yMYW zTRv={VNY}ME_2qbLGFS;6D1IGD{|5?S5_mXWq03K{eaw`4S3DKNU2Y&4gm$@2S`S8 zKOnc}YNV8cZki`z)*_|1qHUKqu&zf+x5%Z<2~Z9iD?v}jxK93n+!skWWRBf^Hm@*d zE*2~Omq@9k30|q~56Dga1#+7v2&fcu>^KNQZqtN6Bc*sRZ5yuDNa<4$a#v4mNZS#m zyb8JV37FML>4#Ow-NUvXDXr06i!TcYK}a0!-6&SvILx?rT# z@<*if<4?$~?*&&yRBwzX9{dq0Re-99ygqX z43|VGel=2hEMDyXdZhH^{B^$7Na^J^jRV~5tC7;1*u9%$_}JGXrBWl0BXdvuL97%8<_hur&CA$Lc#GE%|` z|7WDMfr~71a20Y72$D6U?`={|Sc{Yz7_3D~^8voUL`t#q(U|b{Na-M6Q{-o)lr36* zEmF!SA1oB@5tWKb`xA0^tU~UiRJ~3Qw)IHqR(-Qr*7Zo~UChr&>0*q`906&Dxd}!} z^}M*+DHdPANa?~lE9c2*I)J6NJ138fS1>Ge3hP-RsAMl<;-C)ly@Q_?OdVnXk3>P z(LZ_hQo+|72fEzmsR^ccS2%An_%o-fU-u=hvb3t`T9ekg?#E%TQPYX73Y1t1cM)s& z@sXCRn!}*Xr+KCsMy{}sPuG4di#669IpQgUvAny6^gdM8wG8~cNngm6P%SKYuVb9lRMCc|6OSs`QcKQXhBd&^)UeYJ2_oEzr)GW`DuO0pT zghgg%flmrN)8uDa-b%6KB@sDOR-Z#nugBIu(XFl&f=hLvV?Bcs`9l-Vk`}l2RwzRTFpP$7M{)W&}7MI{(;yz!Fj~kK7D~$JrpC+0l6m zp@oKJ#Z%NxKEdiq#503zS0FgMX}hHuNGJ)i$>Oy6QT>#ymZqS^c97n6=iLWCns zk0I(wQb$JEDth436q%)T?WRJ8{&ZO_ee?Opp;FXr`LIk4K1Ea(9xe?Q&&Wtnt%s$a z(rWq?Gk~+H{U}*Wk;!;0owKM{OOc}P`R8-SomZ8yo~CQEjl4`-w9m%1m0^<#>9f$% zwf(0wXnwARE~tjfd&;6fr4Fh(dlI921ayvTF3acv%h{)m!6zvyYU#6eL}!z^%ig4COl8|VQkRajDi<7GMbq>^-S z1p)OV0#WO!*7-8&35Qg7p0P#EjdKBdI)qf$rey9jSnoPmvjD2m5Zxa^(lXS(53_32 z*T_Oc6sGC;<`|Un98#>WL&yqBBzX!I4^i!bi!5k(0dg-my`1Dws(46fk0D=?*14|` z1-#+88iZ1hlxk0yr98@L5hr_~YcbvOVfRH~d1%z!6Y-NIhCXx_VkS7; zP&@^%KH-CQEn?3)?es@Cwhz)Eh+_W&%>ZZ9y0LkG+SW@uXvTS}Oi=hKYB?DN^wN~r z2yt}hpSJaZXEai9IafmcLoa3pXJvu;5i$MRTq44>qxv|md%`o4ZWNW{Pyd(*VtXM* zyHM4k0H1RyZPeN@RsTP7LpB!h-h)g6|DKQg6O5e z_gQ-(z!C>B5?c{aAl}9UP>|N@#kS7H_L0SoImOl{hz|Fx+}gljq8J_bB9^tHFi?r8 z$3=_CqgEBg7rvJSNEDM53T|UU#PK0)awXxz@ROE`R|OF+czAsQ9Z?Wgg-6B%40b8n zjZ{)-Hl-}d+#4sh6fKw)rS(r4M~=Ft)g2@q}J;Y`X*s}5?A4;wDM>Zegl*paL;JB zlEtOA7!7HzgDpyMz%NA~em#}98wMu_T_)*&exi~@soN|mVL9cX5D~^%M*c`W%VWju 
zWK>gcJqD9osD1)(%!Cb=8I&|m1Uxr0TI(vlKiJcd*96tjq;&L^rC6R@3eA2BrXmKXJ|AlaGDt^g!_5|( zvq{qNA>6kT> zD%M{M;g51n55A(3LhIk^SKDh5^th1mto6y|#MsreE=6uaam?NwzWi?SngFA;>?VS$GD2joFP3^Jc%-R~Z?HW{)xJJe#?XI(KAI8oro1F_}~3x3-vvKXi5h<(`)vDD>mo5f z#Hw~nq$|g5bLQ+!kmlEs^*wX2=L|*x4QkIh+JT^w+9w88_Ero?^Or&<@A~4$3%shP zTk^Eje?GY|$z$RE_$T)h+D9h~O-I(%v9&5-+8dP1U$6t^M*W=c!O($&=w&6;+olV| zEer7#$fFfVO$J(r$yLbyl~JUIS(1`l z;`boIW+dbI-G$n8ak8gfQx22vQYOjTJ!ivPFb?L5{fgpSn~9t^vdU|fa{U)R!0Vx8dy8HoT|n0t`qw~3lk9@ zmf@~u!4V2v@Y=J3F^Hqrq+k5(HFW8LFW*IbpAI{hF-iQgMx1wW7Y%dB3S}u?^`KKO z&8awj7R z8(fO)+PoOyWt?&tGBf;Q682)cgJubKP0;-14_$NZ!-o%4;4@?Lm)9%Zogzz{HSdd- zmjp*%8+bU$J*Wy=ihQ3#uO^XPZM30AD0l?aZaPf)(^dczU;Jv&kA-Lk zeuUDG>y5Kw6#RK5fL6CBzd03eZAdbWCEZ2x zb*ZF*mRNSS4BA-|?K-~gx%2)_DBK_2Q~eD4>^}DFcoSrXu5a3$4up)e(YEe!!CFgR zR+990Ja(GY=(0}BqVl@zntJN!! zSrxhmvjbM3)KsLxZt5nJRY9`l!F%LAdDP^K_qTXZg?HqYbgVux{Zam{B7IQE|0s@W zKa_8$@{XF>><-{yHR$WwVYtbIYkaI>1FMq)7oM@K@EX;?T5B`~ovtDB_ZqEZLY-jN zIR0mg8ZCnF`2E#1fevlCwIk%7^9=#tQsxe@kDiE=dvN7TIXKv^@h)A&ei}Ai2@sc5gJ_8vNIO7Ug5mg- zE2paYuJqp(rzcNSCKL?lNRU)4FSCLZFY`Ig9{t_u=Yg!5_NjJ~K z$7SZ+_A^V8ZS-sEsVe5|}xBse&%`R?Uba0O~D~jL3Y7Z5BvMT%Z z<-WU)L*4P?B`uFSn2k0Dt_wOmT2>2rJ^b^m=(v{P#}Ar)9H&q4e!I9alS=n$Wra%3 zzR_RIxP@?vQxghUqqN$+(71tgO>t-9*avL4Rf*Yt5RPv*);)T_%=$9DuvEVQ9VDjD zQje)0f>DEDXb~(rn})q(Jm*=_9K4r4?-}A^KG*hj>HEpT_|t9WjRd~R2UQ-;%8(xw z32vYbxxJEuOk^%qT30XR>DQUcUp&NC;|G`OCd8*EJ|0(HZ@3oE{B&b>vOPd!_L0n9 zrKyG7w~llTHRO}m*Qn1?b`2uBrTPU0`%uZ9MCcK>k26M-1#`2e{BECZvT?@w3?>An zMqyp$4aHFv2RC_BhfK}T`OHZE8}_cpofYh1!v{OYwL_$o1-{nZMk9-1?5#!5o0I!b z6BNHu@~ol84eYu>xXI^1Mfv%Mpi@&^=2# zT?NfY90SvglATp*!n^PLKEsngcpUku0qa7L_-g{aH;;c(j%sY zqze$9>rtO)&cZ$_mr*04UEgkj_Jy8SSsx0zgBUrBVEgAygt=EiM-K=-!ZiV=o;avi zBLng9I$iezKI(}pJ=*C3HEitZd_UDy%UMZNln&>(*tB5r59cq$!OpFrq#GF+$5M=$ z>+(T)^-U$-$pRaD<}xd{(PU%^YqMhHkHSS3lHqTOpG3n06gMHFY5a))Z`G%szZS3GV#Z9YOc{Yy6d5ua1tIxw7_B{ zan^)NpRw72&$y1Ap#o}m?JR@t+|4l%95NQc+@ADX)fBBAdulFI{P@hy23d8jwdmR| zE_}h8m6f+3ksG~1|1kEHxaV$-v}NCQXgQ_hN`ST{2~9sJG=f0(!Hs(JIK)TdSlRgz z)*nBK1VL7{)?fF%&Y!z4GP%LxSRsUZR23|IS^SRu+H4M7yrln3zgh57F)Yru2){PS znnps#&v`hr35p@;ds#i_>X@APhqzwsnxNUe3t+Xr^z`yEZapMUyKESLtoqz-9^n+7 zO)NkBcD`<%-9sgpX<_-fp4P3*JE5HqPGn{9pU&SG?lXdNmAEs+`@W>>Ez=7^fK92E zyEx9Fe0@owBdE)s`#2Y2BN?AvyvM|c2OdZLoTelkvqw%BUkP1u{+VOk@g_6!!DZ=` zD~`{e&MO%jbiciuVHc&ITWv@ zBIl2|5z@N{nV%my^lC-Eq8yzlgzH3f>SzS$$)_e{mQ^!o=ADh(OLM1RU}Yg3#AaL= zE0iiaz-WEfPpU^FPq6hN!Wi_E7Tik>mk&ClR{JGFEECWVvif-K6S%#5(>x}qlpg(u z$b=Zwl#M$8pv3~S6iFZuG~Y0;2Y>-2dhIyfVeK(8;L}@zb9V<}YiyVW(0Cll$;9&) zC&Gl3d=zj8IYrc+jGyq1LGWuV33e5b#=&@EsSVOaP zk-3)uYGNo(ay|uSe2~E6Ce|+=oKVLJ)|sM@w{t-=L)qYML&@b^PUh3di?xX zSc`DyRfdebP9Z$0+VAJAJqtIwFs0GQ_i~`n6=!;#90NLlK!yN$?9F&J2)G7hht`N7vEl5!PE~-$$b~ zKP#i40Q`~Q5ft#O1yGoJM?&gE5cV`Bs7*!UlY-_z4;)0-GlB#GILjZbaR>@T03C}X zU4C`^0|u(!Q)oeo+(Rb}O3Pc4c?0hm3b_*+>_8zQUk@?42Zvuw&p`pGz=2G5oP>N0 z$vLIqKy&sY`@{+G_Twf#5A9bm+S&@512&q&5b|bB+q<2v5B+gA1OTmCZ{08MI!$77 z)VejMD$)}kLmImVY9yl*Cpz<1qeZzn!ROnY;GhiJC=O|wI zlu85o_&{XHD#m${|j8dTeW6PLBD2->7;hWGWFoL1E&wlSa6v9X_1agG4*MfJ0;{YoiDNuM!X zErzbD+KES+1_eRkJfcAZX(tF^UL&rDh^yj`hU31&c^kQ)#ZF$39`*-R%oPGGf~*}W za3g~k^&oS!k!!Z1_G+wxO&oNawv2{Jw18b#p^zSup(;#cM9G!**+YzBb#A2NsA?3>;x>acMV*qIBx*kHZh=@hHld@SR#>fiv}t>+W&o)c2^GY* z{9I2(lo6(Eu^n^ca+PuucZ^*vkM#xBP7X|p2Z^Iy`T9SPv?1E zK}vRX#=@x~ZWSjy2(xRkUJ;7pD~Fk)ZE(G@0>{&+;`^xhruV*^rJMfCNi&)xs?;yI47CSr$SOq|? 
zbG)-Ql51oE9PJH>u%{T=l*7YaGPz*b?eN}DS6R0m#1`J8f)Cqv)?0NhcR*9t^8?XV3kXb&dQ z?b^wxk$d0fvzyuQ(#J%d0?HfGmGVebu+gBiFPL$+J>n3`)j~qm&_1g%V!NiUh#0YL zYu7Ro6b+JST#yfRNL_;JWjh3gKBZ%TKnOtlC0!`(aZ4haT(gBZlc1cWj*K+uv@u^I zEl%3S=1GF!7sXV8drtseelp0t?Z_wjRtSl41A5Pnga&4EpLJT$O4v9jJ}&i!XQp?A zS+a{6Uj!z$-lG>Rd^%jV(0mW8%?a#&!gN8LUCZo{lqHRy7^tOl(2prDigV063!uhX zBNg;xu8lC_WNH~lXsN{}xLO^=;A>)Y3$<-EfX4k)7l2l`CLMq7BAn>*G#kcC0$ub- zl-tHODjLNyPCeRQFr6J)1d4Fe>UUAorB5`>5V7*-d$em;vibb{|pTHAu) zjsU!+UMi1-S0%&p06A6~UO|F!s)@6KK2-syKb_!BB{QTb^t&CyL;#39K;UQWz)$F} z<8HLa62qbjTOM%U2F%b>D;7b5P6Gk5bZJqF>0s%HgSH&3{8;XKU!((N0Wk^nPokU1e7w;gezd-XoiJa z%#vGT+jV;TY{-jH)QX#}xS)DPDD1UnI{*M&W$F#4YWz*dyMLzkyh{ze_X6SwO!UL3 zYye5pTn(BXXXD;pbQDNBjD=>KKj;VQgEd@9Bye`;x5kbPzqO%CQY9Oyk`XZ51{#i% zG*>Z{!)H!L%Qh)AgMy^~-V--kKWy3xPFTui@rIM+(j`g7$ z5k?0T*@#?IvA9xas0XK@-!ECYrRx{E;`~JrCN)0m9SFiMG>4EOaw|!6$VNdXW>E?% zaZKB5nG8aMh9*dk5wVc%m)kI}H+EP!C&{)0hBNir2#zaoi(wg=qd_Jl&I!^GR9kh# z%S{o`(RQY>KcEz{^EPFO_NK$p4GoY{GFC`;Z-n34()uF{(iztX5@^~k@KQ(H`UI$k zNS7VwvRil55mM+THyo-tSV-xLS;iiVWw(EEYd=-N+trm6L$C+^vIXTDkWR#s*pHDO zDLI#hc{oI9xhD8(`S?iC+rO|JPA2(K7AbZDB{pmn(*v+_Sp`?mjw}Lx-7!}5z}|8Z z+`PBL2fX#a9K%P7RdB3Go(8Wa(OdCnhQpI0;MN!f3FHXi65=KmpHCpSm{K-sQNFxf zzCF^ut^o~SCzqf#~8gt*z~wUwlc&0ABtx!)C6yRv_#B z!el?})Gg?dj<#aSNr?{KvEwtKfDR*w5q*mM=g?(q^W{mOv+(gXh<#eozMXZAq5a!b z-E0!G7)f&X4U@97!Y}BaCcp>B9_*MmvSUw|pe#mY(?~nM6%eBcin=3{JT|sN8&Xmh zXTH1RsSd5Ei0LyY)9b|e7Ubv!oJ;P3NvhUm*JO?{C5^*pMT|D@-#LT8#eqgh?(=}@ zR_Zw0wygR2USU_(U{Z!DU~S~}IgSK4LLLNnArgRH(`ER)27{8`in89#p)r-@zT_d; z`Y+7J1@E_hYf_v*jMdy!@q z0x$_T;g|EO4{Kv**K^H|V_Q5|Eiy70<7^cG<8o}{pkhoiH-E_pG(Eg-WJG!1h-i5c z$Epf7BE?#Z#gPQy%D%gTU;MnBC6J;BThXgbxv*LZCT8j``X>FEe0D#%gbDCCL2t`=8-^zi@b8diVndVJsrqtpl}d6rUgvu zwOb${g=`=y6MojZx8g^WpXt-e9K zA6F^64jz#Np%=sMJsXX;_;&OqGcCRBw3?To3gIjRolkY+?Ezg(l;MxKTcN(6t8;6p zg6!eDuLmT^4J;NN^d7_2{wY#wgvw%bViL7JhbSM)@)&Tw=af(+Fp?r*m?m}RqtIA} zghQSFZ2{QBZ28l(mH#y4rbS9i&Fom2P@Dlh29+RIBhdD)7?4OrF!A`x69`hSQP-?9 zoeF>%tct%I?yN0Th7Dk1B>?(9oKOSivIH&>kX2iYA`jkjtryUj5)5G}CNspOigkNQm5%j@8 z6I-wPd1;{3nF)ZXC=B5888p)bdPvZyfIw9=pdM$#3irNF&vgS*PuDgwkU=0}^4=`e z`+xZLbNtrFSC1{qx9l#TM8?c8Z{;qsUv7Y=>hDY2j&t-yc;jLi)ems7m`7h{FR?jC z@L$!Lq$CPWY$_%T^MV^0u*y&RMg^ajv&Bj%J*UuLPzkX zpg+lgZTse9(BTLYHLc&>)yEj_s<`Vj{jwZn?u*i$5LV(LiWVeFw29qiefUkRDAnXF zi*)*K(JgHH*|sxrr6KMc0D#>b(6{!uCDT^i|b* z&Lx?)*I=Yg)4Mj8H3^Fj4cShWK^NJscNZSW7@r*29hCB~lc(#(VvUkpV3Hm9emMM& z>;T48Q*MyWW%)%HqeaMQUO+`}(hwt2aae@$1vX$r`~kINn?>9;NN%DpjZvA& ztjrx_pjh6U##BKM_3?aKoQZ;`tA(zK2dje2jLPov8yp>=jR1(v2*fD6iz{(L=3G>& z>C8ac$H60wb@`(lHMTP?%eTin6*xREJ$liQn|#G)yYSMD*zus~y%=2kIo)e$v~zFr zR&caNDqfB-JGPc}m@>^3S%!s3lmWkB%;8@&O;K5&_pBJKg{b<|jV`$jva%lM)xUlj zd#3gED`jQ>k+_i5SG*m%uq&prEKn;poKk>h&e@>87?&2mMwXYel(ZdY#swy@A_QTr zDSZM^59I?nytl@Buf&jOFTHgQg=1a!O{D`J(Y9JQ*vJ5KgcYSrJH}(63@|h8se5j` zL{UwQ_!je0WG$JoDF`@fUIj^%!pCY$Wj~3i(L!E%@1y@V2XoV?_T#?P8E%h}Z@kcJ z;)i%0gCoZn0TmQa5x_Rk0*m(1C3w;%(gT)gVX(n9CzN2k-J%Ec1F_-2Z+&OQSBXbt zYLm1+*515wD2RiIkLY6SQm$$W39yup7+j^GlcR)$1jR>&qw42i4zsaVVnNFa9Vnt^V#-(big# z^4L}V8C~bR=WrmN^<#*ch$iD{9c(No@*Ir zZwY44OCY0V$T8M`yiDE|kYl>z$dgCp)GUh;@tmGYU-n=PGoY7R`u21t&0eS~!Z`7Y zr+tHu3`db!mg{KS{pS0VjSR`smwa88(kySHT}W}PidHePnz}6Cc0|R_=H*HhvCyX+ ze;*_ZN@A4)V>JO+ldXXE=YK>>l}69lq&VNNYzph^dDsWt!#=I}BCL|Rbdjs%1#8WM zsL$mO;i4a7kMer5N9}OB4<;;&Tp(lV11}Kg>2W60vsY6h&YDcSCm&kVBBev3@Km(! 
zy^Q9t-fX`9K8+I}sa>&+K|cce^_O4NpJ);IA*>%Avs0yBbO9b}IUP@b&fOEITBrWV z0ChhHIdek@BkU)amVVCRq&u3Tru&@5tg_XuMt0!%3w*q(tA>PMC7XkYTyo|zT3lag zs25FS=-P$VUsQuDq{xu(9DyiEECzTNZ8O{Er`f*Mow)MrlicMQdC>*^X3{O@^rzqD z@8Ye-;?0c-6 z%XRNP6nA%~MM@jnPFzoRyu;D?kjeA1ap#+zL6uW2#q`yIlT~V~hJQp#o2~Dio6L_m zMQe_Vj?ed;eejmLq%f*{CI9&YlK~T2r1WdSmf!qB-_3Dt$&+U_CJ5oxs26gnDs7jA zk(5nZr1Z*kRR5uWS^{Sbm`~M+;o+&+4-%Y~?zBkhg|vImAIDdHaVogD<$K^3b&Bg| z-N32+M=7*ODOIXz?#&29t$^o}mt*TF7Yt!DK#P>FC?}L2^F9dF*i>U+bx8uDu#bHWy9u(&6d9!SY9> z^w(ocYFTJ3=~jvxfxV(}WSKdsDOnFqxnGagr<1IXL* zE-;(>RiuG3m;^M-+yqg`fWE5raVPjouB^_t`DA@zY@>Spq`+72jh{~=ETJ^j%cB!t z_^E#d+fKcE_;PAl@}RTqjVu(qxUkG|VQ;|woX4wI-O&c<=-{90(Pw+}UKSkmap80& zeZgXL6xMT~t&vIo!%=?DRSp~o^o8$_NNGhf`{K$SG`_;7pD~@v zhlMl1{5VS@fs*s|rcF6l^aj}KUhn-Hk00*@q)JFq@P53a-J{JunkA5r(38CaD(R(& zV<@tYg)0YrQrWZo0fj0QK{+0xq|i$f+gy&jWXb{Y=oPD@u>#RN6jmuJlN=Os1ZTts z)#JNcK->Uhd5AJ|NoZ1^#@I{#IWn%Mzz7k8n<$hTvm_$=r0_+0b<;XDeY zkqyBcvC5uaMoH@`Ve6`C-fHFrQ=CNXVxG}1|7BYNgN}USfJbbZhbd2=p#)T}LWs+r zi$OS(WJzR1KmOo7uk-LOdXe);l)j|qO`_DPql&_E9lF4I&le3StY+`p-Ck4_`6ara zQ;6LWuBR*2$FdEQGa_5L_2HY=9D`6t@+kH}r@UbaM9x@|xtmVE0Otx}J+wXbEyeu{ z$b&oEgMG_`$>;Jik+G&XWRFhcix=bFUXRVb6KBCk4$Z$nsg{fzeKu{!@tV4*!yXp> z>$b|mQbk--yFGhW1D|?`qMA>bdCX-8NZJDAa}>;(9G^L8v31oJN5ht14!fs}k0)=b zr$mXx_iNm%-a03|t0Dx%c=JVqJgH?bxs<$DFfnFZ1#&|#rDO_EeOXsHwysDN)$4F4 zqGGrs*o`y5o9Lpas)C9VP)YXQyp6Hz2mX{@!;`ybL58;eJG&; z{tYjV>wNc^bDE1+Sxu;iTQfhe9qi22_GU@KeD#M_I)VYG91`C zCo&K|B}D!XYXne=n;a@RcuVMNVe1&3Oc*^+RKV+obGgXvG;sFD{DHw zhakfW`tTDF+btvF%ColkoqHt$F#Q3>m@=ynAo+byn6 zzuPw|vc+GYknzov0aIkk@11*{`BMBx(RLD6uU>nHdvIHIwk+YrHU~ULb{EW<0vfp% zivv(hXb7sO?3z6&j=>M~1SF){EB#KQ*JLETf=dpi*KoW}i3AjSM@-grF<2653oTBHI>3)$>|k${wQhP&G5`2;viG zv9`OtoY#C9=8{^wyzmo#GC<~|p&op;A2H95Sn0I*=xjTKl-y?`pZkjpEae$0JtCf%7@12>>O$yxZwRZw(u|0rO-+k!4+hu>#Ok}6JZky;>>9zK{0sdg=Ayj6X zVvGRuPf))1^t_HNMbAl~F89h5_3$sE!)DHHY8x%g_pCJ>XJTRyDk3mi67K+2yf6A< zB&zRgeHg>U>f#2t%HHTB`cSJR)Xdue1~Q?ziomKXO%IB_2SP=O$i-F@47|S*g|0s# z9Ydi#zk$D%Af*Ck_;&&4Qx>PfE--x7MSZ5oB6Lx^GBWg!iMaEO+fH3X+qah08)>)0 z-#^f+(UpE;acKznaHd9bk5|GdKnu)!%ob0D`X{SB41bTC`Y!dVPeKkaGjQSEoB(w3 zed1tR)zgBiH_m>sJrCa&NHH^0`=4@4&oVJEb2{RSWFJ@5#4g{6yb&9{OLt=HEow0r z|CYy`DTdo~iQPv3#no#Z=K3={>nyL+jprDQp$-OjE#67!Yt#l3&E!~Z77R23gQ zc1DtOhntZDo6U%e5x09Ur)!~T&>ajDth(ZT(+Q4}sJz=}yVMtgrHhww&+XeGz^M#q~m3RfgH_&d@4br%|Ws%!79zcIw~QC%^?;l zp$^TVCso2uH;4JFgoihWN2^4{H%Hu5xtP^_F;|6liWyn0a;drbQnyOfaC6j@%H@UT z%PT5ZUNv9&pc4JHIhv|MqH7^BtCBfd$b704(H4rVYK%%tjE-uoVN0xqYMet$+)34| zr(3T2tHy`7#7Cr28?Rb! zd{DjlwdE#NHHEGbgIey_ z)?BJu9$i}=vwA*9TRxw9foNNSta_nJTcM76kzrerg?h0=Tk%QtlGAM^{_3USZKcub zW$|rgH`U9t+RAg)D@xlcs?{Gfw>{`quN-cxoKmk^XscRLfB35H;Rp5VuWi*-^%}bN z8fJ}Jj`mtUjXKfxI$4c+mG*iajRwQ^1`CZwhxW#k8cnC$oBTDJ!`qvqHCp1^TW)H! 
zX0^BGYP6NMw^eJjH&ff&yEQt7+dHN-Iv3hIS2Vg_wRe5c=>FQ?P1We3>*!(D?B(d_ z<{sdN*U=m>>=>}n9CYXyJgGT!x?{*+b2z+XI9hWgzGLL3=4e*OXs+g1 zX~$T#=6G|*c(>-naL2@y=Hx=h@Mx^DV3UTdvOc((dooIzO7be{|~{4tF0; z>HJ*i{<)&_>s9x!4?5JZ-Bbuvjtrq^SByY=5aWdm0zNeE!DbYniW?$gUL0dd9lg z^tN*7chK^jgf3rKl>_|*0BBWZ08(ny+&^-U1C4)-01hTDf9>X1=rT#*Yk~+ zIQ&m^Ui$WB{|85adv6RqhjJ8WpyyyRmrB?nXOQ=3y0}AA6j!j%F68?)P zGUWB+a);(CM=yoFd)6BM_1Dp;h`sgEER|^9%aQv#Pn*N3yjP;WeR{pn9L*Pf<>!xY zU%yiMXg5kcDLr6DO>{4m%{8SLel!Z*hY-G-(#Ig(f$m2sEvNJ|X&s^m&;~rI1K8u5 zm_b%M*VI9Fwyf>k5o7Braw}D*TFKS zxxbt~rTz1eWtspxnlVje)MA~{XLHM#Id=3i>#ULRy^L8C=}y+iW=hX99-C|ZWSz4# zI2u|=`oD}QNxBmD?uvXGDfltT@H#o-9a;Ht%s)Vs*c4!PR-0 z&-21+^Rrg-Lt6_nJ{I`>?-C^&wb%RWNe*lf;z-JJQl<<@@7 z&GEK}TWxp0w5LyX*6wuvXA>pk2hYbpugrW~eYC$e_hWPZ%hqqAM9ikbb+oQ{JPYqN zcGtMRWb%$`@SEk)`qHUfLJY%ElZLXHLZfsEx3Pxu$EB8qCihGlE1o=XtUvpFtntA@ zwR?ZkQPZZ%r}e&%E8NCuSX6Vs`q(|w=7-PQBR;)(KHglt(oKROdCgjC*7_4zBu`AV z)UFSw2%6qEYpvTH&sOkzG0|H8YAR1JnfG{G!|TUoHV;lrwl(f7)Od~GKi=N-?paIl z&Wp+R<_{}9F-Si1j+VXkk#xzEk2+dEy_zmG%{A|A+uvEN_gi_?+5YvzYJW17&!Vg2 zH&ODRMwHb3!$gVSZ=&SCkSMABhlrAe-$coOGos||e=$)q|C=cJ`$UQFza~nix&PY| zCC|8j6D9wDAWCZfdlMyoHUBQ6#P1&`N^1TsqQvhvQSyI0QPTXIDEUp4{3c3%6D7Zi zlK)boq~|X)|5BpF@&6;Dr0(BK zlw|xSO8%pWlBfSy6D5oPTSUp^|CA_E|4o$qA48NF{U%EOnJ6*-UnELQ|6hp`J>`FZ zDDnP}AWGbS6D9u!QDXF)DEUW-5*FpZPn5X-CQAM@h!WG^M9F_5QNr4_E3*>msj0Ls zpD;))p-<`*eaq z1jQc~CmnF7`xm+A*SyLB$VfG&xD6qVewdABvz|eY6TuM%k{ACjRKn`Z%kSHx^!K3> z>K{_c^0wA{-;`h5e?TSTe}+m(|0__*)P?^ERC4e)P|2fzfl3no8Y+Qy{uwGc_%l=j z={F}M{|uEx{0mglwMuqNcooDL@YQVde&N5iN{(*JUig1lCGLM~m8>cE|H&#LUK@gK z{=#hil~qC=@%sx_@@K0={?AqkVLRgUA6ALtKUpPHe^@2!|6-LKy!~sdg!>PxZcuA_uHdgftP)01goEf+^q;H}4BsC1U#t@3$sRHn8`Q7wmin8# z$qVLoj++GccBxf=?&dwvGyo-#94%o@>w&M2>MY!vcfmtM2Q&onL3WBHNC0dYqV;Tf zUHq%{!6lG%^+z3(1nq6q zQx$J4G0oSyBn)e>jAHM^N&0$Z48|qO-Mq^8X@@hI6Uc1 zF(aTf74n^d=`Uyr`ESsY%somjU;GYZ= z{I9<}=7~?ZY|O7e z&Ek1l1dF;wSm>Vf|p#8AdvkZ3G^eS)ilP#r9?z?1KA{U7nukwrPhaK@f7LC2s5)5)Nlgz$Hqd0UvJw zAtcaKsEgbZ35fBtUxwhYsDs#X#3`t5SDjVIGp3kL2)akj6hye{bkd(0z~exCEdjKQ z<}a%N1{j3rOQwH-D&Z-}jTC1L26L{lU*$U6!XJM$tdT8&KPe)UyZV}{rEYHmCn|a1 z+|01O;#0)}-Nq<>GfTxch;+Qg1aX4KQ3!6_FyOu@l%^8;gg^bb`ZsTvdUt`uE2k$t zjWaJ)QJ_wZ0o0Eplx$f3m}9fM=Y7Y!?rEE%z6F~MBB#pB*Tip0CGd*`UX2?S_i__A zajtMIc;%EU1rWdz18S!HVPZi@$0BIjRrHxwd9k9F8c-kTlI}P0vi!&ctF+=A2^LxV zyA@CLELa0$ljP=YH<;}XaT6Q^uGSF42R{2Seru?Wp=>A`oLD_7(JnsjT2wL=hf8cFE3S zV6IlS2eh+Y*#CnqC|cikC~90kPB`ZW08Lr73`$A-k~tT+pPJ7LoRC1tGsezFSuNO)>?jcRQ|c* z+>fpag(r3{mzCz7{HpS&tX4vdAC}VKlZ9imH26g01u07+<}g@g_X zy%?$?fK-v*n-m+Ch>D1)Ng#9)kfwqL1VlwT(jiolqErP%1q4Jz1q4L}IXut*p0n<` z>y~rh-Ev;dyIC`9?U~(v`|}-B9+0->R0YeN%6x!Kb(LPzcQ{~Y8kN4wpd`%PPc|(7 z^eC*a>2p=i#4dx9mb%NJY;9U-(!|9R=|7B1H9d{X&p7z*G+OV1?t-41v2VFBD z5imd3pc>HP^iASsG{{aQ^}YO{=C*`93U{OQnvg<(aC%sqTi?!pd-Ze1r}t^r<(=+T zbGArOO96@2P@u{WI2yBBY09zhSV)!DSqg5V&E1l*&m9<-X7+YcLWP-p*t1m^j@LS z5klJ%;*)mile~%J*P_O6$oN<_2q&a6%_K6?JqzLtvy*ra0G4KRy`BEkvasMlpGAZ- zF9|OT>$EmgFRFx_O$v}YhH0{{4V}WZEZm(xA<{d9)z~Xv(Aa!A9_%yEB;0xk@5FYcJFrw`R5#Lg0#TvSj*6$UzU(pQu^X@5NWv5sPv|0|7PtrvNV!A0HFhMqhq>cc3~fSPe-_{^^y z^gRMTIKLOb&NIvut<|e}lgs+%!hh?H@|syWTd!~s#z6n|nynhM$3%0?jBVh6wIkWmy%j@`5izZ^`arM=$OX%M#D&a3opGnM{HOV*}AYAKpW@DnEy z*K%IBH?N9YsMb$aT<0_U1bl$a%sd=8w`hb42gsgJaI?Yo+GiIT@pDf!?74`Jge z!uzUkDJGQX{oWb#ORrj~{8D$~9_RJ(U)l`?ej+lMIdn~+|KwMci@%Fr!Tw@Zg~&<= z>+Tn-M=Ey_R`nI8(U%+6R7LNyp}s)sXxL~;L|rrXlYn;1alxPZxqK%@BAT0D5akj# z4mt8lX|=qySI~7kgNXqaS)=AQ7O!;Gzuy=$CbdN_YCW&+P_oZUZ6;)NolaeewaZK2 zd&h?qi(gH7*`n-B349Vmgce-xh_JcVfuavM20quXpSb=p<_ZT;R*q9s4~Vactz zUvC$~OfR2I-%+xAkJ($V0{8!Zd&gK#<681yfYIyFusTocE40%x0viXNOqSC1d4+n~ 
zemvfLaQ@(dCrQ^*rx!(!e<|y~`1F@4nS0>p7a#Y;NQL%a&+bV!w%Z?z$LF7oS+DK1 z@!lTs+aXO|`TIQNwC>9F_AQd!p0S}jS@C;ywm(%M{v`kXn+1a)zPIoFd&!dWP5bd$ z(ZI#oN$G|C0iWw+o-f?#PZ0yF6Cl5(Kl2H?uX3-O%zUv3gxo|Caywxs3B1RqSIh#l zF9pIP0=dUo#54k;B(UVf9Yik_`Is<03l`@R_+M~#V?PC&lD8KKfo!(W)c@`j1yJN?AU_hJO5 z;mNzEU*+CRv!I6P(AB$UM>;5qe8kOJKItN=?K4f?SMI{|)eT~9)U$N04 zYt7jseFNP=QzB%cMY;n;54k2xOKdtQq_RYnJNQIlPH}xU4|W#J21EASroJY0n{(&A z3PWwxLtU^Eq8)u?GEM0?$ni+1G=0I zF1)izyV!({ZaPegJmfx8D3%9G-30r;rJY#2MzRbwhjvL!T8mIc+(a$Z2_3|FS}Po+ zJYGV_)1=|FEp(5|zGC`qA+udICS(bxekA5avb|QZ<~*3~U7CquM-?7Ues(Xq_rt~h zBbQ|Pv6HD*cry`dX9$lgYP~~U*h0l|@hUdP3bDfs`}rBDPj1Yt?iRW#!qm0mw0;y02c3ss%mFX8oE ze2^;^BB+%DlKOJrYKtpt%*01K;jWILHj?hJh9t&6*1pNEY_&G4jaBU4&k;CQfbYJ& zFs6Q7H7)AcJ<*AKNypNX`qaj`5)76P7IH=P2!9?^a)DNStW%6EK%*OQB{GAWTr2)QSd=OOW5O)C;?)AM(#xd# zEYdyw{fI5Ns`X6PBNw-VU6_P1hdOGOxkJ**3v;b?R#X0{`k=#SPB6x>)SO##yn0cL z3qKLAkbvEvqxc~le7NzL(s|XT^KtjiT<&NSwV>>}71WzfnXFBV(l7J0Zc&S^K|!tY z=jz8kX7kv0ep0@MU~_&$ZD98N z8FKl#!$YP}vUMP0dRH0}-`YBIKgz`v1XGwfVMbLM5O_E}`sj_}YmBb4g`-8er-as| z8gWzLHU6Z!{Ke83 zR<#TKv%ZS>?9*P^ulcr#PNYyyh^A8*F>%f#2?e>*eC zY9jsm6)o~*;ozs&cBPrT2T;Z1t?CXx){0wACl0{eCTqdp@lXNsjie(VrJp`fnAONu z6!~7J@sO3F=!rF3u5u!3ne8$&gPptoX9B$XFL+Qq1yoT&Z3XBjc+8s@S*czU8VYoz z27N-W8WOW2rsk!w4yrLI|8_RxTVFNrRBBRQRj~;j{?O;{eQfapZI7C!xDNfi;<6IV zaigtj;TRouUOl~5<5&0+=Fh;EbN04(c*-!??O$jnjSFl?l%wgVOuxA7-M&r~J0qSG zyRp9W%w`cGp9xinj+Ojb%-_5+Ia>q0oozQbyxtp27z70Jg!4v;Hu>aH;-;*;Ad*6ImrAo{KuPe`}K1`5& zs3oaW1AV2fdqr55q^+eHUe@~CF zWW8No<|u}?&-&!tC+$vbL&iTuiGc%ogWR%-8mE$aD*3w3AkLQ?h5bvEB)|Vn==QJp zpG3+3Buf4#QS$#%qGad)IYdd;xbuD!kqK8fzpM#&ze^%7JVW!dUU;8q6?y4`aJHz5P#m&Zw_~*{X zNnggzB`Dm^ol8<{!@W<@dz$64mBsuk_}@##cW(dw z^r#*GXSwd(?LS{zcko*)Ju-K;)`s@VY_CrQ+}Zx|=8DYU&4oL6{{H^bF0-?>@$Syf z-|Zb421s}Y;Pa$H@MtEV>kL?YlFDm}W??dCpemkby9voL;^Q+s29sujm(j6A;aRwi zCr#)!nyunGi#RWY28h3)eB!_$K;F&K0ASEzaQyBS&dURb^P-RlX#ogYPymY-6cNIT zi;76%#7Gi?cu8?0UWz8eD^HYBlq0Gr$f>C)YLJvoNHW@Ls`^^$6desCJ#AA1J*wdz za}$H$y^8y&#x~}r*USZTECucNndMsTirJPqwj%BhHr~$rOWpAP?v5usta`jLAwF(Q zA6l88Qk9?LVSmpf0bWl6#Gf3LJRa=Z7mTML3V3=*_VnRE#!=7yql)K_1veeji9B&= z@B}&P)RDncnnR~`E`=XsMx1;eVHk6sK78Krd89ErD*SmAbvVj&0>T^3S{^f3>Yudrdw1BDWAuHGa zFMXQu{}Z3)ZBcYx@y)ej`umbg4fpci-DA$&V|*`-Xu5xU_I~WshuM=4uYP}cv8}vt zt~`lbargf&QS+hg?owO9kG9;A_Uca^#p9iIKRa)K>VEjMyKt(v@$-|)&wUjeeGlLC zw=DP9tPWJq47P6$md`!oetp)s`K)STr0d&A>#vc=%i~?^m_$> z^2yh!f!|YIYp)0YKL*vncYqu|DeFp3M+{s@)wi#*per6NXPakTRoIh^(>XTXS9SNv zHG+APludQf)9e2~Flm_iHxIP8ZU0%H{rv8Edq}Y`z@t3FfFf{~SKx}J1j}IIrI{E& zv}Pt&*qt&gf<8py#EM2N6 z|D%gHdWNj!VqFVAF5YT5S9K^(tCsIWeq>WgGTQNnxH&?HDo)7MDMFjyxm4bfrqPyZ zfmFkNq}gf9iksWr<>5PHtH8~49W1^Or}}_qy+u!=`WMmTs7))3Sc|8?PU(Pk4^aA# zNyAOD&M!)fH4B4#c9+R&Wps-0iP!MR-PKQ4#NI>5qXhMD&q2znQ)BIgdFLLGD%wQP zA{c&U#05b)@nkYBTybu>+5J_bQ<)0d$5A0{9(14D$5`7nX-320q$~5F6vcRSSD!?p z%*N&qoLb=x3%;Vd;i)x z;%C***zy|_Vg`v>OBqk3>ieW>s87k$Rn&SZtXk*Q2c%&VX=mgksd1myns$>^HCY=h zekMUNzpRSXa^!nXf?}s;cLJf>*rMVI*pe8i02UzziVeVbWI5_KMSwZOmFyCR=to0l z?f|LwDVbnxAIEZVYWmN&aXJE=`3T;GpW6AnN4`^AOIFPB6{vY6b0qFI_-!AGFHmVf zDucB!Ku8BG4_KeL$j~F`#4inO@kZZmf#+Ji8f>>J`IsRm9EmWNnj4;mF~LiuouRZr zHA{KW;1#`f{uv^REVL=uvURfIQPRCi9?b|JJXx>;ck3UM20!qv3AnS_=V25e0ui?H zy#3=4*MzF@UIcgmC3)*MF?E%x6zeyiQa2FItAXOk z7`qUr39jey2Ow0nvDXzDIV^RlLYE}>;p%@g zX);bXbc0Gy^s4&KjMk@!YLNu(3N6da>QnY!J)YzSMZ+y$R)!j{q5<6(*mt`o&2W%F zrk$CV{-iRzgdlz%8aVTpfVQ76|N!xk!c-E9Vj zsI;O!H~vKeS{AW%9C>t(%F1F%Ojh$$0Xzc0gu(Y&hf6BB8(PLwJ!c;OIHQMjOX`s1EkQ{}8Wqpqp5AcEs@N7^3YBK=e1$-b)F6zjI@%4^Ss5Y$DD28zi-OuWCCcGkbYU6p5Lbix7 zUfIKlXoEEF>dD-Aa*5uK3nfp1C@vA2t-j=v;=B5V4((EYIlqax_e&N7tXWzpMI|Kt 
zIK~rLBT4!415$Z(U071;#_FQ7Q<5UYybsKSN(w{;ODl_CDf}R!Yn!6|^`v-s^=+a; zNHdNscR>?f>fQA|nK%>}@=6FT&t7d2CSOYgZg%LOW1@i5184gQ0r3RA!$Cs=?j?e1*^L9Ap(__B*|P`j;nuk2Sk_}B9$)zuhWO2^4Na@OT< z|2vb0w2W+N>l`OVh{psn&xqfg42?9ZGc!;SIzB+HjD&S2?Cs$orJ7~}K0d8Cw@&1> zm_89E^0^;ZdHUSYrSg;top|Nlqz)TO*L_-id#HRT+g1YdO*05(i|IZr!|cXQ<~~f8 zE_!IyOE*hmYO{o=2#)S>ZHDJ=Ma2bzhq`sP&5wncNfq(n-pQ1gW%)Y7Njp?0)Od<;i`P%VTR<5x5F9I3ItM!RTeRW3X%60612$EVE4X<*;~{%ZI@^X~~;K0OBAF~wO_nTKPY#c<&J?ElAG z#h=TpWPge2GFjr!nk-TC+5UXU=(SRB#IWT5#-tIB5t$LFZ}4JvpS<;FddZ=_>E+PZ zh~__UR#NM0G7rut$=`j;bHBRfr`oU<<(I6Jx0vNKj(Ge0%Am~%HSRHUX3VzTwiHU)xl3!*$%ti1#EP8X-`5al`~;LW z#j(q@_Jy7Op7Li}xux>8WRQBgOW8qG$QH%S|MdETf}CJNiK~+5YOu9(Fu$nzt50V^ zs5`}6v0xlT+Y^Mi@_~8P+ldDi>Bwh#%ty|ee!)!ne%cK|C@zJoB8WS!9YaZ!jeKeJ zPVDdZP9fMQ(Gd%NYQA$SbCy^4WcU_cgxycX6BLtRab4WZ3sI^IuvP>tgEx?Bm@uM4_B2{hOjiLfC+ z1?$se#jzXvVD}iP@d=gS6~t3?4t(Uyc`KDHO~b)MXx}2yoWou$Fz$FlGns8DtSz1~)Uo2sKu0qzbxQ)1)`l@$^xnEW*vp=T4owkaH#{ z2YebH(`72Is6e3_vBmQhMo50FJ^O8t3)Ozu4u}mJM&TTdCWb5Tcsaq$PRqY{IKq*`l5>ri9KHf&1P1#X*`#k&O(6KM zAjcqt>XiZ&G;jNGK|8|*lmTxKBoxk`5_aNK1K$?zfB>(=Xi?ZIdZ9{_6tISLHY|4a zE_RP9_Pkl_d=uUkm~*G<^j{^+-oPTE*`f$giPFIf2co2%9u%MaT|!qaX6h8)z?@al zJS(DAa{ebzu%m9=8Mq&vr)G_pW)7?6AXDfFk2KSk9mcs_R%w#$zP>a#^V7ZXgZCmO z!sDqh92|A(rIujV{TglaXOnJE32+a>#nAJJ!}TRkG50DZ9=b{Jei#!^IE;xhbTUrk z<)9%y6`>w<1dVHCf7IqC9YLSs6AwY%TY*T&m(e(76&%zX8-z`z-kH-6g^6$*I5kea zbe!%7JzFDm1%r48a+5;!JE>fMPDrh@a-#TVS`Q3aU}@#$1U5z1Gzb zP3EE@SHUP=S67O;QR4%hkY+tGtM+tGGgb1V*xdFb> zfp@;%*X`31E9vE$#Fwj7Tu(KbZsO7CheUHat2H5DBc7WaBkSFGJvf9l%x7v?ybxAA z7F==#Q<7_gUVeB>c^-+~KnLvOU`E{fI4F}65$f?%5N2-D4eheE#}G4xA;VD5eQ49^ zjwWVTJJ9KxbWT3Sv7O87zfzdLf;@+Yscm&S-)wMBf>@*Z+?u-Fq7tJE{xxZM5VNn) znsib>foPnDD%%OHRx>N|;6i?T312ZrUq{hm$}r0{=*x@aBB-3 z>?6aUNI_b-gX?Q{yr84?UC1U!N|q+i;o&>#1*qtA!d)qY{l0@k_F6+*gHKR|(H6`v z6-+O+q?jz->(l$bxj*(+Z`g7tM-!4>iQK^Q)yoy@4foblk-wUdIt;#M`{LG7wWxwV z;SoN>GCCf}zXBQ&B92s!V{E0KFK9i#?`zjeb#PXNhoO}99C@8>U4t}voV|IY^ms~l zGdy6zPoE(yJ3LM*(nAmMP`Hs9y zv%RiPg#xQRS_LoNKzE|IUvQmx!J~W{s)ilIg({ABH6>8e5-4K?ArXxNcS1S>CqZ*B z#mypPE(zn_VC)Pr!Sxu6S8?f$BcYi^2AEf5tyjc@4A_XOCeC$A!6$Pfk^{KaepK7-k=9f(05J<`wwAq zZO;lGMCGqQxE%Tbwe&(@=@p66KQ~L04fRmCGsUFCTWastAJMNi@ig`!{Dxu8;QKo! 
zLSI_3-%K!(=ObIEJqJUmpl8+IZg*Jut^5y|G#zl-@I@!8U(W-*YvN^H zh)1JIZdp{MX#PVh^)e1HPkV+O{8R>ZLTPc&pDUaf*A+FzV-%(d`isb{^B0n^7p{94 zmtbe*9OnL*IhbyhKRnMf(yx8#eOaK_ZdQZwZKeAqXQEQN%ZDU-D4*wN<7``s1&;}Gw0@PzTLjU_BT|g#fCj8IKD~Qu@y+2p$MwR0=Pv`-5mqMAJLgSrx zO_~~>stjJ8&qC#oF!mSS57?nob{ARSeDw3V?nC-`k7Q}0oSUnT_9mfy2bO|9fBjU) za|X?=c#VC18+&ZvF@y+zI_$uE8p(Dsb>;eCok^GUNT)pag)!=U-hXcYiuYTw3po{j9o~Y1JDEOmk*31**m40lb6mfKWOs=?el*g~^Va z{;2#y)X>|n*(s>Ix3u;4I(--LG7&PWWqi{qSX03?Z)rF0;Gf#NtPD_L_2{dn5rm=5 z6dB=&ftexMldpx|;cbRt+)86PuM~K}YMpb;xeG8?0`c{vQG?tU;wb}Tpv6>c_CF>~ z3gJ8SG|bYb>_>QeeN6~xny+O=UcfJ4sc`cb2_3f=VxA4%A))!fkhJkl?6V0KCYNyk z6^9C2v034b+{nOQ*8@~Y1`rcvOEnKjLk*8++U3v+naf>ya^l8(RUL;W2sF*ZNg@=z zN*>}BPkjUL3k|Bc6iCPwkxl(>h}I^O8Ipcuo3pdr3@g~?1^qpGIybQ~_aqM1Pfrsx zQ;3-wX?};_c#(Te1+ym}_2xPH!w~kK0G2KFV>e7XWsNN7IXq()%O|29GC4eJo`4Yq zC(Di$_Bg0q9xtsu-m8t*qxs8wjn3O-s<@ShOpPws!H#>rO7bn)vKdd9Wu{!RN*D6syl+)_9!n zi%;ya9@!Y5mtXx_E^K^Vc>S`%Pv*&g0p%!P$>bwvQYtGwu{S z?2rF-VSXf?_rtq)J3G0le-s23vFYqa`OB*;1FyG6&yoW&0>6rF0Io|77P-mq{|8N) z6*2t9^;IiE%z9O4n?;xgjCkEu+gkRgc6ga|PNw!##WS^Qw&dXCa9j0@WzH2ZITqD+ z+84I46`DP-ss~k`53cQ}ysu4(7o3_O9crcDxuR5(6rNAz2F$yK$PMeL{Q)fJoT&+9(jakb z=S`!if!C+_7VLa0|6|e&$-NH2f$npZ(`*XgoS0b1d{Ym&ov}YOqyq03P4W*gzvqm7y;1JeRgK=P9IXtZwRnfT-h~$qvLB0t z4q0|PQ@y~Cr)=}issCDVxzaEfA5tQro_o}nkTxI5mq7xnj&5c;#^t!~VSZBUBLjt%ubt~W$=?o2 zZX3Z`-nmDwJ~?7gWVk4s6CK`9s^1EKN<4dWf3w!ltuw>sVwcX2+F7NZG&0(A>D&vS z^V{b{KgV5)7`#AvpHeCQ(@s1Xu49wK&T#TN-#lNky*Z$ud&^Eko+))<>28kyy1al5 zYf;^3bq+0C^7HS-m1__GMrqf5FpZwSK#ARKleY(LW+d2{`Lm7`vd(ZtF z4E*gpX-xIYSa!b;C^|Bcs8QB${bDRJjVV+_sFCt)P_V3IRb^Q2$o}n(>Ha9FF~AJ4 zv294v8RrmaeRtqp&eCfq(mbl^?{;G6+vJZs#371i1=JLh1wW-`%ah?dmDZVhGYnp7vwaw9Rl1A(oJt!ZLA?+ivsvx_k3+*Wp#@Bs!Hk(AwOBf$(On}=zq#PvA@ITJ)#I@@cE&J zO`P2DZu%=l9-#xoMbGOEsB;0_-o6EiKx)d4OVNyo^NA&Mr0kwh0z{m=e9yh{(?shl z3sc3`&l-7(FGe%Q%IQ7$qb|rIxsu0q6(Xa|lOo=^^jR_H2{GX zMSSbW9i_9S`{1+`oYXyRUR~+o=6dGqz(-sh-QRSYGU??Pj{VApjtx0H9^1r$-#k z;O{Pc!TO3Rzf=^bCuGpw9tj3;00_ur3Mf#yU|UBPerpa1$5FviMZ5s{78JMHqbjNi zK;eO7#)bOVj6n?j@1;<&z6dB|x9C8M(4XT;?LWi4s}gbvfO$5-0Y?z;@hveb0R*Vi z<jwF6y;kc*X>nct0vndjkWZ4lqC#z`#DMrqK&U2@n(m1Q0-s(^>%5YBvz}+@|@1 zndW{Ep1BN-LLxvtg`Wxpp8mDW^eK-IKy^Ycen@}B>vsg72nO`i_5jM|Z~!QnEO*?o zKue*S1Ss(n`R9de1Oga;Lv}Pe9#D@?XCl^1KN`z|AhFWPxrbjbWNO0ZZ=54^4bQlx>k!xyi z{D$!_0ndz7*q^HccK7kRQij5);)wRJ&*@M{rpyR~cPU*UkdfZ3H~ELH1^{xynz#PI z#FZI@#69>6tT~7gP+2L3Hd__~@#`3U4A#XHYc)1u3KRxvD;>XSj5pI>9%~KY!hk;J zZh^;X9Wx*@HXx7z?qgmFt6v~B-fA{b++s%#?8T25Ep}CV1As|i$Br}Yq8XUzR^I_#>=6qrK?}P{B{0qAzGM9Q zI_;ua0Y{(!7qB9O`~wYTr1W`LU_5Yh!f9Q+LEAl248E%IVY9wNO0ye>2?7|{CB5_C zF$f4aK&nlv+X!F9*bX*1|K8%jaSO+@Oz~v~HbG51x5Kns(I{Fds1jE}h$$E~ANo^u zG*+G#rZ00JKx|^r6aYWP=11?N$~DT6JG24DneP|{fT6aG!2%#cdWXzY5JZy&f@}E! 
z3>A3BMlSUOfU|yK$DU5MI(@ux$9IgKUBkjZHC1_PCBw&p&0sjBed@6> zOtQ61I_T^cY+91y{oV+>*?a22J~A;*eukMjiwG$NpR7fhX4=3cfES7f40twB08NI7OKiE35z+PQh|@$=ABQOk*6 zpc7s^elu*Aa-2nj(yd8|=j8tWO&h^&oCpq3$nBpLYQ)dN4glTSS}>^%j0c5+w_*E| zgnS#G$QytjT+i^F83{;$$zK|B+r;?K#Hzrs2P#M8iOw=wY+2U~x$Shf%~9$w1Dr65 z+aT_|s>Q~8^Y~NPB!JCo0ACcpeJc%Ybk3=dF`}y=>fd0b(o#9$t#UvUD!I8g_Bu=m z00eT|P1OqZ>M{ytlWfymSeg%XG)Hdr!S)P~J)Ie|QpmR39FZ5Vs6v1t%FJT(20u** zk>5=Z<`jM3HOeb4uZ7QQ6M7&E<;ZqJNN5JUkq_j5A)Vw(o`tC{fdl&>L8VZ$&6vCy zu=}eClZ|~rdJ3p!ck6!l6IsMy_I0Lb@kZf`V_#nI7eaXG0M85+?*!s?^q8@btv$ow zjdW8t05y?i1;?H7Dlun683naM6b=JCY~ddX=8rTwtqT)xV4y0QZj2a}$bp>z)CglqAW!|MSe6?D&2Y>OHgQpAfHbinZ zS~36Z>BOr1L`k`+Crc9*UoxmT#?trn$1zg@niKqBZ{2MSba?96#plvs?~5l60Gi%L z>7ehvhUV5@4hb-GY84hf?MjH%o`L3)+U$*hM;n*`fCbhBto(cw_kD>YNUPm9#N9<2N0B;(|FS6N02P#mf>EQ^1M*=&Dz}-MQpXlf*y!eY9XAqAk zz(cK3<1n--Oc;luVh=-mGGLqrq6p~oBSyd(fBbI0fYWJSu*5_Ha1KL4kKLjN`!VE; z;4lz@dqg;lP!}+QTBQ=nfDSWAuncVuLsFip?Fk5!!Iu00Om4pl2jH#rLVky?p8+cN z+jcCZTa^NzuQJZR?|}?A?S4&|_4Kwu5D?P{N~kKY@jWnE4PyXz_`x7L@LXEh67$CC zXyRCYqE`dt#W~bV{t4nkgThC!+3u#@j#XT>rvGedxQvibjCbX`a7+?Jy*j~mc5@8ha0|%CXGHHhC zOnOwbk6)jUM;Zu3VQ8NWA~@-A#~Q$J@7(LbJn*Vy5^x*oeZaZ}#`PZm14Dep$bN0S z8~l>+G-$o-ad$GKMz;f`2?&=8>;JH`ZRp|nF#v0Z{UNH17w|0qpbF^C6#6FX$SsOm zf-S|*;0x5<8mh{O4e^8oQ2at1vF1v|KCehi3YfIKITLX2y| z20;`W@9u7SWVJK{a%(}eeY?4}23!xBMx2Zlc=0B614EvF9l(j9xPyMUf^sYYk64Dl zK1OeySK*HtZ=f|m^N8MJsA{^~+sWQ!V3F~gDL*Oav%tI+lC%d4##6w*4u1Br-ek;KF)qeS4A@1kfQ5Y7gT( z4mV639;+s~em0OqH~B}3oK&?iTfB4 z+7d9lU`H`4dT%hGOB0r#MoBZ;g=!-+8N@FC_DAzASC1E{zZGr^PPH| z`q|taT7Vy%47^nr(d;Q^XfI?|{m&|}Cw6%PBS0^`2%KJjfTPlL; z5JQ#%Ccnsh{9@|RCh(R8Qht(+G$vdFDmKy|xTwFdJdH$rh69(@l&8^W$Pte~#&9W4+I z(}JH=PY&WT^$lZ-PLCZgDC|ug@th1XUusf2@=fT#e-IkLH-TM#@$_u5=`h|sz%fR~ z-bk)*b^k6xQ_5~nZW;3pmb)^#a*?TMwYKNQ$F%iWpDjDmS3Rk$a7?emxJxY9x_#d0 zlPvFfSKJZb*;qtG1w z*?>rGUS$vh$H@ASt|9nc>t#%LGqA&7>2-iG4e@L#(_w2@dSPY&Xj*p>%6J}PjZg;Z zw4NZQ$Mk*GC*T+Ylgu}J0pCb~hr*P^F}cOZd`sD9Ag@&5ExSX`Q@iJPG9$B3$>!X< z>fR9JTUn71SSB$nDwPYuHxvhe;dm7~`OoQ4aay1XI^q{o(ouAN~cQC;A_Oh8YI1-2g*J1eJ!_6=-gw(2UO>gzE5uU4Z7Xg zf&DN46=>**s%YR?2%sr13n*A1dll2Z+j&g>BhYAW48RXU&2D#&GU z9=`(z{t;-*H~~@uQ2K!m5jN=jou-oaj1^D-b>UxsMuf$u+CYfm59$_<#n>ifNCjsskiD$o;=m%M|%zdam`HjZKNInn7D) z^yWbauN{zBH`fj(kJ<36^fry-5*?ZL`0n|1YlQ*ti)xQuOSr1_Z=(L3aBB`M6bZ6r zTvTszZyN2mzv!=^&O)b6cs6akN}h?C`V>|qm%VH!9B;G5fR=Mpg+sMjTMvcp2DX|s zyD@vc7FtDoe5=$mj%2}Lbl)ovi98aDy9o(+h#Z*?bP+^y}E9gNrATc zXi@THz^(1@VTXsU`mawkZ|hGZXqO(_=i?+FO4bZZ7h9i?4VqWU-y~bu>lUx{zD@il zX^F`1-j3`ErI{MkXSiJc#mX-VESyt%=y4(`mA~f1atfv?D75SGptv+@w^bGK{RPhYLf5`-J}%DO}goIH>(-8q}ku zqp)VqujN;@rG1Z5H7I2~PDiWJR_OgI@j#MeVJ!RLpeF69a;{B6I!lu}r?7|CCzwG`5*}n?+OgJ%AaqP{ zaUj1UdVY;wwLfS1}$c=7zCDMz!-oio6;+(khdJ z@NpJT_Q0LQ+HlsglsUsx;e!c&)=czn{Y#9xV~3KB8IPD0$l^L#OO$2H3>zUN)tg50Rd4FDS}jK0g)nAnvE_hDxfGTng^Zc%;>axXYT#p$MYAQ zANK3)&)#RP_Y&mvCf?i;PLCIuW1&no05f$>Tox{ zq(6>o$+U{;&*UnzZ<7n&m$sNHxqCEpqnKB>s5sDt3hPqVao zc8n!vc{Q61i|tr+V9w5SVh>Y|>V2HCrZ@5%4YL@rORoAJyklO{$@;N4#;z61!BAnR zx0FRe{#j9o72en*B!C4qUYonz2$>Xg5?-lTqUZX znKTEbTqb@nX{02vyT3DO-b=YNPYu>%%ZnnlW;bz-!Qyh1_MY>Vri8>tOYCUGY*Mx* zmyuLjh~CHXV<1W(EdfV*#Db3ot$Xj+E^$FV1|Brz`=CCyr-g8e^JuF=yob4}twPs% z3T7j>SlcQrpd*xL8ne$=&>KUFBjx9uGPENDDEG;c8%g541&ZWkD0pOyBdL)CqUDw*hQlP6HM2~eFW0tqLk}5T-Wv7v>M^(7is=U`Zk0Yt z>kQt!ow#L3`39-{u1m+_ISrG+k~unDP1**i1%Fe?ZVJ~mY7eKQk*1gnMo5&m;FQ zOhj|3IjJAoeQs%#eb*Ar_X--~EA)&<0^zf>^Nz#T4Gs_a4Dtz*#HjbgX3XJ@YQ0H? 
zfw|tAq5@ag^s~|1^w{)PTF77YUCCacIJ)k>T3_W@e*I`lhIpS$54?i)d z*5xtERUyS_)Erf@bzeC;zNmeE=(({x{-njlrMJ&>xJoAq=)D&N+$a`C&mDLkEF}qw zV5*p^tsN2Z;DfTypB$QHNT?3N_}qHdWNnEWwJp)zB$A7FN!6V7TD4Pa@C|RowY)-Z zqjwkY-jA=ey8Ddv__pCYlXqm4+pZn8jhN(GF=^KM8?7687kK4eU`mAWxXw+j+wBxaqnt}VHD#^UChJ|f=wXdCM~mz z5Qh6`(qxwTTJAr1`}ve*-;P@>da_Iu;pxR?kBO}I ztSF-p-P`-^JDo61VAg2S&EtzyB7Z>k>cSCn&Hb_=ivwpsxd%lg^Rgu)vvA~1$)UHb z(5}A2j-2o7EfEk=G5XLrDfoyn;DpVsxqfSc~hEkHRey zTE!S*Vdp!;tu`a&DaK4`m=*nw^|KDs5r&P37$K9((lFO3|M+crgEbXm;rgO>{s4RD;r%Ez6cSYqf(kbi+uo%@*+!>w1O2OU*ORO*vqpOWp#1eW6dbvlyoRfSJ z%qtDz8Gwo=^V|9Mm>?Pt@79O8a*E!yF!J|tcbsl_S#30$>n6gGFi&fo0gJ^`cx%E5 z_`JkrAKK+D!h(5_M|~0*`U>fZvZ$frB6_J#y82@-67+=J3|Ao#mNl+`=(7<^5hA|FLMnmiNLOL34jNQ!RYN3r@n za|#t9iN2h&B!q20+Q3Z+I}iXRi-AM|$Sp*J1Y^q~b!HFI7aTmYW|Jd3nZQ921P?1+ zNL==fALk8W!zW5!unF}&vR@)5BR@KECiUXXpjg9#v^ds9R~&a9(o78ufJ&Kjo|0{{ zWz*ePB1V>N#OjD!Ecb-hH0(N|Kk@RUx8cprvdy4_&?77DRRL=lA?MXc@)~_9>|%|t zr6D8+5YM(|cZ>cm%P#+E+UZ*$7K>r?8t{aeE)H>ve=;d0A{RZ>Dd583)S=*(8x*WT zNkNnbQhq`2#ft+rmr|^Fm8^AZK%~28E{|U<+^=6MlSJ0P6lnY8s_lj{&nJO~QF8tD z0@9d5hA+jOs|3_kLT_v@#vRLnlEW{?=-JuF62< zI|S>VtyZYNRMWd{hjtp_?vv>`>h<36GQ)y$(}Js~*!@^n`@)>W?HeVoH*#kRQ92}y zTxO*ZTi(ynGr;Er#4d)kP~#<7R|o0}kJhKXZ{VDxM(fj76F?mP0`mH_Xxg#NV1~dF zz!}(Z<6gRZpeu0C`f} z;1O5^BwIKuLn&JQVET~^)sbxp%4x&yl%Hdyc}vWTmjLz&7%;F?iV0Il@@5=p{$2S+KvdPm5G8eXB z3rkwDl#07?&_c*!%xw~h^92){y!lzIQny4pAIUiW5>VVVWV17?kp1~qNpT1Y(8G+L zf#vRO0>{*A9?R6i#Dww%bDhUjI-tN{IN zqCUBp!97Z)P+Pg(eQcJk4VibN7C^XmDQi;7io+?uGVJe{@G z#!fzau%h7#45}=NsmCh_RyFG64^^BPHGkZ)Mj=$1j7yWhK~)GTRup)Mm*y<2C-a%v z2|)@766f5`k^3^Xa^>?WV$TZE&yfm-}E6lPPP>Up+ zOmWt<+>D*%+C7*VPCr%!PJ|vCRae{^$+|I!^?povzf7T^ltx#*&1#6%&b+vE#O@|t4aSQs}o*T}hd1w=M7Uftd%OfD#M5KWFb6KI55DXCl7&8xp% zIe=Rx>~!vxx8pp~UW`yLC~6M8Uy)(xVkSdue`M)^6Tf2-z20d(oVfb+6wMgTjs8Rj&wW4ZJlCt27(e)HKRzHm$8` zTBpe{s$p1ZHapccZ_;epQq%Iyr1^VH8ioQ5!pfr*g7%<9Ao)}Rm(ZU2(OeRaS3@vf zhDQa|f?qCSyp0n?O;VKhVSP+bNUbjqe1qL=o~GzGaCINo7Rxgl`(C|#!{uw8r5DYk zydURhn`@S)67&|g)&7Fr1;?`e+{|r`mp4`hFTUm8?p*A3Ge!9T&yEdc+lI@7-tp{o zsoe8upzHweF1OkPZ(c3Fwqy8@!AU@V=QO;Zz>V_syiar8ejJ-Nq`yWR6JwJ+j=SJ&AmxUMUEe>>Q#oWHcL)>bft$eJbT(AXz#R3n zpWq)JGF*Edb8L6=gy17*BaQo*Sgs}8E3Fp3gj)d-#6t;oF6$b1$DQzgl?tdSP+t z#qyh1OeS-sfaAaCS+Q}lzS%f$Ud;4e+SGNu_TB61{}metsVE}!yEab8_cqSbS8N=i zFWERb=6~46Y5Ss$)Ao~%Bct>s8>jnAHV);ZjYB&3xsB8DWgAC9X~o7VPyh@4u8q_E znT?~fV&ioEhK;jU=}R_F+eaIx`=gE1{n5th{$%5jR&1Q^-?4G(e`e$8{7D-}{>wHF zz>oYl*f`;lpW8U8pV>GpM}M?&SdRW=`9X zeSc}=fR%*dY@5GgBgJ-ydw8fZw!n zqJCxLME{?+aWYnH9P)q6#*zQY#?kzRjT82xjkE7>wsGh`vvI<|YU8jR{gsWA_??ZD z_!n)Q?*D|1v$8Y&hKj6z zZJdt3X5$2YX5-BL%{C5<{eQy7@&7NgaoYaVHcs2z-(ur*t=KsA|5+Pn{;M|5jsJj+ zll9HU`Gt+M`+FOw`(@5U z`Cr>OI=^M(KzG2E{+NxE_LGf6`#l?{?I#;Y@^7?ps9&&g1UG!MasDnFXXEBiHcs+y z+c+U`{*N{e()X)2PAYNcw{0Aj&A+g5Qh%~>Qon5D?AZLb+c;0szO!*s{~K%^#L<6; zjRWTY+{Ur_wT(0Wx7j$|U$${*|A37{g)9DO;~f+7O`!=uyM$rY#hdljidA8zr@DL`Gt)GQS{RJ$;Qe0_t-e%il1zp?tjR} z8B_dkvT@qJw{eJyD>jaS!Vfl%rqX#;Q0G@|94^JKySFfkzqD~s{GV)`%zwqk>Ge@057;=QPc{zeZ?bVHf62zt`Li|-Na-K4adf}4 zaq|8-8%Od78%OX58%OI)Hjd60ZJbLAq(5onP`+y8z?A+W8%IFt%Qg-}@h{jox_^g_ zBmaNF#?k)F#-aQv8;7B|V&iE3(#ARegN;+K_}6Wm$6vQ`6qNq=Z5-Y2Y#jQZws92x zWgAEL-)Q3~{COKk@<$to@y*7mX}hA?eyygxPP5}iO-Gkz=U`3e1I=3xYi>Q)?0Q|( z#nkL(sqN;}>fx>JS*6vxrnXm3t8Z;>-#V>+quPEet=mquw>N1GY^fdCr8T&>cJQ#) z(2?4q<66TfYlqKjjhwF?xuSLFTJ4=Wt-CjB?{;b38?3$eK0R0n_g;T^SbOHk z^_k<^k4|2HbXNQE`Rk9bXg|4j{Yjnn?2YTQUD{6vuRncoNJ~9Tkr4iD;`(!@_8d#y z94E6xDNJ5kPm$2{jBDMCwRJDnt$S%y_tI+JqEp@Crgg8j)V9$K!r^>xhu5UKZ!4G^hnGwRLX0nfJ^FPWt;dbg#-kC%EkMnagPCdSO`8xlT zg0lOko)p%8;GZpO7Ct>&(y1-*^y+Q5(@)Fq9TIr<&Bpo1Z5+-O8^_p48=IJrJSmoo 
zM}KeR41Y6nz8N{~C8Hg6OJ6c_n))hd``FD2Ct2c}PK|5!~$SMA2%%i4Rpqg0>sNewY>GO3gk;4WwBFf+g$>r^hGcs)8T?aQq_T*bo!L3K&>mB9M)K8; z+uy6hGYUlH6C1f% z)qXN^0#TeQ8RL+Gl4D{Pf7r8TS|)A^GK@FP+mc$dd0|50;~PZ$hkpgL=j3g^{?Cj z8xY;a!Q4c5-k}vEXH?cC2hgM1yRR5IgCxl4S>RYfTryZ;qje$pSiXvs_T#h+k1eXuqC@jr+1%I0Ia@A?s z_UF~W*`-z|jZHEToQZSBtIhRVIyPT1as<9)y8>x@oR;%HGjfi6XXIQoC@eVg zxskK|Gb5+plOx%zK|y7Rgvg`v?}fULejuRDhb>Xy^&ZcUDJ$y+9^3m=(v8hJUlFkR z14d2>0nP#3R(mmKc~AnMrl>jGEaUkHjhtW*!FszjWi2j8WtT*(%ls8;Xw$Ea9Eq?a z=TN@;`d{j9|CN!WvSQ?H{hE=(&+juJJ5w6xGXK;c+C&Mvg$4vkUdUpEVE^ zy`Oe@${i%eZh?Qy9x&dZ{0k$;_A?{L<%>p+_0NnPhALy#p&l!cP%vM#b6C4zz)>*w z4mw~;IhVmk;LRJw9kRoD)|qZ-5qahhe$!PYIUk0PYuI^K)VT0ISyi|rrsn4Mo(n3- zFsWH&&Gp89sbIb^cN&+#XGYG@pEq*0|6t^df62&6p()UD!vt%`C?%@1tAy!3f|IOq zipi^~u%T#vua=>>!Gbf2xEQ96jdKezyF0>tdAt*13ceSJUH{mGHD7a>aGJ$cd;2{&IEpi#+Xj|-uURb0w$WRWb8x`WZxsS=iTUkl0ne`DDUW{~G zRlmJ$%BoEDFN(9}A;B|^Pf#scpSZo-AmLU|D~fm;)}%GVeqtz3X5;%On>FsNX$kHY z$gp_2b(Q!C>{NN#wa%v*(qY`A9M>w=(-=EXzq>nQd97-&^I359yL)rTu9X(97&+Eo zHFAiN*Oe)|Vn+{Tzdp94%Ws(wWtRQ`f1GU>UxNBkDrJf{b?kbE0aqOJ9Hmk* zF`UjH&b`;7wIWjU2I$0ardhJF3oP(O+`GG^x}rM*p0Wvc1s*8xWmR3b-7lwXell`g zzB6)ybx~(?33xAjn36My?eIg|(G8WqHgdx8arF|M_}vO&s?j1LgBlNbKfGw%R=gpONfwVZGmkv1fp21Z*;GXhr))y-24#Mel z{u873pk`;nOQ&?N6foej40IR?ca)9A7S4H>dvBZ#5l*(ot=e_u3r3Dog}&><(nNmE z889=A;1~axQ-w{LV7Mm4+MV?&Wml6z&GL>!9U5A6)-`@Xp`|n@kqBw-x}IIivEd{;72bYn(k|BlESX& z$wi}8-zfy#!%7>I;e%f>avJJQI2MD3zB6*@-y1o-KR0r!snP8;4gsm(s%+4-W(#+8^O`kAC&8&v(vMq@ zp&8Puu$XM;gYxb%c}d_B>d%avTVFJCDC}iM=%x^=mXUS_%%)6V;|v<2NIj}7YyEC^RWf8>z0s;1t?jQNie#f*0IPDnv}%2@ zbpX5Z5J7H9*TxnZoTLuf$2RgJ*(UmzM$WAtjU4)K8aV{^{@W5>b8P*;VdUts^YFlu z!KrzrQs>F4_(o$HC(Q`YwHyEzLR@S zo&@hkgf(gNtrl0+(TLzXBi=^ zP^zGJYsjJhuNgU$pBXv+?DtIAOp28E?7oB}B21{roNm%O9gQ|LB;xkv2URiL5f0L0jj}$gRm!X6G@-{KDS+_f}awV};C>FqddIiK7UTNlFCbZ(dC==Vkr0}VM!L9y7|tLkY~UR3n1R^~RH_8vd@rFL1>uFa-;333fmZnF`Ct(s>xx0bZFMDL*{4)INR6=n(Qw%xMJk! zuop1hOJsC&oa!NO*W(UrqLWaRumVi~ddNVUhYcFC$3GLgn@#p*xO=*gt!azh)|Mvg zM%$IO*^#|X5Ms+8yl1vnOpwK`7pdM+VLQUws)Nee&fB1KD&rlCC)J_By`xncaYGPN zSKsk+!HfmuckKnT+FJDl8NQ|atn6;~&>&n>Tt}NrN2j@b=W@pl6rr~U^F{)5>x)KC zH*yindQCJ}xx1yp8u?}fsYqq5HqWi?mD!)witAxzn?)Z1&Kw5ya1nco`Z1=0T~qR1 z7hKHhbu4Wq5!+BCWos5&Q+rP`+;#)Yer0$*Ag;=chfcOv|^-q44e z?-M?{I_w>(Y%WixpxypdzrzU9QVUgG-w$&|n)qK;yT^M22*@kGZE)GqCI)FaE@t|) z@ko2N*X8b}OfB{Iw|905+PL?<=fAy0EBA~w3r%;Oy-ha9e6xJkpabY^==*_s8y0Xc ztDK~Ib9c6c^*Qnel+Xs0CL1Bz8U=2IG`kOh#s~RyL&6T?@b@rg>KLzU82ypRgvuVD zktFM^5APJHgAjPwa&Y{_gK)y7-mKt#L&Pf2${jsW8x@hBB&e>x zIk8`emd8arCS+3u*S+T^jqC2j_6pqzU>c#7_T6Enpz7bUrT@an8T)ADFf#pz{^LfD zHInDdB_r8_Okk4o0on1m09=fcuiqP#J;|rUqb-aPze`ZzL7mvUF9y3W$w@O0J0@y5 z{!Z6Yd%5t^UU*NN!oi6G_x1bFU(?K3b0Kff{y0WqCx5dlXh@LhQG}!21J6Uy)#dC+ zHVsiC_eMkVof!t0ErDZw{IHGC4a5`h`0p4wQU9os)4yWm(2x;!+V=Guv9?kNm8Et$ z8*ZbPZJc;g_e|Jz&dl34lBKwTT+cRNb!^AvVXX^Co^j0Ft++NOtE*jnj~`Q=9IFbI zhdTiepiEtq;#|TMhfQ{@_0o0JnE>|A`_EFO+0UI;P+n*Af@8fFmqr=&xnkCW1g3gf zp_(at)7)-uNC#GdKB^r|=v<0hg$o|~%*bIBX^O6oLuAdD*@gw}gV_^^cZM`7M4u9O zsc8Z=+VW!pwE7)a{o@EPpu1rPCIzno>aLY~gYL4{&53ciZkowndLxC7TnEuhfik7g z>|jWI{}T4rZ3$XE;lfC!4s6b3jwNI<5u2n8NJyzcv`7XrPLPUHI63~1&y`h&n$ku2 zG+B!qHs2;t$XjB#Rh5h2#KX=iY8&=dNaEul-iNuDR2UHXB_z!|A? zFVGwv*iQy+8!Gnc%452|D|WfFXK;G@-ksOGM%UWS)qiK?q!6uacC)|mDNEgQv{7*5 zpEYs>R&z{a(;_OxBIoIq>+fsaiuFiz--%xY>}M+UQ)9jGZ;nlP)Pp(oWgB1#+x*ud zF^$%G98D068U$k>ughY0b>2{L2hJZaeBkB05h3hlapQIUZE`STlD&el$ZLgw0-=-1 z3O85pxw>E~?G+nfDtn;7_R3(Ueud5h^7W0~K&x=rQih?(a1ty1ukal{6V+ zv(Qm4%hmgZk&{e3=ICOvLD3P`we-ZK_!c#Oa8>PGnX%OkY|4=5@y^BJC>_2qf+L5? 
z%aeLhwBS4R;G?&@s(MlFcXqK(nYk3T264OWyw)kYaqA}|C+D|~oC>%=0YW_wx&6yV z&f-b=eBum$gP8KZ0ZWlbIb53!7`FB6!R5oI=|Qq@rtA(^jI(V0g^`ngAvcA&DT(`1 z(uVwxM$Q)6hOHM9l#$ymSGwJPl7HTD|BkI6jT|42d20t+v28>@_nwwo^l6 z?ZKddjeFk*@jQt<7~HW>bt0~a@AVV@EeJ)wlM(sgp4N%_ z4_*LA(@sqt*Th-J3&3h?8WPn@f2)#6%NxjxqKCkEbYJbQu7j)NlCC$XTW!%29f3lTKk1{q@L; z$!&5{Ousg(TTkPg(qLT0l^0!&$4`4>6NYPAI(K8G(jQIQs+|e86b?x`W7~${3^1}V zG#Mk_CK^j$ThgWn5@DR}Q+99Dj!b#`pi1iD%0C%7Dqk~l^49B*<(NJmy{pdBdm`ID zSDxM0>95fj?#BWc_615F$e-mD^?OsrYrR zQ~d6kaZJ0MQEuBEGkQpoYuk+V_@!shJ61lvT|1($y}I@ZIeqm&Z(?7ef3vWk9WqBW zue|IEcQ0)x_l8t|4?JxLLh6_(6z@HCQZg{taQa#t-}0-oYXb8P5!rEg!SmSkvivR8 zZ8mpriFsX}l0B(D>^f)qsCrCXE&`r%?*t(XiK&cK;1NM?6>K_DO*BLh7m%;B2PN;w z>^gj;*pdmy*Mp{?b-lb3Z>8!fj-^l6a-{Qqz^dkD0Cb6*WLx1Kx=qP52@*;*5iWD4 z7{j6@q7sTl!v@?u%C1k?oCPs3PA!KYDx%!YM5d6=aozf3Motk@1qOm3!VMj3y2mU4Acw&5&rK(U1;Y8?#wl(QnVsGRp8#`4%_n(wQF`7D9LHyEG7sy|$iFt` zv1&F;7lW?pfg$acOf_KY?sibg;_RTrFQ!`%EbFfjAB1 zw&U@M{Z+Rjtef+d?Y4%qV0%rMdSulBfK3np>JTl>d}u0rYxV&B8K&@HPNXuI8lx!$ z4B!C}kWAwc*Qp1aTGNG56G#MJ2fROr1&}@s#p7EfdB^}1;ohg2t#VuwL{)i_59MtQ zhEi8XqzDq)zGUQt;&lihK-PbcI>K*Sxo+Pr8bA|v)qyuvP}yVwDo7t_H#Dm1-Io&u zK~X^f0Yu#+4`7Wn#~ubP>0KPHZgYZb_2`nl4YJ+|G}y8yclpDfwrl>2Sm@#5b&uaa4|_0O{qW%P z!z`fZVYfmbhF$mYUl~pWk}kcP#t&zo0d8qXL_8_>AcS=^#eLPt5r~ul0MTypX#qGN ziAsoGE4+_B4$%OLiUiTRvs@r{MU6*ZO2Wjea(KA7DiyVyAiSh0tgA5FSL@LV16pa) z0I*w87YGUWaHoP>X@|F6o06(LU9GlSFfyclop6uFbd$nH0MKe}UA%QuM$*YLBQb%J;H7!VYTb$rNV@ZD=$^6SjJU5WYRzY6+5H6 z_XUOx0^TI}`$mpW5&jAxEUQV^po>{83Fi-yuRkz%8? zQ#ZsxAd;X8TnKE&+qXxUYZnN# zVO0TxA($$@O~@axQB`FJfJrZ77BbAV0)V3{E6pJ`z8W^I5s-SS*$jjwQ(cctKj5`a zI2wS2nv0vXm*r5n%rp}PBVE(mt7_Uk5YRhP5GaY-!gb!mT00rqW|3_otrlkch|Pf% zo-u1Utdv_6tHIpguL>9g9>{e70pMw^zX|~kh8h)knVE(F1+mopZQ+ReDiC1l6rynQ zNkYn#Tf*|yGRv@12=FB%$83FtDnO=k+l3cuXdvyN&jPD=_JwT>rHBt(7H~DVgp`0u z5Jd*y1*KMqS`}z?c_1Ro1*p$8+NW@zk~866Cdhv@y=Ezm-k;cePsu5B^1ib=$ zfG^{8JXj;hrj^>GB%ZRVLPMga==KH)D>77IX}5C>T`P?fX@XTPnmL|C8?HR~#uY9UAiOr8ClyjOD4`0RxDDtMX`mT3+<}r}2mr^yPB>7}RO?hPEjtnw zldj?6ldKyJOmS_((>J-$|4?b z1X$u}t|8T0icpR+vXeCk5dtvT+y~1KZVU|_7UmmZ+JfND09>b3CX=e5N`qxmCCp)H zXDTeSg2)Yea+$hmkDKtyzJOP(o-)9M0EkQ~zj8#4GVE2S3tJ|Y&3!F*Ce_Cd%%)Q; z5kiG_Q>9>u3Yk;^F`9xiz;P8e;|gF>+axjpmZJ5@7toh`fGZZJ4O0n5`M{H-)V&kU zknSpB6I0B~gzFt3AgmG;Ra{u=Vl`9(qXK8y!61L2O9*Fxxo6`S-FJp{Lj`0Y2sOxl zn|Mhry9_qQs9M=r`N&lTeX4xg7T4Y(ap+yGT{KovR!bDF727oyUY zBB1zN=bEU@OYf>Ys8leac3w;K=o9z40%{I7HHRIrngI%gsM@0m`$O-zwr+Ha2Z2TpbEYd5Fs7Ps)>*Y4xOK)u0(5T}eTwdTT2&^>RMl1ODb&#zDoKTI zbq;e^b@i+G^F~f9%v6=Sn!}B>+D+Vp3hHLKM-@xo(%P86UR4!j7*PgBH=yR)ht7p5 zxfwwGCzz+FFmN=$w-g~a*G>dzoiGsEc$BRS(4J0kopJB;va};V@(7`VoNxOV|1NV+CVx%YUcw~mDI)Yx=gH9R%PT^Dz zLuyOe`s`Qk8-ZF6vaj;Ak7R<0HI{_iFCJhQ_fR^AFAw8C1hx-`d5aiF~~Ufm+xu%+Vmmc9}x`zLAw=t}7i z?ZngcX3-Ib-i?~t8%NwGqVi^<3%5W=6XHI!*T52)Nc^K9R+*hIgIB(v1$?ZEk84I zR_;OsV_7#oLJF%!29!I#d3DP?xH37C>VT)BqI}rN-ok2k4;&irO4(S|NoMJ;fI3G= zx40n?fU0WMgIFpe>Uz!72W6t&Qt3~n(r0p@ibU@jkD;fl9zE5zyv}i-4kF!1L243? 
z16LLkF4;jEm7(a00Q&YWCO`!a@!S}+)IK=4}ef%$a_>3{mY2}L>Q5Rd&=i()LxOeUVZMD(D+JQ&$FT*!3n@K%W z7CaGvX|e3L3kMt5O=>(AVc9b;uqTY?Zi@`)(e_eljuE(9u{+vGb&Cn|P6ShW#yw6b zGJ@yzpm*2k&PRvuz%*yvFS{QuJfpVB_BY3z2l`a{AUrtHI z0b!;(r{V~a=C1=wgZ(LdL3DBuU3IS1ZP9h^@*NRY=OfX3iIE{`ymU0wc5SR(6n(|W z+4@I}9O20FjZ{Qtg~W#|fHPHbqEexC9F|XIyi*UAqjKcCK}ILRBmhxQg^mL1Me9?m zt=0+yhT@5v6h-WTj7T7ARwC!FO=>C0At}2=PM$yNSOIY@x)SAHz~9Xymeg&J6bv20l7T|UZ*wM;1Z1U<2<7^>-FX>_-11Cw7!?9T1GEkBh9QV# zMv4i+ae>Uvg0c@GXZG~51h6FYl^Z+*S0OX@d!pC(*mZic>z=T7ZyQ9y;kYROXh}4% z)f*s-i2&mI$d=XdirbxrAOtLmtlWX{fa+#6_Ec}+47&$GcUPK%bRu1?tT+KOK{wMw zkOS&()xaGA+IzMtZ-P1p+N3jO0tVy9xaq8t6@)N$PX;a=V;Yq#a80ZXj#8ezisaNB ztw*SQXXLp5%*Yumh_Z<7{oKguk@K?tWaQ{u;HusCbn3G3y%9^cG2NGF$G%~oc*^>K zve-2z^O<{Z%ch4ux-D&Dy%E-~khTQguH9<#>ll*k&#@(^3@Gi+6jHxD89 zmu#`ms8Dc7z2Gg5Todl*(*_2_G8;3hN!d3<8+6KIh&vDoCuELpRYwUD1dCEqE?lg- z7>2ZXBU!MT7}k8eJy7guc*!W*0HZt!@;GkA{}Q6?<^GeAL$nZXn@lhkZ{Ki0rrbWS zUh>+#{qJ@-)VgPLdzw-Y$W}R4^)_6Xb`zJSqvLNoS1pdjj)o0C-j*YpGHZrAVzNwy z7Ss@QzQ&^G=**gc<~OE-SqAN%9#7g8 z-e-3lZy2ai?9zIt$<-{4)KUs7)z?+(HF5H}+G~M)y3l2C6w`z^y>8krWYaX_C8xAa zYfM?M_iF6Wrqk~Ox-Bo&s@&O8{a)oR8|7fR`5C;xC4sBmLb=9!!#yV@&MZmmn=9qc zwcL+>E?~fRhVee6*@vR7b}iBF&>Q-h9QW+;HJ5FkS{#{$PAOascFU1HAl(p|U^7ed zk@vZyA7W9b6srErE+c^=lKSqJWKA9Niu+uXz$OPl3Bg1SW8?m8Z(Q#WMvn4|kpolu z(a2FbuS)x5H< zlaWJUw_YA)nb$KW7p~lI+JkXUb4`VXYZ|kCZy|4JWYNtd8@U|~BBV8m`p~tM@VGIS zBS4IGb~wztgG}jPF>;a<=wzL7@wJpz&P4whTn7zPhc#yNM2C6hy0e?4dg_yHA#xp) zT((OhxXhw(d0E{!v(z;6!tacn{_l*Ot=}0rN^~OP{sj_Tu@!RLonbh&SQIb59&Wh* zb)$joQ{Olju|8o5&CcBVZ4usBm2_{h4HXUXMKPYjk@`2*B6jmO;EhF8$JLMFJfd8U zY3P;umoQoDhBYP$-y1nLJQNc!Nrg8O2Nj1qKs`~m(cc?6J#s}q89AGYH+KY-p1vP< zTF2YD(~ow`crP_VIaeRm;(!(SXygZKF z7(?Y3*ULaWa}EvtK_h4TijmX((a3ojyX30x!8>+NI@vxJ$JmA9a)2v;X5{z~v&Bf> z`>I2l6`*_BJw>3|?dA>g_}3k=o*#{zNrm8(U^zw>SE;JhhQ5o;fwGda%R3KBxlBk7 zR`!+^2G>eSVs{VLE|gt4^j^xHd1|m8TV52YHM@yx3>KH8wD+8^G$kZHT4F~dW|Oil zxs0UJLi9e49|KW>X$d&eBNlu#Xx)3ic8Lq}G4P-v-v@KCJuQS&oJSEVIo}&O&Ht*A zb7O8qAVL5|dj^hbuOF^6987Qz)aScFVwKI;jf*9f);@~c8m*o)3RA3T;5bAM7Zn`i zclU;4Gm3cR3dY#?R5YO@&mGZ|_GDomFNce|kt6JYFs{;yW-Poo)=PQ}TdmqGl_Gyk zG!(w@CJu9crct&EUFRd1jof05RsLw?%r<79ub?*`Dvp$&bIQ<;44~X6!(B)QyakHn zWGHxKj3cR$1EhGW%%MYyd{O)S(DPOW{7H+8OK+d&aFtFJ(0eZkxKS*Oo;&b7SV|HU z!BjIGz!F~9MYk7s- zM(-}(y&qp`b@v(T@omF*Chy27w_Q7G8!^dM_vY3*f1`CH?*gy93#>_Ldx`uv8#!^Z z-WE{DOxWAFceTSXigBebW?~1yCXjHGmf2%J`!<-A-Y93S%RA#ll&9;GDTc>%vpe1OGsH~anv+o;_tMv! z20(cQbIQc%Qv0zM*C!u^TO_oKF|7ZJk)x7M^OrGGrL^J;5^P<3j9Te~-N(y{e7cf- zI%^vF^&3(h8o9}Ik`S2NI-5|SOY`{A$eB=Z$@1}tRZ?9sa>5mIP7z_b_HW}@ZnuT! 
z=veT>Vyu}rWo@R*CPl-gAt=0$kv^!ka;hYAb5~RzBb~yI0E#Ctdw97YX|R z*t-vZs{j9g{AX|+oWsGf*Rkr@n8V70{T4kHepvVo3ah>$qoO?$npBeMvmT} zjhs$Rvx~bNUY0tnE8FmLmxYx^SBC1`u*L$&6Lz#UKDJbzd8HcFhI zZFBF7&J=9y+eP@HwrwByO?xX9S~^C2!q_zL)ohjluQqE}KHXf*C$6`Ek)!)Bj2!RZ z8#y&o#H}Y_*N!+m48)FqNR)5S|`qv2a_F^Tk(>O%^Rv zFW(&|EeY|rKKf1M?#}_0!4kaL7k;S+gsHfa)8_&lAzOIg7HLVaIhoYqYihpw+s1V9 zWp=B}eyxrD;hY_8mLp@PQPLntCg2Ipf?mG%Kiae~P*D;ias2v2|~@6<#7$>V0;YWpI!^q7nsjSqPH^2CzU+sr!K5Y=HY2QcqFXLLj@Z$^;` z{AvT+n&o2;A^8zYEE??9uezz_Y4d8yswZ=Q7ZMkXRkgW<88Ddyj{Gqt2$S~BQOmFEfwgGbD$WN zse1J82+SkU%Nt>lNOPC>1dF!?DQv0g{I6k;B@LTEKa2DRiJyM>@0LEUCy4WVCF+PK zgSK&N^L}Zt?NWcbVm!RYc%{0T0)kU8;M$`1wr1P)#J%Z8j?ODCb3(CF{R*@lF)wSI z#*LkPlU!Qd>0OqH&Zn#1?GQE#&_fY(j4oIvVbYD%o?^&z#-gP2^zN3 zZ2R5_%Yx?gi*Qvnn=y|;Q{)~5`qS56H4-A+(IHlGmo&V-atTe&yNal~an9mMYr0Yb*2Ce=FdtPZwL6Vh+B~xyfVD z*E0^8=Tn(~W=VmYK>gFL9#MlULC*qg3+m#6HO^OFda|VGF!KEywX5BPlvj0T-T5c4 z!)tz3*P7x2NAT!e2ic6~Pqyk7 zZmaUQ8&;mLsuYe4d^?3#?W#d^8UI9Ik?Pc2yLF3MDY^2k;jPc#Wp+ncey^(GHL4Y^ zu9Y^bTU=eIZd9*dUBBF@!LqtxwbAXh)wkChHTqRIZZ~S$TitZns5!d2`KZyI6y_Y3o4WOdS$v#H)NcIsv^k`I$6vHA4MM-K75VNmo-{mvBpW(+`Iu zbd3+c^D)(uhvMa-+K#4La8sS223@$R3fxp{qD~vP{Cj*(0% zWZar)s(W*tV&v$t8aX`^>q`A03J&qk$bMksRA8Arw#Xg`tVi2-!9uUmM6{slD|XV$`g9KzoEA{=C8?jF553Swe?|y zSn`&S-vk3(N__65>mCpaTvxWW_uB67LfhOcc0PYtc0hQ$=ZynjKYjcz9OP9S&MB-H zDzan4?IWW4dwz%nZ@iPLxb|A8=*~^8rwsOf`XRc@zx}NBalM0LySH>-biKakr`Vo= z-m9B|Uqh?JPc+ca5!h<1Wr;iWre|-lDDKZkTJD4$&xb&CEXp*_q ziP4l*%S6Z0oIFpArF$F_eVXB$b>ivCz?-7aGItN2c$W1aBZmwT?J?$9h6>|LBeN;G zm~eZd!bQ8tFoPZw-b1Je8T1g`+66C=g^E;k9O82A!3*C+MQNrDAvU|1iVvcq|JRKi zCuSo@(bamTCx7aRuF*vXz1FJ_@uy{@$FSC}E7oN3AIoCE4s$i_Vun@|BFAP7X4Hs*44&GRtrOycEhN4`QXY}d;Qz?8(-V6)pcBF2t1A56ojYw z(al*7&%Hhh#nbk<37~k|2k+1*p6;PmipDe6&CqxTpY<|pJY%mYAkX-lQ9$E)Iq{64 z@l1Yv`~K4;0P_6LX*__mbLT(O4zskAUB)c!QrA4$(OFM0JX{V7Q?F@mY{O?)yfN=K#X-DIT6=_D=(WOW`o3C2P zoz*+Ci2pp74w6^z8dcgEWk@?+Tm+9onK52D(IYDOV5Q+#RnC2}_JxWlwI zQDZqk+R3F!J1|+Aw3E^aNIM}TNZE9e)Fz2KinKFCk#_1ZbZJMGBJHSBq#c7s%#}iY z$mVePVp3s=a<%{a^YF!;8eK7%D!Q~oqDecWfV4x>q)9tTfV4B>LX~y`66TY33}&RA z>xeG8v@?j{#9&7;fV49TNIQz7RB0y!kak8X(oV}PX$M1*c9J3Es#C3~L=AI!a4SXH z!N`Us0n$#(w6vo`m3FSvq@AL>ZyV{-j(0di+KC(y0$piPS*KG;)WI6#tRBBe+|6Az zNsypPJK?ThkR0p{r{9Du)>TR?7L9zzt^>Ia^EIB4b}$P|JMz)`#xv4R9h0G z%)9a(AnjNtE>8%Eb7Dim%^zt58}adhbRP0QNxB;Lg(B^M^{CPgM&kFNcR%xeHa9Y}l0;HWVK-wW8eI&6G zRB2~Rg4UZviD_vki7xGQGNhgJGty4N5LMb4p-4NUG-*eH!>R&)D|u0>#LZdK&M4BB zChg#6Njp(Y(oQ^0+Sz2H*0T9ykYA9(pjbn=W(140(`MWtoko#%gv8@0(vDUli?l-$ zoF(n3{6X4LrAj-g5}7@|A%h5b4RdctkSyqGe-{ zNijp((Vs2tC`zzMJ2fy_K-$@a9a%`)*-DdkN+Du2X(x4B+9{rqcFxhHom^BDi?l;X zph!Ep^GZAEzm;~pe1s^{j^Dh}P9q@gL@X%n5Eqhm!~tn1iAmatWRi9^3e7L=w9up- zXS%er4Ul#sDbfxayJ>R7DxN0o#DkG%@<^h9w4=YtgeL8Xav(yyIG1g4ld}riXM)X7 zI>E~(mHgIy|H~($47LOFO1qB$R4Ny{}Q5;_hJ7@*2)o znzYjfZ=*^(aWrYiiU((>KSOV#3XObu2;5UAI{6bqKC^G!^V3(IYN~GqXqjVGF93M z;vSlocG5ML+}nh5&lNiE=Xy0~>Bc%}sQ?c{+PO4W+QHMMorlxXPCB!+L!3|AdH&C( zotuEP14jOpv@`rCY3Cb7+NlDhoe7GxQ$>|_9<*gY0HmE9_yd}>tE@Q=^FmnQbWh{0h5vk%?+abZ`+&YctRU8^0kN8|$W-P9Hd zdofq*;;RlX*fskuujzxkR>BMop?Jf*9TQ&1CnF)*2teAoNt1T!I0O8CC+##&OFP$D zrJc^-NINT#bZMuDD(#Sv>Bff|tyJ0oX$OLwE$#IGLE33$NIR8?Wrmy%afl6My0lY3 zlXeO?69|Z+V#6W~;sNQ8(hiAxaE`P?GQQNLwSRZ^O##6aHv1~LOpFC?qZ4i;Pking z0ql-mrHXENUB5+xTXlrEt78FY#^Ss>CTXXND(z%4q#Y2~Jkn0rjI=YT!Sj1*=fSkJ z^MEStkS18A9Sp0qV?md8MgVCC#dGz~(oQ>zwDVwIX(tRpk#@Maxuf==T9^WmH zNA~8>JAblZefA~(b~wK6fW5KD?E>ASqJ{NbmpbB{(#dpbCl`=*iVzfOhlrdl?L7Ri zOFJu>q#gXUwDV|=wDSwEofK=N@jU zZpzQkE1NIHW@MJ8gd_?Ig^Sb`V@GeDskap@@PjbEnq@5N(2K9U1 z#A|Mn*C(;$SoY<4q@BTaLuPDTuiYZH+}_VB?QH(&p0w7`ghXKgy{ceX-5T+c5DG@N5O2C 
zv{U$JX~#-5oaBnKb`&)u{YKj94-y0Cld#nbOFP{uVvQov5{2-S=eUFCmv)>^Xx7R2 zOHia8XFGW?JDMu(c(6!2(Lv{0O_LJ|=bVh4ucFKM)?Wj_u9eB6|v$Qj5!Dm&H?2H{&Y{kTA0@99i=>`cV zY3KTkwDW)>?c~y>9T6=ZnzR$=GOBI-J87pYBu0}i?Tq@x>^RK0yN{l6LM%po$EA5bKvW-kz&UyS^+QBC%sSJryq@Ct$6)m9)x@~drWa?Rr!+Cdy2wCmi=8|(|o46t7QSuj@ z%k^Gl2Ck}g8{x`D7`(iw&eCpv*4?QBR8k!bpZv||TIJEGCLLODl*YyfG82uM3Qh-qo(470R@ zvNvLpcCtYbWxBL;Y@AOSkaoN$wgA#jCso>UqDeb@vcnnDjxi$L1&Z#YOFMlKD2PCa ztjYx=;8Pg*(sgVwEQnw?k&jURWY5Fqg_vL3@ne>Dg|ol=Ih6FrZ#Gt7{74pF5YKh%u0bAm4I+@wl7*IedGJ0%opr$*x>leBY*D(xKe zqDwnuK-wW{?gFG8tf|nSq#eKFX=!KwjI`tTduivuw6t^JPts1vY-z_&1U4h>5F-3$ zNjsr4(vH{s(oR*xJkn0@LedVO4^`Un3!+IoU|*WF<3F#oW9a(NrJb$6k#q#YZI zv{O01v=d5~cKn#69X}RnC(cFV&(h9ay0o*CF73E6OFNFRg`^!1nzZ9am3G`_OFQ0k zq#Zy1`J^2g4j;Ryuo{>-RoZc%EA7}v*#XjyH$~cU1Ed`yMcVP6mUcYI?S5^5w6n_f z3!m>7K-x*Mqe(m7G--#=b+)u4%i-%Esf?K`?FjkOrJZ~`997!spC#>Ztz<|$7#Ks^ zv0;&Ra2uJW9hvCN>R3w@`|4u|yGSQ4F^%<)<-L#(CF2}b>|ubk5(vBaqw4=ms@n4sAbY`R-=f9SA{C+3xRGM1#u5OE@N;~c@*a)pJ zGt$l`W@$$QHY4qba4a$HZ?toBVUMCpJ76C`+A*X^J8pj^?L_=Z+VPF7_We6)hYiLe z?QpYCOFMj4zms+b>=uxAJej2(cG&NvodeU-jt0z)Chcrwl6L&&k#-1nGt!QmPqUlb z@1&hRJDRj};P=wb-QP+(@RfkH!yf5s$EFk$5#$oS3Fon0BwAMlx-#&opA@u?9MO5B z^@s+W6N%Goh}$6pI`D1CvTRrf)oP9z9;mu4DFWgfzl#uwnkDUQqDnhne~@(vCHYwBtNW+OeA@?SSp3r5%5fM2fVt zIkIL^H1rvfO&Id?TEr=pSU-lelff+QfLs@nc9Q-q?fCsh+FAOKrJZ$v!Nn@=#M7l6 z=O~)Ag2W0H&^f&w=%ZsVJg(w+hOQxv+z0fT%RJJKFiqOg3;?maf|fXP8E>FVI~uQC zXQZ7?6lrI2UTH_0ChgSFq@8_qX(ws}Y9B?~;boF`lIYS7PF~hHk1FlFzvn`gcDz2$ zC+)y|rllP}nzSRtEbR!cm{;1VrAa#*5rDMQ1~&kt9X46}B$~8iC+CMrvosqiw6Z-i z$Smz7YxzV_r5&^_x-m)n{#?=B$i+@?!AjVd`D@~y@ScPY}2phm)$Z*!#` zdqCRh*)^IFQOPXrnBshrpevAyjg?&@adtziq#lj}L7-XEPP0El+Uc8D+8LEQ&MfWt z;;7QjD6_OvqM6E&c52u*P^6uZn7Pu9qEG8=X{TylX{UuM?Z|0Sq#c}T$Hw@1r5zGQ z+Nq{WJ82s>Dw9~H9TKawLt;ofF|XJZ^Z;o`UXvm1=uoAd7P_<(Hb>gQP^6tNfV4A+ z{xzzQrlIbQD+Qz-eQm{erbtbX5+j34IFtJlMqNQk>iCJa70Q6L6QNF#cES~K1Nb$# zk%9a6xMob;)CmBa2c(^o6lo`3A`^v2Mv##XG-*ehBJH?@{aMP2ienXb z56_l%;^&ukNbz)O$Do=n?I>{Y>g?~8atJPMT&fnFdU0S(>w3lggKE3ZDnEYiaJ};3 z4YfU2FFt-9e7$P$ry8)8_1K~HAV+!#yzfi{yA;oLt4@lv(~lsEL1TJo($1z`4Pu?x zSdBVx!hg54Q^A7OibdnNP{Tl5(MhEnIql>tx)Rl9fx_OoqEk9H)J4k*WSdP+Gk|C z`)dDVc--gm6T?uYY8hDN{L)SgAng>>r5zzXy0kO+duhj3WR|p}LXmdH=96}8S)`qt z%+d~VTG~lxm3D%rr5(85!qN@~&)3K-?Xa0E&z5%B#TnAhSJD4rX~*`<%Bf#dho6<7 zIc4(DVz=abyOfI9)4HNwZ!qLFSPbYJLi0D$PRPHKc0{Hisu&poy`h%&^{5#7RCXR|=cuMg{fxAOtycx4ogH}GENRCBzu0{x(%_hO3SHX40n(1kENO=w$}H{Z z&nNBZ1JaIHxZqcaVriHbP1?~7Gtps@cC?wL9W1M~6Y)3F4m-A7KEh0HO%NP`<+k!y zFqkFnj5A3)15|0pwN6kc3@bA&?Ibg#9XUO!w4?7&k#=-{C+&F6k#i-CF@lmm0}t#2EDEe(W6N_-)2cW{uF5^gevWXP^29lc@sHfK-w`; zrb;_}vO#odM+cC0Fn=lS5T~V`1{FZsNtu>*wCebc6;}OdkEpKQe#~s-!|{z4cQ&cr z1|MXL28zss7dM2QU($E3E(njr)_qQbDmH+UxEAkdG}~_bAm}Oa)O|3=s$QD`$cVfN z8h&4FG&Mak57pBu?%u#_hDZMZ8vxP{5nqeFP}osW@X-EZ`f6bvJCka#;=t)k!v#hA z&916;>aa>X+RW09N&S@+y0oM7J8365T$d{C=>I|5Ij-PRr#Ltx?Wh9M4mO-g+KFYA zcEF3N(vI$2X=gtm?Tn}pR@?bcOFIcvY3Ck9G2SFujW==~))KopI%$4sM?YjCX$R1y z0BJ{9mLlz}o{@G^W~7}Y%^)XcX~$q%+R*`|9Z@Kgw3Dv^NIN=IX(#9}q#Y%yw1c0K zc63;!9fR4@j-7InI%2c|H%Ho0v?7We?tcr5$WI=1GG3Ur0N+27+ch zAnjlvILlVCQ0)=2&5OqK`k^X1RA~n{zqDg3*Tsi7(m{&uiV-=YxwurcU$gvYuJNS2 z4kDHz?dUP29qepr$AC%N(Z-n0NIL`nRN7&a{Tpe=iz@BFO>Oj9q#a!^sz3v}MJNsv(9lgJlc1-@Qv?Egg52PJ#Q*>_1JkpN%RJi(oq@AbRmp$8i^V#8L zu#!xzNKRn`{8gn z&MR!O*R8K>B;N0ZTB2`#lQYxbX!?C;gpPc;cEc^4BlKs}a)SV%qOrvpcJoL(OK&?tAGL}$Zn^(zlQLJyxbBXo9NSC# zbw$fPFV-z`(=Ax1S-g`~+9`F(;l1-yb(Pp#-QWWiDdBONyO^aN?~|Vr`cAyKQE+E$ z9s8-hGt$oZaG_A35<}X#VOXB`sFoq^Tt%M=RPXro{`sTphoui6`uY9q=g(8pH6V70 zq#6j)2vy4_=$=#ylMF@G!4)!->bO*^QS}Jjfuwq#r9V&&D07MA2L4q>e7Dg~?#Z`> zJVN;zMSL@p8>hqqtNEHFb`K;sNge#b*NlmjNNJW$Fyg->f6P7Q&Z5(y{4LnL%#;?T 
zOV#{$RmuiZ?yBAR!GBNVwnXYZtyUv}R-JD5)K+Ru#AJiu`pn5j$-@%wnibAuzH3pvCGq~A z?vu>-_m@sdd}uS5KK-F%)pE&?T~6yye|+e1Sn^Yk@0rt|`T}oBejeEU6FAwP_Eaa|b=6i+fm6L*PxUH#S8YEekU4~YhP&fxyE9AR z^mE5&OL}{4_udr9noN6U^xSpzfkA;YU%Q?y``Wwu@Gk)pSa95!bDdqd6gr#JX&f)w zXBWK;ox^u*+)Qzu{Si;}S<&us3xhuUqleJBGJ?;otk*fDW})*Gou02)FNC6_Kkxqk zjs7@>>2`+Oy!q$u&(;*X-qei~scUl5;%d@E`i^g&JihLHM&gANNBd6%OrG#4KN&rl z=`(OT_)=DC=ow$~nVknon=X@%Jtplb$v#fb-d&NC_AX~@cy2&fZunqsXj$ILcX>fk z=YmGg9V|be`Q$v2e4gs2lbV^8K&m`+y4cXt=F!w2J`)J$cqUIL=}N0Wi?h*F(?W zjxlVY*Y93@d-rNg}=R}DGi;N`T_Un)E zEq*Ov*Ag*58*eCk`mD`a z7%}|F;(;`?`_t$T;fV42+@vVub_)9Q1~etoYBdUiKLF65i6d486!a$$?$S^wYo7uU z;ym36Q=p?iY&^Vr!&LOAG!e})T%dgUrceYo!LyWs{@|I>pT&V+0QBc&DSy+I>q;Zd z>XBb+k&21&rq<#~b}tg;-Yk2?sTWhq7icu%IMAY=NI`#c0saR-e^%bPG#M0bCAy=H zivA2l$pPq3lJrnx_)4Tv;nhUBEVGd&$yN&b^M-={wBqRKkA|{v$8(K}6`H|D?YJwi zmOxIAz!eKhspt;}ry^Oyj8nFWivC#3kQPG(Wk|x4m?>E=9V+_c)BvDA6Y*5^$0(7G z{-h^=9*olFA&u_gh}mm2jsAp7*CQg3#P+Bx$lyx^CrN1%2cSQbm}I>2Bo+NB+!;MS zNkM>-3J01N=J}YSLlJ#j*%~InE9}&>I26bliM;B2wiIJkCKkFIjPnl90N<7j5 zrVn`(wIyg8{eh_C=;%)Y2D2EI8z_WERf>fz4MSYs4~lPE;>gF*a$@@~;?A`{2AI$v z;iE#DcX?MjSQHfsADuyeibIUrsp!uLfc{`dDCmy?75%A^q@X`sGANtgZ3u{~_uE8W zU9(+@dtWPga@k$b234@2KcxWr^JWhE(@17Qe|ill=ue}Zz@O2dmG!+jV3PAR`m;IS zeeyCP^au_8S;2(quP44^+jJU)`m zl818^3+XIL)Ow>PHI4o>(9xgp1esQ^jePx))b{C!jrFj~lA;io5z|U6%$G{n3`n zok4#lDd-PAcKPj^Y4pc(9`t7?hav_2v7(?qw?d2o^hXIme}Z|a=+7N9Q6}`Kc^ds` zUNRf~xg$wOe{|4sQ^pRMXgQ#WCBSih^v8)etqreI@(a)*p z{i^0^^amwWV-IWQ#ByHpXGMRE6DjD=r5W^ReGRcV@^yIqQY=pvGx}pEk+tge7HNe( z_LJAjuD8CA_ycDW1)x6_^#J;lrgUg>#HQvfQpmLL zMNTb%{(!`NvGq$1wR0W*PE1%w1kfKcEBX^5o(7;l*tzJBj5!hmj#W_dz!;jEEn3zC zQzYZL;KO93l0_e}i8T5x-Cz(Y@f4-VrE{wO-ZbU&9(U-V5#5i~d z1oy@}$-p%1(9+w5k20e_i4^pwV+j@g>5w#&&G~j#8qaKjt&&PmtUz z&m=8_7II|b_oV_!V*6VhyZ)8vSu; zbLtC+9wI=;1YF17E%9|+Y9zOMJlOQ^n$SZiXqK1$0D%6q(b1p81<;=)2Kr+Mo0yCK z$g%4z-q=G$emX5%eLf~i6j)6!yaBkak(ORv2TpIjLIaU#I4Iqr`k8D?}uR$kzb>*Ok6 zMt_XCdT8j6G3P*Wqyt$kiH81&@G{V!NdW!HM&L09l{2l*_ z_iIPQHuG;BX1_i>8~rH&&>sR^Iv1f!TASpM03~Yc60{O2=nslx(IdmnAk9G$A6GCh zfc_k+v6RkT8t@rdy;<%^nnr&hyma&@2eBu02{e|w?|quRC@&Gu$&CIyW1v6$ye#NX z4EGHB98cvab2GC{ydkHc@A8hL;nc<5$5o< z(}@~^dnUoII>7O3*jrG@Gy=Cv(z0#_`tuUR$v}S+?bt<{S2+T%w{sE(C5G`jboG4$9{e-NjQrjsDbwcpqDHTNmr@+;I_2gj=^EN~!1%pm{}h zUwTrh3RPq;9A&S4QJBBiaBQTYaR`p9L{ri7QP5Sy60Q(T3>9aIK#rvjrOG9od>^wBO};dos4Z1S%8i zp1Sjpe$wP^?d4vGm$T5Hm+S?9M1RaVXV4!dE&%;$1<;>9n0gB|j8xYIpg(S~+-dYD zhBP1g6M^Lah+1z!;%j$$3F7@d`qMv+{`6zu>nCywijn&;?5aQN98Q)y#6hh{tmw}} zI{GsKpg&}uTQu~?r3U)V6@AzUm5RWJ7xHzvLLZ*+c0fV*29el^+-e^pyfP&nRy3~J z(%3&2{juTj)Iw6wp8~F-8VdT;1E4={a36wG4}ku}!0yD=uhH&=+7en{fUEb zMPPN4I2=|u`q;r7PIBybf>p4fKP5ax7Pb_9K1Szigo)f%_~UaAet<9w9nz0yser2$O(KV^ykBx zWDZCU1^szULx1{^l6@5PC#j8s{yhHQMt|y%HA@h)(VrbW1ryK#QqZLFg+pE!4vSs* ze)7VRCAtWaoq5WBKUAN5xD=Go2dntP?Ma@E{^T;ypU4^X#}mczXY|K}6F`3)Ska#> z5<=+oMN759O91-giKL)ECOoszpD5(9z5C+$_oc2iI>$dGXFL4e#CF+_qKn62?JZgd zM+&`M_n*IRl(jhj+|K>+4MnYDw{hSei7D@5H1RQH5{jz2hU{lTe>$eoA4|@ckl|F@ zlIEZMF*o^TjvRKd;PkO$MSnzi0rbaZ8vRM5p+7@(^e1r^`qK$38D>F$(teNrbi?*C zqd!FW8vy-jr=UMhHE#S4$_I6ox4T#b63X00Ufy~m4?ZvURgb*X@1%N+D)(vH2A3H_O5pg&~x0cP|kmfZ;> z)8AuQF85kyyMYm?dYR^sII$+MG$;)~e~9en)`g#fZ(YB-5nRaC{TJxZB|G*B>j{pK zk171Ax*$bm1L#t$x$0VIc_&OG>5%Vb=|lCIa-v3@CAFKnF?bEXI6<6l8%*Y~i=M%{ zedS6b@z9Nj1;6MuKsCQ`6E`NG_(__2RmGnDlLP!~^Nx(zii~Yo_KCUZ&vl;14^Yoq z`Oow4M@oE>fS}TTG|WMNf>SBzj}w6YOrW=082ujoQF`AIl@6dkkAI%Lqq-_it2@Uj z_8EZwSkcj+D|*gl+s693U#>0ozMZJx_V872*vp`yUAZ*$r-%jpSsUaM)bUeUFUY$g zR#1J@K+mL4bF#wfbMC$GeD9sm-Tu@)X@lRbGbTqDdGvkQ+?7vsc0wNCy|t&r_rbX- zkMAY!{W~8$efqfn^LDcSnu(Zb3i?yf44^+H$eqD08Ggs=j6CV+j|899XhVYB+WP|r 
zyPh}w+_L!YQn`0;jp~H`Jf*(w8*NYKc=qJU)Ko@de-zJCj6qa|{I#!P6+VN8Z;k|} zcx@A>qCZv{XMM!~4*dxd{~twv@)M>WBb-VXnVh%D$@$2pyfO~BczD-t)tb}Qu~Nq4 z!D2uY8mZ%ve0t#0B6`cICwe(a4|rx;FJFJWCO zW?8-<`twoFZ7&v_k5PR18}uhttc9>xst^uoIubHJ`crmNYd{7rMMZy{bnMLqspyXn z6Z%s#%(3^}M`#KK{kaLCKOecJ2J9_PP|%;1g1!S3^ylat^e1pH1^o$@prJpB5(}e0 zX*>Qg`eP-RP%QeHJ8LIgTk&M%05ke?S6z>R{zy>JAE&Q%bJ3rmY4oR&g8rn@(4Uo} z&ChFo`GB_l4f?YLHYNY7F}&e9ulil$dg~P;M|D`ypQc&p&+d5M7fIHM77doWfy01oO-hDfP2 z;E}_2CAfG#j`|t&r@t9Mf7m$LCu^dJh#nm?8v2vuGOFiTlC3_9N|D|#V;l1jjWN-N4D`p!h=TrvWOLcnp0^!!uGx1@Ai98t{`go8<*s-^9$3QBaV*C% zPZQ~|GTJoyct`0Ghac@w{qCK|KW>quJ1JZ`s9r2$=Fr&q05(spyZK5;OV}Do#g# zjNqG}>fYLSp9msKc^Jd`XQ4mxu}}m748nj3r)SWgLUL8hTG()k`UW|W@ph?RFVLMq z0R8zD9f+Mle{R#!A2m)8D7&XFX)gNnn1cQgw2PV0pTPJGzy7h)<#hCi#CK({iB3F# z{y5ePQ_!D^ppX1)hkKX5`1v;c$0+ zgx{-=Z{2Dv%5`h&%Vah?qI5ffV?3G1i;!HU+tUwI1A*k)G*0#Z9{thL=$VcF1O@@< z4_Flh+M#U%s)-1kh5igM(4Q;5Aj0zI+9^pkkdpjTkyk`;am~K{HDcc*R%g`QyxPwLJMX@g}_22nyN0$=D2_q#g${$~6bP5v*&7^2DW`(%GA`V)A8hW-$@(9oY6 zCiKTJT&kqvVwIk4yy5zPjQ*tV0MH+e@azuL*6$^oBjoX0^`!GbTwnN5IFQtd8T3aR zMA-SBPXRmx={JLv0kS@0R7_1y&lR z4A9Y^Kpb1YA(RbEXcRaf$UuLr!yTT-d3gN=`ePdcDm+5CaFLGwcr&9vg$trTJ~Z?v zm5TnPQqiC9;clULgMP}sj*H$>!>-~V8CC;1Aewb517f(_ zzM0UjqmaI38vSwdr=mZGFaZ#VC*#&K)g0ZjlpI;E)l~FHN2C2z2n^TOd57F-sgQ2R zg8r0mHipU|opjmlL-QT?$5POrm0t6qKlxEg6Oe5a;X5YS%&-xe;}GY*?xi0sHs~rK zs+_G_oCA-^_(Y}>UGhHWJlyj7A+in%3j)E&#!~iR4!gBvGuhIe1db3VH3M*^iY#xJ zgOow|E?+`+sel2A0BLL3LPLM5>FCe;B&brz-KjtvR4J$b7r>1E z#L&?ntqh?PglY80gNFVP{Q>l6D+T@WUKst+RwEI#aYS|!L9y#k=+BMF>Kk*>A5EnY z0$UqFnLS0DL=cxJYP*1V+-c~~3MCSVqu7<(Z3g|Z`7Qe6v;g|Eu?PBaH{ubpTjruc zj>m7&pZ;m|CkEzCLw~B7(4WBh&>tr%`eQ;)-XGfU-|Xi7TlB|?hW_;b0sXNpzfJNW zfUF2Mn~bGhLHAF4D}o>QqE3eSpTeoaR|DveQ{;L=giuj<{-{Cg zFp(7W$17qT1^qeVTuw)SbWy9UnbDu1C|ew15sxQ!k*9(;0o-O5KtX?Q_+D@|z=16y z%OI#)#Khg6`fwdjb7;`W)TuE(7z!kM0j4o=R|Z6EWe20I$XsQhWlxj+Dd^9AH;w0k zv(TRySO9?jBtcJ&Y25l*b)yP$9Xh~OG=u&G)`a7n!Jiz#$IU@&BM3YegvK)0oKIw5 z(2c;qL4SgyDd|P<69lrHfHyi=+8$g`m>LU{e!6e7ry?y1w0kU8j2 zFo6C9&!9g(J);)mOVs?|3ap+4w}IG zhyJ*;qCcs>L4Py@Xz0%o*p%wHSNvlp^yf&_G64Pg6xoo7Wy+4-cSXaLKgQr(a@i&NbO=*J~D_csUa6(Gzzui z!h*GL1j>|zH^0%th!A*sh>w?p{ASRfEq*Ql`m;_6UJZ<#+t(YUr-h_%mpy(S>D3eN zHy`@b@SUh2h2N%#+W(cPVI3zn0iZu6YUW(TWvJj+oFZuSGhv$4! zQnma#3Pmrki&q*LzMUV2$~+#zg8r1hR_waqaNgn4mXAsg3)fu_e$CY_LPLLs0Q6@) zXvr=vV?2Ktlp_m)A_w%#TnWxj$6~ITIR#MApOUFn>;xz~3PjuwtL=d*Wo21ooZsou z&>vDqCr2~($1sp!vWBD%c+`d}{lBiO2x4WK_xIu`)+N7LHQX_d-p3i{J%6WOZt&|2`W zlFuN3{@g93pg&yUEr;Mm0Qz$guI-djxzQJ)6ydr1E(9b`s>^D(ddIP-$9Qa|+;(_! 
zw^qWI$^;3FL~(k0enDkHIQOcrN`;FeQ*R#~-Bp?x(MZOa^XYy8dmmjb_95KRx8W=m z{c#jcax6)d9{B=}T|5i@>5O5ZKU|%)`A8=DnH-P^1SSg$ho2m}J{dI*zw)x#n-Y@TJIceXY zjs7%Qndo#_uk|hMSjGMN-2?Mue5Rr+ZpPh{vajzO(E-pO86(pUbi%yoPl1TIvP3fl z{W(y+L0+wZhW?J{A4x zrlLRRk|FXL-|s6`-{QXPHBm3VUQkjIM?rtEXLHdn0rY2ariOa9Va+SB`4Vjt3i{(+ z5^7YA<9i@UL4UOH69jHG3i_jkzYU;2i*X3kRKtmZ`^kp)Y5?>nLc$Y3e~KySkBU?- z1^p>bbf%&|aJ^z+zZsGv=z?h|Dq38Z+(DU}I*LqUHcxc36+PnLiEl42r!Fdqxko`80HHCT*&EKX98;jq~M zxz1eewST<3e5aJ6QEOh!*2s+xMgaP=v8FbuI4(yz%B%?o*A(j!fy?8Dmn5RSV?3>h zsHwyDP<6-J#n!Ht!fF)sr(FbZjaSxN9EE06fH|>kKsbmVrJz5)1bf}#8T2P{4*GKd z6Q!Hi0H8lwQuQAoCc}KlY4oRsv^uWKSP(#e^Z@h+S02Y;J4nwiKa<`q8f0kic69S+=&x--{$1#D9{%8V8Z*;?Dgn(qU zA%Olk(9xd+8v4^N0yALa;H9BIU>pVgX{4e*278W&skfO3HZ6qyY*oX|i~h(=vd8D@ z?7c!ke>CD0j}7y$*LBb+!|FB4Y&Y3H&l-GpS6|_fbO{~pY78Q&iwu{7KyT;KUi5KA_e{7KkR1Eky^AXZ=;c%_+>7$T+~+6Fc$Qug$4attg)eQ!wOAU z7AyJ_;~i7-{>IxwZ{Y?DqdzzZe>*e!Nn+Qs#XDNLbG#*P4&BbL)nh?_2IOhzPXa6Y1BS|Z-R*L0 zU7Iiq{aJ^jp+9-xyo0hz=W>*yhR_=8(?ajBMtS)6I`jUlu_ZtijKg}&!sS@eAE__R zS8%I!%uL0MY`%Jd2V48`j`bP<`jZ@H=w)(DP)^_Th;-JF{vMwL2l0wD^rw-A{)ihn z5+B9SLVq-w(Vu%uLTV?U%i&kVEIv7e|8{SMpdCKFVF`f#B!vqqnktj=x{gHQ@wGDc zVJ3EXi*%EmRxh(G`4!&~dUW*1Ojshf%$ODZDPIWv$(11q*jnUSUO3vV8L7mA{`eZ@ z5`$D%;_z0Yg~<-?zLp~4kJ^rw75hF&^KC_#id)rY?1Ko|g<%cwf_Ad9r6$DNGw6>2 z1^v0ZBojb?8gRcuf0C%^j}^GNV(bcO(}S3E4P-nLgI`2QU+jWPm%;PvKvgG>Yt5iP z_7XO{8gKBzDqx|9Yzk>&Yy8`-F?EM`n6Z2CDh#Z$4D$7IetOq^ktI3LQRMTJ8Mlox9#29(%bbo3i|VO2_5|rl^uBbbb@fRBcjZr@Y;&P(v!%|oQwR~k1r0X zQ{;gPktFf=6fNCfY5X7d?))vr|NS5TG&9Y#&$OEMNkyAX`<`i`B2kK;Oz z^Ss>w`r{1HA4M6ZvthR>$*Kt|VO069igfWR+%P4NxRiIRyvzVxT1dMXF{Q!_34^TxPfTJ*v4L z@t}13M;$2-fYptT-Qf*(e!+XM8+ zQ6iKo=~^k0HWB@)WuQN`Q_-JU^TyF1vhKg3KX}ayH}y&A&sCH_&_wjdfCK%>pMw76 zN68y;pg+H+qCf8p0s6CkBKjj-mGEr0__1u6O*2H;Sf7=21Hz zqkH|t{biS_PmBIkS^1~1(4X53^v4-Of3zo{KV+$&Ma72B;-&(9r^yD_Y*(${CZdQo zts(1HjL~Kp0`zC8G!y-KZ5*y)0HHq!QwaUpD5-%ftRP7)Ss{zkm`!@|Swcv82K2{3 zAYrad5lKgTxxkBc z#TxXA1$|GE`O(IUY8>ZZ^w7-J!D-K)uWWHd*7!wT^^R!8zd?VJ8R(C$mHi4N?lErm z(&nW+=-D+DhRF={Cz*}@2-0E%7rK6;EP8+T5c=?Zk++x*S(7`6UxoUe)Gt5%w#y&e zA#S*FNHP@|2V3}caHBQL9?yTZxSv)*UNS#=u7I6*;N>}=mRWDrm#$@?KVenIO!OyF z%$b^-xrKCNn{YqQpH~}4vEqxS&UN7D_1MnanHc?eUZIS-$)gI#^6AkZBMALjC{D?> zacvSY7`3sTbN`;4!L;a4?L_p)`R~ymFAKpO{nRjk{-8Bs^rxI=m-Hv}N0z`J7iOE- zzx~*Xt9l*wHom*ig>3Yv#^^86pI`?1W5@N7cf*>_^=lIabA8J5|1m!6AzJ-v)0)`M zC5YF7%g$CMhLFybpMPOo7%BMao#w?Sltb443HoC|6Uz}%N~`;_Z2lsTrTJS;%FkRY zpB1y&W{jr&U-aicL4Wp+mR6h8Sec|*9X&^TzbD!#9Q9!z&Dc6Y&lP2XALxT9l|Sn{Lf>xRq&RA(lrorrbJda_ie8{K%49Un8SGUx+mIU7#U` zn(r5>M&LE2+%hhzmU&$H?dz?@T-K)451p0LgbN+l!UpO2qAklZ4C9ccdZvU61eE~f z6Kd0Fvb%NwA9l_jar<`ij)t`#78^OVX?V0|TRVQ$bWw}W4@^Lbt3D@hTd>b!N$y*3 zr`#p!`rE&Y1vuxg$X{8yTYPNO%JUvqcYOXX9_Ts|{b{?@QL$s2C|~%jui1uveVE;} z*b!MhKbj4n{i{Yrr|tQ8TFD~eV7l@4mf{oFgS7v-yPZBy!20xt2X0&Kj_F58Z4Y>I z&h5&MUs5|D^hYr~erMsD>)6j9#a)G3dlOq*T}C99uP$2Emq%?f;&*;jO>e37SzX_r zfA`4zb0>b@%g8l0_F8%xvm|u$=Qq!<>>QKZ8~zPKe=daXG!5JSTJ5^@*OMXPivFbQ zIsQDEm10tSsaH)c>rRywaOEBg3*=JH3lqs#to~ujf&Qf2{Q1r)zEFZ!?M?=*Q!hI%CMsU4GN<^+v$YYw#9KW7$yw3jeZWlO zx$luRN1tyv86nZ?mwWPPYry#%5^aH{9Y@=?-1sHY9#kuPtUcs`*{qJxCu@#%?C6M? 
z)fv`z@>u8Yw>NgKqz!i*d$IS|kqd=mk=$}=T~RLAmRwn+ocl_dCcBr=6MyV1e|P)O z$BX8NDs`syCdqf=GJi>ZPJ4N1Y0}QxKVbly)9}Q+VUI)KF2a+=-!* z2j`Z7T8AfxjpWYxZ1LC{$AnzUShq3yRM6LGKCNLJNHUGZ&_q;Q#dI&wO3}q z)yAY#_~5R`y>j1LY!>_$PNs^zB=Wl1TFHr|Bsjku`~NlijJZ3Xk`vFja~X6>Z*@LnE`HYW$e?q+yUV$?;^*9+ z46eM|>T+SPc!8Jr8`rz;D=%h>pZ9lpvd#65xkO3Iks)7I54ZbkB`&5v8QNgf=Jsf>#HB3p zw|;gW?#-DJrTH#z1Kir&pWT$WTzuqhpudMlTc^a8t54o;+12Lp;C+~J#ZCmqx@2u z0f&}vIy`|&3OcrGs+JUb+=C4!?aA~3W)c9CU@d8H)<#Y+3D%OLbGN`?63~+3&IEIU zNeTI(u$grC+|ibE2R@zKbD&^Xa$)$*!t9>H#EJ{2UKSrXQWC{xlhRA$ZkOgyl9Mva z4h)nX{akkNetF@m@{BLn4(HrRorEXBauQ&Z3aT<5SCxLP%DhlRhw&u9CY9XD`LmpK z?Li@%O}g=@_*TP34mqi`=5aY2Px|n*y6)M{$#_yL15fIJ@T8{Bdm}Fzft>WD`{A?h z2H+=g;z^yaTfX*p{v7D&8SMBv*xfhO4e+Et$w{LlKfiq$8yiDRd;pL1`M*38Cy(@i z`^55FH-G&Y>3;L#(JiI3lX)b629Hz@^GFW=7kMN%-;}955_OzMx;&Lfx>wm?*qf^R zK$oyt=YiSmN}Z*rFu9@vYOhjXvUw!@ezI3iS1h>=;*q{0#>yVpzC)xzJW|kp+Y*RJ z3Ssa_!G|m*cqiH4wW8MePMm8>|j0=LPEU$UlR4q^}T<^vo6Jk?=a|^P5z40FP8+ z-sW1OVTP>qNl-7TWAI3E3F-r+n!AdPOdg4-26!YQ;k)VB>@IznN9qSW(zhZ8k5td% zkxp~)NLS2RJW@5WUO-m}=8;AKkEHS474S&ki78d{zB72FhU&Qf?+}mlg2&L7lSgue zc%%-biq6=BphtQOmAoEC>Wv7}l@Im<9_aW&uks?eRMG(4AMcxA*$)1KwT|zZs@JL;95RW7Q@kkGuJW{C*2anV- zfk(Olc%;!OJW>M*D};Dr^r?GUgepPO6ylKvt;GJFN7`2X{4y8agT*899$@fD%~N@# zViu2N5drZ?+c>S>sj_1_6&`VLpjR>g3>&Ucx+563ioQfq5j| z_6a=FtD>6OHz6MBscc#@;E@hZ{S&!6RK6=aIZvJW{_<9w(2~AUlaiD(fXcJkr5wd8ChkM`~yDNV-fOsh!CqX~o^t zv)sC`Yftvqq)`j0WhwDXGF}i4@@+K&JW@Lck5ntp!6Ol-=aF6%waI#GrhxCfAjBi3 z0UoKt2M>58jie7gjxVi*ARg(mm(V}Co-+QnD6^zs{zzQZ0r$IrN22hV2Y4vj2JNy? z$xlwl)yQN39?5Vjj}*L6kVJ)fq&?9p<|PdllBwxTI#gFqy=w7)z zCXXa&d7;csM>-DiNFrto9_gc;DFQXlBaz}XARcM+IFFR3IOLgJ zV$y_&P5QA$jLjo`2Ru@VV4os3=96HX9?v+BbQsgd;*s1KJQ8g>9*KrW<#`&s0z6U> z{~UrwShdd48F-{LHjk8y`8ETO)XU(JzQa6HHm;t*BaN_mq&vJzVtLybJW@jti$_8U zPT`SKaV1?W9tnkkd8F@u;E{$Q9_b*PN7}|;!pS3b@%6$yQU&iT29J~m^GJE)Jkoc- zBWVdjJQ4~Ac%*vz?wvu{420m{@JJ1G+>g3(9%-1tBkknmk&u`xA-GZ-e03Pb%*H&E z+p*Nzqy+Ft!#wi3f}A|kQHVzhf_S7YFprc1@klfd9%(0$Pp?A5uwsY9Ar_CMEeP>Q zk^&Hql#ioUP2iD$V`bTu>X4s&HhUtEL=fQQkt)eJz$5ur2z+JnNO@Cvq-{7hk5oC8 zM+)Je7Zn|*iXUR}NO_z*k{2(n+FWvuvLV?dM)IU`qxzaCgQZ4-|1po$40t3jp{rqp zR#f6wN%{fDe= zBjAzf*uI9WKENYI@%F(y5|wWrGE=M)g=n2+xm%F)Ch$m!yo3J{kF*o(4|pUW zUe7Mf#S(}|YAzDYzzet~o4;W5NF$VS9_c9Hk^F?VKGVWg=U>#edf4Lfb(qHu@JO=+ z#(AW(96S<<=L&;Ia>EP2JW?TEAV`&85{;{5@ksT_xQ^#69_a&*U69}n29Gqv$s?gL z<2;fVuc5n8RnUbhVs2Xz#3MCM;*n}+Uda`mi^?=`(y{JYLb3;dZ! 
z>Sgmtd_v|Pd`_)`8;AmH@EERC%*|x(5Wpkd-;*r#_SHlE65twVp{4q1pw>L!In+b>Q z+)>Epk&cY>NS;A|!6Q{p!z1O91YsWO3gD4`GkB!#DLj(C(DXdg>%Zrb_VAl1Du{b5 zBDM9`gz%>bxg|e$l)W>BN4m|sVG@s&2k}U5EFLKX=8?R3^?n>EZBjWpiAPez7Hjk9 zyb&!tFKoYO;C8RT)(kJAYz0YCvozTfc3?9iI$H^l#PvMci zJ8(ba;E}$wc_akp6(^6BJ|mBm0eGYo4jxH^|HWVMNLOa!krc5r@<>GNf5szyo5~~E z;3n}%8JK}7JkoInkK_h;By#T7So1&gNXt-)XO00L>Ama;|A$KxbzfX|X^b1ZgR_Tu zq;6fKsXWq1RQDw%{{ynD8t~9?9-Z0$^0FUR3!6%l}EBm`ZpfQ!A^mH z6~rSgnZzTtuz92oh(~INc%P)kL0!}WuXLvNAly~k?MNl_MDbQA7=1Kq>#0G0&?%t zHymg1NJstOK|Io_DLm5KJrIwyQv>Fa9?H(lBb_e(k9nk^YKTWtmk;C7@jnssZUT=) zTFmB=G$0;nJ#V$h?9*R=?-@RL?)Pr((%0tUFLxEOc%&AHM>-AjNSmbUK0`dx(ZAu5 z5;0?QIC-QzoWVpMsc{mIB!1veJQBYS#3QY?l81RDF(Zga%CZsOQk>%H*`wM^tkciI z(>&3N@*4G)g6|xPO#qKnKh7idjRGDin#m)%8o0tdQktu;k#BJpX+|DtFsx2L_9BBv z`cAOn3nImfxkibyc%)^3M{0z5q;S9^X)t*tqQC?mDS^QwZBG3Ucq9swM>2}<^HmmX zXt-m>=8@vfAs&e`fk!HVc_hDSc%)O~JW_U&x<-}^#3PMx;TSwpDa0euCh$l$@_NbR zJW|y-k925S9_cDpr89`z7vhmfaS)G0s;H1zP?sy?S&=m6nM09;d88}h4K%q4Jklnk znR%q}EE&KfX>syMug7_$D!?OQ#sADB<@NFl-^H3pAP?-x0X&kH4UfV{Hji|llSfK+ zr%d3HE+LR4h)3FGQGKRgn4~+z9XPyL6!1vOx*1rLlYmFs9<7+2g<8twk%p25j}}=; z3^REo%}G2`6T~AS+=u-I)xS6jp^IkVkwPFIX_(0)y_93|NJ&f{$wLU@kJ!OkzJvbjn?$sFxIW$Ec9$l{TX`4EOQ@2F{n z^ZBpx7KC}Ee$z=j(jnsQJiQeG(O8kD#iNb-Odjb4;ZCh2o+3zQ@R3q?3NgVz zZdX(f$^=nVB!jmY66F%qF&XgDl~cH+rGiy9qzL>>ksCFYv(O%Dz7bLZ@kl^df_NmJ zn2B+(s!M{<~fM;aLCk@RNfkv5O>NE9zNk0c%2zRxB&@aov=w%DW9 zQ+T9$7LTNXZI?E5?==w_y}kA9A`LH?N8;OEHi<_fdGb`50v?GJJI4>PG|)`CE@qbo z)^c0)>I!W?z$1}jy(*BJDsig;k7O9LUIp!<9!E4pnkq(%NOJN?#{eQtD)w6jdGtYnF zkwk1*JW|LXd8Ea*&s{08iyGNG(hrNf+=)bu1p~Jd;P-40xn- zfJZ7?{|6pvKAoZm^GGrnyeoy9UZx<<_2xWf-L|!I;9el;T3_KuXAI* zJXp;~r|@|ih|?+SS0MSw<2;fcgGbT>JW|0#9_bXDM@n({6OXjoeFh%Mp9ek8Bc1;v zkED(jM^F@>8lqcb*TXy#Iu=cMc&eH*cJ4=c$TU0>!kxh*Su=Q~&2??-|HLD?yTd$^ zI+pL9@a%V2Px(9}-t`_SxLmW3a#6REi-Zu%6Vv~4f5jhpBxk@Q9f|R&h*rbL>`A5t z8?M=1hc&Wg@JP#0RY43MDH*+v!6SKe!90?;A%5vnP9CWucEto9sV11gBL(C=H|6A! 
zK2740Mq{<6@JO2>9%-2;X&_`0kJJ(y40xnE7LQcT;E}$J^GG4X)AL9Hh#Mh)!y}b5 zc%()Kk5tCwk>+=uPs58lHSY7D#3NPwpWu-~8v0!Rl1I{u-8NJsca^~-b-_H6x921t zX_&<$mC*=1G{7Udv3R6y1R+PjBk{&oJZ14ni(&we6w<)rk!&aMNIo<1NTgVe+iQ7v zD5f*>NFi(<>B~eOX&FX9_e@ND;1hNg66^Ik={Z)5@|Rz0^*Uj zM-aAes8frW!XxbjJW|OxkL1_#a&f=$d=4HdkizDXo(9Kr@JM;n@klund8Fn)@JPkM zFpt!R8Pfzj(!97BiLcLmA9ZaRh-|}H03Jy{wx$_PpBJqn9jC|1BdK7$DN#%wiHLFr zJQ69+{%h#P9YL0o+oMR^qYQ^HuKm23%_C{Wg{9Am3IjaSK2N|Sg~aR`=aC#dXX24G z<1Twpc<5!Szb_zMDLTDZ^=9Ug7L4;q2piWKc%%;yj}$VAN17iE^GI&)Fpq?a)ti<_ zdOm?i3gqCC0`sQlk$icg1E=GW`hTP>l(*cbijRVMq;G&nvY5}oBZat^Ks=J-St+Xby&CDrk0QwuA|~)iuJ5MsNa9j3k3`*!;PycnX8;~a6y}jA=Q9*p zJkpb3h(~f9TgK#(%3&U<5b#JLMlg?5(G2lO1*)%M9;pfNNX>vpQs>~2iXa{-*cXwf zG`k{y3E+`hR&4QL@JNW24;ehttHx{Pc!)uZU&vNOQLkb#1M` z733}7H&VyJBQ+ZMw=;O8(1T)a{fDRFkxB*UodZ156aC`w0T*36htEqj&p6 zyr*_Y_~-c`73Z#e(g5*Dnxgmj@-zV+>5>NEk*a;Umuf_><*tW$q&7b%Ta{k&dVA&V zJSi@E2Ljxe$xY^w@=w(sd>3kcFuH-kBMBfGJW{opf4|}qLDk#k&QfX47LSNS0^sc@ z@kkv-Y#vE-S{_MBC7+W=>N7hC@krk}d87~exqh|W+`Awi>51VXh(~fwn9Io{1wcGf zMD-LNsc{mIlq+l{wfLsmQX=4yd<+1O?bQkbQM~Be4CV)rs)sNNCii3EhJ|hN?^n$@7m8T%*OyH4@$f>$QJd!=& zkw(OR`>LHL&F?m=LnC8Mb-W-RX?^i^^Gd@koIK!>8gwm(@%ALZBh~0yc8pb`)C@^h z&1T=)A2ymjcGx$TUL)%Xc%<79k7O@j2=PedgB}bXsm`zj^f&1}5^fGm9!XPR8XhU9 z8+#Puk#e}V10Lxs;E`HVqO{|gJQ9At*f3R;%_EV8m^>1xK1X5_kMt4nNOcO~Um+ez zZ2um>BjqKd%x-^xc_adw%_9X*&m$c&2RxF03&bP!fO*L~COpB?y#$^)OdiP-;*kzI zqDT;rlyzqo8RC(Ir7Z2L=IO5*=aKRmJd!`9(O_hpN6MbUBT@YTkK`+K+r>$~8jZ3T zDQ5FXO$;7MqSSe@5#W(bAReiL%_Dth@krb7f8vqI5r9YfSO$2c5hjnMsGF*LK&)oI zpy>KLW`IXp#pIE)U>>Q>Gu8xsjL9Qe7(+bLAjBi(m;fG$)NditI1`U_jzpT4N753; zKs?elh(~HjRK0-|o5Ukso|Z>?Kg{Hj;?4UZ9%&n1^vVC6M;c-9NCOa$WFkb}f{K5~ z;E_^-L!)H}RPO4%^Ge#Ma`DE%{(uBy5PGRoRw=ZX6s;)JE#>Kp5y>o)m|xH>xUH-j zAA2g!O2ZpFcJMMraV;rk1%f7ev8+}EQ@qbdqg&*fVXgX6(A*t@cqGwEM74eee!)&x zq`(%3CNqozw^3kqc4b{u)lG#sT_%t8E;)9Sin8=n9?9&ld87uIM>53d03K--#3PyC zHe>NfT4*i6BRK;eX(Z@xd87*QYN15}nBaSA_SNFgARftpCv2QY@=#J9!%yOo-Usi$ z;WjOg)UJSOn9L(Nn&U4D=-?MRy+S0xJkn7UCXX~S4Ug0%v=i`17a<<$ozw&#sbA6I z1K^Rk#irqrPEX*GnmKtSE;S~PlsAz_I$p`-k>X8e=8;^HqHUZ!QtCpIAr>=%N2--w z$KsK$%KU*x3i&r4NrLBJd8AeU#v|G5|A9wp_#g2|wqr=`$vl#PX5b_qX|2)qnR%oe z7Df|!q&YB;R6LPKQlKfpJW}x_9_hF-%p+Z$nMcY&&Gn3G38X1c;E~*j5RddoKaeI3 zcqCs2j}(s50X)*xDl^}SJW_6zI?N*t!aS0PIpC3sCh=Fg!6T7qGJr?QvXlslQgH)3QjcPt1$BVUBb_pVcqD$a3cw?wP5#6ql}+K1&{l$` zQ7RM(HNYd8S&FQ-nm_zZcbrFRlm$GJNHxqO&4qa+QznlzcQTJ;O#(bpAWdZiNw{94 z$mEe;6PY|xjaf#MKH!mLXcx&D#)HlC&`%5jkCZl%N9v==)N$3NTS+l_q#C^-={aT9 zv4d7|TToYmqfB5PX&Hk@N~g-Oc%-?2M{1x2=2}QJ03IpYNQrB18sL#ENHCA2Tn6(< z2aEuZ@Y7DLzU}Bg!1`NOw7TBpdR{SDjKx1U%9_ zm`9?jLp+kS36n>vkp(;wg~1~=K|GQc;E^g=JW?M`7VtRda_BP}!v<=~M>EFLNK zZ+Rr;zu}R}7(CKyEBkG19;rsL24Pah;E~GMJd)P`hDSoHaq>t53?7M&G@FS>%CX{0 z=2Dx=BXv&8BkebthDXXYnwCczn8+iA|2>bip1~v00gq%&f_bD)nqA2scqD}UR352` z%_E^r|B^>a|DW z`>RcoX=Z6P#)MlobSp#M1xA;wQgs(t+AesvYU~j@(!@9NOK-JPft8jY&D_*#PBhJ~ zzG`%j^&C3&`?*NP0$R#TD=}-X_jhic^tknFBmb9AH^1y0`x-a)4Ha!pMR7m8CzeFN zY469gJv$mBC>PGFy7H1O@IzHT4R#M~6ZW-DQM5a%yGvyK;v?#AmqWjaY_Lz)@qc=0 zm*~c2Crx&J8Tlsa=aBlP^@emyVTX0Z=&J`8=aS^FZhu=<+q`k>*hz!nm7mY6pI-Xn zA`SV>Y(;EQmunP%ByZp|%uGI>%)%HsFV~TE^{NqZ=NS z8^3vG?2?`-7;f-9{qjvqqn%%VN{0G1o?`PzF|!xCFaKv#j%Y&9=hnhaS{>`S1Jq+r zbX~Cb0Hy6n`Ex6fW|eqNX|YtsZC z$>zZ0fhRkQUYF9=m>9GlQjN{nm7#iDUd$tT2WlrasrFHpXlCqAy_hHy4}wiJbv~b$nt=Bzu%>GaYFyhXJHwKY$)!FBz~BTwBo-w3U!w0$dq&_58Qrg&ebvdp{%D5HgPu{ZQHJ`fG!Mr5&?K5<;=96 z#7OU1`M-G1-qE{`KE9E>{<>pg2I`i`%74Pc)o-W2#1tOwqFoWUbvL$6M+QVXbVKlc?HYO8zn!tZ-3DdJm;7ipL}>(@CM(_85Vca z@3r22d2GyAJ2>@gf9^jwhr@N}ot2w&LvxOdYStOKpTo6hOyqNmE}W4I&wRXCX2YFG z`QM{2ANLA_Veny3)_GF8k47R0WXl(m5h?ZQ+b`ckdiHAtT{ePIJFC{>T1Ei 
z7EhCW+=Ve}vB2%cM=OYR^Vqpsie}ecaWeXqaw^ns`+_zTjl;o5C zMMkZT2ltBQD2NZ(+POO&&J@d4bs1RvUmoe-c%(6yM;fN^uJ%O96DoOE4o6A0dtw(5 zs)UaYN2{*(;$KIomV7!KYt-(AizL)2NPLL5TkS3S5203d<%dMKcJEoY2)DJ4e%SB7 z+D94;OBy}>khH7aM}CY@XCd)1IbpT0qP*l?JCNv_-tIehf#f~MqaRcASFcxJCwbrP z>Bpm2+t<&Jl&trX_>^{c^#+}PBp>*%{B-;hh#QJhoVTah)~k6h4(b$2(+4&LZV@prxFKWBS@PkDe&*yauO zn>BWC^{d}XdJwD=5fVV%vE|VYW7^K3hMmTb!c1d!hdkO%ZrE+!7-5yTcgNek+KrL6 zhbY^cDK-bFVQ;DW&QW^JH2YQ2hVPpo=ow4V%N zt5Fg34KN%PndQd`N6~TuVLNIjI4UuJ`y@CjsbF_XVZ=l@iZ%(3I&nGS^W}YYSI+fa zNgKFwgbhd0ua4VMxii~QFdTKh8rV@;upL!$`xM}#IPIt__wqaLT_3u4@x%R+^7`{# z^|war|6oU5X2VgnPp(aCM?Gw-{n%FD&^~TQJ?VM~_$UBJJ?m*`d-?cB-_y=lEni-> z{dm>#qW?MIqq+y$fglC=s8<|()Zp-|U&F88ejNP%@$HAt?>>F`@bk;ZFQcE?d=wy~ ze*Kq>;v}P<2y>EAFV$FN6wks)o;i8Mg#ipQ>K;r+B~Kuu46E@I%prPn^NRg8Ad9(Z-4ECG;F^DptfJ{ea2N)Yex2~uGQEBwvxH#M+x z02!66(EZ)2Uck7EhMRLP%qL#H6(Xa4Kx9;_Crm~)=%^bs4d`68)GArf=~<#-j%=ja zX_Ufb)EPUCx1^VLO3e&1%0i(_4JoKVC;YY^n@u;cp?6tQoJUuv^Zlw|kWm5$Q<2&p z4u#0GoMhAjn2gG9peZ~OaE8dJBxL8P0I%n~-=0xXeZL9RD9zsx8Ku+J8`u9EBBOeE z{Km;BLq20qh>RLQ_Unw*79G^LlF51)sWT!-oRcyt8iwm(kWr_IamIj*DiC@^*k?H+ ze?s%Y`%Weqb=PW^jceu??+P>W+g?e8dyF=djMBI57)<&saBZLhCZkTH){j2GaMzrC z7dBTVhg$f$QLGRl@g zM#=Fxza}n3D$S75qjQRwVQI=T_Y1Y(h zH=mE$A}ZNrR3uu2`_dj7Wdr_RRGkyCRXS>c-06K0x=3#9?74j5M#&W)wxq>roYos# zVVEK7B`>x9s6oHt!POt$2BE7}p1m9uICI2G6rl#ls6;?UsljB_Q$R+gvdO4-fQ)jA zVZH}T3lHW?*1HbN^k zZprOSg~_O`5E+$&(i>7-rAg2Dd}!m-QsXNogB8|_H4quKQtmiJMm51?l(FThOg`eQ zY|3OZYGosBRvt`74U-II>l3fv4Q$NUe}kMi#DD!})nlC<-=aLJ&GN)Uk`-6zW{TF(%D*Xr^n=l3$73l}aC?`NhS;AzLLJTJv z^^#3SnNKF8{N-UX>Li1Va!AI`p%Y_6;sF_DjO+*%aT~Pb%`<&YJb_)K>@LK^cmGoj zA&(fVgSV313X@UG1*jXHTRts)$s(hYC0erp8P(5I-Y|iT3M3wMlIZ6yA19-Hil{{~ z4`|h9Jh*e5WE5U*5*bxzB?*yHN2VpCz5_C9h($&@=rPHtP9_=UI1?FV2FR#>n2d6u zbl=QWme@ab%JQmCNOY8xwdC|qO88r~cA6mp9qZYBqC~MiwihLo{sbo|_t$=P8Oh)ZPs{%4g z1&;z`locSOjJ)F^GD=)5pz)bpGMkK&#rw#ggUG0e=!=qoj2em4e>jX(5j((#jfxX$ z7vH(xv71}dK}>$@>j`93CWDMJXOK}L^2T@;870A|<6tu{6p&H7#>uEW#UbzH64S>- zKt`QpkWo9)Pn%c%CZR6&FvuuHp|@-O2I*#IbS4><#wMfUkOR#qV^4$YhiMnA<{xK~QH$u9U&BbN zhq8!JFav`xs=@qAuRSq8NuR z&Eqf`)sNG6&|Q1SYfnquZ$L(Q36cRBWd)H@kLbI@DA-KmL^3K4nHH}xSeJya(J{pcSjqekd<4n;{}L7<15vikBqDSr{)*_bOOlgX&kVWbwGSKe1p zhrTM==>UqVqeIb78^fdYDG(X86(*w^ar$^vEKEi{Lurw0!pM9&6&m`0jA~+$QQizP z%8o%s4dAFfEHa9QEO@CjGK?g{WK_>M8Kv)Ow8&P#27$(1Nr{#e`Mij`h60gM0mA}6 zSY*_>Nn}(u>e5A=1co~mnRfc3dd*ZaDw|104YSCobDU%pl}SeFbCOa13^FPUWrN_> zr~8L1gs64Y3OAGfES4+;B+sZ$C^%6*I`FEG8M%1d~xw6UnHvyzg0Ll$mFa zXm`#IuG4e9b}ip6>v%SI%lFekYPnRM>~!H!6;i0U*6E#b*Z@FA(RjUwF&CS7EUmbo zb_r&Z1>BR(d)Q>uD1DraN+WX<@+Oc`Kk&TnfQ+)?Afw91$tYt_p7LSjR}9*nEI_U> zvG=ztCJO|)@k?6cY8p_oaRp?0!J`IT2d{ulk>0l8b7I20Hgv&?1{E{1$?z~T=E?aN zucNc|U4 z!MF%_4<@7B>8S5+Vv(l!!vdDH0^!F-sHf>(4#UX3fQ+)3Kt}bm$*7-5C#fCVhHfZb z$F4Yq3WK@BBpeu*oPzp-uM8h*Pe^N+gN$;7$f&gG$f%cqjFRHrB%q>$=K*BYMmokx zif4B&rWT>BKNf_3dA}@Iw0uH5B57G%Dquw-Rt2mq@l?%z1 zLuAzPA|Yfp>djx1QJQ48=m*lbP(%s795E?{2n_LDQ`U}MEehq{S2s7J9rb579!6UP z*(*GjrJfd5NRo9d$5?c%)eviVAkrk;7BXgo*WJ~|n=MrFeF_<6fh%o54bp>tlk;N# zKt|;WcoQd+QN?UBDtDZW@}~R+8P&~8ZrI~MUQ>Hk@1SHsHQ)KBM3-}(Ofsq*kWrdK z^@9aS1>F3Ko!hftO6p3O%LyyD6ZKclKt_H28#1a9g}-)L@EivjmCYifUQZ#TUh!@u zbLT$Pl^@w2n@Cx`!W`X?!}y`yS!7fuOh!?eWKJ30fc~2mt zGXIT?+RI5s3G@B{WK9^w>m>n(v-Kja>ElMnY1xc+)#&;j0!~gMv9N> z*PygU`Ki7s>A&b>L)W;oeqp(O`v<2bTus};!~JV285K8~jLH=FHie8jOPk%8UF)v0 zyWX{Le(VBS9m};!ZcQ)kQdM0FeV;e=ElJZ_|1ig``PI@4BXM_8MQj=%qrR%1$bTou zr`L1c@xzs~Hn^oQ8I`3s4H-2W-(BMjlTker$*3Lf|HsHE$Nu42ytTn%r7sn1ATny# z)3l8-RqYHiD!c5Vsfg{%ngfcf9=02W4&3<_F!#Qh;>Y*KRkJp(mHW2qWn&8V^@|r{ zV`)iN8*#66(_^nHHb~O0`gfXa5eQB46q03-QA^X>P_o~WngJq!v(~KSPT3T+fM2-E 
z!AhtaC2WKec22gyuDJAJk<4Z>yzB%r3dgN**jd|l_E+tY0@-7!+IQz{yYz9fhR?o` z#hQBxoGv`0TbC@+-t|-D0;#sOq+az={l}$-y_f9caL->hJ-8NR()3XCL&1{eTHmf8 zHZkIVGEPRBW@mp!tMezKrPq9Hw0)5MxM_*PnvoT0D=jScxk6-={UkD~i%mwofXJv$ zh>W^3k&Fs;q!HJX?3`unAJ0fe=_$2Em;y3NW!NaYWAl@~)Q&3%mu>bN9_a3zA9;t{ z&~xvH+U-XqANSt@WK@8SKqSRs)GcD((ou~sNI};w`zt1C&kxMM?INT1MInztMm=?3 zoSzGmQU2p()SiayxQNqw)-V~R6|#1_fb5sz+YB=5oW!Otjev|gGnI^rfXJv_B$$j^ zCO4IglIFhtVrBS;I|Y^hDH*jrtQR7q&dKlQ(Ybpf=F0>!N^2R5j8Y^S0WxZnBuZ3a z5*f9<0wSZVxke9lSpYJs04AgEOVxda$f)eBjz|z`30JyzU$|TC?r!w zbBGsZT+5Uf7DW{wj|F7ZXii@j>w*)I%ikA12XDEoN&^os9EtSp3Ax@zLy5H-g?i8WK=ptMyYMhn2wB+ z4Lf|&>0Z20h|SVPcHIh36l^uFjG9Nqb8+MD_pSKu56Gy6f=$DB%pJ(~VrP?y}&CWyP+`qH(1#8Rbvz&RIOz)ozS!I+E?2t0m~PB<|uk8TC;}U5QIp zsIe0N(*h==YHI|DpON$G4-ti8{PGF(OZ;0m8a$t~AT4iKeGeAbIM&QQ0>0K-`YHt@xegYY#Zagy?Md6aAZjRSZ zLZN5*_8n6T&0X|r?m@!OFaOL9&0EBSJUB~cz(9Ik^K*rF)qf9c&SDr;i9c7YH`xs=@Sq2%U6v(57U9+M2Sh}(h!okfhPN*f_ zU*&)_0wKa6qlR#PL?#)gZH=+=8tfYQLPez{d&elS#k5rs%&l$tz4l*hbBBSEbv+8!B!$E3~4DnteAfvWcoKp3TKybV6=0P}c zVFP7k%?Vrx&7f^$CK(lhpvtX5VEW6*e3}Rfmo?)4BAfE2T{SQnMc|@b87HF(SY%WW zgN$nQMBH5j$f&(Mp|;~>)Mq3@H`xH8@{kuXN^g1CHCBU@!Bgdb5(hQD?m9@sBjp4k zGO9xV9+Qlc;zC8wV~|m22@Eo7Hljj*-Aqgu`#FzJE#BuSS?yEIdjk zQZ0Hu32y~tR3I`G!HuXu*eG!Y!(@~S2N~tVCZn$TCR4e<5$M<3`AL}X6W2PKn_4v{ za>bNiK4v?6t(ON{I|NcChY%w1^9X5`<~4In!bQUX8Ra^ai~_PfLXIm=5Wk45cvW7* zLZ3lKDIqBHhEjw_5yX^A`Kw$gViXsT)>Z^XmFLiaiRo%I#fL#gDG_X+ny|^JPaI^_ zH!63RFkO9-=3@pKMdb>$hRLXF!Y~;{jZ!>Sd5oCTudQax);KJUPjz<)JE5$fyCX%^$E^ z@o`bf*rkAsBBTA7WK^0M0il$3uSN8#oG{h{kWrF=j1t}m$S5N~MjhS^$S5tYvxi0e zyBtH40U7nccN7GkLS&Q&Oh!q7ne|i7pTsbrvF{x?)*ml*B5SjQS2LkeX{{ z805!6Mty+DD95tt$S7f~8XmQsgbs6!vGt42K%iE(J~8{e$XicY;Mz*tdn-2|QSga5 zOm)ruoc%Q5?NdP{8WV)TbdlxkxUdeZx~ykk+D5^KxoB!}m21q#J$91wg2^aRGl-0` z4)#L|hM5G*UzmrgoPxPtKUd$Pf0Nfff)j16sQS@G@lz@|3*gHGRY{n(3!|6D%Ypg>LM zkhK6A<&%cir;n3SA;VN831PFwLz&m(`7lN{h??igs|U!aeij*3%OImDTw4Jdbq$bF zq>Z**JYk42KF?>DBRs;xpUX);9b3$WaYbOr7w6Lv`XnkhouXK7MR5aN-n*h1kBNB@N8le_^04l=5iNk$P!MwpC}87HHRL#HF7JQ!ruLWqp=*jneYk;27g zNYUQnO?1?$Xf8wYxThWV!SEFCQ%7X-yL-T7l+{dR6h#ywqn5b*iHwSO0c6y9?)435 zHDXLeGA+b#O?W@n$bmsdtwdGDGsq}5cfs!^v zUee?xG07GJ`~G$ALu34efgS5jLmSqw`b#ouDT9o<8_kQLBAH}VkJltJiYGRB-X^8dQT<^IIxMKksr8OKHX$X-~k%Z41>(nBrkWssesQOjoWR!o)%O(Bg9Ap$_bMiPD z6=J*bti8$Xx(`crDX0n#GHMfbj?G7drvu5CCX!LTQ^+V~u7SXd8bYm@wZ$PY8TAP! zqZV?IQJ-Qh0U5O{wx$Ca3?kvYgH z+aOLdiXv=?xct)ule|`ri~E?MS12H(h+KRG%8}P8^KqZF7O==D515RS#^@J4$9Eyn z$u&6wrY}*pd_*pt8=DmdATmmxKoP2=b}XQ524ob8Dut(3%>As)4U5(3C5 zLF$+$gNz!b+=Iv{)yxt=Mrj+vWEA2tL`D@^e1*xVR&RVdL`KaAWR&N6lK>gDAak(s+II#Sg(zRlMaFD}$tW(RPzD)Q z36oKnmw=4&C9el$6n-)p^;q_~l5!ls&d|#PLdGp!2@3+~2=p9>QJ9QMp1YOkYis?h zpnv(k(K@On&Qzc@)mCX~ONwE@5QB`mgcIu=ZJvgV>N_y65Rg$v3@(KaJkhmrP+m6w z>`+__uF2jd36N1w_RaGV0%TN63q(d4TLChv6_8Pv4FMT-2N>b=qSxv{WK?ohW@k`H zok}11NWJoQM2finfq=v$IeD3)qet`4mY=2ZFa4%gFz3d_+JhfMqYp+mwIFKoIs^t8 z^|;zRV06WO0YFBjN*#Bg9#j}6WZz|yQP~x0S2}k8*zD4E9gtDBls%f|D~+cmqjFRV zATr8k#1mauktEhCFYCR^w>*{xcCsc9+y+mDYH=6{F+WYppZ5E-Rt2$4|>teRd8OCUy>1o-BBS`Z`AR2}QKfXx?4G$08I=aeC{sX2DH+5o0W!(~ zkWt1AGOC9`M%|=FO(dh9$f>kUiR?|1NYBv7`$SCutL4b_1mh%Q=)V*M|tV+Fn zF``O=jHVq=9F2(EwWYiBpMs-n`_j zWs6H93y6(Y0y2t+Z|!iDPZn9oCe7QLq)pSRe<|TKnk|}<9IZ9SV!vHly8U-A)GRS|}|CKt@$!3Dyu9dsP(w8lNw+x&3a}M7;tW$SFXwH^XO0Uyw z$dQ1I`ceeQD5a#)HP&9Ux=-n*`U;BGEEg1wxnsV?_na$}jLKEWR#frb)fpCRihc%@ zQPvhF5E=EIK}MPGh>Ox1%93cFiHzE$r8_MdrKiOpqizB+DipC#^{(XvGOCfw@;X&m z?Eho$&i|qOyT^ggm|?~~cCv3-lASDrv2R&YSt3hRwAv~vnK9N<)}m6DR4SEF*>@t! 
zT2W$bp(sV7#P^yR%UJVX?)&@s;r;`z$Mv{=Ij?iQ&Ns--0dM-pTgS+_DzIR@X$Rq7{-%Pwse@2kkD^LGRo5% z$}=n)rZ}Fw3K_-WD6}#eb(}y(ZNmkuL`F>kWKRJ0-@pSs6}(syh)t~Os?!x zo>I{eP*%YYQAMv^EFiv}$xqiXPEl&HJw6*kt_;iu&_MbD2Ge;C?E@t&hzHz4V9 z*v?YH3cic_xKf4-ARrlagiD!M$)zE%)_5|?vP#|CF;FlEbzD~yDm#Tt-U*OV6QO@3 zqY(6Zfn?%Ie5~agtEg!_jKr-4S>K|!(aZ0LpPVJ5k~36y3+-J-M*T<*FLYW?MwLOt zab4t1%g87VL^F*HA3#Q-=Ex}IEEzSioQxU)(fbTwT!BtJS8h{IQTW~?FW&7uuxn>O zhpwxTB}3S6e%+;HR1)H#_LD_q)azL?YOJJBTECP?Mtvo}2ar*_aZEXzD#}!FiWW&- zb7T~9bu!9Pi>U)oMq$uoev)54ok=H`Q4(29M(Of6&XG|Z+-s0gd;Ua5DJ~_WL`SCg zP;zV^?|J!qa9HL@KB`+uk6leq`n~$nXWD%DC9InVyo;CF z@EX;4-N`;a`f*(E;Y4`QtN6!k2>S@cPuxrMnX+YMRGp;v5;BTMrDSz7suZlUh>Wrm zA(Bz0i^-_X5Md%2Rl0VOv5bs@$^c}P4@3-vWiQYXnkA!d z^Usk{>IuGU_{ljJku}U=9w2i(<|MD!O0YSY00cQ#<4& zhlR!oxAw67RW;l_I5=OMr^4RInN0Kqi z2WXgsV_uPxQ4x4D3Wt#-A){0mkx>T9bs(O3GU`rw=EqqwN(CUJP^HvkGa%-PWn|PO z1W6#H)Mv>k8zF#<8bB8{1wPhGe`1uLeTCMIa-%Ex8}alqF?H-588w6_qe1~Pss~R- z6`>==YBq&p_*+XZD`?RVpu9E15<@Mop|tMycagBclS9KoEhiY#Up|sfHp%3Ib=zsGmeKDv)1(CUT4W z#W^yHh6fs?K_sIp&?*2KWK_j!WYkp#s9W2|=4jEdF9!%@)ZGO# zN~NO2jRS0^)41tcbq2##4WI}OE@dwL^seiDgE~}36Co4%kh)h#LkA$EBoPQ^9)OIh zM0>ofkTd|us76+F=>}^}#&jYXr9&V4fkG4Uv5O0z2WvSRVB?q5;E#l0Bx(~2COOj z?ToC8UmvJT<-7{ubs{07#0X^6vqfYS5+I|Z%CvGyCS6TKTNn~vNeRf1l2L)It!OQD zD3Odp7L;!&nJ1$zNYbwF# zMzw+Mvq=Ypua;tP! zmmi02tU%N9(Ph}nhUwHZo)45}-Ef#J7J+H%ns3wq6uG(8q1<21{-Ui zUsqX?jG81RqXMZzS0tn2X_4V>!;eq-vxJPyP?a_yGLJaYx|PzTD#oS*WE50}NJf3e z8kVg{MvYT5k57)HS0JOh7sx2Jb&^qc2xOEd_$B4O-5q=F&(hv>&byPSa@Urr{KLWB z!5zk+{zImB%g%aA-_E<=sq(;|cJ#Ak)(dpB&tD;<4nmkyqgNoKPR(Ey>QFRwsNc+) zJk5$$2M_3ER$3KQ zIv}g|7gQZZK08tH%pX}DQcxX%tcfeANkl%sUGV%Nvi4~~Z85U$SwUSr^2M8i7oEtL z{RJ<-AnSh=)XyLrV1+<)dm~$6Bd=PMNMVzdTJx5|W(~C#{lXSAwO3Y!uN>4~?=O6P zRPD`)!Z-eEts#Z25o&F5g>8vyZ*LdAeW>>CY2mwKwf1L)?e%K!-xR*@RO{$3?D(SA z`J=FNMhyom!qKaDu@!aks&|VNbxW!DY$@u|Q18_*>NQjEvnuLym{I?*zv!=!QAL=q z+G@R6{a&o#0U{abKQT(CDp9F`(AqRnzR^WfJ|4=SQLNuv%2YQq^$wt&4K?a!)Y{9Z-sO%% z0yQuYN>+Ivgp*bfHHV`21Y(a)7!yE7`7k)^o##|OJ{CG+pW{Vd-_^bI{st+#oS!xY zdTAGnO>9zxg>?6|Z2j^*Q%EHei! z@7v}c|DN~#dcMz3c=rya} zz*RS`9w#b!@*>;~iCl`8*}YOcS0qLoD@JeAn>41pn93bZnF(`%%2JfP`Yd)g(f90? zbkTiNmqpHzk?*)rMzy&B$X^xOBg^U3V#QNET9x;6_5W<_Pw@Il*_e6pE{8s^S0JT+ z;>-c}!C+Op0!T!nK0K`Inh{*)S>kA!pn=*rv`qLMa_W%~wbsi!Lg!4T-1;cx^GeA@ z{Re!?E*wzw6`=YhaJrXEi$({yKT`D2zis65T$aoJ?3I!7IWmf=0Y&~jgts5weVgZm zQK}@@ahmeVYagp*p2dHfa-(Pp*f;6H=M1N;N2yhCT419Uge$gn&v`Z0HkXULE$g+)IYfn`Qk+{FJ^0S=7q-vt z6Gm?B5^&vK>CO+8<)!q6zWB_RRqt_rM0bDhBwwt%tb=uX}mluZoTTyiJJN?{q;`;>jvb zw}SnbeSch^{28Ti4xHuF`+Vc1a=&Si-!#WSQV`D-*ADh^k!$XC9?z6Bg(TDV9QLva zE9Uab)nhmAQLbm?=l^vf(yR*dD$zs3PvP9DJumlsm^@VFn*isa$r;ESF5>)B@O^sH z^>czMtm8K2U7wqF;gq+9F(2hA^(*~OL)lLKmeDiN+Q`b^Lxuqhil(oQ&$!6wbEZ4*icTMB6zN9cjvVq9ey+16R7w6fpjXp)Y{*07aguo z&V0D$f97s9&!g0fN0W<+cck- zYZAyPolw8ePIQ?D)Y7*~D_?atp4}lT!1WdJPNeA7S?MHxYjHdorKF)(C>>JQ5U%Rj zLwDkI3gZ>GO2NBMVLuWCsV~8GS-E!TNS$E_A8OWD@_r>+tK%&6yQH!!R*By1ciHxf zls#O7*3X8|KMKE)2aEiirI+X3BO6#9Bpry6Rpnuf3iIxmK?Sl`pKY?*(B3m9k*iP4YVx^F zhi6^LZd)UqD*W0#^<4=pZKSMg`;2{1< zC?yp1f6B#rxLf4EhGJP_l>iqDK(TCtXI-q%A(})K%RWqV9>sDxr_*}QY6*(9Khl`s zVmY-&I=Drd5JjxRu}e{`+wm9XU94kQ>~gMz%)3}8ukX2kJqDNPwVI1XM6s|p4=-`C zaJSK;X~$PYv4|p8)I-0Lhc{v}{CgjrznD4eV#Q|%^<~FgeH>Qx_|D2MR^Eb(mAbl% zm0os@NMjLEtcR7!%U!G|BrX;a#malJ#Kp?)Yb^iLSUB1Eq_pWtWlP>XjkThSRrj`h z-o6Z}Q6ksf#r-Ht%AM z{m;dES(yLh3$Fa!46j*H0jZ1C{0A4ymEdAk5nU|vMJ|>ocGktB+J;3aFLSX3@h;X4 zQWtAFsW)0<1m{AhTutI)r7|F^Gj1~2xL~U2m%3O%fQuzL>tem2HybF!C6T&!P! 
z(&!p5!vzTKs@2juUuPWmz=@dyZ})e_6}o0Ezj*w+s;K6BsGwluN2(&KuE6xR>0OU4 z;gMpSYQNVnw~B^NFvIECd~j`5Am#+nQk*dHS5=WH>mS=Qn5>@zSTkC7+oMjBWGU*H z0XQ?dU-Kmg#W9Aw$hrd{>#ikfPI2r^(=Vzb)r~!Mw6DY>&s}NSRTV4$pekyJ6sZ7Y zRgp+m74Ph-iVTY0E({lR#U0t|c*Bv~k$+MZ8?=p703wbew%=DC0jP$#eWxl`??(|4 zjzTpk0=?q)G}>5Co;OT?GX57;acVwJlm45kXa^4aCspw>p(KJSM9tSP#{6|%>>2Fj;1cdS@ zs$!EibMsi}0aSLhE{c!QLnral5Rvj%RT1gJBPbpsJdca@LpPrlJ*Q6hAE}B*zEc&a z1S8bHQxyy7cIscy)-n{XBoT=_fu&*-(qyZg#i0uHjO^LGy&F`2uPRn%QQ15xB#l&- zRo*>yQN(Uec(it~^4^(75r->(t151Ux$H?GSegs5{z+AAkU%htUxq@0)YV8q-NL25 zQP8k4_jOdvzyowT;#Q>Enr{yW_J2)Pw2i@^(~NxEZ%x#HAMba8g8Q}nW!ZPx@T;Qb zjW3g4URKSUrj)8_f?pC3<>Pc>r`u1(cvF1Oq2d|((={!Kl-vkSQD-PA)`E)9K|IW4 zI)tgbrj3!R^oon<3}d}~n@Ea84Bt0Zu@y#;YZg}vvin6<1W_sk&gHarf?Hbus47x< ziHGgICsmR}lR$XKNOl`QQt7o8|3Xz4%yQF>Bf9?ar*pE_&~XOK<0{Fx6TrUh>U*^#@hac&{Mh*s0!? zVGoBRFF1?@_nU(*e--+Rs+jc`RdIxgW2bgQ* zTALN7t4wqL6IIbpgK3I{?d%E0g=dm>mq8u^4PRDy+?YG)?)H z3(qN)Q3c_!hxzx|F`tT_amUvleYQ&d=H&R?{DhcxS9QE;*gVt24}EGSn3ZRr{O>CT zG~{<}GU$1)oZp-@ee`vs<)2l>Dl5Q{^p!+(=Ta=e)8^{85kasQzh9<~S5hKB~&c_#A%t&1>T)0e#R} zZjenTv)&DwuRZ4$Pz>*;=e>BSFAt}&vmuVhB0f4kHY7DmqIj8PXSxQCB68T7IkK5d zqyM5Rax4Cwst8i-=jC2>>S1=`c$>{_`V&?0;Qv@v3~c$4su(L5_y<)n60)NzHZ`ld zswcDX$sNDKb8|-SjLZ2_4N=!_dX;3Uk1bGM)7-nZYFN-1vBl#uiBH8sk1u^jWhB?| z7=-eaLE^<$MF*y26*|?>ruAh$4bw=0^cf8>y#ZOxxk;hgI#ksOVYSv2LlES#>TFA+ zynj{|jTGpdjE0)8-O9sUXi|6ne?e6=gp7?bRFy0|PFm1UhoGi@ql8h)7WnLApc4*HUGs^D)GH&zNxrid|K4U&|g8h;`~8RYeL3L-v+H zh!CL3)=IIT?Lgw^WB9v&R238C=&Z^FAru{!Ui+D<7^=wDBv3%;`wU35x?R#B z4e%4eXPycib-1c1*r#FlAF7IYVP*k}Mx6h-P)T!!^p6m;ujrsMSBd{nRV0Sc?Wl^H zKUNjdWK|Kw{AX1W%=}GN9BytBEGQl#&4>k=#HuwNw~@So2qzOB=ireA{r#htWNKa|A6f#-{NWTV+>k{~QT>_$XzhOwKMq?sY$oiQTTS?lGyxVrrI-kWe+B*lsGRkK9# z%WuA^iiMivb^Kj}b1qS#imk*GEcnBQNYPuJCCKY)HAJp2XqKjxiJ8fMeJu$+ghbqZ z+E-d?=;o77s|?J)sfwZhm8!T$yyxz4(EX{XmyZ^4a!CiI4&HHN6f5r;)Tpt$N+Cw^ zg=R!hkl_=FBI)?4Xk~;%hV>6r#R39inv3ezsZNgDuNm9uPC~F)^EXdJJ3>T#hBfHN zEI6(hv)wK!PhP*LdZ=a?jI#fOswf1}{>;eTfg?qg-pGGt4)$PpBc?2zQO73Y7Z zDzb|DucYY-?+d=K)-OhPGE^SsH)Tbz4A-LfxpogILZ^w){7h90)Km*4HT_vtEYyr= z*`VDd?2#+?%~9W2qM=J%mmZcNYJfNF-^~0>RU|)E&dx_ggvMX>`gUDZ5% z>cG{!*6MLNePHv&t9iXmErM#avhsdc4oVgrzpmXNfSHx=__jtC@0UE9PU zo-7*q`SNLb%DiRyK)8PE{>RyrNCvG74574zx ziaxO*bDoeKu^rVkMnaUGnx;R7r^~2HDri>Dr;re@)cYF9v`WxxCg9Jvo(c}dl9Pw5eJ=-`M ztTGnSI2Nlip4vE`sq*x8p6V;6q4^^IZG(LN*GCA5f`CMh{MdQ>PmFMpppOaLk zp%11RRA=@)nBh^K75I0mqN6-nRb11M%XqM&G^^Z0z<7dxQ5B0rFs)x#-m7vb?y8E# z8o#QF&BEWQiugaMit0P6;^&ZERS~kID%ug=o&HT#{3`srstEd@sfy~?IWeB5@)PX& zdv70hY&br+dZRIL)pM$nWBo0Y^?UoKx}e8Rjj*vy+ZHIx9{=<2?3Clrd@;#C$A0>F z@AYr0VpE`3#+WHz*tLiH_P!(ji(2uw(>7Sa2W7;$deZ4Ofmse6u!f3j|P^t4(> z#<=dX=i>guO*8g)^Qt<#sdB0y7IpoXs zg(5YV&eDE3bdvQo8%-OvIF-#UbcDvET!o?`pIt`Lodv@g-%PD2`iwqzA)R))b_vE| zOoM?nfe%y(HXGN!LHPM*O+EFQk0nhL*NXTyGHL{Ps9$_ScA~;K6FT=DI`5X8Ih5ta zomI!^%PF+IbUc}XMG>0sOJQ@01s^@`%5v~aZ3pY|^zppC$325??`8Lz>0q^``Y36L z@04chJ?$A!mAw#JTHYgdo$oZ0?J9dDR+GUd=9;yv#PO^P8SAc04pbUtjEA=~m<%N- z8T0#Rv^d>sWM9#V$><4H`uCs*HeYZ#NK!9ovQ``PW%}r^=P9vKzBDqnlAx0v(4?G= zNY!|(p44{?`tm@MX}bc0vBFlVHrsx=@uAEhi}BWLr#yW}I{kW8_DuQ@A3MyZzun2l zsSo3QoOv4d2=ZJFyn$g8_&P>;fTXBw?XGZR3NIwCQTI_wF2iExRN}qC(>d~CzRop8 zN8WwPJuL9mP;~BWhJVNvqE`DIS3=546eB){*29p_*Lm1EP4ymoz6qu9*3i7e8I6K) zC!2*E(>}9LM&5jE(?cKirnivEa^g~0xX`y$c8(F?)r`xz|0+`$+Xe2J$oy0JAy2E?@gQY*R;CVg+F#aY<0?@rldbJcRo=s1zIGw0*z#ESonm?d0u6KA}8!Jn( zW3k0Lt%)i5D%?klAJ=h(Y1TlHx>hd`&LWZE3rd}Fw~9_kY~f<;NU7;r5F;!}4Ah#S z*v^HBKg=;a{?WxxJigrzwZNcugO2tBI{#q89`Rrbv$Pp>Tx79x`wVV|^F`KGbjckf z0XuJFaTCaiP}7c6M6EQtem$O1DC@|FmTNVi^=YZgnz_6lQP<0-F%3HR-#4MdU1B%p z|D>00c!@EWeRxSn;bokUKO~P=CPG!iX7E&6S;!z;2Zi>B+K|uiqsLB|(R>+*_F0>A 
zG@a&$2go0oZJ>(tNzj=~Bgr8ywzgOaOC9L+7miEq6pEG!@@Nb+Nj$9!DSB@egFT?b zzu>fdqcpKy$@M)v>!W7DXUFix!5pL0icRK}>~4+E(yr%!obyPDI;v7mP`yNfk?a)@ zcim<$KCj4o((G!XnnTWwK;7)Kf@MP*^-TU3UPA%3;Ze>-Jdb0eJ4Vme=E=~7}BHfYTCC0m>%uJwd-3Bs^ZgOWlXv8*$OP02JQG^8K}HQB&61YB2cTgF5bRAu~6EceRk{c?BGu4G*w)Bs2EAw7^ zK2>sArSh3^-L(kf*jX1CuaV5_y>@g?*^ChNw#3|G^TUqrbltO1-VjaLhRnK$DYo@g zTx$8nFQGxm^DCj$WeZ~0A3vBuU1he&HG3tfL3|H;%wG8DaOoXhX5sX*#X>_2oplqk z-6kxN*{S$^O*tcaz*8k|{1Av;lior+o2gL2`EW4DPgGV~rQ)=qjhYD6DP(NI zYWF^cs?I{~vc8Zpk_~Uo$h0#oVaQ>_232&>!wd1hVD3QUADEL zuY!fK&|Gb$DD?dt;MmanF|Tr)Hj+;DWXq9*6|$!JJlq&sTu#EmegzUTv@y=7%=R0B|*Qfy-X zf-63FGp*jNeSvd++o-^9X;xUXWVM~Vwt}X85v;nZrPIb@v-)K zD-eCWhgF+V=Szx%w+KA%UkwIptn(;Xx3RNDBUrD}CdInsYcEsm;|atg#9P^mV-*H9 zmA<;Le$IaVc}7N*|Lfvo>M1;$sO2$sm;w_=G1Kyebdzwt>`Ju;FE=9ZzyrKdMiv;k zrY1@W$Q`HrkR7xn3=+zw{~Xe!tp2J5@@cJyGXu>kidAwT9JYY<2a0eXxXl>Y-J;)5 zb00yQtC1Xy2T{*p`vAI2x9P2VskZ?o;4lr5n4B?vp#<2xN2SrV#zBlN%|oGcyxqF` zSx-E!egi@!E|j{9tS?=OCZ`paaukTMgd>`*>Hqxll5vYx>)mk z7D4s_Z%;@Y`#qiOL!GSRTJ)WBoabCb8Of?5Z%`gCAEBkc522Vxq@h{nUz(Qba}%f% zk=f2}{_6Nba935-+xHaLI~mi4N-t$6QRbq~7>^{-o6yV-C7A5}KUMMnR7GCMJy+m`p)Fi#@LwcFa`nl%%x0CdXhV<_< z8_{@~9x{B)Z1lw3Xf(-aV#w$@v+>W1YVKL=9Vk(eqDl%+(h{a6mh?!in znbNSC+W)DFsVOI}kDSP4^~kmGxSirrG~#ie^<=rl$?BAo^&=-AvU;{ycy^?Cc8_>I zX7zev;We7#H8JwPtSXv9{!dl>|GTQ_^*5^GH&QXGFyK*9%2?6WjiU20#X)~n6_d+; zQ5DmEq$(~~C;t^zB&&*f4gWQ)czdPu{#IvERo9*0V8z1AKuwq5iK-I!P%bS75 z4+B+gLv??I6(5a0cs2Uy!zl6XSjXyk_s8*;o~JEuCwlsyb!<;|4^DM$J|7sK?)@;` zw>2{~I@|wre(2M}$kgK4yTyr5i(~(Zs<^hf`uW}4jrVWgec1Z+;T?%YB3HY^NWd3r zPTi8MCIXB}*silAyCnj_r=Ou)n$vcLP0Hh0XK8LnEJ`zqQ?D%VQKFz(q1~ghTivPR zE?pUV<@vqWt!Ie^M3Aeo+;RE|68li5*q( zXy zG3z%~G33vx;@po^#bvNNSyjw>OjZ?3e^C`A@GV ztPc3VMAAnHyzp9FQ|D~#!PkT`uObOj5KyyJMj<|j*EkZP3hGZt6 z!LH83actx}RgsPU_BFDqDEU`avEnyX@g-SRwE0C<)creE(d9>~;={jF6-WQ9D#rdq zReb3}cmR&e(fwq}P1+<>DvaG)Ncmn>T;j77%gWeFa2u&qtX5hftBUPER~57VtSVak zy{c&btEyP~|Bb4s$K%pI7Cls-o*pRK>YnRWbfoRWXOGDlWGmBJ-0Dru?od zJ}4!tio{)2F(vzVRk7rIRnhfls^aC}RmJlZLaKsLLFQjo#oAw0MbGb5MP^N&qCcvN z9sgccH04o%K}gq7C*oc*kX1#ih+S3j;_s?r@y}Jox4WvM)Sp$w=l_|iSmY?TLRJ;C z{uflm#qU+cfgh=g&v#YDq2E+R%^#_XRsTv=T!U4apNq{>N*fHTD`1SurLFp-s)(Rn z{8d$4`5RTy_jgrs;G3$5{*P3}q5nix%wed+AX)-(Dj3z9lqQv0N>yZ4aSg`f|36n1 z{dQGF>wl>#T79P~wtZ6-5%eDssjz|5XE8!We^nK8e^(XT{;DeG{8?2j`bSkU=-;b~ zqhwXF^gmM-{r*W+Oy5xzhsmm9+>WX^ysIiM_h&Efs*20sRK?QoRK=lhs$#%OUq*k4q|QY@X%i5*q3?N?QCfUGLA|Eemk z!*ojMoBmN%OozAo+;2zSdcL-!Dh~f#Rc!lSRqXy=RczZ;6=VNPRZ(I`RUG}MDl*b~ zR{x!<`21H@ap1pD6|H_z6^nLN#TBxuSoRlH(e@uy#YNy(RnZ2ndY;CV1Ag4`7ge$O zo2vMcCKCZ~FHveQpbuCAJQx)_7peklDbpNxeX!4z^xcoy^krA%gQ)B8uyIQiN zDlY!2DlU>$#m_LZsyO$Xswf42j+HO^_o`wY0&)uh-!lD0Rh;ml|G)v4So?#j==4)n zae=HVeuVv1Rc!yJDw5I3pQ(zYzp08GaN`^B=VVp!;$Kw7y6;rQP!j};u5(ajrJtVC7sy{~%n=9T0=jwth)1cCcaBu?TmE#y_c9dN` zkZ_TiZ4zmqh;*q&YCR2)uN^p+l&64vDk|}m*9k{CAS{Nq4(;9h5Q1W*$z$f=y-0;B z8q7WuG{#%!TeYBRqcLLXC~(FyHGt{Xv=dbJc6#B_?7NKX)N1|bv1t3YLYY|h;ue~_ zgHcC{eCSzk6k)`Q>wtL;Rd~!G9oOyO>R#k2!|1*^Aad`6CpeV+mL`&s- z#u+~2*-z@ms-N!_y`UQClDs%q>}Y-NcD3?Nfjjx9&qcT1>*Z*Z2aR!(oJyGSli(%@ zqP~j0h*pV@m&H~`d}66Jhc+H=9XD>m^`3eKJez1-+WCbs@&P07m0;6%)W^}3xB8@c zHUq+=aCT;@6iWt1qRS$)Y5 zy@Vj%pJ1H0$>`Bt1zCrE>^Fv%(ubL;niFjqO+_xsh=+^6Eyyk;d?nJD@m!8#B&xon3mQHdY>yxg?Q{btrf#(#YQu0?6zW_JMxq{sm zW4JNM&^xu763vi%T~bEPWCL!k!lX2cdn1+oRtQ=DS+btQ{m9U4?JJsc0&!LK461iK znuqz!*XiDUrzV)5bL$|eR!WrltfOJs>y@U2fNPZIDBg)Nr5e81yxs~5K!d8p%w9av ztHLJ}wLuY2sjXN1&hKG$xhu$%c5ZDy-2UiN&h`fpL^uyb^Ed@b1VKLwjvLx$>>C!s z6H#}b)}bla^wyw(TM3Mbl7Ns1J48iL`=9`%h7T$hcj?%Do=YtmeC*29rA^0&QEK9D zQ7rP3C@62Rm8^nIP>nE~9mF-5Wm~otBC$=2ca6XHC5!Z`o-+Fj6!i6kZ`$SBG;(9y 
z7oRu=b$-J5K>c&>t&y}?A*5Uc!{lSci(bataK=_@9DbrIW+<>Y(DDhS8E1)a9lT!KM^7U?_rPSmItt2Qq<-rur(*Jq zjDp2?s^ZA2@sRDPnFd$d*S4V^ZL@^mRKUzd8nCM>=9BC{mD)|59$h4>itoLrx8pOL!rtylRh`<^NIFL!+L5a?Fnd;3PAd5@a|_lKbQfq3YP=g&!`v={?7 zhDGT+;R<|%NJ6d0l#+3@f3%|**A`&DByW^9!N>I>rrU`KX3!!R|HYs5QKLmRV>RI| z5TpVGX&$EzHLFNb)pqjD?+%jg$rD{F#Z6GgpTn*A3EjPk2)=7}%9`2C&$js+QSrdr=j12PL>Bku&KvypS|m z4IDXbc)VBo?4jTeN_odi>umun?EOm}JjE~%0lMHz*lp{J2aUEx-+}3^rVXo=(#EeI zY7gL)-sCOdY1%InWMQjc+QxcZyVaEF0j?U?&j=9vuwWL{83MhSFQb(hgUdgnD&gmF zB${)xb}^lFG?Aq{<>cn0>=TwncpHzxcp2K0cWWJnR|;-hob$T)z;>@?kk~r*h`N-r zF?VY}y-4F-1^%_w=1rwF88pz7An70rRyw^oohPA#umq$&xKn%yzS;;#z z=_(B%G}5hH6+KINCFT(cJg4Umbnn*#S-ef48O#$%6wKb*LJF=1cFKjfz?hSwu6z}T z(s($+nC)hnboIcqDN==Y4vbqq3QBVSH}49hW1`xMv;u=c~gZOXZkn;6K|My>B9XmMmpLvyu-Y@qSf1~ zA)z2jhVBKkk0&A*Tq)^?)*@7~XhY`0IF#CUq}tSb?9R{1hE2DP=PigqRZJ08#B62F zu6z&lOrHMB>1T@2!Q>qCTN3o91`*{Ese=^;wED*|ZnH7MC|)l3BRt}>Izw>ZU>3YF zln%8OBHELHf`_@&Si*#ubvt+@OWJfUrYGj2OmEu_hVGNf>nzQ$JWgAt9lpQ7qDJ;* zhL?7e?Ho-Wox)1dp@9g&DJa&9Sdw!Vio?`vNt1S8 z^`&VDkLau07PT*j8=q>a9%wwNm~miaFyYBPpEm9jX0&;H`BjzoSqAZ^SZvZR@N(h3 zIwfN^AbVUFQic6*=`0GwvVDG+Dd1nA0|UqI;dEoks4Q@+?=pGTx8J2=K`d2q+-vjmdJOETpSQg10-LP|)#qCB-0#iUu1$S++CRk{Oin+^!& zEA)jJ>!wvvT`JL7?Wa#6wy+(*Qub29A59KgYdSi|1psl-s^ni{9yT zV1~%`B8|qk-BVf0BAl1`c4Y^&;S7*G}lt?*)j+KnszKl~)WW zP_s|2*Gk=1p(cQaz`7Y@U{rJl+`v;E5&Oy{D_Cnu;(a8CS-W4vXbF{N8Ud5Eh?Ev} zqQ?5yJZ`I^i0zqy9olZjTv?4T4eRyfRp8ijaw$NuEE{^LrWx}91>ww^$nH)M6{UZgsn6Ez)<5%Zr#R zitq$}=wlwlxc<>$@zeW)lwO@B?UX?p{aWkufI8_EMz3Rx{-Mi}<@x zdw1>+U_dHP4iN041~#+C`lm^u44eoM&o=6%==-2p6avhRJn>g(#}|Yu+=w0FklGtx2vR9#0vq$zJthpsUd*fRQ42TZ-X(&R;7Dy>$;!U`@myjZ3u1^R+PR{ ze$03kR=ler5j9grffa8z9lo;z!%bheV`ddrL{ebI=~Y-Uxk_okWDQpAM4d2Mg%$bZ zR1j{2F^~qJ6vYG8dBy-_QqXwfp@3al9Q z5mt=YaeY#%@5MX>tp=&4g*dkgE5Z)cIJTF=d`K|;2{Yw;}y9iHG8X(shbQ1%f07UVMQbbRvbyDz>2P5 z#gyd+Rtl`Bs)D4ziUW-Z3akiV5*A=wsx2Y)HTZm`S{`7)il`rH4OTSd zzDR);2R5~jG-X-k09|VkagFps(fq5hBG8KhD|V9ZodCKdwS@318q85k=Tl%sfER`W zD-uHHD6k^H8W*i=c6Ak2j5W4B+)!@SI?<$_y9z5tQeeeg3amI6MWn!rnA)fifV)`~ z1y)Q>HBp_5x;9VkPk|K!S7F8T6j-rBWpO;27KLfCVlB5akyI=r@wA0>>Q@$+RBeVB zl&-;wKIkB0$fQQw>H7u8NUSR2=@$*7jYf?84{x!guc|mN^W?Q{!6(3#xaG0(ZTP3E+w$P+-Lhu@e}uZ|9an@TLWyB2yTcY zfFv(K!}}WMqpFI~iFTA?^Dk`?mIIzoW%V2B3dx}n-m}GPDLLqDlms=aV`-PLfCW+@ zTF@#*=snD9$-Q6jLN9AZZ)-63ajU|PJOQosD$dq;<&O!@nFzaF5o2Z1oSD4S zw#R|FaOG(!U^63R91h5(#qEv*mG(=h0@C{cI^1~Rv>JT>jSx1ou6%F_6X%K!Z_`&A}<5@YFdt3B70*I~;XRg#Z(9<^s+o*cpMx({m|Q zJ!po80cdk@8$_b3MQ9f9I8w$Ez3X*&4$e!1iXKxY5rzYI;-siklyY!9ym%!$0P}5{ zX(t>s>6Sze08+Y*p^17S7;s77xM4Ec{48MR2yX9iGpO?hLdO`k0IMwr0*IBs*s_aP z932NPQ{w=qUju=GfDS&mKJ0Qi0}Y1&^-;|ALqIz)BDdxG`{?Q@8ZIafBt9-m!>R;AACbc4L{+MK!v z1y{3a*9Wt9$Odff76=A64!7%o2W|rg>eEiUf_)rM@cFUarRg+@A&7Q56|!!dWZG37 z>%uIKKoK-A_wkk*6^7*_TY-W}r?1wE9Lyo`=5R$M?I=gkqR#?_RA*9_s2;}CnG3Mj zR7788?!P4YSc?e{;KET)1O4qbw_Bs_p5rj5ZiP=J?~finpuDjPd-c=_U~;>A9q(lc zJ9OTb>fqZB`$MKfaDdoTK5>QH!6UH4i|y6aDP#udlTfefkXA2SRh@$I%3F$9xbS^O zK3pA}VptkZpvk%J%tUq6vmBX+@LCs|Icx z8$X@))QX?L6R%s)=pd0DMnPwJ>xrCD;^YX>VF{SvR?-ti*u%waF#t@Eps8Ps0d?TP zWZ-*e>J!p&A)Qpb33Lum5_o+eyRX^2eQvrrR4pwU;KhYw#Cb7^dm_{v_aAca$Z)uE z2L1!$_?c$9#njjgSoeROqyW8A4(sWhJgu!4Z&bjZLg^2shO(^}?5 z0+7EYg8|K-2)Oh*xi|sTcr}ux;uYSrgG~7oILK_!Z;GZJ*gZ8d;imuuhwaRQvr0XD zdU|F=Vnvo6_v~Du!A+38XDbcYh#H)ntjjbBCBxojFxp@UeOF^IBVwk+K?quq69%X( zPXIm!sxU=^9_ORxLLyL95QaSwT(5mvB_m3Aq*lx^M$l1I#T|07yJQ&loy9BPh%5-| z*_kVN^QgIBr>DuZx~$I&wyiILod7Tf;GYwHaevV0nc5D7O>is`xw%T9_2m{t9H=$A zE&<(hc$d&)m>+E&htF=|z2n(5MvhRky7DpSCUXhS(HT8WQ*qpaiB`T01rV&0ll5uX z<;WU)SGj#GXV(b3LQG)fe0q5)IbeXmd~+bg%qM8?o!5;186U0(h(|J?^7Ar8n*xS( 
zW95VGKzR=b0n@Ma-=vfl9tkmw7ClRdX$XkZ0V>#07qrFhX${#by8|MYW-dl+DHaP- znXx9*P-4h5-+6KkrXk>0I(-P#C>s)*_t=19@PJ@fxwRUG-GD$?FRHx<28RVC*np`M~D z&Ml@pw8aIy6%F4v6{-_MZootqu;BFch*!zy|1Z?PrgtUMZZ%O)Z7RTXQ$Qx%VPuc?aEK3}Mc6<<{qbH7s+AN?*>@dz>cqpJ80 z^YZJe;-eAr7)4b)Xh~5OttqPF_)^}gs)*TQJvBm66>CQqLk^EGRYlh?RYjkls*0iCtBRM`R7J&KRuxH~ zR7CBsEW<-RaMb(O;s$h&{b4^(zEqBIxkUi;*+Y#7QCh^8hlh0 zbw8?#?rW-Iap&(*6_d1>qglgpau^F@jgE3Of`s5dt17mAT~$1-<(vfKX4j>tij5Ri zktG=YJ5EkguwWH$PJq&3>vXMp9J8Ge4+`i9b>muTfOR z?!Q1)Jo8CaJl9b0NmX2+sEW;bA5}%8kE-JGre9VSubur_Rq@C#s*202sv^kF%IL?c zV(r&eMbzJgQ}RK#Yj;V4~*wEQ&h#9pR0<| zyHIETl&T0|SW^}4Hc?bX;+m@H9Q}i;s7Fy1bJtYG*x#xu@^duFd{h-3g*b+PtSV|a zeMMD#UG@vAV(-tZie|r4RrLQ-Rh$;Gc^UhgRK@#+@Dqz4}4Ws>`*!InW}hLcU4tv zCj3NIB>V72yxCYIST-j>5AY1|S>L?L5 z6-FmSM3Bs02fwGujpSKqV7ZW7-YnFiY1n24GV(#WA5)9sFC66P8Uui92aeKH72A%a=i`6J%BhMi*=lq?@#&VxoOu9F>FHA{Z5K zN2ZfC_r5lfAK6M|;GH!1Tvdb<8POC~ad1^t)Fi-m>Z0cQf2=BseyJ*=8zc6SsJ%MG zChiyndb!$o-*zlC7&+rCqqnLm9so&zDXJn#=*OyJq*hM%s;VgcQB_Qpp-o>^6-&OR zDqi?rRV4kODjxbwRdoJkRgtu+Dwa@GMa(Bvk@Qhjyz_fh#fDF+BI%E)ilmRKBJu00 zqTVm6ii1B@74bi>D#m`HDqd*1{Pd%$Ncvr>;-Jv|9gin)wL_6tHl)IS5!r5;-6C$n^nQTqAGU&jH+0_swy7* zTvdFssw!r!sfwiEtSVmoUR5OhnyP5_ovMibk*Y}gL#m?fXR2cTud0e>a@ogztSUYs zGL(F&Dh7O}Dh~ccRUG_MRowWOs*2M;rz+ZfsVb6IRYk|&rYauzs;YSLXH-S4UsM$- zr%Ha8s;I9X`I)My_jjs_J3m(yNkY{CbSr2j4;S02%YKj?$=?o`(f>25BIzrt;-z0! z6&t>yDn9+~s^Y`nq$(c%y{cjfMOD=MPE~yRkEn_l{-~;W;PR_Y5?vEegS z5x=S`0*Gs>qVB4y==!;;cu4AVRS|<-;;DQ3=z}OAQX3DV<4_EPx?!UjxRpbepzeBc zY!Wu*VK9$PM|jji7nWe2hRH%k8H)Q@y^PbPwyCeGipPBAn!i&OAC?}5wNQ%|d{tF^ zNT{SEr+iWsrAk*-MPY-Fs^WQ}>swvX+j}UgV(gcyV%O)YB8Su0RmJ-jLT5gziU&Ta zib_gm->HgQp3`l1{Xtc%jtlu-RUG(2Rebt2Rq+)?RjgZ86*(a9KdFj-->ZsYYpUW| zc0>N3s)~j`RuyqSuPPROuPP=}RK>YMimHf(i0%g4Uu4H09V}Q?6(47KXAhE2c>C{b z;o%M#eVp;Vs<qjr(Sxdey%ER_^PUS@_SVg8MOJw zs$x+lwrfpQd?gCEY+Y3q-&A^irz+YHPH+eKf2JzRrcY*PZDNuptQb0w_<5f^m>^o4guuX`)L8@paFwk@HJ5X`WQzVOx#T{H6 zQj6Yo$KkHeDyS$qbp0TzAS!#T>?5d%3hXg@)AAjtSm-;jFTyMm%ole(x?+cz^1dP! zi5&ynBs`y(>y}+~RwGp01N8zDV~nuF`~s(fqK%jG5qIXm!f>59;{tC)(aW1{@6waP zroGjw=OFmIP_e8WA-C5k7e9;9$O!$TQ8EXd7lFzR}}tC9|;)T*S87I>}#N>7)XCYp--)?OyMg11q#gJ+p%t+XfWu zUWQ=%GKP9HeZ&wJt$gv))@A{7AW?6$J$CtxM8!Q6%Swfxo~-@0c4v+&e(mChO7YA= zVT3EkpuXTIb7!5Cia!;`9*ie(?@yr&sa4J?=Zrbx+P_ zCmFnxVzXYGlmCm-(RA|InZp?FMkw-qaysK^WEtspW+=y7 zrCZ({jj)Wl$Vc;si=4wY+{tRu91CK&8)^aT$svNvni<$KMFeP48{K1VuheH>zeYPG zLJB3^L01o4=hUV3&7t!ccBf_Fgd0lYq&@rgQR#ZvacI-Rc(Jjvah9nB!O$M}>H7)J z7Cr-Zbeo}~8ru$_I3T_SrN>XX8^EsC7=+ygY}p^ag(=qzmMpb>!r8TzJ_$wH94(;r zc#wbG^eotn)t-gA$y(d5lu9HVw1r-KLw9mUh_)M&Qk)5QwPB?gN{B43!jxzRU! z#0LoqeQ{K|^6rIt=T4w@WX|52w+{j$EL9&obG}}c|KRZ6Y~6>}%WoRjRK;wCmFX6e z#KXiet7YCpZmBA&%t46}yb}VPZ_S_Q`a36np(@H{zP;`19DL#t?_;(8w|7@nMKbR+ zWVEGI1M3%p^hCZ<5AQtoFr?@OZ`ns{kxfA~GHH*lRZeATWuM;}{)ax#d)k~suk7b7 z^Q+JoFX5+db0j?_)3>SXyt@|Xc6OAm@`HLj^(!#&Bqe>GbEN_%m2)YSjxQvPiZA>1 zVYDld)z{nrQPAm)4{Kuf3(V#|35=fLqYqhFKE@PikT%deS5*0;h382!cBxM;RRKM&|)E2F&Gt{j`!z(rH>yJ4L{l5P-W2~7u-sRt?v zoe9SR847U96mT)Lh$-dnD!3>}0T**P!#xg^@-G(BV4p*ICOVLC9eOIg75^#bfKgHY zGp|bViX|u*llQg)*QeEi7p71wPv?}dK(5ky#=a!DNf+E&@M%&Ev(|(bJxRTUGE_ml z1=4ns)67?1gsy^%AU+DXm`wo}bHmCv^Hab@>>9X8wFWNU+%87Wp@@qO^&je|O<_D& z%z4bmCLAyDlVEaf=7`71h?y?L3m9_V9Qh_0 zx!i>$!}viK{E!rWhHicqRsl8(0nQWwo^AmotDumDpje8aRJWiktB`_)kV=Y>Mz@eQ ztFW$xutAEjNw@G;RuM}J5!)0Ihi(xUR#6WN(d{XsJGw;!SjB=Z#P+6$?e7+gU==4? 
zh{vReCv=M^vPz^{NMxo+oamM~%_^C1Az73nS==poiB;-~h1B&Fsj63+a{= z>Gp2v4%Ur5790ChHV$`he8eg#E|qs#w+MP{;1rWbN3J?%3Yv*un19W9`(R?lj!z^oZSg+}inZy7Nq*^9y#Dd25$9 z=`PECE@XCBkc}%O!A(?>;{edi;L2R}`oS8v9 z{Xt01-9omz#WHtG_3xJD3|6oWR>=(3=nvNB4AHd>F~|%t=?|g2^|iF!LwU;U(7(rp zGt|R2bbDs#j{Z<1C4K-v;4PRA3?KufDeIX4YH%l245iXiHsom`j12UA%pfQWGb@yZ zosE^7gPn(qQ)&YXf`?lG!HY#ggaweILIM&ZLK`JSWu?S5rI7L)C6(o5R25`3l;pM4 zlrZWlx|(Vp+M=5<51SuTc?S~Kb?+8^V?hWkRtKJcYjtbx15sv8$*XlZ`8-FNt_KF3o&5Z!N47ngshx=1JdV|5Ypuc?crpR0>WzgJzH{pZxh#Xq4gy8cvMR5JWb zU7Yf-F*s4fzJtGej% zr`5&X>+0ejX>yi)Xu1Znm*w}VivT)`x|sAWtBaSvrY-{LzBzSq`4`m1E9>gwH=-^s z1L*!rbrDGS&#Q|`>+0feXpP0UrYr1 zy7*sF7vFq?>LP^h=ha1C)^&C9FIE?a{~~p9@z>PFm>;W)i$AD~i(gk4Xa6~MadBN; z{2x~rTfS9waqt^g7mxh)>f%At->fcr{xNlN+0gVy7+CYi{<~Gx)`&r zF8+eL*!_*Gi`0hypVY;ab#-xFUHn{K9Qc2&E}mIe7uVIr|CYK)YxupoIPiC>i(%{P z;_TDc#lNU7CVnI8V&XrbF5dj8E-rt|>f*D%LtT6hGWtbz@y`DTb@BP{QWs}e)y2zy zhq~DLr`5$rAJs*V@6|=jy1E#P{7hY3Tvr#th*{M(<;A~UUF`lx)y13tw7U4_-%%IE zg>8Uw>+0eMGWl`E=@a@K276Ab@BG!tS$-)|CQ<@=99Xp^V`%#%x_T_!NNaQ7fI{tVn^Tx!ijfy z$d+wezCCr3TXsf(CzL|vR+R~Pei=BMcyQ(r3f$<<^38|orh_#0IhpMO*8 z;`4QNabBYyQeDr2Zs4qLK%yJPsvBj|O)Aw*+GvtNHEAol*|xgb1$}dS_00fu%iiji z2y|;qb!#HJEwj4qH2PLi^{q?j_UqN{b?DnI)wes)clxXEJVM`nTz&Tiy5miC2N~T7 zsp(|V?BcBHLTYx4)pW~h_NdhKXlwQw)bwuE?6a-ubJ4uFz2;tkX8+!r{s_&1n3{n^ z&B4r?!PA;UMKwd0G>5O(4A*JiZ>hQ8q4}V{=D{P)hmUI>zR-O1rsfe@a|BX5!lE_G zSv!i<8WXD>lhqnmsU6qWnlPxH*s3*YTRZ9U4XTT(f4RC?nE!L?VjDU6=hVfb|7ms6 zfcK~BqQQ^V#lZiPy7+osUHpsGMU`Ju7ejtUU7VHHUG6ePhmXdhJMk>(hIi1W<400o zO9{N}yt6xOl!fY~fGJAHpz*0&ZEry%9Xb9!o#m1`hjKx1&yHN}_#)MogJ)u{FXd|6 zchLyNd#E+IJNqq+GH$JC-+QOMY2yf1w?7x8;RlV~xpOM$yW8 zMw}H2CnIg{6BL~+Hk5W6KoAxIu{BRvOIDH-rRgH~wM--TeYld}`Z`g0rTZ><%c30` ztPV{{PMZ*xh)X-SN1y&-1sxyh=5e{CXy|3PEr+1HQ|EZkyYY`VN;W5#a*05q<%LZp zZQG1($ILuVN#$cu3z7DR&Scgt#EiQ(DjU9S+A7pUg$&)7FosXcxC)05^gPpLm?4bQ z7|5>jBq>TN~ zFCpL0w+C`OG3`cQPNB1&#&Qpt#bj@yhaC|Y=mgTlIj1R|;G*G+oMMkU5QO~@c7$(K zk%=VmB3<_IXjZI(MMT;{fdyBr)|i`g>ST(CJbSCjL}c^vxn86e(^JJ*a(Ln-?WCKP zJR&1IsaUh^h^tJLuYzPIhqrSNU7;kc5|Q3Cjo#BQ!|$}(6@0pwxS>KiPjfr z-ykiX$9Z9Cg9OVfl;c~NdX2fR_U?SkS8s(fq@%ce=K1Zr4_Rk(IlxU#G{h#%^TKL| z6XAg~kq^)L@4*0dwHdd^n>_WvAO3l7gN>L58i%fB&$?&ag9K5<+ItC{Ke3PZ3>J;4_)|GbhQ|Lri5y*ynI` zjw5f%2^@4bf#SU|mXNvJ%iMl9Td+Y1Hjb)$=;X5C9JmvmxBpx6U?*~j7VG9DJ)$Ic5W*rrlH7sA3) z%NRqMs-aL+Un^WAUUgQpF*w@eTF(wY+NgLx$4Z`Jn66D=WSRY_us+ue;&upDgp!yV zK6kz3ojH+Ryz7L=2Chy#$3l*RE}i9VYOX?nWqNB@yt5&kVSjn6CWj@=I3e-`bz3=B zd6wm%)dusE)O)Urzw(WKRw}SkdQM+`W0K9@VpRh7Yb?$@&UGI-wzKly>x=s?q^!UQ z(4(hJ6zt2YCC;eiB|ldZ84jZfIPWvqm9bcUj;~SGX?f@bKw02;(BWO6ssnwduCmiO zWPCi&gC{m`a>ntycJAOAI4XYsxU9WMRza|$WUl!x}^d@vt50Z`+e!m##3fD#H6KP#j>8*@Ii6$eVCA@ z1+G@zdw2Bq>S%_vvnk*|iCIw}H+Z zPjz)%X-O}VBu&j%#A=4B*Gr-=?xa6QxO7nrUW%dj8E33_-8%$$oQ-2C9e+^(U<}Vy zxd~y=3h?8PWGoX{S$F_2dNKdI+zqM8 zJXKvhbuiv0bKaI@-u5ou4j5mLIbVM=-*6Y-y1J;KlB%H5qoB>EsB5Wckg90Xqqvn# z$Ct+@raf<|{U%j=xksDKh5=b&AZZweUJMIdhs{cdGfju5R|g5l z3Rz*r(y&s!SXsEPf|agHnyyB#t~Ojx*GkVIO^+hfyv#-UyzLwB>y?F9^)|2b$rz?&; zsmOR;k(g7N*nZ>U)Q#-Ls?>t&)Q+06nVOT2YxAdTa~Er~it4h?)@60oU4GhdrnoV? 
zxAE$m##2v8#cxP?mzr}g-z*rsSvk{ETHabT(|YN3Yw@*P=YD}!tnIj5*IC}sb#1b% z=|xw~+pcTPy;aY8n-+R&-u70#yw}wBkI;$~S8@K$I;}WxWyhnt|3zByZbOpVR9eU# zC9Z?PM}7;f`0i`8Vlb^ZsYLJ~0n0(xrO+alubkiwE28f;Wme|60xG$DG?E1l89Og^ zsg}cv5$ASWK!inzJ3$(|+i&bymCaC`d5JjGp;I!Li|QO^X`0Y-`MPs(oBUy}%cjzq z_494K$IZ@vq7`q$E@;nLm9)!e4rrKNEVJTm$Grcr+;-tZ$q%$5x<%yX2je7x(RY@I z8P!IJ9U{dCJYtcvhXKpMa-|zickIrXhnvfqZM`j z5Uu$1Gg{H`U!oPC{wl3F{j;=U7O|bTJCTZg7XcuT%|AuY&zYCQb;uI#UT187Q}B5p z)!U96HVs71oCg2XLWURTP!+tl>)BW*_tP;VH>Do)@lC^onms560=C%^~!&1`73aCS}-#{2kf zWehWn7AJ`VM*j&~ar&R76>kHXJXch8@`4s7pL|9udN6b74n3_cV?Lc*&+CH$X=UUx zMDP%f*guJ3*j-Am<>w%zAGl9P{9Mv87Eiv@^}ftnLGnb^IKi-fjYpgx<{R0lpK`=S zF)HT9iAXx}Ypl|Wst|EFh(7+fE~A}yrF$g)Fy8_@d}o&}B8jT&+77dGm4rTCxbB~( z6)pZ8t*8KrQeYnTqV*Jp3b)9LC#f3~X&&LQzRm21K>XV}t%oc@(Pp4Ywr!KI)V-`U z(L!6Nck8xW9*CZ!Dt1+w$S6lZ(fELjbrRuuD`syAYOz2`%cF;pb*^%zT6 zrvhO?YpGP48i=Pb^t5m2ObY29;iP$5*?V!2jRcvYRIO{eOiih>ntG=9dj7+-V&4A< zt@!R2XvO?>TJam972o|oq7~I48ayDZ#+m(du9+{RsN|RtM$mp}sBfVQiIrjJb$T`r z+6sjd8ZX&QF(@Lzu48B;V%a11by-kD`6Fnw;$tOa4QLlm;56vC62{42{Q4fYjkBfN zcI|Qprejnbc-0c9tZ!VxXq6hXO$8s!@eR1M$K^*pJk2|hQmT?P+ThX&`>0AhQjq73MQiZUJIM1 z0BJa$iYB=VOM7Om{qzgrV{Yo@xwLvsytp+!D?Pi`$6dkWaRidGX73pVCAm`ulnjb|HU+VeavbUw+^RDa*+e}~u1 zwdehYK1%CcgVN%4w5`nZm|r%_<@Q?xPak-HuO3EA<<1~0ss5}pPuwgYb&Hwr7BiZj z|9Cw!RSM{t>i}@9pIax4co5F@0A`{N^SBPv8;VY@?D9%0P=gIg%MA&*;i$VLWiZ)_=+3k|*u_!o zJzH0?P)Z68+nEtJh;q^K^Dem;SYOgm!u{oXx!6Wod%S_qiPeST$c=JdV$;#v2l+M)|tM}3sN+OL6tbnm`KD+c4PI*iZMJx93BnRxm{L&_Rt zN*Q0?IJ#>hLi1eAbLQ#$71u{)bhOGIY=BjzC8&b&G}{1=z~nM!xw+&?X7juRvn3gp zlie(L9zM@F%#wdXUP;@0o_U)ln?^tGg+lHt30T#Nd=(kdw%Ky-1BQASdPFOf|IX53 zBs}!P$Q?odVYWwzO;DPdQnb*v!?d|CuUZiI9HX}47Z@JUs1TYJ2vAc4RBOqN^5E-t zl?R^Yp9d#X>l>9U?ykFj#S^%bVSG-Q*~w)(Z|SWRG#Uxg^#+rrpiE;_Nxe&~cl#vp z_5A0CZeXbAjOHL=uYXJ{DnMkC4tZTd9%{-K;zrYzHF)>&YooTuv#ZLZAOaC~%4+ue zDkQlaL7oxpOUfjW+!6!cGx^y29P+a&>fHAb;0NEJthmZ7e_HDIfk`YnBE-S2sz->| z_p%ZtU@9>z4?3Wa@30mzvqteQ@=`%hGrkqF>m1REq+^%y9OxX?jfJsXBno!r={tyr zwc1U}M%VGmW1WS!bx#9PFMn$&&NBTd=KXYD=~nyEB7=Se9uH>h;giidWTa zuKG_7_08Cpx-}maak%$vkgya#TJ6a&>k#SJJVtcrf9_Ph@AdP^;r>hgq1ytx-%BY6 zx{>18WxekYy!2>I7TH?pIQYuz_A#Y^DaR!D?RBSh5=EVc7JPfp;ccv$GWYBlDD&zl zBs(paISvQ+Pfa}-eiJaFVm>kV1Faau9K5eTYkO4VeV3bcl<`(^_diA}`W_`eV6-k5 z)hSZU%XoN#l zKQsbDNW`wULLxE7?wX2udyYc?)_E`r6`0%@0 z<&VyTI6WsbB}2|^ni5OoT3I|T7IJnIEma~X?{@a=t7o@Y-7>#_SJHTrp1c2+{#2!v5dsi7SgF-SzuDNd7W0=vra2+BZhbZa?9B? 
zb13WCq##!+rm(6&VH&Oq9Ag&HtCJ`Z*iH_yqm=HDvp6s@x8yVpS0OIgL{KQ%?fo0%6%MuQJ(Trih1MxPRS-ECXI6B2N2Ev82KB-)< zpmCEIClrDUK*fkP0X0jvG0zXz4OdA8YSdqFHJf&QFl&cwz}9YoU~uDbyAF8ZHgKRm z?X)Y{$MFQ8AIn{uPLmjdXs1&l>$XXzUDdHJ%;E?XLGyASZ>dpXSU$29D42BmYOTmR zt@t<7im9N+tC1`fukfB7WB?9#klCQ$6iqv@dun3BPXP!H+nEJtm3sK}^vsCFiYzKy(E4(VA`a9VU6+9FIlN2gG0cy)j>BiS@ZRxk8Y4%j zSzY;|N#dv7B8a>KBE;o z*J;HM92a%VP>hLDo6K6L`{x8$4uBY*+fBa|YE44)_&yPRC44KX*fZzJ;}=8OiJL6C zOMK@G>w;v@mdwY*+-eMUTX|payCB|OV4Nwxi+;E3jk6X-78m?pOWrH7zrOolp%q*4 zz~07|!h2 z;B{KD=L=eKVVzd|NGmS0eB)EQR#T@d0yTc$Q=h%@wIzl8& z`lbOq>@a|VdlEw{Yqv@9ER<+h5zBwnIr^Wb6<_}`T5;O!=mE>>F^-N5G`wQVg`DBz zORYH%ZZQSK9C|gLz*cFiekiEm8Gt2h1n+ZYj>=cWpGO?(<$q_U&(GQV@rEtyw4(Ah zz-IC~t(Yen?FL6ZN{ci#gF5*P-eY-**;?Kt=0i1jph@Jn(u#y{mR3CcSJR43yqE^) zMJ&gGMsX)9b~8cY+dBBSASW!Ff)aPqo_w_9sFyE;Rwu};5R5L^AV@c9Ma4w>z&I)g zw?!~2-i}NsYwmq*B0sW~$iO>kfC08vdR@tx>WCF>h7%e8AzCr=x6z7Cytjd^7eE`# z@UgVw_-K-H;0e*zrs#LNF^53c4-(L^XOE;!$2t?OmGx7PHSogee4pDM5S$iU@Y*^; z+?lJxF+$uQPEzM;5Gx_xUBSqawjaWhyF^G-p+jOW&Ue(s?_wPZgNhQVD?UkMoMH4_ zDUuQv2Yh0}CE`z=jZ2x%I`1>Ur8AH6f_XMb!o|QfxR7v?ycNswcOc6Tn0=Q$E$pwl&xDx9sf_ zV^Y2PgqQw7A_j2r;8Lf^aYK_)7*t8RbB2Z#6Kc0q!f0s32eWv7gNA}atU8YD?L9lPf>(Pd`6?bmqT z>OR4}f?uN2TDYZ;3zY1jhnwNgoVtjryaJtL@ul0%Tn?hoYLK*JptIaVH$@-Sd*LGQ zjCu>e;^{XnO5N|obfb?PJjn)ROCT>+A zM>#>|ES}9!ht~`%WrpP$&=ObziMZm~NDUj8YG3ad)#$BdJBwEVM<4)1ZIbTHvaSM` zwty>fF}PvLMSFQUNAUh3DH5J-q5(UBhpPfP{~B5`0mQalJmO*LVOR_GAEFhn|IgEk zajMeyner!(Gd!uBrrlsl|KFw+-~1+8u}GC_emj@ghY{*RRrEF=HS)=acnK8t#ybU- zZ2M9F)QjE0d8))9Y(=rn;&T30u@9R|#`b215EAOqHzhoY_nYHb@kbf_HZumfTf1i| zALTAIrLfj?<*z_+?~F4tf-3WrpZL7 zkUt{J(bOLSzvP_x?D8Egp4oC0!gVRj@MYVaz5eGs*$AFZ=ZjSL)ZU8 zT9Ncu(TaxnF*@Ae}fP+VNG`;K<9 z7N}T^5Zd*uhKCoKd7LGjZ_%a+Fb;Mhx3xh{mu|El@GVMm5!&hgcG!hX{_sY#(5D$k zR~^@y)5HuvqN||A5$^!~%v1Efk^8-;*mVA*r^uM_ou`;Q#@_KYPm$p}Pw_0h3!!)E z=2&~_nx|Mn@f2yl@D%MGD>=XJDGsQ7?$Jn6nCy9>Z63T4Q5`oq0pxK>ZA1iYY*Rkfa6(dY@$#_;J1H(5Z&1== z`I+}uZL$ROn+}D7_QN16P=ezy$D4|_BaeM7_Yb)=HOdV!uz}Kf9=^OF_?U2Z#wn191ofCov@Hi z<0B>8Cx8q`4HMK~@l#7Nka$APh~tgZj40t3W;qT;WLl~UvuY4uQ*4XYMKRUW1qRsd zrEvVtgd=hb@415`un(x+<8cZ#v*&EzKNWksSfP3ZYKUyI)ysl_t=*B1^#q9wOEwu? z!!gsaP1+OT31Vr_=@>lFd8EQ4{6cxGDg3@0s6_Z0A5 zmXPLLJDpDhvh!|!cD=PuA7b!rt=)cpZf zRHUGaV8$}H<&UUh#tZNIq|t1E#R@Zz;W%V+4OKkwoVGHaVZNT>BdT~Rke;Uth71x9 zW8OFR1*+KcQ&iD^4ONU@LlxnlP{pw|R56T#D(cjJsGs6D=ZZ<@vd!xyo#juXpo&=T zbc(4c$VJYCnM238mIXE>z<92h^ITu26|CBe?0fmFeR&)0tk=7x1m~R$JesZC&o&N`{+(tj!?lW(<91 zEbQiN*5;h)<~)7oNOlV$YYVY-3#mQ}S$0bWYfF`MON~BDZFVbNYb%3vE0aE}t?bs8 z*4DP^)((BvF6=fQ);8PIZFcn81hCr%TifnUx82`o8^LZzw6=>$w@c`=OJv`cYP~Hp zecOq?ZRFGJ_W9QKMd|j%efF2w9j;hATu*oS|3fQc{tT^X{RvfM0+0dHlxGb9YH%l2 z45iXSXlQ96j12UA%pfQWGb@yZosE^7gPn(qQ)&YXf`?lG!HY#ggaweILIM&ZLK`JS zWu?S5rI7L)C6(o5R25`3l;pM4lrZWlx|(Vp+M=5<51SuTc?S~Kb?+8^V?hWkRtKJcYjtbx15sv8$ z*XlZ`8-FNt_K|Mq50O(Q%B` zPLGXnj??Ljvw0Dxml=x zGMMT)muk5y%_1)?zCI%?DART*)AwbjeNooY??uIs;|^a%6%YNhsN#urRPp7LcdusW z7M`!8il#L;UeEj?RB=f!f>!h_zrGKAdV~nLb!8+HZlh|(0rghRiR1{K&OO9MB~FA2 zXBY~yi_)7Dp=vvqMiWqloO#h3c5{s%Q4LoSVi7Bx&WBoo2i+t8kG=bjYWm9-hW{m% zP(l)rUV`)@gx(=EK~So6l+dI}7eu8bgf1#gL<9^S1t}s@r3lgl2~7b-sRDuuf{G~c zGULpBes}KN=b5?h%-lP7{PX;K&R%=%z1LZvt-|WH5Tdq6SxD1bpK(K~^sZB`7+8BP z2FYp87NOO9tBcWgZlrD|%hO(OZ$23`6o@%lu=4g+Lu}2NIOzw7rQEQlFM~{GU%r9p zWbho=3>nJ0&h3c)zX_@s^cz%B;Lo9oL(IQH6?^{^R59q!ql$^YMHPGh0#s4*uSFF# zW$tUeal9|72^tPQCi$@U&!CDL{4?Sp1V4d&55IqSM(HSJMu&KE+fssVw;eol8@S4F zEdwO$U{(n#Nd5XcP7X;~2!$uqYvsZ{H}Q`O+R6OG3~or$aNJeU>um-E<;b36boAmL z3n^M^&(QZ0QFdlNetGW)JiKt8*3)ompv|CUB~5@unNG!Vj`|9^0`ipKfAXKAirBvw zswnacRV0RQ!f)Cd9<$NRH_Or^HMdUg8v-5V_(BHH3u9Z_Q4kFDK`@! 
zM{Dtx{6ZDgRvRD!-NU9dJl4F~DgUjgV!X)T991+;=@H69!Ge?L2f?zS>2nyncgGNA z>XX=8@KZwe$YsRS?=kFUD3~0JD~})5k$C!Zx+yKmBA3Ti;(To76YG82k|0{C#@; zYJKgF^Fq=;7**{3^QfZk-vLz=jTd2=M?id8-~Ot~0Fgx|(9>h&t3+Ow-GfiDv#!p#@QsdD922zuZ9Kas^e1~A71A4 z!nk2z_5Vvy#V?QyP*zE;#1ot{`(wi+HtMl1N(0gS(JI*)1@uRvW5UEx>o|>Wce6k* zwQ_8BWQc#rg&V;)Zh!(2uEH*8TQi^cp*npo}^4$KR)0Ja)rV;K7sIO!_~<%;?3 zybb$LViekJ3SAS0VTyv>p$y7m`HIYFf*6M%idSxmfzmYZ*P0j0spYd{Jw3SQ8qqfp z`J!}aOg6GfQEeQHbxA>vg0bW4`J&bE6>9zp(UPVBu|mKW%=)v`&>aEgv7~~5DK?#p zn6ZB$RPh_6f(6#`uHw#lwZ-|n-GK<@Y-D#pRX4Wy%{t?Ky7F;i)erIF2T5i*=1~ns zlJh!rdPboe;&Cu{jVIL>42w*5K!C1GPzq6R zYg{QQ`u<``3A4M6Tv#Z38R^r$H@7*b-O=h=3vrmtRF&lp!cP1jjVhY{qfo`5KZ_~` z{bNu?@qZwyxDGk; zi_WsW#b@U|r9DHmdP^ASWkB@&0L`}#Y<)6@(|Fa(A>WuOmlI)r)Z0?SOyMCcoyi0J zjst_oWd?r?^e|vYTF~Fb&|CPDVo9EEhwl02{+L_cK5LJgP~c(|^0Nh9y+E-nwfkiN z<2TL&4vwz*c=4l=BNy{~S)A$V)=)Qq+$fr1X8z&oF|-l)(4x%H9Y^Cuyom)J!iPar z7Da1ebn*-lX2D5&F&5Sh2r2yiP({;!2~<)1?}93t{%=MV1^!A@QRpv16@&hzQAL6O z996{rRjA@NjA6a<7Wot&r3)3K$QuR8&5(GaZs0g_$p|vdY=}wSCVODCzW5!JkT7v%Uien~~@lMxu8Vi!uBuB<3P) zEw^ohkbZS^&E2?t;k`_9*9w#LQ(1NIkw>hqFuhAT#`I|xbjemvlV(^~2Z9O0CFp4n2PjMF8Ox`ofO*e$@(D`P?pGVK z{25d+?_VBORG%#}dN*@mjE`I|I9V*iXki>*8-HWufib<2yMf2tRdFmo53-Mh`k~Df z^=yQKd=$fg{TwYhkcLIZdeYg@im@2>@X1y`IU13jUWm0QbsUK~ELYT9Q&M+^{=R;k zV3~D;;=3GQ{SJqDx*`UZ)pF@SqKXR$8Qb{q1%;=$1_Fb;{W>V+qtQGs*S{{R$npzS z#N7AjP%29$L6dbk9xe^{KMDxJK%yqK-tB%Qzj@od6!9@?ANWXq7QRGoJx7E=wgJi7 z2RDu2jLX0?HL=*$nsbKafzISXA3O(Lo6l-0#E`@{OoT6|A_>IQW`zl3xHLfIr{ZoI z!`VfN*hD2>=jb`G8c>bnLTRtj(=f`Zt+>kb^8Z2=f1!%MP(@{4?KnpXX20qlK^dC6 z=@{>s;d2z2;4f5>_ZO;oa~ZCx`?A~2!FDk2odWuadg(#*M$7D-{QX2RfX;$wMoq|vEl1Oyya?R6E zP+Cn3!F4?Sv# zt3$r15?|6}BmwM1=rWC;?eV8p>b4(<_`Lp$Z&NSDy$n}8(yUXLcZ(+VsY@I8eKA}0 zf)g#l0}hwYEuSL-33;B0B_5C&bn=&{Ezq=mdelm&d&{|AcN=BxqWQ*tc^wwPR}4JF zHfH&-3?hH$>9*p9me@6l)N7VjC!xMc8!K4wS{H9e3<4m5CZg~75pLDFxFkFS2RLI9 zT5o@N^RKw~qv8UEe2xW{akrD%P~^oK8i#F~%gz;}5z!&+NQ()Y%VMzhFY(|Z5@erd z_cZjDWa2q0i2E|aVdXNnCp_rk-x*cB{LhFgCj8T*ig6+dRkpuSMajP$Rm}b;K^43H z{-|QmpQ4JNVzh$(1FCrK&!dX!8&|Y;MT<~r=0++)*Tu9yV>9i%8;fn;%{btQ4Y zdTh+a)4EF%IFNU?zA=i#C%ard*>7vn!{p$;DO$R+ywO498`vmbeotrF?0QXqQkt4Q zL~O`l+syC;qbV&vnIE&q?yjT$Pk<^8G9Q(Av}I)|w<|x4gGaD%N>e)ADxNdQ)>UZ5 zxyrpb^u@C1#G4cz#VHvkMunHY&AW=P0-p|2#=J(|=mb#R~b45urw%jJyw}{Xi+qrP_y~yW8{8n;z)2k1Nk3N(03ooVg`|DFJVi%a(zCF{bHduZ2{alq= zNJ~lo7Sl2R)k|B`7JYl0or<&!v28!zYAt-|xMrCCv2t2;UDK6O%<{|Xv)y&RgqERe zzfi^Eh{NI~zfeWnT+0TxSiM5aumpPCWkb(mtp-wng0Zp@E$0KYC85~B3_LNx^ssty zC}VL@kIb$9?@`6x|39dr`u}89k<8uum!pcLe`-{*gBf-8-=m6;ng8afqWbSp#dhX@ z7^-HN^OHCr=Eg zi6mKyqMfS96l-5*Ai5I2-T4?V5TYqw1WPWe^t$z|>t3s#o}`543^`$DMV&FfR|@j_ zVgk?fP{`|;GZ$lip^8mw1X}?0Uht0sVnVO(p@{t$>{DP>pB#JW ziOmxJ=tuE%+H`4*FMN7hwGG%UQT+F);u!N^jVkst|3Ve#hzeE@ujNat7AiIE^Q#() z))tzo7TfIHz1Nv}*f>pt<{9G5sn_nO)2~Z<=cO)b9J!`_ju+(#9Oc9qR+xEC*Cn zh&0zZsXZK`^Y!m!E=v=p^nNku;;V!wptS()HkYAp?4q8(A&Pta-a9*O{~W5Q$@Aw?MH-$zK^3Wg4XQ}}o1=<_|ASFQB;9cW?z>$} z-&Flk1(TMFV<~YVS0^kS~1X-Jq^c)}P`t&y;$Dn+FL_qh3~SyXQz9Dz`?wW@$ou+g(~G^VI_I$+YM!h47gJtBUFE(ifq&8 z#@(f_FXm(&Wt8mtfe=ysBdRE@`ac&{lu-R!po-k8e?S$J{yeITiaFZ%aI6{-K- zsG`At1yw}w{8gwTB-TE%_(t1S{5;{YqT#nrQ-MwpU+9I)bEK5aIu>}ep4)XPB%EOx z5BHseWw>WISmCe6fS~=>lUI--iB#bQ=aj4I4;^48luo2JdI6ep0wwrWM2AE z#kafpG=O!*WJLXb8p3K>hhxet{x_&1^%tu6H$@eT|H)8A(%&0Zbi2o5l_e&ty5^x& z&33MxiN80**FTEuRoK;s4GiKIrascla}c`H$gi#6 zrv0P#9fb1RQ_O(VKT*Z=y=qPO0^OT8H*!yj3}RWLRXJBSw4aKB*f=Y0BHA>ZzsFssfZI?hlr!?%Z( zY!aR!MwBk!rJ*Z;HCfYvpfvnEYtC%nYd6HPWuXWj_F{5vUMgHt`S6kbcfDkm3Q3gz zgf`<<`a-$(lonl*U+$g&?Tk*xx^tH+-wWm((-DDuIol-s!zbPBZ}*ZP=sLd#$v|1F zv0F7!yj%IV-6rSfMX}@%IXn{nX0Jd@%04E zEzUi?&%0p)8l4}msGbRm^Js~q&nRPh*Cu%Cv_$K9-lmN 
zhs?LUHWHKW>X*UQXl>iek&}6~(S=KmMR7mV7^K z1`=5}X#J@uYJ(3H#or-{Zop4Oapt4TzWh%`QG7q{r=m!DR}KF52SqU-_=BQ&7JQ&6 z$_KQGQfaTD5PzyDVi$j>D1tu}0Sp)OmAc{dya$S67hMhi&QC=#`s103byh!+#qxoo zSO^>by`tz@32!u_+a%KcR1}kN2zV-rK2X#Rd13kgr6}6}Mo|R*s3ws3JTaM?Z@Dg(@EP z$T;dVcGRE6Ao!%g`3!?gV+K(yhUAlm@fn7ff1!%=R;QP)pI#k5{qc~~rj^sz>rUUt zo%RnAz}5tKCV~Drfr-_b+1i;c)0ykJGasvqfVGQAri;XL7im^kIcry?Ojot%u3D^a zy4G$6nQq3<-Aq~CEv?;cGTj}XyF0O-akW0$$^O+u(o_j>GdXlX@ z<1;-kKli-G>XmNobtBX3)^o4htlovz-ldt|<9_ja?<1?f%wL5nYX4DD)cuc&A{xnud<9rs@Hnn6AfUtdGw! z_ye10Xl#BUCrVpfJHQjoEbV^ciSo9o750J$gyMmp=E!DDlTg%m z%Qifd^IJ&qxOdJ0q!@bE{Qy!l_rHArDV`0y5FY4tz$sb><=cgp{S*~Xoi7d!Bc8rc zJaj?hC#vWYUg8o_{?k=FKo#Fd$^AqX|4md(j}8A2t9gJb2E^SX#a%ma75x(GPAB4n z67Qc&tU7QN4@AYAm!m&kHuz~Nx?eL2PkHcLPBA>KF*z;g09E`cDz0Z(ABc)^H|h^i z#g;7MPf;->+c7q~0Taby4`kl3Vj75%HygnWf!tN{QKJ?MZh+n(pMy-ig_~d$F%PbE5ppwR;z{?+upR zYizA7n5#@Gta`Fgb0zbBbm#q&tOs$657IwBNa(J+lk+g?-oyUI`mB|POiJUmr;YdC zHsw?`k9=uPFKS62dQ`LaD5th{bg8v)xi!D6Evu?+s;PteuA{j8an9J|y04x2@4Cus zx(hqHM<=@*S9_{gd#mdD%BK5U*7|F^pFaQew5D<3-t1ue_F(mgXN|2x_uGf-o(wMy z48K|&?)*6X=-Wuc)3N!Hv6pLOom*p%y2o2Ko_DUj===7f{nJFxhsmcqlbr)oos%!$ ze0n)BHoZ1IJ-a>qbmVpa)|=t+nZa)}&pywNP0epkEsVZd__(zA_WR=V*Ka50-o9Uc z`}WJ*m*3w`d|iIMxU%tMWqR)2^w)Q@i)*vnYx64~7r%X6-d$f>`=uzhG;S_Y3yp98 z1&ZQswrN}I?)Rmp=+=)I=6bo2| zd7V4as{cVzod2DoIFGjfKZ+tUe<4lx$4t?ILGf>jqRFfvv(`Yh5VKSGa5w5t6~!wD zilXFCMX_WMW%i}-F#`8HMG=?NexN8$b)+cIWe8r8XZI4gqIq~+;6x#;Zu6AO6$}vketj%eU(GH~z>#8_>A?Pt1 zV%#-eq)_qlTXZSi*j1$B-eYjZ{i`uUEPqfGx9Ewk6a0%z0vwkmb%MO-)|#!S63r__ zP!3eFP;Ss2{PwHIJ@@O4$j4-ox$i%d$m*I;%Ow0kGdxK-W#T|l+&CF-ER!{% zW>#5s@<%^@cW<@t-d+Xx7+3hZ4ytvh^(3fD?yc&kQ-T%Z8SMh6sHKkQxwO^eG;`3KNZD3iTAh#gdkO2uc`-h zf`3K~#KS-Hdqt6=cs!?ChiJcTxr*5B`n{sa{YORd;h#|y^VU20pE3ykPEm~fy`nfH zxkm^(P!yLVE^Ar5tOVjs`|{~ys#ntjEi^J3nWZd$Ub`M>8i>Lz zTMnX+S8TpvhyW)b1ot3&ol31-e&ur_6E#<-GvVc(vfX*WZcmBUe6!*k`ZwF&OXU|{ zMI-v3@-jSxi$+L-@ui|iFCSG(fkAs=cO0f`GM$xpwslDiYrN0SnK86Y5{V6?JRh$v zG95`V_?+&_z?`+b!-mLtVPYDzl0*PCHt8cj=nem9t4!5Gq{%1ujY* z&Ac!79P=%uHrh}Cm7&(5EW$xGy?Yd|0D^GQ)bs$1u z*obwr(>e_9VOy+NcEbd=#%ygYL`RP%*O&hUw_&vjj%@#IhGVswhN5bt`KtIMM4;S2 zq`5&qDtQhPJIck?*Ck1xH^bVv<0v28Xsq(wB(y5K_=x|C(yhs>(RH^vCm+yI96Zf* zlvs)seZM6;zQ6W+MNzQtv9(FQ*$wqSD2h+W@ddwA6c-K@#ZQn2x5&gZK!0?X1$Oe% zc&A@J)6tw6{DQi+G#t|~jo3`-dhj-iE?!P}S)g!W1Qzi&Ric&WDuHb=oBPJBe4r-? 
zKbKIZC4BeV=?>s*D7bVw5_p)rr$ZKyc+D-8gh+CswOh6AK)-XY0I$rOZvD_XI&a?X zv7>7;E>{$Dwg(i2^@oB^_=I)I2(l+jUIInCspXspEy|$rxQPd9eNI#>pH zsR0Fr#^S7ZM zJXwS;r5!PB=QCaA(14q|JLZ&vV<`M_&kRkSg~!Z49{VS$R@fLc+jjAoVE64_h()$X zD^9!*hV&icgO!r)uCU=3HLjnrS&xl~cwv)eA17spk__0HKhy;AFO?S+af=@0gCnc( z>Kh`g$s}FYEeVZpaR{*$VnUOEROkMrZ4%xnzaFp7L z?f`pwO?I7JybjuPpKi0gi=`WT)xY5a{y629306%*Y&S!M)3KIPF8`F~g!Y9zESa@C zCC;hB6`4&>EsvZ_6j(n_f3dYsy+L8wWU1^rXh%nF3Q(bunPnXpRXEM{ZpFIUUW@`u zVSKCI52~4Ay@N>+a~fPdEfLG0NE=#xA)o=iG0XC@La01qaP7?Zq6y?2xwiDg`$Bv( zex&l@!}{s;3oP?5XOQP*QhMJ9{fVL&YOR&zh>oDX#Gbrb-wK_I6Lu0bS}o}19-%i+ z^I|!&+FO70t#9h3k7KV_wU1J`r!0gUYeF%~c;mE{5z8jZlXIKI$J%dToeSoLosx(g>Y9806cENvS1=e2in^ug4_DUWsH z669O~yKz@Fjb$d8LlSKmiDey?>7Yan7Kz?Ze0y&bO&j9o3!o`x#)WXiMY5Pf*k$p2 z<$4_bFhh3E;EH^ z9U8$w(h8M6V-qAEFqHMc!)Fjf*ceAk`EVEOtF-5XT+WG@CH4kgUD?JD1X<>8Cyw@j z3O!Xt)*1GQ*YZ#jnWGZr$`KtFOgH7B%e`==Eik&#y&)`BSX8ac8Kmw06Rw(zqY^qI{h~8RPG4uDTT=| z5v6W(J|l>(u=XshL*Lu{!gU3SNnK%8yxNYLvn5_E!b`RJ6Fr&CpWGGHWholkDaron z2IZJoJ@xQW+Z!M>(}<-bY%%j;Fx~A*n=isQ&e@?DI7oGJ%By5&3m5PMfvjhCm`5W* zVTBnZW>Ql^Xw`OzB>{Z1n~mQljl~MZyl)_jypBRHrGRAe3;Me>Np&~9I{hE#6z z^D{!uqgM%+<5M+q4R7yFvVU6PsQjF_Z%2V${5M5W5!VYup z9z~Z@4^JOMt5R=AY!r%YFbH}VJf*=dtrwYb7F(zmpR_Nw-YLpnM+Sf6;OvRQY=CuD z(Z=3vr!HbLdWv4m7TGkFoSiB$}%)XA^f*RYLATvSTq*$~v^# zAheAT>Y2^$H+8pCyf})Wnd=u0z{!kmmG^6pBI{HM5qe{dt{U zt3B|EDx{~0VV9EI(hhNuV|!qPPHBgXx6@zw1b)EGB#F13>c96Alww(JP7#HoLdJd`O_m{t$Y(rA z?Aw*yvqdTUPaw+Qi`-@kcIb-p2*shqTHJcA*$x~T%5h2*^`<%%!p}&EKl4prvZ^|? z6ekl;RBQJ(`(kkcR8tfGq+l)pHyLkmkev4IM<3`i?^q0&FrOp=%9LB#UF9dKqHQ(-pg$VX{iMKIe>0XDY@%D zQj#jVO2F&I$~|x`S7mf1}|MZ5o(H8BtSe+~BOD+pJjVW^wYU zzaUa$vOZFcn@C?gTU}eUUNoHo>sW`n0eIi)me83N(|)=UIV5HcO(ag!&BCu$Z4%JY~zIqHW~#_>zov7B+Zn!)nK)#dp($o zAPF{SWXUY*H!0q*h-iCxv_qnWL~m!_PnY?%XdI_8=?#~> z3r-|FrUKyoDOlA^q(Mr&1c0cmE;Qq8wG`KI-C;Xze-lD#t-qi%hx0Rq-c_vS zv8i{1Y182z`qo6Q;*4IlZ=pR9ZFlNe4t))%euN2A0NycU5O|5v-hrwH+}7VG^?*Bi z`4;YRp>ySMx$KHiNfa%_2-P;DbE4tNlK)4MdrP;Ncemtv5#dGm6#e6< z8KG&j&fsul?K*Om+84@?%x`Q=PJ!77A=|hIQh));b;c*hrCZOTu7#?-p)ke<)A{1y zJ1dyoY8!)&ryc48t!D>VTo0JV0pR`sm-B6mIg|6dz7d%g!jDHq+zoDLyT!xlad^6E zGwGrLHB(ivrz7JCp=ufcSKyzTPf1J{u#u0WYYFX05+Zz!4WWUi-!a{(X6_L$kLHiffoS0^F&W$rLJ4~NQ(vN(^j zT@faJ7aeIY5mSF^&N*f=#dtIVIfWnFlVm)Oe}J22d%qr7A~UX$Yi(lBXfh|LbMJA? 
ztpW!P>Qah^OvLz^L2gUx$WHc{)9y(9SGG;vLx5lY}nncNF{Jm zGIf$)n+P2im%!Ow8susPW0|K4(VYC@Fl>22ey|w!!TbbdEFWgBcsM^DA`@^tf0W0( zgn8o%4}TLZ|COC2OZOnAH)86D4N7Km3UMF#sEHv&qYJ)ofV%F>Vm*kyu6mBioT+vl zyqXfQhboK5m4&}6-S#d^P*tF#4KCpF_;lsx723H)cd8F*K2J`$Q%%TQ5 z)FZTtpke|K`Ets2{WBCUhI(;jdbh*CwH-d5gL1VAJu5hm%bC|sYkq7ZK(Pik?6*96hxR+)?Lp^y1kA2&wj5uFA#R_rCx zZ7(kuN$9I>-?%D;vXMa9Nh~`Ve^_yclc;SKFEPV;7>`>I!8C{+stMejF;-X8zS;p} zArY=1@igFis9BwZ?rPl!Ue_687sEJsV26Ym!-u4+zFkqO7vdNEmL?PGU+`*cqNh1$ zn^G?;Lxy2agx8Q7B(6R6CDQ!%WwQ^OOu73bOr4q^t|c>3ZplitS+4BaYA~yf%zTie z`6M>m^aIrVxw7|o?Z+c08J|tK!V8iImbD2>OfQmWD{|N0cR*aLk^SJb#|bEulZ2e; z4JQf|6MdzHtkxU3ky!|Sw91TVMMGX4rCUOXnxdSr86Dozo0rs#p10MmGnxibw_>HD zoZ3J&UvXHs>IuG68I^4TWPUsD|w)`l{8 zoAvB=U$X;hC<--xn>EHOE8*BN%Y7;n5BWOxNT z+haY%KRCp;{sxnFyw4mXwD3`K9DxcA`zByZozad#9FlUIpw8+h(0@L3*|uh;NkwET ze6-}rn>&13@^*rDBMZke5F#aRcg(VLGiBVL71@oxJ&~gl@?l|ebjg&W9g7sQA3NOY zq$x8fJ0*Y6pwVvV7u%1&v#PMG^IM!6U$v>Gdl|8zqOdku<1t?8{_^>Ihlaq#XT=UL zKAdW~@cF~y%NHM=+E2VVfyg-@;M|pXNczmHi4E687XIc_DwChw2W~3(zvX83_Ncn8 zeNFh()YeLa$hmFuxyPRq?KV%}#_pf>_@eGr>uhrKz5jDWqgOp>f0r{&i>Jz_K9c8B z==+zAr+N-mUOyNo_nnfy{r-o-){6$GLSd)ZKfYB}4(;B&hnPAmywp1oXZ=dy=7Iiq z`_@(70-qCSzRz>NKj~2Q=3sQuzE; zD&ldOfKR#DTy5}C_0J1;hqP!0(@pmENUDo5(x;DY4d>xFs;TU6iq0KXlD>|%Fgc&( zQD>rCckIN8#ap_ST(#NYHVae1`7u@|tLx*|#^F{c9o?IX8KaXr^h}~!JKS;|FGnPt zJTa(F-!I=&alK7ZEd6+&$FY;oZO^i)y4gmkYgbal)}0SK1TpD+8S&}R*}NM(eC6T2 zkbB-YM$SDw@->5FYPsk34O(vxhN!se7q2huSG*{H*H!U+>incp!(?nePuN^stCucc z^bWs|Y0{P4*bKWfcN97#$H{N)(yHV+7RhSeU*e@Bsqrs#(#}nGh!g-s-s%l-CeAM1+-{|I^if2x(qT%iN2i>8(#~}4m+g~i+Tr{Krkt{Jy6pH>pfV{Q>L z_n(;Yb*Sq>4+}7mcUX&W;_=l7tTG)s+H9t zEsw?oqV%{d2_4dr!%Pg3&aiM=cD_7MvO7>C<~)yK5-V;Gh8#XGG!hej4y&W|{4Q5T z?|cS6f$zl326rtxio083<`#=u@d!N4-WZq#La9 zYIg@jN7%^5{>M7|jiMtErJ_{(M- zqh>|xxk4DXcS(Wtb<27ZQ|2trC(m+0?|hI7B34}L|6 zJ?1*=!UAIXUE+oInFnh*Gn%K2Q^Km-6-u!c&Lr{#IoetTI)64WvB+><*v6pO$}d<* z_*Hg`E(3bSK%MmyL&O7edXT*VnEw;ws?H11t`NTqLxCqYOV}Q*9PxYFMUQp}KPYYD&0SE*Ke9jw=D}t{ipW>`Y;^az#H9q)iV!byTF4qt#yS4pX5f zgm(beq{AjL&L}C(crv`B2b23v&--*xVqT3amTjBS-!@j+u*d@=HjlXh=#%<#-WjJD za$ln*OI?&)3ND)(tF}hx`fS9R-FD@Sr0py;F1jo?rp+O&qsRUlPryua49VzxR*9o| zA6)Uct)%+1(ChUxcB50zy}oopbcnB**oh;TGnXmo_ty+lzDy^Y%Qs)}ToTSPiuXA(t2PX{3BiijY99e@zkldsqRSjifhbdWrDP_xz#LAYeim%?P8u3tq6DOHhs%lwA+Oc&QHh2&YN`c(4x` z`z8k<_5|N!Frn?BL7}2i-n<*BH7ck-4$A+r+!>UC5 z=$xoLU-tAtfRyt$7SDMO-Mwy{yy@YDc;_X4(cN-WmYv}nt^E!kcV6^V(ok=4f4N^8 zQHiF|U^l-NE;SHgz>wK_<{j>UWwl49KO$XNqh1JE;OR`x(ZEH~j07O&rF_3yJPrT^ zQPo$};@L!rEO!(!iP2*Ms=#^`Hx~bT9-y&Qec4$TY*s6$951X)+*M1>B`dPeYYf%G zfL;e^NlW1tk!D`Hk!nC&z$Bp5R-5tP$nl764bwsW6}P|ZY(XGz76$)SC# zJ5-aSg8?W|1}HL+{D~?kbvR=H5J&)qjM1tBspGdAl|)mo927;Pop-rlonRu0R>4aS zz4wX6ME#LB{o~-Q13gmRYgAX$QCI6YiEg2G=K_iDm5xz?j%p&DluxJZDJHqq{GI$n z-J;f3E)|hNWF97hv8jITK;`9@7=hGSfm9`c=q{Hk3;^6+8ei4Wbn2Ux)58cLo(A^)()RcxPxafWkK~YSh>R>7qTn1R0!Sry7{K~+Mvt0;5qjP69 z^1yv4Wx#|A(*pXq1AtSu_4EMn+}HrV4z;8JFy)4l3h=1})l=1R;4-2W9s+M~G{#5! 
zUed&nfGuNV4vzi;HyyZ7Gf_SM_}Tu_6+VE>F(LQuG9EnTi<^$m0SAPsVZG zYQUr_zm!B5cXs%jsRDT7J{r);3s%L*^f_&50JH!j?xDww5`ZHj=3^)hs{3;>QyEPJ z-58pg#{A3ZsRb3#gE%e;q7wUKI;=iW*7SNoWkwwyn9xK+3;sNBj7WI&^z88#fg5~S z)+lP6zi7e*YE0l9LWdC7|El*?i|HXF5{+l{jfxhsM5=!oRN4*Wlwihe9v32z>bxv3 zTodP=oQiH&;kgh;rxEW#JKS~L2yqI^I^G4pl8RJ7J1J{(9Bhf;fN@2{(2D|xtf8o4C ztoZo{({2|5e3qKS7@wukZ>U>d}cwv-Xsh)kT5?4^y~3m88-$)q}}^bK}|ogAVZ6Q&F`B*ffCflnlj ziJmY!p)UydjH_R?ven0$=x6Bm7Ohx}Tjg5Oa6(`K0Bl_gbBqQKSPv%imql*x+p?900LZrO^Yx01?gIInZ<)YDT2N9*l>87xqZtx0IBHC+ zMxjQXvmh4b4P)u|!G@CZYm6*;0bPH%lYE%_j#j(*6!Jf{0Olz;NosEr zjd76@#FWbu_MXj9G+H;EE97I}Cr?0W z0qF8vgkUPv@a5iTC%q8nEHT3D z7(1ZO63@Uq`|K8s{F#hSm@*axK#nyPW3k!^&ie91&ZD*%{sy+~IJtHse!vAx1sEx( zzrLE~e;&7Kr9Qe98>|UmV;tiZP%Bn@N;9DXr@{goYPi?QG5p5`iJJO=8Dp^4p8 zcy(SOl~dgW!x0muMr7IUMD5w^Lp;0S%1)4pDYB5~Rxa$ZTD;qbXIfWN_fIf53MOiY zmhurm@-8QND2CT&3izocLEv_#lu!f}!~?KLQM|6VT4`3(2pf8(qrewx6ZO3^9;U=) zwpu+gT2xyk@}36xs$sR<4)ukujLLJU0e4IXf%1Mam^y;-l20upk(p;J9BahG)RKIg zU5-zAXS@v#1H4<$_;kBNu&KV!`iu0@bf`HD+Btr0-pUZ4Nw3nNrwz@q|He(M(XI7J zCLR(vRXR9V!~;aB!%3P|-*`@Qm`~_Hr=yG;xiX0iqC{Mo=bFX@)0fy1HR5;u)VY`Q zDbMopidP7|R_U+P_ajDVGzm1aDOdQiV);?*P1!HSY~kbVP+nA)6UsZv%#Vlgfq(hE zCB-mZ87>3V+}dVoppruI?$-C-Bi7~;EtlZzaWw0FGz~-{VRZSpdyf#BQSoS;d@3%0 z7-dOxMmdi}5P@~_buJBBB5}%iitV}&tZX&7+BscM_V8Pu;zK&^evdYNU7a%;?n?yF zdfG8hHmYV#8B>Wg4K>kQPbtjnw)7MECs%efK47vGfoo7uMgWicn z#PPSrQ0T!E*8X2w0SF%O35%no`=4f=I(`UljqE7V1?@l61uGN1*qfW^2O1-_O(it< zZykl54;U~xi2r^jWF779D!*`Qxu9D2d<@7INca*_=ra?* z`ns@x+DqOm!)c!)CVit+Nb<_iTi>pPOa$SyR+cr>WJPG@MQ9g~t|-2MjqXUMr3CNa zn#c@U&h+P}HN*v1ZpVuCR@zYG7lac6YO1vPKBjz$1qcArvmZoQ-a74Hfox*>(D?cN zfWS!fpVA(!eYX&~T(>!1`76h@>|eyy!Mk#y-9Cod76 zKr}Fmdl*av&8UpD*1)%kU|fHVD=@^f9ChkVaDxn-mIlENgo&V0Y=Ibqn1_W#dgTfG ztSN9Htzvbd1Q|`yE^jBk+(omjRi3o<3P| z?&Jw^(Pk8tXd2%TqyEN6ty;Pi!S_ji?Ugw;y;J`^9_BUENDIA72|wD=P6VN>^D9=s_*^zX zDv*|tG#Z^k2g@B5CKbS-;P71Cpa1~G58lP9Q_*N}mRaKSHE|j|)E6v+1aSC{!eDxE z)^*D9$AFKhHZq(2Q#(gS9Sa0M@>zPOJsQp|b#{LnH`f(!nzO#NEwj*@CSqB6_M7b6 zfovtU=ek#d-P9z`RBV=>h(~he{isusfsC?5g<7k1znHjPs&bVnH*0w4;ry%ZwVdq^ zSK8&ir7lO~)^8465h*v7tNc#gZxx&R@%j7Q(D%>lb3N(O{&_WXTkl@f`hRToyl3SC z_ernfukkaZYo)aj_xgNi*NcQ|=zJHLYl@LcCj(WX-obWXl` zUNG&L=9jDN2G778nu{8)oh$|e0li^Q6>cmrIsMqDm~!n6$bx;14o}7j+s_0Ws>gc< zCEX0{R}snGqZkP0U3k~%acR2Rg`Id}T}3qil6a>85RH86aghnTpqrm+q*|03Vt;YP z^>$J8osd$QjZU=rnLfvy9sFaj3%hG^2a00TyNTTNepQKkTd6L#?3itHiK<7T3WEYo zsFJ~ZNvA+YS6jtj`V?0k{zfhfN!lo?9eNU6lsWF*e58Iln`8C=NR=cnhwRQSAq=>tgGr z>NB-5UUzs0%#JSw#Z06vNY`$wK|*fkw)b?41@EgpLDcH~2&&rdP~W7;j)vD>s?9;& z>c2TjQ>yibV3B=uQ+~4c<_pxP7~W$hq*pm>s1}aC?1)0~5xYXw9z^!c4~5&w{~vpI z85LKa?~T5o3U_xX+=IIn79dD)cP*SiNFZ1hg%$4Z!9(!iPJ%l@5<&t&0#OJOG<8W& z&z$a=K0VztbIyI9b?@7Hwb$BvuU-G2d_UoG_lm+&L&WXE@?prOOa|tudj{(UO{vu$ zFHcg=iJM#GvR%Ip;Xe-jLABZNbTxrm9K0iW; zIiC01;q7Kli9``{>T(j2N$0d#1j)Bu?kjI4A`c@874m&2sN!{z5(gW5T9_}# z+SF>lP7ztu9!`nO`5vS`xh|1@inj~Qx9>Ixc|E63IfDa@w~zK{yXt=`zzKSblUZHV zkB(okCy6>?Lt?W{f{DKhl@X_OEV;w_FN0Cxm##- z@-*lSBJUZq2@;=u>QH!+&6@6w&k42>au&qohi*up5)W}cRm(Y%8}ij^e>Oeq6aJ}j zJ#@||#9t*(YlJVU!iKF9n;oiUq>-}+CNJR9HN5SO+M=NPWikSs_P$~eTL zMKL^#q>4D zxARP$=@sC215e13H;9^Hm`Y?dvys9rC4?bdDy%$nNd1;l${wzph!UK5-hAUMS|8Rd_k4X)A9}@W3*=oHm=;T3>?(J>YilYJSYLa~O|6e! 
z-S`evk8Un^eJa9hW8fu~Oq@7fF6Zyn=KpYJyFu1XP4 z+Z3KYqizw38l3`?Ib;vrJGv9Yy`o6hcKM7-=a@jIrKGvy%qy6onNYr^gm$@OnDDlH z#dD0sM*G@NM4;OB^s#-S_c^a|tUK1-aE?HI9=C-#Ki1;fWXsVk`J_5VuM42o60axd zS;zwVK^*Pj64H|Bc(IN6{Qc_`1C4_5aV1GqB6b)CKmz6%w70AUrPh_tMtFr~w%EPn z@@fq5BiJpKRV(6$2#sf=HPp8g%mU@ioP`4e)^o|A8=!jdp@6AR42zp0ZG9D{cW805 z^7DrwuJMQY1CkhZYlG4e)a=N+LviJ^Pd+;kO(i@#~Njgbo zIC6}PP&g4^z%aGme%&r@{yDeQ z^U+E3A5C-0FMafH?poWLdlDJMkiDm^IDXdyE8qbK0s2Ic;N%W-Z*%LH0J@zwpQk5@ ziupg0<%bL1WM{oH^@;Tf3&JoeljQV%gpfChrb&^M996#IJ3fhU3II)JsacDncWDWZ zwX}^-0ll9Q?raP7;d6+V5PUHW!seW1TRoW?FuM$Qalm%`USoQ9gno^wQt~rVrzyk3 zweI)E^x!^}Q@Jzv|ZbwCnOR|{VmR%+Som2>9&wJHhtQmNGCE1Y+#OGJY z1~Q2bO58iZi%NoWT&F0iWlU3A7&nm-y_R8>XQ@t-_^^C&_T8=a+mOKbN5I|(==}Sx z6E@w4Nlex$#J(4s0;qfLjKLlwu2)kEVO}rZR$QPI0GPQSF>Iai!Dz2vN%aq)-2el{ zi-GdCZGuc79WVw7H{h&- zc8c)aXoZ65D@XBq)(vG_bnZ?T6D1~JA9skq+ZGGq99e0ZO`+fkB+O*uC1mp8S9iu| zByY;(Td>&~5nMrBTM(R3rG6-o129FFxY6ne2Iex;@K7^*jFopMh(m|yZIKTBR3m!k z4#7jhwz}Y+aQR?(NI5gO2^U9v9gO+bwH%%I75F4Oie7s7%j{g7khua>vF|C+fQE&` zJob8-gTvY0Ob^)G5*wsA%*vu4Vn-k!KX4Vu!karnEWXZrjwh0%!)rYJ^~t#GdXng? zL4jZdW&D8iK{DrEBBV2ITEQQhXFuRT0WLI$B2jNFiy;E?N&DY2iL?G ztD_dYL2j!Y?9_YLzU^`lA2P@VTI62W2~|WVGXx6U%@noWc7T){VlEQjC1#q{rX@vC z%c5!8M{!c;1neg{LC~Rc!=DV3IKdkZM-SCsI<~EEIXG_JoMsLwh!u}V7l=o@{u`#@ zg{7XGY0rp$YcP?6zr<-*nZy<7iqsG))2Nj~6>zMDOCk{*s$s2nB- z{$!9BRzR$aGujTc{M`eNXz#B(VT_e`DBt;zLVfk|$*x-lj005pN@jzEc%$16vrLVa zoCGkn+)jtQyFad%`*Pb}W}0p=B$sPG`sF2qA?fR`RlaAYQ{HE;)QDn<#D zX&*)yuh`&fA@FwSlc>EAVEW`a3i$bGy6}3g?*zfvU*t?a#NPrBY?|^|y1s9mZl{Rq zU`2*|IPrsV@xoOI{gI4wgecoe?pvUI?aip%n{b>*4vrsPwRE>%l{;T-nlqc|e}6$(cnITMs4e8Ro&P3Ab&~$-CE!CPs6x-jqhRNDx7Q$ zE>6jOZkZb7QkX4hSYDQst<#X6vYivg8{_o+jvqhL-`eD#o%W3J7!X9uU%{N0kD>&p z$k+Bu)1!@SBo-@SE}kP6&2^59l_}=*mz|^P(I1&kg%j8=q64P@OkeUC_A`4Q7f+#M z*(vExCA^Km*|Fpup-}d|n}Rt6*d^VMy1qI-I9F=7nEx$e_+YO}eABeWbocKtlS1`#x*-xUSyVH_L8I+4E z+uJ9tZyx1ZJ5ko_Q7XdBh_Bgya}HdTuJ;TE;@HHCUOseOfrY8!WalQA6!y~x)Q|S2 zQjXe9U9W(Bh*H=-@aoo4Xn=_y4N=>ogDt$iuR{dp2;m(eIItd> zwW6)6AP;GM=&b~+l)_inJjVjU-%Dch$wYB`Q@BSUX&woqJ2Vs{A5w;FN~@1>7uQi= zAn=%OAeWRSRE;)=X{n{6I9B>+?s%I%x$QDFPjp*tOmNV1AO8LI@R!&Tsk=a^>{QMR zB;TqqKHtz6R}`GkaFM)tfQb6>Afw;GNz`b-K7NG8&=f?v{y@3&AzfAjZkGc$J3R~8 z$Rj08j(!+HF@E;jefQC+i}%AMuj)dOn6A2_9+Jb#YDZp2Q8lB&c=!Xd0AZF3O)=eO z&RxiMb}~JN_In6A-qRsj$Y-3|VcF+I0J#|`@=7Y@pQeu zi2Cc^o;H4>qd@nBvVD88V4i#hX3W$~XsG9Ec%$}dMM^C|RAQK0J(}|Li12=bhRiVf z8E_7CU@Sy0udAVdRwrsdP7Z}9yu_iF6K?Bl7Ef*-ui+Xl* zEaxRLeiiDWeDJ2u_b8|Md1sR-E_ZDS{hO{097~f!eAdHS_ED4|N5|P&fg^trjbSaF zlf}o%%A0;X{8_@vQPfwjL_R>gXO*=QBESmxWNc}5eA}ST#5u_ z!R*3i6^%Gaw!&H_!Dxw|zo_Hpb5}!+sukf}$~VWs1NdD2`T2wF*24n*e!_*{F87p$ zhlkvhhfS1k3NGD*En`wjYfgWn=i3;*EI!(xR0Bthxt3vi`W(SNNU($qX?_3pc|#e_ zgtL^c<9hq1T(D$Rkz`x^i9;gH@W-5mQro+e3jq$x1=R*bQG6bfvJ3@34fIkV@&KJZoImW_klvk6kiten+(%n_Z^h%Ph zsqCz&mwwdN`KV92cBHd*=BE_JH_~HF5zhmz^X>h33!9iZwSV3Hl=>>d=59g^rCQjmSB*8TLh z>@$<@XEw6KF5Sc4vLpAqN5W)Bqq|3wWXH0)$BJafE4#<*WhdIYC;DV3N4h6xWT%$9 zr{2g;zw4giP zn;mlSK&MuRF9%&QR1G>fUM*%_0T~gr>S$C|=;G~ga?1e*-EbaVCHjC6=2IZM51jlJ z8h>`r8!WKw-5&CGH1VUkb^#bGG_KJLx7CY)9U_#p_P0R6tKsmOH-6!%f22|%pJg}(n5dN>!p z>>bWJ+pEz2{bO^l1AdqQcKGsguM-R?xdn8D^c^{1O2dZaC8K2Nl|^3xkD{@pAbqC> z3V{&{3Qv*WhLjxx`cBi7PomY%FNTi$P>z#fj+P2XX$aY;W8W^6QNRcVI1qbdi>A0dcuIXdwc#`^J<}E|6h>i1IRt!|0dVUD5bop4ho2sQg z?wK;}()x>Vb=TL5-x`Z1gT56OwggdmTlWN=og80K`PdF1a0Jp#u<5A9&?B`o2A@?* zaECxMoTzWz%7NDi%iu3^^73l4p*>_a6>$DL-$ z)>`w3K`fSpO?Bc0;Y-=!3}1madxHzrf15eI{|)J3@I3HV+9L%Y6Of&Oss*!D z&N|52=vhD-&-wSybHevL$UNdpFc>}JA4W2nLuu5rFHb&v=>v?Fu+v7=y;lWeqfOTI zX+t|)iU}2DcX=FaU#Mimze-i4Y5UM%%ifZ%PF6%=;k 
zC4Pj(Xeo{pDZV~=>lWBsX^)NXEl9~tkbwz9nZC-^B`$n{fpLXU3q~ci7R0X-@JJBV5az+-fDx#jm$GCTfF)HM7eg zOC~w6DK&!JL`D5&Ga5gSy2{-Y4KuF24FW@yG6rMzdN_eR^+p7qd3wc6S-&AT09xWm z7Jl#L)0EctqS%>MS9_;~VWolibd~MvYKE%VvUr8rLKA(4$87uqu)mmgeBP){hRXAQahO*^RpdRStoWvbM@uL;Y}fw?{EUEMtqfv3(Ah8E$_cOb_pqfzWHkFC^PXXf$z33 zW?rW%UdF3vA6_BL(ca}A&NVEJ@1rBZpg}r2pU?ab3ziC!@5necFc8in1s_aTl&E@% zqlCz{x}m-p9uPuX18&4E+{UzOJ`QGmJG>aJC%l66b9ALVgz5Rw)!7sWG3th;#~&km z;4M+OdY1T`8M^HNBF_J&rC73Q#6TR@eMKh{MSfql_KRE+@05uBg|&NcP?`Ql2mvc!=BEQ^dfvVRUJQb}F zV%($?8Ba0C4BQm^7Lx~mIuYsGlVcFT#e$)v&OJGiyKsdqIEiPP`onV@MN84Q~s(ob&^lsv(##eZWVe<$r?y%OObFz zah$pn7lU7^t0f}%J?>ZDD%uNhbz4oOh)%8Mi&Cf6(i1&gHU5}egQvzttxc$7#Y7z5sxfElV-Fw&tW9(y^a5mPG&0DzOav)p=`@q~j7@yJkE3p7yUjFmDZwx8 z;%En{6+=6i$$LpgeVI1RQI_Ck5^Mcuf#e(ILD_n)5E|y13MU#n!B|pt90wO}OFKLp z>LE;S1Q-gB(^;=0PLIKqG_J?q!7SFE7|E~`!IUWf$~X; zL#})TlSJKU;-|aB4t5}*TSL4H6l%uXmA`f{pNFu<-1ai^7{{Q{JKRg5@+O82rIPBI?ij=7wrqY9}tSHq}MvC*$bRT@@q21U8H`?Df7W@~OH zeY8uBna8!RR_+#WP}f!CrQ^E5HFq1-OV&%`durLu$@Wx^OmDPD8{umnE(&xnIZAf5 zUICcm;y;FAzp#C#6k#iPG&%f$6N58yI(g-l(B`0upEOT@g|18|MW1x4;b}vjoM=tV^Upl zWc0ZYem)FNii*}aw_n-9HqbWR4P3r>EuwXo?Xf$;OE=PRR)gv8J|g*BiWV1JGkA{P z!7-9rkh&?6Ry@NJeAchZZ0;A|uW_gr|g&%t6Og$<4Ge^^_x4s zZF5!JJ1mL(KsSE|rwi!|x11o64d;OuL7Kgc(P2AmqD z$=WO7h*=q*Y*3+2jnwqIdMoMDbe`A({ba}GCdNi z6cV~Q5+)K-Ry|VA6jI(fQUMY&5j`@A6f&7PG6fQHWj%7W6mrcu^4la726_}GDHIlS z6gDK34tkU>*Dt~xrT0%MiuY5fg662gNT?Bd)X^!_adXs3Bs6JyG+8M$dH+s|;=5Uf z&wAj`dQ4Y}Ob01U7Y9t=gun-lOl!857zt2Dl^aSDa>J$%C0TU zPB70v-$e5um6?;2BZQPAvWdgxD|2)cSB@{I|5pwJQf`bUMt=fOoiE9yB4!;a$g!V^ z63(j#ptxk_%j##E1oP!FGYrn-U)tde3Nz?3^ELbO*3a_}r1Fk5@u9{@d7q^6ei7z7 zPUW2<!Zkc%dyc*q0y6OCRFH6fE=UXYaXlX6cJ_D++G^b-tB6AcrQ^e2<{I~43XlvD&u z)u-_mqzRAs$vv9qQ3PY2Kp0}cn4iY7-}N#s?HrL z4?S1!F}&%gsF8D|{N_k)(@-U(Mf+j;?T!|mzPmbBhFZtz*!hN9Gk0|tpWgzCDX66@ zaJMM2rz;SULom`IXA3lrXo`!9if3d7U()2!L>0a@OKpfO*Xc7@Qy4oK8M|Z{yDu7h zQ<(S}ncUAX30gD>qcBAnnMP-r#x0sAQJAF}nPn}Sm}D5CydTpz!p$l(%pWb9*Hc(D z8(FkvSadB~EDo3zJy|L}wwPG7obiWr=wnRw(}ym~XOV;Q?Tn|;Kp=ACZxE~PVyifR zEU#jD2Y}U&6h(2knea0w;C}0MKN= z>BXWZ0o?97MG<2OZ)x4uli%+4b&8_6@qrlCb45z|Oy>j$hJC*R^F18&>!@scV7?mSG=rJg6~+v*nRMw8jLcS${jz;g z8wL!*kZ$WXRdP4dB+PEfHe`8mso3?K39OpJ#>viRae&?tg0ThhDafLGjz;}@nb;8a zU~`#9dKquBAD>n+sNf`Uk=cF<4S^DCCvnSkgriYUGDBC7`J2&PNF>7^Q7P|z3(gV` z!=jGHX$`x|TEJ}r`%%Ya7&$vE(uk5#+h5;6G=3&ia)>QauB5^v2{H(T@Py1y>|T(< zJ)+lNl`7j~{g>!pij#SPF(=^;1SOzdD*r~R(4>1&3yXLS0MMma%!gBp``OVHk7+0e zB!-0f4i^%EG^Pjs?skj#w~9eAwtiXHBfV{&tKY2Hm+7~%{X0q!8^=5r{rHRlsrC9e z*`_|;#T3LVc(<;%N5C}Cnd!DxJg&r2g-$Scv*&)&5xNudg`?>=6PW0 zl@#`{T~$wiskX5-g_%f^bPc>yo_(Rl>R!#0PV4$yyDnNO5jb|gbZxUahA~x5hxDWC zTAGby^+=|pM8>-EqOIVf$YFyA;rryF2QTaS6B|03@j!pz^VVh~%EbFs2tW@iuxF&C zCuh}SbCrIltM}@x_wb^pF{AG)s1#@^f81q(M-SQ!s^&!_+NT4(?zG&LVZeXY`$A>_ z>(ZR&)c_~$Ac{SIuy1it$g(v0CGUjHkh!A#!s@dr8XCb19;sk+BMOi`8jXrYj}tuz zns4SwKjJT2?~ueJ8^hI!+lMLp(VqikPDb3of4gK5!{q%9#%td}1Ysh^4;`}20 zv#D2uuQqrXdZydJ%@|jvn{*&^=3$h@;s|_^?A2ld#gfz2ko#J`x7=)w|FX;4@(j%i zg_T)!&oatAq~S^M)3hGHz=9=)s};W<(~OXpav}8j1@mDPt8yXT+69CCU+3dOsvQ)E zrxYnW-aPV3s%3^m;&+i?orx!tuRc42Zye)~RSfrCuxCxd}7v8vm>vyqI$LEpC# zg?mgryK6zFX^h)+JuA_zCe7KA{xXKg}@~Y!`l9`~LCZdww8&atOteLn!@x{@!unr-|M#SD`KS!uvt~`$n(# z85n48rF{^a_&|dyatgwmFTqQEZR~!+b7ppUD32|TI^?!Gq6_{yS9mDLbmZ=TtkHKg zb#`nJX5!L!Jitg(+Ko4FM-=?p7{1!1i8^s8GWPE~v3vCmZgc9MaoYEKnOEVN08_Jm z$Qi1mVUO(G0YaI}j2WwNF%x(Z{pMo8=5nN{bUW?B)nB1c;j%pJY@+XCL*Xh*VNB$r zXSHy7BIFVcJ))mF=sAsSfS>`s*_6G9qWF!{r$};M^X`~C^ujT~o>8Tk*%yq&kCQP5 z(uAB)x8UE+JA-jVDkgJ8-CuSiMJlHYWP{ExaYUp`d%S#dSE!X8&!pg0XRv?{L)EmI zx&XRj@y9sRO`C3Kycz9R-@F(5uX|QemMPowQ~5GM-|?kecIV18^QK>r`&m^t4nK?8 
za7q$!vE;4RnS3ec@uFq!W$&8Aw+}1L?m!~z>SLeR!%2s#QI+}~+t;#0@NRzH{@Sw; z7aN9<^Uv>>nM0R@rF+hgKdyF#5h;A1dj6)5WmUg@LQ;0;L;bKoa_HsRH+mQtdxZ(^ zwuZ`j6NbM$@TcI3DGX*wRJTWR6vnVXk36$eO(~n&*9`>* zp+)hczR=ACH*E@Kcj*V)ZnIwRFfWa z%CZi{smRiG)^N^oE<`%#y46tFXM6SHxa9l#itps!dyRA{eDE2^EYI&d4zxHN-JG*1 ziZK>i8f(4xt~gO1*R_m|vJ+aKX&viY;o-&YT3Hy1``#tp?Uq}0MPaAQ!;-pjw?}p3 zqC7QC)3@9oOVpgZ)pdTBhSs)z$Mt9!tc&EWA7+fJP3t9`@MxaKwD4$}*U{t;o{&ey zdA7bZr}cdDIux(YW!X#9t7E6)g=hQwx{12mn}vAZ-Cy5ldv%?>j`P<1GCkql_x-i8 zcP~1T<)>a0W4zCx4u+P`5a9>6hC$NX`GP}4Ht~&>BnFdrM)u9~?~JmL;f9ZJ6j=&Y zver+2eMt8Z-*56s_$%Kjan7@*$($V$MUaQ3qsEMx>0Lz-xrcHJ28n5LbF=6heE;WP zw_mlCn*jh25~@aK0N{qHC;$N55=vpWBhnRXzO6ZRZ?#gdptaP*rN@88#)MoPpry%% zz62d1cs%Y!q;neN0}S1LWp42_YR&dyO-p5U>q_EnYn@fVPA-*y`*skIb=z`M9O~*& z5~ln4Y&+b0`oV`h|3c})Vv*01ukC@viYk=0{P#Y#*nfVoKR9IG^rd;)T5q#-^|OTF z)Ye+pK^3|}@bTc#jgZ_?gR;vt^6fQ$k_58#`zMw9-XZ6C4&J>vtGAzDZBj<`KtKSt z2FTGT_t@U^??ltRS5A8&q{yOAvJkY!$j4Y`kMaQmi+v~!K{vGE;tXtR5@0A1fN>PK z?KlU)v*!sXo!tYFV#1rPMDp?W=zPa$v3mu zmz~c$N7($muWjqoa)Ar>2`k)YlV=y@LaS7hc07JMJ+c)dV;Pf9w-&Ugf-A&|D5hN9 znl%@@D8gesS zRcSmTpHHbt_j@K=tr?v$pD|9(#U4_v?Ml9o^ZqWUSWoqB=JbUE+(iz#t7^R`qR-1> ztE$vIdHRKGo>!$_*G(Vu*oX5hR*eHdL`^j&-y-mdN5PLAwP#2nx@Gk>Vz(Os1Ek}) zCiD5n!tReheS0H&FuRpSr+Dxp$zSrnH_}rGEy4bNLJ3HQMxoI|JqY!6KtGC@$=J{Bc)IIz8xUpgeag2;*oJFyR zIHpMMV{f4aW8xU`wx^%QSU^h4Z+Nl<0@yX~6iuXUma;#dcznZ9+7M;B>n6dO_jsJL z%6)C$I6|>xtuFA+Tf)5u5>k!O`k)`#yW{tJj~>^De2IH=`Tl<2<)`{E6y7=j?|NkP zMgyEEejS9@PMXoLjE_6?^6=IQ$7jH4RfYwW9gZ|A^hQ|&2q@nm*V7$Ol70f0`3~Y;yH*idgBODTo zzs;rDF{01!Js zDKDoR2(JQ9`xN>WeCghjMcB^RTMyAv3RyDno_m6c@zog#T>Vj_EM@4(mU0r(xmz~p z^JmYGD)n=1WIuwEje1tiQL88@&vV646fE`J1;sjTOZe3vAA8ld|6JSt{DHAkhAFAe*m*h z@oTk*kKd}u%es5b3kzPF^%kNBi6>o+3ErfO7O%g_Yv1qtyP4|bvd7mPq;3@_?2x#CE5p}?}9cDa!wn?K6`s(b$2rhv^QdS(I$PN-73-RP~?&S z6t^n#mNZMM%x?N1e`s|_EH8a zbxzm6E<|p2i>kD>%thb$dOPfD*YL~Pb0$XrqCU>o7D<6G*gxH^ohbMSooQcOTR2ur zV))3>cl3HM@t_sq{3iT%`xGAP{m@9r=lI)~t0cE`yP3AW1WQP&a}k}k3`Kv6Q|J-_ ze>pWp^d6Mmj%`&_pdEAQ{hDkYvh%6#-nabTLo*3aJ9~j5!#BN0-C>=(uBgL?=)5f! zeYpdP`qQq7?{BLlWcMrUE?;Y+L^{Jl_q9JiIJql#(TnOn9_tNaGYY8XrbQjid^y=o ztLloRIXzdM4mmcoxpHrl`+PgX;oxByijuQ0b>&Ma`(WruCCdB6B-DL9?6ML@mL8_r zWQGQ10dR$4XR{Hs!>D55(g(0x_HdeAI1_d_GgmmPdN?~YoHHhzyDFS_G@O4oTmTy( z#Dx%1M~FcY5-|v=s_XD4gxoGd0XssGD?(X4LKPaJ789XS6`?sAp}iY%8#_{uE7Cwc z(g+%95))}w6=^XVX|)?^gB@kZ73H8F#|Y3A-^#*vJ$vWSTlM1B%RwLFQB; z^G1;cyT~H!*b=VThw8EA(AdhD*y^g-N29TicTutR*l~?qan0&+PoQyaF>xJLab2Tv zJ-cyz*zp5g@k8qI&!F)mG4W$n@e`x*Q@imq*a>r72@C28i_nDSn1mNq39F+CYr6?= zuoE}95;xTox1oveViNbN5r76f%tz3fB}WWC~4n3f))=!(Iv#PAW5ZDyv2+ zyK5>ZGL^eJm3J(ae=k)4CryYuO++J2%r#8{nI=`8CNq{Mx0j}Xldi~}uB>t0Ye`o_ zrfXEEYmTLB@1@_y$_a?LP7zRxhL&afEE__6ValWE7D>7bG6QxlMUz2MrdS5x@Jcsvys)=abwvD zd)Y}iIVs#ZX&N~ht~puAoSf>Mys?~uy__PP+!F5GhZ?!%uDO-S-0JGwM`O8<_j2oT z@*27Gnl|x1uv=#R>um~_6pwM6mD=AZfX>6yB5Af7VcFSei$qKv{(2Ar|4iW5e*H% zdZ?0q5qrw#~-pkehdCspX|GaKi*FF z|0(-pe|hx(=Ra24d^X1mG~4Rdx&jX0uT8Yozv+Vk@ffw*8#abw$at+M+Z#7Wl36ti zwK|%%C$dG{Ur%;4zndvmj$_pBY}s3=GPpi+cRu;B+~6=#sNL22X|>(&!|SQ8wl8mf z9I$n|+YdHJ(|B#B{{;VW{C_+DL9Ttxe~|s)KgfRY9}BoBtsEXZ(lyZ}K1g|BV0W|6TqA{5Sq%?_c9TJpKg# zk@4U1A1D6{{0Hkl@E;q0nE#;vcleK^KgoaS|IB|3{ollYz@n`GIsY;6oBT)rzr=qK z|KL9){?q&i$uIej_FwQH#J|gb%>T@P#Qcr_NX7U~{sT+M*=j|3Us^{x|rK)<4gG z%>8NpgXB-}A9H_%|2V||ng95T|Ihi4jsF?`gZKyk(ep3xA3gsa|6%(x|IzbT{-b9$ zv+0`unEfmNQS*cU==>%BQTzw^kA1wObG)DVk8}M0mj77zHUFXcYyM;D-{(KB13Z7? 
zU4C=O?N4&_hAe<5UW!+^>!)Msuemd7KmyazK8?2_C+V6^&#{G7+wve1Gsub)ZPMV+ zfZta=QGTb#vIrJe+_0l9^W-V^RZaXb!x^|$5!j!nvoaBX3|OGJC-Q4M@FanK+#JqP ziT?4K6IZU>uM85{xt{ONQLhX$=Ypnlgs+8%zuI}3k4f)S85z$Nm0lUev*MFq8C}Pf z`<652M_5dUomh95)L>=Y$_@iQSNspIc#aaL{IR3TCt>GYNj!|9i>RF>N^WN29^rJY zn_8WCcu~G@MY6HIrYBeY^QyEh-mTf5i2m9ZyP`M=Gdr^eBtRh&{)bJt0{xD>giKHg>PYP5=H;;51&M-o~-qKt)e_L0l< z&2AVPlC^Q7dv2QJQzQX9mUf2CIN?h7v=Lh0TJw2D&N0(iQ4tMZ@p8S6OAEZkSKN%; zyjG(%=ev7nvAn-q_AJzb?IC>Kq&QQx9A25c-iZ!gc9G`@;lQM4q_aVq>Fab$syc%`41*VdL^DY3JYPD1u%b?w5^ zwc2S_<)<+_KojE}e5fUi9t<1rL{)D|97~0Rb^5nOqYfJ%U{hORhMOuw>6&xeB(urH z*D4s?06SNsyf5NVG>0Ro9`cCMfhdz7MhT=l>7sA{Ar#p~hUILDRt>Hkt(<}hn-!+C zBQkG7q#SA~eaFq!`1*MGe+#{>>o=~uQpD5d;NC}dQKa*}r ztq*ctokC+c9z@*8uwlX!xp}yx+T2vF(zI0kbab`Ha`2CLujG42!F8CjVX6l!ujmj) z_hWTi@7K$hnk|=s_~XEqT8Er!J*%t_S&QIm<)yP*VUmf_46VL1{oXORn8VjC3 z7sPsbc!8rshKxmU0NGaG?zSqhKi&K~I)VModRd#Jg_xI1wyO(Al^t*1t1=b`rNXZJ zRvqZ#a{_w3f2gmj=%g-ir(3rVf-iO`5;>u|Iq=|XKc+aR50)8mm%bz|i+p`uO5S$h zV!_vgKa^>+04MwapR=mn@jbS~19EFZ*{(vo?R)r+Ud@g|sT0g7b>KD(%fcVi%O8WP z(vE2kRknAXWX_IpF;AFsOqj_;@Eju~k!iON#$-e5N2Xz;%?RnG*f<3BT7%g9({5av`v-o4IyY2{MO1Z=Xydeq_;?Gz%`PxktSk^Cn8Aa zCU`51s;!gf!-NGh6BZ*y0)VazWAylD;t&~8To*; zn4*bFw0f~JKt#f^jqb|GDX=~+S0!8h^{txn_V&>yQJFZ`7!!>TpVOpXB z=ST2Gs59Gk80%^nVXYF$Z3yA?DOq6m6uHt$_<@?^JrYAqSUF764mgk9&+Bt!gp=`w za^P{0&>V4*YKWU@g6HE)rjWNGy(*xJ=^XjgbB_IJcdjtYL+@ln4!4WsNYZ4E2VK;5G79$^fjT{nvUcR7m}hp-h$4tj%f3bIT7N*aQuk zUaXj0s!FGBP7>q~MM+}&CCD*1SHt8maEM{cQ>hYKS|_r*O}Gr-qgVU1D_~{Y&q`zZ zD_ynan_=Q-A%#}bK`GiLy>1|`m%$<0@;s;d@zC}b1^He-bw{!1mXYIT9)n?% zg-tLa5}}$dkwOIwwYqV(6m_|#vzbP?50Z_|WsLOPRl0_|ika>VCHQSWJ2%86ZtHI` z6Je+F11Sd+IJraUP3=ZQ2=@C1$<~E}mOL}>-anLVxTDP93X6X4Q^o19$oWGA_AAQq zZoZ@u=WW@;W20q*HhObc`P8-K7#ZS#V3C|^l0q30HR7qCDQo==&GkR=m;0#)74D-S zOs8*~TTw7E4|2yP>`q55c?DC+6C=>mU!wH63}O$uD`4;{64tx&-o0PfqdNELRMq6Q zYHRZdM{DSQtK2t`I~>Ss*peph;^m>xx`MNJUWB<(hEqOudb-=U`v7ImhpNcryy%j% z;4`+7ok!>yG}?N5-4YaKPWiEzPy|PG+;1tAKpgz zcBp;)V27zpz>=h2TDclCkW0=V9lP39i4#v-aCU4zS==ujL?`8Sb_KCfOJY+`7xN{x z)kx!g{K?~*lta^5wwzr(*u^?WP52*V8e2LR3x`4s9@4b*l#UNaiC%wX8iOk1kA4&= z-f^!25$6-mFD4J2vqbD~Hx6fHgU(|qZ+nf_=d=oXujz7jzda5zW7|!;h??$id~<$& z>d<658~01(P4X-b6G?#eK6CcF+7)sc_wu8Oh{`bzWvlar>l6Hn=QDp~n!Izfw)kJ> zbDzc_1>!XyejS_F>exJ3Zi=Fk2&DHtNkAywP!6?8sqIWgIZlaxHIXlw8C)qo;GL|o z9l7gc`-nf&6xShFlwC(37;t^MJJ*ELqbEs}bqHudF(Clh^ofZ4T=!Rv7!*$vyKoqk z1v(R79Updq_}joeg!h*cdx#(GyXNVDhc?p0@2;Id%9&>U@!V&pxM@?FIY&JxTIV*% z0KMl@(f}i>h9gW|-&HS{^3B93lZ8WMDETj4NOy*y*GYsb)7v)E5x$R0$s+<3wW^)u zy%UPrv>sFCI6>Ki8@#`guP`WL#H%Mqr2k2#c|YQis$mfGTwm8QpZrK2+GKmS-bHMv z>uakGy3S3V)ZlL?pVBlronlg`$?C4rDnH_k(wSX_c8Y86zm$}||6gSqX(?>O91+{f z)vCu&6IsZ99oYxVlU0e?r9b&6nZ`@uqom*0g6YEl?Z0XF8~PY? 
z_TP=k(H(r4G0!>pv>ZtHvw1T==V$9d2i;-Y`AW`V$FJXXM+j`D+@o$HdHUmC3X9z1 zeug0WlfiohxhKQ?o%E-p_rK?!etCeRKO0wJ$~&9Xl4m%dHnPY&pM4g@a4~ONkaw}@ zJU2n7_eaF}_QpGh(NA&>i;z23;&;4}Bw0SLF!?=o_c%9(5 zf>fZiIfQb1Y>qa6t;`iOXXo}0iQ{)UK)pv0rTRu_OTGYAMIjYP;hm_+Bt99zs|F-Y zh%7kw@Rrp+BMH{KEg+u+6M_7Cm@s7X$(2`0oLq|tS9A+7*i046C%S!deXwwUS#=TV zyRz|agfYj?btS92S2h&@e7Jz&;MbV&ok#ji=|yF=+E4OvP2la&KC`y!8yA|YvN z@yy-q(Z8=&DWLCE67IkeXdTBeekVaB8yu&&&&a>zifs)$4D0Vf3YQk*>FDbEuyN-T zedKzWas~ytVBu244iK!fsNJItSVhtKK5zd84U5Oxk7SV;#8z#^qSC)&WEQ-JX!~I$ z(zRhq7fzC_qj{VN&2^qlJv=8E3lSZ@Dt!yHtP5cx<#*mFu2Nl`_}bzOPKXMVUxcCr zn#Xwq#?mxK^EmWJ#9w+X*~*RvOOz+s!rQvpKrC|HXASqVmTZ_+PKM$EBE$(`ES8ZY z<-$yT(B)u+h)IivPpx9;D=_1OX>VEe;B`)qTgus!u?;?4Us*_F}R;kSGpyxKuO3Tv7c*iO|5q+UuoLkuUK$3JSE4bIM zABN2t+?5nS-$V9!eeeMZ2O5Yx5bvV6aqDqK^n7Wh~xkAxFw zM?Mc+JByHV-;R2eox%pg;+7@G)6bdoV7oiw;UCnSDxFPZn+Jz+Azo`Fq`NSEK=wV5 zdTkgqZcULZi6g%A>JEZxKVvvY8pw@O~OF; z6q{43vCB{cE?)kC(ZNV&#CFXP#HYm%nDUktQfqALzdd-!>fff}q@1L$i+n)rBU^i; z^eJpf%_DAmI!?i;+HeesL(y!u{uYBVRMuRh??zq2uO7A`TS2Y>dWelf1=k;acoy$X zSa%`X^e&c-heVwOwSGS%_v~oPiD9pG=#^`xKHxwV360T^O=R;nEaD=%g_tsrGf{OY z5tKy`DY?O!J{u{kO5rDHOhf4F4b1j%rNifhhjIDAmXO0p1Wq-SB-k%z!#@}Va*{|mCPir zd@V!Nv#&#@8bH2x^&y)?8*;>W1g<_(Wp-LF{_vJnX4E`WneKHu3{E6)aQ%}w(aI6uBZ9Wpjd+*z?_ zCuU{RW&?TUIPMcH$b8e6QaW+BhkLIs!Ymu=0D0m=3S<^Cc6oV~Xd$`?GFoKV@7#+>-d3^kxvVu6NnjL8>zem6kx?j6m-Fg7_0A zv(dJ6Mx3WRw!s4z3bo|j#Ar32Or)$(PfoEAy%z|p^y~fcQNo@Zc)f7E_b)Cw3Px`Cfde`S)>>0@?`f1jDFng=Xk#H1Bc}# z%N;LA0E7TcR}WQv$Y^^H2Y9h^8ylU;KU)Lc1I{U|y)@%9dwX{VhA zdJQ8^!lD6TDHZ$26GH@#V9fxq77H1$k*&4@-@mHvstAl%1wPV*lx%~@t+MGGNyPTv z)fIlsKAe#bCf9;|1y9hm4q!bHN+D-~q^O_7$KU-PAyNkBO6vImAXFm95TpYIYWg7r z`GYaDjx>6E)%+34gPWSGh7_>7gFNyaRsh=psbssRK}Dp+Kzie@_sww$PuCcm)fjh= zx!UU3j`Y~iUNi0MRnv|$okQ;3G_vd0vtM;FKT~8TAzI*2-j|PctBw&&^Zl-7!FooK=Ut0Eg-)ARPp(H%#_4eRr>n zle&N0Cjg%_WYShT9yex^%U9?<#(__zB}>>Ox4kMkab#$IGLH)m)C*@kQXCHmXb^(2_jKSYC$=@FKJ3iu#Pj~PvKRYwHyEJ+ig+TO6v!<<0 zgw*cyr2Sq{XOboG<5(kM^iZnSjLG+kK_)|A1bG4> zzIxUq;8s(Jb{GMRhl^>svyq^ONhnX?24WOh$)U`bOZO&sqEoBG2Tk!mT$5JJ?Fd)Cdc#4i>m*CH_S4)xjl zZ2c&rG63bBmFO>cd;=!w+eLme@_lavY>L z@F+X;s$Z{)mwlCJ*^c5HBx4p$xq2U8{6W58BP_z$Q zf~5`u6YT~i&`eI7CZz;MCI*ix?hdGY;Jkmp0QF!8`+FplF&@@85u^H#4+W+e4MghX zg3Veyqh-6FeEwLj$jWkS^|aCG*&LdsnR)h|!hGGxd^^p2Z^rzP!s67(;xf(RcE$ol zVToyMiJNXoFl$LdX+>siMU{?LXRMegpR*Z1=S+XjJNsOa(puEmS|Z(AX4YDs(niVH z=3%vm=gnEWnOUc@3un}VxXX6B{UxQ#Zo0$4C+E3z7yL_?TVt2q zi&s^%49%ZtC`}xDW?g9JTv)u(ER7?Zx7&S**N`UH!&$c?FS{T`EDdkS9UMo)1j`QX z$}$;nH74!4K`18hLyBn|KCE_zZY^ zjx>1`Q3Z5*`wz|e>|F*-W%$iCd3>YtU-$Oe_71XY3QF_#CNRaiWAVSf^vt^qO7Zr4 zB*Bo~<+t4A!5|sPmKk_D2W!s=M$Jfs;F<;ym*`caq$(p|wJ^C2Y7ekRSqD#+kR zrk+x!0oIZc4>LSeEO9Or7-B7PqUZlOInm9LXeTGt2d7Pe;Q|qbc5-q}Bij*Cc`Q+B zrjb$ekxBDWS*B5O3jg5bAn@p+3=IgpMQVa+EAqwWxQAAeo6WHd(D*pClf#lQV@iB7 zPqStkKa&}^OdWrk8GqUw`&BY-&NTkJY2tQs{I{$4Q=f!eQ-GWV(eEs8pJYZ<0GAb(1>no>YQ!OPobk@?)Ar zcVNo>mN@^Gq#&Aj6pe3&{NpSm-%K;9%xmiOqy;=lnsi60?AKYT1=o@5rcr%9v0bK7 zRr66rrqOBhG&Wt)Gr*X%`P>-O$h@n_6rYec^PCzc3^xk}D4IgdCxy7#g#?R*B(z0j zPl~9ri|7`Mm}raHo)mLt7xOL_3(}T|W*4C`8(*}O11(W{Qu;8v^wDCe4sDsilQNU+ zvKe@Z3@wLTYuU@}a!2WCqYUg(1U+qbr08R;T$aKyC|2Qo!6l$#)3SoI71zx&@+G3Y zfVR5irpSV}{8?~88Ewt`C&h7#gcpUl%CIOJX}sJg#DUV45jO=Rtu;vsSfjAwjO;Wb zXw7zZUC|-|YPPW2daZgdyZ-j3AWoWU8cMnNq#+50{Q?%%o{j$kK|{}iXIh9G2xaJB ztbqIFT_Wg1vny{~YVUoccSB-%-MsU@p=w29XDYm#=>oeUvD=vGrr~s@^Mo&8RqdAe zhBCMpNNhJiskJ{&I)ES(Q6G`h%phHV{sb%9tmXY;<)$S`umaZfJf25OVsVl^PV?nB>CJ;BodMxe}0)0@-QwHO$eWh!P#Hy8Pzm=jP zEb>OV&>`}QYV-UnzkVx_`CKq}OQo??*{;yGDNSSk4NuXGVLqEK9fmD(i}gFFR^|>n zSb-s(`NNo12Mhhsuk30X*?Y5=?^Ec9RTe94S*p-f4tkM$1JEj!`-jT$TLPOZ^ad@_ 
z%zh&nMG(>h`=mbi%&gXob&XCeL2%6K{!lrqjoz)^@4Fh_-VHuniVZ`bGH(YyhN>vI6PvL%W5ugHf_$&?j{ljqwer2D@4k@z*m zD!rM8e53a=BXhIF8HK(jTEz-o>d;ym;FTG7StMvsz{2+1qAV^W zqm~yK7AptcEXf0skiT&>^2RwrV>!^Rm-$-* ztE+m9+j??TThFkxVe#&aJ4}8J-}Abbz44xRY^9iQxybG0eAIU?WozcL<7nIFvMAADOm7_-_R`F*gR{}Z*ha{b-sJm9Qr9F^fX8yi zNl520ja+S`UkqBy`2?U5_`VrqQJo-voe^{epT91ly-6#$jl{-#&SC3X#m|>RF8jvXk{aX zin_xHAr7R#*Q&Lz$+=nIizzFMz_IB3?(im&J&|O*@t-qaucHu&d}h+2T#Cg*Db)Af zJfc!I2Q%;S7KiSPJ|4(bru&(?`@Vd*NF|3+c5&*t76h6l-462P=raZ9T;GDkYv&tn zMzSBoh`(L@==81i4S_`6Qj5plVtb54{Yrbl@+8b_}I{H`tYJ^OPt zWLNkAOMa1$@R$AsHXX{R*KRMy_@$K6H6=Zs*OWrDFEo8}@5!dCP+KkESg(6j$bU7A zqw#(fUav=%|LpeSXwuxLM@}GE_51F0jag^>gU=TSOP~FN_IO_GAE$~cl6O4aAdGBG zWMOleXu~_(ow0u+Sp3WXYH(vpCy2@aXP0+!(77}GVj=f+{*K<(6m!d6zTbA&I zXJIn~#h+fDx&h^2i1k{DdYSL{b7af zDAE=II~N_f9Nv#z=z-`weQiZ0vqauY8l&)ffbvc5>p{8{r+0>M8pu#Ej|7XA|=RzC_}S978XawIkliT8}t1^vaMif^9s zCkUUQ-1Pu*ycQzTYx))^&v>o)PSad!F+=Iq!4_B#)zPe8@Ukv(C&@GaW2;5Q`v@xO zt`B`vF$Ij$+hw=YmQ)QRjFZ|QdHE#Q`k4{}PCjF_-dYkX@~slS6BF$@8go3C4W2Y` z`E|}=%k=AP=G)I-l1N^Uf~#eJdFG;SqX+hla|H>j7wZ{8@2+e3v4Vn59uhz{@m*nP z+{7z0qLaU`^eHy?w;j^Tw1QC>{D-Th4f9n_wq~L^P9qX5WJ*n>KOVv?mgoDTkeCx6 zivwj{fG0JN7)ldz=G0t;8sjBV_VdDU_KQe@@RUC?gF~pY1__1UM-;{iI9>6sNyhJl zu}fP=i4qn;k}7+*4U;%bVxLpG+*k5^TuiUKgbLwP{nSA!-56_O^PKEhs9z-i_45r; z8j_GdGL0Tg$*`oPL!4N4`gIKF_4*8!`XC;yNMgtms|aP(pz?23IdXS42+M-nLw$Xh zSD)7z<8Z{)q2$Yg_b(%59eWS*CikkSt#A+@<>PR+fo>Aw1@ZTn?{r^*^; zAot8aGEG9%==}ak#;8-dXbah>jfQ*nG+L&qZz;ExL(4S#<ttlOhp_Mj@?A$zJ2c5HZ%vORcbm{8vyO5OEo;Jg`WH{sE?khl@_cv zJg7G5v^IJh?OCg}yle6`+PF0|t=7r8=FwR8Z1b>DB&%!9v%?>T&0jq01F%P)9E0W& zzYXf&@H;(4@yz$~%;tuo*BaZS=lhaHNk|g`bU5x&!!MvhG>Hn>pM=Q>)c}~uAMLD} zVirqPKYdso{gYu1?P`Dm05G^E4gdz+=wi=68~{uoO-z-0HOoHZcYVA2WPX^k>C+78 z6;NR@IuZG?^Ff_r*dx@^e*SdxY~BkuYhG$`q03Lx&NwQcj+cITQ5DY5)cLawFQ2oX zeja(t@9XZr@EX9{eqt+V@}0o4s=ukTuhhk7U~g#&Lji-l3*qSg$TA4+Qj@>fhOzN! 
zVmhj{U1$k;22I3nD^mBI<+_IHER7MQG^f$#<45Gkq-;-K;H}*Qc$&!Skd^ohj2Q~Y z<1aiP0VsAI0APk2Gcq5%)G=5X6}X(p<^SpNEp2J*RTEaD;btjpZDH{UyzUGz!r?~?uTjGZ;PoQNstZ^xORQ?Yt+b<9uX}xkNqa z*2-R0Q^fbzk&>3lk|4?iZ2ac9hRFdX(s_$gu00?5Zj%Us%PbuR9)3lpogShZ2NQy} zB>0ZEQ9;<1_vnKIfRRC%YSK76Fam2v{7Tr>_8Z|w?3Odw>7|;`Dg{|3?57Zn(hk2- z;g3wi85#UXrWp+8-w77P3lZiDLCZAv?Ls6XL!>Iv&U4(aJgoh-rDk9>|PDBu1BseJ|L@hEbGr}J)GKxBq)iN?} zC(=7IGMX!js3$VPE-H)~Eww}j`$Xm0MPyY*#q30t*hO$MV{n48itr*I03dt{sOS>A z=*r6IpuuQXWORBnj$;EJ!%ld+DT6{6wI(h^1?thGMlSX;cHqi;3K;9wb{p5|2vc6&L;u1%Rsmge|D9Kw;l(nN}y_C z1uuY4Gg&F96Iq~9ZC9_EhDb3V$C1+{;B_T&(!>#QCvg@h@kAw%EhJM$B}qiF3TJ_N z7m@&vlL7M#jNFL|RSeQu(xQ)3xvJ>YuH!X)Q;G3Yv|Ca%qf(4o67{c9se%hBtP05l zkYqG!OL&~7^*Du2Jgg0x zX}anuFnjRptRz$WOl}Nx@E1+dFgW>Ua3*JfRx<*sGL)|~p7vy(U!@zp}zh9kVZo}siMZVc9I)ge*gOS$*ZIEs zaQZ<`#jJvep-}wRbi5zDPR+cO4uv$)RG2rs1fYD-VkpyMKKU>weRUD1vezJb*pBJgt?Aes={Wd@>XT#M&eKPo=Z>A1 zF`d^nowp;MzxFy&1QZA$9|BVof$fCAjYZ(sA_$g@fWZGVrJS+XqQ-7TkG(44Ua!7- zNk8*0bN7Ln`<8w6eGum7_B^2c`J0-FH%ifgUVnv@qxUNEqj{8qThu#`sQQ^GW2+eD zteD{W7?Z=8hjVde3-M;Q2^s;3AMz6;<`XUKle7YoTVj)Q7gB7TQ}v5eqmNRJUuT$v zXEZHl*tccEDzg&4WjT3fKMl+7O3Nuf&asNgZC%Q9_sf6&Ha`Vf;GR`jvr_04SoE^7 zDE*|^F{IQvwX|d9ADEoA^1#z_w|~TP{s$@NADNuf8vnnylvDhdrJTN+{|%*_jsKK# z{@0RntpBo78W8bZ^)iyOhnRy zJhg(~3)CmOGVEi#Uv5j^9^o$o2)|+x&bSi4)y2`Z#MMJZ zHydP^gnQD^Mv2hKfDr@tB?r6#_!hEcSYKfzHW*$rY^_gkjmY*e!csbIga{`5_=-%Z zwm%y%s)OIffP>UR6~)68KOt14@59&9-v*l%9DQ~sZ|kOOPNXP%Yw4@=glH<|trwzL zHW81LkVxdH-uVdhGXu+a9wSMJKn7H{CQ~*kS{}4omZlEg_0QpCCr;S{n^P7-x{R*I zNo30@WS_cV>Sab`iPUy~!i3%%=CL(V1xwdeuDz-)8RCap-r-Eq=L=p#$=I)z;2( zhig7bB(6EIEktnGK%DPHwFb`4`G#ljYP1Fz?WV`?+Q;~AIeNDC2=~~o>6BB)=bYkX)DmcORTJJ`h+mPS!PEKMhfB`?5Ll zr9|`5s)Du-E&fYFuIjT=r8FGpqf&UH$yb7VlH*wJgk@##N4()AHK-kM$_<#xnMzCx zA`v%XhtZtCO5{}C{eiwrk-u!&b$YvyNafep$#E=Yx~Vr4#7r+~4G*U5H%n)b?AjvJ zyqeA2!!uZKh!@zVOyA6o3S$ert=Ax$6?)?zBrI>pQcT>~C(JTuG?DKSH;Z7A1&R>x z@ks9%7IQO~@T1#AhKZ#>%sB{RjBd<*ilrLzW2!cqti+7agS3aVL}FnW}0w zS>_mNHUlY^DfsOqMoQmY+o}ver04-{5v2Pi;2jUGG#UnzNzhVYZ~)t~kxHwBF$c(H zmLay%kni)5Vn^`E+djqDOMFR7mCJ6}kd(SS=DqQn5z9GcRDdg%)q?8l8G5}crAzR+ z5COD-7~R$d_rnVrt_X~-509=iD1ixaw!RkszT3WnUs3608PlhLW7EJJ=kQ$=y25L< zLT2Ut;u4g;?xE}b{XUL`g=-=Pe0gD+DL@$~z;X;<*voc6b&B4GlF=sM+|%l%r)iS` zkl4#osk;1zVvYw=fF{Nyu&3_TKIzHq0M(A&)S85WCdUUp{aEkxBap~n|DISo!UH}< zV)&VGLgS&S+N{EEWa&)`HRFD9#Y)BbM1EI$HOCPsqCoVFCkA6^=q~g$&aaLZh~I-+rV?2 z14VUV!>Yj$2?C&AgB18rZ@4|sKz>Ahy0iAArcpM&G2XPzvH>4;wo4Y!NS5A+|J%6p z+&LIR^>?gIif>;mpvg-Wl3vKT{{S*?uC!f3@m7ehvDc>l4V{e_<&C%UjecJbP~c9% zSdS10sSIX-ShKsNckwdgq3yxL=gA?7`_OIJOf|HIvo%f|;(JK($prEh0pc3jhx}-z z`f+f4IVJW7xwp+Qatde@>-(-+c91%Mtxkacn+A-pfT?EeYOT+=#BsTcb**_~l6o*= zbU2BJJoqnWf_;63W`oBB3I1BBh7e1lf*v)>;_shP*dzFshE-cK_So(zt4+PtC?axN zbo-Y27o>eBKcdiLullLg)m-B7SCw|-P>{;RWtZ&qrvxPYDoAUA{6YdkKSP!#cS`L% zFJ{rr&A!rHNU`7gjbcgb@z!H`fS9NlNL5r<(gxIWi&Dxz(&064WJVr>T8$dp<2jGe zMcj#JR+4w6WV>g3P0T*HMVvVaWFN)+>t`P*{TFS3N6Oj7v-YB_^%&J(Z?paEK%h+^5y91ZNn#GH?CB* z%m`_F1v~%0OF6q1NvGk!2|3!&v-#JH#QPQJzuc&@YxLV$ng422>oy>J_ZJ0vzRqjx z#s8O-gQd`PQuI7r?nDQ7rc%>D^ov*-V$oVDd+=AJ;thME7^Q#%(zN;->z`7 zG;`}~O7+S`=*j24UX5Qbe+r`;bMqvuI9CFeJm1MzMAWoBig#TtZPI+VnbP=kJ@IPO z*_Xuq&hLO|5}H8oaiwR8n31aS`!Itf@*$Q1h9~Fnj%%&U)%n zec#$z)Y|fND^pkCk5~p7W4?^3kx%qBD-NL_4KG9Ne?PF!TiSlosk}QH@JjhaRenI$ zGD61Urc=a^ep|!&%oh$T(&ahEyC?Xfz#SYFxa z$qm!fTfX>HeyWIYbq|=e>w0jjtYuRJ9nVR+9>)!@d2bf4lg*+}#zjDTbn%@ZG+viw zPn+gG7ZMgieWFhcQ+#|sjg-E;L}kGOGYoZnj><0^2#Z_vw+PDd)vONJ+-Q`))sfa6Z*Km>s#|) zjH<1qsZr}mhvy>txGFbO-P5piPrRZPPP;)a;e8Cjr{twyWN6tBQCxqB+JYKK6o*Ge zHuI;ry12L}b)nR8St>+jvVgZRUz~=T`36_XeDclHy6Zx7lfvidVmo*_oe8lBAw^-J 
zuMnU1NxC$F$&go1Z^l^27^yHzE2hUi`fRx9A=po9Vy|g{%xoi(eri};-Mgzrf z_kdU%_^8Uu30FhXz^X(6rjZXQsO&VDRjvps1|j&i$;ISFyb*0sE)rfP&3}LGq{~mC zOXWdj$(A9<2DERU%{Ga>v%`HBaf$ z=$%u)dO=KDS&F`%{UsxL7h0iSyekRrAL1yprC)+0ik8(ZG`u!Iedjr+=0Wpmg5weQB67*NxX80*TXbm|_&`ym-K_Z@ zHF{%TGw?Asy?VIqXsPiM8W-C>c-W!Ix(t|5s8t%C)}wfc%5^;}$s}|qEBVP#u!f^t zxBO9BA)N8$h9%Smapane7MUDXpdR`whJ+}lO?r{bGp;Yz_cKVzb?UD1VWv6?QJu4R zV>F1_Yj#Lj!wnj6c*;^WtwsMh%`+<8h|f92K1{wc)l+Snx1gJ{#ZQLO*T8>N4JG>| z;2)ZU(?`i$u+c>%z<%^jxz!H-LvyaW^-e{e&WMTnx{=oZe`roYpTv&3CJD_krxw@1 zCTOPtc~`m#zl68Gm~)m*KG%CGdj$ZXqB=KVZ$Sd${EJN;UC99DF;Cb1z2N%@z)Cq2 z^hzy&I-Hsc8pi@jm?*jBk_Zw|8e#y{Vx*^3-V~tJZD%DQe{uQDvWK zW0~)ay%oAL%tnV{7XPBd1}^#X9?3$)ohG$|#>BWTKQZP5c1Rc114?rEC4k~1@4f#!dYo>}I7_*;27o5AOHKZYcTFi)3Fll)*3gForT|m`B0ZiBNXA0KcmSJR zXk<44?$56SaNXr=x$IVstHS^h2mn+uz1z&177)FvBT$Q+69A9{07SsR?Ex~KE)eVn z>Kr5rI8p}yXw)#@KLNC;0It9#Mx1WdL-jpRMsljWVKNyd0>w>NAb<`*HvTu_kZ`Zq zR0#t_lkO!@xIPw$ksF|GCn&4M)si*%clQ~zQ#kS&0{~6epqy-&LREtMHFX^&O2Z|< z`88K7%BuAzj^1(+YT5Oz$l&WjBGi_1AHI^ z+CmIWrV~bu1y8l!G4Ci>?XcR7A(zxbS&;#{qnRCnz9F%9euLdZP%yp@X-p?{s;9^Z zddb<3;-*tP&CopW(_|<|y_e_}Fz$V!4PbC>iiK4M4g(m7s`{dmsR66>FTN#VFnI z%|YvhDj`A}ALx$*Vfx`%jW-_}Me|TT01Wq`PW7>ljt_=x3Hw$;mH$L-DX`0Ey9pMr z4SU+f@6qH10O(+J?Z`2v4+Wo&^|T%qRU()JFtk*|(9_thR|A)UB*Y9t0O#~D ziz&f?TH9fGSg87LSR27w(|rjEV8cGUrV0njA>s^qj8%QqC~0!~nB?9Pk_<3;!_4Lb zu^9zy=DHzsr4Jf1LJ(}PFraepp`MVT;?jZ$2^Dw|hS~tipag(@2GhrB!uHf)XKJue z5a3h=9zX^_4+?Yj!wm=E%RrDh4v3cX1z^N&p~B=%ndAVs9B8VJhDiAs6CBJgXjCcJ zkuwXP{eTkNPKUS}!T4|#(&=k`hIdZzZS6+zcwn^}fo)AS>s1#d?z)f4s8T#Z7$Isa zK@+Z|_!3T6E;#Xn9mAvsHKS{A?$*^r=((4ZQKAhlVGTehX(I=6fWZ{S?JzI}5#SC1 zYpYT^Nn>c0C_Itm)dDyWn`yevz}}t_UsC4^7Z$Wj?t6$|S;JCJ#dM949&fF#aE!AO z1uYsg0h?5rqBx$V@KGYw$5ZHpU_SPd3x%l4%B557YG0N@_vYUDKam4Aj0gG~9toNq zt5F(+^e7@RWYXRKDg?ec<7K$uY*L1@xl}nD_QPSACuuz#Nc2s|64R zz%e>UAi(fi^X}?9tEFiw^G@?P%q}N@?MAGdY6nb4U06N+o6q3AQw;zT!&E-F%c%)l zHmS_hA}s|BQ*C!JY5-}B(O2@XAk9nE*JvU2u1f7_V4TKYE>tMF)3_nlO#-9XJxDuj z2r%ywOV*H3rwowp0#cD$#-^yHds&#L!E)IMe*s@tMSKn+nTduJB4v&-Vey^guqTOF zWLwrf6RPR=%40$wJa!hn0k??lx(0QNQx>TMH|C`?p*46yW3AN*cgk-t3dO?Dz&8Om znJZ!}^3!c~)18UB97XeRKJe|BXgX1V%a1u`j%lgQYPTZ_IhD9rZ`}MEwFK3o$$a8c z!-CBj_!I=FE`fg2`VY}*(e552Avzd&9UIo(1*uliL}ZNhcY@#{LAlVnparZhy4bB` z6A{6D({-MGM7orp>=MKV+yb@x72<7tmwY1Ta6sFDBSI38#=MpyhJr43XX$UKY$mf{ zmhUojYfiqfPdy30rhKyn2f z`}OyV3neo3H4nA1y=B<&AL~ELK zFzv-$lCfdBbki3yI`_lZge@HBrzr26Q19%-TllPXLhL`tGa0Y}gmj?CV{3tFOP8qk z-X>6Tw)e23Rt_zvy7x0-@)>O9mRo(~BN4X7g$>Xa8@C+tJR^19lO_a|2aCCLHrHIB{!OuR(6Gdj*idqdiMVu9mu>h)rNDN z`{h4J2k?gE=&-4f9G&a`b94|lnMsb0{{YF+xunSH_bMkrI)JTzkPc>&xxqf9iBfd^ z{R5}T5)5|=rao*0l+y`Yo-o#d{#*kJ8m|gaOPneNP=)I^g`-0RU7ZFrQj} zenoP0z+YkSfh{CQ2g6R$NsH-JCOJC0NI`J-2<`UIQp^%P<}S(6IeibL>TxVx6QXtb zpQGd1wB4*WGa`YRGg0^P!|37L$;vzCswseum<@_gY8vS`n$(G7E6is*65KApBuw!C z93220?Kf!^H+wnvAEXlzYYel)06IO4omUX?lo|-U&2F}-0@dr!H}U@{9ngQ2PISf$ zr7DD=K*cln43_cybEaORjjb1Kfa*E`{Jh(%r6qPhwI6*Wt4HY{r33i+ zKT4<2^K0V{`~`{9!NkG|roTPe^T3lNO2@#(SQKO2F8_-}>G*$4l>>NzonSpAN(U^5 zHsjy@kJ9nKr8RKde<2q67s<09)2BhH+_~g2*7fbfBoNd2kJ4dl;y%PrnheejXr3SI zIPTwpQO6+v68m(ao_m%s_GoWsV8Y4Hm@Xha8iUlsoc(c}VcGT??!F8(_^P~vW*PAq zL|>1aB1DgpCyRMSA@ElZV-UnbWl@}+sHkZc7UvDxrXjODm5Ml|kUf32CmbV2Tt+Sl zsUBu#uz~ve;`KtGG*2PYwY;gp2Z4tGUuOWKHulfj1~3WDfVkr@?hUK(bXr({D73Xi zqI8zpU4kbaUXhV`I05HI4{sQ?k|>?RuWy@ts`LP`WuISkF3fe9dm8C3JSNGvDa`0P zS0tvKFZzGz&H=l-!Uc?|TN-&3T*4SOfH~uVkbNN^k1H>r)9-EFhp)$xH~o*&S;_h^ zxwGR8=>o-hFi&eGy|^cM25}9sk4KYlf82JZW_imWb19gETbzpZt_4W}^29J=??3oQ#vG22QxwfbxwD9Vo9J5h}cI0bpS$iE;7QJ?(IJGngp+P z6K~p25Y`PJ$pAh&kC@Kuv$I{cshBNY=xJTHA4B$0FfKppJL+Y|Ux8|=b&DN4wV5FBMJm9&|kEE<1p 
zMxxFeB&BpEfrd<#3a5?6U4BcE3P+a-PUT1>oWiFF4x%_g(zK~O!GJ`7YM$inX1W;G z;ZMD4atMH9N|H2$=E!mnEz$oqcL#9CxVBTe{Jzo92_2FYHHy7ct6}!Zdq2W1z}4FU zt|??5M3BRPa=;f>NZNrz2gUR|Q@w^dV~!g?cz$&&1(`Y5v|0)U_$xqvj(0)!qRDJA ziw7~YO+j}ydeLS+>d!ZzkS_nd&|^Fo9}<9&lkdMwz^Bp$Ae^ii`%*OVC?T)6&S&argsBIXAFYzUrbYfSfw*34ORZ;zLXd z41lBxAe!j^=j+VNJi$KYo~#a+cJBq`s6HeEnu~w|##~?o2bvE*U~j-EHyl#9$eG9^ ziwK#TX2bxYYI#<&uDx+|6c9RJhXAZXF-U0-^Qe0T4;JzZ2c!Xe-+PU!@s#sl29sQ! zhYv}v4#IDB__2sp&D}HEkK@l2T}(-?&hAvPe#RBwbGa{bl{WQ$^pF0rI`)dArIRkX za4G(yn)Kqqm&T|_#izxuO$2#1F4C`KCpWVBCqZRtMXy8Uc?c`v96ZsJT#IFs$*sbX z@|7IE=VuwW^&ey$fB%&D_oLRBN>q3Q#!r9H1$siQguqg8gcN!`4`@Q*=~YgN_(@~w zG+hbXKo%K*zyEar-vb=xmIWOIBmW8Mo_9YjQq^E=2F5hI3Q&`u{USnaC zMgpNra61cqp&b<9<%2iKR2sVbDGD&2ZY&w-#F8b33Ub7paFc(g`zZ+FFXAcT1bF%c z6kP@o)a8L7ZDFDy>v--6xws5<9?YO=qbLN!1gUHje^u@MksVhzs`3i`$XvkLxA{j# zzwp<%l4K<{bcqSfw<<#deR?RpDiN&KY$z}(Hde*Gb&8;`^Vrtc6C{6pho^d@A;1BB zH2tVX!wUVuOw$GV@Jjp9sOoD?nzlz*JI^%AO{A)jdQTPp%2JuY9!HdniRCXC^j}Ml zw;C(({cJhb)uD5DS+3DoApk#GJM$UWh*qakPDdZ#W zCn%{x@zmV9jtcZ+%BuZ6K#nhAax;cTV2KVBRqhqlTX~BHFpszolTt1X(_rz_1{n%= z-!!b5KyKQr$v1z;x2iOA1yO@)vz5L>YQY$FU0N%_^!~v0EPWq{3wc1^O1HYYD`zem zJIQy%^@duUV5FhEQ@H#}?&DLRcixKCt~X0yxrrKWw?`rL$lc7KP6Rp2j)BTvoz>uj zokmWo1o2PR_2aUWpRJOwj%_!mT%D<^KTz3^6Q&P4cHB2pos{h3IsU7ZnyS#L)~An{ z>BkeB55LS-?;qEFn&jJg)zw**J@!}Q!hmG(+;%g*B$R2%Is8Je_xRq{nbK}b($VI6YK~opE2jZj6DDM}Nt&2Ae-!gSX z>&%T8)0@tMu0@xp9_J^jMb{2gMGl}WZI9&65v+`UG6V7}el%h^6z~r%%D9oblwZXq z+{eRD?Mhu~?ph~v^M6Xc=fSV}=^0_}uhJ_YQ4+?Z1kqSH$}=E$ci!7|^A5+->_ikN zy+D!1zxq15kMBDN8uSRClwY^0_RFkUGnTk?5gRMI;G=jdTITf_BNb%~ImEMC50Fss3VrXQI`RK>cl)Oov!*?3^AR8b0q*lz6ud>O?J8Zk7 zPk@vTg|l+6%S|{RyAB*hppyUE4)MJ(k9AuvvM=fx5+0T&c%whSI#0>YVk^v#4(sUZ z!*_23sHWo?bc^9Zp0M>6^J&uX_UR!bhA1EV@a~fwI<5&dzkoRHJz-eEA*FbgTJoQNRKu^3-F|qEODAq zJK14O$BL3C#2jlTTZ&ax`=bTxOpAr<6jPHJC7(}$3@a%wxC4j#)fm#rTplz~+aqdO zLZm4Jc0Vv1`UBqsN-3x=FMqOl5mCY`-FV*)vW_ivA;!Zm?`^gM21Bdrk~2hYHN_k9 z7jm=JtBCG)6D0{&EjzN7$XM+{GD@)}Qy4!XVW;=RH}JIIRKOzFU}>hCDmGj0BQdcF zs#IdKMDD*5Fb~BGSJzYHSO)J>vKR7~R1^CW{-9!67(?rhl{{%i4+(b2lBrk?xnfxL zuI{zA>6A#hQUR&YhE{De-Ag)U*Q!681h&mSKR7AVl=@?*@{vgP{2bXgeS%V3IHF)BF9I9p9H@j4aV&sDx+@5UY{ktTlWrr& z*CDaQN~^}8HL^=ewelXmJ>oush+(WZ^o2ezqS!bjR)?(L4Dm9t>3Q^G{b#3Q=W+G7 zh~u{#fB!0WgRUH6VeLe)rcy7g<}l`3`zAw_Qoq38Zz1(7^M!nSReIvz$TmN~Tuyu8 z8ZwL#k{@WB#1@9M*Ha#a%WwZc4BQT^+;O;z`#6)cc>1`I;`a4ER~fW!Tc^cMsb^ori8wwUqSN!Mb(MiIr+cRk`we_hHo9y3lWf8auOTq-fU^PCI5$IJF) z8osu6zVkbf#{Hs@JFF|1$l8S;c8Roed!Ot+s}JoYCi0g+V+P@CIHgh~`V1s^JjzKcH6n@O7j45JKQzsGYH0EAsiA*{iyJc zpsiv2CP$_lOmY^ zj)w_6(ID;wLaHvm(9L4Zc%PR%D%cLpc9T^jX_+ISh?SH8!w?-K_&MYGIqnw-wGB!g zQu6rWxKFj{ucSqr*m%Al6yf17pyxwV6+iFjRcYy-uhJ&3A-Urg5?2JSDU@Jc-A{eD zJUr&$Ia}_H__w87UT&;nZ}@}UKJXg$DO|th`M_P!yHC~wCs&B$*zA+k=@r=~XcS)J zJmdQ$u4@anYcWJ}Np1D{kK2Uh-K6y(lYWc}k?)B()kc}xJUT=|)Ibr^czz`T-J7Ml zm4n8*B_1tA-CkO`?*J}~!4$h&`h%nTvD+R=r9zQf&{7mr)k1*(1YFHZVWL;Wx=-p5 zz;56W)}aM^P;j|liroaUFZJ9o+45F%^Fde|DwJ8temEPk)%W)Zswm{w^}JsBEYWU< z*KJ47+f(AUmO8Z&{V5q^T%Yk-uL=>Pu&b?3)NV$(GT5gIEeXm%3bzcvBn2Q z4@z-up|tQ$k3UKfyZPU+?m&90>4fhN3cK~%idzT=74ctb<>PwsOzH=_U)Ezk28w&{ z+=SY!<y#?V_$|aJmvJ; zpeklnV#sg}DpYQ>eY7?6C|D;fwrq;;4t6)=NlDTPM@F9K!qe-vtuiA5~?z=yA1F}tc_H?4|Hkl>xRYNDFSlGMK zR02FWxVbx1JKn=B?x8-NsS!S?D5)(Mvna9K$12C))Z{~E;$7T(`RZ*}fp|mj=7TvY z`h0Zl2X_+Jqh_UvMnMKZlJ8#iWSNySdH3?%{cM59o2GQeg1ZuINLG$Li%1)V|9wJ)3?f43NcN&v)7pzhZ-jVb%sGbn-4P};EY#VI$EWz$c z;+Cwi|2aKI`QT;TAXhtKd)Not0Ay&uVB&nyYT6Lyp7dVYbw9|OaGIV2$+ z2acuCTQE8cEo#MC${5DCh~wB(tM$s|c{kk#p0&JO{1{vf0gB>WeSMTDacn}~X?`kv zhz`lS*m2N%=VG*@GkSM_cDM#ycW{2{N~+q;D%P^&d!h-qmEY@~Cw{p4`N}sk1J9$L ztK56iNj)&f8t)S+>>NmV0kD(>US79f6^FB@$1WFFe`0~|GWc!&E#^k($34c~?9ku7 
z;(RE31>Js?zN2?xuabSgP@!R05{}m~Y4NbPDj-kk^}&cbVxHFIkEX=*igUkm=w z*2He9H}*t6M+2LI_)U-1aS-J_+)+jPuos?+P^%e^lk>QWrQ=4wkNuiJ`YXt~x>r=! zFUlJ?#!BqlvRwTnk5Y>?gP!8HYx+dEF13W0zIk)`Z5>?Yjn)%x5ZWsKqH=nu@<3!W zl5C&(2pVST6JAUC3QCo6NS%8~mFQ1HYXMuce5c1EF8x}wUCUgI5e@IVT2@CYh=x8$ zTz)J`f_fH;^<*0G`wYXC*RyE|dWez(K62ba^_A*vXOW`0VVT3zN9JeeHJ>_2R!^_+ zVHNOPI*X;}`yk=y-GS(DZN#?Cxo`$2^U#9{J5G-b&G84`em;!_ z3(ekVKI)Ne_zpMDzUps;4%atb@0I%W{Yu8*ABuab#({#?fr(D_^0%(Z3&bhS0eBI7 z$(;x4^WBndfaWqN?_rhG&SFH?<^3cLmpeBei&~k(thH&j!3%Y-yr`}ooohpkZPZ#Y z3>(*uw(x3ZC?Pjlt<8Fkg0hgi^*VR1Df(Q~WwoHG8k2PuVjt% zcjYi^Yl^O7BVVvnxWPzalv-1tbwsZeTjWXoW+l|X9Ung|Gdlw#%9h0;koRgCYN$ZhvnG$Iy zBot{e%J{qg?sIo|XUKg?c812&VPc2me^S4dqCDz}nj2Fh=xJbM>8Ko5MllLYH(WI- zT|OWve#FzsjT+4ca@jGr}8GZ1^?ejwISetZn z!^F{5Wks{H6GYv~VC9ulVeFeTqh>4y51JhAg43 z8Px`s-P24%R)wl6K~es!-?Fvx?j@}G+A?E!JeXIm&MPtY_9g$Md-4N7t81Y7lL67sozliuxf%`KDuP(` zOy(?s`^OCTlQtBh)`)(u^$cj447QMOYLtDup6P$sc=}uWna`$VOZRQ}imMU1%$s(R zt5z}>V4aVl5i#E1KRy59E z;Dh(@**;=Yvm-%JW{RQEesgF8S$0|0%Q60l?RB+w5Lb5(uTkdJG%Y50Vo)jOe#ab7 zmK%heEXzWF$^`W|qb2^K`I+8B%_<{Ed#~axuQTuFq~Cpav(KcW`UgjqJXI%c(E)Xf#z=&&}u{a{A4o= zGFNzwd}uLNSy-4byIjcp?~)Epr(^Sxk}3xT+4#n3W-N$h_~&*&nRP&W0c*ECn`%JVX+6CvFMJ^9CXdqt8N#Z?Q~ z{z*Bvjh|lgT4)G`-7(EqW!(BYQjgFm`a7&6zGODoo^}4@%E|JZ4Xc;bPi6NO;g{%} zIe$)M96y;Aw7~z+yc1Bpwr(udZ14W#lLVDkGsP^&lS<}$&K+`#@vFaC{at(TtSj-` zV-$=$9=9XJmbPb$(#kC*CBBQtDR3R{wHSndU0@xR4+)=PlAVF~r8^Rdv%8Q2cxi+# z4SVT8JG`9y9iOAQ%L}vM$8etjiI^@kKIY%K8CrU|@B;pkZmfXSbyX?v@IQaJ!$Xcf zjPTA4-n#x+&}@}W>XN6kM*-D2hty@u%*y!({;>QXDTj>xv2dK=Gut6n9Nn`R0>XT= zh*#>b!%ONYCb8mp!GkafttJHB7L4W(%DgA|VnndF#@V8Z+V@cAi1|{Zm-ZmT^J9)k zTnNrqk4sj;jzG9$UcmkC%>T2<0! zU(O@b<(M&{@Se`UCe}H@>Rt4%TEg}3Cuii$Pri5+7nu1UV(G$9HbDe24zGURF?b$@ zq(%sO>z(3%QjWZn6&DYDs|Y-#z@0f~X`=q)ZF!`YsF}6Xe^QQA`D~0KAh|D>EQ?q6#`8~MN1BQCj5i7~v-Pq+RxMs8=iJU_eqm>~S+E)EBH3ogD-X#VSebSe1r^L-TW-_xCfg1_hcUA&hUKQ;<3|Ngn; zCGqt{0JSX=!~?-nTNBCHmynbi5FGRgkzC9cMe7W~v(FL1%1bE5NJumve3Q!1RtxqV z5+i24Nn^L9#o7spm3gvB=VhzSxeg&H&uube}Xk!;3B06EavaVyyTCE*Z32zR~0h_EV=Vt zSWsUv*kov6`@l0x$tZQbhki-=7H`NVW2nJ2@1X6iOTOYGeQCtw)O7d;e@jb(szDJW z#pgceQb4Re+x=v-ZGI6!yX%J*mNedRdg2|MsYdrLXo?Rh6 zs_|$2wH1=IvV*vO*Mc}a@igiNri62qKAjAwR_U%w#SMR8n{Ls+7NnJOJM!8yr9beP^?@ zjTW&C9mqllJpHC7YJ85Mz~S&T4%-7PV;&hcOc{l0b^?Mw9#0wbc$Cy`YZ(OKXvL=L ziX}}9jG>kmw(CIdN1umSz6CI*{0z+gE=sP@eOVZbBkwD8Y$;ZhHl$VQ)PY+{<54&H z=5ykT87=q9I8y>rHK?>iUWN1vD1{R0nk-8#5nj&532DjBo6wk+5_oASEW3BEn6BtE zN}PkBPsLwy8zJHxEFKfK{oxhzy^z|?03$`mbe^c~EyJDw3Y4or@>RYemL7{V`N(PB z9%8?^%GcDesb01rBDq2miZ;Zx+``7Y#coEDr& z{CxLE3#0uZt}c&Qqq$p0W6A~Mq`tWaF{#rN)9Isk6y29<4Iu=c-zvfRi}9#$J~khG znD|MOT#`1P!M29~MINTitPGjO9auM{sV8omD-lCHZ+RTK&Cm8uY)8scnNRqmyAaKH z3$gI;JbByhVSfTVl3Du49k}kHM#I)m+L*EmqQA4)08^9hPdEQ6^Sb`2e59R(*)>Y- zid3(S1b*yGa#Yg`7y7zg!qM=Ot89p6AkXH}IP-`Am_NZS{)X6G3*xX)SIEE%68rc2 z3rsG~>_(V$2k1R=p zgx`-2YOQUiSf^1cbxR%G)O48hwvAB}6*E$Ic)*(%O`-I0fA#OV-PDbk#>R^M)bdKc z{+k`!+{Jtz5x+mAq#}mW1)R1Gjj*u8a?$LVD)7B=rIu$0McGGj)+(bQ*3{iTZgLB1 z+>AD9V=G+6%Abm_sq0{Uxz92J+##EtgBK`B>1VF*zc_{qPh^$5(aPR)JYFex<_(gu zfl}6o-AZok5J3I1aDoll(_C8bf~2bKf?m9xqo;7&edcCtEG@ah>-(#JH9zHj9$}4_ zQZY8Vtd_=p_$ZNa=w4w(f__vS_g)6&evz_6nkx9iwsOqFk>3H|CDwmEF3f*8?%CO= z6G-YlT~d1C<^@(}!4>t(iECf+n%$;p&2WWuDi-c-wCi;Yc7Zz&2`(2^s*k>cHwBq< z?#+F+=_+*CZg}tq=qMJ*_ta99O6uc#-6+X|v-R>XhG8SWlAmF@w*5s*CfWY(YI3CA zzbM_R3-J6nX8HZ^7()EuJbJP`?EsOK^0m#3#vjPjwEGmEc4C^6r;w0xrJ`v&?uY$V zf^%wOY4T6H(d>{-hJ(9_m%;b=PLoEMXclfua3qjgJyQ$Q?tMu}qWIH8;WmZ%iQ=J= z94nk0a<4^*8m0~dS?%Mg{`62e#B&w*P=9kv?!KESow%-GbkQQkp(8cqCrz2Wne<%j z>6#hMI4$3T8FjNkar6}LIB72-dF1T!N#41t 
z;6SxqRH8#EV#?UyH6wTw!T1L5nRU_1xafdraQ+j=l;g||GmpaBvcNyM;I9g+4x)Dg(J1X{OG_Q3X;_+t&HEhPN!O^I3WrzuliUi|p z0A2rHX=FNUnd|#lL;8=&W5qP7qoX-(af!kZ`rj!j#OfLGX(k^7_)Q=m>@n;ho(o-* zn1p(xgJofoU1f70szE&8Hko)g!uer}w^IBp^16Ubyg;B#gLeSmcHD23$IQy{wBj>x z$2a_p?#cZYmGC598om;mqD_sNq#9S~l(!(ssbm&cf0ZP_`!g89T9?UEuI#6j+l*I$WS2>z9R7<7@ zBvTU00djR2YKi)AC9&*LEA@0ro@asT8!(LzRvMo&G``MiC};43B_vKV_)Y^UU8*#{ zSs_0OLt91DLfkU}_xP^Y&ApSj=yqLZ=zU2Sn6==GygKK_H2r@ZphH6urUoIl6U3!$ z@_!Ie_q9oh90nYz4pRG;oEOtkMf+hQ$77mF1QDc9>20^ZT+^)Bw$Y;IP$m@NvlfBeOh$c8CRi$tvD&qe)G9( zoQaN{xk}LZ`u`jtrO#|!H>$(M5&s;ZP2E3yMJuo1m+uibHuF<*&T!(p&CA zYq~3#GTXh=S*{qi)gUvshn;nUhv0iOWL_?`#csceZm{d{5G*a#f@@gtO`0dk2Y~dr z>y=swEMTvS$Tv-+{6RMUl-d5Yi~fwP z0Wg~Y*6aYz#Q$;bz13 zvEn$Du(J=b6l@p)CA^p|j#mlGt`zZ}FY+=M7`?F=DWViD(h@DRgxgt+wm6MZZ^6sw zU~O8$>l7t)kOcM~UNn-RiveF?p^>)amOZ?2In)=Mba5^0GuXJvZr&VB+$Or*RKk2vN3)@hyo_O7YpJgxu5kl8}T~r-awGNijJ|UABab)8sy-#GRbP z&$i_6a^mw(2_%drDkq_uEfq_`uUnEx1-hF`sm-?OXG^KGY-vHZX{$?Vb!_Q;t#LwU ziAA;$dMeGY9X+a@R=$D_smeUnOQ>@NXo!B$J*=CrvKAAYp zV6=N;*qU+YX~sNT&J8g5Sy|hR;eES4vpzxjD`Y?Kn!D+KNr=OYgM3 zh&X?NQlSh!FAO^`<$hE_@vPiRrCdUV)&9JU!oE`XStYGXY0^ggX)cU+jIe(}%Z}KcRAH}j@$Gviv`^!G}b!cAea;{tG zGprV29`!~f_l?Bro2#7l*Bt5oqwWv>h7sa~q6S8_YNxtsEL{avL318?{&K zc*8`Vp;(+(o9=No2RbxA%xw-^ZI0q>!8>rfJ2VC2nqoLxGaOnc&Iooyx^YxP{c@RZ zH@oC2mrh}xNEqWY2Zln<#xMkBZ74V^w|#82-7vQ$@wYJl@AlQz4zsYgldg=9kmS9} z4Kc3mJD9d=&bML?T|1oAIz&24&UX9?WSx@)gSB_K($6`FEIITDIQEK#y$wWhd!V|n za#7p5f_pY+k);VC;%eQU7FvjuCwt9y*KYsU5@KKrq+U&U6qKasp zG1R7x+;n`D&fTd6T3|tkPx?hXP$0oe`AYtHXC(scSLR!dzqZ;SJmuJ-W!T zuC2?owAjuzZ zzB_L3{I&ed4naBI1Jwagw~=_8@aYb~RA25pcBgl*)n-Jv#})HuMOLXE=Zr0`j;ZF) z;lthr%oKsP##pYGogNyDA z>~M1T;N?7DcdHwZjSJE-_co$;R&~zPo!)hAb?i@5vja=J z1NS%Q+VRa2SFj!&Yk0d-OLN`X-$#j(FpcAKF&&wk-o5lU#gTf`J6dK z@+?>IBg~#+RC9A@i?SVI$nJzM@7Tzt;Yg#t9`#;km22Cn!J+GoQ`8%`f;XN~_4GZo zi1T%mf?p1+?1#h;4$3D-BnYVIG?n*k2zwG%a2BS?N;(^7iYUr0I5*_%qb?Yu45doY zJbSxwn%zapWqthz0bz#+c>k#8#&Nc{P%bmI^Zw1;81o71byE8W0XaHe{wlcqy>SWb z&FjXJ(~_hdf7d3D5hj&zDw(am7^li5)S-!{zp~zwIdV_W+XXUT7_%83dsdkZx-oewD#pC3H2(N1`g_|tWgF;~UzwUm>Nyt7S8PVHDr z>*tZ)C~Da{#dH6UeKGW^-8Scey_2=f?6S_SS%)#5s>xG%O5gWhjm25?ygUC|U#4A? 
zlc?Hr@%z`$pO;if)5-ZQ+B?M2Q3*WjJeQjW4o9`+Q2(2;kJch6tiP>YgMTtdTGGb# z7GW9tn>j34XP+Q(^hpe+vDBQ|C6FmOEJwU>d+<=yaN8(7uj&0x5vYr)BL+Pa3_QM{G^V?}9~m@RuaVsbMp6?~J=Y zdHis^P{{VCo?yiK0?l?%FB~^<&w?|c3i8ybwF=VMY%N$bI@w`c+uGq#URU{TN2soR zgJa!BhK@`)r}1Quxv4w}dvIs`*(JYs!@4-3N9((XWOv#&OhPuA>jf>R*lcbouG%n9 z0hR6il&POO_S!pbx=ze3>*)_fW3E{I$bDJgaq+HtsuOhGW}Q4<1N)TCGF4Zr^?TFdmR|#V(r3EynPvG`&mY zN;GKUtfX#ROZ!6cdNW9N2VQcKqjpfZ*?qllS-Qdfsg@RmK}t`1yH9!RDoBgVtA`c`|*RrO9J3{5|X3{Gk4>ztmz=4%pH?2!yG zjQd7wv=>5sN{d9wO3B!5Vue#ABdHw4ViXUG^&j#xapnr%=8vKHma|H zNKHLW@srt2GwR<<_p8=pwS&I2jNVTwKSjm40_~~4k_}tBCS!ch<fD)M;}?|z>6G1u5p@~m@~>j2cxU``peOpi zT@5jev96dGUQZUE(_Ftj(Uhm=&!_TXneSo$XyFmKat>^bFNN4Fc#uN$1vzGP4JK9i zlcqA}s0bCU(0nhu?56zX9276*oJ#K>@cN}tt0@I$6!IA&Tg*LX3sO<#bYII2Oi4}C zhsW=oCJ5-;S)1B@YK~<1wWFIHH(#bxUM744b=}jt_=&YtmAW$QM)i+MjxI>`a7LB3 zyzB5bJ$L2k$MO@<7n!wt9<2k0qu06bzb69OBFKf|B8r0A@ zxNk=lkC$?S8r3O;9oWg18JIE~GiqyGuEbll$048PAA0@f&0C(13u-Pqtaeh4Uzx8U zZ!Y()^|T{W+TavvKdbyY<&DP2L#X>wJ4y=U5%TdK5=5^y+I}$gB8wjuH2|4!rA4$MfEujo<4Y zwyA&cIo__MH>>;CVSTus%lluX!MzttpTnsV>@SJ4d&%$AW2nj3A$j-uDC)$5*zuB- z=UE%g^M;tL_;rf-;KBOCqnPWqZ(!-cL&DuPvBu<1ud3RH5+<==?q1@zwj0A%ZHC0$ zD2z9b2!!Yb5+D5BU=TWQlvMF|Yytp)g+SH|1_1WHc?5tx5CVD0)2syWeU=%!CK_-v zPxxbxhSj*)Yl+N}5*L7hQJ5R&pFPK7Bas*K%2`|T@p;Ebf%nlAccz{dJD3G+DOJw;Vp%uddU)k`pu33jj>bU9OCY7!LJamn zB(I}6nXRRk%x2Fjo^HX3L#o2Ud)rQmYwFqS7B|nZedkwxv%|N9>W4P=W#f6K%vA`B zq4wLZF?7r0pPoAh)Ox>EQ52q$ks@yH0-ykrfSf<5P2{Dw&r6$!6A$m9Mkq4<>zTl` zOV2XHr|;;z$!k1*+}dYj@fkx#^WJXedouwLp(JcgM_?g$S2OG=p;U66E}3^=Wc{eY zjS(D|bOIVSK28d!-(>QU7nZL(x%E(K%bxLEPE8CTIF(oD&_aAn)^t||7o%VB;jJ^HE<%{ED#IJRs!qy?x?k`zszdl@Je?EzqauN~0cRdT6 z7kawCW7B>g#5J z!V(<8;dYEftZNbDz5YK1)i^QnQj1;esinY2HebhwlB8IpX=pZrBP z`Kw#)uA6V_s(v6qjwc(}qb zT4%=8=$SCPbNWfXOee+|D`RR(8tR*j%pXT-J;o5aW>NL)nZ)yS{yC3Amf6i5v+iQD znv65=!85a*GAmmjnaw>2g=UEyXL$=KhP7wPaWLO)&yM2ExgVQzuAcKSBnK@(m|;eL ze?7-OB=bpew%~k@(3k8-R@zT@vwZ~Q>h!Xz8S^Sqb6Vw@^I~P6L7AH%xt_)J7EosC z<2)~p+=k*jubJ$+)cjQUyj;D!D~R(Bt^Ba2}es11%sL zfBdtZ{c}jcNyy{Vr$yfdil3$yQjiu6?iPLsDK3QOGwB!Y?`p2f=RufCzP1-_yBAaH zmk1~ni+IqI3zjfj=W~k}{faFXGcFY*ErG4Di+Pm%D$WzGDjnx66ZI%FnaS4{d@S36 zkoPEM*+Up9l=(51?v07+E0kIOD3&rQzq(hBf|uQ_Vkcl?>R>=RD){QGIeFZGJ>jtVWo3Y+mKe%6(r1fB%uK1n34@XxKh zUZog8T4|vF6nnq&PHs}dIQK_}tEi2qNrF{39G>Klb2s5w`j@%ihg?35>v&q!QCSd? 
zVj@vp>QQ|iUR4lLRkg=mxmR4kRBfDA-Kt-+d80Z{0@?M0yR%AQK(JR#=8(zfado>@b^GIWhkJFON$ZaV>%ZvNfAy&U z9#?-}RsVCm{`X$}C20eJPy^^%1CeI~Nqhsex`FIv1I2y=6&6w#uxugp!%86%V#3{&#sU)i3|N-Dd*oK<^1{|q@0Of zii2J%@;haH56oeYLuyoPTl47=A1drk~{9}N4DkN63X1Q?9m z^%}XCFcMNT@?c^l>|i9Ed=w)*iZvLG@*0gv7>%tNjh`4zJQz(TA4?S;OE(yM%1kRRene1pO~SpsL6(RrmjD z0{t%nfd>Ci1c5gHF9`Gx0D%N)0Al_%G?-LQI~>AWiPZL{*;&H^Q%!4;H0E;C^e|Vs zqDZ=+o#KZKgqE5J?o8bhI$mCkCPK;UY&~AAg1#i0$)8~}QSQF%KLP|w{O>^^LQ)Ae zMr4LW(V9kK*h*iD+(rg>q7;KWSE35%0+(Saamyx|;C_bGtMZU&!72^CKOm5Ekywn{ z%27(B)(_yaa#HsTu=Gk>y$F?{+f+0isU8H$9<1REi|D6^E;xpj_`Tv_jl2flto)XB)C@90ur|`YA;G)V9vjR zKuLt}&d8x_!WYgF5g#sg!s1m;-muwIoPjwzR4;$O+mR*Y{C`29BtohjIjMWwHW z3W_BvZCHWuc8pIANKIt!O?^sCg!sQ31R{g~+aQo;pOVy4B}s`I`@LUwYfKz4gXMn- z0%`U^?-C=!j!?HW$X$X@>5z=NZuIR{>PWDL7D0hGharn@0}6$;e!a%H-%C)WV4?E1 zoQjC6`YK;%n|62#AyL#EGxOM>Fj>hOUHcDPsetxt@~@48>T}CQe0F8?mXjh{@&=Zh z35smoP#Ur{WisC93ATsH=+G}DIoxCY=0?S+EG-ULMsIj=(X*tMhguJw(7yJ)^Kned_K%n7&6$C4yI?AW+nQ5fBJMDmt;) z)D_`peTlrMlZ(I8Xa?$_j9Dq$a!3j0z*q#P8ts*I{FwOzbI>Mef@gzFu^E%V#y2XF zP^T1z{5v3!Bn_^V!n^mX_?;Oatp;PE-);m{sJ;;|&q!^EL$QXlO3K~PjS8i+!7=g; z?zkn^&-X3bUXfhScw)0fE4?(xo0%PmU;0Co9Tl-wti=oPk5fw6=X?ki|F=M(=zl*5 zWdCo0K*@hVpaF*eIS3T>HxP(FML~a#c*R%4ZHV?E2x-@QV;Kv={XGx}_pg9JF@!t+ z+d&|w|1uyDKPq}J6%@yjBrJFze&i+dRnq@_(>#C_Tx#vzZSg_bpWmDCh}pd3uq1j5`E zIycC$SdN52|3Op!paEfqldK{=f!1kG9m_02IMuAFE%ArU=UCFhWs7gTS%FqCaz;#x zq})E%!6X9I%#}T5g?KT`6I_}xixOSofh$deP0^s7VR{aWG@2VQ>dUJVl#gJP+bM+N zk9E0ZBlsSp_2ACmvR}y;{-=OI8vg_W#Sw=4QZn7Al%+-H`BD;dGX2(pS59-T>fsNu zFzmRL@0gt1qvRz4a{H(Z#Z+#hP;O>jZvLYz?}#$soRMiH9PzJ%Krw_G7I1s(d!f_B z+D>vgV*hC%5bhrY0y+KTKp?08zaWtF|7Q>g_a6g+_>txK7I&2O6;DzkE#~siVnlxW zj2)x#-AJYLVbE7J>4t2=;4@0mY5skCHLWPgW~XWe>gP6^jY(HKx~C-{nm+eAZhHYq z0};qU8=qWjbCB2Ulqq#KG1Kwkrc_>TeW=KSCM#cWXevJ{Tg8JwM?}te5U8OmXszq| z3TfdqMemV8FN$J&n$#H}3WAm6E)~n=JuI_|Q~A>yK?I?;*nm)yl%pX@^&|2#HC}rQ z<&u{zcG8WaD7iC5;wD(_2;%--hi=6WsbG#y$Z?Dozq2X7GpTKvDK-C{Wx=+u(4IFs zcnsgG6{7lxW{SjRak;5ZA9g)JusGfkB%ZN>NPRrh>{&Kj?2+ z=YTQYtsxp4V(ygdE(A<)+bF8vy-wDR17BreQ6bs*@33BqFtd^xlOPvx>HuR6vt%eG z3ro2LNk8N+Wuh9fumah(nv*~m9`0$o-oZIS)*Z`So-s;)9y9`0cbT5gCz~N`ouxwb zgMOgMx!xeue-J787HVH&MnHX|>|3?W@B(SE0tOi(Q5XcQ59?mjFl-xIJ}$E(FLwx~ zKFE>r}_5IKUf zR_F+&KWm7iycB;*eYvTOpwI(wJ%9WQ9T}jB-V7{z90-P_0>P zmHzO^uR?r5xxn1Wn!;#CLz}m&yAr3#Tq~sH2Hy1VzNG9!xTKgOJoKrEn&WP-0|tNpXMx zvUzg}JYP&~EW=rx4i@({D4t_AuAn|nWaa1}E`DnzOfx)*7!6wyv4Dv$uRxzszUUyo zt2{_@c@_4^o5p++{zxH^%9yGN2HsEcy?|BX#41Dgo}779;uNGwAps@Kw@(t^?G^aO zd5}D}BzF=c?wqass!YB40`Wl^9&|gX-d^5_}G$Jbi!_R?@G<9`!_0) z6ro<#$`{qG_M37S6k{DVgyK}9MfdI!ZZSyFNwL8>-Xf)WDes^5r*-3T_%aN}=K_adUaDJizCyI2pj zt)>d$4iz{XY}XqkWBSpRlD%-+|GYJvG!-iGGO=ZrKL_b8eWV|%joFosF_FS_IgvC* zK*aW`%WC0zp5_MEV1;PAN#$K@p>)gmoiZV9#j~s=QJ94g%t~m-?%Jna7ZS{LPt6;| z%wxpuyyqY|^vARji7`c#riyig4Z=hyR9fhz@1r18i;d=f%O^IcwQG*L(IkF-A}I1t zaY^2T;R+!!TX(jXajlDN>MHP6ruB~0cscM4v7P-pa3iHeZ|WOLkkrS;=rFgqbh{w%v9B1A-D7gllf;c?~w=*2!Yv>Oh7t3&!7%6}bZ zhs@~nxbo$JqWw} z{rJZ%vD8YhzU{^n8PZf@J+Q43g!xjqZBHgobdU~dp(^TMdm>FTV z9L3KAb)UGgY=lF=8S*#z%&N%DE^>Xu{1wOKEBd4Nh%|#SV+7CEXW<1XEGX!l%X0dk zgFs|oY2z&$*E;0+S3>71hS;l+>M~Z`R)mp)i-yuDEOKLddb7Y(8L_k0}&WB&#M#nNze-jrWH`6~#- zLhW(8zEC}x7Y_pMw($p_#Ygr_$60;3Rf@dyy8TVby~)8S=g^0awB5ax@bb4pnkrkp zMe9S>2SJB#+Ha1~);&5q*c`igrSR~V^yy-oT`8~KyI<$^b+3NsynwD8@NSPzM4P{r z&IvGdEzC<|E7>=6{kh3OZ5vaC_c(m`U^W_$o4-#qTPo1z5Ch+*jCt z4DrBOXsjmMIZc-j+kB!gthb4Y5s0tx7nv!UXne`qqa!qpTgDX5rsKftK; z_1i{he+j3MfYw$*Nx4R~L z#19V7>v4$%XAA14VJgJXDVWzK=vcd7>gQa@yX7~@=oW4^4+lLqc%BzH3Ea$!5p``! 
z6W!HVmV8jx+-RNmT6$Gy-K+k~>o-^2CDy%VdO`Gm0fCf!e+55&b}+zv@1H;*JLb-7 z!$OOp0eBE7RO6W{{cm5kkJ+2MFV=?{c)CoV2p!Bg$=a$`#d4Qs^tddB-FwI7!>AWl{E%Hu4G|EJri76gfnLX*DMDn0cMF)4Bh!2&P@%UPxwF z$&R|=EA&cDo10TST3*Z-^X?58R*hGH5*)~BW=3d2;SdpUEkytBF^9j^6)beh-h{{O?Mv&eS(AW4>b(kbE&AjoBsUIc8j{774i_$XB{ujtMk+4{EzXv)^(1;iW~<>ZpClpRA=XgCCNjyS2h*Qvaliy`~` zI%FG|h`VkWnw*9$#c;&)Z%@k&|GQE%!N+G3n7LvXZJ{ZT#_!D*FLBW3Ed}}PPn-iYBgn!LG%7F!3TzRGgi+s-SjU*((sqs=pG$dR%C?;;z zo#MkcEZ$u2rQc8^;E$h)ZXxck)l-Wki#SR^a$`u;yf88+9gE#}Uh|EGxI2gNF1-|- z#jj}Hy4&=igwD0K^ZkvP2mbsw^Kk1d#nQUSGgR`!?APr3ZWtZ3XG{oKUD|t83xWS6 z$LYlh2?trK-nW67AvvL<(;B=VVaZpDdWl{d_T$SnZRvL&)pMzTHX_xz1*3+ha_}}L z>DW5bT&o$74vAqv^pj{!IME-TUR93kRnt1+VC!ykK_0-U{G=S7GY`dBZP;i>Ok5K7 zfQxs`a@WT09u+fiNxzH>J6Wf>ea^Bm@1NEF)&sF-z-)!W$F9+&9Vc&a@PB50ZI>ok~r3hj^=$T&WbvCbDXYVE2o; z9;fk>=c#DN-f!F$vn(|$Kxt%&HM*+??$SKv73W40P_PC%&Ti34mNA#acyc?l6}oAO zs~8X6(cYl@VHsm;2aojFaiYVy4T$`h!Fyku)Tu{9-KWQ->~*D6KQ*))Q=d_K>pIyz zPdcr+QcaHU;3v!yMGy@NetBrB#|nqy%v(50d07f zm@efRP+Xspdfkl0Tz|UO@`D&S72(QNtzKsiB_8B(I2p7K-6D1%<}ffx=+@a_+`Oe= zqn~uDHEa*ms2TJ1i$Q`4H|MdVPO-KROS1X8v|f$2_9*|3mTk~r;U-mx&A>nnBKzlr zS`~S9xhZuc)#-muoy&hx`QBaaw%rnpee+73LNyYz&oev3xK~6@&BICVvm+II|Kbb7 z!f%%TonP{ne!HgRY}7st=kGzP!P!4Cgq#fG5OP>h&4AhNjqjH++)FlT4=1pDU3M=Q z0(yoYE3Rv;ECB$oF93vUvP}A@0TM^#O+8&&?y|R@*8skz4u_2Yb=A7y#Ne`F6Dyf3#D2E=vXc|yb>6S2ax!1s~q})Qxwrz zH!Ct#Fu0e19}OG;KoOb**Bt;5!T>a>TaP*__0*>4+XGc38jX00W{`^^MPWz*Gyx6> z-&DN>sUdbFK@4bO0D!GYcru_(6jfF|88!kSLb&HVzsNQ<`)T{Z0KYP#5koND8PL{x zd@V8{8%%*yN60H?r^_Q|h=2yn3<$XVunz`;RR>I?jgS!}IDiuw0isd;Aq0o)0Ek_p z!gO^=1F>HN{@hXtgyAqZbo+zoFQ!T-icz=3G}ODKcmTjaO_O#y`ZlD^2T3f$oq9xu zC8ohBZgo=U3@CIB#CrZo)G)mTlVZD0W@ATr)ASgz?Vr@;+$BS7)-Km0yt!9(wlB zmg&R@!7aGq3^ZDo;d#V>Ze&pCRbW~NnAY$IM-ZTf{g{SRO^I&E=?v=Xq-_#65g1}R2-;Ivvjr;$+tGvZOT)5tXgIDaOToR>MoSYS0wDP73KY^{d|+xJN--3{ zfS(&6XOLRJm8v4#E6Wf6=BYO5G)~P;EDQ$VzMZcmQ}scFtrMCvd7}YEfF;eqD<}OY z2YQuem;e@qvsg@km!6)4o%lxQt%Nuc_z)od8|BMm?6zZQ!k@I$kaQd06mN5 zAs74;SE?)kOi04{03gat%U%f{pp?og2JkQd#&g6#Wdx0$R*2>`02a-#JWd!$xJ;vY z=@2|HoiK51s_4zoqZF#eN`RT?hI3j1h{m1{TV2%@!xgDihBmz4qq!oa$OtAF3FPe$ zO%GnHiuWT89c>!|Ck&InI%5RHg;Twon{QQKRfXvOv4&9N;&^I^VKU?|)KEB{8d}0r zL(ztRs3G7FHKZFkhNp(kUeybakSzyQnbu}~1Sf4lB`dYC=T8M#UZWy1hAWfk+7JAp zh8X(26iC#%jK0s89*3sO`)1&&AwA3M!x_z@Eyb5)O6s$bDgX-I-l>JRhUnaesle!u z3GyUe)23~65V>9y4j^FxWNt*?=juT)s%nYADa}TG*3|x3Ln*o>V0A?MtI$wA8i@B!13gWZVp zXlt|{?=F#_G~A_E838A5aYA^?O)>}&5Y4CF7F!GmCJtFJagx-+13>pZmCzRcg1VArf#%+bDS`uEza2G+$rkDO$Lum8Icxwn^#XoPJN5NsPyT~Bl zSj{+Y|0^D(KO7Oq~3w*>w+TI%3px5gKT@!dRc1sEfVgVDrj z9fUE4B*j+dCI?m5+iIRP#2E^arV~LlIqHo*J3y#rqX|*tJ~%>h{=U(JD27UatbxGs z2rD=}9T1Ea>|MNUH$zhfTa36M23Bh%Y;3!=SJrDY|5!siY94>AA@u#4vBR{#SVNU4 zR%750Pg^mVBkRE;#IBmEqvCcZl{*TF&=V zFXnT2(iw2`z#w@0rnP~tG(6kNp(wIVvPU_x4YXrN{a0$Ja+$=HT6cz6=?lIy8kF4| z`w(@Vc$kc2c*Wgv@Gs6#G{QjMK<@Q5Q{ulkL%?|A8q{2CPb9TvRRZq}T@twgG5-%V zl&$2jgeI~CK6oxtRyvWQJ7swPKtqvp_RkN{Bxp4B?DegkwOf|XtdH@~5E`#$Le~`e zve9OLpdo-9Vo8IChAgk*p&=sluzT<((GCFA{&9v5Uo=}o+DIU4Kh54BmZBAsfN7_f zAapOH{ko?VoMx3kRI-O4r3KoxZc*n1865~fMgXvJ_SbMi6$(Yv&$H zl|UQS(*k1IyS)H_W?1VXiBu^DZ&E*9LonKktDuR_`T|fG!?0f61!9+Mwa=`;$^j8k z@Z3ccnmF_ZgdY&0*h>N$?_63hp?|XiSDLTxtUjYOk7j_NK@c}L*GcuL zZbE*$FP=NZu$GTPz`4+S>$5(hrT6iVKOo@S=liPThVTcE=cz9GAZp8E3;;-sKvjN+ zMmfq{eKMidOinoD8QtrFv6>KZG%^JbBWxWej$psv|e)!_ft;fVdN+eSc7oH}i zx*L%X+2TS{%5{(0B1SzT82<1=i-apx9w3z?62%k(knH}YV{PZxDDfSpP&Et^870v6 zMh!w})&ROC02Mz|&a=$WaVK+M!|W=)z6HXj-r2KjV$PtlyeaAOVjzoO4CGU);ZH?f<$nQhqsegj| z$-eg=UMP5)pckFY3#YwvAjS*7H*D}(3eO7xJ9u8GVO#MJFND3Q9)RG`ng{kqlznKk zLw;3~sP6070XO0A_HkP$s&79ORZJXr_W38H0hv_d8Z7m2Bn1Qdey5Qt1c0O*j5?pN zN1`tuUY}Hy?LwUO5CGR{bGE`swFh 
zR1_a-BdYhgVQZ6=T;5$Z0>B4rrAEYfL}cZGmGi{vPDejHOSGwsvH2e5auy*l{TN#K zTm}i+=EUjte#L7*c}tIOozr4d_G`0ASdb7rGUQB}=^)QV`~LOqHaCOpUNQSXm)k*q zLx!@|+{3-dZrS*q?T7qQ<2m|P{s$R~W>V4+2s^pUmwLNR%T0JPDEywn`+oBqGa*5W zeyb<(yl|Qp^(%jnA>jBE(V_W0%*bHGwwLuo2l!cUY}59Mi;FpR%xKq_OQSX^@9dwT zqtyUi+2KW#W0jZ-W)xy3T65{c7v5<01MrDq!!d{0X7A76!N4_Y2)Z zLZScm2mnCfv$>1{Pv4?A8%{5KAXMwFe=otFUdvxVpdzgLZz+x_+D}sC9E%f9vqGsj zJctt%I-`YZU7FFMkZyyG3Mm!A1{n;S;b6{IaLJ?+Vb~WvX77BQ2HDFcOoW_J*g6&a zl@sxp>oiu*#j2I_(>kgbx=tgSGV1N+s_)w_i=qrd-DXgVPG_dzEcdE*-aH;6B945& zartf2R|cph7ePyEHgUiU&k-R5BY=p%azqVLYXn-cC^ZL8ZXP3|V79dadn*nAZ0z&v z(KgD5AGJgvn2+rvAVwYa0{I1BfQX(b%o<^|sF<0^R$os}3Zx=HxFlGfOGG-Q z0Nf*0^WP&N@Rh|#bn~n%1K8e`*95m=H{uBi_|^FaC=`DcMSANi`RUw8VNGJ#H#Bva z3qw!FC7j+Oc-U9EB5+a{JD1q&%(Z7t{!|C+nfR*Qquf{9${c#cc?jRZLv=nAuwj=+f-d;;7 zAVaPN?U^Ck61n>Ft%0HGHIuutmE=N~qLyUuwHNyBZq-2M<|S)ry3MXQ3ltH)UkOYb z71)4jlk*1B%*;zAd0s3L-|icd-g_#^(zsCTbDqQ?;PMv6&NIT5B1dG4#9(BlSzKFsVkJY!3ItPSh`DS3^8y*+PV`X^HGdQM9Y!2(5;_lMUB{f;*;yR}S)&Nh^)dLQbdI58EEAd@OF3te8m(&L>fMU72W* zVPGixi<6W3=_m;zH9jeH%lH%Fn4X4lT1J=i@2)o8^yWSy@8I0QY;_37R4H6}w*T7e zt+G2Ae(7eO`{ZBomQCq{&}F;u_k>I%^p%GgzrF4(q40 zMUe1gb>r-FBT2Zv@W@ol5YbFHX1y~gSnd{>l8*|={}?$gj1r6Tavh{R+(%q}EtR~~ zNUgt!9Hc(&E!PEWI3A#wjDiDG@jWh?UzI9zKcXWH^P7tOC5kk;?)UuCYcA0`2a!r( zrg7ZrOxH_eGiS@8NDG| z6z9HVj1MtScxJAu?(M+Bdi&_~qaZ2=zLOSsGM#*EoxVYYW&JY#%1%Ss9c4X*G5eWzm7%!67yGD#xW(wSoeH0?BMB156_T}jV3K-&!Z5r*WW@;}>--{h7id5?KG5 zCyuAyX6C?SF>hB3qyBy+x$6`MPB#^rcmig31fm>SC8(O{*6Bz%KHv*W|NhDMq48D(+^Z9j|MDY*l@Ft zFVk<%hK2*7R7t>jl**~XO`xabOF-&7P36~SPbzgtLTX}$MS7JGU7SV8$n>ohI4KUeB*73ir{6v>ddE##41aD^vA(bR1qi;i=jF(Cc9sFxM=En8vqw_ADv22a*8@Z z7b_*X$zKtfGVshzdw+9Vr*JR%$bXE@Jf%4*0(rZ9hzrs*YWy<-Mq_uT$(*o;eHehl zBE=NgMA!Q0W%^0jku)u8%%-H0RGjM( zl21aYZO1t@k1+b*qeDoqb`WLK->;xMDuS%ww3u_({BLL3lF1=uE4}ypb|1>}{{evp zM87qrzc_GJRb2TC2=wCcPBhGt3*G@P#$ zEb}?N9jGO6^xX>r9ch9xxT_H^36Kxjj}M_&%c`OfcTXnb*JTjJ8! z9Z--eNnOyCkT26VPmB%QOcaljE1_4J&J_;fl8bgiWo+1mm0gJ~Q_xt~F41bklxc$1 znUVb}9~~KD-x!VooCn73+PSK*GJVJvUNnknW4n%ykZdV45#=$xqo=d7WRjx$oY6{A z*HE_uVkV<~Rmb9nW4e_62OWQBJsB4roF=k&%V5jNa6=O=ti|j$Z_$A!=XQDx-^>E@ zl&xlD%&$ovfhBbbt?(caWrbd)Ep4=m@dT&CIJJ3Dx=CoRo@j-s%tsTo3e7s~6|IO7 ztq2+X^UoDKUG@=o1#!L^>0yu0sSNH^wKexFJQn-4U!^5tFA{QWb+rdb~d0 zcunsVm&2nROBRJ(seO5ed~-Pk$7lUGwf$!|B+5{5GngQ`9q*Dly11V{v|q&eHN*5M zgA}3Eq&l}3PA-&>Gjdx&i;yw2p8=AyV5xrR2XENxCkUBsd8ONd?OcjOCB{43Hpfr= zOmdYdiO@kC5=R7r6Y8ETeGi^naO-N0oKDM@_3`b11Oro4D|14u`J;6rt|SmJ74iFZ|@s*3uV;*x`yHYESa}BxBbQEu?FSr@MbyL2xDdDW)`h$=&6vKLY zY0iu6mk3PI%;u?esWgau9P-#DL#tmXpH?tbLww@NU^Ag!3x9UBKvmx+wdj+wyF40U zS^RfC@^Rt!%mP@WCv_ zySA!4Zo!EOij$~R@~taAR@DugD12q@pxxGSDO1p{C?gQkfbN^=^Q!DPXwGXY>vH4N zJXPmGSxGYvhl0;ttDoycqKWreA_qU;-EVxj|NKrG_o+jBMRkWtRZ_=7tiE+;=H+~6 zozSl(jgc9l?vI{6B~QCIz&+oDvhlzXAlye+)5j*<@5)ifD?A`mGs0LspaLG$5q9YB zlSc^;yB_Go3Jv?$49oDCSQ|2U-&86Z8+lwqGweBBB|I+rK@8ws2%P0BwoxLtoNX4K zy8pJv^N(%D2!)oug(^yZX_N2xHJZKuS1|?0YhM zM}*T*c<8>!Vvu7r%J6F642xT-zB8@kE{HU6Oky;{?!_QIzw^@TL&^}Z!HwEg{bIqk ziFvSjZLxO-a^2<_Xu;r`<+YR&#@@+@{%Q~&(x}_y<&aJ=*>}UPl@PA$T;g>jO{(<) zhKc>HB+9rIe0&4;~TN@@t*6c zVhFcf+$9T{}1a@@J!N7>NpnY|`t%-{#;cExQe`AMOWOQeb%k?P z&X$kYscozM`^rV^0zO@14pg1eDy z`$_>QvXG7>@ATuMk|3pd<`zwBP^Zc4)^}CunVgotmf66lSHDuxN%Ry}CSiiFO+xsk z{{y`N(wA2_%P_3`lS&5-st4{o;r%|0Z0yKHb+ZP|8cpiZeua4y942rPGOdH6{ic7! 
zCtalJta01=%N)O?xqhP053|?sNtDlsierriBRjJ!kfzzPHBuzq7NbzKV!6B zYHKLUX^BT>C|&yNN?Fx1e%^}cr!M-e-8N$|;bzn}D7<9AtQo2CbAasCKy}pw^a+&w zbEKNyAWuDh0tRLXi`y99`ePiWfM-Pl)a)7!wfuBxG1q=-dau$+J7@(y%DT?3^~X4pitxcN5#na9f?80UD>9gA!zec!A`Hrk+L&3|L=y~kZxBCDP`P*4 z68qKSK}!@)lOn~2;=8}ZqFP~lufp31>0<)DEu8lP`lDVWuMvi<(pq&)iqqatS~ybS z7+AffMXibtSLLdnF{yArmsX6?!bQeTn~5GD!IiXK<2j3EN_p2?k}Xq z<2Mp-7ZtxYX(oJ=00)kTg4D0qvH3ogo6t}U4v<{5W_^FmGTun+RLAzpo*LXI@hTy& zq@Ur0;FFic4pgu3zp;1Me^GAz|L6y1$RTFvh5?jTx*Mb$MM@+LQVBuY8M+&!Te`cM z0Ythxl$1tLK?IRAy7#{KzV}`GbAP_)oFC5l53cKRt+lSldaw0*B7Wn%`v(74wPH7gqbs<} z@*~?kqy*k^NoQAcp;z?e8h76{LfbXCW@sW7qW7!6{FLnd!;27|5lx+}#93D`2id5o zsP@pf1QLUesZJ3TCmiCROa>&Po8x-cgsp%QyTPxCU}k1_BK5}g%JVESc^D1vo7@*O z4h~SsQnjD-)OdO+$J!$=7$BH_uCM|hDtmtUUd1#INvGHiTv*JgL*v$Z@bAHt0W0Uvo&CyqQ9U^YI zW{Qb+pLA6cZlsO|w-;3Z7X6#!8Y6*t&d zD_ub(y=U^P3h9(DX-L|1q(GScq(DeF83+|qr5!CQHUFmLVvKtC>qR_YPc4RG4sx(+=6()Vv6M+=HziA9BefWg)h{-v_awiv>#`Nq@m@nY zW-3a2?>Lr)_TOsrxMlk6L}c`JUkEAmMm#D{<#mu`iOuod?koxiEZyvq0UWky6Woht z!w9}(t5)~RRlBY{rL>M0K;n^e>SxyDd8#k9`tth8=rT1M-7aa|?B{DuZmtV75$39R zfA&drwu@-+5bDU5M*tb41kc8Hqn;UjcurrBBwUw+9Lv(+y)M;|?Dghc;tv%w?dxH@y^pK<>Xghh28B(JwoOZppf>XJ zN6lkde9RUaK^_$oZxryVcMIjA%9!INzVZ=O?Y^I@ciDXU^mlo#XcW9|2=XwL+TMEM zjc1w90PTY2kQhmls7}nxbM#-&@a9*wIWsS!qP9imWZ#5eD5bLM#N^D~OZn^eNKRh9$3ay-+KGg}%m=%LzmW?)r_=yjP7bh%)5Hjx{;6&chRb>Li!S z8*NgU%-pd7!&BTZ!Wia&?C-F#qmS^>Akd`-Ekl6{b144e+|iU_ygf_EO&%onG@j}*NB8EesK=0qn=0}`wBvFF|k`qMd5 zN^eg82R}g|%4XMC&v!pTppN=sX#WQJ>1VkI_*aT%5h5MI_}|vzw;b612m-~H$mP7^ zaEGu--!GsVe2I3IHeo+qCBsVM-$9@(!D)9~l{%(K8gC+AO{TG1aXNI;f*7zbz~4b2 z)|p3uyuc85c5Z&^A%2D~phPw%d1Z+Cs^D)RP!3p(D%n=Ug7o`bk51fCsO!-R%)wk+ zMv?_Z{VNERGS^6h-wQXe?Bh0N^1rcoo#E@8vdnjA41#Qf3C~DHfq^78^!COTGxfZRvNTCt zY{CXBhmoqf8A*J~)W&tOc~xy+l7t+xO?FiyHT_JI#NykVoJR9%cRNVXA&r~dHb&|u zZ%JM@Pi@}tm{-65og^7W_MSJ?NW+SOG^PK?mV*+7n-=O$YHD-Zbp_zJ-w|YwxApBOO_8K-%(LD){D<1 zd&O~QN6lzK?_~#B0bk;dhK;d)+FP%k?1p^ zg(p`8QVvOJks@7*G46}oeqg;O zAWw_&tW%eNiJl`+7`cFBcBa6faT3foU>st-K~CUTzh~K8j9Xfe8x}DSZWLs{n)=D2yY~yXg!9xt!+#+jc+HCMjMvPsFf~^ zGb=W>K@{&8d67H9Ss0nco;cU~5jpMyVIM`61P`5-W@FZy=I>S5R}B*5+@WM(DSqPC zkFh)ppJf~ox_M_xf@F)~CEZ8I7}ra5J$1r>!TwbsZH0NnRpWU>Nug8_8QE&+xY7yd zZJsBdz;7wiDyYttZ23DLy3_A`uGe>*JhdMvnZqJ$|Lr3ALe8@#vOc{Gg!D;MJ8Fs1c}b?y6dh8TjRufpNjpOGpOp0DR&&sn^mY^ zvLkidA~Wq4%b<7#kE8+AL_{%>!K2r{jZr(F?ygORzc7i@Xt)lYr2@^cb8*8ipi?lPdp6 zh;LaM#RjCI9xL+~W~u79WT)i^vow~T-S8W;ghBaCTT|EIC$qHu%)Z<+_~<;Z%TKo2 zR0@$eT+mvw$`fFF6-G2&Vfi{;hbQV)c0!6t{d*D0nxt@^K-0x<)$aA2R9&T?OZa^_ z85(V`LEBpXR)Z4VP9DR4V9&*&t!G1VP zl4%g&mT)g=7C!flJ>rRXh(?RoN`Yu+DW9uriYrL@8ZH4*=9j!WQ$ruad-w0m{a}{H zE>s*NV`kHg8~vDqpA=`JgDnxeJ+q}EPq?u5e=ti$yj0(u7Jp@yc3nq3eW~WR=h*!p zjN8x1FTpB)FiTaHmlKTlKmA~q=mw3O72FxQ!4yb~OZ)P-`&u6|2~N_ZzkajmpJH0i z=B_zpqIstE{Tr-;0YT?`fA>kOL)43^CIsVRSJS?@WitZj4THYv7hv;yUaLghiwBv)|1S zpIQ_5l{tR$wIg3cg4=V-jCm@)*)gULuj^^!7iAM$W3;5k;OY($Lb{N>M4R4V0_;)7 zficpNG29#;Y=JSx@1s;?TKEPCiaE|wBFNj1BmEl#jCBw&C3Nz8)C7dN!fzG$nBY>p zicy-k`c z;;J^E5L3E3QF`&=6F$W!?W!q)Qz7iB(~di@O2R%S3^O`v?lhS<@@TH_@N(c~BejcA7HZAHnWYR>JbZES)Q10EXB z5BT2Wop2{SuPD|_6ss;j46QJo^X$fqztCVg?`~}=3 zhkh`YV2vLP6AdwIvESC=4aS~B6q;dj*1L9zxc0PC;M$&m!em*ACVt4__2?%;#2?TM z(`B!A-=kXLr+Sq|C1{4}L=kp$D%Xc*n4Xo2Xx9?!2{5jOfhD!Ln)Q0R@I^~)85K!k zL3>1!sG21;NNZHTwKhH{&keM~RClB2NxbkY^KYK&gfr2*Ng5R$1WgqKB@`swdt>Ee zmB%!beD~GR3X?1_fpjgL&hV%tLT((bFkuRwms!5AFmIpI-tQU|pPXRKPH>?+jp-WI68+ zVYFO`64BT#B7uHC*ajzh%2mreFZsbRp%tb_Rlh1s1~&*P%Y+Us+P@3xkUyZmm8(;} zrt`%@r&Cja@V3@_eZ*o`ZfW-TvW26T5Gpr!7+0 zd6(f7iU@FpY=;@pywcMAmfvSl1mzMOpZjFdO`Wxfsh@%3VKh5${19htiEQx-5&j|7 zIULLYYaYv~nS#^YK`TyAkD_u2DzYJdgbWNA1W6I*nosNd5-B$<7(YihnuFieXb=uK 
z3+Lq!aPCo=2|t4RS%wNc>~45rAlx!k10iZ4fDhqZtn_nQFe2figS&OeY=S0T{HPqp ztC28@qU^6lfGt^Gi^US<7OB?Vo>6vYNB!+`w8BB z)W1^(VwWw6PZG*fGQ^zrBk#DRJ@*oQjy%<_glLg%toCaS(rO(%-vfgp%a#nE zZx_8}3^?VAcPnozt&_B{(%pB&i||6+b;lW+@s3f0YB5+>l0c$lGG4wwu<@QmTUIia zEC_sCGiM~y+pM_3SdmTOxQeom1eZ{hRg?)4hwqZD;>Qp;HkS+8z-kf>@KwUguNFl7 zA8V~tm{)F4!f>023EYv22+Ah&7Z)#so(A6^-X#JJzDc(4w$7;x+gCCXAVi2={Y5L3sJ8pTqk z#L~{kGE&E(qkq{_;<)C2g#HQ`#fzlGOU%d1P$wuDB`BvPpd&}LsT1{$67QxYn#?EO zr%tjmN_w1rY>8gHh)cPrB3gkXH3nZ9`HziGoL>1kx}2A*4>=C zY?L-`lu@6Oxka7%&M5uDDC^QF^}r~z-6QL`Is1w_4K4j((4=FDXMLc~fq9|@pll4! zY}7pcSTpsFh4e+E96sY5vGW{S@eGOcJk1t#x^6CxaenjHl*cR}OHb%G1h%C($QsE= zfhtg7BL8Mln1W;+GA~Sn(Yq}Wd_&;5iPP&@6rx!mE1E*rg#xehLLcJ-n}vcr&%)w` z0)%mqLuyeZO>tssvBN@ft$1PMLWvbk(J)PEj(B;YXSo8hG`OYc%Hg~yexcxHOYyt& zf_3qN=G5}p^RiXr5};Sb9!=Q?@q$mOWf$kgSLan8;??EnMJy7aZ#iX9uflrI>eBO4 zD{=Z%xv~@S0-@GQ3EC>KNzF7(-9bwo<6=3>Vy#$fDbHf1fLAStNj;lH*;9!+1@k(q z#RA>MaWpNfOwU0c zm=NJFQJ*h%9?*84q;-PnkYLGDLutfCvAEU%Mz|= zdY24c$7Wgw%)6UysguhTIgLPGdi5~2^-#BU3z)t(YU{oCt;X!@JtC0w2KH~wvdp>(~C-Ypl4Ez1|} z^NTGglh)?N*78N_2XbxGh_-T*k-Wv`dast!i@HP;VM}9rl6&Kn8RN9eFH%COFN;kq5N$-NUWBN0ih)pXD>YF0~xX1R4TN zRnos9L@iy>uQV@%XPB1mTP*<@R!M!wUrA9;$x(baEmUI_DKol+Q-=C1uMBRx)yZ8U=A)!D?fookGyIEfPEe zEVi5D*-PV`u3$OCr@ef!lK|jk+0`tDU7_|BLTOO$+ugx?3!CP|0ZgF%McfFuM*{0uEU#9kjfvRMSgisrMQlKl_w_tfD>LrH5`FkB{URcgZIyx9ACL1ho zzK!(`e8{jk>w7f$b_aU@xYFz++x_uKKvALDQfcN#5ysh9QU!OB;HV5*K@`|v1=Q1V zNGe5504om8JkEfgOlAdo=8P>P`4LL0S!BJPX-1Of?_8e|`~cwUAH zl+9H?7)+&aWmCM7Fe18); zm(tzQ!sVv3^#EV@lBEKhv^Yt6!`kZzS_xl`JuLN*xW_+1AUC576T8KZNQuV7M~TUI2`fOK|OZp4c=j$x$NxEMF<( z=8!$x)9)2A(-o|qVo-S%e%JZiqkXzi_k5>~Sn(xZr!EqK&sRsL{zTHaFts`?Ws5)x zvk;3QnkRj$`r%BJT?EWgqPbeo%n-{k{)|~YVQ}+;BAEz}i znUQo9x|0MeVo}d#c^I$3&l0I-TU?MCW?NEHVM|n$TZ3;`R#UONTh{QF)2_VfaGS60_`Pi-2-LBI3w$I<24;%V=UW4kf86zCKaNeajbPw*{ zrb-SN)*<3IVd_3eNLAssd`{Q5&5 z6K@7n)eZ=}@ZLni1WO#pl^7%I;`D;x4&BMjR{ewhqU$`d2)U)tqB`oDuI=n*^p-Ow z!Mv{13VuH2NmefpPC~82>Fc#dL?;^-^@ZErW)0qzyDia<-`iO9?#EK*s71z?haR* zc{U`i9DCqi<+^)_m&*`JY}N=UU%YQv&-`%6cKY3`U~ryM)&zCsFaN`ED6e7f|t0@dkz>aNj0?teCz#c{sV<6HhU z>+}?-`ny;k{FB!_a|sR#S53#_b${yVd&+v}eS|Z91fg<;(krl~jUCMflZ!Mi-%_ZK z9raQ+r8q7vS&&K~Giwgm;*5qm$T83)4bdjQCD)gaPF50Jb$+JO4L&^z1`&WsZl1N_ zsX2w+ewHBdtfq_n3C1H)?!is@m9W$18{zA7SIna=DoHnk#@!zb#ZgCY(+GuCM?TqL zw%_Pe61U{O(PGD}c_gaDvv4c~&$k-uMJaE-9tWGJTM>fqzE)9lj-xx*At6jDH{2hh zt2xS3e}GbcXrZT`I)%q3HBX{L!xr<5`<1D<2gRM;fJBF9n;cfuYQ}L;YTP9TO32#Cvp1aVg)xcPWf|nol~dEu0NT94?TEf=QEo_XFL^4kYO*EX4`Ej z;^h`E1=mmC7$OnzI30EjGEYAsNEKVO}&;-cROij63gn#9=)D0ndFRbZaB$n ztFsY5g=@Wp6I#2r+nL_Uoo$J7d~xs0&fJ9GFnZUi#1IVv354BxZE35v`vA}4!FU8U z#Guw_jr+l4fbqDRL2bbax2=MN@x&>fWyZb!#}=)|wH3sVTpX!uqw^W@$d2J`M(i^69zkyee?M(B%%d7Aeka~2>ZeVJn;zpp< zP-L0fwQ_%`su*75E4S~ih1VF;SMgk$x(TtI0&b-%lh_dwC$ZxMd|?XrdEd}N>B|?! 
zWp6>zD%#CQu@#ZgN{R*c=u@?-dcpIEE`DwG&GD@o^*7ZI$e)iy2nX`V(`r1tcE1_5 zM9UUslm3uPz1ij7;f?{Z#)e?2k+IU@(M-qEMiHtR<;?I&c+dXv79q^RLz+JGqOGv# zxVggGtTu=Ephy_#IUez(E{~CLN9F2q=~zfb-K&gzYgIeX$@IgzLYa(PYWF>-@<((E z11?%l zlyF0B7U6+47sdQwMMHh$guRWO*W%Kl+i8){fn#_Y+x%%m^JwJZV}GxueKWV_Nx~2A zgA&twPa503E)PBSyxx7oe%Ahh&{1!z^&RMD6H4{s1DuO)94OP&%}8{td)<4LOxvZK zCF(fB&bwBwqUnvg&v9h9_Xcs+*&A)5<(PQyP42A5eqEwZr6p~ff|X*8_ZdE=TM2`N zKQxbiiFC;T`=kn7HIGNX-OVQUx!SqIEk1EK0|!q9-M`urFjb3WBT&i4TpXLgVs%6m zzH3|8xY;`7apTnCF$TJI9kXB#-NKH9YXi}}pdVvisZaR;Y-SrKhKbyF#o9k)nRzX~ zGyBr~q4*GhE0Lsxbn75xSa?C)wn;8>*6*@#^kj>amPGqYpUAq}qIuhf(9`{a`@SbC zgOa<~hCO-)r9QhAo{Iu3&SwiV_sc$9lHIGkP=53FQ}+F~Weg?+*FE@w8?kitoevk= z`rxyX$DfbBeEvF5EAz3_8hbtMzRx{{`{x@kC{DWHZSC1{T!hV%tgrh9jFK{a+#WqS z8;pLTlF|9y%HT!P9HS?t&T}x+6$T&%KmhRlcGvF9%NK;AY!`)a*7q+AG2y*mFv!iH zyM6)Ux_d&KYzemEFx+d9ZN#w<;;tUz5C^~*gHyl;rZe;_!}eoV@mH7k#;Z4O#JohjA^x9xy@Wyn#-06dv73nZ`5UhJuODFmnE-~$0UBpE0?7d;?lAR` zz_5qzF(f!nIM_8Lq}3@nFhtKz1Q3lIia8ccjT;(mh_Q~&FyO`X`Ur^b3rTYe31ts! zs1C`p4ilmb%PR^q)D0`#z6xV~8CJ?3u0I}DX&p{Y8D5L_@N~l)x5J5EhPSdusE&ts zT1Q+`M06KLDC$PM*^c;_6fwviDKj21Y8|;n5jj~DIqe>~+~k0kfhPJQ->O9Ix&^F; zL?MIRHv6KA+@p4IqwnxV9jHV%VMiSog`>Co7wrBYLZYww!uR^3fp|_|aAQh}qCv$m z&fC#AJ2A?!F<_3^m`5?hp&?MQSn6hLa+}y^A+a=gaaO&t^v!|jdH~zbeHPU?73(YU#;OHWO!n~FYUcy(l1jXV6lHhpduQ964 z3A(CEN;?S~IEi{_5NJUm@eZ|*abn{AVjI0%cT7{_39Ij#hbCDmiDD@vJ*39ON=kB4 zwY>Nw>%b8|+pXB{>+1&%nx{3$HkdMfjBvzlSl9p&|cGX zOyFwR^SwcKkox#AWej*?uifm|NW8`tsk_&D>fhvnX|&Ep>67AVP+5YH9z%#%sY zljO|j+s%{l%##z(SM|(ovC5a-rBK6v#T>?%F`mz}02Hsr)Wv_L&k4FSkb9Q~(}*)4 zO*Ms&rPGEL=)y!q`3z92Y{E{_PmM|E*$Z7367E(PXkta3Q!@($8V*g+dbj8a@EO>r z6a){@zUC`>0V;B9;t+)vMthPzGp6zMEHua|^5wi6xl8L;QhZ&vsB^X0<8^U}ZAlYf zNq})lN6r;#{Vme9wIYky$P`=BoEF+*wUQ@tu%=YC(gE5wPJ<55lI{|6T6xw0N0)fH&$;T7XX*TTd7oO@rZH{rdErKCS-Cjr zFb&;+G3nt#@tSS$t%6{H?Wj z(yqwbXzJNo%dYd)k(1PMjMwXV)vBr2w;<{tUex%c)?0Gv+1b@UwIeIuZ9pE^TkO>n zfE$$UV2{&kFU0G8N@3o%bTWf5zwidf@Fw}yVwci-Hj74Gzedy2M#_uEfN)s2SCcTf z2@$TDme!O-+nABo9C^{i65jagqRv*LWz~}|sg(6xEJ+iyHCw$^3)WOoD%iJL)(kE$ z+G{N5YRw35tpu~t7`MLhs<(r*>eL7h^|!U`u{K_?)`quDOW5{;+4O2!H7D8@FIomJ zuFu)EukTean6zz#ckGmQ>~(wqcOG+fep2r|vFrR2-ubn(^J1{`a}sHNXrO$5ppsy)ntQNTW3b+SurXq=xoog?Xs~^Mu#;d2#XZ!mG4$Gg z=uN~>f7#IB(9rPy&?v$1IQQ_R#_+WL@NC5JeA)2g(D3s9@H>K$Rql~>jgd|Jk*$c4 zowAX=p^=0Akq-o;$K0czG)7PCN54djek~il7#h9YAH5+($NYbH(KahSdvIc-!vZ8AK4?_l~q;fw{(jFslhLx-8ikux^sGj_u>4hJ(&31^*n zW?eLAT^(khN6vbb&w35d`W(!@Ae=++%=v511v<6P=2sbNw`?evskOSSg+ap z-%?F9Fa|n#Mx1NRj2zckcsQI?_h)L)3Q7>eks#sCCeoxyYy|gn{UqJ>YSvHe{h=K<$M35)AX(g5mxNl zQk=P3{GX|&p4!6Xx}eRv_>;Pj^oFqVhJnq-6NV$r>*h%ZSkY+ z)%)$~wH=e&9l1rF$(5Z8EvT_w)T`32^r^1;FWou2e{`DORPFUu*7p}J547zM)bpK99x7iM>O32+S{p$gjx=|S)*#30UypAMjW2DEcYhdf|2ok$IJG`8wYWdk{duar zce?Fprh9+3|LZLBpI zWOZs`{nNt6u>Jo1>HEct_j6yi-)`<4f7@AJ+g<*$ySll* zdbYpu|5Hu>d#Y)pnDkewiTziq$^MU2Q_SB{O)A|`G}UBDah<`b8!GV+RMR>!Tfz1^ z)Ce?h86ZYsv*9X1l(P}1@NH#-NMh_LJ5t%eX46?fZFPf0W~^$H#LOMl3(+^Td7tun zScN6vcJ9vmw5Cur)x=YWx0T`EQZNkBIi2sqT^ zXkR~+_6`$`or0Fj9N)HA=S!KB(5a}ny+0s(`Te55{CgQG9<9&5Jf!2i!&Lct?{~E^ z7@BGV6H2W3YhN$4cEM$u?xU-fJ?CDkmYse;!qXL5!ZpUvB`~qW->k-0BeU?JhFcnQ zIzE)EW~!R2&H7VgDAxh+XI0iQNyW-Hn5wW94orGj_ODcvdOA`bZgi%@7*FDv`B`tvGzH%txQU`LC#^_#adg)4!ma z)X-FuzfmNm?50%WBcqi-AD`K~$;Lr9 zOd#Bj=iTHjpr?g`>}(JGh8b~)DrKdQ3n@Z=r<&05L@f^;EKTMyg)5?(MI!X{`+KqmE6b_`A z>oq%Z&j1#RzrW2JBC(?|L=|71TF(+J4^_Fs|I`;i*7FP1q@JCo#UvrcahbqIV_KVC zDmaLtD(8_!r9jaG$G%^tK$Hp|EA{;t%5uaF_U!1FY!cem+A2!K{3tIm2N1*HK~YSz z3z^aG?ijQhb7W(W5x$ojz^q476qCeo+#lXE<9bLSg6mf0;n;+8vP!mCcCFNBc>mEw z{v6S+e{G@WL7r@@>_kOFL*sJ12if}K3ej`En7#w&w2P%}t;Uww;X}99i{&FX1DP;W 
zuf~K%#rO!v26|!r_F}H>@dg0pW#|>z*{RArzB=K)4og zt`N-v1F~qvG5yrO;-_TdZYf9D0{{Rk$a=|7R!dS#{U@vCPu1diOjD^0xFd|qX^ww5 z&7LouI7^jKY?M$LmoQP10KmjI^MQ)U)K@mP(JOHE)xu5+n7C^NAH+NyGl= zIapHPUQ&=l}Y^4)MDokupBb1xI8Ky+K!j zX4p?`MJC;KfJ_?^h^%AauqnsMvml#o*lUP-Z0C1;1*xq&CmhAOEkBd`ED)F z1HmXja@;MfNW_hqTd8aC$bsy;O^HnNpj)9i8`DHMHYY~~N@!sy3PmTXw;h(CP_k~i!>#;vouhuJM)$0=VPt^-( zU#4}v%yGRe*PzUKmT=~v?A-b}chS4HMt9FYyBqPX#Jly5&+Og6b!RWyd&a`|u2$aL zy>?HV7IcHQ{QCag@Y&qkEakBY;Zp|Xj0eK4vE`}PZw$-pC(@D|i>?wADu`6Jr}QfT zmDX}MD~e2R+*qQ`r!BnA3@)y$Tva+Qze}5w3aYFUtzxFKwO_7$ErOOIhBiE_GP67L z=ym}cOVtyT>I;JIr0bQs_SJ*y=UkSw`)?Ptu-_OmxxX=ChT1k8r7cVVgKjyIRWE(B z)0MN=3tu_;%qP@X6O0y3YN)OoFD;j^ncOe9b^pV9;TGwBQU--9E8WrhgJarSpja)~ zw3foBmMXE9rmB{1u$Ez?mXWRwE>?#$tz-77V@<3(Ay%h;p6dO%s;}1zw}_M@4ax#O z^+JjDB31QbgY^;{^-^>VGGYyKrVR={4N8d(Dpd_+$xHoly)1b!ZM`fR)RL|@!8RIK zHJT1Kn&&o-=tHc<9-cCNczWh-ws%=d>E=ycCkH97k< zd3}Fq|F|ijs*%ilVZn;-bFKgLw6kb zJn5M2p!IDB3^BvnNGms3QNBoOaTflzcHT{rm>NRZr3-P$%tX(`;=(HH+rjQaTKDPV z@Fn%qb*ohSYQADdwRMa8_MF6Z>AQ5Zy+X?SvMO#OE%Hd}VMr6To*x`tC)K(f4!cB? zSV6u|4aIv;X!q#QvsmRJZ4P_PhL9K4p8Bx#+7ES#R6lh~dKz%pd&Z^DbEuCE*n6In z#mg6YsjZi3qDR@6_1xyu_@Sp)^VoY5`-Jk^(_Zz3bGB!Ob;cwiqr+IEs`d6BKc!6? z;127 zPo7f`k1~81Wn>tGON=4S$C&-bSd+(2-2a)^!S!K`n_-+sVw}%>T)=N!D0y7u{hRKu0_a{wO}dHDaSUMcf5HS=`O=NY!<8KDbs$pxgv0`sK>)|7=4 zrT6wJ?;UI2J3oJa zX6wBhblF{U+0$bA+@<9UDa+n9%RbMS{kE1bLstSMR{||of-bFGOIZo6S-J6iC46fo z61s|$T*X_g5-zRArmV)-tR_BRP2O5fg|4MZu4P!PWnEgkm9myovzGgOEq`mR0J>iE zm+EEwzen}D!S#QuuJz1~rO%y1$K$4(=S5#nmwwNa`e$z&ozELM&l_+7gY|ZJ<}LVl zU~BjfU~4!)j%;jA2TJ}sur(5@PPl%>BkZn6c-2_AZbC%xhsb{fwlw|%wqmR=#XWS5 z*SZp4e>MKWRJ`3^x>kzwOp5clRFg1b;~iq`OCn~P=;obf9+uvcl%DrL1h%$w-4pX# z=kqST$`AaF+4>K_)=shCpTO3e(y*qxxv}>GZrt}77&lh@!2V0p zr|G_}nZbjZuA$lPnYp*0=7#>CuJwOf*E+gw+0lOVeRpBw?MTPjKXonP-*qkDzw27W zf6=v!zzDLg<-sDz8vsV2|5?}KLpc7WYaz(Gmfqp~5)?i9r>g#+KMNpTLpmwp z|DS8=3MWWuY;ml{!X;+dp+J6j|5W z#QwXk)tE<$)()bMBkNj(D^X-!>y+V7T}u!_*0r8P6@PU7(zR0m(6x5yupX0t>RMOc zZ9cY}jkm5Q>snT#L8za)*8HEk7OEa9p522q;mZ3>*UC#!CQkm+wUU0XgI>QbztoW?JmD`Eg0{@!Dl)K^iN%@!V7QoU+7vUf77*S#2Y7n=~{g>;vb}$ zXn)tWjCnAkAf7*UEwi7xmg7I^TJ`^^Yk_(HMb|<}{P<1RN>0?YnLCMM*n|A3Yw`X~ z*XsIB*J=i|4n@kWo?<|lRe$DY_>a2QXkt3ucy(LjkvNwNC7 zuJz~-T`QQZYrXzM*Gm0U*RuadU8^2d_z$|4*MF{S(ZT;4U5oD@buB2vf2C^`vKhs( zwiIjsi>{@GcN(8RvC;f*x>n}jbS(gD%Ylo0V2i2B3ya_fx633huPNouq(nW;_@!&P zcvNX3SIrpntAz zD^p#c{-taE(82SGz2Ox2L)QW#{?N5{|5ev|`8Qpw^grlYGJn&xbpI#17LZPttZSkF z(6vPWS=S2wOV_garE3L~{!?8Gj|}=t*E0KE*E;OFoS<55Z}`}uhsy$W>}W3^DvQ`Z8sv_dZ5lNS~`8#T;L1%05Uvvcq7y4Hu^b*-5{buADNfeVJC zx$;ZbVzg+sGWPvT*DAyOFX~!+WL?YXr>+(KhpuJz@4A+;c4!fsE=vzK#;scSH(d)s z%fBuDFS^zz9rQF=*OCFUP6;cxa;&e}J$4c)q#XUFYdr+8Rv?HXZo?a{V&~|U$hubO zPhE?Ld9GTZDtvhJ{P)sn=>HF0>)G$RR@{H5Yx(`uwOk-!yk7->)3r!Z0ggX)t?d6+ z*ZP#GTKku-g(B-(#riT~E(sBnf77+_48CMt%YW>$-Y;FN6Kd1h=j#)#Pwif$b5?^G z6#ecln)=^$EeMPJJVfYb_Ep>_J)&$I!&SOXCn8)4l_Bd|; zZ4lKpv2^@w;vwB~|3TM6|DkK;!IZ!AUU`iWHzH9Uje?B+gRaH*-|JdMg=g~$8T!+4 zE<5!9rLLt#cjbSsYo-53U8`A(ZVZ8TZu(u(DF(u^C-K^eaCUTZP6V&7-^q z4!1ym>RK7J1vBWD|tEta)qED@U%u@^x{H|+B z{9V`jN_Xi3Oz6MTwVwW^YZYP2g0KxTWL@jvA9StS+TaRlnK&$(bkXXoO(>|YF7|2l z>-sv4U#Y ziGR|yI!5IpPBmTLX?{YT4hAaI)ZH^}c307TtXOi^#@666KU96@Q3RS3OINl~+i-Wg z^fi{U^OvsWTiX^i-)6Z^TQthhxoy^I#jr9@>$wdHf|bP`qsvq-I@~Hs6V9ju1H)YK zfnij{?Wb~@+w^-{f*5=C;=zY*3Uv}z%6npzjj;QDi0eV_?dbPu5T3`>JF$AgXHN;A zrFAJig%rLtBihVN+c9U*Db|LgbF^Nkz4@(lN?Lu!iAuZ`oP_Bbg@KQsQP$2!7!hN| z05pxYC06Y1r$p5~4_IBDav-?&N1;0Jq`jdF=YZ^R7XsrE$`->us5SlI0E$WMg z*%nvZ1fPSXb)Tr_kPCxya+cXsbx{XFlcXudP3Ts%JVn~NEH5@&w(|DTwa0UmW%je~ z2lqj=&y7yz(ba(-jWZ(Z!3S3Kd~*oxbqch9f&MYB58T<$y>y=go-Z$*&n8GA0i%YP 
zz&hr2%$uLi?3B9Em$`>Qc@9g@we^Zz?3LIq#k}mDrR$2E7kF*e9T)~}+=gz94g~W+ zi&|Qeu$1-!&<@TaB49{jo8hUMRQpv}YB2h3Ap;?U))z%}u!cCQwKwb>>{J_S_Z?#L z9O|bZ0@M$2xECO-8Qs?h#%0Vb~OIz8|!CXX5Mv}6DKm@4D$z-mlqf~yO5|xQgLxNRP;JXcD8u4uxOK50_dXI zM4`|mXt+7QC{M`sa3=U`xREN{;}KlDH73P)?A+N>m4F%Xv>6_4EXA0pn6%@KVUBh% z3OQQ>XXgo{L{$|R1&W{=7pEbUMU>WxyhRxh8Go~)2`=lq$em;^o>t1DSB~OLy~Bw9 z@L36an9R&9O+EBUD}9i957n_DEX$9{ZWx}WdX>?dv2h!`LG=FtE02>d4_m*tcd@XsrgxFiI3&3j!^%$+O6xg!8;YDF{L{ z>_Zbig`^p(-MunTKK=f#ZJ{z$Qz1n4{F!WcL-iSz>t*-U^1hYdJ#{@Pv*y{e9_4_S zoHDgc3$H*sV1~pd+C)0^F&_`Y*+2rl+7m?=`@-mMf#n7tUx zQEb)CbZhN26>hYndJZvm2XP^{2@Iq;7`33XW+3QlIMzBNY{bG8#0LGdf~+1o>ux;U zI_WWQ>R}W^70@YWMZcAh;@1^cgt4D-H&86O$9ZzAz^`)Xl4^2u+7g+~Cc zbMBvYE#7uF4Vf|7S$QFwd(=DnWlsO1Ythby?x-ki&eUC)s`<}!E%TSgG9ogX~;=9+l9{pC8xh-NH+iKz7+ z8aAxHw^TBsL5sse*K5`;V<$vm0@^zZcP>wAKvuCD^KFjMXW2$TyCartfb$53R zQIVVhhP?I=$(m>W;#UR`H_nO;RL7i6Hxj}(elig^o^y{CNvysiF;+ZOH^|kYCT3P$ z%Ae84*Kk#?QZ@a0phd}xS7|g|<$OM3BJUak4b;9YRgP!^hchgW^>8NdAyOFQ-LX5; zCu&F8-j-f9P?E}m+gMynxbV$p#+&gaU&vE-)vLH94-8oxRyp~TESz1mlBi~6D=ePmqi%QvCN)i0~od#Yc| zUYjv#o*@)*g}jStch}{PI^glKOh_(!%H-r#q0lKlgo3GVlTr2$+}jUPQ51U(Kgo-adGX*LNA*@+o^kEn`53zE}E6` zoZXon=U#?id&74DgcQ`1Ous23%~O1BDZ~EF!?$?1^pM3{x5BB+TvEldmKwFasa$Bg zE;wmaKKdc%{o{x3UR;&Ljo3j?@wt)R#aodP&ZkS94Ja4xMMnNm$WQ&i_5MMm>0;B63g^0z{F$;`#A}sG5$kd1_?={SJAK5c#i(>4mG@O{c&}8n z$R>j3k*5yRljR6;-=YXDh0YqRIsFyPJm^VZJeOTJ_fvApDStP$ABvZ5e9wSu9R-b= z#ItBR^r;?nQW-^e(Y;>muBlw0@K_iXc3(s=h@2(`LW+Y0-bIH6V>RS7syJ%m7Bgkz zxGhL&dW~$OY}tTrRhx?`IG1s*ku*c59czt9e*PTU5YSG%jPS>hIE}b@f*Y*p#*ghH z%>$f}t2B`1GdYGzXDq7W2t`RGg2dN%61m0ks3MdOT`7uidAo?h!y!|{f=G}sUO1o5 znc%B9-LD`J7w&7zAPhM!qRpsRQej=Yru3y^u5$=^WfL2wa9dwq{fZAL8tXf4CA1dG zvAVF%D(pZ8E^j+dQ!k!6d#?nFo9749Ax5KM@=@;+wb6F-j8a~BBqKq@Qg*!e=2jf? zGW{q=!@-n|dj<<|oZDe|y?;OH3*?;oMmz{oRryAbyEsIbIc@>watPp#!!l^A?k951 zmnhT{4KO%UrL}ChqdrEf zF6J~|xvH+zR&CzppYm7{!rkWy=IHr`XE`Kg9PC6fWp~9Os{srV3ZmNCqfvKszEH}L z^ikRDsVrKjEBU-XSkT^d+C+F0XWrMOG|ipaHmPn^2sLCA^|%?`)6KoN+6RRsFz)I= z+86zv1$Eze8|aVh_)ZQ!sGE~L&{<+Bj-&b7R*NsB%$E52<+7pQ0@!V}zmy-OOR%=) zb+cuoe<3_2ls;N!YCFw%M(1f<%l9}#EihGqm;Xgz^?c0I0=w*5tl~Sm#Rlo-r?YAX z6o73kHR~FPnVJMM*^Q?)ItBomP#GL%zPwxzEsj&{IutB0oa8 z83h7Z05MzzU!E%4(V%BG*|_etq`fP|f*o9}kt(EqL7k`R1N_TO=AP!Nq)(ZRE>)KS z^8>P=-G-jL&I_6Q@GC#oZ@=}0ork>}PmY%>+n~k+8$tR7fNoS&SPG0WDn(%YbNs>P zdxU!Tz^UF^j#s0!0mSk#e$`$9$$*u>J<#;ZszQ9v4z=_q=ovxqtH71v4b$?~(I$WJ z{Sg@kO|15$z%1d!sy{ta07QSOSvnr^WR>m10cW@HHx(ziyRGnd#3fi`UD|zLjqmOt zp6M+rdjE@-K9l3exnB+WJrarG@I0w}s4^Y$bu`jj7YrbhfPig=0JA(GkVKMmc(Q_c zGOKr0`g5vE7*rU51u_D#9F%YnL>vX-+3r*s!eW8Kg?jM-K&_?~&_fCXpr$638tbhh zWp$k&`EY`%*KBqxXDirbh-tMOX5P z+=S={FU1c+D4Cd{tT9SI@V5Y3kBx2dz*ji~B9=WW{As|)8Rb)9U*+P1fmrrx4gl=< z7y`IxvFRD+K~0V@hsBVtW^T0UG0p?x!-`fPYm(UCm6ISQ(YuInEU+cQV6g1|727?J zov~DZ1i(Lj?w%IMNbM)3l)4*;d`@~BM%+%UAMXP4?Vi_7^y?2b*BGeV?cCmort8U=>QDXr4Ac)-(|nk5_Za zb`@67N=Fca<=`X*QtNQDVUe4HGKpWrBh1EYOyqX9n5((Rqp-$w3r}4p+004iv zG(iAXZm69~EsaMF&LxfIX9GYdC;^30Or7m~XhQ&74BMDNnGdKc>52RHlj7=uOuT}h zk;>9%4X%CcUH&v42LNJ0<+`4(X01uJJe+OX_`F)*$6_oa(fBN)Un!nSa2l7GLlZU+ zkrJkK!bDYI`oGW)xREGC$|@ zvT_BmRgTe{ZsJkKF76ueELl9b+Fmg@3{%Ehj#6s&n{)YN&j5ZZSIgf`sJ1C=$;#Ds z^QTh$8Lh{^lq;;A5ritF)zkBy6En}ZUUkFx+8(P>1&NP*HX9);SFZzq zB3C`RcQIt!l%Kyl45{MJblZVSu{q}g(JZpiEQu}tlL)2!9)(9gSR_PG5T)-oee0ZZ zW@tZeWE#rP%Upy=sZvNN%N}MuVXgB{wMo|`5e%T{SNIIZf~0}W{AfVjphm9hM?DHj z0^sg<2z7QVFmC+V(@Hi=EG^S#0zvaw?l7hn41loVl=52Z`O@B24}1h;0i#&I8!_=< z09d-tlB8OfSZj%yXl!WGW5ZTVw6Zx)ROD*=Sk;64!7A0*>kvRw$wbBDi9i@tBL_g* zL5nsEuogJ3Dr&7%#2Ymlb6GepbRKfGk~#o`w~m6a60B9IDQaoTkpd9&eo#Q+P}H8a zGXb*WNCn@G~xV#5k*S;M5<6mhm$!hZ20RJ 
zj8UhL#dpdu`veJ5YAM*Qsk%56GuQ|Q7V(N@Jpww7##|>sq_E`e0Vu$TLWVj@RVzBe z$}Y-*GK0Vyh9%tSB3zDp;jM4y^FDfD&@ME9CEmQ~G7$}!Ry-kd|7JbGv>d3h+vrRt zw3<6g^QZgxZgFXW5{w7~x}ZoR-bsSS4-@TXOlVlpbNN8!jfJ!@kdZDtiUbk9K7+@; zu)-9QZCO2GLdc#_WOl4=$c(*H>#y6(&0m!j(D=MjzV5opg?g=@Ir*L@ygW8aSnB;3y5;;gk|%y4 zTwdA;GQx#=D{Nj=XX^G`ARR7k3BTTn3&QW5!@k-?Xp#{whu+KBC_1ahw|6Pn$OuFdc=z>PQWlV8)OuqjCcAh5@5dgKp1K%`NBG`)br z3JaXK;IamE$iu0%PQyL0;1D-aWh|^6tFVs+_P4-K$v)T^U!!-YoCa8(YnlE`0I>lm z72(+>svkG!1zP4OMyX!<(TKmGIWaL$Sj33BW*f2rHss)q=!N6J>{mbP7UI<5khlee zNE3Ve0^7d%$t?t1?WvN-5LnDBQ?@!7Y1TbE+Sya(y;uV{!~+SZ;uZ#gM`xu|;9`;t ze$zPSZI&D&9?}pW;V&okBJtz>*WRP;6#f9VQObu^4W^hWYV_|8*JrLp+29%*?;HSfd zy+U@l08M0v3tsZe;bNPnyb0hx1SmORS@L4sT>(%n3sE9~%4>nTfkG@0E7Ihtk&8VW z1Gwtk2QF_#><^!iPdg@y14kY$lW3}H)3Z2>Wh$-qK(;6VTd&cJG-JhNRnk)Sv`1cl zqve}KtD9%I6IHnG#E9g#l2rInya%0mIBolXav+MQ$vTB5268R2S{^ASwzob%qx!*WBhwM`TZUhjZjPehm&4zDkIVt*oB6)fK6 zdZ!Y_upH-GV(e6)xy>L_3r7=0morw-zy~h~zx`aLNbW6@_C>X1&n`UR-fsxkHnKX# z(v~>M@eASVOff|`VhvAvr*F+FsjNlK-%b*Uqdkk?Dk6BySCA1dp-PFbS9n#5KAbrR zK5oDgW_7I7iQT%rR|M_abwADzfz#^;Q1T0+l*5kj2>bj*xTaTZ zD0~`U#KgEhk^DjcYsN)}@8cz*TSQCxJ$nO$7Xm75yg5E7G<6qF`XN zgtB@8kP`-+T8(G(zw$-{$1O}D0(;{Im`N?YBF6egYY%5z4$k9^msVCj&UMH%@J{UV zh}uZ5S_*c7!Zs+jfj-+KdXjBJF91z3U7LdN0>#BG4XX02Wo|nx`?hjviY0dmsC&Y_ zYi>Zv)ns2Ima6dM{oYFuRN7I>h>`)ER(XSbztG9_!H(2f)U5(}(#1KnXv+n7GS^U? z1mJc)&#|@460ry@91Mg7`1N}Q-Gs9(zw#V41^`rRZer<$0n&v#k(U;Vzf#nru_*@d z&tN}PDi4oRGDi9VZ{Jxr9moqk$PAV@iAgkdkJ6PCizJp=QH$!WvIkwbM*jO3(GX-F z#RXF1Pdtvd#3gc}W?ozm>?43W?Nlftv7Y3~PrfF?`XClOk3ZC$LjW({ae0U&TU@ou z6#nNR;aI>nxQI`K8VlH%`>a-GOIdrB^-U09y1sGws&b01)wTARo4$EwD(xJh)Y%@$ zG*#99Mbl9%(5n8)E*=nq{a8beH*TXE-M`Be;K2bb=D>E^rWG&miP3LJ#h#j zn`*C}pSqUPmCTTL(H%ry!*p@)PyEj^MeXZ-*T&zfcvw{K-Vmv`5Kcp5_Pd2Hy@?NM zw&>r9J;kp<{Gn4v6Hjf^U&@mscx1P!cPf`}gwS#~OMusGWB2_l9>jR+gNrW@z6lvsX7ot zkat5jRB5TFmqri6OI~~FAD7t*>+j?HY2=9Afi@3+)Ss5PAT6A%a9zfbxei zfgzY-m13@|Bpn{6T4!pec-_u2DzmsbPwKr|BF97bN+JjCX0R6RiiU-I`9u3gD=_}!^h;o z=DWe}eU0Ngn5OA7SCSq)l%+QqY5a6ezuoJu7A@HOvoUXZT*Ii$ts!lQ!33v9XaM`w z^S7sp5?vkAY;w0=qTTb}jAz=Qa1f0!VE7WF>tUhgsm9Uy_v_+yoyIJCC=$NZ&1vxPhXnQg=eKIq4#eWSOb&#@kt1#m zd=;r9AF7URC$hh8bVM2t-8$9;z)KiTorTaNwsir>@_KJI4_4L_^BBn;n&%?5QE95& z$F867)1Bh86SD1~Pb< zqP$IB>rN-P=*2iawc=tcWA ze*)~0a=LPGcDs#>wE&v*lx@+AUnCF9V%@g*j1<$S&TEhwcDShT6#$YPkl_zfzN6XB z!}72i&uFBzWV~F|dUKbLxlx$kxJH{=XlcQSNDV2O)|SteE)y{5;TL_#8;kS@5o^?j zz8$-Ym?rjIxmkbMO(Tsn;Wy&*zBV&qze_&H6|`aJPJ-|>j+F)v>2i)c7-}}+Qe8#w zvyadl%ipW$4ce{eQk;xU&%>l0t zw*zGLEstT{sz8MAf+P9H`ZzARTV;`=7>vAVkJUS@;Os<3;@i6G2-U;hXQc+nKr?Qmlu-_`xcS`Cyj=Ru^ zIWg!U#8J*qA?aO0(TLS^wIEWoAU^XyDhFZmQPZ8!7kDw69!4AM*t>C^bRB!0vWIB| zRWCftUU{DDjN1|+$Ti$;m-mLCwwj}51-t1g6gV8FJGJXB|F8e2vP5REoc>^W2v<2{A} zD7hdof8L{d2S#N*xo4b);;k!E#%V;Q6KpV1try82mLA-$?v7THS-cdl4G+cDUIpfo zT`Hu!a}vv>9(`e%c1D~3z{g@+A{raxD_6b15V#kKZeH_FeEIU$@u58bh8K`uC!qgg zk5-Y{lp$9e2cus#pNR>@In6E)501bn#&^v?d6|XRAK!tIXcTD7W(RM@mQWnj?{0*W zIU8B4O;yW1O>(xi26YxHWTd-NA2XhpUHsAtqu;LXpzD17E#&E} zD*1Cl>I=ne0*Mw7pRcSXw8 zQtQP1t)K=pTOLNKZBgQ5=)~hUyTOm1e7^rN{B!P`JhpB_SaJ_KV=37Xy)LyI;G%aJ z#WFY@odxTRR>d4l9L--%nUQ`I~ z#724mm2U%)e>bgkI%E>HeFBeo;vr@Xwu?^Y#w@4zYL zJm0bQ1uENUt)-qbV`2@$=vgv#&^xX2G5a2`uGhzhVIXs_x$ou0YBNW)O5cd!zo1HU zxILE2m3?!a=?;}dlU~`(#2>oWjn@YPw_VB$N_cwd)h|d?^ zen(cL4qkNp*vd*h`T#!OpDu{_(jpb{+VWZ?KQmsjNBT`D22^T15vwn9Kq&eo3 z%N)(_j63IT${Hxu^+IAf)FgvM3DFv>^XwKL)UFFOXWb2a-A(j8XxzID z%QP7RpM#gk+0Chiu^}PdYjONF2{bN+WY$4tfM-<#%=7-8kz~gQzJr`3#NQd?rnn^1&ztn`m?6PNlmx zXys(Zh-pHQr^+*a^K_#8^jTA96=3t$Ndh}Q*Szjagb>S@E*=dMk4I#YmzCbi$vlzB#ts*R4Bn?$o~+#Rc4f=*lO 
zb2^zh9SfFV?w+@ST)4jMMtoy@SC6+fn%_EGd3Q#3n_a@FQ>{l=wnR#VBG=(F^=q%}#4Abz`7@a$jqnx(5JEn~=WBu5*S9qy7mPb~z?2|K>CV&X zf+3es1^Y8Pw&zW978sLJp$5^R!3(VE#On#?NraVar?oHk!e#5V#Se7VLbQd&yU#4G zL@WyiE$DJNoZvs4W%MHRRv^Ko>@?exw@@X)TUw%FnX+}bh^h+X)-Dg@+^0G4b#Suljp&;bb=bvvYVzAsDT^eVqK4_1HQqdJLWJv?5P64$lB zma#L7wVxaiSw7n?`%E!{t>dd$!C{pW?IzoTF7xV)&KA35VdoP`@#k@R+BqrkODB77 z-%(WSE(PA}4Bkwm9uKhNxSld*0=2mi@v`5>dZ25`Q?eKm(}}jq-=5T#6s^|W(dLS- z?%Udoe^LJI8P3tDaD;d4A%%oYjqXC4o~U$`R*Ne?KhMLE0A>J*tptI?3uc@x< z5Dnt<(p4Yrg7sRc+g=pb(it~ecY6UA&F|(Q-Cwehd~a9t-XvK*4v6$WzL`}PWh^5I zOaKb%y_Q|d9F>Vuw$J@|NkQ#{RxyOkPxTpClYq!2eGY)_BCX2Zacjc&Sau%kkGfdgy zs)nLHklc-pX#xQBArNNugQ3S1#1)Py6)?uH;3w;&gM))90<`GnP<(* z?`T%nt;|dSj)~#y!#r@44R75u#p8^V`siqoP*S?0S~)BCmC1J(q2DEWZ!D*?F-Upi zubw*@SZos}FfQ9T5j{ZBr&up^Z+w5*UpVrLNqv-9W39ypr{yW9>1Sh56tL=CC)LyL zLSnr{pEYE~U-IoIIU78q{xBv*Pqa~%jKJ8&0hvG7%LkMW)All?fFKybaX_r3F&LxuBX9LC=SirVjsGFcM3gI zd1L=ZLvA=n7R11(@zgQQHW6Z;IxZz{cQp$n6{mei_;`y_DB0q3<9WKETyL-81m^2w z4x6JoG5|}Fp~je-tcKoRo3@ks0mRcUoqR4V(PRf}`ew%0fH|~Ib9N|l5M32WaG@dH zfO`{aUSLi_*A=C2fl>r{n4&YiAiGuGO6Zrj?!lYUAd8n2k7<)xmzFIOdsL;hg{#4b zFraI(+h&ZuZe6s-t4658_%j@TxX^IuJ*C6R+mY+tH#~L1=Jc*`U&9$b3Y7b1gW9`# z0ft;1SLMU8mg=}ZI*|~1%7_Upt-kl;HU(Vo))@4murg4$LC^8qQ%|=bJE<>eRu7#r z(~T#CJ?E?v9z8UEc!FWq#Rt$MQ*bGIQnze6?uZD@XOKvK5-mv@385)#jy@IQTHa#! zVXqwbz6cu<1D=TeI1BB$6)Dh?@V2Kg3@inpV5M=i(Yu*ookKX8ZLi&?N$qHa`C_C- z@#87UHtAcH#8dZsjNI}$ecx4Xz-{QO@sTll^unhoZ!_vDWTMU@9s6*gd6+8W7gS^C~{(>b4psWJoilxvoJtAw&+d{ zTO2avvS|qK%~~AiS9MOVeUF_a&mGLI>ErLGCeGH}i*iwWLcR@7`B*wl<@Yl6N49=W zS8jA42jU|Wp7Q|AZM$!KmXOo1_yTTZ zt(&$Qj4)%WY&bN!`b2XztnMR;;{eYIQa9aheSl~6{!;giZ#@2T1OxQhSHp0zah+4_ zj+XuY`J^9%CteNT8aQ1aNv{Q}yQaL{(Gw4b)Nt)>6Fz+_$<*}LJcQ$xEa#w+wN6?W zR~Y#urPx;TVJnPC?j)gY_nx>_o*p(f-O$c!9yuLWL1r%+Lnlf#!al!w6536F!OG0H z@%8p2ojH>n*WG-gV21fAy^2y)mqXt%#_GOvOYoNiW?zRo&w-l?H(31alLcpLcV1e~ zX*<00GIH@r@1MAgIAZpUvj0dzzD>yYuafo|CF@*SJs0a2Rb7OZ+Rm_T9J5vA3 z8d}epl*mzFHuHec6yYK57;~HD>kHMavb?x&p#)<+kL^&EI%o!4v&6yh;F*cb=8D-S zD_;mzPVSs^GWi{h@r6725Uv>lWLsTbI7uIq$Abc|N*UVE1--llZVEvEbSF z8%N*wzkMsPOK>x59W1!3@s7Z=^0mwB#9NyIuafri&#{bYhddR6nbHRYo=AV5u7_4o z^hVyLo~~%5pQ014J1K5-S=ngXYbE;F%Eq`Xy9)6=weMv3A-w;GUjN2-6G9KiAX7mh#)0Ory>Tp(cL1clt@Sj zsFXC)C0$aI(iot^o}u5a-}iUl_j4T2aXf$R@2hs5*XMJd?^h~qP;sNJS5C&0-0UrX zjy*)7R8)B*PasNtC%lF>xL!RJp`m6K#1XugTSjkd+{z&{c!ic*vRg5v-KW+8AAD<) z(*3EubFN#^6ZuSh&MQ4!c7WF+XJ!D4ibDgR(6bNMl9)WvT&M5lo{Opyo=aZCrtSrM zKupC|b&X(_g~tcaM5xEY&m9k==(j*l;mHM@0Y;-&?wfhm?Pzluh*t5OHQ?H=siu(b z(6km-fKO{XJs*B}7k-BBydv!Qknx=m@uqKMz=YS!>TzSY*IzN3mn;g2$oOj0%$F0G z7RR(7;r+%LpZFkN*vdB&)Gh3+-Vo!z5OxBSjwtYOKBuNvsY3?ou>~=p;SIecVbRDo)*fM z7$r*LCX~tM+?G$JupQ65R?>1+j!PgK!=)f|mL+AqWOT2Ag2iM(f_|mu3vZa_#M_k3 z+5=6xh2$@1Dhgsg&dM&Nuu{S15>fh-IWdHXsXHB=U-(>`MLwfAt{e<7ZHGM1Z{6+I zRbl&R^KoC(rnFqZ^JI$ABNsOHIbH9q!5hfq4&V7a7op-K?6${$AWU#V zU;N6q5VL;l;cY%5{`Ww_J}80>6vMi|)xz=UZh~JO`S{L|IpMmu50d18=T`qo7E{|< z?#p*wvHmgUiy1b|(e=$vI75iuj2(N1amwM|XNu1a`aW|Q_0X%l5g0TV-<(#9WRB{y z`v<<%<4wWJvX;nC-*qiZ-C^SFg9V7exgWxnulmep5DU9|Du7WH`|Lv@A{g6z;kY|L zsZ3ucd*U7t1v;+VM5y{C9%&bYksH+3;79sN z`#d9&K>NZucl%HHG(d^vFzve5jJqNqTvH26$Y$AHYU!z4?y!cT1(Jpi$W~>T~IjP0{_Q>D4XzE#@!SMPfZDxQ=h7ajgAIeIcI=D z7dO3oB7}>W=jV3=$^ER^hD|4=V?POo)OZ6C@#4nnd;^j9A|TvWt=y(`?;#i4I)_Yr zLvHtF3SOW+$={6WIT!yfI{Vy`{8-Xcd#z^fi_v88xlH(~I9~3OsxRZ&h9C!IaT<`` z_Ck5TP-KOxe@k?uJWTbpgsmEf#D|j#sxhRud^GV)XZY2wfslSy*`6AL!Iu{CA@Vcp zQi-{X#77@u;?)qR@J&O7EDt{1MdB|gcnr0NdvcwME*DF0Z+w?uG=JXbmv$*!(nRA8z7n%(!7@rG%jdZ4}tc%vr#6w zGSHAWu|j zComLE(VlX=g#pVoHx&uOV=spxp((YSsfieWnbPRcLhV_UBSsxOzdO)J)U~j}`7(>l zBrJ*n3`BL9kHIN;A;eA@J&WF)&Wgp{t1O`$!e^+c8Ay;mxM2MK%xe{WtlmUjYv)vC 
zW3onj>>hKMCBtrFyXjS_N6g)JM!U&llUGmFG55G7?53`mUQ?K6?!7g< z`*45qnhJrr56|!+o!pFg6R`A$7=6fOp0Yiu!ty32;X}57ncZn~mVu<<4>@vEcKY{N z1~V8w=AJUMH+;l0lxOrY|KgPWg*ukuX9*vl+L<|AnkKM}R1SYEx;5osPGA{rVE9xL zV&-Vgff{Q!`c#@U<#N2sY4 zqt8`iQ_eT*P;a*rKG&?6x!jpXP45qXuHBz@^Eqb_+9Soo61|doB0YE$ZIn{9C4gWA1o(^dr-W59vXt6t@HTYrZXb8V8&@ z7wMjI0>x|>+(=}-B4pGXV4g>;7cl4nBa!tQRi#@q2uo3!8r^ceMK_Lvr+v;~B(h%Z zMAqvOBNvMy0W^|v`IEU2XtAI~jqVFBM%kDXMj*Og!}pZk6Yn&x-P$}po)sNv`(X-o zXspncB~Em`tdCr;!KI>`t@Bk3VxRrFVVhBPw6&W|8F$6N+wM&+q(s;2q^_#Um!c!r zt3FsUn&^7@vx~fg9;}@vp2yuLmExcv-ODO>+;iDQsy;(j@lmvM6X1^-MAqvY$(?($ zgzf&~5XJDzVKT-IOs87Eb&^Dd^Yeq~yv_HGOb_>GJ*I`cgH=B@NCZ6iJhvc3)j1>a z$b0l#>e|An{v!eS_|BkB|9zDzc1%v3bqGxSD&h${Z zif?U*!FKdjKK1g=^uky(5DB6%onm3K62OSQ5KRf+AT<>$7BLw`>AI=K5(42IN5l5GR({XS}7468jSi|^c0sWbK0Eo$9JYNZT@jRqc>XS8l}OW~Xh zidZHcE{AuMl16uqYPWL8 zK7+BbkJhT6W7(2k5y)K9fU0wy*PC6+M6--rP<(LRw&e zeGfA&Ha3787J5tvc>Sap+8e@K|El zU0|VQuffw4z2Or?ax-}#1oRPY_#LY^(r2V#R+7?rJt!_Q*p*(YFBtZjO^R=vJ8LuS znHki24dP-_68oIslQgI{+pMt-TM0^?~)iOyN?l6bMWpDKIB=8}Jm67BXvTL#Z_2_Qiq zFZx;hQ2CsUhBWAvs&ualg#<_rB_(;nRdZW%*9b`EIf(MnLp~7w`Uy+_xnDfX0`?`H z;hesP+I7GLfV+{JpMqw$79f0p)xzSnPL|WN4Jset6(VL?vn=TC8_m;&|}&g0EQW=e8sC=h-4lqqKX;7P+y;`p;>$iK+TkikVjF57a*+g$S-6rw{YZFiQmK6Uz7Ng*Oyb>7hIy0GOS&x|un^#;kA^XH#$F3j!F z9c0kxyHm)E!NfBeqNNtU0ne!oq0U7UxWu$)qbS?(tgpp{Z}u4J-WSg6Stbw{^q%2) z)rPiu;#qW6l`eifd!j_`NwJ2_ahN-h{j&21nd=k+aVl=qhtbaKbp6Yq^9J0J29_Jr zv^T!;6f>^j1k9<<`f&-Wz4;2GMP^~83M~xFiVe@4<0cvAAbVk|Ag`Q2T1MHGwJm0H$K>r}qUc*5mHM2i{|LJjE>dl4vH`dr;j=o5&-v&GJ%(i}n z>~q>b{$Sc$dhIdYicUH8o~6y~@yi^}ms8F%tqqaTb21uXMZVG>lZ+K4?9Lt=Du#b) zu%aI3ph=MYn6SP#?1I;0$U^f7B=8^~2p8gLqG@Fn5bnGWtnzvm+R`~_Jm_lNIpkd_ zH1Rvq4_v?2yY5=9ys+SKk-7HOWl8??!lzjmiOk(K0No7LO6|F0`r?vV7^<{kSnUko zev~`+2^em7=M6y4`VBZq4{}^=(Lz*zUy48Mh3wE-r;!)4nwIocBX^W6_uDwJ);&x; zu5|Vdcgzfh&~IR&o3i)ZKVKcOJ;0JG-3LuSSf3klTQ_j?Yh1QE??b}4@dD$mZ;2av z$>+1-slSEV&p=ETBbwuUZuG$DG9cS8siBhS!Wk|X88wGSJg2cDVPOOnPUVig381(1 zxfX41J!Cgf3fIJv!@ZUUPcV{=BFo+AbM%nq+w6|_X-q}+rAlv~GGO`^Lo+C5^!&tp z8wO#|$%p}}Qs8MafviSA&WOQpU32WAQDLu)RidBsH5&`Hd2!SyGHx&S&3{!*Vk8jnMiOQDZS^o%1MO@BvbG-FdP|%_Eb$JO{nMgmh%Kl+st}&Lk1@c_oCc& zBunnlsf-IQwiFGi?G_{_%Bjyq(|nqxQ2o3F+N5YhU&sP5f8kuVw}hSZfC-kW^fQxB zpx=L}Bza~DV!n9LH@o=;z+AIQ!H;DRm%nggiVCd38_}cG-xE zsC;GLSX6bTQ|(*fD*gy{4XOhaIBc<=8RyvHWP|}XqVZ4fxq@Ou!PX5NuyKwWeWs5u z-~bf6o(buli^Z2N1?-J~Es0z0aHD&NB_-)aK_y_AJEdv{Ow1@AyA%%9 zh@*7J9`LzQ*qSU|F>q@V<)d$aX>JvGoN;roT-WP|ed}3HR;c6LK^3O2=AgJcs;PS> zzG`MLzQ9$#N@n@gUByKE66htC=tlBy&~A*^1KY6&^3;#_fppPv zeiEvrBhh7JJuojS>m?Q^c&ztrvrS`m^ z?Ac`P-8Jp~oZ9Z4}sr#0(mOzTHY^>eVj;Wm47EbWcZ)EhCj z0V%VA6KMkqQv)h&gKB1jC({N`PYvp`4H=pZT}T_cG&Q6##b(9EZIw2BfFHJ*8YW6v z4&Ea+2g6gA!?zAb?y!x9OmQS;-}c6Uome?ji#7Q zZJSN)A52XsjeR|shS$DbQF_~IHgoG>Xo>{P<%9fYNyfzvzFN=0&@hL^(0{X>e^AfS zVKJY=$tF`v`OT7CMw#uL<@~YQId1m(<8S8_4(F9F&k4Mpv-X*{dpn1{{7&TIJ6-mL z^A8t9-Y$43&)nJwCZE*;_T^&bxw41L)rSjphih`m>jbC6ck}GvZyC$|KJzzx)*TL)xRu$GvCGeu z=RVfH|H{7Bd3obL`&MD?*7Vzz`L~<p6rhhUm+h~Cs6y%F-4F+Z!ct$DMX{K6XVW^TsmA*hApg3SaFl`%rCjN|_IRO7wc&DQi`}UT zcMd7#iMm$y=@jJ%`QwYtt0vVxaD^9ZW91GLwGj$0H>PUtd_9CKR&32ShoMv=73DUY z^HUV=Q7Kh@_@QeZ+$y`+QGJI%rChVO{jM`bCF(NU?%LW!^*w5p+HdPB99`qYIuD3Rt(%3}?z^;iZf6(a(yy$5=}1Stl(R>&WuLGc+didLk$JGa6HP z9WQMqi2^brX>KMPUab+-ue7Wdt)ukllC;jzOck15GxIA`&yAFmQXI9fn_Sy}#wjS< zQ7F~sc}9~tf?xPaoekci=Z)?OUz_N9fHhST5SH4Vp!6j5`b8e|Z&y2Hi_6?qVX&0HD z^bYMOC&zLY)c%HxKv|l6b$6tN9xa}K^ZiPt(#^HfUcEaW-NKS;_o_Cj9=xypI*d*) zx_?HMT!Tr$tG7^ld>dzs26E^Tvabw=X|jek`|^Kda~S zZTSw3l5*PegZwNzfTZJ@hI>u>hE;Hs0@ok|puhlOVJnzfa1+m}2&&jCAR`{CF}b{o%2_Z4OB9F^Ebosblmol>@qC#|f!4h#@j#sRz;<+l@ 
z(plU4R3&9EsxqZ%B@dSK$vn#IrgSSy9=X0qXbj14yY0cM>Xp;ooR{Ft*Oc0)9wT4e zu6Gl*@=%bK51O0Ug@avB598#7inr^C_LQY{7?_A`msMQjy3R_Wo1i8}*H6$b&x8*> z4j@S0KE>evA(KO7R#T{7k7S!Sjn8aWOR0(@`l0mW9%BK`OD5_Ud2*P+>v{UTN!22J z(i!a-%jZ`B1G`;~mm}zW4I+#O-B^$3Endu*Oa3y*rFkQtnm6kFv+Ig){d`23 zNwfAOHXpsm>ywa7RiZkSYrVee)wMdPlYhU^S(n;$p`%5!XXc5&$Wy5Kc3lQh@@m%- zfUbREYi5;Z-s8!)=6uiA-r-vs>g0G&e1fgp-81*0E~V{#?(q)34^+I0(YMZ?9`Aez zc@b)}E@gI!ef;X1vM}+dF85AXPwHNO)@5od_rXeNi&@(y|!^}`?D=SV#6s| zsI`fSZ2LILh|$59z-Gkjjlj4G%PVFTEwgNp(66Da4Av)@73fQ%?u8c}eq?M;ir3KY zmY|@Y&k;Mwa0`q`W#SRqm(kG+kAqX%1&ICVN#d5 zr*q=V8Vi|}i`lkh^NA^xbDki&w!0f;v4aMSa&e!MTh;PI2Q$K(ZLxg@ydR9I(`*q0 z;pBd<8|4{(yAQPVliwVae^9s+XQ$yY{`mgr9B#BAMT6^0b=K9VPdG(ZjjoLhYMqyI zJeRio*|#F*&Zn~Cds7*cc}Bx_x^{`T%i5Y<@o~*pIkjEF)^3SZcl-K$BJew=T=BCQ z@4lSzJgOVzhZs)6U3IU6kB}xaoH+FZtO6oUW*3EOT z-9jNBn`l884cJh^yjO!kpI;zgKCbjarosEpG7a!Z*HN9+S-Ade3-`m6hzqHs?G_@8 zZ!&`P2l%qyJMOhg9kRsSOnt_8va;h^(~c$Wt!ctWWpA$8IlRqIaS=cc}{Kg~Jzaow-(&K%?PW^1qa&gq(Ou|({vVx&n}di_%^q4rDf=Bv$3 z@q<-!^Zv1GAKu$U9ey|&_$jIQ>b(T-pq-cIjT7uAzdfKn|E03Z+IgC*mz zMS{1*(!Gy2@9jM~Pr~oA>*=!!!LPRZ9lghIgMS``hgDo60$XPOXE{IO10mkr z%dT9buGCC;A?O`i+wZy-zn_$r-|@s#L}N>C*@M{3rnKy*0`n8~`*5xWpf5trciv>}R$YU}tM=X&dOY z9OwcKVhaki*9uZ<33Lk%x|JJrC)e1+IOtxY&ds(UKds<^06a z|KQxHnoJ@kPQCNtYFWFwxJ->(CpmMg0@i4Cj_P^(6HC> zp@mvu&uzm@n?s)EhLuW%y<85fs|c;E2y23#cm)k_4Gxdy4{vS@7q<>?heq`026c{x z4^@cv@<)v2MqJT~7+#LZ@{O3_kDOojpW2I9S{9hqiqr{)AQ3=^dX0pcNMj3iMg-Bf zf(u^(A}xULWh07%BZ15(R+ppHf=P+vl$!M#4rl;i3#2Ix0Eh#;N>mdNF(d}jUbd0M zuPkoWXl8PMNK!Nfa}4!GbtF09AXk+O7tKT->rxTJCJ@KI6C*a}$CVeYSx&}F9wRbF zDNqjQUx}5m3zU$HJGml?#>6`Z$MLL0D+$DG$x`9a@ux4w140P#7X<{5+ufHOjhAna zJ0=iAq`w5N0HFdgX6+HidH0=GcueyW#P;rMl*j1{B#4kxcH?60m>*oSjdvzb90We_ z-p6~itKP7S750yX+kq8M0YpwEn(RHmpGxYLNQ?=&7g(tpJQl}*i1~^Bs(+AZ7eHKm z$O~b4V3(ZpBuTJ2$!Y~ihmJvxkz~se`=R`E^O9f5u{_O73ICidK$j9ql)rEgRXK_Z z=G29@gj({nEnsRlxo49cRZFERas`MDi6+WlU8mAw_EP&#Jsgurn-#b|9FiuOo%Tci z`T%{nEZ|>r@!@AX=A~1Q;KyR=wm^jaM`te*9v}hfKzXY5N;rA4>TcyD728L9SZ`OQ)XtFEOGj#FhG#tD9Cw7s0fag-LD3n{+cSlnHGP9|sTn}nV2$bPqyZD9X6pZ~ET zEbaX5$7cB<7xN##YJYrLJ0}X7W0{<6ZIa`l9egb`Cq6I7QSiyFg!L39GKr{@$irU zm!BQ$v{T|MHG$c(hLws5Ioy6xN|V9_q9{AzV9^6UB~|d4Cj~D{QBK4{(e;LksGNhQ z+t`BgC7VkKhjQD{a@LpShA1O}1wy%gSPt3!>-*ZzPu#(2ggtjzrPBJUVq!pEihE&K z@WP?<1rgP9p?K*g^wQ(>%Nq_aZ-%|RRq*mo=S%PRFYi%Q;DsvuPFMI`vcHMB%87bm zyGmt`P(LK8j1j7gJ6(C-q4GgkWl}+9N@r!-`^rZYRT)B6S*NQWJ5)Uht4dHHu`ekL zQ^?!cf<;z?%T8B6cc^|DR$ZRLMA1V0)&d^Islv?;8*`h9h4XLbGQn$FIe=MFV( z6t6laY6eg>y%ew7gkB96yee~eHS)FkUfHYJOV#tV@^2@qMdvu?P_^$Uz>A&L%LSmp z0y6s&sKxuL&tI8Bww+&OxGwH%)VUFIw0Ja_QdmHW^}ZLmg~)Ou?l@IvxE>0!QBpc{ zE@|yKG-Ssp&^|>`uhrEf>j?#%;F@IeNz{P?8*@!vSwq7JXM?~Ri;yXsh)%;jkA~x{ zERr=1^lNqYPf(fuY-En8i=mCYPn(dGwk$d}8eNSi*BZqg*izu!rEDmY}?#!ecs=-w%yT4$=0sZZB5zQk=ikSx9hb|7g5pb zr9}0wb)|N-R;02Gvv!0|b~e=X?DjWKuC?t>eQ;!(n`C)M$?-+8Q=OuBJ-m0Q z>orM4_W`isV?#aEtQT&^yiM6N%GvXcmE%ySe=W5e>C`-Owa?{S4-9V0vEIiW(ZyNV zPt)DbXx6{jb^5X}2SVh{*0a825o{5iE}T{aJcR=>&-%#WTx3HWqGlZTp1zTEdLzNc zp_Il`D8!}nYC!)LlUy1PMuafP3L0G2;n0p4uy-9WSRV>p801k^JwL^HPKoR6)S!9c zu%*JV7R|5&e8f#;#6x%FhSSK+h>=@`BX_z-yw^wW!AJ2Tqkg)h0ZyYq5u+i6qhZ~n z5$mH-@G;_BC{B0mzSG!)h_R%?v6SwywDqw^@bL_h@hsi($4=u|USjg-`X1 zObzHx4LMB_(?-V%rzX0mrq-vX;csU}-p=d3U2uB46!CVY@a_BVx9jU~H{sLUBGbFN z(;uCtKSxaO7fyfeo<3M7OcUTUK+zczy%|#H8OZKs4ep1EsEYP_~sTh^)^y$n^u3D-esFHYMZ%u8`Znbwzzx= z>#}n!YDb`WN2qs4WOGN1dRIbhS4w~PxXbQ|s9m|0|)?NVu=MH1^0o1 zh`&$>1OkOpQczG)BaqZoGze;XS{h~sdg8PziiwdO#mvTz=45A8<$&;VaR~5oiwN)u z3kph#2+Bx^${&|hk(W_WQdLuyw^recQ>Q+yfk`}xIIFGkK$~9c^cf?)Q(|YMOwQ^W z7+$<=WZ-n+q_lC&Qxl#R6D0nUs*S0Uf?0g4IrD!czjW+ViRzc9qfM8S@EMm!MD=UT 
zmEY0r)?GKdAMn?o)vvxksb6b8(gV1Y5qPh2zQ+grbT0VI#s%En2>6Zq6?FCw>ept7 z`VaVPB;wL$#Obj}lV9Mk->P35v4+3EUv3XB1U`5<{=lv#5to}3HId|aBiS?{xnV2W z{C-LfQT_VieN8^}_yhU1lj-;i`8E9;@@x7D{vL1aYV+TcUpv1czvg=Cf2d!-kY5Y^ zO-Jh2Kt<=^{F}jv->P3k@@svxgQ$M}hWzUM-;iH(t5cs>XTPmZEv(J%t-V{>T-e`S zKG<4X+g|yyv%d9V z)=t%Wme~@yFe@WjWm`vL36G?PX7T zA7T7pY})0y{aIRZf@U4%c>_-jvb58*pXU!17?;^kbv!Q^DYj?|Wjp=i=~$WVK)zY$ zi^7SQuJavfr(YILReSBOPIbO4p030Fe~bJQqyv!g&)~o?*`LTS=iicFeZMEa)Q-rn zOa?yU=G|YAU)G3YIm^~GCclzj3`Fuv13ddX@=I^_3N5y;#D>=V?yTmY$ghZJSBcy= z`BIu!RgUfv`2~XgAivU&$glD5s)x4MDnPDLD5g- z7sV0z#T|n3`9Xd;2JyW<)c%$Hiuyr*4Y#qDVZI1YKEn*#(ug!hkgC)> zBEM`)g_S`=Sw!+{^jGo=9Lm$Ls9xL((mf)-csXa74z&ne=`yMzKzOaz##i;{;5-yawWdb>wSP9(ouiCUC%!5shS5Ausd>UAdIpfyKrzFKCQVtUVhso>U3 z<}c)zI0+_C{CDJ6qUEpTm&I?$FKUr!1J;1#*Ycoizmi`;z7n>XhFug^w+ZEFG(RHS ziJlHo#p15y3yr!>*o7$D&O$owyIjY6Fn;^sR?!U1b$TsmI@7ll#UMc>zmheJ@|U7i zA{;}{gZm&FBjWcI&(IR2hG`C>%Y1c+UxKgGd7gP^k81VT#PZCBIUB zPkxa{{FeOEBa&b0`0Cm*f)=2`FDR4J_SWD#>^(3IX?e;Y$ge2P^cl=Mb*vbKrecbG zH@dle@g6Lw>@Uc#q*-aVKa*d+fEVFElVAD7mRRV;_ZHY+$ggjznzLtKU;Cn_Hzbwq z?|4LhMUjGZ9lV+)ekH$xKnoH`%zq%itXaY+`V`=>bWvUEl5#p>+!m4i+W$xLi^boE z@brlMN|k8SsC>%q{Db_OA(CIAr11spJ01DMi=Nlml5Lw$d=Wr5_4v_x{z`u7ekZ?F z>HbW9`SyWMd?&x|bCCT?eqG%9L4H{UivBhEg&~q(;dnKM4?;@&IphL=L4HXDu+e3Z zd;Cg%Isb975F}#i@l3&(8$S+VR z&z8>b$geM|3X??g>o|~c_@BwI$KT1XJ4Etp|M%qAj$hdm-SsR?HResxix)4a;%+g` zPs~ugl!$IycTf2`xvclHYNUI^v-<1QmZyQCfX+kYazM1Cc| zpfJ&{kuoCrWpYG*B@@Xnrlult<_FPBvS>-v^$;3-0nRaU`ypxp<+)4J#k)hHP;o?l zL8Zcr6urNbUjg6AuhE~$FDbgbkR1o%6C zkY5tt$uC19`Q`T)dj~cB@AoR*FMQ z=VtQn$uA4aE56wTY>wFI-;iHXoE^KaO}~&|BflZPtba#-@e?!c=6@x>%>M=Xh5ecQ zO77$8hv27F)#HLd%e3G>lV7K1$cW_EsXvfkQvaF!O8XXzxRzs;maP@~h_=of zwRh)~OrQ7)7+$C(%wrEQ6dpg5U)F$&L)HpP%hVF|LUlAk%c^oy?jcHaLTx)7hl?Tl z6@l}aq_GC4vyPG9_7k||6(*P=`#bUr3I2)vdH_x*l3%iaOMVR){7il|Q|2x~d;qXM zw7;pL^j#~XNa#&>tLyv*A#LuFD|?tOcf+*3kb&>y7e4@gh>J@9E%_x)y1WcIK?g=w z+EjSO@L~C^U4_1g-vI-|N!vW;2QDHsPM3ctzhY;w1HUD|vX00vAA*5qLfG3~4{ z7kzw4xR_3MI~|&w4y#J192X{;!@}s%0|2LjIr&Fi=w)sNmhC0jx(0chZ?p`iajegWY;_L-eP^wL(g0n6jF@{iA%JU050or<9L z*k@*Gy+?xttH~oz+cTQoripLO9+}O)P?h62o}x)-wg2w{O2xQ!cP)O*Y6{`Yrjz0{@s?tj|&a zZDHQMP|z$X`bY9>i?n;}JNe~6NhH6l4GP^QNb;hFY|Ue9Jn6NZk+FPoXnFR;hy7LPsuOnyngErPD@YY6{a@~h#W$S;2MC%?Y@CHW=u z-;-Ym>R-sOi0Gfluh_pLznsPYcjT8NeBdw1uMG0PBfnyQCco@vekH$j|C#(6_%r#X z2-p1s`DOptxc=j500{}uVw@t5Qm0{kDyFMG%fTS{y3zbC&w{z`sX&-@GW z3kcIFxnuYT@=N@GL4KY17vvWb{P*M+UzSJYm&woMSC{L3&!5RJ>z~OlVT*qxzh=qI|0Ve~@MrSt z%3qOR{b1|glV9fPe?xvjj>s?bezYPCp zag8b6`_v9D9=U$wD<8c^~ z{K`TT9It(l7J z|3rRG{vf}uQQiyvTk^~8FUYU3U&$|-U@_lc$uA)y`PDg!ePoyOo%~ArPJT)M1No)< z_vDx65Av(nPWrzlzZMh!g8UNxh5T|+jr&f1NlFX+iTu+2J^7_VB) zrlP0Y|CanRU{3yP@+ot-5dVNHGy`y*wQzepL^*_k3 zV2+(;kXO0|n+bowUyxshMDi>1XY%XpAIY!gCYb(TkY7!IB)^PjqW%T>MNj%0@++gw z?>qSg2L6uxO8lRbUu{2;Umkv_W=4Moa6{ae{YJXPIWoEu^mp3j%#7znm~X$S?Gt$gkY`@8p-# zujH3oqRyYlFRX1mGtCe3EBX)Q*F(gg$S>>>`6bRe^Aq`HqRA)838S1&8c9OO$f7v| z%`laHOUpNkf(kR$CO}l>$|sM1L6e<%V9RKm+a#$I50$8&ZRFAZkZE21-h-W1 z|4hu9PCd47NnEDXYzhJkwx~vn*XJn}o;mF@&OwL^F07{=Sh-gw-mfPcBO?RAk+T8- z(}>n46aq(y@f*4$+Fzi14kAS`?mGj(kf4cd7o{o~YK()b1N=SGfj|-iOb87SVChG! 
zN%&ub$Z$A-PO`ecXp9H|caIKmY(Z67foU3TP#j);ZE#>U4nWCIG;{g$sE^ z0!0Hbq^8PH`n=Qy(I7bl+8`5FKf_9IfF%Wp0~QpfF>m% zAW`ZL^Fwqa2^R*;g+5?ClS{Z2g6;#-?5p;B1YwRhTLNM6!OOfLEQvcD9KyI$0KJvKo&djYIy*8LmJ6Nw6!YbF7ceuNNFY9 zCgNK4uRXk!ye~bkB*9-(&#}5o>t*Z9R>738M{1}V;{ za0~5+4sXiZCj+ODvH<5t@~YBkQXud+KL~t~SdJBq<@H zS^f~?DjfsRO%(Bx0uE6+%qtzm9~u!PH6e;Ye2Sgrrkx%*|6#u)G^;Ts#{*FtR?{xe z(taX|4jM{iL6Wm6Qlz`>8|2G{P7;Wg6=~b^H4+?177qa{ z0E|yQzAIm%PPz~d$fgZ|dQ}1Ux@`i?w)_J#LV>jVfkrq(x@V+<5i#OK zAgd3{%sD_SK&Bi3CK@a!yw4H?bAR@+f{%QxQ&3`*$thdkJmY@$`(W)fkQX9I4v@dv z5F`){KBeXBX%UFB_4VF{m@XIISN9i#f>rt+2-x}y;LhW%-jukWRU3+Taa91c{p4fm zok(Or32lf z{Uw;*ki^1O`T`|zK^ABdI^#4W`kWN3KN?5sgyCiA^XD1mbm<>5GpoO*n_)EbhH1Yj zfeS=}0A_rB>74FI5K|hZRy`-5XXLsKi9ZKDfdT8oM%?K0(^W56azQ_H@mk3x8yE-N z7J=%b1t*#XSd1a(#<@DVj7w(Dhdq9DxH=~P|Ii@z~9&l5C8;V_I!Pq#&&ji4J^>M zw!Y^80k3Etp=^ck*5T$mQ*j0JDfn@_Te4@ZOXs|-&xVE8xSh8_5%_?iBI#I?s9 zvP8veOP=Dp;)Qyu_LJhZrA<`4BBW!Wxdupk97*C7QSk!4#vKRZf$2wzSK+0eeAXGV z83077vj4l{wR}A0zMp)_w`tz7w*!4aS`p-IeB>rLQqyQ~8*eb143yb0m}_JD#mq$% zAB`3h?%nB=U{Pe?ItQibIXWzkTr}qLX->+AW*@H@N#cP!bX7Wt8G?VeKav)Bl`T-a z;(6TrSCR@iCK;p~z!xMAxaMRfgQs&E1u`&=CMNoU!=R?F{HOSf{D~V*hI2I*bAE+^ zild8=IR`T4KNPDFN#Hy`mp9N5P0icOEfsdpjpGa+M=gAWRFs2jqi2zFv zUvj2rf?fd9dVstJQZ^C500zn$1IV8SUc8K>(&`XQ#6i4fj&!d(I2JN(>J3>~+X7Vl zrL{6HR|uKjjG+xV&DK{OhXOEU$;;;~nFa!;6d(jHC;Z*x5{<9f4{{W)^1uKbXjIYqlR^lIZDlqNL4_-L5dim^1@6_LcxJx6nX~zBd}S9c z(+ZST$5nvZX5z9|$!1D|gm6in7MB1uq+XWAua{_mI?20oL%J~!W%NbKYM=-s38Hz* z4^(lpuT&=mbkC4NiT~dYi##!;eb!;M%eFUYBYFlkd4td87H$wff_`g!opCCXGu&Mk)p6BS>V~y^$TI+2z1Np%v8tbV zVz7z!^wH`Ml`X&k4Jg1CY%ve%*}`4jpxt%`POULDEH@i##_l=#G93Y5jv^ovR}vLO zJ`=)yOcRgh4>0){WFBndTMyy6u=51&kJchpki8U!_VXuv96`e(3Ks`^0DjiJpwq)#9%>Q3 zHKLV-K%5m5;3+GBQrv+?Zhnc+h9<*K@RJrT(hm|XOUfVk@U0C23dJdJ$xVdepjyfz zRM(!-s(v#1;1O(}K3Q5E`9Y+pQJY#Br?+O$BUe+l2hFu4)}Hd{d7aTC;45s+4}b%# zCcIXKC_-KLZ}{K1$_-3M0iw?#(`sv6NNg>K0_ki&(w%5+XqvIiC1XmI^HD)wfjiom zdqOV;9J9R=*0=460UiNg;$+rD;A@M-k?{)uiAxh}$(5fW*GB@MSVdgFaMQCc#BAlh zxde3LZRo8EAY#bh5|&_zZOY`*C!-+yMoWqWK-%fFoZ~`|uNU*hRMb@W?)rm> z<>6;is_DI?5#^N68+Wf-BLNE1?^h@ziNF`c&i}>{@Rj?GB>k<&mc$5$E{zK0k#&cf zA{eK)Z3R%&ub)x3kaWY5z@N+jaQnq4Xb8B%XOLi@Loym++mduP1MtoIogn>5MY{K{ z8Hw{^oY=ekWWMfFh?s0Vi(FOzUN;jg3UZ-T5wghAnJ7uY^`>0!Q6n3emo#Z*_gG(2 zUCy!t4U#7_4m2MG26N9oIa_Eygoe>Ay>MS&{U|hAV$~We;_yj$yxi2A)V2@X zj0gEqVfbb0LHR%w1Hjl7P>(0O#0h6Y5G+Y9AiTL{04r1a zeCiVrsm$7O*PY#2lxV#`wb70XSlefl*nP9Gul4z>>?+0^n}2~ZG-n;{3GH6<{Bp;R@dV2CTPDljbskswPCzK!!vZV3P& zOdH&i(l0e+V6H)OU8%?*Oqd1|l0MHK67=O`WWf}QRrN+MM*ko7?lLT@eE}Oj!wfaR zz)(XC!_d+tT|+3{DhN`df*@&t0y1<-i*yMHiiDujGPHDwh;$A;DAMtc?l_*~KIiNw z-nicD`M$ocwXSvF|8@U%Qh6;}$IN*wwG370Em0PCyzYkmSY?*0D;keFpy`Bmt>Rf$(~f<1HXX zypt_SHs-Mq7?iD^MQ`D=70y3aE!Hl+fO-@uaebIWiThCEoaRN_Q`seH7cA%Xb|H@Z zN6N=OGkrZ`t9256Vl&}l_|e+q$5L-9IdaX$SoQ2A_58o}2p6&!GeP@(xTA%Z@4t=| z(mc#PYyM%BhT$W(@DR9(b4>j1r8kC*<^s)$M0x_wF$?6@;6l+iQkh%<5i*wa31o1M z#0z>8=&Uyw>oCe=nR$Araft={HsHAhg9RB48m;My+0JXkTN=~{Uh3qO>ety4S<)TQ zR2(?=CN2IKpZ#i+l+QQFZ`NmhcqDM$aU^-Om1XmTcoa~j&a5$p6yuc7rmXlaVb0Ij zFsC@+Sa+&%&JOFbk|4gTOqOTAzx-k|VPm-V)Guo+*)EBZcKZ~dd=hgB_+Hrly|NvT zdQR05RMcU_0%00}tByva>`C)X_SpC^WX5Gz*9F*#CH5Y-JK#S$=CVoDMr(l=MR{Bt z&@2vb%CA%B26;3YOS58|9=&%9k*dtTs9x9%|56sJbd*D)<)s$TugcBZr6v(Vsw7H1 z%W#b*6x?+Q%-L&CQf0`&uq;VL*H0K`6=XToz;{k5*gfjHI=PS`d!8Ui-rei3$TAku zvQkzH%t2nNY@J-;D$Y@GkwES9{>Ex@G_UwQRRFxbg&m5ru5r;vS8!}+v`UTuM5%mv z%JHI_XbId*n|pZ8k?8m($&Dve;aa;X8al>>yAHJi@x^*5cL;7hPD%gHzJ zI-_elrjiBXAT25jJl}>7g+XYhll}mBC5qfyVS(p$Xp^%H)aBze; zN7yj$#cIN)Vr`EXmf!4{jYKu&#wc!x6@~Hy#Ga}aNpi*zi=AUH(Ejjz%?A9=j8vE% zOyzFM74d+Jl*1Vero*and{1|Uwdj3fk#rDWK5&6M*OMFqXi+kufMO~*SZb!Q%>k{_ 
z)^Ez(jU|7R{(^6)D$v2__3IK}A?KO5fqufb@A6*>S8_v>I%x(|Ye|MFE#?>w>k}bT zM^R9#qzHEZS-nq{KrjghnKE??rHLld-7O4Zx?>=1L`J79n*i-C=Z}bZH#K=f0`OYH zAfil@kkWMY6S~TUT^HV|qON0U#O6T~9`%y5i_}I8?A6FdvC3R-X817bEYG`9Ayk18 zt^0u*ODYb?H-e0|#!68eOp6K~;mCbYk2t8A*hs@P1gYKKnyAvVaen0DRQcP1#HDNP zl!ntL?mqe~K@&7-R>YBm7pf4Sk5%uzv*1n0ZsDYG50l|OG`Vjc3Hnsy|fxIlOT zR7Y9!4I*FZ&hwl&VtLIb$UPznR0A}KefTlEBb-$5Zle=_U*Jcvy-c`+TOiN$Q=;(& z5U(6hK0Bz1Q+6y)HJ*Kq!?5a<%t4hO+#pJHS*0eaO4#14A{S47)lO{q{Yif9G)Oo6 zAirv-PEMp7fxBF+HV+4Uc#%oZZ2ueil|LC!h9|#_E=hmv>An5psXx)p zhw|e^N;_3UimMJW@}DQHcIyp%A=<*$a_!lV#K!NirSkFiY?t?BLo_mw``2Yi0`%Ld z?BUg?#g)zUrFd03fT7K;Tv;j0U0rvM+1t=Zv$EXdq2xheq4mJN`mz(W`cYIdPJ;J3-QKP`slH!d(v%OaBH}lkYSpTQjka(_gM9q&~c{Q(L4o`#8921at1dkK-mg$AXKV6 z1#-}#Xu8o-)2g(Ek6EUh0v&-!iYt08lK?veA73URp$+YJBdiN`yZDP*cK6L7_s!Kz zKN&q{5d^W^%_j_g+>KZUD&YH%o#LE=S)xWBHYBH;Z4eYdi_%ckRNX_NV8T!3N;sK&o1I^dnBj)#s865ry;kc@ z?3b*&D!ei)T9csIYp5hXrPOv+`4y+313*rZQ;80xT?JTQS0Qg9sP;Ce^0TP^MDv97 z^LsrF3EcowY_psLP(d&mMfkC|6{N6jsZ}x$>}h7e%w4Kv*XazG7q^Cm69o*+>Aum_ z+i27a(a_(pH{E8HISJBMYE_8E4DoVz!TAmpby~^Rp?h zaT`jFS$Z!}pt;T57B2M0(rf0Ee+%qw4YD|aO4?YWsuudrB#5_muU@!jb&}4%6X?dI zZVhR%Daf}S3pdXMUVbygHrWrp-)hHTeAVyT!{eEY>WjAe)JT6_hVWCh^LeRf%{T%F z2qnD=Sn`;aG>4GAq>=j=!*WYbu8S8q7WIhGT=Q8x-2h>q=%9Q6M|Uz;H$Wo2PhmWf z1s5QdM<_IkVy{%YuC$cy&wbr4UX0%eae{nmPUC)IiTcWoOZLy$RhK~3s(iL_c%)32 zF;FH9cus*)BK^jFmt+PIiZ**mVcz(BBcXx1EiFiuu18C`3eC{i#5mf*I0_J?14yzH zwH>Rz@0h;+AyZFu|1Q604(8ayF7U{Qau$?0wP!vfp=ltkQidAq5gMF$twUQZ{O}tqd zt*Y#kz%rVt)N#i1TVT-ZCv8McT>61rZiK=P%W-35!cl0MPiUTgRa#9#!Mp$|Njf22 z6t!l`;{&x^;hVuB>G9py(?Xsb!_4iB^qxcGg@=pyjh^>DE!3xGVo#(&&&xTO zx5CLNM*+MWG$n7ErOieZyb>aN0%eMuvl;k>-I|`)gh`FkZ))K&F@RwAsC0yy;&k(! z`Y3o_QNH$t5qjWNS7$Yb|y<=(^+B^&9rJ47i!nYOpnP#S#k~LJSj4Y zHpah2yo&0Q1aG?YvdCVO&nd1+6(pQc!dCLfH$oR)KmS|;8gEVhiZ-B*LSNQH$va@0^#KyYYRs`nL0vWe@kW?KQ+!GDwSlOKh#cHaHB!|D zy4XB1$3J235tP!fdElrrUa z`z)t>i}cOdA)CPc!6t}*08gxIRBNE|hAtHSiB(~ChoKCBz;)gvyZJ%@bB`GmdWffS*oWjoPPKtxyoz z+R^I}FXhn8YWV5#6X~w+@3e%E?Gtv$iMpjxLG!EvlEV5@!E9oSDuR=O22MW~C z@Rhg_ixKG~10Y}irJi^WA0a$HFNy8cOGH>iUBb%m*JlPf?H zHS`|R->6v9A@f#K*g_+^%zvd=U9As*TkmGE4rUow@z!fqr2R0}ZAW0g zW)w;o+647&3mOe+@$!B)i6pG3xjU!d#mcuy4NP8&w1-*I2gV3n0Wbb85w_y zhqp9l+;3r3`H*q*RGm7piB7V~fartag`yRZHx!c_9rO8d4h71POxO$>$08fg(z2Iw zN&dEXK;ZrpY4zlU>VVQPnSTGUoW>=|-MgF5P4t53_fZDwL3$HmQIRcll`7P_0K|Zd zrG=Vy;Q%CW3zdBK@}U~_D2+m;f0EFd%T5AWhx-`dfGI4&CKTx~rwq8t_exZP>|>u~}4cl)GnV5gH_EWWuLS$SAQNHf2~&<%O2l zi2I6-xWS~vcM1x+#8~_0{>AfNwpt8f6)|c>@hLb}^{}B}#~_0bg~|>|s;leUSY^p{rJbmXXL zCGd9W>VDo=Xk=i3exquHr>X%#jS0J4Fnw*p)ENo&Y9U9~eKXl4gYYC}BAt3u9W;Qc z5>2mwV%ffx45?umW*@U2cr-Wser@=G@_p53!}6^A{H(2=hd_~HFIA^+#Z*Cq@+Xkk zC%2a;Ln2S*=RamE@1YS2WMS_v`PepyIVdd&99_jsJ~pQlrb91TDDBu6s@WW!b3#=v z1IdeUVs3g;T1eS1bfH(A9BOZM-|013zwQTraCiMPT7N%Z z^rmY3?9mo@HszO9p6t04Al$HHxa{V`A+?uSHau^&bv<0(zmw9;gWCpJDG3{iS3-N=IRUQhiGs++#Q}o)eCzye z&#(B<@3T-{!|cI8R-<#*CSMEP)ozXUeo1OfG(^W)th7=5CW+Z#uCdz1m-P&von31- z8?tcwBUxf#dHxV@`@#0{$eLS)S$ANUVH3%kLpue2>9wRfzAY$Jch^Dv`&XxFm4(~6 zx4+Ga$b}Hv`n<$d8#~j)sZZ?QFu7>nDqqMHY$%E75gb;ip*>hu~o$S(&KZ=9*k z9cJCY_||)kcACIxkZRJmSr&VrrDFDr3Jy9jEwEK{S9^ z9YlCM5bdbmFS;s|b9rn40v_DByc!$yfkJ{(J_?`)8lHi8SHYbWxHT^)6H~_LJ{pxo zoki$0!Hj1}EH9LV8V%=2m9&SCtV5t-Zq3{HL```V!+0ez2+oC!_DuX*)PNAyb0BL1qbKFUP z2pm6SpdLg8CEAKTYMUk&@_{m3wILQ-!$n$00qCl5YW&#K^NiBTpv=dTSmTtNHG0=$ ziN&(fS`GsNj1IGB_C%o(=cIBA{OCQ(9D-uyp2wbJw}YPq67g|~#dw1_=nov)MHbym zrFiCx9ghPlTlmZ4bFZ=anOSWGffF2CJPYCph%6k-lU4Sv!h-w8&2l4xkKQV?}()o2;I67N!zyEwU0~ zqK^0(VN;)DWeO~=$csL8v>FaMt&|Bib!et~B)YIyh$Xo`uUCKO;#P$DxT$Y8Bk!^Q zmaY{m?cnp9Z2kS`h~Hr}nvMb_iww0`4*`@~-PAf?RrrKm#_fJ5zlQ7;5||AB5Bc>g 
z`NcY~_pOajE{@zeh;`v}0f{2rV7RCzU{UlPP=M~$m&B>@PaTGVbXARD=Ql=7J7M_6~EZp-j$Cdc3w-Ts>g*Y=x@YaC8(z!%)ZZ9!*-_vXaNpV=ZpXLR(cbu2TRJ36bXK!@a}r1L`KoZR(H@Y+QZ!JyP=B{x zS?S|ze{WtkTd#)%4LCmo&4mX@(qx1c&ose#B?gZBgF_(r^yqojc{GLNxZ1WdG8F6R zdcUJP6Y&{+EYC>zU@YP4H|0cVz5KzV*+o=LzcE_{RP|(bAH~VffCt?0H+m{?-5gg$Znx1i!(kz;-P0EQR??O40$sg{`#wXcmZr{{-BKZ<_sHM0IlKxhyMnnT@Gy!3i$=Vq?9vHr%E9- zL(6A`#2Hc@m(tFap3-rg%(5dlW{xWj=*`7o_r+*|1Bgi^OwN|BriAPm2|m9;dkPW- z)7u?0Y{6$;plVyRv;m0ta)u57ogeQ@3kWt9yTlgQ$R|Lzg;1cW9?+n~MPv=I$m^eUwW8?5%{;LK(S1!)62F&5C7<@8|p~Hg{Bxv&3I4j-1S45JYaC(y1Qtl>+9RceI#ouhv@D@IoS z1%zhhH{dalIGeA+qfxO^hx(Fdv$Nq(HIF=LYD92tLN-v$d((6fS_%3DgnBGjIfh~^ znl!M4dc%LYP=&1#BjxTXszWRC!JmE}xelS?|Ky686=rBe%ARp&E}{{*)Gumoa`t!z z^kJIKJa&CUg;|tF?ijtyhGv!fA}B2`l%GvIX?tu>!}+0+h2{&*%;2oUt7Qez<#{BG z4NbM7LM@0o!~zceguhaJ<(SVo^mk$WYy!27X4UmzIi<(Zt;AC;P^sVWROc?^IY)b3 zC5nt665~z)I!cP1jiI0QKVJ{PGQ=%cx*{&dA*f8ih5l4GZNFVKRVTwy(x@H<46;jz zP&5r9Lbhom64kG4p*DvQHD4L~jlgC(4AIexokL7`xh$<0;Wm73Xc$s!451BJ4$4tR z^vmp4APy21QsdUwL}jVyP-iv-u5N(KkV+G;aNW~bFzQG=26#1Koe@VXX1vivU{a6a zD%v75xQ!GTG8Q{)BK=L{z4$qLj*UVUQ(4gqm+lvD8l!JWa$y#HcG1wpsM__*{TWCq%_7G)jyz+o zcYUI?M$A%Q!7>*P70zl%8`~VCZ744ycTj9^wz%~kb5Kbk2RD{GkT8(J)noKkq8QdZ zwEZRk>rAsaZYVoPed(GBGwutLWydxKy`jrcZm2nZ(IFSr zi$NnMOZQ_@jH#QKKhSO22n#Ap*&idU$y^!x+?Su{WeSd@* zzCEX(#eO&wJY%J72=zeIj|}3g6VB5Y;s}3_Ac~qsTB6Q5m^0| zosPJB*B7uLQZ{f1$a&90;LVCvFU0{$?EH+tE(Ta_L&xvxN&bWiB8^CnCf|eJxHEF< z)$3{Nf*_DaDr`cOkQ?z3-Fq}d+$P{foMS0Fh1;uB{!1SD>@s2ol<_7!24%~4-M-oj zFK$=ji$sSCUvn~ov+0Z57{JmX#J<=FCN*nT2gr&X;dN!7~UU{4ICq!P#C4P!5B~3@t|`{x)N7MAOrJ(d;Yb z-N5YC>_wJxn#1g(uI5JVBBLHaU5BcoTnQgJ2G}_UAZDw0$IRG-0@!T2BRz9rDhUvD zS@>C(%PC)J?U26m0PLO$^T-$Q8uiB(Q%b97@+8q%A(z;#3+w05)-2{9J!T+MWz=ou z@gf`XVkt{O7;jr)qVp+_B`A#yU`bc^ZP@|>|A$W!CcQ$sCpv#Vk4m< zGOs^nyjyXa%4Rr+CNIg}_ajWdW|t0A6_w9{P5OheRWKn}WMMXFiYbSD#Yrdjx}%Fv z(KMtD%CzTC&W7IZ2W$+hT!en3a>lcO? zLMKDY5Gjv*48=5riA^q-HXu_&IwvsImm&3A*c( z?F-=897IqFL!r&tqDGEmiHrmsWu;4&A~g%Xi1VwXsKw0`a5f6yvA0rVSIn=Wd zrcP$kGos7tq6|lv?Bkik5O>!p_OdBJ{-ypWyJN2no|V26k$=(4dLG4g(GXXcL!HlF zdBy@1T3)X9>dihX?}1eQBjH_wP1>Clf?3lM2^2-e|G_s~4Jw|L?Lp0q+53E1e zN-sH;CuElTFC^_s&t<2=o)8EffVD1^Rr zdNaDW`kOolW(!;#T|?Vcw@*<2&Xw*yn!M()R$$ztN*243y(&pfv{}kfCJZ_i@PgiP zaWB0uJ-0#07Rx%)_tiKRp@fu`mqA{PrIfE-FUejKf3wWMx+Qt(a4+Wz11wD_y#sYV zyE!?&x*Qp=@cuyzGQ{#dYKsF~g~+b!`2Pa=W$imWff}Jz9dVjK-1HrBg@3#=G2-nz z;ywX)sQ-wb!24dK#~7w`%;;%&#%N6arxf3hzE?(X9e=u~`00u7#}^r6PvK*s6QAE6 zf4*@%_U!m$wc_U<__!y0qDyf+(0BYb{PU3Fnp3OG7G~XuB*(d*$^MuLWPw;rWk}wCfU&ph0G+= zAbPH^5s~aaH-w^>$($9knw9XI6PcVtoqQL-M64EN8x}>B=FgwZ>fl8$zXgkvS!=7M zvzbe-RLF%1l|c43RKplj zdzGC>DsU;(&b5cM!!tYA8p(%ksqQ{x$h6(Ho+7_{%H({yW39Y%oQ z8v#jFBXk%*9;I?vt^(IY;ax8}o72PmDQLqK>FpV&?$d*X#-l9E;k(m=iid}_Q%B8H zNBLO?cmeEP*5MT52%wq~hdwu7=&9ZxR&s+3UX+l4V=4++DWh)5b>To6}|ZV*TdUR2wCl3xenc6W*`{xkXIBRxCZ{WJNsG5)D|e*9DXYtlV4+hl3z7H$S-7|#~Mb25g{W9h}* zo`jW`f@FO|Z{d&!*<&#J6 zgqRF@&`PpIVBpJ>uAWRJ%d3E=lS&E2up8+!q|F|L#nF+AlE;;tX2fI^qKmO)?3ho^ z`(HbCR_#67TuX{QKQlF0dX9Va=}Cugj6b?J>9Xl-`M5-pwkb4A=tqYnpW2AO21ge2j2(KTHArc{P|DwT`>6L| z{moQwxdNEQ5ryI#O0nm2l6VDg%euCMlicZa6cz1s zSSR@*rAKSQWUu+3>Px*8k+a=yc-~7%%Oi*cQZ-E>CfBMq`yQU2$iE?-3RV#YMZr!W zg)FN*JS<8Hk7Og-(!P^lFXOVE_{><;yljzLYS|<)UnvxhT!Xj-krU|{1k*H%Va_3k z+*$?#^m7X#9S+`CwIPh|cMEO6#P8{nwy=Z6OhM!rYBSPit~Z7u3eFr;ueKsCF?NQV z*6D3VOSa(8Cq%OH&vGY{wlRiGYp32nHIYipiIOh*jr^J-ZI|gBV7;L`$GRETuFzQ? 
zeShsK;%i5{GT0;bmO3YL6Pv15A{OgPHz=`khpa^u1Gg;B;}&6p2fQZ;x7D4cZi8xx zIEQKy8(U4Q%61|j8^;$$4e}eLWShzvvma@-^F*P@0rrQrVEuP>(S&W&^Z$kNzoY>CBy|xJQFk!UL3A>%fmBuvYT3_qFT=pQkKpr z_e`c;EXZ-TJKd4&hWN0S9$mF(8ZPl!?9Ou2)ewTn9>GEs)pJI=5dVaJ(46925kV6-eBGfP{wyU0gOv`kz!9Q}JF+TwCX@?#moV=lS@U*>;e}a-ybQ9IRDakJ|I&8< zhZ38pd5VyT(Uw#kJy7BKRg2`Pq!5+TN!f1I1dKtRcG9bf@?t^J)zOYGp%vp>f(|kK zms?_`WD9zn)wW_)fsdcPE}SU#c33qV)i-%FkGaLGQA^qizE|~jZbAb=-V>U{Df0Fp z8qNP)b)Gmezv~*mYxURDamR)WHH`Vfo~3E&YY?~E!3!JrGRR@;9&c;aV$nha5v*J1 z-quaN7QUaOV>PcdTz`1^rQ3YkR0wlY6-ZdLXB?gUP5U1C7m$d1CUd|g^`w{J>o{|` z0xPLbRVmmnCwM)QmC~T9na#z`L%rLU#?{l!{hO<+p|0(QUsW|ye|Vl_36R{oV+CyEuQje^UOCKK;kI>auL;3=&+L7e?W=@az-uj<<``2R%Y?{;1!`yS z`YUh5lg$OMb3M55_G+DY(tyJC<)r;Cs*s9Qz0T|6nX;W+(Y&Z|3apBNBc)@5#Lc%w z3I)$K@rP!+87TBFsGnxt8F&Q2JVo9z))HWr@hjTS7lRw>^(9DW+?6c6c0XJMOxjTG z&GoeX!Of0{)WL%2JNGX1m`cBE`|v1JB%E{q28#T_D10`<>DI(ZQb@RTjJhsNCES_u z)$O<4B_Av1oUx8n)XlFQ?~<0l*I_pwe5&1&s%lO$|N3EJ#Zy#`-w>XvXb=u;m1nk1@gzscC0poS81OVn`ErMVz^pz zZD@@W@o8n~NY1&bv@-@dL_AI0cZ3YqLC zpY7wNjal)cu}TVks4!0?;@!qFx%K49p5zDjD+OD;n+AQ@`J$Hn-bl+&*{7cy)Ku=P zZBA@fmeyYs>dR81cdee3C}RlDvt#p#jf_f0n&_THWM`$F{YXi{3awa;Lm zyAyM$*xfYbL3M7f2jjci;GJoJVhvJbb=uNXF~35TpUNEymnu10v4*^76U!FuK#4p+z(wzzPGtlV1*j(qVzJqgukGfr@;$#R~)R zRekcLChYHOfUVNeTk(0Oc-0co%iU+|MBK{^h>{v$yqg~1jb!B?=slox_8@r4j2 z1zS6W+z1PCOb)b_3At5hcC9ePoiEf=GZfh#;u02$A_~3J5$cN#eTX%?mmC^6s{4R6 zEHo@EoG&y$GwhL`9?UtY&yxM|ehIKvGzE4yk%7Hay=U;#tSt z+`|Kwm~FdK*CF2H5m^RM8PlBrCna7kfVWJN$K?;G8r2 zclb;D2mEy(rmYhjNVD#=~DW zr-=qBiJ4n%w@BjBq5WewVhz% zm?TC<(c2h*%lwITVWNjt@(}6?S~lPwnVNS|yl6{2)ElI%MIaWQY$cN%9G=qap8WXz z!-##=D5gZ1VH`O-fFPEDf*?7)DELWA%1dwNw4&7KODV#j)a&~IdYd@p837`j60d{j7=0C_`2IJ3wghKiTy%A53J0%3l zWnWBrLZgyRBnR25rlcHG-QCYpE6h65%Gq2>gkBEfd7tFG9PeG7Ly?lSMW#j_ks~LQ zLm!bFIg`cBe2ZL`Km`1hus^4o?5W7*AMlqbq1HBmNN299SMHgJT!Dk98kZlT`19zr z^UihVwS?tq)nw_6<(Z^L8i4b$XYx&-=a)F-U((JnugSMv3BR(Ek0C3tizrCYEVwq7 zdLy;KH6qNZvmkl7!0qNUU%qFa+R1nBKl7~#TPnhnUratL&w}rVc=H!t(JBn-EVK+O z46jLy94kyXNb|>&U(Xrwmmg~)KGnROiLjqod5PCeXCq3+HA)sD!k1D@Ht(CRc9uk& zx#4b>qDXIlm3s#A^w?b~gbsrlM_23LzNr5!Ux)r$zmQhDs3tedQYSo! 
zShQ%s3V>F~>(*)sM{9|l%1G@6Y7bW#!^B))&z!og$ewE`FVMBuV2xxZ_Gt_;=M*_t z5LbJvX2y+>1x~H&!p*C7^aAq~BM(bwp%rX=XVeHRMv@+-Qg*Wd1mW>$XXLrk;sAdD*_FL&4>pAVknD&tFTX9kCj|G`x6q@gj zw}+dw2@7=iv9@FizSkS?i16t+`|N$e+Nn*VLMPjk&LoPiX!QHyy7v!#gmaJHr&F|5 zjduzkbvjYBQ|@-v`*1faz%T1|l^1tM=Cr@J?5Z&8*bwWeg!Zh8b+sL__C)pE&~5J> z@0_gb?u_dGtP3AHV*Vi5lTFc47X=@)?2KmZZawP#0&Siae80omTi)HgX9-{IX2u?| zo%nQV>hx`O_kE&hC+X=qLSeQ67>bNOs2(};eyM; zrRW9ok_C(2g)18i))0IFzIauC@tVuxjp#+kl0~Q9#akPTu8<{n;U!P~r8_Q5_oA1) zOO_t=F8OXOJ%nJK$hc#mlO1tAG<6+iC#`ASx)O+ z&e&Maf~@2SuRPUX$#+?K7Cry}U-GN;t9;*AMXIg;S@?_j|8?~XPk!k+W&Eao;mNO$ zZfAZbzlJ;|{*L;!{!rpq^=tiS^((}9JVfW;!CxDXbUsE~{5$vyuYTbYOl~C_=O#w{ z3V&^;SpQ~!{RV%HXLv4TIRD0f{X6?>D(}IYg0zVObm%kpq-Sl@&-_yh8zYMD)D&gT z6#YZ}!oEPqyu6?KvZvr>bwf$+Y)Qn^($>||u;1*jmWpS;*et4nuAktq?*A3|>u7o6 zH~Z`Vz+anFje+C{{kp2HxsC(@>)yKRuS@+;b+Yv9Yrd_HRya-d*4OTCPB%xMA`BYp z_P=6>URd`v)F1BFemqV!c-L@zxIMEv)%VW+IqmQ87vJyj*R6jKf5`y9!(Vm`JRG(_ z8ad~^nMlzBSZ)Zj2jko$8GN%wg@?a5Y0Uotf6?IKFZHA8JcwL3F8k5>i;fE(652Cd zw94IO+<5qFfcO{qD-I8T3H}a$QR3mR_f2G?D)@iHMAIApgul2R;o&dN@9-DrFYuS< z5e^T3rGJONu>Su9e<`OM;^D9K#_#Y~7#{u#_V@vRo&OGh@qCBBq9IZzuYSN^F+agy z_M^xJs^8$RyR&Px7stYlig|y6znbvym-bVw-{G$o+6Z{YpYT`Qclc||G)m{6;4hZ% z@E5HXi_b6cR~Locbl?y8D~$capYYd*-{G%*aXkFBEMBesSMb+IGQo9GI&H50AMlsf zclZm5WM?{2{|3%4RHKC-@E&e)1;R`I(!oy!D&A-84 zyuZO;FPg>MGQY!L%0I(j0eJX}jp7XnpRXVu{^I)`{^}quvY04OvXzGKsi7&>*m`dp zQoQ@*?_V_p-;7>}5&RB+#s4$>b&pW|5BTd9dp7T1z+agE1b@ZabZ#`8Bpf5Kn0 zpg-WR%)bkN?MM9#e`(|4FNNRXuhyU7uP>nQ@Yi*M&VUwU_D>H!G`r=*oJ+^UUkd*M ze@XlW{PhG6f88cjnGGvc=6N1tg&}ah3w$==_bdEm2KX8Nn)?a`Ks@|)5f6V^{tSQJ?#IJlH~$j;dipc`rTYW^ zYNGuW{_<-Kkop0C(IbeT1)4vCsVpcyzJBR<`0H|rusOA$+X0ZF2ITrol8@Iupa($$ zDOFM3;$?{a2mH0{A69eimYv0WLaCDI>fT3NcR)xYI5qlr_$x-g%24apPwN=(SW&dg8vGCjp5-hF#znt&+u0)mTq}2ifoNm-%=2 zD;W=e(fwQaYwkDrtMWVi)gk^T{H5{F@K*(G*i*_|jpzS_zs%4OV@$iVQ;-I!ZH}tF zQtX|e*#bGzzkK@&)xXILr7%Eu<>1{`b=}ys{sF=FX$6r;Nnm z-jWd#y@He_;NdR^Ad^h*uCYSW7NGqH{3Qyk$fcbBXZVX34}T5)GyHWG7-j>U2bwQ@ zk-Mg@sK9A}^-${s-V19aEV7Yaja5|s75=(Mi2Vwbvyra#3@lX>$)J`wVIMb)EK(M5)yhJOXZVHKdABt{8dR&*iPa@4etAA z_>1>n;IDSVrw($wI0`)cmGd+F#YKeeAd%h%{t18C{}uctNC1qRMh%Jry@}2Y%23Nl z0QsJhiP`{t{t5oF0pfR-edbZ0D()&106kks@6i;`7n)dX(IOH)kWGR+VEK`$HI)FFRUkhr# zliVkpg+%@jz+Z!Z2mS*70e=xvvfd~3cY`qBhe*@2XW)myIGB&mL5uo1CbXb({aLLFn@AE8&`Ea{H5|8{)+nv{!+%nU)rGdnji32n|1-QJ4Kj5!hf52Zk zzrkOIl;7bm9Oys6U&=U&pW!b-Lzz9DzX5;I|113U;Wzl}`FHrMp8$9|_8tDxj13i_ z{(l_)D*pk0wg^n614$O7>X3Q$s5Q49V{sa@$eS`utcTHXiQ)t7Z#`st+Z)`{CoH- z?oarupV)w()caeVp&d~7Kf_;xzrtUFl)C={e_j4-_{;cj!e94)g1-jI*4WwQ8X>-V zT>MC{`@VG038C1MP6#X6eYYX`A;V;-P@D~HvG4>buE9@WeSI#f+mjq7RM+3clHq=-W zUH1$8mGj@=uYmq{#h+;gGHGt4y#Ydhz+Xn+;V<();IFQq;4k|>;4cBg{|J8#{0@JK zHpYF17-dBoZYWV@oDNboD{aI?QDvuz3%oN~J~30Il^^^S{xbb5_zMHF{}=cx_9yu3 zDX@N(maG8^8b1ey{TKL4l-*wU2mB@Q&+wP|pYWF~Fg74wv@FoL#Y4hh{%+YX@R#Zj z_)7x#GyEkL`|XmutC9k>*E{O-i^;uN?`tin`yNt%$YKa`Ob*l1GXCEOf6@O2fANm_ z3?<>=FNCt3IhXn1yjCI^Z-IT0;s4F>m)75hzj$x&Mc!`uMuvyKvVOo{3V8TS8xMbt zko^XK$;B)GU)|6F&pqQl-N*Gh$CzWyHP@Wqk5KKoSN}v9 zUOV@4)!d=3&i6Yj6|L=SY<+HWEt7Pe-QTUP`3Sr}!!+r441Xp5g}*HSfxmuOdw~}O z*X%FWw)}QU4Q&0qBJj~L3BZ|!UW7${(5EhxeKcUy-J%%7FQ5J?$bL$F!-VsUdc9#2 zOkkbv{{;RT#U#n3RtGB#mk!jw7i!a!v#c%`&*>Fw2vz%c_$&J_{3Z9_z+a2;|2_P5 z>wg3Os$h-1^}h*!8DVNrG8Fg!IsA2pZ}K1bD+wBt{6B@ieEu8wEABtwubm{N=wtXx z$Gm&dFXu1(HH1O>l>ddlvdxe}VXd71XYiN7Kk%2*|7-XwNa+6o{_-~B_!s;o@?YRD zXUM<7U*-@e<__-*1~m9f&gYQ9gqmEXa53q6y;^5y^c z@K^X>`0FPP{z|4bUg7@%e<{=8FE@;|5iz#m0Kkx)^%wp^%EzHfP#~6aR6-zyK!1q_ ze_3ZGD472j_>1*_1%IuO?#<*52ZrKxJT?~u9cb{E7aD#He|5VNTl@Y6fBnJG;4kz) z@E0R;voSWjz)k4aOW9Q;9x@V8!uQCp{1Q{FEk)FKuN{&G2kU+zqPp92;xPc=`#}6xgyJ_2W 
z{|WvoZ8H7~f9(VQ1%I&!k;*jcvi=kN1%d8VupKe|DSKj*zG6SWlCyAWt6Hzq_aaOG z**a#MhC=;eW_`H}BTtv~PrR+R>iHM^b;Veq>o=Htznunu#d7A4Byv>$1ApbW#}*oW zVo+{RK;`qZaiblciutTrGv{4cW%tSYvRISxa>|)vQQfQ&(aU=be{~(hUr8{!TWniK zbglFL@2+*;{2CHOyXXGtXLS9P>K~&8?$TrUt2nNhAzxu>3Xmg-C^>bB1yFp?M5GwsC_)0$Xd{4SKsU&& z5X^3eSpiD}hP7$Tm{#%JcM1^#Kojygj7$|`A_lzI;~=@oASEpb{b@9ic1RL*#(-fc z!FtO^(2e%g4%xbr4vY*Zg93Rwgia?2=pHIDyY55OfcUVSWCn1PT8>R&a)_lUI8@56 zYIoI}eGLRqcRT?QJY)h46ioC|yWoNVf(R6|EZIwgzl7WW!e7^7@EToq1g6}+C5b-( zaPUFeol^pfl@!yt6eL-k27f)ZA!ruT;4i_>ZE@Mxyv4$?(`-{FFWREkhoRSB(|$Jb z7a#nEzc>Vd{D0xE43;7MyRu3G4gLxO_n-O){%Spjzp%8l3jiXGV0FW1glLQ%!(Vvu zO67drBpw?4Wp{#|5Ig%`2asGR*p6PB$20HgEPB2SSyEHiN|MyVL-Y`pa<4?<48All z0Qv~9>6fs#$_&zY!S)j&0;+OHbgRpbyS%W5f%3u47z;JAT>Cx=31~zIZ8?}z@*#5%Isv-PsJ51tZL+~xkfG#*S zv9t`M5(>EL2KaI++gL37tFexjG$KE)**i}Eg>K>pFdkm~B1A14@(XxwTK`&AM@T+k zSF9x@#00ya$cZJMj4R^75>NPvtC!~-t6xEJ_dPn>gF4-4>Q|~J$2|a(7UHk^#Q;ry zGxAG|9jJY*egRH!_<#UzM&CHvIWRcdi{oEyh#?qBhHrBbPs%Nnp(@0jz7R*#nk|buJqR{PUki-~ zDZlk$){O8oH)8(Fe5EjZWYOJm>Y*RA@(osYqOTmgUU72O0GjLd!gH|l=3odM(jWj(p|o}9;^)*p z=*(H3C=rbvN(_`vj##1#ol7xtOB{CUA5kCTv>*U+u|<~p8FJZyb@bXar;BRKq0NB* z$LXqP2uaIE;BV;KZd&f=!GN@7ng=P&)acqBnagqc=WS9rB7p<@St`( z=mCnZo86T;&vl|SmI}Ih{$XJyNa2}nt6RU6|D8a*E@0wc+SZX}F+q7tou+oM*tRJr zjP!`{2%$l@unAV15OGS9Hf|_(;%;v8+t{W=UCK!Rn<4Fk;#>jU1(t5@yMzI*c1NU1H;bKL{8J6=O9Cq(1Ur879`!Eii zbTGlt#RBe|k>YG8?z(E_Mpq_c>*8#Ol5Q~4Pw5C;WlXr-UK&Qr`cv}gyGyU-C;5_q zR!ejSSqycA1S~ZHi!S!Lmg>8BR#JqHAqYTYv5wfyfsm|;G0T#=Hxrf;w-HMgh*C5v zAujncAb<6?VHO_b))k14WkKuWwi0-X@yx{pPbyfp{apwiEL%Bz-W~vRvrD?n9%z*t zO{Evd(Y4*X6{3*r;TC&8J2v8Sr6xYbHJ_k_i{EqrqeUSk&z zY&s^gYO1U%sxLB30QT**NB$C74>b^XbDE!~O-aa1(ugcYjg$xy*aW~YZd?1v|EviK z997966VC8)HR;GZ>w@F$#4v2a6UW4>B8*ojxFd&>bg3ZSc8HG~sQiFlHAwnV(9|oO zO-l{YFf|o|A3L;jlF~=eNl&xOyzFwG=B+pHTf5B@WL^56uGxRglD8g?A9XK| zzj)^LtXu<_*O>zWK}rL_S)*&|h))Wv{zC+=q0c#Ri0V?|}tXky+O7Qu9ZFr&oji(Sk+*O8e-H< zMh!_&Lbz%4KnJ`O=6vL;@-4;_PBNoVX})H$ZQ7jlx> zM?CoeB{kQx3jjzsfQe-5igk6Z^aSStnDX(#r+`d=my%g{L>UNTi)R6hJiLSF-Ul3W zTLp>oE&v$STs8{;VVpkZwnFfu39beY@Jfk*P|t2~elvT$IaD3dFaSI_tv#po|F0wcw=*djEj!R4> z#0v1)tqE)r#L1Hv>*|uo_lW#*Xx4PQ{%**$kK>0*762fe(3+!jOUy*VAe*9)?SFZW zhH?>glQ`S0pM+TZ}R zlPkg(vfL9#lB9gWx5Y0CQ1V>efC9F3jjf~FC9 z3&eo(?T3DfRzxZd>$+NaCbA~3dp|xTLw|dK03-n#ssUHf31GbU&pY2G^AHTY05eVq z_tf`^2Mp~8aR~ewdpQ7fOQms|-Uy4cZd?foTAGI8k9epI>RRtb+Hv3^0BiEXVZl=R zc%t3T+sUnjMS~=Z8hh#YZ)zLf1MxbLA%?5A&jqP*?WP^w-MY;`(0tv?W`c_%+!r{^ zjedwDjEbAUSfKgX_qV2an@GH!8UBEEx?OD1r|)eD&VT2vFV`2^wLP=j6TUGh8rUUa zC;_^F0~^3)d9x^*?MWO|f^8PaAQ|G+ccJGU#nugEz?;T}P0*%(8c)%qvX6OjLEel& zFb{7%Y0r-i+PLkQYvz;M5j#2VyQ;j1zvTiz)+IIzl8BzZtkfm`RX_F=Zwl*L2>OhA zp%Z;EFk{NQaZ5!OrPs@)cg4~*q;V}I9B?!ph8v_2Uc3H{mNZ%#kgk{=wbS>MMtDJW z9_*bG-fG`v9Rjw$(FgZ3QtFk@PHV%aw}kLXdXQh^iRTzD@rg`Z>L>cr0g7(onEGgn z7kIyBp}pJN75)aCf0?FuJ&~kEdnoOU48LSkp6y7OVCiMhe<82@pno7Q=-moiMp=`k z`!0bH@j8N%7GWNLE=LQZoL~Qd%GoqT2y^ojFuNvQ>606>4$(>k;@$z9@Y#}xz3o*A z$?S;M9tT~7N)X`I8Gz{Fdk7=m5YRjo2cS7#l9|>!m+)#lPG%6i!u*ovIaj%~UnGIW z(!!pcQQezK_)!hSQDs%!?gu(nUVubHpCKb2!t0OY3?bgcC!Rd{OY$WG-VCH*xrp9? 
zkFV`C&-kN(roDl|@_OAui9X@L2S0W=+(!$|w(URqjylpyVa-z~R>HIh$B>tFq6ZD~ zB7vRmx@TJb(|4T6SKGknU_i>})?0yV)S0hv zxUTxt_oMD-muQ?9PW|UG=heOkrhEyZ)c3Ky+znd_vpH<=p;EH zTIj$|R(#U$T}j}L5&W}~-i>72AfX`6NI2w+Bg9FIaKpfp1| zI;9T67z2XP;4caZZE}BnnPt(Yw;gK;KV6O6DD8GVz4aol!seY0;WR~sh}#r@mn(bj zx!>~0*A$NLUL(~$za$3>bgpl%9>ZV7>QVG=?IJYGD;}n&rCCg~y!ijIynqa9G|LNs z0e@g~5K-5LSlzOsQC>Jk8s$~M=^!P@n>9a8qrB8DHGJ~$pjFNq*-2)^Z=-MgH5Wi%InDBF=2R4X zX{vi=j%IlYehUH;;_k%&!WV-S`vFWMik#@;Q3}XkJqr7QxQEw(K+VRM0Hh8uWi;cV zXAu6@XE;9;|9E$~9k^!n``6E(M{O}GE$S!&Jq&9}W4sz}3&{<@Z)@r-xy2d#7og)M zPg6aUj6vj(SaivN2wYTZVCZ9#NJd{TlOaW=^wf9!)pP>5Le*H{fGU3XqtI%FRG0pOdq%eV~7@FlZ zUeL)E4H@Dz0$xkV%F*HsjlhQlOC;dg)QKB;chzW?moub1XpkPlt4sicmBTW?)D}sG zXqM{N?UpaK!Y0bo$L}y%Avn?WRv&HM+5z-v+exeX4`&4+z~aq*kh2WBPkS;b0?)t< zCA|-E7FCW9P7-SO5rx+206=e7%FomPan}KaFrzz`fIwv_qDsx{FB5=x`^is0+|Z>| z5Kuf@;_b$}y(0YS$(u@M*KNJri%qQL>lG0u4<@-e#1Y8w$0Q422Jms-M1j zaF^j6fnIgWzw=^k&sD=+7;o@n&Js>6YNM8@X0Y4IW4JV$XxKO~R5By|V%VIUiehII zeOP{5Yw?a@fBn?GE3LaWGfi_jPM5D5=qzgrcWDl{Ka#S`gb^kVdLucmrcBVE;a(!2lHgHwD6Wwv zaRtfeC1!eY^zR)R#HUaqGB~V&e6gvxMR2VA>SzkrYf~=vmG}hJ+>+Hd$*AZoW?gs& z)Ro%JmPgK%pTGz-wx85jerlJZf?0ML zF<-0sMK?l8;%&Hflx&YBhySsp=RF|CXLHBV>xdd#c-IvxIc-*cG%1A@6Gw^cFY6Mc z3Sn=VM*H~Gu?grB*P?hft0g=m^i(&SQN6hHBpDA%p+ho^FvygbHc}li-HpR3wa*s> zFI6OK+WyGIHkJrUF`D{_EVDVH+!>e(s?fYr@LXRLRCxNT+Gr*CzVOR*6Tbj>5zEZs zu6>GDS_K-#%~g`i~eF)`M4`?mL_R7Wjp$t-_Ux zt&%{_dAj&fp!_}|)4lLoS#Qn{Y)lC_(qxWK#$oPiOI2?~`3Bu(0G&c{37Iv7)?qkQ zPBxaGLmcHabx9hi+&WV|=N4+ztr)G6<|#kV_ddT{y;UV6q}r43dMQ+;ew{V-VirHl zXJ4p&$D~5dD<^skG@s`@TfMCI;EP%^6ZHPtYW@Iva=caZWkZV}H}1{R)a==? zqN;;655qk=Io=b^qceU1Rc~fSV6U3rM+ICTeefluPiZsPoWVcK{R!=?tcutpHkUAa zvG?-4$_X-Xe+mI=37^$bKhIic)LGdMW7q2C{bIjFV$oR|?lvyeD+0wD&Zt-Q$($r} z$p1F{Wc7Ob>}>GSTfuJ{w~bC8oMMQAlS@tNN;q3ltd91^74k1IDn|G12>TtFfr*#FI?wR4vSAc2c)Peyod=Tt`D~GFnn}EvdUx$= z?M?r35mrYw%0f=<{-W;cPbtQ?HVm28QA>HzPYelj^hl1#bJ>4b-$w5cf`ix9kP@eD z{iPd+``8a7TMCnuH%}h1h@(9CE@2ZAR?_IKaGJuCwseYEV}W=heLZ81_zs@r&J*e# z5$>sh@exZrI3!n3)YtP=5clWR{(AN3x<3!kuS}eKM&Xb)}_roOGXUo%$exkIx6dAOhYeq&N`cLWpUja0~^SX zFpZRtf^1>-6g|7E6v98%D-?-bmJz7JKG%)VYr6x}+m%5Of-8`pCKPn<;9zD&m}u zB7#rS&bh<>%G}VEMY)?OY)I%AGS$bA##~ZX8Nw0bZ-iW_BQ^cA=qvIqMyE=JL z9U-DaEu5X~2Ge?^4t7l=EN_V@u0wACd`JfI+Ig=NJJjU8*So*@*GPTElwTPHnk?PB zIU*VP{QEApW3t~S$FxIrOF|Kg(XBRaTl0=tF_yVRvN}jl_i|nW8_-mo$vRm|}?|s7;7FUsZ3N_ zV!U^|utJqXhV_g%-0fkFY}L!BRdcp3Js(fRgA_tR*rn4aUv_c~KYgZ3r$FISh#xzs z0Lg2~mRme+VO$Cp#fX~-irN>eSA!KSt2jxqm6L=dSvgVVA5RAJp0>(1X$qf7l&iV@ zLur<%-IpnDY9zieqBlV#JUhDbM$%l|1ZL0y3hV&=B!d-K7pE~yiWFdgLY#q%4d*n76f)7YAi}-KCfBwK+g6FYKE1 zeKtL)Fw?y1Qe&)<^~?Zgtb*-qqd8#vppMuz9d+4ig-#`km42i#SrTr5?Nn`-u3Pi$ zQAc>N$7^OOh#VZDu@o06e6}Xe$n%rJ@~t61uhr$k%Z8CY0u+MK0D-Ux6wShDY=LR3 zGI5-=!VvQQi#YLY!Z&VX0Tk%|Gk|LYx1ve{IuIK7FVu<<;KFWV1!@U>1W1qHJ#(bu`pbRDqhw=%a=f`-H-Lv=n9kx zQv6D@vQYk74kDPehic|R5tlK0UEte6wP22+kGHSTCDlrf6%`3*B$DDz>&L;mxD=Fu z&ngJ4K9qylI)iecg6$V+*=HZ@z~KILFn5q*$r*vIili(;x&V+>G%`@bV!9{2*}z=9R` zJ5F2z@`Z2kC4flMOG=GEj3Mfx97J$0d zyKHdgbaBY4D%OeIf%$(;$H2axlET)gIR^`!JZo5|aLWGq^w0&1z#BeW_ZhfHoh7#CupLv2VU!DrNXcR6L`P+E_JppwMI5X<@cO|kj==3HTfg< zK_(Mfap}&1U~a6p_yq7CYnLeNPd{x^t3)RD#-=H}vG4;pb4R)-bW>WOu}3FVt=Hy0 zC`I-`7n57CMG)51DO3)lo@J>_80gfBQN>7qLYJ%cf+HtC>1ptH_Rd3Xj&IS&C(Ke2J6qlIE84QY^1kpOdnDkwX-MU zjAbaXYKOm8%`6RmHk^ZO1qf|Uar4>x&$JC0sVG~^^!*Y^eXK0B*|vIGX(Qci3hTtP z%2avHknRk+##rDIFlg|az}vGL6Sh(V#vXf}9#`U6S*0$OebTcGlg1jKesdHLenb#b zPQ!A<2kVGYW*estxHE4BVBF$6HZi$+}06Py5l>oKB zeYtbxxAo285P!KY`6nixN;|>l!Y=8axO`F5>pdICLbjZZvn&N;-VS0h0jcg8s{l@4 zvk@_YW6sWBRZETKkpE>`8A`C#FbWFIj%YmP)_5uzVYKkVJ{2D6Id0k60Zsd2LISZg 
zg+=@>4*Q9?vZnbxgeHs1kvFy?(T2BFq8+IL^}D1d{vDkGHKa`v&9$w(#|RPbfWmW+E+r< zkK0!|cOw0-oy!qEoB5)B`Q=E*^_y$gF4Tf+ftpv?!rW8A&UdRZTXB;mCMsDYe&4g* zICjtjrxo`iJ0lp34BpHuJ(&8ZVtbp7l@uIRwDKguH<8?lzG zIg-5d5rk61w+>E=+l_n1FIeIubCM}IFf<


Otbo_oh_DL+it}G{%t?z9L-m9`e7WWQm-rxa~hFdSiD~_bc$Wy zO3}&t)KJEQ3tXbMqq%QyD$_ee!Z#_Z`Un3XFP>G;JI zYvq;A_A~bK<+7^R7aml0l`8FC3sH3KVK&uo?76WrMrrTP+6*9oqMD-`JsmT5Nh~Gg z0&?5Pk8W+-=x1MLvxl^i3BS)14l&%)X@h8{IWp-e;<>lS&jrn)OtMW7R6(qX~&H>#vitR2X(?7MFgk4eN5hVH2n1f6D~ zAN}x#D7Mq%g7qb;(-U+S4%oQ_7~eF*H%*J7M5~LYeMBvVDYx%w2Cu}sutDxd`Mj9G z50A&}$dCBDkin1PjWe($rnMwiq^v;5LrpIyIi5L=D=A%UFJW3L%u7$XR9MX91`eze zn8!!XE7*uGCXSed!3-5T9IqGiVI5q(WVga$OJ-a$I#rli)J1vPLJXn{;N~6}*{MyT zm1Dv3uAR;*%~R^ZM}B9~qGUCiKzyaw8mvC{$jTdRC+puWHg174xX&$i(HkY2YdyMp z;!4e3Awhy6hBHM`Co(VS({$bD-y>z+M2E1a``u=Xu}qD#A$rQbXKn_^qhWn#9;7|^ z{{B+Y>wBhUW49hXW81}cum}2me7YXDyZ*!~f~n%CN6#1=H{Zcs9m4s?*r1pftjDI63o-)n5;{i_r#A8j;zMG0?oG zubb$gdV(ZRfe_lhE+=#HvkepR4&k>qfu$>UvVRr`wPQO;CMyzq#YM)Hu`3+*(7vAFU)Y1JZkI%#6FZEqvp7Mcq5%;W=@4O2Wh$PqAo?ah;q{L0#wbG^s*VT?~xnQ8Bc$L67T}qv%)mFwzGPL*G5jQ3g@@;cOb3)Z3uWwFYQuGhz#Z{w&8qTmUaqtWd z7}F1B&Ts;3Qk+gpr*hIQ>ImD)cGB9B<(rIA+mBE>)y9(*Y(^hlHYpb;i%S=DUeq(N z8%6j9QH76mQtR*8iMx{FDO4Ccx>ey3(JyLJh>U6<0kn`9Wvc3gndS6Sw1nbDA7A&)byE=At|A|PxBk4(CXpAbjD+N$kh5*AXV z#ga5UGbvkF;XB7GeiyG@kU2!_OaOj~-IEoE`{qztM*90l>z_aJnT&Z@nq!+|kFU|J z5PA-yUUl8}y)Wt&euC8{i&)li7NU*{ z5v~Z2VRT5WoP6R2dB;g^Ap&OKSvQi5ERVLkfz+MajFzEAX0D3aJ!}=oHA8Onio1>o9Ww%kzt& zUH9{b!7gOiC>yY0is#=_+Hs^pM$od1y?0%&AkC@jPKU7OmJuRWW=Ku|kldZOTuv`s zrCFn|y)!EsBBRlUeu*V~>@6B%ya{5l6hO+=4y+-p6#FWhH~6449)TPUGhhatE`KT_ zOXUVGG(cO4DBCO13ZE4{5vDTd8Rhsw{S7~7Sg-%8*X@-w_R6{J`W}15|@t8fl<-k7zFBZJn?ZSIn927c6ds zJ|+3=wT)fJ)*F3?kQ!{k&0xk+4wNPTK74&agSO zFh<+Zl(jHA;IWkeV>NbXa{V&$_HV4?+P@<;u8e1XM50h;}Q5n$O(ZgYMivq zsvo$pQy*>U{Zv28suF%iu${D_iN<3YgnR~<(xMDVva`r*tpS2!&sN}=UCuQi;$6{w z8j;pl>5ymP(GX_C7sFXICuiQtX3fYOk@RxWc)QXZeYbwYogfblQZRX0Bl@fvj7B0m1EnGWb-ZCd(mf!!|cbY*6@QEJw=HKY>94Si5N;}HV5eTJPqj3Gpk;= zK*h%IDJZ8_$#-guNccQfHgC~y+`_!Kneckcm=5`3HnQIB_F+Am&4Zk=JcccIh!!+5 zO;$eRhI`nA18qYuz=;N{y_GM0|6YRYyy~l6RnE8Q*>G}}_tB)AEp&7UJa$Sp0lc@( zs%NEI92hyL3K9QdSS(y za*lJYxtU#NkYbgS`C}$68#NDa9Q)}Qy7DZtrTbfPw;(IS_=LTgF~n|)1-=>a*u+S|AhDs>97(eIj;D^_Z6l)ZP95p7KJ|FL(UUrnv+ z{_qnLAfYDoPADS1H|f0y2nZrlq$q+&6-7jlgdXWddI?oPs`TCwklwoiR6wa>K>>LJ zuC=y%t+ju9pYxojyf}Zr9AnNg$90eIJ?C}ZA7dMujEL#EF``!YFGW8!t?`q7!xOOo zoPUWWKh8X6v>44LmEz9_Tw?9y1qQ?gz`tZdEoF4It5GCoEbQ%5poqps`#XMNz)oBF)_ zJhrMluyx~(+H8oVX`Xitt&NZ}*`#kyVkK9Y^D zXj|tk<%*jCv%bWYW40w;quFiLTtc~CToh~7ecAh!9K=TS>7OOOB3T_I^gn-^IGA7v zSk^bxhr1|KzZ*3;r$W^z!W!+qJ%?dcpuT*pd|CFc^@kE`a>-dw+}o=#G8eI(-y`B? 
z(FXESpzU-Kl0Z5P)yD}y>@(?`Af~47(X>H2pLS5h76PR9^Fs&7v zOS{Xm_Vl`r0F`mO)37IHK_nBW(R4WQhl&l&&=)QqcFMB04GabS2|Cg~bCT6zM@^R8 z^@E zu5et0+WPWj`+jBf4TD*)*7{1|Jr?pe67pyLSj+kTD)~8o7OF0v3;whQm-U{OdHXr* z6uzLm&gLQooc)4ec(+G%MBX(FICuH^IyWntzN8hP)yP0KPr;Tu?XoXA2var?wQD) zHH&#`8Wj|ADdznpya+zq2&f916pD5ZNpw&Pk&i?aw0+*Di+#IhVHtJxp|f*g8`;z@ z-4`S=6YA{<@b#FICR-J$xF~Z}7D*`*b_vG91SGMJil84`j!&X}W5e3^ag{4_-$#fv zLE=VE76}fo61)(H?;mwVGB)m-7gzGTlccAxr8<&}QQ}{bU$}7>DA~p6Ew^Dv0*#3x zvsmMuX$q{l3fm}2#Uk^{NV}Y)^~o8f%ly~Km|=pXk;|KHND3A56diIhW*}AJ)nt&2 zd(te0eEs}8JtDXJCnG4cTyZ7-#0s6%r6L2y+M7U8e(MF&8-*`zUm6) zk)eK=!fIS^dSiChpKMviCYpXHeKfMb&@68_5~OYPV9|g@JeFd?z{xqK_`*Aj_SmR0 z@m)K$%8w}mp4{BM9AJsR`H~#-a)ADg%P=1N8^77FFtGCZfKc?QGg_mfbm#771 zucqFZh2FTPzT+(Vl=i;lw7$%y{z9ML2#Y@7qy7-N{%1bDjcKncnO-F>47@xVcyRQp z_Ncc{ZeWpVFo0=jUT!eXXRw=TU{!8-!(!;Q+~BNS-y&OoDsUuop?^(oWUXmrZ(%5J zVFZ3WdfYVpIc<=(x!1i(LP=?yww?8+(zvn`!I2GIff;Pj&Z=bzJ!&V>YNj7UO$@fN z#v>=jfz(Pb?#zqJ%@c2rr)?}>E4Q(x$uDX9&TK4B$5@V!XVEA4E=c%d2N$v9i;G`<=O};7 zGqb!dvwTh7f{i$SJ#2}Ew!A){@%kzA8)_CX-St&P`IWN8H-aUc0YAIkCKUqm6iijXYR^Ce3Eub%{85iQebOnybRvgO;@rg>}CS$chrP8-Oqr z1D^mAhbnDWC^4%kZ+2g&a{IAa-2xxVfVj2OFZ}qh+O~C)-o81Ei^aZ3SrH2u!-($4gC=D8o6 z^sGC~t=ptlyXRX!%Ab4$vJy(3Y|5VOEu0gdgNI9-em~w*Mu@OKWV8-(6E3pH%*__FvSrG$Ig62ZxYF?j(=!cWH?Rhhua^TmJ|4&G_B@eG%eEK zG_CGDL7b+gVcfM|d`%mtX+17z`(4vQ!~f8U{|`;;{yT9zf4C(QhKRlX z>0sK^jed_k;F3zcO@bMhS|A1OW9XTt^-hQ01xctXL2i=>5(j@m9e# z7L!|rXPQ<-pU^zfa+fND^<9)_JM-N4B@wc){Fu7V_RU!SoT}{zaeEWHG0T z2yVi8q1L1ui)IgRos2V;UxxSPIG_br=haGQv(ZE!kMv?G|F8`j6KM@a}>x zfc$+#kk({R8lK>(umC74>>ZMkFioyuwnt_x4|}qP+Mz-g)?8{S9%gLkoSwYNO#FD^ zO@62?u)}IULpP~@vvdHbY3)o6u#2Xtdklhe!uQp04N67RpNm)Oiu^G~WQ&R?ph>yF zDb`2YX^CDzDX6bUFN4@FMXwN3uiy*v6RbaYt8u}Da52So z=;7LV2@eyJ2W=;!mp1Y<706yElns4Xs&gY%fo4}-I#lns*?PF$u+wrk0|8c+dEb#( z>b=!dnXboI^5%ihKKZB*E^$$~XLML*Pb zlS?&3>+)lY*m9;k-qo@Ge#~93@^vjS2z0=jh>}yR^Ex{vnq*9XhajIshkq^?$K-l= z^)bB^P5l;cpTdzcPIXFov zJyh{Xr@41t?(GE{?>VtYMDgvyDYsqT`>V9{0kpOr&*2~5o2M2{6--SQTjAfaSne7NtNL90l7HQhJj}GdK8M4% z|iZU+( z6ZYcf&8tR@*XBR>R@SKVImzOUOm0tAIefOWjS924KO0eAT8-e}c8D|`NMNf76@YjX zpUka!ayE=xYTY09U+i&Av>IFNeIQG*)DoH4_^#T;`}LgB_WkL`ZRhQcDf8u_A@2)?;-p>VO{I^g<4L=2?}1=5N+=7n3>=<-0-0sml;^ zPIAi9-4JCtGRT@CJLkLd5Pehy%3) z7R>jsk0K>m&v#)+_L3$$WK)Y)6k<2Ac()&3C<-Y(TKeH~p%%I)npEl2AYw!>WUo=V z!R}ods$0{!YQP&lxnL)GV|LI}PA82`)Sh_0zN{C3QcezESKIID^I^Gut~NSS@MPow z(Z2!NWk`RAP%;}sm3-PENPnxT={-iU_bE@rQpV6@jxC=+BL3v??Y%cP-Oy1^R>l&? z%e1+Z1&`TVMhRGrA;whs$<3HJ1X-|L!9B=G|8 zU{8(T6?)JYvYUEz;?ffEQa6!z-=< z4?eE&rPcL{>D`U{TEM~MD%?gGxu3R{i)9bt*l1e~k}Dq@v+=AMwt1zSwHkJkDOCG* zI8NbSzSLsTQ!nq4oTa%!^Uq{Yxd})1VZ^krQ&%)kWTwJg?v*~0->@m$opIiVr@!*u zmRx=~p68Xo_j-Bj!SH6{%Zdjjd}BM_YlpA)Tup zEx(t?U%G@IZwEvjy*bmg6yon#9j&h(9SoJVd_L8*zOsBD+i7`GSa!S{cB*OldDQP5 ze-fiVdgt1D_?p~rGso)qlcMn9YShUQPSeuM^gBL@GX46}52SU1#A#Y-85{_diZV z@^A;VSNRJ$1W1Jj4ATaP^#-7f11>-UmAM0T?)%F-1ZM09s1^k3)CH=01?p=_Y4-*i zX$1+b2O5M132_8n=?$_>LS0P?vgs8xUk|c#2nIodZ3}|eRD$i-gI#AH->wVxaNu=@ zge->;!ePKW(eeqHkZZznv@jr5IcdQSAlw@GfEy*N6%sKc>RAvvlXyyLsYjntT3Ec! 
zpOluHOu`5n1Va;YheU=xp~XYznfk_=o}d$F(5c*V=@0WIrt|{)ww#0~@&~Ti=1%LtbZUE!g(HlvAV-Atu*6G$AqMB5V(VW*Fv(m>(XuBa0aa;DW`2n%p>$u;N==5HY@VD>_Jdee1$7yd+a@>yB ztBuPWkE4MCa1&=kbi``2C-JKx@fZ7^%7(;U<9V!qfz0rsO!_)-Pzy~aOlAsAn5%ti z2~8Alk9Xd1v(-+>zmc$tJ*Bjey$Sao`nu952HfVdf+kI`J)y$lh1Dl5(M7}Ck|HjU zJ*+2vdQHafVbaj~q`srXzIDW4jF6E#Etrly$%1ZjSLHD=0|iUhgRn2 zml zF!V&(1G8BiGC_0<+2eItoJHBFZ&~M4vUnP@Mf!cX-e(s=azvlxv}@)_ybqO1$x-Zo zB-@{JZ#_rJG56W|T-AnPb?;m~qJWELIlAw2jo!Oy6Xn@(=NT8}72L|Z`XuODL!Ql( zTbA$h^db4SPx8-e$BF(DhWnb+Pf9?JT2mSYi!voKSD!Bz zVYUG!UX}P)2#NQ2%l=bM>rVMo?4_yhqW3Zw9HrIbAw~bZ;@@jphZUk2g6@kI=iif+ zxK&(itCat6rfJPrwyA^F?^K;?T8&C7ud4JuR4MCJY4cTIOcm0ZnH87EfcYazNg6m=! zg(+?Kf-;AqM(+8F)UEy55S1S+dh1NaSpWYr`CUa-b~8h zM!V3lPuwBWWFj`uF6BhQ;?!xu)?qPL{3f-H^Mw;MK5%R9kvjuzKuCnCB?PAzy@ z-A_>6_=Yd-`8(Z7+O^Y|d`No6V|w)EdR_Rt$IW}MGIfDsUV1k5f=7G3NxG2>FCBFI ztbJ&lHt7;l`+_(7c-kA$1D#<^eUSs*_vE@FP$#{K7JdE${Yk}rQSE&dM}5_s<1Fw(Zef(@A{DXDC!NPFHxdp}*lA$dh(sr3`o zGIY)Sld1JrP;2-vpw^b37*5mr&nc}0G;$JcvV&H`fm$iy0Xq>o(~b=V{jH|;lhWFM?)`sL)B5ternT0L*=$a$Y#Co~`5RE{eb@7^U0D@9Iqf~;vpw~H z(X=Z5y{0u@|0hjrZ>sa#blcWU|Hqk^-)B1q<~nERUw@qc8%pcj|592y8M^f^-o8o_ zyR$SA?-YY^V^4o>E$(xmX;fwbULhy%H}zC{A}g=~k{1 zj?&tV{wGRHIoCQ|t15RSQ27BQ2}f!9#^TXICp})C&yftfMS-KV$jBvsto$>j^@$j1 zx4>ox`$=iJtZX)#%%Kg7d0^%+-Upm`I7(~bBaYI#6o(dx;KUt}pWLSd}_`p&;lp)_@qLZBm}hz};lCpE?+R)0om-F(5XfHxTj<*!<( z;A?W&uZ8k&34hY&9F0zYoXo>E%AXOxyT=>dejLGk2xXmQU>1|E5`wD<$%T0sARpXmAxJqo?zeJfI5P?9$14=_vDPfCktp(1iIf$$YbtSA4J z(o+3}(pt#IQCb;il$J0^sX+KQN((|*XuepHWFy6Lpo${dWEt|ZCTU&%XG$xjM+Waf zh-)!ONnm`wEIQF&^k%lv0I{w2NfiRY4a;?=rh+vv+?Dr-M0ua=6Dm0D{Yy%V53ZA< zQChSXr5NG#k1PfmpLL~M%U1V?G0%Reo|WbKPfE+Hkj`0CDO$C^ob+o z>Y=St%PyTiQ(E*|p|tfyCXU<&qVpk&C0WbpK3gas8FjO2$!I zPIyWyp#_RuMN!xLeo|WgI7;h>N&pAn64SX9=o4GWwGR%;6u(niU{#G(!yfzZs+T6k zas!>#aFkZr@01pOIR7I&|C_RDiD#6S=r5F(&R;358w_E@BeJAK7(qSq@+vBRBj_H6$g-|5RW{6{1MIhCrS%XRdxYz z0VuIdGxaN_1tWW>+>cj}{RgGhO(Kn>w0v=t)-jO!S4!*qUnniQ)p;ygtw?n5*4@;@ zh4o9d4by|$o{fi#yPhVBPb}cRBl8I3$l4bC_rY9NoEJZ2Lzu^i4P$Rp34G|S)p+L| z^Jr&c38o0?G&KDtjHFSOfgnWsxmEmu;7 zc9OcZg^0L}?x>X;@*O(>$qoLQ((+l!mpka{BU#~mGQ+&%eYI|wP(S{eIhaObri&{f z=^kp6pCWe`%bdT9qqN?WiQ*`&$A$!=cSo)oVeqX2l>VT!9_|wRVooWo`;|ZFYM7I8Y zH&uG+p$C^wDXr-lR< zU#GM(F*j2&q`k2y3m~cA@08XELhjp`>KkKui)a2kMpx@kR+W#Hi0_TCM|Y?Mhl<$& zu1(&Mm0u?pq2_C);9FO@uy_|OPsP{!k@d$rw0>RiYBqk1fldxB)H{I3+$)SXOS)F+ zjMC~kqqN}Q@}HDeA~>_0>>7^JdI2S*2$9-H{y>7%$L})+hj_1(`y^418~jdbb&(W& zCG^{cjvxXpjKqCxO(G!=?%Lkt`4gqJc1mgS07!9^Rwj%uVVv)|v~T-m@?;#PbwDUZ z1^$E5vUB76F8l}#49D;FSRKCxQ`f0dB^E@0VzA2N2ES8UMj-Aus3;QTcS>og62dAY zgcGQbks#6N-zY6*1jyqPVXgsa!2h2pt#NCqLQr&Tw7l*{{0|s+W;EaNjDUBgD9;(C zC1~&$N-N})(rO(6tN%f1?GhgdTc)oN3}DDz&@j2H590m*OliseN@;y04hUci@4DM% z#O>M1Pxmnqqs>$MCrT^e?q7dfj0({f+1?akj(LeR+NdT5#s>o=PNRq?1*&^S%xbzm8)prUC)}m zbVlW5QnPH>9%e22P9-HNo-M%|LLNXt%9MhKbR%Q%CX=E@q`Hw2urnNMzzfbOExDhR z7Lb(FG5Zw|v9_CQz>s%YCePR`&*Xb<>MohbH+qJbeiU%<|B2E{{EgD`!xE_9kWMxr zxF^gPfe<$nCY{@bwgL#&_ermY@C2QB7u*aaPsKia#7pa8_6(5-{VSyv10X2Z^lY{E zbVy-xnJX&g%|lu%q}zDWxIdHa2BF=E1+87x0R*iEr<7I+Z|Dd#R@g7HM0SE?X&nT@sB!X^yb_OMPMan-v^Sl0q z*u@@po))SlZsnW9eNM;4zQO1|b-*oq6x~5-&vI_`$>b@ebqkyQohUEUI`09pQWQsN z5kbg2=`Fcoughb=BBa(Kx4x1 zDXmuAveX0mjQow#68t+#t93-|u~CidUR^UJ76g=rRy@$!D4BEfvayCMCzIYHIGrL;anTjw>} z`I=C~*9_mEQd(2O^0VHA{QaN^>yA+b=;S>?Irg!3bcg^9TJ=9iX+>BQ@L&9Ql-B4O zrPTrYZcV~5PoYdiAonLqOX7^u5=2127h$bf6}`Id*}YsVk~|w9GPa{!yQUr<#~#7o zDJ>-S+@e9JhYwlRFO*gUhT!TsvR0;n7{GwwpD3*WA{QmX?;8}yzC(?22vPA@_ z^cPC23P)+hKz%l8q4IFzivn*awNU9!l_cy5ty27Ef`iW{>Ss$UnniX|B%wsBpm7&^{p(nyx^VpOfKzfLB5f9czkJpf0wlT zG?Pd<)$#%!9DGV?J^Gc>y8An&^%eTTb{Wl5L4Hi}xP^k{j-TNdsN-KLtt7DZ@03>N 
z$lY(Xenc@)(N_`GW8BFIXNg}aEf2zq*ngt5GLcq;@*8G+ai(GGxqRxf2Z@n_u&aEq zYkccgI@=rf3H?V}i{~juGbrp+Yd~=5_mNA4Hl#tS3V%>q18z?r7*w09eyHANvBT=v zY6TOzw+mR4Y(I%~?(&m)81lwtZ8o%annhUwzR0lL5ECX1UM8?UqqIc-Eu}S%-4+9W z{`tH^!*sh$c9~9dW3WHhkiE$@d z|4M0f{z7Seg>vh|tbd`js{WbM5=Gq9JEOD&ex(3A;KK5UlZdcmiD6J?HqMwwOOspbH+11*|%#S>` z=IbqAQkOnG*j(+i6w2H>lK3=TXO+WcwRH5YtaKQd(#$ir$+b2x5NWbdo$R6alhXR2 zcMJ4}Y0KEWtoiHpFxQsp_l(1eApojqrMF~0gYpM&a)uQkvY&QUVRn~ym?(6__tdD2 z#48l*$QT9j|2ve{D(YWTTC%a;9+z^a7ApP~rKNHXU)cJT(h{XFAEA4ldrw13=qdd2 zr3X*!t1qcmYwPK~%~367tjq?rUe*_1ouDDV@^sQf$LET%gj})(r^v4+b0*3e_U`#tCZBrT&?Qy%+Re#y*CK3 zObrpdRT(R|p)xD>sI;oWEOkm^@zPI9tL(Puva@6xh}OrggIG$=?Z_+R#bzhtgCqA| zjzpik{Z@>PIz4>TJ|07|Uwu?wDAFFgv8-cOH=37w=*~46J>%OPy&+|xScDeO`JK{o zWsl=h6O5%16Xp36rIj9nqqK;h;wUY2$Qh-jh@-Tg-~K_Mb`bhpAobr+TJwSPUZ<4S zImjuc^^8r}KTrsDeK}&EprFuUolp{Kn{l!tnAAe@c;wQkFoy8?|z?64)+}aNDd>kn&t<^MGV~o|_Bw@%l+=m7Y;rnR;z8YE^X6^eKN& zX(jzZX*~f1TT%T^X`R0=4y4(4Yd^2B>|cqmOc0ZV$NzpW3?UqV$mI4vqqL6N@$F3G z6)yh`rKKkhKo2P1&iPwP>*G&K%jC#&t$% z^|F4P@b}8aQCbLfF81W1%)k=gH=pte0_;cKFIPpMQd-#+3QEE*+01sA#icJ^rt&p* zVplKRLO=~OnmUQ}ag-LtPf9CwH&jtA8AoZYlKEpfLBtdz{xEU>l{jTtGc38V!wpLE zARc|uv7Ur&2TE6x5yr~zlb3Dc{y}L;;QvBtVL}6_Lzkn?!9Z5@DW&y|m<2~^l_zOf z@3{|Uvw#c1*Fyellvb}>WTPaQ^`%DuV;3!$7Tg~99oR{gv=U5Jr9}Ezf>${Oi=(ut z2}SFIPAM%~r7J}ox3?}4C75ke+>V}oTN+t2Z@GK1xIy9}Aw92cLSR=X>%lr9Irs_r zClzqZs&oJS&PT63+$dWQCwCrK%!__k0n=hap=i^7t=^D?FTgu=lEn=vankr$sgPS9;7#qC1%pkfEBDq4mS}9!DJ!jE_>oG(@ zXn8=i5CHt*1e#bEM`_g{D4BGfvYAgQE%pw48U#uZ$c8tJhZlGM%^OE|Yw+3(j?xki zX3qTFEo}moo!z8kepVl|Qr>n(Y02X#Et#K`mV1(}?4KzuWruirin~AFr@VHGzD4yL zr9~G2EXPw%Kfq)2_aW1bPl-G?wvYb4`IbVtv(KnUm; zBSeBNf1$MI@yMzaG$oFNag>$=?d|eTQQh27p|<4?&WoS2Z&ZEoAi1%yX00BmALFFk zraZDHEcwE6i;yVXx{>V{N-O>!l-4C2r6mnO{!VEz!T_X70h5meN1yB6BosR_9Wewb z;kpO`(lA>v#Iy~f0SNTS1p+|^N@yw|z$OHXx5nLrM}R~EbW_v<1!DvN$XgTmT^Y{anKjrh^qv_?s6cg$%f!TD7DML0dDWS$Jzx3OzE*Rk^pAb ztVR2%-O03_l(sn?B{Gk7vn>H_1RDf^PFWxzN$e4QI-&!_t^{UB95WdfoIFG$ut17$ zDx)4DN)lb}K%)52^>akZpu41`@v4n)p6zxoqgidv0RWVRRwzk~&0ANhoX?yTpXkyQ z@D_*<)v9Xcw?{WU>s~ExGbPCnpM@LJEQ1&ShFJl$$WqZ=&d1ZD#uY;%vv=RJW>iC7iucoMEKwb95Hnq zEj+|3>Tb$(U;r5yIVh%#paT$W=HvN@T`>p(tC5`a@pT@z%Q$ktXJlhubHq3(p#Z%~ z;{ny8#6U$76ge@9NDWT8(7eO}cQ)uy!%9%@>jRJAQosy^Ze zXq4)`P7N7V88ji}$rI^~WACwkdFx`e)jt8DMQ?m`S**#4oHbBA|Q7^U79wl?`;-L(e+f91ULJAp`iDaFiCHDsJf6^Gn_wjlcuElYjvzx55<3U^fRX zFFVBfAlN@CL{I5_26x~st-(Z#L43U|YC6Dqx{DLzbzD_J(l;rr5Qdc32>du=Y>*OM zg~9jYV<=2zm{Gx$@`MgaqgcLR%5MZFyM|O1`0$7rS_!z-h%$Z5C?yZUxu8H2ZUD=c z0i0rxe=pDaTaW-RL>QJhxdG)nxJ>$%I4O6`eZ~`b-mc2QCa1DS@2PmF^<72fb(MaG(GtdzUI+)^>280eG zC7>uErXFr2S~5TAF@qoKj3nC~j$@R!;zk0oS1GSuC>XyOY#AFMPwERBPtY<;j}{49 z46_%QV@Cx7wG{3b&rm#^_#raOqYeiHW+^tRsYGX8CojpIi=)F_%IXXZu`|}CtAJz1 zZIS}bW$TytDnS4ybW4KYnd1x= zptEqJOlKfCqTJ3D2=PKf%T0=^&E9pIslCyq91W&+IGvUdVGMvG11L<9(B2vw?3ttG zVMyLTu~K01XLH8pm=K2hUmp$m#ldlaPei8Av5Q-y=Zwq;|rVG*Ry&M7VZU9$CN# zPw!Hqy!MKDq%4jRK$FwZl=pfe2oQVDA3)dDHW`y6A0aEfkYsI^)+_#UlDPMZG3$*B zPbYOYKV1TZqA~(x(2QN9^6(dxGalSiV>m|((BxqXq)Lzmj|AIc8lrHH)+d~!rI*pyK(0s@^ z`Fd^_0NJBDyrlZLYWy~HK<1+DkpR3o_d;gn6xFEB2NLtPFr?FJ8&DWPxf^%x)+;0A zaGWI)chHXPJ3WeP$kq~T;vplcKCC}BMc^F7xS(V^3;za|=kiF%`67WYciAr}+qoOq z`6Gkoc=cj2fz~5v>4#zOib`QE^Nhp$c?J({>{;{L*`qT@YaXbGHRY7M))*&y!d^8h#}#7$upT=1o1>+IT%k_t zV#>!kTIvvhH8|8fvUwd~@{V%(*0-)VboJOSYmHd!O@D5X?vp$v0X#Fe07V!tJ-|__ zErbe!cR9)5;y6fjogqiVG4BhZG6c`K4wS|Y73G#rHI3J}9oFH9XV+YOUDAlWUO?RiL^=M9a1P?`4(3UZHpPbWC;aOBBc@-F@A<Yxl07 zB3chzUX`iCiAPo_Mr~rMZknvOWu{)}SWqU{H3u;RP7$qm0PDQvi>Iq)XNcC5At8dL z>-67Dbp=ilEsj(qDL3*&NP6So?F{1={Bl{qDWa8gGdYq?Pcq^Z(Q3PGbrEcw(R1uqaRLTPwY7%Bt1a5B3o5 
zGeoNwDZ&M>alSQ%E3_dq9HJ%Sg+sKEwj;}nK}f(ZD6_H|5{i6tIJZkJ%VxaRr9gNd zhiI+Z_7a%$C_egptFBD zeXE87aE55rFTXmJ1PI=AHULg3Y`kQE{*!k@8JEDa-MYNcZBflY93?ImX za7Y2Th!M2%&h=ynhiD-kkpx8?yY>k-Ce(!1`O4)+c#tzh%duP*hiG+fyHTLd5UrzK zn7#8UqSa($#xdHrghRAmH6AvdB3iK9RwE8%W$s&@?k+>G>zIK!M2lHr_TUWBn%>4C zTERj9Ya|ZQ3i~`nwc)XQAxBM34SXstZ8Fa) z?-h}Jaq9;uZM442wA~R8OAiR%uqP<47y3byw8~tQgM&jK&4NsUovDDmRiqKlYF-a4 zlSgvX;Zl=u5q>BLZy3&Gh;2i{!L))@_CQ>)Wf>{~xM2Wz2;yY~9n2kHS(*sO0!UuQ z;a21Xf$RrLSb+3e55xjy*M{d`!{8-1g2rkGCed8vHn3w8C&md{p>0o=RSl|JyTDC{{g6fx3JHaMqTExuq)wVZrnF1V+X)2-I&f&60a}Q3S^5*i zZlE0{fw6gS+{mkp+ca~RVM@@YffOl^?agmu>o`h_Qa7f9_5DQA)k>GG?-D;Lt%wwI ziTM|6g+0$0C0npR`qjP3#CSks$*v-l{V*c>VsY$&6_Z5a8?xKkCDrr=d~$LHhQ?suyI;)Xea+C ziMNaD^Aq0!znxNA$=9VyeN18~J>J(pOM8WdC}pHgYBPZ0x4X_~qrY==rbfu5FmM;i zU0jsf3q7Bg_?=3M$LkoA0C|@6M26SVh1VNLY4u#~x3|*8h7?uw6J3#r-W*GoERGD; z=9U=x{=7ImR^}$m-~~N_YiHz>!WV-whx}FQDMb9$lOO2@siqtoo$Y7t9+uQ}Xz-;o z3f7?(QwFI-w{tIh#_>0Hb}$MxfBr5eQ1|UUj?y~*ez+60GN;2Ra80sbQw&swwqUD+ z@sKsF#@d!U$t`3H_Y7Z(!4m*H;=kj|8dhxI%aK~lBEX}25+maL0JAYHm0Y~~R!-|h z+8c==xpu;s7|07Q+2M^Z(C&)CM`B}9V=*0E=Vj&BF#0-r&(e*|7wHi^H)F+}v~>d( zsSeZ5g7o^e!q)2yXUDlwlIv5{fbtx9BR2->e3(fOeSP7P} z2C$?m9D6mSD7a%k);X2e#i%8u_`6pqe&7`CWXY9~d=}FbbOVo`8=(PoG2;JNbiA{mj=AfIUL~Z6tmU z%Q%e(U*SEx4S4Gr)Cj9iX}w2GONFA5w#M#@)-jHYZ#`gn6JD#p-2r>z=E$SdJ}O?o zyvGyz^vnoRD zFJ5M&FwvA)=Ae}0?(i^x2kLW{c{~uNb!6WQnW$^`El|Q&Fl%FWcEUryx}ZsEl|!oi zorA)~L6b;fuebgih(@~r*iaXmBAmtL-)cJRc4A%e@>=$-+j8sHOb@TVurPB@Y&x2T z+FX$B&*KVGdl5qIpk0~#IfuaG10mUEVI}ti4MycpguFN&YpreN;wg{C2_?8_CaQO! z)h-nt$fY*jAT=XI_DYX+PC?We^RNhuaU zB0ri9MqI3lyBvq!d>TtAoxUdL#tYI}%p#C>z*Ddg?$dM1HuTd{hJj=9&g}sREid}( zhgmlWG$^GWLs;nYm539`UWSXSuR`^5PNu3~RF}B;@>#6%PS=lBmwGnyUEeR7ZvI$Z z_V}338pJo#PEk`HocH{iy{5?|orl&WkqVwP!ndE4XASJ`UO(>TbmYg` zRl9(In7p@g4l!t+@UrSZD6NDV3E>D5$2;cZ6G}fREm?C~2#(TvT4MI-(ZbY{B))*D z;JqAk^OcLW4Gnsw_fiinKHN>LxyQX$XK|IlIo>9Czc=uGc}~=*6*XF7G>6loe zBp7sFJ$3+9DJpy{$hToQ49#!flUtQBH7kScTemY@=ZVkL72-8d7H8DB>$Q+5bH`zs zMnY+j$_1q=@WVEIK_rLwc=VQNqy&C0Y?ma_2*MFiKm7QIM^k(n9+zKF-=hga9+3p- z_`M6_qy#(ru)!DV&oHByhvJx4k90jB6-GBPQ*=q+t;hYb8KPVg{9e0S#&oJo=98qX z9TkJ;D{ry*42hR|l-zkga3fOB`i4D8v!>6R3pf0^?q%tz-XWzpy!F!M#ft|5>3R?%s?JU4x~bOq3L#FPvV+8;>Z3%1x4TGjWmS=4&v1Tg*o^aBe*uZ(M3jUS+sI z*~G(t7{0Ym_9`FcYE39RcmGIg3@m8l|6v~cr8I)Z?5eX5l})a^IfvD!) 
zL(CU|Bqnv;tR&$K?e##0xsjKw;AL&CF_Z)TqtBBdp#wi#I$q68yZCyIw=KLrBPjjhwq{u9$0I?<_7VK?zOCsxBf5^l>78y#%b2DVU%~0 zL2v8z>?c;$Bi7tms~Btm4Ms&#IN)9KLjP0cL`d4R4Lvz8QgC)KUxCWEE1FyblnPz* z-|%K?nKoy=?)qEbwyfmy(9dI7eD%`{jXG*z(=vpIrLo|Jr2$L|rpj_NS@iHx zDu;F%C;ufZh>m0>$w8IREt2s%JJ%)6e$J_D%1gW4>HTudEA7^Ave;VfR-$3!jt-sF zOC2)@piFs_J5@})Q`rJ4iy!65m6BiIe!Fsh3@Ef@o6m_ySBIjZVstX_kAiZDNv;k< zF@o%L;*Yb%K8~*R5(o|$OW-IiJO(mP4ksTDX`-bIUG35t95VSiGNm_~IVojZIOH-S z&Uc)w%JuwT?A>KtlzH1f{u#QN8M?c>q&uZWKqLf2=@bJLbcXJhknS!CK|mO~8$?1H zK>f*Y35i>%M>Yga4!R>G|;cI*!lrd1p6uz{RJKlonId(URl|lkB$f zOMmrBrAY=1W`&b6;pfW=the&Os4TAP^(nYo9g0LdEw9CMRP3X~o?EFp=cG5GQn;$a zQ%GrcXw~v^)XSKqjNzpu!F?Gjl8Nz`yih68;r-QZY>qy|w!UYT%(@X8($AN5F<1gg zTI0lUFziuj`eRd^1}U(?{rrk^iD2$^MP|xeLjqj6GOKIOtFS^JE=~1a$M8#HESIil zVeHC5q+&^1G^A{aj0RV8gmWaOSIp$ zEMRmb&%@Yg6Fx&(@az(W67i5^74)~!2qWOav#l3)R2*ESb(l@RhYgxX)t<@DnCI|n zZ6HXIzZK(fcwCBE0V3S0F^$3F)#^M*Kq0sG%(2zQ9^}@#wzJ3&E3 zN`UI}&UB*|TSK?ClduhjvH~kP@<&k{hi651RHPRaB`a>q5@(8ewYis|a^+z>ddX%V zm7kk}&$14_vu)Ec4ruXKbqY|m(C&(oNC-iQV#{ z-=~ooZe{h*V=9p!GK5|TpT48cO?kl|@T(V!zVQ0I`i~M!AC)0gA zXYU@^t~=()xz+Z=Y6B9B0^((GZalX0CYEQ!+mJw$Pnv`0g!s}k`^4CD z&S<@-i-Yqhq0m4%szIm}QoyIZDggnQghyVyQ9S;%>-fFCvow1BD(&eNYJ=2h(CL9i~@ zLleir>E%#&!vTDVxZI+JeOrfDs}#VI7~io)+c!b4 z>v|EKMygAyx=@VYVO?n?co0wKOx_&|lqoFmx&&t{@y#WMPldmhz_VjD8gxX~hCE+l z%v52WO%2vtk?`Q|_OY#qcVtZAh7PDSZ#xW8I>I3(EjT=_1e!#S;rCgA%y?02lyJxWx6;|j53U|nHD4)b#xOsK9Rhwf1aP_uM zgs0;N5_qOcmoOHB)84|FQMmNW&d}g<3?aub)Qtg}ZYWg&9gY6*`?RZ|s8(WO6d(l% z#cl{z>kL1W-ynEq)I;HxqXHt2=x8b19No6hF=DIn{iNWuCs@uc?y|4qBJgxQqSKcjhjutN`^%}uKc07X?p@m1m)QBlsfl9|h^1t~($m4F=>V%kR61@ZC-Sz% zu(`?pVp8gYRg1w($|(9v+e@Pw%bYx8Dyo|&ZNj?S(u;}-7F(i8=9mwhCvm{U49xgXM^SU+wTvzH&9$Q z@m!85KW0{nXmkG zt8|Q1$lyUB<0&eW>^(xKJ8Y_8b|KXFVb7h`6?>Z$soR0`ohqy;xCBRvxUSFHd|~tr zd&CAKQl-KgiWSU9AmEPiU~>hRkV2m*WLX)2Y@Z%hw;Ze7!O@i97CFg&f@Adv2XrN>RM{hI zEL22kjBN)%9PRSCwVztq2e-XViWr9LH+sVH)V^RW!`IcBCo-isFPsQ;pZw^o5aZ@D z@woxNN~ zWH&5V2NYvohu(VJ8b5@@U!7#V+JraH;%9d0uu5VSP>Y?+a2Eh22$-o4dwRM`<%_q~ z?h&xp(6?$Q_AD{Db#nOKqgTUCh{eeWGO~41jhM)TCr9K4j zbY4}S&<~F^aC(?8>Gg!?a;y|~c4q_?DIxiI%X7BNmqt4IDu(=VzI?=vV5dMRN+r6# zyM@AIBvU+>fQpU6z2C-XYRcGxA|wnX7nXnSBRSA;<{*59u|PgI;tP!_C%G8GtW=C_ zN`uW|6qT1kQtp=y5jYCb6^#?2;FGrFIgT%beyJ`JG!mu{uj8 z`8-#X?wwema|jI!S`Tb7I(>TWlA0m;q1~=wRmFtSD!H|_(Ljl=oU)>10$S@lE#bA* zKDRV7X>R88!u{hH5r2!M!nnd;GN%%+z18Ee{sTO`_ACkKzO42BayO*9r@cKl7dO^5(H=K6uG8uW#i}H4{_zkHrlUO442&= zwoa9V95V%buA{UGGz!LJIK9u1gwYbm~XC;$$t>QAhixTx=ky8Udfuq#gZB~s?Hy0^8Pm5?kQ4*3;(5N(Fn)P(J;Rn(l=}A5$$NtJOfj6LXRDE#UjW&(#{M+%Q#YNe zRdAmuO|&KwYo@TFd+FJRCM9{)DVAZ_FQRd6Yhz18>@#ee zy1Xg9W9nWX?nW!q5O+7#d{ANgEwW0FHqjy@%3zX?rzEMhS}Q|iDfXU6Vh{dq-mT{r zt^4&X5G?c_B|0m}W%ZQDFSD2+4yGeZL~*;7Ymu@qHv^MYj3aKz-T0!GQyW$lTCQj2 zP5W#;EBw{^!tI(}{Qyd|+zxaK!6fW-R(r@$DY5;|2ivrr%gk7!9!J^O4l%YX@f>x2 zj1Z@=5eyRQZD?+pSH7)AdfmISzGsZj)fLvu(CJckTJdG6`+}kI?5^!@t!$9A8c!LoV|Em?bI?QVu4~p3odl-7d#U*HjZwv8Gkow|T}Y zoG><#C-+(PZlk5Ah@r6>hfh30`g1DH$$D;j+R6LdCFl2(e4(qV59=M3mrQ^I=U~Eq34dcrM6&h z7b4V&V0#QMk-VU^?)D@ky779AThV@qioNYO7PnY!#d{vqC9x!s@X%{1lG#+1je|mc zay=w=8zq62PAFVyN}2J*b6Ax5jfVM`QKr2kw6;Vb+YKc~B4?u*F)#GYd%`}%0?8Ny zXd}hEse(`Y_lMd1E^E34DS}=wFYZ?O6FOOZAP7%fUWR6e=T2nP&9m!2rL4i0z`I4$ zpD}ue5w2nJg(w425@O?F7^#NF08T$rlv+>-Rq-mo;x1m3LaS`COTv$Qp*ln;b; zu96+qbdd$_HO@sar^W%7})h|D5}aY+Yx5}>UVtrDI|OHvukvMjoN8B~7a zOQFS{PQRXkFBFuG^xC=}=YhcGD)*n4-{j`)lX!|+XlWV^YIlDYgFUw1o>z^WJwnw3 z3pK5raYiwP3?Bu4R$_z%Y~|k!HkswgVa=RU5`Y}^VpY^TS|e6!Ww{G!58EBx z)2Zg9yfy3R!<7>9fB_~(_q2wS0v(odNKsj{tajcI@~pQE^zY!@y)QUo89rlUmw zqENf>XPZykb_K*fiqlf^A#DZ`pfkZ`Dzol9lO^3t>}iZ;GFj=rZMgUB&8)R(GsU1= 
z{9ek2sg2}3#gOOZUfS`jjVyv<7)H9Efo*21NJlvmY`C9AF=wkPOF0@Hzn{ZlW~X6J zIhHiJpC>+Nr}KnzJd^aG;If(h)pW{vQ(knkgqM;}1&g%p6SS5tOg%ClAUz z=N!xtlv8lh!-`-tM=Lt2H$8@j&y(gHZ5rCCvKEwhD|Q_nW}eaRM`5wG)1$jQq9QDg zQmZXD+tz2Y!q;19<*eAn;4lReJ~dKiSy^yWJj#ZcMxoH!TRJ}prKFvUZ>6e;-j>;K zrTbJ+D;u%stf;2AaMS3hnd8-lvezRLk0=E?tkFO&HF>;9wYrAfb0=+m>eZxIb;OhO z`03j(hZc<~nBzuRknBgTOC~vS6>|UaLi>lEjG)&Y+7Ttm%v*>rbtf}Cf)7GW zY2L3yVQRnHj(l}Wy(1Q_+T^esz(tKZulo5?e>8)?vWL&=^q|^{RNl`W?m2{da#CUi z3tKllBnRlCk4M0fkOw-mEaC5tlw@SxdE?TU>!Vtmga}VV#?u&+(#2_q7h@iKoYH;< zcrOGaAfDtXR5y)K<^3`}cx`8>sLDHX~7Qsfzn>mI8rjRe}Gp5u4dsnTIs#u!K~ zf$6$XPb{4ua%8+>Da!ADSH6R#xNd@DKo`7eSMC|M@R(6hhk(`+B=He%FoNZ=4$d|M zU3d8rZ4}FbCCS}w{OWL8wCx!~Lh3RLPz7x6k`HwwJ~dYBT>AhW!VgAdXo+^dNNc}H z9t9(+SR}VcI$ZJq2N<1w=6oU_`n3vlbAF7TOwyc1###JDqhWN*ifQ90BKt*JW*_X- zVaC}l@|!RsKY7MaKnf|#X=Vi!E;vz;E~Bn4`t@>bS(gLmrM(p>{dVRaNC&r`pKO{R z+X~5WF_4ML6O%?D&VYdI0U`@R01M;ZeGsjTGiLS$gONIF28ajIEPb6N77G^%#23SPgda1^RHQJuau3<7y@Y}KF$PH z5W1BgY?dyc@d0uzFo|?lUV>M~dFe*CW}+4e1S%+#bas6R_t)Vwqa%oQAR_GwiNP+3 zP1$J%;t61EtgVL3`fo&@e7cwE#gw*ks zbfyuwclG8Tj?O%W&^Bg5ATf+#QS@Fh7=g~9Zp-yfFjtKpPNy3u<{%YR8>f|k@I;PQ zMhEWkHNyMq$kolb$-tEVwA4L1z+3kRG5G;hk*ihjBbAm+zqD?h@lYC z!x0103c66oy5TpSJ^>J7WI{lZw0Pxn5!7b(fWpEcpk*FpPPvD}43M`AvlJT5?N1->VD*b~a73?LhZ@&wakRI^T)2l%^( zmbfNhwp=LANe1cj2+91bhi{?u^g6UIyX)<8jOQ>JX}C=B7%q%Rr(e67D@En{USz2J<1%{7`yxYA|Sw(o1X`*~j|52Gf;tIL$#4#xYnIVkHe1b84 zgno~KR_G)C3Gll61l|1!_UreTO-Zfio$zukD8nZx?@cn6XF&G#c$0N(gC~tJ%Vq{M z=+c*SF^BuQ75EJGNgfg z@D8yR*lTrAVCb3M!yGinhJv`bxv4{^yN{c*l&nWV9V&q;R-Iqib z5I+-|C*vWURKO?d)F>Kh=ro5f=(esB>@yn0v&?|BI^;On6pkpT)>iIB%N_^NAL84~ z6W1-^91b!wiZii$GFrzFJzT^u$ncwqT4(jaDIknBF`qFH~bAL z2V0h-fQkM|^y;4W_pYZM1SZs!-M|ON4p|&Ot0Jr}haAe|H;MbMfet=P~~-IHvN+yX(36`Dssf zksIRW9C}k#5X})xw}a8Z$96r$62hI$5j8&#f#u zwtO~ckc`;a+B6*`GP-`qKfOCGQI!93H>g6q>osK6lAQ|#RQ#PS8aI}Ce4QMmo$bM28f49?VKi#U0FSrbt<4)V(tJ2=K=Z_gxZ{K1fN z=mJJEJ!b)uh_x4!NZnh54XY2JOb z#jfybj|zt`%kOltd1I!9kQz>83MWf}Q_UikvQ{S3R<@K@F66F{x=qNmO*ExVa<)yD zx*eft+OC??t})xLL)~%JwBuSzhskV*Id!L%X{Sv}r^9Tg3w4*fY1e}kB&BuXX!)9U z2c~p~&UQyq_r#j^B&75t&-SEK_hy^+=BM-)&-Rv6_f?tpy-4Y6nC(M0L9}@V&cgb! zefo!H=>{A7$4m#_P!G&C_D{^x^q&o^QV+g68$>=XQh-BSvx9qQL+6bHiz$O6Qo}P| zL#K@cgmc4~X2Tz-N2#Po`=my2r3Y!eM+D49Uz?8cd5^wp9HlTD5%nIIP8}2S9_8^K zoUG{{*v&VC@=VoY2bJLa5Z)#F!V&`VyW(!YK z7lO~{5Oebf-V1%HOZc>l@5~m1Qx`wdEOnSIiF|n@E;Cn?x}w}dx8uE>+B7FkJGYuT zH3p}6-?SLqw8}uc#zwoW@oL^iX3_1&3ToPl)tA*p=~bq*RkJTEkKV3qm~GhIKyyr6 zkM(|sO8dU@##EpgnzaStI2_9wx*0AlHyWn!7sb} z4zQOF{KO=?OV+$gN4LjDw}(By12*4fn%`A4|HP5L%R{#(lD;5&nK=TR3{ISiG zV_9=DXW7pb>8ID{PwAc<)XaY#lRfT8KZZZqOGliATYOGWKUqYa1kRrWTZ3L+QL!c; z&l${+hdm=6Og!k%B)j*BS|)kbHBy3)U_wN{V1%KKf>G&6GFUJd(fgF!343rcI0yO6hVG*>fO!W&1VXuMFr*tjV~qc* zq^-HuI4+YsBRw=+;kDi1X+0&(vH!X$>2MU9n(HW8NEcKIwM#;~;&R|Oj! z!2$w)v&I%B*n8@jHryMxvXT8vnF6SeH@J5#2_7{@q26E}tUx2xY!9@GFs|&fij?_+ zbt_7&KHn<3CihYotwPX(nxnYb%_s@mAbLNtp$cdR!#EB3v`|YBv?K1Bm{BSg?L!71 zLGyr~6bW(eZUurhF-8=KSx^gO>WvI+s)T#?p4n+$l$UKY)hXb9Q6ap&w%HLfDmj@k z81crJxjjZGM4x%UTe0f@_%p9 zH2~oj_E1RA_c`$az30A?Dek7->*!S{jpI~3s;$AJ~kA21h;aFuB}iX@aWMyCqcn^=Pc^^uD-}BA1KAx2o=j zR5bCJ5p>>ZYsc2;A4Z*uKP9P|_)2MU3AEN=5riXXH&*k^l|}X6c$~(GuUfB4TrKA! zIn}GuxEDnB(zIu3<5lE=CT$v_j@K@+Wb7L}&u&5Oy+K)7?Tu!gCog&ZLS0+g$6bb= zdPAuQA0I6t)QYea`3b#G1~i7ePpNJ8dbyZi>c|?HB#1u#a`G|V7uw^|sLq^PA z_flMD7zT$c49_tGNom2c)Q71tH5btMg5Y@S6+V=k8JI%daKaE*KYS&6Y`(fUB3^z! 
zLbM?v&fCE;{BWAH;!J#(byB?nfHzQc9@Gg| zlK)cD^+;RN!@k6XG^7vb_oyTh8 zF_r$e00N!^3=uTD1*+m9Ws$YVR%0?RU75S?TG%JN zd*{-tS%!Td4h=qtkY|hE?aOIy(aM5*p54@Iw?p3aYbnIW*Q8jV+`0m>$+(lUQk^;Z z<#n$itFPySXD~r*AzaN4SFCmYd5Nx(!3PAaX={l#(qj*XyzcDro$cEIP(tvpzZ65+WEluY{0xW|5;?;MwU8#ubfwl!MG9%%>Y~ zIqIR|OlT;oGlt$0)=L#CwY%Z`Qa$yReqPdU;dSVrtHJ32O|V*>~6ABEAC^hZvPW?-tRy6xPGgbv4BY#;Q? zT@NcZq&SRqLXGX~vza4nK9JIjPgS&EMJtagR5Up?*6t+bt}5k`Ho zXEPm$$*pUacINGpj|mRA#MR%lWggprm?_tPT|X3Frj`)OI&b`-a8a|=bEueoI6A%o z7}vo%AP~Btjn0%r!RI<>0y*lv#<|ZE>}?Wi4qm^;eNF;@LPxa#?7Q?5!04!vaTAEDpqK5%A&BtZEFTiJ4d6;neT~KA%CPt@}H-@yxIoNBNGfTLNt6 z)K=jB#Lmp>%Fu?aXzGuNUG@8JY?Y8(6hz{A@ki$J?4eJ|TBjOp-kytrMWG3?2%zp0mX-Hhr8Xq?|BU&{)B22Dq6#*Vu!&b$gJpz}=tb%vxHt zUhna%sy!4Mn}VJC;eg3o){OyA-yoblOO;gi)3BZ1sYrerF^=93<2awX6$Hq&CF84aQQbuh~<}&(HdL z679)GD>1nxI}!p*4$9qlHv2&hlLOmA)wy%T_EpDqS*!V=JoUWvR^aBN*^W-pMs#f#9|z-tDC+eT?}i(XsH7G?X?rwV z6?Fy3t}Y6O*ZB%1Y)6Gkv440a7E33Niq4j+wy&4r z3CF#5uiQGW%r-InA?K-H%zU*HGTO`kT9NIb?P%eE(6!uk}X2= z?ZD&vuTIUjUDh!T{K6YU`dyRu!-4d- zAB0i`max+wj)l@@ln&j^uGBuBefRd0&g<0=l7xNpkED;PWZ8E^T}F}C)W=t`7eAzE= zK7a7`%jXZx?Yqs*2m~P%$PO)yLp(XPffi0YK`nsdc0=)}ywN?NU|U1%bts7~j4TK? zLgoqShB2$aD6o7Okd)Su9%u^2W~)x8=EH@gv}|FlK|TTnDxB*+B1lS0>||fl-gHH%b@#)HkSYOI&nQ)f3tOeb1XTd z0)Hzp5mT%Ho1g%F_JCX60T8PIJFGyrAYX^6fP3sbF6@Dx1%Zref%n$~!+ioDu?P8f zL%ojz1Id|TYC)mhL5DU$L0G|3SwZ1yPae_(A;TDeyVwbd=KSKr;06@*a9CF8gDF!IR@`A<3nWe?lH z41KN^`Z6eVBRdpe5i+QVQb``x;{j{84V$T>>JAEDdl6PuAmg+UR&N_VI4m^-3V(eR zPEZg&ixn9)8nR^sTbv4eVH?qq7`zO^`k)rMofxvq9woaJj;QlK6pL8J3U9yyK!&Br zL7@${QU2snsCLmGu0#<9dt)WZtOP~k+J(0);Ek|HQ;I`BdPH+zlTp~kbazD$FGmky zNsrius>(&6S%mX&K-mjpr0Yq!3uCiB#|#n0t_2}49sm#k0J{aiLJ{|+D^`Z%sS+?w zXXC!&20}`?K8ju!*iaBEI~=!Z`}Bb9=~cUU*(*=2>fMdrNEwUA0strvDod4H;`<* zI4U%b>r#8N zb1SlUdY|Wnm47EB-$^C^o_fH8XZcUmtseE}8?Wa>?F(jE3Vd<={T~*D_dd2RDtOsb z5TRbkf>jtz`K-{eFxmL=YCs{KI%i*3VRq4jg*cV;qKurI_i}m>Mb4rIxBZK7ru7sj zi)xAT9nQWn?g+6hAB5qm;Yxw0N5` zbLXYges6hM_2thY<=`Uqeb%D$XQgRfWk83lZ9sWe6?!~9E|{y5NTZU(p^_}L@-HZ@ zN6)q5G4b+BaHul3TQF9MB*crWB>TQoT3pqN8r8p0TE*2GUnwme+?p$=Rl+S8%j$?K zsrLlsu2q^XHD(`b%yDZiXKGk1(QZ4`+Jx5HIVj!it9ANNdsCyb$4~1N!Q2XoxAq*4Z^sKO3nFJFQay)!Rze zbML%xY{3W{F1yC{l6zOvgu&q+T;|ZdP%--DRfdex4=;;95arI;VcS=^-Y0o}zsT9G z!6)8PQ1~Srm)Ll@!7#I-KvK>Gm{~zZQqDzF!_^=y-xy5l&>~6Jq|v1A)_4cj)J!GV ziu+1>q^Z-PDKc2l*ivb&j|4>mscgM+5_t8tpGq%?!6TL?9q>l&KLH=Ajsi125LiKSXt zuC(a6wcL|$F>-6AaBQXuYvxXAVl;(QQnym$wSAUsrY&g`1h!5)w7EyOUaf3p*K9Qu zXcygV;qz)D-E0*PBgf}%6Pj(k(%5D&3z63BU@B?Rb!%5@?6@-0p%&JL^s;14$xf0x zG&I`|EISQ1J2`+IP9+^Ijl`Cl9e4YQ?{jx*Mt8b^yVN(4J?UNc+{j#gr%PDp9j`9! 
z%&yQg;_${UqLS{%{p~S$-I1`S0L^YPfu6vyZq1S&_mr-&6LNW}UTmuFr~Qb|1kKKj z*&ds)n@Nl9g^eOrn>{b^y1>-E5qK(CroGChT{)V4{%5_e(LEh_ecdIUC1HI7VUQPG zeJ#qL!jzetV!Szh_f=#N^{=s_SKy6shTdyJXaKi&?>h9RV#>ydF{9!cYg3@W2NUIdb zi58+YLQy(GH84W6HA06!%D^+qq&3RoG|Cn}%27JXH89GvHOhxSCcrZ$q%|huG$tB8 zCSE!wIWQ)@H71KcF3&Trs5P$aG_D#xez|m9V_;lsYg`9^;tJ2iRV~DXfz!mb@QLfC z6D9)_W?K{H_>-1AlU7=jx1Ago2cWjEsVcoQ#(0|ERY$;KJ|d=IP~TH{d2@cqen(o%>gEYy7^b z*@Ikn59kfg+`p-AZTZU>2S~XFl)?fc#{=}7gKBOB%iRfnemA%rNp2y%t)x(&kD;os z!~fdb+K#*SZ@jJDWXsT$m$Rwv57Tazro}F#yC-E7zsHJHZO)_G zyyV5a$3OA53jOwq9)2UY{^)JJFArF$43B^A)Al?su{vP2I<~W>@Sr9*r8cCrws-YK z;znJ3Mtx*oed*fE7&kjHBeg1TKd~kMrh(I8#_5fH2z&SS3w#k6|Vi9qg zc2{J!1%jB=Q`9Q6Izq{X>}R_xv%4bc6+@^mKhNojWxrZv+VecOFM;2xH|26w-axXL z`}^6Rs{EmJIbR$a_3DC=Y_(WUv)<~$v3%WZ^;GqmqKRUoD*L(Kn&Q{x7HuIk8nq>F zs%*!K%=&6eXI{82_oiyRD4T2W*n2@b)&zoa=9yt>HnR# zb?aAe%SibN7Ha{$@^8E?-QRj!Z5Q4aBN;0%tnJUdt!0QQ+MAnx;-q#f_arHi-j>Sm zye-4Ul?!icmB`X>G2q{NTj77x+tT>K+tT>K+p74Px3zQOZPg*YEm(}gg|~&tj`X(H ze(P=NkYFwqah1O!{jIlk=Lc^~m-7Mrg}1dZ`rX^2|LSd__R^$$^R`lc^|r!(>uuro ze)G1xfAqFMKX_Y(w!Z()+cFe*E<62uZ>#IV+d@|U=SKB;}b{8=uK%Z6z3n+Ttpuih5DsjaSpC~?ju zr{8*8hMBqvf6Lo){WEWi?dRT>>)-OW;Jm1XQv93V*1><*+w%R<+tT?HZ|mB> zye)&@cw3;q@V4as(%ZU%PD{S{i?>Cn{abG<_n&xM1sC3y63H*#md+o&EdzPa41QFy zAH6M<5SE=V+=s7vr0PW0wciyyMZx*zZAk;ke&%ff$O<7*9OeTbZgBm}+d5$Qjkopw zf5F@0Vb{wCo1%F!91LTW{;bAG|I4U%V~TpLtuZBQD>)Exq4*TTrC8 z1?c#hx3z%A_KUZbi}bdNc0xIS>uniacw4%^@wRd7%bt{c~^2`}f`!^apQi95eT? zyseDid0Q5L;%)7qsVumXQzSCTyO{pb+lu+Qw`Cy5;cX|;@H20#ZkSQUC;P}u_HTPzJ7@;~*SxKi@7`A7-|)6l{zGre`?ubfDbm{l;537N>usg{;BA>&z&MtuS0ciwkcn@4xD8 z>HGt4OXu%-TL#0UC(p*s&qrT;CfidaVSEoVrY3STBiRyvZG%tUN<{w?Z>#nHq_-9H zkG(CDAH1#Mzu|2Gaa1ZE8StC67Kwyy6t_&53_Ua`{O)bl{f~NEp+9To3-i`m!+rq>_dRvzN*4xtgqqh}~YdV~Eas$f4%)$GY z-qy{3;%ylqh6&M*LHa0(V)&<<@-87gi4aToBl>f^kvX1`4f@3{!y$%&A+nu$NQ`FB z6$s19NA3x1&|i34ogL;#Z%gdI-@Gl(r7Cw7 zse_fGAz9!+tTsXkrq1I3kG-wdzv*o$AiXU)q_@SnTNN9 z_qN!+dRuDWy)9GrpLtsb|K8g|#l3>`wm{#!t&|IIEBNQ$RwTcz@4vjQpuhIEvVu@a zgBC%iNN+3Xf6Uv$RwVhs+e-Vz+iLl*d0Q)_Sx9fI^{3v}mG9oxQ?|qlZ|jKAOyO5= zEA1C=YxB>&Ejh6I${)Qg&VS}@Dg5W&RyB%p8aXP`+ll}Y`TW7#8vgs<)|Vf=t+*h8 z-+5cM%6?aV@V3tX;BCeH%iGfZD{t%SpL$!6zvXS!fAhBP{v&To>L=b77U|EuEh4vw zpLko+-@L6fcBHqZ>v!R8UHwaMtKgfr#rnVCZL$6xZwvYdZ)?59@k0x)0~BQnc8e?e z&azN~l_V~86$B;=36@++eon86M&>6ESdaruNfDv>04-XaIa@?c{0-FP+M}cpg0plb z*_R-&CZc0$NH7L578Fld+K8(PwB80fssI(p5evZQK1f4G0+^NBIm7~W0Z`nXHtMYH6a@J`(`#$7XtRtYR4>YG=>o~*0eqbz9$_0726U=Vq40|U5JNBY^&v0YztQ#;@TPY4cpp~yTG>2F0d^wSE%v@wuSzCY^$IG2@)Z-EPbN@rS4iJwv~a_ zK~&xAjl{NMk=PbMXE;DM>IZCV8HsJ7>wm?z%KwFJK@HWuVp|s9v8@>mio)LF{R?a> zSSpb9tha*t_t;jbSC0hccWf&PE#9#ceP5FjiEVNH0oyWC&GeEeQ9_>$10a6Ewrsy+ zTN+4gOX54W1>^XNZKcHh4%-sYxxlvgb>xuPR;|)EY)kxCYzvjE5o&_OwtNTklfPnH zF9P20n9iV*CKK9SK||05V&Tpej)uj!8$? 
zg#s&(T00CCC8kp> z0#jqdnEX!I*%R0l8dFWKhM7DNHFD^RhH3~8SapjoCR~qL9vaCLeM5sG=D=_><*HyX zaD!Mv({)O@!w*aa6Q{vYud9ju&?G1cO%nmT0en#Sp%UK4mte$B3oz0h(xePuw5s68 zMwjzCaPvFH?QLHnFC)l@F{+A%G{uIoOae%tP|QTAJuL2UhrR>NX!ODMfQEtMAASXBC### zS8U7oE4D?2#J1|b7-- z0^5S74lJGUfE$t6R`Db;-17atF%sJnrt$rTZ7~~a=(`TZe8sjTCXv_{;8m&xPUlt8 z`U`A}nR;FvVeJ@YEo#?38ie#Kwxx!|wlMU?0HpE8 zWVBzgEh8m23jt~*wlzE@%MN`04cjv5LtAuPr>2;=k=RxPLCy$55{YfC&-c?95y>lTF@qIcmq?=8 zsnU$l0LobF=4h@n;ia|+!a!w^1_97BbAupZ`c-#oX7>XC4`lTRwUP zDBrNHtV{Me7uc51@3F0iKDU}m%vi4yCVs`XS~ME)8VOXvdIYMZO@w<@Cn1W5%350;~K-k{i>(%=bjxcdvXCBSZn_&hg7d4X*~F_73+ z?5)HI2F-8SR)w9pf}ef?$~SB){TsGLZ+{)dKSKd6PaC#N3| z(%VEi%P})wu`OsXup7$H8eMI+Hp~)DR%;yjE4I~QIg7-$khSvPu&rg|k4S7w>0O%) zCTnL8&)Chb?P<=q&4mx<3S}xWZ4Ee)&YMQ9>1iUSM0v{jG1mVOw(Un>&QpoCe*8YDz4PiAK5y zZ$d63u`MYbB(?>xGjFlQPjK5Rc5|vVfAJODqUOE+8HsI0bxs~4u`LV@S?Ciaw)N;> z0POBSpOc|;{1kf75sv2spNWPF0d_+(GG3NiM&e}*p`@=Vft5WD{(AF#=(W@4cAv} z%K(XO5j}XU>Wsw@piqZ_Knn?yw1rX9$G5Bksdq>h1W}s7oZ^8VMvKLi>zGoQ{#Q4A zv$j#|1cSrNvAssf9*fT=ybE!WRx~f)xv7+B z<9=UX)`vC6SaXi~|4qQx(6F8+v8JHGeZ~C*anVq^IMQ6P#!nKzWFToJ z{Vq0o;z)`8x$w%Lysau2CHGY{|LY6Uzw)-SPxWHAKYX$As5M^N+%D6~p35JOZs1@% zk&w91QjprYdT-&RMYl~}?$e{DI6+Co~QsI3HS$$8u>urNX_^Xj>R?w^8sV(;W z2+ojn)*+6p2|epm$xUA_8^+Hzu!l-j6&@ha2vlntubqO1y4ud^H+*_5B4dBfjDU?bw&W0=C&^qzd;})kap5HC0N!v(jEExj~P zK~Akz->Vl|dqD*yjeQSL)K*NAc)k~k+7fDMEGi22y?8;Ym3be>q18*HprV)OD_<2j zw2!xXbdgq@YV66!*RLilzLwOqh;OE#s4ehZ(jeu*c9xEtAzx!_m{fk4sy5pIT z0xc+SYd3Pf55REuAxEQ{2=dE)6Hpx*H*v#40&BUQEZjoyTmK^zwRL5?W$VH4(cY>X z7-o;6wjSsU*}sD#f1|dte#xq2nxLpHl?q&FG@c8GO#d_vL1#&`jL2x%STqJF8}Ic> znx+gMgZpAZn+nrTmQQO!Vo3^Rt)ZVBUVwCE=c8u;T1VY@oBeJ{%WegTDwQhX%23eUR-#;UZfSg>$~ zdMDi1BN)KrJs?`qp+C>wB7XT<03ciuz{u|Op1LW*I;Qq3k%j70#;bH(g|q6I95ZX0 zYGSw03Br_Tdiv@Y_iWg4n%#rhu-{rI4r%z|T|ro}FVtbu^?IRNCWBIEEPh-mr%jYv z9YDoP5UYVcwvML&mHc5h1-UgLjddX*Nin)ZyICD|jK0{NC9tSUewSEfMFiUtWVbKB zLx`$?6gxI6X)hX|nbusY01tyJ^izXOaF}THnzGGodCP-r4B*|iNO4L1&ZQLPkfPBp zwretciE1|NbhG83SLk7PCVWhc*|k5ZFSVJbY)O>2vGF1AzY%4&9*L+2B;Cyk>^Ip7 z^DpsWTGn`>#4?QusPLceztH!qJI2K2MoX7`n?(e?V52hG>vk~{y5nt~1&kt8X6H2K z@W_TUbg6^m-S;#+f=fftmno?!&H&7CHgaxvGu~!*E!m+ZYi>bI7HhHy2zhn-A9!0h zeiRhVT&g0s-qzbJs%RG9Y?AqTe>HwkJad#HJlW4}M z8l0`+lJHDT84dYSC*@6_0;+k1+pSr06l{)sJ&W+y*g{7{0kDY=0NLo{Wa@c!1< zMe5di+ZAUa_i@X9+MwyLB5ri1g_le7APEu`2fCQgCXz`3#9z!QsuISVR!Rb00)`RJ zM&fjKlBQDeFQyMOubYHr>fVo*lfKCoX7#|{pD*vwG{$=loxlBgBi`zy%=RTKf*_QQ>iD=@ zloa%2OtAKBvbANfrlra2z^rX%RgS4K%5!|p&!=3_kt0So`Cz=g^ClRVKf8|6S?dLT zftjjoIO1l5HjAV@F-Ts;ln0K%KhPmf>nGF+(mAsQR7|sNgjd85}5gq&AC0hF-QqlHZxeR7h$n6P;{08;?BISI5 zz%5J@LOEy)MQHzM0rD>H@T^gkX;43}9d;*G>OUhy{1# zVrw{DKuejHbdjo%h~MW2(_4hV8X4Tg+(zBP#u}3~oJ>UBltWdC?MjKm48r2W%EOT* zeA9OtU?IZK))?%^f(J!IPd|U@D{dW$_7Dt&KBudPFu;{SUvN12(7A$#+~dIlHyDDs zzAP$A?4&Ok^O5Mn)c4CpAIUP3i0&*2Jv8G&d0WB&sNBdTAIhbAC<|8(u}o)K>yRE% z;wh%37AKq?!4RPgldMh?>+}j0VU&7!RI>4WsrqA)aJVIuO0;Zoi(T)3+CBQkmNhH8`eTyH8=(565S>^a-Usy;uV=4#9*vW-^d@ki(4>&A6~)mx7-jj-6{M=&^P zu&w+PvEk>yXKJz;GbP}OV6{*ZgcJGV!Fz+-QzlkSu>X~K?jh^ zzRcZUAHZhh)-naT8=W|+vW?l}luobpJ(5szmXb`8u8m$nJpFLhG~ALLlTHN^))?sT zqUO1{W`nin(>T?A`y=v5C`!|5azro@lI37oLjk=qEl zW8Bf{)Gpgos4!9}u0_Z-pj;|VR<^;z`vF$~MPTva#v}(yQrfOcMhCSJ${&GU{K0ad zLJv7~u42DzT67t&AJRS;tS^0+n^c1$`#t93#KU=Ai~VXWcmsEA>b&ggXOMv}hCZr1=;&qc~c?y{$CQ_|NESHdqk2I^h^k z$wiWCh>d}a|28aUd#c4?I`eM3Tae_04*|h?Mc-p0mP|m?m(C-|@qn^njuKHlVqj63 zs$P-w0c>c(o+|;6`QXzUZh#xmD#ruG`zK$Zc>`;bx6S zs5n|-dSl`ZR#0YUT2PGUm&lcT)eUJ6>cWJ;!b%?T=8f(^7Tco^<~1XG+75X6()X7F zPz~Z=W`^u0IyT{az2cfj7{zTdHe*V_{xJ&qm%SY>v1i};2bK6puJkAZw(RVS6^e=a z_k8C=b=IRP&{ESC9*j~&*kfKQuLp!4vb5)*#40Ebm-?oNlzvL4CJ>K&mSb%N8c0~| 
z7g*_}#zV`shJzMs97|9x_STdj7?Va8Xh*pPv&cVYEJS%TTQe6O-^O;^SWAfy%Skqq zq)IW*w4CD6k`!ctoF3XlYN(jC^&S5Hn@SxgC0r4+7e(m1Ygjw*9j$&}rBj%1pNsD? zEq{_c|D!?234*)dFSfqdMounb{1A`-(HD5xIeZC4u~g{PDqzMGOnXN9wMJOtwX-Io z_~uF#S#5;z)8>k&|Eh6^Xr zA3MIX;)}((YgcaLR37$=lAjiGS4N+gx1R!vFSV{Zivu-SR|XV!kelhO%8S@c2$D+_ z>>Qvao_8c3@7Zer`(jysivJfFI_V`HT$wq#fV79Ubtojj7=t=9~?y zG8|g7yor75_xdFUr*(&GHQJj?WudZ44(Aw7NX+BIqe_2QqA4QzAByqWN(2*VeBpum zGV5UEn@-rfQ=$0DG1&3ep6yFtLO2@hw&)r?%*IB{>+wTUlkdmXiVHeU1bFVe`*q%q zeIM*JNbc!v%1Lm+C7O)ina*0*O_T?tfvGpT4H0QVS$fe!toK#wTv*Gwdyx^AsD z@1*j&HUAyDNw&hcAd|W+Ejb1Vjh-+b|G5(INnT_gV!zGvI)r$OwcbJ6-$= zW6md1h^3tZ4}&0(oz_52!KvXB4MKDlLVbenzcck>Gf*EOrjPmVRIJv`3fCyCMgb}o zi}%Jg+f||1_2~-&*Qf8avTB}Gxk{zpqGFG>tKGfB4rN6h&u}Im1rgdaY?gG5y{2|d zSIsR_RancfbcfqE5f<YWt*7 z+bBx9NIWIdb0;v8gaAIcnj*XPAJ&t>13eYuks9B<=OMmP`&q#8X!hP1Nw-b+=cs*P zBA=WDBMSQ~rHkVCoDx#5@D3*HHBLG|257H9zEruwRTa;}BlHmgF5kpc^zKturkD64 zTHAGlAGT`paz8dk1$<~~yIM;R%-{XWcrVoE%;6WJBgVUX5sRXrNLB8rY0$1NR-%2q zxI0a)%zK3+SdLrMXNX7DW4Q<$GfE}TQE$1a-6?RI>gT{_d$~OVU(jos>3sTQSUmo~{+9G|$@VxV0W^WYpKjOYaJ=T8e-ODtfEZ0WF2lxGKhsjs0%oZPFFa{!Q zo)>`QKx@*#;}`;XVFnmj#irYn+EuEWpA`ZCbmDK%NG(+Up|^GCzMbq<4r#aLlL-GA z+p~``y=jsOn(f&ehbo{_LGBQ0u@96sV^P;}z7vl*3=1CI7>w%U@gT>>^9{jr@5#Ai zTHW->0>lntbrMZgmc%yvg8R!{gZl!h)Ho@)Os=c%xv=S&|EzoK1tS<&8%cH7rju~{ z-P_`R{KY;ZtTovrJSTD0@i~D9H@WbO-X)#F2j#xRHCOa*_S03|8y(KwQ;jL$>h-7X z1L6*Iuflnv9wfQ1X`*lTrpl)qoOC?=Mk^vwZ%qe#l4C}yGD5v6@`xkDb!*1P z`HAo5<0PBvTW<^K?R9-7WOvg2EtyAhKPZ;x$`~Wg)!nR@Na6}nRCCRq0&OR_hWsKH zV>K@%k??x2lm-x)9Z6FsHC~J4G(ZoC{9w|ic2@^S1wRd`RZQMZWF&0NN~J?jX2qDQP;$zY!nj;GnZjjH0G_X-3@3>7EY*dpq6F zzCD!{ z>2&{Bh)qic3d$x3^#{y#N*Neve9gQ=1-^oiOG!QRq?$CcjNh&bP_oBMGp=~4@%Y(~ z^^Q(X>ceg#X8B98sc*T)?RPyorNa3=Y+Wb~;x5hwJL+4UF^zOul5&fQ6)5sTWxN{1 zr9}!AA3l8a(UWH0{a5Oz=Z}w*Udab`LM*2~EOZ9RAfF`a=L02lRD|J>>(NyMBc&7^ zYDn>65SvrMy+SxTIctP27hS!QUajX_J-urxC8d+|^6T{t!(JPwLkr6xr9$ zl@o4io$-P`!-Zm%;ocX;?-8Zu=KxVeh*d5xQV%PG^5l1Kt7O;JS1?j}3chK1Jr2i_LU~(@yWV7&-vmCiW2?l$BxlkwpPLCq;LAhl zOoFlUp7#)Lt6)&dUc!I(wgg|DjZEZDa1iS`VaRW;1&K+L;Tsj(%bPyclF>j9wOQ++ z*B4cl-P;e-!gF#}{FAr!9N*RI!4m!Sl&VAvS)_B^663A6HBT1xeqxFFd`eB>h71a! zTxP*HR##>wj}CdTjAW&oR#&_Aw&IrAxsC5@J|~Y&npozLn!c~=MINWS+1;f=&aTI` z1?Qx9d}jFFm~O906$c&3Fq#yx)gM}u}3SxK$#uV{W%Lfwog1dkH~ zXCvv7+V%55ZK6Mf&!=_0m0t2|CKAs@T1lxSuHJ2Hk7glP26&8MxD-syN>cgisi5(u zkDIH?u#N`R_;lU%Ra61LoC=a>&*oAZNL)U15k-ORG@RK((uM97jx38=$cc5GXyZ0c zQH$iW;7)QgBm^)picU6A9xa}E7^XgJ&U^?cl6{G6UVsco2h?dZ)gi=aNO?|tTQ3!JUXXG^G#hM1Ts&+R9i z-dh7l1-w-vd@xeivfmaOF!6C7{gZ_7q$0LJIR=Acx)SxC9{!^TO2il9{t7$@Y#w+9 zHO3G321K+YPZK%=_NlVK6^O+Ba*38e!e3ODKQPvjab^B_XNxfem*)%&^6J5$>f8&C z;cc3`+))__8QM-VG$$HUY?Qb%KlC&-r@Uzkq#1$}MO`+n8MnIKJ(>$qQ=9{8cuS8_ z9UMO=R9evggeTb+O%hl>w~eF(<4T@nkiKV8rav{iRO^AcwRSvX_gux*KO&9_T$|fn z&c9S6*o|rsxB*2u_RLP}5Wgq2+`Z{(nv^mHzE^3YcI;^v;*xjs6k(!!VARP8b#M$> ze!3m0;Cbqr$9gWzH(pyk+uEH5hc2&8}3uN(6rMdanF?y1tAObmcWtLbFmo%#`j z!DZeOIuqx5P|L0}$^1L!tLo?=P&p`2j7x;6#$!Ncs~Wgs^J05gn~FOpy}k}ZYg!d`$iKMG{~ zo6P01ONdj(UK>S#M0G9E1g{5zMj<3y~y@l_;7E9l_}Ra-pX8{rI9kcC_(&B<@mQU7dk6t^$ z{QR5DC2dZrLyUP1!JOWvNQ}ay$vi0~Wp>*k_~}GMMWiA-D42%RNa~Wma%3P14vBUP+b-;*xl+4uzh#c8CAtFH%4g=jBNmF(g<;_`r z9LSFc_XrW|>cmW6BB?UVD839`=KF9rIWM&7GmZhuE95xX0*G*D|?FceWN1RutjDF<3u#a=qN`)It4uA zuI8far?k0Vy%-=pvY%B}()bdd!)1 zqc3#KxuT(`^CE)3VlR1E`7L*`s1}7A$oYmH6{BLdjKIXy6AY9)1a~8kP7uVg6+8uP z1XtwD+)E}$GC@$O)K zh-Hxwa@@6Z#oE4tlF9m`S3A+Po6VWQ`k31k4BcZ|qoUeBCKO(fE7Dl37Axuu3+SF! 
z9~}{Kt&adZ%bAZ9(WbzmgyabhpY!`~I-@J_S?WcVOZ16B-oIg`hki z)#)I6N@cMk9LeYcS)cq4oQYpY`CJ>v%#;o+$J)3;Rk&nSf;lj*hS8_6V_L$Y?U6`> z0yHuPR)eL73oiwpU1z{&4OyNYojaj?F3>4I37tRUKu1h1M`yu={Q0l%z*mtnbUVy^ zJ7jjQ=;!7y?;T-vyGQ{KEVjo{K9{MVtI@{<5Xy!MKsWyGNag~eayl>uE;8N|2GUoa z%%=lK#GYQ8Kkcrxaon-Vazore3Fb#lemUD&3Ili$!a$Mhu3G!SHOQS) z&~=j;WlQPnvPlwTr8tmp2oNJ=PD6c8NbGR@8kqeEG(t=wZ@$X0bC1mBrY(lRi}Yoc zA|b)S*-(Og0bJ;t0}=lr@Yn7lVfohC>jjr%^LuX>xhOc_Pg;4pQ5vDUy{Mz$i(;&d zU5JF>sAW)F)CsV)GgP4+@Ku3`hMj~afm^%Ut-9`xk?tLCcwebmS#|WnKH0t-0(>~7 zEUtS;r-Yz=!x_qGYd(e)yW-bElC!i5afKu>Xyjb z7teYI36{YK_S^@p!G$3p&ZbFzIulh+X%H=_$BB`rG4(n;lxHCXwc00u&(Sn58dq zI!F*B31xWyUDbTGd?=c8`a2vJ$xRn4VjL@x6e}|wtB@3Z29DG8#k!LjM`r=d7Gl+* zi+h>`HFH}~+~F)pR{>?(#0ry=2q!~iFTB3G?AeF@UiGdDP4-IP)N$4F$kLE&^(<9 z!AsdLwIDM8Gz`C*94FcOP3c;Dgh)gwU5ffP4M5sb>S7jw5;Ml6s!!Z|MC!!J<0K4C z0^xQfOmPlGP|$H_I(b)F)^HRxFoRLi2~iN6Hz2 zrF!DsFT;>;={cBC>g#FX0f>gsd=(1HqZzq0_(Wg<1J+Ihkp{bDpHGPL>6tI4Z`h{` zFu!`IOCReTF1>W7T!!5w4l!gpm5N14{oEyk%`lOD2JTFoU(+q(=&sA7C#O2*JtV1I zpD8sYA^Z{RW2w*FR!ROc+7+9jwgmT;HbqiZ+q`t0&@YzwuaoDKQh*^A8e0;=X2lG^ z5VA4h*Nrn-CH&fYhV*MKwS6RU70Ws+e|6GN24f+B;!rgeM%}eyouMP;2N>YFOzp7D z2ebExJ@|5;X?A%_^r|f85estF5@|IPi~BxX`HTk zg6(=-S#Ip{v+ufd6C$aTFPbMQn!g)1k6B)Se?K<~ocrq*z^u{p$eDr{ONb;9p@+Rt+)B6SL!I#bg8Gs$y0`q_^v&oQEAAr-W_Tc!(b4#jk# zl&+a)SPx*p<+)p>YZ;|B+Vf?(n`H{-)rVhx3C#Z*Wm=K=vO+Y!;^{RHX+c~wt)rAK$(z+*a*F|95OeKZ-quU0U=@arv3st000SDk#w}J z`vBw~h4(%ia-RW+|GUx!MJZii02J+{Mv+M!{lq+tlCbqee}T*vh>udb+*^-VeNNLL zCw8sJrV1xkX{U~9r~0PH#tX-GrYB(wC(ghV*R)el;MsTZnO)jh9q>Hb?BdhHg|5Q! zW8_(m>A79n`5RLvx7L%Ig|i0lvjX5nhxgTV>s6`(?pW&yx)KvNJ61H*;4x_kH3Z*P zPE%4XHOqha?VY#T4mwJK*g_m8>O5-1?6ftMSV?;6Z_x`dujvujW!lB>4$40nQHff_i%srwvy$-#Q()7))s>*v=rLX(E8^P|n5tx*cCt!=xPm#uuv^d3hnN_x@w z(Q}MHU6H3u<^%-!{&?0e1HY&QT?Ls3ZbM*!UdQw8MXCQC-FTt$li|d&~eB`zH zSO5_$v2+R%*`+_RU3j+LbA}>&=8^B<__=0LD7DMiRf)WSSkSZyt`hZ|+GTdDcIjpa zDSt2%b+#n-Xo4!+KOY+Lr`E~Fvr37LR<}xZn-8)2;+(`~m9`TifeS>bUHZh4SJ}k$ zta#{MB7D})>V675XM}>R)*1O6s?uD(@n*s4sBSZLz7DZ+CDczUGB~}Hh85btLz|W> z!Ce&HLbqA$wbQq0rY@ObRo+glXSLI3e3C4Fj{(-OT7ihOi0iXtp$&%QzMQ)io@tlWSq|m(~`wAujAAPO6z1+_zsGwD=;uGNL%2KIuYnrzNOd@m0yN2LTSfwWKhA%#T9lbSd`F zTft#87?|Nw(ogDMiwiyBEAPoU0XRlusA(O2#1RR9JEkE}k|v|0K;$}V@UE8Cd^lL| z7a?n67FTz6a9j6}dm+UrwTm&w#&21pr@NLc-t#1UCA}b|U`5uw2miX1-l0YDhLMT( zNicMNBJk2pbUajG$bBjER_z+o)_JcVWGbKJ{)jjbrFOB5+Wep2V6J_+lhZ#rBH0J0{*V@!T&%H}N8j2_sOM<)f~*)$p>uC(7Gutri&@ zgyI^9zwp80gG<&wZpHebOVdOF!za#c!!zi*SHF~ojqjesjVs)|*I0vinWs*e&tGqed z3%YNgNs$)Zc}K}P^nphvWl``qKH|}!C-$u7jkKM-)MU!TX2^s)O^aCXdYmHc?U!t7 z#V|?c_xx07yXj!03`EYjZ;igbU>2!X6TR41fov_kyHQv5SLb9(R>WYXz?Y`3Cy4TY zko}l;U7eDuGO~AszI+z1PcPJkpCKiSdw4CL?s}C=giB4h5lSd?Gpu-Rp2KU--Tz?h zK(5xql21RW`B`J$r+P+QGJ|u$=QqmGE}v(&IE=11 zF{F~m0taPtg;h^_R^sl)FEXl0fdht4ibXzdd#p-v_kMZbHR^K`ZY7U=tgFfRMn<2G zLlH2g>!GBUhR(DmLclZZ=cPye+&NG}AXHymqcm`d(MlMJwN^;S6(>>UK1mnSrcdhi zmJ!6X?x0~C8Jm0L$NSnw`VFRG(tLUD^qT?)mp)jM*9iH0vjRNf>6DmUvyWco6Pqa{c)|eAb?xQv&@KJV5yta*@@q>v*!0C<~vYJrorEn?`SwhcXrp9YuK zwldz_wfnX=Gtt7#QVzJ{wqTH{r`{>dEsJ=)F=(_iOou$rM+ut$h zbKCL?yjpkrwNuzv!EN!=23*=*SDD=T-IKS%gqHzt_dvrTNmfF`sk`qf0vArjdE2mc zoH}Wn#+ggqY6T*lCq3#AELXsX>|ZFY2H$`U8;IiHHoLCliiN1(!h z{_|3oBsTh>DAj@drJv>rd6g@H4N?P-y&S%>QXmTz6zoPm6E=rydTxA}t{)mq-k{r) zlpQNr`hK2tTmlF*X1tP~iYjou6V<%=>QrRr8*{2l9n)OF12sD}j?}zQSE)8??9V*;vEhB$~W}f4-X3aJ~*$1euG8m;cngVY;eCv*(f`{^@$$lu2t`w`GYiE zxygIF@5hioj*zt~QXIoU#v^p=kAKK+J&ZaZScM&zf09eGaeVbn!t1P4qJL@iMq@or znxsul{-d27@yh4tIm4!2hd;k~7^opHrqt|`-jp3~DVtu5=M|hL&DkSpysy70c)|Pe zy|w_NH_$Zib6z{-<$_-N<%ELv;jVYur1xqr(re4gCv4RVJ^wdvYt4rQ5Ax=L53V|d z_LC1;FoZ52!q5X@qBWr~f`IaEXx1QitbI9yeYvOHSh!H$mL+?>uMn4?h`OJcqz>Pj 
zuT=drR8)uD+O4-m8tf;n?r-+Z4;=iPx8?4rY~-)&uCMmHw`Cph@|V9}en1v`z~ftQ zEAW0|z*F_W*AoHe!GSjUfgF;7mTQ56Zv*YPg5IbHp-%-m1_x172D$bGd9DR1eF}o~ zXn#Bl3a}0i!gDnR1&85j_y-3^;e|w7E5GXrj<1)DSP< zF!d}XFHtZeN7Haaw^4MLpdzhc2okOO5+NFqUbKn`*B(}NIy^# zs%b#484`c78i#`wk8Y!g84`KlEE>xPYHIBFq&MF2KK=9FgmIb#Y|#YT5cI7-+g*>P zJ_)TQINeY51bu?gxx3z1d^a77qPwa_c&MY42x7n$K1D z6>EN{yY|ArN~JKlC$({>+JRDr-#c8R=&tnCndulkZuFnMsY^a7KNY`VE~iWuKt`of zH*GxbgnVI~ic*$}!^V$4kcw0cNx^yhWtR@J(fb8SNBte;ZRK%9Vw$Dn@}M2zC*j(r z|HC}B0#labLjb9jMGT0hJfCQOi+?ni!nNaIY&Wq2t-qw>~ zl($ucpCufc<*J?~7MdiHoTX40DASkazn-ONn{CRKt$dMi??bk>M!@~T?5DliIvP0^ zYoGN)b8Lii9`@zPy(T7D`_d1QBFoCNuyRh)fD`a6_UvNQVVa3L)l8r)NyrRm`B4*X1n&hCm zS!h6n8uc{OMLa7z&}zE7oYdd%aTp{fQUl@%_=hte?K_lN%)v zE2B?2C9*t^Tu(|?r9Hck2$uS?=EX4A2+E*O^0pek{In?D4=uZcI;L7ix1k`?=6TLj z?&fBG{Uqy&TN!3idAO=2&d<{4qa^^|3V48q?Q*%;McI)=1p`2d@(u>IYlZ9wxuEWf zI}9e!no^KVHk+Lz$7ZFBm>zdgr80pIn_7tgZ^a^RrASfsj$q~V_$}S_$KDoijiF}E zLpx;6}CD)Xwpw%4M}0Das3Uc zn+>sc4N1I>pT0Jv*)`??8Zu-Wi=H;-Z#JeBH5SPtq=1 zX+Qn8W}BaRTOp)vH%#mR0CO8{I{~y#Yrc(wsg;nmgn_d#3EQX!TAn;=<;Q8q*}P=e5fKd|^k`p~>BuGP;A8CMx#|cRZC4F% zQ8(!n*y^+s>I{nR6y2hd38$6Zq8DSNGb-+68|WmlZ`ai7G?Qi2=c6^8{Z<*+r904N zZQeyN+xdV{T-K!A*yJ0nNwnn7 zZFOjx^!c0gXEOGs@%0yG^aa5B3Ou?W%l4L%^?R6fB@XmSKAt_;$zw@VqBFO8VMh=g$;b2rEm5a?1~=T^%$rg7y>cc zA&NWuOopE14zF(wXAKM=&knug8{T6aNQmp`t{A#DVO-%GDS^~Zx&g_qhpApuuV@XO zpzDl9kMw1H!=4*K9~|j|i~@b@&z#4oOUBH=qrl2hrn^7}xq)8&F^WMP^5!uqqDR~% zz#Gi*!~L=I(J`))@q2?+^4lQU*ZOA?VwQl(2WLrZcyOA5rxVE$!go#lHD%W9F!_e+;GhnBT> zmUW4L>GS_G)cN($;n(BHU;l@^t=#{Vx3#3R^UGmpHF9UYbZ2vDXM1O7mw0!dfA>&l z_t;_gG;;U6boUaqKfkk!B;G?8*u&J_!*<-mjoQO6+XD>m5$*1gknEEQ>{ICOum1m= z?)r<{^-s{QF>m#4Z;^2ywFrp&17ERmKds09lCc3EJO9}2N(k{m(OtVCDk!}xIouCL zckM=KOh-PkjMDIcR)2v8O+igaxX7KWO?p6jnAK|DwC7QFPb6|3-HiAYR_mU6v@iYhU%B(p}nrM|Um4 zppuJmqJyHeHX;%7tlvenvv6t-|C;W)e@k~+W$Ohg&g?<|i0&Hk2fO1X{)O)H`;G2; z^|y2v$zRi5aF8V_8(kQmlFQ%HUGD6^(OrwTbXQi0P$WBw?n>^&Bw$9-T~Qo6@+i71 z22NI@u>Sx>cO_5B=#`c~AnaGXzWI&rV(Vq_+EXHJxo8Ow;^?~3(EAJB6~$3t9oxe) z*~3_+c)>SYp*U$xez!fMke5`1S7wY`q`}~p?y|1rlgId;h0j+Fui$O6KKz~T3jGt^ zrGHCz-MOW^)RC;|;@Y=#*V8|uyIP`8=L1EE)0f?FgX~| z;gmt5mJL^2)&dLQlGo4SA3{5;SG);=syO}~x~mXB)B?|V(;EE0OLuAhi*(od|A_7? 
zB`)cF>gT?PK_dgK3Xx+@o-d*3Ix(eky$({&8-(#VFPh<#TaFg7z~2EME0!?m5&-{>yKDJIno zFt_j77F&)xCc_)o3O}dsJv971vU;kdtPIP&1Tb9E0^V<2B%OWOFyY=N4^o+RNbl1VZPj_7ea+tDfY-Xx6jREvyY$)$;_SC7aypI03J26kH zfY-r|u^+s9xu`$=M|9WITe>UxpVD22w{#akwh0o>%!wxwUMlO!{$F$#9A=^ZG*x^O zQAHWNMC8~OB;+)@`)|`-TI4xr z$PbXOSAF--bXUb6(p`;#O*7x`qd3X8Nv#^5d+CHPeY26^TmhqhLU*|<(@75fPIvwI zE4s@MH{g(HCjPJJu0xy|Cnf7|R9=`YztdeIe?)hM{2kquJi)q8WdGAtSd&B1Fa8 zO3u5cBsJ#}B1g&Fa{#8?+|(Ov5QnC{a1 zE4phH*X!S=yHv2(*8U~BD+Uf8*TZ~|%{?YbDrtzxm4z?tgXwikcNyZ6Rz(OVP+j|A z3Pb;j?ppqz(_MQ1fbJT>y7y*?=w*8$EzI*CIFsqEu-asPnsdhnm1yGF1c{GX(| zTyR!d&dh$X04{-bo4*B;Kj7m~?(IIe=cku0K5{}*&u9{$t6qq}1M zj_!h($)}pTQ#8_Y`vDT48_DK(Ooek ze?WIV_*=RwSubmql<p6EEaOqae0?;G9W)e!r!=tl*g!1UV?W%iX6+c&|#29*@|K?im;9 zLU}Zn5RqBXtMhw&<>hWDx+~=WExL>TH@ZvlKS_6$tr5uGV=m_3%u?5{ButKip@JbQ{$ zO6%UjYYH<^WWqJUTeM?Kt&b@J=-$7-;CnCw7{{Q%{QfgZvEw8{Yn^E7gO z67!ATF`*{2tLALR`j_@yJpUrym4WZEMT0L#2DnCrFL!R3FmLhoRZ|i#A7%eP+}&kV zlt?$Y`9 z=q}^`Z|JV5e?@mi<22Y4JkZB}KUVrfmUOj+WL=in$IhqP@`f(X!{a;m_x?(EA^$D9 zOQrgW@&6&+bqi;>U-)Tl`JEe&i%Mj&ewG#+J&sAK?C18rs{euRO8W=8D-TzAAhv#tGXv%#hHm(&1bq>UFcb}H4#BBO`k&BUVK^ZpAIj%Q zMzcxmG8;flxMw4}I97yVN^<6Zp}Tzb>&=zE)$cGoKx$d5hvRs(^H~z?M8~;w1xoov zthsK^M85sZ0GA_PpjmE8d?tyxkNtm5ca8o7-Bta+r@M6i8Qo?6FX%4c5sb}}A7{%F zA90h~lA)9qCWMB6&|SXn`{82A`41w>6x$^ee%$hKlaqn^et@cNqz&hMd~Xi@6bPNn zp$K(Ii`39C{@>AEzP_g{rXxsrC@wj_&Gg8!QY8Kl>Sv}k$R;VSMgQ@Je?oWBWIq>r zFkE1Vpnek1%TzD9S&1{0X^`4yAsPL>PhhYhLU!}-w^6Sm=+x4#Nvy}&ro&iZ2^%milNqX$1TQD_W^Na4{`7h`$QR?aus-;42 zRSAJqVnf|W(RTH^O7$8#I_qUhm9(|5K_3kDWL76Ah;F7%nrQjoG`4uqu}&BqGpei` z*g9(S(&E|ENMoHL8T@|H-tyEB;#|qvAMG@Zr%atGHGk7xFz!8v|BCKf-0IDF0d_1Q z2#o%hbeEqJWAGW1lU{Ue(Z8mu*UKObIe@b^fv_xXG z{DtmX1-r0XqvX2?c@)-t2&I&;D1kd-TFm$LH{}3#r})#iu{xAGUp6q zXHFl^39Wd#c2tZVVmInxSeIb@i|%Sw5OmF_e_$voscA^|)YzF><<)l>u0eM5DxN-? z?jrdM-Ib1}yFf%CNLCO&$w)9%GSVkQ-eN`*9VueAuDbL;?0mB1R<^QV6Hd`Oz(Df-60zzPx=TAH<3G|}?sO}E(p{?T!c2GsOX;7}VTqE=@bEhd??yJa9#%$F zF-Q?$2-jqPw=WyWk_6g5q!P z5Q6byEZ;K;w!Mn@o5Rn|Ix#0@A+pBcXkNk?HCgar^pfVJOk$`eUN{WH3&kNi)% zYY-DKrNc~10GQsi6#hu=0nv0;*kN@P%emCfzXNE){(=qY{FCm2q3JFK%xk*K@ZY7o z*Z?rnSNDR%bFJ3h$O?Y2l891g$zjanpDWkBmE9TL_Guhg*r7|49gcGIUlXAP(xn{~ z=sk`jKq5XClp@~Oc_o1Qq$TaB73}-v?0AT=OoK>2B5*i~A@+z^uljLm9AN-}?Z2hF zj6%HJ|AFp813Y~VkmhP^P6WMr0|}ReoY#$p+BP8`;uDs{MwV7Cd_@&^I29%Uf{7Rs zfy1Z(*xSVz{!!y-t@U-Tp@NXffXRZ2-J3I`mG zccb)`7(t9UtLo_&5o-}*yfg&qG6DokbR9vwWs!|zA&zHBlmr0WM2I;60Cv$tD71|f z4(AjBuu%aZjNO1;Z}fom+4-aS~DI@fxa$Uo>^mTLL_QWeTrsD&6n zA!!)cw+ZO`r`~0b*1MYi)VqM(zx1w`S^q-s(nsrE;wr!OE{#9+uE-FcW`G4+?@Asn zDf^{&0YIpbYrSiVVCX;VU4o&u>O(IqhOmC=T^v+sy{nPEF6>(Gg8fzR3jdqlMJkJ9 zpZ2HTwRf#|5yvJ{{?@w^uk|jr*iZsa0K<2EVv_%)cSZ4>7!t1IrxpIvyLc2)xFEjR zH(}bjKES5?)NAsWQb6#45Ve8$PtS?HgVUH zi&+ps3~dVhhAnMxpmGW0DN2^Zq72mkm<bNrenS%UBjG`YGV!o5~uPd7M>uzvCN;O>uFBV*4Vn z6BJ0$$(r1$&)Rua118#RNhDA$;`}yd@?HlQK*lzSLK*_Yvdb+Ym_YEo!%-MjH3Gjn z$Qppjv(@hv3F7G+H8TbfAq?yvkl4hQ0i8_l@l9Vwfj|m?s#*n1K>!L+4VShVKe-LS zq5_mp5Mv?LNuvaHzkpx8kRqvV`Ld$fKE(plpr({~$dz=U$@)qgs8>Qo# z;dhckWC@=_$Dga4y-W}aTX<&2H_IFq3RIVSSUyeSH*qC2!=*xu378@IQcos4<2I=) z#giBVGO zf|*+p&;8AsBWJZM!Z*wjIFUdX!S_-Fp_sAN4@`kzUj%No$?JNvjZQP=HEq(-aB_PD zmOVm`%Rm><_)GDV`qi z1g&=^{L;IUf9YK!XuV4kV5^1aI|Il?>s^4HYrU&Ia;U26x8B8RjrU9MLZSJtWZ8F| zf9hSy*LoKqXTkb{kGKV`chydjjaq#pFh}cM&Z}*|^)5DBEn~!|Z13;w`E|Cipyt&^1YcfAYRI!@aP>N~;BNYI?#HTwmIlp@2b7lyXNk^NnNd?2cW^n=wSiu#k!*YD}s{Y=NrQ~!8+J+a z(0UgZr~N!Cy6ww<(7X0Rwx=u$(0Uh%TX3mYp!zSpOaJ=1ZRT|9-Ba?A++TVZLSKif z*piw;h~^t4ESzr`^EC@-Wy1~iH@%Bx&+fUE2|11>TJJK#0K5IEca;rx?)=uf+Pp6I zpmr|9A)gwWt;|?PKQ3C6-2PMVf;!wAv4>Q7eDCyd9bRhurFYTG&77e1uC$)dzw|Ca zfF%O0cRl+tOm?kz6)1P90FcSQ^seNaXuT^!vTZAY5I~J{$bS1;?-E{mkva7r^{)0W 
zC^X-7#158;_m!6TL+?8F`J3Ja))=X%Ji+=y@5=o{@6x#Uhu&4qi-=#`y>fMW{obmS z#qxEx^x|*5tKd()>lujtx89|7jFD!iw6|aQ>xtMTR~AG~;k0O6J~q z=?jCwL7)+|-i4U0w~uB+zWr103g957dFw%U>zCezG&NWMt#^efZ?ygsGuHL?UHy<20;2U0*FV19m#y6fJ8`c_JJ*6585z- z8)$rGkWomD&~yq-PRj>OqWIX@WRWA$%Zf0-P8T5IqY}V4o5I&ve=Dm27vlye74lma zNhCNDib=MwpsPao9HOvC&fSR%;J0T{_X425!TeI18U@=6eL#wBY~EKpGLY>^F>44_ zz+L=~d1b};^h?s{W4UKyJ~BsTYE^Hom?t#~W)s$czV=qxMEsmCGkNrB$O z|*VM_)CZYA0U{*tzTusW~N`&hZ<^a$&$Q z2&*I;valxk8`{u*7svZ@Yx?nSiY0E3rHhxxJ&)oP8$K&5k4ciUf$G2E!glM7i+=THNa~!oP9+16 zw|??~ijaKD@x6=PwC8z6Lx(EQ0xf@Y&_c!#mGDlXp-(dJ`_2wpftDXW(JmGbE%J{6@u8$?N z9PM}MqY{N&d~?x$S4R2jx{UgptTnN4g?608B!V~W(x1Ly<8{{zJrNm;pF;ax9MZCz zxq4bUB`=N47jDD2?2|;DHMB$M$xoDDtZ^rK%4W-TZ8^s~4q0->D!bc=Q)4S=v>{LsQf%5+ym`*u_|IQocM>}Fsu|_ z`&|=OvEtW$SFzmy|DAq2i>FT>zguaQu53DJeL_n2X?T_vZa=0=eMPohM72v0-%wNe zWmS=Td`m|QTDrUp7UdW09GrLCYg_A|ZB%&1I$BiSePY~9FvoYRj|l7X$7fW0xpN3+ zA|Au-=fph%!EFSA&OoBsTr9b-#BD&^5?o`X3aRBjJ|$TYg`_3&P`HtHT=e@RhP9Zt za-1E|Xm>|0t@csTQu-a%SJX1&xdmxQ&n8TrAMZoe@V-KYa*EiA&p|49M~T4?o%6^t zrZc?vtitt`^&WGjh08~>cPSSJdv;||@3XBQly;@>l-W;F#ZhQi$hWeS`OdqV6} zs$*UKS?Kh7K+AbG5q6qMOe7|1V#`XTGMpV=hQyfqtW{pVf|QP|%MlaJ?N48+W67Dd z(Yv@~L=D_fBfVEZsBxx5;%2Xgr?B7m?i56!Q3`BojLZy3q(%ZuiV&$w1Mr2C85!<66jlG|(4rvE6w_WFhcF%(qrI8mjAAE|LSAmcaN zRy40hROMNzu{F(vbxzdVC1MhTdG{5bp9s>E;eucD3GY$Lv_-HQ!r-Ri2I)_-ddCdi z!xcuDY5ob_WmuO_L5K!suIaA20*1>{H6gsQ#H6}H_8*)a!rvyNqY=A3qcKC_k4o@bvP6|v50LSfOAJPSnru- z6JL`;<|X(pRjC603rKH_sLCp?PQkxJcabzyhZmLIwo@~ir1Da?bzi(46-vbCDpbRj zg|mJHpV5EddH1r9)sZ(WK41?Jl327ZW1ol19aHs-?z)feCyYIB5EG0gaC9;spHO() zddcS=B5h6ye%siST4DC&$^6u!I2NBNzjuMT`HJS-rY4O1MmJA}yit3I;2wbJQdY1> zmkNA{xf=!cp-?b#A2`B1Y(IFz#y@0ag-;q)Py0lP`p5tifTCEoHU@W!T3W^8nm}=S z^m73Qo!0omN{lkr9epT9;C!~WgRmwRKR^XYAYNkhE)U=_&p*Jcexudy$qPZWu=yef_wC*CN zQ57F;1NxKDu7wz{?_Qs=^Q5FdSgrUz-?!WGjtpMT7_`#&w`5oDhx3w(=Ha_!aCBCG zmH_!AN9Z_xac&xaz;D7F$`PV0Q}Vgc^?|HL3xqPw6-BT3T!)jcVcR45{XDX!KGdrs z9cd6WZSSo1C_uJX3~#ReVW`Adv5?zCFWb)>l|vL}o6i1ZRxjPAGg|M!LTV03hwr3% zb=J(x!4#CFhSIt|h*a*!zPDwdEs(g|I_Qxx^!|P(Zt;6Heg^NuwzUlSyZ+em+h6z6 z#SaCQ2E8f|O^t4JZbDCa3kK`6KhO=-kv$%Z%%;xWm2tRL>Nj4%|8TslgqEo$3JE`7wl6VV~cy9znrI5e579W)Us00 zd(+R{b7C zWZykexBX&N^;oxvx5iH8Y;UCEO$y_k0VS0G`&*1vy zGFE7un4S&?2E5zNZg=}+c-9edRqHBbwC(lb{Hag#&wHS&z1|BtPni7Kc-7U9T1K-{ zH~EXzK#}RroT~$Yw{*iEsLR7Ziyk)=>c`TM!h&z8c`2EeO?T%37zQ$7oNv$hAqLAt zVlbq%uUufuowO%UDoM)>3&0Y5p`ace&39S&I^9a7dk8Oe7{2YuNzD>|gJn{ArvMqe ziz-5>Z9o8__y!Tos-NT93>wzaY|k>GPst!)N9B?5F9!{ELV|+vJVdhp;yjky2RKPBWe}M->Bwsl52t6QB*dZm;KCf>BlBMQjRg})f z<|I(mbZr-KH(*9VOc$GFt3vm+I_bQx$Y zu4WpxF7~?T*1LGOawY0!RQtOGwnu3Xm@gE$-Gj|4!>O}u6WuZR9HCCw8-m}{9+egs z*gtojW-sBi_7#OZ9AOXs!v3DX*5!o#-Iq?$3ydz-XX1h&$)QdU8YL1zC3*i&0obO$ z*r?~#n*YsBnYs|u03%tl8L0`SN8Arww1XMiiywDfb6w3((U~wDJ;UClI)xX_J^oWc zK|=SZqIzkO01iGi|9ouGHZ9_j>=$I5?`7`SH4-}I{o=Z0BIN8lDAICSk#M7;@VRDS zsk)+W+~o#CRMyT+kyF#IMq)v z!fPPNM;Xj5&q34xTX9d2oMuwofD)R>&`{)p=D;Rzotc6J+KkQ$Ra10d9cX7V#^_8` zfJFM(qUG>~(tu>urOuq3zIB_im8t`ZWR>Xt8(%K}6S;RO@y_Qca7PlZE+HoWs3aJY25bgTny|zY!Z1*C2HU4ZqgL~yXI*|4+mR#?C)bDrvr=rY{9P9}<_pVvN>SB}+?Z*6@A@?g7bx+~Oy>m7o%kkyeYB<-xsogyfgqZ?Ouymm8@`U0B z8QgsWKOO`bqYbZerJ=Z3)+Pn0e)}in_Mmu>Sb1{)o`P^wPpa*;;kAf1ypX8mRogGi z>^q-~xwd6?Xq9)?`om|7E9YKU&TwnLJ_CtB?*+*kz7kTRDr_6zbarfZ-R34DdkX5} zW?`%=ZZhjf_mBgw6m!M7-?}D;W#r0d@ru86Eb?4=y@7^@9bfprmJEIEnr0`z2R|GQ z7or<(aE&eN;C3{HY&a*gUgRIIIeb(c_=vsTrrJAp)+yS!Et{BkAmiK-{i-v7ln6{r z7Pal{UVdypF0qG!%+Qs{9tWi6$y+8;ByOYiE{W%wN30#iXuYe{hDzO?Y?SLgI_^H{ zf(U>(_O(f=+v7WdkPOvsTdt0l+W>(mF(_K^>ZDk^<=htB&Rh{lb9}_7`s1*rpp6_IAUT6&pXMgl*k3R_V^3CD%r?kl`E=M_-(xV70v_w zyzM4CCNG^Q^O>ixzfbkVeLmph4|N%zRlOLAzd)CN`r64A;8&wP`Kdd!Su+fs6T`=! 
[binary patch payload omitted: base85-encoded data, not human-readable]
z?y;Tb3-y&NpT!WQhdTDNR1BWIw#wCU9Gt>q!tmMqB<%~%TN)TjC8L1)WK9bbl#;wn zT}6Xfu9ZXtZxMd1rBmO!9N^rt4+gc*skoi@N{ehbbiIvWbDG0TUvQkatQZmqJZiym zS$H#YryH9uL@o-^R^ZFt8{qlBIJ>Jk!3B$wB8S(B%b$CH8c&%fKF4*}}9#DTxmdqeqp&?#p%0gEY#D?kH`y&kc|B$iw@vF2C_zaHuE*jKH zxyPk-ix0t72!c{4sQdG=(Jof60$yJB1*v#EJ@l;YWKwbnA>7M)kKm4c zI~mP8Z-(B%1)+Lool6HwqVya>2-3k^!|je%;8Y2ezsECJ;%V%odaoIssWQ$aPL`=o zu1RyD7|0No+5a36`Cw!`mstm$^d(eNv7m*X=Q}Zn0>BC0Y*z(`HF!`4DPM_3X#ley z9L7$K97`iJ7MP0u)I!rKX*vv7AV=t_*-=Fb?sdLE54WEgbTGxZkAbJFPX#y0DS78v zLBy5?)G!1g1fA?TSub+9G!1Gzo+3Jph@?Kc1CSRUp*aM?v#L?8f$>`c<6G@B^ zO#CNS581v&u&u=R(p)|ZlBDWU51miTNv11O#xl{$Xc|3Kh%eS@*}Ko|Pb<5wEQW_O zz*H6={kHAG-4zxs|LdtzOZifzvRs|9;uj~CQ)SvJ^jOYtp38ld=;o%#SUD&xCPboq z`f5NTT2d{OC827{!J3WQYPM@UuzIqJ?TuHlPKUjKV0yT<|H@^W?HF>ww3lI zhgRJJ*ya&`Ga6*ixxON2Q5q+7&^LjOqoOIoi=0JjWh*wf^AU?pk<)@%`twFyGY*Fg z3RvpSkzG?i(b4s zAkvrGz1}p{(fYoo9ZWhMDqFhI9|+9wRNPvgJM$zcmYPxPKL4!i!4n5*G23*$>PMBU z`6Qi zdn>l@K0}n9W}&Y?9rGqwAP_}`gbUr8Fy`IoF58)^iGyvkV;pE1^Jld5v;zXA53@B@ z0-tMaq`s`#3ZOY(2zsXM*=VKkFUbisC^H=qpbjkT068OI{revK6k=_j^|*>n{_oy_ zCkw&&MxHx%HaIjZ$d9WaAGME7k@Z$k#D_ZZ(ogVJTUIL@UAw^Hn3K*lR3Bz9ELq%# z{p?!S@=sm6@mL*WQuO$lVEj?Xt;ebR3qpvVjkwb0B*_(dWV{NEBTvyTVsTnwI|4u4 z#{D@kvMKI%UM(1oYQ-iL@}}(-Pn(oJ*E&Fl&F&7g2wPTje9LE~tl5jRnk5kV{&>J{ z`(1^|TleAjT^tln)(lUZJwCaV94!tYXOBEW#@Ty!rXTMmF1q%u+!cDWcoO;Yl4t5v zFMEvfNIr6BElK98-%4-xQ~TEz*$y}Hk;S7@yH?Yah)b6kE;fT%X%!r7n~VX2Zkal< zrEV5?^^vw8_3#dv-=5sglZbcq;50t!m%I_{g^pW}+#Kb!-!AL}aYiEVFR=7K%2X_U z+u#&8F*3X|AK#vJMf19*@;G#;g5f- z4L{W3xz*#n73i7mqK^2F>UjZh+mVn=QF2zE>ND@Ek9Y5C>e#%V# zS_=Np&HW!t`KcB9-x2x0-0?R=3os`2)oJiIW75`V3a}^)u<8vUlMi@<7AWK&V51QD z-aOF7T-%N`@PoFBQ&6B6S`fG}z`Zxn|5Cw|DJU4ubGiEYnFmEO$p-!6x#F38BG7`9 zf}~^2gVT0`GZdVYnL_Tnf-}uS3W7o$Nl0^gL!i%KAw_7RRZOAUe!*qtp=IYGHHD$g zt|4`yt<0InyA ze}ebpK#=#AwWfw&k)`+$5V#QB?4=S%qmXf<-Z4eo2SrmLMG^J+!V?3ib5R&%GFa#c zSLRam5>bSG(N?a}WV0aVx)sPXc6Kp z6=JE2qJ(y%4g|5iS7PPK;%*h95={E`gN zk%(5Ju&$6ZuVRam{BkXl8;i&aEmDH&lOIhc2cRQRN1;Ap#;Bc3Sq(~P>Py{*r1bT@ zZx_ObxJhB5BZ(7k!!AZ{yJCeSwMzrWP>{ zLRlze3A@2kXn-u9hDca0f2Q+NNHJN=^#s4)K=+;aLAZ&*&M)h%CamusB9ikKKZ3=5fHpgUcP;; zwXReiFl)E7S1uIfwBM9}d**O#S@FlP zOH}FujCi9e@hD0X<|{G#IZ;`w$UeL)jJF~x$)V;$zALDr3$+m1s(R7?mQ=Z#b>Gt2 zxf&aT^uE8Em({4hOM%lWk6IVZH&7#GszvLqCHOV(%GX`Ys&@U6E+%WOsZuSeORd`1 z=a2Ggm6h|6p2OaK1Jv<~cuOkRS?yuVG>hvRp;t-#f#(|dY_tE_4x`?Iwcbg&{=L=j zJl8<|$NhRZ&*e^G>5A}%>T8|p9=6q}^mS)LIGpFgXpDxdT*{5{@Cx?OM!3qAGSHZ| z-k2E%T4?je0McmlL#s+Oy^^MLD^l+&+cIPOwMN;`-1#yaGK3f~^81^1&pBGg z?z~_Tn`&eG>tsr_K^v+9g$q_lilhh*B?U%ld;M7Z4J!?TNAd#|imMVzB9HcBFl2bO zw!W~A&;Q_%bZGS^Tk$ypmv_Sq`2})z#i8TnX&oq+^gTlovwgpPQ zL2@OwPBk_%m9TcX14<Ll>V5Hsd?}9pGHU##Cc4K;rOO7hQ+}}9jIwu=rYBdj$5X{77_%q5lr*pf zYOB`+p7_F0(&L=knNZp*tJmvu+r!S-mu_to*3w5z(jO7lAAkt!%N6Z&xn)oG=!*;c zQmNO^;L#5UzaF>@)ZNlIw@_*h_7|}Y`qK137WzxZdbX;1N}~tYtNPnKD8FD184h-L z>-A5I4ni>pCk818JjjPi2S0`N)M8SNFZ2bZ4zzjPZlVbRLEC6 zfZuQXB?g9fRYu0bx?%855lF`YxC4c11bv9~aByg3b?7n-c#S!>qc?=b-ZjB6YSl9Q zaOf>X*(m7gAeGG+c6c8F)!15?@~c!J=Hb}3!zkSyWpJn+8SMnM%|wvH=!0+?Bv}y4 z5b%TbI0ySUdpJ;tioU3XM&aXxGCL_h6}{-;1R-={qXh<(dpcpoHK9^A<&!r_i6o_= zPov&Sqnb9UYcs8XGbJ%T^>zrpoic5QHRHfO5cG%!i?wkB2i}ShHaE zS>LC#{x-9L;j_VIv!O$?;fJ%4SaZ?rbFok7;%(*#=bc5baBdNaVC6mu59t^(Bk6Z;xg9KD*Mv^&vWfWV(&(C z?8d6@#@p^DM(ie)@1_jzrXB5OVDDvd?B&1}NZY-Fh}r*l@LZfH$Z97K>`qW4PtYq) zFh)+Wj!$rKPVqQT3Di!B>`qA{Psu7yDMn7Ij!!{2XEdB=bZTdh?9Lb?&zLLDSVzv- zkIy)9&bc|ydDYJOBTxR{fPED{2O7Lkw|Obgryc#}6&_eu%1rNth<;p!K4Xs|S&uP= zlu3f&YfJ0bdcCiKvZl!jZ&Lf-&^o*|^0v_Kw_s4QOl`8{b+$Hx_}_XL2GJF+r&;=c47OciuU;?$mkvm?G&gWP*Eu-fIrnw>Y+-P6@nCUr^>T4we5wCpd2DKB=xSy7+v?=} z`uY6E#PY`R_U6{j=Je9m?Aq4h&eqoVt@)d+*&jPgo4cpCyNhdki$C^OHxE`X4>op> 
zH?NL&?oPH3PIs@*4o@zQZoi+M|2X@8_3ifRI}8Sc2N-BDggbOfh008bFB$=hUTL?SSGDvy}s(afkdv?{i%vI`9mp!PJ0V| zH3cIX621T`rP{)=9EDgG{r=jbi30T;r8K3w;;9nt8q3B0x{{d+qs|a2<fD-=uY zH|#6uKVV-k{u%pPfUl6v&;J?wdj1ppVx}?r4f|4sOKIMEms)?uzMB3D`=b8O*jFaj zCE8zNUs8X-zIb=E!tqs0&oh3UrfJYU-iFZU+!@1 zYxD2e7e|yjrGHGfFp|q3urHs#V_yx_)XaZ@eVzWVurJt8>`VGDu`kvCh<#=L!oI4( z|Ac)R`M#QmV_&`hJM1gxKVe@lveXm*4fd7e>uvpCV_&?CJ9IgN;}1V>MYFb1MHrE6 zCQ~M^@;Uqg`x^Qi_T~3y?8_efckB!EpRunWaO`X3H|#6-U$8IU{{{ASC#fp-JN9My zC+v&mC-(LEAJ|vKFYL>G1&)37{~PSfc7Rl{7#S_&mKCV^i6>t&iozw(#QQf z_GS2Qu&)A=xPM?@-+sfsuKtdFp=kI6Ng?R({(*hj{sH^4{~uys(s1mn5C%B!l_Bc? z8}_C8XYA{_q+13TGD&|X9Q*S94f{fWbP|T)-qt7Dz`vuqT^Nr9kQP~hV_!o5f_;FCWwf7tLg+?`wp%G(2$iClOFSlhA{o#0W30EAg zlNsqW==f$$WA3l8FFhEwi(Kk|#J+BRVqc*m4S$J!E&Kud>cq1B1NLPv`4`xi*k58_ z0)NE5R;Y?sLRpyhuz7!CUmCx#ujij082bMM`}*)Z_Eq`T29qgb7fgOUj-_7nSp z_`?sxXn3@wiCb1E^*9wt@`4mpT#Ek*`?|=)H1)}aN#=9TvdqG}&lW*q1RJ`-(*liux<;>*P1=OX@GNFFF*lKVe@=D-Z8Zkd*!f z`_g*AOJcfKsffM#{@z;&m&pE`l%g1&!jUh|fF>?BlOw&0{U5O}um6mFz55;eGW##F zFX)56#=fS1VP7=AV_yZourKXDU|&)HjC~pX0sH!a>h)h@UqAj0_Vw(+&L!$&`hQ?w zW`Bi!adQ2_zKnjyzFz;tzIdaMy?$a}(!XP0QOLZ%V_#BS$W9lixkku?G5-VXi__>& z*jM5&>}v%@?sx1}%>L_Qh#rCx`p6&FEjTFDW?oW%)bywemamwek!5qWy_|asG~dJ+D+U`#*zy z;XWMN{SEu#v}ZN@A7EdbKe4Yjzhhr6-Zi{%>IZloKeI}`*dNhJjp z5M*ZP?nWe~6zPx}I;2aG5D<|L2?4?R4!FX#_gZVOec$)lk} zzUcoG_T_Vlec9^%HTFgNJN9L}_m8kIqiqUDU@vVPE3^3Hvfaj1Zum5*nhUis60Rl6MX7OC_@MIHAA59i8VHU8i5_xjD=* zI846xfhbI@_0vzwdVzZ*uv(0E6D=8Nq>+I81eYT|qLub<1uu-;o9F(mfog~Y!0VO3aG z7;#1ITpLR`442p!e>?I=NRaFK61wdr_5}~(F

Vrw(@#neM&L66)tDmGwuS> zCH5u9)*DYxz}-vtW#tO{GW%2P%jgRGl9A(g$)&L+5*Aa}C-F0MV!TnjO^>CQ)r5+p zCl;dkuh`c|eArLe7cy~Twixr@Vqe*zsAR!Qv8JG3v9E}K!@hd{JM631ExJ(zjjqQd zfU1)%kQ{`>zOWQYQgBGmugpCp_H`Xos6GgZeZ~A8_Jv8wX`LL{*+F-*rt`vU(N_LY2zeN7YGk?0C-xWvBh{)T;R{WbO__Y?L-Os$FR%s`W)GT{7y zeW4?0keM$W-_C@jw_Qn1K`=UT)fcZuf62tz8eeqK-{)Byj{}TH`!LOF- z5kCKceL0q2VP699KgGV_zhGZmnAm@fed+u)_GR+}`$|OtTmt|BaOf}C7mPlW>7QX= zFyv+HFW8s2TrQWA9{`==H|$IP3j2DXP~`B5$KQJ(;1c_i`)Al!x!cEX1o~1qg8mBo znzsbNWiS&o{tf#Q{TcgGLSkP~Mv{MyeJ%a3v9HkohurEwXb_j{QC{Y@0p+f=? z%1`?T?5j#xEaYFYuSF>SKrMb#IR-m`wp{c9RZH}fhT1B)02la#3EaTc!jXXiPz@xH zlm{%z0T!i*abZA9xCu6ZC7hlzt?JSXj0+U}7JIAv;n;Gv{?uBc`PCGizLi9u z!Z1PM;bKJ`BDlDs%*v{JNx5(Je#&SeBp&m|qr1dd?>iva+xLUC*W0IQ7* zPzp>U1*YYPuuuT-$1wo)FcOm%CW9LQ!eEN^&b%imKFrcqq0R1@-SGj}o!J}aBaqS; z^dWQHmGsqM>ZE{>%~p9L45sY;P5J^lD6w8jUv98pq_6wRaIfyTAJW&l+@#5_{^!U+OF-T=LRmA*`o(wD?9(icDQhxGL<^Dm??-Jhf{ zwO^$#NB~z8;8OZZ&@L+dMfwuhlkG)HUv~#b{#E)4LrPy0zI~H6SJIakEaw;L3;k01 zV*gqC3igZy6I)8qcn6aprLUsVp&!y0;8OYmN5xV6kiK#*r7zYi>5J}Ygn;Nb=_~Yy z^u-Mt$NPknT0Mq@z7Tb&5T^w2_`TxW@Ib2aR{wxtBETi~B}~e|Js3}``vd#JWZA=% zNJU~_!P4voeVbRCD7ccuy*cbgJ_9gUp z*cXw)&)ApWuh^FWumXvFN$Fo=U)g=U`$+5yfW*Eykl0s7w|OiQ`+`u>W60LS%A_#G z>R~jmF0n7lKVV;UVt>HCniNOkudpxpCH57K#J>1-!S|5M?;V#IBzzEZ9SaI9Y5*Hl zzM35@e0eE-A(Z|geZBfs`Vw@;dhx6DmH#i&m);ELZ_<~($%9Fx^hHdW{TI>~1ycG7 z`5}FkTuNWTv~Ec03rpfk`bti`-IV>S^cDMy^o9PD^i?|e@6s1QjQ@A(3kiK8D2@ON zNa-u@4pH2t^i}@w7wHSU9aOQ6zK;Hr^!4OY`r^8hz8G(S#V(~UpxBl4HH(zKDEhg6 zlD_gL)h?y4MWpm4`a}A9hLpZcG-!X7zFKN57J?x2k|B&BfC9G}jo$n5`hc#eu)?k| z%>=XN>^YUAie!%&$55>z8~`8|_&h=)jHz>6I_?z@*j8&E)ffVo3?bbcD$3J#)(tLU zngN#gnCk*&&Kq#NBu)b%D2q)i7m{JbmbLkiG&~4b89F}bJrE#GT|Iq%&D1s)3J?P@ zw^$+`3u}jQL6UBWbGiWVpVXsdHzd6Zf}29JkE}0v;z-IGp3hE^tXpkz&%z@hA~XR& zehBd%F`M061ITdF9SA@dAoWz0$OIWeUc|*P5?xw^EvECbfDDUX?W6@+FLLOQgs_$H z96w+b*RyrEw1s&Ez30@5k43W_2@~^rgMeW@RB#B0zjG@3iNKF&qzE}`y ztkuf`tI4D|3<`iU2`1U*(sTw!>=7!lv8g6I0Qaz>b^=|&(ppCtz)ogD)F4KA5-e=_ zmT}q|brFE?^#NF8%{PSsJZ_rb3VJFNayn?p-XZyV_ASus4dw`jyRIPZUO?=Pc0^Kh zW5cE#_558TEOa2oeBkvs<`r!shCmK;sM^X>?JDpge z7+vhNRF2=Xfw7}%hO;PMhnctq$P$)N(%%z>hma7W=og__90dukT`RD=S9pvmPl#ez zkDAGdCCDz8VVtP!7}5R!X5akMLfL=-zKjk+xj*u@g|ta!6(CU7X3rXjWX>FALPU8M z(u^QfBlWZ{8gz}YHjf$j+wKZ26;ByVknZP4cJ}tl0HUzZwbZ606%bzDI9f6_O}ij` zG>LR3P1-|vt@H@6zEPS=1#3^>!wpjj+F4+1uLpeX{K4W*5;%U_&>G&2P?4&{nuefI zpeAK~h@D(A2Q{L`T9Wk1Xw;?(_uH+s|&S~JQ+E5Sr0LsQN9cTot1EvM*&CfFOQX=AzV1P9RpqLth%YT;$ zYU-eThp5=f8BL9tG5|AH44poh_V^A61du&xn2msdR9h-&K2%V(KYFy~5o-Nxu9C0J zsj{Yh0AB{EY;!4necS_yA*C+~Ne86#1p(?-Mji$LAdq`m-JSSSpzQsJ`ObDi&}=F| zoFSMdc|C)8QDhQ>G}JjsMy1`r#3~hoGFu!d8z+vwav=joWa8r*g~EDYfwEjeNKViU z<$|o)gCBnhk=@(Dzqw)3UVOj`lKzVMbt<^fI_hh%>uF2ao%JLO0g&lJxOXoUG#F@w zooofpt2!RB-?P>%dLy8^1EAZ(l>?xa2iq6DK})eKi!yR2T)l?d$WpYYEDS+S4Q|wE z>bG$Mv&Gr>%3w*U5Q*+j??|0EY0$i=A?kACV~wlnlpNU%L@|cM?*v8T?^C~%p+&4- zyqX1rB!xGZlAc)*6NfnFZ)=D0JFgw0vQK;L37GB<#LI*{7g8r04OS+!Okxd&Z=xk> zp|KKUicLEt-~p)k4p^Z(qY$3!A5xy}EYiA>_!43wW{g8jRiOq;>qinqUs*$CT2G{J zZMd>yM9GDnWVxnKCqGqhUW{4uW9pLCU!S zaTspEn9Y*$r6Qrl(!s_M?nd-bKyc&IIW`2HP$)ehfYuy<4QeX&I7TVA*UZ{RVF!Tf zA$u%=u{XrO1TF(104xZaEvN~T9Z>YK!9!ON1cIWD^f%rI%nhzZUSE8J3Bja*Vo^Y# zY+&kl?zB3A@G1x{Xy&%VEGk7)(I&bW$i!9|pfJ(bgxcbjiDbd_`))QOBSV5K09vne z)be1b>p$l!q!s%W_EnEcUJkUDpQG(X zVqd&WNbIZM;}ZL-gg!kv&=E_LHQuE91|wuhjpD zeZhIrJt}b!Yk}+@lUa@#h-Z1YSJD;sC3uN_ttx+TV!Oh=USDEg?SG4Xt^SIA-S}T&U-}`xU|%PHhJ7hs zVqfEmr0VQHu&;bVD-a?l28n%HUt(YD-K=JRgME$uE%rtAPq44Mnv4H}eX0G1edQyu zFSS3zzO+dHfPFoq;;X;JzDWK7_Vwlp`?~WJ_9cCVeck*s?8^{|eLemo_7$X5fyBN* zf5g7V{uTSWMt_BUsjCv3lU`w8>G6YEzhGZ;ZNFh(`pTEs*ASh{74}un5L)Ozx2$~K 
z>No7m{R;c4&p~2e!B^Oqeist^^62@2ec6s&VP8bQU|)7WV_!{|*jEOXg3XvV<#!4z zR?1y&g1V}@^#c^nF_26IVa5DtuMi)1)xd(wq1>l|_f7m7%T z#Kl*FZ5K2KG{ZRtOY&8Htp)ysiRh+c{e;V#yImQ+d) zbAjGF-FEk+*{x{9-c?%qIHz_>P2;#XNDlR}25VMgl6m!2Jv-{ac(SR2uU{_bPOUB^C4$$AanIKf}I0m6|^8nVJ#vP@8;mKVgrdTR>SP z_+T+pMkJ#|cc+vtCo-yVZ#0G$LRi(5*~2ZU7j!gQ`y2MPGllUw4e0vh+che*x-0DK z;WyZdU|mB)5|{fk4bu-V;OJw`ypOya=e8b*){hMGK72JhzrT}=zipPX-JcyLxaCHh z7JJ9*&|Ly^;me~TlRKN%29n>p?P!8ZF&>w>E51`M_SO<&=ky^Ark#Vr@iAs_fcsf1 zclO(+6dbUhay+-WZEVqkw)#YD&WaT=_{heEJwPTBBFE(W65A@ao2#Uv4SzM8g2>R) z$!@F7b}bJxw7v^R>05>Z?%mFub9!9%wa4ft&*483E|0M(s>`_}|ZgWEqhFXTD0u9N6WTu+t}_JLQ0={TUe;@_C+noykX%LGWG;Zfa_&Q?8|8IZj5$zp5rxXRo|^xyD)R%Jd3-w)I{Ix zdtOvlK5nMbOGqCaR0#ZH2LWvnK0!>@Y<(_yw~zAf6;od$FsLj4)y~5zN$KVx{&JOs zCQfdu=xzR(imzflHn-|yz6PfYAJXiOfg0yqmK3&2q6nA<#_{groZ#4%--InRy!3t{ zCKkpOXzvt48P6})i^RTW2kzQ@!;ef;84~hKi4^X8S=ayWV28HaSC}v6)I)Xlnf%um z_F@R}5%*?W!skq%8aoANf?W>y4v5zE4Bj|IBKw!Dpq~v|C8jLCSw*PMkH&tVN!1(j zG`EvjZ`tx3?a6vrM5-Wm-(PZ$;LTAPLF2URtZ;!Xr%0BF>0__f&;>r4uR%5K&H>2o zg zN-y=7-clp2>TQ`^fBU3vz;fY`pRM`uFxsz$7w3G5Sa!{}l~&=&;^M~-S@`{YPivX= zY}O1~&$f1+fCig2wwOMgy{qifZsz;GPn`Z`r6=liIOf2kQ~qM-3j6YRYIyqnxD_$% z*_e9r$=*nK0ibE^#*+zUPq6!I&%J3 zrz?Q?5dP3O8x*X2)@OkgO`jtjwndXFJPIt$p~ppHU)>gy&f$Zd%97Q|BEG29SZzbq z9c)gpF$YD~xnW#dtC~H;^K%XcqDRIIPGJS9Q6dhgG+PE)QA1Q5;300s5A~y5Yl;*j zw~3<}im5Hn(-xs~ggKdnHUTbFnVW|^rvKh>`ygYjt5c>?1xkTLouSXyr zo47>flc$P+#Ejq=4$oP)&`u6Bh=||rqt*%NEq)&WKPvkv=v9X-D7Qn7Yq&+Hu4160D97>n9yEJHMgg2k$BH*ECecW`CS zp$`Lrat!%SF9V2u+aq<(!5~#IC(xR3vSJ5E{0PW)|t+?2ziyBnVw{1{CO?1rUO?rJ4Y8 z;_|u=#-FXOkDf5^oD@?5O0sv9lkM)M+`E^gu8AGkBgjq{aEGXrN0A~QKDbust;5VUOSJ5=~+t4LN=G2}b3Ok$sZ*7!j& zq9hEZh{#GWFgU+z4sU3yJ>!iw>v@|b&j_0^1x46s;Bl$!`TbHGWanGB$bg)aFU<9x z0htv`^SL}z+`43zw$lJy0$|gFq)l+_g(;l+9M_Ep z77CrHi)0h;KQD87+wwAp1L$5BBMW-r71;CErTu;C%aJnF(J`BsU#tSgf}brHXPpIv zJcWI6WDj9v3BQM#6P;8}#1Tcy5y|PeBfOrzfRcE;=DI?FSRZ*Z^Pj39>hNz$mK3F;cH2jIL6 zAd1Bg-f?kCeJQ*4a-}Rywm$9cm*C>udrl)}_JAQKeA)gT2f1=O z?M6GXUv@Dv$sjkBPI+pQA{U$X#ubQcJ-kENiDF`N%|(xVJ7I~|r3O?w%A1zcf|APl zyu5{TNVCjIbw{#mT&gQr*wfZ|1puK5S!yXx#~9s;c6rl{v5;AY|3QJU3lWSLv0EIe zF53i83Tz^A%1br0Ydk0UioIMIl}17Fb6UKI83>j@ZR)|02U=HgyC+|0LI4^o4@S9UBLV zMf{F|%YvHkffKGf_*)0~ZWuG4JF#p9%R84uw@S8K`beX6z_EPDtEHA?7DoMfX#Khl#WY-2MNRA|m~R9TBh6X+_2KnNq8`1w z+VMOOTVszmCol{=MDQQ>JIR%mDGnubAn_f`uOaDdZe@vNF$&Ikg^$);&UEU#;MR+Z!~!%o{6dcw06) z*6cecu5OP-pCGZXfU4mKP~p@vk$Ou1_Cnt=dy`gCeZ_PlN=Z{*xQAj~s5RRNQ%4@P2NOe%`^JA8*Bc$RwKf_J$jBTXOIha<7C zyARc;y0z=-BmWlrYKWr~k7sL$M`B;14GFU1i7E|=I^s#U8j>u;lkFRlU6I&VLyEt6 zYDhzBjCh)Xh0p_)dnpa+uf=E1+X$xN6B1FvU&)Wo0vJKG8J`=n_%x;9;*Y)7xlMAI zcg1ta-_#sNQO&JNop}4FjnRDal_NItfI!T?Ntf^mTU=4QegiW!9pbLZxy zyeU=*8_!+GUHqKOr6mOd(u|~u&7mkGpkVGJTwLAkp*OkV!vy6zvh*;##5OjxBiw;0 z0z)9)bBQX^Zywbwk6ubh3HdM^0w1B3bfksL40qse4YEyihz&xj&P&HKhunMDabLb; zlZvaVLo_yi3$BcKU0vtnB(^U$0}-={4O>un?wTwU+UVreKFaKV2rHIJY{UI{+Q@g_ zhycuWclr!R5%O_5Z>(xkpfJv7#^lUQk-z>@vg5VX+xr~^ z0}uut!n)6L0jG(zOW&l#pmw%B)gqysl9O_cq-J zVt9u~WSwH9sPt6TIZdZM0r!(Bf9dJQjEBL}Gbzn8&!yiNHop&O_ypI=d@Vf}aVlyE zB_1(g`g?FlNs*uiChh01RqJeqG9KztEvmNIoXs{BkMg*`_08!|G;wvA zZ}k3pd_m>qw$u?uE^6+%@a7@Gw^!q}Wu7&UPTmaNzi?}i-5}WH==mNiA>%$gZz;7? 
zX^}MfTEWBpyjV}_sAbw-pOINJZN*6N#si_@F#bs0+;E*ky^Ul+!iJ9o&bsbQ#1gve zpNS>R(>_u8=Y?xwKK)Z;@qmt^Nfa#H8* z=Wv#!7I68M$Q5L9hO6!NlyE?8Z1clZh93^S zv=%q4Kgq{crg?~>@4w;16VEA9;Y`ES+0c=qQ>|2zF6pxWnN-Q#y#vosq@f_o+nggH z!svQb;Hcp(=OZ#DyM6m)?Md zl}{#{a(!N3Pv+Fu&T2G3s#6+23Bj3{-yY{`>?zHZm z@^R4yII_O^-q;%)rrk+fDsu(6-!MKsJ!4qQ@c4ZCB73UxjtX5=&FBlTB-e|w!nQm9 zG{@7n2rX&}6ho9L6YTTOsC&N9+t-{4!3uGj10$@o3m)nACy3?2*i}*;uzTLxOu$>_ z)~(}nJ0H>hk;${YF1qLEUmS@Ft3smNJ+j8y zMFxn0&l!t&>NV%!FhwfZn>zItAUsSGtscZc{ zs2Ir1eY4C(4;M|HFjydQ3D?nBky392hNPrf=lCP2dG1o0^#%&drL+_AD&o}ykrW%>;4%kCs0e5Yq_g9_P0YcKn+_B{bdM;$u%fJ^NEVaP<8k(KC4-pU zrTV;&#T)3ZVlwfO_HapL&PcM5<66rsGqK|FZ>(SaYf0az6_v) zOy-ov>S>Rms%X{hTxQ!q1yWyk4SN~l^igUxS>zb6#Xz3td-h;H!BI^=_$KjNc!r3K zyQqIPDruZ?MuyD(Ti2(InG@A79!}{ENgf68WfTPuxPY&_2n5+IRTxVy3-tP!VWgD_ zC+?+lXoXmJQROb8@s{uFM%>n+vL%GUj1REbMwSTt;h3E56pU0HMGOaPZ@FD6G9}XO z^I5tgmBnwX7)j?Rh8o9;M9GD$liua9Q_Oh3irCMV98jRU*vqA=CoF4%7PH|tuNfvO z5cos}5?g1t->eG{F^cRgF(kd;xhD|8-?Z#@R0@2cCB1}QVxwHxe|Xi2>94@uPat2C)nENkf5j@l{3FBD59Q0 zNS|iY!T&+zb=@|cQg}C2Q7pQYp*3kzY{yN!r5JTDD7IHNx}*2zOVOQitqi%qu2sFr z^rZksN}h?*^PDKvM5w@)aEyBwbT0h-5%VTt@*}U$t+JZO)_V7x&Sds6h4hu|$h~h+ zJR%CyX;J`PFc^8XO_+WwL!-T2&`wqE{vIK-*G~7Ps7N-l?Y5$lVyiX9o*(60q26^N z-fikI;S#xwm=>_`Ya*5fBDVPu-RX8uifP{`x${=uLCuF>ZcxlIn-_@n!zfnTS-+1^ zKl`Gs=*eCPa9xVX46SXGKki|ye}gThn?pdbu1HoyAFP6wT0``SlP%E9mbUni_xT9d zqn$;x>s|yMXQ@F{??;<=Pjx_{C&7(PTg_zlCU%~XG5Fe^utezHE*^6YeM?_?i~5}O zL+=zy`+ZC%w<)~WBN(U|Po|}AH`FP;Xh+Idj!5~c*R0@5zT(SlqdM~w%vZa}^%}AB zjv?!@eFwZYQm;mRLD(ad41OC@jbQQ;G&%A>o=oT(m8r*i)jaC!seKJg>Gr*>^p*v1 zEKaP+%ncofajT@;;Keh#P8u|Y*pGceXrjy-tp0J6Z^|Md-`r!Hu3Ct#?2ub33OGINeBi>gkkr;|~sNQ7oN(s#+ zAm#Z=7404)gKN!R>eb1V8X3}OA+ceY`bNLRoT$Kl2^A5w6?1RB|}Nck2~?G&e2^Rf#Q@crA8xNcnN;cMt~K z%_U#|xd5_mF$}hHE8N>~RH|2n=&}wKvyBEH&ZghVeC?BuaL5j<0kk|luCvhF@_>-A#g-__=&vonabEIG)MFFS z4$&pL5WeB9ld@vT)kLe?Si+Sb2jJ2OJCnkriG`!Ki?Z3~VoMiw@%Qn+oX23V@Zm>r zx7MrU(v}g{Z4hcP;xv>H(YCJ_E%xm|`|A5qFXRalb{GWROWF4rxCv3O6OIdS_hGn~ zA{njN=+XrO^d9_82aP`cn4=>2xx-E$QAHmuFP(NoX|z0A2p{2Y+3J+b`kGZKr@+yg z)#337cE%{>kqr*wI^KQ}0FMrZ)**+e$cg#U&fGvWmKF+G`T$)!m|z00rd)o#2||bA z4sIdXTFfDOfKRCQ0-K$_;-es8H|2E=G*bAJe9i<1yW77)I ze|<~UABByJPjDtjo2ZR@o-?<%NSm8OpxH*fD?oD!-9-UY4FUS3Uf19P3EbXduK=kz zzc6;i(Z6=byD3i_8aaNuV(l2(X9=X%{c>T(Pj)a6j!*4ioX1G81{=?B3J3l(fzO1Qzkmu$w+Z72 z0hh)&FQ9BuR-j^}q-9mx63%WAONPZ)cg`ThXCR)ZL!PnA;KoH!PqeMB&-D$;^%@&J zhWku+jC(v@>aB;ES6pc&fGYa|<@Ie`ySIdoqUX1GS)yNV9lplT(8N6+r^47rHy{ug zjio*CkUbrjWe}oZ@nB9Jz{Z}~p_q_&bSces2V5|G#~#I5j>4vek?B^JoAy-;eO9dbPlMN&hGulH-SxWy{u!8nQI@x&M^3LFfcvfDd+YPo!D2d6~< z@K>@l4IA{cRjUJ0T94%cKax&;4=d zx$<;Xi?a5-a-_rr!xMC4o?0qMOsfrpQb7J=tQH-t_fZ5A(;E9eL5FfQhQzdpj%bbV z6V-8^>c^jo2s7p_U7OL>UZ0@Q-CS7SRw`aZ(SRx8JS>Hk>0B&T0}P0uFhymDW3g1TNtSb;Mn{t?@n+0Eo>I=?B3Vrl z8TtETu#+3<;eD{v9<6XSQT08^U0o9dFGYPhQD~V$*wiD}TO31{gbqM*DS2MJ9HLwx z_02Y1J>oUkGOZxKcT!!6HtdGcvbTGJx4@ohp{S3&HQkGM%wG~&PM}W$mVGitywi5h zs&BD)Ezw1I9=N6W&DOCxd9ol$t^8zS<5GfLAY~Q(+1z8+U=O_NejN1}f6P>f4$9NU z>Ru%V&7(SqDGK>s_6}|vCRHj7=L7<#sDb6!)!p0UoDPts2-L8aM$jY&YD`;oxN$V} zW4DV2nobAZF(yfMnlMidzAqNIWCe3J4tbm!@?<*1pCUBCI5Z?RG@?HF1Q?cJeQanQ zre;mo$JF;a^|$rs?*E##=>~$xe8jp3)!aY5n#ohUjr{55BigYohP_kWqjYV_c*QS5S{c^CYxiOi?|Yr*wHz3 zW)WmB%24XI3y3ZU|2}!@F6Hwh zrPOzB@n7He7G>){BS7Nx%B|tMO3>X$rE@4lo9ef4%P7ADk0f3&(7Ck{rfyMmn;c9J z*%t%I&gEk-fS;vw%|0wXU&=|uaSc95D=opsbOAf z!j!g!Hxf-9%DbSMMkfVi!TZX9y?Y{Zt*?^4=3Karq;VYO(dr3se#I!9C!;>1QS?03 z8~p@BOQ9i&YFXX7KII8QclpUThBr}Eja8 z-(@t<&NhFbYWZl|@;RerbGBuds`bFM^=n4!x7k(%6&%$Jj+qI^nS&Ehw-KAQkt4r= z=i2C~+nLPT*)rR?=GytFJA}+SL^C@i=Q?DmI}wUzohq4~8grdG)LjNYr!muW5Q)H}|${jxIfivRi7P zBXeLcI-Lp^tNev(y7$sfSOg``4+58l;A|eMi2{_2b+bxtMzk 
zyg-6rBQ(;Z1kXk;q(++OXa<@nIiC%@qaNj<8NJ>-N`5gQ+B~l2H^y=?ib^w4)6_?& zie~-{e4c}0eh2LyoQ?uIb%UPtoZ(#toNk2vT?PZq0~);Z9IOY@H2Mth5O@8iEas=& zpG`e(e&s?#wton|XaPRrTz>#r(Uz`S;H*=JKTH zZ$M@P{H7h7r(>Q?CHT$EHcx%Hn5?-oKi)hyPxBrjz3_Q{cJpF#_s-nc=9zEJiw`dr zi<_tE%+b!R=I~_RRY)%tHqTl})1(9E_Ae#{o_-Kr_<%{fJUahz-EV~|Yo0C(33<(O zWPRY1`N%^1iRt_7{qG-d(63lzO=)G#KYqGqIzNe^XBq9GVr-$4`d#980huglXE zWnsYOXifl|wr!NyI9djhi zMY8jKM8D-Svf?ee+$+54)I%?T=4)HnVmQvi84y z-`$nj!))Cnu%ID&zDsU#NQVFkTYQoHu)SoyLnC{@^n90%Zf{b4?^NcH%Kwnm|A3F~ z$jJZLr1i@~!11a1vAF*p6tUy+;lSBq-$VXT4Y3o0_>y3;%hP&d@chiN_0-k>6wl&o z0Rp6H@zvk|ThNE2D%p*%SsODg+ml%v&9s{}S)0WRls3SvN%^f}+V9V^)`75KxoavD zj=2m9fvrKfR1&@*f!xk860X}#p#pi`QPh$?=ODrS-Z*BBbg3{w7DRX8#_c!0SVAub z(uD87ZweDC9C{}86zQ|I^@DG%JDr}KDYL2Ku#jfvgnyMpum2pw0eyl{v?0uE3{(prddx@*G57#PIp8*ofq z3AZkKJWYpDv`4g0*Ty{=qgAcLShITd9@3Z9n^+)2)`6gCvljtOD8&K6AVtU$F3xT0 zcH<|t495|m+bczCeobL=^c0ND1}kxu#d)ZBzUA2g;?be8o#c$F^(++Hw^tXe)BrP9 zsdp=T?J{_2M$y6_45`^ado)y#o8gmuG;LuC>W*TyEQg>(Yt|+XFlhO5tUMW_K4!eD zUh8S0q5U1j(ZZ6%(M%AVw&J-&t9u6;&S8$+YV1}Uf1rfsmOXy*lcDfu_Kz3B@0Y#V z2?tX(j$&`!^6L{!lUSV~G>Hs;kT=9vMB5Gvv}Ignrf|qcEcNE^f+;e@C(SoiA9r7T zi4b4Ms&x=qq~zMH4qG4LEIRDF=G2@y(|^C^Z0GH+lT%4nk< zJrK=^LPtoxQpUZ61xK)!&=unRPmnTX?e+I|h*jNm&u*}ZBE4X1JbP$ky2am1^M+AyOEPMj-{9yfQ%VQBnpnakkNyyVK z7mkxy_#t|?^SagyiDZKKz_E?EW=dn?sYb@ECy1iq5W&9m@de!{P1>N>Lq415A|J`| zygLQ8w`)JhXn!o+{qDu<6ai^xZz3QteC*H|?fa!arM?KHG!p&fux3-+_Xy4YV>lP~ z?O|zcMWH(1ljRtM$w9Je%+no4x6cfIGt3FNeqXMzuMaW)-_J4*o}Pb05WoQ&ul-TD zvw^6pa8yn==urS2D&lKA<`5tBt7Nng?cK0I;WnC;&1_6r_M{u50H5^;T5xxqJii-@ zco;OjUB6yorCK+Dzj_GQqBoV|_MYHVq^K30-9e)eg>J5&gXf(}NQbg7Au7cnNJzv+ zh{V1O4^m$!p;!yT8Ef24{KRergz@8 z9{E6?@BL9IYD|JFyRRPK& zerr4K+Aa;jF`|qu8^V2*{?0J{1RcQwJAHA5xr+FNtT`O+TWbA*VjYx)O14|P;u^P} zeo3huV6paUNHbOEBg1m+W`2&PMS#8So7%}N=OFgdODJ08?Cb1zKZ zI7Yh$3;BHHw&&}CX+M1N`dNr?q4Dy1(?jK9B5(RaP;zLB%L~P<*EV^;$b9kqbM_!F$FazR-rOs<=W`+v6Zi>=%z1F~LE~c7%HPQ63 zz?X&`JUr`auTDqPSPNe8%{$Oqhh$Xl9ll_Rrc486nT2p>|ILRFu!oqUwOQi6I-E@mZ?#&iL*7?$2G!*IZEsss-O3-Cs;RYI5hMIE+;ePGSMcej zz5QnX6xqA7R;^IQhZ%G4-`Aw2yefP6$rB;gcU&7-!JjW~!!Vc8i&D>MymP-XBX&3V z9qY%V2yd+jb8yi+_HC`+C!$mzPRR;dKFybLr@U!7s!VD6N^}2dn}!9-x37I=ZDem?I{0BAzjO-pz@=UwQslY)OnSfUc#N( zH+ixbDCI%Y#MX}+>JrL?U%ZamE^&A?GZyhIGEWQDL^uGWYxVfB_QW}jT4`=2OQfz@ zddn@Qac(tOw0=g*)wI{r@iNb1mzS%>x=)u=qCA>Xy_>{fcXIdRs>VOE4SV$(egDK$l2`of5T}DN8Q( z^04xD?xqex!nH$tO3FBwTKjVVx0l&o+^yzqyXO*jR?uO7cw1U>n?*g)c%kKE?FZ>}kKy}t&t;A}WzRPJV$N5%{7(m5TMlL= z&X+Z0XM6lxj$X%HY~A<&hKKle*y`#FcJn`9;cGn|9zI_S@V_80lR5i9fH-;U|2@M< z_S>$-ov-8mi1Xc6M4hYu2^++)0O60YhhT<6u*4n!#UOYTTIfX(Fgui375btJf*%Tv zLxYm`LTT2Z#9mNJc9?!Ol+FRh77E)VhcWfSRt#VqNbHL}V6YCx;}Fn)028{xzBq3M zh=KxNCIm>U2HtQ8ynY-YUla%v3RGDO)By!eCI)J#21$+w>V*cGfPxH+gYFdZ-6{&Q zVh@(+4KjBKZa@jXi^RUxptfGYuEuPRy}=%;A)vKjx6lwa#*l}-A%5)s$TK;Vo$<+9 z$XekgtmO=mNr8l#(o6qdXgf3w% zA2)<#N_jZyUKstFU#3IEo8_>oUcVf%@PhjAVhZU}5Fo`tl9@idvM2((D6FnGGAAyg zFV(L}HGFIwqqQi!XBxMr3a>pha>NVrb~BI}tV3q#Q+RRKL~aopU`_tGI~c* zl!yNUYgxt#Qu<$FU*1FlSc$PJ3AJ|<&Os=o@bFR)n#@d;O4w5cwZvN-2qI;*B-50{ znyN%*q^y;VImeEp-;lIxpJZH|Op}vjU;N;XIKG9qWGM(`mOUIPYuT+Qucjp5Ur*tZ zkA8v`o+}oG-w(sE>3^Z9f86T+j$aO%@K}JO|MT#ukiMJxt`vJmC@2?pCbn7 z3Cl#W%+LwT>@oSp)*3wk^r&UdU}m<>q<45fn>PuWDSnpF_iRBe>mDd;X+7nmN!Dgs z(C4tMd&OB>9M84avUa_b_Qju{&IBAeK35ETeukAT*o%06u^x}0%*OPAp{i$JbIb;9 zWK$PqW}{d%4JgrGpgrumB3iTbNS8) zIs0>kZa<~N&dXrW6E4XsUC$Nu$rFE*Cx7NEvXSQv%2x=_?^nscfgPffp0DHcM5907 zel1`3UV#-`fx$+Qkx#*$H~){ly9|qJZP>;?Fm%o^bj{F>fHVWl5Yml^grsx|DhlWj z(v2b#(jg)xAR!10CEcJRQqm$SDk1P+SUc~%_kBP2^S=KN@3-}BeOlLX9mhJ)^M{cy zDPZU+IHy_In^It-dBe^OFuoieqY5>P3*GpO+6W6hpJ#c?7F~HBcxf-!ucYYuo|#8a 
zk>pAd#-;cyS8+&BaN4Ef*tEd$!eT@NZ!54&ebv8@S~amDJ6g;)W$j+>XCuRcd!%Uw z@}KS%UG=}RNRroZlzm4PTugGm`gUrEbEo<=*pB9y~U3EU!Iq{4Iv})sswtg zgx0H0kUkU@cz9Clp@i#0spy9?_aDmjK9pa7s7P9k7N{062fOJ%;P%f0U@hJkcslFV zdZaZ50yVnp7HZZm8%m3J#uRyh^Z)77pRBi zXeo-ioJGse7!`@sD`ZyI^Ci}&Y0=zThuwCi*-Wr;DRgBtrYTOBzQFBLDp&u|oJuLL zK~~h|jtfnWD{S*=qXf>~d^w92=f(;-Sba22nSY~1(Bq=?hR$`Weyzq)CaNCNrW0&U zXA&W+wJ_nE37PA#*|U$a_eG3tn&P+{pYA`Nztudj->_lcJSOl2;oU5q*mTA~V>%ip zOkK~+AgVBT)byaSx$Z@?p}~`_`;BY+&EnoqHX9#5HEH=E(8}oDLdM*>)7T{2`s8fl z6C_ZK322)3ZZ$Ay?UHLlxV61|LCeM5#Di=-p$!#`X%e=eVF`f2`&t(16 zHmp!-n`dhqc?ON@7pzEWJ0-OJR3ELRHVm18J@Utnb#{hsonM3$j+1fOcQz&P!x;AI&>2KW+&O=nU$E*b8== zw03&3cX@etpP1{s_N4{$xWiho?J6_f1nE-}H`d#N-9dd_H)2|iW4dR|=%Qm(4&=I$ zN}t*xk6P}Pwp>eW@ttcZd;IjF1vF*jsVPoJY-zXGG*v@Dk4tAy$(IH|`YN}et90wD zF6|4+u&J_OWZCF#B^-LJs5xFz&c9f>;J|`-i@e z$bAXuBNrNm=?v4j57WgCGdvh(>K|r#GYlsmVG|nR&>7)!AK{4|;d?M5&_5#dX5<9< zsHo8BNu5y%_fe_XQJDv$a{Z(7Z$=f#$IwD!$~t4J?qh1PV;T>}wED+%-i+yyj~fV$ z8|fU4pLQQV6FYwP!MJ(anO6x2RFJs5P*tv$?28@!CN6wUO@Y(;lzS#JxUS_S$^lwdLk(D~cr> z;UzoWrSl$3j)37@w&Xgne1a@_KjvSt5)<-pD5>l7pn=!sAvV%T}TXR$@2*A@*hPU&X$3*Z=>ZZ2cs*O3gTbt!(`V#8&^mPi+0YuN7|S z5m9q45_KW6$~Uq+B@zP=TW=#(Y@$`tVnP67>us#&vpBQ!@tOe%)prxZrxL836LkWT z>SK~Jo+aD6-!#m<8MSrO#QXM{u-lKG-*$eSiYdMm|Kg6@#Waf>X|1=?3*V*NL}xTi zXZl>dYy04C@@$q5P%oItzH}|eu`(y+eV%(rzWdGm=2!UvfU-6Jr^?p7|2<_Z=YL7r z>KJS69a|U~d$BQw-5zTKl&z$oqRwZ1 zroZvg+gB|iq;TEGb?+9slLf2>9@l?Z9?H|o(rs$^xIXpJ_0>RA|N>pll@q%9ikV zWlIN8wpNu$#Ez9MF#wACC(4!v-cMy~E8#cF7W$jAHFT_OrG8bmaDOUW4#&zC1)yxr ze^s^ysee(n?5Ekz|CO>e@*gW(fqzi8w8)V^m92?x=7*>~!RboWgguo|TWpEIkeGnf z$f{Vq?q4Ze!$_6#E?k{sWs8dq@m<-{$sba}nEhjA3-Z^>mi50BPd=}_sE zez5wXY_a}U+3NkFY$-1^O1>g}wd1mU|H^bOplsd!gR({RUD=ZQp=|j9%GTUZWlQMK zlr7w!%2qW3!TTr5*6@F%Z0-N2$`<0QvQ_e1WlKNAYy!l%-tu2jwuoab-o#@9J`mpF zeA;HB`R@qtxl&$jb$`+`k)7q*@qZwZ|0pzQ)RsS1hOBztN0s&>~<8PI%wf{@Xmf5kgmG+m) z*7|>@Y&qfliL!6E!Tj;OKmJy(Ah5VvyUHPGGUHqYJP3VV^ z7V3)&0m>HgSlPp)uWorxbQ`zDh1C*`VKPX$t$I2G?r?NHqRoT-2du7WSP_{zA z5tu(ywr0O8Tj)PkwqSV3KPX!ofU-4*T+qY~hdx{!q4t0M2GadC^$jq9~Zp z=uG|50{TFN^ry0A|F_Ck?!QpBI=?Dg&>zay?XSv~^skgH%x{$~{a-0tlZ1g=#6yTs z3nTFWTa!4V%NJ~Y`Sc^ZFT}0xp!zNtW$r|dE`L?FK;(y*_$-J(%BaA7>A+S)$~)rt zD^rA0)W0ZO_J5{q0l!oeXVD}2Ka?$_V`WPW0}lL+vV{SQ{ZO{h2(Zr>VZJ`NFZkb3 zwuJTnplpSIRknukRDUR2X8%ChLVi`YjQ)qpR`8!FTgV|iBh3FTWeY}*`%T%R`YUA% z`AylvA?I|-?ZrVXZ{_PV78uGF7@HNC?B%C#QTXgKFm_$1g8n0AEAX4Lh5D{+nSE2X zK>u3VGW(;l1^U;@R>!fjC2Z}f3L29zzcAIjFy4`pizZ=!-Umg-p968@oV z8C7W7|An%}Nd7UcT#xZSQ3u0^GxyuYg!3W!cFSefPWs~ur2HNFhaiIfXWx}AVJ`vu ze?!?C|Gly$_Fqx9BL7a=BKfLpiTq00>ik9768>)~TcAHvwyepQhe#Yif2nMtLF79q zZtFAhxxF=35|zS$vZeK(C|eUE^3MQetLIqR8b*9qwlosLh5pYgTf<+Kt+pS^76}3J zcgmIoplnIv3;(6ECH#+-E#d!$vL*IY*)jt=xsi`pm%l{(&y=lTu+D#=Z0YWM* zo##|~&=#7WYc4;uoFFe6IPTETZ1kThTgZQ`Yzh7EDqE0$q-@!v{Ihwb_f^@_`M;!WEd>3&vQ_y_+2Z>XWeflR zn6kyU<@~Rety_%Wm922H>rwxfvK75_HmO{k;dO{yrzDX`@b8qZ*RV@1ttT02IM{m zC{-!a9eDknvgOI1%>7N-;`^PlmHAWIiu|T*DU>`q6mUNLrfjAEy|U$ZtZZ@pQQ5lZ zv_dF}vAugVFPz#;c5O&^SR|X!*%uD&JQ2#bK}Gz?PKml{>6)Nh&NY-oYZ)f?r?O>+ zg?Avp-;^z-Un^VaV`Xa?P_}T%Uan)yN^$;7+4B2SWh?FPl&vTbffe;H%9c2wYansFjW|}ec!Nm`MQ(U*x8mC$D_hw&gmiwVZ0Y=!vi1I(vgP+p*)jo? 
zEf-V*-?6gg)!N*BtZdwTSkDgm0S5;*)sftvV|ql z`&(rz87*fvOZo3BTOl;Z$`;-aWsCF=%2xQlr)>57O4*{rqr(G~EgUTMv9eYDP1#DD z1(YosLb0#P7Uf?lTWY^3TZDhFY_+ufqHN*(UfCMwJL1ey?nyoo+HvT{v7zd*udk_@Qjke^a)0Z~I9Z1mfMoL}r1B*gzP33tb|bdl|2V!(|`{eRk6J zS!P<(AbgMrhDf`9S&>pUQq~)TBMt(w4S7lsY|Vv&AKMUIj-c`A+j)XOzJs^pfhKg= znaDs>%hqB~Vcu~XZm3O87m2I`ZFWmg3&AP@2rGvJNh=!w#{o#NZv9-UeU<7*@|(jOm+E?7ZZfCJ4@7zfES_Y-VNQLYr*}1uY!>r0uNc!xS;ND$`*!2y_SkkO2J3whq9Ge z2Pj*lZ7N=9dOQ#X9;W{!8bJ>t*eJveIC)zC2A&Gp(Nh8JL946_2Xsm<@fAm+6DkPQ zjT#N95hKM>BnzS>4I)v2!ls*MIKC=dG2fIej3S_H(Kj%BSGF)y-;}MBfU=dC;sYpK zD%BieeaFg{1dpIB!cP@3^h4QV`l@VgA1hnRfU*@Cr0k15CHqs^x+g&vBl#!F7UmJh zuazz6v9cwe2!XV~&}iP1pzq37@voIFl{_EZtIYB?7(m%Nb*yY5>QC?{u8byL6o(!w zTlMwkoKi=sd4RH&NDnAme)56lHT9%Flr1OR-z!`6O;l`3MCzSlI%#r5E}@n*UtcYLl)0wX%f@BE*veS*zs+d{?$W z$I6!9H)X3TNdTGzC|j!yzF)%1z#u$c9R7fY3J&Nm%2r5NI2Z&LsnpGuEsw6y)gP$J zt5u=L6g_O9*BB_mwz%k5;eJ=PK-I|uMfY{Dax~)X;2wqa5%Vfcko9+P(DAbqx$lGp zr-tjGc<%DzIjHyFw&=&#$)TYK@z86Ijn;9O-jD`}EyS;gt$Sg+=JSMjC0b+~b zk$}<>F<2^X!hW=g0ML1-KidTJ+9ESuAiYgEO39F-oq&$W93VN{(%os$8F>pFhWgmMV$JNA2XzCyc3`T*ki2=SAAjn0yV<@%;lbM>% z@nUdh&%n;g6pw0#Stfz(X(y@U;k!eXRIR6d zM+@?pVgcw^VhfmWEl*|w(dIQgE_gM41C+!S45Dvu8BffUkCl_2PPI16=oar9C+$9M z47ZcHHLkU>tqY0>x*H-J$<#h953PDQ<-*V(ZBRYtjKDsi()pme8LPTj=xNz%J?UiLEk7^C;6Xv4sa?Rc2!bQJVIW z7Ka&0gXqf4=p|p$J7aLHRlKc5S;CrJrlu5mgC2H-;wuR^BO>0>IZI`sQmz{^D!7 z;GSL1+U-hy2=W&G zW4ABQ8HAR7Fc=?`Wnf|-4*|XG(Oa$xB^{cl8n#KSwl`U6$x4@Ln?_S=n}b`WPDroryq;p(#IKM85uHlh z$g{r_$EG7mZY@%t)Z%QVglC+JtCB#@+Y_4I436IkvlMYTyBC_Rgjb{+^tKsnMU|#k z=MpU)E|OZAzC&F%8+2**wUy2$ta>47-H`|phLg~P(~S}3hSs_}i~t|nfLVfgqNpDz z(g+ItX@(N(nH}J^M={_@myaC*_g1- zyUUe9iT&K7ZCe(>n z!UUdQ@;n*<;h``t7=jXxEyola6BA%nG#ZOk}^pU&7)o$YPhf2_+3z*MW}hMXwMz29KHJqBLYvEWoFpJY*Zwj&|x zo!U9M8A>!7b^>ILktc*le;lA*_1TiiQ)xd1noZ^=0C273Z@5;=N&-2Ef$#(OSpe61 zAol8Z`h@j2TG!x6-#1(<^&76mreXCH*TTfl zygl@AEornV=CCg5kelKBifiS4!?hmX2?4VJxE2U`U+WWYs-xDW@3_`mF1$E-nfGo# za4iskYhj9B{KU018(lHaYn-CEW*_~)wXX3%86N@tUc6&mYj(;^{RJ3;JjS(Vr|eE) z&|@0kajl4vPF=Z)!dlTq;5r;|rf7nyE{HOFG+Ecx^W-!B7C_d*m}EVAB8R!~$WxRs z=IDp46+&B#W6Ng2+5*U0W$b{gmFp7;$XX>q*M^Q|Ed}W{1whtX*-2}m3!J0ti2W{W zDXiSPPMoSs zoiMc{4#3B<4m<#PWrJ`Lq;m**h&Mi3ngqfMx-*BtA)zD)W#2)~f~1!_i7X)JTX2JG z82yk#k+T}1skEBB`sdliNeKw3BU+;7ln|heb`a*NGR7#A*3THVl2J>HE5l1aw4Ol~ zK@~q*sVc|Ove>JX5-+Nj?{yY2NjA?OcB}05 z?^&d}b~<18$|?KKV8`yC}G9Npjc ztGOoark5GT^5G(I$;+i{GVpH+*R)*LJpsOCq9c?><9AoAbVJQdh+ND#TFmb$PmJSG zg)N(7lOPC#D;`2Mm4kIec ziIU}a@!-G8UvjVWOpl|L_EvaFWe>?|iG-DrJCdbwVH&&=1AF&NW0GX;A^KGe1fJNq zsC!lYvipM7s%a#GHRJE;`>7{f8r>bIF6=+3Z9659$s}AC^fIlFT5L1l@KUm1Besp{ zMAOH;lR|a7YmFzK9PI7Cjh}z6^+NEhBw%ZS%Ofq=Y9V|S4GT%O6>iAsT#?Q}UBK1? z`3t%R!lO$ayE)QJX9W4Qj}k@QFK4d~O5G`4c#TxA%2<@RfvhA6$pF||N#brA>LD+wcT{fY^Cw-92W+hsw|KYS=^Xsx zbn1{a>5d#Ff@51t7qGQXK2#h6Y^{R8>G?1TYY=O?!hv5ynu7P#`#QJsx?Z2$R<`YYX`Tp(6Lw8vd6(9(En=ve6w6x!~14`81vvj8WY-ngo5Fa zdhR^^TeA4t%Cgl3z}8xk*Mbzk93~PM25hZqkFA!)p67Ldtu`e5qS7UQiFq*(98>6Ttwj<@BA_g0|o|6cHOusRFbXDIK*9t+e&*2eCRPVBPOB z>tf6!1>QDDl$Q&iR_m~MG0P@rBm;6ZJ1=!Hdd$@A>Q`C|BAQh|0NvmJMr%0%wANIb z@0J09zKY(}V_GYM8=$ppNwTTn{Gm(n+#(h9!duh~oTlIz4BAr!ndfTQ8RB%aBGw~- zgITW=JeN}$?x<c00cTO>(0a~kiHh%f>yyh{j#Ys${X(p!g zoz@b3hf3KIVWB1_DiIPJqC+0jS_rghn86+YjINQdv=%!ffjqAgtYqkhSar^?l`VkQ z5-b^xy-=OUdQhw;O7czF`nbZ~XU%BSvw5quGBpjLwXV~yG6C1BWW(ghjax~C(wWOh zFMhDr%Nzn}CtL*!kz-oRAW$6*!AmUQdIuu3)C|^(wr&t=K&2lLvCgT;?&PIk(3VDcD@?{Xd-bO#uKY!ue zK{uz1;EnieTcD7{g4alg#7MrF@|vlbd<`tmNY+cOLtdNqeCD~(V!J9vNcO_ATQFXOGNYm?`6uyMB^b4OuQ`FM6&2nW=i4yuM-;ZcLoUdgnw1lMux@2M$9J

?`8gPkb_U0+g7%PMStOcDX zj_ayh!{~WoTJ}&LunMwhGT-C8yhalRU8=_si&6?XFLUjNcXH$OY-LTT&x1R&IzdxT zZfcjW$#+SRylA}`Dm9WX>Uq(}VRES~mey?9Er8l4(_<>F*%A>_`5rc4dCLb|`RoM( zr7YW*(Ej9P*}K?A2Uey$sdLSJ-f8`fcIm{0jcUTIz8_i^)6n%jv7=|#w(dxN;8pJP zDf?h*B!gXs>ERywNgZ4EuBx(T=JRXz%6JOeN9T5JB??726YMX1lpEWJ}%1b%lNE~K$z zx;t?FTK(I*t)7F1du`!|58XtKHhiA!2VQ#o*$RBP)%Ath8=<&2T7LNPA>6FkQ}N5f z^^;TBti$&wE;A2!9|4Q87M-3)M;~96-Y_qw*0S%wz9-7YHGxmJW_=X0E+q$d>S)$y zkm__O!?rL!C3QCc4gXXC1rgYS#Q;6hHEAr{HrHn*KYoFLyEJIVU z0tS7bEr;%cCaBDc{+iA8drAWO+3y03sBK}(Ef7l1)|1XSkK02e6LVN(d%#sztTQ(D z2H1v(D9KVb{%RYR`XFf}g+c<($9HTTSJjA0UU77iiZbYMG_8_OxFmF041IVG|bJGtBhTMJ64)A%gtI(pz0KZU~ZZl?pP*Ncwqk7Ha6}`xKLP> z!)m9TzJWU>G5rVOLT$>h|MPQX@e@ zK{gYS;Y_n29-%JDTmtbHE$GnQ+mDYp8e?p#>&RVm2>3CrJV?OT3Z^wZ_O*-(!d}$j z6sss9;^-I#LO!{_%HLrZJ+vjD$&={p6|!Hrtdi%Jj}m^~$r769>X+|gx|p|N-b;@r}yS`~+TY0er0>T3-$8D6Dl= zn1sbKujR0JgmrMib^yK>#(_H4!AUdZ)YY%P7BXD3BuqO0{ECVJYJf~(8|{O^QGnXJ zlqw(lTBe$SuVwc9tFL7`aIAnmf6bnA>}&a$WqaZl6`5D01!UKFQQ*F5YAQDBxKh`3P zS(%!}QrDp%+O2~z#Rg_r*?VM^zB#U2`FeZGkQE(d9Y{d}t!FrzF|j(h!nv_>k7?^( zR?n$hCN3#vE;sqLNvylXAXY*0hWCWPfUng%<>=y8qUc-l9yKhrT|~Fp8f@a|bq#hl zjmxwF?VZh~tjp7&?qqeyW$EE!r{LhJ$$g%rkV(+VMcek$2KvMBDN$w$Zx0=S-f}Ue zSaLJs)Xs@latiG<_iuLeoj&%p!e(F!zE7`RGkSlgB;tJ$1bt)>GSC^chken~CtP}O z)m{mnF$xL{4zfw4l`n4I13yX8+(xt&I)#vBck-+himO6TS82hisZz@pmb1giO2LXQ zrIMxh9`A)U=04MUUnBvsB8dp0T|DFVI5@!N9^ztBQ2jL%S?6%uiddw%N6Q2durG(&q34HA7g+?lId!j$psS=-IXPlH|{{g`L2CRbe_ z?yME%2GbUnDsCu+IoOTXt{W(9a&u8d)t+inC|0D#-q}4;hHtMst0cCjQt@AOUGnf) zW-oQ&Esc3MCPdvTBjfg3RtuK>RA%>7DwV#`sxoP*GL;v_jTvE=m{P^*jfJE;iogKd zKv;1Sze_bU#k6%v+*k*E=2ibHhSuH(F=3+JxP(r=abJ>+MuOd@VdDH~Y5wS&U*e z5ET%lZx>k6HK6H-Em3*Qb-oy)x|7|Edz&h&yo|pO<>7+dlqndM%?T4H+jN;lUE3vC zD!X@QX=pvzqdu2lI-!jGl@dix7@7auBDW(=#X695BAR<~>v89?ua!}VDVX*!o5vQH zfv-ZJ(cyS5u@($@#ClS{{Ya&M4e+(-?+nM0Lt=Fp^Ez9h?+9H(-r0JVzA=RnKa<5v zqVo}XS{PPNlDH~TMl9#PE#^6XDSRonb9K*i%|jEkTE2eG1EvM|S|NZJ#_fc?RNcaH z#vQ8d!4qi68%wqNzOoABO}^6qc>rlMASS0T)m$9P@9R)5PLW6=IT9{#wOCWDZN!ZRj^)j>Q^7hGHVJy<$ zlc4IoJmK`%I}CUZ)t>sE4(3fv7)11W0s*sPf{VF)@q+AQA17H=iXt^ zi$^NqP}}9(eP~H^QtWmT=EHDOuOQAK{IZ*#iNR!2RNj_LaEDqFYf?_|lO6&)*E z>5uBupU7l%)n^RKWKPy+zLL2MC|hr3vOd>mJ+gAOn)5BiD)Z!ho_U?cb2YQe0fsE{ z2DB&p;Gw>d?EMf4M4JqMLt&PTg0F<<3YJHDFDDvXY%TE(0Q% z$4q&dB=|O~T%@GZ;abBZt6iiDF2i&%`%sVrzMPz6+mZ?{iy(0R>F={3aM!j_9QI0_ z(-8KX3sUr4^!onub$aknduRfh$_Wj({`80knXCn7u^{75l`B@{!S+&B54N#{*0E9b zBdfvEHn7t!${b#C65X=`L#^E60q|@kZu^F|M~{=}z%0~QBqG>H1?f^3d!kR7qb{g} zQvd07a6`Pr>1$y2)Zv1`*bYx6IJGk8PVjUca!jgT^6@^8co5SVxH$fdd_)L>NNKzg zCt}_duD-|fMf*%#vS+F&yE;}11uk^eP$$71m6PvPYkEm2qtJ)T;Me4&Bul@IBfi*l zwxN|}3@fJ2%^rZQdi!A=fqO|%F5=>I0k0Ci+%}w`BM0|aO-m$;aB5}>T?-Xc>AGlfCqrYn*_$`LeulJP4z2h;{3u#L@#U85nr9JW zM^<6FjoZ(VV|voN0WCTUK1vAoTv3L%Ul%pL{_OhXdvK^?5SW-&+z(=-bc2*Z(su`9 zs}c$0GpY82+MSAJlDV|K1GQIA{6*PPb?7nHn`IFzt_~=f?CKX%30ykQAM6BU_tq$U zyoNPmW>@*}WtGF_CJ(#VL-ygwE~+EfiP*hu=Z||dZl;xXPg5lPY2D3gT%ir2ktd#J z>Ro1c)|&5y^WVKQU7cR!elE%Uz~f{6-aV*MQ3&Y^7UcTC=~~48gXD~Hx_cMNt)dsr$izo@_b`H&^R@7;osL&HPTEJc-I+Y{+UT%$$Y_RlL>&rMf6 z(ew52<~nSo^lphzQ4mYh(t%rDYB+z_ymVbH-IB^;2=2NU(OzCi9e|(Zr*P*y%t}n7 zJnvfRGrSnNb`6mv^O1B%T9vi7<&-1!fYly2Ov>3H<9q;`E7`M~DXE)17D@G#^A__8 ztmvtG1fn@(KG1eVmrSqhTL1h=*WakCrv z?rjyZsOU%f4A@S1b3|}pn&t@4bXaX4$h z*pT{OZA8JkF^?EM9brGlWf@2_VR}CEmJl=gVa>hRt+Ni2_3FO5X^72)!u#i|6C4;s zo)cf1DyxQa_EG?~Yf7Og=iDB3vkhKW0|SG0Iw=SCV+G8aUH#48Zs_Uj4d*|;;D(-E zPQK#YME5G-hKW&#a&1)yu+1x%$a9U+AGATwI;v6+MxfjVpWr(-u zQ>b_Doock_-q!hKj7-&19L4~>NW){mSAW+}pG}!G*}$3Z{4Hp`YQkCd*!O%We?A4`R?z7%|riF`f z?e%tglD9=KW;QI>`?DC`X}ELm(ZYc9a#n~!+EyF6BC~h+_HbdUd47Gzql+Hsj@#SM 
zue^p?HSgNLk!Q;|*>nw1m}p$_sFrC6V4wM0`7GY%-B*0MQuJ~1<7(G!_Cj8BCb6_s zTuG5JhZl;{!JdW9`dST{99%$pKMR?mf5ZgB^Z zE{@1nv)#GE3xQL*>L_jn3B=ASGriijD| z*tuF_;VMziMzgu2%lu$+r&6NCugVr*;zj#5qdRpb9FW8!KITf|K;PRV(~rm$FU8&r ztn1*Z&Q@|)d5~0j^H|yH4X_lun1+WHWG)~O+nEnH%U0%$O4KwOS&Gt#4rQGzBp3V0 z1`Dr#Eq?E;j_bK(@jZwdz-1{ri@c$o0L3VBevUfdZU^K(Y{;{C<9domB_R!}(j5^x z{xh(w{r6$K(m|wdi?D1QniUjr3DwmhJ46N~gqprbA6aUPAQW0r;WIOcHr%FnO~9oy z35u0cpCj=rZs+G`BMf6~gWsABrrttDM>pRLc=Up3p)?2^((Q|w9}3e^!y2U?ii>ZV z7E{k0VaQ|mg6Iwtx)cv=V&WNX8I@_kMnPt$^LA~E7@c!z^Up#`pZn5E8bpgL43G2b zuxXi`b~U3N9y&dXCbbdAybz5!}{=JwOitRI_4h0VkO_fg?uajSIsmU>f z+)%7&y>ZgduGmlhjfYK54Bxa3?U1Pjr`Ryz-mKmEx*!VMX-J{gCp4sENY@@ecsg_1 zhNT%_SUiDLib_Zsav9PtBZoJyl|Y055eQ=w;1sV8G0~2%=D~K8P>I4vw4HC7R|svSpW#o2LKw-rq(wh(MV-?o?#uCA~V&$@qxV|~ODt`HP&OKqcR zo*&s&h@g<7W>YXS?Gjk5CKEQ4Iw6tsJ|NsJIbV=V4QzXkbK>32ylNK?1Q;8TW*uAN zX3KqKh|QnI$jZ94kw0ANxfV3NoYalYulB>z#3YV@Pma}eCA6_4bId~d?QR(glKgg(NeDL zs4LQ6jraF`KFGMLoQh`{!{=mYv*X7~R5QD%u3h_-JoLq_Ue1K`u6fT6=|7c2Mjvq# ztAQC5xOQ&wTSagxlQhT?Jr@0Tq-(33g(&s$i8a8;BVGPlfcr_NPP=!T*H;u!wi zHVvyy#4XL7HtH)EOHX7bn8DgdF8H-0!~?@%w~Hrtr=*{VD%n%nJcCaudB8HJvo4CC zO@9x?GpZ={H5Rdap~|ZM=zuNG?kFtc2q#X>Ws%e`8AsK^JdB|TbK86x-FTE0L7}~^ z^4=J5SdK9a}DJ3gqd`4i!8v8Z)xq%|2Q zylYEEkud4Arl0Lc@Rfi?BOQO;ysLcSlzy|puw3i4z60+M zfmd_KrQi_-s+92!QLLY9>4+3t^hhsEU)%i*Cz4cQBk|g1+_rEz=C9tz!IB1l@K>#e zn;z?@2>cT^c^$$C({$l?J$&=osP=HH`c_WzCy)v2D{^`b9_{1_m~dGr9J_%Cokm5#rv1!lc1z$Z9ixXroHH#TF z$&5=W7}hAr*@>A-sPJ&d@DwAx((Ewvq<7FvKB(;=iX#2p4wLDKH9UR@9(C6dXMs2B41kvS#vwN zp}p!5p7yo#93D2Xmiw1ef_e(GiT4&P3NbqEHuU}?4;B}aV}@AiY>Dln=kr0N_+N#t zJr@L{PKy^g4mmFEMh~2diju@FuOeu)4g+8Ef=cR5074fnIrBgT+%%dLaUqX8DTklx zSm@HMM40IzcrSb)aD#N)v>Xm`*%*=>if~2tFoCp5l*lN*pjf4K$N&f(5W3P@0ilac zQH*^TXKo(CG=L~GrpO0`E|aKZq3h~<9-1Vsqpw2ObC+7#cO##2$^fA&`vPN+9`$5A zHCtT47aMN75jdj%$98X_Yin&X~wyQFmg=&k4rMW%Vr)dpj(_1DzzIC|Jay zV{N6}vWD>I46WDWNG7^m5@!_NJwkt|M`990t4)$Sc7j?9d!tEq zZH#GYNQm@-Ol1Ox|F&9?vHJa!wB^R?A=9@xViNg1`)$rq_ ziIt=)ect5*mRC>?=Z3!Gn227?s6I)V%3!{(vFgdwMmNG#nbx@Mwuf+I0Lu%;K%xen zhrQAtW~ACqLvxl}s~_a3XTiwNjKj@7o+bn=FZAI~g$5R4f}i`qB8u6owI*yF9yMOm zox_m!k@ZNy1;S9Wl}q)mS)3<*4qiWG;yq3;so6&Frc+?OM}n)9d@F}r$n*f67i9z? 
zrJk4?q2ED|UfQ4!w(0tE^8imtub_n{@$}h)oCgjk^EYh@s^blG;^6c>G^fc82PG5@ zN@NZ&@Dds$sd3vACML2MVIeriq#jT&8yc^7#+379s-kel;gOkaI7Y(6@fL@kWSQZm z^JvZ8{8{qA(-h7owB^}U*;IVQagKo#XChs40LzQrklNH6P83H9Fo7|&4hJZQ4>kpL z2x9C-3nTZuq?DXbLFe&J+vXsL&%4rAbi#{rQXRm&Xm5MVpu4)+AUQ+yhq}~GLEi3SY>&4{ zoHqEF3_6-=&mFMM$52nnN|h;`aby{EfVSmVQTv{+DXRnd=&)T9y^TF9 zVw7h0o8yFiT->1xv)hsT^pV7cnpW^OOreLe5QnZj`<;C%C&ZVWHj31iA|jqXQpe6p zdHjJ&j&Y!Lq$+@UA!Y&gDwjU`p{YbiKAS*zCxP@?0wqHt>}(?4okS*v^c|ce zx%p*k+azLJc)kc5|FcA?JIVWkHcTbOV!<^#Fg9Hi#JxeS4iFdd{3LB^Z9-MD-Px2N zew$;_D~Pnrx=YyhCQaK-Vs^G;(F;RzDs_)222pH1=WyrFQTm;1s!MYXG`Wb|BwMJX zxK_%#StutN%Zdwm)wb#@1cxO+dS!>XQ?jz6$rv|p(izqsrqV6iz?-XDJk}>AJ;4t*p@(KTE^M1u}MbNS|kO5vNI%T9Tu04Wed4M=9CMs6L^|(mv%`Xps#dw0CRZ zN>Q}MDDHdKWR%IbtTxG$)@~D1FkR~p7b0gdbtR8O4+EUnmj^lcn@6lUWFJ0f`#&%8 z;mvvEl^(hFARl2)18`nwJvNd#6`VN>l9sp}4|2A^j_f>{&7>|S=k3_9tVBMZz+Aq zI9Gk5)g!Vb_!rzHOAfuDWvDrI**&)u+S#?w zW!B_{t@*s>bxm7`W!c5z<_NR>no*-#zaY|6FRB4$%j$J9nuIcyWR_WX%xSKeLf23A zM4nrU)s3pM8PsHV>j-^m=58x2+wclL`#gbO>B1U`w5I7<&Ebip+Nj#09h(+6ljJ8a zTHzTcF$}@6eureiZF~V8B)P2uUp_%>L-%d4M{NINWh;I9X%=%&o<$E36T1)05i|Eb zwCH`5(c3uP`-Hg~R(W;slkIn4BO7|t@nVL8H+IU?|K*^&1_z@WFIiCX<7X|$&E+Q!fVF4z1kfySl%MoI7&{H9RDqLv zO%t96V<9Y4{+W|jmd}pj1D`1dPF)WiPi&eh$b6=i`8==bnfbx9*vzT>O)n~6J}+u| zmeVx;GG`()XxhfRfESHlxjBYwb4T*Cgfk0J zg+;>VIrz0%j%)J*hw~?|%}UKIlC!>6n|Z}_ZAq(n!JT!{`y8GL>pa)BS-zQhLe>R| zyUWBg%LnqWeOQ;Bo0k=5R)fy1nBQGFc)4PCZdK;)>OF~(7C>$`!k zpee72F3d6HU*B0q?PZ~NZ(rYuKH5pP+Do_E3xx07$lAFL-+h#|oB3)d$7=81tG#iY z{o7Xi3$ONL5eL|;1OKDXAy4*k+9FVbYo~~*r8QroIef-s%u?5%DXFb32?jW9Ayfqk zi5*mmx2&|O7Re`C|^I($@VB7MaG=ED#X+DHE9Z!EBdc{!wESCs{J4f z+F7Se3CQ`p?4RD1i0!K@(Rew2|J0cCb6MqylNIO2-r?+P)0!%*J;r=&Fj=1{vr>)N z3YRLMX$siiJ0O*=m~9OuWm1TduAJ|PVUoK>CR4Te^rnEhu-lo;rTz>#|HB{3mO`v- z&DwaWdE+&5x!R3q)vhme-9J}tzI^Pze@Kq3+nQ^S{&0zVz4ra^w zsSVG$U=rb@ZtD;V1)?0qI}GYJH_Bg!6Tsyo5>C4)5lM#!Y(MKd`~R_bpFvHvi~8^b zp@bfK?}+r?q&F#21hG(5Kzav31OW*NC3Fx(K#(p~iu6wCozRpbO79>DDj>?cKz+7P z+h?DB_L(#D&i_j?lMk89tgPfN*M0qvv}~-HbY)*x(5zHV5cYuErKCV4@h#FKN_tAl zJVO3kx6#AeJa(-xRW?*Bp^;s8JK>NN8y11wapqvi?!f(5guaOe@BEU3G;%InNSVEz zU{bKBMJ*_#lp`IoxC5u2%xfDkvh4?@Q)|^EftdvtV{WF3@&#a%D>8+x+RF&^+Ga(` z3`LJl3XVN#{rk64AWRjdU;dP>)M zLe>}4&kJ4i%`|BHmpO#m!3k5hub4e`=5NwiK zr7cT%y(3yn1j>2kSV_wV_|Bb*Y~4;>O#Xy-yMxN+aymH$pQQ19&9@*X`eN5BAZZK0 zZv;L9U$kG`P7erXKYTb@EKQ%BB{~#Eu{JN*oJgMIvulmu=%ZO^*l5Rb58drCrtdE6 zzU@!sIYC;d;yG#U66xureD_lX#Bq=Xej&YAHVooBHbXg>j1=m+QTiy)CCs;d8vWLf z*KRN^dB;8~G$cjHt@N|JuhsaPcR&yX^n9T9-mP$=2dkOUC&f=iBhcx$*9-2Cdv6pj zyu>jmJbfReC{tom{zcCd7DpCq08nqfo=D3{Vgr@3p?WC(#;OM49+ZFiKGQQ zv-fPvLc)^TDW&a%ay7c&+|%Ns9v-ci3{kTegoCahzfHKTVBZ>&FwmQx#9nTHeE(JR zz;U!F2Ezbr1C;dXxJy8#w;Iy~E(&r<$Z=3_i0}h6+|AY%!NAaUOao~>T2g+%CZ(jT zj8uL_(c?|-m_cB=n&%a22BF3cMAo)jZH`0U-nZ*FE%H`@~P7?Va=@||SSHgk1~7Pv)PYqQh8gQ)Xg z7ml)ABxEdGRTF4hrJa8ncq?%>iUa>{OyIFjgSDQ@`6*Y(!cG?TBl4`ni9x5RK#mBG z!>&t5khu7Y=yRX*pdYhU$kh%wxSYB$@5tSa%c$t%(mG_<6(OT_Ex*p{654lDKrJ|I zGlzLAlY_v|3TH{NlPYFdgTNE1+{U*o9EaC$MVFV7kzw^Vri>q%~_%G_I-s*LKw%@LWeKDr{wnS{oTWz&LZKG$67@}zlv$-$6 zX|e>gMyKO^T;T!A>{LBVGax~E*tGU|F1_dZry96$Pq>ft_Ejme==^Zh$=CJiJ6iCBJdFb9N~aQE9+)#opVG@y#kTbT7RJwu zP(Hl)nE0wq-c@>IV9ARTgRie`)I(J~Pco;#Aj5q;jpuIRt*-_5TBToc9>wAay(qkO zeii><*rS9_HCXMU>-FU^?6htT8=Vuni6ue_s-T%d)A2aNCBCXMF~6c)Y(7tW=&q-E zm(Kb=&zkIIOOVc(aj*$EG+K+l9vf}de#?|#dN?C0-FNGpLZ4fLzPOD%W}}(YOp^Ya zWwFZMIewwqqiHyfWOaLEq}yD`n|E&$YU@Aty2TZqykYG(i@#Pm;O@sevqW57*BI&S zbXa@K!<t7C~^tGs8^y@@PFK0`OGr$_t#c%`J+u$+ zMm~99%tcn)eJ%KdOHRM#H_yXf@`1Y)`I`$6AJJ#R&gD;C+HrW}YF~C&+|m`yjs5hTYBrtxR1QI8(LG$zu&JBj;jeE$ z6D^&)F+03!jMe^095*_0V~nqUa`5e}snL_duZ!0^(W})Z8-DM7?+4F*{Zx%Z 
zD&RUQ^k%kFP0gl0hFD%vS`2^bc{6LV>!9|VIlo4)PD$eAsL{I4&EV=jQ(cBHkB)<> zEVX*u<|Amme3$8qE_AsS?>!!Rvm*KA;^6(kz4NU@C(CJvVZ(@L`q$ZtdIgYhF5NG)i8>-*iT~)-#)(=_2 zFZ$1Hl_@;jF(vsTOL210^KS3M7lsETfU@;axN$x*pXv6>+|fH`q4F-_Z*OtOkLS-T z%%Pec_Uli+`Y(y@XAz(5<(hugdT~Nq=!O^k zGEI<)*9`+?kSQb>P_~G3gKpXe3oHelDqG#bPBk|GXv-~6%WgIJo^8l|NU$qch>wPb z2QtJTf()2IJm?My@=|@sg$&I@h9d(4ZIRIv%E)eHoJQ#5#CtKw&^?aOgznJP)li%J z)Ja@n)fJ&>wqd!*Fr7tYR(Dv2!AV#?B)n`Qv~VJ<@<6wwHnTh2(Rag zXzLDcJP7Z~<40*k^mRvEw~gqAM8;%B3}{4-Ljs3rA}4FOMr|YI^UkO*wCWR3l##2- zcj}8V@Ra(RF^tH_i`;`qyn#fECt?HCmpWS238;|(fhB=H2Y~u&QFKKIFw|9dexyjpsOs<6C<~ zs~H1GUvz;d7)Kg0m)9aL^~9Ujvg`IdW~F&7G4@DF^D*-w!C-o<$%O@c&EMs=;i5C{XCWzVYi8jp>ce}&}{U?b0hh9BO4-OyE2*v!MzGf4`LIZ=m z6Q3HBN$K&!jIgZi0df zPhYP-F6B;{VSU<6d#~~WadnRp7!Ol4KN^s}n(R^#G%0O%sh#>MqZiz}^Hb>CQx-s& zKs(1WSL&1|qS!li?E=lD-Lr3Mv6LY2E^V5scLEtp+TI1?ceO-c@|Bh*p9v$Mt#hBG z&veC|TtHCdKQ``(wMj}p@_PJkQt6vrIu}hk*gnG>^^APo;ZO?1xSoMonx4Cs!D#%$ z`I6WNF&1P98D>!1XV6Y%ib_Wa;%8yqagQ^ZSMW#IPbdLfT?8X_1vH^Y8X#m=!0j0_IVG(^1|@*1t)SNFGWYw ze%13sTe?RurUG)R7$8m0UDrY|+{NB4YJ*=?z}__+E?Rba6JonYGQ;*Vk8sY}MY(mCE8 zwx8BZ)>m|j(@PADO+|M~aD56Qq)RdB1iyxrlJvQ!F4+?hWKoKN_i)Q-r)>q)%CzZh zhzrYDW$YbZm0>wh@AJIm_Azay&^XtZO;r!$;VqY}xWb0MA|R8qz2Gg{S3Ysi7&E-w zxS*Wqb@@f!YjoHZN`*O}v)woBE4ZKfOBPm`uM;gzLVn)ID1TLe|+yj2!j zRaOqC%9dSWl|x^Z(*~eyRl5PWmR9vWhiX>`J8wNJa+ylwbz(;rmG|s52(6kRhnkS^ zn$W_U@V=VJjhbkJ+F0J&IIY^p4z*9hYm*9Vky>C!^h-aJJb?Ow^_phic*Q&29tb0LFU%OHF(xJX_qdqshzK*w{;Y)poOg)OXp-v0PXg1`8 zH}ufgJC8Pu`qYhpFQC8F(Z6IGlWBa#i#4%PH`RyP*@xrU0=Ys^x7J7PiQ$}8<}!hI z5jV56E>=ZFtT~mo`GeLIwpQ~iN7dcICv;zMN}6|@hdJH+vzzc3X!9&ua#>q;rs<4~ zo3qQZHw%^JdRw*#=t#5~u!~SnQK;5A6z--3zGG{Ya4T^{%TE2hTw%=)VN_OGHf<3d z>n1(7b}LwxL5hJ+g0I!>RSSJX>v`?=FjQOSTw9ylWqxhCxWJ|_eeF5BtqO!KJU3gf zGPKLdws*R<6I6FF61HzWZNI49893Z=^r@qxtX-hLti7x=Z?4_!X2->ZPArB_58sYy zw@%e7T{6C1X~LaWfS^^@l}y_0>`1M@N$Zxv=)%|GwAtle)Zxw7WyE*a?CPdfD~#xEVW4Xz>`UnHD);UG^0+VM*l2)(ftzrk{S^i7*Aw}aR!FHc86#fhjRMSGi7M# z))3{)5VZ~(`k{JjnT3C5Sa^$??iTy!Y^HbGLu^hXn5okAjKkoqQM?beBqC!XPGd|1 zI0B5U5|N{LcSdCxnZ&-1X!@OuU1K!4*f=I(GAcPRa&=(jI?*cw6D6G&EQY|k+5c0v z$aUu^o#+1lt8AhFqsrFc*39W!J zo^}a*j(UGs{Qk?}`>)&YPl$Ig1b4ufcd%V{aHD~pp&f#u9isO;BqY0}g1h9GcPU+V zsiSvkOLpmpb{XIAGLw8@75u<{`NKJv59gylaF=}G9s0ok{(~UN9+(*lVm-vd1J3n< z_<#`>2pbEH83H^*aB*=V5JCb1ViG7R2{9Rzgp!<$mWq;^nwFl1nu(s4k%@(si9wke z_Z%BD_c?ZcZcaWP9ua;X31Ir ze_dBo@Jh7t6&*bTqyM1r)&EoYx_&of;I62VXI_A(^N^?Xt@}CdUa;HVIYZuZK@aZU z@qK>BuW;B;CG4T+h`+{vjeG^`|B8G8!WY1MC56M^hpPj^*RQlm~DJ>hFC8}S|a z3dwa#$nAWW3!BaLPtI!z%fDBb|7`BRK)#|9Uc%d6K2NL&UaE-etjzyd8JbcR_M)nH zsXB43CLz5xy07-da$QPE{m7^Kxa@|wp~kY!#+34=(bcB^E#&K0;j44J0YJVsU-tmW z7a)9XPxb-G*T7WUN#2$PDPAbp{*d(~GIqxAgm!w%8S=%8|Fs=Fn+A$vh>ph;G0`i=tcv~oHdvH6 zebEyaS*?(YYrCh|nE4OnD+=@N0U_Qj-#1G*eCgY&U%0}RX(kq;11z>Wg_EyuS0ypo z9)0@oKGv>o5rQ4nN}MDmc3-I)RM+n>ylOy1rd_nh{72-AYoaV_CIPn|K)y0TU%Q?w z{EB?R0p#oUpOCLroP3j+vIO&L#sifA!cC?@PXPIv`7`om6XH??R^%U@E{O?&3tMIy z^b_29aPl|E7wX?az79{3FLr3W3QAUNA`drHk@^0y=`0;9RC`i@=*e{I*s)UO;8yXi zb`=!fJscLl9lhJigjQbrROvJtw$!zmN-S8)su-zGtU3%DrFx18xFn&Y6j+NCG^gC^iRCLEhvjZJp1ISnO59BMGCW6lg z6K*M!5`wuTOsXjSZy;a0O1d&XkuRRVM!r1%fqXd=zi2l>c%lL1E9`HOFHhGbl_UN1 zOiiew_8Ib}OmWca5++@AL%rUxol%$N#E6;)j)qH2Y`J6gP%`6G|Au@S?F9Mxy}prX z94uh+H^^5mRm4A#udN@**W_==7p8C!BQyZdUFzEE_DZx)jh?2{1f$xz5;y!{lReuO zOz(cL4BZ|L@AJ2L{2U0!p_4ks-^n8U(Ha&dZ;c(2Vhm#JVV)1 zKIWe9{BxPVM83p+MZVG*&ycTPV&PNdOZcylulqldFY%wq*SLntPvlGH4EYM8D*ZF^ zl^onKHLiIuJK~Yle93Rq>>tP%8bH1#{)Bu*1IU+c8;#W9{VTEwyBJ+($d}?7@@0V= zv_~`*1hM3RvUdq`#YkTo#hG$dw(X#N2xc(_kS})t|M@HA3;p-VR|w`RIhM?ykS`51 z&Q~<1#{UHQ+9mY;iF^(HCGzEtW{YW64n~Sv{M*P^)$ft7kUt?`uD?gV6n{a!x^R7W 
z@P>Ymd~y8&`Feedd~xj%{sr>Y{X68#;;)b|e&`wUrT7#1QaMGwgk8bDr^uHmF0?#S zD1q{;D_Hn9RRZ^zA61oz+lmyj>6(o4EmNca{1gnZpUL%v$DRA*gjXcJkLTz(*5 zBRGBOB+j8w*&Fxc;rLnQjyR}`_WItB#d#-HIA#7L$k*M!K)z&Junb)PuaGY=5vDTE z3X2>FlZYV<)8Cbt?g6nRCCgJ+VjMQQuWF=uXto(mQkn0_*Dm4z2>A-ZRK7uKW_;fl zjdRgLDoF^(U5Gc5MeGOiwS&F7M|2~EJLu#=o@F4()7e}f9%^^vT$U$ze}R1MV3(?U zHkoUzFM%xe0{{;EM|F4j*9qfUgAIKNKE3d_W1NrJWMZPR&Gmr3d(#&!^VdcUA@^yqu>`8lz z3;M1!7K@e0EX4ZoB40_VXP%HS5+(!G7=OCrjagO}(ormq0h zrO%1)ratuS!cwwAkEIZx&XBK{KasB^Y?(hGUt(v-7e5q&bqU%utE^MgF}a&ezC*l$?(t+L&Zw=8jKR5aju?n`#)A6!|Ac%w z3E>{Ck$o+_p^oZ9$@Vun_ET8&cN6x5UiQJ zhJ0ND+ul4wzM%gO@@0p{xl9M~{@9>*7p(nnAYa;lK)#IsXUJC=rpQ$0W_reEOcv-R z-X;r+%)g9$@%TEZ#1d4lI`R?|R`B?Jy6a`y8 z(m0Q(_;86&bK3lGkuM&ZM-SX}vtGT4{afSs#I#~=qx=|DW)$^1~*^1%&{;=MRM)&_;oGcYgcbt)#VVu!+KP8{hN9$Q*Va* zA+PET$P)C5brn5smf()l?y=Fl`pVFeTG`UjPhP1woqNap?~yM`x~bK!v?maI8$y5N z-y&ajD$MXBR)@=Bk*PWHM%MY_D86yTFUS{C@E7EZeZlfl&%`gtR~n;d6UD!We5w7r z$kz_za^k{T<6I}rxdww=-pyeP$(srVp&VK9%~r4d?!RTXVIx07zC=V=|A2h?{f2zm zogrVwKaj6)DhJ3yexHy&3%=>0{e>s~4^6IRnW*8+<@)=76HkraVq7Zm*P0HT_WT>< zi~Enr7y6$gUrvPXX0w=$)uD{n0pyFOjfP_s;>2=_d<8RZvJk_^EU3-iVR7nC;fuaPe#=5DAk*LyFdJaPxy6#5V3 zO9()|YysqJ7(l*Qe}{YlAL$hN+R4DS0FW=ezd*i-{u%PM{T=!8{0H)7`vdv1MKyJv zB43LH-;poH@5mQ~`9nM0E8UFo?~$+cvL*!~=S)T`vVRfzY9kJq8h$CEcktLBXno99n0lVFnE`EaXCYfE#KI%P76JBikABMF+>GVhIrV%7Qs`gb{adanasM5o-kvAYag6#eMxh~k!JNQLM{1nFZ8usrkGD2Az!qRkFNQb3k~Z4$d?2NhQ|N`y@PhO(L-T` zvOkcos{rzq*tUYi1kr&oKn>CAt*nLkilSkDLKxH1_f&g(y`@Dt<^;JV8h@`^$aRG=m`E<)BVSl; zPyzt?3c3CR`NDeb10Y|p-yvT~VD->2^BAlr9-Y`&I&{5tzeB#H0&odsKwjPQh#$xo zEE^mIfqh55MB;d3Pm!-(Y0hJu5&-#P!Qcs$F8dAnnt(a_{6xO;>dufa-SSi93zSz6 zAYZz9ZHf*6^5uo#tpMG`f>p%n<`oXQLqlHD!-RGex>I2D76WKY*}EBRaCEQ>7t(tI z%VQo-*(`X>w`bfEO>rdv!xo;QEf>if>yn3k0dT&!esR9af;}Qg$;_k~y@M&_LTvJe z2RY81FVGc={2;*jq5+(*2_@Q$Q|F5_2#b&l#QYIW@(i3A@aC^c>I+5 zqB^C%@>zvIq%;8ag(K~yuikhCmoZWCJM{&@HZi=~EH$bNP+!L5()QPGG_uH_QeSZ4 ze^6g|!s#4W#Yfd5PpPjvk?a<7uP~B-P+w+S1lO*;;->vced!0_wgA)@Yw5;y-E(B$ zsV{cQE{9XHNDifeCDRL^o35Mjy%gpIBHp_j*Tk9gg%~~|p11`#Uz6uZPo1wX#y2)$xG)$Vboe%O`Zn@yb1Dok z9p+{bfCsJOOoWjZ!g2LLge+Gmyb*Y|N>DEZc1(+1-VF9CETagH=LL$1EHy5Di7RBB z>N^>g1)GTkxx06OW9rC;H3`LHE5|UtD%WC|l@%;ZAI;o?OfdxTazrwE zZxUSr=c}T7JiO=LRc43Ugw)`Pz&hZclV+zjFHTBXY=(SzicItR7~p(C8r=0T-bYBU zkKtJ5)y4jBz6R3DVQ0=4Xi(*Cb6m}-^A$4*DxEi;0Hrg6&~2!&0l~6sIH%5+)Exp( z^^=A88caQ-_^0zYv!x-tpk#wdV@LxwXi8x49huTx@|O!vI;2chx-W?uAhx4IG$1e+ zhJxLlDp~A^K{%<%xN6(tbvkpofjMbdzpmok1mZFy4UE`Oi=Y&6zF;&> zQ?$CbXn#6iOi_(?2ZjnP7?r;{Uo?T|0O!kVJ_c~U5CJYy;3@`8ehC=S1n4s(nrHit z6U1x?aK2_&++$3&gG3tk0Ot$#!}%J;PyiWmT%A0re_Bu6t}n+)st1`Jy7=AsidBla z12|udorb5*7sL!J-6Uqf7ULOCzYMNR!jN`UETJx;HAko}CsMd2+mPofTVjyJEI1=9 zAj28#s~Et-62!fF&6EaFtht`!Z0`>VWKqPHY2khZMZn8LFWSXdaq)4wzUup6CAJ0Q zy$xe7#c5nLLAfsyK^EvdtjS{MiXJb!dl}3>Hp0%sh+H_MzGxZ}h#DLzrafd&sV@)f zm3zQ3T^3R=XRWe}vG}v|WOL@R4FL6}2sAykE-H~&PJ1Xy1}8g_@+lMLHIr82Nn>Z@=uIV$4^^)>M;^@WvLbw+(@e5bww z^V9-%tInt|G(dgf3V)}*CIUQHm&~2E$r~)qFE`a*lyVJ{L!rQ(%OE9Za)%mEeo+uiWB_5kw z2(Ib~B%%SG>(fvG#-8cRcB-2#2pFwt*QB)s8RU%m>X}Pyss%Y4IB8%^=&reQG{jQ| z1K+1)f*tatQH59U$!ph6POUH4o3M#BPsLxTuUW+Q@@zIheTlfhyWRb5&!{ig(`@b! 
z>MIYRzC!4cKdCPf2NJ(vfzH4J0`MY^D@GSUeSsXjf(ul3Y~LCiQihlnqe`_f@z1C) zyPA4{`syHdBlA9^z8H4NLbgt+uYPl5-r>G&fcnyE05_geUo304XKkTfZd=oCF0V}M zC<6fMi|M-2;TiQcc}jiZ=qiBV0QFV)agbxp{oq0t2dR@(AV}Bl4x~X>V^Y;j#LWvF zDKH7ja}#lSw8BI*acR$Q*pB!uhB^w@Ehh5hUJW*d)Z^6KCG2)C@LBip*L&IDg72Uq z@i3Re=b=tUA~vU5eVw%+>_L;|qyP*P1|k5xQvw;q zvk#qT0<1QWhF)-s5AeG5Ga1}-2oO{X^rE32r{-2f=LF_+ACMP_G!cdY^&+d?Wq7x% z3TeP^0mZz)u9AbH!s~8(fn#)l+d(z8UuHJE3lAE=9qCw!aI$A_BSK?zDS#YWu9KBw zkkpLER|VVcRK>r5H;@MqAwQnPZghim?|+qWZ=u0auED_ygUFa+(6J=6%>>ZzP|dPo zHk0zmHhUS)@zSm0Dx3rw%z<;3!8Iayu_buE!_*J$Za!TJb5RtyGlzI~^aPf_e_Xs~ zgOo~=15rvf;6a;-q=JAci!KO)g-fhT3vHRYpaI5Vl3LMI1*lC`sUk!Do7kZQGARZa zvx*p43Or3pLS`BGAubdevN2?B7+?w}LeSC<3b4lpgJ^5gXE8%6=~86TJ*?WHi9CSz zl`~(Cf2Mu)-sxZlt+#++`DfZ!%2lCL?F%e6?Wi( z_-x%LPJVD-2wIliQ|*hzL&F0)m}$cUXkRRNvjFlHC*X7>c8Yw-s0Oq$e@DKYzDR7p zsk^_oO=LnM{;qDNr}I36WYglePE~2$FDZ!*gWmoQS(hc^)~2evzH48m$B`?}XUJD| zz4g{Nx#NQmMyJ}Bke$=#85Ik%qo)G7SC@yOLm*CFei!o`E&hYl(QtT$xH<~E5aN(h&D+om2 z45P&cHn?jSAH7GSvL8)t3SpCJ(AY3}jPTR8Q!A~Wj0P)|qj&J)wW`WOsSa4k zNZ6jHU*QChFC%Lo^(pW3&%8Td>+t4!j|Dypz)#KN7vCk-N-KFp>2Qc9;TG?vfv>df91Hb4}aVXrh5Kr6=l(j)pKDcQ`HN} z^v*9Q>|d)9IKj@J7Y)oSv`d&#lqq@nc^*K%zS;T+v|td(3Zby+9WCp=evlwYj9*7Z zwBA)#NAhch?G@K$ixo{1SF0YERT`uV^`XwKOr`a38WEkCK0e~HV2H-AV-X*z3K#M9PoowK8bQ`bE&Lb#LfTSc2X{l@C%kf;;Z z2sd+eh5i6_$jbA}3OR$%Nf@dOi>G;=STL;fOuaUTq>M2h_i}mF+9<$% zVaaPjODBiBMfv&4Bi=e~3gz@qRRG-AWJY?&k-@}C1mJzGo?xAP{d`j74D5?toxsqx zIDmimzEH$diP-Y%u`SraA%ORV4R~MSR8Vs&$@KJ(%2Euk^#;5+Idp1u^I9ni^c;j# z0!CH2m_8_m(yEY~=R98wpYU+>*@fcbFT=_*J8($B$5{CL0q}dL-WT+#=MHPIp5cRe#bAp%+7OBAQ|*@7@;fzSXK`FT-Wd6w>w5%+o2cU2KBDEx&%C zgELqQ@V>liMN>Z_a2{-Q5$TmGy3wf9Ykv2>HX2&BPQ5R&Lu}m#WV#26D9JEj+`gsw zIgpBDa1}pSq(*25jHFBwm{y@F}?@N~FJMtx$ zuc;I^^7v&%j)3?YXZI|f)$(>i#hPJ?@vsC!;MDtSjlUBYx>I=GYj-@hOKNMXjT&0-uRq-wLbD9WDR;i;NFYrncZzq;;pMUwmQ=zg*RQ9l8}sE`{vTPZ$~@_ z^cgqIRAj!iS~CU~c6t`Osk~MW_0|^U;_=)GVw?zQA^tW&a5BrZcxx9muFQr1kT%WI zSjw~s>$#8Ao4Io*1J^B`Xb|#9m?E2(Bfe=&CvQO+iui3h4cS!|N9#?L)v5(9vgRUz z>IS_sp+);fzM7TyB#BeXt8U%vkHu~-Fpuj=rMcY1)j15BW7Lpp7j3+o@VRnHpj@r} zvTpQOvl~SGZ{RqkNv-@aE;ZAlSXt-z9EH{X&dN*}1)g<|J7joKl)Mfd+<7BjUubFc zs&~v0$fI2ZSh&p%Ze6K-n`OgkN1;b2o*=*P-BccW*>=I1$f8VgMO*LI2hpy=haJQM zpLha@@nZKE4%Y@OqP5d4?C9h)d^cUKLvM>^-LkM^Ae*-7dj9g|{TGb-amfSy z%Bmmkz>phIpOf**&6R@J!|+!x*n8^mE_P(U*uGazFV{FIRC*E7z|BV=wZ#=(_F1aS z@!O)y2EXtzcyXd3%c zefx)rVCOx-J+d`Dg9o+|Odfeg0V@X0(yvUv%!O&p499$$Ow=C)oL}j+rcICGu4f-} zN!g?v{AJ#d)O{=_sh@D37R#}slXxZ}bKk2P`Bsqeeo!Tw0YX7`{;PmhCH?-pI-j|C z0s)<_72*%XC$YZsU$86s-JACCr#+hGT%K9H7v9fIBVNn$JMx=jPcHPA+f>Ic?`|4j z>-D+cZ}xUy=zQb;ew1930KxGbxx%VdHIwrF+1Z764~csP<*PXKEmsYik2be`AOnqB zo9yq7UcKzlZ4~^ro0jxpu{rW^D0-R^~fY;@w&WwqS-#IKwA> zdL&|LwvEQtltnGmJB8&~pRGyAU{qt?sp57)KY~#xKX$;T&|cIf2jxj_~nDQo;Gm>3u{@5-!K{OSX7j!cq9-&vBPr)rF-e zMFt7@KXr<3XK+Bsgsc~IhCxlKUHs40+3tqKmx7Yss7VbfAs)}Y&R3^@1LD@{l5Fi1 zeRD!Ux_UdQf1x3rwQ6j@5rxC!i#mVvjQVmlu9Z z5n>C-xh#qKK7Nk{@2d;W>576!!Hn@)PRMLW+w2_Bod^AGGP)s6+5t1;SvlQ{4W)~f zXn^{Hf8Zwb@3;(5U-BK79alLnt3_As224fF2_aGBdX~<>0UlJlJxo?C3&5;{d> zX*zRMd1Rj=TFZtQ;%C%Xgc9Nc2bibVl>Ue+RR7P#Jx^VtuLFSHXD0H z7eIZ*SW~51+g(Dahq1F|2Qc0Y(drGBsyMf9sjsw7D8Hk0&lf|H)zYyDd`f+J-~-f` z$@&@f8@lpxiE4?fcoOIaetZa9Jp-rDA)cur+u>ZgO|@+ zg#sqD*~xq{c(|;1D^IB}M~b#9b-4=K?4*fS{88or^(C<$tOK`j=*79@iL1ubqha4Q zGJ#|9Ag2zNqkx0q--t=1ckKW)>B_@Q%3als{)|D0)|ua zu3T@E?dT^)W;lUz^Z@D$OINl6q$sSUcXu>(Ie++IaqGa29#oLNu9jeJ|HR(@u^J&h ztV@K88F7oOkiU&O9X7CP ztr;=r?T7Y&=jCe5X-#KMFKg>)cE@WKwZE$-EJb`bOvmcBwWG)2N`wK2Sw_qi+n{+9 z-$5JC%~R?t6vnGIRaFV9WA}fq~7|r&YR~J17}`^m#EFn$Y3E zPlXD{yoHt|Y=GX^4s!H`yVx60nJ+c$D6z+`$s`jywzo)wEW7O6o=z3Ib#FW!;DB96 
zBXzl%y#LNh4ZiJMn*8YZ@~v##1?OQoUCwO>qY_eoWCdR?ao#oMP$gezTJo{H zn8qyQ3~g`okH3nOFFSm7MP=`_Z4}Mh%J!)0Z0&!USF^P} zCm;OrsxHUtXBIyCRjc!zm8F~aI9RclceyXhTog~TOjOA|9`7#q{@!js=otL3C74X5 zqgm6zqI1e3^3x%kTgz_TjPneJ7v=mZ#r(1U<5qo`#5hdm8guPXYM^& zaIOk9AUu~#>^`xaKhL=%ZN2%8`uyL2%s;$NN5NKuf+qOQFxe+H3?Vo?(_9LT_I4_X zChi`s{qfF|c zYytiCxxbwtLEjA-&1x{dyoNl+htU!>6y}}AsK;RpWfa@M21$L+GcoALfvW{%AgZMe!7kJcvWg|lDm-g_1(W?@5M9g=zY z_Upp0U3R7)b}xT)u@5-(MMD?&9}U~ue|NOf_}k;k7g-5;T zRIyybXOxC_FqFgu`9l9^FK^ZU7kOHDllGa!YvGc0*V;G#ecHS7c1b$)m%$e>pb9kZT zaPWG{LhZG;9n|!1B%aqutxK`(LB^l=Ycd}Vy3B7T|8u;>C^hQ{sf`VjCW;%}t*X`Z zN-xpb9PD|Yhhe34UGt+FTUA6hk46pON;j8tnY0c}IRP5FWPWigR7Q1dv@I67i*6w; z(8?LwN>4~>v1V>yZziW&)eRF1`+ddU%11OzGVXD;+8azufO56p+wV7qoRY8Q7rAq5 z83pg8xW+mt72S1m;br-IueY?W?r|HRWm?wIkYvO@&H}ar4?3TEzq3&>Q%g1TR=B_A zS44O5+ueut8d`0`ay5o&t>RKVNk8P0YyVU0b-vY@{9ja-(jni^aahYjJ6ZQfMIy7{ zpcSJN{V^S`=4RMpQ&nK8np*7F@Oy-4t`s@7K0TTJuY*p`f0+_q84fExeVL%tU-j|x z%z+ngL$H!;@*h8=uesR&O7E$itB?A&xxilwueEe3e2MZpkUe18vb1V;OW;2&w+!8| z`h=do`xhT?v@n)(_9fGD*x$hwz16HJahB!YUY+%@2^TFE|MyF zDt8_Se2Q6=<@+zPw!`y6wOalua+u>o-({>s9(537`h8$%C9cCQ=vrf+TD4G zLQ#9YUic#L-xX9fV`s1D)s|Lz^X^U-3hALuJ}k=Or2j5$5~@Hec84!hi43YNKr!^> zGxXe@@{AkmGSFzqPzeU}{w57K=v|CkkGpkVnjc}{$G_h@e|;(;$nwYrMTjlQOjL0L z^X^#XB@XjQAcm#%zBQ&^-qhn5y^Tx~e8Y=7eI>}n!YNr320@Rg+X||sGBGU?5q3y+ z$~DaxEAHKb66^faKY{{|?8<}j6&D|s6~=p%${o|WM$zMRyNfbd4$MKFS~|ANrA-(K zc{Su!40iHLg(}U4$p(YjBgm_*mSXQXp{;A*mk6$(YWAACTidM^6A==^QrI zhLBxe{(2amk_#27xa!8(EB~AhSz);5ZzzxcqA*S`ebS@2D<}eEk#k)!9Rk`5dZjB2 zg&y^$mt$U04AeTX;V+hE-bQmz0uq+Jsy#i5l=FD_X5Ono;kJDIhuI-Qca(%bRty)d zZAZoKLlGXt-sq27CBjiDPW-fia`3r{=%C)clg@h{oxT1YnklaAxvuV4!wut{1T)FJ zxae!cQECk?+s>BQT?Xx)(1-pYj9@XLs*}aP104_tKi0jo?IOn7aVdbwo52~oy(ZG# zbIA?t{~+IorL2sI#B6t>rSAFFQM@A7?>#T`C>8a{a(z;_-XTSvH7P>)hf0XMTz*Ru zZU`r^c3y8T(=jQ6XSzry6^JQ;RCc3G!<_C9L#{r78p!k;c|j-UAx=-$nu#SkM?gqZ z2ZP0pW8jOAwGiCWSHxx~S4F9fNuO;fd{5%rUyG zB2-{?ICX5Y@uFMjPH4R3dDiVUTfaU7X0@c=P4VOtaZi$s#C!0I#f=xY-^k#%GM3Ap z#B?(3gv#cx5{_mV2FZ}O;nD$v{1GFXxEB(GBy`H=97T_{3d5{V9?L1eezQ}vpMfc6 z$rC8CIMHKAnpm@4%q=OX!HT+=rXdAs70ZAGYG%_V&Yu~r*59FwTSGmi`I z#^<)ED|ackW;p>@7%)#h{B0nw$@Cz})WxBc5WuvVFd>X9NV)D7xqK&hxa`qRxsH)u z^chk+y+0&x;^57BMJ!c03*Is!sIt%&n2qiK*X)6 z%d=%KC9g{;+YBz25qQgW((k{nCJeM!6>x!ktCMwLr2!GW1Fju&nN_3J;IVP%s(pO; zcvS91+TtWU`cc@vyLhzFr8p00LDGvF7KucDi3CaaU8Sw8IjZ!dP489a^G#+>B9_Mj zi#)ySZFIY&A~gGrwAmXskUW%XDO1O!ywM@PCaFRqSa9GZT*lu`RoAmsrerngx=5{? 
zrpu1Kxl9AF{!Yw#Jq zQn0wO6;4+bj!0pO+Vvu2R_Sb3tyaI%X?!)XAN8itgD`5xxQDCE+aB7iTJFhIlb0Q3 z&yv#55&RQAXW^2$6Kc7{x}8U2@R|RwgCtB8XE$A0+TO{MG`!dn+kBOoI4w5Z6BQ{` z)p16D@yqFQ9qXd2>J}2#zW76F!1Az`d)algOcHg+$vUX~m`(w6wW-oq-# zWL_0CQ)NoNu4$JXHmN2U?P_#SXm&>_`MY>d0z|&(rS>;jkg=U4ujU@eVoo*FS6z7b zc@$&(&o>EIb*cb#B>B?eUfcQrFDW!Pk657M zII+7^Sjl$0nvvzLDNk=yr{FKx{qX$EMV@np@AZF@cw#kM6SzeTNdBGaiA{}bA=uPtmhy+ov$!t|8y za|-*?{YRul&In$<%PaeUz~GDc+lg?B6_NEt>eODE@k;lp6T4Mo{9icoQ;p_WnpWZz z)avlPb8`J|)C1n$=W$Sm%xrCuS5vQUKA7yWSTWEcuD-Wf_1rP3br`DlH9@gC?-@o-@g;)PTV);Z^fP*9VV{4_-eX&G*ZC zTDXMmfAC>e^@ZkoPZ-c~e0(6Q>X$tGSfR}C^Ggm1Mjh?fHj<_na@Hen)yJO@CAYVE z`$V=^zcB1Ln>&i{9((&dRt@_2F(|x##4%JUvs$H*E2N_=c>JF2C)JSuK89SorZ-^a z+q$I@T2VeXxLomHh^$(;YCw*Xzp_5|hD*nwlJwYgZJ0)5l($;6u%X`l&frcG1J3eq z)X&Qr33nbh#=TREcY1hry2tEeW5S49;!I=WH?^dV#-yKW$^SJbpR1)nn^Kt1)l)f| zQU%nXi#0u$RZmlGO4CtKH*88bSATJ<>BU|3jC)NP-s&$OH@yr|&-}kQStIJXGflbQ z)blo)@_wqn{;%ovxq3ddIiDF_z|mYFfG!kkE|f(VsWunspoUN*mZjjkwbu6T#8{McOi30>9OTs4AzJJbC38@hU7wq7V&eMK6q#4sIPID0~GgT+Jgqu-v7Mv!D>5tguc#;?6H>f)5j@# z)9;?nlts@Sgulws(j4vi@=i-V=IP_UHXWkY;#bJG3CQd{t&3LItNb<{&a<&^O`6Al zrK+%OoO&d^e@z>w@))w9&$;eUa*$W%#mLRS)dK^53}`hx{Nn48$!WR&w`W*SYqhAi z%}+DU`dR--&(=XJd&AkIc`8;Qcyr~M=b^S|g!ae;2rXxpGms1j@h0l>2Lz43Bg@b0@tUz(1)uI%Q2!YKciZTOF$;4CZ@P{NNLcjjr5z)j z$a#geCf#PPpv@vOGWqThCt*!v;TvWCAd_wzwW}Q6>`{`(vTH>j%t() zsVV6wea>s_Lp_5rz9N@dYVGuIA=_mqQp03pX+NiW$)2i#0G@C~p zl#jljnx~Y$J79ZLkQvHHSFLKo{v`R1`&Z&7jB^Oqi& zN40h#dtUBTzi$ZYMZ;;IfVPBD|d7h<6kpRpA5~8FC$a@PpN;u%{H{iG0U)xBT+GG!pYQZb}3{CUogBQ z$4G)c&TV=$>}^?9AvkX`|JX@Te3K$lT!;CRSjOvd0S-jO6B#?GRfzyn<++>N%}Fmm7;<%Oza%)^yo^j&eY8M-i2mF2<) zkIyJ$?gzq^40p-}<4*BI6;bILn}Qpsuc+fB(v<2tgNJdyG8f_oS)|#!D-m(eHU-~Y zUEUn)u#ooKzL#sb-sjHmOUELHybBvIg_{fWzL>mjs>L>CeZ&66jT|XwM0&K52rSQ5 zbgTCCn@5`yxg_fw*k9eHeZmGXjg@cTTr@)35yr zT7O>g_q#LmE$49q_PccQE_}B9wI|yyN1Mfb!I&Q+Y1i5jy{xxPYP(rq-Tue3N1_qP z+3R4ugBq$3&|lc!ukDm8wd^umb$d8-7yQufH~_eInjzcf~Hqpq9& z;{~YR!xx{YY&65Tlh{WTPn#B|v;P@)CsK4CbB)IwZ6$Ps6{H9ypW7b2`Qtj%Z|lYw z=Kp(KkG%~x`Fcg&So_g!K4x^IiXh!cmF>LT{o#xs5r1qJ{RDQa7Eb#OrNU5*>u&z1 zmW?6?!m-vy>?J443?5Qry^=M1Fr7iHfX&2h9p^EU?%)#7OA9?wm6-SZBs`O!P4w4> zQbxr?9IdNdq)JCkgw`OQxo#878Uc?~H(K#gpIl-JoYfOJOTmOVj;#eiB=mfppN5UT zDDeL;Ra~oBO8SZA5(CU&N1m3;3qdQhc+96Zg;J(mJ;u zff(cG>}MF3u)#unR$AdjlA1BIsmEb92x-`dWqLDOA?e@Xbk6yYTom}$`>IG0Oy$_7 z8(AwAFO$aAdRp5fxy|Gc*|K9@fkoaSg^Pf>T*e6gO35{)f|d`4PgokCdk-$L9Q#Ca z3uP(;Kb8>3DUsa4^3Xe!PF4>aL$N|sqDP;uz=@G4_GIaU*`S`6hj^FfBZ)W}kpo1s zI0W8rYvz}S%-uT8hT9g)2=k|2$(p6%8`pm>dX?HtYrj|CvGk>zefJu7EBX^&Kl{c^`)tp4F5a6pf7qrMp~>*^cY%xZd;RO3_UTbh!?T z2ZIY^aZ(ud1oo_a?20b7@a))XbKhEfZO=(~fA*eS#Ce?Yx?1#e3z)TmDz`$<4G=Dc ziczwF$oMQp#KOnXr7&r|AgWk~`BdCp`bHgta)RQkw#h6d3gwOcE=zQ~MYxIy2zmy0 zewlrh`YXmo`A&HO($B`A99?48$xuk6&)hn26FsEA3iYW-n!8D(YyS!60RTq2{ETArd8_Bu;XEDsSW z$LhrVbfAkj<<0xB)fcE2+g($bwKtz2H@Rc?uHd=uI3izjc1y7HieDddeDLYUQiUtmsa>$ zL&eBe4U{YI94!zw8-ubwYSFo|1m`e)z<)4WaDJnNPVvd^Is?nPeC1B_qRlsgZA*&Y z;1<7yCYZOM{z^`DD53-2`>dv_itk(D=lFax$vA)5_WTO7u~COm#q+!Rfo#$oSzUS* zwX{IZLa4n(IQQHbv*jd;p`BzX`h*xMCw@M`W>8)r!$N@H8pJTFrI#WKEn>(|IBQf{ z-|?Y(5SFR0L`sc=OaIhIH}xj_DeM+r9Q!6DwzeY}0Aw?S}v@rt#N-JG}y^83WD3(Byxc;OyTq#siizp4s1cH`HgE zu+O>W&02+ev&66Q7mk%nPI_;|OB?I9)UsnnJXblFRDnw}0D2o5-{?rQdt{ z{qz-ds@`!#pNU%hfVOk)%EJmbp>l!ql1&LZr9&A2Q-5Jjw z9?}xg<(9AQrV$WK@)%`|8kg(7qyU26n(X8VYV1P86#jgUXB3Ek4W6KXK#vYsVhE!! 
zS)7a|c^C=vLv(J)_XOq#!u>8CWa>@Od<*(^epVCwvxPlO@b2>j(DAxR@J5SItU9YB zUlmMGRG~Kd-ew2cdLWpYKQ)Hu1DG(?ILF`k^Qn8E5W_`)_s5tzv!`|V0sf=f#sYIl z$Yc>b^5PtHUY6-)j4*#g0e=JbD4_6nhT!ydFTU2{tuE}Yl-G(!^=}cGtaU-Fb(ud3 z7GTvH>l*Cc=k(Ni@CWFw<6;cu;xgbH-KfAad6(V!&k3Hn5QU^8Gb{W=6qmxQlQUOc z`Hu48+gR>|A%S2)L>Dt8L=N_l4E-et`!bTV7wSQWWj!oF>MwZ^1d;i2=lTT#1pE&I z*=oJuzAlYG2L*94y&d2!c1R{{u4FosnJaph70Ap&g94Z<5yzbb2RE*`0=TFY;uJyy zO@-6T>taCY(k{5IF3+Q%s53Yi;r!G{1T_& ze_QOWXfv%<3{xDb$kZm>N6hQgwF0ZPs6u0!XkA*DC7rq+b_Z`hIzd7QiY|_f#KT$X z@i_`V_r>LfE_UnR3>CCZ&Dp~^8Z3Pi4nVgHE{|wxI4UW$T!z=ZR zr7A_NSc2}@oY4-;>O*2x5M*e;^LxETX36k3W0ribi~Sa*!!i9gi@3;=N|q^zGzjXT zLp^{|)*}pK678!=fXTmrlR{zHw%{SXdwb->XF_J(-s3sN^N9|!3R#NJ($U)$tf4QC zznOwCFk(F3XDP<+t0kk*WRE?AC2j;gp+JCPLI6Hdm0?*k{zhPV>7Cn{0HFZ7PSTb< zP%y8)f85DjLp%z`^F09s!laRe2V#5J@DnU|A;vobU@%R)Zc;#>;7gM_zU-i=Vozq< z**=;}l8m2FOHPA(;%S>vQ*M4t%|teg0zcgn`%^4;oFH97KC%XPU{j9l5!Lvph@8ex znF!L&Mqz}EM1pk%+OgaXc*r-F+-Wjf9F{!;W3`T_eS)#toY4MNPj75B9gLTMie)!b zR4x%jniPngU^y~K?CbeZ0+wgkird7GFL>$Io*zvS;K|FXd&!#J%u?(;7O8H|dc{g< zi-2ctt>Ns-hu7dR{DMeHR;~AV`G2ANXGQS#117)peOLL8W|ARu*|(cNB*u(+kBG*q zgUh;`FSF87%S(|*q%|~_(Z`+tlt`!Ok9nf|AD}2OW}dCWWSrZ;zfNLMA@Jo*FgIhQ zR0-2v%(cg4v(OGci6X$|iqN2cKm_Q*@?c@Pdn*}5GJ&O952i0|Pk6i)AFl)v_fd;IcX92y^6%h+I zk=r+Xsy|Ck18V+4wvC2Dd(c}VMM+UEc8jLYOjJBDSBvZ^+w-U87R)yo`N=FA1yH&I zKJ5ow%XoeT`Rj_IhN?fA_)<^cY+fS6H(&o2J`_ZB^jNRdq6(DI34*H)t**^EhRpXE%YQ_6N^xOmEikZHW(e^3 zP>5$3J>qO-C(wrBV=utn$l zuV3kU$iH1`ZLDgo#=-Gx>4JD-$*Ru6xnUX#kf=dch5tmC&A7!UIargl?F+VwA=iF@ z1sKxhz7|}mziEcPca@q*UiXk14HXdIl46Ch zvPQ}7tZ_cS0v8{mA6C88vEoH5B;bXG=dW7a=15L#f=t^VhjvOC+&LGDXnpa>Ka@r-CV#X1sJu z4kfWKRsOJ~Tk&))Avn{;)&*ni>jc~6k@4etm1Nv|GVF;GccR)8dZAd_u-DqI4((vp zRgkO9XP%voL}XLkMu2A6dm&{Q}yd!PHJ&79&rj2}xh!zcmZP_yXhqDXx zPW$aS>;`~5Yyv)#p!OiEJ z>o^O3{y^75Vdw9VYsmLGU*+^zWiBL#$XC$0;;-~%DpyEfRTB0r!auE3FS{k8Q9-=* z=@rVLxj5O-`4N_W^i^(IRz>ApH20VlZulXsBUdvC<4cL!vlXxj?Pwi#MlR**HCBE) zxn6@_<4;SY4a3afwPkz@&c9;+@y@a09?`A`^5?7azC`i6{zB|ev8$L_p}AkarK%u; zc>*eh(51@79X!K>>PbuH+u|0kAoMgTq;E-Z+b!WwD3V)r*a1?uEuKAglYU#wc@y&{ zjiJu=bNd-!@s|6e-<^*%y~=z6*=WHC&n8F>j;Vu^$MJ);8_O8*fgVEr0&l~e|F|)} zMfK@l`Kf$}Oo{1zoZ<$VK2hzR`5%^}X1HX!X1#xP7l{65qU?qsVlqDxTFiG=XQU_O zw8r>jn?Z0tI|1wX&5P&dH&dabw*o-$$NxE5n!1u{pued%YPVJvPpWa%d?KT3T}zg$|$z`v^`2ViHEth~4`m@VQhvNvz9pIvYA z@`HD-y&=4P;dXt|Ieqc-eaXE2X?FcDbNaLA`(N{ZF0%Vvmh-uC{_{KDfqJ`vk2wPu zHQ|dAM5o>3g};_ZOzU`DXm=XCCLy1TD;j@XdzX&qn9Y z#xKk!^LxE^%2@z`NA_SvSl_QvDI?NIISkJ~qspKbily!xM$wNbYC{VD&>$Hn#d zXFponx6c>&zc2Aq_waAMc((CsaqBVvg|oFAxVY=7y|co<`=x#N-QwOaho1v^KRp+J zPB`qn$=l=7+2jlP{%Ua_OA?rD{}p=nUGVPr{>Al)0-YJo9RcP^Jj{`F z5{==LBcHq#Ku+TSIuYq7LCozs>^Vz+%U;vBf8%Nh`CInsJbFqG zc+24z{jVq{M%{FyvBKWx+bZlknF>o+9}2tA@rF(0e-4upFc{>Vpj^q+34b+M#Gvm>kk-6QH35I+xJHiFeX{0jcVDK|^&=~`|VRNkbTDEo~`u+BWlhyK! 
zQRBn^=VX1{U8qp4{L84>xVO|5&!dy1*|fhln5_}Qq}6=5HCcJ*Q<7H8(T`!(gEJ=W z*5hBR1KB#s+HEI$Lyo>7%sTC-e-6KY{gkZp>Dj}@nZM`EI3g935iS%GH?#Qce@+&~ zF+21E(p=Y6)n{N0r40J46Z1KJ0yHx_c#t>C!Wn68XK`_o)iC=IXgka)L1Ea)DN*HQ zg(n`pYqSvUOep}vo~dtyM|$JR1*6G)f1D7g6uf;bBV&iti<=dFG~B9Q{Vp#Zp3#cF zRCh3TeMNp66OXRdeB+9c=%R3>T1_lhG4??L+R4C*y#$8WtFw0GGylZ8(E{AJX-SE}> zZ$C7vtXDQyA20C)90OstR=-M^xKPKWNJR-7`IzUy{9A*vC>y+#Xt>NdZAY;kF(sj4 zlCB?_FENWt5<;=o(!dI`2D(3pY6p4MSJi#rtv*_$pNC(a|Lpzg?2O(gv}yU;-ctdu z3@Q%fgK%07iF6CXhfkLjV_3F#-9@4dPd+cbCvly8{qX5x!R8ph{>r3OPrM>2Z61{x zmVc==yEaeiIG>3nE=$pitew5(NYZ6C^&c~+jDi&C$TEJ5u1zRUQdfw85xVBp&<7}q z2)cKQUmsv0TyX(*R)anz96 zEMO_>+osuaoGeqo%3J|gjv~s48w^|L40LrejdJOAw>PEC`*#q)JPq%-dIBpVgg)1FD#g3PH1d@aIzbS{uFfeN~x8XX?nHQqd zs0BZFo^l48&WHQbvsZvWUq5~o5 zW5mge6uJkuBlb-z!SsHp6)_>I3K+fJE@JxPkcY%>wP&y{R1!E`UaAn{8>$0O2LufW zj{ej7I=okXuD+sU8oWF)`cB3_M@`AkjEM+C<6r1q*$F&#pJEf|4zwTRc ze(Bkv$YdBY`~gaszAt5sDY{tLzalMnS4vFNR%GPihqN5F){VG*;}Kt=Am?hQ*xmJ5 zhG$uRhJa@ohW2(z6qS57%3?)THf}88kwG-dr$p_1Eps$fwDbOeHGK2))|hWZ`W=xQ zW5$;=$6q9|W=TA8zZ@w(kumKbEnm`wkzdUW*;k5DFL4hi%uT#T3zy6AZL{NRU%9&Z zS6E*uIaod(OBU*k-iiygJV4S%CbGb6)|PMfj!nH=7OPt4yl?w4V7eh=Qk(GbJMYSE z?EA&CYMbEi>=KsHhOqoBI=cJl9gmyU>kq4cFI~31OKU@Bd889{rHa>3BD})mQN0@B z3QaxToPf|u845=TR+_MMRvbnvU?uEk;E?OU`Uf4e_! z+n)a~;$u$erFU0{zAqd;xYx!;kPU3&xx2>{)wcUo7VbziY4eJ1UulyK9;>|jYwlKy zAdOGwlRJD=vd2Z4@2ewa2}+L%rJq)ZsIY56xiw+=Y(>8>M{DKWg5NRh5_u*S9UkLI z?H%2H>6v_j&LAIm%WTh&s(G9s10P@ZY@7VqZo>OwRN(7yP-iv zxjOovgF_J8Xb0%1jGnM!X^=HOS*?r^^+#s*U93~{ zJe#%rgZ7K(F3B4I4|{hW4fX&34}Uge7|YoAF_!FvknCfZC7}{66dDp+`pbo15Ca9iJ}Fu54n)57K_w4Fr)qUwaY8PaUMk5fDiiVF(Wfs+MJ! zVHkMMh=?2}Fd+`{gn7|D4FusxAPGV(mldBXQR@+@bBcuFJ_=A-zHh8|A0Vfmy1d8O zM^)8lhaU-x^OtNR8{mSB!h%flf=pNU>PQ6bGd`=|7G!N43^-Yb!h*lP4YF(tej*lp z7!l$q5pvQv>Y1_XATH{~bW~1Q)QCj1epA#dT=Wzo zd|Wkpc6I05u;}+drFI@5WW|(EM1Kg2`D`4rY#Z~HTl!O8%n2MV3kvLtF}kV~d#J&H z3k7B`X4G{8vp9gk$vw zR=s8^c~vt!9)tpGJAu*R@xGEV0FOoH5%Zx-GO)dF&wkZ8{JP#}WcOG?;DsdbwyQy| z$&-#r35UtylKPSD3A%3*L`-1&O+b2glDz<=`_7H=xa8a!LMo4b+McVNUhzm27!(Ua zVv>v6L+(`FczB2JZhJ~CZHkU)ir);Foe(eH4y{hU+O#L6o-TFpFkd6x&CElo+GkSJ z=|E@)T3a4u&xM;`B$9?)(;z1RFzfuYJxtGe^u=euhi2lKA*H`y&eNYF(7)^TN~lQ168*eBY3~wNf!xCU>}2U*k$0Vl2{xvxYU@S)r^8` zdUU)Q2inq^PG^D<*F-J`tIk}%OqUq>EE8RI9V(?SoS$iCn<<{3l?Kk(t>z)H2ZWW% zhAd`wN@ipCf?s=qKp2Sq8xZzgmQ7NYdVZEB;ACBlw%&VN>@?NMdXjhBek7ghWK~}Q zoUApI9KgwH2+MIjo#r-@<4YguNuN6?nR_Nb_cku~{94L|>Re)ec)+{de7ZbRL|%+> zUhwH;fDw-w3IE)lmvEXln*Pr98=;YV?|8c2NqTq3BkxYi$km%`cW(3EjF-y4Gs2U7 zdNcoSb$VWYer890(cS`U?);L7xKca7$qK57DA>p=sFf<*GhOh|E{393*v@UCCrSCJTTHYNiIiK&SAb(%Frm*UG*xakkVT5+BQ1CQ-w~?N(;q>G;NJuL&|&?F-2Q zaI%iT4qz|`tS;rUiTWQxmVbS8M6C0{<#!JhxyoHH9nv-^cb?7V|9FK8SGXW8d(`$P#M`-88urOgRF{^u1YYiy6RSSJ+dmf zpenJ(?QHrfAn|tj9TU#WfZeG&H?sOpL3KfA_1)R(d&rs+>6%j0ng?z*6_GVn1wbmR zrf#;TP#TIuJ;+GOU+AOHE`YTc)OL2(cF)#!)bP*?)eV}~Wv|pejjVe%Tib71_d>ev zNkQGXbbV)M-AlLnG4=YHOLcGE>R(3Ize7H3@2sDS`<>*O2Vu-(f}>@TPE#3iI-(3a<0B9YOEpr#upBnjto7Gw@^m z(RziA34@KM8hpn5P10r*LMwk|Hk$tmzvsIqb%saccvtyC7kioJqh`&9SI`#z{7zlX zuA{szG8BhYH2;oA?Lp*DsHqABDN1<2O`PO7GsGD7O zZhg@CzFCkM18fjvzyLU9%i<}U6?EHdfU-yoKQIB_?c)WqfILr^>`i5N0pQai-*`2B9#wc>y zT?D831* zrXzc%|Fx5~`KyyvYBBr3W40oCwyJ2hrf0TpVfG=@97S%f$ztx2$6RalTzk=6XU|;s z!dx%Ye81fMpvC-SkNKz3^UsRrpZCnaSeSpsv@kBW@Y-VGt;fPt^ukQh!femN{KCR} zruQG@-Y;9c|LF1lQ}p}KMeo1#y#KoJev@etymJw{Z;{4xkv3+L?%pD@caib^A~W*` zmYpBi_I=>+{J<6Sf#==_zTOW4?>`7KFA42j64|%3!*fYIW=Znil63Eq?E9si%*zTp zmv`-3R`y)p9kaaW-m-e{vgZ3`ZRQp1&K13VD+Zn`MlmZU_f|}MSIpn9?Ay5n27`n? 
z!{}h3O^^)miUp#9^+7^`Zv-vvKbl}X0%*+Nv0#cys_Op>8H{MJ`wI^CXBo`BI8xD33?+o~0KX5RAZ`W@=*gxQ4KV-0hK;!?6gN6RT4hMUcOZee~0XW$BouK9X zi~nmGEcIs@?7wBf{$zqZ{I5)~#lJPdy8eF?42ndXRThsog|SH=>#8hy-5M)mnr2pY z|7~Z|F1MFmRi#tC>3Wf9^Xjsh!944NW8KvcW}g`XJSDW88{={x(fyQbp(s|GSs z(4w~T!|S#L>Ek`MRm)R-S*Euv>Z(7^4wt(9TN6w)pZ&WDX8hd*bN|@{LvjCXg4yb^ z0w&l5rP3 ziLysYox@N~u+a&s36@}@5QVwlBc0s594?SznZHkLLF-C{YTWRa3FdFAkW4kf?%AwU zO|UOi6O7G-@ABN135GWk157ZKFkph2GHQ&G_lW_;dB&KZOfaV(CfIJO31&q#!JM{C zu!MH$(%~%=tfCz-!B$K}znNhBg)mZ(9YPqM4SlK!W}e-rOR!(HcBB8*1S|gw6D$V> z6aH?3rEQsDV=@~~@l+EmV9Nw^U|fX>Z<}Dkznfrb5awj6306Th!4#*h%~{P0mya_B zK$A>FH(==IZzh=J@RkYI37B9xznWm9nScqFwrzstZJS^S+V3V9`nw6{XUXtzeA@(z zk=-`IPW^6zU56+-eK*0%G3gS2Ho;uR|JDRUviy|^rocuu!M;*WFcDOu={FNB=bH(} zOEtmd7=JdwOn)`Oc#LseRry%VPbQd^CE4x?V1hM#H^Fw&Y@1-Q!Zh0^SkBnrnP4eY z6D<92OfW0J1T*^!6HJk&EQL%s{ksX~kr88<4VYkNznWlTe{F){e>cI}znNg9pG>eD z-%KzZmUY_%gX#Uj1e*d(uzTAk7|*r|c74kPQ~b#U)B86j*av#O&sYnMZ4+$opG+{` zAixB>_PYt@v1Nk!^#LYW{?8^@GVZ$xc9QMf?A~ROjHxBYV3Ct>^6fcV1fkzCfH{%$Im9%(*N898~JX6wf!d(Y#ykbkNakV z8Piiuuv(J0J=Fxe@sB3hGSvh_s+aw4f?)s?>;_h!w7$2f`yGi z^0rJctk-uFEcu5Cb{#U!2GiI!!BVzOFq1y`*D;96FD97Ye>TBxr8}i;n_!p!$pn)} z`fi(GPXEyayH67bm|$FVKTNO+`7INyWy=I(f!+INf?bE*0!%PtnuYt@CK%z5CYa?< zCfF41<;7o3Fv(v`uqmnuCb?yT-TK)C)2Er9rrpgB`(c7Pc}cJRVuI~L%3+{!fC*-~ zZGu@tB@*Zq3DC<_6KvPE2}Xb_626;YSPazn4Q;k1w2SmlCfKcQ6HIc;1e3G;lL;2O zWrFp=47N=$`)?-Lz)vO^=~oj>W6K1y`Fj(r;ujN){8uIz=`Tz$jXszS;r~Gs%xc>N zy8#K@Ho;Q1O|ZNz6HMbDO|X<7CKwCspG>gp-%YT~eN+=HSqJVbFCB$ZvIk7CMY?(r z{0}Br$`2FlI-UKWOt3f*{1+1}@P`TZ2pZ;vlymSj0Kw}msU}$5mI+q6WrAS=6U^?r z30Bkh-2{v4gFP-rMzO*G6HIQ~1hXmL>-3Wemi(Vgu@96&P&?q7ZZ$& z?(a>o$5a!{dCLU5<#78Pp;Qqt!Pfr51iM#!E>B(|>|dH-;eTy{N&Yawcxk96n9ffo zSPS$c;kyYY_rEs5ZUZJ*EW7~(cLwRW7uTKxOt2C9TR%*&K6+;q&AERt!LWZc!Q=oF z%oJYwA5E~2Kbc@7+a}m)TA2=L6kvj3pqnG`dt;YPVnbz7bfy32Ot2_RxQyC=GQs+{ zOt8jn6O10N@rwzj447bsG;%1!KbT;0+a_4xwh31HlL;0T``rYK19i#$VuF$B-R1vF z6U-l}Xh}7}ZUZKm-L?saf!_Ve1Sk`(}cL{hbMRnrebM>HKPf-TPsJ(ZGM0 zU`}KI!UX$3XHa~}=8q;=-nI!=@skNA_b*Mb-T%S_V}VgkuuDIiU_RR>*rz|4V87A1 z7XQfvOM*FUn_#!TnP73>O|WE)yYkN_*h$(F(jQE)TmNi=InxqCq`$RRt9DXvvN*DiNf+=F0 z%(qN1*`H0Y0Hpm76YMb61XJ8H!ITN==lcF+f>~{uVAjxcKbc^v|H%Zi`Jb6!=g$3& z30A#Fa>ReI{E=Wj2CMQ16YR##e>A~%b^pByM*J5h*#3WSf=T+VMkY3%jQ~urT+Al^ zn+ax>Jnwms{+kJ=7O#!|cP7}^ea*j{VCF7g{)Z-*9L@j41ba>V zTN6x5{ksWPvSorvOZ~|N)7?QMe~fB^DgLbqmO?ebZct4ynLnCf)wcl?tet9twg00D zrVE&0TG%3t)S~O(nqXX4c}KpPV4)0w5r1ofx&AQ0Zr)Bj?DDG#rhm1uQ!CU|n4E@h0nAzW(U>4gZnBsqHf}tmq{=x*4$=Wu-JcSd)^yPf6DJn|- zVuIZaH9Hc6yb73L;h|I$%uf583AV9if~EWi6YP1=b3dvHwgd6MHo>A8e5oc_#WxcS z1LYANM7U#4047+7z#P>CJIRTqnqcFh#{Lk77t4SN28RD^g26J+?HCF2_xomo-Th{Q znFA(RDYqrh4bWejU`anrum}+RIQzBf?AstM*rsc%~ZOfVS3 zOL|?Z2`08>f`th#4Uzn^9R%z#24a6U!EClnuzKykF~MHbTLLB+>YE9cLN&p{0TZl= ziTINVmVi~a8)KHoo&4Pd!~Jf8`TlN#DbxI9f~6Ch023_M9tIW+`(}cn0Tb-r&n8&t ze`kWVd^f?|^!S>&U|cXt!WtMTPagQ;hnY*&$m z{BD9x|6+o*0O3L+f=d$0g#fYa9STR90Vdc3%phQbWeQMDFyTfTP7F~FEClI=K((!@&1nH|E0SkrmAn5el{AjiP&3GEbJx`XWw_PB|C^ z?Tf>k)nfY|X`Q?Od6bs^ibIW%e8U~hyZcQ&M+7(H!|pb)4bB9X%J*BS#;K?RN*6x} z^a|D5#D@YgYLFjalIt%px1rs&nbZWx=^$PZ7*y+s647aZ7|KI1sZd4)HvzIY5dlJ| z!?;vw6*%QVFqp%DsPLKa(_oNN0uIJWI0qvDNdtaDd!emP+j$*fY5f3zo~Z^PFe0tl zI=>DA43b$~!sg-L$_8HOB7%slhcWP};s$!4^c}44(4j?J-Dffs4rNFO1I*Ory9xwm zasDyJU6p!8Uawm=X~rWqF=C)%57uW6paw141k zFX=$dsn0s5VJv7ASQsTVBU-U}AsjOXX8ow!e8~`1ONnM!V#i;m50=66+oaJ zdGY`V#tndA*xjsgq?b(q2v#i6pfkNppkG#dYhUg=y-P`zR$P@ps}YyWS0O~!dR7Vt z4HKEV%9(15;SvOa;@v)c_0gBA%1av*q*+!)SgJ&}KrrJVt}9s3m=0)6gAGOkJ2ZGm zHSD&gIbf`_*_@=YYxGuOmI`Zl9ouJ300d)45EwKxZKX?5t4*lYVpN_Ey8@O9f>}1| zCz5;-G=P|-!%y_YsPSshGWmhLaiC|Wae5+guhgq|OR!wOq;|@;xl9D!4JELoKk(TV 
zM+=46j%uDMX!K17eHN+n@zpn8*E2anWcOEE4F^7(40ZL?!f6o^^s0H1Dhv#}MGZgC z>;eXq>5~X-;{+%s&Z7@hDXkeT!V)7Aw;KckOfYc}NX$tKtii&r5hoxI+QAKC8ia$2 zN$iK~MXU`#s1WX%<}6>xMNv)1usZM5mUyzVyF}HCNG$<6;#pA^VZa2VRN3PjH?`7@ z027S6t;Lt18DOtmjRfQL#i%A24ubw>g1H$s_-&bBm^#)OstLw7M-~!=OM8I~Wi?VCfmRZ%EkpF%RD@MQ84Qoun)hC2(8|kdXHg|s^o3}pw+r}z zjxJU(03qkuR@&)c>tX`Dud=uhdQ?LFXmJR%5_Hfwh}J>P%o1{*Pn$Iyv@vVSQ`J!j zYeFFb5G?c{LuZLhr!N8a^b!Dq5gGvy47Ck{C2xUXv9RlBT4-SAydBkLyNswHSlfd+ z00aXbIR9~0N9@Y44Op<5uBMPBv2$YKjii_)xeeQr=*O*OZPMPAbu2V9YTiu+aEjivD*P!9Vx?1lL~_6(d+?0 zFbMzzgHu5;!gmmi)qx6vu>&AjUVqOP2zC%;#eOH4B`S^^0KtZ_JQ-9FOe{DEfHDL> z_A#^m0KpLQR1hp&8ZpQSfM8_<2S0~wRzL}ODhPJRt^vkZ{4lUd57Y+wsAtX6I3P1l z6ir9i42uRDLjse?bo1eEePqR}wM>r(nNQGgM#a59Eh#KdVBxL~oQ13T?Ki5kq7_Ke z`DTKFXzXom9;yyn{A7Y@1tfS5LlkXk^}}dw!h#G}!xaD%OnK1o4<^_FeJt#E6O1Pa z?ni*f>aZRr0w&l~Goj+ygMbOfiZW%%Y!KmSb#n*Zn2Xm+B0wCPJ!44>sx_{2!Q90J zWHI=}G*o)J!`Bb!wePpH0WrsJfnZSuR1nO3`0E8IREMy6uXGCptE8y^0)mBupd1-+ z00etLf~F5M*e0O32{!-`?0NMI|Gg3J1g6D(i6UDd7|52*u_|wHZ`jfJU`@sV%-}U! zDhL(|fMA*C#c=RDel@csTjNJT zEPOLg)G`vMjdU_c(9_8XQ4^jZEi9X0;{IwW&Cc z5ol~dA>{p=%)>|Niyu7O5yzbl+WE!q&M$oP5+Rc#RplT3^z`als@(usCdF|43sVa>T9Azbnh%V5W9v9LpVcLzl2OhE9PVF?wuqIbNGn%(1Sgf0u6@nLeW05^!)wi8(#Vy#Nx zt{>sCILhNn06PF6m;ySw&UJNIO9K4hQ4pj-N6^jmD0k3~-}Iq=6R|tuA|r{OI?!@a zh^z{TX$otpxW3XUw z5e>HxLns0unADJPQn1P*G$Wpv;Q{qGCSou&QUC~sAs3mvyX|p05D|pYp-~4wFm5tw z8w3*rK(HlerH=&ZV<1d1yzY%X#n*`m@tXIH3v1kX;nhiBKkJFBhiQtzQLo3#R}`eV zYp*fZ;@O{{(U|fj5YD&&*;pzF=I3ErV!(nNn_wMxjH`7zIIWYOwWsk7mf7h!RDe(e zfMAIr!Pyt3H&2#qfnbq6_u!Mqcz6z5?11C_n>R&L2#gXS1!LFGG|%`Kf{WD&2*bF! zf;Ts!Gt5=CK(Jxg>A*ZPpEm{e{1Y17Qfwq@!o*P4s>6N#{ch)e*fU=b1atom4k6=tx z6Rb@Oc=t@7bk(10AQO0ww?1M`DTJGk{2e&vH=rJamxh5ZJA(R)F|(F6D$uf z!D#uz`pmwWV2WfNLHTUZH*ha-?>7@HBwue4_YyF{LLG`I#nupd&n*+IvZlG9 z>k-ui3z%>?Tao$+1WLp8xvgFqIpClIw3#?SNt6U+}9CHLI~n?i4y zVEwL4lRr$b%3yZYt2Y4?%=Nnomi^5HBY1xPVSV6WNaDIa0&QbjcS6C2`E+2-P#&><5k=p!_c zZJ^k@H07EEN@VphKWMBu@Hpt->{1_#Y^CESN=D%ykljy<=rIk0TeU%M4Nq7riOiLQ ziRB@#i{*(ANVR-mrm&BjakFh8i}PPqd>gpox|MKP1VSB0@kMI(W+Bk`a~$yP_@u@e=1ElQZv;aJf;JN z=}6UMr2*{O6V6kaJ_ZSWz?-Vk*Dhoho+Pzb!r}JB zE+ibz9@fxt!Tk+JPa;DnIPp6jCLH9^-_syoVMj^sf=SmlK`t%udm~^~2 z3m(yd-xD=*IefGQrf%;u;0#{%C?R z+A}M^s-9|Z5$E4s|7N32U(MpnXp&pcg+RBgediKpMk-pL7#XBIyzH6X?cXDJ`;TdDQoq#yqK+k@9&_p;l%$h|Zsc(Of>tK@V>pcNp!2B71 z*KSe31Ur1gZC0}CP5k>SlChrwqG@?aJLf8f zm036?-AXLgyW_CSC8H6l3D)w|Tsrr{o1ioz{f%5jZ3sxh)&D-6qAS9gj2Gm2Ix z=5!YpII_~mRP(0HW*12DG+*_N#-`rSzsxK9fNFvz+(p;OR_N*K8B~l%*pF1azQN~l zfB5v%5TrXnTzpqhUV-bi7dq7TY4TQTE!1Z+3dFcb7X1=$Wf1 z+t0`yI+e3e`*u$bGk>{l(Q|2cQLsy%gWp`QT9~CHt=6ky$57Rh@lrY-Uz@CM@p^(# z{_`p~T3?=HU(kZZ50RU0#ZAap&280#PYhAXWqKjn28nxG)H@!&D0E+|Bt$3yV?!|f zIi;m5$WEE8^L!d+lBFtgU92YOB=H$q94fCjT{<~F>y>D4u4@O8bwKgu5WLY-MOf=xg45o_j&rlGo|1!y4I7GrCB6 z*&!b3p(PYf;)Du^68%|HqFr~QCyXil@(0U?F}j3X#zbICGFmoPnC-+MlU56ee$II( z&A?*oKzta&#&X%G znFQ)*GR_-Q#AWX64y+ZXQ}%nxz3E668+Pd9Qnc@phTSn8HjbhdZGOFI3-2nT^G$;L zj?12{s2XnYGWo5tQSN+B>E$6E&#w(wdUgq>*EneShR!3-b?=*p{MrRarj5iX#qAc2 zbb^wrh6+|7G4vZ&!MimRkbOH^=+;QIt6}=|ihia_hSssmv3B$wi=l^gvNHbK1Zx=9 zH`4oVg2~Pp?aSXX!FIgR=Q)zlHTo=GMIh2zT4%qdSbzuun`DJo1Zy2!8p~ZvZQbYG zC#gOJqBU;_MJiQ=n_G4;OCoeJ0H2H=Bbb>;oMy2TH{=Q`6>Nh-Ib8N{4!$k(&he!DI?a4nWfX5v=K3qul zdOX~tj32g>P>6A*F`!{?Xn?9SR8j7OTi;N)K<{WHUsxRwueiAg?K66KKF25w$eh8#u zQ~8LwX&R{gi>&ZNhk?Fi3LosRgw>Vwm+{F(n>He+E(nyuE`;@ z;o2v@D_z1jNhVo210hZvAJ z{)OohI$s&aEShj;fgXnbh+}1L`Tk9gU%G~^Ey{}?u)W)VFZpnoaT){4`=(>Tjm>N8 zHXqfF^i`k59NwLtXInN1m|$6MJW?)%nY|)NmbRV0 z<{2LaPhPR8x;WYmy%yQ>R{nesu1W;=a&c}_YxDEn2kYa$=|`>~LZ|e(ExbL77C3Ff zY~D{~dLTV_5>8(gKU|@5^uj{)v(98Dm+XYg 
zFN<5RI|uGuABWzYd!jDE%!~Jb-YfLEx%HZ8*!ud6b>ItsTF0uh7rwqe^vbTh&-GK& z&JFCo%j*@A4?lZeW>U?5_{{_hy!)i{;l^*D4$sft+Wc~lk7;U58vxL&Wp1srq=P0$ zsxs(Hj~vu6?geY5|MvLkS9d|!%Mp3M)STkTnmHQ{B`~wC5lsjNk#T%IE*-&iBcVBs zXfViMI6_yCVQ2yTt(!SR4EQprn`_hV+Wn|$Gj;z)G0`X$C(_rpbQY`5jYpfuSPT7yN})mqcXR}}@i+<8B_HakM4R!FK@Nbmts)s{&s7W*4mSoB6H zUHvQ3iB3gjbJ3UZ+1ni&!&o-sCfU`--FPsMaWr25BG(K0G)Xm7~fn1~e?U;x`*LymGYta#Z}f#Jies zC*oY=bCk|FsS4}z4pYvSW#hx3_L)qaEZIF{WI-ER?)5=zON*KkMZ8cimz}t?*!kFH zh_I9H)Le)TtJAT)tavAqipCTiKIL_;WB1oAtwOSo;uQTmIPT)49{_@-xwvLdot83V z3yV?`19PYATHTLF>g%dE8!AXRUs@w8abgb-WO9=r>_@UZhTiPHpMy_zI)5ZPJ}mp3 zOq9Ef%c1p#?YNw;s9H@geo!Yz!M~dyH0@^E&Y006vNDDXrcw$ zbofxw+FfAU?mTrGu%sncM8*=TX}Ox19i@0hund=Zdy^wg!3Yn-zCL3Xy?~49X^D)p zQa8I3VCQ;ua^ULenSh*7)ggl5YNJVqSm0xV0IpAMRJU~%yxP@tAVZh2uqiB`QOlBu zgn>D@qr%0UmD{Yl7InBsTK2(jYs0W(&1B@dH5=%y$}`C{Uh{n~EpC-lZo%_C93*pm zTyv~ko3lZFGI~q0NjvV2C=#&r!)BLTLUPxjd4|0l0Z>9j23l3(9Bb~Lo6dI8&U{L( zc3RnI>rI0pISwyx=j!G-K~6azb8##ul3QAKgoL7v1wGlxulaL_Epwb(45$2~RSg2xdYnG2iF5ssN3CwaIalF0Rx+h9pe2O32CZ7U({aDBlhclsA$M(6&mn}? zuX0W-&%7Nzuf{NM*7y#sIE^2gxjA`9`^>bs=HTTAa}4>mG_$cid2#i)_z9+j z{25sd$(S|x#@xP*D-`Yq3WpAH^Mhgb{?mMAheux8M6mYx@b^W1CI^KSJb)GCWszSMdMbc#hiEJIo?V8yDrfb6ui9hrp540HJe5|8{QatXaLT%e%+hB zC?!V{)m}86DUBbEnweab1+n0VHoaDFMC7fIk*RDvP7vnj()Gg>{vlWTGZ72-cW5QO zQ>(llpWMHgR%FF`cbJ=Y>GXTZ*FzfmkOZN-Bm8kA#R_<(QI^PJ6~GF6X&1YnCH^i{ zIQRo?3LX!0V|@_)I!AuU>izOD_Y3JoQ+rQMuIbHAJezItnAR;hm#ViLVI+npsa!Ea zVk|c~v2Ww!B{Slsk{1MxI#Ew`#=<-q33O7~@1<+u^u12&EeUG?cxA_t(1Tr-FLHMx zF^myJFUa5K#Nsteb_O0m1szx|(@AA@r^hVI((O`6*}HbFgT=j1Lt@uM8>ou&JV(`? zD+p+ zIPAN6aZGx4nc^tGY=dHQFq4Y>z>?*KN?i2f@mZpj2seeG*h;jyKU~$n&$_FG?U3TU zZoAldMVjkpw3SMbt%Dkm6KpRm>f0lKQ!4Nf4fU3A^=s3qnt{e2IO-}y>yG&7zS3l5 zXKFAK(t|2^>vzoyC+=Cu<1vkt*vIK^`H|)1OEufmQi5edXV2i9eHgp$%ZVSy#`H+h z_9(PIIm1+X=9oPr&X?Y+bhlf_-WVercV8A1?aot=YkWBNjDVF=Cd_B!6D`B9HY+S7 zp_d6*#?@Vk1UApr#`U4ilPAs#9@;nW#r^RuCfRDa6Ia9JF7wgl6)Lx_r8ESsWvrBF z+oPy_)_n+y&sFgNOt7Y->pd+t!qXnK6@UrW_whG;y1chIV1iYIU6}GwIpu$L|G-?< za_xXE3L&~S!pXNA?^=3}%rOZ!V zYMJtne{ktNV1k`5lRu-u5}Hyym>Y7sTf*bwmkJVKf;G|U9fERLJ{D_Z7-W(a5ceaW z0!*+~Z}~EnwhD%!29A=QtJOYDWy+zsfC;vGAWPJJS}C&i|Fb68^XlDU5b-gH%8{Ci zP>g1u9^OkA=pnC4`65uLix2uzQL%YxU4YDMhUF@Rs8mwe9Vh}ejZ!wc{7ILDR`qtA zp|FpFWh-kaF@hS2U)U?bD$VMGizMwfy^1enh2cv=`sBt?O0%FwtM)D7)Gtz_6!t|h zei6)RhUM1KZGK);eoI~aj7AsptJZx)UVUiyC9-zddeuIn#41$f43Wo=+*sh>#Z@=k zPpa~56j`j*9$xPm)WFMV?nIEPK52LWr!;TKc!#vCaWe zyO&@r(uS*#xn(RUh)MOya5g!Tk@N+K|MkLW#9G$n8^RCLXH+jL>y}+Py!@;Y|CQ48 z$i(JJ5Ry-ef7(1zJfbetH1$M<%C*-)Y&+3y?^R@du6(c%4>M1c@Dh37|DYy@^Ile} z*6yQ^X1G;m44ze--$Y;b|GYf&v?5S2@N!yCy+!zB4ze_!^p45TFhv&?oKxAv7F*AH zyF89T29_@g*UN1@AU2WRc**cxs~~+wwS1EUilM8KC?9m1!!M0jk^y=&NUCL%yhhHC zCobeI#+4?LrBqmqcx9UD_w6!Pe&YX}B_Dl=|BgXaI>byUL~N;5DUVyY2NN2*oEtBB zF_py-y1YND@#N`C-o_e8Fs)eMEgBJcQX= zf5&4RtIr=?jPYHniVq}h{KJIGI-f?dUBc{iTh7?Waqsect z9lgvmndF>SwL^?f2`HlTx)2nj4!Z*rzBt+dN#Qf1R~EitxHKuIUJS(>i~M%#s_Li<@3@B0$khFf`kR^bTDWIZ<6#wI zMnb#cne57>``P==1E+s$_P8hT;==0}h5k78Vq7>>Mvi&IYTR;ArKY0z^s=~M5xiqp zQf&W+&?Bna49Sv#=h?p=x$0Uk@M-kytDB!*27IFJrVlSy?cLqoP>b3drSUv#Q$Rqt zU64tgk$lnfplIH^g5EXDi|+~^Zp^YbrI9di^A|Vp%l)*z9rq-bjf#0{P)^xI+@4R< z(~c`I`GWV8%e`n7p?lgW>_(c-mp?*pX`2nnbbV|)&PHk3$fFe^J2E>mSYOi~GO8<8 zzKhvQMz7}(kbYP6F~aM^vxWmKgYr&BZIsmc%ZK#%UZO&&CfEViT6&fOKhJ!WxSuzz zWK_@vmOcD?*S)U<_wV4XXoSFpPAC@!w1sDoGTG?>en_>|aH6S?o%a6D8c+}4hJ2QG zUt0^V_5=F(%>L9OpX|G_?+^F;B?jG}j*;ah+dwjogG!WwA9FVc;%(A5s}Urz+;9`0 z=J8O&H~q~HHcO-yY!56qJx96R`}m$FQu$JLGMfdMZ=E~*#{DwgJ;8H23?;mRtKLSu zOQ`ErLvQ6SJTqNeE!OQv7@~rWr3}VSaGcvKtUK|BZy+j&naNvEy9FrEu=T#=BlW^H z+r-?kIy~G_)Jpk9Q1IT=PJ1y$akZx5!rY_`!CY2@#ort=<-#Vc7Dru_QXa?nzqy0_ 
zD&!O&tDk=%+DY)u3hh_!Z2l1azTNDKdPzBbSIvHdBOO;+P7ht^k@R!3VNpL6jVZ^x z#mrkFNfFXhHV<_bv(KKZ-Vx?90N_i&LhA91#f2N{`wJm?*tf7fZ#J<^DUKc7cvc!& zQngHsB?9Ts7gm+-iU~eu;&>=AK|AjvZ^7jRqPQ+%@?~zHn^Z2+LYc5iS_wsr4QQt2hI~4@ za^am1SpudQ`yjsKPNmMQNNQ;Wmt^CfPwBVrr!@|!KC+Slh<3JByjFmK;m2t&l`y(3(G+W!*fs*dd*#WUSrH60Xwa zgdV{dd!JW`^~5xCbzfZ!l04#wT4{17~AJB%=XHb zR?34&e>{C6N9@VcwW`s#*S+s2S+YK2F`9GUB?#j>k0A5XW0?X*Aq;Uk`haV6l6RSt zAu6dqD|7vchJh;>dDD&0YNzOKxGor`F$8;Koxm?F8lrXj7 z_yWEOmg}k&1{tYnbAGLPTcc~9tTBFNMtr_H!Af}`xc~4o%DGj2W$FlH0z@z%&FC(Y zb&A3rxO@$S^xi zE9KR*%n#lhQ17MB-ZZYdUNP|b<@tx3 zN3}ZT!n7o#mf?qM~o}LjgGOx+)FjtLM_lTIfSE7`Oq}JB2L}qXN5y6SR5Ty;X zRZn9*{wsiga-%jn@b$@+qFb$=zE@(N^}bx%O%jDB+r} z%l|&Pwe~<}%waF|=g&_!Hxlemtx3ERY?K@1EoAIk7+9w;1C$kt!Zb;txiQAk0Ag7q zzTkDlERKUkydYoG>0BtxIK0G~qd>MJ&k<&=%}ENEmSda6y$@*_0+yCE&nX-04P298 zeIG`SG9n~$oHILv_H%*QB85wZGPf>-_88|p_`cP0yz+@7r@#49(Hy6e1wb{uxfUsS z-%?x+B3jvp`3+pG(TMyU!_bMcY<6V39D_;2f1J?!yoiOPz#N6{0!|LAfDDF;Wk3{i z%P&@DdSZ&^AxFoU`7GgDQLlV(Z0iJeOGg+qV?IKNwak%)&s!iuh%r8$5rySj6k
AY+qM4PY9xP~#UGuR} zri|^_w@dj@g4idojUbO)8&^}1q|B9k`h^CyXd*LUk-;DWs{qgDmcP)z*r;zFO<{QA ztS~mt(g-?{?u}73V2_RFdgCa#XkgRTz^Cy>bZ9KDOOm$>q!vR_jq}C$Ev02zX_qRz zDo>|tBhXvPGj4Rroid{d$lz{Br$Z4Lw7~r%V{HC~Af6SrNk_Ihf)LSHTv!MZpU#~e zJ!eS~L=kB>%hP+;g;@33D@h=I7wo%rigH53k}dm~m^`ar`XJ|g$eJTMhxH4-mnj0o zQdVSiYuxDS{NTFKEB^g_BN*V%ZsVlj$H&M}bKP?%;iV zas$i_4waMO0WX%LkCo|Nu;vF!|1b9LIxNb4?Hm3KIW$8_cQ=YicPT9fp>%htAc!*z z-66_x;|_^W6Vm|6RxV`(4+0oags5Ju$4& zlI}8PK>XMwtQzrR4ZWJY9>_u;Y$EA|8s5Q#mo-VMYZ!?%I@5jvUcr`O|s*pDVuAoa(Y$lT$5Z%|1lfLuF_UK;m4Un+Jw`5?i42l^2+|c7|AaiO&qluEN1$O}m)V6n69WTgKEA z>Yost*L`V0^rnDM8QLW>L`zbJgB=uesU~N1#+`E4=tpHy;7&(|4~cFXbI{=m8{H8# zWF76yOk9qElw46S2E#2CG>0fOCPR$hixqJyHf+;13tySk-A* z+$UHGCV86Dt?2lOYFNO`+dB3IywM?8-NWz->60D16tDmt{xamggx7q>90gd~LulK9 zFjjn>jxec^_SldtI~il*4HknLu`2;qoHR~1M0Wvz4VI-#AjIkhT%knbt!@+>SRxgb z#ZRTv906fjK&AOd`ID^`SQ&)R(}2!KPhe~&!bP-X_D^Ur4Yl~xbM>M6H-<9`#h#{vBlDIwkq9qh@}fWkfEVZ0h^TW zDLau_;X9D+Y>!Iqd(2ZvnoyBwqPr76MNg$p2TGaa1(^<6XV7)+8wT3nqG(AW_eddq z)c5)8*w#b}m0%(DZQ8vb5cYh{KKA7po z3s$+^n{Y3r!I4^gesnpRw8#ys?1-%a!oOTcG}NU)JoNXxiip*LkbD?~50^IriT3uw}uCWBcA@WyPh?C26l z@ZqP)Aj%%xa!HB=13Nh6VP`S|2xsuwy(6)eqC}qGtdk#v`|cjzy*|}Sk{#|9h?$=Y z^(HMw8s5;3Ysg4~+c`siK1P!Vu?^L#8hIn0bm^e0t=AE`)M#!BGFUgri$yO zOu7UZlQ<8%u7VV!?keznP29hqHQ9(T!Q3!tCI+Qeq?j=1srBfz%;^&H#;+)sTo2jE zTs9;TNWo+9XgXiChhKwBof%T;I5+A8S6WhNxD!d>0wAf|tHD6X>uUpr3q#%Px&ylQ*)tHrb&#*eh0THA@=`Z6YSMQ-t@#9s>#xa zlVa0U)z4_Fsiw|=Q*{rg+Uus8&!+00O-;{F4m_J4dNn;iO_Pv9ws1B*`*3FWY^FzS zdQxoaV0vn8dZveJ?)=s4m+5K3nK|N_Nz|Fyk+V4r@r5f%Q!LaAWJ%Lh^>gS+3qoHP zIP@0CzRr(NQ?1sK$?MJRO)uP>S>G{DF0bnDzu$d17&n7c;QJ#8&PoZc} z^-wGtX-K42KwfilD$^W>%nj zt2JL&;d-kG^vmZ0{aP=e-tf`e=&D~m@6+3KPg?tsv{_ZZ-SBl=PIBwD__~khdY<0u z8}W@Vo~tMID+AOYKYDH-&1_Jde?)%0h2F4&MYB!#eB;XVPZ9N>T0PfA^ifXfw%N{C zd%k{Z_S|w8r%Hft;WVtMNbG99-es8G+e$h>mH2!^|D%ZhfriAEoc^xT`M%WbXNiU_ zzlH<3xdV6oRZIPkQ4+^bl2+sl&^}921skAM&w_>#!!Zpf*QKaho}<-EQhjbbHHDLP z>z_0YM!hlk?U)!U7wM83CTx|FQ$%0T|PJr z_c*1!8-C9Ot93v5iX$qx*}oq6FHNwi8uxQtanXbRrtc=$!RBPmbG&npYuXFnOt7zK zrzZ##ELnJ|RRLjwQTrQsU6^3BPxnwkhTa77BncqSL_^g_^4;oDpk7AvP*6Bog@O-f zN5uSNv85e+Kf((WY!qkAAVgrs-#AowyCY4ar4wWlJ`v;EMlI$qN%~0Ifz{8C7C~1< z$dLUD`!AVpWYrkpKLI_{v+ zYNdkwy-^CAhOgx2D1>#m$_{Wen%cQH=iF2J0V%~ha=ud*pHQ(_c(LppJ4u^lYoS-y z45L~CR|_12Nm#ad=wlD%%5|9ca=_uaGE6s=iXf{PpS`eGv81z0QPP*QyBr2=2K^ld zZ}IfF3>6imWDne)yr0U+mTQcQ!$Ka!cO!b;284wG{emm?6CHgSn0OM3g%uKv1tvmU zaz5}h#0{DIQkl!JjLJqm9O=8taBF|qx68k5D3GVuby2Ju*=;G3Mpekwj(Q~+6xU1Z zCDij&ED+@GeI80Wp5RaGXEZHs=@jGc!a1;(GilkIo3O+S6S__A)YP{XTDzAF)jX6V zp)b;M`vhd(cHQfhz%^v>);BD;7jaBCe5ztMiIw_*G27k=TVg6r0~korOtSHo)YCJ zqw=^W$URU)30g~Tq6amt)rJX;S57!}&%a$y#usAa@5Kxol3pQQz#;+bg6u)m=5s)p z=w=EY#rt5)9spi3sUJAZzs1o5-%lCakozl(7olK!z@1vv#Lr)vIjtOns@LN{zUnQE zfdl?Z`RA!*9I-g}D;G4)Z5Ukg(U}n@Sg1>W#1$^SHc@sB6o&-j)RKr`6I!cqZ_ac& zynss+Y#ESFONP;@c*`jwG%=lthpqFLkyBJ6zX`1fM&}|VGCVKE7MrP``CJsQdDk7su$bPSD zZ)wtIZC3CnHnpdPQLwMUV975+p>##ptj>vo#UuFoAET`$?<=oLSI7=9cxj?I@@FZW zOJEKTplG1HRW_%l%k4LbR!P7YrV>#S#*GfO&fs-oLpNmCjM3YZ6UZYTTbf56%{OrLGoWb;q3cZRRavV@~Oz;Bc!)d2jNmsa33Qn-GEZu34-`)<(5moE$Fj##u}aW&8%2BDzH%rM5puYkZ8lX39tJz#wqKCAEw)5=!DJzBEYZdp}K%$zD58muA0&uy`ZH(!C4eTz165_8)$9 z-F2U7>8_~%Ox1*)L^igwdB5&!-A8xi#F?+YS2EBuNgXRn4KQBrmridL`Mq2Y?ZrezAiXH%doTAMEQ$FY6%Ku1NL-|}5P$Q&&sE+r75!{T`uq#H z-)g&9T4dsbvh`DCYkdB6m{Q+v%p=8hMR|+Lowagb@}hKsw;=Ub>#qx+4v$v_=NVF2 zhx8_-j1(W_IM>}#nnM78FL2qFdh`vl9=|X6`nF_CWV3{tT&P{(I?O%)ma_fR$*FRU zgU}P3X7amgBT}Xowc%Y3EXCQwToujz1*>YMFJXL-L#yNf_Kz4GXO=R1>-GvhG8789 z58p(5a0B&B_iAQ(j-FH_+qFO$v>%|W2|`P;C+w^}7BP+mTH3YlaHlt8wW_KnN+Y+0 zeRK!Wvm8aJeU%H!6Io*55pps}a*CmdSmqAH?KP#gGhuv>wKj(QCf-=nJ2&xsML?pr 
z?@`i3%9XR#uA9P@&B2bL*3ULRg87EqSzEg9yyEMLgAO)$wiGeE+B9o8em^4qSUdV` z(_)qZy|s|CwWDt9k^X1uj5h3vXHg^evU8<{U>+JB3gF`Ud)g91aSkCx6_6DHC|Nc%&&TTI{SyjSD3KQQ391(! zx+97!gv%9zA?U%%#km7J}{wd1-TEJqJmGi(7idnz}VZpxc$9*&|xwi z-YTo!Y9JpCR-bYg@0%7rx)t7<**>}*J`Y!Y^c4;6y80N8*&=W-bAMmUY+q}TyOE-= zeVnCPhp)4u-y;jZCl?k0{bMWwV*LZMPy-@60+wY1;z5Batbqu(=aofZ(SAT$c3@sdpoP|XU=}FIIXbXF zG3c#Dkj!CVadwb4e^AA0Py;B~ry{8NP@|4CxZOXvQ_w9{D!4~bwJke%fHh<&PN8Zw z`2DJAzhcN_M@WZ7$QURzC@o}KG4#V&;4EqAS|$IYMd;RQXsmzeI%^nObLdBlurH+k zyMkdS9UKS#VdtQ5>+G;o#qf%SFkrl|8!eEa5zWm711ksBI2uTx4x9{^C8Y(DLc(`C zf>GBZf}O)DFno0oIT8*iZY^j?2pEi49vJ{Y4o6pX!Qd7G&}#=EA4LdtGSFE@JyeWj z4v6ILj5Jz`1SCf&$RaUmM~QX%3+6qez6b#62fQ#Ek1(~2t)Y)`KJvD5i_yLr&4U3TFpwox zkL1aTJ<^JOk`reh8SB^S?ZGCm)%oIiPIN;*wwz^LXn>z_Rotsi>X4jxQBeF%#(0@{ zS-G4@1xO4^a&(HDU*cMPF~Z|sOCT0ZP`s0%69D9{0uTTI7>$5&Lx9x|LEWl?={xlF$b7Hl|P>zldZ5Qmv;fCMQoCq{^eDkiPRF)JfqAgqY`~ z>K#V;XvZnA#q;Y#dgUbFj85ef_TgMlmAXZKWj$?OJ`x#{ev=&_!2arHU^+1`E;<0f zNGp#ANtdhkkz_}RVd)4HZ2Q6lOT93`CPEM<7}2c@6KoJ+f{`a*m|#zX5hmEyr3qGn zFu`(PU6^3+0ud%y+ocJXf-u3zUtXACRslipvR|!bhg;>OgOa0ibMiHEV%KvZtT`{O zVlPZE9lsRzT$7u*neWbX$KrExF!R!ia|>?e5gg|hC*)q3V7Wes0-?#dFu~L=OtA0^ z6AaTEVS=4rnqaFJCRo^o36}5_VS=q+nqU?eCRpf&33lv=Fu}$yO|W@{3AP@5VS>m0BPk zGb`04m%TOz6;Udy$~2X5&wBNuRIaP6@3E!gNlD6Xv5L}LhVpaO0+hE;$jbz@-aZtO zzt4oE9rM;RSE_U6t+|rH&`61~Zno)5XA7(fPt!ZrUFAN)x6RCp9hKfPvX(#U%HCEl zr))uefsgfvCfLias#hCTud%9AYO0!i(V@$wp1RrFMQD0uQomz@HDJ}kIcl4gYFkBW z_>0iAf~t$waSMy8l>v1F92X|o2t{2A5Imk&H(7IQbfaz-t9~S?Zc?d!e6nsnsD6#3 zZc(>>%l*Oxo4GK->OWF6e8H;QQ)-yX6FS^z7(6+bs4H!laAzd>)bN#}QJS}LT(^-E zzrnf)EvUbuO9`F@CdXc(WVw;-;X>uDBX57F6ut|mJ&jlOGI^F;y)Qt;Qc=7f(R8-# zoOcI)t(#&5xp@%YL=j9y-3`A+(9~hsG?D_}Fl;8Rr4%})=CNsMgSKp?wD9E%@pHCT z?Y0PVHdA=G4(=-L@HO=sX3253+}~_^^|;xQx$UNMo6=Ovwnl4DahoP~d+9(MnMd14 zzT4WTlnwlFrr`Ge;#SqfHjW4FYkci?q8&Dr9o@Ta{`pOT9vzO@orM}5?}|GfQ<6Wc zrF=?B^F)~{p}WI$vqK}eEySiH)rL0e0cFhTdFMgqyNJzqdC+&tryVaig*`pWUV3!u zcy#62v}KCEOWt%TJ?&x{eOK$@+;sY`(}TSDL3dejC!n;e@@yZTD%iOn8R9qp~ezP!`^O*0%Z8XVDo$nfxCKep58-8!^8)1&TUI@Z$+QjG5f2;Pf+8nx`vW!_{QuW^us^oJ ze1Bnsy$FDe1l%(WlKmK@kPr-eAEG-Rs$&{<%OkujDco-?T;DQ6#UrvRI5KfO%EUJM zc4~CsPV_y;*!#Y*6%(FEe*v8U`g+PrY(}nq*Xv6gBtS zIX*dWCi$^vig|I$i%)5`uyot#^!nj+*lfB-dFD&MEXNmFZU5Ca*h*dUAMju;dBtsM z@V24xwyKSeH*Y($|I2u=uK(r1{zvg(OF2JyFu^~|gQflf52o|`Jeca=&VzNa{X0CE z+czG}?>Bg`KBC{`!K{A9gY~n0=fRpTcre&azTgWUEUslSfGR;f7Z1UMc?1fE|HOmE z{fr0OxZuH%Z&G-C=fTi_$%ElBXZ;foX7CdacI|=(Q@!B93@&*v*iSrI)&&o?gz=pR zQ=yw<`^JMw{=|dn{3kq^!Jp&7;(n6{^T+tNc(ANL$AjVh3m(kiI}cVyOZ!_q82m5t zVBkw0?C3XnFw(!92Mhj>c`(&K&x7IpB_7P;cX_bD|85>^_iy3B;tam?U^0J(2Yb)v z^@9f^{!2WV=+AgC-{0iH_^DKgENE{}T@eyWqh*evb!hK~MW59xUmv^I$r^=E0nPn+MbU^E{aIU*N$s|EKX_ z2|swStUtknCHzGm%;S;=i@W5(f@%465Ior2uXwP8pLj4o44q%`VBvp-2Mhlr9xVK~ zcreXh@?dd)#Dk^%B_0g@&+%YMf5d}j{f#^r<|Plt`%50o;cw%?(tnEwOZqtv2KcYw z!9M&;9xV8ucrd`<$%6&|Z9JI3f5d|Y|KP#Gf6aqA|64qmgvHgk%6aK-2@&21U82o|FxXE#7}9qhjQ3YO7&?Ln+lc-L4<`Osc(9?rod?VM z2M^Ya;K8JS#e>1o61A~uzN@L-$2;=v;R3J>Q0pYmXNmpoW1`@h43 zwfq(jmj8nXn`8PX9!&AqJXr5H9&GL}@L-ugc(6HwU+`c@e=iT__s=|7z+d3OcmNQa zjYceM2+|nPouOiWqcYZ71dFni04hNcA+mrNF9kqkUE!oC09+J6?=>k}QUKa!CXz?# z9YvpW6xrZTk>;~bVJn7+F_DN_2C#)B6wuK)1bcfG3n+;V#lwPP%7O`(#0Up`neMc( z3Zkn6AXuOl@F7}i8r*pff{yQp7G6gr#7}Tkm*x&5aIDI5E>}<%G3plukOFl$ZV2$q#hY#~;JE5KfpDoP33r1Pgoxk8e&A+Q9i=0?|Mqv+S- z3<88Hi0NxEt{{M54GjlUGq@;7uq#O5h78ydAaq0mSV%7uHWEs~3CDrR>R!M(URwZ! 
zT`}T=NXcCkfT2HFO;ngATVasUcZ@;7ic?5y$T#xDiy#0YgaIFa7s7w`r4WYdj1a_myI z^IZYP0NJ1^u% zok0j;$d^Kx{|_O|8X<&L{}95U--NKF^nVh<_%4MoUR8si31Qkl31P^bwU7%TEKN1L z=9>@(;n|VsL>!VZzJUOWlMS)cq{$evgtq4{AAc+6q&et`bdj$;lpN(Lx z58XII(*WVM|ZSw(lIzj1%>aKP|@nh5y|2Qej*sH*BUW> z6)9x_LAL;4Fh&r<o~Mh>%3=x9-rh0ZWMJFqH=#TSn1}I=OdY7_1QaQ(x8&NNHM| zo<9()5{wioqVIn*5hdBHtq%h5`5UQ4QbKGUyn@B+?J4u!E~0muLd z1tR1+shwisC>7-a5ab;!$$o)fkri6+E zXw()!`-lJ`Mc$XIezEB~wM9B=uY8wvA?W@SfRkDz2`#&*`gx)$03>#rY$IF#dNVSY6L9gs|9GzT^K8!YZYerwP|42sB1Veh6VZ!NMCC zLYQzQ734w)OTHAs(k_KCU4#%Oi1b(qGkF})h!DacDHlSRR8Y^Gl5awoqR~0Fr@|6K z2s6-*&if&R8D9!vq!T>LvjiFlA*>1^gweG1mLi0(4gp-sOCik7sDdB$$mCK8>sa(h z2w|*K*rNvOkD_o9LfFx5FTo!|82LYhu)8FF_b!F7A;tj3pM)^>d9mqv;c#VD_4%== zOCihxA%vkJgfOzv339VbA~uST}C27MbndOIu5*|Xn-u=;-pVL%Dc_&KA=I6?@cAJ!0O2CLH=45(-k zYDj-X2w}+!#K|qxy;>;vav)j*l*-BQ=~bjlAQBBgsFWK!meHlVRIKk zSn4++jMRs1=TZpMUeENr6v6~Ag)sb|31P0IT}Wov4YL00>$#xR(d?Rga8rT$Rc9+{U}*QV4_a zXssRNj8k0*VE{~o5LUDFGBic?n-GRrU6=OOjJp)Vno!cd31M1R+Mh0jupCwBrwbvB zTx|{^grWaK2s<=2dbUm2Z5)MVaUp~yzXNtat}%nFLd-^(z!Wp`5&RL~g|HpNIfM|l z^Fs*ZnEZqg!dzC`#nIW8JGqdJy96dVhBxOJ=J-e<)|^$f8#^FH9YVKiVb@8eIRIjb zdm9@7Dl)UWgAl?16z1&{--IyQo5bVu@%2JT@+Jr&Y+TRMg|Rl~n-E5^Lzprox%}~* zuyd_Z(T62&DsUAcgaOqAeKfxbVRNvLpXV|VLYTC@*O>Ej*GnM`^6@a~hY(hIDTKwT ze;2}}t?{4v^3;3hQXqdIaX|V>2-DoL{5*jW!VE8kFsvs(gs__KHtKIen4}BmE@A9e zZ&Y8!l%bBuK=)T80!xGtCUzl&@viCZSb%38xAq(zwr6WDg)kbk@neJ#mfSY6j1aFhn5x5T>?!p zp{lT&DpVMyccM1vyAVeClMuG^T?jMhfw=dYOneiCBpmQ409f{|ncjWixmBTOz} zA^i?T%HUqGA8~yy)Dz&MCP*x9U?mRWh@w48XuR@l{z^xyTj3$v5R#8&G`aH51@=PLyi~|&oif4UY(hV}y97o8b z9j%@m*lJFKjm?U0I3gD$2^bCl&PDnGG6lILa}4^0YzaY5>b%MCg_tP7_-?Cy^T3n< za&;;Qz;-l<5b&V?^LhUYV#AI|RQSp@$O6VYW^Bd7qSfR?`2nr`3|dHJ>!|$fCi8HGb8LZqpuqF9#zThL zvkM;Vz4PNulu1zAzTIKj7#F)`e4m!7EdU?m8s zaHU-by}p{KHLAl!tam5qDs+pb+p;fz!O*e}p+_Te+{wma35cT)m1}t*tpLtaai$Mh zIfTQ300&7L1P|txZuzFTf-y?DoaN$&s{lF1ndy5WlxCgNfJl22m9>0#!^so6Nm_33iiH|Mk6ad9Kxls%`YvL=qm&J zXL(Ew%3n92Eqp0{7I*8VxWky!k=>iJ2-&z)uiava4cee z1OVglD#Ctl0mq$)Lls|-XhaRn!q6O(Bh;J6x#gg&|QK>Ez=b zI$t#tHFvgfUx~O(S=mG*%eyL#v`Bn28(4eZLHd%vl^IqaN9L@@*YU8;%u1BikUHRf ziNRCbxE9edEXOL5-cDv;T8|b*VizFQ>6)0}*x3iVa;ceC~D1Ggh zm*B@hQ+;|>^-vLhBQU~?cULm@(1D7C%s-d!cK;R8Ms&v8$x;t8H4e) z$bXv$qek^*7X8kHiRIjs3cTdOxUc6hzuO@Ni%uR zv0h$n4-xfQ#L{Q@b39n+hdkXU?W5y@&NoNgt)q76+W6!JeD~%O#RXpGs&D7fqy&d# z?F`^iL-0#$6Wcj?`Fsure(+$E)7#}}Zz6#9?k5T4C>J~!x6`8yucfOMRaG$@j<1w- zXET#g2kW>WyVXo>I^C%3@8Nz_Fgd-u{TO%aLF`sXGS1aa2kN-+hpzjM!ss)f9`|ZL z+%VDc#Nu?Y$mR7kq910A1z8$B3Sh|Mb!F+q~M{Dw$4&QML zKL(Vw@a~dd>(qshdf&WL_gu7ry>Nod0QK8itw$vOTVG@r@N_^@pFXwdZ-F;&HnL z7KyoYu#LMt#$hVSX_n+-iXNNcW&wKqY5K;dlmsW{?U^MdPwJ>Nq7r&~q`g0x zKtP-L?&l+KH&=4s&m+ArVCbwl@@`E>@L;7PVs*W|MG8^1Y@8IKTfAY#Uj*At?pKC= z@lD|0r}{Vus+q1IklxA-38U{W#WBXJ#4;(m=QUH6@0KYj7(n4|ZsSK5!HeL*xIayH z8=IWq2FEM(UV9cB%-@+`@$UVe9d(%pKTp_^v*P3{sV|x4g6Bg0j&-K^uNgknv>+}b zyFH#gf;A1zYKvg%{cLUMie`iGsQ!szkmB?}_}B3`jb0Z$GvT%RO_zc8q(|9cX+i7f zB2#$P2ZeYwWA>B$8K&$4Ndmf0TpRpnc&NVkylt|DNr*0dayG`;N!6oL<9ZRkPuEbK4!%Q>V1aFnj#I zq*b+!=j$$E!l$M7kfXk^J?BQLZ#)>!bDJv9ub&&v`)ur=pDn(gb`VrJM@|Rfg(|nPmEyQVVx4l(Ui>1~oL%7rUNGlnUc5~dRj}c0n zA{MYo702HL%u7M=V9D5R`Xjc1JuUJgWiJFgkSS5&y=5>K3$H;7WagHp z`-HUV1S4>9cO?lPM3KxXiTZ1jZ>MD3Nj^QIqcI}6-W`-{flQg?Mfq8S!oT^$95R`O zKCS$MTOxuiGpgr}8d9XUD@)4lhEefD`KP>iJYXc@@TJ-a7MB1Pel*PbL+`x;(ftGPh>Taj+bQv&L(u*$lK6x+lq0ek4h~X-(F^DNm zV7ZISh2iNP2K#hLp0B=C(TleJOEwlu8GzKM-OVDm+v}B~(<6%+9gKBFjOG1gjKN-? 
z*a)7ts$Jhi5^oK&S!Gna6;{3rp9r4k^@kIxFWJo*KivV`SZ))_Oxw0mwhd0ZX=hUI zvSf2=EV--&6!H@YZsHD?f(bFwOD;(&3sbgp63HY4dr6c_ zQA~M>mxUTo`>iG8zKAx_!elmF{-F06d1Gu^xC}}XebX?o*>F_;y2p7ibGMB7akIHq zbGvJkXt){4D>KVmFoi${`gACjfuB;ZuV9(!x`~F=Hn#W<^brIoNtbSuFGJ|sBBEp20$k#O#V|k8L8g{(-R-joAX(#VOf=AS(+(D7oG&%xUgiGH2(#Y#y0qg%H9L z%$}AkH}S*Al#K}>a5h#`jw%3S8O(;L`F3@dgt9R>&Xg;3o<$TWu<{fAd`+0)S2W{sLd3?dp_Epxf zV^-TnEw;DFh)lg0og8k%wHFInPyJfL=8sRh8u;$ zS10D0i7F-WmEi-4*|+aCi*{n;`On(|GSrV{@Ik7gl>o`>QtD1aua?&a4jH!(EhzxG z$=mWT%&cOqtYU5{V|uq=Wu<{VB*^1JCQWbbSAW8PT}-!HodII zMPy-%aD}t*ladAucz@25IJM3*!J`ij6zGt&arLsd(j+lts@subh*%RzQnl%56b6{I zTQ$F&*l_IFc-g}Uxd;1{0THwSLe|YmB6q0E+uwtih`(^a3J!US|I z43hw5x_Y;FSv5_><@XmN4-A^*e=>v(`o3DoNjir4J@xuz$?8YX6ljH>5*kxPz!pNy z7R+wB&A*l~gB1OF)qV-@d^L>JA4sVMCp^`+SJcTUhrZ0=Qq8dd@}VQm$4Mw>SLy{7 zg|4%C&~+U*bGumFkhZPm&R7b)%~)(<%V{y6aBjj^885K|Y;2Mz5envj->y8CS!qLC z<=(X6w+RK0a1p^p46Q5oO$ z$u}p}%BG(ZcgG+vX}xDPf1xc-kdLRIWP(#Rk?sgKp>7(t1J#%(TF;ckahUq@4!{Uu zB~e+NU*Ak%S3qpSR^P&djI#8$`TEf3H+{uIR8EW8KnrGmD?Nia;y`HxQW9m`88p?* zK_to=o8>u4hc{O3G_Gt#%~RXG1?3HJ$EVaI#j(FGs%P(2F0@hH7HJ(4ZT01MF}0lo zK&brY>vIy&1~xrSV(OeQ|<5<^Xw73x}mQ!Az7Pt zx(m%Yh0X3<<78l-b#Bvv&>lww19lZ3(u&N;uRU0ZLr|bX(eB$!w+pg{eRAR{#Ly;Y z8W(1aB!%sCzVj_&kJu0`#8j|7kmeapP+0fecCjjysI*Ih7j6dH5!ts3ximW%cp=UU zw=zkjBYUux9owth>;;2sNF`aZNhl)Sn)$a`fkC;LqfEf~G0`RW&?>n2g7 zMguRC)-baGjFpn2-7?@xN%o`jW>Livo29GmZ`mDG`ndKQsjjKJ;unK^eUFfc*Esps zQ239zo5g}YqBngKXLvBW3qhTiY|H*c%HW8J4#gmRPsgE8$>U^$?Fc?;1{()3@;DMo z6f?cIi*_hsxnFei3yiR#NOT}dEMx>-bN{VLSn0N}6J>j3pdWVJOL4~dk@$N~!)@w| z@)D4xw^~cKo65y{z2-RWk3XZm{LohKebsbC{w{;#azS%O?p0};NQEw6&i4SWk0qbT z_}7X`=4eZ8G+6l5SjtBl1+pY>Jl{hIYHo_eH?BxI?w8zyTi=gKqmtcYTiEATxf$~y zfO4gG?K&RmaHE96O~C_Sp8oSNG4}E=kFJjpv}+ivMshuB2>*=er)xS3;68e1Bbm24 z-z=-p`GGe|b5dTcrQ8a^gMEBF)yZS4ZtC@^9KnM*qQ9~qlE=J);K6PxcrZaf1_c@GW{)mO90QFL-;W$3!MaZnJeXjR@uOR#ZK@Uj&+=gUCk5^@tTss8 z2FT^lYe0j@VjXSF-cX`MM5-#3y~DQ>c3xA{HOrR`C31kW8^9fOYsla-Z*UTNUI_8_L49_z7hw$!Fl?w zt9GD)nk(^ySx~y~q`7C$qo*BhXwQo_v8HbYO%Ng@-Yq|Be`c4AEVwCS)6vW)+bm}C z>`A!H=$mKnv-~FZVDX7!qWx{-Z^VTBpFQcQlYofLOrp&Wp^ce~33b5Ylj|f%zYZYi zu0s*7N;om8Ory+M)ygTKB9z|cT)HRB-7fBAb29KJ9`;+H>Sk$$ejRT1ae2l7UND#J zR`Q6>Nl1IPhe-tEl^cQVMLy)I9j}K{t+Fd&)VF$RZl&);w2|UL%!hn)B)u{)V z*P4$CbNQ^$QHk7Xeu;V{8~qax7H<}tz`4`ny995I5Z}i4j`Ux3T*S0-V7z%JV2Q4i zqDUY}3#TUCbXSN|DWVCko$^8l(V}-pMk7=`)j8~6pf|KuOA#;T<#Z={oujQr(g4PaUL6q&GR$tNwbR) zY@LU>dSo*`7vAR&UavDOrp+ss$7)&2tDtr-x=W-R==CCP!M&_cO+uL+p4knfBaz@f_*>-Ee*} zTjeMYTnPbsJQlR(NK#Q%CstOU>%{{hr-S3fItI!2Bg z@kB?9{J!&G`lBm;B!Tk0-hwT+*^sW}Rz?vJ(r3*^hcZ5M^`|(NYGXfM44irywWfZp z$4cOniVBf&ME+4GhdR>3S>XYpAvSq)saEVW$-AuH6=8&qbVPB1EH^UN@;#Si)E?}p z(Zxik-yVo!JYmg}Sbv}VfnV0EAt#)yg%S0Q#o&$mAms4drK&Es=5^B}NvmCm_|Ujm zEaQyS@95oRIUy^dE36^pey*+ga)u~NsK=7^&Vc4?J1@#lwCz1t&rO2`e7s1+O-B{j zU3}`nOAiJ_`=yJ^csYAJX*A?|?!d<-iI^CjcXU|=Vw-(3UZmzLT?c6i7gHjAEykx{ zlZ8*b$mC#^4$r@?&Xx@AtHiO>sx#sb~W(3*{g0}VBv9yD>o#I84-7j{UtFH|d zY7$6?;Zu26anH-WsgJxhRez;^H|C?q#?Z~jkB~@iPNQfYL+`l-Ttjskx4huUtwIubIf0?-Rf@_9-YVY$d{pb>We~u3{I6aFrwaIbr!)DTit6T8;T#@&lIx7 zf7Xoz_MnAD@8+-QdRJo~#G%tq82GXDG2m1ne%Y7jv1HTFQ)s&^kjXqWu%GY1al;Xf zv12JTweWc&6GfM-_-Pw+35vFQkQd8~d9SOn3ABB2pRZXJ;i@Sck|ltek~JGQm|N1Q z>K&w{Xa~F*cYxrAWgw#+9n3Ag5m(AWnp_pQCB2J0C$g)kh~^poZ2y1(^~;wgvYBOp z9XQQ#QMp9X*@l2DCHewDrr`jNY@6SVy0FvEl^z_1Xu{5*%s0254A_$8BV)g5^kyN) z&Auz&ic^z=kwe4+9#YBW%f^Ad$(?)yGg#EUe*bt7l{ieZR(eTrO@S}YCfd^ix;#>d z-nzGmcU7{9W-uHoSk`avfg(jt%;MjPMe#!F=va#Kg%o*i zKEl!>9fbl{49Rlo77!{JCf9;mF!iXEu(VN>OweV2Y-o zEgMoV8(?P6jWo{D?f}v3Q~Dl5-1_+l$?AW_fI2$cg}v0IwURF94*S_NZfMv;`D1+y_&j<#ukD>lB0+`=e~p zybNHIt{-{5h7s$$_D9q?EFjEg7EL^cPqRurW 
zt4Ek&@Xy#i{~vqz85Cu<^$VONIfo`g6GT8ja!w5hNJddqKyr|r1XNN3l0lM4PLd^q zNS3C_IcKoRQAC0w0(~3MIXdGwGv}Q5t$VBP_h)~3YS*sXYp?bHQ3RAy;BgM`fjZ#` z*(wu6omsGl(7ZAks2orRxUbAN8%s?^}K`>Z1)?|&8P7NerUyurB z!s-Cskey+;3bAZiTWHBH_B?7>^kkx=i^{cI41NJfFdOfNKnF4EW+cTB49`Kodb0 zth5Ts$g>lKVtk)lS!@xc!U__zP-34*$X5j>C=@dTR2mD0z zo8CkSX|6F615K4=9*=%C3Rp2GvdMIzaIa>S} znI%X1z!r>g8<{v*n*O?hBqR-QyO(7?9TNzm;u7f!9?fEUu&A)T%P zT_X%&hJVR6UHzD0*%opkQwt}~8owQhgkqRJ!5aj629zHWt`FxR-&`JD!goQXhUGDC zUl2%WqTXajWaX=_o2yzFicEVyXrE(PS73mDPABQVP(o^^x1PiQ>M4$V0K!&MEDzR( zD`ugA=8|G`?<-R~J7TDR08o+5HbclK8&&NLDL2zF4-D`Lk!j|7dPf-u6B_Rk8PeV+ zBnjQQ6y=1k$R&bm2vD1&Y);1lrt5z;P=5u|kN}CIrsKJQ)tP#Y#?rAU()Ee<{SVVe z4}h4P1;TSTx0F62)Gclt6e8URl!GJ(e(zZfb|^x;ckN+-USJ_`ISkKxK>` zaU!}Plcpl?+Rl+S4{i9UOIc-5`M>}uV2pc)bO|t0v&vqxIY(au)o5vlSHrHT$QZzw z`}c2NM36Iaqzj4`P?{_HdTA$)?M|jZQk9j zG5Ve7^N!_SA;?6V-kr>xX4^Z;>ghrXAL}yb0F4HYOIrZn(g92g2JGAPT&iQS3{SHR z0d8e7OSZI`=~QSo?l04+@CuPh9@6m}UdCelh&j>wQTXWx!w#4;RTkvqHi%{(L{&G& zl7UP0LX$qQZ^#|2fmo|bOdLBNq)!=(!+4M}ZX}V8Y0!@^S$#bb$O?%Ic}TQx?-W9N zlWNicd-(ciqh%Y9Yozc8B$^*T8DAv-6r6^cfJCwfVM9%KcmuKioJO$?8l^BK#V(Ab zr%WUQB`WinQGy&fuRJ?Wr$w)H5U?Mg+-GSvv|#{|;!KcC0ttm9RIgFs#v&*A+DzD{ zKQq3QZg}Uha{R$MGJOLw<@0JfXU%?~==?f^9DYwI&s}={8v1z6=SORx2(SbA-!oyU^VLm zsShl%h8WQX*n#B42dcKo+Zq@yl(+2+DAPL$C!dkuEoe(9FwCafG?)mFBiKu)4FTcY z8X&QF1m2$`-9(~?-`?i1Db6{D6wXo2_dasuRLA@PAz4OBsg4=7F9%o|QKHO|NfHrJ z7~&rmvhoc?MGs2w2oE&Y6o^fgONnVzFg*X#9f^aI*Y;s9_zunLBz|UOAjF&F(_uv) z*}R+>&IPe~wRAjuJR)vGJ%Y!s=0=p#7#9C-R7`Z7c(t+=N(NgSU1!U756lmPE9}Avz|DwD94v?*e6V-@6 z?lO{)hJ^xJbVtSwhLIB2L85g#o{KZEYBC}Yu?U(@!lsTynockZJy)U7Zd^z0bk`f2 zZm4c|TT*w|bax+3&ya4m89N{>E8D=eY?7SpOX5%O!vWQ`cd`z zF_QanX8H+e2T1e=$dd=CX9nnL2buH+FC`CNo*BGCJH)3qB$PZPHZvqeJ1na=tdKmc zJTt6LJ90yBL_2vzXJ$m7cGOsJ)I53AYG%}qcFaj{%r$xJ@ywVP?YOVrctG-a$jtat z+KFhr3HbBmiNyMVY2-;Pl*ybK`fQ0)A*?ic@|DD7;S6o2$5egt)EkfK<8;ad@N|pb zbeG4>h{RMi?Q~xKO!@Kjki^uA-t60%8EEp`-TJpL>Sq_~r}mF$v1Z>E&%iKF-oBB5 zeVCagIGH1(o5MMId*NhiW^OtWGT)UvLvk`t;<-SZGB=RCAknbM<~h%B4<_q5m7E-O z6=|6q!CE3MU-d>mG9fVn8djf5u03y9PNds#xVQ0=ZrQ70v-o6f;NJSkJ+zv8 ztMH(P&@(zA>I47l=sQA@A14=h~BJ+ z(d}Gqq;EgjejvF)mbzX~w>*?i*(CYiP;&Qk$}T)*n@Q@O)SInqZ$C`Y?TC5pRyXW? 
zq1zSk+PR^>1DX8@(nph{|7dlxHyO*>CI z9OQ>ae%((|gYjyVyb6oTnkKDcB-s_Sx(NqZrym!VsGo>N_CU%-FI(KZ8s_1VSjicF zmgN_rNgUD3kM8U<_xZ87UbkTfy~Y=((%hUtNQ${(*p5Y!seFfK*T-JJpH%^y zw>#b%52=xOlH9>wr;z--X&n7~)mibIl=t&V!{o zZQ)7Q9eh~)#)Hi#cC6z|H5`53`^JN&R{QKYx7v*# z08>g82pEs{lL*N>Bj@~Su>+VR6a(#BAF$AwC_cS)uR4R=3f6A={GKP4S)AT_plQ4r zHii+`dOONef?^3bR3F64WK~!>dq@>u zven5L=(x$?hlx+gb5nmkiNS~Po-h;z;LkB}DXRB~5e30^!(~{b88z0QV0LI&y^9dN z&?$qz%+H8ItQQDmues;^*r;Mnaw@-edneeg0YV8uD>^yBv2R47&VAW{zJS4K&fx}P z5Q=w$G-0*b;^^}Fb3?dN70dz`i)qcMr_&oBd{J%#J_6Z1>#(qF&R7@k=pp97eI!#y z{rr|+;wL-L`O}9B zhi`EpcrYKS2EBwP5vHch#CTup(Tu>7w&lkj$w8QogmC|NQ4F%x7_I0}Bp)R$$0c0D z=ZhQ4WVo~%TGS@^rQ?(DJp+cOpa_CK`ne$br|vtQzGK~t{L+{ndyk3KJ)D8psir9} zB^fkV#;em7;=^!cAE=9FM3jmt-);&eE9!@9QS?>OtlD%0TU)=NHuP&za&`!p zBZVaB3glWdGg11kzo3U>wo79>p7LNKd+iFc?2eJ49L$W=2p)`gCo-xa^CEI}n=<=# z;e=fp+dKiCWrow(EIw6mHj zO`3ZFYhB&9moIRV6hZ|F8VI!<<`W}@T!y!aD41x;q;()s^O2K^I(a(NFkZ~NrX6z~ zr>G)p_EBE2(RwGtU`6IY9UFnU==EM$7I zixWCmI*2lXS6d0j1bkDr6actPON5n^SBt|@3yekx?(`Vms&LdSwD!pNn5EJd9o$cM zDI~a8&Z^CI3x&Xn3@kt{H|o47P^iCj*{BQq_HF#il7J<3BXTi?>w`!ylipe5>luQI zf&qc;_PLmcRP;^uEP!?-3vJbqy9c?WFAB+Q?lX{1ZFR@KDxzJn5sB8>+N`Y1Xh9_q zh)JFMc&oT%`X~J;&|3Ej&z9CS2x7hnmhhbh|r=p*%j?*Q_^Jq}dt(PAn%eC+fb__@JRYD5Sug zdxO8m#hot+xqfBuV|BNid_~$*rByMwplFX6VD)Ns=$gvo`*>haXYHwp`jSIM+74JZ zKwLswL&|wL3joQ}^w7E)k{}v+fB&VS>fJlCdm^s{1&T&4E8x%`h-5Io^6}D}(s(~9 zQtz@~d`&0FAYi0FdPBh9Ri#te3c-Vg5A4Q%*rk18o!XlxtG(& z`O@LQf{Rco3CsHjIHj{olK6E99t@b^@~BUww7mX{mFFAkx9j+BvJ7(eD-=h=nB?5N zawQX|JQ&(xCA3y}kTdfI4pb07QTRp6IS=;miF!&)aPcV*_R!&!2SY{hVCLsM7{zM@ z4+bp@A-UnPe#(PgLhxW}XFOOlf(LVT{}AfrzM-o?5STuDl}^4@*e~)bpIMr736ADw z8{3zOp(2+n2~L}eLfuov_zx_X2>-$~_zWxye_|n#x!Y2b^a68bv%1O`{KS%1%J62p+5_ zFZ`nqo=ntJF;(TKEd5iGf&0D3i#09rvyabtFg*kh7P1%M@u7*%KX~V%ug9Sr{>DBV z`NZ7I=jS|_l>R9XrX(+={6QngQOk>w9IpctwxFDf`U5 zG1{pH%pY%{F|7GxpkhA84YXa@_TCcG<^ZHV5rPMcyMu=AymiS}eB4(g%ZX#vM`6{So7GSGHy#XA zSw7J3=J*38)+cwq@n9(OPj0dLQw2QHxA@9~m5}=zulmQ|^0#6QIOoBDBmQ=|0knVs z*VQu~jN-||s(>ePdLFESL1#SJ)!W|wm_c#=fnkb4r#u+zT|bJT#5m39z@StF50)F4 z&>oaEuA0IcoQKel-25{w;K9WQ^11E7

^()vpLMr26*NGK-MMj z#)Gk1G%1Gm93Yy1&Ui3kg>D277We}X=6}wEwV&}|7GHU=*fSms`J4yCJmbOKzVcv} zXFOOLf(Hxw%7ZzXVBy4H&^zbBxWDmW2Z0D4Y@QW``hL)zGak(8oCoW_dd`E%n4Ixo z*UosbhfEmX@nE`VJQ(Pl2dh5g!Pvg>U=vrr@nA?_c`%@nA?l z@L-1LJXpuoA9%2bXFS-$uRNG6f(H}%jt9Gc#)Gk)@?Zer1erhZV2)=zSS^AFWAl7X z`FkGh{uvKeiQvJo9(QB?j0a;o<-ropcrcmod9aBy9;^mVhv3015j+^>4?NhtGafAO zlm~PADGw%%;K6FncrXM2mY;$D=?5N67QutDpYdRal-%ntHh#*3HC{dC!Seqd9<1^= z9<1hHJXqRqJlNfT@nA2`c(Cj<9?a`E9!%*EJeY`MEP@B?3`Fo?(Jv7^SVC5D@XN&f zGak(C8xQsyKqappr_`H2kZ3jRw{J0EF51ibPy_>%0J`5@*NR8*xVTp zwt?Wmg2PUEFzizvtoMuuGeGcQN};Db7>@tsdNCLFDGwH}SaLxp{gekwJmtZngL$Md zOUXApZ?vc)crc1H9&FxRcO1ckVc&g?;K4RLQOF9xUmc2Qxe6!8*n&ZY0_#Ilz@P9_N6pe6_k>>G9uW)VO(jBZrh3g?%P3ru>gJBK)iw9f!jR#ZyjtA2@4XghQK>%%JU%W1~@19Xjq6-qWql?(G&w z9c>@l;REfBnw=XC@J_3;b_5TWUD}yD-MLoU(Gl4J*X-(a=(0X)782@OAnGch=|mZ9 z&+DdZIqE7d=uR)~Yz^(YW8IZ})a_BwHQ3#Cyxbis-VrR`je5O(qM&n%qx)H1_mT=# z`*aU#R2Q_g3%RVfd7!tAqqhmUcT&8wEU9-^yknoH*QUE~8Mps!Qg;Gw&$e~XmTrFw zQD3%r--0d>CAkkRsu@1r_dsR<$)?xTeIPcezp=C*Wv6$jqOF;ums+*`i+F!v_kBW{ zUV{2Twy*)@S_{L{Go;Odk>L&{aGtw(db8#|}|d;H&Oo zF^M4=o4#uvaHjm15BQBnION(h6eZsxyyzzY=DZKBPlAwJQ;O@WurRpMiuJO zwF`#{iAVMEBp!GG72l1R(vI1^8>1N=6)ux?vl;W~8uic{@hTke5*ybg9uL8rh~%7z zR-JfeGx0oZBEE1Uv1cOr-9#$hWIE^M3)RUio5`H8$-KhJmpzk(?e38ZyLfm4ON|Pv6*fQo9-x_?&_KDc{kmMH#5LFGo(5*Vly)q zHZxH;Gu1OQ^KRxX-s~Lb?1Jj-lFjT&*z8*2>_*S*yLX84)!Pr8Z+BJS?%TZm6!!LW z;oC1gZ;#)>-@@@=$XqZ~HP{7P7)Ce@s|beE3&Y!j5#Y}eam|sa&5_y8k%!Mw7R^!j z&e3ko(c{lEaLqHR&0n;gzZ5>tRy2RPcb;=={tEsA57z>p+Jb=Xg3$lx!4}lEmTb3H z!nf9n{%_{N2*2`R+E4fI6z_kg{r^aq7vGS#LKxUh%ST|yS4G=TIQohE))UFQ{^Iuj zg<$`u!vQ}*!rlkTA)GJ-681h!X)OG|#Mw^%HYDv&ZXCJ zYgMsn)la*s3zusWi)#lz)<$R6MfcU0{x2h8A4Z{@V;!Hz8h6IqwkEm`C!pPv(21$H zJ5&E33G@EH3JGJ*|2-0BOS%g4Md~ew`H|idCZr%V6HfC3xU8moQlQ+HC1#FQ{O=)Q zbMizt?%iLGZ|PBF^3%##TTZNcR%+(YRe`mV>{gSlN~G~|O$M7+ni3epn+cOi^_Yc3 z2+zkbhw)R~X9|;K9AQ#@f&OX%hy8a*m|3QDfb`^cgrD3Y5QoiW%2j!02&?ggLeS1rlaD&0tE%L>a;%<7oRG5=O}G3Pj11KA7G)MZ%)5@r0cs zVe!qo9khuW`RWKHEHqd+a+;`6>VuZMGEe+>NZ6NCB#b18+Upz%b65Bg2~%guweSrD zjnurQtKN7+p}_)@-EXwrr_|TEVwCZx1Y2 zUOt@8!0PNu!L7t#_2592&i$uIm}`Jt0jf0T&{T26b01!lUn5~3{vi^^bB2V$PmwTL z!WdybW;>J$l6!+SoJJbVOt)%&hUuUQ#_^#C2{v8{1W2G`VW4*;Z8 zv>>m}2LsV$DDihuou4GmPnmG&DVhwk4K=k=}-R=5@!8N zBrM^78wor9M@U#WeqpnYuNw^AV=@0i%uBkR&iv737jxQpi?2wSF^Q{vyzHT7TE;a( zX;q6DNtwka(t{?uV9^3?h1%QAv}!>39ddRb7>dyNJ14Y`k$;4QNg|N27_cm0@3O?_ z*q~Q;Pk)r-1!+2HjN~LEHDQcVHLPXy}ov<_AXl* zIwAfts1v0c2FN3hVAJo}xOce)R~XZ!#KXP{;~1IjdnD}e-$%mM{aN&xl{PXkX$NsM zqwbS(ZnVFaU$%*S{C;?rPzn>uhVmis{g-*oiFf}V687a32@4Uc0*BIFYUd3t688Yk z_s1k$<#AUD_MCSBVkS$Pj>MppJ19W6Ed}3VT%w|T0oz) zm$7Ie^F#$2G~S%>MGf>uT|~XJv?FRNCn3(HzUnC3iRu~%L&^OS35)*|BrNeaBn*ar z3`4nQ|0pfmO@r9$E-T4$3w!O~K*CbfOcK8#VP3yR!kqgVBbsFV1Nlsm?u;0VNvsm^ zkX?R5e0fzyWX3T_g7k9x4*kh;kb0H>LdJzi1QJI6*GO3EITBX(H;}MT7+%}|00~R| z9TIl%0}^IxfBBI6F&c8{g?6Wfp*w_El`HsgxPYjUzeB>JPmwUMQzT3lgRne|JC^L& z866X8=3KC|7z5w+T(B;*j+_JZ_q%ijgE=@DPnZYs+v9nSQvRu8(dAG|ourcvn zDD_wg-y=QC#AZ`#+6@dH>gtuq*@;mWJMlgl>i;ZCg_33ae9}#7_MV z2{S{w;7XVk2p7u|vAq2}J(4xL6cyQB*VoA%BVxOS544SYAkAW+Es@dnN?)LqOGEfT z2fduIu#Nc91H%TfCB%QJ#rzN;$7*te)dX8N^9Bj4?F4(({?CxGM;IKPsKMt**gASK z%=21A04L#p6A248K<5y}S&=qot7}rXtWvLGY{O{?qRxtGC;#lyh6Xb2A5Ovf%QDATmAmCdr1Ydh0mtaTEn$s2ZYMg!{Pj$giwztP{}!hoM%T^*K&u6+?7(Ha{0 zM804ulII7R2sWP!{ zWp~Hn)i_1MvVM((MgA!g7WoYcvxK3mvtvHouhZ~BRRy+b+!d{HCqF0&#)tFd5f~1* z@C=a}4v~kkcz>8nUxH+zo_I-u$i>{H z*g!-L&SD8tu{O21f1Ls>(He6^amvD zBbMbak+9TrBrNhP5*GhEB+Ln;)CZG$gCw(%=$h3n6=lNIUn5};Fns#o6ig8hq!61Y zR-zL990_yPc%>t|@oJ0C6sByX=#Sw5;ncw03J$kv_7Z&DX0a`q4BggUUI8lwrxb#LKaM{|t54ZeZuYX8AsAUeY|;H3;iIAUMh z!A??~(CrPO!g5@1YDcXOVV@V_NGibZeq7+SYyTz^=IZj9>2^O1f 
z(qJTiN`oc*2WT*=p+>6K8e)C~dyR}2YkTbD=EnLCaUt*t7r2?LUEmopKqr_n8V*=d z1*|C25(eVgbhrt2$e0pc(NV>ArzuJ<=2YZM(?qnVA1Pf+Jdc&pGN!w-J?SpKee zcLNvkFaZLb``k-1frJ3xd{N1#ywf`uaLU!@<*lJ}uC9wqPxF&HiNGzpwA9q)BS?Ue zWU?g!(i9Xj+*+qQlD8zmWN8Rm41xff;5G(m5>krg0>^PB$N&KDl3+dnfcK6vo+6k| zF@aqSz{3Qfnj{9)1kzcwbD3%aK%q=K-TB`5kGPd=!`nPFdlQ2$y7D(KMd4g9aaW2^H;jusGf=&4iOyBv{1w;Mnf)V|2 z!TA5s1q=20%>|>wxnL#ZBfq#{fFCXxI3|JV7Z(im!v*90>4HH%jZxG5i3=7X1e~PY zAWv(Uz_DQ17X%2GB=DqDsbvQtYt5^GE_FaZ;FG#3xN|}ngXGQyn$2yW1(Cjs za2^YiNNJ^-o}|7-#1ND4-bsK{3;||p3D_f+_chjTGa+G5Qzc>yFdE1P5Luat(a-m46$1=T(^~G@G`Dw zzzI?O)i&?QuUwfCU=AksMG(iU(pVr#I%eJ&z-soc8uBF4fb_w69bEK%&aQxreHxWC z1inqTdwd{;>`TY}5T+Ugc@5sJLjs}g*WSLkc>8fnGXQnGv6CH*eM~uC)K%L{v)>*P z5U3)s)%B``VgNuBT7gjH)B;zP0TokVq7@z52mk>+E@2GC(nvbj6(qwULwZ^eVSR+Z!$3(-RO!1ZM~8MIUj)-wXnnb)fzcSyG?X5JAV~wb zKDVr}YmXKIn-5phJ0OT05aP5!q`pBk<_OY$Y1PW;av;5{_^ASri0B`Reg7_Aiaa{_jgy)NR-_)A%Y--Ea z7S5J#0B760p@9IXM(1=w0nP<8T1@l6xnK?b(@_KWez;(%S)sGPcfn-Uq?TyM=V^4W zPW|SBy^ogP{o#VOB(MX1xM1-=U9hB|E|@yb1#1G>>yyRL0cvnA7$EnD3x^_4>4E{~MMvJ!z;P~^9L@#fXf&+DxnK^a;!HnXFvnYs zibN-7KV7hFeDe zE?E2z7fgaT;l-29NggoEj8?o9_*WNf1+)BzE*P!V5zYm>v(>9i!ap-0a(wGGhm}zG z{tEeudSh?m;7iIlK>o#q;&Ot<9zrT60MDR~3XY}C-E`EzxnKz9n|*V?xL_b%`Td7%jm%u?ODoTrl-Rhh^qT>Y@6@1?%u;JE94DHxe}3IAm)nG(LE6i^k$N7Yw>%z2X4s^L)49={|F_<);hA zan<4!=Ypm6&i!z~hz(T$KU}c#Pb1tvT(JBfE?6+Y@TUu=^Q#LMBRU5t@sw~++`_qF zddE-39jSi0U^o^G`-=;fg>%6i-2+#=quv}B{NjQk+)w|+1A=Xp$gJs3!FrXY_Zx{%kAKT~+iJ zCMf&tDWde_dDYI{aM^Sv1ifr4DM*D%HUt1f8UlcXBvd-gppJPt9Rgww*)0zZBmsez zY=yaTCQ$^rQl=H&iYgwVs$e@@%mzai=1!-h7%$O@-a`ryb?~0WdgJhAGH6y1GxUhij+PkeqvHb_$#W2#F#)NP zu8jQw4W?j_&L4#R1m-m#3Il6vXQg@~~dnQ_Jwh0E00(cEY+-(c> zMNYCNqXG*rnMOHKv=>tdrPN*}4`GRp3lirKRW)gJneKA%3@1<;YR+f5CcK%RuPy8!YtR3n!Fu0Z5-NVU9-NIN%P5vm!3Xd= z`qdGm)wn3n_zN_7<@gFydWjs%fk_CKm6nlNWhJ!WXt2{1 zuM8&I6R=QK^R0`N+%3sP!xiPWG-PotEYJ1|$_1IbQ@ms0nRR9UEF#Z;&|pcG94|#) zsH>@KzQ9CU&A(X7V0EjTb$Sy@?uy{!lMF5?cT8PURj97}vikDEtM3kvMLT|_!CdaN zeET3xo|e3e#{D~_rX4P#A9-BWQXo~qO8r9Py!N?Sf!H9LXKOxZK!5}rS@&9GP)483 z3^DYK-TmzN67OEJ+-Q=d`a?>X(RS?}Ze;(!$i2=db)If#XXg!t-UT%ggID%SOQtQ} z&lS(yc=}nJEL}Cmie>_#O{6>rFV!o$U0pmX24Bcukht&Sxy#$r!E5MU5)bbc{@&1~ z&APX}KPu@;I3WOe(O^u;6t-1#S*38Kh?=d@ylO$nl^f5##Kw1jR6g9umPGmOtZkTV zEvBB7+1o6Cn6DiHD_dxGCh=yr{>lNVc}0$OuIVCw+uh$XIim$iZ%_|a(M*=>Re1es zslxTF2@xrU9Hk+2xi9r0ntV_={{gF_fk3^q=n#$WeE}DgGQIQ~)_#!wRJ~ROd#+M$ z1r0=yDIg*!Yz{R+shBvCSvYHbhqe-^q~lNuN%qpAQ^*e!$T@^hLfgOr3|nIxfv8sb+6T8|l3spnp=pdC%YGOs= zIcaZAQYrTW$oB0n5KSERO(cc`&5ST+JL;Mf=i|Yo3|&Npoa)5Rs%&C}2cd2altaD% zN`{GA91WH~=o{ZELo7e<9OJxFDIY}@2zG1@J2Jz0?j4?`BjR+Bz>d)%L zkZ^ae9hwslRgrq95qo1q?!9Q9?eNh3+|()hps3z|O4a>a2b_$KpK8s4AmY1f55u`< z)-~S&$o#f8FS=&Hu#*%baH1t{0^sFXr-4+yT*bf4(@T05NOBUcK_=m=FQsJ~e;jW~ z_WCf)LNy=t=V`ExSq*LVUuiIr9qr3yKWQ+YB@O25NkfZsiPG#*rb4R5M!ZkBfbcXE z7YRJ&s~=X1&ocWi+l>h*OaVwPb%c>iHAh@Bili0*szE!rc{o-O2D8yUp(6aP8)7MUcnz?8rrzSxbKLI7cE)BI=F~0tJdDG{hSNSqxGXl5eskxrDs1 z7Bob*RsQsq&uFMzEB72WvV!u?m^PTooG{K9k7B}C^P*HgoQ;kvkTf_ZTA_*J$sqvz z!UOK`L_OR#+*fHjcVF}?igJ>nd7vyY)cHI=gLG{&W-m?PmFQ3qFvz^-%v%`ch=RUUmHeMIWISZOPCP?SlMG5BQFe7brC z$zu$=y0r~s%W3~XgOxqpt|j#rrpzabp#FdC-Dg--U79caDsm`7QRG+!NRS*P=PXGO zRK!3Kk(^Obz$#*qqev7AkSIu$j1(Y(1j&-2K*>2MlIkt>c{>;Gfx2MtEtt#&N-YqvV-9XBz2Q&t06Gk%?V6RGn8MD3nwRg#8HJ~cg* zyFJk04ZSIyU1(N4jHAJF!#**d*Vx5r6W)JY!S;p!4f_|Q6WRXY?3F~V7K}lfWmTi} zt!%cMU1I{@H>ys6M+ne}oRNGY-A4#^$t`FHv?;1iLx{!EU@qytNc}lDe^u9`WY>gf zqSOP!`p46K_{ou-Uqm1E+tqW~&F*Y0DPg~t)|_HIvu{7Y!Ja2ozd&){houQ-=fBhcP-dD^I2)9-BsnQ~5h z;icdH#>94cYm2_S@g=6L{Cc&)PN;IzOm4>4z~ftwgp& z&%0J>OOKZO?l~z2Djf_nDOMZ274cIsIV^j6R%l6dvsUVk?^eu2Pb#HtUShy(`TOVh 
zpNgDf@H02YtihAT6F1mskW?`YpH-DFI?qA%~)#={*gWfOSTl?}eybN&u%LBkFU0|zFX>Q2}N zN%bO>FsUnb#xyg%;yVT`&4EgA^eZtGV@w3F&4MWg120k&u(8}vF`)|>UYVw*;znF@ zMmOV7brg#IIvHW71;-ZVzHv}fUW8N=>|zIuqXr>kMyV91`W48@ofbygGRM8pBYw$% zdzNVZI!YKH09NFuNvge8hQj6PiwM^qlzaYM}rA4-r z<2nwMoz&9#prvL0fdd!xQMc%|LQXfrrf9CK-`}Hqwr~7krgGwPN#T-Pis>8(l}XOivL@I55UPyF9RoJSNxW`hB7FxkyC&+hC>R-K0 zdO7g^4@YY4zVa)>nNoEYCtxadz6>LevuV(KCv2O7TfVYSzLZZdZ*RMub-aB-zND9p z45u3NING~9&k2ctD~FOEoYy~(&tXc;csi`w*(oQ9=6$7JK#LG5zkfFXkJCnNc_UPX z#^$arjgw8Fl)^F*(X!ja57KA3O0R-E#Fxp|i0mFO`QTiz7q<2}%MKUyA;@S+9w1}3 z+SSSgat$^44lOah`%jL8BZT5XnUi$ z=f>B2)El1!Mp{h~A7e=$?9lKq*@tbHKkz&qg|pcW$W5!g-vJ)=wqHZ3QM_tL7E>r0A;<}Ubd1sA zJohhkne^_cFiv(}A$qAypgP|XM1E=l0eq2~;D5nvaAnpo>kT@KsMy7lKi|VX|EhgQ z9>7OXeNQOmLg}P9T$OBGKcF+T;26J9>m!{%K0E@&F2nbbra(6HzIn`j7RmSKN_pNb zdLelEma{Jl)befc9^AicYxM>Z)Y*9lz`!<9B@k+oplDF1$^*0NuWjAk1h(Q5G3 zm1cB-pKVwSZiw9ql`;6KM+LI77RAZfm8U#|bgFzhQjs&$y~1_&>o#~Cb%}WXctQTg zjFM^tEVK&(wRZ9z*h*OqUh6045FJWs$X#s>1-)ul_IM>#XWz`M-PDk( zfBxVSw*?upo&NNOOOu-aLwnA6njrKm|ISdvF@F7C(TB=cJ@n=@pn~Hgb<0pNXkJwYLMwdHqzSZ`x>Ot7IrsYlG&xd*xnK*W+3PaQeskvh4IU`HUZ zIiif-Jzj~f40BbloW3-HZcV_?evh!EWH#z^r`G2Nh(b5SNeBPN5E13+b9dUZ^n8Bm z?y{8}0jKHc)ukOF0N7~+>vohDQFstUo&jO9!K0cIXcot9zc1 zI=J(q>?%#^I3w|%!#3X84FwInL|Ex0Yy4!nn3Lo*II>&{M~2Os#~FhYO7S^Ec8SxS zoCq9gYGM}iMMth~@85Oy%`RKkeXw+_zP>QA-s!TUR`D=HU7AFj$0<-MN}C*Ngr!&g zl90fUO1PN1#i89p{YhmW=}G}7y_mNx&_qP`(dM)>@fhb`nbRfqs~(A#Iokw+=HcAF zqA%%HofIoXo?fF4zIIftl0oB42Hh7Tl@LqUJx=}rb{2FLi&DuD&R>*o{mh zki4T^ffhlYouJ%{RX6P>5_V-RIZr+q7pEHE#Y4QOWh;OQO^E?tK zeD0+zSwa4OSYb5L%y&oQ7Wr4n5*O|;cRqWcE|vOK{Df|w;tB!|ui?b_ygnvu*_Cu8lg#?XFL^tU z;5!vax!`E9_B*Hjou-^CF2uDs8f@V3t5de9J1>p~t3~=Qdq_R-^EMvZ$lVXQKLnNb ze1N0DYFP{%UQVj&zo2|E3OC&Q95QQ&#gl&Y8b^bDlm50rucN-|pq{?D{G@Z)FQMki zHjV~+R4wYI01iuU7%m8P=;L#Fd{P^Tqruuq)o^FOtgp)O3QwvIO>2o}Gi|$Qs>Yh~1a_o;!3#}04HT+t zM8|R{B%RRK0i`;gaO?yscd6H&1c{A5foTM_g`qn*6y7Umab}!coJP|g0G$qi^!lod z3Thowjc>B#dA$^Rspj5aK`?9J_dN+xMxNGRLGT^nOL-xfe1h6a{CXLh*9QaZJ=?f; znv};+`-c^r1QkU{0_(pixWMscJ_k8jHjm*NjPQ1))=&jjb;)V`F;8My%jQW>#m9My z%1Pl9?d?t}&6A1hoct8W5`z?yx+hN5F)YoQJ?d|V16eE)Vt~`( zboqXcc3Qqb-T`b=-J(|gt>#(1bC&}&t%8z_DWZNqupU>oE2qQC=f2}7og$=P5}TY- z&ifw$*Byz4_0eyG&NqjNwm%ZBY@f1aww@opOKeIc-x*LdUsN ziW0qo8jR$W7NDW@6rU7FgAH($@rRrqrdoqADq+;*a#8y_M$hZqIh6Kzgs==OJ)O~-=CS(I2;43c;wn#1%m ztZSu`7lR=p>|NVZLLO1OhP+6FBtAE;?ZKMHSo+tw8A{T3-mNl9t!hrx{y7cyBEQ8j zd?}y2DghU=_F%9f|(zLe<%22CpPFfs5k+!Rk#yhl@xSQ3arg6*gr>$@Xl>4B3lzV)iHl??;H9Vh$*u=A`D3JL#_nX3^34 zI&%87nG$5kocu7pOxWzN`7{-M)(avq0e`9$rVGx{^JIN9&hK&HO(xxhy3g&=TAxeh ztG`b>DZdq-Pi3NUVbt{M_gz~Zr(PNH8vb&=&oK%maStXPULR_Y*4i20qw}Rr#cR75 zKBX;-HEWe+C>Ibgcgem+e?WZm-Wm8l*`<-Gt}Hf*FFSXh?)e3?g16(J_3L8X`LroV zceK!oRCiMUK!ep^;31Wahxxks=Ecer6cKPfo^HiUXVE6R#ObTGw~%{Jq^IP=QSPQKhO9A8kF>ijCkXjrPTw%lQ#m!pj6 zgG5T);BMG$8D&r^|I|%f`mp%|R(o z5%k#gI(Ok($;V@($7>~T&(~?&Uj#zG6z`ln?GF-ret6BduU*d6NNtmcu#^3^vSM{G z%MxN7^u~=?9ACBzO{=YVKj08QOId$Ju=lXb0)p;5FC>NqSy5SmOp=vHw419gl|prc z*iZA}@B(vf8y*qN`qC@^u5eUf4lQXVm9NZut%ddhTH;EPMnFHyxoEEPKv$<-(=NNk zs&Ph{M_y^%_RT?`q~OYxSRv*hQ#_Ofph7Zal(FNflW8`#fh3TpApEjN2PRDG^I(Ui z>0V%`YCl0icK)R|?iRA`fp7NmQXz&wmQ%*?w90BV*^mttP=z_ik-IkY9`*CzXt3j> za7%iJpNq)PU^WSH8pXabHXNT1 zh|d`FSVJ`iq-n*~Q}PF%>3=06w>knljQIBR`#73{K1Iy0-3+zo;g@4ZvIX#z}d@Ju+9sjUDxg9K|YG|fM|VdQ)3 z+fN!y#9yXf_)05YQO1!jM~)RQRuJjZ0t~qPWdz>ZOU8))L4z^$c3Z1mrqwH38E%H zP?5Pn#=Eb^e$x89uy>wlv8r~}0tcU*8ZSZ^sia_F%siCQ|w zNq0S{?P5i&==8pTJt&B2>r;0?R091qnd@|tru=L4A1Q(+s>91MD6AX-Tkh3%u?)sg z-LCeTZ}L+Q+0m%+9G1^82H%8HHrQ=xM>_e*dp8Eqv816Pw{xJ+?__&N8Plm%TOCR3 zzNy+ke8y)8G3pnikI|(V46FcDyFeW>cle{u_SVNhwdY}N>Wx~NpxF=K=e+RR9XxMrmE4t;Lw35 
z#P`{Ac;%sX62@CnwD#MmiJhF;fag5%)$cvS(H2cBfeUOFev2H+&wZ|cV)F`$esuNo z-2$r!>9vTAd)eUu4mlwn)8%p=GQj~#%VE6zho6FrA_Jbxc{6C#zLwu%FZuZGo-QHy z!RI*~4fgSV;p-=sf#X?Tpf)TeUK3zPworhxwS)s?gk>K1N6=u8;)tL8putFjj7Y-b zNFoPFVyQ{pFLSJ+*zMF<71THmV59|cWK~gAxOr-8&F)~yPO`vKjez38Aa>*&#RH(4 z{w8D&N1iBePPyVSkdoM%-DV!T;0E0o;85wHJh5UUx8+5cH9n#@K)(&x#48#%cs(N8)g?Y$9e9$d!|3Bbzi_jhPw;8L5yhmqV-3v{Tun z$_UUT0JMvy#9>iyEZH9-sFvW+F*K#kF3Ty5n$Zd;fYF_zDQ!^{$q3L3BfbM59o7g# zF@>|htm!8awnL&cwH5hGR*=qe=$I8fbvz#@45Bp1a{y!uSzvJkfFodp4Mudt)@(jF z`Ae0kLyW`)2o7xJSb`BISkn7gvW=k`uC_L9KuLT6pjH(|$83tbFv1-eVJns<`Q4@aG0=!_7ot~7Kw2xroYNhX2shBf`VJIF2Zlab zmF&9_OR*J$#t=g?ipRcrh=Il7>boLz2Z1Q~n0(y}F_)sn#vL55Sm-pmWATGPkv z@ruP6q6MRA*m69+;PXSPE@PUpP;%Q$<>tNVNu|@qvRL z8nXtr5_2Lz)F}2BR+Q8z&>>n6J)$oSBb7o>O~P3pERgD8I39Sip8-LgzwO#*xTgN+ZmJ)>RyG z7Y2EbHdlrT+_PryKr`fFm?99jUt;KRRZ1~Li{4=&+J{)cXbdGa$)z}PGT(=JdN2+= za%Zx&qRg|RZ?$Bav|_U~)wMuT6e9$u=1G@PVDX1co_i2Hpkev}Yq$i-5`f-b(uiwc z7pII(s@|QDkgz~dHS9nXjPJW9Q;nfWY*jgI(Uc3OJhNVGZ{}sOgM>Rkjn--$QYWhn z1g%*yIb?GXJ(J;0faaKTG#G&(-o`B6a(C1?To6>PP@22MrrhTa-9b?l z0_d$sT*V9irS^5oGgmIGpUl zu@!wyt#{50Zjes5+R|zXi<1yhN%ge@Sh6otx z@s3u71TK41`oSQraN#MqFdBN}Ce5QWOw3CwN*8oA#*Fpu9$kXB{H7%& zDuGJJE8+oys@RIh52jxcsG3O63I( zPE50;)YxU|f>IcLXUXx2DDlAwTBL@?y!PN{Y{sk`+1k0;dn>*Hxs3Kz2o}!^Y&BXBw%@GXa;5==p8y%`Y`V1 z^(-*z#y!r0!8eSd-@MK+Oof9vCx}at!0f{@`D;vmw`4WnVV)^lF$kuS=|2ZWsImlz z(HLQv{89saO_`hAL0=v*B?F(~ALK`;61&Yaonlz9X_hc9RRu=?9BrAVT#+_-{eiJh ze9Z(0fjq>am3S#TFr|@G=mXqy438^9nABRBVON;2j?@pv{@A0vnj6xtp524y;+s}cii`h z(RF*TgQJmIy@}raG5MQp8tLN>Wf*(Y2~)6KqO?_$vzYl*wy!cqLX#DQ>$CX*bKWSo3=a|Z}j~~Zq1BZu#MfjGxi{J%yDMSmF<(~ollQ4 zKRucG^ptHp=+1ap=6K}Hcr4pQ;++ZXzoEfOW+qj8*(&{^m6=m#h^d;Hske&LjsDX$ zXHzp)Q{89NJ!~@*KXtH4|CvuSGpqiyI5KRwd1~{{H0I9iyUe+7Y;)UpX2|c(Q{0_8 zyEE74KM!odP_a+rI505#44u+EmTf^SV1a8E0}j9p&alllGbzo^tlwEsy}NMvd_h17 zw-x#MPRpXK(jvp%FWqN9bTE&fI+({V9Sr?b2ebOAgIWI2!9suPV4*l2%xdLP%d#i? 
zO33VTc;2$=f4-VKyIjz+wtc?5ue99#Vr%()bNB8#`jJl~&$-s(h$vdFVN}Y&EwPi8~m20JW^(x!Z_3q8q7_@%;+AD1|wlp)8LEG(TuuiP`qx(eL*J~UprZ0wZgV5nTpGB zS_WdJl`7-S$vSDmb~zhgS?Npa@LFq%h6SZpvfh$ru(`9i{^cRt;YuSG`Ve0)%XbwP zj#s@KuI7!fnnwF&0L>ERTH$97oIhzWi$Bm{Ezs+aQbg}}gg!u0V??S~2J`gNm7?Wp z)<$35Y7C%|f3q=B?KssEE&q!KJKsF6YQS_vvM9$W)bD-iO_zU4sn~F^I#P1$ZH(eC z8VoCZCb6^oxhGvY_7@G-C^Dx{fAsxeed=xOFBksaNCGeO28VjMt-z4)gk-sCI5SXpBB+?~Y?93zj@~iL|DwSz^Q0)cgxjXQi%yA@ ztJGiRp%LtbGREuo&PQ>nl@GLwtZ~;{g~j%jhskn;%1kE&q$ReKGEkZq%Dwg3#e)PC z-_xNK@r#d^n^t((3Az|iL69%Iv8zV8=P%bzb5j&$`>OKl4clKdSi^^hORrj5WO7)a zkigV5wRQ+o9hu4o>Ma<{Jf&^A!enI3kO~H!W{5jqp(&BdHeeq0YOrhLWk17K4&=)GN-v_ingXQ#O z94H&es7mvrh1k3 z?1j&r*heypKWVVVA2b-m@Jh4Ady6YG$mcXaX)uh?@^wghaUT;Ajt1NINc6_>KN44F(PPody&9jRvE(qTI0b8ZXFjG8K|tnxFL2 zwPS_Q5Qn@ohIxW^3B%zy8tmW?G+3l#=YK$h(c);Z>&w58rXbjLB7mxv$KZNN2kp`>#L4z&fXt3U&G}sQP^(PIM_=^Vn&Ljuqm-wPW#1LoI z!@=IIz2U|C!vpheD}1pX^+4|MJ7++fC#$;7^6*#El!q;lthLj4l`yjb5W8eHX$|im zX)wD#&|qEV8mft6Zyv}Yy;f;gWpp18B{d*lGI{aqw_g~hACyeiYU0-|DjMdP9LdXa z*RsLGUHUneEC5fg8gV(8J}Z+fvWR;3APqD4%Hi3wp^`-%?FbO8!I4PiFjkQL;Rl!x zU6d8ud$XR&tg1AwQpYpvTXPpjPrII#J!&e^*pJ9+_NXYcs3u^az8L5vxAJOrUf|Zd z<4^B$yvwU31+D2$I6CFVVhZxm*Okf5*&8^6EQB_xJp6j2NmR4E0~wilR`#hn^RO&Txk>VoUw-Sx zmq6A1#>}dSR=UCqg5d^&#bl9dmy@=$uECY4Be`Hj{gu%D({datXUk!oibexygk^#zJ`4UYi${3Lkp&;XB^^ zcwAlcqR`7NbL~!TZ4A@ZaxY!qH6n7B+PtscUT`)?tE@4a4j#uGA>s{+lcS`xt?Jsh z23)JMHzu{X9f)$9GOh-6lE2}8RL>Pruv&PFV8gaLua9l}p2Vx!En&`>Zb-wG`N7L|#SC{XZIG>1^ILWAskpb0L4$d>b;1~;GBc$8 z^CLOGeWTV5@Bo!$L^3{-?ly?=_}rWEMTlde@7{Sv=BJG5E}b%lp$g&$Tw&P!vyJZ` zJp))#-%hKwv>S$E+Yj7N%obU(b)Vk0yZ~Qei&DrLwLFSn9Lx5-_9*(@^w*X}#Z7pD{Oeu2&CiVNHOS|Z1n8LW zz1vzFFeUj-M%M1h;@lps>3grsLod2hwi@;DWV4MhbS_7fBDMZco?gHebXYQ zveR|@u>Gz7-t)PZ<&Oc!=}qU)7tl0cTF=>S;@4Z{&riNSefq6g`RO4;z|nK8!|?;( zY?wc3A^f$ffBmu{q|L?05?=KUzAi;c!}#02#A?K3nPNv%)jWR1Ge@lbR5BC zcY~Y69sJFYoFqte9NtN6!lQ%WxgRtd7Ibm^hJ060hPS0U!_!MgK`LD)ir)go3WEow zgEVn87)gjRUx9x}x8Q}DH}kUJ|O79^ooW|lWnLhg^d-{K3kFAR0;3U!uw zVq+KT-eqsM66$Ri_6Ql~D|6M2FU+6B+N&@O!G{deL7H}j1t5{rgvgLCWb_I$78y*E zf=sNlk0c3yjtoyN41ej3Oj-$Ny&s;=7m=$I5j~D$!XoBrBVKkzl&)YS9DE{*_#!>3 zBVOA@)*>S{86&H@BF#)A>q(-}d{OsmBb)7_NqAq$dMP`O1W*HgtnlI*aSIp&mF{>o8yD8!8<1vRiqT772 z-wR`dbYj1*#5M%Rp76!tPlTQw#S-5X0A7xB{DToTA7}pOMi{wlYyvV4b~A=^K0dYZ zKQO|m6Y-4_p4-K7b|=c^#L1F|@}?#TFeQq7*AycKu;Gj_c!CURQcPWi`MfEdR{uBe85e82RXG(o+ zmmEWywgybi{2r8Oukp-2Nh>%>$d};OWq`ruG~c6C)aCRqVrdnri2R!xg%i&hsS|%Q z!dg@7ZiZFeOz&Q0skP7eQjo6EksgRM!k~#_-|;(Sll!_uyNWVqQdxS7GONC2sMBT^ z|1`p^`N_wbUJ!Pt%#vmi;=S0M@LiOp{Bl!6j1?aXG8}2>W99a`u*37I5<6 z*YGTjyx)v4TGDJ1=FsVz*$f|;iFHwmyh%7CjN=&PaWj<$kOPyWq-dbvm(zeWpzhkC z7|C)AP$(g0ceU^oWKq&H-(0?%DHIMGg1Wiaj&j6w^K!&;Ro7g2WdV90^6=tvN4xX% z{=^8=i^#hplc%Yhr`_<)2r@5>l#2+9D6B3jjJjBqvQijJmKHBpl()wcUo2@{D{4M2d3Q0S;o>Wj%da{=ylP;4)qPypGx=(m`Dqtf={aBNNO7s_(W_6z z1rrUW7~Mx>Yo(hcWeX8yLfU0tbYqs&%C^^B)<2X@t(5IJysqMTz0Vwd`1tjiobC7G z*YCStpX-+28!rX2^`_QXH$V_5bLY0 z;JOviU2}(nr7-uUBX>{b?hT_dcqQVZwZuT>rN#gNOI1;wn1ExI;?%?LQD@opA4VAG zN|kn_!)>1G$H(>xk=0iOoKxRai}&~$Ibv%--?3NY%2J2? zWrPtlRtPj!={43kHr7Tq)|WIk_B1xHH@+orLJKst>ovV|YQgmr3{uKuJ!?x z2YU}YcN1&aTKfmFHvNP)$B1^oa}UX0cCo0Aj*51HQ+wGa)+TDLEcBhPY3Q_D2S>Ky0K#{E8KCjrLoLF7--zVOFADqz@y21V!#O8m>p0@E$SH8>5 z?|np5XU;3u1VLz=0+zLKgS~izCq}=!B7?cC=|g!h`-@)I3_)n!G;70Y=hau9Z=Bex zPe0_{hQ{@N*hciW3HJ2qGnZxbHuk=&2YnaT!P=i3_3<<4JvO5sU#P!#x_emR z;{+>Co@719`0&-KpXBPmsUK8dZ~zd^%b+;08r8|1*$q-;rMv^(b213<>;1AZ7`E|| zsJSak3brgbDBL&1Vld#8F~s5Dcfo*bOK^x_Xi$Rf0~5svmEs7%U{Er8Bp2N#Qp%M! 
zJxmJE0~d+&(cP{W-^TsJdE?kM&MQw+-+DDo(B8I}%^R*qa_`=D*W)tAoY zvN`HrI^vi;YM=10XYWa;MQYl^7~p{Xi^ zsT$|0+UTkJ(y7M2spie8w-nQ8q3L#m>37c4@1v)?OQ(DKru#OhKT^yL3e5}~%#1qE ze2Si#D4m(=o0-|1nWLD+2+b}U%zkm6U5=hzEuCHOo88=;-KM}T1LpP(<_?|bzD3V{ zFP%H-n>*XY&S5F$fx`3nhVz6j^TaXpq-FEu{qq!C^Hh`=urLNEAxv+Qw4u07Z82uj~-I?hwcBkiOm_ z|F}c3y+cK{3l`ahT-l{_-DQZ~WqQ5K@^P1KdzXW14<@q5b!G2@>mG0H9{=k-fscDa z+j}BZ`(h&d5?A&wx$aBH?#sU3m;bo0xV^7Tb)YJ8pnl~*)Ac|*_TcjC1HF$22HOW$ zs1A)q=nhS;9A0xhyb*hN^Y!7akB4_Y&iy|oh4uaCrLe!vgoPTrAZzZ1t9XQ0dWFAE z4u}7e!f;GjT2#W`U+;Qd(_J|8p=skoRcFuGUQb1RZ|QvR+oQhn zh5pv9{`!`Wm92v{?*^B92dBRbwr>wMe;=yr8Ce_}ncDcLNMTd|lET(MmEY@mVbJ(y zbE?jBeYUT$c6+Whh@8W)scv_%D^b9rzo~wIc_3FW%dolOaDBYWajw6)@!R%%dnA_Q zN=wuC{gvSoi;peMC*K-BeNVgcw&m>P0JA>#@hx{TEr5t`9!@|e^Y=<&Ie$Y6Gym&S znBo6?DeNQd-!FwNysxa)O}s1?4Xx}KNbT4UXG=FKzQVJm6cr&8KMpQc-Zk~p6H6VJ zH!6E=dip`-?EIv=^!yd%GE2aQ8g0I`q(_-b7IU9izKyp!+@t z^iU=pJBIpVRVaa)1~AhZW`NTRuNnoJkIgf!w*s@}ZYWK^6d~j)(dKQe(|j7oNf$I0buB+1N6ET4`>FpEM!0cOgMrzWuc~Z{bqi%1; z!u|G%Q@Oj_t{eU@k;1;K8Y-+1TZqV+4$|uqCE+jE0YZZxg+1dSdLMK% znnC4@;xpS@fKY$`Pd8G`Lxt~93%VUI(lrpc7D)2(+64A-5L3QZQQGBYh(oB;9|$OZ zPbTHxg(Dm8J6qpc#gi(Fsqc^8^~MwXM@wOLKcuh@+W%MzGx#Be1^qiIO#iADDug#?-bDPKNSWaOSR8z$R6n9HE6M!B)TnH`YI2>ss4F<6tt3Z-VF z6zA=?kgssm(G6PQju>a#ete_uBk`4_f?J>IE{%8aB&T>GHUw$%cCy*>=Og+5TnaP! zd!(>@BgsEVVW|I<6gJNP|34{A;y)*a1q^VmBq*-jlm7@+_&%im*;Mg%K_I`$je6{& z>X|6H=KyD1n_4JR!Uk}2^q#!p3Y8e0z*`!D6*buzk8nj=fv!D{^QG`Bb)kzn{~0Mv z=I@ijmWcy)NIyb@Z<|Q^SznJKdE{a3#cveeN)oQD5KGe%fa~wq`NZ?6^4Pcu9E(0707emZc`gp$q}J4{mLL~~ z;>XXc4jJM567mnpP|KL$^Cgms!|?;KMjlNP#MD*MqRDh;aC~t%iQi|{iK>UHP<+qN z#Cb;ey@B|S#23yigDHjr*=a8fSxOh-$F;;M>aQl9Q}boT37m}!J+2bxFV{7^N+oPW zpf{03Ty*g~J8tn)gr$iqeg#rF0h5ZF7OzK}!*ndF^Gx?Vk z_GePqJn?7-^}QPpow)l{uVYj@Z&)xqydLH)PdIN$ES2X_Y~fx?vrDR3{>0>ex)hfE zKSByK|DP^}W&eYvF!O)16lU^Ik;3x-?@M6@|3xXx@PCOER>J?^DTQV8|4Rz{uSsFb z#kJ=G_UGs7N5~SP^st_Lf>R$47N0*2xOFY}mKG7F;OW!zOPR5oY+qhK)td^Q@)>B@ zt++rkDNl;`5+)iLEP}X;iQXqF;% zVjz(m+?o{ypB ziQ-fKObW{v4RJj}6WW+1DP7GD5i|0ad`{80+4qq-ya4p~o5s-vP#oN4Fg z0DLtcx`JUAUk#I5AYZH!t$R0V+KD@3h#+C$$7dh`fOSX0$@Q@SKn--5Mc*lhjX;IP zgtLv19*PhK!tg%g;U#*1{_NmxNw7Q)1kj8A6Qr=CHk=ejsh^bgcS>Pme@O~E7X1?` z3@P&0rLbvYvM9?&m}qN}VyUs7>m&y@0a@A#8D4!_B|2y*6E7tLfRnNU0CUvu+gYdq z6beD3PlN|c3`~h7uqo}!$bXC!mUHnRCWXEFZ<4~k|6`@FuzxFsh5sj{FcARUX}y)) z5so(we8N?WS+9HUC{NB>MT1bLiI-nUue1liHGI&F1OQ3`K)-}4lo3F*k&ovmam^?M zOP~?eE#H3DE$6@$KQ13X#|5@iK>)g1hl6Uw$$=^$1Qj`gOal!0tOywly<#B7ay=A1#!m%*r_c+S|O@v$OI7IC=fU=>I{Hi)QSHh zUwMrHV1)!O?f(%AtN1He*ohuf_lHu@4=gNFf%pb%Po=`Y#loU9y5;2lh=ry2 zbrRD4#KPqN9TtY{!eL?9G6llfKVV_V-?1=992VB_I~Iocg@wJy`wy@%%Rj-wZvFua z!xwCV|G>g>4GSB8VPSBQ9p!Eu7UtPE_?NJ-(Wve*uivn+oZqmpkRMo>%%5UmMBcw+ zVRSewtZwk*FD#7u2Np&X9nbU&3#<5nh4KEx!k~nMRGj|?3nLX^9H#t2p4u^l#c5%+ z5^ALQLea&9`sKirlUfA=ILJu(5*oSA6Q8XBhw#p?-(O0?-pJSBISc?ordyt!K-pR z?;?TZbpRG{G^QG2%QAv0Ohq5v$xksP@O03QHn z!R1{?XAv5~fdjg1#(-GKcoIhV3`;hV48Wr_5Wn!xu&`ML;yZ=oL@NM*rtHvq2p%IG zehHi$DOVOXe!*&dcz{^KnY3V(lDqKipEbm9P6GTLt>N8e=TTQI8gKw$V&k9uG^O-}y#85r}{=jg_eTdE= z5xX7GH9lNcM>KsU?0KGfQ@Qz-%+RG^u8I8#0O3~KK5$Hd}(-` zCVceN96@9TCt|D;OMy%CAqZ38P7oqWGdPyeQikaeF!vI0tLhF-IFM{dyux3T$_GG_ z0-)IpsoW%7qnjp80gy4)(xm{bJOfD?D=zu~2)h7Wn(@*paC#6z+6+LNK{R~_K*jgr zVhWs?WP+rGP~ZtX?@M>erVQE642fv%4Fy1v0Qg0R3dFvLZ9pW61wdcY@b;a>TnIer zLLh*zqkR;dkHf<3rc;nOEKKslD0z?n4=n6?X6VEpv9LNd{TayWB#rUd=$paAX9 zI4sQaW>U%TSlGRvSQz7^$kH5*F%Ao>$6;X{o&8lfEUZh4lJzGR=4)N6K=|F}Cl=PV zh{R!GeAA#YOC#?@N*osU{V!u-*XYBp|HQ(E{}cs_nz`3)ZiVqtFi9}QySDGZ}|SEL&V-MENmW!g%SS|3p=`R?Y{--y_ZOC_X7*d z{(*%_@PO;1ZAUMFS!T83CF6g?!gj3Yaah<&P*!6z2@?KUU}_JCg?TN%S0d(L>K4Mk 
z*CRD0FtRbvH7~{pcNDB|THhhjF^Bjz$obsTp9e72YHxu6B;~darZ_AN!1Cbz_AY%Dofi~NcU>1Q3!9!bi~S^V_`c%+h6Bj z;;=9^*T8Ylr#?TiF!=V-i{G)Zx}R8B%I{d1nj^J;s0a=VW5NGI?}qmWEX-ub{_7+T z3$yx(g^@q{9Sdvh?PULjg{gQ6?n0h#_9uR3Ti4Z|NquVeHwW)Qs-y$zxhB?UoDbkT`O z)X__HqBA2p5j{$j(R&RNEk^GlNF<1NlJmT-`?{a!xy$?E{_y_Z=hNPw_ulI`_BwuR zt$&0tWaT#&)_OKB@f!;({Eda(z_t8?g_-<~g~dx^mjDlOx8!Xs7G`!5fN`Y!gN0pV zwXnaiuskdl_S`Mx$m`|SNzq?eSfbm{{{;)n_zMe*+&Z23kzcy#%0(6u(>x^ zwXnv^-WZ#IANw~J#_~@r?C5VS%w9OyZ_H-?FD#7VGmOYIReYT1KfuDOuvplHMI1H^ z8~=fYL42tvJ;2}p!oq?R--N&W8w(3fw6MFuqn3xo!cu-?VSKsH4Jj^8{J*iV0W20a z@COU)oKE|Lg{8gujfGuPWSk{kXg(G8-Hu3vL&BaYBcrNA=}K`P8=8Z$JQq$C`w=s0 z>f~i4ZF@N)5O8`&npVlmNhz2=1@a@SlRf}LKhXQU>YQMmola@HEQmNm=ku5Bz=){V zpYTi&-PwTSwP0hSU@AbAJ`n{=lo%wq!7Gt~O5Xht6NO7i@Y2^|vOt;?#P~w`9d0KA zmnhi8Khz4)O-uQiC&O3SHUf`;N1lOd7lRjqx17VQ>W)C;Q$D$+{2V#modFUxO*KHq z^xD&rk?~L=W^TtS13pCK08%0WC9;CbWmXe1E+8UL0})$@;$7#vhIlL2cm zz-2ZT1TdfYc0I8L0N~Lm%hHPnuM!V&k!hY+ZzFTmCrv7gAi>F<|00Fe1!k~^cl>)P z>_Dy&?9}mWJUh)q`A*fh-4{82~btlFtmOU~m@8fx;O4C#i>v^1fH^=JCRq*;=Bg0F88 z<%?CC%$f<(THcKnLmY6AIF47Z!X4WL0KvHYcT4rSqjT8fZ}(WKXaGy}yx8NmPSI!_ zfV(mqRtkGw==iqg9cPMeJBNCTj#GCxjTWcJ;`2$pej6&Q8`r2i%)dsAy15O-KJ{^# zt%PdIM>&VYWO@wAlw=-4e@kJzypN>bAipJn{77U=^M(M#%E+R#PfRz+lGJz#lbFgl zbJNWDn0d<0=a;08D@%$pe@ee|d~wwiK~z>&-s0r!qm7lqYIYxxI6cyxh^%bh&6<%& z+y#BWo)~MGKBk*pCns}6EIYzF^aV6QW zs%gls)q+{59=Mz{LREWMY~}+JY#Uz97VfzEStr=?w-n}ol(hSEVLW{`%lCSQfR`LgNezAPW46~V{VEjb`FJ|~6K>4oj-Od^=-xQIe4pq&k z$pLX8fx3}pmm}qkk@+SO9nSe8WbaSo#p|4G!Ta0SH?FU~Qy#eD0&m>?2-UR#gbIgL zlLa}TDdscq6?c<5@ncGejX&N7Sz<`(Y2b9y$dxk{&78Tr2fkbaan1UrT{PwHPegUl z(`wRrU)7=+)j+7C(sj~LzRrHfR2n3kiK4Fu34nN)Y9yygAx|9;G?@$wK1Waj1DiX3 zyonJ?8O3U9$j2n5Mr?ekn+aS7)eJ%+w2Trap{v0#cWM7Mm&bj04)xVoDU1XBs1I)g zT@&RNLk(r0)sVq(faG~_eGGtMchuf``?x=FX2%eIsz(QwD&s4f8Gv1!abrHnXwZ7R zCfEDPLG9w8L89m}*!E+MR;v`}{q+DtwnQ&iXnO{um!;@{MW3CMJP#ZkJy&b>!X=|e z{v)YJv)tGqSA=eDB(3jYUGCXy0>3>%3ZrUe4;Bq(ty4n2`W4Fk_p3U;Ce)QAhzayx zi0PkeqNQSzLPU<0)6d;m=@_HRMU5ut_NDGZF? 
z$0h%_6sAz71&jG3g$do-;Tk$(vfVv=)o`4VV}?T#D!Ic9Bf?FyNr^scNG4QX+L90E z!@KhdOB_4kDwNyt}1!}^maDm0V#{fcG?GVG*l@`$!<(w-`FhjA(HEFh3f1G^y z8tEgB|DF^Uzftk6?Y6<3k^c1M~EQ0VKKDzB@QG(ZcuCLzZy)tIwFJ#ol9Mxr;tK z*6<2~XH_1mbLwT!^~8CwSqcI8BchXn;1lWfQQo@6F?v?Tfh3SO-^R@3QTvTeKj}p{ z*Y&86jSD1HF($Z^-QStSDzT5hyaqdt56M7%*UH&$A8orGN*L7?PO5gntU_kpvtOoe z`}`B7OU~?ozGa$(qM ztRkXot8MwUBi9a%f$XhUiV`n6>!R;C;JHy)*GO&Y8d!W4@2?2Nrxc2gsZrFR!~lLv zVSD#RtrPT+uN|2bH2wDz?V>FuUO%$7Wud;X?=P;ceb&Zmn36R*suK3qCYWfS%I|un zVSlT94ud=QhGVc5C%m^1D}~j`DYT7=RBNQX=M`X%KM+Z%`7YgW^Pnl=dqkGRDeIRh zqSmGMNtJ{0xCHi*I&y2$CQ_T~`ytECmCuW%rK6d{?42VSlSQynn9$dS5o?`xcxf)=!oC;VOJB=Oc(mD4<#TMb_K1Coa6J(|q24hx zZgF@CJ}osxY#DdR&OW+;$7(K3CjR=AVL0Y(X(zkWzVAKRpZByBs3QF&NN$m`<*bUb z^`qN@M3F6@WS->1XTBX#%fhVR!yCF>LY3rKzX;hjFrR*I^~0o+3hDLlQGBJmPV!&9 zqOBkCYCj=e@a|9u%Cr6=dRkCIvGY3MMIeSFt0G8YUyHVGpnYa%*zetl)$*wbciZV{ zd_cPp=_LlFv~Ak~R(Za%vi3QUVn{f!k<-v-+qmQB{-GbyXq(PH$LF7OwY~an!oQ9| zSzp)t<9>`MoOpG@{*uCkgPfZKe|_t?9(Q&Nx?0O$a+lV)#zz9GNKZx|;ZPu}P!G!` zNbP#NPa%*SuK7!!*uLL}RFQZ0yJ)w*C%x0F3Oee1YJwz+F!(w6h&UdCQi$GX&5#(y zsX$?+FeF*uqZyZ&(H?cV`d5eUECnGf-wc_fx;Iwv7#$u#)K{P9VD+-c z?IJ1Dn(VIidF3OatP%)`#SmftP`X6)`uj<4??LXU$IKEqJWlKaeUCpxb)Os%oDV#9 z0Z>&qhs?AAg;!S>PD2xO2|3HP9mxAdlSx438XMjkqOzaFrbq>U_K6>&IEko5?AG*V z0PU!L!Ez0DpZ?T(<;)dz=`mPn8fNZ|2J;Gl*RWrzuTOmCS_inDmpQ`GhJ-hM9Chv` z;0s1`TSY3`gwVNLaejo|8;n@0$0=6}ftEjD8pAse|6EMOAx2?Mbn~_3=CGhQ$BVlh zZiULizam&u*Ic63ogLPT00l2by5)@e+jS93GiyZyoNd*djT5Gvi6LKkse^j&+}g)g z>`idq=Deew(D(yA{}Lk-g$5a{yRKNjI08s*_Q@2#K6KV~dHGt))uz#V-TBg5dD8?( zCQ>r7ODGW*D#OXHypAn&EWhFHeO!{QN6@8e!DG7Nv+B3}uSd8>RPBFu+dFml`*+DF+R^3O zIckS$#Bi_|B3P{=b;cs3>uq;!3}J_4isy)@!8pp$Lgz|Vkbjq~uQu>p<>BrIH$;WY zm%@$A2DZDw*{jawfamRrbf+#U2?impEauuq?Y5wVPcM=#RSA z)WJA>JhuGJ%AEC~&NSUdEhS32*1!x~!FVW-JdWgHgkFgClQ9DA7li6UgPN{{(~AVw zFE(0&i(((PEL=(K}7W93|7B((#Ma%c0Q#NsXex;kHNtsF) zJ(*K#biZ4Ekc=`4;{qr$_^C=oq%YqDP`(8-@SM)w+?hP*JUn+~29zTY)nC~;r8_yL zY3q`N^^5Vag<4Qo2;wmmc8+eBIQy3?g_WOLO-kL^X4plC+Sz;Lfy~Npnl>W5s&A@U za=$lm_>I2>I@##kxtnp>G1)Q8mpWS&_-><5&(uZbDLmu#Y%PnOeJQqI8gp8q5{(?f zS0DP1mH6zJDru@l{sJkc4hH2-pOjsfN8^@K!wthC3hNfh$M(CjTlBfk(Q<+l+!727 z(UW1{DwHmtRM=3mzl@O_PEr)clFnzud&^{Gxx|^JR4MgIewFiZdnM{7jz?935>c^lSpTqV z*L+6hL>1oTluhMVc<59_?rK@y&(O%gkgtwBk?c2PoCs0zY1P!cabVt;e2#|_J6X%P zso%EU)+w*I6Bwd!SWM8M%SUdSW<`yN>@q?9G6x({LR?ISl5S~}Wo&gkAD=&T_-D6} zw}X_5%X^`s^>`!B8V45v2Tazrjc8M*(m76NmpYxyOMbLb(@)i{K7wtbeP;>hc;Jj6 z4O;HDz~+4&Q0Jf=t45}n!Mz<-(n?k8M2d(*SLcjMrR~b~s!XM(%;B#QWd%;o6D$-* zRuy@>quZ6MeHqfC!V`4!0%|$XAEEOGwi)LLLk!Jsu`;>);`bqC&R`xoy*144eiokw zwiC8Rn=*Kod#o=aoV~K^7b!M!dh8~9NGf~TITW!AN+%8rO{uF*SL;f!QhjH)x&Xt6 zy=_-R`hye<*tLPEVp1p*)sCCNmtWn^H>O+Vta&KcJ1yTEA>nQ7vH}QZjl{Hb;I2kNr~iQd_ABAe#i^Iz4LJjAb1`y>eTxB*RE zd2@~U-c0S_*49&bkQ0ra&njmiLw>g7Iv0xQ4ZW3r^Ooe!+PzylEKzsXmesii02I1Y zgb$p619UXkVkYn?%WKX!uZ#6J@VV=a3!ZeciW~S+)d0sLe&EvX2#D_7koYOotq}W# zun8#`Dee{zDX|Gw;PIy;Oh4i%2%%fSh#&&K|GJNP% z($JDVix8f;PEg=${QmUT40XSuwSKbT(~iV%*z<)^B~N1n^q^G$4X&RUI`zciIsQ_rPK40D=Fri@acI< zVaX+_XC1&IvvSq}&fRrW@_m~-k9iKy-Q1sHrLcE?p>^X<2#Jg;$tK32t`h$#d(#fA z6!tDiM@Mzo*yH_<9HP8*eq^=$dk|I%lk&}x@V=!ByXVq1A|W_ATN5IOmBOSWIW$zC zbo&o0lUo^HM10|wd-?DgRtl4jwSKBS+o%8T|F=@uql-6wp*-%mLRNT z{7GnLj)MF|->0_#>LE+tMwBCHcg!MP(0ckIpLhvCl`HVstW6```f(aQ|<;1 zj`!`H$kW&V@>vO?0r+pP1-hInx`ZiW2TRC`Xes4fC@kXbax3Nyt3s_*8A@T!H$vCF z&`o}LgVVU3Um|!k5sU+Ue7-?$QMhcrep9Vz(M3EqB9&*AT^#bbL!Z$XTkrRDaQ20x z_ow@I9-%pCx;pW~QvH7sVWluCl#;m-$&WRTUK5oNN)9G}hOHzBpV7!s$~n*m9cWaz zINYycp!uImVRQ$?ZfWmCT(^I`rjM|>t9S!#0St4wrj<-v*9_sVSWMl;2J_kTGV+uuJPYXVOI)z zh&1X^a-lIJzH+%0s2&N74vUE1QNh)rSKRu%Q&!5 zyntuNzmvjfpa4Od4JaXTNaQPRY0&`?uk`btF1Zm^B0Ao#qmpU4%V>2Hp=y~p(m)gQ 
z8$vNFKrf*C`hMhHW5auCocA%u#nd8Lkh<3#({-21Q)DK|#JXtW_yO#{VjaB!R6L}@lD^v>Yb4I@oLYCI*P(;V)snF$dbtmlXr32f@CEf!PzH_84Thz_X zteo&gNa@rElBwFhTUTWZDXkrWsrK>DTdCO(y9#c`SGTA!jD+95cj5fGr*<^ozlB|M z+)7QoKi_lhhT(eguMH1x?*~JB$(~BQ z+--x`Kq_j~NqcrG-hut9@6kMr33>H;)vtd1j*>1Px_l%b?Xa#wl}uECZ&6s3aO~$s zr!Qz0p_<4z0E;h{C*eD-dvt#3!ESFmrg2?BBzgXzjfhpikNC;Q+RK{uUDT)}fvMT) zDmT7)XDBBgBXqz@gE}Fn7%H!=zv)US{|_nbc@s6pwA7?8G$mXb9V@3B%{iJvv}&}P zU@cN9G}$>rE5?viVN0%dhvptliAiF>z?zH{)1!z|O{&19o9@d;{ft*heUj>U@!tWl z3F~3BoE}B~6EOsZT!puTur(V+Oqy3WGH&u#Z$hr0tX<>I-Aj~EaM4ZSr%;k)r{pH+ zPG*qT_vGOTzg06CcRBTmwy~&W6lp;6afMK8w(ut1n3fHZ)4!bt*rVC4 zNV*txkud2VBSBpHil!855PQkz)Ob#R-zesvus*(@YcY%2DLV$@qfU}v_4H`0lPZ4{ z^WVj__$-T^(#xxE59=knQcmLudzS$6fYD^ca7z{KR0giA-Z4MA15HaUG@4A|yx=KO z+MPpgNT`K$lqE9uaN@spn38%xgqLVkr|+$ed03dt(Be%aN+8JXnDN-)uB8_D1!FC= zsud&@Wco)6lNl}Jxj$)~?-|1K71BQ{c5VAoGCG7#(RNmY&pW&wxNbfvKcP}nFCs8D z$YywZ^e+07GR;j+ucL=NlIh*yMOg)vI=6^Ss5pu;tljc2qY6-7&=Zv<1|YfPOx@)OSQP>)<;@ zTAm2c&QNK3&IyYgPjc$MU88i!l*^!E^}@)jqwW^*$tq)Nl>{o*ux265?YHg8+KYdr zurL05(^@{yap|-+ZkU`?#a+u;z`ajtMRWy9B5pYObpDpY7)Mt0nZ&gVl-Yxs&Ea7y zs}yhpRcMvM@q9M-w-lx{uTbHQC#Z%F=QpVx&%2?8B%dl{Z$Xe?mPiZ8Rq%yi zzooFVa*JARA$%hN)uJxeDlalc_iGFg4@SsVC1P5-j3$36~n=-Y>NP86CNT#Qv@i z+i^`iOKgh5b9T;w?69z0w(EVf^>Kb6VM5{%D919YyXeT|P@rw+4%x_*NCoTjs50t# z)Xps9XW#xcpR^z4r?goG1mfQO6mI$xDxLxO;w!6A1SX_$+WRFXF*_6eqcvIwLsWjg z-8=bm-M6okuxdoR%7^BzJp4OU!`AJtSF8B+bl1LXY-~STMowA|tC^iU>)6j#^o-xb z=hj&`^v)iF1YfNY%qFct&n{$?J7QxjHVwH*bz!eQHlgAEMX=CKfr2qh3hH)TK>0LQXjlnW@}n*c3%+u3*fHXci?n*dI-a6{rY~u?QDv-U`%?L%Y91OT-RV#L`f3`W_79ekUa^ zn$jNzzUri;f`W`hxsFgw{wRhcIBO4-{0v6MJH>wOPIqa=*k?dnqZpi$A?8L*^Eey}*az|T<`IlpbBe1RN)%$n3dH42 zGh*3a0Ud?1G}J&gq10GY$R>%YL=-#=XN;;PeI~}DRm)z40=vRV=FwF5JQ)2EoW~gE zMI+|7?u?vMT-T>)%Jw7(mM*izp}v4I*fIs74Q|3;Sxga52s+qVPl69i9xBwpx_=1lvsrs>$wrs zGUUNXJCSU7P>@d@ac3~K{VFXLilJAGy2XG=62_eDB~yf^CI#I~*%C zQOpCtE{_@6ae!G6>eU)xsxfJUF_1l(c>zgBTp&MsOj`)29<8CaP+{J*5`Xd-tcc;I zEKrgHvAvp9JUU^dVpj$t#5z2g9zo?<#ju(f?IjY}iKMnifU{5NVyvhKk*rt^?7CB# zIUh>SB}%@Eea}!TcC0-VB78NaasUDCTd@k%lfOR!Db|o*B5y4~iSOQJR6>#enq&)0 z)x3h!`Fv1!D`42{q`!oNE+G=C9*mnvpb|>>3dwYdV(W!sJv%Vg;1b6Hk`$1ad|<=E zVRmS?CJE|Gh>Rf=^i7-#3r$&~n2u1an*dtN6uGn1J4s?x^$==H^i6apr6Qcf6>H%I zlXW6*wHa}3!XfEs^#&N}J^)KH(H-HCts677yR!~J46bmDzfY*Up@`=@=?7LosYW+X z;n*9xS&VWz7MFv`-#ezIGGa(MhTfqD4nU~uMU>}&Y#}q0*G|Xxy&Ayn2yjCU+bb;i zl*%fOxFv-HvsMvMg45YyfX8q;TpT%%Y0W_^*f5B86#;QYfuY_^;wU096oDmHfrAn; z)Diojz&~envF?}?QYJ;2u@^n4hk)vFRIkvIQW)-guWW&Ea27yz>y*nCP0>dUeVYpLM^pI2!7R0G zMX1|X%DRL!dNZkXJ6@FN)7uZkK|XNM5e?V+r24`%g>*fz=9L_lSfauTdf2!|(1d(r z5r>2JX{hZVGZrIBk0W1PC%0d^0 z%*89TM;KSFIwAuotxP&=v&qYYVAeNH5Kf@}8HRQMqQwlqm95M?507yyU???`cS+?t zhSM8>fQjG4jg3hq(G>eg(#UtdAVZf1M11-K(w`VcU&sSnE2RrG6LHszD;&SIGo~v9 zHi8)3JL#*dAKR}!PDv%aVa#ll98?6?eE2EEqmwOuHW2>pl~<!UBxRYDCUybKng-C8V`zD%=?!3F6&Sja=aCi# z;DLJ(J!$=mdu+2mK=YX#2P?AHK;Q(etP4Y=6f_sUdYAi{d=3NkYhap%o7-ayr`?lO zZy}ecViuHR%9?K0qsXlo;?Q&?t2k^lrp!~y9NBkiAEJnpgDEFJ(o43y{`9c`{xJlI zqkoJ6Z7Sb$`#@ji0Uk?F5XaClVZ`xZil00vUY;^<0F7+VA|fY~*glbsKS;iUrNk;H zUCgj?o`D$ZQnZ@An?1Gb=AZ82ka2zkdUOSpKtR7lsHZyR!@u3z3N~_i&gyQOeg>r) zX-U6MXapBS9+q@cMij8Vf-$Yv@Ok}s_<)@ifh!^f1sS}895xCBz-kz?vu*207d&Em zQIHB4RT1L8?J9N;b4f>VOx6Pn5zO2kAfscj@x%Blq?-LXt?VSd+QQA^THqP3+Yw4+L~2gG_0rd&5gX%oz^cls{dn{^686r)T^=j~x( zMUn@1c*+9oRHjuz^j)+>*g~2kXYqZlC{}?bPv>2vu=*S`&=O7Gi6qK`7(7B14kDkG zJte6GQCi$0wnSA=FOU(7jy)lcX4b{-fW?ls#b$-YCDg*G!qT|K z($XI+Y#BS9n|1l1Z5bkr%>2Jc@#v)sX0?l%M;wzePM1{`Gn z+CeDpmVmb_e(jzpZhsHl8eKlzQ#?G&+oSLJa&38l7qmtA`G8Vs52W-pF7PW_aaRh9 zg<%eEc5Dx}f9+5_c!tHo5C_CTTUysgcOD(H<)7?Wp5a`diL-r?Vms5j-iNUrYjm6{ zLia1rOK_)hPyOONw3F}y3NN}NK)4( 
zOjW|MQkdFyB1$z%#*<;E^l0U}FPxlKCABoM;Pz7nTBn?y$*7@3LDFQ2=&x(q9PAeA zIVMRQqc2rMcW;zcm)?1LGnWZ5G=&+<&?@N?*cO-@$-=M6+yCKxcd$<8o!zaUGf}!l zHgSu0=VvQFMBeo>!I$Bv=6Z_2do**`H$uQZx~+5}ArZRv&GQE+ z62Zd~5L~#V_8F?VK=BFA5b=Rbju7kK0~nOl)2h^hQbwX46(-b9Wc?IZOw&3}ayiO6 zUiNT%++Fk+k&O>$UX2DNJDFrI74y&}CKAfj&zYnd<=*GRr*5N}Xy*E(fd0u5kzK~) zcrCk3_|E~`EGH(7$3nXUDjV6F)gs+ix5Y$5#p8+4DwG|iuQ?;ZDw5sc&0rpO58Ypm z`V`z&nx&}S%c)*c-OI9gsLcD7+mgHPc%K->E4O>8GdHXINo7U|cgb|D&^=kTV`jOT zjmt4ls#*?ZnX~GDk#IzN?nA?fa2YWVtafvtAyk&q-MDlS@{BwwW0%+LxO5{Nq1?7R46v3*>R~YqOAB@EyG$LHyr3QSG+&P9d_&JWklb2`$q~R;NPz3 zAUV-scM$MJK&7&L(OtyDfjZ`55h6IkNFASn^=hck!^MK0@C%&*&KC%mSA)JzJ+F6l z2S=#|SYD{KVKdkes$GHOQkfkg-P0^_@JsJ0ZQ;?=Cr>!dyg!^J@sP&sW~hcn_gapO z7RoSpg_M1h1y#}8=U|8iV;)z=CvBz%G;+Ax{TLSBJc@?)e;RpyHeqeN;WZLTA@~#{ z3SRQYN?~Kt35W6&&(|aF{IbaIli}zp%Fpa}o_ift-M{r9?)ptMktd|@AmyfUGhshf zTHuS^&C+r$3h@k35tRFUq-sExl;k|j(hAxYlgdU3S&b7O_dgj+nIq$nFwhJ*djL%c zc*^|;W$ExRD?jwuJa;Yagr7}I;!*TO0A>XIuDgHKgbo7*%`X15{T5T7^!5&W~nt7^s3*azuHCe}3Iv4Rzch@lb0~4%Rx}*(u!hg6As$zy& zgq}9vv@HIb?nqrMta@X_NHR=iXCe%kTsCr4vtrRKuqg(c9KI!X-jS`Q=fcFT>V>EiTq6S9fmXUyfuapSybP6 zXA>yzhdFm5fwo$x6BP>g-nK&QwjS_(QVz>OsmAr~6II~I*PLf+5jV-Q$ZBCHup;9v z&3?F1YWeAH!I>2uYrd+?meS%{d3Jqcv(a4K<6>_7IB`yn2^7Wp@n}Kzey5n^o3Nq{IH=VGaO+#mGqs!K zO(gRZ-zI38Zd5o%N0CLxK&r7v3J#AEWFZmv8Y+ttp4?`7I9>corGb3h1tcA)_NVDkNa8tEPaq0L}cL<1(8J!Q-zYJF!g~r~LvlpW&7mFVyW0I(7mT@MC%hn;Y2M#pLJ*7kE zlILtGR7ej7a@F0bsOXoO&YTExbwgsmF7<2X7iCe))`bg~)}gL6BRPll8M6l4T9LQ% zF#HvXACkx>oM9iH-8@sflj{Vm%$m`8+E7+woM|$iHCx(n>vl=XlY1xgZ9X3+LQ1T) z@5=dpY!*CwTLf}8rDSe+#p7d28_;a!b>3gPSYOgXe#oTvrFmNMvEeNq+szDW35zD% z=25{(c|-dpc86!LxdU9EbF{+01-=8G_X<8i3o$R~f-uN05;(2;-1y`&=B;_kfBNi1Q>@}nct$}`p8cf(@jyj3Ug zYbtl-rr8FikWo)sM!ZyBM&P;myS^gYkpS#C%zQa6sH%j;ukIj#kHHkB4_O`>=j4#WNgv^n| z?WLj}KelxqN#?q>X+<(H2=8M`1v;cYy-irJb!GEPQ)u_A5e_^||9tuVcE`7wFE-!0 zX((UuXngw)A+;d=b-dD{blG+taB3g4ntmNr-WZfl&oo_v?P+J8De?QF20e%%B-Hey z8gN|`L41AoK=i&FgC=6OA3=SNm@P4@APBys4*{1Vp4|%pa(hhl2a7Uh&Lg8g zS|A+$NMS_LS{cDHn$bCZQAvNKuxtmjD3j>?kA~?3(O4<0Xgey8D5i`-zrf*-6!x|$ zqO>%oPFkaq=(iNs>=0I~8QYd|yRkGDD}{Ay`n=zc?Qc@;)ch@ljXF5=MZKJqh7E1M z#7bdvz8VueaSH?TAEM$u6UC2|#w}{b$E?L+qOd?%?3zjZL8-)6Y5XxyLe@b1p+f>b zUcyOf!i8qknQy{%6aSBa1U#)o*X;yAbYdfWBK}~+a|jM~C&4psV#-W>cq$IH0nUQE z8UqA}p)>JRI+~a_d6Ng1tt{FU006k(c-BLJ3jl~vspDZ=m=X!qy@|zS09?`0gfhtr z%^ajr{S5b!V$o(Vf&$ zWwnLG)H~19B6!m7M!zC5PPY9NWonu#yptx*n6w5UvL1Y8$C&I`mfrk0?U_utYjc`K zbgDQpfLd9NAu(BeC;ciT-ESwub}BuVI6RnG-JCc*bSJfAf=sI{Be^-!{znF?8Jx6} zsjQi)VUwxxTuo~yS)()U#!6~YStOD-tHBXm$eYdNo28zbWf_el)(oHq0EjyQxtZBM zn%RwIIUR`Xe#gL8Q?)jk42S`aIB$xQL5|H%j!Q<)@J?>aWX^(?--MRhRC5*pfcsqJ zRc&JKs$*i@kKAKN2xccQPa|*5Do>p=1p;}+`vb5xk$1*;9ec!^kN*>V%8PV;mjA<4 z?LvkEA_BlQNW#fNKK__X5QBPyK+^7dshZ-501692TkpnL^uE64?8ftnhMQ$jEib*aOO72&e0NJ)N=y89a|3>sg#U~|@Rbf}l|~Ge7Dkmun`Ot&m!@PzCyAPj+NoCo5<)IE``Q_;-+48cONUZx;?hsvGPExV>sqBq*g+lGCnwW~Q zf%1m=l($(G9rHzXLlybE6`f9%#!VI7S&6+bDo0ylyUHsQhbqUk-+WE4ocx(EZT4n? zByu9=jbHhjMZT)1&)$6AjmOAV?d*DPI8|*#Rqc|z{nA%;FckNN>FsyzCnwr(rySm% z@4mGuetQw~^6KYXe4U78zG^Y;YJ%bF+^A~eY!v%kHE1E+g0LoiNK{p`hM7O;0lq#x ze<{nFKZ{Q7W{LSCer<@hS?m2;LH;0u4H91dQVvNxp(l0w^wnY&F)ZD+QhRl-CFaY; zb&r0kii?CO#nxBO+N!peAZP2eNE@b$9d+gEM(M8`9xS*cYqamSmfWrgH6d+;d*5SE zyl2i|%D{=s!QA*HR%3>?&Mv#uVZqB;=iLN8lN;%~XeJ})lA5Q?4Ro4~&-cm>gBzLZ z@n2Dq{`aM@qKoF42*NO~`jG6>!)gNaRM>w<3LErmm8d3oINVadOHp;yqO1J36!z(1 zo5?2C(q7wgw)Q+$3e#!-G~D(%ww(glzA@Z>@T6@kyL|_n-|KX&l6D-&wx2xgxX@|) z9^0{!eJyjz-#NRW)Oymf{DhNMtP|I~Q^mb=DZ5j^pyNq3!Apb2@mO>T{O;Y;QtJ=u zlzYtJeU=w|T^wXx%{Azm8qY&a%eI|ISr1sS(oKc1o6)(uMN~nYg_Zw%&4FRp2%=Q! 
zJqza|C*Mf142wJ8gYE-px0X4J{74tFxO)VrSIW8f-DHn&jtm|ylP@=Gn>X@>4eN`T z9^J;C?WA5g|2|!TUVZak!~HJr#=Zx-{WVE_PbB;Ifku{>EN$NC$Aq#oIDMjveP)Z@ zJ0AU>?|W|~4;ZEQ?@bQ$ANM`g9rQ6Di18ozQqw<)=|8F&eDuCPjHTcA{eZLl(6Hzr z9`*HLPtwqrnjyEZ18D-o<;O#nEW-!KgQJs!c*(icnw+x4?&4&o62j7sf z4a$$y?T-{44-JhBTT~9C5G;)oP4(chPCO5pyzyqt@%?1h;zZB!1cBK2+-d)S^B7y)z_tIxOEM>#kTKBJ zl;G$j{!$+w`P4_}sas#iI4mGwz3GC=Nf`O08tVkD#k7_llOWl&TZ%-c#v<8SR6a`pYTN(diklDT%KiNZ&woU1kbiez>*zUiRzkBf&YHHiCx- zA83qbt@RY1w?T9TXB{o(yaYc&=;q|Vs`|f~Ly*oPa%VyhK92i+v@-gb@PDN+C5m;J z(7LMr`fb;Bjret~s&$>Qb-ly&I}{s+LL0{V8~0o{9>i~$R&6{S+psv?cto)Y7uvMe z-?Vkzw2$9(tlE4sw&`-X=|-{TA++VCzxC90>skEP^QtYsv8{l^tsshRgwS@V{&u+Q zc4Yi^bk%n3*mm6Eb^^stlF&|y{!W_fPI~-KX4Ovi*iP=@PCmshN@(}B{%(=$Zb|(0 zZduiC#n|qf!`-(Odo@CPb^3b^u6ys|_nNErTF3U<4);1J_R&K7-TM2zuKWG*`-4^c z!(;oShx_9c2a`ev)A|Q9t_O4R2cN1A7RC;i4i7$49Abq2zaoXv{clp(m$NHYDggAy z4dNSsYk&auPYZy5V*)o4dnFP*YLTf~XnkXzA%0nHlK8%#5sH z2o%hs!bW(LosIVKCA7@Jz&H*k5NAz_kKX~x-W21Gnmu{FOhXOY_fh_N5etn&EP zBWovTYx4nX@NJtcO}m^yJ9ZCyt6)cyAxAD(sZ_l*0EZ<83utZOHw$ zoVxbu?e>Dwj*Pb*^DXF!U35`JSI&p7hVR{|-JbtTg<&f$3I|hojLXdjYm0`{MXZN% zwCajSv!%Uu7YFM~#`Bd!Nm#Y(ODA7zrtnz|)t60|7`)cb)ov)CsW7Q?S{iDon0ss0 z9mA^ASox{WVY=L6xbe-xJNLDrT%C7SORc^qyGz6G-hS=~#v=vmHdSM~qd@$YBTY4H z{fQhpdAiND8^ammPnJiT>$b*`sKois9x&Sx7H$bhSfN6q<4^D!M4y9XY%obx#*R-~A|AcD!0VJ%Mj zg%k%Bckwm{wduc6VbrMgWHkV0okDKn2pOkw-*LlJTpPm)Ql6;339|G>69A1Z9XB(3 zM>RRZ?iTE9W;aFG*hL7|6L01EwiJ^C4ZeNCB@tC-grtfhCsgtSX3>e#tErrE5)6(U zaf<9S9D1(_8rFt+_fjRsi=IdDm8)n1FtN(kNJyme{C;ql+E2iB>8R!j;N$phWu(DN z%{pXf80;xgR8J-Q^NVE6}`DnvR9;$~EyiZwh%aqR-YkyN=p<1FZS+G=CM$eiSYnDMdC6)^FkC9IN z@DD01<6o$-z28(A9x=1uUsRYm;cqHz0*a-=NXg|^LMgdQ9K4ABl?t=^Clx0Cn+ntZ zO@&$23Mk^vgbwo8eW>Ao?{HEw$bTTRPRuzWulBYdR|~qriwlJ!8Gb2Wvm+&Sk)0~R zrsu}|Bv(CnEBSzUmV{JXv4;j#xV1^&mm|a}kn28#QSX#;2^~(9+X?_Z&|xx83dCeY zJnr&28S#q1FRPG{jbGaVu}Ve@U{P-@43~7mH zuy=r#ks#4+V{&TU$`dy70K6z-&R;hudZgN!T`N|3KGei7y&`Pw7V9ejT=f?#t+h&i zCjETwv{mtZF@$(%*pjS~kS9O@Pq~CgJyu;RkpO?2pzPsN%`5mj)^k+^**(>yM1=wWP+{MHQ(<^isj7cbVFiCtVGO^iFp7Uig{l58 zR2Y)sHx*WZ^RHBxfpVzX6qX8W`kM-){3jLm68E1}Siyftg{AywRG2oF3R6S0wtu?T z1oVbR7LYkS|F8}VxgsIM9J+5c44pAjz%bDqNuf+XPM>OJx85iMkR zDd2}2Vk4ImiMt^RR2KbMEO~?jam}n^Ta)WaJi@%YM_l;&*EtXUXKo{#pQ=o;q!V)F zBuF7W%IbO=f#9QK64e8kzT7{kFuwnd3L^;%ysq>iY^gUF9k|_G3D&20)AKOYr;{LH zJqACu0{nHTe0I~vo;Aa&N9rdpq{w(@~S zt4R;5KIGbTjyHG$SMuY5*Nwp$Fu}WjQembm;s2n*9{wMxF!|#Wx~RXYun1z_gTJY; zPyeLC0MKw&Dg?2o^u6uFElRy611;x|tm?Zk-jMTsj@MRr%wA7xLfvNu=!FL44b-XF znb!wN(1cWF{JlJsJwy0tvHyp+`;2O8Yxh8(MnV!2(&#M|AvEc|C4e-Mt`wCfASlv7 zP>=+WrXWp41*C(3NN*w#K$?gFL@A;)1pyVof{1dX?tR|S@SO{jQ=kx zOgs6))4!=OB|>5>%J(-F_QBv673NAuTl_l}7QRrbwejCkVaklSKU5eSu4uzmpm+`` zNvc3AxZZdjDGXH%V%Xa-sW6&+vcXI>rTERz;j?qds2umRI58FcNLXTf^_b8E*#KE= z)gZ%964VOdda;;(k+X(u9oe3%cj}&5fAAfOl8}k{+2r{4=?T;Axv`KNon}$%WHyhO z;yp^KoP#98RiJ(nNZVJ8dVspLvC#|%vDBe!E(s59xyEvBH`aWl)5kazf)m(`5rg(Q3DyYV;|Yh|Ibud(BD*; z`~OOXWw4}K9V~*u&&5g}4ZkE+px!Kx9K~rJZ35@oNWg%QdqlM4HU*df$*Vy-l33ms z!eO`swJqw;0PunFP`>ztb1UqT*&JhJNJf#`XJVps16z4B?EKt+rou@7i3)rEiwYzC zro!_6Nrj2AthB(?ApfGm-2Xc&Ocn!9V^U${zo{?-Few(U5CA^^iwYC{n+gj6D+K&S zg^@Af;09O`89W^HKT%72&F243g#{ySPuYJFzaHX;IK^Y$F`q34(lVEKr)e70#B@B#(F zK^y~|sa(*!Ekaxo)#5#@S-+SB3<_)dn9E1GfpRLue9T`?J6#V(ffS+3L>&MK#h(ni z5P;@AhgRbt-V8uPMR(r~%S2^ICKWaZ@0ep**+SVTNM77KSMCwVesiwuycAckQyDPKV~94YJSZ#c)&L5AGV%u35w>0n%|r&2!8 zA<12V56vL`ZEQdCbMJRSjw*3Am|@epAQLsn6cD5V&n`=Lm>#-214^^0F>{}*e01(y zvw?2qDgJD$qGbTvCj^^i1!{Vu;w}R9`Q|-K-b&VoJX`jay^8ybFvTa9t6jaVCKi!W z53PRAu{<7x3?piBN7-Zpq}{FV1}W5hE{k@4gY-Lzk$}NXlwG2VNc#@@oourWaA8`=rEq+B+RRtbcis@HNorK0hxO> zVP*0P2~@4FQ&wMWB0#mZ2@k7g>uPnw9QgUTOe;|?>cP&z4d7Zf?oEx{2=Mp;Q~#rC 
zqhshD+eZG(g!3l#58diX#FFjldMhWn(>oADzLF2be^6mf;Libw1+~k)b!b@zY0Z^v zcu}e0M5QLa>8L?%s&QKfL+OHTlmEw-2do)jpgO#^>|l$VhQWQ+A`croBY!cp&Q$XS zEq*GpWWJ`pc%yih3hkO>IRPO1*0hJtwOdXj@))SD4f8H5)G`BcVgnM2DM{SZD$zLQ zaJ?X%lvxV~Mjnd|j)bLcJX9iYux*pYUF~%WA2qtD)XG|^Z%ae#F?aiM=R(~&v|cmE z(;Orn)I7$Hs{`*?v5C&$D61@5|A`7yme1uDXORW4Ld-B7 zbNYu`9!`BMc3~@Vi$n|W7JIh$NuKJH-6(c_-ZzcxzA(3M)~Y8s5?#N6USkY~38M>I zTQ5_g_WRJC{KIL0Vc8ATLvy)~5X_Y@tylS|i@^wAB5da^?njNiG5rx;d$_}Qm~div zfNdC1H_YdD8)wb#wmLYb(C)hXpqRhK!?#Z=5~)B&%v#A62WVNEfxVBR#$4~sV!(IF zkF2MqXA8Mv&9U{aU1%CSa-9pVgGK&eW<2+<9ur)b8EPP)mp&Dzx%N>D%`7$Tyt|;5 z2-b+t?N&ixb7H@hlV)2a*LF=ZdQagFyW41(!Z^WgoGV3~`kgfP-%(-aOw~y5|a`RKNm&7&SFh8vF^=SN?UxY>8Phiu}0vOO!|~Cg~~E2B_r>6{xM$%n24V) z!tx47LWz}y1;LWUhQ&$9(*mfqsz^a5L?OVuU|hhu6#p?rK)4NB@WN4s(DxWO5H&4p zk5QPKhS#AVw6TTi^uqRxF;~wJjy%R*)eK>`X0Hc8*3trgV#*Wb%Oh9IwojEOYN{Yu zgA4giZl=6k%?(I)f;Bm?xf7xH$I3tJ;1?g@Uh81PPKMoc)v$ntemQ<(<24IR67xJ| z_6ObAz2^Twh0VhaYtGyJI~7(&L`C{m>in#1c3o8ei5hIH0x7V|=AR7(EpaPzDhpzT zUl3Kq(V6xU$(!Nn#@eMpb3E1a-zgRv!_{{rU?W3{QT0{FHO`fkY3F<`FSR{)IjeeT zs8>DkIln>;Vg3aO2ti$HLYzlx?THHGjvC2ot@H*V6Mj0R3|jLq!`IyBlLTwof8Z{3 z-~>-cTfc#Or~MZyED_?|6{c$Y#tSW!#|Zk?9F3sC6(6NEj*+s7XOuV0A`j5sswCJb z)7ouewLz@%Yxv^3Sku!-%nxGns4kCn-a1KVI$T;QmOh}heLZzQ#$FoZD81rx@crAv zFe;<7Y6dr&hdX(z2@HhqFf3r!sEY==j{l&-LaiE)8mu+G7j&OH=x!1Z3#7|fvAs`D zJ<}Vl8IiE$w>*{D{9KTti=E}2Z%eG&U1m?=0dCPSen z)}}W|nW`1q1zIlqyfd#h2+!!s#Bd!ug3^ZsU2~urfI|a5#;ar&dL~rr)78>G+XwpV zYGcAXF~)DQM{W<^KZ0*&?KffA>Q)x%Yejef7EC)Vt15Hk@UHB{E8YEMp7d(J$2 zceEw&^+?Il=kJfVM|^(&dgl2Dm##z4P5TIch`4v`8Z^=Ss}{ou{$6u4*Kd_mxR*t?n53NGwk+{pt-Ea4* zpM1M<7e4JPzT7t)f8@pI>|k@R+qY5$3r`>N+FIm)XLt18gRjn){5zhn@;%W@01(p1 zuhEf@b+(s^p6Ie_ecCXO3yYky>8H^=d7c^u_p9O%BQ?3J zSR0G8$tN2v3>r-j9eSN=vuAeWr-pY;%4TLPN z7x@yT|2dLQ79YqUvdatzZuZ86=er6xaNRDoj^T z-mg%=6Q`_hW&Vf;D@)2u+<610!*a z7>shv!X=89!yLPuw*j90qNJtbSl_kxCU}1~24DTogpFG;CvZ&2;qmIgx62$Xo;qtu zAV_7!OGBaC;Re5{FaS^}fr_GNzP%(cSEO<;&B*m*;7Fk$vh=90&!D(#+AN0Ea^Y7mz7c{-yA9cibriWMET_NqhOe^Fs4zpZEW~WyE1cxR6{0@xED+b6)^R>*Q?IL2n`Vx0m{ql@ zL>T{z3hPHhlG#7$Lpm1E42AZbe-#{n?_}l{U8tQ=+|lplyqyUD(q0pr56zbS^4Z_` z^v77|m4RY0kio@6)`iY?+)OHrLn4Dwa?p-Rg()(rumV_U{Ya#|3Zb8cYz&_S>tuO z2E-K~W(l-%c=jzQ@z8EjIg|U<*rIIz) zSYQ?CKn@_5Nrk0ikTL&Gh1KxAs6hm#m5++q^zGXZM6kWO@%%b5U0o3P?^IYfszJA3 zc?a;D3iHa3RWy%dm2a8v7Bl-)c%oq~nDxYqWrqWChtk&%won)kb;IhuF{!Xb*ZahO zP+{r+qQVX_sW8=lQejlKzo{_8KU5g?4;9uzVP3@U0vQrxgmQY=n4%2GERGex$_L?O zN)ep$00_i&OiUys=@by4oM{5#poT#%Fgp$Sse_dckH7)FIa5P(y%<3Z zO_)vityuG3L=uq!#C_Hu4N0J?^tu5NS>-GHKooG$kIPvGGfSl_EDK3O^J0KGT!lSN zHP7v}Kc?ri7_SC*-u{>u$nS=Yol);ccfYhe`SG4FKWZe&KXOTl56G9jS!8(*%{B=b zDk`VmZwL;L9nexg$wT%Y(mg4SWwD~+=~Ngg9pz0;308Tk1(OF*#meQcYr(ilR02f} zNBcvC$qRImSJtSAEzQ?Qn}S!Al++W%)u}M;^fJC?;aJ_NdMJQFLk~Bh+iRdCfIy8} z5MM{?`PQcTE(K}qwkWn$v_tY16hMySx=aDgkpXks_+1*%waZVcOYPuAI}`F=Z??A5 zi`!L2s<@;F_-{HmT0EYW`Q3FKq?It+2AFMmOgWjLKzXTzJ!92v7#9-^U;)$s$OM`{ zQIl6Y0Tm7i^ae~FOg9isf2F0VM#5e{uX!@Kw?P#87!1JXHwG!C!@dI@Ug(5Wb_eGJ zzKK2v3N|#^H88cK5h9hhY0PJjk-mXu5{xxDS*=`Bm5ohJ%;?)&C3ht(BAbezqJoL( zegl9OX`NJ2>=jY08UWz0lr9bchrvnCJ>7ic^U7H6fxn$FFwomj z{Ff8v8Y9rcUDgFtJ8jez-k1Kz342Ta+X;h81ptg>9`G_Q1~HwmaLDoOx|3E^6cOrH zUK7k<0oMR*t)PYjO~UGc0=9?vUbA0LSdl4K9bmzT*UdK-2nQVBP-6oC=GK@_7>jK= z70PI-DFz=WX&tYL2iH&!ljC5nZ3j(2XZiGSNr0bg2YFhDsvy6dFedkMgzZtS%%k9N z$n#(@ghcJWRT_-xxO{m~Q)*Bx9c-MG?3M_*9@57G`R#-${lf`UXWrV?0)+o^!tN@w z?9DQrFz`iIY8x>8Zzt?1(+Si3%LyX_|2Sc}IlrB-NviMp_yjOh3sSHsZ@YI%-olFM zgsB>G1Iq8Wahv|_gayP(HvoP)VVM@?m4BQtfc%Eu0MiLO@tE;XCyW6JSxBO|#y^c5 zobX@}Y-m95n1a8YFqU6V80nu*SbWH3EY4L^AR?Z?bi&FC>=*C*a4NwJQjUL&&`#+XdlUNe~Ln~5Dib;qeCj<=>W 
zy2k*}1^8@Wi|rnhnF5OCC)!SmMUz3%X|eEkNv8*56|c6VC&t;GSU6MCKb*QFEDykP zw?(hTYlK@HKd^xbWRqFAqOAt=wjoCy>^jxQEneUL=`^8v>ahI-qHa{C{{TqAA;BOq z!Ila!+D?+s7qe3tH%gfx-;{Sx5j~;v40!Xfp$4RYbkFr2^6=qjlGr$CI2D>k##z%~ zUC*AI3)ie2vDk;hVGd#oyF@wqPoDwYT+h(Wrh;61{iz@}_4_{SEZpS)BvU(C0ZXk6 z1qXnoCP$s@sOU87Q=T}lluF>GBW{N#e*}WTWa{3%dNM>F0BE6WL1e!<0$8{KRZp=j zb0+vC0GNZBxK9l%i2>(3usPr{+|-+V3j1c-W+Tm#&j8RHmRX_?oJmw5zya^pT0U-; zcs%2x4l0T`e$BxtH%%d7_L7gzv`9o8&_ow6sgj<~79Ie`IZ)vPs&12176(9l?C2hP@;IWVacUoyJ0BqT-N}BP3Zqitm{DiUjC114 z&Ri-ypBiXOgJYV+v#Hpk7?>4+jc7v%jD>qsFafbFXI3O*h#;K+b zChwFRIh9&OolOA*2R{U-wc$oh*_6{8p8|jAw*p;jD;8&VW{RfOKCV1^BhUgJYJTl@WN?#If;x+JI;i|zn!o# zu_O}!G&eD`*}Hm`*G=cKnMNYbn4@MLQfs)|;$ z+n&@V0vjL1fx5^-rw<;-#O?cH2o7IJV>)56G=F(;6F*2+m5Q1Kd=+3wy>~yu>iT5n zvGd$=aGJ{>C+xw0IAP;7U4WyQ-P9h_2}Ac<>xg5mSm&M`aKM}DzZb(&(%d<+dj3)6urZx53vpJ3u3`r%JCSS_)k$zcGOfT197zTe zxmhGv4!dAtt4)~=si&d=5Tg1;%`V9d*8gzAWW)oWJ=*kCen*vdrV7?h z49;hFv;zEW&zJxwE#3x;wP!J%Fe>mSasz*3T<_TjeCyl2E)y0B>Wk#bw_#+UrTRr% zQoMV(MD%3r`!{pNOeajyKV~2}+WWT?7VzO~>fcUSInxPC;7hbJ=Pq^S;Z#sO%^nrM zzay>!3|@i;0Q&(DGT?N0e1*Y=_e&?H6Xwcv!VZEE{(n1RmBSCwf1I$cVE&J|gm)vc zPnw3^oTNt|y?4hsGMz9T3lf!u_||#O8`B^7ZYI$GnM)f7t;Rq}@QC9#rW2OkH}!_; zgh4Iz0Kc5D%FiQWznrilrV}O~2e4o|VcHfZj)ty^fdSx@{ZoMQKt=zIWu_BmwiPw% zgMJA#ro)&_*j{K03qk#AF4GC~@sF7czVv*n_>U7t_5b!CP8bz3yNYm0`{RVc{&K?V zLZH8#Fe;k$5s3Be9y*|IEE&f;@cQ|=9Fj;iUI6rI$>Q$a5X^o zmhr6LPFPk+Ogo;2O8mT+zBT}`IQv62xQjcU(gKAfCu+I^d5PD0W@&sIoO2?ePOOx6 zXSl<>H17&b?`xdx6u9IixJ413eh(fw%5}l#*sbMcfAW6!so1=o>(o1+cB|i>OjN(2 z1K?0^hhsWX@)rRBpat_#35F)~V7jJTjlobs^<^tV8W?P-UgcnYj3o)Fm17H>BLg9N zQf(A@*8)U>AHhUB4c7w%69m}+uQ%vjK)(?hPfPwBD4~%zN4slK*gWaFlo^R!vQUqJ zLJ5gAL+%fS04rU9fI9{N+Z@guJK{=C6CA8Y;%2NJ!=hXXQgU zK*l?Gaxa)!VIc%_V|j=lA)a_BF~r0UGg9a!^~?rCgwHYlL4|1<(z=D#|D6g$X|b!k zYFi%cli*kDSo+yZ*0lJ3Ec^6GMD&RwOV!M`)6Mvh4Ts7Zuig-|OAa zUsRam>B`0md7m?1XAMr^c5dx2wR)l(ko9*fIViiGJANsqqHi&TY6G3WK0$w$3xtK^XUxH~4%QWza&GhEo(SK} zu@TP+9CGwJZ#?~r3hR4*NV+WIXmLy|AdvUZgp3AEZUY;`nam{ zI0$j2jrZ1CaitWz_iAuFC8w_90o-$K3qZsNDSRwwfOWY#3EK#^m=4V4ajBjma{fUw7-O`bxAd%q(D850EHDQsrg6CdbvE+5QZDKd!6v~Sf;KcgSK#-(x z!dtlNX;vimR;rJj-~yS(FMp(Al3$;iN2cjo)uhT?2I1Yu(YoCm@O4jima&cgv5fFU zTa$Uaw^|3rcIk0&&LWl)oB`B_%r6IdAMe+I8V&=XILB(4R9MlYu(WnH+~^@IUP$jY zw?w3?JIt_UM3OoOUrDbJ1rq3IQ4@dq`77Ts_m(zcO;6`)jL z2`6;-{sPEyUmtwO9kxw{AQZz7DjV6RZKd%d`sNZG$wdYKJr&l)Ff=y!iwcu{YiwEZ zn+n@E(aU`-bNIznhN{3NTWMWuEAc2WD>>W7S01Q)%H;U4S{9;AqA%wr2cPI*!eS+ z@1IHC@{&qA=jVyj?IsxwbO#9D8c*=DW-*KU49==~OVHh^jb(QZyS|mpB3_)|F*Wu# z6?SxXtV=~t0eBAgiwg7eJ4$`=T9Ks!hv7_MR1_N=8{q z?Wg{n=jBHTmqGN(kP%|G?;8q}3fqoRL=;EzN+E58qEorWk14%*YChTiEcnx~NJ@-p zQR!H`?vu01^YAk-?z&08FU^77d=$0dFmZ2^w$D=d$nugf|Lu-nR9HpCN;*7P235q8 z#4a$xHk$0*aJnM0$L;&@q^(6$bt8V&`rZx41QTaAbYQMq<;}foyLO*7ofvJQL`SvT z;W_uuM(;EU7SGOFf!C5vg0B;O5nH1#Ei;We<`3O;t-r74m@m-ykphXR zqxb^+DQr(Qt+X;Zp5XW11kg6v-pPyNSoWAynBT34MDtfv$@+n_*Zeb5pf^5Qw4QtM z0DSFI-)s4^Bi^l|-Y+-S-{|gryW6-sADn;ey1n2n#_9L3j|&Q%GG#wBN<%kFtsh%G z(%E)e=U95V+Rmsherxc0$3b(^2iJ4N+a_y&Hc%kO@BE_gQyND?-S9JKZsvuU_cYmh zEx&)yIe5QhzPjc}!=a6Mz4qCnoG&puN6+rhQluA6ZNJcYDV)A%9N9^Y|Moc8+3n_= zv&)Z9`0B*yej4Y|X*kp=7fH7LTpj(R+!gBBta?1+ed^Ss8)&cM%nL7T`mcLN%k9pC zbJw3~t)O{*BWE57f9vf(B9^$j`_?vkHWKF68XECq(f*ZF6T|0gkK9j+<%Qj5$iqfTtK+-1+k4;deMc?N=>eemZ8EoavH5^E)2#)F`eR45O`ZUC z^S?|S|8YhL@^V@}Jg2ng(*1QiZFxG{!I&kU$XejCo1V{#zM0vZM>~+oUo{QyU{YaB zmMgdPDVuKt__R-tT00iAVy+b)BQBPr>KgN7ARlY<=;y;d7o4DQ0xIU3o!OLktm*(w zcmBpoi#2X`SaIVJ--8(4aC((O0e5OLu-kbLzOEoH{~_NtP_?Q;3v zEy%VV(u3TH&~~q&UJKw}ESGURXN5kCIgu&3*gov;uU;f*q)+@IqB>1Ly_kuZbU#NF zIcow(rVXB<7V*n-^PI~)S3^f#9t#N<_0{fqfZ3zGznY{y@`%U%;OqdWZ%zzXcz5Uu 
zSlojS)z=rWI_&!t#g~H+Be7u)Z>*R35;suy0W8h%+~xX0T)-6x||ImAO<5XAeG z9@FYh0CX;*zk!d$|5wb9DSQQ8zfm@ ztJg?@>9qk^L7@%=&X_ zMwcD!$rbp?O)SvHD?yoUpD%Sqc=#j$WfeZsrJ0#4kIS`ybdm{B zRK^OdW3`=CoT4}^*x+4Qvd599VC)N^EQXl>%JLZLyOeIDWqv2h$>-{uv8$)vMwKS0 zGr9z~yG@70qesaC-VDtb`u*#`?ctuoxgbfZWC70k2i3B=dL=S-!NmT}QtSt?+Y|EDiE0wejiA2ZYNQY|3WuyF0BI z4n$0vU+9Z3+X0swJ>iG~Q*4UzU{VmwJFs zdOCZ#)zM=6`u4>e3z`V|6Ji(n%O!38~B>4k*`~ z2jAE()6j6KxVEpAEW54fZI=;SRgYd!m1>L61773ywj1U@PpXIsArcKZckTV! z^`pXkMAC7w^s1=7c-juQb@R?pUDYE_UL0H5&bEPTJS@DWO1%8gAoQIk+dK2_RYAoS z-=}YL-`vp)DVGp)iT7Kv&hrjqt8idhOB*Rq@9<`NAej|!Wwj-*>_C66TmForaJN%9 z)M?WjD0e@C(bX5>&qnS}a86g|^&6@12}&DmEj|5dS0d`AjIZ4a zUA(GO#&UvoLXUQP4_`Zw_6926@(8sJMykv7o&!EeuI3C(*X5|5w>enD zc{hV&RXWmVlcloq<>jS5qop>y_8{KZ1Z+RnhuXa!NT|M5Dv24aUMZCJeUb9^&4w%h z>-%&sVEblr`6mNpPA0c2hY1Y_`yI_%vA7QtAeIciDjj4;J@=t9( zY&oBa$gf^DJNahEU~OS)t=D@@q z-mW3%;A9cREm^pd!Y#9%YeU#G3~iE9ryW?;^BqU)of{SM2(mu1w?`X8LFQ-Z4euG) z4MO+(^H=`7_GxkxJ^2wt>&I{v_-1{uOi9IEL6dKPP@D|jJ~t=5wncZFMcXk@uI5sg zK4OdHFNEDlOHGGUQbG;Fp&`D#im2hb{Sq#eD~g`?&QS%AP$XZK;>Htht@K(tdg$wD!k7Y+zJPxww1D|c>nG0-KTpe+|LTx zTfPh6{`{JF!)EJ|_kHvkna^IYFl7UM_3?tbCdye3BZ?}aXP$z6%T)cCR9MgP-H|>! zkrh8!Gm{EqeE#B_FCQquq{5mLBbI|yPez7XkF6JNUGy9yss*29Qen+}hfb9|qnPKR z^G2x_n@bm8TI_+;&fa5EVPDn0`f{2Wygg-b>)rDCzU9b_#`7PTRM^=D`4DYv!mYOP zviMVvNq*8kuUXnR*&o#;sYF8dvRGuexYlJK1{8U*r1O*6Yz4*$J)LS2S*1Z@qC_<7Qdw z&3hWRnp|q_y6Ne;C;Rn!X!|(S4(IkMsL8N`FpIw zq?WK@YInIpPmoI}O`!NbijO`EDHg;R$r* zL%TqJPsXR%)>l0`Il2o>jtm1*jim5)8Oa;|y*cD+YM)SK4H4X36u zia2B!?48ezS3gb;!{0p*!Acu#PjFWqqO-oydgg=QA7d;T3W~g5HAQR3Z6%8|=KH9h zQ_*j@?6@`6?fZk?(VI zkpe_Nj5glC!g;Ugmaf|2#<$$6Zx2j0|2q|ySJGjW^vnxcpAj>J4mY|ag1J~&Q-n|J zK$JA4vxz|}NKk_^{lntZ#ogzPR{Mnz1#QekS`6E8H%&gyj3Ydc_bn29JWi@_FLoz3 zIfJ%d&PcD%ij`8u8uQBZA}p0mRGvl7U@HXe`R^P^$p@JW$BS?FE0=SNj1Uvjwh}T( z=W?({;4Ral?h~iZ2b$Dg78~p#?G&6W7K$o5i4;wGdgaN#QDG~c2!p_KI&H4tV#XoF zs#x+N{yU7+O?1q@WM&!p6dhfdz=F6Uq{f72u>=#vVw$|~>_3+iI-wnUh35uCK%C&gc}G2Ay{XY=i&mx=Fy zx?&@#irAZ~U6yuo`qaJ8#uLrnrUo1lgd|Y6RML#G#-2l$23L)=w38q4Fy|+je zeaXsCX9jKyD81e|9=#b7$B+Gxes#obK9FRL9^Wvc>#!fc@gG!J>ppR~a=LItKwuoZ z7N`Isa&Ec?bc@dzp&}AtwE0Gwxrtg-?QI|Oe#oZK+v-_aQEE)iox?j(>&rZmX zZzTAdh<-8VP=DdYJFhJ~ypitRpV%yJT%ZaqB&fvH6n+BHI&;HY9-fCR+esj&KEeWA&=#Wx&+UUoGW9oWrHzcFa5`G-J>eC_c zHx>33_85`Wr2a&0uqy{+mZCjVv?m}SGAM-B`Vn)^|A<)mYUPt1t8=TBoj=zIJ*hFo z*QFajeYZwo!9({*TgEj!&8!~9G;fWsD=TiFUh-YEj%^BniGVc*=mcXO&kLWyxAn}Q z$_#%VaKY32ewM?8W8K)@*le%qJvDw%uX2}Y7Av@0NJWNUHYAGdPOce^VaMe?j0fmB z?=ILI{7r=&#*+72}4-Nan~iwfJ>uF)T5HNsppkvcG^&k<%OqQ9`jHoYkJ?GSiANNU!n*z}N5TT+snn2pM;2a8$Gqod-A5}G}eRb^=fLS={p8(-WC zW#blXHePrs-^uC>LIlB3HF1#m_Oc-p+*of$S!1DHUH9waM{M5 zdKxi83O{X!)v`||HW6PF-`OBzlBJjJI`tKcL&Mri4=1ea=f zN-yuxORX4%Wsf#lFIIIm)3T8_p*Aa531J5`zJV*s(pS}Z!bRd--$x-Hlm<2BKeb6a ze23+mnM9TnSr4Z#8b{=wVR*0{D8^Q9FUHawz7nJZyz`Mtta@{8m>9{YeA!A8q8x>+ z`)3y;e8OKHxLsDw3@z@BxO=;WgdcvX6j2(8du-_ zroz6YveEKc_G^wR>062L&RJcI*?V{ts;(o{Y{0*MX9$|TXcc$+}cJ66}n06NhkVs z056!n9;0>*Be?0ms4%YKL3e#gg7L0gtYjqS*yKFYhZq+s_2M@brV+9c+l;xKks8M) zeB>`GEZcH2|A?~BvBf-|m0nwn+~tVBs4zcMQr5Vru+SeWOqz`yt;93*KT%=$3ptJw z0>Gr3s5IZsN&4KObG(sq)H6_Nke^Ftf2klz-k>d}P+t$Q}rqUB+ z)%Wy?%sdwkvF~&Np$FI!{v8(g;4%$n-gX-i!}!iE zhwhV2!mq8V33VJ^OX)5GYmP2~gSpxFygVfcn^q3I4}jN;zZWepjhR0_L}0DiI&pk2 z`IE-JIYzp5sQWm=C!D~4U!X*3Hi>FI?O88+Mc!IOOKD5o}$SgY@np~uuA@<#_$Z>G){kPUklPBnmJNlU@wD`Cc2^P$S}&0i;My)}<8g6Cj1 z-yV&pxU^5kEC{$nE(+;g4?l8UAS5>Btj&^hnOm~jYI4qr{Gd@edaW5mebl0tbFqIgc%rna_JQ_X#I(2`MvY5F_X7L*t+-7pmxB9 zM!B>|SEUe})gCY7|3@mUl7YB~LDpL#In0GV(Zn}^LfYN!-j9&Y3^sa|&`_G^jvL#| zB4;)gYC#d-aATJ*U|XgzK^tNu4ckzl`=&r(8Z6|+1hd=)GkkQLlsO$5XVmJ2hHM(AO{PYQd)h;f(LP0NK!N@|P 
z1qzBuS}|j1ofSk*S)-QSIIqp?GVv_GQK5}_P9q90XD#+tpa5~6V~@>Upn=XE1>z}j zCze{4VLj0-M|lwE|4*>DfHNiz0fG z!fQd{*QSUviK}EFzI~2^X)HxjM88w`yyiK~0|l637om>-(mazjV`5Xh?sdHG?gHAv z*h6FZNdQjUopfnTXkVSKUjf=IgShHOD5hcq>UeGk2v*EjcqbPyI6%1c`5YHq4qb!VcrUB-MB581D$6OtmIW(+kuCVMm%zH)godP9GH^S~b$0Wc|8;JWfubw%M z&j!E&^O#*>LCq~pH6Ff7h4&JXf*Im(fVeMn7~f!lCR?r_6#gXuUVC1+5r~^4Vh4lS zxrO;p(vidUXs?g^t^rX4HbOgYyd^-fk2ZX5W3nX_ekVn?Wja@w4f`5h1FkQG`^346 zA+v-CI@RIS#>A0uZ|D+b)_h}De35PkV1)w3^O#Ap=tG*}VHRZW1Pme6s`#p-oOrTXc23c$hH%4n3RCdtd6`_C_&FlKbgck2pZi8W z_YTD_ssI;d$99cQumrLiQSe~^A@@0sVpn#HO!1f2h~xt0`#9DdTSO_A6+O=bvKG2V zNA5E0_h?s9c38L{)9#z&an|SmLS<@RJR3BnulZ;+MJQ7E7>f-*BOeVYKs(cqe(l0r z6yPM4B|l~yO9>%1UOhS^OpH*_dAx=3wL_pO$E8)oBY~L7eDtKUpqHEdS{LE<7N(bK zf69$Y%E9FQ5=NDkDT&AJC_B{W46ckfQQ-q zD2VP2zlTJB6(EKcggF;RD9nJqp|C|#@Qe`g4Z8UcH-X7H4tW}@1)cbiroZNT9H!6T z+r=*Y{FH<}$BkftB{Ke~yPz@!!LSJyror26@Vzt?H-&fInlCSeXPwDCZE-d{6Iz-G zKK(VUC?oWEv(QroURf1nu^Z3o93j#TrtRi+|ISV0#7W{J$UTV;`VG7 zG_{@$ccm}z*TJ01z~GYnU>7&`YjjPxh)(8~(2oGlV@HD6fzix4xRf8MFwaq(Ay*$L znAYS8Q-XHOd-r_iUYz$<8N;u-647qX5|cvO6m&0;<&%8S_XhOdWDqQG3tvCVwM68h zxpEnAheWBKf!pI*L1+96kl!}WfHw(C*Ze&7g%)VUvsCfuV2)_@#1mIFZe_w7XxyJ^ z(ENJdkxiru4YfK?aKFxxJ;&t8c*Q0qgbN&|@F7>ylkaB3z2;Bvd(Ppe&o@aV#Wt&b z2MQ%xBVN?SG}4jDfg0C2jtL7pOFv~=b*Uc97{OgE~)pM6x$ZGi(_ zz}-&f&jy@FDp-~P;m(1at8;u!6L<@J)c?cYeT6l-wf&+Ggc5oJp|=1^2dUC~?^QvH zf&x-Rnj$D*AoSh@q-c=dK|txf_bx>+bP$msMUb|GQ|FxTJKt}8-#S=(AMACWljpkf z+|L;IxX1tZ*w{CtoN308@e3aUf=~FFpae|Z^SB+Q6CT**W&2J%5;bs#+N6CSu@GT;k`xC zK264JKt3jO!@mnBxp@dK{tmtiAUV}%hGBs`TUb~YopGOpBvInk_ zXEn?ZG-hB-vsff6Fzc;FJmfTmofoG5VJ^EdI45}G8d5YVsPdq+yBUWXd^7d!n>@#Zfv2KXJd|7d_ zSTZ9|NH@oxfHd8xyHuI)b)*e|N1H~>UQe0Q zDde&$Fsp$3zO;2Y^$aZjwUg6CcCE}!Sk^l4R5Iz}^UNq-Ws(-tInP)7Y%LNW!XOJj zz?ouCyHj-&>WkyhAzRi=gg%U_i3(~r-6LA5n=+Y5Pia2161_L1U-Oq2b&swe5!j}y zw0=WpcJltheDTSd0-b7RYfyy%QCq`IrQtn8X_JpN$)Znnq&Sam64q660&{FTk?)=f z>hCX7!K@R%1Mc0yW|wKDuu4neGp0is(OJBuX^W*f+l6}0F}HoAYOC!vlFA(v>fL=y z^M?A`Yph=DXN({N6uVIj~Eqzm#t-O4kD@%wnRd7o zW38+GEzC(RtK(Rd77Hr|uU1T&R*jg}Y-QK9&R2|HZCIXf82PO^$*#J{u0Ls7e=fU< zc(r+JVKd>?%6-|bWWSBtg^fm8oHq+=ZnCS7U#%N0toz^oa^uw(jq}Zm=7p_lzb_H5 zz78^N=iT1cc(q-|^fl=A*MsxbW7*Beuh2>@%;kPNmQ9;yOq)5cR>m`F3!AnznsyhN zzO6Iu5H)Xc$ZZSWLASp`vt8_F%I+?|+GV^zOMOBY+&A1-l0m*JVrDVrGBD~l{-c}KZ>|GN_&2s z&dnhA^ON zBeYGqKv+P8vMB?E8(vZbSk-R&qX^vuHW)^Mh&*Zu8A2qUu2w*-X;IOUock&@J*lv?&#|bq z=_y@u{*WeXu-gVi8H7)nX>9p95uymT6t8Tf3?jb85j%8B-bUGC*TxaYA~350>J_-y z`VtLYt;Hr?6CrPdSOg-uYAlUUtlus&nHSb>j1jst)KXU96dx{NVhn!#;3#T1Zwq0u z<$2T!6MOlnB7mS-H^N*fh*yC>LrxX6QbMVkHut*b#)(!Zjc4nFgkE*C4&-Ns-Y9aO zFAj=z6vLZ};Ln_b`4aQz5qrrTU;90VWiN>L?H%`DdW>#ba}tGoP7y`~*YXbxX7W+L z4aog0Y2-|8n>Y&U@)IhKj$X5Ae9UHZ{n7?&>4kO7q}T2bniyU2=zX^E;L*a>uMCeq z`CX8RM}8&N^;&$&ENhukCqdEnCgCOU!Ni+DWcP}PWrhaG4UM}UOUqV(-$7w}(d3=F z=9Nl7DwJYm!-6?iEA8Z;)A;UhX#r^q3Ek0U{^Wl_>Vs~(_hEol7#8aD&0}}S$DKwf z)tr_+MZcpe?pj7s9*>lLozn;do z)gG0B1e!f9h&6(8nI`&ISY)9sIF5FZULu7%Ow(mpnj_+iCpy;?*XTWzNhIh#4&r%? 
zk~bg64lMSUb-BLry4mw+QDS1O7b9Af_2T}L)XpMH7UNKPp4imB10Nssojkqxrf!_a zhu22xcbR~29AJa7>2w`-t-x-ZP)fd&%bL_@KJ(7ST`Jk2 zN3aIceQ`-AGJUE+g{5|Yrpf=xgxnfng9Yo2FUjXObuPHxdp+eEkn>@h>SBC-y5_nu z8+&~#Ox>PfgV7dDP*0!HBiLXhkL1pcQ!{t(Ox$O0tq(Q0|$Z)M!UH5tlQ(MSjonnbDxwrecvo9etfcqaq zwMyB}uG*<8jz7@rE@gWx?sD(pz>ui#!}O^&j!YM|WeZo9^N5#znTlg?KfgKIBQgGz z;-drk^b)BnV1qq5G6Mh28Pjag$9VxJB5`FP(={lC{1Nw(<~@Ivo!eWPfDI;CaDhB^ z^0gp;zQXW}DsuJDFC{rXr79i$fb9P8CShXO-nVDZ26F>!Fxr03&8JisAVqDnFJCWF zy(=ENK52u!V)?}e3nbWJJYye)9&7Gvm$<0h8afQwiEJpZ#D1;?Ym@$-=#dyyK^8;)!KXal5uT^ra!hNi4xR+3CsCH|hC?xFQ zHydo}kd{N3$$cL^Ncf4bHW-L6+!PYQayI<5Uc{aaR`NC+-5POSCc+T{%*zmLFwUw7 zm#9d#HxWLqyEfQ%*!h%5tl#MXNL1vm4W=oF30sP~W*>D)AevxNmo}IRV1uRY+F%N-j4`*UM}Dxu)Bziea@Pi1 zL5Q~JEdQkqmZeRw!D9Xs8?5r14VL!14R-UJ4W{tB4VJxYgXQhoU>CpHU_yVg!Q_mu z0XA5x17L$C6#_Qcg}l2?#i=p7HkjF08_eQT?V&p{P6QilRrXFt+O7>&6wxkp*XIzy z26JP+I~H@9V1vbY0yfz6t_`*f*kDd^1RHFHV1sq;+F+J|4JH&zu)$WsN89g-tPpIl z>jI_3irEAkEP!BxB{)eaFO@QO1R3QW0ceU)f;B*9bP)sNxSc7)9CdHrVVp8%+2+8%%N6 z1`ANFBG_PTys#f_FsCfCnDr{a2D=3|HApTv?X5$R&c>3#?%j^#xPz-S!u1+?E{8mP zPUey0&B?Ftrj*|Rr*5q3Q`+CjPSCvAn#_x20;`c;7ZG;Tkll*=3v? z(sUYIvYRBRpO$+x2`b5dE<<>Zu-7nTc{6Z$3pME+Z<>{OdVsxIr?W}#W7FZ-2G`>R z8|;omv)yp>VwsWI9pGc!seJj~?5Dzk&8C5kbASzY>gkR|tG;Z@f=6pZLNj24xqoa4 zV{ctCYUvDVd12IMeyTF4w%PMktNmnKb6@M{veqXO?JGv@PKi6MfDM+H(0;qNeJP=> z)uZjRL`R!Zhtp)!0p^b98|_6O+rZ_m`L&#llO1Fp^Skuz1UB>D?dzxR7fdRR75 z)Az_oK=E*Z^3cV&(Kfcxlba*a|IY@KFPT;Bo>g9(RfW!}i_U2p%xSyM>0X{YSTc9G zd+x~E+)?Pfk?6dM!Td4T`4gAt%}eG_cF&()o40~4phXvK3>NHM7tUT@a4cCk*S+Af zw%`U`^blS2G5~T97SCT^^e5Cxrcv62f?k{wRdG zd@qE-SpN43Vb++wKMP?AKNZ5p|56B}68eJ>W|u4eGa<}(R|vb9NW?-hWYh8wLYVF7 zHzCaRJ0T1vI20g+r8d2a=14azItmD3u`zN9qhE!v)bE6_4+J3$ zevmD2PY83?B?w_K?yo}FQ9j_Un%?(9nC&+qO!k`)X1gndB>_U%=&le3Shz36w+?<2 z!canh5Qaj0*7~^+R{fVk7zy_ug)rM6g)owDLKycCLRjiw3SlV+zZb$re^m%8__spX z+3$ofB1zky3t_mw6~d_K{z?dw*cHOw5riVY8B!p@Hq7b(6uY@qyJt3_3k3v|g!xtgU2JZXMLKw49 z=uGII5XOl7UI+t4{9FiA0feylFGAS0zY@ZT_JlC6T_NlWK?rmCT?or{_?JSM)E6PF zwTYqiXF^z}0G6pW9uUH8_Jpv_KM7$(nu_DUB!uOC6T&vYjNc1k&;N5FZ2XH5*7{FE z*vtP+2%9AcVN{BhzYAfgFG84pOwj0eLRb!s%dQaSru9V#GyG8q3j~C)?Y|VlMw|A8 zFcJFP^{hQ1tbI=iQ~zBEQ~N;(%laaOMeGS-dcPusrTr*`Vg5-7O9F&22V#U=_j%(# z3Sl<8LRk2BLfF9XLYVcg5Ek`ULRi!cQNfQwSnA&hVd>w5u-^Yr2+Pc}P5&x{1^!$J z^ZF`;+5UnMmH-H0%>OKeW$p=K<-Z_=wU7s{{|g~3^AAE;>kmSh?cWGt34aj6k_bXr zAVCO|`c4SLd?$o8k>~!C5SIBL3t|1=31OlC%R(6T|1UzA@$W)d@K+%${i_gG@C!m% z`o9#y0(%KUSgIPCm$-1;o)ET9Q3oRXK?qC#CWKx83n454MD_pb5*A?*EM3tfwKg)q#2O$f^ags^zBdJvf%NX@0R)^}G3%lu9VvjdU({yQP;@Gl5qum4I2b0P>~ zaeG48_*Wqe_TLo3;tc=25JpM%H$oWfmxQq3zY@aY;=c%C34a#Cu#_(1|6T|)BnV-7 zKNrFh{#FP}_$q`s_L3cC{huj>mH&?v!lM382y-F_gfLsRzZb$t$-W6;w)MXtgl+tV z5LWQNB818QFA8CA|0IOHp>X^wAA?z(d2($k031Pl}FNFF29}vQP|4s;d zC@(lp5W_}5yC?L>q6M2--R&eKMP?Bzbu4(N`1U5gtZfdFrRNi z*o!0fAO5EZVM2=kB!p?~BNg8j!X*Dz2umjjVQB;*OymzjSRNpRwGo7{wx0@N8h{X{ z@()57^HtXIJs~XeXF{0c??PDKUkPClaX<(=p(UeE5W<83A*|zH2w^vWMF^Ap*FxAr z%3la!B01lLFz#I;?3$$HpMo`)|p9^6*|6T}dB?w_8)ISJe5ad@O45j_ILRj6;g)rEj5SC65 z!lD5otPvXWM+`;B|A1+Bdcv03i&jpPcri5Vp?ttp1A-M#d*jw=0CD074i_ z;zuFu`K}Q5QG8bjb0`3Wu;eJQ+8>25YwfV33C{YpUxcvLKM7%pza)g2{vd?aqJI~{ zh-r3(u;gD8!tVS;2;2MzA#CucLKuqgdm-#*l$ZXWg|KIPLfF#35W@1m3Sl64(f2~w zyMHN!Vg5k~llUbeOiKE%gfKA>#`#4()j15&C^&$p2LIyz^>d0;>=pDOs`Lqp&(q5t zK^QGR9195uN&@PZQb)0X$X0I?1xg(=4BsKqifvPD{M4r4#FH?pm@v&l=b#n>YONo@ zR!LHU)u=;gsX{2V=-_k8@VM_lUdq|j|1_24%+C#!H zyGR(R7l4FezC*&ci6RC8Bn)F>7_oj6lZg8nxo?89M1Tl-yV0upvfiTnh6N4Zjm7lvdc;CCb}E~8CBp-nb* z7YU0eNeOHwW!Oc+6#s;TMG=s&of0L|X8;mLwTpyBeMQ2~0Z7=xuSi(P9ujsl_YX)| z*)9@RYG4f@Vbx!dF!QfSm>;o7Erx)EHNSIew-}gw^D1d}fXjn%K 
z#kLK5e?h`-eL=#)2}qdS_edC-&sQXj5kSK3_w^8vFtXz?0un|amw?zq!paCp7=Jhc z2}66D2VsR6_M(ff${){bQof+~=6y0{Ig_>zy~>|Z$KCr_-j!VACL(4D$^KB|nuTS#I4Ba{ z8$xI5zCnI%mBO%;ROj_2>DQ*8m&K0XvIak!j|nT&kqC^@FIO>Gn0w9}DD6vPDecZ~B#DVL)n_TA zt2aSsg`L!V@JzUZSik~KM~)Hz5%&%h+rjnv$T%IqZVA!ydgAFzk=JuAYD+DS-iTN@ zi6M^yyU0CKXe1$v6Sp%1L;Nrlqr-QmEEWbv>7$1}T7;09VaQl2BvLT40wJU+7-u*c z)C{vjYAuI&4Vumaovb)b9}T8lmn^%eP3s4Oq=4vOhL^u2U16N0NC8o@)G(%i&|aYw zEM-D|fVKzX(N2&}!7x#W$eMvDGRP)RgJ=cXg;Fr&kTFOxsc--$cfKv2OT)zEF@rQ{xcbZDkrohsl&E011Vz4ARIOz+GNb&&2HfP`6_Cl`N3!fbYtFqUz#g=u;t010~l zAYsVn?g{`2Yn6er?;>ITXKIv4-`VaWVXd=K01_rJNj+?B=#vBmkg#{gp=w`|Fz(-x zuwzV-$9Ivi0bYuO-;gjNyz&%YA->bV5I>rh6Io> zkh!WrNcGcQBrM?YX_hd)&sxM$XA<_1Fg4;DULsKf5{B8{-z)t2j0?)T=S7$0`l46C z^d1sc_d5~>R)vi1@Y;?6NEr8^kqTdeABF4r>c#nPm;Lr_V;0(^*pj1_28n43krsf2wUgvj?;>GlPUi2!vNXa94MH}n0VFK#=**8u*tXr7i)-*s zn=pMw`5ACC4d!~~hoglG?gsr1z03>YvD-tF=u?Zky-is|N%#L>|<>Scm9Y!2KU8_?^_B0U>I1*Qh~|QhO&;00{%JooyN4 zL&8u88OQLcb@D_ywg3_~X6fk7TYGg631eG_XAh_?{I&ycTdp?@=TDi8Sptx-kfUm( zrh7=(4EDD-GkE|Krr{Pk>J{v_i-ci*+rIe~3A?|Ggr)R;LBcf7!7fIKHHQ_k5zjNZ z6MaL%Ox7LWi~~p*dKU?!I{y_3d(_#?xrcZzGTTPXH*em+C*2q0mP z2}qduvgNu1-L%KcS03(b(~owMFr@w1djJW`Y93z$kgy0b)C>U$d-S?Tnt+7m0!Ww~ z36Uj$gw-0DjS-MAKjOiGAr#S1uZr~E`b++vOSFw?dbwwdTpsk~Fj4IYn zDj4A;)Vs8L=8Jgd)~Eg@+hlXxOe#x-!4xTnr+-TiiipY?BQ^_ZOb2buVoV?y7%0jB z0%ebqL}9AD;>lo2?rZocB66}=KZm|sa#V1{1-Tlc`Vb-r#w-wP4Qgbhofb&(Q?rX8 zCKFI%fvw<)Ly4bduxYx7;7DoD9Z>Vn8fZ*`OPD7cX2m=`%S25r01Fw?zN7{kj0S^} zB0>2I`_&4qd*xl=5Klw#tP%NrY;-XHrARzDJDSyy0|Rmyz61xI>V31*y9m6qV^C9I zmcl%zXy>EW+b&(o%FyXEyOW2)B-Vc?gjEEkaECwnQ3!jfSk3A5j*DXdS_*4Bh&-5pP5H{i!JoTO+ge}$#AHQ@J-#@^pa%AOa4yB zp3>)N$HyWd43V&D!4dxG4DQR?P1YK^boocTxZ@VLaX2u@T@48cVg9*}_sVK`ll1F& zbdnA`H8wIFo|B3$rHK$v(>Vxzo6j-$-(-na^SdtEUv@92%4A(Jr5#y2eEi z!d3*WWbbCZNq`4XDiqvo2T`bH<-HzbC818x639(pE#$p%)k26(py<^2lpn# zrFl!y^6=#csln^r2@QN=Cm%1~I&rQmKl5aO%Q#ZdmfpwppdS1g!?pu+o={q1*G-j1 zh)a)a$7nWbQ91)WU9LM@gA7rzJpe_z+^MKg?Wv@T$`73E*EwK~DoI!QlyWFt#eLNC zo$I~I1g(@?p&MnYFHn-<9NU35ksQxkBYOC3JB{LGGPjAE3*27Ro$j;?{PZXgN2ID$ z+CZ-w2Xc>9Iye?rCf-Y$Y~>^BmSsGxs?#=;}F7-$}-`*&P8Tudo(~WK0-C+ww6}b8Oj1< zq)_t06+XjK7Ey6VgtQs^H|L%Oh|TKBwt-A_1koA9R+e<>r+nDD6oi+hMC<8hdnY9UG-vckx%CB?lqRK7Dvt+>-EDjk`sI;~ zer**uUgwYnydZ}fm#TTN>#`l(A{VNhp&~ughzkrGDzRQ5rU*Wzg6#Lba2K^{<66PSR#Rr7uF5a^XSsm|Y=E z^uRJ-`#P)L%G$N6jg$-vFeO%InN1x+bk#N~dc7) z%kY?-*TJEHb4lx5%@R5?5wB)4RAkbN4A+X0*|Bl?>wUDG7+6JZMvI7~MEL8zuR_?w z)LJ#!y#%mZz=t$e5`qvW>gn?$bWXD7!Go*99ybo3oW7k!+W%P8*Z?(j&7zT9b>~ctKkc{Wp+Np@)3@9F^ouhliXWQoIl_y{&X;&-j_4Hn zACVFOPAO!ZqapQBvN5Wi^_z32H*mum4wONHnYI>YR&^u`0dkXQzMU>#TNe~oB?eQ^ z9q3GH9p5ZmREC2-&tjoBwRW~&#n~-k$)oOvQ)zu*)1bC#S}oME^B<#i$rx@mu)Hd5 z`6_eBP%hK$6#0=4;b=}hxhAQ{p4Z-0&5PFPG#x#D<NBH#mCQb9Ql?R z3>6MsQ<1(v@Y|HKP^7D^5jFQs2x-`NzZ84BlSisIu)#_r=2%c!GJd zaQM?)KnSZ)RDRqoUaFf^D=5Nt`K9=kvbSdvM49wwzovTi99WNJOGIAcTo-PITJXet^cN>UK+AydEpvcBiI& zh`v@S6HrF_3mrA=~Fk`^Wqiz$@d3HiF zH)T$q_j?lcOpN1gcvXW7R#oZwZ=!ZpY+KWh0`SRHqDNX@KsRZ35(1xpVyx`+s(VE> z;r&E8_@>Qo5?i1N6McXJQn-B0Z-eUYl(Z; zbLEv+z51RICKl}cAn5a(Cp$gPZo!{sGpF3;bazOzKqXYKx~#y^tdi`LMbcFEPaC&T zC=%DqsWIeRZBz+OV~fk7N3~Q(pK8L_>wV0!AQ6V|+pH)qqq3EwS2}eIhR}l6uO~Ce=3P- z=CD$f3F}&Ro@v&{cU;IbOb0qI6*&;I-wI`aW5gEKI6p&-aIiw@JonE4)Uq>m;zhwdVhGz3xc^{YZR`;6o#)obS(g#01dSI1E<>?jY zCElZlu2gT}#$)l~Q8>8aoa>Cug>{hZLbLqsoHb{Cm)M+xuC~?QbIu=a)E3OZ@{uy} z4WjYtSb1J4D zTC%MfB_DcW-trCci_s~C$`GdHgXow>^svr>z@1LMPR+CL8_zm5wgfgP#oII8uy;I! 
z)s5ld&JE$Pjy&8QAy;X)Vr!(nMy;|Q;)4OJapgMS(S!#!DEJ+s+txU_GS83F;PZpJ zQQNAo%=13g;d3DN_RMmp&(Fue)m9o6M9zAZ&$xyyoK4Jk0+6sY`wQg@4br$#eH(fV zSV+K5_<gb0r^`B(u3N z{}7X>N(K&otU+UVY#TJ@E@VZ=o9C-XY}Bk1#Z7dP5(bXucM)l9HH)7a4WX`7$8R^X zd&A=DOknX14AMlz@-ULF+je9F6=kjIT(ft&4n$EM z(x(h-krY5;PtzCgCuYg5?^?=t4lGg)E84Q|lOqsa;Jue@zv%Kx>2VIIa;kC|=H+Ber zeloB--*>e@RZlbWGh8LPE%@fZtHPb4Xrcmov{6_@Zp9=`_i97>Lj%5boZ|joerXn# z=)SNw#i}3A72DEs$HvICsyPRSx;dFJq4zyrSLaDsR=jE8+*+>yvTbo{Odo&Y(fT62 zix+boyP1cPa{yzOn}tX7uL~Iq9x&^Cmha47pqhLxv&6Zh?oReTut&)iRmg3>u$B(j zhVhg;y(Pqb{kC^D(b)co8_x@GzQ;xeg>E_uL~`$oaU##Ye6^HbFpg6&R>*NpdO7VG zQSzH5w>jFKr7J8^V0JSc{G*kdo<&}DNP6LZgF*+e1UV5tMODAxzU8IT%R)kdTpjNl zMZFzlHC$>%^X4ua^OiZdh&bTWc5Ef?r>d@l>l<{Kr~7BkG>-5e3leW&3w6|i%f?E9K1?>yz^Q2`1{J(!)Lz}2?l(F{hlvc6GA zgk|u>lB*H(%OpIfYZaupPc${VjwnSav$RgATff9vZ%y&(fYQ?lX>U8184!je5?@!C zM%K4_+4W&F*|Uri=(Gmx=}rXArQZr;52@Y=c zbE7kJ6}(|Abhm$*sJs&9K?CXD9#%`q3VmNkCmMS6lBR%_H)pJN0`oYTRx+C5ZF)oWpER#sT z7j0C&^c#7@rV7ug;SCHaUah%clZD5Fl01Y0KEfqz#V+RQOzlDUp^?renpeyNu3ENl z#5LKyoc~1W!M9lWlTxL=B;~jva+9qp+bb1o5uWTVi7}DXDd{S_sY9v9JqMc&XLVFD zb5&+@F01-wmfdG*T72J-rO!9lg-O~C=^W$nSh(AmS0t%{Ow{d&5E%jO|E+uzA-!B$ zK7*=oHsY5!%3nQHFO#n(8~h4D)co}rWtr7Iw#tv=&YifL!=d#`@cEYLk%L!HMYAt< zFCU;~8LU^;Jt+4&LacY^in37kTb~0%^esj<28sK9p2WWa?$b@YWS~Cn&T7R=_(m<= zwt4X+lL;N=rfMfZ2>Z=qux5veNA0p=NKwIf=kubs!GI7Z>z5(zeL!FRm`g*a^!~o#vQR}p z2$PHC(bYWH7}%jkV{P;y;x}Q%*puf0Ax!R)jnARsW`mmlYlSeY4|fBw0`5ej*2LAp zk0AZT%B^JlVId3|A0ABOgjyr+V|N}s>X?=&2w`x?30Y&=^l*%m>ih0!>}$e3U=CF^ zC1TdYar8DkLMpw(VILe0VY9~8z6FbIW0n3t_U<#P>23QPeF6ajfshb7gd{ZS(t8U{ zs?tQH2vI>mIw+tZq4!=yKnT4`5mdS~=_pM)8Uz6m1Oycea-+MPz0cWapYy-t8P9X? z9rx9Gzs6W|{>pdFkAJHgYfn3ihO|f^tFdyopYnkqSxapWFot*W=zeSW zP-^iB?eACK3u$94$4XDN$sP6#6tyx&hpO3=vt)(HiF_LxZX18hC7P>rLRsV4*PhY+ zz|nrn$^B4tZtJskN_o7d>WI#?$B{dMTz*T_b)ZK=t4HN>kYBRqbZyW~S=9V~*qvMz z<*~lGS`~%pAisfjHN47;1D#~PzaH{G+oKHoW7-Pb|s0$0PxnR1m*=C)q_+V@mbWwofk-A6gi{MO*JzBl{I zEREl<6GIsRfg3Bg+zwRT!d0KW1S*Q_spWoEStQ>)p_(_WiLuvYEyuco!d88{nr@O0 zPLg%*L<;DJunqJH`3HMMYoPpZPPBGOENY^_8h+K?Jj!H4FT1|3x4kt0Zk>$Tn(TXH z*Tuuw-ANvv9{3$h62f2wYBr`cpWpI4vD65q=V1?IT~9#^nU3tH9l|en-7+m+9G*F4 zto!#u81pu@N5&&@_l?g*ERmO|R6$*~@NljMi{fhUC=}R`ExKi=o61c_640>3=l75BJt-yE3gP8nKQ+Dw+4>P<+-( z|6K^X<9It;WVbhRwd-|?>N^;T^4;)Sp>g%()iaM-#SC)P$;MgIwH7<V#d#;Az;fPT#RKX1&30Tp zx~g6W()*_B5!cPSvy9fUp_xUmZZ8ayFu1EaorP4kl$bO4xOeKv{f(b!+J3t1HK3GiQ^fRoo2IgrRD(-1H|Xds11I zPkRdpM99>SCw!ZjV|rLpHcBw2nI%%`O&9YqKh+~m&K-FErITKHx-RNhAMvUtr?R&AGD=X2o^Z~(Xl#B#@5EQO`@*PJ>dAFt1@wSesHo`{m{!fFEA%%*L-HFuz*Iw`B6I*bV{+@ zra%+qFpE8epNnL4EqF1osKC(-)AoJCHD*TsO`N9vS~HfB90Rs8cwlam{sI2%JViX? zUhd5)kMqXP6}gucG682_VRk3q{1n1oO#3;)K%$<@)cH(>4C8=)A?EZ);ODDcXG-+E zL%ch~j7lUdaps2UtkHxO13!P1151pk+MjT%k|D5LKxc!@ie=XexW)yp2G$gPJ9K$ z8Eax{F8vh3Fw@Inn2`wktaM&S*snquYhO^e&V%CfmpHy|sSA{L`(OH12xGo)pjFR_ zqWeV%yW@)U_iY;dJ0Z+5XP10bX-`*oIxsot-~*4F>`x&~soyoj#H+{MMMLO`UaxPO z3$05Jv1;?xU7LJa#u z2>VcJdtYCS!c;`Fq#Ie|6R+xdybLGDoi4PB-%Fmh26>kc6-$+;tgj=30>k<2lHWG;M(a! 
z%{T2wLkDG-Ff5-3GTPj!U(cVfBEnJaq0n2##v-p7w42{C+`E)q2`i!_?=XNjjn1WXhTCbv$XDZ_m#t#uTo}LMZv;+?7xCSeQp`qZ+Tb8;o8~2eW+n%)`pu| z<>WE?V2u#XT%nv!S;2hxs-UJEBGxJ7YWrbDskT{M@zzqor}GitQD=p~3MWWH*vw-w z?F3s8%j$Rib@X?7Ot&zgps0nHyY0&|;M24JM+srcbR>A`7A=Z~0`+uQpaaKcQHY8y z;EE50x{;A^9*%drJYS4ay;#^bIqJnOdKbgBRzSZ1quIl;Twg{vVi40I0tZ;Qj|O68 z1+hh-b0R|xprO@w`C274)iT6LMn zdk^*m!w|Tv8tui*yoVq$PMJW4GEc5!SsVg-kt?%+F-(IaP@NJlabgQk|4Ng?9ZTs{ zh`NA7*oAN$0pXHBfdd?}63ej(M9pL2>lpY;39h38R#+Dc<2aIh9JPk!?8PC$@AZlg3(&TaY8O~xi2G(G0?bWzMVSG1t62;GQ9}~xd()A0r=nrtWsF^ zZ_7+W-bh)T815n)dx&HS33OYQZ1!eTm6D#pFmw_HNT3;ZAJMzayw%C^8I4e!JlS*&gg^HhmUKr>1OQYW+N|WOdTLAd3B!3~CR&|+e9*Ah8n;YMGfE2v-e3?JYAMHn4(2l?T-!js>X9W#yBTsLcCQkHo-CH6~I&R^oM9zZ$JL z)6I=OFd2tX9TJ(d?KQ7xuy>-t8fRJH7_fj8LvINEJdr81fi;$>czn~GQ@x(G7;9*k zicUUzK5YM77!&+pmubtM*$+z!k<$e(GkN0aWwFdxG&ly()C4qjBT?W15y4(S*NDZu zy{{G*BEz%KB$KWg1UtWthE3S(N)^yuo`mKBnFfd)n^-g)NL!4P74(+1^Wx&oytv)T zBDu_x#>C{ttxH`<*V)Mu+R3&}0J9%(Y!fI)8tHa16rn^!DVpJ17mH3h`f{eDn*}mV zU0rjW-$fQ?MuhgKqm2uw$?7SmF*<@7RF;wyss+eHd#-~5q^UamK^JsDon4G{=G58e zyC_~2P`G1GA*T2PanM)K+D3A&C3A#ziQSc7K{(bKMgiDaM+rRxWbL%IC8*hA;QVqj=P%DYeYhNkE!U$~$xmD9JE$6;heSywhD)tdnx6rth--*h zgl@BmBe^dP498_k39M) z4}dS4Qubm@s6(lIO(@^N5N@VdA>4@L!*ulNbaeJK#V(LG7KfOv=l{NoXhbutNx;0P z*l?DSdj;%=Dd@}sI(D4spcDd>rX*obf*x&O-luk#0lRmx1mcje`?xZq6VD8PT9&z8 z0hZp+$GK2Th2=Vlqw6G~pEdBz08ylD9k~=;X(#Ja8oIA3UidK1jm0l2js8oQCJ!zy z;(*PdsZovry!dXMk82Cyc&N4~B?Q8w0o&b}_;ckM6^ACt@7jPET7p2PT z1E0w$kejm4Ji5dpr(Ge%uOoYv3c(ax!0LiWUz|d15l=y~(5eMSRQ^}Y(6@J7(aRIRu-cy~=zn`x;(N3Um4D#mh$_R2MM(bP^QPI2O}RsLN-G zpgELpb2zo7c#$5K- z8_W-YIyrvGGycdnHW81{(HcpJykai6sj7~47p z<(!rLy3Ab24tHv#yY=WVZsN@iSy_WCUQv#lwtS7Y$nbcarmo4vI*3$0#7*IWDdktQ%CT^+Z`57rcNwPZEuOH zH3>)`O<6BJw?j@}vYoziclzqmv|mSdT7A}u z%IaL5#4s7YSi$D4oa$KBZVQ;nb-YR_GsC}_na_jBQyzO?bt>fDl zwiNQV^0~IJUD$s8ef5p%hC|+tb|;E(>zz@@#x~bRM&8OKX=^&R6gqatx%Otb-hJ)Z zWKr9~*zGjr?I4bKQ&e{+^LFWucZ6T=+_`YTW(VTT-`BkLzJcol&Td8S0?5&xeyR)X zi2szJPWs*dDZmafg*m*^37Nvt$J#OH799G#gv7pN!>b+o1b@1D{3$N@Fzh9GRUPS# zri$-`PXQtE_;2;x-d$`&wQsKh-|qsCCe^<6KmPU%^&NOI zU6CY&4Vc{KiGrWntYhpgbT`NHke%T2m6o{O5JUUg(3ig+tr-QvMDkjags`q~2#0FC zyaZoY9GhIM`FcHCH&M)>TSQu&+{=uQgs>=txj{^lqu`=GX6kA%q3Vug*?Y zSwBuxY5lmp*n0EWz5P!0#_Dj6T9Rt}mk(PDt--bKiif`mVc(Btp7i_>!thvrGRT!I z&0IJs|G*n6)pUoa+wQ?O77g)6m)p=Q&^8kAT(`g%z25Jgx){Ts7k!Z=gl#-?#gK$B zZ^pd(Q}o;;GPy9w&^R#+KHtZapd0Pk=gqF_pc}6*_PK!NY8}`i-So7cLxxGqz~xM< zc^#bC=77fPZQVNY9vc}+i7=@|aF+)Cn~b^5CG|;L-kZbG@?l%l%WB%Od_E zXt`|vu5Rj9RRe3uNO7Q8u9ED)N;O>8dLge-`>R1O#84%2Vv+ntNCcRJAyFyN|I_HrosQ$vwo-9p>h^nmFEehwe_|5t z5N|XbtD~6Y6&UqSLYu&a~h@ z_6$e=Jd`TWGYU2m@m0%{EM(U+ zx(}H4#U{F!S4iS4EustN6x|~*Z5*Xl&AohondY4QP^`^2qrJSwUh!D3cwU#|LcZeY zKF(O8L1BxTY-&op(x6wOZ*>uJC%RvGbDYy7eue9Y5Vl#L6e?1TA=eww6!K2F7S6+S z?38orsYHsOd8*VG2kHUwF68B{;uC6jkddQ+Sm*dxu)Y+%lkrh{(9{dBOq+-J9+;-p zjZBExu@zWad80x4QT*|QVO#wqikGG(q5;%wc4TG-kCgc&nORO6>-%I+i5MXaVnz)$ zN)083hG~DYVB{GRz2k9h)Tg~teR9@l;fg&9r`^Qr?xJs$DfUZ*yOE9F_0l^bxjJaX zI_HynI8~NP>C$s077+j3wOkV=#KsznPoLFVQQN;mTS>96mbt*MecbP5;_Q5&iL@o2 z(#$rwbfdhus28Jus%3&TZ;Iiw>zMtf;3P+fc?sFYH-=X?<8xA_%S_@aE@uPBi-n%2 z^(tM`7LKK5zjc*T`C5_)Aaf8Se>T7GopI4R5Q)QiS@!aE<@ek1%)y~Rq$-0K} z9w^pVn>2#B+Jr(KzOPrlu@^p9@@2ZQ@>GqLXo>A!<+;P%=e15FymXl~a|117Opw4z zhpW-?*|F+%YXKu#DOlrjm&}IOd#)ntSLa3;q8nEwL|yVLtp@636>I3+wD(||R1eA? 
z)*q9cymzrA0r%oq_EI0UX=a6-T&6*OaD3Fh_CYMLe)z)1g{cnm zoO|8}9K=|~%w}45@n?&`JD*+rAYZgj`qT)Kox7Y_!%(_wE!xIOd$p;Kp`#$`2!e!r zH1+b=c>0Smfym%{%cMS_aN^Uf-o$`vp}hkB4-TgZL3<&e5~F zkFQN+Fn5(mY?~Sxuq#BqIVi*4H8$BvlP`HFU3twVMoI|$xT~4BWag{ONM-h9&C_cE z4n}8{UiM9Ayf_&r72u>Td~=SEJI^i1b|$+tM*eoVA&W7!8|Q>?)6~0}2m4zaZYN>A z?G+Ef9e1}mVsFm(NXWHJx@nGbd&skXJdn@Jd3!A_NA6vpY}-|V9gb7OFAnogrlN+o z!i722o_>GC=-d+F>l-rN1VmiU+wi=oK&|A?%aAU?V3VESpZ>v0E&>bgTsb-pqK(iM4R2 zgiUIodD0F1A?Hl{rjFl@=@KzWrS;LKi;ex1PulG|pSxQv7MJR-nQg<>=H4zSE4fdI zeXO~{`QQtq>!Os>CpC!YoA8s$SF@j9A93fn7FcTZI5*3`LF~mSf_qGmhyHD#;}wbR9~$L-Lxcil>Vq&2TW>5ugj& z&*P(2VHEvK`r|`U+gpY^FFqaAsyX8AzIZZh%=U zMiz~yEjgL zlMe@yf{>pM6IykX=$hp|mM5Rdq5uE@r|>$iXwphF^QL*GXtFYoSbhfHD#Lkrdz+sRQL=TRZqJ%cl(M!#*M+Og|6E% z&7r|DnjwN|@>*$Z0iaWQ00rthj!jt!)Y;Q}S$FmDsR5elchXtMQXps`AQ?c5&ngy- z%xBAf@R_|xFz5c2Y}xQ^cWN>wVhZvzdEMvq<~tFM&N+h>>`l(OS&wpLrgAc=0qA6^ z(}IxRx!k?z+e3qQKXv8K4PJky038p|L{gI(>7~$%LuaY;?l#?BqRuDo-Q9lU`${iQ z_Hy0|fQ+29t%G?7Z^8%Ij`PXq+4iXkZ0o5^0CGqR;R0J0Lnnbw5&F5Aj>$^%IDlXv zKmZLDe7cgsHXkmgciY!FEkvP^k?r>97)@51LXD;ZBu!Bojv$)thEM>=#TJnn7k0%I z$yotcJOKc8GLr*4(uW%*c0+e@ilxpju?{Y!gE6HpG-WvhrEXR^9_(d)G%-HpO?<%#+gm%$Y&iWzBE6*)I@qhl-bh1jx(DvH<% zc{G(-LY2jpm3sci6=j>r6)lzZn_>4hD-Xa`_hYM0>Q+5yNqm%D_4r0m>rmCJjjB$U z>Z}vhJ=qCQZd8xnxHec>JvLDNRR127>fZQ`_(`jK3p8#svG>}`?=1?|5Z3O!RE%3z zta(G@@Y%4qk!k6r4Roy^C1 z>)eLKoBitOg>N=x*gzIa)33R};_5+JXKK{z-Dq%9$@LhH>eBgo&LMFY;roI&uQg@3 z@>La!OOw-;-E(jZRxIt4GH{YwLcC}R8I#g@8(e$!8 zH{M)#p_DrOrqR{o2m5?BoWc)$^&bfMJ+ND_kSTd!ZBWK=_8JA`p{7!ZiNV90n^5ys zb!&U-di6)wu0HaMdlXRh=;rXFptp~LAx$B|O<@L25m%d{;+kTrn&O6=65ci?L7HPZ zuKEM6h%P+z-Gp8p)1v8ZzWcU0AJS4F+)`xFQgXGWEUu-Zs- zYoP)8RnNxTUn|}bX|jhwk0EVc!fibUZJiJ{L`d6URa>@s+vwZ2X9jKkRc(`TZ6lC& zk`UG<+&(qjKAqG4a-nTuxP9tvJMn7AW8sdK*0va*j%|*%cRh-mIc>70=ywYpyTYLN zacu{~WW-j=t98d<*Q!=>SGMGOk9M5b`(~X!i8Aw5ZC(fm^jl}*9FJUI=^Z8aZk!iQ z4w7}mU9?OOB+^}lRYq}ga#VNrk9R^uxM6`gOop7))kwa;&X$nw7t38THXL#zNCn#N z7^!Y-JiBUOH>SGtItypb0g_>n!$H4C!nGGU;>1tuXcFIJR^6k#h}0V4uoCID6JfWB z@3h!LGDP%tEbEgQc3t_#p}*K;S>1Ix9(i%`iFex*&o+(=f&FZ`{nu&xf*09*b2)C< z^d)!owWRbBaeX~012+wy1kv`M8|lBsIrsrH(1;yKG<3g1J8-v}?RHzwEt`Rq$v)Yt zfymt6((2>Evo?dN-v(4p4Ar>ea@&UZS%&lChckggjmkq&-^3~dhl=C-I&Fq!1BZR$ z`Gl|=CHQNhMx0l{+zl6S97JIjH`rT{yck7Ao-qgH%`}E!R zyLUU#ojtLg{nI-i+;=`D?tHG<`TBI{=-tjSbQgGXm;B5wg~u*c(k^xFF67xR-OesO z{T}?}9^%X%lgA!Q(jHsw9>=piJ$ayc=0H1X4+sQcK7*)1fMb9N>Cp;60U9HV zBCTL5Dk?CT76O6N!{GE#1{ggPf`N^hiG_uYgO!DggAK`r#&B_Jps4t`QG$FtqJsP) zLPCHT(W{WuPdi}7Ev$pL#=ijOMu$}H<6mi>Rq0o$AI`8?CPJv;P%w*Sjq&&Yhw+`>!JjmiHX0sB9V zfE{I@dE9aI^>KoiZ^jiNC@r zEgC&Nz}cYwMP#vFecI_41Pmr6tTZ7g)qGmwzS_LgPXw%pTBLFDCj!<_EwU{Risc=L*1T6ap0@ef=8VQ!%xMaj&RK0IU7ernj^G5`1@fQS)T&k}WaMWL_xzZ~867urU zWxXnZSV%oIoJaeRN+3v_T)jeECr(E$nUVrbRbjnUpM{g-I@Ba3I=G(t+tWT?#^akN zkvoa6=|z)`IU}{-wlmG+Eyu(HWd z1niiEfEjB3GXlo>0|7hzpAfKUmgdTke?h~A}P1jTczizO#1c&bjFXl>FCPX7Y} z24HFaVk7O-Y@jfG`g(14iKxdLxtzJg&<9075HPfjld*;zW3j9ob%M=y5JIN< zx5eoKiy*2 z#u6fPL2p;}L$r$yfQm3IQ|tR|M=5 zdmLm;iH?RKW<+1t$S8u}1wu5B{yPLLS7JcBxtc5S7X*y>&j?ty=dTD@7{c#g5wLIF z9WI&FwI{Pu(AJufDyz=2pA#Kt~%@& z1dJ?}ch?YdqhnCHNoM2pdU*yJ2?1ODfq(&7rUhg!aBRM@Xtfd~P{N>V*delJ080f! 
zvY_4Q+Y7=^en-HLFn>b8zWfUU#=5dVgg%l;9@zHI`MS7q`cccw=)3E!Uzc{TTd2j_ zz=OsX(B_GcIw&@OMZmzw35ZFmGo$F+fk!%PZYei+re46*z&(Owd(k^zUzyB_3YeC` zZ9swCANh4@#71TsIzzpzj?sQP*~Z3UtUWc~mNkO;Jiy|wrU1gwY7y)#O}bA0DF1kCi$2pFD(fB~Ki z(!S!2oki~4IM?)y>P&i>HHcY;2s)9OC*r(wIDeLU2MJ zCT~VAQ1k-<3;uzCY5oBLOa21_c0g6~F9_JAy<7!(^5bL`!#C;QVFHBz4gq^S2GS;a zva)4iupU;skVA3XyRRrm2=wkTFy(W8>8XE?3x0|s4BCX>k1 z<&g_Lk%!lVU;dc&Od%g5K?m}KzqOBxN>RD1oBw@|_5B;Rnh$x$=LwWI2;gi2O$&i` zUWEJ=mWByE32>`=rI^o+-r!MUKdacV5(D(U{&4Ru2lft~RUvGTu%3Li0YOK0@h1Yt zK*yO)hW`x#%l!=jtN0@V2BhP4DI5l(*LRDJ*-K0nOU$iGEWQ-y{tpqbVB!x1EL)w@ zS6n!egn)_D%>!s2{}Te1`#SQ4mhvHf+YY)<$2${L{({2vgo7(7JG-a{Ke z`PdlLnwFkQka;|o*@(w}*o9y_y`ETs`iE$b2zT4YK-N+$jr#D^G0=1;B!L0ch@cl! zx11lkw?LL`R;T|*1T33`fWc@F*lEvH(#`Ky-eafxkX?I*y$bw&4h7OAVdULh_K^B)ngE?U<23U_13C$;rFtmVchpkK@$aHd7w)NOczYY;@29L_fw zTb;a0L!re{vLjC6rO(zf5cil=miov)B>x=(_IOO{mf1s(y{2|>DmhS|y1vYy38$dh zDqD2f)Lh4d7pgM%=#~;E9#XPg*HB7AzzCFG07^%Ix_e!l9|-~5q#=yab?q8<+0(5P zX&eC*et(C60q73Z`R&iC6b?VIlc^UomOnJ0Y=qtGXSi_<*P-+}1%zn@??q_|Ts9N9 zOk-1Qh7fR{6KbNMdAy)Q`feWrId8H_lliH5T3luB#t#H6(Y|{eO@6#dSx3C3mmDQZ z%97U=8MAko38`HA!g9gt4k)DKQGmuPacAhV1CSkV6G4B}dC982=YuRPA|Ok!N61wP zW?oordffq((!*BGAr%Pao7Kz3mGSSG(peMP5#B6mS=$h`)?62^bZ8=F;T;)>B-z)u^nv*E)a_OR_xH$ zC+N~Awp(mJ(>D+4#{Y(ZHSR*+jtySNKnt3i6A8Z}V5$F#fThv6tN#xXuy}~1{SO4J z^cMuom2T3$b_q>hqdsghFT7O95@rZ*wC{u#{96Pp^*02}g-Cfu=)Xh2COb<7m4{8( zr%dPRbb_Gs>Qmp9>5SAH#g|yNcfCp_o~zuoIPF4r`W3Inz3%XwLTi<=^<))^py!UG zoMvMaU$Up{jwT*`V%gVZ;@*T=aU#5InYI;XuF!FIBhZbVg@rv(p=>hA`NAzi!B%n+ zp!@$20yac@O{JIW_!KO|jnQHho}uVVZ$jS)1HDc5{0=LR`F9A|4XTGue}RD2*(;o; zy7tlb((WG-u-*Rx0Rz%#*WEM|x9zT!iGNeoIbZ$k#@PySx}V> z$t38-2!7YdR#~N)rMcjkMYIe*H0@kK?0f%|Q_7{@M7Emcuh`QHBQ+JLsGbf<1~pXM zD%>b3Q_lTVUTStDKD}mWs8`-=hEt-Bae0w6?2s<*Ax#`a>6iz@V67vnw9y+1iTZAt zFlfR_Mf3LJatv2J{Sm{h4hF8P!6uY6E`N)F5q?3y3Vubvvi}JI^P#Fw{SyL4_Fl^Wny-fFX_7YWL{Ky8@ew}_Me!2SY z?)Et7zeT{PX#NQSYuu$f<2`l|Ar@@+n5_PjB>J*Z-D9$Gv?xF?G>K?_mjSexLP&sE zjpcp0fEVQz6gy9z9GWUw7iQY)ErVC``}YW#-~R>#%+LQbkL4H<8&1t6c6oMeSu+~) zfh~@dDOOYzT>OE6*^v-1`9C6Hp7Vc$fOXiZ%8X0SOOupl@Sb*w)$`4>)#o3$zn*{o z_FNs!!vA^%ERBSK(OB(YC_^6yUisMO+<-fMXFJJ)rrG%s+3}Ipe~o|vvquHzxA@mb zMiMO+AN-DhT|MnczRJ06Ze82{=|Y@G$IK?jGqorHW3u{dXyB;I;cC${Rj|^*t|rXE zbcd6{KzdJ$(M-BttqIB@M!|F`T|%3A$$MOp^~#hNSY6DU*3Z-6D^>X)5iorc z0%oswFrFv6huo*3Ry&nje6bS(vC1UfPOR;Ays-5oW zpt!DgNVDgMFe_Tr#Oqi&AM4-wuHcm0Agi0d(%5&M%jjW_?+#+=m6 zn#OvooN1Y*)61^4`_l`x&eH#51Z;!w7YLYi&+iDB#t#H6m03#qUl1@VBby}Qp9mP0 z4Ndozc{LIOR^|MiQtL3fN;D^G_>#!NvxAk}!MCi>7Fp|1E|&!df0wzNw2gdS8*H%h zCj_hvBOV%lGUUQ?!aikrh0_L=EdFBN@hh>c4%%B|r^m$!RL;IAc;Crz!7T>rN0&4h z+t+T1xE9<}m+7j(C(!!-K*0KAkXAjYz8?tK3fPTT12TEohkW+~0SjZsTJ|e`*`w0K zlMt}4JvG&TM!>)%1nkxC2v|+@O1fk;>U3v{RwHZje}jNU{sRF!aX}i$yzdDVQ2zx1 zOU)t^+1!gmi-(~Jzan7Y{yhR_Bn^leQFHG3uMjY>n2UO{pzHm#KF_X>0RZGT1ngE< zC;vbynqFk!cLYoz;@==(h4rfH;_ih=2UBTzeN)CDb60NdifuHtNnTqI%^CS9O@?0( zu-qRA*c}oAwgL_P3j{3e9|&0VKM=4qX5a+Ma&&XxbG!_NgUZ_&GEi0J3oTSZbQn|g zGRy`9M8*6-z&=B`h($7WSvvN6UQY|TKoy|#QM4aR=#Qzt6jhu_*fd|+Ok2HtP_5kK zVhtIzsG~X4kf$Or1nd-> z(8bK~-X7H91{FOhmi|Bk)Uo0=bgk#+OCL|fW70&TTlEW)Uo=*k3;v9NxdB0T8w16> zBm~UFgxijUfISz;AR%BSKM^nu5(35%qpPJ!mG>(Gw)JNO%%+bNE(`&)2$Hja0dW1* zUl6c|XauLBYatS(&SA#gO~H%~5d&h#o{^EIU0Yps@v;Z4&jJC=e~W-={DOdaWf>}U zzE5wosqz322PMf48hUfFPU#B2B4EjmjQ>Eu8p*Wt4#_Z~fr)=az_2}oW_{wE#Iaux zFe@E?Nz6|K><$S5Plr6(f?AW@v?0wtqe?h>~{uTjK4|{%7Y`n_oG6?~*95Vr^|Hu!@!!Clr zmYrZ7K)6pakOVNP$1nl`xF{moxbY;OdD}?&;n-H?Ij19yrBB7 z>hE}6Wb%&;3vB_Nly4{jJt{yzmeftQJajiXw>pR$eZ*-}ejE^kCIT5gYmECusmt_w z0cp~sH~46<6)#p<-1RoV^BCg02)z2Lcj>P*B*>TA9xuKSaVg$79n4?8Xu{= z8{fL#G`#eVGfU-Zj7Q+A6bFzq>wVFg8;P_;{9sWzzO~UOntVV>{tBDA(~#;FVK{|l 
z2x1UVB>;yy;S+owOe#f5{6N6!GL@oO1@MemIfi8wL{|%V0Z1FJN?%@AW7grI6s2kq z=HnSk?%#>PmO0^l=8ySI;)?t!MBO^9Zo^kGbhrf-jqqNZXD6*x9lzPv()HeQxj z=kpt>i&Ai!EIeWvPmWG;AHzT5QAy;1C-K1L0RRsv_z3`jUpfT_BOzc|0Vx0|0IB2)#>!_OCbqhtnCaK4e&k%#{oG;BvQx|pu?-e zF&p7sj0Hr8+h$2R83UjKk+{Rqh(!wWCV;smrS^a+SL_6ed_{}ydWSrJJ91DqiV zU<8r?hPT~)L<;~A-}Y1OQkd7_DaV@X8pCYSt+wTnBtXo_R0FVGsD6plD z2YxsAHvudxGK!q!07;x7DAvY_pD})VzerY_6<^lS!m9VQtf$k(_&ymvOt=kT0|GRr zJuRy`eS@bJct~~}HbO0+I!!zJgoj0l8|-!%8Jb0PTKzBLT%M5mC6U2GSV4f5lUWUIS(l^Pf3X}M+NQ+$)#(D~P zLJErv+-?lZmNJeIq~uHZ!GCdW8^alX@?W1LM1_9izYb05UPH2qCoDcwJ`&bA{=t9U zkNAcEQUd+pzohO>P>$Z=B}ALg@&@aY1rx}rw_~o(hDjtSL)$0mT_~6mQg&PfxyA8t zSWEERNJS?T?IW~sNt!AJOR(`^9yr2U-u$t^q~YuQ?~2bA_0OJdCm02#d5n=sT1II_ zN0}#sP94Tb5_nDdCbit2t7nVbC-C5q)4&^Nwe&y*0<{;mA!ahuf|zhhe>`QfI>Y%8 zD#G-PVL|P%g&|)m1Iz$k*vZY*@9YN1-eOS8!jswedPIlPinlr+M6lN3A$1`acf*7> zhJ5g0LIdMg<^X!UiIW4v<%BBWRSP@O*`pA0a&?16}ashtopxfap=Y40Q;Q&dY$sVSiC8{qlHfunCp?7{UVy#FxqhWEC4etP@cqpRY%T|_%GI< z{1L0EN%>GD-Z`zz_awMFn3j48Kd_zs{5Rui7*c|5en|R`PN&0+34LzgT-a zpQjY5Bq+%*X4(JXzxtm;2F{wJE-7R@H`qEj4TuiO3sa2Y=p9#q-*1@p5tyDJ@n3*8 zA4?!(raWjY(t*&Dgaq6?prA-L%5MdfrIGkAQk}m*;=cfn1~mS20K!FUA|M}$f7aR+ zJz8698_5dBlM7s=QB9F-RXUCl#;2(?3%CQIH=D?^n$sRobXo)Qmo9!2g|`2M!VqVB7H7neHEJi*cZmqHg)-lQPkbK|(gy7P| za02Kphau&Z=JEiR8Y@=WC01uV(EbPig^X->nYASGUk#(-WS#1$tHZYFaNdtv= zf8oD$$sh8PiTvchK9)QZd_fwUPJFsxWH+|%6K$s-aliF#OFJi^>?XhsM#Z-PR3lpQ zDxYsnQ#$5u99QOtn*i*e@>wGEG~V&THSn*PvU)j7EkR&yFu)!ZZJpS@0kBv@EIWSg zU1e=1_S);D5-*3M$qi#m)J4gxJj2vrLTmsRxy~p?aHwfksLc_Dua2s8l&Kmw?Db|rcg{8xaO_pST z$^5XtCYK+Q>@S#d21T3ew|O?vANH4z#X3mf5{Y`(N1SGf5*S~^(-a*@6cxd}8&mtoh@<2SsOt31k z-S)Xg@ZE3CeSb1$g}vr?Jv0b*z&@y{e|_e?h{@(eC_w# zA*BrD@n)AiF2|RCzE$%+93!6my(KKP{e@xAD%IL@h{yB{=qfFv_6==+QB|q_K-*vO z8@8+tX#0!Cx_1U`e_=EONYyEumIr@}WANYFUti?6-RWhM7((sB5gEG&K!lz`!vQdN zyWJp!!)6{8*^aiq0NQFHj|FjFcyplp@Atl#<)Q5_a_7Ke_W-p&_LuhU!QHH{DW(^c zK{}*1+WsQ7eLiMG zT;=+s)75F@dm{s&T3Liae)b$~f2H@#{QP5oVVJ?u_E*%|2*tMhp;X?zd-pJwlK64j zPq34oV6y{pCEK^+i7;q!PuLCj$Ke&i-_x?bm@&Fg_>tny8sHkMA@q>js59xrQ+7Pr zIByw=9cMwD7)AV_Ue1|cQ$4p*ogsMYV-=O>AzWi&Uhq4w(*p$w(p-@c_2=5p@!&Xz z@M>Oo+~UEFlYMCmteC~Dv|DD8wdGDYMrhvh6#;q-bs0fe45k@rPI%F`CUst2&oXegkIq84geY^)Kk!%MANUJm{2%bw zd-sUH;IA=vZOIe`$sGkW{Izxd7yQ*7^B4S;s_+N?Qmb;#2n`MQMZ;f!KkygEV>JAA z7+fU>=cIvSED4t03CF@5C8O6_YKN0yQ}a_;1JH#n<46UDOAZDm)7@nz8dBFgP)Pz( zMvRPQIDmizC!G0QaS20mvj@+Z)Uq8juz}87b}3>>Jvf6(gHziQB6=4Gha!9|a#@K9 zW3LM%zF!&R@mnhI$BJ9o4fvQ+oHR?j+2rAP;iRX)BgKdJiBgFb4=K4j@iF*qSk&Ax z(AQvoBSVW~It;c1;ygC$OBx$|YfCk!OR9*!IFu>jzQ1=NvNiTz6{wZa0p6J^eOl-A z<5Fzz4+0kBLn8LAcC)0X2qfONbknDHIeclHZr@`Y;GXx$GiiOIsOXEt{jAUaW>=0y z``0q(hbu){?2b3L2$;Hk-E^sdwd29Cs(s?cH~z;lT$oIlL{*Xv*>|S-8`@vS3SfEZ z$M9uRdM^rB?5Sq*5eZui3VZiUL|c#5n?tPI{?K2d#W5U_Im{zU-4?p4WN)9GK%)Mp zzp5GIQoW_Wa^pZZW4c(J>g1?x9FAv8NoukY zE{Tz8wxvylY3qOs_H5UI3{#0pKm7!t`%Y6)_TUnp;@j+LbvkS^+b)jx5tp2-SuxU? 
z&p69uHPG}|I7dO+C4~f+_jOJxUQuqOG`F1-w-0w|Nssk_^)3BXS~EcKL@Z%%B1619 zHZ_7%Y~-@6JUUU@f>gVf7RR|WHnOC4So$|_{ryY=-Vf8qG{Y2M>{=YHzqv|>ewsdxY1U{WzitsgF?6lk>V80LIR^~K8e1_u9sv=lb5yp^@ zq+GCA z>@~3v*&5u0M7&ydnbB<=g6^7OUy+Hpi3Cgz4jH-4oJX1uir(lMEzmM?*(8cOsA~l+ zP@LaOTjNf2lY1-QwdD}!Ff^Zgr#PD?C{wb#08M|@pHGf}oytY(6vi4M;ROLpE1_a$ z7>sE8%ey&K-gW-C(V@CA;a+NHpevgGVioR2(_ikr3@iDeBpeoFuo&^2E6kok>p^~# z0W0IzzCKMWZ8DY3$8Ekq`q7b05Y*-m{k3dKb3ho^P*b_Rs>D6H^+1!fcwre&lwYuO zXx{0teQjW_Q89{j98G_n>$MVoD9_nJ8tG!krHLFDEoRtfEuod2 z%*jhXjhZrW@VS-0wn&BE6|j^2#=8HP{FO_Q`8CtyP$yLT-XkBb^iYLx_O5&R<^(z9 z&$)xq@|WNk4gVnpEvo_6A{^>0D4S=m+Y@AyTpi=&$3p+82WvUECd^tx7ecPDDz>bQ zmcKgOb;+=`S*zS%3R2s#9)(RcB43xNq2;f321ggnh(Re;;PX5p^-E=PC$#*fc+}T) zfklOuznUAvGX-|qBf{a}Egg?>tIuP+ zE>;yHm`F&h&u2g5+ zUh^2Qa{o^VSV5VgwW|I!jk}tq%c5aKFbSWNPz_f$?)oWoR@>6e^tzALjyELE{}3Z6 zp_&qF-C6YPTsJTEFmJ(s~#>=Wd1 zEgS!^9_%hK;seMXi}q9p-3&yvY_5mVDQX6b!Ph6n?a|J`(CIY4E36EbRm0{tt~UHe z5lBPN7$-)jIp}YLRj(j_TwDmF`yF10Jd$0NHh5NrcVdR5Gv!Jy&=TM_$`A#qxOvZ~ zN!!~1sKpeADO?cBd^hIN^Fn#t*W*^gv0`@$NCmlK^zc}MnnwL^+*^~gG1&up`hBNV z-%=LhPkKm+65$-2kPg@0FUlFuc_o_j*`47bTnXeNVh~Z$Z{t)iV-UUP)mO?6)17CoTE)nS(ik z?#UL-78< z#6+Qxv!}b&%tmDl)w4|pKML3zr>~i9CQLy!$G{Pj6!*@W*>6mW%F@H>U7tlNk78P^ z7(sawrfoy6nZqqFvhWLARQVY_PTJQpp-lrZlZHEo8R93L%0upzCkA>_otvZ=ym><( z-hQSZtf%mq2!BhPb0BN0U;Jt^kKc33+#8~$;PKwMXFVSDCN+_|zRgQs2zeD!o*zdTmxIX!IRR;Ji^FWACQx*WSZEivS+r zQLUaVlt#hZcirJQ74`Y5ozu~I$QSpk`@HOC+UhIIu(zh?YF68NRXz_3cx$ZhUmlKC z)Fy*X29;5MElYRl1_S`@Je(lxz>p9lVVeb{aE0eZ1C6HayAN*dFSxrR6Ry?RdChlR zFSB=_utpweAhW3$k~8rtneH|$UCJ%wHxk;_2ie+oO62H}4r$52F*)$i%2&tl#vD@d zsOy^NWW>24l&Fqm$;cU-An`GzZ2#;=b^}9ka$8agY*Da?Q_OOn1xPfb9kh9IgGm=#?XY2lqHsr z2b`g^UVT*YO24!8+!dmIz|z*U9Wfv~4+Uyr@n3aF*Z^pGM`#gBDR3o@W+>3XoUM|X zhdoA;^__#+Bc56nTHg1WE|^nq$oNNAW=}g~Uxo37Mp|w62)YfBg{rPH(eIt>5t_n9Cz$qIhZxcvmki$9;)hh`7)(9w8Q(}sxrX30T>G(bWoLT6` z9EUPbtmiJj#p=s@c@FuC{Bu1F!Fl%H`E~|td3!MLp-yT3-c~KZeD`WzBWp_zYkhqN zYcl2$zV7RSII%-a*@8!QUs)e=kJeu!zj{N2l952ge1|a>??VjnjvmR>yge;Kd%rw2 zQ;T|we7n6D3IhfxiZJnHOo4a=g3E$ga7~fQf`gB>*&r5Fykx(HMU3XZ&T;r?l+%*~ z<(mfuII6$7v!Vb-U!8@c4 zG{~-$p3SjwvIepnjNBq+bL-5xp=i#>{aKDPW>J9|Pyx+<(ObJ=es%EJunH`)O=Y)@ zw0iY^qg@pFMHbC}LAhxxxtidt^~!cMojRpO3bHTA!r6FzU}*kJY#>z2?S)+_j=DbX zed2yq4czfY9CN$%MwbG49O!GL5|QAOpKD*8$)A%^SbAt-X}?zqW5@K{AwhX_*b8>{ zNX9SDBM2Lz-_Nla9+O7D)gw*rppe7_(2`=>#;J|ipgXH0i8BrQXE4LCkvC68<-Yn1TnY^O|Ed!zZUE2KdDn*H2UqIo2C1s1=-5ZGc=R`JO#{{`Rr z`VanVG`d*lS*LUfA&Ez>{bBwiH2<~r0L_2Ni$NdhPG*1E9^GQy+bRJamb@EKNzu1U zeP)-;twmslD?<7lA*Wj+bjyDYezmr9C{^$%Jysl-I4z>yM+WL!yZ8eQGuaH9p{_Y> z$`3i3)oh;Mu$eg7y^yzZ)?l|JC}*GJ#LrO|4DIMydwn1Ky7Qf)!a%CR za)m>sP^%Y{4ZAw>`|NDwkqs)JQ?J ztcjZ>pvQXneP}S%9&%B$gWsW3g{>1ev;zcA1^7)=+?)oo23RgZ!i{& zImI|r9GsDB`*#Pi(qxRs5|PF{pv&t)k{#jN2OXI^m;W#T;?GKBCsMl$_7{d$5hQF2 zR^rPzo(elpdxdSh@7ca|-f_~v*sj|3cLHjn`L7@+8%z$HPOtiQmZy$nT2373Q0^G= z?cr*w(|Y(T%A6m=E|)(Vdukmq@6J$^-MnUB!sV*qy-QMEsG4 zhI5!M%$@8|gepli= zrm}HDiM!zb8u{_lF1gwTEaAo>5$4F@#=Yrdm@u~V;`mj;7|lnw=MrAlzi6SQ`HKk6 zz}W2R_6HlRAsw+dmknM;AAPDMBo4}{v^ux!Iy06ze49D^BE@_+Kl&ehtYxj2KI+MQ z3oP;}9c0>nb%O3oXbdb;S2F6P5T)cOkDkDBqz9h|K9>x&K6hrV7jOSaIE6t`;QXty z7Fqrw^qpj6k@u4{R{N#;$j_3|z5iz-V3W-f(MlX^nBtFRKDsl>jVarC1k$Jhnv!G~ ztCj5hd_Pvn{Dz=9>C^z!)tKU~2nu5WqX8z5K_Gp63*_ZR1m&xiLHdGaqgyCi z0phK`;L|{bzRrfC}RTe1jC=eodN4&T}kr7bLAWR!82|Aj#{UY0RdVn5DtL3RdmUj}ic;hpk1p|VUtqVqx1dbm;sa&z`_-q~^r-uSbfop$kZ zb4i^udcEI%E6>clwqLup^Y}EKgqF(GXMcAt0p;EdcDBuQfQ*qs=f0CkEyFB96oMTb zeX@<6sGdeWxkcq0hf@TFsJx^$py?N8Q^;RI6RQ|A;$cM^lww z4f@&F|3_WuepTq^rYvI)R>CP()Avrf z9Io9k?(uVlUY#Yl4Lw&1As}K#_p%b_Lm%g7FMlL`p%Mf7t)Ac}@cbPVq@F7JiriA; 
ze?q|SF4r8hZxG1^oOXG8%yB>(d4j5II=m85*{Ux1-_>9#+6El@M%N2vo`i-R4!w5B zPMjL5&(A(kgfe7}@jm;oW~ct`>j&z^I@_wuACGt6Ds_g7dC5_gB?*RH#J}E2)`aP3 zzPtKDsVNSt>im?iXrlIv*fbK4%c#fLtNK~Mp_?b2=kz>$bgLHeF^@G@@zfsT4|dseQIP`|)28u;-g4mAd(# zmX&^T79GMlM7eOt)Cgqdg-XhFtWLrem9rC>1wGg;3UVK^xAx}wUnPu{5C~B8w14K* z>QgE*KooJsbuq=?FESmnQ9u>(s^i7sL%8lr6`5j<=e5gcSBpk#>p3>kznK%dAE%$+ z$Mx9*NDP_CW)9a_N!05j=kkgkzmIuh_(I9T%3esVI{YeJnd>sQH;*waFYGRJNEgYd z?Ad(M=-&v~aPen+RUsQ>@KQ;4;zO#-dyx;BGDzc&gd^`02wfp5NCNT`0`EGgtp>iM zptUW7csE;MXN&*75S`M|JllHg-CENnN%!cqcdCJ%Avjyz`AW=y%c-llOro`*gT$O^@andF6o3l1W>nUFv>8w88{)Irh?zwvx8uR`O7^&rXNbo1 zMWvOVMeIkte1oy+f~?fQ>>?dZ!uvnS>3&V|&m7`9sDOTkQA$mzWG+dQ%@x&NH>3nc z%G=FaFNapKtL8|M2TqBGlqZjC1uWg%bE=^Hbg?pz^Ilk{$}XX^j2Cxgdgf3}^)GSJ)6fBDHZuY&2({R4sx& zE1#a|vJZ?R5~0NMan`l4kt`|MAA7Zx_s)OUr0Ex~6n8yX=IZErdm>MX?F6@)rirF2 zFW8Ru<<^GD)FLS8!3*(2%u%K=dJUb&dW+Pb4-(V}oO9-5Pd!N;=1F{r$!X5q3aXEx z&JOBE;p6GO@?2|}jEwzZ3X+XA)34T;^C4=n(aKV@KZrwxl9vf>4U|Fe^Irln^$)*@ z<~}^dB+)*`I~EsiOgP7)#Z8J5weq~P1L^Xy+}0+S`OwAAZ2>+ui>(Z{fN-#{nR+Z| zNnDYs1VkT2PiU{oHKUKDJSk8!Ou*c^tmVYh83*gj;uNdkeZIL^RihdIPD^CN9}=%_ zYw9+Saw6^^iR{CQ=zyaNCm$`Mx$s8Qc>A1Ko;)IZve;?pH;0TlS0u-oCz)b|vTKZGGCe8V-KewCsjC_r z5@wU65wG|7qVODW>{^mE$?G|*o7P5AC=|2JX`rK*<#j86g$;3f5{kyWrxKT=QfEbZ z+-WsQ^ncA153RGP-fij!ub6M>e!w3gw+(jNeX%2&&>52Z_I;VMiP}?nL^f)DoRmTc zyM0WhkVh~uTIT^bLVG5e$A&Wpw-ZlcLYfpVG!04y7D4WGg!3-Ecx7+!?p~U8r9`xO zFg}2%I5ab=QYlcX+=h%L-JRr|wv3C-QI7|?0@kFlcrEVmdx%1Zf!7S%&!&;0h}IThbOG0t~G|<-B)Vpn_p*VZQ`Eobx$ZE_ZOvRA2*3q z*;{jbDJi)AUN2_H)M9u`r-$`k++D@gPDH0^r{W*t`XY z0`0(mM!;l&{(iI^)It&ILbW8#iNQlA`X`yQ^H!-D6I{;iF2d=nexvfgCp;%?4Hmpf_`Qb}yc6D7g3P6=&5i}%HeQZc+hIe=~|m+DMI zE22Qo-?Hd&JCdSJO(Vhjt`|##>Q|kZ{mlNl=V{<(CrU z@0_%vktHJH2n(VJiXzbwL3@jQpJ{xUhn(tyqjLagu;5|%xR(W<{5zO~%5D&nw}wP$ zs*cBkqLX$bt>53Tmg+GL6{R2r*9Oi35h6o0q8gd|n+}`d-7_rVL{1#dzRvj*kdYTt zZthedXq9%xJHGpx$$%rCQmS@Ue+(Ii>wm{M4};nyJtX=Otx}zN<1e*Zk-gwG7V1z% zVqu(z8pLGS#x;kkMG5Cf%QncGErzpL;Go&iVN@4 z#!`pYpt>kt`N^traD5ZonoGi?*X9!_Os+->!Yqbdpj?E%(AwD3D>9M ztZ(r9;NRaQa2xSeqWd1!%i&7#34JVv=)3QCR9mWxI5ne7P>h$fg=v9Kq{O@(y+u5} zzg6iSw?!9eHyUhz@y%Y29?zM;jJy8s@5HEOUkb<$$(Qzy=BeQ(SS@`9-yBt07QXxa zJfsoqK~oc;QXPfb5PZo}T?W2-bVD$PK|%@uY6Ga_0idz}Re+cD9F7c0e z_EQMyH$lo9MRId<25cDlCX(zL3U)!z?kLmy=K@1kKsQVjkCZ?+ilAU~hMZhd*a|gT z$s1QQs*+GMf%)v{oVc%IM zk<695M9_-n;+dN>nju?G5mYu1$RUKxA%SCF8`K2_AHo=A^U>HB`3#IEIAR|nWhq>y26hk! 
zpa*kAV_(f!zfA3qpml*UTrPubAe6`BXgY@S{RAC30_YE*I(4DbMbLiLCN6^V%pidk z0IGBVi=jEgJ_eN&Gn)w+wJw0>8cElz1Nj2s4}g-RVgM8_5Tc`H>MsB)GXQufk?tDC zKn@|(n4qr{ru-^I<_e{^f$%+1;=BVz^HH>lQ2Ii0ine885fNG>9O$yl5C9bdkx`H% zh{>0Ub#sMvFa?1KLNxZXgP^)GBU7qjkX$CY!Js}x(B8C@&E?YToDh>kdD)=AeGCeZ zW40A2c^?KYSsB#&`r)I}1tP0AVy%e|;$J=7&t zyvgAMnlp+5C2fP9$rH4V2Jk;3$ye7*e{%-7DNFs75 zc_Ex=R~OQiE5-)BJBOs{QzY-31lL3GD?JG2&=zt%y;v@OVLO>30IYSYv0%oKxkQ}V zPDJd+zOP86Si@LPL3IkC5Khu2nx>i0!QW~p>w<#Xmq-+sp$iy1#2C132|HfU2E|vObAc5`oJ*PP8$> zRtUYD$%Jo{0Cc0IxLG1roYWOV5Nl9AKq->;O(+S~;}ju?v0)0!dZgIPAbmZM!zoBo z2ZZTKCkY`tB{h%H(_C1B0b!EJ9Et`HrsvG6FFhDB#VG3)K|9EMP{{pVBvGK50uTza zP<)J}1e+}z09PzuP>G|=2zpM)EtSBc2*P?8T_6(?atbK)#3ss&5(p5;GGi>v#nC`g z<0sGtVsH{o5Je+tA9&D}>Vv#7C<>XRUW&4x>d`WctHvy={YE-Gm;`H2(%3HFgCYoB zmJKq16e!VL5!idibk7F|Ir@gvWb^3fmqKi_tOpX;|NVvNCY37v3Ai3Tv31zB-JNHU zJ z%}nJRH3{y@^@RzO*&vCgn2bTL?DAi%Trr*^F(}F*cvPuUn+W193~<-6eFoB9z?IQu z+Uiw%0PbT@pc$QyDA`;Zp(4yLB9;G#uBF&Aq5?peTEmcNMi=V!45&n3-awyGN00jW zks;E|W@gz~Dwhpr9stb6Z$ARf!wJP}L!L`2-9gg0B1AQxXu1L@%$8k$eR;tEA!AM^ zwDTlz0Z{ux7%!ppqA-GaCL+p@;E1o`wg3u?r*2%-k6jJezKKAze|jC{h~8Od76ZUx z(7WvzblMWpj!)eTXQF^Bp1%_`d!Ldj0?dna$=%Y>Qw-=wL%nyVTQotAf(M2HsmKAw zdI0L^$ye_rZ~(M2xEX||b+83RYW$g?lyN^95y~zJ+O#9z(Im5 z@K~P$FJW|vISkR_TF|dRDJbdHGVAC#Si_qxlF0(>6=H%3j)rngV?4Ts(Hwca`9}MQ zjhX`UghC8S!#YXiM-wgu4WE{x+@aDn;lN}fRW#FK+^;)ZF!DiVx^Fb38ss^@4y1({cHK`xR>g^|eLB0*%EO3UqPOnnZz zq+#c55>TB{?FLY3kE>)Li4x&t+ltTb96Ji>Ct4oDcnpD!qLCF(ZMK&Z^^i2O7v9iU z1O~3jFgMO0RPwOg7bbxzGZ1|A$|WX(_XR@fqDXG=>Oo6Q8p?8po}sbw6|FhRImis) zZ>^+reM%geBT|uz34(6)^u4aym5~_IfB$JrkK|u7ywvA^RD1ADqz6%g^#F z)dwVD{Sxtr7(ZDO`aXe~ig(S>DWezsCg&P;ZYoE)+-xriJX%BwMDVep^yGE;wJqR0 z-{zv@M+}leNoE9QK&on2hJ9N0`sKGL-wcN_a08d5GGGMWibU85ih>MFqj4HcWAeur zCHF~5HUUg0+VrX3bQ{`fwg9R^2xA*kaubq5Cc=fXP3-(Fl=ACo#0P-%_#Bao;@w3u z>cAY62S7p>GXiXcx961HkGZsydYpE}G|zc5H1O!cq-*(-eFa9Ja4cX8pni751%ptA zQ-Qr9l%#2C;~(fVuJn!|{MWjmz6se~fD%!Ct8F01b#A`EhYSQcIh!kliHsx1ors#b z8>j>(cnUf_(er%IppH2r)G>T*Ogbq%7Jvz6KN^y%9RX#J1Fpv?J4lwtnC~YNX7i=#R)qhSv;5}E#T|tF?#s5VIRDKg|Lv@s z?L7a@qQ%|UjC&s!H=_K1w6yJ@78rMzO|g3zw^RH#GZwdBGVT<--G8;X?~7YCA&Q>_VRJ_AhQsB56l%4jgYz z?mR!e`}{}%^+Pk@;K{q=&BbF?h0|{T6Ws47P=%A{;FICEzaBit*8cwUW!u@N>3Pb# z4STc|2LEG)wO^vGFmdE%MF4OL_WRZfLy)uuP`zFLUG)8K8;Z6*;CIpI%PNK6pFjWp z_#JQKFDq>6j}_LA0$@f->#7jYK?+845~Ow*xu(vvmRUUADdbsfT$rhXFome@701J3 z=~YDC=!;47QP5CgBBf-t@%O!nJK|J(>B7qzo-DR=qJfh+1A)9?l_8qRrTgg$LH<+f z$=1Y4I64F&1XFg%G#Mi*zrwp#-K8QN+?7n0uQ;Umo>$6WLIn+dEKhuK@(E3S~ ztPakEH$Q+|5BcbSjgpoRc{j{Gr1~bUBM5@J!<$}>M|LCqZxJw=hP~N;B4DKwWxxLo z0w&jdy8TZCY((PZhFHG!{OBJD*x`tkYXFHt+toi1Fn`a*ncuev7$&ir83HGAX3?L3 zAJw;Ho)yuKNw*(oC7n7AY$lvk)mI^bq6Hj)Nh$) z>W|N!CS|H_m4SR+93@>RTf$DxJG4X{uQhZy#G+I^h9$T|@d_3iJ=`891@b!omf(|_ z)QJzgp)Hnf_c+1?`jon>5es-H#7NHzhV~G$`_y0;NF5$25$`)ncaj?j!mP?Fm8h-C zYezKfD;k=`H_DnT&zI@_@F0PPdPfDsbj*0m-6FhZuHlJ(j2DE0%x17X_8@)~S4X~S zcusWv6hS{Qhj-0vgoT-DJOnQ{bGN&JUEbHJuDo`4zw+ne-E1t3%Yo$&?X;KQ8rmJMa;YbDOSB4AI7?%b z=6R8UIUKE7HE#v*a!1G`Gew+`@>$#F?MfsMJjPYZdT??T1c<(e)kPP_N{I&JIdsiS zN}oV@g9#2qzWE?<=;FMMi@SEPZAWHc3bpJh?HDgc-TRc}Eya=k2nTa!V2Ys+1s%)+I(J@R& zIVXy^NUFdov6FB^@KqnaWY(b2@avr)In)N>X75)!4mi%2d%$si$KRzU@rX#|Nj{l% zJ|bBNOTrB6!RMe!;M>r_GO`<@{AoLKu;ZruGbHO1yk~mdx#8YM`^KThlxen0x$us~ zNlOuXED7JDYY;w%K?k9a;wS8hBJTnF94s&%a!eG{hnX|_4sKyPp*ZgAo_?IE$QQ9? 
zavZ@OWDplM8hRk}CaEdB?~5p&@()=S60&kM0%jv=Eo+1GD!yCNAFqok7Ar`I_dSj$ zLV+wB1ES(!?(yZG0=-uV0juHnqudS%!tM3$V&+J1X^;9CTYz%4#lRF(0ULEk+z0v0UG$^ypkzb`GE zm|VUFKC|h+hr^NdI@^M&>@+&a#9TznB@FZ^V&HLNuer$B3jIgPVAtr0yZcSLp^Xwv zhF>N$KC%|@_fHKPyV<1WX!Qk6)Znv_)}_zCfHvFUYrSy7$r#|(6MT^oJ@7*_Wqz+v z0_NWRG}1je3jvJ76A5x-eQ~!xew{chpT(mz>CHYbhpZv%sAoe`Hf8H(%Nbtww1r3V~~hD#OL206@4zLRGu=4aTU zvaF%Cw(%%^e81pi<$OGjy)$e(6QOe=ayKN#4bNbGMZb4^x@v`^e1pc`@T2#a`qU|T z2hUyl)o0KTi$#?Ne!Elx3Apuz^ z?Y!zF{qEnXBbWoTuNE|`Gl89!#c6k z)8FH=dxFa67ZpxMS>V+`{l<0H{nyFAHRi+v>!vE4jyuMh+v+C)zwxr? zM{`eGAa`6keT*0|15VTggQ`_Uh#S0LYC(#yXs_X^%yw75sv z`M6T}GrJ@V^jVimL9CY-UY^}6tJW{DactE}xUm-gO2lHLI)m?*zaY+Pdt)LD^-&qGd$$vcL2M&a^~l^;o{ z_MtDD=P4fxxwY{Hdc?_u@9j6ZFYz6`!b+&0aE}ewk&E3UZdu%yCvNMmH~=5Z^nJr@ zZ2!%(KJ?6Yg#t|4035mh&I!U`a(L$Qr9Ok#JbO|3t9LJg(2ZB#Z=P=%bQoh0a<0=OzPKD-{c0k3-Uf>tW4>x3uB}h9NzWG zN#(HGP!{kRNLoBe$9=t{Zu-zOf5x8Vncvr^&n`upZq7tE8}?XcT*$sMPR4|_9>sm9 z>yMJpm0ydZKChmNF9YokS=T<^Xq|^sEhC4UC&?oN4QD8BdLRn zEpt4-K1&O@oElZWocZpr{rUM+w`v;-r9e0E%V+l##k8D?`&$CeN#(ri0$;Y)C$NFz zrTbvxIT3OqfNIZjRRX+J5qH1BiSeG&hP&X;1Yp>pcN*ZW)Vd%^gnW14;AS8bjfoIv zko%gR#FL%rQ6Az#=-4Dmu?e-N;ek0B=Ep@~03Ec-$o{~%ys&rn;nFzSX-*YH0GSYxoeL0CYa z;mf|Tpg#zh$YvP4I9!h{JZ$R^0*3c2q7jXNJ&5@W0aFXVMZnTE{zkxPLT?eUK9zqU zV0}Tw#gTQK_sj8a5wNDMm-T&7G?r1#e-NbHn(lMpV*v5*u`VU> z+a`t3B#A^M3+X1^mkN=@Pc)vwk+Drw-X>J2Cz4l3C2RGEsb(aba0xQ!qy%pzORFd9 zMI_=#5hLI!u%;BM?G$H29?kw#HOW+Mt`xb5M6?N}5DZ|9NOb3lbHYyxqGfl(Prq-F zW<-%<8Ih)pPdKua6!0qDwJ#;uHe&;po@5vl!F4aHKS^mViRBfxjyi^NNrsC-Mp{Yc zazaMYHawg5-dm{@8kfYsO)#d6%Kp%jrp(VJpo;#iR@^K_)+}$d2?kH(xWH=8Na>Uc zZNtx=Gz4|vzj^;OTfrkc2W^7o;2KEX9eMTUH)r~k?OROqH>pEskkI-F6GT- zNdeuPM09i39BsZGsX-#B8-LtZm$l#BWWjDoTt=#2OTVbT5Z z*{fE$F;H0=TDd(JwL4dNQfhfXP*u-aby`}rZ&~%LH1eXk3Xt{WdZ#MK+Z8jaT94E7 zj!`jHk~{w08Ya9a-@2+_+6i@1*U%37c8Hl!`WC&Vw5Q{#C6?5#TB>s;(?g?H=jM>)h()pNGc(%9EO7=*1^ zXsF*UlKvg|5RHHtV?SYLdZO!F1Z4Gldbi=a+n9auga6J4Vd@53-=_-q8!UH==+8V! z${HSgLm2HggzONT8bOU*@N1X;Cj=~krzuIZDaF1iEvhM_tSPz0{#A~(gzrcH9byNU z`xF-c9s&Dc-|{i4rKzl?Ww7PbZcDi)mILXB+^o|5LxTK4Y%~HkICzVI4Ybe_Kt4@s ze#$>>MI&IdyRGAzpB8vNeJT62%+oeF_-WC;ZAqqW+xOGDecNJG+s@t3{ex{kj6Nj+ zKmTfJJ%1r{D)R}72|4#geZJ(uzT)|GGl;n}h~qGZ=cU<7L{FEcFAnvBd2&JwB`#6uko%d%WU^lVzl30EZLJTo1AHu#Dj zXv`=;kL-qYZX-Gr2jk>T&G{==S#tjoj!A zedT5RUOw{5p>OGG@Nj;3zI^zHALBYN{ifDPEpPuwE90KuV2D}&?PSu>6@Wv_G4H-#&&u8Xn%~5n0Mf!IpQdN#gZ?+H!&R(J-5S%(Q^{^o;|P*^eoQ zu_>zuGuijQNW53JCt|U4WU-O^;z{)NW!8+5+YI9WB49%Nt0E6q#hg|pVpsn+LBLS| zCm~?Yr>b$M>Q$$jqo)rJP9F&z|80UP{~HtRzX!qS{yPm!`u`6FR{VtVpBmV|Qeagk z9REcF1OHbI3{8POuz&NH2KN7z0{agKOmq^i8iVkD94Iy!q_rO`n-t>rEL7Gdv;q+t zHx;J4AEx-P4%l>r{$B`KMpWQTtkG=jvx8Wze~W+}ChGmu0sB`3>?qy%KRIA%1nlUI z#eeI7y|@2Q4%kVd+rM$ZN`g*Gy!y*hzm~qH9-I$uw6gSvZvC*7e)iUw3B`Lo(Y4lUg?x#0( zZIfGVxj)-7ia)1UeV%JZj_n`|ej?wLb!1OU~SV-#nRB{{gI}&(c1IT+O@Hc_VEwh2rZbgG&do%rL6ATT4?avOPL9olYu0JN&;9n-##lpbE;=t+m@#&@E z%cYUC<*B*Vlex9Y?`sE}>l?qln4AhrQ7HQ{cywUhjI$dT^XSXo$p=_qgtTU29 zv!Q&p&StXIWU!&)+eeqxfj62TD;HY4Pj(gtKURJJ4F6wkf}Q=pOfW7W1`g*E9Gl?$ zUH~3@G1Gm!e>K6zs7S3ObAqY7wsQSxFeQV?SX0%^DcR{@L1fYn=gU!oh1Bmu>D*{m zVY3VZWONU!u>?PUzrB&w_`1d_l(!y#^Nn|NVGODE&;CGsK}A}oWWn@yrFXAK z{+M8_F=!Kv<<24?`8_w#U1Fe$ht*Cn zzK-6PPZx|piQtsvLb@DSYq*?wSSplz`sqfsk`v! 
zLy!Uz7TPFYB{!>om|*OrM%rBN6t^bWR4v*B({S$>jHIX~=1J?C?5EGrE*)jtmp6`- zj2-%8f}snqv|AIb{PEBL+64R7k2b+p!|8mt{+M90y==E87-ac!a*$AZDQJ|fz(y|| zJpE4-Y})uQ6D(eshi9ThxJmxj1T+3)g1O`K)C~PG!P4+~_64URSVsStU~&*ME=*d; z67`MbtqG={UabUwa;{f~f8B+(^bRAIJ}wzk*hsq)v-IDXV3|wGkb%EUFsDBzm>KVl zv3cP7&6dd@6Ksj_{2bD(fVy7XlE9?;4-?EOkT~tu1p9zdIuszdr58h{UVieO@HLiq z1j`Nfey2nm$f<0Fedvz~Ru&}OlX+``$^456HpPrK!5IITV4PTr1vGz~V0}2ZCRj2| zg5hry%tMgy^XxxNusH5p6D$#Jf=T>`31%HAVEnfUR?5W0PMYsXNkQ67=O!13-oQYe z;L4lV{I@1p8L4LCe>K65$WbC+$bZGx1*&zDB#R(rMZXr~W-5YTUOiu;xHZ8l3Dds4 znYx0082M3Y(j|h#_rzg(eW-1!L}=-KO2tH@rbgYAi2!3O-2=RFggij6Nm_zI&qms5 zcdP$4!A7zFVS)vBV24bCK>c03cQbRS{u>j__tpf#_rPE0Wy$0z`LWfWTNReJtno^Z6T~SdJ zdJ}0P3MgGVK~W(T2_PUK1dwW@7exd`MT)%9d*A!n<$1?B=L=+Ij5)?ybFF#(uiqPe zqXnBD?_C4Sai!AOfL`!*Ob`rg;UC$pAj+L8dj%X_s+sQ%UQ$5mVW-MfH$j;5cj|-| zWd7R(Gx#qPjKW~>32%v8MYzi2>_<2m;MtIsCg9b{t3K3IfbA=vgAXy+4r@*#$c=BX ze@w6q=Gt!OnD9cH3FZgOt`_{#Up29O!CTB75p>x9m|&(p**f}*`snTdG{K~z zI>0J_nP7tO2QiK}xb&8^Z=E|uio1Gs@>H&E{87i>CRhx|f0$snEeXyjq5tKPK2sbD9a}_dh0Bk@f$h3HCK!AGLZ& z|Fx+b-Hu5MYPzInT9{{v(bSeY$d=%Bc~n@fJOI+hUn&3$INGh_&(th1-({p$UmK+t z*{ossR}*X|TJsgn1QRRb9RF7nES>qc3D!`^`A-uJ#-vJjfpjsr3Gu0j{dJ94_0^Z_ z=k`jcSZ4mq1Y`Pbg8jh$pG~l;f0$r>eE$a%ERa??pW*b!1WTmg5}GzY?bp$Z^&9!DPqL9Y zaG*uz{@nx%pD)L)cK??N7R#a3XDI&11RF#ZZO-!Fqoznm`mtc$?krv2f+>blPgy0x zT(2VkX@W&i8LwvlF~Qh|PW~~$#w8()_pBSG`6UH$!CY?^Y+#%-MWW%P=sWdh3Or)Zrk0u9vvGQqCXOfc&%E}98ut(CJyW3I-#PX59| z9u+e$D*Z9RPW;0J6Hmt-FZmx6%(>&lv?nCa4YCMvT>64LYpAUyVznA-&=0v7*8wj3 zV}j{e|HA~k{ZA9j8@##z!MUlmgvHbw$lv+f1T!jxA9R3?f-e_lMOw36calUeKbLO# z3RU9rnkSjEEJgex97l2lf73VB7BSg~VI8nVvPg!CUJNcM^t$tx2{y-cfpn_#Yh@s~{gnlh2SvAEk*uv6iCfEwycIz|HB09{cVDY&oa?Wu!4Uz!M35RO$fC`2&(yPQ)GrXUfeZUW*2(} z3QC3bg)NLZB9EChq8ViYNZKrZ6#KUcrUa1;qsO{IuKYH^zQU1@ZeVjb4%{GE>wlPF zIshcB5mAJN40=HP5n=~!vCN~X^Gcdfs6DQz$p5i&qMNn_#zT>Nr*=8KZmx!#qQ!ZzI80zcQankcvCJ zT;UbVdgq@e*hTsiZp4qX7X-dy@8D#V8dF`$Skp)_lJSK$xBoH0qW)=ul|kaH8D-sq z^hvNb%x@Dctx%)KES(zgfK&rJL=!B*%@lwGfuNQ#-A_a@s#aKxPRCUWo8^~{x~AKrqh@0Gg!F~JJl z%EAbbu$zyp`w{=s;z4b4^xoWO=9&`%BtKhSvfI}Wz~jt6ypl%B=(uGoArJj3>)6iOHJT3B%>xn*}P`i zs8ad$xw2cP$h9kZ5~3_w%g8gFg(%929V)9Dk^Pbu^2R>?9T_#E0U1kjx9R|2(<_9IFdlss>yj^E z#n59d-LrzBSz|o``NiIYU@L|T=Udx^2N7pRnN`huSCEXKNi6pQxaN^}^Q!622A`M* z20Pd3D1+elZiCzNW{54Hr(c<8tkug3;X#G>kB+cjiC}M&9qis8=rvd9%^##NVFp`K z-=$FBkmdJf1-kfpW}W+q&OP3(osR)XF_rn-5r(HxyUob>&}f;Ifbg8BUm3zqqt1;cf)T(=OF7LJU%%Fgx{Wr9O_JVoKj@oA1DXT3@_ zGNxr4riJkY*od?Y*752~zIF%(JyU|>6^?>qs*8)lq%e(;Y3QpWxQ*(OqAaLFusLN+ zz@{9%bwfay45z$tl4b9E$<|*mBXjlBdIK&fmCf7|^%V@<9ZS+zb+ln^og7<}*KP=b|jRkuQLr5W~Yiape zV^Oz3>OZKT7nFwk4@ z9OO3(cIeEg`4aZ4!vOYhvk)Fz0n5QWB%A(@?tNH$j)CCS^))-bl{8?7;#{r9$VN+4|?(TwdV z*c&5;NqT83q%$U);wQg-Mf3Rbvj!Rq7X6zABL>klupd~Amubgu7EA^(jjf+OS-=K+ zlTmz{@dWkW{$DJZ#%~r3`-cTnB4~z@z$?AqcIVVy&}VjJA-P>nG9Q74<~R_QAYtiS z@hVxxXA`PzDXOGzr$T~swUC#eA&r-Fsr4^foRK5fk(2j1V|{ZI4^2(2sI1VRNPBQD zPHt}#f|@8fL54VOV&0nCr`Xwk($yHf&D8)Dsp+kwXJ=lCx1YhwKM!@{5V5cRwR!e06U_a2nway%x?{}76X;u7;hNLa zkfotA=gE!Jt(U%SEWMum?Ak$uv5LD)kwzk{SXL}rr?x!%GQ_OP*SWmUHr`eUT7EPA z)tg#`_TV_U74G}0@uKhLs)s0l!9aBK%IrM}X0>67ri15-He7X+^WDCh zUmnD$223{wE=;_9{Ud%BjRf7j4yQqt}!Smft^ zGn_pP4?9Y~5k~UASv0Lkxp6bB{R*?r&()HbI{Q9jtZg6;>CF!4eb4kKe^t3kvxgY6 znWe6_h1uGN9OHBj4><-}@nh0JpK07O;_^)O@A>4{z17PXp&VjzB*lyM7$z&WL7l9; zz+fs?!mg;4>67}UAGo6&YId|`-y9)8vJXApXnAGSSb-=yCad%)TJa_3@pkA?$#P<{ zrB+Hbx=)mS;JjS@JyMVr|K9#rnL|9i+^b~&9%$x6I6w40R8t9fYbcOvS7<9TLjsN+?f@A*QBP19oM@B`nRvA`F8V^tB; zx5jEORwt{CT^L(>lFjpq27?tqLsuBD$5L;_kC=8F z10mwBBcpYy)s17>4jv=p_@nQy#5-l*u)njEXs|J!AQwfIn9^62I(E*r0OGl1PNCw3 zd`o@5P1JZ~7zqhnTi)g=N>ZJfIIPb;H?wA5BuMtT*D{gNTz9o#g>4fXH`Xz|+$`f% 
zSLg3EvNQT5g_c+x;uvbOu+|J{-}`QM*DSg#Fu0RZZ9ljhEzzo5*kI@!(kF5|Dzsnb z>3-&1Z_5pH$?;9z`9AT+X|A?a5rqO}0x*_n)3lF!Q=!&A!suV3Obj?_8PDemJQk z>kD1!+DU$N%o*`A(C`ZN`^Tq;zfQZh?Df#C^&Vaat}*BaL2M7fJB{Yu7KjGT_l{As zZOK?`UO3w1Cof&jtjTA>{mVUv*Zu4yS5?l(eEK*`_hXw0mW(ytS8zW#KBccvgw&D_ zKm~qBbaDTBV9&MRKh?eor_-^Hy0hc1vagUTA60ZbnQA0lg}D(l=*}{aG|~2$i;1Wy z5xNp=DDec@t_@W4I-78^X6Qe`&SCB7wGxCv9~7IvF-EN0i99CLx|2OBV-8+G1?<=v zp+sSN(&ASQ=$u#Mkew^6!n-9xS4WSr zIViAlh8Lq{t@BvjW)mey#e!{{L`CQ*y)bE(-fE{u&|wp1vepGX0gRz9oO@CHW;rG! zHY;Wp(L1RXlRhY#t!j9Pw-D2y^S4eBS3PmoA_#1nr?*hfQqD8PhcZS~8pj6774oX*LYha*!4xUPb&8ctVgmhyO`Nq$aptG&Wcv98 z%aF^YNM3CTBlJ(`&6!o6&kJ1(M|nqOv>~OEfg>gZ=WKI6q-C;yd!boFiVee-A}P|j z+QPX>IXhk1akELf%d-e;{a_4CfKFe=M~Az~6ZUH=T2)cO=I$<3EF@JbT~Wap^0J}Y z^8|;Qr#t2_<3w#{_pqZyI<4ilPw#QRw8b_kS>2(Co2fF6b<-wshE6oQjSAB7>SFL^ z=M?&A?uXBnPCtz3n04JH73vn_#;eXeuG0&#G@fKouIo|B2o|f}(vorFY_o1GKlUkQ z^-BODBCI5QuTw0ab0e zflky)XH9(o)Bl@C-^RhqX zoO-qbtXw;I?RprhcJ4=DqO9H)Y9kPjI3~8~_=RsaeY6 zs#7lpAg^|OeVM+dkS8LQMz1{koaeL+B(wF1t-6NA*$X4ev)M6no|OaST{uu0C8BRv zJtzzFY_# zR&zc!RypuB-I*UtqmQ7tPx*+EO!#;JJtz}E2uh5ZXQbYgpY^NVL;M~P$2OWhymja0;}r*M+=S^Rwr@-4q?f9SdhPN zA%aGOf$rrqz;d+^Qpx(`Fg-#^5CE|G06oJa5LUnqtc>VRaXT7z_jN0R#d-vKCOD10 z&$rGWX}cHxHo}wh>vHFMFg~nH0GPwGTOkAFC-KFtKM&}fb@Sj0gfTl5G9P<}ir2>KdnSV6k%q5Ih%S=G{7`~XA511J9@?bq-a!iMivt$|wph5q z&JzrlP{@9aqw)APQJtzhQzRC!q{6jxU-0`7Trld40AcQ9FMJken@fQ0G}ULvxR|uK zkl>I)f;~1CPQaO|gD)#RW4}Z=d_BNd*1Ut3gPvRd4^NF4EjDcVX-7V!F z)yBDoqM6&7YPsy?n=n%!Q}6toA^Va#f6cH~b-;vhW%SfOK4NNG{;wfO5_XZIPC-$5JMiH*i+~D33saKl9_{{ zPNO_AP3$PRQwy8Fi7_7!sA3Q0jU5RKhV#g&A|nRvS4Y^3V3-IL*J;$5O$uL<4W|NJ zG!V*&Aqc#sqMWF5C~?oVICM^~}Hw(ldS02Z7n8E5qcA+njEMzAjIfNRm&4H&%C3@oxYvEk#X&t-@P z1H=-Urt<^{=V^PbPrSNfT-QxZ3jvs;-m4C7JB#N+fabUs%>;Xkk^Yt$R1q!jkf5KK zU>giQhD^qGh}v@49Sa;cd5CqZ{cVClE}zhniY+#&KLcZ;nP8HjSQw47s_0e+|!X0+Bb|L_d><;n)Gdi^RF2-@?0t|&9PilZ%>RSMas)6=@Sl{)sJar02B^(4ak1fE!NZ{J(jNEEmO@M>c z3p@Fcf=s=c{ECLSZz z5m4m@!4e5U=y)b;0G8Oq`4)I{3phMYGc7KG=;$D@>dp^Hwfnu9zE_!s5i!!$cecgG5p!^LC zk_6be*+oy9+4>A;oge@vfX1~xb{pW?tyy|iC;5h1;?7-JjzrtO+Rdamcc99enjqCL z7qyCzBBO0N5q6w(C;Q|^vJ+!J(rad@KzbLUY~^j=2SK1ILSrf5kAXpa z<|?nckF{mMkOX7)1g2wiA{01cdnbG^qSNmdc@Iz64*^cQo4_W&(>DEyrVLwd%#vf{vBqmDy@b?z|kIxmnDj;Zswd$f(rAy+ZF)t zjynID0ZdACm)&&dfKk1Lafl8AfmsnVTBbb_BhHxB2WHSWd2mqX(z6%s0eta*24$au zG<2p-kF@PQ3W+6P#!hFQeV6vl36$O!mhKa%H@n`3N`GnZ**F0iUEQN-uHSaN3{^aTM^klUVOx2G$B1IgZG^8 zy@mJ6u@8#;VCqib=^2@e}bhuB;0Fd1HHviTa$^jxZ zDjU@uVcWsi?0k|+jS>n7AwizczioRaC#yCMVr659@ZbVeT4$*ELc@qrcra{D}u;lu%;$QLZ zCiE$8K|hWr1vw@J^KcP^q3^*dXClE@gApfb#-C(KsK#x^s)iZjl zQ#p)QVw?biKrqU^2=W2Ig}6886%m^Nq@%yh8JIacpA4G~^LYrTNzO>%x)qA&K?t8( zY-b+8xa#1f=L}JAty+Mot~z+=5DW%iwDYw`rUJ;w(dQRh#wVUA4HEbrqG;>kCqU$# z1#n`%{<-i_rD@4kO=KT%(zrQE{Kw(-QybqUWi zki-x=V(gJZ-vP`D3^3yZr_YySXc?@P#$(uAVIpv5CCyq32`#!84Ugn?nzio$HP{Io z0eX18&O9(cXF@&nsUQNeBHSq$Jxu3)f)`lqYx>D(!1#OU-zHer(LmlJS~w)$<6HAULNn&+AQ_@^97IFC1&=&y`CT5`?fjX znrQ;N_5@x3toyWr{Gs=u8PTZGXjoetUy~+F zrW=xfN|x3TLywnEUPW77vX8E04ZW2%IqVk~iIc7@tAk6FR92CtuNEgRAA?>d;bdy< zEXGWZ7Wyt!`Q`+Nde%}#KU7p6+r^aSx$K_|sJyF4Gr>lVUlFb9bh=V|b6ATg_w;Ra zdF|7=P_y;M@k&ZG@9W3(7MP`z$Iq$_kezEzKBY5QSATp{LrHl&G%k4;qI@f zuopPDp_0-8Gmo%7wh->MM4`ZkBBz529L$~DG-L$U_{6<36<7pweQo5&hrc2B=@a(T_p9Ah?Rm!}wJFYG7YRU^(4S{Ai+O!imPqXgcMuhwrW z`Rx$b1Rqz}yyT1-E4R`uB-0WL4^7VS$@p1&*>KHWhaeMb)XoX>MM)q&=hatMZc4TN1^E%mz zy*cfqxHCI?JijVB`5RG;QLkbM@I_0udxa^Yq+%{vq`qvxrrufzRju;U-A(-+p5P=> ztf;K}_{nq8r4c0BCssD)O)AAiqmrwJlKX95m<8SH?xE*tAxu-Y(5qe0j}`ioM@Jri zm=$3vEIjkg9K(E_yEZXS@;Tb@M7u5WrI2alEY<<2lD7|rKbzKnf3Bius>U2KtS8LB zAla)?ZjN~FEP@UNS!yIxLAyZ1NyjM 
z8(yv5cjd41U^#|HFWmNIX1w9WD)VOaoXW>uru7L{zT6JNJYAY<;_6LXT076E`>~w; zq8r!jLm6)mF5caAj}Bt$f;x}l<+d`&Qi9g@XQJHX!wIa$xrNb>L_L)b)NhvG0hpo^ zpDVHbw+YrJX{5xEYVP&h1d}u|kRDAxA6_c1Ep^}19i%;~ z7BvfzXjo=@V^GF&!EgA|Iy*e~ZWmKPIy8#Zk$I=DyX3nYSZK%H%wm+t-0l@y$RHeX z#tW~uhi{eB66RD;vE&UR9~yC#m;YawVA`@C=qr4i>jAMtcl zzd5kB@UUdwt8JI^V%&{E=zvOrOI;bb)IHB;g%$eC1bg!2N!zDD?{r@7S=FbNckn?m zGH>SCCZEVk9u3WMe*FY|s+lxiOMi0q&6+LO)85*n7p@Gv`JjJ_Jbkbc_?|En`h0_N zW+$3?B_wGg&{(M0tWfrB0{>Xic@@cD%g=f2<8{KV6P-vMkD4`N<#ht?`RRSnIiU9V#MLa?nu@6WlwyuutL%wr-9E!U zL5$H{AC)>aedg5A7hkPbuP@y%>Oj5Y79#;PvI zW7)mGzC5g1()N$?JGrWtS+?mHI*(+r93xLWIXbSjo*SzcEk1IsJAwK1$yEapoQmx| zVN#fCfF$4E#Q2tS#Yj|*K07NsE}eyB#cURJ@A9F5tjSwkvJ0QJ-r)4IjvBc)>m$Qg zvQqbz&W6cVmQA|XY3F4x#BQkvm6>+E`}8Zde(Xk`bjQ(|9b~lTIQ8+hM*#fX!Ad^G z$vZI!-J}z26p^z1BAAQM@Kr~MN z{2I87g5Ukxy3*LLm@sS;>TEmhuXp=rPLShvnLvS(!H=J#HH5GVyS5M2Z#NC@)%h5$ ztPhA*U-weO_Wg0>bdR@8sg2tT8`%gy3YQ5XsIqWoQk*)vva^d6X&*iAd_`u%bn0Fe* zK^N9O#j#43wtuS=8t><>o+#rBdMoSMJlYblJ<_%J?Q3Su=NS2@ij>orA>+j86xTg9 z%=MkIOhbiy`Gd6w&Y$_&pUTw+MSUrY zgMLj0el#4cCWG}=s%E&pIP!+7OZ5IAhE3-%h}H=J)o2LwTs}@~IOE9?CKTw)*fV?yr~Sm7>eV5pH3G`Sj-$^^D?E$$ zyKt#BW*<~6WXq}I^{%4Z-9H;FFrUGj7_&Ir#pU)MW9bo@Te#;q-^!03wJr{rg(%(V z*BZpLnHb1zcdFS_xYW~lBRX$|>q}MP*;0>43>}d>dP7~=H88+RuHA^+2NN|fa`nh1 z+4%_N7YtEJ0l5o&qPaYwr!Q3ynWGLf!_pW0)MHxdKpRAQ9kLm(rP-U-3BT-4&hY5) z)h;o63LMwTugBrveS$sXfo+{p5-0fD_u<3b6@N*CUPJ5mgCM2~(CkhZpBpl*DvLF? z6D`>Z5~G46hL3L?u2R;Csk}~(v;J_(eulbOoK#Nt%01|pkVN@miY;(!46!Z&-82;nnYmL!c%Kj?M}vMHl(to zx(Bc?+3d!!Zd+)PI&{Q{HfP4}efOm@Ev?g4@gl2=CX8-h@XNjIm!p_UACRDk5sEft ztCH&C%+(v*c@UE+on$$p#xBV-4?2GNL1?m_bNksFm1U=K57fjK;kz;Myy70;mnxjZ z&+)FDyRht0MTxD87Y{`ULV3^KQvknNK@0$=j4@&KPTBXzH-&J_vUVl za%_dpT-R-;!wPkk`_)yFM9vZR6BU746}o*l23I@t%ZJ3)i8vl>3DGIeh>ZJ7bPN3p+XDuNY6^Mqp?7cU@{3;#=p8(#KV4BXQU@7b zUaZ-5k0r<7`%-zIO@=X5Qu?Zocra_BEu#4HiH0LmX-B1Q;G{kouH{XDZ^3=fPl4Ir zSRDl8xl4&0vUuK|`+@3C|pZV8r2B^4Vq`vxa+qQUu|_-I$}X73#Kf zZTe!%mk-%QD3Sr2ALDkruoZW7*FlsjIES#gbS+zR^3k=s8(mL~x+kWr#-ouHcF!*# zZ4@Qmt*{F&T?657>C|-KxG5{alACq${kt*49vgRl#@u~T6IdZ#T?ZeW)??j;eCtUa z)<2&TVYkf|SomDO3zn&y&a8g^g&=g?JgJU;5a`K}XHwU%flJm#mkzf}g+#p|Kk@&H z^ttwB-Oop|3m4F?SJQ1P%W=Af;wP&#-=)=Rz}rpfeA6RVn!>!(#b6%r{5+_=ypF>? 
zjVXFG1HVC6wl|j}3|f1Rvv0uwe^^)4OrurVv28Y8TsD1+iHQ`xSMJ1c2}<8LL0Qz0K2jcwTZIMgat>mhY2@T z*ebmfg!40L?k~>22ufro8-{_x8F+BtZ{hqEu&Q3be%BD%r)i(s9s$zWeYIMVYPngc z7DKQNC59^h%oE+mk&LAw_pn8bge~#5NK_l{kyYr!Q|KK<*0>F)gtN$dZ8*I|NOEqJ zaA92{fnKhiqCo&`G%2LrJx1+r4XwvH6SYet2SQy7f;mIr_32+%UY!Hl;Qin2mH6*DFAhu=D zSiMsQ?B@@nQ2_vx%e^qe#o$v@P`!rtCp~xr?6_r-8f2nSW{xv!fc4E&p#k9~$Blhs z#6iSIao{vAaXD5!?A{g`2)|v7z*}&u@IiS*!iTnU6mUhUJFq7sTaV_#)5XY=Zs2Cq z!VI6jAw4wNj^>LLMu;XUrj!-pyffA^{_c$hti)tGz!5M|g%*ea+z8$SqwV$GkYBrI zABG`?odDbetX0O+h9?%72}^^tzZ{BS;pkcTc7p_VOsfqb-O%x-0pJ*Wi_V=oPXP35&%2-|&eJ+93yHWb?=)_4rxk`m zxCY~g<~!z`F}S2yHw11>(8vveny)vB;IcA{w%YP$#y#YV&|}71FY$ydg~d2H?ZqU0 z=RwT#AmU*8;`R9V#Z*83?JqHr>xnWNh$7S_V|=4PfAYvo^canPf5`ka>VTOe+ z#0QY`DGw`A{BFc|gI$(_b{8bs{`#hsZQYL%XJhu2k^m|?^fZHln0v7T>znU`CBuS3s~{k%rQ;V6CsSiLmPkq~ zW-^z;?5IQtgD=jAW1ck%XX{*cLz68d#>eePxQ04QRv$!BSN>?@#_uhWkcmYjcqK69{#8(5W~U>@NE=NPhp}AFc_DLFzqNG}Q3G zSp3659b_1e213>_OSRh%7%=6Gnm%<_7VhI6#O49fr1g+Y1E)64)rwSG^|WU<|wS z3Ex|!>W;L(l~ged zbiM<M!ILj?<-TPRlYSF$-T#~;D&tvljD86V ziZg~kG-?pK`L2EMISr*+G9~Zl=0X$QJq!izxBsMvK`O=EiYSY-VHM;l8rq?av3%Nigyy$(K)AKYq-P zkuAQDll2!UE-G#mC@I@8VJvcL@mPxYk*9aT`%o*#2L3Q%-R7+hHS7?oXPxJz=W1RN zsIrQm5~}9A$0l6!rbP@)-xI?f8S&3cgpSJf|GsqEk$md8O8K%=0 zTAK*_RjB3K+anw!U{Wo=A;#D*z#@_uFu_dmF#;v`dL;xjhB1N9U-zxjiNf*i0}Q(M zfR&6KR5*#rFjjptS1De2@rpm|@c(fJI^p`$%o%+U$;T{bP=kfbU+vQmlUePP#wtC7 zrOFvX9Ecz}=Cs+HRPzp3Czk-8RawO6W3dtlB@)X;HOr+s!YxN;K3jAGc;F*IjdHZ39{ngtLn_jdlC-2IJt<04s?wFRw52Y6DNH>I(XcrbD|3v`-Xc05d?ZkT zGsRv!gtm{KJ_(7T>IV|7LL_|jqXx&*X&B@23Q90Ra#p=6RJu6z%s{dB5qF}9Ved|^!YF4;f6|5s@D_!%d*RJAquYUEaT=#0%zW%kaTZMxp zBgHi@T*C|Eiw7mJW|)06t#(}O+FYVGHLGk5vsOXJ%BT?-Yj_W;_nF(gR#}P~V4((b zFfDA2Wj#Jr=579qEuUIB0y1L8jkpEX7|U20-~LuURZZ3zys(cRysihuVgeSRp$cjU zk9aW{M($!~mFgBR8T+V#A!dOBqLO70z$k8HsBzOU_Tw5!MK64#1P!W?CMve|qy^ED z%~9#L2DBY6HuJk*QT6w~lp$_>AJe(3QG*Jzc!5=zFpKjg&=(R^Oa-s#4+~GQp(FUj zYNJS(LI1or!6rU2ia#~q5!BZy_f_FOM2ss+P{qJdv29}e7GrJEcmy_n3XW6Un8ILU z6W8z(cJQ1G0k;(xqAzA{vKB?zWOSf_YUo%mSUSNgIy1V_j&{tK zcPr+kyp0KMzTi3*gJuzGaLu|t{y9?9zTQ4maQadCQCN^#?*mkpUn*TC6Hfwr{iVpPGD zSpO{J$~L&et>yj-w;aj_qka`e?rR4)jOo7Dx{-2jao-CH-KICb^BvXTie%m}mT|cX zoa}aoamVT&AdGYk5!L9{d2~oS%g9#dR~e?XQYb? zA9iFq)T6GXYt1yG?~Yrfu@!B14_Mtsx3tthmD8}3z3k#Pc;8xC1K6S+>u^}R+5cpT z^TxgIc6YSLk%wq7<~{9q?-MO7@%Ofgx z=F4^U%WppOj!r{4JTLmuN2=|VH$CaOY>7n&X-+zYN8VB|d%j=%@_nDZmVqJoAWf=L z;DkpJYQPuV6CZhspZxKQZ{qq)J+*g)gOimn{pnL*^7alAx2&>`r(UBQ7sNrh*0Bjp z!0Q^6@LcuNzo*Vmp8e|o?H~fgBZgh$8gKyRhLdK&_<1WCBdee)>;o29+4~islAxaC zA)o=8mdWuR-7pgV6_V0?PYnnGA5np)z{4ReAP4fu?M>bXcA!p4L#9B7vziuZU4f6Y{C$p zK@kMu4o0CA-pe<@9atRB6l%rkNC_s~n+0`Y7FJ9@kQZabAe0OhTA-nlxZh*!j9{1{ ze6gXXRN+{_AqS#R80H~Y)W8d9T4P}0W2m5D;olGf$uqj9N^PcaLgGvi6fjKV=Z7j1RljG z$rz>~86uAw>fxu%f*&@99frjScp+KHfgy6mFk;0n_DBdmMm#_wV1x=2T%0BL$TVt7 zGx~{zIl?EbT?}~^=1_w!bQ0!V#Wr>$-tZY8L`(q!!xkLD?uA7rI{yYA{$f@jq9TG# zuxJG#9wRUQi7#RWD2n2CvCdpv!zbv17rfRh8bU0hqAL1BL&l;MoxvGIUsvqI`CJ1! 
zu*EA3!-IUnCtBnz?8DSpBwH|z)A)xm3M5Ht;)E-QYu~r zIuyYspaYTgfG;>CM{a^4Py*3}1u2>$N9vGOP{K9DrSFYpSrSPhh2lP-16#VK^MQp> z0%c(?rBWv4Q7UFr#!3++3>^vq7L>v|N)ZY;!a7{VFtFoe`u_tR_P{|}VF6mg5u{L9 zTw@?Q%R;fI9vKG?b%N2&W-b zX6mfYU}i-!Bunq0j2x`M8LW&yzya}8Cne#)Con-G%uemxPU0LRcFqbPJSU4W)P~8$ zRp0>45X*LU=XYj>bV=ko)F)=*=Y4)re)4B#KE^k^03Y~5GE(Jdy?`7D4?4^OvG4)n z#KQ=v&I_PJaYhV2FaZkS028zea;_saU=p^}00DAEIF6$~-~h_N0}7N<5r|_TVW@_- z%Qt$>hn_)*jwo2@r;hsPj@l=##H2qY!`iu)4>?&fVE@4gbjxP0reO#v-uzEDh-LxW z0c`4}Su#>WE@VNT!CXw_WR;@t`NM6F2Zc;{QQq> z3L~9#jJN0lgi;M9@Bt;s&L+Ip0VNP^eyTr&>XZPEAtcE)tbkQ)!dLzg0Og~m@Bs}K zC^R;WRjdFygyOGW1B7BKTTp6*QDbOQ!`h9&H&B6SeT6Dq=6}YJM84LvV&+w#rav62 zhc@f8Mk|U5L0L+UKUm~p7)G>WLc|~|xuS!)R{vv^cA#r!#f4_*hKeZ)0H+l3CMF_oLVS8BSKE#7N zq(K^(CRk8HrNn?gpg^q7Di&NS0P)2L3{U|fZOZb=n4)WYov11V=-Q3xlX^?dUIook zk|kl1RX~G3YDEl)7F(b!gf1xhQ0?w6s#SOl(r)i8ppR8tui!RGk~mT$c>yJOZ>D~V z9KgYajsTHxtwZ7ilA=RowU+~tZPK#ixh4!-r0w3$j%dMyBfQtTM$G#}Oaog5tfepU zUW3~5t(K7&|Ew1}z>ok3Z?ay+n8pv)#O(6UYu&=Fm@bCW1}$CktUo;MhDGhv60OyW zMLA>{Sx^EdTti}@0PJ{f58$jLF#nMgK@sqRaGaFy3!u#4zQBQ$g5s(|RVZ&YO70~s z?()*1KJft@B+kon1tqLN4CoNbp6dwo09C>77Yh^)5LDuzz=Wc)8h6ncmvNIQE|a{^ z@WcZfj|w&T(I(Wu|3D~$GVcgD0t&!GJoEqxIFUa9a2ESokcw5_TyYIt@az~y`Jx|I z7%n*}jUy=yC?jq%ShC;-9~5h{s_9z|^iB-?iV_cS`YJCD9IsiO@Y+T23R?yFny){g zuYbre`}TwT!tV{w2M_yj?H2PfSJEYA(k2zu8P9T`01ljj#aplidmhFLAlg2x*S#T? zTWCc%cSS!0W1rk2SA+{p_Ww*!p#@)xg@buTs06W1X~jIJug;+Zjwss{m9o+>G0NCz9oHop)1{uhz#iXK5BM?BesPd2^A7`%MXQBA@sp(flOfvB99#1$ z<;Q;XhXj8O(#;JPynrX)hariDFKo^wqV(VGv%RK8Is43drSqr=b=7EvI_vcJ3<*}q z>CoYWEZ;}Z#1T7h1wA}9QeQPzXSKvhZdTNzR(~~Ehjqkp^*ySlSf4dor!{N=W?Q}` zWfc&-bRyghB)llETHiHZ=k*2E=#9$3G&;?Oy#P0QV{Uql=bE)%C$?fQwx}>`sXQw* z9PKqgSZ7A&WLD-tGyk?`Z#HM+2`gWcT0kfzHR+IYOUoX1XQ#Gmul9woGBq#(K4C(( zysNoZFlw_lZs)dcw*@O-Lu9YO3;a*x2JREn3PbNUawoTP+p{uiMK&)tbVs*zPd9Z} zw{>4Pc4xPCZ#Q>$w|9Rxc!#%mk2iUjw|Sp8dZ)K~uQz+Qw|l=ge8;zZ&o_P7w|(C? ze&@G-?>B$*ArxPc!yf+x6wFF1oYxPw19gh#l9PdJ5FxP@OhhG)2j zZ#ai{xQBl@h=;g{k2r~!xQU-Qil?}WuQ-dhxQo9yjK{c)&p3_OxQ*X9j_0_J?>LY5 zxR3uhkO#St5C1ul7rBuiIg%&2k}o-vH@TBPIh04alutR8SGkp6IhJR+mTx(ice$5; zIhcpJn2$M`m${jrIhv=rny)#Vx4E0YIh@D2oXj5tKgFNuLKM+Hu6GAkkIv2>gsqcE+9fF~oLL4Z89$1AQIDryy z0VBYArRTY!UjnZ`d)fKA5{!W|OglIDL$DjV5_p0?RD%_O0XL+2vO_z$Lmey>`Xw;J zw);c1YybPSTZOvgf*%w*w|j**ctbv5#l7b{Sg30us?7P0>dsz58 zz{C2$7rZwFM!~;BxmUbc$OEUt0vb30F?_na7y7jqgAyEqCQN&vdj+A_yFaJ{$cKd< zC_%@2gRv_CE)+Ubk44H~15>US7 zv%cnA1?PJ{=!-t-n?CBR{#7W0CwM|N@Pk$4LosNAo?AuIZ~HjNKJ8mYG#r01T!rY* zLoHl_EGU2A`?)Aox)LY@8dw44hr6N20mb_RFDQW@$iks(!dEQ2KfFVsQ#>JPJ1?xf zwgZGIJxYjOQgTqqy@UPy5kjbhmp^q6P5n~h$6rQ`Xq0&J_wL}kh7Ti7L@1D;L4*pG zFjRT)P{ue98TkW)uvRT4gfcorh|%9nCkT0JGEsDZF)v(0YXwz;W*l-QYpGYaejR&u?c2F`_x>Gxc=6-4 z&g^TEr@UH>^u+NKiI~56Xq;4Yi}0{gc?Y?1l;zJ8CB}e7x^YaZCnL)I5k(2PHVbgT z0uO9zKPCE8;y(xpJjlR<5cEeu1|3`o7XdSpN1u%JDMk~wGCD{ee~k0TkN=J`x{Rlp zOq=H*e#rX~$RLFrlE@;BJQB$ymDFmChm@#qrZ}u{@*ptW*eNe3$6#uvffDjZB`(C| zf*X@G3b8?dHVbpiG7oyH%Y*I#^Pn-wG&4bh7EE)q!19}m3&m=j&L0?M+{wBB2s6yN z_*n9B$w(!gl+sEqy%f_-HLa>iC;b>_N-CYagqJvmLTJmSN<0XtVi@HI7G*+n;!Hh7 zRC6I$4}(?KiD*?(o;i_P^`eY)^@rD4YmM*L&0KXN#W==LrZ7GHv5nA>4&})ppk}&j zrebha0@6**Jr~_{)m@j}b~EkNNo9TsRn&v5IHVR+PBn2Umh|)|pZ|mO7zmAid|h_W z5nBpaiGiOp_(_c{0_rG#;{#Y=f+7ADL|ge>NS{EhxRMKfKJ%p06o-oAml?Sk3XDLV z5GmYo%XJs$oORxr=bnA;*%MQQ9{Q4TckjWYMHOLLyPd8&hOK$O(_Kx+iU{NdM~etn$g{D1A!T z=pNRST3zzmZNDA&+;xw zw9fUEP!j$evR8p#-U&PdHLziW|^y zlq@+xiAfoQ8;nG0K@O*WC}3o!3y#iga0QC;)H@N@>-^f!vpo<2`gLz z3{BLCa0uazd*H%wgs|2;z+jp&fZ-BW@WhISGKPB`GIaSkgcHDUF-xjakGa(4E_Znu zYy`0u{Qx8^ph%}UM52hBvjP`tk<5m00Ty7`l1?IV8BNwoFrHw@8uz2TCSBq(wHQZ} zV8N2jAqOSz@J0a*@(W(R6Q1$BVec;Ckb6X9C#Db+ogV1SG3?5S0Yif+X7Ua{{K}C? 
zDMlowbeA=P#GAAdCPwP0kiePB6nx=DfG$y(E6Nk3Ar&bFx!?pRU}R#RP$`|}S+j!Fo8F7NOD*u`$n}klK)woO~REmp{J{77_1>pJO zQ;&;8g9}nK2${6S$(G zvPPVfP7-1xf5<~99^nUHEE9)$oRi+Yj)@qrUjvMt=iL-~(6q!WiZ*cj7x=5QlidBbG*qPkiBh za2Urq-lB)c>kj;q*u*G4?|L`Y;_-s`$1@J{g^8?VCqG$%JdTHbWxV9`;!eswUhsYi zoDTBhqsdSf^O&V4-zk@)zgE6+_&ki{EeE(9?!Yja>0IZ>$=AmthI56%tUx@!nZOlZ zvz-ZD=uC~*zFaOxc?X@rHov(XUdD%^DP8FxdDzGMjdXc;h-frhBhG!c^r%U#mVHcl z#02JaF2}p&ZFsrSrIz)qtAdYAzyCTOv5x1d(~)X9(;C>pMv6VQyy!{4w$*TsN0^1( z>^kG4*PveZar;5+0H@>F&epb*`w@>3FIv}1lJbjTY+wqzL&&1mUboS_4RK8~-g9~3ox6RwBfstlx4aq7TyI-L;YqSJnxzqFf`bL$ z6jyb&mv``lH(Y`0uv?O%5yp7$1I_7J_>U&8>5Eqf)9nT~^9~Mbj)S~^^QgryMsB2W zu%Z|b)HXiy8;>K!qusv-kB6ge@8x-W8#l-K`)Yw(B9(R9{aCQnfrRu|b6q^_w)ygi zzKxYn{d@dS)IGc*bTxUyxBuW#M;ztB4U=Chs@>3gH>wh5??MF~dq2B}3;vbDPes0G z56j`Hf)6yi!-QDDhc(iHcK69cT>5xmF@zN#caocn|%hh~FRA;QM#3fqsANzW0`g9W08Gq7|v=FA*Wn zIArk`ekh~o{NeLPN*s2A5Q;)7+JR7vqMD$>6b=vx9ziK`Zz+<&_xf)9h!2L4FZqlD z9lpW&%+D&cZp69*8voqSEV3^uG>|FEZX29}1G}OdL@+60kUZK?DJ)R_kmCQ!MO?&X zNcsWw1Q4nUP$Pz67S6yI41p3dq7=^H3}(Rzu^a7 z_}~nv!6=qM0+AxVoI(dPVjsTY8jh_W(BT@c;l{oJAHIPaFbo~y;Tk}2AL?+;_Ms2| zP$TFt4wd2pk>U;OFe8-DC>rq^GGZO5!45N`8v=0>{c!#Kp&z=z4(|{h=+Nl8!4L7_ z5}l$P{A=Ohp$0UFjJ!~DS&p^p%s&kwl*AG(1Wv`-hWK?G@W7LCy; zg3%DsArDh=DgRdS1({+WdM_670UfAe9Vqb?{UH+_u?zKZ5;2hm*P$D$an8~Y2i@=* zv=I|EaVrWD6^#xRuYn#8(fEw65woogcTgkbBli0UA#51ap!q zlcE~CfgU&z65VhZ?9mra4fwjD`;;#k;9(fN>nAnh8j@icqS7C}!56M!9gxo#s394W z;TEA{^8dIFA9@eG)Nv>C5E{no#-PC$@Dc=f&kyl1E`0$Sl0os7Q7L{B9o9kn6w?sD zVK8&e9|Th)DAO{b(igr#GsABh711!GvJ~SI4$}|#)XyIda~+@o1l4c6G-3wT@-@2w zA5wEF>A?_p(iDeMHfb{|z|JlWvksMS9p;iDRC6kOGA#qsAC)2i%K}Bf!f>PtBK@Hb zlpqg`2npgr2gZOHHqsx;vl^747!+^}aG?vbKybRCBmKcW;WGuoyf|Oe6gv@!-uGd2U! z{QtgE`NU8oZ14k}4;nsH`Sw8@M^P)PF%i3P`N)s@MsvNUQbFai8wRuczH$#?NJBT& z8l(;ycv1VH!AD#4C%<96=%GawR7s%{1aZ_U7|#ZUu?v^&D`%1K;*Snpa~pBf{9Y7H zKlCU_(+xRrOvQ8@kD^G8RO+Y!9_SDss9{Xg&lxQ=9tzYl?~y2VvO34Yb?yNcqQUB- zt2^sJ4RV22{8I@;!VKgS2}nUB6wq+$KtBD!4C+%;y)z3QK?(lg512_K0niCgW6Dp0aALC(4A@taAH6z+D8x_?3;sL;h^%V+Lk*K5qO~=z;VZ|qFaJGJVlTEO zOZF{G6cV$Q2CZVv^43N<@Yv8HS49vU!Ao8JfnA^CZIL2LvrqdPw=*+BaWjHnsnl`- zwhIMTE1r@ZHR45oauPYv95XllsI_5Fw_E*zbgPwGV{_7!QFSx6DJIurvC$d#P+%=^ zDQA&wyS58eNM)f+9dKa?onjnnffa14Dr#Y|GJ+psBqQG8Voc{3SavCt!6BDIA6Nlr z<4I?E)@K#M1cpEfdX^BD-~?Wv4V?B5I@Jtx;05}DezQOxoS+42Uzx-UXMm@C$H7(lRLd(|;FH+P}(E+G>WPx!{{Hc6+Him4SYiE%C+lOeV^D?avj z)6W-Tmw}DWG8K3y4_M69p%_>pj-&z?q?amgW*+qKBewS{;*Kf67b?ove3ih$2xJ_1 zf>tgg4q$;N?!l(G;1cwKP;lWc2&0esfs-@Bll_4gxFD0b;m8JfJk~3D!(tF)kOfUw zEr_fi&a#Ho&^rol&1RD$wF>rx)wD%o~!EnIg9hBi7l)-zk0-D9i9|%$zK4cu&bFtnb8UpYh zuwj`W;~l`E7}lAK3NQ!;S}Ec#A8ghsWCsogdI>NUNOo^6byqARdKmPTNcv$d2Ukh( zl81$un0uxe>}G$Mf*1ao7sAMy%ce4ZVI1_KA)_NjZ~+#$A?U6b7qAx`cmWoOfjH=b z7t}!(Fs~MfAy)c$D}G_6Nv|5{LOx(&9^@fL^5Gp=;kIO<z`-AI;fLNqSbhO} z<%5vp+DD2(kHj4P2!mcv{eCc8wo>w07AyK85sxtx`fDo&z;bO3wtwUS1t)d>9 z=_&L%BTkyGGa@0&W*n-4Z~Wn&`x=l1TO$T}E-nUqInNtBsj;=9wFPjp#c8tVBp<3F zRx~2B{h=6s2(buqJ3)K2r<=N$f*)#Oq@7}U1&0-?H>K61AGm=P#-X(N;kpqdr6p#t zEjza%d#~8ex4Qzo#k-};>7DiA9vI>tmV3F^vmf>tx-&w$tDC?HoSavX7x-8iwo|6{ z?-zg$u@Qqn;wZh#hW<377-T^*v=_4VnC&#Ynq_L z$esnR;2iD%?OlNx(4OnB0PUG!?ElH0>%AWA#UAa=p6q?#4bEW(X21s&;T)pC2M#|9 z&VdsS|L{4X>ly#=6@l@6U>BHS@(KS9CZ8D=pYaLb@DX3}C*ScW|L+5T@C%>u6QA=N z|M3q$@+ZIYFaI1epYbGj z#oq^PU-_SZ{iWac(?9GNpX~X*?*ZamB^Cr(xMEh2#g%h+5;Aj@@Xom_2?a(xm(Yhr z4<7t^{0K6n$dM#VnmmazrT@y6EL*yK36o_tWie~oyoocX&Ye7a`sB&+oY0{}iyA$O zG^x_1Oq)7=3N@kh3zz3IS6}Pe zy^A-m-o1SL`uz*|T-dmT3mZ18Pu9J|j2k8~BsB+U z(WHx-MlDn{a~~eaWQBcB5u)Ve%h$d_hPe8-{2WK4xjw%9`HhX6zmK1&>Rsntb?@&( z2`oU@91$!qqeu=V%>TfjbfJyL5J7j4a~)Bl@kh=KA#~Od3lpBR3q?fmcMc*mD3r}P z*)&833$a~6M2RyrGy`2LvOq)>G3Li(k5bVGr5Z;i 
zT+vxJm0VFm2wQqU;xUL2;X#arG9#Llr?r_KtZ;fDOs=Y3f#!I}r0BzjvU!jV5hYL< z0-@9S%0dq35dX0uISY=;ZGO3Z3hqa+_`w)1?}UqPy7Q5NUAOMqrJ5_QokNZbtWo)D zio!&TNC~!1%S@c;k)a5$50dB{IRcYGuz%wTBMdnTc1CM90n)qet#?|REQQ;?3-WE> zs@o|crg)?dF(2vUi7aa=Qw_;7&ukbwHs6eM$U1kmA5qNIx(f>)ki%t)oOp1@DSeg+ z@X>D!#3q?;c6SG@Z=B;a)a1BW$%*7>he!z4@~Nw|50W;6wZlf?L2Ns}9oWb;D+vb7 z)%d~4N0BJw$E9VB^UK_V5B}3Kh`5kh;)-hocjHs_OcWU*guvY{Mbk^Ai_gwR9XV{* znvKDtasR{u87X?s5fLe#Lgfsn(0fG)T^^$%+b+h&IfQAeh^>KcI}SWvauZIU9$Z}F zPc?aa}x`Vdcj^)8L6ctYBT8hrOrg`O1nN|BSIROA;`{!r%=mHtwQ5DVsx z-p{`kGjmvvWP8ak?~fnqXaXOx^rIH?z@GsR7?NdVh6QtIMK~S^!3c(fd-SW|{B|Kb z4}{=#7VIEZ{Np+V&PN`E7=tok!3$y>;tBzvNnCnlD4$Er<1F4#84{m3HyTPJhg6(kljxEk3T5LM$DoiGzX--KigAo& zEdQe!$;ZVrs&S2MJmVI-*v2@@agA@BqaE)k#x&ybj6~Vv7wbqy@-U-Rl1LpppoqYE z)IxuVY-G4-#ey4F5r;eoMN{!@Bt}H zxvW!-GL@n904rUIA5~V;7(#=TBU#c9PXuly`p^d>adHnL?c^7~`K2>CMZ-x-vyh~N z4sr&s8}3IVh)4tQ)MlC1#?6;uy>^_WiI>INp7~YxIArWM@z**x&W;q`2{ST z(AA5wB&;AYMI1CiBwi3g50UT(EPkO0H+VuHgy4oB{BRHE5<)HhV6Jna>;Dl`=s^z+ zT?sR3>yKzO6G-BXq0+KR*Ys-hpC0(ER|dk7Rv3ez(liEp-P;xTf^@#=wQpwd%N6~~ zm%fwqFGm2p6#?@%y>nP#b^4-U0*~~*e~GYq8%(+f^TopTohKr|`;lKLfeTi^23W=F z4^wb~7jY;qE1YnNOK`y-gs9XZpgUquprN?GXt77`k>efvgArUX1tG}Ql6=Ubg<8eM zrt|?0Z-8VR=Ms*SKQdl_P?>OuYpHtcTheeet8Rs|OqhWDJg2z|^z3t1`09aE*f+D< z<~BgRjTLZS0n4w0o6-;IdQ|LnpQm#d-HSrN3dj!}Xx%H2gF!H`R!6T6fc1vJ} z)>{0cfZ!OWKZ?;ab&g0~+f*M~)3=g!4{SZpUAH>e`7L&&FBZdBB+|K$XqVAs_G<_*9u)-ToumX)ie)5!;9M>ZE2e5(64H3J%B?QcLHJTX> zeRyMu#Nh@vY9ZzDik|c|;}YNPuFY<8MQ8sv~cCwpM@oQ2%)(M+hT!WHF&g{XkpZPz}X! z5AW~}@=y&ivmX3V3;G~|6lj5SgE$g4ei0N^2O<@N#1U6uNg*gfBZzKFVS=ZTf+`3> zELei2f`YH7b~G3jFL;A87(wjCI8gy^;^$@Cr-M+zgE8oX@{}1t2o*z^dqtRM#^3?7 zg;wKGf2o%f^T0^TbQ8(67(Nyg+%R|`<_#3L7FC85XGlFCxP&=WQdM9Mf`VWE!4Wp} zh9bxVa!7|xVTUy&g(?__b7*}}ml=4NhYvJ}eb^L#h=_@33=I}qckq2jp;P3Ph>KW= zSa*nl7=n>_8sfK|H6GcTiM!h=Y(Q z35BT;i1{;^wUkihn1`8|k~x`bR+*5gn3*Ykb1+4Qc?Yh5eB~&Qgqd)Ud71gSnQR1i+P↰L} zp6;oHK1h{Mqz|0tGQ?vpGE)t_Q2!5rIT93D47xQA#h``WPz=hj7~BvITIdfH2y@(U z4-e>tX2B20=?}c1hWdb@mm&*9SDol`os(%(@Hv2!s=nuqy`M53SsXizYB3J1K%gZWE+?v^aVn>CN~d*dr*}F@J{m*wfLm}i zI1Gv|{O}6|S`yKjYZvtk#V`(Fs%1;cWb$Bhh8mn53PY317GwGmyr2xGr)7heDIPHo z70G078kin{r>Balsj8~0%KxfUS)0xTe}{)Z#2}%Z5}aWf60oos7Ud6rsdLRF55<6y z9s#N1va4J=m%>RFn3|kwcn=%(4s123A3>@hX{xP?t=X!r+sdsls;3Bqg-(F0Gs6#x z;VJJR3u>4W%mktrr3~<~7|uqW!&Iyu!LGG{tcThb%-W&qs;f5#4oI3j9aycN0;k=I zunDWM3(K(S^{oe_2TH&{_>&f$;tjFZ66)Za7wZrGpjNU#6T?`MRVELtAPY;Ru}hk- zmbw=F$_rl!mW>LL?Ewoi_Y0#+sGV}K4a>7V>$5+5st-#XMS2s;pafRHdInStezPRe z@C!t%5)?Wox!)+m5fAB<}zQWU)~H zCqf>&B*fsSXCZUMV6KGWvA@8#4#~H_P^$)et!PWQg=@Hnt9C*g9PfYzT%ZRTsuQ)K z1YiKQf+Iahq7S%WVsSx$y0{i&i@2dHx}%G>iW?k$zzSlK4_2TA%4$GWixcV)wqr35 zBkH*%1G=QEySvM~+-kbO!3E943UHefo)Dyn6s0$zNYlC&>Tt5Ri@Sr{yVFa()vKz% zyBoa*YtKWxHX#pvibxnJuQri^Y{3n#7s%s688WL z#V|ekP=?SsR;Y)cA71@Q=TwWW+GOF~JYYa7{&x8#YS;Hlv417Be*ua;=Y7^x z7-%dKZLGXp+(_eKmpd`LE&-;M>&Hrx#DZ+emrS^X%oBtF3wodoaqt_;qX&dAJ>%d9 zZomlhaQ_IA;0bZ?RkA<``DxG%Wxroyls;7%LYC)evK2a5D2S&6;q|9%0D--~>=?(Mp}hP3;G0 zkpFR}(9tA;u_hf7p{f%PsMYJs(`dWWE&b9oEz>we(*Q8g5cB~oz0+ZBwg#OOKrPgl zaAfmfm9pEq{*Vv+u+&XmUEU1U9?{gSzy+I94e47F+_1TxD-Y373?}9c|CkRj^w^QD zY3kr)Wi`&q;B4R!t$2yZBayscEh&`@*WEhSE}cVVVAeD3*30lf+K|?7o!USv*Em7f zF%}K1>j_ryYfd2C68Ye|XoZpU0L7Z^5$SAP<6zy< z)}_Qi42$uXN_um(Kn$?(t|>7 z)-??P5AZ-0z|uAC;Xa$&HWA=2Zi_8Zu(Z_=r?*j0nxGz0_jzg|Z->~i0yO0fOt>RAJuq_TQiQ&2YP*$+;kF_A?nil2- zln=63^M`^6}}eKBuA%L8Lz793TNTP185P>b5QH z+Y0NR!mqch44TThvQXf^E*!!B>^e&94`l524FDWK2es`A&ED<4>)*)B-r!XayzUbg z1@GEk7~G!jF#7G`5bk_F?zGL)4G<3Ke(%{D?Vh3zvrg|Z0lNtg9QHo&o|*6a4nZ`{ z0pY*_G`-IP|M09@@Seh<3}3Ve5AuxR@EgCBV=VzGpz|-zX5)cjs0019=Lk%GIUyrI)&nd7l?_AHdONRF( 
zkJ57Q_xDcsl2TrK4-+L#`1RiRf3Nti9{7<`uZIuGSl<@1%k^Hr_?K_VjUOp`5BV-Z z4C)LQ%lpWkZuyvR`eUs5OHvJi+xaN5-h{!+Ltpx*Py0>0?vC`)t>43W>lOiA`H5fq z!7tCLza+IVow}bANM7q}@xl5a58R-J>R_S2KKsIN{q9TrO2Vng&k}O(7CsEbzcFp& zAgI(2{MV2EfSmnF!m`}262(vr>Y&(Vk;QOo@OiuqH<50)}w zTg{%8dh+O6wr$<2;{6Es@{C)gW7^hMf_zX4-{r&$3Fu;_+%g?i70;{STZu0Smo8tVrCmO8^q!1x_lu;(M zdB%CCG=KO}2bX*pj4&Do{Q>74W$sbtof1zhF&Yy;^hTLz?g3Fm5@kHmp9?eeN1b=T zIk3XU_7gBkC6{Ef$+ieIGO=-5q_3%QWHHa5UkJKq!YGZ@2bOWhfkhuOe*p)Ud%)oZ z7Gj9OQ65~BE2b7=#tKVL<&wLwOn-Peke4Cbd^65D1qyS_GOK~5G+4eQ3`r*^rLg zu)%e&d-9bSzj|unho4&NQJ2hv^LbQOI^&(U-aTtsEJ$4Z$+kI?3o>@&jz2EgA7(R7 z>$-=320Cb=Yl@iXsO~X3>5`OzWh{S2gXPduWq~EIUm9&SUU2Aj7wgej0ehT&cwq&@ z#fqWYYOKL(+Ps{1R$3{Aho-x3yQ3{SZvVagIe4UMkls5Zbq(T&Rgw%RsPMoOoBM9Z zABWt(ycee`AHYXark0WsvYa4&Sm`^Ycm5QecRW@9AIFcojmx!1=$e^DvMD1o)wO4` zsU&;Ly~ee3MJUph9U{t}*(;Z9DciNL?OV#v@2_+IJm)+dJcQRZs-6(;rQ)06_8m0v}nr%P)^tBwaU5&f)0hS_I|jBv1+r;&Ad9WGxV+2by`xmG3S`0vTiW3B1zvincN=Wv z1uIy6ys9CJqfRu6gN(ChVbq2&e0GR7?{r|ZUZCRFHS}j*%t<#`y5%X^eg8i@lk~ci z1ZgHiJXrC4<9>m<%*H*wuupmHM;3fy-}n;3A%cHb=c#UQ8H%oe7cX#}$mb}q?=xm} z=?-HL`ub=JTvJeeaf(<$hjQ@?jWwrJ)Tep-6>N%d5;Lt=;tiZ<$>)yixlxoXH4*uE zzRi-y)ReLO^gbKrlaPd`WirFKL%L>Em(GRB=$Sebe`_0uM8P9&|3ECXw9`c{CcWtq zTemDzpW~voH-jZmk+r6RWHl_g>tADR6)6Z-(yF@_G*K-xP`P?eJy(DK`So?dfyygq z`D~boooKqEEmsZJnRjO0)x`F!o+4hQkrI>d;O91O+OqMN^s=LPSdJdZy1|WJrbFII z5IVI@8%F!xN5Fuj=*DBO`pk(Ng0TwJ+LjH*IW5kFkDni{YqOiYcYzdL^s8Az8cYg7 zT^!|*$A;`Z$|Yt;e5apXSkq=qKKh#qcNc%vvwb|?_GviGN$ZLFl-F39{(IJE(NAnv zl*}6@#y-`Q*H#1v6?E{jP4@IYQ>;JS`}LGm9GD`fzVf2I;^o~PUdCP+ zLR`FbubwTkgmZ+4*!QSgV(qz}Om#mPyYr&FZ;AKaQV;9TJdut?IWTE#f*-&TB5Nxki%_rOJvlae3mQotey_1Kyoz8hu>$@kMt9WxQnQStdZ~aC; z$li-|kuMD#icsw1`JBD5@iXw+Ro+9zx*yAY48Mm>7kpk_9$M9I4IFii`&_V4<|_K~ z`!{C$nRt=>(tGfC;QhWi_&|pXqgDM7Q;uo+mkUX=bF5`9HQ>LehfiGX{o{#ksj4l zkE?~<^P(<+UmwjaWy?2>GFJpSy3sFVUC$Of<$iZN^QxraZ*$gJ3}zyjlDs&J*s4J~ zi2rgE)2=-rb6&0+t#0;VIqMc~Ke7mw;Y<~FZ4_gG3VGDe&# z-`eHv)!xZ@V>iqrDgUc-`ch;dy?BlkTdi_sDzEP^$~49I^8O0^-RBYM>w*q^v=*zl z0w2>gQA%)+a)Z&Qp#b6UZ*I{p@w5g;%iC@jiH0ro?;*n7hgj zdRNo?Q}vI$4gEdsSFx?fawj*Bzr1U9bPE6T-hAfw`NGGRoz|=W>Qcjh>o>*-oNseH z|IzVxH|ExvmwMvbyN3=Myw3y6-u?}7c=soY;Z}bF@=T`HTOg91CYf7P?GG~_pB@tR zI~tMJMxCa1@ej0e2XShgbW$x3D-&4&w`{bDeR}!j*Hr6yBof5ZHZQK0J1fKRxkMO5 zJ7h`oGC_C(o#b#D??!5OAioMZjefYn>r8;!lHh)%b}OrQ%xaMT{r)BJU`M=EQwB| zK4YD>O(c~nz81sR*(}u=AcGX#%E{nF&IEPp`*xW}X~CkRLq(85di-ze;qlU$NG`Z1 zQSh3emh|%5XwsuD3#V=uUu}~RmcWz@#RGVPCu<~Mf;3~+RVO;l+k}uQ zHrA9^)Rx_@H+p<->wJ#aPFuj=3ufWoh;abkdQGvlacVVFv?DO7W>oN&AYN}94GR~Jx(dW=F@0G zE}2OouP;DtEs*!))SLzT^b;Xco}h?x6A;}YvYDAK$gMMqp!5~f^6oYw*B!A^3Ug3; z^O%s4KwuA2)K1~b2sG4xn3B<|p7gl*rI(x1K1TmpaZIS2{vi-o3v^ixjMxVro+THnJky;g;TVAgpov2rUi8<=+iVA&`b|Wbukad0}ro;UX&1MXB8Uk^4k)_wuyw#{n ziSpuv8`?$PnTLm6+{Z#Rh#8NIp&F+&&UI0D)cAivzOB_q=}JWxbG-B-3MB@`L_Yc! z*?uR8qZdh?w5;2VjEhGuzz)v?;n%bINVk)vb4!q%F}9qF@1qdU$yzHIxC}Mj^H`V_ z6XIYiCj{LwwrS8N&HU6oiQ;=Nycq6I=4jzeoh04gGhz5^b1O!75QjD()DH&}hsAOS zKVaa0Z={YBVoV+(pCj%M55#z3;GOk@FMACP*Liv&8Q*Tijz|n~I%`Ze-4{p0j7i{! 
zpFnquVM3MMLTZr@=RrXfh`1fZ76W25g$Z%Q3fvtPGKGo9a!I+-B%_Ftiu&Pod=E=% zLliQL=3&p=UId+9q91{~Cg6(#?<8X2?rOtD(_HR$1d|bXzAuI@!nh;>=1zb=3&h~t z@W{WHelNpQcu0?Ke?_^}9g7DRrT(2nmG)GWJ^7(Cy&w!k)2h7Zb&d69-` zG1UK$V94b>aZqg-PJ!GgMhKDMckLiJU^E01VO|m)N`c%$A+C^FpL-hNPA5Fy#wY}` z_*EL6h@d-O$Jik6TLewwGBa?e--2QnKTnDmaGsCWy%B87B!GcOb|{1B33w4L>ZCnF>*=TOir zjI2|9I$407qHM*uVa82WyQkN)qUNl>E3mgG#v1ZrJTNowK2Jr3!GkKTGZ_&tw#WOm z7=lXCHG-B8MTe1dwTIb=bRmPFUiQ2dBjbYUax9G}h9Oz4zqQ2lT{){mxzc+c>ape+ zHXg*a|4du>hKgBjokGkU)GlIKvla&7w=fZ=+cTe75tV;YpN=3^Ns(3rxRkDS0XggW z`BJTx7R&PlCCGnzQBBt2z1CT*XxGTsUYwjb-IRVU!tX>>hvame=U`D#47)5{Ti>r2 zZ5Dm1zY@0Ro}D4~_=F3(CYjaq);?vEBzxaW4B+;-%TFJ6)n!|}jh_gc3tE{BLef9a zjOjw~{CFTYwDr~tfoykAn!B4k&-0VR<@(bV(Bfgtoq5pJ*vL=-e$_L=3Wy8}`ni$x zvvMUuhtDA#n04?3iSa}hDhy25a~mf)7;EKazkVB#Wkc^m<3~33lR|c&N=Us9-Q_>Z z4;R(H{EB^x$d%;y%_0g>uYcT|p?J*NC!3V?R4MlrW6cW8HOquW{Z1Ep;15el;yK-e zS7C_JZO3KRWg4roT|opgord0n$iHvx1f}a8>(5qaZ=0o=+jcCLqHEWB2pzGxjkD); zaaL8vpv9>B{-*NG(R7){-git^GoyiOX9ilvK1baxDxRM5YCF>7{_VM7X0}RW9O2@> z>WW`=OJ99d=;B&4eY1PjePZ?L>gvsV$PNnr3tgOt@S3;Wnvb^Yv+mfr2X|r~AcC#$ zyrtt!wirOPwUz%c`a9SLk!>Sac;mg?MxORYzS%~B%SNI9MiG9aIDMm}aN|SGMrrp( z*~CWq>c+?8jSARiCEMmF;mxZ5M{R6=HrxE-vRUK5S&QGSOW&+7+-#`XZ0z1_n%Hb! z-E2ADY=v#Lv2C>rZ+(^9>d@ZmG~4QO+3NP+>cMaIrf>BXZuQq}4Rmh}PHYXWZhbr6 z8is9?*tSQ6w@2l+ziV%gnQf1|Y)|-aPvWch6xIfSp2hg#wbN zfORMka|+az0t=wPV-$C9+?Fkna3PiWE@!* z9a+^LS@#^-OddU0JF-1FdI&$ZV?VaPa_k^~?5K0>WPa@Idh8N#>>6|ImT~;3==gE% z@splo_sQd@YsVfZ$Isv=p6n-HS5Ca;PkeMve9ce%Tu=N1P6A?10y9p6icW%SPeOW5 zLMKnc)=r+AA7TN3?>`7N4{#1hTnGRF6hZ>UTwD=wI2|o5Jp+o7fu0G)z{;Fb=&clu%6~k18wy* zZN^L;l##xc?j1w3dwSQ5k}<}X*2Z^lSfm74YIRvLI@_2jKS=xhkgeSAQjZK@L1;Vlehlv&Vx@ShMq}nn#e0iX z`Chm5FCO++dJs_H6Nrrr^lT4Q9u86^2Fp8ym5zjIZa1^iokMaQ=W={LzDxT_^nae~ zTA2H^Dc5H#FLXc8BO>4ZZGPpi{Lq*}uUCZ^$|3emaa&#S+xQaSpCu6sC6P6y5q+iU z$>kw$%7&P$G_;d8+6zj*W{-XSQqh^a z(OG)Zl~>iBH{D&o&{Ougw{W=k!)9+qN8eaYf5}{b(^h|V!(jR0VCBM4TmQGoop1Hc z!=KtlzI2VO4UYU+8R^&^Y3>?p7#Ld`9h==6>pUE5{%@>dZ=$1jvgP+=`}S1-{#4K4 zOvmq;F4B*l*&p-!KL*Fq8wzjhRXJuk> zb(_4pdb~P&wmSWDZTf`#b7g&Rd;RCx`rP8i+{wnme_Knd+Y9TvEC211Hz;d=DQnyN z8-Mn<_6~Orj`z-v_mBTOI{WYV{QN@401)TEUutgMN@9B?f?3R_w=%aQ1|^`Mt^4VH zR|1F3;~%}B@_Lfdn(^FvRr!6XBE}zV`l<>BGNc^)v-PSAhq4rWHh%O~7Y)C=7;odz z|6Dwhr<*GFp#O8p_aaQ5{#*SoAI3{9svpnwe<_`;ua%UG4V$!-IjE^4VI? zrT({fYCq031|Dq84b)crYQ=(Sc@63+7dm2@#cc=cJ}vbS1n%S*)K{$xWXe4GIapu4 zHcZq^;Jw@MdHs8_ajET4!hmyNwqmz0POz=TKwq?yt5;+ROKv>h_j;Q^g;C zYpOq3Bjw$BcdxnOXmhIi$**tCjVHVF9SP@`@3%DlcR>DL`f#|V`Oit?$bWC{x3>KK zbGWeiYq-^^gb{#=%wr)my|=>=!o_GUHx}}IB+y+wAH`y(fMP?rC=jFA{K>h`IY6fo zD8Y1HepX=`#z>T``{_c0R3ZC&JeN1eQj$Wjya4@`LUJC;1TyUqrO5VZ*-!bHxcpLe ze%=Ww+e9HI>6krQ@s_#1;KZoVRO!rUVTaRZFt&ZP`nqdKS0Kvd{rYNFUC<{NyvS$j zwYPx{>3110$8lc~;%c1eH1R~N#=Ed@?IihMX%OTM4rlPtP5u$^-8@+N(j8H98q07& z5MJzqhQ|DSg4)AGRE*m67&_|qAK*N5NOupI8rD)sluFdCNW?^``yfO*%lMyO2eDC) z$n*&3$t4t`ifBIHL(2Vj0AB?3MKLhL7e9zsw6HDHVr>Wcgy;o06C^ae9TvA5&1T{Z z%LGu?7*QV%P>I^<;Mbn2S`E!UsW^^^7h-8`)qUKVm<9%+otrlO@-GG%%WqLkR0gEd zrj#)sKbVyDtNs1g*jN6KxP;&eC2OEsez&N=g}VRg6ZjT>FAdPj&POr`6*xJ@ULNb^ zu2%aa@#B-)7=}smYtjdC%yscAqmQoFUze`BHI1o~xCt83j1#LE{3PDwb`TRMwkh>F zPT-r$?TQ|dG@qR)h? 
z62Khwb5x7l3~N*~Dwqm^c{|^h*LVRq6RJ*0~VqmdO(uE3LpHU`GXO-%}72w(f9=zEymu$Es^f- zM$f)EMfeaM16JB62#KErRL;HDhYoX+PUe{}Z-=rbTe9#4Ow%{ zcoAZ?WKKAoI<{< zzZCGykmsa6M+a_6r2&%3Z!S3AYA@{ky-+0Do0cdG8c0Y*@RY=HN-QI{TeiUbiW;b> z(T>YIbNq~9!P6`%%r4Dxns{HuyGy=^)#rN|Q9 zpu_U`&k-s1_tlbZbPlu$O+p|y=*%GOCeT?vdZZbO=U}>dRwIV<9J|ar_x_JYutB(* zrWcjBt2)(~(E4A$bQ~0Q)5KonW^W1X?^LN)G}nwqxt;efZg%WcLo(fO8nt+)rw_eI z4CCh$=!{pFVD-W=rX5|lSAkD3NK$38O|G5yn9LKaj7=V23_s)C#a>KviNRp7-I;z} z2N%z#4zT7L{y`Yh>de!&iuPANU@doGa8ym+6bENe21Bb(wkOR94ucF$~TGR zTlN1E>nubwQQS~{{iJJh56t|S6v^MYU_z{ZcS~wSLnv20fFl=PfLN?f4h12f432lv zSDEsg*P-{+IMt)v3Bux#%eT@zSl+3i#MnFg1(){Kw>~pSIns5zP2A(A(LxJ%MxHYn zP`=7Hw69XJ;yjds$`13FjdPr&hOi%BO29j#mY*)tnoz!SgyLQ(I9*k%_}wQQ)g~i0 z|DW*Qyctpct$-N~f_77_V+mpsWr=Yd3vd=GG9P+e5Xhw_9or*98x>)hudi0%{^esp zdD^wk`N`HA+yZZBjFDSv`iIpoRT|~>szJ%?x!3gOumxJEdcLjK)c}yiST+N8Rwu;3|duOBj(82r1_OTkq8>}7k zdslu-!xQ-K`c13F3~N)6H;7c1J2Jakke{^650@|nQ%hfDzw$?wVfdsjBb4OqzjL`S zI>Ee{v{r3T8U?b~uvhfIOTLe1w*xWjt2C(jug2c@^`tkTK$%%&V^}N8yjw0lyF6L^ zBJjCF;q-RylS#CpPuB-oO4m*O+(txv1Hx4XVr#n2@HmkQY9PYJN)Jiqsw>^55J5C7 zTgj?3-!50(UsGfD~SKuv(A@BB0jMs?v4{X{n2R-m4!qwZC`B>_uIbOH+C3iV(rj_(#%D zMS`nQ5$zH3Dw@=ii=oIVNli$QfagsG4A?(U`370(FoGIaeGNS++0GAf^YqtH3x4Mq zh*^AYCc?bjCDzaZkv?(!Mbtxf`Tg_MXE=IxbX`xCS$8{KSa}Gg^50j@rQ51j0TH5_=5^oI|59kY&Pn`)DvgL(8C#0Qn6dIj z5tL#TZ)FlEUss&F9;d=0-n)PKtT4_X5Wl1WeW7-TxP}Oh6g5{!5KB?0rqGOllr7D` zZ;h_Nao`UWq%c|iTo{str8n(`>_kw*NAVXexPK_HR<$3ec8q_owha|h6{Q%g$?&{b zqYo1t+6K=e(cS8m)#|w7ToC*8%9Y#lpp;mNMco(lof2}d;<6B7J+hYdqjdkIlCx0a zufB^HA;Q`<7&1E{OTBQpC@}47=g%A|!s^1dp5lLi5J-ChsMu$*!;(SvPW1teNG0e6 zNu5Li1wuu?DWDV%Kq40uRTMZ>%Da&nj`B$gJP&03ivo+=AwtH~r;NdSUqIfS@OKo@ zAQM=FTIjD-6!Lup4~x)6MJx8PX~C*$$EE#4iDVJPPp2e*ZqL1|Pa zV*k_#^BM7W#^NdL;8)C1Ef-MiHczJd3_iYRCiAqXs0VK&V11PgZgD83D*D8z(oWp5g}GY*o*{tRGpd?O`3Nv9aVbAk6vR|=G0QE%8R2i z@~=sGdubsP{yP_yLR|5DQnf?}+J~;7RDnf0HYHFfhMvWWUXB%=iJ^z_bN$g~EFkeu z>M_cCz6(pqyD~z@OY?q!3cW(e*Kf!-n90BQJ0J5VKXab`>3^IY8R1Mgu&NTH)?;?_ zV5St;{7GECp;e(vaG@TT0@oYPn``gY*%__G0z$DcEn|hWd8n6^cpUnM34olgq;A1N z*S;f7V?@KbiXAU_&e$Tqb20X(G`>ta)Fx6ma-~1dfmEuJ_;uez>n{v1u+Z`={w-?$ z4sW>Kf0=uqf0}?S3I6bH{^S0e{O*A4H&o#j`Vvd~)XH4`YyHtsE!7r+5$y_tg=e;w zT^V&_Wd{~iy9oUz#(pLie$wOIQt1yf^nT%adv9Dme(}>HTcMJc`BhziW(yqoF}Ln- z3oTT&-fyUgQic9`12skoF5L)HD9sOTv&%?{28|$D^-!dFKNtUpo$qNhENPe>5w&@x zQ0Hqeub^0-Sai=wvJ5!lO+R_@eV(q2mR^%pQ%3b+`Eq<-OQRm7M%DN+OKuWw-5Zeh z3yhJp=Dwy3GT^F^E6qPLD%2xY-`SuyX#0?ApHN4pZTER5tHw}332W7rz>JHKF>0F8 zGS%QxrMxfJ-zyV(>RU)M&zHVD!`0Onr+}#xs11vBGwl^S8ss9K3~y_D93tf()dnbY z2bJc}s#WI_@@J)BZCL0-$}KmkCLg;dy*D&jmB_YvgElqf3W4S!0pe4ckDJYxv9ice zPZ1IL1U?@Lv2pYXs!i_bkWnDfovI1j>Zrc4{$krzDWax8*@RXQ%X$`r{1%#(qEjHb z4Tw)M>If%SxD)6s&zM=nZCb4M3w3QmQoT5LV`+YgRFIxNa#Wz)$L_r>0l~9HCyuRf zBhqYMf%wIOUQjOlu=&%o`6J&8!f^%3dZ_tJS;B|NlvR{X%{#_Vrl%wZr9#&0D$l@s z8MxQi%ZzHk>O!3~&ZgQZDem@$+^;JX3j-@Z_E5|U0rAl8eFcX8I_xWdc7AwPzBq`+ zVCJg-vj-V4R?N53t5f^@VE?Pac)q32)86sRhIb)5zkMGxD|JtA1Mz@P zq$Dk0Ll)*n$bXJC@1Yfv#z3aT>+-<^0XT*tGAijeismW1$X!P9W|G^>+T?Q((uL-0 zNmIVuQ2uP?_Z0$CQF54cQMau}!#>gTxN_-Aes_z`ADb7v=lz}UN38(O_=yeul;fyD z^Y~ypgkQDd=G}4q-!z|3*!MMAxi?UT+{~^ISl3=p*dV#vnb8$??{ib>1(iS&!SC1A za@VEufC%b!G+LHG?{imBY%jnwj0HN!sG-DYSIejzjE{{YIYIIjN2Vn9ruZ1K&=D~a zw58Y3rDiZ18JtJU#TN`imulob6-8HTPJw-MVTN-2>ZuS$PlLTtp1lI*+$5gddKlFY zj08*55L3(Cw75A+Tycgvg1+_*-IMYTs17RODa)N9#^m3H^y&;1=U6b=)AL_t5uQ}! 
zA78XbQyifr3nzQO`Qt*~o|_Nzj8cMEDbU%@LmRz|H&vOZo0(=+8GUs8KVTFM)_k{A z9e1LkRHCS%?H_+X>Dacx2eMK2^8Qbv25;5O->NZxaN=j;>Zj!V%N8z|L-XMwRXNiq zYE^8A0qQ~12ExiQv+hG_Pq4a~DGiBOQK`RFuS6@cvp6SJ)+L78L(OopG05NSe2}}y zh-*h)cl-qey(|(*Tnp@Q6#M&k9@TPr3DU9hIM@p)llU&*75s{LCAcp&D9%dheXZ`I zewo2@=BGo9XZnntk|+hcN_*wX>Pi`=s!AuPs^EJP)$?mF&iyhk6?kP;UZtZa(o_9A%WXW>LDmCJTR#Y0aNy>d9fU6R!a;ApW%7gR3K>WNw-et(lC(LuAGKP_KOE zKU-IClD47J(B2O3dj{)I()JDWnD*C82>uou}}polhk%(?Ds_N8_pscHtBl} zn0F;^Bm3(d;SPztQ|h*B3`1M<@Bi*@8$xWc^nH0LEwL!rLvg9zbPF{olSWdbk8;aT z$}3~=mrdsLX2!DtZA2|KZ#bVyed%#kOHHKH(I1*Qir^oKhh&YDZE=(Z>I5&_3K~I+ zP;>)!f9bttItxijSEV)a%%NPxN(hRI{0AcNzZAzt8s>Xn^Hlc1f|lV&zZc{?N>W-= zQLyu50TS2~Z-F}i`H(2l<1cfCVoJZZNmCBZ4WG%XpgdYpTKLSb=e^a|sE%+{-#fPF z)R_^AdIlthf2Syu_ZgCRe{bPKw|8Ze}U^?JQ4!U%O9KkER#0 z8WZXBvoKmznMt!ZDl#m%S70^1VOnPS#bae=eAB$rzUzgM^~9FtXPpsgy%7JW)^%Ph z-wLcJcOInv!7pro$k=_@hJ!K)+nl=I?J#D%;__o^&#~7>XN*WDzpYJqTBDdU>+L`xahVWQT<)rJPQC*LZa9~KV%C%5_|Q`7Oq zk01Z}knOHzEOl_E%6!x64_nw8JX*Q767PAnNBMW>K&HZhsC?*kzs)J>Cu=XJl@AUW zzgIlFb?N5nX4UM+#@&4xiW-DD?NNg_moG&fA&^E#_`)bW=6P`pAKdu*>-w^1 zCsNgfa+}52WRDM~=bQ3%tZtvSgZ+swsk79DZbbW%m(89II&!cA&lMOtqEpHy@IgyJ z=k`l6p%{m%J|;(&ef`{3H|jfA!{5K`Du(zxjsM`6%R{P_SLQ0O7zWm}Q;zKU1ahVxMrCzT z(Q+S)j-w3p`%bN=TL^W-*B(#7x!m~Ye2dMA(?zgUDnH?sSJo4U#hI=j>xZ*UU4oq6 zGtR(k-oH(XVOl)RH)ty^+YZb}V`vhrv;m=C$2NU6(Q%@d}Hi>8%oY zx0`!^yTgpvwNzLJPB3xpH;5F zy&P;5)2eZo`e2_PL77fK-^4Q5in)!g=J`GUFS-HNE*tfGaHH|$UkG)5M!`P%M0N~)uU?y;vz9EfO~z8bBt%n;t!0ih{bNVX4bryaO+ zTaHd0>gMZ6?{f-MC-2d46Tv)x=OYE=aM0yFoXJ?5MyY-;b0Mo}6!zsq#WO$&ZttKh zOZ>^Q{UymHa6}hI#~|D=MKrHmqtzl4IjEr8@-wlCD^iY8qjfxT-6{Go?)LK>5#|6D zr^~8s5!ZsTD&b|M%YPB6SL2ECfkZpzEH&JiXC+8SN}ctP0;PYcPPlsrKn?6PoV^jUvY9p#hUZ4~A}UU7*U$)NR7tNrk!WL!>0+dgh<0260$X|?GTzif{bP+{ zaL}Mq(ZcHei+Y99ywU-W zF*?-xvEX`3XOUinBbS?A`gY00(a@QF1Pvr8pAWW}ta!?o@}GFDGA+~nb4DlVrW)>6 zB>`#RJC-LL*Maz4Y{+gOk+8AAZ}fIUYNts~mm(OZbE%NuI;dzoD|`3saMV)j9SDpR z{>x}ufr!PrqGM%*CZ!Hc$7HXs zF~7j@f410er;Z=r=mYX`RNXhB-N=`gwuf@$rghn?-Ct=_OnoB+Iyy9vw!dx55;K}v zlDd$=()@99ZnR+D=*VE{c}ar~!`DDWim>$jO^!z_jAlFhYzuUpP7bYIe%GE~@`gY0 z&A&WN`506B{ufe*nUfj7g8%*Dr!hbSNDqv#FhB)#zSnanQX<_@Sa%8n!Hj7GC;EZ~ zLe4i}O1zB8+tF00v`83$X=5`g8>i1w;QP~Cn0_9yD1NekiOCI%*Sj(8eFNJ~&!zPu z3*t;ITBmnIwT*(-rm*qPZWIqAHM#|b?N_}IK@%%(evvJoVkM`( z_V{`DeXvxGqn=Y_b4U9r7LLS7>dvUwpum?1duW^L+%{zYL5}R_4R46!%!*o9xK8e- zUZwiCl{&wVs&g?2EKtuQ(pJDzj?ug}b2*(-*0ecF4!AEptaHKYzIq{kasa^j+-Y|! 
zI|`t^5&8Xz2dU_CCrqr?;+qc82I2+)-pF}?2B1QCVCbc})c|Tw5X;5uc zASp6zWLo_bfPHld0-}cep_0FlROyJ0CL-v?8*U`#(5nEbEffsIs)1p)FE054028_H z3(pul$>+xsCIA~LNR05-#X^`a3}w8iWxL8BD^kNzRFFyl@SSh^jsiLkc!mP6anT6X z1lUl*0%-4nfLRYz{!p1?JOdXKClC^;^cNi?-^&I7AyPowt^qK=^jRAgd5Hp`f%erE z1yCQ)G8TEGm_3-j5nQ;JM18wW{dxcf&PLd9u~V61y?m~9ZR6+fa}X7fnyErvHA2T9 z(6N6eSAz|rk01^IR?mSqlk0c_Za9CBJ0EC9 za#R*%H7kk7$T5RQWq?1CsGO`Cj)CS2>@z^bNdkcLopz8l03uR7@F6^oYs zZ^V6kLa2DsjFxHFBC63su->}rGLCJRP}u)EltXd1@D|ago*E5=R-xOM^TcalBPs~> zW|$YA4FLdADpYAzb(?^UdnNsRE5cswd{jpDD7f7n4Ge;@)KVSv0L~+}p02YUc8Z7A zwIk}QVd|=H9KrAfg`9&gg9(*(*c$*qWt7$ax{*r12{$s(&La_n=xymPtr;xUli8Rpc&d z#)j0ubL%)l3G{7PgqPZ7aR7}1paU=<5Ja7;3hz@v0a!Z{NJO)T1^`X!D>}Tqo3d*byzzDo=O~U zErO-yfI3+~oWUS4hn}W;TlSgu_-1`PgbLgnvKv&h%9c#-0rM3W~ znf2u=p(WMPkEF2FYSCJ5*z7C_no=|1Lx6MKs`!D*9_!=hZjtnC43UW8WYK?rqu*Pq zUcfFAj0Wn#g%*w4abp6J%P$q`nBM`gJAt}ZiM?%fe0kM10w@JQ7i+?N6F_6xDsL5m z0ay3BoRA*FsX=&Uy+I&0hZ&&JgmITDz;hKQdM>ac((1Uo?X(rdi)#rq4SxG0dk-SE_=s_)X&O~@p89F%ID01qSBp0*)V zFbHZG%L51v8jIY%PwRc3&r-;A4=N~zkg#AIP^T6lkNmxzV3lU5@?D)3QrR6*Y>^3J zCAS3l;F*C7kLpeSFfpnGo<#ZTO{nRvG~)k;R~Zqdoj_s5??3ao<>1kHLC5Dl@xW(&$n0Cwr{Hv=Q+ zog=TbVeT4Wkyw)XHlwPkft9X-nQDRta+1qF5~KhWZnlC&0ARlDs=?^4Ar^wdW^<=N zC_V5X;NUY6Dg`_Y1A#rwX^1cBQP`9ODQ-5-4MBSslFDtynn4klZ`|HEHGU#zdujB(V<=tTEoD!u(!g}Mp? z86M+gS*2^P`4%>7jD=xv?nI^QS&*Z%Yffk?xEp{d9?y(MmE=u&Ta2{};h-2SG`30- z3&;UfP%Pj~3!}pTP@vCl3g}M)j4vVp5&+?jY1AAR<_RDWB-^QJTipj`(EuVA`$&Ka zf&C(Y#Uer?V7frjTa+9Ypxy$`sdo zcuKI4!oF*4nDj*z=Hm_;6sSm2xusUC1vYSwW^W(@z;t}{d+D|EQc)rOh3<5U?u{?9 zv^8DEY_oNdbN$1Rd;NEm-!Uu5p3S~ywJ|DJeP;~Rio=-zJDf7sT@9AIX-?JFZL5pz zbE>DFSKROgKvFWKa%o)0&%#6ikHoUPh^C#gpu#G~r%t<1Svp~GfE0jqd5CyjRmH3T zLQf^gEI=`p;Q~O4z| z40PnWK|HbcC5frLSCZWS>xQ$P<`&+7@=H)fooWgw%*Hy5diOTl z&ZR+HyVV;Z3+@t)1xxx=^YyS^{IfP`E;2amsd|Kva^;=&Y;pLNe>z)T)}CP~_QShB zCG44|?lKW3!BV&3ZML&p3pE2j@8Jm^`QPM@XHNX+E%Oly!iZ}kg7-n+c>|$0=E0d( zxbqloa3}{A_bEiU&Ez2klwU$l_~sT)0dtE)D@uvR1ChbB)JmV2%Lt|Xx~4-B%VKi? zs~PrECceCErj~h$mS~SWTMnjkZY!7$PD1HkQe?%{!Tt5wLO^xO7PWu6-o7PSUxmYa zR~V0%!EPu!|Ngg0C~AU>byB0Zg`tUZ}>Cjloy0ztTbJz~FNCtb5gj(1f#O4qxUeVfRevDTj zn_iDHvIA*OCGyTNXIhnbkfP)N<=+hy)g+S{?Y0#y3eu+`sX4LUy%8v8>hs&v@mv-$ zI{woG!p%Y->s3e^I;aKtoxK~c_aRId#t;v%;~H!m;r%p&PyUoDYe!-B0J( zsEwpqO{5XG<>CWlai{qZ3nNyu{{&vAkGLpU<%Zl@HuI1!#GZo{u#enexd}VDY(KRA z;@{HU1#RtwO90?ZeBD*W(wprV=9aU(x}z8B&kB1i2=|Jd?2H6Rl{w!Jl!w#@ARr8O zZY$akuawB_33@h@6>__%x(Y5lwOmm7thdYOThw_!zV-&-Aqi)5th1dOs(QY_F$hy{ z`x!zHs~w2KJ%I2;9{B#71Sro~!>@wZ%h<26N2dRps$&j4#Sf{)SaMM5?(OlP#Gp@N z_|ClqD##ujct#6v;a}UyXf|IdHYYxWjzW1M1u8rhlvRFdzQ?Ffw7azNi^lb30cn@N^*+S%ueum$I z_kzcg3qk=e3osFt5tU7}9MJ$7du}jg30446ApmH?y{#j2)TupHr2~RfrnWQ~eyk|a zqTZbN*UA}8oBsDe1`Bd)p!lh5@%-lR{MkRZ=^YC-o-mhr;!%4r1V>}T1~JqkK>mOO zHI*k-(Yj$KB>l-$J5mAMh7E^;PCY6gooR)swC*-s%_Dq zOs9iMhJ;&fL2%})?&qgcKMVBOWjwY{rI)>@9ddt*+^}};{uY0>jf3{}{lWHFIn8lvw`)*ZjCILYifj}2jq!*e!Iye+T6~FjmHu0H&Sm)mg|Mp! 
z38i!1t&Ww{bUqhwHcEB+`XJreK{W5>`hwlND9flRlM@nZ>F9%)n%i;DDaz9 z`xPRNex-;Q4T@iV8<2|wafNA}4x-~$V}Hp#(|=P~kny;o;ZZ_(o1cqOrtg6>|H zqKD1MQS7j|5A8EzEWmu+96T-+v*ClK$C;mCkqcVi7GU2mP9q7`8tTX z)kQWY@}I9JwGyOzDd+R9rA#biK=E4aWyV6yN%Ji7W7ER925U!d0lQI&X3j#3Yb_Te zdDmJs_oP8WyJkX=8)4G8110Nt;ZF|) zX5miSi0#AXRQsS%3N#?Uf_ad6geHlrqPEUSQ!14$1gK#T?nZf)72YtL|43Oz2XCD!!o5!z(e&UN`VyEj4f1 z#S2BK@yR2fe7?_)&~EU51@_)E;s#Hne7-ovtKRT@GeR7m zd>1}obb@!@BS8P-kGkm_QVJ7BCSD4<{85D24Fd&4@1Bvuc302a>Nr?-|6aI0zwpP@)Dspi3!mu!*+~Z4QiTiX-^o6f{(>6s5oi3IdZ2 zOuP>+<)9x`8V5r(tYZY5uwlmhp$Z|qz)=Hi%7L1KJ6}W#EvgV97Ml{U8W0bIVq2CM z`31%>mdbfm%$m-e0zLKE0}5HxR<`=_f{gVe1|v}2H^!0;^!cL{RPYB~RM8)QWTPKB z#KI1H_#k|Ypc}&&!GCISMSL*reycN*0B7fm8sNhbBgkD95%@+C#v@)M_yQkZ;fz(X zvX!oUB`p7ADa%>XvX-{IB`$NR%UuEk6TAE+FoP+~VG?tgy(}g&lc~&Q4igh!Sjri( zM#ZS`3PKAT<$Ah-102Z5DU<-55^KOsOvZzoE8x)_UJ!#K;82H{z{HIHg#$nQCq6ZZ z!GMGy&K@vLfGGH5KhA~)CcI*rXlWoeALzWLFtni#eJDgDD$$8jw4xThC`L1?(T#Gn zqaOVzNJA>pk&?8eCOs)iEsD^r8T2V@Ob;07mQeNJBY*;2DfHr@vwhg~NI=X~KY-X2 z9^s=OHTd03vlx$8P=X1Hdn#0;D%Gh{wW?ON>I;^#f~#V6DKFqESi>sTu~roYWIZcd zsmlM*ucq~>Ssj5{+bY+&S~ae8z3WxiO4qmA6|a7!Y8(`qD5`lO8(z>}JSYLRzU-rD zt)pVq)RMHNHDzj%Z3;R@Mh(1J!+J@5&)U{C%2Lz-3pJR-XH)Ad=kb9tXS1Jb=~T)Q zaPcu_j4h_N*hR+N7PsngYOv7Yg?;qkbUo-56R-dcRZv5ByomzEEIb8u&ziJa~Zx9l<9^o5ZyY0=)wMFo;8ZDfa)4 zz`aF*uL$?CVOUXODgQmnYXw_36-%qdBY3eBpL$7X%wuWprMrTKPmj`K7JJk&A1DsLapmv z^Li@r2$rRN65w8ItJ1>?iWwnI?87L!2FC_UvX{Mzk8OT~2SUjMqg0baLK)*!WFmwFt^4)!})0=#Q~JKPVAao`0Si*p}) z-0MUOOXR)peh2Z(|4x;_3trmgP($GpulU6?zVUWSy5l1+`EEr$@|FLe{GruQ4$O1D z^NBjU<3aEFDqDikIaJx7ab6gJ^egc{66_Pz%m;Ts?P z-+$O0+0N0bk{X926TkV+e?H^kog->BWgSS(MmHXagKVv16PQ3(HYi~^=#zg=mv8*} z%a7YZ7Q`x@3>p}TBZy#O8DKo@9m5F87nUIy0*@Km;ib%i z9ukHeUd0G#;a1FnAWp?EKE*B82nQNQJTT&2WC{~RoFvxBGfGM_+R1}C!Y7nn3SAcD zP=hX16683=HD)8+)EOTn3;+TH7#KnAVMQet1|IGrR1p87A!1FgLk#VZWMfqcRyO5`l; z!_r9PSP+fT*oQFsBS>202w-GUDNV$o0~%1lfQaNbkYrR)L$6>$Jm{o9@MJdh$|l4^ zP~s0!3MDkiBVLH$7|a1(EaOvrgF+q+{)l3>-~%7X1r~HtDLMr@6u~B-1CR88FDN8N zZh|3D0?%LtD2gIR%1~2KLN>G|?tNug@<< z0Se#%6O;>ao+C9t613F7|4l_VexpC&0LZ`t3UpEtcw-(>sD+lxHd@VwqCtm#=vCOK zjq2x($|tO}q(3A>*`1aR9a%D9!3b2#WuE_LU-&58+)p=nCIH?6Yucq)5>i1PWI&>U zT0A6Ug(B|x!)$&e8ieFUW@JCWq*u;lO^zfQu%=VcpkLg9+hxTtv`kqBj7-v`Q}B$> z0Foor-*?8QY)+bELZ)3{r$0c0ZgLVn3dKw1#0?3#aN3jFeuYd z0v|Av>TCjP{Z9bFCa3y?r!EQJ7($R_!wNLTCT!&$;ZHqE3LmggfHGszG{p*_LnrcT zHZUlqjs>JD7&LY!HI&^5d;=A1mQ|?2WbS7QIV5U9D`h$bY5K#UYN)a@tFwlf5QL@T z_=80HMPE28CL~O;hATRZYcw_~2CDxiR7_}vUZ|I%0Bjd~*JEVQR~ZUrqs!RBO67Jz{oK&<9vMT^NO8YEc4R?~5k4qBYfbYZGK0_vQ6 zDUg-||1GOO=z?zQ;k!O6I>xLJIb?E5#ihcgKMX^H`2!z(DymfK5;f7E?rhKgtg1c* zt5O#@%|Z__60F`VofLtCFw$5E=<2|e-k^gJ2u*EbZBrDhKV)mn8kj#MShHfqkQQlE zI9m*4Uyl-LwPb6`GS?rW?YDy1E3jG#`2)h>;eYUfI91cmj=&4t+TN~!BlPC|z@f=z zD}tKEfEX&G!cAvN=CwA(Y@YvWs&>k%w(6_mkF17jt(GA^#DhDiff|VBRZv2s#DG7b zfUCkP7ECJs%*6=oPyZCH$#VUhgb`Pg6{<-xf)b7*Zj50VQZ}rC!P$#KDA)0FPj;LgE9F zqJv_YmjQw8&Y~l^2FzHL?cJ`Akr5#g?{fcenuPBQfXv^%z<-p2;i>{t7;iK>?js%U@xq}z@j)B}4$Dpj zC9FUU$dJyG>j?BfQ?c(B>k|&}Q{bS$gOafsSJ4-Tagi7o3cy1=^Z*JNkw5Tn72jHqZdKe&aScrH>hwkUj^9%VE;k8{Aqfp96K*m{ zvfu6=6I=4A(OV7FP7K_N5&v)b8ZQnE?^lg**){M9HwE~LuRoBleJrhC$ZkKVFZ<5$ zd*HAR$1eH~^ZGW@BSq3A_0t%?a-HA}n_h)kj0Jl7#R>%4K9E6E<1%2zW@|paoL5k&>%6$ z$Y>nLB^blyoV~yu!&MLHanNpYjv(_6;}1oLg*?%dqu!Gsy3iX_^C-o~eAI^nUyabM z4HmqBCclRtWrZ(H&Lo<2-sSVWg2g!740)aNrT}%*L zI!gsTG__GzwN+m=!#OTg#3NRBwO4=j!fEw9nkHD6wOOBaYVl=Su4QBO54vQc+WjNC z7%y7awO!x!1I6f!&cQPp4TZe`Hfm#ST8-wCHC`V!VkfqyBx|QID>M*oHY8YPHs)hS zWtjlm$-?aIEtsZimy0}x44VHIE=@*jL$fY*SL+}IF9GIj_){+_qdP$IFJXq zkPrVkkr%mil~=iyUpba%xt4D^mv_0Be>s?kxtNbR znU}elpE;VRxtgyzo42`}zd4-8xt!1Wlyie8TtY2W1D)@=@6E$JIK?G!ff5Y*pv!?N 
zIEA0{xuQ2-F_3{0Sb-^2LmDW76Uad%m_j7H!Jrd@KSY8RD1jWn!lIA*-@SvND?u7m zgQM>OF8G5y=(#@-!=w{JG@LpYz&fet`q~`=p__snD8U{y#U40;5^w<{xca2uIiX*I zt~dMF@wyU_figrpH~7P^6S@+3fgg*SLZK0HOe-}_algE#0yQ^-RxD8oI}dtT%_zS}!i@VmdedcY67H{iv< zzr(msd{)SVro%!SH~}$qI=c^gwHJdD9D*i9`=3{ZpwGKMsDsC2#U3cZ#(M*?D*-MH zx>9S!$vcJ0D?zniMa#c@%+I_M)I3#qgB<*MEhqt}Q@qkwg**^KHE;nLG{rA;ycPU8 zHvl^u0zbfls7w9jXa44Qel?5(EGPq1K)w=4KIN-E<~N1rbH3+; zzUY&_>7%|=D1#?>LN)M1Q{+Q2Xab%$#nET`IKaN_H$^lQe=#@(=+8qfT!Jhh|KIbu zC``H%C_@@pf#iQXq0529`vWg1!5_#%p=*LwB)dPn!=Ot%Az(W%q&&8pLeB%lB_#uu z+#A@>A0dKDc==Pe(9|y_fBaRnm_~^wfA0pqOSrJ%LxQIEC?O`0pg|=FQ4Ty*QO-g} z{=yKfRZ9t>iV6u*^cT|!L7f(f7BzYlX;P(2nKpI$6lzqdQ>j+9dKLd`R;^pPcJ=xd zY*?{l$(A*H7VTMO7zuhRsmbCySdlIf)uiMSqlI#mC1s_Ai$7CJ; z1nXT%V(%g;IRs8+zr%?aTWRbPGPg*ujQ$NgxbSAhj2%OkOxZH#gX0GBsuq+RI5~Ud zEe_0WAhCNmb;3f-3s=xuKb52zXIx2I>C>rKw|*UacJ14_clZ7se0Z!e_e#_$uNI>{ za{fdj=5JmaC)L;zER0m%KyDai`SYZN3|OEWN2GcxlFT1blz=O<{{kGaz@_q2Vm~GN zb5Ov62pq^je-KoV!GmxCup)W%sYstT#kGc)_^zPPwRY{uyPaS_IxnCnk4!kBwcB_5M}6w*i~ zos`l_Exi=esg`8&k8z@ulF3VWks~OAvP^2kfqW`PQGQ@i<})YC#1lj`4`Q`2S7Cif zRutul)2LJ@st8wqbd7b^_iD|IRVPlAgA8Q`+ruB)1nsC$odohJrn+J(##SX9&6M17 z%{>?0bk$w=(oLCE=9f=F9mtA9YB8l$5|dI1Pk-_mXpjGr(&&fRWb+JhrGJ$OxH*EG zv`8YKisJV@e*qTQ;a)wYRnLR;`O}IjxzMLGPc}U$Kk(;2x_|plGz72O=c5S{H z=bUxkndhB7F$L(LFBwOkamZs*4r2Zs$P-wCj$|o?oVBQ$nilq`;5*+N3}J+-1{mvy zD~cE?sZXrv>#H3Sd*iJ;Ua4BP_Nc~KLV_ZsrfB@R(iEIt@`t{G(%|M!a22k(=fo9X zobkpTe>|-){(wamP1YkM5@ozGMHYgb(CDgr&bCVQu1dcupL~kaXLO71QT?dYA)lT0 z+HJob_s6Rl$O%F;Va4f7Fxn%Ue4Q{Q_vDpdp85agoqztw-RB+26XTJG9{cRI-=6#K zy{CG5f~@%EKZ3&e9{u#yU!VQEG~heNp+E;d5P}hu-~=gHK?`0GgBjG|207S44}K7YArzs_`cX4}AVewSx`%-f zca#zyf z$~y>{Ln~epi&+d3tKjkxk0`|yeMtis#?g;3E=7q%0nQ(Op^&~nQHn&dn-=LL@e2t&mviAp@)5|_EmWo54M!&>yikFao} zoa7LRA!g1BT&Tq{6T$^pgds~fi9}^EIV-?;0wHP45Av3DiOSSs97}?QN-}30k|^OF z-WZ@jexXZuz7w7`+}$MXJr| za8p*mq{ti%@;6eMLNC1d&m{&^#dv-cq#>0c7o6Y(ibTv4DV0;4`uWdDm>M@aMZ~{V3j9uVTGnf z6|jL7toCR#HGc@RIqHzpiMYzlYW!)IXsQeu$Z%7C$bvk7h*R-$^42s^mWXk}NkM4j z4|xDZBmB_IL}WNv*S;3Ewu7BifMt=^8D+M%Jxc3xOO)HvFt))J?r@1)T;m=Wxye=T za+%v)=ROy@(UtCWsasv^UKhI+?8kQZ(OrCe7rgk0M?Av&-R|yVyC}@=dfD4vtn_0a z;2p0!(oqikZUev8(62S5(cl05S_8iU-iCb%Y~OU$x4`8PaDMmOU;pkGzXVQ)JH|_1 ze4y988P@P{`|;g)(D%OE;4gp&Y+wpo_`=)O4t(YNVGt`=#L^(Ki7mX34c{2YS?sWQ z-ErR$m-xiyRd1$PEZz_MSjIrM@Q`!dWG5qV$ML9djFoI&+eNv@3fAv{(=lFrF!{-0 z7BlqXE9G+Vx5_sDUWbpYWdN7M9Ty(+oax-S`1V-DZ>I2<5qM`e517JguJfP?eJSzQ zcgy7q4}E59z?sjL7WJsnvJWYXc)**!rFgZx zjV?2q)UlTJRPcdmSI7SY*6=iSI#3Ku1*uFZsk{CEaJ) z$1t1{k7Qtl*Z#OZQpQjDWcXtGNg01O_THUrlz$!P|2o6(0gGasC`BmhOGCsn4q5bt zAC!UT{2}y2N*r>65QYLO+Cfl=Vw#-76cP{#9swzMPbqL9__m__VDI>nqWP|3$k5>% zqA&fZBJ2M|Y%8E)1IJ?fo}vSpg6y_|DVC2bx}gM*f(Co!{g5IV=1=MZa9qFzT!N$@ zRF43k>HsZb7-j(te8CVXp(08F4bXu2{Gk-EfC?G#8>aBf<{%Dq&mZ<-4z>^tBn=(- z01c>tD3*W%jpFE-0s}4L4AZb8;(-#pAP#~r5wgJ$r~n$WVH11-3mm~8x?v8+up$6a z5DC#A@BtI>Pz{g52MzBVvLORM&?5Gs5;Jfes39BFtQ+>h62*`GETSK};S+64`p{tq zyCEL30UtiGDaxV06b>FNvFJjv4vUTzVKEG$VI45cA8v8__CW;E;T5$4AG(1WxNjJ< zp#=YX(F~!HD3&oBMi3qFp%NL77G+Q=_Q4nAO#JWx9jIX)Dq`{A0-*(_-~_Zl2mHYehF}e};0*ks5q3Zh*5DBsa0k|42?RkO z^k53|pd#Qw2WGM+_hARlU=3P8(Ae-O$_@_Ou?7rb7EqwOdH@T2p$CW$2Ufrlm_QYL z0R?*C6=EP6=)o6o;QRbRE4k7u`=JP!Ko^QY6(rFuj{+RK0W2#56T@&A2GSQyEgApf zp&P!h`H}%1hM~LSAqTS|8HNEGpur!$!56Y&9hT1*s394W0T`X4^0p2i!tuNOp&xtj z8=yhFYz!KFfig!h{$BALGBX=|0UDBl@v4#h&~F{M?=-J*F+mg88u2qNf-uoxHvM5V zzhO6*qT7@LA4W4VbFnciku&$Q6iIU(pkV~rPrTr<1P${QyWu!lQw9Bj9&A$~U2!kT zG8v+C1T7OOBy%(^vH8~FF%cp;yE7TSfiNv%AsI3$25>A)#489Vsyxyk>_7?hfQXPF z9&~^Vh#@8Y0YR%l8H#}c$p9C+Knnn;3s%w}B6LEv;1-ZT4*~%Rz|bFz(kTCIZXce( zDB_STN8l5DZwX(ZBA9?Gp|CN1;RT8g3tpijieM~l6h~iS7*GHqyr3-&FTIZR%%Tnz 
zD}n~=ayavn8p#s9!qFc}FbAvjJa3apn{OY?5gl%jD!?%z!Lh@>k1)SMy}Gj^{%{?- z0W`laFR5Ycjuc6)A?l!^7`cy4-BkR10ZlD}9u#v;^AtEo5Ff^JDLU{@R+nHgcBcWa8e~T1B-%A(<@1hf*Rm~74d-@CY3X@kp|xnEZuV$ z2+|+!6F;kib?Pq~st&pU)DG4l7i1+w(a9gqphA&=6fA-P3C9j9v{?T!bXfyb3m!oU z009t`$s$NJ6ZwtbEI|!W)FM<=6>6Xq%Jmxz0Szi*D*Zu4$&v{wf=G9?N8i;Sh!hj3 zU`LCTDB|J2`t=!}Vf-qSEHBj}R+UNjK~49vE-m6pMUVuybopTMOBJsh8Z!}5G5uoE z7vNz{6E7LM!9B@K4Y^NF-LPN(>tDx@O7VgFJ~m`4LSs4BI|Y?2C5>f)Q_}VUQLkcA zOYmkFGg8qp1LI*djV&G<(HXy?{jPK|&(r(rP%&vR9ZmLR^($qILQcO_Rh4!hr7sdy zFdl}%J%dppC-KJY&MaCBE5-pA+7=dA;TP0)DtPsyeANzeVF~~I0SRIO3D98{bl_N< zpy=>{0V|?eE8+~GRZyCh6p)n{NZ}f?wGAzT7Qn!AFV}LjzzCoK7{WCJ(E$apF&<_C z2k^lVMnFf~)febBUj2bbnUr<2vIg+M5k^2?nF7v^Vo666A)GY$R?`@RFEjmNVKvnv z;2{zhGyR+|V#yIqr$Wt!Hw=A&28|6J1T{)IGXuYCWc{IJog#OQ0#CP(`})ylE8==9 zB2X=2e1}0#b9QH~Vi>UPXo)v32{ZVlR(KQfJdc)3@wX!8mnb-uNhOUM_4g^l7i*nY zRS&gn>EZj(!547tQH9Y~!%Q7;!3ddR9BP3TYHKQLL9zcTf*)R_BHp25Naq*OHYt>W zBa=cOSb=Wg$!_shZwVKSXh`XH7wofj*@_As*)8rrWLhG8-bFEb}| zP2CqD{efi7mMI7m{hSdRFu4#Vc|5%{kR{EN(c$|*GYsRmDkOLqMzDT4lQq*9fm?Gj zK{1eRY&G#zP+|Ez8PgdblQj_nfv+NKvr!FK(_;UDvoUvYHYd~A&d-%`OdX1W72XIc zZ~=s+qGslS|286ps{-zn!iAk;hG|#{DojAeK__ITGUNajbmAUriVH459|Q##>OwH~ zs2}pVBKFxIc!3Kpi5rYeijxBRU=TL7f@8m|2CBI$)C?R=Q7T?=D~7Be1apRr0;A1> z;M6QS#p0v2!tqM_R+pj{G-oOJ!5fqz8i0kHEyAT?+Cbz%8I-|>{6Qj>p+m;uK^5yAqG15<;TmANG2Q_jiUF#% z=m3KNt&sxm@$pa%cupezhC`D9i|`oS=t_edx+lsP%26Q>yH zCV!NI7s#3yzG$Y$W-@+Z9Q0u$pW{Su0T#Fc=t>wDOc)$^0TzfsIOKvC)Ik4Pwq$|iU_m17?>TTOALM|z`{XPxp}6%kJp7ul zmGMYc5J?WNux-a13Q#HD0Z}F+pn*jmNW(D3VXTP-9Q*+ndgvX1V0y1OD=ST4q)gXJL~U?HZ7Lbog88ivrftD$0id%6=`!L_0umZ>Sex*{5T zyDQ=%$7UR=L2vwltI4~Y)7v7zIWGSy#)Uo48#sx+v7*8WP`|-RzvCnys^L{EBEbEj z7<%ZiB+@?-e8G2|$De{9Y9XvlG+7%A)A%KKQWvCe&Ofd z+b{&=jiP*P?9U>KK^7nbh41^C*A6pI94qj=7=W%?V4)UpVIDe|7ixj$fQN8~BeM%U z$J5!(KRu;2`=VjkSYSlGy&0vmqJ>N-DPX7mKpoaU9m%<(gNvaSGW=YW0T%@Qg|mnq ze!U`oUD*9W*nyqci~ZP#J=y<(z1Z0y*p9e*`Iyb zxgFW99ovyz+{4}5yB*rs9o^X-*vTE-nLXXjecs#M*y~;1-QC&Y{oKpl-u*q_@15S= zJ>H$29jtvB3ZB}x9p4Qe-xa>!_x;)zzTqi;;06BQ;~nCa-MREZ7COXq*o7aunT|Zb z@aF-sD^U5=wUwTSH9+Ne&ut1=X?I=gMQ_Q-sxQ)>X+W+YkupSzUaLk>A~IurXJ^Y ze&u=I=T{!+g`VtL-t7NPKI+3B?76<|kKXRlUhS)%?M>e8vtI6rp6>ZR=KX%@dEW3n zz#p`}?_vJw0bl8<-sH_;8Ls~AP5$xaUh?Z+?+^d$D}U`ZpYX@t@Fzd=rGD|}-ti$n z@joB&t)BHQ-vcn;^jE&~$6oez|G6YP)|tZOSpW~t!3S6X`F-FVDgpVCzY@-&`B~r! z&H)di-xZi4`iuVxqMsR+J6qyPDv-v{8}9A;n!d_WP-Aqso|{-fX=I063Q zpA(F~{?T6%>fZ-;ff)e8hd>{S&cPYzgBd4;1)~g%a&F;3PUjH9IX7bl!E+7;LX;?w zqQ!HH5bk2e&>;Upg(*Aw_=r&Cz>66-x`a8g;!KhzQ4TD4a3IT{93MuEsZ$`$84VF0 z&501Fz>_0O3f+}5sKBQJ3r3Atv#Cd}K4dE8Iglz*tqDQeH0su@kELR_E@XkX(mA|% zeVnVrmxU{4fv=py6PTH+zY<;ATfdGy zJM+uByLHcV1VK2Bg;Jk z9*AIq3NHWXnS1;_2w{YWUFJavWsuXH4=fnu0W-moua9}uHMip%+VT&%l z=$wBH&Pd~eyo^(0jymqhBWk?C2xO4bkyr*EGkBmEb2)4XM~5GV_*;`j`rtts)TyWl zB0_G-rHfaT(PNnE@snUQuZf9fnrb5WW0!8e`5TG5oa0KBbox*N3wjwz1Pjb4l0ykI z%(*0o9wNp99^`;1VsA~(NzM!*V2G%q=j`$r5p$k{$P6r8fz3JCAXWwoGdM;Br!ycn z!+WE$KtvR&!bxkb&8>;6Zd@2B;5fYG3T&{#?&g85#s*0u9C+m5fmcl7bxs)`VuL?LY@j7I6frdN1yjJMz-iBY10lB0+RKAf9syjPri>bCM`TW^T? 
z9t&{5zWG4Jum-2LhZB256W~3vL!on*v{)ABmTha0Q0gRB}ZLAw*klZ^s}) zga@hW%S@BXhB&TokM?%1FfNmLg|}KB6RHoBR^$OTM3k^-2>CKuG7CAJL&T)y2#Rsk zaLPz=)tbTb$AP%KV|CYFt64^)QjZ-UiLRh{4mmEE$SldB3S%@PC9I4zGv|I;h9WUL zS~5l+kaKq#dDp3>Fv5@%D26FxgJ;@Mvy9QwJ}(M2*_Ojd_16*;VoGM}5Tltsp2+`V zo-);Fj(X|=vcr1ou3rv2c5?p4jLN&P@VB&~a^e9Vr^tH_%e=z@n7B~NTqPbUUo?F2 zE`|{C-Mn2&cQFw4_?Bes6x$44(VR|)xFrrx`%KX7Fc;ozHe*O0A zwhSUJP{@D(evJ=6=gjUmWe7nCsI&{+rHxspBASK_MUL;O3>$mn&}CRqhEPTDGDHZ~ zDVQ~bQ>5(*A(+-NMD;yWo$7(y;-1h#(5LVPaE9|K-~HCK2QFN~AJt%AX5MfQPjv1( z4%5dCkBG$7WUGIHX`;nw2u0>lkT_E;4mm>AoGStci@wp~Z@6e2MC8m`vx@(s8Lh(% zIwTQTWa6O@{qYBNXyQ!W=|?T(vBo^=ku%Gn5DVzgif{;Wkc14!6VHf9Gwz~<9teR5 z6A8)Z)G35}RFgaiK?Y^Of)~U%#3796onmmKlB7&x$~?e>Ko)Y9f;?m-UnxFpoZ=K< zgrzNu!#^;UGEJA5hj_dJx?Kv>e5aHIE01Z&V!a1Wi^9e+jN-O=m(Aoa8L0H^phrbgHwO++3$Nf8$PRqVt+tVur>f!9jKmbC2_=MH}}C zP^+bk1*$~mDtQnNd|qLhT=Ec=RHjgd))QzD4Ut5JqI~fS#rwo}iy;^r4S`&gLGxvdu4WB-5Zq?34yADn2imBp?BX z1>rD86^xNkKPDAMOKs{?ql#1^Rn@6bJ*o%vq*c6F6{_Lj!3D)eRk3o_t6&Y%R6?<`0CxjXY|RSI_9}PNh_e^;Y-KMi+7^-atEL4lW*0iz&KB~tmtlrtUbI?*5|XyB z)$D$9+gh`B=(e#fEh}4bnPnvQGp102b76rRSO~T@_kb8~T(ke&(EXwobWD>zw%gr2 z?$4N$ovbked9F|Swmz=t7(RJGURItr9O;EEf7(lq_r`aX^R4fC>4RVWBD23=weNlc zjLQA4QWA8aue|;X;47Fz!16_Jfgvm(1jE;$40iBJAJpCnCz!(?a6xuC^9xuw!MTV% zjdV9-3OQ(kSG*v^9wOlnSo}g0Z}5aZ2*Hg%{NWxyCPaDs@#7#5IWwl%!yf!(JTEMWuruJ(Qn4+d0kp5j2_MJZ+MiN1e5`%J250-T2%WaKwFB<2 ziaXoeHg~oE67F%6n*|>=aD38@ZFWc7-SWZrx}SYodV?&_&++%O@o8d6^MeaYfCVdf zU78nXCJb`0H5vTD1yA@x8KWk|8+cd?SQo<|n!q(9aO@9}ll%|)P{M0Gfos&v@5Io( zhdlC8jbdM89?_`Bj%w}?aJ1MTJMVe@*l`(oXS@FjXFYdy+UtQ+pjp96XL^C1P93OA zeZxv8q}98v9j#Bj>t1hq*w3|fs*fA0xO*^s#LjiHf1TRdfxFt-9gentxbA2lJG+w$ z>E>QsrxC}r6*y-KQ|Lnp!mdTfHK7?;X9nh(Ir*!VFnRbnAv2MPIciLo-CFztkKnMW zKZ?+E`8kd7tzYRPV}dr?dm`O>DUK9ySYy%@0XwKX2t7r>#^dI zt488*+=D;(!7whc!W&Mo0;wT?d6`#vC@24U{s42Z#to9@d8lzmkj6tm6%G2}4ZhSP z+`tXBAZ%17ffT4Z(qewvCrEBaC7&`R=Wq;{Q3W7KSx_~CXw@8zae^L@f-1;gELdPA z7=trNgS%yeBp4kqh=MyfTasW*8e(tFVNWKgZbGPSJ?MhW0faKhgxW@g;b4Tv=Nn0Q zg84>v_~d?x6$wg!X{iAVO5g-FM+R1)FqvkC%|&V&7k~z6a%QLx1xR^>Flo2O8vWD{ zC*}>+Pz?9*4(}ij)o?mu!Vk5e4}(~UhiE?g!)H^tV8Z1mNT@MakWi91iHT;3mw|~N zsC}UIRaj>n_T-6~sELFmij~M5rWpSssF;d_^kx6U98f2P&NhXsNQ%Ybik}#ZgfxrJ zVT&nyFb#fW06pn>7j^%iS$VVaSxQ>I=j^>yg=!iCbE@;Hx8{-jkda7s$Cz--^#@9D2&&;+9|@8nDUu^ek|i0Eu_0v+ zSR4J24=w4EFA0+^Sq-&7d&`w{0eMV*)EEaT7|Xx{35k=0q?5z}7_Q)xKpB*+bU`~Q z97TDQ7KxKX$&*I;lTf*iZg&4xzGaGAcY8@Gl}*``Rhg7cm`v(599xNIU8#aL6j0G% zhIo`3CMlP5NtYiP8w6rq=JO6U2^-p&8ot5{fhm}SNtnD)3#T>bjSxpM zH#wG6XjUv>eIpW&lPO=IWtrpG8ybU3n#o_Bd6}P?nWDK{r0JOxl8&g!j+A%~-exxx zqLyPxTb(IUpovhfNsq88o9A*FwMm0)2~cUE1ORA0>cCj2qYui!3ap@*kvDXrp$^k2 z9??(@eyJJ!zzW6SomA$X#o$Z#@C&kVi1(ljvcL=apbp+y3yxSdl8Ks?6-an-1$bav z=?IDeNuPQ5Dfn4CBgX%msac=@nV$kWpdly;1$v(c8g~gQkj7-6c;E^-XqA)MpA+h! 
z`^lelAff+Rp(uEvYl#_&g$t7gnW+PMsWT3+09~VT57rqP+UXrMx}7hY88^aHnL(5M za3lWk3!{{m{_uxhBcHtKR1|`VbTFc>nWU%4q!21!R`-fgDy6eUrAykRPWojj$e~I3 zq*c15^Vy}a_>06TO!~lZq;o@CgF4l~3-th>o>7R!&|>3I49w^a#h?rZ!VS^DjQ&uF zpvMjO0Eo~S9{hl#{=f^|s1J9_FtR{sW3*0ach&2vdBTD{|fc5YW1~Lx` zs2RG-tBA!5uy764V0z-AsGGqH)u4L*(4MbW9^=3Yb4saJW2vbsuH#Cs=9ZRBl6^8b=M#K=P5fh`)$r-RN60 zvZ=!l1v39J?;s1@NE$!&sEDNuXJQ~eN2E4YvzfuQwScquiXJ@6s9LMCrw0zW%0rG= zv=bAqO^dgAtG9d0w_NtLdb9^h@J4Gy9ungXA%`03;G&1?5B;!RvhW%+X_&<(53C>y zzNEOjnzjdP9&F1C&k3FW8kl4P3!?W66kD$nbGLoVx~=QFuZyvN+at9q8_J*rR=|OI zR1NVnEYa`_v|Ad4N2As?bN`{J{D8cl!IGPC5C4R`+>p8vv%0WLz13^I*Bg|wyCd%q z2H(L~a9Bx=IxNKSuHqqj#9+1m;kduRzJJNSzfiJvd$ia~zx8Xs_p5!{nJYu&F%OdJz?uWW6s*E4%)(4-!8O7KK*b8+OBo#iokHGhzi{@C%VZ z4AD?nw802UuzqK$DXXqyX?uJOe5-W8Ck(09~>`2x**SywK4vS`2%@2*7L_Rh(DDoFMHx(9n#~GfmSG ztkAWg1aDvkR*(jszzt472)v*K^U$vD;|cO`2!YjH`Ea;O&_+pJaY~>M)o}lYkszz4 zQ4OOM)8Qcx53|*K9L+U-y>;*a4`2qzR0w4)1*}v74b1poj5 zKX6QDz}YFlN&)}?0#FBp#0jJ=0GXZ8C7K|cCkNFKYGlwk^Dxw!aMhV%&;H;9z--vM zoz1=d2WgOarqI}*fw-2<84_C?fT-MC%-Yzy*`58_q%GR3MA`r_*Msx{oW0u7ZM}9q z8?Y_gmvC$IV4Nb{!TyjB{IJ`-jbu&@+?mnatiS~jQw>=R8r-nJ4lMr<(NGMQ<_&O} z52N(o0e*k#;A_@3)ym*=;1IOviO-&q!_R#%1r6R!JKdhmN@if)qwU?xAV}Jf-QS(z zuq)oOLEfPj4IJzVR`7C8Amih}2*K^!X1GH&PT#=%2mMC|tSlP&;KA9InDYS43Z5BR zonzx*Y z;;wxlbqVKni5kCf%#dc~+*k|13=2kd=bFLXnW3wycSrl`4Guobq2Vyc-Q{TV~F8t+gGdA#Vk(&(GMv5>w>9!};U?%lh94cV>fzTUT;PB8@H!2DocumG2} zAno@T?RbxV8Hu(#{L((B5OvA|wP!d~WtgzTN|;RE2^=FYdwPBEjW zk}--4wQ!AB{A#t3@7GuivH%NQ)1v-953>#qwtgBbI__#x?(%-F=uSxL4&oF*+6wRh z0Du4rkn9kTxAZ06ae+@p6QlO`h}GGx1X-8 ze=zR=?zG>S4XFGHV*A8DmYXdA(@*`{4fQ%t{nIZ2;a~s&@BypT0Mifs8EgCp0}Eu| z{LH&+>0kMnJ^t^H_T_IdSjPUVQQ7rx_U;e=`|tnp@;@v9A@7$zf&~p8M3_+FLWT_; zK7<%i;zWuJs}-x5aiO|u8a;jl8B(M~4<5puM43|MN|r5MzJwW5=1iJ3ZQjJ0Q|C^e zJ$?QJ8dT^|qB%>BM4D8oL0S5iK7|@pszhSkrZSvY>sQCCUA=zwSF-3>vSrPlMVnUb zTDEQ7?mP-sZq#bt=HA7dw_v|o@AB4is#ow}hhpCzMx0pjV#bXfKQ`)t@Z^d2cu~fj z`QtrV_wMqOrCGFI!;np%Mx9#qYSxG2j)qOJR=(J^Z4-P~tKVFC81>2H=9i#4Zk}%+ z*I2ssa^}sQKZnlvHFC<_tWw85nH#aW_hA1O(>Kmvt%BqD4a}atq4MbM-M@z)Uw*jh z>hs5${PY%gi#}IU}5FOuZZymBW3u3MH%j5q>jWCha|E{BUM`w z$ffiN?W<^nQ&Kd2s`)Xm5Tis+$RoGpvP&!C#AH~A@_tdBObj{ zN15^r`YqG!=DEzEI2Ah8AyN-YRn-5eB(*eFS!Z<*Q&%S{21==_;f6MD1idF38*kN1 z9%Ym%bDnYDNz)&G)WIbm+JcRSHh;i*N11z+d52nSsg0&uXT4D-ntPs&cG_^Q^~YFb z{ZZ!~aB?NqBUow0H(z}rv$fub#^JVArN)uPQ-9DS=$>Kyl}jI341^^-SbhNqmV3bA z1r}n6IaeNB3@xS>VtU-sV?zryHe!Ex$(0v*Gv2u4j|p-Z;wiry2Ns4ax_4ine+F7Y z$#%|Y9&ca~)GLJjQJ84F<|U{YaK_QipJMD<)gUsX9*5j|Fgn;!g5-f^9)I3xk{G|T zE@d@IqMn1sJ8%EJb?UcngsNesR8AlEXUA%OG28F2s`=9&VF8vcQML(k zyO?{t@#Y?|-DVn=ME$wOV3+-wJZyp(CrDUc=LS&mh3?)vcG+i7sc+W{x&~~y?)m)o zuj;9VAAV}72lAYq%139dLB2fm%}=f+L}>j1JkjJ2avONziC^6xx_wUwcG|}$|7R4P zWmFV@7srQfmRdSQSh^AEZlslz?hp}`Ze>|wX%|Te1xW#6=@jWk;SUmm3P?%0bg4W% zujb91IcLuK-Fs*5{oe0qq5C814}V5viFU-NXt(upw@9rPhtB#7UvJ4W)+-4sk|o)T zZnFEV+w%PdZ~hW3T24YL!hV>(QR}l%M&6$=*+_KsvffDbEdCvc*FdNQa=oO^sfcDa z_=FX1QD;~oi{dxXQC8ffcAHiYL8fw)Th?|bb!d2ZUV%Btql$YN7$)O_Lkm$9J;^Z- zT}{cHR@8#auZe}f0{P6|KIRS96b-CZ(;Zt$Pv>|)^|+Ghv595t63?yB5BHe+&lYKE z?^8%6q$d4FWvUGrC*7|0v2U|Rc+6#Jsk^%;+H1OU7J~2a%ow>YF2jYq)x@3!m$CeI zgsN0TO5~3hc^$~kQ;5Xtl6+Z7{Wk>XC3~CFX$EGT{>iO5P}`tB<0~dg#N4K9)wdrR zWMcD<6yc%K!fO8KwX_MpN%~T^HRK-qu!M!$YPms#>woNh!U!QGXJf0tjyB2_aeJEE z!1q%Og3Uwg)|b}18x1{RP5uoHg1}%@Jm1Tlj+CpBgIe9F;eE!F243l``KHf};%`h1 z5NRlud$FQxdH#+MGUI1Eozk<=)M7&l@Xx+-{T9 zB|X@8$KII&zXp(&mND$iTzNV?2Z`UQzf`e1;oz1;)21eKktE?&b#_id>`+E^MkUr_gXMt_z12w`d{aHd|A z)YD0Vtf(q!rU9l~2}fU8&4k{k;c*&#Xmw?l%sVneNOFthwr;oSZkqz9H@=VayxqIR z!a&VqCGs?5VJZRH*ur_R>?aXsOGQNIkM=*gl}v56p(mqZe)PnRxpGF(R;GPJ>Fw<;wa(x3J}eSX~T@b6^y;}z5!166bcG0t_t?|5VBI~-v$bKO+2 
z-q(3x8K$pJ#N@o7T`N?LL81Xo^}`Qdx9;%WLTU_p!d5sQI3Dm@xE z(V+QJ+^njJy53QfUqId zC=Zg>nA3|P&(;s$fR#3d7{7vXfqu**7{n5roCdPdztmxa#67Qf>J2QQTw@p6f-UBL zdncem|M4xV;;&x{?4>?i|Mj#U%;C-}ZLHK)l58co)C+I=SF3z@6pZbk^`vMbd{*-^ zlo%@=kCj&QCiQZQ%9#(A3ux4M{K)E2*g{7NUG<0EUv^P$Ux%wt8aI5sJjran4g8a8 zbNIXG(VkGn)g5F1uIL5(jE&?@rp)_aB3Un&oEA4G#9O*w?ne39aBi@jc9npAJX_RJ9auOR=_mv?39kVAeop8}Fs+tOlyf z-Js#n>gMXKKC-ZVFL>y{XXT?In`Z|ePcQ4JJ|2$T_;(P+X#P;;ATcy}Kyh;zr}*92 zW@K}R?O?9VyDv?0;#B_7^GeZ)<;~WhMCW$GmHhJ4-{;O`9}k~=R&v5u`+sV_aoS(= zq1@w=+HGk?9wx3MNyB*KeL5-Y)wS>Oc9F{dm<0R&N9};$(E7_GxPw@XOoT-b+toTe z_n*qs)4zjMMaz^XTxe@f)i#Q@1+uRTAB%(({({L)4kfSUVX!}v@M6UcF6A-H&hZ=M z7*o|cjV203Rq}m~7Z(V&{nVaUIPAl`1#jxn5L{4KYUYJ1BuSAfTJGqP>S-@IB3<=o zkt$0PGU-r_+4a_@ODi>34^ukra{zAT2wfRLeBVz9n^P_(#Eg|hDValq(nhl18dTO(TUadnqRy9AQUI0|7waSp zt8c|QWLHER!>pE|&x~+RRnYy>{v>?gj*-^=BnorIKF(sT3L&sg)qqc5ik;%SVt%dr z%l!6CIQevnL&w3W+5|OUYF`7StwI0E+Z3;kes6!xFo@2F7H+pQma@4bW#^m;<5wX} z_^yJo)`GIh!dHKYv=MmdyF*+kxlV+O@Z&0q@Wml~)eXtpC6Z1hU&45JLte)1Q?97K zz>(tviWTsV{^z9U0dA`|;<6p8#`N1P6SE%~BRYZM2nYIUTw zc(je1rN)abdy93;IQBVz>`fM&$?cTVKM9SJh{ReJ@6Gn9czsVvEowcaa>Hor1a723 z-R#|%KV8OHRoqy|*svmHM_*ozC;NWai)xcs9(btLS&Ve5;9z({90YwI^&)Q>YVSc? z+^q?u>SL0k#k@hq*>Ik-zA@s*c04@E46)NoM&D*v!iF&!P z@;OtnUn=i&RC#g-Zb>s?%dC54CtBZ{-JhBZUKypbYpBcfJpQn)*XE zw-BV;ax6Fm7bwijpo9YJvb!uvDitBeT0WUotTboi8b9t8?V7aj4V%+ zV8Q(rSfv|bTK7u#r=kU>J?AK%%)MCwQjYzRF-}>YKuL-b)01-eK{4X)mEp~hZ1|J} z9bD_4vatEP7K^V3*H2;k-Wp5(So<*VWhl@!b)I08Uvzz2nNGLLtCxZhdM1jQDvD~S zWB5ItC< zJP7l-Yj=5((@FxWObymC4XYde|<8;i+;{If?0Hs+S=bTcmtde}6Xi1Sll6WLKeT_dD-> z;cA{(Ne@~Q-~YnTz&tJUN6#Fvjt&oe9PVKp-oEDPoB0GhMTa(KB%qAGA^Wr5@2?I!M`HJ zVfr~BGnzDYH(Y-`9SiK*#<6SNv#yIVRzHG zM1rYHBkrzI5_3kF+o0~IK&i<75H8Wq zG)mU3yleRf=D~nKnm`B*WyU5dB(Iu(C3-JT;L2wyX*GkUl$rNrEl*llH={I_xRjwb zQ|XZti)#IOW?9_g%2enPy9sC1LRN#5Q6q<5$8 z>`vd`{o=km6SzAYyE~V;J72o{^_qM=u)8?3`)z&q`^oMP*xnMu-ZJmriuB&kJA145 z_txC^)&uu8V)r&P_kNY`Z8hy}5A5yC?CoBy@9mxJ{f7M}GW_1>{e2+)`|!^1qx-*) z-G841{yvTUeU|zAy!7`))9=fH-+yL)|6Tw6@8tItj0i9iN%)8$86sGl2(chS9}r z3?3@Z9x81dDxV&zz>icJkJR{%)URZYG_;R2EsnGv9Nh^z(#9R>WF6_29qBb6=?@+m z%pMtT9Nj%Vx(7ctVmvnHJ2sIyHq|~hvp6;OM;WIS==J8_mdanU|;wK#EmaN-_x@&I?@k#+L0?8LMA;?QB&v>57 zcm7i5JWcyN-Qqms!FguTc~%7OJUi?BRoQt?^Lg&zdEV@K{>J(1)AKj*ivq@rLcWW) zG8aYK7sVDAB@Zr2gD%Q&7v)(O@5(MJnlCB`FRErQ-fvt~pI+3!FA0p7wS1R#GMDw* zmkkz|jSnuHf-akJmn~VBt!0;O&6n+immRa0A2u#KPcJ{h|8xyrU;u#Q974(oTmiz@ z;{gB)nE>IguLwAtoQ#Z;3PnvtNrR%Ir=?+LpufSy$aaH;gN;?49nQnaF2KVjBFHa! 
zOITh?Tt!}5LrGpqMbkxvS64&X;Er1Q9qMdtl#af!nW4@tlNUy&RyL-Fl9s7KR$Bem z*Nu()3U=uY4h-)dZwxxJIJkH|cCj0BVbgNWdGE%pE88F&C>&etm0wKwQ5^ohBmh_HlU(YPQQCJ{ z>U#}=&MEI|DbN4@4!is=DycH4sS1ep&iNT>5mpJUzQI@%887ub(putJ526>z8XYo9pW*>kEI@=f7{vpKg9%`?bIO z>-(Qy-&VH1oo+3k@BCcfUH+)b{|7h>MX{wv@wDXR{lpgRgdD`-1JDD1_dxb*ULYTx!JGD&zY4%HS7zCddvu8Qx$fjP45?)y?>76>o-?_YY#r$`Zn5J^W!51OvYw# zZ8gywM=NAM)>8X(5YMIm+Mu;=Z8Tfr(f6^|`i;*88i{O%Z4JMs%1tZn$J-jWX9+H& zuMOL;D^rW#_V@Aj=HEZMqR80qcC_sO97+>%_|(yQxG_ioTL1;X_?IvcWQ%45g0~#4 zM!B@KZlgbP-?;I`1@NhLiVXt8IrNZ=5F}XN_SdgrJ|KXaigy`}rN{T>O zLs>dH4N#ieKU{y+Kd!?SZT9xpdTz_JTK8C<2GWiE;I_6?>gV9ny>Z%q^lH1ef;;t`P$A)%=r#mE6V?A zr)V#=Zfz$GH=^a=Sr3-{Y>AdsU>?ykD0#bActRQJCp;mUsv729^b47uG!o~ZLtd>P zw{uTAl|S)?>XGK#5h3r*i@o_jc5g@I$shN`Nl=Y1)T-(ubP`lm5N9zeNr5#;3bB~06jfA{p^@r_r_M0~ zr1iDMr`_-31~cb05aK7xY3gUl-53f|OBl zNo=``&J2I|Nm1mYPd&M(>i7{@tECFR&mC@}%d!e{t%8{iRc=2#?m(65bgqBzxoRuGnIvMQgOWISpiYYn_t>az%9$d+=t>m=^wbUmG>FM*(t< z!9l>+Tr%7zyz z!?a|KF@sFGzo_I4U*KupUaxC7b<~26s+LFw@kZ#nZPKCCZAUf)zDyG+MCZH zjMQxS>2yBju7D=2Pb0hSNpy@0*B((^q*^myExq*ZWy)Huwr-KBI(S0yGLrcuvXVyU zNvNAvrSFHR6dm;28<%t_r=LEgvJGP#fB8|9XhZ>rq$fLvGY2AkQbvnDv$D{zY8lCg zzba3plj4tL-kOoaE(uaayo{{sB17-LM%MFmVLMwEgbaAVSv)|d%-=$!ZD6x0Q+nF% zkLy&RFbj0Ajbh;~AbFt!tPAY~x8-+y zILKpJ&MmP@$#qmfrxtpWdM{foxutGStwG)u$;i#d$EAa(=y8o%>cvgQyiJ3PuVKwX zTy2!&@=x~H5rZR`#LMZzZi9(n&9<`X14uqQ(YLR7ktfW zej??>ILEHwf&S&!IrkH>rt#e;RK0QYl+7)JhI_ubmy4Tso7-l`_uP*)hUn+v9TYTc zfeSYZ+>H*j-WH1{baZ!6-ekTriA@Q8wEB(VRnXnFabgrYkCXiA*#|zhjKwD_{w%Mw zKJsxs#bPi$p-PE@blOnw2d)V4mDG^fW)8~duWSk!DWR6ePu+~vS*QElYTRbrtVpos z?8HPd_m%r#xivP27I0{U;X4a*m%%ria8|*1809jg&ReB^wE%z4_eCX)jW2VYJXZTG?An_M5jI7JUb_?{f(Pm8D6#<0*KiovkNj04&rxKuHvh5JfAeXoMt1qiDA1CM^n{ZQiKwpzqaBr{(BObH1@Srg zLtj!%(Ji(?k*))0Z@vk<+2Y9Sn{v-c~dXWnVrS}SzY=(EKQ)M+kmb2kfW?<5Ix4JVtuNtp|qtNA2 zD5NkEi1S;nx1!|HuRBEYU_eP-Y7+_*^87_bQ(&>IbUYJAGG#8do!&~C57Yml_;WEr zCqp73MdE0@-d%Mn9s*dyD*Xxps^G+b4@z**5PGozN8lC^=*qXPkVRlQ5~6*3 zO!))qg_(S_vULUOCpgvSGC?F%6B4!%x6B3&WLB%iL22f{u1u&Jo?KQT;-xY@FOMi% zAv7e9r(4L*S4N$bV%67VH~Lf{vkoW$E9CS@P>3TX zttX`vJvz&g62{4Leuuhvf^$ZfTHNu?GoK>9N%ET)g~KH1b?;)GoMOGUVuNeFa%geZ z5@q{2Geb}WjWJk3o?1hX(L9vqg?90*Q}I3dQdjFzZKN!74zuFmTNOrXOJq=pBTQ36 zHkAl{iWZ7PNtyu^^90gP40L^p!W1X)?4ry$nkvh&%ukT<(bbk8ijmX|6}gEIWc?|Y zDkyZ@EXlf&DhUIvga>w_108c%-K{H1D~kKFO0%hh8TL!%c)(aOG9L6(1q`&40B$t} zbH0*#-3qR8R6ANClfw9XxTqYSpqhF`rcof}*Ihx^jnELIXr17fLMsrBKv??sQxzfu zjgWUo&?fn+w-yR35)}i~q#~Y_U4-gu9riq)qRhJDQ&%-Hw7Bmzubx{*t z0dDRNy>F4Kx=0!>L{n5H#QYW>Cum(yh#wT8X?KZ{s`Vjhn8(N5nw3&kKn-I!@=aff z0(LK5oBY8d+ZYvt_S=4zI3XKW5i*e8MYUvN@sW6`Hlbd3i&DR!B26u^X_KtW^r^Ti zRq@2L53<7dzVL6Vs;UP_)cvfGi)u92ugcjbQ=F2qy<~_DOJtHmMWmtpJfoRnCEnRc zXv4Hzrtd5nrTN=5_xd#Jv{8uPKx(&9I059AMCd;kSQi<*H>~&_fph^NABT`nIa0nO zLbU<7?nSdcOHp(HGfjv%N&(g0EG2mr6m-t7s9*{n$6O<;1E!){O&83OHlV1fP6r}# z$Wu7af^i|<C9&mWT>Di&D8CsB-dANl+@%IkU>yCReq6#-qI_vRx+!nHp3)g;JV8 z!g&P2aRuFTu>mjg|Tm@Q||B?Sh%w zI)>^m-o<*qXI~oi>^zIVUeHAWWyV=`?~WT}vWGJ12A>yYN=dP)b~nt8GL8u4xj?R? 
[GIT binary patch data (base85-encoded), not human-readable]

~WWsewU4m}G5Q_vc zbOoi1{|cNS2hu=skHIgnJ?K?Z=#f6Mi0#Z{D!49&vu1P?gHc-1)g{#^S$uu@P%r_2PBlxy4G?cW|5 zt2gfCp5yX<=)T?K>5l7+{_DT};{)*D5-+&)J~W=klHH&MiIxUjWNCy!@+Qwka$trc zh6?zn1;*e8*1!p#!0?fQs1J`NZA^X?D(4jy`z63kl*=PI{D&I`9%)^86XZBVBC$(`Kb@GgdZ@Y zZ2FvGw!2>xka$*t}VVEo4~4#o}p+W`G>RQti-vA7Q~%Rtb+-x$D< zx7vRnz)$_LN#7ao+Y&(e@SgF!?);iG{o${!*8eZV>iwO8h*CZO>;eAn-)qm8NRK~%YTNg8>ea1Z$9@tx^vd_*|8fW4oz^|W z{Ah(oKU8{l_U+xjhp%~i`op*K(Z^5s-K~ChJ+8`H-c zJs0D0QPuhkv{Fkig>O(veF8=vxyW(F7$P@Kai^I~4GL9K&!QAl|5smyH8)LHy@?!Y zgXC`2A>eZ!TcP*>cSeIo?omE&=ZMcSqX&7SY5Md~3q#GH!1P1{L zL5D^xIt3RwG^4b12&jN`D=mm11CoONgp>*n-LTo$KG{d>UW|ZfgzU9N^^^<;rJ9Kp?ggE6hqxmvu6xhqR?;2F_21m@2k$97cpea&!J1Z_*ZTXNr8LL7t_HuWP0{sP zyi41^Pr=0(uVV97uUPw}q1>KF=FL{7SxR5y^g7(>XIuRA_>Ecb=Y?jjfzW+lfk3CM z{(OI0aTlcl>zAhsUl{#;1Zq%(_2u^agdYx67GwlKas!`PuwmQuhrG({^85?s0{=e%_lN(R)5L6Z>gp zyLIzT-f6?{mMHGfCvW->n_mp-JsGQc@^h!?py93urTuxiUG?$m%dyds&_h(d`W?ZJ zw9M-vtiAK`lX|kkrXIRqh_Jj<3K8oH8qaznEDWG1`XqOqlVrw)(paYAcv;Nd?$OsF z=|s)+mKfD`fhug^Rn(xek&MvOc6cVg)-{%JOK5TpGn#O~5W1@;eCLq?a~6_`@7iem z{fe&)O>4=|v`vM+G)G5jz)_HUG|)ky`td4lhC#*xaM@H z=v<3w()Kd*6?C|c^Gnk0mnPypw%4M2)9GA}D@xQ=)-@R2abp(mvqj|iZ&Ih3z1a5= zV$0Rii&7r18nTB(k=Km|7CvV1;y6x)qOTC#hR)|43*mfgX2o%-uUmh;MMez~&@pB? zB;hx%#)cyDcvw;6M5Xy?7yNeEou_s(4~#1jR6P_LITBwDYuT@fUrVvH{@cI>uY2ch z%M(9I=5ue&7P$q`tGC`^5&S!irTW=w{WZEb!|~S}V;}hxQc}90u62pCg$iRrZA0Dr zs7mdG3&ZG%#kHV@zQ;E+U(EP^nrzB3rj+qg;bWwoYId^v^E7RCvUzN>nImY}J28HS z2I|`~hs@Y~rV$}P*jwRWk4 zE~!^kTh)OfW4c#R*E7zd-GQZPy6<|lXM(e;BgfSACxuU*NxtolTnE$rib>l1E5vY= zN=6SA@9GPcoA88t)1Pled!@csbr!Ll8MOT5mELga2=l)T*5TfneX1_f88bt!y56tM zYQY!M9+sb(U#(@Uc=8+A?_A+ z-zUPO_ux*s52T5O=s#)RQGOhc9NPE=hA(5Lv=5J5WUjE}Ta3V8sXcbrQV|j^^lQv` z{5T(fclN&(d6T5GjYsO)$=>z|FEikgm+0&q&ugPrnJ=4u`Aa|gO4F>j?Ed;(!kfQ* zJg{#+`}a6MSOOv#P}?({;wh*)v!c7&ewE9kZD97ljnaVj3FCiHU7!8;^3~tl0WWOG zYZqEc+m5XGUGqPshPtH9+IMGv z2}(gN|J~cu`H?O(F?h>n?M8MpW5_09=D6d-$IMQDwdUyc;QoszyNzG2mZTe~cCy~u z&sPZE2>y55Gs1y}mtEiR*P5=Ic#7onH9pSX(K$e}BXd9{cspA7v<* zmV4xQt}XO{`0s3a^vTiG`;SJWSE(d_Yj80Z3UO}-AOZ|zDuV|iUPC!vY%TmeA+U1q zUkFpGG(5c(Ol}H3ZGL}RAN61Qr2oZ1iNWbH;op;g51$+UcXd!YAgqw5L7SrkMLSJS z^#Hx%esSYOC4MrcqqSVkMMZr(SZPK@b5W269v0{{6tuY-aX#4iVM#Q58xUD?L+{eqD>Y%vu6GVe`u9@4ZEs(LBrGls! 
z^tP}$MRg`Oa4l;=DM{8o&p&{m(?ybb;PlYe-dXS#3>DEm~@A12LN0 zk=j2~%2xH8teJE)4cNyqg82Eqd%J!2o}~!2P@3$qY#ffIN zIun47Wqqra6SYDF?cv+JSjMosIQqbuL5i>dE>COfcts9WJq<;2e_w+Qnh^)5-hAe?4RmcN!& z(l4FYB~Ob#7fTAA)0Gy|Uz#3Bkp0|(Ep@Kutnzvt-@;Xsg}`_wmDa|WUJPbFES;J;%}Z)90(QGhDen&{u7svz3$f>ZiKIWYd3QxN;fn*npyQkzzAqsO%CROI&>O0_@&&Y{x<$*G0`xLS*E1V)%D$ zu*%Iu$OG!Ywa=dJBP}sS=iVdVyeJ`70{3D4$l@lqlldJKb)s**kPP&{9+u$zoEGoFlQg_gfg-eL*Oe3Q~n*mjCL zQKeM}x_j^=O97^31BA=+8P!?dvjO9Xn&>%g9F#YCE3gnU!T#Hv%6EXXVSJc=(4-*% zv1r8~QawszR~bsy$)DE2V?Pw#9A%7z-a5SMg@Nf-Lqkc3TNs2s61r9seHR(2Tmv=2 zm&=EI+pK}=W24F>UtxjBuq|l1;w!Zcvym5&x}&RjOl0u|k#H9adlME}i_Uz4dFc=6 z2M68eWq(P_z~;5a$@pT#!N=(S!i%sP!3QLS+2`?s7DP;m<~1EOz4cAALUS){Tu>+c zMEC=bMw&(CILLa!h*OtSATND@o97K8R9`cL?RF!MV8Q$%mx$<-Vh$}B=O03N;Sgfd zVCFWFqfuvvu?*sE5jjiRKeu^W!X2(0aY z3nrbh;B=B@=-!lbb2!6sco`O^gd{j4)1}skfkbeK)pWE4;TbtXR6j!X(CnT*Q8QiNDB06M6jrd+Z(6)ubI~iyu6qj zBWTM77c+-N(X?t?g>UGa;d^+a7^h=tDLVPlf#@YHy|GLtj zaB=VLE+5;)DOD{=rT>&_WDdG;|C zz)aB5gN3bBU4v9^nuij?TyqAuEAkR_r5v8?r=UFX0LxmBuBh!^`s?P!8ja<$HO#|9 zx2GYoYWx~YRyL&rD_7Y6WUUc|LSo4w8MdlvV#h4cb^nex3i0DNCjB<1qBo|~H)ft} zloo8vetJBmo2YvPr(ve5fN##TY%W~eT$JBjlGrRboJ4i4d$tlE%V&tU5IwF{u0DA( z@#)F6!HtcJl_%4U8=LTd-%V&jFiQE@%^lr;yLbL2eBRvm`*#ri?=bz}QR%aR6m(|icGUg zrad6jA-2%0Tl9ik3<_I}dRt7ETgfgONwQIb(YjUuA3$ce|-7^*3yREQi zrnhHqxo6?JcgKIvlCWo$v1eVjXVbKIw|~!eYR_(U@7}@QeZ)SVb>Cia-$7yDQE%VL za^Kl?-^G95m9X!YvHzfK-@R%7VgLT4sr|>R`%ez`pCS%CSPwh}54;o(y!8%zEDwBL z5B&TO{0RpE83%!72SH5-!Tkr%rVc_@4?+(Po+Azc*27C;K)AwTgx+DKT%V<@q5HcHS0-@;7P5*NuAzFz2!-R>q(>kNfY6uIpd_I?4-5n zq^dD80lMcjbC+lgK;AywQX^-A%ujOf<>**)|(|*F~K*s6kveUt)(=Yv} zLsO^2tEXQNPDc=DB-XQ0!Lu=ivu}E5MN5aKr=w*+(=jtLu(L2&8u8jZ|IwOg*%|k*T?{{`K3*I5X?JX2!BsFZ``_`fTW(ZSSbq zrPkkPsl;FD$8+3wbbst zl??WDANG{F<5d{vCG^}|u+~S~+OK@XPub4@wRZpz5#TWrs8$f95dKWwA>`d?h}K4^ z%)RF_@z4Fg0ea@)au355{lZngMclNERBVkjdKRTYh|)}ndbSy*W)rRA5nUM;ojgvs z{UoO0UaW>sZ1qI!9s78l*m%RN`0$_vqkjqd&PfK-Nq6&zglXda2QQ4Dy=Y2I$xBbM zdYfW9lj8I^^>$2}RdHH8In8n`-NhrrGCHH_d&Yy=tVh1D?nb@p%FlLg&-RVaY2C_k ze39!Moab1X=RTL`U6r3YpYQ#%z%R7WwW#p%cG1J|l1I5E)&G?|Bb0ikmG+L7hVGPl zy)O%xF9TkccQ%*j#=Y_R`6hhfO+-^gcyeX%tIFYzRYi+caYgU@cHaYOHQ^&QX}@X` z%WI?B>f(Cq$};Ptm+Mn18b9wf#^*N0FE?ebH>IsNr+2lct+eJ0x4kcJPpN4i8~>2^ z=R@w=$HM)O8E-l=-*EzDDRvGLiz<2}3MAGRiY2Btn9Pjzii_y3t0Jf7(r`rb1-*FQTqzdbiJ{^RfQk1u0C z2Y3G)otXc+w=h1nNcz6Cv%WMj`)ho4dF5nza&2X0e`WS`W#;GV%)#2v<@K$N^`EEf zKNf%gIQYGA^ml1xV`2T@^3lJw-{jRZ^6JL+@00DnTf3V(`&*~`+xtg*r$_r27Z?Bg z@?TB_C^_}33%VkZj3TxJ)rCC-G@oIPeoawdJe!RB+(1one=UUp`u_pCV#6SpVi*8Lrqc*RbL3#5=1x z_aB1|@21}4d*ZGdHCD{jxqW+U_ocCNw#j2@FxRN5>PK6^&hH;zn%@7{0YFe(H=C;$ zdSVzw?+rEAEcFxljPh=_)GmL?lzI4bsHJXoq(Cd4%eb|E{ad-&yL-c}4S%Mp9luaA`EEMtN0%U0$x_y$Xbvy7 zr9{Ob1xFge(zPWz3+RkPq!P=6U%o1v1;1XZjL(-)OIMQ%67}vpSoV^USonpOtFB%~ zTRVk)qBXnsU?r<>Sf4M_yl8zTyE&}JC0d}Kay2)gwRD&kw{Hzb3SrqXsX_owGe6{0 z7g}*XmC0C^?E&NP4c<{k<6?N_PQ371D$~g8z-W=Tn(E`^c&t}94^dwF6f=4Q4nwG7&5v%z2aJh6Z1RGFqUsRj0EBSl4->ZH*f=Rpqd)YnwQKXJ98eV%@l z=+`i)`N*J+Y(78j4yQhx*NPSXj$TB_N%e@4ZcurPq2*Gs!5+oyR3!<6aXxRUtBhj) zCd;P^#V2ETaovVuo@>XOGpEkogUZx_+elfu;n^Cj5z;LVtByoQt0yBb%7@X>aH}Zw z`fw7RD(IZDV1TSIJYNf25W#Q+{4s7`1Zm#$2mDX!0s%R?UQwhRx@ZVHk4A2+V&)P@ zh@el4bOIRM=B=L%hm3L?6tzGycfrZd{eOu;d>9@#!3(ae60IckJZ1nJtL4RNjw%>D zC3N2yKb3?kU(|Fe0I8~rbn|inJt_+gytJB}YvaaB)zU&{V#u$-H;-l7AAUPKSje_} zzi1lIgZUREG{9A`Uq}tg3TG3i`hn$O!m#E@_b-V(m_)hEJ zDwD4M7NhCGB*L)Eq&8_(-CtvfG^1TcAZ?8krw@05bXNHuYkfV!|BFukzH-`D=wt=&@I!S9?xJ zwGDUQTN%tjzEUyJsO!^0w>t5~9LVsH@tWs=StM3cM@7n9!ryeKe_TF8#gCE>Dlo(j z>=*U~P;8|ve7+0dFokyWaj;Ny3>xOt7$xS3=>$`tTt0`p7WBY??{kNb{BNLR61cMG z99YhHsL<3BPu=;y)e0fQZRX#4k6fm`3yLb!Eh6dl70&;;>#9(GrKNTG1_#{3G$<#~ 
z#Oab{8ILq|ip^ner|=Rg%a%w@3kK-M?-gDO=mp)?@a_C%Q%1zumL7V5mKN*~CWzWL z46_}vpncbOgNY&>63Yl}<3!GTA%--mQeIuK&kaXcC&x<|UD=IARPEns4zuui=wYt= zt@8yXC~T?lJZ9}iKvekqiQ#PDcU@Bw$0+K!M**IZLIV78g^q>AQ1t@68ru3~_0KXW z)~J5A>~%UN<76U3ksal!eR~afw_={6pNPZM{-GRr@dwX^B$jD|3NhK=I}t#mPHSjn zGFq!!?>}<^V&ZXevCSiH{lxWTH+~812EP!{%B0pY$WRUHMu<{3Ae25Aa0gYo;WX|F zG3kHJUIdeD4#K+bQRtf%EeqqCYG5^x5mo4L0m5rie#r6}LmlPcPVec69p9DGU zyz}}HL2D?~edUA+ZNKy)RuS1eEo5--ZX zFv$r~aQvQB3ZHkUd;T)4q7#ML%A=|i7zqE^JS%!r7?L3XN>@1RqO$dCGGj|kk?L5X z0|#4T@(#@?J3D!(wk`64iz8E=Fxqo(_p9HDg>jgIJFQW#>_o-N&n3vGPo(K~ikj~sj7AluAf%VW%>$k8 z-iXQF-&&6Ua5(fmOsf!C9~ppixS+&?2nv&`mtGKK!KB2z)Sy8|>Y}3&i@0SV9Z8cLMO66WFRgoIqkA*I${p*_zOlOR+w8Yx2bO;q|TOd~C(?WRp?)waAHF%XX4 za?fxft+jz!1P{{qjTPFlzi3U5Q^c=+`~l>u8iYcT1i+u6i&>7YMRGKZC2M7ACQnWx z`nffHSp*RpLSQ5qSykf z5zd?wS1mnw2Bvlx$G=ZUqdrVxnIcQ=A~c87-VV;Qo=EC!V%UE3WG15hjbE#m^F%X< zg}GTMqE($pRSiAGR08_d>bq7d&pWx&VpnfR+`B={STU#efBi0n+$FHarJi;|q+lEP zr5~ZlU2TM?xKMWBi2xbtUkPNK{BPsVB4{Nm+W!0=K!-Zrln|d_P&?pZQ zFY%&C69|yO&s%-dQc+x^yABim2o_!CRhnanRe@e!y19>|V-2mM7jMIoC5`e9kbx1x z7Hsh%VTuiC6lq)8>KY{Xwjjb3QURjnpVm0rhvWgYX3fxzmz0RwX!&B`d>*Ww;>&i8 z;M1&YOHZQ$QVP_fdzPs=C>a!7jmRTYt2N7M_Xs*g#XJ-g)I@-j1H|Tql4$D1(r#nsD~~zU!oHsl(EjWhs0|d*YJ4JlldTMVLT|t zd!f#nR=}v15hVbClRGs=JHh@i;UPt|LJwF_i&AJF9E|pAg&^q668!?0&o1V{LRe%_ zuf`M~gd7F-tVa}eg1;<+v6KR5=t!FIa84$HQJg}Vqiu(Wn)ZX#bF@$%Ai6&w9#|}m zAsU)!(g{>Uw5(Gvkl?qx#Gn3g_q9ZmSHY<}5gn3Hl>!fRRt68nQwuWc5M-Ad4j-(h zbBRSOBK%Vws0t$s!oCK|t=p*Q;XHAXUfpIwzI9@3 zUKyDN!VM%|jp=kS1szGo7V4Z{??{!=V!2(Ke%_f5(;!kSsyu&rgZbK}IZ!1iFh`(E z)Xqymlbz<5mmLESM5`Jqfq(F9)bpyaDQwGwuLnn2=m8f-&s_w{dVcetjm==9cOk^) z{AFht1$nSuVi5w^ro3Y}0`n7V^76WD)?)L|kMcnr1z_cZ%jWcuG=?)0N5McdTq;tr zBMf%9Q5|7g%&3ydhH-SgGTlO-^q_$2HyY_q+vtKL6)3EL)Ii|Ns0ti{r)6}fm1D+a z;c4N#97nqJr6k@-1A1wEeu!tW;1>1O!`kTQbBmL5K;8I@6cx`f@LR@#v{QhetIZH7i zKYGQm6?abE-&R`WT0JT_9{t|nH>Jr12jY4>SU$caq#B>$83kUTVKlf@h5I_Y-0$h8 z!aks4&_Xswyn|X?ClW+#>Z)V=#TnWhqvUElDeAuyqpnZM-B3Xf0zA29SrQ<-mrjj7 z5b-~N`pq^eZ!3c6T~2Wn*zmYYHojz6x>T>a&fqt#5wSE?Bfjwv)oJ!r8cSD73i+TY zc56at4U5(Em8o5Ns}#{-VN~(z1f}{-!S;kDIyjz93H>|;?d=}P7A^DUu8baB$8lVD z9+%>Cx2eaoNq>+=+K5VTkj4Q`Z4H8-N5DI;AbNsJ-py0ap{a+NsK@cN??7-pG{WGx z$%vyk(wChfP#Ud*Zfla0z3}%x5>iz$gNy)lPTjuWB3jI5EvW8-BffpaqfrCyVmX$q zv)N`W2I#VQbl_ox`W-*@{SGw|xiE+H3WvLS*tSWxw(hVt{Z^_M{w3pRH4+CxKm-z3 z&_x2k`*H9JVyL<#L4b^`ku27C|6oL=cK18$9e4E3$5L|!*wSsoMdfKJ; zy-BZ?x4580q@pBgfOW6&6O`Wht5XhY^urm3N4lD>x1dJ+MX@yoLbhglq>6q%r!mn= zKlm9uWNmMR1BYIa@}XNOvB=0q^JTL^rq=lqiF@#QD+%TBiuqRXveiRQbcVkrRkkj$4D zr3>_BdT;xs?(a~Os#NRH7trFDD_eOC8g$kipT{V=^xZ#aCiL0-?P>tP#ho;}(o_xJ zC1Ra@<7k?VK^mcIsR)X5)oDyToSaD=gIq;i~IX=r$PVOWn!Ostb; z(WfN2pbrQ))Z;LB0N{GR5&k;uxJ9rQ?gOBrT=|Uk5En;VQ%>he@-rP{3L8TvUh_W@ z8pW1gy?8Wep#05nkVf$_?RWgQf1EU0)pfi-SpQUemYR-V54odjNMo|dFT2qdELvo2 zJhI}U#+N&8HOy&59^GpFW|~-1g9(04uWnvbNW0K`U1QS1KF=k0qLV_Ef&nE!rMxa# zB=S~aU9xZ;T?~3jxsJg|6KOq+`9((kABQl3f6!x<>F+hsE7?RxTaz4r6)U}%7OR=& z*?JD^5)nXKKN-H#4#7~{6jO2V1i~?Ir3>4HF}09suu~yiL!Or>1-kY0X4lx&(btS& ziC4o~;9axEVoc4$46V5{!j@>^qZ!I@+J?8(Zo}w$H8kNdlm0M$;;}cK23-vfvWD|G zzaW-HR+ss>m;L))28hVP2D~!=xcsX8F=yuy&Vi`brwyIX)eQgg z5=c-Sw5mJkmqOtrXW@G$iaD?;1bWgyaTle`$3v;2%YS#Z=Qb+vr&ZJ&Q?^yg-xpr< z!AaGi+{wds2IeczO;#u!aInOWvYaKp!5qL{z8hz&bQcE@YY#s;?`11m?mVK`uIqCu z4NBoQ-TE=1RPrM+l3~yuurd*i`vtTzBC0i!f`1s(E9tRDceISqdAR!>c59~ZUfz1Q z`U<5ZACLh^piM;4IHT35D9=dzJ_LWv~slUPt4R{rFay8u_AN07S;-VR>MS!Qi z$g;MG`W8Fl<~zC4(BGRB@b=!QdjV)_TW~L~YN1*8KrqGAZNu$G;?wNEbiWvTD(Q>k zo1?YRZbaC$F?tOEGtK;^wP=AeZ*Gx<&v1VxBnA%gEka9MN(~!edK%S>9pzEw(Y;{y=tsS4HnyN6kFVt?-d8UkUH` zAHiGuLS#oRPzQJvBQUS;m;aw`F2m8?7ccEl7WljkT0oRv_}XVM5&TjJAAyVX62q%_ 
z;=qfmyT_vn-LWq|rl2AJC1a`~PyMWb9H;eY$a>1d?{ zx&-Prn1)mIo3Z;{8KIP!FYBRaoyl|>0s(JMOFFp7*D@o8>7_69PFc3*L@8Ezc=|cJ z_d|HjB}@khoC2m^XTys*lLD;2zS4Ilz`Jh_nT$yUOI=8Y4R6w+&!M#ZR-}22NJ?f2 zr)koHRt&v>Nr~0yqD}(4+>_pzf2{pcsSaWM)?>dkk_E*V&1S}yjWVU2zm`~kTQSa2 z^4wF6PRw!pp3<=Lr!|J4aoSTd_llkIzBnXlhKYDRbHVfZ6r zn4e0=)Ot$v$NMQIbqHjf^?_^l*2cz41l6|%9k>547`Yr= zm0H%0)2GjA*IgwnYg=w)$Y`{w_Ht%^A(pn+6^o%4Z@XV= zN)6BvQs3&ABxy5fvxKC2y-~dJYTK|R@6;ov)(hqSMxf#wgOkbw(&8sZ#MKRnjJdvxjLVrw-L^ z13~9;oQcI-7)@AAVXxuy_Cjc`iCHCbR68y8tuCMFkgSGfVR(aqwQHq?#XTP0U*jRF z4;~i?klBkYovt`cjM(Kn)RsGEC2qfUjg9jgaY>n&#F$xOi@Bc{9S!CCc9LBQkH3kp zeV46!Y}jPwn&xTqZobdrp3S<2|NW?T4M*;OVt2{f zDmnUlh7-Drr`A2{d#pYMR;S{mFswc+38!s}yC2T&ZD$$Mmt>4hj0WD8v34YX4 zRLEcdW$aGy{pME^aOIXMJ*$?*UMk$14f+iB|fqogCLuaWINW~5;Ek4K1I)1KLpBI4y$lZ+6*?*c|4 zzHD}c4}M>r3!vUrH$9E7#5Sk5?g4gQ%w_%5IYsv5*%h~V$dqbN48WJ1+$zMq8 z+U)RS;D_iuxF1S$BuvssvCfBVO<+;Hl;NE(@E6g|DH{191l&ci;TTI)#|h7M#`vOW z^%x4cq|hz;?-*_7qJ>1oUJ6cdp`Mugr7nKgkuI66&63naxMj+RD(77=Nrm&HH-$r2 z$UUn}sj+_s@Ij`jtO|F!`Y_DZDp}JG;SOP3lGKF}__-9;88$wtR}l#`%1gfEho1zQ z(a`gU4#sn%#E`!jRL6AJ*69n)=erR`ZERrCMN{ytfFdaF9R(Fv4fGCaiB%34ja99Q zU=XvX{5h>f*-0)90WKvx#4d&#lX^K;D?q^b%it=1oqnuiV*Sk&dHpUUww5wi*)IZ; z0?S5@rTUrte5KTWSvRR)=)zw_INuz)ppS|k4~K!R!+Gcp5*Pp!b%jt7wiZ*tak|NR zctukXR@)Kwx6nucKp;t`FjHK3@=JV#OH?D zEu^(6cRVJMV&5WFPJWdc&e6koGb4B+K@%*(>WLtl zsiGKqJ`gT1-8Gu@5lV~cbkH?Rr4)aq7q9F5ySthZwUhe(jalWgS+Su7b(X7{hQ;R_ z*sf?HXvAgfjp7DUgFOth%_`lg=VV1s)41sPN=h7Q3=5}a0v>V?^}I0&{B2g?Wg$={ zEOf`05JWJA1}i%mww0#+u{?w6CtvbV@)Mr-L2xh64dEEBhWUP_Dt-k#dPg zh0c!21d#0X<}*+SUPDaEaQU-3X0W+Q1RI#7vc~B>@kWz1JCgN}KUv z-+AC_Ma_#)w}>{*QmRlHYLb&S2t>2?K8_z91`<=$$cgEMfFx1;l^WfsI1Ny}Cm**q zhJqH!KogTA%rSp4t)-lv0aL~&v4JoYwu(}B6mB{c*)x6#mGVprqohSjf_k{)DwHlS z_ORT=Px@Z95$J>xHclm7-LI(%pz0-5L*N|~iFui#A>?(1y}`Q@1JY=XwU0hZqyW`0 z5PS$@;t@tCF0U7HMAm`jW@)(JsSVF7T;gU7pn0eWQ)DPvg$g(_q_EW~JsWlRE?3r= z$3s~?x*V0>eII|5*Bh~>8ZQ_5opWLy!pfD4v5mB+`D}%&G^oU=EX`<^%M&D%LgLQz z7AI4zda{Q#9^QTWijHm#@RY-W46iE}M(EaK#kywvR&_>yuN37bVt-PB<|*JHK%w)+ zQy3HgUyKaMzGCcMhgE6}iiXzFg9-pR2-|;E5err(QE+7PQ6tbGFjFrQRBr>JFZTmO zJpd3$S;O-VNeqNthWdiQAN@cym!nk3{2>#iRRA>u6qbVmKq6DXM|RRvZ&JY;y1BWY zxfc<4mvhPBDG-kF0DEIGP&5btagrVFGdY(pbCO5KS@H{#ENb!#ErqpbCls#K(aPT5@z+a3_ls$O8vrd4yIr zB93^s$l=iObuD%g?FhzbT?jynY;e0vJS(Em8CPYE)VOS4tctd~MX1Wnb<0zSNmT%} zF%^t#S^^~kTiJ{O^W1rO$P4A}X8rD4r(AM>8eoCIWz88S2|yR%R35!OMlxdNiY|B2 z1eq?_CBYIt0;rmK1H-C zd8xt%d_b8?P(-s51+m(xx`tqeCZL8VARzp1bP)uA1k^%nTC%-+pkex!32GoDKowB$ zI-md&0#PnO^S(7*oG(I2P!JL=uZsqK;U?Z57=iroZ*97>x$;4d^@ZDY|7xT}z<9 zPa1(4MEe+_?bqtr-oWwg=&Tc>>)I{vfM_Vqhpfm(vrL?J?=6%t3APnQ)sBWbU!KJ2 z5<%b?(osETyZxoPNZNhLqObw5;~UP*7%+R{twvlQV=nX401b36QTp3^1I4g)GX*yc z1&9dx^~DrnI4b%gjJq6VCdvQ&km5NIu^UPGuo-eC1jT7FW~~GRHK3QJMYx3kTJoUI zQuHMe19#nSEaX>)kO0ebkbWiz#y2xX=*A4WT=%~g-D8^ZXEK8#avm@D%@)#Ix4)69)86`ME^90wd25z%kl(-v6{k*ZOq7$ffZ+x(>`gMNMtnq2skhF3MG9nriTyl8D3c@yfDs|uP!b4o zb%JfdysH-0K|RqxGILLSAtwYtNKHp7z~z28WomcdaUK*4qVO=F(*Oc((TI5y)NK$1 zZ{`hvAxKNg00tT~kQ6L=g;5!Fc*7QH3F`z@;+wm7U|=)|Wr2?CQM$qHzEe2M8{OBdj85^#MeHtbH{A z)d%93;53SJ9d;2N8*K<&ZJLhu^+%WHq+9nrtnW|Hy^iYZvY0@)$?*jMhzKwo3lxN) z`*pB1RuRyE4tL}nlmsM1f~_7;oxg(dI0A8%))&3OK#woT(wJy13FHJTkC_t-62i9Q z6e&6-^R#S!sLjDis@~)<(4l7e9{P=Qk|H`iao#G$DjK_H({T7*&7vl5*0Q{0=(}zE zrE)#=M6*S1rgeCHOcsDjs(2&A4wjoBq-dvkr2uBYWfum;-b{%HWt)Ayw69BL&#^Bl zEE%^UK+~YkdFyh3zzjG&>!VcJ|H-;nd%M*`b7ae-yAG{ zIl=}A7E1paqs#v)rWjcuw(d{fl&kXw9GAcpLI52)1!2-=hW#3B41lnptLGmtW#(l+ z?rrv2i5>k6yY$mCBh5gSOR=yE2Yq4FX~b1<<)1+wQF})Z563V(jP%_oHlQiJoS~w* z@}K5^@<0Al+l8CB-RJ&`r62(X34psI3{r5PLTF~JVisWvIysf6-uZ9f9o#Rfm9J73 zT&~XCqh~uG0sqt^E{~wRvkX$Cc#4du-=RRc*`TUH-_IIRca$DW-s_CNd4JwnW(V*3 
zx3wtVkfK@%PJ(+>*Rah;_%U|{FDqA^bY{sAxJvbw>}DU>5$SV=r%UG1z%W{93NqaT zm;5>5v{0+^Q`ZIkz}!S4$>zFCl36NJk_9Qr#&7a=>}m{5b{g_L2EugcD%*)bms6Np zh3lTdBqcc_blsx%5f|#^6lO6IFJB_B15GajQWR^bnmNF?U=rB<+cX37(1I{`z_3KE zO{fZ>kM7jF)32X5o1!xXe^8&KLJ_`m|Czaz5VkEuPQ}J&2CC-l?FN{q?B%$aE2U(qcm@1Y5em^G8rVIsy5;vpb@gR#}_mGvk z(}x}=$;ExaKl34;Ew{PI{B-;lO*gRM&1F2P_1#^OB5C$9oc=aX=>{|Xi#r%_Ra8vy zJp`HzHIsxnV4@#y6V#d?ksBmflpcqH9*0S~-O_|So)#X!Zj9f1oN}0oV`mQUTzVaY zU{$UV$N}m=a zDru@Ji|pL*+#Is@?t8crc|HcOl?7bo5X>@)_W@&XNu?z1_ga!ay;73Vcl@-Ep1on!4$IBnXDrf?z;5(2stsJnl)8rm9kJ zgjyjw9lV8FdD;|C8)iY5iqWNJIr9B3IhFhqZ*V!NM;!F<8VKwLAhC8At}K7Ejoc~7 zPj&U-b&eo#vr=1Q&4>&r^OrL0&{q1^y(^9YZ8P5Lsj*hS+OZ3?E&@d9Hf4>z4Z*79 z>_o}AS?b&Xu1D&>hp0L6D=;|!M!fL0-|3=?3CmL0G1~xGngKqw)l?Vx;3)9>KOmHl z_tU$pH_JA9{o?UR=);11LQDh$p2ED)hTVh0-n#OtVm9!^lj8t!YdW>gAuP1~q|I_G z;%7u<5pu$BKXEPnDP=|ljejW_KBJkYmRaeglp&D65fB<3vhxRxZ$r4X&r*YE=AjvK zAU^cv$FF)x_GiEbh?NE4bwNwYsO0^-pYzV1<(bh?{{&bbxH#C^05|`TGJ8@t+he3R z*C0=@uKwkBRpvkWwizD~80bet0bsN;zaRQ?H{j)sCGIKLBh%lfUn# zKQNDKiZtiN8c&tp(Ch;_Bq-9PANMWM=C5BCe^HfEt!niu)}$OYa_#E%E7-8+w9PuD zjiNTQU7-zu=ndDYY6e39@bT;+xLLj);+l2Vps&3ZtIdko|JQ4>xL#)!Za6F1!?eOG zK0DQTqOSx15QHeKjT56&;1~*<1Zl@$%cITlv0wl|1+lVc)2>aCW7gcdb-VMjJGV9& zOMd*IN!2G%-8Hh**oVenEEb-A>qe8Q3O+*c2xT%w{X6*Z)VJC$Z~i=bu-sNzq-!KS z{P^gTRe1e=;(D$P2GWr4|9@?`i8mmDAA!pUnklDY#R9?-kE#@hcm8-o2|CopZ0PogAK|4_6il8ddQzIL`dT0y zAY{@hvFYVkEVAmcmn^ds^#KAUznb(zG)^sn&r!d%0*bV`?GsEOMA(CpGI3O4VJ5_Q z1kO&TfB{A<{;abH8tA0z&ogNF!QdXXpg~O*_nx#*KNNww|U4%B(!PR?2F!y`%ilwe9U@y1evM{y|fObN;;qb?dU2p6h0CB*W`Q6N;I^%A9; z)Jha-6R|eiZ2O3kA2pZ~$8%t0@kVBZ zpi_k?lthaZK3;%=bkZP10RjT^I0(JGn1t8EHIB22`-1);4Rq#hqSkWARXrmtq z%)$a2_#J$Jz#GH>!ED=;e3P=P}86c|5k~{c&riy42lB_TrdO#Dlik77|2 z!E}BQ%N=w~dL?*bKW>(VCB#CMIqBXhzvnrQaI>4<{3bZVDb8_{vz+EUCpy!q&ULc0 zo$h=mJmV?PdD63<_Pi%P&52EwSra5X%+1iUxf^^$PkQ?VT|5@HkAhwZf@$gp1U*>_>IecB zg2je_Mbipf*cHPbRyQ4u6dJ&=j~(1?2aQ@n7NWrlYAB9zE+9tW^5&7wHclD)kO3iT zQ38gt1QEe#Y*wi8&lL7U8xe&qZZi}Oj6cC~)F0QZ$)mUM4l%Qkim@P^`K`0(mxyn|y%83W7;)vXA2{r!SI;H|+^@(rB ze`Q~oeH9HdZ+RjtzVb?{K(Zg>V+UYFL$uah0u)p;j^Tagc>UZ&m)77JNLdCV;H>CH zGnypvh*X&$0^LTdip-Oehz-9?X{=;f2A7UVrZ>$KhbpDLDp{4NQ>|(d7gw?7m^2E7 zE3H<8I=Ly7^{;3R5m>MKS5Yu^uY)b@j3$;LyvDD7Th&}i+d728+gE{j zNwu@BZ8{eF|JW^lakmQWUodba!0fwm%Ea9eazks|zG?-6+%4~Ur<=Fbo@1I-=DFU! 
z`G@&V*}e_ZZ+f%I8et+h!V}J>0-ssoREe?r8t(9*Kzzp&R}(&bc=3&MTtc7PXEt%2 zvn+#3)3;vktRwE_j-M%@EPpx7SN-gw)lmj`o%zV&VDp$C$r7*SInaY{WpqOh$|xi{ z&4pehTUbKrPJjA){|)t$NPX&}Hb+~nuJx^Rz3W~llE%L-_OZ80(9?cMX!rW< zYk$~4B!@uM(uOwRfX4Ak%zL>rl``1XJyz@^3m(=z^d&U+u1`PuZiV;Zc}B(IH*sR8 zlfe&&SOqh|hL{p_o+Pk(5hEd4wrs zfhQEjK2SmxB;F&$!7qJ8xuAiCJc9gDmG#8~({YQ0xF8L_Amr4b5uQmcBw5N z3bI5TL?KByB1jlwfw13I#KRUMMVc%DxP@VXc%qnKqLFylBZz{{<&Rl`4K?gS8G+45 zm?A0y4Uz3Zwm@GncmWX@UP_RmRmcDmn#2@VVag*!$N99qj*9*KqT@^WJE@T zB03=eB!e^&8}0)-++9!fOiLPDiQRwPAAWL08ho(w^>{NN8{ zfhpLc29>}gtiwkT!!2s1Km6bhNMi^>pCu3h5u$`BGGRu71SKv7A?C&-B89Gy1U@Qb zmN26l)__^D0t@&O)Tq`=+T0G*;!#8cEP6y^5&~Gp&D=O;NGL<36i&Rv{{af10loMG z954=Oa#0P4!V)OL-t^7iM2#el=AP_5|#9`FxzZlc9_1PY)78!DUGN0ULPh+NlJ81t=O|*R_(8WztPf+>L1Y|HEHCX_2_-gPH>KWvV~y zLSiOiuX^b%{;LFaBW8-kUCOH@5(9Sm!ydFLo{;GVb&!+#gTi`5!{RAO=;>&^QZ4KN z7x`(y4#^OB2N%&qbneYBDGfUOz{C*d$$G@3`opRIYjybpc5P}(NGOGRgr~Vc>M`hq zmddH#>t;RC&9>@$#loJ|k3V1wFh0i~#1bh{EC{fGp(!m2JOX1L&kxe;sb1$zOh=Vo zDbTQGSeB|s2xgt$X`AHfp7ts70P33#Dh=X8Jh%fJut8j&1SNEd3;06`@Ts3>ftVr> zQGmemG|$A+E0MV9tk#x%vI2A3oQrCxsQ~Op{3{oE{}C8@L^LF$NVtGn(ZsxNr**>4 z#~QAcdc?U*tm)PQ?0AIeHf@DOh!s^47*Im$lIfYm0UU@Y2r$UWW+OgKC^}?SZCTID zQtU0tYP8S<%^L0As1-as!fn}Vw%ks(V6R8u8SOra_3KWe0O5iUcWF|F&jtHfMLXXMZ+mhqh>sHffi(X`eP~r?zUZHfy)G zYri&Z$F^+GHf`6oZQnL-=eBO|HgET~Z~r!M2e)t!H*puYaUVBwC%1AhH*+_)b3Zq9 zN4IoOH+5IHbze7jXSa55H+OfpcYil{hqri-H+h%0d7n3Wr?+~qH+#3Yd%rh)$G3dX zH+|Q)ecv~J=eK_EH-GoHfB!dt2e^O_IDr?qfgd=6C%A$yID@>15@5yar+Wmg^THoIxv87PH+(}r zfJCqlJ4&d-H|T>$$b&I3!#xDMQWQI}3wuf+yRz$fvp@ScB*nA8!>fP0O2~tm%R(AR zK{2E`u0MIG8v_y)|3W8p`jVT3lmEIusDrwvL>?f)xqkx_JUK5s`9-h9ynlqfBSEO6 zM85BPzazoFBSFBI#5c&nlHURnpgFi-yh+G|AY6kMjDbf0L%LtVl6M26%Ywz9L?|G^ z7kmUNsJu#qLLL}-tfzr7)I%Qpfl{RW%6ml1?>tJtyv);i&EI@YJh?pBdLB$U#xK1{ zyaOt%dKxVGJj^+iH@ZK3gB1M1HFP;jki(O2yFYxn(gVXKpaR(6gDez-F|AR4f0+D+J#g{rc_&wlzgfvXPF?@vE&%-TT0xeiR(<6B(d^r*@LmFHG z*snU2%YnE1!@vIlEkt=IoJ6Jj!#g~=w+jNP=Xt?DIVvRlC6M|(0K7jKKa?XssiOkO z<9tX2|L`M0@gIM^S3mPRKlC?0lox_WjQW#L`WHa?El|A0OaGH&|Dr?snYTWW%RZ^Y zJQDoAN4$e907Rrq{s{W(gyd2qBny$qVO9_#L03q$_(RpuA2TFZuJPK)Pu;>ovfkxE zQtVwMB!$A=Ti8z{M~_!PiX_=BlB|gSCRV&CC*#JBA49GrX%eMMmM)R>Be#&(H7Q8C zg_Dy<|DMN+-4+(RcMFLif3i06;x+Y_KRHO!kRyv^XI!~+>DIM-7jIs@d-?YD`xkIv z!Gj4GHhdUyV#SLYH+K9Oa%9Pq>#k+k5Lj4$@@_TCBj-<~V*ci(k#a4~WsGv=E$o(> zmcLL)#>je0u4p}iNM$4GYTTbA7%OT>DRY^AAf%R`}z0x{~y2rcN#;r)7FyX3e^_+ z!i}&FQti6A)-nl+d;Ga%AuTl2qMM=$lFm2aUc!)~4Le-ut_c^iki-o+^l-P6dZX^8 z|A~Nt=p8LKIuAVY$kS}C*et?mtr1((A=|pTQO`z6nGo&}SlgGlH%0%K!f9mw-PB-Bk zP0rxbWI{VR#$e{6Jp6g8MDx(V|O9htb)Q&3sG7vFsK-Iw2f{S`G7|5OAX z_!4sDA;&V+ZqlT`inng zp{0}01c`*1Z&0CyAturb>>if?D*G_B1Is6$i1aa=pnKYmi|wiHz8mkn_1>HB`>t6? 
z3PCz?1>#Fm%R`!6p+%+d#vOkg^2jBhoN|$8y$qF5!UUZ0%02%a^w32gopkyBzMNKI zW@G4!(p`TY_Sj{g9rA0g5*GK|Kf82wB$$|o_TYsdp7`R8M>UkG8abZ%|K^>49{T7N zdxsq9t-l`o?6u#X`|iE}9{ljdAD{g4%|9Rg^wnSAu6~y0CuF)py+^mD<#Io+ckGV{ zfA;5k#~^*h3%w5QzKXhZHKJ4<-P` z7+(mSg;4Q@JX}E@V+aL?m_RHkP6|wNsGdGTafM5Kfr16$iXgsm4_+Ki5R#h57j!a) zFI?gZq2NxK#&8cON+A^aD1;QgaIG%JF^GW_bUf#LiDGxFdDeSqZ>v%zLaqNxyx=}ep8ROM{a|3WcfxMXa){31Eh6bfEk z0i-0A>Qt#3yr-;;AhMWB8q|?Y?YuOT*U(WPfuoW!i~*bepoL}rV4=i?1)OO(YXdC= zmI|Q|`lhS<7A)vzgWG zW;xqg&wduPp%v|DNn2Xeo))#KRjpwA(b|2y79X*dEk5EAkJ-ldwfkso{8rmr-~JXb z@z}u-2(pJA{K6lKs6jOF;fWf^01f7#Mh$kr4Q_y82AQzNc84GXKUiZLGLXRxW&j3m ztl|eKsDUkpPy=(ABMvnP#(U{&UHfulyW5?{ey5RN|7%#6zUgpBZEcGW-2xZE3GQq^ zVp|IkfM68RcrY5Kkp&Q}A{Zb*!4H-Y4+n!F7U+1eA_$_52iJfgLtp_2YOsfBJov%C zkO2q?f(8&oBQX8ghiq%>4tJ#Ez19Wqf4T8q0T*~53Rd!x2gzXCx+BO#F0wfu@!%yW z;=y3>hmD`%;2IEvAf9+|7ATSB;=-631c?I(z`O(yz@Z=L0LP4#5e+xvI5IkpZI7ee z-zg6`w!CHXpb70gC&vTG_?^a|k-U>NmjDDTKmiDLuw@WDV;?Sf@DTR#=txhx()~a| z3B()+HM7|tn)XIPW(?yS^XJe|HV1zHYmRi({}vyDF7&Q>9lISP+8p3zw1M?91Q3Yg z!GxHxrJvysdemaZzc4nkL7;3JI~WBPMnTBd+-49^!XN#J$Bd()jXN`m!SRqat&8jq zV)q)}@owCXq0DY`09x05DFF-ovEp^Oc^zCS$yT4d4JT z_8<>cAQ$H1z2xE$il7KI!7z#-5d=|Tm$!7h-?Fub7_=K>%FWEbT^8GdmL z38hW4Nl==kALW`PbiK?%&j zA7)_-w!mkivMHaEAjW|XoL~!dAujg88mcfNw1E)BAP$mD9U%f72muV@pzA!L3ecby z@nH|V0S&SMA8H{E|4fY_)$_KOAKK^qRz8$_@WC6X;S@cyATCoIU{V_v(;rMz8^FOb@j(;C zvN8RUF81LU-p#hqfg0AKFC&uyCo?h$VjsYP8c5S01QH_FfjC*RH$^Twky9IjlOT+f zIRQg7Tk|0CAu{P{&AfMFi6K_9L`2AQN7zTs0ILLFpb2J5LJ z38D^E;0Z`z3vR$4+5iaDU<=N`9~>bE{V^`R8y56ER`UX!5dPM0cDj|^Wqn&q0UxS z8~WiF|JGq0ej#K~H6Bt`8NeY@(^DbW^I{?+I}qb&rm8>vfew%W4;rWl;sFQ7pcoL; zA8ud_u7Meh;Tp$)7rI~zM5YU%(jV&eUbi3@iU1FGbS3>^M0cY9-~mV>Vki9}7tA#g zlmHM^0SUV531+ki=paaS0Y}HdU?0&Rk^o3M_G3$-2(-fqKo%n0Z}ZGur6IB>Mal@8QE7v7`@9`;hq zAYXa3Ag=Lf=pbMH!3_9Sc*%7OAOQ&gK@YTm4Gs1t3j!Xh!FsRPdbc16Hi2Qu5fhqM z5A*;S^gvI70Asz9Ae@pQQkEb@Hhq;~eG`EW;P+#9!WO&$fAcqgi69DsAtHu=PWf<3 zakdIFfhD!{2SBn0et}900#4g;58yFq56&O5KpQe)YN^&6R<*>u;WY^YJbxhz|3Os} zg%w(DE6u>nY;n@RT6h_j!4(e|AHd9n*MSK`c!ce?AY}M$GgB1@<8N~{Hct^8Vs$tP zqFN8PIR|ov38IJzB639)iGR3S4flv6A|6`Uzh2lC%@#02_lcVm9DvhnYl|45l{16Y zb(xF@A>*$GgB;9)7i7T|fZ>h(f_CT0cIiMD;I{~7p$NA027b2*7!V(>ksyedAdZ(H z5LtO$p$J@o7hC}wYI1rBA`y~6lQ&tDivS56u^<{29U|Eu#sM2Bb{zBo3Dkf_(Kkrx zmwiKaWMTP6i(nqi00~sqE)W3}Ffb+Y!OgO;f!PutEDRke0Sx*<8%jXj|8N#LkJc25 zF3CD^f-TX4F<3A}&UB>{AQ=xG@a%(2j5K2_h5ccL@#5a%qHRqP%=onHycy4Q*lW>& zok7)y34<6sPlyLMQZ4t084p-yLsctIlVf*fwa6%MN}ZlS9Pf*;0%Al~6x0OKBjp^fK)8TNB7^uZPK z_?Py$k7?8;MgRzqAP7uI2}pni+Q5+kp?J;U23VjUl6ni|K?+nr26CWa57sWWzzo`f zVLy31R#Xu+0Uv?@36fw7x}cQ{LVf)qmXn|e;CHQ4HXo7z36wwy|8#jblU36^K?ZQl z<*u|jXY3J}R=6MyXdC#MS-=zH;ki%%9{j))9ow2O*nk5g9*991hykIaAqc$zvtJd2 z2la=eff$IvI}6&J*O*j)ffi4Zv`xD?Kbr|t*PtVn7FChCSn`Jj1EV#2i6KH!+ZM+d zdJ`E{85%V^e}Nd1EV#e9N|k$Z!9ltElC`5DWQbcZB06;Ul(k7yP@{8FkDKvkakqC( z9gM*ha7ZtB!KC?OY32bLU0N_~kS=N(FLZjRk$}K#gB---O;niTAQIdkctH!ms2gZ29c*wt!5EGv9NvK$?tvL*8Zef;mHZ(-nE^V;VP3oH z9i-tI?*SW(937`);gWPumdffmZ}7K(vQ)O#>e{T)`FJg#Ba|4Bm@=0P6DBOl(u6%dOS)=w7V zQyAT%kn%wey8T8Z;}X3683QCAbRpCq13abV9;t)~J3Vr~A$sZJ9fSlW9=uQFK_@bT z9P%7Z!Xd&9=pFh57+9J#;+q!Y!x+w+9E!XiK7}ywogiv@*U4d|=V_=;Toz^&Lh0N2fiTSyCCFS-_!6L%E;jd z!{w!g;#-O0EhHbVflUfx;|0Zl#v(q$Rpd)v>ZjhlZ{ghQ;-f{z6;k@tIinxC!4=5i z%pGY|6xhZ_2C{u<{p-Q>E*Q_(EI7L3csnI z@CpCR^>LD)JY45Es(`@&6Fwts1BcKa)rB#eXn`bjo*B+N23z9ii~aGd$1QGw7v^E4 zL3#iQ=V-n{7P{f%p?>fSU-m~{)`jq<357fe-o1TXF!twt=3;JwarTFw_zS}84P&H@ z;TEbgUzlOkX~Cv_h#jV%Ag15?{lWUFAN#X^`>((IsXzPKq58c){J&rP&EFrIp&8D< z{L3Hx-M{_0|NF=P`r)7Z)t~*j-~Q`g{^Otg0fL=CfBpm-ENGA&!GsE%HB*T2Aj5?T zBM!8vaH2(u5-Vc#=P;wkkrg#gG+C15|HYFIn{gb7vg5&zF(IB*IWXkMjWltJoOu(d 
zP?-=38a;|MsnVrPn>u|8HLBF9RI6IOI#nO7dRUv@>k2k(SAJi`nmvm)t=g^|+8Xk= zqV0wi{tU_8kjmd(A^ynReR!9QKOuM7rMmkzZduY1*DYMRjPE{{%h#`5z;Xo> zmTMUClD z-`zKSZ!gz znU)?orlfX;8mOj(VmjrapYky0p^T#X=$@u}h-8R4>Ud*_E0%a-iXl1!ON-~c7-Fv+ zW>}(!AdXjIw9-yX;Wf-q|BG$5+HTA3Q*E6CZn)x(OK!R5o{Mg}>aNRfyY9XVZ@lu( zOK-jQ-ivR(`tHl`yU=zEaKHll^A0ZqAB=Fq3QyGxEd4$Val{f&OmW2)UyO0a8lUs; z!XAH|mR4nfOmfL4SJ?2zDzD5kigJlSMmg|q2%F2t_?)LVKdG;i%T}wJZwP5y+SWy zNY--CZ8R2wQH^(0{A?@DwR!&yIMr6uO?ctr_T0-ku&}K-4`8Ef z;o<>bst33{Fd+}ppw>NTkq-=taD>~s9tBT$v7WU>1Xz$+HY$My5qN-n_X8I)3_%#) zG0+)g{EDsPEPtT2QjrtIau5GO8Yz@`@|aHL;o_lXVI;V>TBVKUzU zBKn;zh-|Y1VR-b+XqF>)m5^OI(q;$`XmNl(v?Av~=`t)#VFSAJr8_r<%3Ln&3$?t) z9~xnlNHF6M9aIoGfFaL-3KS{LFa&k(au*N1bD{A9rnt!90S~OL7stG#_i{Hynz^qW z>x4}<{}frX3z30#97PT7QsF%qq~eZP@W2LafQ;I$Gk3bvXxU((#VC4Ua_wZOPbC)5 zfetJWTDXKiu7OX2xZxh5U}b_ns1Kn^^_I`MP>D)p!k=o@T^~JHSJ71t+QBQV;}UDQ z$SN+grppjWM4$1t3fFu&V-8dG*xrtsRDb+I9i8yof%)!6l%^50` z1Zs+sTr#3U7IzJ>flWr36vWoKvekh&{EG#L(d}kygAIfwJU0rX=E8*2u!VJo;r%|* zJC)!;h()|M7J|6LBlb^;Pn`t|lXK9K#C7c(7a`m$fKQxynu60G6?)Whz_Q$yw%*l*KF#9b5>GNDK2Z zy8LA*n|aHMRkN4bd^9&Nmd0#DGE%-V1|&E&Dq4_&9MJLyL8Rp#S3rXFZmf?-|C=n* zQZ?*?m)zVps~H)Egz?5Iapg-(SrVJ>w9!C)AyI2uf2J-Ps=cggQnNb3rw(;vS8X$5 z<=NJImNlno?P^~4+Ro09M8Y4k07fU=5DI+=q8oqs!##v92jG z=%`!*328X8G@y2}s|)ooDM+{16?+UjZ-C~7x| z+|%xLwzsA3`ntQ^F+GPX-wb1ZUpw71R(P)m{z8MlSmGJX_P4AY30k-UHezlRRA_-0 za@e^mq;QE#Xu%(XkU|~`VRXueA`LqSM&}7~57o2YAC1rgDg<%Hsq`ZaNHFSwc@aW= zgyS1WAxF0n27IGn-5={6dBvXNFZ)OcVDoYgmO% z_=IDaKV8U%U+9KyA#!#&g?X5F%Z6mnhJim86%$AhSFlQ`fC~D61jKg>9@q&+5qks? zg9L$su!jVe$WKf_5Ro8*Q^5;Ja0NovTDJfUf)x%s<`2fu{}1G_d;)`t@^BB`a9pkU zim_Ndb|HXfxMZ(jXXOA3`Ij!mA{TRLhQQbs!bpthVvKEpj9;jXjx>z<@{Ai0jaE2~ z%xH~oh>gAoaEk|B&FGBYn2qJPjnx>1<_M0rCXUh7jmKz)YIZjVRu3*mf>wcudSzt% z&_P<@3f~|FS3r7`sEM42i78l#{t$z)*A2jc1fe(;f>30Bb`4M?4f^m6#I-Qp&<(dh zeEyJ;8%aifagOb{XH~W~CZaXxfDCP61t@86DtTuu`H~wDlQMa0G|7_mk|8&llRBA_ z;;@o<_AWmOlS0W@Z^m33Vu0|%UoOdhN(pv6X_W4Q|CBgMmEG5r?nW>>B3)a@tG%y zlB3C)?{bCN0qeMnW!0?xtV5i!JF>ln=uKT zOn3~sabV=IZ6RqE^H5->h!sKh6|Z*{*RXB67+~LEh!ApoVIiKWbeYCETclWJ9Tk6SJ?CG9Qc`otUaLpN-_W7RdqM!3g|DTHGpZIAm0m`45c?^MPadvQ(9f1G=C(!wwGGE)j~EzWAIfXJCJT1chJ~(1xNZs-i2(qAlv8D%ur9_66hF zY**0_`GBK2s-ruKqt|c?;2B_TF`&+MWf@{FAOZ``kOC4qq>NRh=Yph4%A`$7TNh%a z5Sju>x}+c4q)&RJSt_MmN}qVQX76^Ic-V_n>ZMq!rBgbjWV&(Xa;9OIrYIRygr#j# zAXruLqJ8S8e~O}D;RZ;Mf+X|~K#CPKN)^$=3ytcij|!>0a0{*=3y7+M+F2E35o8e< zI6_LLwTz>Pd(p`V&*p*pJIqCyR4|El(es_Lk!rP`{Xs%D}Jt1cs-v#OaFat`5F zNODo9pSo(c8e*&JV7gkLo7tUs;6*{K_VtlgJb9D)UQ zP;Bz4noas<0c#=zD@-Q$ub`^116!~T`>?$hu?%al6}yykz+3`r2e1H?V%n++D-Iqz zv4Hrn3)-;@3$idNvd~%`0XB+Sz^Nhxky5~s)k6-lK$uQ(58~<+=6WDPd#*j}4^I6|9Z8=skItwZ&(PMUkkRkCbn6-wOlK9 zF?p~fd$!_0wq?7NYg?ST8Ld*K4-Gg=Y}5#7i9y%E3-!RSO2L@MV1(pg4Al7z#=s0; z(GAi-o&La>wd4=oa1V!xoe07YMw>>kbhsTv3%WS9%u}@*t7&b^wWNl+$a=bRdvL1j zx*w+l3KqKn2fMYanY25gcPmsxR&{Np4__c?)U&SsP_sz!4zkb<yl{~YlCQ95Fy!D0gFCv*Q@Xedzws--^Gm<= zYrjq!v%xhFE+=kl^bOD!|G9a?55O?EO!2fT2y?(-4CL^=%%{8&C=aZ7z(0Gmt3|;P zBECk!3(Vk=%$JWJL=f{JsS((|{d&JAjKV3b!Yjgjtq$C%dm{^|2n-~Q$atg3kPOVhEX>0^ zzhJB|L>m^&K#Dn=OUA&Kdm1&;01U{iH%MWw*!EA;vbp@g%{UgdDp+9L+zq`fIQ~k^ z=Zwzjtj?xt%n0)iUyvYyW|uHbby8Cd{R<(eSPZm~xezkSzyQ!y@lOC9z<=}0>x|F| ztDAV zR(;i6P1kj8*I?__0CNRj01S~(4AP+1V$rt}DOHT-Xyky;Qt=K1c~qA$Xy~dI%uo#d zAlGw!*O!ginQfeTEikCS3&t?kSnI&>&D*{0+b%50QbP`c&8t=+rr-S>^(nhoC6 zQV70h{}k$g1ZxZy7a3gYK*>yC+)a@OV~_^S5DA|w1)(j_olM`Bo!=LZ;dH&<)6(Bb zA>d#!51QOr#*ha2Xckh972iwE6&^L)o#8JI!zzlNWAV`%5f)G{Y@CRm0ao7H63{GO-!ZP`Tb|G}eilgJ23K$e zX%GtCAO%T>1oN=J`V+NN`tgZh=Uk57*F^kr3ZcaSgJK&!UTzjhKn_3`dV)}GX^sjQXcUF; z|GV5~ymU?ws7?^vR(q&GWE71Q_aJG7eiR{Y7SV(2?tAE(PRvvQ007Vf%~b}&ZUVC< 
z001BWjlNi)u@ptQ*y4$Da1Eh12COs>Y2FF(-QoTq1#az#b?$rP{s(Emh_5{r z6yEDb!3^+h7Ki!n2L0>P9?Zf{?8d(A$u3*V4gi_HSR63y(vI&aUG3Cz?b&__xHk{H zrw-l;bNRpz;XdxGZtesz?ykVpM|HeS(G4@*Y{l?_JJ$(2_6<8n@@}wm@*oRZ5Dwwc z<2lzsp}=|4AP2X;6wq7m<$UnytnbA>TV>Gi%3kozpjg`g@CN_$!p!MnvGAcM{|${6 z3ReJwQh@d3&6oB>NJ#R8D-t=4?^!yG0{@z$i@9cxH^o#HG2`?>x zs`-Gb+UxCSWBzDYpuxgeX_ip)%-1tLZ z^#9K60N?{zm8&7MW;6Q@nKZQZ&BChzTBx^-oC)1u`{8h#)zX#t5>FJF*yu1LA%50#WF zT8z25)+v-nnar-)#hh95X3leQrUe~Z^k~whcS4m))hbq>RwYea1V|IAzacJAGe|DvAyv+C8X05U+Z zZIwpt+ui_!KzzU5zlWdO_jUR<`N^uEzwUGX{rS(-hn9NuBd|aNS;{Ux1s7y+Ebvq# zuQl_=vkeyZ8pJR|4TYNTKo3VU=A92mlt~&9>67P{zDQ)TMc@+LFh&^}d{DgA%FAFI ztr!sJ!Wn-AvbzplEKnWdh(yvuat2DWNhfK7F-R$=B&|lD#$&A~t1=KL1FNX$(Kaf_ zBonP6pS%t~G}o+-nIqYR6V52hq_fVXP=mk(J@qU@r|}%vlTUHB6lWVlb@I_pMHl68 zgE+5aX1Yfw-3*w8D80u-eE%=$mumxoM}LrqgGoL%JoOscC)ZmVT;} zXI6dW=|*OCx-t6d<8X#LZME0lPU@?>+1PAMO{(r5S&Y$Vy8pyLB8Q)W<$jTAwg)G? zaQPese5HH#PJB{hu2HAvmBy!$uoHMiy=e%>ziE{gDUdqlqPF`ZZ4(5>FiM#a8 zHvc?!**gz?>Tf`H9nEfJaRqndiuq<%X6|F=q}(%ueRk!Shq`v913b>e+y(xQZjk6=N{h{y=S0TA!x&(^o0^@8_p~-|&YfN1EdFVI2PY_{kO3 z)UlFV4DclY97zHFham3N&w&qo*!_NXpW+xvULyJ00)sOTp@rmqBl)08I0!-wu0(zi zL}3c;B*CABC^*;XhSj>kxO+&$cO|5WJlrIgd5}X^)BoXzI(U%}tJP2%R`W+V-ocD} zFykFYM4}O;aYTZAgBjAeM-Yct#3>>%Av^SkI^Gcu7i#Z>Dnw%%fy6?a4d)#ptP444 zaiqW9Y5S}%uhl*}h4||GIBKc5LH5b{>e+G1t zw}_h%{ZY;VqI4v~3@JZG`cQvx^LP`rXi$YZ7L2-#bRTsRGi33|hqyu(pY$ayWN{*3 z+|8H$3ACV$cci>f<&*(Fh*bOc*QV| zv5dnph5=IxlHuxax4nIb4J~I3-WHdPGFu7Kyf|C#vZc1S1ur;rYqrQX5+VFBryTWR zw`;t^75pIO`S3FjX38U1%Vn>7DR&F=`Xe9i4T(Bpb`5&P*OV?lND^>7~zF* zgvD`;4Xj|ntDU4Pni-CMT;oXZ)n-rifl@4bFr^*_$$LxUJ6D86m)o@CN!+@UVxVuq zH)e?tHt-4w_t>_t*nki&46@}a6o??+m?iT;$|Efo7)S04hC|X~eHw7UNp3Ak0+eMO zcNru)o`a9YoY*;Rzz_>2$3}}RPyc=tj*3%@RrTNT^(rS=nu-ScXaP zygX=>q+$bB7;~cIs)G%rV!^kPtUiid52=RZ7WIHLFHfg1^wQ!ol$M312Lx$uI`ty_ zAf!6s91~!8l+c7`#|BcMXj)rzyouv-OW>#yF7Fy9-jeOBy{uzI*V@>sEHfE|jj4hz zo6yaW^@EQ+Z586A8P28@RIzWFBh?T%SvS8GPPj5Ry5Mor?8CWx@D1pI#|y{! z(~iMthf5qzul2ZRp)G4GT>m`dCokDCtbmh$ha67Yb-7*HL2;5x;uSZ?dCqmdbDsCy z=RXH}(1kv9q8HuhM@M?nmA-VQH{Iz^hkDedKJ}(cLK1UG0n5i7b2cTI>qun>8Fb8Y zvX|ZLXGeS5)xLJNx83b;hkM-RK6kp;-R^hCd*1cFcfR-C?|;Y13dj)eufwUvg_n6M z#4ZHJH{S7&hkWE4j}snL-tw2neCBz86Ulep^PiVIq8=Kn6nq z9Px14^7Xln6K9;m`~TkufB3@>i(9`0C*ntc`pe&v^QYha-(P?HHt>G>pCkT;-9Y#2 zkACux^EhFbHT~PabLc-I`@e2W0Rh~<0@OZ&5wr#jxduzX0R%q=ypHyxK=8Xi`OCoY z+d%wNzzpoa_>e(N$(@bP+Y;5EJ&Et zNK6^H?f_bnq%HkMFqtuU)EJCFe$e?se zW_-!JOv7$txs?160PIV(gcp7Y2U>6if~$#+Dvo=Q$h!E3trX0*L+Q0D2B!~%_jj&%EU}dyi9qt zOwEkMJOoY849q48I}1ya;;hYh5hEo5hURq6=X_4y8J(AS&6!}$*1W~!go)C0l96LM zIsar4P2A0;B)ie{5bksm!Xr-YOrIHBC&{u4zi~B|KrdskPkORXV`!#(00vKEE@q$w zUeE`1u+Lk_NcMaQuk#VGa-^34F_AtRsBT~qA*Bu?jm7C8Q0AhS z^6?!erP4Na(>FaHS;&X;@uwH-se8~Uh$1j@2nRaj&@lzk*Sd&2n+Ry)AespYHvfgw zMP<}RRT=!?Cx5Cze~6i1f~exT9I)EZ{{c~%2^x@CB4Gj(Eyu=~>Lk2iW2aL4wG9Fjj)N24lV0nKBy5$tsHtA#r^Rbr2|QIM# z;kkrMaIwAZTi?2bNI-?Vk_ADC1Rn~8kk}(VQe3W;15J7-U?_xdNfA^iq)5<=9_j{3 zc?WVR8-D0oNx7V(>LGGyroI7KncxRL&6UXUp-GYkN#h&QWiE=i2eZ`LvdNoFU6Ifw z2Z^W$Y;f7%)fS4t-LIht)%>-Qr6-XfDZZ%(e$Yyih}z#!2a3p>zW-2&zL})Zz21Bv z2XzoEs%1ohYMrKqFRj(Guoc_*?XtDyn=2>=RLBEox`Ix)1|)DSBuE2b&;oe~g)3+T zUO)oUWdqc+#D>&;h^)c}tJx)eFxch|uo-?xX}|_~_=Xs632fM2y|UhJkcS}t zSZU~nj-80vbtZRlm&@Sc+|Agn>4vN#8?fP6iLfItx(55QVi>|SdsQ6hl?Hj32XM@l zpUqg>%-+tBhTCBod9b67!Um-k-HWZBdcaBFG7hT^2|JpHdH>kfkZ>-RSs$>K-$#Bm z`t^rUNQ2Ws0)6O&MgR%_y$2*{nW_neKL80SXeNGOV2}vqQJx1mzz=nxV9Wr9D?o+H zO#{UM+*qFFkl@*nkb^?tg-(!zZx{oAkOFlO1n8X$D*9Oyc_k5vW0Bz9hO(Mvo@Rkv ziRw|NzNuYjx|E=e=Fr7vg^6b7x`rZBn1AxG?1`al<{La#m~hb9m@ryJHp|t0V@Yw? 
zv`r#3rVFhM2XYMETv^(#^)1rvhIx<&u|(gtAY^hj%Y10xf+mP;A{|GDs)^oJD0dxq?zYw8$=kieZlW(fGj=*WgANiGJ$0IRHlu-%0Bw^DW0}>tS^XxRz=oRha1x z34ASXex8fH?(3+==BUmeeqd@NW$Zy#U(1n;pIs5f4hf5v>;bo{PSygz^@qRE0y)Ts zaL@w8_=11pY|-{?lhy+Kpm2XEX=7jvSr}%_=!7pwgE2UgevpEL3Gop>l5V(yG5Cml zkb^JKf=CF5pdMY+EeFfAAgUX9gpBc6`s#_^O$=!ddidXt%Qgd9*+ z$ga+tnUeC$Na!hw$dYguDbA*fU9n5aSf1=B>V0c9*7Cj>^XbS3C;nWK$m`93Z~I!Y zHD(BTDn~V^x45Y%bP^~nhi({}?%RQzivNae0!MT_a#;_e3$MPU*UpTix`q!gWRhr1 znz6e)w9FKxd^{&g>ej_5tujSM>$*k5+f}R=0~+mv!hkkm~sclw}Jo z*Y#cJ^{kcS&#K=zxEN) z2OOLUVF&kc7x!_WB5F7Hb4T}dSNC;i_jY&pcZc_Qm-l(6_jD*Y|zr_kQ>H ze+T%07x;lE_<}e1gGcy;SNMfz_=b1*hlluxm-vaN_=>mqi^uqk*Z7U+_>TAZj|cgX z7x|GV`I0yJlSlcKSNWA^`IdM2m;Z(<`J2c2oY(oC=lP!Z`JV^+pcnd~ zC;Fl{`lCnsq*wZ-XZogh`lpBbsF(Vwr~0b5`m4wKtk?Rj=lZVq`mYE3uowHWC;PHD z`?E*;v{(DJXZyBy`?rVtxR?96r~A6M`@6^cyx04^=lj0*`@aYLz!&_%C;Y-U{KH55 z#8>>qXZ*%@{Ktp<$d~-dr~Jyd{L9Du%-8(Q=lst1{Lcsd&=>vDC;iel{nJPN)K~r0 zXZ_Z9{nv;6*q8m;r~TTu{oBX=+}Hix=l$OI{oe=v;1~YkC;s9${^LjfC!iEncPONw_(;Jc!~c#gdp7Obwr}Ikt$R1`-oAeW4=#K-@#4mhBTue;IrHYupF@u> zeLD5()~{pFu6;ZA?%uzH4=;W^`SRw^qff7XJ^S|V-@}hDe?I;C_V44*uYW)P{{H_1 z7@&Xy5?G*t2O^lDf(tU(po0%W7@>p{Qdpsd7h;&9h8uF&p@$!W7@~+Hl31dNC!(05 ziYv0%qKhxW7^93c(paO7H{zJ1jyv+$qmMrV8KjUy5?Q2?M;0lTuo#rI%uwsivE9+Nr0Xf*Pu*qmo*x zsi&fvs;aB9+N!Is!Wyfrv(j3tt+(QutFF89+N-a>0voKb!xCGpvBx5ttg_28+pM$C zLL05L(^6Zlwbx>st+v~8+pV|Xf*Y>5nC^8ed~-ZDU=?V;)6hnyqCdNM@YKW+KUHCQWOdc5G$L zZ7jlWEX{8)dvR*baWhtOp}=x8e{&~UcBsyFI$d|AnRhaCcz}a=aL{=@gnBcedN;#+ zKVp5W(S1YheluQwGmC$Bg@8xXfJanSulZAZjg*{q^J8*`r zfrgLNhf;2cwO)xqbcwLWiBi>xR*8#B?~6xaj755kOPY;?WsXVMj$3JwO?;8J#F1Xu zkzS3Ho|cnX*^^{#lv0S5SfiDT%9Uj9l~~l4W^tEShL^Y6mucIXY;~Gkg_^<3nrok% zWs97;@tk9kon?ETW38TQ+@53Jo^-CCl+B=YexYWfqHpq|Yl5U~oushPqcp$}#G&UOR$*9B0)X~bV?8>v>%f8{uvbxNz^vtlz&e!hFxxLS^z|pnE(dW$3+~d)^@6o;G z(!<5ny35q#($w4R)Wr1Ez4_F-+19}E*2BoxzWUd}^4Q1g*~-k?!}{9A+1uap+|1P7 z?A_hr`rXUY-pcgf(beG2_2Jal;?U~i)Y{|o+vL>e<>ve4*68Nu_U7CC=Goro+3o1u z_vqf`>GAyO-Qw!q_UhvK?BnL`;py+?`S0lJ@aO&T=k4<9`||AZ^zQTa@%;Ak_WAYv z`S$<+{{R302>$^62^>hUpuvL(6DnNDu%W|;5F<*QNU@^Dix@L%+{m$`$B!UGiX2I@ zq{)*gQ>t9avZc$HFk{M`NwcQSn>cgo+{v@2&!0ep3LQ$csL`WHlPX=xw5ijlP@_tn zO0}xht5~yY-O9DA*RNp1iXBU~tl6_@)2dy|wyoQ@aO29IOSi7wyLj{J-OIPH-@kwZ z3m#0ku;Igq6DwZKxUu8MkRwZ;Ou4e<%a}83-pskP=g*)+iylq7wCU5RQ>$Lhy0z=q zuw%=fO}n=3+qiS<-p#wW@87_K3m;Crxbfr2lPh1&yt(t|(4$MAPQAMI>)5kv-_E_e z_wV4ti~k=_zP$PK=+moT&%V9;_q);YnoqyJ{rmLc>)+46q^J%61}NZw1PYi$e+VY1 zAVp*p=-`6@#E@fbb+KlKBD4wTop`#IjSW!tDMAM&Kp7yEC3GM{ zpMAQ)5IEqJ6iqZ3WfRRlBGD5~qXx0Vizw)n6wjr3hAOH;JH)_?KytPrP&O^>AkjVm zX8$u%JOK7{Q4AdjBT_6K0AY|D0`4HINSkzEMMCtrvB{{+qPfGV0`yk4D2tCk^hB~sEN+N3 z5QYXAg+H`*&<{KS{t2MS@JK1(vIXHPAQ^b;O}OF!LSA=(cl)z>fHe=qYX>n1g3LhR z5D4w(9o&u2kO0QwPb3{s?oZyd_!Bkl0j7XY9-#a4U+;~2BcLE?C;uLqq(hDn;2pFO zi9>Q1L<^wE{8z$0)a(ERI{&4>H-M2W;t$|4c*8s4wIbeu8|c(cIj}7tQ)~y$drW#W zyl2z>_u-peui`-qARzC0CCDD2_JIrli9#4H3f({&6d)CxE_KF2M%`NA4+JW3dpB!= zF#4tf0oo3Cx-*tPq98(W6e=3%0Gu1Vferw%EN{LtUI)4miB2i+hv|uefd7g>5VlQF zK<8>m^#}qy>GjVa_=rUwBsad$0j^!{+ehIJ2Ozc;YjA)wpFdi(EI}+O8n~LCKmH`L zC{7VT9RwZ2{L#cn^siV+1fB#35{)-T2!oAkUPER zEgJx$1qLNl!w$s5nzC#LuJR|qz4?Poj#ObPlTfHhcA#C`OeBEdq5nfq`m;Q0;-Ehy z7dwBnK|m?sWIFp6#sj>u>lI4l7ma5&T-5Zwo_O7OM1?Xhk#_?ks|fkBZCFOCn5L_{%kEOdxq z2lOb`f%fsIcv``t@EGGV{UN962~t9|6M;XBC)R(O6+GY=QVB$MJpnP29n9+h@{|am z5?nJI_yEwhNY{_>31Ue2=$onpqEUZvREzEVC^7vJjl>0rodX3>m!j8KSpF42_*jLZ z@*t-c^dqkn1?p!DRRYQFQCt90>pn2pD*!19w0n&~KTz8OTmMo{7~_(hRSVaKRNk#Y zEg-`_a3uoCO^>W+O|Ey)5s5>1L!SZ~Y&K zVN33I3WIVY=GXi&;O+`!6Mqsx7dS-0y~cBIk&sdggm~N38a9<8P-E)Ks@xV|Cl5(z zfenDrhEsj6e*Ulo3zbBM++lI)8 z!87~0O0OJH5|FovE<~XVVc-Qi+JLzLx#}mI*ul=CVCS2}V_SdDbdFLfASsf?oIf1%-Q19z=1?Sl(LK 
z$WC@og6*JFD*M^cHVU%~Tq3-10~y4qae^R2=*ZiLAO=mTSO`PMpy-DiDt5HgWdj{O>i{^&Kn8pW zgBJM~gc`Ndk3)$9c-77bix8qhg}8hm=-32JTd)a}8N^8xiTm8^Zg8FZohVDU$SP7` z2}>lg=mOE%B0#}GHza`=s zAP6cTK8-~#tusc_vor5>E;WQaYbOXIAPRMZ4%ZU{w9tXTayL5@2`!?03CJx%*=fvIAsLb~_+?oLol++U5d*6L4kFk={k8*}fDTcZ z1npu82a<+1gMj{^0z{K_7Y2uNxJFzN4K@Hdc|#9+m_o=fdU=oso6rn9xHUw>Cu3!Z zYbPf{*8z=qEZzu?4YVvYRf(6#e8m-6xY7*%@Q#}RE0n`4qsV-~LJyUc1mBnkH53B} zd1bKJkS0MUU0^Hw;Ccjsi~fgm;iFj%1xvCM3nEYgi@-tAkOFoEDatqyHp5c*AtXO=?{Uj3AjR!+>$pc(*dXCB=jHxHUN}D*^eEdDgeoSrT8_p@^~shIX%b= zKwtw3=a6<867UuUynqkS)hT3v4Er#a2!V5OM=7EZI>)w9iP;s9Ct0#`J!H}Xva=5n zmIRa{m!zXByznO&f=DPQJcUv>%Yq?vS(%hV3oY`4YAF!#06oGrP-96uqClB?C2wcx z4{2Et0;zzr^C#&;VN#+3vU8WjSrWX`C3o|a(LgTNB@rzE1WBL*mBtqxw~-N55Dzyp z3zZNGN00)MoktTlegB7@bx9FmMwrE!o*@wp0Cy4S;EIQFJ(Z}Q^w}8b=^3k4p7pt( z{Mn!W`JVt9paRM)1HwTE6(G^TY^uUKpCTaU13MQrIt3*lV`m%Hxe<#wpc?8B1!@kp z)&a5L40>Y@&;uaOAfaAq4iIV%G!-D0U=9W%8u>;MqR=AKNf9kDpM5c-C(#TUx}nJd zAm%WnL|UXqdZb92q)IxZmf#M!`56{he-aS`H#!kNN*7Uj5|0O@*CC`!`lVnRrb7xi z6`C15=aoMf5mX9yc|Zn4LpYJp1#KWJhY&PT5DnR|3A|th=Hm@hhbeiwr(%Q;mLR8Z zDi>$!a}XB^QUCCO=umAO<1mdXD={>TS+x&~U*cVW&>8h3pS8DF#wWTvzay!3l>BVybuEzr>ne*4E+F|xBvuNGZ(2! z5sLr>)PM_(5(BaW4Fr?X#4Wk;W z&Va7$I;5pKoSMOYsyZy<0tz+yb>WJI12sLoa8e274^}XoKoG10Td=|)0=Cix$toB2 z>VEU5WxSAf1u=4qU_|f5H;RI>;I^eWC08mab`KbV;%Xd1+E_ravP0ShUC^#Yx(Q0^ z1}j?vGXG1Yd7xmIQH28G4G~EY(ZF?NR$s%i3Oy+b5+ewz12FOY{ST63_8$1M)4 zV}$jvaly0MaD^||H99*G$bcc8Bbh4jT#zKTbOSByfImpPF>yGDCtDmT+W|(pt}sia z^zx-eaI;0~4oPsTlOc<=Sh0pvk@O%6+qa9u;#7@-4g?fbHzPZ%nln@zwZU4syudSk zgSBrVxV31J1MxQr@uoz0VK_s&HDhE@G)8R%4l@b2aXTDyOQhW3vY{}f+zypZs0#;Fr=!m3ETh-%LEM6U=FI#1x4TnMY^{~+M30(clRhf zJO2{{&43SN!gbR#xlhYg1}g%ClfGRL3&m25kVJF3}Hobrh={%df1=vpg}jjLXCK z5$q`u^iZXZEFBC?q>@a@RBXi>ObaCt3fw?1p-?+RngUo%4bhOU+)x6lFv>;BE8L+z~uF5ni_uH9Cfa3D9Gtqd5`SEHT(2(SHTODk1<`XU!YD10V?+qX!xxcm^P< z0-?AmAhLtk0#r~2K{g@*lLYZ^OkvU;k);!1jU6$$20`0NEEKTK60#lHDOV6DdD#hq z4zh|2Va*V%JrI9w6hh4riJGPkK@YJI1L#n2{!piaaHnu8r|YVxeE%vn1ITid+h*J? 
z-PB!he(DCUC=4#ED}nmmGtt~D@!S&;cVf2+l%sS^2i`HpURKbL0ukK8-53N`sJeN19qyu`0)CJx8<2NzZ9AU2%f!hKR0|2|V z{YncuBOwJlMj+ef!$J#j&RBw(Q5d%g>hj-nPO$sz5@@~>5&!#sO|}VP5Cw(q57fXx zT6d@aw>9FkfBw*aJ75K8S2Irzx83JWi;_z#=amaZG)R|tE)Ys95KPPFgoSZ}P+o43 z2ky(cEwSqU0p)FgW591-K<4w31h{f(%UQ21u(lpAPEo(FMAsg<{DO~LXtv~DF7)RSpqZ&MJSg7w&MmO0DeWUAAR$i7U??)vG4oX0XE=fh6Ok& z;06LeSSk5gZm22+tO7)T6)i0GWdCIeQN#Fv$g5irLpGQs{S|q+mwgFINzf1a_z+Z5 zm%pF+>j7_n8JH3wbGdI3$GsAV4iaw6A!FYc*Z4B!!OLXe z%fRd;?Xnl>4-?Zo{`7AUFE0}M92oZ>6U$%!!VM7YtoS2X(BMIY2^AK^x6t83h!G`D zq*yVcK#LhQZsgd}<42GoMUEs{(&R~$DOIjy+0x}pm@#F}q*>GEO`JJ(?&R6i=TD$P zg$^ZJbR)VXMwPxqy42}Ys8OX(rCQbM)vEi1VdXkeYge#g#f~Lg*6dld8u9T;8+L75 zxN+sqrCV2ShZg=8b>Q0}R=mC)lDY{TFIr$X{*nat7pp^6j6fed?z_lei@tgH_Wc`p zaN)yzsYd=eIk08Sd^HE|-1l=|!+q_}rvF{r_HCSe&E*Mf$ys0^=DGxSMwxFHb3g){ z6OFqhUrT}Wm`ja!ckkbQgAeapT(NZ#3{(b!U z_1|wkwW;I|_7iYG{rZdQztRFEaKQl$lfJAv_4KTRn;UxD&1$3%L+;YA0Sq>H9S)fO_it|Kh0HFQKRa$)KfnU_Euy| z5+wyP`tc{4#Qb5W3x8}#_S%g=Wpme|Nb|K=RF8eiTVlcG7F%>H@`j9k2~e|D01=9_n3zu}B#kh|rdxBmJ=qj#D>>al0j8B36E0de1}ilO$6$dAU_ z-Fq)zsl`(F{!V9<^9#z;$)3OY`uBg*8S&|_K*#Z)^ZHl78Uau>0?bcewuitS)ekT4 zdy3BTgue4J#1eU;hW|f=(SXo4L7&}90Z|9EMkBTbo3(+i`aw}aF>Th zQNs$BqTo(;<{}89DSW$uoP8`IoE8i(h{fWDDwg=7`XR>>N~}*Trqcx=Fz|_JYT_0> z6h(7PF+WxmP8PiwLFKrwhc$6WMD}3=EAYYwxbsH_{m~C3;Q|P&h=&L^u?Z0bEe0i6 zL=sj(4_?Fo83fsc4MdQHBzPkaBIv@F_R)bTv?vJYbH6hhDLcnAB3_)K#ZXo$ z8=**LhqBW$l9{rVfLi4&E%Zu?#j*vcv?YNY(*nVDGA8C|LPThh0)ixgj&>A=2%Oc0 zRn(9mR)AFuVE@Mp^Z{X!1X*7}@{lAEz@r1`hyo}gp+;fOWM|0hUxp(rionK7C zJO>0mo|%oE_9T%r>d8<2;1e%-&?i7IG#qrMlO^tGLJ-UW5kWYrTFXSAC89u(JqjXP zZpcF&(N_>{7Nnc)BS;C@Ap(C`Asd!Rs6t=Th7KIjpcb-35~GvEndUTK!-++7s;5(+ zijt>34Jv`6lPH(6WFMYLLlb}j5n99mAA;};LF~wj7T{wqtB6Kc@c~Sa9wej$K}QLk zwGZBv^rR_WX-kvJ63oc6s1%cFQNQ@ppmt8KcFoTL`KnjG3g|a-dMit?V2-Cg1RtU! z0~vZ31OE}k;Eu|%LK0%(4>EK>1Yxl1M=>gbi%wGoC7?xG{ZRsVpj4%nutZ^FqOHKj zwnkl)ZEY#)t9-08%_4E}x3W^=rnD z6VYUy7Fo#)d4?h8d>baiIh7<30FLjIi7n`1!{*qp9-L?&E8{22ey&e5P~2kqp5Y9n z07re&T%Tk>0uoX-M=v;Sj!f4F9EsS%EmR!lOjjDykl4?dVQ~gXo1+?f*g|~mbBisU zAsP1}^`z}n<}(ZE(w(-oIY_ZyQkWVNXYjH)MqO$}>%jpUpe+UpFlas7Q_gor$RrZW z?b+lx+gmL25b%?Y2>`$W1^|FJz>)3*1V94KK=hUCv*`Pr0mvEl@(=)!00sa6#s3;O z00K0i0T`HL3kMJY0_>9r0B{-u5a=`l1aN==ytzJLwht@-AmsXJ-~p{}01N;C00Wqq z6bRTi3+lrJlna0W8pnXT7tn5Vz@X+2IC!w_qX&W;9N{@yzmF;JZGz007c^0RVGn0ssjIbu#d~fClJ52Ou}c5FBnF24J8*fA8-D7*GvxjMzRL zw+{vqQ0Zm}yw(3ccGr`xVRPsJ0FdCj2eQ!&7<=RI=9aa4O|AjOfBMflK>xhrg$#qD z*XQU-2LJ^OAMw*;Uh~N~00mmS=6{+40uxC4Ji{(_bELuS;r>l7Y+Xn$2zz+l# zV4n#jpx%nsvU>rr_$?UR93&useuiBi$KMC$>t%uZLyv*?s=3VzGh_R>#xsEM!w-|Q zwGInC!m9=VV1X@QJNJvP1RwwwFtuMZv%EXFgCjh17&hQDI;69_7K6Y<_`d)I0EI)o z5bUx0fB^yUKCly)?URU7D1{rW3$&xb{eihytB=9MkI1VJXE=cXSiih$KOf6F40yu) z!hrM3kNa!9$FnXB%mxAoJOF@zC&T~^@P>^ezz|#ja7X|U%#SgvkN?Q?xHAwxgaZIo z6TJTF1wI1+4S+OYyN~pO0Y79s5ezpT0<00YQ`Y|w*#WPt^^1#H8xb!@c-=mk=EfCQj7lsto5sJUCH1_Q{* zL|A~4g9IHwHo&uw25>%7(765ZNRVv7{j$1mG>1JzNtI+tbAY%ZID-Qq0Ig%CP6I}L zJflinls51ZoB)PNxW`T`%QJ$xU>gE<1F}j}Jpdqp69~VXD?V2|xsW7diyn42tp6(h4Q?QWI)DF zW3&#b23i~tkzdMhWVU4Wb_7Da8Gl9h4B=SUSPCdNWcM+Py9Gg{73`{Ab|b! z&H{SQIk}`s;RPEi2rHl!!(oXg=uqxViyo9v=J5vLqd)?oGJ}EzNFV_h)H7!YfiBcN z67?Snos$;Wfvpe)I`W4d@P{^_PL{X^ItbCY7||MK9#U9O0l}@A(t!fd0H@S5ZYTr* zP15I`M`m=mK;Q1b7gIMK}bP zz=Ad!QxVNaHua+R`qL*0R6)&BcbpnQhzZ}~)3zwnLVXfFNPq=M0P?F34_JUn^||?R z08FjP8o|l}GQR<>k28?e1n9>Xk+vtzYAxF{C|kF4033r@K9oWRZ~*b_g^8ue1BC%nOj}EJ08SeM2k0**SO5e# z+5MQ?{=ICg4;7z*ty-u?5gqjm)v3TUTPHp2r#+YWB>w?GQ15jc1zu? 
zbAWRVfcu4o38+^A&^Uin*ld{CWFUb6;9daWUHtg5reik(_||}<-vqF)bRz(Ut6i%@ zfO^%q(5(-kY&eUnRQ>o~%mg*7YXCpn0s$xh4sa6p%{o$;&2L430H9U^puYP+0GjPx z-|bgA3EqOJnG7WeQD`jOi3MS>C{Wl36>6;FN+BUpVsKz$f>?#dQcH{Qwv-4GsVFT8 zQ(iX3);IKqOn6=yj6eaHxc@G!22wBp>lFZnjm+HDj|2!$XEU(x9lUC=IH2o7LL&h6 zlgYi)1NLRZ95#peHRJ#w0dJT<4A21zm{|&-01VhQaIjyE%ZAjAyW;DGOs=y26~?Bk zxu6sPlUsmnsN@LLgUz+TZ)G~9o4}D1NIsTdj71BtQ-EciBM;Swueimn2JL?tgTPE!oPUj%@GY5>Y_y}B2k z00A~OHm;BBg=70r(f|27R|Fsddi&mW#4sknI&rl+RGirP9AtA~fI^PnH9i3MTgQpb zj|cc8zJq*TNAb5Z0J0Hy;meQN*=qdjl++s&QWhU3gx?poSS+yTK> z_gq_c+t{bA00P(oVqSoA<=$_-IRn7m{7}7HbmnLN5n%YOjG*RA5(H650zm+b6%rGF zaD$8T23!#5McRaD5d?MafV*~sOVXH4nCFTx2P|kXlZcb4XaRQUVuKZHDusFnPfCebVzl{KSK!(4;2&|6; zC~0ez;;U%sN32cJUYl0AGMk0f3KqRzC1^*w+CsD_9I0ADUp{V=)XUOnYD z*^tA-n-j9{-Rl3u6g5%2?o3#(Y&dZP&{gSszx(I~UykYi)xc~Bgw!QwPL^G7g1_*%ShVR`{5yYx*XPM>!ZvijC2XFukbfAR}&=YDV@C85cyM}hgH~zB;cW0sTGA1@g(hq07w8ZGynq70~A+rwgs~J zRllt~*9Nd*bEvr(GdJz++8O3ChC74rMO&+LIV-PWwk-gGBme?n02qS+RqpDOD`^3s z?nFoit6Kobo@Ek?JOcu?5fo`#tpf;Bw?v?nJOUS>0S%a8U!mI+W{$ zGuYgTJAnTw97v~=JYJl(2Sw@qNb*>d!3kgi2-w_em{biQf(qzDVlL=i^HwHsIt`!z z9&a{ehI7Z3Z?pggKez_+nFu!!ffTS5ooxXXfR)U$0j!DZ0atK;*n~u(subu4q*r=( z9_KtjDvcoQQV)q`X_jaCSzHeaZvZVz$%D|EmTC#{#@69xShuY90IqX@zncJ32mlPo zgo!nPArOE7urJw6-E7r>1{Q!3zK_;2drdvU zKLGy_EBDJzgxNmV12EnCATbDlv=g7rhI=#14uGa}fW_=ZTu#i;&T7Vk1sV1_L=P7q zp@V4<3GyjBj!3SCNN10*`jc1}5bgS+p!M2$my|O5?3XTBAO)GXuJeQh&vFf9|k^Eo+u*wiZct?7(;SsyQsz-gb>)CB{B7{$i!rME5UUGM1v?69oTH zkR?%o7c*|`_%Yz9PbX8ZZ22-~rrcB^^#EW<=Fp-?TTV8iDCm!yn=%k!00!yUvS+J| z3i&o}+@4D1&g~7}4Le$;{+I2h6h%8gyMK+BFvDoIrm9mEnNlny!rM3hy2H>H(XjuuM?Ni6?qmK%!s z<4q*!92XB+UpjY4tB|pSUd35~q(A5gl7t)hc;ki};82B+C0(>Z zO*WhGVuf*8bF^aDf^xB#&VFCy3wiwZ##{PbJ#Lht>g`Jx{ zn$SMy#KTPeNWrQ4p@I(J3>l@1L&K|`Ua>&f5gL29q2+HV&RM-60wLzJR%a4sKg~Q zv58K6A{3)2#VG>A5>vb)7PF|uEpl;-RqP@d!zjivZn0ZY+27N6fySvEsX4C5)DC;& zjTpp0EmVnw!@k#oI=V0(bu0mJ+>inl0HJx$vxFs}!>AaXvX-{IB`$NR%U$xa zm%jWZFkJ~ssCm*V|HF;ILU|l~SnhJe{9infwU1`@r+PfBNQ}H3N3KMO@nH#ujRoh_F9}!OVq;-z>hNeODcz!+SJKB z6{@{4XOqa#g?-qdW;Q@l608u7RZt^oOH+X`es=$}shE{%$k;~<1VIZCP_rzC0ESg@ zLycp`u^-vcW?lo!p=d;fN=rE^3vq{zH6}@_MRiJ5Nw?TNIrb@$73^b1)-7s8ffgy4 zN)l)>w`TI5>HKZ6X__y3to5NZRz}FtyxW#VT5`JW*8gENS3IbknQe z^_nOiuLA<=K+@X*;=^sYEo^a-%D(z)D!04kZB!D=-nUHG64mXjd<1c?5(uJB_5JSU zf>zAVp+f}PLI+>H;)x#(xWgX)@R?qBx{v?ZD+vN-A3M&5TM@@sz5yO!ip>=b76Vwn zATCQ3!gXML*r0@DaMX-NP=Zc!7qyp+VUZ}yye?;xdmU&NSMc5xT|FNE3^Fyn$#@kIfGcL@f(Pi2RPYt*%{4ejN_io z)n|+3na=oX!kZ1P=tT#0(B>#~`;v>V%0>jzv6D2P-`89`|C!NG^TRYeE$UIzhWCz-z%1uJAF#dUS2pVZ+z<@P+TE7L%Yj z#xp)(WNVx&9PhZH&0$lJlf2|6KRL?xH13qMyyZ}fH_Kt}@*AhY95cT;&c76JmhU|0 z9i9Y-@gtv`!UJGFKl-av?ed5teZauDIDRU0A!Wh?2rVE(U7N1;wSAoBUEg|i$Fd(A z1+@(*1UuT(uJ)rN8b4v03Om4=4R5e724-T%CM;pOY(#=~YyW$RWR7yd13s^Q7!G_Q zm5pq)!G$q=7Vi8uCS;^abgTc^hZV|O_|5;P=P36)=aX6rS44eZ8Fah^p%uFp>>zknB>f1*Ka109{U6^?8lkb?Xqn!CW>R+Ce&lPd^ zCmwD9LNsW>3vWcB3%|nMChW2a?UMlr*q1)~)4w)w5WFi5Q-4-k_8&_m8h7-EKmW46 z807P3nq~3-eh?mG1PfsBpI`}~iLf777$EH#!tLN61a3tOq=1Ss#``%&@Bv0zh=5;c zMFzG3Q0)ikEe08=fb-Ny9!*6vkcbVu+i|QQjf|j(h>AH)g(z6TCe+D3L_!oy-6FU_ zB2k5?kO6Zng70+{?Zy9t#laDN^n?Cw1pr3c6GmW(&_V?=1_O3Q2h?9!xIqVI#THgY z6y68!IYvC3AYJH55~NxS-p3l!NEwz$X)OXM{M+xml*dp5F07EpP{kb9;lD6g9$3ok z0Rt8U0n~W~3^v9DUZGWdULM@3;w<(9 zwxk+5kUqAYHLAVh+)c*PF(pe)MCR765H zB;?fXV?P3i1mWO5paVe~q)vH7HeO>#ieor}qc@5qIerNcIEn)9fE9?sAbyVsECM@F zg)k7}3!&t{*}x~ZU+7Ij5IkU4ydegbBUOYURgB>>9mkzy1u{&|^wwOgaWQq(C0< zLKvi=A~lpym$ZQ9WknwD z;b6K7Jcs}d0f8PmP;0)XoZKO~)Fv6+W^Q%`WIiWlLMLOYNe5sEGW^?5Ezj8@gB5gu zn9Sr*TIWqNjyLFJ=+OZeMhidsV+MudD0U(l7)L20Aw&v^Q|96>?gKJ*(=zI#GeV;; zl0kY>1@s9G9U$CS2*a)fq@y(BGg5`JFiQn30_5Q(Q#PfFsU%BUN?iIwG*smb;e%h` 
zq$5oQcy?t@`VHeGrB@K8Ka9$l;DXd>OC;n$2;s~o02Spl(^Af;Kh)?{xTrq}Lb+r^ z2~7XRCfp+fHBKbbNFE5!W}@M=OhpNx0}fJYHb~8iy2Xit78~XyHT+u#fI}3_6j!Xm zOIGIZ+zn8nDNHg+s%a=~iYb|zX>{#CKVpnO=%UbIh?!afA$BRAqJy5cVR+Azey!k2I6d5%5GNDed^_{^yOIsreG4LVVWjlhD9wjLCTyA7GOadm}<>^ zg>Wq=8GII~B1u@vjQZ5fTD_vIRTjdcg*gyy&X|sk15GY0nxdmtDOd=r+I_&=d zv_NIMnkY^B15o}8dA0y&`NL5(>$lHzES{vSTXgFB=_8u%nvM8burfIo(F|3e1#SK&#Y*Cf$_l(|YJskczgxEiKT^RIF_Z zEl0U*kOp-@B*-m`?uZ+-0c&;uaI~u_;zM_$gGl++=G1Gn-s_L`?XmW&!}|XNJS+lW z{V5^x4IzRq@;HiHU~M*NZq7XHZDn2cWEeX5&b?MGoSv+T`Hi*^>(?IY!FH#ZIHj~s zMYjfN;(%)mi7UBU>$!r3IiOcqM1mz`L)nOc&B&|`EbAg<&-Qc=)za>SM0{|-tH7pRMGLF=ECfIhsB8UJy#Dfip!1nME8k#7_Hrc_> z6Tqx5zcy^nU`XEj9aUg#9YMGrMH#=zFz z)9(Ho={oA!;H}>J1K-Yv@d9o?2rl6+Z*M@a^d>D9Yq1ECkO`rX3U#j$mq_9aXji1H zTLflult7#9Lxv?T@ecD26l>!7N(BppWyV4a_`1q+CU8UIFGDI=3Oq1E*?-^$}7f)~Fq_Ta) zuU@1@3qVoHFIG71~%mzJ`{0p3`_s~aaPa+IBPRHr?Wao8_8NlBd-55Jjb&g(4iedCArA#JukFI zZ!|}%NtoI!nMOmlX2WNlw2|Vk2P7BwOOAvTBo&IuQgk@wOhY6T*tLs&oy1wwO!veUgx!5?=@fdwO{`=Un;JwqYMOVkfp@FE(R0wqrjwWJk7SPc~&&wq;*7W@rDlW^XoUceZDLHfV>oXpc5& zm$qr2HfpD~YOgkHx3+7)Hf+bXY|l1r*S2lnHg4y(Ztpg4_qK2UHgE^Ga1S?e7q@XA zH*zPpaxXV?H@9;?H*`n0bWb;RSGRRvH+E;Yc5gR#cei(cH+YA)c#k)Em$!MJH+rYH zdapNox3_!0H+;vpe9t$1*SCG&H-6{0e(yJb_qTulH-HDYfDbr<7r22RID#u&H+(`R z*n%}AID~`UJj?@CR6-UQ0fuAv8<+xBOn8KUILi@(7Z|}5n1VHwfe|>t8#IC`G{PES z_#gN~BTRu2yn!r$IF5tdJ6yOClz}yjxEXgZslUp!gq11CM7xk>|LRE8HMt zxGB7W5!k_0*ntxmK^7>&kfS(*XLuz%xtJGRlpBE;D1(`IgFjGth8w{r_=7b}0T*-w zk8}B$-?_QT0)|(DBDlFfpn02{`Be0IF7N?{zxh^t!#CtZRVX^6cZE89gFZ}!JP?C2 z+=HS6hNCz7qI*T8Px_EsI;MZaUtsz>;5n*)g*?2tER=y041RDvm#{5{CRFc3o}Si>;L!aG=lu&;qE@PjPeIJv7l%ey?xTLU@B zf-+de$Qwb)pM1}+{8Y63%g6l8*Zj@r{LWK_GJL`(SOY&yMLrONCN%g|WV@S_!_XJ~ zR7eBX69ZMu{5;q~gKvGrNBAe8xDhBr8B9UR(|Lx!!KwShvGal0zkw!jg_rxoJ7D;! z|3RAzd9!1p*{L5ABxwr_joWBBG%d6vU?+$Xr*w|TuA!QfBDJIF#KggI5Tiy|E)Nmys;~x*0Jt?4B(nfBeZZG9_0o zfwug;F;a${MI-)5mNa=1WlEJRS+;cf5@t-9Gilbec@t+&ojZB<^!XELP@zMK7BzYl zX;P&}Jvwyw=GR1j@@y%x^M?OVqhbE$m2qONjMJwp)3}J%|0&vULmBh+LLs2cosRwyndrap%^(n|E(QFC^vZBdL#=rm!RZIh5y5B1Jw% zK9OuB)uSw~4*4y0{Tg;`*|TZawtX9SZr!_i_s;1>tdXxC-%JVakjt8hnvPFi>5;CH zd;Tgp%yKGVrt3XEksd6(rtSM63 ziLRB#ALfiOMWX*fs@<%NZW%^HWC_(Ky`oUJ4*JEY-LI8%0B zn(3yUej4hir3Tkdu}o3RDJPVkn(MB;{u=DC#eS{ls_*;-YvjgWo9(vUej9G5tud(3 zy6wJ7%e0M{C+@!e{u}VX1!waUffN}W@x&EhobkqmdWRgxC7+z~$}PVf^UO8hob%2- z{~YwtMIW8?(oH|zrG6CVXI_@byhkpDRZ=~rcVs__cGPv>o%i0G9>j=mNE+r}*NuP{ z>xQ6ZX^sCfPV(1@f2c8sg&@ zY0IC!dXxU}l>7Yk-=F{7_;F$(`Y;eNxbU;-n1Ty-n1VdK@QLvps)1D^PUQ6Qi78Zq z3(hM^L;moMd*s4kf1uJlxS$#@aG?^Z!c3p;kw7PWA|Hcr0v9r6!Bq7RhdI>Y4qg9U{?4>nM z(}-E-qLtay<~A#bwXM`6NYcoH5sk32TOJZxCsc+V)}RlGR6;G0go`3a8AT&ZGLuGt zr+~m`gkR_-WOBqHIa2hATwsKo303Gq{bnU}kwg{(DT6u!X_<585?a>S4-NBj8ZUSu zlAvh{AEtA(i4?C4A3ff5BqANsP{t!KEYc{0bUTLL6sI}$TFhwTqaCq?r$0pr$%6lC z5|`ypr%6@nQkmLRr#=;`QI+acsajR5UKOiZ)#_Hc+EuT96|6b!$5{7KR(zZlt@wyX zJkomBvhHK7tqbd1=~`Ez^kW}rU8_6NVGh4?1F+W+>@}uQ*uh={u!-e{Um5Gybo8~c z=1^>42U}Rf4i>PCO@}+y%2s^9)vlox?Ns}5)_CysuiPLkVi)UJ&U*H%l^?F*= zN>;VgV6AICyN}WS7PvYrty(-`ax44oO>|)cgR(w$Ry6IJK+w!Via}c(>zI|J1pDSL(n!_ERUGIMR%e4Qz7FV_R zQif_jmWz$pp4|f389ca$;q_bM)Yec$2 zJ$~;r5WHxh{CU$c26NOL4QWbOjuT+8&>yf#>3zI69g<#|2@57k(ZEU4s~zm&iYUGAq<$r0~!B1A+kTxzLL7<1Ru!Y zMcM7H^@Hao8+Z2yJM@0;UnipjKH#=Ed@wJ@`@;nb+7lo2@s2m7(tO+$$3*@>k(sQ6 zGbX_zOia;vAv{7QPG<=Zrrwi^<9+QY`Fc*4iwC*!2~c4dz-84wm2wNI@l}4<+^i z9I_!BNMiO%LI!D&B-nu(vO&JQVIOob9XFdK5O3;!<;O@a-x z;r7r0A7(J)rqK9UVjp-xzvvDh(19ARZ}6_rAAnE?^$#3+uq2evB-lX_191=)5&N>? 
z5J`d&YfvZB&g_?wKojtTN+K#` zvT6KCq7Y7i2twcrc)%ajpa;z03RGVOFD00L=X8mn<1 zT)+&>-~<$`08^sf60r650SUC=5rjY*_JIeE;1Q6Z4*VhNFt89XK?shZ5QJb9#6ce) z&>!@XAN^4uj-UYfF(t`~e(zAsg7C>v(}0kO3LO5GF>BAqDZQ{Gkz}4;-RFt$ZsQcp)ZlkMM>N9PmLU zc_A8*A>-^&B|;J%*um|F5)V<5CxOf#_V6UNaPOefAAS-XcG4xvj3w~lCruIyRdNP# z5+jMwAAph_qT%+4P^}^{_eOFEy@4!|()a)TfgYxkB#^-gF|sX@;Vo}(CRO4kf072j zjvZKXUd(bYkpUb+vLsls6sIhynDxAQ!aY3cy7RP;fSD^9p=H2bECu z;1052j!1$yO9CFM!B7v?P_N(!Fd;cxj}!Xz4({L=?!Z%cpgMP& zNrDLL@G7%`7vKRx(+)5jldb;-(Cset08{m_R`u>|6d&BKS=k}tpfFiU!bJ5`!?Lw2 zzcB80Vm~F3B*+ydYjg)aF&=)>$<&oy!Qt=r6D183?zHeEk&hCmm0AVMT2JCa2lPqp zl@rNs0K0OngrPCT@GoI7xI|$lZW1Tf0T6HEkFKLnKBFaUN+rg0o61xUe0RtSus>PkX6 z(Sc|EK^&kVI>q4*jKB=+5mP^vR715pJ#`?r_B{1*9>%~3OtmHOD<$6ZC+9^X(-K+F zum+9N2=Vn*33Xvn(mwxZ&sPbwCge+Q^^X^j&&kjsD~%N=J@Krd)gPo4CeW5c(V^|! zt`Wy|NV76rArx|j;X}=pCxija)OBuMY*_s@Zi&?})fGTP(i1(`B!Uz^HLMOvHzpjH zU<*+XFZ5iaj~?O<9e4rAW^@d~`aylKKpvdn1X>^#hf`=< z;tIw9W0_WIKcgL^?h^2U2aMndu0RX0HdDd2YlVPRLG?WI;RuWX3Aj@yvUTrR_v?h= z;Pyf9dUvdTa_|4<7A1H=7B6(?<1C*naENU}ep;qSa5h6VE~f%06VL32aV zAEH%a!!mHu;SG71A{X(7BMu^2HzfBi?s!uF&h-;b`1U{-Cy|nH>-Jxh5+;ApgnbKH zJ2WeMR~QU4CEt)Gk#b&SHzx}AcCUC?yHX{e4=Q2O$@ET&(@PZ6APCZcCRAW~Z$d}? z!51QrB&1g-A}1!W_a(NMdpoNw$blkY;OJewJP$`zHA3o9~&})~Ag3;vbEs0{7 zX^$n6nM?nPuLP^q~JgcId_Apg+d7{I^>d?6XCpbGSD@F+nA z40Ru*7ZrRU4*Vey-q{y?02O><9&&LCltz+`h9nk=h(M$B+@}%V;1A|O7D_3gcL?-+ zz@b&5s00OMVIpSQz@d?VQ5mEkkw6IQv?xY$>y$M@`e7vbwnAc(hl9A8YsL-oz=MIx z1?<@coYx<)S^s9h22f!J7-1bcAP@qf129$vL?8})fDzbX7E~br0>K4302Y%06*|BL zC_w~JPS`p?3Ak@3_yMGB!3#_wAF@CZECMr}fEa#ZE51P@6oedpCLzc{hK|iib|@d- zz^?!CV<{@(uJ5Hm@?jVFCMi^UH-7Iz6mF%V<{JXFB{)IewvVQ5dKNmsw~RUvxX%Xy zp{DmBdIQ-7ptq1q0u@By`h0*AjGCJ3xhJBcHI6MIfMp7P;f?r!UoOK46hsj;$O+(D zgJ$Khd)v2z0vR4*qD=w~lz=2=nj~zRnn_}(!JrCAq6$>arc0u;OX9N+d3v|6w0&U@ z_5ihcq7v#ykyv4$gT-LNL0W`FfD~j6z(gzgLvp$X_#B{;za-~kOf zfD)2BkY^zU;2{n=fDwWX9^fGbgmb!&8+xlj1bizW5Lvs&xx2r6C+OrJs>2kl#9;r( z;T^i6q~?0PNkS3c;2t~zzL{yh?Hk5pTqWEfnqdMSni}XdyQW1T^7df{5|6a~K?ynl zs|EZdeq5`4Km1=k7)+swrr>%6$O*ut8;qk7 zgS;4=UP{DBY-J_Am{9~k}z{NW22 z{?!@&3;f{+7``1&f#C_>-~~S67yjTK9^xN<;0ylXL!RMD{@^2i;t#&!5B}ma{@^#h z<53>u6@KMSUf?7C;5|O&b$;Y|J_B5y;@iREF<#?2{^xNX;e}q{S-$6a9_VvkpAE8~8KPhM zm7n>YANsSO`m?|Ik3adBU;3Y4{HMSAoB#T=fBU=t9KK)r!yo_4{{{f!$+=E93f|~+ z&fpC;=T@w^f#}>5g8~T}M5vGQ6c|EgA5&-R5(y&LW(gB zcFcHgqd{i6KAsC{ux81bC4X9s*7G7wZ zdGu|_;e27F(qV`qjz}V8q+y6+in=lQ#T+YSAQo*h49G@-2PSx0jV<1g!56$qhzKDn z4oReiRD4lllG5=LpES}a31yU0#&=?oR$h4;gSwpaN{?LL@PP?px%h(#%pfAe2P#wJo_lI+6vokg5bW2 zZn~!3@WZO^MrMZ*c07q*HBEr?Zoc~NOIWMw{tF(1e|4SOb(!8Sj9Fy{v0(%7&GGMw|np5>?saL!n2 z$ZyYr$->8bVa(!l(MC)8MV&e?ogIU(KzL3$C^$$f!`qw_jLU?4Ffq)`JbUC9h-7SM z!wqFn4%lCa-KCJh1XE6$07i@rm{H?QaYHkkJSWaf|BWHfN9+5CDS@doj9~hFVvBRi zSTlI!lBd^B<(6L#c;>)qc^Wek>tX`hyI%j@Nd|MAqU|{pm(B)S%HpVTk8?!K5bLg& z`r83I%-hVIJ zGK8SOU-{;n#eDj;{OS}!)1pZ}$FI-K3fBC`HPC=^` zoFZye*Z{ALaSm)C&ry%UU$Kg(vJI3;dFt!ncAN*kDcKW~gaE6f)mif|xLw|X2hqLJ)Yp?;e)s*9EwHYF6h-exko(74oAwBc$sag&=69l+z9 z>@-@$Okhb`Ua|(`ASV^BQ>2m=(4P1tl0GHIPb>ZN2La6|%Kp+0N!*~H{zMtU62wpy z{WAjzUFenmNzY=0u3hbXOh5lV@r%dkLmye>%soiunNJvD4IOQ%z9uQpn35BKF+veq zOfZgONTC?@RAWssbkm&fw5K#xQBZZt)0t*KOhyHZPJf;n;R7bm7)@JsSt@z6_?s^zg7-6@vfQ&5M+L?Rg12odu?K^NA7=8T07GRu` za+wQB^2su=fh|TGpLPGJ*mma?1tVt-ri)4Ijsv@$9S?WQ;a%`vQoQ8VE_cqGURttu zspS<9d?~5kOF9A$1#7^;NbY>uf+&!OpEd4ChAfz07$FN&cw~ox>km_8 zAs25rF)N%^2GipYW~{tz^emca-mL~bsI ztB-JeV=&}6?ucG37$fJ}*4`uEVw-$r!j1zjzS@p~B)djnzp=8zK9rt?b8GxE_(K_!{sT*`VT({-MiZ%4#EJbOaeq)eA4UMHG8D1bzSL3f?&!kM3;YES>01f4a}5{w$^!o!UGNoqw@2^`=)H>ctLD*tg#8ad_Rq zX3sj+yKP%Q>-OL!b&Q7tGljyL!W8;2!mMr4a7_p%)PebUU@Q)5BNRS+iuXq&K>nCq 
zVfpYTNuz(0H(*dQM;rA4%7-wN~?nE1WA$$a;iXJ`2%4k=}-Wta7IzgWSu{lE0T08 zsU}tajQ^{aa@H%H80a|A&Y8eh*Z~+xk*#ik3CLKzj({Q@r$4-3s1}x^r6oQX=vj8+ zfRal`O5~K{giMMkiTVSvrs#^wjf-+&yx$PAJN+PAx5D4Ng3*1I3220MP$z0VG&0goel(r~zbdfN4nUBH}}Ero%yv72s&A ztzzVt(#ce?YPia)Kfr?`l+~D~Vc4W$;kFLne(6_?)!%x|ymFS$>5hM;!|3R3&Qhtx zKL6O*{7R}0Et{@ua3YCJ-YQNAYl*r|v0BivChM*;Yf+p-a~TCBR6;f=4G3(E#nJ$* zDuV9-kMQ7Z=H^G&ngFHj3I2VBC;+S~+=S3dA;Kn5zz*&E!I2%P0ll26Pe8&5oB)>8 z>X&W+4JZ@ZV(%TbKpxQx2ry>&j<54@ulL3Vz`{n@yi7clFN{zF{M2dO+>Brn?FJ|U z2*5);&;SSkPY5-kMFOmhtt&Q_3-p3(yn2j(bnV#X1ixxgtpHH1WU#*?!vo`M@;yNZCI4mz((#Q0!6`|3p9d;DOBGPcVshZUCy- zl2YI!Q3w`LfQjxV2v4wa*xp(?#O4ZvaOl+VA2RPvL@-hIYpoP7rM#y5#$)%w<9nF^ z{k9Vg-0!YpuXhYF?#hiLbBFbQg;S_N9jPz-I`Jzx1yo2y>zHM-eG3(szy*H>{t!hk z7|aBM^1spXn+k=Ld9i`urbG*`1VU$d-9GaZ^FHg~f(f3sQfV>+&*Lfy^YWMH}h;@$`?IJdJqzyC7`xh8C` z0Tkwn1(`q=ZXr!3%jUwfKmRj8(@c?SERsfpuVw>bE#$$Ho0X;^I|npGN3=u>2ni<; zcifBt8K*zIYd=r4M}IU(*9Hkw!x6YqB@hXkdMVFpCpU+*OTRQs69oxpgF%mg3B=97 z-YfFztHlDWv&A$}2enZD1ei?jP#-l?C$&;9HB&dWQ$ICSN3~Q>HC0!&RbMq$XSG&u zHCK1FSAR8FhqYLbHCdOnS)Vmpr?pzIHCwl}Tfa42$F*F~HC@-WUEeic=e1t%HDC9& zU;i~=2ex1jHenaGVIMYPC$?fQHe)xoV?Q=zN48{7Hf2|~WnVUCXaBZlZ#HLlwr77f zXot3Fk2YzSwrQU>YNxhpuQqG9wrjsOY{#~2&o*t>wr$@wZs)dc?>2Auwr~G7a0j<= z4>xfaw{aggawoTPFE?{Hw{t%?bVs*zPd9Z}w{>4Pc4xPCZ#Q>$w|9Rxc!#%mk2iUj zw|Sp8dZ)K~uQz+Q_q%mNCrrXESc7}lchSwmJmkbAT)`0ZcYm`%DdYrx+c$wnTrqIL z5I{jGSc4c0ffBF*Ba{Loph184fj=+;6bu0yz(Rp{c)`5`e-D8eSObF7!7liNJkYm4 z5JQ9afi#5p6{xs}&-l9qf`69+8w>#*-~=5g!4OfM_ar33l>0-IOZk%HgqZKb9`rYr0|hvM zLq6!lo5#6OtOGdc15U_8F(|`5ym?v7xtzoKP}n)1qxhcpIXI9-pT9$wH#$+s1BJta z7$^ZTT)3I{_mdaH5Cj4yEO~wd1%J1>Kdb|$4+R|zL8XI(fA2znr>9YbdQOn~5Inh1 zoO-ISdaEl1IIux}-@*`BxT6O0cSi==?0ZstJr9(k}cY}_@0yHIF5w~P37sCv<`} z@B>cdLor~2eB*?#PkA|@yvpN*H1vEi;HX!1AdPCc!gWNdvCpzYx@wOJx;s> zEHHwR=R~K6xsqP8d!R%|e?IVBgFaM_dKB@!1@DqPOu>3AqgO7KEIIsa7$OEfW zzq6zIJy^a!d^r^C!Y=suEkL=5qkMmRetSdye**-tA$|PGyOoFzn>~z(`I{$3i8V1p zC{kqO4`arR@+fwjc*|d>AzWa=qF8Pn#d;X&3~4A5lfQcu`-voJatKP58hK*mIK<-+ zkTy%6OgUtwzm_kD#GE;#->#89^$P7;N^mp#Rh~OluPDhA5^)@80_n$RLFrlE@;BJQB$ym5eLC z&6EMAKKm$wB9L23QGzqMo+8eld?=!$B{BMuP{ajG?8nO?z#KwMGW`)qL!P#5sLLMjd(VZplh5y%f_-HQkicPCbpQ zlTt(-b&_)ADM!D}*dPW{vS1mNk-7Q+u_}zL(c;S)-HZ)82K({n)?EE@6Hd5hRiaj4 zaaB{-Jj?W_*E3zziVkZ!oOK{vrSS)}AxZ%&5L>kpqZ<)nu!6czR7*PhCZ!0V7um5|J?MvXo3jQmfe0b7lV2tij_^pKHotWZ^Exs7zj5Vgq z8q4~?WR+JZnTQT)u(Yg{ja6Qm<(6H38RnRW>{!Z*N)f3dB~q4|=bnB38R(#e&Y0$! 
zodX7EM~7aT>87228tSMSvxW^Yt-cyeHfBtEhb5;SoPrw zu;2oJz`}^L_-{D3>BO&|$B~p#W)f$)ML8$~i>K_%B&BG{JHRo7M5G27BGlp*x!6U$ zbZio{x`#9dQi|1}CMO+AodO?}IxmT#6lr=&x6PZNgX>7~!4&51aMqA|WJnrUVBruH)=4h3 zF;)vy1|86#53o!^BzH^+920rN9i8QfL%^nWW(ACy01S)#D~EtOu?r~ta+vXy=R5)T zB4l9%7FZI)I?z}ZXF?NL&CvpP&a%ThenmffIE~2$ayl{WFmleAQ@Ly62# z=IGf`kA4(x5Yvl=CL|Y;mb51WJE=w*wmFc#6s9qi=}c)_Q=8rtr#aQ>PI=l>pZ*l6 zK^5vyiT_$uqaKxI{ZQ&Yn2HalJ{2GFhzC@i%G7-*^>Ikm>Q=c*o__4(sZw=^I?hp6 zZk+WRX}!iY*s9iRoE5IzDC=Cyst&T!^&D|Ut6JIGR<)jWuIg}yRHurMt9BK!iQQ^H zoEnd@nw1-CeXCsSI@rP*R;lLsYFHin*t|w|8k4PTVfUd})1G#ojE$;0%qm&RTDGcL z_0(pg>e$wX_O*qTEoyn&ThFOB9%Kb=ZmY^O;IcNaX_c!wq>7K-`WCv;g&SCddycgd zx3pqotZkF~)^oVSu+kOpc%24T)<)L5gLUrS%-dbL4mP{wmG68p7FDt;_Z(A=@8P!F z-T!ki7a#N$@PIePSk|WXzf<*|eXVh4GGgjE@yNILABfhm7B3%fB_6x(6SVypk*vz*$sKfgCkCn40k|n$=2nDJ9M06 zH~V(Uku`FX_K}QCO>W(MB8Pr{6@-;ktq-tcs4OIvN5GW0XJYf3FgNgE;v7l;JOaI~3 z@DSRjJ00r4eZmV#>H|D|jgfs=R~?^rPd`rK1}Ov~2vTqZroU5XM}VOStj0nfei5`y zJNU@O#@jDU0E03d2|H}qS*Y=G)_4@D8ibGn5Q^Z2Oprr8$v^}cB8`PitW;)9=0>>1 zT{jcp`$+4UrLX&uSAzeD9C;{05U_xVSpO&)d8h&qs(^@x!>rxhPDCA~C{{qs@6~1{Qz-hFm}< z>3hkCBjybTghyRdSf2|%uAvT+!)+Ei$Okznkq;eY+8;Ea2MpeG#d742AO8#Y2P4Ku zPk4L)33=BGJ~q&kGpNBXddPbTW)YXN!etSwwgh2x?E-t75p@UyF-ybV@&z1I|k9V-)k;v;PIR?@Pg5-rA(b0$x?NSQw z1Hln+IlNqauy|Q(T`r#pLL5dOm(2e~w&@5*=XK8)Z(t$P5DqYW01s4;_3Q%vwhpf7 z;_?XN8~X1p8t^@W4=#|w_>S*!;6VpsZ69>NA3X3qA_oeR<)DU%`~Nax7-%63dI1n1 z0V6~q48njGUXTi8F#N(H2AyjS)S&SEVIS522g4x1(!mbGfEqNS2-2@7;^7?VqWv<0 z9Ke7HMgb$>ArF{93aHTOu<^ z9R4sP9#A8ykpi3X8Ld$dvtb%B0vqoTFwy}MDKHYVfgUoV5r^U3_VCCo@c8Be319#Q zexVG=!3$JQNW=gWE}}*931H;u1Tg{;N`MJI01IBgAI=~L#y|>!O%ZH>4aUF`!Vd<< zzz6<7ALf7wOwS+SK?XiDB==zj#$XIeAik3DEkw=<-ESX?pb8uz2&7>je1HfXp$O){ zA65<92q6<#jSzrf6v!bHf`AG^5h;~YABZ3ciNNuCq6nTc7~!HBzkwbaFb{oj7~ruN zDhwIo;Tv8r^N;}^hJmT#K?1V@8HOPmqQM`+p%=2j9se$`7pMUlkRci_ZZ4Yg6W(GJ zlyVfPkjpT_Df0jbWFZq|0Uvw-+Hlez@*oJn5DaW#2$*09Ce9b`!ua4a9o&KSh_4X$ zFC3yFs!Gck?GhtM(=`1d8hRldR+BE6OfK-DHtX^l^Rf>%#c_vNr87^W0%Cf5kVW;oXoS9N;n| z@G&3j;@ljg82Cvr+#(-nXZyr24u)V3QU?g)K?XEv46wi-Uf>F>K^cl6{Ip;f01a5G zKn(q%K_3(gejy0vASo}=A7rv82o4^!G9#YQAOB{-KUpymPGJc6i3x;q{y;GojuPKC z5hIA;DP^=n3E?oSf(fpYCmg}Kuu7?{>){d+BfwGuag#d-QTNI+^M)`ZB+wu9uk&D2 z^U87>(*XkKA|G6?Cz@dhq`(dR0ZhGN6oSAXC}9XvaYPHSMmG@vZ6OHyVF+TO2*y-% zgcPfYv?rqB^Hz^G_3|6+5-!6btLOnXY4cB64>|qe_Uc0Gr1TbjFvrSrN_EZ|r}Pfx zQZT*IA2P4Be(p$l;x>(~0UMD!fAKB!bWgA0;iv%~8WA6;0abgi7#lG7zS2C0As+n! 
zKI4--?x7sY!5Y%XKDBBg{UHv(oUMzYKwlrkg2uUz6l3NQl8)DS3?RSFzo z2=E{eu0R$t!b39x9=t(a*Hv8sFcUO_L_Z;1#Y`6RAVhq?Mdc6_ivS~fv_@}~U;Uv+ z;~-#ZlqcfBtr8X*pn>-Sb5uW70%`R~_dz|evr>CPN~shhJnu;F&lj~cFqRGovQ=QQ zfC=INBXWTWfM5(T0$($N;t0(a_!S5`6i+h(VG~xZ7MAxi6(3kHJI#~mBC!{AjXD$6 zXBjoX_TlvcLsI+iXEzp8`%oR@p*CyGQ=|4Se@{y9@;Y5FHtn(jt8oc+c4rxuC#WGB zi;)__;rG75^Nx@FkqW+49H6X|<=7OW(;#lb^S>qrM za)D-T0SMB87GA(WfdC2K4Ijb}Bd}FlCscF2brpbM6?S12%&_f_DYhD1%@gz+eb`lrFyOEsV4_ekfN^Zqu`cd6ZU5JHRv#5_>EZR#p%+ffQV$Dn1xysIAPMkd5lmnM(2g%m zpfIbj2mRp`XaF#}feVOjF4V0qQi6CCmp>bqW@o|!a$pF2KzNGa15BU{J{J#277Sj% z1o{Dsr9d8(pafK)AI%kY=R(^4c0|k9D|AxvCUXdgpw@IZBZAjo{bA4e)s6`vABf;H z4c0GyHuo|(^M*m^m^Sw`uODhN_aGKyhoOT1Qu8>LJG&QT|AG*pEeIyGLk-j)z(A9Q z(jPE%)#ia6_6!J^V18@%P5}ch2N@%xVHn2JAHHE2{*vgvGJ{_^9bS($eei()LWPH6 z{}#CSr1O6tc=+xx8vp(h83M5v-133a4(p}OBBrD2h2b(P5_7ZLNI?p`Y_^$0b_3N!iei4iIteJN}?PHB0SlEb{MLC5C&-A6kuQx*kJ?y z;14$7Z%+UO-arQy!5wDd1bXlfR$v48F&R!_16F_%Kp^bOZ3B=11OtN~xIheUfeT6@ zAFdz~OyVk(pcsImA;1AFyrUe_M=HtzgVb#Z8b*ZjA+PgVGA7}@tp!N(As4*nF+{sP z^5GOFkIn*jv|9!ovWPA|;p9G$wb5=CHlVb6`wu`+2mZhxPCyTQ80}V|hwV8dP5}hY zOa~~TxBr>itp`IO+(O-?B2=IN7^-Lww#6#e7b6nEf0O{d{U=fa9LICKF_6I#7+WLC z00}T+wK1Z#nHnQx+Y6=uBc>qOT00{!JR>w5pMm%TLHrl=Ko3S-Fec%8kf#(@fv!cR zT*P5kG^BjI0}XOyCeRzAl^}6-9M0n$FyJ8xx;8F8VFl=+3^srgh+M%<03P080~UeY z-~k>?pxc~W$a}aNK%lhvo60|YtFc_m0|QF#AvH?jMarcd-k}>7YQ4z^#{D4?*x(-0 zg3V9q&E=fcTiq_!AgS#F9*8^dDty90K>GFp29)o^{Q(I!;JN*rCw4u$bpQlF;1kdq zBmX4b7xdr{DjhKNfim(sWv*m~ilG!{2nuqie3T$ZxtkRE#VE(~2V@SEoFy&&SE9QffL8il>lUEl@27aU=z6aeqg}&%Tz8zG48L}SbU!Li;p6R_l>Ysk)zdr2I{_3rM>WiN2J$|U%AqmzwF#qa| z-rHp!tii{S~KloRF_*=jDVgL9=U-Ok; z`VpV{LtpZ{KLh+>`@#S4OW*oIU-kQb>xuvM``-LzKmC^<`n})w*Xob38fnm>KvBaGx+OLGEQeSgp)lnMH?|R`yjTl~-3~RnSukF8|12gAV>=O)(En zNMVH*UWg%78KA>qhaP?iVu&J+NMea5o`_MoaOqXJgNoJWm zDT!s8YOWcUK^=5q4vSQ!mH{&$nv(%o9m3&e8ga^a*Hk31Nob*lx~Qd@iZ0637N{@^ zX{3_&SkR%CUMeG>UBH1t23)P^!k}r?d1sy;j=E_L4$T;-A%b8EYpjJv@zA8UF2zrg z(nxb_ufG0zRHd>GOYDby@`8>ltGZG{2PAOCDGwwtV~7hKp#OkusC3%79~|c3sV9e{ zB5O{kadsOgFKF?AEINXq@D(=bY$H||B%lBw5ALEM779z!O9Br|=v!>T2D1ok!aiMb z&yb=Bsc^&+Pb}=g79YxI8gRIffnGHzR~0%Rxex_29V|f*SlW)t3@;fh@lhn;o|-|s zQDkt8%P@Ot7YT4&YmOlq)Nr%DLPv2kzAFz+kkNT2eR0%McUS{M6JH$@9YfH;Yme0= z5f0X3j}2+XQ=eTWp9OUgMV#A25=93ae$?(Uf|R81(#(viah@}GI4*}b3&S^`QMml- zF?OmNr>fiV(80GEC@pHuUAcfxN#9iKIo8Hm5;o1! z&bi)pVa~WYoWD-~p`feoYagY!v<@*Z>C?$A6Dng3`u5-7U_1Kn&wu&;-*_w!F=HXT zh=@5p!n;jm01gzniry5MhOEfURDg2`97<9b2uiSaD2N2S%z-L`U`}X#qujQ>*QO{e zVRHX#VVcf|KS^W$dAA3_`^yVhF?_0%aM3h=@XtSdgo@aEUDnpocy( zw3;E1b{xA`h*!K~wmxcd zMO_>bHfARxbMTHuqWsV(J5)*!ow7uNIGo7{w#iu5h#75|WTbLc$V2*L59`2$G8ohk zD(J$N#1x`PR?-GmjANP1Z00yJDa&Za(q#rwqcpGSB4wqqn2VA}A7%)PT@WJ>fzSto ziqQ>js?&ZY$-oSh>CAX86PnnpXA7}m3RBq0p8Pz~NY=TiNzCIxmG~h*4?4DXHbR~Z zeddOiRIG1d;~2;ImPId$(Tr+zqa5w1NB_++(TaX_q$Dk=M?-4Tl&Z9(C|#*bUz$;h z!gQn_n(0Mbx=~m$Lv4*9V>=KE$VA`(9zZQ>#7aT}cs4YiGZ;rtqX5xZEsl&&jjB$k zLe)H36;o4PNL7c`)eN*F5;fQ=SFy@At%4P+1sQ8uiPhDn8Y3cBGb&E{(FtJiL?8OV zWm4|Zu%&cj2xzdYVSClir7E_kEBTbA)-n!bJi!=FHOR9t1zE{bma!SQ?4>e0*~&H+ zvsH;~XvZ;wm9*|qpfv_(J^R?wYId2a?d)m|)mmoG)ggzi2{^bg1V-xQ3QAzZQufdX zrrZMxL)b-c-{)Lg=_9(fJf>0=%l}Z%3V!OmME0(^ylJII5XXW)S zc%8Xk?wa>X=)!Oj7r%AdZ-DJ9PgWFU8PAPLDTV-HSo8!I z##PEa^3fD&K=>Wc)dw(c@k>d07{tVk$U?D;U1KQoW?z|alw8rap3dOJ@>KCSSlnGK z!Pp!!rg1!NjAIu=X~#WoD3EO{Ic{ZXVp4J*2%)i!!Hsyn^vg3!7~wvIJ++hEy!QmNOfp0$;T-DzO2MA%b8_EJ}! 
z5;Xe*7(=)M6w>+T3j3oJSJ(v`^1KQqOkxsO@JAn%K!-pOy4sx(L!N>0ZI|4G-R|~B zBCb%1K9G(l{Sd@+_<$K` zL>m}@@TF`IWg{BMfhXsI7)T>1ID*Y&3?-N>Cm4b$s9330E4@_@Y$hQ;v3$Ek2mZhh zymAGr01iq31?Fab-S>Ulw|3a)4|6wf-9T;R#}j?9HPU7c4JQry01g9XCf(2tw_tGo z(1l-UKObR(FBn;rr7B+$BT{h;1@Q!I_+xHpTXBdEbC>~jXooO}hj0iZa!7}N2#DkG zhPSmMgxH6M=wAy(6*|`3up#KDV#|2P;ZrkRH4`yy&$AnRscCM%oP^f)acm?_f6!{Q!64qPb5Fz*Q4(}ij z)=)o-!VkBg56jq$&xk$|;cJd4PXsnCEV5Gt5d}`AjZRiu-MAv(m?z>mj?Ki4x;G-| z7>;W=g6-&yY7>v^=uDxeL@QExjVN)BNRQ`ekLrkzv}F+cs3QGXhyO@)#?V>MRSpnV zhDk9GewBVl0ay(aZ$M!U5Y~Ukn0yrSa7O_SA$g4%kc|b&Oyl(g?SM3`rYr^VPAch4 zEa{SxNFp)mP7Jw@Hu;h$qLVX8lPfs`E%}ot5|lhSjxm@g?EnXgC?fTBT0`lRIEjD; zA^(&^8Ck$qWlxzRQkjyd=a6T1SbH!8f$$R$rj~2TmTl>lZwZ%dSrmiC1tuwNLD3KS zpqG2emwl<1WcQK1RS;U)P?jYXQAq->unZ(Hm4w+$hB+cxp_q&rmV^nJh&h>!>6r5r z6_J^ki>aA}d26aQU*BkZ0Jxc#86uJinw3eDqp4*fVw#|anlJcA2c=<3(0)Dex&L12 zX;tjmRQkz|<=CGB#6}qqplo=X0OfB>s5j0t;A`RA{ zoG!r+t56IpYKALX3{*j&1O!KdO1@sHtgMW1X6*o2s9e3XbFOsi68}p;rHg zM>?jWimIeqs;o+o1sS2-qz_RyqNVyjP#Ru_mGSg!ViVI3lFCc$I2^8aY3KP4qg_oA2^5$ISunNN}uHET)_v*4Q3$rmRvolMxp0=;H zG!JYRVle~`5{WhZ01U%g6Tpy8e+mr6P!1jI4}01T+$j$pOSEjKu|>P93t8fbg)x8PJyI%XT+Pa|}y0qc63%LRd>jw5cz(}isq(u;RFAtju|VJyaDe8eo=C3~<63X%`ZwYWkQ zkwdV>Mga@d`ylfGu3k(wU`)n)%*TDayk^WLSI}^)Aj3(~uj|x`R}iU1Q4CPP1$4|H zBC5c6%(8xL$(M}DZ41aq0(XUU3yW09Mj;OncEiM^HOWQ1K>_~->p&shaHEtQI(nSR zvrNmijG&v0Bv7CQz`zK^APuOD6uoMN#k5?_RSu=R6Yr3Og)|Amm7-0d48+h6upG<% zYs=S+&DpGyxXdM`unWZy%0bbyql-`)HUxc8s67D;VsH=TPzv|J2H3_3<)#!si+9vK zuGXy0`^?Y%{CM2FCFRfui9otLA!)@4Q2daJLjcJ;(G7%kY43mre&w%7@u5SE&z0=Y z8_m%jU6^qiF|524lig0zy>8<(W0!n8I3x!?9n%k)3prHJ`xB3 z_Y&(c1gmQlTUb=RsFb{#i z2%W$N5_SuH0LP%baeUbe!nce3+Zw&uM*Ia0 z&;Vb+P=0{iO(0JZ&;Sph4bBAJ5m4MlEZ0hb)P6Q@eNbViT?$+G5`o~aL0z!2?Gohe z5p2Ge8-vMsm1RhTY4gl88Od8#whkRBKl#899PZ)g z9pWzG;j5s?g=DZdv9)+i3}TmOnDBn!kY`2>(cm>6P2G)QHov`sdL9I*g?oF@tYzy_xB=vv}0S1l$4FCWJfB-GO z_nyo35_8OVDewaC6OM}aWuy0rpSFF^On+bR7w`ZFZs0p`_=axzo4fcC)3^=x_yO+| z-CFIFUnU_x`ZSCA%cS|p4geQm@(1n;p&$F4Tlx`G3_>k!(jfd4!nalI`a=Tyy&to) zKMu7|^|w#x{=ER>!28UfxxcS4>wwJ3UlbSY{jXp7*x&sA&F3Xg@3wyd<8T26?%dZ; z{sa8%K$XtnZxpd>|6;QI@lXG+H2wep5kjEAFXP625dfgUK!W2EUKpoMqCo@z1Q^7a zQR7CA9X);o8B*j(k|j-^M43|M#|#<#y@VN4=1iJ3ZQjJ0Q|C^c%F6Ks8dT^|qD74! 
ztqDvVQl?FvK7|^!Xv>vVtzN~NRqIx+9Gd_D8x|}Cg9ycjZQu|9M2!_Mc7+>P?p(Tc z$F-E2S1-|b#rFOM95^t)Tk-@SMx0o&O{#VsKZYFn@mB$rEdz*9)}m$055^e)pkd>J z%92f=MxB}@-o>qZt0_)TlempjaB~!9$ffv;>C@lX3bmqO=&-uKZm}v z+^*==t?%`HT>EzJ-M#;R2fsD>_2u$_kw;(Mn%~*$tfhw^HG6pa_3huspP%md__=TQ z2aq;mx{+<3TYz~lKm~=GPd^7AgfK!0!Q*ei#jF9xLJbA$=azS5`)@-L8OvZo6Hi1j zMHSQwN?LDSKKt!oxpKa)-1b$WtCbHBL*D5l;H;!W!AJcw^d(7Hd$rsi?!IB z^pO)WX{dA-F@3BdRyI_trR~^dzXdm3;F{%@CSVK`tejPfiL_h=W4j67qU3eADR0Ha zH(z~Obg5pT%8}+UX#ItcA9P{5Wu2Q6=E-207$z90d-tWdVv9>6H{xIqTSu8WV~R3j z-sXuBCVFW)87GouN?BwEOT0K{nP(O$p%e!x*intPmvT57PX_Qz&!ty_kwoLfBh$Ih4T8M}8utfSYk>Kl`=DZtfcZmcetQYS%J-7(72_V@0LMMFcd#}o%pct# z-0J+Xz`~)6fG-i+&e*3Y_rY&|CqyBv`XxY_bb~#LxrYT=7?pb5f**vb2Lf$}n|$1k zdY%7!U<4=l62hSBY5usM!g?r^?bYyxx2T5*{n5S>!YqYX#A27~=dVxAa74<0g|Pgg zE?B^hdAEQCm4M-<^gU1<5@aA7)wK#bmZKlLSjExID;wP8Mz=9m(}KP{oHeouO=-SEXfY{AF`}u>x#i3zgkjq;<*ArtGSi;-yo&!b z6=%xNt%)D_6P!NUWQ}*Mf*;nX-RZ6BdGucCP>JNin za9684`QrRkOFPe zJ&-C^m=KV$s~qDCf~Eztj{kHppx z7UF>7V+ALw1pV_C@3Adz>k-@8>J%0P;)mt<_gJ6+Lx!-0E>7C<($!Yix_4P2Z1cJk zz&$HoOtFnIqdVT2R28+?MXy!|+XeFG2!ri~Z%mf)(v+sxzAA0WGU6*EGw!#&T{!K1 z1$R4^qXN3-GY=P z)~g^|;@}UrIDUh5Es90tN*Tje#Tl5vRb9Meni-?DF|P53ltIiOPj$X8eS(g8ykvbD z;|VPhagjF!Pbu%U9XCdDBUrIzE_d0>Uj}oS#XM#*m)Xo`Msu3gyk`G4x7p2ahI5?d zJZC!B+0J*4b0ZqjhLg6$z*W|eNC9o-E>soDhemXw6}@OiH`>vUhIFJQJ!wi;+R~TC zbfz`EX-;?A)1L-)s72ip5_EyWgWeESQEg>Y6q*OGhIOoEJ!@LSx+OKhb*^>2YhKr& zCA9{2u!UXgTld=7yZ-gCmA!0P7yH=HPBycp9c*Vm``FQ*cD8Mu0c%?u+u8=Vp)=ru zZu1%iE$KkG!3||{ms{88&Oo}Q-Gwd)V#KT75MJ@U?;T};8R!6bzy&_=fxAMVQ^^3q z6~6F;52Ymwhq%BSzR83~+~O10NlX51@r4^ayNGEAfJv}%jsySP<6ccn$X`nGlAoO4 zsS@qWQGL>pm%QM=md(R$PH>DxL`E?bAfw2F*Ikn&0XFamDjxGIA2WDlMeBv z%bZirb;>Tp+aGK0nwG$U=m7V1&<~^Yv!kQuIzjr`#~zrl?|klxse8ol z4swxCo#1=_I-0`E6fkB@LHYp3-~#`gyjvXbh&i&^f1dVx(=!8-CwkhW-gmJ>J>{H# zdRKGP_`iGJHKf0}=oina_adcZ8fwn!84r5Bn_f1N-#O)PU-r!Bm+7>3yzTkEa*jV9 z@4zp*@PAKyg&SY?S?X_5tl=CheX1PY*e-9N<_~>{BSZf$k&-)~e;Cv!U-i1dJ;!xF zoAj4_{mXZH?avQ?zX#m;$7jCzQLY=*-+%l>O0c5{kcW4`u-&+wdyt2; zqLMiIzs<|P)Z4$^XupMfKlbB2_4B>iC_npCKKzS63!J?Q^uNzT!4aImF}bj#P=;MF z6G({(X$TR5&9%J>(O?`2#^FOhF2Cx+qk^ zDHOsVOcE1|r&ijQDFKk_gNahX!r?1HDpU+4M7s~<4KxfmC6tXIbVAx_JToLgW}rhd zw8JYLLn3jd#X5>$8J7MrkZej1T5+Ivc!yrmtUdn(jJmtS#6Utf%nde_#5V-KNBj#% z)WgvOFetk~=ukONWJE&Y2i#hP6C;WkI*fa`!KVO+?ixkCkUC6+#12G9{Z5&3JN{^6JM{Ep~xLFR70)}~{$9km4T=A5hkVc_+Mrh1FcN~gi z?2jXZGI5L!-1A0GJTzn+M`8q!Q-cqP1V|Qgs{T2pr>K+X!ij$>hK@9#jx2`nxrf9{ z5Usg|UC@Vh=*Zui$fsb(03k?&tPX-yL+k$#Nc4Egmefh|s7T6sihrCWklKZEfW`!( zlc6je0*NPD*~zAm$bsZV_=w4f?1Fa0$)?20#2_eT$R6BCr*#4v3_6{8XcR?BtVIz@ zY8jX@3Yc4nDt}&X?dIq>IOSGOT|d6U6{U~ zP|3rg4t3d;xXjDRq|C}ZCRoS^c7Y(JnxcE?te)biauA2fI?TjK%poI?bUK`9A(;?? 
zOCPk%*p$uLw3XX&APB-bfAEm#8KGf;6g%=s*GNoifsvQM8tQQr$)wHYWX|R!6x$4< z&T11RQigsghE=d>1mpe=rMPoM;q z_#BG(ghlq`$u@$Z*oD{_o_Qpp^VCU3 z&;lh;&mlm?=eQEukcYRd2^w?_ND|Qy1yP%TBx@M1yg(cX`Va_$pqH433k#qJ!VvI* zphr>;Ye<=LC{O*t5OAo6e1K8s5C>~miYmFVvw@&8!Ot1F&*cb~8v>kuh^MnL8GeAP zro0CXeTo%Dk3A?L+4xZO7*nap2O`@G%BjJ8a8I$Z2KVGqB(jl4dZQf$oC9r&b)X>Z zdrumq!FR|IW6}pTeNxiuhkE}M3V8&E@5{%hfXL5M(6_x zvW8pegI}41Nl2fCu_#Qeb(V*Z5(L|R-SsEgehW8w!WCfagAWB5+RxE*& z--H)rtyO=hhiJ00oP=W5^!xsonTX#IMW<@lzQL?Ka`0(1&nn7S2&3cd2k+h z;Rj^(!+a=*b+{xwWePrp3D1m)IX$Ruoryv{)Qv?kNPQG2Acj()1MqX;a@>qZ91ThemANl>J>`@cD@XvS1EQkag8z5j|7z zp^|ryhwlGlU_yq8Ygtib7A&b}Qz_&_UWzxS7J`~s0x_rI(qWj8he^1CD`10s5Qi&Jf?Lpn2qI=>Hs&Y3g4+RVe{kYrPy(K9->AR@Ef51PAe??s z0`_R?rzV_kfC4T+1B+qxga} zvaSgk7NLH4xUm+auLcqe@aA1;Yp0S4@Uz!}deS z{)v2GTVsuhi_VLTZYZOYW9}m$1@z?$LzM&49||(;_qY~2e$6uRVU~93+Lo$zH5sQk zWYn={spukWSn7$LiB{Y~^4MF99BKHF?3?HZLY;{Ssuy_tt}K-amA38cKBghi?(P5P z?(X*P?*{Mi-fpN6@A5Wp?p_Ysby7jG?Dlr=_lEEImhbtV@1pR`GO_EKxV|ze?qWHW zGI8iEjbC-!1D_G3r(WLNfOXZB`y z_GgFoXqWbBr}k>M_G`!XY}fW}=k{**_HPIGa2NM+C--tU_j5=0bXWIvXZLn@_jiZ) zc$fEir}uid_j||peAoAV=l6d1_kRcYfEW0IC-{Ol_=89Igje{5XZVJ9_=ku1h?n?@ zr}&Du_>0H*jMw;$=lG8I_>TwqkQe!pC;5^$`IATalvnwcXZe^l#_xi5~`>+@Lu_yboH~X_k`?Od4wP*Xbcl)=8`?#0;xu^TOxBI)t`@Glt zz32PB_xryG{JWv(I@@VH~rH`{nS_e)o1;V1s$H~!;C{^VEw8JkcxBlzL{_KO-{_W@f?)U!h2mkOF z|M4gP@;CqUNB{I!|Mh48_ILmHhyVDO|M{o?`nUi4$N&7-|NZCx{`Y@?@F#E}!GZ=4 zB21`oA;X3aA3}^MaU#Wv7B6DVsBt65jvhaP3@LIX$&w~dqD-lBCCipBU&4$jb0*E2 zHgDq0sdFdKo<4s94F~`s`2-0f0RI3i000001q9FnhX4Qo2nPoY3kwbp5D^a!5)cm+ z6A~E~6&)HGAs!q;9|t8OA1NgxEh;E9FDyDWGC?~yV?QQILq1AHLX<`km`4yuOHou! zNH0>4T2oHNS`lVkRXbgjv|kpjU>nC^8ed~-ZDU=?V;)6hnyqCd$YvtRX(ml;op^0! z%xx^fZY<4jFwJo@R&t?=awx!ZGk$Yye{&~UcBsyFI$d|AnRhaCc!1D(J)n9w!+Sr6 zd~{-ctI>T!g?>2geluHuG-H4?(||`*f;DV{ICg_Pkb`>DgG^V1IKYHT?1epAhC6VE zt$&6>frgLNhf;2cwO)xqbcwKqiAKhWQq_r8dx}Nxi$`IMMU0J0nvH{Hj!D>#TWOI^ ze37@wkzS3Hp4pRRZIn}&lvRk8SEH4S%9Uj9l~~l4W^tEShL^Y6mubA1fZLgDbeda* zn!%=;UwNEgi=4WcoMrKxV|$)s+@53Jo^-CCl&7F;&7gFCp=Yz9ZStaPl%u?Yq-)Tm zd*7sd@}+RCrgpZbo0zA=;HQ9ysB@vHw6>{v;i-eXtf0}Xg^#U!nyt;^u8Q=of1%E=mzM-eTn5n;*ufN*%z?-_k(A2@S>cOR}!kos# zq}#)$%*3YZ#H;tjqOr!K*~Yo+$gj1@sKv_F)5@;w%Cq0gy}Hb;;LNi1&9UyzxxUY` z!O^zG(dW$3+wal6;L^P1(!=V~x69Py($w4R)Wr1Ey~WkL@z%r0*S`AK!SdM0?Agl9 z+Qa(V#o626)7;1M+|1nF;riXn(%#DC-pcgf(bnP5_2Jao{N~x-=h^M(-1g|+<>~SK>D}V$-S+C?`RwE7?cwR}<@xXE>hS0N@aOIF>ihES z@$~NV_VN7o^7i@l{Q36(|Nj6000{p8{RtdMu%N+%2oow?$grWqhY%x5oJg^v#fum- zYTU@NqsNaRLy8oJq5$&6_xL>fFh*r_Y~2g9;r=w5ZXe zNRujE%CxD|r%fOt?uiw9b0}CEZxUk{Fh!ZPb%($`R$B-jSo=my2<;$2eYu?Pcv**vCLyI0w zy0q!js8g$6&APSg*RW&Do=v;9?c2C>>)y?~x9{J;g9{%{ytwh>$dfBy&b+zv=g^}| zpH98H_3PNPYv0bjyZ7(l!;AkPPrkhQ^XSv7U(de1`}e!i`I=9^zWw|3;p^YezoaM) z00t=FfCLI?M1Ke-s31jP6zJfC0LHW6gcMdd5ET$+IG}_UcIcsg7y?+r4J4LGq7x?G zV8RV4Z1~}eFs2uxh8l7hu;640DU|S_ zjYn2#<#IC`;Uo_%l&~caR8RrN3{zfd=9$IKNF#()qUq+FxNWg!h99&E=bd=Erp*jc z_W6MZB0w1+lp<*GL7#oH!B9BiloU-g7-km8>+cJzJ^$vuqDio_S z5WoIjVyi#sBm#yVx)5}4zAm%jNU^>W1PHkP_)rJ@g0771OfOq?| zd4M$!#H$7@0D{aw;SdPz=NjCN&yWDd;*TR5Q0`COw)hh@?E#j6j~$@<^Iz|cej^|t zXea+3nWRIG5a1f5;D|$V7i0^d$owb5KGfX613Leez&C)7B;pU?EPTT|;k6>J0UPMl zO*ybBAXBUc&U;L{GrVil{rBOUU9aL{3mzcvdLqakp!R_b0EuE4EehR08WbQCoGx|7 zLPp(E;12{UaCmKoGV~P(bHuNc9K;Jn8k%ANYuc9V9ot&;hPp?%PM<4hJB%6>D&SGoL?N zv@Ad@DH^z%obQmundNMqD3y=sTqzhb?zzzZtqJYe{kR=WT zSQ;pSEKA@FbeI58n;W0)&Zhy9(L|p0A zSk97`eXAMm_UH&Tm`xi1vIPbuR6`BK!|EQl)way?f$%oh77ga!@-P7UaCAb`u=F9#V+Y5r$$ zH{-!RD!MFf&?g?a+FRiUHqouY5gAPj2PFU$j)wZN)pHMsuD;>{jpIkw$G!*^hY!j7a(>D6hK{y z-dl{IFWrshdKvvxa(05N&*$+@bU)k*y zrjT*V4*+YA{!qm&wc9BS%Bh%N^GATUE09b4=>uEf5C!uJ&%HT9N+}RxZBI*BRenH? 
zsUs_LS9~2i7$F5S;DH)W^||`_gAy!M5*Tua5qeOHAU*qtC%~XaTv_ZOSOs!~3lfi$ z5yZy^NwPxl;X~&_amfRLazUg#GAtj&PW#~U#aK*c?Wmzp1TsQA9AShlgya7kXdVy` z=BAUyMo&sRazI8v-X*r+ge`2r7wAv}<^trZpIqVwJ97e^ZxWAf_4(2{ zQl)^Xzz#4tPCz`c1vNfB*MwsF)Tma9rU7Jvc!aV*AF#!PMQQ3(+xpf(*@6jnz3UYe z?m>1C#WiJFYg`-q*gg5RgHVa=W;@#`$}SMIpH1y*^CTX_$hNk(Jq#mS``h3Sx46em z?sA*^+~`iXx|`LHA6H}_?ly>%*G+GKg5k&)iITb~qK5G@bYvDq3p(blk#<`o-ZzD} z!Vh9_iYR>H^^SN{Yai2TE zan5(5B;6vaC;=rXk;I`3L}!Wc1P9q*1Ya0IQEU*w$|&cHEr?Nry)r^9?#cpB1e$gY z@-$bbngUaohqdHvk4>cLSj3If4fz=of`D0bZ+; zJ4VoQI~ER@q)=Gr52eIV`$jL3#53l^KKP(41yO(;=Wzjd2}DK-+Hfoos1?p&2Q6|g z*JD1kQ-|+!Gg|;o(_?iZcrG=BJZaYtv7iPm0x2p4Jox{zLyI*%v{Qrr&{DACE3k4d zlEW>I=!dX!EdPOTm3V|=C@xNr2(rX^iI6v#B89G~5{P39u>dWvQ)Ma8DRpuXET9VE z0D>K~Z#QrW=m3RA&@Pm4AZTba2IvnaKr~lZVQ(0RYs3}NFawa2H}sH)DTEB6X9sq0 z3C^H{TSGK_B32}rc5(uA8o-Fh(v9EPK+7^ylxT^{H(ZZ}E6(5#?sy5XLOHx5ipmEp z^k7Lw;Ei@rLo8sBRrZPuX%g6HEBnBD1aXV}XLI3$Sqv3RvJ(p*AOeZNLD5hGbOkBN zI1o2e0w?1LB47e`@Nv4M1`i{Rb9fc~U;-i_G|~SPM=8)=A`nM*;0YzLEGDp%P|^=R z*dT_;K~WM7b#pqy>RywW9i)05FaF4QFvDZm3pU;>lI z7aNz65LFNjH!=#95C}Jr0&$&36E}Njopb+55nd*k!+D+|5e@!!5$Mp0hjBfWh@SG< z80P62sWqPSnVJS5J4zs2KvG5FfV-C;*AkP4yTxkvtY7R3MAe3MZ1|k~wMiHXWBGE|^DG;B1 z5u+#J3>TWA$pIkd5Trsnq(oYzMtY=38l;rq4z~Fj6gPho!2&fp5j{#5PkIuK=cCsF zq)FPPUizg$3OEy*89K+6Jr@yD>UMSj21G+Rj<5x4AS;CsG)@o=+HeWJKn3RW4N-?F zcbcbSgb$Per)?@1Wy*67=Lk;Ffaw1)Z5-n;iz+KIB#cx{3__ z;GDX^16nf|r%Dltzys8v3ys18vI7qv@K?<0tlBbM1-VtR>Z_nvJL1!+#^DHd5TwFT zuEG$c+Mo@g`l-)guId`3qZ*u=p?;`3EaKt`Gun0EdW8ZtJ-%R42;~n{5S%>ls{uQ( z!|(yNvIWN~7xc=0@rPx;aCQX|a*1F>?Zr2WVzJ(qr8fmvDF}8A_<-PQ96-8QJdm=`d@UlW`2MGUm8B{0`->?Pq77bb_X7e>Hs<4xyATfYoJF_ZK_2PJ_ zqBRA3c-+FUI3`#P8y7jN4OZB4U30SofeaYZIgn`r%@s*sOE=KM4){~F8v}=OShB@& zvKm07>DsbFN-td+1T;IO?qCF^IvKGzixX=&6G;!EkbQMGEKSuY=s-X~bu+S)syP$2 zPW!8p>kB;7H&y!$bxIw?fJdh6V~j zD!al!1%F5lU~s!^;6Gpxq^WQT+#n2&Bn;GG4ymvOLBIw>inm3&qJec84=E4|I}o_H zH8NLs_9#3%!vfBL4`lzMb<+d6OuJPC`vHSfzFQCr!)h#=R~OS8k*u^TTuVnOz=J$C zc06W>_$xJm5|*qoTbIDP)9AXw0lPw)3Xd10)DS(($_5~s0)-&K&maQu3I?iBem4Mz zACL&K(4s=>yhREsb_JJP>!>L1?TI#aRN!DPczN0d|0& z1mO!TRD?&I9YO!P#6#M|=Fq@F8pT)4p;G(-L8^6Ge7p>72VTMrA^gSW5DV^r4v1_Q z;Xs)m5Drm1RjRxNtegn0`~lwp%k#2J2a#yC^9|ZCIHCXsYT%wO(GPZY6sOF}tDMTm zJTb96%dQ-JAEBNSK@U@!$kKtpLh8tn48>Ax!L}d*pui3E0t&Sgq$NPb)DR8l$_*lr z3ZFcryuvMd;l>mZ3ljMdjv%joAGEB|oVa-7bEG57V;1r~=45F_Wq(8L< zzgxw0wYyMa0zBZwLOMF7d>8m2iVjhz5An}-vC$Oa(EyDmaQi6H;H4Gp$wQjbM9K{= zZ4TPN(&zsi4Lm@z7~L1_ka4E5(JUc!5?Ngp;kngM4IHByJ0J==>CmAisyFH2py{xp0741rARsA_ z4rN`2DzSHBeH8lK5jr{%T(=Q3Du#jC&o>$rea#Yo9TNLj5Ulb6jy2Z2!8-tQpf6gW z5n^WmqACt*n*t&`bS*#xWe{W|5-v#){l*j`-4Rzh5!KickZTaHUBp7M+APu9jD2zi zQIe9KAm{+A$k5dcaoPj1*GA#f9C4^;>Jao03oL*R1LqHO3J7)Trf{0Bc&evT6M!lQ zxoiJ+-Oc^n1IMRq(2B!gXu9&J+C3AV(LW&{3sB--P`>E zJX>G~z?=|W_jPL2%R83eJ9bxDw{=h_bvISsTK6@Uim6|btP*hvGcbEb&|I~e2wC%& zu{x{3YC%&a3eh01sVd^KY6}|9;orckzA6y$0NEB64<(+%U4h{d(ZCJzV&TAej;APy zr+C6+D(l_f{=FaI)T7-hdTNlXE5kRw3YDDadB{_Ffl4}_he%tntvY@aU(FHpN)fXy z5G>%YSi7&bpfeIOuw&%0Xg(~qz~+htm>6Yom!K~C?dAfD&t47BEAg=Hw`7Cn1c3j( zG#rF=b=rShBR=`(5BqlmR8V#`qvT5tAD3`)@F#x?;RqwUMzqo>;>v%lbXN;FUTO$M z3aC4504snHvOba68j-VN#j^!*3GyY|c&@Z2iL`AlxllBNayjP$akK)Fv}Z0@76%C9 z#RhhuzM0z+w4M@Io3#Ux2>YN;2SE*kBR)=GTM|MmcO@&xppWynOe=g~U$G+vzh_<@^Kt5sH8nL;7J8xh>1^X)4{4aNU>V+Rt9 z!^lBe=MTcn-;v$!?ZE?EfDXt*k-3Nv^iC@)SesPf1QPO$x??-5zVDQi0;=m1?Y$Ax zyG8_e2@8KAHc6}X0=~6FWa=9W-|!^x>|W<0_TnoLRPgnJg}wsOAQ%?8sFLU{arG(@ zzx6y2w%`Qe>FxZtLXskqB>*WLDFQPHMJ87Qw!;P=;C()?AAQrC6zMw&LGSvg0W-j6 zgatSyzy|vsSSQ(8ZKx^%Yyv%h6)C*+UEgI1A;b7U$f#QoK^B-G{S|kamwOpWM$ixW z_z+Ywm$^Ur>)~#F`Iiy_bF*&|!o3n>9ujKIAza@V(eDy{{KrueCC2~XEXg@ta#8IZ 
zfy-c^%e?F)?Xnl*4-?Sb{qAoNC@&KC92oH*6UG1jy{-4AEg1Tb|NajU{sayrSkT}> zgb5WcWZ2N*Lx>S2PNZ1T;zf)ZHE!hC(c?#uAw`ZPS<>W5lqprNWZBXsIwLM+&g=-2 z=1rVAb?)TZ)8|j1L4^(_`f%T{qDfO8Roc|)Q>am;PNiDaDph>Gu5NvpmFriqVa1Lm zTXw956#f!vV5=ckytW!*vWXinT3j~%jKuX9O9NGQRDk~V+cuHE6m8S4ZTmLv+`4z! zP7OS`Z`{L)*)ERjxGm(icH5dIUE1_%)Ttw~HD@O-rDt(@nA`sn_gP`KTg>r@Yi_b? zlx!)*wPQ{*+SadQx1C+P?d{RPO$sM&TygTYuw~P>ja&M6@ZrUe@5$QSdGzVkuV-@iS6@&XiaKmrSdh(G@lJP*L9n37Ke1tUa{!KNJC4?+ne zq|iVPJM{2F(^5zwLj^Zf%Bu%SEb%X?P`psZ1Wg2SMzh$^ibfnygV8@8Vxmm2pNyDo zNBesG5hx*vEYimun`e2|rfdO2pkW-j(Vzcs96<+AMs->)Y1t(eM;6%E436>R(m~3 zloG(`$De5O@`s%*{F%X5W=nEaNnVAjEJa#3-U#vFTX1fjP8!+|}s2{xv)m$ueWHox%{+OE zC7OLwK)B0i7H;@qJb{|H;)}D)m|=$tHd&;74m)|}h~jZ%Gb~{oj;(mC(zGYZELPGr znyb2*r=0hMk!KBRX8CBO|BS|wz2b8EX{e)?dTRfwpVlqm3}>!-Yp%QY`fISm7Q3yO zNz{65w9{65ZMLhExoxeZ;lM=(|mOIc~tI<}Ks|w*aN-z!O(|@vnJs zgokz&jAaW)W?=ks%rj@pamXW_ymHM0Pr;GC%@QRsZ7fQ`mVxj=N+3H(FWstWnzkeI zYO9?&g|g|JdC1&@7k+s5YSY<2=YS`EdFJ^vUg-LgZ$8XtDB(E;zGn}Mh14s`KAL4= zmtFg*_Ue3kHlN|mts*za8hzo_ho8u2u797swxFM%R{8f=vHs26Z$0_*pU8M~Ch}3m zWp?Tw_gY5@J5j?Q!`Q+WURMZh0Ky5;_y+$Q*bokZ&?6QsKnFVdVTVLqf(op&gP^EE zg+(P$C@&jP0lQ?q)({Rol%P!taA!kWaU&H#B$4`bLx~@rClqo_<#|N@C`eBU<**%hX$M=A|nhej6Z=3`?$EHr9ADU_*9P$?63$`p8#mG<#V3hXvXoR}jC4Y5TD1Y!g=vT+z67}ge4 zp+bRBp;Rm&oiDn_gFp%dd;_tAk3RnZ4-KHh37&8S7;&kSm-TLzty~}E$|=inQqc(M ztWWh|rZsW0GegfUk zK4Q1gj(#+xBmJpM5`)g2CJd!JrD9EQ8acGGbv^x)D_rGDpV%O+? zDV|of2}}ixPlWJIUN;JU@Yi{g}BYNCoEv_i~i8_JXs)bDA4l@@@AnLY^sC~NTdkz-ho7+ zknd_0p@?U^pc33Q?Q^q>kvUvrBmK}uaR2+2?ece)%76d>7yw=cK>z{)fB||c0Raa9 z02V4DLw(bOf&geELok2<0vbRf6r7k(FxYT^I!vnxWWXFK0008Apx~YQ_rMg{$1Zp5 zp#o zsg8df5n%-4%pD3cnWq>60kk+hmB_*!AZ!l!-eHL7S+aVZEa&q$qr)OTFB!-Xig28# z%jZ$XeUEI8Up(3zjXqB};G2sNU%AnVX7rfZQ)O6?fzalt#vQU?Ui9Q53uI76dIqs- zRGTBqTc*#VAsy>bfHxGFRs<+6xrm&TeL>nb+))NBs5K1Zp$0 zZJ1^E9#;YYaDV{-@QrY6I{^WRfHU~aWc2(wJ!Gh{gQr{s03?6`0O)WA4uF6RWB>)` z$ie{xfB^Ii0s#Myz5oOujQ{~0AON18@s(i#fE%AD0}p6VieUf%02pAyp+G>p_iYY% z8$bXA@bm?^y?}3<0|q94K)z{>o;%nZ-}+WL1JLt;5SZfu8+ZT!1keBx{{rC(_jDyh zo&W$`0J+r=K(?)&PG?VhAxSW>)X7urr|(Y@0EqQ@;C$o_!+Q|s_yh%Vex5!z`N`+W z1pspX3jnyc5&&SbGL%sX7%ZRxFn~eCK@M>AFyI2`fxCPkz<_Fm1HtWX@{M(10O+fS#;BIC8( z;JWc5zyu%w6d1Pin6)O$w|ql4TY~|cI}e=0JP`xHL2y6#1Ay*3vhqU#0cgHiiaO;h z2vQ)05)2UL6G7&Iw^2I}x>FCqJCA1|fdJ4xwUaX%y8s@%0Qb58>sycV`@7>pKy4s^ zA~1)!gMc2)fNwaD#zT(@Sb%Uy0CZcAC=|ShBZKHmJpkaexs$J7fHMHd077Fx^zbge zdp-X}lQHy=K=g0`**m*gTf!#%h7CYJ3CIAt3IX(E!Al}RW`T$Bxd}5Us1zKJ6$HfL zaWaNG4+n4n^_Tzy6rE*QQ*R%~7mSVE=w^%%P{h$AWI=~?DF~yx6%i2@bPfbX|I(?X zbV;KqsfZ#X4n)BKX%LBr=iPa6u5+Ci_jO+WrKCYj^b zxR?Ktb}`uF3jh%WY4pyK7%L^Gvs;?^;tocHINU_l?~AHt>Zb}HZ~UfcrU?LilA;{Z$tke$wFPBH)(4C2Wj$4do*8w})? 
zd*iy5+QT(>MgZU{hQN04=?z^nWF+3xAMIb$4mpUoh$Jzf)siKtPUsG&Dn>B>2;KpZ zV$C2j)zYq$$w+^0!LNCUGmQVOeZZ;inWJ)137TAHm^#$dE(cvqG8!NJ-UkjC~kj z)g{l7hDaTF-@(^^#9q!f=dk&7-to{3*U1bf#q;IRT#_bsED}5=v~a03 zKZM?49_BP8;iodx8(?nuY2wHH#5O5@r7Y70(CjkOdw5{Bn`gdj!2lBj>^M#Q{Wxyx zE2+B~&m#ysZ^-jpAz9MUVmXKX+m5;1JshYxQ}U~{gGC&o;j-5qM)+AB!{IXQcrrlu z5qCSV3;|KuN)+0FssPOHwI`_iOl?F?Aw2I{^qC#{)aFpgpb@|@w_=b3c(2Ct0BGTb ze`6gg$0)*N76K7~zy+n=esh(VjTb*(`&L}!P1j#W=~>VX07E1qURQ=8r-7mnWd)rv zK9xI`+`MKj?q{v0hm=_3#W^H@Bup>cOk;QoG!01*L(&h;Og0A+_OAL)D1dNm$RqIj z_({*c7F=;-ngM6PDXS`((;`kH-QR9jh$hV(#cRq?G$p9wGUnkS0ETwb`n)yU8!Ii5 zB**sG_O|IVo-+(6DhLAz>goZ@0FrbvnXij7;M(JU!x@rZ?D&vA&KEMENvMk;@RTjp zI*SECioI9QDx?lT^f*n;FjN69{@3Ot2>|k2-5G6ez?-P@O!a(@KtrI$5=^D9E^{?8B!Ve-D3?D(L$ado%3NXI`h-huP3 zo`7EK4ghQ*doA-`IftrDZhE9WEs{~2_VHcOM?_%5om9JAWz*9$@mU*JXWDkFnDVvV zS$pE~^B!fAJ6k{7!RItC(?E-a4F3g5?gdvK0Ia<|(;Wc9C^lV&mkKSEg#-NANZav{ zGlw{^@74IIeIUZY6GjFQodFitk4sI0B>>*}ZKLUJyK}0bjkY&D>I`D`kJIM2Enday zwBtN8SER;_xH=eVO>zhlXqJT9Tqt9SpS)qm03*3d0TxZ*4E!HwJg?hd&;1~hlp||F zM#T5K5$yM=m!&&=C+QwhZW| z4y)9A9mf}pb^y!TBb?kfpU)Rwx81?uhP%PRN+h;OLijAe zILr?7uML52TWNX0cmY(FYL`{5drIcfQ&)v*Fn)16UIEMC{JCd$BT0LDnenQLTeS}hD-M;zbTIT+_2!Q6u+aR`XS*|q>EzOV^Z^{EP;g7~qQnV@-b zbT2jwe^sBd#E!VI3|M#r2nc3U%1YrTLeEv$KYc$6&S=w%L9gu+VZ;T}88TRVZX4%p zQubnJgaX-6WjKgm8CPY5ku-6b=l&!{nr_>b%oH<{2D{5Gn`h&{R>|dkv~%5~&xuBH zZH3@9Mnn6z6>Y4xai>gu_LTuI0YnmduQ{1lov|U%-G->&q_(p*vWt0N7v3_Ou2ONV z<|sP)GW@+C3>Z7@ygM=lW&i!J!4HgrAd{{pp}iueNG4~4H~a~%8&V-LUbEvDd5itY zKK|q)FFXhwxFOS0<=^$fpN8NjC~#9`+>8~0Uv4lM5xDhU)kRKR@$NIyTWyz%{ON)T zSi4LDS0&u-{z6(`BAZ>S(9I0-G0B-GkJ$}?&?LWA^-7_p+bmbSXS;uV>YkqPxx$EX zI573svLmFL(O$gT_CXg4bs38R7)aj^JTg-y&X8|wDf#bH#sN89&sG9|cGf7HTdx}f zE(MN+fYG+6bZg%Ms%Lh@vg>pLw%y6m*<&iuj>@2_i2mlm2AJm!o-N_*)%fEo`~A?2 zEZd(LLSnhm0AUN^a&P@}G7;Ri9_)9L;5r2WA~(9%VU~EPl>!ghkYR|K*#%9xON83a zLM6^%rG}HrZW3v@HUT@M#=K$S*t25i|6cr{gEf!DcdAl)AzT|Nwk3~nrBnS!3KEG_xC~j( z=p#3pY<0UvfGF5l7ksjpXwuGAex4k&dRFIN@oNPamLzpe?4$#DT~WLbJZ}~qB)q=H zt=S;wVadw`4R{)n5ajriji>=7X)?j$0qrDXgai2C0N_x6=ed)|IN*c_z+?s&=jtG5 zeoGS~-q0~_NPPGGrYBvA1hSZys1C^bN>DT)WkR(a&SK@ujYGWZ3H~G#wq~ddDa)U< zT6gR7_wL*5@voch7-*LQS5?7Nb73=D-B`f6>4;%bhUwP#;3_A>9Sg$f#_8lyn}SII zefUF{&9F_~=xb-cUTQodqVwLL@beF;zVxKuPc%|DE`FNxN^>|2-qcUEVzx0C3Hw`B z>vFO?#pnQoC>g-Fu%{7W7p{)RTjVva!~=zEl^uwaPDK;jTaLR`j5ex?|DdpN5R_BS z!9t+MkC9J44pJ>9=gHX8e>hAlQcC25o)7?l{)EK2^K8sK8KosEN~+8?-#5xSG`Uy- zw7#u#6OT1+ZTK#SInR4UXREfbCR^|g`zju!f%gv727QQsNJN_|wcepndqXS%A zU;DQo*k5*r6gImmB=h*H@VvtM(cekGl(1*I3)YrWc~foSmo9Q+LU^6Yj>*uIBkPZ= zADV*yoLs)T_<3de(#e?N@zEj6moI-#u{ss~P=0pN2Ucff;C1;0fv^{)$Q(wO_|Mt2^B4b9w-*;?p#C#aEwdVDirDGRbGttopM04wr zhDCU2!3+^%k$X9Ig5(Mfh?oU2s@Tw8ztcWm^?Il6)voAo{gf0+ zz1Ct$F2$|_+!%xoq6Zm;77_7qDaR^&y6S@iLVRG#wL60M99KI6J>G}@TQggrXv*WY z4kFjAhz)d@rsILqV4z()RX*^mY|A9HCO$^1{r zTreTjH>*0y&}VP~|C-R-S3dSrWCE&MAUr({q%^Eq!)fsqxq~Tp#n&f0gH(>rjiQuG z$T37Jlfv}BZTn3Ih;)bE%DcihMqF{tZ|4=CkRRcMX5c(0Hu-Z(;fCV{c0Guj5)x%iJ5TT&Js-|LC&OjX#cBc6+DR1LengMZnjKk>yuD9if<|4@ym+ z;!?5AZ{S}nfsq4)Y}(2j>8?Jwi?SB+*o|Vjp7J!clcEA_bQUODk^r6+dcF6(k`OyI zctvoQBI350Ch)c)LA4vj9$qylvel5NeS|_#Wrk49R8Ac&an5^HLsE*N$$A0e+@&(Z zvhvTa;;(+0j3y5k&sV9{8NOq7+-Bo#z+KP{jq5FVKv9fBi+XYc17t^ltInLec!|Ax+UPS|Kf& z&aIV^a&pGlGtI zClfa#w!y!nrL{VYUWLEdezf?Gr1|T~f<^dGkBHbBf!@vW&zmOB^(E_p7b^ z*f0Hk?myED?+9$9O!UuDS!%@GbpH`|RqZpc%)W;ByJJNjN_|-zRR}g)3kh#crGB(? 
zL}Rf-uJ_2S0W-ig6^>{+PsZe1n(LgVJ0?6s&RKX}`}(@bhauijvqKhYnV50$yJ3~e zu92l|+lMo3k2E>=m&>{La(@x~fx z`0;l+x@pWd{=8L#rX}~*c`v$tNK^IawQDk>gHef@mv{P~t@ z&p?Z<*PrWIA8-x_)ugLfn{6!irJ!75SPHSzXg1 zk!HLYs@y>2P%>3+8D$_PdihZUp54S_KJq~&wlnEazI%CESsh~TWtdax)K)5aAX|Eu z`F^w9cLg1Xszt7(>=m_H|Azvj_N-qY)y{m6FMBw)YCP~@t!Cx&%NrBltHSrgQtJkI z8K#Q4l>h7f;HcdDa1#4fBnJ|SoPP1+`$?3THn}W7^rU~3hoVno#lJ~Rw}J+lD;~&QGb({C@C3#PQRP$bq_f0Y3TBXv)I<8RVmQA+x zXMM)M+Zm)-E7=TaX1{{!X(I}v`)!rE@Ft*nQFAeyN+de2myrLdA}-C;$c*P-5#i>K zx`X{@;jif~(Pl=A%sW-1=5wd3T>vsqC-=NqtXUC43kLi9rYUhnSncu;PYpI=)5?|} zNBs9LXhO&sGt!Yi@@M!TzDoDU`xpFN9^IJa_HC0E4WQ$d0~S+*F|I82gwIlAVNBns zWatQ=_FhwOOUP80?D5+eD~&x-<7I)H%TVk;S`XiRlXsqN=xNmVi3bk-@yB7gz47gE zo`3gF7FJJt({o4(o5TBm^?vv3*+r8aWrl%Ifu^-Dzpg>}^GJVk3!dmodFC&D)(5N+sVB!e=oATJq5%14 z2ec(w&PT;JAVwq6i@%CX&d0dP?139S1T{!xuJ+;c2|5=! zl5zi{{8dq3heKRUsuBOt#?>-TMaoCz;Gv!M&Aq81CSJD zNeV!bSUpX#E~7J3O7d9aC48utnME|7N(8_x#7lIvCQ6OfhZT6;352go8WKtxWy;!2 z@Pp)&&j6kzg0S`^R1&0#7)FmQLM@lO#&Hr6$mKH$9^KLUMlsOL-BJr>sk2A z>FMk?<$UOAaHxmLA&DFeaJfks<^qJ4Sv7{rFGjPn_yR_902;#O!b?<+XaEa8IZx(P zN^A(FVqDm&VMT zk6m4styt2YbyNDbq`p;hsZpHYN!k)nI5tK$Rvkvyq-Y+#lHU`zTxG83wh%uOxw__Q)TZU zd8+*5b9kzj`Y`X2;N+461IJ!)w1-O$)OX{5Zg>JC(lW z7q>xj)}`5tdA(ld=qVax1F)Ef`NvbrunitJ%#4`P_pw^Mxf+nL^QY2$n`5(=y~B)| zIy=wXa?rZ&|L`5Aq}7@+@?2-CiWjM|#m_TEBaodQM9^aXsa- z8mnD7XKmdd7ONhqJylrDh4eX}FUzaRTdV&Tb+p>40(s<=8$Zc;By2P;eL;rD!{>`| zJ|iH5E_q?N@o7Ts6UXP}=UKmZJ@oFJlWm}s)#^_%*iKpKQlAROGo|I&A3dE?KKIeW ze^PbT!os*QqV^(f+`=P}aZSHP?hL|3mLRPq)WL+RfL?xpfRGDPOFCgCRNao5CEBlP z-r>~R+S*3{-b1jKB*>}biEv}Ou!j8lL9#u6t*Thi>w{musMG74lKdL&86E6v53bcD zbfh^4p|WBg@-zL55PyeYw=+06ghwF@?R;0V2V2@8s<%{VmAbYy~5caE?n zXTA-qU1cBnc%M|s{aW;Cyo_AUc^G6mXmQ92F<4Q3vX&)M)!#ehXN94_@<@3{Papa9 zvD)05r__r*1E7vap%z|;b-{bI`oHTh27WSrS~b;buFtR-JaHm0Z=_GjRCM~`#G?UP zLxZf*eOnDQ9=p|nz1i>#u5RL=&Mp7fVj~}0MJ$B_1c0L;U@A;nPLa6Lr+?{tMP6As zU_6iXQSQ%j-Norf+}i@{pNG`t`ztef7v~PCa~Usxcb*1f`)cQxg8X;Z|18vSNIYY( zq(NM1jIx*5>fzpU&6^z7*wj#mG5THu*TA%3h=)(3>4 z5aRg@PRJEs>l+^hr1J!Sji4{FJXi~Jdd{d=E_6l#^dB?+*Boj!MOBh$QgKmIg?<_m z92V{LP>of>BsP@=Hd&%;F@;JRmEpb|Wm<3D^_WgG7L&xD=hB%HYSgNgv{#z6!!rYy z&Ix&-p2b-(C7}eiBwJ>;wWXU0mv$y`zOT%DmBp%6wchI<>gYLp=<_P-1S=cdYL0J4 zLmnnh>C!?ty>q(H6Yew{*DITx>a$o=IhHx3Fv*WAo_wpgVY1R}_DA_@bAzgGGb{)2 z1yj>U2<}h#J>Vr|eo4hzt;MQ}X+@*O#$3hLzQxu>#m>9ME?C9>R*QYCibGUK%hU9Hs}tLkCU>S3V2!#J66>vvDGJ2)%Q&6SE%Y& z+3HuX>fhGt->Vw%vNd2*HSj}g;1|`PmDZp?s=@oM!KbPr(6$h`S}0#zC{isBH(k^syxSsz)o$HtyA`Vznb;PYsdoE8+wDTNsLHmedbK-k zZFhRrqF=T}PpaMh(02EWTFfs;wHp*kIz?q!jfDNIH3qgc9N>WgDH4%FIUq~cND>zF zk_XHO19In3RN}}88N3jhAd3bGinSvTz!EqzqM0m*!=&iUYYMj`;>aqd7?Ix;?8sp3 zKd=I^9cc(ob-|}swrBqO4<#8X+k8q!O!Zq#kvU=gJp2P`?wDBVcFCjmoDYsP79byv zntO|iMck&a;H3|!DmWnD4RET`-<-mZmRBH(34s=?e*acSE?kfHhaxI=@Zb^zL(aYfM7EMMLBQ-GH!^OSeM=pofBZG;f1b!Ky zLK5b&MEiX{DuSOP<)BW!G)(@|fvuueX+lI`{gu=~4zUAyG0a{4;ip{j)hy)uYI-%3 zQMD}nwJi9$ueXH|7cPV)wnz4lr^nyP!K*q%N#;G zR!@4WKeqZdKex{1sZ+9OAEZ6?p9QU}TWz?b@=j_~SC=D%GD^kbz#`~_CiszMw~j3B zHb3G(t09si)PW^YIakRl4w2S0FvYwROKk50QzdgiFTPOa5MTvEiVA`%*G*L!(hwX0 zVMhj%9q+u1jULJDR-sYla8yf&ZUnLW#bEbKIAkPM>(zf+udB62yR`Rgws$*O*p5O)xMfN;}I=90B*j^uFHPZv^;*g4?vFwBxLy03hj)6+Qj z!qwIre>uHOA1e92&>Eq+3K0oM2lo<35&5c7{2beAm(n=>T|>Ut$Yi>zRX{YE7RCKf zixFWz`QL{d8YI0jCCL5ldfDJIQ+>59;yKJV$%b+QtUO6+BVhPns$-=dmQ*1t15M*=r|-jo_WP`xX7)9?Qe5`9(v1W^wC zgi9W$u?QC24+?)4iKwC7IgwQ84m&bdt4rNa*i~!&eHC?_!h$b|zO7GZO|_v z)GOK7=P)I*U(Pe9p)L!bXT-*MkQ0z~IfzO1)ssgVI=fm7mP3gBjrG&e!#G}daz zdl=y<@vMGzN?bolg4v(S;$zG~VUHbQ%2YrtJgl!=^3yWRK1xx~>Sbt|&mNV3e#AM3 z9u}UKm<^&b@Qfx$Ft`*e%y2_J)Dg%HeurIe4vR*1LO?A|i87UL>t5 zzc$liT5IYMHUDi;xK}i_#(WniTas9xHzY|^om+HpE+iC{VvaY2haRm 
zJ4H=d$Ok7Wl!0g%J+gt#ZO*-l>uQ+ivN+b`!WhcGxo_LZ@TKP4mp_g|d9THa>E(!f z<`VbjytC5YPZ=H;M_5Vmlnx3C9~Vt!dY5k|s-cX@V>B-wmQo>XJ-71c={1OG9-YFb z&A%8Y{gwc~8uCn62Fp)oe0(QVqogrw9jI28ZoL(G#_32lL}rL|dM_vI+U0$_e?3Ov zd!FaHalwR~jX_SkwA{F}PvKzytvS85B3-EK6;q5V%FORn;E}shTOey@7CdM3g$c*0U zX3i6TD;<@%o8-iozc&}y``kw-4)78sR#}mRMA6u~gcdImB+`<8+Jdg0ri3|O3Yjl2 zEI9nxzuX`fn{)SViB-|2blQcHA20uXy%$>*d6{In_b<61aU_$Ywy^NzaLqm)DOI;& zP$=qiU&8R0)13^d=?du3a^zg*UT&q_EEbFDvXv%N@s@v|&zIcDTkb4Ls-o?F8*+2&l<)J6lLIezdvs8w!70z%(>%~kU|?8zOsXUT)4@Q=2}#; zkpLE0wD3YhZ=*797?jScb62X*Jejfjp2goWB7Pb2R2Or4&L$}@AVSmjr`5wbn@Vk- zrMu4B%&K8B~pb;cJv(g|-`0@wNu8)z*R+ciJBiS)Rf@9^WA zeOGH4@}J=a$In^p;YYbbw&XGFJx~c<*MAH zy8*Xa(xf)oKWI1y<+Tl5e7~7tLdisY!HBBse@mU_a6jJSe`WL{>GBKBMYdnfFAmZT zqH>));a^KfWD3G;{`D^pReXJA$Ey_kk0W!&>GlznosH_Uh5EWwT*3H57zl9a{MJ zHj=TW9^*8|)`s$p1oJ+3-^L1+I^;gnC9x4W^YdM8`hV<>T@}d6NGdl85a-o_eQ?`Nz4Y0L5v9GAf1@2?peemZGar&g#vvYn}KPi zfg;EdL&dFzq(m7B_{?giT=p`YsJn_fA`TOJ-|PfO$G!HNfoVJr3cux~L+89IV7%b! zlgR*P!6I}n`7GU}ErYoj`37H%i&nI{Gayet7``0e*H0B*b{D=kV!)oWNE4X)3OP=e zP(L>U&-_Q_`1k3YmKKTOiSSu)V-?w8c z*%MJW{s16rG!P-&kJ1IQ_Vw|m(3ojcCj*PHH>qsz#F&m~LL~db_=mLqUOpNX2#5=QMz6~+XBpmBeG2ids`x`X&wJFqlGFy2O z?>P!DiS%394LN`pYVJn~ZlE|rIYcNTRd}KAzl8QsoL|X@q9w?lV$Rsqy(v7T7tgg> z%&AZ!SVlt81H~E0Oi2KaLm&%r94QXu*ehPnBXS+#1)_+o*<@Y_aFLD|5Wm9iv>lyc;swwU-7*PLTy|GDLVS>GBGk zPtd|Ln4_ zp%k;e=x2$N#|`vxzLPLIciz462~-5cVMt=q0k8`$fibv>C*kC(v&08Bg;re|M6#F@$zpdGhiNF$ z)1Kl3XSe{}m4EEKsD*_P&6U3dC3u7qj>Zek7Vk!r1!+Km68twbi0jOie$I`%GmQJ~ z)P%6O+&&%Z1?2JqBBP4t^WGvi0I;7HtbTY{9u8)@&ZkQG$H*lzi$7>K=aN7Rm$=r( zmu;%`Jeo4tPOogCpvQtP%x$cq?yR4xOVOIDV_ZorE9>= zLIJ{-Nf)lZL8e)W%d?B3@#3$ikO6pDMiy+D&K=iRl2>$I4b2&azre@NE8+$l+A?(K z=1^@y=ix3&iE~bw)QvFr=uU8nxXD}6)EBw9is6@BF0mj%kYgaoI}_syzck4qV!9!| zgzbSsips?C{p5rewPkt zO$xmz1*np6UaDvkm%6O&LHjiF(IpAi4bul%%$2N{-t7Ds;v{{HLud z!$iVOT|``1guuNRE-1ZZ`R%-w1VxaJ=iVd>#@sw?3}ra2H<#ZH=VyM4gf-*zzKV{J zg=-eQxOb|>Y}j8E@#-%_efo`#@T|(s7mx!dhY)yKA8PqG{HvQvxfo|00KP$I_F6=F z?4aoVLZCC_d<1JkpFa2|01I^4zRA0Xi(!~SAWPUq2*$ZNM#&^M2%a6Ss$a=1z65+m z*b^wgAId1%`}05^M*XJGAoBn$c>uUDg*T55 zebI+v;8Fg}24^VdMejk~nk~;V9S(nw@=4oG{K1?xwrPUYW3YjWD`&5fDk`i?m*Y>i_Y9B1A~UA209%F*_m) zbn|1Bn~0m+oe1LxIZ^6%#{FTULpYcguf&$Wc&oCyd7^x^wa~_v{bup*>RVG3NR-=N zVfdXF^nqm5Vwh{VN7UK!=?jW-#USu7jjwD^=`u%)O1s6pk6LdOM@S6WTfIRY#MO(a z{eCq?#%S)M&fTPB-3u7MFPVTEJN4o5bV*ZKYzj1(LA|?&ES|K3P9ydB#UpY})Y1h1 zzDFv7D3JXtxw$sgJjm7JgQqmT*TvGH(~L| z>%G6kdSv$YUkhHZ#Qu0Gu00=K?O%M}1_Z=wHkg^RzREk~0_ncm6w*O?_PeWy%o;S0 zL@vBW&;wmS#6!_ztGC4}| zidp@fcTwp6+g8c<{`(NFpgE;5zI`JkPqNB2B$wx`yR0r?4Y_HpSUC2#S)fa||17Cy z)f!+ZJCN{D;h&g9HReh3dQ0>X_r7?%N@yXROU8H&eg3as)ggSxWJVX0CGSmhQuoZDis+J zYV;6EiIN%%>S>*pD(jS-W3R5?J=FETWa9yM7>Vs@5f0Mi52JHixQWUE*^6*Km^p7@LRQkM z!gsiMCz*rZ2mg-e)lY7`HF6S$YnF9e+FGno7599HYxaGl;`MPka!-mme7DCA*WJHN zxo>jr-a|)PwR-~FZzA$Os>%lN%(f`y%_C+PBBR+Ngx6I>nGQd}f#p9iXp`8&h#a$I zUQ>GQDGs6x5dY5}Y(wPK?pxou0;csbnoyAKVyq!#7Go5$t`Cu7h)lsVr2zTUK*C~F z?8u^5#3F12pU~$3Gkp=WRm{fgblH!<$=`kbL;@MrALK=8yDot=L+uJ)48QZqv(Gi- z6urcCqmD824%`=v>(ya{Ify@_ zNU0Bl4uUX~7Snf#&aCbvkpaYn0Tb-#OlUV)FFbk`Bp|-nEK$NrD|&Hd9yKzDdi05< zq7PL;pQHUl1stJ71yftv8SiTGZ;8Pv^OBN4HevBBhyG`wJbWwInpY=7SQ$dlp|9$=_ zOJN(hCP8295;1?_dIX*bMmp2~Ma?5=IPly5rUmAZ?RTGt)4|$wF3=~G+cDo39qyAa zxdT+?b@HK?C8xOZJgU0#*ooMHab4@79$`_&AC)BSiO#85SOi zR^5~^{;YIf>xc4!^m&T#!J=piw4bM7`M=M6XxHkCOUdCy3{zTmt8`{<*|cAkB7O@f zKhc$A;7XwL%B`N2@ROCmn3Ye9-_OdX)PmKl1!NvsAp2xB)A>j7$&b6*tEt*6_U3*oqI^OfEtzhl+$I03!=e1WS zzmt1J@(Kj@oPU%qto>bB1IDg@?fC(Y-QdCg+3Z=LT-YFMSI?rDV4ae}<4f+IqBX^% z)ljf@A>zoDK^qCy??b}KI|_Z=N2uLfeaKF+-T$$m-MOEBdPv6pQy~6z 
z?-lXt-3!L4k=c#J-3(uqf{qz)A5l-ib^LU7eCJt1m;R3_4eG`yq${7P-HPz5^PGY;4<1hsJ!(1E|ICOh<%+xF`a9ncHL8f_ z?$7u2p_4DJ%qjcLJi88laOJ<2!1+#U;V+SY`$6A_3a2s7T|2&u6HnB1>I#DE0}`*tY|gAa4m;RAz2$s&`q_TrO~~7#{F{Ss z`sgu>4-*Pq8>Dz2`4md2UUkpZyYHY>Zxb=$nVfFuKui}%aIlK`oaCONu|9iWWq1ST zK_jpmd1S@9hL~j=%fIrtXQs{MnPX*U|2&y zchkH0zH^GK2Q+j_PyH%;gb3rf@>O=mOj&`K6~2*&<-U}3nr}_xMCZ5SdkZPm&W=Ud z{`Ea4Pb=`v>?Xe-Jc;0Vc{WU$;K|ZZfja-BLZ0bjqq5d1Lq&Dmj%7gG!fQc@ZeK<< zA2!Tbg016c-DutQff$zHZqtgCl?$`%BmJ;csjflTIN~h`D!mbw(Nvs~;wJnpC(18- ziqk2hvEbZq*-H=Ba2x7(AJh3&R|2@M@<}|NT4EP0M0{k5q-YJWo8-v_k12Y83>pu- z9twY4((M~QseO91K1P^+OIYXq0`|^yd@( zXDV$k)1qoCKLj;_p)Bsi{U$f1_XFzZF3N>`dp+v!zpOf10)6Z_v=qJjUSdvfa5T7H z`gu)(^#n|~jJz|=9}qsF(AN~$xiL-?3F3=uSl`$vceP+Yx;_$IzxFbR3vK?8K}eYn zILN-$`Ih_At#5{+`=W-`->;irkCD-{e}`wSUoGWo2XHFaDD)#zEDY9UikZw|{oL8f z&L)10I+3WXFcQPvM}0WtdAGdj88lnc-?`>%`!q~moU-O`*8?*EQ$}v-HM`_h^yP1D{pA(hDKdJnyq}D5c*ICSqQ(ne*Ma(y=bT2-&%uIP#aKin>H1Mf){?m7r#y6kZq@TB~ zPq8gMRIWC=&S@X-H&YT(E%3m9Na7P}+Ln2Val^3M@s3wEhtE!hP%arW_UKCF;(Au( z%#*gg*R-CWw&-Y;T$h)|4%B8aXGih}XC?31erJ_ar(aT{K56oadw*0`_&KraevyXD6#su$s4Y{UC^M9>FKe<>N^xvn?6Lw-f`8**v zM;+&Qf3fKiH`4aj=qIaq6wUfKQwY&0CN86}>jTrFW=~6_W=bb{&i}A$! zG7fzV5e+rsax^BT_CB~gcfz-)=FCE60_PQjDBH4b>@EjGGF}Z;#@%jA;mtgZt}pKP zTQQ?n9e4w>;_qySWkzr42@)^^n(NhYBHb;&_e>bJD4X>N#NYR#LHeWc^dj~-A>1pM zoTGltGt}v0U))@ndgsx%^n}}ESNHBcd=ovba;t6ptZ&TNZTdmK9X}RX8lGP|#g3tV zd>!7JlfM~leR;d%Sps8oQJ~*@^67m0g+5w5=Z*RPmV<)}c#LUrl67=AR))-;5UJ9q$-!s^ z)>*vX@p4#c9j@b4%pH@oaP5$T5!Kz;-E{_L>H}uf)4q;m`E}UG-3@<~Xd%vQw$}=k z%NQ|CH!n@ufbIPb-vnJhUENo;esh(wQ)Z1>&G3zSSYS3;cHwiKsq~?1QQaKJMZqAxi;RCUvLWRnw0n)9OHzCH4^o7Rj!A~Hw$tDY21gO#5qMmd%o9vs?PrBZ^sX!gPG|p*utUu&WE)6qi#y4tKTdOcRG0c)N}eenE)}dWWEbS zeQ$T@mg-&#z5eF*b^PgV5ordHkG{baPa6HU|1%Z>08%w`wH?5^1-Mf@!*eFzUM7WS z63AO2@Xs{7Ks#B)=Q3n7;;n&wZQu288_YMPr7naazDwVg^FheNsFvbXMm%-Yn|iQK zaciXVPf-WFsD^g`~gGIhb4YzPX3=%pt(gdvDGne^}0i2GUH-$A-`Mk z?Da=^+7(HsDNkhBOGZ*2N2b(Q-B3~C_j(}ZF_Yr!o!VZW@^DtFd^Wi^GWA)K=k~FCD5re-65ZVXe?M3iyhs=Epo-;J)MU}by_e))o>Xsq)Aa&ZjEzN;o$4I8- zI3zQu$5?F9Dza(J$>!X?G{nO!!H(+=37L`Em~^39T)>q5 z1{eL151YxEo#~mI>3Bt1UIww3Hw6=cF_$Co3d}G9by=GEbed3t7^~Tug1MKmnVPie zny=YWfFYZz$(p-an&1XqS@w|Mmx8>Bn@%#D!D*XNNt}h|B*)2Z$ytf2q*6qt1Qys9 zlIfk_37(5t7ib^^L3uszAewb?nQTE0yWpPh37_%lp18mYvXGtt=#y>14?g*Kg+rRW z8Db1J{{l#eDhFX#019RTN}x${Id_Jj3Hn?ET7?X1pbx5M3Ob>hVh{&vq4`9R=WuWD za}d@Epdu!rht;4Q+M!X&kRaMZmxKW%`iaC47lKrg!7*(Dq!|R7*P-h zB(|Xv`I=)Y4rO|#E_7#VI-HF_rjGHZX}Y2pTBl};rg565YUTzt<)*Jdi_cl2blRqN z+Na?6r??fUiE5^VYKn%6g*19l4|oOrc|HH81d$jy}rn;n}>JPg> z{|=|s6}UhS{J^UIMo~iM74Qf-Vfv@Jbr1;Y29(;Nyy}s^+Nc3Wd>Bcr#j0k<>Z`yS ztjp$!a*CkBimc9Rq3>3V(kgwH3Qzj5cSd?WhA@rQV-34d4__)4_1L9dK@P=$li*Mc z(2z6TAPqR_5B2y#{?HBg5RXE_530HgI;RhrMh@u$t6_nwkE5%NN@3a>tOMq-*9x)9 z`g4p*u^TmDAa=138nG6dv1j><+$v6);A?B74_c63ky8vKsHtJ`4zkb<(yqyFiG-hCv-?U*y0F=$f$pi2)mH|Fu_( zwOOmRTg$b^D6+jY50}PdTm%j}X0JWN55Rz~WC08mMXJD14CGL>{s5}o(4X>Pv~6p7 zLF=~Qnj=W-6}tcpib%IQN~-=a54)g`x4O1Hv#?zYxsfZmlS{dktC3&JOVTKaY?Nb} z`ZGcbq+uZoIU^4plLGWuR%Mdev2c3+Z6!|fcfVN)Wo&(*}s?Gw*;4NqH8)Kn?@jLD0ZtIn#O<*t&Upy|^H| zd+V9{!1kP$`3(27MdBPr4_{8(6~19 zxG2oTP3*)^OtC7gDDR*JIf7o`=u6W0YwJP`yc;AG*rfqHBnJ!(W!x4$T*hbXGfND` zZS2Nx49Bl1#fb6_VQ>X>AjWus3qqg;I6Oi8aK+@ZUjt_(<%pR;!p3o|$cxO#jf_uo zj3|1*3OAAu^hLYMt7C+07qXC;JTeald&rTK$c{|PrEJP149RtJ{{=Y53RsL6sE7%5$Q5u!IY?^!49%-!tG-`uFhJSR|~1;D@v#2^jQd>7*Chw(ID^+gU|ycX{ehp;3G z>cymd0Sy|c&D%V};0(|KEzqwi&UI1>yHE_v3>Rg~yfQ^(N8ksl$`-&72KPV?rEm{z zKvDj{2oRSSX)AjDtg!z)&?RlsCta!cTQlU~2ZqqRYSC-vX;J(jjr!{r-EahGAZ+h| z2Bj6tJ;{h7T|L~4(nW35NBzhuT{G6K7m`537BvsOrej>t|GjQOPV>+PPyN$$QM%MC z)KrVqVJ+5SUC?S`2rK6m>L3JOO&5gdR_c(yNAS{TK?htA2GGC=1FQrJea4UX)k96v 
zV=dT&?agJKCTH!kdod3_98<&)24%R;cu@+(aKnEM*o2MQnJvtQofi>J2hh+4!WK(( z;0F;!4tjtFhAda8B?T*DjaZH^Si`!XG4JYvD$CkY=M6KDwP25b( z*?6&9PyhvC&@^}FIN{^B$);~Pce01)8#lmRTh<6{oR zK3*3a`TXK>M)cYln?x%+w&I(r`WY*BKX3-6rjG4s1c%)_t7zhre z2I*)(YVsfpR{(RSEo$k~2?-|+aNyx%G0RZE|Jz`$E@Cd{@>}LE&QV{0<}(iG%z#g8 z9_Ovz#B@#g~enx_-Q`e&zs>=K6%| zH{J)hZtuIU)NB%-1s|RhUDf@z-24RvqpJi*w(wq|<$)^%wgd$fT?&@a2GYPp<|rQU^Na7XKI>{;;{ebA_cQ@4UgIT?!o2<#25<9{$rd4eWA`8lBmXs^C&Pq#`CQ5Jm!6g6{9@AOcg7Tus@@=W!(Qtx74xmd4H zTEFl41oka1>jMDigpbMtuTm9=ncZ*&0T%}3^l)8a`It{mZJ>=ER0?o-1;u~{)?f*p z!1rnKfDa$|zcTocFS&-FPl!+J5J2My&;S5n007L^-V@LK4|9=NxGIw(F?}(8{pFW`;A#K#Zho5zMc>497&lHefKY$6S zi|W_!0dWifAcK$=$j?3n7X)p*_;!0VV1z;qH(Y2w20sKbL=i_Mu_*^R zd`ueOP-GFUcf!J@MH%7qutXbg#4$%5zdLcpwB#AfMf{P`)l(EJt*JQIzH*dTWO|7f}M@~B-GDL_l zB!mnKb?#(Q%{K=nv`|B7ixW_(?%73AM-k#D3?cmR=bn5Tdk2a~Hx%?xPd^1U)T0vJ z)FNCiEmhHZpb(-I|6r_X445$PAqN;q8Z}53NnlMixK2d{HdtYSCAC+Hx4YCm{n4kCceqgYw_k?^H(YV4EVf&R?upUdF8h&(5hLN)QZYW?Y2!?T zI_X5+%)TABUw{Aouv~o?+Qqnmnbb!XEsRv+Psr|pB3@h=iTB{g_yss)jW@1MV2cj| zhu4pVx{hC8Y7yYCcA92qyCs2X%gc(?DXad@*pS} zYiRC?;7-VH|FqyGTD+miBXS&a#`#_vaLqTz zS#XPq0oyEVy1BHQp6`)H13BSq=91> z-hDU7n%;Y_^Bqax(8C(rao7F$-~DO*^~HASq>aK?@0)Y+#|PE(gUNyB9X`z>2N$fc zN9Z2aSJ&#DBk(=qol=zGqLU-K>4X-J5QZ>x14vgmk3Yfy1xeAN4F^o%JAl!;hUG&G zZPYhLKe2keEOOPIAqbY#Kf&q^P^u?x@V9! zbfOec|1gfq{82fCa3O(VD8b!KNzAEUAzKgve?8UO2ZPC zSRx@0(hVihjS+M(hz5uFxFRM{j|S_r=&)GWkZUOxrf8>$Q5XW zX;ravPbtO-mMs}fSDRPLegaR})$w^hF@ROp1W;9Dg zN_`n@l|G`wWFT1zAt>Pracct~4(kkw;FgnoTm*E^| z{}bKmk8B<$iS^eQwxQ31){5Ta@6|1zaHrBB@ z`|4sBvQk_S<2?LmWBc@R4_R2_9ju_EEU?CydB|cO@>rZ*E7jT0@`JLL^+!HF|9cR1 z;Nl(DXp}WXOOBQ{q*jszZdyiw0b@MYxW|o&F)%QK;4;Li2EnYDz|oJf8U(b~NT4kA zkq=o&!_Sx`#K>B6<(riqV4xMq#czI5(h3ahV7>_{ zuWGSZ8O+i#S=x=QZr`U`U=Vn`<+biXfZ;IPjxJa3eX5rJyWy$S*A4mwafmGPUqxHrtO%o!1}WZSjk@lwyR~n7mTttUqFmV2Qx>#7C}(DKPK^ z5jR=CGQfZ-j*M8A>O;WwU@16oQI8?(qkX%;1+Oq0W`AH=7Fw>yjd4+B|Le*EN&MJ_ zEaE8S17lbr6Gn2LA<~ZJI@!<1J?iD`{E+oc1jB{S2)byjXGSlizD)jeq_;}qE;O1+ z*?6C&HO-9W7FW`r7U@Qqq3I#*VsfO`a~GBjYE~B%%9Td7kLE$cSRbHE`A?|rMq7?^6 zc)}IFaE3SB;SYy+#3ep)idWp?7sq(UHNJ6iyg+6qm7v1PbM|#qgzI3KH-RV!?$OyXd z>zB(2zE%HkJ4_+-9dzC6Uk7{G#qRZu(7^0xM|;}UPJ@k%-R*CO``F9AcDkp%?QzF@ z-oZZiy7&F=dI$X5`QG=s|2^=BCp!iRPk6&4-tjZXKnoVH_7iM`1CMw7z9m0-v{xPj zmj`?ox#x|ay=eLsBQ zKZNze?>+4!tC*NCxAn+>z4HCrnC6e*`OrUy^i{7cBX1w;|HHLD_F=Dm%LrfmzhAy$ z%zu8@7at42XmHI|sH)zrsFBt_;E#ptWVNCcEwqY7ZC2jd_Ieh89$kcZoP55~9$ zdB_LPLa`}gMaZzjRn$aI6t&$bL*(m3VN6BYJ4GgJMpb;m2290gM8!O$#Ayu1O%b-K zfCl41v8<7XJ4z2+m>Pcohq!UAVcSL!j7Dp0jAXpMWyB15#65asMr@ozZ45$YXrc7Y{wyqHh`0gcX+o))DS>Ch<>28gN(y!Bu3q^$JfKhdQ8LK+sJ(UL}>&_ z-AF-$#6#;dNl8S)l%z-*si_v53c`Vg^MHm~|6{8M37KNJhj%cK?rTYv%tMw;jEww9 zj=V(KD@u^GKcOrOfxO7yXuf?LN#uyir~FAQ;Rh{ag&C}fR7#9{Ajho$2SmF{#Lz#c z9LkP#y`(hCO=L@@{73RQORI!RRpUpww9Bb1OFOxT7lTC1;D?A3%)ungT_A?A+)ECr z%DY65GJMOn)W~DROXovNcceMJM30zbuK3W*#4HuJI*wYIpwT2v(v*cZ;!2JZOp6H2 zh&n{iq==}*%*L!aA*9Vqe9StuOS*)@ywnh?dl2A+O*|qpgjlKGipc_u2+Xpq^N@z- z6q9?fL0-9smD&Y;PzTE*v*Fx|&eRZj|Kqp#08hzO5833*F4#=&R8ROQPDnvcsd!E3 z`kq}NhmM+}S)@*aiww&=Yl0b72KySTp#lt=no8 zp7|N=;w(W75)O3?4+X@^$WDV`6cJ^G`7zNKRnjG8Qpu8qe1NaqNv~X~(DuSDasUS# z^U=oePMQPJ}|Hz$g05ZlHt9~d3nIZ>VF_=6}3=ajH_40=)jhAjT zh%;SKen7yiK+J>G2MjyaMG=})B}<6lhf|HuM}gIgfYnw#R0>Jd+<`MWs|<8;GX#@{ zH3J3-8M8q>3lC+5WUUO|+J#+^kQtgFGd0%4WCSg!BCVJQAxO>N7{ttwhxqa>a8-;` zItWSlC5AB9iRi_I-~w{(3i-($pvfIZtSDi-BSobXKg}J73Jz-^OLFkkH=+}8sE2%* zSKkPk>rn?`yN9d<2V}WbtegZ>CSR2gvR`@s} zn6g!au(FBhhA!eLvrtro|B!}!NL4yIReu#S<3J&B*jB~R*@c+atx$)^sZdrG(PgcS zxAKQk-JOUfwSFK?ix5pXM(Bh)B8N=Sq*v_(a$p2MxQAk}23+t1DT0P{ zH3+m-+f%XzNniw`;fjRu14bZ+cgUAqAqTwGTR+91ceo>9Fa#};2V5uxSs(_a>56$M 
zMBjLa`h>2x@(nwED99xTF_}d>#fWzCgGR}Rv8`Jxdj}!d+kEH;b1hkGCA8GI-jF{PjC|QH3hs&)8et^k@ z@L9v56)%$zc>sr5|EyfKF&c7E2Lv2itteWKdN-Y=qoh@cr-j=0Ww)$FF(@F0Qh=mP zu>@-n0+iAdV7LN#=maP*gk2BcCvur=kzfkW z2R&dQI>?t@z$HHd2MiGcJ>Y`BlnQmo2F(>wpUs9#>0W&hSz7TL6=q>+$Oi63VTVu| zXlPe?&=Xt;1LMi2dODBdB^p^!0!BCjMz{hYTizQg-Z0?@H1GpxK!cS+10|>@b&#Jh z$Oa-+f@W2M>MS8*xPtOch~_m2X^;nt64AvuS%5{%>qQG`2pKw&2l+{*Fs7sW^jEQ} z2ZkhG#W-Ig|FwpBn1?NS-n^odLUUjEm1JU*U*H*HTsR?1IFtRo2O+4FQ|Saf;3P5O z2L!$sQ4Xki@See8F1j1hCv>#s9AqF*`J9a zprK=g;7RAblwxjXaeWAKxE5S61R$;xS=ghLJ&00BV!eHwYvpEyum&YS5@>j0f6xQ0 zc-e$#gD{?=J@NzhDGokLpsu{ypCQ~hHi(n0*Q>STvuH7JxW&m`91Ue!Av54^IIZVZ zW2-P)&w7V^Sl{F!){wELNS@@3Cbddt1U+chL@b`z;$)Bo9=ZvW8#CpSHi$4$g38E- zR_2Oq|1g9j2ofzoW8aWznl^}Z)}mY41*@f$Ck|$<5GxQOhkkHlMK+juNUC^QW^Cb= z$@Pbl+U1?HIY0~zGFlhr^kOwV*X{}JNF4_fu`qW(MB1T}0 zOJM}0!P21SqpQsdY1k|;Ok|dgtxx5er=IHJt_-S%W{2p9mi7ljkOg_r6mYnLlGTDb z|3Q_n=2fvah_9Zf!YPI@(1%PAg7dI$8jF{9{!}f9sO_F>gQ)9^!0YFQ>f}M8ZUyYL z7;MGi$;v2dsRm!BHtysxWTSmIa-f=%<&(%h2#n6`2|uw;z5=+uf-7hPgS~=r+U`nL z>C`suKnn3NAqFLw8?5FEOVENa;DYyw?o(oM7k{5_C=({ihc?iHD_{h0@NGYM?u<|e z999Pb7dGi3FJbG3dGXXC&xRt;Yhn)ahL~+AXoFoO4ie85DA0nw-JdJyhwl!k@b;#D zAYgbR27PD)48d(ox`JflAtl%&Mojj^eQNW(GOKKCoJYHD;G9sb6a%ck&8M>305F|Ck?E!hP~B8!Kz z1{Eh#g$PT9gpa+=iheK{>n6*<)CyRy6!%KitibDq=m)1Q(ZF^T&~%7G6Ls_j^<{4^ zta5g$;)-W?_Tvz%hxPP|C=zMc_HEy(L_?NtnDT`vL}bx*b74qiDb=w0_HbwRcE22M zhxe4uig*XA z_=%_ZinsWS$M}rb_>JfIj`#SF2l>&O1=|JVNQ=l<^Z{_h9> z@E8B_C;##{|MN%x^jH7&XaDwh|M!Rg_?Q3rr~mr5|NF=P{MY~e=l}lqe}M2Oa3H~g z1`i@ksBj^}h7KP>j3{v;#fla$V$7&@XYC(oWfe*z6EbSTlHMvo#*s&pySrcR$ijVg62)v8vnV$G^`E7z`Gzk&@b zb}ZSlX3wHct9C8hwr=0TjVpI9-MV(~;?1jfFWXW6rF3Gw05pKZ6b}dNk?Mrca|zt$H=<)~;W}|BfwtHtpKBZ{yCbdpGai zzJCJ`E_^ug;>M37Pp*79^XATw`rS)`FiBAKL;OETG{lTSh!rIb@rS*4X%Vwt6uTXNZ@mtTSzrkG=rS*Dq1qM4?e zYqHs%r=psws;jcvs;jTU8mp|c(psynx8jjd!YHed($YUNwWSXsIB*l)Q3`T zh_zmcL3D|*g^5PSiBi>xR(py??~6!bj75!&OPY;?WsXVMj$3JwO?;8J#F1W&lb)E9 zR@sweZIn}pl~|*di^`Q`?v+>6mS%C6SB96j+LvkDnQU~LTZNjzp_*LFnrnHSUyGc& zp`2y$oMV!mW_zAv+@53Io^!6Bl&zp^&7gFCp=Y0>Zt|jQf~0Jnq_EJWd*7sdho*6r zro{86a<8X%;HQ6@sKB_WoxP}fim7zrs)W(3g^#U!nyt;cu7kd=qT;TKqOZmDuYi@Y zg43~#%E=mzM-eTn5n;*ufN*%z?-_k(CWdZtHPYr!nDT2q~61(>cp$e z#isYgqp`-L*~Yoj$gAtfueHgj!^za_%Cq0gz2VEUy3DQh%&^MN*Y3`_zR$71(Y3|V z=kL+I<{N~x- z=h^M(+~w)<`03u_>fQG0;rZ<2=I!C>?&bOK=<4w2{qX1Q^6LBY?D6#O^Y-!l_VV`m z_5Atv|Ns90000R80R0IZNU)&6g9sBUT*$DY!-o(fN}NcsqQ#3CGiuz(v7^V2AVZ2A zNwTELlPFWFT*({Vj%brcUw(Z-vbL-yCySMM(z=I1PPQ1AB68@DrFn)bszNug@QXllsv(dzDcm5@ zJ^*U}Gg3SN_Hz*n8WbZ^EE@1&&>8~XAgf55Xh20m^sKSTsLY~y!>IzflY))Etng~A zBjIDjfw$(mYp(_U0^k;FNXf!T#U6{0oR&1}u9bFT1gi$a7BtElfj}w{KDOMUtgX^S zQj8&iq_avS@NBY%C6w+HNxKe@63!xPKnaqlG7!K1ZepuH=`0e) z9lQ{9Z@wHjyZck3jT9vPvv#==$#2{8_S!I}B~I2s97P+p$1K2snyBHwd_eKZ^+e zs0|N>2Iz!8v}(`~JOKU)pvdq@Dd4gN;VNJlcOy7%d9jKpGSv6Pzw}#!5!rQs55+DsX!@ zOF=REW)4tKgUmOq>z!f+Hy8tDMs8orSZ0I@7@zA~N$wh)O5(c_r4b56Ne#4Xn~4MvJaqW~$$pkcO&s)x zH!7&+43x_2D6pn`a{R41z)S^FEA*w;lQzC#w za7}IC13=p%T|dGXh#}#lZ>kbVM*Xo-Ew<02#q@_X5*HwL3KT$Hir!vh*;fGJV-<$F zgPc;(kGoFvr=2a72qd>haRDf;`@mqY0Hi3-=9LBgFl`EMNx5SGj7xG=Eu0!udAA0o zAPoD!)dwUuy|J40xZX)eBnkCoo^D4ZT0!mIZ~y7sNjw;I8H!3 zuthaKJ=cU{`qZdailzZ%f_Q|oKp(KhgGFiTRNMO2K-q!`cD?Hr6z)NH7{xVZS!-My z``A7CwS!QJ>}EULD9SDnv!6}vYV#x>#mKg{w_OY)TKn7J4!5|+P404=``qYGx4N6v zj~`cLAMQ4Ylh;jee}>`67m1R(D58e(GIV4XMN2y7t&w(HB;GfLx55u%aEd5=;q{Jq zRBXG6(`m#O`<@8A38H}r-}Nl3P@EIIoe>@c)F8$Gh-*T`#p5^*$jbsvqD0s#Aec-1 z=0^EO-R_`riTq*>VGtw636cz;BX1vp2sEW)Q4Ab^q91Fh*wIp_jdb)(1K|im81PXH zTiiPkYSca3q)s&_yh;nV1!>7K~ZZE!OAH2i!F#zgS|3BEbhvJPXwBF4e~Tsrka9Z>|+$U zgP^B*V75TemD$zU!Uv2%G$$Mg9w}&7+Tr1Z4;Yw^0HU1hPkDGQHK^qSY3WV3K-aiB 
z2_IZ)?(k8$j}&ZyOpGju^80Fp5~L#*De$lVgtU=2VfFp|HZh<;(xLS&EIA%w*n-9D z{t3kve&0Np9$NiT0$FK8d}}Z`-#}e~AVvbwar>eHr;{{cpbaIYVp`ER073;=bv-5! z2qqvti$yN2Ge*+0Gyc#dC<9lw!$E=Y0i#ei>2N(PU<(@vEO)arkDwayu3dn50lx z=MSaCQ2RzNk;F6R#6I|-ECo@39OrQXw+TdM3EFTh5U3T;a0e}NF4to|v{Q%gb2D22 zPSay`A$Tq|ggj~253!&IEdnVj1U&fvvO|kCJ+xDU{?Jme;w!LnE|SA7j_8N5axDLW zZQ;FCbfj~bvV|7Z{aiGZ}ycqTwOJopPd zFarqJkaQUm?&bu)fDg^pDPfQd`w*50VRLRrDWecN#ime)*%gilS+a6HWRe21vkwnu z1e5}oq@ydp@Fy68NG2ydg;F=mf+2D_nUrD+E#iY{$t~~zJ-{VUVM#lqAendtZ)NEZ zXQ_Rp7-5m{C+Q<$QepzKQtSchi&7KrYlJ5h=g}MqmPy#upoxkq}i74L33h zl@JIwkOFa?M-w-DXPtBZNfBNqn8SIVAt4R^b`j~&iidGMl!%`4*%;>O8L2g%^O>Le zxu5*mpZ@uu0Lm-_!a)WVAkv_0s=_*-A|U1iI~FxM1SKG0XB*SG5s4X~80ruMdJeOu z0kHrLdUFoY10c`a5x_Tm`vRvFfX!S3BaCmdwAT!us9}I z4I39Zs|{Aza$R$?1Cb0E(m9Z60?idkUrRU8;tu#zv>OA5a#*s(ak3gfr0LqSLP{@P z8U!>ur0!q@r8*h0IExc&I1@<^qmX@dH!MxnDCs~zKy@>+ld3rrwNCr1lKTrh(>GQ7 z7Jo~N6gd!olMrn>gcXJ}p<6RV21R4kM&S^XZR@tf0k=ZR4Tcs9K`Oh$Km~tT4PkJ* zYv4a&5TvTG3EVIYjx-F_a1N@l1wpU|LyEUWx}t$~84oEC3Of+Ew>2_XclIbeJHrCc zfDdHo!U4NNstS)6q}32T%en?2ngWI(!O$QA@EQiIP<}T6haa#AvCyJI>byk? zJyqHlP-+nIR=olN4e z28o$0=b(@Bw##p9Jv>EL4O?oE<^` zy2L};#OKh!K^nzZ%%M{J0YR#DS$w<8yJj<>ed>^5n z5qdrF<9oAc_uAs1NbacCpbE;n4t%CUE;G(%_{P?8!r#(nQJ)E`1K#z|!acEDbzB zvl!hM?vQb&vC}18RzO+}PU0jgZNYTQp+n8i=Wsn4{nJj(97a&l>l)KXebI2d7x6L4H~pblkShAOdlVto|) z+z~oD5nQ(sGb)CG+0Qo`6n)JSe;pG0R}if70gg4+yumvFa-c6-pb=tc0HP`mYMTNg zJ9I5T1Z5CpBN8r25dFp!BHa;JIuX^_5s+&TuU*7KvDz%r+Khd21yPccognD|tI5#S z3~|~6vDZf7(;RWAXX+625DP4j4g=>8a~cSB>ZWj-u6U}aQWJnG2f1ticHPbW+ylp_ zYtV|tU}(DXr`kOe$h{KEJrQpQcB?=+NT+n${bK4x1^g%wyxrUV0X$o92f&;VUH5fr z)XO`T-#d0!S+{jiCv`Vf-dguHmWrugk*pH22{SNzM$lZfx(HeGm$5plz-mELH44(u zq^T<6vT6$&&f(v{tG+4_@c`Kt77rz!!(D;l5z)X6@?zn@c#fwih^KhMV=C+2-~PQH z;MAktDtc;=t1H7dz6zC`=XuCed4Wngo`*Xq{WmhvwzU1(+2`2}C@~04yFtTezD~%$q{KranwSeQLhESw{y0Zqb5(pvd z6N#-6IV)B?TM(NdU$TwoN^6ox+vbuBMKdUub1o1^D-cP0=7ME$fiPZca0lv}xh+xa zDRH$~I}nSo59)Lf)i5~X;{>)PA+&N=vXTr+xduh6HJPsI?%~iABu-m+5Cr@I#qh1( zxJL8ml5U9XTMmtAyXz0+6Sl1pn;W?E76w$X+-@fV5c%s2)wmuy?eWkl{ZImCld5No z4~~m64u4b4E)dedGp{2J(>@agpAwt9FP*FHb-Xm-zCzXiP>eTrAi+3{9He#rAk6$7 z+3wyRJg^1nfIJkLiwIHgw6cP=Nd-GbuMEsz5+o7U@utcD-aEWVUdd}iOv#OuM+WF&jVo#P7t2o z&VMT;DI!?{kiwB7FoRHJawT9pYw!Wy_w)MEH@!)bzLOC3u8$fp18hcEfKvi%!0&-| zlAYCtsuI8^;PY3J!d&n5U6v3sj1Pp2x&;wrfeF%IahG|ymyu)y{g96jK_zpk`;@;P z?$(!oDG@M7`xYVGDQ90t0~%TCfR zdolhn0nOh3{sxiqB5}`wA^$O9{P5q~dvDr;vH$rG5dH)XBv{bkL4*kvE@ary;X{ZK zB~GMR(c(pn88vR?*wN!hkRe5mBw5nrNt7v7u4LKLC=u)p-!b*)#_EOS+#EEiV)wgu3-m$6T66`+L)&t2p%#oV`X=hnTOcW+<0Qx7K| ze7EsqxsmTuo@;q7-??m6uV&raHC4Ii?%cKi1nn-5b6f60TP%0WIUaYhtG% z`T6y~r@z!bPyF^1P`>`sGw?tJ6Lc-W0UKnmK&7O*PXh-lY!5=DBm|H`3oFD>K@dX} zam1xkNTEXqKRk-82~kY3FsNAE(8UH-B=N?q)&L7f9=oD(Kpt|^46&b#!0pHVf*cYk zBauu}$R4A-Y7`BqSVc-Ksft87C-cjap)U9PQlT*Q8}rPh;CTy8Hi=@e%-qzV>c5@T z_zlkYwve+vDC@iv&-V0eb5Njc2}7X&Vj|hmpKl^bhfqjcdb3O$i_&nXNhyW#sY`Xr zG|WmP6&0mG9NlLV#|AP2A09?^wWLp-40R|OG35}@2RB`+R!?umb=Fsd^=On5!sy2z zY5wxZoh|&CL0D)ziWNy{F*$q}eA0g~NoV z;fEvU6R3(W&bZ5s9ftT|lr`F?v6Nd*D4s|{@KLImU?Rcs;jm->faXbu;#0~_WEnE!xnpNvgeu^MXt+Mdu_Jc zc3V4|--Vm%PHtFXw#l6e9V%t5F`)#-bE#8Otew?lApysHA!Nj>)st{hXRpI6J?Z zkKBED;){<@H=YNaPI%*+cOFpWiPmrV=gWk~5}#A>n|7>NSiPd{rFk~?*|}FLu+OW9 z6Pn=YO0sjU)hC{P`H_Ss`}pIVOZxh=rN4g{-7hrxgU|i|NV4Fp$$U$JnV!Dqy|GwA zPu1{8F}AP;*cn0_fpCKVG`_KhHH1ST^oRuu(vgmS++h)$pn~h}V5n+P;ZY0ZNz6`k zz%XG?Him1DC32I3+~F`=+)#xOQN+IASR#n*iG^~uzyksrF-%01qJ@yiyCt5ZiMfGd z6=}pi4U~_DGjT^l_Avu0{9*>K^9Ka|(T^bUf(NUJ#|JjCi4WY21tM645mrHuU$7t< z0@=h2K41hReB%xu*aDUIp#dkD$Or`+qfPAMJ}>?VI=<_nTc8oeOgc#$p$KJ#pfj?K 
zvdANoT(Ze0qnxtJE3@3P%P+$mv&=KoT(iwL;QB+MxFH(Bs8M^u6}dV@WXgL>10Ojm?B?SwtUg-lw8JaC4sfrgLN zhf;2cwO)xqbcwLmiB|22M8=C$VT?tDj7yr0gOH9;*N$87j!b5cO2?2~X^~BQk+;~9 zUhtArjgy|)lVoj_Q^}NLqm_%`l~-_(I%;=7ZhypyrK(%8J4 z!@ZWzy|L@Po%X$#=)R$+znH1Nn6JOuy1~%u!KJLioYup)+{32o#H;tkq_M`M>&UOQ z$*A4RyzI)e#>>{;%dxu5t>MhR;>@%5%&+dwxxdh}#nI>G(ZlZ1z2wro(bL=c)4Ijf zy35q#?9|2d)VNf=jG=6<<{ut=Jw{>-sjl% z=-%b&@$Bi{;_BV}>fQD0;^yq(`RwHC?&S6F==tyH^zi5X@aOIE>HG5S^7QTV_VN4n z^7i@l`}y|&|Nj6000{p8{RtdMu%N+%2oow?$grWqhY%x5oJg^v#fum-YTU@NqsNaR zLy8oJq5$&6_xL>fFh*r_Y~2g9;r=w5ZXeNRujE%CxD| zr%fOt? zuiw9b0}CEZxUk{Fh!ZPb%($`R$B-jSo=my2<;$2eYu?Pcv**vCLyI0wy0q!js8g$6 z&APSg*RW&Do=v;9?c2C>>)y?~x9{J;g9{%{ytwh>$dfBy&b+zv=g^}|pH98H_3PNP zYv0bjyZ7(l!;AkPPrkhQ^XSv7U(de1`}e!i;hImszWw|3;p^Yeza*#)00t=FfCLJ7 zM1Ke-s31jR6zJfC0LHW6gcMddkQ5MRIG}_UcIcsg7y@|04kVUHq7x?WkirftZ1~}e zFs2uxh8l7hOrr*=g9|9=loZdUd4?*gLOZ~Ki$HR&A&@mI z>>$xT09yYuQak|ma}f+1{34Po8~k8U90Kkjt4Nn@AVos-#Bs@}%%Zu&sRF69f{nPm z@M^3h;d8`+x8}NQuLa=(U>0#m3ByRm9*dBik~HhCm3CqT>juLX1WFu$Kq?SEvgDzx zt;{^~3n74@lgcCSToQ*Ql>(4tB%ks-3^$vtW1PHkP;BvwW_~7#hDAoXy$1^d*EYLm*b1cv^k3f5Hqv;AX zjLL*`It;Eyi0qFwk0^_eK=eG4iYsyG`tI5MNs@{>3|(@FG7rt$u|P)%2#P>E2$+RG zhY0^@%@2kK=!8GCZqUy=0R9P}$nQug;IakbDxes6>rJ@g0771OfOq?|d4M$!#Onqy z05Z%#-w+7x=NsIO&yWDR;?E-+Q0`COviK7ZK^*ARO*ybD3{&g| z&U;LHGQ3;U{rBOUU9aLHiyt8GdLqakp!P8g0Dq zKoGV~P(bHuNc9K;Jn8k%ANX)Z9waxu&;hPp?%PM<4hJB%6>D&SGoL?Nv@Ad@i5a+> zo-|p1rSITmheXAMm z_V|c3n9Ujhk_84ORKpF#!82Cniaz`gl{Opa7xDvwa8Np7HB+e{>Y-l6|PPx`Yw zYT}?jBo{k>tU*90;ABBsDUj>A%3-qD2O&0aAaHIVmjeM@?tD2&U`q2pgS*)e_F>Uw zS%W_Dz}4OgH!zBB4UWTT+BYNtpl>+T9}wLKuSD>*y6v%UF!-89ae+aR4KI!leMCet zbS!j$UUwAQ4>C8u$Rvwn*2H@Bv~-_~@Ie1fo%Ya8!%!`zSH}!HmQOh@ArkP?w_DS6KcP zK=??7q4FT774##o6b0&M3nc={?NM9+QtLi2*ed`j3bcELK|fHt0$cx5?ib^doK*|A zhE(3IK`Ri$K5zvB$xV-}XHBkm(D4XDa6_L08f-iy5r>vJq#p$$8Uz9KgGTKmL@e9@ zF8Zb@!fq9`h&5c{1o9tsfdU2~5JTNMM-W`);ZG}&)pr3D-cwx`1X9q~c6)`XVb~G` zz}llfQejK(b_#=XBIeipG2reBM}7n@l-YbX?fju4MW93lS;-{6Lt2lRuv>7=pI zlM+vPg26NUxk|4b&=HV#i7YrF3tzwmI@W->0J-WXm)OD1oM7jh#A923&UB7aDIh7R zLkx}+&<|wsj8NBAp_)#$s+GcN0Hq)vpezswWD(&|qI%W1&NWc7kb++K`UQo1P#!>W z%~;-A*T_zGPlD~BQ!4w}&^8LQ3*_u*Tl?BPiN`Ot&Fyag!id-ox46em?sA*^+~`iX zy4TI_X7wY;71;;84FctM+xwqlNb*IZq%MlAp}Y+p*+tQUj(Th4-4>ZQPUX$;gCLwD z3~%_oCmt2tUgC5bkww2JB5#6hV8VGlODYuS1aN2M2Lk^!$S>xa5OMi9jspVoK$9qu zwhBn*6wkR)xKXz}uv{Xzh=Ul!sBwY}L+Hrc2OtJbDOdbL$DHU#94dCS)L8=^J=?%H z#2^NI{6ZG_280^5(vL%d0(jNV2#XM+LWQ_|An3RRO;>OUlNrQG6p8!X>uzwJ`<*CD zw@4~ZAPGq%vFHNP*&#T=!8jZN7e`Q397M1(%HbjlV$@)-j^K*C!hjQlrd@+T%@wMy z02liJMeZO7>K~X55O#I;Grj-Sm!4bvQ|@C0Ss)W54Z{4t9^nM&aK--$04yPEr(2YF?%*pMbcCRt!B``~&6fs6i!bK#>|4FyZG;|d@k0*Ams%y0sB1u4on5I1uI zC-Vp*kOFxSa=Nny4>OH)coqGS0wVt)G|>}BEAU<-5J!2y2`8{DDbSNp(hon_ASTE` zQNj#$b2`O^3)d5rL+KBJatXLXkKB?sDzgEnIq0GX0+q%W9k-DY zRS*w1G7FUu3P+Fvk)1~qH+}zyopnhOU`Ck5nVuoR3;=f#=-`TnaXpo&p7hxm=;;}& zRi5>^pZwXM{`sE(8lVEoECa$p2NfX9z-+3*I-epS<^wwyH97?)AY*46)wvOiIiMQq z5Cv)uwYCASpbUCr4$uQ2${?X$X$}x-4m1@Yl3)%7A{zNd5unf_(@7C4FrR%fqbHFJ z8M>j#0U+iuq(oYzMtY=3nxslPq>|tcxcM0tSAP-#12;MmKS~!-dJ>NZq}L&&OZuf? 
z8m2=EI2D>1Jm-}^7ZFqncX=QNL_;`_kOgZXD})d*ex~F1< z50W6KZz>mO>T?kH2u}a-fap+d9OE#JDl0KGj9Il0had&32X?q1c3heqVj8NVYNWuk z0cDCAfoBnGic>292(kbI@>T;oSk4Nq+EQEw$yKuotfE&t;`6D-@d$Y^q`q*jzA&WL zU=5=hs>*<_>^h{SI-HupeyTbw;=&0v`gP%og#$G`xNuSl z1<5KG_UeA~r)9X1b_FqVhhRkS#W#wAvEa6)I3-sqD0UARf#PZ$LfTkAu(Cs%1zFIp zMS2NJ8V4&Y0y6(gqVQ8;yD@P%hbLPcDZ2qi zx~?!wr1bKoLvXW2>JCS6s*@p$v{xcwsn0x;0~DP&7tu^bIo!w{bfhb4#S);If`Dq}uQW zz90pF_zYsOyK&$@Vlbqqa0%Mr3(MpS&tMLxkOe~!2SvKKN7|x;br}$OG7LKqy0&vK2~-< zw!b+lHGwjgtWsN-@Ve9(yTUQMM0yI5H>A%HJiBDw;EV8O~D0`e*bsc?Qf5QiXe z2(Iv=L@K>U3O!fa7gA~v^On7=Swi3NXbz!e_Zbmcg%ACJV_LDo@iq{8$(JazDqlMg zRPw7%Hf02n!U~KVBWw`1$hn|Rx{;v9E*s`)*#=z)XH5cwWjqY-Jj=&%YPyptnnK#>!rb(Z-Ys=p>LK$+a(tmYJD0htW!9( zA`!SBPe3gWj<$&yW*hxmfn#;()b(55F9<*c{`G z-nr+Nng(~kgd>*W?5Y_N@JrmF(V*w^b4gjy?-#ulb-N@?<^mHbwzp&H8fsD@z8LoY{3;s(ed!j2~N=u zkPOMtL+N4Q1)toY(aZB-Jp8G?>xV1f?sTYlnJoWKVY$-N}PR5 zj`&?{ay+^mq%EZV$!bK&JLSuCu;%^iq79DLdSXoC1+5nPrzcDVe)z~Pn3r^JI)M6@8y*Ij<(Z4Pfeq9 z{ZO^fy7eb!J|^@Xw(>G~F7^FfT^oDFH*Jo=4)$Sfbg)$TD?N4d zzFXgg+}E(MV`)_9@z-OJmR&YO_fwQJPq;7Vuy)fA?T5DZpTn$7!qmcM9rlz9UcYj# ztz-{oRJOcpvfwoymXBaHl`?cAU9J zQIe_M_*<=e9PT6NU2}A!JPHTY*nh3AD^g^)s&R+b`Rd5k%`K;63-}U(9~GTFVqJm0 z8#{YtSr%RU_p`?%`UcEGWc6f&MX)R^;^4l$xX$4U^0*md6%?*aS-ixCd6XTpWpbZs z6M=na-0(qq@z?z~dA6(sCqIM{ONC}IAWOv6>C{g1lk468)kA|BcZ|N?nQf46ZMf;w zYH1jE%Wef++yEvw+^`_tP;l_Z-?;n-g-eOAtw=Jc9ova6XUwkiLzHqlGP%7C zcHSe-?1OX8qq|=oW#57x7iA&w*sy`ahwp_qb|1yapcT9ix$)o5;8#WFdj-L}(fDI| zbblOO5PRjp(Ma9XNi5HoQFOZ;ypv_v;=|dyS59!}N9HheIC)E~cq|4_KOb|nMgMAF zC?5+#{VJo06UOK8@r+z&J*LO!4U*AVi!#Cl)hL}$IdE}$V! zk1rpVE)<;i94)i^xP0lUdf58nmyc2wr@cwG48FY8D`jT+XFX>={k&Rn#bo^uJLzlm z+O{!pX=d`bZH;Ts>+U4Kwa<0llU1Hyr~cTtu&GvVMEFp?vsO{(zl$ib{fHc<^=?MQ zSL@melrXKmQae9uaAGr8?#EN@)OioY+{v7O{&HWG_~lOf|IT@}Ki-7*3ViNO+B!6~T`6Bk$ocg3s@MmjN@bBKCuc}USZ*$`uW{=c8 ze@2G>T|d^Oh41{FtoE+1X;PX>%kfCzn z;Io4#Ee?obdiTZTi}YdZ(29uF>)V$ECZ3*4Q!e4Re5st6dzv!Xe!_lPdoMl9LXEk@ zjGKNQaj=BO9M3PKZ?WVnVG(;z(XZHU99mk^&SKAee0ru)sn6_!CnL&quhp;C%_ zZ2tW7dq4sOvwwJe&iFuLR=kT_c~4US)(XW}h_2p#uRM9+#QOfP}1jAiSW2_|zqeq|ctG4nitod;vK;|1&Z zT3po-oGMne8coT8qURTwRy^hz;1|XI%Ov!$$WI8KO6553DjLbnu0{*!{b2Q^W+ttb z!mUU~tZpk=j0dF{!`D4XD>M@2KxG#h=VLJpN)|7R6>cGtafpxj3^`Bsxx-6*iG`!c9+HB1KfeJNaz+&-SPVre&?_7n*+-FNF?%q~l&*mbo;bgPTZ{pmmAX5kWdY z4o_@1nn#wDm9e=uq%4J|tctv@ywqxR zuwO7Q1ZEqnBK~9r@>kg8`H%(80}whYQt7{N601iE-j7shQGKy;$6Wms@R z*~7e;3`Lmh4h*1wd2g6uMHxwL@zrwc zt0X<9&HV{XyZiM?WmD5oIa2PwLkF4~Putr+JR$P2Oc#UV!>kD8x=LBDC5zAHH3ejO zmb}X}2Yg3i@|gUmy9vJiDeT$=wD_mAB(7-i`&v(^c%CT4Jo!=Mk`2P0iU{`Z(}(@2&fBuvlt2$1OP&J1iW&jH2p>`^VcZQrBGL`# zudR|U0)^I`Hv3w@@iFFe@R#TU{Oo94^xEJ}ExHQ`q+{tcQcGkVTHz4MoXGf{n=a5X z-Gh6`p$h1^(Fy-uRlVn|U@pneO!uH>$E8 zx?X`BJrU=NYym%CfTYRh;d!?h4J*GeW6s63>gK30oC^$KyIQcvA?jtwSJVE}f`cE) zHcs+8Z%NeosQ%nPGgO}&KkZBL8vS5V^5~~5#_x2XY&ZaD*<=lt5&_uNc5_Pw3-yOy zvS_{tiW+f+0un?Jz*`GvBLb{GL>PU!r3(Zl5@}E{klzbH6*2&jpa@WQvhxNs?oRWW z1MS}4WI64WLgJ@65k;y@b8$Bz2qI9{OKRb92@D-BT0$aPsutoH?n4~r&U@=c38TQF zT0{sEAOifvQ+AUBwd{rAAO`@js5SG0X)4X%&HnfcNHDUDYB)3nguxWR!23ib6O^=w zTF^u-k9R|fY=!s>{|#=$z<`UR%)s~~b|AS97mrWjlPdz9GR%AbW^B-K0A!=J=TY1! 
z(Km`h*_;T8!etXpq~d-Ej_pYW;{WwMX=d=L(qyR2N#_bR;Juxc)loec%!(5JxCLsT zWBjqew;P0x+i8{U?Y7$pV0_pWd!H0M-Lt9#WQJZXw=PbsEwxgtRfZ0GuD&_4?l* zLx5OjozP@mZWORa!B+m(u+~fX6`Jwe55GokwvIz~K0)@xAw8vI0BK$QCz_#ZiY#jS zEJf#75V{A+?I;AenFgZ5+-xt{owuZ}6nqE;nmpTZu*@ERMhFpPp92f`P-K z!fCdFh5%VqcBwvn%RtaFzi7C6wIG5P!-aeFA1O&Rt-HB`9CT3_6#0(kL)r>C7fONv zE+AJN7)I)a6_KotbU!fTp)_qL%X6W`1Tc|!8I`CP)X0ZWcg0b1nOo^j9_?AVCagHX zyauk7ZNg0e2+2(dq6Qr?U{8x@Fa_kk_A3w*9>yj3;s69LL61PpaE}*8sZ7OX!PRu0 z>|`a!C#0N$&dt?*l^B@6^=BTL>JGhAPy@ipZCC3Wu1EFZtPIWUAo9hG&ID+9o?#`6 zc+^K_(2*K6vS9)OfTd&p+*SV(*PP|odzm-{C#jkH5VJUl$Lxv7p=KIMmDHdmwwU-3 zLWHzX1fN`xk3W?fblV@b07Zc91$Z!o2%bJ?F47vTc+Q-50f;18Gebc%wcOvx_eW}5 zZ%)sm5f$dN5vvA$io;nKE{4()A94_7O?Ap|0sR6)^fMpK4a`JiYv6&|d}edyxhSYR zVkrEn;hePyTTvUa>|OX(b3#9vr8VJqUV~2@(n-&V@fCct5}Bcy?J;g~jbqHd!lGOW zwpaF!R++Vtmt`-aM;95}N5PLg;t?Xmt@P0Yw1rxZgb5kOdj=sBng1T2BH?4X;=mli zY3Vqh_$`#Q{rJt@SeUU|{g>_Yo-_bRCL+tnK|&@8b}>Q~OqP-2_oqSk32_UCDK{$O z5MgPI%2yoEC#+XyeF=HRCk?O?hs8g_RcIDMJs9w_Jnn$}f!Of}Bq(3|m8Sj)i2%&c zwhw0pt>m?@OwPll`oXt=_y{1uKm?jU4yG%WSTQDyo?VjIx${vX^{RS6Ay%17&nM-Z z+s89uA2pPPwM>aHQ{ZnLl8r#bH@5Bu1BQ-(Lu1Ut)Ts#?g zEZ%63h+!a}dT`)cxY#tMBn2G!pArLu9>9Va*VWF}Jg&0qKdW<%S@zjAluX*1MOB31u z*dA6~gV!c<^^0c_Ab*)s?h#C)xzZf}eDG+|f)a?R8u;U@3dS3kw%OrqE<{)mQUCZe z5qpucU=?}_XEg<&rhwo(FoQINRS#eYQg$jLdc4(eA`v~l&nQe^KmEKJ3Tor>kp&I_ zJm~aKk!Wu@ZQuVjHw@_PP-Y7vn*Rndkj{w{0BACC-})Lui!O_F-1^nVs)R|R64;oF zn3RyfWe-GV5M=INY1 zfVd>z45@*yj5wWXhdsDp10n)hnwQ`rfaKk;4sxm!2^tJ@Q$Ue3p$G`$A~q@}O)wMu zET>gifHfF#MxN9mrwT5bicb3hO+FVa4#bt(5KPsfmYlEtI%6HVke2>lF8P3l~rZQezGaT!A#>!b@Lq`yp#;!-S)|Pn6^WN| zMEWXh`lq}N-4p;7b=^eN{8@VEK~#Gxa79-f*m{Bsiic^LI@4SLeiep2>>_{w1^SRA zx_^L+045UA?l5uxyR#qad&72zea)g22O#En5#*!(m=h4P2mG=Zqq-qO8G620jUqo% z#ybV3nZxdNT-g2{h3o_ z2OfO0i^24N1P@UqdWZ)`M81l||0L`{rl-OK065WAN<{wTBB-b&xd-Ud1C0PqQr#a| zAERRI%Xd?G3x)~*l=M^>@PR)jv=<FusNHx@~KCP z#Od;^iQIY5^+L=h?EC9vpNA59K!P|B-ytn}ZZkF8+6pV_w_3JD zYAZDIVW*59F2^VLB>@yha=Rzww=YhN8uEan61N@a5hSO?vqYtPGuBb7v6rf0(yCCs zndcrvfCNeAXovd{FJAP~YS5&a67k{AQSr4C7k(yu?|A5?MW5&ZEBC6F+W&zQZG9F` zX>3p?VERpaAK(J0N;YO$sKZg~*ScNIwwn)NeFr{M?iOkjSXaNGm^jFCW4qRaONl64 z%l5M@#osP{&yn98nXOUar}@b~Hc6~4P)i`enU>&s-8*3R`;QqESu(jpGWn*C#QB=U zoY!e&Nq-8Q(eMhV@%d!w<&;R%7;04fg%D0t(KGi{COoE{jYV2zH+%=ZOhL&+h#Jwi z!X518qa40k@ph0#gQr#wwSRT$kOd&7@RB!eN?%nyEfOcTJ13s}nK1HDO!e%@~mG6C~)HB~n+n9@U>7v%Zw<3|KU*7us^Ir!ubjJbWdH@L>xqiBg8 zdJJ~cM2(bXkZllT#ScdJ3Zx~lMg}2}jMjLDYpz_eYVo~Wj1FqC(UwWp(@EkrqP8*l zc}&4f9SMLbz%p$LCM^wA`GZhjh2osIwiN@9&AL3!AC!Dty)!SM*rG7)RDDT3_v02X z_g8R>30MSx6KWB9p4BHUU$}`sz2bFwh%n+>#gqXP1%MQpmRulKj~KZ*hTCFYSPPtu zl8`u_(Bn(349UPy0xVUT_!81;FAbp&2`xZe^rMv8deV%%4##qvkdl`h`P9O>^&%k^ ziFhtGX5K3<_craG*ofmaSc?EmbMqO_@9Gh2G#h5;)wmlQ&y$lOD76^Vm@izWG1rPK z`IGruxi_GDNf%agUuwkMBOJVN_6{12E3QL?O8aL^KXf;H_;~W)^SQ*4)I6=ZO0V18 zsGDn^7VwadzH9{=3uomKG8=s!R(q=dv+*3k3BgPz?Iebvx+a!>;& z;1V8ZojtS_V_kw1XwbV51RLNbq3FFar%GsM6OM-s&5-Sx`K+`GSZs{q*c3y`q=&5= zGsQ*%JP;QMMt(0MGn5_;bN1#mqX0b;s8rcJMkiWRIvxFlU*5Uv*9|r#3<5|u`2X6G z`BY)`()fD~88cORy*cF1uYa=BHEw+mez?avgVO@}C}dh$Eg;>t+V zQoV(5I38`WkIK@GA8Xr5~S-0Z5qF{caee})t)zzf)j}G!8clL6XN9npm5R;mV z(^OA{NbDDxP{UYOjfyX^NOLt&lQuNrKVFe!*O6PjJHetcy*ElR$yi^bk`$TO6%S)L z+h}6x=LglqGdQldxH(lF8955|&h-~MMh|y=RTZT64`jN%6&cF9r19S*TA;;BJfHo{ zwn4sKW<>cz$LI9rLAZ%ltBqjRK4nCFRgUNKb$4^s3=#bX@>kK@rrx7EevS`32-0o) zQptY)A}Nm&QuiJgr#wuoD9O;z?}KLn=2L9LL6J}$UAZJC$=tKeS?8h=u2}?1_dN90 zgQM6<)xnqtM%-1jQVU_XqS6<-1IalI!MFRxdKIna_@{J^)tV=DT#7;}39piCDb53C z-`f&H0fv#!m zS%MpBBU!_ac9jxxwS{snWG}iAi73K}zmdhca50Du5OKq}H>tH<_HNf9v9d`%Z802z z7*|O`g#^8}4kiC?E##it5(pw4e3YNS*+Rml>soZXX2{aUOBKgXcD5<##-nVevm&47 
z%$bbDwUiw<2}aVLcA8wpQ;v$=jh{DSTVo2XZ%$@$o|Jumaz3npb#e@ zZ+vjo+lDwEH2^>KCuVXy)e^hmMDvMB>0-V~O4q&1#Dr4ooK9WRMnjO<)ECn6&Ot;Xok_W>hxJ}$AT$Xu&qE+B4INl zpI0gl%%mey5s)E~{L#~5!6mw~o%ff2(7^VagWN$jDc%8fo8*AehJus$QOzSK#y0R& ztsi}){CJ02`2L}L^JV4wI;*DHX>rEQTnnvFpRzbAUt?sZfw9i>JbutoOi@s*?p9uE zCyl~?oc)}$!drWJ&K>fN)&oDOi?yafFtf1^V?_(doqfn@sjg2{g``v2{+Fn;W1gw{VqOfvwcbOr( z4|NZl+OcOtUky2K)jjGw!s6m&hFyC@@~oMcY~cL@@`CAkWOCB7iuW+6H_q-b*^5}m zVEA^&;0ZyLP3NE%{7OAHZ!@SzJd;!#R#*RQJxWSzLFV%9=cF@Y1yb5?L%>m6^@S%# z-dcEIJcga~`O9zK_|JA@tO%uIw&)5i0j7zR(d{CXV}+s2>no8}uJN)8w%nCDu-2Xh zG?hB8Q}s_#_4x($ufMo$1}Cj5i%M@~g29!7c1Xwk*&NRn}Jn6{VYNndUNu(fN;42-VZSba#S?E`bPL0j+D+!7Z=(q z_kecg;OIa0bC4#2<1q1j%<%=cMzvxG=3Wz4D`~jdcV4I%fQ__!Gr%wTCH62LudwBo zZnV}iXwz9|M@5%&*m~3xyfUMTHOez-H+!6CFBRD0yfX9lhf@~JlLkWU>$(k?13ML; zoB2;3@O2L{w6C@84lgTzK#%`ozh!*%)cYRRbu{gsQdwH6PpGuUy^$qbFBz+!GmaYL z+i8J?j6y$O{UZ;T>L|aCmR`L?w|$Se8+fa}$iW*N>09HKTq&OKd#@O*_U=Tv#&7Ri zx&CgF@{5^i&b!W5!gqZKRQl`Uj$hu0GM$;~YLF_OYHoAt=RXo&zux^gnGJh6!rjfKq&&Cj1R>KiN{u$I z&9mL7{pT0mrgmsay;-W(O;=2_@N8cYPP$pIEBr7!Eox$U{Kml5s;&I`_NDv!%|q^I zLtlTj0e>vS37;qQft+xFv6uz95nc4>De$eqJ@c!WkfssN5S0kat2Ds`R14Q}N z0{esUr^>=Hnk^hUMyfaW=b3+e))7q7$*)UPo)V$(TckzA(ltNgu6wpwuNg4?yr#9v zAhGi>OK0cOpRh^#BzeF-`+B22nT>xH%Gm*ld+}bISR@?!ePOm@xI%?kNxoy*qI1uU z)?2%g154lQ+|T8^J5)?B1$GDaUDYFqRCE#sXTaSum&j3vlbdyPJNIa#llLAe`gi^N zX0ZvS)M#ivw%fJlAP=SJnTgTWkZuITjs{AgbxrJXNIfjG97=MmHA_QYKJbKQFP+4jv$de@L-ol@*u_((T!mumD3wN75SDUBvAw*ZF zwG>;C1<4arsb3;2!`5e>lElo09YKp0{3S1N1|n`otdZrq!xu-QBy3p>@^ZgQ5k+BKq=U}p&1-)6#Mq7hSQO_r*fk7Hz{iIfiy#CsF&#k=fq>C~ebUp$pLU+X!(8mmchzIMm1jOEnedMf%juH@YMsiXce0SIUHuw9%f_2i$_Y zi;Adk@TfSBZ`Opeu8Y3Q8*8jrYGp3zr@XoC6Zlx>@#`C5fAsW1(sgMi6G6$3&B0=c zGKD2^g}udhRDN&;8u^cU1sqUpoWNv9PaAHrKwqXHw-5JDmZ}hv_T0;^lFM#YUha>3 zaPp11)6Cd~AyCWqXH2dFyfxJ&S z{ETl`f=qqj*ldxfZNIej0p(n}joq)+D3bJkK~~~**`{TXA-~LnE-uV~K5HH-*W54V zh~%C(?hsG+8pE+uiYRQB7p@JyNu`$QqLxTyD@??Py6HBLGmRoMr3CSpX7?%N$0_gG zWv{!U$iIFDw#MchTQJ}F&Uj=bOuHgz_Z$9HFVC(LAqYN2uR-8%3cM2BsBtM`lIJp4 zNjqg1M`dg-mS?lS(#`gJaQ4QcnoO3l42Wu=)dJydh_OuU5%)J`qCpVSGP&k4X~t}y z1AryMX!GmJnl|1dCT_PEU~l$s_fCHQrzC5k;K%sPlF1dO2>iAwBV8IXYP_CtgPPH1 z^)}fJ_Bu-T`yhDM1@^6JJhW_x;kG5WR@e8g)vRtZfA=yRmDK)POea}Nqae>EIo%V~FDDhpZEU=m`XDpZ zRxlTNq3*$-5{1gh#{0O#wwSwB36e-rho=v z{k^EmG>?FFr{o{ciIpBq@DrGHy#E=8p6OoK!IJF$2Y(GC!ZoxOUTCs>S;xp^t zg@JmkWml=xn^-aldx-_n-~}Lq4mYalIwt+6RaUi2TAVexS`^5YGmC@~pND_of?!0u zfz~Gg5rst67N;iq=bc~EMzb8jE4z1Dl4yVCJdOIO=*B2i~1|DZE4>luB9 zN#qjK>Im(1)m)}L3-ON*=p4Dcs;uhLkCV5$(Di7!oAbMrdC9XoaFc&d)gQw*d7hw{ zUZI#b#qPv8m24kknzBnn17M2Q ztPUGZ)N}XK&+i&%%KlC|K1r~zBuU8NAn$4wTVW3Uux&xFw*55DrzFwrX>|=6ia5WBHnm zp_ta^dhH+e4FlT}{8*j6dR@9ABz#f-Mc$GR8=K@eDc{j)u#$mlg8_X!)Yx0FU9{=b zIOANguno5Do1Ct0gUNj*(|2b7t)F%T8_0G%YJJqUpn@{zZs!@H-QDdsnpPQq9sNP#?zV}!4{$BHax+;<3G&!Lb z!_yLjQHzyoi9M$lr~6IWze{3VUr9+VA@`Y9zk$s~eJ&IzDYQi(S>MJXG9fE+7-YbM z0&ynmE2V?baxnr*vC>Lid_gn}%|Kiagl-|rW~*gAa?tcwL#N}Fp3+-{rek$Ul=wpk zu7-wDf@Is`AMCZ{={ktsYLRXMp{GGM(`0rOo{O_9r(P{Sf+lgX^*>pMd=5O1VBBNp zI9-?9NgQNJnt_rYncD%99k>7A-~F}@aMES{{4w<>eXWn(jq<;C2?x=h#wDmm8tC{K zmFl(;%HmX4AvPokPLQIlbjQ_DqfGOFVA)oVx&7j1n<}dJMKh$ZTEmV+Nq^tPD+j_& zs}+Z~Jb^&aoch=b^~AWI#B*)BWpQt_5`<9X%DWKupceU{$k@j{)!qp;95GMQjA}7= z>o~}D9Qb<0HUSqkc@#dQzO5w=rgel@vc~hc^ql>q!RbP$DSPb8#@{9QgVFwYn`Hft z+#bnka{S>wFF?_rjuGaBcYM{fnR`c0>*?IPn+<6vRl|}fak_$WLPT1NvzBc`ob=p2 zuS<&-a68GbRX38tzKU0>Xd$FSk_y{&Yg&5#c1a|Ihkj5_3)Q%b`bq#rw!uJ&^N_C{ ztlQp|`R4BMSk%amD5Z3YtRTgvLJM8fKE%;6><%0Gsx|ssYwS>KoH^P?kRs`?ue1TN zS%l!M_2b=pnuqq)q_sb~8!1K|SGp&R4ngD%Tc+Kk5+cB99OU!qAkG@C=MgOe6>Yk5 zhr(qY0@nJ3U?kcrp|Ii%X5l)QrtfGEj9V5yDbqqNm(aPnj@2jG-qWafAsBla>P*GjWpZ! 
z#q)04tZ5YWwF)#u+g8RtI*dEBNb4?*RFc!=+%Rb1=^C}YtEr@4bvJI=PxFZZdAGD> zZ|r2$*+877!^ZMH7eROT6!G#u+{^pgto!?rZf!BcgA4&5*%@}R&+VH64gg{d+3xpp9c7<2B;xRdOE{fo5UfUP^jR_+~vh`{gfZtv|L3q%IZQ&M}pEB2!Z=6o%9l%_piN8`H`)jgwA+S%gyO`wLQ4|*CZz%QE9o`l7)x0O& z+^X58zuOAL_)W9&iFVnAwwd!QRdj6sn>B?U#GTsryxx^L*yuU4;c~7Np9zfc@n%l% zQegMxzI#rFR7KS`HDgY(wT-?aP>Rn>QbwCWU6~m#=MbK4(jj+5<$DyzsM^xg`j?AE znqH4kMuLb`eIK+@ag0^GSpX_o|{htqC9zZvFv zr;J|V2v=a+mDyMi+iQXaB8++6t@bj+yyv7pJZ#Qqj} z((2mDmO zKg-tVRrecRe)V2~_&54p%0=rbpI_!@gz!~FZ3UZQ9@@5fdlh(06|6juKH*y}N?4bF z9P^M?`rumCGpM}gW5!$RR}!B;{`LB8C;#utM&pRF2E#9lhNb1RkJsu(bOio1O!mHA zt=j*Ft$N)=c(UI1r&Hi>``gT3QKRFNud=uWNigH>9UZ~VZZ7Ain`)|8Oy_&Cg-^Ez z<=+dw3hyO9-5%AZ3+{{?VV>=LxTGue@6$EsXaA-g?+fit`~8xlV#l7zQ15QN)|Q-c z^-YR+6!qvCjhdm``Aa`}qqm117c3?AwMBL?{Dop~(7+tFBI#9{;!&F5{zea`FiOoC zI$Rh)erlHIe#BRr8hmi0Q1Z9^Ytt;prm(`hf9($_#!DP?03(4kzv0fvF~Puo07;h@ zls{K89V=7>WS#Xd(KL1W1Z(vWQ}wsNJRx>TGzDotv|dEcwj{|5?k33(xHH%P?BbT3 ze?9)Rf_blCj$I`uOJB~F`5Alh8y140xG*|JHHKBm$uRAn9GSIRPrs>cA<=jPa;J}| z&*c--<9}StR6Py$R``IJF4!<;|I*#M*2nz{>&;$M+@tnjn~9uWDPA&tL9-nnuYb#% z$Bc7Pgi)PTa>>np&nJ;?Y`7`3j&cok+A%rh$!i;C;5o53l7zu6P^CI3Z$@P1e`}xF zR>H`}m;KSe1c$n+9G-Kq-pD~R2fM+={;2g6_V?==sD#g%t4~nN;t>oTW;S(bf4(@J zDhR20Yb&=wE#+lZ1bmctF~gEDz~0nt9#ix;W;h-0&o6q>evSRvPwSNxYOk!eQYn+Q zMAxu{*#MR=G_kLS!nc!?W7fTt8^ITMCU4J3B7Q5+khT)@!6e=hA>m1sm{)UZV6>0& zc*ZxmYK-G&L(Qti<@!|Ma_*llFk{R|Vte0QeP5!cJow-(hkE_y^ z>o;NGx=sCauD3U?{l5NX#-_QGt4iQIgU$vT8BstHoEz^|DsW|Sr-5tg^?KyIK!o@F z?))06Y|>9ePbk!b{|B>XpX_hSg=VgC9p zN-8pxH?F>rE8s?oX2ODGbNxtkp!?4=UtaFGS7j}2gX*rs^r zk{QDGzy6?$?NWrMjS|8e^_=A!<~v6x6E#aNNDbHRbwE{cu{<%(p=*@)d{aR`4}!nj zMK-a)xVt!|)18kkM%uuVxL~9l;^J@@?=tRh;thv|wt08{PEx$c3@!mJUeMYJn8e+T z`o6bY%z9nqEbqYv^~T!g-YXC5#&?S|kV(@G7w^@c|3+?&ytqJ#2I7JmEt8Cf-S`ck zZ$199_9l|oofLq3%l%l+gXLu)W_i4};L_JbCIQG`0QXLjY}txYzdGYRV^|SoZ&__` ze|TABr%YxD$LvGmem1%qZkIy1muT;N0YUi-?k4F7gb!9`0G9WMu!&?2`l zE+l9Z#c|_j<~vdv&iK7-9x^=MfPb7|tSH^d;$yln`xRl*Ie=#iuX(p1aKE`7Yq9A? z&=Qz;F}YT+l3~=s&9SON8~&b(w9XDkf9yx;^w+ZbSPSyYIIB~lOV*Q2+NmF=;hM=h z$^kAVjRoV%Bj;W+p7mcvu9UT^9ArcBt4I33Hq?8HYi~W&#bOrwd&i!LW(0jQ@S`tS z=nEC#{|)`?f+~m@B-Sm$ukn84btwR!>0~XAt`OoMx4c~$lN)}+(t&@me?}*;D#32L zBSbPgD)B?{NbEcDPrWtcI$gCaPtObM?<4xgifi}Zz=T;|^lB~igJSbhamW;x6dcWx z717~#;bao6=8-2bumZn)(1H9k1Qi;*fA0b=2WqX=Qxe!d2&Y6r@w~ERvX%!>teYtwPa|(16{98fpUuug z^7KFvV2sKfcHD(EFc!&K19iYlQr~R!$56Y9r60#Y?z>@x|3ZRet6l_3(Jy9W=PpZa zhzdM*6*zK*#!-bvGi7$?c>=|R6G76uq8~Rv9KQP8$zA+50Q9V@WNZy~N`x&@oY`s~ ztLF++A@Nhku<@>tTco{yJi};$U@Qr99nc4ikUPaf-{u~=m_O>ri=ViN--;D7FU1b8 zN*cznrT}O+YLkx}pBqlP7b!JOhD3v5X=kUbgQT|RHy@LcVfy@S^D6!*sZV2);Oa(S zHwe?16qSVVq_W$0p`uF}-NEn|*-ZA-@L5sBCs*t)nYja{AHu*yg|UX3vD#62Jn$?* zAe68xRu9k84KU*IOvFvr_wziP#Tcp!YjzNZO5*JiX`_#eBD?4Ku9qOzH?Wq)NMASP z)u2DquGj9 zt8xqY1y&IJc?Nb&7MV!fTs{MPhUEKOiz#8|a0Fx6H!;p`|2W4mi^15L5lj=AN`j*6 zUA3af9qQ2`2tbGZ$Ux?uuvtLzvPOfJs8tM=I!jzFxD09)nUFSdF38o-8{XY~ZwQi;eNQRTlb*jX~n z@~ZM)F{}n)GZ&evLQ6If=R8UTY`dsRxN{H5eCA_lnX>}BmJsb4zG+tsfEGMg!IDhpFf)OeHSTWi8NuYPnd82gpEE)ZQ>dHs9(HFIBdn(o{}&v#oSa^NYoa3 zs|Xaf!T!*a#RlMX878i>ht>(acE>bk37uc+urk;Ql4%abG~H?ncE z2zO^Ure@AErihb3WffXxt8~HE5Uq~q{fnG_VljVs`btx?zoL+2p^nBBIc^5_ycW$2(d?8A+}wF+<&6zEr*NQcFdcj zA2_c64Mqop(Hyrl#n&NYo7Zf+>>Vfn?Cu6JzHo4)4!yvKd)tN|`~C{Ti`@+3ZNUqD z0u64E`Ge*$+xgfa9#JEuwp~$|II6qi6z1MM)J6;`G6lM3!(t=G6+Q}SdRPXW}!aST-iSrURBRpOH4 z=F}sfE_I)B7EYUu-~g(v-INsnPt72NvQJlq8}-JdjPIliPbx`?2tthGx#}@C}pi2dY^szv!uh?*I#@0wjc4? 
z5UXrl3_bTzDk5CtAZQf3P?21W9}B0stQ0#LszT(w*9BFI!On_u(+RPhK_b#s&4nW`p zX0oZD`UqCysFNCGH-Md0b^}151|F+=FChZ26ce4m8ZH5U5w@v8;0yoW_y;yX3Z%dZ zLE!|dIvG|120*}kZ44C5KnToWCrsd&eL@vkq8Gq$C7wAbHU}rF`6jYin_ZSJ#$h9D z1U8~*Bktjus$dfIp)_p4O#A^*s-PcGdnHucA8ertO6VHuDjslnCeHOrO_3;O2E(co zDh81_7!O;m>LsG;bDN^+?o1u@^(b0aCK@;*r$b@}UX}L5f0S34mc2#KIdg;z7pYXlwxq#^HymnFz9vfbzi&9=$)L zViF+TUnb-o=#wadyg0}K(<#KEg}iFKA&_c96Ku~PbYRJwJQg+p!-60Y&U+<-fDf2_ z)=8nUTVTHZA+ntr1XM8yBB9IAyBBnTD0Tq}Y@?bZ0$Kl{U>Ex6oq~i29;6UVXbBMA zgmMMcvt8SbVi+tz#an_5lwc*4d?lKk4~X3#q?`+=U?r&F>YAJ-a2>KG+aD;~AATLb zt$f&tViNoZlU6~(lf_}Y0bq~>wIAdR%;X*p{g1XOgSQ>xBOWK-AqmdXCN!Z1=79`0 zAQH;GvBCTu0KfY6k+1YTmfgD$w2@N_dy0WAP97zG=UV*9p8Cj=7~ZdlH<{(22lUD2pCjhi~0mKmcvZ0;W-*%=^Nf3 zmj3Ar-|(-YQ*z=WeYGFnVIRVLCFWrsbi(9s^t6%z~pZLF@`@x^~v7h>Zzx%cS{KH@O z)8G8XU--#?`?FvD<-h&cAN|F@{DprVlph)LANc{o-@1VV1rl86&mcjC1sNthh!En# zgZ_{qi%8KSLyQYIdh{q!qr{8}OPV~1GNu2@l`LDjdA7TIXYkircGkP7b^PjM68-B~nj z#J(vH$4vM&@6vLg*3N1iGV<7dLI2JW{5jR^bhnRhiax#i_3Vezk<>oE{Q30j+gIOq z&N&++`r9}MNq_x8a*lu_L~+g>2DblFjDh+^VPG)<3aB4{{sCwpfdbl)#yKjeV8bBi zFkwT9Oq`QRi6v&TpNbt0qM{89#wcPMF&2}eiXxI|B8oAx7-NSYhB%^%C+4_fizU7o zE@9`#%U**ZZcUWo=^H|!<0|*XhWfNYWZQD|D`CPh8toiAcF=X zsGxwwwBq175DLg?fBG5tUx3Bk2Wzad+7}Hlv)+npuDb5((o^UB3T&{#4ohsY#vY4o zvdS*YY_rZj3vIO0PD^dI)?WXMZMNEO>+Gw%ehY56{`6vrxaOXVZn`od(+aolz6)=> z^3F?dz4qRVZ@%Yvt8TykF7-_k)%;6v!3M|5uD%K{%y5EKb>Kxg&+-Rl2E`0pP6h}9 zJIBLmNL=lQ`;E|W$tIs1Ex!k^%yLR!gwk@%GS6I-uPEP)GqoLiF^38n5a?_SB&T69 z#u^7pG|x6<;Dxm!58_A8QcrDi6lBqiHB3k4+DtQ9e+@R)Hd9S@*}-<)ZYxZm!)FI0 z5ZJR1BNQ`83>~PD%Q-?ax$S?%1f`esafY%n?HdBHEzB zApM;KRt!-T(*YBH9KAQu#pIF!6JS9SIMNw79)-owOaHTnBlvN5Iat|{{pMIjk$vXP zXJEtf=4+fj?()wMtPNh#4z?Xg*x|LT)g^35Zn)+_%5`8>>TA7K?l;%I|^ds9MDq% z_%vw4Vr0*1UKsxb2nSa|4`rYnWAs89)fTl}sZ+>)G6qrCmPAXGjhPv0`oU!WQqTm@ST?cPbbK6|RB`uoG+04JmLHwBUQH)}8kFBh-EK}HXO+p^=kpwq03tE~5#u1AxZCWSTF0%oS8pS9E;9d*c z*vfXcw5_deZ(9!5x)!&*?X7QbtJ~iSceuU{E^&{GT-mlZxx5V)b75;-+Dfe$3Uq{G z5EB1dC3a#5@5ruqS7w$GB-XTwrGYrCi&U~*Y-y4~Z)&HuUd6aqz59``#@;*K`T_?M zGAdhq8F*j%vKKu3#czJuYu;c?<$m8)Nk2BR3z_IcA6rApJzS@hO%TBf4K_(XT2|q2 z28O&Owr*R1cC!hIBN$Hz#(C|6Vw9jb2Z*+kOCxuBBmf!}Z?BNefxd#-8U<)jhqXjH$ z#}-%`*71_Kw83zMkXxJHl0m{Qa$d2WEha_-D{Rkg>2sa$9B7jfx+;eLGoA&luSNfh z7p;eWN^2nIVtzZ2UD6s z1SGVh!Q_yE36x;#g0*7|Rj>nwyEzro&UwzkAZzvxG-`RZqJh?>fwC9dY;in0&-t2m zIjVha#b$fk(TJN+3QxAyWwqGdDFYU z{>HbyOKV-Lta=x(Sc0n~DHBO877k;!WcTNgh;SV53AT{T1K$!pjY|H8L+puT$KS<1Oxxb%qF_g zvxRh}Go8f2a5}!CzVx71ohnbq7S^Xu;y2jkV)y#`vbX+}u3yXSK^J>l$ZjvTlXvS; zSvV1@00n5ode;6BMHOt}hK;Xc36q#a75wpsCD@@5Ccpb8#<1~S1U@16a6IJwQHUyt z;txfXNk7IwgpYpL1Op=nANUaLeegjO>8ipwSY47wLR^SbF}2X=Zf9{Q=y>tE7@NJ;Sb-u#qT|UlCS(j>;6E-=N9s-5`5twdIal$pxU8zec(r*_|$j) z@3VjV?RTHs?+QQq=bZ3W^P>txc*QCJ8*#pG!3~@5flhxh*nX8=;PPz- zXjV(edrg=aBQGu0xb{= z{=f$<5NtZ|4vC<1=K=*=unW;J31fg^BxO=Ugca`xh2+HwPgeg9t?+%uLMl|ThO5|$ zPw|SeSS+*n6txJ2x41^Ih%LOB0lt`oz?h4}_!`DohHi&i*+PuWsC>vMj61lDt0->H z$TrY;jY%kESMy!>;BbvZ6LaVfdSD0szz<$i1*_lxW^%RXsksq0uVR$T*sWFcEm}ixli&>eT`I%-#bmrtN zJZO}*2b7^XnT$!Ar5R-91TCuRk*pbgobg@bkZXIG6!K7Ekr)&g<~ka86VZ@sulHRa zW{KTZANmlNL$O(hG!=#^n`LEZPp}T?V{~ryZn*iF*}0uNDT`ErSl5YG;>n%M;+^M- zo@S+<&oYFc)oAH@Zf>@Yh$Sr*FdO)H52eqsJpg0Et;jECS)U!j2vU0jf!o52C3EfD+=^( zlqznP8mXB&shsMlRq?3~^AzSKs-PK}8~Oi92$ca~`lFS0sraR-sG6$SxvIt|MpOZ- zA1S6@1$y&1Kna2Zdm!D*+Ega0c0mcw@k=S=9_% zP=^T-4R){uTQClv73vOo$I-r%eC!lXQP{{b~b6D+o_-%x~N;9 zKv}jM)d@CtNcyk^TqZmFUrL|D za9MRo-r#D!2{!x?2XXrnG9drBE87A=kO)-3W{D8Ic$yHgpuGtJz#v$?a^Mfd5Uujj zxCv2gjj(YPC$3A75Ws+U>2SG0t2*oozbA~sDXhXP%)+qCv6)4c16fI}_OW2Y4z!@0 zFR=kC7z@s70#uL)LA=00umJ|?4-8xqJAA~|tB@-@9~b<8D(4T^@Miu1c(FGIBz(Rl zthp`h#a|4@VJyb$E5FXfn@f 
zE^f9STV}b=M3pz!GWcKyq;MZ5AOku80>mII_pk*WUBxmX3LY(bTx`%KZPF)=(npEV z;WBK+CY(t@3q;_d8^sR{%`)mR1&R@ge^YuYg28Jv%^6)LYHfT^-Fv4KA*i5UbG8LZM+0-BPb`cjkN(zyJkZ zFxBe;JKT)b=INfWz3isK*(_uvMG+6XX+6z@<997x-C zt=rkH-P_%gnGG)E@W^S*6Ez2?U&Rkp>B%;64M=bXuh$M|P+@2-6#uiV*InD(jo&hpI@@E`?OZQu93-xY4*``zENQV1ix zNJOBhNbw$-l@6VZ1may2cF+Z5a14!*4h;v~G9e8JIN|uC)faB#H?G$i9xEG;5+9Bf z!=MGB1q@?AjzbaGHX#ahAO+Yw32p#uwD1QcyjNu&m%y-i|Lqd5pxA{~&$rTQMZnfgj@?qu=Y5XSRE`wb3I$Lw z2AgmVOOSFzAP@9wO`8A@g#c!)mJi+#(^~`S{6_@(5Di(`2oBy7(O{^0z8?CpWk)XR zSyTVxem>A)&;Sh(2Bl>Pt=SmP*>@EP|Zqk506pH8uqBsd%ph)td=#?PfJf6rRoQIY!dP|TFdN2le*xWKv z%HMtx$MDfXft*A@Nf3|fPfqUwJ?`ax?(DAaipA~#pzCI(0pvdK8(+}&E)@8#@0=$O zpqCEF8A@rf$*zP1-2FqbKVk%P=Y=U3yGlsU{)ia@?|g3EFTmu zUwF-MW}5&77Ptg`KMpug@L2g#fbS1`0FPZj*DjF{Ox$9?^ zyT7J0Q3~t4YJ@JuQ1A-kxNE{+5)xk$j%ZR)pxmNR32x8~;M)@X0D7RE`UkW6w?D?N zpZ04n^6d@)4Nz7QK<@3%{nJeMH^KYuuc0$>2>uJA_t21AU~7I?=9E|ewx<95V|}v# zAxhM>G0iGfYIeroLWT_+%7a2gltYRYEndW!QR7CA9X);o8B(Oj3>n0oM43|MN|r5M zzJwW5=1iJ3ZQjJ0Q|HSj0087@8AbpA5-f-M6p#zqQ2+vVK7|@p>Qt&#t*$INQtMW( zThS%y8dmIBknp_u)3;=oKYkH``7?`Ft-oBFD%cMx0pj z;;<#JK874w@?^>=d;SbMwCGBvKZOoJT3PgH(xpddWt>{I!$jd$zlKfmT9qnL#_(G> zsmj9HwQnzU17!&P*o0RRICntq2LUw&!S*zJ!TXZZepv4s8q2QWYZ2PER7Nr>3uF8&r|utAKpn=e8M zCyXk*%*x{ofD0aIN;CJIqA)}eM|7z^2TyE>A36F#vBiuCgfYedL2`o#J6yyu$JZbv zu}2>#v~WDl%)5ZRrds*%Mae?~3L+|@vP&;T;*m))$DAz4 zmB`ynrJ*hm=con5Ns>%C=d8-fFRuf|j63(Nt}Y_h;xkY|!=$rNLv1osB{Yc?=YpY{ zfb+dWC#4i7I|W@0A4@RhR54(PKw_LuM}3meN>8oyvIj_DmDT?+P9m>?R$XmV!G47Kqnffq(Q?-I4l9;vIt* zW=Nkch-idiiOFJjV2dxtIAiY$p4cNgKbN1jNk!O z=5L!Jj4%6MDa2+SRem{Wp@()kW}hv3C7+{{RfPyB1#W{sYqFthiZI?0f+GP@37NyPU+`+=ts!t7A+HQfAZ$Bn8N+GG>u z83Uep@4qMB__3!LX8Sw!dF5g0)V}T>C={fnk?74otb6e9$KP=Azdj@m`tRh2D73UW z>x`ii=(2-6mSBf5{0}kq%U=W~NVNS~O&yzp4HZb?K(3%KcGkg7J|f1!#6WO@Cq!Y! zDrmJ`z~eQE*gy~>K|&N==^{212@Yv!5fi4+hd=*p)(V%V3q`z!946S|e&kWJg+SpI z7;$1pmgtZuCXt6G0b&-ls4*cP?Hx&Ak!m!dgoX5Dh72J`EUcKCV{E}8F2o2NInu^A zuJMXn#A6=Y)y1Qkp%<$m10Sk^4>=%lA;h2p8SJQ)UAVywYdMA}XyJy5z~dEF@P;?^ zag9Ix;Vk~xge~0A3_I9@lb&2iG&^C~}b@X$KOrhlF+*rwg0d%OuiOQHUi_k_hwLo-m8K zP=#EI#i;H=%F&pgAnWnn@E6w5B^YsI;bE6U1;

`@YclF!d?=G$y#1M-TK(GOf{I?NI8M6d~*Sq));fD514z&p4Af*`;UAF%*IECkJD z0xzV_cg{1L-8{`|Xkr6l)PxB%!38Alxz8KG$3{faJ2H1#DGfD-;lXokRA)(TtoLh|wGg#_WL{r~|nXl-kbjG+^$ zhPFq1kO5YZ+S=V^MFxD3>R>l{Au>qbJ7z71g@C}k)u2MIdHwBh=ULSqX2`I^*a9z` zfD~LHH?l{Zk-F$G+W)4=wBK-Tg1hz&85l&W6QKitD9zjcpn?+KfrA9&yWHduhQfNW z0f!eF;u5F0c^BSrVH5w`AAAVHJ~qA&i|e~12LyP`D>8}?<);5Fp5+k(FEpx zcrOOb2NU$7<>ror30hzRYa$o~R~GsPi2g^Po7*4w*Z>H~t&SE9-5*Eaw??)AOqrK` z9jC^5+8e57!I6W!6xoO0E}{;M#GN8;pZnWK5)F`+-S5-3x!MJv$Af#p@1tsj9u6;a zWSm;mgNHm2OM;BWbBZ6ktvtZJQ1i%l9)&tLeC9C)j4HTS^x!>v1_thV*5A(&C0WPm zF|}3Mr@HYYa762MKfG*bfbz8ul>5>PeDDsUB&*+j@(*|LofBVB40isQjh_J;R9pGi z=UOmy-u&nXs>J`@2XEs~{RHb{-~28M#uG~7d+-Me%wlf;BG)nf@jHSQ_s4(!^}m1q z_uv2j2fzRnzyTz{0yMw_M8E`8zy)N$26Vs&gun=tzzJ+XM{tBU0JV|`z4+skP2)hs zS_dzfw)G>y5;VaRM8OnP!4+h|7IeWEguxh;!5O5%8nnS1#K9cY!5!qm9`wNlac zi|G**XH>=$0kvX0Mo&ye?1;r`bVXgXj%?J%bD+hBu*GlW#9WMxX*9-ayvAMJJZk*K zbsWZaOvhG)$7p1ju@ehp5tMe2D|>VhVw^`=tjFpoi8&z0bJ#*|B*+XnNKOn#hCoMh z97x6hNNt=**r-T>_vv&15#E!8v#f#)gcLYVW1WL6G#iguDb-6qy5{W;EozD53 zd)S0Y@HZ5x0#QkaO{fF~$s7WTON_kAvxG~p+{#<*%KP}rV|+}pOw3bcOVuz-Yg|mC z3`)&B$^|jY%REeEks8zUh%GpVXwV(rSqN_c23xou7s-bLi3VWshC(oz+XAw-n9lc**JH&0wKDt+0bC*sSbnh&}(% zj~s!5Em#Y2nyl)H&Ff5!(1cFL%z&H(MdU2b_dL$9<6yh1RVg)jJv1 zV$G3ZMOGXE)=~8WQH|GsZ4qg$h-@N@UbP5#C=iCIf=y5zrGWxbfQKre$}x~1O=5^i zfLDMO2_oE+#hG$LMr?r`Gg#z2ygjaw9TKI$AsEa?a zg;0rxwQ2;P(pl`_S%tOBiD=h_XolU$twNYDq=nkJmD{=fADTUby_|%d(yM6bDZLtp zLZ|{!AeLhp+v_OXg;2>xh?uTQC7JRE(o36*%~-mn+{(3FnYmUUorFoiM}P3yGsvrZ z2o*g*DS;r|Yu(v|MO?$Y2DuWYh8Y7Lh1<)e-P*O?amn0h*xbD3hi0&g9=(Se8HZJ< z0;N$+>OozyRb9_D)Fq%Cy# zhjc(+mdpr#K;LvQSnbV`+}&Jw*aDK#o@Djvt#-JJcJSr9;D>W6-}f2^7vcwa*kXAQVX^23ebD2DfMaX+ zU=L=9Y1Ri8xeIOn2!0p`Z-Aq8(1&rjlHTp!OCE>GTCaBAV!CMO+zH_jo}P9HVab99 ze!z`$9tUU4i+R=uc*u+QwJ$M~V?zd(cJ5v89pu#51Yl@{+jJpY_=84(qi!k$Bygcp zmf2p22r>Xz^R02AcB0j2ea9QDu4$s zErjUchjusy)dgyyRxDoX$FXRKYtV;#xC)P+^O`Yq^spM8%6}Xlxt?i91$Xc&M*sUI=&)mBP+mhInYk z*c({oT3}e>O=zHq^_(RTZJDNNLk4Na5NXmP&s}I|A4Y<_Sb{&;VN7=E;(lo#c5Pdr zg@fYJg~;ihJ_}k%;;38#H8pDLmWN;Rhjjl~YO#Rfb_gSRfClOjmG8Ew)4qmrkcV_| zYRXYfizSQ6;^2F*-nd4KvQ80z#%9rCi&m`&KMw7GZiZq0q=W{jE5T;~|D1T9&}dMa zyKYS+uH$R)8_~XGg8rP-E@6azX1ahQ3xBT~Hf=W!3vS}tSKt|wfr4=mg{;!yC=>BO^zBhQCYYPjc4t+ZiCt3v( z6=>*gvB-yUXol=P;p1&MEzjQ5jt2EMEqyo(%7O2(kZ;MdUW@7nohD!X?r;9S=8k}H zytoEFW@oi^2nOe3GxvuDw;?}1&zS#}aEG99fLds~{v`Dd3CiE)d z#gM5#VAof019<9!W)Z)#ok3?H2NqU6bAL#4v2bU}8eg=S z9)6xsLzb-l28rWEXFVtMRGJ?@pWL~ppyaqv5K^vy=- zh5+zDpP$Aaan;!LPUjiASb}~?<$Nd`c*u3!*mb;6^~7LxA@v8`^aon#88`w4A*FR& zCwZ&dY3|Go+Hixuu7cs%f;RsS_9rLyuL31w2ZrN#1x_Q2bii!Q4)n?5dB~oud`MKC zf$T8{bTs#Gka+62R*?0g?}meGxvS#IYWlY(_m6;wX25K=;QDxHh;~TSoJ~~fnPO2{ z@Uq|fW?*}$xdsHCc5F}7$%5=gPgGIycM`7%NN0Ptr}(>XxW7*ChRF29NO%+vUTY-| z?nVMt(BbJ-`3EnSP^NK3_K)501}F%Qv##61E--d`}igfBgn?ufl(gd%%5%Fn#N&1W{NovF-=(4tqITuV@f`(r5m8JkcxBly&eiZ+K>MP!gd%*th2mkOF|M6%0k(hVYiDr!$hg`+tqw#b{qD%F6 zBi-sixYukT!GZ=4B21`oAwz@y9zu*LaU#Wv7B6DVsBt65 zjvhaP3@LIX$&w~dqD-lBCCipBU&4$jb0*E2HgDq0sdFdKo<4s94Jvdf(V|9=B2B7v zDbuD-pF)i)bt=`WR$@ta&r%&YnMm4lQ~#>C&c8 zqfV`QHS5-{U&H^7EqgZY+O}`w&aHbl@7}(D0}n2IIPv1fk0Vd6d^z*x&Ywe%E`2)n z>ejDg&#rwt_wL@mgAXr$Jo)nG&!bPTem(p4?%%_YFMmG$`u6YR&#!+!|Nj2}0~nxy z0}@!Efd?X(pn?lB*r00vV)`LlRk}kw+q#q>@WA*`$+CLK&r$Q&L%_ zl~-b!rIuTA*`=3Xf*GcmW0F~>nP;MzrkZQA*`}Ls!WpNWbJAI-op<7yr=ENA*{7d> z0vf2GgA)H*sG)}5nUANtL z#xHeyX>>mUc2Z40RSQS1PPA-{{Soi000051keJ900007 z2nYxW2Mh}f4i69z4-OI#4;2#<85R{67a1EE7akiLAs!qhA|NRxBP}T=EGjB8Eh;uK zE;=Z=cO?Fl=#o&2ckU za-n^4YFT!ufp%`sb~;^mrE_?IUU{X@c|C=Ca$K zVT?tZjDy#XT-lCV?~YAok4k)yw4RWM*pOdokxqP)xr~yW@RC#6lVoj_QlgZKaF9>`ixQ(E>j?1~N=DM7(yV3T$lcc=H~Y1+urBc{O8!^>GAsM-Qw!q_v+#1?BV(B$^62^>hUpuvL(6DnND 
zu%W|;5F<*QNU@^Dix@L%+{m$`$B!UGiX2I@q{)*gQ>t9avZc$HFk{M`NwcQSn>cgo z+{v@2&!0ep3LQ$csL`WHlPX=xw5ijlP@_tnO0}xht5~yY-O9DA*RNp1iXBU~tl6_@ z)2dy|wyoQ@aO29IOSi7wyLj{J-OIPH-@kwZ3m#0ku;Igq6DwZKxUu8MkRwZ;Ou4e< z%a}83-pskP=g*)+iylq7wCU5RQ>$Lhy0z=quw%=fO}n=3+qiS<-p#wW@87_K3m;Cr zxbfr2lPh1&yt(t|(4$MAPQAMI>)5kv-_E_e_wV4ti~k=_zP$PK=+moT&%V9;_q)~k znoqyJ{rmLc>)+46B&iSp1}NZw1PXXWe+VY1AVp;q=-`6@%Cq2v6jnHp6%b}PpoA87 z=%IcX0=PmCB$h~`6DIc1LJu!&_~DB%rWc}y8gdxpjW~KYBMm2}=wlB~1leMQITmT; zbvx#uE@fb zX|ZO8B(w?Vop`#&O%71@NkRxHKp7yED1<;lpMAE$kT~L$B#kr}aU+dBBGDsFqXxO- z3n}W9B+sRJhAOH;J<#BbKwE^MhCtlB(1S$!0RPC%Nb&&K&qXwZFpNmCgb;*5Z3x)I ztRiU=f)xqT(}pIaHj8EtrwZiG3pV;91FW)+gwGKN;;JjJz82&QfLgR6B@H7Pi!4HN zR^sftR@#YCtRWCv5Gid00;xdw)N;qNw@L%aFop!8&MJ_=qe&Z;P^u3k?mkS4IE=Ir ztv~QoT+1+!*gH_Jt5m$oK>hx!$*umV!$=u-=t2;_`o7!-B*y+qP$1#@sWQ19F|t40K(Z`80?`A>DzUV&E4*m) zSIH{wFr*11&^$!1#{wN8ASnX%AfOihF#jSTH$fO0pcDSo8bUwt0Qe_>Bf}%5fXo(@ ztAJ(Xy?5b@0~op80pjfs=Ki7s&N zdY?WFhd2PW%~*sB-1+>`B4!0*Nz%v__5AUtkx7w?0PQsNzQRuF0I zVR9jmM+O_^z!2!ej{XarAq0Y{+JV7xx|5nesKzq>mlSD~O;BkKO`= z0tx9NS1Z_qK!zxwvn_;)0|{0LRzS-YJVPBV02JrQcRTcjU}W^*Bp0>lk57bvaC%!K znV40<5Y!@9o`WVXbs0z6#qL$ItfejeX0zH&kalV~8#e${iwsK0hJnmxEuWDq{SA@bu7EsvTy=np`KV0LN{&ide0;juR8giJ`{14%976g4@lv&)Mk34e4H^ULkqFjrkGMpxkN&rY45cP*d z_0cO7fX!~-v>Oe;hEZQ+aAd@jV?-e_Q4AqV9VF-hJ&biAef(*jR=_Ac$mmRe$ccJ| zlu+$R;E&^xHK1q(k2r>ef{|fQK#z1s^FolkB?1Tq+T;d40K~1*^+SAt7!p4Gwkm;e z)E^$*qWeO6On*p2aRG8?K>^ey>HT$K#CG=Uun<})&IUAmz5g^xh98I!>NIlcx%uL%Ak*2kzjJvGizFx>m7AK!Vuu# zr+^A8k4m(`We({_fr=JE00p5@`Up`AL*NU)Dax>26|G_qXE=fUXI-SA0SU@rH_s6y zS9kc+3uYBw0FC!lnI(Z0_SIcqVM-afBtfvg>5o<9(z~6?pq!2gHh&cOy8_X~pGcqu z4pp$P^W+;Or1XLz=61D+WhDvLsJgQ<_r=%Q!x3JPgCMBkRG_n;KPu5eC6z&U7@-HH z2;#Gke4-3%%+kR~exA3v0C6rVg0Di_4cB;)cy?4%DcV+_V< z){YtqMW7?d1OF07(1JL?!R7%4fo?i^tn{ShQ=Vin&48|wEC+Oiw!q7Er;f_S5Zhq4^U>f1cGI2cT}Q7<1a4mhF?+Cj zP==8ApmvqZdnKSm_~K{h8Fy$Ay0CY?+C3w9`QqU?sizLGH7R9u#a##c5eqf62vRGA zKL5_N@L1~A4)y+kX{>$~D|Ah+TjJ+otdRAeC8xey!J5^$oWKZL?v)0Npf{Led;|y` z(vSm@qc`-xd@7_24Z<>apb5_q0_XET@zg=bgLxe!CvDIm;D;;^cn1;aK+IxO=%;>W zcU+T|E6?B$AsB(NQaQb1f9Z#RAwUn9bOa)J2R1|lJD6n!7=+8w4=sQKfv`s%ggq}1 zUnn3)ckl@-z$`7Wgiz8CvDY9PNJc9l4R(V%$K?yy^Mq084}qcyxpIQtqBkoO0;=RB z^gseR;Dk`939+&&GiY}<2sX8H1isJ$mGgRGmjnAIgoJn;zQ8#&Fe}(YCND5M`u{Lt zM?fiIS30}m3x9$ki_~z#Qz&-BEEpn)kVq-D&?2#ihXMf)(L-DX6^FG$3XjNF{iZ7Z zaEAqlYRlRM(g;G@6nFuaBH#y4Fa+S=MX&r0uA~Y0(o!__;?OB6(E&x4h8~{7}*{G zat<8XkskSxAQ_S(Ig%V%32pO>8JQHFX*g8xAI8rt%rJ|wZ-cRw-Pm^qV~aHQecxpn zOCj5sv4)H#SqrHs`4LHl#!{3em5M?`in63EA?5G?=Dc~X>zp^wb)DzAzxREAO1`ug zOU;xh?U!H$wf{AhsI!UUlS;LZ_KTB9h8i=ab09X3Bs2qG#$XiBGEh26rIUm*%jd;< z$IF73r`wB*c%jbEi#;fGtC`{|d`YD5Wtqv#B9$=YiOa$JSVuvySW=nCO!3a^5+bf# z^c&g&UmB8h@&%C2Z7fF)tWSJdB1sk4O7^Nv6oYx}=0z+Is&+Cur;Ua6VNDVl`k!DmcE23^P$Dg_GWJ{X zr;|1FvRG_bjp$Kc4K}O_J5%#UhdyBi#=5Zvk!nlat|l?1KQXF5CZXGz=wNXF55c2(sMn;jPZV&o?={gG`MJew*xSOui{k!! 
z;7@p4>~r`8n~XfB%!GyYM-n=T;t~WPNw2}ib+sZ^x`#9Mx~GK0_GMl(Oa61aCc1xJ zY#&9ufS$!Qa>=2Lapl^Nu9Mt_h4WChi&6yk>PQ{B(}ntFih+*M4c`w!7NqM|NsVq7 zt|wt`hRX?`aicdrLqi3@$Rzq2rDzRb4A`@+;c&_fbfl2wS)r!8uk-+}BL4kLoDCts zJ-NLj?R>DRUWzW^k4#gUdvp1P=8CH3sz=Q=ADXWoG}j5;ZqU8m=zja=h1dcL>t3*Nn0b!WW2INhC%RZtfqi;EM&!JFyfI(Mh4&VYZ> z+qb|V#pSKgGsv^_eO+wN{ID&I;&%>`-a$(6Gi5WIM5y0+t_(bK5(uJUXxG+>?9EQe!0$ zUU%8V`hIrIed{~-N%`&ef9{FP-?O&vNEYu9I@{$_Egbr%E&L9DYW}1<58@C4sns`tIHB^_e=`+zSirVvhKbA`0`7S-Vcv^XFv9y zjqRN{TJ2q2#VOaZhEpC5C39!$_N|=_5obJ7e#lZs?pgPEvYY$_$_1rkpm~VW_BIwe z-epgZ0oCt27=5C>{{-a8SkyE4Rvl7P3j4d-sijw}4D?Cp6bGK_$IkYP)W|&+egawT zm+~BNkT>YJE_u+^f1;&rlEik#N|7Ulo1oFJIQ!^Tz+m4uukqwT!^1&kHk8Wjpm<&@ zA!W!CTVm4VPDI}OQ9TsGj>`B4z zyg^pVF-o-?Xc)oGjuc@_dVhge?hMP==?Qo~mr5G$<3hYL9lq-MqUdaY66IEH#EX`i z8_G71Z_mC^s+3D=c`jQ_dMrCCrIYeV)*!WIlu=WfKRc>`9eY+#IvC;bG-Zsckl!`S zrD*cJU;p95v9Wh4rFrt>?|a6vmE*H(#dr3{6gj&;c}{#ynOLcrFgtv(=*hLzGO>w$ z`Ah%hCH?O89?l;&FQfDknv)3aN*2I?U1SCU)Uv!I+`Gbrc*wE=UN8?GcJ%o8#X9&h zo#jg1OGd=Y-!+{31+PdoNFD${Ghb?jfp~C$_GIyu<7#FA=$I@I!MqZE4hA1hmTBif zc-jC=+hzPFU~m?qyk4TB*eNgxS_U9M8{o!#)zb1xIF(ame2Soe=AH!jdoQEtuSIp( zmFCLLL`q;85RCJYMNw5JT#P`iR63mUlE}Dn%VX!`D7E?(qA-G9w-n(#vl8RLH z05H2uVgIMkep8Vx1oLhg|KY~n>2o-c`CPHEFF>t*pXT}g{E-1?;`p3Y!8?<-asdF~ zBp-`H=tc58^4MR!i4>`v_LzGs@|@L5@wJ~9^cWV027nL-OoRd;CP5+qWOUx`trx`OJBv7m_u_!q)TzK!bO!)fj~9#JL9KrQ&Epr70p&t=1pSi&qxdv=5kdlD zYeAcHz-lU5vke5`L97!n30WpIiV2T`peKvD$>3!?NCJPn1aMHw21pD1O7E|bk9pMd z!p!1#$KO?)pBta}w9dJ({spVR22f_P&%uz%8EhK*P!Yb2XNORhuvVW$9!`to9pkMR z2LRwpDjVPnsjNj_p+Jt~2iivzh}gzg)O<0T4#LhL!fKb$zNke4h--7p?9^>(t%VaPMreu6#MRb4h-x&Dwzi)!Tq`kZH^Acn3j=&}u6UsLEo&5$J$>f=c=3~oVhh`EmSHFW2S3g(0F;f0`87Nq zct8K_jqN)6$ZuWwQ-?BoW*M_0w!b1`^=7GcdKvv*7{4uCHz^Ek-~@k|O|F>h{G25L zW@L0V0PqmDMF2GR;kSi(V37d`bCd*+7Z=ShgiIE*crTV4>=-H`lTm=kWEuab;=>3? z{F9ZZe}69HK>fDM0|q-zM%ymRV4F{CCmGus*M?M-w~)qLIcM&9ihK~21#xZ`3y=YG z@1=9z8`!*MY!WNb_EA(83Xpz^J_JqvE4Dr=R`3N3@c?Wb1x!N4n|H8mDAzt#v$k_NR1AwjN8kx!UKv$T5Ae3_F%1|615J;H$g?;XQ z{RBSoe5<6er!c_0cb)>4Ge(`LDP8)!BmDJl^1QjR#HbJB>7f|6H- zorD=Q1%#Hth&HhEFpyJKmF^m-^8%?SP0dFG{jfZ|KdsbtSk7}DfclaWHw3c_SQ%Fz zcjB&RmR>0fsbQRMr~?VQF;4%Rt@;I?V1CE4Khj;teph|8fsKh5s$)m$Rw#jCE#m7Y zNVO^af4!Ppo2D33o){vRwfavYkAq`N2g)Pw8N#zo|X4*xP_OU9l0M|I%+B2>uJe}V5e}ItG{t{ zXhl24Z=KfuToz}myV_aWfq6F-_tK!hy&nXS7D`{bN%?^&N7k%P+jm!#EaUAgC0Xu{ z>&f5WwzpAx8}Gp7R<`XxS~Yn(Owt(Yr=WRc&)Hc~J$K%^$bN};c0YT6$Jx_io5$HZ zd1uGPm&SYE&Mi)3*EO)zN%Or|&beK;(BrJDJ1O65*FB=+?RgK5>KnTrkN13!yAz3Y z%S_f010{R5*O833j}15B_6VWLdpFdc1relPe$TC=NMIBOsBF6QzAP4kB^9i`}F2cJ~74db`d6C7*N$0VqHxNFXtIfTC>ZMaTeQORO^>o)>&6 zOuir{kSHC#NGVg?i=cs6Nr1u^RvCcuk!N)m+Z*~mD5?FT9e8h4 z^G}4e1v)vhj7QKns4V`X;MBF3DTs_3k5gZLEz{9L(mx6&R-vno((FMlSBs%7XyEwt z={G@UA+hl3Zd2~saSdmKk%?uIzQg9p@vp75ywvU+`@F3}2^`O9xw~Z(sjO1|%wRaOTe?I~DT+w01+jJHG7C8PatV`hnf<;QEM@MN709M5%mTG?pA{&ZYOC-K zH`P!^20V0)<%jo&Psbn=DS;qyPKiCwTsR}V0mPy(;@Pm#P5}vP4}!;n3co7iAiu*+ zphBdM(lILdKZpmDq{_}l`T<(acyXj~48AU?9n>2$&O5a$$w@%r>^Ayqpv}vatBOz< zIF$0;WcbRPFQgp3CGo2f$029e5D;3|~Ny#KzKcq;e60=S)14>E`F?OdP~REu$kZVCYK<4%DpxQXS+OpZ{; z>!1&bm$VigfRn&5zcUqLAOt`JTyeDTBUVuLP6uaKfqC`YEq!;iD$xYz*k@8rxS^2F1N=4bT|4saY%VAZp+X)XD)sgvUNj zWNl|dw#m$gpGWu16kbKdPz>fm)#fpg;{OUx&U>nY46h=9@SoChw^aeCtEw#N)iE>+ zR(}L4;DI4@j{oZz;_Ci*332oVn6Kn@9GRB2PeV-nJJI`=M3A_W=nJ~t_6GJYBDc3` z9OS#gHNE+(x!y$1A@vvo`1*PCV-ZJAbvA%idkQpw4^UJjk7QX60N$v1lREen1$AEy zcFZ&0yUPG*s=fe~NJw(YesFi3_uBcVC%t>O0HfAG@w7-bU!5d$j%eLw&I&ldN}B&g zMM;MSLKdVlGC(r76eV~lkiO>+x`UsBfUnY_#akfgtes+%;NoGOL@tL(QCj8fH!;{B zH9yzS&~o(vYqZ9x#Z$K*^xudnj6^mpJaaw$lG!LVOS}9uzxh3vr|*qCZ0WcB)#LqN6KK=kI91IYK1eJqL 
zQf`3=_m@eRAex)^vkJ;UFS#d1Ek~jN^*RWr0MN`I8*&FHFmzOWpD-S5+zn}a+t90R zI<86vLGiEjj)L+yxMnj{#hHq8-h|S+Jdgm-?2Dy?8Mk_ z=GJ*fQscAJvgfki59>RT^9>Gn5blP~qe+>^h%=L@Kfd!3iz6UQ zxZGj|W0HfxMR0k|3IzTHr6jN_R~bkj2LKka7_g&_Lsq+-sbmcfNFmAg@tC#e>C($y)^HLnn$|KsoO1&5> z(RhHIuq zH3)emD5=}jM1iQ?KEO}OHu5Bz9}e=@nVJ?5trUutQZfu8h&E$HStL=RhIkxe7^mkb zAu|+5Xs#4iM;kLIPc!ETGnW)I*8(%Q8Z-A6GmjoK&oMKvSu^i7GoM2mScII@5xq`)F61(}q~VyR8#AF)WCwMbjDNI$foAT24L8NCEh2GWv>%;43w z%#N_kNwLf=u*@qM3H2N~%9|R_l*-7TwY;=uS+qTJac!jF(6WTbQbT(%$;RsPY&l_! z)>AQvp=V3;JjJvTCz(UIMyu*WWR0iQwL>dhS#~d+`j}!>8E8@)|Ga1Ac})xP#t`Br z^i*@r3s#;N7~H8x_*3ecnZ1cG9#O2y0!>O?&BwC-CQ6+Sy!m1+H;Vo*~1ltx*$h0V~z=}|@E7{ha{7jN^V#^yz! zToT#jX?$K&izy=oB$I-$EwfQy8+#d_KiOh4Wpg^<^>b^_-0@D+cj~8u6o#fVFTQs? zJv(;#b@O;!_vz{QQ%B=9r%i2MFndhq^lg{0wbR?DO|aJg*_f+s6PJl>E5y;I9^19G ziuTOsL5D+xp;OPwhJJYFZjRY*>7NM_9DN*b{wQT=2l{lkz-GJV%mI;4KGPxzIwsFE zerD?BK+Boqcr<7nX_t~wrZ5RQO=9sPJrf{ajwi7elK#(^yGug!l91yh)Eo)DPU84W z!l3LpMeMi~?YIr>cuw2#dfD+s+VQ8_2^881*4hc(wG-~O!;agD%-M;q+ll?P!=dcO zMeHRM?IjKDrB2&Rd)c3ev_F|@FH>kQTWc?O*IvHYUSZr`an4?8-Cp^xJs#ztBI2N` z=%8lcpnlrnNW;rPGtxmT)j_+^L8sP1_pXCpuY>-$gTb7G;ktv-Uk3up(OAUMMA6aI zz>#>`(ag)yJkrr3)zPxh(W=(*)LlpGUPqg8$J29;w(E{({yLISPIe+r_KHpp22PHr zot(U!oFkoFQk`52o!n}j-0wPh^g4NtJ9*7Hd9OS9{B`n0Is1t?`ztyJ7&r%>b`J7# z4vusVNp%h_bPlU^4!`Ssw%0jg+&Oa2IcnWG`mZw?@trr#&ijk~R_646vlkOvyQ#!zO~h<67R!9RP<&=29KH zS=I1hX)?`RJIkCvE7^ghhi1y)!CLgJaz%ITBv^?CSgo72;s(ugGbfShnvTn^+M$J6 zx+~D#ug|%2i$hBdShVn9-4%o`2`tH=5-A_5#$8MALt9flxPH1z!Xd#G9?_dQZ5nQp zOlW#{w#*7R+VVqZmdC(fjGK&`wPj}SPR^sZA0MtGo{qB^$hh?>7LegNDSP>&A$f zT|D?b3MM5p<0&(frtNjH5N6cPssN|;!Lz3esq`Jz_RB2S43M^Xs?uh@19N_e3Dspz z4{AUHnqaz%sPNT+5`%Vs-1UBf-t~5Z#%FK{g@~cj}rNddRwX=?P za+2vR{&-gFZdQMKRzi0+(+uLqgt_59tSB-ewoH~N<_@o`U6Y%nFRNNE6@vCGJx5De zL3rcA26b7b6=9UY6u#^Asv}f-*NH7lRtnHD$=da~EThW`%cL{uyw&G_(VTz^Ym^aOl z1^gvfavl|e)h^AbE6r$l`<>W$-*ejU+s;g4cNVb@qBTjgB+n)JdR3j9RS_*nq|a5| zU`d~z$9FSJq;o`DVfT2w(=jZPbU&R9nDw(j$;qXXFH45IE){P*q`R}*&e7@$=aiUk zM%`Y+S>6Nik4z0}@@5Xs&_h8WIC{nJ$=<>lbP&{h-d8l(U-PR|@R#Ygkl2djr-wC@ zMN7uJ^zD~^?fDWXpY*ypl`h}lkpOA!;4`d2cIMu(yRbO(<}9 za=?Eq*$M8a2b|f>Ns)2Wg?p>5z&@Q`wmt84-utVLQs_~;S!h<*Owz`Ck~T}>e5f@p zClQyU-5soj%ijM2b(>x>#}{Co!ISIxXIH>9?}aSqkoSM*&tLuAmlf82IV`^~l)4T! 
zptHD-KuvdPmcH3!Z59C~Hz}sq^X{DTOUS0(Z<)Q>B~Db2;H=WF%&Hr-@Xe5FFaO)3 zX9wE8r~LasVt#Eqakh&O*h-*2#X~}45vkmT#9&pHA3qfENu^wW;s95z1G(av;Dd@&=%332%F?XzPh=V zzViWcf>jc;%3#7IbD`;QmYKrHK>|b#Kzce;kIF;~@B|1W5jJGLe&L3f?1Z1;i2{ce ze>Da;b|vRa-}>w>V)ZS}cVt0{uiz6a(#SV-vDGDw1XkG0NhMHKG_#c$-+c_Tl8QNbf7dmbDU4}vRNc<`B z4D7Z7Ug^!6&h`R+EOmUzTzVSW6d9RLakEbR?16(-TxQWR{61)q#Z1j17f> zmdo5OpEkUMO;!+EB&q|+(|dXzd^1+zq>BOxEJgD9_7-AWv8{3xBxJOuT9j$2NwZqc zOqBW9m%H_(n?)@&LgP0~oihLL?UoFDr~U7ic=_V?#`f97&z2S2x4hzGYR|sE3*+zm zz^g?a>y1~x63H6Es!O1yPR3YI&x`){QuAebdOFOA0Mbz=>xUFvpjvtdY*DPa?L5JDaC_vV2Gp13s6tuAo`fUE_+!)p(#nE z)Tq3J=A!Hwi?lBxY0EkP{6|<5$Oj9{Q;W*kE{x2(%PC5UX=nESGJS48*Gh93Nl}F- ztFAz760`jGvVX$h>Bp6(3Za|0flPLO)rgf%@u6IeNGo=2<)`EbUBaS7 zA?%mbi!$`jT94fqoT5sm-OQN4@BH`U_8fn$HNW)g^110-VikMi>mC13=XWE%Z_7?K z+269i8@?m=uEqU9$y!2dMYFNlwU+k%)oT6LaE--3tnXe`$y<$V|8!id>AQd7_pkNU z-^xq|6~?VObF_Qqq<-vZ-^FLGJ_{qrKM(VFV+-x{Dy3hYox74>m{U53QJ6q%O zsrvm{l200WaWs_f9xn64H*VC4K+pCO8eDDi}ugA*n?B2 z2W$iPy0^wvyL$XDp1kV2I3htq-piG&Nc6ub``2mnPt&3S#Q;sAn^roh%R$_trDnq=1Q|gVyU5EBI>M`aHW6E9Axt}ExEAmcwT|F!r zr*foyX5$a600)%k$vu*}{%K)Qn&10h&>8iX=TpnqHVP_=3;gHLf5RQ=BwmvBB>TN6 zDZKsYJK_EsXNB~ewnr;Em!uX(&ZTlrf8hNG`rr_>D4oH~M_v8=oI5{7N4_Kf)RqY* z8Rf0knP?W~u|G{QIU+oXpM$P=EPoQp}jJ6j0BKn0p&uX7(h$Y(3w zF8boHGC`Ej-n9Qs$X{&}qdQ+3qMvrej$;f{nM@U~(__0}7a9uFQ=8!Z>d;z<9Phhl))*-sRzb?Hj%|**b zo_UMQQQ0`}gELY^U-_)YNA5r+bdT&VAz-DsqJeK8z1Jaz^Tbp%3vf{(wvDJh>k%m1 zYmM7H9OCM(^LzI)ITRI|#GE&W*>*~b@SWBv;%ip>tg$G%mn6?Uj3r)(GjC*{#-A1n zRXfAq4DobO){^^c$f z$bdCdKBM?qy;*vG!Wbkcv+k+kVobin2vtCQG*|Tl8OmBRz+N?yZ-}e6oK$H*UB+3e zj^Y%}x9t!(Vy$A-AEH7t zZt0IE*`{pMO8NwHbzd-{pI*JW8dO(js=k1T4qvDnoTTz)+f|L}&w-kOx8}&={VcSs>urZD!@fWtuT!!d6Hyk!oXdoT}@usM&7E*QJifq$KI9y+4pxe@w``i? zE9#+I)7wJ7yR!v5#YZ2$dn*<-2~nHH6wq2gzC!9P#zXB?y*&vUl_yu%d6LzKP7t}D zXsoz_(@V;vur?k$`-bfDF~~-wa$-N{C1#Nxoq%YBXBe6=GvA0V+(6c|^52WWL=*dk z-^&O}gO@qfJU{cqz+r0dia893{m6K2Ips0TFj=6U(|poYVAfTAh;N8r-goGvw*|Y< z-T-^SCQOn_L3vf27WyWVo6!H~5w;G8Ng#bRiD+ldlKBi5hYxW2FBECk)giBJ=Lp4O z5RlK;JFaO{#gA>!nq)~no1eD`Ts=&^?cYy8yTinn@))6`6s```&3>9>k5{I`nC6b49N&2~P!>gK9>>qpZ{H#p=b^rQ6 z2Mwy$p)q3Ns>!9)(O>2NWn_9POuXSg1<;w^#7MU7qVkvC07X*Em5x0FSH;w*R+#3M@n1l{^a&r%p!fThbU2*rIJ zvb#+GB9D;bFUDA2N!sAJ#(f>pqpdp%U3{xy)bncU#E5HtTD*75XP#ltRq{9fPy_hT zW$=E7!ov*?mk;*1c!Iidgx~G^|NUh*wH#1p=Yke$B%Um~ys5EccWeYK>Pg_YY=#Zw z>Sdze$&hotwlE4$F%Betv}JOo**aKDfp7ggnq(EHQ&w~ljfisX3-|$*xO&}>xbTqV zkDo*ns4_3bUL?*TJ`mXL1zzqr!~3nD8_?f9`FEk76$VjJB_sa?2*_jVdadNYtHJ|$ z4P5(w;UiL(5LiPN{#@n2^!ew>ytAgqXB+>G3&UF2mWYx&$iUHbg6Q_S3#8v-i#R z(o*OR_M+ks1KlO#W{P6SN0*_45~RL>W#N4 zCz=Ms_D%KoX}kwC%Y9R6=9LXLD2{zG<*)aq7ZH&=Ws4CDadU<%)c*22owj=+SBWVqgT8`A9tRVfbDYYgcNWPnvaTj=|ajo0iG z!Qk>agUJjo-wf$B2>k02SQ?nq+okF6dK!eE;jieQsI4@el9my7DLT9ACZ@POAI#qVz$U zjgleGdSncD&EOSy8B!?io{eP-71Oz7uEOlc44ok=Tqjx?P@#1J^S>=k5+Ty~5kDht~2A6_@xHho7Z=c{^?H0eqt-?EbN zHSjf7Kn-ah48G7=ws0LU0dRp|SY*qpEwnvN2#!W1((RFxCfkr?DpZ+^c$>fzotABg z%l#*nsZXjGQYac(fLJtVSRMLy*=e~=j^GK`k-J|wTfhc^ zVtsi+);IaIN#)f041A|6yhp&|(8-s@)GdtHXtO-}@|d2ZFJFH^{jrEE1y*kIyKHY| zL^mBTgBsd=bKj ziBVe)YI(&7JO4E#%~eqSM2M1tF|>aRnaSG;uuqvhJD~9mQG@)0hr=_?oDrTo7{2Z6 zTF@_~7k4;)`%!mI`Eelm6`_+$XU9IYeC$)4{WS`bMj7`Ks&`KknTn_Cmg7o7OomhV(Yi?jkUC>)xM48uSy&5l-94dZvNZY z^xoK*-#C_Nf5<8SJg>BMBKl|f#?J$#ADYoSCjV_IMgNqD{?=u>stNhs+PY>K{o62l z&;GydN3DA$KMum9ciokLU5#E5-7udX8*5w}hcF%k*<}CG zhW@z8k%Q+!sBk`M6r;&aUU@S6SrxX&c;LFJ+7Ouw zLKad}5jIlc&51!jiQ#5Tra9}J2AhVAc>#h*k3+|yC2c-&r$(%9Kt&3$R8S9RKbUUO)!x$-NH+P)mIrx+JhRBaYCLQ zYWiyGk{zmQ9cn%u>ONa|{wdJ8%K5?4Sak}wp>ggQ@MIGqw4xQl+ zJa>mmew^MFHT7$18nX?2e&jFwpFz{_2oOY 
zeozfQZ>dIa>n3!n-H9{*t#(Q>-cotn;%tX$btihI!}QFywMQr6LA1bq$9kjUEt=wWRl6cpnT}Z5YxhLrRGsgz1GgY93HM#nHC%JhS&T1UY|6)|J!j2(J;fY`FylcW0-hzX!`MY`H5-z zpX~Bi)(p_<3b-}seWXDgp@~{)rb2fELXNtEqj%-`H{ppHI&Jr5KB>X9A+FUBSa+<; zXhz5_7w`hLNyIRA%sE_Lw`_t$^cADg6?4k zq72}TFx8EKC?p%Rk7)89GBO`iuw@hkyes<_gF^v?sng1Tkmvzx65Nb*P=Nfqq?J(i zAb)u;vg5qah%t%`sICI$MMtr^L^6t$p;o})Iy8>|{UAIaoB5q0{yW`_1nGPTr z&h%ISjOE@sSwulpKqSZjhb0I`Nvw3pC?D-At=1|1Z?EdX!%OcU)+XFX>*ICIsb}_; z&$JElGV==)V3$5UtXI}N3)zH|j6U)J&DfW=YI?+RhXt7lggtZDQ6!4cUD(rS@#a2`#-(MjL%(dr8UyH z5Be2uMzbbbCmuJ@mA;{ShCbcDkbGYa`BSM8fK`}60}@Y?DcA_I?Ae3iTm7;wN%6Et z=SD%S9C6RD_2|&|q`C?06@&3dAo$XS%7hDd|0Fy~(BxGB!A5q6`2Pm*AC5>1>(FV^ znFF(#H0e!0OV^x1n1ve8fS`OE_{zOk38@|~Jub;0IE8}tO&UJ)r>L~MrD(Ubnl^cB zU$aIp3_@m01Z6)qIIefdzA{ksm?CGR$4dcVQ-Eq<_hdqMM91H_$As2s2!_JoXe1yL z4ZcWfzCL**xHLa0Q0r^B3I38cq~qD2=KMD_xBX;?syzRNFaPzq5C3(2Z0J@E$?mv+ zJW5zgIJcg>vwmeqtmZ-qO?wsg#7l2bnDp%{BJ;&7>Qwp=4yo|%`ZH|uFX7GsZO11& zK1bS3eVfrntI>J@jUY>YviIa%@*jifdDEh7NPQS&0=2QFX36yl*a+)o`$mrjh zL(L|t+*;Z{2vIi!!p5hbi19PYLI`TcR~+9A(Ef)`or#c1<+L?{fv)RH}u&>rC1@LC-CEwzIv@z0#fg&DTKv!c@IZ+(){&!vRmYW3xlE z^Zg6wj=~&TJRY8Z`eLftr9I#wRXUk(-SKW%J69m$ZG=-}+_G+IhnfWay!QU?&L4if zg`LYLi)PAkZST|kfbTZ1vu@uv8_>eun>BIo~&fhw- zQ3t=izw3bYUCNCrLC3e=EoQWqn@uooti=FGb_pZP@pHBLVcrXmS2xIGc!`?g zf!u+jqbh~=p_|EVcq%_%>LL6pHVqpls+EoWuQC_sQ1l!rO?)Q!%F;8G>u>q?bE{)Q zPV9@!+Yvub@rWg<6sCovMo==*xOQa^&BU1@;p`yA2p@EMb zI2|$|S1W2RpB|xDXIN|d)em4tqv#s<0wQ>SPO>MgU#9uI;XYsE5ofUF>XU)G=oXS6 ziT@CI3-!HNtLh6=n~>K4773HSQ|=nm;iTCc^#sK~={5S+*yz|O8(9Y!b8H;<{ z4M)E4`WtW0DuiY7rsqC1=QE037%oYW$P2ud7FBaoB3JrmcZ07`o|Hg|stOpDlI|G5 zTdL8JDR3F(5JYefGFKs@Tg_|@rHgc8-UWpBxpmj*ga(CFrk2A)=QB@KEuTowj{DXS zlxF&5Vxr*Dx1ei%A0F1Q!llp)X=Im4O18@6Frx>4_=Hbk=i4Ni4V*MBsP5!-`ax`bP&^#jY z@MsaHpSHUc?Q?Z6J~1aP_@o14edg<3t$zucXRlq|Uuxy>ITY@?)|kB2e{E*|{pjbA z;V{=QCn<`2EDhtWs&X&pAKC6xI1wF;oDlw3$8zZ~hW)8HDWjYTD>y7h90O=X4sTyN zth-)>*@IwTU@UN_mWv3M*HP1nJVs^5s@TY*y&UDYVUY5eI&9lriod$0?Z zl=ZUf{-Ef^m7GSlMa;~7(`(q6!~r)6&e}b08T&IA9WfHTcSdf;#G|}>D*d@@6Q4?5 z3NA21ils3s)EgUP3JV_jB}&|YrH6j^iCLveC-&$2+H5+EF{`CR{fhgYhN!$TzDH(g zw|-0d^(<5m&MMXqs?ZpC>0**GNA?28>1|xREdwTNzuTnLy&$wi&k}Y2a_lrvDV36i z!rI7t8ll_NmYPB;$p#rQ5Vfl&L~g<};e54AINW5q08hcycDRwaA=@EF=H|6Rhxi=F*(uNm)U136jU9EZ5J>atKL z4>(nP)vCYCeh49J2HQ8Z>^JAU;KzsK7iX64@_X-3>P_qrQ9eRSBDW zSI6Oz`$vf{_a9wek4Rno_)%1<`?2&!pi~3xA+-3`XS$bNA!2q`F5w|(_(t4)fw^xD zKLoJdE@2?kOSZj*C-%GH|NKihejiB2DEY~7tys-792@W#7lu7Ts9Z;X(v~`B3dbty zC}`QvAGkL92&zxd11v1i&uX_BD^&9VCZ-+6ZR?zM_re04pE!u*+dMPb>=8c#x`niQ zlViTNfK;v+w7F5mV_V}dXO;Q1ANeEkXq4YTz&J*#c#59q87L_x_}ohAmo>3pQ-62Q zZa;Cjtl2Y2{!AseU#MW&QNtkTEej)Mv)mNlR|Pyvd!t5d9arp0U)8Q_j+=xPRY2Yb zt6kTbh!w@uqD;Q(4JHklRgYhzQq`)XqGg!{1``pN4LA`kORh$m{7 z6`VXY=XTGZNfC~s7U)TQDG$D3eY%vtxpnFX^U7+s0*3|C58kzxtL=>`ah)Wnlrf=g z8nH#W2leLVlx{IDOSG0RC{8H?$~TGOu-+F(R_^`R)=?`yGM2+0+x;ds@ndLdI;V?V z0_*5|gQn8qlrP)2b=sq^{;DaBSzbvY>X4LRt}?unqiIR6(&uBZJn>sGGkRoGdiC^$ z_j)HTFA2YM2+5QgyFOK6IdEIYgk0Pybj&^7GvDBRez9VK@95DpTmHnDqSON#`WACo zZ&LxeTYhD4swgynG0Vck<;nNqZ9mPL)=29DtS<>Ak_~ritlXP`G9|{(jc#6aQhwkXX{Mglkp-Ron-a zsQBIVKO8xUH$IubFt~tQ(OyM-FXIf^kru4H5i0@GW;do>0 zp|OyFI9$OWK(sMq0T%EAT5fWGc0z0n$4wOv+|I!s?FC0_XY_V*Yy2RzAhCwf zS#Sg#}603)>bFc(~leg)@&DlP{&y!G3d`E^fDd2I(dtXhZ#3; znk#bfK8FubdYQ8ELtdU0rOWJrp}J-p2YWT zu9Kog3Wmfg#5onHVk?v>*Gbemo!8?PuMa0*2!`ViU2@Et?cJq+D-8I`?Z$=$c2-WQ z3k2?|1s3*J^2SainhpuIlZuZy3XKtMV8FW+%I{?$rZc4cGoU1ZiybXy9>hZbiT0+j zde>!Ui$k56f`=Jmb`;K=oCxmELOw)$lN0<60BVdW>bg?4;*0w0hZVbneg8`8441US zJ>HO4A}UnDf;z-VE62xEY<-+pf8|S2+9k+bfR2fXO%~5>y2=KKZKGdC%jAVZgV;Mz z>C}d(Z9VGwPn1)M+@05Q|HUr5z7D#$q&!e3>hW4)>J*QTrPx88G~Z_yDiiKrw_hhI 
zHVY^X+e=Z!l|I|awR0(N*()FYzSq?GI?z7I30!<4svo{|PkMjBjH(c~nSrno*PRjv z_rJC~RiZ5aMpb!B_3!II*PF_}GDNv8RkzsK3IO$h0RdH8l?HKjmktRTZZ#in&44#j z8r(j%w5 z`vsmrfP|8S-XSzWnjoTdLN5Y>3epKkRZ6JRP3TA`R6(UFNE1P&2pXF7D!m3odQ(9_ zDLeQ)&-=aKUVHDgPxjG%PiD=*9AswAn#^4Hb^U)=Z$w5~#<03$nZ2+V7HzK>MN2=7 zr#?}UaNy@BZDpuK0C2Z1Xa0~K$0<%riA+o44A$5Tc+v~#F5Dhax^qj*zYSxls`Xum zX;FD}!Ckh1~^F*lpS3Y84v>&AVyC1p23o&2*~G7w(49#pkKvaNIx96}NfAj|uE6 zDg~w<7bdV&phOLe+nUc9bPI1ga_wDmYk~Gl$*wfhMfFM;^K#Q+7m{yZ_=S|t8PR#p zp#S-{?#n$bnS8aCW-$7j00&klXOHtX3}jV6yNZD}E-)-(S*kFqR$+pDKGO8J_34d` z9B}#}ZTeCqP0qaRyLiJNU9eA57g~%i7%aer-rgMWF;X+Won2!D45yFJr)|N)KcsM| z^U(5la=VVeWO%q`u#E1_u(?O9?rpqERCkVg*Z~x`6jd)Mzg6IMprjl%ouxI>8Z|q6 zCM|G%{}OL0RRbR8zrb05pl1>0_OXw0U*MAAp`^ny$D?+vhk4u5E&I|r3NS1NDtejH z-anf~ICZUEUYq6b^Dhv`b5PX$r0`$q5Z*7ueUb*CkT8Hm7-@5G)BJ=4GS_UocY$7a zh&!H;zLZM8*~y&-*t{TQB|COkt(n>%E79uBH-n|#_Py|(huw-S&4u3{zGoj2S$YBTo%yEni;UePZYL`2{M+vK zw&8ON?sDU>?PabD54x|25U-cHX zF#G3IDc!cA`nQWZM7JFL|N>{+)9lO zC-J90j6U&We$3;MP?z8&6+3i`s}kkCvvBJd59QSos^mO5fSC`utVtoVMs?(t-CM4< ztoS}tzDAXshi(azGD-U16R^CAJ4`~~e>yz?_(Sj>*8(^*Ye!rcyb^plTk^&h9X*5> z)9+659^(V8qM57msCx12ZHG+V{`|sgDN+H9MyyW0V-FT4qR-UKJfi)8#QTd51>9xH z9xzHz$ZpKaiq)~Z4m8TqNyWUHWRyP=R5J6IMv=t$IXmS&PRY!*mCO;_Vw}my$*kk| zNl9%r3-ywD>M)oaLh|IHDJg;AZ@cj1OqOXSyQgz6s#V3{{ot4(?wM5AGsa(Jqz!`P zQ`ypZf8Te&PdxdOQ;e)^-H&YH!se2QH4)>$MDfQ(LpKULu!RrpoTX75S5N>F6iQ3( z#nxa^P&s$i$a7%$g^jWjDT)^lAAei+Bhdqt=0C=MVIe6nFYQ=((UH&ou0b&P@yiv7 z5}Y~aeIXUEdAW+i%a4Jx%kN_1{5`(ZRrs33)UXLPnpd1HNR*$&adIq_(~uSnzk9_k zP_bh^<>d!}vcD!JkX(KIg!XrpAc})H_pX2tVk1;R*yRrDbZ|&BQhE8to39mq-zI0T zxz3LM@#`apinj*4;GVvt8GA*?g+2|}H`q|)Y z9>jxnzj3uNOPu~_6kB}Xn)AWdr}26Z?IZaQI(A$ES6Cd!FDYG7XC>c{QDI7!t6F*g zf!R`JC+H(=B=yWN0eDyY3sBxDjXP|CYf-w{A*We6*yxP&_;toY^W6gOMU@I(O1W*G z52A}qRk*i|9BodyM$*%UT_}y~GyFeQ9tj9g7VvSmVBTb58TY22JLG=CDM-1^=-Kf6 z^x?P=qtX;)D7lJc>a3#OwzZTsd6Qz<_0jT@21ms%P!?856P*9*7DVCDw&ndyYcoeb zdv4eJ90pZ9r7omfxusmhZa}4Az-6rGi{)TSmg8h|11Tw2spJ67nuBs3ABQ5pB@-Va zB!M(!@RAiLF;oIakhlkA6(JcoY358*>A1nd_v?7zQp?CC-qD@WVM)$0=D_|w2AIk8 zB@!TAXeuqdlDAHUCOPRXt(G5|5K9&ouDa}VV}+YGtZnliyh{BoDgJaAHzeZ~}abaMuE zDB{u?W6P~xLCa=)hL)u~q&d1IaPZ*ceTdEayyegL^BgYE9QT_$rAW9O!|350qHhcD z(gBxtfDX(ed^6AZMjm?G&i~$dID!8-(v~lFw}2W@Kn8gTqGDs{=tDHZzf;QvgI^=0 zuF#5@H^xX7eoA0X@IMA$F6vC-zy4U^Vrt=8ce8*0NZ8PsKPBf$q1i z(+&GsS1Tx4>Xcj%=z6}#rfwgIc}`tibh(#m+o;L|!XlFynp zVm6PA%C&1@Fi6Yjrtxoe2SwVz&na2o%OE8{_Q(CYU>@Ezua_am&iwTteAlmOV8IUByQ<7pBOg5Wep#1enidW^=u2wSRV|WC~H8DC6 zk)2+mv%?uELB47r(SzMlCf2bI8RqU?mLePgT8AtVNiQvx7mN&`i;1tnkd*DiLIZaG zG+RCumr5PhZ1NLRLs;Dqp@-A$S9nVfYkRncqMdlRbKyt*Bl65t16`Nr8P>X=**O>% zor_^9ixJ8cPjw90?-9J!!Mot~7cyy{h1PezXFn<}J;ex@CLw>3#xk9t)l`+s>Fzme zzBycQsqPl4O-5BJuhT(bTJ9`0ZY`9TrlAxUiK3%BLi5bg!b1R$ zQqUWw-b0$)346CBrAd80!ybDke7 z5qh-zEGhKYI_|8K&!Y%%Jl;95b0chTp4&m*P7e}%c6tWG11QlLkZ2wd7J~-~qp{Sk zd8FLnX0R?gjG;V_>{3i~Rtb{Yr*-}~W2qT_^(ANN1UK9uriGyt9Vx1AtLFn~W$8vo zT^Ui)GmmNISg(q_RGtrW@^9s&^o$YST!1C9sPhU-Mr$f8!f&O+%aL(2;@dX!qo8Nx*GRCG5fc(aiE z1RNI!5xnqFw1|fc!lBCMl@hFO@4?WettnS65fHz`OUlZ*usUEH_=cZb5^{mydYSCW zp1mA0pyRgnl4+^y*A`xN-CI9t(}%G|qU_nk6%9g0Xl{|Ma959_9i2>-=yOq*mpWzb z!?;d{=a&<@IvpFl9-qpvnDpjVJM^{EjFe9c<+JLzkPBY$0z1mNX7zY(59hA!I@ms9 z)$?Rp&Z$jX@mOW;XE&gDHZrDYVU1#+#snRgHh3{T{?n&2@z2e!)nqJan`l8>#j>-bMoH3_EPoYhG1LdmmPbK5%+ML%WZ<}jp zkVF?$H?I2l-y3>PLs1+xvZkQVHe`4?rlgqPMMtCIX7SXUbi=Yo0%0m}yUxYD--Jlf zWu6OCyN+D7c^Z?)o-}6aZl!W}OC_HFy5$u(Ts#6?w~Q3#tPQ2BJW#)9boBVn$8E;v zG%YWUCfQnEKN%Hp2UEx%(#8h~b#m|2eUkppBoHdp%)M0iNuJm#8YXd(I>xOG@c~W92HaUe?l`ej{Pjd@4J>>8qy$Tv8 zz>n(Ml@aa(?Y4`UoHMD`F_X8i#Oz(IdVmSlOs}XKw*1_Jre1t!Qa?O)Lr(=+Z_asm 
[GIT binary patch payload (base85-encoded blob data) omitted]
zPpbc?j7{ov7usX?8NAel6s`zWQ{WA`e$OkIW<=!uD(D>mjw4p##0*FrCo7f&#R3o0 z4e8xz4jM?`5yzf8DYNXa^{DpRgcxr%l4Xn*zFgnBfs{W5=?t0a^f(&-r;Kq)4DA0X zoria#nWzdk$BMDqaRvMFUc0cH0g$C+l(eQjSB>mR<9ANY4K#m|{^Hxpg zt%K#HlkIeo?JEWomcY2|hl;-zcyiG;5xK&3>8m*q;SLg)nTa5Rrp1dJfBlGI<+jf| zuw^mLY!MS4FRYr$6DXj-MSm&gfaq4m_@UgZjOXkPxSfoh)p=iF5-epid(Qmo{qW_- zMu4x$`*(_+JxohD-MK10i(8I5+_t8-U9W#~buMuFyY#T1c;+geMye<+;2o0I8LrvM zQBQsCdg$aAtGGHSLQ%io>HczUyM5u+wQa(S^OraIG4rjN2a4zqo)7jydL^sY7DC`Q zJagyue)n>Ea9z-|xzOe8^ZT~Xr#wrUbl%dQv)En5%MOg#UlLUB%NdKUQqVTBh& z_zv-GvT*K9E2TY<_3pK~pD%n2cmj-LJjgFF3m^rD14Ze1IR7ETF-UPaRMBmNwKhsf z+lFhOAlQMVqFh`}r!J=O1W%?0Fx43CFCd=$Vp#3rNZheo6&2?41;TigPOy>rv+r zA__9xDNWS%wqP_cni2{X?IpT1f$=KtFzQfk`xZ?O)lhmT)G0Pd?j zV#vi+QGL=AK*e$X#B;jM^ZX-^mEuEcMXeopzjyW&u*J$*9Ypw~UaKQr4deCsQ5>1V z8>jU#vW?hBD=_`T@y;1Cs?PWRoDZ2=;Mu`*R-@VD=HUw%lZSo@kAEkO7&2ypxT}j} zSptPg>V1Q;-J+UtU1|tXB;7F9o~|e99@#ld*enCSq(4s8ns04Nd>Q_UG12VnRtK`0&E25Y!A&k{ORRYo}2Fp)Z?CF0h5 zVwm+bI*62VZ3*lW zmod~HNj+zVbIVk~bNn=1UGbewzvNq9n)SkUpgx~(4C_1G6Z2w$yS|n9LIiZylE-sH zkbmSBm2_*TBLDuF^#Yz#-L+1+P=IUxMq{sY4G`Fa%1Ls~?QYJw;AT4v+`Pz_dr!o? z!}sQ5+-=Q|8K=FuA~FKcu?$yxm}V;to5@9>&rKF`y7`|usLzp1bEIthc)8?k-d^8K z#yi^c;SBZhnO8+vig8z_@xuK8caAT^(L8G>mbI9~j);>(f5@-Vx*he>_`-3Hzguw# zU-FO$+toYe*Y4!{(CDg0RN>`^)j-)Oh3V7%6~m1Cir&ZGwhsM&V&dON_m2N!n^#0$U*0A|Ja1^>pE-OV$VFdz@JM) z(56ajUv%h%-Tl`@BJLW-{xM|qe8W@#_|EW`%y~ppYsl838nGsA{||fj8PwGK=!T6 zfU?V7v@5IQT7eSKzM54%cD%~pp~`U#>>L0R0{{*ZsORk){g2g-U+sObKX13cvp8HQ zuag)nAn9qP>(fwpiUfAajJboa^X7Z2{i-g0;RubV{_b@-YtG7?uNHX?4F+a!??$~9 z5CBf%(H?x1>%W%Pm#T}rm(jmb5R!r)=D1-v5keEce;;+ST78F*Oh$>$C8Pd}7CZxuYc!=vHw*!I9wM(=~OM9@|?Y z!g7B=#a6nkIuars_)km)t?Ne1>B$+V&`VmA_BK;o`gsa*`QuJL8cRg(nrhric#r^Z zzhmCq!{%5%jcaS5Yg6s)A^Dx2y%)BpUYru1dTg^C{CQ3!gX=t<^|rzMYq6fff}ai3 zKLdFdYx91+d$lYZBNXw(Kke?%Lv3V~&~;hHq^w4r%%)BDm*!uP?<(K(>={}}`ZMl; z%Q794A`@DC>)#4Wrhk6SlMxG(kvjAS-DI5l-jF41wLu@Q)yCqrboizYsSyGrEj(ir z56{;ZM1=DpMAZrP8I!HL^+gd;`nYpD{3D<(WTI>6p|AtUw>$jLiLouyg@Plkzi(di z`(?gy`sS|X#754fNC9YQCxUC`qk#(-?^p=iT~Qu$WnLgC@nfkRwfn3m zTt0ffYqm?3xvHn@5m6bm&wA2r)GAEyp#Dgofks%E^UlVf&Y7GTx5#cYuJY1%^+FZ$ zYjY`<+_Ax#O-WxSQbdwP%0cy*txVk%>E#bzaQ{kdm?57^Xd7MRKy#4+gfYnRHr zxJQ=k$GK6L4*I(8il68I`K>W9wtAMv@!U2+FhS|h$t3HyVO>K77mO(o?eP(n6&1OP{f1JH%VZ%Bgs5l z6Ehc7J9Z8^*)AM(h+oh=_&Gzi?^ox!5$!9gm#5vCTa_ZEi`}q|;&fRs35Oq5wF#VO zM~Is*f7WFxW0P^WlGUnXuP16m_zk3-Lyz4QH;^;6Cg|#Ucb*X3PmSh^`Stp|CC_A< z=wrtpKIb#US2j9vNh-slkIc+13bl2_7L6YqWa>KPiuc#udU9_v-YGAr^UJf`P>zz6 zs7tM!+{b43c4joVju&_72!DGeW#OvMzo?ramT*VUccuGZ%9u;>%x2b|63O`XB#Ya1 zCT)7f_gGR8>!kOIK1Fl$j^%GxdR!}-4_HepYCG#oiaUiLxK;Of=qyzZ_;esd9(8|w zp*IwCsN!mo;Jf|Pqcs%eQ8#^W>{rTrg$EvQm&$WIUj4{kLJQyi7mY>-CG%Zl(rDN0 z>hg`{o1J|w$fHxM-*W@I+P};r_Fft{Zj4wx5b*G&c5`m*iW6F48NSBctBnRe-A4!O zIT~at`h0rty$!B@dbPa%&ao) z+@SSF|MVv@uY3HOkxA1Z zf;$Xv_GbE2!(3|)mofM6zJdE&+fUH3%hM^wp%Y%dCUINZY3os%4l^q$0lP^2Zj-vWCh=>5#HtbZUe2Y;h>#-|G%H%1vKi?n-hF7U*SU=& zB%DvO5!K~B`s2N7&yeWzuXOI!xDRKu=R$5}J`-t;NYLB!CQG=*9QjuC;k=Abs*b6? 
zG6LECTtzzVQSLWhm9DOTl(CaYI}urAPilZRTQX~*w1N41bX3a2Qa4F)k^HoKxjyMa zIJCUyj-2)jI5TkNVVCBwgz`;Je#JHcyN=)s7qt2wmbeHHjyD=IbFRu{~u*56`?A z8_=z>r?AKuYhe;-Q8K>yZ0^O{BCJDk7kvx3%8D;2CJZII_&yW-RjhlcnLVU?7L1mu zoIm#0QvX%HZ{fn754=WBgPA?PMJh~X%roI*?vn5Gv%gA=!&pC2SpAC6%fB#9N%%yS z@hdUB+RfUDH%K=>?rT;i<2Ih3kO8xM$((Dv_KwxO{DlMDLH?!XV8Vz{u|%nx`%9~K zwUL^h-%RxW<&D*)&vgTS6@Kqu-u(LFOVhyXiZJ<7i!*qOR*sR%MD}wCH7r;_g1hXv z2@=WQJNiNAUbR!18(d-ei&A1fw!KD1@>Q++#}^}4WW06MY*1fEd(7WdE{J(vIsSEG zY2;01YnewF%4)Kxv$h3V?j`$ieEQ(co2|tZgR+I|KibmktJ??fUd0~XSS*Ts`|`pf zO9N_Bvx>c8JW1XkdVXRZR$ITXUlH`t+qzd`tEs|KA+$5++j`!Y`V5H*-!0c~2Xj1i z0NPXSsZ_dhOO^EoF)_T!{~L&>wq?h(TW}!<{kpogO{j(8I%CDRH=NOS?MP*ier_H@ zmgjww=~~p6UE9#N-mPZw9UT1OeIEVK(Du&f3+)Zncq({13HI~RpI=T5`)qfuekO$F z2iuFmd3z2Yualy3t+;DHb@aYIPVVQ^tHf%PC7-dxH&3X`Et~f)D)|IHJc~aiK9&kH zyPI+A82-#)dcXACdR|kmljcWmnseuzM3a5It!fl+HE~WOK=c96q39Buc+LlT=c!`X zt~UK>jjX)q2F{@F`Z_O#>gex=^Tf+pE%}q{FJ4V7siEFBSN)kUt+iS<`tI|nFtJCV zO$!~$G9&D?@!?FlH@lHTf`|bFe!6@OZEC3RYfpRq-1Cy_S130doH0=d2qPRT6^ClD_*Ub^iR zFb|)oIM5 zbX59~HJ88fVX*DWxl5wo_Fi+mTQAdW7RavsYK(&gJr!bG zury)u@{mNCVj~1Q>vV)1v=D_4``+t&DtFj@-SEq6nmWysMw4IhP{`O{_!D(zNYT>2 zDrZBuwyO>2v-jAK=zVII?(gU@<=gbktmW>hV0dQkACd4g>jbgje)pu!-fP0E_YOba zjV%vwEPsa9%^W7LXeEDo8T$LBX1g$($gXCn%|=n^*@4yg64T2XIv1YrKibfc%b5IS z-j;G-Ey*sZvLjb~&!Sl`Vl8IDAZ+*J)?dC6?=(MWk-tXuAO3K9h78`-$T)uWc)>*D z-v=|_wld*OB?bI@j7V$NwQW6#Mw(j>Hkct%{{eRZkPd8-!1f`Wq=oh*Lwn}jlE~0I zSWG8CvL_?gW2N4Ik^dPLToe)Rel2`vEA$p2gi9)7?tB;?gnOKNUCKK`&>L1uxi2sU zZS=x85ykvSta}V}p#+LkHNpoR#*-Od8xcMiE6uGJ!c6c*bs#%wh$$lCcLpj9i~Q~6 zR!l+Yn}ki$5V>B$vQl`P1x)N5W`hnzZj13Nj?d|e zf3hv7&p?NfQPse`S`*<}5^IoFSiTpm(d!VUOB8*ODUvjbD7N)##NHb?$?NG2vuB89 zEyTAw{9s-haAvC zAUpSsTA45Up!swedI?($m>HR+s19vIW+^p^NGC%r8L%$_ED_t$JM&m4x@1B+%8U$i z^1}4ZdHOLPoJBko!y(PQ4p|=IFuIk=nGupoX($JCj&wruPAUfW1!n;8iRGtDt)VBT zvAV>^>7u-V6zg9iDU2xEiig=#oWrZ&mH=PLHr(mBgyaHl%jr>LD$)rcb&`1igy0q+ z8AkWrX+hf1V)M7*oNtf1?&PsY*%@XY-SlpUP)Fs_w| zsHv3tVv9^_VI4?EO(F#UanUG4B9J=w+rO>Yn#0=#a=mz z6OiVZyu(pjnK^JTt>&E80S|h73vWOU8bTy7Sv1lXyn9|eZbxbcDUj77-{*CI*(B^A z77aM0^zHoPqS1BZYZha62=|Ra_lMvatmC*hi`Ygw^z6y-4m{M43^T*K>3hLEiMZ`r zxD60jN+Jnr6&&T~HN)QDMsoiKByE^38W|R`4)4T2vm`z{pz|bnv3GGHEgA4%6!ruY zc$h8H#w5=I$Cku^MRYL1H^gNQOaX}$~Me+{E7d-6hJmzyH zyoMahToS1EV!3e&^C$qea~qjYht_z(RPVw3U1Gv^pBr%q9d%(_wngfb+;&=?+DoAV z^n^Q!2xiLS{U5GAue^zRX9sR%Cj+g^NV~1|w4^HP(stAVHu~}ccGdPU+G3mAYBZYg zf`7Lg*GISQwFp}?aZh3-38hjvSW8>4=U4_l(#Imj-Xvi^cyf{8Ky!z6YsaS!zH@DYiu#R7#jJ zQ8W_kse6yK16l0JEW)Vjf!hykyjUgyoF$_wpNQa>g_>2Pi^(iP5#I9XZ2v)IJns^$JaeF2~W{eE`{Q3Za^uJy#1uZDdGA0!d zYs5;wK_u$}Z&3}^2Sk)(AV;Sc3&%NxCH}dQ7>rc+Gz9| z!q4#a*_Pqt>T=2I-!Jl2)BVlV>h3OK1w&XIPS$$|!qfzijRaoHEa(#fZl`5zxC*A* zDk)co?L@p`sX>YoTLaqeiILqav{qwvOP#0B$*=XcAC}|7;>te>H2d{N`VmtN=#A5h zxSg#fVxi%vLroPW4sydt>rxXh-BBgL9AAO z*=mPlp>Cr0jPn{5?FpDCgd+74*Xc1ZX4{=l6} z+Fx1siuYA&$fZlQRz5YPo=+6gr8rw5d9M_AWVLj33N$ZV)av+Z8@c?fuLolq{hn_P z{j>*V{qw!8HHZBkHham<%`_jW5LQrBG^4 zwZOeA&TMG&uGmsq`$E`xX_6W6Xa|e7tYoPr<82p?dKJ{a{d!c$Ubvc3p6|w|S5n2= zRe4r#SeHVo@nX@Zxvf#E#0A)7L&~byxO;)sly2EyFbUh8BO)8bCCg z6vo<9pISmXVseJc{94#O{#B*I28JLE~k=#@oP0 z`b^+(`|(@bVmko&`F(u!TJ*C;T%j~Ri^K{~d$P|!+ijvn1M#T*vHqs`3ifZa=lF(% zTlP6tX zq0@Bu{^Gx{_J801b@=^|oeol~-Vjx4962I$NVQVeeyyymqF2`xRyT}Re|xNMC9m$3ul^la-CtQ{mLY%?84zOz)RO^E zVIV6Qs6ht%Dg%RA!zr$D8m}GkT;oYu*8?Q@xt{+QTm#$cs z9bA`RU01|xC@F3z8*iNO+&Go8ak^sT%;1Ll>V_s}Q%iAE+j#SW=caDTrhdhy!Qkel ze^vvbysWvzxED z`^KBVk*SM2o;?)9(k4Py3(6!$+G?|=5(|B|vlR{<9F;gZ~jiV=8EW8Xo?eg0|e0 z`~L?BZRmdqp^g6+gx3G|_41d>TfKRg8s7Z+T6cGOvcI8r^?OSsO2nwKZf&OPq3q3p z#`=wU+B1WvMon*jFOR-*{XWq2zXzd}VE-LL3;1^k&GEm7(8&Kc2(9kiKM-00=g~hc 
z=P-)@1B5324}^9LWAuMPXmd;m?KF)(i&8B84+u?o$^olVTIq;2w=mWDA0V_lObAWl z-yyV1ObAW?975_J2u&)A$b`_|{I4K1{Y7z%h7c1%E8)sYaLFrm8G*1jBMxgWAY2d~ z{U~)kDuy(XD1jmETc6iPw8wE7AhdWt#gNoDWzvp{TBKst-JORyPi1%Xm|{gcIB?SN z-S)EDvh-j}%=qri0P5tfSY&x)a?~Jh>biA4_Z!Fy%wXO45_2NT{=$LDSvW3_ZeE;aUvh3CH64UPk>Bp zSS5zzWyy-NjUA8h;T^$2+J@~a`R9VGtE@lzKoi9IHV?t0IA*n(L;9(Y0Y zUl$KUx@D?r!aeo3yTpw@vAz^vDgiSWTF&;U$_^Gao5;LY%&w?rZGY`tkj(sA8mtiAgkVVY*cO$uW~|QcTJ5YBoxjIYo`xb;FN;uqA1sK_zkpAhBb>2_ zM8{;}`1(ymX>5upsLP6sXx-ZjZ@RvH%M-^&V`Wsu9`4L&^1E@VvTz}?m}Tc#Gcl|J zg@iAu5m!dVE-V-8NWZx8f=yLU8*77q%~MH_m196p9a>=Vfr-5$)6OEKQa=rCXq?xF z2L(5}R9dz~nO;R?ly<>2YW zAE2Vu##0}gB|f*##pA`W4uOzv2@=c5Ay$Ed8d=Xt)nwovoar!5x;ew$mM!(-G}t+i z#vxKWzL3IgDT_5o>#VF&Fj|YVV!Hrx?Xh zOw-uHHOyr6Jov=!g5&YL*8^Lf*OG9%ujs8Byx^<5&u*&ymi%zfr{Cu4fO)p_l_f$n zhHu?7q?vq!uQZySKdP5it{AEjx*(Tw5?3IKeR_6&iTfuM0p%EeUuj+m52sxwrd*EG zgjbhhOX-!e7DsWW6wMPxv?Rl4a1#G@jhNjTPVYingzCkku8Xg@+ox|jf36zs6dn$* zmyxsQ>}|R)T8$^0K$`17lla<+iMd*K`?JU{S+ssHpG|N@6npEpuH|61x&IznyZN=;?H?Dv{3*I{>FnkQ z!-gLzelA`wpnRo7iS(DbpU;WJAX>wn1;8KBQC8fKN{zamG*XXW4Y5PDoyf3I-R-(% zF)seJm&IB&Rc6Lxx|oI3(R%aex#@Qw&Vfpb{q>iIfEYkbh?EIyOEv22IDAj8`J@#g zOoLp94}W+X=L!Y4KPwQ`E9br|laEMi5fKkLemPG1gp9yL{7c&T3Tmo+Ns=M{WT_`# z!iDimJu|UjODIZsdGh4L3x|T*=RhCz_$m2sBNIfW>@-195uZ6YiJSbknl#=x(W%Jb z*Bpq5{e0V=lXWeaKd~w_9?ExCiO?Hu-ze->&cM{IBQb9yb8Tpt5Mca$p#m4{w7-7w zbn@VCrF1nwmAZ`L{cQ4ER5o_1VjGtg;3b zPGb%&Cy;&A$CQW*URWu5ckIu&1E7H`C#i{7=;;a{HVuVIc(HiQ{3EL=k_hGqj1dtmeN72CuIbx#?G{s3J01l& zY_%JO!tNJ288K%^nlj##pVvu-80ZxbwfOmu>z-k%mzx*#+$qpy2Z zNe+vUBkMf;qlmg*Ul;U?!swoQmc%3#7?#!b+XcpchbCCFzbj!kfb5z_MPW+zTd>yL=pW2IBcnQrSRh@ya!e*DAiO4*IOa>CL{cs}rE z%N1SHLw~Y?3Nbz5OPb4=;C>5O3X4#XyjTbEma@I&&p^uB%F8jpR6uf{lEg0d=*VmE zFj1ZNp<6Rv$v-RO04tSEkTd2}7}Y)@O1s9y)g;J}d+`~T`mzTtk15-asq3=3bd>AS zn{K=;rzsJxW7!|3P`lcYCNU+4b?{^XM#-P$(KKq}9OV5pDl9@&^K&HCMC#KmdEV;m z`hWRqHl=`6pHMX|)r@PdoM$gsn65GfTlvo`Qo=B|${L3vdy z$Hq-4E|QdS6WJPQCyiY{jV+DG?qXvIbx%3$q73WuSh59IEh&icuqqe@`QJfkTtP=6 zZQ6gBz#1d$bpes-tGUk-p+|`ru2c}my(8HL4DbATawvxBt5rkiNf(&Dngl;M&ywt@)8-C&I z{6+TJ)WAxen?XfPU+r`dx`kHMVoQ}#DY9c__vm5Qyj&y)Fu?(>87`JYF*P(|(*p06y{TSDv73Yk+By|@zHUG@l8 zR-IjTTY$~H5*m30_RbdSPJ5w~&6>?%Zy`Ihkl5$RtX4!8cM>)01yw!UnR30fTl9r) zHmfe<_4to&SOz46!cVP<6&=A`hq^%oLAV)ik75>%5hdWw6&cQWL!@NY!B1C3{Ow1eFOtpgQMJzti5#x3If5yc))`Jm7!h zkW*W0N_G4pp+bH$dc8z`sg@Xw<`91qlv7vX&0-__+)K93HV2+?roJZpxuOhSs|BsV z4?fBN>?gk9C+*q4`D%(S=yZe2j~d=qq-yc+hQZVFRel5kX0l?tS>S2;qK1o-J#U-; zHna#dww-D0U~bP0ZtTiw>}hE1``S40w~;2$G<>G%lU375aMNf`(|AME#Mh?DzfJT* zf#&Hm%{+F#72CwhWlZg<1}3n^8r2Z1RDe%#0NS+YzsfH*G>a_RMv1-yaTG)r=zus- zJF@t{s5`R)FVeE#J_ep0ek=eF*Mr2KsmPSi9fwWRPn)k6R*+aL;wotte;6_xAfT+lD|@U+}h-=+D}@yMQF9FHBwK>wRcXnYbH{)1bNR} zcjyLngr#;|WTT#y>v#{6G|r_OPIR2MexJ_&o;1~QG333Opx|}p!Ni)#*1B8Fiaa-7 z=iE+ZmrlF0f-Xy)Ok&N~jT-0u!B6nRhpG3PjnIyZye@>|J8WGsN?pEgT_K0of}y4D zwi7%_KfA0G-}~-$Jt!4OWvfqbSVr0&zW$6MFpfd!l-J zsAmPL0M;sE#nqoJYqBLYLT8;d?}fL$*C%?b@fJtN*V z8~k(j*m2fAADOpX6N3kPgMbhXBtH<lwzhymeWZ3aT0&F=Tr%?uc2hw7<+CZpIN zlb-)<`*^C=o>=zz1RJ}B%EX%=Gjws`_gnoWGG4c7@*CFHaf;V?Ylkb1IE&uE>X{3t$%nHjAyg!;<_9?ah*(3L{ z%%(9PmysOF(c@Yg&hq2ems-x>r`jOcv&zP+%RZpl`iB$GRH=V`+u!TU-dDR1u5bEk z5G1E)GE|hrfxGglZGXh#(Qv=eM6b{{EA>x-__sas-$v!@tc3xaiH# z?~qWss4ErbG5PWBcloIvgzyxH#uS&$)HT*#bTba?F(o)TC44Z|n>59jhZ8_gXRD)5 z(opu_4o@;Z5G zpn0ZHQwdm7X2;xTb!~8x$@A{=XigfSZB)!|@@cUAUFFI7nkgB4$H{r=K>?iww;_E&qIs0F+ax879yW;Kw{3c@>YCCQ+mFsBx2X zy1Q&Q6z1k>C?_viF=2V0w5XHKmWD^>Fs37Iex}({rzr?a5=w7(LD%JHTPV77fSRU( za>CDzn7}Pbta=Hkn=PO`N`xwx1GX87+OjZF;8HH5Vv$#*L8!IbrVnk`F9(0gkea)7Vdq4TTgc>ZZyvQu^Y81hc`wB+-;^z0;O|+?UYqgWXfnWS`+O}F%}oN4E5=zd 
z*aH@{gawVR6@@&a;4n&?(vHJP8c{8VrY|J&o$#0)9eVa2D>{(G%;C-IKa##$?l3m| z!yr@1F>ez@%+-gA8D4iLazG$y>|U8nz>K?F4hV7RuBNotb3#q*16-=(BGN7an`&Rc z3(5dr=52t;0gFnz@Gh6rhko=TFQAiDLj;v3rH%kK#bDrb7xgaRKOUQeD&RMK&vpW| zs`ou6y!P7EwAX=P)Swjivs@7)beQ#qDq7zn9ZRH3y0gZ+!jZK2h(t zI`{4CsSo7JXv{I2z1JiClAyCk7x9p8Zg2$pk}H=QiGA$UwrXwue0P_5RobjhT}bNP z9KeJ&37Y77byP_O`+>Bh&x^@+%7oI{^q;#`-jVpdt*V>J2?HpRv)IdtR7*E9Ek-Ux z?N@HMUK;&)ua1w2`k(5%#!+hn$P_B)MHP<=+*T?^Z@$|<^48&Zty$9%yxSoB9sz8Rrfj~)|)0kih4}g{|iG(Oj6``-%IRiZ9de!*S z#~r^MA0c@I<^(;)S{x%~%1)EOS6J@YM%io9Elpj1#AxIN)Gdv1@5wff*8Bx5t8uY-|*z zvwRFV=A=HCb_W$=w%r%A#&m<`wp`1iza9BukRAQ|@jb;7o~M@U-z4l_XzJonti%If zhSHA^-}T;8hasDHWYt6d$U>ctzZ(y8=hnXt&^dO3C6>W^8(Z%_tQ8uQAQHA9XkGK6 zWTk3xZ2Rqb_ zZ5*z^Bu1HP$L_u@ex%tX_ROpu2F1}XkW9v~IRg;i;~Y2@O56#7ES5*3Bvj`t4v~6RW2*h0aHPJPPB<0s`54j z!b<{kiN6Lh?~Rsi7erAMyoM^#YJTCpY))<2Ibq`U@K3qI``ZS#u!oMM*+R$>C6ab} zlucq(*3ozV94sbj65Uc(M^7D=q*+9pojA5I&hqzy-nFyZ90`j6AV+4i%WXYUEts;8 z_A*F|YllGz#KQz><(sURpE3ibz2|+5vn+2x*er+;qn4!OMifw5vEa2dQ7~Ku57s?< z`?!9!VY-(#3yWjRPmx7`~~V5({eH;J|`;G(2gj55hr;uslN&0TC{Q)vO= zEN&vO+|G9%h}j)7%@%u z6@{qfpVeQZ2NYrguorK8lOT*0!K#An`hyWcfDH({8+^>tbO|ubgTTStXrf*EVCYfv zst0B4O6(|!LsnmR(9;^8A=*7DJH~GS*fzRQQKOemTLvCpN#`My8D2@neLz{7$ry}-677=EmCNY9;a&Fs-7(;P;)Q7XPl@-;3dKaF8oVXdvamU zDJzQu#s;WQw>-U*aVtr8WC{Y*5#O-_kGeGG1?%YT?sr{*$=x@#U{#Q6GnQLW;_wT= z+d{l1q-*`#2U4!v%?S%q`tg*#C1)^gdi}*UV zE9J-d3{MQU#?ljrZek`9;DZ^UPK=UEo?uDWc1z=%x!3mGo)9Ip@2qDKFeKMQ^5(IH zxQ;GAf!G2hCG3r7NdXdhSC2YhOQV>*q2DO_?Cp@#0stZFc6teOt05t1I1nh&d_)PX z%v>_^J!K5vJ)5`JkMuB!b~fGdO~Ru5oXH7>0H^pIjYbqxpG?*W2t@ zLT}7t-{{Ac*<5csfr1ds@?f?;4_D2OGoMgSNrr>+9^P>Sdl3^>uPn!&1{Tf*Z|J41 zfmnFI+>**FH8mDP)TML9&qGxnS3c9XvzTF-EtAgxu-_xDEqw<+5B~O7k5Sh?$G7yj zV?wagD;CoN9(dEKJZJ9BS#e&F`y@Xtgt4~CO9z;xN&Y%X%MI0KN7eGpdhZ9{KG%Ov zOD)gM$HspnEmyXpujxF98~?_pFG~U}qXr~vkoa6*w^&Ls~Xu2-}=K2xTo*$l`TdC zykwBa5nzy{I|3N7lDa#yT|VsXgfG4(!A(jY(qSMA;03-ynNS8LlVV%TMwBZb9w*hy zkicF5s|g83hv=nA0*pqG7l1H{atO$TnH2u!MV5Gz*lC5S7RoTZG}x5-HZ2>rKtk{W zYXZn|Wi=|K!Q~a8mksq$11uiAwo!%X^^j0R(rVn}r@_gJ@g($u@J$0GyEUn4h`>ey z;d2|sc1hd-KMG*h5prB|iFmpUu#p=Nd&9YYSZU&=H8*p#)%Ko+MYIbC#vlTC^*BJT z^wV}0g{D~M6i$@08cB>703jaUijZZ}%q#qWe*AMJy&V#e{9QnCZj{$djU=uXIkMhy zm!t~uDge>Tl7cg%!(T%6&YxqlYI;g~OCoW>nbqMWFe1w-6M*Oy!m$ncFV*#nYfQ-k zW@Z3nDn|V*z!o4;&H!ng2bNTo&1VJD#U)v6#U7FQS@qDdrew2h3W~|9b-hHy@e@Sj zxn?4hme1e*7ppekuFaFmoI=nI;S&1&O!qUBRU=0KkVSfUMU$-Ay;`D8>jXcxg~Rxf z-kDINHt5(6uhBW=p)0v*q_rzU4|dI2J1Y}97Muri$eZ$pCaJ{R*vjKnSv~4lw0NOU zaWH*sl+$sfE1|o#IMQ$CR45At+oelk>j95{$fM%BWG~L>#XR96PJ01*Z!y%6)Tdk* zUz9RqMLorPPh|+TrE+N3%x2bcmS2lKQw-wlm^Tp^5E8%~D|%s+y6v4zWbMOu(x|8oe<__ww3A2;Kz z5aaDc1bvlev0#i9G75}?>ZwNXsG4$JGCgv`l-u2u=e{Y=4JI!P;=OOm#~yy9*i`77 zsqimTk$qGAzA4Z@gugH(AQ>)p$xQNw*-^(~(Hq0!?q<@_rYC59*d{Z%UtD8-kk3;6 zSTZdC(EwI;V8q#o!gcw$Bm1Fpzu8Im%kR@-$Ov#2!;DaPQRRo(XN!-@CydTML8|** zCbECJ8SxQI{+L0zToV;VUbvjeFq193c!3?HqyC8;a7C->is28@KwtnrdMTawsYc?8 zcHt+*3kIf1NR$36S8rV99{;5D`_m(-e(~bV^^aimPnRyg`+;+=%dU7JQWwJy&J#yb=mxEGr5@AZ0Ki1&DJ zepmg=0Nky2n97wKetj`Q@v-XvKxm@^@t@4fE}23H64m>Y_OCu*LTF{F9ecE|zE|Hz zjSQS0vlRLVLd$qOz=JhCk!hL9gwWJ2#w7-x^#2!x_Md~$JZ#~iwuod~rY*r*Zi{NR zWgD<%pR`4<*kTTBv1mJ-upNhj9jAsJmyz8O8#`_fJDyNG-efzzJUjk!JAr09!2vs= zNju>cJCOss*S}##mCVfwFuaDnX;y3?FT$V&EFLg@>|W$$UWBBMy=*z|7J^ym9dWr1 zrc3}!dPVA}M(YqGkNtrhhB-$`5x}R&(Xyd4lL7E!C&9{fF!`cLYesAU!9ENhqxdKC zPUbB+`Ylxjhp%`RDGlf;0$2@;P%{CGw?v$$%_s`nOTA&y9&i{ZI*216w-+2d8L<~) z?8WIUVRbQ53t-R88T}E5YjX(80DA+yD6>DY#z%h`aU-q|K-B~6FPq1CA!0-Dakj#@ zV=Q95M&aRc@LRSq!HBtQ15*KnTb2ZfBpwzP$6_5iZE_MOj*l{2n3Xg~979KkTfjX$ z5NZUl8yYSVFl`+$9U6CAFd42{2a`iU!Vxj<$q|x&U^?kgmD>m-LWBZu++~7uES*J- 
zFl})Xa;q4As78YXbx!%KMqfP{DVZ00I%ZnuPpp>-SiLSr;Wol2VA=~CrSPHG8r7I>Uxg0SQ8qn zvpwhK>?*zxYn%-3nw;(qo$e`jX+Tez=DCFG%mma$pQk`h(IWL)<^n5kD|Y_q)QJnA z&mAfjLBpgL2#BR)M`K@=!*y>vg<+xMbeDk@IGx)=9KUkxaCOR%@3bJW2Q`0p<;?Y z2Kd+je5@+M>l8jFd5*{dsxBM`zCG5RdWU zS^=w*q1U@uM*l?W*~WOOLd!z!B?$8m>SASU5$d`d+|4n^%p(}Lqor!26pJEVh~9*9 zS542&a9|T>JZnN=T=4fb(EOnnJ1vEXiHMnwY~IxQvcfCk$L!*Jao{ec6X6Z>GdD&U zk(eX(v($|9$H!W}^RpiGyYb!6=4nzW9UeiC4KzpC^27xbR_w3E-F*JrK{L*A)sI7G zHuES<+}1ae4i}GO2}3|V2L417A<9ySHS-8*-8cdBK98ppWDsCt{;u!E~~mJ zw}5>XVgnEVc!?lLBaybe%k3I*cNGzumEQNh*oK;b6@;n;#k~LHl$i2tm<}a zJHeI<;QP(dZeMOc(p=G-iwcDKtC!ykquCpD{j|cv<}5tv`eShFh4ovWbIu09?J&TT&`GoB%s<5^QK5T$wjb z$YeZ!z7SF2c@B#>WfF0j;%FnakTDnBVr(mC0+ygShmJrFjYjw18+(?A?RBO^eLETX zZ9Xa>z#+UVB%%(gT&v}tCzu*B@;)}$Kx&~{*GbtHdOgoalL#JwBgCKYMJw6IKTNec*yBas z8$=unJCHk3^C>!4DXzX?Htl}{Li@S6ag0Ve7t0}k{ZTGX@~=dXnSDp!+jyPyb!Cot zia^-a3Y&I&DoQDAi=Pk4*~zAN-6N5A=(;W#s${s(rm1h4PLOXstd&j1EzwXiOZSvC#VQfM^H65X`%C9_r?Vk8KTJ7|) z-2E>T5_{{vvGVKhDF(eQmQ&GVPj!8^CqvnDW$*On!tnFU6(0L)e}0bF-d$bUKeNr4 z{_s%I^Wf}m&;I6A`!VCZGxz2P?*C1fzh2|`KJ!{=o0x<^7&hUqXo`Ud+ZD%41E|>f{fg59x|T%1y_KXVU7r zV(!<~>BI_$N{PiwYJ`hD2w*dNCS5UVW~efjJA>B1UOX-N`0Dk*AG+Ih8~w-M*d8t* zVy(|Ao#(JF#XS?xljy$r(3n3iMMI}7Hbs4fmhoMY=XM<;)*;byF6)715;k-uuqpxjVZvdv`zVY`#3n*Ylib=E?b=^Zx

nyVBS?MFX@?cCpv z^j~?clGxbk1FcRqMV1=P_-n^CWxo0Tr?|F^>H)V&+x!#Pf*peYLtN`wb=#-@D0)NI zH1!Wc`*S)y`nV@uw7=>}*~x*>Qok6#{L=PbKZ6NpVi|W&7G+5rxt6R*j|%l|G`A1W z?GgWYQwTV7T3#rqgj`#fSQfrC-#I+BCr&{B&zd1pRJT?@u zOR4_%kGDk!&Y}haXFGLWy54;);(oNs^D_1|m!p>WNUzQ(nfmj@ZiYf57U^RYiO?DU z2ftGt=R>`h(d9uvx6qQ;e2yfUm#Q_NAJ%U-^*6s%S_;RAYspf+OFnsdyv3mJ@;9Dq zhO)GML-tR^B|OkWm6wN}g2|~v=m0e9^`lQ7>Flb7jg~hQ#YtxSh)1dEVK{CcKy^+O zxkYg7@|$;f?Xe&c4o33~CO7GmtS-tYrhrrS;?88)8lECy$Qz_JKqtZ$STr9`G$=msH~SOJ7eH6O(HO34P6lYcbbi0dqHKCvUm^@@CLW$;RSzWzzInj_OIS_P$D{h_y z;t1)cNQDjX=FsEWCxkp{SW~5qfyQNqpy+PRLxB1kNurf-fCxgJu%8>~;mF zlITD$HO(%CZ=)DjiLDCm1DhvBdC4R%Eh1T1VN{3`l@CJtFQarRkfqfHWE2kuOciX< zTK%@3oZ^=}FvFtI?mu0`_mxggr{wqvzos%CZ{)L7kP|!eZz}<|991bku6g7um;Fyv^IDR@KRL4gl&KuDMXQ+FWEHWh^V}|zxzF1$ zgwXX%mt6rP$ENGWe-J!0z4?_8utxkwmPir@JM2++{EokzEIe{2^AH zs40eBbLdjl_ro--oB(6SLZ})1!p%~!8I3#gHTFdaDVQ%fP%stt&ZI~m*nhHXbTT8} zjsMs+b?g7et`%X}wZAY*yV?~(+&=`oWjT>j7i#zYbH_3 zhR~KLQoegd-%EuTo4}J0YJp$<3FEHmRnZj6l0I~>rlUZqKd3#%i7k2AI(yu2zE+Yc3Sc9gAOy(b z=o3j-hXJcuLlpkPNCkLo1Ev1J3>gq1Fe?-!1*#mv(K+mN>HX2ZKa>UEJvKwLdm>#P zPodM5_$5znQZ5-0?Jj@P#B0^5fdAQT#2{_Q&d<)QWLH||_AiWg;Y^3QZUybjAG{KB1(7($#!7~J0Heno-wxOn&XtAt_4;goY7v8Jq=0~Ee~ zQ%Az*d5{Ij3zsGphsx)!PqOi!e)sEFx3SJkvo0t92_RY@2j%av6Sf8@vc6VC>h(Mu zX+>D;wIY@?kAn+|WW@~`6yx~$R@hq5A5>K;kNb2PQH=LFp`A}oeviCgq?my!33fDH zgc(j*Lbj5A0Z!bD&;!jvau?j?R--4L_A={L=d>lkIVpJA$-*~{7FOtK_Ti_ z!J?#7B(>PQ2}rdnbUt+m*5`~I^|F^T?Y^Q=W-3E1St8`p;(sFRW29! zTA{K@okFoIQFUxjTAyo{{nTn)RxhU1w9TSVM;JC@Lw^7m8RlM~a+xNqH|P~*g@Y9Z zLLjZG^7~Y5IC-qsxW@O^vOFjkjJxL3{nZx$`xPr+a%0RjxPA0yGEyIS~HMbez(O1ff>VB!iN&T8}w z7+nR6u%q)uMLD!vyX;wetjlmIctRCO(*$f)bI7OdRcPKK@)aWT0iU|9dUgJhYpt0I z=jjTtLWR~01ta9>O1{earpu;5(hZdQa6~#51m)V zi_Ofy`}}X085juxBxzLk!K`peLf~9uMG6U-^;f|PVOSikpJmsvh^%%}U6zExI*7+U z%Z_!B9DLJgB zu!DCP0C7{bh%oIFGR<9Xd5W8PQYm)7{#G zL*Ue=2(|3rg~(mHBrG*z9$hM27E%;ztqOc@P#uP=^=v&jY*W*SM>7V!vBrjyo7#h= zhpDeCKhZEO`dRu&9)qp%w+{l4I>xl~4%%e|8q0~vZBdI=-IyR4`V*JtI8EJ_hY={C zyAK+=qHJb}`al7OgZp{K+0Q?Ba=w+00Wt^(&5FWN9 zoLbOfGOxx{E{-+(N}h$f8<&B7am+40bu*^o#1W!HwbKZ?oZ~*0qIj~5yHc&D;G=vr zt=4>XxHYAuc-=)VRj%^JnzkIrPebJrV|`zBs^ckcT?aSM%@dQxP4a#gy_@hAUJqv8@k+DO_o+f4w#MTR~h=iL=s0O~|A< zg+sQi%o>6<@vA^}@l{YcGGH}0RrL6x1ecgILVg`>0I28$n7Ha(KCr@9HosPtizrk` zC^RlER)|pg=dD2>aA&zv6$q1rSJ4H>anJ5+EGq-ETlyDJqd@uPT** z6S}ve-0I`tP>Pf`=vMo_X2PkSgFw4|u_K$oJ+>Y9cToXv9r!Xp{TWJzL7gKV0kOj7 zo6wQDtya0M7QP7<#uXF6_-7ywQCts^24!J>4>6${F|C?^;+nRHxVo5xlgA^DGf{;z zVGl7;bFoJpHB#XC7|Ph8mHX8rhZ_v97bnX*F^;9`Z|K zaxEURG9EIf9%5AD5^u%i8O7!KJr$15B!oPb4bPz@=L&sdD*R$+b6wP)25cUh`i2VVx$_Irxu1Z(>v6tEE5`O3WgwVE~Jn%_OO z8^m>fd+HROYc_}*44oS^oa=0PY8`n#L3`?fytHT~jJj%#BqcPLJWUk-=m*ysL`x7D z)#;vjYTbzIHPq@+);((xH_Jaaimx;E|MQGn;(4ir`HMR9d~u7n5|)g0mN$R2ao-u< zh}(F;6{QE8fW*XjaHM}pC^ly<2Sdt8cNCoXR z1e`Uz$8Cfc;UbnA5Z;Z*J!q(;R9Ml4UxPQ&P%3z-A^6B!`url$Lkj8dgD`IliT8mZO(;A=W#dZ2}$QmHs>oy7pONEJdyr_fzVz^7rtsPd@Eh#-CXp(x!}fk zWg3{Xq=s3(EG}v;sgN#hXfFLOUDnrJHY)wCD7ZMdRF=-<>+jyLGtCuf=}L2Nt#~+X zdRGB=L%0WAjH&stE7Xg!nTgb&fzH3M4^G1BO$=zMe=E~qa{Y~Ddu6}bnb5Bxt)=Ou zU#&lasT+$x1>I0E1FbJYRExFL&^EW0%79$qv<}U6eV33dnGS;6dP4+81VhQ-H<7e< z3I*h<-~1!jwq?4{ItUA(o)t3S0=O8f5?KtCR93c?`=+|9i78#VM&TxB=%y<(m;?^5 zN^Y$>LQ6jAlrYhBc9(|dfk&u4)#HUT(r)ey1tqGg_6eqP~jOn z^QwK5vKcrpKCFVIfU&%+?4s67obJM(dE36*=U1D5PN;{d!jNlvC_*@N?(8myqYHv1 zG5(}otlxiDre%?`c@WgRbcA!PE<-(a_niZGbPqmMq}=|t3lsw%2!^ls$sDFZ$qKqa zjjfOA?swxAy59TufN@zx!#J##cUr>ea%8XizL%)W2gd$ z@6~Q$CLF5G;!IikE6|>3T9r+f!kXfQF2QIKsdNuQeRcuh`-y`&yYgN_j~%${Z0}1I zX%7wyChf+tXVSD16FXSeF7`j7mH&4Tn%6beYoo7E9=jfUQ=^@XznM3^`*ThG#hqRuXbID?|-jdpdeNu+Vl-?9AF#VD(`~E;Mo}jY<1HDy$9( 
z9_ty(ko3Nnb^G}p1EEzMaxp!2PgSy(qaHDk+Pe?RW_-^Q==W#u*O9X^O!ev2*@k~u z5afN(of@MP(=HwW?AhO6?QC0uz@0OiSm6pAR*VaV)IWYGaiw|O*+Ed6B%Eyk_Ee)m|4pRI3cX|tMbrIlA@5`IV$y8>8*uB^tTvp`195L|8H^9` z(JyWGC{G~h=lPPAG|1P+7a|Wdm`&0WES)3AKxl7XD-{fC4>IBAYbNtZ2-$e>NPf5u zjue*x6Uf3w%D7Cy{850of<0CZ{eoU{=+@ce1} z!7{bU*+^ei*Zxa3&DX2$OKS5y+omm9^WhQ40U@Rr-2Tk25cx&G^H=VQ%7uNXw@l48qlEFLk{(W1XK%Y<+|@|62XDuKbFE z84mR8ATPMgc$HhqKDTY_?{Y;q2p4{a5!W1}|4Ur!iFjNo&v*a}w#;yjE#-S?G-1HA zMaL88mNB8Rx|rnBauQ_zR&V3*#-*W-^mY=p*Y2OixTxlQ)I=-Eq=a_jor#I*sns~?ryB=1Xf z+};-E&K18zyG2N{T*;-qZ@JofEX88Kld&}Ax1UVl;rFmqsGaXs))mX`f!Kk(pIq0I z4F6Ml>CdAVnSHe%N4QQJGOj;}beV^GuUl)1d1^#QEl*^Lp@Zz_fh ziz~e$W6}y3ejvNTyk?-KrTDi4K*E=<{8Dl|rwS?sBu{cx=Tgb7G&U`xX$J>hCt4Fb z^|ygt1jDa>v5}eljgV}t6jP*PryfIft=zeWXts`mpPF`a2tIsmEow~}86g<^6&gH6 z6ef^ztbm_t2Y*I{&*qSbax%>$FFPg>DQ6>5SVuxqeK3o5Rq;WsE(|6EYb-ZDAFRaL zDYV(O&|ER_FyX(dWTk|(4kHW>DI`)XrU7&P>k{;&*Sa>CT zV+?m;AMynr{I=kQ7zuKxmQAfQ@_a~U+Hg_J?dhE^YSXHpPR z82KO$e!m<_U#J*iI807boe)ZYYaPiWj-*WNCy?;S4S)NbgM8_{Px{B$$XoYL zQdFf1=kP@Qq#OH3-&}RLDG>qLE_A84@!>P>BicQU%s2PQpF!g>MdsGzeZ7U$NjuA*v0=N0XYoef>5lGW=(X(lNwen|Pg4cuyg^pZOH$YzQJz&F>hu0c6K4K1kTQi%r{MYC`yeArcz8sI6Cz5H2 zVb@YUP4Gp7vr=>LjHi96+n%eIoA6@BRxz%>*Q>&QAe6kaUGwX=N=miW5YYKq&+inviE-C#`0!}jf0%|uORSU_U3O{=R0bs7n1Q^z zJEQ5{o@x&Nq6{|eVQ{H3h)>k^3N`H(-K;Epo@LWh=F=sE$#!Szm{>>Co#Pl60=Ep} zMe+3npZ_6%R8T<0z-!E=#brJNPJpvRiMF$IQU41>h?hkd;fX{vBql+JYt#-@(5T5s z$`@;ytwdwZYVa5o`U^jbtzW5dV*y`WsTaepJw+gMoMGEPn5D;=OvUphdIps|T=?9f zwRyajj_Qna!XM+#3uG1yJVds?>NU+4`p!9ODM(Lcrh0x=yofMVM~@4a>Ux&GD6M+% z{g*%@Ow2My@x{yGNixe0C63y!0096l(MBtt@n4Zm3q72+kwf*mAJNXbddZZtsao>( zlFzL&zfd45aU@@_kl5|70GsE2$~Ei7n=SB}q{ciEc5LIl`U0&B>`bS!n11`-3An7^ zWcrja&Xx@=vkZGCMZn&-qx_b8{QzW|Yqi7h#-}AqHe!q8PZTx3RSpL+$3M95b z-^^DWxx_0kb()h{|04N*(fwWSxN1am0}^^w?s*^QZ{%0O2_{MB9$FuG5p*WDDCi=L zV}U6Naky0yIHe&;*{{|Sn}?vUX1>!m=X5yw+cD#~GLNjX0LI!UHqYbY3z!Hb1BTg! 
z)l8`pLi>xb5%@UfYB+1**w*{(g2K2GdRkJ)*c-UG3g81w{|B>d3ETS18~0H^K35*u zF~E~!XMK(-~Zse9heF*V-M?e{HLvNHU z%_(FZEZXdVudz#Z<_f&31ex!G)`Y>{PT=ey`T4EAusVJTZ}Lp(fTNVdV;_oNmHsdWeL>5~hC6_ENhv*JLRR*Qx zf@6+s!C zYH$K#i~!J~m@94Qh9@8lXChrmd`&Bmaa@k7PErQ8HXj7Igp{PblW~}pRPTgun*#e_ zLg+1}q=%uTce11wnXrE)`E>liEjOuqoGnF^fD$X)hg_l;Ib<-!j&ddXGFHI)Fys*> zf662zeG=l{G16fzKAJE7_M~DO;eCssr@-dFL;z>wAB}U-`y0w{CCl$k%D+98)Ox#_ zVny>U3K#?brOm}qj`g@Fk3d|J`FRe}P!1^-9~<)!gbyW9b;bu2SL`~K>GF{66??cQ zLX6W7DtyRN-bsO0!RB8)d$W2bZ<3;FKBfBNyzW%&?B0Xt$cZJdgAjJ~vKm1$ixRLw z0Q9gZK7t{v4tNabz~^xM=d4v>Yyjhb0vk*pPA+^d_A^-d$=3TRcG^5Fvf*VG*3yn` z=%YJq)habr>;HhzejCZ#d2Xkp2%F*5N2RkVRw^2W+rxR#3Y#a16Z})wNEkxw_ZmyW zG7Ci=|D(}|)2Mda6BgYSI;Tk;{+|Xy)P_0~y8d?FB^;#gp4%Y@vR$f>>t*xlH9X`p zp=k$uASYxV%UJz`0FuqvNnF)g%+TYhiT4K+dS^rLFG#_DDGb0C12sm(LUh$nD)AtV z9aQKb73S(B?k}7={y)tssLd<>n0SXK6>>maYrm1kGbMBoc47oJHqsdmhQdGXB@*UK z)X&$b)f%XcS$6p%*f{^J{tgn|w6)fKJ0jScOR~BQ8_B0TOtkz>ZL;yfnsM5C$&KK6Sn#wL;!_Ly!F)h{cA<*7C^dG=>e9Q%~QR;b$d1BZDIhnV^-?{6o~IaP}{Q^i13R|8<@vfnmM|eFy~BrdiE-H;vKP*up-~Ky=5O zdxs&yS5zE|pvGw0N@D>(&$yvZMrm)sv0=jb%M=ZRL@(<2Kk4y!^a;_b3E1#Ru^!x0 zsl=D`hV)HBO!P_r^fm00P3z6oRVTFURrKRu8IdT`_An;mOaAA%p0uWnwa1yZ2&cuB z%=%xT_8pdjj5E&%n9&O4=9e2_Be(G7zm*_9{`Rjwml~5v=T$1j5ttzWe*}lnLUz0DilOJM}RCbWTnm)DQRQs?o@o9Y2 zFrxbaE#Tjw8G{n}cr#zJk?Nx8Qs9u=9FlV=5d$g;Wy4Z*m4DEb!!PmK=_>xtH{rbx z6P9&O@LUduvE~SVWB;SqnwOpWcrcUb zMH3P-^|t8!wQ+`7@tNsobad$FXofTbdMs8fLqXPff{gOt=Q)4aOF;go!%8A{7=A66 zWNg`~euqZBz6x^V)QUyWf~udfi(T(Zy=L-OEG%~(WpHVxE+4QrCbo)6y8@&xUnY7|?(wRAGxhZE{# z+UjN2dwJBIG}lQ!NW1ePxXi!`xN6?|`PF}nri|WhhK=-6D6@BmZz{8`suRR1E$D-d znboANsG7HIwF5ToDVfcsEr`C~@-_a3r`M@X=4p7o*sPdi;8xi7nH|rj>Tv@sz>HNt zY&@bTJLP(txm$(MWEO_t`OgwoIQY*JUa@u(F?1i)boy^mlhSp8ZUpYnnq*o{g z9U5(8F&obYj8eL#a(4h|0>=}jE7;)UjDbv>0Zh%P2@%y%kfr2J*4VGqH_bT%GIDyP zmXohsQ*mV~y&qTT%f;V)oREAlsde}0F|#uUU{kphn$`Z`#;gakoE8Je6F_If<*)>o z2lWGHLj!Tpy(?H)9k@|nutV-SRHGkg^*U61}Emq;ouVgRvnJkXtx1B8rtoy>pm92=3moM6u zXytv;3%0Usy?FQbj2gESFKsCo7wDg_^|iJ7o-$6Gt{xyEV;zK3XDqUF(m}?n)In?J z@-EDawoGix%Fk=v#2r>Lq}w08+1sv&P^7`P{PApYBRmMhCT_kIY>s6u{Hg4^hW&Cp z^`E|GBegB|X}jXNZ*045>dN*GMqg8$of}p^L~T{B(g@d2hO+FRdD?&Dc1S1gZL>_Y6P#WVJ@m0Xi1A5J+SsQmZ2tD-{f(VD zHGFZg;Q2=`jG6uZow)elGgNHDHGj-|fruCU3Fs7$_ZOc(x5m;!lCb1n=PFj<{Jldl zryw?ar1@W?|JHQgRI%UE&s^m~?o4Vg#@^Z?hk09bq7}9s%+ul^H5gPngR}XV``9gKCwy9K{eCM(Nhnh&;5y z%Nds|kxk+M0YdY2UltZD{x1k^;*0iwKxidXUyVx5+M*;%r@vXW_+?+$6wOxI4t->f z9(_?;`^q=TOuz8+cUI9Ryw}l^6^m{DN4x_cv+bK~Z|bc_JH549axH%k1bLT;ryqP)P^i82^$?pRb5cObhMleH zwGBIIwqCsq>&a!Y-y9!GL4yO<7CIwYfV?83?}!D_{pRjKc|z+D3SF3WC=Id)g=BcW zJpi_Kpc^9ocxf0W>kMG~5yn@}W6YB(WlO}3;#}3E6ZQ>O^V>^QCV99{NSLFrr-GfV zP$a&Z;JCSQ32_qa>?h)`mI5$D8z25$4mEQbuuFR$L{u0&WT~-%j5z$(zJQjb`TSkgN^U*AA3A8n0cpFU@NbL{zY~vg0wn`Jpm2z!Bm^jB z+4}QM97%+H-t@7&Cw1>g=Yi){wjbH6r zk2}fLYngG#-hSm6A@;tcP@3j+Kr3nzUdt*YMF&aO5`F`{?tQnDO@f-F>O#PV4~3@O z3CQl}Mf`TI_o+yk6Cv@foqVpYF>50Vpulq-<}L*BfK#T&9$P`N#XA#_b4HUbT66+Um`t~XfPy5WCwfMVL4jHRroz* zUPHE0FP?DMHT<{8G9w*fH!E?X)!St#o6jtlffNeTRg!vIIyvD4!Xtp>f;gQ2zSfwy2WQVV-pxD z_VEt{;|oEtWwRp4XhNWrLJ7Sr_743|Tj2@bsCnY!)PB`R)y^T}oRrFyIKV*;0#4Hc z1(sim@BVa?sS+3PzsN{_=gNu~Q1<2FTOzi8l!45t9<9!`h-}vWFjq%`hR|Dg?bZZb zlIyuIV%d&R*;V(CQ)OHxQ9NOdT`Zq`1h*0(vL zCpb|naoc4@a-%zaZ+bmGOP#3E?e>hq)pSysE?49*5SpC)D)#}nSlNyuKU~xE<*GJ{ zg8~Dgag8CEOt)0f%+2wFt$X>LKaJUfZ1iGFJilo3uR^qT^6L{4~%;;!_V}nKktkP=rpn zB*wYyC4}TxzvjE@ly7BrLeam5U%#u#Iw*UkKr%NL^{%$y;Wx)8H_0ueSng!>qBHlij23SBJS*7QqKJx50l8uIk0h5_&p86 z$aL1%x~-;4toN*80&2~m=lwW_bD|Lnn{Fg#Q+o{6l|LKE!Vg&e{Z;z~tEv@#>@%SK ze@_Y4*F8%)dIPzgn-4o}H_WyR6uMm$|HBRau^Q%@5DY;zhM(M?#5q|6Q*N$?W1`?V 
zr&eXE7Gf(*jC2&n&(xc-+Ym+obOomSbxCpibn|NOnbBITijPuN1q0pqIGE9!pjZcU z2J0}T1$MYdRY>6QPdv%lT^bkolpKW{aUm`6x0XpFratQ|>nN+!@xnKgS>8=$sC8c> z+SsO_%?VKEF@;Z8u+y<^vW8{;)K3Aj!G1)4Xa$W!4*|U2GA4J0PR;LCKQi zNJv072z^?p8MH;;zPeU)eRuGx{j3K)?lp!YY#E%A>&J#;^T2T&(D;7KftL+BV0J?R3@zm?D^il2)CG>@Ek^P=1lm^u zWvhT9m&liHAW^y?=0?6T@Aq@=m{9U{*#vw)EqK@j(PI)av*{z%=&cV1iIWpqy1F;4 zxcyF}ys!cU!9y_`o9Q;eUqzq`9FH^HJr<7VfWg?{q1FjRn*DrCKB97$!LqZaZpD1k zjR>0@#GB%9#hws}71~BcE4hpv`mmV7J}Iv5eY{FbhgQ2JBZRnBLobh@4+rd)Bh+o zAtYWalrotxs;i$iJw2p5B1Xz5)N%|j2;;DU|2b^-3EZYuMCTZX4UE_AN(}Li=5vp> z$2e@PV223uDIWf;4x!HUn1k)OBf@ybV8GEbK@%%Uqc@3>60nGsRhy33zJsK}3J`c2 z%SaIy>q>0hN75ATer3&K560i`jXQxQQadKnHnDPF@Lx#T--GcdmkA83L7HGtVh%wo zJZV)wfq^3KFBj)MIXMT6Kxh?A8JQ?M4x*i*l^jTx`{l!+0lbLt=PCLJutj13HY>o1 z9=Ki+=w6jXGLa-oj(4<5aMg&8N@XP$u}+rvrL+d)xph0W%$nU#a2*{xh#^v)tK5E6 zfkYdlY{)^P>5jGr$a^T#z%lqG3M6{z{rpN3>KlKtjIR#|Tr~nM;efKS@KiXS=`uc_ zBVgJgT7MgC?<#CZ-UkOOg}mTLBu#L29nSn-l`JNhIxslq4wj zEATtfTmZg1Wz#Rpqh&FmHJ~7a@e&c5R|&G%CJ3Y;wFcv__mcX-@$ft2gRmZEOoWS_ zgrt6D&alEabs#-LgbKNX8dnL5(?D5cc;m46pk@3i(~k$9ps+N@+ni(*cMuHB{f(FM z3MxfJ9&`)lE@DMz9cfVZuv7jM*sys2Q()ssBOyOw|0l4qlHaqEq`GF_Lvv}Q4M?LB z-caB%4zgDbP~PZ$CSE1O<)w7;r4($Y3>Tq%@{4I0tZqyP6iMgS#Da4^PQnOmv+4{V z-LwDZW&cVflJ6oFYy|4i5hYfFo($#&C4kOcgV&co0f?ina*(!eNS@0~Gd@cEU2y9d zUTS?Fi*gM_jbHS2>KC-6|NWDYrkLgdA! z!o~Mc2$hIIA{;Mh3^*RNsAgIbXnH9?H~@R0 zM5qoB=}CcC6XgsTzXD^gp48#?4ceJLyn+Wd-Oe!My*?QMwKi!St#$ zb5e?|h%|-y^)c$2D{;D8MzSNo8V<0JuIEWhu(Sd!F4LO2)??H)8%Dh5YG4r@5XxS| z>lVmU3P`mASPxRJ&C#nJGnc$33QfwrA0yPqiV`M%+Zi)&tp15> zCKz!IjNb|uBH96O$W$ak>;5W|i1X6)S^?Y@2~C&LrS%l}Q;m7Ioo#75_!)W-R{su* zfyQZ^YJD&<$gO~D4~%B4Ui1sRhi0t$<*Owm;EWSqU4mgY+}4gw%0o!;7{~x)Q`V~M%4hqTv2xe5)Mt1{tE9?GKT+8_4!OxTri}BAUdepUCLK#dJ(FZI21bOTv7W$aclU>6pYbK*Fcd z{3EvuTkAt0POIoiyog)3M%l3ZGl5FkirUpMG*_3jp6xi1kZAW}vq7UD*x-|JJiFQG?3I3%H zq>ib>L;F*-j-kwNi1e}085nQPGzZWi%-!Jg@F`9^)|-c+I1JSZH5*xs>Adweu)36p z=@ipOLcEJrk%JY*t7trT)Qo)xQ3|}h zM4QOYZA6MOrH?%`{c8&o)65-sz+QOZV=UlK_*btDk*70Yn1^-x6@YRdWV4sx?mwr- zK6nouv~wFY@)sZ*CmfF8oU;Nr!12Z1`T5EiYqitEBGaP=(sj1epT-U{#}sJ#30#o_ zL&@>lwFsP|@YMubo*XP3nu@PH6mMi(T=~Z6`0L|cLMkv3bkJ5~sztN$aOuohv|fk6 zt$OjO*X0VMRVJW8Zq<-_p3h>6p9PYLioo2wb09);$|G=^8wEY4)pS{BiJE1^$9m;K zJ}>;oC(G~^df@7&CCy9%lD~~IH32C|q+1t*^vkAv5rOQKTfsnRKyuLg0j9m>)&3gb zei-8od`&`dU0HWs)n#2hZe62%U29}r=WzWAa6?~k!%%nQsmq2*+{Ux=4fBx=%fpQq zz)fqx&6m2Hb}pN*;x--2H=RZ{oewwP0=L`*w>)&Wyj-@t^F#QXqrii4!GlTNgP$%3GjRvM$`5{zV8;Jo33#|7c(|tf{|N{U{r@`% z?d%Eq&s+3GJo>5veKU$i-yNZ`R-P#wQB zG0O!3=nd8}JzYAVR5A=UG*cg1wujIgZfbv&a9-@rFx=ApBo}xOU@+R&%TtYIQT=GN zV^FA*Eg#D8bl0fl>9^-SAD`|Sms_?4;V~NTn^xQX_@tU?d|+1ZywD!XXmV)L>~*l% zlWB7Fygl&l9*@cN*s3#>45F50dScTX%d8N_^z77j@WaCwy;;xB>_@WY-s3ZyojXi? 
z(fO>FZT9E&^f$}zVa(=#-^|rHFZE`dU$`u`1)}j;EG}JFdtxE#pDeE2H-BU+gtJ&) zd+z@H_M-2T<;}apg*J`%1d!*qKBpT$KC6Fze&_f1VBvc>KzRxevLHBoe zXaZpahqPBc#M+u$D=P)M1FMS(vPA*f&I2d~#UeRG9u@IMgMg?s=r5dt|joB-Asq zA~h2JGt&4dQpGw(^PrBSvK|prOW-_ zrRAdjXIk!0a}q|E+im&ypSs-t3N3drSh_z{doonrJp66tN5|nvQ`>0O z&-IMW_s!2QpUw|YE$mM$4qh(~oiF{L({jK6i}+d160 zI@;bn-nu*9Iyl|EIo&_`b9npr^z7pF@7151tG{S88Z-F`(O7pxOt8uvR4^eWpGALV zZdW*%Lo*9jmDdwZC-!>2zbd~ko>e)DNu#=8AeqOo#A2ZO%TSu|%fT#-n!+EMQXaeW z12sjXpP|9T%$l{u15+4Azy-R9JOIF>BSA{i<=8 zEU_G_|Mt7Vb!{+PtD$_M#pi5yVW^>E={p>kghjita-}PRlK=T|W7S$;0*BTo?WXFD zp>#2)#o?x!tvHw~N4lJz)Jh7KjyRn&Wvo2=IHgWLe%;y-&rS;G=oh}7ms_fr0h7AwODe%nRKJ#g z+$zodgN6nR;W(2c8E~A>lt2_axXX_?25_wuXve`wVRjUS5*WXZ%?i{Jm7+mt%&GE( zF~hYWUjFoo!da~rKn_z^48hV;$!FSr(zMmNduS&&#-}GEG`k&5+IBBZA+KKSlvvdt zRv=IVxWdVyafWGh!6uEP6S7hQrbg?#?WuIw4qQC@$OoV*T7<3FHEw-Jsj|O{X|EVM z8vwEYiOqo?dRu9|5sxP{gt6`vpAyPu)GA`IIavy~@%)e)7~|)%NwOpdE7eaw5phGtPL)23RD_%)}-hU`$`_vEwC zwXy-ipz&TA{Pei&*aUbg(N2UgmVKo~EbNqT_w;7DVFV69hEM!o=3K#Yh}Sz5k85`+RDu>l;0uN&=yT-XZkRdv8go z(u-oKhF$~&K?DT}q4%h$fS@P{QHmf%q#2qZO+cCjq=Ph3QBlF1cwP7XJagv!1Ls{{ z?#bRW$zJQXz8|97ctZqWkm)7i1L7#s%5bYZOc}{=?1miTl#586`jXB`;?KjGNeHin z^lDE6Q4QVsuG-8=JfvCnte?HNk1fsB_QmezLD{3T(J4zz*i7psaON1=1j)r+hmw>K zTD#BPzDa+(TsWm(puAXAvR9Um!zl9~!zGGHiwI4CWi;TDpX99N#4{Z%#uPPp=p$Q} zs#%c-$@D(Z`N$SeU1o&|t-c-NR#nf+N(AUlHyxZ-GWk_*_J*nN+I{aD-CryAuIjQ@ zz&>k!)=^P$sfEXtYZLmdqo|3wUZ4qz&7CgMRZm$s>AKe8rtDDCkY3f}NS#DUR#H=P zq46IRn?ybZ$ge@^A>8t_F2^C!iT%`+7;U52Lz3S|9dR#&7}iQid1u*5UlSjH0RHq~UEXfy5c zCxU|5up-YgL3+)~&9EjBsLyGHr(jv$-7ARG-i)`}XcK4Nk;>QvWj?3BxOO-Hg89lI zyC#1!0wx9EV&yZ&y%@rEKbnJCf1eSJ}32996wL}lT9e_yA#5Acwbp0}h9LV!w9yecj?OC9}s*>^JY^QH8 zs755bw?qkQ3;aPiT&`v)Q^MNMK3UTw?%!iFnaA2GZw}P{PQ`}tGC^S}Ywq`8Wrl7S zT-g<6iLJKDS315yTW!l!SYt3hf8klGFPusCa$rc{%Z9*8%I}`#*u-$UiBfLcS0QtC zEVM5PW*IH7)}ZqBN7Okj;61`ek-0cwpon+0+#sN@+TrnqsRuN>XZXp_8MyfJ7%-^C z06me`5Lt)06eUF@jZ-u>9-DQQ-Mvq$-4kIO9=IPfhYrG!)ava?5Y-vsKL;Sf&yGgS zkMyP!gw(>1@vend4B;mh74W}IxLfDrs%qq_#@e>F4bJS}wJd|OzP$Esu;7D4#Q`aM zND$LrF%WWM(b@gaPZ@(9_x??KJN;>%QSJ*1{hU^~8&?nc*izw@qDZ|X{+ zcE29)EWworvB)V*HJj6cn6UU(YR>bX&9Cl2=L+rP0F7IdD+^;m9+1c@OO1$M zIpR8b#*WNlC>vZP^GfFVpIWYWKw)o>qI#%Z8I-Vy_*llkVtO8PmC+EXZHG$K5@S1l zlE@V#SuKTf8E5rZF;|;16M~}^T$1RzoK0jBU=s8Nie+KS3C9%Z`4p?e6o*63FDT}d_*54(vmke> zTWhMvT&mY$DuFM}M<>k>pB50C7F3u*f`ru$5X zD-hQVcC3RBgBWu|8j`{JvY94oHqb{zioKd9jmoB^U)vNpjzE_$p0q1uS0Eu@Rv^=L zkRQp6&fh^rN<-;3 zA6$~GBt~WxH38GpOJmDMkze5#1CX<^j$u@zG&*BPHENQ$T2rBx#tjPa&D~o5%h=tR z14|cRK3Xz5;PN$|36IekJK7T!QC~~xRc6%#wYQR9Q3b5kPGndSu6>HF{vXHg<-zr^ za_FFzh4jf>(1c%Ck3!cI6$VTH;pyN1@~Kzl@z-v$g0~ zkaar9>;mjgJ?PCOsEtwT7;ue+RH)qynxGm9gCn~3Rl68V=!{(vLE%PR?mk#cjO_N8 zu`3H6|9Sf=Ek^#2N-@*4DcfPAsEX43bRn0to;bOwqH`-7~GmvAtj+I5N4${C%4$7XX>{c-- zR&HKgVM1YBiYT}GRpB7;z)A1HNw)_sThibs)^cgCW*PFLrl$^Is5A|nb_`b1uL!v8 ze7ddj*VPB>izLYdW;$TEje z?>~A@q`a$Vd7$^`6`;B)i&B;4^XP5$n@Gyw3sWqS;*P0mE~XUiQ;O3;FN-O0gqSXY z+6iEdSUsijQcY%?T9FlG^but*0&-Wbc6mr>s+h9iP*X{yECBw}b?)OQ{KpaW_|twC zrIO6Ju#Xu1hL|O=R@OsJ#~HGQYIA;7QMKyew}sYw6` zK`@*0Hj5(vg)#fmlkB$z3YVMuk|-+G6t!%MR|-XYxba+8qa=~y@4#U6?y()6uUlr3 zepj2pgxDjq%49YL3=1M|Q*03p(;t)PY|9ynRB&hdSbLJ(i|K-*S zcfqM&=!vq2{M*f0#ckPJEOlpD-%;Bx3$kFxYwo+Z(LbeMVzFD6d{o^q_p97U?ZZ3~*p7?IQl5T;HZlU+xB1hdQp&nkKVX|MRSuE;U<~i^_`b^04bjM1N zT4l0Rk7RUD#y2zX5Iz+llp1raMyQfjNzYvll=yq0^tS4i>`wh3m7vK^Ag(ILt#;}Z z--^8J1gj?vSg=AGaRGaGvUvLj!wKPBnm zd4mBTkAcYefn2MBbMZnkBLmr610?oAQVk`AnJ8FsEcN z(feh3&dYoSp=%u!tV3&ZcgNil%Dqs&3Xh=+azmvCLN$+uGK%@?ODGwt!wv6;h#dph zI-oFXz7RsqqmkjX&%=!hBds1n#UleHJG?JzMhb-ZM&k#D<3}>8hi@H?jE9cS*7UyZ zcsXY?+Q~lpN`Ew7Y;5IdbmaXg$!2UZUa)?jvEHHaT2803N9`+zp#y^#>k+T+mb}`f 
zyLF^S(tq8$db(SO8jn9>hJ=j|{^n&=%wtX%VD%h_9lz?ncZE3ns8_yAZkzYu>G;K* zKCw|=36_!Gvy%dfd^%qrL~*=UGn9#@4ZUiX>ryX$t=IY5VE(n??!-XWYa_!cQ_m^$ zPH9ZSYm3gQf8jdz`AMy%E*sBjE-nyv=R^1LdYAcWuj6Tc4oE%{@@3>?rEX(o7n##-3Wsg`q?5f$e!({l7O@fYaCE+z5Tvjoq%w6M7r+@xD)U;6x9 zUgz+5`6PLZlyW?G{)oAtzWUqwS?zBB_^>xT!WMUPpXdj^E=zbz>3sOA>`mqT06_fB zL*aK%aw*j;v9+TVdxLkcelxF$XOt*%N*DCEDGn6nK64d*c<$}p-p==pqwk`v7|U4@ zm)oG&h<76i^Xa?quL#V)_MD%ic~*DidOTO;G}3zeVf5h~VlGoJ>lI;s<9HzR8yF3Wj6I!IW)(YdT*_ejymUs+^j+x8A> zfkScmgZ$VliSQMS(W5UTD_E~pZ|isFbrkN~D~jG@C;qHDh^#pot>q1k+5BO*vs-iP zTJu;~I}BSpy}<6exAs91p+!Yh6hlLlm@wR49(J$$=C_^(};bftzxPK?%&V$h1F)9eKxY6xA8mEat07#^UkyZlnFHM9LDKu8Tlqz1_*Q+6SCt zg?yrnDUAM@bsrRY>%1-7G zL*v-y@c(Pq0l@43(sg7CovveiPPdT0{!7=r7ug;QXMF+yH0M$1bR8eALm*q3bZ!-TOb1UhUqb z({?_feOQulq zCM}1Iq@*4O9ds55W$n!aFnZ$WO8#b=RENL7<499!U<8mAj7&!A^}v8sxXiUbK6Zmi z!+MHYYC%aLQ$ZDAJzcQ7VFMW9=ORR&UDgI%NV|yoAl8O`!<5crxHS=?2_VJ(X2^5n z{8hv0&0O~W11%Bo+U=|=2%)i$fpDITz%mF%?_)a!5D_>g`GZed)zY9vgH)kGj2I6D zVB5501!gC*IK3y9v*^5*?aU0i9VECHq|hT|3n;_%vv%pC?`5h4p@bN8PVObKs*|O} zQ{BoxJ`wtlOQSVjzMa(FQBbj!wuW@?Y zP3kp3v8gC~qu&Q#?J;2Pf(GQC#=$sE|3r3*IHPzOJQ|ivjlY z?z6Psw|tR_SD~f@ecmzWIW7=30tl$nQc1*V%a{Y+UEfV-ZIvP8K0Ix@o+fhwADs8!&hxipDv z{7&a<=9S-EqaOg6#pFdC1Jxq=3v@<48-N6WvqXDPe&D7iza+90;3db1$sLqQ6r|z2 zUcJ&rhrkFzrxzWC9v1+}K4(mbj_UasN0cKRNrPZML%)AFy{o(&|Bo)$w#$%se9nvy zlR@Neg9fH{U4_Or#okL4r~hd~^S!QkAaEBv49k+{30!=I8bd7(7`QxlF^CX}UL~g99r1V6FijWVdOmfz-Zlr>a3I@g{W=1pZNyM+-C(hkBRY`Nq z@jc1Q*&b`IQR$d_w!oYH<)OLG3&*@{dZ2kX*1}-jG5>nxiqP?)1%^RQ>B9SRuAfvQ zg9=8SfgKCxc&UBy1of(V)CkOKMhT%&Mq<(hi}Rx%H%*Cwpsf?yS%f6UXaK%LryD{K z*Au4hF|rw$aB^8nL(vb(+uP-+grC*~>8s>+p=Ew%A^^YzF{C@9fUK`|P&5I+%b5+T zr)ge=F#~Kq_)vi@6tc7spok}Sgn>Z?0L_3!fzcSJkw}Ce57jC@f;c$Rjfk^~@EbHB zzQ~As>c!w(2A0}F z!rBOP$vz2r1WJN0Fv!gCLsKt!(^+h(9t&hfnx>?E%7(55+b3kzC9hA}O%nz|0uiLJ zit`|r49E`*iJ4uejLy$h;P~5+focAN;sh#)k9&aA7>4E96u1VV7@Wncd!Bp0Nrj+ z8|!KpSm`TU4%7b)f`;D*yJ+hj@iveh&_O>p8q0cV{ilGL1>l2}ajonR9jK%0@_2`+4 zS3Px=KPp^mhV_jJx$+T#MvI-tJvLk}U)Dho%m)yPHb$R$m5qLPyY#J;iysr zv8^XS+ne=uEHzDcnLBbo)5v~cN4vwm(Ud$hA2qk<>h-VT6oJW_UKk((SkAquKAHPQ z@sN1tSfK1V?b~C3-F~hzl^MpQ1z=EFe6z(J+y5@3HaH~}=ymd~(0!KRv#&}wA?nKG9YwUW>)Y3rG43@-J2byVDxw~%d4=aSkl;uEF^yOr2_=8B>$~UukVDUv z*!cB@O`9zN0O*Trwsb5Y61CF7kT5dsCO?qIa3XpH{pdQQ3SMkum*=`;QZW!!)~6JC zD#WO`6WCO2T0|m7-4Y+RihZ`$_JC7LURV{io)cby8_Y3*Z!AOHkDbPcObb zK=*W}WkJp^=U&rW6B)hc$1DebxWxK-Up9eB-XFx=sXYvtDZsJ#Q&Bbg3a&wbi`>YZ zuS%mp5XB@R;R_MQm2#hBRU7LzhyuSTmMu``jPYTMS|f>KTYJO~0LO0EAu( ziKHYTVa)9Uk_et-R!BHA3MBHLs$n7Ju%Q;I{@Y9!ZrQS9`@YxYn0~I;PrV<`P%c29 zs7#Q3(#tQ=X*Z#CX*Y~*qv`)CjEZV{NnQ2N3|C;YJ8~E=?|@)?E$$v8^i~D;z~pCaK<@6KEl{3RrJSV^wk^ zbii)C(m7?KF3W^w2ze{};ewiX|qe=cb)4=$?(p#iWm0JIfgR9JZlMEoQOfDr*^n6&C( zV$39aGF1pRN4(P3o~mUlHT@NW>iuUD|RQ1A2 z=2ieC;%fZQb~fw{_$Gi|K<9C54-_Dx+za5_G}*GQU?Tu~8;IUtTWmj8N*}hfQi&hG z-U-E0xhk%+Yhdv?W|FvW7Y#&U2C=$Gnw^r?cb1VE2?fpptjj21Gzo|#7CKPo+A3K$ z>w$x8rSNpd!G7=|`HzGuEBoyzJ))T&_{Xw&6j={Ut^jU6u4^xTm{|)|F%RtK*1BDZ zy4F_hDhx*hnj+H}RRA%U=*Mvl400f0oT=}0WuWA1Vbp#w6%Y(!6dg2!pst>D9$1WQ zkHP_HmRn3C_lV350`k}@`8FaL37F3!4xivoC8!&1wuQx`T^#^G0@baAOwUgg*S!3C zufI?sJB$oHX-VY|hTe_NIoAqx+`sd+Ckd^V5kXG9*`E4a_4du}tPryC&wEx+FljCh z{f%Vk&vI1hTpAo#=2CB!wV3*y1NeH5!5Rs1w9F%9K(J(V7A38rBS(+Ob#dqfQ_+dr zbX_i9uw-gjB+ID#fX$COEV{!6E0mUywV^EzKhLpwDO5m>w&8pPy3siDhmr<|kS)hk z*d}d^INDw+*uI85!m-;O4sE&vRC4h0we5?-`l^r4E$1K4`3O88ZpL;xBp|{0GXFM zw6Oz*IRI|JHvZCeV0 z6!8d#3l72$><>KKd_G@rdrVHI<&L8p|ggqGfmUenXb!ec_uYufZAcsI&{0P(Ai+r*>K+3XxG_z z*IBglwXx=FJ*gzD;Ym~5lV+!;Ft$@T&y$uHoHeOq%C#r0xw#HUlAHKmAu|}Br;H=f z4ug1y+mKU^3CvENe|5RLS!vC1Qsd%D`(tb8>8U2mX(uTMcQb^C|EV*_(`U|4BZ;S; 
zkf$f(Q<~?dpDdoVI=1#>0s1S>GzYm5YF*BjO2qBILJhvES8;jG<>GT=#!k^TLKAVW z(DM_7w7ypjIMo5`my9G@m?ipL$&+aZ}=skSi* zNsC8W&~LMoHgl=pveH=o(dD?+PJuNul5=fyZs3L1M)P*Fa+~qj6yLNmx<>K2=1reU zKAu~snZ9XecfHiLIQ4eksWV_wmXz`iT50P zpqWvr==QMG%_wdr($GHQ+pFsNv6|h}kB;5yr?0QEGcQ19E zH`YNj!~JQkd;0^|pUmSOo$g(`?$3|isVpAd!X7<}9=(PheYPI`o*pm4JO&ax26H`L zmU;})lgynSBcmRp^B!Zn9tm_cR;|}| zr`OJ?*Y3R6-mcfDW3PRdGoOXed{I2})$q*0{|CApiq%I<#7A7oN5aTQ(#}W9%SSrg zN9KwTI?qS;j*ncOk9?Pp!kCZZf{)UkkMbWM6;@wW5nnYWUv(p2nueXPrkAf)xUcpV zU!6Q(-8;T|b-wytz6N8yh6}z%dn->j(`+da_j(3nrPV09T^EsUMYrpMR?X>lT?E2Z z%g<`zy3fCMUH!jy9oSq4O1JCGzk?GA|Il?AsaD}@&^VYmovy2Au&D(9rRz|BiF|2x z-@(Bd{tDy%r$qul{1A(O>AE1qf9N{unjNd3MIFSaE1+K`KmwlZKeryfpY9jqCou;} ztf$j;r0|Tjpcer#N;&wTRV&Ms$nWXlJR4!*h`1L}!yv!&b%+Q6oh8er&mgxuz#`_- zTQbsp@M)J1H)85mVo_@}Jl(G2V@M=IE{Ct4Q(}mst9QCl5z5IxlDMXh8X z3XD$I6;-a#>AGmGWYat8_?T7y@36mgU45EuS9WBOUqn#0U7lYeAk9iAwUJKOWu$te z)2!;#yfacS)~6MiC*v2>@r1RK|IRx<&)+j8Y4G6_hHwK)(b?^tdJX*iNT5RFbk30$#B1BqaCGy&`5^Vf|;F`*mN46 z2ss;*ou3yN8I$Z^mVGKVAWQ*fwVyMS967h2!?R&E2h|P=w&3~vEcT=M&5*>! z&9(2_{_t~Bv5{bl2xxk+^beR!`WKAWG;^)<&&I>SKhooa z{PZk?>2}@qyf2`qL2`FL?}kVDr$y=hSXKPwL$~X&q6b!d>9Kt2I){AI0{ zKS@+D{amOO8?%S@H;tAwa~BfZnN*9%DR!0M5S1wVJ7HdDzMp4}iZp^cpuYG8#svJh z5O{7C4M@8evwFWW#%o|-o9$v`V${pMa~9?#^7ohvRoS)?>leH(#@@=lNWaL(7cW1* z82{>G!kL^ai|J1n(+lemiRZE}qxO>?XJ5JXBl%W#%7-5+TI;l@A`B7^-&^T+9TAcU zfaZ3^w$tr8A_94uWO+AR1p{u4O}}{YP<}mVo;B=CWwtXqT%Aga+)OY0beMk;ad!qx zfbTBf%)XY6FfNO_@!aQs?7B6jG$p`yPUHX5b!7pP8DFb^q(}0d*Dm{!FqeX+q$myk zY}pSs9t`(53^2y8)9E@FI$c+of}TrBK&RcL({+WHq~=oM`To*%*<*)j%J2IYJ~?9y znNL%%9IoOqBuo1+Z6YK)@f(^7a;F^4yddE>K_xAz*#kk1%KL!}DTUu-J?_LNQdg~7 z!`=B{HhIw9E8i{}r50R)(djyxsbv^JCtl+FH!Jw=W%56E9X^sPF+ySEP$lR~_PHo^ zOu%2et}IH5@3;lca8e6&);!@=-l|H*mlwCT?%n;24MP02>&P3Pd|NFu2_Lk5{@Qiq zkkXlC+d#EjH-0K2!cu>ZWK|m@?kTFT9IZ)gvf1(_uH=+_;iV5c zNkt$1bc}rkuT1qU;Ct_vdzIaiA=0oW_^b3-iQdI8|EtS|9H1vFtZu1=Z5_x?QJkur zYD2!s&D6L(J^$_L%Z#huf42B1Bv=pOahHEB^}#1i zwK(sN4TX6QrD0{QUk6*Wjpsi7F+4i_vhjN0%}vTdar1?R+59LP?)VL45qDg2T9m{n z_pdI8N@2Is5liKEixo@b4`~%k7rt~TmLZm1|=$tjt^;v zQia}Gy3$3iA0_g2ZEzo%WJ@*9TZ*0~clUGKOJ40m#x2D4b9>Miarw_KCRvCk2%Wc- z$o06kc`ucFa_FTN!A=;5O+F{8cMavQ}k zEe;u8d*-VU-m`pfEoDm0`L`(U%jImRG0VRB`X}_>AZ~~E$X3<1Ze4;`-Sug-dsrRP zuK0BR(&ZZlk6-A?|3|Rf-?CC3%8;Yi7Vi{EYmBtLv3Ryo*{-BI*SfOy6o@cT>Gn&r z&XK>>7c9@ab}!_JD=!SbRy0~b&kxcG zyChZFzCzB&6W-Z|T5qtY=B;I7gN~cz$7{g*gU794mZ7uzIRjG71kUI;3BQk@Quoj% zR-zY{Yo8htm7~8)g%~GU$PBfX>#W^yzq6v(>EWZzo8J25$C}iUpRUV^3L5ok^q6rZ#It_i$9`MMl}|9}Hk0VI znK$_U=Ipr}M&lk%dt#k7U4~9y_&#h%jjPZXz<1m8uN&0O&08pLPtmkhYeHL|ZQe_l zdwaGj1elw5Y-helF%!L3&)}5vDSiJ6a!|GqCyc#pf`EC*Q>_ZuTA+*&b>gt6_vjyTwe(Q%O zaZLoVcd>nFkuSLx6)T#Vt{Iv>-nqrhUy`joM&NCxd%DI!gmd zh1c#@6!u=d)mm=fmckXQ@j!H(?8UvUZd=+%M)$S$NO7IuPS{kJIkI_Wd_hsWn@P z&s*Gv(M0uHEZlU=S;jvI$e0JpNA{@*y^9?b7IEadJbsnycz@lgP1UX@Wd&-S1Z5#l z=8t~t#_jN5EyC0_`XW%T2S-EhmkDbekHq0t%x*Ono3EZ&x$1Wu+8(OD^{pxYliKGv{PB{z8F_pwA6lM(IZaU^9^nuzr|=mG_EWkNl(98=)~Apc0)rKD+C=k(=>1DGG6r=5U9!k>)gDT7r!aS6i^2CkMU-J zCYP)DGxjFEei(WC&N`Q>2^7Oa?6we~oA0 z^bKR8P{7aR3PlK0MRHNmqE@Si#-WLH!O-}0UjUH%)lV?v&PhG4w-WGa`va5Z_kH@P zZpqdueRQ`$bg~J6MyKpR-=Jl&*!QR%kaR0h$1jD0k}PV#_a_*|I|PPZDl>*8ITX3D z;k(g%cZ?|@Mg<&NF-1kli!blq;+s;JpOBEHzR**}9tkE&t?zM$J;D)QYOQ7XrT1v! 
z=*OS;i~-BI)g>GRw-W|tQh?+#ii^?DBSwo|jbE}A60Y^^rLf#8VgxhBGP)*|$dufu z`I-FH@Iw~C)gZ@&>m-h|P2Aq^ZdhNB&LPl*c>u=;F)94;OcA1Vlx9r%pz_N^mDPlo zPSKci9=n#&-}_-Rrtzxf>U$(QV|VMDU`%TPc&i)7^#-RxxRzON%;kDr5h8q&U^_*+ z+vjUgOta=VAlX8&Dv@3fQ#>|1i;PM<*y5+(e_p}$Et)9w2m`p;?ql><3%EnUQu`>U z4ihPKlBya`N)puM--|+Fp-g2~CrRocu+IK+v;UGUwcKEArFPEOw&I4dWU_N7Mlh!b zU2w|A4GR}~Cujt*8LIGXUewNUb<$b>_%m6h1``OqKl+TouaF{lKxLv#f0SBt(()O{ zEaMJwCx3z!R{aZZ40VYg8V4qv~no40~0^3_?c)QkpRH)C)R z!pR)j2~vPY8Y23iebbkc?J9dEg0RM+CbAqA#(b;yUUE0Y-ZWOJFu_%r3iRtY^_!QZ z>!E;Qhv0;Ml81_5bhFt-G5m>Ny(dD%?BN@q4lvRcCSyNM?*BB2{*_$!6D%dLG zw{8|0r5wf05Swlfg&qvw^MLaOItTPPw-7eX&WNWGQyz8WG$_V?*)G3i7(-2rgSsVB z9yl{>M8qt+wXBISzC@TND|Y~l6mW0~PLE8*J?Q<`_?I>67HVwCna)3uTMl zczL3e_~E`9zfuz1%4Zc#y@6Iasz|q6~wOTy^{s2w$a2Xq-&FL)P+pa!FqP8s;>EtTXbj zT_?s2C9|3tn)P~yJ-AK^bZ~fRPnP~pr|U*S?ckO~)J+TM;0VdD@mC6QZpYE#&BY7A zQV}S5(^NHBfXaN=L1p)p!YM-n@R}jScSd|%3(E0%l#nIVkOLAj>MzP*-J?!^N(r+& z=$$c7B~SHzDk~lkNi}k?_n&wj^(e`$S=?y6SYY#Qb^lu&7cNy7s}T>*{wQULIi*)& zVvOXPe^dIX;cb;FG+7GAds67q!_(31MR*08iRJWPyY9-G3FIvaq-Af^3f!s-bUBIJ z8oPV8ra&|WIPGZmk6j0td@x|pr_?kG6BWiH_SfG1vb}AjcyYu6*ghM0@{S{D=q5hfNXvz*q?Qwar`h7ua=T4p)I< zf!|Q3oI-{^HZI%wP)nV{0~KQL^AN?IEf;k_VA{(wNY*8jj(!vJ6wVZQ>0UXzQa$UZ z2wB;e_K+UD(&L?n(;jKGdvYEA55gQ@#_XFA>xYyet`DW8G!&d`%8i>Ow2aD)9V*O} zQka!fL@~2QXy7I6Eo0Lp-!Jt7j~jk3_HwI~+K+=+{_=I0)!j9hx9Ge_xSVB^N~H!^ z6__bD(+J7MRZZe@{_J$Hehy?}(7GO!R%O2P$-}@DEr~N?Cb1VpvQ()GCaR$k_A(=3 z_H??g;2i%xiY20^kHP=V1@L1fkx**a~4$I?AVMLsoXFR4c@)LqT2oZ%X^TlVfU zJuTV5P5?$fG~q>c%keEq7pe-RKhZoB#cIZ(ANdv$`srjzKgTa~>FhHT3tRuo6vJ znP`*WV0D#b6Lk#}<7hKFFeliA`~B67E!x!n0CQ2pB2dFzx#>jy!HG{_EzTWauSZ+9 zYgoZFY@SA2$sAZ^Ygi6M+r4hGc^+-m7>#Q*F;xdU7&n={igtJv?YPopYu)51@y+>n zwEeb*wekfl+haoypdRB_t(2&QRdC}1D=bG-?2yh+X~HN(VWpG z_v$t^^B#hcP2Y42aPGIhx#l;!HEOy&r+XcmuD3Pa*karTwcLg^J6rBl@cN|b_T$@`^cc7N?|A)|z>6^f*;?MO zVuJ6s_*H$!KW_1V9zz(232*-vE)nC!rVTsn_1}(hSC2U-8snG$O;-i%-}oJ`{^P9i z)6frEkx?zt<=-#Bw1e2RqxFCIN@$1Dr8zlNfNEe z>VI`PfwE*oE0Gli(o;xcZ93#A1IskUW=Hi9hhsA(NIBa?B@L6j>R5r*SoOZvoNAg* zcDqi_hEDEAZ06nAtjDc6&s)i_VsoD!=58G3{m{`nYR!S^-sFA8iqXAhubX*WCyVpv zbCoMqTnY=;qnD z>>qL0X}US;Kkp}g!tSDg}xHr5E;lG(V572P7^Ur7V4r0m0tdzekB5_do3RO8Vsf%HK}*-4q}-j0Lj{ySsda8xQ@DI4CY_S&eqS1vPta`zz#&Jounr|lEF&OkvbtSLha{8q7Y8SKkD|6o8)Aq`3hDSf3$x1XY#VwNu`a2|I+h zVsud;*glafg8G5en0vLW@9CA#kGq!TjlTvOeLGEiyiN4D&LWBK{(kj^o(C3us%tHZ z`0bW)hx!pdAHWUi(6R(^=K$d3KfAwwsUCHE(K^3sn+!)?>6!iW7;GxQ+BSZ#T+M+X zpesb#T;7MxtA=ex6Vd5;Fj$zed4W8v2ihheJI*MHzt7Mn$}^k`cSQ!+6sZT#G{tcS z+Lh|ZeLwigxoO8Lbi}QBi7VLYfpwAAzq*|BLpx!HhqMqEN^V*3|LSs=dBRSro+Pzg z_{|gU@hqITs|{aYlEoez5`oz;lhE@-(Q*#Ou2oP2U=L$IlPE&Hh0=%@iw=1WzNU{w zm}XAZnOj6-G`{d9)%nwhP<1s-HpKnS@|8AbS`q{=?maVk{r&s#?~76dvkh1Ex36Z@ z7XQ)Zgf1Pd?NClXtrNT)v->VgNfO@34@0#%c`z7U{nxF-jI_D_(zxbinKi;wiBUdp zS%sCkYxLVH0?WbcgtmBYjIm^7XXRSPH21k(R(z03!Gmyn8t=)ytiH>(ZAi17QI?1o z{8bzdIeO3%E2?W9ZQo!jLUh=-Oam$Mu&8Bxc;Z?5o4} zS!!N1xxVPU%IYF4Z0~l>07pmR;LyD%rd0)99+pr|Im}{Nn(z1oLob&HJlN_G9nT3z zUa=HQ6F)xU9KVg}yE=}$>eH8lKjOFOzkf6scx&m3`UX=Rb0-&Fmn+R8i|SeeryNX# z<;Iv;HO7YbK$ zk1zNwlAnFsz7b_OxnAgN;8wJD)AI`ddZGWFXVV42UHls*!RD_1r3rud1Meh>6mQ-N zQz~9BBnP-|7G4eu5MveMXM)G*bqNeO)KOvEb)5?ZO(gs zce>@_mEZ2q<^$k+RkB;3g-%2dx}TG_AcH=&{SbNd>3MhayHMXD@bWhD_nSVQhrgBv zY7)MyBDf^hGlUMtgAZ=)Oje0JGV=Opoh0O2rm`LyBIPJNHFG9k_}d$M&-s`+d)mp1 z&T$uoV9M&wY7yh8z5AC?m>ijBIOJWm{b^45wdRO z969(@<23JF{2`F{h(cdQ0A?l}fD|}DmNfmF1M$mbM5QqLQX;o%F zH7V}9ZOa;s+J>wOU$hSRndEaE4HVosZ?{YY{Ats*^(8i%tgKdQTVEU#No~~HQrOV= zfKi-{u)MRwRivK|di8Aedhs{Yc|&c;c#KFs@|7e)C5MoPp7KpMJQz?KLFHmgNH_2q zW(p&V`BtX%uZL)AB#?zLBuoipQYprkAq>}jqB#Qq=2G=)H?9!RaWWkRB7rpjW`Vjtk#;sHOVkQqU>wSx#@%cg|H*g2fnoR* 
z8f9M>`BNO`6ltgL@}q5PSjnU1ou`G%Y4;^)P1w-3=RqZJdu#BgxDSZ-FK{*OugBr8 z)3;q{KSlaFekWAx?8(1;w>evQ|58&s4j zon=nCR-@%tdHzOYYJlpn*;r`H_U(y#Yl@G5b`GqF9=H-}+{un0?Nho-R2p6pX-j?Vm%1i66kh0EQ^63sz5;_Us|%F$MWR|HrBOT~G-9SSV2a?kH& z(!9r_Ux@D%_Vqs<6caZ|BGnBKp4~dun-k*GYLE{KJ@Z=X&2WGgW2vWM8Q?cx&z+W* zU$7^#Vly%hh#1eOo?~9_zh&lTzPB`Pz|YA~DlU&{J)6*P#@-c>Bg30dXH6{o)YG2~ z`oxIzBZvY3%qY^F>iE4O*uzG=kKvO=uQBsyTa9Q=JHD({H5yRrnkM!SFakqQLN{#By?yey>+OFXeoXde@Tb-a!hyp|FC!8PffOMzvz>Y zKp-IrAiagAbm;<$gd$ZsB1JF(q=R%pMH71ORY2-39fKkQ0)~#%P()Nvnur2Y1O)`L z!hJu_yWV%!%$~hx)~uPm_xsEBA6(a**Kr=-?u|mn%;YZ4!oqqM>{ii2`PYY7dMFx+yrLP|YOerf* zDeEa8bZ9@hfq-NBz;9lrJ)5GDP*}Bzcrg;JWeUzfO3sp?6!W16yFjkHKygcixFkeU zbP(>6O6y6c9Vv!IyDj8u^4>M&tzP82pSsl+eS{{~B!9*Hs705q+)N}Us*2M8jHVyy zqrV2FpM;-%LS(4hvLO1>--~2;>AM?_;H~$Ny}JzB*d&ya^+=}h7hhHeWzLTJicXlm zlQX4B0Ns!N7WSz#t&a>=P%=dmExwE!svvAF?M&*E63WyF4zOdXN#ixZfSNsWC} zw+>ywMg=0NZXF?p>ek5s1LJ*Rf4in7g1zxly+f`YA z(``ok3f;Ru7@Hc<-8HK{0s;vQi5$!VtmbgW^shYWf)W#tqLa9i$b#HE0s=cpN^vlKjwo^_Q{#{*$q(L^pnlzwdO%-)dcaIBynJt-~^15@6E(TdiXwh(^qF`3vwI z32?CqKod)4kn&9bsCClv;$Nt09e@RoQ>$XY$d#jnjDx({$zQT3zM=h z14t<%Bv#|xgKb(SOi)sv#oDSr4A6|H;QQF1YF>Ea&hwspWs&&yGqA5fC zqCf?aM$zy&M=jC;Vrml{iF`7Kn?vhFN zZ#*C5af;S;-F6}pVABL~EETPrXJTgJPj`bP|AW?Hnj@)b9f|Wu!^8~BRIYHbsN0~E zE+(4hUua$WQ2#1k?O5Y#L-f_RnU;ocElsknybtWmrZQ;&-k|EwnZ<@GE1Q*zx(TIL zkPNX2LYz*g>R*4ObqN$F(5q)vp)2u*N1v zpEUz-?v@}GXjL81(BxuBv7Gi!C*$;6yT3pjGkZq;yEau>4r^~6HnX_n932aw@qkQ8 zuTJQPpw4~Z{p2zGEdoqWLwf=z_tRHLhxMkAmE$=p$GU6QkE$;0Xc!y8KKB9X6Y);n zr5vhEs;ynPB6>$Lt6NlE?(LN01xQuABS%xZq_p&TdMP$r#UG?n?p8E{D@r9+;H-&N ziFwnl6gKw^tIIc)EYdyAvOC#=f$Rf+OIL=laT{lR%4C@+W5}QdvE(VF$b|QZX@ZoX%w7^W`&XxfXf6p~vvtS3N>p|CZ zf@Do^s+{FBced(wWKIMGs3-IP3i#aE)pS&B__cG&!C!Kk0E2G_CMX5jT@TI42_4xC zQdzO+??UwaT3RV#PB+26pt^N9Xw-GdLy~g}dr*GP{h^%utn9i<0uEHS&Sy%3t23l` zlh@Wruk`VN0_CSrq?aLH?v;ZUuCJemSb4nzEo?~qCY{7N`huTJENHz&DY zbR{Yp@|0WCBfN0&nxeI;j~fHXI+pH2sJeArPOhz%vXns`mH0viQ{g%Q3r326I_8Sk z0-urq12c|2k;HHFR4n25Y)%YWQDOzaS6&&0vSUEmkA;%w$eW8e7K7Xtoj@LDTw#WwYZ*59 zeM{k=P}>F096A^H%KMz(d7Q*3?)AJPcP?8>izMi783_o>B(b&5Uy){oVeghS*pxMf zPS>|E69KSUbiplKv2;&%QfD6D2g5&R^c^4aynu|EfQN4h{9W^B;x*{^JCUszWG0@8 zBK%&=mbtX$?vp4kTW{OPp`jIo(3Dgj>4WrlLY_!**~ZN(aAE)W1BegS!L z<~secjJqqzW7k}gA|CrQ_NM?_D}YfZrqoNrGtI6=wt_oZ-(|C*Yl@KPienR07GEKR z!KYuS=wyfex$J6(^|vK@-zy3taIUw!e3NfU;0~m^b&>mg@l?02*q`dwB`$Tu7`(Pq zX4r}fh^vfR6L|H~?nO>1w=Wdceb1?nyW`tr+e=9Mw@|zF(40Hy3VLkOx!GG3_Ivvt zxY9T)-Yx*0NPJoRVvohF=+S&tmJ>U%7&ctY{p=%Kvz@s0FiGH27mmBfzOn1Ym##al zpa=hk>poo^7_saKyMHtKUUSvrU5drvIi0SeBc+&2cJZ@KRJRU)kmFkXCgRZ$PGL}^ zpH(4uFzS`Ke_-eHBb%m6Bkq$!4Ul1Z9)JP>fj?v&JsNci91)_sZ^p?po1+zw>in(E ze*He#Nq(#sH}>L7j^ccUcMGsbgDf<61$OKiyMW?3zF_QSr>|yZ zr{fAQ2dK@&H3hU{#-H=*M$ditjCvfJ|8kqN{moo<0QdCEzdm%UU2bUioGxK%w`aGMC^qE41>o8+)jy@lUfqMRS>%dQdM`548vV3}VmOhc=^J`FY5@t!f z!<2NqaPjdIAwJJ$&80V8Nc+%b`Y7uD{TCL_Ws?URkx*w{XJ`8kQS%31Zl6Th@p&2) ztW+%i^{`px?k`vOPb+?(R_U$3UVOsxq3vsDsYuR6?Y8XIz2nuIliZBQPHhXB`6m+l zK*mJ8_9m`|pmqCR$7&tlh7J3g$`huL$7>~m#@_54wjVA~6mkOW>0d?*u;B}*#x8GA zKKZ^o*$AlVOd{14mEBpwdFbZw;}`foG;db%ZJqmkAM(&C6Swu^6t4ShAf^2SxS80+ z_C#ss`0jpt?R{XI;mw_@qN6dkeUso* zw=N`cKd1EL@xpPD;7Q8$!=cY7W_w2mw|H|OsLT@#fL%o-0vN`1DORSWD+{D0Bq?j0V{EBvj?eOJ(z ztD5+We_I6Cnrf9tsD?Hf(*CW>(FX@uY<<#fK)#ONh*#P$&(QhU8V0*iKid+x_Mtr? 
zDm7ZPrcf5^HZ|KYs&6@(U0^ho%Q-ZGt_E~^3hU+&EVvp%CoCWskwovHrCB3dOkicc z63m90*Rj@)@ylJ7fch>MDz1i@%eO?Y=SS@>+6V`ujj(t2Ld=l_9tTDp?NizgSP}+w zV(F>o=!c1K5T`JJfB*OtkP`R#MPZg{SQ8a>w%X896Fd`8TLqc7v_@EfOM{8pX<951 zVC+)!Cs#sgSn$H+3`0 zt1Zh%+r%IQ7kn`<%}v*#XeZGsZCeor{sV+j(Eh$Z^kH@~!MC=V@P zGg&Gm=E@Tx?Hunsh#Jwz0MNP6LM`up=U48*>`2bW<@Xp?!UN@Cc|11b@PxWovPuC~ z6V0^oZL0naDdwx4B#2=VCPojBhbdTJk%rhPw5fK53eIDhrwEMew5@f)-v@|-HDw&D zD5vox9%A#S3T?I_JX3Mg9A&54VU)>~)bbzrs7&v5%k%bJt;nXyrTCf^bbrZt2 zL5e)GqII*XQ^2oUig_#n zCftPo3mr zJd#2A$3C&uMrB@ve!k#6b3(Kf?0thnw4{VpPMfWD4O2hnY0)wo4?TSla{7jg&H>iK zX8|+0h1m}dTMMD@N(uXhWjuWe^-5M;lUBpFdEQD}VhGTHVqh*jIKsupKbJ+PMKl7$ z1Y#;wGVw)A5c6}}qPBh~V%qS`ZqkX1fn%+AN%2rQGHe8fZyv(S8Z7#`Lab9(`GMR* z0A}3+sxjY0^`IFwa)cjuDjA!&@Fu?L&7L0?s^ud?WU2zFMq6aRhcLH+M^BY_#S^QZ z>x|5ZR_uL#b-$9%AvmlA%Yp)l$@2*n`-wf2)4N{v6UShb^d3&ctzOhN=p@cuG`R9< zQ;#>>?}q&8&fUh8NQ16qrg_&mc>>!$h`DcmrwU>s1vR0q)}+2_!cu*fU*W9SRTGY@ zs<1DnstRVBU(CN_<>Vpf46Du0f3fhqYAUB-&T#d-(3f+UuA0VFTi&TYucu&>f7OQK zs#W+^v!X9nud8i%s_h29*l1q0&8fEf_~pW91^c&OY$oOb7k0bfE+!y)!Q@O$x*ijo#B;N99j>a4{3|1rO3U;yB|W8dy=96ugOU8 z^hb;Aw|YT>H3OQx6d@{o&sp{%$i9;*i7j4#FumuVZ}+_4Klga{+*9bQm)r}_x#vDF zPhS3h5(iiGefiv1@4A=q3;+C+J~tJw@Lvy*`WoPmze>{fUhZ9{c;QKV-S_cV&;08i zW{QC^U$5u9@LT+PihBQo!8g!s$VY)DN*cWuS52|48P(Pzh2Eh zsUjb8t@pZzVZg2PO1HnhAVevKy}TaIQxhmu6KWQ4Rkh}p-ab*TCPFIU+N*#t|E93~ z8qbiLz_$T0aG~w4qblFrYgAoYDobc8@mfS~tAO~ocw>wTI*`glOQj2x)Jl*uQI0nQ zC8@S1V3p&=loM2ylMIy;Kcm@mL*O-d1PY@f9*pSwMxTon>id>{9TPwIEsY=i=yQ{# zQtRW^K=vQCa+AuBT5Ho^*Jd2mK05ks>nT5atDL>~?a^T1e8OoJ`}r=r>OEsQ{{1f$m6-de4&GUj)Od7m8TC6igIq`z66TqYKxo?9`78a zzpZ=HOS1R~`5_1CwBK`CUS=5HC@0Z*tv4zM4=UhwC8{qG5p^XS2i4!}3eVq6_^6zi z6PWf;Innb-Imt#j8Cy#d@E+ucJiscylp_6uqqm+Ct%t94$l@4TbdW&GpS1qVj5AYXZiII+H`Os)4t zeeWx^zRvo-LACz3K`PPp!QHLNscOEN_5DX`LllQeFZW;p!OAA}@pn`xlprk+ArzK@ z&Ap)E{N`cLAnF=)d_I2YvijTW4efLF-f_r5hR&v&4euV`>M(3z>C&Rc;S-I6XxwhG zi{E;!+0ZQiWG>`M#HcUL6meQExs2|1ohL zqp9?dl;14!c`92FBna^#JEoaDVWO-{V+e5|jtN|Kn> z^DE!zFP=`7b04b;WH^@eyjOr(S~rs{>|JJs8;tIzd4{PcuO5tw2#`oUt?IPIqYq5i z&(~i2a=_$eS#d1sy-b8qJo0?^{OWgRNgHK%<><^`q@KAhUjI7o!6}zJc=OeLncQcw z?w_XAyJcObudrX2@HxF{tm5*1WElv4KoRb(shc84LCDdID9Oo?(H3ZSCr9jN>z!}gf`7#!0_uROhqgl--zcEe#tP5%J>S*44lCd=xp?#1?A5(8?$|j}r zx<6oCWsu4+Qi{!CL|jqwQn*As?Htc|B3Mf0o6;uce#;0f9&#c>J|D_*!m2AhwtgQ% ze9Pw~p0~?-MLMG+S--nOX(3(i3AIf%zHW-S$;y|MhC@)pYb1`4(NLLh_AV1Z$B-#ugFg^2`V&D{L zkn9{X7Ys#spih)3n&E=^%@SPvIZ)_*-cVNQTg(%bYhH;H3+ysj<~v;=Mq$nEnT6Jm z1GSh(9{uVpc8gT0i6M-Wx4wR;%FcLTpJ91#**>*z=5+ooo7wAvM|Z5i0Z%jw#@kbO zR_sz{s?!7cKSTt-KPhbZDJyU2BYC&(?0xm00C%&KtYksb#W9UlW2h>gDQ&Qnka6=Hia7KqGy4G`>|O zUv9LYb=U0Zd~7XL%44MMRmgL@YvK}5B}T^S{>Fzpcf_B#KGxi=N)LM8&Df(%9_j4C zqs*Jq^YQ1wSIzHao2HQY$44t~C85@*g2^~LX;xVTL3L61PPgep$%0p8kD>`{l@4H; z{pJMU7>Gm2W2+WNa(6sl@1>t@_AVG~OoEVGy-8JIkLE(5@UxcTKd2FC;yFYS6Wj&s7Qiz_j;F3MG0?{_oW z@y;>>2{+JFTM1{y#%t&D1=CYwhGrAqmTU7{&d@s9VwrDuk!gx=A@tEsJcs2o^7t0x zJK(B}dwA`dw0KJX%;0p3v#p|Px|B}ZjKYO(EF#J`T9AIzskQE=*4Dn%M{(P%JTxLc zx;L~F{N(elSS@>7TaP3po8~bWE*mNzgvBWoU{o8Pt~wp1C8tj47I1xedWb)jl{+By zB(!Pi9KjJPr=7)ZY~W%-dJN=_oMqf=BZ#{=lHal9qq;W@71(xsW4pv#{8nGv@^IKf zu3n)yT^cJbF2k9>+Zvjl;$h}SVxgL_3`t$Nng>~Tt>t^DS9U3|1}@oObqZs+1V!0; zdBe?5Dk##7@O8^aRSI6)d-j^HZgWAVOpEP1c0B4f?l!_fMzqzqYa zk)Zc$nThw;orJhhueW@EtvM_YDZg%4oCnoU{MzR=lH2{HDaL6&TdB?+S;MV1*`KVH{0f6Vo|Ua!|YpZCZ828&Udz^58) zo@ScZdMnp~uGDQO1g#n;Uw4AXGOzuTtC`13TjifaroVnI0A0lAw|2$-l0+fd zPfR-)hynLUh?--xK>grri8hG*Z&LLO<2VNyJrAUvajrOdN9mgwN!jF4=Wt$eV#N~3 zohM>KCpQiL_`{WD_Agk4Rjj)u5Jv5SP&BDz`2X>w|uMI{U4p&JZu*;SMdtDsX z*y406)J-8a>6xEzPeodB9!Gx>w5#06>-CN~{>P?2=;y71-;psvj;~<>w~~ZYC}vlM 
z-BINL6MnQzs#lR7mrM7jiwbeG9}`_hTjQ!|I@vwXyLFFyMn+L?pYdZAIyoy~r~dlO zH^1oKV{`0}c9`UJI*Kp?yv#FAp2D-L=UKQViqAIPP&*us3U!pgaPM|szP@G6S8z-k zrP1pm(Cq%IK$2jrW}9Mp&2{7>ixWjpbiW;b%o)pLto~gtsy^L+F^bJIJmgeO_(Dx! zM^#~Wv&~k?X=R2Y$!u5*XOkuTvZO1Dc3aW@+1gNS2-wHNAnF4a{p7|C;Kto1?|HHaRG7u9jkx_0uMY(#Kh*-BY~1rB57G3{9_(4jle+x^lQ4O>5Fx!Pgd-66Dbg z$1NDt2ns@4F?uDYRpf#(5S`B=!| z4X9-W^bd+Lh`<=C=sVE~yksN4@xh$&Gv)AHy%oN=fdc_Y9yxBMVONaQ!Ih0 z{RT5>=~#{8Lo)e81?0C76Wow2b#jnp28~o0l$oNysMSH+SiW!r*b3#0u?}rVciG+DcNyrtSV>*vKC6#j4zOhA^9#hG}OrW~~0ko^Hh9sO+gZps za>GMbGJs_co3W2UD7*NG`*WV?cAJW2p2wuODr{Xv3unt_@Aw-L-f^E*5@xIyX4%4= zk;ne_$KI*IKK{l+SM|0nklP{Wu}kP~ zM#Q&{jcLrW$I6{THw4H5BChnQVr3cEI%)V5nYWp;r@UtMGue7GW!ko6Zl96Ue)97I z+rTf~_`}}2G`4owH|L;#SiMX=-U<3ZB3;#MXI8;x(N(=d;$Ky-gs$pIGCxItT$Ep6 z(G57|>@O0r9f-s0Le0b?Ul=^tLy}l8q=AvYd{*T0Qx4bjsxldJ?=n+-k8?ffJroZ} zycx)O=P6({phs`;^VBtAr2outh43B`h1Ziv?%=*aSM_H3&O}1P+UcsEa`gYGdRSQ0 z70v1&e1QN6XcpPSt`a#c+(%dSW>3FN7U-+0QU~tXotJics!C(WF<9dm@8RGa?-G7A zF93`9>f$>}2>WuLbX6};8U)WGZvV26iazr?QLRa++GRh31w_bg6NuaBGY*@*i zNaPB^sPn3HvbFi|d*<|v?DZeUN+!SW5hZLBkSX}3OmV(ifJtuYk*arz%GXOo@SIf^ zJW}ZhJVSVxCq|=qW=6>XtS#KI@b-_4=&rc7c52Kk_oyeIzcOG>}75)m+ub9 zw3fi<@D7AB9zd4w=eNE;a{^>W10wI;_yO){_*d1Fos0B2_Ptr$&+x#-vq;1!`!z=s zMpN{5kofO_Yq__s|Hd3wGe@lQnE{7Xf643&-+m$!f6KTnq{8r-0oOfCySE+e%v`rj zjc)nahCIp*VMX2M{HyA*ZORC!Fr&E#gw8PSQ?3=w@$LUd)f-vjkUl57G3)-I(KqsW z$U1k#k5L{CjOW~}Ip~*+WIL?qH(k{`6Tc?C*NeIkBsA5p@H$Gc3D37=8?b5{p}-S; zoX5A;)^g&6>5^QubGOts>4RNrd`?pDe^k8-bX5<_y3#AWKl&e4uN{?^#_KI983iSj z4DgRECufyj?pN1RC?_0XY~edghvJyo zBDqtDQa?{fTHnuhx-U?Q=P}cbmEbl1HS6}#G}nPWtBnhZ0f4%wK7h!)`xq~I0Ei|| zVThop`yMCm$P%c6+le*Gx@HIMu$KncG}?Rib(!3@US7FV*sH zX2=bT46YP*2JqOiG zeSSV*2qCiX5lh5NfX?}rT?_!e(m*~p2~An)z0xC9&jqxo`}^FRFq4Twqlv1-c0^yi z03WZ^rIOeOw*1{nHEu3p)7YA>2~@|jP{STL^+?rAlaMughh?RCdlrm-YNC!pGL z@`>OdS4hhvmOV1(MBZBxB}!dii)ho2;AE#88%`CJ|)$@i}a-SqJKG3okCt37$- zRY?W;z7{3*NY$%k-S7Mw)ZS?*RErN^*Uj9+GIu?K6L9)5g>^|M)5|Cx!)lpZO+v;~ zZJyxVOAT z7eY@CRd==v?n@aqozc?1((L#FJ~Ctp9cFN35XY6Uhd>qE>dZfzYdCpdn!Q6`7%Z8y zAQg^qcaC^E;m#kpT`1}U;uxs-I+}~_r`Q8?A8l{Dv6R=k@fWwzUw(}~J~mc#a4PYx zK#I0In>_;;Kat;AAirpGmt!Kkf(MG7?49ek`1P6Jee%P>i8jG0OT($Rx(rYZ;2!Vi z)i&c%ZO=Vy%;IO6!xOt`A@gM?szkse2?|&|eC|PL7ZyJ!6E5(pjbHQ74RkmgdymQ+ z-t*}ZysBv4^3Ck&Y3|#56jWH{wf(Qae9CVpcU@AwReGM$; zpxpsvURq+Hf|3qCX6r0T{8jZ%0=)PzOQ|}>w}m`2ahdUlQo-Y}F67Gj@^8NEM-jOoTffs8JJ#Uh)CT$^q`(xf7=Dqo6z9M z^=}v2fCz7u2tP>>O310YVXzVZY~|EdR*bXt`WboB0{d8`aQ$gn~G`7 zJstT4cysozsu!R!lyP~tCF0h78eP@frAZ6L)P7c(7fMo)^ab9GDEiYTeE5@VUpo%5 zaC<-Fim+rWq15^G78U_0eQ+wG&yGnTL9wJG7QwF>22(0MlJxZEDK3RwsRCLd`!MD5 z?*EbWda`95n!}imSM=RixjxbQj;*Y-=t$B_+{l(7{wwL7y5f;ib6zteO;+!a?{jjd zF=~Y5*kb=C;dsqh9bMAXPF5vNJo9#1-ey*l*AJVS7xDhmTRYu$>Pg0zYQl^ZB?kTj zJgHhY*O|hv6^_sdFhj=4IQ%*z9g}U7s}(99g1YkULD{JS`6Rpg<&QNEE#WL$o0A_J zgzH1yxu=sSTCXp*R4As<-{_9*d_ zO~1AmmUhi9dkG6eSrJdj8b5cpzfZnOk0=bw?Hy^g+lJL7bldkJIS7PHOp8HPfMZkbnBCCc58biJ%g8TER+AV>yUjal`byWVffjI;74Q|#wBe5Xp^Xf_f{ zz=Ahx0tZf9^*BxtcKTMLFPiv9V-e@C$w zQ%+ol<(c`uJ)VyWl68y;Xh3*u^>OqJPntd-(OCDby&l05b#Jvd9dX9*5nFq zYxoG(LvTp8-g5u$^HwMV5nxC4LgY%^MblwDF*vUo9oG908?ysFUU6zuA3LZ+S!O0v zz<=OBIvIFf>98J;Zc4xUpR&OA)aUyFeO#{It38+e1p+?V-{}8oB6u(7m8}$K^n@!3Iy~yUY1MXbt8ce%mKq`e5)bzxnjTbfMAt+yS%D)gSK2 zP{%pHKxx0{CuTT;Dcb}q0eooG$M;y8bnV+&+o3exkRlXM9X<|_CagLSn zWJ3518p(x~Fv*Nu!ITe1aak{ovmcS%4GQLc(@z9PBQ?Ftep^!_U8#O)H)}*q(%ONXO|0?@ z)mLEC4--LNE*O(m2K0e0C^$hB>_v6re7Bgcg%}n#*)|epEJZ0I-!Lz2F)}x~bVrH> zh=g=^3Yqh2VEl-jx4a+9Tpnje0)W{wEev`!>I4iS3Lks#4bFm!jHoPQ5&LEa#XdfJ zNhgu@xxTSD>o5b?FopZ}#zTo{sf=Lf5}}5biZ1KjINPMaLe($^M&?=5hlCC~s`plj zeHeOz`UQr?8=Z)tqk58x;I%jICl@c8N7Otm${V4hdL?vJFB*K%w0J^F4<7X}u+-5C 
z!brruPx0aiRiomiX_6rvcLu2tNj#(cL1Zdd431ra&?QuJaoDESx!fVs2zCM2aj&-F zA+LbvnaMlnFZYNtirGCrb!o46!14`?F6i<3cOQ_*a_2H@lGo_YZ|lVFvOwmgcrD?m zmvO;CScf3vi;;RrS+ab!_iC!NlGn$ko@|E&j?)g7cbJW*f}T9`jI;i#K1yNcI>X}$ zz5KiT`D^J<%2%4cISa!t?%f=zrFZVAb>N$OygtcWe#NPa!Y2ls7g%{b-(GdMd-9=* zOFrDRT~r@FIi@O}BcQnAwbYm{pRo44)xZR+xOH)y`F4}L(j%kLv!~{N!v`l?Xy!?fEjA z9TiAte@WrEugzT{|O#4PM=UVD1*8-6RorJfG8 z+t;Va@q)uhrAp;CG5NYg^OQM`>flaU-@4?>h%Y?7!Cfk~b*VQ}z6g8|etUAQETZwonB1}Ec+E*(mOxi<0$_$r%iT2 zrCOuMP4#KPQ0szv^oxF9)#HUHIiSL761{Tv0V6-0qe1iHT#}%K!3fLQn5~oK| zQ#_C~NzM0H(5JE91#V-WSq1mkwl65orQA%QOM3D_k6j{d-uY8TYJ7vLvZ69B^yZH| zc|KirE%m$oXgWhlS5UR3DK90)uBz?Vv&u;A-Zq35q*GP3UYlh&K`(T|C{pxk>Rm4d ze(_IZs?TbDKKtBY3{@Z^)gDK-abH*CpBxbHs~?s7e(T5A&nur)pGowLl;ydQ5c!Nx z*FKAd+;*6bX-^R7lUtTg=XYo{4Ay+f!Vx4|NcCq8ZgaoLoBVcidd+v`nZKz&zD$|4 z|BFd%tHI&0FB<9M5wooSUO06?b3SZ$t^GC0@7(eIg@p@EWNqa7{XjBKF#0F^*=l;S-&-P&5PcD>;KURKl$6A z(fDTmBmB=?trrH!wXf`uA&o~AgyRXJUs32?G?eEXzNo~XrIPzF*AFnpE2i$X&m969 zyZdf|7aM#xchgs-`y$RLTDVdp?h4cQI(FL(PMcX4JGAKfzoee1l`f}xFdyi4^sU(l z>2JJ}lCd(ux6-+zuZ~_fw~H2Ah)Nx`TlyJ$?wcW9)w6R3OEGeMAGE%ti(Jm-sHa4` z?L?{O#Y$90*r>MCvVkW9-yJ`r;ukMnhBa5Dz-3=bYw3U>e&Atbqy7e2xDy zn?C3f;i4Yb)tI!GK=@deir74MrrncchyZheHlkBx~gZ1Lp;KT8Zq%0 zc^;{HTvyVim2=vi!G#+ER*BhnmvSQ#vyC^9;RJZSB4;CBY2J?+T$Wnm3~lv;8xoKE zzT@^Ahdol{X(h*ao@7}{i3caVKsw>UHDp2#9%4s@HhF^G zJ@3t^qpwocxH93Ec<5C>RLk4&o9KLz@j?O`VuIlq3sbYNG z_nn~&9*CD$(tZS>;=VAJ&{e%bh_gmsI2G!;M4S#pIT{f635D7IDB}&~w|OYV5o}pI zN+FQhyr(cM3*C-`H4LhZIy0qGnA>vUV#*3NIAn>1$RP%vgDWS}L)FobbH{1CmbmiR z*)sMzRJ0fOH0C}N-S5n(W{VVO-Yjx#N3a>B&7xCO%fJN5f;zgYM+k`3s{kw?^I+z_ zPuA(9CVwaYRrQERs$Mu<)wAH6r#!mL9JEVJ*78qo>MqalDr0DeJ5d65jS-eOMh`Tk zE*^Z9pswgz?xdJgLrsItriK8>U>EU32F{<(>P~c34-aJoiXoCA$nU7vAZ#fElhp)r zf&jZoWy~3bJcDtx55jzjw)+(37_{0Zh8bIcI@rK|P<)&UU}~qJFr_5$1jYpuLnrk@ zaB#=Jq@LlR=nq`&Rlg%r&j|;=ItV)(8s6?qgW49@F))z3s90ma&|PQJ87a8K9VI%c z_XrPZXk>3sLeE!fTC1UtNIhpT%W{n$HcQ1bD>N%hYZs}#oAq4ep1u_6k}e6>j-Ykr zPx7GNJd%Lx)-FiDu#@Y#(Qi~uV^qCrg?SnUPv}993yd) zt$rSd(ENhYm(J-gKp9dRLN=;6>Yfzh!G+b}i95_WM6ee-q}~Gch#+x{uhb9?H6DDq zJ{g)f4l$&%T#kaQ@iWH|XfG`RYTx~sjWH@O{$MZVs+?C;8{~VDmHb-t z;)`^4NJwVcS7UThui7jX)kr`Ig(x=jzkx-*`A%#xR8$CN;-nYs+$4q*@ZdunqwAT@ z3n}PY`*J(Qa1)%=J|)4Dc-zROOMj*8kMohLH%(Xd(9l+*W}Zf}A*G8zZtK*8zrM)Y zz5>qkq+wI>c?NxohZf?S?U?oVtW z#j3zqHdHtfoP7JW2BN>a>9tNT`~`-2mk1_oAWtm6Aj_1RdCT?rHP7t7DI}VblR4_~ zaZ>P3eQ($Ks<|vF z48V8wgwFKWhR=I$WC`S6x!*$E@6nabys7xWg!ry4A6?|A7}oRT5gNiU1bKLl_0h_6 zSO$Efn`zM~*bXiBw2u=l4X+;0fAUVH{!GF1ckrgi9ga$UiD!#71K%*s=D{}mh7cd! 
zy46(MXlf>dCEOlto~%hJao9x z%oSDES)avK#Ka@_?nz+_8|2*~w3B=C#XinW%v6g13b++NEZskhmaS>55^=K7Y<`BS z$-1ZB#;K`Yq~cvP_PA)u6FSXQY#`V7v8pM18P$N})bd2oevMd~e*S5cCX$q9NiEq$ zBkhRbM)Yx`C3X7$={WCqdhC+*fL3o3hj{+Wjixq{rRh5w=mux7F(q`9@N$bE@jHdq z9OlH4>kTa=+Rpnua>D2N0>~bC(Y0g<2um%*G5PL~-J`Jk9<&TCsh#pF#ica={*2=G zf$|>A8V{m)rqyzYGlt}BaCDp@@!bepsq|;Hv%3?A(|g&>Jj?uR;O zTo&AsoICf04)6~cGEn~Vturm`p1zB>&9pVm7!`gj#Bc}iZd4xF?eI)}Cx>kRpC zyiqa|yggrMBpd-&Hpa$_)fnoo*mg`o3M&dz?vWe8A@_G}QCvTL`$~Df{pH+AT zK7LN=t=Q9Z#|QL}o1PeTdupmS6LZ{5W3l|Hnb!K7|TfCiAIZ~p-x;6YG4{gV+!KNt~kI1+_MqmXQ9 z6el|yFBc~_H!nXAw-EpTJE)%c|0htrG>7pt`}H)_tLY|Tgoa{5{21Y~XNFl==8Kff z!i~)H<5_M#_pD0qCHdvr#^p9n=6Y`CIac0JpS8UsUh&%J6j0-gNKw%n&_{@pE=$;>(YT`H`9V^~J9Xf4+X2TUh;RUzXOs?yi4Z-dH@?SX|v&{Tv|CGDENsDq084W= zq~~Q_oj8#DJ|kq0Vu*?&rqODs;ebrAXtyzg-{3|B_S;+bNg{IR{@O(ai#X0*NrLMT zl~-jxPK0-<`-9S1p=U4=0>}sgedUj@Unwqw zM@W1luRdSsVCl)2aY@HzhMLE+qn-Ma1gDO9JUS`K1KaWKdc<`9(iO}> zr2#K8m>ABn%qwt0(2BR&I++Q4euR0PKfSYs2EIj(V7QO#$$qA`#~=`~HRFB)7^gSB z?wWssTL-}Lw3=pi;jwi8s)G;=!(nX9{nox+2;4CYfOun71-qZ>k)sxBsl3Y`K%zA4 z9W+zv`ZmDA5?JXJ5hemYEW2#;u3YZ^r#U-r{TnwQ!HLaPWZT}LwZKgyGS`+Bn-`2n z`=Qi1Caks4X3*Mkbq501#DK6{{?%vu?ZcuRJx{Rsyt@i;OmNCm?AF+6-b=Bs&Ly2o z1O(dOe)atnh+WKeqj#ci{=}?G9KZwKg#lu#+PJ6jr&7d!}V~!USa!GneVN@eX#f(^Ud|u#2obQ zyua)pUK`rA^%Td@Y+ZGKYY=XZSTAP$OjfohFL$F1SpDle+~ulI{96v6?4Oa?>yM+s z+C;`@$nx}4Qe(`2V&l*Mke0yBS?S5?m!XF2(03Zn2p{al;5Lkw4}P7;n72A*@E@Eo zKC9U!btwQg|I^d9G{c_p{fU-s-HMP%->5yYYo&qIFM*sU4jlr9HE+Y_y&f0{oYvF* z^qh~_(p=_%1za%=mAUNO_p2y!EnbvCNi2-2V5_rMf2;lEDs9J{4}?HV1+D5zwc$SZ zxi_M1T`IM@mu42%avA?brt6hBN45&c{tT%`xhTVNNhZO~jDJ8&`Mx}i4%COKRnVNC zC%OtW$!MK4gD40nU<9&c^on9L8V;Goe_G9H`tIbG7V7w9y}&==pf2F_i^7ujVx<<% zzNv+|<1qKsAI|B3bTz|+$hNr`UZnn@x_|h*SP1yW|H9`v1w6A%`Ipb@41M%NJjc>q zTi$9A4|d1TKsYPkoNB|^`7^Fh{Tvrzv%_BXy~b5>59`EsGCB9qF81b&2`QH&J})T@ zt@sb0_tLs4iq7XnNNj!qD2r>`nM75k5k|mF`j}5lji(!CVP@#h_x@zJwFUY+=1U$d zWe9BtvV0bP%I$0uDY#wJuE6EXPtYtMB0)>)zR9$_q~SpRI2q&a9U8Cg=;%zgw!kbz z0~W>SmECT!SH||y+KTr|0&B=%*rU^b**u$PSC~sY!MbXUSX3+HUD~1wA`&%z{1(6i zeRvHG%fkbk@K6_XNJb=Q2_A9_{HYU-kz|#yW0m}ZWPx3m>kd~OS#vZ7!f@d{+zu-jB}U*Yi5j58jGo|bNs(z^C&U@uzB>HzQ&grZyx?m zPRs==GT8v5OE&Z$aj^Il7t$1Ox*2yMhH5pyh*`iJ0}SqT$0y_^oUVh&4JM@0BpD3< zvUyt{9pft#^70aKtcl2a{rsIoVjf4K9j2HksUkS(h|MD>(NVqEfb*3+$@O$p&o23S zaB@@L5vtcZmi%%jnL=mtHlpZk9ty@vdxWB+dI>=(kqs$5JSoG*f7v`hGM&vcp|g3I zBQ|e8*Mp%x@~q)`wss$&UfPGIw13z7Y9ex+|3Z3-pqRDo3AgZm`ouD@Ni$9mmw2aUdhInwcMFV*`XfoxVXIesaDeoMX;@ zF(Znu=CP$QuwK(2wU@#8nnj@;MQ%xGIPTFlK=x!;#@Wo3TVNR?lFsI({ln%35KNW? 
z&s~G$iz0063*|Ge@FaIom>B=`>n?S5}*7M);&x!Elex=AFHt_djf2)<10CX|rwc zzii&@0vH(5I7hFNupuy?&pMmfWqTJiMJRZkU4n>X?P(VBndyzF_@Ys@jZL3u;I${yC#^Wvto zYblVr$oIxKAv7h!#d|3$r&H3aSotf{qCy^bZz|O!J}&Hu=WMJj;;SmvtSYy!stBp7 zq_cTf&WX|4JWAp}Y+ly9*-aAz5--LkyQ;NHxH<$G+oU&$!8rIK!@-!rL0v#5<092; z(eE12ifPdq(`HXv2Vxcpq={zI)C6gIk3`KUW&7#iy+{jQLSuZj%gwbb`J}J))r)*j z=7wtNsNPgQV}{t1y*J4oe|{lj|m$m6uJx!Y!mVKk$Z0y-KfuIIWsA z<6K81kY)f-iZjH4XyU#Bzc2{ZDxc4=J1 z-ldXgNGnXd#s?HRhic7?*CEG-(>|F&xitN{81Z zodu2On`QiIDEVvV1&6*kGV{oH3qo5cL(e9do3Dh{Csa52IKU+V%rj(VB2{m^85Ez) zGK?Y5;6abkEtAc%hfyt`TKJm_NCWkt8Jzie4=jlWmz1GCiG)Jk$z+a;PzvCf92%Tc zO%fc*9o2f(=u)39!t%Z6Ik|hn`P17vj0OwyzuGRK{0yxhql%ejzVfNA zrz}&3LXpCvub2zl$^PKA>tujz`$hpH-^q@Kc5;U_MwfzY=L5Ynsac_#tpG>>E2b3; zMuCD97XXEw?KmmRr=ssr7>ptDVfA2`7gwr#Cfrn4Ok_r0o9(}`5a_fz*=g)IK`Nvt-}b<=9(U(>EK^ z*WoSjZl!OExj#~=f25W4X`g$nus@@>ZzxQ_Vy^!q;(hnIzWM$BxS{^(*8W*pfu&ZG zu0>mPd;4Y$>4!JZudsnGlY#Y<0tY<;M(JYIfvzgGt1 zScgESL(EPBh}FKe6>k3ip@p!1y3!|DG&H6)yecp(A~$%fja!)1C+9QVv^k8S3493A z{cwUbJe4vmCHLVqE1Zai$De&)EZZ#sXW4Wdc^)=$#^-~M55HV#6UzZ;Z$mPk6`r*P zvRdsMBtxuyKE_6V)Ohi+XX@kS)OPdF{AxU*htloJt*`H5W9&(N;)u~u>?oUDdr;A* zsL12uDiV0Xu|TUDRXvpRrWI@l5UUdvasxKIC|(@qC}xbTH4O ze^h<*b%FMT4iCt!zm^W>MVL<1e4eN^jn71bPk*khBSSI)zE6rKo7={9(UVX6YoFbn zY$1(*WSG#k)4j7X30v(S+ymtvkOn8(d$m80I1M%`k5&ryer)?Z^?7iFmNhzSpD_IS zGlmZ4QOI3ar|4i_z3KEgf+f6sDx}vYc6IvK-I--(mTyioGnzd+2Qz?|q&>mV{Z-PT zAXOSZ<4``WGNeaJp<=?OYr`hym}?X;Gw9;kDVCW|d!{WxcwaL_*MiD-Y3@V+?7-eE zR%cG!_X)GkQ^YizVeG72@hfi<=?{!W?9!K$mj;VlKRTIeYfXRAFMfaO(wOcc==7!e z>1J7ljd7)7Ho5JobI4le_p)_VBiTz|9nD7iix`#9eRa}V;JH5SEx6!TTiXcwVp+^4 z7S!m8tc~|<969~PKYVeOZ~msvbnx`zop71ZwZ-X>YpBl)(nBI=X=#hGEQ?Xo-zY_- z#I1s1r%KI3Ozk@RV48&2MG02LvLLeHT?JS?q81&2Mv& z@Y4ac9fSey*OK_10qV4^>e>&oG*)?q-O1HWJ$yY60Kn=;U#CRQ4elZU0JaUp#NH3q zQ=1b#P+Hn5_hm4E0zd))*C>pFcmRA5C0eCq1AI+mhHvc25=f?l2MiVf$p^rS89+${Qk6_c41ui! 
zj5A4png{;8yupAZlGyz~ivTpc{yDnQ1#ArL;6mUqxS|{L;nQ5*j(16}Yt>#5xN@4K}7JvX`^&J6rVqlDz2!J1*b*v?(tertXz1~^2lvoyx zK!*Y=1AU_yuFpvtQ(z2wZ&?s;is%gEIb$9GPs;MMp}Ocxi(!BS4qI^JH4ZL3JMDs+ z=o?v8z)e_~l-ta!NXAJ&yg$f3if0G3aiG36ANUUKj7unqHiGli_%2R=4*i&8+LOz`N#j z=I@nE04vwoSMQaL6&V21Er_z`f5p7DKawh>BQfv&mgZZ&7e`{=RQ*3<-a}D89(FPV zM$2zg0H!4JUVUBw2Q$=Fo+6)SXcO6LPIKnL>&QMGv3j^M^<(R-;%kn;bIQFcA0DHh zTL+q|4wRrv(l)kWN-1_|0Pguz)rV4+y?GCYLmlaQQ6{B1@wc?o$?DhG0g~SWqmh)z zZsl}E51arr?&Kyg1zpg2JcVmyRmX7G_*)F{R`k=iU5fdgP&*$UM5*q82Ac>I;q-Tb z3%o%M1I7l5DHo)bG+oA2vuvKX$Pe87bAg_RL0WqqjFG_Wy1w<{rxNyFODm1C_8(L0mnn9ina^QW5;ffPIwzP zPLj7{68P(?D6o;txm8#^1j7afIAN~@&4g=d>Q1#J#d<^~Hw5jvXXcY`UCqD#eD`YM zWDD2T;!vGM*>txfFi*ec2+U(WI}{M)$X)uJqa9L0`&Y~(^ZZB5i~K9*EgVlsk}PoU z1ORx;{t@$TX*0$_{m!t6!U4uLRA>mVI|y2=A>pg)H%@W7jwAv&b@}qBg~nmVB)Jw$ z2wt<>NMv$l0Ffe=oT#_(@om#$?H>VyWle@ikiBn<+k@+)7ZwTwNAKRA4js62Z8F+C zNAJ9r$dC_2)uDft{hT+&=T2_`KmA_LNR7Oxc89VGOfnY-HPX&%?Wd`S<=P^FIBIlH zMJ`8a85Y{_15`v897jy45rg+;84TS+k9R*&b(Uo-E6Vif$f3*F39 zq?&|eyiOByy&=d7jkLa!`XhCoAxq}n@eo+XH8D#i?++z1m17wf$Na<$R*V(S>9dK2 z=<{55D}pd%I(aHKwIsY<_&@I$sz2Y1^(y~zi=tb|`_cc{(*!U)ugqxO?;GeP&<*>z zqf1@fDkIr#{^^2;v2g{M6PHqM?B|R}o-EI_PJDw_+kKb!qCDdW)ib?QY!qdcUG3&B zHMyLuNxEGlK<0JOXxt|3M zARMiAXc#r`i$tLPn{M9BcYqe%he%dQd@02e3_nL?K>sgh9&wI0e?#9<3d3Nivd|l@ zaSji)CaTL^J>?nUJNh#o75Z%nn%dV%JdFEH>W}ZbQr2HQ< zFBLcgUeQ5Ci^1$rpFYIO6;1BPQ>mZ>HcQBcmfBp$$NrTV6X z=9My!eHchNDZ{OMpbW$LZs;6yHhZOcR)%q*G&L`%Q#N)JvFHS*_+Ntl`?$e z5)IIqyqG}Y)gj|ygOXStI|@iSmw;%v0J^4r+7lZ<4@bFI+4bQV1kjOuzdEY) zf0VL_pn**E>PFVFYP85;p12Af8?2Pk_X;gxDRcqu)pdDt6M+1xSP)C4hu2kX0Tl4D z8kC87|$;ii~u1192StL003oQ4A#9TmaGMceIf=z1lrx5GLEAuYU0Cx>Hz^36g^2| z!HdGzKnP*j7c0Vw6sFu@Gky$6ItfP%(ryCWc!%^Pu6}eY!~q!d>LRt-@}poc0GKw< zcwWMjeNF)IYxX*T3wNI0=p@SicoUGv#K!RyRn2~fa=@$tUhyN|$PMhcv?g0B)iq@KAL2)oc_>JuZO(G3c7C7Xt*T1Jz$r)da-G0X!5yu$r-( zc%bV`XI3~gL}@jUV4GztPw%l(!w;(9`R4$_h-C{)Qdk2wST{K(uX4el-?n~#1(wzj zg4BWS0z`rzz=V?U1M1E%X)ufH25ON$7NPo|G>RMYd00FXjHW6xeVc5Qo#!EA6u#6l z8H2tkpk%JW2pXt#RsSTL_2YYxxYL_6VpB0mgqtFB^+}9ZB*!Nd5|06zaD&C@&;Rs`3s>W2>{&j}ET)0Wuudd`6h7lD zp%Gw%R=*=-GJEOdIskas?S*jim(_Nh3tiMwluEl@#WCAg`r_R(jCmIQy_yG?G`2ni17k(tF1AcEs-M*z{;2q*?88Ng%0NJc1R=;6dA2kerv?X z68KV@p1&xoPHc)vX9JVb@y7{jsceAH(XVzaUj6#6r*N-KWvFBK8wBW#MPj-FgXB?HPub-?KE~m6aD^ z)eMy|oK|$wk(r0bY4AlGr-ahYJjJqNc%rc*fMHmLZss{S;@s(GUa$}x-IVH(o9+dQ zrki;d6z${$49DwTYKOu(qoApx-^Y!z=w@DCLJ}J=H32{IubBs~{%hs|ySXd!pQJ3~ z<@f>FOGjkhos;-CWL+ix%+WKC?>?=iaaVVD9>;qwCk5PpVk=A7wZ})S@WkjoOA;yX zEP(COS5}EdM`oViW5({6pIvolTAD1sJ-BBDI6oT~OgHn0l_ML#JNkHlMkC=Q=EGSj zaDq4iAk2(sA3npG9dcymAnU{LJ-|{y99cG@QZvL~-d~YOC6c%ZRH$Rv%KSU(j!6HQ~ z&4Gdz4=tk6l`)6sEFgCWPQrj^Fcz@XLE_GUoSg-{&0-M&+P`OkT(u}9#OL5F5Q3I$ z2Crxc;GGQ%PAAK}odIrq99ya-uUrlHm<3mxCF;Z=H+qP7)l%5!Av=qOfS?sqHX{Q| zq+EBj9Udy~Vn*3WF^yPCEIT$_t{E*)k@*ON|EKflHhq|x%0=h&#Rbz zph=QqR+RmTsKd=4yyw2+RQwQg9EY`Y$)_QW*r?sv1O|1=_ ztPOpvjqX|-|NLN=aNZ==;!XnOjCvIJn6=rewfTXy1&fWPppB(dWDecl6|_0e63J<7 zV|&-eF4g8@k&S(kji%2>`>!K5$08k9Z7v_!I72=;9DHfP?NVXS6M$w9+S-8>pdoei)w)Y4Z+}+PcumJI&M{{`WLY?g5+{Nxx4q@sa5609&#;#|N5e5fEjYq%Ui{ra%ne}aK(t13@XFLaYkwCgw zV1T{TWS(=7aA(yl<8I_WM;Sl-!86)*_T5Gx6ADb zgo%pJ_O(S^HQJ1%BKzR47uAXef{r;>rEP$z#X?YH@?hb9A z9h9W(Jnw!0YFpk7y12*Tn1FG7Y3kVNlpfV@$MnCkci&G<_ido(fzVq*?*s^4I?`LH zQlu+IDG9yz5_%v&XhKjr(h(351QF>{6{HA=f{1hh=_p+!C+hv2bI+dHo!wt{XLg={ z;rl~oa(%Ar{dzIW8f-(5FrRO0GtxDGboNef*Cdn3>J3nUo<%{jYt}S-)m+caj-7<&Wim2VHirt!_-B(+? 
zb#J?kD7(#jcHfHZww~H;ciHWX+3kL@+uOC<|7CYTWq-(F|6SDnhl>4?q5ZL~{fW2z zX_WohJ^S+_`-`XcKfCNN$LxQ7vA^22$NsX%qlV#g!U)7*096>F5sb(VM(hJ4iH4D; z!^nzZx} zvvtGS$Kf0caLzqA_zKQN?ZD0Hz$4}WQFY)oa^SOb;P-J5h;|T6cMvLe5UzK)*6koN z?jXA0Ahzcqe&rxR?I_9VC?)17t?KyyGOG9W#tE^}v-rEM#Ma}^?)m62B$+`E0fclN z7J@QIlG%N9cDkZ=r#r6maX{7^db2|7>|>;w?w7B474AcskCqTV<{kicXK zq+5wys!%ouXhe^53|FGt3Cdm2Rm9KLzuWbYix93#Bqu?jh#*rmA%OJ6X~8z!>Yd>& zgpqb`RTHidQh*D>4eNV=j%IX$ED^@ECJJs6_zG^sxwv6e$*oCU40PjDPtZv%o9Jk= z2jfI4q%LXQ$$q5h$fV@_`fm@d(0;S%+$0i5SYkxpH;mC&M30NL1b~l=I0i|WO}(Bh zOAO(Pzlqqi>n4Ly6XjWvRDUE>lpuISO~OmMZbP~r&Fpb2okXpbScDXnOPW}f9%pw# z9Fs%jJx*pM5hsRBHt*RkS|U{J`I;#UaI7IwoB%x6`5G#pkS>d|D@F$}uE(69{Y(f{ zS`%Hz$pT2%{Se7+VlFYH#0K&SV<+ec43UEbu|X@b112G~HL+d?V7)|QjknQ)8=i^n zzBk(KZ*jt!Cb>Xh#L`J|lY4IP5L76F%ua$pr8EIPPWVvXEe3&xsrslrbF(`jdAhJs z&rfXdY%_w(Gp5D|-o3X&?ecD6SC`l)l{%q5WG@$i8+{QbBql!=b59KcxI81XV0Md{ z^_7EubLmc8ODBjP^3j0p?ndME!!5rvAIK&;DV+e{cbx#Z4p8$sN?%J6^LhDk9nqSg zRSJ-s0O*=*g+V>sEWTmulM#MfZdF7vt=kfLyD)zAPCW_o#3Keo1o87B*(YJC_Qz2N zu-Au_xlS%EZhWnY(IF@wpDnQ-7qvW(Y!}bM&5e55xQGMv9;chgS%B}5Pu}bv$G$&y ziho-S8z8~-19Ibg)ki=8;(Jw#o52#19H}R~<$D41HcZwl2DydJ@ph^4<6{ow*zgAn zctTG})CG=YLw{7cB*HL16ntGYa0zSBChTKifuA48DLRtWMRC(ZdXwa-*b%#>hnCt8 z5@2u)f3SJ(`i+b=6M3S$uYwJ@&LhIIJou1?^3xeY4WY0|kQ?)&F zxu8c2+gdRPX)Yl#AtBZ!!8i8^R4_z$h6rg+0lKh6KY5~cY8T!fkMdTu6PP??=ESHw z5oQr5tNb!?{&0^$1I zv;mO{4qi?7hwH@E<{&a&k{N+EGp-Q#tCK^y(D#kQvhBlie8X~mQ=*s9c}wV~ZnFHC zHbaS=&bxtY+`BD!DF~Obr^`VDA)dLLhR0 zj)KNvWfR3pPes)d#7a?uNoWbE8?WH;;3-K!YoHwTJZdRk@L9Z=+s|B#r<&V=O`e1U?n)iMy+Nb?}S5$Jo2W*!B3PobV9U@EDkjfz`oXE)tg< zA`%osU1L^l_SEB8_+#CjmjSz5$Jb_`IY{D|JYMJD zOkU%$>+nz)ihzt&i&SVqqvhesWTx)q%HUm>p>uOLER7*^%$g2unv^LSN4Cg4r-E7S zX}@x*yv{I zr_d*>hCOsMqE6bO^-nt!}#uLI8OG#iyu1a!&F*hYwgar4z5DG-F8 zB;izg-ggR_4U?TJ)jg*l6w?UZsDxq5m>(xIQ9pmlEtww&8c9}B@*LB7*8Ccg`;J@W zlS@!)dd;9`va2V#(abKtQ510{v-S!RcjIfyD_SZgCBT;qrAbVe4lTmb!9X@LCR%B) z?en*Bsj8z%Vb15Sj=;S#M~w?Q1fnTTsa*-Hd<1LyLBanluyN?xiyMn{s-G~T$EV-ysDFL;4g^2TK8d! 
zs&Xtv`dPv%wO}uq8A$jdiI`ubCCUDJL!3b-37%&{vK#XRieH(V=<6Z*>~SyS4m9)q zl(U2}N>G`H_T@&pW&9}JVqS()4Lr46$cxnS zS$zo+5z*{g_`^Ze=Cg0f+LvLu`CpkrqFDLQJ$3^a$F74Mt zp)%=@g11w1AvW?=fru5dR0Iz{)=`1Ry_1VH7HNEkRFciQR)EcODNdB|6Kmw914y%T z5u(1D?f{Fs4T-nUy*Q+>5k=+ZMaI&6kR|wr`m0c?CuKsF~2sqN@M(uq=1yiIdcz_6J85zbNYGf z8?dMkCoeIydweBO%qT}2#3ek_asOtv;Qpj6o~uLG9pD^mW|y`$eZBA(8I{Jhx$`AW z3Yr}>!)4~;@fw7(w%_|!Xav4UA`UV0nvBV~<8CAMbQ4_5b***qI;r~oVAwGSU7LTp zcOmdjCt^_f{58Xh0=*9*0j2k5ZlEMjI@GOfNb4_aoMaRsL@PHCoVHdYm}evFppF^tr2LgoF}pGrzy4|2_&S8 zWBi%)M%0L#!eNbA_^n$%4f!_E%u1c^*A(sw%ga)yxr~;%Er`Wun3UgBy<5$H05k-m zk4geGZyi{OB$|>5L^|+P3Tm==lx4^!0cID>Zg8c;z9yG4L0(bIKJ~tr5uNsliOKW^ z%R}fVI8z6fa-AcEKXTGT)GC7c&P1OCe_V#X+cMF^+vxm8|ID-TTvcaM4NxTwDT1(^ z_WTx*!|Cw;JP1HBOLm}r_MVg%NZ|=g4n^}!WtDdM)0>@iS|7YKFHJ)~7wJd(K5Dpqm8G!1($N2$svG#$2*${?{qm%v9v6$vNOIYZH?_autN3scS5I~_d|_o6ldB`GoA z7HqN7G|BRh2N5olHXhtyGch5S{;5f&kTm%H(DcNZy%%)lr@_`rLy;mA!84A&;)pkn zT>YszUspm(HBTAx?LG41BgrpgDV0m9+0do}u2|&@(@0ywL=+?hUnI5`g>C!}t0mM0QK!bo}R&2KqR!=fk-!-7u|zVutpOUfp$jrTj$ zQnDQ&IwnlXULp%oA5?g###N-2*BdJdP>=vH4kUmj^4Lg=)Xum-to<@)SZy8=4&zOf z=uQ0+EgYVwzG=$jd5-#^lB>r1Sqq2f1>Io^0DSt5=f%BTF^uRi_L6<(HoU3--I*Z@VvF`(BF6- z!Tw)7k9kt!h?C!gXRp!DiG~PqJ`gT>*8_*=VTw#}c%E*FR2JN&jYG)G=ybANZFWVg zb481=?;MBc-5e-bE*Co)s5WsZv2rjz;^=^*T6Qg~ad@81TbJ8k1P~VO=Ky?0u3Y7R z>^$!K5EJF4=-+l8zBWB}NUJ%4)(`f^P)m$5UJVITz35l{gXgL9XOYWtU)O;!I1r|g z@v8sE^E4if*-+=)rm2coJ7Bp9;+BT-me9;;P2kfx#EZ_R=oM5B;qbfzEzsJ7qp*<= zwk40hY0exYogdlVG$LfR`;F(>j&bAgJexmw9=qb;Z#?h7@i(5w{s+&K;HJKl$Orn3 z=P4_)^PXtw74z!%jp#cNy5aCV`hV~|*L)nF*YhfSMLX-Obqga)9I8r%NHXLPo@X*6 z+GNX;)2sCd&vOedu_suU_D-3&8ro*^M3x=TZ}o*&C{gl6N)HLrv8bvY`VX&DRvq=Vx1 z&(v+!G<|m~%o9pg7ZPaCtdytwwI9xFyYX99tyx*{Cf%x5v#FNM>s3vP^Fj)ghOLQF zEr%Nsme^`CQ5J+S4{{+iu9{<6m--K~Z{0LwQSE%Bs2j(41d#J`G`i?Fy13(UY!G~| zX+rI6%E@Qm3Gfwpom>Z?T|=wY<%uxIt6lGlJtR)}m1l7Qu+%9xnRmT9Lh{%2e?A91 zJ4eM{pgPYr`HqThjN()K1oyr;+bL>O3>CgS;QJJYwI$uDhItUc?=K&7EQu%L-LLJ{ zqI|4oGU_-bNam-h<`&CqwGNvTh&F<_rSOaH4pZo?g+B{WFu;uJARlgFBb_bR;f zwI$PXO%WE>Qbe=1gsSn5=Z$dlIuu)7sfh@;=_tTPe#vK;K>2Nj)*m@fcR|zsubj6D z;gkO(=lRuE%l?t`mR8kiiBhF_zZw#HmHn3UG_S`c)mE$B1Vr@3(+*YC_X$WIp)_%L z9(|U{=CJW!Jg@Sx8m8#uIlvBw=e7R9^CY&Pdz2aB@VwD7<%xgnJd-|DyhYq&g#_qt zJFicB6RU3FOi-H`>qmpb^VH$3>P$z$7h zh;8XJKyNu7%L{QXf{B9vN}z*JNy`vlp)-s9PVPgZI>lrF*+Q1^zh z;-y^*e>@)98s4KkFK^ZRbbcuQco&as{P8&ad&IaPMl6Hh$4?4711S(P-Sr|3ZONrx z5T%CMth-rr*7a6KCt8mL&V+JWq7X$c+Ihg-)j-DCknip6j8gC0;+R4x&*63?xrvjdTB-S! 
z6}J8oDrT`mrnC0$rQxpGCJG?QWI$8H8vjwgzL2pdwlklt-7ub8BA-qdAfNWhbE+T5 zhAJKbj30Au<9fE%g|{fJd4hVUqO6j*XN$*(rC|&@o&7hhG*r2kMG2R$ertr{Nzn;> z2w(E1M5}XkW~Xe6EPN`fUh=aq;iIT?PMfIdDVk=e*L|O*EPaBpJJeRO;%hyIi8)ss z_}Hg$fkdKL$rXc~y}}ckPU{XV4xaO|fEg8g3QE*v0^Tpf!AU+_iDvfUL%=f20<=pr zz)nlq&4V`Dik_E1oLT+zN<-ukKtXqUW1%7Z2qiag5LsD7xos?Z`ACgj9nTmWFB`Bb zT=z)}$xXGEEgo+Hf<99V^4@)lT(wdcF{xm*s<_3=9Z8I$Ea&3fTGr+o(DwSErPYgu zJAtpBvAvSypwc%bByb^+xJY31a99z>@IN=b|54{q+vIS@Xuu`Pg^%>F0Aj% zUqL;;YNUSxAh$_TJ)q}qh~tI?+!_*yBGyz++6cZ=Ancib43a&osaRar}V+GUC@YBTS^Sr<=R*2vK{cCP$Xj zyBT-_n@rx*L7pYDohewt<~>v>Ot~i!LVN49Al-z|$92vd~ zGU^X9e5rYJJIHvy*_h&m+I3B7oU2Fj8`U#gJu<=`-ISIwVv{jjZN7Emg}I!JiLZ=B zpv+BknOi@CEbbm#g}yM%ePI>x!WeaY+wz5VNAR5z8S98(tDiwi%)!#rGN$4(cRtJ9 z*$%$jaBR9CY)^M$GjeWNS2 ziF72@?S#C1m257pv#D(g}b5{NzV)_mz>AV)&~(&KrEYlo~?Xp7hTmf#54 z5XTcg0?D%~1oDHqLjhNKIP$(2J=oJHSgjfT8LL*z*Wt4Wk2mEGEZDxF(@uCHB4aXEZ zd2V%Vp%N;;icfRHO8XTbVK1sz+iO0DKg5s@&(>*pE19qEf_m>;bfO4zfsR}t>85XDiYu9=RmMWybIj_!S> zSEn7Xuu457ojpM1{|(ja?CI$6`VQ)Ra|1WK9QZe?hwdE8P#&&E4Lqn&=CvN|e>0f- z2i0Q>bIm|9<@NV$v{nQnrRO^o{BW~^4))PV*4fCuH%N*l1o=Scl(@=!rpsZbCBReV z$f1b$mR%npkrNTI>;rftgl#=D(5dR!@gJSPW&X{tDf3un z-L5Zaq7Nfo3;R)BB)`Ne0U|XjOFw!@E1?))6;LHono9#$eWturnccV?M|H60X}-f^ zFFR9zEf%6EkjOFf?y>zi`Yb%qWt8G&EJYR`gO{eFSAS-NIyt)v!)4F64kUXPk^hNiPf)LV%FoECqDuXP>Bsl6wOVjWVa^6San?s>9}P^=80GKrN= z>!H0=|8%2!R;gdiuZOnsgx99~Qy}u2IX(%O=6OlXjBne$3?xM!fX5FHsEbcB5q$+9 zWDsVzseF>jW6MgdeC3CCABXD2628CPjG%GUxyPmua&$)Hq?-xR%el^CMgofY-%&jU zZXtIqA6Juto%&X~|Ap$&d)z8ymw(tC&Vco_czU@<)n+88!vCYAz<)wbwg9_Vqa!I|JDPK&Hl+SxV&Osy6|1D%!%F;2Qz4bF9O_%8+ zz?OP^s%>*LPiiuS4bwwF)#>(Pf!+oqV-RQUUhHZ08_;9!1oczO%wsfj=FUL_ARv3@ z0Il@*t%j6`{%*PSZ&Sto`gvIDpwes7u1sf;gWrH&>r_%EfHUe3py$s`6iEMl z1qf116%9=bvH2RGAccR*v;EdwCz0Yia|p z{k5+OOt;!H$U*Wo_$*C5H+vJ}#Ksy8nvi(QXnF@-`dJu>^80rV!JY$@)|cD@)8@Sk(~4LvmE2?gT7GXml@jz1FWT zOuKEn-E8}9?{LwW?qA&z`>%+=^SkY2B|ppiXQzDji|Jr`XbSvw>r)%;2bqw_THQ+5KA5CXisg)~4(BY29De+rW|q@jaD!Pe zU9xCJZwi|eb-Ju(f2H-I0UPzBYYc4q({u6XYk$-aBE6U+wo>dwZqG3l$dUMnkpNHw z{RyuNV^6wsR!&PT&8!ZWBh#e6T%tFyS`j+!iK9XEs#o`46^AgXBtE3Ncz+<+;Z2UDhwcp%+m-eNa~OkHf3z9IJ4@Q^eJs3=n;g= zM9vdvY|pwnTH?A!M1oUtX?K+-2j{9ozd1d$?A%1v?6vo3^QCU~bCp zFkSX$wkB3Nd@}96ai#sTpu0Y~8M-^x;Zn%Nz*v7Ib9`DG{Pf!xr26dSoHtn+%oIY}B2dHm!^E*Ylqye%18$=flJA1L8P z!UCfpN`oPOfnR4_0x2nUthh$_!Gsnc-5P+%%eCbW6e)L&c8S5jCwgGtXliVo;2Nv6mds6-Wf?eNfKK6A|z0l z{CXDv;u~x+DafbF1uS_iff97GGX5qAX{mmm9f*7PG%L=I$>=z9w*1Z{@ zEpi1-oAWi}ZoIlt5IKBfPCa`fk&&Ea?Wu+GF#heNpj;s5jd%h1<+@vEeIwJ)UdxZp zO(B{4A^eT;#wZ7xCudO#QC03X58_>0Hz!mS_^Z2QzBi$;98 z8khWO&BCzxABJR@fx2ncyGiPPrju*`fbZ~p|MEqP;*PHfQ0F?4}DRkd-E+X!HL5}2N*H?^y zebJT$(MzSO6K_7jd!a-8imhIfGkVvonQwUYSzHZo;)CA=TE6huv$3qB1L!a zkj9{XW;vRSwWt>tK`D5JzHAMxHj#$r|v1q05^0m6X zL4F&L2si9#I|A|co2y>Ve;dzyMPj$_zY(!47jV`oVCP69L$4=%^T1!^<68Ww(i#YO zo~F0Jo4qf$Zafz2;qirU4YX!fct0ZtYAJM%=~8f>%*32$pus@8Iyfp1PowbZQVjoeBQ$O=}7K zObvRB{xAVZ7>DQmMfJpSb{JG1ja@SOj`bDX8!8l!xn$E1`c@eGSN;gvQ_o&4(n=jWHfhNm{W6UcAa{ljL+o-w95aW5bEeG;G#no$&j}ldMke1Z(SSc?a_;Wayj^Dh zipR*GzJxFVoi}n}{W-tUd?0S(0^VPwzrWxnIX_DIq-Ue19LA|kLU$iZT)SvMK`{5@a$Gko5-cCbe!GDVm0eeZRd zARr)1p7J1TN^p2mC|OE)ShD=W5X?X$R1zSR-;`bd{V)Xx`QQb%(CzLPph-hg%Wtq< z;)9iA`LoKX(vi%+V(IIkbc}jnEN?H_TO`|MEX`ag?HvuV5iStk12zJ6iYqBOt0ult zX!m*dUTVNpym(h(4^}I=5BE#mwDflv;rA!2NJ$% z8sY)Xce{eOo_%TlBj;JH@WTJfd9lp@$a(k@|CaNtc;wL!nE%RoRnH~w{v+q9SEjN* z+p3YrKau(&AV(+j`MnD6MR(O)b>l~wlX=^^df~}LLf>-wlA$Y3&hz;r=js2E^IHDO zdA))b4$Q6Z!Nkqb26KhBGBN*tNoyRO7dwK3^U|p(Ec)26I5|%>fNz=(^yz`>-Dz!m zdL74GI^Z$Y?OK}G&oDcJlrW&{1-XF6l6ZD3^KUt?Y~13HoHsNE=E>qiPJ`odavt87 z9&SxvM@?I^?vJsFzvaAGzOz{BArtLtVnJ7W!}qrgXR-{xHR>?vA7?@N6mW80&qfeV 
z&g;41{a4OIGUMdD%PnyLPR@%}DhlGeI5KW{uHO(O+jOM#PKq4a2TT*=w7|)Ev3zfD zd5kw3YTmweBm0)Y3snWYebm=MhuB+r-oU)oUI~e9Mr*rvJ9hn-AKH(*?ifJQ(k{&GjrU zbN{N-AO~Yh9=j-W{%LrouDV9VA35(zhF`=_<0JQ&8BWgY6|l^*VakECeT09Wv`KYN zw1H6Cy{K`kHvr@0JTA|Wzj7W@k^v{@^*Bj03dAitSst5PAHnSzT^uc3$V44}R%UW0 z-zdb%d3erGlfkMrctn5XJVq^!VmiXG)w`}a?u8%S;Vmvd3CpY?U>khEYRgr@EXXB#9~)pXdnXys}%`p{Su_0g?=2oxs?(qlrPED_hngU;ib z@_O0!OoYy)$#*Qq2yk*9(mC*7a$f5*><+9<7p(gGy;?g=Irqn&Bw`kT+D+^e#1;r6thSm!f`s(xFuU?r!;W?JuUS! z*907#2WR;iDs;i+MTCf?nMJ9!A{pd0vGTFbI5}^fUWlC=*}uy%HKmmCZ#fSdmQik} z(27!@MaLk5w3#(^&-knwgm7|R5KhiR#%9Nv=i=l%?&L7$$Y`dd2If=_i=?vu$azwK z=v!O;VSWxy&O4fG`N>)>y+)DxlOh-= z=cyxca^8+R-)}i@xh(&4+ppdV>y>}xyjrG0oSe7D$bY6z2`_B1eqD)TGxWb(|84Hk zAqyTCo}*Y#(V=x$|A&U$zSh&v_`w8FqQ7$9Qh4zeqp(~L$z^z_dg)vUe~A|dDGttq zC;K{ZS_rzm`w5@=+3lFsg>tFK2LJ4G$H{rik9Iry4P;)u#L0Qk$o*IhPR^_OE$3;# z<{Mlt(<+u+>O(Q(t(?3M`v#TxJD1Z#O#3+!`_1RI)gm1=Iv>fcKiUW*SuenID;U-b z9(?(lF;5W>C+BTW6xMt2_1KJB_W@}?Yy2bUsrUJo_4)UWMk^0zcRqctY*=LDSlU^C zZK?PlIS+5}xYO|E<#VybIvRE7=btri^O1W^2e` z>uFaO$`Lqt>d0G;QL$=PRA&L;mfuBFQ24| z=l;rhexK%sRdk+5xn3;C_NxfZ$(-(}7{2CM_$K)XC+88UE=4E}a6^|!qS>dq*cdq- z7rGWlf?JPm``fB15^Yx`;}^jTbmr~d+e%fMovUB0yWdASvfHh0m@kATaakLY-^@+50{F}7X zdgj8`jP@xr2TD znv}C$?!dygXkmAGVPC~+&;K>u&8|HZk0_aqx>WA|;MG2xeIDSIb2t7&iWq_4avt8z z{jw%|&4kwOeGc#un(6!9*EE~O-<`{EugWoxw#(F>hJVoO#>si#Rfg5nj(pXUAXi84 zKMmbmY7Pd(toHhM{gLyG&d0sMr@1kHH(ojCz)x3&Iof4(ZYa+SS=tTQY%UHZo{i`lE;_m%X- z5>l`#Crxm1SjS(=v@`K#7RS?HyVIE@S%E|KL<5#2#Qx7wy=3X?@!zQ4!}G?9i88HJ zmx0r<&DXNsS&C8QGF!c#Yn480p>JzHKD9wdUy{lG8`YCRg+x)v`RJH&*VWxmm20p| zJrzFTy9jMq>azdDs+uPM7u6$v3xDSHc*kxy;nbmNeN6v{?KS4t%!3mwDk#P=h2ZJF zj6bMe%g(37FSXtGXzXd*Uc9C3RGKJ~njcVz2sD?F(D=%jAs|lO1@WAlescF!y2?+7 z&yyV#19YN3TzXqalI4<^!UE{RXQi$j!O0E3-aH!e^?goB>HUS{yRON6`=mh>(#aSc zwo5>c)iOfjQzJ4&lC3n-qc6{^myiU#0wrp%Z30ulHErn6L32e(4k^O*Lo4$0nY!$% zS6UE9%gQUb|2IK6!U&KWtqn%J4b7=;>0|f z-(sG6m5W1a^c9&~XfA121`@Vws~O*VJ?o=3sC$-hp_nJ2 ziOIASx#qTF%<{dNCjR=69hNPHkHET5Pa>IAt3b|;KFFhLZ9;=W;n9V8z1p@`?3~a& z(q{hZ)hS_VVNYQ_j0S$a#9+$LjriMN^}unlbR&KE==Xyvr$OqAFMHX) z87rG^wq`soEiT+^|7y3>0DYa;m?5YGdVl!lDt^@GA^qJn|5w-(VM#jxgfeJC14xzA zr!o>eKB+(QA&FMs%}a~kB{GMP(RAQ#J?*0l|2@&Ig4!2qZOrN?BR^Z=T9A8{JLnHq z9W*VPMlnyGtKo_N`uI+3e=Kw{y3pzRg35M9h0cNepx*5`wim#$g{^${u_BhBj5T7J z?pC{S>{!@FF%uhNJt7&~KW6US3?ZYEvWIFk-i5F+F0heZRP4$HMz6 z?4S=Aiv-8{K6Qdy_dJ_Cp9J+M`|)k6o2K3h?8i9EGw3%tRy zRkI3I4MmzKGDuyJ>abl*ucY!@O!I(*?zquN#z-{Wh^;A6ov2nl`A|;f&6ULeuv9f> z$vEb6m4flQQ#Yjt2(rjlLZU{h(Uj$jG8O8lu=9(DkV+6FW}96hL?g%kw?!v>#+;K)3fk4(f&=VYomM9)NC;PMfrYW#55A{VLSlHZwM;w7~lG zNerc5f%JZmbb>O7&Jl(}T3-qUU4mZgOB%lE@tyDK5*~>EoKa^U&IP5{1?f(z%Ij-z z3HqVcrYDJM&vcLx%Fe>hy!vsV04|jpE~-d>^lkpBY^G!q#x4A}PAT>GeP=7BOUTCL z{7OM@=gEa0E&-z^?sIvY094aK1PH)l55f4*(8zE zoyhQ~mWDK2S;}BPN#FLvBcBkdQGbO1P}dNYnq`tu*&zuKSySLyH+9nMhXn3B>UlJ_ zgA`cK3rJWzjEJgkQ(IcN2d#6x5MujCIRF}$df`}6r);A$FTskmnc?GA2;q`IXs7;b424xz+Q)WJ#Kn^RwZ-jL?) 
z6~cL#8NwcNf4-p3`MCTey>egU!(9S0Jg!b)NQ~%7+<{t&Q4oE!htH-;_8kvFKNwQJ=lCDrCp&4+4}O;NuRiZA)>0CF&ycYq!O zn?Ye0+QH$D_R0U5WRZkJ^+X%O-lZ-a)rNNMcsE2WQY@am4edU9+VF2w@9<~RjZSW& z4@4rNGFF3tYx0m?kS5;~3LL73=%cTHmLQ(COzDZ}XWx5DDrgYx5seuX8nH6E>tJ)|8o1hswYU8@}lyg-J+T6J)a2b3}(SEoY`@aANe#Cw2mz5 z5FwVN*Jzd*FO|%+-qd>Y6*8`it^iYg)5YFkWz_n`LBf0Q+fB=GncVT_YH*i(b>!On zN~0S!sePMft_4$>pA1k@b8fel3OhFQ^XJu~F90Y<-%q!vHzqQ(Fgz|-_RTQBbtB^QGqKkG-Y)^ZuOgS1eGo$eKm7IB zI#=I{JTXgxTiBZtxG3MS7?8o$T;AMrhjYyRdt~6IK%aFeq?uE1KcgVY;Ucif&;Uje8PK_w3Z9 zw@P(l7F;ZTYHarntQS^ArFJ&=g!L4iq40q%V|I{9Y3GjpkH3)K;uAd=X)Z#$aO}a(oZV1nGEZp8;2uIZF|1p<7p7*`%Yl=cQi1v;FbOPp)plo z2Kc<&)%FrkR8&DimwvQ|TsIg?K8n1>J0*%ETi|}kOEeo89g`Ovo%u0(-O{(%B6cq{ zID;|P7eDsv24eYR>=4=$vWZ-;;Lqt{XUC_ZT?Q>6iRcI*OW<&6HVt9uc;gR?M9;Hn|OUvPv6;i;~`Yo?HEUSz2A5qQl|Ddo|hCWm=uS@ z^9~Xan@Xl`iM5SE?yW4yjim3EajAkaW~2}*3mx9LQr!LZA&hc{nK&Plc0D(_ ztu|Hbq0r2pc1349b=c^UPL zLLP&uyJe|N3K=J)yhHMtU>1pg@VuV1-*{e6TInA=Z}bnIr%Z=InLfCJh6KzI5ztW1 z;qW{dh=%OGrUHmDEKMsp>$CGc+X;?W*5mkCKL7};Z5EN~4(G~h>G68a#* z|A7mJtQQI17~;CcLaEb3mIKMs`6YehPY~($kpFZevqBR2losf~vSrEc5x|L^QQ$TZ zksFNo6c+yJ2FU}V%vWS&rZ7Si8k}QR*t#rJkV&ELN`o)Od=82Rlo7Vaf#eEk_Ms{S za6*VM!_$C!I6Uw3CJFZ+JkRwHo<}#C8|^3avN3~bIgJ5_)4_;m+$bw?Tk`=x?^Tkh zKX{(0buJFiyLDc|76c?2^1m!Ce1OCAV3I9)K;|QGOfQfzkQ};I=&1Pz&x1<;#q(-% zDTvEOf8%+6T)*)=lM0OR;}ld)@-U03Nd@WjYo%ipsB|U`B0V7iInZ0Gz=cu&r;mO$sarq#Jtv1 z<*grfj!9vMr&P~Y#D

;P5<*|JHx-yoWeE4^NIMu)-FaU3YK~aF`a3*HBl^>*vUD zJR$oR&r5*tT&56RZ2=$ifKvcuR!bzirBumK+;upatzU^Y4duNhVwXW+9hB6H4p`3B zcs>Edm!d+=kzzoUmt0(L@02k4Q^7%~zkZ(KUqA1F`v>%?3pDC~`gtk_QN1u?y}OoM z1P!n8xOBO~&tY{6LKHW@Va5OYc_nXX8Wlkw1QnnH|M+=V`9dYF3_N=G`GsVK+V1a9 z+!r4flG1x%b-_5G2pXO6DIQKcJP5aQ08M*5O<>E7^C;8BBfDTF)`!;VV1SO&n95`# zUA*Wpo|lVT!0d>&OBk^}7e#ex8xFwH2T|5-0Ch`6*IcF((l_wbngZt@u9)9qcrLi7I=w&>?F^<0p zJ=4F0-UJ_`aM^D{57fPy8lLip(9?_~TEP)|6J4f$GQR~qj8q*P*h&y5=wWe$-c#n~ z7Y+`D*=9{S7^&Mx7vKNsZ-|oJGQZzsQP3S3V zkXm7$m(UR3a<9#C9Av~1dZogPtV3VIdT@l^B973*^dl(R2{PXh#1dJjCKMXHVKwbJ zhZCkGgYT8*DkzIzVR%geYT-Hy{JuKcSJkCd`zB@j z5?qH%B>I@pYx8}B%9-Mn&pR;`y(9DVtLMM~)=2&n)X@EX;QN_Bib zEFi$DR@G0?Sju zLhZuW+9vWRCK8q3)@>W({_$!>KEsRvC2}SaQ(&DQ3XWfs+=e|(;kA@gME4x;{Per6 z@M-Gq;nYBK-<$l0BcC3+96X$S{Qhq5`>4A$0rQ}1?v%Sa(>nL&&8_ay72GpDDE)`f zLrTh7%i&cFhSv=Vne#VEbihR!nC7~c&pK&^7CeOh5PE*$hZ9}vY?M2lZT`C*6C-3U z+SIOdMB(3v4^ptitUTRTrI1vds|O`>xO|s2LE|Xe(Pu3y5ngt^vu*nekkXr&TIDyP zM`Vo#6o>@fel;0bC~2ahX;OlKmm%(`F`Y#nzg@{ zz<@*QLw8AwfUl|Jk=}co^8kj7aiy1;iym`P+T7pr>%VH?SY)?3$&{2ZIX#J}lp|Z3 z1q3!laeVWM&(lL>%biB*ss-uogAmh3V8i%UTm5#&_;%Oc_ABb0Ua_72|Dt;TMfLuR z>S;-+6EXe&2-TDPe}?KAQv6rZLqmO%Uf3n82P8KICPRag8*-A7ACoQileOS!+68H- zng1^oy@kq%ma4)ZRRM|B{yEjJm#UGg|5fz}M)&1+|(en1iXuMM{(MCF=p41yB}=>Qm|~Z?RqlkDA1W-YO#gyu zb`#-ic8b~O2jsIRGHrvl5Wf?FetV%MF638Mn18D1$^A{y^ZoBr^whthLkB)U>}woK z$^z5W7rg7AU1||qYkkEw=@SDEZJD3N`d&}y#AF=kh|&UUeq3KgtR0yyp*-q+aT|4* zA1uVpi+(z39jB7oF7{KV>!aVx_s07kN64?hesS|G-t4$;XPor4)y+BiDO z{3uuQ$Ia#xNL%(QZVyw5hW=4@Mmr3gHNvk4iLDv+4Mc(Z_Z0^c-OjAuQQhq9nJlB!Pf^chXBKz3YbM63`b*`&4Y77F=FFfIb9H+ai~rZ*GiY- z(NPvHUFhw1B+qq$|J#ZlEt0RB{U;yC&p#Et`3L_@(L-NEI%7?1y+F5#3?6Sx#_(Ns z^3QJeSbgU6e-e63pkzosDhKZhdw&3KoyAdO=}mXo6&X{c2LsUZ zFE1Xd0MDXX6sbX+jclu2@~~?eAM^ce^DeA#BjuM&)?=`{Cyu?BP(?n%euNsX@FOKt z6~yvTq6Uy^?A1ur>}7eNJ9{)|Vs&rtisQXlA9ac7Rl_y_V#juA1IG(yTZ*1iPA})*6un0|l!0{ltiMzA z>R#qN`Fll=lY9gG>hBc2u)kOIep+R*Ro%3x2LA_&Ud3M&z0toYdNKc~=r#W4Vz1#n~qL=#Ls_0GF!v9UtYyKZs z^f+2j;1%#Zu2GCyBl)!Hww}Sm7jcNUB^;b2Absznl2UWqrm~0nnnTV1MMW=>!=cb` z3f%l_iKK4UES4uD?RP(x16(73sn(E5<;GMJT0`iX_s`jwq$45{8p`rFc){aJE#|HuE`-A*Kteu0hoiWy(Yo4WF<~y3Y z%fE-A;yAhn;`yQSUBAMF<}}+KQmaXfTAD{En-@P?xQpv%*Ne)o_Mwi;`Q3s}9lc>4 zGjF>2lj_CWJ21j;LFR8mN4sCXc`KD9sKC~UrBnF!o4?yr-@ZHP?LIA`N`Sc%UpqeZ zk{DLi1#}uZy*5tjIWYY0QMY*f>n_DS_7orfO6(p*^!s*JHV5qc<4*i1lCId_eB=1} zJ^y@9y>nFgbU7z0MPrChFZ=yV&&EsjeBQ#{oMwG)^89LN^?g_R2wJ;eg!J(5Z6b#M zq3Ff8_eQt(r*12H#LxY3kN%9C15vl(?)cu&BrfgJf!o7TV%XmK$FMwFUqStrqUR^W zq=)JGEWqM`VH%kltiAamM;KZm|6%^fhewnT*POI+*+@JYh-cwl!$M(-xPcLYFyYdn zw_3x{@V!3agB51OgZ0C#=2}5?;j(D{;UTuhE-NNh2hSH?TuWRZ#*^B+Xop@>Y8;LF zKRb=AQOdgTA6I`;ZikKh`Z@9;XQaGl96J2czWseq!MLXO#LWCSiETm=Ghw(udA$O@#WudDTfd`zf|f&j!^v{_;n=Zf zlLqZW!2rdkWAaRLa*y88z=SE9dv<`S)_&A9FMK zn%mRi3zWdAF3&Pf`P&nLbc_B=A~T7t|jN;}nBk%q0>a#x-y-$gs z_U3=eJGHIoofUqm7lblE6AhNTg{!su?>eYX%i~btmga)l?B-O zv@&Eo9(5AWRGvI_zhhQlbhaHud#MWoXut!Da$a2!m;f0JVd6$JxHcBqP5A3U68%M`0>@8EDL1mV#P{w2MqwGMasC@JH_a%ascr1@{)n}67&+s+vJtc;a~`j zd6*u;bexJ9xQt4(f<4xyz_}JlvJQtQmn4%cZ~{GmZz~S0I)PL2;#!7vEv{B03hhP3q=(`Stod45IUp- z!0;=(tTx$z&1oD&5CFI;zK1PQYI46n8Tqb40|qrfyYVP404**J@GlWQs5`U3G3dOR zp*m9p91B#8jv9DC!W<|5LD4gaL5_A#{zr;l`^%$~HrtBc5hYH7weU&S{#AT$oj3r! 
zDR_X^`~t3~kqln?Dy9I)p}}Y{2O%l;Wx7*iHw}amI;p^GP3_i;iLD2E)KwdZ2fVo@ z9Z4YKI2=CKp#{OiC0PMh7`byHaf4%8VXtsU_H4~cMgdaYVOJ}fUzF0zBK*urmX4R(Z4Es&)*dXt}jovo}gSS z0CV&SIyO@QihXjw&)f|;U`Z@Nj<;fg1lBI;kZ4qM2LnBJODPXn(Qg3YmE03DMX`fp z4EU}ImcQ>Smq*zDw4+kGOcv0cR4MIn&I5SyqsSXC!1J_Ll)dJ3Lqg2n1vLaJ#2U?d zV4{Lr7V|J!f*+o}ctq}j8Pn6X$E$x+^gLyu^6!rMw);wJk_(<_Xv~ZALf0=*9?g`x zL9|r|aXKmF37N)(+J^2fx_HL`6kuK-%7frVN=hYI1<|E;*QBIx2e2}Bn zlQuAbmZ&A3GQLJ2x^@t+vpK4`_2ochbd6Hs(rUXoMXk=>FN+n)Uhqz9QTXrxEb&rb zJT>q$$Hlr&+J`tEcj)E;nFG>wu7eMfcDW91D|+6&bpf=rn-#~BjXfPe?Z-U6lQ;$4 z4xXwSd=|t#4GPljH5YLU1;(z#<$2M&;JF2C!^8dt0D7IE^hqMTZ6;R7!vSDz{Ky*h zhz<8O9X6N=RGWsCdvOK5Q-=~uYRS!^;}2TCG2`LmH=@9g$E=H~MD4FP4TRzab?H1% zSTa&mI!NN&yU!tk`rC>gYv-cLdgIa3j>rt2da3}+WzVU?72@lyAD!L6>8AP`R?3w<(*GxljtMm8nff1nGxjWkdWX(h>> zJG-UmsXWFe`SzgHVk9IqO3v}7;Tn8a`fsv1<*7zUP8{n+9x~ig^!ze~ST;;D^2TCQ zS(#9*U7%PiAK5hYZD}p02btKT8C!~8`*@q>-QSAd13wd+mMul^j+xb`BaYjOo|K%& zhA9sIC8=g&QfPxpfMYQ_>>PTK^uwB-hpE>HQLhkV^Z4 ztrm|XGWpS*>(v^dGE}g0Pd<~4rU7HB zG!_LJ3P?WnrYHWA=?tjDwkvbPCeMJO=(QSJ=$pyMgdgtOLWUFM`s)7ugk!}da|0Q6#!vXD(>SfvHr^|RHwc1^)&xYHL!aIwX#hpJ=|VH>dhTjZ6ibZ8Ao(i*xZQ3R4z(sm zNQ1ab3<_bf1E$dFpxz@iR#(ht0}BkO#^h`KE;j;1T({}eF@HCwhk8(P42a?BfJ1on zq5SViaK?f23rBj%KP2v#!|3#4KZBL5nxqZ$K-@;{a%bKVasv+Na^gNi?7pU*6ruqT zCHwWYOkE3(Le-zw?+2fp1Cnn+5MY4Y`O5n~s=XD;U$p7`5UVMhL9o! z1y*1{9`<`#Y_g#@BD>p#?_xXuzA=hmc@tV0G~; z5dO@L*qw8b%cZGI!P;3cujI%snjoxnUpyozyBHHAYr>Bu7{(Q-?tYeVK-xZ*9iV}) z>?K5q&#O8vjkd6R6F@A8z--}X8;=qeV5Coj9a zeFYt`8ixZpaJ+%gNR*ziOb$=V+8K;%TsP;tQkf0ofLx(EYunC-Sd6GHbb*U#z(L!E zXk%2wFWi}tF8lc@xmln!-QwMC(VuuS-+{NR$+)aM6=9xkNovH$23Ro!V)$mn1WB-GH)I;NL0~i_Yljp zFhCWtHqYO4St|CB<6+LcDPsUyY3h54^3`QmSkZ=R)I<0 zXWppm0{&Ly(;`cRm&e}6tGEIkUQo}1U5QRQH6^cNZt-upI0GZhrh3|RLavXbswY7A zki=rYhu2#ID7CLmZVDWTA?;I(F&zkgB|?Ajxki~4GG^ksy-TJ#ZN7wjDB3xRi@dR9 z47qWCuL&qMGV3}S9y|Lkj zr~w}Qhe6l#=t1V@*S#TX@Zgb38pf@fJtKTeI9^T z|9-vR?n}wTSzwwYk?VWW@);iQpzOdW&>mWhY-)LE2pQ=?ZN}&NT9J$DkRDf_)TNV6 zmt^$TnZ4Dbf?msj+UmbWrL3OSQJ8O3)4w8@k#dUhK(`I3-2;R@&}nv%rvup)Y%IBV zWRWx0H()_P-z@SXngA<8?K>F7RxhN&?}Xn*VZ< zve)60MS4dy}oQfU;>iE#{qYesf-c%yhqk?HI#*%c3aF>#xhi&##JF-4QCWjLJ4P`&JuF&xzdH#{bBi#H|-@R@V;}8=?i!u{uTGf()fXg<`ZTeU#ab5&|9*=lc_>x z4-@@GGRd63U?wAe{FueUd%VX8I5qMzLb?>cG@? zB8RO|^-RIIIL0Ln_<1HG@5CCE;?^d8MePrH|X1DUmV_d)5~V#SYIW@zRcXA zIy$o$_Th_Uf5iEBa5k&jNo1XQe|5V(!GlD2}g<*y$O^P;3Kqvq50e@k+g8xA5>o`L}!~*6*a8-a9$nF|y|I zcj`^rQS|;VD|!LvF43T-wb35r^Nz`x!P&7{6;L@W7zcMzRg2E*pLEoAQQQ}U#YW1g zMSEZ&5>{YUHvcW?n@*r7x5GgEJ&V=M2<6@6;F|BotJ{2FP5{bpZW7pvH2--A* z&RGq8zMXPLQYGZ zKQcBP9~Xp$yVN4C!7m(3j`hfd?Y9D-SwfhYT)<_zoyCIn+n}*i=dZ6%S_Q@VyUpIn zgGiXn;&s8e4j66$k*_@&v=rq*jl|;PaJX3++WAW}t|!}EZqT3-Gm%+p(NB7rEb^ip zcrWb7#)RZSv}ERNF2~x?-1gHZgE7;%HP;9vLaHmu1MhMJgTSE>Sp=6eYq7WrMxxF$ zq84!y13z0q^1#4P2E}^N+?7*dms6uJxp;K(dc9Tg>QR|L!V9CjseoOuQ8H++WR#0) z5F(2Pljwrjqo>SjnWENW0|A%uwW#}vZ3R3*D*vd7;EXrlBIs!r5tkXUB_#2--zr3s~_ReEu@i=U#6S(a-eCDIZ1p3RZPR!qr_1gV^ml8~+G; z3Js%IIT(11(8-ZNabN!;?>h6M!QOojmktR8) zMHAptnz?cCPVa~|1nw@ROeN}ao4bDNa-p{0{y@Cd;y7;otuyBrr(II~Pd=Ye=z^Tg zi#9`g>kyzW$XFY@i#X%R`y%n7x&iri-#n)$JOCG|R3BS(fGNR6qbSBK#pTQ8z=&42 zBjqzT$moO9-y-PWg4ly_$&t85d62c@MSgY{$-G(e;(88^L|AiU zY&yLbgz&)xXZ)$?-JL;PCNN#bcw8q3XVu0E)&yTCY^3*XT%!eN-J1yG^*V!%yqv{! 
z84EU9inVfczw?|)85@Fb(Gy`2 z7{tNAs4F7~3o_W0q1^?>)viI{ov=6txp^yAVSNfa13p3qTl6E&0(00}MAHedMH@oj zDpJA3{cZ}lr}aBs>y*2f%%;N(;zA}&5)-fx05LHM@lFBD)xstam*i5T73}65tTyj- zgbDGvSzzXhPP-O-i#`7c3VROov2!`;hWF;)`X zM-8Sp;v;r?^&WzZz4&8!KX^|r$Nqam{K5AQ#py&Gd1F{Q(`i>AV=T8}V6c2M&E{95 z?)*K&6dfJ(;w~}n?xb{^xWuKghszmhhP;9Q2}O^$QbsRQDVsya}y$ZQ4$DV zhV?XRZ?Qb1n_*HpCcl5;hgcz*E}f|3BX-C*u83su>Z-Al(fn=6#cE@C-xTNE#y={0 z!-ID@&itCab2H|SwS&aQn10Ud&~tn)eqHlVs=KE~$a9=kIdLD@ITI6_gj9X@-busB zsT=tTzB$V0(~^jP#dGJu-0EKyz1(uUm(R#m8Iq1!2P2%G!GhR$js3(vXyrt682fN^ zH(J@_&6{#X6RuU}aU@aC4>?R2akCp_aVXo^|L6%6|BIrhxRA;1$UFJ%U82gG-&sTT zh32z{KePEI(*FM`dN~6r+Uqol6wL(&Dy=>ecEwz|Th8W<;tM$&HtS7sVN;otMhI z-$8{J+lrp{OW!j2J8OC^JO-M2aFNw#E3cIXU(uV&6K+Kcur-K8$tk^*=DU(~Fw^Df zeuknqG}-dvX(Ut0f{%68ZnWvs^L0Yks}y#XrFSEw%elXpcj$O*DSEB3o$9Zjjv+-H zS-l@h-2I{7uFh3coN{pB$BMFu^=a#>_an8R4~#Gry_#b`Z`dU6i8B1zEO3FolB78{ z_JwUUxzyW;(E|~Jv^FNM)5X8trCA5BPZwT(W*KQIdO+l^A>HQSkBzzK&)wF!$B}&LU5NP$$q@#h zT2)VV$L_{Aa6EUr0g_-SdfG;!-Bf5B?=s7jwbh=Lbx{hIBMBT(L1N@jG*2x**6xoK z-t8{%n0boHnd9||hLRK2o*ZnYqbX(!6-J`z6>uB;63JNe_S(=gb87II;EFfmfT!L= z65A;Af%{q2p5Ep9MiA>q6-Eub)~bOgE7@wJS?p_LNw7rWL;~bZ1eLi|N&e3fw zbe<#WOyJILy6^D zOsm5@@NDDC3XAY1A48TGeke;#XAX;}l5%tj@gzQ|7(z5-*&Hj3Fejr@ zPP6Kh*L){jhpbu6N{^5h$B*h<_P&39ekEN|=1k~RceMLsV-e8=c<7RI0!jVMt~TW) zO@fTBd*mmj{&#^hfCm*A+2kt=iNaJq#ThJR&0dh1h4IKU*c ztmbAA$T(m6Z}0xWfApC9M24@NI8%1eq?PP@M(SEYfW>nnj(d~qfb<>y6I4TN_3f1G zU+3=5R$kDY98Zv-dhu*lM5>AVK*ONK1GiIO99-|-)oeJyZFWG+^atj6A%?U&lh$Ts zA@*JdJH8t+C#LuqM`|#XVr@ZBh+i(Izs=3ahe_g?HTCn1z3~O&fvKdU^9@IARts)h zEyox?5l`krOYthZ9nmSemvRDmhu3heb>f4m_oTB{#iO49SJ!sp_Lzo9P)PzmNrn%LXSx-`>wd8p{&RkEoPU z`wQE~DQ3=lB&2txJ<(RBII7*VUd6pEXMH3icRc=}hg1d{_f5ttd)~6mgX2b~@dNFA zkek6?(J(hqwg5rMXipP ze3*=tl4Onh(HzB+neWt!u3*VOz1dGnHgZO(!_@Uh5tmZOdEa43CcWP8 z0}TsXRLi`y-MYuP7bgTy5bjykav-q}iKpAE$!g&IXca$GE(;lm&ayWGpV7d)USuSQ zy*_vlcP%#%N0Kh3^&jlH&Dvi9)`>ckqDN(Z$;)Ic&M9`?tXV?*4Yo!5+$gQ#X%kx= zBcp;cL)y%aMuK@5vOf9m16Am^j1CmrO;|Ykiz{b;lgBD|~ytLP{c|GN?kp*AsN^@6y0H{TPQA@~z9?FI5Q91$Xevxp17DT!IhC-w*;jSAyWpb8YlvL|}c6QTaX(A)FW%Hx?JZ*)~wHO=G_Ks|1h#~b- zjWIM1@)6p8Ex*)gQ@jDIoFjtDGhivf*j6vM9>wXFsvLMDBuR%)nlA+X@_^fi0+QVi z?jOjeJ&F0y#Q5ypjupO^WL4uEwCCYZE)4Eb_ zRjBLgVHT`<^7_xP=~}mN=%Yyz&5fd7x_Z0X=GhNSV0d4)w)X3ILk-rOn7nAla?+3H z$=H#NK;YR@+#);vzP7kB?m=I^CFGOM$QSK8zopk!r(e29LRV&5m;5=Pxz8}@Y7FRg z+lQy`W97Z0RV8zdEg5(TTVE7r5PD52dqAd3*>ZRUFS4g)i_inx-UO%aBEo&P2|W@I z{H`JXGxXqb5^R|imxN;qXlUMdLx-P1=n?-Q^r+FIa@m5wO`WAYb_SuhY9ua6AGTl+ zdM3=dnhZj(VcNXUeFO4dJxfQ(VT;i79OU(h>fa*t^lOH08X0CeH82Q0UB}|6vcuq0 z97Z~AoH1^^)p=WkNzGRp`Q*WEl z1KhksPj8;bF0k!0CMgPS6MDL$O3YIhes@lkG21OPbFk9yNaLfpppn@OLT|#}aGpWv z>Ds63TM;Gh7mpt+#aLHlAwVze*`g;Igx*rDpwJ1M!h(kC@qZ9>oB!$ws3286f?*&_7jMI4Qx36G0Dzn^qc zF=C-Q)Xd!J_jW=~(0Ak9BXRN0UwLqfF@&@dJws@*j+xonSAPMjmXG z*e3Ldhdx67AoOSwQp($eo}27FrENm5|EwM=I$pML!l>Ef8iUY-7$}3uw;6<9RLCw`6MEFb z)Ok;hZ9*?h+WjiAJ(8ICo6z&L9oXmD94W!1>vB{3H=zeU+O_9sMTYXj%(Kk1BmvlM zRJ6YmgV2i_*d(Q_k}p{wPs#p#3_NeOZZfE3bbHc3#}8DZ1iSSEs-tziV`#)0P&fg# z)xzoJ9w%M8J3e&82=l%})>iO&DtOP)=zBk)0biTcM=d8?=$+%IqHozjfB8A|$VO`` zcll5<2qhr5=(CS65Cy1$b=MpW`6JWSiOK|n>}qK({!EP`1D$#9+2W5i(Sp@RMyXQz ze3DMF;r+zPYo6Gq<0qRBwvnVJqBuS_vd>y-*Mh!<6rHfs|8|S!PP!^eVjOa#U0np8 zQedRGRWsvKLpMTRSdZoD)C?F^jIk=6B=D^3qk1IWMn^biMr4)yItNA7j?QzGg0(cK z#_Vex`ogfJ7>(+r6_+LHxr#G880K6fNUZ7H@pJPPgdkdV}0K35&Am5vM zJ3cZiC-XE%D!YRY7{B0n?24u4F%61|*c$2`BiR}ma{FZ)E#$M;R0?*c<)Mu*c2o2m zQMYS%rOG1RWIsqc%X6P-<_`2SbhCA}1$P-oAKHYNC{5^YHd#xbH_Y%hTqhDgM{)XP z6E_UFfoZ|Os@-4MwH-k;HY52iKpU!$gp-PQB!kw8W3$_(m z?iDcuSlBC9Y%?Utc6YRN3rnk!5m{e+0wTr_k}lY{JEBV2a{p>j<0odA_Jl4xOHb%6 
z^&Ri$?)coUbfCMe$_NU&aGb>R5Mb`nSNKkk;%(ON{3YjR46ArVYEP3bE%SI{T*ZMS z5-A3J7|`B%vEAoZq-`GPEBY}~NQaF+uxKt#3%oE4eKE92zI1Cze_+&0>EWS>hka_V zj`?Ns+_;mb9Jy;9#7mOgco=1Rh-BLOP^Wp-v{Tvq!$Y(2RnvEV2dRD*UmjX~^RxPH zuwOaK3Ua`T-T$b>SM#}tmakWjj`&%NelhWRc$U9ILG{TL{}WOEmNf>}st_mk>Z7^-PPzW49#q>4R-gLv75CcT z@yP)@y1&ICik2hDWMIE4Ss{L+>3JqQ!ZAQ>ogT&PsDdaW=x~EweRcP&8@NG;Izf7F zst^s*BffPAnxvwUq3_0E^G-i8k!2nKMKzcnNSIS4O@H$aTqpYKMxIu4J*eiNvL2sOHc#*i za|~c^)6L%6P$S|I4h#=T1wr`+~Azpjhvc|>#e~WZ|bh~sTWMwq0kz+q8gbDX>WWZ zZ?W#$x4N8rkn7hsZjS^PybI1pJT9~ex#zH%`-E`!y81oAkn8tDvQso}M`_$#1Q#3( z$!-nF>(sag*}QN0xIA=|5*tEkQZN4U7^SOGd@rQ(fyQ0-@5xzpDGnP~ZR%1a>QcGt z(!SU3r$(mVgIxJu`)Im0iDe`CTTs+%dhL@dE$8n=J?;FL6+J}Qlcp(D{lPG0h@7To z%Ref5VH$={)m}BFS^o&N(|lROQ1rsmeAkdcM0KQQvZ@9`Evo)H1cBao`kMIWo?g2| z7;7yx=^pTAcRfSVYqPt^MU(-YTzt^kR0?3{p6Ma7vO!Sxl;>f)w(OO+5s|J|rGQ?({y>-W!t5pp1R%gD(rEkuSk4<1lTeZJva zpV_yLtR<*pz=$ORH=a5_uEPUBc~A;(-4Bk^uRlc-j1TTqv-7@dOM2Vj-t1t=vFsMQibk4*RN};;Mn= zs?nv@Ln*6=%NfJitLBTVmh4}x73p7XEWaMV^z}r_*OTR69Xh`{Eq*=CzUHjBcGhz3 z{G~P5l(h@xYaX3zUW;oxir$W*x1;C*VjEB<9)J!og2Dg*gLZ*qw?eu|#tXs9!h&K) zquEiMXm%bh&Rx7be0;kE`T0Zy1;j*zq{K0Mq$K3`NXyB}D#^>L>{HM;;`b;6#Jv*RyHS3*;*;vrMK)+ayQ$NgkL z5i#ImXMl!N;2qx}!l~dpoxxhsAwFlq?w$$1-yLoce>vcDgz`JWp(J9^vB(4OB27<_ z)D}pZ_A#3H7z!~a(CHwW#~=Mj-uB@=GuGe zO>O0PYpR6lEwj3+$=BKYJwCpvdobOZXOJLb{`ldicSSb$PwgmryD%9tBwhX7@J9`) zYGWCaNy_}*5=q+P-4f;n@Var9J+m2-op<(lbD7F#TwL9ASCA;drp;6Q%KcWwMS3rU z?HTf+5z-ym+|Ht9QpipiN|aHUuAdyCAlt8fI-LSP&Wt_+ z6&nZ)yQ2kSpH+QgkwhaNM$$q_EozMdA!|{spL>Dng8%LFjF|qM^SDy+<}bTf@G8bB8+GoLF*KVU#|oCn zR^^H6CUWXUiUgq-QV9?cyDa&#!VHhQl?>Dex`JUklm{CFP)ceTJ({)#@lub3)a6H9 zjgmY$cwOSR$Rqs!0imb;o6xgn@!`5?njKSS?Fc#pTd1&uazZ`057ooYy9dPv^>cgn zTdq(Yp!(oy{N5AiVuS6-F>2fnUEug4vJ&nXJKYCSh&kPEM2@wK%S3V~&tOdb;%@ZE z9ZrjbRmI&>i!WT~TEVgwWyhDOv0{(Mm#xQF@F!HMB|NlCcodXSo1O5uHsNW1LR3CV zK8WL|6_LZ5xQS0}Bil8%X}1L>K3h-Zu44W%4y9ys=;fV*?_+ffLc^mX;#4qPGm_ku zULYl6S5Wf8Z$eKhepiDylH)8>chbKRdSc03&ME6fOq=9n@V`drE&YwqTTT5<=zaVL zLhsNvp~n#P=mF@x{|li93I5*^dP{#L^w572dg>KMB1D|Af$sJ$a*>xJ~GF{S!jZ{x_k=nbW&+Be3G23%yW2Ff-?V z-JgUWaO=sJy>}nT)fDH{{U-F#w*qPA%6YkHw;RB}Pv|i-OZ~)Vv8DeVp@(A5|Am&x z!OKk*E39&de%VjWF*zJRU6Tp+#cj*`iZ2xBuI`L}t~ln(goV!$qY?(8=U?);M6@^r6M`izGrI{m8;p>zIJaj>}O6j$bWCCS` zu3MRhr;Gq_Dh}q5cjWvMi)|NNH(<59&GFvBkRz^keco1sf85A3cB z9Y~-QDR`UcvNf!tiirE7!P%S{iv^`s_(J5`oU`X>y*5gHQA;@eay83vbBA$jQc`r zljiP|O~HLR7AYSd=3YmfT44)tsGCN!Ea1bhIXt0OR`<`=;T)=B5^6mhSY!a^kqV_e znttD7aC8FOTcBcu04`p7Jn&fV*O@0hPXud2DV4LYQ3Ew&>;3drwQFs$PL_zm)#*swKWO)9k_1xrUENUPXDA zHEO=8a(vbLgVOxtb&K1P*OuFvH+ zq4&1$vexb<`yO7FrUEYXio>gyf5|?J;Kwf*4 z7`k_sLFiHY@^227)DLkt4kj%^R@^*@a0i))bOs4r9um%811pu5Zb=$TZA4@ z+1R)1z5VeiUFb2TTjSvRmv)r*JyGKtA`??@#v?zED*_Yx9b1Injn88YLhsGZiH{-M zgkHwj(OZ*xl%Zf+sdd>T?qcQm&E8WYQ!^oZ71(PKS0-0$aM=8H8Tsgug<+7yZ|C{KpS{?lTF$W-33-q$X2b4$jD1zq@kl zQ_i-cH|_apFY(hY?b$+xqNhD($~G$`HhWKDo6y6SpJb451WG-2i_rVPAoOsRY1lb` zTx9`zKIKnBk8*LUxNKISVfOXH$ZtZgQsECmum4X%PbL)Jd~1u)LmvEt(BowgdbAD| z%)+N53#)%2^tx|NwlB>4+;~UV-X`?&=6OE=L$@de=mnXC#i5cp85YQbMa9y^EkbYd z7lm`NLv)+aYdy83C$a>-#L;zeWp10$`@j(N=u|}CpM>7PtEIusrPX9gr|!b1KM6gJ z2$X2VAB5hcvL!Q7roP{VUMht*g<_t&I^DH8oU=vfefmx4rT#(a4Qvy7)t4!PBg-Ej zZxMRF3_=e*SCG0z=y6A&Y*W5!F$lfX#eN2%XZ1xcWmfg(ifH9ZO3JXHB2@Y>gkH{X zLXSYvTwFq@P&}!N?=Aj!gkC>`(8FvIdcnU5J*$Mdgwn1J2N{H()ygjHCL3+z4?-{Xdsr8xI^uiX^DRQJ|2LtB0W%am0Io?3u;`BjA)zuF zfeUYmAXdQBt-os!5al^=@$VG9mnmZU>t_ev{s)TQr4qw|o=;nfUVI`H*JI2JQ8{qK zzYGcJO&hbRSyzIRlL`PoFER_j5(SXdptu6(s&iR*J-%jAhG7O0Q&( z&L#Mg)bg~GHJ{81vB-42Df2B2zRwI;6}Ol8^2-olj>_QO`+$Q>l@Q)e?~2>Bn7tju zNJs98Mpjix{9dv9?~pN|i3cl3rfs>AxYweh#GW2A<2SF3JwMLK=4 
zJ6H3e%FN{tMXy)$TYB}{;N711a}rGA*{g(6lE+NxL6|0la8892Ob}8X1fC?pZ@vRW z;T2Ot7Xi215{C~whN=Rj4J6#W^V2bMnD5p07~h(&j32_K%{%80kEuL#6}j!oY>IncIq9aJ{=zZ}7Bl+aS_dFys@gSpy*GWtXFc z`C9!1kD78gGKUz0=&U4;yUg+4`v{TN=^V{hWAJJ~);TqgB61~6n-xgeJ4+*&))q_j z`QqFFzIU=!0MG=#n3rWOjiX)j{pbS8V(YJ-V%S0)g+!)~xMsw=o}Bynh(cc>$;y{g ziByv+!pvEx;IM-Wb)D9RW&6=GxXJ26Q^5hA&d)XiwiLZ@*M0B){B~AK^0)-YfGx1U zr(c`c4DXLQA@@jLlpGseYDWy-TM+9lAGstuQAJiu0&-WIgjG zOL#+y^yzt24q^*nN%E)?UNj5ZR`h7g!$>DLDGpX$r1^cM0`de0{!Ju7=FxX< z2em9@;G;dZ6usIO-rLs61E53rGs=fD-iUL`S)oMg3i4UzTT-GD;{`^7Otuug_IL>~ z;R~F{)|b*m8;BqcYU*KQDNemN=XRfdlx`HWt>^^{d^&t=ThWusWIcJrZcEW4z1-sv zY`U%Jg-OYWGZekY-I-4^Bt-eU066WQn7A9D6dNedr2OTQ#Rf1`No!Fh(iubM5&eX?al^Zc9=vBWxaS;H+?RzI|NpATDJV$dF)SyqBrQ7{XJ}S z7b^h(42^A(4B=ImM1jfRfU{h2V6(ev2v&@3+ZifI?PQv?5RlIs^a{?PMqx57p%59W zcL$M$#7jFChS&Yj(amzrw%ZG-MHoQ7s=5q6R%9Smk`3Rq;gdZBFjGv%L|#=T=cGv> zYu${}K|0AHs}e>M7i=sD+GfUMd1XPylHBh=fE{L6qgvEncL6zV5#4({9Iy|@5N2Y?<$ONUUuXMd|L}o<{EvwsK>;Bg0s*A%EZ4i% z=1f_>LQ`{llv0>j+~BMLrVWEb{N|`1Q{`r>WWXW9VS0r=>*5 z{0ii4n~Ge915&u8pB+;LQxexxP?z=6;z;_c-KItmuRjaRov4U`fAeb<+3K^*(i@&X@# z_)8H(%3Kcn!f+6fM_?GTnBflrt%Fc6>bd=hy}P#nkR0|tup`?1Zn=9l+-DeU;CbP=i)B4oB0;{T;CU3k8#-#lyFU2cw^+8UEY@0$kw{WLxz3e`(Y@bDc0c zd19W#X9jZUGGzG}v=kQR14aPAixZWEyW_dXG2f&(wv1l9hq;?OI0HndA3|I$eR3-@`#X)Lo=wm@Z^Ck0Fe7klp zqvMj=G}7ip4ev!*wSN4H?HHhd9RUuKwrFAGq$V$yX0uWq9JmWqFf~uLVPkkVn1l;s z78bP0KxaQTnNiM5pxyRJ;R4c1#4I(uDWDX7B`8{-QDv=HiO0vOh)7P}(o_Sioo%`@ zp!dE_^7@q&&L;(k%p~8ZX39=hYKPaujbveqAUojlYDEJ+VX6mwc2B4Qb}-mH6PB^* z_N%X`^J|R_{#r`?XF7&qK&J|T+VmC7i&ok1ebqd{rwRynK+Bsas#8EK>7Xlb!F&vy zj~$+F&~(P-CcC}BHox7pzgap-&%7i}Lv-=r%-G~#w#{8gqxHzWgpm8R${kS>f%P$S zS-bWti zRomSs*84OfCfJl}p`MiyFCavK$n}2NW6gw?DZEnYDUy%Bv&T>Y4Q?Y}8-*4OJpK2E zW`>v_EuqaP^)k0K)%H6#Wn>{Ym4--AMlWMVTPGfb&K)4AUP*ml&1jc!?JfFG=#YFw zd4+tYR(nQ@^zfHv^0Xlh*yCr9(yl~7t(-%33Ou?jtD&Q_P%5JnpT~r6VMtd*tL88P z79w9C!C(LVqA!cDQ6%*rZ;=5JXd2XgQ0L22O|6e0z$N7e`915$7}SG7Co~1JId_>H z=m_`$eXc0~l#Bq#KvOgED1E$!j%>hz!~qnQ0J2ViPDw)<2tdUHpg;qROvy8l=wj!h zv^_qGIrM`YrSH|6)8_DLkL7*^py@Kuz|Q`(N$tfyy+3do)Flm0$B)4TsIDe1^9`CB zj@FeZ!^r2vXI}l&SX1#Q!^9N;>Q|ycW1P+5KBoXSBp)@PGso;+ULY2;1)zZwwIVb4 zAp^&g{jypnQR5YdLL&vJG$=YFlL?cZ0hlMEtr^g?osh>GXh9ADu#>$mt2O%vTACYB zG<5~i0Z>|ifl57j>N~UQjfpiaZ1+bm2NluC>8$X~RtFx!u`@b(fdi+|p2<`@h=U$u z&>%vEe+dALXr$=uTd>tze9~z>_ps0gV6&1Is|BDTh=14vq72cO;xF^11*SD6AgKDh zJwB>1<|%d7BCiZ;H8gc$@9xukS@Oy{C0dz@kDaWpba4Hcb-GA%lNaW11Dr;lu~3GT zkl6wBuZI2>!R!%$!j&UcOHqM`P@Be(?!=(!I$0&m(tSnZu_QJQvSj*%rG{x`I!!*3 zT7`C%x8$P8Z#j+&`zSDOQH69u^f<_v7}HD{AdR1)L8IfR4PY9;v#y}G+iGU*xeX8UJE%vSyJ#*%rZHqrz?<*bO{dT=l^xkKQTOQV6eCn(380;yR1 z$xUkmy-NY2#mJ1}9MR+=ZvTBn?}Jz~{!+$@Qh?P)HF}cCo>INyzpdzf`{0POy&i29 zmqB)24a+`i_dhCnR&Kz%aG*7mI zkFMwTUaa=Y$jR$B9en@ZVVc0z6DJu6X)lDQ8Ah!ej;Eq(9fFdmg54ZK&mB_kkl%%3 z3(*b%_GY)EXY03SgNLl6jVNQtZpBsHa%B3%AaLuB{HL`d$X(Q@J7@20%22x$y@6$Ze1oS&mpxfvXm(^-5m3F=2Qab0#hCHiIYNVpVo`g zIB!oC%*}l^oF}+B-IaIBuW%|;hQ&2l-u;$=`f5Q)!u&hS;J-V+wmtvwo23ft=dzp5 zbueR&p&3sI+BeZUg z)7KRky-;$~^-c4_pRM<%U#$p#Eo;dpCuE%7(z;C2XL(Ga^Xn_?t%z&K61OHtg%Y%j69O3uRz! 
[GIT binary patch: base85-encoded binary data omitted — not human-readable]
zb#_QcJr`o`W1n9<;Yll}-^tWF{goL#UKK4sq(px`V7RW&tTO!I(p~RcZ86>I(6=}c zry1s)4~A=%h3v#B8ib+JhrB$T{HqvIGYotcDq{A}KzrO3F`CbEmuzEv%7gC{|hInH7R5D1Tl9Y!jR z@K|;vI#fg5Iuh3c6H`!$&Ke2VW?~h>ld7|lqz;qr;-$B~CujkHI=R8DO5k8JLn_%J zH=HkPH7Rw3*QF}aZYGgMLomXHlC(CFh?_~W(nwB5rDXf0q%kLmxukk&qZTy#BbXkiN+pqPQv%-L1=aDyN51B6r>2fO@_at$O}pc` z@OhyZb3$G|Zd}6OlX|Rnh&Ml9koHpC)C<@r%IVso>im`6iKxda!dix;#_04NJXOo3 z^1L+)s%`ThNANw;#PlZTTdd_r)Z_=97rf3BI-;2fX)o-^mp(wVI-%*GRCBJP{jeyI z^AW5$reNYT{zAtS+~*vEM*io?CqFa=vNVd@5XFOk|8uPn$orJJ60VX|%6Kd_NO)?| zgg+tRrKu;(=rYNaGK6`NCtMYY0V&LusRn_O$3^@pocp&#T)0E)5rUH=C9sr;Kgj|b zw?#VI;U>4Awb3zb z(o&hSXF9xat=i%}si)_U8Tbk+!+7BqwNQ~g7zkfr;|BFavnoi-xDZ$mkc=&gOqE3N zW{g8~Ol5c{%&g!Uq!aFUzx>`P)d#34lIAI-77QVvSYmm{788zGP8AX>11ZdA#PS7d zWza?@a-Fx8w>n=Lta!v)>&4JT{jKXYtQvv-((ppw`r!gO6AZ((Ev7CTwrfPTLL}oi zFNVAc(`|i_jXql?;d&ddQp`H|ACd^yz2T-eeaxD${xK#@iHE-fhZL-hK zZu5G6CmcDw#q_2eY%!r2L12{}lcj&a%lJfd6{r2%6MBMVs&#;J%G8V_<*hlu1L&r) zjx4F|7B!C6^A@e2o0?Y8&mEs-TTFn<0-2AH%qD~uPq@qqoFXMXCrhr)c)KSM=wMwZ z#scN5DRNW&F$G(}0S)T}4occ)0geqzRlhuo#Fg7+INGU&#+X~&$28As+&H&so|U@r z3D34%QhcW1_DrAk+5O9{ig#Q8+z0#7Fqmygti~{ym9X5u4Rk^?_~DvOY#JKe2^WNQ zJ7^fFa-we+y{jfumRCzS3y@squuW!PCCVK@Knq&}M`X}0nSE)(;&hE2vjiH4zYte6 zoXdxkF1M%k0Z&GFI|9=bN_87Jz|Xy)o`FwfN_&(5l?VU>`Un=|RZ&&~cD>#9ip+3r zB4ZrKsI&D+Y0d|; zin9URCcuLb7F?jQvMh`0Ja`|@S_)xFgm+jdX7|^D1HOU=QrY|KF4+))M}fwzGOFq8 zPd9B!LWO%2OTn--nA~{w=v0^cJa`8FVyzs!L}Yt0)z$x#eHRD5eMKgSA9SzK`XmPQ z>IwLi8p@<`yk*H28w_43VZSxhwM$mh>|=vJlQn^W6x^&A0DRlX<{2W`@P^Th`zvD3 ztA|-Y%pjX=3doWLD0^j;YA;6PDBxYB#>xZ{G#_vjQ>JcPCc*0>EK3k~XYnKwjm8^d zCn|RJ*=%Tlzkh-}0|_IA;0`1cGqn2ke8QC@reH`wXeoODnZa|575rD=#KvzX=FM!R zGP0ayS^2pqlIeX6yTs@#Cvv+rH5xj>NHYwsMKYg12QJ%U$|DS~E3->-yu)p=o)B~^ zt_&zNja@@Bd1*1`AsKJtX3Y@hlRJ}#Zfp|kRD|VURUV6GF+qasZqozpK80NX-zg%P1twc;Q*)avb^7vj{or}U=LzXE(NR+4 zon{t9pbMH2aM|usZJM=kLl;XRHCU<|V))k*B>D|JE2>`P2gKpPCP>B(9Mdu@XG0NF z6q`%Q9iZ97$M~k+HXQh_oaNGOdgm!dEc?93OV&E%%d=#9toi3W8u_28?AruZGsvZ( zE=wvx8Ro|jCO3y{nnroi`|iE-yV_FQ%4|cflgxUJEv+A4ub+@^cz?S=>`BAiY=h$) zZs)7eaWrM>6Vf(n2)C4r(p=jfx}Kenh+ixp+BD@-!Hlk zMfx%FUMu3bK{P?mn7gx>O)ypxbt}mN-diG)#mmGS48?X(3v$8LPJ;{VW-3Fl6Y_}V z8eaeS@Jd{cS$k)WimQ z)gOAiCEel<^pLktY%)MJ(Mz2(bJ=2oZGuZ5a4vlr0m|s6#yoF>%p(RyOwxIF%%A_! 
zVm@SHI39hzO@2=EWXz>&v1I;BWeK~V2h@V7yKOU*A4~>ru5mwZd_~lK-_4ARQFYR% zsLI(bna4AbT9>N)=(})I4UM%QZo!&u?8J*p zo&?K$0;?1G>oJ-^rx!*j9m65PU!$5g4A`<+70i2EKP}9PHw~(B5HLZ^HgHz6l3G4v z(1R$bmtbQ)yy3U*_r5paLfn-2cJCPkWERNtinuAc{rx0^;dBD*La4T}fx1AxsbYRy zXmIGTK@{qHQkBM!9~T~OcXYhfzedw6V$nPp+1!O|K2GO8`2}{3g4*D$k055^OV1@H z*PiaflVn!!#Ye&GL)H6VR=_S zhU~K6&YA6t(-qksWrO6pH-G1NBSYM$<%*w9IX8iQ~{o6#paSLj@ zV#oHeXLAtE!uA^s%H-tuJuRKgU>qNOOmw*1wH-CNZO+PP^|#el`-S6!7l>$X>32I4 zL95ALsx`-xDwI+6fRlSpY_A{$q)}rqf!>D9&;~gkf%eK6HLe^pCF(Z@oUuB>Y+vC{ z=U(ZV5$Ff~`mJr*;xY8EtQ!H^Oe?i#8^6tUF!cMpd?im79KAYIZ73aTrN3zu6F9}9 zR|w&K9%I+}6g8Ly4-k!KNK+l#z5BgbAPvXs-U z7osNGvgIt3er|Y7^%P0i* zkJFnnD?v+$E>6gnPl!jO+H->D2(8i*g{@ZmrDN>Mq8Ym=gSlM8ks(1tc7?|s8~HTF zn9fht!%wn1J!=*1-y2Ze3lx)mvkMi!kn|k1>ZWQObY4UByLBds(E9j!k=6ZiBk!~2 zINnqAoe#{G)jFI0Wazt?Z%);^Sh64nt{5(*r><6l)&_3Y;(<@yY~@l6-0lB2gJ>H5 zq(k{-c`!;0_=(;%wpKsVpK+AoC-$t~TKjB&=1GV@yv^7~ zub@Bc6wY7rL%ogR^ZxA96o2VyV_TDt{W<3t{<7=!wiX-xxyg_K1jhk(i_-ut4Htmi z*I;L3IB+SK5}>Gb!2aNofxN4X0OkD+_RePqF5iX(s#+g7=w2|8e-9U^ez@V_k>>+f z9#aA}0}nVH|2R-k#|YFu)8OEv)YTO3b0;OHmz{#pkvNbgF6W02wd>+dK*26qoQ zg&ui-;}avu@L7YCJLmrg+@r^Ta*rq2+@sgwf9D<}{7?Kl_xN9|$Mcb=dQPMMu^tm+ zLStg%*xV!WY$Tg|e0R?D?D@o$3o(5cEdJphGn3E#!#)0|^%y~^Oi4-k%RQPAQ*-~} z9=)j+k<@2C>6n=Gn!o7dbeZiw{hJ%yhoV4m$-T_?myi}Hu_juS(Q_Deyu9u z#>1X}xW`M66aQU&WOI+TwY7}ed;g%1|DE`l^S0sPzjKf8UOo9AijUu0?ryXcRkjul zwmxGp81rqFn{8#)|1{~zvA;orH(Jx_X`UT0q`*-%z(I{$4? zFe48$*as@Wc(}E}&CvGK*8;?h6tZD9$7XJXPdw+iA#tNP=w{rL6{N={ znEC0bnoB0Wob@RcaEykQErtqkz?DhQxzta-c(52~*{SYosy8&p%l~4;dzw3VnQ{e6b z^H+BDKj2%7=`MtP&o#-)BBp6W?svxwowfsMHG{cHRp_Rt2(ZPL9hS$4v=q%h?9cnu z<(&&JD=u4calR_Jtt&2qTHeJMhWoG=rC|T8YCs`fy}Gggv}K;%oprR9-oiUc?W}Q^ znJe4w`u#Qv=6z#i%5dUiFrOELpXj z=*0)c(9yJ9d0?=29t;RdMv#tgDfp`Kx$-WxaPe+WTIzX-8bcJJSAo>S;vkw3%h*j_ zg$fV^g$Su_8C3udKcs*p#h*gTNFEZf3r3=@yqurz0j?8dXN9loaRz_{1uDco!DwcE z(-rj?aP2lPKcRZeX$C6TG?ZS)FGnYUMVTjo6LoA?MOb3umsqXgnX-j z#(ou+LTlvo)2>`QxH?SxO_8z^QOCRRxA}I39I%U45qgOBUGM6yeb1Lt`d89K^3(({PF6DG7r@4Yz~aX# zYR62ynM;s?T-_|+Sdd#9w>6&P5s`I$6h3Hyx^KU48+^S>6sUjClf@gkWPgO72UH1d z=~0-UxY-vzzJE``#_~S*1i3cJlX3 zT;sZZ>)QLGNIo=X>$+v|jRQUy-tu_Qo0#&JG>xvws~HcbE$TzNqMW+PAp3D~NGn!c zI7dh*ZoIXo(=i$92-$(Eyb&14J)VUjaLuO`7hiJg7ut8j_R2N1I)|1ll#)$v{gS-B z{pnToLJcdd0^QtlZ`Y|naIJ7=$Xn z>uWLOd!nWTM9VIcx41+=d`*p%jl;t$Z3F=vXOcgzbE)EJuTj?f9Y4l|Xr=DQtQc-K z05U4MW6yX`YoAL3IQVKSJxNV!t4(>G?~0l3_nF2j)N~NaisEHTZGEu2O3=*gk>t zill}bv8UBTk5%QM9zS9}JxBYzy`K6lS0m$l>~ZOEOoj$6`Fez!(omMxVcscyQlN%$p|3 zviYW_t-udac8)@3$An8-X-@}zL9%_|<6I+Lw-+QI?zCFnf0uAbE=;XNv>-YyP?Ku% z;M9J=diAA}rpFc>t&HK{p8Z~7Yo$lcCXkuUCklN6B}(=)kC%qY`DTdE%ErIr5F4qv zvoHTyl6}^Ziw8RUEHaH#%SaDS>_^8&aP2zxz%Ji!el;4eWx6w(Yt7?lw{`90Lv&`~ z_sHR4)|;;e+v3HpBge@ud6Fo__@Z%Uv6qE(&66#%aPoPz}R0$=YSQo*ElhP zfOS+1>o8~UJ{eukAoRhbn2Ip;{T)d~vhISvr!otLm&2TZMT6b03W^uU)t(8BXKhVL zq|MZYzVbCK^`7oqIFs3S#&=4&?iL2Bag!AaiLZA1Hr#ck$4TnPgG_6gJQT!OTC@1f zTe|RRz)h=dx8j#ygex;fY^mkvzyFfS_+oU%sG2fAfBMoLPGvZFN7=JkVa1;FmAmQQ zamC7{o5eg@rlvLOcll;-;Ey$|rKuNY9(MYz=jV*SGC@Wkv$&4ul^zLfQR_k5Pkp9J za`mCU5#2>_>*Nxj6KA<=1n+Ksi7W&-h%?n~%yinRRpv50OB`@Yg&jR{8Cl9+DT#r_F$x z*H716Jq;1{r-r8QiwPk$|3It|oI$$Doc+mc<*_W)&nPgt!#24p2*$bxi`8@C@cpAa z@(m$6Kf=URg=FIYD32hv@`xf$oFn->;Ur-ZHf-e)5Ai|>Fsn&y<#E1LWR=t0#ogL$ z*}=Vq8^mA-BvPJ4dxQNT<_vBxCd6xrqp4URjEM#e=vuUJn=uWYqy8w5zmWT$F%3s; zD1LL2E&C9M`vlV1$|J!haxLnhVQLgS&B-lwTT!t7FhbdtSLm|WY1Oo(tu(7A9NH)< z*$#|dN5rFf7O!|GLa8~dEh;*iDnP=~JktxViRO7C^5fGBhtk=~BLkMf<{lZ%?ziIq zaE}k_Gphaz_sCWr@qd&@0l1Jry2KC{zK{y~(bGyaeAhznx3+op2K zqn3{bgIfs>SfNj12)=7sOt$ifRxN`axp_^<5|+*J#E;uI8(bn#h&TnhvY+!wwg@nH zrIULNp0m3po2LwJsunC`V(a2OF?~9urHI$lf}hv{V0g&6NeCDFd#f4st-`<=56Zpj 
zyOB?8YBM}GfcRt#3lBPV{{!gWbHy#1lGp5|*Utp?syTf1j&i!?%?3fVSj7U>2ymK; ze7s-q@uA$Z|9~EGYWYfTafhQqekww>o9$L@Pzq%p{gZ~=aj4NjD{(cbvg;q{QR)!* z8S2X035!TJ^r)8ya?7(><6HpS??-?aCb2;Wu*&}2;{$pSx4gG2;Mu`oHANHS5N@YC zf1tLghGqW)Jp%tgkI8K4v6&CnfBi9(KfVviRjW(O$M(5eY1E>Z2W{tR z>So1Pkj+=6Dq!z2+JR(H< z?SKYo5|Typeta05)r`GMx<(WF<2;(PnAiS+9-pFNGi>Ow`d`rF(s0ImcES`+u$ zw^Fn^%pmj+^cX|{@iT96=yJ)yz%#$NzI_v1{s($=z55i;set0!!`}s6y6ZJ^w{l2O zyBI`*9iczf)r&f(?iw5q;L*ZsX~XV04G5kYMsUt6zHhdWxE#||t$XIey*;K%Axcw( z6{@W{xF5OX<$x8;wz+qO7|?I>aL@JOryFchF{25F+25z(%xOMZBPiq#xirB$?hbC7 z;7uUOi-5C@6jc}ewcx`3%aUr(`jBVd+^>3Wt9&$|;0m@3@E8lpLKfezORe6k)~rHu zCm`+j(0ncvpl>mMiN`38@#{Pw6jyQ|Ad?$-lqr~v_#-@KHk1Kat}p`)m}+H>W@Zhj z{}!8goCpPUBOVT3$Y1>r;?Y{;(T9xEcLjhV-gvOy5hXR-Y)MjcebFMTKeCe5l zZ}L~wnymgRG1zsDFwV&D*I`km;7hfJnYH^rKR`?dH>d zlYbG9WUu-|rkPR-g~5M`M>jU{2q^oBzZD%;2gcOj`Yw2_vSEhC!{1y6sICI}*6cO& zp+>;wY~m62EVjBX?H}S1@Uq6?<#Ub99fSbGxIv@_I6E>MVeZajVDyCWZ$T*grJp zeFeJI_|~)*R1HLX!G5BEJP5#h)doMm*XtLXaa0hM?frT01M4{=_95OQ(w`x*g-8VL zsq-EUqhe496h8cTSpYKHthgX5m>_UbRl6U7@QM<+2to8Ww;+qTW~-fNi!TEF{wR-1 zqjVHod4w^rW(=G+!zO`YC(S58S;&8H2h|+7B~GjGH`Y}bzyDPp8EoZIm$Z%gqdYR$ z%46S~K26Ennv4f{NMnLPh$1)(202~bsuuhLe-(Od&t;<@)2(CT73cHymc zqphM9X9NsNBnSvy612v{u10YS`0{P=ao&XWKz>O4NDx@HVVsWY34!p=MnS!*TLm6+ z*&uqSq#60r4DBUw>FC?5W{ie(SO8i;wB!%;*v^I?QGFa=25ty3xXl^7GK>(GC<7MJ zuPWWI3}eht`s*CuU#M34y3{N87kV7H3R7T1k9fWu0{ z#aPTJNFb5_eR2C&*9e?BqvMVa(&;bs$b{OWxILMIR|uSKOlT1sdIWF|_3^bAzYC!8 zUi=F^W)Md9&?Dv9BN;I05c&`F7^U^8UPdBLhVhmSJ;Gct-B6(~Dq8sV#0?IARL56C z&elFfZ5TvWiy^_dn#+bBnb4_X4tBTlrYmHT1sg8P{sTR3v!TZ*-o1t4suN?giQL<+ zkUP!1iOnOIdIlO2zh;ehX6f-C((JV|XY4vLzQUNZ&HSv0q4zNM`|^eF$>1d|-YFRL zb2k6gAA&`F!w&c{kng}AjrTGFR$e`}Gdg}5C1@SR+t|!I8S!N!`>S>m9G$@LP^0+U zyf-haRsRtWrcrnB;UGE^ zfasHNyT#wimu87c^B07ys+H2Ygj(!q>IwzaCxK zNy6^!Edoi4AAyYgNbZ!J!>XEy^ha9Sng~yF9m@ZW1hp>?O*>mFaFy)79S<`B7LI#7pk>h`o( z(?*2lMxR@Im>eVBZ4;XXk3YQ`b9&7ydEE=vO?kqZ+_*tJ4X5O8rhVP)1#VznNnotL|M?Wg$7cfayk;1r5HHM8&N$vMJ!Krqm9D2c`_yPE3bIZBucjB`rw~QX`jlG5&C}5-9SsQ{X zTGGEG7v?s~0y>Y95%3dPu zqvPtQfoX?p|5NUf^Ge*n{r3pubDY$%Xa`jVmuHdFBm4Ye_89!(yU?)AGC7|Qj{?Jl zt2}E&dhSGc9i{NuJFGAM8}|sGds>BD%gX*dv^?3a6i89@(t`$cv%|<|Mh~nOy!4VL zi$xEwh78_6e4A6tD$akP{OCJbmPAjUEKzStV&==|e{qjv`WoINZ^CT%wZto~B0R3R z-1Cu%F&I5}H&=RVWc%C_^U3U(cFo1DZ~b?=RL91C|K%QaMm;#U=j+~myMAzGR7|3f zewrP8f5^qa0YZTmq&=rTLPHe>7tooaW@#-f+5Eodij_M*tzcGvxW|i^Z4taiYnnGk zRbSer26r8bwvklxb5s9zC&Z~;@?zm~`(AIqo2i@&d0M;Rpht|o^3AxsH|44tdHU-R zVO|s2dxJ6P&&M_GpJ>rFtKYs{+j2jsD-Z6Wt9<^;JuZat3u)TN>N|iEblXgK90d(L zwexyhu&i+BJ{DH~WibB%sWX4L#|V2)p&CaI#cvCZaqp+#wfTo!_pcgw)OZHCxO=d< z#}Atj6b35O4iP+>rBu zBbrsmoJ8adG6P0T45XC2kBc9hD%3OxIr7o(%9+g*@os@{9L<^>sJKqAoedIu!?^S~l_YyXvEdze>T0ZO z`q}(ug#%WzvAdj&3nq$8W26Aj(u+{bqJj(O#xAllLr2^Aj#Ja?-v9_uJ^(;EI_D$+ zz+Ie94ZjhQQ<0Oq{NWfBAQlB6=AJRL_!EJ4^*JzI3UId!d=tF zUxmj2Pzj`OF42C_eH{A4LGZ1dy3yCeOmKJIDzD zKe|jD>reu&lF#9_Lm2U752I#+l(F5KmhPXxL z?p#I!8k=pvfJ0FSDbUvdHJc6~lj9hGFg17VAqR}fLCmDi1_xQJw&+LJlBuflF|8(g z-I4$V!-&oeIMvEmpbtaqye~675#&+YGJalL2* z`nylvh{%T;t%bbT0TqL^)&a(d)((F?v@|bF)jEk&$av!?pOD%~;5es^Am?N#V5xaz zSK`vCFX+lJz9P&!d7%4|{#Z5~+(e(xY;33y(Kzs$k!_2{1^|$B^GY{d zpal>8#82Zr8)JTkk1}P0tX~Y)8yrEJxB_rv;vajtdukqK+wtsP5a@NLfDTkudPS%m zol@RB5vLVNsW2A0fUOjtWsLEWT3#KzMP_Hca`?S%fO@0&B)L1$C{jO#Hhu@N4{ogo zl|LW@FMe6gUiK|CY{5sCrDahB<$5=+-@OE2ef<{>0L->euUNdaxEWN|@l~IRCD&J{ z0@VpTSQMIYX{qH~Bm5Fqy)gZ7`p<--Us_TV8`67IkCTB3Io9#Zd~B*Ndim|=nU8Z1 zyQZUa54DX-i_8&a9jjIVG{6%jmfq`BHghlSD=D@LR@Gu z@c>fPThmw-N&pk^xtxs%Ty@$}3>)Z@ph+9#a1lT>2N7P1&r^f(X@0{T7zl+2+YSVR zmETK(6`Cbr$AM2k&Ez0M{0*;r7+?Sb{AG;}Nls$KT! 
z-7$)&{;T+BxV&gnDr7SJ+LLy_N)QOx&LLT2!DW!3)v>c7vhPhnr>3X5@Ck=9@p)jbGu7a>DQ_cBpa9^NzicsJ#^Vx5 z7yxE1kHh?_Z0?cxoL%#kN60+zD(F3dGl~H9 zeCFf&$|oK@l}Uio0KM|;N+v0JDj_O_#A!$1MK!f%5F#fC`mO}u8qV!Q0RE*%%2WU+ z;brjk6gxk_cI&NSDa}9D<5ppiJ^=DURyPw0);A~%l|OT=c;aSCnj3)2voyAk$e#6Q z!m_$ATWIi^&QI1~l)v3ni88!gtCdj#c9&J7==R@)@|)ay(1&1wT>(-iT*3gW=>vdB zyHt)FroiSZA3K6V3Rn0T8CsoeJ_7v7#w{Clk$Z1{+Gw9WjjXm^RpOkRX zq~nALAONsPhylKN2RzTg78xPIZ7O6ahi8+TVy1c{Hbk^Rv&`7YTRo}|qSFY(ma6ij z{;?hjdS;de9+sCp0f)n6&ZN^(HK}!#U>L3Lu{6671a|jOVqju4*AF!3N%*He7|&hrSEM~+AU*T^~Dg<>^D=X6^#$8 z`;*MLxRyGw(j8y|m4rimD24J+8B{7=I@HFaSY_wfZNXKXT$GWM2te$H1f4{?*J7 ziN4sCE zUDr@ipuj_DFpXj7S*#h{^Oet;X+{87a|U}BPvR`8cJv1@t{4dp-WA{+oVu4+_3#+N01``ocK%sYKI7~_jp_gdL{mJFVb3Qlc`g&OQ`j;5 zT~n~QhL&!{dUX*KBgEC%<>*8ei?nkC*T3+*pd9k{O>BkSQjM70+8cbeyi=zsRuvv? z43=vw;(`*kr2AuwonYA>)2%&}?(BzXaT_>U#U;rE(OTcO=LK7O_g69vY^2*!OtW z-x*9Ir%nl(f#F1%41#tS!3_!!=!1OF2gt(+fWCshRm_W3f~zQibc0&(q~u%*9gMXm z-q58*H0U>5f#K!jngH&4yK5l;Impqt<(A*Ij?`-$XGFWsh8{O&@Q}>4vU!_-b>R309WM>ATIX<>z9UC%XohpU)bF`!rI6dl5gf&@YlDACs`>swyvB6%Hpx#zOu}jdE9uA51vz77;J-w z9x13|Ugl(Tk4^asx+Lfr)9a3iQv)6*;EQeHHyp={%oHgs3kdo6ifQLn5K=^P#=ji6 zWD?>l)#S^4Kv-yqUt)3m-T`5Fi?6)w#Lr?;_=c~dg@nl^qM)*$O43AcpD(|&pL&k? z`C^g#o>}l=KkY?7ojpI4fWNM+zn=F0lzT)yhYVbsP+y!p*YEoh8i0`vz-k9rSp?vm z1FU@lQjGm9lL7)Sae1rU1v@0H?74=fwb*#VIiubl+Bhi$I`IFwosO@UTzd zk;uTKNr6XYp&qHz4u*ju#ev?vfhWeMPZR{6cpm5@FyrDw7ETTDZ^Fvhh;-m{6eXT% zG-WFH@~NGjQOXJGzBc3E8x$dy5mL==QJ9IoIP+$RFT8w4PyQ5TGX%q;_@( zU^XF)2lXq6s5;|TdMYr7ccyvv?A|Ft68HIHQU3cp=Rek+lNC)8qXx;=QBt$W0>P|w zw&|#xHwyzQ1P4>f^ASG5LVdw$GjlGNsMK4*mpOyacV^@iWX6d#WC+k6(EMoabyv%o zy!Nx#EkcUMf}(R~VtVUuiKX8@TVMQ$S~3<=<}-VzDI|;2H2TO)sq(y1>U@E7Xl3MV zaZc!^_ovbk{K@Uck8+@on?h?MPhAKSI3r*xW0;+%-0&=w`dmA#AvNH_Uhp@E;Db-X zdg)=Qx58e3oKc9Lnqv!{z26=yEM)D4wfTfcznnPq1lti5{^r(#?4)nV)9|j|@E$hz zxETI!FTC%6bC3V8aE~sbr{^dR^&H{xaSnoih{vn-5KUE(%jO!Jc)Z%V=Ad(?6vc^B zCH)~D5o9Erc#MJ^ih>v{t)Z8=Y^fw9n|MT>ImzNaH@AjFfWz(1n0Edl9uID&`9-ax zi#dHeNP(;~p&jYit++$MWMnnDpqi|t3et3?1e`l#&suY+I2%`&9;O3A1#_Nd6OVlx zE;^(?#3L1iTH^E--;DdYd5y;5@)}}Yd@ea^^ZcpJ0{E8q9N6XY<_%~Xi%mRUCB&z{ z{!2Utr^lmGgX+1co97PXrX9b^q322RTS~Jxj(15r=dVh_4nfX-TDvh9=N82s=Xt)c znv6956=e)^dB@@MlZ%$S8V`kpH?xPrjd%T}+81|v=GfG44ZgdiW?B5oS)@Lu0J3C>6c$Ug?0XQTJa{=Fe`4PF45)d#qLily|>S~ z_5J#AnoT^W*c+d_N;prvy=fTCc>~W8wV8H!Xq|NQOgQQ{n|S=JLt%6zo{NH9733Pz zAtgm^s%|A67Eg9jNgmKi(z<%C|Kzq;Zkm@X#1ud^yr{fy)hM@HI6NQ&6)@B;$Dj&L z>aIOsDtPT|-Q1biMw2#4T}Oa&Bee+p5a{ z1@~wJc~A3gHMAbV{ThMs-9QozuYajH6#fRexqHHoI;|~sT0Rq}d0ub2tKdu~v*#u2 zs>TE(LtH;{VM}-FU)K53D9_F&#O z@xro*QR2BY9la*IU#q>d?;oFLS?uj@Z?2x!?h+*ceX!1EQ;g+fQf>HtSCtybT&FWp z^a;g6$hR~lesN3T_Q}0#plgx8w_<%$rJRT4GSnW_BQv#LY$3C*ggi;ptPA(El_EM0 zI2xCa4)h7#L7Z_(u@#a~$g@{(P`vD9@@p!3i118IL%^qoCNmh$RiNg2f%Qx2ns3Gd z$Jc1GYp`Gf!99-nlXXgvIHxW}-h-%AQY zmdJSnP26S{rOLT=Y)YWk=m4+gvk%8FYL%pj3JKuaMEe7scSUvIMC{9cDIys#$oNQs zO;Ci+sFPK7e=3(EIUjIPe~(6F`;LIDM=ylVA7aQh+rgKdmJW(SBesqw z=(bC!uzg3*tt46(B~mXIL4Xz&c~yw;9YZ*WA}0~ZxH?>pA5FfD02 zKi3msJVvZX=n)l1DpFA(T{I_xXuz9JirwX$CyP)-n3un?5tVgjqM{DMu zm)e%+PdOV~tGbJ$0xs?ym@Dz}WGD%bS!EwDAGN;f_^60C_4H(_ZK|4IL#=I_1GMRe zLg0yscjJo*Hw_}fDs7_iDd@p=;c``Xh>qlmmGQ_UynFf2jivB z+);YQm&=!^K~k7mQD_h?Nu`Qpe(+vxT(WkcMkXjoYn*8*>9$5%x4x175r5f~#HEt3 z*70;gwG$&;LrXPNM2g)1A9IiPrT@M0SSFtx2%Qvf^-NV~v5m(=1%Hi4l}+C&reeZ{5KMW1)kM<%TWk>4zAtVy!eA!`^;@gx#9kdkKmbBg@%=Llf6>J(b1tna#v3>|+Pr=gsQ&>D3j|M}l-MpOOj2?+`{xp=yM z&o;Mjp+ZHrxmM^Q#_-7mq~N=!;hJSn1S-ZJ$o-(Tdk+^w=zgtY!>-|>WR1~@r~VT6 z01W-rzR%%9lRlMgl&~wbG#AZM8Tu>jvr@G=C-K-iKQvA=>HC3%0RKk;>|MpA=$wU` zx>qjpKh;)>A-ZLo(odg_`8POxG%LhPHlNWUfU`srpp!z}91OJ@-{VFblTuFrID5He 
z-X5O%*2OPr1}NV{xZTzl8Tq!vhw0oQEX|=-WuGcDzY;I>ke|#dPy>V6PLEDny*WWq zHII{e@=P136JvQTD_w#0!AT6^>l_2c?=r3_o{xyvP$ zrQbvK|E`bQ(BzC!DaooBZw z!myTrgO#C@dQcLA25!PeJ?tFDgT=$gPrYSMic>4XlO>9-orPM_kHD7`Wp;SOwaFjK z^MIGK^#$L zLD$oyKn(+k{ON-)h;v(Y!YMH)6L)o0mI3iCN-;c2Llero!IQN7n4FXTejm*Hr`?z> z`AK%**34TjHPNJ*5-6P~cWD3N4dO~+MS`)`(K@FDU&`wTesYIlT(~SkF2d7>9oBIK zbpMBZi9St(Y?oc^gLZ666b~Zx_`_h?8HK|6p zAl?Vn+-d;`(o?!OlF335MdtYhRnTQJUQ1kpnsi=G$2)TD%7}V95aXgpxaz^j=@tlC z`TL(3H%;%$9h!H}5BD4w#a-$c%GQgqK}#aT#PD)c^y)Je{ojCKci9UW8N8Um&4xN#&`B^&<&(6zHStUh{CQ-+of;0WVX`|vMv zjZlh$fmM7xm^o!-i7HxMrYpgiiXL`maU_K(2OI?S!x}=491Ib05Vb) ziwVb?p>g_fvPCBdP@ZA>6r)`azMCL;cFxKyAWc)0{1TC(*GLp%7Fp7;`=VN{*|MXw z?}%=zlqU(tHvwlzD$Fs(XmZYGtuQLlnJqgy-ZEJdVUv(b#z0jxL@lcYKG1IUBwbBltt-=sN|YGGaq5*>VHGWlfgE=* z%`}W?6&3-o(yz`Y;pyh86vO%wd%u<@Zk+y8%?}r0CXpR3`Y+tk`>s%xf{O=OJb>;L zPZwe1jtgE&1~L+C+>wDXHeln9bXhZ?#1Mvm86?q&pE2l_B@kaPK~l#lI!^%#1kl(e zEdLgEABtp&V%v^nZRV(fXAn-4734o5BUnK|FoExWP_(Lv&Z^`;xFej6JJO{pn&GY` zmQ_H(=GO2Dt3xD`5S)!Ww*0{zt39-_l!yss*$U;b7v+}53V~Q0f0VrYd=URq`Dq_% z=Z`q~M>Fu^+vCxHamT0SY}~Qu4FNGo;)d)qPW5+Y+&CO>C9BP0SvEHR?3Si6=)mq9 z;c7~1Czih<07v2KAH}{0vKvM*L?4i$G)8-gB(@-hvnzAT_rfU#IR)U!7Qqxla>tgH zTtb=VmX$t@oYrfNSUV|IrI_vq%#Z{ddSoHH%QlBwwNiw0EJ*Nxd&`NpEVlxdepzW@ z*|wtZ*#vREaY}^r$?F13YnYo{{Qo+U@KV!ZMda%91a?1)5Hwtt~$SP#x z+ij9^psXe#ym#3vS6sAmTR~4) z2QjdW%Qg@NCUGR>5AHa07--~4Dv(Vv^==i8t5onNAv;SAMO%fsr)2G9;1o_BfR(Xh zWyes6(@<-RY(mOVkex%l%mYOjf>0TBZy$#lV zN?49oKdko+j;5$Dp?O_>qLU(q?mX~&sE*re9b-StB|>CV=rDWV2U%q*6oG?d-o${6 zjEM-lajB&iH7w=)0}>xfOO09hBoyg?timPX>(BiTZUAKo1dHK0C+lwT-E88Xrk%#K z8l0H{_}fzE8$!qTXbl`fHJ+eR*G?A>wlm9OP>vw_79Z-bN8kbwKSmuo1325eb`AQM`XU$wA($Iwu329LZ2n(aHbimic-c0=2@>Fp1Y zvnK|Ze_3_ruy?S+-RkKa-`=qcAB99OLAqM4w#W)ua~wbc9F5+)ZS`&d;o|)>`8PnN zf_``#tO0T#IbX^~e?y$LaKB5m(%;elZSr8;FHySgiAXw_+}CuuC`D8`58E^Zn^S>hRN^r(;$h@GVMn zy|RJqkAK$`!@uhIT~X&W z+jsO7#Uj=|tLTmFuO~pGK*BrVLp*ma10F(A*ehLN!E0!>(imFkAK$T{%Zk5~?K^_y z2%2V{kCp5u+_E$zf{8RVL)PGn@)92$on?drHcnYd1p_%`qTWq>ca9f!Zf4K51V!eT z-ZyC8d@b3|W^EN&-BCS*EwWK13!o5RZHYeVZb#y&Cv=>B2+0Hp+L3ziFY&DcOjZF? 
zO+;?=w8iFX{)oOs<-ArVRE0|5UIJ(XQhnrnLq|D@wu9PPqfoR?`ahwPUf##wZZxpcc(K+g2| zjOxv}$*FPNg%^v#^i&>tzgoZZgnG&~_}Sq(A9FlL%7!v4Nx>OEJG3PcWj~Md5mnQi zKUVo{|02#Z;d3&_A?G!)w^f^pJtJAG)Z+(O+ziD0MlqxPlyn_?;xj2$MKn67updj2 zpDPw|1RLYzrv}Jl_gQ3CnGG+7+>w862!3x*F*SatGzHEY z!)A~K_SXR;!`hUXXQk{a=7<=?X!zkQ48I44_u6z+w^h_z|JcSGLiL0xWaTyKL*zuH zpa)UMQ!~zeEM5|G{sCwyA}(H1!%T{(-bWt2pnkbM<_HPwHdf9Hqo2({pVQ-T>{7I` zP_(SR>$P~(w0IFW6rE+k6j24~h!Wi#>Ymc9TECr%6 z?P!rolF%ew?d184N-__%)1p)QY9;9^xxwA;u`R%A3F&M;^s^gL{~bqBRRw(%#T0U# zFPdz4@c4x#deDUl7VO)E2Z!)i#o6T=#9nyHO)x02#li4CI1eIk1zOAQg=IKUGhod| zaM3*~MkqdJR`U}eM&cd<*=!_-+JmWao_<}frdfgG3%jxrhH~NzQ&dva*K#rtIBSxTmoGr zp|#ek#WaCo*~su}v@|B`dndFVHEX|^(4oIeciC+ro6*C}I`kXgTy5-3Gi!To_Lgqe zQfl_bD&cMAZcj&J>uh7s+eXIw-L89$y}XI|p&K+e{o~ zB{Bt*#^sVmRV=38znZqS`1~_*&Sh`na?|9^B(8v_ui&InUW@S#whGxa|JM9_?eFQ^ zP2V#2=8Y_VzO-04-Lz0^{v+99QKM;*m-R&?d5YI!l`T6G_Le>;PD@!%g;{*QO4g4{ z`ca#>K54P>(_+ae`P=E_1@h~adlsvwSnK{tYqJ*XUaZxtuUBK6c4L#d;w^XnaF2Tj zxkZgTZ?_0&?(*x+3HssEMbwTwjTiE_$(A~Bc5O>2!0wWq?y%%};lP7e{>42y5H+UC zK0g7?=EUVkO_b$7n&nadHU&vs*9>-2$J}0CIJO$pTZBp87%fp|$?(BVh3}TufEQd{ z@GTc=?8U;6HKu3Hy*?lA89C+u=In1tLzyo%r&|0zbbZh`_9U$5k<-iPKR(Arzv(@k zaIOPilhPmbCG=>NPiw;C@$G!dR))RWaGm@uwcPp2_)f?8E%vEnBjH*2wB`Wtk5-wBwk5 zh4U@2|MPP!*`$PDtKUA=CCer6Q9gaI869f^@3K5_%fqUR`&ms$igJ&cNDtRJv%DU|!xyH+jiw7P z_wH;th`cjXTGQh@fS+Tka7L)UKd1>D%zL|6wGf*o&{`bTjpOXUJaF(`88XXc|Cjvt z&gQ>8?wV@JTp46Fjw%%$HJM}c%?CZlRV;~~vGhm>bMY^nm=jkO^j3h; z*M)|Qu8uidJBgn(c;#UwAZ4<|pUT2T9lVz*5MDfSU6>XXTycW;%U!S{Pr0HeEA>Y% z`^b;z>4&bp!+kVX_#2)}&1CLufV$q;*`xdI&O)cPj`m)ix$*lmkb2=vYH*&Cw0sBs zOjYoy**n?FD?T@`JnDBK9xTRoMW~kCp}?Svfa3w2z=^3i>862$Np)48ZdJOA_-oa# z%0LtLm+qfU^N*OvOCb&d3?T(?<0L-p3%BE<`uDV)`PS-iVl~($^E!lKTV^l4(uv_y zaHDIT>wkN+)v-fb>|hlAYh|U3@=wgcxsC6q^x6%waGf*zMl5@dnP+`GToiZBx4Yr+ z!AEt`S7KYe7WbvTp5!c8dVA&nVDG-7n(Eeu@28WHkc5DM)DU_#fG7w;=v9h>B5-Rc zDxxUGf}oPnr1xGl^dd?T6c8a4kuFW7iFBn&Rf?21ct3l8`>F33@5y)a9OX1)WM!3q7tMTp>w5F{VnuOm`j2-9kGEIkRDaf8Ga0)QfbNi}v^G5z zm4#0~d;crTQ}MC0R(dnz+)_JBC-rRNk`k0T6oat4E{^tnBlba?Zz=>cz$3V?5Rd+v zrHTTU2|SastYAn`5`i}nuJ`b^b?H_oZKP@kPJi7g@q-CO4;EZ3hZl_Zmww(!-D%u<<$L&7zo46UaPcD^fg0xl zPtJ#d6H4P|lt9S_HfVRw^NS?7p_|cVO?}I=#xv1B+=5ZuV4mUc+G0C$#@E!?gl!i{ z=u65*HyX+k>iToHUQaMz(05OrkdD%nhjtQLC;8V(0}!1xtszsNeKs2K$afMVMgI{8dj`VNNn^e0V}KfaCc?$(kb9)?_h&-)Om%-Nn^Vuu zA3fuz%{DaAg}l|kyhWTdRC!-<#KPn0Ku5ijt|64;BC}EWsf%ZBTNk9Ah|bH}nKOBF z(0?wf>kNK|)`Sl8>er2@JDxZqc~*sHksITjuW(v-$Gy2XY$J=Ue?eb;YXbM^d!tCE zIEb%ictF`eA=iy>&KMLRmKQh0Wu^o>bMfNm<6jhuz0+?nk#of8^YO)Z6s1j8LI&NY z$)fSUr5wO2gAKEDCEky2I!eS3S>5_%e(Qz4Qi~bntUCl{4%ar4bwebMf{{Ao0Zw~B z48M~zzYA$bQU%ag*|yFVMUj@b_-r9!?i*jrr{~I;bW{+$FWPqHWpmmM(eJqP)eSi> zEncLKUNqS>oY3gIds(u54wv}$4B_@YL7q!f{GZOX=E&tBKo<5*5b=cUM3CiH7S~{u4&J>!(x}n_ z7u&6;Zf(R8RC%<^eZNdd)cixl%}lrxc8S{}qri|oEA}P+X24OFc8BseVEut^FE4LY zr!`k^OcyrC#_qbV`Y=AF8Jle#tI(4*d?*J{#DbAW)hElOl{pJ7{II1dP8Iy`yr=C+gnt za}hm-vLzOu(T0kBBD9i`6E|9EEZ=Vls6OqBmI=L^^BYT5r)Qwt-~`5L7Hre;G1qXx zRR_eanp0eATn&Aq5qu!I2aOrnq40D0jeM@}v+k;HHGJM$VKdyFOjPx}sHQf1G(~kv z#^*uf4X5?^<=sntn^PPCwX2*TR2vkZ$+hM2Pfv;O#o82mcKM~%mMiM8{u;*s?kf`* zd)ddn!+(KJY7Hk=>S&j-M3-tgE?}?M4vJ|+)Z9p4kKq`mu(-F_V^|Y}_(Z;8Q|_z1eBo5Ad@0%k9Q*-K{zPYC|EDghfpZyyc8 z=mQC30DdJq?uaIKM<8n0X;^!1IYJaAZw6L(DdyG@|0T@Fc$}uuP}Bv~4P%y}67c8> z%SCB)A5qiU7rnHC?v%oaJlEJNVL4cNix+%)%YKGKID4Rb2G!F8azi6oaeOSSA2e7` zcdK(pvpjgYt~(EX=Zs5o6%M2R4ig+5CLv4cth-ds1OSpU0nc^W#%l_M5jiD+JLFP< zTRq!%O4{Q5ezv!J)jz)7SxNQ~EzQXe@1tA7q#Q58pTDz*{|chlK1c&s zjQ*f8F3BqJx%B#cser?r0G~GgO9}qW53a(Z7cF-5kZ_SP5kiQp$XdP7-o%KmJtC3} z`_hK9Xj9t)9=QNYe|?RI^XyZ9By*bvGlx0JLI_cx!y=X^L0B>uz1}Md 
z78bwjF6K}m?jx@e<1YSjmCN=FGs*;un6Bo%A(6T!4unW1uD#0dl{_LWRmFNBy7sNc z8505+RRh(H;bf=I%6^NHU67YG)qK%1zIjlDVtgUKeH;;dj(AYUMhEmP8m%yz2UZ25 zFn=-I`lIJbQb*1uaIr~U0LY8$JNg%=&Zl`6{5)0q$t30;wv`%sR83W$uX4%+0Z_v$aNX>OqLM*0{)snwxG#z zd+w8jNHfAv3qa-7Ih8lDC_0%QRZ8gAQ0)N<|FBbiS9}TAcY+UHcAgG;+l0UBeEOpC z+tFCa@2u;&*VX1;o&L(EKINf~DOF$YGjNwDJ#ZK8)%a}j0q@m!BClO7TvC08VcAIV zA1Qw*uZ!p1K>PG18@7MMec&rcc^^dR^7Grl1L?x@n!Jt_A#w(6F$;GJcv@w19~i^uyOo>9-A4^ zBb9ZGjuZB^`9Q+<$MZn!kiGG0vX+Dvz|t&r`DI5%3Hh}s$TGPc9P@KJDE>kK(k|V? zj^$o6_kf)PZYFc>eDV!DutjAXQaZgNwqKH2L{Ffm54n&d003g~nrEdNIIStHq(?c@PXaAuHj1EELc7ATa#P#V_G%)r8GA@MR%i6tCcpX9m_4i^MnZRG%{!^mXp*bAC7AevH#Y z;eyA>H@Nx};k;C3bEi!@MMyno?35;?*Yc>X1#FDM-isl`WiV?~nD^{%3lZ+_+1=fv zGL>O@VHebxPoC>R^WA9|NYP*=_zEo2l*JM+osaj(y5NkKKA)*sptn4G()xIz8g_oYL})J#U=_a01zF zPXY<2#0hElFVQ}Gu~|_$lc+p{YbBfhHTNA&o%l0paCG0fGAeRlLWlu9GMqN#!$kIs@QYW-NQlx(u4D6!MqVP_+w>}Fb(~sDL4CM^6lj0gVdqqC8b1j z$FKNIE>MDq@2bP5K1ZgNkZA)?Iipmguja3DDegh?v!m&K7t;$)rksH+kLDk}Mdfs- zicD%GZOg>((BK@O1vH0*=;WZODfTA(mCo%4G7r+(tTQ-wyqGG^4KkF+`n^ddk^K0~ zbBNcDH|BLguU*YJH2WhjPO`loq)8=bm~!Ww^HLj8*73|ZS_4!yZMgZ2mYQxrPrzfv zhlEN5u^WQ?6b%7pYP)g9~Uw_#g|-g=vX6%p2Iz6BRlmPFIJ@y+_`-4P_W*vxU{O zHe_HSTjA^z3%eSH;0(643^uC?4iNB2YDNVCl$`4=@~(dPDvwaE)teEA&0KtW<4Cp- z&%3_V1aof@ni-Og$9mTDWs=zvhyrA!|E&rBQxgbwDrX4IB14I=HBtvm<9$lwGOjP~ z;8zKYH3qd)@5y#5xQ5tGoNqCD;5_J#<<8Yp}XVNJtYuYvC=4X&2jvINaA8jVP1 zNRKq8=>k;lF65;F06+rBswSw~wL<}Ia@$Sk2+N>}Jx4+GRjHA+2@Ve1iakN|M<3yP zf?aHPala^Ao*B7{_I|0hgc`wO9eYIRcnk2+IZyA7SgW>z(Y80L?Jusk11U`o*CDyK z9h`hAeyQ4Bs-4F&aBFrl-|7jac||{Vn-_RLg=!r80$wGFNdO!|k90bF7z26~#FmpW51ZdB_*&b18G5!ivfw!^zy}~aN~nI+_+bK5 zeR`@sy-yvYe2!mPy@7vGFrl7~UPKY-bgm7?4|`&Yt2DXn>ku&M>%^3VP6G7UT{&$WT- zD;ayM8IRivzO3Z!PgU>(T*sLqWB~E>FFvKUq4yk@(sR0jsOrA2ht}2{dMZf^>!KZKe$Kw>buKSKcC;{gclq4g?^;m znL5y~{<kBO|F91QNUE&Tr5W7y+6d%GD%oES0%2*m_@+s0M9f}$sVs)f&o@Ev{|Gb9byc9we7?5C8&|HXV-EtTsju=R}uhS zm5zyjxW`Pzf*f+>qUF^TgDRmB8Ry#TJc;&Qf4E0w)@0Wot=C-oSEkRnH1iJR#kx-h zJ2AM&;b%_+PJH^96;mi|#A@C#m2%R$Nw6(jlE2Ln<9Gz7$wh_)%XVQjYyoXJGJiSf4hv8_H^pq;@H8EPstW`UZ>Jy5# zANi(g&Mt@~4iW>;;CGt*!lkDYEW)I}x_R4_f9hgzkA|0!w`&Kz8risViGf`>YEx{e zIytThL);w9xb3nwjC5`C?(@inzGlVfH2joxEDz{NV#KFe>CscN|R)9hrHvF zXAfhzzR}Zi)XLb;H?bAZfSX@;mXyhKhUAR?;T}J=YkpV@?zZM|kUsTdNYF@dU&-B+ zJ!J7;xX0t24DNA|r}mJ0T&`~Z*zzyjBT_d6Zkb=UNrC?>_qbK#)mYP27+f!O$UVM# zX+Y-skKCi|>Xg_a_gE43hkMNG_+Ic}>OXUjLWMi64`>oehukAZNHM9YliT-@dkhhm zzokS!ehyA-<+@rYrzvmvqsNpsc-+$vC>yL>Xb6^D2;&wGd zIKl+aYA7L2zI<{zchmt<)$$(q0~jV=KEYu``XJL?0#O+R@@>*#N6XD)NKSSbBUgWd zvu9&WznOQU&-FLu3=4Gaw13z%}ar;0d{fX%&#EDdCi&3c3 zer{bnPNZN0SR=w_)R*_+!%X$=RR}DlfmOs{Qt-yb4`MIN;?uwTOB=cuz!K20{Y9cd z$EyrTU3n%^5qH;N0TN|8S2tPu#pZb|*$Cp#e9$lW> ze3I;Z-gfupt=9Qbu6m}t7I2Yty_8``HM>qnpZ#r_B0J36fy&;yrD11=3p}f1?v2}_ z=FOg{uTYs>!!nr7cg8u#N+PYdeAz7Xbqw@N17z;>33&gc;nm(^TF(xYRd_b?m^2{7 z%mxH9MP6LL+$;S{<6dQOhOV=E-3jZDr*?snzbMAH;>gru8bcbS;+$so9`oP zG&|50aX&7R>n;XGp?@tgxX11zdNSJN80!`qhi4K$f91koNdP~)rls4Lt-7fW`1^E7 zLW|rfI_RLVsd(lHP*RyXBon9o)|2v5-57vz3nH`F${f4Qc>6*8)_S{7b~X3~{2F7S z5w~v!JSl05Lw#_>h@LA6PX_s;%QNY!vaV960RBAB{8>GT!4HgZkh8W44Kd=u%)=9y zyEXNY5{IQHu{G1;tlPrkClHv&HsH(S8Lcvb^2U?Y05AB1&Qq;&yk4kpQwp`oAY)uKipo6h=ICSF2qldBqjKw+g5{Lu*}QOszRDMcS$;=gVIQgEhjJ;!Co?QpAckFA7KBwNB-U&FN)d z1X;x=sfbMJo~&}yL`y3vj%=tIL>TNXJ>Rd$Zd*70oR-L*D5PlLqd7)MsXojPN|7p$mH#ZTm}m3qSxQ= zRf3V1t711YpqazF&3;@4Ee_~AC7sHjR1$I4p|!J5@8b`cRM6FZ7w(}f*!bp5aMGUw zj9qVOWz3l_@6<#91ERZk=S`Sphg>?)IXC!lFPz?vd_`|owm~K4)<_l>Nvhjj@bTIH{P6eZ1K^(CV>>L&&&=c9 zH({sTtEfS5p;goz>5zNm^rRMJZ<+M~xxP~cDyiA)R3Rs-;1*6u!&f2+E!u*^<@*Zc zQzI+57O`$pMCvhpsvMVNn1FR8u;o>Xa1U%v1D 
zBI17UKLOR14#h{kpa2rrqnAB?`dhw6LELtO0YdtLI#yBx1MH5(+miB_s;t0ayFinq zK=D4tvQPj!{E_8%MQO$u~Bi>tfy$lffrQ2KNX*2zr#not^KS+rn8E6nNQ*=XJh+ zs8Hir&Lk zJhkw!U|_^pe%RAVR|7Z1

IUO?HQlEF+ptn5*p*G1YN8b1}*@l|gFa!P83N44P3w zJiJ09{FqYI=a%rC?~w{gQN+q9@?ccmR#Yihw6aq4PKy}#hkxT95wHG@d-VVJ+#}S3 z$W;}-`<*(`!Uf%?G8sexu15`Qg&#={?@9_KU60lb=F4`9MH$5ERQbubN=iOqk08O; zE#$@A5LDa&-ntt$<&Jc5wGb3KX@UqQCC8m?ja8nD{pgt>lpp&}KW@$};jvw)TS1(? ze&S?O)P0GBv%VhFcw2*21q}!0&XrrhHAB&Qg@CHW3a)6cWGmxcm6-_(Tc9l%-|l){=q#eCmdH!2?8c3C@0^zn0#HC zPk$(xy&(1dT(bS=xQQOWw&2u!cg!0KRHr2P6cznhlxvz2Zz_n)-%bnLPHwr*1D#8C zo(rj0zIP`naZ)+cXZr>)?~;S`pWB4a)~8dgu`|ki z(-LXzEkVmeoG|WZKuteH0{)00t;im=UBw>KiU}#e(Ys+nhM6MPnSBPZvtYLG^h`-> zw$1L$vE&RTxo09VckMrfOajB1tCN%Uo{gJliNts_%V#S%tC(Xk5^@OZs2Gz{;-E5@ zE-PHm4RMYePSr#+ur_~9bOZ(JgvB%ybI2tK-FbGd6vD}rEP#6!OIsT2$&B018CMPS zfWFz1a+;>Al!Bpb5e?WIdS+=Rw8jW}qEgX={dWREdIgCwa3JKSaSd56{`0#)P>t%<^g zw-uoDU2hm>)EO4}=V+0!@ES6T+>J^n!M^!J6ITwQM^iIYF8y*jv;3oZjAIFcj#sm{ zR{zM13Itl#S{Ka~zFZhK(;G)qqYk0RTk?m{qkvy_CJ9#Kiqt7F8BXCIHiSixnGYI} zZ;l>9kB&qsQR>UoHq16Q#IQ7P(YciKt(Zix(OO#|OpArOV!TrV;eEwaLoRCvmE;SS zB>SPO*40GoG56?(BC92lUI;Ro$)|vQjAB&zpip}Cl`?BNmr*$&vveB&5AKl$-!_Bm zU@_BdQq$N235j;;QLl*Z4L4)(=M!9(t`3JY~2!8R!|*4LS0EneL{h4{Lb*zbzSrDcu*Tr-K7 z9{|+*62w&-R6iccNKNjNQGDyN^(r;*7cdh5*Yxo>1;8@uTdpedA^Moyt00yEw`#zB zkz(t|#)@V@TRk~_kLNJZ$Mepz`cN2|7SY+wWJOop^=S5Bj|cVrWz8dZwY?{y=pTpRCAM2NqS z$#pHHkF#vjk%mQ>ALA(#u7#auh}?1XDgXs-M_G{{O6y?J(q1NGtURmuvcag=Vs6@l8Flo&y2qsl2)|D#i4e5GDMTy{Q@V(G zj%8?|7(XpEt8_gm5hLQoMmUw}ccnpm)X*gg=Hl8I>xxLl!Wof*eqw$^H*C6uiAs6# z38m(^BKtY5lTrkaSm|&MWq$*Jwb`Pk0kEft>Npv^GX<_m7Z0)CXY;? zl3ObSfZTV+y|==}l2Jd%9pdI~={pAn8K(+!yb24d3t#LMmZr2lDs2<(LAg5-4Y)B( zQ(6+G?e97UV(vFI9D(cMyQJTlCz?qNx;CtYLRZH>;(0#&Fq2sB{T%PB!r&e=$NBE9 zngxu@?stFuPG z{zE_JoNn1RgB+Gl&02{XjBY)mlcpKJmSd|WcJPyNIJQz`HQyCanoEgh=>IQYBTcQ6=HXTjqwDpGp_4;emtw59NG0P^64kj>$;@a0ZUyT5RqH0b!e`lIUlQaPen>41yOg#P`8FTBuPGl`#O zeX2GP&(l1iaTvu%vO0h_yT0)RevI&?k7H)+(?x7E**UpUn3@vEa|%=sIAw*O+NVKN zT`f#$P#Z8z8uAG)i_A)A+oiScli&SYUUlVIS|`%jvIPEWdcbIe+R$*ar&weHuJmqN zvcE=WCD5v56kx2gdlu((3pb60>S4u0y0eEmr}y4M*2|f!&2UM4LsZu4lNn>lwk4BO z@C`3yd%9-QsQ#~T&L7OhYlY3W-o@S7-M@#64-CgW?Z)@<`q1YrH~$78v2dN!2QS%P z&Y!s7uoj1jph44YI#JJ|<4Y4hL$DAUX5AIy%7cc1P$D@CH6|>26qvQ^yzEo9@>T+Q ze$c9WK~f^OI3}A+gIB4Zb0jmrAv4#|7}v$iHUH_Afo#A@%%V%#!da9H7P{;{V`PH^ z=#7ZTHP7*Vs0)LZxk7f#Oq${*e~iC=IhU6@Ui9|MZ0T_Z_XuHM!UU{9ewyL0wDIA* zEsQ9=7Zl3WW#rAfzbT}SC8hNn$0PmEHi<^Bd_M7sVL+C3my%aX1y#yo&C0}(I-xyt z4v%o=B_`WMeUXgf+gpI&NSi-gNKd6#E-*Q&Dd=o9&G5>A%fd5ri{CZ zPrap2y;USE7x2vpi#DC;R^)>+HDKa>e<>C97)*>+qZ z-zh9QSl5*W%r;)pD!6l1P5$BXCPjVO`0+bA)>1uJh|vlpoz9#9Vz8+w&Kih0mT||f zXBL3(0$`oSuwC2^i0!0WJyO(SM8^z%-RR@9kK2U@dRURZOe~Z$tx=a{&G0XEYqEuM z9>%1iLgFi@RB?G}Pq`^i#W^DwLxoIz%z4suAoelxyhmX({t2tv#@9%fM-;26kLSUK z?4mmR0W+DJmLus{*ZUZl^|+`Dt;Y#(8xjVM?nNvy$_pkY%K-D%uOwe63tBqHY(g`T zg-^|&%iO-vo=JxPCPxstHsHsua&PAQXmuBzZko9T(d<>I%wN^ElgF^i7t4NBJaC^* zP`_gxQ51sv_*F#OYIu76ela}3=vcZMU!?&@{Z;1DEhJ~onK;+|MeR~Q7>|a#Bkie9 z>iH$>*Dhab19&d{u&utkH1uhj~BA$Ay&oU-8_;J1dVa}d}{Ld4H#V4r;vK#G9?<$3`AEyiQK{-qhtuJ9%5E z>M?lz=U8J|@q6$0hqh)pVQxkwa2 zoSZKC2lrSc#UHs-{Y?$`NmEa$hGZkBd-Pg%sV>j$COJL6`_rX*0zMFVeZeQfW%@#q z+VTb>iH>CkV$b~L4JGoj%M7K8oBoA+G?x1y{K{D2qqf2s#m|nf&Yb-Ezue=sXFjk~ zP8m-`tDUkUbx&Q-NxWKpJ?~lIDd&Q`+-m3N#m%Q~ym&KPeWSPz_P^YtWu+#^>+XPK z{&uJtVKGP8 za7Q_gaB?2y;o{`u;TGiQ6&4f_5h6&42+2r@N=Zt}%SfJ(la*IcJf$eFrmU!w&(v!ga+con44QBLU zz>FX7onpX@9-b6GPuB+z9(;Uo`fr#q}`7*TnPgiB^IC zP#N=+BN-&)ofI2}%J@%`(d((hi>I;wo@BhnAQ}Hlk8wTM{huV`KRw3vf_wi1$v9IQ z9$)7BfAJVA!)aBK9n~-YCK-Rdjm@r!U3i!FH_4b^msDIgTwd4ri=J9t-`!td&QKW{ zB;)^^$4Kv~c;EA_qi1xcr*XOG-F8n!Utiz*{_6JrsowsPh5n|E{&(A-YE}mtTRzt^ zJjT|+x~0Lk^`Q?P!;L$`@4H7ojE;U?8|~=-ve-M;wlmhg@wI1QqU+~G|M1lM@O1yz z>E+p(+21n*W3z*kvrFG+XMfEO?|vJcnOohN8=IW}GCx1RIX^MGII+DrHMjhqRmLCx 
zt1_CrtNyt#`0B>juK!kLT>V#-k+B92fQgS&Ak0DkrZPJIPbwo;I6;4sO_UM5$140M zl`-p&%7`_E4B7iDb6lUktMY?39iuWhK8;c+U!jHToxeWgu0WW`#-1oI$-!D$Xf&hG zUAjJ-{Jv9P$loG!ZZ@^jrSxi`XgT6rx@Yz4PPEyt>%MSF(ouYzWDr^N8RKojOlcyH z{mgML7q(6{LJ!-SELcTjyZC&Z@W&hp7jV0D+^r3XT%Jd4ENVvz+MJ)3+&X~yv(&(a zzj1?H6u?;A5_tTCP%D_M&N+nlB@*fhzF1h5ARbE($qt5F3TsF5q}7#kJm$>r2}wzKjG_6s472ntz^JKk^qtdYp_t#VnvsrMjmo(-;}7rj8eyhS4)+AM zxATP{j~fd2j7G~$$J|(Ro^I(t!*fftGfh;|#00#QQi|O4UaUP7Y(PaXo_m+;3Y7O z1j6Y)L@t41_$_k#u9P>3z5d4-Cv9un&lhN&N{SN!@c`@{_u)*`$p8)3CX9dr=o1z#npkHjf}w`jM=UGXuCKNmg1J zlnzB1U;)LFIqeb}+7D_3N|HorPpaDj!hp$me{f2GB1AayZa_B$(3Zdk-BMZ|>X98S z1A;J`FXwa`1Q(@(K3Gt~4)VzY0{~xa|LQZ2K@2I{s)@^GCH&})DP+$zlc#^RwpaNE ztR;VvA8IenqvQ}cqCZXGxn6yx)qz8EYXXl%*wyO|Qf%CSLz zzgTm<`skvu$^Ah+!Q3Zdk1L~%HOJp`1G3TH_tqlB(~&r^1;jV$HvSzXs(~A)?>idh zmA}EHMs{O@s=2#NdnK`L3Q8NF>Ur2VFHj{s7s+u2i8TrdEZzSesjd^9r}mSJ;*US> zGaBbnmoi|yJbuDY+W~ViPLIfQxBbz>Bk|K^R&Z!T!?BiQDaa$7lj3l#CoVn7`{GIP zUxHn+eAxo8@P0r{f25|eLspK;yk@+$9F^>BjrYhZy|*bqXCAmMzy_}>nbq}qIAyx` zsz|ifEPv;L^sAW$CJOngW$S^cVov3wP%tnXgqUcsk{R?Po#)0=WU_O?ZV7N}`Y4Bz z7T+-dJ5->RE@vOk4;(1mA^w^_Eg_ia#aDn3jN1uqes6r^ z^B+%IqeNCOpP71j|My9RX_rBgfd8Go_xzPRL$1C=Ol~|^pE^KQTV~O>qlbN-m`rjAqtRN zmmtI68 znYsLq`R9_}Ek!4o#-~WDQmb`qEC6klm3HrX-2i8zpL7+pEX$Xp+f`6>y4`TL}00Mwg9x( zM)u8czRr5FPEF;r%<&iJ9$=p5mvDR*zextk*L?Iv7*9UC3Xp!|!}(+TW;CJ@@T%e% z@?1;MOGH|Qkxw?E+GfduDDlhxg7PJJ;$AG`H9g~!sA~PsQ0=Gi5+iA#|6E5Ilp|H1y!q)1xSTE!{4mD1*C^h;O}bt*|40- z>b4YpLMad6ODY_wfFT{mg!}4JMKN0%3VNR%m1Ee1~H#t7#Oq zP%;;rtc&&Vz^>6uK+!%hISPYk%pI^l_1>pF&*$Y;EHoc`@2n}b{*Y(H4dUBovAp7} zLW74qBb>oAM)`jGe8@8fXyWzpm}m;b28;fR^#c#(zN@KXWS(md4)YWp6l=QoxAA|&lsYE z*QDvXHrbda1m5#}a;7wJR~YkF2QNxQva{je1w8S$3OZ8?Jr)%dYz5Z&iNAk9WL|x5 zd)Fx_&MH`=CkUOXolqH^mc*W9g-;m_W~ht|p0UC8P-SdbIh*|-R7TIx%B0Zh%Fr4H z&qzPy8QHm5*()*4o?%ZE!xlWkng>I>^aTq_)J^FjM+G4USFsJQflTJa%_mM^0@Xy5 zb(jN>+X|ay@QkkFS)}tBNvzVoSYiTpR0%szL)_Cfvvt5uD6#(Zj7*7&{KgfvtZ`j| zj{HqzCdL_sh(H*r;we$veKN~sGDx3x($p2dLIh*hBUV-*qQnRMfu`uAwznHaSbD?3 zz-t;GtYxB{O?)x7Yj=etY=0BMHZ%vOgN&GkK!^;(W+Z|`#RNx)>fMB>=M0R*o&iN0eQb$t~AKQ^wvd5@TAjsT?leEPtRRe_CWL$hSI3JH~!ynp= zVI*wW-!`Mhq0RV+$U00zVOdiuzN7AU6PTyuVwyC&T(x8sOx8E_=IN4WCKbO9klz1o zGlC?TXH#5WK{tQtNkbC%0oQ69q-!&Y%yF`GWs};oG6z64w`AEakTi515=yr%y8;d+ zs^>Q(MI8tp;*9F}WEOKYvU}nmT;d#vH{EKnPm!&O8c1JLMQm*l;>~E5sK1rg6BzuA) zU9%#_WiL07q!qBHHsmXKBCt449w`%(n3*DbmI*JB=h)V)ndMrn?HEP?6juEz0Zm93 z;}AZY2K#u_*)`DOY8pp%x36cMPG?EV;@HVe%Yvk*8u% z@{iDHh7KL6v`G!+cSv*nBQ##LLBmMd%e1Et7T$pCD;YIQnpE`+g_#AA1HiukO_c4^ zP{((`>I|VVs=}RE8TlNeOk@a{Z>!ti)`o_%=Y^%J2=a`-+{ZI9)1EI~tcKVDVYK8C z=JL-{*lwwJDpCNR14=n`70pHifM_qu@Bo-A^w|i2jSPsp0f3Q<_M>VSMrxOTKW%?o zx6>2HGf@})7HjyR+UOz{oc02`n~hM#f;;NaMgHP|DvWBqr5yhK@v}f2OHz=O$c6-n zV6Zfe;my|ka1?LxlY6;@&R)}CVpl(l4Vqa)Z8)GGLGT}m4TLDB*>02Bac_VoVwQ}T zR&8SNjI*vy`e{vuHBH8&O(qYU;>cKc*7U&a!6V+*snrhzT9f0i36C00+6Jz zrly)EMpE?33e0DP=^K`H+Z6&cN5qd~n@T>CqY$6dx}OSuoOQizxCr}Fqgz5o2bMtP zReK&#Am59+^om;bY6u0Y9qDOJ?}D*ch-miJLz}S!#ac?~X6EZlace>!q&Hz7HQl(= zbmU#nv70@ju1(+jd)|fhJs1}mBYhNdL6>x+2NJPjGT6@~RJbb(4T}U#Aj9#9COo!! 
zg((F5B=X+m9qN$(t7NupzS=R3~g_3(H zy`Y;Yr^_VR1JzrO0F>DpNE6?y-=RS<7y_{(!-;5X#{b{g6~maENRSz}a%BOpe*7|y zsUUV|@r}%Pj>Xe3dRXRa`tVzpzS?(R!!%kWaV!_Zy7bbU056-y%*JO+Simm_dl)%^ zurH|9{scbc)C$x1BbKx!Oe%iZxMU2J(aXRdV;D9g;46S{GKPrJGecI>kl){Z+3EZ$ z|CmenCX30WFz{(t(r&}a{lVFBxuK>0cs?Zit%-?8kkt~D9M880gqQGLHo;sV)e0C@GP5V zolMafSTsiF3~O2!CiNDxob31fT26)IHc6g%E?z-(IKC3spmb9($p`0JYH2ik^wW0|5*7^Z>P(( z8PWLrhu7XqO~6Op(x*QL?3&_D5P`Lp$E5KgV=~93@fr`816LbCnT-<1!L!&^wmRt} z0(j1ks}PKxKwYDs{2GX`3OYFZC0MNieVOlq^sI^Mid6BsRmZw-^t#%|CX&zkN5KuP zXH7Z+T>8&8l8e^-dNxezR&~|+%;bL>)@)D&f0`8kd|dz2VELzIF|So|lO}PYLrbUut#G55`wacm{`K5v=N%CF41i<{{~2g96~BwRQTIE*uBrC&FS_aP_8tO9`i2b={CnjPW{kCu zZCqaeShCwGFii#SDJJh~9bDe)Sl(?e<~{!1>t@ZA+QT`YRlgH+>;4h@k23ZHkIgO# z?1#E3h#VK&dUoJ(wx!RsITHQvRmQ>Y39TT+dQz5E&!kS+thf$CWt`HB5@a>$u)J{M~e5Y&JpLPavs_b69J z23}~DI(A;te*Vj4cTK=evL$79K2v=`D z;I)R#ek`yozM6h&<9jd2{7vqG%i7T#RnWB-mC=8wjF))DFH}{;kK{^bmbT7nzLd?o z^u=rXT9uCJN#&@GyDJm#uYK-(mwkV=|CxEZ-RR#cW8Q79`LK-vx72orpV?J1S3PCb zK}cJd)4k$HqX<@_! z%62yb-S%M3d0)xiM>?j{4(^cn5GGxY$@rr(#`P5|YSXmObcwTKb&srWoO{qBC#NHJnXyYw$C#em+*1H}q<4%heYW?y!l#;?q+o^Oh4IoI)a~keH;-A(7i2i@D-y z+GQp0GoMP8+5D&#lt6sH(B(9e<~-lhFdTAoH8b2n8t2bkDTPaMiv51#8)sW5tB$of z?8$UkdMD6!<<&wIZ7M`beEi1qgNI(uW8Ry5C50XW=e`Td4vT^T1R{WPyFueI5FqTD zOL!%tIg)vT)OG)w5WtEK1ayrz`hnS3=wR-S)ky#lm#>cntj*GVdyZhoDS)J3gV#JR z0ZEP_o)l21HvrAp-~)3Rvom+|qr@j65M~O%xR12KMOXD0r+@)8`oZ>=S(hdYr${&%deotZAqB8-GKVcocz{=!A zA-v^EVdgUnceN#fj!4)paga`2^S|vjrt%kNNt~fF0&up%^(WHYfY66ioaF45t}-$S zlOKKCW*x}=NxTgAlo-HR$v!%DO;2^c^fprqsZsk{1kw=@{2Hgh>hgf0GEN7*+BGMz z>P17M#tEEGBpvw@!2EXi9cyDwz(5joJy{3}X=#xfIoN7u0+FnVq^o#E(hKIbXrJ~C zU*$kDIEVFkU>>$}E0O&LD?~sqTJDl?#F2|u@*kUxbKFQdyek|`A;GLS zs(T0{bg?txWVGCQat{9%OlQZ*80PN)1iXj;AKvc#AIkp^{QYbg``C9g_NB2C*=Dno zY(+?8i|kvd#0+B>LS;*+Y{^oREzKC&3z1~YPK9Jk!kl@3zPEFJI6s^p&wnu2+-`GS z*X=pa*W*4S&uvP3R3^EqQ99FoNV$q<2Z;MluOUH7qF(p!mP=KglUB7j6RTXQcf!cr z@OOY0pZmn&rLW2C(WTtLh!>*t^Os0TBGgqH%&1ewTwgWx({`B+&{u4n1OVJ|CV^je zq*(D?0?^=4yvsCzz@xr1MmWz0c&D0bd!h{)a1PDU5v;w^UimsOB8!EOYuT0Gp9?Ne zklY5JF*Y>4Jm@UV5cB@Kg9;WX_VNCU*ChYYyCN#BERNWV9ew&wUZ;mm00EkZ#(#ot z84FC7Hw;>ll!R9^AYStV?@GJ;0&NK_DT{*fTHbgmnjl-kKfc!}l zv29B*<0L3b06-Ac8zfvwi3hRfm&8uv)7huWg&EE25rV2{vqQj*yr+F&XN5F{!tS@* zfwC<@PA=h27vCN4*Pr7C_GpL_0F>9kLHB*T@AuCSDOQC-SA-itIZC_T>98?>>RUzmBI4`4KG+V6joQQdPb6sSEP(+@THzI#1xx?0>+SGUNx#0Qji-GIX&JICK6Id?IHV z8PE)Ecceg3$7-3WgPu0Y&UXEqnnW`MK_Vb3eQbB;jA3EHm4+$FUb2{R?7ikb2dPFU zAO*0tnBiDN6>2Z>yyFtM@w{eA`b~46@~7+ESFSD#e7EXmfTB0-F39(muSUK(Z}cM| zeN9B>sZWC&kXfL3Cs_mlU_2`n#2h$1y<71nhRh!PI|VyI-oXI%OHm7$@(J2u=AX)+ zcS12PAKq^pd|g6aPHS}{%lFh9a7Et>$871$w3 zH|#L79jhTG>q#lq0h9+Zz{6Q9F-bI`D3%HpSTDU2bq~biN@jRhmB5p6hfxkP{hWCO zP-o0hkP_$m(?!mruYRsTgLfVE7md#20eWE*Y?&FOipScvnx|nEg34(zl z|E5-(Ub9*lp+)552gbJcp|5O&&R|)FYuALOyH8ns&{$V-0_{XCHZ;kR@awLH24MLN zH-{))W&E{ud{eoet}GGNzul{3$3MLT6qHJHyGHkxzOD z&YESPFhn3(xpcr#UKnuTL=wzmgZ>@x0(hEBcvFT-qhA&||5LjnyDxw#h2SvFFh0$| zY}5%&0Wg<}a;FeFgyNwv0Xbg+qy@0VQwyD!<y%$ zSgQX$%^(fNz&!ts3GnE-IssAx{f?DQN2p@;6a}0G7!^Am34#nEgtlEG-DdnzlwO-_ zs5GQsKc*9-@c3JTRZJ<%tA_;y_YF%m2SLd$|QR9)s1u=^vlDFh)XT$RxLO0L7djJHtk z>XY}z-R+%?ZD}o)pEc-za2av0SL^DxgKZRshyzmvW!EA@+Klf%D{LyM|(=vZ{k;~CeT-cAcTaH1!(?n zFBusCvz5ZL&BG^?WH->Ygm&WCyOpxXgatEDX?$58rMZZDC0iYwjmJwp+POzxn zIDhMj1ua7(-f6ESBZ6=hMqowvupSzvHN1`RqP{Gq3ylQ2(0H{0M+>J5jo^P6y3j~4 zz#4eH8f56ReDlu=P$d|ysG0^C`X)Sdj=Kv#^nB}P*{>MDz$+-&FX3wfi46vb?|-2S z1u%t}wwDjmk;Z>s$dL^Z7#R*MRm;f7$$k=*@G7@Y$AWc`0CMNz^CVt8e7PefV$gVY z=EK_VdA>(F^AS2eP&G4rDYELrl9lzc>yTdvBjYXW{U=cE?zE73ld)C=oW7u~2mn_x z4%9*;fRCjkl8h|MsOlH)*qML5#emUhI2RytiDwArDTgeQv_cTf3Z;r>hEI<#&|)$S 
z=5_%JJVJWidP2yDAq{6nx*^xUGAhEb)d?~_erF$RGX48r=T9$594Cr&k=i{~zMKO~ zbpOXQ9_!nt&41u~J1*uwnl$}Co^eD4G-(Tb=)D~_wqlUaT}E}nFO!_ zWV%1rXuvUK6GXyF)no|C89l?&08j{l#SFkUpU@13>scBBLjX`Sl4na9BZ2@2`T#+BN{RU5N;tfKTl$G0>J2Zv~~imY|M#cDw*$8}wmo zSL5my||8Z2!4G{}|&&MC$fH3A!6&q#6#3=q6O?i>yy?La!fzO5OEd{*Bg zA-<*qi~yS|0T5CKomB5{>qkDZzJG`6Xhw)D8?Uso&0)V{qwzIqI>`~nko^ogcK*tT zLq-=^;-f|!eDck^n~B6%436#mU)odr8q8_b-Pu;1BuC#zL(vSbw;75xX1O|`KB{xA zlMIH^z=W@iRxnaab;?x&1fyy&r{&gKo$yvsfA}r*9^aMS*D{~PzR{OD_6?G@6F*Oc z5SpbF_qCFD46YuZJlcD4b)ppXY5AKUVtk5RfCne(}q z@l14;agH1M0M8yPP`94TdWwhj@JeIi*_GX0YI4sryYsm%VEgZ$RdW~I5|jSR!C&Vt zQo&tANEDoNpTltJwYnoX7MU^JT=VSWh8|Uz96>G**&vVYDqb-Y4+WC~-2x(SqzCe& z$1uW!yTn7~fxykG+!{Qaki!3`%BcU*(_qfiaLW^Q>WSv?(yD-rtuLw=dPPetwmW&5 z1bLapdYNTr)kq4!08 zBEPTq1q8rCGckyc;va46bSFb8x^EO%0ie9jaT~(4^ z)V!IoYLjEb>N(ot@nL?^zs@7@BU@0?wNUZ3@)X9eeHnfSX9FtOK0?=m&gMq)LvgY` zx9WUKRM*A`84(rQRZyM?ey9(}>)4XjR^LoQvk%GIw;(&&;w!9na_tt?hlW+~B{p0O z&Ry>#_$JQzW?oyDByw<}LFt+67MDHJ_=)#A*K-*8(nowVW51IgFx9DEZ_8fG-1P-g zSd^{5-=tpW?19}}y`C@b7v|;@80nLy@goQQgJl0B1MgR!>31vAuax8a3zxO)Nk1y; z7HgZYSD*UbzU22}tA*GM9n)D)J@Ye7$G;)S|Mu3}YQ9gMl*f3;_pB%WX$}5w=6+gj z`Q$w?Os&g%H_<_^^X)X;2sO#E%rwm6xL!8V*290jyCh&>t69~APpM;LfUYvmsYx~W z3~vRDiU)pD4jeNK9Jdbq>=rl?6!?Xn^3M$XS`s)_7dYJ+IP)=Zb}o=Mw-xy9G;p4S zp0){ER1R7)3|h7hT5$_n4GLO|4O-6(`d$+Bqb}%YXVAvSpv}3Ut*xM6r$O5s!N0|W zca(#74TJZrgZJHn4}yaK#0DQ`2LCMyKB^1;*BN~LG5BOI_;f3nb{Y)e3<1JJKq?_% zR0!lk2*cG7#^4a9xDaSo2=n6*miiFZcOh)UA?)8mIDUn2(n4UIpbwhl;_z^sdpmm-6VvJYIoML5($g0ub<1O7+> z61=Pa1Z^jWn(j+YLgHGIZRl5em5q1{SmPI|{+pmQUx zmpKHG9Fv`z@h$q14pYboQ+^7Qhf~t+zQcrXyV2YGPV!(`LB_ajND68{MlMMTL9(j) zW82GY&y<+2!&LZ7v z*8wS|Fxzk6Dw%}nZtsy)z#dghx+CCu;Y zWnaoENj)XL8y@Wt2aX~zU(o>>R3$nLlZt)mLS?dj@CgkyA9ZD$sg(0i89d2i{?Bcj zTXBC;9A18x z^d$=vdw2(%eLNqPY|?TwME+E2J^6MPXz%g4oe`qrUtRd=6RLET+Hd)9Aa z=goI$zEdMQQ;+Fo30l|GElD@0|C>SfNi3t-t*7gwna*#*Hh=7&nQZXr%~A3{Ku$IJ zeR&!9^I+3h^=#JA@uBR1Yc_r)ZzxY7qbtiIYi8Gx#w`7y+o?^+K;1^czS8EIxM6v8 zsAo%|cKFuDUyb#Va?@Ppz#~oH*6{rdYEe{>%$4ArNl5YbB{Kt1s@Tn?tgE%&#~JE3 zJI1e$RoZ6lEp3Xlge+E7+Sdo2=pJlLy$SzSa%F$UyVLa@qfnxalD!pn%B1DEj~zeA^6IZ_c2?C8VIk`e95n2bg9uJ?aZeYMCQ34EmBzqM~70`(CcNU$k$#k zKf!a5;7o?ku|4BWnFp8sT4f$yxkbA^s)bPdZ7N$6<79q5!sb;;enrgx zqcX+{UArk!0AdLYocjBJtBmD|J|(6`3xgLVuDi=Wy=S4T@a%ri%?~1b*V`2C1XoS2 zbD0l$DpWr!yro!Eed|sCt(2-G#oGU=jIOO3)-PIrw4JGcdvtWBfx@DPYKN9YmBC_uX2ihk%5$40rq)-Rm{~(T<<^x#Ryhe%g4BqrX-feNWoov_ zDX-h#4iom~T^6_^A=%`J*=o6^Okog-X^84DSQ$=HRP-89Kg}b)*EvCF)OUh!gpSdzxyTiMT1XyUKSonSN5mh;`Qa~W4!Zs z?&;mi8!}pM*NS(PId$6NSjF>Jz6p}PV3L9kR&{5QG+p1Zmr>zh)BC$pRFCAhk}eZ5 zzZJ04xs@ofrS~gmndZ>*uc#Z>jPB!cuoMd6(_aH#6CF**(rT`~twLs8``gaACjGn9 zrMNzxzXMS(yq|L`#7fxdaGGn!7FBx^Qm%7jHCTJJ681u@fP-N5_Q=h6Mc&d2Q_wyD++s=$0Tj)q}Twjp+ zGcUT7?)u`^L?|SBz981gN6b5W_UglKiC%#(c zt?Xi(X077^i^V4hw%`+GLUN&V%N2aJWCrCF3|jyhEPgznjFIo+%4{I%@wuNzmFBWf z4pp3O|9)PQkXyVa+O20f%Hj+F3s0(=>TNd;C93b`9Qr0}=U~#W>n;|$I)PBFJ{#gOLI>?d}bFFURoY`}8uG zt+UM-r$++_^`4NN*<3EruT!OO;a7UikVjZa>4$A@+qiWTNO01(SB;tT>YseVB|E334MZ?|Y$wRH{a|m%d0ij-g4b~9UdAOI5T`2wCIjhPJ`&*7T{M)$7`kRj)I{>jOJpeJW$N1LRCo^4al z{vL4+m`uR8za|AgBqUo)KM@-c*bi0Q^3O_82DK4mMPCQ)s&_(bTVq0v+sdC9?ol8c(x z5D>IgvNXv_oTm=9qfUo2O8ux*1u1>n!$ z@}1;rOCv+ZYNK6YkF;u2pcfOdI_HLjfNLOxtsCDYnKBzj%3ffQZGh?<*{p>G&}&I4 z5aSKCpLToTCE@Q$h<@%&Asq<+*%mX@RD0GXRWE*#uRUy5rOb&sB?y072=!WXHcLW= z2cu+_-Z1kHJCDA^{C0Fada98jZUdm%Mvj+03gtBFIHMrgVcIY2u^+$gxg@!B3p3H0 z4G5Fo=A|U@38J`ecFanuT!mCJghj1YK4SfunIdj@b5`mo9#UTJM5u zcB%Iz>*-q??tv$+!%+@chy^yan<}f>)yCEh2Yf?o3gKXn(IKTw9+t=|j=KCV9YPm8 zf~qqfuL+AUDh5xS8)BQ7Z(qh)ICL@bleC>MZ`Hcb0=fiRXc%)fIEDm91Q12!^JM9; zA4Jq}hN7K~;hU!-TY#(2=ENM_$+HwmUo4-_oY)XZ$O*>>nZZs{5z+Qs+$0qQNV3C7 
zwmSih1gov%keZ0P7OKchyk15c>JW5`&X(n4k!%U?@DTMDAUCW8m4W5+#hP#8q;1@1 zmEuKgKo}!Qq3tP2(FJ}o9IFH3N+Du5a5^z1y7Ih8C|E|HC~x{4z(V?z<^4sK9N=FqlI(VuLJ&}sg-$gRRiO|= zn=bw#@@h3s(3y(vxOyxG5rPt7Ml>*j3`W>s-`!;t>joW=`jYlY48T&fm93OCQMx5V zx|ArzI68I(3}+lxS|=8{f;X8VMi$a20#N}5<_WNbibDj37BKnv|3(5kU||wM*vLxA zh^Tl>R6$bb0{XOOz~{ZF;vc2?MMW)@(gXF12(I#5uD4VNc~-Nv#adnR7lmqKCgHWfDNV260~lbAiKlET*2 zTQX4VKui94^$c7Ze-B)KUS$3$s#~54qiW@`Nj6wxHo!`3_Gj3urBZ-5Eok3<{7tYs z`xwRc1YJeYbzRI20$(yoK>5aFU&~<7k7o;>q4Lo))d@-_pt*BKsECJ{mIO@BBRy4$ z{zkWi2~pwD=YYLi1QxIC)g?jz<5el=^zULc$Fe#6?ul2Upuo9MILv_}-%){j8qbWx z<}rf<#Qn6&vxM~(sB?|=M zyLPsUfX-j2uAt9S6w8FGG2PG3IfJ!Uzsay9F!!ab$~=>cnA14|$F9mb%am($n`!Jr8lM90~mE@$F|tsVTJ5(xV!EUPVkkqP^|8 za&dyf;h=>D+t=BuphK$IwXD0!RPx*cTsAR_8={oK(~U*iDPdH2Nw7VaP{naOMLGE` zoC65661v*MLVCc_EjdG+z5gH5IDL_F)&^VPM$&Tt@#OnUIe;VqUT0xg?nC)>4v1iP zlJX%)C!aJ3@Wbc_bXy=%7m=8l5)=Zw@PQzb(Ip??Hcm$xm!+O@K*lYUeqR2rZ6j$i zq!@9OpxQ*(D^PTDQxJk0DFQI81o4Q9&8dMO*FM*vx4RbpQjr_z(A6K)vYVgOmEJ#C z8}#css}DrPl!nD!SO5HsxVrP34k4iL@$QE!;Rn5mPIMLtYc=GYS@jo4ER; zIed{J&ryhVu5PNjF9-D)p1%k5;IFJnzt4;zAi4ip<0{*KN`WYVdZr4MQVYDlNJ4e8$62%6E!&L01PSs z_cF5gjGSL~h(Ie_8 zqXaIcJ^@ISt)e;|B`BY@n{9l8vH=A|zCin`GSyd1=4={w0Q3~czj{1#NF)4hej(fO z;=bZqY+DURcV*Q7^@I6qffJQz`hZ>Er^KF6j*;Rw=*5c}v;s>+hcO7ZW|g)n~m>CR##g?eciV~r8yF%XGkOxJ74ipx~i6oRZxBMSe_{)jr6MwTR0>u(U4$va35h5j@&YhJbf zB$=%Wpua(+Q>Pkz)QI(ikvz zX<*j?_netN4F`h7VHAX!k)UPwpfT>}<_i(9s?kU&4)$wPhGKxcAt7XBcsPdx*cl@C zhpn%jRm@2U>aMaX%&1?oylrB1IX&_BXE6Wz0=iV0KAe{bd|!?cSXcKp zzl(k5XRILu`MXcuKR4+nWrP~na;)XX=uBQr|7;jSl#;)S?W6vy1FjqUp9`W0H%RHE z(Vn2`aH=6jGbTa6L)k#EM7IiuI&}4kT|FBqr5Qxxl#J(Z;9!Zxi}7!$s#ncl$u<4Z;C>1+*jB1>Q&4%hqK$qQ(n(OwuY{J$ zneHu{I${7Uq0m`VhU`V#7*W zl>qWSDKSCTqDF`+%Rk$Xs6!LAXw}wa{!&9#AQOKn%82;G`5Zk>kEK(Mlis+Yc=mZP zB83oiZEMagl)o~bcPRn+@b6tZ)!5f(cvxIm2`aLWH)n0-kEvGy)NfDV0IYWtcR!*@ zAj#tM8@Kv$IorzC+Uov+9StD#uac09DVRc^r_Wxm%2^Ztfc_F;S#ROQAz%NZyY^i?1MANPNrQ6Zlj;$@~*IHtm zdE<^V>dZBEb&vqIiq?Lg5< zMxTzJaDBSU_}*V%&*``~UB55uxc?tmLlrzw)X`sgJlG!7o1owC88Z}hJn$x__glw@ z;f|q)`Xhg0MjT^4eu?S(cl=T6?MQIPC-})oR?O&b$48ZuaXo`Exswr*lfGBf0gZ%- zO8p`2lL_sUFO~-5uk_+Ak*5PtLe+0;ml++Nug|Pbzjl{d74hj_DA$VNY8q zI^9n`{qe^@S2f|MNZhxgxSdMWuIal!(}w?88tO_))Pc+@+@v(lSck^KH zFORQHnW56C@Gai!cF!ye&fYlU^&2CeS$w+G2%??IipXv7c$8Q7Gfm<9BI(9wWWZ+y z_oC2E=Y7A47w+#OSIT6sWC-fhtE64vklcu30C?5wwdx!$@#eB$PfNXU&j+Z$)<5lS zd{45_(&xc1!S0m%>{5)i`Hh}^-=zgAI)pW{MvKmW`_TU9rSFHtb8^;iS{nQYYvpno z#ctE510|lz${p>ROUs1B$EOD8zWkUVlqEbcJbV@Yb@sLkll95(?M;0*PE+gHzk$D< znWJ9#M+JR(<=*>Xb86^*)2GYNpC32WC?a392+P+nip9XXpfbhTUH?aA^cI$XL{}MM z-7sU(|D!U#=e_89@BRN3x&K+s5ISQE@i#d9BR5P_d zXNQ}-t#vN@fMqw>d#^O?nm77P^!u|>Gnq$wSPrhYy!Jx>HY76zbq~|ti$E_@t@Zbh zvmLB&5>iYsoXpZ=PTpD(X{z#X`M$W`n7!}#4Ecy}5|7%@co^a8{+Uml4mUoxPH-wK zzy--mBzYLW*tMoD%F7KK=t;kx4D*0utOHkuyk8*F&dX&Bc7Ku;3mo|2ktoQ>7C$RE zkVZ7W=?E=nro|+R<uHZ-r*Pt;Y59I}((dK=LStNe(;{n5g#*;(k_tBz|A|(S6qd)Yd2q zngMRf_aJ7brOaac<3u*9%kY(`mlqrX=PT)EqYZAUWr5vf2W`<(v~$g`q&!`j17h4gCP0dRxeWT{yD`;Zy{4>_=PmAby zfQp?jlCidQmKQrh6cLnildpaFw!x`J(Dh@eMDzoe`;(^ft*R;Benvtb3t6F)Fxj}i zWSa8$AH;udBQbATTDvQ|!8I%(c6>Rl&8*z|L6zvXpRGJb+Jf`m4z5De7QtewP7LIJ zaMq7eRQ6mouIDNDZq<7ZxF$4UoI8I{s^4M0fhAOwr=XwLRMzwB(D~K!yJAqFLh2ADFgg&pE;Dl^5V|kxS ztjHV5DB9(g@II9`{%&#kmTgvZ-V=Ddg7L+|Q75gXfGmWR9?MaejS~=Vz*kdeO?wH7 zRBU>r3rn`HBtN}8d;KE2dyTy}!$oy#KBr6fYf7qa+4rzR#B%2`j&Iv)nnJanU)@AI z^$H_jD83d16;IiOe#$!ER4;cjpKQNZ##u>Ngoc;?#LL{beky*c(jNHv*$g(*16?L) zX$*a({&}+3^+eb_ zYAVdFCfXMJ>OQpiSN*b<=~$W#vj4#9=3VoqBT={h+Sotn*Rx>s>l?oUH!a<02Ln;> zj^75yPN%y?)p;O>e7D%MM({8Gk_5ZG(+m%O@Lm@sGH=ljrIA#@>Kiz_-BPfi@PeZ` z3#9DJ+aR@!%NmUl4=w6!imY#Bdqll-5>-A`dEn*BAw1pkNM}MVqjG!+e#W8imCVb$ 
zj9P2Nn_aGuKDChtL(2-KOMTiwjX9skmxZOY2JD0UAJ!MIkbX2FSPzVm`6GCCoKu(T zZw3iJ_eFn)a;}=~Ue!9Tbks~d{C-HXv~*ckte~T?D)U|`tJ0c@S@}JK{%)?3hH0&* ziD&s23HZrWa!)ieIC!7L!gPOd;7?p6F$9vmY#%wJ&_Aru%^qQz!{G8c> zFiC&E=|&DXBx(PX`t&RP+g}K`n+^PI`?CCCUui_;ZXJ3+fb;boEoFWm?f#{1-oJe} zY{D{83M+-|t$iAuzjJ@euOhCtMsK#qH*8IspL=|+%0Fl)GuU|L%y8ZC^u$j==95KV zH&s;L8TO0$Epk&y{UXq^bStYbbEoSsT2u8hy&Kv^POILBp17yod~#)zc7MJ%_#`wv z_TRE-V9#@#KN6t07_iU+075+hrueo{ggqCTc-X2VRcBO_$?Rf!j4#9Q;TD>->+fag zky&g}Uw85!zgWHAeRDYO>rOxK+tF~Zqv1-o{emy|e%f;W>to3tIA3nC?$~+saLM;i z1Ltd<5Y+0#myNxS-st!u|1_y}3Y;$~+=^_&d_^^EOW4h%^^9$LeuIc?WOC}XJLt5{ z(nB;O_}}cEuyiRruoU+Q2qP_Uo(BLIE|7cWRSjg7Iu?X`B;4r_=c>%(k6 zKCkDZu9WZA(r6g=uU$QmX%Q$jVk8@aP#oD&&c2BlG0|xN>xZn-(5hfl*dL1!;J5Uz zf)-%y5Qg(M%QiL;?O;Y1)q=DI1`!Je1=xX=<(u^y@u>|80sFJI;#*S%TM|XAVg7g9 zuYZ|{<0E<%`!37y1$fT9Xx##Ia#-$k9=e#(aDeBT@5;7oqzjcvAyv`~tjY0$b6JAxo9s;h zR$&S}g2>!J7RVvbba2Y zFwfC{V*A~I|vI$l$ad}w+Y%3X6C#Z|~ zuU_cTvD_BouvS=w2Cj8 z-iV3sa=8lf$uD`#yX8NBms$x`NL4p3;a14q$**{+VA`$t#7^v{&4m#O4#iNT)%2zu zyEEzB5_Q~XC_4i^cE&^MVu>o!reRfk^U&2zIht^W%D+!4wEwA$Cp+@kvZ<%R6sJ#O z{JCl(V`>87JY@VBf9+0#dYSBV#epyeC!)}$d8Cp?foiT%OT30bxH86GQ|px`ibwO? zbwzc1BePc;m&eq@H=9z};1@0cT{N^j!!>+%HFH0mZM8BAvmcMNX99jNR%T;`y^>?( z(S2gC`tU{TD^2c0~z`;xihj zk`?MM;Y|Ph8SJtzOnfqIWJT@2!hJM(bhO)Zy2}X8Lox5YWovp4aj52v5dW$sW^Ek} z1D)Fp@7UQz&o0YLXksfpuqTz*7#Fd$-!|ty^uLi*ikK<;Aj=W3OyNo)KrMPkTcQzWKMG<30Lu~deus0y)*{Zo30PKqrU8evx^)kO7JXpuU{00w} zMZmkvFgS+CGW7zoWiPIH2$!(I^)kY$ecWnz&+v1E<=3Wz8O`zed9FT;p{B#E zX|?NM{f^gallMYz#06mEh4W3|^?{Rt&6t=4XgOZorkDExSueQDHXav5cP8}Cfv zj>bYIDR6Z@Xbqm1_Fw{TM}hED{P9@9{0aDKb5PbLpEnd&x#g>#EebBu`l6d+U0RmYQ3%*l2s#}QU1X9RG&w_xH!W;Y?y z@F1ZYH00w59HcFa{{r{=0*@rHzmo`+@d`D*@#~{|+p)V4OhWdM$kAXQE|GM}RN(e2 z$J=in-tKLVNT`x7Alt7c(|h#-oQ%S?J zBv?jD*|yw&pJs8Weu0Y$x+5mz9}A{EaY~aqydCE&l!<0l-cR~p#yj!0=O*+<}J$IKuNfG9sHxjsV-e;#9Y#jO8b+a^=~qZ(wx?7)Fv)%fYQnK zYP=cj4L`=0Dta-J0L*TsS2d*JbL1SJR=Jpk9IZp*xj$Jbp*-B>yw7Uc?MB^Z88-^b z`7;5z;8b{wrHTv#yrq;^(?~4#teQ-Bs%gQ)+g~5X2^CnhT57()?35*papd<%FlPb^ zXu0u%55}?9au45Kdr16RST9uc*eSoYdjanu2z#JV(�pz~7)FlT|L#MBsp#V+j=Z zih@Feb46L|P*)vR0(7d_ql9RJp#Ao(22Fi(szdx3&3CPQNC)=F!+r*nmm zu9r%oV}!dKjqBB=mf(u7)5zD}=vegH`>64b^vmmu(gh%qZFE)70w=$&f}oRv z6$KWc_nKN!=u*^@-v03Fw8B}UpfFGrDnaa(;Duqfj)~7{c%B`19)~ji(|P`flopT0 zqGlJlc8S>tJ-H^G7O`%6qvQ41zh`$vI+vy&-fLIrp$m=koDp=P5y8`;3eRCp_FE7? zE5QQ82xO8k*6YYZXUJSJqUCr&tFngKA~WmeS_S>`mD_nA=t3izjXRBtrt8W)^FN}o z1yk@pp^<7@&{-hKJhL#|qr_AC1)hn&r+Sm8bQqqGw_6LM+??sfl(9v)-{+c=r3;O! 
zqM5I5$RGTJS{3z2dAGQ@rroC>xh!Sk9$e z08b)Ee){ZA8NaN4^9jc8i3;l@H4d4J+azDF)V~zfY z$AW{eiwCL4gVPE_K*IZb-@z|MKS-TqeQ{NoI{xri3BDaBZlNA~@Z;mIXrFueXVZ4J z13%Fb5jkgIBaeni3h#^uuzm(O6M4c(-V{%wEX)v#7u|Vi`7WC!QKGQ2(d*^KPsq>3 zPDb5@$19^hy_tD5ME*pX87o-GZ;=>78BG6m8qbX%?`0QlATax3SR>6+8VOULSBBr; z9d?%-K0_B8=h+z_^$T1OAiIxcTp_yLXJ(8Y^8>Qy&%YY}pU{{>WIkNrY(jH3^z?*# zxcJOY$uLe2i>1AJ`w2Mz>3;Q0?|lAI>5Q@XY@o1Nh5c&fuG|Jiyh4a=9Wjarv87<( zJNj!;b8wh}Y@GyMi4(x%tS#T?zgs0Q*7Ov>&CEHY>jMr$$8aL(zr(qmPsi{$>0jMRKc{ zR*!-oQtyBCyWM`(`KV)dnKC;TP83qVv%v>X`*7+`mVWpPLp_pU*|uQKDZQhES}=Uv z_JmNJtxK(H7e6)Jemp1JRz`8%+ZK`etrQm_bhSW~ww#-~^Ll2Qi98m1waERn*q1Vq z{6mxsFj&f%!F&8ILdl*I=9WwaN*mbPDEkcWH|5h2*pZ498wk*_7{p3ZX`;5+QG0YkjLj{`tQyr-qJ|8aW-aa~J)?6Q!GD;UV zNYkwT^f)5(n2SZLX1v1_uI;x@SkBg(5Fy%%UR7L+UsLOpSzxT%bu+CI zKZRXyYSqtG$ChrMupTP5+$ILjjfHvHFTBqX(Mu})T({hxcjkhui_Oy4Bq50~7icL_ z=gkAlBE!4R(sRAF`yi2V-R6L=OZlQgXZ7A}&0?i0PucZ#yoas{ghfJaXOa>vOgwox z4B8^mDru`3r2on4H?0d8FQ8 zKg0+9)XDKke{~YtEu#F<%+r#qFvT@Y4Ch>nXqY1msQ_o!Ma+1g0 zEG1%A-lbDI&1h;C=KT;dcj%Sxk!gOdpu+gOH$L<5{Rf0K*c>Gyq-X((L!IMwd+@Y4qayn}R_QOZTzc?3A{Oj()b zdUZuf+K*IZs^3L9{l)=BNmDw_7=RC9`I+#vV!*$?^Yq8Pr>#fJcLk|#beho=@gL2o zPp27sbz9CfOt{g~H=lG_7v)Ob4-;9}w-x0|SNWnJ`2l{^9ey6U)~soTTqL%;y#6|u z9^7diWEuQ!EE2~nSfL-u!4uaVPNx~0y0cXNj4T8P9sIBieb4`-H@I6}=1F7GNw*ZU z?S=P|oRZ*n2g@$8H)nspSANH~>m^jbkt4V|GV{!^5mE6y;4k6#rg}|Gdbp=jZ$`u? z3x6$*k1rzZnjTW+Z(9dMbYoLccV~DQBySl7Mle}sosaygpmkoua87@fQ3X%C@uB18 z@9>>z_oBN|bGLj*&9lV2l32d#!#jusO(AjKMbgiRy}3+OZ16XF+T(}E_E1A~)ZdPu z+=AWX%dOT$R2uKC)r@rZn?Gt?kT-h|k)xAHuNKV~%T0vmE&tiR!<;auqy|?Vnn>ZdV^gurI%;$MbHK3s`z# z;%+{n6Mr<2zhSfiSEh|3`;5C%Oj)AO3C6nXcE3pZA=A}EPEo1$CCfSr39E#2J{_T| zAKRKfJRF4w?)8dl|JI6A1v(ZiEefcl_V1BKuRcP=H;RJO`F*?KUfIbaB-mXo3kLe? zu({0H1=ec{8{xBPuoeH`A7*?_dE}qC8W+`Q1{|4_K%qVIs80 zl{Q^!|L7E7iQ*UfcspaaVD#zoBJ$=3lA?q9*S#+vIu)Los1li1#vO!o{xBhp7P4uY z`BJoRUk4*^?eUukB^l0zkW4h=ON3^t%uHo%(~c5KPqSC~t6N8ke`sp4C$3*Oz%&(; z6BLWGrw8^HhMoKxqfUzlUL}1Q&qsW&gwE2#BDZo%rj$gdt5}5G@bQHq`!d@wV&v0u zqe2Dm|EY}M+9s}h+^CDVn||qAbBckJvAFv)b2s;}iE2cAiEzSmcg}RD=7HCB8G*>L zODpz;O-meJHp#lj#4Np-1h$J+cF=e8Nq2_g1q!RWkk0C9N&%Fj)A=({AaLrARrAY- zj}k>@wz<#im%Tg{47`5fcCPPRm6zS#Ru{$6U2EsG{JiohY}o|TL>ig#i7!#NYm^qR zw~d7qXi@k9Z#aGO0YZH+6HC`=?hU_(?H}$0MNrjPxmvvVHS*t!)kZP>iWiXUBshiA zSOxhbN(K9s8i-nU85|7&vijejH;uC`&U;0&DlKe@mH*`AAxnu$I^i6Lryb*Fh+U`i zbtuu9oI8DQWG)ImLpe0uWqo5rJx6!O1T#EqAZ6g^It^Kz9cdg8iC&e<-D0lsa}S{I zy!o#~q@By-BM)(6L8_DuXOTr2?N$qu*wnNuAjtd_K_Du6C;eGQZZy73+$iXkO(a_~ zd{C%0WF3u|*~qfhr@;S_cPwWb7-J9^$hN#j-ZU4-a9ZiGIb%~P^P@Eb+Mj~qGY6*9 z%_CWX`K9nA!&lbAscSY%Bt9`hSMh}_j$#o6zUT+Z?AJ;`(b8Yx6VXJdrdopU24$$x z`aPSQ682_FDMQPP{TJc0Q;xlbpOm6s>PO<^UH*A~u@SGap4&#-`rnq_SSPdCux~wA z{t~nU{b6~ecD)dCNp?Ncy!CTnP-zC3^}_7`#om1fHMKno-`^yGB(#JoQbR9NLQ{GR z0aS!gR6vAC69fb)ib@F(dJ9cKQA3d?YN!GNQUW3%BBG)qh9*T+6cj{+H=v&5X}^2! 
zd+$5Xyfe@LnLV?2CM#K4Ykfa;&CXS>HgRWP!LJAMy}Kl3;}53+M`Lfn``!~>10?%U z_2(7m7S6`y4>8{i*|z`N`*`Ov0lZy#*N2g}?9^xD63#Eh545MIInQRxZL4iNxDcPw zJ95JCok7PTMaeMFwo;7;Cr}4eKIXE7`2;%!Ke(Rtr0~|4jcK6)!?ZZ|Pv?0sk4+9X zW-)mOiwTUpz=fO{x)XlW#o&nK&IgTIMDLS=)zPz#5mdR+f1@()2>%chK^3P_KvhR| z-m1?jO+6Nl70L|z7b@e;=C=piqtHcbAFZpD-#_k8?1LUzp(?d}NJ|SHze$8{s*G0< zVZs#>UxseBe9FADHsihV<Q~>E z5JoU9WRw6^WL4R2H+;@AV)=Z;*X51SZ%-LEj&8z?k^X^qq$Q7ad9bgX-SLoh79Y7S zK5~0bF#2rVri5R5W_DMqr^wtZx)@FPPCUH z?&A~KHRJ1~5smNWcPS-0%ea|pC=4IqamV>7svw#=VqZqax_L%nW{5&a=>Cw%!GqCh zABf>0F)9yZ!gJWq4D+2Tjq%`%HIRu;9Oh5%j>@o$t5ESfA;X=qfYF&oEgC@a12C)L zgeJp;^qKg|%HS}&7{c%d;dNVFd=5K7BY^~qIoHisVwX_167L!kRUZ;tXBX>jm)IfZ zZ==i#HWkkx2y}J}HEJa5l?q&g#oW*k7#dDweVXJIpGdV!qMMDQr2+#Xk*zB!Bi+RD zndA!42`el3)CA&#iqa~Xr4o;ujNIfIZx>X(m8W8G27F5Pp2AoXQ}U&mWT4B z;Fy-ORFteX;4#)HhXtl-YZgD6rKHU+96X)xUi9uSnL1^WdYUQPUL+&UR1W*BLB8`I zx$&N5S+td`eW%Zv>^_%S5IBK$v=UV_c4h8G`Z0p`Dt=H+_;NlXKj~~);_-%a{$4`x z+(@6?^Se%;57!h(PdFc(8=dhitLRIPQ-N`|fsR(2x>KS0O9JD@1ga9rOhmJ@1gYBx zUU1KiEq=zIQDE{~~%QwLx)1DNF*mIxHwa&80h z4$mA=IhqlX6TKbBOVxJK;JTr|jR8Q5b-W>Hoy%bz1KZ7X5?SVk?c7#Zhw0GJ}k zEL4E0mjZVuSlHp<1a8Cxl9Aq(UeCTJp<23*W6GtmOhp&3xL$7|@mZ-?WXqV(dBpm;+M^o*7)o(x~aD z@Q}^je~03S8{|e-HMgrc4n#41W;dHoBPXCEt8`CP$zHpu^adrLyBBy9%Ph_o0qk3C zg`f(djr3XfIHiK2GB1$5NnBsC4G}c3N8w>5d9HjU7qLvST}9z)+cyxa$Xdxn+vQe^ zZ~ll63%1`VwfnHO*0%*1_T1bNh&Zqc9mH|@_FnLLQ-|Jr3n5lL0Z{B7Wta~j@(Hjn z1~5}3V%lcM1WwkFMPcH3>O%?PK%&JBZf3!~W%+w3&z`X5Q{cU=79oe=x&CaQ3|MOo z%zFpeZrj2uDc6fh*?iuOr7ton2lN8Z@~ok7^F@eP?T7=1ZZEhpSIi-tdQod6#1}%X z@OA;4PurGph{M+qHE%3$W7L+>dY*JiB>fB;z5PfzkdB~|;Mr)_cVw3LRjj=+RyUmpC}*HN;Dt;*L+sPfoMFB-5!% zW;|+dC7EUa2#jiQ_hhm@8DP#Jz}{mhUaaizD5@_}cZ2#ML>{WfNJcS|dJ(v@Q`yX( zpLwfAV@1z^GhX}x5q-GVyfSEypJP#E(DmzCCym~^;1=_<9=)c*ONWs+5d0pAx2 z{K`DlewFD-&mDqGvyx$A!`CajDb1?si0B5rwjDT?ii(%0c7-vPGYg%4-R%=-x(L}` zat*;S$8z-yq8KIpfs%7czJF4X8g1Br`l0SS6?Uym_|B!&LA2e8LL)2u-C}?Cm#UT7 z2CVT5P}x1uT0)Dpa+l`ArU)yha(^~IgN{l~Ry%;b0}!XzM&FSTG#rzl`oQ*VxFPkS zpk(uG6w{(V!jp8}kHXqPu$HTMDELkB9f6T*z-s5uK8R$b_eb#yXLJ6T2`eq?3U&2^hoPEnK^=lka^2F0ra&Ap7=s3 z(2p#TrH=!0_YpNu0*SXa3ZXH~?0!g=3_@= z_uywSr#&^rGXoPZ?U0BwB#6dd#dikQ{3&R^?6PUCwK-4u{)gQ z4Emg==zSoCaewY)rT-Wr45CI9c#34~G`?XFXm=I`B_N4yw)I}Pep`l9O4`qUlQ z_S8Vpo&JC>bFCQ+tt!CjmSH}b1^*6_0WdWV*mCuv7xlUhJIT~iYIGy#ss8Nl1g2ch z#-fTe(*d@3t*8t4Whp*b;1MV9t8xbem^t413xu6af?Tg*-Hpm&ZHJIsWtB$RXuwP{ z3F5=lrq3{o(XE_%1Opvn@ozCQdTaKndE46DO~89H`4HtHHv)EzGM`dke*AH_?*{WT zKw7c#<&4^7p6i?X{!bdW`d(6A&TZa1# z=Xc2E-DDQXi*6_z_hUFyogwt8Kf4`0I!k7Ghnk7n$nXS;@%T&bNJevjDVsKz!pgLE zn5Xt3CukQU<6ZUp@j)xJ5^(3%r3X`{1Dx=eTzhoB@Ut?(3mz0+(nAQ+JGFYE=FoSj z#I<#yhAWWUcV@8ZPbUI;9)FwWkNN7af>Dyc$}bPkMlv?MpD|Dvdh^ou7D48szw09I zkz(6yglRu!a6QJufD5{_3$y{M6NJHYEET-i^TK|q4Y^aUuWp_TJWJ2(|moK9hzq#x$nE% z4~eSlnOj>;TU^#pU-am|7?v+9j%Mapvr3gXV$Ec3A-}toD7BBDg?G@jhrnxg_T*a? 
[base85-encoded binary patch data; no human-readable content]
zNj^0?zPdQWtmdeu+xz&Tv*zhnqUJa^pYs?rryhLO@yZ;w`GQ(;g!PU?&AT)tqwa8q z8_LyE3GOM#Vxb*5Mn?Oj<%0X`_=|6e2X?JeNioxi87wgQ&P}zQ+1HWp`K;=y?K`iY zd-Owh()MnV+o?INGijM^SeDZ_*Ze?bomr*$)ZD}vmxbEfBQ9+ViNQgg+s_f6e0r}Y z^5E6%r1(HerqPg@oRLep^@Pfxoyd#)Z1ww4u4Ddf68tJ_byFhju(#LpM9)u2(oXY4 zRLVZ)Ne{cIzRl;(h3q+bGYQ!K6()kmW=ltlJ zv5e4Y$xE3a-$e16%kmmsMa3*^t3Cw=Nt^6>IsHi0B_oZ)0P%Kp!a}xx|HFc&75Xls zxrhq~1Qhc2p;LBPwz{6w$+WVX7}XRy`WMvioLm6AurP@UahSKCa=rc zu$pP4PB+xoxg*xGhfN-lFM2dos!BMmGF0SzR-1LBOUxdj197sm6QK#mSszHA5gl@z zZH`!3X?AyWEoHUnTE+a#``fT(P`2v13Bi7yHVXl+Ad5qf#cV!aHxz3zaPM8#rs&Gn>QV{DfVyrbbd& zGK2zs9VdOEmA$~F(I?R<#3-59*)tdtAq%7r@AItF7PIy`mj><%s3Dh!>%@l~Z0%Un z=Y|O~$pG_2>xW3~qr<&6H=`49s`H-o6D%u*Jc2Gl6uO*OH+I@ep94<$ttJJwByS7m zX*+8>O*=_`1QYbfgP6p zXg>l`#cGY}1?>I<#z+3oO|A6wI(XN%XXZ)a zeW(}dl)7C%-_xS%OD_10{eG3| z4wDz3mcH1u-~aJlhna$>_uib^AMjj#*M}!h%idM)KmPXauKAg#_ZOffGE{>I=5M3H z3`jw|r9@VhHaaSVbaJ(ZBggi(a{ek(h*GH|@^BkNjMY1AlZF%KQd@&;sS{se zTjlx?@2K4x&MTg_RjE{YpE+3SEc&jkTAS4;`jmzXQx2;!H1Iihq10uKN_*|r5TElm zGV3%}b|Gs_Og$sY+1oJI`avBvrW=NmcGeQZ;sqchd0)F=5l-{RgZE1K&lr1 znp6ehR%ZT=RMjRbtd4srvXYr0V$TzmlrB&!j5l zds3D99jOZWfmDt9j#LGHAXP)I6Pg!E)p5?Rq$;pPs_v-yds3C_E2&CjlB)HKq-ujZ zlT=No{FPL7ULsX>{(w}?{##Pj;$I|H%h&yuRHfD|ld4MpM5+cXld7q|Csm*Sl2ldv z2U6AMk4RO|e;`$N{J%%4DzWyJGfCA`e<4-(|8J41;LoHg=3gOI@4K=6Tcj%Sdr}qq z-y>B+_y3wyef|TfdX-74=KPja75p8kO85^+)%{DPYVseGst>*+RrUV|NL2zF`Ror! zRRRG*z(I?DL#k5$kW>Y}BUM4rZ=`C|ACRiFB~q34-yl_Kzb93%{voLv4gc8m1F8BL z_ivG^!1tso^*??}}w5>$kgXvvW{Pr4xXsg>!w1d*w+|H|No7&k<#~;^n4UlzA7POH2dc+BIq3QtV|x z(jx9?u?L`xl`>k<@Efr%pvx>usmbRYffNd->}?bZ%M#d>JU-^(Ao_ z0Ki~D&k5;;qo~^tZr|Aq;z6@vno?M}*v=M%QN1W}5)=#qJ~V+iC;$czYU}_+uq+(C z8BL?8md%+@A~G}M;JnjdTEBd`tK3JtycT|4FR#Yp)SSi3z)Txsae zc`#g7_flNEoH-1;_L58O6Wyy#cl0Hqgo(F`wo$=-GFd)sCv=VH0fa>m_@(ac=N9tsW;~<$J5N>TV`j_Wc&g@0 zo+?{TWYVIiy7nm4ec4kr&sErxD}SZHBOPKihRR*?RI@&?>87JhZSxuUd$` z$Gk;P)p*HMZTjM=wio!Qqsdo_=W)dszIduNQs`A4N0Pjzs~Q>FSR#w~lQf|-ixrC|Be$}toM3-Hl$p%X|h3IGN&v#2=_h#;*raB6+&yf>`UfE8dLXXPcv&CDDeOLZ9~|-Z~?X zL9mS^x-bA6>NOjp9tCpPhKN{f00Rfc4pAB3NYzqCBuPswl3`SeRnw?2VUnt)jKLhb zu}0;miHdCjm0Ac`w0@;shFEBC`A{$1=_Opfl(Umux$PywIiN~CqjJlWN>Zt)+P?i8 z3DxQdD1lt1DZ{R-!Fo^w98k|75w>~T)eOj23->Zy7c#0fG89beSpvHluXFnO9vH;{eQtbno6_@K7!%lTWR0F12u@T3*CcvEkIbu~))eZuO z_7RpyG=WUzP^@PZ%`uMoD(u9w9wfkCpt;^Rv9QwFqI%Jcrba6?dtK?nM4iSV;(jd+ z)}~T}bRzN)09|9(MCxVfsBchzS)*RM@@ha`Oc$dl9K92VkAw0=|mO-c3=6>(y1?2Zq{nHsh>!bICykTE@rm1~j+mq; z>Mv#1G;vPnG05a5vJ;p%x5qLX>M7&3qZdvH0&m1OYjo)xJ%ZLb-4={PZtX?skx-hw z$S~7(Qz9n|*Talg9B&uw+OIcs&-ZqJXkv*c!M$)DVX`gM?w+W?nzI;A(~O3V-Hgds zjBe7iAu8wFhNsQZEP)wah~};g8pi+u*5b)|<`pUy->ygN0Mnc232^4T;Wc#6q()yD z4yB2O7n0x|vgkXndL+&WsM&Fv^-E0MYlzY)Unkl&)GJ@-*|yTFmrbi%6OF2EVtL!u zSet;tnDz0U=>v-oTG8MspBh&-_N?w14DMs8K3(;pyT6tbhSup-mg_#-&Ei7iU^|X- z$3aYKY|rfnc^%rFdQiLX^vwg^R^yPIrVf+WgTC|pUZuKgGIYl9<BKrM!Gtb2ET<`ET+>3){BmR!Yxtqda#@}hFZhMWA7@us$yLl8f8*6X zDYx)!B8WbzfT~!ZHQD4irSgoiao3dmvnj2)DKdRZccDo!YpM~n+E9DiSWLj+#I*hb zzxn!U%dF|T=4q`2mQ$Yw&F{P=biEA(Jtsb7*qwdbJW;v(J|jGPMk7kV>*0*s{VAPi z(*$bWNf7Ii(wW2i1^nO49MgW+X#Q5^#M|RKv!@oOt#-XL-9H<&RXFU~QB4@bm5$eMjfcq5cfW=|(HnO=Jln#5=XW~iu%aE*^U zcYS207-WN*G*M6)D(b9{Kss)neE3I&HwfDLxeZ8xN>pswiKz!-bBy~RAAo|>VO6qn zjEIHT{OSU+3MXE4KjeRQ|DBcG{FD0vD$cqGTIOGCOCE`uvymBov+FadI-B*$-(=y# zPo!#+p6ZZkIuF1VVfX_GUnw0{kp5=5cM7e`q^j}}NDx)GKH+kMd{o-UGx~hC zc{LR&aR-L-cyZnX;nXxdHC>y zmCPz!tKu|aiBy%~Q;oJDq{bcKw%BYh{jz!Uh8r&}Odl&Wmu}MIJ!oa|h&#>F%5srZ z9pg!}vTueLp6ZOzM1GOuTM-FTUe-}NdbRncf4 zr5^42jG3+Uw0(6Qbs&LU(Pn*2WX8_-5of-Q&jwioFd$4&&SsZjqwNgG#xRd2R<{t5 zUh6iClReku>VTo5lzQkX`vde{3}DTjAVj@42!JNI6)A%x$nC~rT&R+JtWI#zs{2LLX)Y*t6UcT0(#_jE~q#iIj+tqal* 
zUr-rRsz$H=z!sW(LE2}ddw4vksCQ2`Ehsj#VPr?L6GC2M_W@F1XF|bhj+-b+*Q7Mp z0#4&AUV3r2cks&8D2K+YB#;gz@1UZu3q*V%^uG-E=_$UI-!QV<`+%zXMeci?bSKvX zuNynZRIq7&$#4#8$zJu-+x!o8cvMI3Z7Zzb5gJi&3jo*zdK{FV979^ z4E`~XXH8fg+JHTIWh)w}^Le+Xz1=KiLrRh<>0S9cK`@XITf+{CRybjbvP(ddSy$(j zUc6_rG8dq+E5d*t9Wc;y-B$i~B^ky{Y)x;dMpdbB!P0PEYMm68G_Eu3c;d={*cvDw zHQ&{4E2(wL4qlfgBLvXUCet+KczF;s;5t9yd`*1%By=eHxQ6zswR zQp8PD7bc?irgVa&2-Nz^&R||zQo1TZ&nng8f=sdFy8Y*??;A^iMAK>@D{wI!GWa8M zFB32fC|Op`@#qj51d_u%kUMV9g#EU0_zG6TaqvKo(jDuWq&iYW0DJLhXOjLXuc4F$ z3$Tu$t8!xF9Vi+L^-mWTw5`&7ZEo(F9&02apGId>r4ajHvMAnQCm+}>ab6BhmS*rU ztk+m2<$K)Fu~pP$7`sI9vJdF97Hr9Sui(btI}FV@7-+i_a@irJM!9uwi&&gq1ZLQc zwaomX`|Z=9J{z)zY`}YDhK)Z|~mrF7JmR)^<=4vn~U`4Z! zwC9M52Bn@5fxnF0jnU=*U`_SUNV3a}t^*r#v8BbV6@Y`8n?XrITK$q^s`LVI&$dz~$RyKqPRQNT&spH$Lt9`@sbl z4W}b|_jBaAV4Ck*nkfSJP!urppbz$)Y`1 zR{>xGC`@VKFt8S1h@H^xT?faQu;L+zr!Wvjg>1S*!T+E!=W#9oe*Qe!s&}YxSBeCJ~tCLQ2{j!833WGXibn(K~(b zQ04K^y#_ZMnxm|)6qa5|RI-roK5t7v?|EmuAKr1#^GKfUnjYtYEy zdQIABnjcH=={V&2nu~?0=@tAGhX*NVfOVg$P-}W=x&68jt~eUI|3rvb{R+4?DSsn2 znhcNo&=)?T!Z>|=I$kwk_##~wgakC~L@B_v-O*s?i|F`@HK02;YFHq&O$#zKKx`op zl&A%&v%>@ZkNEC$v<>XgxC(e54}KdJ7Fqn%e`R~;HNQ(|;*wX{j|cAxZ&`2;|=G_CoJh ziUTSlWWe)%Agd9X-A}V52-s5s$Pj>?$*gZI$#CKwv0l{-OFnBrf)%~Kzoj%F5STm) zoQ`F)#sTx;*tJ*KKE`hdrzVOigf4h%TH^(vB!HS52oQlz^(b1WrkF-akF=^^vuLg~ zZdx}BFv{ZU5bJB!L30C@9;DEQ99Z9ZO{z1*Y4%zsH~D<|DF{G@*od(g z#cezQEP#zJ&(`tx$&#+P_bRd8uEBKaQ(N*XZCuir1 zZnPuseJ=uLyOu4j10s-%h2oJ3VPNiSZ*LVbRZah7A+SF=%^M7|DZK2+__#9p6CDa= zBX5uZ?#Hu1DfdLIB1PwP>=8v+i6Uv9V9sF3Dv7jp7&4ri#ZF;0!vT^fwaVhlBW(bA znXBH-u51L(m0Y0_sOFu=_GY8g4WOiC zeR!BgWEgNFPcH+R~y%5vipkh@sFS_DWb=G z$qvl088(?6t6w2Qc9nRrCqXNc1ZG7iLWZHTirO$XvWktEj6WGhxV})6u9R*^&Zt)l zhmb>RrRBkOZ7b@E2EW z%WS>5+hNj!Nj361qPO!_D3VJ{)@P4F-O+*L@(mGG<*gO<5ltqIGEIC_^#LBN4?l`Y z#?`lIm_D-F;_eo*u(x5YB5Y(Vf&^4Gnq!5EOfN_QER>L9iEHV?H^bdRG+HR7Qf7xN zj9M&Csd8VJmJl-Ku4B*5LV@{>uw=YKR%C_QtHH*Wbg^WN!s+Jn>;v^)?4h}vw;B;H zdJ@3gFE^Tl*Vs=P33M^;wN%&oYJ!6x*<*KF=wR-Et1vcTp^b+CRwAg}lZBjCv121M znd6z2WEB$O(RA%Xarvj_8Im(m=Qb044$K zMgRw)rdB^Ig9n&;8SI3muwsGf7LDWVt619Bp>~qz;mtgQ{H!&ATB#XOW2V{PsD;JG zGt{hmHCR*=w^e{M?gAXDojszbNgqWJ<*_^{?iU4 z&$)*VI*gg}4_|e7)!ox!(LMI5XGKH=SD(YA?5@oqj2N)%?WWbrmwBd+?V8Q!ZpX&H z&D_-i!n_RI^{IQ8m`;Q>br+C$6}^{xUY5w>ipKVlJJ^Vwzodtz^k+xZ1g zB7#*=*%9>a1`#z+tTXDJZXjZ|@gE_^reG0!Y#jN#{+Lt^>Dx3O#UbX09KJ5m z=?7BvcqII|6Hb)xh|Qv>%BSkQplJ4uR8?8>R8@~VZ)AF^Y0jEGUr5#YfxwypyupAv zYoM-~%O)nN>N=?7FsSF{VjSX(qddcv4T_l%aXN?r^f^Oy$Fsx4T6Evm9-{d+ghiE$ zwZra&FxHIdC^H9Tf@`3uL6nu;^QyEcTi4wVub*EcJ;(RC1db2x7K)Gw+#N_AG~6(> za~phs;JWLK>s~qbBW^rQPt_(>h1kQ*sBtBAx>k-4?G@eQb7x4@7>OJQG3QO)+<4fu z5Z?2yhtKqehwbqxd*OC5G|lftK00j8yvl*p*i>}iL1ZCW^ci^KMNrkAkfG;0UiaFw z7A=IjMugod44>gR;pKKZaVYXkbd++*j-J7Yg2L$6FMNC4&NuF{oEeVg({N2?dq{RA z9~}t~8#!OLcbBs3!0eusEyO13i&Im3!vXgMId`10E7E58dgqXV@o1OxsH3v`Er;FM z_fFj#UifZvkM?punCPB0@4j2zkuaf?&FYb><-uo5w72re^ZGwh^&yG#843A@gjyh> zQQll)-k9~?+}hqeTfKSzCrH(2ZcC(UH1rFp3Oh$Uxn`MEorNrsss|DUSfz$R60;DG zrq876R)Ldp%cLrvNvg6elB!utq-qm*kyK6lOsW!@q-r$lmifuFFb?YLj8XU7o zss^t4o>bk#BvnCDM&FaFFb_QV3#mF2OZ=Wxorx)$*||un&RhUsq?*ukOj31!_yGWR zOd#A(AjIV>sk#TsBvpxDNL5EBsT#&4Ri`e5O_^Ysr0U@%QuXi>shahbRAupS14{|8 zJB%M*B2^D=vzflgBvt*u=VniyKe0@z9+DOKMyhTl9r{M90!&hs#RHhWC;;0C3yoPK zRUL>Yw=R;Zux-p{01@KB#=4CJ^B~R25uwM4Ka#3X7fDr@Ch!+hH8iG(Nvg7Jr%V|U zzav%eeq-w^wKPFW@6dahOs=klmlbhd@s_y?ls!o_( zW0I;%QWfH%OTv61Rg?22mr2!`o4+PiPhdhXNXIOZsuC=Xl6TM${a=!*Pd<~X zVSGHsOj7mQPo!#-RDGzrOsZn(HQY<2YHye2XHwOQ z+K=MhiVb3tst9L3-gktrq^ebvi}a7A>TMY&sml75RK;RUYnY^}DOLBBUe2#b)f$Dy zZ5h+Mv5TZCa*TbD>x)DNU; z=2ucxB668j9k4s|l~mooNUAz~PpY;ken+Z$E4cf9B~^Elmq=ClZ=`DPXHs<^lT_80 
zjsBig72g(S@QqY`!M;qYs&4;6s;2(0kgC4dh1&dBvB#3&yMRf-zadq_IktW#RWs1v zk*X)x>eD&4{Q;?ZQj);+S5mdx&tQpEHDHpeQNJZsZ~Yyq8pR}4iA+-U5tCGvl=A&T zst#`WLaO3Vt^bZxJr=k`s+xZxRa?0`^K{o`E|RKw-$>N~!89hRn!HG=mUEYceI->( zekN6C%elXhs`kE%q^jXpQg!^)S5o!yGO3!}FZctgs{f5t&5-<=RDJk`RMlH1RRg~# zRqel!s$=2GZ1nF*)kMFqq-xnWQnee;BvsSDlB$V|q^j;$QdQ$KsT#;6RmFcIRRtsW zERm{)i=^t-&!j4WNvirXN!2ie??_eB5~*6X@++x2yhy5AERw2QzmTfJOQh<MN<*5C1EvTKkz)75_}C3Yt!mP{e@IL$s|>Gd?r;{csB0PyZ>Y%JPj=4gN~1?p*biRPADtsz!fAs-hN2)oTKVOj7l< zca823q$+8NRQ3Hrs+ug5s`{#JpwFbL%Mz&?ptne>(tajYK{75nKai^9OQfnklT_7L zKm==1mq}F}W;t+?R4rO2RjIm*q^iI&sjBx2Qq^^dRMlN3RmqE_>d7TiH52@qRQ1*6 za{(=qswaO(suCATRhMN_)oF=T)s>@&^XRX{;XypW5~+$`CRK@_NmUmnsfz!OR0S=O zs-ZI9ld8T;r0NbPscN=Js@g1(s!WjJI9QuNmqaX+s`}rNs(Sno2Yx~%9~QK>6|`Pr zkyH&;>IznJTP9Tvmq=BeA4t`cOj1?n8>u?TBvn~tzLKiGOj1=(mBu7h^}i!k0UqD) zNmYGC{UuTr3sSLJB31VWDg^|sAnPrWsvwLTlT>Yzbz3G?L7z$0!DUj_Ua}Pbg;d2W z4F=IS{Y0u(FiF*u^2?-Z#uBOOvP`Pdr2ayxX8a+k>iQk2Ic6REo93#lr>Q2m}%MFf8)RV$ZC)u7*ysxJQmsT%NwRPEB0hz9+R zRMq)Ls)Bw=s&=akej!y?ERm{*{(w~7wnVCe|AthBNK?yI7fIE?Z=~wuucWFwPJ=G{ z3sQAdHi04eJ5sgc3#mH%pOdNtx=j~&J^5!+HCTR;RMnC6{fSgP{)JQpzL2UeKa;BZ zi=-;;Pe|2$zamv{F!|ZT*2_-CZ53`60+N2-2Q_zy|dlgp&49xwWn0+UqTzD%m>t^NN1 zsmf6JA0buoOQh-vDIAkjJ%Kr|{V$QKp7Kkis_TE5RK;$v{TE2paI1QZB$HH?&$VKb zs$2diN!7QxUSCPoI*WgiRHb|&Rf+#ds@ngBR89XCshakURFw~x`UROFYgo1VUntOza>?Xi=^t2B~mr_GpTC%l~gs(f4)qr zy0*yXej`<1OD&SBw)=i0RrmU-eI->-GD+2JCaLOT{1d6_x~*W5R9&6_$D}Gl;(Jn+ zCBeJOQG&Z!3lz#ls(k&=kiiklG?#ZKNiH3=HSGPVUi{Q|`d#0G(w?HhSGR}J-Xyo)hJwc!VZ zM+D+`yM;kqRZpqTwi#+i1c$ruq;U7mT;^1Ci4M;|E+kr^Bh*qyM5D0Ja2fQ|#wN*P zXIlr?9K5c8s34v_!LwS7<(83UiJ8!Kfm097+Bd%#d}xxO!7csTJJF_M>&4>fcuBbY z=3QV6%i)p>*YlDPyHz2Cf>XJ&ZF-xp4vNOL3BN zW93?L^OWecKKt%HO+>b4)SB?@QoM_1c7ar*?7)N!8Ag*POR7>Dr#2+6I zR{78b+po|ZteC=cvwYG;`^X0~4y{_FFmcevumx|36Tov8pC3aAcehHJ=P@>s9W_4| z5va9^7iX$@Th$4#lP#;1Zkai1eOAKyrqAPaM}@I%m5wd4&lQz=n{-hxZn)PE*~KSKcQlYX~rC?tgcFlm!t zCOu4tTrNhZvPzE<;Ky<0B4PNGS|75v?>#wGts|-s1q%!lD_Mil94-&16F5}FR|=d( zoU*R#+^vNPE)<~Q{4{cHoTwLW5}!21-M1XFOen5%pGRD|H{hyzX?vp+YxLFTddm&H zG5QOA8xk2NFJ2=`>Q1B*DW8IekKf*ilH}uD?@xhBw6ZAI^ESxlVNa_EtgJRALZ$P1 zQ3hB?bSz$9OUgb{mE5u7IFPSxlL_4(yx@=7*$Z8jGZTNj{vf9ESfJV*2xTneba#F@ zRD#GQA)D7Gwv|t0WH3~_-hX35^lF$3^@14Yi1OLC*UoN*vQh?}FFJYlQHwSR1^XNp z(jF-oEvYp1^13FZF^N>UtG5Llxnfr9Lu%orSDs$`#SFS%dKKQ+b9EbI6YhMYn4R)| zV*B2>m8Ex@ecftEp|<+q?1Gt4JJYLAy(NMlca4Wi&yJqmTCet8dJjZXOA@OjKx%t^W-~`oAjUtUFD&QSzZd88`!pog*;9wKH}ye8sF3@ z%r)RvQs+G%zjK46P|tR>)&qQ^}0mrA|zdOn@O>g^T3?WqNkdN73fN0Pi~yTXTkKd zo^C$-L<+s+snS>YQ}%( zbrwBUQuKXaD$Rc!CXZ!1MJ))6Ui4JY;MgyW*Ts#erw*_O5;&?e(f;iEfR|cFE$oFX z`~X4t5S8P`07nc7u^|%vcub8p4tr<%x@RxY$H9e1B^6qUZYfLMZIa zJc5ZSpT3@$B5B(04Em!Zmu!>RnxqyhPvyzJEUm3nK05LuQHUtc9~H>ihK&sw6iuUu zuB|Aw;N2Cd5BxlkDYxh>ZqvmS+CiH#G`QN(< z?DgIl#U&7%eRiRo>8UctTE{zbNP@>E_+q$L+=x-TEzg!^-WQfs5#b@63V-A~u${+T zWU6nw%V}jTG#ipy#79r;3KpBiskrS8vEy1{$ewcs5ScvHI6k-mKO%P69Lqkr5iuEC z!NteogS|)CD2x&3wh(kCimy}H$Rt&tMhncFlsEPwVz1{lN{O#NCw`T&N`A6VA0uLf zQ8Sz5owy?ml}7p#pAcD!{aYkYVAcjRN!3Y74?S)v&q;|TOf|k`-8GEV6(39*w(k7o zsGZOG@iYY!3;BzI5CTT}qtPO!YC)H#E64=a-Y&pwd@w2F(XzVwoOHvabl;>{Uw!(M zI=+QRUhm5-Lq+nBZijk1k^_vt2 zuaR=|ePdNZot?GVRfBWW4{|x zo?+m)AwRdG&*C7^i;K& z-vb;cZ9saN270%0(-a~64MgLsQ)Y$cTb-@SYF-O0@DOab5T<;$zE`XdiZ&{qB2)$v z;Q0l77ldNHH8&cI%}2bbs7CqYITiP4j?*z#41p>dZ03=enj^b}}_^%q~f5Wo;==SK`-50g(oMOSCz@Q z!1nFTdV2{QS&+xO@fNRY@!M6o;}$9-Yqt;rrX7`>sZJjPk;6>3v{liE4gvZwGr8Oj zSpJIu+Eod5Tb#d~E|v^J``{IaA3MA|z-mB4w$X@z)+V2H;41_9)%FZjCAF^g#3(e1 z#a%GF`q)-)+ZR&xN!^3DF++}(P6tA0pkg<)BO!@v=PDSIE|xdVdXd5{{6uUs2m9PojJ`wX$-osn4s}N+}yvk;i9-K94+N%#KKxNkr7P5-ntjBQnrB?5Y6tQzXRl95UY{IUZN`P- 
zXEYKInH+g7jII+qM@Q)gi?!l6+`Z^D+PY8W(h{k9cvJJCGmYMCW#^B#Qea>%$HYMP z{)`bY=rQ2jA{uJu>IWC8|u)V>z^YyCc_j?}lLS)rTx&XpW;;t^sjH0Ud6A zmIIOo2Xrp^FO#a41;%gK54Pur4T0GGDN>6(U})dgdiQYCqQ>iXouR#PzN$J z0x6(MgT&LtS`I@1alhCC&4|NyqSFt!gV;;(@YjJO#SI5H0EIiDQhRcdv-t44S08xSS;iaO0I#wqwV>dcf*i)w*_ zHoWtYZ3|r_RcBS@1&ol98ru#8yckA`KQ)~RM3Qc4+F)S;0Q9($)?^?eCobGRm_M)W z=#9eAQ#MhHq^iy9Fh9eRvAaFnR9uo^1%78{9q%0SF&!7@(9qFp)gfNMXr?}mT40i@ zm!q2f455=DXZ}&DAi^GV}ovt zEP-g=t_|S_8JNv}@v8)`ykL^58|4ZQ+jxl!G9lmnEUq!w3%iX=LMX_xS*46506>X9{xqkr3=^s0*<2gZ6_Pv9QJ1n|+ z-1304NHU-;?SGNscYQyTRQ+&PZKq*^pvn_`yvkltfh5VkSVe+}X9p^lE z-n_o^_Sof{C>KnbpQu00*Fz8Ox-+pW$#T_2WmX-|RH;fR@g! zy`4I(yf^Z%#Cusm^pSfgK)4Y7F^nQR-zXq0hCs^Wgli?>q@FNkj1Qd zD&|iKE%3vvd?*$|-n>_?1e!qGpab`!pT6{7v4hT0Dx@^&Xt{S)(NS{dYGFsr(Xw0> zHh@X0y3;Nd02O+~^AG@lP)a|hVF-4BnqAnKtxEhIWXM_GRxH&c@Y zb_v&=u&Fu_T;-YYbW&XMMW3)SEHyy5hEqdvSR*qZNUfshTe9%R$Db_*>FAv1-T6QS zZ`w2lw7wP>NNf;MSSPwhC+C`(Pc>+r*MA|V#5xcU;!(hAC5Di6g0#`hRy#`;alrZ825?;v3 zXuk5)G!);SYrK9-cNR%wN+BHeyHBIbI1AcP+1=GW_+aenfu~nnO+VFH3In3#vulA0%dP=QfhV4N&vqrQbqsdh zy;01jkG%v)RW;>1S3I;|G_}a_0bbBl{*ADE7q1;X%hp+RKH`kZaZ|P9&+GUGH{ELO z{M1_4#dil(Z>=?&UHE*RfsWLB(c9@_(UcpB02_YK%L@%Jm4w(8jFWC{$sMB)FU0ip zZw(fT-lBEY!IV-qR(WW2A#Jp~`)q+$#g=2hgSj_Z@5a`x;rpO*voVmvYO2ymR#D>G zXT~-fI+?MEl1Rkh#uhw$;0Y z0jEf2=eI&oW33kjx}OE4h`tGsfs;Hxld76dwBGeMek4_^rf*cIHhv~m-Rivc{2|;C z%cLq;bBDt1&!p;OCo}-K+wmARCF^?gv9boPl~3=Da)a*Q^(7=9*d8b$x%| zkJeMozugs}k1ce#_8PstO(Ei00H@mx-IzQ)33j~qZABEK3Fa$F*Rd$Vz2ouyLnhP6?VP>c9DeR|7LvUzJuz+EQ37}#C&h&Hp*A*B|gAiYd_ zcufAeWN*K?_Lf?~0r}6cpHYGXu6LPU*5YCAQWca-)xGG&7UKc_z)rWkaeTRauPC>_ zxl{sC!CbkGGBzOEu}r^vr&TuA>fBXS{I2Pd|G%0n(rHgIsW$q(?CM&NgN7y`l+o30<9gZxh&r6!RG}-Mi?qoOx`^~fuy|#N(K+A(Pxy3~5@TqT+eIDb zmxmGDC2Tr85?_56r~<3X+RExjI-D`M@7j8~ou8t0nA7sur1R6_4eCR06bRW(Td$ae z&G|W)3sb*H+DLF*@O|my``fJJGMOrz$N3F}!sX z5RHnDD}IqjtLP?xxUsd1sY>Rx?v{^Fa;Pc)WBZ~{ocUZSvBP_QY(mkX_;u4wh5M7u z;Ms(ECGM$ZG;76jyqhnbk1q>yjR#^KwiKzB67A(k?(8yobjj}O@2CK`P>ymX(!FNy z>*>UGS2eep2cf3P9TfMbp4IVbTiTM#x8ah~7iZOI!`zqf@m$uvbis@Zd`4TkJbs>O zrIp>>er}a_GuI-Q4ECBPDk|^2W;l^O??wX;$e|ag#fER0!_6 z91~mN0;eLky-L-s>>S!@zY_LT&yMS;r7r>*9QO8D5DGdJvE>)SV2dY(eM+PX5Qp1L zDqI4xx57ux?32I6W53fMaf}*kGoVDnVK-tIN+@TVB-#zn!Olk7ldR>Yu-85J+uw6? z7&Lio8zetr{Dxt8*rz9}kQjwaRX{)NH#@FAQrC+XHGMc>*tQW< zASzCtIWJ#!JzmK84VqsE#VB2qlZ-7Z;O)WIc{VarClo%FfTTFJtp)dXN~qf0=+nE{Q07ge zXzAqnDBl#j&$}1;RTedVJo*yG{siBpvN^i>pjVA5$@!q^uRNn?UmhYG(V&&av=QeS z^pz)B*^g~u&rOghpBVdL)v^RVnZzv;vz zBi+w`FF=PU5)0D;98Q)cXbu|pYHL-6^BMXn=SLpD+s$p^Syaf@^Sv_~<`TjEl}D~+ zW@MB5>4$pm=4B`=eb|n7RmOMYmjNcL7zRgm0grn=5n2gvSri!GB3dyjDx?);dsP>g zuJt8_l11b>;jcg1uSPHYb$XCD=*-gls~440=v9;d_>@%rYdhg+XSz;x3$uxU3(r55 z?sH*0PlSGYmJBy22xopqjTrCSclY)W5qVM7sbW7LRSA;~m zwVv#}sJdrCc`PA+!f2a=e1&J*dl7t03XVSK|KkC2n+`Fz^f+h9bN<>B&XN$WfG2hO zRvXuZ!%&X9#bUN(_j?qG5%ys;oq^tipqiYZqYt5VfNOgW(v<=Z!@|JCP)YGHqFAUd zRgrBO`Q_YwG%4gCQZNL}a@d=H)oeVzVDE z5fWlXjC6;2URd^=Co!2M(TW2g`T0mG8YB;aLr4Bz-<2DWm!lrTo*0!xhMm$?y^%#w zm=}p{2OEV=B1$Jk8O_syRjvfPMZd6i4Y4Ms_eFC(Ai7T>yb@_wb50MjkzQL&HY5l% z&+jt!1bPTCG-2ST0E5IR+!CN&UqZ&E`Q4|+9HL_Ktz$@7s0)cD*%Fe!jI36Pty5vW zeEH4|@pv^+Vy0XChkdtf@tZdP%B!T3^_{3lUTE}|idjQmfXCx=G zw5G8v5))SY;y=R@<^cwt92OE8L?T1ydZ6pe35x9u9*Xfy{r||TU4(d6q8dV4`bSa3(%R_#sR}9(1b6tzw}V zg6GxOEH+K4&`YVuglCez;Ng8x6B+6-^9*myx=n%n!lnjBvl+WXFTm469tzzg&^#o+ zkQ{~Ulb_#w{lDlmFtYD5i3Wj8lA-+~oHUzKW0QiOv{k ziX}^A`;=6T(N4SKlXHIu&cu?-Udl2^$zu5rscP-ZGKkN$ zbPg5p$h~5j&Kr{_VZzb^&lUC%p2N}FFXyfFXQ8)VNXF!!k717RVOhuK8*~I}dE}r? 
zKoL3lhK~v?qiHab1$uc%3!ez{ffo;bSkA>1TpP#@?@zyED;(HYSZ!R0MuAPE3vV1` zb&=BT4hr=YqCFp_p+9CB;)-sX6q^|{8v3x9t`_Nm;scvstfPv2ZDGZ@k_eBIsF;%I z(vsMLlK9ndHsKqs;_Ke5qPL@VzJC>v2;!1>w}ZxD~Tx&>C37m%ifrj z)p?XP#FRCambDC&{Wq!FX;R+p@jpq`!AC42mb97!rFuFkqX)&g)KV2f#Y}0%?5U*s zszOW~JTF=K)ui&f30Q2tVx_clZMA~xQTfB9a(STgr%Bc9YUNf;)sAh|VQJ-FOx0Ft z)sZCWOAO7)K^2V}2_6gPCeUyLV5nWCqz?;!4hcR;LNHW+ldMMLst;{xuO!l$nN;Y7 zzIb%LMl6tJC=qeWsLHI>$fnc(eCUDRs!{TMqfo{w=JZBwwnl|pKymPm_S&1M5E@DD zT4nAxb_VEme4a4A*080ABF_- zK2C2?Z)vcKpt(h@mkO-(R71O}HLdv6%L_C-J8TSWZ*)Iw3bLy+RBH}nZq^QLj!9Pp zdo*f=z>GKwVG2kO9BmjTvOJdiwP!P7jmt-}$=$9=&@{e%tv=Ng-Fnz?@nvgaOY7xB zu7Sgvve@R~*xGkxElX!}jhGqPJOqh8Y%?v5Q)La)hb>}Gt%ho?OWaMphq?Dn-z9jy zn~ZIEJJ{MF@K)ONt(9GyQcK%H%ZaoQ7h4?3`xvLWn2&5A)1T?%-CRWe#4uE1?o&J6 z0kh`6fp7N$xycN5YlmDsjP-nsZD{*g11mr!CcSY3|+X;k(hn z4ok0&ilnxS+}*Zj9kM~~ShF5;X>o0Owy+#p@Upp7J4{!M&jJViD8%4OA$&}6aWiYf zGWIycabxV?Tw&x6^?DP&-fcJ6;bhVXvQ!4}hZ?}7$~8HRV$ zi>K7zkvxXSTc!6f497|>TE2+o*BoCly{7H-o#n&xaSdP9(V-ddxsL|Q=iWCZH6Ph? zG{+4>Zs*kn4KrqrwEqWRl}h8@!y-81M>#V`ZBcZ;jwm924d3H1JVzrTJQNWoZrBE; zMtTg^QOo%dBlmZ7=`15#JX_%@zDgLEZD(+$j%zVZ;5M$?QtT9c5rz&EU2zlp>f>6> zV=K~pKCg0v9H7HD(l%`gSr3@csbwN2{b&^vu`w3Efhn)}SSN#i zcG|xRARb_+>MZGq*r{VGBAZHw!%od%Vb5};(>Az}T$wAs_{A0lZzXv@-ng^+ z{^TEh)tUI|i#e6nn~H${qp#MH>B)tH`YMQQLV$DT=_?p&DB#0uUm|_Kgl)kc{29j7 zb0pfCY!js=<*9U)3sS=i?gm{wAE;rb24j|VpNf&Xmgc@b3~GEz+>YqI_wGw5ul=Hb z@YUuQEE1m=OozXjKhc-dL@*)j^-uBDod3dCDc-#bh*GwtxLG@S%qX%^eo5})l1BT4PA<}jcTSZVX_VPQ3K_p6i{2wmsI?=FD*7*Y3tE1r zp5m)Ur!`J~2#eEmUCc}z1!9haYJ3K-XPUcgB5k)^)~+%bQBG)WeUQ3ly&k|Llgf`b zWL9+So7;y3K!X)ewccdgY^BRaZF~g!qL)Z zFnUuUy(07Xs9{Y4GtvTUVa#w_%2cOM!0n2_JwoU`Jog_gd@PHR-jU8V=X0F5*d8XL zX%MM=#I*-Nm{o@UX;f;_-S5OX$}&-rFQXY5M&#rA?_ ziVWl1WHY>STKU_ry^j2@`}?NcoU0|nW{z~L4_zfyqhv;074B=!@l_OEBpP9?(26Zhjt z`zKHzkf9GW%4ZXL=cuS3QLOP5!!YJU7g)~9KeElPrO6fp>m>b&eeyuroTIT~wsiyf zj%iNAY_8|;gZ{U3*y|h$RV@99jaJyHZA$~=b8Na6yI)2Mhnc?YR2Xa$AW3*8jYOnp z1#K(klvKX=1h`+A$*7UTvaNN}Pk@wLo-kNY*??_hrbgGLql@sxJD*XHLGY|W4!YDFIED%|-P@^f8qWOt ztK?_6Nv2(AiLlFxoFl8jRAd)-lyDmEZPsn>1mTyiHDav+@h#^HgcAe5MovWE^2>NW z`#w`EiqGM|e|56rdT*A);r-1|Z%=~h_%#Fi&1}8N zNP)=T>bn_T9ajdj?b!)OTZ{4lzV5q8F zZj@;A@|p>g0mG9c&f@GBf~!thmt4B*rgD5lbe{)gQI%SdHZPmbU?wmtYcJ`N%5Ui5 zow;m-3;u4YJ|?L2zT(5@EexFmF%Hxq3Kuh5Hd|to(>7)gICVa#Sv!5;_j#4J6}@kJ zbxi8&`rjXRI@uH%2btZ`f~fH6jb46^YU#u1z|ACkr)2b*@%P=4>X-N*XQ2)G2kLVC zziz2RXNJ|%`11J8kNTDp_!f*GtrR)*DBcWCe)+OM&3?N#b^e)Y!=tQts8-N+uUK2q z&)(~CN)OWALgRw?_;t?o7e7xGw%zm8sX6$SaVt$A5l`D(!x}B-Hm@kGZQllS!`<)& z!>!QPntfa0qKqB4$0X0(7j;T8CVw-s3eb7ghohD8cM>+?(qT1e(3bHh=|u!4Oef+Z z1I;2HXS&d`WZ|kU3P}9FPMSZ&23g1Qv0d~(z=Ro6eiw;iv~65f5XMmXZ~>Abmv?|e z=c}LCV76|e5GUlrR98Hoe=fti1Uy6a{xj;Qd^?=cIDr7fXlX}2-W0MzPH^z8h;$NN zX?huxQ2a_5Z6=51PkX}+Mh41bNl(LFg)etJEle#)o{Xlm@YYt|40SInch_Gx9nW?H%@+-sK7t6wy>H! 
z`8qJAQn+>EsmxjSXp8a-uVDUBXaCM4Hks+=k|DQLy3b$mdww~{$})`mRbu;QN*ulK zvu{bOIH&@S_1T7C@rly1F_h<1B=g~V=J|B6@+0V=^4DoGF4|_xNacxKWQ3c^g0IdN zTW`h^rtrQG=f#Bk6Z!nK?ne5mmy(}P6o~1&n_OwW^t^teP%gsV?1t*)S5p&3=epf5 z+-ts^^?RaNi`K(3Le(no%w&n7zQ?81W~(CQ$=4PU9#(~_S4yu=mR{@jxLVhIrSkq{ z*>ze^>u%MnHP0u@Z|Qs5PBveyub-^A6X9w9UG-Y?)MVv@ZcoQw&DY-jo~#O{^}^Gt zS$CY7B1P$YIkUG|_b5+QKaKEm6;ZPpxH?tyqTB1HQj5*-{i!$Ew71>#)ojO}Pt_Ld z-}boDVmnnoRaX^p+v~=On%&2#srrWhpHx-9rTSrJ_(9LztY59SH1}pkf}#F7v>I+Y z93LrB2L5^MZ`};eom#+={skf$?k3khj=kvdFH(Bze&NB#@oeb55`7JiOD{f76dT+t zz4F%MYQx9Ls>plgH#9tLKYW~O=($&U@2#if-p3DZ(EFqa4KHVo+38+``!%U=y>6bH zof(O|Ut6eg+vD2o#}7UC>+9a$_I@xs`vv--v0KC2@5SuglEH)K$+zD38)iTKhTmx!is%(ee( zQq}GMJ*gV?XrfT_Ug@<@tJiuTP1U`-SNY)6+I70X>2A&YH7`D`-!csRIQi~=eZ!{> zB_`Pa1HQ_PLLgWf*;!dQI5;^u&hT<`vGXts@bL%>VkAU_#Kfg!B*l58SQO=?h2*); zD=Da}s_3YzNNDkB>*yKlXqp*ls~Rg_G|^Wu6}WQ2MDEg4yUXSmtu85AC#l(rb=xw# z*k4m~Na@9MdAZp4x$x<`zQVhDI^A$Kx|z{`Q*gjt)ZfF!+%xm`?b}1QrQhGd2KunFW2O*obNq9oBJgp_q|8%t2#U=v=6Yn8$fvY@Zrcqjfh8n&X3DRAM3^jKiCLX z948oBg({7QnqCW235|%}jL@)+RQHLjcozA1BJ!d)u`ZPu{4dS=ugjVg6aFvFI-Owm zBf+VSxf|43;C);i%Urv6OlKU)r=9%m5l*><3b9|m({huytn&+v@dheyn zDb4Et%I?2W)<>D1k(o}9vRoc#`F+ar-_E-6UnuLXyu5;)eDD7$vKIQ)6~2lr3Q8^N zd{sh@dF|i)IxDs`{FGz;Sr(X3{;<5f{M2Gis*HVCnYU0GlT`IMuc|xsG_778pIQ_8 zx~8kMrYP+{B5PW0#3{%6t2U~q?sa;7`l-wMy&-L-A+@sceNNNUy#L}@e>ErPx2B(p ztfv<1n|I^O?{a^?d-<=$N+y$=+bI+6wQKE#zuPm*Ix;IdYyX8lvJA9;)~+f3;?Brv&TTWY3?;*8e70hh{oAX9iCd)`O2-BeP%sPk(jtKmO{61wo*J%ZrP@ z7e9VqT0dHv{=Tw(u=06f_0!(!{NnojpY^XRKfeF@vAnvq^ml7%V`uenXMOwE=Fi_d zN56M||M_+J=l99U$!Ycmb^`o`^B9qG+QVS1lJw|1wi-`l8p+&$Mnd6Mc`J+QjWI-I&N%7v5DB=z|`@is2s2)=*Ot?7j z*hdJAV%Y#=&I$E{@md^{SRJfjyFv#FS}1@)kz>O9VOGL=7F=mw1&Eh4H~+;~M^Qp# zmQnHl!dE#_b@bm77?pPvbgSAD!jUX1wyx4O0uqDL+O0sEKG&rw?4?F{KWD!IQ*&Zu zh7dZBxszuyq)Q$(ssO4wb{nd05w^G~X7DZnbz3{XVDuk+Ro9evnnlfAs2x2PrTSVC zgcNnx(^=7sS2x=gUqB}tv>uc56=m8LYj$uwMS1>`;C{2SqFo312?8#s>sd>t&CGHz z-b`%^_gGpAB`4EIKx)Rvb(r(t!~bviYPh-3_U}3Io?gOjqvBq4EM%%lIdr3JS23Ah z+8vm)5Q_KLu#80>K8s{<+3!4G)JhHVA2UVnKHv&DWR;b>CoM}6k2kw)Sf6Aei#iw2 zqZ`Pd)-gs@-SnvpT3!HT>a})!tHVX<5}5@_?*mLmvJGY>%! 
ztza`O&Xbi`V_+Gk88^hgfU#_qRRQhKyBGlq*Mtm6xId#h_a_uG3ATi%s`x~&vtDc} zkcp@?n&m4TAQa09Hl$Q6E(ZZ}9~D2oRRTC|go6}5)I!2RS4DgtsW4hHAm~bfmju35 z$q({SR}cz8JTr|-RG*^l0D;8q%os7>It&e4e9EA7>aeDw53s26Dh!`)yZmuj=wmz> zrfb<7Y-_0nD0Mi0E2UAnsuFq}pTlov7N$CPP4!kk6@7Uq9~XBe%*`^HJId^v;0+bX zLy9v@HlTty>wO?{uG6e3 z5(zryy1d^^z$z0onCkwgHflWDXA;B`pTs#aX0Z}gEmc6mqyd-r-y(B=C-s}xQ26`p zZ=NkSI$`iWdVRGgnMTyD{k_o6LOM2FRQZ_Gz?$NssjQy2;z~Mue$t8m_o1uSv{8}3 zR)>v{rJ=@oo8IK%ZX%5WNPFNM$`NGb#Pml<+so+fxx=@Cgnvs^9d?bjkgg+uC@~?@Z6`HTk z*_*wu7T06ovc0JlxUu#{QxxUr6d!*K^k(lbT{ZK`wcP!QzA+eq1lqkyL}Yjh#U^(2iP0F#t;K2C~>V>wG#w=xW=_K|ZSxHTx*f zq{yo&3{F$eg`2_It@zfIycMMxuKUu5n~lZ^ zm?T+>w= zKu>(vwl?h2->5Lxm@*GFyUN1^g;3n-FR1(A*SeDEwBttSCcSf={tw1wJqou3gJ@9&1(kc6~OI^z_foSnl1VsOn|0i5sdjt-Y8G6+s$|uXxi5bHSyn z>=(;T_a+HfetxILR{vgb_PgJINVcq-Z~|c{F1B`e~cwS?&wExsz!5s8TKti|O*Nz-NRX;xb*V>go zG8}n0FBx|^t1>m|FaNPQm_Wl)Sax1qgfl!4B7@#TUUn)`?d$*id69$eFyyIDE+(XT zeQnpf1&Nnsz{Sw3G#*NZeSa>ChMHpXBVIvMI;yl4(4CrbH$ox5na62Qwv|y29?pU^ zU)W{e+)I$~_HC}0>po%W>tnj%hn{MlTziFm7wwOco0q>LVl)HQM>uLN1Y;9x!ITh))6@t6L#$1ntw8-pPX3W(q}TmLietqLJW_d zF%qu)UeDsqmQkQ#$BX{H;k1e#R0?_1@haxKbKPZf!8EJmm3;v`J!`On_8FBf(-69} zC-Ef>%yI~f^BA~Ll;{CFyP>+qB3wbEqs^t>yU1++DcR?R*W$S!Khx8tmO$q+B$c7a~q3>|5m!0LDkSYCu_3%#=&~duS~c) z8RT`I-s$%G&b`uw&f{{9P=AHFt435G;|$t+_DW!%?0(bA_|G-e#x2xdRPfd<)O{Tb z+9U$I0Fq%E)4qs8i8FJcL+IuaN9Ryml#ovxp^iz=2px=YGL&C^uK}i%bjfi6=BF5}uPWFk3M7B6%JrDX@+y3WzC5iz#i4DW8t1+>artpjUxVj#1<>2lH_Qvbixf^j_@h zo7iS_Y=@D+>l`f$Acj*Og1d=oCq>X*#I7yf0@n~O=rE6-!7!-AeO%`I>m#0z*G(ld zONXMwo}(x#s8s^oOVrZd1vRO{ywaGEMoE~pe)=QVO@YjCh^3bcH;qt&0ONEYr|hQ5 zw4wyaqn{8u5K>_a`xS$y`k&s+v8P&MVcQbke} zl5nz*JCjxxq;Z4`5%oHSUz3vn5u6RD3WXk2%_vVrF($$wNiaB;27^IG^rM7x=vb?n z1_9>emocYyxJ(|F&qZF!|D_OBVL{m9aRH(nlsY$|`*j+bHZGXRqN@yfWwYh|xK@W- z0>!%`HMAf_{|vXX^Xm3j8TnoQFJF<;E7IgkSPML?XVO*)58DZod<^H5$ALiG;IJ9g zq9uwENCPKjGSYw&C74Z42w;?aR`GODtB}?%=_S^mk%>jQg_S8W!oY!1QB5;WSc~0161x=Mw_^MS-4NrOTv% zW)?vBQ))HmIa6r1$s}l;s>yTNuazqA>QQt`t)}6nAp^-fU}ct5X%!D@zgc`E2L3Kj zoY@1#dNLr{5ejL=scJues!MU^ljMX_=x0%R2U~zvB1~Wvi6jFiJ~WR=i09b65MvtP zv?&gcC@C#Hro7JPD&_E@iN?`rtD>SU)8w~+V-m;-g937p!ifMJN5g?CXT*WD5`lKr z@?*>L-dE*=wlS?(Z6!dOtgNwEgJcVXq;xPOg-f@dI;$Y#E=Br?1V7%3ZrV`Cs6_imr^VW-amO=De6Pn1TY9~tF zvp;~iDoT7XU(^)kGFa#6NvpPoEIR$D__RC^xKbGs7yG82p6-~00Pu*rb`3hSF?_l? z*GvTtFaXXI0Y&AnC1&CSYlKZ364LXd(;F5k^o*2zfK&!@sy-$$3LJp7a*!CH)`wL8 zpbV*kr8x$g^GutI24k3(V#Cw~czr640XjIglrYnH(-Vj_E*D#C0fn?QOVxlVG;(&R zH7pGr19ZkzL-Djxg8(ZY2s&j}w@QzJw_~%<7IANndl_E6kwX|K4$8N*&O2?QZJExS9Z z9dl$R(~K?(zT0-a!(83fQoUzwwBv^|#BII9CXv>Tu`|uG)1A;GzSScaN9Uy96QIs_ zQynCq(S7@*0j?Ljjxe(9dYIAo`J}p6Zk@?<9WJ}hWRGREN~A4Ogucbmy8HCY zs?(=YnK}XUP6E>smfi*raR)lXq&w>#c%Krs%c#L43^Q-vJA^N?Iu5ay)7L7^6?fFJ zoIJ3UXrO?HExc%F)4xa|fbi>5dt4hcGNr=xh?(f5*`ok%r!{cZ95WNP+rG&HAuI zuVC8H?>OcKFVq2c5GBB{HaE;x!70l#f>j@p%`lLS8#(rB|KkOc4Y~lB4{*oRIW91L z@S-dM44dR2crFFx#ME~>d@x0Z*I-a>1Xu!r5tPIbM1T!phJR3D35m$RmctVm)DZ@* zzX8+eM;21)hHxkrhf(X>oNY%PTpB|`K_h>LhH43*EGm=aSGe@I4jKH&G?wYwI+Hk- zUW%!6U`!7FmU4_?FeAW>sjzEQWC9-AL4ajYkwp~RDKcXpX4o~6;US*>7|#?2%p8-a z6PRYg$aGUU=qD`dTIOU9FQ?r`$M6{FGsZ&J3-*UmTn5{HzlYWZ==rD&?F2+*u@Kt7 z9o%d$=mf?e#>2joXphN|Js-%A#19dVX{WHLg}Iq}{BWfC%=-=GNCnampntJ3!^-<{ z?mas2Tq3Ol5IP{<1L|+6CO*@PcgZl5~n)>{-2TCJJ>HfXGeI{ke#~t51NQ? 
zx^3+S{AjJLUES7Q?KYlE>$^*dXsaI`MBiUm_x<(K`zQNcf6J2n{b;g)z54a2k0}3ihB- zZh;VTAjWxs`EpQj=OC+!N8I|+<)L|zprt4BVd72Ou85-`fg`V=?e6TuSl_e4ieeQ0 zzjr%&%G$c_$o`$q{`;}*@7zh--?v|SK79GRpmqH1`my59Upt=T#kS+Zb0egYBV6_J zy4H!D_=QPimkkv@VsUcx<>VNN5`rLj#@t}#cj4@^t^tUv&kfX||KO_&`f;46`0DiN zSHq{YNM+&ITTXh35?7w2jP`voO_RSdTwpW4Xr6KY-sZQN@g<8K?FfX3?ZooM0^?NK z8}$BMEFl4?=LNQtt5?bygy1P5bZb-0x-|(Rc2ny%weA%+IR%7Fuhu(F{)?|hczx<% z7eDjkODRQgeqOR_#i=WrUGC=G^tMZ1k_b3Lm1XNjDJ2q?VRG-TwOiULzUpv>Yra0W z3=w4a@t5axaZ_p0$pBZq+xy#MKNdgD?%kobM)1IJ+_iD_H*oya z?C}1M&_xT_pyzXe;i{)%TFQq_8%%Evk=KvCyFA1S{6<@oW@)9=3bPdK=)Rk3_D zfx30A{lGyu5AK97vVE9v#ejGx1AR=dVaPg@m!THwI! z{*)Jxk3av#S1(G^3hrJs5=yPr)a9((y<~QFazfTn?APvPOI?lfF>}_(5s?mk9nM!S zvVFX9MY^&*HW@SQ>^iA7%p!8l050&2M_OFs%IF1srQg=B&kCa_oPJ10f&ea~d*$@v zFdSWkcrFG6IBJ_RI&wS`0KA9BLP8;YDzVUeH2LkbW#n$@FywG94FN)OyFcQb`fKkx z0e<8Db!_)b0*%v%GcX&Ww{k}2k#UTu%>56yIpll*SVJjMZcXJC2e6-tKYz%y@0xO1 z)DNFrrmhVHaO8Exs1<2MF-yUiZ{ZFyh`^5ObY5Z24|p6FBxNdxt)wa*WOLQ}KkW;$P!m+a6>%jAL!bb#2ZpfhQ^8!r%wQTCl;tuKDA*)`s#sx~grtqP`|QB3 z`EnSX{DkTVFm7EA|4D6ajF!h5Il+b zAZE+HDVDJi+I4BJdhqP}{vz(L@fwdd00^D>nk+s-TZfDBZmWaB zfg-I4H1zFc5L;eHI0-9Y4d75$3tLV#UqD#m8M9k(0ZUvCKR~B^_q8IF=t48;MP!YDX~>J~J=inh(z|;l?gb4MtQT9Pb3q@R zcCh6=V`c=YkJf3{w#uU^6o<3Lgm{X6gV4v{ikVosP#{_tx+Ps2$YW3!p+|^j=qJ!@ z-ZGrnKAUpP1_tKpQ8OBseL3BXbU1fFAc!_lvj7D$a4ce6mW6;Dk8V9wL$H=T0$ZSc zEmyK7%~$vSMN1fa)fi2Uq#hmI2)6A_Z+;SA_p5^8e8`m%`OcixJ0a#utAeQqHk1nq zuQNnO;GBMDRvUr^4|1I3U5lI8RDYH=3qxd^qX9-X7mkahj`Kd+06j4e9ZYzEU?jvk zhbs!Sc166$!vMCduV82nSpl@A zh5;n^j+vV~c@+V9xt3)%ix1r1efi8nXQ6Y}w&68R2_p0N~i)M?$cW3VB`D0-T`XRPwQEOyl2P+ zTD*dQFdlgKQw|d|k3b&%tb~lEx zdWk~CEDv~?abzgk{6}@8+GnX-e=aS?@7!SqXtKTvgJAgS9bq7#_bok0x+RegMvj`Y zA<&d=K)^$-Aompl5c)#OJ01xTySj(qKHt?QysNHNDR0&k+ou(0Ov{J`&b-{&A!SFP zKxw55KATy2Rj&jz@AQ4>yX+sD9ad+mH3(N#S1{;1VjsopRi5J0^vhQnt4erLx;J-| ziD6I(21goqWJyhT_?g}d|2WCZBrZyz5u+L$B6?sd#ieLrHvlY{^P{feZ}c58_&FmI z0V18_XiYpJpfwW-u!7qhoJKehKDL30t!>YF`K_f8P}L~+jh!2h0}E1@9QK1aW?!^g zg|+>F8O6hx`%+oa1VYz~@IN){1T+hw-|a|~hC8i-6(kGvwS~qp0*9YD@}gnl@96d$ z%+CTFA=%uQiJJp8;U&B@sOOvV&DU@*AZ~=*@l)KiWbjSQ&{uWj5iWSE)j#a{oN==o zz>APlSZ2vtr{N8Y(TKF>CrL&Bk|#iLId?HM>=0fE6?$g(JWW8^`Wp6oE+MeL=5%Zb z6{86T@q%|NVLr`3V#hTyp^TM~l$7~;0h+D*8Zc4?D7cnK@)Q9ezbsSC94X0Q|r1+WM$<0K6);h(w-v;g*+mpwTEv10j7<0ey~JHoo$A|2y+5}o-I+O zww1}Ah=6FDN~wmXHVn_57o(T%a|3vTx0wA>J~j>ZI>COfhn!y1dx78&i6b9 zpSIxWUc?lhmV_K|bz-(!lvxSKPHB~o0JjjC*4gtIS83I6EqAGrFyXdR%Dd*WQsTfF zD4Jgi)mPKi#yv;7D-SSu!DLO!BZZ?);*46P8=@iS1c{w9{XI}|C`}2VS57D}C6Z0` zXVjvSYw4x&0H3|afd>I-1b-jRr3(N=5Bfx%`<&T&oQI63JUC|l$TqtW;ebi2JWktH zaOO|;U=V?G&P;hr&7cpczl(j#K!Ez_m%K8Dco0rLm`R^pCOVTdC0t37joJ}BeBgDA zW*H7lk{4HKc0)@RkDDMt`AuD|QLUywgj-r`>)q0H4VL+&eN4%0Lu5Sw(irKs%-3b# zJ?#qfrFOm5&c;yMqZB&bmb2+pvFYE2YUTyal6gm5b^Kk+uQk^EXx7eS&KMu6+T6)z z$%BBzg#>GQ17E(Et$E-1wWaeA*d68IRcx;L#_mffoo_9-nukhZaY<(jF4+3i$RMx(t9VE01jmKh)^V8=GW!o3KX&am6vFLi5=pFl0dGD3G@8EMcslZ?dC} zY0D_gx#EOVuudn#N=P!&$g)OWj4>yKp4{e1Za@am?7jv<{4E+JRK@nQk!}WHLgrxT zsL*Yqu)`(SLXs$Zz1u!d@mYYyEeEC|#E6OjmI0Hkre!t)keFI`mYjVEoMj^nU4n&a zV=UjduO~Hm9Aw^_Z1@swY$j+1YHUOKnL$?Nz>1nQXtN9GwqHVc`$vKg}8lf`f%KLG|_UetVk=IZW=qs=ShmUUn zI(roD^r$wko3d{Jt$ru1&fU~jJE9xO@C#_RO#Jl8shR* z|JeD~-caFVPX&Y2*1-GtNFLg_VrHf8I{P6F2l}zO_YMwIpIFBA9j4=XCT}}@tZ;xy z%}u}mbT2eUqT1o)%imAu=TX8GhXwsok!V(`x#RZ=mJW3C!sE{d3yD+Kj;j@)*=886 z362{#3jX3**8Vzf$siG&G@a9qKQl21f&anYeML3VM*RXOB(zWxdMANE=n#69G|KY1p`+3VW&XYe^`OS;ATsZ`< z{9p0a1+kQ`{J$3xa@j8WfEccZjG8;b#9h|jUG9oI^2(wRlwH8b9T~sK>jO}HRQwIveU^m zLRI?3QRtWmK$iVlXmN_pnXgYamtC)T8C_!c=JE35@rp_SdwH?j>bAN)%&wch>N(dE z^xHKbnq72fb2+wwIqzV(J$`b}hO1TsC+-aBUQThvYVFf}8~3psIK?eSCJZ@uwO 
zj9-pu_z`KdoSc?<&VHrlB15oPp(j3EF_l^#Q*e7LezmLc``vddN%KA#Jid$%dD*?+ zzFO8%^o5wNUKaAbH|5E*umF+p^3e5tQL!2p@B3hGHGt78)NcCdyKkA4AGT$9Th6c2 z-tX}hzp5a=>Uh7JY`@xvesv9gPu}>|kNQ2G^K1D1AHLeir25s^|u*JsITO$O=;r9VkqVO5m_+OGNl=Bd-I`C2+{M)yS$iofwq1W7!{m;TgvzpHM52q%(h0?S=o?-Qu`3mTc@3dbTS)m_0Q%E&VEF zuLP4CJf3ysBA)&0dfQ>7hQ`!I9}|Iia`q zS*%dW`F~Sm$SiJ9R;xxR%#SahoYsI;a;++&Uxi9u5inr^X~ zuIzfZ}&fnZo9hakP{uJw-eczj4dST4gf70_o7N~sOrBQ+}WW855?`Rn!2c}N^gSLIfUf<@aX6IuN!3qXse~uo}d@<1fn7 z>We)Y!dgKtE8B7-g__=z^}ajE(GtUm?e*0i`SDU*Dxa#~uEJ!c^*znNwOz&cHD{~O zo}cc8U-0UxB>cMUkO6b9DikqQc;QLSIoTF0d6#R8zAJGmQNU_a_7fr~hjPdHkNP|% zk&Rz1fWD9V-?Oc^X~>_;lebY4e&%BFEHKGcW58<4(^UJw=Y93~?A?})`@%TaQ7FZs30-E)nm zAIKHDfu|s7&Mo{W+iS@1)29SJm?CXAYDD=<+-IZYA4aXE=ff;{EM${Gx20 zJ!27%MRoTXgqrFRXD1>2fdhF-KJQuUXtyRO}CkNoc3g5tvrYCUxzRdtja}tz1b@I zvzc{DlUa&lpv0EFl&VIErfFKRTsouO6c{Efjy;ApL>%Ngh}#xw4)C`7C|Cs)BLC^l z6FE=b3EIkTjjLOHqjD^r*gjeRaLH&}Q#31#Z%o4dQ2?i?RnP-n+;XF;8P2v|Ji@9} z#=l)h+o(HIjWldx^sBjR|IDe3b75w^7T@p_hyZuD6?NA5S!&cYZW_HMc`ZeU5` z{D`}XWo|lt-3{W$L=viNOgbf0mJ?Ht6KpPJK_y_hZ#Lp3{|2X-jfUi=8&`_`PU|<` zex75tzXa#59k3QWx|Rrui1ZM7Nk&N0Ka?iRxt!(nt67rP z7v)q<2`%vG={Kp#GmV|5NuKR=S6`gAG)g4iwP{>NT5SymG*%SUb|Q7X6>=`T$;&3x zbjwi*sV;T6%<=*Y)F91V&l*t__L-(fGZ?k)aTl2hP8EGoI~FGuP)tiRL#|uesAiqJ zzgPb~^}?XtSw>nvVH%Hg)Zn^4pZ`ecbu$8YjQ;eIEaxQ2(a!1E1D5ivDhtD?G+TR&> zh{@$JT6tdmoXl#p?OPq+h8KITpCop7bJF&`jJQ^u(&cxT=~59~`0D6p;H4LCIV7_i zb=TkapgxyPz1|koUnl!rd>Nc}vG7Ll{y?8oaczq2?xI9ooQfw~*u}HcZX5){xHM|pQn0Dl?!84RfEF(^W!WL{9o&z zUWrkFyl2(qpA3E)ly7r6sz;_iriR5RyTZY#?@acvqotR9&t)`skW# zeKX{1s@5YSvAgb_rLG!;>HF!%Voool)-2T74Y-j0R;=Iu*u9!ofkawY=T1B&#UUYDj?^)B+6kYnTnqS9r@z4*_O zox#nC9FkqdneS!O6{{Ndx%_?nwAmOR>p>LY2xzxe79soog%7=`n}PI-d-K3a)ctMR{5Pnt<_i?a2D?-acQ z!7jB0qHE?My&FiFD{86Za?S2L$1$--bgKb%=mLA}5kfc?1Jat_&dyt|fUO1@p=C9q zEeerRC72$4A(S)Gr@E$W3M*FMtRuW9N%S$%0HwaY6%_;abwM8wtlQweX=wgo_Hb#lt9g zm}z3gYM;bF_vbWL@9D&E%htzm9G`w-qkf`jEc|96PWU1o7-XmRjT(26G$994$dEyY z6BXe={*l4;*GWQ9xb5dFN)7JH+ayuT2cm^wQA@1iCRH7U?+H%%4@@OvJN8Mapguvu zW$|wSt!BVPXOh_aOJ6T7*`(n)8Mf-C>tDGf!)A=wCPPRCGKN1g-AatJFx*)<&Ki>b zIAeH*-Xk)Qzy$~J?c*g2F+y40?CUgkW8x_ptjv_Dt`An1Vyb6}Wr(R&28kWHM4iHJ zi7cR2Dc3ioFX`R$IlAynbaMK()%Q7E`aVP z?xh{xPASQ{$+Pzt6L=RZ=_zjaK2dC>M+44y9sqolM06zpoJBl7wai0FbX)`)-DKt) z`Nm%f(4!>kxRwcCcF7C5eM?LAl3v@`r>Kco||u6JF>4oG^5574%5xY7i}QbI&1&Cs5NCg2nS z-*8!9sLK`T!URe!iB}ib{lyeL^1-GPFE{nc2t_pxCshO_D;gs|DVmD!W7S-6;z>Xp z3ShWN{Qbu@+`Puv*89dG4J!Nj696EX)$MSIS1L@9E_6acA1Ax7oF1(lm@)nQ8uay2 z*(edHUhnZz2Pa8b)+neR`mm&cx^r=;20iYBYxcqIQZI)B!Lsz8e=`c%Z$RQ!KKy4^ zY@n5r?m$sZDKIOEW1MyPhKGFTXNg0+yV7?;!2Gq?VUN0rZ*oa012}_CA{s^1`Nqig zz-l&_y}L12rjohy#aB(g`+1F*TIA&#+Xrav+I(Cs?(^KEI;3G?V&(c`?fYU)lO!}8 z{zXLou?!qkZX2d#s>;q+lc=PxidApxf$EE0xI3wLNaOFBHz`0sOC)R?{BWh!xEk-7 zxANdNAmt1|&$K(1x!1^&ng5%jjAb_*Q;rTN!DOl0zrAn)MA@3}>Pe(xa3<+ZPjyON z&6TsYYNjIxG!4fRu1cyx7H&9QSuhJDlU;*L`q_3!mxxNlbFe0pGajQsc+LXX+oK*) z*;rg9GqgEjB>nN5ZGZBpKrRxbPAEZ-ck>YB@|f#w0+d_7=eEu+KXYf+GsePdKA)Sy z@*zuw%q*~*`b&J|yX= zu7iu1^mg$=+KD|U%AdUweWa;H|zBBHiqjiH3!km$kIOWW~$ez%L6u z!^uxhmsa5BY4@t+VopGY&498YP}ppbvGF+Pfy#Kts<}#5PuuD#&rw0kM3YV5huOV? 
zyLb@(R~g8)?R~BPr0FVh%^dejFrd*Sl3T_n=pz!O^aPHTV2Ho>(`*dqNVW0%1g=shV~GX<6d1G@8pgElj`jgq~quu?I``Hix0NY3ysv>f9E+H&5W0W)z z5JmoXtjMecj&+L2nI`W39{yk;GPr*9Hc>H*3g)J=I{*ZmON_Qd;G|j({q=L)BuN3P z)Oymn7dw-t=*=RZ z=ef4KKrEt{Pc_Nsv76m55+Uj{6H(wGWuKmZ+ z5_hf@;LrR;XcH@3!`KgDxgD^a0^fp0NavOPLySfF+L(qj6w?QI1OCkOH~`ew6Y7UP zUzNcUs=7bE4!xSFD2>1>jpJ!^K2c&CCfzHFv_wu2%{3cyMW|QHd`bPijPf=Orz>0a z8b6_AYHUnL#kD;Y;KJd3v@U-#MHi52p$uH3GtQV( z-a+Zso0f=Mt&X-6?q^k8u7~5Y>XWOw`!6qKN`?h1gF>{zaE%gw&6!pBd#gJ7)K&aS z4{41;{TM*tKKX~K z=>i>OvYz0cLyeuHsDIp6t$3_b%%tkQkK-b~RRjsPF@eXs(~s~%6r6HbXg^;v7YxA7 zf14FiAruyp00Rs(_6f-{)+9fc<=S%C+8=yvcSx9ddZUbsX%GLS=aEISNzTswtl7r- zA0vzHHy0$vH`cp>@t)o8E@Y#l8!`F^F;a82~ud0qZ8j24a zfiW*&Mm1=o7YL&|<(NjgVY9~(BWv&l;;5-2+KAGj75nPR$Bt&wtLFTpmrgPD_vlCM zuVPxmVp`N+b>w&O&fo=AL>od#rZE;KZ5s2D2^6+ObYlV~hKX|@50~g!jh2X^>bdtaCssb`TcZbNZO-blmHG+!t|7 zz12CSY&`s^lVWlFwmNp;r}5xUCtIg+uk$h0qq7fY+#h2+j_4fcj2-@KJhl`&p40iB z|Mf(_@g!&Gq%N?$fK|8T9Mf7jZKir)GwY7l&W7jlZ6U#m>>`olDLqytyCB zebbo(MFbXqCpVrJk(ZJ6UZ# zS*tdIlwu7jjQ+F)cqN{HlW9i}YZ%MCqeYibi{It%f}5ftBPM(DXfXm^3v+8%p1!R| z-|?XDdZ4$&(c8+mc8$>cHt0WEU4J~f_EK*dhTqz~Mc;dYKIlS2n0l(ijoG&``8=## zQi+o~iqo=RsuPVt_Qnc6We}#hyjFSlK#)^cv_hyRf-8>cv^&HGs zbZUte4fA?iY!Kn~Swm4*YxJWPJB(_P*=}oOuYrJI4&0n9|BmSP8k_=v18oc*c<=K$I&d-_F z2>RTk@Xejoj}f0f;yKj6yYE;|yd!yAYxR>_Ok;pueB<*QElI40V{h2yN*r{z=3hZ+ ze|Q)8+-71^32vr~ocl%vst1QTkTJH$yp|l* zn=(Bzw~hG*W$(D~HS&^T`GzK-C8KvF-p$k|3G|#jlP2(xwbWxAnd;l0$x3{>aWU$v; zf;u=CDK2G=G5nsw>we&&>w&x<1ol5Eatc}kEI-$G&7+rLo=S3W_(O~J)^IEJgU*V379$EHK!X)`5suYfjTYYAtWT_TB?swJQD$1C@=^2QSF~k$O5okFWNEJ1TN$=nc{QUJ;56RyfkHJ)BBUrc zNme^7M_EEz*nykN_$H+?s+pK`X*WU70Br4{P+l#G@J^B{Gi%8cXc#QNY_fXp51%;9to-&8~PTmJ;v%$Y8sJi)k-l#6U0 z8Hkc8*V|b03mO%WG`|!v3qrS%C1;NG0DKP5L=&%R1>OcJlMQQe4- z0RLpa9-d<}_t*VL=h(mR(VvvZ=?5sMH!EMBv0cQ>QjT##w)(1p=Z9w*(@l^TrUo5Z zV%ph#yvP)p1r|z$Sas5Pv&SvaE!MZYB1L*+;!!dZ09>ETBD)tVSnMoS)`A_8UDK!- z618sTvD>Wv_c71aDL<~N(B1xh-lpqmBVfdi{vdYGKL3&JXN4HT;OnyOT!S@MRl__Z z=N3)!bK~Vf!J^@>yP^Lmost@e_G%&uMTidpX;7`aU2C2*R$Z0?J`U5`5PBtP!=3y?^H3}43|3p5Iw z3CSf=Pl7e(LGq{8ztCUyTc#)*%(*}L$v*JRdE}2UaM&O7az>BR0E+blYkBsKl7Y`mV$K1VV_)e(hWyI~kpJ}gV+j;6+fp@QcDmD5zp|V+Wuyx+A zp1Cb+X6Nece5#8J^V55Ksg95bA3{+T6JIt{Rom7e=Nt?dIPZbbHETI8ksTT<4#9Yh zjvU^7phQ2^0_O57$HW%FJ-q@&7m}&T5!k4auqz9*6T(3daj?m4-Jrss0LNqf*Lq_H zQ4#UQ59uyldTdMMIlqc4fUc`HAKTyMChwN&9aVboUBCYD;ckuAmya&b4BuDSpAGf- z^5yyJ;fGw6S%YF-2>m27kOX+aV)bp2Bv7rY#ih#O%v3dTt9}2f)X6=H^Jm^=pLUwY zT%n7^0{GrY4Oaq`>lT5Ty)K^F#Qe+>Phc7MltQY?Hq&RGc!$Ui>xh}IK`^|SCg^+h zw({sYphfx1pE%;{_4D|cb zXuZDHxIa`Mu9f_}gRY$D(~rO(q80RC=FGIBamT_Dxmb~nHs+CkJo^atxp&FWcK`7{ z%aJXmiDj2=iIE`L6sqMD9`gnD$@%wH(xwa>t%Hfg^DDhBBiO6=pe?7;9&Ew1Y&uyc zddLjc4t}Li?xdTbK&J>GtHvYSaiG74+Sy>1OXK}6o7L_JRx>hdx*e~{AezFh`;Y@v z8_1?dmN498bwjY7Z)D#ZV#|M`n}h=OtFD#T9%~Nqf*dr-fbC!v*erpwfr(coo1ikYwj{7zQY7?8&N27L4*MV>?xT3cH>;5 zW^Y>NB(cRz)^l|Eyy%qGeORkrg(;EY%|bq3IZNZ?l;zi%$kcz%_m_oVtMSlMgWXO; zz>K%{FfoQu@uLv>@uq{|)g}JZ6N0bPM%=e_yh1?!xu3F#6w4eBH3Fjxsj{09U@gcegyWEoP%?mu4v3wlK43xImV0L4UuHie8V-V z!^g*WCtKtPcir|pyTV%ydrn#WSy_ubS(`~&vrtV8b@Y4Cs0so$2S9q9MSAlg4azu! z>*WH2V4PN&-$FNJngEM)T>6cMB$lDx{w+2pE#L11jFxmiaiLa7$A$IaJAY zYcO``l?w0bg&c(^d`d(<<)S85GY+sFP5Bm=O8fQ18pW1|1UbSo(+lklgrr*-KXDyBo8`Azp<1;4{G!$) zd##dyFo6&)$#DKlhvj-Yo2mxBbuIzHb6Be1FYj**H?)-9Pg44LP1&jQJZM~aExd(c54j_)W*FI!oA+eXWHd1u?b}`3FU^8xTV_X=us0b?N#4+G0?#JWU{jhOx+F5m#L@8%W=;g7u&sM#yblRn zZB!VSG`Ea2isuH`Vd>fSGDuUx**hMVKoXaJkW|Y&wQUXDZH=l4N1sZt19IBCox3pO z*g5Za;fUWUP^VzN?s?>SoiH7pI30Kd>hyu79K<1Vm#QJ(c!hrG2HBfek1aq&ma%Y^ z@pps?_$3I}P@qaT*c*w$Jyci~6`I0?suy&A6~^XJ3Rz=vHM?W^kI1FAAiU@Gb%;8? 
zmc@U6;`~#A3&isTwz~w*DFQ^uEoS}VGao5|EKNlP3HxE@A7+B!^BO6oyvIZfVglG4 zaOI!0pe+UZZCO20`pQ3mht6Hst~=raAN0)yQ3Fhj;V zsPOx@;P=msO#&+6(xOjju{7Y393O%ex;C-4;)1#?K!6E<{YUb3mX9~D3rB=GMF>u*H(P}~_>lA+3(jVeHT;|_cH?vKHnM(&&Y8u4zapO_+j^?a-f(=w;ia@y8yMevcst>r>m0SoLd(;542 zb;g=jn^;aufZ=iu-4VeDjc9GVMG9u;3`uijC)`#Bikb!*MD}Ig zV$B-7nDe(VC-7sII%w(yCvs6xOUe4uGfSc42vPY8@fHHlN~FHYayFZM&e3$9x@bQ4 zUd~t2vSu2iGP{97NX_X_9i|}yr(h**8j%+aEU}z5H}U|w`QAl${fnIR4o+dl#B8y< z#%-D>@9!NAJfW=3%hH~FJ}p%GB2o*%PlxK?MNi+YxOBfd3KnvJ3nhwn05s(h?vDw% zii$0T36R7#jd~)4A*se=VaEwnTflb|5azjG6jzm3JM(Zu^ihmneEy-l-;Tym5K zE4;voz%T7nZv0qa0{+pQ#&R}?J>05#xKmWFe82qZ0dD@UW*^htBOfVQfb8Ron%i#B zf&4O656HTH`Br3_>g?mIhL1Gv8|zD_{a9d!MriT?%4=68Pr|Rr0eQa@csi&8-?1VM zgkott(dqtUw^;_JdjDFqu}?HAk5{XS%7dzt*vC{TBW|kF1PF7&j<#6h_}Kg6K=b^xXslh>*5i(H1ajec)O91grx)(IKY6R9K%( zYaSugii_WWcfU#bDD9(|7FRH5O9E{U3s#e6+MzzV+b{H+$e1dGN~v2P38U_u}Qc#3dpQK6L>)DQqQVumn@- zQ6{FeOSGiVY@e!(=$7w%w@pO+SP)@hE7$F$4IZhNey$m0Djy;~haiL;XER@6PXXP0)P|9t8{N|FH za5tcT^7uws0f)V0e*l%M5K}Wjgmq8^$V85PBBTWlnKpaHyL9@g%h<2vG2ytbBk^$# zt8qF2(@#^Fpiwe+;S;wXOvFT-+T{DRHL0;ql|s(s+O4a1Fp2J#(Yhp27bQ^j(j1#X zoFk%CRyPTrBGkT{`mMO>BF{e8E%E!spuZ1hIabF)N@h4-Q(%-#wbCx`i zDoe{+;2g(ZYff+1HZFtr*dKWRZ1}nG?c9|)&I)n+Ov9^ER_-QzFBgQs*dOg1;`BUe zQBaF$9`WnW-?1*~4XUN^8zyGfhh&iiAUo3xOB_?8an|w4?=DMWFj3_G8=preu71+P zT42eJQVq%a@e}tQBN%F*=^cx_Fr$%wpj3S5%ChLvR?ov=&LZ#4yQWn#Gf zv6tAtR7>%C_zKMu08U^A8~+Eo^1uOkCz+I6{5@AU6d(DhqW3gFi_&u%7%@LU_ah1s z#3^Q+hAQh$<`OrIW!EV0Nf%&?-o&U?^xc*)iauta^k8f#HFZfK_DQ;otNd!|=l1~< zqL>7wFPHJv(9&~naR4Q&-(I5+KIpdBi)#AzU+;hm-oFzd)R@%~?(2W(SlnW!TpWjB ztDmg$biya!f9YE{`5(S|t%IGf@yUzei|_=*OgZjt^IW27HB*A%3;PGEEbBsS?2=ZiA;&Q-gjmjFW**6US~<;gQk%` zUK#SyT2Z-L(}uH77HAqoe#NkM>oGGy6WBwHp-gNEj&!Zz>f(Q5^=02ySkiTqvyrPR zX*GiW9cvB4YR;FtIn>Q)>%7-r?-@Gr@LRBlUGF+M`I%GlE8fxD;mi4aTg}`x9J~~h zoByjUT@4KDNwze$sj4RDDg*{Uc%J6eI^|iXrkQLzioBzppIb*RXtGLu_^`=Jp^-;L z{Bb6^Vnik$RHytb=Vr--DvhUQK;=#$3+%Jd>MPtIi&5CBs5o zZ1c={Ax~FioH3IlOKE(-G-3$!LD<#YIp!@f&k;J#Ugx2=wY@?jWT?8Tr_ea;t=`e= zS-n_Y4}UpXMJkb9pne<=QL2ZUJ6QOS12?H%nCy2}eIY}790C-o47kDV12UQGQ1Ij9 zC$eJjR+P2wukVj?Of392-o_vOw&@?_buK)J|F_5gK4SK(Hqch{sxOes6$-pKIBuqS zynEa4v#t0Q=~Uf1qU?#Hha-0^oax3wZN^)qhe}5w&^J_;r-4eMY|=y*OEdjc+`0Xk zPLnjxfr}rHsmjUHL`}}8sV1;9GyeS;;aJ4db*P1D5(X+4Op-0Zi6#@eg~F2%n)9Zy zShq5^+bt>Q9Je_AMQoZVd$pn&Az46Y#{5%XPd2_Jw5!@v`1 z*5Tz`*NOs~a~ren_a}({s!IMdEe?JuY-4@XUbq&8D}q{vN`i?A2H9KH5P&R*84Jt#OG2IZR8{ z@#MWsWXB6deo`rsvYDA4y4lj)n|IAM^V6f7czst?Z{au~Cepj7YZz zB?bmQ;{38=EiBu5O%0pkM*rfohmgvK+J}7(&%y5Bgtx$hZt_}xT&;_Ey?#6B#u#Tx zSjoNHhvek@mEh{waO{hbi^*O;hUI>=L{LC#n`gcjf4S(%ObHQSb(>hIG8pKFoM3uR zRie_2QTmisoyJr&mwTbibjSW?U*RMrlN;SH_B4(k-S@Bx?e3A_J+|tD8TuI`-EBMklkK0%-ndXIk7OmsWOd6QPwGnB79%ua2JMr)qLLYqS zxv%4s%kX7OM%;cJtCwSXz%Dc(lStMgu)d;bDpEWX>A(}46-Na3tI#~$u0g5Up;Q0oU7N1UbwAP zlKv_GM7QmFegJWkD%3TNb9efkgXXW~{`;M#9)2o&W3mK3Md-FKqF%i9jVP)ywu8C( zJEvgj6J4leyb;YPnZZ0_mmHrNdK$f%u@wL`x+j`(q84&A1`yo%!oI=lG!83vQ@)b@ zWVg)WXF-K!aLXTfg8;Y8(3Ru46HKbW-9WG@<6CMxIvUw{u=g@7k-rH#;8-Zm(3PNUGsq>_0!W!T#<$q1u_Jdx} zIgQ0-KrBQKeQ3N;BKyo$^wB(BZTIm8WZ`IOOE@QXdn4yoh1%Bx=uH1DNoL4vDu-$x z-Q|!H6XYEd-51l5d(o#Ufz>VHfp_eI1yRt_=&03gk;45(gH?^d3@#&r01yO|UVBLJ zl-X&;n#3(ol+a3|`hcy-03pSJtxFmLD}-uagv!;3li)FJFB7o}NqrNED4!&YNMda7 zEk-ix4HWDD@~1bLcswvH@4p4QkXer~oSg|Aju=j64KO3WuH*+qZ#popH2G*e>9k04 zGnv&F11;PHonC>sGA{oBH1nmO%}VN|2GLR>jl?dlsejc24%AElSq7&d9DvhRJd5MZ zHz=G4R5FsxV$=eeATaj_b5Y??9|dThW%}K;w2XnYMGWHrXfvD+Y<6JFyTDdhlg_=x zIs7NJeMLfL9?*wL>tfGxbmx+yFn3&p8j55f2saB+$(P{JyBFEARv<_w$Rdi_w~<(u zMmlj|8(o3iU4aZJK#wRH%vct}4~PPVdqM&FegzsuxYDPP(<%*;qJXJsIS1_G_45E6 zDu-D*yIcW^M}g0>-@XlpD$MhoFj8L$%)vKPK3S4kt?U+8Am8+%n9@`o3g?Xk@@gBI 
zn-}8b0HOQj@t(;ED$P+I%=2Ghm4dP(4Zx>^b9t?xA!!gx6!YeMF2^d*VnQ;a5u&RI z)vwJ5=;!N1dwCuPkNm?E|0(ADS)~IEf^!ahOMSe!+y@^qMVp%pl=cl27jUq! zM3)L)_UL0`5vD$bM4QJBl(kRuFe{a1P{4A2V^4I(p(9hShN(XBO=@HLh{HDza%9-p&*eBSS|Q^w;< zb&s8g9=onRz6!2#Q>yZ?uDa$|<(*OGTUX^jR3%JhzOwniF4}xkp&Fh6KCJ|bs;iC} zs*YQ$j*jLvhS!j+|HD^}gL!ZL559U!tR@v)n|#0KHUnQBs?HIsEi$aVJ5-yQQClQd zTRdA^0j`l^sw-8htE{WTLFvp=P-flWx)d-__zd%ty1J)_br!^pU>5XpQDbD*q8>> zS4}#1?#ny#W>cWhGB33KUx+wAV?BB~ zIq}javw3s7Woov?RkL;ca;r0^LtCwQH@W!l68mv^tIDeuym;#-UkkvdB{;9mL%Ad< zv~?xA%}lvD(x^R{ubnrho$TCp_frnZ`9fd1cplX6r1A38$B=e7XU|97N``fs5Fr$C z`66qFdtUoiNBi~3w#AONw3wE^v#&~yo~InO7k+9FFMeee(|O;ybC0h*%K6n7*$3|p zcmtt4%Ly(+a?lM1bV@!SNQ2nULzW5L8%*F?3zt5@*99?8$BmG$VxCfezLJ8sbRM;Z zzUrK~-`P^!HDdHE$%x*0a-<&i^UY6xyTvH+Gz@E#(-n|3N35`(^EhkSHO@dv%mRxC zYe4IPGiBrG)s#}#7IPOowreZ&m87xc*sHF6qfP}0r0dUaG25$KGpTJ8SLxTlBb1`3 zcQ=7$SB+F?6IhC}ao3-p_0lfth}a&vk(Uy-J?ojT-VWJ;{1G;0g2EB2; zWh32ZCG-+)2kr#W9ab78WMlfGW^1L*ptj~+J=531}Y``AC3&(kL_n#Z;0b+ z9yA#EMQiA7(;F-qwoT@0s29gDj8phqwk^)Ugts&GZ)ZyeJel0Mi98F(Z?C9~7JMGz zCl3AQFB7}m5|ZEcCaeEeiQa1c@SoUGcO=Ue0h(QENhx#5X-^%Yvb~k&0KR22I?odr z%wlq$CoP!83kn%Pjk^YO{9P9xIW}QLn})s^r5gA4l?)NrS%v~8Da1gGH+-5lyh@0{Z=IDa@@!9z&{NwcB zPPp#+vFPbcv80dHCQq~~C;tV99VecuP1l3tkL~}5z5D!XBJB4@KWQYiBp|(p-g}iA zdJzFpI?|g;Z=!-Gp;u`lC_+F~M5GA_h;$GTAs`?iVvs7LR0R>q6L0r@@B7(jpYxn` zUY!^FADCGyvodqdHQ(R&<4$ivyqXp(nvsEcNOo@N?0+nQ{(z+kS)S5;GxPH6`;L3g zpDtZ4Jzd{t#27=0KJ0|f!EP?)J5)b)`1EVsArX0H)R=A2BSpDxge=RKc4L56cOFjcp&msdNra5hVgE=a=Pj4e+RM?;NVBB~sumR~} zkbDn2Y^w%W^M@?j1?_h-SdeJCLtqWR(5Cz$CY@pTWNGWg z=r1NAY)DfFY~M+t@2XkO9#^%P~1!Hwzq;~i^*>3*x~ zQ-R%)+pnO!C84S{s&AG}KQdXj>=f**R-BTJMVuCB8k%q;$C$L9w0%-v#6oB~>eU># zXX7u=;5EoksNk87JLVH>=wDKNCI#J<-+lKqqg(a%y__GfUa@_>)|F7_P18ose${a# zP(R7Zq*pf-x480IZctOF#hWs?bK*luUf4t2sTBA*@iD^FXD8ytw}%#kHs_SCtjh86 zi%y{vW8|Psy0GJcX+WXW_k3eves z;!XAtf-XSh;?@qlQ1#QoFu$4NDaq1zM?b51#+|s|@<)_ojR!s-DL|eFQU8o;mWsLY zCw$9y{IVA9%#o!JEbgY~mPiZx7fa0wC!;o^HIi?JwO@)GldV_~5{xZKhU?RY4?cPc zS`jUvU3ew>gLN;e z?$B&TH6X>0o+WTSZTM`snJ4B(<*ns&hJtjHOeU_&WBmM29+;H~lj>HILnT_#_vW|~ z0xTkDA}ItX`D|PzctO^tD8cfnELJA zPa3Og+Wb~fDcvfX{dr&73N~eXzNTN}($nWFt(R(8o!05JYMRS3V)|PMw9ZWkxFeF5 zOQBv$@alU5kAf_HHi~p$t2e!3Yv+iY@vmd<7eVD~gprup=}&Zq{ht-oIWK)$LMW

DJEWF zr7}e z8SqZdy3!!R-$fBjw)y69AMzp23sG8X>taegRp9A;ie*7FbM5A3)2EpI+QdngW+U0C zu{1a6;$*9Ql2)&{W`U zi9XwSR+6E&ASd%pl8>K|hansu2sSXX8_&7j>n*C%VrCgNo?8O-5jQ$%Zj(Qr_sGyk z(zeCiu4(*MO}vk^&&g9xQ{(y1dwpahTTZ$D7{A>C^_5RJY2nHCw&0baui~v13t!c@ zg`@Gl$`4MSzG(OM&bwY;)y9_7K~Zn-e*7oC`nG7*&`)Es#WJet?Y+%-Kdt4HR#&Iq z7Vr1^QFOkySY7||wuA!p$HFwM32YNYxRJj;>%Z_-LlKQLsdf`(?0x>mDy{#6ueQGH z3yNxdasFG>yB8l}!O?x1t~>ef+E$H%uTH*j{n7O9L z|9ICyfra2;TJB)>=}x$D2!XZDoknfC3zZlWFQVl^XFuJ|-cOzCZSz20p6=nN2~9TA z@?^d}-797snrhqT$@YA@PcAVu-ABud>)mv}T7PI}WSbY?{`4zdny{<{EpI{gnE?~y zu$)_M-lA$VgI3fchX>Sbvi;1EeScVfW1Elc<(b!RG~oq(TE2?6XNLWZ!|zPC`Btkw zpLr8Z-Q!*U|2@7M=? zU$)f4%lBV$>0h%n;(`~IEv1^JW`Xj5+0rXPepI>iFIyTG9)3MMfXbHsHA|^rDOD@I zeD&)89aw4?r|y*So*#&fNGXT-tqd&mi`4x|7xW%C71pQmik^Nh7yUyf0?E6 z55lQtX~M&6{|{zq*3&d9Tl&A7r8$4iQYu*b_}Qyx&z@1i(!z%Pf0?CJu(YV@?$f5B zzieqqO9quK9d4~&Z7rgjrDZS5YhH{~+0yd1yZ_FXR&-R=bjnG4Q@RCjlmfBV2W6!-mcaO52~$TZVJi5SFnyT8X#GP%A1?BT zFcpcwPzlrGF0?v156Y;_aRKs&FojSF)5iO0Vt)zK3vbP({}QHGApc32<{L1yB80^_ z!50?wkyOHzsQQ;M?IYpk1RKZoP&D@h+>xEB%4TpYVZ%|*+H^4TLJ$*PMiLJ5T=6z#*CFAiAB87KZI$E08|0pHxBxD z!ZeCXm=+h|;Z(x(L>l)rGd-hVJCu;5@c;uJ750;-5~j&o77b!#XxKjp(+KHy%pbxu zzbrz6N|^2nQwdW`%=zYj5~j(22-9e3sVrIXW-4Kt99_o97j*952-9zc8}7;pynjIqv7;Tm&Gi?7k{vvh*%4R3$$l*K90a(ey}Lb35S;|6qIU6zb}jW>MV zI_`B_y4^KBZyafm3#}hEkg$@;LW8dgS|h;+IlyKr>`OASL5_WA*}T6#Dk8L6Cg zy{>?_(NYY2>1e5Cle@uaJ(7OoR&8v4(qg(0&q!U9Zr?vhE-%h zw{W)fRmDh1JfR09i4`O!wGJ7jgHo{ZkW?cTsPN6mlr0=Elz#$_XxW|_lbuV|1Ech- z*9E!-zDgy(GRLM<^4$3+2lP-A+x#qZC~R^qM~f#pgDr}DgLf_I#3&JYP&nc!i!o;1 z7e{;bb#mrU7*iJ_rd&U;e06JGD7d33_Urw0u|1T_lHUMb%FNg0;x*XO3bkkz@!r#M6_rL-a*F+6b0z-894?iHYrxzQTz z8y1#9)9_>c8I)SiV|XhDf}04nvYOv657+a=;jme|jJ>x38wUfh8JfpQ?$&XqiPT(e zr%82&3o87RPW&x1vEpE-W1-u9M`L9H3m#@9`6>S2+mFP*H9X>jSfk?E6m=hlIVtuD z$ieDgNDmnlWOOg<1kAR$N!VfvE()-%h@Iw4l?r7|FYSpqu(-ie1^LCXI*1&<=@7Ai zgRh&E7U6Z@a@r=EJ>_tLDavHEj5)mgW#vXWe14NQ-o+vOzHZyL7^N$c*+D$-x0dfr z*a8GA|76`OJ9T`P(!PubB=PYP9?~r0WBihg=uS5Mw|Mm@5OGskx}s7fD^N+e89?T! 
zGe!Z7thlM~CZZZ--NVzjEM^E7q|+nyc2xIU0A_^Xxq&B(l9yW!xE<)Pq#zW5^x|=r zhw(^B)V}09&H@=z`md`f#na)5lC1ird$ahYH|*5_;gveKAWAJ_!S>*_xBZo?RS$hG zzc{9s*ORBwt>Tn&MHzd+ud5m`FmIKPU&>}Q|IUxfxlmp8JD}yjd5D(I_QhP|)2w%8 zqF#qWY0F&a8R5=ZpXh{N849r;BjZ1=A>&;Xs$|_$DhK!0NjCz&bm?r}l?f!7@b%a8 zpSf$cyvH0z2_d^fah)I^!z^=0`v?WO1$Kztf<1e8uT#d&&okni5Q9mZBt#hY<^^7V z{u>%wC${KO52tJ`Z}dKhcZ%-kE3U(=lU%2Oy&jY6Tpc4U$tObDy$!Dgj%5Uerreyv zM_;&;=vTys_j_3#o0Z~9yi3#>H1p?Pn|4VU%9oF=TbD!qobdy_$}-0_R?+xZe>wO#_Mhtg(4}iUq&O#$mGRG@n#Al zZFJk(ofE>x!eFV8f!i?M+TZBxVBw2A*#7b)b0hglrt621Ga>^{kaGm37|Rh}l7{H2 zrQ^sooizitc4^PHi14Tsvon1_tN1)_!0TSG*p145^a19g zWd7N`Rn)d0`q!%)v<~PfPb=8iAIOw;8q>dmW)WxNcEH2OkbpXR3r!%~6I}KFK&JW_ zR5Au?2O#IO2#|5exBDk{?X+!j{4@nDY!qnW<(EY{;_E_PAc9a`Wm-ET)D8!0B_h2s z|A0&tXiuWm%&q?eGOa_a%~AVe*0{P@F;*mGk`YFmXdIH~Y{8ZobdY#jA#p?))og?j zMANevTnnVu!%im|c)%pl$#?|B@H-}mOuKo@<=R?O`gAguawZwMot@T6|;L#%@}>Lt4M7 zz=KX*b8IT-7{u5Y-M;k)GTk)shvfW$OsCi|sOa?h=|7NZG0vdyACRd`K6-)*nab0K z8UKMyXVjU#9%SaCZ+S$CLP6k6b9v_1~JPNI{w zj%KEmG44TR4x70)L(t1a6nHyV+mYF5A9_pFDw+t+R1SuR>s$C}hn{t(TRg2Bkr|Bv zuZ@G&$qEW+ddtt!J15k>*xJLGa!n&>w!9P+$l$FL1|C=RC3+A`<7${f*^wN`dV{}= zw}#Rn!rCV9mI9;_gMLK*y{t47mER76ULMh(h!3Egv4-=WaY%o(!d$)9ZnLK@- zY}1VDyrKbxN#_`yI|7!5-G}0{TxwV$c&u)PU_sUxn2+kU;tCkgAYo%b0_7x{Jc#D9 zE@VCfRRWmSTT)5yH1`ZpLQNRfBGw=gf;j|kC0I`Lz?ZL8bf78!!sbyJ%0TVVi5VX0#HVid@r+<4o2lntqYck z5FuRQu{HD+5yC~JOPT@C$fIWfU#bn(Hpjz?HiPQDvcWrWkNPW#v(z25mb*it{3xo64CALR+mh zOr~yX%X^7Ra>9soGg*H)Qw=Oag36ha03&x$qb=B0z@$v1L^;QXM2BFxXmkOsb-eRJ^^D+G?$*07&!g zUi!nC3PJPA=yT(xTdAC>pjrPCZ30-A%9$$VSLZR)%k1Rm-I29~V?-`{_fP2;l!p(_2EVz2_3oywVZKAWAbQfMuH>X2?FdKz4j zCjHKvRsiQ2Vfk`Er&%3seV1wPT?W?+jy_3H+R1tk>!yOAfV>7;{#3c_DfFdNO}^YP zgDy!7bwTRH} zislnxppUH&=xQqy%vyq+fvmwQxl?W6Gw_a6^~@N6J>(hH++-pG;MyvTGjN97%!mfT zfPO!`i{u=?_ajwZ9me>80+3f6WM5nB zu8>TCdIsh>P#CqM-y7Ks$E3R>Y3jQashp`NQZ=SWy{Jc{tw(FFM|Bvnnp>x&)k|;F zqZ89>TGXp*A)7$SzV}mwV4Fk?|}~-)nN8XF!i_ zk+(xz|Ccj8yHk*mfF3({m{)pFE~&>4cqJ0@N=+K>)z^>H;tQGso#g3{jOmdK@5NF& zU#ZU_^+~;Wp22yA0kvVq@G4r>VMYfugOxk1asu{Z3+lHusJcO$+sW94Io)N=xI(72 z1E7Amp0ui-=I22)3&_>1Kb)z6dIyy=ZHYndV+MPrx$EHVd-peTXJrzGU)ZS)Bl z^prJ2oIB&)Ejn-)eOwlz4})$Bfydz(zHA^`dWT_+@uS*IGsh zXv*{6Y_{H%>V`BmAh+k;gU66?1hDh4p92Zze-wBg&0t*y)+UqG=JcYjpfzQtTnaDL zpL&I^qXI*qXGw^UzZcOX%fGjlS-zo$?!j#3=^q6yna;1=$nKY7>E>!c@&|N(e}??l zfRveY|AN8D)*?gZL22q}S2W|k{4kvI57m_OG*j{jDoK_u;M+=GHG4kPIV&0i=Mhjt zV4A^QVga((Ee%c_p|9g;D)^?wy67veFW>rqN0<&2Zw$3>49{A{a6!m4fgq%l$RGD6ZAd97~=3*F$HZpkI^3TczlE)C+xKyGfum2QE$x8ioT zz@Ik3%yW^lU5Bo`#v4o8N8h$Gr$le;u*q?VbnIxR?+8&)ymF{rb?IH%njO`W9(5Fl zO2;nkhaII&Ds8%_MV-Ch-o<|0y-~WSIl5=i!DC#~qrLUctbOThN3R_}hk5aLHJ9&a z_<5Z+zN;2-c;Dzr4*lW7|3kTE*Ekr~WWnK$edUeXPgL0V3*Hadxr{KU{J2Lo>aXLnqJDyY5vGY3X*Ncyza0|br zul$nNJ!&2O)lkDDoqW|f=97%p+V#*lnU4jg3)0k1BEKVD|0_rz1eR;+$=X{8b11@FEK<2Y<{~m zUFS3UkaGV0kFVLrJ;f_BQH3MO*go*)*t7l6r9pN%=dYw@2SV8`t4EgiPJUTEci&$3 z_`N{T)Cx&?-!KePqmLW^RVTjmbKIly^iZfkORZ1;x#55ds)LNhZ z-arBcd1oV6a&g>cZ|v=%4U3IAZ*IeQhfYM7wWa3wy|>#m-4W+(PN{r9rw#{4+S)ls z-jX?U=2oPg)1}5+O15s3k@l|F-iS{*pYlmabYZ*yY1;Do(6WPM#rAMEFOTNZv^vk1 za;I~YgB~799@?o1msr-zE@3BOKb*}IvH<{|Y8wEi1rry_~= zSiZ>6>ObXtnYa4>+2=XF{Dkp`U;rZy0KuJI_waL?YVb>Ww#S3%DKn8%Jp6!rCkXiQ zfegS@r^&GJW_2tMdVL&%Yf{L<1jC@}7@$NbzmOG>{Ba=;@q>C}1^{u)M@tq{9au?D zts4M7V1Tqhiq(eo;y}ioSF?VZ@jSGD`GuDR(oF$qhnYZVLjV9r1C-AAGwr)ShnZk# zU`rlgxro3x)t@~Nciq?L=+6?V^D50nf^b7Wg<)WayC-)hwX_{90k%Eb?dxTY-5}hJ z76AK|2ZYO|HQ>l3f*>mZVty_tKJk9u8Q$%cTqq7Be!{=gO0pAI!Z9Xhx<|_j z)W&()j`9d&I?w4mHXI7oE*a4UpVfr`)g=7--$(NVUBS{F!STNf7xD@qL|{BxZF z2W|)=3rzaC6e|9#_`9W@>iEy4Y?cOhC=XDpy53btf_AE3I~Zg~;BL7%otI?1N_zJ+ 
[GIT binary patch: base85-encoded literal payload omitted — binary file contents, not human-readable]
z68H-{;VWHmD-cl%B&82MmJHI6fgX1P9e0B~H0f-s?KpO?QzYIT3G@#d<&_z3bh0SWWRhoPy9vqx9pZnwm@@0J4O)|J7fBF$^4q=s&j1Uz zp=jP6qgk4;y0I#*1I|w5>ZciWgIKCSz(rqFKiFv@PCwO<3;LP6yF;U&rkU?4kRj>h z?&}bG-(ba)#B0osrRx>=veSLtr*T>$@$6oo1+7GeFMyH#Mg{@C%IqDD^#rjQFmlFT zLVLmJyz>c)5zCcBsDag)R|(=0IdN&`AP4q7g=b?Lg7KcSY>Kn4K~v&fP34kqaa@aF zMwSclkGLUwxwVM#+8e3R45GYgnn7gQ;|)u-93wRfu>oWFZ3LvTV-%t9ui^%&!y0Nz zeNb%8;*EYFLGK`o`%M0Y=+qS%BzqDwsoFDhS3-^S?cxm1(-gE8iLFO_j@~n%GP)ib z@z~LNOpFcwoHlHQ$gj2_IG$eow^AkZz$Pg4r+6prt*F$ZZ6TN*A-T)Gwz zneheVH2dK%mABCF)DA?R$`J48Q!&49kX~W1as~yxKSVVCC<$H2Et!ZlQeG@;vSq$& zU)A9UBOHf|vH?79;D-jtK;lXOk=xKc6vD!74B~p2b#Xa~#&l`ojd@Ft4T~+k!KzRh zMRT*-shy)Nr3P+Nei-Qrr8r~O1Y;(pN zVC3-Wu}bQdmH!ue_u18ix3-Hup{4*y=)Hq9>Afa|t_Xq!0hJC?q}xD9=)DV4rAn9H zYv{cTh=4={LFuAY$IGOd>TwnGvOoOTVkF|DL?RxwePwrZAFi>iZ)h)yASS^@nFP6 z<-C{va!q}#05+CowCAiuQHWY|TqYL+H=}!>;N&eeG0uG|W@u>@{E9%j*lfC8)li z?IPL!LA_K@$x?cXG~m79zLf-4munR$RCN$63p0r;>XF__zDh}9)RIWa(pc1_d0X^G zv!b=urENw;lOI?2tVjMPif7UC{vRL%Bbi;TCUp6yaD+uBr}h2VVzkhz_O}ugKf6{v zKU+2S@fhmU6SC|d@!99SVr!*28G^KZXc;o<3b{t9mf} zROd%)#!f}2Xi0$ar$0B<=tz%i^_=i!LA8)vW|- zcFQjCy`5#G#8XMbH8`ENypZ)L1M5SXVM4Zsm`k0yOTA5>iY#nr3al)IB3!PDv4?6% zYIeyQ?J#I5y)DB4%TqL{FMkZ3I6LP^wCi5o>S_f3v zLVuPF`q}A*(j#4?nFu^3h3^xtOCF;?Gn4SVWu^Q2R<5v{!iVJVIa) z%tEl;e3z0dvAAIpPy#?8!m_2I8;o@2s>iTQvP-#kgH?kyn5DsDJx3ExzZ}Q}5|EGs z=ANrPif;gcgv{QhEEv*@X3LcwgVac>B^EJU3-?MfIm-6wp}-(%x^Ii<_Ay-VqjyDL ztFD-?#EGY&XsB@%PFTio>>8T20!_l87d!XB!Cdq>?mJq5`((+tduKc=NTc_(j3u>l z-98HA)GEBG1hmvW2rF9E#_AO^&9u^5LkZNreFvl} z9-)k_O67w)IQmqW;{?9-`p%R1Xx`$Q1e!I42w;A*Zwb!E0(suzI@0@C`-lf#x7O1E z)xw6x)PYn16y#thGj53MkdbZ^!+5Mo11g`CbD&if6s>qDCE<|i;6+<@|ZOxGDeMJ1Xhr(lH!fWU6riAE_g!;`Ij75`jLU;BKX%mN{DF12D^N=<1 zaft*%<)``Ht&CAum3UMN39sbMk4qGOfz1&{N^ca%kosim-;j<~D9iv-PQqK}vw(Ck zM!M+~MTMPW31KV*fERn>F3`+FJ(5h`qp!$l`LMJ`i6`X+>LW#pJ8X!Sl+t^NOb4^7Zo%-<7LK+cXq|Wymh72(8t87d4+> zeK>hlA=*|+*H-gcxn4u1PNJ>Kv8}_+mCAUPW^a`$KfeYem6kff zMN3*+v-d^QuU9JYHkq-u>T7K+vu!Qw7p=v9HQz5fq5kc?er!Qvhe-J|DdAJ|6H>8{BNje)8?|`L)TArPZkPOG(U+#O4y8z_u3CO?IXc&%+!N)M?`osfx4@e*@K{IepbSmpHJkqFj?LhRP z*|qSHLhH+s(+&x2;&iH+!=Pk4$lh5-my{rRFp8$6GYJXq_t`YoW?ydy>QKa};b{VT-hjwGGDW%$Jdd&DRMk{p` z8MCKzO-QnpqsNWWvu6Ud_F<;7)0~wmhcT*l&43Tz?1XUa ziZ)}^)36L?VpFlm3O)C>Acd4eeSqON5{kJq%%wb=$k?L)2lOMO11vt!1VqZ|g3^1c6 zJO*Zh2o_>RsSV4$h;I^L$QlskYq6WqXR2)bx@u_JRGBF6KSZo!?0WbQ73hG zc5%oNb+D9^x!0;@`V8q{Yik#8B$fwDG@RpSvaqVn6JXO?y%VPH9mZ2=+23MqneNn* z9;Uw4+CawK>%wEl?5Q4U$E>AD8OHW-k$F8fP*m6M)?ImPTbg-3o{yb$`0C>z>@(0v zrO!k&l0q!?5EF$KU%Np;;9v%>Q{Lh{%L8wHHr}8VSk1M4>)Lvnm$aICil-KKG9!tG z4NQ@T(&h%>;Mdd_^S~UKx9f3fZ>CV;)^;!O)K<;iGzSO4*Ge=4tc-hLkF3I8l~M;G zEN5s!tQ4RQg0v=9T1vEiao3yIi31{@l>u?Gk?loy>r@}Tk$SW3ZPRk2?ZH>3`1CF@v(iCaqv=$vTI|KRQ5Wi?BnbMCQo>1D!v{t-LThc%PDxD zF7=fzP?Lo_OW{d8k7BZDnpxTjrf_mP}pE1E0kB;@X68Irn7mqZZvoyT_rITWT=pr zJrzXx-2X85djL~=spkNXF^2b7PMSC~?ch`R8*29*LB-Xn*z%{o<*Qp^9pNJM3d`kC zf|uGL`!AMY7GyEoU00cIKV}bXi#1sMEskzAx8LN`TI8L=AV$R%vhbr{l1o(X7#G={ zGVvmF*jrmliRvDbhp1@sD*))IqnmcXikusKM5!#qDrtg?I##~%aYNdS7KTR6xp47A_{82J_@p@AvA zt?@is>OP64yIHo-BjaB=XJafI*(c&yaqECT+~jC3*6GN5$N4NPPQu(Mh^A$nNRs7~ z8;@CM)8}Prd8hf$zHZKLGxxsNf9P!bj`gGdz)Jdqj%EAByh*oU89Om4n7J$AzDjVn zPQ}ZWpM#2|+5**R(V^R}wp^L)!HtvbCM&Lv3cX789>?{iubpo6`OHns|7pzZ_;D}r z;teetC*rSG;b+F%+TR=|51)_s>S9|h!LE}(MY`I3XYI305LwM;OO;7tCKY3);uVPl$to!^a{$Wg@W!kSWvQGV+9oY|#@7oUt zq~Z|5XrvL@{7d*zw^5Q+k7wlJ|(YH z6r9-auh9D0ZsnX*NvQ8db6l>#9-kFE2GlUUE1P&M$nJ_H`JiA+AqwY}ORWxZ-ouge zZ+$&a#Ol%{$i*wQvn?_KRJQa3F4wqhSsZjT`GeE~=g?oTj`k1Uh7fcn2|N2F2S$Wf zMS$n~0H7q0+=Flq`vjNN0r0u89{h}>1~Pa6(zHmqrp{7zKkliX`}#w+ZZ7a7daKWE=~rJXozZhH4liTMwW&Zc)99ry}j6+NP%3r>5QEhEh6F 
ze}Y5j@l;jS;Ez-KOVyBGahmT5`##!`!zpTD;yW`*=>Tg+PHM1^B!%OF6!G41!hx!A zw2wx(FnCr6EZhsrC5GqV<)J;!aH+t)ah`h~ULdDB98`tWncT(U<`r@B*1f z+!&r2fddoIv-U}TOer&?uVSo!r%W-2Br=!xGFt#y2!Wm~w+mTH*I5c~v!o-)D1S01 z-v;k;FWMDqeVAtJeagyKz*-#4jzzHR8nL77h~HdR7z)9&&Xj<>KKae*%3cNRFPP`A#faH!v2@S?8O1l= zN}=%z1?!3hrpmYD*_%D#pVpO=O_kH@l=IeALX*Xs>ZDp{mGbCRvD3_VsyQD@!gr2j z&S$s=x@mp#CWK5WGx6Nlv5edWJY4J29Lc%E-QYeDEgtS9+=_^o7JFS%YgtzOS>R=+Yu ztVq_T#OFql%&EBe-yisCt7>(DM0&2^;6(uQDjo-yM)J^sL34aub-X`iQNK~(DLkv< zby*M)%%4bpu}06yM{_HQwUM8}0Y|k=#p4z$@8AQH-&s(NwIVxRW)9eCMBZj+5#&`4FmCS#WAb;PDq@~rF(C(I2%AL5gesr(kwyFJH zC8v+~+ruT8TvH!!{l(E8Ow)1J%N*e@H;>Q7^WgPXY_54sNqMmI?v=)GoCCR>bFnL` z)>ffq?IVZ>htwp`-Bn+c{d5zOaC1K&ip~ht!-K+%X6(c z@M^Ccy#W|LIH0YMQ$0ml7@^naDy)4QzX7Tt^LdaLzvcPgOZ^ykLCL!gHnNPTt_C&Q z~qWbc)SMKCC^9-JJc?mV?}+isLUXY`De_E@m? zi0}2_kXH#$B_{=fg=S=$$Gzp;WDS5^Zq?$3d|azQU(@@(_nCZ4otb_f$#bQB3v!cu z+ADq!$6~PNVSlpd)Z}pk@Z5L7>*d08H>MZf&!4|k_@Yzw%x(Qy;`0{>1=dh^fGDGq z#hsVY3oqmEzj%`-bU*%;d+)0>gr9`mcm)k*3&uxf-TU*xj9&1k0r!$WJggP`GK}&v z$1$>Z!l};}0trl*`;;?dAEl>om}$Js9v&m?6JoRA=P7rpC~(7;G-hj;*ojOc(7Y@1 zVn@80MCPr&eQ+>A?KGHAF+`YITlsBB>~-5&pztr4_Q--C!Z_H`Hst=>Q1pB#;fE3% zIt=~WA9d@(Myg~LKke+5_6>*lhn3)w8OS4-M>E=BR4XE|!t0zAa-mc*p^SCxn;59p z(Ks^}y2{OImjEBLqVpQY>YtuQj9;(jhUp!+8vgP8@0cc+qjFx z_)=KHFS9tx7Bwh~hpu?sF94%z!@Yu&pu^KdhEH+(i!|&H;`z=@g;^pv3V!h(b9epZ zthC|`fzagx0!2xql+b}4!CW7jrT4HXu3&0s97fdr^&ZafFaE(Pow86-5(zNLkvJe|FfB}-{4(_7vxSaJ0j zcd7WR1#bQ8nXu{skY19H8OWCALzrvF-qUiHa>1}eE>a7rvSD=bKrU9av zX^jX2NXM|27AR8c<-*Sa3!R zXRZ3h%Iw~{yosNNn~nGI3=4h?dIY4Cw*?8|xE_6?tY4$=Ug9zXk}(29ID?scmbhsB zYFf@qn^5mMZ1Z`7shROozBu?|3Y0ULZLyomf|IZtAS^md|Z1r0v5?*_D`J5YBQPFGUc`qvsCp zKo50nv-O)=_TSIyKQT{fitBovF}aC@B&_*vQ?T`6I8@P$NDP?}Pt;)^r4bT$kxOoq zOKyfJjjwa2tZ_m*q;EA9PhmA3&p1u6 zW1lS=KU&UQ%bpR<4*A4>W7$Y$tb1Uz(tMch!5?e5>Qhy7D^Lz|#74NQ z${~3swK->u9&Lr^Y(<{*Z|g8+VA(*;vSgnjryT1Z#t@M8yvjN&2*vdk&vcA~b4xMU z(8(QO=df1=(W^U?oV(*ETi*pa%gQHi6quKDuAb&K5h@vEWs~7Ze@>7u%s7~%uRC(M zbInC<7y9J@#`VdhbK$NnwI=84CXQAlb`j0T1;%#rv>HS{1C@i7}2S>KE8{m9Is&>IRx+80A zbzO^0)Fn51W$|&tVBy(*+Ia_G^K-&gu=>Tt7XmOS>wiN8NYJd^|gfQ7gPMHADH7=S7nT7YRyrS*7w?hMj+ z{Z5pI<_7Kl^oGPceRB=>i6p0fX@_51-?bryq5+-xHjCC>@dP1-A=4YYPqH6lNN9xE zo_^$rup*kz1hOoTK7r?1s9_*x0m(u}xe~V;R1#`eS1Ptj8EDXG{lPfc%WXm`+3M`+$FiWLYc{5mNH=nS??)IIk$ira5rw^hxN zq>a;PkPX&TiU5>Mi|LsmCKDWTA_a9Qo((xwB2R-CW-eYMF#leSsq2hSJx=|eMgCs~v zev$S5c1tmp~#?} zbk~OYdjuw-2tC(Px2Dwq=+ovkR{MChzMLD=%zk<@i7fqFG6k(W2K8UG7K9R`#ceIo zLlSk|v@|@LhccA39r@87J$G3A+dPq^QEt5TjPHS%j^{u|!kMsEYmhjYVolNbiBiXR<8UO$i znvy%1cwDZuKePYH>-SeydoZm-jQr~p$MY-y)29#(_#yp$0M=_5h*N#{wTs|zylV7& zk1E-vx!6%?d?3YJ8Dql-ApA;&=0tc|Ljz5uv}c+UPx2nT~^$ZF`~{6Jfr zw#cAs;-H>SWbh#V9Yf%-XZMhRu@TN1pu}$0!)&4(#KRvYNpZUunu><#ao?p0G<$;> z@rdwOvW8G5WkC`ONQPfRsUkuB%Tw?O*6ncBlC3VAZy(src6d!fnENHK1Qp?M0+waG znUP*_5a=jtTc4yC_jKqsw-J2-$GEh%#1BMBA+m>Aws{cgKo;{{j33QfC>48m$T^Z~ zqJPaSL7CMn>EUnz_g@^1u4U5m5F|;UJlH_f#4Cjq7zel&X#lv{jroDz5avPBkOH_# zDXhDN^&>~a%#M<7cT)-}ETj})dL`$N6-mWM-t68nNrb_t+GvR>Keq&yg$G=-@gmpB@hA?OZ&*scH2h#|{ zc{y7S!ddux;vX*;sG{FMi~^%RR_I(WdFA79SLI{;3oRQ9Y}=$7-1E+Lx!>itTgvWz zfh;(CevEcfq_j{eicfVLg^-Mdv#2gm-7P%+pzG;k35zz9A8&}R0+3s)k;la3^Jx%6 z7URsG@o6_VYaUZQvC)87hxsuP8#p0yStum zpsNQ@y-Q9bG~I~Rs*$J9c=z-Md`e9uqUaZRe|^;Rzh?5KE#Q9p3s2;KfPa{Zd!bA4#SZ#3XVMGFuqCn0x&}C}`mF zeO~V&@2)Eb$y?ta@Gc|-DQ4oxBgyi$Gymuo3-E65&e_OZm5k|<8*W+7&QIMT(?*Dx zvpvYlrMof^Rd7>w!}Q_j~O z)dR0T-ZKm>IQ`hFUH0Ki!L`>r_=o2bgEg^b6+GnJQ=lbbl6Y#%+ag|9_E&pxCv9KM zckK4t+{S4}w2yHjM^1h^WRDYDjK~oXCc9NG*x65(mHiY6S&aA3IQ0q?$BzR_^>aX+ zoD(k1NF2@n$2$6vk0!J+VMw?K9_DvZaBxdJWcwuXZ2b8uqh|!Ewz5>{_yynZa`)H9 
z#wNCjqK^&ztJ;69Lr!me$Roe5r%CJ3g=OY{z_G4~;(AqsoebutYFinUK_8J#K7xM0?E&)T2C1J9nnk>dd*i$iKND{(c z>@)#K=E3inC}DvkrNAlvR#S6N!LUwYS}3S}e;98Q zgvprPQz(khmz1d*k{}3^nj+x`z;>?@5c3HnY*XOtNb0vkQ591pWgz>bYT&gsziOfA zdPTpop{V#5B)5cMN&u2tvmkpjVmyfQ%X}p70u7TfNkK#`4a4g$e%P&Rh`$~Yta&e( zfN_k|u`Hf(w{O!}cKgnPo~SWLeG-b_mJN4Yi*rnd7<TS;GuGqZ32!o4I(Nng|3DN43Q*7tjInSC-7F>zRutjnT4U>v2! zk|NUk!f^P(U#Hl+Rw;T<-dNcZsO|y~*THW@g8d)ro9#-bn1aETR;iAWsZUB%U4~QL zwo~1~X`aGq-uh|2Ptu-8roAjp^BYcky`2^aev1`;>kkGeSWya8zk!2OA$*u6*EjZNr%z+nHT0neZCYUj3~8Cs~6nq;O!?=y29JIIAZzYx+sn$adDp zC)pj~?3vQ+xtG~1!&#q7vuB30SM_snFM)WLob?AeKUvb5xJkEOW=R7OoZUG;Ti)!0 z-}MW>V?w@Lc}d~u0TO>g$P`Hqb5YFY2O~xkX#S0g(m%d0~Jv8|KIR z5n=9-^P;e__sV^7rU-DGbTU{5=4V9_v*sI}!4+X%qNT@eLM)>;C?UQO%(&Y^1{%Q`FNiHN2F>+ zCXpeMo~RLT!0(z0Aoe||7bKH%NJLb86k}fhupoQn_#SNmF=Fu{KCjE_>gr z79i9ICsxZ})G{Sh!G6?cL+aSh8^}hhQDqGZE|n-h!H}goCuseB6_nHin&D!4EV)tX z6xFJt*pxgXpolosWKi{MwD{aWrQB#W+Q9m%+3uoFsl3r;q}t7;Oje{t!J-8-D(mF{ zKGUX84+SOTK+E0a5o@j>8EBCeXd}V)jke6&XjWI1<_L@CXwfzXgSME9GWTe~>hiiA z=>m(h=C<;L?w_q!L|qdjn^;D_#T{FFo5s+Op{~27--5yQRmrk}oXp{mA=-7^y>{98}G^to=>+lvK)MfK_ zc8InoJ?ZT9qkz5b0>^X_OsRHN7+#IS0$mVYl9Ws_IF>A&UN1$m0R0xAhtrb&1l}|I zxmy$43E_KVAzaoo(*-1c!v5mI6_T~-lJfpNn_7a52(WD~$33aL<52pOZbS&Tq>fGB z*E=-c-Q*j0Xkxm_-Qh$jug{?xtkg~~Qr0a}*K=J}Oq8m(N3*vtzu(MtK(n^@_;>GJ z>)hAEZUhm_v;=T6;07aZ5|aLI_1jg9R9T|g z@d)nQR1Q;AIUdSSoWMnlFqLV#4g4^WcFdz4+?8N`$KR&-7nJjxZz{PsBwQ1Wi@f`1s3s4^Ca+|KBBkEn z5}*$8;5_Fhw;|^`LQJs9PFMxeT;c@ia4ggM%lk;PICR4B?4Cu@X zqz8v7)$*y)PW7lx(I<}A-cu+xf$X^s(D^fL8JJlq@Jq%bq+{s?*%{8^JxP}Ihwz>> z18&-zI5Siax9cne4#sn{3GrH-?lsKYirfIswsjz1R!!)An=kwe)n{2fwN<00K%j!6UbdGn1mU{|ZYe&RFij_XOca!4-fYz=H{ zH5!Hjd3z`uxGA0_n52!sXg?1dSb3hkfyv3+V)|f=X%OnyJ-w<7ZvNs}NeL3o1#}*) z){Sj#jDBQgS&KM>kEI`nD{8URa9b*v0J{E;O)hl9Oqm>0Drb)x=F4ipkOeA^7#e$8 zaAGcOcNbQM1f5C3aCd3S$CqxW!dU$28T|>k*oE%inBfe68)i4~WiHHw37ns8t1@pp z-n~FQVO!_`-cV(JsSS#eY~qVFW&VIqR;6q?pn?2`h9RLZO`)Zd&~EOh!H3WoNlIhf zKNYc|bg&{MD3#bAyJp?L3zOgkcXh+K=2z8Y!=S5o3jg+%CM@q`$h_Ou)owXQ`%@Q6 zLRFuFe+AhHH`-kPW5XtIE8-wQJI}gd44tt8-BY2Ti+w!yP~@B$+`slspJTN)ZzyJG zC_!vE#dSC%bvV;>;ehCv-cpc$8Z;D8kQ#Z7*W`B4_Yd#{_q?*JtgyeWbui@}g# zttj-bGc?~EF~)*IBpvKi*EcZU@?rNBa1_1boc}b}=5C^J#M)k}Jz7||33O(H`jS+% zkk;Bj2HKWSmYCqYKH@!8$e|VuX1^`Qhf;Bu>ng- zLnj76_dG`EZhtT18Rg1@Hu#UWRD5g~+wNxHDv{cvou;z28k1-osP91E&_={wZ3)1h z;C9CI@?g=^%xO6S4alWTdzYViEYk7szw>+)e@b48{92q0nMk!))j;&AA-t_Xt<_-q zi#rc>Ko4vf-qr~1#S;GM!kycLC0bFm=K5|@-PMEt{>pQ!zWO7&hOT;k=+QfV?e)zy zojq3Z5sk6me{7tuFwSKI&QDt1s_)5$R0~+pAxJ4F{#D z@d)b&ixuO`>@Z>2M7P`ORyJ*EF7>o5qLmI7!cA_ac_iq5-o-GskC!<+;J{nc_QrV1 z_Hd@=uUzF)zmH>BvLZGHf3U6IS1P(_cR!(%qDX~vb-|rW=a28#plK81-5x>@f!y6| zb4m;+e&@K|Sf4Ys zNQ(t^n$rit2m()YpaA^vI}M{2iu26DqD7SEIE^|H75&;5ZgPnbQE-{h3cVWB1MumET?wHoo$o%hS}<& ztapFhDZ5+ws6XSb%ST$F-C%tsGW z1)8xUyUFg(tE+QcelPn%J<`!ozHe36@_I~u3!|P>>#s|7J#JB7}}=jSd=``p9;mB zQ`N&7Es)m8>hwO?_0jj0#rr=bq03*RMQ5Z{Q+nK7ZO471 zdCexy|5X&)el1l7q6cjA3H8cILDJroi>cLpGbF$Q;pU zwp8NG*qgP{P6Z=${EG*LHC_XfwlU_H!%49Ut5Sv#@d~=+F}>p_yK+S&bvAK+Uk?ul zgB@yYsIVc)bzHGAzbIR@yivR++sEmoU>#bvSg7b;COYaDbD}0h>oQSLHE_Umq&@g3 zkseG?WSNde1lFUz#wzGE#N#fdGd6ORDvWImeFls*8AQeXr3T*Ib4^51x+K*Xdz{zj zSE*GjuE(Cs->k84?;=;Q^c)muvZOr&|IGEti_*RJkfQBJPWT7Z5zkAWAeWD@;JTB% zfYVSRF07IEZfU??zuQCK{ZNS2@qF@-T4~0o0+n!B#~6dp2~s?K>;Yc)yZk_Vdja$9I8((=lo!c#^RS%g{BhS~^ic~kaMS!~vyof5B#fd-elNgWlph6M+U(idxX2b=|;|4?$omfO62 zO6WM*Y;*4-P^fy~D6h49^ulyiJ^Ps)+dT)l!d1P7uNAj@jYt5^dnKjZ?^Zoe0P*KO?j%T0U2i2ahJpbPD{7WFE`U`ycwaypo@w)0SH_{wCUw(fV zuKsGfxVZDx&#FOnzul(qoqqdWlp6krgV(zJe@*IYy#Br5*!BA4OSneB+4thEfQ$V> zjljPr-@5_{1WGFaxgzqvP*LIztNaue7GgyvpooIpn+js;v!ak|M$tT*3g#lS25Bp5 
z(xy&@T+_0K+-=rmY@7=HpQz~kRG31aHH~jGn)A<8xCVjD=Kn`U1OCjuBak^E$(4T8+cx=kj2oPFe!Wx1;XOBzO-j(av&z(xK%Gsw$#2 zHE(DmO$@Ye+%&p(OJBky-rU5{^v<0}_l#xD5{bIzLkkN#i@PchQ;4+YV_RoiTU$5V zhySKE^&GN@v?kHjG<3-(qMFa$9RBTUj(f`7^CC9aI(vEh{M*tbDw>neRi3?k9s0^^ z>Xo|9>+=;*-b|MoNgjc48uQFIF_4-F073so}@Q^khGCx^WzvYAgK8i;15S7fbs zB$CKx{@cuqii#{$Gmb*>uC~>}>yBE0NHgnfv@7 zMe}3(;e7AVLg(2+_wZuZe}^=Qist9#FGNU_sA$ftj{KLR`5QmCwEk=9+uWCLdz%}Z z7aQ|mH+Q!-H~(xdt$ts;{JyxceR#gT`t`@kj~_oye&9EE@Mk;g+j|>l`&<7{kmmW> z9|C~@AU=x`0Dr;kdeynOU=W>%YQfF9*#Cves1zuAIZMa^kJp##jhVLqfJ#`x-g_r z&YR8E_}&OQ(MMy=HS2>3d^d7#w$yHnzLjxV9c!ukKADS-<}_@r-<~ZoEq^rL+VFFs z+IBR@u&r_TbF=r))$z8b{V(0YP_A3;&4=ql|7Vcq?BV|gX$m0#6hdnlGKkrKAWd8P zt+imle;~~%`u`4TF80t6Ax#@P!M(K}8tMNVq{&B3gf!P_4F4CTd8OBzBt10M6D-du zypb|GZq}ECoSGs@Mc$O^XI4S#Z@jf*RN#w+>pa-ZaQ*t8JcA`8*CEqqetSt?O#V|L zn73rg4r5rx9Rz;$W0(Ss4>2{7owlksl)Z@1GLnsAouiQ`C^St{O?h7*D_W=3iYR{b z^w?1L^p1~n*|WyhbeQY`fp;p?8B`xjrLI>G!+eh6hhhF&Y6XLOBECg|kj#By7?op@ z8A0BgiEuSn*V^04))Xb7CCn(%>$=E@H3O{z_AR64_13aId-Wj^V-@Mep;4;VBV}53 z?YAz|)%{^UuC-)`B)N>Ps`|ZptAfYx1=!@aNZ8z?kP$jyZ#!MY5K=Z8)N=0aIq)+~ z^-~t}u1HWFd29~_cPu1iQta6sHdoms5E{h2oy!YW&8)W8FTBv}@t7(IDFCtUos&*Z z*Ma!JBc#;PBhw{R$6jNZWIv6RqhhQn#mP#?ycS*cNUnF20)r}$?e*NG>Zf|rBl4In zYwAz7(0ZduBPhWxmVfU@g;W2Kk7307Au0>4k5{gI`VAE`d|;6e)zj`8NqYIek=Lbn z;#tLG@J5?&j!a~xb@9|?hFk&Q|M?6Er`Buj39~QHEtxm zD2Tv(4jH|MF*JjTsRC9dG(R4^NWysNCRl^@e6BF>K=oE?gh;^i9caSEB?1S?Gvpqm z1yTcm(t#nt$i1>$&~kV@575an2@WKA%^mdMl+Rk2${2c(M+rS&*O0z5r$9yqN&`mx z18E`%^0&(bBf4pbkfzY7LJ8n*We5o?Q$1N>&>a@|D+PLED8)iCARWW`ot8oz`NQpROlFT7KI~5Qu*O$P34^+r9fE zHGqH=Xfx641(SRWig_(*K4kTd>Jo87a}Vt%4Ij9)tR7)#JP4gW(xppcp+RZ$ao}xw z6=s0BRiVNUR4msH$Y$cDVywMM^uxj0#vymyV$we|n6>NA{^rjO7AWr`JvkmiB@ zz1}38paT)o{Q5O>$~EsFNb~*_@x|rm=9V3%PT`X^R|tkv`W!dlexykaFrZXZ_z$GX zN`e|tNO(xHW%UoF>CCM552X2_IvB_n+2|ofgfvgz2&5~jJ8eiDCqlVW|^q`yl~Nx^|Yw64fST{5f8($U=sa2CbAw)CfZvF zgb%)_cIajx=qA|(=-M#NO#Fiw6o&Fgcy3nW?P&xjp;`l%6xU?`aGm`r(+>`^`M}cauBoN7h=Mm z6Ymu%6w5KFZNt+F@|^TfvhmIx%^0hwhEkq<|Thn zOD$vlhs7!(F3_D_uZ2eBqtW!R?Re3spWf!|uBa%{gxCl2PI)6+*gdutY{+QnNAe0| ziS$*mO(Qqt;^6Db2I})6TQe&9vhl@F&Bwf?1P^>Q8Kwts7l?fM>a_vGh1ak;YvGU1 z_EYT&Ra|Ptx0IFs59eI$!4(gwwAq;hRm)9o8xZ zmK#$$zR~@@*53r1a*!a=OoBPti|X6kpGMw{8?x<|MGJ5warY^(i{<)`_R_inGSTO^ zbK6#E%cGWbBi_IKdBzd7qTs7g9rELcdu$MeAT#NOpjkKHkv>I-iOQ8Ze|bQr738{# zO07sJ)%Ogm2Q9~xB3aRQ4gX5p{14vlJ)Y@5{vZBM*lcEVKASn8=PbwBC<&pH5OY4{ zu#i;BnDcSypp=}7kVYyZnJppb6bX$a3K3}}u6@40>-t^S?YiBrf3JUb+n?|E`}RKU z{o3>KxI2-q&n4kcTiwFLWhMg|>I3-0^Pe)L_=a&yNp8Znzp3TkXX0Q@g7^Pq3X~K* zfj;x@eE|;;F6Fy2Jbq!m^e^@Izif@inLb8at%L}$!y&___htH}cTNq8LopD}`b_Y; z=SjiD{ZpGR5J`-iKvQj*>ObGi(83Ue4hw48^0nz^)V#2NMp&k3bZBzt3g#EZ8KjrU zJCewsBv_T^XA+qB=1^jN5Nf{~6{CNY$M-)`6ET75`G68rL(9e|@r@usrl>Vs^4x`F z?{eN`0>%y_Sm*-rKp{WVlffX!_e$XnUdy9yb{6YL+!|pZ)&(`wl*V&jU^j?mGt6Tc z;=$%=ry;}Gc)+NhRU^!tZR|w;Pt@GTiauc*PaRD0Ta;pmVvY?X)9o;LuBf?5b}&vO zOQbuwrtK&Rww%DI;Na*Y&nUI@tDJOVIgbu5gK7%qikfja{?+SVSD+brTu2jtE(4it zOLNb>Q7lsEj=2{9AEarmAlQYU1mpe?giO*-fU|&FlqCs-*ao4vLjrsXxb;&Z;4aD$0(x8wgDB2nb45+O zS|#daNio_AmCMuU2lLGZ7a&V8!Jz9#yp6fS!2Dk=uuX})U5;F#3b>svSji?lk9Efn z>r&PtUVRn(z~)hLfn0b8;i8)Qow%hMLnjg_|EBNOHCo4OD#DLH#lwU;R_Ou7%%DLbUg3$Oa5JT~#68FT`UaziJ)H=lY+kX{&xs*){eU zIrJw4s?%w|aS*3ib$sNDF*n@AWi7>FP!$&)EDb451V6!De>-OtsR+~7F9f+39%S(@ zgAW^Gz{_6J4hDI>#-d; ztCMg)xtIn7D`HR<`|M%?S0v8}pb?K+=tarXZwVzqOaSP$k6G{1&9!U=AHbY?hB4kT z=7SgzE!F{M=xAVkDO<4WZOY7&L^TNj0s^y7BP}SP(j`IrG6Z)fzj3%2S0(^17XXwA zEG$*xHBg3Om7sFLKnh5JUCM*zlLvqodij=?1lPBbCKS+Dm-~$_yoxBq9_c=);J)YN z{kjQ3oiY#wb&}bvr%%41;Sv(J2Gb^K>!7NgMg`A*#PEJs8tJr>EQ)TY>z%(;y~kE9 z!D$Kahv{gI8DZBvj}QcN4_3#$fVH`4%6k-HZ)?1AgS4Z}?09K-l)p9?*d+5$0xEX^ 
zK#NW&?z_BFfG%dJ9KbCc0Juj1 z1^y4PsmN_rAOR&%j|6Zai!#87*`qy|N8`62eR`4Aj#G^p%UdsYky74u;9MA>5A$g3(zOkBlo}lfQUKU#HoVWOT;I-mGZCOYPidcsK5_$vgdUvV z%wDX_1{XY)^iBM%)|4|+FNV6M>B$@U;}(is2`+eIoc+Xz8zu1Mar`0pjH{NSZZ&1W z4%P{}mNbt@@$Ic40WL_YXUj43Y-zF+))~{rK zfzB!by&mM-v{#0}^?@zQ1m#gr&(WW!*9n^7xWHyx(~-7kn%RF*NS0}vrhhSEBa4K3*7hGz022l7_UaZzdEAP31+5niHT zeMgKaW(1S@6ic{+#n76a_h{Vyy8kgY*O8E^PNiiL!EoedHJu?0#)E{s48aU_GO!fB z1-kP>*=2wqY=MMPo?_xam<1GzV8OEWVjWCi*~_qQ;H(sWFrC{m_&Y2&i`U0xShJw6 zqjz5abA9bK_1ZRvFR$vgU%h-Gp1A-(gdh+$?E;_Juf28=p;O&9i0(t&My_@_yhRkA z{>G-BM|-)O&yiVMWOzvs@f zY@QZthW(}t#r@>z#PN8L9sGyk$2xgyWmq+I*bD%n4MBY1y0T?F#7c#_@=3vOqZi z4w{8$uR&QD)QdH~Y&v3R8JWKZ-5wVFrzHSl2wr2q&nI8orSVhg2uS0oj_mjiy#a#7 zyNTT%LD`Wn7H|y!{|s5btc+kY1wuv$jFffHxJ1}mvffnaVx30!0B{F7^xzuw6kD)} z%*&)h@2~}{)_BJ02v*rp02zJ}{~xxQPMO@pPd><mLvk`4eOr_GXN-OSfU(dp1HG@iXO$R9jpbszQd z?v?+v&7CQMJq&zr`D6W=nQu<89XzDG3|<}aN$c*XtpSO=ezg(Y>p6=tV*6*?WdT4W z@)P@Wnydhx-1BY#bwu_f9fwS$Pmhzi5-9BMBgld_N@Ydx`5F{t!~f&ukjx%*0}uHR z+r-U$IP>|>^jtcS^?|N*r0eq+tzI>)FEuEZG;!KN;3IvFjUJq~MG3yeL-xXlhVBST z&&s2d>*aU)RxXlk*mtKF@kA1rH4+<0h3<<)5 zf1nnz!x>;3-chpPPy%Y3gxaQ!iWDsEQkJSZ=MufYUq%z@%9(sZ40ItC;HD|1#r*PX zv2KhZcn%st`$4-R)Is|Z68ZiA1!?wmEceeW5AH1w%dL!Bt&DlEe7L$YQMxkOu`)HY zGPAeBmRsc_ne*POS2OJFRCQ{vHipfA$CA~s%?#|?+%GwdBtk7$1k*rYJ!rbr5Gk>t z!~D~5Y5OW+cV-pnxB3CdDvX3t^G>K~VbUA4b+j<%7C!;6NFV}JMG3qxye62hg_g$% z=dY&}Fr|H%lwa$hxit`II&F}_zwaaKI`!3lDlpY`!#I&;YAs>*YQt`7!$45hT5z){ z{FiF}_(AzidtC{;SDWJ2o9{-HT#m8`KQ|qGeoIVmCarHCN&M~hS;oVfg=bGt3%>g6 zvI6tJdX#RRmERh6?GBY^W$OJQYQ5!dYir;#?UAQlP_w(11#BC;er_f?2!FQ*6fBVkS zZI}F=DnXeB0Dl8}_=NSM?q1IjZmZ?q#*3l9bZ$BG2o@c&>45?N?Nm#j<);Z1wFzzx zk#~ECz9cbrai@2Ob^rD0?)L4^Cf5dc8|N{vt%@IzC^DB{{4yl|)tY0Rw>R}l!cdVw z*5BV0RMp&5;P!?VPU1uf+&_mMff97&_d`S^3bwltJ# zoYLvhGK5iB#kT*)(XU5}^#TX~KOxN|2mOy@5Ahgmvjtqz6<&iq%7rv1MHNwp@m?oV z7ag*d8JWuE(zBPX3HV+x9*tw4zp<@qE+~d_I#`=#wI5mXa-|iQZOVXd0T$uj! z=YK((g9e&jlf7pTi84%;VlcK*$Q1QI)s6nY%brLQY*1f3+_91?x2){pR2}FX{b}T)a5(t-Fy8b zuIBPbW4<@5F^;k5XTUdO!QJbknf7`akadXg901q%# zKgDi1nSNP5B7oL99Aj_+;VG5Efun@|aOdHDfp=2MmRky*z{u zvt+dNLr;G2yMF%9nRwU0xHB&K4*>waiYv>>{Ggulaz~8}0Oh2y0o5zr4@xTOkFLQ3 zp@!MBN%P@qrVZ5VZ#%&;egHgBYf<^>bo>$H?Rn*Iq6bFEY9aVD0}`BZ1y|@y_?L|4 zX|1#jgOcKBQ2>04#+@R~Ny+@7GZ&8)y2kB<)?^>M7`ktL48`_K*Jt<%UK!XvNh8Y? zZkOL~4C=W>m3;OAVPFgUbf-enOr_ACWE?EsHgKz?oGirXx{-4EgL&a|KE9=IC;MGK z-3eRhte8v-U+T*7mk~neGCmxqoi(suin=kY4_} zMTLPy0KjVxk{0v;7Z>P2J^(&xeFdoW8UA^n(-#cpGfV-g&@jw8S4MhA+-CeA1 zc9g&a`oB3LWJ@Jiu83ObxADelr~LYJ`U)`P%-n*hm;8&7e0jX5F+1>!+>0w92$y+W;& zS*m#s4xsp!8oc4r$FV%tQ5jFF0<}Z2w(dI_v{Zk4ko_7BZdj%u=?>ToGJd^O4hL8k z-{ABYfIzk>*Ml7uiDLT_k0eBj@}-K!AS-I7Bfv6vupvNKILJ0_Ej#~AfDFKo!V{}; z>vPkqa7;I-&7PZ^Yo?SssN>EV^7`tPv-4Vs%^0#)HctUa#; zDvxJV7;Ej{BefBnmX@JQiTYH*o9Jr^| zV|4mfa#&cbgUHJe&*`wHoZ9n5qihyY36+!{&Fc3P7p`gu!#{pSUieV0TYo|~okNfv^6 za4A>31A7*fN+#2)NRJmL=BzDG6I&Zj+=x<+Joj02oPBUHQ!q)jIZkiz`)juNM{1SN zF;aF{ngRM5sn|e>mgKXW3{!Yw2h_l1`9tUY4&VTBr;a##hCIs!rBocSAWih@Q$5;VJ;Wa|(r6^HI8lf1zbO53Cij6DD?f|Iyg+ z)USGUpytx&-PLiK%$^EKBp$#X6o20P2<@4B*7-SSO@dbzh`YI}2fvYlH zu=~d-b)t4)2#4}7a1IC(>CDg9!>t%8T|ONNg`ocmKtXH(hF%F!?V)`y_>d2P;(2uz z5OcsXz?6ys@Lrzx-NyE~zzw4*24b%0?^F^_4``_nD}_l!>_ zSkxi8KRw+C7GkSso4+`INr!--gg8mR>~EnBqO!01b=+2H{=<0+uW-@|AIblM$EeJKAkerP<>t_%!JeO#K~JADHUv=J z!tReUZ29dUG6ett@XgJ*6#C1E_&4TO(~&0ew(H-0zLo*x==XkN6@FZ5yWFz>OC_E$ zOZ-Vu^_qpt54Q>Iku%`Dvxw^W*V=hf>4&_&NFVKE>8{?g)S>|49A%`pe6qk!m9oesZgDB&qr7*b`C!kF? 
zsx;}C_x9UYnO`-%b1k?Qr}_jh?-{YoBZq!SD3S4Ctjn+dD66=;g1yG}|5=aw*Q*zi@#uA5i#NYawNF<4~Q3KnQ>@gW~pyXXFzPk4OhJ0|2AQ2xS1; zV98LZxJn2G(gI)$*F$?$kT@_8=YCkcHP0|**)JJNqBLjUI7w3Cjek7$Rti@0ijRG# zRv!SWkkdJv0RS7uu;qX*H^DnEPsU5|kO2nz=`v6NuW3hjhQ&fQ;ZclLR@=?BMsuiv zf$@_HF{!dIpw2-G|c5d#pbIFo0So< zIR(Jxt+tai1s(|t!EBo}3VpNSUST%WSjk|tsGP4;N()5+0f2Xxh(KW;>k@MwCd6hD4C@JXuW1hay0Fb2R%Ku|HS36Efm9838F7f#C@vl$KZP-B)a(P z`JaJXw$0@@Fjj^VKx_E0sAHSq6pyL_Kq&=E(CDs6$f$I5n3#HY->U^+kbmO?l@#wkjCP|5+Cyxt`WUyb63=CfyBl;_9WdLe); z^+zvcERXiRHSS_~vg(#y6_yeJZ@8!8UG_UyJH-}XnHvkaTNyF8M8kk(_m@AbgfW7`1kM+J@2$ERHE zj2b>7TFyWz+^h62mQvh5FkRmVc3=WE9^j5Ej5lF;;wj&SN&zmETpM`<5|M`nkkpXZ zR0bqlboX+7PBMkx9)OsjHh+;j@RtZA1EBUP{&9+GMiQKRI)_kr07(+$Cirk~O}$L3KS=Z!1u;A_w%uzQF<^V<__ z0|v#+Tj9!JH2^HbzBm*h(|cJF)f5Adb2my%)^KZ0Cj2ymr+0=Wqddn%G?J+ zn1gH0lb1&T0+YO7^e6~?n@z<>aPh7D;~p_6$nsPh*rhc?q8RQXE%f`z<^gd8j!Sfw zLtFHF!g@8#2Xj~YPIA%CwGm=OM&={;-beSd1zI9Pcw95w(^|X+26Ce4Jg&o%#{IyA zFC`}drk?IbXD=SesNua~m_9b0;~$bb#>hEY^Ymm*>)YwhbaLAMr`+r_G@_7NaYPTq zrYEIqJJlWq8NLi1k4Sxf3;fcd>CNlm-k07HbyMHZ2*^y&_!rN(o)F-(H!q*`wjFR; zcJi4U_1>@(*m}YsKH{~wpZmi1*-Eb0?}5+NzMr>NeD;+^cHjE|btL!x`GBq1SQs1V zO$zMgx3VEY^VmlY6k#gnkq)P4W#s0H%7rC|+49bQ%Bwk24J7Q~G}lZyfo*AWi=RGk!-f0;*|#7FTad zJ%d}V`djb$1$d=4YD+K}Lj$G+LL z1}EVcA{0Y2gBeO24yArs%2*36kqe9A4ZBskSpQ?e zPwe|WLo4#aVi!Umd}z582OaGR&S?9d))V${FD&+I@PbFBEMKg&O1kb^^RPKjQn;raxpJ* zsWkF?L*$Q+$mPMvm6^!Z)ySWFk!#4Pb-ActI#C-|QJcD`uG1kq^W2ft#mY6*(X{hDq1x$S}i|X{a&=j<7mxS(ON^%+MlC!en#u= zN8kj07^|N#*84Gd z!B`u4u9g!^w2rks8e6f->sV?fFwR69(nohj9EADrl(qKZj2rlh(Ioxexsb5sTCxA3}EQ$4v{CnYvfBJKD- zICSy1CNAzw;#P?S#7Q;P7y~jR!OUDh*p_4>X6txVtWz~4ME=ihS{#;>9BBH7%XNm0 z$70!Bt}{)QlN`DDJ5u#RLORU7C)U9<<-(uz*b9GRf?Gc_uH zzr=-)Jj{0^gU`=B?Xo*3A!;L@a^4*SR+ivRjE5u*ZJt-<#d0^o^p8U|%uz7)h6`WO zE0`GuRKUll*t6-`vw3wQ-Z($qx+P7M1G=%6T9}yZ@RRp!I?w6nFlS7%RtU_Ea_J5m zVn*4#q6+q_;WO(2-wTepfJwciN_DuG?mfPF_75M~CE21S?b!3ob3L(SQs(iWv59Lu z4w%$dF4sv(^~LZyw50h`c(`2WZELVQo6o)Tcg4`(mp}i$65M?}{>#PwJl368KR)^8 z;|oqv;0q+!8BDSTE!Al}=CK`XkySBEx#?kBJx|BCyWPq9qG2Je_%__e&Wp*mbfNEgx$+f*B^a=57<@qJT$tj4X{?NQkI*_%{ezGn)f z3z;9NK7*+3O7-y9x6fD$wSLPlWN1b|j{RgGF#A8#q)MZLIt8th`^Mnr# zzNc1K{+g_@yV8YqBX>=s=WjA~HlE&k!ce7#6}Vxa1%3Xw_e-*BqrlO6MlSg0)0La% z=%2Qo*+*CREO$BoC#3nyrD|)V+E8c4G>ZJ88+S^E1=HHq_IoF9t5!Zl7!Me^YrfZf z^QLvx*oRZrd-nF_smM7KY^L(ZA(SK>3Lwb*6rbuZuxvIFjBoqCcsvWI1Mg$ zv2HR|`oADe^(iCITlXE3uZh_58GVSb>nl9{e?yusMzCp{suZEg>%RXR()^iwSOtBQ zLm06+&1zM>cV%Z=wJd2S)=7Fs@>0!(h~pWm=!5-=YE=aeX6pB^%k^fSPj`QzUVZn{ zj(SaQtATUP{o9u`{)03%>K?t=(Wr0iGt+$Zbn=pBL+f`3AM0Hl>u9>scI~jGtbpgS z$NH@?^^U!(J6g>n4u8L4Y$=t)I9t+TVQX6pf;4(M{_H!+fX>((RQRBwnB!`QnRfen zpZU+Mhu7ziQbj-5_UZ4@&b>DMw|1nV6`om=&K-l&6mCQOLrcFzAF z+Y(p-K2P>8hUJCy&(S{yW=T!il3V&?0(fJ>wd9U^}RRje8L9>yqq0%@SiwfXC1*JdsUNe!Z*$w0&?NPcjm2R8Xj`l@I ze)ROrKoPsn+lAGgmitrum1)$}m`f`xJ37^c!VEz2Vy?ktpBofe>@ZJ9?kZxy>OD3> zO|&!%C9Lqn9y|S}g1iJJB}Mh#!){H5H`)VSb{Yi}*hINsA#cq3eUVY|VCiAXL4C>y zoTT*S+C(R>`o7)Aytp1}(O;;TO3cHzXEXxkf8o;APi$wo#a5ux7MW6AbRG?*@>_)X zAcKeaRPB`V+h<#tvOB|kM2QM?ZcBqDpT28COFUJnfJsi%avZQP!2w^iqkbdvC3 z>4gw(r0U_&6@4N3m9>NVAvXO7F){GN?%%{d&<762VoU4lzgeEm%)jDpSHz(oGC!Ev z=Z$z${Kj#?rieBi(G^oV;eNe3$KVNG-Y1Re$LW7t(xt z|7tN<;y%vSy@<}+w$&y53omNbK09%+KIxYK3Ix|bd+c@Lvu35fa9y)=d_m4zqQmZo zjs?EYkHdo-&f0g8bZy|_G9Q{d{%MmZ3v%2xlWtuY);3<&(;f5ndwRHqs9HFJWV=0o zk#+izuFknnIEgvYb}ocP2aeMlW`$nj($BxE^S+<*T;zg1L8AX`-h@Mb<41uXvJSx9 znTF@D_U^*0C?innR=LQ8#ER74+1yi`HdVQcuLJirXMeAPrEV5KRJf!!P3$9fzH5gV zqAX_7`-&&!9=AluhcqJp#2S^!u0Tv%dk3P74NIFT-KV#IJct{vV4PKZopxAj{`#{@ z5rdT!lbm;Xp~DrmzM>$}@T_AON9W3`Q_?g-6R32ZEG~2e@s2muM!iBg{9d?R~-=*kOTlx%#Sk|x=KKJy)V3Z 
zVk2N}1WvR+`6|=c(VTM5SA$aAk-cJPR8;WYr5Dq~kd!LUG>X%yICO*({(VJH@!~o z!KgNrCAsXBPhV-DQ+a~xGY%jnJB`i%M7V;bWh!}&vjRl)QAr<957MHF%V6vKX@`VG zMv>Yaf4-uT*P?PG`Ilwi^DjVX?eG~@`TgFOhZlv>n0u<>7z?MGA=-6=pml?Cn|Wn<;e-%{ zu7AbJ*26zb5)yznZ9e&aSB&jCyXd9R;m%T|uL_>o^+msB=b2k{Db6!}EN&Clz{(J| zBHrGD+=caeOW3tp*0#-`wp~%T1zTYnJG;2;4eB{WwOt#N9HO>86GO^0Q@YzhD}zNI zvFhpiipMy{gGH}>ata4y;(^jG$A9YCngWu==r&dn1mPT;_F1ZI!ah^N5y%xerIkt- zJ|>~NOJvo$R2&(fKQgp(l8hW}+CTn>9~gYZ5IJ26G6ee&f`uAOA4Rq5Bfr8m!-XERAPa zwJfX^x6l7q7omzNVF;Xk>8F)v^Lv`5vPMt|OTzI}#X^#BY7i^;WU(H=m-|!^4<>dC!;M0-Hu(706zq9+NgQ<#}q-1 z)N6FDlmMB>|6+>MKzeG7M*A|y6rf}ui~BSnnkiY|NrIHxyrMCLtGeVlUHWBCNnFGe zWot}VIPA@M`P4>Eizpr7>{t(c8sX#AZEM<0kzGsP`dvn-BEsoF-D131e=?#!S(bg* zA_E|hLx5KiP5YCfS}Y0fdaZ~JnnRT>rikSvLzk&g!=w{p46_-QVK~sZ$JWHc#>9*H z(kXegfMG=IlBO{AYlv3!3}G5jmPON{*}zXa&&@N$7vDM^pKx3wz}31`$EMYW0lJP! zSPm5nNHSuB2JDD|Iw^ko`$5wy6n+4AJMK6QcS(&w^hJ`O02_2mvNb)4KPO3Khbpy9 z(d<#N!hKa00V;5a!gPjl3&G)Q1z~I!$K$liA>1axwot?>4x$7Ersa(!dr^|FPQ|OLu zY9|5FI8PwI8R|(MU3p_8#CVBbG?HLwFA~J2N(lG>$COWmyLrs72T7J<-x=Jb28<-`;G{W~Q^vrLKfV3|Pi9A&xXP$ty|I^j+oNXS_^iPXcds? zwU24NU8hq6RG<8W9iv59g=~AG(SZWbr zy#W~-Eb;g>9S+FU@oe|9qvbqWTdlrJzn-ZNncoKMkKIMkxknB3MQ=qUqti^|J3%Yr zUEECJRjLBgu2!hp8cKvC;5j}{6bx= zwxh_S5h~|3eTy;06 z1_CQ1(R6*sZ_(0JX+$mB_#hfy7>53?wUx+Fp~-MWg~u2BFB-{&H`NlI%Gliyi9pTlP5 zR%_1{B5c9BJpe1UWM~<|tQatGpO_-6=?A3hucUm@)^?HkN?;Qk^}k)?O*Ly~Kt@?P zb1aKyJ3-4j>pQE>-Dwk51I;tUF)yIPXR&MkKL*8j7}BDMr*A=mmTVO76p6!p~W%sHqI$y?Tmzk&W1`u48*ipZI zjF1MW>V0O(d||Ctl%l7=XU1I}=b1ldT{mQu)aPxqWs-GRVD<13j7*obH$xtG*E}Io zP3epkirA;d@+-3O7yR{i`Azfp{gI<{y+a7SnQ#DM}+)#BK$TGc9Pzx$OJa z6rvir%c!5Jzs#qVLUk#HCl$!!+!mSMF@K~j2-;A#g1eKnT0ZwDlsw)(uhxh7r@%C!*^Os~mrew2} zd9bM$^_myaN{e9cd`Qmk#e>h_hb1h$!;U`6L3#`%J4Ij!@YIPW!QFM5;$V8}6RNA7 zB5QwYo7Y7F`CioO)EP|Ipa~s|n(M#0v^8q#8>8=Gvm<=<$nS zJg@v@|GsCm$q(Zc%$pFf6_6xs<1JrUm|tLi2dG;7Blix&Vv(sWu~iuJM;?+h5aw8{ z*xseuUa>~(LZq5Gc41J&L&p=;P((E^;%lGXRU}bQGeu`UX;8K+;zzPZ3PIa+(1}f@ z6JIZTz13I3$?oP{ouZoc7!X|u)!7iOkmOt?w&!lbVE6&+0I;kVT-9Z5a42Sk*rRVj zNL}XB8)m#LAZTYaxZ=Ok;eA@y9$AYdbssE!4ARu_DksRO3slMK+RCBA+pNosf}eE; zIwfhcYsD68U0I8rJ6&mJi$)2d#X8)mDde4cN-vr4s&GkhrH#rt6*(w7k(4=N3fm7(2`xLqA%hpTCX!F$+$y zPD~_izpGY~t4`i4?HA^t{A@FY)lW9Zqx}!&N*{>Z|;ux6U}p+0NV|53oI%?@ZH13Z&C7(v7o1A8chSE4Mzj?ckQ zFN_?LckYOFOPpI-+C)aPPx9Gg!hTK0@!w+DiB%4pR)0coTPkgcS8GP6Pt&xCSzv#eljX zzaY1}ctnFn^2|@&o(K|Nm$7_w&p1P3UeITEBv4uZg`^5xzp3V7hbF(9DqPXc)lV)J zvLiU?b$bmD(Onu!tq>TWsQN_!viQKS5z`*&|l1~FzuKZQS?zVRs%%&Nx5 z?=q_8B_BwOd*!ViHo+UpM~^pWdX2r2RczQ0`<&pv@pSU)O*>IW`6qKMEb8Cy_kq>2Tl=44-rPMu?(5SL z_sw_w?uCgfeBSYMx#_aoOTmxjcHW=o^SSi(O;pD&lK%7Gg$b&>+}2$FQR}VWr=HL3 z{$ifGck%Dp&wrc6U%ej~y8P#e_Z}#u4M@<9hc8pCJ5)O-T_011-p+g@RLve#r6?DC_Dqf6Q(T)yuGwcEOgouTU~_$a+RE|FUhm z;=Y8vWgl0rWL3&7y8cY$fdYEF&+B`mWGm*r`tJ-!%b7-6LVuOssZb}` zmS2S}B7p{vKfXP-S6K2E@=bfG$4O&rJu523U;42`w|3lm_Nj1Q&F+jh+M~y^j!t1k zA4zm6%ZZf3PAn44o6KS@L@LnN7j4x&T(iAJDka}99y~DVntxTKNayi6$#or~4d(>Z z-6}oC^9(D*`$5%$*u66?=-dJgYOz$U>6KpPH#L_?z4(eHd#IMN#o*kJ@7bm+)FAjH$Gw8|U>~+5PR~X_x zw0A2Gf1tBYE0;e0bsde&My$FW(+}G5{;NOMyz)NEYbc9%-*l`DoI#m`qZI67$>m@p z8o|JU46(M@dY+RT#Fs}Q*^7ub$(A*Un2~b4Tqe3=yprVOJ&}`30RB}XAGW7&_UFMC zuvt7g{qInZqgRk)Re`aZQR*@{=E#NDgD_IOIv| z2&whLgZhBi@8fAd!|;){IEQ4+T*{` zl$^tyODyi~C;1~adZ<4_6NR&nbl5%DJ@8}y+)uS3ZHGVEHx%3CU$@zAAV*#vYW^wl zn^+=Q@7*ddcYlCPo%DfhrMvjaHBFd?Ba)=MeM<3i;T6&+6_7)Ta+`A$^SQ&if4r1l zf)xM(OQ+@C#yi+?#rFm*)zIhsycDJHuvsqyKL5QjBFJ}F@ZXxH_09+R#7~oJDXO6S zgl|6AE_d7;@CYHo^N+~?tuMc)nDP0_$c3_}Ka?Acb9@nVrSkv&Yl8py`6{;*{^!^9 z{oN&>L{8D?-L3Kcy`}uby?uGW_|ZKOY7d(Q7#Z51Jo@DJY_AL>L0|PR{((>pDHb~n zBKpOj#BflZql4c%%-EUypp=6a|;1XQF;Q 
z9?p=-V!*Oy!M^FwKyTp;7sOLKxDQ1i#3A00;R7yN9cu!wfqb_^M(6tkvTeALH~2S{ zfaOo*I8N|On4tAzs5*SUGYOGF`lUa2Xb*3s#;`g5MXrGYjrIM=_T@VmBnRkjFpL5> z(*JVSkh!yC$-_06B*Y0G1Chz=32n_IZJGOSWI?SB+ymdQ*V=muHWiK zrJ~VS^u%J;YvWwO>8_Id*{-6e(2~R`(UaBWdZ*K60X>Gv5=weTo7|f%Q7VICDJLLD z!Si;LdX%@W zPyVo*qGU8j#wD(HK-?bWAVgK{Kw}epQ2`A6|H0mUe>K^*4Zcq&5J*CiUPA8}5RoQ? zjz|{^Jrt!_01-q52^|5Y3L*%h2}o~J#n3wfq7*?x6%eFKhfMDKe&1(5@9ddbYi8}= z_WTK1>&#l$b)CoY{g50P&0F|1!K|lb*8LV)-Wf(JBr0ac2?-XTZV_i_XJegV>^2sZ z4}uQ@Mn(Z=bZ;?*Ym~*fKv&JAFP5>xD!`ie^XT4zuVt~f5Al9u@+F(+QRWk5*0y-3 z&CgzmI{V99dh@;yh&d1T`Rt`(hHh+exPk0C6ZV;e+?Pt(v?bZFcXCSTvy3g1{z33= zGVMy1puaihK@fee16qGZ%r;D$Wsa6Lh?$q4Pjj1fi*UYd27Ti*pL-(xy~OiMLG&w* z!^!20N;AW*n7;^7g%+mCVG{)h)`l>8zY)G}EsrINUA>^H zBPtUkY;B3ra#!+uy|+iMT=XJb>`Kfhdov85yV!C?bviL`V_S7$`%HirbTF$IeJ&SZ zp+1(N{w-0pNK9?vGYet|7hs_A;j>85b3UvYr-cUZxmNj;EC#YBGbL`}!buS087-#u zY=y=aKIbf^OFnKa-gUec<40#^p zjV2j=+u8m;s&>`D_`id?zTLG%6*Trve_MV>7ENdj0Zg!#B8BY`1 z!l&RsL#(3RaA4`YZ(T_e>IAZGk#OxYSL~xYSa(d7JkP)?xG6V)8fT+2@uH^7)t1+P zTIPJoe(U4&uEwbB%va)>yt*22lh5Ur^VVH8CVf7Z5u0)?Cf2n+wgXqyhOgSRJh1zA zW+!UKw#~%0@TKwkMish**T0(U2^058i#RmfsT0j;AyT zZuL^|lmKVi9p1sSK!O=we^zSJ(P_r>V@$T}q8=_?wVyDj7mbdFAb4UNeA4l^xGHba zGP?M@2m8Hu+56+-;hUSG@A~Y$tJ^j1JMYmK4cs2ScT0>v%3i{#M*+PR)Qfh{#45O} zxVuMuy-l;@zUP(u{+bJT$CSqtpSa4XtRNH#yp|`xH;F+|O7V=kI<(H6aF!fa~E=(?G2f_8Jy=n~A*l&hGiH=h?Z) z_9^USGuBq`_7KTnGqxRz6w|X&!0TN;H?P^>O!2w9=i@N8{a`G|y%v0L?4kN6X_LX6 z)}%X)RqjExIiV?j(PKylnnx-Ij~*pIf>yI{(G1UP(dwT&KeeEh(n=lJyr32A^qh=O z>}QgQQ@{wFFDQF!c>VEhCYp3IHxYBY_Ka_E*;DmTY@62u|BTdJ>k6dof!8kt?$u_~ z-4Lu|^8xz>(bfg>*WEer@%scswkBu$Rv>>}52o?Up>XdLC~P5FxDX}3Y}Ne-D4!=D zay{8$7gSp9-N$bv3()o8xr;Vzr&=Nc*&{>7BLdo`ii|EZ`t5gbJd@~+x)H>2jT{oc zznwIm{d)gd&W) tn%=3$B+>fqc z5Bkh5EFSgz+t%|xKEJe!xXbVn7h1zy7!_9gYT|lv_nOTlW8*LBK8%~@6~YDE2TR=agS~tDOPjF zOEUNJ_*lM9mEQc)@zQx2woSGtn!`Gtl9=k7B9O&e|gtKJ$ zX}KQ>3_cRXaNt-Wg1;TELIt#yQ`IVQY`%5W0ECb>gC&Gs&_;elU6tZ~sG?-rF=|i;`$4N|>*@`Xz`dEYUJrG@uGXsPc z@(0rs9`H8fFe|Nmr8s86;3C=vF!?dg$*Okm3)|*oabRQZWwW}7w0-z!@Rv!Jprh1y zt{|{=S+=ROy#Zgwyk%4Y-ysnN0Tqe{vsvL7)qrpGf?$3QbSy#DK_pI3e(qy_P8J}X zceb&O>&-(P6wrd4Rl!|5;_t>W`)gPjdULCukyp~-_4rog{jH8Iy@hM4g^#;c&|f;M zv2?LTPkrJo324A8>tis(BoBg9cG?R9@N~NTA5Va13~`Jv3^Ab^A5Zw(Pg)8d0zTAg zR9CoLU(t$ioJ{u2k9^f|73=?eSb3WF?o0Jk`h zGyMBONc2H>v{+A8`VeAzh}ulZwCv67eexsq?GgdL%j!c6qN`IC^v5%gXuyUzOpdh` z(?|KFG(L{mV6e8Ie_0PM`|sXws^OodRbE7a#&lTPW%()MZ91{PCu(Co_p-%&HUx>^CosF zKs9SMS%nP9M2^Hg4KwH6EL5op&KUrYB0Dn!KWvuo+UA@X{-Me*ut7Y8w zL0M-DbvkQpI+&fZ`v}L)u$rpBI*_q`^nLva&)TCrWvC;&yUKa?hYf3VtT4M~wQR0R zGK17T1wvNy)_3czll61<+ZXs+!S>gQEW*iItPffoOa*yozbhX|SS4xpU6@6z1SlBr z)AN4cs)ueX@$Grr?_J4Qwuap(3t}mC;MV*ujZE7=SYbN1DWf^d#KFp1x3zF3hnc;3 z_io_&L%yRb`=fxd?Ft>X10cw}c|N)M6AX zW((~Pf)_$7$f^R~UJ9?56|0ONMnsBL)JG!U+{;~LXf!VpQ5z_>Zv49Xu;MRB^IP+C zmD+n{Ap5aU|MCpuI}iQ2ek%U{bR#|MUV?zr6b;Dh1u6iscGB5}Aw zXsPXJ=0gOBuH;do1@-(WuC~sNb^{l zHBl4q6>!dsznCSOc?$T6DvzDdx?FkSTHT?Xtj2|Zb3RMIJ!i;{a$Edu**(O~{LFB7 zi9)SZI?^lLbFy2}R4L+Sar1zcq*Rh&f(fs7|2ZUM-~MOb3euKj@yECE!`~HYZte8i zj`OArJjvY$N7wvTo!a0-{WwR@@w?6&ubQ6p(?q(DTW*m5s|*CeXku9H*=OM1cd!4) zBZP#RW2m)zL;rC*L;XrG{m#SR&9JgGvQ3t@v?XDV`gmkhQAq;W2gQ#j*?8%2^)(a= zsT(YT2C=8ks%YU{KZf_?HR)NQt}7ecQ1N!L7F6UTu?qck>JAxp?5P&qdnovq?zuWZ zq-4@3tEjn%Pa*0V(?caSI*%5T?O!}vY&Ca|y%<=v3*{L>Q+A-ThsRupOJEbEq(C?X)r#9p`)W;h$3UjKz3jw$dW>`jsf&Wg;rNP&WU+v2d zbjxf&O-=`-gbt3iWlde_5c-{f{WTwNm`0m(e6h^RkVLdpkD2_ZGuo)F3I&rhmC!zgQy5n16TD14$x!iL2_}r2dK{sFYd?P`OZ@%MN z-E@gtoi_*c=m#eWB*}~Cic5AeD*Hz!728A#UZlNz=`=Lq8JGl{C7-)?hbg&9n@;SK zTgtUb2=_~efs2(mh%wwd74qojSw8X0UPD_E?^jdvIWaG6)x6U*4sMDCIlymOxpNy} 
zg2&q@3|#Jiem7rqV4}xTgx1Oja#T?G~>Pxx-AwDAZ7Zqp!tt$Up)3!=I*~seJ_TKP=lK z5-c&Vs}s3w=ao5S%|%XHq?K%)F1bj2Fs8*9XhQ+8ThA_KBYi-#SO7o`n*^gjQ&3*K zEReWC%s?*dYrhJ;ma>24U6fC;{F;#oS5|qMK{wArdXm)%+qKGGpHju@73=k0Hn+%J zt}^~`xlMPi%psrh&_B7hY|Gjt0kNADN23ds8{vy$3Q}Iu#W&CPm9)Vsnaf@k2P+FU zfA!D8W!z1`%SKO*B2B7p?}TR?So1th&w6-T*Y$O`QeZqdukzhmiHBtS6G=K~oy0ruFHcp)Yi$3klb5bg{u&QB)fA z`t*k+pDM(UeJj^ScSGGL0iR?2R5gXp_V-^zwtNg4&2HNBDf36g;C~NMy%MZmkaZ4f zT5kkE0(F2j3rX1-I?W|4Th0{tyZ?I^UAw{~Bi zZDr!eDQB_Io;+{9AXJ_4H`5h+F?rVdY2uZO!A z%|zd%Z)K|)81FpHLZ0NBISZY$EEbRN^jgyUT2sR#+!-YM+-vo#+fLVcgOh^OW4)5N zU$R&D&GK(|HsP?eF%8k2DzlM~2?V2wu_rCE?>nK?RePg^sm5jrJ8>S16( zSmuWIWW1W{taj~c`_ULJf0HleY^iID$gwQ!Q8^Gk%XwmnRf%dcQd-O ziosPR*d0@J+7N_)33Zk|=L%S}*S&=o&zo;qD83|P(-q7xS)fCG;uBWH#{ zz9fWzX|M5sro4qT?Y$_6ia>!oWOuI&KjZwKggNKn*NWgQy^Z za#}ew!{hQ7^3UkxFp1wqP}HyA+L4LMJ&8(UiSh@rx=PQt%E2PU#8YzOs4FFaV;BZ7 zPk?qJdCEbO;>lCaVbMaU(R>X+JsRzYJ|QMUy^`q|Q{-il*FCLmC+u2YB{=n@yh)11 z6H*K{>4X?xt##1jCn5t1UwMo&_nW@D40;kfVT&guM7yObjV2JpQ*}ld?8yOXmR9x? zz(0Pec^nsC3^A_)Uh8E06ct)W)Pr8&Usow9c8vpHEWfsX37tquBZR>(YP_x%PoGMN zm6=G>t_5gyBGjEo6Gfzb zM?_c=Uy$MV%0csmsmtR05&Bsxw6GTa46fcRo|P;-lY(ax;VfSp&sBH0f*^<54 z(kt0#>2hQxa?Tm#oOjQ;*aE^6U$ZF-edx|vaiiln1Zx@O>bU3XMdfNwGAomEjaG6^ z=<<|@m@gURU3Jg9)R1fTF3&_c@A^ufy?d@@L!Q%Q9^U=!Rf)Gw4S6>ozrFh|mxuV) zz4z@sgM4)~hzAWikIpwe1YI-(`55Fse4YQ~FgIWl80h{M4}e7y($tlO*4blUNW@+o zreDGrrbHE{zAh{_O&%BpzNUMZHCcE%#GK{+?v+(?-s5*gla!#OhIem!3t@F^tGHB2 z)55aHg%z)3UT75I9~IR_AsZwLN#_#UBx0joi%3z0t&^PZ-W3-}6g`MedfT+|a)^rN?^X0oJQIj@sW)%qE;vu8@Q9BL%Hl&kFoUU<9AT&(&oc=bSg!E8r~sQ%Erx8)>jx~2^X3Qg|=}t z_T}*T3l*ro^10WgWfgA@4~tJG3sQ%f#V(he7?cVADiLlhpPek1|HVm@R<78`sT^I} z&0hKO+$#lH`cO;pL{DRlG!C`KD%@>u(V}9BU!10CmCq+ARp=*G##~hjM^%mP+=h*X zs^2*Ajm3^d)zZwV@*1?o3JN3V(T7AtdAUD4TlN%-s=*)__H6om-YaXocPOec`Bqsc zQ6r*KN3yEBE?H%J#2IM)KIo|CMq1G=mG=p^3z@#v4vnN{PB7jDAm`%jew|1EF82t8 z^4ra!<~$jGqv_6B*!_H2kDsY)F{r=tr0(tOy708J(Azc7Z`Tuk)hHb`rdT(&HrBl= zs3@p&GRQ!u*y)uvl}%U7|iE%(*|BIk2&*Sfhscy0O`;IqPxr z%f`A`uBPU1&DDtd&ofnXjWxCIAIXhkhvjJ|H<&mUDcZT^^sco0)_Q1GUDDw%Mts`G z*YV9M4b4kW8eem^toGHkt9;xjYU<#sZ=NJwzFo>!Ofmp@CY8IgFgiteibfAHTrcM} z03oul@VOyAXh`|VjmJLI z(JV@_>6X!=e|(48wFgCe2i-gb%e&K?@85fnqXO-R{MhL{*{MCLxt~LhUwv{nc_bn{dkw4y6h^%YO9U27e%lXT3~r~h^+-J(h+&S8YTaK&EzI?< z0oC_#Fh?yKmpSD3J&~Ps#?F4Bi26aM?7@yZJ#EE9XQmnF%y@qe%Li`2rOwGl;33TX z7|4Ck<6;i-HMHC9+VWqBBQI#3BbxT^u<-JbVe8Pj=by*7`1$1o+BcwmB--mcFOmS> zo77Db0U@z9PJ@eRXCiGg$$eH8J?BZ$X^u0T7yYbM^2K`Ki%pKmomr6(thfmlnH#4f zDJT}|C{}ZGpjtqaJ-TcO*(RVfrRn{-Z$%rbWuiSr$u77S_ z)x>zb(R=!Bf)+HsHv}`mLkhEyyh(ID{^u>Lq(=OqDQjQFr&)^^%fgwlrq?%0&W)VAn0C&C_pS z`cUcL#&yY$o+YC?V7FhkpfdF6wx73^*fJkV@joIem(KiP$wDa)vfVa9n>C}a88a$A z^mMJDKPxmqCikMUAOSJds|JSSxB$xHOjn8PZUF0R^rOD=-W4n(wK;o;|9Kim7Z^u{ ziVZDn=hj}L^7|QcZF-sg=8siJ`r>(iwV)0O0kNdn-dlY84&)>xW(-H`#lXtAOB0Pn z{>C?qEG!d$qka!jV6|jq-wYxIgAlbq|1-0U}v{AJdlfC_)Tras;zdYQA4@BV`#c^+-ee0x#9h7jt8h_ z*qu@Z>fdp#JGr3-!w|CZfn#_9!aqR1UJc%ax8v7FOBVfth8JZSYe~brMsyhI5pX59 z{tH%5V{{*;%pS|Q>1p@K8%^L{;`n517%-}Fe?=r%KtdDZOh#NM%gVKSUSPtC*$eh< zK*~K4mH^h=4H&tcUUC=ec#e7=QSu(!E2d{naIgnN!+GX)X-x=!JdHhW`lH5H5{@Zi zgF>_K`{d)-C!H|7^43l)QSz_QE_NL`GumgWyoueaA^({s8f5b;bakqi6E0Bmyz)PH zX=*(phCEz0Jb)KGe_!)F2=|=xMfDLOcRe}%FEjpm>X|T$$kC#W=Re!I2f-SlDH*IP z(gF?S&p7GP1=~@c%+Y(itVnfqUmVnw;m&^l+Sil;2oSw5uw{4! 
zv9&4s2!!O&4olkfS^Fr{-)ita4cc+`fxUT=UY4BP5-jQ~F-L5&{h z)L%!qostarglGBtv9?=5y^^HkdWoV4%;VWYf2Lsl!m1HYSzQj7+zMSTk9Ju-o`>^{ z`CZ|HA$)l?#$DT9pHzOUr+5HQNKKZt?`Lw_C>$wJwMN2MiC6}Mf`H%hl@(j{{l z#Sl}toh!tRPLEXn2)!n4Ag8mw@FDF&oIi|TAj2V;d&EM7B4I|nY0Tn^wmgo{8}}<{V=dwR z3)1xdq;tX6XY4~?uH$_gqsab@;$*ZXWf+FU}<}s0h9jdFoky)*-Ng>i-B@{XJ=`v7+{9Ic&w{s9^0u zYCbc_<)^4;&K#}s;Qmz`-+zNNXZ7?Wweux2@*mK8mp#WcbmFbFvgrj3^wLj!A`c5s za{PTo8$Z<_TAnD6=-Iz1oyy`g6|z~g3P)axWpm9KXA?BXbp7@k=zL-rCiW;u^P?HF z`3;^^--mU3U2i-tH~i#eSN|8Jd2|^3Le>1}SL@mD)qFjtL*WX3Wil}hKlDm9NDAD> z(n)x7852939Amk>I|mJTLE&GJrW;Fa^2q`(&m2+tZA!}Qv;PZex+s6CH}-x1`OSK$ zi|T@s$>Xn|GxpkDC~AM|P5yrjX;S`oAwM7d}l+ynEwa>ac&tl-VY_^>ptiyliq7xiHaV5`I4^PxYq6^@-l=o%fS# znr_NGoanQMdB5sVy(O12(eI+`{rXGOEn$V)i2;vr@ALv-^u!H1pR8Go8w)b=Jm z1yY|Gpch>=kK{4`-^iizXQhnl_+^y)uas5Mm!wYm0>7V6;Sa>vQ$ z->Fb97d%x9^>KIl2Mqm33w`40_750Jr9ypteLwjsQ?*cUKSGF~=g^~z|4^Y(PksNL z3cVk4AvWYO6%2I@Qwk1?DGF2h+YD9l3@`alFqBG#mM1?WP_@wC&$OeWqNr5p>zI&+ z=a(M7(5Gslw_**lV-441jsDg`js4?mW8;kz5|gqLpHackjl?TdGxSALrq|zK=s#L$ zL2B&3z|e*?d{jD~YKEq#r$7E%3;q7L7FwE_^pxoGo#^{E%Pl)QhpL5Asn8#}{x9FQ z|3ig-$iGk3La9{f>jLUW37>{{e^H@DMa5Jt^dBm;qQt+sG<&u*D!wcryR3tXgubi{ zepeYnr9!K!s{Wk{Eq>qg=Y7;d?W^V5mznj?iS@B9^>I`%bi5&+YKHDMB$PD{uQa}< z;-SS&iT}u<|FEIeA3yJs(*A{qQsvP1Z5i$D?d2WRSGOFhA4-)&_d0W{yYs1dXh%;$ z$A9%h8wX0Me(0Zp%KyYe-~Ww=PAv?#{T}{6l|v6cH~e>a=s$93&lu_7+0e;v(^NV1 zA2xJ)dgfnp=$Dy6svJ5tJM=dm`uTq*haN0X|Mzm}`u|)G-QW4Ya_HcP#=UiNTQr5w zq`B!}d+BqL{io&+M|+K*5B^;abrwTW1LS0?918yrITZh2ACDwOIX|N@5z;o{OumzZOa)q*N$7sQ z);EmgZDKMLqdC+nt*?0h#1dIUH~IphaJZ#e_VGn13cYOQELF)b-Y2Ee3~<%qH1EWk z*U|PIbn7y_iwjTZ1!OZ)t2IDvIrNuVVA-i_e?_B^$t_Xc=Evx}8dG;(f`?lG(qZa+*k z=Erx(BQ&Hx8v1g^7^-aGo=8c?ofWGipwwbQvj=#6Exg}PfYvDB>QT_uHvOiH7oxo6 z@Zb?QxlT|}jrt@uc&%s$8{8}D2ADz##yq%a8jU(~3a7*Gw<*1CCi_1axr~~>FgYCl zE1EkB!9piU4V_8PCs=Q}&-e5<)4VOiCK#)vi}F2IOn$|$SF+_g6NHFaz2cvL+Wq-H z|8Qxk{Ab~ak7I%b)opRYVYqG6-YW=5C?N};6hgSReDOVkwPvJB{vtb&jb`+l$D6XD; z1_&E0V?D*QBfy~`|8f41(|r^J*B=Jp8viZ8<1b+p00eTGXh6#w7lN~5p#yMA;BIJT z5&F+d178xZ(xuxWp`jpZ78WN!jA-b683TNV2?f8?IS&zv^$8my0D6;|q5F!P6T>o7 zQ9uw%vvgUum48(-vdbVYhLVKm`_kQl9NFY!8Ast_#M;@NHlA}FegdC(I&zL|HjgOv zL3;{jd7+NQ?5a-Tllt=lTy70FdvaJ+_Gd5%+JbCF z;vN_5(5Tt*(?Hek-<^N_ig}M;%20XO|MLo8z0h(j>&@G!%TeJKr%SQwI`R2xf7l7W z82P}dB=3)}zZx!(6+*Pzkd{e$ST3Kw0RJ;D=PPf(p{<(I-Dh7T&almj!L=gp4yT@q zy`tIWA57%Q<*NaN08yi{nirjN@>EtdUs}lq;oYq;-Z>S%dwe9iuO58N@cQya-9Z02 zlas1q;cC-@!$(q83#~K+{H?1!6p{0Jbpg>}paqC_q4kDXc?j+rCx#%M`xfk#Ok+y2 zVNn(2kpQqj`FcpQj?Z}k0xOitUnnC#V&Ha;L~7-k&sOYx!9T-F9{{z-Shfu62Yh~? 
zz&cxPBFN7_+b2Mz>Ra)nuKC>QOAZYY}{n6s zlq4o@_lF5*e&dISsO%#xvb7`AQF(PHyLp1{__QY`b>fp zhkdngRgdxVxUuKl=;4h_$+v#g>Bh1D_17d#>e3%2+C}g5$Wo2ifvW{~px*3dtX1*( z)MO)lA>_tqcf#r<^4=rN7y3(VF{xBYKs!YVo${=@EuD zJYDqzvw`xF{qx;WJY_thLYqiZk7~dyvI?AibepttU-HY2$X2IH#`V1(_V!cL`YfyF zA^Sc*TW&iU@AZ0cWvgX5ju651<~Po6LA7Rdu;Io?3moU~)CziBtI8W6UZIrB2OY7N zshup)*)G#*uFK3Omt1*-Dk>;vy)Nd60b!dy1=AYNX4(LxY6IDRAKZzDX8@{d&LXb# zM3%zSYc2;81sbf^EU@BlL$516rAawSgcp;t>_k*s{y1rWj9>BsoJr)BJOjL{qv$^7 zm4Z3)R3?s)l~kUPf2H&{;-vT=kFZVuTm&_!Qx{&|9Zvqd@#Kf(qyq7Qc8E2Da?a3I zk$|Ap7J^9}n)1C?Or|95qogk&7~x(HJ~v7iLfq*REGQJhlF))X``7xNn;DG=LGL8q zqEmIz-rndfqA@5w5GG5Y5~6RvIx2Sr_T>kb-a zFG3VfB}C(be~ty)Il#g+F*pn=1^~H6B}CC7;7agX4)dPEMe`9C6-GmI2U-}!Pv%u< zMV1X!7gcwry+(vwBSM?wkhjnp0Fa!T1FaJJg2B*Vgy^xrnHuy3^4n0?CC>IU7?UAn zlnzFPsO#BoZIT@B>la~E5q>O$YSh69m?Bu1F(1MrLT^MGIY1@iBA@MpbpBxOQ_ysq z9X39$kx4h6X$(gqv$d1!o~6HHd36Kxdi)tx4y6*JZJtpDucF#E%yY-1O82A6|0Rd| zMc2KGZm5fHq7tH{zX(wlc1D&uRHt7|#QB(C-Z7oy(SuY%v|QbQ6va^izH|%S<{3_7 zhTV&B1q%cjYcfuYCa(p=T5s|M||QN*GK4FYmZYStzR>z)fe9UTXa9 zhYo?LerPxj9Zm!=k$@nLRQgSzAC_^P>W4DEuGy!*KO{gmC!5f&`N>mD=A7~Nj@~j! z!f5vVkFU6nQB)ZoCJhp&TYP=j3A(qdCk0741-LY~N;PL=>5^nfO2*CR(iEVYSE|e) zND7(*2_xB5o&kqp)eBl*#ZiVqd>FJmCiNdbv=WD2=|OXEq%xU7kt zLBQLJag>B9=n3vx$Goge`9ooX>wO}vSe&$kX};+Q>d^#-Zqh;w&9L4kEAo0$KE zvVsb7r0u+)wJfKOp98io2<=c?0q+YQyVEuYOA5N9gO`ED0#Z0KA~(kwS`SuW%}HL@>WXcv+G;LOLNO`?ZWY4MsfQxwlg@%d1rNWuf@0oyWkLsl zR#bsDD`i!x&Pt*;-O$`|r7$A>L_^8tN0}g55IY79B2_Qn|kw*hiItFU70uOlz}gfhx*q+_x(AzN!beYt67Vr$a?B ze&Ea0qQs%LE89iwq4~LVuT0Xippbab=E?c63=_TOfTpCq0u=pc=aj4)%9RqjERas1j zyg87G;Z>xh@Qyf0;L_SWiaS@civwfvhjTrS#FXA@tkJTFj$8mS>^LCm_I8 znlrMQ1`~@%!m?X&^nw6KFowh?b^ri#l%Vt?hb@pyP-|THUAE6Lm6QysvOF9 zynVQbU5)&jqeYi}*@Y{j)t5*ZwhJBCP|o`&tlG#$6nyOw0z_uhk_8n$1< zL%r^_CzrRok-LOVyLBj1G%xeJUa0c=uLAXsJ40;RC9ZU+5xaG#kT_gt>~YUDV~@@% zLy#dI6^yi^a{k-={G_SBg6=DmBc8RUv* z|1pM9yZ|MZj^6X^U-3i{RR^;9dvy%ya2tJ#<+?JaaFsY1_Kq1VltCEZg(uKyr-EYv zrePa&7j4>GV%h^9$T<>WkBKxZXvTvLFbqf=M@DyeezG>D9km(Cm>FF3^w3L(O*ZM4 zQ8iXP6ni|htT(u5*q&n8E?G&d$NfofDhIBV z?IvAq3imMUbSslINIS-uy`=A)G>zRX!$xtB_A%WWK7f{g1mwWb;R*jrqAkauNuKcM zru3i|#A8o*fBC21L-6O+D^>ER5e)hmKx;bJ`rM26|Hio{S8Sbu9e|E3->hK~_>V#=Ta1B*XfJ9&+&oR)`A-FS+ zKEjjs)RfK|&(Lp5eCdha2EGBt${LH_4K3`vdIp$70m&_<5;O-KigXs|=uf z$VmV>jE6ym*!^VyZ6G~Sjzc~kn*K_dIQ68>cn(>TMhmZ@>W1JTGrF~bPrTpZ+Z*62 z5~9iU`*^^QgOk|^+4e7FA>-K}3pPE{hI3JCvR~hf#xo!o)n>k*fHz6>>KmW-qy`R# z{*prlWza@Xp(OI;!PG4F=|VUVGQ!gZeKDtVOov`Zqmc7V-=|Yg!0rG9^HY>30b&mL zA@qGdb8Ybi{lgSF`|ShtIu^0}{4Y850t==L(8f`Xl(A{sp;RH)s*x)64!*J=J11}b$`8ZJ&94r})NQpW+>wdj{-x*z5xW}Va`~f1wR+v5;XLmjUg-twm%wl)c?Ejd0up3i$4A1`Q&-$a= z1dhGPZY8s;W4LSCyx>~4i%0Re#_VdRv){LGk2Kr!rtE5E{<-P_Z8v1Uh3US(z8B88 z?_;w6c%9qUe#f4|7X4}e`i~ud`#;Y!_d7QCmF*7#f(}yo4qm7q*sUKV2Obbn2eNjD zZ!@>=mmTErahH|TmTe4OHCf8{Kn`Jct24K2%l_2=IP4o1V6xr0Mg|>ih(&DDB|0WH zmTeDVk2`+M1qB_S=QvjXaop~4++4;jzx3FRvbvz6`i!s{qENWqKy&((|1@}gVIlJ* z+UuOKyyylW#ap1ekFWE&3}ru)a!8d!KT@Vq=*7R}&~maMiuR0*^WSnPqpPg~POP&8$a%hsMVEm<*YEv^Ubk#4oO5#A3qbpa7ElYmm ztLLs-gp?Iy-(`JSwXAlje+;!Ym71vb;%LA-kN&o8dh{!SyC8HUj9<% z^5gr-5lP=RE>`VZ`kQuye?mvtpwzHFeC@MNS(@o95`*X z0Mp(f)Q~DW8&o;8-zXzgq}}wKt9wYl;ICGdO&xXygJJQXM6-boc5i>L&g1@yVP&36 zPx7h~wdJ?;g%T~8F6CaMXF9iS^(0J*JF}VzH(_zyT`t+kg6$ieA0N4NmSgTtoA74~ zZzF|o{eASPUCGatddFJbACvm75a_Hw<9X>Z4qtAa8whi+6a)$k}!yP(Tb&diy*L))Eyn$!=9Y)vX@>OOJC*e@E z_gkX&#PzU?zP7&Kq|aMEP|>*s0&si!%F>Czan?a%sTdHzrhYZrlh%qKa95(u0R+XA z(}&YUg=mo9!xaXD8Tx~O0O;Gu3l=`pXC65x#D?xUX|=@@gS@4k2tR_9E<+{YguZa)}P)sC}Npz`hEL|skNsv3( zPDcM$5%h65IW+%dHrj-wpywJCXXzZDJg}U1&wHL#j9c?4oOfU<(ydm}EqbR(Uhf61XivzZ+FcY?b`;0!N*eXxWBSUmxN;0ublAO;0O 
zOZr(P8WR+B2q6BZ7umM~0+Dcl+jAduv&VbH4_H~<7U z361Ik%)3c}0dcr}#S3UJCf@y@x)TQ-1u6a5D%mdJ+Ab^z!cJ+_O0}wn5M`Wu?rEO;g=(qd?pj8b3 zc>hOQy6@RH0Ni9;;367y2%9A!;JIw-%vs<9MkJj!9upKj4I=H*|K^c-l9S`W^(G7q zkaCQ|40WCpBN99yb5WrB2^NL-LD<Y5drWnVl}AwZcH zmAp+!AoZk|G=BRAH#+4+)SUBP04^y82>~F`EF2i-NJS^h!n30t>j9Wfal$8VAq+{^ z^pfs50W@IHN&~w|@dLUldA`^cTx9ES0fWg*o|-KB%^P((?f^X07?UrXR|5bQlRaKL zIrX=)HA^=$_)-yX8Z9v(jgZuvfVx`>Kzx647849qa5L3Y>~f^8#lz}5Vlm~lib1Xw z7+3D@mQbblj-1k9!2!a1Ir8NY)v6fCu&%1L1jO6f3nVm6y0G6fP%De!Ms(+E_=%w9 zXFsKZfIm8?l65?wA?|>2+@yjY`c@0>ozsquI+Q6WP{GSvE^qk)4sB$SfO&l;U_?v! z@wMuxm~7{cxPE%QICytFh80VqIp_95lur9G=l}$e&_nYA@EMREk_4jCJmd_322_`< z003N}WX{S*a{+Kz<*h5no1_XO3nBW$!)vRSMqK0Gt+=i6v!K#8>y z%&%eV)H@oij|smcT_pMW(3<@Ic1e>Dniz)2K|gK94l^daiK0WIbR& zBT_Os7SQOke&-pNU={3o=eegTGtDn&D0 zc}t!>#1Qs)%LUD!dSDBISdhO%BJ+?0SM9Og*`j;FAWzU#Gb8ZAcSSwfw_bVDAKuOw zaN;m%0IEPz=cn7?8TLHGvzkU|=OJTEix`(vf$Vs}##z}Q)F;O+D8ua7o_ujH$= zx(DxQrt>`V(Fo=@)-)vm>b~xThy}R0>Zxd_-t&Ff9Mt2x+j}NjA1HP=5(u!sC4BK6 zs>%zv_n<0ba{l+AsPTX#HwzxTvpp?fG4T8Tk;3%R;=?CjpHp9gZO9a$+{2FyPdA#6 zpO6tQer+Ln9-(?Rvroi7@%{cOh07~5E92HBUf@5~s>pzNV+W2Yor_B0#D8rg|#g_JA_ zC5w27XZl0UxFPIzin3?Z<-k@b0r5-#7a9lUuYs~3Q#km4-u1{2uh|`-B>LGyscUp35T5}@K7d%V?1>V=IvK0vA zYV-!Agy>WPLFNTf3=)71!Rq-q3f>%L_uJEM=^bq;B(o`!fO0d|Y##SG2*%uEsQ+-G zjOcNDUzDKgf%RP=WkmxO=t^gZ>ivfcRY!<8kwBE5uiSSZ9r!6e!V+M>>{HS=MZ-v7 z8Ri$?@qS+fV==d3O$V|`XC_D*P9&fDjBQsEcda@a(RU|S z#KS-i>ZI`*Zu&5r^ecr_Ajsi?0RV?;RV3(#bK139b+JmQx3O#)&H#1; z5K#U#odp4qAf#)Y3kcI#04PlC-?y@lWE?9S(8o`uV}od6G?w+l*gK#h@mOTBrp;foUzY{`Ju3J|cM zr>YLm+J~69{)YY;ZgYok7jgE(#+?rV6}f(;?cema@Pabo}!`TSZdH-n5- z1#q{M_}rw-<}v3;IJvKlz5qgO7yww?W2l8@KZ5}-AfPiDQ2+#@mU<^efz_>22McIA z)Xa1m!FNwqHA-+KE}P^};bHA9nECtQ4S9AFPD2~LFGC1d<(TLk^joA8C9SJhdIExa zxi|1^B;Xk$=B*%!t=5VgYdKqIO>Vay8nUL=R&Lt@R|SSy^`dBE%{|xJ`x7RfUN*5& zr1<{fa1+45mIis;gjzttNpfiB*7+XLlrVK*%r!0*dfp&#?yL?_5uNr$g`>9Y_qNH2 zseT8Dsn9%BDu~exdywMw=wtM?G))31ic%T`=-amFw$}ph;i!N{pMG2F_>*eY6XD-% zGXz17Q#Ir`5)lUAsh?W1Swx(b;ZTjB#WefOi98@c2lddZoiv`Js>uytpLyJ~ zWycl+*wDF^pDXcm6EC7o*N~K%BoCsOb&xtF1_-dyWqJk$+$KMztm(N5J*A*dJkOo6 z!=E6hKzY4rpi0Vg>eB_<)BBAXp^CL{=gndlhVnVsgM_3whRDp^77zw-W@D0!D*N6s z7z${Faot+Is*oL|=E3MF z_wZB{XJ*e$N4qGIb5S6K+z2B?(~si zqSZK(KZ%!#-%NGcwtQyhR9pSm$OC11#DfA{+U`6caTx%L1KI@b8b*_)g0=1Q*Jejg zDRPs5Tzf`m!T_6IWVr@XspQ0jtufl5j<=m8dO}K63!o)@!fe--RHBGw>ph}Dryyjp zY=BiK$&rJ^Q%jrDCMENdSZB#52+0S$1(ra96~#%6?fy#)PwnxkrY%Fy0XsZ+;reS; z5(j3yG#r2s0-T?7(zxaE@;>t1!Tr*T6Kpct90h&kL@1{s>FA?=xT3J7^X%FIO~2>? 
zWJBZ}MjG4>xbwH+vMC@f{G^6XrdgCjpD{_K|K^On1n+3qHoA-p15nB}Q3_Noq;_5%<+hKabgeZwMVNGz)L3fJd93H=}LF5Dt)HH zafajQZCsqhLOlpl6H)2zvFRn-UYx4*(2d1=Yiv3YAbA#Qw?@4uMkjjG%z8~GcD;M* z!1rZ3AB@^EpL?-BT-O|U=kqblc`2`IvDvVY%>v|(wP2UIpZ#k4NNQgj(BZ6@1LHt;tWn_>|S`Z=sKXRz8JBRg+ z6;t;h0Y;F+k08gx6~}u)jxU3p)IZ_$sNfrcC#@~`GgGJkd zANPJb`6JlxM^;!E?*u70yz0|fFOTozPZ->18sMW}T1eQ9ETVd`hXdWSRIibF#ovMM zcJFRP{HJ@^{K))}81>MSf=ohRK{M*}#j4NdoS|nO^b6}R+@Pew$>i@nMHpdYVw6Hei$0p5cb!qAmnja#$lMB zUI@Y=geDlCkNT3P{Ur|@UJ&`EVEPl8_=Tz+;!zr&*6@Y?Bed*&c$i*zm38lVdHM`u zmCF51#gX%)yG+&Li;?E9Ppdg79mv*S;9#GDSidRg>qo}x7;7bM&`8}X6DcuTr zp*o5k#{z1$Lv1p_-bXO#Ozb68hV3hGxWNS_(gl}iyA>N?3k^2iJ)i-W(*R3B7Eo|e zyS9&FEixfK=YN$mcaaOA5c;np7}QT;2iXg`)DD&Jr9>L+1}4NuhjMx%;s^*@#P7`T zv0q^}oN;66Mu@m`ng4}BowAZ2B~&>5#|15>X9f^6Qks89KiW#1iSvYk$%$K~6+5xhuW=ge0W;V$(TF&2#7>%GqRM4<({`xR0;m{}kr_%c z{|O~#vN@Dx+3%562w5&ONtL}2gPH9ZQR)#4YS0WW?ukiIOg}G5HNTeWs<1;mf7!l1C-GUVS*^S^if z{x&LXIh5}piBL@y-a{BI(jmpz4?$F8!|A9eqj;W@Mc;5~7{GYp*6`%yh zuM|wurQTGFX9rJ_aqU0A|_&K1{>Szp_f9WRivxuP1rB{xy5?9mzVW$W1gDThW!Dy<1- z>FS!#V>ZKgxGIpz%3^B0Rt-)DY(M`{-JH1GmEK7omMfN&KXNdKz;(iQZQO5rCv=|s z_DgNCH)eJ2_SxU+@B5Q~Z}GnF3YUtps)A>XuyS~sFI(hiESgd`Jc2R3nyHyB&El0ETovNlE z&cJ1KXZTjd_5%4@AG=CA*3mzZC-^l&#n-1+TCx>5KWMH8?yK&K0zPOS6wmkxtbFOvU$317Akew&>e2dyEqx^3v-aBKqYY>GJp z@chh=pW2rHpK|DQqHPqFBU!ONOZmQH!-YP)Lqp+z<kZ8?yKB+xIb|NX-<+A&@>mS5HL5_S>{(@yU?(1 z9Yl8)aJ}&^b5@-z1I3V=Hq3AOTqKY_h%i9XS*%{iay*I z4fZOOZp!+r!2oivi<=V&OP9^?boVJ?W0#W?!ciE3XCh^ux{z4mzj(fYGuWHYjvdFw zjP}D#TiFeMGg)(aeNlVb&xG6ex<-ZbFwt3NqjL8~O$B75ZOzh9(QKYmTG87KTa|}8 z&((Cd@Oq=ubNui#46Tui?k%%~P8luqsH#tu3Y)5jNpOIyW0`7Uii`J^+PjA%f$LM| z*qCwtu7c-+s=-;~PD(C);AwuUC)@GoStJjvyL@*kjO!HfP?(`wM|vJvpu+Sz9^(8lW4Lm|W4+Gqi|`lN$XAFs!G-uHQ!%aa?M zljf|ZU<&Kx<*hkF;V%E5|dRV+L0U@nJO#!QE|+0t|LFbUsig%dCYXH<8s>nl0)a@{w=VZZJ_y+?gmPI z#$@XEN9P4QxgZ|FpvTIh^Uhjs6@CxPG=cYBb{Fm}zG3po{ui1*_W0bDTQDnm7EFL{ zauv;UiZmn;^P1}O8P4T2ZXq%R%g{ceMw7o^Un z?eSnc3!z)RZ1Q*f^W&M|lFufg(wnn<=T$$7v|B>dx}qQvAxk{tmX`AUNwz2QwTt%_ z#vJ&v_*3qJuM{7QsstKNo}S+3*tHDNA56--P%Kz2^04_xd~-+`(&lkugdjeX9NiBn z*R}v*iEd9Of>Ao9NgIj;t4dCHj*sHsQOB%fZ@M~$yCH5BK-5+%*b1{P>V-~TG16Dg z&i|~%lT-jX@i6*%wM+L+(om*h=^&To6q?%`-k0@l@66C$j+TtV42%D|=%!b`l%Lp4m6Tl`Gr5K4HjIo1I zpk>u5eg$$7G!k?I-VqZ7o{aqg_EKbWzsYk*t9DOc&E!I z)Sc@S7pbBpYY@3|fy-_>6v0eIsP>{qmVLx^{PO}rxrTF=@d29KdTmHb+?l7i8I0%} ze{6D}+-CEsVLIU@J4mCjQ(^g))IKUxM>*BEf1^R7pZM7H2ab>5o~pbO2GRU6csVr) zd~_#utid`cFLI>e`L*dx-3_nYgk3ys1Z88UUj5}n)%}y7KUwJmOv6Kjr~|5nnHbon zy{#(v@d>kd>1(P2;d{7=Ok#$PIGOvb8wl|uY@{o6kjIw+WQoj?AQ*9Z~ zGH#ZBDh>?qGRQKT9_9|83KVH?wlNPG7?~ zKdcPaard8S-@Df;$SICL_k{>>$rA_CH2E8p|tyNjR5yYwS_ofARlEz0g+B$-A*8-V880jB&QDI6{@Rr)6lV(QbKc~vQ`Kz00x|GX* z$lm)fW^cUGa&`9mZ%eYL_CJhY2;dodo}9t< zHpiLWH|ia-U(w2RF`qk?1mboZr|B>H9e%1##^9x{I@#`dKY40lx&lrvrWh+S(U@9u z0A8tJK{fT=c^FdC9O$$G{#N0v`%50ME-#HJ-pl%0DS2MALWAb6XoDP z!DPim@BI^YY74*!gSwM_U^yO8qwd0|K~TMvg)EjLK4sBgSO$1cm027VSis5hP;}Pt zQUg9q*LfvTDBG1`lV6N-$7namZ*9EXTB%_Li;~9Y!;TQgK69Ptf@HsRP2ps)S+K`v7wQmELTpuS3x( zz;}(}O=p$`L#_qSvuiJ2{pYjfP>z?E#Gw+0MNix5^fLI?$!Fy*BZYWRexmA3S!|u- z(U`K(bQ04p57hiTynTXZL}Gz;t^$QkhRoKex(BaKw(t#w!zOFXsyd)$!29CcOR~2X(C0@T0tFOw!-+zGa73!uQ4TRlWUQxC^^VgdF%~u?FeXB&?D3EkVh{h>-f-UyYqJQTQlav{a^*`-tODJrp+pc> z;-!qn+F%!YtXtQ@YzH5rtYW_I^Jd+~r1+|VlVz(fFaZ&L!y>+rkBM`f(Y(*QE_QZ)77adasB6JkY z#s~FDgMv)FTH|0{NR8xrMC=3(?Kvd4(;_(!LbweJ>a1EX;8E*y6YdZU&k%}ab*un! 
zN9_85MYeG>P`www2c(`qQi>0ts^eGf2oN(9if}Kf{>JdE z`>mNv17skiQ#oxV7dnLNSYZwKL#8iMHz4eEfz#;>vmH|uVk zUQP5V^rSXd&R$?>m`C8)Wd?7l-m(h9y$l!`et-UO|6`CASnO5~0)tat17a}v7bBUv zi&R}YPOtVg*nW^#MqaxahjSkhJAVBhrQM1K<{Bq!1hr1tSsK-1#rG&2jd-o@=8qx4-i402;nY$MN1CY(fS!51RoMIxwERh6P zDp6Wyi|z6NAZm{aZzLa$*{mwsEK_3?CkN#`wvPP;|NPZqIl_28v7tDIRk{{qGoood zqjro6?1SDsMnA%dsmE4__%O}f>o_$!z|QdJc05i0jio;AgWbk(N3+x3G80`eT!8}O z0351`FsLIWrakw)Eo&r6a*V%MTh4PB#sq125AxbUSnf2*JFDhhIZ>~*&Aw@COXix- z13{BhNm4o|*Kle#acqC7_$qr%&*AX9*R0lu)kt$P7j_ES1-WYVK)asHSNJXf1(vnA5SvH}IddK&8o+=2N9sz4$*|WyUC6%?bO+VvxSV zH_K)+(z8OGH0+8Z8?E7n+-b@e?|Nj$VSfTjwa)UH;1qQ_k4$0?p`nv}RN22&&%fZV zmslh#+{U2QD(LJN->>>B6kc;h+3`U|h2i5!Wmx7=VBePCUp&#C=4(G9&Fs`GCRt*> z6GeYnr?7TCa_(B0bZokVzu!=K&qw8uBh=Tl%VWN7>{E69w7uKaQs_MT+%5(h5W_@8 zCsf=vYS4=k_Dap3oEvd-8Fa%3s3EBzUzyVy-yJdu_H-Q?hEtC(KwfrZsNHYPvD@H$4B{fAk96d&xS^kk3a!j7d7 zMbb0esSImu@>9vn`2$I_eZzX8IKsF~8a?q0N{hJ@rm z!4y|o!#-YXs)f4|r%0n5Sq_4U*kAW|EsOWE&fYwJ3IDEsB=A9}IS-hNqm}q~<2aQ? zwme$atz|Ua@4O7|3w(Am$eyOL&|*Gs!J8WsESO}KYC!*Rh3)}?A$pI6QQ_;j*&Adj zr#4FyZRK7@cIFVZGk*EyN$mNNdsiU7j*zrSh4)<+9^k%E>c80x$7 z9BC&jvpv1&;jSoqNc0#O4&cyour#-)l~Q|?FB`;9p13}F!c>TgSfhH9Mu%N?0!az8 z1f9y;$*+E8I!DgfKrH0tJjKg_W=@?c6A4e3hjW_1vYt5eFnsv8-^pBmKKUwRd0Wjn z49F4&lrl?5ot$sb5Ee%R`BHWWLX$NI?RLG z#7`=1?+?owbmrPyK6sQO^C2<+8jTBc-f***cetH>?S6q5sBlt8IkYPvg)~@2wuJPX zyUiE17IK8)%{_s#o_`8n?cYw&E6@3(Cx3<8lB&@QFcYG%AKAln2KC19m2N#BPW;_8 zNJD9&?f8ZzbNI~LPKPR*Q8T$Ugk~K(Vxl8pz;7h6d{2ZwJ@Eth7Xc`@HXH@kk0RjA zJs&n|*f(97@(Q_ceakp*`?_W7L&Qp@lcYs&(|sLAa_djJdH=t7LA*IfI&zJCF$5yx zZ>if%V)q<+@|Vin)o#x0WfuZDPU3m#Sf-$v-OnEOTCCF~o~M_EAUIzZtl2FOme(C!1cV11Ik zj8B4JF~<+J_ozS0u;`r!-mC|<7z@s3#-;7V{uC@P#)2l_Ke7tge8s40B6F;5?1nya zus7eu|5{O?A4?>0%wrX!AKC2uvr;7Md0LeFDIDj!u8L(}%2bLk|x| zVFFbD|A!nJL@1QC;i`Hr3mZqgAxsn@Zd+UWVF_AstI;q_{_dGyJAVYjoa$tM^7bXn z2bg95*(-GWs_|jgXiD$LaL*=7Y511uD(-v2^{QuSf7Nen_h?+rOyl`!{)jvBUT)n9-1s{Vg)Z#`CG^r5V`Ufsl)F+jYbEP!_4PRdM_-S7g|99pq z{>}OC1D6x~EH10f{N>3-)L_xY0GrAt#wkgmiF7HF(^5C5?qb0(pXouAg|pU|F^Sw? zElZq!XPVRNN)TVihP>mkR?;j1)ecKYS%6|44CPceR))CZKYm5r5&)u~PZg-k@Nh#w zIYdbt03H>JOQ5nYX(=?vB^yV%7FoXbSRC-q|8pjNxex?DTfofE!X^HxQ-Xc(7kp-7 z^EFfQqUEpx-c;dhQcmjFFQfP_BF&cPJ8iy z5v1SgxBi3-;uTJ|vYJFTx>HcAHD~QnAX?#U1|t}Ic2TTeAVy|c;Y5brEEO%^+e$^t z{h|oH3Hy&6dgk^+ks~?u`;#iT{u?k)hI>Rux6PT|(8s28L`7a>^%9O3bobye`3wU# ziz*w{+0WkBh`5(XmH`R?Y}iVwp^bOVT0DMYyfakL*9xr^{D5pPEU0zx%?s>br@yVV-R2nOZoH zi$wA`62S&q4#t?kaNOlf!gU`M+!X6X>Tf4nah{7@5p7PJ-y7sNja#3p}e;G6yH>aP^!Hfm)~w^Df7@>r z%rC}IB$yxm{?zl~T1aKzqr?5p-;XY>r+he=X2oQ@vK5$lSXpRs@bB``F;6WwesA#KLrjK&Ngb!n z(bMvMjnt=I>_W|=hUfO$0{_6=VNKV83R{Cng{XR{1C!tExHGt6gI(kTlV6?KA-2kp z5k05_tJVYxdD7~Isy#NZGMS#u z@{tT;Nxcmt7|6JQQy>k9CW5uLgFxPyAUR>FFC!-M@IqR|ii9brt?jSED-L&qP`Xd7 z9j<8Q+_)+$=FhrD*PRu~!ddb&rM?uY(UPIE1KGy8fnoPcgJEnaVyr!oJHu7YEBsf4)w(Wz~MI^rC z5pfOWbOnNfGDpE1qUf}NlJ#ViOK7T4BjSbP5t`~!SQA$-{4mTK_r|$ABd$?G?Tf7| z>%L3eU7bFa|E`Cbi(-0LX$ z9ue;4_4d6;Pv490kx6g8J{*el4hsK>%JTC5C@tFeF){-qvbSnLQc>|;Vj!m!sK zmf$Fp?H>*gMeiN^(ax*GNMDxrgr&}G->?2}YoV_fV?{osReW4E+sM|#dU zM$}Rz0G}wFp3GBHw<_2G!Q=}cBS2DeOVc=!o!X0&AE)WK#NXt$Ix!z4Qc!%3M}TsSg{kI zU^Hk1?3?k=`o*lgq!=7;>AifFz=_x-+N--q-w)OBa*MsF(7kU&D##!wbtmP8? 
[GIT binary patch payload (base85-encoded data) omitted; not human-readable.]
z;HV5;HB#TONdUkH(H^G9`b{YmX|Q3HpGQx)`a0C~f4yshk|2#T*Jjd|h$93#Cfjui z0)#wd=>~>zmqj4df!Cqm=lT(#kP$VS`F)u@m8Z6yBRAcILmE@U zM>_Es7>ITZ3$!B+DyXR^fDvGTrc*8)fW1Sr2vB53+MlL!572?u-)I6TkZeed$9Won ziay=}z(Q(sZjxYO$awobB1G_04F~~!cXNbih>n4ecs!(=JAHxY5sf*{J;vpHvJEyZ zCZ-X(a`n>6)OpB|>q~BCyB7gFOJ!F80A}n?Z<+55%@gflb?%6aGy`!UKY&)HetP~Q zS0$*%cm$@g1_p=Ux(`-GCK_CP2JxB^5>YXh>hPrd^AZbuGW~%DVEpWb8kV=Fasbru z?33m~b@G$?RNxyuC@q%?hoMcmB;797DZ4sd1otrUj%>5g$2UUGs}HXOgIP?7?96nq zf$*K~i)&xjAB?R=D5qJCH*^OEIp`c#8zV0Q}Q)>VGJZC^K8w@PNdNT zaSo^l^usr=17o_3T*p-2W+kXvv=*C3QG# zni&T(Vy+V2(gH_xxg&rezTdNq^Q!lI|9s;H#=l?rHA4SQR>2uPNOND}BP{@nv(CPV z_&^)m`^{A}V&b-bO=V0=q78s`J1q7zuk&%=XW=gi(w7#0j6ER@zInAXJ0HV z2N1TB4_ihvhUIU=*6b;Xvy&O_fuUgFP#o!+AJ&SCPX1oA9ph`(Zj#p4NJI*5#Mk5PZcWR%;{E4uX0 zT5ilN3dTSt3`x_F?-3@t)MXTg0}HSLZ>A;fKauY}+kpUmQ>I+LKm-UFS7M>gEPVV_ z=Oa5UhE=XAfkM?QRHnDy$`-bOdWWpy1ema7s@ z6IdZnP(7VdC$hNc<*fguPKBt{sm#CBDJ@;T9&q3I|w(=qjLj3^;>LV8ol;d00wX&u?=T~T_YoJP9v3iMsoHB zGi%72&aIYwC?S|e`bz+~8ym<+19nCoM3H~&5oy@4WejOoy2xq`v`-#^+2XWX;Q${s z6-alAp~_HgscDq+5S|Ikb(pHGts+`K1RBLM#UbW!L+tNUPvK=^(#t6s8fmm_0Nh1& zU?+@`6PxRa4S~`nR{{0n^b%io^SV@B1$DT1qkDe~Hi;7x0}Z0qlT9UXciPD9dVqV8 zShzmmQ-Yq5*kF_fG0_1Ts1FSqGQIDh9g?ZdoQtLzzy;Q~EWfJqo&;E<%cF&eF%kya zo6raGqP+DLdT5nv5ulBYZlPWI*I&=zH0bDHqbBHQU_N3BTIIDKl%W%i??j2@kJyfa z_;dk3cT|$0{G#;+)GAe8j^aT);M%YLw^1gqIRW!(X(bwIA>l|?2l_7g&ObpSzxj)^ zB>-};asEU--4z-+b*!vD(F8wIr$DSPJNpUp-Xpaphfy?X#QLeK^K zXPO!l&7&Y%{|X2oRqxn{4hqm45Iu7hOSdlm+Pc_+lKxja?og@I zU(Lb%$ZANc&^VzdN_h^E4F5}=0{$f0My*@%JfB@*uul7zI;ALsC}Yc(QxBtIx(=1% zOuc+p+gR+VtaSf>sZ)(k6iqHx2w$lt2_P1lPX7;en*4-QRNwx8Q>UXDc-yaF(3V~eYs`KoKi z(gCak9erHz7nx<@lK_EE?32b+T4w+o7I-Sx4X9oie?F!sO@yWQFQ5g*}gRR6sG=RH-%Y3(cPQR{lYb7x;VWW7xx(x_v2L=URQVFD*VfpPk}14bJMr| zN6avqzq09$#+sN#m~yq>?tUf-$YE|`{?;8B2T}|6UO4j7{I{}V4)(X_X9ZK z>|Ro#OPd2+!lqubj{N#1IIb&R3R6qdFfWb@FXel@@dKO>h)hDV|6h>Pt;B=bO^v8{Wt;+LE~*zGl1kInM0{w!@ELyGdrE=#?|rUg z`I5V(8)rzYzw$+{V(mdiU`y)S-IKr!@%5K5_J`_0_!edyOVE>4-|N{d5kGXre3`S> zo<0>Jb|(e(CHY>z67Z$l-x3%+#kpCy6*MIBS*mMgV{biIB(OOuIL|tGym>Q>@$&h1 z$jOf2sjW>B^`#(4$P1B>mr5b83`1tELteXuya^3?n-nsa9rCUsWWG7%eP77JRLF;q zA*Ua=LOz{@P*_42MM9U9LYECgSFA%<-9pzwLq8{lu4jjCRD^zM4*l8}x;Yj4?PKWH zR_OPW&~28m9g(nIrLaB2uzl;W1Glgrpb!%Tvhf_r+dp3D*j0)b3e^%fZnh(wL&|l3KnB;7GwD{-0E}m z^-j8r;RJVL^fef*MGL`oJuP$~#T`Si@1(oXNo(B#i5X3^pC-81(|c)M@%~A7O@`K@ z7;+Jlo=}{Yu%4E*9esV2&hPCNvz#l8X8zx|-N z$I`kX>rapRQWZIFU=mBVv|c_!4@~Po@yQrxiFm; zoSWiKxq6*Sw5lYB0_d*U&{-D~?*62^FqeY%pf!`BeeCzUl#kBZIidU3QJ^SYn^r>m z!`~f0(^~Ej?C7W`OtAHcP7e#u@H^59hg_`x5myiPrKEYW#b2a=qm@&~Pg9aSXv;p+ zdBw%wdkCK9{SyZu_}-x{(_*-$O=H}d8eE(f?{SR^s-Ef=)4i+v6Gx;ogMy8xAzs8k znY>qgr$JUkJerbBGGa(;ZX}---Ztn zIi&5H{CEB$r&^Jl>pSA3xngGd3IV$kkq&l?}%&Or^Z8JL$hD~43afV>i<&q<=#1Y(b%%!fYmRrKlP$N@JuDB zj}_fBpS)KJ{0t44Orf+Zw4Yyh61aA9NZEaR`YnsB8}tV#CKbe?-IUJ%?x_m5|5&NckpPL^wG9ZzpC+*(k7kt`eohVpn8w=x!vEmlu8 z_nt7^Rj?(Ho@qlpt4zc?B_gvju$E*O{jIWmbOvQK63yj%99gdy%Ba)Fu?~OcI7wC-l{ypLw77sD4YORsDs*?_>38 z1mp84VZqmXv@gx;JWXF(>HX1|HP0!!#ee?dwKH!n1ofVI>za`DcvL&XK=Yl??Q5Fz z9G{k_>W{Yz{tx8zWBk`%txwl}|Iwo07!9=-Qv^k#w91^lEE3Z6PPA9@EDV3m?Ug;r zm2?h19+73bV6C%Wm0_s6arcpaUkV4=HE+*Sxzv(fdnPw#tg##Z*4 znMC1L;gsQ5BC5Y#1oe5cexK;?&sZ4!nM?6b92Yd>dM?dr=Ar7M|68u2HEj0wb;F~L zM)&FiGXg2|caYD+2&=a;r|+${?H~3Vo&NcKYGmFQ1F?`;f}X*B8W}V$6{p}?e~P7} z2Y@WIDrh`cEm3y>&bI^mEJ9s}1VzaVvL7jmv=yZA7f;ulM+YkoYa`(ri+vO3=qP;` zqVCpXzVz8eXcowfV=E;i;UJk{sxQV-;ICeDsm6)%7tOQk@QP5XjcJ(=mChcxl~RVx z;9W!ubL}dsbJd8%P?9-b)sLKMi?E*`yu&}Go%$<7qP%7#Q4tX^YKmw z<&a&m4Q8sU5W)Pcb6LQ^GFjme&Uit^JGV||#ek&*^Z9XWLB+9z3`#8hnlKP)&#aZ)$Yx^BOw0qzsTux zG;w4+3gYr{Lv{5ck%^v9Z3^wp{c$f%iyhoAbBTP@mXfB-2+xT8H7%Ttg=-Vz2}zlc 
z$|%Usy|ZIy{dOPSIniG_cAy`=J0va>69*QRnyHVk&5eN*vQXwA@qs9t6$9A*2SVtT!Y zO*P+}*3fh9bjB1OQ@f`iZ|{wJBh(PnaHUt?jn(OVmg+LE^Fc8pI3j)P!`DXkyw7G& zq95w+Pc!|gY_xqRXLon)a%Ox6YphVgjkJ1R$5heRV5AzbJ z#>TATU*y!a%yrq#5b9G8<)D3jKqe zF0r2$bPh=ws;1jFEpeMY77e>R+9+VO96=6#B!qhpw1pliM}B zwUCIpDF`Ay4egq`;wx@cjHTI^0ICj855|6rW>B1kGweXghxrs|v_*PjI+9ho+Jj!P zHW==pFl`Xt%45tGBEolI#@DWLLna(4It=yV+hR}ERl#xol7VmzPRm>;h%wamsW(TJ z8}j%jk9Mr5ef7>GY(QTrVu~!dT8xXN_RjKn+IT2PlLpTw%qEn{EESl)>6*Q;bXwlQ zXZFP#ey%I=E0;kuopyFE)HBuW?N(Ck_oqSht{*Q89P18u9m6eag#>4Pts;G%V|&Fr zlUW0OK#zItGUu76MbC}Qb$HSSYfQ!0rZ|rZ7Kf*UUV39)htzsdN|EE9IC^tqajstj zH*S7?UuPK*$#-HvpI_p+TBb5Zx*w7;A){#&U;`6?DF_>y!>jr&&3zu0(~WeyO7L_9||-bdojdQcPbzyViFaNmN{W zMyFSP%RNGi{3wS4)kE-pHyLwlEU_DK+A7{|zW(Q zR5z2Aj~6-n4@Zm)cMBdKud4U|n!0YdU!#0Y@#kgH!+|vYm6`0AVzSW3ZCcH0c4i+4 znU`dfdf1A~`X1Xa{4Mv{!B5+vjaU6A74uZPg#Bci9soROp6>`i=qmHlEs6JUS!=_7|XpX=5a9Vo59mRBtB;neKtY_ zR`nM`9j(+ihEp?HPE5c&eV#&7p2GiuP|3&l;o?t41Zk7yFW~_!p1fRM{1xyI>mGuo zAhs)u9GN81OM~#M$zqz-vZ4I{QBYS9yle=`|0t+n0fDK16jYG(8iLmaA)LLWSRtkw z>TFbuQYi+(VJv3Opjl=<1mYh8WX)-NqeOz{ZoUB$nD9Yy{GAXV(hvG<~hRrQ=tl*?&sGO`g z>g3g5|8(FoFV#PtT~?#|r!8K*ZU6YE8WNZ`cGFU%NTQcnX|nFoD^;X7@7iDg)F8Fi za+me5e+pd`b@0~uc>W*%l*CK*PtnVkJdy{SfBjP-s(-5BePt}!N|PdqCR+>KJ)fUV z^-uNx`lsM}e=kYsw^ z{`#kQi7Rk+XP~f{r^}l`j&yg8LaKlIkFv@F_x#sCW%KzImn^McDI0fs*EQ)Lg@SUs zLUOxDG(dxm`n)twLfZzC6kwcEX*c5!oBAE&J>owz{`1(5shy-5t235tN#Lbls{S|;;sxsC^ZZkIv}(aARH8`bU~D9;xyt)^(@DGc5$f!LFJ)n9|MBq zB~FtvP`N4Tw!sij$Es3ob}13f=YQ6WLKZ9gTyPf6PbE+jA^JpsFO@)5frwy`W^>6g zR04H<9TT0L(}$)Ks3ueb6^k}PuNZbBO{PKS{}QN{kS5I`flgp(FiuYY9|HAK!_Kv2 zdVLackj$xo)(gfNZ32j!ARa1#TAOk;gw^2#xIAE}TzAOHf5;b2ps|Cz$Q!B;g=jkC z8Y2n%=r3kyS@GjFr1xoHw^AUlM!o4aE_@Ju-wR!M1)ZYnVM4_Ah7ts48_>{TRP_Cj z?LhP#wna+RFc@h214SiJhx0Lp1d^CbW7(V6XA~CB5I~aj`Qn!Nv`r90vG! zDx42jf*3g?C#>-vLttrY_b_1OjvC~W<4|Y1*_2nlf2Lz-7CAqf{11VO{9KQ!XnXlm zGX%DZvT+`q(j;g(lTsXqG$A0qP9Rrv5Q>0;1>*!~DN_0f{!AG|3Q4OF@s~jD#5K#w zi?n|uE>Q_oB=>-v$TX7Q8O5C`V?rfRB|P404k=3@#Wd2ThRD=N3e{Q6xLAr_4^XM| zl)O50LvbC2C?AAd@na;C1!1`RM_^e8l9chHAqMo`O-{TMA*n&wC?dtG4ZaN}kxhrh z*+_hrLxu!Y%RLnH$e@bDAZOwrG8f5T2vpom79^1O-gxLjKpeH-kX*qiZRLIH-=Zay z(OihK3R<_2#5vorZx_bT)MOHjRNmg=Dg3SyOl6(W64jy!I6pOYoBQU^pWRjz0Kb|8#(J zNR>k)+{6aO>j6ebqqw4zOAg5rh3AlE$iMz6mR^)#Av`;zR1AE5@4em*f)87=eYO>~ zk7QIClrEN&oikQ;0E!=i#i6(}E69?wD0y{)OhMcIEi3~BBvLDDG7A#4M61DYP$-aN zi0XLZ7zz;|kH`X>#tJg(hGjd_FdY1l%-KoO90CbMen)ETsV^f`Sff;bqmG+*k#okB z^azdj0~$_IdHB;vRAsZuE!@z{FH*%={e7_NXFMBmg9!psECz_OwCr8`Y!MA6+<+MI zU7%4XC@j#ps{mD7vlM309Li{)2Puk$tUVs>=wD=0cN7O@P{|*$JkWx<EEn;><=& zLA$v+xq_M2uNWw~L$LgPBdrI={u^bjZmN1I*+mk``7m0s$Nh)ofp{iR|E=nyBQVdS zMk9~!$a-VlmeBKI$@~_{M!`tttq4?){Jod0R5h~p9(ngH{+=7!2)csmMA0s%Oihx} z^mbfjAm&ARR1e~>eA)xlmSN+pN8$7#Cz5Bplw<4}ard_5sV=tcW}8txo?Zpy$^usY zeG0z(o7}fGAUDxqvM-N@ZT^t@s<#%8!Fe5K;2Tr(3@WNnNKl9jJR zw_PEVY&SY<`ACR!bGl;*lp5XcrAWBSnas0^PwbYtyuazm`9xpl3Mx}RaYCzQC7t}b zy;OzlY@jr9pn)q$y^JxnRYogn!w9HOAtju$Hi0eCU*oL@&&8;3TV!4^=|S~wsxs9s zhlo>v_nQC8r?rk$`E<}FnV{zk;`AVjv$4_mq2LmT?k+|Y0q6K_P+8)1w1T?>r5{JI zIHY%sLqQs;^y&R6>|gnG|4V@r4yvKzt4=6D#XpKX(5el^tY~Wel~0X(+ByVdWB3&a zasUJ)R9@>&eChHJm8~DhePjbwKHW6dIK6UPG(KNmCq)m!(Q_yjOy-0Pet1d}k@=3? 
zeLQsCUVP#h;J z?DZ(S#vz)DpAOXuHUSNx>iMCE@0P$tRQxpfFMb-8bJ}TC7`G=S&o$m?)D)M|AD5Xx z(5s{s?ak5^%JksMSBOg$x8VrHyhqxM+i}Iev4cE$Cc#FMewhrguSb(}6fLcHpLHXREn5XOFk1(tpd-yYE1 zC-Yzi?>y2m)jy69e_9>&`QZP7pB4t_vaQB6I;#K_OphB15(s?iR3=(Wr{KV>o-DZE zgl2neGMPMK{<MVc4KWfX5kcD1>N^%Bwg)eL6L0KN@GLYA473PD;y2 z;>&RI&JpK2F1|me;h;Bi?bON6A;HM|99)C8?)$wnJ%7#9SHF|_qgss!RPz*nVS>z3 zJg|Pd&!K>7o({PD*F3G))?Y}FoK?aMJmL4p$*4d?3v;Aw=6^evw$+k~dZ=?f!g=bZ zRSe%lNAUFZSczh}(kWxcb{&yP9$RAx`q50ig1QUzjtzk*{k44$C}kbWzEIAeeX zGg$KU_&FH(3p|t^esbKUIwMxzuO@Z{gGAcor}M@^X*p)9@AO-VS#IX0or939f}lkj z@sXl^1(J7y-?s<6=S&eI9e$NYqi@PJ9Onh9l1J3>nwE3U@mv9RcP&aZV~)529q!xT zcYHjraq280MyQ>i^F3c-9-w#j$JMyk!n5f9vNzpFJfWAmqBb0RZ>_kw)&6o^TlTy8 zus<>Wmr77+TYv{X{nHfZ!Z{1A&ObjStvZ#K(HXq7QOz#@uF-|YZ9^$a-k-MYikXVDZaO_zd zs=q=H&OKF8No-~kZ1Bew@->#$nO;n+4WfnT#*w<=H!tOnNtGwsncM4cJQKY8<6Tv{ z+m%WmneU-UFPM@Uv?!fvpYDmj6I#8#CcyW>bB&n4L>kNwD)vxGM_f)*aI(y)F$KzS<*~6<_R}^)~6x zA708SmPVwQf@d~7R19kfmdjOx&bL|M<>;LzjTc~}Sq7;FH|F!3B}WzTxh7t>&w4QM zDqZ+;P?zmaobkF<{Cek^utjk(p)$?_gyy-v|E{p0;?E7dJ0Z(ZV59b?Tj|EVaw%?6 z`Id-u9>MKanId9MIZ)ITZDS(PyiVGHq4}aH6!Rk{}#%ta>OSj^R zG4d5x7Z!trZ2ruYz-&3j>T<>TxJuXSgs%(jj!NDs+a0$!fauk2zXMLIRsLzFmT)zI zZZz?#RY^Uv;24AyHpU@nToyix6&1Oqn>QzY)UIYTo|~u1P?=m`^HFbSll&~+vVKAM zR59P#=Tqf3#rfSY5Bf!ZZ$AF?F>>uJ{+VdvQAw3cVvp1KF~>ztrpL9`uhLv!bl$mh za`dfV^z``tps!~arlg8Nz0>)XkSDFN2Tt(_%%{hPVWTyi7G(|gfJkBxP?r`Qhj$uS zxgNn3x}ALDZ>ny*j$lpQCP+2GXmy|@E=Ov}PZNdFwG+vGg)QyOq$l|D+#u6@c&hR2 zA|&OIEF9XBrct=acFP|vMjMf4xB$~Lonw+sWlJ$B^J0HkkCw=1%Wz>5=ejeB7QM2A zXK24;&@|n@5VD+znRwPc3vFRpuw^^(APMUYfUZOFIl@$l9mEg09tY2Tu5ES->!swt zAiF|$q23}tV-V+Im^o+q0~EUT5+w&!ZB2 zqLj2?{BxGW%QvT(BxGv3#_T@s2t7BJRQV2_^2~l!JTWAt@O*3{#<8I2twf!+*_f4A zl6u)Z!B6wtyBAScVC6qoYV-nhrc>UuCf%y1LCXfV=rtR&tW!t~+T-A88+?t>U5GC7 zDPFb`#UgnvMg45)jBMd7Ka4rRRBL`FrzpA>zKRo=W>4nZ2C%RDgII~D^N{Fhd_DRb z!L(r99xZW`?rAu!LvjOcy)O8^IrA5rjH?dyO56?l2eLLPr4BW|=kHreHrQ9moo}cn zH<%B9x!5(H{3z-C1HEq>=bi*QK9u5avNCyQU+~58!QerIzhsc>cZRv1^fR>AtiHIv zVt75m@wz2Qc=LSB&YMn-n{D2YH*KHEUFc5LY$=sg@U6Q#_hQ$hIn75QP_*0W)m5(h z@aOlf!vtR5EZVP?WxlWOHe(mvqfy(t>ThcC#WuI&QT1Qs)GWp<`qKY}oGx;Ij~lz_ zwpYR1yK(;e)p!4coIYoC{o<%B5Zd!S$?tXMsnay?0FY-JOMi)qobr+A9k=l_p6(}S ztwkJFo+Zl$%blw6k@?@Koc-ktQZ8KX5lji9D{K7_8BM93yNODB4OsedI$rBfS=fjp!b^gmCETH z8oY_pu`XNss@wH^m56{EEBLd0UAl|#SJ#EGg9`$|!typC+i%e1(cs+a(#lg15h zNELmh<6gssYai9Lo-uvZ%TwF`9e5FS`g!ufo0u%8CR&|^N-$y-;jNR-g8t~iFeQWF z1jN5I(tF4MblsSf&HM(^q6=J5fuzfZhXhJ@)$y%H>9EZJMS=xqS1SfP4E_X1fz>|9 z`2K9(a1Gns9msuBF(FY*f$ARl3p@I!_0gY{U3xFL2zA>LwosvA3rgm&mq9=?%fEdU zL9yjP9(R{7db`&g&d-_b$2?JF%6S;a@bcrXs>*Y)X;3?O9*U%g3RBZr-z~{^0(J2N z>yg_!iwGz6Y(_V3FG{d^4$*X`L_|Q9&t=_~bvxaC(JQ(znMTqE+#;VH;5m1RO>$0q zs=W1^Y#fP^2>n&1`0*7>A$jxE-)!ji69p#E!pkpUNwu0Qi7}N?J#~7qI#oEO81nn# zb*r3oNe*pRKK5GC4wcL$*>iHtf!o_9x0G)``pV?`#fv7)OyZv23{VI2vPMX`SHL*% zn@q;Scx0T!Ij2uYkKY1r}Qw|*~c(M3~q^i!mw2VRvr876bO|GL)$6k6s> zlSJC)+hfTbtXH#@r8X14Kdj})Z~MZ)8#AkSM0?9#H1i_a@N!?h_MPai)CDMc`QKVT zd^$TaZ>+UKP52qMZ zreA%^vJCZpBUMLG5ZkUN?9~StqdSw^SH>fob< zT!gMu1P+*sJK4qW>{J&jjCxZER4Tc(DENywl{zVou#-5`20m*yYON+=7AbC(Ut_o{ zIgFrFV)zm~DVc?kywE1@$014DmU2>)4hohg=fd=iIqowGO;VsuEwY0!21h9+C4$lx zkTKhXx0+F86y&jnGigs_e8MhnkS86s%QwyfDs)*X1~e%ue^FdkQzYjp zoqmX9+KE*1Yf^IBRf2oUl;b)x^Of`RRaWhkf$FNT)HSDP+7)45YB!rLBrfb#Cd8EZ?sZ+AYd8a_8reNUSBQT*r5DsL?e6`S` zu2)A9sxQ!v+0#|E)a%;g?B&$|rbX1N7rB_SJZ1EZ z^*6qD{WVY#5@DRCzx+)Wr509dUIxwLxaIi|Z05YBL&OLweJe}KX2by3|`I=ADwgBt(GjxRVa+`KtZddA_q{nJCZJk>z;eWiYX_a6ck zXEI48P^)BD+uf)H>b8Sg@S>^p;%IW!>}5_Y z>?UZPtPwne#{TzO7~p114!p4u_7aVgwL~W2?8WR#SWl!|tHyw(iNm_LV>4d%uxVfCbyn4jsLAE*E!|0qKaXE*!<}m3wTh)>i?vSQ 
zGA8fbaxVD6pX`(+<(be}%~$A$wnH8PGm1?XQ$-nOupzrYhs6i4wB9}&*B!h z4s*yVrZinW&l_sviw;YziyrOGM1#o z7sv1pPa#-@I=exSq+&}-!gPvn{>r#pqTg83)ABB(;7$I{qwGTp^h85W^HupL{)*)b z6`xBg7T?LCJuMqMD7z%#qjz^s`73{ya1O#uYJx7Oz2h`jt7(p>ajD{WgxzL$Vd9?e zHXg5nKP{6it>!}i#SZcQxz-B$(O zjsm6^1!DDDlFo86?w81Zxlo^0TL0mf+?CI!LVlvvxM#&)8F_e`Ix(x~*oMaWhKKV^ zg992j<0NNEjeb&cTZ@g0Km1vaYNBFQohZ@g6^dWXE4;e;;9<~W$Eyc7IGay{oSNT1 zZT`L6yd2To8}aa!cIFJ+aUwe>`lxBy?!iteIZ~&E=65x;d#~eLHk)2jd}&hIE~E)o;+9gy9sK}+E{M~@`7788I~ZVHGFAVO_qk+}tHEBS`K1;)Iz zs2+soEE;J7-s3PJufakeXV!>-S4)3^Q;9;Yw^##*_9C$YaB(C z`&LWsM#<*(#-dGlW913`(?U!$3efPRFK z&!^&{XYUoxB@S5}x1xG7hvCEL%d{^$Twm_)y>My7^>{dBA?K9u(GxS#HaibdY4+?& zma=T7CFk0(8|>TTqOer{^k#Y+Y-x8n+;1vy!iuv|7UR^mSWTaeC+pu^oPVGL~;x1 zWXtDegjNf-mAYdM0t_#`1G`b^Dk#(z)xGH5!)NT-u=}!R>g0g}#(QVVk&r$yew&L} z3?htPjK9GcGV*r8qI&7i;?f^ay74u^kFL7@e@NVR!h=)=73lT_cK?rX0N{iY<@PzY z;&Vb@H)r2fwRnMq+1w5V@@H|?S7wX{fHdvvr-ni38rA}9C zl2=z3+`)5`SAIZ$kwvk*5^LwQ{2bM^rV-Hej?QrC_|}7OBLfA zxSSUW;$X~SBPi^~jUxF6NsIr=H^!-7Vk9M$;e<5g#E1+cH^35?Uc7L#3j-crKE z5iOVGX|mh*tx@!3+5P16^}2(L9&YNI;)Y7Wi_kvN1OWa{iFn;H|NkJT^=$6E_H6Pl zmx`4ZhO>n9uKq<%hgpP&O7w52-8~)ue?v~S&C~VMxfUh7%z4)`B5Nw%|Au zplW?y(ex{WX471&w{X5h`oGBO(`$@>kyG8QqTZm9a)eiReWWgEy_wS7mhC?l+kU%9 zyNZ@>PgA>XWvr03;%6dY#*gyvk{Y+4^0(tCP1ccUb7x7HPe;b``9^*$n1 zOWXR5GQqO(>ikcGzQmU!AuM?o*o zVAnZcC?9cgyqEQ~Ov%w7AMBWU-H@svdZjNN@`|DV!BEYuA|V9D@sS*HGa*%a z%olB$Wqq8mgPHx;XxoXbvq~E?L^(rxpF%t95}pi8rJ?r@v4B_~?>*N-W$srV<#Jy_T2n;h1-qZ-0VY3;VQuP4o$x9~I3_x3Tz|*| zQHeQZI(z?i#QZ1`eJDW_R&fU`&zujr>{Jw8Fr^CM~z|fCn8P)8$<% z@<+^7v$6KX%w=LeQS$h3PYhPJLLQUr0N7<3O*(sxA%-uV?gh7>L@z_78yshInwA`+ z$Jy;Hbwi)-&XfjqZ}l;LKilm{Tg>ALXv@&E^^bhc{)-UHpxv=RHs`vr83{4kQi&Qu z(1@08YumC#oh|+1cARJExlN3CHrOf{HmY9^1Va2;W-()Dn+QM%XLvzA4VyD8@;TzU z#RZNn06s&Z%|D-rIN4IWM^5A$f`lDw1hYw-{Vs@>@!j;AVDGvxJH;pR;?|2Zn2iK< z?Zq5n{!z_41_c}dz%2i&c7;*;@f`}g|GLZnQiXG$MCcXGtP*YiKi=G-!AZZW^=M0HRxh&JMu zHR7$`_ZJ33E34L>JK?*HMja{zquMu!TQ;7yqt4g<*q3DMeQ9{o`;z~`glo5NF+s59 zk?RRdF~9d#VTsE8$tQdey8tUc3T^#mgM+aWwH+o==pa@6jZxs_T0FpN?2n^n*r2CjZNi_k!ijK_C}^g`=nZfnMphQaZ@H8P z5{XEc7hsMUBn&GAMr2&tTv}G3DyVF#vYz#)WDORMRIhdMcpv>5gl=qZkOY-MoyHUX zWl%BH^?ric+hkRSND0ZRCH8{3{?p# z4|BFV_vp`E2Rtg-67W5nK87w-C;OR-sSj=>hldr+4L>fva+C*U8`-XjxM+i%KDroHHgMZ}aHlyRm^*@+ z9{`syU)@Yb-W<^+v&EgC)-Dbw{Lr`(b=5$j{OIUy!DI+{X>MS_(bsumP`>r`SGrqL z@^`#6{DtZzFZ!5RNWQkBK-%v<_>fMIn}7M^#%*pot)!s*>A?=y+K>kk-5d<$(Syz} zSHFeV)IPmrC=@QV@QT<+Th z*_$4;L2=}=i0u?3CU#n%6pf2ORXerGMXts}MG+BKgq0=#uWRyZ>7GoP8FR0>zs5bQ z3{ZWi4}UG_M^8E-5yTEv31&*J&L8*w2xePtN?Tf7^R9hBiVk$A&z53}xH#f1aB8 zGb2Hw`Ni&jTZJfdK)V9wzP6#poGpwh-dxpONq}P*Q8; z4f{&lz-8JmCz!V!`KD=*1+i1p(E&+?u&eTkwLkj!42xiVE`^J*4MU7x{>=B(0e1NI z+M%gtQf{`o)*_?wFA2eQ4(jbAm%@3C+@1?%>SSM>Eq2v6gj zJnbh1m33*mBNW(MexH(V}c^)3805c9WH+DXkA4pw}uK9^sk7Y<#zSEe4zR~vhMVIRZ z4~Cmeif$$EKw(I-g6d+z4f1NVz7~TFj|scs-5r|f zdD_y`f~=rk;YJ5fV4kI5A_eKf%T5O{gy)4toL>k|>W;Yb*Oj)Y zZWvh!S|?0IETr(1YRlP>6;uLLyDW;VpxPzViV+YESnwM=@GbnkgLNu_SgIZz6}%-1 zPD`_G7qdr^TD z-5&+DUMQnFAo?dS+ICrgeHN3If;l2)AYC)4X-|ke5;Vrjq2GHLEU;NUVnLxs&wxRGf!xPxE#zZW7AGKWEsw-VT7|bho~)pP>~3eACsU^ zuPhl8o>*nNRUmsa^Rc{xDf3%!EIwQ98Q0B85a&wv=S;LoS%^-=jR0jjY2BPF1?-Xg zX$EXuPr_b4fxYA)Layn&v{+&7B&FSW`|`6&#GQy6%%}`QAlDL==Vn1=s7$9-koy@F z{qXIz)xnoeD2gKBUy##?{P2SO$lm-XEBP^~S8>9xp6b3zba?d-D`nCuO0_G` z7FMFP=>!TZC8jGuT7}9f)*A;^H&4Az7^*U+e0@uv)w!-(X1e;?Eun9d47M&r zF-i1`z&AE3Z!CS@B&g82;j2%}m#AK3I-gdvydL3@Rue2zqjBra*^9OD5;byZwSnce zDI#zDFABk?-jJ_O7e;f=NuoV4@MwH!79~?gUv1_s*1HikmUT6pBCMeYHIedfk}lR% ziLg{qzBOTGZ!dh^DRQdYr@EE0?w(|(wh9%)6#vDI`iCv`J#|%;7IkK6bvZ)~HPe~* zDC=Dd>zlq+DNWaYNo&N_H9ou5D00v^-p9u+OqW!~N%l|OLr#(X)1loH4lgo*iof^* 
zap$gP6Hi=&&bN9}U!BrHLj`T4_NkUt$~PT#4NShZg7QtvJ}vAs&58c?9O=#HhO^GB z&^iM&^Udh+B!RUu&xbNxx&Caw@zZsKkn62FaSeQ;tn}=ajG`^*qDs}G78c+7jlzb{ z+;ximRl2L~@>Dz**yy71@UvhCqy@$0Ozg<4O!YeahCBMXK!>(OD|df`dVPhaLOX{` zolIY+)@q|2RnzzB`lW9iX{S0a%{2O$cAk{y%wwXl7LG6vR6ikOtNbxS!ZG}?i)pC+ zetPFUs!G?HPUrg0d3p9bQC&KByOv2^9(SuUS8J0l_Qs^Yw8x)N@Z6#8PeN*u=~Du% zW0>a;eM&PPa5q1w7n|7?&(5wKMU$k^lM+?YF4}5H)z7(Fnb+Ut@vLQIsA_Pg&)E%K zjpclGif*O(g|!5YsUBlRbMFUV+WX)7<0M+TP5Xrl2YP0@`ifeX4*Q2hxkmKff2rqN ziM{qiot-S7o=^>fI0kK-5u~X1?Go?SMcF@24}2@Cc_#WUw`g$n;yW?bDt=ZT=0}5p zm#Ge;81{P5(dn3`bvVD%@aadxa~AN<2Zk%8;U9~`iEoB57VmXib3F>`wps=;x8L)b z44oK_=(ZLyT^M;Gt%Q=GFmVyo&$Rn^h}i{L2pET{-izPSxnG96=q~Q!;d(|0 z(JwSEds1$<$8Ni2+81N(oQ5n3L#$6Go;F|_$j+&%3Yk6K9=Q-63zcjY`vm6nzQFq6 z5$30SbLdaX4*{@8ahOp3$)f5BRgUS`@A!7y&eR!FSu|6nlMMasDO#A(bZQdzmQ#tY zGd{%>O=X<;C!91pln?kr%bcb^IDH&H;*e{TQoMqUql3-6pwuLlF00905A>>to3SM! z^9(U{1YOuKP1H-_(I+2woIdY9Vy5Dw^!|pnz@1Q|TmAK7?GvodFX^*=rB+%olwNhm z92z2)m(bf!KGPe_GSblfdVV**1$v|`x^IZo!l3836lUXRUKWq%zmqMx=F^>}T4af# zGb1O+83A8YPg(Y}0~lEY{bHt(?Mm7W%t)!oDXiG=Radmcg#lCx`P&!sRMnCqbBTIriP6Z#?#t*E72nHb^A!8x^7B(o7`R1j6WKg9c9EW?Myj)L+%IO( zy}pl{^ILOfob$=H6Yme9(;+jQXUb?9Y*m=90PZ z*NSHFMBQy{G@VfxlEgZF5AZY+__Q(x!@()#d5GxS81 z^sD)|d8+@*GkI;EeRt6^lftK!N~D-Z@{f7C@ptpoO@^j;ZDlSIeflFU-|gX-EUVre zQ#U0=JILlKG32S?$}NQ0@a7VxRE%t%e*4H+LN-sa-I5A-*xvs(Pn`*rYq9)IgC7|z z|CpylzuRQF3^CPlPx7@ z%s5HpH+o9iN+M~J3ktkl(MIPy=C63D?cTqZ;cm9&Pc~0KRnv}hx($>xG00JL06Y3- zYt|N%%{;s{#L4tWs29{zALIF3OV(CtYAmq48)e*zIhcb_KjLCcy7Xrlc{dM+&z!hH zZPzJ{T@=%kag>z`#P{fa%y0A5WRNir^V>Y_WP^grh*uS9cc87;eH-}OU2#Oik~@I7Z*Te5i?tGQ9OU+r@A zG4@(SE5q=~+=T?_Q$ycnF2%@X8-;76{t()fUP4t6sH)JI#WuZvFEX@aSVmx@ zQ-k|KfkBIpp5`$Wb@@t1K!4TYFPW0O7QWq12@t~%QcbeTU&v|6?S{v&Q`c9GD{S7q z&`7)f-K6p&Wr$>D-`H>Vlu;r6J-4i1^``6GmDnByzr$!2sHo?sw1}0e^drUv1`EF? zvlp!_UKJ{ebd-%kRToE2xmX-Z7$L&W+-&o8)gOMSJ}R!5cc_o5yqM8)rJ9n<^@7mn zxOM5^s_?E9{J&X~rYTm?DoEz*yD z$kn^wT0^wR-SBo7YqJC5vYEHT#Qfr25ExVIxR;ot!*I*=D_M~|oHbvb4Tyn`SbMSS z-hG`M-woUm0?H}ad~AWZx`YPf4eN3ZX7eBkP1ZYyJ*_%>6Z~Dib0j<22J&_ zAJ-m|9vbd?*t1w(Ss}znadXz zV^bJ8C-sJY4d-Vz;=nd?D75&*GfS{Lett7+;2gBeLR)_A(C9R3zk2?ChIIGE`TFfc ziUrw=vK9h3KRRyP#uN6bVyvmGP#C=_)iXroiPr2vm{6*P@;DTt zQT$wUCB-5%RLX-?^vh^CL)Ci!skx}aNWdA3H&R!b)GtLoJuMbCH$wGD{XEa8(!oaF z)om15E7E68r*{lQEj!!;`&=`bZ&#m)HM_q zFJ^UQ0iO(DSao!KwRc~B%IQa^yz@!J?qZ#D14q1&p=g;WFPBRhzBMUfL}T-usPN_F z7Ow~k9@NdBq8I7K`th%@v0NY0w~Xt;d7^Z;ywoT~*@Zm&MSAAGAf;qkS?X|AgFl|& z`n7r6zo#?@EKS@AeP^KXy7yefxi$NHF3)VjRVlCfp^pdE-$NU@OruGPcOHF`ZH@B- zwev(6epu|=e(uYKx;D1N!>k-gK* zaH|-qs?sG52Ax*t*4_h&2bSSPHzwYR z|MW=Oy^nBL&_UWPCub#c7Kbv7X(aIITAEt2AL^b^^lkKR5 z-6iWpnl0C}e#;A-w;YBpc-qDaGTDX;54Q`2JkWe!YZZ0u(lry}5#xB3Klow073WTk zmf|NdDi0Eb$t;=ab}-07sGSD;X+1=;5o736-^ugYg}umFHU8Q+tLtm8yM=Dsu*V;o z3uv6zy___hkO$RK@aABvrW9ZeVHZoD1)Q1Ys8mjCIDHXUEE8g6~@M%irw1K30+3ao1L+?&iRQvC0gXlbwdV z$#BwGRi1{E!?ik-vFfqcrNK^jZBFEGO^uCJzv*zg=TUd-)6cOtO|W||!SbeaOyf0O z8u#21>r5BVj@P~qzIXqnJUKb~e;9K5;r}G$)Uz3;=n^0=h5dh-bV_DUeO&DSCY}C2 z$DAtvH_Yio^sWD)aq60E{2!Q8k7uUA&ssdw%_Gy_rl+TeJh$-q-_4wcEL1&BuZ}3H z?%Ao1TzHf6^-bc7nkU5CnEtmVi*M6Q>xO>TJ_CM5bjJ0)-H|~!&f1l{x zn(XSIYWp?SF*wsTGxK?KrhoL~>hP!DU!VH6KM#$~zTcT0otod8`tncd^wZ+R+~W7; z#l@eCQ-@0v3tu;Wef>1I@^NKlWp`zEadmcob^hy*1@i1=Wqs*red+t=%E9L9#?IQ- z&&|W1n?Lt=e(nD}IXM9U0C@r&QLt;55nBS0^ungyWtnZE7+&3U?eeUSNERuFneOuJ zt{ALZ1iMZ}PR~=pD+Q)K6)*deaaO(QI+eNa(q!FNW_l{~-e)KWqBwM`@`tjup99qPkl}`HvR}%Q%zZ7zUP>Lno+ZM__^@x{R zk?~b8IhuX%~0QCD#*b|SaalhpAm+qnQllB@k_0ww!GSB68#!AIzV;yr>a_MQ*_XK+OHwgCXfW3|X{o>>pj#B{4%h;L>Be{_;A$OSY#q z8zV4ID%n*aWDa;%t&d}ZPDG!tjR71c_evccBj9AH`c*MbL2p}dR&-di{ZS{DRs1q=Ai7)Q{1;a@6L3Ssl 
z)n42A+y;{G{o#w~FXZ)^km){-2uf*PzTufTu`ltCn+{)k`e~54W$IZ5^696!yk(!| zyJ{70dUCf?#jRfP%{%TM+2k22DUlxG4{c)Re^u2MD-fx?W!QU#@_rB@o9vGfO!g|n z2-C3#rwInuN}>wDR0gsPjuxV@dkB?CBIqu~?0bm#fEilWJ~0#b_5%YzCTYqE{NPi7 zrS&lq;XfCwLfek!QUk}<@^FLU12Ig;cos^?DA;$IJMef9O?_&I8U*681-w55p8)rv z`ovxeo|-^q6KNE{Cai3%mPzQB5d*y>v6K|~j}nH0 z+U(-suwf$5K(OM;xi0zX9K^4GIQGD9nkvM#iMeVxn1{gKmR{EH!l28xg1vF7F_Q)e zcY55vl8LOsEBJ3)QCY30E04yh(z5$s<8)s1?338v15}7VeZs-y>hf#4i=bNLZ`t zneH>sC>fdO1d(c?PD823f+@(K+yhTJa=5Bk0zv4}7}X0_*)Qc+R1-}k{P7MZSoiGm z2fJL&s4-_QdSq?cqNbnk{H3GHB7u6t{9j(8l?$y10^aso&xwG{%bG`#5Ks=7VxiSo zY%oyyIvbWCmXix{O+uP9&oC*dahw4dU|c=T5|&Rm0iKnUif`ykVMM>JY~j=j#w#W) zeF00sCHFz<;|!a+weLLEj^LfGw>4F%fwVi)fkK&qy^zLtk2-BBTU&qa2kH&yBq?3@ z4pQB@21OT^+JK;F*R$%J-Yji}YoR@{{2S)_^Cb@t&fx>_tB;JOuGjNLY0@6*wJ@`j z)GEh$I1+S2&tjk6HW!(4+fPoQV|RGils3&%s-fSpt;N*GFDK4cjqPSc#)NAYJmJ)C z8s$LN;=?|+Fv&b!{b+eI#T4{t`P{8(VvIu6wWh4a{*&jiR?d=eW*S{Uko`(D^i_gK z2g{GmM`!Vgk9}zMioOn}WWL-~#9bBhNFgD%Q^x#%Zsb9)s5GtH37ai!ciqwr3%;{D z%SxF=vuZ=FtmOM8f2Z$~zwFQhb=#_OP7ZsPY~voDu;jd3UY+)=dt=|G5UI;M=P4H5 z)5FG8V%}ZLzYB9`DP^jR%X^gMLZt&U9&sS7PSM;8$9|yFNkz@oQBS_5A3U<-!-fRK z9LyGUnGUQaXCGx8>+ljlZ2Tqif-hNfd}_bZSPD~lAkObk&m6AcmDY^}83kRE1D(PO?#-p_1 zs=ijOU~h6&!B9x~`5Z3T@GbE-Q~4TOr5X)28Cj&_D_-cr{4%C-!MpeXiTd{e6nb+R zwt(mxKjyXlyKxlxK;`Q*lvjGfN+{A_U-Zl2d3)>oRh-ab(2eto$mFA^6jjZcW&#Q= zJCW*5am%g%<5Nx%8PGKiS%-1gWbA>99C3nEB(BrABHpsCRQEAZnNd_AaT~#( z|0+kK8N<81cfH@bk@ixQe}Ql=CRG#T?vB|c8i3>c;1UFsKRS2JQm(_VFVC;U76Z%2 zJh*%b)f*z(Qz(N6AaB~Odx~~ z`aPR|_w0qs!*}Fq^)6>pz@eVvDM96#rVw(HE{DQ|1T`VS8i+Iw7!?34shCNjggIw0 z79i)lSz^V{6UD|J5N(y=48LD3%a>$3S)4&QSQ!>zF z8-hcFjKlOZVP|5)!nGk9JJ^ROIMhamsfS%yf^oRYP#8^?dQwezdJ02|G4|O+c$RF0 z5;wZdC4wxSwyj^znfNcHQ(2ak<^bj)Gy@AQLk+sqGb&Uj>YIC1=S1XyHutMCC4=S& zR$8czEvC&S1Yv~R4Yh;t_#3FwPBCHW_M#S-IgeZfvz4!BrqGH8U~oy88ClEY`- zp{W&SR+e_hGv--r%o0oN4;6cft94&>fX76R}tag8;PYrU$Q(=>ZO6LbO7hWX6Ac02L$B zl;I$er@5zbN@O&&aDxV6MDq)a(Y8uJjaQ0rB9>cjdWQ)N^b+Gnr#s2&dT zpynmw+g0DYsEJD%Y;9|;Gz;s`$$TDD?ifXv5#eHBA=KhCr)=2nPc2bs@-c9yu~oD& z3x`S&Z$59(cvVS?R55`|c- z*crn-!(p`7Bo;MwRSZl0qfJ6|oRaftp3lh&+CPh;(=GF>q!OMN+`7QGjR*Hu=X)1W zGzN(9I$#2Xz(qWwT7#51+14;t=vk)hcnct(NG)A}#*iRU%Gt*Nh&mR^GKl!$LU9(A z$^-yRc+kRnVap1-%Av3(0>sjcWKwg9F&2tiMw&DlM5?aba4a(wY38D?_r&%l|#!}RRHx(xXxX*@^ zCa6-1#V2Py6TghW3gub%cB*E(l&V`t@c;$2Kg+-r>k?nrBhivY#|&4 z#v&dK0x=Yz>tyLL3jbR=joHunH|do2w{-fIES<&{Prpsr{fBg#Rx;C@y>m12ouJK- zx_>Lu-7pEmAgH-`HR>DWk96ANw?LLoRf?GmQ>8@U4@90}%Kk+ zGUWY{PDv%lSOkh#`2BCvDKT~WKbKC4#T8`f6#p;MDgN)$>A{QN(&^A&(y6Z3+lyrB zG`$#hnDdu(S`_pT>9j-ow{)5kCc;maPI+{`e?hb7sCKbpukNt@C~Ib@Z#A%JS;T}b zfT_2wXdaQJQ(P-gEMjrcV3Fhls8THAv7-N(bee$wkJ9NEw||jN(-E=I-_mIuv277U zb&s7&tDbsVxzm!0+F7AX>u#ss-=tGtXBDz^S_Csu?$RluFqrwfbn0dJ@6u_Jq_N%~ z>D1-lq|*$S_D>{Oi!#h4R@aI|d3Er&becz`_=KSv`Xim@5n<|sf27k@pC4rD)a8$K z8W-@3_(wYR{ik%A=ki-Rb(DrgVxUggHYq%Efr?rnDaO+WUvR%)baB?5fq$&mEW^`;5Mh!ELtZZb zQ95l1$5`WN7&hL+GtvE2wB>|B6vxOj*LIqd=k1tB?T&w>(=&gh)8#+Xsn_47)3QOD zvA;{FPh6;p7)tO4{tD|0bR0k)_ij(W5`oX?Yp+2>}}K0z1Z1 z+)<`_Qbuu%N16XkI@N>Mlp%6S=y3wF_&fSl!&LDTq@N45^}kA|tG}hwt^4hf_#whD z_%4;584m8HaL?+GbgDv@PWx2|Wa*Ta@(GUSCi(xr=z`Y#Bc0y(yL4KK8_edIUU~N^ zk%-pB{12p4%-^L`4$Ap;#NVXTH0;#HvcII$cq}IOk8~Q3Lu>w5(&^zJ>6G+GI_3By zojO6u(y8WukWPn7SpFqA3&$wLrI7)=QlD-N#9EG z#Imq+D4CJFZ!hF6Z8gOWJE z@el?1`C56D=YMkkmQG_Buw|sR^|clYO_ts0wu3eAqY_o%y=M7~wK0qKL<5SD#%pIp zu@Rrdq(!kRzCS`X+Q3coY*h*v0 zaYJ;`Kw({^bj!GZDCWSp9i1OL*Gk!qOWRd&lYz4fHW}@AJ};Oz{*;dTY4x7nk!n{fo#jDHdsy7= zL#5q8&`-ruSeG7)gJJiBGkYOMdtNbnkAJWeO1CUOGsmF!4Y{^`sdwW(@3l$n%fHz7 zY1w~nzOU!E@AzZ?8JeBw+?|`j{HpX5)w8q1uW0QU8y_&Qf;i zW$(N7^NPM-{Wb#6e%%U+K7N#W9CBvyJJ)ffs}#TV>Aj2-_siWQmpWr;Fg#EivUEyP 
z38G;9OFE?>o07&ua~e^Lf*NG$bY3f#o7SM;m~LJ(PKbBFFkADPQi_DlK+etoMmkNf z<4ZI;52ww9J?n`wQRIl5`tWVMRD}oTAI!)rDi(L2U^6{dtX6LQ&P=`Vr_(fRT>9F= z^!N|+ze}fv_NE$NgyhFJth%F(2C()_HnH8QGR})Lla;n&JhpaT>?Y8~_ph#%+CG=t zULFP;y~#asqGNlh0KU^LKQq1SO!_08vJ2j*eVsU+E1Xr{v#3%co@e&a+uE*HRymQI~nR-(4YT+{ljc5=>(*?NjAfhkR$E({B@ z#_|{o46DjPcoSTtpfV2OexODp8naG@ebjlor&?&7vSr@#1@bt$|1g?F=Ai4x8l zY>w^u9R<08i?3Oow&OI}kEHH&eSfS+%`{{t>8c};c7L-+AhV#qdtmG-b<(nTj@w1B zvpem}jv9s$w+4+Q#GcxK#b^QTBT8t*+#`cwlLHwj$wGt(4 zwDv@W`pI2hnZ^6tM%uiI47sf8gO`WmbL;4@7$iEb4jZU_T)krS&Y`?P=gKvYYu9J# z^RF6PC3;*pIbz7aX6pXI<3^)d@y-o=Amf9H5BC;!jIEzu8xt@OPuxjl^9-k=bkKMe z#_Uj@pDUy1JwZk1@NGhm!PQZpkJi1cxqb5HliNRUd(P{>zWG2+BNq&?$1+t=3L~3u zMc`gx!GPIpf0Ub805`!&`dcO#hMg!1K_nEmkZLhz;t+KID-ZxH%YSqA;eyP*bwX;+ zLu-w;OrpQXIV*q^p)Qmn6)iQDRLRCS7Bx532*O$vxm_PvzJmvV+ezoe&fDZ?Cr#Cy zB&Hop!9cib?u%K0;QI+SH#XP2X_z@6)g9k=+D00iK{0R-n(mB%1} z&oh^&QcQK4=dAL;!zCWTxETcWliC6y4C~E-Z2e#701#ciHU`*SBnAvIVn_ra<>$m5 zcQcT(b%ZAY9O={C1>k|ea8=M$AV}kxi!um9uxAV7grqD<&;SJR(!AP6 zntUYeGM)w%O9lrq%@8&?)xm6xeF}~M5W7H;SJeP%dd?RQcIB~fqcAlJGxQrM1&|QU zBaTt-ry9GUl41ay&e7g+cpw!q0|vxdIDm*y3LvM{xqF!j!Z{eo2@JApxR`6O6G0K{ zmUe>om1SC8K@?YWTwdP-CSTAtc6j}`FQ1cLY&rTS_IyK%ijz_+&wM&7Z z+2~Y^uvO3Q)--OVFTp{qqqsP}hjle_U?J4>&(Dj;f_SHNp=vocVos?~c{(?>Le<$d z=#&E@O^G1Jo@Q`BUu1@pCz`70?G zF##ZNof^K&Ip%&0Rx$t(D3s%#I%WcZ@HpA`@5*hLD7huRn3|9P|0qJyiD`iOl=~Y&WZU9G+2p;*A1u$Y@l2HG0P8Q-@&6nD= zqmh%cU;wx(9g3hLggcG;H_G6f#ateOfM$tGDpIhZU>V`-WHR-eZjxAm1+a+<06~J( z(T1nVTe$edtgtMd;kr*Y0fiyuGND($N17i!_(o9$c$M;e9C6bSG@fua z!~h7upL4-SB|`Q_i9b{skR+7f!J+|}AhQ4y%LRK2o<26;bSxD68U75~w<%*QgVY41 zsH(uXVakX$QFF;U@j!42_+uljzua0%XLE;Z-UYE&a}Ljx`7i`bQCi1WuQkR~@2JjP zcA~W=Me{Uk-?A<+GA<(E3GQC-!gm`~rd_m+1)M*xUUH%{xfR5`B;8Yd1{~!sFb9Sr ztMgA^oxgPF*0ccX_3MtUC$zm%#-)THO5MpeY5;(wZdV7DH8^>8-BSE`?~SMWh68?w z2yhN&tYf*qdDo?loNNksw#=*jAdsa~$}@v?)MV)t4WFeVOQ&LHMgCg9rPKcQGcVW_ zMF}8EKtMygnGkp2sY<8p1p$s(Ff%t)TNSF!H5!3m_i|}yvCVo-MnRN!4NK~^K`?O= z7(bdx(RNmi-naMz)Q&jD6&lrc{;@|9eaHOOA5p9*7s4{_761d9WzXjjph)EgcOBrK z*d_qVFIT3-HTR|AfiiQV6gx)wpk1R3DKN?Y29O6G_aoMg;+R9uJkaQVsQlBdVTJX* z_k!3Zb&iXQ3gH-KPU>4f*@haXSk4osso&T_hK?5O;e=1;o(4!rQPY|Gbx2{k#rw*L@RiH>4LfZpq0%jzOrMmTu3l71q4Nf0SkmU5KNHhSvjVB%)ef}m;Ku~jiPOM z^*7@bCu1gf!OXLH?vHWW%k7WsK1DW8UG4uEr!0z15rESiAY};P3qtnE#%WoJ0NFS- zl(+YSX;~7w-NnbVk!0ia&<7Yy6^1AiBV5F-Dv$tWRic4I50!cV9q0a!k_JJug(`T{Zm zw>BnL(*r$o!BC{+!7D-xg){<7NRenJ+0? 
zFbGa<>tfrsR5;lQ)89!o9#5yAzZ#$eDlF4;lGVj!egmQX9Xe(Lx}lfoCdt|mot?2E~sOa%k-Rj zZLHA#_FLMk!Z2^u3+^O{I-h@fgj$6>knKj`QULlc%d>%7UF*ZVrB&_Pk@h*7dA_oa zMY7a(>acu&2T$B=Wu|36;iB|&x12Ed!+IG}iXwY1<`mBJ3^-qV@N!EXTWjHPn~WdX zI0dxT__`p(ut0{g;Wi{Hwml+NjonSts2R*82mEX)i`+}$BsA#K6BxlMO#y)Kq|%2g z9mDj%xMXND**G;+rEZU?hSVI~;`~LC62(Ul0(zOR)}8wjd|HZdx;;$Ivh!tCSgx<` z$EY{?MKwGw8Ye%>|1wT3>k0ZfVUto?(hPmI&0x4O**GQF-;s?|f?OGw6xlfK&K?%2 zt~CYfOyvilgb40DAUmvaMyxq1n`^t{?aB+LxdB9&!hK7~L|_mV1_*dnEQbUT##*%z zeQa*PFZH5<)l-;eE)YQ?%2SPhIGyV%&t-R22#Y9?uQ*#6!hp}1t4F$5c1*YHFHY;z z2iJTut+}&|(>BJ6#_G>W56r2PfzuF}<-k@pQl>q^rCW2PoXDAP@J#qOaN1pjybt2y z^$k&z%WPr0xNjYCImVNOT@Mq>FX z0XcsapvcdJ|GEsuYv1{zM;5aY`2=V)plB$+o?5BO03Zkib>$E?2LK#{H7 zQbrTPFG`k*Q>yM*7Wg^=i2<}>f`H(aql9)gNx9cPun8$(`8?JP2YLMmI8AgbVG79Z zBm<`@)%DxhcfRbQy#%%z;Pqd?ssHIZfLPz#pILVzVD}q1<#BqRH9GP#eN^|zxccpA zkPm#}t2`Mv<%l3?_X59x)8?RIDFP1}IGtb`I8b@B1sE|9{N5syUuAkfX@lr$5FI3* zik+A0ZhPwS2RQBSPO&F&@r~c#SAnHWzVi+C;M4VZHYGy_PGkQ7r(r}yn4!DI_$vmO z{I@#jGSSJ96VgqnlHwy?00`%G3S|%~nD>qHKw=*AZJS;?*v*0O(s9>noIGJhB;yF=?conm&NVQj3D)zgAm4+u8$iu1 zVQrYWgo>N+T2A_=eMBT`LeIvhi5Y`gu8A5taY1XVphf8s4hN^_6MF4PZ zOwdr>M^a*%T#ZOe-%RBC!~nIq$uH9%#njwn;FR>~x)U|RP@X7kuJh4!Bm<4O1-*E}jNMa=+3Y8k_#d(sEYTLinJ8nt#i>OOhH`q=PZpdn zFnqM2Pc;_YA#8l%ab)R~4<4iYYL@$+Wx}hctk_w;7nZ_5qqvUorw1&(;zH$Kp*mw} zaWqyEoK}+J{}bs{+)5WmEjnhY5|Yf-NU5f1t!`kgVfL4Fs%1N;^5Tn49Ktv(h&KhK z|H;~5&HB>uAL*24feC|TWU(<4x1qHIUA<>>lfMp|L2DBwN9f#r~Q_n>AZ!ag=@R zlYYXYQ<`lriRxabt(3|gf8-bE$9Fsz!(9}qyRl22fe|>?JO0|Xm)h>!c@eotTJq$y z3rM2y%1@wfLHQ)T@e*JlFooHDsy+#cjtUmHi)>sD6sKFp`16L@g_JPxKDMJ>w+n~c z8S6?fn!~+wJ+z_CC6pmp>KWjj$w_CEkAAf;;fhY?4sAc7OJgA zbKdQGeAj8-@@lPWPsrWA#?@1lbM~Wm``7NiKfXIi<23aDb?NjW{o|BS??J@Nnow^$ zH<6~djxG?g;F*(xxE2PRi`}G~4NV0?Pkfpwq;@GJlf$K(+_*DBjY*s2oP$JY_~XG) zncdLG#I47J?tabgDT8E6G&E(|UHOgsqx`KV!4OYuoR46L!Y)FGY-*-#zwq4FesljE z+D$gu-A@*Ar5kD+>-MlLSQoS7i-+F|3)b8Vb}fTh=0mTzARgj^^;6&}o;$`F9)4j6 z)BGS^!H~=pgvk=(vEBWYe1zob2M@7ZK6?<`W%m>^SSq{qxCSAZ_p>!W)Kk{|aW~>o z3PN=^^d2!#_|i{TjJr!l=;`s$$N8c6-)woOAiT}DeDfc=o!d`Dz>asqNK7J98a)V>>v+eJi;dq&3~ z3|;p0C!m=u2*d6mLl&}~8YDFys_%kGdbsaOLfEHX@aW{`X-`L*IWV}yF#%JVpKV?(Y@??Bp+L*>o^6(Nc$l%$tE=(>J8uf(6#Wt{Gw&~58ZLdTvAg;G+wK`s za{z)~{O-vfZmvCoS?b>Ep2U~#_|sQP?(R$4_Cc?8aNOt_yYO(Wgj<;D8>L_X{IvMp zJzUbO8j4$iZ<9VSIkf(2PL*rB{F&3xZf<}2V;aAb>J5vxJeLQGFNSW7+?Vm7+gBkpbU z02@FdT!RgymfOPy(dx=_2Qyy29wTHfO*bmzlU}YRu|ZJPrco)uN1D zfqk<4eRVCZbK@9Eyg!rNFTtERT{)qbi6-7cx$&PWh4Rt{=4tYF+DC+5y?mzYpfEp* z#GCrx-s)mB_ocmf?wYE~x5DxP6Gu@)Ygf%gYg562mUfs~*pSCP4u3g~S6wRu`%F*!r zu=#bS8APdoYk{5eU2pElufw%b%m0>6r4;z^zqEZNIw{RvCz)A&4kfx-zlGv@_|tqP zdxZ*5>ML^ktv4k5|B_Crq_hYdm5BEjMrJkNt59B$9=gEm#>#z9ut9p{k92Bu@q)*Q zW4Z4?q|@6LiVtY}-;e*1PVEaD&Q3ek9sRe`>0{ZsaFIsY`6qu#r{@w8wHQuL>LGz7^LU%dM98DawC;J^fgI?QhcQ!Lj^$3#F36M#m{Xh0Pw3 zCWWnc@+W_!(+fKvZuwpKIqBGRVfSO#$9Do5@4j5vUr5W?Q?Tc}+QUd)XQ8Y0HA6}1 zXs7GswqDnynOX?##R+R&-T`8X(q1<0SGPd*)x2hyB*t;gE`zq)@EEM`0+Xf=@*$hXxK>P-xb)4$ZYRt&6Q{^VM_HxZ82lvvHeT2)c}7 z?Tm9&o~8UqAPwEmZ|9ekkSaI4!H=+1EA% zHFXH1?J^ZZk2f%Q(eARK4edD#!^lpgvX!zk}T3v{e6zvxNf z63rF%I4b*R2q<%PD^9${I2YhH>dX{(VGr_K-|9LeoLL;G9K_O*-!2nI8#%S-Yvr`fELmK&3Aukc`Sx*h8Cg(` zJuS*7A>2~tgOfBkSUueq%{mY)IqI=!%_4|O4UewjWr{DGRoQ0I2; zlX&6P?drG2tE&L*QA+Z4yM9T!Rrc^Enu%?gD-+kMZa)=Mps9ZQ=C$$b{skvvuOF9K zQQcDYRevTff?7v=-<@C#k8f3FYvBVwnk_3Cr1pCNC7Qv$sj-k37EnoLcOQvbpIN#d z^4)^nJ(0zKs-vu)#wKpX?WGdK*~^ayHS#VB-o4Spc3m@qN#G^*7kgfj=}EbJ6;xK8 zXT&Kb1Tn2lxh|P+WkndCJ@~-2Z^+Dn(g}Io%=CKR$jQ>t+mul_x%Mbq>_MfYtYJ!4 z-KXN9f=*CqX!z;mG=l!aQ{~K$#51q0_ki8L51W=m-$^Je{^~OyIXL0)rszGO=Wc!}C)v4{h-YA(=S+pSLUfXAO>}K^#r8;^&J5G&tORYM>HY@C=;F__N_7v6P 
z`H`C{0(z$upFia4`0$K3OVumiOW90PoPvU@6~rj!Pv{IlK)l!TO`b3mm$H zL+&EiZQc2EYBY1KlUJnv&j}F-Nh(LsEp^}?vzxgr^RGc&>wjfjD81n?@&M0YPeUAf` zC%D|;N{91jPWH*t>CYym5f{qt$0~3ESyl0wPnF$IBG&Ag~FI$wsK{HC^K#3K=}gK^94ig z1go{BY>=uUHvCg|pDgX9szdqfa@pO8#I*DIu%}qc|^2Wc2NAC`HDk461w&u z@L|eANKqwXE>{c$@PIcZRMId^*&rXkj8wG=ljPv#g4;2MeH6n8@vJHzJiVTyeZc*{^A341|cm~u9cngHL-&I3s=Vy;fOMvb;yS(tdhX+$+a zD4k^!!l+!(7Pkpg6)Di4I;ARv6y8CBW>9)|Vd|Z<$O~Ac2oaZ8s7!UM{*=RbS(v0b zR@V*1Q%$VDORHhiD$g^+W*#b;L2_E6a0KKij7Q1v6e^oItai?x=HcN=(UuG6JL@yo zf=5azX|wMV8S=c_*bGyw4uuKj^X?EM0{AU2ywuPM<9E;J*$F-6c1JDTk@kYNT!y11 znNxTtQq;8oHiT7y7)a-HbNgVS0@DySLK4?jT&Gp`y5m$eQoL-AE#*+#ib}v)Kwq_* zSkNa_5n!Ndh!o%C;i1)H44UCEb`l?*CeaN8j%bTR@_7m#us1)1d4)>av|13vQ7Ye;OkbXiLT49+dUQ~0Mckiv^L!Z!2Q1t`Q9 zG0#n~at0a7N}+U0p~e?ZT*!A|Oel1Zp7+w8V^Sivn~GER3Z)amUd~wvC0MCIBDhU7 zKQy%7ky)V(!!)Grqd}#ygM|xeU2>ZXCs?WGgnSymY0Z(93Kj`NiFAU39u^u{+t4P^jT**fuBG7sg0sp-?(=ZPZp9ITx9llrBQ2 z^+K8;CRRuvv|s~7GMZ~g7`JNxMT|i23s_3xKVzW@_9Dw9jNz?X97GyG%!6>O+*%l^ z$GJ!;u+~@9BdT@}ln%emkX1(LBW=|4pm1)xc?mC&k5D@^Lzs^UHvGM?ek_JMBgD2P zi~@8?$rY(Lp~VM@RHtwj-XyszqN`XS>=@n(+94GiZL!@XIkSQkT_qJ4YggzFp>)R&_Ww>SNOS z1hRxwwQtFfR6?5TAAhO zS_JO2_~7I>C*(u4iN9*|>W7v!w87+vWjWGS^+WmIArY}6FV@4fov&KrS_R~^RX*yK zaN(TR+s3_tW0~TzZJ`~c5yG(~LJ3_*Tq=ZkKf9FxfK9D@l%8u9^P3n82v;=0B&@e7 z0799oF+op7PUiRjR}lmEYXpdT8*je{lMwEl01dsaVBPT_`unR zWicmVR9mtdZG{ivX994NaP0XVqyVJjI766*`#@yhZFao2t_dA?cc=zSsxXdNnvczn z56RAfsoHH7GDHb1PiyFe&J%y&sV3t01AdOjgi2`7UP1NJM(~2PSQy$g!owO$kStiN zlwl~nA66?C%MwfAQjzT8$iv7y6#y1$xuVrlrng#t*#2psz@;X*NqU8=#G>f^L;=*R zEOoBd&f1bivQqCv>m#T#);djP>GD=mlpU8XZ^ooqJ0JR{5NM;KO+z^{|f{GJ| z9Vlj&-Wgx#d;F36kZ`nwZP&-L>PYH2bjlMH*zRm54EQq04R}f#5rh(EXr;@&iw(dr zy?bIy@Em2VCJ{ zI!e_ooSWG06DwLpNa|`W$&3({J}jsY%c)mRO$<*@WQclw-CzVPy+3h z5yBK3O0gV9p@b5GVQ4o<gg2v))&ls%-CQ5>Tz$F84XW4*m)P3{8$ zmA*ItFve=D?jc1oT6luyrRu5>z**s+02`xDj%wBfglM$nGyFf|3l< zfru?>-pb=4oc*kj@Ero`u6U*_d=~V@@l0q{1lLeihdnXIC>UZd!6H>nuylq7ygL0? z97wp~sLB*?z3$H2hr~*dzV^i}J%%rdyk8h3>h+2UbZTo=uO@peRw%E*~KQE+dI}n*X|t{{+Q{Q zOfrh&HUo}MH=OrXxY@pni}=#Ey@;Z8&HB;DvSwSv^>*?=b^8-cV1-5uGBh&1{(vUUCEX2Kqn zBKe#Nb}9+#Qa!@ZM=v(CamBV{7nyO#|L%Q%+d2|c3i`2Z61pKPl{TzCV}g!h#0VEo z>0Lcy`OEZ51kQ5~>>R2RRf*Uv-y`}!y+^`d=UD*L8IZi8VxjU7)=u5%&f>PquUAQ7 z9#nKk#erAu^rKpes3tZJo!5J%kVK2ow9 zpdl7YQ4p$<0VpVGwUF5nC2f6qK-xeTU~6!AT@N1?Y9Z&u-mdIrVm;w)-Ho)CKT~Jc z74bGg-(vt--N$GW+Lqw=?D+)*t{G7g*c?p6>b}FQw~uS}lZNT6d!qz2&5CstKeXUe%x&MV$>ZlL_P&2~vI`3X};b-~gHhWE4ZOf%! z=+rQ-5M{MGHU8L7ww+3>4qA6Oa9jW#{+kCfj@%m&V=Zom&Dm?|uTG_j#+7xFW+^&;8220|?H$fO+O9+GeUN)XT?SG-fC_Qj+IVz4oRV|B z#H!~?HE(DB>6*&})jgDBbyoYIsd+-FQcVtYpGMssyiY@~uzId0h}F}nCrjCPYoFE2 z`4wgr#EKhhSLj<;IWdSqzaHybk>lNssLyepeCEHs^_6`?oo_pexkpe2hUn{n(Y;OX ztF-FsCO2kQ|781IY|?9c{F2vlvpdhkZnBxihClw;C+{*}eyffxF0^O$!QQiu1kUr! 
zrrS4C;3FHC?jJeXu;*qRH35Qn!9@?4L-Z;04-dserk$Dd-3D#H@nZJ8%RXwU3V-?VK ziAkrjPCVz75ZfZN4&(??6@fz5YbBsJqnW`8Xm=Wu2po(lXdewJw9J% z?nt)a+(GO`&bBFPh^$9vEI^;HM#AI$wAzmG-%A-dA>QQGZ>&NvR2#SYQPEajxx#HI zkSb3)+&HlV_}svzhb$*vKO)DRLb|bsBS|vRx#v4hu47YQnEK&U)ty zHpMOte<{!H31*PfDHbo3BPvo)ND<{}IxzF5v@%S34PV@@3U2Seg$&|xeElG}K{Vio*i5yyONvkW`LG7g(dlQni?ka2gGT-y0-n9 zEj}c^@bM>F#OW6+>)JW__{N;!!8M$n3^Dtt{Fmxv7xFbw5nP#z`9Gx7%l(Z@n=+b~ z>z9U5bX&LRl1)hq;>tILo<4|Gp7F~aCA+c{byes~m&2r8|6Kx~`ob=~;niBly%T8( z1@l?5(QU9B*h%2(y)ZYnH5{XL_F6kZI=$|_R!q#^?TjXZ)!3lf+N_*fE8I{v3Yim`7Z`@$4OzH(lq27yr8DX-3lyEeQ+zOzr+S>$zET*HGdnmDq?J^8A=cVjegh}S38D6{bNEG(6@~pZqb()9TRg~i#ZZhTNaN=My|y=mGLmu zQ?$vQ^N4jUn_-`<)0Ovu-tq3Xr=J~d7klFoA2Kw{K>DRa-3@)mD`oZ&&U{&4o9z0X z;0Yep_EFIBNN8L{wgb0hvA#|cG&vK*&j)*gUebp=xJnWp(?#B6a=e;$ zLgt826+a?WfuZS~@CZaGCGe&@cSetV;^3X1i<6ErgLnH<1SE^U^jJqGXVuCHs5V{h@pY2T;e=n3Nq^Dhq>-H7pY5a` z=>N(h`fSdc>mzlve_znbVD{I>ImEDX-xa3nJZzAVVwQhDK~TL%OCzAf+t71;ijK2K zQ&967Zi3f0;eE==qlqTPtt3ExRZ`3mXhR_cZY12Wv%_3$Fy2 zs^d;|*wcTw(1YEoEaJa<^&0iWM6?1~)2ZcaXQ{{L=|<{GIaeIL*WQfMr8N{hU9r^@ zy*M7JSoels!bdQ0;?s&vO^2gI!2M?y8v%@H+n!RZPi!NHPt!h^U8X6Owd^19YLqYg zI(Ow<*L%~7n(7j0+G_ZqcbAvtk#xEmsXSo|zR7~)W?G9nZfQq1Yq6(5wp7+qY)=~3;x~ptW_>K3T1@K+w=ErbVXQBXpIc8Fw6*85IKyQs)sbA> zEdi5d?K(dqbHV>Su|&!&&LZL}K~SA02&!c)@yPT+z&z8QX{qSwdiIj#0)uM(%K^`D z$@Pg#2Ioh6QA~0%ue&3Z2!g7;{2I->Y%-ZF)*e=$>O9{*0?xtGdy4{>t2}vGg`nOA zGkq>P{t>Df$TnQuyPD~pNNZ;PW>}RVsI~_5 z3?C7yEn==xA7&nFI(w@SLrJQx@Dd1B(IY~Ygaq|QTHgf}T=tO>fTF~o``j>gW5!r7 z3M3koEaSEB7U;4#^l`-Z-s|j0C4!)en$+^D&!lyj(j*9~MD!v4Bi5>K2!bkrK0*1m z-N^=$!L%bm)gxN4q1=4VTbZ^W#p%j1!Jm$8Z~E$ny?DVDivXv+3Q%lD7N*rHW zAH{|F%bcdpw|Z~qSle}P+WF>#FEcE< zTM>#9pz`wbRCI4Qr7N09oLrug9KInO9Y%46rYXL8FIuOi64J7KeN|-%WhS2!bHHl1 zdet~SN_UzS%pt{Mv6={8&%8K2|BX=9Cuu2-ziK-Z!QqB~i)T}Lpsy;LNYa0H{O)SpTLbTr9(X?!+kwszAsu2c&*@cZS4Voa-{$LWTC#yVBh;>v*>_*f(<- zUU2P-?WCJuS3<;h>I|qda!DG+_vjK{B;DDSXr*bfn zhJ2aa5H*pE&=;i)eZWH_*FBk7YHc`BmbE_~Pkq_!AD0;r(jc&+W@4b%1d< z!<>pZ27m;a`Sf}L9H@dPi{~B(Rg5=2CWDESZJl1x#*aV<(R=q!mOhfW{3yUB4k35= z0UYMTeVYfzQ3B9zNZ6HmU-Gh~EdDh0LIuvNuXTB~^(E2xX#=plD_FV=EV~TW5`zq! 
zo*s&$rH8g>Pt%N6$YK+CMQpj8xJjfPAR-;m;xmO$-qW1Z;?RIHAdNWQ8A4Ac(t2}K zs=|T#7zPWYHmk=JveHWLfgHVj>K(sBS1ay27{ur>PQ{fsfvd(|jLb6ZGU zA}c~HLpzT7P?DEvo)>uqMe+c>2zU<-Ph{6alUQDzjQZxNM)h#Kf^rTTalTIE9Pj7+ zoWSwch-)E{>#YYz7n*Cs2)3}swcQUpG~%3v!rw}_lC5*>C2}hFu%@j*-%-X^co=Et zQQ1-Q9Q%0p9DwDSG>fMu2b(cfaQEQLvVqMPyjqomI>tFC=!OlG`1-r~80l6Fuh9A6 z`E4r)GJE;WWCRkg2)JDDGf(1;Big_2)op* zzFxv0-uI5cs7_PH=JER99wCbpZK4xHSCX_F_q|*fZIuyykyP4HDc;6S{)B3MIaWN5 zNc?d4y6EhH7y>OZxGwQQOj658NEN^)4C3f)p%8(}HUp@o{l4BuF(fnN1Nh_!IEK!SeQ0q2GTNRIxUx!ZFXr6kaEp9^9$E{7Wsc|UG z&?O;A52W)gGuzoxRwgURJy2DhVWzNU%3fy^$%C3;U{TX- zQ)J9Ld2|z6bbMN{-RYAL^PDm_kKbEoX49hUg~-ZaC^$k(hHRvxb?NFH!pynFm(HF- z^s^9Gk!Yl}bCr>t!?3g@X^f|&sk$Fqgw44ZP@|oL}(d?@&eJ7q-glJ9X0gT0yzDPuWpNLBiMR&OV049er zS&RVr48eLUVEL~Ce2V2?4~d*i<`@u}jx6hT@+4InvKTW_Y8@_~gpJ_EQyJkm?NqwfeI%Uklu_+bEpFSS_E@Ibk&1oGy7h zbkg#5${B$fpf1UhFvsGlii~bK5tgEK=!4;iYyeEx)?kdEdc}dm&K8`V1nNl$2)S$v zHsdwpmg>SVLYQz=vFXI=Zi(AzDe3-*ZvShP0psTatgrdoE(fT>1E2Z?7QYVMV+xA% zRh0b9UivzqM=@y5JLuWA%Ovv+WIxLgGe_69|5A+veO20fZoOJrS?v*6ju%`Z^oUWV zSTt3yRna%|g89LH&cPD!98WM5#)`5BEfkIPC7Zc~_vQ14D6F*sH&b)2J6|GQF*&HW zHuwX*m}pNKg*d@cP0PVYFv~@fCP!YaBXL){_0jj%CBL@Hm$y(#QDa_F5|aDKJChV?KqXnK=LtZ<-sI(glkHFcgh)7hO^2kr^-@Kez{?7 zo@O@IdgV*1EvuA&G4K?~^`<#T17o!3`IPkDv>U7phm&#{CtWik~A*Q$Kgk-dLY)|>vv!QP#-#yO~u|e5qJ4gI^*lmeQTqT{=&mcyMFV)^bZQr8wQ~3)@vw&^78T^r z3X!ch5BRQcakK4o%gEIWja0}Qq=E#pDn-XD!J%p!bwa(Us!vX`D|uBH6+L3#lzvoH zHVX+F2IZd2k~^dNRMK@xD&;B3(9=2Q8neI}->MpGnHpu8S|iq!LCnRIchc7zs;%jt zUd`%S{ZfnCD+$lqQ&>rh-HX`HjAThZ$=$(u-DHyAjosr4&(BG^5JV0N6f@&s2Y3)l|Q)UbO z>oQ2=o?Uf4^CpyPA46}66`ZIR-*1t&ye2;(fA|&;?ZmXqwlKzSfn({9pM`Bs%bOgQ0BK zlxNDL5z6D`xB&uSU`!a@wD41Djl7)z;RV|(<)-i_a9MS`E|Nx)`5>!L`&|)(Ba7I>VA`^Zsk)Mmq79IwB3#RLFZ(bQDI4Jn6$WTu(jcpW-0rv5?n8VV!Nf=1#v z@xMx^uf4|Pow=?LRh5%-R1RK&K4KIjk`2wZd$~tmWWz8q2{Xn3nQRxY%uIg8%!FYD z6CBthLyQjRczTvKnef=@R*p^RO0a0=4~EbPtxka=$Mo6Kd9)Qw*Q`N;IrI!=X`ilCa{v zgQ7hEE->mXgypOpewbSM#Pvx&Xi<_PE4eXXv4!GY-R1l|s>~^wU`_fVTQWmCkYzs1 zBu`;ek!GWHr9P6d0x(>(m=^7*ZJ1f22(+;{|i@&#L_-krSnqNlm=8VPeQ9@L5F z%J%VH!q3;MEu204{2aF5YQ2xr*h1@BRBCpXK8kCNxZF1;1VZqu@et%$1dIJULf1R0 z^^9nL; zU;0t+J+-ero`cB?%)GczHuY$RVm@BYWR)B5uib}v8V2@KTz}VAQ=+?3|K*L>f@B4{ zE_3cm|HC&=UB`csPQ5-<;`|jP+x)$najZNzAV1D#(oTTUZQQ?EQxhDPW$Q(nrx z-hP)ai>mD`rO=w@)1fq<3+-okby>Jy7%e!|FQ|*$$&a5d_>pLDP^pPgnEefeSg zu<%l-<=7g&Ahzf1i`+2fR4@;*^o3N|v)fdqNvT&(xDce%2Bt?EZZiv-8!TuMlXxAr zQrneWzx!w7`n2>EA9f?V8P5nroI#K59x91GdE?e~wU%q?4R1NkusxH4pQX#F(oH`q zQz659;sXGPv+qF#(bP5pE1c7DV>fON zu~2gi%*}3yol|Y*XNXaB^IPB0uk4T*ID2=H=|gO91w17$08wn~a?_Ryyb8IHDzRmXqqV!5hR|=UhPov`xw9Tp8&r`EYAhMl>+&z6^ ztz^}dpFMS_)y{5B-B&C!lxed=onevIH%g2v3y^_u#efKJe$cluQ9ij9f|M`&gl4nu zH+R=IpbNo1Yc6EEbr_8=V^Y4KtGd7Llf{dQVVKSXZXw)>B1BtOLx2twj4!p@NKI#` z%-l&nJNdEhEfaGbGdjd;ir7*~C#{8FGAFL;#oUq1$Y(i;?|@ZS+~Swxai`bnC&o~8 zW)HbXZDYB20lCd_KqC;#5)wjr95-mwVUki-z%P0Gg@KHu;8ukmJaL^=#Q@1|EflFi z!Sd3GRwt0Eo{Y{N9V6zo#EjMiDlzdxgX=I9LHh-g>@xcL*Siu_bojvbL*W@f%SE-JRxmHUpD>AVsN)<=uXf~6BSzAS)k@F0tT@KO(qwNr2%txvqkCY#&+4uc?vAky0n^6;e* zBxWLsogyuQOWmF_n9K5<<8rehu8iTN9TmYf zPfN zvvV;;1P>C+busd*~JyN&LDavRN&-E)w|mGrWHqrWd%5` z+afdXxM}(0#=t3n87af~<;NlYsaA(}ok86W5BIdkZ2){5%^XYRq4K0Q<&RsTg5z+3;%A19Llt3TW6&n z%tydmyM3__<{w1WcY8hYJkcBU^t6&fFIp>6(eVLXW6&MvBT=0<9=ed~&+8f53qtg^ zoCF^Rqorl9c%7LKS~PG`=@<3X$rPW=@ULsv6;ClpZ}y@AmB=#9lgcBd9xs`X-yM9i zB7GtbuGh`+9=x`A=Pt#1Odv=3$U(@{vODTd7fKCSac%~P88kyz`#e`v-GZiT|9vOs ztiH~bn%d|5FUBIdPrVt5(Wp}v5?U32S-Zn!P1Y`(vG0vJCben^>g4?yaG; zJJO(+J_Tkz%W>_ChWhDp@8x>w)eNE+&y26!Op)PHeMPyzGOz)!nwZmp@pLalfcecM z51H%@J3AQ%;vFV9c*-oJUNTKaIPKZ8 z9>2!Ke+iFNtsjmiyF)1(%@nKUpk>~vFg3LtC`QCwXO2}5CL&cFwWD09IL7|g?~Y3X zH# 
zW}UodRp}sCsU-Er_(BM7_(sVvC9=gijfh%2m-0Z0Y`u?63r^ERM#VG>POYGP-lC^^ z>oiC!gwN3wxCmYVkh0rT#YsV4c1SN(Lx3!ym#Qhx<3c93LbQs^_4Fj~;mOf!G!>fH z@d>14{W=1R;OF=N%0Z)p5V)uk?b=Xi-}Z%IV~|5GIIqu=W6p?#;)cOuI36E1N*+!) zcak_3zNzhYHk~!D0(cUl^c>Fb(5HQhlvHGn?EFCRd2`ck!6F*C?=c`ObHB#&+$5)ltNSGu#@jq6lbV zT!+L$jN(8ZvByyqO)|(Hq4-&IGgp?l1$~V{A(fBr9LBtlQ+SMcihPZa!KK};8!-pFA_klQV=p_$TU7l z>Nc3~a*_}$Z8e;jN{d=nliIh!NA`R&PYW;o8g*-XqD}iDF{cd({vuWK8=@Li6Q?dr z3@f6PDJCU>Ci{jY{er0Gk~Ntpj4gtF6VeVK$p=ztBxvw)Wr#@v>IhM7Nh4WHQ$~}I z`4E6-AZG>DMd7v%#;mnT^+AX2f-X(CK@V%iWaMxJ25mCkggz`ckc-)5gtIfTJV)`l@;}%wjvOc0S zhS}TxqRM?lWi6L}F;puK?-KH{l5pZoDUyL)@_N?XVkwft+X3rvVsStsX#k-Ko^Z&T z`)-UlPZ+FXLrnf2Of8aGP(<%mA-&G(u-r!^8ggHfz#*|ZuEV2Ql_PgX3(19h79pM7STe$e(^=+tZhCQCwhlfsd&lWX#4L)LF<$ zM)vu3p?w4Q?pT4f2z+vtv@GgjZ(Ba6RJMc2qqQ{(CJgOW5jbDeqlIIIqN^01R7J~1 zcR4Ex1EaZi6G$I97nLwlFhL)osES)XviItXHWOo0RV?D(5Gh0MCcZDuwFT`SFV2YO zdaGYja;#8%7kF>?u=ueEbNecgNvr5lG#A_P(mQRmnI$E6DyaS_oko|vC@JfDS=RHu z>=o7Hev!w6Cw@q$Dv$51Qe3Mbt_*ls8^C2M{G^VG?2d5hQpuAKFQ0sTe zrgMR+=AzlV(3~I8sy0w&(MsNvmCRXaSWG1+dnJEYrT9s-U{La=uZj08HjMs9j*0+V`n`)8iyyr!DJWHQ$s6JVX zj$vbhlINYQfq$qk7$oW(uP;h$=sejN3vJ9+ZK`}$7Zli(X4K^Otm*kP0qvL1)VL`F z3W^X|=3wp^d=~R$X;aNOdvQQxc4?zU7Hj*5#+Ty_RW41dF-_%rhs|kU*jGN(yzOe* zw|GX}-CRLjct#6k)F4oTZb{N=x#rT6&iy>y;(22tX0x;qZG@qWt=r3Ln)>j(8H#lX z#E@HJou6Up_j#={8B*PO35aT$5BwQ~IwOj!cbdLNiu!f>6=m{P{X}eJAhx+5cSjju+AUZM?DA zYPA=KC+q4b+MboYe6-9Pr9#W@&+uvM?86q2<_Q>o5oGyccZ(V{X}_zKqZ4h=If}+T z4(cI^?HU9%Ms{~KaP&5*_4dVby)dIZuLmW)MG-YfG3ic9Hp`2KP=!m;ve3KFI`kIB zw#>P*j%CwUc5?vkHI8?441n54`yZ~zGY1cTZWiPOUA%j}!(qEP`c&XN@H0&K@u* zOStc#fnN=!zGzbDLcmlwO&-sgcaO&l=chkz^GTafuE;H`G* zfnm*)8tQea|7Id5)h=W#k%r~@Dyg+$Yj*{?S%?z+fc&O2S5*_FESUAKCRyS|P%Xn4 zlPCDpXL90xsjgm7;0C!{h)GN{&vt^-lTNThH;=vzL?j)2;j>?&X2@4?{bccr4^`6wpzCq0=gjg-jer{2x*~efv z@5fdwckzPV2_p*?f`kO`E`I$Wm+EE$cmpCfTd}e{Wot#mq%J+;28chf8pnSW2HH#Q z=Yk&!QO=(BhssJfO>MG!9@3=d&iH${-Rz@i-ES}R`F#Epcm+>Zho{Y1rw*5*wzi{w z2%*MlhWQh!zj(3;EVx9ILWB{Vw@RLx0RCzy4TMDyYQWE1z~Kw)p%T!(cExuN(IWAy zD>lH^#_byj$m> zfiTo0^GaaH7aZrYrUrdiDA#`p&NqjN?=}e&qZu85t z$W8|AEhzt-tTx%}oJ5+7yS&M_$I|WAjNcfK#l=HayrNJH{!$}Jy83(*?sq9?Gy}n%qd-*cUOj6#hLG|h5Lc) z+<5f@KFR-J=a7}VcoOET=<~|)%U9~?;+2D7j6#H&dSH0qJeL>J*-StY-yDlrxbxm` z?}>^fUw=oGp^xng*zxAkxd&kemy0!T(&`km-*m!=D!invjDTd7#1x8S__8Ys&DmJV z)qD2yr1CoD9LR)kZrQZocbhQmjEgOkT?1QOY1DgQ=Vad{&MUOfFKMCa`VRitK&?}9 zCc%9HY-x8pB0=yg%8ui#K{!v&yEK>H`?Jv-9kSt93Z&S5tvFug`W=|_JPxz6uR4P` z1j-p|k@Hs@BEiQ@9?SEO*)!{Hpu!|m9zGbne@esirRpjN)+>jy&1VZdk%?7mhG6!2Nfq_FIc?2(u0P|Qj`?6W})0d8ZSF^-`R}QzthnZ zsCP&{qpR$gYBqB9lwBvj&?6L(MSEl6>M6jIX!8J?eI|v&?-@ud>QQ*QfJuTvAr=WmW}ie7==nUf{SJ zy;oV}W_QS98XGab7HujU^D2iQdEYR0=lGl8{lvbGu$C7yIf1K@+i?HHf^~5Qiy^)H zV(tBU*JL(9bxNT$^GT4q>5GEj*GT=W;{BbXzc!A-oMbej63#1_G(ElKzmV)Wdq+>h ze=hx!)t6eX2V2pi6P@JQqG=VR*X~KG3k`mHQV|mP71Feo5?veN$N$pzg^!>XiNp{S;ojhlIo78^R8U-J^% z+F_=d>-#xyN_8JRj^#p1rER!uHjxSidcgg3M`0j{n(~Yan9jsVph5pmz!e$d&FExR zr)r2Wz7Nh&f+QE}n+6q#T5Y^j(o-4fk0(f{$Q`I`_^Pzx+D@lL+Ex}1_dB+w4==Uf zctV=c#ffgm^$}cm6M{^Vc-}UgLQn~dBeu{9^>+(04XYD+KSr{8C0Gi9W$qlCZ}rhh z;V1)o-)ZA^NSRuCcVEVq>c!mpjg_K8#00mR{8j!t9Xd`jsr3d=9g3HVkDN$|XPMtj*Y97^yIn2aa&o;a4$=J~x!l+~%w+-(VE)GxPvP zdRhONig8lfP=VmF%O^}5jMHj{3MC^h8#t<*%^VqesMvP-w0FbVoUNfp2+}Kt5h^D6 zOv6Qb$F3NsHkdqA7%n~&amA!i#kAzyaEV#lm2*!UOrQ7+ms*p$n765%t4td%yL8Ou z!f?a6n*Sr6>Uu<*Hrg@Ojn>wCauPSN<>Ss^=qB-6J0-N8T8UG;)i2 zYIR%de~PYp{ZG28SEj{()K!CXFHPq9p!3r5^B=6|yMz|FP89^DKE%~L%-VQ(J@SzU z{!z&6qZ^q;Ez!l^4aGSd#Xi#|;iaWz@n!y-WkH#bdzzo*&pe4vEWeRo-ZqDhORWsu ztPHQJs=`%0%BYU`v#xsoA9dA7&oj!Ozo~vcxYd%?fax2;RDZw}mST@|)$g=wMSK4L z+i2BaUDc_D(Ld9w1Xp$bHbt$Q2mHx~LHoV 
zy-m-yKjB+r4q<1S8^3HU4whQKYHr%wem3|!T9r@u2dx_Q3$5z#jaF^1!iO`QQ!@NL ztx60b(5f@^0>o>?kiVf-c_C+hqg7W3w5sYKXjPu+f+JeB5MpYg)&3`1_3BTwYR{i& z)rP;MRjGxSQW%bCRr1ub4;WE{YtAQB~+e*2mMT|^8TPz*^g*do+DaS|A!ZxAqCdey3H#!$S%vfZ-t)JFUOas%QQyS{3*wT9xnjwCd__v?}QjS~WcU zRPK99o)!YFs`Fpas)SL%U(>3`e^09dLLmd+Y1Jf~pJ>&5_JO~pRolXtwEs(5HRn66 zx&%*WZ%+J`R;@XrRl`6I1X^_XXIi!5_q3|?&$O!F=q>JFXjSdSHa0-YZ?vij z@GodpJJjFOs-l0SRl|RyRo9Ma)iCh+*gw#!*ngl^;r|V-x-8KA&$KH3&$KGm>;rbK?DRejY6 z=Lr8qt1=6_+7Qe9nN}_NJ*`^w3$40EpjEBD(W*7S(5lIQN2^Z!Myo3P1+8jLpjC6d zexg-@*bvj7Xw@K1EXW5Q{+(9UBhadRGQZNQiDwdiqE*-aBdrSkds-EJM62pUz)2xM zi|@25&+lnf^iQ;EC=gLiX|W2lSS4*rpmq69tD-42A&O@Rv?}zEw5sVhT2=cGv?{}& zXjSw#TGjlBR`om7{7Z*AGB)K zzo1n&fU)0b)!3hDRm2Zkwdz+|H4mu#FKAV_@3d<4zo1o7K)UGPXw~8$v?}5ot$LyY zgjgl2`Wsr+{yVKI{f$;#`h`|i{zj`-N&N+_iu#FG^;`Xqv})yF(W(u;wfsh_hWw6J%_Y#PtE3o1Qj3HW4tQ!){1L5MRPh_FT2w)L z4F40Yn)`cNwd^-qmEdW7r&VL&KsN%d>i!F@>X1vNLkUa;7!snrztXCXt7IVniuVLs z)g4G?OPThKR<+6fiB?@cqE*p0w6`jbXw`s_zo1nef1*_@|0As$@Pk%G|3s_CyVK;s zk7(6+sc*C@fFk}It(q+L8?BlKA4w+Asuffbe@CkVs0g%b?(b>UqLANc)yfuNd;%~z zg!Dj)(peJ}-$HrdPG$3*R<%5$RUZ*()r23k>OG1OcN)jPqE%ym(5lv&N3^QLH(FKu zS6Y>u@F{375WfEg?qI!dw5s_JTD9f}t(uk#)`R~GS{43-Ru!k71yFpaRkPu541Un6 z$#BS{Z?tN%6j<-y(5m9!Xw@6Q^%jucA81utvfpS`)DK$q@o%)M-YQuofmSsnHwgJ7 zt!k6|uW41sKhvtg|AAJ$z54$It&08!TGi@*H?6AuFKN~1N3^QT|14S+N=*Qd|AAJu z_)e=r|CiFLpMTJ*B7Z@vnpgcRTD9niR;7ZXk7!j3P44fssx}|x?FdGDLJQdEkF;vi zU(l+HKhvs(ztXDn1X|VguW8i_{OZJ)aDPXuYX6tC>L7tut^bKu6=6-tHy3#G&$O!e zzob>g|AJN(|1+&R_(xi`TwbS$K&y)UN~_xaj#ho}zkpU{Wa99AKcydS5xiZFZgsq521yg(0d2zB%v1(1f)agRRrmXm_|ps zQbn48fKo&RMGc{dfQW*Kib@kf5J6BuP$u{NedammteG=wew;P`!M@hI_TFn>`~7*p zx<>z>XjS!pXjQ`E=Ko2n?*2=w<}zs2!T+XJYZGB0nG$7_U4lSZhydGZ>>i3{5VJr818Qi;2fyu$cCLI&Ssx^U#n$h(Bk6osTRQG4bEc?0A%$baa?DUY&=(2GVdHYh;vyKLJAhd`9rW~>K%YD z2UD@`w)}wEGU#r(1Qaqls_^c3e}oBybyvj(p|!BLSiTot8s8)9rW1%h$jB_7brh;(Dj95cBuH&lI1Hy(QdLzN9Elf_qAWVLG+U?adP#4l8Vc(JHfa z0N|oK72MUIY0G;Ca4ZHVc)SA;xz)c<6HoF2XsTHbe@;Ng+@p(hP;Fihn(m&B&1d1| zTJhuy=Ms}8if@Az%#1AD#@!{MDZ}edDtMsC0dXY_qhNR4 zY`Kb|hudeRdm~*vVqBdcAN^?8bz*Sy`BE5VdsCB0yC>*5O?F1iB1d}!P3 z0wPeF|2tK(<2#J)NDV#LPR+a-F>o~nxc@#^L>x=3OdW*tuhn=Z6@pkiX;4}Rg**2m zOa}6PR30Hf5Gpdfe2AHQ@p1r@z_wE7!7kJ ze!Fa~0a#H$atI@80N=Kbl9bz%)+~HVtIj=gYZ)LxWr;w!Iddi-zO0irtOAlzSg<(p z`}2%P@%2x}v@Hw%Kw}kIxpcsA9t3aVj0{$%~-m5jvW-NoCJfrAbyDQ6Kde;MqG$jtfdcrNPn~#v@WbPrxteZ`EOrL4+RFe|f$IMO zRte}{ZooJLSe0b}s{pY0zrd;$16UQsD5^38`T;~!*HPjSHxZ%VHuVlR^B-UpZmOjy zujnBPqUmpk4ngPmRxk_?RNu811GVf&j1Od&b^PoUD;bCDn`#X9)iO<0C?bd;0O^|U z5J(7$8`OuwiIB`;>3TTS^^w9*w&FZK{HRxXqE|P9!ZQw08v^2rW!cdktj*~TssJt9 z$JznL#jXHg@69MzetqHkFoM25fD{iF0%4^=8XX>M6ug0as3wz{fRleF(Hic*IrF-B1@era32MmOOHaM1Uo1_cVzm*Yk4YU zeCmhIh8{fXyr13M^hzVch+v9byQydJ0BkSMgrI{L9`@X+#b`dsp*@kTGS6)u3D338 z8aL0KCR#cHgqzcu6^w-OBW!b~(>RN4dy5wr28(DFKch+OohINxei>r*!v~m0_=jGs zmN8L8lG9L(0U+Dn5^Q3aTLZ9ZgX_=Kh6txAM%97|VjK@L^LI#Z1~q{p7@~k8oW_3~ z{_u>cA+SIxrobv@E?s&P+eUmqntuGL_BwIV5>!L3Eg9&kH1RG_2N&RTEgOPkt#gRh zH#biv*IOUwYLH#`s&iu*uP`Djc7%1`G`r(rY`|bwRgNNo_{j=D=rB2sVOXNzLtH&m zBGC|U7;tACI%;$8&=$~)ijh~%226YuoX|zm2@|bbNwwcn0AqLH;4sRYoPUAUr%L(^ zU^U9(LEX5}dE#M!;?#eD)f%XUn#BE1nEs6$VST`%w=+!2nhl%%}U%Vl=y|e}Glk*gwGP zb;*=;Q|sD6^-~v5TfHGeaT*=E5khCUClLNJij{Q?VAZlhqH_NE-=kBlz&<&;p{+`5HATVn*)Bb4dUwipXLKU|XZ~fW`D}Oo34VxIHAGw-$ybg0l_D(Etur z;(k~yl=hgYt1qF+MD_)6W&(g^Gb9k;uT#UZFjdI4Fg!kp}ln|J^uo$rA4>- zi|R)iz-ou~BS`IJig45fK(LqC^IyO!Q0@WoxOQf2qEBBY@gHDS@~lY1rejmLldXtb z?@On22Cxb=U;wM4*+i=`;y=LZC}c(hAn^~ddi6Osy?2**ng>8)0IPkqDYr}@erCjM zwJTQCKy=9-*-R309HHx1y&1cn3|_zyB~izv4&I%@OJ;i@w2HC0a3N^ zIrXRapDNbdUAbuDYDv6QJJV$AP{i(Kukj}R9n2L4Ew}-nKI!#rKivbBcCFJyZ0_D; 
zQX1(2)a3!+>j#-(9ahJKyYKt3FxQZ46GW&d1zM%?o~u7ISoK5S0@O$nNO_ZPi=y`4 zgcCBUI90*VKCnoFneGB4Z=u+Yz;v6y`T+^QN?=dr zzr9w>`j>zh;gQ6V*_C}gT8d9H`|^4(;-6mDR`W$|iAWxB@ZI(mYDBhVrlCIge!y|* z_xYkZmY8u!t_5}mt=fz$LUH*?NBV6y^N5)H$(fhym6Lel{S+tto}>Mc)qW~hOB0)m zTL^3da{q5=)h&PHzy4T`0Il1Q>9r*lqku%r(gWB3qE#*P0<5Y7tgBaK%>!(`uO?V7-W<58gJqfeyFPoA((F4`n- zjvz*ZqM!7tm*}dGa!{c8C#}Rq+tgJh$tBIC+*SJ`dshD!y?)OYmcnoQotI9_8g|}0_-uS|w#Jtt z>=&xzms~rccHvCet*u^C4}4l@ElXh|TPM6D zGW^2U=hd>{+Y){=QS15j;n^MG_daacZUvWIHOjhO@_4qNc01&e(Pp%HkxiaaK1XQn z?EMkG(4p#xv8^6eb6%zX&9M&=lONP1dPbgaMg0GVR{b{1#N)H`q9;wy5)x&Z9X&-2 zr`zapSR`bQeoPS zE~7dk(j22Q?CwF_EUDqB4AW+?9e^RNrk#j_*hNtzqnJFSsHO>VuBCAa9S~U+NDw^x zLO~k7j2a!4VV}wDhRPs_{WKTL(3^uMlrfdfrI~g>FGewWSnm3iQEBj>rUhviqB8sl zFuRVl3sV{4gt&w$rlbPs$+h&K`_+tGZSk=#M`Nnm>MJSEEb;(6n=s06AHQ??I{ z>|DT9wx4!k-mZ zb)Yf#QjVG6mq1KN#gJ_@p1iCR$h|-Q;%>^9t!v1;9gMmmc4&21Tzj=t(BJvc`y($H z#%hxeNk2^uV|(c}iy3D1lux-AVTQTN%@JJ_3OXcp?rrdQMrQpE-Ej%=oM=A%hXjVi zH7eR{9i1$SJLM(b4A%Pd^7ARDPM@CJZcA3PZI6?PcJFt8g}*#w(<7NKwLkk&+2;Gv zn1`#*~BTGS{>6Cujy{#LarUW!w4yU%PV=g{P50{(sF z+ssYc*6oK}Zs#~7Q$fdd&f~LZ-7XyO`HH3kamU>A9Dy_}`JSha6LkbQ22F-c)Nq*z zyjFFD@da)UI|#Us`qfFhZ!YG^PRuo3F^-1 zx-;|_u(Gw8|I(`EVfW>lt|k8t4OPbI$}?!yWcijxGSIxzA*68L=}1^j;nv-^$qKjo z%yA#CcmLFDy?#j=*w%4Jy%5rsD(bM=WjoWNcx!;;0=0WYbffhSeF-Li?|t&2Qtx86 zzH;9c_acS+>n)YHPv-9I%JkDm^i>}Ic$s4sa-r>k%Hu!3FF?vZvHXWtRi!frm*pG$ zhgP*QP#fm>Ec?cAIjQATF3Uf(DkjH3{eRG^>f=fej{cigP0^Ut-RReNZg_a4G4(I4 zY9@41^QD!-|3s^5y>j-r_}{eZAFVmx97F9l|KFrl&m|0Ic}vF`8y$o_U%rmLO7Ptf zYGR8`J=ht)9;ZeSKIR&hdtK7GDU_~99%*Y_&Xg5iWPejG2I3yoOlbQC5A5<0iCW}L z%FIU{vRZ>X(MHho9Tc1b1^#ums8uVd%Dece+f)^Js%YZ&}ztR^rNgH-^fHU}0%NOWt-AcN`I{{9P8N zvE^VUUeuW@R>TAdbbv|E1t5FrC)M)o;U{Mh7n$9BWMOrAZf6yw<8;U;X=+NI6WQiL z29n|mlAKWAfvosGXP%>JcI23!p2Ol0R|UP$)-H`ZPBn{boZ+d?Eb3($NXeTm=Q$pA zu>CE!03U}s<6BgHIbVa62Csb7~eT02a4x(L)4?L_QKQxI;)NAaC}Dv!@x@d zwj{PYWlwCSwC~9AI&6M}2WR7Hsg|uLBA}`ka}=eZAFz`{)HI3mS7!hYtzP^7`IsXK zwMnzq3p_8yzBlk1e$`d*3A&=>%&*s%$ss( z;5q0q3;-hMouod6A16}3PJFR;`I2tPblTa$(^)@lrpc?5_}VS&Zby@3_^qF+dEJfU z$Onmo-9eu$sfhDuUS~;MseZ~nn)OEdso$Lo0c2mtaPy(rF}vJX&X|a3 zmSC%R)9x@Aqo5yxOh1>qtM4-Vn2f)iFWSxzTss@=-0F!*2N_Nprb&%p@}q}&#k;C_-k2d9gS2*t!-K&1;rJOP@E8{jt);e*(MZaX?`^s=E@^#HvU(uHejTTv3?qyi;IxFjPGZYSwtwF}E!V{F`-CAH|t zKzssM*DSf$|B2_Ln60Hz`=qNHm?i{s#mI|%<0sT-kn$DLRLm(?-hfF65E8bdxc=@T ztIuys(3R|3==#hj9`|Mwo)us^pY&pH{m&;l=tEfZ8im_I^>ihZ`cswnVE$g9Df8z= zr%OZ}D`kmll&z+yIuB-(Sur(Y8GxvEEATjEf;eTAzP{*F6C(7O8J7`glCi&zJj9HoH1GU-eXRM9Cv=<2)&%aE0K`n>xm_+&Irk0WC3Mwh7|pe`=L z0oEKazhXBN_%PuUr*z5Gr=x(i#?n;w5M)tfi*60MppDfM->h%)us-$*2l`crKeFvdlR zc8bA|_KT1k+ zEDV-jN$@Tb z#10MhM#GRKSTX^Q#Isb86=v|NreN(d6a57fAfNBtf{A|OF-3g_4>rO>&JQ5x(EJ@Z zW+Sr9cFiAPEdqo#+60G{h#Ae9==Xj+Isk$77Q{_!P^ODW+eI-&x>OmN({E4@QOotg z_r(weIZForC81nN?AbU`*II>rvO2`qAhDJvOaPfv1b!}x!Isd;WCaLVyL%9@R4d(* zD+;41c;X@E7=GI&;aSWCdQr53%zb+inK>ZliI++SGul^`1lf$rDC)g*i4`C}VnEi2 zEVmERDx-+7JTvMBYwwfgR|f>$2hbyaYO{5kVb;cFCaPs1OozA-pk5`>#K4u^C~;8B z2qffGqmYbOEu)CrN$C4o>ij0ltl*%L$Mp)PgL(l=ulG#!O)1J<14x}^sTq*AslPs7 zKyWujy^JpAL=c5YYR^&T`t2-Dr=I{xiEc5_R=SunStVd)b!gE3{DPDz__XznU&9oI{pEkC)D%{ErJAw4Fp zr(sCxRntjnjIh*X8AXoaKrIA@i9(dN2bx^T_MZ8=26UtW-X?PFv~mM4Dlo)u)M%{1 zvF_8eGK!4ifH*2BTAHdkK}10CO$$z#?(sbWSSSjoG~bxHKv6-^k$X2z#arq_2Bqv5 zTs}06?@?4zYSeRO@Dntj$n4wBrwsOeHA7g8X6S-OWOYPv#?YX*@_-mm$g@N+n>nLP z%G98rh>f5~7SORN6v+lkX0GR{Q9A4Zd_g`>Z$|c%0nF%Urqd|pGKVX!jE&& ze_jWc9S@2t4oGD-Il<9^G?Sn`6HU!eQK_s%QECX@T-9|zYBV@xlTs=TDf1gE8#$$d z9#Ctc8#&>0;1J{j1(BI4E6Z(cITdh?9w5bG3 zc$1j93A(xmrdZA?%1s)Y3I-g7;?_(G=@2yP3$}!y4=gXVgK4`0Z{f`iJdeKw1jtFC zo3JTg`pxC}U7P(k2hLv|X;C-7YM!1EPjJcvJT^)f+6=>^6>%l&>2L-79H48G(EpBYedApxrf) 
[GIT binary patch payload omitted — non-text delta data]
zWh?hQTyXb!RlC-u-)#*2{Sj^eI6vmP^zg&%PFHr!OI}BE``t5^1IhLSxs{fl zR|g^t!DagpLZrlKISzd6SU7jiuD5U-YH%e$*JZf?x)eQQ`2b7+{3%20x6FfMl6>0l zqB<+yI?2R}DxM9mCuDU#>QJl6somM2pC5pNfzpk9)70WoofhlyyLXDs>1huPYNafL zco6lHFDy-S1M^~d=9@`72b?DR_ca(kYw7ECdIlPdt9hq1Ar#9KDuzH(Ic^NIP$ad+ zDi%oWi7S%IK9k=i0bNtl*pwM7)8Nm*J&fT~#8uTeq30`H04xv36pEE!ZUcewB4NhN zLqMW50QR9GO{pA&{$8D$hx}BgCG;E)564qCTUxJyq#$Dt&rSK~jM3(?K8uej)LQ)r1 zz!cB1-Jc&(P-7T_=>#}u#>Q17AXMAKRXu9DXDE)7fvIqYzBXiqDsTnRFQM8Z0j_>% z-(et#8}JRT=NZ;Q5sW}9-AS^yKpUszr|y`f-TxN+Dp>@g6!D6rN@aobh&jF^pt^(5 zbb}x<#S64}?@Fy4p%Ku8D#G+APdpWj$0O`dT*)8(oHk%9d$_@Z8ZZlIsIU)Si?-Gb zq)E^sy%V!()u0VvL*<7=M4;E|nZI&52!NDbue69licZ~h=qqMcJhb-aUa4;ApSjRC z)ap|yfwXvA{HsJ+$A-eR^JDS0v|IAapbHfuXDJ%aV`|>TI?1&O$!dvDt5D}?@#C4O zvv~Eh*ycJ#VscVKry}YUstdhxGq69-NK+&Umtg8s-N|q~OBB(yKO`>ldO+<=t)eqX zz;XOW7Ru^PGsu~EARS)blJD+(LG$F~vp2IgXXXUbeG$R^SUl~ z$KSkhcKPs?@13#Bik0B|K$p)|F4dB6Rz?&y51k1OCp+dm7;&A392EB%k9L<3U)l#xz#Vay%!Nx^yB`d4mx*+j&2X2? zb+cfiu>0H;^lpMKaw)AXahpa9mq%leshB*Fa&Hm~gL`)vouYOyNmVI2F8p1Z&%0>a zJM}gXoly^6@f75!`>}odF6Y6_G{J(ho`!m!M%JFjE}kYqo+f&rXm;#!kf-Sam5=Nx z!i*Lqcna5ogl9Yj!$Bg>#H#MM!lTM{CQf(TfFyR0N+ zNVxSjkq5RJ=ZpMX9&`9%lXQi=qh-A>n0V7xpx3M8uB^o!Ux9mvL#|Qa%H{DvdCQ%V)*K&ih6@S}{?c2HYOeN&xl(R6eR(nVg`4rEEpkPR6z4>bw;oGW4gVNK z4Dc?Gr5!{>%|P6!-rYYEgPK-(l!L>$88D@OSQud?;RnKv1dXBxi)i?S zgvUDI5%s6z^sou$$T$iE9O4t>F%wJaPxhOEgy9k2O<+^Ka0S_DUI3!C9q%;*r!eDn zo7PxfY(JC$CC)&taGR3UckIm*&WP6{BwWQN=;0Hu_QFGaR)X+|$Xah>?0Q6hJg5(D zOi6GEhXfEqAOFD7+j$2F8A0bL7SFdbX$X^Bv?~0^8=4;zPsf=d5h3mH$QkCL|)a(Pb_rsKtfw@hIPT_G5A6Hb{6Ew=a9M*t3&csp z1c%kf%;g!X6pfj!8DNJEHo6aU=#P(FfkqU?9qLDrFv;XgiE6a(szlf&-1?cQ%XYHR z?4L1-=Qu7kC5C?pjv~fuXs#ErF4MxHudTgp?j(knjxXavH^C?ug7JBqsrm?%cF%%VHfNZG=VsKd$=Ltx7`f%ur=Fz zit`vlI3&ergvVJU6aB(tKVl-H@Ce{|{O{9=oX6l%;a7|Sq;dYz`sd{r&Ibba_NfM} zFeSQ^cx7xWeb(KLE-M{r%JW*2>ZjEo3D7j)mKw~FKU4Q#dZ}ft>Y|GAY8rLYCJyDf zs?J&uR?mfqg(&d*Knw*@N^ zxOH2uXqa#~K88cSi5&gMX^K0JN2`nL2#-U!M1qhBE?aLje7pE?pD08!`C<;q0Qb1u z#{wrar!SeNGN>t)_Gar+fwXiFB5R+tS%?%(pYy16rm^tlWy_;M0MzW6gjuGZu{f;z zxMdJ!BKzzSHLn}eZmb9Y@>0F>IWn`(6#bG`mj*q`%=h0n`yq=GDfe$LwO4IIA(Hcf zP4*?uhx>~U2wjoAo36DZTbgOEb&{lHnc{lyreBNP{lY6dpRzZopWpsZFSVkoA>VE)=#)RY5;SL zRU!T^o~MM$8GtQQD-VYXYw;nYbR;=V!Xb(~G95O}U6EexaNtWjS5$Y`wcc$D{6n|r zhnmjE9GtR~D349iz`5z}m3LjLrX$gNUe~4`baPzLe84ZQviG*st_+Mdn;~VVB##(MrB?467ZmX%xXF66UcMxx2hnw90lOC^lB;Olos~sz&IFDBs%TS z9lj$p5;t5|peoVhn#M>|H7dSkS~@B+|A=Ptqj+E8<=BI=wq3EICPmTZqj1E7PDA0l zyf2#*rT(m+_zZ2(8c>rlI#tImE|zG&oRYMCR5(Ockyy^NQ*4_HXWlDJexM^IZY7_7-7ELYXEP59aRmOPN3Ugt~Ji`caa?m$QYROs94X8!HFZ} z!R@0WX^)1iWxeFC)BBIzrkF&LW4T6d+z)lOIDS&^$o;Lx^IYq%-+bf6Njl9y6#a@h z%HZz2qez7O>a5TK=*EGcdFL&ZoEY2|P?5rp)PwfLIp(b0d+3+4N@VfDMFE+9q$MJ`obqlvMcyBAbf8P`< zdXh~*)K|V&yddQIhrwyMy>6OJoxSC*SZ~yZ;<-fsCE>TG8hvuc(gPRWi(X9RIk0&gGT9QkJMsLTCu4bKBcA zEOjMeK7Ks-{c$&T!pZ^q7S8on`Y}>Ryq1xKxJzv@Fa4^26wO2Nd~Kbc7Eio6#ch)7 zlP5lCkV^#SuDi)FfIe%FH!~{c+tKWmd`7u`R->53_e<7@ywdbh*6#I`OoNBUNxQAJS+YL*ynsg4+>M%8MoDF5{PIp^HIzu)R7$S+*L zjL7gAiDo!79M5G#G`3Y%Ul1v#UCN=dSP7QE9=7TZL1sfvHWhwyRO)%7hk)@b_b#yy z8Vl4OhD6)ld1si8n|X)do!(zi@aQ)=B@_IE-JmmCztZbuN&^dmG7J8AKHUG0;^m7| z0QUX8n^Hz{o7g-ekL_{fRa8#KPVsk_oHm;(L>$pZ`AXsR;((V-7=o$mm&olencs1K zTMyN$B-U(khi_N(M3kvpq9=aD@Lh3(ifs@zWu(I?!wiuU`Z1cv(zV=toG{)SFIsGC zBOW*#l2t3?G}+0U_|}*3&GeUI^B2K;pKJg8XnOf)BiisEFBN(I&oqm{UmT^`O9Z)f zN6WiMgAJt$VpL_LX?Wu`8EB@>c)bDq(YKz0H%Q5EvHAtE47@ajLZGZRQ0x#OltAXS z190V#;DzoG2P{t@nKyyR-+&ceBqI+2V%jn6ZZYf;G`3eLp*RvajRYwo!OPvD6nAzv zid@GrP+8brvjDG0V`OrG{RL937?p!a5(FAAnhd`J5Tepw<2VJ~fx2QKI*@w!>d#*w zHCD%NP6O1#e8Zjh4vJ5W$ah4PK+SRb_UK|fxVO{v?ZlL?(U1vP9<~zxMJ#_84xLk? 
zU+z(G4#oUxPJv%sTwYuvhN1w+Yc}_Qn8jDv#8o>eQUny=6|(ftIi&v(H|M;ZcL{$E ziT{u+b*)s3P%7(0?|SxVbi_z3l6k>Id0mPei19=LBdTl2pc$_T!YeM36&(N)^YdyO z;&XHz72YEUrzolcg!sMCVX;+Us6iN`mM%~z!(Ax`s9F#!>ACQ+@YoAYz=>wKYj;sJ zYVopamWIteD%BEPHVaZToTf{eekD7mh=R;;7v=WWobGu}U6A@9!L z(JnwCT8Tsl1zQI-@Mc7hiVL~m&;^X9Zm-RYaH44XS5dCByYRwq!F1G9w2-2KTC@~r zH-N`V1kNEdy!E#&4a0gA4vA*Wcpc^n>Uxic3s77ahYpma3^3Z((WI>KXDp;f2YM9Z z7LhcHF8`o{)0}V&kR!sKeWQe5T1?pruMDhs+{~z~EhSVKR{9s&&8d)mxk+B)gviq2k?h0yX`veI>qH!RIlVUS_+fQ^GV<)<`<6@%!pH^oHJvUBcz67$N002aY_vW_5>;)ui9_kfsH5}s ztSEnV)J3_Fw+>Pr>yk$1`qwBD8k4w41*3Hz<8l%Fdf=FHull9J+!CNp3J!IRQD+f% zf*W^Kg)d^ZpR&HJ_64uxS2(5_r)7t|7eji(vK3Te}tacJ2Xaq2im(J0|- zkIc#AnB2A=q0Jb*y-YMWPU$QDObJkfN6<<(R!A88_&$&)2q(A^D>?9_LODk091h>0 zEReA*p7VrPKhQ`!2A)7RT94HZ?A9!2^g{jy-bkX;zM7Z9*G{0px}8TsZ<%gI-=EWh40I%i|5~zpBn@GYZ-qJ8?JQ zP~ceg<_8fTJtyHdVoJdgnHh!}x#}H&{*xdfyRSGKAESgG2n~Qn)xK9Y(k&ziF))J5 z=^u8t5)Dv2)Wh6MkSoOKCiEx;GT<|a(AI9mF0&|sz*|6JUhiSn$0CtrkR#|sAXXFe zFC*1G78f|B%5+vZE>^t?Z_sTlFh#^};spGZUcH$T4a1h(#_{1n=)!LCxR{Cp92)~8 zk7A6WA`tGO4C7~gtD zJ)77&DlbDI=H`pL<_UCU58pS^QoLui8T;_MB1@>U+z9|~kV}al z#M9G&m^M=inmR*hHj|0t+@z55Y&i5l2K6EMJ+x-|u7w`n{5PFWaRTWu>P%}~1_6h< zu-q5Or;&`BR{7%I!IXIl<3{mWy%>I4lOc`RxX&o?yv8u>J1zyI+q>a#HCDJ7B!ejt z(*kw4Rc8|j<(2qd3tXj>k)hclzO{#iuUb!otbYwp`pqk^c)m;2Rctdxx&y0y&X%hR z-*_^PN%w>o{+S&kIek3cJqK=q>%5F$?SZ|8SwB`X{dBp;fOVciY+0-v0nYxN(6hNYMOP0sW|SZhI>I4_Z+^6x^nAj zpCFzQkv)N@0inKOcVxu5K(TlRK!U$IcixuP+J-@GFl>5NkvE2P`i%z6we!>i-3i#c11q_!6}P<~ zt9C94+Deow0M>iz@C0^)LN)O&;dv@+=8^cvl(?(;O@`b(kO76j7uqrzWLzx{R>;th zr(QT5e)WPxkO~M*sy6(9b(elO#Ap!&AjQ13*5}q5YgR~w#H}q>p^!C8!sOP9A27U@E zy*JHYXhrtpxdzVacfYxru@i8Wiw|sJ5z`X~%ih#l6d4x+7`= z2g)*32-N}%G}N=Q!e-}>@db}!CH=65f%xl9VbL%4c0~gOM{5kJt*HyGX-#o@*G$8M z;&mI0b2PY=?A}Ni+1%`2$7L{PAH-%yD;a0#+5BXXDaRuZ@IUr@Iu3dO2eDGZ>i1OQ zUWIm`AN{`nGLBUk&E4K6+_c3^(t9|olV=mLC%8Vcyd|jEJAA;1V%8mqrK6~^yk@}C zv6fP2&?~K*<)ifBNWjr^W(v5gG&KQ-YPNwqM#TZa;HmHDtU9pDT29b}}dAOZ# zJ>Tuizhz}guGBeJSX^U-ikW~^!%kQ|G}I5{S%^1`JmL|+8#dU8fq==P^+9g2tbq(N zcAbXw^~yo){j1&LRnPPn8NdU#zj<9xF2+<2;n`Zd!QG6=5pDg$7gAq0Ea71SK>#^) zuV_$?H1`9Th%>kF`;blmp9^+OvkjM7BM)6xBy@w#Vk$_Fp@oEp3q+{!%O|`&%!L@b zf4raGEnchd#~25VW`yu+gi%T52>H#u(4NPrd+a)rBIM^Ds7EJ`B3ctN@gFc+DC_ba znauf2BC7xgck`T)VRWG;VL(l9lyxgPd`Yh${qy<$qiv;63nMkB>$6!xto71E43$3uf6YF6vuF@3)dE4sdd+v;xqn|vc+AaZxU&n6g8cvVxhs5kv zo<@Fky%n~#Fqr?Y>uPeDqstfbiAL!<2}=Y<`UQks4)#%%D3?Rw>vOj3Z(Z|q?4{RASOgErl4et6y(X35LlT_ z9G`?ZSN2<{%bjm(Zx4(=(vNKE{n38Kmx;Ib8k;2VjCZbU-prRPIvym64>>Abo|p{t zWP9xrPn0|=3CkHxNQ5O|S*q9SxRy#i^CYYue-|tRd0i)UHQ2`Fi$4uEO*9`6ODH}* z%Yu{2AeL?hr4#ReeKK=Lcq#e3DX^w$jznT?@Dxv;WR(qGD#gHTDBM5$4Y> zpO3o6LJ1%Rrcb}(EL~(z$&2~(7|-w3I*`wI{V@1ZmbA2hta}Wwqz@V8^eygEbg4W4 zLiLj_!L{0FpYE^KUG9676xh`+#T@B&U&=e!wfkZwyF-cZjQdK2Mg0;n3DYBCAR(^Q6{n`-uXZMYYh<*%hi>3Is3;kd1)79r& zT0_?+<+Y;r;x9)5xM>WKa~MX^2F01_X?LqRpnNOM+vS##05F*8B?|7jPNiWqrm@l2 zf?|=N67Weg3VTe3h_Ip<>!|_?@BV_oSdbX>{2oPcA}CJUu7u^PHidsQDDLE<2S;!# zPJ&}ALFjOfUS*pB>|PHjzJqWKIaCE3yJ2{a z!WLnNTk)m_3&QD#F^0D{2STgnGw)h?iwUfl8JmP;Wobx?^PC?tEj^Q&(_AjD`q7}bQAmEThR=!g zdsa@$O&PCyeKbsUUSGyFKB)fSt)oIqcL~|atUL6Tzj%HiRKF*0RIyyIOyIP??_^{$FUEYNc# zMk7)WL^q?Fi#i8+Z1ILmwzK*SBG_JJ7bxf!RV~=T%3kC4I%D#qU^}R!f^!3N5PARy~MA$#S!Xx6^DSu&n^{uc5P|nh|eJt}fU&q`N zd$Dm_x1A!s&K3Lb5i_%Hzy5lux!)sKod3I*T3LUy&$8>%=F@J7^jmdzv46bOyv;Z{ z7Z2zK{vH7n79x+ZZnzy;ScF%#j=$F9ua~N6U(V8HleS+uRnJE-i5Lzh-oNk2q2Lgk z{A<`QCkxew<=xKsK0JrGIf@`_Z)f!xxrrMJ4DG+(;#=&Ff%fKfsWCJPdRuu373UAz zy(KesbzeYAc*1QRcJ9Y3#-CCX7!A$~#TJhE>V5v*p|ioH>yGs`SnFn#Ot_RAjRE?h zaLl6aP&`_`1ZhhKvUHn@&LlBN@m2h zuB+TJ0K-TfUyrGn=MP>I!1<05i~?)p=^`IsH}1ZF=II&w~Qw*@E0Dsv%%VeNMZ72~U7)^OAE{(COI3`W`W 
z_qV5O$Q27m+rxv3cdg}r{A^R{ua?8EhQx)#1u^X%=dG`qYPt2D38FBayBx6=(7Ijn zYe{%|PTf)UK0D~cv3Rjrl zZn6tyvM<=cB*u4}u{?GFRw)G6>~J1@T*1#ntCx{)BArWW^d5?gIE~;?A&iBbnrIdi z$ihx|+VWn^9lw*jbO1D%Ao3wqFl=-%l zJ*!l3i0~Csd3)Nx6;1))M(I;oGG1Bi#4PFfkb#6ocA=~hPDSCNZg4MEp##Nw%>;d> z9KA8ennK{~f{LyKJ(tL)6D;h5s*9=%69gCL%cpxn~=dGCkPGQ!AIU zDDEB$4O`5&ClT?X!U;`k>TEj0Se<)Kh=*LdsiBn0CS6;E-h(rGQYm^W?mEK=P8v;T zSxLX|EbC~A-h)tW@?`I;CVks2{mC8VX*QjY+4@hK5Kl8R>8WSG5{Ox-3I%uqisNT3 z;Z@@llAlAIBO_A)LLqnf_DVQEcyLN%k^5rC)qqQC*YN6R_>V;%rK4aNN%%u4ejewW&k$x*eR35 z=8XHhFN^HVXE`~XLS8A4SlrDqSFN?YtYK;CZ>gkcHJoV%GsmRTQB(#rJVznCgsr*+ zZsUP_cK0ueDlf(=TEZnQhMdmf``K;%6pL8TQ3K`L{3+oY!tl$Ntag_oYa6+YEUl6hjOkcU)7NOOutNq-irsm+Rp+}&)NrTs5s|V4o#6g zPp}8PaTpu7zi!_hmMeYfnZx-L=RBTOM7(kQoCP`_B`;KQzf8F9@Zg6-HV>uh%{luw z=SxoLBCVY5c`)Q;B&AXC98v3Zw!&u;JPgYfSiav;}hpmoGQV{a1wt4B)YHyy;yjqF}#Y*zQO3YR{}YlyAVQ#_NE^;GzLtUWJJDk zEJ4Y}KYN+Ml@&5&XnaH1tVQlr-b)(?9=klB&s3jK`)=<%-+&hOt3T}-tuC%*UvzBz zF*Qk!x#45l;vLHS(bvH*AkPmfXLwK3r&!YhyYs?C-Q(P(0_QbUD~WSk>THU;N+I^N zLJ4mq-ktuiX8-!-KVLlm0C>Q>ik@Fs4G_d*7ZS@}ELQ|dn=YbS%^3&BLq9PH4G?*M zq9nY6m<{_gTXHYr8+i&Eh261E9|PLCM?~3nU8vO#A z<@=oa6?*o$pQBbteW;Z^;u>D-#`$-Su6)-|Yu>z=uNrWZ$HOr+oLBbqNZD!$<`kDT z#1qdle_^Pb|HB-sChz4pH&D|KS3YXq(y@a~0fbTrSIFY<6x4~6NwA_`F}2()Sxh{B zWN7IUy?Pz0XnvEZCGQwA;%6~L6%$~Eux>ur`SD!t;8(N?iojH$3Mq)*Su5wzCh}-U z_j^P?%<2;5CrC}RCr@ynU3Fd?7X0)4-0=y1Q-SH2WoRs-Anx+B*ktvWxV`9_6CqC# z;VhRpd*3OhiQV$|Hf^P`ijt9GB@jh3$j=-LW*qX<4fi{($gI{|0=cJ-n-#ak6er)7 zDPrMO?m{u90(%TFHpICNB6JlChagg(OJ~+yi$BhmV`q<3wPvLTr+(f{MuuDbvv{Iw@GL>QrIcUXdzb5j&y1GkrQ#EIMw#IN4V`IgyTde zocwy{MW+O>KJhKJjEIG0gWUM~kE!SDK4#T@Ywtj(>F>1b{A#~>AXpE0pvvTo{If62 z{vd6ZU*wwo2KQjq7eVw|M1y@wfxwHWma9+C!alNIYGmSn%rqcx+Z^zLem!3gSR^Cj zFkoDgLN6B8A&$M`&8a*0gneFgL<`PZ$r+&sH;WbiL}Qib@j}kAQcWTN#Pdc^t6WgCS|JXmX-9eMX$Hn2Lk>_I zVpmv7*+RsgNGx33z((9;<~iOYi`2;hvlR3}N0o3nvb;xl3(D{+`)%g#7 zC!G6~&cC|Yp>th>&@|f>5Gj14uwP)ZUnM-#)H!NbOcC)ml$#7*B?XLvkQkyu4w+rl z2W3Tew)R3w&OruH%DiND#S&CQAsdK{Vlr+mnVB_eKzCf;Yq@t32MAc24~fV==+NH!(URjH8fJHVrKQAthiCdm7Qf zHF16HbjO&G>&$Wf!JpFgoB{#mg6d$Y>gEpm zh;#secmeawrTifuk=-AWD18=jZz10o`Zq*+ki-kIt#HyWR_0Qnp>8Pu()3g2zA}+H zF1IL_Ey5sE5uir(#@3?*c<>sBzGtHc^etA=7XUKBaDsqHUwU?m z8KzHRs;uC5FO}ynK0Q!$1?BQiC35M$Wrj5WRM{KeDnyh6F?Z8qH0pE>PP%EwSiRbKoG;y!90D{{k&@*TkA@_E_g<<#_+wfYwtUgwdy~0Mjq_{fofl}ZwnZ1 z)$>cqe`9R|MBHA_%0npscDk{^c`?Vu^AqaU5nF`P$L?~=2L`FS&sN{mK6V3(WMANl zUM&1FU$N(--*ngGpmMt3tR$BCbi_TGUsJr{FCXYdj|N>^TbZhHewlVsZ-q=X4hfHC z=g|zg?=G))rb}=4mSf6hbyU_LY)EsQ)Kmn|l);a~1AJbcD*?!1MxU#$eDyFIxph2m zrPbEKorbWrtE63h!#{3>YA%dBh1&0xNHK`ty_3YJI_G5_FJITnLtyCwd!?w2nk{HU z8qI9=b5l6SEVKEailvdZb`lUKRv}rs0zb9Wy})k(le*3{aqnCQ-$l<}c>ea_#p2sXGZzbRH#jA6JNyV4 zt2CQ?GAo>i4ctbq-bxAF3R4~2KkwBvWpIOLM-DCvf}%oFAY4;zlNEK&V? z3szFOsB3JRGgsD5Yo%Yq)foTT98xx>qjJWNzKza4tnc}he6?$T)o%S_YB1Rt)zQio zFFnnl3AB7Rk+}bvIK>6UAA?Ml6l^zX#iJ4Qz#OVIN|? 
zrB9XRkcaVo=cm&6Wq0ynn#zd4-b$}}k^CXdOQk7J28MbLnDrVN%(PO?MGH8D4SfM@D+#U&bzo zo~-mSvK(#a@AQ;ebG&}m93?tl+O8)`ROvGO#qU+1xAtn%yzyvpK1F;{wshYp%Xqx( z*U^|U7Y&jj(^&Jsdc;Pr^T&q3i8`^c<+BM+k=~SmyG-M`L`#~J z3YR040!z(YWU#F~88;XjM65;q;_vUx5p7e%nHDIO=%nBjZ^h#>0hODGThl#>%5yl8 zY9d@k7>}F?l@NxV(T#J*Lt5u>cjK^7Nux`+&w5tq3 zm{Zbdtu7t4mtZIM)uv*$1R~noxqpw?V*7B3^IrGb;_;$wyl2d!=-m}lZo7Ua zse;r8N1K&1^N9HCQB{z;iKZvRRxEK2~@^)qWn!!1(o{ zLG;Df$8)$_znbZ1LT&C4#Nv2;vx_pU=T}ncnZ-vgJQ8NBRQ@NrTUGW_xn}|>Qwl&5aAZ+PvC`@7bJsVwu5+## zQ!-`8KEy&zY?597n7(ZVuGTJ?DPZ2McDJdz3kUHi4lfPqdcx=HdBjb*QV7VQH06f0 zZ(JVi0ok4Q?!r9N&aVY$&qrZzT>qn;?RyZ)#+JG8EN~&vzf)2V9OwT0#SbBMk?z@# zzP9w8wmZQAG4E$T1w3oMeC3>p>b(x#e76?+jXdxC<3X#{x+gPF7PS+SvlpGT&W(tC z)D>2B{Q)XHwNi0~pBTkh#H`#z9pUv~f8y3pu9-0%pcKL=0dKf&{}AyFx$q=_J{B*RQgxWKjm z$St_cc6saK&wOQyi?q6?FC@W>Z4qVZ^U90S*x?r2{hiuv^?t0H#cjn!S0WCW%8^#`lP2r@;VQRaKa^p z>~lS=lB1b_RAK|5T1$TauDk9=T)c={t+`+Z7OOE zinK}?RvI8=+>|I68G`Q_Z5O~J6pR)*0XjIdBDcHv$S+eldmY=POHaW#&XwtYaZf_V zrgHkP<UXecVj|$;f{4UBrXBkYb2~$qb>O3z*vsGxEY~$e6<3G#RDKqzCK|8R`xw4hskwLK zb%X@}rvYh?rJ~Rm4|JUq$Kuhl3+b4tQkP1-0U>Q97{fYY{_-?f^F9ObMVC2LR8Y}i z)L8iYWdlCPWT&)br~S!*3y&VZQ=|_IeQ6z;2NiCz2rSMqqih)8mhI&k%8x@lB*$sb z?98@8Pp7$M?H(Q3m~s!qv!(~Z^Bj+rTXNvP-rd%if=C!eEVV?z3Y?U+R7jgFMO*Am zsGzxSre6(?xJMe4hMN;v)<-tNzgSKm^oQTFm+hzPs2J7)shCwz6iW$FcmT`h=QzVc z)GJWo)K;lq3NG3LpQ3`0vSL0NhJrjyKX;i0Vxpg+&-5I+w-MgfiV!okQ{3`d&Ato4-u+e) zdr>x_Vm|hh1jj4`mp2KBLL=@Lz&9HJb@80JrdF;#iTQqsIXnrm1BuaR5^9^kk#Kr6 zHGvb6#447kSOSixm=t_HPO1mn`AjD^eKjlA*8N6c@y_5_p~3bDET1vVc2v3R@0hMM zz@9U(6=Im42f-^zOil#i5hm%H47jfq)>@U^5eB_NV}_6+r%B02gp{Fy6dtmu+oL<@ zTN0ZG(wsb#1M$qtG@xk^XrdLMx_LuB7#PODCPYg{k(kz7VB-wHArf0(5PUKWKHrf3 zCjqvlo&H0X<9P!B91K+sf`$=7!x%Dvmk<>ArIl$f=^(%WHBAPhHxLAn&hU9ukRjA? zyO5Ub6a+s?VEOSChGKxtHvj?%36S$iM+6p)71+^Nm~uG$z$TLg%k*U#CegsI6%N-8 zM-)nAo&R~v$VX+B9n1}yWaZmrR7zw4mt?R2K$X;FO*sgQD)1W(wnC#%bq7!I=Ulv( z_plM{l?b;;grBrYQ)OTaYsfpjlE;gHId;oC&Ul@Nu(+-iC^E9}p~2^U|LUj*ws}W1 z<^TfBv;nY*DhQn6m`A4w5$<8z?mhf{@5ts}{=FQ_6<8MOua1hM!b;2w_-&!v2ig?Lm2r;tmma+^q#5&1MFTnyL7==;YV@^@DS%>5YsKQ;%~XR_VuG@Kn{2|> zC^NrOj-O23a-~Y|rTW54~rT0tgKIn4kV(Py5|MgOJ(Oh#wb>CL& z=5*^;%RNf|5|;rrCWdCRKK@c|E;?IK)3$Sem#>y9UrOzXxYF#++auo@R0y& z84X{Dfa-Gq_Tj&3D%Y(#o?8IEwfYO2OtChlZ-S9e=$xSVOP1?gvD$WUdvrQ)zh>AkZ`j$`|t3OOA76 z`tS>X%eEWC&x;?tC`o?a3eD>Zexa4zUg6wv`$fmI13AG}&Q}d3o>@Fo(9Y6Z?fo?` z)Da!VYcEJoUr_Y8UM6>>taZ>{bPWkT8xiVMJLm`*@9`fl@|Eoar{q*faiMTS(zD^0S9#+a_~g zFs$@dpNa>?Ik4A~t?5*4FE*gxIIg?&bB17lBH@6o-~$eN+Nb!4t1 zMD`|vS_!t<0e=i)8l#9eV_4%c9CMVwhQ%RS+kr=i!VeA`N$Z2EH6tbKEl+A&$Dyxl z6lUi7TtydO!E*Qtb2X!eXA2RJt%kaYBdqCblIkODa>IJ7!XX*~&- zyo~sxC>hfJXmu9J6*c~cJTB-;fqxrg6@Q{QF{S_7n_x!*^Hva>>o=wqz#A^H2?yX5?i&*0I(0NRDbM-t>g#t7VsD_t zn+}jk+%kz3uWqUthFpE$<5GT%qB{e1d%$JUlYbu{Xm?V+DXJN4-Zt6?%3 z_<)mXk-%zAF?60hVd4&-Vi%ZlWlks1H}jzly`(U^W9Cyg;B&5@7_Dhc2Ksw9HDcCa zZZ_y4Afo8E&M^xiZ4JPn!Ibjo2bh znj8tpbxt@2!*ri4vXWuCodZ&zgk^5SuELotHc_i^>sr(s*>W5=hYIq_+}qdEKew=d zbySqlXIqwL&Upo^H}M}i?`C~@InGWr;O)rhu_rvW)+9-VNk%8A7TU_&-Il|-?6~xRww@(F4cf-zFgja4XiY+e(ltAL=*)*PT zoK%BklVE`Fuv$3yHyRdh%t8NEV9kS}**TTIP1N##i+}yzdqQ!H0^5h*&Bm;GK2rKH z{}pvwdD;fD*U8h!3#N|)Y8PIVAKN~naa={Cy~Ww$ZJ@|3XcY$f3Po@hV@uy;HlzG) z^#0_AmBYcg^n5Q(>D>klYj_Nybi-`6etLhDXLkILs@erKOE%iX zhM5w?w8^Mi>Zm%Ue!#WBW{ZJ#*vvW-S%cvmqohDL63-45w7+>kuICt_>!>y?{^J%= zFX?ZTyhUSj{lqyOHEUK1CI*GMKiyq)kH4_|%3){!)q=K56H9b7;@vm45N=pqG1O&} z-Q8HuX)d(K2x?k>49aeZWFVeG$VVS-cyRIih#G1T3rm9Ypr^8RuTP06PClV^N_T{v zXj`rw6!LOx;4!*)ro!SP(dLtSmgTt<(Yby+<@ST0%Vaz3bX^4OzEVjhz~DK~gmes*t2)BxqPL;>EPkj#@ zzq@rs{W)S1KDX&kiDyTe6w1-VK)d96EhirYxgDOcy7)A9YtuWi>U+S!FE4K$LKX`;ogxOo8*wvo`MaWWvYLCWz$8_v54K6XPORh9V%Y&K 
zQ_hGlH=FnJbmlfCTQmKFO?@kZhsPDJ`7O zU{}}hoE7Jj74Et1XXf?kTqbpc`Hf1)>#)!bmez>oR`sR8JSOT;Zn|<+T*a*SmsuUedP7y~fnxo= zTIYV0-p5>-=&zQqAL&P_(4T}UGTJ8LUPLk&LDX`RavEK4eva323tz5y{z>qXMkU+r zL$OE27L?hZgmAyZ#u3$IrJYoldF*P#7e9e~KB?C2<+)&F+v8btX|ajZ(G0Fv&`~)y zEJ7062E)aXQ)2Mmjh{koi`ji#c?%u;I+MjXzq#glg~)}NrYk>$a`VolhhCFV9vEt7 zi!!6%Io1bTGUq^0*1O5KHXyq1V#0nrqqzV6fxjh4wR%*kZ8g7OUkW+ZYIS0N^O@U9JdOzgJG#9KE)S{_GQltk zD#vEQb)dN57c6BtB(Zn0L>`rD^JRmJaEZV=lbBHEBL?oYZDXSP^#q&aKG@%6m`s-q zu}C!kAnxAYn)&oeZ(wY&iAYO}NzO=5((ky>m%Ze5X|zD{6v;CncIu%X zeGPgOJ&NL&^OTg*F}HA-$jaWqiR=f(f026Sgnh6W1*8wYfR!eG{{bt^wRdCh> z&Pwe7RnK&-C@-G-!tB##>GpFX|0T%RWO}9nv?g&;(RqKbWZ)J zm-?*j)Rh~P4?cq~YWL|_|F4(&zh3II4j<*4Gs72{|8FmK@Bfcp>R}-O@c$7*72p*S z5)=~^mJ}0}k`VvLLzPhB|367otdg-XT}8FEFgCNawLfj~pD?Q3KQOARGaW{?r^BfK zsHm5{T>nu~jeV8>QBnUd5B2p`^nZD%|E~t>zXPar{q(=+Q@VS4Io|d^=~MlGr%yxw zrJw#6eVUg<@Ju%Pcly--U;3&4zx2~Png0JqpPu;-`ZUk}zueP^d(Kt&(r51lUc2uc zf1gY!yzmcwN_S7`=;^0_p{Mc1-v8Y_O(_fb$34xe45rhkEtL=G^eNpv-Fb8^x9S>Q zKmCV3rMste^t7Xvv|AfVr%xX~xs_QLMb}UNMV}JO|C@gLpY-Wp{q#pm(tpyYX$@^@ z|InvD+Oz2NY0LjfpT2lm@NfF*ga6V`dwWaj`%3;@KW!d(G(7N#uAjCHJ^pw7bn11( zzXPc6sLkZ@mcH@jk?{}T$2)(IKbxIsot&KPn`-~ZK>ae+^B)ZAn~wRnU33X`;9b|w zO#jIHp4Hh`-)0A?ANv2nQ1?F!OnqFR_%yWpd7O@+9{lxCM}N#szF*jRzc~4ihx%!0 zdVcBK*QKSOOS6YxXTPj$&98o3U0waPI=8ew_j`R|<$p0$DAKy4b!%ZDTj9cJN88Ra zwal{Ey0iVq+H_OE=h6QOLw)z{{=Z?Ul2x$(z)<=B14DJ={z{8w7;2)$aXV;daKU}G zNpZY4zLwnP2Uf*!&@og`PU+Q#7!H;HhM|`I#Zb#Q%t14raXP%07O(5Qr(>v^1GGh` zO4Wa1sLXScFtw@%DVVkPTs-?(w@XX+`d*t!#MzauEInworDLcP|AC>VvfKRBp|D8* z#ZaXq(R2*8luXrL$kBMM!GBr(FNUg4$501ZhIFL{LoZp(wu(b5gu;_nOg>x1o;H45 zcYnF&!OAhJM;SP1eg`w#L5ejxU}$}9w#_g-W~>$oPcf^D?2g6yfvbAz#m{dOxgg`Q zccg@#;C%^p8A+MVQz5-QEOkbpA>vVrvvTdaTP5QY;FF!0)EC{|HT4p6T%dMdo?C%t zgUXgvWrj1vJ2`48pewi#lSbQE1(Mh@3&F1$O@Bxb%x2UW*-&FBaP`B+K=w?cZ9Nj> z443s=2Rz2?4FLO}^;l1up$Y3xt>sd?y&!^_HtnY+IAH8U+VLbnz2T;$k|jE|O5f&b zg{12cMs+N3u;HloYyw}9;y2i^GFNYi#00B8Rdk58Ihj?37JFKP6xd`c@}}V9_VCgA z@lNOz(0&r!p|2IOrFwypvZC6@9Q@31RwLv|&926oBn7|2+0zoKK__jJIS%lzSl#w| zbShtb2-KLcfGuhX5)K$Ql!IiI;j*b#j4Uk~mRAyMGLiTL(Sr7g0o}Ery2!5!=Wr&| zM+32aEGimw47Jc?EJTc2SJXT$dAHtCvckJQLn z8G^ZyilRQl`_C+O%rH4f!cCA|9>HQc{dcfJ1n`jI`zJoq-tX8Lz?lb_tRDb@b*%Rh zKMRo-Plolvka(a?o8T6JdGs`I9I3CA(+iD#we z7=z$u$ru0z&-B$_mfNPSTnfciYeAeE_8{1~Yh#%FK)YsqZE30`L|hQ2vFZ+Cv7`%A^sok?))9f z@c$eCHLJ0YeP`@r-*+)%$dW9jsKi(jSt7g0Y<7i`RFqQz2VETNFG zglzMj-k;CszQ5n^Z}+eFA8;MlaUI8boac3%ujlLa$UxrWYo~VzZZ1J~LzSclm)k#@ z%)`2<)$*`I+EUiHE`P(^R+6&7$#>(#tyxtlZtj}2!T}0BvAjB%xAEV2Nina)diygJ2ooKb8+&6!&q&WpN=Uai2>=<97v4=yaMeUx6W18Lek|X% zLz_fYeQi+RXlB)Vm9yK^eucNeGhAiPY=>hJQ|`^0gS^^`c~MF+63qMn<2sOsK`Tjq zl}I)_xB)Kj;TG5ipnon66qsJ-$ZseJVbJq~+t=mWu@-@Z3mD|;-&@xJ7attpTqtu? 
z@ai5dON&XxIMzn`3WQJEkTQDI&bb`lmGM^D9Z-r`3~>I0E>YNBR!9GCz9$Dk3X*07 zZ=UP({&@MmeWFHT*k@4Q3L@lI)N{dy)#rT0c&jiz*|5J3Mwt_Kvao>RMp?^G3eatu zO=s96@)}OuPoGKL@~k->+8nq4V!$B{HKeQad{Cse_|o3$5N+#y5LjyH%h$Prvpe_G zuRei_^Bj3~DWJr66X<_eD}C%XyL#>E*%%bT6;*Xo^Z+a#ne75udzjH5hkCz7;3=si zovsgp5vr-mQs_5z0}u>uS0Q1yDMxVa?1jUsB2V#}d4b3J+df>>CCuFDIH`K=`TLiZ zG@q7t`wJ)hSMo#jRf9c}+pazT&>w;I@v%)WbiQHe6^7(_JtSzJ-+mQ_wC)^1CV!^$ zK4?`?!)Ds28$`YsTdX8-pKIDU+2F$>v7~I7MpAy!1c8fwc@daec3h!re0*eR&lmqg zE*>?be{`d=+Qd!7J}LO9T$>B^879ekqb%jW))=Qycia+DQ>2M_*n!Wk56rT5gJcBP zi~J)Z$u=wKqr2I;j}z>o*e|;zN6nQ$)EN8cLbqH>#QfXdZS>%e{2g=9e;5)JL^}F* zn}&QcXaJV$#~tVT+Es~MpzL&z?|^IQ z?qdTd!`E(|j0jXo2mHz^s-*55VIHpw=TNXivntsiVx+&qt#rMs8pV3?&CykYPv!CrbJJT1-%&dx;Z7kFd^ht;70tPU`dA#kh!E z<82_O6T_!f*JStw~DP23eF%w8k;myf*&>pGpcpGmn^bn;lKB|OftU;9xB zxb^rCFMahAsJC1NSIO=tL=COV@3IX2PdKH$pe6=!XX`mPOlF^?%qQt*tx^71TjTHp3OrFga`pFQJ78?OG3^A{|r>6 z%`~++v0x80R2aI_kHBJJQ!IW>bCh+zjV1PD z=8S>)H1f`4;6hN8!jwn}6%u^~YRWNC8}$w;@d9VcRjvn~vadNqPc=xlwAmmrvn3i!YKk(oZ6eTgk8 znmLDZUdt}_$#E9Sff?qMwB(d=3{)f^m4l&5^67rb`G=vF=GM0S|1nge5J`SOkWW&G zuSKwZGcWT=-u!A_duZMpC&fFBw$2{85_E`T2$BwDa$)tCPKQFy5#8*BhQ(15N%`OB z;Qnv*8*&I2K=Z-V=I1z*#F&A3gyi|fL7%weYQVi2;ly~HYnD$|l=6Wrn4zEKQS{zDfb6Cr7jk84K% z$3VRtFC*o<%>sK7g18hf&xu1d8wDP)z!WyeK-IVXP+joPKn?h3pkjVrQN_l&MTfDP zxwQJ7)wYKR<_Rds!MwaeD+7U-g9CMF1%&%S zV0tMZ2%N|yiPYu! z2cQOE!2E?3umQLn6Fia3HQiIO#HyZQ-M)|HcZ->n0UE{0Mx81E3#oEW=vj3YoL;5Ss7V)wzQ9;I4Hj5qg48(xY9Y^8YV{TiFu@}-hn&2^Dz1RlkR9_;kEss5bWn25ZRDk`sye@kmfa1X}!vcS;G;^@x`Ppo&f9 z2OUN)k#!{Pf)-sg9`!55B;TL=7-!(9VMvJ3hVm>}+P6F1-^1UB5$`|vh)6ed-b3;o zdIX*~tSzi+@Sv67M*)%1TvI=5J(z--)H{3npy|JL9RE~T3#okK(G=U_5tvv39{_zn z1H5I{_gfbK)+t zhIwsZ#kS6GkHA!rB?mx7fj9u_2g^rr09N!35XJ+UYz~0hShq)gIcilr!+gp7_G00k zg2Qv6rTII&ivIvq+%e?ZeEr6W+uJ89cE7a)5hNA;7Le~h0QI%XA&?<|QIeMGv<6t% zc|o-4(ArIq;gN^S0_<=aCm@PQ6-aT??Z?>8(i{OWzwGqES(@7{{yZSeJ&Q-lSu^y( zsM%xeRWQ`fWB$SyNim>1@aMFh7^K0GD@ zoRrUq#vF~5?VehGqd;W@Ms%5%89K3dAZcZc1tip4yZiE}93`JIf$5?~Ge!w88WZYO z=;BA?btAy3sSGNfdlcXsfbxBb7NBBawgk9EZ?{#%`GpYVZ-PK;uONs1UY|#9P~Xi( z3tsWtDsW~EoI!(x>UW48@$AQg!xmt7Xjd$e+-xtz1At-f`p&A?QEe3# zM&(K&2nZDkV(`cU7D52aH&+I`itk-tkQ(R267MoX8+*@8^UOAKdFgimau>d&zBjAy zHfvKVqcSGz8BcPM{vHCN^gei%U>l#%Lyr#|qXRc$7~5rx-BiYzI0iVapL6OeM>FP% z`x{(8Ml)4Mk$o50sf-6Sfxc@<5gp_phA#`xU(VtI5AbGjx@%F$=Txq&M!tmwfqWX` zZ@=J@D?gYnSit<8uV1`_=SgqmgEkGw%MF$gIa6>x_WE{%_Vo{H^QzIfM+*(h`t8i> z!N5uWPr#c|JotJlLoiLCr$bNp;J_Y>$1xg#qahA41q)D|A}Hb-Q?R-aHrUA5N$c~+ z^Twfg_VD}^>c}2`q_km#LggA{!AA6vc%8voIjP_@#`|jU1j^ma7Xjf%)8ZKyGGTtv zjCdf*58(571h&?Hcd-|nl#I06H=N}mz@d*FqXjWc0rKwfAv`~X-uIm;*aUniTpSr@ zH;fff1u^~H&+%Lp`-Wv-e*N`clCp#TRM`2&`jf@ZcgtS>wKRcmsPTk-{MP+lZ{8z& zc6m50-6_n`!NTFaXzqeE=!gzdd{OWz4FOi<`99W%>_e;q(B?wk=lC&j(bsvo$?O2e z7p96EcziIw8x=8me$1dtZdk#F3ELaRAO+t6P*#3lXPKa&{J-`g@Wj;7e%{K4K81?! zN)^*&At>YEn~jD^^93HnPb5fps&7#+AAp9jcwiR@3@ZSX|<)&G^ zqsQ>DX2Mse2h->FB+Nun5A-{vIt3K-JJuKi3!MVG3-|>TN{=QG@229#cH`f`^l}vm z!ixl@4hrS>3FhhY#MsQ1K9IPoa-t{+#Ura|jzB##q-)d%Q*3aWP$5boyk2)$eSo*V z{l|5i`DRW`>cV_W#r*#nhWb70n0?&z$Dey16a|0|sz$5Y)3XAc@95IWlTHc07n!=h zOS~lK8NU>3}ly9e&fL%uQk@dv#g~e zB{LK;wP@^g9*I-b@$h_xMxuVWn<-twCYy-GCD0;;I)5yv;&qP4HZqy!>!{ zr9XQ6IeU^o-_|*{ZK$x*@^JgnLs^~B6kom{y1m&1B3XB2<&op=g@D}zw&0JE@twQ? 
zBUNMdY5%?0LtU@jUlXJD7RL8}v-eo+y@7)6d4+!%>SiU|*pQvz%ia=1D*QKwO492t zygu_`;y)PbZJ&Wj4u)zHH9b5q<@o=>P(?=&6aT?b?YpNSVxhBf91L|;R?Gt%DxiA& z^PJ~xXaAMZPR3RD8mp6@VZpecN9vC~iQ1)o4*OhlxbB>))vAMXN!XVMr@jp?o@j~T zV5lu8(Q*zRRnD(ta?X#({jZ0LPm#|r`o+Zk#(?-VPN-lle06U$Lb`yb6F%H@>cd_3$#1v%AKr~8XN0UW{2JSnr$jHK6p&z`9U56{{$e&K%uGAykyXRjSyY#kb zWD%aSuxO*fPT#>*~R&GjR_3ADPXo z<`zHO4ja(BQLFA4h*+FQmft46hrREXlE8a=8Iwpod%3I422Sex8m#P$zQi4Nql6{bVyR|GY}#4!ZbAQMnv+DSu0f|d@`0-hw``W|e_`1JR;sC8=YQI+B=4mSVX=G5vgfNA8uJ_(__NZM+`D6O>qcC(o#CqNf1e{#|3{I$e6Z%pYt!D-vK!lqu3ZYSXo- z!JNtYije;9Oa_on_&QO!W(7+e6BA*8X!RL4z-l`f>J)-s$j@Aif|V*WrYz$D1ahxt<6hJ zDvz`K&`q}EVFdMI4it*NcK7oWw+yY|04KjhJ2gcxaQRcufL(~*kG}!uz=c#dinrlp z`7Lpvkf{ggTe$$#la$i^aB=-7 zr_Gnr-re%$eu1u+or}mu4Zkr$a|Be$n1r-XJ)-h&i0ImbVHQb&qN?I#u3su75hx9I zWB8@+1%v!>D;jL;PfM%^6RdCih3ojHvq$ENlHRmR{-lptxe5xpuL=W6I7fs*KTxjl zPKpAk*BVYsvZ^S+OW)Z+Yeo?02*hfC$_54Qp3T?0bxNXd00N;JMj3r}e({mrVjlqz zV4l>Fr!~(3a3d|&GGKJ}uGve~+=FqJSQV%$8pO4xb@PIGh#`pizA(TL4o3edIfQm0 zz>DK=FWk#1?lLP!H>vxK_Ou6wJweU5LPWk09vd@VDt8{`T952Ex6-7Farf6&Z^ zdt941txmj7>fik`U$0plR{wb3XZSL@!C#gikYygXNCekQLgRA_!tc2$vHIi<-EOPw z!V#pL9%1NXs>9YjOhd|%JfXN+KEsi*e9iq8zBB;oI&E>G=1Se-ZC*fM);$Fv#9WHH zvTvlOou?22(hld4!1oWFzicl6FeUIsC$>@=hvcOV;NfG}S1v#0Xa8^rP(QPl2cN0E zB8$eTkK!~fPCXrJ|9u-8#Q)Rnyb?$#>S}F8=b7-6GP(`5M={n5+-BrEyr=uHIj=nY zLZv)N$D6N~WY;*nswg7x?o3K}7Dnjs=o8(|8G?v+L=Z{qu;%f^#^X>xD6s==01 z368^d(u{p7VNhAJ8u4J*%pvxn zQ3MwO?3E|H{-V4Qy*ve8c`pc_)7_&@PW^r(EVHJBkr1~a-afb7J$7Ve27xA^^A}}= zSd-cQp97Fw$NK|v8!CT_fI(}R7R&oEmiQ3oI%2=XrI{Qf*Lzp|@dpitv$cG_Zd^NU zza{%&&iL!o?YYCNx5e_{2ks_U^M;W!KJELPVJw6Nfs4QPgTZUmwB30gv6P`tf8k0m zr0R~Ms-cC}6KqXXcfjh6C$zf>Psloxp_OwY#hFJ0cxWKB1i#R=zNgI%X;E=k9EX2| zB)%Antn|Cb!B8)~YyA&~`od`WABOr;`275NBX+D001mqD5EbV z?xA(8;fk(kD(FXKudEUVD7=6}i0U)@;yl6J9#rV7QZmwQ45pa9B`f3vz>RfSoviI~ ztJ&wgO&3?^;;I0ymN6}kfcjS8pMXk;-@OccT$u)eO9Ruy@jwnQ9k9Pg3o=F#7M%0^ zcE~Y0*=~U0hpOa#pJ87Q@1Crd0bXFZ^83zTNNg)unT>N4P=W%31FGd%@1b~i0py?Z z#)BB|Yb{qm{eRwQZv$4i&i)h2eD+RSjiS+S^jiCKXO0=1h9_;iy%B!MBQ~s<@piQ5xKlEUJ_N9Mf1`#yl~>jS z0|js7@uD%H^AuTlv)-<*)2mD&jO4_Ti$u6N3YSi{Ja_yT%qRWp)g2kB~{L5F3FF1p>DsJv0;;zk&wC?=U)ljbHzg#(* zL%;X;MFpn49fjf;sJj+fNPqwoR;ql1pKesB5djK6WhjcpTLL5OCkCy^dr_;hSP5VH1VvD7GF2fOq;;l0>DOPO{^Ou=PCXGEx;qj(86x*8}>ow!U<6zmKh;F@)90nEnnzxbB!@a*; za(@NJuHF2`f(D40Y`1PFp~xz4_*U6u+1B#*O>8tFQX7Z22J%nnm77~v781-ybJRr@ zVQ7MTg>(2tX-hIk}oE2{5p1^NM{7%XYbU*a<+FvY!R&)mt|-e6ALl zwl_E=&sY1N%gH7IOfd2>wQnK3n$l9|1-Q)Lln1=sjR9uL>M~Sh<^phhtK4e0s~A5q z(sst;#)M~Jz({Fd1RAvPv$w*>)FsumJMLxAf_r*hZ{qn(2oAb^9sGLQ^}|9h#2ko7 zf^RDxoYK1WX*wsS=5-G@RW|y4p(X&KWj-0@jtHv|O3gICOno+a@Pm-k`!g`ZAU(Bd zX~$1*A$51Tn6UZE4Z!e{3qMKf_3gX#N*RaO3tB)1 z2S$x1a61urkWQ}`9@K~eOZL#y!1FoJ+6aN4k6)!6&JU1d;2Yot^bkJGW%=dcJ)gW|WG(}e^u5n#GW{W`;e`K9? 
z!9P1x9&LdSIaKC-%RIXIsv;@#C~A!D{kQ|!crB6&wFPdLZ^GON+tYO+I1H<&K0p zA#Sdxjc@XgFQ0_Ig5`%hWd=vXHGQQbS~6oc#{v1Q7zBv78}w=n4(wHj7sA7$;bDcJ z-UD1bgLUCFc$klPG$F&pgBd#v4|9UQTEvIxPekemhz@d#+{K?Ry%E!hKMiCd9|Q(I~Fp;r;}uK?|(x zPsZ}WOyS?t){-zAJnZlc#LIMiUmwJ#FcWzx*oKwiA^+o$%@4+(V5}r8I@;gek75CS&y>7A%VKcenuyy1F}QAevbF(?3dS>*(GGH?CEe+Sa}QFpB8Gd z>2E-Q=4WOU2!uIFh8g*c#c*EM7-oQ)z8I3>84{RVbt25|mq&HTv8E7%=s8bJMr0w_ zUy`dDKK1g{?B>DDWVhM)_L+u?(;6)q6;G!+S+mwOs6UVujD))PPX%MbpS&6J><%jr zopMVEweh3qu2U2f@>yaK)Ei1f!c=%6IcD%gMN6g$I?MtCb4AWsn{&PK8!L$l5?zNH zQ6nr+;jh2vymdQ;ox9ombh5M;;-mk|nHX{>YVo8Hk6Dq<%kqiXq;d0;CyhMjwF|-Q z`rrOWi1`bir@!tk#xj3J7DAj@Fazddr)1WrtB|2ZKO+`2J@xn8)A?9bgnJ9jrv-K+ zlj11}d)1dugZp-TpPiKdQSm*ZKPuv*&G=%)Lci{W=kQOj8S+`R@OZz7jqh+LBn9FY zuC2EC`RL;Qo+4CqB#foV-&>X#m&Io9LS(O8>T%zYcNHiB_jg*Y39Oy0qTyu93CAIew> zxN|1T!UsYKiPMOVBRp7AMT8k#BdKXV<^w+^*nWgJ^tILrn1%7PwkAt}>BbkqO%4E@oj#-B7JD^lZ^q9QUbjhl~) z+>C}l3o(oHi_cC>2hzjpahbYn8c2_kd(v0P#x<{9c zC&v_V89nEIDJfK?kW#Iea?3V_GtB>vf6CpnDfiM-Y6?>BSEkfHOsRX7QvV^PVLau* zQp!WyEvFW~^4a7^T-Q?dc$?kNKk+~R^z8X(>E~Mt&Ofi*Zt?#6=+*YgA};Sx3MvWo zihZ6Ym`Ybj?a)hQ=%tDw5hCWP4EG&D573)})Sk-Jw+~a_y-Izjfavbpd6l>$#7ylM zOdC+x9T-X-*hu~8zROUDSazKsdPYwjy4=aSC?{lGS;<%GpU*CPmUit!+Uw!nA;Ak1 z3OQkt7hhA;@GsIjqWPG!yIKJkh?3hAl^1>uWlwhHzryUDiRIR?xIhd|!%kfo`LMV2 z@WQHXdR`$Hr%u-1eqZYlpH>9TXmk z|He>-?!DY(fxkY_mo_^yy*xq2%z>cSQ1w(wExV2O*1R*0sMj zpBZ`-Ilr#so3+yIliyhsG1_J|JJ^uxt%tvn>OXw2m2Ip)*_kV06t%D!GepnmlFy77 z(HfJ8KEC&IA(_d1pQx3t8@Xlt>vOGlUa_l`*8IDRHo1LHKZ0%z7o1M7N&CH?HQM@L z80xvowgAI}4>mOm^7Wrs4)LDca{Tq>S?tp9-4_c52K3qT^&-dD%hO{w*aVyHt<9C$ zPY+)mzOS_%`6}?s1KV>cr*_Ry|AnDq-v6;96&2C(WFZ%G#SF1M^@5Cp0i}vraw(gN z*^0U5N;#@Gl9X~aYFm`@bXqt63q!qV@-<2MlEtqU<$`@%oBta_Rk`e}`CR47LCe1? z#hxw}s#lL3PgX5CcILV2wUa6AzyAwEO;)=RR{LD-W@PJMwX&GE7V72kUz7g_L*4qT zP9^eKqN_5bQ_$5pnyvpEL)Ex*^>~WL-Ru92p=xkO&0A{TzjGr+v-W;%tLFce7^=j= zd)0#ez>8l_CYpb+hVD~>Ax~W^&qPCsizeUT!dGC(1KEM^B`~r@ZxpL#mG+OMY{qsA zV{R|Ie9IV`6&S&!2JnwY4NtmT`)(_{j-_~9X%W3F({7QJUaI3FjcG9`lyL+UVTy%A zb!uY~%zXPCa%Q*&?>XOjYGm8Z$*7=@ByHtLuFYrDh^C#vmM=u=zxqX-9CWfd9H{;_yF zLV|3zl^-23Bm_seuuq<)HKan(v&D0c3?m1Qxma^=&Q^0^{7`|MczuNBB02(xTc62FSSD=*o^7n4o8bV|mK?RKu< z#f{*{IlJ}Pde5iIo1a8u((K1ZG5oi`E*4SKBE9XcK7WWO-EPRxiF&5=ZRAaZgq-m= zu2`+ns!^lbH~XTz###H zBQF3%Awd`zzWV}ckV;3ftvh^0F2f8Y zJLNv#dB{7T*kc1ftXPfwA#vN|(gojAMW`+U{t3DM`V^cwFh3f?dx`MRyfnY_)U>%LinDIsQ^g;~kx-dszeT89bDC zBr*9W%7HnuB)pe%-H=K-Fg?W?0MoJk_Blo8VQJ*@JNSjLfr1w=0}-)0!P1g+{~lc| zPxElB(j69$mQ$hl?nQ+wr&3xEpjnI|SD8_H?1AIOkf?_(cY+Jlc|ff=Sq#cv@%+Hd zgPe7tv^Fji0+q{CUk{_5`BZ!%x>TyVJd(HT;ly2XX;#Z{b(X{`No@O?|8%zTbD1cdXss+Qkg%CqWVS z?~ZENRfKbcg+3ne*_gYW)_q`bb99iZs@%HUa>je{*-{-BaxVJ06-4p;gnXiVLnEJA zHdZ0?f<4#y-B)_0cFD0{7|09d;}!51Go=6Q@? zK&u{=lMOg;UcbwW6>MLUZ|=1dPgyMRnZ6_dJK0d{xCgrpnLxr!jFCs>+xss{-Ftf_ zC^h`I$?u%oRM>>JUAG&u`Qb~{8QT}w8Ljo_pI-h>cxCl{^})u=)t9*cVMZIta#9oU z$y-AF!2nr;hRK{1`fxEi_%>^Wjo7j2{gybnh?AdX?l+cxnClU$Bt)d)q*^>O)FrF_ z*d(RRSF$LYUPNABG){Jsz-0|pLgqM8R4$BfkIlC8-S^)8b&{Q;(AmD;vhh;It}6E! 
zpe#LV&Sk}FO0JwB%Z*~*XggIRK6}#$<;~6CN$f1ob&?=-|=D`1u%+yptVw1)gV* z*QN|&SYxNui51&qWBAnU&@4OSGV=PVZkr?py=lB`G*QT!2ra>z+Xo<957?GL^eJ{I ze3ei!NMr?U7J_~{l!`&~%Xl=vun1;Kka%sPA$p;9vW@c6wRXmh5S~dV+eVzR zWPns~fUHx8b~IUHW}>+fVn_kYH~J~tRPRIg$ghx8cB+IriE1c_mDA*-MzXqYpcs^- ziK)K2P$d)?Ams$sDZ6s@&Q`MqzL8wXX7L@oFO~jtL z^=C@ErE)SuUaz-9o8^!4t@?XJMLv`yeI|%V!J$7_V3nrtFd247iE0#b;4seECs25W z>MTvQtgzp3z}YKR2}Xmo&f(m@c3IR1i&4Oad->X*K|$7F6WUt~6_St&Nk<88;x{Q` zJtb`lQ~EV&@+OE_Mpl!=qRdtLJ)nXM#6XXJh1(;w(KsnDy0+|{6HJI6v_ncd=1#1mfH=&@RbHc@vc4goF zXGnghBn}3H3^s#N!#@(#C=&HFL@Crd6_B~aFhxRT5m`>djANO0wvw~1NsM#g*X9@} zTnmY>i~|PRNiC2yjVapwou+LxginXvS}3*W!G zYMeuZ6fr+A#d8&Yp<)qtozSBmooM^GdrtkTizc01$X`xWVxeZ|EphvlYKO~e6h8Bx z510_K2q#-}qHT7q-!x7<^m}Qh^L1g0$<7TE<}6eWtn5`|J57?8?J%^yYNZcW=IEl5 zb6%65l@~OX37p@@4n_0;W)6E(xD#XA`9i3(Ni96(CAj9)HFPmfeU8W(TCU^4P`J%+ zDH*Dttkoj!Y10X=(Fn(W4hu8wG+khW#HsY1A7R*hhP($zIA^+352@ZR|NVD&g`2M#oVI zHsZ{~B#jkVzV~Af1T}%6$@AEG`MfY=mjw^swFKm~Ree<@`&dx9eCaQ7bq28FUXr-DAiD(;k7$RQJ_6lM(>2E`ZsHVK zY>deQDb)=pBtj8b)h7=VxEDaO!|J$RJ1Ol>16i0L6nlY6HRW@kwa z3PNq-gsF7X7CX`QMNEE-6ay>%f=gl>G9j>x>85r5T9SH)k$%{OW9r+n7Ce*a_u%(c z^EtpMLwpG=*fz_%K-A91z>n)Tw)}8UBFDam+RjP*67JNhS>TeRE501iV!3Uu>k@S2 z)Qhfajk{@M29U#G%9%^c^7f&^6h(>7mE#PZ*%iaKQrlrrrcj?@G0s4uSib*^bduhx z6VXV7Y~mxPSo-kJn<-}eV`~+#M7>^4;Cv6=IRFEFiq1X}w2JMtRX@^q~M!1Nm+p({F?o1)T)}3n8}w(gZLX z9hPiG=OXAH5P>G&Of2E)H0z#oHhyB;-DC?T^X&X|mud7kPsbFK1ZoIZ-ZrH}%nv(G z9v1A>{aj<)P2w+G_7X`-Oiw&xRGB1Vj}(2T8i5mDz;;w)<>D3{ui0s|5mLyc)CVS? zkGtqOo=zTYwhIo$wXr0c3z)q>ojae@-i+!&rvvqZCV9HWzZJW}jo;lY8 zKh7wmZ|m({z?Ad0s3Mb`@>{;SX*NpeOYuP;ZsSC4lIjXLz$iq_n3N%CUiQwyo7|Fp z*q-^4v*C5{GI5Hj7>#-cqI=ddf{i%g&7*IuX|J!5e2uav47I6wB5#)EOvR!1*_&E* zS_hxJ-uuKhwCLnE&h_;N=H~l-#g{~_Nj8{d?H4|_u5JD=1g_~y`TIY|6EbDF>Zz*8?85_-DL`#@`te^y>6UBl{Z2Lj#M>;MK^kL#y zB#R>)r$$tzEdZ|9Bo?exv2xq9YKjY3~cM<~GxUo${EM;A0(J`2>EBBH%b1 zqDD9MvCH8yYt1f{a9CWiJkc?GVSk>o zGf(HxfP%{v)niZN{tAbrq!*hp0RW6tHUXot0&E1GkFf(E_&(9x^lwmjS`A>9XmSb z*RrWro)q~X${`m^&#npnso+Q~15LNU&s~X|jZ7{-^$9C3huWX;){%I=7Xlps719|#5mzKy8-=f#zCsT`* zj(H)`2==zj~<-@elPer?Va3N@Z6Y6U7zuLpJ>JUOEl^vtp=}oLNNTOl?W+q^ z9XmZjtVYCcJ%zPqRVT%%Ug>(c*ev#6v7vF2f zPCYkhU?vyed#YgU&meYOF7dnKJ-LLw*k@^*-*xo-lE(LO?X9!FUQD$4JC(DsFX_WK zgYxUYoc-mJ=Zl8g|Nd|vZ$H0s@7ms9CVi-FxqUHNNw#ZiL@nafs>S17TeIO;g6W5b z(;*rxM+Kk-bZGMy!HSh7&eNSAC+fkwH;e^+!sde4m(sUIP#_Rza=ubw`W}P{=bgg> znuSCl4h2 zzY56*7A6FbNOtb>4`ev@;)QR8(lki=q!XhATR-$`@a4u_j%c;_kp1r?^^!~(P;BOrrHtWrajdyjtR70T94>r~-id4!$jP3+r42`g8sWKU_jAA2j4 zkCoF2rSrq91XMrNhyYK1wyu1Sxs(`4<9&JyxMJRX+@QA3PMM|uP0jX2?y8Ke8$ zt^vZIO5b~hn3Z!M40gG4W3cnCW%Gz@RG&zhna`~G*xz?}fz_M8L{A&_NF9uKeSCdo zO>N(^9=Eue*h=W)J&O~_{t$uY^3VBbL$R$sa+HX~YQ4IlL9W|H-PJPjFQ=V(e|b{Q zt=-_2`^o?NxkvV+xN5f7ef0HXZ~eB;UAvQdO4FM?^7fEvkXVyfm-YcEFOLMntJNx} zWQ~p;sq_2dwtcwk=<}~f9$pZu4-lL8={bJ%afMhz*pd0;BVUg`YZrSEb9LVL=W*{B z-;XqSz1AG62DQB0`{UN@1M}lP;lq#@Z^}8c5GeJ{r_1K3TncRA%k(lO)>Zs*ys&n# z#kXT09tjha`bidpTh1v_R4IRxF8I-&Bc_xDzS|e;^!Fu`K71E{_B^sK*hs6%o>l#f zeyZqb{0ZNwjz3S1<8i#E(JoK=XitMM4QGyhJMvR=11{4~@(r`fwvpvvsIMbMl+eEa zFjQp@WHIvX`-66IiC;b$2|vz5gvyx3@5bodHw?1bQrT44a>nnl!{J`0r9n>`#%klH z!Y_#@5t`v@I=Ugx$(yye1a&;_n0~Ihc=XH!RLAF+fc_}y=u>>7ybWET5EHo3F8L<> z;8JKi({voAe8FhY;OwAGkizWxlOQ~U&(xz=#!IyL7oN#?qn}R*MI_Y8Hpbl~=JBJ5 z#7j<%@n+EzY7sQaqi@c|l6#zin?XpK7Wy^kvKh71W1WYO8iBs`9ye!8en_7A#V^_t z)TKX($fEh59bSeeY)TD0``J_>^?rEhw>$J8s=-ut04kJ(={fnm)?iB?EBx-c)JHMH z@Mpb8;Aj85YeD`Azlr~`B(40!jL@iVM!hJP%?9Z5*|BjZX(XwdATFdS7eVG^#Kwy0 zfUyv~L9;kJR7K|N|HIyWKQ-01`vOlw5|U7o(4;3+sZvDg5UNNspeRisG)0P3MMMcr zdM|=>1VN-rMEMS8_4htXYuL4LFD}Ar1mxV}S9ZxX z9mZRw_l@jALL1HZc?ynM;Llp*0;xCD~8!h;E-_-4j8Pw%jWhLlhy3I+T 
z=&_$Jw^rV^mO+sI*w)h~=|2ro-1gR4=>v=S7G(YW+ItUEI_`;921P>zeOqi*m|p2VC`PWSt-#D9{<95-V2PE->2w>vs2wschXth^J3BEL$1Z0Z(T z)(tw9e-e2cdo+*$==ps#hXg7cbLj=Mn2G~2lydW>Q}YGZIo%7+^DDyD^O1UJ1che! zF3q=<9+V+hESdcjMGv`(hulbnSWzIhDColxutOr$l>&9B247i+-cO`;m8N}&f`ukR zZsB1L)vuzZAwd)f`4tDR5!&sxuUxgJ^gMj|62i|d#Mlm?o(OOua_3gVp$2d4ho6w} z^j$IZkNW68iZk4|V|bf5G1Z6mn4=#}oZyeEUtVt-OH5kCBln~k+p5|1Do2E0Va{nR z=oe!2!z8y0wAi0B6LE}{s85lRbW$5|`TpkU!SucY_SAg#b`)`arAcXnSxbglp2!k6 z$f9Awa-oJrqo4U^68CMplv1Nc%Nll5FIGhpwksw_ZYHc=GL61oM^KYiVFT`V0IWJe zu+|QKYW!XJq)=u}13|=|sYLK*7hPE-%eMC(HmozLdY*AdPjsRhxd|*iqs**i?)Sft zd^mFmw6&P6B;GwQtjQpjLl#T7 z&a!*<;jTUl5hp-fAsDH_>8^=kPm(MJ!ZwS`=7MRdGp0gyZBhX?<12K85 zNHZQio-guYRjj5KK1L9WkUd+G{JPd#T)kSnBbiZo@NB=ScxEyn4^ONvJ7Y5sJC!rF zwd@pt7oIu6vhj{`^3J0S$(S%Qr(qA<{ycgn7|l-AL+7PR3$F8?m&MeaVGKrJs}d9g z!n#Riefca$-_ZM=t`bXNx8w^wRqSD2G@YB3FYeG77qDxN3fnRh9p6{pet*$xOPG)% zzcL_yH|2t#PtcQ;%4lYVwTZ;0cN@rG-n(^|7v{J!ybm{rL z)(cV){xj`FPAUV&^xDN{p6+NKoj!*;7mVK2P;n1t{Pdm^R-k$eRPVs4Pj4}`=bzyX z{%Br^$=F~jix#K)p%Q#d=Cv3Jvaa;!6fce%a{F(;`NIg z;5Vk)H{bIZTuWexIqi8#ASKhwHYEh?iS?-dFZ|a?K-N6m;SixB8>k5Bg zotSN%(DCYw@k=`6dP&e^Cg+Zu~wj1hT#NzMvN>A9>=7cs7)7f zrvug7$QXMKIqiA%ufE0@KBKixHl|b|3PFmw=F#pv&CxtpJb`$H3YVEMv8Y|hOginMZw{hLHIX1s*m4RbKo0M~^Y1}wJcAX2AdTA_%tY(8(%p6#% ztXZ!+_#HX#X#XC#KFM@LE9o04xjJCyn)-!nt~-M^A8t6Xn(zIPJWZ;<_H;)zxykhYUbvzl$vz_y(@7PhD*2<-Y*?18V*+BKttI&Q%a7;W+ui%zzTUeMMLh%ZT0 zLW9;^7QPN-(SNT~j%6q>;I;{7-XFTF+BW!I8vJ-r5R9w)Q!UV^Wtb8p;Z z-)p!lKX(_{CLn^) zAu;KxCC_)^{@E`BK8FpLd3ND3Y$@#Xu1+EB8O}1oU5KJN)DhpAcss8rIJKKv*lohi z6&C+Pvi}IH^f?U`DGfOreoSSO^?6aIT@t*3;2dATIG2BF_l>tapLg@_Y5OGMq(srA z5r9eVm?(qqmB8Eak5)|Cr*BluKCS;1{o+R#^2@XXQ*%T@`_wQn+uEy|1$-?d8^HokicJ_9s3U30#kICvzovsmB z7a38e^2GD|1R0M)P!;r%_!P}&6ZqKtu)k0H?j?X<(~x}RQJ!xL@^TQ z)Ng7E{AN$sGEdmcNH|nTxW&)Z0kC>5iJ2iMA~}+nKPF1?(c!nsly+iOJz11R%k`pI zGUmbE3d#Le>W zPQRmyaP$jg0gdNO>BTwB0cX09sn3WE2-8SN(^FPpnVCzOmd81%7SWn#+=T(JUVVDS zBm1iOSy^jKxxftuC4Bh}vKJvUFHfnwUa8`=xW2q`s-m8=^7wkCdO)m*i_KRb`v4$} zeUTtYE!gg1Z8)>MLNRU!*fsJ^$O7H%g6y+lW^3~R zhLeLjvkO5t9#-^^)|+M3OL5lEKjpSlZiryIcsfzG@~6SJwZXl$QM4&U+-7cp$@!78Xa)W3tC^^16AbF82Y&#bH zgN#@QNf%+c=IO|Wd>-w{;UxW;R;JA+eN2Ydt&|#=RKhK3qntxHbjs6g*yqn z3t$vpxH$6=+n&!9pmFZqYwm^j)lwRUTdj3_)8wbw{aG{pdE5if-&{ER#&wY(HPUR~ zts!us-u#*dlYR(oYE}XnEw2{_Y9^YPX2i#0~+%~oHdG(~*qj-~Muxaizv z>g^ya_EjjLK=$sFR7!xw* z_u0Hn@u1%wk9CaC%Wi7SeINH2Ai#*EE!PnCCra|UaZIAxo054uR$q30^K6+3YC$yF z*RmwDCp%kT*p!oc?K`>3N%Z(X7eVqP4V+; z+G_poWej}bf6Jb7SAeAq+Z|^spP2V&1atiUv@l0q8$7|O9c)*XE1Ye0@~fA#X2Zy- zlQV0VRV8H(*qKKP2GeoMmAZT(htqQ@1;S(s|AnC@%{^iGrW8QMP>cGK1Pz|4M9CJP z4y65!p`L#+lqu)>4~F`b>-@pdN9jq~yLH3^W?IL~YR?U`ly)vL^r2O&q%Ge+m$H$1 z6f^^8Nh=m@_qj6oOf}}BdvBcb^j{ci0kJ; zICg@zyxF>nD)_Us(+wxKh2d=FhiZKntCmLpg`wWb+BW9X5gzw}^ zi|RaDIAl~m@%=_wneIqX?>Kos_FcK$1FvfA#T*f|p7xy9&Cu`EK2_u)%WbP$bg zl7UBe$z_9z`e|pbu?B$WG8*h#pA5)@*m z*mu#^^f^!Kc-8Xfb^=I=Js zYksfQ6ZH6WIDrYjFzT1S@yAP_s7O)-3eDr27waNVy}}+Wt*3;GbaRtO(tWwT^vaM? zPtSjEV7}4txur3!p$A|`KQ@E+^?f_jiou+j&5`L1+ue6&4!)iFV}QNTHzYn(n6 zB6C2=KHdfKs3cy{no5pgy|J^2;9J1&qSRdge1|)`KIzJv2P5N$pWJT^Ndi>z1z0oK z8h0ab*&+$&$6WJiFp*V}rJ788Uk%i32oRZ4ku$dWJs7YznZB%#%bq<_x_rILI$!H% zO(Z0QUlu6}D7ej9USPTTl*Cl)=nu7ezQOak`d$|ET!!tOPn#S?mY8`tZDfM&)<+3 zeFFv`h_0(ixEuTR(&ADB+|2-p1OPxRcx}@GCcz;3V#!Pp*;+q=KdisA$^&lrB-T4! 
zNv`-39!{dyBU(+D0VRIw4p~fWrkBAMb$p|YewcMLSk}IfzW^$}O+75-(QsY|Bmyzs zuB;<0XJtxU2Im6L3oKCLZ> zSJczK!f+I8?Ylom!q?8_)9my$oXXD^pS5g(SgxDW%6hZYNs_@7)GAdlgl$BX=) zXMpvHSF-~KBanTf#u0la-o6f{JffmTagk#gVtz_FfDqQz=5fJy+ok-^Gk^?zGJZB@ z#BVS$y;%JZVBxgHRXy!%F-5dk>}vUi>^tTH!jix~ze+$UPBByQMzcPV8w$YDYK{S# z+sz^zgo@DV-PcdS2$+Jt8|1UnRGNdqbAf)Ct2yW;b3*)PT{l8 zop^t_{Y(ki%VszFA6`{4r=LZH8l&b>Y<_WFNEu3%OUg=2X8Q`ug7f{hRiWW~SsAdI z!lxIv%ryxCqxJ39LgEt&&)~Me_kZ-Y+kQz_2Um@A0>v|=ZhNrHt~Vgsw30`Q}!;RIk@f_sV|UWy{Sp(_po3Fh$bE4X-3xsq$cjJ^%?ZpBs#}T znOobz8od(a3xE1RBk2Uhp9A)61&`X4`9cS-GEHWK-B{`F7{-yNc~*={x9TU~S7K+1 zx~@<$)aTY(fu0WMZ%>71924RIK~FGJk6&x_xh<-EWgp&x^w4JG5z0>?Wwd<0I`op( zDk~30c^7>kAIaA3r=+itkXfV0;xo^~eXsEMi1ob~eRW$m=2=^OJ4P+mlik9ffDPvD zQ;IkF8JU=XjeN1A?W~$;*>Q3F`Pz<`!J`fZyNi+!Z+9#0n!q^d9l0E*w94lVVU;|^ zX9oARq@L~8mG02+$R7w!)b(EQP+v&--el64HPL=4waW#c@~lZ6?(VV@c`539ebsMC zE4%66s-w773tRz63Kc~Q{`lmz_@e}YsZO>_O_!w76QP#eK%VE~Ib0Mkq-!7_7s!ie zHQ>2dy!lQ>?={?UJS7)5KmmL!p~3lY$d(zg%8~YhkvOyZPw1j@J{uU?xW%w==!H{zEJ1^iiss z-(G0PUAKi19ivGy-zF8d+OSmp%us#iFRGa@X<$PKcq9 zMp}y|q~TH+UWD#85eg>2FVBUYkiuM$`HG!G8=ffTN07!*OhV@La99U7J#l&gUyq?nCTlagwn zGAheo}jVXYLpDU!TUM6L4Bo`)hX&mj>72`5CTBN3^w4qUfO0{)D=u@986h?C+- zIsX$W+MN7(KkkGAyGB7yOM{N_DeR)?nreIAmv%+fKz-?CGf@M%>Qi8mw8JKrM#>Jf zPD%by4Z_&LrKB1BJdXjXK6^lh%Fjk?`hO6$u6Y282IPExDYD%JY>LW>c7i|^ax*{Ba?NMY z_UCe&CR;qng@%M|xLE#d2Fd{ve!a{)G@&<#=WYMQsKZY4b6K;ZUxNQM=fRh`d-B0t z=vo`*FXW!asdWW$R}1PpkF9-r|@iap=ep5_&}lLa-kIB zxs2HJ^ZL&(927?9!|r&381=K6#V~bC&soYKYfJe$cZx1W7hNtZG8iZ_S}wYRC^iu* zzN%krcBlA8bg@NQvDHBF&E;ZS#0#z@kR>ihV;Qr2`vr?WLaT+wefh;*M2VMJiAM`s z-mc_+bjgFV68UPh&vHo+qSVK-3Fco{@hQ7uaoIA1>1 zFX6(yOuh3mWBH}>9FS`cc)t2&z$XwFA1H76Wr5OZS@E-yk`~}g#7oP3$Rlj7%QA-R zV^WjlBd$^UWk5Muth`I9{FQX5Lj$n8tb8z_oTdgnh^XlNR6fF4@j)rH-Lismr#zwm z%&)nuv+J)ua+XiuNywV7h~}tVC}aFK@XFz9>MzSjO|Z&kvGUaej&c2}v4IMfGCy4Ut7LLtlmD_i!a=a?H#46#gm0tp?N0+PDKS_u8v)H1Z8OkC}F4aWTRHLmb zm}AtrNKnu##=w>;`SB{@>2j!eHGdWxCaX$1hFzAc_R%uuMe!Pi=^ETtCWk9J7NERn zA-FXMweWL8J>iga%sc;Q4ES%;#NpR|3-)~q|<6cf|b zlhpVT(G+M<8}Km8-nHoyqS@K1wfk8^c2Y~LRTKST)2oLq3_jWYE@&4l<98C0ed6pI z&bv|wll~R?-Hu_60(a3Suw8s(l+;4=ur+VAg>ukRhi)HLYLByOEVHVt8*N)TXv-^T zYG0`*t+aQxwn2PyIJ{uCcI+~`sD$cMOQ=1>#ZICw5!x|s?{}{QH`@A}uKkgC`|H6L zlv_KKHPv#uRogf6XBGD^9VFS}cgn@?;H*20Fi)rD1g!)CD%5mUh` z3OoqUs8s8O>FzFVtv7hXp?cJ%Vn|kVBS%qhI`?abz^*f^S&W~^>t2<<+`SMVWF1yZ z`;h%fCfj!u$#?^S&wk6cQR89V>k`=8P3U!X%96l5nC@fdZY6OJddF6#m$Thh>F=JeASv**y%}K8 zqR*#iN$y9Uw??%kLI;P2hG&Lkw;>i#^b&4(Oa;A%AAZ6;1o3)Xxr%`-=bIf4psWYA zRo=GJcRZUBbg~p&eKS1as{kd#t`pB0B@u`-)Ra`A033%_*T^d~Mi(rk(u^?z3$aHb zyGbJ!c=&G#G-*a~PU77cw<6}@fsE|__Eid39NWebac797m9q!?3(2EBEEb}zkkKktx z_$5>dUCVXsDXNVqU^;gWf_|^|IpX55pxp2Wf{}=2r;v>%y+t+sQ<5InADGn{SOgfX`6(Nk2gmE=UI(NwsRNTH_Ki%r^JHQ?`9#JZ0{OQ(4I zn4Iyjz2Gq<0*lZ|p|d;6{JI4hE7oIhpDK3$RKkD>a1dj&B8=l$8}EZf7Nr1JKh~K{ z3Vpt?L=m$%k2;!z#yQCq#eM2Bn(2AOURE*n{vr6-lmE{;Ej2gv%JcKu-qTm&KK}VI z_0yDvK*Rq0QQzBDmYhRNW|)Y+73$j~^rvj}0e+Zp9=(AdUNqwBjU6%$9v&~B1(T5< zVmo_(o$~ZOLb_CgweT!UIGq=$kpyN%gMtxLz|2#g4?{lgVFFNFPh|Rek_T2W#Hee_ z?ITAp&mUFpS?t0s0^Qvel>7oGv3!AkbOAN$0^V?On)f@Jfgg}Yj*i6J5+N3L&4YHd zrt4w}6|{5mJiJCI-gx%3b;yw+{qL?31#e_h5pDYh|8?TT^Fmq|lHQ5C;2zORe-PR0 zI<1u-Sf7quZ*c!aMh=`sbPjiaIA-Q|$NFN`D!mra1xUwu_Ec5W!&wyZoU6%=1=bpR zr=pPs+_1Pe_&N9VfhTx^goI{zUs{{3+nv8<0-va%AB|-UA;V(xkp*kW7x}b*q)(B% zCwGNA@!qTVSU~BV)xy-Tu6_J~$yFq~mtpAmsEARcI z_C73@LU$zXQE=`HGT6Ro^Bkn~g%8DPtXuAt;~3#CK^Rxf?wbfXK4!k-{vloi z7J%~pS?IAs>n`_3=%)k6l}U^ zOx#SSyOlHH`fJW3ZreNOr*qO4sD}P};^Yc+%vsHZNf)JMyFGo^(ogbZTOqQvP|Qe5 zp>CbOkxBf*S25S8P&2~Zd^`B3$%j}$gut;l(rcU2PElQvjttE{6zn_ 
zGOziBfk$ooEmBX@Ds~(AGj%XtBq+A_+gKx>1%`wa9AqWdS|kP-%J0qmHvY8%uUYk;JudXPu22nyLRh)AHJw! z1s?uU){a@^3vWx0>^az8v2M|t`B}5G`~g#clAg-!=Xd#yv90&$g5oahX0NY!n`M)) zE>AO+?lAj+3|$OYS-;kz5Da_Tg?JF)@@sv z{okdW+GpscG@JRMRteP`QHQ1kkj!+Ro+m0((QdgX0pPW;TTgn&J!k`&R- z$?p;|_HsyA#cb{|++eFwlNAa~%{;^WNK8*cX(YxIlWnw`}F6&7K=>=MRnqi`n<>7tHKZ^`lTDsH27n4G|nmnuw z6TG+eL%SqOM`>gq#Z%iUX9KUUHiYxW7?y>n!$dJJ*u+*}Q;NXn4Q1X6zBW8RqJ7@z z!erP>h48RHA$b`G1Zj*CG2^WfchWy9~UU#u~xpj^U; z4~(9jl3G0+ZNNs1uOwu|FO~e=R2u0&m-;%}W#>(tet3F~X@R;u;qt6MixS1)BIb#L zebm_(P4+QT_u?c(O;a_a-Ed``RST(fA?H` z_I=YiN^e23ISZ)?xR9u zO+Q}5KYio`7x#FhOq z{|fU}A?+kduvigh4V$k%QzPy8xr@_Q)&_c^NrUkoB@|O+#5Q9``+Ev{xdSxcw4L}w zjy>@KLcc(>oZKl#tdL2_=WDh1=hUmgE61IR(lW?2YVos;MqbjkXL4mmzWTVxt2fQ3 zv`d?{D>@o|quZXXzDeiYz0nw3umgLKg05`(XzVR*2hI;ox)&QpmXr|Y_#YVR({4xM3(c2}?~SG8 zft^Hj6!lHh$5KnQoy5$V^{+RKrB#JFNjNDQSWb+kH+DOn^ZPf3Djlw9XwUXO(QKCu$PH-5iCKEe5Vn)IRNTbH31O@$TNl>;J}3|F;;b)&DOH zbruT#zj&i)6eA-uf|Z$xlbw}~3(dmKz{AbSkL3{(6rf_G+`=qpMZ_hr@C$4!}?q33_@m~Vz)vKmfSB*?L$BMzUgwRxWv_b|P(jk*dmbbAly2TH_VFh^c<^B4fogbwx9KA}%SUSe$fPkr z_kRV+QDsu6VEun^(i@=)exX+XA(JXoWzv)W>WY%1RBcf8S5qOD7kQD9OdRVMwI;^?0GFP8LqTC{ii^}l4&4A+ee>xY@v z|Cprz21!G6?JILrKj->x{d4l1}^Sol2dx%=jGkAE~$Dn&|lNU8j2Q)v!W zAWeViJ6{$?MM$?^J)}CMe+#66RDrauvgjX#^mTPwT21ixnz-%Su$QkRx?Y!L)P?^C zLP`}#V}3TK<^64sF1BRVwI=Phrcn9O);F28ZDULA1tlFBRUMz&NLl~pNB__EC>0(Z z>VHYKM^^`4?+v_~9c*hEsu&xt+8wT4d)M4HQuCkH(f>k@{>vL3Z(01%^>1=?=wsKf zj{{VA^yjCZgHJvGkfURt`>EvU!R+8aWv=!oTZYZ z|Lu+XtjrF#HvaxX3jKfcMnC-dU%XLK=6`vkFr6Tna4A;D^*?)~_%r|68-;11sop3* zn=l)n7FynIe?CU~#hJnW!w=I6o2p#~bZ6Lg9CtL*UYXd84M4lv{b;(sJ(f2PlVu!!qHz)#&d{J*_X)>Cq*6sSM5RDEU!A4c^?`~KyP zs$}3|^=dQn!!*2M=G|5Ne)gJY;G+VU&T_rfawa3k8XDt~*U_O&LVvVD9d@Yuh+(Lh`hy>>&!|Kg1b10y>@_4Ss|)7isu_Zd^5=3pVNYi&aiaSUUY6es#QNcI_0j%P2oXOBe6EwLzA(d z&kSqu)^XxOlP&s4p|U>yw;jiLP52~D{Zh5McVG1r^=G!rgTuN472CK+gBy+Na|fN5 z7C=dw@C$)wTwWRxy2YUN0ShP6ZtBvCA&WPUsvls=CX0C}HZ%LCZYzo}jaM0UV$ghr%-i>h0J$CE;1#RF13 z^$rkYGVFq1)upFsFGlBs{Q%yN0fpKYz8b-DA~@NpDcX`W`rXeYOa=Ix+SEOo z>`WX~+PVO4*(5E&;=^rP_(Z8!h}N@|21f|}QtN#?M3&_W_Sn3OQ{gPiFM|Tco148& z?2i*#3r$YHR-hi~V(6$l$~LplToLVe8frYX@Jwer{Nl|fL${^(F}3+`18)T$E&}zpnGzFrU%iMm8RsLx-jr2ZzMo#WsTRot(9oLMZrv!fjeLM{)!W zzp49I?+m6rw8UjGufVafFHK^L32cN{D1$=Xn$4vI^$STaF8*RCrlRB$=S@<_EJYby zMNu;*CcLeV*Ch5fOo#v6DFroOLyWAqJGDq#N5t6CaXD)_9)#KnPtnR7-AjBtWzRsP zf>$rvray>VD>=s^)-xuCa(N^X*E9AxXf=AW{edC&Yl|i4F)&{Jr&L9bLTKu@$(pyx zbQnND=2bM;cw)=0BppjqSfGgn`ziYgRP)YFZg%3+W zZ0=@N^Tlz@QHH*-Tk$ws&ojGwQLfeico>uL%!j+7R#Oe+>h#3J8_G|c4dB3Qyk8+} z=lQJ?Sq4Thj`oXmmkWxr^wrYY%daNJ%?(4>g0?T&hFHfw zx*^W?2CSmXyuTXG0>*0QOcr7jm(@81nXL?BEWFz$Q;;(*zP1Km>q~z=J;~>9#+}h> zXDez15jH5gIvq3>XPRj^ZqQ|BP39VpwVrp*V%n{-ar6317EM12xML{rP@W@%i&hvQ z$H;fp{#t)BA=0&9jko;HXr+~5QBXHr|bFD`Yix!L$+C+t{*;S^Gu@?s)C)x*;r=hPZeX3mfK*6nfUw?XJJDI)ca z+=r043jzF}-kwW?X?JtGkrz09z zMgb|Uk_Iu;@9&>N5<9;?AfOKoush z4H~wMEAsmvCR9J~JZQKLx#`a_k>>Z(k>x&Qc%e)zTG`C_m$$vS=ZD!e&zHtWE)b_V zisbJK+dn4upS2yE297lnk}8QWF=28m`39(38xzKCuGjEqyq90g?`|ZO{5iOHM{qTh8xo;1ul5C@J3qiFN~8u?4l$($<#~9^a6?7m5`j__2-`sY}lr z3RM(@vlm31Hrhcq;}Y!|3_nzB(ij91uVW=D+doX$3t(;(Jfk8#03ZZRJhF)i>bfiCdvw23u=&-xzQbN+w{aufyD_+ z;|YsUpj<_E(iWlINySgxn<4^XEWS(F#t#u83Xysn5fG-xu#Cv)CFV^Wef;TuWTGDZ z+4{e{(RBZ)jEtyfRBv=DDtABXFK<*KI<$b1#UFi43PQ;Q9a5qzr*4#$UaFjmE;5f{ zDMajtLQ*o&`a}n)2EDOA3QY6Z?;`f#hagL^8!$PfVJi05DDsIWLM|+loy-pd+Ef4I zGzE+~($2?q7ofqh%;dP)Dfrw{?9x7O@I9LLYGe^b^EKF9e_d+emKD(lDh`17heBvU zNclPG2^iA68brKhws|d~@jIRM7`9shwFLGlgwUif3htAHj3N>PeQ2EW?P)g>?@ih+ zW9j6H_nA5w*0&^nHgT?aq89U!pqgY+3nqn0$dd-+a4e1VDbC~bxWpQb(9=Bws^&TT z0@ra3ChmET`-6;%9o=_lX@PIZV33W68H)F^wl_(QwTnicq$W9@N^MrnkUsbDYvR(F 
zgpRavV7lh>_qG`TAsGt}r8xxkC|GF@MNff&M_~vYkROY>-;WZZ#>Cbz3RGps*j+e>5)lN_0Q>#TiPB@A($r+r z^MOJ{(N`0*8BKDYi3GOLsBcp;&ZYjPi8h~Z(Z^iS-`_D%EXwR~6ld7X9B0Z6OjNcD zlXSTEtO0(T_odg1>k=b#$n{L+Hd&>9)2v1p(9LZO9+em9X{3^$zbBs^KaYT7Q7G)w zMI2ZHpK&4$8N`A4=McUxX{M!7(*VG9bvB<;A;_zMLkx8=2bCZJ-dmvju^^@D?BKa< zzh*`y5@?b5ymAgIQcZV~_#9O8T=n32#uQS)Mc!;&MxJ<2@uSQ=7|j$vjR_SJC8aSG z=ITIb#z|^i&%N4l#rGp6gU8@K_IbQ;b=&uNV@;$u(bf4*aa*zo%^eh^MT`uBv}2VN zd*k)(>`_!fbPh2sU2p&ZYM)YI%*#j=32>DN46bHO!58>g00DR)s~ux{^vjd#(zt_C zb|Nqi4?M4git~J`1OT3pKpuY+q9Rm66c1!yFQX@dwDSQ~oUcx#U)2Y^YL1R-M`@`1 zMToloMTpWsQVJ6}eI?r0&$8F*b@)asX}hx9+m9%THcckiKVWejibh2V1;C>&xIp6MDl5D?XoMe0Dly082-|B4jG3lI>7_q zaguWFZ*r$kW_RAZJ8z(-g9x}{*^WmFxzJ?FwmR(_Lz_~rSMfdpxwZMHQeD`!M6WA>@ccn|3Q564aBhF z!+0Nzr6g(q7HEHX8vn=-0EGOdhynu1e<`BXWbPRv*H{Vqh2VDs7Z@mr6M%b&m8ynqE_do@X>xki&H2qCH^m!e$g97oE?&C5<XC%-M#6q(Z2c}jz$bpRC z7s=jp1I~hUKkzUoJJP}`E#7mm`3>3q4OzVNZ94s1-qlPP8TuWEe&>cdSno&4&`o9! z11dPUW`=o=hPm8yxeVVetB}6G0dYwfuhRF^{-U+nWayF@;gXcM( zi+GJik=J3#cJx$AG;AF=@iA3^X#-sm`e+ zuW_y!3F&o)Uv=bI{5wV3ZufiCk>_qwJ+uoTB%2T37$S4XrPI!jIv&HmW1%N_$lf~m zH*Ug{2Rb2*dKWuYj~(_po^1SuY_33F!6MoKuppy})1j%pVfI_ABIA_KMz>L(nHkm%d? zJcQs0CowFvJVL0=A~n`#$KHIpjfFNPPMytNz@MBk4%CFYX{_Q%blmEzp2n+KGN6|c zns=@eTt`zR+~7BrjO43>d*T#S?y=4&Cm45KQnTFK5PG(J^M5Fv0?WfvM0(d?I?>x zvS4dTD~~-w60foM7w7lq z2xKR{`Z}ei6M0HTOR=-qZtwp>#g9L1qCG48{Hb*=u!=0jFQ>rie=ZmvW-*U5Ix<1`mi&d-@anY zZt`X8YRC5Q=qBgsW>CS7RnR8Smu<|*j=3$j&FQ-1mov8#cJ03G*q`n=@EmWy5&X8+6o{cQlZ9^s1_-r{jo%IUZ9#mK!_LJ31QAI~Lm;&ONO< z#1;JbggK#+J(qB|%L#>OlR8=MI9Uw}Pt6AtPdjVpjD{vosNU$OD)4X2={c3tQ~*lD zkY3Q7GN%y+XFca=z77=%LJJxHAG}e=kJKKUJ9y|plHa>UqijV#syF)n$Ca!M=(;p?VKe963r8N`Gckj;G(%SZXn|@JV=#)CgI6I^G ze*RKio>u$AKR(sZ`C1ldKmYBGeo4E*u55W*D+Zc2baeD^U-5lblRNV>^SzCoygb{} znMeJ*EBfhWrPP3EF!MD}4X9D5yDG4eQ&Q9d)p|~oKJTi5YLOvPS_|bgT`i`rbR^yV zqV!Xx2Ah@dh7P+`Y8Uu3J`9mE8@4>G%d16$XDb+5_Uh|>OrPBMA}m=zS2)Xe^Rmd# z7IN58;y0IC&WdrS>W;FRO@q=F@Kx!tLb)wNx%b!J=LPrrZW$?jO-*?lJW;o0tg>}f zIjnfN{g*d-TxP6FcVQcUs{Li*Dqg|iBQWDdU!ren7BML`*7e<|Zdj>sk}tAU7wGp+ zn@KSu)`&eWg`#ccNA*S}Ew6o$Dt-wdLa|QEAqWs);f0Y{B7kUK7~pU%+YLs{Dj}BQ zfUx{ee3qnpB5b7&XQu7KeC`2oK-t?oDId%SFYL`+xW#&=-%G>}YgWp(&st&<8Fyzo zM>g(`rOQiII&MJX4*UxV_P7@cFuZxgz82h`y$W*d2=Ka9IM?+ zzC4?~+%xs!R=iWgnEWie&Nm-e!{uj}g@gCF?L5>XoU?i_hs9cZF5F+dY&ZLWDR)2a zdtBN`volgwq)qh9sqAbbv_zZa`A3)tS-b3EgIO(*$6Zh` zF>8*l&nMOgBFF8f7b84P;Kuq`yvD|&j$kH zdFBBSU3NmmRU{P&&0+%1Iiti#7Qydn@J&`h4nVARmTo@Q1@;C)BHCXvA$fDTwJh8R z!thfxU0k#!RwzI|&bLbRRAqa#)qPGFVW|)SVi5u00I&=QbeG1l(S?}?e487bU5O^X z@A!Q&`0mlxq`Fkc%3Dt+K;*D6Tj-4%*;{2MFw(<|Q8Pw{7cT((k?a7}j4%M#l5^RC z8D|bS1rq__nH&drY3pSg6}uF8NHmC{saN3~HaLrZSQu+g?8_Z(E^>)?+d8zVd`AES z4EfJcAEbP)7Z5^p2nq3dX4E%Vfeysq;`sVM7`xA?Cc)2jHT0ekLa(8Rnubyp z1XQF6C`wTXO#uMz zta+Z9z4!lD%t5FyTZ*!eks6x%6+PZ2wzLRHd5Mm{3jy)&q#xh@a#rf=PFMDZ{ms9i z()X7_0pYnhFi&GLyD0E7%7`3V4yFjmi!_BbPyiF{`<&K(rU1T=J+8TgL%y?Fd@u0Q z3!xIwqH+W9nQ_{%V@e3_?bH z=n(A-41pK$Byq$Ny;3ogsOD>rJJ%89wEnR_)19ucTYN;*@Kz zOf4ao*YNqxAmDVn^$)LPA^0V{(t@2RC{>nX7XkrJk0a97Bi3Ac(!@2MrJrkJl3n(U27{u03ez z6}0$L3p=`^h{m)kHr6JxT?quetoXtPkcp2p0rC6Atzbm=XTv$U=$DNNAZ&gT+hUO5 zQDrH%Lv%nV&F6{S_Q08OQ{^f1`nModsk^@;{%hyzfQ1j-tFehHrfX7Qd~R@Ze?sdA>G7?@4v1TPZWWEgqfautE{Js_3^%t-$Cp*19kOMR(p#ojFjTuS<7kS zDMqJvjqGzm@5}s=S5-6Cof>+e$`Za|-Ozet^IQ@8LuLP;n7#M+74_+IPX|JPyK1wP zb^D{*7jPgdfX=J>sz|-)kZ4@};73HYSa^W`1rnU5%;wTYxMAgNB!xS(b?R7Xzc`ijk*f=OR>*yb0F}IkoOQuNpB1d*(w;$>=soG zMpLF*YZ=^j=?~Z*w6eJZsb=ma=YkuFPZY=if3d`*d*;>Ghv{TF6R%=-C$C~%OE`|H z(?l;=UXzCkq&c<{;F&{oz4_wA%YifFnO0lxmWsARhD!g?u-#IeWT1UBv$^0jByl!nQ#=Fnas&x(K2pZt#S z!+`|a`gKVI71ohOnL+MuOP}qVKVL0ftPai|M(_^j9bJl#yEUFG3vTs`T)c5TO-&qF 
z70ck_gp!RNc-5i+t8`M)0|K|0y;sC*)-yn}4LK&oET4n2sri5F!okxd0!BVa6_u#{2O+^C06P2F4F- zd{{PdO1Mw=Ed9^9)>I2e?8OqtCW5Amg#K%lM8{p_!1+5E8}Ne6&E4@Af-;wz>aW>9!2 zbbmNqf0kk;#a5%h6~Uo<3fUsLo*3fgzQsOGEHzC z>`4Z*J5UfA%|+Ak7^@fBSekVoRojIkfrofusca4aKOUc-L1Q6YC}1*``h>>k0*Hyz z`JS}rX8~-KoUDFb2vEfL7hO=JAz2T5CqQwQKv<;4Upzr!=PU^WXn?2~)UML?r_$ zN6!}{vei27P!NqzPd88mcg!V+pu9dg@Uc7EHOC1wqhIqNIFx!YM3gM@dYM~Rn`eaa<+== zSQQBjiFGwUh3*xu!+;?aMXwFU<~MLJHKkbEX1%$k)IBdoh<-$w;Mz*?d6i&Afta~K zv8^42?;*Ypht~-%7hqtNso)yw(+MhP4~Vyhf*67rE8FtUb%t5lmiwD`4>Q8@*n!sr z3dKgY%0~e!Knc&rrVQxxfVhVM|40b;6yWMWh4xUh`{T_V?DB-|Lm35tAIQIg23-M^ z96WCe&MF=|ueL*}WRnA?~9TFEuZ62%~}xFdI7pE4Qg zFU|og21$)WKzS4vrZItS9%MTYh^-)$k|_DlXNEZee^Sznql=L%B_s7m>X;-J>^cC1#Y3nCj*^^oIkQCN zVyZNjLyQa%#;MTxw~&fcEcJM$A+Ri4%u@v;9lR7E zt`#(;ozF@Y;M&d4s!237@TJ;}ro8f52}@JOy49GuRiFIAxtQbRBP;cX*+->Ne7~r2 zLK<(xlqA zHv_lEPjts?=3|cQ{4D_B!1@PRUi`?2wKb?or$7&Y-A7q8Cx~KXSRi;WNKURG-lFjP znc3SG_k6Sc@oqmZdmefw^YN32A=lz28P_Dw#kZa{H(M_PrKWua&G8C)CxJg2XV=ZX zt;ulYIDXxv3I_r7>`J3ArAXns&|e2L`yM!RPK{vByhB>l%?*H$f+-0H-;0_5=~nps(MRsem(;XBW?q(ms} z6pBR#`@HF4yg7yM&^Dc-Kr;XVETA(r!zFw89bw7RJPuV)12nH3R+cD282}id8&sW> zl6@~B3)l_;&~XUIA*4HXj$@o*as}Y*@!^j*YmQR8_DiHtn zkf6vrPUVIJkVbZP7m7d)75oaYc%2Z~D2ZqWuSrmOWhoBqIXlOu9=c4izw=eT=H+lL zp*`e`a0U{VC(A1y4*=9-SW31#7{p1jORmjMePVUn7gYQt-!FFy^j#?6I0S3zCaLB1 zPQ{n~yJ-_8D2I|AFUD@wi!i^P?F?M_G4A4b#vws0EC`8<2f*Yf*!#eS?zvCT+{U@& z_&F&jtZJ7{0Eg*>c5MnTM#f5;#=2xqi3*v?Wvf1A3v%8%5ueK$@Cz@M0){ zI1)z2l+mQwX9>ByB`#LTy=T7nTYWv=EWDoi?mnB~A|9V#4?FxxRt_Y(i?dz#KQ2po zR`gQ0uqR&YiEwUza^U!I82ML4#^I4K9wxEr4eJ(8EmXWozBm<7jJ@}lNcEXcIA+Dj zeUxq8^X*<G(%_Lquiyk<(DXx*=PsF(=0R9Fc)xhikDw)U6weua z&_e9}wPJyvZ-N#N1i{*%7kNR;qe$>ekXCQdn&$FOUe^3U&}P>?iI<=$gW#RCG(Y^ahUNI5-JttC`_FjV7PsODHG ztzu0Rxu$ZKiqQ&V>EgolEZ4Pg>)Q5Vh8IJPFiVJgVeFGjLT9)-Fd3Hp%a0?2%Zr08 zXT!E9x$xy-B-}=BaY7wA#Z*h5(6atOc>^CDY-Px0Z5eJi7=Ftk{Auetbtue}&dVRg z)sYcqDYR*`o$NRkekwR3Gb2&uCHy2SJYY5)rJZtgS>G)<#r+TqWo@QXHUT3ZAqv>x zo-8c^?FOo9|Lax>Xy_O$v5vsW^{+2Fr<=mE!^%2W{LIP3f{^H z-nCQ8kU`pbYi(tsmeT+W+B=UTdnp_NwJt^We4D->$t`zV^R(qqvw3`s%Q zE#}>F%pmK(ywU3)w5|{1u8-JX|9JNLXz=y1xa;HT*FW96{<-@4mzL`jgV$MKzg?eP zzW$AMed<5F(byTS*jZfcoPF%~v#~#dV}HiQ&ZoyN9D1YGvAG~q{&#z6j@fx4*ERDeb=8aA?hW`BUG;mvUvN?Kt?a%LRt*KTT zhmhta*3SR*Mz!a9vXq>vF74|47`ST~wzRmbJ3mxvdF%i8M(czA?CCF#*IgX0y8PE* zX`<=s`qH1jhAUHT|K*LY&GqH#1~2U!ZOnftcd8EhXS}ue`9Hl;lbw~B?ps>H2c~-) zzsBl=mk-Q(gNih#>)2W52m87|-%3TQ3K@<`31%tzD-92E+N5kGRg=LvM6fMs7+a z!Zb4xALYDiA)eBTosI!}en4lRsN0?u^20d!8Xj<}ObHe3u+H5S^$WjMn&o&m>S1I( zG%{JvV=qi0_axWILrn^F{sD0&=@vgBdaW03JTs4WFzv#Gr5E|BHu(jv`eC`HP72Sa z%ZIS%JLKMjMP_z+7ov>l$SQGbFASQ1TU+?IuW;_&Sg~4*S8-DTkg72n=Qhuw(=)Q|E z8+f4n@QV77NxjwD9KN8g*H>mhzJV+20OQe}&+kgF72Pk6v3jH0vYzyCrg7f_cRE0O zlYWXL=~<`Ilh;?=+#Czk5?7~6tq>9WsNS-53svs|2ek0u1EJBG^cp`v+RI3J%xUh? 
za!=j?E4)h=BmpTl`{zh(PlQ!p8-nkiQJ=Rht$z+}&?IdrD_!K|k%ujHjF#@UEuA|n zEiMO|IZ2l?3C**m56Yo?U`sbKLVt~wKGe3m3^*eQB)a%R&`;Q#)^4{o4QW?Hcv)!# znrMfCu-dZb`JlEzSo(Ar(}!9n9F+N65cBF}+eQ;Z{P-PlAl+dY%aG;F&q?Ov$$Mw4 ztd+aW6RdnL?o)vI!h|(d#6e-4;)yv$ATr8pPGTC63^qr4jg~LF;DgfW4 zEe;~zaZ#DeDV$Tem#ZAc?U>7T;egALo+~_8@OoK(M|S%SHa@>*ZtGkgpYU^RE_X_J zJ2Cg1!k^ovZ7Dd`3+?+M(T`>DyC%&oruy9omgULQ#~|D)ZntfuHobi|jO4Uw;>9WR zZ7Hy+Te7?43fWDut=S2u)CyaE5nfdu|D)V~lFLm|hJ)>yj;Dl|SwyZ`);tPFR33Kibt<$-cF-rH@ zZhJBJd$Ora1+x!B7Mq1~jm{^t?r-O|czDe;HPI$b15@q^5?e%7VKU=JGDR)GQg+$w zh?c4rSvyv_VmmpBOgUvwnbAqMwKbW~^6~>M40Gjj#Szk^>DIv(dAoi2nV&2+Fd547 zJ^A_;U~>+xpMlq}oF3ES83B^u0P%v#xmDN%B)!-iNbn4xfbTV)BQN$91iPpa{KQMS z2H~s{sjAAx%Z*_7W06!LL1zJV^CDcF*VtLXA%@iu?6k%Q7QR9@@SJ^92Lt#rknb{^ zbmS#mjA~N?2P#EXI%)<5glQ3GG}2#SAO325bV+NGjoBrtB^El*3iRwqPN$1+ey zK~8rx2gQOxX7@5ymkHk#Rfc&}w*&kj5~t*^>wYB0GcWdPPc{t%`%j!+G?3*xhXCuk z{<)FiIM(n-B*z$lSF(H|tO8ya&hdsp;S4COhGP$IfvXu|ZuT=W&%h~ujhp%nKI}Wf zp{ks#xq>&|j)(@eoisxx@0o_H!msAvADLscTfAvM)@-hww2{A;n|L}*Sdf~lBe1q#O z$y+SdIHnp=^P_V5Gp~zbMvRPooP*2)%2h%PPz>%S13njX${>S~);#~|@D|}%9PW5r z<7l_;==1*EFUp)|_j078)5p6`K59)#FBJ(@HaH-@FCtnvoO_umLE7Il3zVO5_tIV^+%Rs7}o6*?t(~E_FYD#jwfW z%kX@FP!kDy$R%~s3$>5PKP=NY!!c>hx`PHd{frp%2Tm!ycYWlc0ofY-PT^ev7doa9 zd<+2$XKgaTSA2A@!j#v-*d8FboWg#6&FB9cj`$n;77m+D#QM6PU;Tny5gp4mm!Rs1iw{I zFKbSJbV-xJ7{2q~Chryek&`~35665YhntVoPlVuNRKD~&O9NIQ&W!I6%gmT#{+$H- zY6PHXO;nb|HK^XA+~Z>!yQy5yFd3G#{6i$L`SS7z15En%>0IdPhompE*bvLxe;=GP z9^GD)#nBbw7Hu>)?=-k@-Ewk>5;{tviNQ%7@o){}DW4e#EKUR)F21UOe9kKR6%HkA zur*E;?l%u-os}B9O#kFE^{>Fp32;RpXyjc&NaUJuSkaoz?qHnbt{(&v9@hnT(2|9zByNh{yK zs37Dxy%VUQv86TS$qFEIjA_4IPqrxt4p?IJc3n_(g0E zZX=*o)8~&7fBvXx*v6kYZ~bvV>CBbV4_B^xw8V;mgf?QGZf*!~eY|>s9_)53%hCe5 z&i!h``SEh@b$ZUCVhYVd@0F1wk?ID88g@x&aQRw0AT}`1FhZWA9qThTXr?KUZ zwdqvd!tzVSCFXI4fJ!#ahjc^yb{sw{U|Wo%q8J5rRBXKCC3@Ho)f*7_Pl8mXBjPtQ5hlUy$~O3nLgtdS%#gOwtHMS104Y zUO0QX+zuY4t%DfzWdBTpK?%yOo`OR7myOzwADjx^?QHus)%${<^guw5r-byag?4bC z#G&0Jb(SeslI6(LG2_Sa@s`ga2}a2>yi)TNeuQOVs3;M_Te7cn+a-l7K(-Z!!m1oqxKd>#zZ)^*t*Ysa+ar z6dZlAq@j5hyj_m`eC*SY6E|0PaU-#NZqPm1YKdKAz#xXH6se&In@Bw-+3nJ7*U&m% z;8xtQ-XeU=lWi-C2SmF5vP`6l#MobUrL4=v!*vucG%e;nEmb;AC>S00W6SC-Ht-Xt zwg!|ANQUFVCB}TWPQo!jFPJf30~Wk;pPI@(dtN?!zD~PDBY5P7BA&gFViTI4;$y{K z+%}BoD|HdlzOk=d(wIyEC9H_vCIp7tk0 zaAW~{^4WVP@5RpUp%9@D(dZA91iR;+7f%c|splT$A3Kd?3bUICt(zf+hy&Yt@}B`WMtzpg*IQ1{ z-q6JcRvR*ntqH3|d42Yra_yv&*lVG}pCdg!%eb$R&5XOsTFA69st}hrXVI2rNkvgH zjlOqxH-+=pge&w#ML&w10EG_CUi=_&2E3UMN&4I)yhrxf6YBUJ4iU0LGqUT4#Aym~ zU4tJPLeeWWlc@Foj7?QzA1N$CjlpAeXype*cptnPFuGzEW-n!sC>=(EF8m8VBJqQd zZTHJz76`3?evWMSebuYKG!6FYSIxX%sQv`5|K$uDF{01H9uhWqOFB?Z;~YkiUs_lH z_&@%0y8P>!uL-Pz_n&}L!w2#Nt!EctIId5iYmLcmLL(K}>;c#E?)mc=eo|>L8rox` zR&c2l$g==Z+E_olpceM8a5DRo6lWsJq8w&la-2;*e){-)arNBv;fbZMS2Kx||Nbtn zuj2*sU#PQsvRDkwujdl}KsB^I=Y+#Q*+T!BL#@kir*TZ4{tH9z z-AW~4WFl^bO--Uo_D!VIVn-rO^Tu#4dUv5aQ?qfUJK^-?9ES3@dz~yWSj;K=Xy$@h z2X_+VgH=&6Gf|w$E60N_ea6qW*FJpW-kN%1S)7E?Z2J1X&b2d%NvAEig7@N3ssGC? 
z-Ni1QcRp4hnWhtn-=nHV%w2wb8Y{kSrKdxMI%jKLdm7@Je%&b+lq&J>N4+7JaPb}z7RcR7;)Ms10%hxEE(r`xwGjul` zQa18sZmaEhrt-57u40A%19ght!Cb_Rg+z>rZa^xWwaP_Q=neh}&J9T(2UjpCCJJue zRn4=yH>o_oGhAt~BHxmp%=i0+SKhNC?_}=3S{JSrMCO`!-*>t3_S%EEqV^KD#F%wc zcxM;dRC1m-_pV_Tkua)PY>OLKG~54mN7YPR&A&AIKH5~dD#q_x`S=$xvumk}Bhn^9 zarDoz{;0KHJ~PFuzuv7cFKmV>n(b8P>iDXS1nAF;q*Sr}M!-VK@7}WriL#`IL1UTaN^v1jS$Y5# zaQ|UHeLKh@vmvVL=UlQuBCp!0K<7~&OYuwAnpz)+rH8c!(32OT;xT*1`wsk3q>HVW zFN(rk`t^y};Y_arV^#sm5T$U_@WLQ3pgcu@wlYM-!218NcVAIWz5C1aYt+|++ zTxVtPwfD}mpU?ZX%{!`U!YCCZoaKAkjuCg2TH=yBjl!tb#N?1a4fI=J@so=ch{I_O~H$`!@$95eQbUy|-iXkDU?@s3xz z?E6(wh4x%{0s}N)j~Q2oh&#e1j6LSUS z#Sg0?H3_p+w}qbk5%Y?=`0J+bvS3?ta{QS$Kh44Sqt#~@as1y%T?`98=(>Y28v6ZE zhR4hr!TR17d%57$t_f%zxqwSN{50i0p^rij82;ezB=)`AKiXv

LVV*i1L?S52H~B^2lxv6FHDB`iVkR$j3}EbZjYIlGbuPxh0VE({{gSvU z8>cg3B?!;vp^x&{_eEb|wPidwE|&e++~$+=Df^!O+Cp^?;M2sITNGa+6eSV;kfof% z0N16;A3GF$vHl)dpj3x4{|)jn2duL!*EK(_j}d;@%dl2XNYr0_^f>lv!8u2(UMwaw zd)3bx??*KBbyY}8DPUI@i_vk5C8rJgmCaE{(6@PADHcyI*1lxXi-{&7G8hvPqPYYk zpQ&Z9CrzwTX+2xbSq=vDJSZ3X#n2Hn44A(_|MLFY4YE>hqv&9F=*vi%L0nzOhsAup+} zg_Gc4s-uFweK==l)?KurrNFC_DVeg$62T(#-ES`;CUps_`^E2e)m;{Io6SV0V;S^5{>*TfpRpDpBAl{wZhi@ zaro*!)l8>#*D0x+DwNCc;U7?Ytb-iiplg3|M}@Oy+=JJTM!A(Tc3G{`cz8k#qvg|# zHsZ4rd7_KIYdYm5S$0TI6?|#%*30RTi)R%N_79i)ksQs+AI|RnQ|-l4d~~%U73(Mh zs#LzEp0x1 zRhqzWTWN=cUc_y{X_q4F&Icuq?Br7)0x;%zVw_-Zw_oW~h}@Tu1bqhnyLIOcd#|0p zL|MF~|4nn7J39|{i=c3a^?lP6)99|!jE<&rgG~EvE{mY8FYFyZBN?^Sb{)B`$K+;Z z#feK1We0huESh!|%JaZCr6%;lZ((w3)#KrB^fTKd+HDz9St-)0YZd<~5HSk-!Uw^M2{si%&b7gf4y%n^DB$FVB$T{wbdL|y1Z zGNQi)K3HRQ8jCVMGtqM)lJj#}N8je93vU|&9{$?ZuXec~_+U;npW48S8JBMdt1(ES zar5gon|BN_H2d@W?t>c=2SyzlqFoYmUr!GINc`FSUE%Fvsrr*7UIZPs=REW)v!9UN1gs%i5rBU;EW=A3zOQ1i~kJ%-GL>-xg z4h?1$_+muMp*VooZ-RjvILp3gsAsNrJ06+&0yk-V2UPc*AJ2ayf$$tAw0aD z1TsW`tru_o#?fJOX$SGNP>D!rO?1M~=n&cH2w7AkC75oDq-^!@%XRm|#H2=_(vy;G z?@!wc5<&fAQLoHR`-vbyti4QW3X zfbXm)e}ckU0mMa38ly!tqfsJ7Af=e18~~bOAn^+hW!w)EI-78QCgt=RWvL)xj`H>X zWr||Q&^VoBHY21+aWYLL4J(d{n>xMD1j0(BxhRO1SUd$&oK)fabV4Lh)*^T+7tUCg zUbYA}zwh;p1QHsHy#6cQRyfkUEd5;b*zPFGKsz#?KOCMWqDB{%nUvdsJjmiQslf{0@n$jn*O;pNduU*5p-_dhGv zc*aVAY(+o&ei_@3OW*qS3|1TVd&A;|MTRl{^!X2++;<_g#yE!gjps0*DCDO34(2&% z6H$TBV|L9u6PtIoJdb@ak8?ARnXzJbEs@1B+%7K( z7%aKXR&tM~^iEbukZx(jU~w>8X^dQHd6p#wv8SM-!BUDVN*v2^aK)a@%RRk!lBYvrCS+l*z# z{A}gMZe>%fv`$RbxK`YpWmWrws*=4IUuR#uw`ASdec878f)!u&%eAs#yy}l-)t)8W z@@Cb7ZuL@mCD^L+eoK`zB*VlOmaQYOzvzel%?cZ;9?q&Rl3?&odf66R@?(&P6;u0K z?gfawmh=1>Hocdk71f_~D{)V2kn(k?xEhUuOr|aRWSsoc4F5h3ZdZ+<%cu)qszt=r zHG*=ndN29pD~`fIhEHGeS)Eo(|E`i^Kcf>+x7);FZS}&^s_x=k^^Jgfjj$|jBib?- zp;RxUTgbCEZZGVRXAOQ=5r*oytr}`w*nF%S7aue@4Atn(u|FKD@RM(ZBVR-#Uq#qG z)s&z{ETMYfPM#nOa=>4fx9E6WMIYEtF$E@>n6?APJQri{M?(7Kb z&KhKg;&J<&NcVJ0wc2slQHW1!uG`r?$7uZdxJ5ckfY0C}G7p1T!SY+NsA^fan7ff9 zRJ!E?yV2I&^z$4Nv}C0hoi4}i)HF@Mc&a>yduV*oO?WlR0#1nVnH5rpsRU9_p^q!C z=B7e7QnYuNzSn7+Lr1UwwnV@CFo*YX?-9>H$P3*1n1S18jza<}XM5oJ@qiN%9^^i- zM&Fkd-~WKpzZqF-sSxcP8|(M-9{l2lof(W{w;L)FmFUYI&>qh?UhJX+(R$*Cd)$Y6 z_0^n`99ct9BW=U%-sn-<{+4-6`Zhn1m1|VNeRpfnBYsdm`)U0))@GLfv;N5YdRY`b z^^Gxc(-7MS?(~OP!64FEZM8QQrYO%`swz`dTrQO>hSEEHLrI>ly`XQl7MOl90t+39 zrR`f%7)Voqy>cI8raQCIM%>Y7elzQBv- zF;xE8io&?z@`Px@By$L_NEdI^Bs|$h+As{Eo6D+|!VgXMYD=D?-fPO)qJn~G@%dB| zy{DM-Ig|GhlO{fs*GAq(@(McG3HD)Oqt(=p{E}0NclH3GP*owlAU4lGq=s9T|)u-u3(0hXfRPFL5>Q1!pn7Ll}Om5{| z>&RRp&uj&QD$`r^g%os%4RjBGNx7Tf>pI)ZoSTt5;s&f=+2(7w2~TG_V^k> zZ+Hyp`5)2e^kSahWwz3KL*YaZ^xu%dw02ln$`-AEE>(T#bPy?ME@Yh6XelX~+6n)r z52JiQ%YTT|39{jkS*Ehj5ecFMP)$A>`kyPnmx|Pu0Yi_@Q+g}8^j%@;qljgLB z=e0V=QK9e^dv>m8^hebzN#oPhJZsF}YqhI>93SXMy)?gxqDB<>48{h^D$xc+x??Qy z+}fuV12-0~GntE=qq)QDxva?;>LcRDNiI!L5_;e3%Dw9=`|0T3dHQbs(_w|5U0xnj z49mQ@F7y$Gm`wV--^izEx7>Jxe;P-fvcRJJ4o(TmeqC$>VB1|3RCEz`iSt#jTPn)w8RFp!cQc%$#9QCg;wDR(5)hk3YKzoey%y{IIO{ zjL!*56Sw`@^9wYDPB@e$Fu`R7N zo2{ij&7#zK4-G%Yi=UBy%^O>TROXy2#!YD?@H3>Wdhu*fFVVUSz+&mqE-LaUp3Q=) z{gIV-Zi+$!&jj5j1T}47q+BWswR4hB&dH}9+*_tK!693BPp7Wg(%i>!BAk@M~25Jb?f_m_OnHMoa3>@XEW{T(do zPuOF%g=H?%E;6Fk6Op84x`oeNNC_WlDP&y%G#Q6UOq}A?^4YiX5i0R1>GrvE>I3ce z8Xbdr;+Zb+8Mlay1wYLv+qH-A+Ee} z1NVNbc>KDd5koKLa>xFg+m@cMr@QjemET0vZp$`#@wM5VmylkNlQ9^89pu-7&@8X8G#rk8MV8zvhl&-Vw=Ay+()Np84U1vZQ0x z*iHe>L2HBfE_=`Ah=W@U3C9w=+>fl~Tps&e75OUYS4te@Hg5l>MVH$~;fxAY&w8a8 zWOu6Zs8gnP%M+^Xbn*?DX35eKtK+LI-qTly6Qlj5~O6LyHu+i?e$s@#pNQ z%wqUfrL2rsOdoPZoALHzfqO=FsE8wLpX4sucxZ>@>wUPwp8BK~wO_RaSouQzAUr3R 
zX|sR_%9m3=5zCY%RQK@F{<$QJlYK3YQ?YWR{aMy&+UiD&N1<6 zAHU=dHWzUvT=>$gWroq&jmC1)9&pf=AR;$$?-A z-e-1$3X{_^*YvLP+NVoEfUoafeSX-grPNMZ6mZ=b>mPCV_gY>)*j$9 z79Zp5WEbm&L_ghs$CWTGSIWj_0 zy7P*lMB93~ljM}MewSc9u5s=8x$(aC>o77cL1ezEgg-hUCxYgJnMu}$)mdxjjfZz7 zcz0~Sdv`}`(5{#uMFxEeVhUMY@~KbNkIMwrgEd8K%44TI-RTuOzV%Xc(dmE!rNOHP z)~~+YzY#M`h^B#7gKq$Yl&~`d*9iH0ERVJ5#Fq@*vnHtr57V^mAt+?ce_vV z-ooYOzTt#GxY)DsyYvTACDL7ypBWX8JoqdV10ViMIEr~z5Lk8kTky@%{J~o}xUKu|ut^Mq(32K)i4K=ke);F2J-yUULg84{GBn&XH_K7byX$s(lH^11 zlCP-oBSz`b2pO_1>Q0jySGgdU&Z*Ui;@wqMzowT`OIx-K=}qc_(&I5#!tI!fWHm(0 z#$&IM?N}O`G{o`RYf4WfCx$!l3tZOGHJeCzLjM1uPSbL;x8;b9QE-+3`+byNTnK!{Ozg#LsXVo*aV^NxZ`OL4KHDzYU6C?c! z1r%L$ugy+8>GdnV^xDPn-t6Q{I>i4?U6h}PTY!&O_^G$oNhp+#Mi4Jou}bF%gS zNf*sei}ikT?XNES#D$`ZmS!f-Wx7+0Q3^2n;OUM37^C6O?B<_2et7m*8vPeD>O?U{ zgY%sKVMY@Q+$qxNKg?)j@zbb(OQRHKG`7s|ahc!Jzm3th7rFl!qtB`Wm#bp`F-9rC zXnJkve*>ds^-(MJssAxX{{xKvM;EPc{R@o#w=UZDU%Kc&wCIbj95OlY<^QBbxB6fH zLyL|MR{XavIyqAPzt=@yQ*_b)rbRzYwftKbCI6#~?#vASM;HBv79C%p@TJ@T(4y~_ zMh}+WeE*-c=<@2$`s(VB)!7eg)BjD2ZfFFsYjEgu0{!nvhSLb$yAs9uh`>XT1qcD8B+1fSv zJ#nm(uJiph1$~KFwO9_F+QNZUA>(rEf!d;>Cpi1TY@NE|k*Bi0oAU#8C2yYN!)Q2l z>r2NAv{OZ{57w7WmJkYbb95WZ-&UB{xh@PgRJ?y_(;dsH*Z5+l&Uv!@`cPx#+$*n7 zgE@Mysuo)P_cs@YUcLO-L4+W=C=LFl?r6sIH-?*PKJ_K>={?hLu3Z_*kaYVn++4Tz zCRZ(v%b=xxW3tq^;>Jiz!`Az1`=MtBt&KbX&vnts|EY_X{aY7xRu4u9mSQ#jhc24* zKXlQIf9ayU6kU`AM&sOnF2&)B&*Vq4`LKRUkbNM-M|ZAxoexcb&e(-tW^ws^M}9f? zbBgTPB0nvza5pzW%gklPN0M#H0V7pd?T9fmH-e#zZ@8>x_Kaxrg`4DWtUhh>u5pUw zt*2hg@o#CQLlJ%mtHK5FXR&DlepuD#LG7JMvdd}ebeC8?)JBwe$JGo9U=5#;!W(J) zZ;FIH81=-`JP(A6*@M*Rl$gr~U7uBZp7uiM+7bNc*ufqU2%2FBvM9|r0I^X%Gd>2A z;O&`E!N6YgV9}&lvJTvgU;PbxwtE^X6XatgqB&_cWQ5@)GtdjekJPG~YZ?R2q8n(Y zP_)uV3~Dbs9)%~;Z&*5r*767siK?^%{XOhgCvaDrs0S zV%|=GUYwZ}{Hgd_NJ$fWG88@=@w2uj67eQbnvo{|9@DzWgpeRP=n( z1z{2qt(1~fn}7|p_HCR2@0>#FDY_`1PMe}kY|TX*_KC8Ky=);3%5&PGI~51oDY~M! 
zf%!!K6hB2m9Qt=R4CzS892K`O2Kr6tp+8gZHj!IMu2`gcEdSJEve@So-*2v;2ZzNG z#WmU~hVt3xxLahit3$LZzY)hlv?<%hfh8yXlizs9tE!|X1)@5x36#}$W6s5??-K@% zkq+U+LIfpLWc{-)nmCvA`y7$rpe(8Y%-)1o!_dOi&i9}akBPc#iNEfg4*+kIzI-^7 z`}0>=D9G|DKJ6`t4PobSf`NxDM|7gdXl^xdQZxTJ2sZkX<-~>+2}uJ7?(&44{6y2S zf2RXc^Ev?!-bS1P4p75f8X8Vg7~WEy24EAxo2%s!hcAhN(ut-t**8#BY2a><2%a}L zs%5Y!9u$KOhm`kRg7PQci5MdSgO3@he#-96jEm3bgTQFjstuhkp6&C|eY&~vr)f6a z?-x}t6MN5@rqTEmb26*TZo=84mykbR%j^?-?Gu-gu))%0E-E`wHhufZIqlE9>@F>i z0|hK9KbNpb>e8UJ@o=(pGK^{>7pNiH^A z*7s(LW!u2MXKznN0WlsAhc|8EO*RT42L-f_yV>}OnwlFDR1V zR86;(2(fXs#QGN0+&$zb(Y(9O&4_5OUeyc^{B-3{o#;6dq4em!Xx&N|oM_`@I&dm@ zsi-L^4gxX(Q?GQH3onM@&DgL+vBF}Aw;B8@X`V@+k5d$2qT(JPN!TTD0lb^1wOeR{ zUvBVzfkuIq-<0Z(s9g$d_{4MN&Sa(bPR zU0p}N!t^H!A1j-=hO2&WfS@a0I)b2RZ)tU|K%wr6FHr+Y0(&xh?s8;u$mm^I#N(!5L{pR^upI=YX7&u(-bv&KpeW_v4bD+gCBp@fw z)`;zAMkK~)mM3s&cT902nr)&NI$18IZZFuK&a#9Dt;<-=QJ~Ry(~kVr;nSQX`W3L2irP8tM zEOKMzpwCJ-I^x#$(ph8*YTJQ43Vxj0=|3(N;Nqo0&m-S2fd z`}5t_EIf1l`(^4?-|XmKmBbO#Qg^CU)|V`G$t6K7-n3;<^GR3Y-#Mf^*tfJg7SzkF zbTciCV}G0rv6114zn9AUtVh2*DfsjJ#E6dwViTy67ZPAC3~1g$*@@6bz%T!rn?K&P zQQkEwWf1au5}Lzx`;dOv5u|N|(0XNKi&>bU@3x_-_cNNQp7?nQ0dqW^j;c}5B`L?Y zU{_gqFWh${ZTg&ldm#9&SN_?x!vR+E386X9vVH94kpFd#8{)oS&q8avEUGn$QS3Qi z@HbWz8m2~D%qdfcYJv7$;0Lc1MD`+U{z9V@R^qQ_OEvaiYP2^!&tI%GzK^abt!6nR z?2HY)*!CurT7UVu4Iug|_{^7I?#YN3KwZNHr14;M6(YMqFSwAm#cIa{Ck%_Yc3BDW z!I=lT)Ctmxzl#!Lk%TtFAmED3Y^;gyhh<iQRjYWOLgh3dfI|Yn~(u!9nZpqW@U+tuZLH)%u!fW!ZA$Cw@G9|lDWevdElThv$ z6##rusgqg(BSRPqli@)h^D;MJWEP9Vo!8mO%viHARICP8C0EmnY<1;v`sH^pRU zFlCryv!-Jxy69hT)a$PV5i8dobDS8pEI=pWaV5l7KQ+hkccNPoRiCRTc; zCUmcVeAK1*E#LUw>9|pfH(ISsAjO`ghw4zgQLjk25$-U`8NwT4s7gP}f@Sy_zq-!# z$4jUXZjkpKbRQt7i4PI5I2 zMO%Nn!zXP08-Q5n+QU!tQns9+!bzwa01|hOXAY-K$>CJ&q2NZSBP>SSB~2jN&n(Gh zj~^krsC{@BCpcv?#V7+W#w{GEeHB9R62&=_P+V$|2Cwu{24!zPYO&B1a9)b`0qiTE zjuZg-3Q==)Q580({tT9c=B12wK{gJYMNQ-sET{>~Y3bgON-Sn*7mOMQ;_;Yx3Po8B z0;^=wE`c86=x3{GG4xptvUL7qyfmLAQpl>4UTWf!hWiIv8zd2f@Q0Qt^vMZutG!FKJs(GtCQiC+*nU;)6r|*xDLx7orGik!k*sSqAaOY5 z(yol8(>*X379)erq%%TMjRT37F`EMz_T5Z|PN)n(e{ebO{VVx>HFSA1VQ>sF{;H$&b0$e*>*wO)FCks>Ebo$fbki(E+)1o5F?a zA{g270&pJM83(4rKBrQFv+sg8H{d%K&%Y)WtRxjx5@B4`$dlZn)eZD@Iq*%kxIZMY z02bk3s?P<11Y840N5XTHz%yN7KN^8QNl)nVa}CnLQ@ctW-9GJ$68_~SC#M(+KPP%7 zOMD+9&55-(R7LaVy2oafWG|j?x|k2+$>1c3blE}M@p8ImNh&t>5+Y||V>Gk)f}>qP zhO&Kp6OA4N^zopFyQmDCf+S%OfCVuSQ0+7?PH+_s4=R}Opm;3EL=F>=&%Ce;oZ!ID zSPYPh=E?H#=Xk_1<1=`lztZ~=wa@kf8`fPWweEEbH&gTGMi$l z^(xS2TlMdC;m$O`N>Wo?TtoA^MDI-FUwG7}VZ{beKSFJ~yi%;Figf##!( z{GnH{+|1e8y272y=YcbXda6AX{V7ErO{x@k1I$UybQll}(@c>^dp6N40quGg zsEyY4{(z3-@ig|l4(Aq(E(IQ~PNUlEm=+;jaV33;A^mJ6iOGT=4UsU`%#f@4e9YJ> zVocLj)#tp}lqtg#l8T65Hb~X<a=ML@5Ry!2!>YECO{{y&6v&@dyNud;{&@G? 
zIPTHh_Ol%BahT^yA;Q%ZXk8U()!OLhfZiJ%bZ*DJYTJHStSOgcF4e%>Dr0?W6nGTq z(-@}KJ%$BYgTwX7{-=cd_viaR)%0=NkUd5F)n8mR(TB!ZlXq;O_7oa%o-;|G`~m1M z#8EzB-8NL#Nq^x{vt8H=1#%&Y`U8%3oC1&9{Dnu=cVQaw16tbwU!5>ta#6h$cr-Z_ zw2e8m8Q%0lKj#?9ryo?07jJh*xj`Xu80hWYE*3|laRT^`HLMJura+(u8W1(4ZgtvW zt$D3;_AoCDA|w|@pN!V9!92F1MtRd7aKN&#BjBC0TkfbsC-QBt5jQ5}Di&eA+eMr6 za7kfQ&3!?TD-mONA(L%i^L>AsXRvA%0o|xtsp!+knY>gDa(0s_f zm^Y@XmJOy2qMIW1ejtF06v*rzsKJQ~TyzsBSft??LKjDE@sB+^hR9h&mtbJiW3+w5 zH%__80I$FHXpGJ533j&7V>SkZpxC4HIE?$wyGG8lUW(+=6z~V!6}2`5nog+m5}62t zIpWBHI8TZ_O6UTs><(oPs}N_V!92(W{9k+2&D!86b~`XUYve zoY;IgB0%$VkwLk$i3t<0u78XN4J4}4nR1LOUnd`VkU#8xT&<@2BQ`ySfyRg~(0WfL ziXlH^5VHw%={xA0>KQ}a9GGX~Bo~>A#grs{_|>+QibHF7AzN~h%hwkUJU)s(WK|8M zGhF8XEjF4q+$HgFnoDv)b7#7CaS_wLpn*Y;5TM8UZ^p9GY@GOBY?@nmt2eha7@(FRl zc9H58e(wB@mB5oTGECT--LA9sC^7x+gF4jCX+27g)^-bv#Z{w%4f$_U{83=!AAeN9 zfd26)I{hI{z=yT`sxy*ORvG?SW)2=@1h#3p^OC#xqYt_~$CwxZ-uPwu9Ei5jbG6`q zql*rI*d9IE9^=}XP}-TgvGeZX&h)dL+4i0J4?7=Db{4t5EE`5!runV-Y@U$9Df6J9 zd?A5@PR_;UT`nxAf>0uKeZA_-HrLk(`mdL)zgFG*Dw6iqy?*nPC#UL*0F2V?Z5-{5~{@OJB91qTSS59(ff<2$d1`V!p~p1==3kB{iGK??i)$L=#<-r z-*bOvoX~&CMebo4?kLk019VQxNcY?1I^vO0Fy;`TRlb1fBhZ~YX1Vj`@Zs~r(au9{ z4ru(J!w~Lw`eHPFUg%>YU9KnfTO69SyOiKXJDp2YYeQXL9KRMxW>^JBDMNa%e4H4mVUx#<N&c%+uf$j{+`y;&++Tad-speaFHm0m=DCv3 z+xg^!Cuk1oJAV$EB?m?X402T>Q2*9Nk0cq$^Twq{6kSyAQgm{WMT6(c?Axtt)wVs4 z1gzd|ThzO}t#X`uw=-9evTIUg^~Cksy^=@CpWkVxzR-C~GT1a3P7%MCy4f#BMVpb; zo$9yJ?L^Hw(3gBCxvp>{L%>q!D`<-!nIZ3$uCnbp{E5KEV4m;QaLF- zJLdU15rQ#CW%ZUNhL4^0VyD_L7o49?sb8c@IC~!dqA^pFnK~(7H96Gwy=U@UL5;gv z8{Ky|P55l)9XdDaFMp`f>3g`T`XMH}#aNEsk-_~aWD8%jebq|^`Y&CSFU?Dp=6wCG zD(%8%ZWxt(b}Ul5Y0&LLFmXnW$wKt9<`bQ%%rz$R$+8ybJz^#_JHW8H=ea*y_jWJG z?V99l4mCBAXNF)GP0kEdYm~&j6zi+cxek*Y2h!P^i|9Eam#I)jq80|iL4ud5IK=XC z8jn<+Kn{#I3JXwdZW+R4Cb!`kG7A4W`txP^J*jgXPOO%8@u&W5^bt=xsxx0I`5a{S zrC5CvE~CFr8fSQ_gveHZ9B{EPiB4QWidh7sVS2~Z%<9f}Q*-$W51QCx0W$q;zS9Xo=}1o_jR^k!{^)^zQZ~6 z`j_+`8G0@moH44_fj%{Kx;mJnTD`XxepVV3e#SSe)%jy->5y-#LCaH(wue+Z{Y-en zb<%au#)T0y8brC`ZJ#wZBC@M zIAm^t^1@u%mU?s44V%x6*XwjmbGr*J6>}ddLC!f?tY$X9$nt1Ju8mW4(L3*~?)|xw zuPgIfWNPKH0KJ7-g=n%NDMl+@RbAZanVo1aM@9f;h5Bspm*%1*t1q3f`zud>y-qT$ z*LQtp#rm``Y4e)s8h0FNLv7Q?e558q=1rE zTyp25^R-VoOo|FrvFYSC&;ywc?%3A}cY7AX9SCKtB}A!i=h1s* zKso&K(u6I$kcR7>v`pi@!qc6^yUqn1G=gv$&Cn~v=`h~o`eOBy8bs=Lr|{-tIF6Ku zzV2%O(r-LU*k6UodMK2ki&QT+Re~W2J)*Pk%X8kXj#Bn>6!W;RXT4hMh+7BF*a_>2 zlWe%BawE+gBy=1EjWdTFoFsEHhmARk*xpcd(d5%iL=k^i*4$?&nMw<~>o5V9scATE zps=2=I$uV&sFUo%FGK%c<7Y0(QT)=-u;$?^2KG&&LeoS%9%_;c5pY(adqAK>u>v(? 
zqWU=oS(y|~@)?Ut)ia-@+2ITd@`cEMoG=lq;UY~%9c?|EWi*>i{c#!6-+NxE5bx~n;x{l?g>UI8| z(CpYNlb-#;MUEbB*MD1z`;oFaV?E$zzb)ri&0f`~Q#O5?%*}my+%R6MZr~VT{6_Fv z^Ux(E;oN!ac}>VmPF4@oVs_|j)oZPD9v)Kh$gpk0i0ADqp7%%O<-dNAY`c2)>wUfR z*JmzXYu_(=d3Bo_a@g3|v`Q&`7hR?Hh1RwF67UMy;$66EYDZd)?y+tnUWaIzcNwWr z%0~+6FgR-$qcmsWC99j5DF60v(_^8R-OqCm-mUlz{rcd_@$!E0bjM~|Wt?rG{m$Ue zCsMErk33kAGcBfMHNlJ4I~Hc#Pfuyu8xwl&cw{>WwLWe-B&%IZ=5Q3%`&Dncx|UvN z{bc%KQ@_1nox;o*2ne20ODC#`n!DRw>a%z(b+^uAC2;Mqw{f(y$Jg}7+5C%e%dw`< zE2+P;M@tHSjm14%u%#Pu*1py=p8QOJ78c~9|0XFSZPl+t@sA7P)8ogPf_KVn8#f3g zmTxPZ?v!hJxSIE|nU#9p$+qPQyOH6FY~`Xdc~G+@_Bp0La~d?rQRAkV>&`UrrJj$& z_K}!F{=4{x4KFH#JS@GPXZ@Wbp}->1I>4Edy2aN#Pobv913-1P1zQGZIt$cBzGOB8 z%Crll4FDf2(?PAwpTE2}!$VG2lPFGPb%e*K=Gkc{|892mt?=LG9Q4k_dZS*zSIv=W zt1**Q#=9N(^|#59=Z*}p`=!NyzCq-*d7QaFG6~+x=+1zGckh{8G@?=r zpW}E`3g$Wf!4$9O_(?H%P-)%BitVqjly4)!xfJeS{b zdLO*F7V)`S8!}ay7bSMUCekz=%AP^btZEXnNszcaxZ!ih7oqlLl~tokOX99pA?UO1 z$FAvuT6Y93hX|_>am*LBk~wF+PdTKc7nE&Nkt!l{r2v4n0f0a?j-s2yjS^6Lq4Kj& z1n;oU4zX=Mzy^e(=whvxyZ|a^Jmezr2MC%3lw!buRUZ)wehGj$umJ8{Q@;jT0=QSijk8G)LhS*79@G9YFF)!dJoqQ{9$>{zT5{kZs8*KD<(?7l4dzNk!t} z{?XJ#pj4=}ghZq&jdodAY?yrA^%O5K+`#GI%z8}p$cUsS5N~55*#c>7=R!j#iKpBL za?j^NxQ`xx+Xt0N}YVX z*;Nn_3;hX{We{P+SQ>L++O4xFELM#2gU^R9O2QeiKsi>OhreqdLbP@R8NN_OPB;7r zP)+t#31r3|N98vV-9q~(m?W!h;i zS|MrLesUS1hbKPP!@M_C0qonPLcOR(_cOAD*i}*rz%Wr%8NH`vL{+}Fb}3@);(6JQ z=1vF3$Q%ejm+|UWc&t?_D4n*;^@SP*aC5irY&X`9JdQ-$XsGgo%-pq5YFc-Wb+YWW zp3PS!6ttHRfphV(G&tbYcVK$5>5M)EXlng?GCB7S$g7vJYWx3-R#ljR0r>v7ww`c) z9RLGs5o+CH1lL%rNTz|x!kg!}nmzhoix6qfn_y_i221|Zsx&KcI=2)q*YN9=VnM|U zdIS5)2#&#L+dyu%{v9UW{%~>TG#Yzy!%a!JB1y%zUyp}`ba6*rQg(6L~mcghbXGE2efa76uSm?Ep;SsTE*x{Wev&S$L$ zTaPe`CQ|voUiO1nCn;N3!yu&wsy4tEUrJ!6O)@YRjv*W2Ra(XjKB)KF>nV3hMcon| z{iHvdSJ5dY3xNIg`PGMqq;Ss*^}!LBhG+>3gNhxK8UR3qmPU)a6BuM$4u{lA-N!y~ zM8KTK;Qj6T{zMov;A9WGGpShGugDk?`@^mGwlgKWOYA*X`WX=WMrtt5eGDiCf}ciC zK!{+QMt!XEcO%GvwZr7ofK2@^77qfeBF8UAqDWKA zZ6Z-XeGI`q239oImnWQF?5RX!03O+q5>mwbPx9u(KI4su%Lz12)oqmZhhj-)H!l|)<#&zTUi^fu%Z!a1f z6T~jdDQFVHsI&+mrQv2I0MK}K8`GH%@S9rqNE=taea9A|@TZxV96rC(#C0&+CYGWZ*1_(GE+B3!(}9VUSWN0_ZP z#C%ba*Y7gK#`A{+QEBRyq)M?=8Kz-J1+rS0No)H^v=`+{ z1o(~2Rwi^+6~%e{BvKKGRIE*6M55FfKt%+a86#;302Su7QUfsSxFBV-P=W=WB6eTB z3^GAF18^cyW{0X~hiZk{x-fjfFM?Rf2u5f(@;MWs833v08iJ@ns|}&5R(JEdPE}2u zR0zFiI%6FcA>ja&{uyoviZvnvvq8kWK{OOmb+@&HDI#{wVWMPFkwFUn?YEVAwG5RP zkVz3$ZHV2KaW2=b5f4jx6-{qLl(N5%!g`FB^zplDiZCv@%5<7vsTSo)$`AtaKr#wV z$vyxbXBJ>i*pA1{K=^h9)Qhm5N({BX3daI*$Pd11ipLR#i1ir&XJ=kY%x4CE)DITU z!)hF13=~wgX$jpvPAN5mOD#UIFH`lWPVfPrDrSB&15`qmr$78|mhO&z3RChcie>iH zkJIjeuQ!Wf09)opuEzUT}=^HiD=k$dai|XbAU~W!^O&$qdqdNN7;f?4!lTSi#N0$3*$Wbs3r}^Hm(5J zklP3ix~pK%V;lcX7L%wv^Jb{lXb+-*Gb0UF-`0Do9d5Z z;Wr#Y?iOj@Fh-yFPs`+5KF~YTG*9{K+`vz17nC0p*x0 zW*aCMGg-aJcJ%ckTEdnMl!zsZDgsGL|_Kuf`QYh*$a~f2B;QiPL%R0y6-X zk|_%WZr3f;aoX2h`cxnE(SvnyqcZNXqS=gw;+nk7EkUA4=`Gm$3}e*{-6a6}ELNQV zbMV7hBA0XHVOe-J!c}9w>ct!o<4z(*38#Zyv?2iQ<`tp!E=Vs1!`|w{{wVJrhPrO^T)f(yiP0e=qlR8Ek-;vf4P`6Mv9Z8{8 zLnPa!%$MF9DDQ`Poq`cgYCrVBSOA_xgys@G1h-(?ra5dR@_aMOK$*ZWC7x_K18AGWqh0A zosDuoB-xyUX~=W&Vm|-}|%D zfZo&=mY_0X(A%R&hP)5J!j6DGqZ2^ zO|DihvgV!jzIoR3d_ND$+H?H+6D*)@Y%Ao%RJwqe1lOSiwO@)YIdt9z9-MOzD6wP+ zyweJopq(%$m6))bli~%9pxmP5x}XSMdSrr#uZg3>`hzhPOEjzzebaF-Tg;zPZ*K2} z6B0{{FC%92fA)Di>iuvy6G)NMYB0cYLMT4JDHb_!Enhru`(68^xnQYlqhRBI5pOFa zXKu2Q)lsr8N)I`o@%xvj8a4kU>LZ8w!u_Zv53Z8f*Owo^M6Eo%iWo>)V!C*$dR1>H zYF+z6vQ|uvCHh-28$wWK<4N?Ewp>q2!neL?I{`U`!sxx9(LhjhG%@<%Asb>x?x*&{ z<1bQ9%@+rL4{cPha*AFY7e72x=G~``{{8+El970IM|c{T*=Orecsx)3|4$d4;7S#mDK~m5@4J00Hdkd?X<2{w zm-s^6t@gkl+rK0h8(l^+Uvk%q{y(~CN8tC9Y!2g080UlEhd*|H%aHnGL6r!*OP59x 
zk=GLOzjaY#{jt15ry-&r9tX0G$fL@oR|x^XToSYbEzH@!7@E->RN+$1SUd z*~fQXxBZVUdMc14fDvJWQ0vD%k45TxAYy5Om1X{t?s>?x|LUSE25SK#^1lmV^xLQ0 zio!(FdWZt!$6Fbila(8p|J6kst~`O6o(vb~)Er#W=twT8inu-XkK(!#YmI6*bE*B@XC49+E7B66b@mKXs^kvSi2V#z4 z+sR($92gZ#YSuvU?8P?b-J)zdUVd0@!$nr!uL$%&yo+??Y{quvrzK@3p6w=q%G9#qBwrV22 zbjiM*mF_thQr;eG@1Or`bkQiKPhPcel$LzE|0yj8ywg(EmZoZ{ZM`gv zR@<(w?NuX}V=vTpo8D=u|7cx`R^RK~=~e&v=I@32K7n3a<6wyAp~m4`i9U^^3H3{j z<7s1U&0q8O4>eCddiQDm{v2_sNp3{e*820c@S)b(R&Afw-yhwVTIUDvw6*{JTDp9w zeeq|fPy6!U-%D)@5IJOmxiN-H0|8{I>Zg=%#4$!9z=EO#+AC>1&l(4%8qoxXTra*O zRI+fhc|7bfl(7<(Os!f)!Ql5<{Y5dnrk2zqtOE4O{%a5!1B|g} zQ#x?X1xB1oQwbd4zb%A?sjV{km=NffRo1vZi;iB5y2R%i=<@xOyQVRGAe1zx$FE;C z@-zkn|6^b%EW^bT5~IT}k~|_;5b(r^u{m2iIg#-WmP&+mfIrK5gXVH4EDgzuVF@@V z8S9GDWmgI2%2(^aZqYIdrNOBp+;9P@dKgTCPUwE0vi)WWI-ubSkCpUmp7=T!zHu(){ zsP7jI=0%n0(JBJ66wU^C1>L+5`-_84p|HH6Mlj#c0F#Fs+41T_>nOC^Xy%#)Yk&5Z z;j%!gVIxeqO+$!pbqeydsx0kdQyk9v2jjyTM0ZZ|5+FBoTHRNO%_Twa!p`JI#PGio zJb1CU>}uxFD;6(jHn1A^GLqE#u}oQ>Uo~vRbo3v!3LhkWg?Oz#@qGk%At*#`wj(g? ze5S*yhbOmGq3zzW16~Wx?K&B6VKmeLJh$Tqn6O)vyi~;8wD=0+UYc>_@&G?d%9TC# z9mDgo!OCa8SPcpKPU&a&I0wB+Q^P0^2?vFM# z%AEncvYZTQ&fLTI@ntHMkFNUA1`fu-1FNLrDd_j6m!rsm^cX|aXQa1TWJMM1?iD+| zH0IZ!+z=5g$B8vrrHh-RkHAw(DWDfoCS_bC6-tw9Rq>988+;Ng$ywp>+dgSiw5>Pv zlBMJZ)U4zB6ijNe2P8SfzqF^8P3=7$s&ntacaF<|hm{N$ClqJHr5dDUpC37pa!hmM zjr#9A1HCCQ#mnJiwG8RmK%>|5D!Q!z5a%1hf%D25b{>6 z?d~H<{0siKkCN-HTqZOr1S{?D7{0o}OhZW@ahY)Lk6}MAWfG46#p%=-=x4T?pk^Y# zhc$n}%y@q9AzrZC-?l|dO^(tyY;{4j$sMs$ld49AOgq|-g$|Gs3{6T|&Qz%62K=9d zsF$+*5shc*nHrlkEnWR=N|NX--L^k|@u=CHn2e)KH03+jLETTBIYvYpjG{ zzFwdI-hCL4?Rmx5$kk&DzQ2nmm@cXsJzgC^Hp+ccVmk{rZkY5HX@`p7;#f=%Mt!*z zwr^-Vvlo@bYe8FRqBg-BgcbYzr;fW>mtl2Wh%d=uX$L}nW^bj&8$Dadit5yHTXVn zG=tL!k4DYARsL1-L(d^8bgHt7)cW4ay))f(p=;~=NAa(EEo(G3&E1@OQBy4P-A{{n zZZl8&0Vow&nqj}ctm7BL^}()~Yx6ct0pqs{_Y=-!iLw~#3pw6N1d!oLiRuvZ=AtD;xzI=Ocx~**+dZyqQE(t2ao1(K(j=y zLgQCCoCs{eXbwDps|C%sLEt|p2*8Pg8bsm2Ri3t0?luB@AeJK*%gIRKd|t{;Toula zl~}|;-Q!eSKw2uLe@Qy(VQ~tJv20qg?4f9uQ!Gadjz^0q0!2a@u}EL4?+# zj(Q$Q$Y?;udX>$6mEFmO#jT8ou}r{yKur=UX!Aq^RVHMK>Q|=7bopBrq!aObPiPi2H;X4u(7_- zV0Y6DHPy}=kj*9vD7dk-08kgP$~~#CNpZ>!8-gu3j$T(WNSvzVx-RCi!39xd1IH0q zE@3dB<|Ej3tfRi+YN+PM=MXF97N^@*P9kF?=QxfFH}g+AeLWk(qi!r41Q7!rb?DcX zx&avn4A1zQKu4_k6^NEwoR*Rsj{{LAFYX5Mx@|6oPXWM{_fnNqK49%;qvXby9jgGj zbL$a<7Gyxsc8xzw*!ZrAc37OmIYCm3C=;{cbZ?5*lB64;=EBE1z-k!8?}5~;;l2|l2XlS|+QLF62xBdryo5w;ko0KcQ%>l*4V=y4b?$o$=^&EuxKPGnoMYNQ88HqnP@Cbdrpv&AT; zrK@USSTumQq7$*muWaCNp_2&~ZFk62n&%$*Vhjr$<3DMZ0mdQX6BXEq!{OJJTF@Me zm}o|7t#hJ)kC4+v<=vJ6SqBWq-3o_XL+x`Qx6#0Pf6Tq;5dtn%oSr;tGggu8+Rd z3M(-l94H`q8D(Qw7JTf3l_ZSgMyuT?as%Wx9VWLni0r&1ITBk4dw4 zsgD=7zF|@nTW3mWZPI0*tdJ&S%m6)3cw8hOCPnG4aW;THso%6;hg!Bm#Rp_W0@TdK zYkGQ$9fO`pi5yV=g`yX1BL=n7+VK*VusyfJ+e9ar;$9Df#+qmuWI6X zCg&P2z#VJ4N)CjswRjDc9aI1BNSaMU+Z{4hz%g2N{n*JN$-DDdH>Ut=5==3%GRQMp zYkcZs)7QnIBIBhwBz=uFh)_JicsEWZxL>J!067elvj(!pVE*Q!x#oyk zJvbGEePl%(f|IcQTz9GhEM`E?jq?Yt>u8f2WJ+|^vH`54-juifxz6iM{9|R#WjM-Z z<@^C0&;dhIoI!s~Z^gi#6Ua#PQffIH4Fv-fX@D9u7~!z3&^ess>F4C+yb}w__SW0v zv%gUM(tli4g;>5$aV1-sPg_L3BB+w?u3=FC_mK`~5m8PBSNTKAI2e7@3?_Bmguick zUPD-^!8KpUiGzW%3OWqQoJz}akogRx@FCmBX+8xI-2Agq3nBaYple&aN;w7JDR8=A z&#+vFt_LTRgi@OWLgs*cd9@nB6byI_U3r|!9EQ3?muoWtmIvlbqTs+|sCRX!Q8;NG zx$t1LQX^Q|1fc93M`dvX^&<-&qjR-NM{AeBnuq4y%#hB*eZ@H1|3GuXc}#rdjZZOj zl0eZkdaO`Cr;oL#M(np8jO{W-Z#`e)?Fu{`I9i@1H{OY;Z*;AL!jPl zzn;f&iF=^YhF)$qZgyBgB#Dfw>R`T;-_?jiXfRSq{qo!g%kiG}{-Bb7WSp6s{YAW_ zr=PMa&b%pGa??Vny{Yck?HgFmn;&3OD%s4Nm`0Zj-dEu3g+WHUa!T31?38sn{^;|I&GhQ!)%mV z&*jfMeghKJ)FP5)vX#2;Y<7Z@C|W88K*q$@Xl|Y>nbs>#C`HarGJ7FCn_k(*cKXj-%-+qFrOjDttr 
z?#A!gH+Wcgl{bhIPillnXg!HIZGYmMZ}MV|LI%a{+U36up;AUPu||G6v_;_khWioU z?@jY52P&T!aRS7W1{GWO=Xz!2k|^ZAN{Ge*`wknm5x>6T{StrJTfI+PXfBq-gP2+u z9IicQQE86BN`9Q-wyg{^=FR2EYn5&kvpA>^VCm2hx^K_qEjQRq6D-Vp*nc3BqUCf^-B#al&TuMqz}IEud@+h$(?5pf+C0pflU5 zGp7;^{2M2qMh_uBYw)qXm@c+p3f9}oMj8*wMST1t2Rcr`(SJb(e?fBP{qJhp^Mf#4 zMY`hqD7sS#gUYj-FYUV9zYR}gJI{2aNGQF(ziS1%%FH_XmDI@hs3?f2gu)><_IUPj zpkiI~N+85qO)PCY0pvfU5~_MzJghsud01DukRT429zE1e^zda*ygW^!zo9V><9U16 z@2gII!>M7*>6;zmS5<)V5a{l_N)tx}2cNppZh*#VH_QBhTpms<%tnGO{HkPl=FOj( z?VKlp{MX?`kjQ}mT$~Yl1s_FxZ&+62US0ZmKgiPf48K|&Zwdh^v z^^O7IKUOcRf)him_0~nNQEbs?aWN&a-bs@W_rZlRrjnGeO&+QM;~38``98 z9g7M&Gc<{s{~c8IM9r*`0w%Lo-4`f+pJKHAtoC){>xjmw<~LHW35F$av^UEdJl@dw z=xF=nguoFIOx-5B|8O!9uQ}dAh!l#aKnQ678H^GykE2PuysH)?^hj_*c3<-P{?3b? zLGc%0En5|%5hK&8IGYJw?7KmYefHtV4sGY`q`w2p)#JpVI3x3Jyn| z1`ab@V3;8fPhVq#d1&1tF1F<4Xrwxh27dUdH+?5(ZL-L`EnuyM zf$qzo^Z76O8A3{q^@YJpTk&~*2ABe(J@g-UsL#jOULV?`c<%3|4je*M6$*#jmJY=& z2hh_WZZGe-hyYKzp5NP=t-jeF&AYBC{JZ1(_Z*`K$THg&oIAB z2PrlW2l=QE6|MU7EnRu{F!8yqr-8_`=***`6g|eHgjC;-{e-SaPs72~?`vB^6}T8f zp$wOEui;F*XyEs3UCFoKIeqTYZsc*1by4%>Td|;Nv=o|^(T6VQI)@C9cEFV5T?rwD z)gXMXE&R%xgxPIp8X^sQ41{go`i5i9yp4|3*R1HCgS-6jx_!#TifH)gRlz`7BwM&{ zqX~G;vuZ5Kdh;NG>dp!g4His09p)Hv!Qwaton_17=)st(geo3Byv zT>I=*L6frfmdNZ=WE+&C8!U#=xcToN2kA1#cP9}Z@;8Yi`#SQyRrNa~pw|MR@H z6sg{@{#%xuqco0jK}O(#4ub?1394qIBschQXarHjJF2o_iI_~WO+)C7ddp``uHcFW z^l2TDL&LnnR|sB+=r!KW?ELe5a>;=zFczIw-^;yI*Sb=$)9~hkdw01fe_j!dKNR_Z z(>kfle1>D&Nbf#fY(+j5O9?O8pyK|Rlq9`mr)*c$0OCxKU(n-yxZE_DY?$fv>5AR? z^!F80p@&`Djf!82jwUrP;m1>Y)SSf<&EfUz-1`EO?5$D9lvI)ITV!3;WrX@s-9p9a z9&SdA&=_@mlIYhbSIEdCbDZ(=Ke}o_Y-iYZz<2e)(_h!TQgjW4iZLm+M~7na{Lju0 zC$#zh9natXGH>P|$-ocql5!Q8w>^R=2;RD844&Nl<_L~?4Wu_RPhAZVV9#zEKPaDN zQeiEH;=&$FH#frWdBzL!5u9Mn2-T1V_K&UZmb{2T+g(om@6`E|TE64{+HY~rcG^)gF+22nmn`FPqjeUf2TU9k0 z=T`4ffl5bD^WWZe;T$s1mn~>Yb#Y7NcweQj&~T7`;s59=X;WW$_~3DR-gj$L_VcOA6w6~=XUDDva> zsp;_f0M=7i=)^dM__@FY|EUgSCU{u2T#V}BIjZ10DnTLVt%;k~)q;ug3dP#|@jFT^ z&t@DvuD;5jaKHDu@C)R-N}-);7{==P58ZEyfB#Gd0j-|_Cs3NLZztRzuoO+Oil{Pd zo5Ul7o`cw`w3!M_)5vwx-@&5t0$pY)1+!&*&#o9OZC%fH@8>^1uToeSvkd*jlKmEk$b8T2FvJ)VL$@9DB3XlF;xzsn%hB z%%+_<-w>SR&A1T5$kBnIPC|{a1e$*oDH&kNsjm}RsQEaI#JpBr9J-2;_^23v!j(T? 
z=O*TDH~uL1b*b;JtDw5wgp*Bc;SZlX+B~18bJ=+-=Y8(tmOjBr>}}n5>RbiheOl_Z zdEN9&QVmu6@k@hJN7aO{Z+hzTruh9fY5xZI(%Pk|2kp%tzDk8ybkmdQ+MB=D{|J{e zUOM2r**oLt7cy&n>vETP#RqhKWuV5u z5P$pmyC2$z=PTXW4w4MdW!%a@Lt-`uDR#>)9DSU_a=8a-UJkB2UpPlp7Y-hWFT3(z za*m?e5An&7>(HmC@iY6vnfd3k9zA8F7P*Jn)ed(gZgah}TR6<^OnWS-%oTOm5v$to z;EvPmeCK6zRB-fZ9GX-LS8@0Dzq$`$e<}zHyM6#)?s=s0_q54d5l~3$=&7@F`p*5z z7s2RK&uiV=gNeDv%1IyQgkoR<~tW!Q~vbO2luSa^-ZE_Oi(vXF8jgnQMwEm67cr@g2~Y5`|vTG zhvtjZ*3O6&hX7?x;KG(t9{ViB(7mT2u#_d=^n7^6 z)jitJ(rifU{X`A4QGub#6bQ`p{?z2<8>$M;z`^S@Run6VV_*IS6kfafJ^b4rVzSdQ z>vvKCb>*Z`(NvO))?MD4o{2Nn7iIB0ygPQ^uFC765>&k7n5}65)RNBwSw&o-q_p<; z#Rp!A-~Vh}icKn|-0Y=qKgy>=dW6W&Jd3;cN)hzw6irhzfqNeDY4=JMKrMReN1%fG zPR5szvf6uvE-KPDp5XPrsy^hEbFe=weJWpk`@kbO8$>B!3mSa)e9w_H8Y*C`zE<3H zXqyUv1&p+POvsFiB(H>C#3s!x*Uh8xY8!rRnC529ZwZRp+pr(@N7)J(2# zd|wrH`V7?k$Z^>>SEavG{BZP*&B0anE_#+zt-`GnU8--glkLq>7v4#=Z}p^pTfGMq zaZ6mDyn-*MGf&pT4Sb<0&Ifn*evmi)l2;Q|BwBgW=P4#oH{|c|dqoljd=v+lM8}=4 zvi1|0FZ%)d(YSFHJf#egi)!^{xnaSIILKReWX^18Fb1U3KNl)+Val|&oWB1JvN2W) zGHi{P582Y&wXISD%F;Cujn0>b|Q`a&71 zeL(h&5;X>aqgXIVfInCu)pQWv(677be{M}zzdP2@Ak6@p9?sk!qJa&M34Hd4z-~&y ztbAZM>R_%u&_mPplj~3!xzXnWjK_E-5*{guf` z;1n35XfTHKI`puK6Xm@Sv-$gDNFn&N7jK7a*WCr#$MdSc6ld& zqZ0P71;A!5z+7#?gOx1Ko>p`#ML7?$+f4D)N|zx3+_Q4*S4kgE>p!)S1=i9}-!k}i zrulxt_>V03f5Tww8T^+p@@nQ^wcw+iK`=2RXqg2#G6jB4^Ydi#-+&;uECi04=AVKsoux63#^*f==B2n_SF_xlYQX*E&Tq`nLd3F41guhNN_~H6$#{}mp9oCe7 zMqSG-(jd z#m9uJAv*kxmTEPnG$oaFq*Bo_3XH5f-!=lmjg_!U%AYz=yv@R88}N0nM(NcqTaT%r zF9oVJ05wvIdX@E#xdqPNf-XKl3m&8D(=bxjyjPFqlv~Es6hN1p$3g-EjEK2!y1}~D zB)G;LnpQz~Hcj`sHLr!WUYz9w^Rb@sF{ao;u-I!b2F+g7uaPgPH1`@AQwsION;DCe zxzTLrrJRju|6m-yB!Tw|mif4pf4iJJQ{T8VSOQlHyHd({TZbjZS$HgkL0{1O3fk1h zv&aO2`q{#kL6Ex|qn8R9ly_#smr8-G%?hi{l6TEOA?B39W=tWY%s0$`wwPzUwMc+i za{N+UCrSMd3F;B5XaT z6zUikJa~##s714#9Vuo9BS?EC zqnu4?j$lx0nS4T!>P2%6qM^Zh(5h92BwgoI0vWd#xhDV-Ed!fPMJ@vJFMi#A=`2gl zKFSWLBX+0cG(X=tzoo-}HVR`@ z?Y{3$e13<1-Oly@x!|8+NFK`b6})LVWHmTub$9uyrsgcYB-(TIB~9K7uF!dA&r)=5 zg=hXZ54+!<&AH%ocBKA4f&Zdj%^OUA9K$g*VD zf1#2$33BgN*{&!JOV;JHfAD+Bkze)U0TvRGVHoiR`i<3103uDzsy7|k zEcWp7zWmb6++NMhlmHjrkj{S*9darcahiA)k(98gnw*yyZc_hPB{@eWaRi$va5Vg! zGwJ0i8xsrfM>@9vh+x#BbE}G3{$h$RBsC*1?PqH$L-q*qG*OEEf#P9Ix)-zVqS9kr z9Pp*EQyJ_W%~3>=njLc!eFt$22+h~W{{>{Ici7pp*?J4|>sy5k)G&hqNQeQ~DFqYW z!udbmDvmRKI!ik-Up6EiHz$|Hkksnk28@BjQZ{79zk+l zKeB~)=3;lB2HEA^4lwiDAHJNkNuSS2OYY1GP|eFyl`Txp_Z`oV<1Bc~&Lt^cS2sfI zd_!$n=MIA~e=o?+S1>-u85dH`j5*Xj6)-Oc!RNnc*K={-x1yWGeuuLy;;XrS{bBp_ zvAFMa2-v0Dt(wblR_tX`N?lN5KV1s@G=wZDwc_GZnAiJGO0yqV0c4d(aF-MsmWe!g zakLB9=o-dw=al{~Oqk0%DWS={p)}4?8FZ`ienDl#r^=8srLGx>M*H?5f%h?Yb=Ixw zCtWN=XHz{N9}zX)FP8NVk`3krE)!Jw%QLp>v!MpH+V+Cl7qI#o?(*vtwXJ6_J3rOd zbJq_%sHau4q&`CVaRpu$->+LbtDC)5_wqr5N6XOeEwcme#1D;0q^sna62<6(CZ46H z0B&WzDocG2{_@i4TQ9QZkW5~#HWU4?wRu`}3I*3@@j>La74-nObv|>R`>G-}BM(xu zX2Hw%-HUNl1PzelMa;5Hy-{ZqS~!+;e$$T;$aqr)H; zo^%J^gEe}3a_8oIYR;7Yc>V9p1&G&Tz~0-guN~w0ch%M1?jy!Y?56e_Jt&SnbS(MV zX^P`tCScvL1kFl0r>mTHf^0eRVby%T|@lrZXg*zB*eAauA^Q1(=@tmfaePSDIHC6IVm*ore! 
zbZzDx3VM!(wWT7arO=s*b*M|MPgnn~y9`s~pc) zc~(~W`OK;v6TT4)tkk3W2^xw)K@O|TPJ%RGU~hOoox$n~*<00hhE+_8?SJOia*}8u z8n@vj+;qQ)5h2_J4?rgr(Q)M*$1{$A)C`lr6H4cggrbDeVIkn((LpKE`{X15?=L5Q+G{v?GHdbD zM-RS=5}M~--ehuh0j*Sp6n#`C`WPve3#1DEDpuwq9!(TqxcKVGyC65KA>v6l*ALkP zc&D~A#`lgV_x=pv|2KR4--m}=^bey*L-e`Jk;kV_{rbDTS@EO)k1i^|L4sd(NYz|6pa*xEkd|| z_9KW@dERWbb!}{sb$RuCy%m}58u=xfhS0b5p)XY6FKWG|2XvL7uV1bgO8$T8qK&36 zg6QuT-4rnqj7spkqXKb#lO1x_u6AmST69h<5FG0?Kd>~y)J(j zhefekjH1$1*@iXBa%K`V819W?J%1fg|Bo)}lDvNUq4b(lKW;*75kD|(lgtWpO&R^1 zr3dSRZn8cGWeC#CQht22Ao6!H$u09;j+;xCsTS>>?1<8;a!i-Pz=V`N&T5c#^Tu0y zMLiXmfrAm2B}wam;)R*Or)SLiw&XDt_h+FGj!Uy2?7j6Uj?_zE5du~>{9oepJy59Jj zHLCYC@K<6${oXC%s+SjZK8?hckh_`Bgw&=q^cK731&a>3^VP4?+1?NgnY$!Rh#j$J zCFEMKf_ys+&(wT6zdcWtHXcHT@1lQ|3fWkjccq zYUgm_tz3GHRZ{6jQf89y`RP?p#IudrWbt-zV4C)XtqU!B#QAl=$c6u`fynRZx%Yk2 zJi$gVyO_U$6JwMP{*!7y;cJA~xbF!ZxXram)&gfMT`iU8z)7j!!SZ^@E>^-5^$Auq zbk5T1`XuX6QEc^u%foUeYy9K%6Hk?+@e?Btjs1QuNjac0i-g5K3jDcThyY@4VF)wP z%BqN0*LjCby^;*cZjtu=Gy9egq>2^Lwq;`dvvHK<@<{se^-~LVxAIG=+ggjZH-cd+ znUh^uZ)!#r^4Mhw$k=0Ci)aL-K*TJIgg19nG&1$gkY3Po(%_p%ryaEnXPK40xtm)2 z`HDBwelHXOuX^_s5z=J3^C95q)Y^a}pzCxy=$G1o`a9F^AKl*@X7(t~qh_dH)BlXV z98P_t@zu_S`Ol&b>>K(qxy6y6#rvst)d*@w75`vJc3^OHx*6Yrxw*4Wrz{CedN+!U z+Uppn*4ca#VLipW=V_;*)+GCIxr{3raSb`sjz2UZnDg$IT7LS_7fW~*datl}@8mlq zD|jIKoPKTpf}?-}$q2XeCrb5DFg_$VVc9LNf@_~9bVN?`=Z;=t*hwT!{*)`IE{a5J zaTTa>QEd9X>{9L`f2ojtK}e;_BbHMbjm$2AHr=-9n!6iXQJh%_6I0k`pK4c{%4PMqH)CzSxN011 z=jL_5E|Kfxbp`5U?)6`9*Xb@<-)df&c_LQ+p5931v)<*ujOp1SS2`)HES4+GPaZ5^ zEhZe(D8pO(Q~0iyFj2Cjtz4!&kSzsj{jn5OjeP}&l4aYk^{DQAf`!BrAQ-_U$#Gfe zOL@gS@)`z)WKhcF;dp-cBxvc#w`L(jGRVmp)%Q{|i%gk?yR-WD$OG6E2o!)((VLK@ zw(u}C=+z4ix;i37S5L#Mx=;qTTV`bof=&ov&(ddx%};|wI>@WFpaf?&;)n5QX0Z^8 z_jl9{Y~LPkHC{24#K_>SR^9-c%Xm5O*gvg({{RdhFPPBmF$b@WwuO}X)@K{2GtOEv zQ*hX>?Q+X9E}XGDzPj^Sma)m6F=K$%B3XLtLHFgd2;gRikMN#~bfbE=m~!{myV6@{ z1o54V@0_m4#0Mjf91jal zj+xZQw+?&qw@Zq?%8KZk5L74q2OV;xBElzp80qq+L#-9w>nBR^q(4{`^r)=roQT{z zy1pXWW@fAOz9)ZLJ={tFbLl@6eXr%k5zw>nGTb#d=~l+ zMTLJ+WV}G!g!xdlujXFhjy>^vXugD2)4jK>pxJHGbzV1Ess6Iy-AwxXy}ua_zMlNO zeH!z`bA?22-$$c*z8~7p&v$_52wkJQqDNhaPl@pxB2Z)hSU6lb<5MHQF$s6}Ey#I} z_k*a!Z>DTP4#yI}yh7SCqf=RuK?M?@FEf;iIkXC(1-G(s2r&@@_3Y`OgvDlrtcbw} zSaZm$MAa>08TEq#tydc6OR}+eUA2=Y8g8f?j}LUPt0^Oi=L`hz_tY1NFDLG{nDZiL zCn4TpIz)Sn`UZvFs@0~kFijPIq6rfb68-*!}kPMGW9!fsQi zo<}Q-rM;1Mf7M!jefRZq>NmfGt~~GzU*}jU+4Qv4#`|_W|HwmsAr%wA_YfS*E+WAX zui{T~x|p&2EkmPJ)W|b|rvy!3Qqz+M8K)o0yNTia>GRi_vIXrw|BhnGlU}8kDwYLj zvvW7$fkh;yI|wE}D0t4?w+t%pHb>jo zX+2{D=4f39qtf(odYks_G*U13se29>no0smBuXZnp7D09&S5+eSoFiu}Iu`f1$s_dTKHSTm0*tAile@fUrM(2hi zr5q)-8}gy!=8bcYBr6t)p%SLSIhu6mE3pPik2aYej3$1#3WX}f2W=(`ju?iW!v6aE z@dB9ar)gV5P0nNm#5*pd95`&Po4mwWQz;24a@XREt5OTB6FmhPJ(}3=^aGKN&^}w} z{uE$Kk!|9>Nx|2&_nT?3ywq-k^u5_s*ViW2&Xnil?~6B| z*{1JoYL|+-|A?W!gJ;|#)2cDlU(Bg&D5STksr(w5T1oUp&XldP|3j;i;xGEsDG5(5 zXr3GzJZash@-v4PQBWRkGEs=(!CNpDBBCtMBVhEgpcwLl=hZtDAX#UyehHoz1*-bZ zB^E>7I|Voq1v)qfl3i6~$$wl`P!k>`z$kB+bsCc?{x);XoSqj2ZKVLU#!v?)QY+_Y z@dF_5s4g=-9|5^yAVOO?8gUS=7-0NKI_e`pXq5J3loDi-71p15fdK9!7EIZj+JU|6m&Bfd@Wl6|)tHJ{;os0h1Nh{$2RVdFx6iC+m61Y$C z>_b10x1W|gjTl){2r8hHg_bmkF!ADntX?HFH6?UhC7|shR`G(v7!We1M2~&zmHXygtVt!_D>1`N{IX{x|(_%_ z{N4HwDfJ&~>X&ZS<0)tZoPjMV6`kVzHw_!Q#HsLxwL4=CKeiiw(!Dwme?@jhf8Blc zJLT1%npc0vUj5sCbxGFEvx9H~uQ z_Dw<=Al{uOe)?vCLXc2tldyNQNNp2bqFI8zStzwxhW<52ZL_5DYiXX>%HGXajbBT8 zzgF3K%~J?a?|!ZM_qE<%5F8JHn}aYs&C)1l0p}LIRPs=63u5Q8g^Qw9S%S(%mL~D6 z@o2TM|4-|@KJ)t+Mvv4suiCb|f9VVin7!%S1H5s5JnbQ=ZDrTmL%rLt-)wte-|nma zqLjesau?{n)9y*%k*CTXw^Q~gqGJ=^;Yu%cDAqaOmXq+e!;7c$O-6fuL_2OrfQ!_5 zH?^IM)bWD8<3&Jc<=>7%b)K};&cvOrRO61izwJWyol}`*4p=I^0wI%(?lM4kn|J$b 
zbsix(>K74hg7jU_raNCqJgBYh8J6H0*y)@z?oQunUzF%*?S9k2)6*?q-W$Q1g)&I1 z5lSlMY~}0?!oF#y?;UBXdArj&zSCNc>lsUB9RJ(3QrNl7bIJAd@0+L9a3G|eV)@Od zh;Hz5Z#VNx8w-f#U6I!5{%6?!fYkmnxxRewzQe-4U)?Wz0tobV9sg?khj_?i4MdMZ z0xi7Tdx^+aCv@A0Ef*oQg!X1Gqi;6%xXj?akY#E|(Jyg$*Iya1ZXzZX65lovqL&Dx z(}RhB2a&sdqT~H~Vy=eNcpq zFk(x0rg8_~17m6DMg$}R)!HejS%|)T*WQ}S5wq$p32{Ab36i5+sJ@V$u}3|l1xuqT zKHW9&w{&&y*mwKhM2=mSR&DHOm|FN<3!$5Ee!-I&3l{Ku2OmdZ`}>!<9;UUuJ|AmZ z9utt{tB!ov64^Dk`z~E#a^AlCD|}*oIx=@Zgulr@8KIiq&tM0YGepvtnbT~@%2T(S zZl#U&olktgPArZO)d#%V-8gZ@- z(p#(m{STk%naD}Ddbowjl$h_7L=^mI>f|H29y!fP*JT7$YFgM)WP8d^PLc07iV-a} zr=vON-bgJ_Kl@p-0`^RZXLm-D;T^SOt#Z`mj1n(DtM5FM6qReTP~5wD!+OyX(yR^k z#W}?U)d9Ot>4G-{wWG=N`5Hp-zq> z5uZQ6n?JDCqIS~Jb&4r%1@g{Uj=mU$68JSbHw4e^!V0@LQ()I?L zT>ru9^IT;5qJ6#4E|nb={nmbYzhioA^x!! zjqBPCU+pKq+Q2qwlo6&=OwcT*1at6#Ihtcm-qe&^x_5bh{b|OI{RU+#>$OnD(ZCGhSUJBWlQ@Q&y0h*B^{xWvagOAf!tV<>18sN(@JC91~nR}oJ zS9dDFE}h&izeR9^!bhw?%u^}4GyjiICpi0|R9OFki79G3gsKQ1O&lkGhaS+|DKlr0S5Yr7d8V;PC6N2=Jg~p%aM$SFK-%yXHQ6`T_K}Gja`?wbz# zn6mYJ7KlI@k4*0%N}0~2xh3zVLd~E|{LIvO82mJZVEvJV_`tdjMnBs)BY_wDnbO80 zKSysm+}y#J(k9P9{OTDW$+*PE02kBu0OfLNySQfTF(} zNMDJdAHv54k)wdgVF^U08gm*sHl{KtO)j44AB8;q?F4c}aDP8j>dm9n71*MZ!ftko z+{ON$Id~9$8uuieF77``!m)m|EpuUym0`c3;aTmNxclK3`0z>{#&&at1m`h(VTf4p@mL*Q*$0=h5o!D2wI*T4aMG}Ic+Wf~K zE}fVfNY1V&`f&zD1?>)q;WqRrXikqzad}S*xuoIgOvP+)?!^ce7-# zQ2mMZR=vZgt%xhA_6jvl&nn+;$SewbpDgWYx~CWfth8nKQ4dj~*FA8`u6|?r?QE0hq{j9~5ySS~^aDULYu`vfSdun(rhGBI)3-ij9{a@_eXH--B`Y-yKLTG`6CP)oMq<4@`AfbqeASx&b zp$XEP(i8%rNk!-r6Ld1ld$LJoOJIm50Ay_o<4eiWBjGeXk-y4x#vgzd7YLD1R&Zq(2>Mc5MP2Y)54sw;>E=7+#;^%BGYQrAq+gNRcp%91O9zn|QBlxy zm8a#tw|Q)@)InwM&@bkkT@2#&Xv$!5_XzEuQ zSxb0T?sQvS5gpQ-K$=mhV#TXm*1x|oa%sCny;<+H`qfg|Q@zX<%Mp^io`>zvHF;il zNzyrY!%HjkgWrwr@ zf$k#b^X@o(FIt>P8>`X8iP@$)_)NuYe?ULKLis#b91RSHmtMZJ5u&_5aMWB(xo6ls zNF8xfyE#crsSbVX){=Q1{)=pRN<_emNUuzT#8vm$Le|Jq6YtN-`(B6Ao@|<&Pcn@9 zaQAL|)XKfLzI6e0Juat0d}H#$6a}asFEu?PoCxG0QRDcTiM*xN|grT`H9z8FRjgB}Q7JQP=jk zGT(>f1M*i<*75MOv=7XZPfuEysMTs0E)=9I9dDl3DB~=|-Jwd|K)ttsc8UdjkvR^xl$&nEZlJ3-9r+86g>wO-yBOgN`;N48C6v;=h;O9+)gGm zy7PYFy-oOAZ!h?(wVnqvZ!p&s&~s zKA!(%^p0}kc`Hv{yTb9&ibI{}tuHj&6@MGOYsBPZlc;;5bl+H|xv9_bYt1Jrw8rio zOY}K$NB95gqEu8;P{ILyS2p{J4*ss7B+XXW|8I5C-TyPX=xxM>N>c-$gnXzqofPfo(8^WUQLca?Yi*rLYuGicd$x9m zuLqMZ*55Jk_pzUveARXFNkf}|K={dp7A@b<55fid5w-tHnIC`^Vy@nEbmvw1`ZUwcr}q3Kyvas>T=VQ{Mb^c;V8 zM3-iJaAx@Qyclz2kEvnE)$2#67Y>?7_MU7H$$vV%sGJHZim^GydNotDa7$Kw-7XnfFg*)!+T>pUCQ+e<7=d=fyx_735X3{lxwK{lopw zfyioOfc!w93Mp86AXxoWNXg|8pP~yYzo}LE3l}dAUerAse&=s()%}tRNUeg->Z2IL zwAk>!snx1@9H^~=$m&0Z)!d}ZAhP;fSjGNRSWQh$%}lj)O~e0FSpAz<1#Q)TNQ*=Ba8lmRZEIIAKc1~E_MfP)tTZj&{k#K z4visuJub-yVO45r*q2gY5LV4B_y5~gU9Sj9`pv8UwpA~Huxio$$ou!J*Y8JXR!4%o zYT<){zhTvi$MOGdtNsnE4nN8M`Xuea)3iT%RdQ42pSj-utx``yb43|iW**s=E3 zAL~mm@4F1PxBdLyGWH+B>VgakK*%tNaJI_7gjEmTB?bxTtz#tcp3?qLVYL#9mddwq z%dzlGKFJbem(3agQl z{}xse_&s6ut|0V04-5?otK`3g)iDNDMWS<#ibgbuW6+e;yeoUc>M@}ke&zobR)^7| zbnao9zlBvaD6Cq8!m8Sz!YcQF2&<+-$H#vQt2%qa>QGXVDDR%I`nm+Jh8~#v4`DS5 z6jsaXA$!8AdYZ@txBP%4jTf4tTd5B{Chczp3aeTARvR+2n33PY>V^VMe@|E~E!Ef) zRzs!fTVtFNRg>uEO9Df+*K)$#;l84ElC6jn(o#K}Ej)%H(e z_4fZHtZw|Lu(}v02!O&W%bu`mjQUGh6`ciz)xbZ6)pF>iiNA!^MGgq$U&3m6f)!bg zO|%9SRyF=ZSiSL=uv-3GSe4upRuj`98KAIQnD{>lt5tsrs|J4xt5(GH-2#HxIM9I} z{Vc`@3aeRIx_~bO6jld8Vbx7&bWd27NA&y_R;vZ^?n#sX5?0;!;r=bG(*7x|?o)fi z|Cg}ph)nuZSQQ0@)zX^&AYMzsWsJR0Ysp{2YRR9%DhcTU3ag_zCMcObVYT9)!m7cZ zu=>K}x3KD#edYB3B&;^?39A<;%*88`e+#R)wM0Rn{NKWA82o<{R$c#>!YVuZ$kqQ4 zR+BLG*+00hy)68|5iXlO?P)7Y^&3_E9&GpGSE|!K_7beaCesho(2_Y5VJ6q58b#fg zVod84LwaN_X~TV=hYBNZ%eKs!@ztI6SCp+!?oe_5@#!U0 
z;P$=PYXfwCwv$F#u1}mc0d#-0;Op|;ij60M!5E?yro>6$2$SVTw%%_vzT<-!|=3U zzEkLLIdA>WlMKJ8ocfhos2*+-u-n!n;G``LAqU}qaFrh)=W}W8`#wfI`Qxjp%Hs;0 zd;F>1pA?&hQB^2r$o&lSWcO{fU*dL0*}d>9RBO2BY? zraAl+>*--<-;J`fer{6H80VsZ#r=0rGR{2m74s;4^^s8K>>>udMTAEc_It+z>xL(q+b;C5-T$0EaGBR%Z$0obta7I{^{ws3`p3K zLr9Wr_K4|8n&tc}vOu=0Sl6y#9CVjuIX>GlrFHrNzbY=u zyLdlxZB^os4{!P|+`%y;>~MfW%QzB_c@&U&=Dg!sh2;pmcq^AHq`^@%*PAvWJdy?d zpm^QU|3vAJR}wDM$81-}Wm}U^p0H2z<<%5`?0@4$g-d@vrV5NtwoQGz^u!x}inxFB zdZMfcHXQy!LSByCGG@(`>t-7vtW?kOsr#<)8?9+cQ;!~9}BDzSbQq^5XK*}4%^rr_8^LzEq#7aK)ob+8v{FO zTb+G`$0LAQ9JL&F>1($SG7eP^-uM?{cd)UD+1TA~B9rUEuP4xG2d*y^B1gO*XK)<% zY@9#@`V$zi06|G<$H*-4`Pj(7OW|MESS%?_vkVReZ47S6P}er_rR0I=Cp=Usz8eJW z4GNPjg~^4%zD!1N!7-S5o*Mu>n1@L_g69iG`M4^3@VvjNcG5#t zCx{!Rei#o9J-lI)aG@fKocxd)NM|n z+?dM%m%hd1I51%$n}s)J2}9+(k{@x!LiGyM@hb%rSBjT-zNjEUVYOTvsr>Z{D6CdB zrdI!#uu3Z7JQ%_C%ZA8hOZNN+4i=Z$Ck$-{KFfD0Goflx)dMWC4^ zW0SNn$R#P>IDd#lT(3apr(smG8rvcI6#=RQ%d8t~a0aY|^9I%n-l>V8X7E6=z7()- z$YgGBieB9ZYp+3-0l20-2OOY6LguSt0!QpYaq&;FP*2Bi&av=p`|`3V6&|=x=ikb=H>zi4xiqy zH^c@lK`JxQ9Ap-F2@4w<|<~5GR zmUC8*NIAds=Qv^Y3(f8WYOq%chsq0`YyxBHCcYm~0q^Ad=j7SSupeq~`p+w}eiLKR z@;}TeP>!qE5?T8A95sn7>m<2k*8R$B5a?rA#_ntMq7C>V3ShRyNGhaImdmOJhW=dq zQ2TO=y3vMBA$uZ{gIlYaheH%|_?`!$a}?)z+A3rO!EyEGSnchLWIfM3#jt%vw8K~B zynMU3Ilu;l=6U%$WeS6`MZ+H;ZX05jL*$D?M>PqhtP{5~huIJqG>TlXSOb$9y0Jrn z_twCKhuG48Kwqn%Ujx8vn_`2lGN@{qrW|@p1tB#Ad^(K|ra+W!ild3eYd<(O&TyWX zWOHAGG&4{q1&eoxccjR7swYqi1ZYH=sVEOD!V8*oj`=ou4^|BTVO8vPj^pH`cc3F6 ztXj4QtA<$K-9gLTASiHyuIesK>u`dHCKe@0^h?madjo`3=SmR6*F`~CwR8_wwLoO! zuUOZykB+E{fTjG}lXxCg7IVj#zK!B~ufJy;bB zb^i^kijjZAs^|wCYY;s$Fs%Dv$L0aZt4{T%g0QM42&?k^hE>ac!>Y&s1*`J>3syC8 z|7{7=e&*hcHU(i-FjY0@*+}m*OE$^} zS;`kjii`y`#h1eNFIaU%I>nwP_m^OA$o;GeMS7o^}vNCbiNlLu!r zGbWe7iBRj%MM?s0oQB&ptT5d~e@|F7)MtGsSqsj*u#eEpw&^y)orYFQqW$z!(_+Vq zI?o8xBY?y9+nrmVSycTvJ8Zf-7_><;hhtIaAfe0WBkgRp2SLAkWtjF=3m);2=0sug zMbdBB(EW+s(revD>C7Ic-6{H_7k58GjElN1Z`0(~>E_rT>@=sPP3M)O=WibO7)^6T zBiRh6Ib1NvQxv9qc*HXTlYdaJp+4Jd#5VC%@`@SNjfo=~Xn-MEK7qCe| zvme@~wJ6XphpBl|5HSqq_h_ykL^uMBwPK(<*Ip2atOL`7#jY>j*#sI-BSu=y9uqi; z6eihALlI>7^v3~1;pc{}66ET>JS4sG3fdFP$*a0&tI|MQl`^>FG*UQAgL2T}pJ=`d zG}d23($zzf4BFBz?{Izp2xzO$5MS{7bAq<&$aS=kBKnmLGChc+4M?Jn`-n{CP{$2FD##6StS`_~$V0?Z zsQX7R7c3%3IKTt52dm-;!$$qa(O33h)pYD%u&M=vGu8j~nII0B>fpoOv+Un5y}t8l zW(Rn)MTD2su#)2%PD`frV=IX3+{Ckbz36ghC%p8f$e?!iPY zmNN!B^9p#qLuSqY1b?WA7X5^31X!W2Z1YD4c{y3viSULR_7=*_;LMvp&dwzd)5aN6 z)+TRfwtJ+j=ScpXqA#aSZcWq%F<6~vEYPSvBK-UIi@qY%pOZ;Y;{^~_tq>WM44AVH z5U|Xi_HBKu)IJ>na52TqvslmMWWfC}?5r`UFb4b#mi7JRg?%xvJ>HTK90Yp_*Yd9lzS0S0&XipW6?=}5@!m8p&xw2oN zvI5wo0^S$3^Q%ia=0;#p!s5ChtQt>M_ESoFV=1(aPNpDgiuYjEnw`a4F)NM5E6wdI zt^a#r)w=tt=%>-l{pncdJev;!!5j(XPiG+`Y>-tx@zpWsRdy~^!_X(898~QK?pG=( zIVx%a_`om9ww8&mApF58{>Mhl>#cxQAPe0W#BwcjltOvNz_3?<$T)+uFNj0n4T_HC zY(%pOA4T4ytU*;j^WC0UAg_h)ptsQ@2erSP{lq+KgPwe&`6h;ImB=dbuv~HX7<%Ca zT3KwJrn?RiTZ4&>yvznc1=i3QjxYk-y=>;KAdXc9c0V#IhJq*|vcjKnRFGG9);4U7 zzM2khh+jf~C9ox9Sr-_bg;<4O820yTUtI@>8Ec3JEaztiYTJfA-G==;gOlyq+WDCN z2JGZ2Wxf;5R)I!s*Kj1)u&-u=cIlfwLY}EV8wJDO0C1)U=A{J=<_&Fb)gTKPqw9pt zlt-J_l+mlex5gmUR~uv}fvu3Tpr_n%#v^QzdwC})A=P>L-VZm(RjQ~Bmw__6s*SyX z!Mu%Ssjxv!4Q=!Rh$%Gc-VkbH=f}~vsF&a&p{(M zEHaQ2@g2Q0{|D!H2J;yTB7r<=Wwcfi#9QPAO}cyXnRvx>r;B)P_kZ=gYm}!8wB`L{Dd1LG;R*j*1f2dsMGp0I5}r?4aS- zOfQb@w~MiQyYlF?SMQRl@e#gxzV-_`h67_f??&RDf4u{B* zhh}@is$*mD;)^?uuU5~tMt)sgoO<<#3nh*T_QZ!1HF0>5!l~NL^s%nWzNtV6I$m_L zsgLuse6N|;gEL#Le(T{G8SS1i8$bHqXT4C1T^*`^vs=-;v@%ggr6BCjeYGPHtZvD( zeGHSa+lfah_YMv=yz}Gy)Un_-bzX6SB`57nmaS^64EM6bEY)u+OFf?s>sxg02ev!os1W!RDgTqg-{38uR^PmY93* zMT53oNdnGg}yH(mT>ee5#tvw_b 
z(@TL15K>>CQ&-t~tyFb`z@walk5Vy~lX-xIL9DrLsAOs1LUv-zffHWTx0xL}U)9J^re9R2mHJ%Pmc z&HhZF=96xxw`aQ8r>8&DP`WBNYLHr1!xCaLX}z!`lLZI-r=~X`2o5ck&m3|=UsRuG zmHGt7XP1tgeJW$=G3+H8t5Sl$9sQ&!_qP7`^r|<}OY8@@oTDO1d4XzY4-^G~@58yE)J4hgAAWhvbHu3HQ(!s;6&8 zfBWG@Chd#zE`6U$HMo`Ajx=rD4DP=5)DT9X3uGzU10q`WL-9GjEcckxqpRIl1a&J| z+?br-1VNF9uToWcHR>(sA%~sj7Hv;S-3hZ9j5LO^j-J=Xbq?EEngn7d)_U1XT9iK6H8wjpR4>FDga< z=#xd%yIU*u zX&Ry>@EW>Z@p`$J!KcW%QKB1>pd)dZ_$g?&yDaFj!oUD3Ny zIio2)C-b6I4Lt3l!v+K}^`~AT$#xH-PF7nEsxIp1yX)JG zRfEE+yCRo;#^TAEtC}}*S!|I*ktZ6vPCBf8ym#j2R(fy3;;BfH4#PcRwRUa4|5y3i z>bU`m^r^=>LLZaa`XqOsD`?%g_94|tG5Yk`qde!Y-jZNqWEmc(|$ax)&J|jGP zK`mECG1HAIuC&3|P`6m^GeA8f+V(`X_RfQiwPi@D+o?vH=di=Z

v=T2+bvpt@+9 z)6(R%+#ck)(6Z>W6l?qZu?%YH+_?5TrN8{@SB^2XD^j*5?Cb;vLgAy+b&9XX9 zoD%GvB^qqcKJjB}7tCLb)4b#c=?bIEoD0r6G3_QQTTzRt2+lx#fC#*I=+=7|ob|}k zQi%CRW5z~sPR1nrCxz3Kll0V1NL0 z%}ZtK0iBsLCHCx*+=ultZ6GH(wLESua#6$Z!MT&~pN`%=r=?+c>mxlM3lxkGAE+WH z8yqTSizi5Ay0uB3u@kaO2?)y{3_{qd1W#yNniabEXF9h|34iLc)uvx!H~ro`GgA?s zb&(@YJ#R{QcdUT6ucoNX(Qg)+-@V1p9d^mH6nfId-0LA=->Uj|d2ZBjR00VV+Malv_+G^il2+kf`xFsQz zg@}OyXbMbFB)#^w$m{DWL?EFUB61Q3nBCP0a}fnEBIMlzF<(i57%Y7F=UCFk-DZrM z60wh8WV)eiSP=;NK>!%I{g{X57?(E&S2@9tc0hR+*R)$*d3zMfVIp zv}*^hnBta0Vw=0{Q8C8903bufkfok0gH?ti^!(WmN^m_>^Axb0`_$(r3Ijr79;d?}; zo}55=ojFH!pt4=DQ8!cu0$t)lwzhwyaXsVV6^JnY+5*T6wSHM?NcU}7Q3rTV)KAfJ z4DF2G3xU{IoLq{DAib6e0|;PrgmW6N(sZlun}k+sGts&FV9$2V?jrznf`q;aK&IN$ zumWfzS6G`h)YvdjhpQhq`9&{|mh)~Z=ac+1fM`69(-rqscCSln)F*tto3&pUyOQ+D_e3;ysBYLh^ zSY<0gBUzxR0Zfbl2#vzQx++#W?2McvC+eY_5Xh36KLE(AJ+<{{=KzR=O1^dX7Kz4g z#*ua*thq<1uFp%v-TU#vr;Xg76T|y+#B6|FVXlOETB{rUNp!!t2smY@!5lN4!-8$| zE9x?+H@lWD<){DL4%#mAyhHY-TBwb#01M*oicyF{oTE**e`}&y8c3KD2`ou+K|PKj zVTw-lK*OzmLSRdW?ddrZDZQ~D+nd=9LnlD#5bj?9dyoJ@G&1Gi1OSF1(SzuWPGoT) zn$7mAHzv|s5`9#NcX;85loqqixD2~?;Yfg;m+9?&zREmHLEgF%$u5;`!-pG|H5$xQ z^6}~*Vybeudc)%Sf`X6J;TNN&`TbVPUs}3e=2TU;z_SH@^z{bF>jMQn2dZ1{@g^cF zTbP4djzkcfd5FVS2$SgM3)L;V{QWsp7%osiAJ7K8w(FbOO@(cB1FSlm*@XqDb3#9k2M(#!YhUMj1Ah@GsR7j{Y330#{95O#^%js#bWkyMI0P!D zYp|mOR?lB-AJgBm+VPV>qlpqW-E)%7DhY|kCkmZyhJ+6z@@Y8)^waWtNf)d68ujwM zy=_8Ml8}i{@26`!2-u3mF~LD=axBL=qT>v8NQ5=%pdW~s644?<&nB&anc+iQIN_rT zfd%@C3#ty@3EV`A2Mqx6Rp|Dx@YiFa86%3^HU~Y$ZHYD_KjWYi#P`*8W(7lXmygSe z*lCrHs2JF39DTW4G;FKp|MI=6oo0re&PhA13wC-ZMYV6)bvptG2O=+>3r!#ziVSqd z1IGS<%D|1T6V-fHw!%6v!?VDHykZ?90BzI>l!<3`(DztL7%ES>0U@`*3>O?3? zlbOYW$X;V(U?3{fW_Q|@tbch-M27^`Cf@R;!ejvHJOw5T0Fq+qewdg&n!}6+R$$dk zDnxuD6gD)*8bL;)i4P^!O<=7DG?N{NPC^Ba8Kg8bRuTg>hlTIyw%oLIOdLDy|BAnA z?7~=%85#(*YI0euKG!xZqC<4g7*cAuf9~k$J2>(DwPHyFz{5AeDKXL88?Ls}&1#WE z(1z_gq%?p0sR2`YCCr*Egdsw`MM0sw@Z6^dbV|Vv5Qx7+>nUIOFWx*4 z0K)7Iux!4Jd|)qpxYhF$mp>49?JUuH3d*8Q;;;c&v>ORE0Qjm~$RzfN$ua{{*?<8H zrT&t^@Slz;>=RSWC#Kp(2Nvxq5)8r;@l=8csFC6k7DU#e3G+mvct26yhR9o;RDP1k zZeuQ^GV035>(FC_u(7`*KT$>^-n~q`qGtTJJ}bR>I5 zNMKN}6{ssB`k`>N0RxBH;Sb(Xq0{NT_C<2rnw;}B`NlZ%?br?P*(p^hf?EVhk*Ox#t|N=BfGvA;AR0{Shy#AEJ& zKMS)7DNaY=lv3FEKufDadP||ygs`}Q*R4bL135~UtPL2>Xo59*Z44G<%ADe~P~>8w zRWJA^Cx-6CyHm=&yRpi52$DAh*<>So6 zo$Kl9GU#S+#kQb@R~IkMoy>Fl)G!O%d{#&2idoEIQF$vY?mP-}we4}Mo7a7o@Ip*` zq20HzUk3D948M*QRw!v~}xJvzq_FD^Jbs-9BupDY$?!8*D_E8`EJ1ALjBy>1-^ zlZ8Zcj9z{8IEL^fi5BJ>JeOc$+{L*wgDbQG;x2nJ@q5N1lrjr`r6S*Q6bZ!QBSZq` z%}yv0C?M75wu6xAmw9`h1Miqr&be_cScdA>`<=U*@NPA2nnl{^?n-N33~*tLs~Oh! zqYjAb?0$y?n3r5Vc;1bK5^on$)kjLvl&v%3g2OGY}8vTx%*D<}(b_4_bT4~3L zE=&5u%aSIZwsgMc)ttL<>awAFw2;GYXB*sjI7u#L`osW5(8KnfMsjW0GljJ2!<7Xh z!Q2DD3VH_|iFY_OTpl=?xDz=x-+b)idZpyEu4rH*a>_ybI1H`ZvXZDSFwXAnX?ddg z#k^sZev{zQbKN+vxae6nVjQc$xQ{nbni%)GhS!(5|0lE=7CZYi^XI58N zL!ND}a-}1Rr@YWM2D>a4?QG~opoe&b@v3GY5m>5?%$RncA#U*jh;gn{@&-a;&MosU z>P1#+u_xYX5_!v)phTi(Oamgt2@+)5ig+V|^%9Ot6crt}FIyTp(!z`Z`qTFyQ%W@O zz$};$?SFh9)2I4zxVOGPLkCs^xMmm`8Ufh8`JM&~(a#r~XL7vP!reb4E;2!IFO!a- z6T*3Ocu%PKvzHj&Yy}Fonj4R;nX1S}M?=ghO;5)kzqN{0V#``Z> zu|FDFB69|B$OU=jJVGo6RGBrx`5P`?PmUztm9#C6ELDz@{P}}?EXqtUsw^<-u9s@L zebha3wg{gGcb`R7o7YrT7T^10t6fHgr!VTU*v~(7(-~i)>Ic)RunM(-(NE1E^rj@% z-~QR6@~|d0x^*zRAvOJJY4o$?w6!y;ZDN-?o|Zhw{`u_KcB4#ZgiS-^adJ6Z!00ge(*{@V^y38hvPu z(UHuf+`idh`}#C9EFkwJSY)m+?6yr#t1s$@`&S z-Cwu3u(=rSY;S#EKjipo8QeuVI38=ss!g_MdZ+pF6pQXACV|URJ15>sReRPxrX*ZE z>G*j8fnl)O#8!) 
zA8M2zH5F`<$~~%k&(=H0;Pr@q&0!r+%ahAV2dL>?@TT$a79A00D#_x5jLHBm51ru2$jvrJir?v_k>j-{^bZ(+6K zhEKEX-CI#xvXv#Nx^nmK6h_Ha{T5aqG;hgOKkn6)f6y=)CI7H#xp_}m-IA}Nvgs*2 z?i7qxsC_QivL~$mRHz@e)KhF2I}@$=@UZ@|#=XuA$R~zeGE%9~TJ1tS2#5g){g_9kE+q#CDBAZT*YZ2Q~ zQDfrv>w)Tjn`sKw96j|&iKRn+d&+PM2G}MW{%hUX~x!bi73Hg zI5_CbWk=7EEF{xbG#rxLMoFBJn-Z8KbDVZIHOTs#%-=)2jADs2*2u=B&l)D2({V6~ zp7~owL}OOF6C+~Zvzb&=jj#Bkar;&q&G?2f>@huaQRk*T4(=N5@3lqL=A&7ezGT;= zt67!g^7e}eO+#|dV#L8}oQSmAoKb8eEWBd_2MeD?$@V@{=Sr-#K#uqJ9elUe!gysWC7Ggh)&owJ%ltgpyoLY4M-W&J_a5v1#WWs_%e z60@Q^l!~2t@|eGky2N&(`ug!_s{Yz+oTojGKa`Cq;Qnel_s(~r%lHs9Koc* zUvc=$wITb_7vk-Gqh1z;1V7qYT5%aJ9G6gn3HV=-|IA{`ZnuC6zwp4q#CJLC5+jrI zqyo;UoW-G#Ex5+*q>R_f_t!(x#~wQ)6J_1x9WApZMHeDSj<2!lcpsB}wVrr0@`r@| zCJTbjOVui-LJs01IW=sFjNp~))8ozgrOrJh1)sPpnOhL|?3#7^TB6 z9A}oDVd2(6K600iur3U{3SK)}to`c#+eh{MwO`mfoj$j`g@oUEwsSyKg;AiV!`(Bcx>w)MoKMQm)OQar6{Qhn$r6vy=7?l;$=AF3>@iQkfLX} z?fzX{_0~|)lBt#Ppqh7_@Go?zS+PQn<~xYMP`Q_BwLX_o`o+H0`DC7Y2I{hc@$JsF zj_3zIY|2BfY@Y$zZa#1Z{}aStJz-bAmRymw>0r0X>=FD$v~iLF`wea;^FL%pC}Z8& z3Wv;9Ygv+_(UYu&J#d>o)xru20a~5vH8Pii1%a|vvzh6iysvB>b9@) z0di#4c?=a*F|>U0a+HJAPZ;u&shU@xijh>KB&zHJv_bNC`Ui1wqEEoPCyMH@HbeQXij z%+q4tGV@YT4H9ooJ(`=eI!N}4;UJkvEY7hU*?I@y)1PFAxYY~D^wC9`v#UU-Cj z-}4>WuoJ@aCSbf+Lq>p-9TYnn9~Ze>Q+0Xe(>FD|C~JC6Rm6%lZmVglMnpk6YnQW` zKb~I2Z5(LoAjUN@=qyni`wM@mdTVCA{d@ba!5T?x0>$ z8c4`Iqlp^K*9t#J;E`Z`}VQ^>~;Y7iozmNFPd+*C^RB zYmEK%C@4`W%oyqBKoglZ7Tcg=6yhPL;vvm%0R%&ketUc1_OF@STnjF{5CY_siEJZO z(Me3J0H?=*{n*b5N}z|ZFvXg;5)>DH4J!O1RS?aP2s1&ParqXB*_@>rhIM|PB^lI{ zbn-A7+Z|gCE^J+5>{@Z`-#}rGz(BasbiqBmCmH&A7{z)Qu@oww6&OlwjBM^u9wqB9 zFaiq1wwOxf9mMzfi1S~JmuG=#)zdajOUA59R2oeVP{i25Q)bSO5bBQ;BI~et|eqB~UPp*C>GTuJ#z#!?di}=NOl>(ALsYLHIc&I~U$VH(Us!@#- zX0tS^r%5im@npLl7=mgQm8$q4HS$fwhTe*tkkp!h-9ft_5O(L8Ha>{O>3Pi4LBaGy zhahHFqH(uFGoH61U8T#?D2yb(5vRXnqG(p$`Vppa0=iGBL%G`hQ4h>?)#U&dXO`;G+yio! 
z?qWk7y5uggLI(OG6x!?pQ7S>6!zf~L5+(~N^)SVV1;M;{%U-FbdXhZB^4e*M+eifpWOHd>w`=B3-3I?%01bKDZ6lO7KVy!yVjkkzJ5F5yft3<;j2I zfcpeYBWPagTdCs{oK-%Z<(#Ju;vI%io>G95U2~DeEY9}Fi=03lup+e=fP6Qo3+|S= z1$i8Drkb|jvG<$DUZhGeJdz7=2ZJh3%2|gYAsCge{anxoW9(Yh#{5GjuAB%EFU3$w zahB2!^(2)O9oubECW3F3H^nq$tdaV6+=!EHJv2VUgKi9%B63bbzudKlc3jI?jBqz0 z3#LH?x z6i>j7%8L&-79aLL$8(~?Hy~m28eG2;Z-gb8n@Rg9+%=^vnzPXOPM8EjI}P)eOpBIW zE0-vn@!Vk&avw>22{612#1Kg(B*e>pCT)&hlXc3k^3D}Q%31{NGqu@`*E^pei~PV# zIc-{wIlkHvoa10gG*%TjXF3v}dkqvVBOAe)Vo*`#V^;}or6 z;%hH-Fdy)d3O#Pe4gGiz$W-IQXnf?mX5KK&C1arlvi_1BJ^_XaG8Q5dr16mb{Ra&v z;tpklb}3Dv9#ZODB?A9+X%Zsg5zn`IPoy8Z-+{!}?`1YY!+>GgmT(o}4sjMPX*vwA zfKy$<38_M_G=bJ-yq*S^ibcMmcb+Vf``Qh-T74Y%&<7a@W9bEkp+0mdUsj@?DrG@` zbCv2R5#JwT{PYZ6LLg3nKsQVvF?*AAiMs>^BuxrWJ&2{tfPi#=hhe?3cD52r6$~R| zJZM6%x1=xSuP&ZjefNztvSos=gzd*u$;AQ7SB>if@Y0*mg^;*aP8fd>RRs$-w1V=} zDWz4omuGPJpcQI16^)G30HFFnhg9ob*P%ECC*hz^HZ?=l&oqqs>J*+XIu7u!#g{^8rPP9>#&Vkw z6$R3lkK>xtw9hg_%>F7r6>LNmU_$jmrqz&06ZqjXcO@Dh_(VvX8|W_WxRig3i|}|kABVjPlysB1)g|z`vm1m08DQ$N;l!yjOW9| z8(})o4mhQuN)2=fe&Ms|ASz)V0g&%qU|A!Zm-Br=XW<*-lzuwUbWQ6&UL?>IOQ zL&wH*^Tr$DjfK2%3Ru^VCJ8dRwIYNs?tZmK0*^2_JjV$|)XfkMa;==*65A2{+d}63 z#QWrP9V|UKBkvF;q;Yx|fa&jGNfG}h*YC&HspTO6&Qs*v3&i(8r+RaEymM8acfc}g zc9r%oHtpK^HPk`JI%E_fN*3pTVpL3d*F}!tVJ;+t`E|9W8%B9GMtRh)3NRgH{mCE3 zZM6^~d30p2>J3FC?_8gkxm#Uz2X62>U4*V?W*<^g4~ ze2m|yx)IRHl5kI#xGG5o*RztQjijH)@9Ak%_5EOZI{C651&__(Lg)@90#t~I+pj~_ z@fI{K5Ht5`Feh!G_3^dPIKK3Fa0w=t4oK)Ps#w(;g*6yS$)i}*$N}_P>x71mhj*fI zO4}+KhVdc=YHFJ}X^andmWfJNt@BTq3l4u42giJ@)hYn+77tut-$tB$Px*c|x7$tN zPKR!sA^njr&)}r7gyod>t{v3yS>M8GwOGwe?z#u9!x3yFws7k`&u_1Oex38Xkv{OITukHID{6~k51`2PpX56=|wlFr~=7bLo zmu=z_A=J~p+>X2%zE6v)Y(+=6-jyX`IBi;`9j=OmQ1aH%=adD;9l^y!$=s2>#s` zOs)SgvYslF-cefE$QMD}thj*B+q~jibu0pQtB~&2U2B9Sv2FXfh^Z2CcOiH(6kSOv z>#EIyHM!k(bH^npPv0}@hoTBXJb0RtN!n4z7%Aon+`JLe-Z);MW8_Jk$R#Dbg}NM_ zl9~n0sJgfi&u%fLni@iP!qM}65`?Zp&?iw3^5K4uR<=f-`1{)FV{8D9dnXr3Ia^vm zWqyIX7xlH{R621svf=r@KemG-@fvI3KA{0GZvPVPD61{FHjaq?{yP3?@I6KD>~J%r z$w91#P$k5$pgVE-`-?eR+I^ZND42TFyjKpfvo|fB@!^EtH-c{x`8VTY6ZC3^q?#sl z6yg{ct|}DpN;;v~4-4V5Nl8Cp$#s6!gSw`Gv2-O3y9xWSwTr9yN&314n$@-GIC%5@ z7Rk2}nb+a$4$3TFHc=}z%oEMoUeKKG&H|cJ1EQ7dJ_NtfVuYQ|ZTZRK4HBkQNrxZV z;R)5ah3m&m3e1heq+E~|BxO2ah>2$=$Lpp88C5WH^|z-iq`HgvviwbpwIDnL=ks1I zJB6;(&tvYiVb+g3*_P_}6DL3R#0c2Ob^6%GG;GHka$pHp4ht{-N*Y6jU%gOu=hG8I zUwrq^Izd$KzDsR}LnukV=!@objq)vM+jXPXUo2cWqal>|Ce?rPvBw!0djwWGZ`&q< z$ng`F84XBG*MoWFEWnNp!|(BZtXB_?gHB|n?2>9-V(N~W%;n>QugCG{*&Da;NIOhI zjN$x!rwwc3_`Qug0<;f3mwW4oS4bza>UR`Pdm}4>oGD}EiI@^j*sM9pXtbRbW77Wg zYkMwTvH)N|A)Qs%!n{V2obaaV>7%y|QLT|e=@j%4qpyLc>qbWIJTd&9@yzJ}ium?x zDE!N5Jg+xSI-zwVi^AWicg!JahB1P1 zTYaz82WwM)u9+(MtbOQ@H2WF(+N^H(ZF|!4_9yoDCAWiAwV zpjUDCyu1+M&-qnA{h|Ysl2D*<6=V;xobSpVKDf3uHBKpvIw_A^s_43KRH>w|KT+~hD+t?v~ zYZ$qEgQ6DNYeN%!r+>uSl`R)vbosJA{rpugR3rOK%F<=`%U1HmZy_`%&qFIQfk^6iKyYb2tde3$K&GtBA(JKqBn zecTf$BgO*6e0)3T36MQZe$fFUl(OBEjtz*8h;T@W)`Fw^C`-ah$&O&QyKS#P}vVWH>TwjNDb3MNS*ERvm1Qrf(~3lUb{r%N_C+;Ohq^H%5+OtHfC$6icfqfvCv zuZS@Tvu|{P+fsKKa)oewtGK(Z3VlqP$Smzfb)+}Hw9frgEYnv~?b4=Qa)ViBG8?C5 z`SE?SqFq>n>?`N)Swafw!)7W9EW*`KiCTqO${Sh=)np1XUT=JC#B%vp9+=Isc`D+a z&!JZo!?6fX&Tum9kkri^JTL)qxK|{qG!rfKM_6ThA3xu-OQeWqBGLNaJFfT`WK1GhOfvMY zN;}4AXR|EVKH*g&;kwWnexn_6@^zyxkTTkQY;Y-zMsJ*uC12X&Wj>zl{V`Kb?k^MP zcx|e}C~{99v1xP{`$oB5zA0TW=T;&W>?Y*?g%ZL0@cR;V1SQ~6QhP1sn5{gT&o-W2 ztP?Fo(AKc_t2t@-bL-c~yFWj-v%b5t`-1in_tv1nyTkq2Z&B z0FB-R9&w0;3As_2jso&AhuBCFc`)}GWSD07w(hN*r5FPfM{MSru8_W0tfc}!L+ZsI zk$j67tD1a<$`U>4+t~QiOirBH-}Mz)l45UP5n%rr(4*;wPq^?Z|Cr&$l%CLCx%eII z6A~ozB?Q=wPrHnz+%0^}@1t^BE6p;kBECReXnNp6{#bf(`FnZyCm}X0XJeX~t|(Ni 
[GIT binary patch data omitted: base85-encoded binary content, not human-readable]
zqZNx_C~oV<1m*sySRV!TlYXaqO3rT%4f{jtU!q6Lzugger6)8Ld5_vMxcT$UqhazJ z>$=4TxRk=gJgaP1HY}g(i92_WwV6EZqIo#**6!q8VZY!9>}{^Rt-^9Dr#2e$0ZB9t(6_0yoezH~^Q;mDNT9_Hp; z;F{O*ObBC4ZI)wfI&l1QzzS!07soqJWI|jv0e-> z9eT2b$;Wqp84Y1hi#$R!O^f?9_$1%6jg#-$sRc7;ZI&>`dhZtztJq%c;K7Q4rbHiv zGsW*(rLp>0$7^5BA0NXPxa48!Rl_B@MRdCK?foEg+Ac#vUQZmi zC2IJ}G|9TY(WyM6`yw%iX*SJ`9_3aQ%oyW27BU@JFke&)_%R&^)0IjnUN`153wYbZ zGw#LOU~3Vwf)Sx}Z0JlM1|}eJkv$^Jg48<@6QE&EEAiV3@npYaaLw@!i-FQ z(ZNVGu9Peo7aGVTCw&gX!p~Wc<$MwddIu+~vmT3q6_u%c^+ahHk=o&mMY}}bwYPwT zC*xpX_~XlP2GMCF$uKcwF@(m$uO4#vi&Z;?9qy?x(%?mkjc_p%CE@}vn6CNB21Mxd zsIamF!==_O6F8r%I!{w&IazOTgMvGvIx6%-`YUO&RFz|eWvaXsY}VL9PK_xztn0xHF_MA z*P~9m{is~5S%X=;q%7Fj8k24)|Vx5zxGzv@O zH@5&dyAxeKw+kZi< z+>6gs?u@s+5eTY+q{v7S8gF|ngCPBFS3R;~6#C>T-aYS?0)?;-&AXMwc_dpr#Tt1{ zAe>TXj>pS1Ym%cm8N8nt=w#&zCOg6=i0#E9l_y1gY}A<&)X^CkNH)**99O8W{P;lQ zo!m<+Y09Jq&Clh&{Rv1x5BBa17Fm3_`U!<&-PKo0B2$qh9^L^UJZm5|0kb*Qmntyj z^<2IZ7}BYynxg7;FK@>fFGguu`ERsLpY?lPOSc<^{u5XgybKb~{5t&>fEnn)Q41b&O_8oauYD(`x|6Uwf z=kklw+I);@S?81K2o*gzX2wH2<2;Y1r3ltWsR8AUhVp{IhZjPaeImv?18b=mPt z*a72#RV#Fj_JVb;SnPHlh2N-50k!WYv?rGFI^IT`t|`9Hf299LlgOdi6HD#^arLmy z$1PYLdM>+aB_*xV&}I4^dxO?-xvyq}2dr&cZ4-OG_Cm$WU?+l|)MnkDIO4+eL*9W)>r8*K8vZ(u!!Q1zOjGKHxe$`jH8gvo$N5qCEb>Ze ztqz1?FRVi2*7Z%jt^^V1$Vl%`I<4Y+m)~5@M#y}RzTPg=ZAuDH@Lt6ellUz9);g68 zV4f8XuX8UUPkZmZK3OfX*}!xI@jevAXnTsDauyhy=h4_G07;CN#&~2<(R)P6N=#TK z(6WeYysQcTFi^0l8P=LTezzukqJsEw`oqSZs2imu_tl_9_Pi#NCUN|q_f!f9uYV4H z{AEr%1@vwf&CH6A6*Ieo6+Tr%t9mFt;@>?T&_3EtCR7-{@`c*RJ6TF6Z;bL&O_ZJ| z{d($D!t{= zvAsdQr=6v~`!Rm&I${0$5J@#HUkvq;74v7I zH2OSAzR}K}uYZxbJsO2BT{2c6P?Wf%KEP4p9*7^!o<4<~FYcZ{)u=y^B*3oA?=m!? zFX^G!J$yoTTHjp&Z9G5}4w-@D`64{g0c3lhz(^!vKNPTfQ>!dcYg0%1?UGgqhW3Pc zP&|ZtSw~vz)ZLQ}xC{$g3ucx#gmCo)uIEULJqn=ac*av5z$eNy((>>n3Bf=Xlp&jl zWBL)j3tYqe%-kXW!b z;O>We1ib#hOBmjM3!WU5=iQM*cP&#kDg9(>R~QK`;TBpjDn@aBhga1)GTZpExNEMggEg0u^B zJQnPb#XH*tDv*Ldu7E$qM_vNpNJuOOMbr@>{*7y_+kqFM3N*3|N8OpU$ypE@2La*X zZcgAC2~iWMS`?`yP(Bhvd$Yb(fXERx7-xd;2ACs1eu!1InaYrnI$krJgT>44TiFwS zOt4)X(BUyk`%ISa!*m$2os|uVhxSPlMOIQ)SE5#U@;$7iGcsa5R|>0LtgE}Ej`k$G zP%OkMxnG#Jc2|l|ZwlX|WP8q3i}K{J2>?$d#XdBJ)+LoPAmy$FAg29>uWDKnr`5iA zYFe7;GgYY_9egV|@dgrns)N4(!_$K?-Cf6ZC<9lt5T?rFjxB<-5V*!Iv6qNcimdda zjWh&jdPNJKgE>hm5(lvXE&)C-@1-(;TU@1f3CK?y{VJJY=V0YbiS}k|InmziGruvc z@7}Cz`c(w(%|laYh0wkY%88h8AhQuOpy)CdVV0pG` zWaTg$M;Zq-eX9+9lk<*C0*&GBy6j`nb77#sDnF>%R5Uf~kAJVP;LR?b5Jt$p4t?9# zg13@|wRHuzd$W0wul5nxS0RO$I)&QCNy4LeTg&-fUBvWdSg#_AnN>3`u7YoU0AI-= ziM1zLh#>j+j%3xCV%ZtjLZUdTDc<(-kI|1Gf7uPGe{PF$!1Lnm9T%{F)v_!+yext;!a|l@<5K)Bj*q*zX27?4`o;7ZZ$O1_g~kA4t;FP898-XDm#$Lo z5ddvu`32J}dRQ3?Eyo!g*Knhp<``cr?k#6zMWNiAH#gMLDxEyTQF>Z=n;S*mxr-y#TCPpYf&2c2DXrl)v{H8#P+(qd8(D2% zSpA^C+HtG;Az_URca58RjmLwU$B{K&g*8w6Ykan9{0M6UxNE&<0ntc2ZbTKBu-Z!= z5lLvije#dsP?exwm-OI=+FDrmyuU7E>$}>T#r-Zv{oU&a@7_ed%P&M<65gSuRGv>f7U^|AE?afCSa{q>Do_05D04Nu9g$u@L8Xy}e?xaLgO+TSp^)zEsap_jX{ zyS`!gLF3d`{phvES(C=;t;WH^#@TC)AHOxOa5vCHo0im@R{NV|e@e6PrXE^sv71;s z{Y_uc6{4pNN3)nGTc}1fn&>xDUPH@qzb_Y$$NM`qi);adT1n_yi_4P5m|7{LT6OGO zsRmlX-m$cEt@K5{VxuigQLR@RimNf}CXL&!Zntn~yh7)KbC0y~f!Yb^L|#O+JBA{J zx7)}X8emv$GN4wPIZQ>KR>pxA6;LaO2K2gVyNq``NU;6pK^kOE)%I&*W*sxPSFoRCjRL4Wp7ToeK$0*{54NxD4wqUwm zo}zAnf$sP@RO>6A_JZy1r=VWxpi)}_(APdbIB9zQd!X>e0Ia;9e{P_Qr-yW|ziyis?;CmiF08%m(K4EvMI^doamU^wEFTH8Q+_nL zO+I|yGO^vC;n3DP$I`ytw$C$&Uc)M*W7-A{o)xjyZx8iJ4ernlZVvE~s=qZI#Y4h~ z&vzgDE)zci*d~uLT*8Paki?mYJKnYt%42cTgl)3K9*fy;*B5{*IJ zjK2OTVkqy3h{k*R9WLf?H*(?br^!#ivh^s_PYAKT3YHW3t? 
zudklKh!=?F7wPA%;rPwbAKzchj9rXpyqHr9oEzAg7~L5k%$PrLN zpg!6PB%O~;QoUs}v_#_16|X3gOX3VbV00ZUXQ432RvLM|(pvc|+Q*GoLW)-ANIExMma zl6VDH_-`)tw3VvxaVcvp>F_PtYpu~PN{z5wUmL}TV=y7D(Yh@4&Ve+fXNggFeOe0a zvqs46f?MfIVs~dPVrsRgd^OH#qg!4U?-gOkE^a>@h@YDqzZV6gAx;T_f)U$7$vu8I!zrny=fNvn4&J{Sk6R%1^Jepmk96f;({wcUe35TuWb8365MrHDB=!}T!7ZlL}?PKC;HZ%o0z+=E}^e3*m7E^9fF2qsWc; zxV%R6Va!KDS>V{aj9psExtWmH+GI7S?&WVmPm>OiNMy3y{ctoXTZS{y0yf+Tb#2}Y zb~@p&01=iDr(go%2yic&lSSK_FtE1}aaiVQ?mafca&p{xy!iYf6TbB|`RkKtPS!aI zj&Z6*CB2+;wB9T|0R{FmJSjs*7Mo9!H9pD~L&Yx=J36D#fvtQPi|TM5pS}BWfg%oZ zG9g6t(!o@tS*1#j(67rWY9oRnd(ua7iKUsWs$*CA0zXA&vOb1GHxj5SSYAojoUil? zJCShuoX=5HuW7NWCR`SfnCK0X6~(bp$FQi5Wi&r61F6zHen>_A`14tQ!c18@lfuM# zbRnB2gfmSr<;dQ17@!f-&UY51xV<(W>LXr_OD9gI>m0cBwSEGB%);-p3bnjz?QFKj z0oA+-si7NEy^MzM!SZhZqUYUnUO^ley0pP!v*r+blVgXOD#wMc43iV5`9>cU9)q{& zYREDI#KQJQAW?lH>QjPx*Hn^c8iH!frejTzg{T;1HPr+nC^9kfOn8FrlqTtSH{|Q{ z8b!t@N1OXA=pk0EtA6i#^4M_cSSSRTWBE_}c*!w4hQ1xzfoezG*SRHPzqg%(t`>QE-Kr6PPt1nTJOG5KDj}F0u%y#Hd&bOg>t=@f zh&qvt-J(Z@OXw(DZp52dwrZ(D-J6*qlK1o9ICApVe&s!5JE)qC>e_v>29-L2wNi3hJ9AHXabil^=YW}H{(tE;jo_Il*! zT-$jK6}Kr=09UYCssY2Fu=yWT<1e0Dh&wIbU{uiwb>rK|jav!2zL%~OqI1#;p}}+m zjlOtr_7jN3F+uIslB(0IubBrR0C}|lNDfBQncYEy2W)iqD;}170VbOV9!mcb(o7UB=^DwcySucw9f=!=~(vtip`kS zq1+S#QoQ46J+KzBEPO^j1`k!VW+tywsav;qH>j1T-Op-g8_y0!37KF#v(&Sw4UCBgfU>J-?^ri;Khl7lgX_~TJwp?c*~yMZ zLojdZ*ckY?-JOAOhVWPd)m5TV!oHMJQ=Z#bH-y{tDB@b4S+@gh#jy+Zj-?jzNba4$ z=}1-ZcvxRC+g`$PSlLv`wZ^!`pK7edwA|F>Xi+29fwxnqb~PzgkH6-BX0KsGL+6Vu zWfHB@lrqnbl-;{wWy!f>1GVG{Ifv`qh>k>qvU+GxHve*c)bVWZ{m9~Bu1pott^EDc zP=6A!ez*d8Re@CYsui7fgDUw&b^cRC5|e2$=Nmq`2rlC)ri)!Qg*kzIE_u7$-_+Kj zRYKREc5?I(_qI<~5HFh{=C))tj!0PSp)R zb8>%u&1_SD>fKbQ)1#_-v(JyF>KAYyqVr4d>?cn(tSLQwGEsl$xO%Gb%d>~xE7$Ix zO-?nPbUySws=s@AJk^XsXK~?5nPX8-x8UD!4kT|d#}k`w1&25Xvq@PH7)-ZOb~%TL zG*}Qno^EHva|u(DvLsEJ?qI*+5^=l1lA>n1lRv~I%0|kHW@@@itji_Fqrr;dWV-wR zs;x(j4}DK&7f^VfEx6J@r*oA(+sK=oL&WA5!J(cVY|<_f26Ic4-JV?{O)fEy=aw1q zy?T_SUE@ZR;JQ+)wKYXL?vw6Gu$z;v^QE8~pwvEiw znW_2Xnr@%Z9?ehZPv%dW@qKrn$#^eOE}V8N`|c+-d#{Qud>sz;J$xkIgPr=2ps`zZ_OYs!9S6V1NIH47JCLjBHHWc<#i7A{Y^{VtE1{Vq=y zP$>Ksj4LgetSwl=E!eD$=!!nmR~#S?1`2}*1O4FQj9`VL-v9tUh>(yN41s`&Ng-fz zGE!;^a!N{SS}ICI#wn|HgrFgjfI!PoV@YD(%F+WK0m{JOEmx|+B2{#&A~my<0TmHjEo`ncZz{{)p)n~pL067&BG zD(j1|MZ2>9Q>g69+vue7z~52Xw91f`%7nk8vbFyeD*L86746EFHqZP6mCg81QQ5(g zzoD|Nqt%_GYXhTmE2HgSMjOA5R=t1!zH|It|M>FQ`0Unr$KiP6*NKL`$@ZS9rms^i zXjB$0$_~u5qfuG3C_6W|@Dr8&U6h^tLzI2L@afCy%Ide3$)(ku_0`qm)tULV>GQSe zm5sf#jrpa`kDHsDC!33_+l${t+0Xx1lx5UvsQo_|WslOeo0~4a9WHJy3^%{}U6c(| z^aEVYgDP47S(MfJt0;>FL5s3WWLL5FupmMX=pZLpE@gHIjVsl1wD=RT93uAIbry*B zAENAMv?zP+Z=x*o;;ZkX>}$wvJ-M!bh_a4a~KnPwH6BN=Ckr)oR z&8nzPoAyJLb^0mF#zB}{NaKDNWvL-QMcHHBAEGRn^bb+C5u;Y=FQV)ST9k!+7iB*L zC<;PG|0&95SMGfmWfdaPqAWKNYdd6LSu#%+OZ%$hcTpBDrBTVT$T9XqlojS}m;Elv zW@ig?p+#A@@1krBv-Nb`4^j3JE&RJE>&L^b^HY?Chm{X9KKZjKi>=y#7G+%o&B?~> z1-qEhqU@~fcTrY0kbuV#V-Cy;|0&AmbP-yA7iAm&B+AZ4qD5I&0@ktV@1m?NT9oY_ zxWvybll@JUP0O=Di?ZHmQ5OD7lr8_0D4SQ=u5&d)_J=4N9PFPBzzOy@In4?4pZzY% zYX2XJvgiL0Wuf0i+1TGj+4Vm}*^A#q+2CLe#3qor3@ysC{_jNDq@SW}_J1YH%KZ{$ z;Xg%LjKJSSS-8%hL|MzM{m-JT{+~tJkUvD(m)}KM zHMA&e@MlrB`nxC_1hW1m%0j=3vaA0PWydUkin4H^FR`UjL6o zS%<4l|6Y_`L5s2@{}5%R{)H$T`-dod%lKc5vhwnO5oLY4{vygo{9TlN0{xFf+3f$W zD0}sXC_DQXQTE~QqHM@tMA@f*6J;>~zlpL2|0AO8{4Y_~?Y|Ob>wk%|pZ+e&s{T!s zW#$6e{0E{e(N9r!;-@HU{eLXVa-us&MrA606J;@h^1q3)qkk7=8~-fIrl3VxEWv+@ zvIRdy+1_8GEaI0adyE!k@B9#DtAB~ILH|yaMGv?CCdvx_3sIJl=esDY@`osU-s*q* zPonG-**4H4D2~YQqAaW6@1ks+7W{uol+F4s%Bn!Y=+$JCpQ0@DUq#utKZ&w|I5LSq zlU*E>UED@E@%^8oY#dM)BBuSDC|mg_QTFx^QC9w6in6kQ5@p~15M>R&i?WYU(Eq(C 
z+x|n8ZU28N%I1~HXro10rthL``@a`uv;QW_!v7-5ru{C;ju3u>{t#u^a4-&jiL!Ui zu~ZOZ+P{mkyZ@0WYw}B!4f!t0mi-WAXaAKbdmAmv9{wv)_Vf=?w(_Saiw*oG%8vgV zQ5Fm5FQRN5_8TaqABs&piV-ObiTsl&>kT8BDf=PHo`N2YF#i-~hyRr*TZZ^el&$y= zMA-wJh##VC#Ggf3ncqa&a#_ePQ8o)l;;*7??+;OS_^+a@<4;lc{@+E};eU#<0XU@h z{}5&0{3goE{19c;%0M!^nB{0u7EKgp5n&+yBFb9*6lIz48KKyly}v|Ri65eDISh{l z3cCEODC_qpQTEZuKSWvR4^ejNcTqO}Ux~8me-UL>Mt+F0oWDfbH;5mi>@IfXPf-?u zL)C)+#r)rhvSmL+*|)!mvao-Mvb%p3WvBll%Bsk6HvSN08~-WF%Ku%I&HkGxiv|2H z%EJB-WhI<(SG)+Yp!nWDL|Fs0C>i-*in9Ix9Z~k$KSbH5e-dTy|0c?E;a~0&W9Sh7 zN209rpG4U*knkC{36x4@6lkqMxGd2wIf& z$N%q&vQfW8SvXphg@BysK*|k7vuIHkMD#aNRvs@IGr4z3AY%^FE~8x41QcjUr>^AOPJ70{$Pac&u|(#U@#%I5n25@jQx zIF4vhRtK;GB0`I@*8T*FKpeCv8-YRiyC`de#Pdb|5M`Zl@Z^ArNJwYdu(|)AMA^CT zqHLKt@w2k;qO7O?zYt|@{v^u2{ojhRp1+B*Wxt8CQ#zn1XQE69A=VyY6a?4_h14VQ zqguX;vT=|fqU`nWqHH2`GVX^c8`SczL|F|&F64Jn_T_J)?AQ-cHuq1W?AsQcC^$}> zKkg+2XeSGbLI5vy2+W;{26R9uQ@mO@{%a(7%K0}@)(7D0Ol0#zltmk>k1l4d{)Ea# z{D#WjmHm#&T0_30vhu&9vLu92C>XIZ60G42R{nv?8an?%Wh;K5vfDfJi3qSV20{G~ zRF>c-s2MR&yGY!;OMd@>Rx4o6S-m%w`Ij*ZLL0M?#nmz*aLM!-6eqgqhcQb}xQxVK z^7(GerbA~mMt&Hx{)BN*$ZO~V#>b^N7+Beyu)d5C!|B6$2E>_ni58O>fh7G52XnZ9 zZ%9Kv?h;N!LDc6z5e0n0+4}_eT<{a2CHjHT8vQ_MXVC~PJt35DY1s#7uLY$16QONc z;v-%b*)nS!xWj)En`A@P7n1uKlC8Cg6}Net@1=_KM|X`yvzH$| zF-RDkR+;$bGkkEIAcV}mV7C?=L!AX7>9riq?F;jTN0&=TB0OVr!fziTeteM62;l2o z;0`_bLkW=oB~hFXK{|9P5kY`}0Be!p4g~Pr?DN{o&j;|GOlJVmD0s@8uv&Job7X7^ zNqp}7eZ}kh`=pUYpY6Clv#BWi~fJ|9n1@!pF-N=7hzxoT`t=~{2-f{C8+aBRDYHOVM z(5L-eq~#p20wLlkdm#x(_Q4>8SiJO9p$z-!3Wn+4vfDOpkt2wY0H} z?qB1d%z8Rto@@HS)%2zARM*$`L|e+&a#JH3?=Aa>s}fYdcD4z}P~{rNQJF<5aX!~j zd*8LuZ@YC%%1=;$J{xD#V5*e*Ao`oI-AsdSV%LYUaiT)~k~^(k*tgZpcr@R0(7zK^ zPpE$tPR@UCZhFtECyq@U{jq(Ufi#hat8*U?Y)7(`Vy~Lb9NLfP-!2t38qn^RNPPF< z2o?JJd6TvcP1EP~8`kgu#3Ty}5di0BQ zKc3GiEc21-b{$A}?!VC@yQ)bZ-dk)ykfLh&xA5ITn7EHBg@$2nrN~7v z-D54cq{fL1!4v4MIS%9wN_~2jd5BSsDy=g%p624W~^IJ{@ z)Z=))wIL~?^pOnhM~iEenh~5fte4W>F1NI<9Ub1%Q6Trw(^VI_UG5@`X~XUd`ex1Q z4Ey>@K9XybgQD#7Ko9+GlJ*HkGl6kdiV(V4C4CvS(e6yomQ(8wRx+xOK5QW0Acr^s9A(RqG1n z4aNH!34+;SfxrMN3D$^otCw~6WbVZ*!4)da1A)BzBVrM9Ej>AGo#ms!B(V4}%NUeG zpLpP^ituO$m?xDZD6ZJ%xII>)NA%h@$GQI*p!#SqD4e4YVz*V&d}9{;F1DpDR%Nc` zno+IS7n} zMF6gckUTxG_xr_A-iPlE{CtjQe~7Xve!gd0H7MWngDF41%il#=R&z`u&;L}E z)yn|fLxPpUWS=qnKavOA6}~f#=LcPUcIp2NaJkmbP@+R}nc7OAo!UlT zz57hy#UjCjT6q-a-ZKTBY=YQRdA1UGxNv0_f$FIeY)SjsYD+sIJ7zQ`K`Wh!CzEhs zUK?+!0)SsQlro1SP_+R9Ud_t}_S)SLF42j0FwLUPnwBA7b$Rx5dl__D+a{Fj67A@1 zMaApYdlEfL_xzJJ`}MD?I*-O;!mh}1XnLvX#JKvUE81{t=BsIR=_VtsUUNJ4sv2&V zCf79B@O2uh-NAdGQo(vpKwAu38LuKmx*DKBkreU0=Z(;S-1HzDHt2MH#- zwkQm)kpK$eOhTLWH^)Z9_b0z(L2Ry~u5t~BiR$Ihl8-74q89+X4s*!O*kB|P|BJo* zifZZ${|0|j2%$sh9YXKDw={Z@4pKvrCQX_specYzQ3O;JRJx#uponxy07V2ux&lfE z1rZRHDieSIdEYg&X3f>y&h@!_);jy_z0dP}Kh}9M&Xo#uH!NN-IPSO5!oCI{*gRx# zy0zl`@Gdb;1`o>0hwWUcOnq%RDDLbiGG1*l8|rlL;0`jfw0dWT2F8lBgdVZ+j7+^@ zuJAY&;{5B;>F-}$+YVeVq>n#xPYLs!y~uw7Ma|qCt3sNJT?Zw6c>F`LoN-$K)dABi zPU%?pWO?<`G+@Qqc@fqL%`efD1eDF~l= ztibTqXMe@*)k@`4yu*br$BbM8`D^RUe;IwCo;Ft|SyVBlYUVs;2r8yNoUD}W7mO+I z;O(>L*3oCOS(-dNQzT?;cdTJnBQ+YyJYa*9M=r$&0!S z!chpGq#1D9h2$l!!;>i?)ZeAt$&f(F;KXa6m6~R9D68-ey|&SQRLEIq^&8Ln76cej2C!J-#6)B|K2Z^Nuce!gDM({EWHhr_ri} zhj+e*hsVM`jr06ei0)$Y2|~}5&Pl5D+)E3C?;?z&S0s9D>4Ve3pL@-q$blPa*3p9F zlS2uwRGvqq{N&?5;RXnhu9SC7&SJpE=Orn5b`a3+e>b@Byo?_^AwqVPp4%RP7F&|( z`MBv3QBu8wMIg39G7PXs@qmXyFCBWWIY*?IWX=O>Ny$co?r_S!&Wgv5BGtOyeR;X*PT@M4=v{(Gg5iHxP$NqHh zBL#kc#ta6qgCM}1;C%F7Fd#+@jvo1bFEvT# zW7iIecNPw~6Cglzx>4dBz_>-A55cpl2%!Lmxj-?1J(1NJ|BI2m3k1f;-^A+2Cen!3 z7F!w>Ti8zD1)?=vKysVy?SU;a(KZH5&jo<%^;w#YVY$Y!U7&wjY{)Akc;&9=tKUf` 
zLeYTT|6PmChQR**Gcq8hhf{cqDa`iWhoe|pIyRN2#rElr|EI-D+dP-B!7O-EJyV~WrMBBz0f?;% zrVzlcpFlTkjEplIyAZwP*^=p zi>+=OU6+jN2Gyow#})o*u})*u(QeCoG%dF8^=TVo_>6@}n?-hkUXHVMDp>#Rx+Fse zaH=1GgrIq5sAo1hZS^4lyu(fa3{1VTTL3(FD+(K%!CN4Z#mfi)kO8C~gzMHo0laEu znPiuEhK*7?P6!xi5i)6*PTwvE+o|`sVg~GXKy*`*hll;P`W=1619n!#_IAsfUE04+ zOtY8u$=xvPc}?T|16%%gm{VnMq8905D(F(WDA{(hjn8#acC-2x!ZsSzcRg9;F=nQpEP0P3-Wfq)O3 zj?ta4@%)r_a~gYlT9_(fhZzd9=rZ2|O?CI!UL?3PrzvPk{R%u49QRVBz&5tp>age3 zu_ir^b~mC6fVXV-8*LYD)S86s!jsc2+2%C-5~a{Jnh$ z0hSCv+EeL6P@p-7K!WbhV#RiwqunP|_fa|vl=j|&P~j-H8Ohb+cUUOl1n2|ul3=S4 z0=V#{b#B)>n)lM(zO2op9neBZ`nwIJK#4>pTkUe)9Rp4O< z5U~J&3QZEpy3Cr6vCVW3r2yku$j?!XPl+dfS@KtG*Kg-fWk#QDmF&8-RB-GB6T^X; zS+$0=iBtMOUA&=Yhib#Iy9OQ57+=K&I6Y2Etx8D?PJ|}@1O`lDcwMra>MOkpeaMuj z9V5Mv7y&FhN&Quk#y!Jh?a^EdV|52kXjCrOR655%i+QzkG!}-4CcrWXnMN}`-#dDZ zDkH=|U%Uwg??`Z62)jK2K4*yU1L*7t%dsvja9@VzwvE{eu3RARtk>f)0xX&Uhr9R? zynLE{r*df6EHETNgu~m#Gl0Zc63l{DxY<|Pgvcd=hCRWrHvPK`z|w3dlL5fGJff56 ztO5zI;)e!iE9`R#TPeWUb1O(9$}VOoa6=OK zF>_CE*2S9SY4SY9UfsnBpoJ#Jb*)YNahrObVnjdEs${1zv1{m9F_7f7PRJB2Ft_Uo zH{tD48g$R$?_=Nuw(CT5V(-O%Nzdx!WVL5-p9V@6-THu5ZUU>+K+i0!&-6bhRyIIJ z+Db0R7|iGSGi#B~LT6P1*cY63TSL>8Otb#N+g%*qUwoBGC#A&W^k$tx&)%lUPIH2Z zWuA)w3!SmBCmoystzMFtm82{57GbTNKK9#UT*ztn*OvWYt?Dl{H+ppp06GX^a5_k| z|7y;{npPzpJ5a=YC3$o~qc;6h5h$CPyoP<1+egtutmGV@-*r00rlSWxf11$%JYek^fg)E^|JD^qX${5z#Rb>_5G zaBMUXn-Flum|z+5ii5-INjc-n=)E(O>nPE_??p5xwi?#9 zBAKLN!h0#*EE!ET+)N2yn@pxN<{nn4GjNnzXN@qI>!8w{SgZE#?3KtLupbdaUL(H% z>3BUQk!hy;FQW^sDV@;hX_&n3{DaEgV8w}CZTi3OhGGo-PV!;tBJI!y(^Jz1WKLKo ztt8*gqTl;$y$stzE~NqP089NT@YdR>T|~*PE`@|HZsQpN!x4680zKcCQ=ifSd!Qiy zmBz3?T`obSqDx!`xKOv;B@KY135*s1^Ax03%S1OK02B?tQP99O3QUXu8}Q1A1z1%H zzl$k!YoF8eKXGf*X9NIj8ML)9mINhiCg8SMQ$-q2*QTTmY60u9C?8!&?SD?}?YJR< zM(i7!6C3UNRGaduv)yk95NHN!pZt##3zF^t@>jkNb9%~DqiIYm*VB%hn=vAFu?A5J zhUT5Ygw7W8*UjvHzUpOnmfV80R9x36ALgW1Bb@H z12031S=02qr=hZx2!b)BkMQ_RPp%1&es--$=L$-7^Mw(RLnoEdMrbXHHoj+))D1x3 zfIv1uKE)>I{>UfeJ-JJ1P1#CWNhjxXtMaJtK1Bq=J%XAaUm;b~=~{-TMK z-AQRl9du4gI`4i}g!Qd8j~}^W(%q^%@E7UaQ_}LdlcB#Wg4vH<7wA-_L06~Kt>LNd zi<0=nR8+Lr0B_1kM>%b?Fx_!G^wpPE;z^39`t}#|^v~zxh%rE|u+;+n-rSd(d6~bi zW#I0rzhPoO^TF)_$0=*q{}$i9^iA~g4!zyhdFw8f%fEb20uKNFbdwWiRR8_p@`30J zk#ALh-e_ADy*T)B`Gg)G3#Lc>FHv@Z1N2t3Cj%klSR1m9e4YKjMOmbIp5p%&Wm`CI zWhgNPkJN_#lpZS9Kfkl_Kcegvgi_e9tYaz$*A?@1S8lA@@uB0BuwOJ$R(Sa;yPBNb zRK5S3Jf*Y06=|aE8!DIhDTSHlnAMRd|3ukqs=+AF^*<_eT_k!=nkYLp$6R%RS0i#` zk0#2V9?vM;{-wGy@SiCAF~n0c&piJ>QMPqQ_Dv6jN#H+Gwl3mOdvoT0L|L7!IqH9+ ztnSWI|80#ZpQMpnb!ARYPyZ8T5z`6ZH;)bWwie#}zlyS}MPWaFoR|!#5c@FuJ91CHMa=%95t;r6Z*?N*SadH6yc>nkCHgQ&&Bzi@K4^* z0BW2HeEqps3iWP=N*6`6?n@V+J0nNEr4`RF1t_T?9{rzph*mpcre4lc#I9*kA$IJC-DmuBjRC0uuvu5m<)%}doNTITclYNmXW zw|aVC{;XT=NUOrrj)RLg4J~HEhVQ<*8~Ba6UY6@_-RsQM+Gh-E>xwVNJ(8Z7`AN&! 
zH%yT#lrXaw-*Gh4Uo}u}{rYG+q-CYJEz&Fmp&;BY^44>>?bo+9l~+cjqNrC~4E@1z ztaR-kx}e<2YZMlPqxyIfqn>aN-wT)3TJBSE4JiJzM{0fIqf*sXNVm99f2pDqA&&l* zx+C=gH3K@%%j(#`M~?B*GkNdT1&Y&Y8Hhccgp<<^Kb{*UakIUsB|-tqY@7l*j=N)tcqkbkH#uO zYu^5)3CrfP-CB%#^j>=sBIkDL)wsT12~9*WJ)DtLJgL#hFuFx$F)xJRR=QFp@bm1I z&J}`xEUvmEIJKS|g82%G1Ce{(w~`D=(nsfVWC^8$_jk{zJ=eSIzIFp~g-~Tuux}HH zSt@GoBa4(@<%lLoh$8$R8fcZVMz2AT+p;i?O^RK_&Zxw8MxbJU7u0`G0%R77TyDSK82O{K{p}Y7dNR@s>du`wX!ejq8~P-z+78C+uf3?nrb)?iU4RA^ zh%@@WSP_|>W6juDfL|!26IHvxfj{ULz9>{HdcTF~vVFlVUMZN={R?c@4WijQ*uNL&;0HHpComCBx6=Ddf)@*@u;c>$4H1V~X2eSQxTD>N~xM zt->ch7J1|F@!}Gq{4dbU!S#6NQZsL-dG1q@ub#6go}D{3&FMw^`jH6{E5)ZlT{>1cu~c!|VVGeTuYH!hCwprURAd zYqbdZ8(Z4u#FK#xWlrO8-=lAn<%YknTqhR$bBh-6zlz?@V$p+p7Wm)3*m%yg;H>ZE zCG7Rs<`Dh^^aruY+a)-W?U=S?Yt^g^H>~e3%2N)ufZS&C!`E0=a6EnX#7VXrc$lsg(cMTIT_)9H zUTR4Sv;Ol_;9?+^?6e(hB^$#!TEBxdfA`JXpm*G%YFQ?v_^c0Sn^foD7vh)2M~y}r zr-sZJNUs#Xhh9Ei|9v(9A#$)0VQqTC+14ZdvYUnbd^UDwg@#UyKG}?SdMtXB3pX*& zym218B(htnV>aW>ael51c@XqcEO_6=UkE+PHUEYKIe-0>nC3h`8p|&Y_LGKwLbBY? zFz900QkQpraien6)XHI69sf^>-S3tVxp8Y{p7+L+2vcQc4wKvqi)WVTIqPXktfb$| z-eGX9iWj5OfH>nefK|eXZk8{1eR7csGY1y{NAea&k z$#{3zTuqxXJ&Az6eYfw8SRnu-DH}2dy=>uTxywSDE)&hpHbsE%oz}sD=O0`H^Mw2D z4f%`n;B{-15Rt==CvV@rd$K3nV|_ZhL6nGi^P*Rmk)LV#(%quU=4ykkbE(+C-@J(r z%O1;dd`z-ggkO^FO;!hW4ainOM}cclEY z)C&yuhD<_TOntfLM3ql}398{4{_sUh$W5|ZRPO2nE+s+qIN7fuZ%&wK(DLPn+{~0( zw@?Yu7$+jt$F=O>%wIv(p}Q+8N!f|;5zm?9n8E}LQ&hrl>A&z9&mm(%!OA!a1S7`f z!d;5SDQ-B@fY{A(+Ww#VKy_Br#r(Bz`j7e!qs5wEEyA3v|$q$~=9B)r_0PV?#B z%u`ZBYi@Vz@VmYO`EmDhT6h)WDm+Y^{cwGL!d6h>*Ymg|LBZdpqJPE3U}a*gNO7Jr zaUawFix5jj(S%q$Rue~7hmuvGWOZkxHl8fW_@5A~xM18jEq?JoA=Vx#-2|44#z??1 zqP0Z6Xd<^FNEty?UMS;s2k}M|`7&tvG!)@ov?v^d)Wk?Qlhmb2ioZc>bs(jQGNqL= zWdukCyr{}ZRnNmHjA2x95bghfSh-F1e?Y7(R;k@zVgSjj2-1!@p$G^CNJNu#g35IB z7G-HdEZ!9Pzl2z+1Cj#OA30FQ*SClm4iJ^ZNaccc{k!!G7xgl84Qm6Gy8J~mh}`&c zz8Q*96G=t3TerX41ZE*$=dUSNVNR;B$di<5#Hix228I4+%EM+btb#LDv0s9BIMCE5 zu&hGDJdbLiy@bd>i#n5RFJkRBB~Rwada_srO{#D-h=cizL@r6io~(yodP);w@njX* zWns+(|}m4X5g}bBHGjS1fzt9c*(E)Y$U7aVnk;agcGH;r@BW~gQN^s zXc_r3vRH!*>781#iZeAR290b4heWa%Tf%hr@I-;~Td6bpxyE47QLasfq_(q7<7}#E0lEa;1@wuElK{9sr*TnlPjbSj-RF8g%%-dOHI9?zO+Hx*ox*AsO1tq?U2E#TbPK0{ZSAafjaX`Mbs+6kc@41(dgz*|V^Mw=e8aFEz=cC3GcdShh-3vhK=DCoFIAy5?^8A88B*K= z_t1E1-n0k%O#d-1mx>s}m~M3o+=U9=P1Td6BAluBBXM%KpoY~r6I}O&1i1vSN2XhA z=1?-X^9iQRtk-y8-E{16;N0UMyI@42l>7#kI~9Tx0~y1KwANdXZHU}QcRBp&kKwmO zaf%jk{HFHQv!OUSTsT*km#Jb!>Ly0~rl%DFt!5F56q0ZAfbzE!7;>S8&8E22Zf#tu zu&Ryu6y$y$#>8F0Dz3Z9(Nvdf(0U4Gxl4}R#43b9l`4Av@vwnlGsP!3T4e5AFHZ^B zGy~&55dMS4!y1C>sok6?oIyCzWP=c}0p`Kcc-U_7H4mqIgGOUBJt82}HEa{3z#F?B zHRW4(vBI%sFy6QN_9i0if%;UoU6^Yd~xn7zb8Bu z>wQ{oQgxF_qH*22Q&df6l4uRN*U{~;^Qr4{H|^R<37Eg7ZsV0~s17x;=)y?SAt|04Csj4ZbYRU@UaaDpj^mP58k%fkmtdK3WBb)6b=Fq9TO9sPQLmngwE1lDByz zcT~eEJj@W>93*Z4n5gQ8d17%Dbz8Yab~sV<2xNFfk=zOk96&4LeM`3QF!x6ca*Iny zq9xb7BuhYQsu)eJ4Xvr@t$X>R-rduf9<^1p+!lnd0HdY~>WqjojYc!MLHCfSHBV_Y zl~I0|QTj~L@_Q5&Rfxp)N39tw@(h;c;mLCaEhiCGvS71_X)U@YFVffvr`t#n-leVy zsNyQHOpR!jLX`e6m3f#X%&z1%MUZVo>8C=JGElf&5J`*}GDTzvC(34Y8E15H25Ksv zg=$5Ew4HG}IJAxzR;Y!?)skC?0MV$%8ZV)_3MfLjQRM>S@uSb@mO*?3FSF){rn?v> zS&R%IRkyP_)KMY+!v=xzh$ze#ZZ<|0z~j{6S?pqHr$!=U zE(LZ-7T5q3o3)HH2rO}61Ut#Z5G@!EQN%BaVsT#YUkDd#-K&0K;vD0QO=Zd6!c6}1 zUez+^i#HX@sLw^O8=pi-7ZMD2u^y`=_M>&?Bal*Im(|^v?W3>Gapa~_s0p8tvN=w1 zP1RP762#)Fh;uU3_{pI`RC8Os(C@~2$PKRBb74zB|6VbWKfJpYd;=fQ4)0~ z8)~>}l)mBir6`CSyQOqtfQ5C>!pKHkVb{B=Zb2+*&>mGAlv9;63JX=1EjEGb-2V8)#718&grXeR`YeIx&okYkcimYgFZ&`d zKKwz_H$>nvG!jmfG}hdigTExcxu|!K+h&k*@%#^!-xr@=Swielad6Pni5Gf|yFLNi 
z#$uZ67Ehcc3~*eDI9ZbL@J0?zhiwe!4)2yU)U{}aNU4Gu6tZS1uRI`U0BRxITuV-q*l{A=0Vm}O^1T#po_=s0-M0nS;>*ml`hld{#Z*|~L zED0Czhux};s70$5Q#U<+6jUBSGet+>-Hv)M>v?7B;da3v4j@V!DDR^E*OP}GC;Hmo zl4u{4m%&tRLyBl%wBQ<1@Qto~36&Y%E(_myK;B+a{S!E?(6Tgbu^Yz^ACsbswB*v? zT!|@DISA?{tFt)u__!O})4GycE3=L(-at5+T@B%U)!oSh(+dOUJ=*0RpC++t1w`XH zUUWpP?pyuYtLIq`k4xuOSrSskqA>|gM1eVSeSyL=8C7GxWPKEsXSZ8>?nJNgkld)V zJ@vPI@c`0v*5GAQn|U+Yz-7Fx28&>xck88IP{T;KjP781bbH!N1`huQ^~?v$um@uF z#WGC#8cf_fO>&=^*pS?13~{@Ef1H2OjZL;}2jq$tdtyy7SzQak${4JuJtdjNf+Dy?tXNY|%_Nqtwn^eDA_r;?elg9grXph>7zShhB)R@L1|kCLFtL8 zA7UmMN}E|sA3ia0+zGPyDRT;!rTw?_o^v4he!H-=_GRt)Xx|VE`0~fXQtG%uYih>w$YuASlwq-auY^Kon=_QI)+cWM@N#~!~G+ijea=+SoC@PMt zE@N~);;2m9+qCY33TArby_LR~jIhcgwUFUxZO658GGAU0eIe?o?_PFHYxRBEK|{_# zP5xGyt4TB2vJbB8%&(T~K91G6-X!~M0BOLn6I*rR(YiU)i=c18qE~fiWP~{Ejy4>! zm#*$wKHz&qpOq-^tWg}z4=$+22_vp+n{slS!lnOwKp-8(zr7iE5cO~=yX2mR5Ooa5 zaT)YUIp&@!`8{8D!&NqqK2K@Pg26g<>kjVp*|ll^tlLN8fkh_wb61MAKXv7bxc|q) z`hTlrG_~J2lNHglW?G#bq=#d=x9|aHka|gPN%eAJX`tRo`i$H%yVZ`|ih$>ltn^g6 zu-&M>j-*{a;_7Vk%KX&?P&b<@I3bT&@KFf5AY0|`!vIg4zacR3+(hxpn`iFdS-Bh0 zbcf6CU^}QRC84_Pw(!c?CUeOc4lwrxo9Yj#l$3a_ zS%!6?hi)~adoDYj$JuCTW>oj37HI~rgbxb>=kqe4h9{TzDANxd@9xNCCNahKI}N?~ za{q9%<3!|Wt5;OiJu0zjjqT47hd^nN%)anf6Eh9_>#0J2%zrDj;bf&#?aaZt+6S|z zsz|wdMz$zsDr4_N_I0PwY?s(m#kuC;mem}|?jS+l z(lnMW2c&zWid07-$e2qU>}(N`zPFZrER<)GT7jjL`DA`_wEYVBEGtX995VJlm+IR? zX7I~2H=5yicn)qyVg~h!@2F)+%qWl@rtl1J-OV)abL50NFAC^T83MeaG+j@SKOulD zx!q^(f}%2U2a<%h>%}pu6yPyUp&uq?coKR~vcpnwC8!m6-iNo|BN(AYF>{jpUY34i zZmP4Yp9mi=gYToemD7F1%^K%rd961Jsa+CA$Ay%Qm@^nRZ3gG@j|Di*O15Q(%$pRh z71gV!Z36Bh1Xr0dEgz|p_b=pfoY{i##KqVQG(d{rlojpchQaWJ(W1GfD&6N_Z4*Lb zi&rF9^(Mbw57H#x`{o>MuqyXHI*;?-Ol`347ykFrn@$h@GzRN42n>^gof7y7q43QRsWqULn`)u*v;8uviSN;#z&EhI}DrF5V!$xk!#Fg{8 zgjl@^9d!QG&DhUuWzuxcdV9J2g7!Tx`iv7s_~m*+wJ@G&0^As5&8CXlv5-t!=f;AH zyKB6+%-M@CUUSf;wV_IVb1b^+Ty3TG$kZsz@$I*<=hE?yTx`F&JZ~MZyQEd)cSgZ! z_O5H0oz~N{!V12F-zHgmXdS7xk6c^sIzJ`3)LTg@I(#{Bd-W~8{!C@9ABTZkFXCpy z`LtShsmC85#%MqBkcyx`B2EtKzj$)r^s(2~i`Rn2UNqn$lmh(>P7mB_divZ}G164f zedP7;7tyv6fv49$jsBi!jB`!3OTs3)5mS$>Ao(5 z^d)b6-n9Y?Z+cU>hB@f>TdDjzT0gj!*0WE3w@$XNxu_&9Gx`465o1})Kr<)e;c=Qo zdX7l2zBb1^JZhnx$F5#naRB12Ti(rThrM#o!H>y+kBX>@zM?iW4^?TVur+O^+9uC4 znFM`JV6O|BpmJ&1+V`+^lD}?PMSpMI$MoT3`wq=?jaTqFby;An11Ya>;T`7m zKw!MamDVjh8|L;(V4~sHui8-Wvz}j01SVT1e$`!=J?ncSF!g*hu6mG<aKKl<$M?`FslDqc??^~LcGdNwrV zWgTzwXYLZgIS_C2lb1@HY}!;_m|J4KnNjo=6T`{9O^lHLFlcY+0Cr-BZC~|{a*9e` zFkD0NCd@`*0q}yH+l8)&?CEK*gjVJw_xeM9Vx1N}dDo(8txZE@L@m(C#YSZ;_RA0~ z5QFKx*KJ=uUehfgfBr4~>vgQpHNMo6S-Ys0%6o#T#pMuowZf;`!=)^T34U~HC2{Os zB9Oz^gpVgdzG(KlT?s{Bgnu+%+t2IIedi;$IF-a}m>+cid^eNOm(YxE7X3NANTOcH zTJ<-Us<~{0W@*i?_rdFXrCF?+rH}G54&E4S{gR8DXMLk)L`7()FDQ_>i*h0^$Nlb_f~CTrmVP82-35$o-o|xSvR8 zCB|~Xr*hP1#|LF+x0*YG5A)kaF}9X4F_#`!*&T7wY*;ibbeKJXbvYHy*FV4ZDd%w` zWz{Sf^x+hR)&8X98pz0=k)9Q-!7d?NdUpjBz+Of>te>i7XBK^3JP^wGZfi1nXsmd? 
zKF*IWq5e@{B&WU@YiC%;unOyFNeAxz;!M=i0^u=YgY{><(1;zPr%njP%8r=>0-}y# zxnJ+9&Q_H$=%c<53?rUDVeHOD&HzHM{e)!aIr+-Cx`>D_kX15hM$>_tql_Q$dr{n9 zGqH{cZkQMz9%~hYVo*zQ^DO4M+as*Nmzn85=i76UwBfO9`>OIcUu0Kwq7f*fU{ zB33ItNGdFd$isx#r3E$7oP@73`7sg#*7UABQV;T^0%BwyjxfI<((Z!ji|#9#NG<8S7@fe|+pg)zBb zBR}V{YnR;Qj$AxSNIRCXFk*Shc8N7h%iC!g|MO{3{z|x$_Ii%CTK-9xH6pP@C&g*$ z*>#=RpE{2fbgytO#5rl@l(J;8jUAvlo3@Y#=okV0cO)oFar^RRe5hTSU2y}VzrZ!I z%y34L2{JQoIcCL=wTr;f9GjCXuN0tZmg6UNDylOw?voe#P}_+f%q`F0BE+m^-C*nR zj_4r#OHSOyk;CPa{4d8zMrZ4D(zH?cw-sFAh57NEH2)NkCH{k9Gv)=Vz`T1z z#Edus`45K0SAoPEZ+dpP`0|M(*0zzV^YH5Ksxg#>whTg+oZ~)kOvmkedW27;5#pgW zrTYZpO5vMvvj3x1bHqLMPn1ot-{NTOr42&i*-GSt9tU6IP2-^}1JdPMlL3?M#z~R~Cc%#ZDg+McTHQOyM;N zn!ayi&E8Tv<=H+hDOtFoTt;1eOA8`a*8@+ASR$SBBG+B%Hzy(R_Ipg3<&a=n1K`}q=GlHc-T2G36p zyQM_ABzumeXz)%goljYxl=`6)ecAT%*O!-OpR=5P@ey2@ayW5$-ffb-@bWJgA;v^E z@*t7ryiMCX8>U5DB~tt`C4eg=;+mMvNughG@VwL3>%?;UM2Er)xyXxUJQC6K@OqTu zl~5RAPD&jF$1O8#p}7{ugBs`jUyJkX5DXU2iC9+GDE=Wj#Bz%?3VHd(c-oHS$rJbg zT)irv`LJ;OQDJpm^QGsB^p(w%?ah!b_vGHhtar_X_sv)H8w02Fvf7$+J~T&-7iPbI zdF{)TRF3$y*}R-5g_r4fG1qHEu4&o2@#5OJ#i8ZQ85H3mYJ%C?fwtU%fV= zo?l&s*TNo1J>50tFC4dN{p{#b>6uq)d2s?SQ2D*^*4a;$ZY&SaT&xn-tO-0fF?_qQ z7ZmaM&O_^igBbm4&ph9&9uwKExAR-i<)EL{e$6Zdf{0*yXI{x2bnWqch*4R7zSxbd zPy{a4a*yaoKq?Ndl|K-WFXdu98JhnslCYZbNbc(=sqX?$pSNV?KYDt6R|;hC)Gy+x z?f4@Xfd;yyah|q%mg(#3--Q;$?z%j&gp55E^?b_E)_`q&=F9y&AmTatfY@-+{kk06 zH6T=L!6v}7DY&TV>~vF9Qj-ao%PWDLL$hH)&1BE!D}v2hAm%lfYU*VDIKp_)`nel{ z`>r3IZc>Z)vlh1}9c4+a4sESZ4_jUowLUhGfBUY*NvW;Up#8R|KLeM8?Zr!X+}k^* z+e-yITBe^)7P&eNcKqS)+}9O$le23md8q(?$u!f&X4r)GS231O{Vgtls6*-tAh&Kv z1T`@$3w3LFb-x3cm&P7QV1!RF#YD{o5nOwEw|Pr%+x4e$bQ61@;hw!`3VOYZ$LiiY z2OK#B8(u#v^!l8kWqi9%gqK!Ia;3R|@QyYt|IsU#c5~0#S%NtojNn`{_Ylesgxsw0Po5|{#``&NZBIMoOa1hxQ}di7ZO&zlwMO<>Iq@T zenNa8reQIWiN*lcQnF)lA4ir$7k<-o;UiuEnH2$`0kXY zsFm{t^ELKP&eTMhs4fwiSIR<#iQ|azh%=&M^9+4>ajsI9RI<3(edfPiqFmyhpXhAw z-xKfe(qHvU>P^tApTh}Lrdl3Q*NXGPF#>&`6I$<^ zIM1<*|e75KxY%zV=ViVo|`Qq!x5Rq%~ zJ@4Q`Lt{zZ54q*%k*Rp?F(LRP6xZBmQF{uMYkr^IpKn3rvibv)u`_}|kIjmRD&PG1 z?F;MzEe13gR@X4;@BL(|(e}TIf6+a%@80_P?a2&|YXr)>J{t;9+eUv%1uS(W&Oh9J z2RDC(1Ihmg1+{6}qpBFR$u`xaT&@aV3?s=N=;)H)R*KN?C-onM+~&m2}n0l4?Oxcz4Bgw zVu^vXDp(F9Pt0-*D)tYnZj=qA|0(2Cj2*-mocnt;wZnT|Ov;`v!0+x=%^Jr$!ulCW z|4WqZ$kclLpD26Gs9ihN5cp4&)qe8nMR>QiH=|D7ObcvP=&GsCQ=gWfWkTXjI;vjb zUK8SJXE1brF$-#h+~NALaL1%ao~t2_U%6^7O)E-7@A#c)ooq<|%;Fnd zR*~M%OcCK6!N}33_N2qVEQZZn^F3LDCdR)bHkSwZ)eXujeSdx>W(oTt3R)AAtz<-& zBJDp@<2X%A$F6PpK^X+1lHd7-77+Fcyq;GLMEt3EfqK3@&Ke{u8eLtUSdeS7|`ee8fTF*GJs zl(4|Z6t_Pxp2HKJO!86vte6>Eb@AiJ;D`4wF@{xl9M{NU>i0ERHJUa^Sw7pBz)v%krfvHj%Y;}53)SA1?e4zN8(o#G zoR^?l*URVzODFZc#{6T33O(A-y*EB#)^rgLXnX7vfAr`%{-=G6M{ewLEIdVHecZg* z(l^jO$Qx`Pg8@S3_;eHZ^RG28#pFnZm(TWhwVvagUq+jGyb(f27j|x~)qK${w5u zNedK;>BMhAm@8#roO|>1N=rnBY@ihTD6KywtiWm*vbu6o9clrHIm&+)>UP?jv%Q|K zfp!SHYwgQ8TG$^~RZ)>)w9)XPg5bwGDhub!EM;@pY!MOF~& z9j#Q+NL`X?Z0m=RM_d77FQm0O`Aq@ToE#s@pX;T7?m6`E`x$9G z!(FZ31r(m99dqv={jHHDrCuHxi^^jkdQT5f&vD&()0C?@nG(kRWc%AU=Q_EOaAQZ_ zLHniCa_d?z{+<$X&Dg+hf*xGL5wCbyVC?MIZ;XSb7)Q{2oO7JlFG?mfIL2G(*ezO) zcE&s_iNAfOlJxP0S?_bp_7yR2M{{|fVs9#AAloTx8tik*$ka3R9Tp{ z6t_-R6z~oodFA)$)?w1@rm`zQ*O%<*_wd?RuDX2UaOG|5eE-{@uT5QG|Go84`lx_0 zp9>J|3g_~s`0*Yx_v}CZl*tRe_FQgF#6y=_bZKDW+Bf$ARh}!^x2+c=kqw}QzU@4% z*8!_^Sau8JQQbnIR5_ro*X^@Y{sU%i*+`U-AOQ(^;rscvh>TY3qps@7T(onKQ zZ1Fa(kD(YpOuml{5=X6}i<}1+=F-a_fLBdUy3Y>81a+sFaYLX?C102B6vX+0E|^{ z!ZrjZf0V`jyw%n2VV!Z=eCt7@%tUC=*`(?6uwU_l^ZjSd`-JhZvgSKCBO)Is&7L{; zyXA{*F5yY2X(Z`oWaQvx^17hzX4Uf!?nax1QW?km10AvLjvuKCLQ2Q4g=BEy(Z|kA zPZ1+7JvZ_;)6N>sv%D(SiB3HCeG9_kN^5Q2)f0|RE0%+QX@2K>?dCP#I*tkNebSM7 
zU7+0?JiHC2^o~2rXnIDJ|4Pe;b4ou;f9{{WG{;w8^7#GC)o3ICRkJyNgwJmIS)(D# z;9~WHq2fzDWX#pPVIx-1Y0^}gi`Dysi7%TAbw6#H5s<@>UBAgu92$pL713~X_?g=# zyqS8vZsyMfquta!y&ka^eWd0U_o+;dKNb&|t{Cr8&t7_b#aGZ+IqkmoE^_IE2ZzZ> zZ}|Q{#>^OuSrC$CEMw1J$U4Zr zjFg@1Axa`dj5TDLLa~aKEtcS?6Xj8U2*cCVvv~X}Nf)U_UAhrW(IMX; zWxxC(2-9(*_=EpGl5QaKST^A3;1!Pj61qJZ?-L=kRYcI{8m$ot@`4Cjw56p=#%=y+ zlYvl14Cjp0UBW@w@Pm6AJRhif@Q|n6;a9~uPy2=Qy#&dmgsqn=6&l;}x`aAaIjs3w zPmyS+a2DIBFsXd+im>n(YqSWL9I9vE_U51x>GCR zWm&A5vG4R_&R8Ny51`se6q7YV&K?0b%|k?o7^bRhLZS$(ucccv5$D>Vv6v4FPGDq;T6(if-{9x238_1ASeAmVkhrH5eZO)+r7HHKl-1@G|YREkdeP zN$NflUV#RJv9!r{RNpU9Ro+iStkaq@r_Qtk*==EYA0H??z}ShvZ%lWIf+joj%^2N@lJv&kN83EiqTa$OtcPrUY4 z@DlbATz$asRDw*fVpTG(;gFjj5vpjKYi~0MD!)8f4_n~4??Von?1?NWs|6BZfr@aY zthc4&w#dYgcRzS=)NJM(NY-VQEHxpq%l#C1+soN&pnY)^+ZePWxfOUVHkqOY$IbkPpHC5%n<}>EMdoLl+RK3vS&nkd%O%;qK%KIU{u* z7w8l|u+Oyv6ndYb8M#~NI|9oed(b9Sr<5Q$YaKUsp7ZJE!&pzEkO%1%jLVv6_?7D$?HAg3N_t|(-$i=Z&hd)ROohB z=r2_m(mpp5er~M${M^NZ=eQ+k{2D}Bm}uw_MeNg`b|JP^fP;T z3@oO~p_|6(7$||Z`nGU2S(GJ=Gb-S#@Ba{G75W*Ie~7Zl0o4yK)}-xJ=a5BN;p)7A znrCS>g~GL2OEu2|YD(p6t1niw+t$9gRa-+_hyCfn_SdE#qtrJ7Hqq9#Sk}F)t?nGB z=oYTELqhP#l3Vf^g~gn)TR!Y0*aEs?R<~i^vZ2^BRo|{*sk_1aNy9R2<7_|z8IoNu zh}K6neC=*n7=PN0eUYDf(6}w!uzM?Od%aN@Nd)QLQ-3d;k?8e%PsIR&8X^ zYgiqpU~zBwOxwVw*RWf_iApCj)Dcl$L=3Evzm8MTz4^#4kx}aDbt6dQQ$ZMg%h+@? zOHVUuknswjd1k44ua=D&*1W0JjGbs1Ds2%9%n}}K6^6A4^c2xsHR`6fj1Cr@RN!KC zDv_z;1@ zMTo8Go!M3|bL(Da)HOU#?=cbS=&|bR-R&Ok=^i~({NX9%B>;Suh&XpsnVgt>WBbg@ zONgA5Tm{fHVSL`|F+YO!&~x{!-RfzZpm4Kl++=$dGV${BL}!RrMMU$}W4R?p~N*MB>`@0(ST(V4#G?mqBJAD4dn z2s!3^gG;_=(6_KtYGuGnwPaSjTxZ?4;4Do~Nx_BY81QYx4d0R7_(u&!`;MiXVvSYO4-k5l_o!1|_U`f2X(I}cW>T_IV)0TF8qfJEZ z=_?61Q$2I3%1FFE%2ILEUvV&Cugt4?_zYc}?}OLcGjE^V;JYl zm5~=>Hgml8)p(sX#2^2*T#@(I-dkDw&PR&8Ut=d8(-9*lJEE+G9OX0ab5Azci?(@= zds7tbB8MR%+!)~0^a^DB3?yL_xrm>7ugI`Tm`d}QEIuxbSt|2T;D1Rs>ZLkS#y(oQ zHpMT@VyAtaO33kZP-t_SdW>5wPG=q84azwJhbm7&h2p@zKfaR(1fG|UVr&go|$v(-*L%3%#cQops+y-5IJQ3@@y;gU*@C#N(&i~BTY z%A45=P;t!}6nppS%;3+=xxqi&4^j3dNQe}+H#UqqSjTEzlAC1UEvBfKsw>6 z8wsq(I7Fwwgw%dpSB?+2ML*uz$FR0yIi#WA%NX*|Qay}nQr;jX)diH~q7xCChdH|Crwl+9RC$E)La1BI=TK2Dz4Zj21g97!=nT-bygOxH2@s&Qd<% zBzK>8tr)pc%(%+VyiJ&TrH)!BOvzMpk55hpG)*mde1bF~KGqKn#`6pZAub`o+G`Ap z{T5GDNZ-N@n}R@Q#O0Ywv-)>f3kF!9p^lZWFLLyDKvw4i?M~hwqysmyRS`O9Z_6{6 zk)*^vAA5&YvHYgEO^^y_sl%6^5^N9&GL^S>%L_4b;?6Gdm2fu4vQf|{6z69n&P}_GShiI! 
z>?}T!O5KM`+z_?h|7`)aC@Z%G=Gu~MisGwe>p`E}^aPRcQz}^(&XajqBIpefAtD!n zd-loN;47#FHeUqCqNs0YA|8`@SkfU6dqE2B9Q-ZtBsV6J@!bl)KfAK zTatL2^p(p@zS80IzU^+~Rk|e4MPFvxAR_dX$206r-@!hHRT89%1h3to4HKt@I?(1J zXG5PmDg57T$kk$82B@|4@6wBO+ z5Doq=9p`XXhf~>Cv%R|eO#*$ME9J8z4&E!qB5`a~@#9qkWF)o}Zp7-FVD&S3QY9TO>{9I5ceKx+3i&e z`v_={UW%1|b<8iJ)1=0S);{Jm;eJ}EZF4*2Y{EcR*lgG6>2$Mo`}R0?$RS{a94@929j`F8hv zh_)WdAe-{lf$GCgT1Tz?7llPB8Jvj%d>MQ0)6df6;!_;r=NFC%F!8ZybJL6Lh7j^D zUW>8k2uY}hDAeOSKn0JG_waXTcuRpJjyqLhR{A_Qqy_DP#} z|Di(3V&t>X39|tvkG&G`WWbbgWGUP1Dd5}lpz;U{4ammX@sQ#T5o%wi7|dNP@Ed>3 z6`g^TF@8YyBtbDUXpQ#1{i^+el$f<3hjL2QI=}3WW3|)0^O17S_b)v1FH??q<-q*l zb4ZKUp4qXtocj3X{)gH=JT=uv`%%Q*rPog>{X&@cl=c&JxYA_c5(!DAqJ8;JEwQ|Mm*ugY3>%dL ztMbMmf47#`l>e-l{I?6wLbH(q54|a@-kF4Nq8~n#wm+G`^S;`A zAvf;P@J`4n@0PoM6`nO^mJU~#@A1}Hy|dQX=+ZRH526`=Sb8~6?;-E75#3qh+Ot6P zOY=`p45xRgh&;m{XkTMJ<&=^NY-{u7eE=1cNnXV1*Uv0br>tgXn3#_PTrWHIQwhYE zur+>d;nc5YgFN?42=F9arCd*MP<>o_$A!FdIOuPd{W9GonG3Uf;BURmdNS&vcot`- zthy59O1R(IPuCkh!`8wz3D}+)b3JL=cdF%*ACm`SFQ_}PzOHJ|Go$u2BZeN^xQ*o- z4Go(-Hnzp<0}`3TB%snXB~sRo)tf^YECke&Al%rO?2@;&7Bv`&xazvZI&o=>)BQNY9r;-e7pP-;DNkMyZcxwhp3u1im;pCvu;#S`A#W_$8|(*w5o4tgbx0x1i<_Yp*+ZUGTDxKRov>zPM& zCr%D$AtGGVF=I!A%o5N{$$AQ)vRk5d?`=*AI$FW$pN9CY>GGY^AwT&Y44rGq+m~$( zMh_(>9JYJWrD~T1k@+H>6MSdk#KIyZ zm^+d_K>xmZWwTO^0iDF`Xu&CPty%lYk0r&MOPiuvB7)o-j7byyBhe84LyYT)lK$$6h~+^eJ>wylC-eY^1W?r`V_dqRr;mn?}gBk_bh2d*<=cmm1f~lIz`_ zWyat3MP94OQ}l2%9Uptse(lBcdXG!K00IZ=*-xlJ_oYP5=XbjjUl{7AIPr)c$9cTr&D9YoZ_|r zD!e*RW^O?rBIdrl;s{I4p7C`F|US9UGq{`>;Q8*vZN1-+kEEKYZ9XGyP+;pU38g ze% zm1eiyVv%y2=qSy3nSfS{Vbd+Tmt zr1q?SD^+{rG1k zc8(tffC!NAAR3onO04~#mDqkpq_b>hIMcO-tRNQPXfuMW#M(3P?XNW>{*4kli_rf= ziCrWsv8Vo~#IjC19xAa;NMk*&$bTrYPJbw|k#YY}VvC#lW!@!y>5*cw``klTVn6GV zmDrqa{*Ot#eR+a*LpmZql~|w8Fr_0;y4*5KE**ReqpN}P^0NZ%q*@SUB^Ld=5(^aP ze$K6mgcNa+5X2bAXs9u-dLQdu_ayjz3g@3ntPzsCf_{LXYFn+iy7WcBTf|Q#c2V_* z5}SbfLy4^cq_nY~852KKMcql=sX^CJLRMm-hf3^gyKo6)+dq}qhlSIJN-TpeS&7}z z;~_$4G*9J80~0V8Rme(goPypL{&#F;Ka^N+5u)^=5_>n>Tac{8mSf0DY%iDnu-y+O z_8V*5p%NP;B1{lH-h)Rd#RZ1Oar*0Beeo8!azI%z4AdXeI#gmyC2VO2pGdT`l9kv| z=|d$}`ize7MZkn1Pux!>Hmsf2{!odn`I8b`pi5R_zi9LH4Ie78mSiQiy=Rr`QIYgd zC3aipZl)1giS;Hcv2nkY*bT0M;$rC`%*{q--n@zy0_LXl4<*(y9RCpdB@A!Abu|h< zdZ@(e|6eJwTVy463lcIDfkm|Jx_ z(QjC5Zo>i59zi{eSY&h+%jkL#p_!FlD|CWzJ^u@pl7-Q~Rbsmly4U}r#D@N% z#4eGQSOuPcp~Q-Ih^5jVVV)6ys@?P$xW6xu0{zC-dxt%jvlRJLiIw=TlvwXs!@crC zOwGSnV&9UL*p7cFvEu(iiACr~Wd?uF5xQS?WXSAaE3qeme^Fwc{;I^f{auOmNB&1j zEH`_S&drnPJJ#og$GpDZOQL6U^KDEU=YDotzwL`AQ#a<6DC-M!#}Q~kw8qmo4z0#v zc4)P2)MOLWi95@a_6K83Ka|+fzbLWKBp{n~T7ZRP#+Ux0#QOYAi6v3}rog}2ebPbJp=|5}MX`a_A;_?r?t0L6McGy|_!DxgdMuEf^-S&4na zPgY_j{;9-1`l-aW|59R;ekrjdWF=PVhZ0-%ONqVs@08e)Ka^OJf1$*Zzc6GaR^bmN zcB=_*{3j*WU%VOSjRZ;mP-3Zg^Plia{I0~J^rQZ#l-Q6%B~}BCh_eNmuhC5+!zk(h zs>Is;Nr|-wDLYb|_k+y)!8HW9%TFcNj#?QhqyL){Tlyy@*7%1Ki~W~MEb>oE?Cl>) ztk$6t>vOP1z20W!;}Vr*7L6T&C3f_8B^C?cMkBRclQ6fvjmb*v9F-VJYx9~o z-|*RC1{tbH;*($0J}vyA#FqY4Vuzr=l-P>DDzTK*b3iFjR$}i4G zN~|^>rnE-!jI6}k(ox#c0c`)G#M=E-Vkv$ovDLqnSXn%9j*#~Z3E^3z{hJaS^d}|O zC-Wal>>BJ-i-gGI-<4SVf2G8_{Y8n@$o!$i9{r`n=Gy*HVo8)yKb2Slh^2(;llUJ> z?A%&G(;}k6t)MD~+yeTc#D@H)#3KKp#FGB1#OD7+iPcaMtofnD*8Ee6rT@DU`|xi{ z>>%_|i3R+j#LD8p^V&2(09C*bCH5RyiH-U{DzP2^FG}pOe<-n*e^O#k{iejy{7_<5 z{-MO`!XYZTpE5K2mntcnsva#NGKFjARQ{yI;$hN@luAF8SkE6ytn9y3Vgdh|68j7d zfZ+Zol-LbeN|Z3>iy)9&du0y6maF>mDEjPI)=wow#=2$$yQX<$)chdqi!uBL2>ptz z#ClTA_Z!TU0suhjd0VuILaR6sJ@477mENjT*Qz(sY7p2;uvHS&Z9Q9u*4S+|OK-CX zY@PFJKDXPpSK4N;(5@5sONoukY?&v~URtKrjHMeTE3p%>zbUa;vJ#sPHYY2w?$ic# zzbUc4|5Rf2Kym4{lD*-!Jk66-*O+~}uzk3@aPfgt23t0ZJ|NuLewml8Ygd0D6KP>C(V 
z!NUlLO02K#zffWw&~V1}K1dd#2Zk#CFO^td5@tfBkH;Eb^qUepjE2Ri(4~`T$V;6V z60{kOtm>zVAsi~P@g(T?egu&2P>H>b9*HL_v7R&`ghM43#sx3O(=L^uHVLpP(vjm2 zkiq(>kmEz1X|J>bho-DL89fF*ve62aP)}q^rv0R273ul_FDEjAIe(&Ki%UQ;cu>48 zc%KAyM#5q|sma?s8x^`vH0(eD@*GcyzR2b&)q(dk29N;)a@mP$oKo zWQllx z>E1YM6~R!9INeWOl{wx?H$*!P&t99@_e6}V(2P>l=_I7F!=#fWjjg038596%?M0n=Yy>8hct zBdm@SdzR3!Xwg@18&)O_=)mcS@s%Z^2D(lnt$*Bny-Vv5iPvCo{$p&Xn8?}#Yd8lb z$v|o^%?C=){32>-%1(BLY`H;6)dJP3ty9TH<8ittn(Y{B?*Y_2aeI!z5_2 z3SBau*8YGU+6p`+n&h>OY$P<9v@t`XxrC=mN5e>> zA1C$cVoLfVk<`s4bk~Ac>j2-fJrTp6-x3YzJjFJ>KW%0MrkY96Y$Ww_^vn%U@^)fn zzxS2~@twEcd&sAEz5vvv{k1DWqylS@JBh|C2;o%%GC@+a?ym=Pj2FK{lo~GV<487m zu;%#<*p&_CXX7PA>VD+%DiXoA3crV=-4jQAC6Z}k_$Hc0v+=7EVsFrJuL?=Mbl{12 z(7!{&1&^_X_Tv!b!Sih-tmimv#&G|AFrt_Qohv3^(eICKp~Xb%?N8&ygh?m`a@*>h z*S*3-Va@ouS25mksX0316YE1am(C~zcQRU^Ijhb1EQWdS`_P4AW3=14`Z>v$OM@?N zqWB|Q^9Rq>ncss6Z8%?<)XKz0z%drX3r4xx$)ZkU!yk?Fjq(igEM703D>Q$8X@2bW zl4*%un^HYbb77RptKJxBwrKa5bJ2*ik&HvpDXoiKPb}rttlySzq{N3@`(!f^sP=h1 z?fNLEa*J?s4FkqKw?w6?)%q%x@R=s1=K_`-ow~zy0ol&qjXMjzd>k%!J57H;Rn+H} zyZa(~kw=K5F(g}7e`4}jj=0gP&aBajPKvNixlrv>UvCb6)BOqSm{#h0%69RsZE&Gj zhIYrHoZ6EK>k?w8JGY_>S~Jl8nJat#Lmv|?@+{7Nrb;zvJ-VaM+Qaa?(E3Ur)ajSqz1;!^X5sfE6c4acULfA! z9_qn#Eg0&hp-L0O@B;(qkg~)tRy-x`G{2PCuV>F1pYu6qV(GNVA$Lbkk4^Tj1Fe@|f;uk|AOwbi$+NxlrN5~3Q)wDroo zts3LT&3gf#CoA6tg`#8#<~`qCRE$)7zB&-pnEV|R50%&?VbIGXcP%e{XT0Zd*WWqS zoN3;WhfZ)rz;4vfJ{nU($U!Kymo&#I1wt$ql;()Q1&J8QcJzV7)8esM2`gc@Nu?42 zd+L+hYcop{+cce%TMp>%RR1j^N;1J~r;pWPAw-pZZ)#_d+|+Ii>F={3LX zDE>KUE9_)H)9Hsvhe~XhaOEypiGAF4q+WjWP>E%};oWB_7jt#c*wp&6QEaNy4<+{N z$cf6mYZF)B-0+0%ru$iy7_-X@n>aPhx}(*}fw1fjk(=lN z^Vq0X#^VDb`hnTe+$n9(JxkS3E7;Sw2>>M_kQh@od-jzo`SK)6ap#b5i(dM0uey1W zEBAHfLQC#?rGRg7cXXSt;^I&Asw7-r&oKuGYLz3;+`iyAXTfo_Q#&Is;bJwfRfV7C z*+=wO{Y5l|t-tG-k(JnI6TJKv{B^Cm%I+c@okVZ=cUwO_lhVR#E*9*sX9^h#X;1KKc1xAFuO?qJ!wCN43tR_U7gsuk6&P*7%%8Tz-GzmA}E6xX&qA5&;wk(DK<% ze6SjRLE(y=q4#^^hbXsJo|Wisn*E`BjEpX#oBp_PA+2=qtL7-)s7Mh;@4Iw2$kPf| z-55Ty5wXsz5I1l4@WgwH zv7Pj&U;3lAtK9GdKA*=$qw2yIc4w7E%h+CtFP;syAS#?W^3?R4n<<0aJrS3$56x4W zlrq+1sb9<--ATck(T>5&QjZj&&I(&j3gN2AH7J|;C4>3t?4w)858@a@Y(=1LFZQn{ zMCPIGkF)k2w0%2Q#u2E&lV8Qu^P3Wz_BJZ!Tx$59yX!9{w!o>#9JHf9_`>8xd7g)d z$5Mx1`?nXhua{x{_{eGllPcb;sLOHVZ`G#0rkIbmIwkB_EojI*$GWH^Gh%v79J5fT z2$y{8cdT_yHfvt?tX^vhm^dnJU$wM1DkU0lbKT@zLy^0qZSdQU>%FFpJM&_q?-EDP z(?~Y%VLU_Tciz2xe})KYaFIe&q#PB$ThDOv$dRIM+k6iCB*vT}Eg;JnF2?z3u7eP24`yNHP#mLEPCkijbb=S$B z9j=AnvSzz+vFPshNDIdYpJ07=$L>LLQ9IAVtLLJNFDAd|(Y5DAH$2=g-zgY)f71BT z0}s!emJ_qiUtiSTSj`DcoA7i#P3JYxBiuU9P~3C6?io7n)|qUocaG|HAJ+5@chklW zA_UasidCY0-i=KxI9K6@A9!MzbASL(V(o4J5E_u}yP{lmFgE~T5?c-f24i4Ae4NL( zOdt)OD06WQ|8xXB*T7aCC`AhKA|r1!Rlru$`n5=^y=kQj>oNlUfbifu6vuq?_*Im8 z)4J*3%@$(2E5Cc?T%!c=zH}NcSoCow0svtCn!FGW??^=bGPgkpz$Yh06ldv&|3NlV5TYT6)&Tu~7Cbxkt!|$MyU%W3$ zQm>vZxl!TW2Q)=`Z6YN&b7yQl@LI(~%az^>B_k04U3W(S07X6uShG&IA_f4^+eQj+ z?sqjbo}rU}yfvHvz?09W0-XB8sNk6ZgkL6Pp43bMA^=QXti1V>^@;Q}VWJ z8+XSWSv>)!WHeTQiUv(VLq7cL2S{21gU+gy<$zF(%5Xo7D-!T1r$IKXVPW>g$E&J9 z2N0mY-h?d1YSj<|8bd#WC_0)X~13W1vDDekNIUvBOa;Grlh6A2zA%ufK z^pXVvyfIY$c$2P(q9H65A@Xt+fZ7xD%D4gCA@7``+#jYq<)DfUtmFUyG*r~+i4f;1 zvP2kZW7}eqS|i*qhbU?32&?7DRFhw>X02;tTW*38S|}LRRi)KC$zrTZ^|YVs!Lk)`H5#b%XKDXF&3Xk7Gq7xVyt*0--M!+v@D>iMVQ#=MHXWTdDcBGiXbe& zPlhF`U1J#UCxun_R&kpP^+EzERoZ)Mn@yUlG{V(f0RR>P@OnkN|DhO5(UVrM;$E|x z)^tIy!Njf2q#27?BuZ>uUjBwO!b--UX1IQ*}3NPj< zg%c$`F}p+6EmvEZmm^+~xmcvaSU}#j0*Ys?E#nRyUIe@;quSVK?aXBeJ@louUWg}5 z8?&U906Rel@4n+z(-(lDjslpGx!5Y~F}4<0y+#uuN`Pm*DLoLt(qTFSI3J#uCQ=Pt zpr9bixB~zgv3cs=7@02sz(Rzjg$|T4w41TTu#lpqr+YI%-rY}G&=aHi1@kg6j{}D{ zp(mEPW5#NGdc5-(x4I89T6N5_@w^q?Rq9WxW1DNF4Wen(%w{xwyTQKcVKU1Rn@jo- 
zPYMeaJvR}Mg^HZB79O?`VM7RglN$ld2N()Os-{$Ph~rlRb&%nP{kv)a06;>73Ly^A zAx6q|^s(9kEL<9B0RUx~KBNO|n284qH13_Hv@pY35FnJeK2=*hjbk{+f+Eg6kE-Le zp{HnR)}=@&i#lhK1pDLw`_h0x9GQy+aFKeYL>k{ZfD@A* zS>axr3PrxZm}j$A>zoK_Z^a<k^66{SF0g+c#wsGu zhVW9A%*7%{jKHOa;Bzz#uP^RBx|H7(RCm(E7Q~#=R`!F7HC{G>b(?rNiCuCs#fP1{ zxD*0h1E9XjX{4O<_b{CcH1)pqCWzfE^!TWi7rDkG8gjxc(qlB_^k{f;VT6y_f!Saf z8A=74wVQBLK!~GgYYFOwQ3&ac1CV0>Y^k&QNORpNxTEiKY8cr8%vg@(slqe$bX#dr z&{P4K7b4_jf%eD4c(z`b>XALgb1`2rQE_GssiV2ad&pModcY4W)>D(|(250YeaQm> zdQHhzEKvH$7vD|V|fc&7^GIOvrLeu4pDUsHZ(Xt5$%I{09Ax>ddh zTIB7SK4Ndr=C<0S8CG-eDbSZJe?DS%3Q#hsAVVinjxj$82E5pN#3*iD<`ACm1a`I! z5B}UkD-dZTKxsl~RAiK=+_E&?j$|cUv1X=i$r3G!6A5|~2NTJs$XM*56-$g{QaGhC zI_xX_(~1SeIa!%mU<>Op?r4f>Ye0H!C{*Bw6{}Tm^TUc=$~i|TaGn}nx9X9{NWkB} zWmWvsij{qBl{sNGdVVa9Y{lY-c|xF`3S3e6VEG?bEMc-a7VkBOm&DJ*Vowqnua z9AqnY%i3$kI_tAFIw&u?R{41z?nH#~E@Lxnxq&mth{DBcKETS#0!xVlphT$XZD&^H z0qz2k@LdZEQm2&`;DGxyk)~fx)dv97I`w_NYSq$?G8tbV- z_F5qTae#EZ{j1i&RUUkKvhpb#K;V!PaILZ235+@msKdyku3>2QltII0=8N{ylM1Ln zd}S?=LI8kFIVE!vC*DJ}!r{rTv)-jalQa17<1-8lGm%#XoO(Pv`uK^|COc3Q1UG)T>LbW9Eq;$0*UjA{oe;w<_pPday5iQbge_yEuf zxkLg|u7oIx;$a3!mB?_j*lX1&b~kq|JWOUL$U48Lbiw#4;kx25G|+`OFrK&7$*>B@sQ@@w!kG)< z$hL&zqvhZAj1i@BFR`O+&wDoiW_?A8a!wY{2M`qhNcZ02crRG+1fD%U zPp4u&CBrqSHE3vZbT_;8gZ2-lq7EKU9N`NM%M_wt1L3vQ!5e=t?z(!8863a5-e>SYoDIA8x zCv^A7K><`L@9XN^P~L#$pq{3?3P_v^MgD-+LiN;k^L(~Vd4)W_S!{P+emo0htMU=Vq4y2==({vCXSAk3^EZ* zh^&RH6pd(_+DGd2VA~W{L)=})9l=;d+;Jv_bH^{Dj^j8*WOdMZgWZA z0Juu-KsZd)1VD{!LcqL$DHp7m+<|BSRX!d>1Spa$zLy)zG6AUYcvS%i${hf#>g(N; zr6Q3(SRaQKer;a1;KsDqmMmlq4@Cmx&T9c=y*{lvpeD?Vog(^90M^6iB}o=^L_B4t z_Tny9IkyM825{SRV*$Ajc6Y$?Lug4Bt}b{mfP#fENGtA|aZRyYF_iNQ9wvTe+R|0@ zz`C#ab6m|;en-e^fF&h51QUxl>;V`;0q{&v*9pMEbp#-gcK8Uy5@G8|fzvXRJcY`= zCfZBXHqbeW@ryf7DJkk_bLhhY@CG#yGdp{ksv{j-y~6zAk(%@ki_DB&&BV?2XeF+nQ1+Jh~fz{#%zAdwj7{=oAwPTZIfLJLGk0Mv3d z&&D=fqA#!GLPSd%Wn->ojUnHu%AJF+%{$`<82ZOM_W64njRI7pvE1dopf`JAGka5g z7(vud&~FCxW5H$hy}Z~?5`>F3!Resd^vct zQM%gY@fUunGXcRnjlo*Nw79eb_B*t!lu=%PQeq|D7uLA`sl>LRxh=j6|E|P-+`#mt zNd8lawd-Qw0@8Qn>fDp`Sp22LCXaUI*JW8$daW#e75=Hj%70#rbw2l|%wy@>@k1q6 zHCsCOTO=Mo~L8FJ#SFp zoc&9Q9e!$BeraX<#6OhSO;;`RSe+5~-5*NqL4^Akxi1TYkAEq#W0hAw{auN@z~8y6 zxVyFSVaQPM**P8_sc)ZuDzP-OnfT)@dB6}#RsmI7>O&}i@keABgKvpYho3TUolHLtqk$! zP1F{u=DlO7=x`Q^dK=|%=LVhjth$EDlQ+pq^aqug6uTHXzWW=Bf~rYyeRF4a$6uT+2Z<> zlu&vT0afm!(*lny6dg=0=-64G;&LvzE#~IeYzq~ZvhpXO3RrXV3bC%I)P!mFpl-$T zD`OW`8zBKAWt~Fu!vsZ|@TIb5xW*i|Un9V{9BLkLymIpTF^?C0LEk+LteGl~S4-U2 zMb><**%7S^avr}_C-ye_gC}>7`7^yu`lL?{yEF>oO*&()JOpqH$UOD2&;)%8P;OVe z?TBT1`U4r8-Nfu`u664=?3FZ2IfiPqgQ2NRnw-zUc5HF=$P>GAtvX>n!VM zKVCb(ET|!p`~qA!rUZ-1#)Ti98>I>sn4vyx8z%fkpp&VZC;l`G<%w;9n-*jm*7b{~ zIOr{vi`gK}!*pJuS%xUNm{7XY1U2o0xsq^hvnJ#N#@wSHSfbBvY$z6pgfb^m^muHw zBwrP2#_VF_`2FmT-K}sE%ZX+)?U104RC%Nv?yqNicP{lgF^uibTl&j>Je>SokLAwW z!!Ix7@U9~@Bv-Kpu7l-#>sq*ZBp!(fqi~h8z?4hw+H7)m^j86ld|m=T z<@yki+=y!YUOvBu1AT%Lu7$=%fLd(-J4oizY;v*LoUW)5NF+s_Q8=y>Ge^*Hi^sBj zB~WrmJ~h5qrN-FH7Iy4|U8h|0Am@}BkRzX><6e|n{FFd6dW9TrQCDR&B=?i7P%u07 zP}U~T2>JYVoK#gPXK@s-R>Ady*M*m!7s$^$@q^669b>1dO?}?8y(PsblCq(Gcrd&0 zG`$s>hV`1b$Qd6^8!VobjG)utjrT{~+U6;}YUS=QFYvl@@pkdcxmGo>CB1J|Gpbfv z8(ZYBcak>@^Efk4*`_ZLRw~9YI!!Y1#E2tqt{#Mv-cO&Q6S-o zpcLl%)3|4BZ^x~)zpba}W=0CXC3OT}=em7EVBw^47rm)c#UTyr=;|6leL1I;&wn{! 
z_SU&u8h*`;FV=#-?-(i^L^%)-6r6cInOksfo%nk69!r#Yls%cBDAfsHvLVy3@m_ulEZa@}Y?b#ODom|;7VPSX zT=XZXjqvcRXT3k6smivFym`!^C(;)7mA=@loZBjK6jM>oJ4_5s-mGzA|2&kZ$Wbgw zv8RJg``$Cq3gob&xTRxOiHO&Vlu@TJljFu_ovPM+copcTc_E3_%c?oV@X7}*=oxsw z=LT9vchUT6+pxAXhTn|sqQz$m;TH|hIsLN+<;p7FwC_{npA{}Uoi~3r|A2=cetp?B zY@WV6obP%0$`RivwS)1g{gYKK$5*f1A=9w3)XJh`iC6EXI8CzC=sR2Hz&H zssC$RM(l{-v?hve8essCCW%V4u;4;Tm^pt8SR-1#B<_2_a_NgfgrOR|=Of5}RnDEvr*k0+xcr z)~=Eaw`#hx7Wyu(1u*TQbA<_n4;^JqC_v*EhC=aMjiw_s#cR!)7f_zTO7-;>b*-mS zQ=LJV>!G&~wpK)12Q*Egn}frQZEBO#aojB2Wpz=t4EpTRty7e&rx(Hm_+ZmJz_%ou z{42x)g-8>oCU!s420iO}dOPXCjeSC$-O54)+dIX_BJVuWI?a)osF~C%oa;I5gzNkk zcGOxOu3GD!(d&KtP2r6jWq5(BMS#*sSpy<$cYxr@Q>EoOk;k7P?GZ z?HILOmTTU+GWbnmpk&=xaqEcSiZv=_==*dA@rt!%N668Iu$j}LIwv^P;+h1lj5EGc z&?7nLzoMv^nncc;pw^llqwjn?@j#ZWZ4EawplyLdM#k&r+EvXWaUpU&HuCzm+k(by zb`;8!2Q8PmGS-?xC8nFg>c^IhrdZH6D1qiM!Hh6FWB%$vdR_))t2^gkTf&u$PQ7jo zd!U~Gr1fZOD{IZPc10Fzc58id7W)(z$NMafRW8m$C3gD!mlh=wrBX+jLPgjqC`iUr z?SyQY6kC|sa;O+v7>|kC36X4-3N`r-O0Lzg8VYWvl`w@QH6D#@p>S0`ULbl6%Zd+S z!iCVc&LDg2Mb1zhaS36@v z*o*6^3gm~1Y=uglRgX!wn%gUh4_HNZXxsvIZU|};(V&)XH&e+vt%Wf zTWOin9LlXUswSUADWfnWnr<(Y>LC0mhrW=9lNWfx#^FT1n#^?`s?IRkE>)oo3Km?G zRA>l$RgTbdvxG~I!szU+C~m=1Ey@Om@T-P%sL+<`iHa8fQ;rnaT(SI6-V*y$Zde^5 zUNs<%tcIgN3J*&wH_KWGZy~Qt6|deHLMf?5B43qzG=vF8VZf&=fXDpwFO69lcCX!j$FezClat_~2CsgFDnmN#UqoPG39LPyL$Y#empBSYa zgg)YBd_+HrOe(V4Koup-&Fcc3 zj?JQH!~E=NWE@pRBUOV`U1muw;cnOnp__`blovGSZ5mwUTOS+X4;&)zJ|X~t*CKV~ z!hxcZz|d8k$Q;F0b)a;WfgEorJ#Xk`ZYtkvd`GPKWA8>d(0uTn;eV8*#^?PcV!J;d z6v8Q>sRdRY&oelY8^Y8PdfNh`vN3)*r_3Ph z-|q{j(+kAK$$fD{j)1%tTaJ~fD{L*6WT*=WgoxyWlnK)Yxzh&jPw2Ip)0)Hi7eNZU z>dLZVzB(TRMHY2h1@yOoN3>KQUTafI7uL-!a4uO|SqMeBA2z_5(QEzsw4;NruX2?v}N#O~dQRVbWGxQ4)XLI%G0#+4= zvye^LIgmmW(JF}SF{`tKdh|xs(jrxrMJ^h;Q&xnw$t?FYkKzr!P`a;%3Dv2KY=I6< zSTj}K-DT~}CRTt3Q(W43OyoAHetiKIxLJr zUMW^gG%;ay7Q)IGkG5um#Nl__zP0v@sh_}M`^Lf*!riOWTe-|!wR7F8&pu|tFY1x7 zcVkY!>I$zK0X?0W(T&1S54N%*kLzl!YF8DU7dcqb83hc#hbuXT418%}+*14KrlI85 z#D90?O?r_wj&j|Il0Wl^m_R6xW1%uhl~X`f^6Y(i*-)N2HDv+!nGH4G-C~{!V+E^F zZhT8u{r|__U4FIswvC<-!2&@O+!D07y9X&=q{XFJDaBn{C=gs)+@WZZ0)+y_p(MB! zr+D#FibIPQCinf@&$FMIwfF2b^J3o2{u8cTYh|tTI*#-EsnOuw)tV{L;B`N1%H>J3 zYK}!pdMLqM!X-z6qBuy#>~OTO@#WjvC)?P(X{&}sI1+kcAG&bFFA>z$VRDBD8bRSO zdla2d3A<<*%?!8v3_>CXMX{#LTaB2qb!8;-sF_1Co1iIfT^M_jG}=8BGat0(vEic| zx9@NSXpqdukJ)>bHQ2%hcY8Frl$u+y>1U8K3={%5K#U3N6N@wwTYf~-L$|A@5rn2N zLCPK8E3<-*hnUcGifE0?I-1d+%rov(L5G@bXtGd*)_#vf3_!e8S)vIjsHg;WR3eKH zqv=J01ODx$-EVf=Jhruu&m8b6}7YK7) zN3IH4;9;zTd8}CoLCo)t%PUqHnsvxxH5QzJ{v>>HMG$&m4l zG7G)0#(Xy}cB5ItXY+yHM^zE8XX0pL4kyfz1cS;70iXGBP#s@GVZHZAg3wZq8%z(i zPh8zx_c;8^r+SSRB)DM)qCIETswA8(B1nxTghIclG<6yGGiFBctpTa>QiQ7_Dsxn9 z3vfgqhr6Bm@sH#w6gDU%AlTT$F%M5EYXV#2r%&(}*YSTwfGy^c&0OuhGmPe-V+ni! 
zr0EtJcB)aB7x{kOL!TKzSlUC@BOy+K`_@EB!mRN-@T0;E!ZWLTu&jqh0y*5D>sS^c z(_P=!)&m*s5ocRYjzu;yf7GZ$z}J-6>%t_0W*}M0Fi8Q;7?!}qmcSB!4ViLv6d&}BF~o2zmN z`st!r@=bVhCeBVy1PmZ9UZJR=Cz!$$i2eL9CiOC~Jc2w3>7lI*FF3@wQv?<>Q*#X* zl6-78%Fp9TNxsld$2NU$aU*R*O7c#Ha7kS8VF~s5Y0MzIufkm)XD?o37vp17tszVA zot`0TW1?GS(ZJI0nr;Ur!7()HeH;SuA+Elqw z{DmrHy&uf{H|2W7phwCI_o0&{zO|VVuXB5uXRwC9#u7Fv4?e;YS5yjHijYykz~yZ$ zCez$cmEmxdw1~e(03vQ3C=k%YGPE3B9>F_$UH$F;x~2e!`X*dxf0U>i^&kj1J$yqE z1K^r&(dfPDaOZpL6FzpRl716zd4r8%Vb_?hlz3Wc(p0!^6~T%%mI6UNj{{Au0yTDl zU$2#UcSFtG1A9w)+Ztr}+hjCp5RK+QanUfC=rP?^7)?O{R~njV`R2hqPyS<}Z5#>P zs;ahL`bUH3G`rKRvo())1AjHu7hqr8_k{}{U{)4QBn7%9UB|5enAIxzz0n46n$Z*2oclR1m$tX;8aqoQ{ymdi*#)R|<|rifrKt%4AVV zk-~DN50%v zkZlmjhVzdC(w}HYBt!FBi&OdP#t0(h>*!2=eM5z@P zSpye%QCH$_b*~+;e`h;;!HlD(tzzm#H?tK2S4qv}h%+xa1*?)8E9<2S zs4=3y0M<6WP9HWWSQP1V+bW~Ah*de3Ks+zUsT!Hft>Z?v z+nh;>V#w}`z9^rmpXDr;Ws<#0mi65nV9S6}d@AKS0M|RM1ZWJk^495tY;`yKK}LO| zST>kVMfQ7A8;8HtYxkkFkhgD;c5h0p#Vg3?*(^GCd*TB>QS?M{nO#QMr0;|UWF#!* zGJ{yH)TcI6bmVEb(+u6X(zLJF@k)YyvXrP9yKW5ol3WRjDFU@=CL`EgG5bUYyOVu* zcDD_qAl9}?_8ucEKz|Hv0vRHKH^5$5M?7GC0IVNjiY!=dDsr&#PZ82A42UXri1jHD z#K~kCwxB)g_`Pbb>Ur)-2}`-6swi+hWa=pKDOSxp>1PaKIE}Y`fq=8RJ%Z3cxkrmB zw93dF3a8l!Mp3dnlDT@>8)W4YzmEM0}f#xQ4xOc{(87{$+}Q17fjMV_{X!)1a1 zTaks+E|bew=)FnpMdMv&>GAYtvRRG-V9{gPxAlVT#d z>ynA@T}d2qKWAZs)g}8NJCllG7D5TxGg}-|3mUqw>tEJDK(ZgE6KM+{8FFi|SF=zu zJhZgezrM6AQZ&^vUxj40&b;TVLUl4S+7pT>!f+{tqU24m*xG{_WU{!(9OS2A<1P-O z?=^HB%^I?&?4w0$D|KAl&T{YcqJ+DD>bOUp_J=6$^?0ybL&1644@GJxJtUq_!QC>4On)U&Y!oyfH{TH<@Ifyk z1zJK!%%uVw)(i1F&l$cgm8)1AVf>9vUp#6hA5lKl?+YjoImQQjEMiezZj#C!061^& znV-Nf?Z8Tg{ZbKaxFG;U#ScKFp6~GEygShfIrb0w@j27^e<-SyR-AMbb7tQ0f3DPh zv-jkcnPDB!u-dZkr~cCQY!%wD%IiSUG<|sHoiybKxpZ#BgT2Qc8$Q+E=~d=OIflJ^ z>9x;)^E|?8H$t1u*1cuqwr}5iGV0e2a|y22>)|&2m_u3qL+_y(cKq}NB%?9|CiKxk1~8Ej$ok z^r2VkG|@Arf~$4{s@m5V@lQ|c|E1d224sFByeP)*+W775|JixsWC_^%#Z9|LQIpH&ynp{V}U#ag=9!RQx; zjQ-STzj6-SpZDC>VPt)|$G{hy0p<>4^L8gxAQECbH?$|kfM{tx! 
z`}LP4ORVOf7^Al8dSR}~n#HfkJ>>cR)CJXLxB1v1Uk5WDBmAfm^F~C{ZkB(}o%fJ- zg_F3f&z^Af!a%E@iQi-hpI%-JYTO-#hA?b%eG_eW4|{89vAiwAi6R_~_kdSh(?+x) z2y7=#qf(ca!(U@dP1Lw)D|Bpc#UMavzH^W7z%E5cDd{`1@G~=P#znm&DkCy%73{D* zAw74Z(3;*50`g93`<_hz(o~OKXpR zTe=D-9O1-*BI8yI6oP1*cLC6p>zHEQ4(!K?O87C%fx|q@Ulz8dq(GBlj4n+3?-S8G zop%A#_+r0*I{aOI@$^RfXsyEkdopWAVNA|fO~xmV(TA}i?>m;5u7(^W9%+$9dN^;A z>vJMNLmb!!NUmM{tH)fPD65r9>1e%_UGm6&l5W5TWx z0cx)su^3;Y?l678Cer&1o3s03Yj9I28Tx3qdi;xYD8nN`T!Qs&$wTVbVGMr!%Z0;q z_J=^Mq?=pk`{l%4fF3aMcokHK^_T^c@`8yf%EQ4%CnJea=V5Lm0YQVuoXMl<44Yav z8(eF_N?9_f^cr#5UVBR|(Kx(=S9hcLXk!)d;o6yaDT$%&ki7aW1Lq;BlJ&jj(aj<0 zZbaQuaVUXw_pe4h@?0H)DnYjSZu(vX14|ebPl=*~8Pbi=@(WvJn1m82(cY~!L}*rY zIg{%O&9>-GH(^ZJ?hQ8^A_21Fwzqfv#HWMnWEYzBCP~Y6&R|(G>L0_{2&`jUI3>T} zze(6onh6vg$){iU9bc)K`SJef-YKo^K*%YZljrG9p!CIAGfE8H`tWNw%!4`mV<|%g zz_2TsPz>BYI@0krk^a}@jNi$0H3}&AhC!BW_S$p$R%jYvFpVQTa|i!Rp?k2O%xvdM zyV%WWY|DVp0@=Z$>R{Fapu$RWbLY0PLVF45d5Dw#m z?Sy1joed_ke){umI{tNtVR z`R{)j2;58woTUid8wlbz@J*%&9vMuNc=KVO@yWQ+X1J5@5azW&tfj*U?c;=Nh(8#G za`x17dczr5LSzN`MP^R1T+p#B0PNv91&>{5&q=7j^ozVZ$p!3 zO{3IM?)Y}COGI|STSlf^7N10}FIBenOtu{?yDq}u1!#vKudaI2%r(e$rpix0W;;oh z`#rsS>CG13fL-A=)nP*v2Vm7lgWmxHeN?PoZZTCCf;@`a)=(NO&za^!Q)TN#?i--I zH&BHIchO%53`>IPFVOZUW-cTK72_|PbMoqes+jCZBX6A<`oxeO$7}z&opVCHlDG&0{$+UWDNj z*T+l1-%n(I>&FWVhYYs`peQrCwcfL__ol%En8~b}r7M{s6zddL;dDX8RQH$6kYyA( z116Y?uqs5r3k}UG(=91bs3i@8+pyGvT42?g!8T5foNE$rf0=Eb9am>ktNJn0gb{apAK{ z6q%X0q=(V-zw40xP-+$|tn4#{0IKW*hJ{5z;5G)=C`cEpdq2SBFEir}g79-pxz)L4 z15|YY(FsYMA02yW41e;4$|~%M%XpQY^O%c*f$KMM{KX8HXg;_51rgU-ahHKVt}po9 zC;i-PDAX}5Y;mH^V?K|IKbALht|L?(TYhf(Gals4^Hdkk-m`6&2<4)6ioJ8Th;)7+ z7Bx3QWuTDi7KQlB4DnE6NLhEN0#fa$QUrZVDx=)8DSi z+^*5iS(_Y3!5oVD98h3ha;TV4_&ioQp0R5_T;cxDEbue70kf4m<*U!liml&fJE(&! zDPO6j1i>=rCM1HsWwI?@xIHI+J?kGlAVQ=jHh*Xw^dsZ-So7?y@#}RlhCjwe&Tm4m zjI^q%wARtuI!Zo&nQiZWIL9e!rZ79$U>VFu`M(+Plama%cR&ZcrDAs~0#yTeJYV3w zwr1g7$LDxYaJ&xvO6gvwXw!?LJcdh;ku!>MR?swu%0VeXX$(fpo(f|VFZV*Wn`1op zVp8X)Gib_l0)h)@@RB9xDq66sX7y@xVJiTd#PnBf-l^}}hF5!@!3;p-|9=`0ZiO0cAEhKeGPHVuZ; z0QEn4e+sbM>3IPScUM7Cc65hvpR$PUf(Kg?E_`MA4m$`*7-O)?^Wf>kzUknZbemRe z`@oF&#tif9*+y`tZ(t_0F_ZXwCR8dTI-Bk0jOV^pOP`WD8`n77rZLN1D!V+}BX>d9 z^g!sTse81f@b7?3KV*u&5{<$9(}-|M3KYx(!5M%>VA5=dgXX$j56?ss=GZ0+gxNbc zv7aK8p{Chz6zP-j!~|y2z3vAmJ4_vgOgROM#l4(XZ!6iBOn+=vl3Dv+H-~L|mO^*zmA4c_hhL7(%hG3yr7y1K zU+tCY@RdKyF30z$A<}xEUzmTCE{m^X)ntF;;UgBUY*K^5a6z%pu*!O*6KOwvXv8<#%*- z(ag3wzljqTreDs;Z<^IDwbh+17L&`Ye6{(cHTH?8Ei}{2IGunruKT$mT1#uGQJ4Pn zWAo3(H;qq=q3YtuSd0l9_HQLNw@Cm6o)K&NNL`1jm;GGz$rw(VW`oOg)2t!fqQKV? 
zche#z)0+9JC7-@E^@j74R*Otc>znqr5PF+4Z#4<&;K=#5_vY=fx$Q+VAHTdZl+o$f zOz(KSK#w5Msev~sBR`*+f4OLHv_bQ-?I8c4Xs`L$*m9T`Kd~G55mDu#=$BCg-QX#o zcl2lmZrRW$8%(@Gk+lQOHLtR>eQ@v50V{+{J-QUVstnT^wU4!y^>lCFJkIMiwa`2f z>3!O7m_DxdA*dqaLeLVwmqHOq+xCBFL+|i8c`0%w-Uc ziXQI{1?Bqo;x*z76xj9mAhR#|?QI7zc;cjaf`EDw%W~rT;``M0aZ<@i*8B;x`zexIJ@IGSWfoUhmxhW*-!aFS~*K% ze^tH&FSY+2Y{)O^VP5WA(c3o6c#uCcWVtf-cX>sw;sxEXlm<+cB0yx2EKZSx7vylv z43$RH%oRG}Y(SONbpI5Q=O_|UY>>?ris5an^>vfpu9Ky0n_Iu1$G0WQ@MbEec6RR} zlx&1oSdPntuKc80{luvL^lsU%Ya5ZV<9ui64cGKsozTz!fo=YS>s8KKaH}U{p&kgqdp0!!4l*CJ;;_&X*}yLmBy1di8lM00efsxX!QZ9tf9dltz6@4ITZV6~ z@qQejt?0f~-oWQ&W(3&w6NT9JUjB+6`z{)C2gkJm?KAhlGNuU5%Qv&XxM9=2yafrD z4kp8o_^u?Q7VifhY~23N-WFi$QO}oc`=BCR{xx6r)%E$eBITAMm3TV2IMK?f_u9GA z!T$p#w#jLt;NME@TuC0w$Emhpu<=&M165>SC%8F>%GcgJTW``4-USyiqp23@=}#~r zQ*E@~8Ly2~#umB0nIo@#l*$<&u(Mq|2Nkh9JTXT`zR3#|_jGzqEa5o6(CEC>^+u-o za0R2pc1cq;9j_~-Qlh33TTa)}alFZ`ve#-s4i}N7clu02K645hY5b>5DX+HFcD#Rl zDrkL0bd^lKP1kXoSRbfS&ljL@lx6t>MN zsZ92;BWWW1ZmZ{EG|63We6;HePRro?{21Ff?vI1E@qcjDY%R_P*CSc3K@wWwLWhzN z1v?66!o(NrC90fsWoK`aH2G;$V)^9>o~0Q)(%612DBN0TiQZQJCT>E=MFLR`AjlNV z|Q~M_;V}`RA2%9yThJW9iNG*+&owc1eS>i zUzGZ{nm&J@VTDm*8&|kqsFz={J`!$NGZ?WviD`Xa4&2aJffKF7WMIaq)Nxdcm2u7= zrS(6&sO`EVtgg-5+tYQLiBWnZdD8hzoqPn+zAR@%5vEn)sBu4%B>6FPCpjbMnJr`E z24uv!Zs%RRTN^hC4?q8q3pUhZmExqgX!9Zd8}Hx*M=G)4T_~x*HBNmwr`1~Ic*F&N zf^+MhJdtNFWW1lh@k9kp;NvbdDA3b$QWEq0ZQ--8kl;bl#6qW)_iw4{1BgUQNp!Wn z$Hc!UJn5VG`jVQlx&L-iHaVbd)BK7Cy*li9lFTvjC`3x+tAVbLZ=JOAOO1iM(Z?!N zp6sWm)3vSB`x@?CEYS>0F~YinM!hnn^IjsfzO#Pc-(2JxljiuU|8_Kb!|45m;}J}@ z#}H~!OvG}z_BPeTe>Fw-QESL6a6qhJTmUpid&9?8o$a>woF^K#tDrF3byZkP%b5!M3`KHc zV{w4Uh+L+u4TxX$c=N3%Q!woAazmCpfzykEG(`QHLDK8?4=MH4sedXK7LGkzWXv5q z78gxuH%`3`1z52gcs>2}Y%3hc|GbFmV@oVE4N}e`Xb_g!jjf!~P4}8&J&5TXA0#D& zXR!q`l6`?sb2v~ERI$Z3!vkElj_GOMsttRh1q1-P!%LIaqaP^70RZsOdvaW;8U=tf z{X+3LuatDrE0X>;jdUNi5&VOG5~?Jr6m>oZd+KwK!{ib6@YQd70jZe@aq+B-Uz9dc zuGMy?ewQkFS7b3tvbN-jUMc{=Bu?065%kQYl~eP1oMA!)>nZ9(U`z>U=y+ha|58;e zDmmQBWr^nDgSV@{RVZ~DO3Xj%x4~O&qYgn3?i;ewS2)VLhw6`H`HMgvmv&S`f)SEL z2(=W7JpwwB_d?!CHI+G+M1%?WHF5?XQhxM1du06_`}P8W4Q95y?Y@MUwtA0_MVrRJN0zw2z~(rSatu z#fnt{C|YyUU=CH~aT2cF(eLf~c(jI!2{jadKe+P1r5Y8Kwn(s{`nZ^6zW!21#0Ly^ z?2VMF0D(;?K0Y|hGD@10)~!T}SsKa>i-^(bD6cBfhTZPjR&f~J422WMY!AszOea1V zB?GJL4Lv7iSAc+$EjMMseQ8e302HELnw2g z*5}bhU8>^I8uiijNFkMR4m)d*&-<$T8TqV+l$JUrpJuFiQ=+*CpFJLZ$cyh;VYx8w z;v@7Zc@f6S4FCcZLxH^QtGQg%o&BO!A;y{+MS)?Fx1S4Mo>f);!HnEYR=RPL0#q25 zFLFR-3$HiJZ++|@LHQ~K3VjfIV` zRGxx(HkG8lB79pJYw+b9pc@d!EbDBl3unLsbmPCB0+@cewut9=2Z*lb;sCcV|+<`~@jq2>5rq<1rTA}1Y?H>=pjRV9-0m|>^ z`<&g{Q6i%&Ll5-h+b$SUd`F?+Jf8DEmK?(SM^~cYTEY}V^w8;Hn3n7sZU}>f)uqSr zaSzcW%xd!%Eu+h`l zTUy#t-eeOW5b@)2eQNsNJ!Xr$PY&}}mVzC;=SbDI`Bunh`<#`Dh;SzS&?crw*q-V= zUE56M+dmNVU=M3OlR29b(&m>t1a!mE)Gsp=9@pKjKi6&lp4|SYwUPzuZv044(v1Xn z2u1nPpjWQj7+rC3dz41%zqTEfs5bPRxNR_?UcHy})0fVpbb(G`0fisBG1i3dIb|(a z7$Nhis(-vWI)A!=1Y#P2TlVX0YYY&2KmMs2hahE^!xfc!M#FG6a z2vRmXO7^2+y<|Q4spfb@ejML>v+gf`mY;anfw+I+*DdaOI{`XmK;to?@aS%ne4S{3 zS$D^H0x9I6LT0VT(tD=5h6f#6THW>}cv*aPJ)svKPcWGz>|K)2jiJd&+}8sk4Uc8$ z{9gaWM-Caf;XY*i8tQ^RA&5$5_`Xg$kGazM9P(KYtmQ%4ag5_42-dPDX*kx!@+F-S z#QWBb={e~U{|-kCk||V#JZ_gVLL}B>p zDzAwqquWb`1tt-&QumnOkv5#MVsup$F-A+hRlBAx{MTal>}u|0#z>5j2Mh47UL z!pI-QCW=fO5pkGAmS}}pb!1EcCrKD?yitqYEenxdA!&Xn{G`t84!{#TpD?~l+#O?; zte$WL;_U~Cx(x)M6@j$saHxC1=P3}TB1_#+a~Er2{^U0sebLyaRw_`U>xn0Eg&@|` zL@cSKJCDSxK@Fm04HPJm{z~Ok{n<^OzZ4d!rze55zxFkNr(!N( zsZRm-X>9uGaGSIjHXyUCxW z<2u)oekufK2;wHJlNKc5n${)Ug{H&8GG7j5xDjN2s>5?WCeN6_ar;Rc^fNQGmDb3^ z>bIktZfr&Y0sG@CNn1GCLu;HUE}(c4Qhf^OTL@->XL2OrerP5gwFdTE6Loi!MGujE 
zX8UVGfJz4efkXri0?@6buBxU+M+6860Y0Blqu7&VEK#F?bc=|RaEKxugye5XKoZA0 z6be89WDq~N$Bn3t7y)2FG%ve%lnA2P*Sm0A;t+EQ2&bP~hIo%sj)pH0(yc_srUVF# z?~-xvvP=4GucnpipgbcEQP73lmV>1KR_~zJs9ghOtph^Wkj_pJFFCc25m1CgAE#)9 zdBe{<5wUD0!I*Us&*q}+x69;5+UsfrTok$`320A9$CO1pqo&kW2BbJxL<{( z1tmd3itUsnYAJN;dyW8lt98-1YNQU_NH)_Nb?9vw;43wi?}$f&HBk^zBnOR(6jArN zI7A!(SX6~@005C{5U6oi2cj=c5XlBd2IHw%``}NgKKUa6;FOpXqYb4Y81qv#ED7qB z7`U95b)ISEmqO|3CJw5haBTX3c2}i(2?&-;Z(I#UW1(9p)ZdW<5MLN~iT~ zqR7K_?RGwbD;qs^A^^Zg)qM{@SV!cXLr7Q(1eJ&FGy(v|9aA<)b1*TPG4fP(Mb77{pQPw>Q?We?@(bpV$1~`F1a9t|9k%#WC zAZuGSVCSU0UcU(smE~=e`Kbmc|5V3mSfE-cIRa@H_K-$)%y3bFhiBUDJRI?$=Y@Ni zhAYq_w1ENufaKiv{3_}$KmA^N+RwS}F;6WU9{`3{D)c3?xHR&&eHc*MX!OuHGRF8d zZ8^^0**N`R`t_i3%;j`c^-TJ!nHardd1E|B3UHEuP+1@oJq_620Fs@Nq2|}%nj@^^ zT@J~*iRJ)8R$iG%0D)%TbG~5m9_99hFq#}Bxogmq9K0fHC~i;iG_0R!1Zhbz>uZIF zYE|N^O`>mv5J(|U*18D!0Dzfpw);pTIN(Y*k;vjshYtYA_g~n%tFWlP23+7X0|NsL z-7vt=HFQg-w3IYRNp}b;!q6#F3J5AKQUU@ZJ%BVyOScN5NGc_BHu(SIInQ@{?#}hT z+|OQX?X}kX{(9)FKsfQUT*9E5Hgw1uAbcvtTD_wkHVUYL)^MV)FXCYDciRXrxXHu8 zC}r%%T*Zi`+2aMMDlRx*Z3H!sDw_B6O2+Y>pO`nMFPhIDF$Rs!`)NnW#I#&LwlwpN zy>2Hx)rxjJhKl(@v8rls#n9MZ`DEWRFoXvb+Fp z-N!Zg{MHc(#&<=-q%DT0thd(}*v9ZELYAP&Yp8gT7zhlFN6($bzzsl9x(>T3(8*&^ zzwxbZO)$Z+%tt*PVpkA;Jjik}W^@sXOR@xu2N9IXQ^bQtC?jEZO{ zP9ls0?bL9$y9FHu9eM&!K@Fd!%rw>U;;na0Y`S4kg4AVvSbM!Vh*A!|d#KTqI;uIM z%~{z*b!cPOBm?yYz55XXvIDJirl;uH##`EMoi~Ad=E>S76a);oQJ6zV2j-E5o+jlt zoav0>AZWGg z$*dpLO^v2)(;b8DU!Q z>u-SuMjRVE68>r8@>OhorqT;jV>fUrABwp;z2dOK-B^p&6NuVf$ zE*t|TYCU+-Hh5a#CRnp8BUcx8EPX0rV@P5HCesh#aSIrYWqrQF<~AQ4vwWDLlzHD3 zk`2k#uy2rGw>dV#QCjOIQ+FV7H&L-IbVe2J46zB{zHoHKKZb&v(6;Sptd?Z*r_av9 z?cX>s5IoHe++(Oj=`DO1=vscKQYacfzi0HBX}vAF!FIkM5A=M!k3a(cF6=|Ivl^oD zbI(EtXY>w!yfxYyXZk@joDBU~NByKgKN zuJ^74TtqP4H{{ED5r$dQy95Qo$;azffF~Suu-w5jM0E|dddGivT@8%6UNS0nr`4)t zo0hs?^mexn5_Hq4CsXp;FXs2ZZC$2r>X9Cg7aOlPtFOip;df_#$Lg#|d1XToDRWL7 z*F7J)1>`Fo!KD$6Teq^ibn@rVLlwk2SHIr}P*tC4=5O%j(wQf{HZU~sed2P0vf6(v z#D1faTc_BtAGT2uXj6uUfKI*OUNB%FxCRx+kuU`jWOq>~`$NcpT?S7G5`mCqPPGcS&h$QFL~}?B2s+F`yN+ z4$KCJ2LgtnBfG5;cdST9oxn}~J*mySsNdp(7=y+^ma5|ni74sDV<&MmBo!T3-Hg|1 z@BTs>VG2r%^E89GJfqPpST+Wvakd~lLLM}2C>rwij_37n2Evo=dT6FeyhAZGTyx$a z^aMxxU5enn02uD}&ODBtMW6XQBHvcB@JVQkmRM5<}c3@yuY*SSfhlqfnnyJS$ktYh!>VG#?F!iEfCoCT^P0gS%^PVI=-=$l#gz zDtP$E*ayYr2A#w8;V!;^IA)>JToKqO6R|ol+yU9-TOrX($JXP)e+G)8-cwok(7Jyz zJr7&l{rp0N;rh9w=7s)89()6JPMTSF0w0{kXF?5e~YpSuhkHHF8dsV znS5q-o?H8z|67#p)n#`u>Mz#L`Y%y7{yHv4q2*iu|A?}(0mV!Zl%kCoQ@%=>XQj~rB%w9_D+hjQ) z%8q>6gcldPOk9exKl(Ur4t9QAin13Q^wb5=j}KhT1Zm zk8egE(J=>-(>#29j~vUb;z_$TU1D)7)@>}?sr(&Ru2-{#g?KcS)zF5`_oUR^*;&i* zX~?}27xf3HGCTz_4EHv5{1Sz~ss=43@R+5NHM*z;sSxlQrY9=#sy%Tyx@%6ROk0^(#IDEo&R zO~vCVasy!%Mmv##o6UVpiQ*7?Y^D8x%t9B-9$B$sY^|1}=K5o^PEP))fm~(bIw~*W zE`+XV94F9U8CUa}3)`=$4SVyninWxyy(Zc7PB%-RQ!%;tDyat$ds__oELQbOf3njd zEmK-e`cJK%uk!@QZmT*o4X=I{8V7Nrq@{=@pbSE*vT!Bfi#+jr6juGFKs+Ii$6h&> z6W+~|mj9KlMGMXjhfvPS73)x)BxIDa7m4IVk@FFfaG>^SlD@~OIud4t9Ag;Fjw7`~ zBMf;Eb&UPdP+t2&0(PfWjuK?KuDA%ka$^R(r>qaLkDPeb*uzvJ%}q}t^T64ba;%^s{)u*=n>J{ZS}}goG@$36>pmj@y4g`ZmW!#qZU&gn7Up z?zc{|6yD}=(`)H(V0)7K5c-wdpxGj$35O;v|BIH~slIQcJ-vXIy4atH1H?92@_AS2+9(TrZ`2D`Cn@6B+}%y;27u_0NYbsX*xjF4BI;&a4prk`|0L%Z)EMT8D0PZm-Q zTW_S9)h*(jv9l!Q;b;$xK1$b~?WC>}efVzM*{S#LhrA6@Rz<@^Wy+@%>+yZkOP~!& zcvQtxG_-(zk77&^lh0MFL9?W5aqY&t!s`v9!F@SyPTr><(bdj0`ye6K^3cmI+*fw6IPzG^wV5R~s#4lDTfsbT;-eWX#i~mFCYT2Pzfm zHr2>VL641-psF@V8fy7PSaS@~_bAv)DqQRwnZ(sq_0~!HxCqPa^cLu~*E08{o&JG> zYA=FG*byoX?V6$V;d%sPOR5YSyQ;kAc@8j_ll5D^b-jzac#aGhrn_Ya6xu!%4%y>J z7Y2V^SWck4initIFZYnTG&2t z!(6zj$e}W{oLICcfpXd0!v`$-IggK(l_6XcC$-bw>Na3YoIC1W7gOWs56R!&G?BM9 
z71Dc7em;R!UhW&HfI8g>~xw$(~g=IhKT2K-oGmTe0PVg_Y_rYGdb5RV@jKXb5) z(w>=I3lJ)_9V|A@;^tGWPKa1q#5L3lS@|y^+XE1? z-STVe@BtjfaTVc9LRR@2(D04CB4kI46f=8aa4D#n=B5vWOBX9!_J)e-pA88aVjoKZLBRHs4=Dw##%W0%!%- zOT2CK3&5VFyF7=Vza0|;5LXxNn?yIXA6OzQLt3lOqkQNAle7F9BM zNytW70)*`G?Mp%y_ltAnUqTivx`gon2-!eA7gLDqKZImgL01&b;Cli|Nw+pDqUwxFA=@TUGI`$NdfEW*gpM1YWm zh^KeO>t7PG9snUr_m_}WxlN%Ay*b3-sb&Dj*zm zdb%4U`L!@0OD&d4R^ZMqOV>WSxO_VuZ-Oaw^ zU>c(_&Kg|`q(TUcO<|XUfwFXV5BZ8JnF$`x2DRekc-mu?r?bQ!ClUQ~(YddHO#xcn z%*cKf<nUY|=IL#JdT7bIv$-Ipr~Ep}7@};nekYDoo|?34x7Lhu|7f&42OsYt zPV=dnR6W0WG|qHS5C2T3{$aQO4i0~359&Nxau`kE6sZ#^pj`}R1jW+iM{37Mx{*~K zghmlaN7Q#kQD&>jx<)8&D&5}fqE=H;1S*kHxn`>)@^nlC3=!I!-Oc)zL#4Qq{?o&! zxRXmg5+;~w|D=!KaKrhqk$F#}$J_-!cq_l2R%4!gWv(gkZda{Vgnv?)hcO@(7aRWYw}YafTx(}#o3M9vk$-uYPTSb6<$-Ncgn${9ishWDwE#cX z70kHc$R&-T)4cLylmGg$nAy21rs$gbzkX~W;Kz1d`mvhZIshMwV#h7I;$uw!J{B9L zb?L|A7>Go+{q=$AKNYeFF%&{5#}F1mTX`C%8$j*Rr_8T%|lkl`gM;P@MFt_h4R#{ zB}Y)k|6t|br)%}r1^n2aS{lzfVSogOMQN=>-5W;Fm8ue%b!%!_^6P3)PjUYLfr>u>c>d+*SKi+Y>|jTJ79< zRyqs&rujMe>^XSkA3s)=TLSQ7qquH&lm9#rx_jlvoR93QK|Ur$3lMC3Uj{& zG44hZ6nCSBJ5e(cQzWs(@hU{cs?x)eu)H3+=zj9dXr^p5O?)KmdlXDMLYqiTSGSwS zx?4LChbk~i0ygebGG*QdmPSQg+fZdz0@DviNN+*t5xYC~X3U|=!)-m`b+|(HXtqGe zbbKW7E`*vVQjtf6%mhbJ*Hb+8NXzNS)bSxnStrX3=6yS^Kv{$ivbzBnL%slIEbFAJ zuTd)et&2u%kA~jj7s36DR5<<@u zsp=I|BlrM67S&CipUq<6!w9OEYkTWK9wjk3%Zd#AG2Vr`$A*?tp%V`rTdPvk6~~jR z(*pe1=)FjfZaZHsu7x7qBHXVuKfJR+qot}y)vhru^{ zzEjMvCW$}#)T@?Jjr&8ze-ZFw;k`Y8A4_-T$L4uyCq+7IH)fsTWap|q$?akB1^igF zR)0-)#tGpk!AHN3(G`Fn8+p&9o14^!4)9|;P3E)%xofKU&s4p)A5pVg&Ym=W=JBBh z{Mb~A*c>UWfU=atTdjqz-jk@nxAbeLQDvm6fFD~CP~p+=Gv((yxnt>&kCo+b06!Lo zCkI#lW$=bzcRF$H>6lc_Jf$sk3bFh4Oojc8Sc@*;#}=!RNmtk}v#aMuQ4IRhl&5DKLDz1$vJSdQQ^ zL^^eR{Bwx*sLC6uvp3gewKr=OkvP-CAjbGeo-(g);|LfmAmasYw`}N=|A3UPz>U^- zPFH^H*VuPWE?solSAJ|$l~dOFK+9db-gjENkyDp`EZY9l$sp}HM)v+SzH==ExaF@O zTY2fnB5`Cc{aDRb9^J~KA(8DsDL0l%J05F&X$g8=Dn7j(eS>@Zf?}l}*F4^LOB!bM z7n&<%qRJoLdo|Qn2H>#uZxq8DJIq3b>k>BL9 z3Z=Y$FWcM7OscO&39p$1W z52I+5R6gQFcj!8vQ;Ua?6(IGG$xdYpy^OfL^b1Ei<9sy4ZE2_zlpcRjlMdh7psp{d zve$mhZ_yZdLgB2NBjz?;f0xol*UqZ3pGra%cQ9AXnZAd682OrA=FcmSH}Eg4w}igZ zzw4vIdYD#Oy}R?^u1|upc}?%h9<5}0nqK&*Z$=rd7{ zH{Ktm2d`2Kltm5tsm>Xja@!>}=tLqvKN^;f8sDdq6R)((<#d+0&phq3k5tURwWG68 zGe<<8BYK*_T?l0iUq4dEslv5pA zG&{~nS$DpJZU^u45PkT`oYT85Sl%rE#ks5ZABI(aNt41;kjd2kz_Q8~EB zKtmRdfU8mI7mL>~PZLa!l_t5q&LM_MCrB#&s%6P@-(ST239N zIInPIl8NkE_&Sp5?^X7~o|J)R=J6>(<8V@nSO1J+$+IVO?r-K?07BLcAY|(`Rm&ua z07BNMLd$?iAoYrn&EEtFS>Lqhe+AjhwHQKm+%T z`{JOvHuiS#lY)x;khMDJgqSNrmcoYNU|2%xl8~LHd{HPBSC>0<-G!Hl_QtJeN7FXp zi}@CoJe%vzC@WQDx3fbc&844<&!ZIm62gNIV6-_U|YdaH-<@`;f^s!Za$F80pM)#FqJW z*(LHvcZ&(Q8=;h;n;;*@RQ;@WsuEr%<#?nh2Ae?@pNh=3yLXY_MPwpO#OuCFt!b1F zeG|#RUgpgAh&=gD)A}+t%ukMH#FG#fwLmE)G3jrb5UZv6ac_@!bt2L51;^2HgPM`@Y&#{Keu{GZ>y zDlX?3MAKyE|IXZ0<-YeJhB@##nOB(x@0@;$bA91aYndwj+*o?#WD#Q(kH+>uN|~xdji%ed<;PUazr&L8!54mN0ux^j|?2P&UjN%FX$a=I(eiX=*%7;A1 zKwjNJLoYx*H)Vwh!+oh?7b_6696yq3e+-M#$~PGe%*}Gko8=aZ-)^=MPQTV~C^vh| zZ?=7JswU;A{GSr3iuRkYEkAGDlrT(pf03w8Ea7*Q-8HLwdQ|RubNlA(H`9KHn{`G% zJRIqhX8J$etj}!lv=iaKIYKl2#uqB!-cml-Qh2kGX*bT^cxbAvZn_%%_?t|&$jpd# z(A(!nUaoqw^Y8I~*Y(EyaJ+pszjS)k_+m=H$D_r1j*zCM>j%iM(IoGlgaoY|iw=DYhn@rXP33BKJuA$C9Z7=!p5fd*ii+?Mb7| zd-x^y5{2?siG#CySWEp9Zk(@Dy>5Chq;Wl2OU{vVIRsy2+AmEY*`9b_x?hHCFU2sK zggQfdK*F{zIquw%N9XR4$&MeqM%O`r^6q=r3*YRA-*3}@RTd~Yct<9+BC<%-$Chz~#GPjprZb+S9~@M_Qm!^_U~5X`25aJa~WTc;4| zjx%+^P>V%(Gaa4s&D5hxHyaPDWh}#|OHmfkd~ad@rznemf9iE9yLc*}mj~lR`l^s$ zjqet{e8L$QR%w1W4cK@G6tT~JY6Qp%JCdLdE0q9VKf$*eaIJabYJRkAkcf9r zf8Io+U5N2Ri|m*CKlnZ{v-62lM6C7bOSRsfiMKUCtWPHf^#G!*BV!(sZ@e=8IpcRm 
zaRf1zm`dUtPpt*rR2{SJEs9o#;})i;ny2GPI|oZeF|A`9;(s*H1~b zYarM}1dQJ`met!Jk`D1_;{q?_<*ETm1f}&fui`x!2eQZq0S}hn83>c5q>w1}AUm-$ zO88Q+;TY;3rAZ~<`*0Wn$7|vNRLL%KDZw_Ro9OJT9b30Z$>U^qZsvV^!qZh=QopDo zFp1PAC@#43%UPjgrBteln09A$6`}MtuaAb{wa(~T?X_)zpGxy3qhQCHr6Z2M-Owsq z64gLE{Dx0wKT8bmx4vXPwqm;Ie0b~bSHHPk!0;dTK)9&0>SvfW!xiDS98wJCLs}Ysn=aQ+LTsvp(4s@FRmJDD9g;88=0|efg zf@79C53i=CAYmGefNFz?#ee>30y9N|A>absZ#N*h1;2{XL`|J!zl6T8kal0oM@0I< zh*`Hh1?eIS5ar&>lZ1@ff_M_67s8bg^3iSaRA{&Gn%pj~V$I8hyeE|;cVyGGik=VfW0eT{2=Ood*!-&fp$iZ>@h_rZ<|$YQ5|YVpU2t;pq17D@ z@XjFSGvLSa?f%{uf;6NM{nWekV|S}bboLhd$p8rIUw$kG`PYvfET&x$pbn_L^ke;# zDw_B@bPeg^evr6b@v(M-wCBQj5)emt*$BL|1-nua++0$7ZXu=%qGp z5Oso3=R4$V5hDx%#Ef=ip9aDBF~sa10_-kc>~2-;!Ng>EsfREjwj2JlfFDcY&vibA z1pHX10-8QGvXFRAy_%+tD?j$$ICr=af|-IdPLwU60CDNZM(||S&7=z=e8xFl3I<2uCz)~T>4xbR>4v1rL~?>GJ#3KqugfGp4O*N=^mdOf%$r>}T2q4?ts4y1k^ zuU-**pa4B={}rGN_^}&0P+b=`mH-#$qBl zR4%t8YzO?c-wyig$Igg7s?-*W64y+M;Pw#e zTsfS#1phG?!{?Gf3EV^oOX!U}G#iECRAjj=J*W;dx(7Gf1kSAOjNq`p((4N}05 zRi>HwFF$tXzx~(`*m~* z^*CV-vnE%5?0JR}RZM3!rX8aui9sd;eR{Wr zXq<)woT(dcf8M_RzREd(((%=x!)Pme6WabOKlTIlW4i*8fBUg$z>h_`0e)=y z9r?0pcfgPB2mDxN+>gfU5)oxPC0U~O6y7B)qG07GvGB$z7Fas*VohwNwicH5i9%c^ z0_J(xO1x4S2o1UELw`r-OtRDe*DS=0o9L5o0LTy9I8XOmiQuj8$+Qpkr&*q0U#4KK zw}H?P_J0iS%t(Lor=rV1U9lSv{H2>3j|Z% z%Y=JlSd1oFcrd75d-~El!cGMa+R7d|pIGvOL*2&&T~(OsrsPTpWbWY5cde3=ZIBj5 z5Pwj2J7z_R7typMXWIQyMwWDS*wfy=vO!+V@O(6WYl*QLi z+*q@qXI|)&Pv~%p2+p^&G)-vY8O4)Pv$uMYXP^97q|uKAq8PL7-(w=Nxks}K$Nt-a zZM*eikR#4}pIDqqqb_=3-<{~6n$oz&_*K6R1^z8C+^9hpPsb}f_KWCK0FO-t@Yog8 zOFY)z?lBjU=@lM3BZkD^k8S)Gs4OjYb5#|P8dDg7uHZ;HT}m-l2R`;8=*5I!U} zfB4upIqSzmS!BxB;FQ^!6al(NZFIUm(?atz5C2F%VhVn=Z9cnmoN8m1imiL}u_=Q4 zP_5z>b6wDlyR;h14M_2*OFR~|9^#X2bdOFbq)F%yOfZRINJhpxb^%c7jR7rE|5(*a zHdrm-5|2g4*coy8z~C2jsY26=_JW-XfE}w`j(wh=t$LKLY?71n4kzy<2P6J8V|HdU z>1ox;Q!VjaRq>gocR3v=+#)7IKPpUgvhrj<^xRXlT8A3gEGo z-RYNjERz567v}bS=JI{M8w|0P?zgw@S}K}+i`J7U(IyNrE{z!1^SCzSeQzc3-pdDd zFJ+}ap|8DUe3zXxS2`s0D(;kMvNp$3_lwJto&YO)_bHdXBqo|giHm(RIJouch_lS<6&z>yQ=d z^H7VAK-+uS_P4oCyp!5KPth&r?Thp67{-p-mfGL*<~}1Gj5M7|GAsdZ2Cwtq*+Aa$ zFLVjXH3c|`I>*KWcC6qBPOSAcPO>_##3yil#Cv1w_X8_Lu%m*TDohUNQO;}3P7Cqd z*IPcH<$Vl<9yhJ1SoOtZ_C2tk9war3k~4|3PD?`cr?u*F*Y{WtwoS}+$e84&tXAinPlE_=?A`Z?^~M84EC_t&NjK?O>l0#4=k#Kj` zUP?})z*T02S(Y#PRL2-5U3?n8KQV&xlP3q|{&U1n+Lo|VWe1DXF`3JIHp_hrOMaG! 
z^FPa~Q})kJrXu85NW)eL!d6=t%4b`j92RK2{S-@rfqk)y&BtSrF5pa!ApTa0R0D^| z-DW{z2zo)BR1w5A-AK(a_(dlR6~+cqdZWumWH^9%CfK`sozR<#b!~B67P-puc6IxK zPS*34T!pQ>G~bGBEdtPbuWTn1n6_o-zuBKX6L|jQ$^9o@nk<%iF}p8WPV-5h0F-tC zl&XO8T$RcbeG7Z8P9B-h;H%70xVrNge}`QSdeKP|8gnD09oY|}47KOG7mQ#~os+N+ zei*it7Pig5yf4IZAa?6OD%>7p`{)a+l}l(!C+m&nw_yMtYsUu5bo_XU$J+1y#bcXa zBcmgB>En;sdv4WZRDSjBibRp;OBLW3WPI&rX^wb5e|>*1Zb^pawCL99%NM5=%V>GU zM|2pNbZE-%A<@#cFe+-U$S#&tjAD}(RelpPHG)vB^NHUwvVZv;>!K3k_%-}^cVvQ> z(1D%=g9Jq`BeX9FfjGNxL01f%@!F**s}{#}ExsjQq^S2HLiXM{OtiQ^jYH?DT!QF7 zqHHLESm|(%xUp$o{9jR)2I1l~ApZgoW#7y6!HSA>a|NSKZ@>7!k~2P%z3MYe7yhUI zq{7tKC^fRHm!{aAx+2B(qTN2|+TIdHj8i3VwCINyFDx(CsEcv=R+XEgS4}Guq-)pS zCo{^1!DZ?;2Kaqw+LLEpjLdUngP-HS?-%}9%xF7b=l%6^Yt#Nct&>Jl)g9-VY>}kO zc$yFARi*`+0`tB;I<%QOpW^**U3e?YsD5GY;Ix<<&f%sj*!w|DY788!w;u0p4CN~N zyutZ>l$}LCF#f3R{8>-r>9ZH~w{E;5I^xg#ZE(#f!GR|1H`4D$B*|y&y1vSvGO za}M*wJIC~2neH)o_uI!Yhr!l@pV+`Y$$5qRigD$rJ~rUc$m?G-JUg`i5Gsc2*F$7$ zlYN6xp{p!INYS6h(S<%X-YH#MaKI^JOAsbSTNtdw`T)BeU-Ijbl&c z!I-URO6R9m%v(;m-t7b~dH#twB~P!Zi!wN$L{C3_hDwQwCVwERp3HTiyzWw%$lx}X zm!d%ER-7(r`n8ZOZ7g50ds8KwOXJvnE2gG3>!abK(_Sxdhhbf>th!yBDmUYUwntUt zG;rILK=12E>7mKOAp!mOl&+=C{#kM4QB;APhPUkAGVxl0LOdI$1+g64Rde1$UT>Md zSb7CbXF7~X(=Z3clYh0{o|QMFD@b~+_U?=lHO!gGZJ)Q>=iSllyzSKqA=UMDSh1}b z5?$PVPbICPWLVx&tVNm0qV~(_ATMflc$$l zW~l$Np`dN3sPnmSp*4rvt2fasVIw&1Dgn|BA86_~hU*J?Ti`*HMjk|O8b+_(#Bb1} zlqBUGbh~S!5T|B7I;M9Nu{rG?cXBxC-9h{|_oh%e!H0`sN8DLmiMqqN=yw@A*4ITu z`w)wnq929Lgsn5f!xWhv=lyQz-~GHfQsue`pHiKX2v*Xckq)!Qj+of#J~rzAn*TcG z#m18klKbB?%ior8^$0^gAj2Nw4610;Y;%o-D%;#qO=0H=L|@>yE95FZrY3!32qXmt0zqc5NDWC54X|CUKmWo9>vQ20 zy7ma;ld=~mx_DJK<$^dxRQ#9o+CI$SoMMz_-Tc*QWb<6!|UDT{kxX>{!idIdmPI;BzhVex?D73qgNR8aQNq zlv&?AO;U@7ld%hj67_tOM3{Vn)ULTtSom|wagQox)4_FtCw}pM2uM09JvEg(*RZcq zv0-uaJZDRPny9N$a-mbNdgIuD;gbokQWP?*g9qILpf$*-ddk{ z-O49>@lc>Zd9Uoci=u~VLZGTl?W7mS4IS*f#Vb?x$~HF*ry=zq5~a*puN*HXO`=?Ay>j}%1kZepjH zc-pR21nFaLt%RM2Z?8Mh_igpiYKiEBYrxa#wba4@+31Q=sQ)v=x-!ameK4@HMZ3}b#@j<;DkdwMxJFuT;M|GMvaP`q{y18i$Wc}i$LDzbT;~g1`+cgV# zz4+Cx->A7Dy?4F^=Fs0dCYCEX3 z$*;J29<-|tBZ!X2KdFT22SI!{ki+fNU?B5yStl%;z6m*gBEa%MiEOd%S6$OG@y`Kx zZqP#*SIg)1lxsKkmZseAv|@>UQ-^bx&r7rOHBERT`vw=mK26;=qQ29w%vR_3xPCPS zl$#jlabJWr^{q4cjrw)|7TI!{}Zt?AMyJn`onm2ZHie4LWfA5BXt%hfS|~$LX_|9=NI1 z_^v9HPx?7FlJ2kUeYIF_o_{lc)W_Mq{e0`Xob@3u4pU@3^c64@0j2@b(o4VJR&?4L zzlSzeYX>7BkJ|IwZWlwg_({)Kgl-LVy?#iYr`hTf8xasR>_SpwPQ$)4)`7HL+|Txr zKNWv18Cstx>$##nGNjHHwky9^8Qk{M5&ynSk_xVgQq0#J$aW0JE$*R?fS=){4+f#( z9^1&fZYzd)>)SmEL^*tFt&b>o_;Jr`!@>?fVkdY3J9a+jy(yu?d2~Gv{akRoDfRgI z`sT(3!GL1c$fGc@xzz?2)*;x==NTZ!B9VX`OK>H}242dsdjH6=orn*(dZgoWIsSUH z^?KiSBNWHvI73Js)d|)+BX-F2I5Y`*MI)$T^5g6dKu%aD#2z=>{uE67^$vZZFyo#f z5@2J=LxTJPHWo?sA2znekO^R8mGEW_$T)+Hc!OdrLJ-r!5%&WFF4@>!{+rVHWRWqJ zWdS-h(JF>YSR6TMY}B>*IIrHQ?JG9+T?4avNVIL7oE5;vrb0^q%uPu?c@oci6W>AI zRQ({H5MX0RxdAp7nucH76Am>@m^Kl=pQb7s=oc3hcegC@t>^=*ZY+VSQo#Hb8*9Nz z>Isz2r3JoNk;}{{XwNrVa$mCxsUfz#iMb zYD%1!=B?UHd?3uSyi52TpTr^+`3E=oVueV>&w2owm?};3JSf>a{x+>D+$oMG6S9Mlv0@(+f=StGT-E$y*NPqOd7s$l zJjC0|bRm#1pjSxD$MEqHR0z&V-3tQwdB`88488mif^2O;Vp%WRK``qAarGWQaUF@d(oh z^bN8vgmWI>%^_bE#GOt(@68N%%VjKlQYnUK=7)QIG>6R&Mp+2a$t;t#ybqg*BOkjnku3t=*V zza@BqYYb=Vgb;FPjq^f0(+Cn?L7{$mcUE#Q?6CisvBsBvEdHM@(!MRy-xg$NyU(D? 
z(1t5B7Gz0=lSp}YH+cK5&!rhl_JZ@9z@-^$_u@_}8Nrnq8-Ma5)qsp|l;k3n#M2J) zGb>-1ujsS@hI&dy@sv!_H%GFQ=#Ekm?Y$yOz>MvbK@NJeeZ%8&FPXbCV+pU!Sfka# zf19z|mPLRWo5BmRBh0!$mO8nYW^a|8D;Gn-1eT-JfEnvLSS-Uzq`*~Dd1c0$C6?7+ znX!c2q;Cr#yDNqNGGmGF)fA^C5;lt9X!4M@_&v>Zf0aF#lUklZF$WRn!;)UJgFN|0 z`qZ-MS`MX`uzVDQ$M;b%e`;A91MfRtg?)6!pH3kXFy32#J|pX45?v zwtHpm?p40&C3eWFL?c4-`l_^Ec+2VYPukUbM&!f0RS8CETZ684c4eOlUu)Hs%?(!H z87IDl%se`EY@P$(WO$7oWKA(3|DON)&RP<5==N$eBn4DM?7_@42i{DqaVsRX7|o!x zvO^n?Uwo@!(xJB)ETgO7y2V?^R$9xHio@}(mVch^eN7pwRh^qdmPoGsYIDVn{5n*{ z8&N~}4Fhs=oi}b2xymiJ-+k+(053MbUT3IYZ@c~mQG+3WgRxG-O%H5?SyF>VMT6B) zgUxosEu#O3vUY#L*xfgW+V%G7&wZ@w?U03v9&dyHCCWb7ej85I6v5vVrPCDS(G-`| zlu*%>G}QEuC~HNI0^#BEzja$H$g--hPL>>YJ24j{_Ds%Wlg!J$@eeofR; ztJ7RM)Z9?fT+M$e$`xFKel6OvjYUIx6H_Qbvn=PcSLA0d|#>N2Xt9P2T^6=_G)Jf1BOBu z^CMR*Vi05Y={Xq+^J}}AR96(s;V8>REC)phP#yISg67H!ibKlq0VBzF5&MF#Zm9|tr>KayR3(q zZQ-4Etv#gY-A6jT+MWU*@Ab%%6YvDpYU%>ffnFQWUdiEZT8B4=!(ArH?cc%gjgm=B z7P^Da-`}~PELzgF29Z{7E2yh>LDzGNv7Vl>dZ+MhApkE-kA`QCRA z(f=T{TG|uaPXHevZXHz89iZJG*q%B7kZNZw#$ zqB?^(qpyL@)>Lm0-L;;BOr3qW)`J|PgY2Or<(2)oc|*WW(t6ZL-T5G;+=m6!z#3xk z!*`+IPhZRbHDgonIIoj9G~EvC;{OZAPTjuqeSvEAd}KRq1luw)6FNAZH^@Qp;Y;g> zjfD}{&<_;PK0v*O#t>tW5kY(_s$?ZRwY6YEWrC=5o;3$OuTj+Rnqb5T)tlroytdJ! z+);l2(KD-wW93n5FU;rUAvv!x7-E9{&jjO-@oA&lb!4sr>)mz}!Xb1<*nSd08kRIQ zcv54vUsYgYva&V=EUA7OVHy;1`RpDL&t*{!C~u+f*bwvWlPP%Q6rM(_EpQ3Xtzp zUOgb+xab@@@Vx^l(Po{zM5`NCJ3&BxDlz?Y$j;a5E`^;UqMltS(2Ti%RGjIXu!mM6 zbdvNEWbx9+%U&TG_FGeAy+!$ofc4JT@SUv}B;UfuRfMQiqdJ8{46%gj2YPBwY8rq- z9Cq77|B{NeDE=S!4lKPXRqmsg@G_k5Q%wb-`WmSX%!PunQZeV27ceu#*E|2>LlfW{3i zAi+E)B&#-n-{kvkhikSXmnOJzHjs~)0g|8hM-3e>^2Xm^n9NAwg< zJ`%u6yK_KFen`n0W)!sQTads&LU@|L$`Hocz6-tOixnD5#W%}WeYJy6_l{GxnoRT~ zQ??QwgFgIr$ttcN>F>|@s)llg9dN-7d@tlJN*~_e`L=2Uq(9+5V1-Cx43Y%G3Ke1f zZNO^QwxhlWyc2x$AXeE&l&9$53#;F>PL2syk7f1}u6?2(vNL5PCHb{T`$)rOFP~J$ zj`(~u!@21Q_w|=s;UYuA%imUL)5r);k!KgFgkhid7#yYqM`|NnpV!!QPevG0t1mu$&8Gxjw}cCv5T3MI)hgY0`kWRxt0CQ-<~ z3uOyg$`-N{g)rx$-oNkXcfP;xx&6*Lx9{)vJLmTO>-zWlq}|`5?NyunLw4XdcRda?o#)_Go~`B zh(7Mg3gWJOE5-M1ID!4S`!yh$J-1S@FsBwVqK)z1nfPh$-9+9!dTV4@7dzkZ$%F0pL^C$^E(Wqyl zx_S4QkQOYg#kC3rc{&EyAJPuyD|fFzUd2w}UoqV{E+PGLcb9Kbn)kOfvDGz>7g5$) zF!WU$Y^NR-`E6MYX(o#9Ct2JLJzD0tASQ1HG1*@>!-$}Q{?0rF^$rm(wM=+gQ4jUA zTGvOuofqBmZZ96Z)_Bmb&Rp>04&QrXU+d4@hM5|J`9|?hZ0~Ly^>msJR zyGu(y1h3wqImFNv+#@O$r46GhH^S=0I&<+wG2P6Pmknh{65i%N8@=KvkzRB@EXIS6 zOBlx1jhGa4e5h;DS?Xy^`YD2U#*O)Bv1VA^MJ#U!V}B-{VJ6WYd^I{?U)0T)_0so5 zSk4kf@7|-Gs-`2B;yHP70-8;YV%V)u2g4P-Aglj#2_K6~5zY@{%(Gm4TwL53y!x_B z_3m02B#t|W!g#`CZkB;Rg7BJ72MTBcMrs=3t<=y)@_rbX%Py|m#7 zwprL2eJC#O;paR6G@|V}+xFVHEe|-LulawNXdNPh0ygh<>Qn7x+)hDt&smvfP|jM=tE^WKCi+G?aePzg zdJ#^3!@LH~zC+t8FoKEvW<7DJ;1VguAeh4X2q?SbI9j40^p*(lL)a5kuZncql@$m> z<3Mdl6%VDky#{FPr`$e^4!w=;7;|L`UOprO<_6Ul0gAxvo~;=jbu$D}CpF=!SuWbv zr<4vvQQ}obnN?*h3+TN!mOtm2w89r*dxMa@wM_@}@Yak8dm*fpBe{Nff~?-{y7#aI zzCl@HBNn2jD3>;_S`7i9#esemFUujk=Iq#ly`#hyLc}g_wNHPL3c2)A3ib-1B%I?H~nt9C2}*8OO;Mju0|d6qI|MkUMlW}}r4*DaO5__HY%T;&vJG&DZ1 zWHppIT2-v+?RoXDC>!PNS+0P$tcltlxt9MiV_4#8MBmhUM z;*MsX7`U$$@Px6!UF*WcU~+Uop@52qp83Sk!_I&rxdsoT+Y`h8CCXMe23FjjUcE+k zcf3ROR^`L#H8<_M6Q3Gyy?Hym{=cs%%kdwIvh!f#GceW#L;#5c$3`Ghk^$oTz!eH5 zgHcja)4<_0)O2tfMtV9{CPrpvR(2L_1y6oAmIe9ucIe9tRbUSfrI_K!P=JvSq-f**j?;#lI>12344;V?@=PlmvD|Oe; zqyL7C+0BBR{{EJ?NaokzWB={PE;IT5p}7CmU~GO(BCa;MudZUTF8AfT!JqHa3+w+A!Pu7nn_z6u zc+>j$`yZdX`X<^ACpreEx~8VS?o9QKOs@@nd4Kq&clYby=>KCdcH`T@=JLwc((%^P z`p(Mlowe<~jUPXEj(+a^JlOj$F!n>kyI-3#ZSe%|OO5phKbD7IT>IGA@cY-h;q9DD zO^wIDcV}0>erzffrUgNSXTnJ+f;7UQf+Yye|82n7H;L57KRqx|Q6wuOLo~)7oqw;g z#aLlBgTYXi#hbogm3NTd@F}FaEnfJ02LH!rBHlmVi)Z-kiWITmqo-13EPe0!81F;) 
ziKVKC^1HDSd&{)Z(5-`KO!+MoZse>ct_+&UhvFeLBING3NzP}6>n$Z81d81yW@y>? z!i?KLz+#B39w9RXh}n-EoKK)+5pz)}+IAXlb>fs-1P?LfgD=yGJqsmxgg9iG2Ya$l zL&d&FMRd&L3JU%dMF0gv8sOfO^Ww2cD}D+Pn<&yky&$|#PJ*a3{3=DR9X06)aT}2+ zQV@5)?tN4OQ8pqHoD(HS!k-)%H4qMZpGi-;udw}jP;#o0h?oXl@g3F5vnCePWg4D9 zD6!`r-ABQPw)2_BX^<(#?JV9~iL8g0NRIt@*@w0phb~@+beGI=kvfXAD>y|@>MZcG zd(^q~I5DCA)4D zgkuo0VN)6YO*x|m>of9L4RspJ3~ed)z}|Zyj1LwnEuf^W=yToYawr*@=Y*l^_q_+R zPlU6eKY2c+a~5*H{x2~0(IoFnO~Z~|ZKhrUMM<_dh;Bw`a;(<#L@u4C5l(2Weav(V zlU(}Rv?F1HDe~U(1?x#%vU0-JrYDPiguE1|bJ7szBwdg&$CYT3(u}|kmd%}e^7aox z@6Z~QeILjwc($V|Wm+DPHAA+WH5&1AyO`t(s%gtz)b87Em$hC@l-t@YE70d$bD=`2 zg)Efp^A~{UqK7x;VX^g{P7l`oQJ~(Gy}Wmke9=EQW)zj&f3?Jm z^a{&Mrq`*(_x^`q>^rK@b@T(r?hD+}k;%VjpLf}O*m(H#IR8YGFPw-?ph8Lb8EfI) z+Er>^QJ4$a#e>PIqZNC#t;@oU!hR1id0f7~X!~4?x`OE)U~} z_T}6m{{_Z=c`Gbg{-{YTSfLdb0FRU%p}^vHyZB2={|k&AkA0#Lr^Z9m>K9j%71z3D zQaB!0u^)$LiLX?Mud;}L6BJ*Q6<=2qU$4OOy&1ziNW;KF%TPnp8I*8eE@9O_p>sU` zgAPv#Ud^aEj+K@~*A?FC7fp6q>eqdDFmHqrns$N-LARf5~5foAx`f-dTpdV!Of;K1iRCe~n z+PDiweuyn8FwKvotr-$QsXJWJm#A57eA@)5h7!Rv&`cqz0I#cx>;Iotx45bxTTc#B_xI0$GvN^sf5mKAB*+ByS2e zmlOE4--8cyYB#{-5|}jNf;63W$R?kzycBp7tnTT77Lg%eAeJF1Nc*q@{=VxjZmOt! zk=%GL<3R+t7y*Be088SI`U*Fnoqa_`kr|22VJ>adxdIxzP(6OZMt!;6i zhyYnyQ$vu2o4Ch!R^NhKYKm5AaAI}e{3xhE&p?QB;@Qd*987ivtcs)IPN}{jRg?UJ zMnQ_`B>Y`n@4LqMc!uJHM@l^GGZiOrG78*_b;~*u2O9K#Q$2ZtlfVAEtbLwlxB1v4CAVRsRK`e}5qW`V}TrK2F?11NgCZGt^$kqm2WD zD_i8x{AdnvGyv8+gwJqnfH@n&Hl*eRD#||lQA!z&t;|qX`%#Uo%_1b{`0bhx5L9?1 zY_EAV_#WjYs?VoXtj&Kw#d`lk#oGU+Vm1E*D%Q#H->BFzKj_^Jn%j?;?gCV--6p#Udzkwx;pTlf5%!^C?u`#uQ^X@}@kpcH_a(df1)>Jay(fE|l^ zC{C$ZsTuqL#2*Ny8jt6LC`6{0DA&nOso1S`5s5hZoYWJ#Q`x$~Eb#Vy4G zRP6b`RIJb@b^C|W%*RyNt&xppn&!te+xFi9D)y`43VvocGZcFD((Z>OcpDN@DnA>dYztRIx5SAkI(YjD_^#M@%z1qr>cJWZ4ge9SicP$oH#-3&d-qXhEZML zpvi$R=@0A>(|$VfqfAA>M>&t-D#xi(f52EArRdjXy&)MF_$?UG;n?M}Q@ zVMklLmn8PAImx%q5$+=>w?UL{1BdZc@HXClp2bZ&c1QorgI=K&oLbm@3UqE;iJ64O zG~^qfO!A;y43FmF3vHkRnhw*c_O)h%z&Gbf~qgHb0qP(cw_eyrkP0f>s4e!NulXmk=JdM>cr3^ZOMhwNM%TOm&uQuo|{;hT)3g#RrRts0n{J_^9mn zq6Q*L@fYk`7GmGn^d=M{=~8TVu(TF)5e&c>;MWggho149Izia}Auim!3HR(;6+yc4#EjvgP>c}ava&PfL zaJ7Y7bnsYq;TPS}(%aF2Gl}M0)!iwJTx0FL0fiy^v0{#)2uaNkl_9Uz=aW%HcbwDo z?61?_$EcEwE1hanLjaC{$jzcpoA!K*h{8~sQNyFmtuTo1_*8P9L zSiAqVavD1qYgY`|1eDVbzMSC1@5I_4q$OK)Iq|ylytRfD zI86A7-OC`GdfqO`drL_>udOcHm5z(F-BKxe>ExyV9YX%{^* z#Xhf~trgdtN;&L=nB3PjetunQ(Rkcm5cM)hK;+u?gn3gRZDeq^lN5%brFkxj%-{4T zq>!IQly5wernsBRdhA2iHIv61yUzTtYcB~6UWwW! zad##Oz7#1qB0(Bx0_SB`=RY5yOC9B&#;?d4Cu;hX-IkKy@6P3(0eGQa&a945+vSX+ z3~`c%5+*6;Hd*(U!o(XQ1}=)j9V4dCl&*^C3j_}^9PSn$+bpTwpfG(b_#hW8A#8G~ zd!&f2I?9-|3ehZf^Z#u< z12O*t#wxLVcr*OA%f-h(@3+nJ7dl*TqEDcQ5>~db;8poIpIR+9+pRB5nW^PPOv<#A zTSev#tDHV?1Z`xs=MP4!@JQF? zS}+3Y^N1}o0F1q%lwaV;Y3(n?w{asiv8<4rhTsZ8$uFlQ_YJr;(lAu3!$pz&yf%5?MM3{Zy&RD;#jFpl>|Z~+2h%4uxBPgo^l8?5 zy`d?3yu~*Jl1>P5r++G487Cn7gtNg(^T&fh{ZLuO13B#ahUIXegUwAb+kRof4x@tn zZn2@a34<6b&Evh@Qd_Da)217rhVJb}Yo;t2E(J@b|Grtyr;Ig+!NwZ}f5!e`^t#Jv z{n=RcXR~7cS~qO%SzXajYmk|z`|u}@h7Y&iO02W^-nVX?VY=Y7EceZ`_^yPV#O-%~ zz}WniqT5#E622iz)?a_$0@?wF>vw;4_v&7(~2Y)&lmk!ahi`F;E_(w+5&oc~*>*$ku>NA`_FezT z)|mpoW4^wWTmEH@duhd3lH5iXY6j$X#O;8u$3`yA>D=vp%xI@KsYD=+W}p?Qr7+V3 z0_ilf#>Xl<34Y9yi&1=_|Be?FPU;5&Nnus~ogdo)_^}8Tz>i(}s+52Yq1{U+&pJOKb?m#>@xvM=A&62l^w>BEIg6HeIQ zRbDysWBpV??x%h%RHMNu=C2>i0Qj-^<_^G*HBPXNQ4$Ua|KrDYgsW^>hj;l$lT@jI zGaAqLX<*O%ST+8L|K`V{OcScwh)aHiYbmR=twv~iwU=7A3Wl_b^o48mH5*K|8)-BE zer*2rGe6csT|-G-R$NN1N{hm&;gx@++^Z@+WnNGE~POpUu?+o4CI@I(Pn7S8C&j$8b zU%SUiyM7+@bO97QAAU0db~izza~zTodFsca%fg@ThWo;MB6iRGSOtw9KcxC}%+bcRuvbawi^&pY#D?@ylkvG}R? 
z=rcc7fL5QLfq1M>-MTE|Lu4O;SiIg_xLL#^!6YPY$T_CWji#_~{)Xeu2@H6H z=OMiSVkqTh&`u)OQ6}HH@FBjntuw0|GZ}>`1gjrK5$Ecu0DkO#Um)PeZvC4dJNMuG zSe-h+kKLEm?+_*iR~zaBer)$2KQ^9)Xmb6d)5&I!C6tLl~)O z3*+MG_nkTOWA`=4j!_-Pm%3#&PyJXOMYr%%KQ`fABx|1<_Fd#fM)Wk`$NCMhnbKVh zG!7_>&H?;b;U0=*eUNagN}#b?e?C;XNY}fs3!6T;)3RV-I%Qj`dZ| z$94TCqU>D~oLJO4_DSDQ7J0Jt3lK#xhS0eo%m6>uy;=jpIO?6(d+jQnt4o5dj*h^c z(a!y-$TL6I>(f=%_#tZvG10)WTZhpCh26K${8*QfGe34qGg$6?P~zB&oUw?88mGWB zKNgO9ewW0>(1L)ezKT75XY&M1e4hq5fXMXJdsUJH8__6rEe(b`xKIqJu9}DmD#Gm@H2VSRs zY@M8fd6p(*+-9%Pwgd2EXTFkIG}g+3SZrW>8rq#R1BRUXqDM8X8g}MTMc|{;c;?40 zUrkHTvrBw#w_#8VVVGj18d1MzEFRKw@Y<2U$2r{zMb}(SLjZv6G8*t>&p&Pgk4HOY zwY*~ljRSt{xCu=esATfgkA10cpZ!RUXTz9WVG87T>c@67hI3FE4oydY`s2s$JIN-& zmST$l?b5mfHq$#Moe~`m2fw3G(lVI6{n^#M0kPHcL{o}{#yhCpc z7%0>K}!veJi~J;^K+dE^8hiR4|U3t)gT%GQ8xfz98xfHe%IFN=hag` zwiI?h`Kgn7S#&Mn$AZQ?_&@;*#x#H*%d+q)!POtR5dO!HT^`^{IrC%vX4kGSa7;r8 zJVkBir=w!e0NEJQ5#P{LKeogv9Pndv06!LI9e*?_64Dx*J8EgP(4^V$(l#MJ;M9-R z6pSd#b$Y7;dL01bQ3XvDT1#@w-S9~WHgfc^OAs$|jjUX38wuAo3uh~dX4JGhr@RR3 zURV}$>rhx4@}BQusiz2tIDYCxV^}NLtk$xQ?zORX-#hhV!?m1ms)9Nc+-Yp5yKx{r zJD4Iw-5u~_ajjq+lFb`dvPv|0?JX&yLMxb^w z_S4scd1Q9)sUM5Wbf`<5x7E1jC*mF4ps@MB9L-FC1f^=AHC5VMfh0hn*1vd3aTl)>JBLTA1G-YSPn_l4?H zKh~i&SWt+N5Z13*={`#?6%(@_`|54F`WQJ8;AnBwW=;-dLA6s|k$38RtC|cJ33Gibr z%T|IJkI>sgp)c5 z+|hSafyWG#Ip$?<;r7H;r|6t1;KvG``LP_pqeOQ5#={lk@R2sdaNa4%0pQ1GOGEoE~(e!BT9e;R=eeZ13s z>*NZz2pP^AKNXtLbz)q3v_4iO0$N{x8LgUMI>=|rpE z_V2-1Y6hNGj?M(O3pdx6xB0qLc(v2SJ^mdSTROzsk*j!nXYKnBp#dEFKAnW;uJF(k zZH@ZM@4F%+B__ot<(_+@qh+GkVinSM#m4cDU1<{6e~MdGd1x36t(<|eOGD+?F9>ka zrVQ<@ul_nW(~KdD|Mr~lP4Wzk^;sMDh8IQt1!MbEzFB(x-ai9l)029yMk@E8g0bs| z)t4(|9zQq*V>hPi+;pPPz}QxqmwB_67j_l^F!p}%#Ol@=7|Uh)#fsKyt3G(~SZRN2 zt}FeV?}_r^?#kF}-_0{Hc4_#f?;kKWIlr)jH(-=p0TDrJbvs9sT(ky(rBd8SMAGUh z@I*0Moq@46Joh<+_jzLefU!pVaS!d0FJw$xB%%`hxJC;8KYazFxRfD&DIs)9+R~;TkSS z8DLdBG1@1irMh|w#u5sjKhmNc5%bk$?w766I|XAYZ$!2mK%47r;@)V zV`}?z8~|f~_LZjg%TL|tRk<)7IC<~FmpiKs7rx&6eSBdif>KpsHi|t|VJ=pzQDHt& z=|o{6RbTbuw+Ggt7ZzozdJ zRQ6}%)l?6@JqlAj{Qjaz_4oQag6h$BubSHN-XtMR?c`v!NsVy)o1g{)3pW$d;lm-y z2x9t@W>RT<1a%04gipAI+yswdsz*R%N?NFH;;~#u2&lSnD{UG+Qc#(P;z~&?VZXgsA^U zg;tK%eyj9{&s1TiSJJfxBGoF0qRs_8eK^f8qxvv0A*fpVf7Ir>U&Uru6Bz~6{ehziHiJO_()RLOet8H1{#-bBXmAN3Tlx1I*p=b zK?w3BDP7)~T8l+QAtj~9+PRXO`agUD$JyWG~j9@}aR73{) zIrU&W2cq_pH2f)-e>d2;Ef{_~ zO1+HMijx{L@n{sV%6DSH`*q2A1w$;hz)UsDqSuZoEL!Ny%4$h8MHBHbX=KaGy%A^j zDIT~j3lB}@IKLp%j$v30U;2|htWw}?a<;&OShHDr+&yg^F9N&5O1obq6rgMI97wYRG7A740G<^C>bx@D2$Yxz+ExyWZ_Zn=eIacHCdOx z#kysSNfry4h5ZWXJz3s)n40UHF4@LxccbW97sye}9Ysn$bH>u?X)YV)m zTq%+R`p{BSz&trZigrg~#2{oDe~S-3m?A;>clUxV1#f7?ZxJ+#nz ztC3uUG_c1n>c%d3St9HJnUzC!tvR!ftn?!4NNPEkGL2im2BWqcL9$NANBQy14UtMI zGU4`l${7t}T>nP$4V_vllCk@OIAp9i!WLn$2M+dqWu#`~&qe*Si@6d`F6@rxZXfv0 zys;hS<;ly9wPgv7B)}js2eOl@W>uVOON&Q76UYk5r zeES6qijA-Cf9%^i=XbCWA_$gCoJmrK1(R1YL(jX{-VK{-tj;?k&bmlrR!VsXPVCV} zr#E}aR^!3T9^X(7+ZH+mkp<6HE_Y&L$Na=+T)#Z7@grDR5jeqib$Rm016mD<6rz@U z+`m~FZ5y~5EN0{e^L+#!j_Jv^5> z?mRE|A1ShhPP`gU+zC(F4EW^C2z>Ic^Kh^E3vo{oCI2w8_GgML3m?;8MHV9?kC6ow z*(pGgjkpLXvb%hekOd?K->D*t`$v%-aXcqGFC&PNjYesRwrKoOWRLjd%hXY93p71g z^iZqJsUq8=0*v~$I8$Vyg~~zvfrzKl=r-M-3zs0@j87F=uzZ#J`8j@y?`Mjvj+2H@ zi|J6Ip(q7EI%Wl z|3{HEE&dq}D6&M>nZ*|eioWUy-l{xPWI;l~k~F7^EYg`fK*&?n=uDA~&<7M*Ki5#Y zkEe?45unHt2U;a5Z@FCn6j_pG%m$#yVtoE6vbo>TPxyt(oRl6vRkU+f0~FbIfzU=h zr2$L?pvW35M^6BXZ0jXJk*z;dWJ3h>0Yw%{c&7PASi+fK0^KG7FtRvjUSU@aE{{`1 z)*g$&0E%qug)~geqVU=bIN2v6`Bafrzp)|U;p(B}zHEdjOi>hiL_r+>B<^Ag=0X}e z0rQU{Yw$;rZN2jx62a!icvPI3+2LKW6cH`_SCKthX0bd|WD%bbI*)(YpDD7vFS@lD2Pm@PPSOd3u$dCiea_u(>?JhT6;NdHfFi4&4~bSD02JAl3dgt$ 
z?M_dEPZe3D*Qp|lJXK`DE``@hTQD-5^m8TPx|{-y=3-S-M|8V-T^&ITb;ih|wy?ZA z?RU0JZR0MO7b}%>eD01XC)C(U{=o{?n6fmb4KcQmPn@1wVW!l>bZiLQfYDYKZ>j; zOhdt|4LOeLLq@9D0E#TO0Z?Su07Vx2S@w@2t9}bmWTgN_w*8MHi!PHu0g5cisUnMt zA@y%ULRWoYFWK8dg22UA@bsLvw_3NF(VRr zQ`Ce}q_R}Gk#Gqw=sD8ZdxdSCN&LFTLJKaAbX>JP)2O{Fm`^<>d`*vpz1bVRFNtzq zyEY|(9tV951|Zpp$umfH*AakZQRlTJXAL%xHtL7AZ9>XtkSx6XFC@G54kj{dn;lAR6q@+WdErY(Sdg$XdunDUu}slEv(NKZgP! zSyM4}ueMW2HfZ5Horc0u1QQ;Ga>dXm5Y^0KIYZi5jEPkHh}lOZW+#XMlAco-Eoj^_ zG0`HPK!}htCaRH)RDq)9F;b}gRvwO49S5{Bv|SpGRaIc*gXbz$wMYO%(GFS>-5Awn za}ko!zHJgeN44k%et6 z3Pe*@(t^n?oAMfhTf~U#5u%h>Cie&_D2eKR7i`8`aSJa8i`>C{WFbMpHQKnF+qeyg z-`aI@G-Hk((YK2`57#h8y^?FB>YdZBxOS$XmQZU!Yg|)*@X+ z^n*2$4Ss#wvkDcdPB~6=4!%iFsi9c)uaGR!=bQfm$->%M`nqX@qW%?Rw-t0ATmsUz<)N#yAN-Vv}a zd#jP-AllbeE>3S;mD(-2BNN0sSQIuV|3I?B@{%MGU4I~1;cw@YlZydJb^(B7!96ZN zkj<@3mp;)gXS%l#E9(X*v9~DpVEi?$ZiiiP_yfr*46CqfR8ebq)03!O!`^xN_M%gs znlS*$5=s7nWXH3(Q+(N#Ws%1hd}-U{U=utx0o#DXUdbB?K(c#emESzuqM z)?kNWB<%@k_HgWFi3r7+JQRX2@B-LIn!GDMYhzVFs8oB+fq`;lhVib4E`uRte*Z+?#6E+G~*|wMak+_r+2` zHQ*obcI7r@y96#*E#}u_%L=u=Ln-G7$=$lT9W#RT#izY(>20?MHiBQS zViarAkg$NzunlSFY=*Dg5!y;9*3m2dMA+XiFWkEO7W|BE7mXqzdTVtHl_cx7w5?jW zOVdqQw|@E#nYE7uBw2&d+;;&RgU_bwXWM{m)07zEj^+WT>k>I1*#-0n25|==A?x&C}a`pXz z;n)ES^bC^SDt8|{>L(mHQPd%~h(Dh}vM%h0AIy(B!hWlG9w9l1n%r9D6rx*wUi5ZL})EzmntPGuqfyE5Oe zq0#EQ%)WLF2AdMBEJv`P4<HLubT2%wF+h~Yr%c>az>OZ6EyftQ^Kga*EYM1Xxt66q z**V-6qNg<&Kt1Lz4RCLLnmaC-{xH{gNblUycOfWk367XCOv8hxo&NY$3kmzrI{J^w zdsFF`io$E1pr|Lj&qWKg;T^?WHVNQW+y-(1@f z04epK3>ZHhD!rP4*qRcuN*9(b&G*)wV6rYe?74L*r}5M1P{1eKhtt>Jx<@iTTZqtz zmu=+DK!gaEUf^}88h9d32!noJOwMX_z$Ws3zta&NF!X&%+^z&KOm9~;u0?6L^wl5L-w9L6Y2tfZpVt!4jRXa2G%7#S}*L9p2%!%;`N(GdJoSX z?K`@tbm1f7m^4`HO7A39kghy;h0et=H$KlyfwIvb&*iU9YZZLRsU~1I`nV;BCV=Z&mze)#ldS{;c>@=ADPiufidb%3m{4 z+>iI?v~!-3Vv^i{ixC_QvP>vVr3)|!QR>QBWxGZSVm}OA?geQFrH}mf$;_7mI?a^VaQGjvzZdrkPgaYWVPuwdFuKJ{ zpQA%|jgOTFHB^8RS)Y$Q(Q(jk4|sUp8O4p4!kn9X6{*$nlxt&oM8V>1TWa_m{D38r z7A>eD3?Gb*HC-U^vVPXlR;5e3JgZTkgQ4M6@PTa))thahw9Uj&=-E> zHVulPa81>csGrk!a4fKmbBdgAj*`oNMH9OFx=n0Ai_;Q`7R{7OM$j#!q?#^DXMZ+u z1Hjn!A55qjp48Aj+}*|CIC-ItGB78lNaqmQaX6!)|6}^R@1{$N4`UOdU!ufJvruHo z-B9WMSqeXg;;r$)OPF1H+zF^ejZ|>}1KGu$a0n}%#12FWW)#5p$Yio;2k)zzK4EDn zLD5ePgnN$Q7^+K9sKd)iJ|hJpa;195A|}bAyM_FT-}T59uVhtb6i7;Z*YS8i{Akp) zNc^6N`ey>_vMcQB6V_^x%ieJ#d5cv~bZwuTVkt%*H;$A-r;BCQFGt<}@{D^j%N6|N zMpOCE3a|%hq?$#_C^f#BJKXO&nbvG_EpZFCO_rF$aPo=GB#-%474yoTlT<8|ogs3?(hICto#d(~_@95A?FyUa{=vm9L8D^d=L3H`Z9U z_c9`I)t>V1#M>hI@(h7>ceQw{j>TKm(&8H4;-v?q=Yjtnlb;)_(EZ3iYuQ>K*Fi+CI;cZdYg9()9 zEGU?Lq9cX=Z2vJ=Y(?G@cJPWK( zQQiFm+evOV^Xy9&-6~9#@eblnY?~}y`irlUyHph|#<-te!PXC9ixeGRtclzYz zs~*ti&5P2r^`}3UdAeS<%)4}PNJ-fL@p0LA?|dEXqvrJ6S84_nbfQwe4v z1Qt%7$6GHr>t`s{En2^-Y~f>V*8-j~a`u@0tjwPFa#LfU7|i@xRch;f{Tln^=h6Q& z80$qjIUXO`X+AvdQ*r(&<6`D180&lYEBn`j(O-=R)4lplOoG%D=-3D2A2ik5 zc55BB>~)BtUJE}+ecQ;8MF^U?cCBJ!Ea;czwa_18cgcb@8*F)B zk!WSf*{+tLl}4`-luA#I4#sYG4O@cZC5y>8oBUzCoA=Ga!QTvhlfH=c(o9_$OcP~NS z8hu;4BpQvnKkr{h*uhZ=M5Wc^JA~ z53a~XUe6ZOpDA#E*8(OA8v4|!I-k!u5!E*xNSlpc()rRWf~cXCt!#L@L1b%^@3J7! 
zfkNPIyy%r)ad2FsqkhQ|M-4}Zla|vEUzXZf7?>>#Cs&-=X1L6JY)C_SS!4pl(dob; zJWH?jgrPZtt~ugTaKTK-AS*eBl>j2DTlB0zcrm}cJjrt7a}9z7J+pihn35{>lj9|c zh~)nJStd(;_8R;gT>>Q?4X3WywieG%M|;K;V+u45?9Z{6XQhSozKZ7V1D| z`LindPfg#uW@X~pDCg^zFq|x)0eTt!VqcZjl|bk(CySD<(QD15QVA97ed+OC^IuLD zEwf&^?>eQz_E43k8HX?RJ4{@a$5%ygu9xLh?GEF~QVI|nPN~JKsL3>|Xi%#a9^aD^ zzpw9czc@uvPFXnmA5IqD2*q%+m&34!+(Q3yvbdxyS}`v%oNQ(+G=oZo`Q3f}Il5{J z#T#8p-c+qYv{sJ3wr;I9)xLJWxOVY7oo-*g&%Qd^wK`I#I?Q(JIlT++sk&aNdV>#j z(c=1N&3eClbth9{CTultcaa!N7EE)Sj^QVpVW)*k2N+8h`L>mqMcR=xm+R%fmMnle zWu;Q2s#z-@4b1}fw^Dz#MBjyA*r_y_p$XH&7X7P9lW{ClCqJ0Wys5?67jHi^)>$(b zYV!kKb2*4<7RHiAnBLX>GEc@}vUcWABuo-a&?$ZvtW6dL^YXM)CKV7VCE<{38e6<{M~o#a z+z7>3vZRhxGYH){9?BGs6jM7j-86^7MmpzaoTB-)h<_|u&bfz53pMvz*EAO#4SzpW zNp~8Zbka|EFp;cr_Fr=aJ6Fw-@t$t{PIzK$>ip-~Q<>jSs05y#9X!o4c3LF1NdQQ& z58A~3mnD0AmSL+R@!ytg3C5DOb*n>DwEk9}06+VWCCf%|KC2;NS9QlQ#QIki?X|dM zqf+N|A3@mMU~ycjP*7FW^fWtwPPInu`hDxZz-o2OkQR)K4m;G|BJM8V61;OLeZ-^) zB+a~c4t%6ey+zKwINE$B4}1lse0x*%%ur0Y22a`#{EXWi%rbmo3pby6U!Uflucq|y0yJ`3t?D-up^^buQw!JF)?TmefY!JCAJ!STAu>IeJiOo z>lPeSw8Q9p7${SnhHi~+DZFhytk@{xRoj}tMYxzbpVSB6yV1$lBGRw9#3>6+zG%@! zsF}TiUaXqay-_e_GP|^m8!E!VbK_a&a#Ah9eBq(QZz57WXuQ&!dB~ zmO`%WLw6UT9=(LMNa4>e_#p@ZQ8YpBAkm=;H6Pf$5%Ls+Y99lX`H$IiG;4idL}eM% zI5#s{qWwz`7S<{wRbIZGt$h1SdgWNU=C}0gD>~8x^EKRd9Kz)=vOkGbGLf`0Z&$@# zIH;0vWK!s5e3|K!X{F!sSn$Ocrf^ZEoYB3}_tjXgU|DD}EXdHrNr!m+!^z63y-Is& zg7_esZ!Y$ilSMPY!{YKVoGifT`~fh(405q>|IYS3wJn1K9`>f6m)veY~OzQpEZjx?whvM zNOjZuozUy-Eax6sAS`?f%vx4&|K!-chzb=;TO1XRT874;c4!i1`t%SxxY znrhH09bMjFxYr4N^}Fo5j*jYLO2^NaD|d6UxvR4N@#xmyv5Dznt_Kk&xr=B2jK|Sq<%}-XF4CPwmW!v7TwbdN7#Rf+G zxwKx!aI#^~3hfQ6Ry*n#JDaRJ+papQu&GOaM=xSH+3>FM)vixDT^LSQ_;)q2c?vH(tL{?Ko3w%9?QDe=Ms<*`jEGJQ7;DzmQG+d+q8^L!WUW1*5i7jX zqI|XKr2e_Xuhxb!oUDLJNunxqTvrrgg9R=hPdn00CZBrli^r0+X|Ot4Lp@e1zkH!@ zQ59rb*Y&O`cdWfjw_9GmWld)Ax+0N*S$2_loo^sdUZ?naoSD4&v-~8L`y^xXpPAjs zhJb;BQ4MCv*fl#Fg3mEqg`g-~LSh}LZ3&dF40AQ6O20OTF9Y!%UI{Ua4;7Nt0-oR4 zG0~xkrywjWXHSKDW3rhhEl#1#I>acXp2}(!S?;_WUBB%*Xn20=)jY`ZX2HQ`@x{$( z@XdVq`kP2*9gHP=H_|(g!DLYu{q0=~IqOSL@)pwamY?4&_sXo?$agG^#+mfTVl3H| z2eBAS);)-msT=AkuYVCfqg^DViLqo0M^yf?WS#%Zl6^@4Ht3Fd7`(_fSChFy)^UU4 z`cO&BvRpHVrpZSiFrhOnnIMWAPiP$bmsd)IH^h221ba3|JF4$;vrgF_tY^gjwPYc& z6BtWY1&Xm`!O@U=y@9U(TCxaUj3vA6!08@BimwVH{BFSm@EZ5^j}O?{2sNtvb}&RC<2qILh}@{9rbD)TKr?lRy!GG---+O(xR{fhgBp- zY^ZW+_edV?t$sPxXFW51d}dZ~CTIGS!-ZwxY=nl032YKZ8$-ia2#r9=Gu;rBD^v4* zWi{x?9DVqE^M{NgkC!j-b)$tdM)d~Hp%EtZ+ z#$H|%iI-0oV!&91IPr>^Qk@*xU}A~Nxr&GH7II@HsupUjfp5hcw;nAvIF4K9I=o3< zZgDF#S(ZA{DDxF#cuT)oSD@;K!$Z$`u~=Z;LB@)~Ru{evy=4We(*O9RFBp1+Z-iZ% z;fqnFYn*%wiO&3e!=9 zD(w5LdYZ=AR(x6$8xbR1I)unM)Bf#)?;*0@qx14ULA^1Gwe$9z+1|%gkD%VJk)OaN zA2Ri%U))09XE>2-JuQr4i7S_Wf-P&z+0mX3^jH^`-ICU%6LGN^cv=JzPYoe<&5-2M za;+%KkK=QRk6NLm=e!hy!(v_R->_D95XOi9Nt)LMKMxXR75=iO<=$XTldR ztYUDZ)Yud-j)%2X-_d+3ty7hI%%V;znv`Q5j8za+;_k)Pwp(ijYMksLdERxra8ccH zU}HQcvDvJ-z@~mU4tsWPZTAK?>n(Apyopr(qLRoxH2MZhpR}g)v#I)3vZjKlGJX3CmBM0Szq^j`NRI*O{pzdT#|;FG-iwv8!=9YKDk+^zRLFo(~? 
zXIX$)$jjZzy4%Ma2$+YVLad!n^iFKTrpBt1ovA4_a?az`A+TsF z@Mws&&Na;2Rh5(jMQXGxjbKtGQc5-4i01zDG(}4=gktlPXljW#g~;MdV2{$ea8c|} zAvRulR4E+@Z|CUiN5-R(U!OwnwAx_^OD8l*$aJI}A}rR8j*S(EA5YrV3#hA%rsxpH zR$4Peyw*&L05cI#M;Q|Lpr5}%V^hv+#;8the+qTx$H9ddMCA)?L_RiijEPYH!4{LyeCkI&!SozQa8H?r!F{cyw z0;viTFaWck66{}N;X#eOAc=5C&SONpPQC#5N5jq69bToiywik^rzKFQK6Pb}{RFRE z4wi!tRPxx!Xz4!yrn~a7exekqH{zUJ?8(6AlbihGn#mk()T7k8$C)^>Tf!r@-Cjr` z?&^zGXlDx9{xT6b!7Dniafd)|eKYH)CqcwUA<>UN#b8eQVK%MUWNQ^Wm&IzHVD&WZ z{syDB^3Z<%UoW+t*WNHT4mc?iwT-Wz9><_Q^ijGd<9VhM5SwK_;o=I2T(+KLP5sbE zbGK6e;MI(?LI)t)Bi|*_-H224TdR zMv%9tXhQ@r-z`Dde;{HdwtV=8FHzgCs?ww`Zob;pFH58$TH#Gxn$Uy_1Hm6`k<(;tLViRd$&-(} zWaf!meL^axTfAA1ReDD}ZS_5ZzR`EzBNl8yIaW;kA$U|la}LNW&P}3uG2(N=erGeL zjHM!Cg&n(L2wWPItP|RadKtHDZM^&Y!{}~}UP#qkF16I#TPCF7vRw04N)mWhr|$mT z5uPZ80hc2kVHjSnnbO1N%#IJLVVzj3mJ_DjZGiFC>zux`jq`^;Cy9cvT(RW&0N4QK zFl=}yI$?*l|9y3Jd`lcnM{r@h083$%u}&rV=2n07IuiFdMuRDL2_ZeS6sIu~$6&TK zsMG3dI<5MMYxvjDeZ|zb&-*q@v2AfaPCD>MS$~5w(}#w<6IDlIwPUSwkf@_lv);o$ z2{Y2Xy*{n6Yw5y@=xe~*05s+PkobOurAG1mc2+u_qC|D|?N}&mHf=E3>Cc^q1rsCE zf$V1O=}SD>kejfaqV@p;QG>ME91X3QsR5}IO z7CGE4bq&WvIHi9U<;BiwKMsryNv5aUnEKlfs_f-$UoMUAwj3^eCbZ`m1^UnWe}d~i zW9&8rY{8^3iEbxIM$Q@)MKl)RFg#B@Yz^(d@Nl*Y_bNb6;#`K7{(fsTXLM7Q*ELI#R8TLPyQ$h}s>=Qb?8|AMis^L@rJQaLsG zC#Rn+j+E`R9*2DuHR|;%uwpr>HY@#(Gfb@aGGDUayhD~+oWMLR9(10Gpjqs>Tp;-x zT1UX77$w0fd5FDpxdKvR|Mdq2480uY4=KFQhRuht|LsKYf4@tK>X6^^a!%;uZZ&J0 z-&&B=;WqiX6p4OQ>%ChMdWZYl`tHX$p?5|SVv4B{)}}C8oQAt#g3sh5haq&IO{v*J za6X5D8^Q>V!ko)Oub+gE_&t~KCmlPH%E45X$s!*i0`izB}~I<2_yN^ z!v(->(}?G}Y{X;X5T-^l?hpQq7*Uo2{6C^B%1GD`xQG#Dg(;*Qy8jJ*9vl%3#$SZ_z#{7~@vN0jZv zh_VR9!a4jm+XQ}oW>&^=)LvxT!go*%DT|X#Fi0Ja(-61UERlk~XHepuJs*=u2%n4y z@#-g1^YlIbN0hx4#y>{ls{lYY=R{j-MA;~k2b`f5CZIOnM^ygtw+M2BB*jt$_!bqt zTWm?~6+iowKD*u!zU1G_n>Zu}A6OzFJ|?RZeuYZHgHReSZUcKLz$=3`txw}NDXhL| z#<(qr4k+Va0*F;8fvewu8{YBP+li>dx3_2UC)>%TMu}dt0YtnIcs;JR0cDSYD2o>b zF+Se*JiZwfJt5StlJ^~rsZ>7#7&J%-ujhghy%XI5a>u4j;6CWSFk}9ctnI?fX7ThY zDNqm#=v+wTYNme)xq~f!h#}QfDG_VMPXNgF>4a=ks z&#oNJ$|uThLgKqCgR)S#`8&Yso$UGwIx|l@&Oe@^v00y%Ws^1C9SmqSJaJp0*pf++ z!86$YND_8PHa7xqsSP+~i#-xX+9yoWCPRUq&ifh*oRh&>*a4!2N!UFpnhu%#ArbzN z{4FA?p2NEr6MvoyOk)iK@`&O@2B)zEroH0Z^VtEQEJ(JcFa^qmU=cG#01yqqc0s1^ z?*RR63kgAN6tWaoVFfrUc;gsUww{b$mV$|aDnK~^x6<=3DvLlUFbMrcWe*Ylo)qkV zQCW(Mu)-wa|3PJOc#AX{vQaz0IOP(8!XknroO&0WE?WYf%OZVZ44#MA_`3M6tk?rp z{CrHnV;U=L+N%hKdkdgdAH#PE18P994~5HgSIE(x+3ZR0V|ReBR^Ep)ZeM=>y&uX7kBRV^vLi9iy1|06=xuJ2D{%c9t;CX9R&` zWo7v^lNc)USeO~PTv^9Z)tXglTu~&5EICHta35CQ3RmgMrip&W-|4Ry7$Rq`$1ZZM zfk5-FxPgYtK+4ykga@E?SI}xUXiFBn>52E4z9z1$7!g?WON-Z=7=MqLuhdiZRLgZj zxrUIq?uL(BqL=jI5-I?#FTlqy*sUXq=O~e_!yD&aKg6@zsW+0WVMdh^$2W*<5K8ot zQ;qY^Z#O(ItK&xz(z`WYVU;jb)U#djTqfa}jyBTa6H0hB@VGTqu)Qy3XbOwSkh5|S zTg8$=7E5V27a5Y4R5mGI@g!C>3zXH(UE-=)H5-%A3?AO$N)$HgUh&{G<6_}hH6+x-3yFgEaCS2nDLY_H`7J0f=P@jd`w zHm)U^zx|zddzxE&Mtpl#ReR2Od*0rEfw9^hrEVSn0b|EIBzm!JryJl`Jd%8!37y1= zB)AwbwsZWi6Wd`$B?sy1*X|m0>yk5|`dHO9KHl|_v1^#5doZVKQoDP8yz>)d_p)5~ z!g%+1eD^YA_v&i*CP^1Fpa;d@vsHy5Cb5|*u%$eEh8S@q46*mBdJe97j;y*)ROdc2mLM)e=x>96SjhRCOOD zHlp%G43NURcyt(5st3-t2i$^&It_;`m`dclhfS&nje_dsuY3LEhp=AP+QMm)>y3pO zhumhkU%L;7>I`zN4FivxUEK$s$s=)6hTP!f&vGHJbowH$NBH2wLe;}bUA;x5cm3rP z!vXsv@(y)QBBbY)u(!u!U07o@)nhnDqpI*x*vxnaX?=*!n7I6CmGwkL&;%@HT(E0m zfOLeSYrJhj5On079lx?7uKwjE{A`E;~D#VS8@Ry}c@`>9%B5+`_U6i&`UHZ3MF%Q(pgS*5|a zvR0x2)G9<-$I$aey=M?uB0;cZ-jsmgG~XoGF=;Q4?hNGFOuzgcC-cFKS@qnN``8Pk zAqj<9%Y-@J&Bo2?7E7By#D2%|x^yBye&uz-H!! 
z?tECyTz*1dHGIV}sNTGDJ}jlLwtJb%2iT%49A_;tZOOO^m~+P>fh(*Ei40aLupTq5 zs9~>^3i8+pE=}kxO-^z(+^oRa*LxL2cEiBik=&U6)%An*uQ%%$RJQxs`sekP>Gips zpfyeVwXwOvPZT4Tk>52>>)}26iY1j1&;AAdSRTSPAd|WmBHUg?$ zB#M|Ngp7P`!DfB%xY;~sG<9ZtPhlxpZaq{KJxJL(K)G@GOzXrEgb`&ko4_IOSK|!% zLu~n%a5mqGk`w%N{%=uM_!bYKwn3Zxb=n9`dkdxc`DJc>g_(TS*7K_h`8RjF4e$}O zn-@~V@Y^lKlGHsInO>eKoW3NE;-hJPDuOXs|hQwB4 z`WDUi)luAKwa$@Kgr&AT9m(l1(9dvw8>fiVzY!5}SSCV+fgmak6&(%gmAo zksmn9_0-u!C>8uDw>E(@9tXe3e;qvnFV()H?#8_lR``!7ORY<2zmIMIllq7*S%jY4 zoBjy20DwuYNTypKAJT!2EJ0|?LLix*z6c0N#*!|Fk$HRQ&jD!jULe_{g=0m|r4ZmV z0G_5W{jRc2h3y9OnPI0lP~8)6KV+ZTP-@GLWxJ4NAIj%ca=w6iQyBJ{NFgvmiFE*$UJ^=CEJW3^jz#Mq$Q)ybuJXpug|_0C@v}!jot>-qX^*23!prI1npfCn*5n3$xnkj zSdTCvfKB^xmu0CfnZuSoS~ZG{SWURh5ArC4m73YquT(T%@r>HmS*JUy*S?=zgVi(Qis+V z)rayrLv|6cF8}RmiO@~g2aFGixhI@Lz+c2RjL`LQx9p%j`j$f01h<>pc}MF1Sh5e7 z{;_1KfBv;(BhS;X2d`(%%vQhMS6ZHZ9m=yh z_xREseE0TQ09u;2Uu)8W6mM37;}0t3$d4X5*9^F1MqPn}#a3}N{d(RZSUiu}> zs%+|OhTZLV!Ou@vx;ugb5J_3ZqXjV0r`q9S5g?A^3)Q++@$k1PapXsv45E!k!^K`v zUQmG+YSn34yc3ZKxYhB8BneSri|-`%8$0!2U}si;jLi3^pHw4lEh0di?ClZdYOt;! z+4cW$vKRkyvK#mP0)y%sxjfpPVqc|O6eZZzLaJI?zlaurA8Qu95eu~~ED`7RU9J-P z3P0qC7rLH~%oIvjnJ#!)i`c?&vTRe*TO}Q&(vgyT0wj-v+W8DZlRiAj<+?VghR)-b zHXCUR4T>An#rARVd5rj}v3(YGZYn*ExD=`Pf@0Sb&vN5BgXgj45PEmu)G z%*Wm*cyUI&C&RS!T2QeiJK8Y^-q01gd$c+$faqqfxugF1Rd>I$^mI7jJ}t$;r7CUv zHT0XR2U+}^evj+qQk{hu<8p8x+>-{}W~xBDPGV>0$aIXYo57~=)rMOm^mw{F=M$YP zPM_KNS#kQCU$IGai>=iYF}}gjn)E;_K5$$x%?61y=UNsK*c>0U%*s0ib$ZOZ!8Skl z0Z@}_5J{;g#rT5ShQ0FTwK@BEBR5`)%lm>TS=Wm9WgWXK_geUYHmy@X$i!uQ^$bJB zK~$RY9?YHl=R-$4IgfqDohdk#fAhriFMC*d+;z}D+9uIz{f)=e<~;$H(xBnvN(ty# zlO4BP$)rCb0cy0ZTXjFvWth2UD@%BZFwo?T%yuY8iP+H(3r46tBp^=s%+;=m;be;o zV>Hq-Fr4h@S<>%jdHPzP=bxXuXH#eYh+E`%*wi+DO*i~a^3h*TR(kfyC(Y*X*-x~B ze-DdAAv~e1ks4025><;&J_etEjaCY|XqbvCXjFBT!f>*^vd zG9T-)^{kp7`Y)skY^$+a&_yc-YT}oc?n`}&&mYG(Oy@eqp^hFIhQKKjFV{c4ioh8lg(#sU zG-)7okg|6Ap6=CmK43~0u(hZ)5?_F0nD zB1VED-#lEKU`>7L$XVL>AmD~OaeD$JT~35&Hi<>-BmH`2?!9Ju>s+Z&q>D~yKk!Ph zq^vnMTHE)CXWf=mprjhE7E_wULR?w?+)h*J?O0Nf_&lpzrLI!YWqZ!b-1}yW%7>Tp z6hbS`^gfyPrpZ!sLGa>Oi7fW7qJD`@e2!J(^VHm0B7>E%5>{)A7&&%{`vjwNb)k6f zws>+&OF?$z0U!Ec0S?_l?n$NCCu)o<^(MIyu(HJ@cShN4dM!s&mp3-!*j(Ce6L5|u zBE0_sV_lDglb$o&eKYeTm5?&?`~P6<6WPW855@*eI4SyMFRp!;8V;E9>+#7yU;Cj% z5I7yF=vzd)zOA7VIFr`nTPC%>qaPhOSE}e&X|}#=HXOL{|0ozM_kRUrufHjUUM_yU hJs%FeJ^L7Xd;azAjsOXuMq+UyvBi;}NeBUQ{|gnAg+TxS literal 0 HcmV?d00001 diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index bd109eecd0..6e7f0cb6fd 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -30,6 +30,11 @@ class BeamInferenceResult; class BatchConfig { public: + enum Mode { + INC_DECODING_MODE = 0, + BEAM_SEARCH_MODE = 1, + TREE_VERIFY_MODE = 2 + }; using RequestGuid = size_t; using TokenId = int; BatchConfig(); @@ -42,9 +47,10 @@ class BatchConfig { int num_active_requests() const; int num_active_tokens() const; void print() const; - static int const MAX_NUM_REQUESTS = 8; + virtual Mode get_mode() const; + static int const MAX_NUM_REQUESTS = 1; static int const MAX_NUM_TOKENS = 64; - static int const MAX_SEQ_LENGTH = 512; + static int const MAX_SEQ_LENGTH = 256; // These are set by update int num_tokens; @@ -69,6 +75,9 @@ class BatchConfig { class TreeVerifyBatchConfig : public BatchConfig { public: + TreeVerifyBatchConfig(); + ~TreeVerifyBatchConfig(); + Mode get_mode() const; // struct PerTokenInfo : BatchConfig::PerTokenInfo { // int tree_branch_idx; // }; @@ -93,6 +102,7 @@ class BeamSearchBatchConfig : public BatchConfig { public: BeamSearchBatchConfig(); 
BeamSearchBatchConfig(size_t beam_width, size_t target_iterations); + Mode get_mode() const; ~BeamSearchBatchConfig(); @@ -102,7 +112,7 @@ class BeamSearchBatchConfig : public BatchConfig { size_t beam_width; size_t target_iterations; static int const MAX_BEAM_WIDTH = 1; - static int const MAX_BEAM_DEPTH = 4; + static int const MAX_BEAM_DEPTH = 8; struct BeamSearchPerRequestInfo { bool request_completed; diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index ed5c6c3aa0..318dd7c9a3 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -72,12 +72,17 @@ struct BeamTree { // std::vector probs; // }; +class Tokenizer; + class RequestManager { public: using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; + RequestManager(Tokenizer *tokenizer, bool verbose = false); RequestManager(); size_t get_num_processed_requests(); + RequestGuid register_new_request(std::string const &prompt, + int max_sequence_length); RequestGuid register_new_request(std::vector const &prompt, int max_sequence_length); BatchConfig prepare_next_batch(BatchConfig const &bc, @@ -111,8 +116,9 @@ class RequestManager { std::vector> const &outputSerializedTree); - TreeVerifyBatchConfig - convert_beam_to_tree_batch_config(BeamSearchBatchConfig const &beam_bc); + // TreeVerifyBatchConfig + // convert_beam_to_tree_batch_config(BeamSearchBatchConfig const + // &beam_bc); static void load_tokens_task(Legion::Task const *task, @@ -126,6 +132,8 @@ class RequestManager { Legion::Runtime *runtime); private: + Tokenizer *tokenizer; + bool verbose; std::queue pending_request_queue; std::unordered_map running_request_queue; std::mutex request_queue_mutex; @@ -142,7 +150,16 @@ class RequestManager { // Commited Tokens std::unordered_map>> committed_tokens; + // Performance profiling size_t num_processed_requests; + +private: + struct ProfileInfo { + int decoding_steps; + double start_time, finish_time; + }; + std::unordered_map profiling_requests; + double total_request_run_time; }; } // namespace FlexFlow diff --git a/include/flexflow/tokenizers.h b/include/flexflow/tokenizers.h new file mode 100644 index 0000000000..ffce2d423e --- /dev/null +++ b/include/flexflow/tokenizers.h @@ -0,0 +1,103 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "gpt_tokenizer.h" +#include + +namespace FlexFlow { + +/*! + * \brief a universal tokenizer that loads + * either HF's tokenizer or sentence piece, depending on the type. 
+ */ + +class Tokenizer { +public: + // bos token + int32_t bos_token_id{0}; + // eos token id + int32_t eos_token_id{1}; + + virtual ~Tokenizer() {} + virtual std::vector Encode(std::string const &text) = 0; + virtual std::string Decode(std::vector const &ids) = 0; + + // static std::unique_ptr FromFile(const std::string& path); + // static std::unique_ptr ByteLevelBPEFromFile(const std::string& + // path); +}; + +class SentencePieceTokenizer : public Tokenizer { +public: + SentencePieceTokenizer(std::string const &path) { + sentence_piece_.Load(path); + } + + std::vector Encode(std::string const &text) final { + std::vector tokens; + sentence_piece_.Encode(text, &tokens).IgnoreError(); + return tokens; + } + + std::string Decode(std::vector const &ids) final { + std::string text; + sentence_piece_.Decode(ids, &text).IgnoreError(); + return text; + } + +private: + // the tokenizer + sentencepiece::SentencePieceProcessor sentence_piece_; +}; + +class OptTokenizer : public Tokenizer { +public: + OptTokenizer(std::string const &vocab_file, // path to "gpt2-vocab.json" + std::string const &merges_file) // path to "gpt2-merges.txt" + : tokenizer(OPT, vocab_file, merges_file) { + bos_token_id = 0; + eos_token_id = 2; + } + + std::vector Encode(std::string const &text) final { + std::vector tokens; + std::vector mask_ids; + tokenizer.encode(text, text.length(), &tokens, &mask_ids); + + auto it = std::find(mask_ids.begin(), mask_ids.end(), 0); + + if (it != mask_ids.end()) { + size_t index = std::distance(mask_ids.begin(), it); + tokens.erase(tokens.begin() + index, tokens.end()); + } + + return tokens; + } + + std::string Decode(std::vector const &ids) final { + std::vector mask_ids; + for (int i = 0; i < ids.size(); i++) { + mask_ids.push_back(1); + } + std::string text = tokenizer.decode(ids, mask_ids); + return text; + } + +private: + GPT_Tokenizer tokenizer; +}; + +}; // namespace FlexFlow diff --git a/examples/cpp/inference/gpt_tokenizer.h b/include/gpt_tokenizer.h similarity index 96% rename from examples/cpp/inference/gpt_tokenizer.h rename to include/gpt_tokenizer.h index 701436076a..0a2388925a 100644 --- a/examples/cpp/inference/gpt_tokenizer.h +++ b/include/gpt_tokenizer.h @@ -58,13 +58,13 @@ class GPT_Tokenizer { // ~GPT_Tokenizer(); std::vector bpe(std::wstring token); std::vector tokenize(std::string str); - int64_t convert_token_to_id(std::string token); + int32_t convert_token_to_id(std::string token); void encode(std::string str, size_t max_length, - std::vector *input_ids, - std::vector *mask_ids); - std::string decode(std::vector input_ids, - std::vector mask_ids); + std::vector *input_ids, + std::vector *mask_ids); + std::string decode(std::vector input_ids, + std::vector mask_ids); tokenizer_mode mode; std::string bos_token; std::string eos_token; @@ -74,8 +74,8 @@ class GPT_Tokenizer { std::string strip(std::string const &inpt); private: - std::unordered_map vocab; - std::unordered_map inverse_vocab; + std::unordered_map vocab; + std::unordered_map inverse_vocab; std::unordered_map bpe_ranks; wchar_t *bytes_to_unicode(); void unicode_to_bytes(); diff --git a/examples/cpp/inference/file_loader.cc b/inference/file_loader.cc similarity index 86% rename from examples/cpp/inference/file_loader.cc rename to inference/file_loader.cc index 15b88455e5..edd63ba167 100644 --- a/examples/cpp/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -34,19 +34,15 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { BatchConfig::TokenId *prompts = 
(BatchConfig::TokenId *)malloc(sizeof(BatchConfig::TokenId) * 40); - std::cout << "load input from file: " << input_path << std::endl; std::ifstream in(input_path, std::ios::in | std::ios::binary); int size = num * length; std::vector host_array(size); size_t loaded_data_size = sizeof(long) * size; - std::cout << "loaded_data_size: " << loaded_data_size << std::endl; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); - std::cout << "loaded_data_size: " << loaded_data_size << std::endl; - size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { std::cout << "load data error" << std::endl; @@ -54,15 +50,11 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { } assert(size == host_array.size()); - int index = 0; int data_index = 0; - std::cout << "loaded_data_size: " << loaded_data_size << std::endl; - std::cout << host_array.size() << "\n"; for (auto v : host_array) { prompts[data_index++] = v; - std::cout << data_index << ", " << (int)v << "\n"; } in.close(); return prompts; @@ -92,8 +84,6 @@ void load_attention_bias(float *ptr, for (auto file : bias_files) { size_t partial_size = hidden_dim; - std::cout << "partial_size in bias" << partial_size << ", file: " << file - << "\n"; std::ifstream in(file, std::ios::in | std::ios::binary); std::vector host_array(partial_size); size_t loaded_data_size = sizeof(float) * partial_size; @@ -154,12 +144,8 @@ void load_attention_weights(float *ptr, // q, k, v, o -> 0, 1, 2, 3 for (auto file : weight_files) { - std::cout << "file name and index: " << file << "->" << file_index << "\n"; size_t partial_size = one_weight_file_size; - std::cout << "partial_size weight " << partial_size << ", " << volume - << ", " << hidden_dim << ", " << qkv_inner_dim << ", " - << num_heads << "\n"; std::ifstream in(file, std::ios::in | std::ios::binary); std::vector host_array(partial_size); size_t loaded_data_size = sizeof(float) * partial_size; @@ -191,7 +177,6 @@ void load_attention_weights(float *ptr, } void load_from_file(float *ptr, size_t size, std::string filename) { - std::cout << "load from file: " << filename << std::endl; std::ifstream in(filename, std::ios::in | std::ios::binary); std::vector host_array(size); size_t loaded_data_size = sizeof(float) * size; @@ -200,16 +185,11 @@ void load_from_file(float *ptr, size_t size, std::string filename) { in.read((char *)host_array.data(), loaded_data_size); size_t in_get_size = in.gcount(); - // std::cout << "size seee" << std::endl; - // std::cout << loaded_data_size << std::endl; - // std::cout << in_get_size << std::endl; if (in_get_size != loaded_data_size) { std::cout << "load weight data error " << in_get_size << ", " << loaded_data_size << ", " << sizeof(float) << std::endl; return; } - - // std::cout << "finish loading input" << std::endl; assert(size == host_array.size()); // normal @@ -225,21 +205,17 @@ void FileDataLoader::load_positions(FFModel *ff, ParallelTensor position_pt, int max_seq_length, int offset) { - std::cout << "load positions" << std::endl; size_t volume = 1; std::vector dims_vec; for (int i = 0; i < pt->num_dims; i++) { - // std::cout<< pt->dims[i] << "\n"; volume *= pt->dims[i]; dims_vec.push_back(pt->dims[i]); - std::cout << dims_vec.at(dims_vec.size() - 1) << ", "; } // load data; int *data = (int *)malloc(sizeof(int) * volume); for (int i = 0; i < volume; i++) { data[i] = i % max_seq_length + offset; - std::cout << data[i] << ", "; } // set tensor @@ -255,12 +231,9 @@ void 
FileDataLoader::load_weights( for (auto &v : weights_layers) { int weights_num = v.second->numWeights; - std::cout << "weight layer: " << v.first << ", num" << weights_num << "\n"; - for (int i = 0; i < weights_num; i++) { Tensor weight = v.second->weights[i]; if (weight == NULL) { - std::cout << "op no weights : " << v.first << "\n"; continue; } @@ -270,13 +243,11 @@ void FileDataLoader::load_weights( dims_vec.push_back(weight->dims[i]); volume *= weight->dims[i]; } - std::cout << "load weights volume: " << volume << std::endl; assert(weight->data_type == DT_FLOAT); float *data = (float *)malloc(sizeof(float) * volume); if (v.first.find("attention_w") != std::string::npos) { - std::cout << "load weights bias: " << volume << "\n"; if (i == 0) { load_attention_weights(data, num_heads, diff --git a/examples/cpp/inference/file_loader.h b/inference/file_loader.h similarity index 100% rename from examples/cpp/inference/file_loader.h rename to inference/file_loader.h diff --git a/examples/cpp/inference/models/llama.cc b/inference/models/llama.cc similarity index 83% rename from examples/cpp/inference/models/llama.cc rename to inference/models/llama.cc index 7686ba746d..4a4eca1c8a 100644 --- a/examples/cpp/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -19,11 +19,45 @@ namespace FlexFlow { using namespace Legion; +LLAMA::Config LLAMA::create_190m_config() { + Config config; + config.n_layers = 12; + config.vocab_size = 50265; + config.dim = 768; + config.n_heads = 12; + config.hidden_dim = 3072; + return config; +} + +LLAMA::Config LLAMA::create_7b_config() { + // The default config is for llama 7b + Config config; + return config; +} + +// Deprecated API void LLAMA::create_llama_model(FFModel &ff, InferenceManager &im, Config const &llama_config, int num_pipeline_stages, InferenceMode mode) { + assert(false); +} + +void LLAMA::create_llama_model(FFModel &ff, + InferenceManager &im, + std::string const &model_name, + std::string const &weight_file_path, + int num_pipeline_stages, + InferenceMode mode) { + Config llama_config; + if (model_name == "190m" || model_name == "190M") { + llama_config = create_190m_config(); + } else if (model_name == "7b" || model_name == "7B") { + llama_config = create_7b_config(); + } else { + assert(false && "Invalide model_name"); + } //------------------------------compute machine views ------------------ int num_devices = ff.config.workersPerNode * ff.config.numNodes; std::vector machine_views; @@ -40,11 +74,9 @@ void LLAMA::create_llama_model(FFModel &ff, std::unordered_map> mapping; std::unordered_map weights_layers; - std::cout << "print llama config: " << llama_config.input_path << "-->" - << llama_config.batchSize << std::endl; - Tensor input; { + assert(llama_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -61,10 +93,11 @@ void LLAMA::create_llama_model(FFModel &ff, Layer *embedding = ff.layers.back(); weights_layers.emplace("tok_embeddings_weight", embedding); + int num_transformer_layers = llama_config.n_layers; int num_transformer_layers_per_stage = - (32 + num_pipeline_stages - 1) / num_pipeline_stages; + (num_transformer_layers + num_pipeline_stages - 1) / num_pipeline_stages; - for (int i = 0; i < 1; i++) { + for (int i = 0; i < num_transformer_layers; i++) { // step 1: attention std::vector axes = {2}; Tensor att_norm = @@ -170,7 +203,7 @@ void LLAMA::create_llama_model(FFModel &ff, } // final normalization and linear 
std::vector axes = {2}; - token = ff.rms_norm(token, 1e-6, 4096); + token = ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); Layer *final_norm = ff.layers.back(); weights_layers.emplace("norm_weight", final_norm); @@ -189,8 +222,8 @@ void LLAMA::create_llama_model(FFModel &ff, // Compile the model std::cout << "------start compile ----------" << std::endl; im.compile_model_and_allocate_buffer(&ff, mapping); - FileDataLoader fileloader(llama_config.input_path, - llama_config.weight_file_path, + FileDataLoader fileloader("", + weight_file_path, llama_config.n_heads, llama_config.dim, llama_config.dim / llama_config.n_heads); diff --git a/examples/cpp/inference/models/llama.h b/inference/models/llama.h similarity index 65% rename from examples/cpp/inference/models/llama.h rename to inference/models/llama.h index 8c7d464936..e99beb92ca 100644 --- a/examples/cpp/inference/models/llama.h +++ b/inference/models/llama.h @@ -31,14 +31,12 @@ class LLAMA { dim = 4096; multiple_of = 256; norm_eps = 1e-6; - sentence_len = 347; - batchSize = 5; total_requests = 2560; incremental_mode = true; - sequence_length = BatchConfig::MAX_SEQ_LENGTH; - max_seq_len = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = 1; - max_beam_depth = 4; + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; // hidden dim hidden_dim = 4 * dim; @@ -47,18 +45,27 @@ class LLAMA { multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); } int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, - sentence_len, batchSize, total_requests, incremental_mode, - sequence_length, max_seq_len, max_beam_width, max_beam_depth; + total_requests, incremental_mode, max_seq_len, max_num_tokens, + max_beam_width, max_beam_depth; float norm_eps; std::string weight_file_path; std::string input_path; + std::string tokenizer_file_path; }; static void create_llama_model(FFModel &ff, InferenceManager &im, - Config const &llama_config, + std::string const &model_name, + std::string const &weight_file_path, int num_pipeline_stages, InferenceMode mode); + static void create_llama_model(FFModel &ff, + InferenceManager &im, + LLAMA::Config const &llama_config, + int num_pipeline_stages, + InferenceMode mode); + static Config create_190m_config(); + static Config create_7b_config(); }; }; // namespace FlexFlow diff --git a/examples/cpp/inference/models/opt.cc b/inference/models/opt.cc similarity index 100% rename from examples/cpp/inference/models/opt.cc rename to inference/models/opt.cc diff --git a/examples/cpp/inference/models/opt.h b/inference/models/opt.h similarity index 85% rename from examples/cpp/inference/models/opt.h rename to inference/models/opt.h index 11ae888eba..6ffc4220e9 100644 --- a/examples/cpp/inference/models/opt.h +++ b/inference/models/opt.h @@ -24,14 +24,14 @@ class OPT { public: struct Config { Config(void) { - vocab_size = 50272, word_embed_proj_dim = 768, hidden_size = 768; + vocab_size = 50272; + word_embed_proj_dim = 4096; + hidden_size = 4096; max_position_embeddings = 2048; layer_norm_elementwise_affine = true; - num_attention_heads = 12; + num_hidden_layers = 32; dropout = 0.1; - seed = 3; - ffn_dim = 3072; - num_hidden_layers = 12; + ffn_dim = 16384; max_beam_width = 1; batchSize = 8; sentence_len = 100; @@ -43,10 +43,10 @@ class OPT { int num_attention_heads; std::string input_path; std::string weight_file_path; + std::string 
tokenizer_assets_folder; int max_position_embeddings; bool layer_norm_elementwise_affine; float dropout; - unsigned long long seed; int ffn_dim; int num_hidden_layers; int max_beam_width; @@ -55,6 +55,16 @@ class OPT { int max_beam_depth; }; + struct Small_Config : public Config { + Small_Config(void) { + word_embed_proj_dim = 768; + hidden_size = 768; + num_attention_heads = 12; + ffn_dim = 3072; + num_hidden_layers = 12; + } + }; + static void create_opt_model(FFModel &ff, InferenceManager &im, Config const &opt_config, diff --git a/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt b/inference/spec_infer/CMakeLists.txt similarity index 79% rename from examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt rename to inference/spec_infer/CMakeLists.txt index f273a385ea..c374d47506 100644 --- a/examples/cpp/inference/SPEC_LLAMA/CMakeLists.txt +++ b/inference/spec_infer/CMakeLists.txt @@ -1,19 +1,19 @@ cmake_minimum_required(VERSION 3.10) -project(FlexFlowExample_SPEC_LLAMA) -set(project_target SPEC_LLAMA) +project(FlexFlow_SpecInfer) +set(project_target spec_infer) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} - llama.cc + spec_infer.cc ../file_loader.cc - ../models/llama.cc) - + ../models/llama.cc + ../models/opt.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") diff --git a/inference/spec_infer/MODEL_WEIGHTS.md b/inference/spec_infer/MODEL_WEIGHTS.md new file mode 100644 index 0000000000..79a194b159 --- /dev/null +++ b/inference/spec_infer/MODEL_WEIGHTS.md @@ -0,0 +1,27 @@ +To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. 
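+The snippet below is a minimal sketch rather than a turnkey script: it assumes the `transformers` and `torch` packages are installed, and it writes its output into a local `weights/` directory, which must exist before the final `tofile()` call. A possible setup (the `weights/` path is only an assumption chosen to match the snippet) is:
+
+```python
+# Assumed prerequisites for the conversion snippet below (hypothetical output path).
+import os
+from transformers import AutoModelForCausalLM  # loads the HuggingFace checkpoint
+
+os.makedirs("weights", exist_ok=True)  # ndarray.tofile() does not create missing directories
+```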
+ +```python +model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") + +for name, params in model.named_parameters(): + name = ( + name.replace(".", "_") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("o_proj", "wo") + .replace("mlp", "feed_forward") + .replace("gate_proj", "w1") + .replace("down_proj", "w2") + .replace("up_proj", "w3") + .replace("input_layernorm", "attention_norm") + .replace("post_attention_layernorm", "ffn_norm") + .replace("embed_tokens", "tok_embeddings") + .replace("lm_head", "output") + .replace("model_", "") + ) + params.detach().cpu().numpy().tofile('weights/' + name) +``` + diff --git a/examples/cpp/inference/SPEC_LLAMA/Makefile b/inference/spec_infer/Makefile similarity index 94% rename from examples/cpp/inference/SPEC_LLAMA/Makefile rename to inference/spec_infer/Makefile index 32e8e1cf3d..0e4b79f51f 100644 --- a/examples/cpp/inference/SPEC_LLAMA/Makefile +++ b/inference/spec_infer/Makefile @@ -23,10 +23,8 @@ USE_HDF ?= 1 # Include HDF5 support (requires HDF5) ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) # Put the binary file name here -OUTFILE ?= spec_llama +OUTFILE ?= llama_pipeline # List all the application source files here -GEN_SRC = llama.cc dataloader.cc -GEN_GPU_SRC = dataloader.cu ifndef CUDA_HOME CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) endif diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc new file mode 100644 index 0000000000..2068da7f3e --- /dev/null +++ b/inference/spec_infer/spec_infer.cc @@ -0,0 +1,151 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/tokenizers.h" +#include "models/llama.h" +#include + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string llm_weight_file_path; + std::vector ssm_weight_file_paths; + std::string prompt_file_path; + std::string tokenizer_file_path; +}; + +void parse_input_args(char **argv, int argc, FilePaths &paths) { + for (int i = 1; i < argc; i++) { + // weights + if (!strcmp(argv[i], "-llm-weight")) { + paths.llm_weight_file_path = std::string(argv[++i]); + continue; + } + // weights + if (!strcmp(argv[i], "-ssm-weight")) { + std::string file_path = std::string(argv[++i]); + paths.ssm_weight_file_paths.push_back(file_path); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // tokenizer + if (!strcmp(argv[i], "-tokenizer")) { + paths.tokenizer_file_path = std::string(argv[++i]); + continue; + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + FilePaths file_paths; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, file_paths); + SentencePieceTokenizer tokenizer(file_paths.tokenizer_file_path); + InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); + RequestManager rm(&tokenizer); + int total_num_requests = 0; + { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + rm.register_new_request(text, 128 /*max_sequence_length*/); + } + } + if (file_paths.ssm_weight_file_paths.size() == 0) { + assert(false && + "SpecInfer needs at least one SSM for speculative inference"); + } + + FFModel beam_model(ffconfig); + FFModel tree_model(ffconfig); + LLAMA::create_llama_model(beam_model, + im, + "190m", + file_paths.ssm_weight_file_paths[0], + 1, + BEAM_SEARCH_MODE); + LLAMA::create_llama_model(tree_model, + im, + "7b", + file_paths.llm_weight_file_path, + ffconfig.workersPerNode * ffconfig.numNodes, + TREE_VERIFY_MODE); + + TreeVerifyBatchConfig tree_bc; + BeamSearchBatchConfig beam_bc; + InferenceResult tree_ir; + + while (rm.get_num_processed_requests() < total_num_requests) { + int depth = 0; + // Beam Search + beam_bc = rm.prepare_next_batch_init(tree_bc, tree_ir); + if (rm.get_num_processed_requests() >= total_num_requests) { + break; + } + while (true) { + depth = beam_bc.beamRequestsInfo[0].current_depth; + FutureMap fm = im.inference(&beam_model, 0, beam_bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + BeamInferenceResult beam_ir = future.get_result(); + if (depth - 1 >= BeamSearchBatchConfig::MAX_BEAM_DEPTH) { + break; + } else { + beam_bc = rm.prepare_next_batch_beam(beam_bc, beam_ir); + } + } + // Token Tree Verification + { + tree_bc = rm.prepare_next_batch_verify(beam_bc); + FutureMap fm = im.inference(&tree_model, 0, tree_bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + tree_ir = 
future.get_result(); + } + } + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 6a49573538..16c14bdeca 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -344,8 +344,8 @@ BeamInferenceResult BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; - std::cout << "beam search topk inference: " - << "\n"; + // std::cout << "beam search topk inference: " + // << "\n"; BeamTopKMeta const *m = *((BeamTopKMeta **)task->local_args); Domain in1_domain = runtime->get_index_space_domain( @@ -378,8 +378,8 @@ BeamInferenceResult size_t tokens_per_request = in1_domain.hi()[1] - in1_domain.lo()[1] + 1; size_t batch_size = in1_domain.get_volume() / length; - std::cout << "beam search topk params: " << length << ", " << k << ", " - << batch_size << "\n"; + // std::cout << "beam search topk params: " << length << ", " << k << ", " + // << batch_size << "\n"; assert(out2_domain.get_volume() / k == batch_size); // std::vector beam_width; diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index b6fccb68ba..d18d5c2f00 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -23,6 +23,8 @@ using Legion::coord_t; enum class HeapType { kMinHeap, kMaxHeap }; enum class PreferIndices { kLower, kHigher }; +LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); + template struct Entry { int index; @@ -268,7 +270,8 @@ __device__ void mergeBeamShards(int num_shards, Entry *__restrict__ top_k_heap, T *top_k_values, int *top_k_indices, - int *top_k_parents) { + int *top_k_parents, + bool verbose) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. 
// If k > num_shards, we can initialize a min-heap with the top element from @@ -288,7 +291,7 @@ __device__ void mergeBeamShards(int num_shards, float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + ((slot % max_heap_size) / k)]; min_heap.assign(slot, {slot, (entries[slot].value * prob)}); - if (batch_index == 0) { + if (verbose && batch_index == 0) { printf("slot %d, value %.15f, prob %15f\n", slot, entries[slot].value, @@ -304,7 +307,7 @@ __device__ void mergeBeamShards(int num_shards, float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + ((shard % max_heap_size) / k)]; - if (batch_index == 0) { + if (verbose && batch_index == 0) { printf("shard %d, index %d, value %.15f, prob %.15f\n", shard, entry.index, @@ -397,7 +400,7 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, T *__restrict__ output, int *__restrict__ indices, int *__restrict__ parents, - bool is_print) { + bool verbose) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; // T const *batch_input = input + batch_index * length; @@ -408,7 +411,7 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, Entry *shared_entries = (Entry *)shared_memory; int sub_request_id = thread_index / k; - // if (is_print) { + // if (verbose) { // printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d, // " // "request_id %d, token_nums %d\n", @@ -422,7 +425,7 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, T const *batch_input = input + gpu_block_start_index[batch_index] + (sub_request_id * token_nums * length); - if (batch_index == 0) { + if (verbose && batch_index == 0) { printf("request 0 start index: thread index %d, offset %d, batch_input %p, " "acc index %d acc " "prob %f, thread_count %d, request_id %d\n", @@ -465,18 +468,18 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, if (thread_index == 0) { // merge beam_width heaps and store the parent // find which req it belongs to, replace the offset - printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", - batch_index, - sub_request_id, - acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - sub_request_id]); + // printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", + // batch_index, + // sub_request_id, + // acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + // sub_request_id]); int const offset = batch_index * k; auto batch_output = output + offset; auto batch_indices = indices + offset; auto batch_parents = parents + offset; Entry *top_k_heap = shared_entries + thread_count * k; - // if(batch_index == 0 && is_print){ + // if(batch_index == 0 && verbose) { // for(int i = 0; i < 18; i++){ // printf("see value: %.15f\n", shared_entries[i].value); // } @@ -494,7 +497,8 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, top_k_heap, batch_output, batch_indices, - batch_parents); + batch_parents, + verbose /*verbose prints*/); } } @@ -529,8 +533,6 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, std::vector tokens_per_request; int block_start_index = 0; - int depth = - bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; // a data structure for prob, parent_id, int max_total_requests = @@ -551,7 +553,8 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int beam_size = bc->beamRequestsInfo[i].beam_size; // initial request - std::cout << "sub_requests: " << i << ", " << sub_requests[i] << "\n"; + log_beam_topk.debug() << 
"sub_requests: " << i << ", " << sub_requests[i] + << "\n"; assert(sub_requests[i] > 0); // process sub requests for (int j = 0; j < sub_requests[i]; j++) { @@ -559,12 +562,12 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, // beam_slots[i].parent_id[j]; acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = bc->beamRequestsInfo[i].probs[j]; - std::cout << "probbbb req: " << i << ", sub req probability : " - << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << j - << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] - << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j - << "\n"; + log_beam_topk.debug() + << "probbbb req: " << i + << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j] + << ", sub request id " << j << ", parent id " + << bc->beamRequestsInfo[i].parent_id[j] << ", data inddd" + << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n"; } // process tokens @@ -581,8 +584,8 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, req_index += 1; block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; } - std::cout << "what index: " << block_start_index - << ", block num: " << beam_num_blocks << "\n"; + log_beam_topk.debug() << "what index: " << block_start_index + << ", block num: " << beam_num_blocks << "\n"; assert(batch_size >= beam_num_blocks); assert(bc->num_active_requests() == req_index); @@ -596,9 +599,9 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, if (num_shards > CUDA_NUM_THREADS) { num_shards = CUDA_NUM_THREADS; } - std::cout << "maxheap size: " << max_heap_size << "\n"; - std::cout << "maxbeam width: " << max_beam_width - << ", heap size: " << heap_size << "\n"; + log_beam_topk.debug() << "maxheap size: " << max_heap_size << "\n"; + log_beam_topk.debug() << "maxbeam width: " << max_beam_width + << ", heap size: " << heap_size << "\n"; } // We are limited by the amount of shared memory we have per block. 
size_t shared_memory_size = @@ -627,7 +630,8 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, tokens_per_request.data(), sizeof(int) * beam_num_blocks, cudaMemcpyHostToDevice)); - + // int depth = + // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; beam_topk_forward_kernel<<>>( input_ptr, shared_memory_size, @@ -643,7 +647,8 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, output_ptr, indices_ptr, parent_ptr, - depth == 1); + false /*verbose*/ // depth == 1 + ); // merge sub } @@ -698,10 +703,13 @@ BeamTopKMeta::BeamTopKMeta(FFHandler handler) : OpMeta(handler) { sizeof(float) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); checkCUDA(cudaMalloc(&block_start_index, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); checkCUDA(cudaMalloc(&request_id, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); checkCUDA(cudaMalloc(&tokens_per_request, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); } }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index bc96e2a587..a0f0c34c1b 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -48,6 +48,8 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; +LegionRuntime::Logger::Category log_inc_mha("IncrementalMHA"); + bool IncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { bool is_valid = input.is_valid(); @@ -589,9 +591,9 @@ FutureMap IncMultiHeadSelfAttention::inference( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); int idx = 0; - printf("BatchConfig, num_tokens: %d, num_requests: %d\n", - bc.num_tokens, - bc.num_active_requests()); + log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc.num_tokens, + bc.num_active_requests()); IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, TaskArgument(&bc, sizeof(BatchConfig)), diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index ec13569665..ad4e59b710 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -769,8 +769,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( // here because we need postion info in infernece 1 cudaMemcpyAsync(m->tokenInfos, &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * - sizeof(BatchConfig::PerTokenInfo), + bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(m->requestInfos, @@ -880,8 +879,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; // size_t token2ids_size = BatchConfig::MAX_NUM_TOKENS; - size_t tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + size_t tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS; size_t beam_tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -906,9 +904,9 @@ 
SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( beam_tokeninfo_size * sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + beam_requestinfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo) + - complex_size * sizeof(cuFloatComplex); // more components will - // be added here later + sizeof(BeamSearchBatchConfig:: + BeamSearchPerRequestInfo); // more components will + // be added here later Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(totalSize - 1)); @@ -941,8 +939,8 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( qk_prods_softmax = (float *)(qk_prods + qk_prod_size); attn_heads = (float *)qk_prods_softmax + qk_prod_size; W_out_contiguous = (float *)attn_heads + attn_heads_size; - complex_input = - (cuFloatComplex *)(W_out_contiguous + W_out_contiguous_size); + checkCUDA( + cudaMalloc(&complex_input, complex_size * sizeof(cuFloatComplex))); int parallelism = vProjSize * oProjSize * num_heads; spec_build_w_out_tensor<<hash(); int idx = 0; - printf("TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d\n", - bc.num_tokens, - bc.num_active_requests()); + log_tree_verify.debug( + "TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d", + bc.num_tokens, + bc.num_active_requests()); IndexLauncher launcher(TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, TaskArgument(&bc, sizeof(TreeVerifyBatchConfig)), diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index c2864cc181..0cc3cf9191 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -897,8 +897,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( qk_prods_softmax = (float *)(qk_prods + qk_prod_size); attn_heads = (float *)qk_prods_softmax + qk_prod_size; W_out_contiguous = (float *)attn_heads + attn_heads_size; - complex_input = - (cuFloatComplex *)(W_out_contiguous + W_out_contiguous_size); + checkCUDA( + cudaMalloc(&complex_input, complex_size * sizeof(cuFloatComplex))); int parallelism = vProjSize * oProjSize * num_heads; tree_build_w_out_tensor<< bs; - for (auto i = uint64_t(L'!'); i < uint64_t(L'~') + 1; ++i) { + std::vector bs; + for (auto i = uint32_t(L'!'); i < uint32_t(L'~') + 1; ++i) { bs.push_back(i); } - for (auto i = uint64_t(L'¡'); i < uint64_t(L'¬') + 1; ++i) { + for (auto i = uint32_t(L'¡'); i < uint32_t(L'¬') + 1; ++i) { bs.push_back(i); } - for (auto i = uint64_t(L'®'); i < uint64_t(L'ÿ') + 1; ++i) { + for (auto i = uint32_t(L'®'); i < uint32_t(L'ÿ') + 1; ++i) { bs.push_back(i); } - std::vector cs = bs; - uint64_t n = 0; - for (uint64_t b = 0; b < 256; ++b) { + std::vector cs = bs; + uint32_t n = 0; + for (uint32_t b = 0; b < 256; ++b) { auto p = find(bs.begin(), bs.end(), b); if (p == bs.end()) { bs.push_back(b); @@ -118,7 +118,7 @@ void GPT_Tokenizer::load_vocab(std::string const &vocab_file) { /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - auto vocab_ = vocab_data_.get>(); + auto vocab_ = vocab_data_.get>(); for (auto item : vocab_) { vocab.insert({item.first, item.second}); inverse_vocab.insert({item.second, item.first}); @@ -240,7 +240,7 @@ std::vector GPT_Tokenizer::tokenize(std::string str) { return bpe_tokens; } -int64_t GPT_Tokenizer::convert_token_to_id(std::string token) { +int32_t GPT_Tokenizer::convert_token_to_id(std::string token) { auto p = vocab.find(token); if (p != vocab.end()) { return vocab[token]; @@ -251,8 +251,8 
@@ int64_t GPT_Tokenizer::convert_token_to_id(std::string token) { void GPT_Tokenizer::encode(std::string str, size_t max_length, - std::vector *input_ids, - std::vector *mask_ids) { + std::vector *input_ids, + std::vector *mask_ids) { if (not input_ids->empty()) { input_ids->clear(); } @@ -283,8 +283,8 @@ void GPT_Tokenizer::encode(std::string str, } } -std::string GPT_Tokenizer::decode(std::vector input_ids, - std::vector mask_ids) { +std::string GPT_Tokenizer::decode(std::vector input_ids, + std::vector mask_ids) { // look up each number in encoder.json dictionary std::ostringstream oss; int index = 0; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 63a5bb6540..fd35b9aa76 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -25,6 +25,8 @@ namespace FlexFlow { using namespace Legion; +LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); + InferenceManager::InferenceManager(FFConfig const &_config, int _max_num_tokens_per_batch, int _max_num_inflight_batches) @@ -197,10 +199,10 @@ MachineView *InferenceManager::get_machine_view(int mv_id) { FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfig const &bc) { - std::cout << "InferenceManager::inference" << index << std::endl; - std::cout << "num_active_tokens = " << bc.num_active_tokens() - << ", num_active_requests = " << bc.num_active_requests() - << std::endl; + log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", + bc.get_mode(), + bc.num_active_tokens(), + bc.num_active_requests()); assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 562b09e411..d7c18ebf94 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3224,22 +3224,22 @@ void FFModel::compile(LossType loss_type, assert(final_operator->numOutputs == 1); for (size_t i = 0; i < operators.size(); i++) { Op *op = operators[i]; - printf("operator[%zu]: type(%d)\n", i, operators[i]->op_type); + log_model.print("operator[%zu]: type(%d)", i, operators[i]->op_type); for (int j = 0; j < op->numInputs; j++) { LogicalRegion handle = op->inputs[j]->region; - printf("\tinputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_model.print("\tinputs[%d] region(%d,%d,%d)", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = op->outputs[j]->region; - printf("\toutputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_model.print("\toutputs[%d] region(%d,%d,%d)", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } } // assert(final_operator->outputs[0].num_dims == 2); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5b256d5bb7..ac038147d1 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -15,6 +15,7 @@ #include "flexflow/inference.h" #include "flexflow/parallel_ops/parallel_op.h" +#include "flexflow/tokenizers.h" #include #include @@ -25,7 +26,12 @@ using namespace Legion; LegionRuntime::Logger::Category log_req_mgr("RequestManager"); RequestManager::RequestManager() - : next_available_guid(1000000), 
num_processed_requests(0) {} + : tokenizer(nullptr), verbose(false), next_available_guid(1000000), + num_processed_requests(0) {} + +RequestManager::RequestManager(Tokenizer *_tokenizer, bool _verbose) + : tokenizer(_tokenizer), verbose(_verbose), next_available_guid(1000000), + num_processed_requests(0) {} RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, @@ -41,7 +47,36 @@ RequestManager::RequestGuid pending_request_queue.push(request); - std::cout << "new req: " << request.tokens.size() << std::endl; + if (verbose) { + std::cout << "new req: " << request.tokens.size() << std::endl; + for (int i = 0; i < request.tokens.size(); i++) { + std::cout << i << " : " << request.tokens[i] << std::endl; + } + } + return request.guid; +} + +RequestManager::RequestGuid + RequestManager::register_new_request(std::string const &prompt, + int max_sequence_length) { + const std::lock_guard lock(request_queue_mutex); + + // Add a new request + Request request; + request.guid = next_available_guid++; + request.max_sequence_length = max_sequence_length; + request.tokens.push_back(tokenizer->bos_token_id); + std::vector tokens = tokenizer->Encode(prompt); + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); + request.initial_len = request.tokens.size(); + + pending_request_queue.push(request); + if (verbose) { + std::cout << "new req: " << request.tokens.size() << std::endl; + for (int i = 0; i < request.tokens.size(); i++) { + std::cout << i << " : " << request.tokens[i] << std::endl; + } + } return request.guid; } @@ -64,8 +99,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == request.tokens.size()); // This is a decoding token - std::cout << "token is: " << result.token_ids[i]; + log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); + std::string output = tokenizer->Decode(request.tokens); + log_req_mgr.print("Output: %s", output.c_str()); } } // Step 2: preparing the next batch for existing requests @@ -86,10 +123,26 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", old_bc.requestsInfo[i].request_guid, request.tokens.size()); - std::cout << "print results: " << std::endl; - for (int i = 0; i < request.tokens.size(); i++) { - std::cout << request.tokens.at(i) << ", "; - } + std::string output = tokenizer->Decode(request.tokens); + log_req_mgr.print("Final output: %s", output.c_str()); + num_processed_requests++; + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf) acc_latency(%.1lf)", + request.guid, + profile_info.decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time, + total_request_run_time); + // std::cout << "print results: " << std::endl; + // for (int i = 0; i < request.tokens.size(); i++) { + // std::cout << request.tokens.at(i) << ", "; + // } } else { new_bc.request_completed[i] = false; new_bc.requestsInfo[i].token_start_offset = processed_tokens; @@ -115,6 +168,8 @@ BatchConfig 
RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; new_bc.num_tokens++; } + // Update profiling + profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps++; } } // Step 3: add new requests to the next batch @@ -133,6 +188,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + // add profile_info for the new request + ProfileInfo profile_info; + profile_info.decoding_steps = 1; + profile_info.start_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[new_request.guid] = profile_info; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].token_start_offset + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -159,13 +219,15 @@ BeamSearchBatchConfig BeamInferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - std::cout << "print all results" - << "\n"; - for (int i = 0; i < 40; i++) { - std::cout << result.token_ids[i] << ", "; + if (verbose) { + std::cout << "print all results" + << "\n"; + for (int i = 0; i < 40; i++) { + std::cout << result.token_ids[i] << ", "; + } + std::cout << "Current Beam Depth: " + << old_bc.beamRequestsInfo[0].current_depth << "\n"; } - std::cout << "Current Beam Depth: " - << old_bc.beamRequestsInfo[0].current_depth << "\n"; // Step 1: Store result to the beam tree struct store_beam_metadata(old_bc, result); @@ -184,7 +246,7 @@ BeamSearchBatchConfig old_bc.requestsInfo[i].num_tokens_in_batch; // assert(processed_tokens < request.tokens.size()); - std::cout << "\nprocessed_tokens: " << processed_tokens << "\n"; + log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; if (processed_tokens > old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() // || ir.results[t] == 0 TODO: replace this with @@ -199,8 +261,8 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; } else { - std::cout << "num tokens: " << old_bc.num_tokens << ", " - << new_bc.num_tokens; + log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " + << new_bc.num_tokens; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].token_start_offset = processed_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; @@ -218,7 +280,7 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].beam_size; // do the slot exchange to minimize the cache exchange in kernel. 
- std::cout << "update metadata" << std::endl; + // std::cout << "update metadata" << std::endl; update_beam_metadata(new_bc, beam_trees[i], i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= @@ -263,8 +325,6 @@ BeamSearchBatchConfig new_bc.num_tokens = 0; int result_index = 0; - std::cout << "11111111" << std::endl; - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { if (old_bc.request_completed[i]) { continue; @@ -272,16 +332,12 @@ BeamSearchBatchConfig size_t guid = old_bc.requestsInfo[i].request_guid; Request &request = running_request_queue[guid]; - printf("req %d\n", i); - // Verify this: get verified tokens from result std::vector> tree_outputs = std::vector>(); assert(old_bc.num_tokens > 0); - std::cout << "222222222" << std::endl; - int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; if (committed_tokens.find(guid) == committed_tokens.end()) { committed_tokens[guid] = std::vector>(); @@ -301,12 +357,14 @@ BeamSearchBatchConfig std::make_pair(old_bc.tokensInfo[result_index].abs_depth_in_request, result_index)); - std::cout << "Index with old_bacth: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - old_bc.tokensInfo[result_index].abs_depth_in_request, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - tree_outputs.back().first); + if (verbose) { + std::cout << "Index with old_bacth: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + old_bc.tokensInfo[result_index].abs_depth_in_request, + old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, + tree_outputs.back().first); + } // std::cout << " Input: " << old_bc.tokensInfo[result_index].token_id // << "" // << old_bc.tokensInfo[result_index].abs_depth_in_request << @@ -319,11 +377,10 @@ BeamSearchBatchConfig result_index++; } - std::cout << "333333333333" << std::endl; - std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); - + log_req_mgr.print("Number of Verified Tokens = %zu", + verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -335,8 +392,21 @@ BeamSearchBatchConfig log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", request.guid, request.tokens.size()); - new_bc.request_completed[i] = true; + num_processed_requests++; + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf) acc_latency(%.1lf)", + request.guid, + profile_info.decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time, + total_request_run_time); beam_trees[i] = BeamTree{}; dfs_tree_inputs.erase( @@ -387,6 +457,8 @@ BeamSearchBatchConfig break; } } + std::string output = tokenizer->Decode(request.tokens); + log_req_mgr.print("Output: %s", output.c_str()); } // Step 2: Initialize new request @@ -404,7 +476,11 @@ BeamSearchBatchConfig (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; - + // add profile_info for the new request + ProfileInfo profile_info; + profile_info.decoding_steps = 0; + profile_info.start_time = 
Realm::Clock::current_time_in_microseconds(); + profiling_requests[new_request.guid] = profile_info; // init the beam search metadata per request new_bc.beamRequestsInfo[i].beam_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -458,10 +534,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector> dfs_tree_inputs = traverse_beam_tree(old_bc, i, request.tokens.size() - 1); - std::cout << "11111" << std::endl; - std::cout << "Request Tokens Size: " << request.tokens.size() << std::endl; - for (int k = 0; k < request.tokens.size(); k++) { - std::cout << k << ": " << request.tokens[k] << std::endl; + if (verbose) { + std::cout << "Request Tokens Size: " << request.tokens.size() + << std::endl; + for (int k = 0; k < request.tokens.size(); k++) { + std::cout << k << ": " << request.tokens[k] << std::endl; + } } // Normal Request Info @@ -471,9 +549,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_bc.requestsInfo[i].max_sequence_length; // TODO: Check this new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.request_completed[i] = false; + // Profiling + profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; // TODO: Add prompt token first in first verify iteration if (request.tokens.size() == request.initial_len) { for (int j = 0; j < request.initial_len; j++) { @@ -508,8 +587,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; } - std::cout << "dfs_tree_inputs.size(): " << dfs_tree_inputs.size() - << std::endl; + if (verbose) { + std::cout << "dfs_tree_inputs.size(): " << dfs_tree_inputs.size() + << std::endl; + } // add prompt to the dfs tree if (committed_tokens.find(guid) != committed_tokens.end()) { @@ -524,9 +605,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = j; new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = j; - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " << j - << ", token_index: " << j << std::endl; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " << j + << ", token_index: " << j << std::endl; + } new_bc.num_tokens_to_commit++; } } else { @@ -537,31 +620,37 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " << committed_token.first - << ", token_index: " << committed_token.second << std::endl; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second << std::endl; + } new_bc.num_tokens_to_commit++; } - - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; + if (verbose) { + std::cout << "new_bc.num_tokens_to_commit: " + << new_bc.num_tokens_to_commit << std::endl; + } } // Token Info for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); - - std::cout << "[" << j << "] Token: " << token.first - << ", Depth:" << token.second << std::endl; - + if (verbose) { + std::cout << "[" << j << "] Token: " << token.first + << ", 
Depth:" << token.second << std::endl; + } // Normal Token Info new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; // TODO: Add committed token info - std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit - << std::endl; + if (verbose) { + std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit + << std::endl; + } if (committed_tokens.find(guid) != committed_tokens.end()) { // if (j == 1) { @@ -582,15 +671,20 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second << std::endl; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second + << std::endl; + } new_bc.num_tokens_to_commit++; } } - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; + if (verbose) { + std::cout << "new_bc.num_tokens_to_commit: " + << new_bc.num_tokens_to_commit << std::endl; + } new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; @@ -615,8 +709,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request; int result_index = 0; - std::cout << "Store total of " << old_bc.num_tokens - << " tokens in the current batch.\n"; + if (verbose) { + std::cout << "Store total of " << old_bc.num_tokens + << " tokens in the current batch.\n"; + } for (int i = 0; i <= old_bc.num_tokens; i++) { int request_index = old_bc.tokensInfo[i].request_index; @@ -634,24 +730,29 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_width; - std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] << "\n"; - + if (verbose) { + std::cout << "i = " << i << ", result index = " << result_index + << ", value: " << result.token_ids[result_index] << "\n"; + } int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; if (depth == 1) { // store the last input into the tree; - std::cout << "try to store the input" - << "\n"; + if (verbose) { + std::cout << "try to store the input" + << "\n"; + } Request &request = running_request_queue[old_bc.requestsInfo[index].request_guid]; beam_trees[index].treeLayers[0].tokens[0] = request.tokens.back(); beam_trees[index].treeLayers[0].probs[0] = 1; beam_trees[index].treeLayers[0].parent_ids[0] = -1; - std::cout << "Store the previous last token to the tree root: " - << request.tokens.back() << "\n"; + if (verbose) { + std::cout << "Store the previous last token to the tree root: " + << request.tokens.back() << "\n"; + } } for (int beam_id = 0; beam_id < beam_width; beam_id++) { @@ -662,9 +763,11 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, beam_trees[index].treeLayers[depth].parent_ids[beam_id] = result.parent_id[result_index]; - std::cout << "tree value: " << depth << 
"token: " - << beam_trees[index].treeLayers[depth].tokens[beam_id] - << "result tokens: " << result.token_ids[result_index]; + if (verbose) { + std::cout << "tree value: " << depth << "token: " + << beam_trees[index].treeLayers[depth].tokens[beam_id] + << "result tokens: " << result.token_ids[result_index]; + } result_index += 1; } @@ -733,8 +836,10 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, if (parents.find(j) == parents.end()) { // this slot has not been assigned // find the smallest not assigned child and put in - std::cout << "request_index" << request_index << ", miss slot: " << j - << "\n"; + if (verbose) { + std::cout << "request_index" << request_index + << ", miss slot: " << j << "\n"; + } for (int k = 0; k < beam_size; k++) { if (childs.find(k) == childs.end()) { // parent -> j to child k; @@ -753,32 +858,37 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, } } } - std::cout << "-----------after parent id exchange-----------" << std::endl; - for (int j = 0; j < beam_size; j++) { - std::cout << "after request id: " << request_index << "beam id = " << j - << "parnt: " - << new_bc.beamRequestsInfo[request_index].parent_id[j] - << "token: " << new_bc.beamRequestsInfo[request_index].tokens[j] - << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j] - << std::endl; + if (verbose) { + std::cout << "-----------after parent id exchange-----------" << std::endl; + for (int j = 0; j < beam_size; j++) { + std::cout << "after request id: " << request_index << "beam id = " << j + << "parnt: " + << new_bc.beamRequestsInfo[request_index].parent_id[j] + << "token: " << new_bc.beamRequestsInfo[request_index].tokens[j] + << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j] + << std::endl; + } } } -bool PreOrder(BeamTree const &tree, - int max_depth, - int current_depth, - int beam_width, - int id, - std::vector> - &serializedTree) { +bool PreOrder( + BeamTree const &tree, + int max_depth, + int current_depth, + int beam_width, + int id, + std::vector> &serializedTree, + bool verbose) { // terminate if (current_depth >= max_depth) { serializedTree.push_back(std::make_pair( tree.treeLayers[current_depth].tokens[id], current_depth)); - std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id] - << "\n"; - std::cout << "return true" - << "\n"; + if (verbose) { + std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id] + << "\n"; + std::cout << "return true" + << "\n"; + } return true; } @@ -786,8 +896,10 @@ bool PreOrder(BeamTree const &tree, // std::cout<<"node: " << current_depth << ", id: " << serializedTree.push_back( std::make_pair(tree.treeLayers[current_depth].tokens[id], current_depth)); - std::cout << "push something: " << tree.treeLayers[current_depth].tokens[id] - << ", " << current_depth << std::endl; + if (verbose) { + std::cout << "push something: " << tree.treeLayers[current_depth].tokens[id] + << ", " << current_depth << std::endl; + } int index = serializedTree.size() - 1; int next_layers = current_depth + 1; @@ -799,14 +911,17 @@ bool PreOrder(BeamTree const &tree, // for all childs, do preOrder if (child_parent == id) { - std::cout << "current depth: " << current_depth << ", child_parent, " - << child_parent << ", child_id, " << child_id << "\n"; + if (verbose) { + std::cout << "current depth: " << current_depth << ", child_parent, " + << child_parent << ", child_id, " << child_id << "\n"; + } bool res = PreOrder(tree, max_depth, current_depth + 1, beam_width, child_id, - 
serializedTree); + serializedTree, + verbose); flag = flag || res; } } @@ -820,6 +935,7 @@ bool PreOrder(BeamTree const &tree, return flag; } +#ifdef DEADCODE TreeVerifyBatchConfig RequestManager::convert_beam_to_tree_batch_config( BeamSearchBatchConfig const &beam_bc) { TreeVerifyBatchConfig tree_bc; @@ -838,7 +954,8 @@ TreeVerifyBatchConfig RequestManager::convert_beam_to_tree_batch_config( 0, beam_bc.beamRequestsInfo[i].beam_size, 0, - serializedTree); + serializedTree, + verbose); tree_bc.requestsInfo[i].request_guid = beam_bc.requestsInfo[i].request_guid; tree_bc.requestsInfo[i].max_sequence_length = beam_bc.requestsInfo[i].max_sequence_length; @@ -857,6 +974,7 @@ TreeVerifyBatchConfig RequestManager::convert_beam_to_tree_batch_config( } return tree_bc; } +#endif std::vector> RequestManager::traverse_verify_tree( @@ -870,20 +988,21 @@ std::vector> std::vector> new_committed_tokens = std::vector>(); - std::cout << "Input size: " << inputSerializedTree.size() << std::endl; - std::cout << "Output size: " << outputSerializedTree.size() << std::endl; + log_req_mgr.print("Input size (%zu) Output size (%zu)", + inputSerializedTree.size(), + outputSerializedTree.size()); - std::cout << "========Input============" << std::endl; + log_req_mgr.print("========Input============"); for (auto const &pair : inputSerializedTree) { - std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + log_req_mgr.print("(%d, %d)", pair.first, pair.second); } - std::cout << "========Output============" << std::endl; + log_req_mgr.print("========Output============"); for (auto const &pair : outputSerializedTree) { - std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + log_req_mgr.print("(%d, %d)", pair.first, pair.second); } - std::cout << "========Committed============" << std::endl; + log_req_mgr.print("========Committed============"); for (auto const &pair : committed_tokens.at(guid)) { - std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + log_req_mgr.print("(%d, %d)", pair.first, pair.second); } assert(inputSerializedTree.size() == outputSerializedTree.size()); @@ -898,9 +1017,9 @@ std::vector> input.second, committed_tokens.at(guid).at(i).second)); // - std::cout << committed_tokens.at(guid).at(i).first << ", " - << committed_tokens.at(guid).at(i).second << std::endl; - std::cout << input.first << ", " << input.second << std::endl; + // std::cout << committed_tokens.at(guid).at(i).first << ", " + // << committed_tokens.at(guid).at(i).second << std::endl; + // std::cout << input.first << ", " << input.second << std::endl; assert(committed_tokens.at(guid).at(i).first == input.second); continue; @@ -917,14 +1036,14 @@ std::vector> } } committed_tokens[guid] = new_committed_tokens; - std::cout << "========Verified============" << std::endl; + log_req_mgr.print("========Verified============"); for (auto const &pair : verifiedTree) { - std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + log_req_mgr.print("(%d, %d)", pair.first, pair.second); } - std::cout << "========New Committed============" << std::endl; + log_req_mgr.print("========New Committed============"); for (auto const &pair : committed_tokens.at(guid)) { - std::cout << "(" << pair.first << ", " << pair.second << ")" << std::endl; + log_req_mgr.print("(%d, %d)", pair.first, pair.second); } return verifiedTree; @@ -934,14 +1053,16 @@ std::vector> RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, int token_start_offset) { - - 
std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; - std::cout << "[Traverse Beam Tree] max_depth: " - << old_bc.beamRequestsInfo[request_index].max_depth << "\n"; - std::cout << "[Traverse Beam Tree] current_depth: " - << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; - std::cout << "[Traverse Beam Tree] beam_width: " - << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; + if (verbose) { + std::cout << "[Traverse Beam Tree] request_index: " << request_index + << "\n"; + std::cout << "[Traverse Beam Tree] max_depth: " + << old_bc.beamRequestsInfo[request_index].max_depth << "\n"; + std::cout << "[Traverse Beam Tree] current_depth: " + << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; + std::cout << "[Traverse Beam Tree] beam_width: " + << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; + } BeamTree tree = beam_trees[request_index]; // token, index @@ -952,18 +1073,23 @@ std::vector> 0, old_bc.beamRequestsInfo[request_index].beam_size, 0, - serializedTree); + serializedTree, + verbose); // print it - std::cout << "Print serialized tree, " << request_index << "\n"; - std::cout << serializedTree.size() << "\n"; + if (verbose) { + std::cout << "Print serialized tree: size:" << request_index + << serializedTree.size() << "\n"; + } for (int k = 0; k < serializedTree.size(); k++) { serializedTree.at(k).second += token_start_offset; - std::cout << "token id: " << serializedTree.at(k).first - << ", depth: " << serializedTree.at(k).second << "\n"; + if (verbose) { + std::cout << "token id: " << serializedTree.at(k).first + << ", depth: " << serializedTree.at(k).second << "\n"; + } } - std::cout << "Done printing serialized tree, " - << old_bc.requestsInfo[request_index].request_guid << "\n"; + // std::cout << "Done printing serialized tree, " + // << old_bc.requestsInfo[request_index].request_guid << "\n"; if (dfs_tree_inputs.find(old_bc.requestsInfo[request_index].request_guid) != dfs_tree_inputs.end()) { diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc new file mode 100644 index 0000000000..e50a0e06fd --- /dev/null +++ b/src/runtime/tree_verify_batch_config.cc @@ -0,0 +1,33 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/batch_config.h" +#include "legion.h" +#include +#include + +namespace FlexFlow { + +LegionRuntime::Logger::Category log_tree_bc("TreeVerifyBatchConfig"); + +TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {} + +TreeVerifyBatchConfig::~TreeVerifyBatchConfig() {} + +BatchConfig::Mode TreeVerifyBatchConfig::get_mode() const { + return TREE_VERIFY_MODE; +} + +}; // namespace FlexFlow diff --git a/examples/cpp/inference/gpt_tokenizer.cpp b/tests/gpt_tokenizer.cpp similarity index 97% rename from examples/cpp/inference/gpt_tokenizer.cpp rename to tests/gpt_tokenizer.cpp index 8712f0e6b6..8ddfa75e1c 100644 --- a/examples/cpp/inference/gpt_tokenizer.cpp +++ b/tests/gpt_tokenizer.cpp @@ -49,8 +49,8 @@ int main(int argc, char *argv[]) { lines.push_back(line); } - std::vector input_ids; - std::vector mask_ids; + std::vector input_ids; + std::vector mask_ids; for (auto l = lines.begin(); l != lines.end(); ++l) { std::string stripped_line = tokenizer.strip(*l); if (stripped_line.length() == 0) { diff --git a/examples/cpp/inference/gpt_tokenizer_test.sh b/tests/gpt_tokenizer_test.sh similarity index 96% rename from examples/cpp/inference/gpt_tokenizer_test.sh rename to tests/gpt_tokenizer_test.sh index b336dd05ff..6134d4e592 100755 --- a/examples/cpp/inference/gpt_tokenizer_test.sh +++ b/tests/gpt_tokenizer_test.sh @@ -13,7 +13,7 @@ cd "${BASH_SOURCE[0]%/*}" cleanup # Compile the FlexFlow C++ tokenizer stand-alone -g++ -std=c++11 -I../../../deps/json/include -o gpt_tokenizer gpt_tokenizer.cpp gpt_tokenizer.cc +g++ -std=c++11 -I../deps/json/include -I../include -o gpt_tokenizer gpt_tokenizer.cpp ../src/runtime/gpt_tokenizer.cc chmod +x gpt_tokenizer # Download and inflate wikitext dataset From 16a5d025e5e895a5cad8e81de83c03f8595f6db5 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 15 May 2023 21:07:17 -0700 Subject: [PATCH 129/344] Update README.md --- .github/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/README.md b/.github/README.md index fdc52c53c4..bf734d8001 100644 --- a/.github/README.md +++ b/.github/README.md @@ -1,4 +1,4 @@ -# SpecInfer +# SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification ![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)

From b9fd233cecb4af6e3b128e04ab9d7bce5357a6c6 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 16 May 2023 14:53:43 -0500 Subject: [PATCH 130/344] Uses data and pipeline parallel by default. (#729) * Support multiple FFModels in a single top_level_task * [TreeVerifyMHA] bug fixes * bug fixes * TreeIncMHA and SpecIncMHA bug fixes * fomat. * . * add sentence piece tokenizer * format * prepare spec_infer demo * prettier prints * make the llama model work * add small model config * enable speculative inference for spec_infer * fix * rename * fix one of the bugs * fix * del * attempt to fix ci * integrated gpt/opt tokenizer * integrate opt tokenizer with pipeline * . * format * move files * Update README.md * add an overview figure * update images * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * add tokenizer in readme * fix * fix * fix * Update README.md * Update README.md * add gif * add weights to readme, clean some print * Update README.md * update demo * Update README.md * Update README.md * remove outdate file * Update README.md * Update README.md * . * use data parallel by default --------- Co-authored-by: xinhaoc Co-authored-by: Gabriele Oliaro Co-authored-by: xinhaoc <99570243+xinhaoc@users.noreply.github.com> --- src/runtime/model.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index d7c18ebf94..b27d2f3421 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3644,7 +3644,7 @@ struct DefaultConfig { (size_t)2 * 1024 * 1024 * 1024; // 2GB constexpr static float searchAlpha = 1.2f; const static bool searchOverlapBackwardUpdate = false; - const static bool onlyDataParallel = false; + const static bool onlyDataParallel = true; const static bool enableSampleParallel = true; const static bool enableParameterParallel = false; const static bool enableAttributeParallel = false; From b8e5586ca72ad523c6dd8872eda5d1416244ae9a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 16 May 2023 12:54:48 -0700 Subject: [PATCH 131/344] Update README.md --- .github/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/README.md b/.github/README.md index bf734d8001..dc23d2e989 100644 --- a/.github/README.md +++ b/.github/README.md @@ -92,7 +92,7 @@ SpecInfer is under active development. We currently focus on the following tasks ## Acknowledgements This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting SpecInfer and the underlying FlexFlow runtime system. The following paper describes design, implementation, and key optimizations of SpecInfer. -* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](). +* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://www.cs.cmu.edu/~zhihaoj2/papers/specinfer.pdf). ## License Both SpecInfer and FlexFlow use Apache License 2.0. 
From 0aabf34f2492a3665ee7df815ecf3d2778bfca94 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 18 May 2023 15:34:30 +0800 Subject: [PATCH 132/344] fix make build, edit cmake --- CMakeLists.txt | 9 --------- FlexFlow.mk | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f9e78aff12..e758eb577f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,19 +186,10 @@ set(NVCC_FLAGS $ENV{NVCC_FLAGS}) set(LD_FLAGS $ENV{LD_FLAGS}) # Set global FLAGS -if(INFERENCE_TESTS) list(APPEND CC_FLAGS -std=c++17) - list(APPEND NVCC_FLAGS -std=c++17) -else() -list(APPEND CC_FLAGS - -std=c++17) - -list(APPEND NVCC_FLAGS - -std=c++17) -endif() add_compile_options(${CC_FLAGS}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS}) diff --git a/FlexFlow.mk b/FlexFlow.mk index 4d63ec83d4..2f3029b434 100644 --- a/FlexFlow.mk +++ b/FlexFlow.mk @@ -87,7 +87,7 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1) endif -INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include +INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/sentencepiece/src CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 From 427d6027d2bb0df199a3d9704f1d633ec2c665b0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 18 May 2023 16:07:59 +0800 Subject: [PATCH 133/344] update std version in makefile --- FlexFlow.mk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/FlexFlow.mk b/FlexFlow.mk index 2f3029b434..27a0062d8c 100644 --- a/FlexFlow.mk +++ b/FlexFlow.mk @@ -93,9 +93,9 @@ NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 GASNET_FLAGS += # For Point and Rect typedefs -CC_FLAGS += -std=c++11 -NVCC_FLAGS += -std=c++11 -HIPCC_FLAGS += -std=c++11 +CC_FLAGS += -std=c++17 +NVCC_FLAGS += -std=c++17 +HIPCC_FLAGS += -std=c++17 ifeq ($(strip $(FF_USE_NCCL)), 1) INC_FLAGS += -I$(MPI_HOME)/include -I$(NCCL_HOME)/include From d87197da69f01709032b4b89a04e8884d9581721 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Thu, 18 May 2023 17:25:36 -0400 Subject: [PATCH 134/344] file path adapt (#730) * file path adapt * fix * fix * fix --- inference/file_loader.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/inference/file_loader.cc b/inference/file_loader.cc index edd63ba167..e21b39fcaf 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -85,6 +85,7 @@ void load_attention_bias(float *ptr, for (auto file : bias_files) { size_t partial_size = hidden_dim; std::ifstream in(file, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect bias file path"); std::vector host_array(partial_size); size_t loaded_data_size = sizeof(float) * partial_size; in.seekg(0, in.end); @@ -147,6 +148,7 @@ void load_attention_weights(float *ptr, size_t partial_size = one_weight_file_size; std::ifstream in(file, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect weight file path"); std::vector host_array(partial_size); size_t loaded_data_size = sizeof(float) * partial_size; in.seekg(0, in.end); @@ -178,6 +180,7 @@ void load_attention_weights(float *ptr, 
void load_from_file(float *ptr, size_t size, std::string filename) { std::ifstream in(filename, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect weight file path"); std::vector host_array(size); size_t loaded_data_size = sizeof(float) * size; in.seekg(0, in.end); @@ -247,13 +250,15 @@ void FileDataLoader::load_weights( assert(weight->data_type == DT_FLOAT); float *data = (float *)malloc(sizeof(float) * volume); - if (v.first.find("attention_w") != std::string::npos) { + std::string file_path = (v.first.back() == '/') ? v.first : "/" + v.first; + + if (file_path.find("attention_w") != std::string::npos) { if (i == 0) { load_attention_weights(data, num_heads, hidden_dim, qkv_inner_dim, - v.first, + file_path, weight_file_path, volume); } else { @@ -261,16 +266,15 @@ void FileDataLoader::load_weights( num_heads, hidden_dim, qkv_inner_dim, - v.first, + file_path, weight_file_path); } } else { - std::string file_path = v.first; if (i > 0) { - int index = v.first.find("_weight"); + int index = file_path.find("_weight"); assert(index != std::string::npos); - file_path = v.first.substr(0, index) + "_bias"; + file_path = file_path.substr(0, index) + "_bias"; } load_from_file(data, volume, weight_file_path + file_path); } From b9fddec41e64de20c97d8eb05524a807d7a34697 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 19 May 2023 13:35:35 +0800 Subject: [PATCH 135/344] Update README.md --- .github/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/README.md b/.github/README.md index dc23d2e989..a2bb32da69 100644 --- a/.github/README.md +++ b/.github/README.md @@ -64,10 +64,10 @@ The weight files using in our demo is extracted from HuggingFace, and stored in | Model | Model id on Hugging Face | Storage Location | | :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | s3://catalyst-llama/Flexflow_LLM_weights/LLAMA/llama_7B_weights.tar.gz | -| LLaMA-190M | Bingsu/llama-190m-arch | s3://catalyst-llama/Flexflow_LLM_weights/LLAMA/llama_190m_weights.tar.gz | -| OPT-6.7B | facebook/opt-6.7b | s3://catalyst-llama/Flexflow_LLM_weights/OPT/opt_6B_weights.tar.gz | -| OPT-125M | facebook/opt-125m | s3://catalyst-llama/Flexflow_LLM_weights/OPT/opt_125m_native.tar.gz | +| LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | +| LLaMA-190M | Bingsu/llama-190m-arch | s3://specinfer/weights/llama_190m_weights.tar.gz | +| OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | +| OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125m_native.tar.gz | You can use [this script](../inference/spec_infer/MODEL_WEIGHTS.md) to convert the weights of a HuggingFace LLM to the SpecInfer weight format. From dc6dcf878836749fb8db179df9bc542aa05643e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 19 May 2023 13:41:09 +0800 Subject: [PATCH 136/344] Update README.md --- .github/README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/README.md b/.github/README.md index a2bb32da69..69731f40d1 100644 --- a/.github/README.md +++ b/.github/README.md @@ -92,7 +92,20 @@ SpecInfer is under active development. We currently focus on the following tasks ## Acknowledgements This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting SpecInfer and the underlying FlexFlow runtime system. The following paper describes design, implementation, and key optimizations of SpecInfer. 
-* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://www.cs.cmu.edu/~zhihaoj2/papers/specinfer.pdf). +* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). + +Please cite as: + +``` bibtex +@misc{miao2023specinfer, + title={SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification}, + author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia}, + year={2023}, + eprint={2305.09781}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` ## License Both SpecInfer and FlexFlow use Apache License 2.0. From 1193b51a367c6c5fa744b7210279e08ac366b3fa Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 19 May 2023 15:53:58 +0800 Subject: [PATCH 137/344] Update README.md --- .github/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/README.md b/.github/README.md index 69731f40d1..7a7ae6a3e2 100644 --- a/.github/README.md +++ b/.github/README.md @@ -94,6 +94,9 @@ This project is initiated by members from CMU, Stanford, and UCSD. We will be co * Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). +\* Denotes equal contribution + +### Citation Please cite as: ``` bibtex From 155989a2cd3e655756c009c9791e0a6ee4eb02b8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 20 May 2023 23:49:26 +0800 Subject: [PATCH 138/344] [Inference][CI] - Fix GPU-CI and `hip_rocm`build tests (#731) * fix hip_rocm build with sentencepiece * shellcheck 1 * shellcheck 2 * shellecheck 3 * fix install script * .github/workflows/helpers/install_dependencies.sh * fix * shellcheck * restore unnecessary changes * fix build * removed outdated test from c++ tests * update link in readme --- .github/README.md | 2 +- .github/workflows/build.yml | 4 ++++ .github/workflows/gpu-ci.yml | 2 +- .github/workflows/helpers/install_dependencies.sh | 15 +++++++++++++++ CMakeLists.txt | 3 +++ .../flexflow/ops/inc_multihead_self_attention.h | 2 ++ .../ops/spec_inc_multihead_self_attention.h | 2 ++ .../ops/tree_inc_multihead_self_attention.h | 2 ++ tests/cpp_gpu_tests.sh | 8 ++++---- 9 files changed, 34 insertions(+), 6 deletions(-) diff --git a/.github/README.md b/.github/README.md index 7a7ae6a3e2..28b2ef789c 100644 --- a/.github/README.md +++ b/.github/README.md @@ -56,7 +56,7 @@ For example, you can use the following command line to serve a LLaMA-6B or LLaMA ### Tokenizers SpecInfer supports two tokenizers: -* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentence piece tokenizer from Hugging Face (model id: [decapoda-research/llama-7b-hf](https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/tokenizer.model)). 
+* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentencepiece tokenizer from LLAMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). We store the tokenizer on our S3 bucket at this link: [s3://specinfer/tokenizer/tokenizer.model](https://specinfer.s3.us-east-2.amazonaws.com/tokenizer/tokenizer.model). * The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. ### LLM Weights diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9abd9c9a78..5d83aaedf4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -99,6 +99,10 @@ jobs: export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON + else + export FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF + export FF_BUILD_UNIT_TESTS=OFF fi ../config/config.linux sudo make install diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 801924bd0d..135b569055 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -163,7 +163,7 @@ jobs: # C++ tests ./tests/cpp_gpu_tests.sh 4 # GPT tokenizer test - ./examples/cpp/inference/gpt_tokenizer_test.sh + ./tests/gpt_tokenizer_test.sh # Python tests ./tests/multi_gpu_tests.sh 4 diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh index 5ab211c962..cf37f3b820 100755 --- a/.github/workflows/helpers/install_dependencies.sh +++ b/.github/workflows/helpers/install_dependencies.sh @@ -25,6 +25,21 @@ elif [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; th rm ./amdgpu-install_22.20.50205-1_all.deb sudo amdgpu-install -y --usecase=hip,rocm --no-dkms sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk + + # Install protobuf v3.20.x manually + sudo apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev unzip python autoconf automake libtool curl make + git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git + cd protobuf/ + git submodule update --init --recursive + ./autogen.sh + ./configure + cores_available=$(nproc --all) + n_build_cores=$(( cores_available -1 )) + if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi + make -j $n_build_cores + sudo make install + sudo ldconfig + cd .. else echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Skipping installing HIP dependencies" fi diff --git a/CMakeLists.txt b/CMakeLists.txt index e758eb577f..274955e628 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -475,6 +475,9 @@ if(FF_BUILD_VISUALIZATION_TOOL) endif() if(FF_BUILD_SENTENCEPIECE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) + if (FF_GPU_BACKEND STREQUAL "hip_rocm") + SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") + endif() add_subdirectory(deps/sentencepiece sentencepiece EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/sentencepiece/src) target_link_libraries(flexflow sentencepiece) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 4721086ec0..6b29feac67 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -144,7 +144,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { float *qk_prods, *qk_prods_softmax; float *attn_heads, *W_out_contiguous; BatchConfig::PerTokenInfo *token_infos; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cuFloatComplex *complex_input; +#endif }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 30e122278a..da2825c9d2 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -152,7 +152,9 @@ class SpecIncMultiHeadSelfAttentionMeta : public OpMeta { float *devQKVProjArray, *keyCache, *valueCache; float *qk_prods, *qk_prods_softmax; float *attn_heads, *W_out_contiguous; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cuFloatComplex *complex_input; +#endif // void *reserveSpace; // BatchConfig::token_idxs *dev_token2ids; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index fce4998e5d..b1b265e6d6 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -144,7 +144,9 @@ class TreeIncMultiHeadSelfAttentionMeta : public OpMeta { float *devQKVProjArray, *keyCache, *valueCache; float *qk_prods, *qk_prods_softmax; float *attn_heads, *W_out_contiguous; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cuFloatComplex *complex_input; +#endif TreeVerifyBatchConfig::PerTokenInfo *token_infos; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index b285c2ad8e..760237cf3e 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -50,8 +50,8 @@ if [[ -f "$FF_HOME/build/examples/cpp/AlexNet/alexnet" ]]; then # "$FF_HOME"/build/examples/cpp/split_test/split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # "$FF_HOME"/build/examples/cpp/split_test_2/split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # Inference examples - if [ $(( GPU_AVAILABLE )) -lt $(( 4 )) ]; then echo "Skipping LLAMA test because it requires 4 GPUs, but only $GPU_AVAILABLE are available. " ; exit 1; fi - "$FF_HOME"/build/examples/cpp/inference/LLAMA/LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize 30000 --only-data-parallel + # if [ $(( GPU_AVAILABLE )) -lt $(( 4 )) ]; then echo "Skipping LLAMA test because it requires 4 GPUs, but only $GPU_AVAILABLE are available. 
" ; exit 1; fi + # "$FF_HOME"/build/examples/cpp/inference/LLAMA/LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize 30000 --only-data-parallel #"$FF_HOME"/build/examples/cpp/inference/mixture_of_experts/inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel #"$FF_HOME"/build/examples/cpp/inference/transformers/inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel else @@ -83,8 +83,8 @@ else # split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # Inference examples - if [ $(( GPU_AVAILABLE )) -lt $(( 4 )) ]; then echo "Skipping LLAMA test because it requires 4 GPUs, but only $GPU_AVAILABLE are available. " ; exit 1; fi - LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize 30000 --only-data-parallel + # if [ $(( GPU_AVAILABLE )) -lt $(( 4 )) ]; then echo "Skipping LLAMA test because it requires 4 GPUs, but only $GPU_AVAILABLE are available. " ; exit 1; fi + # LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize 30000 --only-data-parallel #inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel #inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel fi From f0604b30248f1793d1f611c95458228ed5cdb963 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 22 May 2023 06:32:52 +0800 Subject: [PATCH 139/344] [Inference] - Cleanup/refactor (#732) * implemented file-based configs, remove spec_pipeline folder * fix * add inference test, script to downlaod weights * update readme * update ci scripts * newlines * fix gpu-ci * fix * fix * update test file * added incr decoding program, moved LLAMA folder from examples * linting * add incremental decoding to test * update readme * add script to download opt weights * fix support for opt, move code to root inference folder * linting * update test file * fix * bug fix * update test --- .github/README.md | 21 +- .github/workflows/build-skip.yml | 1 + .github/workflows/build.yml | 2 + .github/workflows/clang-format-check.yml | 1 + .github/workflows/gpu-ci.yml | 60 +++- .gitignore | 1 + CMakeLists.txt | 13 +- conda/flexflow-cpu.yml | 1 + examples/cpp/inference/LLAMA/Makefile | 39 --- examples/cpp/inference/LLAMA/dataloader.cc | 285 ------------------ examples/cpp/inference/LLAMA/dataloader.cu | 110 ------- examples/cpp/inference/LLAMA/llama.cc | 140 --------- .../llama_spec_pipeline/CMakeLists.txt | 20 -- .../inference/llama_spec_pipeline/README.md | 14 - .../inference/llama_spec_pipeline/llama.cc | 256 ---------------- .../llama_spec_pipeline/llama_rae.cc | 246 --------------- examples/cpp/inference/opt/CMakeLists.txt | 22 -- examples/cpp/inference/opt/Makefile | 38 --- examples/cpp/inference/opt/README.md | 45 --- examples/cpp/inference/opt/opt.cc | 238 --------------- examples/cpp/inference/opt/opt.h | 40 --- examples/cpp/inference/opt/opt_baseline.py | 23 -- .../opt_spec_pipeline/CMakeLists.txt | 20 -- .../cpp/inference/opt_spec_pipeline/Makefile | 37 --- .../opt_spec_pipeline/opt_pipeline.cc | 203 ------------- include/{ => flexflow}/gpt_tokenizer.h | 2 +- include/flexflow/tokenizers.h | 2 +- inference/.gitignore | 3 + inference/{spec_infer => }/MODEL_WEIGHTS.md | 3 +- inference/file_loader.cc | 9 + .../incr_decoding}/CMakeLists.txt | 11 +- .../incr_decoding}/Makefile | 0 
inference/incr_decoding/incr_decoding.cc | 187 ++++++++++++ inference/models/configs/llama_190M.json | 10 + inference/models/configs/llama_7B.json | 10 + inference/models/configs/opt_125M.json | 15 + inference/models/configs/opt_6B.json | 15 + inference/models/llama.cc | 37 +-- inference/models/llama.h | 72 ++++- inference/models/opt.cc | 13 +- inference/models/opt.h | 70 ++++- inference/spec_infer/spec_infer.cc | 168 +++++++++-- inference/utils/download_llama_weights.py | 49 +++ inference/utils/download_opt_weights.py | 46 +++ src/runtime/gpt_tokenizer.cc | 11 +- tests/.gitignore | 0 tests/gpt_tokenizer.cpp | 15 +- tests/gpt_tokenizer_test.sh | 4 +- tests/inference_tests.sh | 55 ++++ 49 files changed, 776 insertions(+), 1907 deletions(-) delete mode 100644 examples/cpp/inference/LLAMA/Makefile delete mode 100644 examples/cpp/inference/LLAMA/dataloader.cc delete mode 100644 examples/cpp/inference/LLAMA/dataloader.cu delete mode 100644 examples/cpp/inference/LLAMA/llama.cc delete mode 100644 examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt delete mode 100644 examples/cpp/inference/llama_spec_pipeline/README.md delete mode 100644 examples/cpp/inference/llama_spec_pipeline/llama.cc delete mode 100644 examples/cpp/inference/llama_spec_pipeline/llama_rae.cc delete mode 100644 examples/cpp/inference/opt/CMakeLists.txt delete mode 100644 examples/cpp/inference/opt/Makefile delete mode 100644 examples/cpp/inference/opt/README.md delete mode 100644 examples/cpp/inference/opt/opt.cc delete mode 100644 examples/cpp/inference/opt/opt.h delete mode 100644 examples/cpp/inference/opt/opt_baseline.py delete mode 100644 examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt delete mode 100644 examples/cpp/inference/opt_spec_pipeline/Makefile delete mode 100644 examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc rename include/{ => flexflow}/gpt_tokenizer.h (99%) create mode 100644 inference/.gitignore rename inference/{spec_infer => }/MODEL_WEIGHTS.md (89%) rename {examples/cpp/inference/LLAMA => inference/incr_decoding}/CMakeLists.txt (76%) rename {examples/cpp/inference/llama_spec_pipeline => inference/incr_decoding}/Makefile (100%) create mode 100644 inference/incr_decoding/incr_decoding.cc create mode 100644 inference/models/configs/llama_190M.json create mode 100644 inference/models/configs/llama_7B.json create mode 100644 inference/models/configs/opt_125M.json create mode 100644 inference/models/configs/opt_6B.json create mode 100644 inference/utils/download_llama_weights.py create mode 100644 inference/utils/download_opt_weights.py create mode 100644 tests/.gitignore create mode 100755 tests/inference_tests.sh diff --git a/.github/README.md b/.github/README.md index 28b2ef789c..940bff486d 100644 --- a/.github/README.md +++ b/.github/README.md @@ -28,8 +28,8 @@ for serving generative LLMs while provably preserving model quality. Performance comparison

-## Install SpecInfer -SpecInfer is built on top of FlexFlow. You can install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](INSTALL.md) for installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. +## Build/Install SpecInfer +SpecInfer is built on top of FlexFlow. You can build/install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](../INSTALL.md) for building/installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. ## Run SpecInfer The source code of the SpecInfer pipeline is available at [this folder](../inference/spec_infer/). The SpecInfer executable will be available at `/build_dir/inference/spec_infer/spec_infer` at compilation. You can use the following command-line arguments to run SpecInfer: @@ -37,8 +37,12 @@ The source code of the SpecInfer pipeline is available at [this folder](../infer * `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. SpecInfer keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. +* `-llm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama") * `-llm-weight`: path to the folder that stores the LLM weights -* `-ssm-weight`: path to the folder that stores the small speculative models' weights. You can use multiple `-ssm-weight`s in the command line to launch multiple SSMs. +* `-llm-config`: path to the json file that stores the LLM model configs +* `-ssm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-ssm-weight`: path to the folder that stores the small speculative models' weights. The number of `-ssm-weight`s must match the number of `-ssm-model`s and `-ssm-config`s. +* `-ssm-config`: path to the json file that stores the SSM model configs. The number of `-ssm-config`s must match the number of `-ssm-model`s and `-ssm-weight`s. * `-tokenizer`: path to the tokenizer file (see [Tokenizers](#tokenizers) for preparing a tokenizer for SpecInfer). * `-prompt`: (optional) path to the prompt file. SpecInfer expects a json format file for prompts, all of which will be served by SpecInfer. 
In addition, users can also use the following API for registering requests: @@ -47,10 +51,10 @@ class RequestManager { RequestGuid register_new_request(std::string const &prompt, int max_sequence_length); } ``` -For example, you can use the following command line to serve a LLaMA-6B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. +For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. ```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-weight /path/to/llm/weights -ssm-weight /path/to/ssm1/weights -smm-weight /path/to/ssm2/weights -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json ``` ### Tokenizers @@ -60,7 +64,7 @@ SpecInfer supports two tokenizers: * The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. ### LLM Weights -The weight files using in our demo is extracted from HuggingFace, and stored in our AWS S3 bucket. +The weight files used in our demo are extracted from HuggingFace, and stored in our AWS S3 bucket. | Model | Model id on Hugging Face | Storage Location | | :---- | :---- | :---- | @@ -69,11 +73,14 @@ The weight files using in our demo is extracted from HuggingFace, and stored in | OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | | OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125m_native.tar.gz | -You can use [this script](../inference/spec_infer/MODEL_WEIGHTS.md) to convert the weights of a HuggingFace LLM to the SpecInfer weight format. +You can use [this script](../inference/utils/download_llama_weights.py) to automatically download and convert the weights of a HuggingFace LLAMA LLM and a LLAMA SSM to the SpecInfer weight format. The script also downloads the LLAMA tokenizer. If you would like to try the OPT model instead, use [this script](../inference/utils/download_opt_weights.py) to download (and convert) the OPT weights and tokenizer. ### Prompt Datasets We have evaluated SpecInfer on the following prompts datasets: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). +### Script to run the demo +You can take a look at [this script](../tests/inference_tests.sh), which is run in CI for each new commit, for an example of how to run the demo. 
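Editor's note: the `-prompt` flag documented above expects a JSON file, and the demo drivers in this patch series (e.g. the `llama.cc` examples removed later in patch 139) parse it as a plain JSON array of strings and register one request per entry via `RequestManager::register_new_request`. The sketch below restates that flow as a self-contained C++ fragment; the wrapper function name is editorial, while the JSON-array format, the `nlohmann::json::parse` call, the example prompt strings, and the 128-token max sequence length mirror the demo code elsewhere in this patch series.

```cpp
#include <cassert>
#include <fstream>
#include <string>

#include "flexflow/inference.h" // RequestManager
#include <nlohmann/json.hpp>

// Sketch of how a prompt file such as
//   ["I believe the meaning of life is",
//    "Talk to me as if you are python programming language and want to sell me yourself"]
// is consumed: each array element becomes one request.
void register_prompts(FlexFlow::RequestManager &rm,
                      std::string const &prompt_file_path) {
  using json = nlohmann::json;
  std::ifstream file_handle(prompt_file_path);
  assert(file_handle.good() && "Prompt file does not exist.");
  json prompt_json = json::parse(file_handle,
                                 /*parser_callback_t */ nullptr,
                                 /*allow_exceptions */ true,
                                 /*ignore_comments */ true);
  for (auto &prompt : prompt_json) {
    std::string text = prompt.get<std::string>();
    rm.register_new_request(text, 128 /*max_sequence_length*/);
  }
}
```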
+ ## Difference between SpecInfer and HuggingFace Assistant Model There are two major differences between the two systems. diff --git a/.github/workflows/build-skip.yml b/.github/workflows/build-skip.yml index a983d6dda4..65e9d04132 100644 --- a/.github/workflows/build-skip.yml +++ b/.github/workflows/build-skip.yml @@ -3,6 +3,7 @@ on: pull_request: paths-ignore: - "include/**" + - "inference/**" - "cmake/**" - "config/**" - "python/**" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5d83aaedf4..99cb459aae 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -3,6 +3,7 @@ on: pull_request: paths: - "include/**" + - "inference/**" - "cmake/**" - "config/**" - "python/**" @@ -14,6 +15,7 @@ on: - "master" paths: - "include/**" + - "inference/**" - "cmake/**" - "config/**" - "python/**" diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 46c9bf3be2..1601da86b3 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -10,6 +10,7 @@ jobs: - check: "src" exclude: '\.proto$' - check: "include" + - check: "inference" - check: "nmt" - check: "python" - check: "scripts" diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 135b569055..b24e7236a8 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -7,6 +7,7 @@ on: - "python/**" - "setup.py" - "include/**" + - "inference/**" - "src/**" - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" @@ -21,6 +22,7 @@ on: - "python/**" - "setup.py" - "include/**" + - "inference/**" - "src/**" - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" @@ -122,10 +124,64 @@ jobs: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib ./tests/align/test_all_operators.sh + inference-tests: + name: Inference Tests + runs-on: self-hosted + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + needs: gpu-ci-concierge + container: + image: ghcr.io/flexflow/flexflow-environment-cuda:latest + options: --gpus all --shm-size=8192m + steps: + - name: Install updated git version + run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git + + - name: Checkout Git Repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Install conda and FlexFlow dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + miniconda-version: "latest" + activate-environment: flexflow + environment-file: conda/flexflow-cpu.yml + auto-activate-base: false + + - name: Build FlexFlow + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export FF_HOME=$(pwd) + export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion + export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + mkdir build + cd build + ../config/config.linux + make -j + + - name: Run inference tests + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export FF_HOME=$(pwd) + export CUDNN_DIR=/usr/local/cuda + export CUDA_DIR=/usr/local/cuda + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + + # GPT tokenizer test + ./tests/gpt_tokenizer_test.sh + + # Inference tests + ./tests/inference_tests.sh + gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests runs-on: self-hosted - needs: gpu-ci-concierge + needs: inference-tests container: image: ghcr.io/flexflow/flexflow-environment-cuda:latest options: --gpus all --shm-size=8192m @@ -162,8 +218,6 @@ jobs: export 
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib # C++ tests ./tests/cpp_gpu_tests.sh 4 - # GPT tokenizer test - ./tests/gpt_tokenizer_test.sh # Python tests ./tests/multi_gpu_tests.sh 4 diff --git a/.gitignore b/.gitignore index 2feb324b11..1ba1e26bd9 100644 --- a/.gitignore +++ b/.gitignore @@ -181,3 +181,4 @@ train-labels-idx1-ubyte # Logs logs/ +gpt_tokenizer diff --git a/CMakeLists.txt b/CMakeLists.txt index 274955e628..5e305b15bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -550,21 +550,10 @@ if(FF_BUILD_MOE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) endif() if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/LLAMA) -endif() - -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/opt) -endif() - -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/llama_spec_pipeline) add_subdirectory(inference/spec_infer) + add_subdirectory(inference/incr_decoding) endif() -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/opt_spec_pipeline) -endif() # installation set(INCLUDE_DEST "include") diff --git a/conda/flexflow-cpu.yml b/conda/flexflow-cpu.yml index 6bd9d6d663..e8cd4c1114 100644 --- a/conda/flexflow-cpu.yml +++ b/conda/flexflow-cpu.yml @@ -17,3 +17,4 @@ dependencies: - torch --index-url https://download.pytorch.org/whl/cpu - torchaudio --index-url https://download.pytorch.org/whl/cpu - torchvision --index-url https://download.pytorch.org/whl/cpu + - regex diff --git a/examples/cpp/inference/LLAMA/Makefile b/examples/cpp/inference/LLAMA/Makefile deleted file mode 100644 index 4249443f7d..0000000000 --- a/examples/cpp/inference/LLAMA/Makefile +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Flags for directing the runtime makefile what to include -DEBUG ?= 0 # Include debugging symbols -MAX_DIM ?= 4 # Maximum number of dimensions -OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level -USE_CUDA ?= 1 # Include CUDA support (requires CUDA) -USE_GASNET ?= 0 # Include GASNet support (requires GASNet) -USE_HDF ?= 1 # Include HDF5 support (requires HDF5) -ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) - -# Put the binary file name here -OUTFILE ?= llama -# List all the application source files here -GEN_SRC = llama.cc dataloader.cc -GEN_GPU_SRC = dataloader.cu -ifndef CUDA_HOME -CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) -endif - - -ifndef FF_HOME -$(error FF_HOME variable is not defined, aborting build) -endif - -include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/LLAMA/dataloader.cc b/examples/cpp/inference/LLAMA/dataloader.cc deleted file mode 100644 index 7f2cfe3577..0000000000 --- a/examples/cpp/inference/LLAMA/dataloader.cc +++ /dev/null @@ -1,285 +0,0 @@ - -#include "llama.h" -#include - -using namespace Legion; - -DataLoader::DataLoader(FFModel &ff, - LLAMAConfig const *llamaconfig, - ParallelTensor const &input) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - num_samples = llamaconfig->sentence_len; - - { - batch_input = input; - int num_dims = input->num_dims; - - ParallelDim dims[num_dims]; - for (int i = 0; i < num_dims; i++) { - if (i == 0) { - dims[i].size = 1; - } else { - dims[i].size = input->dims[i].size; - } - - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = input->dims[i].is_replica_dim; - // Assume only the first dim can be the replica dim - assert(i == num_dims - 1 || (!dims[i].is_replica_dim)); - } - // dims[num_dims - 1].size = num_samples; - // full_input = - // ff.create_parallel_tensor_legion_ordering(num_dims, dims, DT_INT64); - // assert(full_input != nullptr && "full_input is nullptr"); - // ff.map_tensor(full_input, NULL /*parallel_op*/); - } - - // size_t llamaconfig_size = sizeof(llamaconfig); - // std::cout << "llama config dataloader: " << llamaconfig->input_path << - // std::endl; - - // // Load entire dataset - // TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, - // TaskArgument(llamaconfig, llamaconfig_size)); - // // regions[1]: full_input - // launcher.add_region_requirement(RegionRequirement(full_input->region, - // WRITE_ONLY, - // EXCLUSIVE, - // full_input->region, - // MAP_TO_FB_MEMORY)); - // launcher.add_field(0, FID_DATA); - // runtime->execute_task(ctx, launcher); -} - -void DataLoader::load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 1); - assert(task->regions.size() == 1); - LLAMAConfig const *llamaconfig = (LLAMAConfig *)task->args; - - AccessorWO const acc_input(regions[0], FID_DATA); - Rect<3> rect_input = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - assert(acc_input.accessor.is_dense_arbitrary(rect_input)); - - long *input_ptr = acc_input.ptr(rect_input.lo); - std::cout << "load entire dataset" << rect_input.volume() << std::endl; - - // load from file - load_from_file(input_ptr, - rect_input.volume(), - "/home/ubuntu/FlexFlow/examples/cpp/inference/LLAMA/tokens/" - "llama_demo_tokens"); -} - -void DataLoader::next_batch(FFModel &ff, - BatchConfig *bc, - std::map &batch_predictions) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - // Load Input - { - Domain 
domain = - runtime->get_index_space_domain(ctx, batch_input->parallel_is); - ArgumentMap argmap; - - DataLoaderNextBatchInput next_batch_input = {bc, batch_predictions}; - DataLoaderNextBatchInput const *ptr = &next_batch_input; - size_t next_batch_input_sz = sizeof(next_batch_input); - assert(ptr->prev_batch_preds.size() == batch_predictions.size()); - - std::cout << "next batch internal" << std::endl; - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, - batch_input->parallel_is, - TaskArgument(ptr, next_batch_input_sz), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - batch_input->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(full_input->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_input->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input->region)); - launcher.add_field(1, FID_DATA); - - runtime->execute_index_space(ctx, launcher); - } - // progress next_index - next_index += ff.config.batchSize; - next_token_idx += 1; -} - -void DataLoader::reset() { - next_index = 0; - next_token_idx = 0; - next_batch_index = 0; -} - -template -void DataLoader::load_from_file(T *ptr, size_t size, std::string filename) { - std::cout << "load from file: " << filename << std::endl; - std::ifstream in(filename, std::ios::in | std::ios::binary); - std::vector host_array(size); - size_t loaded_data_size = sizeof(T) * size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - - size_t in_get_size = in.gcount(); - // std::cout << "size seee" << std::endl; - // std::cout << loaded_data_size << std::endl; - // std::cout << in_get_size << std::endl; - if (in_get_size != loaded_data_size) { - std::cout << "load data error" << std::endl; - return; - } - - // std::cout << "finish loading input"; - assert(size == host_array.size()); - - // normal - long data_index = 0; - for (auto v : host_array) { - ptr[data_index++] = v; - } - in.close(); -} - -template -void DataLoader::load_attention_weights(T *ptr, - size_t size, - int hidden_dim, - int num_heads, - std::string layer_name, - std::string weight_path) { - - std::string q_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wq_weight"; - std::string k_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wk_weight"; - std::string v_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wv_weight"; - std::string o_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wo_weight"; - std::vector weight_files = {q_file, k_file, v_file, o_file}; - - int file_index = 0; - - // q, k, v, o -> 0, 1, 2, 3 - for (auto file : weight_files) { - std::cout << "file name and index: " << file << "->" << file_index << "\n"; - size_t partial_size = size / 4; - std::ifstream in(file, std::ios::in | std::ios::binary); - std::vector host_array(partial_size); - size_t loaded_data_size = sizeof(T) * partial_size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - size_t in_get_size = in.gcount(); - - if (in_get_size != loaded_data_size) { - std::cout << "load data error"; - return; - } - assert(partial_size == host_array.size()); - - size_t one_head_size = hidden_dim * (hidden_dim / num_heads); - size_t data_index = 0; - - for (int i = 0; 
i < num_heads; i++) { - size_t start_index = i * one_head_size * 4 + file_index * one_head_size; - for (size_t j = start_index; j < start_index + one_head_size; j++) { - ptr[j] = host_array.at(data_index); - data_index += 1; - } - } - file_index++; - - in.close(); - } -} - -void DataLoader::store_outputs(BatchConfig *bc, - InferenceResult const &ir, - std::map &batch_predictions) { - - std::cout << "store outputs...." << std::endl; - batch_predictions.clear(); - - // size_t guid = bc->tokensInfo[0].guid; - auto guid = bc->requestsInfo[bc->tokensInfo[0].request_index].request_guid; - - int start_idx = bc->tokensInfo[0].abs_depth_in_request; - - // only store the last token of each req - for (size_t i = 0; i <= bc->num_active_tokens(); i++) { - auto current_guid = - bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; - if (i == bc->num_active_tokens() || current_guid != guid) { - - int result_index = bc->tokensInfo[i - 1].abs_depth_in_request - start_idx; - batch_predictions[guid] = ir.token_ids[i - 1]; - - std::cout << "i: " << i << ", dds-" << guid << ", result index" - << result_index << ", result value: " << batch_predictions[guid] - << "\n"; - - if (i < bc->num_active_tokens()) { - guid = bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; - start_idx = bc->tokensInfo[i].abs_depth_in_request; - } - } - } - - assert(batch_predictions.size() == bc->num_active_requests()); -} - -template void - DataLoader::load_attention_weights(float *ptr, - size_t size, - int hidden_dim, - int num_heads, - std::string layer_name, - std::string weight_path); -template void DataLoader::load_from_file(long *ptr, - size_t size, - std::string filename); -template void DataLoader::load_from_file(float *ptr, - size_t size, - std::string filename); - -void FlexFlow::register_custom_tasks() { - // Load entire dataset - { - TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Entire Dataset Task"); - } - // Load input - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Inputs Task"); - } -} diff --git a/examples/cpp/inference/LLAMA/dataloader.cu b/examples/cpp/inference/LLAMA/dataloader.cu deleted file mode 100644 index e32e3ddc33..0000000000 --- a/examples/cpp/inference/LLAMA/dataloader.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/utils/cuda_helper.h" -#include "llama.h" - -void DataLoader::load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - - LLAMAConfig llamaconfig; - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - DataLoaderNextBatchInput const input_struct = - *((DataLoaderNextBatchInput *)task->args); - BatchConfig *bc = input_struct.bc; - - std::map const &prev_batch_preds = - input_struct.prev_batch_preds; - - TensorAccessorR full_input( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW batch_input(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - Domain full_input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain batch_input_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - - coord_t sequence_length = - batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; - coord_t batch_size = - batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; - - auto guid = bc->requestsInfo[bc->tokensInfo[0].request_index].request_guid; - int start_idx = bc->tokensInfo[0].abs_depth_in_request; - int dst_idx = 0; - - for (int i = 0; i <= bc->num_active_tokens(); i++) { - auto current_guid = - bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; - if (i == bc->num_active_tokens() || current_guid != guid) { - int tokens_to_copy = - (bc->tokensInfo[i - 1].abs_depth_in_request - start_idx + 1); - - int request_index = bc->tokensInfo[i - 1].request_index; - int token_start_offset = - bc->requestsInfo[request_index].token_start_offset; - - std::cout << "size to copy: " << tokens_to_copy - << ", start offset: " << token_start_offset << "\n"; - if (tokens_to_copy > 1 || token_start_offset == 0) { - // token pos < init length, the init length is the input sentence length - // so this is the initial input, load from file. - int copy_start_index = guid * llamaconfig.sentence_len; - std::cout << "copy index: " << copy_start_index << "\n"; - copy_kernel<<>>( - batch_input.ptr + dst_idx, - full_input.ptr + copy_start_index, - tokens_to_copy); - std::cout << "------------req---------------: " << guid << "\n"; - for (int i = 0; i < 8; i++) { - std::cout << "value: " << full_input.ptr[copy_start_index + i] - << std::endl; - } - std::cout << "dst index: " << dst_idx << "\n"; - - } else { - // for token by token generating, get token from the previous inference. - - long token = prev_batch_preds.at(guid); - - std::cout << "next iter " << bc->tokensInfo[i - 1].abs_depth_in_request - << ", dst_idx: " << dst_idx << ", token:" << token << "\n"; - long *dst_ptr = batch_input.ptr + dst_idx; - - cudaMemcpy(dst_ptr, - &token, - sizeof(FlexFlow::RequestManager::TokenId), - cudaMemcpyHostToDevice); - } - - if (i < bc->num_active_tokens()) { - guid = bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; - start_idx = bc->tokensInfo[i].abs_depth_in_request; - } - dst_idx = i; - } - } - - std::cout << "load input finished....." << std::endl; -} diff --git a/examples/cpp/inference/LLAMA/llama.cc b/examples/cpp/inference/LLAMA/llama.cc deleted file mode 100644 index 8d25eb3a3a..0000000000 --- a/examples/cpp/inference/LLAMA/llama.cc +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "models/llama.h" -#include "flexflow/inference.h" -#include "flexflow/tokenizers.h" -#include - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("llama"); - -struct FilePaths { - std::string weight1_file_path; - std::string weight2_file_path; - std::string weight3_file_path; - std::string weight4_file_path; - std::string prompt_file_path; - std::string tokenizer_file_path; -}; - -void parse_input_args(char **argv, int argc, FilePaths &paths) { - for (int i = 1; i < argc; i++) { - // weights - if (!strcmp(argv[i], "--weight1")) { - paths.weight1_file_path = std::string(argv[++i]); - continue; - } - // weights - if (!strcmp(argv[i], "--weight2")) { - paths.weight2_file_path = std::string(argv[++i]); - continue; - } - // weights - if (!strcmp(argv[i], "--weight3")) { - paths.weight3_file_path = std::string(argv[++i]); - continue; - } - // weights - if (!strcmp(argv[i], "--weight4")) { - paths.weight4_file_path = std::string(argv[++i]); - continue; - } - // prompts - if (!strcmp(argv[i], "--prompt")) { - paths.prompt_file_path = std::string(argv[++i]); - continue; - } - // tokenizer - if (!strcmp(argv[i], "--tokenizer")) { - paths.tokenizer_file_path = std::string(argv[++i]); - continue; - } - } -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - FFConfig ffconfig; - FilePaths file_paths; - FFModel ff(ffconfig); - - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, file_paths); - SentencePieceTokenizer tokenizer(file_paths.tokenizer_file_path); - InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); - RequestManager rm(&tokenizer); - std::string text2 = "I believe the meaning of life is"; - std::string text3 = "Talk to me as if you are python programming language " - "and want to sell me yourself"; - std::string text4 = "Write podcast about importance to include ChatGPT into " - "the evening routine."; - int total_num_requests = 0; - { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - rm.register_new_request(text, 128 /*max_sequence_length*/); - if (total_num_requests == 10) { - break; - } - } - } - - FFModel model(ffconfig); - LLAMA::create_llama_model(model, - im, - "7b", - file_paths.weight1_file_path, - ffconfig.workersPerNode * ffconfig.numNodes, - INC_DECODING_MODE); - - BatchConfig bc; - InferenceResult ir; - while (rm.get_num_processed_requests() < total_num_requests) { - bc = rm.prepare_next_batch(bc, ir); - if (rm.get_num_processed_requests() >= total_num_requests) { - break; - } - FutureMap fm = im.inference(&model, 0, bc); - 
assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - ir = future.get_result(); - } - - // Execution fence - { - Future future = runtime->issue_execution_fence(ctx); - future.get_void_result(); - } - - // float* data - std::cout << "----------inference finished--------------" << std::endl; -} - -void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt b/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt deleted file mode 100644 index 4c8b147e10..0000000000 --- a/examples/cpp/inference/llama_spec_pipeline/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(FlexFlowExample_llama_pipeline) -set(project_target llama_pipeline) - - -set(CPU_SRC - ${FLEXFLOW_CPP_DRV_SRC} - llama.cc - ${CMAKE_SOURCE_DIR}/inference/file_loader.cc - ${CMAKE_SOURCE_DIR}/inference/models/llama.cc) - - -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) - -set(BIN_DEST "bin") -install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/llama_spec_pipeline/README.md b/examples/cpp/inference/llama_spec_pipeline/README.md deleted file mode 100644 index 4a112ba45f..0000000000 --- a/examples/cpp/inference/llama_spec_pipeline/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# an example of running llama model with beam search - -## how to run it? -1. build the flexflow with FF_BUILD_ALL_INFERENCE_EXAMPLES or FF_BUILD_ALL_EXAMPLES -2. download the weight and token file from aws s3. -```bash -aws s3 cp s3://catalyst-llama/7B_weights_float.tar.gz FF_HOME/examples/cpp/inference/spec_verify_pipeline/weights -tar -zxvf 7B_weights_float.tar.gz -aws s3 cp s3://catalyst-llama/tokens.tar FF_HOME/examples/cpp/inference/spec_verify_pipeline/tokens -tar -zxvf tokens.tar -``` -3. run *spec_verify_pipeline* with `--weights` `--dataset` `-b 5` `--only-data-parallel` -4. [expected results](https://github.com/flexflow/FlexFlow/pull/681#issuecomment-1534264054) - diff --git a/examples/cpp/inference/llama_spec_pipeline/llama.cc b/examples/cpp/inference/llama_spec_pipeline/llama.cc deleted file mode 100644 index f149b6c9d6..0000000000 --- a/examples/cpp/inference/llama_spec_pipeline/llama.cc +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "models/llama.h" -#include "flexflow/inference.h" -#include "flexflow/tokenizers.h" - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("llama"); - -void parse_input_args(char **argv, int argc, LLAMA::Config &config) { - for (int i = 1; i < argc; i++) { - // input - if (!strcmp(argv[i], "--dataset")) { - config.input_path = std::string(argv[++i]); - continue; - } - - // weights - if (!strcmp(argv[i], "--weights")) { - config.weight_file_path = std::string(argv[++i]); - continue; - } - - // weights - if (!strcmp(argv[i], "--tokenizer")) { - config.tokenizer_file_path = std::string(argv[++i]); - continue; - } - } -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - FFConfig ffconfig; - LLAMA::Config llama_config; - - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, llama_config); - SentencePieceTokenizer tokenizer(llama_config.tokenizer_file_path); - InferenceManager im(ffconfig, llama_config.max_num_tokens, 1); - RequestManager rm(&tokenizer); - // Add a single request - // std::vector prompt{ - // 1, 306, 4658, 278, 6593, 310, 2834, 338}; - std::string text2 = "I believe the meaning of life is"; - rm.register_new_request(text2, llama_config.max_seq_len); - - FFModel beam_model(ffconfig), tree_model(ffconfig); - LLAMA::create_llama_model(beam_model, im, llama_config, 1, BEAM_SEARCH_MODE); - LLAMA::create_llama_model(tree_model, im, llama_config, 1, TREE_VERIFY_MODE); - // LLAMA::create_llama_model(inc_model, im, llama_config, 1, - // INC_DECODING_MODE); - - // entry--------------------------- - int depth = 0; - std::map beam_future_handlers, tree_future_handler; - std::map beam_batch_configs; - std::map tree_batch_configs; - - bool new_req = true; - TreeVerifyBatchConfig tree_bc; - - int iteration = 0; - - while (depth < llama_config.max_beam_depth) { - int bid = 0; - if (beam_future_handlers.find(bid) == beam_future_handlers.end()) { - BeamSearchBatchConfig bc; - InferenceResult ir; - bc = rm.prepare_next_batch_init(tree_bc, ir); - - std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - } else { - // have luanched this bid - Future future = beam_future_handlers[bid]; - if (!future.is_ready(true /*subscribe*/)) { - continue; - } else { - std::cout << "future is ready...." 
<< std::endl; - } - // process end - BeamInferenceResult ir = future.get_result(); - BeamSearchBatchConfig bc = beam_batch_configs[bid]; - depth = bc.beamRequestsInfo[0].current_depth; - bc = rm.prepare_next_batch_beam(bc, ir); - - std::cout << "llama current depth: " << depth << std::endl; - std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - - // tranverse the tree in dfs order; - if (depth >= llama_config.max_beam_depth) { - - printf("\n\n ------Final Beam Search Batch------\n"); - printf("[Beam] num_tokens: %d\n", bc.num_tokens); - for (int i = 0; i < bc.num_tokens; i++) { - std::cout << "[Token] Request Index: " - << bc.tokensInfo[i].request_index - << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request - << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; - } - - // printf("\n\n prepare tree_bc from final beam search bc\n"); - tree_bc = rm.prepare_next_batch_verify(bc); - - printf("\n\n\n ------Tree Verify Batch-------\n"); - // should have the same content as the hardcoded verification block - // below right now, it only contains the prompt need to add in the beam - // search result - - printf("[Verify] num_tokens : %d\n", tree_bc.num_tokens); - printf("[Verify] num_tokens_in_batch: %d\n", - tree_bc.requestsInfo[0].num_tokens_in_batch); - printf("------------------------------\n"); - - for (int i = 0; i < tree_bc.num_tokens; i++) { - std::cout << "[Token] Request Index: " - << tree_bc.tokensInfo[i].request_index << ", Abs Depth: " - << tree_bc.tokensInfo[i].abs_depth_in_request - << ", Token Id: " << tree_bc.tokensInfo[i].token_id << "\n"; - } - - printf("\n\n ------Commit Verified Tokens-------\n"); - for (int i = 0; i < tree_bc.num_tokens_to_commit; i++) { - std::cout << "[Commit] Request Index: " - << tree_bc.commited_tokens[i].request_index - << ", Abs Depth: " << tree_bc.commited_tokens[i].token_depth - << ", Token Index in batch: " - << tree_bc.commited_tokens[i].token_index << "\n"; - } - - FutureMap fm = im.inference(&tree_model, 0, tree_bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - InferenceResult ir = future.get_result(); - for (int i = 0; i < tree_bc.num_tokens; i++) { - if (i == 7) { - std::cout << "------------------\n"; - } - printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); - } - - std::cout << "------Init New Beam Search Batch------\n"; - bc = rm.prepare_next_batch_init(tree_bc, ir); - std::cout << "[Init] num_tokens: " << bc.num_tokens << "\n"; - for (int i = 0; i < bc.num_tokens; i++) { - std::cout << "[Token] Request Index: " - << bc.tokensInfo[i].request_index - << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request - << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; - } - std::cout << "Batch Depth: " << bc.beamRequestsInfo[0].current_depth - << "\n"; - - iteration++; - - if (iteration < 4) { - std::cout << "\n\n~~~~~~~~~~teration " << iteration << "~~~~~~~~~~\n"; - depth = bc.beamRequestsInfo[0].current_depth; - fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - } else { - break; - } - } - } - } - -#ifdef DEADCODE - { - std::vector prompt{1, - 306, - 4658, - 278, - 6593, - 310, - 2834, - 338, - 593, - 595, - 17252, - 5031, - 993, - 616}; - BatchConfig bc; - 
bc.request_completed[0] = false; - bc.num_tokens = prompt.size(); - bc.requestsInfo[0].token_start_offset = 0; - bc.requestsInfo[0].num_tokens_in_batch = prompt.size(); - bc.requestsInfo[0].max_sequence_length = 347; - bc.requestsInfo[0].request_guid = 1234; - for (size_t i = 0; i < prompt.size(); i++) { - bc.tokensInfo[i].abs_depth_in_request = i; - bc.tokensInfo[i].request_index = 0; - bc.tokensInfo[i].token_id = prompt[i]; - } - FutureMap fm = im.inference(&inc_model, 0, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - InferenceResult ir = future.get_result(); - for (int i = 0; i < bc.num_tokens; i++) { - printf("decoding_tokens[%d] = %d\n", i, ir.token_ids[i]); - } - bc.num_tokens = 1; - bc.requestsInfo[0].token_start_offset = prompt.size(); - bc.requestsInfo[0].num_tokens_in_batch = 1; - bc.tokensInfo[0].abs_depth_in_request = prompt.size(); - bc.tokensInfo[0].request_index = 0; - bc.tokensInfo[0].token_id = ir.token_ids[prompt.size() - 1]; - fm = im.inference(&inc_model, 0, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - future = fm.get_future(0); - ir = future.get_result(); - printf("decoding_tokens[%d] = %d\n", - bc.tokensInfo[0].abs_depth_in_request, - ir.token_ids[0]); - } -#endif - // Execution fence - { - Future future = runtime->issue_execution_fence(ctx); - future.get_void_result(); - } - - // float* data - std::cout << "----------inference finished--------------" << std::endl; -} - -void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/llama_spec_pipeline/llama_rae.cc b/examples/cpp/inference/llama_spec_pipeline/llama_rae.cc deleted file mode 100644 index 7116c4bf21..0000000000 --- a/examples/cpp/inference/llama_spec_pipeline/llama_rae.cc +++ /dev/null @@ -1,246 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/inference.h" -#include "models/llama.h" - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("llama"); - -void parse_input_args(char **argv, int argc, LLAMA::Config &config) { - for (int i = 1; i < argc; i++) { - // input - if (!strcmp(argv[i], "--dataset")) { - config.input_path = std::string(argv[++i]); - continue; - } - - // weights - if (!strcmp(argv[i], "--weights")) { - config.weight_file_path = std::string(argv[++i]); - continue; - } - } -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - FFConfig ffconfig; - LLAMA::Config llama_config; - - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, llama_config); - InferenceManager im(ffconfig, llama_config.batchSize, 1); - RequestManager rm; - // Add a single request - std::vector prompt{ - 1, 306, 4658, 278, 6593, 310, 2834, 338}; - rm.register_new_request(prompt, llama_config.sentence_len); - - FFModel beam_model(ffconfig), tree_model(ffconfig), inc_model(ffconfig); - LLAMA::create_llama_model(beam_model, im, llama_config, 1, BEAM_SEARCH_MODE); - LLAMA::create_llama_model(tree_model, im, llama_config, 1, TREE_VERIFY_MODE); - // LLAMA::create_llama_model(inc_model, im, llama_config, 1, - // INC_DECODING_MODE); - - // entry--------------------------- - int abs_depth = 0; - std::map beam_future_handlers, tree_future_handler; - std::map beam_batch_configs; - std::map tree_batch_configs; - - bool new_req = true; - TreeVerifyBatchConfig tree_bc; - InferenceResult ir; - int num_iterations = 2; - - for (int itr = 0; itr < num_iterations; itr++) { - printf("\n\n ITERATION %d \n\n", itr); - - // first iteration of beam search, calling prepare_next_batch_init - int beam_search_depth = 0; - int bid = 0; - BeamSearchBatchConfig bc; - bc = rm.prepare_next_batch_init(tree_bc, ir); - - printf("\n\n init beam search bc\n"); - printf("bc.num_tokens: %d\n", bc.num_tokens); - for (int i = 0; i < bc.num_tokens; i++) { - printf("bc.tokensInfo[%d].token_id: %d\n", i, bc.tokensInfo[i].token_id); - printf("bc.tokensInfo[%d].abs_depth_in_request: %d\n", - i, - bc.tokensInfo[i].abs_depth_in_request); - printf("bc.tokensInfo[%d].request_index: %d\n", - i, - bc.tokensInfo[i].request_index); - } - - std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - - // subsequent iterations of beam search - while (beam_search_depth < llama_config.max_beam_depth) { - // have luanched this bid - Future future = beam_future_handlers[bid]; - if (!future.is_ready(true /*subscribe*/)) { - continue; - } else { - std::cout << "future is ready...." 
<< std::endl; - } - // process end - BeamInferenceResult ir_beam = future.get_result(); - BeamSearchBatchConfig bc = beam_batch_configs[bid]; - abs_depth = bc.beamRequestsInfo[0].current_depth; - bc = rm.prepare_next_batch_beam(bc, ir_beam); - - std::cout << "llama current depth: " << abs_depth << std::endl; - std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - beam_search_depth++; - } - - // verify - printf("\n\n ------Final Beam Search Batch------\n"); - printf("[Beam] num_tokens: %d\n", bc.num_tokens); - for (int i = 0; i < bc.num_tokens; i++) { - std::cout << "[Token] Request Index: " << bc.tokensInfo[i].request_index - << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request - << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; - } - - printf("\n\n prepare tree_bc from final beam search bc\n"); - tree_bc = rm.prepare_next_batch_verify(bc); - - printf("\n\n\n t------Tree Verify Batch-------\n"); - // should have the same content as the hardcoded verification block below - // right now, it only contains the prompt - // need to add in the beam search result - - printf("[Verify] num_tokens : %d\n", tree_bc.num_tokens); - printf("[Verify] num_tokens_in_batch: %d\n", - tree_bc.requestsInfo[0].num_tokens_in_batch); - printf("------------------------------\n"); - - for (int i = 0; i < tree_bc.num_tokens; i++) { - std::cout << "[Token] Request Index: " - << tree_bc.tokensInfo[i].request_index - << ", Abs Depth: " << tree_bc.tokensInfo[i].abs_depth_in_request - << ", Token Id: " << tree_bc.tokensInfo[i].token_id << "\n"; - } - - fm = im.inference(&tree_model, 0, tree_bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - ir = future.get_result(); - for (int i = 0; i < tree_bc.num_tokens; i++) { - printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); - } - } - - // // original - // { - // std::vector tokens{1, - // 306, - // 4658, - // 278, - // 6593, - // 310, - // 2834, - // 338, - // 593, - // 595, - // 17252, - // 5031, - // 993, - // 616, - // 368, - // 2302}; - // BatchConfig bc; - // bc.num_tokens = 16; - // bc.requestsInfo[0].num_tokens_in_batch = bc.num_tokens; - // bc.requestsInfo[0].token_start_offset = 0; - // bc.requestsInfo[0].max_sequence_length = 347; - // bc.requestsInfo[0].request_guid = 1000000; - // bc.request_completed[0] = false; - // for (int i = 0; i < bc.num_tokens; i++) { - // bc.tokensInfo[i].token_id = tokens[i]; - // bc.tokensInfo[i].abs_depth_in_request = i; - // bc.tokensInfo[i].request_index = 0; - // } - // FutureMap fm = im.inference(&inc_model, 0, bc); - // assert(fm.get_future_map_domain().get_volume() == 1); - // Future future = fm.get_future(0); - // InferenceResult ir = future.get_result(); - // for (int i = 0; i < bc.num_tokens; i++) { - // printf("decoding_tokens[%d] = %d\n", i, ir.token_ids[i]); - // } - // } - - // // verification - // { - // std::vector tokens{1, - // 306, - // 4658, - // 278, - // 6593, - // 310, - // 2834, - // 338, - // 593, - // 595, - // 17252, - // 5031, - // 993, - // 616, - // 368, - // 2302}; - // tree_bc.num_tokens = 16; - // tree_bc.requestsInfo[0].num_tokens_in_batch = tree_bc.num_tokens; - // for (int i = 0; i < tree_bc.num_tokens; i++) { - // tree_bc.tokensInfo[i].token_id = tokens[i]; - // tree_bc.tokensInfo[i].abs_depth_in_request = i; - // 
tree_bc.tokensInfo[i].request_index = 0; - // } - // FutureMap fm = im.inference(&tree_model, 0, tree_bc); - // assert(fm.get_future_map_domain().get_volume() == 1); - // Future future = fm.get_future(0); - // InferenceResult ir = future.get_result(); - // for (int i = 0; i < tree_bc.num_tokens; i++) { - // printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); - // } - // } - - // Execution fence - { - Future future = runtime->issue_execution_fence(ctx); - future.get_void_result(); - } - - // float* data - std::cout << "----------inference finished--------------" << std::endl; -} - -void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/opt/CMakeLists.txt b/examples/cpp/inference/opt/CMakeLists.txt deleted file mode 100644 index 3156e71f75..0000000000 --- a/examples/cpp/inference/opt/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(FlexFlowExample_OPT) -set(project_target OPT) - - -set(CPU_SRC - ${FLEXFLOW_CPP_DRV_SRC} - opt.cc - opt.h - ${CMAKE_SOURCE_DIR}/inference/file_loader.cc) - - - -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) - -set(BIN_DEST "bin") -install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/opt/Makefile b/examples/cpp/inference/opt/Makefile deleted file mode 100644 index afe13d305a..0000000000 --- a/examples/cpp/inference/opt/Makefile +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Flags for directing the runtime makefile what to include -DEBUG ?= 0 # Include debugging symbols -MAX_DIM ?= 4 # Maximum number of dimensions -OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level -USE_CUDA ?= 1 # Include CUDA support (requires CUDA) -USE_GASNET ?= 0 # Include GASNet support (requires GASNet) -USE_HDF ?= 1 # Include HDF5 support (requires HDF5) -ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) - -# Put the binary file name here -OUTFILE ?= opt -# List all the application source files here -GEN_SRC = opt.cc -ifndef CUDA_HOME -CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) -endif - - -ifndef FF_HOME -$(error FF_HOME variable is not defined, aborting build) -endif - -include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/opt/README.md b/examples/cpp/inference/opt/README.md deleted file mode 100644 index 8a7bd10ee5..0000000000 --- a/examples/cpp/inference/opt/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# an example of running opt model -## how to run? -1. 
build the flexflow with FF_BUILD_ALL_INFERENCE_EXAMPLES or FF_BUILD_ALL_EXAMPLES -2. download the weight and token file from aws s3. -```bash -aws s3 cp s3://catalyst-llama/opt_125m_native.tar.gz FF_HOME/examples/cpp/inference/opt/weights - -tar -zxvf opt_125m_native.tar.gz -``` -3. run *OPT* with `--weights` `--dataset` `--only-data-parallel` -4. run examples/cpp/inference/opt/opt_baseline.py -5. if get same result, it should be fine - -## code structure: -1. use two inputs, token & position, the position input should be after the token input -2. for the attention model, set scaling_query = true, scaling_factor = 0.125 and qk_prod_scaling = false, -all other models should set scaling_query = false and qk_prod_scaling = true -## opt default configuration from huggingface opt-125m -```python -OPTConfig { - "_remove_final_layer_norm": false, - "activation_function": "relu", - "attention_dropout": 0.0, - "bos_token_id": 2, - "do_layer_norm_before": true, - "dropout": 0.1, - "enable_bias": true, - "eos_token_id": 2, - "ffn_dim": 3072, - "hidden_size": 768, - "init_std": 0.02, - "layer_norm_elementwise_affine": true, - "layerdrop": 0.0, - "max_position_embeddings": 2048, - "model_type": "opt", - "num_attention_heads": 12, - "num_hidden_layers": 12, - "pad_token_id": 1, - "transformers_version": "4.27.2", - "use_cache": true, - "vocab_size": 50272, - "word_embed_proj_dim": 768 -} -``` - diff --git a/examples/cpp/inference/opt/opt.cc b/examples/cpp/inference/opt/opt.cc deleted file mode 100644 index 453633c383..0000000000 --- a/examples/cpp/inference/opt/opt.cc +++ /dev/null @@ -1,238 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
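A note on the attention constants in the OPT README above: the 0.125 scaling factor is simply 1/sqrt(head_dim), where head_dim = hidden_size / num_attention_heads = 768 / 12 = 64 for opt-125m; the opt.cc example computes the same value as pow(hidden_size / num_attention_heads, -0.5). A minimal Python check of that relationship (the numbers are taken from the config listed in the README; nothing else is assumed):

```python
# Check that the README's scaling_factor of 0.125 matches 1/sqrt(head_dim)
# for the opt-125m configuration listed above.
hidden_size = 768
num_attention_heads = 12

head_dim = hidden_size // num_attention_heads   # 64
scaling_factor = head_dim ** -0.5               # 1/sqrt(64) = 0.125

assert abs(scaling_factor - 0.125) < 1e-12
print(f"head_dim={head_dim}, scaling_factor={scaling_factor}")
```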
- */ - -#include "opt.h" -#include "flexflow/inference.h" -#include - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("opt"); - -void parse_input_args(char **argv, int argc, OptConfig &config) { - for (int i = 1; i < argc; i++) { - // weights - if (!strcmp(argv[i], "--weights")) { - config.weight_file_path = std::string(argv[++i]); - continue; - } - } -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - FFConfig ffconfig; - OptConfig optConfig; - FFModel ff(ffconfig); - - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, optConfig); - - //------------------------------compute machine views ------------------ - int num_devices = ffconfig.workersPerNode * ffconfig.numNodes; - std::vector machine_views; - for (int i = 0; i < num_devices; i++) { - MachineView view; - view.device_type = MachineView::GPU; - view.ndims = 1; - view.dim[0] = 1; - view.stride[0] = 0; - view.start_device_id = i; - machine_views.push_back(view); - } - - std::unordered_map> mapping; - std::unordered_map weights_layers; - - //------------------------------ build the model -------------------------- - Tensor input; - Tensor position_input; - { - int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; - input = ff.create_tensor<2>(token_dims, DT_INT32); - position_input = ff.create_tensor<2>(token_dims, DT_INT32); - } - - mapping[input].push_back(machine_views[0]); - mapping[position_input].push_back(machine_views[0]); - - Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - std::vector axes = {0}; - - Tensor token = ff.embedding(input, - optConfig.vocab_size, - optConfig.word_embed_proj_dim, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - Layer *embedding = ff.layers.back(); - - weights_layers.emplace("embed_tokens_weight", embedding); - - Tensor positional_embedding = ff.embedding(position_input, - optConfig.max_position_embeddings, - optConfig.hidden_size, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - Layer *pos_embedding = ff.layers.back(); - weights_layers.emplace("embed_positions_weight", pos_embedding); - - Tensor residual = ff.add(token, positional_embedding); - - int num_transformer_layers_per_gpu = (32 + num_devices - 1) / num_devices; - - for (int i = 0; i < optConfig.num_hidden_layers; i++) { - // 125m, 1.7B, ..., 175B applies layer norm BEFORE attention, - // 350m applies layer norm AFTER attention - // https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#LL324C1-L325C1 - // this version is before normalization - - Tensor hidden_states = ff.layer_norm( - residual, axes, optConfig.layer_norm_elementwise_affine, 1e-05); - Layer *self_attn_layer_norm = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + - "_self_attn_layer_norm_weight", - self_attn_layer_norm); - if (i % num_transformer_layers_per_gpu == 0) { - mapping[hidden_states].push_back( - machine_views[i / num_transformer_layers_per_gpu]); - } - - Tensor mha = ff.inc_multihead_self_attention( - hidden_states, - optConfig.hidden_size, - optConfig.num_attention_heads, - optConfig.hidden_size / optConfig.num_attention_heads, - optConfig.hidden_size / optConfig.num_attention_heads, - 0.0f, - true, - false, - false, - NULL, - false, - /*scaling query*/ true, - /*sacling factor*/ - pow((optConfig.hidden_size / optConfig.num_attention_heads), -0.5), - 
/*qk_prod_scaling*/ false); - - Layer *attention_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", - attention_layer); - Tensor added = ff.add(mha, residual); - - Tensor final_norm = ff.layer_norm( - added, axes, optConfig.layer_norm_elementwise_affine, 1e-05); - Layer *final_layer_norm = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + - "_final_layer_norm_weight", - final_layer_norm); - - //--------linear fc1 fc2 ---------- - Tensor fc1 = ff.dense(final_norm, optConfig.ffn_dim, AC_MODE_NONE, true); - Layer *fc1_linear = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_fc1_weight", - fc1_linear); - Tensor activation = ff.relu(fc1, false); - - Tensor fc2 = - ff.dense(activation, optConfig.hidden_size, AC_MODE_NONE, true); - Layer *fc2_linear = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_fc2_weight", - fc2_linear); - residual = ff.add(added, fc2); - } - - // final - Tensor all_final_norm = ff.layer_norm( - residual, axes, optConfig.layer_norm_elementwise_affine, 1e-05); - Layer *all_final_norm_layer = ff.layers.back(); - weights_layers.emplace("final_layer_norm_weight", all_final_norm_layer); - - Tensor lm_head = - ff.dense(all_final_norm, optConfig.vocab_size, AC_MODE_NONE, false); - Layer *lm_head_layer = ff.layers.back(); - weights_layers.emplace("embed_tokens_weight_lm_head", lm_head_layer); - - Tensor output = ff.arg_top_k(lm_head, /*k=*/1, false); - //------------------- compile the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager im(ffconfig, 1, 1); - im.compile_model_and_allocate_buffer(&ff, mapping); - RequestManager rm; - - ParallelTensor input_pt; - ff.get_parallel_tensor_from_tensor(input, input_pt); - assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); - - ParallelTensor pos_pt; - ff.get_parallel_tensor_from_tensor(position_input, pos_pt); - assert(im.tensor_buffer.find(pos_pt) != im.tensor_buffer.end()); - - //-------------------load weights and inputs------------------ - FileDataLoader fileloader(optConfig.input_path, - optConfig.weight_file_path, - optConfig.num_attention_heads, - optConfig.hidden_size, - optConfig.hidden_size / - optConfig.num_attention_heads); - //"Today is a beautiful day and I want" - std::vector prompt = {2, 5625, 16, 10, 2721, 183, 8, 38, 236}; - rm.register_new_request(prompt, 30); - fileloader.load_weights(&ff, weights_layers); - - im.init_operators_inference(&ff); - int depth = 0; - std::map future_handlers; - std::map batch_configs; - int sentence_length = 9; - while (true) { - int bid = 0; - if (future_handlers.find(bid) == future_handlers.end()) { - BatchConfig bc; - InferenceResult ir; - bc = rm.prepare_next_batch(bc, ir); - FutureMap fm = im.inference(&ff, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; - } else { - Future future = future_handlers[bid]; - if (!future.is_ready(true /*subscribe*/)) { - continue; - } else { - std::cout << "future is ready...." 
<< std::endl; - } - // process end - InferenceResult ir = future.get_result(); - BatchConfig bc = batch_configs[bid]; - bc = rm.prepare_next_batch(bc, ir); - sentence_length += bc.num_tokens; - FutureMap fm = im.inference(&ff, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; - } - } - std::cout << "----------inference finished--------------" << std::endl; -} - -void FlexFlow::register_custom_tasks() {} diff --git a/examples/cpp/inference/opt/opt.h b/examples/cpp/inference/opt/opt.h deleted file mode 100644 index 6b9a45f2d5..0000000000 --- a/examples/cpp/inference/opt/opt.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "file_loader.h" -#include "inference_config.h" -struct OptConfig : InferenceConfig { - OptConfig(void) : InferenceConfig() { - vocab_size = 50272, word_embed_proj_dim = 768, hidden_size = 768; - max_position_embeddings = 2048; - layer_norm_elementwise_affine = true; - num_attention_heads = 12; - dropout = 0.1; - seed = 3; - ffn_dim = 3072; - num_hidden_layers = 12; - } - int word_embed_proj_dim; - std::string input_path; - std::string weight_file_path; - int max_position_embeddings; - bool layer_norm_elementwise_affine; - float dropout; - unsigned long long seed; - int ffn_dim; - int num_hidden_layers; -}; diff --git a/examples/cpp/inference/opt/opt_baseline.py b/examples/cpp/inference/opt/opt_baseline.py deleted file mode 100644 index 3e8d7499f0..0000000000 --- a/examples/cpp/inference/opt/opt_baseline.py +++ /dev/null @@ -1,23 +0,0 @@ -from transformers import OPTConfig, OPTForCausalLM, GPT2Tokenizer - -model_id = "facebook/opt-125m" -tokenizer = GPT2Tokenizer.from_pretrained(model_id) -model = OPTForCausalLM.from_pretrained(model_id) - -prompts = [ - "Today is a beautiful day and I want", - ] - -for prompt in prompts: - input_ids = tokenizer(prompt, return_tensors="pt", padding=True).input_ids - print(input_ids) - generated_ids = model.generate(input_ids, max_length=30) - generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - print(generated_ids) - print(generated_string) - -#get same results with this and opt.cc -# tensor([[ 2, 5625, 16, 10, 2721, 183, 8, 38, 236, 7, -# 458, 19, 47, 5, 2770, 527, 9, 127, 78, 655, -# 1805, 7, 5, 4105, 4, 50118, 100, 21, 98, 2283]]) -# 2, 5625, 16, 10, 2721, 183, 8, 38, 236, 7, 458, 19, 47, 5, 2770, 527, 9, 127, 78, 655, 1805, 7, 5, 4105, 4, 50118, 100, 21, 98, 2283, \ No newline at end of file diff --git a/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt b/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt deleted file mode 100644 index d7937d7595..0000000000 --- a/examples/cpp/inference/opt_spec_pipeline/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(FlexFlowExample_opt_pipeline) -set(project_target opt_pipeline) 
- - -set(CPU_SRC - ${FLEXFLOW_CPP_DRV_SRC} - opt_pipeline.cc - ${CMAKE_SOURCE_DIR}/inference/file_loader.cc - ${CMAKE_SOURCE_DIR}/inference/models/opt.cc) - - -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) - -set(BIN_DEST "bin") -install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/examples/cpp/inference/opt_spec_pipeline/Makefile b/examples/cpp/inference/opt_spec_pipeline/Makefile deleted file mode 100644 index b4a7866073..0000000000 --- a/examples/cpp/inference/opt_spec_pipeline/Makefile +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Flags for directing the runtime makefile what to include -DEBUG ?= 0 # Include debugging symbols -MAX_DIM ?= 4 # Maximum number of dimensions -OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level -USE_CUDA ?= 1 # Include CUDA support (requires CUDA) -USE_GASNET ?= 0 # Include GASNet support (requires GASNet) -USE_HDF ?= 1 # Include HDF5 support (requires HDF5) -ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) - -# Put the binary file name here -OUTFILE ?= opt_pipeline -# List all the application source files here -ifndef CUDA_HOME -CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) -endif - - -ifndef FF_HOME -$(error FF_HOME variable is not defined, aborting build) -endif - -include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc b/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc deleted file mode 100644 index 1229ad13c3..0000000000 --- a/examples/cpp/inference/opt_spec_pipeline/opt_pipeline.cc +++ /dev/null @@ -1,203 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/inference.h" -#include "flexflow/tokenizers.h" -#include "models/opt.h" - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("opt"); - -void parse_input_args(char **argv, int argc, OPT::Config &config) { - for (int i = 1; i < argc; i++) { - // weights - if (!strcmp(argv[i], "--weights")) { - config.weight_file_path = std::string(argv[++i]); - continue; - } - // tokenizer - if (!strcmp(argv[i], "--tokenizer")) { - config.tokenizer_assets_folder = std::string(argv[++i]); - continue; - } - } -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - FFConfig ffconfig; - OPT::Small_Config opt_config; - - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, opt_config); - std::string const vocab_filepath = - opt_config.tokenizer_assets_folder + "/gpt2-vocab.json"; - std::string const merges_filepath = - opt_config.tokenizer_assets_folder + "/gpt2-merges.txt"; - OptTokenizer opt_tokenizer(vocab_filepath, merges_filepath); - InferenceManager im(ffconfig, opt_config.batchSize, 1); - RequestManager rm(&opt_tokenizer); - // Add a single request - // std::vector prompt = { - // 2, 5625, 16, 10, 2721, 183, 8, 38, 236}; - // rm.register_new_request(prompt, opt_config.sentence_len); - std::string text = "I believe the meaning of life is"; - rm.register_new_request(text, - opt_config.sentence_len /*max_sequence_length*/); - - FFModel beam_model(ffconfig), tree_model(ffconfig); - OPT::create_opt_model(beam_model, im, opt_config, 1, BEAM_SEARCH_MODE); - OPT::create_opt_model(tree_model, im, opt_config, 1, TREE_VERIFY_MODE); - - // entry--------------------------- - int depth = 0; - std::map beam_future_handlers, tree_future_handler; - std::map beam_batch_configs; - std::map tree_batch_configs; - - bool new_req = true; - TreeVerifyBatchConfig tree_bc; - - int iteration = 0; - - while (depth < opt_config.max_beam_depth) { - int bid = 0; - if (beam_future_handlers.find(bid) == beam_future_handlers.end()) { - BeamSearchBatchConfig bc; - InferenceResult ir; - bc = rm.prepare_next_batch_init(tree_bc, ir); - - std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - } else { - // have luanched this bid - Future future = beam_future_handlers[bid]; - if (!future.is_ready(true /*subscribe*/)) { - continue; - } else { - std::cout << "future is ready...." 
<< std::endl; - } - // process end - BeamInferenceResult ir = future.get_result(); - BeamSearchBatchConfig bc = beam_batch_configs[bid]; - depth = bc.beamRequestsInfo[0].current_depth; - bc = rm.prepare_next_batch_beam(bc, ir); - - std::cout << "opt current depth: " << depth << std::endl; - std::cout << "sub_requests: " << bc.sub_requests[0] << "\n"; - FutureMap fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - - // tranverse the tree in dfs order; - if (depth >= opt_config.max_beam_depth) { - - printf("\n\n ------Final Beam Search Batch------\n"); - printf("[Beam] num_tokens: %d\n", bc.num_tokens); - for (int i = 0; i < bc.num_tokens; i++) { - std::cout << "[Token] Request Index: " - << bc.tokensInfo[i].request_index - << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request - << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; - } - - // printf("\n\n prepare tree_bc from final beam search bc\n"); - tree_bc = rm.prepare_next_batch_verify(bc); - - printf("\n\n\n ------Tree Verify Batch-------\n"); - // should have the same content as the hardcoded verification block - // below right now, it only contains the prompt need to add in the beam - // search result - - printf("[Verify] num_tokens : %d\n", tree_bc.num_tokens); - printf("[Verify] num_tokens_in_batch: %d\n", - tree_bc.requestsInfo[0].num_tokens_in_batch); - printf("------------------------------\n"); - - for (int i = 0; i < tree_bc.num_tokens; i++) { - std::cout << "[Token] Request Index: " - << tree_bc.tokensInfo[i].request_index << ", Abs Depth: " - << tree_bc.tokensInfo[i].abs_depth_in_request - << ", Token Id: " << tree_bc.tokensInfo[i].token_id << "\n"; - } - - printf("\n\n ------Commit Verified Tokens-------\n"); - for (int i = 0; i < tree_bc.num_tokens_to_commit; i++) { - std::cout << "[Commit] Request Index: " - << tree_bc.commited_tokens[i].request_index - << ", Abs Depth: " << tree_bc.commited_tokens[i].token_depth - << ", Token Index in batch: " - << tree_bc.commited_tokens[i].token_index << "\n"; - } - - FutureMap fm = im.inference(&tree_model, 0, tree_bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - InferenceResult ir = future.get_result(); - for (int i = 0; i < tree_bc.num_tokens; i++) { - if (i == 7) { - std::cout << "------------------\n"; - } - printf("verify_tokens[%d] = %d\n", i, ir.token_ids[i]); - } - - std::cout << "------Init New Beam Search Batch------\n"; - bc = rm.prepare_next_batch_init(tree_bc, ir); - std::cout << "[Init] num_tokens: " << bc.num_tokens << "\n"; - for (int i = 0; i < bc.num_tokens; i++) { - std::cout << "[Token] Request Index: " - << bc.tokensInfo[i].request_index - << ", Abs Depth: " << bc.tokensInfo[i].abs_depth_in_request - << ", Token Id: " << bc.tokensInfo[i].token_id << "\n"; - } - std::cout << "Batch Depth: " << bc.beamRequestsInfo[0].current_depth - << "\n"; - - iteration++; - - if (iteration < 4) { - std::cout << "\n\n~~~~~~~~~~teration " << iteration << "~~~~~~~~~~\n"; - depth = bc.beamRequestsInfo[0].current_depth; - fm = im.inference(&beam_model, bid, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - beam_future_handlers[bid] = fm.get_future(0); - beam_batch_configs[bid] = bc; - } else { - break; - } - } - } - } - - // Execution fence - { - Future future = runtime->issue_execution_fence(ctx); - future.get_void_result(); - } - - // float* data - std::cout << "----------inference 
finished--------------" << std::endl; -} - -void FlexFlow::register_custom_tasks() {} diff --git a/include/gpt_tokenizer.h b/include/flexflow/gpt_tokenizer.h similarity index 99% rename from include/gpt_tokenizer.h rename to include/flexflow/gpt_tokenizer.h index 0a2388925a..ec08435809 100644 --- a/include/gpt_tokenizer.h +++ b/include/flexflow/gpt_tokenizer.h @@ -31,7 +31,7 @@ struct hash_pair { } }; -enum tokenizer_mode { GPT2, OPT }; +enum tokenizer_mode { GPT2_TOKENIZER, OPT_TOKENIZER }; class GPT_Tokenizer { diff --git a/include/flexflow/tokenizers.h b/include/flexflow/tokenizers.h index ffce2d423e..8f6c309aad 100644 --- a/include/flexflow/tokenizers.h +++ b/include/flexflow/tokenizers.h @@ -67,7 +67,7 @@ class OptTokenizer : public Tokenizer { public: OptTokenizer(std::string const &vocab_file, // path to "gpt2-vocab.json" std::string const &merges_file) // path to "gpt2-merges.txt" - : tokenizer(OPT, vocab_file, merges_file) { + : tokenizer(OPT_TOKENIZER, vocab_file, merges_file) { bos_token_id = 0; eos_token_id = 2; } diff --git a/inference/.gitignore b/inference/.gitignore new file mode 100644 index 0000000000..93699cdd9f --- /dev/null +++ b/inference/.gitignore @@ -0,0 +1,3 @@ +weights +tokenizer +prompt \ No newline at end of file diff --git a/inference/spec_infer/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md similarity index 89% rename from inference/spec_infer/MODEL_WEIGHTS.md rename to inference/MODEL_WEIGHTS.md index 79a194b159..e46e6b45d1 100644 --- a/inference/spec_infer/MODEL_WEIGHTS.md +++ b/inference/MODEL_WEIGHTS.md @@ -1,6 +1,7 @@ To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. 
```python +from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") for name, params in model.named_parameters(): @@ -22,6 +23,6 @@ for name, params in model.named_parameters(): .replace("lm_head", "output") .replace("model_", "") ) - params.detach().cpu().numpy().tofile('weights/' + name) + params.detach().cpu().numpy().tofile('weights/llama_7B_weights/' + name) ``` diff --git a/inference/file_loader.cc b/inference/file_loader.cc index e21b39fcaf..250a030c4a 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -84,6 +84,7 @@ void load_attention_bias(float *ptr, for (auto file : bias_files) { size_t partial_size = hidden_dim; + // std::cout << "Loading filename: " << file << std::endl; std::ifstream in(file, std::ios::in | std::ios::binary); assert(in.good() && "incorrect bias file path"); std::vector host_array(partial_size); @@ -148,6 +149,10 @@ void load_attention_weights(float *ptr, size_t partial_size = one_weight_file_size; std::ifstream in(file, std::ios::in | std::ios::binary); + // std::cout << "Loading filename: " << file << std::endl; + if (!in.good()) { + std::cout << "Could not open file: " << file << std::endl; + } assert(in.good() && "incorrect weight file path"); std::vector host_array(partial_size); size_t loaded_data_size = sizeof(float) * partial_size; @@ -179,7 +184,11 @@ void load_attention_weights(float *ptr, } void load_from_file(float *ptr, size_t size, std::string filename) { + // std::cout << "Loading filename: " << filename << std::endl; std::ifstream in(filename, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << filename << std::endl; + } assert(in.good() && "incorrect weight file path"); std::vector host_array(size); size_t loaded_data_size = sizeof(float) * size; diff --git a/examples/cpp/inference/LLAMA/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt similarity index 76% rename from examples/cpp/inference/LLAMA/CMakeLists.txt rename to inference/incr_decoding/CMakeLists.txt index b31e04b0a5..761a710d71 100644 --- a/examples/cpp/inference/LLAMA/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -1,14 +1,15 @@ cmake_minimum_required(VERSION 3.10) -project(FlexFlowExample_LLAMA) -set(project_target LLAMA) +project(FlexFlow_IncrDecoding) +set(project_target incr_decoding) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} - llama.cc - ${CMAKE_SOURCE_DIR}/inference/file_loader.cc - ${CMAKE_SOURCE_DIR}/inference/models/llama.cc) + incr_decoding.cc + ../file_loader.cc + ../models/llama.cc + ../models/opt.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/examples/cpp/inference/llama_spec_pipeline/Makefile b/inference/incr_decoding/Makefile similarity index 100% rename from examples/cpp/inference/llama_spec_pipeline/Makefile rename to inference/incr_decoding/Makefile diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc new file mode 100644 index 0000000000..fb6269c568 --- /dev/null +++ b/inference/incr_decoding/incr_decoding.cc @@ -0,0 +1,187 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
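To sanity-check the conversion above, the exported binaries can be read back with numpy. This is only an illustrative sketch: `.tofile()` writes raw float32 values with no header, so the reader must already know the tensor's shape, and the file name and shape below are hypothetical placeholders.

```python
# Hypothetical read-back of one weight file produced by the conversion snippet above.
# weight_path and expected_shape are illustrative; substitute a real exported tensor and its shape.
import numpy as np

weight_path = "weights/llama_7B_weights/norm_weight"  # hypothetical file name
expected_shape = (4096,)                              # hypothetical shape for that tensor

flat = np.fromfile(weight_path, dtype=np.float32)
tensor = flat.reshape(expected_shape)
print(tensor.shape, tensor.dtype)
```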
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/tokenizers.h" +#include "models/llama.h" +#include "models/opt.h" +#include +#include + +using namespace Legion; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string llm_weight_file_path; + std::string llm_config_file_path; + std::string prompt_file_path; + std::string tokenizer_file_path; +}; + +enum ModelType { UNKNOWN, LLAMA, OPT }; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + ModelType &llm_model_type) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + std::string model_type_str = std::string(argv[++i]); + std::transform(model_type_str.begin(), + model_type_str.end(), + model_type_str.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (model_type_str == "llama") { + llm_model_type = ModelType::LLAMA; + } else if (model_type_str == "opt") { + llm_model_type = ModelType::OPT; + } else { + llm_model_type = ModelType::UNKNOWN; + } + continue; + } + // llm model weights + if (!strcmp(argv[i], "-llm-weight")) { + paths.llm_weight_file_path = std::string(argv[++i]); + continue; + } + // llm model configs + if (!strcmp(argv[i], "-llm-config")) { + paths.llm_config_file_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // tokenizer + if (!strcmp(argv[i], "-tokenizer")) { + paths.tokenizer_file_path = std::string(argv[++i]); + continue; + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + FilePaths file_paths; + ModelType model_type; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, argc, file_paths, model_type); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // Create SentencePiece tokenizer or OPT tokenizer + SentencePieceTokenizer *sp_tokenizer = nullptr; + OptTokenizer *opt_tokenizer = nullptr; + if (model_type == ModelType::LLAMA) { + sp_tokenizer = new SentencePieceTokenizer(file_paths.tokenizer_file_path); + } else { + std::string tokenizer_folder = + (!file_paths.tokenizer_file_path.empty() && + file_paths.tokenizer_file_path.back() != '/') + ? 
file_paths.tokenizer_file_path + '/' + : file_paths.tokenizer_file_path; + std::string vocab_file = tokenizer_folder + "gpt2-vocab.json"; + std::string merges_file = tokenizer_folder + "gpt2-merges.txt"; + std::filesystem::path path1(vocab_file); + std::filesystem::path path2(merges_file); + assert(std::filesystem::exists(path1) && + "Vocab file gpt2-vocab.json does not exist at the specified path"); + assert(std::filesystem::exists(path2) && + "Merge file gpt2-merges.txt does not exist at the specified path"); + opt_tokenizer = new OptTokenizer(vocab_file, merges_file); + } + + InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); + RequestManager rm((model_type == ModelType::LLAMA) + ? (Tokenizer *)sp_tokenizer + : (Tokenizer *)opt_tokenizer); + int total_num_requests = 0; + { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + rm.register_new_request(text, 128 /*max_sequence_length*/); + } + } + + FFModel model(ffconfig); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + im, + file_paths.llm_config_file_path, + file_paths.llm_weight_file_path, + ffconfig.workersPerNode * ffconfig.numNodes, + INC_DECODING_MODE); + } else { + assert(model_type == ModelType::OPT); + OPT::create_opt_model(model, + im, + file_paths.llm_config_file_path, + file_paths.llm_weight_file_path, + ffconfig.workersPerNode * ffconfig.numNodes, + INC_DECODING_MODE); + } + + BatchConfig bc; + InferenceResult ir; + while (rm.get_num_processed_requests() < total_num_requests) { + bc = rm.prepare_next_batch(bc, ir); + if (rm.get_num_processed_requests() >= total_num_requests) { + break; + } + FutureMap fm = im.inference(&model, 0, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + ir = future.get_result(); + } + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory + if (model_type == ModelType::LLAMA) { + delete sp_tokenizer; + } else { + delete opt_tokenizer; + } +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/models/configs/llama_190M.json b/inference/models/configs/llama_190M.json new file mode 100644 index 0000000000..d8d281fcf4 --- /dev/null +++ b/inference/models/configs/llama_190M.json @@ -0,0 +1,10 @@ +{ + "n_layers": 12, + "vocab_size": 50265, + "n_heads": 12, + "dim": 768, + "multiple_of": 256, + "norm_eps": 1e-6, + "total_requests": 2560, + "incremental_mode": true +} diff --git a/inference/models/configs/llama_7B.json b/inference/models/configs/llama_7B.json new file mode 100644 index 0000000000..5adfc68d90 --- /dev/null +++ b/inference/models/configs/llama_7B.json @@ -0,0 +1,10 @@ +{ + "n_layers": 32, + "vocab_size": 32000, + "n_heads": 32, + "dim": 4096, + "multiple_of": 256, + "norm_eps": 1e-6, + "total_requests": 2560, + "incremental_mode": true +} diff --git a/inference/models/configs/opt_125M.json b/inference/models/configs/opt_125M.json new file mode 100644 index 0000000000..0b9feed922 --- /dev/null +++ 
b/inference/models/configs/opt_125M.json @@ -0,0 +1,15 @@ +{ + "vocab_size": 50272, + "word_embed_proj_dim": 768, + "hidden_size": 768, + "num_attention_heads": 12, + "max_position_embeddings": 2048, + "layer_norm_elementwise_affine": true, + "num_hidden_layers": 12, + "dropout": 0.1, + "ffn_dim": 3072, + "max_beam_width": 1, + "batchSize": 8, + "sentence_len": 100, + "max_beam_depth": 4 +} diff --git a/inference/models/configs/opt_6B.json b/inference/models/configs/opt_6B.json new file mode 100644 index 0000000000..cc86ce0f8f --- /dev/null +++ b/inference/models/configs/opt_6B.json @@ -0,0 +1,15 @@ +{ + "vocab_size": 50272, + "word_embed_proj_dim": 4096, + "hidden_size": 4096, + "num_attention_heads": 32, + "max_position_embeddings": 2048, + "layer_norm_elementwise_affine": true, + "num_hidden_layers": 32, + "dropout": 0.1, + "ffn_dim": 16384, + "max_beam_width": 1, + "batchSize": 8, + "sentence_len": 100, + "max_beam_depth": 4 +} diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 4a4eca1c8a..d4b57be6e8 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -19,45 +19,14 @@ namespace FlexFlow { using namespace Legion; -LLAMA::Config LLAMA::create_190m_config() { - Config config; - config.n_layers = 12; - config.vocab_size = 50265; - config.dim = 768; - config.n_heads = 12; - config.hidden_dim = 3072; - return config; -} - -LLAMA::Config LLAMA::create_7b_config() { - // The default config is for llama 7b - Config config; - return config; -} - -// Deprecated API -void LLAMA::create_llama_model(FFModel &ff, - InferenceManager &im, - Config const &llama_config, - int num_pipeline_stages, - InferenceMode mode) { - assert(false); -} - void LLAMA::create_llama_model(FFModel &ff, InferenceManager &im, - std::string const &model_name, + std::string const &model_config_file_path, std::string const &weight_file_path, int num_pipeline_stages, InferenceMode mode) { - Config llama_config; - if (model_name == "190m" || model_name == "190M") { - llama_config = create_190m_config(); - } else if (model_name == "7b" || model_name == "7B") { - llama_config = create_7b_config(); - } else { - assert(false && "Invalide model_name"); - } + Config llama_config(model_config_file_path); + llama_config.printConfig(); //------------------------------compute machine views ------------------ int num_devices = ff.config.workersPerNode * ff.config.numNodes; std::vector machine_views; diff --git a/inference/models/llama.h b/inference/models/llama.h index e99beb92ca..3233f00786 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -17,6 +17,9 @@ #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" +#include +#include +using json = nlohmann::json; namespace FlexFlow { @@ -44,28 +47,75 @@ class LLAMA { hidden_dim = multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); } + + Config(std::string config_filepath) { + std::ifstream config_file(config_filepath); + if (config_file.is_open()) { + try { + json config_json; + config_file >> config_json; + + n_layers = config_json["n_layers"]; + vocab_size = config_json["vocab_size"]; + n_heads = config_json["n_heads"]; + dim = config_json["dim"]; + multiple_of = config_json["multiple_of"]; + norm_eps = config_json["norm_eps"]; + total_requests = config_json["total_requests"]; + incremental_mode = config_json["incremental_mode"]; + // Override values below + /* max_seq_len = config_json["max_seq_len"]; + max_num_tokens = config_json["max_num_tokens"]; + max_beam_width = 
config_json["max_beam_width"]; + max_beam_depth = config_json["max_beam_depth"]; + hidden_dim = config_json["hidden_dim"]; */ + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + hidden_dim = 4 * dim; + hidden_dim = int(2 * hidden_dim / 3); + hidden_dim = + multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); + } catch (json::exception const &e) { + std::cerr << "Error parsing JSON file: " << e.what() << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file." << std::endl; + assert(false); + } + } + + void printConfig() const { + std::cout << "LLAMA Config:" << std::endl; + std::cout << "n_layers: " << n_layers << std::endl; + std::cout << "vocab_size: " << vocab_size << std::endl; + std::cout << "n_heads: " << n_heads << std::endl; + std::cout << "dim: " << dim << std::endl; + std::cout << "multiple_of: " << multiple_of << std::endl; + std::cout << "norm_eps: " << norm_eps << std::endl; + std::cout << "total_requests: " << total_requests << std::endl; + std::cout << "incremental_mode: " << incremental_mode << std::endl; + std::cout << "max_seq_len: " << max_seq_len << std::endl; + std::cout << "max_num_tokens: " << max_num_tokens << std::endl; + std::cout << "max_beam_width: " << max_beam_width << std::endl; + std::cout << "max_beam_depth: " << max_beam_depth << std::endl; + std::cout << "hidden_dim: " << hidden_dim << std::endl; + } + int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, total_requests, incremental_mode, max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; float norm_eps; - std::string weight_file_path; - std::string input_path; - std::string tokenizer_file_path; }; static void create_llama_model(FFModel &ff, InferenceManager &im, - std::string const &model_name, + std::string const &model_config_file_path, std::string const &weight_file_path, int num_pipeline_stages, InferenceMode mode); - static void create_llama_model(FFModel &ff, - InferenceManager &im, - LLAMA::Config const &llama_config, - int num_pipeline_stages, - InferenceMode mode); - static Config create_190m_config(); - static Config create_7b_config(); }; }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 52d1ed6a84..57406929fa 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -21,9 +21,12 @@ using namespace Legion; void OPT::create_opt_model(FFModel &ff, InferenceManager &im, - Config const &opt_config, + std::string const &model_config_file_path, + std::string const &weight_file_path, int num_pipeline_stages, InferenceMode mode) { + Config opt_config(model_config_file_path); + opt_config.printConfig(); //------------------------------compute machine views ------------------ int num_devices = ff.config.workersPerNode * ff.config.numNodes; std::vector machine_views; @@ -90,7 +93,7 @@ void OPT::create_opt_model(FFModel &ff, residual, axes, opt_config.layer_norm_elementwise_affine, 1e-05); Layer *self_attn_layer_norm = ff.layers.back(); weights_layers.emplace("layers_" + std::to_string(i) + - "_self_attn_layer_norm_weight", + "_attention_layer_norm_weight", self_attn_layer_norm); if (i % num_transformer_layers_per_stage == 0) { @@ -215,14 +218,14 @@ void OPT::create_opt_model(FFModel &ff, //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; 
im.compile_model_and_allocate_buffer(&ff, mapping); - FileDataLoader fileloader(opt_config.input_path, - opt_config.weight_file_path, + FileDataLoader fileloader("", + weight_file_path, opt_config.num_attention_heads, opt_config.hidden_size, opt_config.hidden_size / opt_config.num_attention_heads); fileloader.load_weights(&ff, weights_layers); - std::cout << "------load wieght finished----------" << std::endl; + std::cout << "------finished loading weights----------" << std::endl; im.init_operators_inference(&ff); } diff --git a/inference/models/opt.h b/inference/models/opt.h index 6ffc4220e9..d336c498d9 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -17,6 +17,9 @@ #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" +#include +#include +using json = nlohmann::json; namespace FlexFlow { @@ -27,23 +30,69 @@ class OPT { vocab_size = 50272; word_embed_proj_dim = 4096; hidden_size = 4096; + num_attention_heads = 32; max_position_embeddings = 2048; layer_norm_elementwise_affine = true; - num_hidden_layers = 32; dropout = 0.1; ffn_dim = 16384; + num_hidden_layers = 32; max_beam_width = 1; batchSize = 8; sentence_len = 100; max_beam_depth = 4; } + Config(std::string config_filepath) { + std::ifstream config_file(config_filepath); + if (config_file.is_open()) { + try { + json config_json; + config_file >> config_json; + + vocab_size = config_json["vocab_size"]; + word_embed_proj_dim = config_json["word_embed_proj_dim"]; + hidden_size = config_json["hidden_size"]; + num_attention_heads = config_json["num_attention_heads"]; + max_position_embeddings = config_json["max_position_embeddings"]; + layer_norm_elementwise_affine = + config_json["layer_norm_elementwise_affine"]; + dropout = config_json["dropout"]; + ffn_dim = config_json["ffn_dim"]; + num_hidden_layers = config_json["num_hidden_layers"]; + max_beam_width = config_json["max_beam_width"]; + batchSize = config_json["batchSize"]; + sentence_len = config_json["sentence_len"]; + max_beam_depth = config_json["max_beam_depth"]; + } catch (json::exception const &e) { + std::cerr << "Error parsing JSON file: " << e.what() << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file." 
<< std::endl; + assert(false); + } + } + void printConfig() const { + std::cout << "OPT Config:" << std::endl; + std::cout << "vocab_size: " << vocab_size << std::endl; + std::cout << "word_embed_proj_dim: " << word_embed_proj_dim << std::endl; + std::cout << "hidden_size: " << hidden_size << std::endl; + std::cout << "num_attention_heads: " << num_attention_heads << std::endl; + std::cout << "max_position_embeddings: " << max_position_embeddings + << std::endl; + std::cout << "layer_norm_elementwise_affine: " << std::boolalpha + << layer_norm_elementwise_affine << std::endl; + std::cout << "dropout: " << dropout << std::endl; + std::cout << "ffn_dim: " << ffn_dim << std::endl; + std::cout << "num_hidden_layers: " << num_hidden_layers << std::endl; + std::cout << "max_beam_width: " << max_beam_width << std::endl; + std::cout << "batchSize: " << batchSize << std::endl; + std::cout << "sentence_len: " << sentence_len << std::endl; + std::cout << "max_beam_depth: " << max_beam_depth << std::endl; + } int vocab_size; int word_embed_proj_dim; int hidden_size; int num_attention_heads; - std::string input_path; - std::string weight_file_path; - std::string tokenizer_assets_folder; int max_position_embeddings; bool layer_norm_elementwise_affine; float dropout; @@ -55,19 +104,10 @@ class OPT { int max_beam_depth; }; - struct Small_Config : public Config { - Small_Config(void) { - word_embed_proj_dim = 768; - hidden_size = 768; - num_attention_heads = 12; - ffn_dim = 3072; - num_hidden_layers = 12; - } - }; - static void create_opt_model(FFModel &ff, InferenceManager &im, - Config const &opt_config, + std::string const &model_config_file_path, + std::string const &weight_file_path, int num_pipeline_stages, InferenceMode mode); }; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 2068da7f3e..d894b46084 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -16,6 +16,8 @@ #include "flexflow/inference.h" #include "flexflow/tokenizers.h" #include "models/llama.h" +#include "models/opt.h" +#include #include using namespace Legion; @@ -24,24 +26,79 @@ LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { std::string llm_weight_file_path; + std::string llm_config_file_path; std::vector ssm_weight_file_paths; + std::vector ssm_config_file_paths; std::string prompt_file_path; std::string tokenizer_file_path; }; -void parse_input_args(char **argv, int argc, FilePaths &paths) { +enum ModelType { UNKNOWN, LLAMA, OPT }; + +struct ModelTypes { + ModelType llm_model_type; + std::vector ssm_model_types; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + ModelTypes &model_types) { for (int i = 1; i < argc; i++) { - // weights + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + std::string model_type_str = std::string(argv[++i]); + std::transform(model_type_str.begin(), + model_type_str.end(), + model_type_str.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (model_type_str == "llama") { + model_types.llm_model_type = ModelType::LLAMA; + } else if (model_type_str == "opt") { + model_types.llm_model_type = ModelType::OPT; + } else { + model_types.llm_model_type = ModelType::UNKNOWN; + } + continue; + } + // llm model weights if (!strcmp(argv[i], "-llm-weight")) { paths.llm_weight_file_path = std::string(argv[++i]); continue; } - // weights + // llm model configs + if (!strcmp(argv[i], "-llm-config")) { + paths.llm_config_file_path = std::string(argv[++i]); + continue; + } + 
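// [Editor's note] The -llm-model handling above (and the analogous -ssm-model
// handling that follows) repeats the same lowercase-and-match logic. As an
// editorial sketch only, not part of this patch, that mapping could be
// factored into one helper that would sit at file scope next to
// parse_input_args; it reuses the ModelType enum defined earlier in this file
// and assumes <algorithm> and <cctype> are available.
static ModelType parse_model_type(std::string s) {
  std::transform(s.begin(), s.end(), s.begin(),
                 [](unsigned char c) { return std::tolower(c); });
  if (s == "llama") {
    return ModelType::LLAMA;
  }
  if (s == "opt") {
    return ModelType::OPT;
  }
  return ModelType::UNKNOWN;
}
// Hypothetical usage inside the argument loop:
//   model_types.llm_model_type = parse_model_type(std::string(argv[++i]));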
// ssm models types + if (!strcmp(argv[i], "-ssm-model")) { + std::string model_type_str = std::string(argv[++i]); + std::transform(model_type_str.begin(), + model_type_str.end(), + model_type_str.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (model_type_str == "llama") { + model_types.ssm_model_types.push_back(ModelType::LLAMA); + } else if (model_type_str == "opt") { + model_types.ssm_model_types.push_back(ModelType::OPT); + } else { + model_types.ssm_model_types.push_back(ModelType::UNKNOWN); + } + continue; + } + // ssm model weights if (!strcmp(argv[i], "-ssm-weight")) { std::string file_path = std::string(argv[++i]); paths.ssm_weight_file_paths.push_back(file_path); continue; } + // ssm model configs + if (!strcmp(argv[i], "-ssm-config")) { + std::string file_path = std::string(argv[++i]); + paths.ssm_config_file_paths.push_back(file_path); + continue; + } // prompts if (!strcmp(argv[i], "-prompt")) { paths.prompt_file_path = std::string(argv[++i]); @@ -61,14 +118,60 @@ void FlexFlow::top_level_task(Task const *task, Runtime *runtime) { FFConfig ffconfig; FilePaths file_paths; + ModelTypes model_types; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args(argv, argc, file_paths); - SentencePieceTokenizer tokenizer(file_paths.tokenizer_file_path); + parse_input_args(argv, argc, file_paths, model_types); + if (file_paths.ssm_weight_file_paths.size() == 0) { + assert(false && + "SpecInfer needs at least one SSM for speculative inference"); + } + if (file_paths.ssm_config_file_paths.size() != + file_paths.ssm_weight_file_paths.size()) { + assert(false && "Number of SSM config files passed does not match number " + "of SSM weights"); + } + assert(model_types.llm_model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + if (model_types.ssm_model_types.size() != + file_paths.ssm_weight_file_paths.size()) { + assert(false && "Number of valid SSM model types passed does not match " + "number of SSM weights"); + } + for (auto mt : model_types.ssm_model_types) { + if (mt == ModelType::UNKNOWN) { + assert(false && "One of the SSM model types passed is invalid."); + } + } + + // Create SentencePiece tokenizer or OPT tokenizer + SentencePieceTokenizer *sp_tokenizer = nullptr; + OptTokenizer *opt_tokenizer = nullptr; + if (model_types.llm_model_type == ModelType::LLAMA) { + sp_tokenizer = new SentencePieceTokenizer(file_paths.tokenizer_file_path); + } else { + std::string tokenizer_folder = + (!file_paths.tokenizer_file_path.empty() && + file_paths.tokenizer_file_path.back() != '/') + ? file_paths.tokenizer_file_path + '/' + : file_paths.tokenizer_file_path; + std::string vocab_file = tokenizer_folder + "gpt2-vocab.json"; + std::string merges_file = tokenizer_folder + "gpt2-merges.txt"; + std::filesystem::path path1(vocab_file); + std::filesystem::path path2(merges_file); + assert(std::filesystem::exists(path1) && + "Vocab file gpt2-vocab.json does not exist at the specified path"); + assert(std::filesystem::exists(path2) && + "Merge file gpt2-merges.txt does not exist at the specified path"); + opt_tokenizer = new OptTokenizer(vocab_file, merges_file); + } + InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); - RequestManager rm(&tokenizer); + RequestManager rm((model_types.llm_model_type == ModelType::LLAMA) + ? 
(Tokenizer *)sp_tokenizer + : (Tokenizer *)opt_tokenizer); int total_num_requests = 0; { using json = nlohmann::json; @@ -85,25 +188,39 @@ void FlexFlow::top_level_task(Task const *task, rm.register_new_request(text, 128 /*max_sequence_length*/); } } - if (file_paths.ssm_weight_file_paths.size() == 0) { - assert(false && - "SpecInfer needs at least one SSM for speculative inference"); - } FFModel beam_model(ffconfig); FFModel tree_model(ffconfig); - LLAMA::create_llama_model(beam_model, - im, - "190m", - file_paths.ssm_weight_file_paths[0], - 1, - BEAM_SEARCH_MODE); - LLAMA::create_llama_model(tree_model, - im, - "7b", - file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes, - TREE_VERIFY_MODE); + if (model_types.ssm_model_types[0] == ModelType::LLAMA) { + LLAMA::create_llama_model(beam_model, + im, + file_paths.ssm_config_file_paths[0], + file_paths.ssm_weight_file_paths[0], + 1, + BEAM_SEARCH_MODE); + } else { + OPT::create_opt_model(beam_model, + im, + file_paths.ssm_config_file_paths[0], + file_paths.ssm_weight_file_paths[0], + 1, + BEAM_SEARCH_MODE); + } + if (model_types.llm_model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(tree_model, + im, + file_paths.llm_config_file_path, + file_paths.llm_weight_file_path, + ffconfig.workersPerNode * ffconfig.numNodes, + TREE_VERIFY_MODE); + } else { + OPT::create_opt_model(tree_model, + im, + file_paths.llm_config_file_path, + file_paths.llm_weight_file_path, + ffconfig.workersPerNode * ffconfig.numNodes, + TREE_VERIFY_MODE); + } TreeVerifyBatchConfig tree_bc; BeamSearchBatchConfig beam_bc; @@ -146,6 +263,13 @@ void FlexFlow::top_level_task(Task const *task, // float* data std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory + if (model_types.llm_model_type == ModelType::LLAMA) { + delete sp_tokenizer; + } else { + delete opt_tokenizer; + } } void FlexFlow::register_custom_tasks() {} diff --git a/inference/utils/download_llama_weights.py b/inference/utils/download_llama_weights.py new file mode 100644 index 0000000000..fa40125f00 --- /dev/null +++ b/inference/utils/download_llama_weights.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +import os +import requests +from transformers import AutoModelForCausalLM + +# Change working dir to folder storing this script +abspath = os.path.abspath(__file__) +dname = os.path.dirname(abspath) +os.chdir(dname) + +def convert_hf_model(model, dst_folder): + os.makedirs(dst_folder, exist_ok=True) + for name, params in model.named_parameters(): + name = ( + name.replace(".", "_") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("o_proj", "wo") + .replace("mlp", "feed_forward") + .replace("gate_proj", "w1") + .replace("down_proj", "w2") + .replace("up_proj", "w3") + .replace("input_layernorm", "attention_norm") + .replace("post_attention_layernorm", "ffn_norm") + .replace("embed_tokens", "tok_embeddings") + .replace("lm_head", "output") + .replace("model_", "") + ) + params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + +# Download and convert big model weights +model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") +dst_folder="../weights/llama_7B_weights" +convert_hf_model(model, dst_folder) + +# Download and convert small model weights +model = AutoModelForCausalLM.from_pretrained("Bingsu/llama-190m-arch") +dst_folder="../weights/llama_190M_weights" +convert_hf_model(model, dst_folder) + +# Download tokenizer 
+os.makedirs("../tokenizer", exist_ok=True) +tokenizer_filepath = '../tokenizer/tokenizer.model' +url = 'https://specinfer.s3.us-east-2.amazonaws.com/tokenizer/tokenizer.model' +r = requests.get(url) +open(tokenizer_filepath , 'wb').write(r.content) diff --git a/inference/utils/download_opt_weights.py b/inference/utils/download_opt_weights.py new file mode 100644 index 0000000000..ceade81e65 --- /dev/null +++ b/inference/utils/download_opt_weights.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +import os +import requests +from transformers import AutoModelForCausalLM + +# Change working dir to folder storing this script +abspath = os.path.abspath(__file__) +dname = os.path.dirname(abspath) +os.chdir(dname) + +def convert_hf_model(model, dst_folder): + os.makedirs(dst_folder, exist_ok=True) + for name, params in model.named_parameters(): + name = ( + name.replace(".", "_") + .replace("decoder_", "") + .replace("model_", "") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("out_proj", "wo") + ) + params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + +# Download and convert big model weights +model = AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b") +dst_folder="../weights/opt_6B_weights" +convert_hf_model(model, dst_folder) + +# Download and convert small model weights +model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") +dst_folder="../weights/opt_125M_weights" +convert_hf_model(model, dst_folder) + +# Download tokenizer files +os.makedirs("../tokenizer", exist_ok=True) +tokenizer_filepath = '../tokenizer/gpt2-vocab.json' +url = 'https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json' +r = requests.get(url) +open(tokenizer_filepath , 'wb').write(r.content) +tokenizer_filepath = '../tokenizer/gpt2-merges.txt' +url = 'https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt' +r = requests.get(url) +open(tokenizer_filepath , 'wb').write(r.content) diff --git a/src/runtime/gpt_tokenizer.cc b/src/runtime/gpt_tokenizer.cc index b011ad470a..56fdd05b3b 100644 --- a/src/runtime/gpt_tokenizer.cc +++ b/src/runtime/gpt_tokenizer.cc @@ -3,7 +3,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2019-2020 zili wang . -#include "gpt_tokenizer.h" +#include using json = nlohmann::json; @@ -277,7 +277,7 @@ void GPT_Tokenizer::encode(std::string str, input_ids->push_back(vocab[pad_token]); mask_ids->push_back(0); } - if (mode == OPT) { + if (mode == OPT_TOKENIZER) { mask_ids->insert(mask_ids->begin(), 1); input_ids->insert(input_ids->begin(), 2); } @@ -290,9 +290,10 @@ std::string GPT_Tokenizer::decode(std::vector input_ids, int index = 0; for (auto const &id : input_ids) { if (index == 0) { - if (mode == OPT) { - assert(id == 2); - index++; + if (mode == OPT_TOKENIZER) { + if (id == 2) { + index++; + } continue; } } diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/gpt_tokenizer.cpp b/tests/gpt_tokenizer.cpp index 8ddfa75e1c..eb8ea069af 100644 --- a/tests/gpt_tokenizer.cpp +++ b/tests/gpt_tokenizer.cpp @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "gpt_tokenizer.h" +#include #include @@ -22,11 +22,12 @@ int main(int argc, char *argv[]) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } - tokenizer_mode mode = strcmp(argv[1], "gpt-2") == 0 ? GPT2 : OPT; - std::string vocab_file = - mode == GPT2 ? 
"./gpt2_bpe/vocab.bpe" : "opt_bpe/vocab.bpe"; - std::string merge_file = - mode == GPT2 ? "./gpt2_bpe/encoder.json" : "opt_bpe/encoder.json"; + tokenizer_mode mode = + strcmp(argv[1], "gpt-2") == 0 ? GPT2_TOKENIZER : OPT_TOKENIZER; + std::string vocab_file = mode == GPT2_TOKENIZER ? "./gpt2_bpe/vocab.bpe" + : "opt_bpe/gpt2-merges.txt"; + std::string merge_file = mode == GPT2_TOKENIZER ? "./gpt2_bpe/encoder.json" + : "opt_bpe/gpt2-vocab.json"; GPT_Tokenizer tokenizer(mode, merge_file, vocab_file); @@ -37,7 +38,7 @@ int main(int argc, char *argv[]) { std::cout << "Error opening input file" << std::endl; return -1; } - std::ofstream outfile(mode == GPT2 + std::ofstream outfile(mode == GPT2_TOKENIZER ? "./wikitext-103-raw/wiki.valid.bpe.flexflow.gpt2" : "./wikitext-103-raw/wiki.valid.bpe.flexflow.opt", std::ofstream::out); diff --git a/tests/gpt_tokenizer_test.sh b/tests/gpt_tokenizer_test.sh index 6134d4e592..de6d018372 100755 --- a/tests/gpt_tokenizer_test.sh +++ b/tests/gpt_tokenizer_test.sh @@ -69,8 +69,8 @@ diff ./wikitext-103-raw/wiki.valid.bpe.flexflow.gpt2 ./wikitext-103-raw/wiki.val # Download OPT vocab and merge files mkdir -p opt_bpe -wget -O opt_bpe/encoder.json https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json -wget -O opt_bpe/vocab.bpe https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt +wget -O opt_bpe/gpt2-vocab.json https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json +wget -O opt_bpe/gpt2-merges.txt https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt # Run the FlexFlow C++ tokenizer (OPT) ./gpt_tokenizer opt diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh new file mode 100755 index 0000000000..5e7b746e49 --- /dev/null +++ b/tests/inference_tests.sh @@ -0,0 +1,55 @@ +#! 
/usr/bin/env bash +set -x +set -e + +cleanup() { + rm -rf ../inference/prompt ../inference/weights ../inference/tokenizer +} + +copy_embedding_weights(){ + cp ../inference/weights/opt_6B_weights/embed_tokens_weight ../inference/weights/opt_6B_weights/embed_tokens_weight_lm_head + cp ../inference/weights/opt_125M_weights/embed_tokens_weight ../inference/weights/opt_125M_weights/embed_tokens_weight_lm_head +} + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Clean up before test (just in case) +cleanup + +# Update the transformers library to support the LLAMA model +pip3 install --upgrade transformers + +# Download the weights +python3 ../inference/utils/download_llama_weights.py +python3 ../inference/utils/download_opt_weights.py + +# because huggingface reuse a weight in embedding and final linear +copy_embedding_weights + +# Create test prompt file +mkdir -p ../inference/prompt +echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json + +############################################################################################### +############################ Speculative inference tests ###################################### +############################################################################################### + +# LLAMA +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_190M_weights/ -ssm-config ../inference/models/configs/llama_190M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json + +# OPT +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json + +############################################################################################### +############################ Incremental decoding tests ####################################### +############################################################################################### + +# LLAMA +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json + +# OPT +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json + +# Clean up after test +cleanup From 28b31cde755d7161ca38733e6af71fb9e874e4a4 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 24 May 2023 22:00:20 -0500 Subject: [PATCH 140/344] Merging duplicate functions in IncMHA, SpecIncMHA, and TreeIncNHA (#736) * making TreeIncMultiHeadSelfAttentionMeta a subclass of IncMultiHeadSelfAttentionMeta * make BeamSearchIncMultiHeadAttentionMeta a subclass of IncMultiHeadAttentionMeta * format * merging kernel functions * merge more functions * merge compute_qkv_kernel * format * fix config 
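The refactoring summarized in the bullets above can be illustrated with a small
sketch (toy class names and sizes, not the actual FlexFlow types): the base
attention Meta takes an explicit InferenceMode tag and sizes its KV cache
accordingly, while each mode-specific Meta delegates to the base constructor
and keeps only the state unique to its mode.

#include <cstddef>
#include <cstdio>

enum InferenceMode { INC_DECODING_MODE, BEAM_SEARCH_MODE, TREE_VERIFY_MODE };

struct BaseAttnMeta {
  std::size_t key_cache_entries = 0;
  BaseAttnMeta(InferenceMode mode, std::size_t kProjSize,
               std::size_t max_seq_len, std::size_t max_requests,
               std::size_t max_beam_width) {
    // Beam search keeps one cache slot per beam; the other two modes keep one.
    std::size_t beams = (mode == BEAM_SEARCH_MODE) ? max_beam_width : 1;
    key_cache_entries = kProjSize * max_seq_len * max_requests * beams;
  }
};

struct BeamAttnMeta : BaseAttnMeta {
  // Only beam search needs per-beam token metadata.
  int *beam_token_infos = nullptr;
  BeamAttnMeta(std::size_t kProjSize, std::size_t max_seq_len,
               std::size_t max_requests, std::size_t max_beam_width)
      : BaseAttnMeta(BEAM_SEARCH_MODE, kProjSize, max_seq_len, max_requests,
                     max_beam_width) {}
};

int main() {
  BeamAttnMeta meta(/*kProjSize=*/64, /*max_seq_len=*/256, /*max_requests=*/1,
                    /*max_beam_width=*/2);
  std::printf("key cache entries: %zu\n", meta.key_cache_entries);
  return 0;
}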
--------- Co-authored-by: xinhaoc --- include/flexflow/batch_config.h | 12 +- .../ops/inc_multihead_self_attention.h | 20 + .../inc_multihead_self_attention_kernels.h | 60 +++ .../ops/spec_inc_multihead_self_attention.h | 38 +- .../ops/tree_inc_multihead_self_attention.h | 26 +- inference/models/configs/llama_190M.json | 1 + inference/models/configs/llama_7B.json | 1 + inference/models/llama.h | 12 +- src/ops/inc_multihead_self_attention.cpp | 27 + src/ops/inc_multihead_self_attention.cu | 106 +++- src/ops/spec_inc_multihead_self_attention.cpp | 21 +- src/ops/spec_inc_multihead_self_attention.cu | 472 +++--------------- src/ops/tree_inc_multihead_self_attention.cpp | 22 +- src/ops/tree_inc_multihead_self_attention.cu | 428 ++-------------- src/runtime/batch_config.cc | 2 +- src/runtime/beam_search_batch_config.cc | 2 +- src/runtime/request_manager.cc | 7 +- src/runtime/tree_verify_batch_config.cc | 2 +- 18 files changed, 361 insertions(+), 898 deletions(-) create mode 100644 include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 6e7f0cb6fd..391b514de6 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -15,6 +15,7 @@ #pragma once +#include "flexflow/ffconst.h" #include #include @@ -30,11 +31,6 @@ class BeamInferenceResult; class BatchConfig { public: - enum Mode { - INC_DECODING_MODE = 0, - BEAM_SEARCH_MODE = 1, - TREE_VERIFY_MODE = 2 - }; using RequestGuid = size_t; using TokenId = int; BatchConfig(); @@ -47,7 +43,7 @@ class BatchConfig { int num_active_requests() const; int num_active_tokens() const; void print() const; - virtual Mode get_mode() const; + virtual InferenceMode get_mode() const; static int const MAX_NUM_REQUESTS = 1; static int const MAX_NUM_TOKENS = 64; static int const MAX_SEQ_LENGTH = 256; @@ -77,7 +73,7 @@ class TreeVerifyBatchConfig : public BatchConfig { public: TreeVerifyBatchConfig(); ~TreeVerifyBatchConfig(); - Mode get_mode() const; + InferenceMode get_mode() const; // struct PerTokenInfo : BatchConfig::PerTokenInfo { // int tree_branch_idx; // }; @@ -102,7 +98,7 @@ class BeamSearchBatchConfig : public BatchConfig { public: BeamSearchBatchConfig(); BeamSearchBatchConfig(size_t beam_width, size_t target_iterations); - Mode get_mode() const; + InferenceMode get_mode() const; ~BeamSearchBatchConfig(); diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 6b29feac67..a118e62f0b 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -124,6 +124,26 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { Legion::Memory gpu_mem, int num_samples, int _num_heads); + IncMultiHeadSelfAttentionMeta(FFHandler handler, + InferenceMode infer_mode, + Op const *attn, + int _qSize, + int _kSize, + int _vSize, + int _qProjSize, + int _kProjSize, + int _vProjSize, + int _oProjSize, + bool _apply_rotary_embedding, + bool _bias, + bool _scaling_query, + bool _qk_prod_scaling, + bool _add_bias_kv, + float _scaling_factor, + float const *weight_ptr, + Legion::Memory gpu_mem, + int num_samples, + int _num_heads); ~IncMultiHeadSelfAttentionMeta(void); public: diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h new file mode 100644 index 0000000000..3cbaebe618 --- /dev/null +++ 
b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -0,0 +1,60 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_ATTENTION_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_ATTENTION_KERNELS_H + +#include "flexflow/batch_config.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/inc_multihead_self_attention.h" + +namespace FlexFlow { +namespace Kernels { +namespace IncMultiHeadAttention { + +__global__ void build_w_out_tensor(float const *weight_ptr, + float *contiguous_weight_ptr, + int vProjSize, + int oProjSize, + int num_heads, + int qkv_weight_block_size); + +__global__ void apply_proj_bias_w(float *input_ptr, + float const *bias_ptr, + int num_tokens, + int oProjSize); + +__global__ void apply_proj_bias_qkv(float *input_ptr, + float const *bias_ptr, + int num_tokens, + int qProjSize, + int kProjSize, + int vProjSize, + int num_heads, + bool scaling_query, + float scaling_factor); + +__global__ void + apply_rotary_embedding(float *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int q_block_size, + int k_block_size, + int v_block_size, + bool q_tensor); + +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + float const *bias_ptr, + cudaStream_t stream); +} // namespace IncMultiHeadAttention +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_ATTENTION_KERNELS_H diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index da2825c9d2..57afb73a03 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -8,6 +8,7 @@ #include "flexflow/node.h" #include "flexflow/op_meta.h" #include "flexflow/operator.h" +#include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/spec_inc_multihead_self_attention_params.h" #include "math.h" #include @@ -120,7 +121,7 @@ class SpecIncMultiHeadSelfAttention : public Op { int qoSeqLength, kvSeqLength; }; -class SpecIncMultiHeadSelfAttentionMeta : public OpMeta { +class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: SpecIncMultiHeadSelfAttentionMeta(FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, @@ -131,37 +132,10 @@ class SpecIncMultiHeadSelfAttentionMeta : public OpMeta { ~SpecIncMultiHeadSelfAttentionMeta(void); public: - Realm::RegionInstance reserveInst; - size_t weights_params, weightSize, reserveSpaceSize; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int num_heads; - bool *has_load_weights; - bool *apply_rotary_embedding; - bool *bias; - bool *scaling_query; - bool *qk_prod_scaling; - float scaling_factor; -#ifdef INFERENCE_TESTS - float *kcache, *vcache; -#endif - /*#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnAttnDescriptor_t attnDesc; - cudnnSeqDataDescriptor_t qDesc, kDesc, vDesc, oDesc; - #endif*/ - // int *devQoSeqArray, *devKvSeqArray, *loWinIdx, *hiWinIdx, *kvCache; - float *devQKVProjArray, *keyCache, *valueCache; - float *qk_prods, *qk_prods_softmax; - float *attn_heads, *W_out_contiguous; -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cuFloatComplex *complex_input; -#endif - // void *reserveSpace; - - // 
BatchConfig::token_idxs *dev_token2ids; - BatchConfig::PerTokenInfo *tokenInfos; - BatchConfig::PerRequestInfo *requestInfos; - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos; - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos; + Realm::RegionInstance beam_search_reserve_inst; + BatchConfig::PerRequestInfo *request_infos; + BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index b1b265e6d6..58775bf40d 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -8,7 +8,7 @@ #include "flexflow/node.h" #include "flexflow/op_meta.h" #include "flexflow/operator.h" -#include "flexflow/ops/inc_multihead_self_attention_params.h" +#include "flexflow/ops/inc_multihead_self_attention.h" #include "math.h" #include #include @@ -116,7 +116,7 @@ class TreeIncMultiHeadSelfAttention : public Op { int qoSeqLength, kvSeqLength; }; -class TreeIncMultiHeadSelfAttentionMeta : public OpMeta { +class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: TreeIncMultiHeadSelfAttentionMeta(FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, @@ -127,28 +127,8 @@ class TreeIncMultiHeadSelfAttentionMeta : public OpMeta { ~TreeIncMultiHeadSelfAttentionMeta(void); public: - Realm::RegionInstance reserveInst; - size_t weights_params, weightSize, reserveSpaceSize; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int num_heads; int num_active_tokens; - bool *has_load_weights; - bool *apply_rotary_embedding; - bool *bias; - bool *scaling_query; - bool *qk_prod_scaling; - float scaling_factor; -#ifdef INFERENCE_TESTS - float *kcache, *vcache; -#endif - float *devQKVProjArray, *keyCache, *valueCache; - float *qk_prods, *qk_prods_softmax; - float *attn_heads, *W_out_contiguous; -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cuFloatComplex *complex_input; -#endif - - TreeVerifyBatchConfig::PerTokenInfo *token_infos; + Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; }; diff --git a/inference/models/configs/llama_190M.json b/inference/models/configs/llama_190M.json index d8d281fcf4..b0bd90cb82 100644 --- a/inference/models/configs/llama_190M.json +++ b/inference/models/configs/llama_190M.json @@ -6,5 +6,6 @@ "multiple_of": 256, "norm_eps": 1e-6, "total_requests": 2560, + "hidden_dim": 3072, "incremental_mode": true } diff --git a/inference/models/configs/llama_7B.json b/inference/models/configs/llama_7B.json index 5adfc68d90..0c32ed320d 100644 --- a/inference/models/configs/llama_7B.json +++ b/inference/models/configs/llama_7B.json @@ -6,5 +6,6 @@ "multiple_of": 256, "norm_eps": 1e-6, "total_requests": 2560, + "hidden_dim": 11008, "incremental_mode": true } diff --git a/inference/models/llama.h b/inference/models/llama.h index 3233f00786..dba68af678 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -36,16 +36,11 @@ class LLAMA { norm_eps = 1e-6; total_requests = 2560; incremental_mode = true; + hidden_dim = 11008; max_seq_len = BatchConfig::MAX_SEQ_LENGTH; max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; - - // hidden 
dim - hidden_dim = 4 * dim; - hidden_dim = int(2 * hidden_dim / 3); - hidden_dim = - multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); } Config(std::string config_filepath) { @@ -63,6 +58,7 @@ class LLAMA { norm_eps = config_json["norm_eps"]; total_requests = config_json["total_requests"]; incremental_mode = config_json["incremental_mode"]; + hidden_dim = config_json["hidden_dim"]; // Override values below /* max_seq_len = config_json["max_seq_len"]; max_num_tokens = config_json["max_num_tokens"]; @@ -73,10 +69,6 @@ class LLAMA { max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; - hidden_dim = 4 * dim; - hidden_dim = int(2 * hidden_dim / 3); - hidden_dim = - multiple_of * int((hidden_dim + multiple_of - 1) / multiple_of); } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index c56e73a266..0669b347a7 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -70,6 +70,33 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( checkCUDNN(miopenSetStream(handler.dnn, stream)); } +IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( + FFHandler handler, + InferenceMode infer_mode, + Op const *attn, + int _qSize, + int _kSize, + int _vSize, + int _qProjSize, + int _kProjSize, + int _vProjSize, + int _oProjSize, + bool _apply_rotary_embedding, + bool _bias, + bool _scaling_query, + bool _qk_prod_scaling, + bool _add_bias_kv, + float _scaling_factor, + float const *weight_ptr, + Memory gpu_mem, + int num_samples, + int _num_heads) + : OpMeta(handler, attn) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(handler.dnn, stream)); +} + IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 099e54305c..bc4c91aa4f 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -16,6 +16,7 @@ #include "cuComplex.h" #endif #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -24,6 +25,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +namespace Kernels { +namespace IncMultiHeadAttention { + __global__ void build_w_out_tensor(float const *weight_ptr, float *contiguous_weight_ptr, int vProjSize, @@ -301,6 +305,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + __global__ void store_kv_cache(float const *devQKVProjArray, float *cache_ptr, BatchConfig::PerTokenInfo const *tokenInfos, @@ -683,22 +692,64 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( Memory gpu_mem, int num_samples, int _num_heads) + : IncMultiHeadSelfAttentionMeta(handler, + INC_DECODING_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->apply_rotary_embedding, + attn->bias, + attn->scaling_query, + attn->qk_prod_scaling, + attn->add_bias_kv, + attn->scaling_factor, + 
weight_ptr, + gpu_mem, + num_samples, + _num_heads) {} + +IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( + FFHandler handler, + InferenceMode infer_mode, + Op const *attn, + int _qSize, + int _kSize, + int _vSize, + int _qProjSize, + int _kProjSize, + int _vProjSize, + int _oProjSize, + bool _apply_rotary_embedding, + bool _bias, + bool _scaling_query, + bool _qk_prod_scaling, + bool _add_bias_kv, + float _scaling_factor, + float const *weight_ptr, + Memory gpu_mem, + int num_samples, + int _num_heads) : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); - qSize = attn->qSize; - kSize = attn->kSize; - vSize = attn->vSize; + qSize = _qSize; + kSize = _kSize; + vSize = _vSize; // assume dimensions match for now assert(qSize == kSize); assert(kSize == vSize); - qProjSize = attn->qProjSize; - kProjSize = attn->kProjSize; + qProjSize = _qProjSize; + kProjSize = _kProjSize; assert(qProjSize == kProjSize); // required for attention QK^T matmul - vProjSize = attn->vProjSize; - oProjSize = attn->oProjSize; + vProjSize = _vProjSize; + oProjSize = _oProjSize; num_heads = _num_heads; weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + @@ -707,16 +758,16 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( has_load_weights = (bool *)calloc(1, sizeof(bool)); *has_load_weights = false; apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = attn->apply_rotary_embedding; + *apply_rotary_embedding = _apply_rotary_embedding; bias = (bool *)calloc(1, sizeof(bool)); - *bias = attn->bias; + *bias = _bias; scaling_query = (bool *)calloc(1, sizeof(bool)); - *scaling_query = attn->scaling_query; - scaling_factor = attn->scaling_factor; + *scaling_query = _scaling_query; + scaling_factor = _scaling_factor; qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); - *qk_prod_scaling = attn->qk_prod_scaling; + *qk_prod_scaling = _qk_prod_scaling; // Currently do not support adding bias to key/value projection - assert(!attn->add_bias_kv); + assert(!_add_bias_kv); #ifdef INFERENCE_TESTS kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * @@ -732,12 +783,29 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; size_t qkv_max_proj_size = BatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; - size_t key_cache_size = num_heads * kProjSize * - BatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; - size_t value_cache_size = num_heads * vProjSize * - BatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; + size_t key_cache_size = 0, value_cache_size = 0; + switch (infer_mode) { + case INC_DECODING_MODE: + case TREE_VERIFY_MODE: { + key_cache_size = num_heads * kProjSize * BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH; + value_cache_size = num_heads * vProjSize * + BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH; + break; + } + case BEAM_SEARCH_MODE: { + key_cache_size = + num_heads * kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + value_cache_size = + num_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + break; + } + default: + assert(false && "Unkown inference mode"); + } size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; size_t qk_prod_size = 
BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_heads; diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 765891ed53..b898d472a8 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -64,7 +64,26 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( Memory gpu_mem, int num_samples, int _num_heads) - : OpMeta(handler, attn) { + : IncMultiHeadSelfAttentionMeta(handler, + BEAM_SEARCH_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->apply_rotary_embedding, + attn->bias, + attn->scaling_query, + attn->qk_prod_scaling, + attn->add_bias_kv, + attn->scaling_factor, + weight_ptr, + gpu_mem, + num_samples, + _num_heads) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index ad4e59b710..f12d48ab1d 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -15,6 +15,7 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" @@ -23,286 +24,7 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; using Legion::Memory; - -__global__ void spec_build_w_out_tensor(float const *weight_ptr, - float *contiguous_weight_ptr, - int vProjSize, - int oProjSize, - int num_heads, - int qkv_weight_block_size) { - CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { - int row_idx = i % vProjSize; - int col_idx = (i / vProjSize) % oProjSize; - int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[i] = - weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + - qkv_weight_block_size + col_idx * vProjSize + row_idx]; - } -} - -__global__ void spec_apply_proj_bias_w(float *input_ptr, - float const *bias_ptr, - int num_tokens, - int oProjSize) { - CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = 3 * oProjSize + i % oProjSize; - input_ptr[i] += bias_ptr[bias_idx]; - } -} - -__global__ void spec_apply_proj_bias_qkv(float *input_ptr, - float const *bias_ptr, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int num_heads, - bool scaling_query, - float scaling_factor) { - CUDA_KERNEL_LOOP( - i, num_tokens * (qProjSize + kProjSize + vProjSize) * num_heads) { - // for simplicity, assume q, k, v is in same shape - // 0->q, 1->k, 2->v - int qkv_index = i / (num_tokens * qProjSize) % 3; - - int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); - int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int q_block_size = qProjSize * num_tokens; - - int idx = i % (num_tokens * (qProjSize)); - - int real_part_index = - head_idx * qkv_block_size + qkv_index * q_block_size + idx; - int bias_idx = qkv_index * qProjSize * num_heads + head_idx * qProjSize + - (idx % qProjSize); - input_ptr[real_part_index] += bias_ptr[bias_idx]; - - if (scaling_query && qkv_index == 0) { - input_ptr[real_part_index] *= scaling_factor; - } - } -} - -__global__ void - spec_apply_rotary_embedding(float *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo 
*tokenInfos, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int q_block_size, - int k_block_size, - int v_block_size, - bool q_tensor) { - int proj_size = q_tensor ? qProjSize : kProjSize; - CUDA_KERNEL_LOOP(i, num_tokens * proj_size * num_heads / 2) { - // create complex number - int head_idx = i / (num_tokens * proj_size / 2); - int idx = i % (num_tokens * proj_size / 2); - int token_idx = - (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - - int real_part_index = - idx + token_idx * (proj_size / 2) + - head_idx * (q_block_size + k_block_size + v_block_size) + - (q_tensor ? 0 : q_block_size); - int complex_part_index = real_part_index + (proj_size / 2); - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 - // apply a Cartesian coordinate transformation - // multiple with input & /copy back to q/k - - // get position of token - // int head_idx = i / (num_tokens * proj_size); - - // size_t pos = id_map[token_idx].token_position; - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - - int pos_i = i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -void compute_qkv_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr, - cudaStream_t stream) { - - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; - assert(m->qSize == m->vSize && m->qSize == m->kSize); - cudaDataType_t data_type = ff_to_cuda_datatype(DT_FLOAT); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = CUDA_R_32F; -#endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_heads - int m_q = m->qProjSize; - int m_k = m->kProjSize; - int m_v = m->vProjSize; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int lda = k, ldb = k, ldc_q = m_q, ldc_k = m_k, ldc_v = m_v; - size_t strideA = - m->weights_params; // need to also skip over all the parameters for each - // head, plus the unused W_o weights - size_t strideB = 0; // input stays the same for all heads. - size_t strideC = - (m_q + m_k + m_v) * n; // size of the output block for each head. 
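// [Editor's note] A worked example of the stride arithmetic used by the now
// shared compute_qkv_kernel (the strideA/strideB/strideC comments above). The
// sizes below are illustrative only: each head's Q/K/V projections plus its
// (here unused) W_o block are laid out contiguously, the input activations
// are reused by every head, and each head writes one packed (q+k+v) x tokens
// output block.
#include <cstddef>
#include <cstdio>

int main() {
  std::size_t qProjSize = 64, kProjSize = 64, vProjSize = 64;
  std::size_t qSize = 512, oProjSize = 512, num_tokens = 16;
  // strideA: parameters per head (Q, K, V projections plus the W_o block)
  std::size_t strideA =
      qSize * (qProjSize + kProjSize + vProjSize) + oProjSize * vProjSize;
  // strideB: 0, because the same input matrix is shared across all heads
  std::size_t strideB = 0;
  // strideC: one head's packed Q/K/V output block
  std::size_t strideC = (qProjSize + kProjSize + vProjSize) * num_tokens;
  std::printf("strideA=%zu strideB=%zu strideC=%zu\n", strideA, strideB,
              strideC);
  return 0;
}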
- // Q - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_q, - n, - k, - &alpha, - weight_ptr, - data_type, - lda, - strideA, - input_ptr, - data_type, - ldb, - strideB, - &beta, - output_ptr, - data_type, - ldc_q, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_k, - n, - k, - &alpha, - weight_ptr + m_q * k, - data_type, - lda, - strideA, - input_ptr, - data_type, - ldb, - strideB, - &beta, - output_ptr + m_q * n, - data_type, - ldc_k, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // V - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_v, - n, - k, - &alpha, - weight_ptr + (m_q + m_k) * k, - data_type, - lda, - strideA, - input_ptr, - data_type, - ldb, - strideB, - &beta, - output_ptr + (m_q + m_k) * n, - data_type, - ldc_v, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - // apply rotary emmmbedding for k and v - // step1 change the k, v to complex tensor - int num_tokens = bc->num_active_tokens(); - - int parallelism = m->kProjSize * num_tokens * m->num_heads; - int q_block_size = m->qProjSize * num_tokens; - int k_block_size = m->kProjSize * num_tokens; - int v_block_size = m->vProjSize * num_tokens; - // apply bias for q, k, v - if (*m->bias) { - spec_apply_proj_bias_qkv<<>>(output_ptr, - bias_ptr, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - m->num_heads, - *m->scaling_query, - m->scaling_factor); - } - - if (*m->apply_rotary_embedding) { - /*q*/ - spec_apply_rotary_embedding<<>>(output_ptr, - m->complex_input, - m->tokenInfos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - true); - /*k*/ - spec_apply_rotary_embedding<<>>(output_ptr, - m->complex_input, - m->tokenInfos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - false); - } - checkCUDA(cudaDeviceSynchronize()); -} +using namespace Kernels::IncMultiHeadAttention; __global__ void spec_store_kv_cache( float const *devQKVProjArray, @@ -445,10 +167,10 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, 0, stream>>>(m->devQKVProjArray, m->keyCache, - m->tokenInfos, - m->requestInfos, - m->beamTokenInfos, - m->beamRequestInfos, + m->token_infos, + m->request_infos, + m->beam_token_infos, + m->beam_request_infos, m->qProjSize, m->kProjSize, m->vProjSize, @@ -465,10 +187,10 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, 0, stream>>>(m->devQKVProjArray, m->valueCache, - m->tokenInfos, - m->requestInfos, - m->beamTokenInfos, - m->beamRequestInfos, + m->token_infos, + m->request_infos, + m->beam_token_infos, + m->beam_request_infos, m->qProjSize, m->kProjSize, m->vProjSize, @@ -719,10 +441,10 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } if (*m->bias) { int parallelism = m->oProjSize * num_tokens; - spec_apply_proj_bias_w<<>>( + apply_proj_bias_w<<>>( output_ptr, bias_ptr, num_tokens, m->oProjSize); } } @@ -752,38 +474,38 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( if (!(*m->has_load_weights)) { int parallelism = m->vProjSize * m->oProjSize * m->num_heads; - spec_build_w_out_tensor<<>>(weight_ptr, - m->W_out_contiguous, - m->vProjSize, - m->oProjSize, - m->num_heads, - (m->qSize * m->qProjSize + - m->kSize * m->kProjSize + - m->vSize 
* m->vProjSize)); + build_w_out_tensor<<>>(weight_ptr, + m->W_out_contiguous, + m->vProjSize, + m->oProjSize, + m->num_heads, + (m->qSize * m->qProjSize + + m->kSize * m->kProjSize + + m->vSize * m->vProjSize)); *m->has_load_weights = true; } // here because we need postion info in infernece 1 - cudaMemcpyAsync(m->tokenInfos, + cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(m->requestInfos, + cudaMemcpyAsync(m->request_infos, &(bc->requestsInfo), bc->MAX_NUM_REQUESTS * sizeof(BatchConfig::PerRequestInfo), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(m->beamTokenInfos, + cudaMemcpyAsync(m->beam_token_infos, &(bc->beamTokenInfo), bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(m->beamRequestInfos, + cudaMemcpyAsync(m->beam_request_infos, &(bc->beamRequestsInfo), bc->MAX_NUM_REQUESTS * sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), @@ -820,86 +542,37 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( Memory gpu_mem, int num_samples, int _num_heads) - : OpMeta(handler, attn) { + : IncMultiHeadSelfAttentionMeta(handler, + BEAM_SEARCH_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->apply_rotary_embedding, + attn->bias, + attn->scaling_query, + attn->qk_prod_scaling, + attn->add_bias_kv, + attn->scaling_factor, + weight_ptr, + gpu_mem, + num_samples, + _num_heads) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); - qSize = attn->qSize; - kSize = attn->kSize; - vSize = attn->vSize; - // assume dimensions match for now - assert(qSize == kSize); - assert(kSize == vSize); - qProjSize = attn->qProjSize; - kProjSize = attn->kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul - vProjSize = attn->vProjSize; - oProjSize = attn->oProjSize; - - // print params; - - num_heads = _num_heads; - weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + - oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)); - weightSize = weights_params * num_heads * sizeof(float); - has_load_weights = (bool *)calloc(1, sizeof(bool)); - *has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = attn->apply_rotary_embedding; - bias = (bool *)calloc(1, sizeof(bool)); - *bias = attn->bias; - scaling_query = (bool *)calloc(1, sizeof(bool)); - *scaling_query = attn->scaling_query; - scaling_factor = attn->scaling_factor; - qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); - *qk_prod_scaling = attn->qk_prod_scaling; - // Currently do not support adding bias to key/value projection - assert(!attn->add_bias_kv); - -#ifdef INFERENCE_TESTS - kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * - BeamSearchBatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * - BeamSearchBatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); -#endif - // allocate memory for the seqArray and reserve space { - size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; - size_t qkv_max_proj_size = - BeamSearchBatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; - size_t key_cache_size = - num_heads * kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t value_cache_size = - num_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - - // size_t token2ids_size = BatchConfig::MAX_NUM_TOKENS; - size_t tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS; - size_t beam_tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; size_t beam_requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; - - size_t qk_prod_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_TOKENS * num_heads; - size_t attn_heads_size = - BeamSearchBatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; - size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - size_t W_out_contiguous_size = W_out_block_size * num_heads; - size_t complex_size = - (BeamSearchBatchConfig::MAX_NUM_TOKENS * qProjSize * num_heads) / 2; size_t totalSize = - (qkv_max_proj_size + key_cache_size + value_cache_size + - 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * - sizeof(float) + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + beam_tokeninfo_size * sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + @@ -912,57 +585,28 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( Realm::Point<1, coord_t>(totalSize - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(reserveInst, + Realm::RegionInstance::create_instance(beam_search_reserve_inst, gpu_mem, bounds, field_sizes, 0, Realm::ProfilingRequestSet()) .wait(); - devQKVProjArray = (float *)reserveInst.pointer_untyped(0, sizeof(char)); - keyCache = (float *)devQKVProjArray + qkv_max_proj_size; - valueCache = (float *)keyCache + key_cache_size; - // dev_token2ids = (BatchConfig::token_idxs *)(valueCache + - // value_cache_size); - - tokenInfos = (BatchConfig::PerTokenInfo *)(valueCache + value_cache_size); - beamTokenInfos = - (BeamSearchBatchConfig::BeamSearchPerTokenInfo *)(tokenInfos + - tokeninfo_size); - requestInfos = - (BatchConfig::PerRequestInfo *)(beamTokenInfos + beam_tokeninfo_size); - beamRequestInfos = - (BeamSearchBatchConfig::BeamSearchPerRequestInfo *)(requestInfos + + beam_token_infos = + (BeamSearchBatchConfig::BeamSearchPerTokenInfo *) + beam_search_reserve_inst.pointer_untyped(0, sizeof(char)); + request_infos = + (BatchConfig::PerRequestInfo *)(beam_token_infos + beam_tokeninfo_size); + beam_request_infos = + (BeamSearchBatchConfig::BeamSearchPerRequestInfo *)(request_infos + requestinfo_size); - - qk_prods = (float *)(beamRequestInfos + beam_requestinfo_size); - qk_prods_softmax = (float *)(qk_prods + qk_prod_size); - attn_heads = (float *)qk_prods_softmax + qk_prod_size; - W_out_contiguous = (float *)attn_heads + attn_heads_size; - checkCUDA( - cudaMalloc(&complex_input, complex_size * sizeof(cuFloatComplex))); - int parallelism = vProjSize * oProjSize * num_heads; - spec_build_w_out_tensor<<>>( - weight_ptr, - W_out_contiguous, - vProjSize, - oProjSize, - num_heads, - (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); } cudaStreamSynchronize(stream); } SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { - reserveInst.destroy(); -#ifdef INFERENCE_TESTS - free(kcache); - free(vcache); -#endif + beam_search_reserve_inst.destroy(); } }; // namespace FlexFlow diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 28285b72ce..aa5aaf3039 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -64,7 +64,27 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( Memory gpu_mem, int num_samples, int _num_heads) - : OpMeta(handler, attn) { + : IncMultiHeadSelfAttentionMeta(handler, + TREE_VERIFY_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->apply_rotary_embedding, + attn->bias, + attn->scaling_query, + attn->qk_prod_scaling, + attn->add_bias_kv, + attn->scaling_factor, + weight_ptr, + gpu_mem, + num_samples, + _num_heads), + num_active_tokens(0) { hipStream_t stream; 
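// [Editor's note] The beam_search_reserve_inst above packs several metadata
// arrays into a single reservation and carves typed pointers out of it by
// offset. A self-contained sketch of that layout trick with toy types
// (illustrative only; the real code takes its base pointer from a Realm
// RegionInstance rather than malloc, and must mind the alignment of the
// packed types):
#include <cstddef>
#include <cstdlib>

struct TokenInfo { int token_id; };
struct RequestInfo { int first_token; };

int main() {
  std::size_t num_tokens = 64, num_requests = 8;
  std::size_t total =
      num_tokens * sizeof(TokenInfo) + num_requests * sizeof(RequestInfo);
  char *base = static_cast<char *>(std::malloc(total));
  // First region: per-token metadata at the start of the buffer
  TokenInfo *token_infos = reinterpret_cast<TokenInfo *>(base);
  // Second region: per-request metadata, starting right after the tokens
  RequestInfo *request_infos =
      reinterpret_cast<RequestInfo *>(token_infos + num_tokens);
  token_infos[0].token_id = 0;
  request_infos[0].first_token = 0;
  std::free(base);
  return 0;
}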
checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 0cc3cf9191..d3fc5c1fb2 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -15,6 +15,7 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" @@ -24,64 +25,7 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; -__global__ void tree_build_w_out_tensor(float const *weight_ptr, - float *contiguous_weight_ptr, - int vProjSize, - int oProjSize, - int num_heads, - int qkv_weight_block_size) { - CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { - int row_idx = i % vProjSize; - int col_idx = (i / vProjSize) % oProjSize; - int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[i] = - weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + - qkv_weight_block_size + col_idx * vProjSize + row_idx]; - } -} - -__global__ void tree_apply_proj_bias_w(float *input_ptr, - float const *bias_ptr, - int num_tokens, - int oProjSize) { - CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = 3 * oProjSize + i % oProjSize; - input_ptr[i] += bias_ptr[bias_idx]; - } -} - -__global__ void tree_apply_proj_bias_qkv(float *input_ptr, - float const *bias_ptr, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int num_heads, - bool scaling_query, - float scaling_factor) { - CUDA_KERNEL_LOOP( - i, num_tokens * (qProjSize + kProjSize + vProjSize) * num_heads) { - // for simplicity, assume q, k, v is in same shape - // 0->q, 1->k, 2->v - int qkv_index = i / (num_tokens * qProjSize) % 3; - - int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); - int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int q_block_size = qProjSize * num_tokens; - - int idx = i % (num_tokens * (qProjSize)); - - int real_part_index = - head_idx * qkv_block_size + qkv_index * q_block_size + idx; - int bias_idx = qkv_index * qProjSize * num_heads + head_idx * qProjSize + - (idx % qProjSize); - input_ptr[real_part_index] += bias_ptr[bias_idx]; - - if (scaling_query && qkv_index == 0) { - input_ptr[real_part_index] *= scaling_factor; - } - } -} +using namespace Kernels::IncMultiHeadAttention; __global__ void commit_tokens_kernel( float const *devQKVProjArray, @@ -165,221 +109,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, } } -__global__ void tree_apply_rotary_embedding( - float *input_ptr, - cuFloatComplex *complex_input, - TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int q_block_size, - int k_block_size, - int v_block_size, - bool q_tensor) { - int proj_size = q_tensor ? qProjSize : kProjSize; - CUDA_KERNEL_LOOP(i, num_tokens * proj_size * num_heads / 2) { - // create complex number - int head_idx = i / (num_tokens * proj_size / 2); - int idx = i % (num_tokens * proj_size / 2); - int token_idx = - (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - - int real_part_index = - idx + token_idx * (proj_size / 2) + - head_idx * (q_block_size + k_block_size + v_block_size) + - (q_tensor ? 
0 : q_block_size); - int complex_part_index = real_part_index + (proj_size / 2); - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 - // apply a Cartesian coordinate transformation - // multiple with input & /copy back to q/k - - // get position of token - // int head_idx = i / (num_tokens * proj_size); - - // size_t pos = id_map[token_idx].token_position; - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - - int pos_i = i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -void compute_qkv_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; - assert(m->qSize == m->vSize && m->qSize == m->kSize); - cudaDataType_t data_type = ff_to_cuda_datatype(DT_FLOAT); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = CUDA_R_32F; -#endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_heads - int m_q = m->qProjSize; - int m_k = m->kProjSize; - int m_v = m->vProjSize; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int lda = k, ldb = k, ldc_q = m_q, ldc_k = m_k, ldc_v = m_v; - size_t strideA = - m->weights_params; // need to also skip over all the parameters for each - // head, plus the unused W_o weights - size_t strideB = 0; // input stays the same for all heads. - size_t strideC = - (m_q + m_k + m_v) * n; // size of the output block for each head. 
- // Q - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_q, - n, - k, - &alpha, - weight_ptr, - data_type, - lda, - strideA, - input_ptr, - data_type, - ldb, - strideB, - &beta, - output_ptr, - data_type, - ldc_q, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_k, - n, - k, - &alpha, - weight_ptr + m_q * k, - data_type, - lda, - strideA, - input_ptr, - data_type, - ldb, - strideB, - &beta, - output_ptr + m_q * n, - data_type, - ldc_k, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // V - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_v, - n, - k, - &alpha, - weight_ptr + (m_q + m_k) * k, - data_type, - lda, - strideA, - input_ptr, - data_type, - ldb, - strideB, - &beta, - output_ptr + (m_q + m_k) * n, - data_type, - ldc_v, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // apply rotary emmmbedding for k and v - // step1 change the k, v to complex tensor - int num_tokens = bc->num_active_tokens(); - int parallelism = m->kProjSize * num_tokens * m->num_heads; - int q_block_size = m->qProjSize * num_tokens; - int k_block_size = m->kProjSize * num_tokens; - int v_block_size = m->vProjSize * num_tokens; - // apply bias for q, k, v - if (*m->bias) { - tree_apply_proj_bias_qkv<<>>(output_ptr, - bias_ptr, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - m->num_heads, - *m->scaling_query, - m->scaling_factor); - } - if (*m->apply_rotary_embedding) { - /*q*/ - tree_apply_rotary_embedding<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - true); - /*k*/ - tree_apply_rotary_embedding<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - false); - } -} - __global__ void update_tree_branch_kv_cache( float const *devQKVProjArray, float *cache_ptr, @@ -700,10 +429,10 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, } if (*m->bias) { int parallelism = m->oProjSize * processed_tokens_in_batch; - tree_apply_proj_bias_w<<>>( + apply_proj_bias_w<<>>( output_ptr, bias_ptr, processed_tokens_in_batch, m->oProjSize); } @@ -747,17 +476,17 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( // reload the weight_o if (!(*m->has_load_weights)) { int parallelism = m->vProjSize * m->oProjSize * m->num_heads; - tree_build_w_out_tensor<<>>(weight_ptr, - m->W_out_contiguous, - m->vProjSize, - m->oProjSize, - m->num_heads, - (m->qSize * m->qProjSize + - m->kSize * m->kProjSize + - m->vSize * m->vProjSize)); + build_w_out_tensor<<>>(weight_ptr, + m->W_out_contiguous, + m->vProjSize, + m->oProjSize, + m->num_heads, + (m->qSize * m->qProjSize + + m->kSize * m->kProjSize + + m->vSize * m->vProjSize)); *m->has_load_weights = true; } // here because we need postion info in infernece 1 @@ -800,127 +529,58 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( Memory gpu_mem, int num_samples, int _num_heads) - : OpMeta(handler, attn), num_active_tokens(0) { + : IncMultiHeadSelfAttentionMeta(handler, + TREE_VERIFY_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->apply_rotary_embedding, + 
attn->bias, + attn->scaling_query, + attn->qk_prod_scaling, + attn->add_bias_kv, + attn->scaling_factor, + weight_ptr, + gpu_mem, + num_samples, + _num_heads), + num_active_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); - qSize = attn->qSize; - kSize = attn->kSize; - vSize = attn->vSize; - // assume dimensions match for now - assert(qSize == kSize); - assert(kSize == vSize); - qProjSize = attn->qProjSize; - kProjSize = attn->kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul - vProjSize = attn->vProjSize; - oProjSize = attn->oProjSize; - - num_heads = _num_heads; - weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + - oProjSize * (vProjSize > 0 ? vProjSize : vSize)); - weightSize = weights_params * num_heads * sizeof(float); - has_load_weights = (bool *)calloc(1, sizeof(bool)); - *has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = attn->apply_rotary_embedding; - bias = (bool *)calloc(1, sizeof(bool)); - *bias = attn->bias; - scaling_query = (bool *)calloc(1, sizeof(bool)); - *scaling_query = attn->scaling_query; - scaling_factor = attn->scaling_factor; - qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); - *qk_prod_scaling = attn->qk_prod_scaling; - // Currently do not support adding bias to key/value projection - assert(!attn->add_bias_kv); - -#ifdef INFERENCE_TESTS - kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); -#endif - // allocate memory for the seqArray and reserve space { - size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; - size_t qkv_max_proj_size = - TreeVerifyBatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; size_t committed_tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; - size_t key_cache_size = num_heads * kProjSize * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; - size_t value_cache_size = num_heads * vProjSize * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; - size_t tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; - size_t qk_prod_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS * - TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads; - size_t attn_heads_size = - TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; - size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - size_t W_out_contiguous_size = W_out_block_size * num_heads; - size_t complex_size = - (TreeVerifyBatchConfig::MAX_NUM_TOKENS * qProjSize * num_heads) / 2; - size_t totalSize = - (qkv_max_proj_size + key_cache_size + value_cache_size + - 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * - sizeof(float) + - tokeninfo_size * sizeof(TreeVerifyBatchConfig::PerTokenInfo) + - committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo) + - complex_size * sizeof(cuFloatComplex); + size_t totalSize = committed_tokeninfo_size * + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(totalSize - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(reserveInst, + Realm::RegionInstance::create_instance(committed_token_reserve_inst, gpu_mem, bounds, field_sizes, 0, Realm::ProfilingRequestSet()) .wait(); - devQKVProjArray = (float *)reserveInst.pointer_untyped(0, sizeof(char)); committed_token_infos = - (TreeVerifyBatchConfig::CommittedTokensInfo *)(devQKVProjArray + - qkv_max_proj_size); - keyCache = (float *)(committed_token_infos + committed_tokeninfo_size); - valueCache = (float *)keyCache + key_cache_size; - token_infos = - (TreeVerifyBatchConfig::PerTokenInfo *)(valueCache + value_cache_size); - qk_prods = (float *)(token_infos + tokeninfo_size); - qk_prods_softmax = (float *)(qk_prods + qk_prod_size); - attn_heads = (float *)qk_prods_softmax + qk_prod_size; - W_out_contiguous = (float *)attn_heads + attn_heads_size; - checkCUDA( - cudaMalloc(&complex_input, complex_size * sizeof(cuFloatComplex))); - int parallelism = vProjSize * oProjSize * num_heads; - tree_build_w_out_tensor<<>>( - weight_ptr, - W_out_contiguous, - vProjSize, - oProjSize, - num_heads, - (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); + (TreeVerifyBatchConfig::CommittedTokensInfo *) + committed_token_reserve_inst.pointer_untyped(0, sizeof(char)); } cudaStreamSynchronize(stream); } TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) { - reserveInst.destroy(); -#ifdef INFERENCE_TESTS - free(kcache); - free(vcache); -#endif + committed_token_reserve_inst.destroy(); } }; // namespace FlexFlow diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index a220c94085..4d81616dc3 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -35,7 +35,7 @@ BatchConfig::BatchConfig() : num_tokens(0) { } } -BatchConfig::Mode BatchConfig::get_mode() const { +InferenceMode BatchConfig::get_mode() const { return INC_DECODING_MODE; } diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 3adfb28241..8d4aeeabb0 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -41,7 +41,7 @@ BeamSearchBatchConfig::BeamSearchBatchConfig(size_t beam_width, BeamSearchBatchConfig::~BeamSearchBatchConfig() {} -BatchConfig::Mode BeamSearchBatchConfig::get_mode() const { +InferenceMode BeamSearchBatchConfig::get_mode() const { return BEAM_SEARCH_MODE; } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index ac038147d1..5349ec5439 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -71,11 +71,12 @@ RequestManager::RequestGuid request.initial_len = request.tokens.size(); pending_request_queue.push(request); - if (verbose) { - std::cout << "new req: " << 
request.tokens.size() << std::endl; + { + std::string output = "New request tokens:"; for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; + output = output + " " + std::to_string(request.tokens[i]); } + log_req_mgr.print("%s", output.c_str()); } return request.guid; } diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index e50a0e06fd..76a8025507 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -26,7 +26,7 @@ TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {} TreeVerifyBatchConfig::~TreeVerifyBatchConfig() {} -BatchConfig::Mode TreeVerifyBatchConfig::get_mode() const { +InferenceMode TreeVerifyBatchConfig::get_mode() const { return TREE_VERIFY_MODE; } From b0a5b9c0a4101ed6da7622269e0e4b8f5fceaf8b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 28 May 2023 00:09:32 +0800 Subject: [PATCH 141/344] [Inference] - Alignment fixes (#740) * fix alignment bugs (part 1) * add missing file --- .../configs/{llama_190M.json => llama_160M.json} | 2 +- inference/utils/download_llama_weights.py | 6 +++--- tests/inference_tests.sh | 12 +++++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) rename inference/models/configs/{llama_190M.json => llama_160M.json} (88%) diff --git a/inference/models/configs/llama_190M.json b/inference/models/configs/llama_160M.json similarity index 88% rename from inference/models/configs/llama_190M.json rename to inference/models/configs/llama_160M.json index b0bd90cb82..d912c64ab7 100644 --- a/inference/models/configs/llama_190M.json +++ b/inference/models/configs/llama_160M.json @@ -1,6 +1,6 @@ { "n_layers": 12, - "vocab_size": 50265, + "vocab_size": 32000, "n_heads": 12, "dim": 768, "multiple_of": 256, diff --git a/inference/utils/download_llama_weights.py b/inference/utils/download_llama_weights.py index fa40125f00..bbf4f349ee 100644 --- a/inference/utils/download_llama_weights.py +++ b/inference/utils/download_llama_weights.py @@ -37,13 +37,13 @@ def convert_hf_model(model, dst_folder): convert_hf_model(model, dst_folder) # Download and convert small model weights -model = AutoModelForCausalLM.from_pretrained("Bingsu/llama-190m-arch") -dst_folder="../weights/llama_190M_weights" +model = AutoModelForCausalLM.from_pretrained("JackFram/llama-160m") +dst_folder="../weights/llama_160M_weights" convert_hf_model(model, dst_folder) # Download tokenizer os.makedirs("../tokenizer", exist_ok=True) tokenizer_filepath = '../tokenizer/tokenizer.model' -url = 'https://specinfer.s3.us-east-2.amazonaws.com/tokenizer/tokenizer.model' +url = 'https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model' r = requests.get(url) open(tokenizer_filepath , 'wb').write(r.content) diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 5e7b746e49..fa44446a3d 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -36,7 +36,7 @@ echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json ############################################################################################### # LLAMA -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_190M_weights/ -ssm-config ../inference/models/configs/llama_190M.json -tokenizer 
../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json # OPT ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json @@ -45,10 +45,16 @@ echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json ############################ Incremental decoding tests ####################################### ############################################################################################### -# LLAMA +# LLAMA (small model) +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json + +# LLAMA (big model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -# OPT +# OPT (small model) +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json + +# OPT (big model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json # Clean up after test From 1ab3d809e8886e1b1c0f8169e15b44ac50e79b49 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 28 May 2023 00:26:00 +0800 Subject: [PATCH 142/344] Update README.md (#741) --- .github/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/README.md b/.github/README.md index 940bff486d..3f22993b00 100644 --- a/.github/README.md +++ b/.github/README.md @@ -60,7 +60,7 @@ For example, you can use the following command line to serve a LLaMA-7B or LLaMA ### Tokenizers SpecInfer supports two tokenizers: -* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentencepiece tokenizer from LLAMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). We store the tokenizer on our S3 bucket at this link: [s3://specinfer/tokenizer/tokenizer.model](https://specinfer.s3.us-east-2.amazonaws.com/tokenizer/tokenizer.model). 
+* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentencepiece tokenizer from LLAMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). If you are using our LLAMA-160M weights for the demo, however, you should use the tokenizer from the [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model) HuggingFace repo. * The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. ### LLM Weights @@ -69,7 +69,7 @@ The weight files used in our demo are extracted from HuggingFace, and stored in | Model | Model id on Hugging Face | Storage Location | | :---- | :---- | :---- | | LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | -| LLaMA-190M | Bingsu/llama-190m-arch | s3://specinfer/weights/llama_190m_weights.tar.gz | +| LLaMA-190M | JackFram/llama-160m | s3://specinfer/weights/llama_160M_weights.tar.gz | | OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | | OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125m_native.tar.gz | From 9f5bf94d1de6e14794eac794588e048cdd1f6f6d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 30 May 2023 16:22:38 -0500 Subject: [PATCH 143/344] Supporting mixed-precision (Spec/Tree/Normal) Incremental MultiHead Attention (#737) * making TreeIncMultiHeadSelfAttentionMeta a subclass of IncMultiHeadSelfAttentionMeta * make BeamSearchIncMultiHeadAttentionMeta a subclass of IncMultiHeadAttentionMeta --------- Co-authored-by: xinhaoc --- include/flexflow/ffconst_utils.h | 4 +- include/flexflow/model.h | 19 +- include/flexflow/ops/arg_topk.h | 14 +- include/flexflow/ops/beam_topk.h | 9 +- include/flexflow/ops/element_unary.h | 1 + .../ops/inc_multihead_self_attention.h | 18 +- .../ops/kernels/element_binary_kernels.h | 18 +- .../inc_multihead_self_attention_kernels.h | 27 +- include/flexflow/ops/kernels/linear_kernels.h | 9 +- .../flexflow/ops/kernels/rms_norm_kernels.h | 6 +- .../flexflow/ops/kernels/softmax_kernels.h | 24 +- include/flexflow/ops/layer_norm.h | 12 +- include/flexflow/ops/linear.h | 6 +- include/flexflow/ops/softmax.h | 4 +- .../ops/spec_inc_multihead_self_attention.h | 10 +- .../ops/tree_inc_multihead_self_attention.h | 11 +- include/flexflow/simulator.h | 4 +- include/flexflow/utils/cuda_helper.h | 8 +- include/flexflow/utils/hip_helper.h | 3 +- inference/file_loader.cc | 129 +++--- inference/file_loader.h | 5 + inference/models/llama.cc | 34 +- inference/models/llama.h | 3 +- inference/models/opt.cc | 59 ++- inference/models/opt.h | 3 +- inference/spec_infer/spec_infer.cc | 22 +- python/flexflow_c.cc | 10 +- src/ops/arg_topk.cc | 42 +- src/ops/arg_topk.cpp | 81 +++- src/ops/arg_topk.cu | 81 +++- src/ops/attention.cc | 3 + src/ops/beam_topk.cc | 22 +- src/ops/beam_topk.cpp | 222 +++++------ src/ops/beam_topk.cu | 79 ++-- src/ops/element_binary.cc | 99 +++-- src/ops/element_unary.cc | 12 +- src/ops/element_unary.cpp | 18 +- src/ops/element_unary.cu | 19 +- src/ops/fused.cpp | 9 +- src/ops/fused.cu | 9 +- src/ops/inc_multihead_self_attention.cc | 119 +++--- 
src/ops/inc_multihead_self_attention.cpp | 12 +- src/ops/inc_multihead_self_attention.cu | 371 ++++++++++-------- src/ops/kernels/element_binary_kernels.cpp | 19 +- src/ops/kernels/element_binary_kernels.cu | 46 ++- src/ops/kernels/linear_kernels.cpp | 89 +++-- src/ops/kernels/linear_kernels.cu | 127 ++++-- src/ops/kernels/rms_norm_kernels.cu | 100 +++-- src/ops/kernels/softmax.cpp | 39 +- src/ops/kernels/softmax.cu | 43 +- src/ops/layer_norm.cc | 80 ++-- src/ops/layer_norm.cpp | 97 ++--- src/ops/layer_norm.cu | 168 +++++--- src/ops/linear.cc | 193 +++++---- src/ops/rms_norm.cc | 45 ++- src/ops/softmax.cc | 84 ++-- src/ops/spec_inc_multihead_self_attention.cc | 115 +++--- src/ops/spec_inc_multihead_self_attention.cpp | 12 +- src/ops/spec_inc_multihead_self_attention.cu | 218 ++++++---- src/ops/tree_inc_multihead_self_attention.cc | 114 +++--- src/ops/tree_inc_multihead_self_attention.cpp | 12 +- src/ops/tree_inc_multihead_self_attention.cu | 216 ++++++---- src/runtime/accessor.cc | 2 + src/runtime/cuda_helper.cu | 36 +- src/runtime/ffconst_utils.cc | 20 + src/runtime/hip_helper.cpp | 19 +- src/runtime/parallel_tensor.cc | 7 + src/runtime/simulator.cc | 20 +- src/runtime/simulator.cpp | 4 +- src/runtime/simulator.cu | 5 +- tests/align/align_create_tensor_torch.py | 1 - tests/align/align_utils.py | 2 +- 72 files changed, 2216 insertions(+), 1387 deletions(-) diff --git a/include/flexflow/ffconst_utils.h b/include/flexflow/ffconst_utils.h index fcd881e57e..e2d04ad5a5 100644 --- a/include/flexflow/ffconst_utils.h +++ b/include/flexflow/ffconst_utils.h @@ -8,8 +8,10 @@ namespace FlexFlow { std::string get_operator_type_name(OperatorType type); +size_t data_type_size(DataType type); + std::ostream &operator<<(std::ostream &, OperatorType); }; // namespace FlexFlow -#endif // _FLEXFLOW_FFCONST_UTILS_H \ No newline at end of file +#endif // _FLEXFLOW_FFCONST_UTILS_H diff --git a/include/flexflow/model.h b/include/flexflow/model.h index f75267bb00..2345722005 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -505,6 +505,7 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor @@ -516,8 +517,11 @@ class FFModel { int b_seq_length_dim = -1, char const *name = nullptr); // Add a root mean square layer - Tensor - rms_norm(const Tensor input, float eps, int dim, char const *name = NULL); + Tensor rms_norm(const Tensor input, + float eps, + int dim, + DataType data_type = DT_NONE, + char const *name = NULL); // Add a beam search top k layer Tensor beam_top_k(const Tensor input, int max_beam_size, @@ -529,7 +533,7 @@ class FFModel { int outDim, ActiMode activation = AC_MODE_NONE, bool use_bias = true, - DataType data_type = DT_FLOAT, + DataType data_type = DT_NONE, Layer const *shared_op = NULL, Initializer *kernel_initializer = NULL, Initializer *bias_initializer = NULL, @@ -572,7 +576,10 @@ class FFModel { // Add a flat layer Tensor flat(const Tensor input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, int dim = -1, char const *name = NULL); + Tensor softmax(const Tensor input, + int dim = -1, + DataType data_type = DT_NONE, + char const *name = NULL); // Create input tensors and constants Tensor transpose(const Tensor input, std::vector const &perm, @@ -606,6 +613,7 @@ class FFModel { bool bias = true, bool add_bias_kv = false, bool add_zero_attn = false, + DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, 
char const *name = NULL); Tensor inc_multihead_self_attention(const Tensor input, @@ -617,6 +625,7 @@ class FFModel { bool bias = false, bool add_bias_kv = false, bool add_zero_attn = false, + DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, bool scaling_query = false, @@ -633,6 +642,7 @@ class FFModel { bool bias = false, bool add_bias_kv = false, bool add_zero_attn = false, + DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, bool scaling_query = false, @@ -649,6 +659,7 @@ class FFModel { bool bias = false, bool add_bias_kv = false, bool add_zero_attn = false, + DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, bool apply_rotary_embedding = false, bool scaling_query = false, diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h index 4195efd142..a00ab76385 100644 --- a/include/flexflow/ops/arg_topk.h +++ b/include/flexflow/ops/arg_topk.h @@ -10,7 +10,7 @@ namespace FlexFlow { class ArgTopKMeta : public OpMeta { public: - ArgTopKMeta(FFHandler handle); + ArgTopKMeta(FFHandler handle, Op const *op); bool sorted; }; @@ -68,8 +68,9 @@ class ArgTopK : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; + template static void forward_kernel(ArgTopKMeta const *m, - float const *input_ptr, + DT const *input_ptr, // float *output_ptr, int *indices_ptr, size_t batch_size, @@ -78,13 +79,8 @@ class ArgTopK : public Op { bool sorted, ffStream_t stream); static void forward_kernel_wrapper(ArgTopKMeta const *m, - float const *input_ptr, - // float *output_ptr, - int *indices_ptr, - size_t batch_size, - int length, - int k, - bool sorted); + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &indices); Params get_params() const; public: diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h index 9fc0be22f4..76404bfb6d 100644 --- a/include/flexflow/ops/beam_topk.h +++ b/include/flexflow/ops/beam_topk.h @@ -10,11 +10,11 @@ namespace FlexFlow { class BeamTopKMeta : public OpMeta { public: - BeamTopKMeta(FFHandler handle); + BeamTopKMeta(FFHandler handle, Op const *op); bool sorted; int max_beam_width; int *parent_ids; - float *acc_probs; + void *acc_probs; int *block_start_index; int *request_id; int *tokens_per_request; @@ -75,9 +75,10 @@ class BeamTopK : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; + template static void forward_kernel(BeamTopKMeta const *m, BeamSearchBatchConfig const *bc, - float const *input_ptr, + DT const *input_ptr, float *output_ptr, int *indices_ptr, int *parent_ptr, @@ -87,7 +88,7 @@ class BeamTopK : public Op { ffStream_t stream); static void forward_kernel_wrapper(BeamTopKMeta const *m, BeamSearchBatchConfig const *bc, - float const *input_ptr, + GenericTensorAccessorR const &input, float *output_ptr, int *indices_ptr, int *parent_ptr, diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index a9ca5ddfc9..db0c4b02ca 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -3,6 +3,7 @@ #include "flexflow/device.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/layer.h" #include "flexflow/node.h" #include "flexflow/op_meta.h" diff --git a/include/flexflow/ops/inc_multihead_self_attention.h 
b/include/flexflow/ops/inc_multihead_self_attention.h index a118e62f0b..baf126f41e 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -100,10 +100,10 @@ class IncMultiHeadSelfAttention : public Op { static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias); Params get_params() const; public: @@ -120,7 +120,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { public: IncMultiHeadSelfAttentionMeta(FFHandler handler, IncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Legion::Memory gpu_mem, int num_samples, int _num_heads); @@ -140,7 +140,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { bool _qk_prod_scaling, bool _add_bias_kv, float _scaling_factor, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Legion::Memory gpu_mem, int num_samples, int _num_heads); @@ -160,9 +160,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif - float *devQKVProjArray, *keyCache, *valueCache; - float *qk_prods, *qk_prods_softmax; - float *attn_heads, *W_out_contiguous; + void *devQKVProjArray, *keyCache, *valueCache; + void *qk_prods, *qk_prods_softmax; + void *attn_heads, *W_out_contiguous; BatchConfig::PerTokenInfo *token_infos; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cuFloatComplex *complex_input; diff --git a/include/flexflow/ops/kernels/element_binary_kernels.h b/include/flexflow/ops/kernels/element_binary_kernels.h index 529859195e..b0c596301b 100644 --- a/include/flexflow/ops/kernels/element_binary_kernels.h +++ b/include/flexflow/ops/kernels/element_binary_kernels.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H +#include "flexflow/accessor.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -9,7 +10,7 @@ namespace FlexFlow { class ElementBinaryMeta : public OpMeta { public: - ElementBinaryMeta(FFHandler handle); + ElementBinaryMeta(FFHandler handle, Op const *op); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t input1Tensor, input2Tensor, outputTensor; cudnnOpTensorDescriptor_t opDesc; @@ -34,9 +35,9 @@ void init_kernel(ElementBinaryMeta *m, Legion::Domain const &output_domain); void forward_kernel_wrapper(ElementBinaryMeta const *m, - float const *in1_ptr, - float const *in2_ptr, - float *out_ptr); + GenericTensorAccessorR const &in1, + GenericTensorAccessorR const &in2, + GenericTensorAccessorW const &out); void backward_kernel_wrapper(ElementBinaryMeta const *m, float const *out_grad_ptr, @@ -47,10 +48,11 @@ void backward_kernel_wrapper(ElementBinaryMeta const *m, namespace Internal { +template void forward_kernel(ElementBinaryMeta const *m, - float const *in1_ptr, - float const *in2_ptr, - float *out_ptr, + DT const *in1_ptr, + DT const *in2_ptr, + DT *out_ptr, ffStream_t stream); void backward_kernel(ElementBinaryMeta const *m, float const *out_grad_ptr, @@ -65,4 +67,4 @@ void backward_kernel(ElementBinaryMeta const *m, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H \ No 
newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 3cbaebe618..0e0b1b4da9 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -11,20 +11,23 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { -__global__ void build_w_out_tensor(float const *weight_ptr, - float *contiguous_weight_ptr, +template +__global__ void build_w_out_tensor(DT const *weight_ptr, + DT *contiguous_weight_ptr, int vProjSize, int oProjSize, int num_heads, int qkv_weight_block_size); -__global__ void apply_proj_bias_w(float *input_ptr, - float const *bias_ptr, +template +__global__ void apply_proj_bias_w(DT *input_ptr, + DT const *bias_ptr, int num_tokens, int oProjSize); -__global__ void apply_proj_bias_qkv(float *input_ptr, - float const *bias_ptr, +template +__global__ void apply_proj_bias_qkv(DT *input_ptr, + DT const *bias_ptr, int num_tokens, int qProjSize, int kProjSize, @@ -33,8 +36,9 @@ __global__ void apply_proj_bias_qkv(float *input_ptr, bool scaling_query, float scaling_factor); +template __global__ void - apply_rotary_embedding(float *input_ptr, + apply_rotary_embedding(DT *input_ptr, cuFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, @@ -46,12 +50,13 @@ __global__ void int v_block_size, bool q_tensor); +template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, cudaStream_t stream); } // namespace IncMultiHeadAttention } // namespace Kernels diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 6ca9fb89ac..9ed99c93a1 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -4,12 +4,13 @@ #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" +#include "flexflow/ops/linear.h" namespace FlexFlow { class LinearMeta : public OpMeta { public: - LinearMeta(FFHandler handle, int batch_size); + LinearMeta(FFHandler handle, int batch_size, Linear const *li); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t outputTensor; cudnnActivationDescriptor_t actiDesc; @@ -17,7 +18,7 @@ class LinearMeta : public OpMeta { miopenTensorDescriptor_t outputTensor; miopenActivationDescriptor_t actiDesc; #endif - float const *one_ptr; + void *one_ptr; ActiMode activation; RegularizerMode kernel_reg_type; float kernel_reg_lambda; @@ -51,6 +52,7 @@ void backward_kernel_wrapper(LinearMeta const *m, bool use_activation(ActiMode mode); namespace Internal { +template void forward_kernel(LinearMeta const *m, void const *input_ptr, void *output_ptr, @@ -60,6 +62,7 @@ void forward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream); +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -72,6 +75,8 @@ void backward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream); +template +__global__ void build_one_ptr(DT *one_ptr, int batch_size); } // namespace Internal } // namespace Linear } // namespace Kernels diff 
--git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 5673880dcb..f38e55ae39 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -24,8 +24,8 @@ class RMSNormMeta : public OpMeta { public: float eps; - float *rms_ptr; - float *norm_ptr; + void *rms_ptr; + void *norm_ptr; float alpha; float beta; @@ -46,4 +46,4 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RMSNORM_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RMSNORM_KERNELS_H diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 81b34d8558..14c07414e9 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -21,27 +21,31 @@ class SoftmaxMeta : public OpMeta { bool profiling; int dim; char op_name[MAX_OPNAME]; + DataType input_type, output_type; }; namespace Kernels { namespace Softmax { - +template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); - + DT const *input_ptr, + DT *output_ptr); +template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, + DT *input_grad_ptr, + DT const *output_grad_ptr, size_t num_elements); namespace Internal { +template void forward_kernel(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr, + DT const *input_ptr, + DT *output_ptr, ffStream_t stream); -void backward_kernel(float *input_grad_ptr, - float const *output_grad_ptr, + +template +void backward_kernel(DT *input_grad_ptr, + DT const *output_grad_ptr, size_t num_elements, ffStream_t stream); } // namespace Internal diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 60987471b2..b962edf326 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -2,7 +2,6 @@ #include "flexflow/inference.h" #include "flexflow/model.h" - namespace FlexFlow { class LayerNormMeta; @@ -76,12 +75,11 @@ class LayerNorm : public Op { T *gamma_ptr, T *beta_ptr, ffStream_t stream); - template static void forward_kernel_wrapper(LayerNormMeta const *m, - T const *input_ptr, - T *output_ptr, - T *gamma_ptr, - T *beta_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -115,7 +113,7 @@ class LayerNormMeta : public OpMeta { bool elementwise_affine; int64_t effective_batch_size, effective_num_elements; float eps; - float *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index f991bd7a9b..bbc40ef320 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -96,19 +96,19 @@ class Linear : public Op { bool allocate_weights, char const *name); - template + template static OpMeta * init_task_with_dim(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - template + template static void forward_task_with_dim(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - template + 
template static void backward_task_with_dim(Legion::Task const *task, std::vector const ®ions, diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 04f1283f89..3f29de905b 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -64,13 +64,13 @@ class Softmax : public Op { Params get_params() const; private: - template + template static void forward_task_with_dim(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - template + template static void backward_task_with_dim(Legion::Task const *task, std::vector const ®ions, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 57afb73a03..92d52b43b1 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -105,10 +105,10 @@ class SpecIncMultiHeadSelfAttention : public Op { static void inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias); Params get_params() const; public: @@ -125,7 +125,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: SpecIncMultiHeadSelfAttentionMeta(FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Legion::Memory gpu_mem, int num_samples, int _num_heads); diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 58775bf40d..eef51bc21f 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -100,10 +100,11 @@ class TreeIncMultiHeadSelfAttention : public Op { static void inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias); + Params get_params() const; public: @@ -120,7 +121,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: TreeIncMultiHeadSelfAttentionMeta(FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Legion::Memory gpu_mem, int num_samples, int _num_heads); diff --git a/include/flexflow/simulator.h b/include/flexflow/simulator.h index 9ee1b1eb09..6f0f327110 100644 --- a/include/flexflow/simulator.h +++ b/include/flexflow/simulator.h @@ -684,8 +684,6 @@ class TaskManager { std::map hash_to_forward_task, hash_to_backward_task; }; -size_t data_type_size(DataType); - using ProfilingRecordKey = std::tuple; class Simulator { @@ -756,7 +754,7 @@ class Simulator { LinearMeta *linear_meta; Pool2DMeta *pool2d_meta; ElementUnaryMeta *ele_unary_meta; - ElementBinaryMeta *ele_binary_meta; + // ElementBinaryMeta *ele_binary_meta; // EmbeddingMeta *embedding_meta; // SoftmaxMeta *softmax_meta; BatchMatmulMeta *batch_matmul_meta; diff --git a/include/flexflow/utils/cuda_helper.h 
b/include/flexflow/utils/cuda_helper.h index b82426ac59..5ac4571118 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -155,15 +155,17 @@ template bool download_tensor(T const *ptr, T *dst, size_t num_elements); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, - Legion::Domain domain); + Legion::Domain domain, + DataType data_type = DT_FLOAT); cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, - Legion::Domain domain); + Legion::Domain domain, + DataType data_type = DT_FLOAT); cudaDataType_t ff_to_cuda_datatype(DataType type); cudnnDataType_t ff_to_cudnn_datatype(DataType type); cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); -#endif \ No newline at end of file +#endif diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index f78102c0fe..c2c4d94cc3 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -141,7 +141,8 @@ bool download_tensor(T const *ptr, T *dst, size_t num_elements); miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, - Legion::Domain domain); + Legion::Domain domain, + DataType data_type = DT_FLOAT); hipblasDatatype_t ff_to_cuda_datatype(DataType type); diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 250a030c4a..f0eff0e50b 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -14,6 +14,7 @@ */ #include "file_loader.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include @@ -60,7 +61,8 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { return prompts; }; -void load_attention_bias(float *ptr, +template +void load_attention_bias(DT *ptr, int num_heads, size_t hidden_dim, size_t qkv_inner_dim, @@ -87,8 +89,8 @@ void load_attention_bias(float *ptr, // std::cout << "Loading filename: " << file << std::endl; std::ifstream in(file, std::ios::in | std::ios::binary); assert(in.good() && "incorrect bias file path"); - std::vector host_array(partial_size); - size_t loaded_data_size = sizeof(float) * partial_size; + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); @@ -113,7 +115,8 @@ void load_attention_bias(float *ptr, } } -void load_attention_weights(float *ptr, +template +void load_attention_weights(DT *ptr, int num_heads, size_t hidden_dim, size_t qkv_inner_dim, @@ -154,8 +157,8 @@ void load_attention_weights(float *ptr, std::cout << "Could not open file: " << file << std::endl; } assert(in.good() && "incorrect weight file path"); - std::vector host_array(partial_size); - size_t loaded_data_size = sizeof(float) * partial_size; + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); @@ -183,15 +186,16 @@ void load_attention_weights(float *ptr, } } -void load_from_file(float *ptr, size_t size, std::string filename) { +template +void load_from_file(DT *ptr, size_t size, std::string filename) { // std::cout << "Loading filename: " << filename << std::endl; std::ifstream in(filename, std::ios::in | std::ios::binary); if (!in.good()) { std::cout << "Could not open file: " << filename << std::endl; } assert(in.good() && "incorrect weight file path"); - std::vector host_array(size); - size_t loaded_data_size = sizeof(float) * size; + std::vector
host_array(size); + size_t loaded_data_size = sizeof(DT) * size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); @@ -199,7 +203,7 @@ void load_from_file(float *ptr, size_t size, std::string filename) { size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { std::cout << "load weight data error " << in_get_size << ", " - << loaded_data_size << ", " << sizeof(float) << std::endl; + << loaded_data_size << ", " << sizeof(DT) << std::endl; return; } assert(size == host_array.size()); @@ -237,60 +241,77 @@ void FileDataLoader::load_positions(FFModel *ff, position_pt->set_tensor(ff, dims_vec, data); } +template +void FileDataLoader::load_single_weight_tensor(FFModel *ff, + Tensor weight, + int weight_idx, + std::string const &layername) { + size_t volume = 1; + std::vector dims_vec; + for (int i = 0; i < weight->num_dims; i++) { + dims_vec.push_back(weight->dims[i]); + volume *= weight->dims[i]; + } + + assert(data_type_size(weight->data_type) == sizeof(DT)); + DT *data = (DT *)malloc(sizeof(DT) * volume); + + std::string file_path = + (layername.back() == '/') ? layername : "/" + layername; + + if (file_path.find("attention_w") != std::string::npos) { + if (weight_idx == 0) { + load_attention_weights(data, + num_heads, + hidden_dim, + qkv_inner_dim, + file_path, + weight_file_path, + volume); + } else { + load_attention_bias(data, + num_heads, + hidden_dim, + qkv_inner_dim, + file_path, + weight_file_path); + } + + } else { + if (weight_idx > 0) { + int index = file_path.find("_weight"); + assert(index != std::string::npos); + file_path = file_path.substr(0, index) + "_bias"; + } + load_from_file(data, volume, weight_file_path + file_path); + } + + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + weight_pt->set_tensor
(ff, dims_vec, data); + + delete data; +} + void FileDataLoader::load_weights( FFModel *ff, std::unordered_map weights_layers) { - for (auto &v : weights_layers) { - int weights_num = v.second->numWeights; for (int i = 0; i < weights_num; i++) { Tensor weight = v.second->weights[i]; if (weight == NULL) { continue; } - - size_t volume = 1; - std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; - } - - assert(weight->data_type == DT_FLOAT); - float *data = (float *)malloc(sizeof(float) * volume); - - std::string file_path = (v.first.back() == '/') ? v.first : "/" + v.first; - - if (file_path.find("attention_w") != std::string::npos) { - if (i == 0) { - load_attention_weights(data, - num_heads, - hidden_dim, - qkv_inner_dim, - file_path, - weight_file_path, - volume); - } else { - load_attention_bias(data, - num_heads, - hidden_dim, - qkv_inner_dim, - file_path, - weight_file_path); - } - - } else { - if (i > 0) { - int index = file_path.find("_weight"); - assert(index != std::string::npos); - file_path = file_path.substr(0, index) + "_bias"; - } - load_from_file(data, volume, weight_file_path + file_path); + switch (weight->data_type) { + case DT_HALF: + load_single_weight_tensor(ff, weight, i, v.first); + break; + case DT_FLOAT: + load_single_weight_tensor(ff, weight, i, v.first); + break; + default: + assert(false && "Unsupported data type"); } - - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor(ff, dims_vec, data); } } } diff --git a/inference/file_loader.h b/inference/file_loader.h index 06714293da..7ca94a8893 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -32,6 +32,11 @@ class FileDataLoader { BatchConfig::TokenId *generate_requests(int num, int length); + template + void load_single_weight_tensor(FFModel *ff, + Tensor weight, + int weight_idx, + std::string const &layername); void load_weights(FFModel *ff, std::unordered_map weights_layers); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index d4b57be6e8..6a5070790c 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -24,7 +24,8 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, int num_pipeline_stages, - InferenceMode mode) { + InferenceMode mode, + bool use_full_precision) { Config llama_config(model_config_file_path); llama_config.printConfig(); //------------------------------compute machine views ------------------ @@ -52,13 +53,27 @@ void LLAMA::create_llama_model(FFModel &ff, mapping[input].push_back(machine_views[0]); Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - Tensor token = ff.embedding(input, - llama_config.vocab_size, - llama_config.dim, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); + + Tensor token; + + if (use_full_precision) { + token = ff.embedding(input, + llama_config.vocab_size, + llama_config.dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + } else { + token = ff.embedding(input, + llama_config.vocab_size, + llama_config.dim, + AGGR_MODE_NONE, + DT_HALF, + NULL, + embed_init); + } + Layer *embedding = ff.layers.back(); weights_layers.emplace("tok_embeddings_weight", embedding); @@ -98,6 +113,7 @@ void LLAMA::create_llama_model(FFModel &ff, false, false, false, + DT_NONE, NULL, true); break; @@ -113,6 +129,7 @@ void LLAMA::create_llama_model(FFModel &ff, false, /*bias*/ false, /*add_bias_kv*/ 
false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ true /*apply_rotary_embedding*/ ); @@ -129,6 +146,7 @@ void LLAMA::create_llama_model(FFModel &ff, false, /*bias*/ false, /*add_bias_kv*/ false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ true /*apply_rotary_embedding*/ ); diff --git a/inference/models/llama.h b/inference/models/llama.h index dba68af678..11fc354a2c 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -107,7 +107,8 @@ class LLAMA { std::string const &model_config_file_path, std::string const &weight_file_path, int num_pipeline_stages, - InferenceMode mode); + InferenceMode mode, + bool use_full_precision = false); }; }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 57406929fa..dd4bb18e85 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -24,7 +24,8 @@ void OPT::create_opt_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, int num_pipeline_stages, - InferenceMode mode) { + InferenceMode mode, + bool use_full_precision) { Config opt_config(model_config_file_path); opt_config.printConfig(); //------------------------------compute machine views ------------------ @@ -57,24 +58,46 @@ void OPT::create_opt_model(FFModel &ff, Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); std::vector axes = {0}; - Tensor token = ff.embedding(input, - opt_config.vocab_size, - opt_config.word_embed_proj_dim, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - Layer *embedding = ff.layers.back(); + Tensor token; + if (use_full_precision) { + token = ff.embedding(input, + opt_config.vocab_size, + opt_config.word_embed_proj_dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + } else { + token = ff.embedding(input, + opt_config.vocab_size, + opt_config.word_embed_proj_dim, + AGGR_MODE_NONE, + DT_HALF, + NULL, + embed_init); + } + Layer *embedding = ff.layers.back(); weights_layers.emplace("embed_tokens_weight", embedding); - Tensor positional_embedding = ff.embedding(position_input, - opt_config.max_position_embeddings, - opt_config.hidden_size, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); + Tensor positional_embedding; + if (use_full_precision) { + positional_embedding = ff.embedding(position_input, + opt_config.max_position_embeddings, + opt_config.hidden_size, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + } else { + positional_embedding = ff.embedding(position_input, + opt_config.max_position_embeddings, + opt_config.hidden_size, + AGGR_MODE_NONE, + DT_HALF, + NULL, + embed_init); + } Layer *pos_embedding = ff.layers.back(); weights_layers.emplace("embed_positions_weight", pos_embedding); @@ -82,7 +105,6 @@ void OPT::create_opt_model(FFModel &ff, int num_transformer_layers_per_stage = (32 + num_pipeline_stages - 1) / num_pipeline_stages; - for (int i = 0; i < opt_config.num_hidden_layers; i++) { // 125m, 1.7B, ..., 175B applies layer norm BEFORE attention, // 350m applies layer norm AFTER attention @@ -114,6 +136,7 @@ void OPT::create_opt_model(FFModel &ff, true, false, false, + DT_NONE, /*data_type*/ NULL, false, /*scaling query*/ true, @@ -134,6 +157,7 @@ void OPT::create_opt_model(FFModel &ff, true, false, false, + DT_NONE, /*data_type*/ NULL, false, /*scaling query*/ true, @@ -154,6 +178,7 @@ void OPT::create_opt_model(FFModel &ff, true, false, false, + DT_NONE, /*data_type*/ NULL, false, /*scaling query*/ true, diff --git a/inference/models/opt.h 
b/inference/models/opt.h index d336c498d9..77d9aae962 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -109,7 +109,8 @@ class OPT { std::string const &model_config_file_path, std::string const &weight_file_path, int num_pipeline_stages, - InferenceMode mode); + InferenceMode mode, + bool use_full_precision = false); }; }; // namespace FlexFlow diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index d894b46084..f7f7e70543 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -43,7 +43,8 @@ struct ModelTypes { void parse_input_args(char **argv, int argc, FilePaths &paths, - ModelTypes &model_types) { + ModelTypes &model_types, + bool &use_full_precision) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -109,6 +110,10 @@ void parse_input_args(char **argv, paths.tokenizer_file_path = std::string(argv[++i]); continue; } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } } } @@ -119,11 +124,12 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; FilePaths file_paths; ModelTypes model_types; + bool use_full_precision = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args(argv, argc, file_paths, model_types); + parse_input_args(argv, argc, file_paths, model_types, use_full_precision); if (file_paths.ssm_weight_file_paths.size() == 0) { assert(false && "SpecInfer needs at least one SSM for speculative inference"); @@ -197,14 +203,16 @@ void FlexFlow::top_level_task(Task const *task, file_paths.ssm_config_file_paths[0], file_paths.ssm_weight_file_paths[0], 1, - BEAM_SEARCH_MODE); + BEAM_SEARCH_MODE, + use_full_precision); } else { OPT::create_opt_model(beam_model, im, file_paths.ssm_config_file_paths[0], file_paths.ssm_weight_file_paths[0], 1, - BEAM_SEARCH_MODE); + BEAM_SEARCH_MODE, + use_full_precision); } if (model_types.llm_model_type == ModelType::LLAMA) { LLAMA::create_llama_model(tree_model, @@ -212,14 +220,16 @@ void FlexFlow::top_level_task(Task const *task, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, ffconfig.workersPerNode * ffconfig.numNodes, - TREE_VERIFY_MODE); + TREE_VERIFY_MODE, + use_full_precision); } else { OPT::create_opt_model(tree_model, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, ffconfig.workersPerNode * ffconfig.numNodes, - TREE_VERIFY_MODE); + TREE_VERIFY_MODE, + use_full_precision); } TreeVerifyBatchConfig tree_bc; diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index 776401c689..bb17f807a6 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -568,8 +568,8 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = - handle->layer_norm(input, axes_vec, elementwise_affine, eps, name); + Tensor tensor = handle->layer_norm( + input, axes_vec, elementwise_affine, eps, input->data_type, name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -737,7 +737,7 @@ flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->softmax(input, dim, name); + Tensor tensor = handle->softmax(input, dim, input->data_type, 
name); DEBUG_PRINT( "[Softmax] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); @@ -979,6 +979,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( bias, add_bias_kv, add_zero_attn, + query->data_type, kernel_initializer, name); DEBUG_PRINT("[MultiHeadAttention] new Tensor %p, query %p, key %p, value %p, " @@ -1027,6 +1028,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( bias, add_bias_kv, add_zero_attn, + input->data_type, kernel_initializer, name); return FFCObjectWrapper::wrap(tensor); @@ -1969,4 +1971,4 @@ void finish_flexflow_task() { // The previous call is asynchronous so we still need to // wait for the shutdown of the runtime to complete Runtime::wait_for_shutdown(); -} +} \ No newline at end of file diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 5636b7b924..eedd89bd5f 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -233,7 +233,7 @@ OpMeta *ArgTopK::init_task(Task const *task, Runtime *runtime) { ArgTopK *topk = (ArgTopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ArgTopKMeta *m = new ArgTopKMeta(handle); + ArgTopKMeta *m = new ArgTopKMeta(handle, topk); m->profiling = topk->profiling; m->sorted = topk->sorted; return m; @@ -296,42 +296,20 @@ InferenceResult assert(task->regions.size() == 2); // const ArgTopK* topk = (const ArgTopK*) task->args; ArgTopKMeta const *m = *((ArgTopKMeta **)task->local_args); - Domain in1_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - // Domain out1_domain = runtime->get_index_space_domain( - // ctx, task->regions[1].region.get_index_space()); - Domain out2_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - int numdims = in1_domain.get_dim(); - assert(out2_domain.get_dim() == numdims); - int in_cols = in1_domain.hi()[0] - in1_domain.lo()[0] + 1; - // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1; - int out2_cols = out2_domain.hi()[0] - out2_domain.lo()[0] + 1; + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - // assert(out1_domain == out2_domain); - for (int i = 1; i < in1_domain.get_dim(); i++) { - assert(in1_domain.lo()[i] == out2_domain.lo()[i]); - assert(in1_domain.hi()[i] == out2_domain.hi()[i]); - } - float const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - // float *value_ptr = helperGetTensorPointerWO( - // regions[1], task->regions[1], FID_DATA, ctx, runtime); - int *index_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - - int length = in1_domain.hi()[0] - in1_domain.lo()[0] + 1; - int k = - out2_domain.hi()[0] - out2_domain.lo()[0] + 1; /*TODO: This prints to 5*/ - size_t batch_size = in1_domain.get_volume() / length; - assert(out2_domain.get_volume() / k == batch_size); + ArgTopK::forward_kernel_wrapper(m, input, indices); - ArgTopK::forward_kernel_wrapper( - m, in_ptr, index_ptr, batch_size, length, k, m->sorted); + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int batch_size = input.domain.get_volume() / length; InferenceResult ir; - download_tensor(index_ptr, ir.token_ids, batch_size); + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } 
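The design running through this series: --use-full-precision only selects the dtype of the embedding weights; the operators below are constructed with DT_NONE and inherit their input's dtype, and the device tasks now read GenericTensorAccessors and dispatch the templated kernel on that dtype, which is what carries the half-precision path end to end. A minimal sketch of that flow, assuming a hypothetical build_embedding helper and free-standing vocab_size/dim/embed_init values (illustrative names, not part of the patch):

// Sketch only: how the precision flag reaches the kernels in this series.
// build_embedding, vocab_size, dim and embed_init are stand-ins for the
// model-construction code in inference/models/llama.cc and opt.cc.
#include "flexflow/model.h"
using namespace FlexFlow;

Tensor build_embedding(FFModel &ff, Tensor input, int vocab_size, int dim,
                       Initializer *embed_init, bool use_full_precision) {
  // The flag only picks the embedding dtype; DT_HALF is the default path.
  DataType dtype = use_full_precision ? DT_FLOAT : DT_HALF;
  Tensor token = ff.embedding(
      input, vocab_size, dim, AGGR_MODE_NONE, dtype, NULL, embed_init);
  // Downstream attention and top-k ops are created with DT_NONE and resolve
  // data_type = input->data_type, so their GPU tasks only need the
  // accessor-based dispatch on data_type shown in the arg_topk task above.
  return token;
}

(The explicit if/else blocks in llama.cc and opt.cc above differ only in the dtype argument, so they are equivalent to the single ternary in this sketch.)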
diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index cc43967894..d055e09def 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -364,8 +364,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, } /*static*/ +template void ArgTopK::forward_kernel(ArgTopKMeta const *m, - float const *input_ptr, + DT const *input_ptr, // float *output_ptr, int *indices_ptr, size_t batch_size, @@ -378,7 +379,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, int num_shards = 0; { constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = k * sizeof(Entry); + auto const heap_size = k * sizeof(Entry
); // shared_memory_size = (num_shards + 1) * heap_size <=> num_shards = shared_memory_size / heap_size - 1; assert(num_shards > 0); @@ -387,7 +388,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, } } // We are limited by the amount of shared memory we have per block. - size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry); + size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; assert(num_shards >= (size_t)k); @@ -408,15 +409,41 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, - float const *input_ptr, + GenericTensorAccessorR const &input, // float *output_ptr, - int *indices_ptr, - size_t batch_size, - int length, - int k, - bool sorted) { + GenericTensorAccessorW const &indices) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + // Domain in1_domain = runtime->get_index_space_domain( + // ctx, task->regions[0].region.get_index_space()); + // Domain out1_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + // Domain out2_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + int numdims = input.domain.get_dim(); + assert(indices.domain.get_dim() == numdims); + + int in_cols = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1; + int out2_cols = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; + + // assert(out1_domain == out2_domain); + for (int i = 1; i < input.domain.get_dim(); i++) { + assert(input.domain.lo()[i] == indices.domain.lo()[i]); + assert(input.domain.hi()[i] == indices.domain.hi()[i]); + } + // float const *in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + // float *value_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + // int *index_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int k = indices.domain.hi()[0] - indices.domain.lo()[0] + + 1; /*TODO: This prints to 5*/ + size_t batch_size = input.domain.get_volume() / length; + assert(indices.domain.get_volume() / k == batch_size); hipEvent_t t_start, t_end; if (m->profiling) { @@ -425,16 +452,29 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, hipEventRecord(t_start, stream); } - ArgTopK::forward_kernel(m, - input_ptr, - // output_ptr, - indices_ptr, - batch_size, - length, - k, - sorted, - stream); - + if (input.data_type == DT_HALF) { + ArgTopK::forward_kernel(m, + input.get_half_ptr(), + // output_ptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + stream); + } else if (input.data_type == DT_FLOAT) { + ArgTopK::forward_kernel(m, + input.get_float_ptr(), + // output_ptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { hipEventRecord(t_end, stream); checkCUDA(hipEventSynchronize(t_end)); @@ -445,6 +485,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, } } -ArgTopKMeta::ArgTopKMeta(FFHandler handler) : OpMeta(handler) {} +ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op) + : OpMeta(handler, op) {} }; // namespace FlexFlow diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 82fc113d4f..9583af525e 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -363,8 +363,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, } /*static*/ +template void ArgTopK::forward_kernel(ArgTopKMeta const *m, - float const *input_ptr, + DT const *input_ptr, // float *output_ptr, int *indices_ptr, size_t batch_size, @@ -377,7 +378,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta 
const *m, int num_shards = 0; { constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = k * sizeof(Entry); + auto const heap_size = k * sizeof(Entry
); // shared_memory_size = (num_shards + 1) * heap_size <=> num_shards = shared_memory_size / heap_size - 1; assert(num_shards > 0); @@ -386,7 +387,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, } } // We are limited by the amount of shared memory we have per block. - size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry); + size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; assert(num_shards >= (size_t)k); @@ -403,16 +404,43 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, - float const *input_ptr, + GenericTensorAccessorR const &input, // float *output_ptr, - int *indices_ptr, - size_t batch_size, - int length, - int k, - bool sorted) { + GenericTensorAccessorW const &indices) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + // Domain in1_domain = runtime->get_index_space_domain( + // ctx, task->regions[0].region.get_index_space()); + // Domain out1_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + // Domain out2_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + int numdims = input.domain.get_dim(); + assert(indices.domain.get_dim() == numdims); + + int in_cols = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1; + int out2_cols = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; + + // assert(out1_domain == out2_domain); + for (int i = 1; i < input.domain.get_dim(); i++) { + assert(input.domain.lo()[i] == indices.domain.lo()[i]); + assert(input.domain.hi()[i] == indices.domain.hi()[i]); + } + // float const *in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + // float *value_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + // int *index_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int k = indices.domain.hi()[0] - indices.domain.lo()[0] + + 1; /*TODO: This prints to 5*/ + size_t batch_size = input.domain.get_volume() / length; + assert(indices.domain.get_volume() / k == batch_size); + cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -420,15 +448,29 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, cudaEventRecord(t_start, stream); } - ArgTopK::forward_kernel(m, - input_ptr, - // output_ptr, - indices_ptr, - batch_size, - length, - k, - sorted, - stream); + if (input.data_type == DT_HALF) { + ArgTopK::forward_kernel(m, + input.get_half_ptr(), + // output_ptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + stream); + } else if (input.data_type == DT_FLOAT) { + ArgTopK::forward_kernel(m, + input.get_float_ptr(), + // output_ptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); @@ -441,6 +483,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, } } -ArgTopKMeta::ArgTopKMeta(FFHandler handler) : OpMeta(handler) {} +ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op) + : OpMeta(handler, op) {} }; // namespace FlexFlow diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 7af6cb8697..8494981cf6 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -59,8 +59,11 @@ Tensor FFModel::multihead_attention(const Tensor query, bool bias, bool add_bias_kv, bool add_zero_attn, + DataType data_type, Initializer *kernel_initializer, char const *name) { + // Currently only support float for the original attention operator + assert(data_type == DT_NONE || data_type == DT_FLOAT); Layer *li = 
new Layer(this, OP_MULTIHEAD_ATTENTION, DT_FLOAT, diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 16c14bdeca..d67c84a9df 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -145,7 +145,7 @@ BeamTopK::BeamTopK(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, inputs[0]->dims, DT_INT32, this, 0 /*owner_idx*/); outputs[1] = model.create_parallel_tensor_legion_ordering( - numdim, inputs[0]->dims, _input->data_type, this, 1 /*owner_idx*/); + numdim, inputs[0]->dims, DT_FLOAT, this, 1 /*owner_idx*/); outputs[2] = model.create_parallel_tensor_legion_ordering( numdim, inputs[0]->dims, DT_INT32, this, 2 /*owner_idx*/); } @@ -270,10 +270,11 @@ OpMeta *BeamTopK::init_task(Task const *task, Runtime *runtime) { BeamTopK *topk = (BeamTopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - BeamTopKMeta *m = new BeamTopKMeta(handle); + BeamTopKMeta *m = new BeamTopKMeta(handle, topk); m->profiling = topk->profiling; m->sorted = topk->sorted; m->max_beam_width = topk->max_beam_width; + m->input_type[0] = topk->inputs[0]->data_type; return m; } @@ -341,7 +342,6 @@ BeamInferenceResult assert(regions.size() == 4); assert(task->regions.size() == 4); - BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; // std::cout << "beam search topk inference: " @@ -356,13 +356,16 @@ BeamInferenceResult ctx, task->regions[1].region.get_index_space()); int numdims = in1_domain.get_dim(); - float const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); + // float const *in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); // float *value_ptr = helperGetTensorPointerWO( // regions[1], task->regions[1], FID_DATA, ctx, runtime); int *index_ptr = helperGetTensorPointerWO( regions[1], task->regions[1], FID_DATA, ctx, runtime); + // ); float *value_ptr = helperGetTensorPointerWO( regions[2], task->regions[2], FID_DATA, ctx, runtime); @@ -396,7 +399,7 @@ BeamInferenceResult // need meta for: how many sub requests in a main request BeamTopK::forward_kernel_wrapper(m, bc, - in_ptr, + input, value_ptr, index_ptr, parent_ptr, @@ -408,6 +411,13 @@ BeamInferenceResult download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); + // if(m->output_type[0] == DT_FLOAT){ + // download_tensor(value.get_float_ptr(), ir.probs, batch_size * + // m->max_beam_width); + // }else if(m->output_type[0] == DT_HALF){ + // download_tensor(value.get_half_ptr(), ir.probs, batch_size * + // m->max_beam_width); + // } download_tensor( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); return ir; diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 7e9421f299..1817eae4da 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/beam_topk.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -24,6 +25,8 @@ using Legion::coord_t; enum class HeapType { kMinHeap, kMaxHeap }; enum class PreferIndices { kLower, kHigher }; +LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); + template struct Entry { int index; @@ -264,12 +267,13 @@ __device__ void mergeBeamShards(int num_shards, int max_heap_size, int request_id, int *parent_id, - float *probs, + T *probs, 
Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - T *top_k_values, + float *top_k_values, int *top_k_indices, - int *top_k_parents) { + int *top_k_parents, + bool verbose) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. // If k > num_shards, we can initialize a min-heap with the top element from @@ -286,15 +290,9 @@ __device__ void mergeBeamShards(int num_shards, // Initialize the heap as a min-heap. for (int slot = 0; slot < heap_size; slot++) { // int beam = (slot % max_heap_size) / k; - float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((slot % max_heap_size) / k)]; + T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((slot % max_heap_size) / k)]; min_heap.assign(slot, {slot, (entries[slot].value * prob)}); - if (batch_index == 0) { - printf("slot %d, value %.15f, prob %15f\n", - slot, - entries[slot].value, - prob); - } } min_heap.build(heap_size); @@ -303,15 +301,8 @@ __device__ void mergeBeamShards(int num_shards, auto const entry = entries[shard]; auto const root = min_heap.root(); - float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard % max_heap_size) / k)]; - if (batch_index == 0) { - printf("shard %d, index %d, value %.15f, prob %.15f\n", - shard, - entry.index, - entry.value, - prob); - } + T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((shard % max_heap_size) / k)]; if (entry.value * prob < root.value) { continue; } @@ -339,7 +330,7 @@ __device__ void mergeBeamShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - top_k_values[rank] = max_element.value; + top_k_values[rank] = __half2float(max_element.value); int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; top_k_parents[rank] = @@ -347,14 +338,8 @@ __device__ void mergeBeamShards(int num_shards, ((shard_index % max_heap_size) / k)]; int next_shard_index = shard_index + num_shards; - float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((next_shard_index % max_heap_size) / k)]; - if (batch_index == 0) { - printf("next_shard_index %d, value %.15f, prob %.15f\n", - next_shard_index, - entries[next_shard_index].value, - prob); - } + T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((next_shard_index % max_heap_size) / k)]; max_heap.replace_root( {next_shard_index, entries[next_shard_index].value * prob}, @@ -363,7 +348,7 @@ __device__ void mergeBeamShards(int num_shards, // rank == last_k. 
Entry const &max_element = max_heap.root(); - top_k_values[last_k] = max_element.value; + top_k_values[last_k] = __half2float(max_element.value); int shard_index = max_element.index; top_k_indices[last_k] = entries[shard_index].index; top_k_parents[last_k] = @@ -390,15 +375,15 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, int k, int max_heap_size, int *parent_ids, - float *acc_probs, + T *acc_probs, int *gpu_block_start_index, int *gpu_request_id, int *tokens_per_request, bool sorted, - T *__restrict__ output, + float *__restrict__ output, int *__restrict__ indices, int *__restrict__ parents, - bool is_print) { + bool verbose) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; // T const *batch_input = input + batch_index * length; @@ -409,7 +394,7 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, Entry *shared_entries = (Entry *)shared_memory; int sub_request_id = thread_index / k; - // if (is_print) { + // if (verbose) { // printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d, // " // "request_id %d, token_nums %d\n", @@ -423,20 +408,6 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, T const *batch_input = input + gpu_block_start_index[batch_index] + (sub_request_id * token_nums * length); - if (batch_index == 0) { - printf("request 0 start index: thread index %d, offset %d, batch_input %p, " - "acc index %d acc " - "prob %f, thread_count %d, request_id %d\n", - thread_index, - gpu_block_start_index[batch_index] + - (sub_request_id * token_nums * length), - batch_input, - request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id, - acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - sub_request_id], - thread_count, - request_id); - } // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index, // thread_count, batch_index); heapBeamTopK(batch_input, @@ -466,18 +437,18 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, if (thread_index == 0) { // merge beam_width heaps and store the parent // find which req it belongs to, replace the offset - printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", - batch_index, - sub_request_id, - acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - sub_request_id]); + // printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", + // batch_index, + // sub_request_id, + // acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + // sub_request_id]); int const offset = batch_index * k; auto batch_output = output + offset; auto batch_indices = indices + offset; auto batch_parents = parents + offset; Entry *top_k_heap = shared_entries + thread_count * k; - // if(batch_index == 0 && is_print){ + // if(batch_index == 0 && verbose) { // for(int i = 0; i < 18; i++){ // printf("see value: %.15f\n", shared_entries[i].value); // } @@ -495,14 +466,16 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, top_k_heap, batch_output, batch_indices, - batch_parents); + batch_parents, + verbose /*verbose prints*/); } } /*static*/ +template void BeamTopK::forward_kernel(BeamTopKMeta const *m, BeamSearchBatchConfig const *bc, - float const *input_ptr, + DT const *input_ptr, float *output_ptr, int *indices_ptr, int *parent_ptr, @@ -530,14 +503,12 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, std::vector tokens_per_request; int block_start_index = 0; - int depth = - 
bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; // a data structure for prob, parent_id, int max_total_requests = BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests(); int parent_ids[max_total_requests]; - float acc_probs[max_total_requests]; + DT acc_probs[max_total_requests]; for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { if (bc->request_completed[i]) { @@ -552,7 +523,8 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int beam_size = bc->beamRequestsInfo[i].beam_size; // initial request - std::cout << "sub_requests: " << i << ", " << sub_requests[i] << "\n"; + log_beam_topk.debug() << "sub_requests: " << i << ", " << sub_requests[i] + << "\n"; assert(sub_requests[i] > 0); // process sub requests for (int j = 0; j < sub_requests[i]; j++) { @@ -560,12 +532,12 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, // beam_slots[i].parent_id[j]; acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = bc->beamRequestsInfo[i].probs[j]; - std::cout << "probbbb req: " << i << ", sub req probability : " - << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << j - << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] - << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j - << "\n"; + log_beam_topk.debug() + << "probbbb req: " << i + << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j] + << ", sub request id " << j << ", parent id " + << bc->beamRequestsInfo[i].parent_id[j] << ", data inddd" + << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n"; } // process tokens @@ -582,84 +554,71 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, req_index += 1; block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; } - std::cout << "what index: " << block_start_index - << ", block num: " << beam_num_blocks << "\n"; + log_beam_topk.debug() << "what index: " << block_start_index + << ", block num: " << beam_num_blocks << "\n"; assert(batch_size >= beam_num_blocks); assert(bc->num_active_requests() == req_index); { constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = max_heap_size * sizeof(Entry); + auto const heap_size = max_heap_size * sizeof(Entry
); // shared_memory_size = (num_shards + 1) * heap_size <=> num_shards = shared_memory_size / heap_size - 1; assert(num_shards > 0); if (num_shards > CUDA_NUM_THREADS) { num_shards = CUDA_NUM_THREADS; } - std::cout << "maxheap size: " << max_heap_size << "\n"; - std::cout << "maxbeam width: " << max_beam_width - << ", heap size: " << heap_size << "\n"; + log_beam_topk.debug() << "maxheap size: " << max_heap_size << "\n"; + log_beam_topk.debug() << "maxbeam width: " << max_beam_width + << ", heap size: " << heap_size << "\n"; } // We are limited by the amount of shared memory we have per block. size_t shared_memory_size = - (num_shards + 1) * max_heap_size * sizeof(Entry); + (num_shards + 1) * max_heap_size * sizeof(Entry
); assert(num_shards >= (size_t)max_heap_size); num_shards = max_heap_size; - // parent_id, per token - int *gpu_parents; - // acc_porbs, per token - float *gpu_probs; - // each block's start index; - // one block means the single token in different requests; - int *gpu_block_start_index; - int *gpu_request_id; - int *gpu_tokens_per_request; - - checkCUDA(hipMalloc(&gpu_parents, sizeof(int) * max_total_requests)); - checkCUDA(hipMalloc(&gpu_probs, sizeof(float) * max_total_requests)); - checkCUDA(hipMalloc(&gpu_block_start_index, sizeof(int) * beam_num_blocks)); - checkCUDA(hipMalloc(&gpu_request_id, sizeof(int) * beam_num_blocks)); - checkCUDA(hipMalloc(&gpu_tokens_per_request, sizeof(int) * beam_num_blocks)); - checkCUDA(hipMemcpy(gpu_parents, + checkCUDA(hipMemcpy(m->parent_ids, parent_ids, sizeof(int) * max_total_requests, hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(gpu_probs, + checkCUDA(hipMemcpy(m->acc_probs, acc_probs, - sizeof(float) * max_total_requests, + sizeof(DT) * max_total_requests, hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(gpu_block_start_index, + checkCUDA(hipMemcpy(m->block_start_index, beam_block_start_index.data(), sizeof(int) * beam_num_blocks, hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(gpu_request_id, + checkCUDA(hipMemcpy(m->request_id, request_id.data(), sizeof(int) * beam_num_blocks, hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(gpu_tokens_per_request, + checkCUDA(hipMemcpy(m->tokens_per_request, tokens_per_request.data(), sizeof(int) * beam_num_blocks, hipMemcpyHostToDevice)); - + // int depth = + // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; beam_topk_forward_kernel<<>>( input_ptr, shared_memory_size, length, max_beam_width, max_heap_size, - gpu_parents, - gpu_probs, - gpu_block_start_index, - gpu_request_id, - gpu_tokens_per_request, + m->parent_ids, + static_cast
(m->acc_probs), + m->block_start_index, + m->request_id, + m->tokens_per_request, sorted, output_ptr, indices_ptr, parent_ptr, - depth == 1); + false /*verbose*/ // depth == 1 + ); // merge sub } @@ -667,7 +626,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, /*static*/ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamSearchBatchConfig const *bc, - float const *input_ptr, + GenericTensorAccessorR const &input, float *output_ptr, int *indices_ptr, int *parent_ptr, @@ -684,16 +643,29 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, hipEventRecord(t_start, stream); } - BeamTopK::forward_kernel(m, - bc, - input_ptr, - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); + if (input.data_type == DT_HALF) { + BeamTopK::forward_kernel(m, + bc, + input.get_half_ptr(), + output_ptr, + indices_ptr, + parent_ptr, + batch_size, + length, + sorted, + stream); + } else if (input.data_type == DT_FLOAT) { + BeamTopK::forward_kernel(m, + bc, + input.get_float_ptr(), + output_ptr, + indices_ptr, + parent_ptr, + batch_size, + length, + sorted, + stream); + } if (m->profiling) { hipEventRecord(t_end, stream); @@ -704,13 +676,25 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, hipEventDestroy(t_end); printf("[BeamTopK] forward time = %.2lfms\n", elapsed); } - // if(bc->beam_slots.at(0).current_depth == 1){ - // print_beam_tensor((float *)input_ptr, 50, 32000, 15, "beam topk - // input"); print_tensor((float *)output_ptr, 50, "beam topk - // output"); - // } } -BeamTopKMeta::BeamTopKMeta(FFHandler handler) : OpMeta(handler) {} - +BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op) : OpMeta(handler) { + DataType data_type = op->inputs[0]->data_type; + checkCUDA(hipMalloc(&parent_ids, + sizeof(int) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + checkCUDA(hipMalloc(&acc_probs, + sizeof(data_type_size(data_type)) * + BeamSearchBatchConfig::MAX_BEAM_WIDTH * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + checkCUDA(hipMalloc(&block_start_index, + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + checkCUDA(hipMalloc(&request_id, + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + checkCUDA(hipMalloc(&tokens_per_request, + sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS)); +} }; // namespace FlexFlow diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index d18d5c2f00..2c5ceda548 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/beam_topk.h" #include "flexflow/utils/cuda_helper.h" @@ -265,10 +266,10 @@ __device__ void mergeBeamShards(int num_shards, int max_heap_size, int request_id, int *parent_id, - float *probs, + T *probs, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - T *top_k_values, + float *top_k_values, int *top_k_indices, int *top_k_parents, bool verbose) { @@ -288,8 +289,8 @@ __device__ void mergeBeamShards(int num_shards, // Initialize the heap as a min-heap. 
for (int slot = 0; slot < heap_size; slot++) { // int beam = (slot % max_heap_size) / k; - float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((slot % max_heap_size) / k)]; + T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((slot % max_heap_size) / k)]; min_heap.assign(slot, {slot, (entries[slot].value * prob)}); if (verbose && batch_index == 0) { printf("slot %d, value %.15f, prob %15f\n", @@ -305,8 +306,8 @@ __device__ void mergeBeamShards(int num_shards, auto const entry = entries[shard]; auto const root = min_heap.root(); - float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard % max_heap_size) / k)]; + T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((shard % max_heap_size) / k)]; if (verbose && batch_index == 0) { printf("shard %d, index %d, value %.15f, prob %.15f\n", shard, @@ -341,7 +342,7 @@ __device__ void mergeBeamShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - top_k_values[rank] = max_element.value; + top_k_values[rank] = __half2float(max_element.value); int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; top_k_parents[rank] = @@ -349,8 +350,8 @@ __device__ void mergeBeamShards(int num_shards, ((shard_index % max_heap_size) / k)]; int next_shard_index = shard_index + num_shards; - float prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((next_shard_index % max_heap_size) / k)]; + T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + ((next_shard_index % max_heap_size) / k)]; if (batch_index == 0) { printf("next_shard_index %d, value %.15f, prob %.15f\n", next_shard_index, @@ -365,7 +366,7 @@ __device__ void mergeBeamShards(int num_shards, // rank == last_k. Entry const &max_element = max_heap.root(); - top_k_values[last_k] = max_element.value; + top_k_values[last_k] = __half2float(max_element.value); int shard_index = max_element.index; top_k_indices[last_k] = entries[shard_index].index; top_k_parents[last_k] = @@ -392,12 +393,12 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, int k, int max_heap_size, int *parent_ids, - float *acc_probs, + T *acc_probs, int *gpu_block_start_index, int *gpu_request_id, int *tokens_per_request, bool sorted, - T *__restrict__ output, + float *__restrict__ output, int *__restrict__ indices, int *__restrict__ parents, bool verbose) { @@ -503,9 +504,10 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, } /*static*/ +template void BeamTopK::forward_kernel(BeamTopKMeta const *m, BeamSearchBatchConfig const *bc, - float const *input_ptr, + DT const *input_ptr, float *output_ptr, int *indices_ptr, int *parent_ptr, @@ -538,7 +540,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int max_total_requests = BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests(); int parent_ids[max_total_requests]; - float acc_probs[max_total_requests]; + DT acc_probs[max_total_requests]; for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { if (bc->request_completed[i]) { @@ -592,7 +594,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, { constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = max_heap_size * sizeof(Entry); + auto const heap_size = max_heap_size * sizeof(Entry
); // shared_memory_size = (num_shards + 1) * heap_size <=> num_shards = shared_memory_size / heap_size - 1; assert(num_shards > 0); @@ -605,7 +607,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, } // We are limited by the amount of shared memory we have per block. size_t shared_memory_size = - (num_shards + 1) * max_heap_size * sizeof(Entry); + (num_shards + 1) * max_heap_size * sizeof(Entry
); assert(num_shards >= (size_t)max_heap_size); num_shards = max_heap_size; @@ -616,7 +618,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, cudaMemcpyHostToDevice)); checkCUDA(cudaMemcpy(m->acc_probs, acc_probs, - sizeof(float) * max_total_requests, + sizeof(DT) * max_total_requests, cudaMemcpyHostToDevice)); checkCUDA(cudaMemcpy(m->block_start_index, beam_block_start_index.data(), @@ -639,7 +641,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, max_beam_width, max_heap_size, m->parent_ids, - m->acc_probs, + static_cast
(m->acc_probs), m->block_start_index, m->request_id, m->tokens_per_request, @@ -656,7 +658,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, /*static*/ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamSearchBatchConfig const *bc, - float const *input_ptr, + GenericTensorAccessorR const &input, float *output_ptr, int *indices_ptr, int *parent_ptr, @@ -673,16 +675,29 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, cudaEventRecord(t_start, stream); } - BeamTopK::forward_kernel(m, - bc, - input_ptr, - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); + if (input.data_type == DT_HALF) { + BeamTopK::forward_kernel(m, + bc, + input.get_half_ptr(), + output_ptr, + indices_ptr, + parent_ptr, + batch_size, + length, + sorted, + stream); + } else if (input.data_type == DT_FLOAT) { + BeamTopK::forward_kernel(m, + bc, + input.get_float_ptr(), + output_ptr, + indices_ptr, + parent_ptr, + batch_size, + length, + sorted, + stream); + } if (m->profiling) { cudaEventRecord(t_end, stream); @@ -695,12 +710,14 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, } } -BeamTopKMeta::BeamTopKMeta(FFHandler handler) : OpMeta(handler) { +BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op) : OpMeta(handler) { + DataType data_type = op->inputs[0]->data_type; checkCUDA(cudaMalloc(&parent_ids, sizeof(int) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); checkCUDA(cudaMalloc(&acc_probs, - sizeof(float) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * + sizeof(data_type_size(data_type)) * + BeamSearchBatchConfig::MAX_BEAM_WIDTH * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); checkCUDA(cudaMalloc(&block_start_index, sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 4e5d640c08..cf90919e6b 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -395,7 +395,7 @@ OpMeta *ElementBinary::init_task(Task const *task, Runtime *runtime) { ElementBinary *eb = (ElementBinary *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ElementBinaryMeta *m = new ElementBinaryMeta(handle); + ElementBinaryMeta *m = new ElementBinaryMeta(handle, eb); for (int i = 0; i < eb->numInputs; i++) { m->trainableInputs[i] = eb->trainableInputs[i]; } @@ -596,8 +596,11 @@ __host__ void Runtime *runtime) { // const ElementBinary* ele = (const ElementBinary*) task->args; ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); + GenericTensorAccessorR in1, in2; + GenericTensorAccessorW out; Domain in1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); + if (!m->has_same_operands) { Domain in2_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); @@ -607,53 +610,78 @@ __host__ void m->op_type == OP_EW_MUL); } } - float const *in1_ptr = NULL, *in2_ptr = NULL; - float *out_ptr = NULL; + if (m->inplace_a) { if (m->has_same_operands) { assert(regions.size() == 1); assert(task->regions.size() == 1); - out_ptr = helperGetTensorPointerRW( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - in2_ptr = out_ptr; - in1_ptr = out_ptr; + out = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + in2 = out; + in1 = out; } else { assert(regions.size() == 2); assert(task->regions.size() == 2); - out_ptr = helperGetTensorPointerRW( - regions[0], task->regions[0], FID_DATA, ctx, runtime); 
- in2_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - in1_ptr = out_ptr; + out = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + in2 = helperGetGenericTensorAccessorRO(m->input_type[1], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + in1 = out; } } else { if (m->has_same_operands) { assert(regions.size() == 2); assert(task->regions.size() == 2); - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - // assert(out_domain == in1_domain); - in1_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - in2_ptr = in1_ptr; - out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); + in1 = helperGetGenericTensorAccessorRO(m->input_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + in2 = in1; + out = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); } else { assert(regions.size() == 3); assert(task->regions.size() == 3); - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - // assert(out_domain == in1_domain); - in1_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - in2_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - out_ptr = helperGetTensorPointerWO( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + in1 = helperGetGenericTensorAccessorRO(m->input_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + in2 = helperGetGenericTensorAccessorRO(m->input_type[1], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + out = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); } } - forward_kernel_wrapper(m, in1_ptr, in2_ptr, out_ptr); + forward_kernel_wrapper(m, in1, in2, out); } void ElementBinary::backward(FFModel const &ff) { @@ -855,7 +883,7 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, if (!inputs[1]->get_sub_tensor(mv, sub_input2)) { return false; } - ElementBinaryMeta *m = sim->ele_binary_meta; + ElementBinaryMeta *m = new ElementBinaryMeta(sim->handler, this); m->op_type = op_type; m->profiling = this->profiling; m->inplace_a = this->inplace_a; @@ -871,8 +899,12 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, sim->free_all(); float *input1_ptr = (float *)sim->allocate(sub_input1.get_volume(), DT_FLOAT); assert(input1_ptr != NULL); + GenericTensorAccessorR input1_acc( + inputs[0]->data_type, input1_domain, input1_ptr); float *input2_ptr = (float *)sim->allocate(sub_input2.get_volume(), DT_FLOAT); assert(input2_ptr != NULL); + GenericTensorAccessorR input2_acc( + inputs[1]->data_type, input2_domain, input2_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = NULL; @@ -882,13 +914,15 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); } assert(output_ptr != NULL); + GenericTensorAccessorW output_acc( + outputs[0]->data_type, output_domain, output_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); assert(m->profiling == false); std::function forward, backward; forward = [&] { - forward_kernel_wrapper(m, input1_ptr, input2_ptr, 
output_ptr); + forward_kernel_wrapper(m, input1_acc, input2_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input1_grad_ptr = @@ -937,6 +971,7 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, cost_metrics.forward_time); } + delete m; return true; } diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 7752d30828..f0713dd0a1 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -27,11 +27,11 @@ Tensor FFModel::unary(OperatorType op, char const *name, float scalar) { Layer *ele = nullptr; - DataType dtype; - // FIXME: currently cast input to float if it has a lower type - if (x->data_type < DT_FLOAT) { + DataType dtype = x->data_type; + // if (x->data_type < DT_FLOAT) { + if (false) { dtype = DT_FLOAT; - std::string str(name); + std::string str = nullptr ? "" : std::string(name); Tensor new_x = cast(x, dtype, (str + "input_pre_cast").c_str()); ele = new Layer(this, op, @@ -473,7 +473,9 @@ void ElementUnary::forward_task(Task const *task, Context ctx, Runtime *runtime) { ElementUnaryMeta const *m = *((ElementUnaryMeta **)task->local_args); - if (m->data_type == DT_FLOAT) { + if (m->data_type == DT_HALF) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (m->data_type == DT_FLOAT) { forward_task_with_type(task, regions, ctx, runtime); } else if (m->data_type == DT_DOUBLE) { forward_task_with_type(task, regions, ctx, runtime); diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index 43c84b0c41..424e739e13 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -45,10 +45,11 @@ void ElementUnary::init_kernel(ElementUnaryMeta *m, assert(false); } checkCUDNN(miopenSetActivationDescriptor(m->actiDesc, mode, 0.0, 0.0, 0.0)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(m->inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain( + m->inputTensor, input_domain, m->data_type)); // input_domain == output_domain - checkCUDNN( - cudnnSetTensorDescriptorFromDomain(m->outputTensor, output_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain( + m->outputTensor, output_domain, m->data_type)); } template @@ -81,7 +82,9 @@ __global__ void elewise_unary_forward_kernel( break; } case OP_GELU: { - out[i] = (T)(in[i] * 0.5 * erfc(-in[i] * M_SQRT1_2)); + out[i] = (T)(in[i] * static_cast(0.5f) * + static_cast(erfc(static_cast( + -in[i] * static_cast(M_SQRT1_2))))); break; } case OP_RSQRT: { @@ -189,7 +192,7 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - + (0.5 * static_cast(erfc(-input[i] * M_SQRT1_2)) - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); break; } @@ -284,6 +287,11 @@ ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); } +template void + ElementUnary::forward_kernel_wrapper(ElementUnaryMeta const *m, + half const *input_ptr, + half *output_ptr, + size_t num_elements); template void ElementUnary::forward_kernel_wrapper(ElementUnaryMeta const *m, float const *input_ptr, diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index d6e5bcfdc3..4a38dabe52 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -45,10 +45,11 @@ void ElementUnary::init_kernel(ElementUnaryMeta *m, } checkCUDNN(cudnnSetActivationDescriptor( m->actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); - 
checkCUDNN(cudnnSetTensorDescriptorFromDomain(m->inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain( + m->inputTensor, input_domain, m->data_type)); // input_domain == output_domain - checkCUDNN( - cudnnSetTensorDescriptorFromDomain(m->outputTensor, output_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain( + m->outputTensor, output_domain, m->data_type)); } template @@ -81,7 +82,9 @@ __global__ void elewise_unary_forward_kernel( break; } case OP_GELU: { - out[i] = (T)(in[i] * 0.5 * erfc(-in[i] * M_SQRT1_2)); + out[i] = (T)(in[i] * static_cast(0.5f) * + static_cast(erfc(static_cast( + -in[i] * static_cast(M_SQRT1_2))))); break; } case OP_RSQRT: { @@ -202,7 +205,7 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * erfc(-input[i] * M_SQRT1_2) - + (0.5 * static_cast(erfc(-input[i] * M_SQRT1_2)) - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); break; } @@ -293,6 +296,11 @@ ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); } +template void + ElementUnary::forward_kernel_wrapper(ElementUnaryMeta const *m, + half const *input_ptr, + half *output_ptr, + size_t num_elements); template void ElementUnary::forward_kernel_wrapper(ElementUnaryMeta const *m, float const *input_ptr, @@ -313,7 +321,6 @@ template void int64_t const *input_ptr, int64_t *output_ptr, size_t num_elements); - template void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, float const *input_ptr, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index a602c5d6b1..712ed143b1 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -284,11 +284,10 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain == my_input_accessor[1].domain); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + Kernels::ElementBinary::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); break; break; } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index ca2a331984..17b0f9616d 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -297,11 +297,10 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_input_accessor[0].domain == my_input_accessor[1].domain); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + Kernels::ElementBinary::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); break; } case OP_EMBEDDING: { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index a0f0c34c1b..6027a2cd21 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "flexflow/utils/cuda_helper.h" @@ -65,22 +66,38 @@ Tensor 
FFModel::inc_multihead_self_attention(const Tensor input, bool bias, bool add_bias_kv, bool add_zero_attn, + DataType data_type, Initializer *kernel_initializer, bool apply_rotary_embedding, bool scaling_query, float scaling_factor, bool qk_prod_scaling, char const *name) { + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; int weight_num = bias ? 2 : 1; - // Currently assume that - Layer *li = new Layer(this, - OP_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - input); + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); + li = new Layer(this, + OP_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + input); + } { int numdims = input->num_dims; int dims[MAX_TENSOR_DIM]; @@ -89,7 +106,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, } dims[0] = embed_dim; li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); + numdims, dims, data_type, li, 0, true /*create_grad*/); } { // Compute weight size @@ -103,7 +120,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; li->weights[0] = create_weight_legion_ordering(2, dims, - DT_FLOAT, + data_type, li, true /*create_grad*/, kernel_initializer, @@ -114,13 +131,13 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, int dims[1] = {embed_dim * 4}; li->weights[1] = create_weight_legion_ordering(1, dims, - DT_FLOAT, + data_type, li, true /*create_grad*/, kernel_initializer, CHOSEN_SYNC_TYPE); } - li->data_type = DT_FLOAT; + li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); li->add_int_property("kdim", kdim); @@ -208,7 +225,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( // Initializer* _bias_initializer) : Op(model, OP_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, + _input->data_type, name, 1 /*inputs*/, (_bias ? 2 : 1), /*weights*/ @@ -260,8 +277,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[0] = model.create_parallel_weight<3>(dims, - DT_FLOAT, - NULL /*owner_op*/, + this->data_type, + nullptr /*owner_op*/, true /*create_grad*/, initializer, comm_type); @@ -280,7 +297,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[1] = model.create_parallel_weight<2>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, NULL, @@ -288,7 +305,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( } outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, DT_FLOAT, this); + _input->num_dims, dims, this->data_type, this); /* for (int i = 0; i < numdim; i++) { */ /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ /* } */ @@ -317,7 +334,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( // Initializer* _bias_initializer) : Op(model, OP_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, + _input->data_type, name, 1 /*inputs*/, (_bias ? 
2 : 1), /*weights*/ @@ -367,7 +384,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[0] = model.create_parallel_weight<3>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, initializer, @@ -385,14 +402,14 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[1] = model.create_parallel_weight<2>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, NULL, comm_type); } outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, DT_FLOAT, this); + _input->num_dims, dims, this->data_type, this); /* for (int i = 0; i < numdim; i++) { */ /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ @@ -547,12 +564,27 @@ OpMeta *IncMultiHeadSelfAttention::init_task( (IncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); @@ -565,10 +597,11 @@ OpMeta *IncMultiHeadSelfAttention::init_task( .best_affinity_to(task->target_proc) .first(); IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( - handle, attn, weight.get_float_ptr(), gpu_mem, num_samples, num_heads); + handle, attn, weight, gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; - assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); + assert(weight.domain.get_volume() * data_type_size(weight.data_type) == + m->weightSize); return m; } @@ -645,8 +678,6 @@ void IncMultiHeadSelfAttention::inference_task( assert(task->regions.size() == regions.size()); - float const *bias_ptr = NULL; - BatchConfig const *bc = (BatchConfig *)task->args; IncMultiHeadSelfAttentionMeta const *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); @@ -659,19 +690,17 @@ void IncMultiHeadSelfAttention::inference_task( m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - + GenericTensorAccessorR biases; if (*m->bias) { - GenericTensorAccessorR biases = - helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); Domain bias_domain = runtime->get_index_space_domain( ctx, 
task->regions[3].region.get_index_space()); assert(bias_domain.get_dim() == 2); - bias_ptr = biases.get_float_ptr(); } Domain input_domain = runtime->get_index_space_domain( @@ -685,12 +714,8 @@ void IncMultiHeadSelfAttention::inference_task( assert(weight_domain.get_dim() == 3); assert(output_domain.get_dim() == 4); - IncMultiHeadSelfAttention::inference_kernel_wrapper(m, - bc, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr); + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, bc, input, weight, output, biases); #ifdef INFERENCE_TESTS printf("Checking IncMultiHeadSelfAttention computations...\n"); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 0669b347a7..a627e0ef08 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -27,10 +27,10 @@ using Legion::Memory; void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -60,7 +60,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Memory gpu_mem, int num_samples, int _num_heads) @@ -87,7 +87,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bool _qk_prod_scaling, bool _add_bias_kv, float _scaling_factor, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Memory gpu_mem, int num_samples, int _num_heads) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index bc4c91aa4f..33bc32224b 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -15,6 +15,7 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -28,8 +29,9 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { -__global__ void build_w_out_tensor(float const *weight_ptr, - float *contiguous_weight_ptr, +template +__global__ void build_w_out_tensor(DT const *weight_ptr, + DT *contiguous_weight_ptr, int vProjSize, int oProjSize, int num_heads, @@ -44,8 +46,9 @@ __global__ void build_w_out_tensor(float const *weight_ptr, } } -__global__ void apply_proj_bias_w(float *input_ptr, - float const *bias_ptr, +template +__global__ void apply_proj_bias_w(DT *input_ptr, + DT const *bias_ptr, int num_tokens, int oProjSize) { CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { @@ -54,8 +57,9 @@ __global__ void apply_proj_bias_w(float *input_ptr, } } -__global__ void apply_proj_bias_qkv(float *input_ptr, - float const *bias_ptr, +template +__global__ void apply_proj_bias_qkv(DT *input_ptr, + DT const *bias_ptr, int num_tokens, int qProjSize, int kProjSize, @@ -87,8 +91,9 @@ __global__ void apply_proj_bias_qkv(float *input_ptr, } } +template __global__ void - apply_rotary_embedding(float *input_ptr, + 
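
From this point the float-only __global__ kernels in this file become templates over the element type, so one definition serves the float and half builds. A small self-contained example of the idiom (an illustrative kernel, not one of the kernels in this file):

#include <cuda_fp16.h>

// Illustrative dtype-generic kernel in the same style as build_w_out_tensor
// and apply_proj_bias_* above.
template <typename DT>
__global__ void scale_inplace(DT *data, DT factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    data[i] = data[i] * factor;
  }
}

// The host side picks the instantiation per tensor, e.g.:
//   scale_inplace<float><<<blocks, threads, 0, stream>>>(f_ptr, 2.0f, n);
//   scale_inplace<half><<<blocks, threads, 0, stream>>>(h_ptr, __float2half(2.0f), n);
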
apply_rotary_embedding(DT *input_ptr, cuFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, @@ -139,24 +144,25 @@ __global__ void } } +template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; + DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); - cudaDataType_t data_type = ff_to_cuda_datatype(DT_FLOAT); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = CUDA_R_32F; + cudaDataType_t compute_type = cublas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_heads @@ -184,16 +190,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, k, &alpha, weight_ptr, - data_type, + cublas_data_type, lda, strideA, input_ptr, - data_type, + cublas_data_type, ldb, strideB, &beta, output_ptr, - data_type, + cublas_data_type, ldc_q, strideC, m->num_heads, @@ -208,16 +214,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, k, &alpha, weight_ptr + m_q * k, - data_type, + cublas_data_type, lda, strideA, input_ptr, - data_type, + cublas_data_type, ldb, strideB, &beta, output_ptr + m_q * n, - data_type, + cublas_data_type, ldc_k, strideC, m->num_heads, @@ -232,16 +238,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, k, &alpha, weight_ptr + (m_q + m_k) * k, - data_type, + cublas_data_type, lda, strideA, input_ptr, - data_type, + cublas_data_type, ldb, strideB, &beta, output_ptr + (m_q + m_k) * n, - data_type, + cublas_data_type, ldc_v, strideC, m->num_heads, @@ -305,13 +311,83 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_tokens(); + if (num_tokens > 0) { + int parallelism = m->kProjSize * num_tokens * m->num_heads; + store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + BatchConfig::MAX_SEQ_LENGTH, + /* k_cache = */ true); + + parallelism = m->vProjSize * num_tokens * m->num_heads; + store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + BatchConfig::MAX_SEQ_LENGTH, + /* k_cache = */ false); + } +} + +template +void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + // here because we need postion info in infernece 1 + cudaMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, + bc, + input_ptr, + weight_ptr, + static_cast
<DT *>(m->devQKVProjArray), + bias_ptr, + stream); + + // phase 2: Update key/val cache + update_kv_cache_kernel<DT>
(m, bc, stream); + + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); +} + } // namespace IncMultiHeadAttention } // namespace Kernels using namespace Kernels::IncMultiHeadAttention; -__global__ void store_kv_cache(float const *devQKVProjArray, - float *cache_ptr, +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *cache_ptr, BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, @@ -330,7 +406,7 @@ __global__ void store_kv_cache(float const *devQKVProjArray, int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; int current_head_block_size = num_tokens * (k_cache ? qProjSize : qProjSize + kProjSize); - float val = + DT val = devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + token_idx * proj_size + data_idx]; // int const req_id = id_map[token_idx].request_index; @@ -344,49 +420,13 @@ __global__ void store_kv_cache(float const *devQKVProjArray, } } -void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - if (num_tokens > 0) { - int parallelism = m->kProjSize * num_tokens * m->num_heads; - store_kv_cache<<>>(m->devQKVProjArray, - m->keyCache, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ true); - - parallelism = m->vProjSize * num_tokens * m->num_heads; - store_kv_cache<<>>(m->devQKVProjArray, - m->valueCache, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ false); - } -} - -__global__ void fill_entries_above_diagonal(float *matrix, +template +__global__ void fill_entries_above_diagonal(DT *matrix, size_t num_rows, size_t num_cols, size_t num_heads, size_t entries_above_diagonal, - float value) { + DT value) { CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { size_t head_idx = i / entries_above_diagonal; size_t entry_idx = i % entries_above_diagonal; @@ -397,19 +437,22 @@ __global__ void fill_entries_above_diagonal(float *matrix, } } +template void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - float *output_ptr, - float const *bias_ptr, + DT *output_ptr, + DT const *bias_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = CUDA_R_32F; + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); @@ -440,16 +483,16 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideC = num_new_tokens * total_tokens; // a flag of using this scaling alpha - float alpha = 1.0f, beta = 0.0f; + DT alpha = 1.0f, beta = 0.0f; if (*m->qk_prod_scaling) { - alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + alpha = 
static_cast<DT>
(1.0f / sqrt(m->kProjSize)); } // To get A, skip over Q entries from previous requests (same head) - void const *A = (void const *)(m->devQKVProjArray + - tokens_previous_requests * m->qProjSize); + void const *A = static_cast
<DT *>(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize; // To get B, skip over K entries from previous requests (all heads + // padding) - void const *B = (void const *)(m->keyCache + i * kt_req_block_size); + void const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests void *C = (void *)(m->qk_prods); @@ -486,12 +529,12 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, fill_entries_above_diagonal<<>>((float *)C, + stream>>>(static_cast
(C), num_new_tokens, total_tokens, m->num_heads, entries_above_diagonal, - -INFINITY); + static_cast
(-INFINITY)); } // Compute Softmax(QK^T/sqrt(d_k)) cudnnTensorDescriptor_t qk_tensor; @@ -511,12 +554,12 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int w_param = num_new_tokens; checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, + cudnn_data_type, n_param, c_param, h_param, w_param)); - alpha = 1.0f, beta = 0.0f; + float softmax_alpha = 1.0f, softmax_beta = 0.0f; void *C_softmax = (void *)(m->qk_prods_softmax); // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The @@ -525,12 +568,12 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, + &softmax_alpha, qk_tensor, - (void *)((float *)C), - &beta, + C, + &softmax_beta, qk_tensor, - (void *)((float *)C_softmax))); + C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; m_ = num_new_tokens; @@ -542,14 +585,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, strideC = num_new_tokens * m->vProjSize; // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous // requests (all heads) - A = (void const *)C_softmax; + A = static_cast
<DT *>(C_softmax); // To get B, skip over V^T entries from previous requests (all heads + // padding) - B = (void const *)(m->valueCache + i * vt_req_block_size); + B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - C = (void *)(m->attn_heads + - tokens_previous_requests * m->num_heads * m->vProjSize); + C = static_cast
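
Throughout compute_qkv_kernel and compute_attention_kernel the cuBLAS/cuDNN descriptor types are now derived from the op's output type instead of being pinned to CUDA_R_32F / CUDNN_DATA_FLOAT. Condensed, the selection this diff relies on looks like the fragment below; the pre-CUDA 11 branch falls back to computing in the storage type because older cuBLAS takes a cudaDataType_t as its compute type:

// Fragment: runtime type selection for the GEMM and softmax calls above.
cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
#if CUDA_VERSION >= 11000
cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; // default used by this patch
#else
cudaDataType_t compute_type = cublas_data_type;        // compute in the storage type
#endif
// Every A/B/C pointer passed to cublasGemmEx / cublasGemmStridedBatchedEx is
// then described with cublas_data_type, and qk_tensor is created with
// cudnn_data_type before cudnnSoftmaxForward.
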
(m->attn_heads) + + tokens_previous_requests * m->num_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, @@ -580,9 +623,9 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, k = m->vProjSize * m->num_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = (void const *)m->W_out_contiguous; - B = (void const *)C; - C = (void *)(output_ptr + tokens_previous_requests * m->oProjSize); + A = m->W_out_contiguous; + B = C; + C = (output_ptr + tokens_previous_requests * m->oProjSize); checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -623,12 +666,13 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -637,40 +681,34 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - // reload the weight_o - - if (!(*m->has_load_weights)) { - int parallelism = m->vProjSize * m->oProjSize * m->num_heads; - build_w_out_tensor<<>>(weight_ptr, - m->W_out_contiguous, - m->vProjSize, - m->oProjSize, - m->num_heads, - (m->qSize * m->qProjSize + - m->kSize * m->kProjSize + - m->vSize * m->vProjSize)); - *m->has_load_weights = true; + assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (use_bias) { + assert(input.data_type == bias.data_type); + } + if (input.data_type == DT_HALF) { + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::inference_kernel(m, + bc, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); + } else if (input.data_type == DT_FLOAT) { + float const *bias_ptr = + use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::inference_kernel(m, + bc, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); } - // here because we need postion info in infernece 1 - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel( - m, bc, input_ptr, weight_ptr, m->devQKVProjArray, bias_ptr, stream); - - // phase 2: Update key/val cache - update_kv_cache_kernel(m, bc, stream); - - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); - if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -688,7 +726,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Memory gpu_mem, int num_samples, int _num_heads) @@ -708,7 +746,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->qk_prod_scaling, attn->add_bias_kv, attn->scaling_factor, - weight_ptr, + weight, gpu_mem, num_samples, _num_heads) {} @@ -730,7 +768,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bool _qk_prod_scaling, bool _add_bias_kv, float _scaling_factor, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Memory gpu_mem, int num_samples, int _num_heads) @@ -738,7 +776,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); - qSize = _qSize; kSize = _kSize; vSize = _vSize; @@ -750,11 +787,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(qProjSize == kProjSize); // required for attention QK^T matmul vProjSize = _vProjSize; oProjSize = _oProjSize; + size_t size_of_dt = data_type_size(attn->data_type); num_heads = _num_heads; weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)); - weightSize = weights_params * num_heads * sizeof(float); + weightSize = weights_params * num_heads * size_of_dt; has_load_weights = (bool *)calloc(1, sizeof(bool)); *has_load_weights = false; apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); @@ -818,7 +856,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * - sizeof(float) + + size_of_dt + tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + complex_size * sizeof(cuFloatComplex); // more components will // be added here later @@ -834,29 +872,54 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 0, Realm::ProfilingRequestSet()) .wait(); - devQKVProjArray = (float *)reserveInst.pointer_untyped(0, sizeof(char)); - keyCache = (float *)devQKVProjArray + qkv_max_proj_size; - valueCache = (float *)keyCache + key_cache_size; - token_infos = (BatchConfig::PerTokenInfo *)(valueCache + value_cache_size); - qk_prods = (float *)(token_infos + tokeninfo_size); - qk_prods_softmax = (float *)(qk_prods + qk_prod_size); - attn_heads = (float *)qk_prods_softmax + qk_prod_size; - W_out_contiguous = (float *)attn_heads + attn_heads_size; - complex_input = - (cuFloatComplex *)(W_out_contiguous + W_out_contiguous_size); - int parallelism = vProjSize * oProjSize * num_heads; - build_w_out_tensor<<>>( - weight_ptr, - W_out_contiguous, - vProjSize, - oProjSize, - num_heads, - (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); + off_t offset = 0; + devQKVProjArray = reserveInst.pointer_untyped(offset, 0); + offset += qkv_max_proj_size * size_of_dt; + keyCache = reserveInst.pointer_untyped(offset, 0); + offset += key_cache_size * size_of_dt; + valueCache = reserveInst.pointer_untyped(offset, 0); + offset += value_cache_size * size_of_dt; + token_infos = reserveInst.pointer(offset); + offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; + qk_prods = reserveInst.pointer_untyped(offset, 0); + offset += qk_prod_size * size_of_dt; + qk_prods_softmax = reserveInst.pointer_untyped(offset, 0); + offset += qk_prod_size * size_of_dt; + attn_heads = reserveInst.pointer_untyped(offset, 0); + offset += attn_heads_size * size_of_dt; + W_out_contiguous = reserveInst.pointer_untyped(offset, 0); + offset += W_out_contiguous_size * size_of_dt; + complex_input = reserveInst.pointer(offset); + offset += complex_size * sizeof(cuFloatComplex); + if (weight.data_type == DT_FLOAT) { + int parallelism = vProjSize * oProjSize * num_heads; + build_w_out_tensor<<>>( + weight.get_float_ptr(), + (float *)W_out_contiguous, + vProjSize, + oProjSize, + num_heads, + (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); + } else if (weight.data_type == DT_HALF) { + int parallelism = vProjSize * oProjSize * num_heads; + build_w_out_tensor<<>>( + weight.get_half_ptr(), + (half *)W_out_contiguous, + vProjSize, + oProjSize, + num_heads, + (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); + } else { + assert(false && "Unsupported data_type"); + } + assert(offset == totalSize); } - cudaStreamSynchronize(stream); } diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index 4cdc839b59..3aef875d1f 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -22,7 +22,8 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; -ElementBinaryMeta::ElementBinaryMeta(FFHandler 
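
The rewritten meta constructor above sizes everything with size_of_dt = data_type_size(attn->data_type) and carves its scratch buffers out of a single Realm instance by walking a byte offset. A compressed sketch of that sub-allocation invariant (the suballoc lambda is a hypothetical restatement, not code from the patch):

// Sketch: every buffer starts at the running byte offset and advances it by
// element_count * size_of_dt; the final offset must equal totalSize.
off_t offset = 0;
auto suballoc = [&](size_t element_count) {
  void *ptr = reserveInst.pointer_untyped(offset, 0);
  offset += element_count * size_of_dt;
  return ptr;
};
devQKVProjArray = suballoc(qkv_max_proj_size);
keyCache = suballoc(key_cache_size);
valueCache = suballoc(value_cache_size);
// ... qk_prods, qk_prods_softmax, attn_heads, W_out_contiguous follow, plus the
// separately-typed token_infos and complex_input regions ...
assert(offset == totalSize);

Since data_type_size(DT_HALF) is 2 bytes against 4 for DT_FLOAT, running the op in half precision halves the KV-cache and scratch footprint without any further change to this logic.
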
handler) : OpMeta(handler) { +ElementBinaryMeta::ElementBinaryMeta(FFHandler handler, Op const *op) + : OpMeta(handler, op) { checkCUDNN(miopenCreateTensorDescriptor(&input1Tensor)); checkCUDNN(miopenCreateTensorDescriptor(&input2Tensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); @@ -67,9 +68,9 @@ void init_kernel(ElementBinaryMeta *m, /*static*/ void forward_kernel_wrapper(ElementBinaryMeta const *m, - float const *in1_ptr, - float const *in2_ptr, - float *out_ptr) { + GenericTensorAccessorR const &in1, + GenericTensorAccessorR const &in2, + GenericTensorAccessorW const &out) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -81,7 +82,8 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, } // print_tensor(in1_ptr, in1_domain.get_volume(), "input1:"); // print_tensor(in2_ptr, in2_domain.get_volume(), "input2:"); - Internal::forward_kernel(m, in1_ptr, in2_ptr, out_ptr, stream); + Internal::forward_kernel( + m, in1.get_float_ptr(), in2.get_float_ptr(), out.get_float_ptr(), stream); // print_tensor(out_ptr, in1_domain.get_volume(), "output:"); if (m->profiling) { hipEventRecord(t_end, stream); @@ -238,10 +240,11 @@ __global__ void elewise_binary_backward_kernel(coord_t volume, } /*static*/ +template void forward_kernel(ElementBinaryMeta const *m, - float const *in1_ptr, - float const *in2_ptr, - float *out_ptr, + DT const *in1_ptr, + DT const *in2_ptr, + DT *out_ptr, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index 5ef4eb1142..0cbff73b82 100644 --- a/src/ops/kernels/element_binary_kernels.cu +++ b/src/ops/kernels/element_binary_kernels.cu @@ -21,7 +21,8 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; -ElementBinaryMeta::ElementBinaryMeta(FFHandler handler) : OpMeta(handler) { +ElementBinaryMeta::ElementBinaryMeta(FFHandler handler, Op const *op) + : OpMeta(handler, op) { checkCUDNN(cudnnCreateTensorDescriptor(&input1Tensor)); checkCUDNN(cudnnCreateTensorDescriptor(&input2Tensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -61,27 +62,28 @@ void init_kernel(ElementBinaryMeta *m, default: assert(false); } + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); checkCUDNN(cudnnSetOpTensorDescriptor( m->opDesc, mode, CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN)); checkCUDNN(cudnnSetReduceTensorDescriptor(m->reduceAddDesc, CUDNN_REDUCE_TENSOR_ADD, - CUDNN_DATA_FLOAT, + cudnn_data_type, CUDNN_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain(m->input1Tensor, input1_domain)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain(m->input2Tensor, input2_domain)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain(m->outputTensor, output_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain( + m->input1Tensor, input1_domain, m->input_type[0])); + checkCUDNN(cudnnSetTensorDescriptorFromDomain( + m->input2Tensor, input2_domain, m->input_type[1])); + checkCUDNN(cudnnSetTensorDescriptorFromDomain( + m->outputTensor, output_domain, m->output_type[0])); } /*static*/ void forward_kernel_wrapper(ElementBinaryMeta const *m, - float const *in1_ptr, - float const *in2_ptr, - float *out_ptr) { + GenericTensorAccessorR const &in1, + GenericTensorAccessorR const &in2, + GenericTensorAccessorW const &out) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -91,7 
+93,20 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::forward_kernel(m, in1_ptr, in2_ptr, out_ptr, stream); + assert(in1.data_type == in2.data_type); + assert(out.data_type == in1.data_type); + if (out.data_type == DT_HALF) { + Internal::forward_kernel( + m, in1.get_half_ptr(), in2.get_half_ptr(), out.get_half_ptr(), stream); + } else if (out.data_type == DT_FLOAT) { + Internal::forward_kernel(m, + in1.get_float_ptr(), + in2.get_float_ptr(), + out.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -292,10 +307,11 @@ __global__ void elewise_binary_backward_kernel(coord_t volume, } /*static*/ +template void forward_kernel(ElementBinaryMeta const *m, - float const *in1_ptr, - float const *in2_ptr, - float *out_ptr, + DT const *in1_ptr, + DT const *in2_ptr, + DT *out_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 8066ddc812..a041f008bc 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -19,7 +19,8 @@ namespace FlexFlow { -LinearMeta::LinearMeta(FFHandler handler, int batch_size) : OpMeta(handler) { +LinearMeta::LinearMeta(FFHandler handler, int batch_size, Linear const *li) + : OpMeta(handler, li) { // Allocate an all-one's vector float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size); for (int i = 0; i < batch_size; i++) { @@ -31,7 +32,7 @@ LinearMeta::LinearMeta(FFHandler handler, int batch_size) : OpMeta(handler) { dram_one_ptr, sizeof(float) * batch_size, hipMemcpyHostToDevice)); - one_ptr = (float const *)fb_one_ptr; + one_ptr = (void *)fb_one_ptr; // Allocate descriptors checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); @@ -96,15 +97,28 @@ void forward_kernel_wrapper(LinearMeta const *m, hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - Internal::forward_kernel(m, - input_ptr, - output_ptr, - weight_ptr, - bias_ptr, - in_dim, - out_dim, - batch_size, - stream); + + if (m->input_type == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } if (m->profiling) { hipEventRecord(t_end, stream); @@ -143,18 +157,34 @@ void backward_kernel_wrapper(LinearMeta const *m, hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - Internal::backward_kernel(m, - input_ptr, - input_grad_ptr, - output_ptr, - output_grad_ptr, - kernel_ptr, - kernel_grad_ptr, - bias_grad_ptr, - in_dim, - out_dim, - batch_size, - stream); + if (m->input_type == DT_FLOAT) { + Internal::backward_kernel(m, + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type == DT_HALF) { + Internal::backward_kernel(m, + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + if (m->profiling) { hipEventRecord(t_end, stream); 
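
The element-binary and linear wrappers above all repeat the same shape: the public wrapper stays untyped, inspects m->input_type (or the accessor's data_type), and forwards to a templated Internal kernel. The shared skeleton is roughly the sketch below; note that some wrappers in this patch assert on unknown types while others silently skip them:

// Sketch of the wrapper-level dispatch used by the element-binary and linear
// kernels (not a shared helper the patch introduces).
if (m->input_type == DT_FLOAT) {
  Internal::forward_kernel<float>(
      m, input_ptr, output_ptr, weight_ptr, bias_ptr, in_dim, out_dim, batch_size, stream);
} else if (m->input_type == DT_HALF) {
  Internal::forward_kernel<half>(
      m, input_ptr, output_ptr, weight_ptr, bias_ptr, in_dim, out_dim, batch_size, stream);
} else {
  assert(false && "Unsupported data type");
}
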
checkCUDA(hipEventSynchronize(t_end)); @@ -189,7 +219,7 @@ Parameter* Linear::get_parameter(int index) */ namespace Internal { - +template void forward_kernel(LinearMeta const *m, void const *input_ptr, void *output_ptr, @@ -201,7 +231,7 @@ void forward_kernel(LinearMeta const *m, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; + DT alpha = 1.0f, beta = 0.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type); @@ -209,7 +239,7 @@ void forward_kernel(LinearMeta const *m, // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - hipblasDatatype_t compute_type = HIPBLAS_R_32F; + hipblasDatatype_t compute_type = input_type; #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, @@ -242,8 +272,8 @@ void forward_kernel(LinearMeta const *m, bias_ptr, weight_type, 1, - m->one_ptr, - HIPBLAS_R_32F, + static_cast
(m->one_ptr), + weight_type, 1, &alpha, output_ptr, @@ -281,6 +311,7 @@ void forward_kernel(LinearMeta const *m, } } +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -296,7 +327,7 @@ void backward_kernel(LinearMeta const *m, checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - float alpha = 1.0f; + DT alpha = 1.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type); diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 3f408c7cb0..02b018566e 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -13,24 +13,34 @@ * limitations under the License. */ +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -LinearMeta::LinearMeta(FFHandler handler, int batch_size) : OpMeta(handler) { +LinearMeta::LinearMeta(FFHandler handler, int batch_size, Linear const *li) + : OpMeta(handler, li) { // Allocate an all-one's vector - float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size); - for (int i = 0; i < batch_size; i++) { - dram_one_ptr[i] = 1.0f; + DataType data_type = li->data_type; + checkCUDA(cudaMalloc(&one_ptr, data_type_size(data_type) * batch_size)); + int parallelism = batch_size; + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (data_type == DT_FLOAT) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((float *)one_ptr, batch_size); + } else if (data_type == DT_HALF) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((half *)one_ptr, batch_size); } - float *fb_one_ptr; - checkCUDA(cudaMalloc(&fb_one_ptr, sizeof(float) * batch_size)); - checkCUDA(cudaMemcpy(fb_one_ptr, - dram_one_ptr, - sizeof(float) * batch_size, - cudaMemcpyHostToDevice)); - one_ptr = (float const *)fb_one_ptr; + // Allocate descriptors checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -97,15 +107,27 @@ void forward_kernel_wrapper(LinearMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::forward_kernel(m, - input_ptr, - output_ptr, - weight_ptr, - bias_ptr, - in_dim, - out_dim, - batch_size, - stream); + if (m->input_type == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } if (m->profiling) { cudaEventRecord(t_end, stream); @@ -143,18 +165,34 @@ void backward_kernel_wrapper(LinearMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::backward_kernel(m, - input_ptr, - input_grad_ptr, - output_ptr, - output_grad_ptr, - kernel_ptr, - kernel_grad_ptr, - bias_grad_ptr, - in_dim, - out_dim, - batch_size, - stream); + if (m->input_type == DT_FLOAT) { + Internal::backward_kernel(m, + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type == DT_HALF) { + Internal::backward_kernel(m, + input_ptr, + input_grad_ptr, + output_ptr, + output_grad_ptr, + kernel_ptr, + 
kernel_grad_ptr, + bias_grad_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -189,6 +227,7 @@ Parameter* Linear::get_parameter(int index) */ namespace Internal { +template void forward_kernel(LinearMeta const *m, void const *input_ptr, void *output_ptr, @@ -200,15 +239,16 @@ void forward_kernel(LinearMeta const *m, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - float alpha = 1.0f, beta = 0.0f; + DT alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type); + assert(input_type == weight_type && weight_type == output_type); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = CUDA_R_32F; + cudaDataType_t compute_type = input_type; #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -241,8 +281,8 @@ void forward_kernel(LinearMeta const *m, bias_ptr, weight_type, 1, - m->one_ptr, - CUDA_R_32F, + static_cast
(m->one_ptr), + weight_type, 1, &alpha, output_ptr, @@ -273,6 +313,7 @@ void forward_kernel(LinearMeta const *m, } } +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -288,7 +329,8 @@ void backward_kernel(LinearMeta const *m, checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - float alpha = 1.0f; + DT alpha = 1.0f; + float sgeam_alpha = 1.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type); @@ -338,7 +380,7 @@ void backward_kernel(LinearMeta const *m, CUBLAS_OP_N, in_dim, out_dim, - &alpha, + &sgeam_alpha, (float *)kernel_grad_ptr, in_dim, &(m->kernel_reg_lambda), @@ -361,7 +403,7 @@ void backward_kernel(LinearMeta const *m, out_dim, batch_size, &alpha, - m->one_ptr, + static_cast
(m->one_ptr), CUDA_R_32F, 1, output_grad_ptr, @@ -399,6 +441,13 @@ void backward_kernel(LinearMeta const *m, } } +template +__global__ void build_one_ptr(DT *one_ptr, int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size) { + one_ptr[i] = static_cast
(1.0f); + } +} + } // namespace Internal } // namespace Linear } // namespace Kernels diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index f7945b316d..bf5b1021ae 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/rms_norm_kernels.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -36,8 +37,9 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms) batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; - checkCUDA(cudaMalloc(&rms_ptr, batch_size * sizeof(float))); - checkCUDA(cudaMalloc(&norm_ptr, num_elements * sizeof(float))); + DataType data_type = rms->weights[0]->data_type; + checkCUDA(cudaMalloc(&rms_ptr, batch_size * data_type_size(data_type))); + checkCUDA(cudaMalloc(&norm_ptr, num_elements * data_type_size(data_type))); } namespace Kernels { @@ -83,27 +85,18 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { template __global__ void - RowwiseRootMeanSquareKernel(int64_t N, T eps, T const *X, T *rms) { - __shared__ T v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; - T sum = 0; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - sum += static_cast(X[index]) * static_cast(X[index]); + RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { + __shared__ float v_shared[C10_WARP_SIZE]; + long long const i = blockIdx.x; + float sum = 0.0f; + for (long long j = threadIdx.x; j < N; j += blockDim.x) { + long long const index = i * N + j; + sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum(sum, v_shared); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); // use BlockReduceSum() to sum X_ij^2 if (threadIdx.x == 0) { - rms[i] = rsqrt((sum / static_cast(N)) + static_cast(eps)); - // printf("index: %d, rms norm mean value: %.15f, rms norm sum value: " - // "%.20f, eps: %f, value: %.20f, num:%d, num2: %d\n", - // i, - // sum / static_cast(N), - // sum, - // static_cast(eps), - // rms[i], - // blockDim.x, - // warpSize); + rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); } } @@ -117,23 +110,51 @@ __global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { } } +template __global__ void elewise_apply_weights(int64_t batch_size, int64_t in_dim, - float const *norm, - float const *weights, - float *output) { + T const *norm, + T const *weights, + T *output) { CUDA_KERNEL_LOOP(i, batch_size * in_dim) { output[i] = norm[i] * weights[i % in_dim]; } } +template +void forward_kernel(RMSNormMeta const *m, + T const *input_ptr, + T const *weight_ptr, + T *output_ptr, + cudaStream_t stream) { + int parallelism = m->batch_size * m->in_dim; + RowwiseRootMeanSquareKernel + <<batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( + m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr)); + NormKernel<<batch_size, kCUDANumThreads, 0, stream>>>( + m->in_dim, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + elewise_apply_weights<<>>(m->batch_size, + m->in_dim, + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); +} + void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - int parallelism = m->batch_size 
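
One detail worth calling out in the rewritten RowwiseRootMeanSquareKernel (and in the LayerNorm moments kernel later in this patch): even when T is half, the running sums are kept in float and only the final value is cast back to T, since summing half values directly loses precision and can overflow for long rows. A reduced illustration of the idiom, assuming nothing beyond cuda_fp16.h:

#include <cuda_fp16.h>

// Illustration of the accumulate-in-float idiom (serial loop for clarity; the
// real kernels reduce across a thread block).
template <typename T>
__device__ float sum_of_squares(T const *x, long long n) {
  float sum = 0.0f;                      // float accumulator regardless of T
  for (long long j = 0; j < n; j++) {
    float v = static_cast<float>(x[j]);  // widen before multiplying
    sum += v * v;
  }
  return sum;  // caller applies eps and rsqrt, then casts back to T
}
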
* m->in_dim; cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -141,21 +162,24 @@ void forward_kernel_wrapper(RMSNormMeta const *m, cudaEventRecord(t_start, stream); } - RowwiseRootMeanSquareKernel - <<batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->in_dim, m->eps, input.get_float_ptr(), m->rms_ptr); - - NormKernel<<batch_size, kCUDANumThreads, 0, stream>>>( - m->in_dim, input.get_float_ptr(), m->rms_ptr, m->norm_ptr); + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } - elewise_apply_weights<<>>(m->batch_size, - m->in_dim, - m->norm_ptr, - weight.get_float_ptr(), - output.get_float_ptr()); if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index d63bd0edc5..d09a5aaf6d 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -36,9 +36,10 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { +template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr) { + DT const *input_ptr, + DT *output_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -64,9 +65,10 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } +template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, + DT *input_grad_ptr, + DT const *output_grad_ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -94,11 +96,27 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -namespace Internal { +template void forward_kernel_wrapper(SoftmaxMeta const *m, + float const *input_ptr, + float *output_ptr); +template void forward_kernel_wrapper(SoftmaxMeta const *m, + half const *input_ptr, + half *output_ptr); + +template void backward_kernel_wrapper(SoftmaxMeta const *m, + float *input_grad_ptr, + float const *output_grad_ptr, + size_t num_elements); +template void backward_kernel_wrapper(SoftmaxMeta const *m, + half *input_grad_ptr, + half const *output_grad_ptr, + size_t num_elements); +namespace Internal { +template void forward_kernel(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr, + DT const *input_ptr, + DT *output_ptr, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); @@ -114,13 +132,14 @@ void forward_kernel(SoftmaxMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); } -void backward_kernel(float *input_grad_ptr, - float const *output_grad_ptr, +template +void backward_kernel(DT *input_grad_ptr, + DT const *output_grad_ptr, size_t num_elements, hipStream_t stream) { checkCUDA(hipMemcpyAsync(input_grad_ptr, output_grad_ptr, - num_elements * sizeof(float), + num_elements * sizeof(DT), hipMemcpyDeviceToDevice, stream)); } diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index c9415a89a2..80683e7a2d 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -26,8 +26,8 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { 
checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); @@ -36,9 +36,10 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { +template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr) { + DT const *input_ptr, + DT *output_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -62,11 +63,13 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, log_measure.debug( "%s [Softmax] forward time = %.2fms\n", m->op_name, elapsed); } + } +template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, + DT *input_grad_ptr, + DT const *output_grad_ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -94,11 +97,26 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -namespace Internal { +template void forward_kernel_wrapper(SoftmaxMeta const *m, + float const *input_ptr, + float *output_ptr); +template void forward_kernel_wrapper(SoftmaxMeta const *m, + half const *input_ptr, + half *output_ptr); +template void backward_kernel_wrapper(SoftmaxMeta const *m, + float *input_grad_ptr, + float const *output_grad_ptr, + size_t num_elements); +template void backward_kernel_wrapper(SoftmaxMeta const *m, + half *input_grad_ptr, + half const *output_grad_ptr, + size_t num_elements); +namespace Internal { +template void forward_kernel(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr, + DT const *input_ptr, + DT *output_ptr, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -114,13 +132,14 @@ void forward_kernel(SoftmaxMeta const *m, output_ptr)); } -void backward_kernel(float *input_grad_ptr, - float const *output_grad_ptr, +template +void backward_kernel(DT *input_grad_ptr, + DT const *output_grad_ptr, size_t num_elements, cudaStream_t stream) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, output_grad_ptr, - num_elements * sizeof(float), + num_elements * sizeof(DT), cudaMemcpyDeviceToDevice, stream)); } diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 5103920413..3b20f932e2 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -61,6 +61,7 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector const &axes, bool elementwise_affine, float eps, + DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of // the input tensor. However, since the tensor dimensions are reversed in @@ -93,15 +94,32 @@ Tensor FFModel::layer_norm(const Tensor input, } } #endif + if (data_type == DT_NONE) { + data_type = input->data_type; + } int num_weights = elementwise_affine ? 
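
Because the softmax wrappers are now templates defined in the .cu/.cpp file rather than in a header, the file must explicitly instantiate them for every element type a caller may request, which is what the template void ... lines above do; without them, a translation unit such as softmax.cc that calls the half variant would compile but fail at link time. The pattern, repeated here for reference, is:

// Explicit instantiations emit float and half symbols for the linker while the
// template definition stays out of the header (pattern from softmax.cu above).
template void forward_kernel_wrapper(SoftmaxMeta const *m,
                                     float const *input_ptr,
                                     float *output_ptr);
template void forward_kernel_wrapper(SoftmaxMeta const *m,
                                     half const *input_ptr,
                                     half *output_ptr);
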
2 : 0; - Layer *ln = new Layer(this, - OP_LAYERNORM, - DT_FLOAT, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); + Layer *ln = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + casted_input); + } else { + ln = new Layer(this, + OP_LAYERNORM, + data_type, + name, + 1 /*inputs*/, + num_weights, + 1 /*outputs*/, + input); + } + ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, input->dims, input->data_type, @@ -217,7 +235,7 @@ LayerNorm::LayerNorm(FFModel &model, weights[0] = model.create_parallel_weight_legion_ordering(axes.size(), dims, - DT_FLOAT, + _input->data_type, NULL /*owner_op*/, true /*create_grad*/, gamma_initializer, @@ -225,7 +243,7 @@ LayerNorm::LayerNorm(FFModel &model, weights[1] = model.create_parallel_weight_legion_ordering(axes.size(), dims, - DT_FLOAT, + _input->data_type, NULL /*owner_op*/, true /*create_grad*/, beta_initializer, @@ -337,6 +355,8 @@ OpMeta *LayerNorm::init_task(Task const *task, LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); LayerNormMeta *meta = new LayerNormMeta(handle, ln); + meta->input_type[0] = ln->inputs[0]->data_type; + meta->output_type[0] = ln->outputs[0]->data_type; return meta; } @@ -447,14 +467,21 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorR in; + GenericTensorAccessorW out, gamma, beta; + Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); + // in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + in = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); + // out_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + out = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(in_domain == out_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); @@ -462,12 +489,16 @@ void LayerNorm::forward_task(Task const *task, assert(regions.size() == 4); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + // gamma_ptr = helperGetTensorPointerRW( + // regions[2], task->regions[2], FID_DATA, ctx, runtime); + gamma = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - beta_ptr = helperGetTensorPointerRW( - regions[3], task->regions[3], FID_DATA, ctx, runtime); + // beta_ptr = helperGetTensorPointerRW( + // regions[3], task->regions[3], FID_DATA, ctx, runtime); + beta = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], 
FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); int numdims = gamma_domain.get_dim(); @@ -479,9 +510,7 @@ void LayerNorm::forward_task(Task const *task, } else { assert(regions.size() == 2); } - - LayerNorm::forward_kernel_wrapper( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr); + LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } void LayerNorm::backward(FFModel const &ff) { @@ -615,19 +644,26 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, if (!inputs[0]->get_sub_tensor(mv, sub_input)) { return false; } + Domain input_domain = sub_input.get_domain(); + Domain output_domain = sub_output.get_domain(); LayerNormMeta *m = new LayerNormMeta(sim->handler, this); sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(in_ptr != NULL); + GenericTensorAccessorR input1_acc(inputs[0]->data_type, input_domain, in_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(out_ptr != NULL); + GenericTensorAccessorW output_acc( + outputs[0]->data_type, output_domain, out_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); // FIXME please add gamma_ptr and beta_ptr after finish the implementation float *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorW gamma_acc; + GenericTensorAccessorW beta_acc; bool out_of_memory = (in_ptr == NULL) || (out_ptr == NULL) || @@ -640,7 +676,7 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, std::function forward, backward; forward = [&] { - forward_kernel_wrapper(m, in_ptr, out_ptr, gamma_ptr, beta_ptr); + forward_kernel_wrapper(m, input1_acc, output_acc, gamma_acc, beta_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index c3030e20b4..3f1c621e71 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -79,26 +79,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -132,7 +132,7 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, hipStream_t stream) { - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), m->effective_batch_size, kCUDABlockReduceNumThreads, 0, @@ -140,33 +140,47 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, m->effective_num_elements, m->eps, in_ptr, - m->mean_ptr, - m->rstd_ptr); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), m->effective_batch_size, kCUDANumThreads, 0, stream, m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } } template @@ -367,8 +381,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, output_grad_ptr, input_ptr, gamma_ptr, - m->ds_ptr, - m->db_ptr); + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), B, @@ -377,12 +391,12 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, stream, M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -396,8 +410,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -414,8 +428,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } @@ -443,11 +457,6 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, stream); } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index ac477ba2ad..35616de980 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -13,6 +13,7 @@ * limitations under the License. 
*/ +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -30,12 +31,19 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; eps = ln->eps; - checkCUDA(cudaMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(cudaMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + checkCUDA( + cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); + checkCUDA( + cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); } template @@ -77,26 +85,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseMomentsCUDAKernel(int64_t N, T eps, T const *X, T *mean, T *rstd) { - __shared__ T m_shared[C10_WARP_SIZE]; - __shared__ T v_shared[C10_WARP_SIZE]; +__global__ void RowwiseMomentsCUDAKernel( + int64_t N, float eps, T const *X, T *mean, T *rstd) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T sum1 = 0; - T sum2 = 0; + float sum1 = 0.0f; + float sum2 = 0.0f; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { - const T scale = T(1) / static_cast(N); + float const scale = float(1) / static_cast(N); sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, T(0)); - mean[i] = sum1; - rstd[i] = rsqrt(sum2 + static_cast(eps)); + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); } } @@ -130,27 +138,30 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T *gamma_ptr, T *beta_ptr, cudaStream_t stream) { - RowwiseMomentsCUDAKernel + RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->effective_num_elements, m->eps, in_ptr, m->mean_ptr, m->rstd_ptr); - LayerNormForwardCUDAKernel + m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); + LayerNormForwardCUDAKernel <<effective_batch_size, kCUDANumThreads, 0, stream>>>( m->effective_num_elements, in_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, out_ptr); } /*static*/ -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - T const *in_ptr, - T *out_ptr, - T *gamma_ptr, - T *beta_ptr) { + GenericTensorAccessorR const 
&input, + GenericTensorAccessorW &output, + GenericTensorAccessorW &gamma, + GenericTensorAccessorW &beta) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -160,8 +171,24 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - LayerNorm::forward_kernel( - m, in_ptr, out_ptr, gamma_ptr, beta_ptr, stream); + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr(), + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel(m, + input.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + beta.get_half_ptr(), + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -170,8 +197,8 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed); - print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); - print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); } } @@ -366,17 +393,22 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, const int64_t N = m->effective_num_elements; ComputeInternalGradientsCUDAKernel <<>>( - N, output_grad_ptr, input_ptr, gamma_ptr, m->ds_ptr, m->db_ptr); + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>(M, N, - m->mean_ptr, - m->rstd_ptr, - m->ds_ptr, - m->db_ptr, - m->scale_ptr, - m->bias_ptr); + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly @@ -386,8 +418,8 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, N, output_grad_ptr, input_ptr, - m->mean_ptr, - m->rstd_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_grad_ptr, beta_grad_ptr); } else { @@ -396,14 +428,15 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; GammaBetaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - input_ptr, - m->mean_ptr, - m->rstd_ptr, - gamma_grad_ptr, - beta_grad_ptr); + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } @@ -419,21 +452,28 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } + // }else if(m->output_type[0] == DT_HALF){ + // LayerNorm::backward_kernel(m, + // output_grad_ptr, + // input_ptr, + // input_grad_ptr, + // gamma_ptr, + // gamma_grad_ptr, + // 
beta_grad_ptr, + // stream); + // } } -template void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, - float const *in_ptr, - float *out_ptr, - float *gamma_ptr, - float *beta_ptr); template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, float const *output_grad_ptr, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 51f2fc6eac..19845214e2 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -40,14 +40,31 @@ Tensor FFModel::dense(const Tensor input, RegularizerMode kernel_reg_type, float kernel_reg_lambda, char const *name) { - Layer *li = new Layer(this, - OP_LINEAR, - data_type, - name, - 1 /*inputs*/, - use_bias ? 2 : 1 /*weights*/, - 1 /*outputs*/, - input); + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for dense"); + li = new Layer(this, + OP_LINEAR, + data_type, + name, + 1 /*inputs*/, + use_bias ? 2 : 1 /*weights*/, + 1 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_LINEAR, + data_type, + name, + 1 /*inputs*/, + use_bias ? 2 : 1 /*weights*/, + 1 /*outputs*/, + input); + } + { int numdims = input->num_dims; int dims[MAX_TENSOR_DIM]; @@ -333,12 +350,24 @@ OpMeta *Linear::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - switch (out_domain.get_dim()) { + Linear const *linear = (Linear *)task->args; + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(linear->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + switch (output.domain.get_dim()) { #define DIMFUNC(DIM) \ case DIM: \ - return init_task_with_dim(task, regions, ctx, runtime); + if (output.data_type == DT_HALF) { \ + return init_task_with_dim(task, regions, ctx, runtime); \ + } else if (output.data_type == DT_FLOAT) { \ + return init_task_with_dim(task, regions, ctx, runtime); \ + } else { \ + assert(false && "Unsupported data type"); \ + } LEGION_FOREACH_N(DIMFUNC) #undef DIMFUNC default: @@ -347,7 +376,7 @@ OpMeta *Linear::init_task(Task const *task, return NULL; } -template +template OpMeta *Linear::init_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, @@ -358,18 +387,18 @@ OpMeta *Linear::init_task_with_dim(Task const *task, FFHandler handle = *((FFHandler const *)task->local_args); // TensorAccessorR acc_input( // regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_output(regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - TensorAccessorW acc_kernel(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); + TensorAccessorW acc_output(regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); + TensorAccessorW acc_kernel(regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); // int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; @@ -380,7 +409,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, in_dim, out_dim, batch_size); - LinearMeta *m = new LinearMeta(handle, batch_size); + LinearMeta *m = new LinearMeta(handle, batch_size, linear); m->activation = linear->activation; m->kernel_reg_type = linear->kernel_reg_type; m->kernel_reg_lambda = 
linear->kernel_reg_lambda; @@ -494,12 +523,21 @@ void Linear::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - Domain in_domain = runtime->get_index_space_domain( + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - switch (in_domain.get_dim()) { + LinearMeta const *m = *((LinearMeta **)task->local_args); + assert(m->input_type == m->weight_type); + assert(m->input_type == m->output_type); + switch (input_domain.get_dim()) { #define DIMFUNC(DIM) \ case DIM: \ - return forward_task_with_dim(task, regions, ctx, runtime); + if (m->output_type == DT_HALF) { \ + return forward_task_with_dim(task, regions, ctx, runtime); \ + } else if (m->output_type == DT_FLOAT) { \ + return forward_task_with_dim(task, regions, ctx, runtime); \ + } else { \ + assert(false && "Unsupported data type"); \ + } LEGION_FOREACH_N(DIMFUNC) #undef DIMFUNC default: @@ -513,7 +551,7 @@ void Linear::forward_task(Task const *task, regions[2](I): kernel regions[3](I): bias */ -template +template void Linear::forward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, @@ -523,15 +561,15 @@ void Linear::forward_task_with_dim(Task const *task, assert(regions.size() == (3 + static_cast(m->use_bias))); assert(task->regions.size() == (3 + static_cast(m->use_bias))); - TensorAccessorR acc_input( + TensorAccessorR acc_input( regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_output(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - TensorAccessorR acc_kernel( + TensorAccessorW acc_output(regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); + TensorAccessorR acc_kernel( regions[2], task->regions[2], FID_DATA, ctx, runtime); int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1; @@ -539,9 +577,9 @@ void Linear::forward_task_with_dim(Task const *task, assert(acc_output.rect.volume() == static_cast(out_dim * batch_size)); assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); - float const *acc_bias_ptr = NULL; + DT const *acc_bias_ptr = nullptr; if (m->use_bias) { - TensorAccessorR acc_bias( + TensorAccessorR acc_bias( regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(acc_bias.rect.volume() == static_cast(out_dim)); acc_bias_ptr = acc_bias.ptr; @@ -639,10 +677,19 @@ void Linear::backward_task(Task const *task, Runtime *runtime) { Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); + LinearMeta const *m = *((LinearMeta **)task->local_args); + assert(m->input_type == m->weight_type); + assert(m->input_type == m->output_type); switch (in_domain.get_dim()) { #define DIMFUNC(DIM) \ case DIM: \ - return backward_task_with_dim(task, regions, ctx, runtime); + if (m->output_type == DT_HALF) { \ + return backward_task_with_dim(task, regions, ctx, runtime); \ + } else if (m->output_type == DT_FLOAT) { \ + return backward_task_with_dim(task, regions, ctx, runtime); \ + } else { \ + assert(false && "Unsupported data type"); \ + } LEGION_FOREACH_N(DIMFUNC) #undef DIMFUNC default: @@ -659,7 +706,7 @@ void Linear::backward_task(Task const *task, regions[5](I/O): filter_grad regions[6](I/O): bias_grad */ -template +template void Linear::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, @@ -671,9 +718,9 @@ 
void Linear::backward_task_with_dim(Task const *task, assert(task->regions.size() == (5 + static_cast(m->trainableInputs[0]) + static_cast(m->use_bias))); - float *input_grad = NULL; + DT *input_grad = nullptr; size_t rid = 0; - TensorAccessorR acc_input( + TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; if (m->trainableInputs[0]) { @@ -681,39 +728,39 @@ void Linear::backward_task_with_dim(Task const *task, ctx, task->regions[rid].region.get_index_space()); if (domain.get_dim() == NDIM + 1) { assert(domain.get_volume() == acc_input.rect.volume()); - input_grad = helperGetTensorPointerWO( + input_grad = helperGetTensorPointerWO
( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); } else { - TensorAccessorW acc_replica_grad(regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); + TensorAccessorW acc_replica_grad(regions[rid], + task->regions[rid], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); assert(acc_replica_grad.rect.volume() == acc_input.rect.volume()); input_grad = acc_replica_grad.ptr; } rid++; } - TensorAccessorR acc_output( + TensorAccessorR acc_output( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - TensorAccessorW acc_output_grad(regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); + TensorAccessorW acc_output_grad(regions[rid], + task->regions[rid], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); rid++; - TensorAccessorR acc_kernel( + TensorAccessorR acc_kernel( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - TensorAccessorW acc_kernel_grad(regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); + TensorAccessorW acc_kernel_grad(regions[rid], + task->regions[rid], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); rid++; // make sure the sizes match int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; @@ -725,17 +772,17 @@ void Linear::backward_task_with_dim(Task const *task, assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); assert(acc_kernel_grad.rect.volume() == static_cast(in_dim * out_dim)); - float *acc_bias_grad_ptr = NULL; + DT *acc_bias_grad_ptr = nullptr; if (m->use_bias) { - TensorAccessorW acc_bias_grad(regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); + TensorAccessorW acc_bias_grad(regions[rid], + task->regions[rid], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); rid++; assert(acc_bias_grad.rect.volume() == static_cast(out_dim)); - acc_bias_grad_ptr = static_cast(acc_bias_grad.ptr); + acc_bias_grad_ptr = static_cast
(acc_bias_grad.ptr); } assert(rid == regions.size()); diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 57578f5793..a926fd3b22 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -57,23 +57,40 @@ RMSNormParams RMSNorm::get_params() const { Tensor FFModel::rms_norm(const Tensor input, float eps, int dim, + DataType data_type, char const *name) { - Layer *rm = new Layer(this, - OP_RMS_NORM, - DT_FLOAT, - name, - 1 /*inputs*/, - 1 /*weights*/, - 1 /*outputs*/, - input); + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *rm = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for rms_norm"); + rm = new Layer(this, + OP_RMS_NORM, + data_type, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + casted_input); + } else { + rm = new Layer(this, + OP_RMS_NORM, + data_type, + name, + 1 /*inputs*/, + 1 /*weights*/, + 1 /*outputs*/, + input); + } rm->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, DT_FLOAT, rm, 0, true /*create_grad*/); + input->num_dims, input->dims, data_type, rm, 0, true /*create_grad*/); // weights int weight_dims[1] = {dim}; rm->weights[0] = create_weight_legion_ordering(1, weight_dims, - DT_FLOAT, + data_type, rm, true /*create_grad*/, nullptr, @@ -362,11 +379,11 @@ void RMSNorm::forward_task(Task const *task, assert(regions.size() == 3); RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); forward_kernel_wrapper(m, input, weight, output); } @@ -423,4 +440,4 @@ size_t hash::operator()( hash_combine(key, params.dim); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 304fa7b418..90aef807e2 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -52,10 +52,16 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { +Tensor FFModel::softmax(const Tensor _input, + int dim, + DataType data_type, + char const *name) { + if (data_type = DT_NONE) { + data_type = _input->data_type; + } Layer *sm = new Layer(this, OP_SOFTMAX, - DT_FLOAT, + data_type, name, 1 /*inputs*/, 0 /*weights*/, @@ -67,7 +73,7 @@ Tensor FFModel::softmax(const Tensor _input, int dim, char const *name) { dims[i] = _input->dims[i]; } sm->outputs[0] = create_tensor_legion_ordering( - numdims, dims, DT_FLOAT, sm, 0, true /*create_grad*/); + numdims, dims, data_type, sm, 0, true /*create_grad*/); sm->add_int_property("softmax_dim", dim); layers.push_back(sm); return sm->outputs[0]; @@ -106,7 +112,7 @@ Softmax::Softmax(FFModel &model, for (int i = 0; i < numdim; i++) { dims[i] = _input->dims[numdim - 1 - i]; } - outputs[0] = model.create_parallel_tensor(numdim, dims, DT_FLOAT, this); + outputs[0] = model.create_parallel_tensor(numdim, dims, data_type, this); } Softmax::Softmax(FFModel &model, @@ 
-221,6 +227,8 @@ OpMeta *Softmax::init_task(Task const *task, domain = input_domain; } SoftmaxMeta *m = new SoftmaxMeta(handle, softmax, domain); + m->input_type = softmax->inputs[0]->data_type; + m->output_type = softmax->outputs[0]->data_type; // checkCUDNN(cudnnCreateTensorDescriptor(&m->outputTensor)); return m; } @@ -241,7 +249,7 @@ FutureMap Softmax::inference(FFModel const &ff, << std::endl; */ IndexLauncher launcher(SOFTMAX_INF_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -296,10 +304,17 @@ void Softmax::forward_task(Task const *task, Runtime *runtime) { Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); switch (in_domain.get_dim()) { #define DIMFUNC(DIM) \ case DIM: \ - return forward_task_with_dim(task, regions, ctx, runtime); + if (m->output_type == DT_HALF) { \ + return forward_task_with_dim(task, regions, ctx, runtime); \ + } else if (m->output_type == DT_FLOAT) { \ + return forward_task_with_dim(task, regions, ctx, runtime); \ + } else { \ + assert(false && "Unsupported data type"); \ + } LEGION_FOREACH_N(DIMFUNC) #undef DIMFUNC default: @@ -311,7 +326,7 @@ void Softmax::forward_task(Task const *task, regions[0](I): input regions[1](O): output */ -template +template void Softmax::forward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, @@ -320,15 +335,14 @@ void Softmax::forward_task_with_dim(Task const *task, assert(task->regions.size() == 2); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorR acc_input( + TensorAccessorR acc_input( regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_output(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - + TensorAccessorW acc_output(regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime, + false /*readOutput*/); forward_kernel_wrapper(m, acc_input.ptr, acc_output.ptr); } @@ -366,10 +380,17 @@ void Softmax::backward_task(Task const *task, Runtime *runtime) { Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); switch (in_domain.get_dim()) { #define DIMFUNC(DIM) \ case DIM: \ - return backward_task_with_dim(task, regions, ctx, runtime); + if (m->output_type == DT_HALF) { \ + return backward_task_with_dim(task, regions, ctx, runtime); \ + } else if (m->output_type == DT_FLOAT) { \ + return backward_task_with_dim(task, regions, ctx, runtime); \ + } else { \ + assert(false && "Unsupported data type"); \ + } LEGION_FOREACH_N(DIMFUNC) #undef DIMFUNC default: @@ -384,7 +405,7 @@ void Softmax::backward_task(Task const *task, // Note that the backward task of softmax is actually a no op (i.e., input_grad // = output_grad) since the upstream cross_entropy_loss function computes // performs softmax_cross_entropy_loss to avoid intermediate zeros -template +template void Softmax::backward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, @@ -393,13 +414,13 @@ void Softmax::backward_task_with_dim(Task const *task, assert(task->regions.size() == 2); // const Softmax* softmax = (Softmax*) task->args; SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorW acc_input_grad(regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); 
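The softmax, linear, and layer-norm tasks above all follow the same two-level dispatch: a runtime switch on the operator's recorded DataType picks the template instantiation, and the DIMFUNC macro picks the dimensionality. The following is a minimal editorial sketch of the data-type half of that pattern only; the names MetaSketch, run_typed, and run are hypothetical stand-ins for the real OpMeta and *_task_with_dim functions, and the half type is assumed to come from the CUDA headers.

#include <cassert>
#include <cuda_fp16.h> // provides the half type used by the DT_HALF branch

enum DataType { DT_HALF, DT_FLOAT };

struct MetaSketch {
  DataType output_type;
};

// Stand-in for forward_task_with_dim<DT, DIM>: the body would launch kernels
// instantiated for DT (half or float).
template <typename DT>
void run_typed(MetaSketch const * /*m*/) {
  // kernel launches specialized on DT go here
}

// Stand-in for forward_task: select the instantiation that matches the
// operator's output data type, asserting on anything unsupported.
void run(MetaSketch const *m) {
  if (m->output_type == DT_HALF) {
    run_typed<half>(m);
  } else if (m->output_type == DT_FLOAT) {
    run_typed<float>(m);
  } else {
    assert(false && "Unsupported data type");
  }
}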
- TensorAccessorR acc_output_grad( + TensorAccessorW acc_input_grad(regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime, + true /*readOutput*/); + TensorAccessorR acc_output_grad( regions[1], task->regions[1], FID_DATA, ctx, runtime); // make sure the image indices match! assert(acc_input_grad.rect == acc_output_grad.rect); @@ -415,12 +436,19 @@ InferenceResult Runtime *runtime) { Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); switch (in_domain.get_dim()) { #define DIMFUNC(DIM) \ - case DIM: { \ - forward_task_with_dim(task, regions, ctx, runtime); \ - break; \ - } + case DIM: \ + if (m->output_type == DT_HALF) { \ + forward_task_with_dim(task, regions, ctx, runtime); \ + break; \ + } else if (m->output_type == DT_FLOAT) { \ + forward_task_with_dim(task, regions, ctx, runtime); \ + break; \ + } else { \ + assert(false && "Unsupported data type"); \ + } LEGION_FOREACH_N(DIMFUNC) #undef DIMFUNC default: diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index a764fbe8fa..e4c2837e87 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "flexflow/utils/cuda_helper.h" @@ -64,22 +65,38 @@ Tensor bool bias, bool add_bias_kv, bool add_zero_attn, + DataType data_type, Initializer *kernel_initializer, bool apply_rotary_embedding, bool scaling_query, float scaling_factor, bool qk_prod_scaling, char const *name) { - // Currently assume that + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; int weight_num = bias ? 
2 : 1; - Layer *li = new Layer(this, - OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - input); + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); + li = new Layer(this, + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + input); + } { int numdims = input->num_dims; int dims[MAX_TENSOR_DIM]; @@ -88,7 +105,7 @@ Tensor } dims[0] = embed_dim; li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); + numdims, dims, data_type, li, 0, true /*create_grad*/); } { // Compute weight size @@ -102,7 +119,7 @@ Tensor int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; li->weights[0] = create_weight_legion_ordering(2, dims, - DT_FLOAT, + data_type, li, true /*create_grad*/, kernel_initializer, @@ -113,13 +130,13 @@ Tensor int dims[1] = {embed_dim * 4}; li->weights[1] = create_weight_legion_ordering(1, dims, - DT_FLOAT, + data_type, li, true /*create_grad*/, kernel_initializer, CHOSEN_SYNC_TYPE); } - li->data_type = DT_FLOAT; + li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); li->add_int_property("kdim", kdim); @@ -207,7 +224,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( // Initializer* _bias_initializer) : Op(model, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, + _input->data_type, name, 1 /*inputs*/, (_bias ? 2 : 1) /*weights*/, @@ -259,7 +276,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[0] = model.create_parallel_weight<3>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, initializer, @@ -279,14 +296,14 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[1] = model.create_parallel_weight<2>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, NULL, comm_type); } outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, DT_FLOAT, this); + _input->num_dims, dims, this->data_type, this); /* for (int i = 0; i < numdim; i++) { */ /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ /* } */ @@ -315,7 +332,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( // Initializer* _bias_initializer) : Op(model, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, + _input->data_type, name, 1 /*inputs*/, (_bias ? 
2 : 1) /*weights*/, @@ -365,7 +382,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[0] = model.create_parallel_weight<3>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, initializer, @@ -383,14 +400,14 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[1] = model.create_parallel_weight<2>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, NULL, comm_type); } outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, DT_FLOAT, this); + _input->num_dims, dims, this->data_type, this); /* for (int i = 0; i < numdim; i++) { */ /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ @@ -546,12 +563,27 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( (SpecIncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); @@ -564,9 +596,10 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( .best_affinity_to(task->target_proc) .first(); SpecIncMultiHeadSelfAttentionMeta *m = new SpecIncMultiHeadSelfAttentionMeta( - handle, attn, weight.get_float_ptr(), gpu_mem, num_samples, num_heads); + handle, attn, weight, gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; - assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); + assert(weight.domain.get_volume() * data_type_size(weight.data_type) == + m->weightSize); return m; } @@ -640,7 +673,6 @@ void SpecIncMultiHeadSelfAttention::inference_task( Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - float const *bias_ptr = NULL; BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; SpecIncMultiHeadSelfAttentionMeta const *m = @@ -653,19 +685,17 @@ void SpecIncMultiHeadSelfAttention::inference_task( m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - + GenericTensorAccessorR biases; if (*m->bias) { - GenericTensorAccessorR biases = - helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); Domain bias_domain = 
runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); assert(bias_domain.get_dim() == 2); - bias_ptr = biases.get_float_ptr(); } Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -679,12 +709,7 @@ void SpecIncMultiHeadSelfAttention::inference_task( assert(output_domain.get_dim() == 4); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr); + m, bc, input, weight, output, biases); // print_tensor(input.get_float_ptr(), 20, "attention input"); // print_tensor(output.get_float_ptr(), 20, "attention output"); diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b898d472a8..85bd71b205 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -27,10 +27,10 @@ using Legion::Memory; void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -60,7 +60,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Memory gpu_mem, int num_samples, int _num_heads) @@ -80,7 +80,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->qk_prod_scaling, attn->add_bias_kv, attn->scaling_factor, - weight_ptr, + weight, gpu_mem, num_samples, _num_heads) { diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index f12d48ab1d..7370ff446e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -15,6 +15,7 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" @@ -26,9 +27,13 @@ using Legion::coord_t; using Legion::Memory; using namespace Kernels::IncMultiHeadAttention; +namespace Kernels { +namespace SpecIncMultiHeadAttention { + +template __global__ void spec_store_kv_cache( - float const *devQKVProjArray, - float *cache_ptr, + DT const *devQKVProjArray, + DT *cache_ptr, BatchConfig::PerTokenInfo *tokenInfos, BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, @@ -52,7 +57,7 @@ __global__ void spec_store_kv_cache( int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; int current_head_block_size = num_tokens * (k_cache ? 
qProjSize : qProjSize + kProjSize); - float val = + DT val = devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + token_idx * proj_size + data_idx]; @@ -153,6 +158,7 @@ __global__ void spec_store_kv_cache( } } +template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, cudaStream_t stream) { @@ -165,8 +171,8 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, spec_store_kv_cache<<>>(m->devQKVProjArray, - m->keyCache, + stream>>>(static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), m->token_infos, m->request_infos, m->beam_token_infos, @@ -185,8 +191,8 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, spec_store_kv_cache<<>>(m->devQKVProjArray, - m->valueCache, + stream>>>(static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->valueCache), m->token_infos, m->request_infos, m->beam_token_infos, @@ -203,11 +209,12 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } } -__global__ void spec_fill_entries_above_diagonal(float *matrix, +template +__global__ void spec_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, size_t total_tokens_in_request, size_t num_heads, - float value) { + DT value) { CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_heads) { // size_t head_idx = i / (new_tokens * total_tokens_in_request); size_t src_idx = (i / new_tokens) % total_tokens_in_request; @@ -219,19 +226,22 @@ __global__ void spec_fill_entries_above_diagonal(float *matrix, } } +template void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, - float *output_ptr, - float const *bias_ptr, + DT *output_ptr, + DT const *bias_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = CUDA_R_32F; + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); @@ -267,18 +277,17 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int strideC = num_new_tokens * total_tokens; // a flag of using this scaling alpha - float alpha = 1.0f, beta = 0.0f; + DT alpha = 1.0f, beta = 0.0f; if (*m->qk_prod_scaling) { - alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); } // To get A, skip over Q entries from previous requests (same head) - void const *A = (void const *)(m->devQKVProjArray + - tokens_previous_requests * m->qProjSize); + void const *A = static_cast<DT *>
(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize; // To get B, skip over K entries from previous requests (all heads + // padding) - void const *B = - (void const *)(m->keyCache + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * - kt_req_block_size); + void const *B = static_cast
(m->keyCache) + + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; // if (i == 0 && sub_req_id == 0 && // bc->beam_slots.at(0).current_depth == 1) { @@ -286,9 +295,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // printf("key cache offset %d\n", kt_req_block_size); // } // To get C, skip over QK^T products from previous requests - void *C = - (void *)(m->qk_prods + m->num_heads * tokens_prev_requests_squares); - + void *C = static_cast
(m->qk_prods) + + m->num_heads * tokens_prev_requests_squares; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -322,7 +330,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, parallelism), 0, stream>>>( - (float *)C, num_new_tokens, total_tokens, m->num_heads, -INFINITY); + static_cast
<DT *>(C), + num_new_tokens, + total_tokens, + m->num_heads, + static_cast<DT>
(-INFINITY)); } // Compute Softmax(QK^T/sqrt(d_k)) cudnnTensorDescriptor_t qk_tensor; @@ -342,14 +354,14 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int w_param = num_new_tokens; checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, + cudnn_data_type, n_param, c_param, h_param, w_param)); - alpha = 1.0f, beta = 0.0f; - void *C_softmax = (void *)(m->qk_prods_softmax + - m->num_heads * tokens_prev_requests_squares); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + void *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_heads * tokens_prev_requests_squares; // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The // softmax operation is computed per spatial location (H,W) per image (N) @@ -357,12 +369,12 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, + &softmax_alpha, qk_tensor, - (void *)((float *)C), - &beta, + C, + &softmax_beta, qk_tensor, - (void *)((float *)C_softmax))); + C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; m_ = num_new_tokens; @@ -377,12 +389,12 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, A = (void const *)C_softmax; // To get B, skip over V^T entries from previous requests (all heads + // padding) - B = (void const *)(m->valueCache + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * - vt_req_block_size); + B = static_cast
<DT *>(m->valueCache) + + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - C = - (void *)(m->attn_heads + - tokens_previous_requests * m->num_heads * m->vProjSize); + C = static_cast<DT *>
(m->attn_heads) + + tokens_previous_requests * m->num_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, @@ -452,42 +464,14 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, assert(tokens_previous_requests == num_tokens); } -/*static*/ -void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } - - // reload the weight_o - - if (!(*m->has_load_weights)) { - int parallelism = m->vProjSize * m->oProjSize * m->num_heads; - build_w_out_tensor<<>>(weight_ptr, - m->W_out_contiguous, - m->vProjSize, - m->oProjSize, - m->num_heads, - (m->qSize * m->qProjSize + - m->kSize * m->kProjSize + - m->vSize * m->vProjSize)); - *m->has_load_weights = true; - } - +template +void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { // here because we need postion info in infernece 1 cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), @@ -512,14 +496,72 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel( - m, bc, input_ptr, weight_ptr, m->devQKVProjArray, bias_ptr, stream); + compute_qkv_kernel(m, + bc, + input_ptr, + weight_ptr, + static_cast
<DT *>(m->devQKVProjArray), + bias_ptr, + stream); // phase 2: Update key/val cache - update_kv_cache_kernel(m, bc, stream); + update_kv_cache_kernel<DT>
(m, bc, stream); // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); +} + +} // namespace SpecIncMultiHeadAttention +} // namespace Kernels + +/*static*/ +void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (use_bias) { + assert(input.data_type == bias.data_type); + } + + if (input.data_type == DT_HALF) { + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::SpecIncMultiHeadAttention::inference_kernel(m, + bc, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); + } else if (input.data_type == DT_FLOAT) { + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::SpecIncMultiHeadAttention::inference_kernel(m, + bc, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); @@ -533,12 +575,13 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } + // print_tensor(output.get_half_ptr(), 10000, "att output"); } SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Memory gpu_mem, int num_samples, int _num_heads) @@ -558,7 +601,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->qk_prod_scaling, attn->add_bias_kv, attn->scaling_factor, - weight_ptr, + weight, gpu_mem, num_samples, _num_heads) { @@ -572,7 +615,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( BeamSearchBatchConfig::MAX_BEAM_WIDTH; size_t requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; size_t beam_requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t totalSize = + size_t total_size = requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + beam_tokeninfo_size * sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + @@ -582,7 +625,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // be added here later Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(totalSize - 1)); + Realm::Point<1, coord_t>(total_size - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance::create_instance(beam_search_reserve_inst, @@ -592,14 +635,21 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( 0, Realm::ProfilingRequestSet()) .wait(); + off_t offset = 0; beam_token_infos = - (BeamSearchBatchConfig::BeamSearchPerTokenInfo *) - beam_search_reserve_inst.pointer_untyped(0, sizeof(char)); + beam_search_reserve_inst + .pointer(offset); + 
offset += beam_tokeninfo_size * + sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); request_infos = - (BatchConfig::PerRequestInfo *)(beam_token_infos + beam_tokeninfo_size); + beam_search_reserve_inst.pointer(offset); + offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); beam_request_infos = - (BeamSearchBatchConfig::BeamSearchPerRequestInfo *)(request_infos + - requestinfo_size); + beam_search_reserve_inst + .pointer(offset); + offset += beam_requestinfo_size * + sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); + assert(offset == total_size); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index c5bbcc2cea..0df7e36f32 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "flexflow/utils/cuda_helper.h" @@ -66,22 +67,38 @@ Tensor FFModel::inc_multihead_self_attention_verify( bool bias, bool add_bias_kv, bool add_zero_attn, + DataType data_type, Initializer *kernel_initializer, bool apply_rotary_embedding, bool scaling_query, float scaling_factor, bool qk_prod_scaling, char const *name) { + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; int weight_num = bias ? 2 : 1; - // Currently assume that - Layer *li = new Layer(this, - OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - input); + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); + li = new Layer(this, + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + input); + } { int numdims = input->num_dims; int dims[MAX_TENSOR_DIM]; @@ -90,7 +107,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( } dims[0] = embed_dim; li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); + numdims, dims, data_type, li, 0, true /*create_grad*/); } { // Compute weight size @@ -104,7 +121,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; li->weights[0] = create_weight_legion_ordering(2, dims, - DT_FLOAT, + data_type, li, true /*create_grad*/, kernel_initializer, @@ -115,13 +132,13 @@ Tensor FFModel::inc_multihead_self_attention_verify( int dims[1] = {embed_dim * 4}; li->weights[1] = create_weight_legion_ordering(1, dims, - DT_FLOAT, + data_type, li, true /*create_grad*/, kernel_initializer, CHOSEN_SYNC_TYPE); } - li->data_type = DT_FLOAT; + li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); li->add_int_property("kdim", kdim); @@ -207,7 +224,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( // Initializer* _bias_initializer) : Op(model, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, + _input->data_type, name, 1 /*inputs*/, (_bias ? 
2 : 1) /*weights*/, @@ -259,7 +276,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[0] = model.create_parallel_weight<3>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, initializer, @@ -279,7 +296,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[1] = model.create_parallel_weight<2>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, NULL, @@ -287,7 +304,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( } outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, DT_FLOAT, this); + _input->num_dims, dims, this->data_type, this); /* for (int i = 0; i < numdim; i++) { */ /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ /* } */ @@ -316,7 +333,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( // Initializer* _bias_initializer) : Op(model, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, - DT_FLOAT, + _input->data_type, name, 1 /*inputs*/, (_bias ? 2 : 1) /*weights*/, @@ -366,7 +383,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[0] = model.create_parallel_weight<3>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, initializer, @@ -384,14 +401,14 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( ParameterSyncType comm_type = ParameterSyncType::PS; #endif weights[1] = model.create_parallel_weight<2>(dims, - DT_FLOAT, + this->data_type, NULL /*owner_op*/, true /*create_grad*/, NULL, comm_type); } outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, DT_FLOAT, this); + _input->num_dims, dims, this->data_type, this); /* for (int i = 0; i < numdim; i++) { */ /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ @@ -547,12 +564,27 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( (TreeIncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); @@ -565,9 +597,10 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( .best_affinity_to(task->target_proc) .first(); TreeIncMultiHeadSelfAttentionMeta *m = new TreeIncMultiHeadSelfAttentionMeta( - handle, attn, weight.get_float_ptr(), gpu_mem, num_samples, num_heads); + handle, attn, weight, gpu_mem, num_samples, num_heads); 
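Because the attention weights are now created with the layer's data type instead of a hard-coded DT_FLOAT, buffer sizes must be derived from the element size of the runtime type; data_type_size() from flexflow/ffconst_utils.h plays that role in the weightSize assertion that follows. Below is a small sketch of the idea under that assumption; data_type_size_sketch is a hypothetical stand-in, not the library function.

#include <cassert>
#include <cstddef>

enum DataType { DT_HALF, DT_FLOAT };

// Hypothetical stand-in for data_type_size(): bytes per element of a tensor
// with the given runtime data type.
inline size_t data_type_size_sketch(DataType dt) {
  switch (dt) {
    case DT_HALF:
      return 2; // sizeof(half)
    case DT_FLOAT:
      return sizeof(float);
    default:
      assert(false && "Unsupported data type");
      return 0;
  }
}

// Usage: the expected byte count for a weight region is its volume times the
// per-element size, which is what the surrounding weightSize check compares
// against.
// size_t expected_bytes = weight_volume * data_type_size_sketch(weight_dtype);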
m->profiling = attn->profiling; - assert(weight.domain.get_volume() * sizeof(float) == m->weightSize); + assert(weight.domain.get_volume() * data_type_size(weight.data_type) == + m->weightSize); return m; } @@ -642,7 +675,6 @@ void TreeIncMultiHeadSelfAttention::inference_task( Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - float const *bias_ptr = NULL; TreeVerifyBatchConfig const *bc = (TreeVerifyBatchConfig *)task->args; TreeIncMultiHeadSelfAttentionMeta *m = @@ -655,18 +687,17 @@ void TreeIncMultiHeadSelfAttention::inference_task( m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; if (*m->bias) { - GenericTensorAccessorR biases = - helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); Domain bias_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); assert(bias_domain.get_dim() == 2); - bias_ptr = biases.get_float_ptr(); } Domain input_domain = runtime->get_index_space_domain( @@ -685,12 +716,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( "[Attention:forward:query]"); */ TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr); + m, bc, input, weight, output, biases); #ifdef INFERENCE_TESTS printf("Checking TreeIncMultiHeadSelfAttention computations...\n"); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index aa5aaf3039..6e63860cd0 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -27,10 +27,10 @@ using Legion::Memory; void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -60,7 +60,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Memory gpu_mem, int num_samples, int _num_heads) @@ -80,7 +80,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->qk_prod_scaling, attn->add_bias_kv, attn->scaling_factor, - weight_ptr, + weight, gpu_mem, num_samples, _num_heads), diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index d3fc5c1fb2..b9bfc5b6a3 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -15,6 +15,7 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include 
"flexflow/utils/cuda_helper.h" @@ -27,9 +28,13 @@ using Legion::Memory; using namespace Kernels::IncMultiHeadAttention; +namespace Kernels { +namespace TreeIncMultiHeadAttention { + +template __global__ void commit_tokens_kernel( - float const *devQKVProjArray, - float *cache_ptr, + DT const *devQKVProjArray, + DT *cache_ptr, TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos, int qProjSize, int kProjSize, @@ -54,7 +59,7 @@ __global__ void commit_tokens_kernel( (qProjSize + kProjSize + vProjSize) * num_active_tokens_in_last_batch; int current_head_block_size = num_active_tokens_in_last_batch * (k_cache ? qProjSize : qProjSize + kProjSize); - float val = + DT val = devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + token_idx_in_last_batch * proj_size + data_idx]; // int const req_id = id_map[token_idx].request_index; @@ -68,6 +73,7 @@ __global__ void commit_tokens_kernel( } } +template void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, cudaStream_t stream) { @@ -78,8 +84,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( - m->devQKVProjArray, - m->keyCache, + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), m->committed_token_infos, m->qProjSize, m->kProjSize, @@ -95,8 +101,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( - m->devQKVProjArray, - m->valueCache, + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->valueCache), m->committed_token_infos, m->qProjSize, m->kProjSize, @@ -109,9 +115,10 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, } } +template __global__ void update_tree_branch_kv_cache( - float const *devQKVProjArray, - float *cache_ptr, + DT const *devQKVProjArray, + DT *cache_ptr, TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, @@ -130,7 +137,7 @@ __global__ void update_tree_branch_kv_cache( (i / proj_size) % num_tokens_in_branch; // index in the tree branch int head_idx = i / (proj_size * num_tokens_in_branch); - token_idx += processed_tokens_in_batch; // get index in the whole batch + token_idx += processed_tokens_in_batch; // get index in the whole batch int qkv_block_size = (qProjSize + kProjSize + vProjSize) * total_tokens_in_batch; // skip over previous heads int current_head_block_size = @@ -138,7 +145,7 @@ __global__ void update_tree_branch_kv_cache( (k_cache ? qProjSize : qProjSize + kProjSize); // skip over Q entries (and K entries // if we are working on the V cache) - float val = + DT val = devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + token_idx * proj_size + data_idx]; int const req_id = tokenInfos[token_idx].request_index; @@ -150,11 +157,12 @@ __global__ void update_tree_branch_kv_cache( } } -__global__ void tree_fill_entries_above_diagonal(float *matrix, +template +__global__ void tree_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, size_t total_tokens_in_request, size_t num_heads, - float value) { + DT value) { CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_heads) { // size_t head_idx = i / (new_tokens * total_tokens_in_request); size_t src_idx = (i / new_tokens) % total_tokens_in_request; @@ -166,19 +174,22 @@ __global__ void tree_fill_entries_above_diagonal(float *matrix, } } +template void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, - float *output_ptr, - float const *bias_ptr, + DT *output_ptr, + DT const *bias_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(DT_FLOAT); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = CUDA_R_32F; + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; @@ -215,8 +226,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( - m->devQKVProjArray, - m->keyCache, + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), m->token_infos, m->qProjSize, m->kProjSize, @@ -233,8 +244,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( - m->devQKVProjArray, - m->valueCache, + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->valueCache), m->token_infos, m->qProjSize, m->kProjSize, @@ -258,18 +269,18 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int strideC = num_new_tokens * total_tokens_in_request; // a flag of using this scaling alpha - float alpha = 1.0f, beta = 0.0f; + DT alpha = 1.0f, beta = 0.0f; if (*m->qk_prod_scaling) { - alpha = 1.0f / (float)sqrt(m->kProjSize), beta = 0.0f; + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); } // To get A, skip over Q entries from previous requests (same head) - void const *A = (void const *)(m->devQKVProjArray + - processed_tokens_in_batch * m->qProjSize); + void const *A = static_cast
<DT *>(m->devQKVProjArray) + + processed_tokens_in_batch * m->qProjSize; // To get B, skip over K entries from previous requests (all heads + // padding) - void const *B = (void const *)(m->keyCache + i * kt_req_block_size); + void const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests - void *C = (void *)(m->qk_prods); + void *C = static_cast<DT *>
(m->qk_prods); checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, @@ -305,11 +316,12 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, min((size_t)CUDA_NUM_THREADS, parallelism), 0, - stream>>>((float *)C, - num_new_tokens, - total_tokens_in_request, - m->num_heads, - -INFINITY); + stream>>>( + static_cast
<DT *>(C), + num_new_tokens, + total_tokens_in_request, + m->num_heads, + static_cast<DT>
(-INFINITY)); } // Compute Softmax(QK^T/sqrt(d_k)) cudnnTensorDescriptor_t qk_tensor; @@ -329,12 +341,12 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int w_param = num_new_tokens; checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, + cudnn_data_type, n_param, c_param, h_param, w_param)); - alpha = 1.0f, beta = 0.0f; + float softmax_alpha = 1.0f, softmax_beta = 0.0f; void *C_softmax = (void *)(m->qk_prods_softmax); // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The @@ -343,12 +355,12 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, + &softmax_alpha, qk_tensor, - (void *)((float *)C), - &beta, + C, + &softmax_beta, qk_tensor, - (void *)((float *)C_softmax))); + C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; m_ = num_new_tokens; @@ -360,14 +372,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, strideC = num_new_tokens * m->vProjSize; // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous // requests (all heads) - A = (void const *)C_softmax; + A = static_cast
<DT *>(C_softmax); // To get B, skip over V^T entries from previous requests (all heads + // padding) - B = (void const *)(m->valueCache + i * vt_req_block_size); + B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - C = (void *)(m->attn_heads + - processed_tokens_in_batch * m->num_heads * m->vProjSize); + C = static_cast<DT *>
(m->attn_heads) + + processed_tokens_in_batch * m->num_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, @@ -398,9 +410,9 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, k = m->vProjSize * m->num_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = (void const *)m->W_out_contiguous; - B = (void const *)C; - C = (void *)(output_ptr + processed_tokens_in_batch * m->oProjSize); + A = m->W_out_contiguous; + B = C; + C = (output_ptr + processed_tokens_in_batch * m->oProjSize); checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -439,24 +451,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, assert(processed_tokens_in_batch == bc->num_active_tokens()); } -/*static*/ -void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, - float const *input_ptr, - float const *weight_ptr, - float *output_ptr, - float const *bias_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } - +template +void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, + TreeVerifyBatchConfig const *bc, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { // copy committed tokens info to GPU for the commit_tokens kernel // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing @@ -467,28 +469,12 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream); - commit_tokens(m, bc, stream); + commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active // tokens for the current batch m->num_active_tokens = bc->num_active_tokens(); - // reload the weight_o - if (!(*m->has_load_weights)) { - int parallelism = m->vProjSize * m->oProjSize * m->num_heads; - build_w_out_tensor<<>>(weight_ptr, - m->W_out_contiguous, - m->vProjSize, - m->oProjSize, - m->num_heads, - (m->qSize * m->qProjSize + - m->kSize * m->kProjSize + - m->vSize * m->vProjSize)); - *m->has_load_weights = true; - } // here because we need postion info in infernece 1 cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), @@ -497,8 +483,13 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel( - m, bc, input_ptr, weight_ptr, m->devQKVProjArray, bias_ptr, stream); + compute_qkv_kernel(m, + bc, + input_ptr, + weight_ptr, + static_cast
(m->devQKVProjArray), + bias_ptr, + stream); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( @@ -507,6 +498,59 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); +} + +} // namespace TreeIncMultiHeadAttention +} // namespace Kernels + +/*static*/ +void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + TreeIncMultiHeadSelfAttentionMeta *m, + TreeVerifyBatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (use_bias) { + assert(input.data_type == bias.data_type); + } + + if (input.data_type == DT_HALF) { + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::TreeIncMultiHeadAttention::inference_kernel(m, + bc, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); + } else if (input.data_type == DT_FLOAT) { + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::TreeIncMultiHeadAttention::inference_kernel(m, + bc, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); @@ -525,7 +569,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - float const *weight_ptr, + GenericTensorAccessorR const &weight, Memory gpu_mem, int num_samples, int _num_heads) @@ -545,7 +589,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->qk_prod_scaling, attn->add_bias_kv, attn->scaling_factor, - weight_ptr, + weight, gpu_mem, num_samples, _num_heads), @@ -572,8 +616,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( Realm::ProfilingRequestSet()) .wait(); committed_token_infos = - (TreeVerifyBatchConfig::CommittedTokensInfo *) - committed_token_reserve_inst.pointer_untyped(0, sizeof(char)); + committed_token_reserve_inst + .pointer(0); } cudaStreamSynchronize(stream); diff --git a/src/runtime/accessor.cc b/src/runtime/accessor.cc index 809d608402..100262e85a 100644 --- a/src/runtime/accessor.cc +++ b/src/runtime/accessor.cc @@ -345,10 +345,12 @@ GenericTensorAccessorW } #define DIMFUNC(DIM) \ + template class TensorAccessorR; \ template class TensorAccessorR; \ template class TensorAccessorR; \ template class TensorAccessorR; \ template class TensorAccessorR; \ + template class TensorAccessorW; \ template class TensorAccessorW; \ template class TensorAccessorW; \ template class TensorAccessorW; \ diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index f4c39c6b0b..434ef1d5e1 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -310,23 +310,23 @@ __host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { // 
checkCUDA(cudaDeviceSynchronize()); return true; } -cudnnStatus_t - cudnnSetTensorDescriptorFromDomain4SoftMax(cudnnTensorDescriptor_t tensor, - Domain domain) { +cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); switch (domain.get_dim()) { case 1: { Rect<1> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; return cudnnSetTensor4dDescriptor( - tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[0], 1, 1, 1); + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); } case 2: { Rect<2> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; dims[1] = rect.hi[1] - rect.lo[1] + 1; return cudnnSetTensor4dDescriptor( - tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[1], dims[0], 1, 1); + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); } case 3: { Rect<3> rect = domain; @@ -335,7 +335,7 @@ cudnnStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; return cudnnSetTensor4dDescriptor(tensor, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, + cudnn_data_type, dims[2] * dims[1], dims[0], 1, @@ -349,7 +349,7 @@ cudnnStatus_t dims[3] = rect.hi[3] - rect.lo[3] + 1; return cudnnSetTensor4dDescriptor(tensor, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, + cudnn_data_type, dims[3] * dims[2] * dims[1], dims[0], 1, @@ -362,21 +362,23 @@ cudnnStatus_t } cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, - Domain domain) { + Domain domain, + DataType data_type) { int dims[MAX_TENSOR_DIM]; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); switch (domain.get_dim()) { case 1: { Rect<1> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; return cudnnSetTensor4dDescriptor( - tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[0], 1, 1, 1); + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[0], 1, 1, 1); } case 2: { Rect<2> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; dims[1] = rect.hi[1] - rect.lo[1] + 1; return cudnnSetTensor4dDescriptor( - tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[1], dims[0], 1, 1); + tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, dims[1], dims[0], 1, 1); } case 3: { Rect<3> rect = domain; @@ -385,7 +387,7 @@ cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, dims[2] = rect.hi[2] - rect.lo[2] + 1; return cudnnSetTensor4dDescriptor(tensor, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, + cudnn_data_type, dims[2], dims[1], dims[0], @@ -399,7 +401,7 @@ cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, dims[3] = rect.hi[3] - rect.lo[3] + 1; return cudnnSetTensor4dDescriptor(tensor, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, + cudnn_data_type, dims[3], dims[2], dims[1], @@ -415,7 +417,7 @@ cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, dims[3] = rect.hi[3] - rect.lo[3] + 1; return cudnnSetTensor4dDescriptor(tensor, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, + cudnn_data_type, dims[3], dims[2], dims[1], @@ -429,6 +431,8 @@ cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, cudnnDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { + case DT_HALF: + return CUDNN_DATA_HALF; case DT_FLOAT: return CUDNN_DATA_FLOAT; case DT_DOUBLE: @@ -443,6 +447,8 @@ cudnnDataType_t ff_to_cudnn_datatype(DataType type) { cudaDataType_t ff_to_cuda_datatype(DataType type) { switch (type) { + case DT_HALF: + return CUDA_R_16F; case DT_FLOAT: return CUDA_R_32F; case 
DT_DOUBLE: @@ -544,6 +550,8 @@ template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void + print_tensor(half const *ptr, size_t rect, char const *prefix); template __host__ void print_beam_tensor(float const *ptr, size_t num_elements, @@ -566,6 +574,8 @@ template __host__ void template __host__ void save_tensor(int64_t const *ptr, size_t rect, char const *file_name); +template __host__ void + save_tensor(half const *ptr, size_t rect, char const *file_name); template __host__ float *download_tensor(float const *ptr, size_t num_elements); diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 39e797ea42..7f50e4b69f 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -1,4 +1,5 @@ #include "flexflow/ffconst_utils.h" +#include "flexflow/accessor.h" #include namespace FlexFlow { @@ -192,6 +193,25 @@ std::string get_operator_type_name(OperatorType type) { } } +size_t data_type_size(DataType type) { + switch (type) { + case DT_HALF: + return sizeof(half); + case DT_FLOAT: + return sizeof(float); + case DT_DOUBLE: + return sizeof(double); + case DT_INT32: + return sizeof(int32_t); + case DT_INT64: + return sizeof(int64_t); + case DT_BOOLEAN: + return sizeof(bool); + default: + assert(false); + } +} + std::ostream &operator<<(std::ostream &s, OperatorType op_type) { s << get_operator_type_name(op_type); diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index d6355def9a..6354c5d737 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -273,22 +273,23 @@ __host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { return true; } -miopenStatus_t - cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, - Domain domain) { +miopenStatus_t cudnnSetTensorDescriptorFromDomain( + miopenTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); switch (domain.get_dim()) { case 1: { Rect<1> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; - return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + return miopenSet4dTensorDescriptor( + tensor, cudnn_data_type, dims[0], 1, 1, 1); } case 2: { Rect<2> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; dims[1] = rect.hi[1] - rect.lo[1] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[1], dims[0], 1, 1); } case 3: { Rect<3> rect = domain; @@ -296,7 +297,7 @@ miopenStatus_t dims[1] = rect.hi[1] - rect.lo[1] + 1; dims[2] = rect.hi[2] - rect.lo[2] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[2], dims[1], dims[0], 1); + tensor, cudnn_data_type, dims[2], dims[1], dims[0], 1); } case 4: { Rect<4> rect = domain; @@ -305,7 +306,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + tensor, cudnn_data_type, dims[3], dims[2], dims[1], dims[0]); } case 5: { Rect<5> rect = domain; @@ -316,7 +317,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + tensor, cudnn_data_type, dims[3], dims[2], dims[1], dims[0]); } default: 
assert(false && "Unsupported dim number"); @@ -326,6 +327,8 @@ miopenStatus_t miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { + case DT_HALF: + return miopenHalf; case DT_FLOAT: return miopenFloat; case DT_DOUBLE: diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 2147ac69b3..0fb98e05ea 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -1,3 +1,4 @@ +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/concat.h" @@ -834,6 +835,12 @@ template bool TensorBase::get_tensor(FFModel const *ff, int64_t *data, bool get_gradients); +template bool ParallelTensorBase::set_tensor(FFModel const *ff, + std::vector const &dims, + half const *data); +template bool ParallelTensorBase::get_tensor(FFModel const *ff, + half *data, + bool get_gradients); template bool ParallelTensorBase::set_tensor( FFModel const *ff, std::vector const &dims, float const *data); template bool ParallelTensorBase::get_tensor(FFModel const *ff, diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc index c363cdd296..d943376416 100644 --- a/src/runtime/simulator.cc +++ b/src/runtime/simulator.cc @@ -14,6 +14,7 @@ */ #include "flexflow/simulator.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/partition.h" @@ -349,25 +350,6 @@ void Simulator::free_all() { offset = 0; } -size_t data_type_size(DataType type) { - switch (type) { - case DT_HALF: - return sizeof(half); - case DT_FLOAT: - return sizeof(float); - case DT_DOUBLE: - return sizeof(double); - case DT_INT32: - return sizeof(int32_t); - case DT_INT64: - return sizeof(int64_t); - case DT_BOOLEAN: - return sizeof(bool); - default: - assert(false); - } -} - void *Simulator::allocate(size_t num_elements, DataType type) { size_t element_size = data_type_size(type); void *ret_ptr = base_ptr + offset; diff --git a/src/runtime/simulator.cpp b/src/runtime/simulator.cpp index 888b563af8..209d39243f 100644 --- a/src/runtime/simulator.cpp +++ b/src/runtime/simulator.cpp @@ -85,10 +85,10 @@ Simulator::Simulator(FFModel const *model, hipEventCreate(&start_event); hipEventCreate(&end_event); conv2d_meta = new Conv2DMeta(handler); - linear_meta = new LinearMeta(handler, 4096); + // linear_meta = new LinearMeta(handler, 4096); pool2d_meta = new Pool2DMeta(handler); ele_unary_meta = new ElementUnaryMeta(handler); - ele_binary_meta = new ElementBinaryMeta(handler); + // ele_binary_meta = new ElementBinaryMeta(handler); // embedding_meta = new EmbeddingMeta(handler); // softmax_meta = new SoftmaxMeta(handler); batch_matmul_meta = new BatchMatmulMeta(handler); diff --git a/src/runtime/simulator.cu b/src/runtime/simulator.cu index 523b8217ef..8b0c60bebf 100644 --- a/src/runtime/simulator.cu +++ b/src/runtime/simulator.cu @@ -84,10 +84,10 @@ Simulator::Simulator(FFModel const *model, cudaEventCreate(&start_event); cudaEventCreate(&end_event); conv2d_meta = new Conv2DMeta(handler); - linear_meta = new LinearMeta(handler, 4096); + // linear_meta = new LinearMeta(handler, 4096); pool2d_meta = new Pool2DMeta(handler); ele_unary_meta = new ElementUnaryMeta(handler); - ele_binary_meta = new ElementBinaryMeta(handler); + // ele_binary_meta = new ElementBinaryMeta(handler); // embedding_meta = new EmbeddingMeta(handler); // softmax_meta = new SoftmaxMeta(handler); batch_matmul_meta = new BatchMatmulMeta(handler); @@ -108,7 +108,6 @@ 
Simulator::~Simulator(void) { delete conv2d_meta; delete pool2d_meta; delete ele_unary_meta; - delete ele_binary_meta; delete batch_matmul_meta; delete concat_meta; delete transpose_meta; diff --git a/tests/align/align_create_tensor_torch.py b/tests/align/align_create_tensor_torch.py index 8b835a5276..ca1be143ed 100644 --- a/tests/align/align_create_tensor_torch.py +++ b/tests/align/align_create_tensor_torch.py @@ -2,7 +2,6 @@ import sys import torch - sys.path.append("./align/") from align_utils import gen_tensor, parse_create_tensor_args, create_general_test_tensor_torch, BATCH_SIZE, INPUT_SIZE, SEQ_LENGTH diff --git a/tests/align/align_utils.py b/tests/align/align_utils.py index 34f07a4928..368893c5eb 100644 --- a/tests/align/align_utils.py +++ b/tests/align/align_utils.py @@ -102,7 +102,7 @@ def align_tensors(tensor_alignment_data_iter: Iterable[TensorAlignmentData]): ff_tensor = torch.load(ff_filepath).cpu() torch_tensor = torch.load(torch_filepath).cpu() print(f"Checking {tensor_alignment_data.tensor_name} alignment...") - torch.testing.assert_close(ff_tensor, torch_tensor) + torch.testing.assert_close(ff_tensor, torch_tensor, rtol=1e-2, atol=1e-4) def parse_create_tensor_args(): From d7dd6bbe0fbe018623bfde715ab50795f77c9dd6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 1 Jun 2023 04:55:51 +0800 Subject: [PATCH 144/344] [Inference] - Alignment tests (#742) * save output to file * add alignment tests * fix * change conflicting name, add comments * fix typo * formatting * more comments and clean dead code * formatting * fixed issue with length mismatch * fix ci skip * update inf test * add precision selection support in incr decoding --- .github/workflows/gpu-ci-skip.yml | 9 +- include/flexflow/batch_config.h | 6 +- include/flexflow/inference.h | 9 +- inference/.gitignore | 3 +- inference/incr_decoding/incr_decoding.cc | 26 ++- inference/spec_infer/spec_infer.cc | 14 +- src/ops/tree_inc_multihead_self_attention.cu | 2 +- src/runtime/batch_config.cc | 24 +-- src/runtime/beam_search_batch_config.cc | 68 +++++- src/runtime/inference_manager.cc | 30 --- src/runtime/request_manager.cc | 212 ++++++++++++------- src/runtime/tree_verify_batch_config.cc | 50 +++++ tests/inference_tests.sh | 28 ++- 13 files changed, 327 insertions(+), 154 deletions(-) diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index 012302a57f..64907aa10f 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -29,10 +29,17 @@ jobs: needs: gpu-ci-concierge steps: - run: 'echo "No gpu-ci required"' + + inference-tests: + name: Inference Tests + runs-on: ubuntu-20.04 + needs: gpu-ci-concierge + steps: + - run: 'echo "No gpu-ci required"' gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests runs-on: ubuntu-20.04 - needs: gpu-ci-concierge + needs: inference-tests steps: - run: 'echo "No gpu-ci required"' diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 391b514de6..fd0f419db6 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -74,6 +74,7 @@ class TreeVerifyBatchConfig : public BatchConfig { TreeVerifyBatchConfig(); ~TreeVerifyBatchConfig(); InferenceMode get_mode() const; + void print() const; // struct PerTokenInfo : BatchConfig::PerTokenInfo { // int tree_branch_idx; // }; @@ -86,7 +87,7 @@ class TreeVerifyBatchConfig : public BatchConfig { // void compute_tree_branch_indexes(); int num_tokens_to_commit; - CommittedTokensInfo commited_tokens[MAX_NUM_TOKENS]; + 
CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS]; }; struct InferenceResult { @@ -104,6 +105,8 @@ class BeamSearchBatchConfig : public BatchConfig { void print() const; bool done() const; + int max_beam_depth_all_requests() const; + int current_depth_all_requests() const; size_t beam_width; size_t target_iterations; @@ -111,7 +114,6 @@ class BeamSearchBatchConfig : public BatchConfig { static int const MAX_BEAM_DEPTH = 8; struct BeamSearchPerRequestInfo { - bool request_completed; int beam_size; int current_depth = -1; int max_depth = MAX_BEAM_DEPTH; diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 318dd7c9a3..8825a79283 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -78,7 +78,9 @@ class RequestManager { public: using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; - RequestManager(Tokenizer *tokenizer, bool verbose = false); + RequestManager(Tokenizer *tokenizer, + bool verbose = false, + std::string output_filepath = ""); RequestManager(); size_t get_num_processed_requests(); RequestGuid register_new_request(std::string const &prompt, @@ -134,6 +136,7 @@ class RequestManager { private: Tokenizer *tokenizer; bool verbose; + std::string output_filepath; std::queue pending_request_queue; std::unordered_map running_request_queue; std::mutex request_queue_mutex; @@ -143,11 +146,11 @@ class RequestManager { std::unordered_map>> - dfs_tree_inputs; + dfs_tree_inputs_map; // std::unordered_map beam_trees_v2; // TODO: cache config info for Verify/Beam exchange: Beam Width, Beam Depth, - // Commited Tokens + // Committed Tokens std::unordered_map>> committed_tokens; // Performance profiling diff --git a/inference/.gitignore b/inference/.gitignore index 93699cdd9f..05ccb57cd3 100644 --- a/inference/.gitignore +++ b/inference/.gitignore @@ -1,3 +1,4 @@ weights tokenizer -prompt \ No newline at end of file +prompt +output diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index fb6269c568..77dabeb84d 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -29,6 +29,7 @@ struct FilePaths { std::string llm_config_file_path; std::string prompt_file_path; std::string tokenizer_file_path; + std::string output_file_path; }; enum ModelType { UNKNOWN, LLAMA, OPT }; @@ -36,7 +37,8 @@ enum ModelType { UNKNOWN, LLAMA, OPT }; void parse_input_args(char **argv, int argc, FilePaths &paths, - ModelType &llm_model_type) { + ModelType &llm_model_type, + bool &use_full_precision) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -74,6 +76,15 @@ void parse_input_args(char **argv, paths.tokenizer_file_path = std::string(argv[++i]); continue; } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } } } @@ -84,11 +95,12 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; FilePaths file_paths; ModelType model_type; + bool use_full_precision = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args(argv, argc, file_paths, model_type); + parse_input_args(argv, argc, file_paths, model_type, use_full_precision); assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ 
-118,7 +130,9 @@ void FlexFlow::top_level_task(Task const *task, InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); RequestManager rm((model_type == ModelType::LLAMA) ? (Tokenizer *)sp_tokenizer - : (Tokenizer *)opt_tokenizer); + : (Tokenizer *)opt_tokenizer, + /*verbose*/ false, + file_paths.output_file_path); int total_num_requests = 0; { using json = nlohmann::json; @@ -143,7 +157,8 @@ void FlexFlow::top_level_task(Task const *task, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, ffconfig.workersPerNode * ffconfig.numNodes, - INC_DECODING_MODE); + INC_DECODING_MODE, + use_full_precision); } else { assert(model_type == ModelType::OPT); OPT::create_opt_model(model, @@ -151,7 +166,8 @@ void FlexFlow::top_level_task(Task const *task, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, ffconfig.workersPerNode * ffconfig.numNodes, - INC_DECODING_MODE); + INC_DECODING_MODE, + use_full_precision); } BatchConfig bc; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index f7f7e70543..8df4cf4028 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -31,6 +31,7 @@ struct FilePaths { std::vector ssm_config_file_paths; std::string prompt_file_path; std::string tokenizer_file_path; + std::string output_file_path; }; enum ModelType { UNKNOWN, LLAMA, OPT }; @@ -110,6 +111,11 @@ void parse_input_args(char **argv, paths.tokenizer_file_path = std::string(argv[++i]); continue; } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; @@ -177,7 +183,9 @@ void FlexFlow::top_level_task(Task const *task, InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); RequestManager rm((model_types.llm_model_type == ModelType::LLAMA) ? 
(Tokenizer *)sp_tokenizer - : (Tokenizer *)opt_tokenizer); + : (Tokenizer *)opt_tokenizer, + /*verbose*/ false, + file_paths.output_file_path); int total_num_requests = 0; { using json = nlohmann::json; @@ -244,12 +252,12 @@ void FlexFlow::top_level_task(Task const *task, break; } while (true) { - depth = beam_bc.beamRequestsInfo[0].current_depth; + depth = beam_bc.current_depth_all_requests(); FutureMap fm = im.inference(&beam_model, 0, beam_bc); assert(fm.get_future_map_domain().get_volume() == 1); Future future = fm.get_future(0); BeamInferenceResult beam_ir = future.get_result(); - if (depth - 1 >= BeamSearchBatchConfig::MAX_BEAM_DEPTH) { + if (depth - 1 >= beam_bc.max_beam_depth_all_requests()) { break; } else { beam_bc = rm.prepare_next_batch_beam(beam_bc, beam_ir); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index b9bfc5b6a3..cc2c6e0bfb 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -464,7 +464,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache cudaMemcpyAsync(m->committed_token_infos, - &(bc->commited_tokens), + &(bc->committed_tokens), bc->MAX_NUM_TOKENS * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 4d81616dc3..c0e665b613 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -145,6 +145,8 @@ int BatchConfig::num_active_tokens() const { } void BatchConfig::print() const { + std::cout << "@@@@@@@@@@@@@@ Batch Config (mode " << get_mode() + << ") @@@@@@@@@@@@@@" << std::endl; std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; std::cout << "Number of tokens: " << num_tokens << std::endl; @@ -176,26 +178,8 @@ void BatchConfig::print() const { << std::endl; std::cout << " Token id: " << tokensInfo[i].token_id << std::endl; } + std::cout << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + << std::endl; } -#ifdef DEADCODE -void TreeVerifyBatchConfig::compute_tree_branch_indexes() { - // Must be called only after setting num_tokens! - auto is_first_token_in_request = [&](int token_index) -> bool { - if (token_index == 0) { - return true; // First entry in tokensInfo is the first in a request. - } - return tokensInfo[token_index].request_index != - tokensInfo[token_index - 1].request_index; - }; - for (int i = 0; i < num_tokens; i++) { - if (is_first_token_in_request(i)) { - tokensInfo[i].tree_branch_idx = 0; - } else { - tokensInfo[i].tree_branch_idx = tokensInfo[i - 1].tree_branch_idx + 1; - } - } -} -#endif - }; // namespace FlexFlow diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 8d4aeeabb0..c177be0681 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -50,17 +50,54 @@ bool BeamSearchBatchConfig::done() const { return current_iteration == target_iterations; } +int BeamSearchBatchConfig::max_beam_depth_all_requests() const { + int max_depth_all_requests = 0; + for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { + if (!request_completed[i] && + beamRequestsInfo[i].max_depth > max_depth_all_requests) { + /* printf("\treq %i has max_depth=%i. 
Increasing max_depth_all_requests " + "from %i\n", + i, + beamRequestsInfo[i].max_depth, + max_depth_all_requests); */ + max_depth_all_requests = beamRequestsInfo[i].max_depth; + } + } + assert(max_depth_all_requests <= BeamSearchBatchConfig::MAX_BEAM_DEPTH); + return max_depth_all_requests; +} + +int BeamSearchBatchConfig::current_depth_all_requests() const { + int current_depth = 0; + for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { + if (!request_completed[i] && + beamRequestsInfo[i].current_depth > current_depth) { + /* printf("\treq %i has current_depth=%i. Increasing " + "current_depth_all_requests from %i\n", + i, + beamRequestsInfo[i].current_depth, + current_depth); */ + current_depth = beamRequestsInfo[i].current_depth; + } + } + assert(current_depth <= BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); + return current_depth; +} + void BeamSearchBatchConfig::print() const { + std::cout << "@@@@@@@@@@@@@@ BeamSearchBatchConfig (mode " << get_mode() + << ") @@@@@@@@@@@@@@" << std::endl; std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; std::cout << "Number of tokens: " << num_tokens << std::endl; std::cout << "Number of requests: " << num_active_requests() << std::endl; std::cout << "Beam width: " << beam_width << std::endl; - std::cout << "Target Iterations" << target_iterations << std::endl; - std::cout << "Current Iterations" << current_iteration << std::endl; + std::cout << "Target Iterations: " << target_iterations << std::endl; + std::cout << "Current Iterations: " << current_iteration << std::endl; std::cout << "Per-request info:\n"; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + // assert(beamRequestsInfo[i].request_completed == request_completed[i]); if (!request_completed[i]) { std::cout << " Request " << i << ":\n"; std::cout << " Token start offset: " @@ -70,8 +107,28 @@ void BeamSearchBatchConfig::print() const { std::cout << " GUID: " << requestsInfo[i].request_guid << std::endl; std::cout << " Max sequence length: " << requestsInfo[i].max_sequence_length << std::endl; - std::cout << " Request completed: " << request_completed[i] + std::cout << " Beam Search Specific: " << std::endl; + std::cout << " beam_size: " << beamRequestsInfo[i].beam_size << std::endl; + std::cout << " current_depth: " + << beamRequestsInfo[i].current_depth << std::endl; + std::cout << " max_depth: " << beamRequestsInfo[i].max_depth + << std::endl; + std::cout << " tokens: "; + for (int j = 0; j < MAX_BEAM_WIDTH; j++) { + std::cout << beamRequestsInfo[i].tokens[j] << ", "; + } + std::cout << std::endl; + std::cout << " probs: "; + for (int j = 0; j < MAX_BEAM_WIDTH; j++) { + std::cout << beamRequestsInfo[i].probs[j] << ", "; + } + std::cout << std::endl; + std::cout << " parent_id: "; + for (int j = 0; j < MAX_BEAM_WIDTH; j++) { + std::cout << beamRequestsInfo[i].parent_id[j] << ", "; + } + std::cout << std::endl; } } @@ -83,10 +140,15 @@ void BeamSearchBatchConfig::print() const { std::cout << " Request index: " << tokensInfo[i].request_index << std::endl; std::cout << " Token id: " << tokensInfo[i].token_id << std::endl; + std::cout << " Beam Search Specific: " << std::endl; + std::cout << " beam_size: " << beamTokenInfo[i].sub_request_index + << std::endl; // std::cout << " Parent token id: " << tokensInfo[i].parent_token_id << // std::endl; std::cout << " Accumulated log prob: " // << tokensInfo[i].cum_log_prob << std::endl; } + std::cout << 
"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + << std::endl; } }; // namespace FlexFlow diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index fd35b9aa76..5a5c57bfea 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -118,36 +118,6 @@ void InferenceManager::compile_model_and_allocate_buffer( tensor_buffer[pt_base] = list; } } -#ifdef DEADCODE - // Set machine_view for batch_tensors in the tensor_buffer - for (int batch_index = 0; batch_index < max_num_inflight_batches; - batch_index++) { - int expert_device_index = 0; - int device_index = batch_index % num_devices; - for (size_t o = 0; o < model->operators.size(); o++) { - Op *op = model->operators[o]; - if (op->op_type == OP_WEIGHT) { - continue; - } - MachineView *view; - if (op->op_type == OP_EXPERTS) { - view = get_machine_view(expert_device_index); - // view = &machine_views[expert_device_index]; - expert_device_index = (expert_device_index + 1) % num_devices; - } else { - // pick mv w startdeviceid = device_index - // view = &machine_views[device_index]; - view = get_machine_view(device_index); - } - for (int i = 0; i < op->numOutputs; i++) { - tensor_buffer[op->outputs[i]][batch_index]->machine_view = *view; - Domain part_domain = - runtime->get_index_space_domain(ctx, op->outputs[i]->parallel_is); - assert(view->get_domain() == part_domain); - } - } - } -#endif } void InferenceManager::init_operators_inference(FFModel *model) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5349ec5439..cf0aeb94de 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -29,9 +29,11 @@ RequestManager::RequestManager() : tokenizer(nullptr), verbose(false), next_available_guid(1000000), num_processed_requests(0) {} -RequestManager::RequestManager(Tokenizer *_tokenizer, bool _verbose) +RequestManager::RequestManager(Tokenizer *_tokenizer, + bool _verbose, + std::string _output_filepath) : tokenizer(_tokenizer), verbose(_verbose), next_available_guid(1000000), - num_processed_requests(0) {} + num_processed_requests(0), output_filepath(_output_filepath) {} RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, @@ -88,7 +90,7 @@ size_t RequestManager::get_num_processed_requests() { BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - // Step 1: use result to update requests + // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; @@ -106,7 +108,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("Output: %s", output.c_str()); } } - // Step 2: preparing the next batch for existing requests + // Step 2: prepare the next batch for existing requests BatchConfig new_bc; for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { if (old_bc.request_completed[i]) { @@ -140,6 +142,26 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, profile_info.finish_time, profile_info.finish_time - profile_info.start_time, total_request_run_time); + // Write output to file if needed: + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath); + if (outputFile.is_open()) { + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i 
< request.tokens.size() - 1) { + outputFile << ","; + } + } + outputFile << std::endl; + outputFile << output; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + // std::cout << "print results: " << std::endl; // for (int i = 0; i < request.tokens.size(); i++) { // std::cout << request.tokens.at(i) << ", "; @@ -219,7 +241,9 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - + if (verbose) { + std::cout << "\n############### prepare_next_batch_beam ###############\n"; + } if (verbose) { std::cout << "print all results" << "\n"; @@ -279,6 +303,8 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.beamRequestsInfo[i].beam_size = old_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].max_depth = + old_bc.beamRequestsInfo[i].max_depth; // do the slot exchange to minimize the cache exchange in kernel. // std::cout << "update metadata" << std::endl; @@ -313,6 +339,12 @@ BeamSearchBatchConfig } } } + if (verbose) { + std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" + << std::endl; + old_bc.print(); + new_bc.print(); + } return new_bc; } @@ -320,7 +352,9 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - + if (verbose) { + std::cout << "\n############### prepare_next_batch_init ###############\n"; + } // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; @@ -345,21 +379,26 @@ BeamSearchBatchConfig } else { committed_tokens.at(guid).clear(); } + // iterate through all the tokens that belong to request i while (result_index < old_bc.num_tokens && old_bc.tokensInfo[result_index].request_index == i) { + // new tokens have not been appended yet, so the last appended token is + // the root of the beam search token tree int root_abs_depth = request.tokens.size() - 1; if (old_bc.tokensInfo[result_index].abs_depth_in_request >= root_abs_depth) { + // append to tree_outputs a pair consisting of (token id, depth) tree_outputs.push_back(std::make_pair( result.token_ids[result_index], old_bc.tokensInfo[result_index].abs_depth_in_request + 1)); - + // append (depth, index of the token in result) to committed_tokens + // array committed_tokens.at(guid).push_back( std::make_pair(old_bc.tokensInfo[result_index].abs_depth_in_request, result_index)); if (verbose) { - std::cout << "Index with old_bacth: " << result_index << std::endl; + std::cout << "Index within old batch: " << result_index << std::endl; printf(" Input: [%d] %d ---> [%d] %d \n", old_bc.tokensInfo[result_index].abs_depth_in_request, old_bc.tokensInfo[result_index].token_id, @@ -379,7 +418,7 @@ BeamSearchBatchConfig } std::vector> verified_tokens = - traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + traverse_verify_tree(guid, dfs_tree_inputs_map.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); // check if the request is finished @@ -387,12 +426,16 @@ BeamSearchBatchConfig request.max_sequence_length) { // Append all verified tokens to the request for (int j = 0; j < verified_tokens.size(); j++) { - request.tokens.push_back(verified_tokens[j].first); + if (verified_tokens[j].second < request.max_sequence_length) { + 
request.tokens.push_back(verified_tokens[j].first); + } } log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", request.guid, request.tokens.size()); + std::string output = tokenizer->Decode(request.tokens); + log_req_mgr.print("Final output: %s", output.c_str()); new_bc.request_completed[i] = true; num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; @@ -409,15 +452,35 @@ BeamSearchBatchConfig profile_info.finish_time - profile_info.start_time, total_request_run_time); + // Write output to file if needed: + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath); + if (outputFile.is_open()) { + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } + } + outputFile << std::endl; + outputFile << output; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + beam_trees[i] = BeamTree{}; - dfs_tree_inputs.erase( + dfs_tree_inputs_map.erase( request.guid); // delete the old input tree from cache continue; } new_bc.request_completed[i] = false; - // Normal Reuqest Info + // Normal Request Info new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = @@ -425,12 +488,14 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); // TODO: Beam Request Info, missing from VerifyTreeBatchConfig + int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].token_start_offset - + verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].beam_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH; new_bc.beamRequestsInfo[i].max_depth = - BeamSearchBatchConfig::MAX_BEAM_DEPTH; - new_bc.beamRequestsInfo[i].request_completed = false; + std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; @@ -513,13 +578,23 @@ BeamSearchBatchConfig } } } + + if (verbose) { + std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" + << std::endl; + old_bc.print(); + new_bc.print(); + } return new_bc; } TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( BeamSearchBatchConfig const &old_bc) { const std::lock_guard lock(request_queue_mutex); - + if (verbose) { + std::cout + << "\n############### prepare_next_batch_verify ###############\n"; + } TreeVerifyBatchConfig new_bc; new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; @@ -556,6 +631,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; // TODO: Add prompt token first in first verify iteration if (request.tokens.size() == request.initial_len) { + // Initialization (prompt) phase for (int j = 0; j < request.initial_len; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; @@ -565,13 +641,14 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].num_tokens_in_batch++; } if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - assert(false); + assert(false && + "Exceeding the space available in the TreeVerify 
batch"); break; } new_bc.requestsInfo[i].token_start_offset = 0; } else { - // Only add the last committed token + // Incremental phase: only add the last committed token new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = @@ -581,7 +658,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].num_tokens_in_batch++; if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - assert(false); + assert(false && + "Exceeding the space available in the TreeVerify batch"); break; } @@ -603,9 +681,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (dfs_tree_inputs.at(0).second == request.initial_len + committed_tokens.at(guid).size() - 1) { for (int j = 0; j < request.initial_len; j++) { - new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = j; - new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; - new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = j; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; if (verbose) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << j @@ -616,10 +695,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } else { // only add the root token auto committed_token = committed_tokens.at(guid).at(0); - new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.second; - new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; - new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; if (verbose) { std::cout << new_bc.num_tokens_to_commit @@ -656,21 +735,23 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (committed_tokens.find(guid) != committed_tokens.end()) { // if (j == 1) { // auto committed_token = committed_tokens.at(guid).at(0); - // new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = // committed_token.second; - // new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = - // i; new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth - // = committed_token.first; std:: cout << new_bc.num_tokens_to_commit + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index + // = i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + // committed_token.first; std:: cout << new_bc.num_tokens_to_commit // << "- committed_token.token_depth: " << committed_token.first << // ", token_index: " << committed_token.second << std::endl; // new_bc.num_tokens_to_commit++; // } if (j < committed_tokens.at(guid).size()) { auto committed_token = committed_tokens.at(guid).at(j); - new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_index = + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.second; - new_bc.commited_tokens[new_bc.num_tokens_to_commit].request_index = i; - new_bc.commited_tokens[new_bc.num_tokens_to_commit].token_depth = + 
new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; if (verbose) { std::cout << new_bc.num_tokens_to_commit @@ -696,6 +777,13 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } + if (verbose) { + std::cout << "prepare_next_batch_verify OLD vs NEW batchconfigs below:" + << std::endl; + old_bc.print(); + new_bc.print(); + } + return new_bc; } @@ -765,9 +853,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, result.parent_id[result_index]; if (verbose) { - std::cout << "tree value: " << depth << "token: " + std::cout << "tree value: " << depth << " token: " << beam_trees[index].treeLayers[depth].tokens[beam_id] - << "result tokens: " << result.token_ids[result_index]; + << " result tokens: " << result.token_ids[result_index] + << std::endl; } result_index += 1; } @@ -863,7 +952,7 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, std::cout << "-----------after parent id exchange-----------" << std::endl; for (int j = 0; j < beam_size; j++) { std::cout << "after request id: " << request_index << "beam id = " << j - << "parnt: " + << "parent: " << new_bc.beamRequestsInfo[request_index].parent_id[j] << "token: " << new_bc.beamRequestsInfo[request_index].tokens[j] << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j] @@ -936,47 +1025,6 @@ bool PreOrder( return flag; } -#ifdef DEADCODE -TreeVerifyBatchConfig RequestManager::convert_beam_to_tree_batch_config( - BeamSearchBatchConfig const &beam_bc) { - TreeVerifyBatchConfig tree_bc; - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { - if (beam_bc.request_completed[i]) { - continue; - } - // We don't modify requests during the conversion - tree_bc.request_completed[i] = beam_bc.request_completed[i]; - BeamTree const &tree = beam_trees[i]; - // token, index - // todo make this one global for different stages - std::vector> serializedTree; - PreOrder(tree, - beam_bc.beamRequestsInfo[i].max_depth, - 0, - beam_bc.beamRequestsInfo[i].beam_size, - 0, - serializedTree, - verbose); - tree_bc.requestsInfo[i].request_guid = beam_bc.requestsInfo[i].request_guid; - tree_bc.requestsInfo[i].max_sequence_length = - beam_bc.requestsInfo[i].max_sequence_length; - tree_bc.requestsInfo[i].token_start_offset = serializedTree[0].second; - tree_bc.requestsInfo[i].num_tokens_in_batch = 0; - - for (int k = 0; k < serializedTree.size(); k++) { - assert(tree_bc.num_tokens < BatchConfig::MAX_NUM_TOKENS); - tree_bc.tokensInfo[tree_bc.num_tokens].request_index = i; - tree_bc.tokensInfo[tree_bc.num_tokens].abs_depth_in_request = - serializedTree[k].second; - tree_bc.tokensInfo[tree_bc.num_tokens].token_id = serializedTree[k].first; - tree_bc.num_tokens++; - tree_bc.requestsInfo[i].num_tokens_in_batch++; - } - } - return tree_bc; -} -#endif - std::vector> RequestManager::traverse_verify_tree( size_t guid, @@ -994,14 +1042,19 @@ std::vector> outputSerializedTree.size()); log_req_mgr.print("========Input============"); + // inputSerializedTree is the dfs_tree_inputs_map[guid] array og (token id, + // depth) pairs for (auto const &pair : inputSerializedTree) { log_req_mgr.print("(%d, %d)", pair.first, pair.second); } log_req_mgr.print("========Output============"); + // outputSerializedTree is an array of (token id, depth + 1) pairs for (auto const &pair : outputSerializedTree) { log_req_mgr.print("(%d, %d)", pair.first, pair.second); } 
log_req_mgr.print("========Committed============"); + // committed_tokens[guid] is an array of (depth, result_index) pairs for the + // given request for (auto const &pair : committed_tokens.at(guid)) { log_req_mgr.print("(%d, %d)", pair.first, pair.second); } @@ -1092,12 +1145,13 @@ std::vector> // std::cout << "Done printing serialized tree, " // << old_bc.requestsInfo[request_index].request_guid << "\n"; - if (dfs_tree_inputs.find(old_bc.requestsInfo[request_index].request_guid) != - dfs_tree_inputs.end()) { - dfs_tree_inputs[old_bc.requestsInfo[request_index].request_guid] = + if (dfs_tree_inputs_map.find( + old_bc.requestsInfo[request_index].request_guid) != + dfs_tree_inputs_map.end()) { + dfs_tree_inputs_map[old_bc.requestsInfo[request_index].request_guid] = serializedTree; } else { - dfs_tree_inputs.insert(std::make_pair( + dfs_tree_inputs_map.insert(std::make_pair( old_bc.requestsInfo[request_index].request_guid, serializedTree)); } diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 76a8025507..78eff184c4 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -30,4 +30,54 @@ InferenceMode TreeVerifyBatchConfig::get_mode() const { return TREE_VERIFY_MODE; } +void TreeVerifyBatchConfig::print() const { + std::cout << "@@@@@@@@@@@@@@ TreeVerifyBatchConfig (mode " << get_mode() + << ") @@@@@@@@@@@@@@" << std::endl; + std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; + std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; + std::cout << "Number of tokens: " << num_tokens << std::endl; + std::cout << "Number of requests: " << num_active_requests() << std::endl; + // std::cout << "Cached results: " << cached_results << std::endl; + + std::cout << "Per-request info:\n"; + for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + if (!request_completed[i]) { + std::cout << " Request " << i << ":\n"; + std::cout << " Token start offset: " + << requestsInfo[i].token_start_offset << std::endl; + std::cout << " Number of tokens in batch: " + << requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << " GUID: " << requestsInfo[i].request_guid << std::endl; + std::cout << " Max sequence length: " + << requestsInfo[i].max_sequence_length << std::endl; + std::cout << " Request completed: " << request_completed[i] + << std::endl; + } + } + + std::cout << "Per-token info:\n"; + for (int i = 0; i < num_tokens; i++) { + std::cout << " Token " << i << ":\n"; + std::cout << " Absolute depth in request: " + << tokensInfo[i].abs_depth_in_request << std::endl; + std::cout << " Request index: " << tokensInfo[i].request_index + << std::endl; + std::cout << " Token id: " << tokensInfo[i].token_id << std::endl; + } + + std::cout << "Tokens to commit info:\n"; + for (int i = 0; i < num_tokens_to_commit; i++) { + std::cout << " Token " << i << ":\n"; + std::cout << " token_index: " << committed_tokens[i].token_index + << std::endl; + std::cout << " request_index: " << committed_tokens[i].request_index + << std::endl; + std::cout << " token_depth: " << committed_tokens[i].token_depth + << std::endl; + } + + std::cout << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + << std::endl; +} + }; // namespace FlexFlow diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index fa44446a3d..d82b5e26fc 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -31,31 +31,47 @@ copy_embedding_weights mkdir -p ../inference/prompt echo '["Give three tips for 
staying healthy."]' > ../inference/prompt/test.json +# Create output folder +mkdir -p ../inference/output + ############################################################################################### ############################ Speculative inference tests ###################################### ############################################################################################### # LLAMA -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama.txt # OPT -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt.txt ############################################################################################### ############################ Incremental decoding tests ####################################### ############################################################################################### # LLAMA (small model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M.txt # LLAMA (big model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json 
+../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B.txt # OPT (small model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M.txt # OPT (big model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B.txt + + +############################################################################################### +################################## Alignment tests ############################################ +############################################################################################### + +diff ../inference/output/incr_decoding_llama_7B.txt ../inference/output/spec_inference_llama.txt +diff ../inference/output/incr_decoding_opt_6B.txt ../inference/output/spec_inference_opt.txt + + +############################################################################################### +###################################### Cleanup ################################################ +############################################################################################### # Clean up after test cleanup From 6c13936d3bb967c3b41ea4b26203cff8d9300f4c Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Wed, 31 May 2023 17:08:51 -0400 Subject: [PATCH 145/344] Update README.md (#744) * Update README.md * update readme * fix --- .github/README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/README.md b/.github/README.md index 3f22993b00..59377e308e 100644 --- a/.github/README.md +++ b/.github/README.md @@ -54,7 +54,7 @@ class RequestManager { For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. 
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision ``` ### Tokenizers @@ -63,15 +63,18 @@ SpecInfer supports two tokenizers: * The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentencepiece tokenizer from LLAMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). If you are using our LLAMA-160M weights for the demo, however, you should use the tokenizer from the [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model) HuggingFace repo. * The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. +### Mixed-precision support +SpecInfer now supports single-precision floating points and half-precision floating points. By default we use half-precision. Add `--use-full-precision` to the command line to run the demo with single-precision, please make sure to use the correct weight files in the form below. + ### LLM Weights The weight files used in our demo are extracted from HuggingFace, and stored in our AWS S3 bucket. 
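For reference, a minimal sketch of how one of the archives listed below could be fetched and unpacked into the layout the test script uses, assuming the `aws` CLI is available and the bucket allows anonymous reads; the destination paths are illustrative, not part of the patch:

```bash
# Sketch only: download and unpack the full-precision LLaMA-160M weights
# (anonymous bucket access and the local directory layout are assumptions).
mkdir -p ../inference/weights
aws s3 cp s3://specinfer/weights/llama_160M_weights.tar.gz ../inference/weights/ --no-sign-request
tar -xzf ../inference/weights/llama_160M_weights.tar.gz -C ../inference/weights/
```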
-| Model | Model id on Hugging Face | Storage Location | -| :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | -| LLaMA-190M | JackFram/llama-160m | s3://specinfer/weights/llama_160M_weights.tar.gz | -| OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | -| OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125m_native.tar.gz | +| Model | Model id on Hugging Face | Storage Location (single precision) | Storage Location (half precision) | +| :---- | :---- | :---- | :---- | +| LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | s3://specinfer/half_weights/llama_7B_weights.tar.gz +| LLaMA-190M | JackFram/llama-160m | s3://specinfer/weights/llama_160M_weights.tar.gz | s3://specinfer/half_weights/llama_160M_weights.tar.gz +| OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | s3://specinfer/half_weights/opt_6B_weights.tar.gz +| OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125M_weights.tar.gz | s3://specinfer/half_weights/opt_125M_weights.tar.gz You can use [this script](../inference/utils/download_llama_weights.py) to automatically download and convert the weights of a HuggingFace LLAMA LLM and a LLAMA SSM to the SpecInfer weight format. The script also downloads the LLAMA tokenizer. If you would like to try the OPT model instead, use [this script](../inference/utils/download_opt_weights.py) to download (and convert) the OPT weights and tokenizer. From d8072ab6efe7bae43058c6a3ffeb94499c804124 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 1 Jun 2023 16:09:09 +0000 Subject: [PATCH 146/344] fix --- include/flexflow/inference.h | 1 + inference/spec_infer/spec_infer.cc | 4 +++- src/runtime/request_manager.cc | 23 +++++++++++++++++++++-- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 8825a79283..8ba110583c 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -117,6 +117,7 @@ class RequestManager { &inputSerializedTree, std::vector> const &outputSerializedTree); + int get_requests_init_length(BeamSearchBatchConfig const &old_bc); // TreeVerifyBatchConfig // convert_beam_to_tree_batch_config(BeamSearchBatchConfig const diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8df4cf4028..3f08bf27fb 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -257,7 +257,9 @@ void FlexFlow::top_level_task(Task const *task, assert(fm.get_future_map_domain().get_volume() == 1); Future future = fm.get_future(0); BeamInferenceResult beam_ir = future.get_result(); - if (depth - 1 >= beam_bc.max_beam_depth_all_requests()) { + if (depth - 1 >= beam_bc.max_beam_depth_all_requests() || + depth + 1 + rm.get_requests_init_length(beam_bc) >= + BatchConfig::MAX_NUM_TOKENS) { break; } else { beam_bc = rm.prepare_next_batch_beam(beam_bc, beam_ir); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index cf0aeb94de..c5b874c798 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -236,6 +236,25 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, /* ----- Speculative Inference Specific functions ----- */ +int RequestManager::get_requests_init_length( + BeamSearchBatchConfig const &old_bc) { + int init_length = 0; + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if 
(old_bc.request_completed[i]) { + continue; + } + Request &request = + running_request_queue[old_bc.requestsInfo[i].request_guid]; + if (old_bc.requestsInfo[i].token_start_offset + 1 >= + request.tokens.size()) { + init_length = 0; + } else if (request.initial_len > init_length) { + init_length = request.initial_len; + } + } + return init_length; +} + // update beam search metadata BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, @@ -247,7 +266,7 @@ BeamSearchBatchConfig if (verbose) { std::cout << "print all results" << "\n"; - for (int i = 0; i < 40; i++) { + for (int i = 0; i < 64; i++) { std::cout << result.token_ids[i] << ", "; } std::cout << "Current Beam Depth: " @@ -304,7 +323,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].beam_size = old_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; + old_bc.beamRequestsInfo[i].current_depth; // do the slot exchange to minimize the cache exchange in kernel. // std::cout << "update metadata" << std::endl; From 9f2688d4357ca3d30172729deb6318c321442fb8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 4 Jun 2023 18:07:46 +0800 Subject: [PATCH 147/344] [Inference] - Add half precision & HuggingFace alignment tests + Speed tests (#749) * add support for downloading mixed precision llama/opt weights * fix * update test script to also run half precision tests * disable workflow for inference PRs * add verbose option * linting * copy opt weights in download weights script * add alignment tests with huggingface (llama) * fix, add diff to test script * fix * add opt tests * comment out tests not passing * add e2e latency to output files * add speed tests * shellcheck * shellcheck * fix * fix * linting * fix --- .github/workflows/gpu-ci-skip.yml | 1 + .github/workflows/gpu-ci.yml | 2 + inference/incr_decoding/incr_decoding.cc | 14 ++- inference/models/llama.cc | 2 +- inference/spec_infer/spec_infer.cc | 14 ++- inference/utils/download_llama_weights.py | 13 ++- inference/utils/download_opt_weights.py | 16 ++- src/ops/kernels/rms_norm_kernels.cu | 10 +- src/ops/kernels/softmax.cu | 1 - src/ops/tree_inc_multihead_self_attention.cu | 2 +- src/runtime/request_manager.cc | 9 ++ tests/inference/huggingface_inference.py | 59 +++++++++++ tests/inference_tests.sh | 104 ++++++++++++++++--- 13 files changed, 215 insertions(+), 32 deletions(-) create mode 100644 tests/inference/huggingface_inference.py diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index 64907aa10f..d8e5353e79 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -40,6 +40,7 @@ jobs: gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests runs-on: ubuntu-20.04 + if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} needs: inference-tests steps: - run: 'echo "No gpu-ci required"' diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index b24e7236a8..00b3138e00 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -181,6 +181,8 @@ jobs: gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests runs-on: self-hosted + #skip this time-consuming test for PRs to the inference branch + if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} needs: inference-tests container: image: ghcr.io/flexflow/flexflow-environment-cuda:latest diff --git a/inference/incr_decoding/incr_decoding.cc 
b/inference/incr_decoding/incr_decoding.cc index 77dabeb84d..895eafb601 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -38,7 +38,8 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, ModelType &llm_model_type, - bool &use_full_precision) { + bool &use_full_precision, + bool &verbose) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -85,6 +86,11 @@ void parse_input_args(char **argv, use_full_precision = true; continue; } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } } } @@ -96,11 +102,13 @@ void FlexFlow::top_level_task(Task const *task, FilePaths file_paths; ModelType model_type; bool use_full_precision = false; + bool verbose = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args(argv, argc, file_paths, model_type, use_full_precision); + parse_input_args( + argv, argc, file_paths, model_type, use_full_precision, verbose); assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -131,7 +139,7 @@ void FlexFlow::top_level_task(Task const *task, RequestManager rm((model_type == ModelType::LLAMA) ? (Tokenizer *)sp_tokenizer : (Tokenizer *)opt_tokenizer, - /*verbose*/ false, + /*verbose*/ verbose, file_paths.output_file_path); int total_num_requests = 0; { diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 6a5070790c..0344f19b8a 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -215,7 +215,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.dim, llama_config.dim / llama_config.n_heads); fileloader.load_weights(&ff, weights_layers); - std::cout << "------load wieght finished----------" << std::endl; + std::cout << "------load weight finished----------" << std::endl; // init operators im.init_operators_inference(&ff); diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8df4cf4028..896e4b082e 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -45,7 +45,8 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, ModelTypes &model_types, - bool &use_full_precision) { + bool &use_full_precision, + bool &verbose) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -120,6 +121,11 @@ void parse_input_args(char **argv, use_full_precision = true; continue; } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } } } @@ -131,11 +137,13 @@ void FlexFlow::top_level_task(Task const *task, FilePaths file_paths; ModelTypes model_types; bool use_full_precision = false; + bool verbose = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args(argv, argc, file_paths, model_types, use_full_precision); + parse_input_args( + argv, argc, file_paths, model_types, use_full_precision, verbose); if (file_paths.ssm_weight_file_paths.size() == 0) { assert(false && "SpecInfer needs at least one SSM for speculative inference"); @@ -184,7 +192,7 @@ void FlexFlow::top_level_task(Task const *task, RequestManager rm((model_types.llm_model_type == ModelType::LLAMA) ? 
(Tokenizer *)sp_tokenizer : (Tokenizer *)opt_tokenizer, - /*verbose*/ false, + /*verbose*/ verbose, file_paths.output_file_path); int total_num_requests = 0; { diff --git a/inference/utils/download_llama_weights.py b/inference/utils/download_llama_weights.py index bbf4f349ee..1cd6928080 100644 --- a/inference/utils/download_llama_weights.py +++ b/inference/utils/download_llama_weights.py @@ -2,8 +2,17 @@ import os import requests +import argparse from transformers import AutoModelForCausalLM +# You can pass the --use-full-precision flag to use the full-precision weight. By default, we use half precision. +parser = argparse.ArgumentParser() +parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") +args = parser.parse_args() +if not args.use_full_precision: + import torch + torch.set_default_tensor_type(torch.HalfTensor) + # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) @@ -33,12 +42,12 @@ def convert_hf_model(model, dst_folder): # Download and convert big model weights model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") -dst_folder="../weights/llama_7B_weights" +dst_folder="../weights/llama_7B_weights" if args.use_full_precision else "../weights/llama_7B_weights_half" convert_hf_model(model, dst_folder) # Download and convert small model weights model = AutoModelForCausalLM.from_pretrained("JackFram/llama-160m") -dst_folder="../weights/llama_160M_weights" +dst_folder="../weights/llama_160M_weights" if args.use_full_precision else "../weights/llama_160M_weights_half" convert_hf_model(model, dst_folder) # Download tokenizer diff --git a/inference/utils/download_opt_weights.py b/inference/utils/download_opt_weights.py index ceade81e65..de42689202 100644 --- a/inference/utils/download_opt_weights.py +++ b/inference/utils/download_opt_weights.py @@ -2,8 +2,18 @@ import os import requests +import argparse +import shutil from transformers import AutoModelForCausalLM +# You can pass the --use-full-precision flag to use the full-precision weight. By default, we use half precision. 
+parser = argparse.ArgumentParser() +parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") +args = parser.parse_args() +if not args.use_full_precision: + import torch + torch.set_default_tensor_type(torch.HalfTensor) + # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) @@ -23,15 +33,17 @@ def convert_hf_model(model, dst_folder): .replace("out_proj", "wo") ) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + # copy embedding weights + shutil.copy(os.path.join(dst_folder, "embed_tokens_weight"), os.path.join(dst_folder, "embed_tokens_weight_lm_head")) # Download and convert big model weights model = AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b") -dst_folder="../weights/opt_6B_weights" +dst_folder="../weights/opt_6B_weights" if args.use_full_precision else "../weights/opt_6B_weights_half" convert_hf_model(model, dst_folder) # Download and convert small model weights model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") -dst_folder="../weights/opt_125M_weights" +dst_folder="../weights/opt_125M_weights" if args.use_full_precision else "../weights/opt_125M_weights_half" convert_hf_model(model, dst_folder) # Download tokenizer files diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index bf5b1021ae..44e6288529 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -93,10 +93,11 @@ __global__ void long long const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum(sum, v_shared); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, + v_shared); // use BlockReduceSum() to sum X_ij^2 if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); + rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); } } @@ -130,10 +131,7 @@ void forward_kernel(RMSNormMeta const *m, int parallelism = m->batch_size * m->in_dim; RowwiseRootMeanSquareKernel <<batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->in_dim, - m->eps, - input_ptr, - static_cast(m->rms_ptr)); + m->in_dim, m->eps, input_ptr, static_cast(m->rms_ptr)); NormKernel<<batch_size, kCUDANumThreads, 0, stream>>>( m->in_dim, input_ptr, diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 80683e7a2d..15130c19a7 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -63,7 +63,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, log_measure.debug( "%s [Softmax] forward time = %.2fms\n", m->op_name, elapsed); } - } template diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index cc2c6e0bfb..b591e19ed8 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -137,7 +137,7 @@ __global__ void update_tree_branch_kv_cache( (i / proj_size) % num_tokens_in_branch; // index in the tree branch int head_idx = i / (proj_size * num_tokens_in_branch); - token_idx += processed_tokens_in_batch; // get index in the whole batch + token_idx += processed_tokens_in_batch; // get index in the whole batch int qkv_block_size = (qProjSize + kProjSize + vProjSize) * total_tokens_in_batch; // skip over previous heads int current_head_block_size = diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index cf0aeb94de..c850e2173f 100644 --- a/src/runtime/request_manager.cc +++ 
b/src/runtime/request_manager.cc @@ -16,6 +16,7 @@ #include "flexflow/inference.h" #include "flexflow/parallel_ops/parallel_op.h" #include "flexflow/tokenizers.h" +#include #include #include @@ -146,6 +147,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath); if (outputFile.is_open()) { + outputFile << "end-to-end latency: " << std::fixed + << std::setprecision(3) << total_request_run_time + << std::endl; + outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; if (i < request.tokens.size() - 1) { @@ -456,6 +461,10 @@ BeamSearchBatchConfig if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath); if (outputFile.is_open()) { + outputFile << "end-to-end latency: " << std::fixed + << std::setprecision(3) << total_request_run_time + << std::endl; + outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; if (i < request.tokens.size() - 1) { diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py new file mode 100644 index 0000000000..577121a8a8 --- /dev/null +++ b/tests/inference/huggingface_inference.py @@ -0,0 +1,59 @@ +import argparse +import json +import os +from transformers import AutoModelForCausalLM +from transformers import AutoTokenizer + +def main(): + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + # Parse command line arguments + parser = argparse.ArgumentParser() + parser.add_argument('--model-name', type=str, required=True) + parser.add_argument('--tokenizer-model-name', type=str, required=True) + parser.add_argument('--max-length', type=int, default=128) + parser.add_argument('--prompt-file', type=str, required=True) + parser.add_argument('--output-file', type=str, required=True) + parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument("--gpu", action="store_true", help="Run on GPU") + args = parser.parse_args() + # Check if max-length is greater than 0 + if args.max_length <= 0: + print("Error: max-length must be greater than 0.") + return + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return + + # Read prompt-file into a list of strings + with open(args.prompt_file, 'r') as f: + try: + prompt_list = json.load(f) + except json.JSONDecodeError: + print(f"Error: Unable to parse {args.prompt_file} as JSON.") + return + + # Set default tensor type depending on argument indicating the float type to use + if not args.use_full_precision: + import torch + torch.set_default_tensor_type(torch.HalfTensor) + + # Run huggingface model + device = "cuda" if args.gpu else "cpu" + model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model_name) + with open(args.output_file, 'w') as f: + for i, prompt in enumerate(prompt_list): + batch = tokenizer(prompt_list, return_tensors="pt", add_special_tokens=True).to(device) + generated = model.generate(batch["input_ids"], max_length=args.max_length) + out = tokenizer.decode(generated[0]) + # Write output to file + out_str = out if i == (len(prompt_list) - 1) else out + "\n" + f.write(out_str) + +if __name__ == '__main__': + main() diff --git a/tests/inference_tests.sh 
b/tests/inference_tests.sh index d82b5e26fc..46fa70c688 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -3,12 +3,7 @@ set -x set -e cleanup() { - rm -rf ../inference/prompt ../inference/weights ../inference/tokenizer -} - -copy_embedding_weights(){ - cp ../inference/weights/opt_6B_weights/embed_tokens_weight ../inference/weights/opt_6B_weights/embed_tokens_weight_lm_head - cp ../inference/weights/opt_125M_weights/embed_tokens_weight ../inference/weights/opt_125M_weights/embed_tokens_weight_lm_head + rm -rf ../inference/prompt ../inference/weights ../inference/tokenizer ../inference/output } # Cd into directory holding this script @@ -20,12 +15,11 @@ cleanup # Update the transformers library to support the LLAMA model pip3 install --upgrade transformers -# Download the weights +# Download the weights in both half and full precision python3 ../inference/utils/download_llama_weights.py +python3 ../inference/utils/download_llama_weights.py --use-full-precision python3 ../inference/utils/download_opt_weights.py - -# because huggingface reuse a weight in embedding and final linear -copy_embedding_weights +python3 ../inference/utils/download_opt_weights.py --use-full-precision # Create test prompt file mkdir -p ../inference/prompt @@ -40,9 +34,13 @@ mkdir -p ../inference/output # LLAMA ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama.txt +# LLAMA (half precision) +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_half.txt # OPT ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt.txt +# OPT (half precision) +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half.txt ############################################################################################### ############################ Incremental decoding tests ####################################### @@ -50,24 +48,104 @@ mkdir -p ../inference/output # LLAMA 
(small model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M.txt +# LLAMA (small model, half precision) +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half.txt # LLAMA (big model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B.txt +# LLAMA (big model, half precision) +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half.txt # OPT (small model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M.txt +# OPT (small model, half precision) +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half.txt # OPT (big model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B.txt +# OPT (big model, half precision) +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half.txt ############################################################################################### -################################## Alignment tests ############################################ +############################### Alignment and Speed tests ##################################### ############################################################################################### -diff 
../inference/output/incr_decoding_llama_7B.txt ../inference/output/spec_inference_llama.txt -diff ../inference/output/incr_decoding_opt_6B.txt ../inference/output/spec_inference_opt.txt +############ Alignment between speculative inference and incremental decoding ################# +# Full precision +diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B.txt") <(tail -n +2 "../inference/output/spec_inference_llama.txt") +diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B.txt") <(tail -n +2 "../inference/output/spec_inference_opt.txt") +# Half precision +#diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B_half.txt") <(tail -n +2 "../inference/output/spec_inference_llama_half.txt") +#diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B_half.txt" ) <(tail -n +2 "../inference/output/spec_inference_opt_half.txt") + +# Speed test: speculative inference should be at very least 1.5x faster than incremental decoding +function compare_speed_spec_infer_incr_decoding { + local incrDec_file="$1" + local specInf_file="$2" + + # Read the float numbers from the first line of the files + incrDec=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$incrDec_file") + specInf=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$specInf_file") + + if ! command -v bc &> /dev/null; then + echo "bc is not installed. Installing..." + sudo apt-get install -y bc + fi + + # Perform the comparison + threshold=$(bc <<< "$specInf * 1.5") + if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + #echo "The latency in $specInf_file is at least 1.5x smaller than the latency from $incrDec_file." + : + else + echo "Error: The latency in $specInf_file is not at least 1.5x smaller than the latency in $incrDec_file!" + exit 1 + fi +} +# Full precision +compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B.txt" "../inference/output/spec_inference_llama.txt" +compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B.txt" "../inference/output/spec_inference_opt.txt" +# Half precision +#compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B_half.txt" "../inference/output/spec_inference_llama_half.txt" +#compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B_half.txt" "../inference/output/spec_inference_opt_half.txt" + +######################### Alignment tests with HuggingFace #################################### +pip3 install protobuf==3.20.3 + +# LLAMA (small model, full precision) +python3 ./inference/huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu + +# LLAMA (small model, half precision) +python3 ./inference/huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu + +# LLAMA (big model, full precision) +python3 ./inference/huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" + +# LLAMA (big model, half precision) +python3 ./inference/huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name 
"JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu + +# OPT (small model, full precision) +python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 127 + +# OPT (small model, half precision) +python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 127 + +# OPT (big model, full precision) +#python3 ./inference/huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 127 + +# OPT (big model, half precision) +#python3 ./inference/huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 127 + +diff <(tail -n +2 "../inference/output/huggingface_llama_160M.txt") <(tail -n +4 "../inference/output/incr_decoding_llama_160M.txt") +diff <(tail -n +2 "../inference/output/huggingface_llama_160M_half.txt") <(tail -n +4 "../inference/output/incr_decoding_llama_160M_half.txt") +diff <(tail -n +2 "../inference/output/huggingface_llama_7B.txt") <(tail -n +4 "../inference/output/incr_decoding_llama_7B.txt") +diff <(tail -n +2 "../inference/output/huggingface_llama_7B_half.txt") <(tail -n +4 "../inference/output/incr_decoding_llama_7B_half.txt") +diff <(tail -n +2 "../inference/output/huggingface_opt_125M.txt") <(tail -n +4 "../inference/output/incr_decoding_opt_125M.txt") +diff <(tail -n +2 "../inference/output/huggingface_opt_125M_half.txt") <(tail -n +4 "../inference/output/incr_decoding_opt_125M_half.txt") +#diff <(tail -n +2 "../inference/output/huggingface_opt_6B.txt") <(tail -n +4 "../inference/output/incr_decoding_opt_6B.txt") +#diff <(tail -n +2 "../inference/output/huggingface_opt_6B_half.txt") <(tail -n +4 "../inference/output/incr_decoding_opt_6B_half.txt") ############################################################################################### ###################################### Cleanup ################################################ From 2de625568703b8ffc6ac43c0d0478285004e566b Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Thu, 8 Jun 2023 21:23:18 +0800 Subject: [PATCH 148/344] [SpecInfer] Running multiple SSMs with single RM (#734) * Add support for login information with multiple ssms. * Update prepare_next_batch_verify. * Add dedup tree merge. * Format. * Fix bugs. * Runs with mutilmodels. * Fix. * Format * Fix. * Fix increamental decoding. * fix use_full_precision issue. 
--- include/flexflow/batch_config.h | 4 + include/flexflow/inference.h | 39 ++-- inference/incr_decoding/incr_decoding.cc | 33 +-- inference/spec_infer/spec_infer.cc | 142 ++++++++----- src/runtime/beam_search_batch_config.cc | 17 ++ src/runtime/request_manager.cc | 256 ++++++++++++++++------- 6 files changed, 334 insertions(+), 157 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index fd0f419db6..39fcc49c68 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -98,7 +98,9 @@ struct InferenceResult { class BeamSearchBatchConfig : public BatchConfig { public: BeamSearchBatchConfig(); + BeamSearchBatchConfig(int model_id); BeamSearchBatchConfig(size_t beam_width, size_t target_iterations); + BeamSearchBatchConfig(BeamSearchBatchConfig const &other, int model_id); InferenceMode get_mode() const; ~BeamSearchBatchConfig(); @@ -113,6 +115,8 @@ class BeamSearchBatchConfig : public BatchConfig { static int const MAX_BEAM_WIDTH = 1; static int const MAX_BEAM_DEPTH = 8; + int model_id; + struct BeamSearchPerRequestInfo { int beam_size; int current_depth = -1; diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 8825a79283..3753e5e50e 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -53,6 +53,8 @@ struct Request { int max_sequence_length; int initial_len; std::vector tokens; + + std::vector beam_trees; }; // store the result of beam search @@ -83,6 +85,11 @@ class RequestManager { std::string output_filepath = ""); RequestManager(); size_t get_num_processed_requests(); + + int register_new_model(FFModel *model); + + FFModel *get_model(int model_id); + RequestGuid register_new_request(std::string const &prompt, int max_sequence_length); RequestGuid register_new_request(std::vector const &prompt, @@ -95,10 +102,11 @@ class RequestManager { BeamSearchBatchConfig prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, - InferenceResult const &result); + InferenceResult const &result, + int model_id); - TreeVerifyBatchConfig - prepare_next_batch_verify(BeamSearchBatchConfig const &old_bc); + TreeVerifyBatchConfig prepare_next_batch_verify( + std::vector const &old_batches); void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); @@ -111,6 +119,13 @@ class RequestManager { int request_index, int token_start_offset); + // remove guid after put the cached tree in request + std::vector> merge_dfs_trees( + std::vector>> + input_trees, + int root_depth, + RequestGuid guid); + std::vector> traverse_verify_tree( size_t guid, std::vector> const @@ -118,10 +133,6 @@ class RequestManager { std::vector> const &outputSerializedTree); - // TreeVerifyBatchConfig - // convert_beam_to_tree_batch_config(BeamSearchBatchConfig const - // &beam_bc); - static void load_tokens_task(Legion::Task const *task, std::vector const ®ions, @@ -142,17 +153,17 @@ class RequestManager { std::mutex request_queue_mutex; RequestGuid next_available_guid; - struct BeamTree beam_trees[BatchConfig::MAX_NUM_REQUESTS]; - + // TODO: Move this two vector to request struct std::unordered_map>> - dfs_tree_inputs_map; - - // std::unordered_map beam_trees_v2; - // TODO: cache config info for Verify/Beam exchange: Beam Width, Beam Depth, - // Committed Tokens + dfs_tree_inputs; std::unordered_map>> committed_tokens; + + // Multi-model support + int num_ssms; + std::vector models; + // Performance profiling size_t num_processed_requests; diff --git 
a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 895eafb601..d80ed9520b 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -141,22 +141,6 @@ void FlexFlow::top_level_task(Task const *task, : (Tokenizer *)opt_tokenizer, /*verbose*/ verbose, file_paths.output_file_path); - int total_num_requests = 0; - { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - rm.register_new_request(text, 128 /*max_sequence_length*/); - } - } FFModel model(ffconfig); if (model_type == ModelType::LLAMA) { @@ -178,6 +162,23 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision); } + int total_num_requests = 0; + { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + rm.register_new_request(text, 128 /*max_sequence_length*/); + } + } + BatchConfig bc; InferenceResult ir; while (rm.get_num_processed_requests() < total_num_requests) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 896e4b082e..2e38723036 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -144,6 +144,7 @@ void FlexFlow::top_level_task(Task const *task, int argc = command_args.argc; parse_input_args( argv, argc, file_paths, model_types, use_full_precision, verbose); + if (file_paths.ssm_weight_file_paths.size() == 0) { assert(false && "SpecInfer needs at least one SSM for speculative inference"); @@ -194,42 +195,9 @@ void FlexFlow::top_level_task(Task const *task, : (Tokenizer *)opt_tokenizer, /*verbose*/ verbose, file_paths.output_file_path); - int total_num_requests = 0; - { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - rm.register_new_request(text, 128 /*max_sequence_length*/); - } - } - FFModel beam_model(ffconfig); + // Create LLM model FFModel tree_model(ffconfig); - if (model_types.ssm_model_types[0] == ModelType::LLAMA) { - LLAMA::create_llama_model(beam_model, - im, - file_paths.ssm_config_file_paths[0], - file_paths.ssm_weight_file_paths[0], - 1, - BEAM_SEARCH_MODE, - use_full_precision); - } else { - OPT::create_opt_model(beam_model, - im, - file_paths.ssm_config_file_paths[0], - file_paths.ssm_weight_file_paths[0], - 1, - BEAM_SEARCH_MODE, - use_full_precision); - } if (model_types.llm_model_type == ModelType::LLAMA) { LLAMA::create_llama_model(tree_model, im, 
@@ -238,7 +206,7 @@ void FlexFlow::top_level_task(Task const *task, ffconfig.workersPerNode * ffconfig.numNodes, TREE_VERIFY_MODE, use_full_precision); - } else { + } else if (model_types.llm_model_type == ModelType::OPT) { OPT::create_opt_model(tree_model, im, file_paths.llm_config_file_path, @@ -246,35 +214,113 @@ void FlexFlow::top_level_task(Task const *task, ffconfig.workersPerNode * ffconfig.numNodes, TREE_VERIFY_MODE, use_full_precision); + } else { + assert(false && "Invalid LLM model type passed (or no type was passed)."); + } + + // Create SSM models + int num_ssms = model_types.ssm_model_types.size(); + std::vector ssm_model_ids; + std::vector ssm_models; + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel beam_model(ffconfig); + ssm_models.push_back(beam_model); + } + + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel &beam_model = ssm_models[ssm_id]; + if (model_types.ssm_model_types[ssm_id] == ModelType::LLAMA) { + LLAMA::create_llama_model(beam_model, + im, + file_paths.ssm_config_file_paths[ssm_id], + file_paths.ssm_weight_file_paths[ssm_id], + 1, + BEAM_SEARCH_MODE, + use_full_precision); + } else if (model_types.ssm_model_types[ssm_id] == ModelType::OPT) { + OPT::create_opt_model(beam_model, + im, + file_paths.ssm_config_file_paths[ssm_id], + file_paths.ssm_weight_file_paths[ssm_id], + 1, + BEAM_SEARCH_MODE, + use_full_precision); + } else { + assert(false && "Invalid SSM model type passed."); + } + + int beam_model_id = rm.register_new_model(&beam_model); + ssm_model_ids.push_back(beam_model_id); + } + + // Register requests from prompt file + int total_num_requests = 0; + { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + rm.register_new_request(text, 128 /*max_sequence_length*/); + } } TreeVerifyBatchConfig tree_bc; BeamSearchBatchConfig beam_bc; + std::vector beam_bc_vec; + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + beam_bc_vec.push_back(BeamSearchBatchConfig(ssm_model_ids[ssm_id])); + } + InferenceResult tree_ir; while (rm.get_num_processed_requests() < total_num_requests) { int depth = 0; // Beam Search - beam_bc = rm.prepare_next_batch_init(tree_bc, tree_ir); + beam_bc = rm.prepare_next_batch_init(tree_bc, tree_ir, 0); + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + beam_bc_vec[ssm_id] = beam_bc; + beam_bc_vec[ssm_id].model_id = ssm_id; + } + if (rm.get_num_processed_requests() >= total_num_requests) { break; } - while (true) { - depth = beam_bc.current_depth_all_requests(); - FutureMap fm = im.inference(&beam_model, 0, beam_bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - BeamInferenceResult beam_ir = future.get_result(); - if (depth - 1 >= beam_bc.max_beam_depth_all_requests()) { - break; - } else { - beam_bc = rm.prepare_next_batch_beam(beam_bc, beam_ir); + + for (int i = 0; i < num_ssms; i++) { + while (true) { + beam_bc = beam_bc_vec[i]; + depth = beam_bc.beamRequestsInfo[0].current_depth; + + FutureMap fm = im.inference(rm.get_model(0), 0, beam_bc_vec[i]); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + BeamInferenceResult 
beam_ir = future.get_result(); + + if (depth - 1 >= BeamSearchBatchConfig::MAX_BEAM_DEPTH) { + break; + } else { + beam_bc_vec[i] = rm.prepare_next_batch_beam(beam_bc_vec[i], beam_ir); + if (beam_bc_vec[i].num_active_tokens() == 0 && + beam_bc_vec[i].num_active_requests() != 0) { + break; + } + } } + std::cout << "----------beam search finished for model " + << beam_bc_vec[i].model_id << "------------" << std::endl; } // Token Tree Verification { - tree_bc = rm.prepare_next_batch_verify(beam_bc); + tree_bc = rm.prepare_next_batch_verify(beam_bc_vec); FutureMap fm = im.inference(&tree_model, 0, tree_bc); + assert(fm.get_future_map_domain().get_volume() == 1); Future future = fm.get_future(0); tree_ir = future.get_result(); @@ -298,4 +344,4 @@ void FlexFlow::top_level_task(Task const *task, } } -void FlexFlow::register_custom_tasks() {} +void FlexFlow::register_custom_tasks() {} \ No newline at end of file diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index c177be0681..dc30d89d78 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -31,6 +31,14 @@ BeamSearchBatchConfig::BeamSearchBatchConfig() : BatchConfig() { current_iteration = 0; } +BeamSearchBatchConfig::BeamSearchBatchConfig(int model_id) : BatchConfig() { + this->model_id = model_id; + std::cout << "==================\n" + << "Register Batch Config with Model " << this->model_id + << std::endl; + current_iteration = 0; +} + BeamSearchBatchConfig::BeamSearchBatchConfig(size_t beam_width, size_t target_iterations) : BatchConfig() { @@ -39,6 +47,15 @@ BeamSearchBatchConfig::BeamSearchBatchConfig(size_t beam_width, current_iteration = 0; } +BeamSearchBatchConfig::BeamSearchBatchConfig(BeamSearchBatchConfig const &other, + int model_id) + : BatchConfig() { + this->beam_width = other.beam_width; + this->target_iterations = other.target_iterations; + this->model_id = model_id; + current_iteration = 0; +} + BeamSearchBatchConfig::~BeamSearchBatchConfig() {} InferenceMode BeamSearchBatchConfig::get_mode() const { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index c850e2173f..bceb6e5953 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -36,6 +36,20 @@ RequestManager::RequestManager(Tokenizer *_tokenizer, : tokenizer(_tokenizer), verbose(_verbose), next_available_guid(1000000), num_processed_requests(0), output_filepath(_output_filepath) {} +int RequestManager::register_new_model(FFModel *model) { + int model_id = models.size(); + models.push_back(model); + std::cout << "Register new model with id: " << model_id << std::endl; + num_ssms++; + assert(models.size() == num_ssms); + return model_id; +} + +FFModel *RequestManager::get_model(int model_id) { + assert(model_id < models.size()); + return models[model_id]; +} + RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, int max_sequence_length) { @@ -48,6 +62,18 @@ RequestManager::RequestGuid request.initial_len = prompt.size(); request.tokens = prompt; + if (num_ssms == 0) { + std::cout << "No small spective model registered yet, using increamental " + "decoding." 
+ << std::endl; + } else { + std::cout << "Num of models: " << num_ssms << std::endl; + for (int i = 0; i < num_ssms; i++) { + BeamTree beam_tree = BeamTree{}; + request.beam_trees.push_back(beam_tree); + } + } + pending_request_queue.push(request); if (verbose) { @@ -73,6 +99,18 @@ RequestManager::RequestGuid request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); request.initial_len = request.tokens.size(); + if (num_ssms == 0) { + std::cout << "No small spective model registered yet, using increamental " + "decoding." + << std::endl; + } else { + std::cout << "Num of models: " << num_ssms << std::endl; + for (int i = 0; i < num_ssms; i++) { + BeamTree beam_tree = BeamTree{}; + request.beam_trees.push_back(beam_tree); + } + } + pending_request_queue.push(request); { std::string output = "New request tokens:"; @@ -264,6 +302,8 @@ BeamSearchBatchConfig // Step 2: preparing the next batch for existing requests BeamSearchBatchConfig new_bc; + new_bc.model_id = old_bc.model_id; + std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { if (old_bc.request_completed[i]) { @@ -312,8 +352,8 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].max_depth; // do the slot exchange to minimize the cache exchange in kernel. - // std::cout << "update metadata" << std::endl; - update_beam_metadata(new_bc, beam_trees[i], i); + std::cout << "update metadata" << std::endl; + update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= request.tokens.size()) { @@ -355,7 +395,8 @@ BeamSearchBatchConfig BeamSearchBatchConfig RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, - InferenceResult const &result) { + InferenceResult const &result, + int model_id) { const std::lock_guard lock(request_queue_mutex); if (verbose) { std::cout << "\n############### prepare_next_batch_init ###############\n"; @@ -363,6 +404,7 @@ BeamSearchBatchConfig // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; + new_bc.model_id = model_id; int result_index = 0; for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { @@ -423,7 +465,7 @@ BeamSearchBatchConfig } std::vector> verified_tokens = - traverse_verify_tree(guid, dfs_tree_inputs_map.at(guid), tree_outputs); + traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); // check if the request is finished @@ -481,9 +523,9 @@ BeamSearchBatchConfig } } - beam_trees[i] = BeamTree{}; - dfs_tree_inputs_map.erase( - request.guid); // delete the old input tree from cache + // delete the old input tree from cache + dfs_tree_inputs.erase(request.guid); + continue; } @@ -598,26 +640,38 @@ BeamSearchBatchConfig } TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( - BeamSearchBatchConfig const &old_bc) { + std::vector const &old_batches) { const std::lock_guard lock(request_queue_mutex); + if (verbose) { std::cout << "\n############### prepare_next_batch_verify ###############\n"; } + assert(old_batches.size() > 0); + TreeVerifyBatchConfig new_bc; new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { - if (old_bc.request_completed[i]) { + if (old_batches.at(0).request_completed[i]) { continue; } - size_t guid = old_bc.requestsInfo[i].request_guid; + size_t guid = old_batches.at(0).requestsInfo[i].request_guid; 
Request &request = running_request_queue[guid]; // Get the dfs tree + std::vector>> + all_dfs_trees; + + for (int j = 0; j < old_batches.size(); j++) { + std::vector> new_tree = + traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); + all_dfs_trees.push_back(new_tree); + } + assert(all_dfs_trees.size() == old_batches.size()); std::vector> dfs_tree_inputs = - traverse_beam_tree(old_bc, i, request.tokens.size() - 1); + merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); if (verbose) { std::cout << "Request Tokens Size: " << request.tokens.size() @@ -629,9 +683,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Normal Request Info new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + old_batches.at(0).requestsInfo[i].max_sequence_length; // TODO: Check this new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; @@ -682,11 +737,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // add prompt to the dfs tree if (committed_tokens.find(guid) != committed_tokens.end()) { - // std::cout << "committed_tokens.size(): " << - // committed_tokens.at(guid).size() << std::endl; std::cout << - // "dfs_tree_inputs.at(0).second: " << dfs_tree_inputs.at(0).second << - // std::endl; std::cout << "request.initial_len: " << request.initial_len - // << std::endl; if (dfs_tree_inputs.at(0).second == request.initial_len + committed_tokens.at(guid).size() - 1) { for (int j = 0; j < request.initial_len; j++) { @@ -742,18 +792,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } if (committed_tokens.find(guid) != committed_tokens.end()) { - // if (j == 1) { - // auto committed_token = committed_tokens.at(guid).at(0); - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - // committed_token.second; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index - // = i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - // committed_token.first; std:: cout << new_bc.num_tokens_to_commit - // << "- committed_token.token_depth: " << committed_token.first << - // ", token_index: " << committed_token.second << std::endl; - // new_bc.num_tokens_to_commit++; - // } if (j < committed_tokens.at(guid).size()) { auto committed_token = committed_tokens.at(guid).at(j); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = @@ -789,8 +827,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (verbose) { std::cout << "prepare_next_batch_verify OLD vs NEW batchconfigs below:" << std::endl; - old_bc.print(); - new_bc.print(); + // old_batches.print(); + // new_bc.print(); } return new_bc; @@ -836,17 +874,21 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, int beam_size = old_bc.beamRequestsInfo[index].beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; + Request &request = + running_request_queue[old_bc.requestsInfo[index].request_guid]; + if (depth == 1) { // store the last input into the tree; if (verbose) { std::cout << "try to store the input" << "\n"; } - Request &request = - running_request_queue[old_bc.requestsInfo[index].request_guid]; - beam_trees[index].treeLayers[0].tokens[0] = request.tokens.back(); 
- beam_trees[index].treeLayers[0].probs[0] = 1; - beam_trees[index].treeLayers[0].parent_ids[0] = -1; + + request.beam_trees.at(old_bc.model_id).treeLayers[0].tokens[0] = + request.tokens.back(); + request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; + request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; + if (verbose) { std::cout << "Store the previous last token to the tree root: " << request.tokens.back() << "\n"; @@ -854,18 +896,22 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } for (int beam_id = 0; beam_id < beam_width; beam_id++) { - beam_trees[index].treeLayers[depth].tokens[beam_id] = - result.token_ids[result_index]; - beam_trees[index].treeLayers[depth].probs[beam_id] = - result.probs[result_index]; - beam_trees[index].treeLayers[depth].parent_ids[beam_id] = - result.parent_id[result_index]; + request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .tokens[beam_id] = result.token_ids[result_index]; + request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .probs[beam_id] = result.probs[result_index]; + request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .parent_ids[beam_id] = result.parent_id[result_index]; if (verbose) { - std::cout << "tree value: " << depth << " token: " - << beam_trees[index].treeLayers[depth].tokens[beam_id] - << " result tokens: " << result.token_ids[result_index] - << std::endl; + std::cout << "tree value: " << depth << "token: " + << request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .tokens[beam_id] + << "result tokens: " << result.token_ids[result_index]; } result_index += 1; } @@ -891,27 +937,16 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1; int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; - // std::cout << "-----------before parent id exchange-----------" << - // std::endl; for (int j = 0; j < beam_size; j++) { - // std::cout << "after request id: " << request_index << "beam id = " << j - // << "parnt: " - // << new_bc.beamRequestsInfo[request_index].parent_id[j] - // << "token: " << - // new_bc.beamRequestsInfo[request_index].tokens[j] - // << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j] - // << std::endl; - // // std::fixed << std::setprecision(15)<< - // } - if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { // TODO: check if this is correct - for (int j = 0; j < beam_size; j++) { - new_bc.beamRequestsInfo[request_index].parent_id[j] = j; - new_bc.beamRequestsInfo[request_index].probs[j] = - tree.treeLayers[depth].probs[j]; // ? - new_bc.beamRequestsInfo[request_index].tokens[j] = - tree.treeLayers[depth].tokens[j]; // ? - } + // for (int j = 0; j < beam_size; j++) { + // new_bc.beamRequestsInfo[request_index].parent_id[j] = j; + // new_bc.beamRequestsInfo[request_index].probs[j] = + // tree.treeLayers[depth].probs[j]; // ? + // new_bc.beamRequestsInfo[request_index].tokens[j] = + // tree.treeLayers[depth].tokens[j]; // ? 
+ // } + assert(false); } else { std::set parents; std::set childs; @@ -1126,7 +1161,13 @@ std::vector> std::cout << "[Traverse Beam Tree] beam_width: " << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; } - BeamTree tree = beam_trees[request_index]; + + auto guid = old_bc.requestsInfo[request_index].request_guid; + Request &request = running_request_queue[guid]; + std::cout << "request.beam_trees.size(): " << request.beam_trees.size() + << std::endl; + BeamTree tree = request.beam_trees.at(old_bc.model_id); + std::cout << "\n\n"; // token, index // todo make this one global for different stages @@ -1151,21 +1192,78 @@ std::vector> << ", depth: " << serializedTree.at(k).second << "\n"; } } - // std::cout << "Done printing serialized tree, " - // << old_bc.requestsInfo[request_index].request_guid << "\n"; - - if (dfs_tree_inputs_map.find( - old_bc.requestsInfo[request_index].request_guid) != - dfs_tree_inputs_map.end()) { - dfs_tree_inputs_map[old_bc.requestsInfo[request_index].request_guid] = - serializedTree; - } else { - dfs_tree_inputs_map.insert(std::make_pair( - old_bc.requestsInfo[request_index].request_guid, serializedTree)); - } + + // if (dfs_tree_inputs.find(old_bc.requestsInfo[request_index].request_guid) + // != + // dfs_tree_inputs.end()) { + // dfs_tree_inputs[old_bc.requestsInfo[request_index].request_guid] = + // serializedTree; + // } else { + // dfs_tree_inputs.insert(std::make_pair( + // old_bc.requestsInfo[request_index].request_guid, serializedTree)); + // } return serializedTree; // } } +std::vector> + RequestManager::merge_dfs_trees( + std::vector>> + input_trees, + int root_depth, + RequestGuid guid) { + std::vector> merged_tree; + + std::unordered_map> childrens; + std::unordered_map curr_path; + + // convert pair to an integer + auto root = input_trees.at(0).at(0); + int root_id = root.first * 10000 + root.second; + + for (int i = 0; i < input_trees.size(); i++) { + auto tree = input_trees.at(i); + // all trees should have the same root + assert(tree.at(0) == root); + + for (auto const &pair : tree) { + int id = pair.first * 10000 + pair.second; // current node + curr_path[pair.second] = id; // log node in current search + + if (childrens.find(id) == childrens.end()) { + // init empty set + childrens[id] = std::set(); + } + + if (pair.second > root_depth) { + int parent_id = curr_path[pair.second - 1]; + childrens[parent_id].insert(id); + } + } + } + + std::stack q; + q.push(root_id); + + while (!q.empty()) { + int curr = q.top(); + q.pop(); + merged_tree.push_back(std::make_pair(curr / 10000, curr % 10000)); + for (int child : childrens[curr]) { + q.push(child); + } + } + + if (verbose) { + for (auto &pair : merged_tree) { + std::cout << pair.first << ", depth: " << pair.second << std::endl; + } + } + + dfs_tree_inputs[guid] = merged_tree; + + return merged_tree; +} + }; // namespace FlexFlow From e13190852c98f084b49d5ade1b92d7bf0c4354cd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 15 Jun 2023 18:52:41 +0800 Subject: [PATCH 149/344] Fix inference test (#767) * fix * fix workflow --- .github/workflows/gpu-ci-skip.yml | 7 +++- .github/workflows/gpu-ci.yml | 4 +++ tests/inference/huggingface_inference.py | 41 +++++++++++++++--------- tests/inference_tests.sh | 2 +- 4 files changed, 36 insertions(+), 18 deletions(-) diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index d8e5353e79..766bd8d790 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -1,15 +1,20 @@ name: "gpu-ci" 
on: pull_request: - paths-ignore: + paths: - "cmake/**" - "config/**" - "python/**" - "setup.py" - "include/**" + - "inference/**" - "src/**" + - "tests/inference/**" - ".github/workflows/gpu-ci.yml" + - "tests/cpp_gpu_tests.sh" + - "tests/inference_tests.sh" - "tests/multi_gpu_tests.sh" + - "tests/python_interface_test.sh" workflow_dispatch: concurrency: diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 00b3138e00..95983f889b 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -9,8 +9,10 @@ on: - "include/**" - "inference/**" - "src/**" + - "tests/inference/**" - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" + - "tests/inference_tests.sh" - "tests/multi_gpu_tests.sh" - "tests/python_interface_test.sh" push: @@ -24,8 +26,10 @@ on: - "include/**" - "inference/**" - "src/**" + - "tests/inference/**" - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" + - "tests/inference_tests.sh" - "tests/multi_gpu_tests.sh" - "tests/python_interface_test.sh" workflow_dispatch: diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 577121a8a8..3442fe6120 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -1,23 +1,25 @@ import argparse import json import os -from transformers import AutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer + def main(): # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) - + # Parse command line arguments parser = argparse.ArgumentParser() - parser.add_argument('--model-name', type=str, required=True) - parser.add_argument('--tokenizer-model-name', type=str, required=True) - parser.add_argument('--max-length', type=int, default=128) - parser.add_argument('--prompt-file', type=str, required=True) - parser.add_argument('--output-file', type=str, required=True) - parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument("--model-name", type=str, required=True) + parser.add_argument("--tokenizer-model-name", type=str, required=True) + parser.add_argument("--max-length", type=int, default=128) + parser.add_argument("--prompt-file", type=str, required=True) + parser.add_argument("--output-file", type=str, required=True) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) parser.add_argument("--gpu", action="store_true", help="Run on GPU") args = parser.parse_args() # Check if max-length is greater than 0 @@ -28,32 +30,39 @@ def main(): if not os.path.isfile(args.prompt_file): print(f"Error: {args.prompt_file} does not exist.") return - + # Read prompt-file into a list of strings - with open(args.prompt_file, 'r') as f: + with open(args.prompt_file, "r") as f: try: prompt_list = json.load(f) except json.JSONDecodeError: print(f"Error: Unable to parse {args.prompt_file} as JSON.") return - + # Set default tensor type depending on argument indicating the float type to use if not args.use_full_precision: import torch + torch.set_default_tensor_type(torch.HalfTensor) # Run huggingface model device = "cuda" if args.gpu else "cpu" model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device) - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model_name) - with open(args.output_file, 'w') as f: + if args.tokenizer_model_name == 
"JackFram/llama-160m": + tokenizer = LlamaTokenizer.from_pretrained("JackFram/llama-160m", use_fast=True) + else: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model_name) + with open(args.output_file, "w") as f: for i, prompt in enumerate(prompt_list): - batch = tokenizer(prompt_list, return_tensors="pt", add_special_tokens=True).to(device) + batch = tokenizer( + prompt_list, return_tensors="pt", add_special_tokens=True + ).to(device) generated = model.generate(batch["input_ids"], max_length=args.max_length) out = tokenizer.decode(generated[0]) # Write output to file out_str = out if i == (len(prompt_list) - 1) else out + "\n" f.write(out_str) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 46fa70c688..c04000c72b 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -13,7 +13,7 @@ cd "${BASH_SOURCE[0]%/*}" cleanup # Update the transformers library to support the LLAMA model -pip3 install --upgrade transformers +pip3 install --upgrade transformers sentencepiece # Download the weights in both half and full precision python3 ../inference/utils/download_llama_weights.py From 7e84575fe5155f87adb1bbac096796ad4eae485f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 17 Jun 2023 21:06:13 +0800 Subject: [PATCH 150/344] Merge master into inference (#777) * Fix bug in elementwise multiplication with broadcasting (#764) * Fix multinode test (#766) * Fix UCX multinode test (#768) * fix * fix 2 * Prevent format.sh from formatting triton (#756) * [CI] - Increase timeout in multinode test (UCX & MPI) (#773) * fix * fix 2 * increase timeout * Fix docker builds in CI (#774) --------- Co-authored-by: Soumya Chatterjee Co-authored-by: Colin Unger --- .github/workflows/docker-build.yml | 29 ++++++++++++++++++----- .github/workflows/multinode-test.yml | 10 ++++++++ CMakeLists.txt | 4 ++++ config/config.linux | 2 +- docker/flexflow-environment/Dockerfile | 10 ++++++-- scripts/format.sh | 5 ++-- src/ops/kernels/element_binary_kernels.cu | 2 +- 7 files changed, 50 insertions(+), 12 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index bfb01a4b5b..14f64f5fe6 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -8,10 +8,9 @@ on: push: branches: - "master" - paths: - - "docker/**" - - "!docker/README.md" - - ".github/workflows/docker-build.yml" + schedule: + # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated + - cron: "0 8 * * 0" workflow_dispatch: # Cancel outdated workflows if they are still running @@ -43,11 +42,16 @@ jobs: # On push to master, build for all compatible architectures, so that we can publish # a pre-built general-purpose image. On all other cases, only build for one architecture # to save time. 
- if [[ ${{ github.event_name }} == 'push' && ${GITHUB_REF#refs/heads/} == "master" ]]; then + if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then export FF_CUDA_ARCH=all else export FF_CUDA_ARCH=70 fi + if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then + export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + else + export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF + fi ./docker/build.sh flexflow - name: Check availability of Python flexflow.core module @@ -55,13 +59,26 @@ jobs: run: docker run --entrypoint python flexflow-cuda:latest -c "import flexflow.core; exit()" - name: Publish Docker environment image (on push to master) + if: github.repository_owner == 'flexflow' env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} FF_GPU_BACKEND: ${{ matrix.gpu_backend }} run: | - if [[ ${{ github.event_name }} == 'push' && ${GITHUB_REF#refs/heads/} == "master" ]]; then + if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then ./docker/publish.sh "flexflow-environment-${FF_GPU_BACKEND}" ./docker/publish.sh "flexflow-${FF_GPU_BACKEND}" else echo "No need to update Docker containers in ghrc.io registry at this time." fi + + notify-slack: + name: Notify Slack in case of failure + runs-on: ubuntu-20.04 + needs: docker-build + if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }} + steps: + - name: Send Slack message + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + run: | + curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! :x: \"}" $SLACK_WEBHOOK diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index 8bde03094c..cfe3629211 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -34,6 +34,8 @@ jobs: if: github.repository_owner == 'flexflow' runs-on: self-hosted needs: gpu-ci-concierge + # 10h timeout, instead of default of 360min (6h) + timeout-minutes: 600 container: image: ghcr.io/flexflow/flexflow-environment-cuda:latest options: --gpus all --shm-size=8192m @@ -62,6 +64,7 @@ jobs: export FF_HOME=$(pwd) export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests @@ -70,6 +73,7 @@ jobs: export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none @@ -84,6 +88,8 @@ jobs: container: image: ghcr.io/flexflow/flexflow-environment-cuda:latest options: --gpus all --shm-size=8192m + # 10h timeout, instead of default of 360min (6h) + timeout-minutes: 600 steps: - name: Install updated git version run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git @@ -109,6 +115,7 @@ jobs: export FF_HOME=$(pwd) export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests @@ -117,6 +124,7 @@ jobs: export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib export 
OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none @@ -155,6 +163,7 @@ jobs: export FF_HOME=$(pwd) export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests @@ -163,6 +172,7 @@ jobs: export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e305b15bb..11fcfbe533 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,6 +139,10 @@ if ((FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") if (FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") set(ENV{UCX_HOME} "${UCX_DIR}/install") + install(DIRECTORY ${UCX_DIR}/install/bin/ DESTINATION bin) + install(DIRECTORY ${UCX_DIR}/install/include/ DESTINATION include) + install(DIRECTORY ${UCX_DIR}/install/lib/ DESTINATION lib) + install(DIRECTORY ${UCX_DIR}/install/share/ DESTINATION share) endif() if (FF_LEGION_NETWORKS STREQUAL "ucx") diff --git a/config/config.linux b/config/config.linux index f63d722c13..482a154145 100755 --- a/config/config.linux +++ b/config/config.linux @@ -91,7 +91,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" } if [ -n "$1" ]; then diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 0586409a9f..00ecd6956c 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -37,6 +37,11 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] rm ./amdgpu-install_22.20.50205-1_all.deb; \ amdgpu-install -y --usecase=hip,rocm --no-dkms; \ apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk; \ + # Install protobuf v3.20.x manually + apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev autoconf automake libtool make; \ + git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git; cd protobuf/ ; git submodule update 
--init --recursive; \ + ./autogen.sh; ./configure; cores_available=$(nproc --all); n_build_cores=$(( cores_available -1 )); \ + if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi; make -j $n_build_cores; make install; ldconfig; cd .. ; \ else \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"; \ fi @@ -48,9 +53,10 @@ ENV CUDNN_DIR /usr/local/cuda ENV CUDA_DIR /usr/local/cuda # Install python packages and other dependencies -RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing +RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install CPU-only Pytorch and related dependencies RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch -RUN conda install -c conda-forge onnx tensorflow transformers sentencepiece +RUN conda install -c conda-forge onnx transformers sentencepiece +RUN pip3 install tensorflow ENTRYPOINT ["/bin/bash"] diff --git a/scripts/format.sh b/scripts/format.sh index bf13948955..2ed97b8f0a 100755 --- a/scripts/format.sh +++ b/scripts/format.sh @@ -52,11 +52,12 @@ download_clang_tool() { error "Unknown return value from get_os: $OS. Exiting..." esac URL="$BASE_URL/clang-${TOOL}-${VERSION}_${URL_OS}-amd64" + echo "Downloading from $URL..." if command -v wget &> /dev/null; then wget "$URL" -O "$TARGET_PATH" elif command -v curl &> /dev/null; then - curl "$URL" -o "$TARGET_PATH" + curl -L "$URL" -o "$TARGET_PATH" else error "Could not find either wget or curl. Exiting..." fi @@ -67,5 +68,5 @@ if [[ ! -e $CLANG_FORMAT_PATH ]]; then chmod u+x "$CLANG_FORMAT_PATH" fi -mapfile -t FILES < <(git ls-files | grep -E '\.(h|cc|cpp|cu)$' | grep -v '^triton') +mapfile -t FILES < <(git ls-files ':!:triton/**' '*.h' '*.cc' '*.cpp' '*.cu' '*.c') "$CLANG_FORMAT_PATH" -i "${FILES[@]}" diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index 0cbff73b82..6d30ae690a 100644 --- a/src/ops/kernels/element_binary_kernels.cu +++ b/src/ops/kernels/element_binary_kernels.cu @@ -386,7 +386,7 @@ void forward_kernel(ElementBinaryMeta const *m, m->opDesc, &alpha1, m->outputTensor, - in1_ptr, + out_ptr, &alpha2, m->input2Tensor, in2_ptr, From 3969a671d8a58ef84067c12f246af49f141127bb Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Sat, 17 Jun 2023 11:28:19 -0400 Subject: [PATCH 151/344] support falcon model (#762) * init * add mlc tokenizer. * . * fix * fix pipeline, fix name * . * format * ci * . * add rust * fix * . * inf test fix * . * fix * . * fix * optimize * move rust to conda env * . * . 
* fix * fix * fix * update git ignore * fix rust install * Update config.linux --------- Co-authored-by: Gabriele Oliaro --- .dockerignore | 6 + .gitignore | 5 + .gitmodules | 7 +- CMakeLists.txt | 14 +- FlexFlow.mk | 4 +- conda/environment.yml | 1 + conda/flexflow-cpu.yml | 1 + deps/sentencepiece | 1 - deps/tokenizers-cpp | 1 + docker/flexflow-environment/Dockerfile | 4 + include/flexflow/ffconst.h | 3 + include/flexflow/inference.h | 12 +- include/flexflow/model.h | 18 + include/flexflow/operator_params.h | 1 + .../ops/inc_multiquery_attention_params.h | 30 + .../ops/inc_multiquery_self_attention.h | 158 ++ include/flexflow/tokenizers.h | 103 -- inference/file_loader.cc | 52 +- inference/incr_decoding/CMakeLists.txt | 3 +- inference/incr_decoding/incr_decoding.cc | 53 +- inference/models/configs/falcon_7B.json | 11 + inference/models/falcon.cc | 178 ++ inference/models/falcon.h | 115 ++ inference/spec_infer/spec_infer.cc | 36 +- inference/utils/download_opt_weights.py | 4 + python/Makefile | 2 + scripts/install_tokenizer.sh | 9 + src/ops/beam_topk.cu | 12 +- src/ops/inc_multiquery_self_attention.cc | 1434 +++++++++++++++++ src/ops/inc_multiquery_self_attention.cpp | 96 ++ src/ops/inc_multiquery_self_attention.cu | 797 +++++++++ src/runtime/ffconst_utils.cc | 4 + src/runtime/graph.cc | 46 + src/runtime/model.cc | 26 + src/runtime/operator_params.cc | 1 + src/runtime/request_manager.cc | 79 +- src/runtime/substitution.cc | 8 + tests/inference/huggingface_inference.py | 1 - tests/inference_tests.sh | 7 +- 39 files changed, 3130 insertions(+), 213 deletions(-) delete mode 160000 deps/sentencepiece create mode 160000 deps/tokenizers-cpp create mode 100644 include/flexflow/ops/inc_multiquery_attention_params.h create mode 100644 include/flexflow/ops/inc_multiquery_self_attention.h delete mode 100644 include/flexflow/tokenizers.h create mode 100644 inference/models/configs/falcon_7B.json create mode 100644 inference/models/falcon.cc create mode 100644 inference/models/falcon.h create mode 100755 scripts/install_tokenizer.sh create mode 100644 src/ops/inc_multiquery_self_attention.cc create mode 100644 src/ops/inc_multiquery_self_attention.cpp create mode 100644 src/ops/inc_multiquery_self_attention.cu diff --git a/.dockerignore b/.dockerignore index 22ec965249..a7470203e3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -11,3 +11,9 @@ python/flexflow/core/legion_cffi_header.py *.pb.h *.o *.a + +# Ignore inference assets +/inference/weights/* +/inference/tokenizer/* +/inference/prompt/* +/inference/output/* diff --git a/.gitignore b/.gitignore index 1ba1e26bd9..2952fe3a2f 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,11 @@ __pycache__/ # C extensions *.so +/inference/weights/* +/inference/tokenizer/* +/inference/prompt/* +/inference/output/* + # Distribution / packaging .Python build/ diff --git a/.gitmodules b/.gitmodules index 82a77864f2..c68582d4ac 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,6 +19,7 @@ [submodule "deps/json"] path = deps/json url = https://github.com/nlohmann/json.git -[submodule "deps/sentencepiece"] - path = deps/sentencepiece - url = https://github.com/google/sentencepiece.git +[submodule "deps/tokenizers-cpp"] + path = deps/tokenizers-cpp + url = https://github.com/mlc-ai/tokenizers-cpp.git + fetchRecurseSubmodules = true \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 11fcfbe533..ae25c851a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,13 +12,13 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") endif() 
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake) set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR}) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UNDEBUG") +set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC -UNDEBUG") option(INFERENCE_TESTS "Run inference tests" OFF) set(LIBTORCH_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../libtorch" CACHE STRING "LibTorch Path") if (INFERENCE_TESTS) find_package(Torch REQUIRED PATHS ${LIBTORCH_PATH} NO_DEFAULT_PATH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC ${TORCH_CXX_FLAGS}") message(STATUS "LIBTORCH_PATH: ${LIBTORCH_PATH}") message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}") endif() @@ -443,7 +443,7 @@ if (INFERENCE_TESTS) endif() # build binary -option(FF_BUILD_SENTENCEPIECE "build sentencepiece for LLM serving" ON) +option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON) option(FF_BUILD_RESNET "build resnet example" OFF) option(FF_BUILD_RESNEXT "build resnext example" OFF) option(FF_BUILD_ALEXNET "build alexnet example" OFF) @@ -478,13 +478,13 @@ if(FF_BUILD_VISUALIZATION_TOOL) add_subdirectory(tools/substitutions_to_dot) endif() -if(FF_BUILD_SENTENCEPIECE OR FF_BUILD_ALL_INFERENCE_EXAMPLES) +if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) if (FF_GPU_BACKEND STREQUAL "hip_rocm") SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") endif() - add_subdirectory(deps/sentencepiece sentencepiece EXCLUDE_FROM_ALL) - target_include_directories(flexflow PUBLIC deps/sentencepiece/src) - target_link_libraries(flexflow sentencepiece) + add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) + target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) + target_link_libraries(flexflow tokenizers_cpp) endif() # Python diff --git a/FlexFlow.mk b/FlexFlow.mk index 27a0062d8c..980f600c7c 100644 --- a/FlexFlow.mk +++ b/FlexFlow.mk @@ -87,7 +87,7 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1) endif -INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/sentencepiece/src +INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 @@ -97,6 +97,8 @@ CC_FLAGS += -std=c++17 NVCC_FLAGS += -std=c++17 HIPCC_FLAGS += -std=c++17 +LD_FLAGS += -L$(FF_HOME)/deps/tokenizers-cpp/example/tokenizers -ltokenizers_cpp -ltokenizers_c -L$(FF_HOME)/deps/tokenizers-cpp/example/tokenizers/sentencepiece/src -lsentencepiece + ifeq ($(strip $(FF_USE_NCCL)), 1) INC_FLAGS += -I$(MPI_HOME)/include -I$(NCCL_HOME)/include CC_FLAGS += -DFF_USE_NCCL diff --git a/conda/environment.yml b/conda/environment.yml index 8396dafcca..2844a2d8d2 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -8,6 +8,7 @@ dependencies: - numpy>=1.16.0 - Pillow - pybind11 + - rust - cmake-build-extension - pip - pip: diff --git a/conda/flexflow-cpu.yml b/conda/flexflow-cpu.yml index e8cd4c1114..5de189f049 100644 --- a/conda/flexflow-cpu.yml +++ b/conda/flexflow-cpu.yml @@ -8,6 +8,7 @@ dependencies: - numpy>=1.16.0 - Pillow - pybind11 + - rust - 
cmake-build-extension - pytest - pip diff --git a/deps/sentencepiece b/deps/sentencepiece deleted file mode 160000 index 3863f7648e..0000000000 --- a/deps/sentencepiece +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3863f7648e5d8edb571ac592f3ac4f5f0695275a diff --git a/deps/tokenizers-cpp b/deps/tokenizers-cpp new file mode 160000 index 0000000000..c53bc0444d --- /dev/null +++ b/deps/tokenizers-cpp @@ -0,0 +1 @@ +Subproject commit c53bc0444dbe2ea1f66e364cd576a6c1e23539b4 diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 00ecd6956c..598690a8a7 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -59,4 +59,8 @@ RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch RUN conda install -c conda-forge onnx transformers sentencepiece RUN pip3 install tensorflow +# Install Rust +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y +ENV PATH /root/.cargo/bin:$PATH + ENTRYPOINT ["/bin/bash"] diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index df7eb3aeee..3cd42ccffe 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -164,6 +164,7 @@ enum OperatorType { OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, + OP_INC_MULTIQUERY_SELF_ATTENTION, // Parallel Ops OP_REPARTITION, OP_COMBINE, @@ -174,6 +175,8 @@ enum OperatorType { OP_INVALID, }; +enum ModelType { UNKNOWN, LLAMA, OPT, FALCON }; + enum PMParameter { PM_OP_TYPE, // AnyOp PM_NUM_INPUTS, // AnyOp diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 3753e5e50e..4da8dbaf20 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -18,11 +18,13 @@ #include "flexflow/batch_config.h" #include "flexflow/model.h" #include +#include namespace FlexFlow { class FFModel; class BeamTree; +using tokenizers::Tokenizer; class InferenceManager { public: @@ -74,13 +76,12 @@ struct BeamTree { // std::vector probs; // }; -class Tokenizer; - class RequestManager { public: using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; - RequestManager(Tokenizer *tokenizer, + RequestManager(ModelType model_type, + std::string const &path, bool verbose = false, std::string output_filepath = ""); RequestManager(); @@ -145,13 +146,16 @@ class RequestManager { Legion::Runtime *runtime); private: - Tokenizer *tokenizer; + std::unique_ptr tokenizer_; bool verbose; + ModelType model_type; std::string output_filepath; std::queue pending_request_queue; std::unordered_map running_request_queue; std::mutex request_queue_mutex; RequestGuid next_available_guid; + const std::map model_bos_map = {{ModelType::LLAMA, 0}, + {ModelType::OPT, 2}}; // TODO: Move this two vector to request struct std::unordered_map, IncMultiHeadSelfAttention *>, + std::unordered_map< + std::pair, + IncMultiQuerySelfAttention *>, std::unordered_map, BeamTopK *>, std::unordered_map< diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 9549ffc084..8c52dfb584 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -19,6 +19,7 @@ #include "flexflow/ops/gather_params.h" #include "flexflow/ops/groupby_params.h" #include "flexflow/ops/inc_multihead_self_attention_params.h" +#include "flexflow/ops/inc_multiquery_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" #include "flexflow/ops/pool_2d_params.h" diff --git 
a/include/flexflow/ops/inc_multiquery_attention_params.h b/include/flexflow/ops/inc_multiquery_attention_params.h new file mode 100644 index 0000000000..b781669473 --- /dev/null +++ b/include/flexflow/ops/inc_multiquery_attention_params.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_INC_MULTIQUERY_ATTENTION_PARAMS_H +#define _FLEXFLOW_INC_MULTIQUERY_ATTENTION_PARAMS_H + +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct IncMultiQuerySelfAttentionParams { + LayerID layer_guid; + int embed_dim, num_heads, kdim, vdim; + float dropout; + bool bias, add_bias_kv, add_zero_attn; + + bool is_valid(ParallelTensorShape const &) const; +}; + +bool operator==(IncMultiQuerySelfAttentionParams const &, + IncMultiQuerySelfAttentionParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::IncMultiQuerySelfAttentionParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_INC_MULTIQUERY_ATTENTION_PARAMS_H diff --git a/include/flexflow/ops/inc_multiquery_self_attention.h b/include/flexflow/ops/inc_multiquery_self_attention.h new file mode 100644 index 0000000000..ba6a1feeaf --- /dev/null +++ b/include/flexflow/ops/inc_multiquery_self_attention.h @@ -0,0 +1,158 @@ +#ifndef _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H +#define _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/inference.h" +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/ops/inc_multiquery_attention_params.h" +#include "math.h" +#include +#include + +namespace FlexFlow { + +class IncMultiQuerySelfAttentionMeta; + +class IncMultiQuerySelfAttention : public Op { +public: + using Params = IncMultiQuerySelfAttentionParams; + using Input = ParallelTensor; + + IncMultiQuerySelfAttention(FFModel &model, + LayerID const &layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name); + IncMultiQuerySelfAttention(FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name); + IncMultiQuerySelfAttention(FFModel &model, + IncMultiQuerySelfAttention const &other, + const ParallelTensor input, + bool allocate_weights); + IncMultiQuerySelfAttention(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + bool get_int_parameter(PMParameter, int *) const override; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime 
*runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const override; + + static void inference_kernel_wrapper(IncMultiQuerySelfAttentionMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output); + Params get_params() const; + +public: + int num_heads; + float dropout; + bool bias; + bool add_bias_kv, add_zero_attn; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, + embed_dim; + int qoSeqLength, kvSeqLength; +}; + +class IncMultiQuerySelfAttentionMeta : public OpMeta { +public: + IncMultiQuerySelfAttentionMeta(FFHandler handler, + IncMultiQuerySelfAttention const *attn, + GenericTensorAccessorR const &weight, + Legion::Memory gpu_mem, + int num_samples); + IncMultiQuerySelfAttentionMeta(FFHandler handler, + InferenceMode infer_mode, + Op const *attn, + int _qSize, + int _kSize, + int _vSize, + int _qProjSize, + int _kProjSize, + int _vProjSize, + int _oProjSize, + int _embed_dim, + bool _bias, + bool _add_bias_kv, + GenericTensorAccessorR const &weight, + Legion::Memory gpu_mem, + int num_samples); + ~IncMultiQuerySelfAttentionMeta(void); + +public: + Realm::RegionInstance reserveInst; + size_t weights_params, weightSize, reserveSpaceSize; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, + embed_dim; + int num_heads; + bool *has_load_weights; + bool *bias; + bool *multi_query_attention; +#ifdef INFERENCE_TESTS + float *kcache, *vcache; +#endif + void *devQKVProjArray, *keyCache, *valueCache; + void *qk_prods, *qk_prods_softmax; + void *attn_heads, *W_out_contiguous; + BatchConfig::PerTokenInfo *token_infos; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cuFloatComplex *complex_input; +#endif +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_ATTENTION_H diff --git a/include/flexflow/tokenizers.h b/include/flexflow/tokenizers.h deleted file mode 100644 index 8f6c309aad..0000000000 --- a/include/flexflow/tokenizers.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include "gpt_tokenizer.h" -#include - -namespace FlexFlow { - -/*! - * \brief a universal tokenizer that loads - * either HF's tokenizer or sentence piece, depending on the type. 
- */ - -class Tokenizer { -public: - // bos token - int32_t bos_token_id{0}; - // eos token id - int32_t eos_token_id{1}; - - virtual ~Tokenizer() {} - virtual std::vector Encode(std::string const &text) = 0; - virtual std::string Decode(std::vector const &ids) = 0; - - // static std::unique_ptr FromFile(const std::string& path); - // static std::unique_ptr ByteLevelBPEFromFile(const std::string& - // path); -}; - -class SentencePieceTokenizer : public Tokenizer { -public: - SentencePieceTokenizer(std::string const &path) { - sentence_piece_.Load(path); - } - - std::vector Encode(std::string const &text) final { - std::vector tokens; - sentence_piece_.Encode(text, &tokens).IgnoreError(); - return tokens; - } - - std::string Decode(std::vector const &ids) final { - std::string text; - sentence_piece_.Decode(ids, &text).IgnoreError(); - return text; - } - -private: - // the tokenizer - sentencepiece::SentencePieceProcessor sentence_piece_; -}; - -class OptTokenizer : public Tokenizer { -public: - OptTokenizer(std::string const &vocab_file, // path to "gpt2-vocab.json" - std::string const &merges_file) // path to "gpt2-merges.txt" - : tokenizer(OPT_TOKENIZER, vocab_file, merges_file) { - bos_token_id = 0; - eos_token_id = 2; - } - - std::vector Encode(std::string const &text) final { - std::vector tokens; - std::vector mask_ids; - tokenizer.encode(text, text.length(), &tokens, &mask_ids); - - auto it = std::find(mask_ids.begin(), mask_ids.end(), 0); - - if (it != mask_ids.end()) { - size_t index = std::distance(mask_ids.begin(), it); - tokens.erase(tokens.begin() + index, tokens.end()); - } - - return tokens; - } - - std::string Decode(std::vector const &ids) final { - std::vector mask_ids; - for (int i = 0; i < ids.size(); i++) { - mask_ids.push_back(1); - } - std::string text = tokenizer.decode(ids, mask_ids); - return text; - } - -private: - GPT_Tokenizer tokenizer; -}; - -}; // namespace FlexFlow diff --git a/inference/file_loader.cc b/inference/file_loader.cc index f0eff0e50b..d7bb8a7b4c 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -83,7 +83,6 @@ void load_attention_bias(DT *ptr, std::vector bias_files = {q_file, k_file, v_file, o_file}; int file_index = 0; - for (auto file : bias_files) { size_t partial_size = hidden_dim; // std::cout << "Loading filename: " << file << std::endl; @@ -115,6 +114,54 @@ void load_attention_bias(DT *ptr, } } +template +void load_attention_weights_multi_query(DT *ptr, + std::string layer_name, + std::string weight_path, + size_t hidden_dim, + int num_heads) { + + std::string qkv_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_query_key_value_weight"; + std::string o_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_dense_weight"; + + // q has n_heads heads, k and v only have one head, o have n_head heads + std::vector weight_files = {qkv_file, o_file}; + int file_index = 0; + int data_index = 0; + for (auto file : weight_files) { + size_t partial_size = + file_index == 0 ? (hidden_dim + 2 * hidden_dim / num_heads) * hidden_dim + : hidden_dim * hidden_dim; + + std::ifstream in(file, std::ios::in | std::ios::binary); + // std::cout << "Loading filename: " << file << std::endl; + if (!in.good()) { + std::cout << "Could not open file: " << file << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error " << in_get_size << ", " + << loaded_data_size; + assert(false && "data size mismatch"); + } + for (int i = 0; i < partial_size; i++) { + ptr[data_index++] = host_array.at(i); + } + file_index++; + } +} + template void load_attention_weights(DT *ptr, int num_heads, @@ -277,6 +324,9 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, weight_file_path); } + } else if (file_path.find("self_attention") != std::string::npos) { + load_attention_weights_multi_query( + data, file_path, weight_file_path, hidden_dim, num_heads); } else { if (weight_idx > 0) { int index = file_path.find("_weight"); diff --git a/inference/incr_decoding/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt index 761a710d71..d1313d328b 100644 --- a/inference/incr_decoding/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -9,7 +9,8 @@ set(CPU_SRC incr_decoding.cc ../file_loader.cc ../models/llama.cc - ../models/opt.cc) + ../models/opt.cc + ../models/falcon.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index d80ed9520b..a9ec63bc00 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -14,10 +14,10 @@ */ #include "flexflow/inference.h" -#include "flexflow/tokenizers.h" +#include "models/falcon.h" #include "models/llama.h" #include "models/opt.h" -#include + #include using namespace Legion; @@ -32,8 +32,6 @@ struct FilePaths { std::string output_file_path; }; -enum ModelType { UNKNOWN, LLAMA, OPT }; - void parse_input_args(char **argv, int argc, FilePaths &paths, @@ -52,6 +50,8 @@ void parse_input_args(char **argv, llm_model_type = ModelType::LLAMA; } else if (model_type_str == "opt") { llm_model_type = ModelType::OPT; + } else if (model_type_str == "falcon") { + llm_model_type = ModelType::FALCON; } else { llm_model_type = ModelType::UNKNOWN; } @@ -113,32 +113,9 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); - // Create SentencePiece tokenizer or OPT tokenizer - SentencePieceTokenizer *sp_tokenizer = nullptr; - OptTokenizer *opt_tokenizer = nullptr; - if (model_type == ModelType::LLAMA) { - sp_tokenizer = new SentencePieceTokenizer(file_paths.tokenizer_file_path); - } else { - std::string tokenizer_folder = - (!file_paths.tokenizer_file_path.empty() && - file_paths.tokenizer_file_path.back() != '/') - ? 
file_paths.tokenizer_file_path + '/' - : file_paths.tokenizer_file_path; - std::string vocab_file = tokenizer_folder + "gpt2-vocab.json"; - std::string merges_file = tokenizer_folder + "gpt2-merges.txt"; - std::filesystem::path path1(vocab_file); - std::filesystem::path path2(merges_file); - assert(std::filesystem::exists(path1) && - "Vocab file gpt2-vocab.json does not exist at the specified path"); - assert(std::filesystem::exists(path2) && - "Merge file gpt2-merges.txt does not exist at the specified path"); - opt_tokenizer = new OptTokenizer(vocab_file, merges_file); - } - InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); - RequestManager rm((model_type == ModelType::LLAMA) - ? (Tokenizer *)sp_tokenizer - : (Tokenizer *)opt_tokenizer, + RequestManager rm(model_type, + file_paths.tokenizer_file_path, /*verbose*/ verbose, file_paths.output_file_path); @@ -151,8 +128,7 @@ void FlexFlow::top_level_task(Task const *task, ffconfig.workersPerNode * ffconfig.numNodes, INC_DECODING_MODE, use_full_precision); - } else { - assert(model_type == ModelType::OPT); + } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, im, file_paths.llm_config_file_path, @@ -160,6 +136,16 @@ void FlexFlow::top_level_task(Task const *task, ffconfig.workersPerNode * ffconfig.numNodes, INC_DECODING_MODE, use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + im, + file_paths.llm_config_file_path, + file_paths.llm_weight_file_path, + ffconfig.workersPerNode * ffconfig.numNodes, + INC_DECODING_MODE, + use_full_precision); + } else { + assert(false && "unknow model type"); } int total_num_requests = 0; @@ -202,11 +188,6 @@ void FlexFlow::top_level_task(Task const *task, std::cout << "----------inference finished--------------" << std::endl; // free tokenizer space in memory - if (model_type == ModelType::LLAMA) { - delete sp_tokenizer; - } else { - delete opt_tokenizer; - } } void FlexFlow::register_custom_tasks() {} diff --git a/inference/models/configs/falcon_7B.json b/inference/models/configs/falcon_7B.json new file mode 100644 index 0000000000..445da54272 --- /dev/null +++ b/inference/models/configs/falcon_7B.json @@ -0,0 +1,11 @@ +{ + "n_layers": 32, + "vocab_size": 65024, + "n_heads": 71, + "dim": 4544, + "multiple_of": 256, + "norm_eps": 1e-05, + "total_requests": 2560, + "hidden_dim": 11008, + "incremental_mode": true +} \ No newline at end of file diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc new file mode 100644 index 0000000000..b137bf6944 --- /dev/null +++ b/inference/models/falcon.cc @@ -0,0 +1,178 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "falcon.h" + +namespace FlexFlow { + +using namespace Legion; + +void FALCON::create_falcon_model(FFModel &ff, + InferenceManager &im, + std::string const &model_config_file_path, + std::string const &weight_file_path, + int num_pipeline_stages, + InferenceMode mode, + bool use_full_precision) { + Config falcon_config(model_config_file_path); + falcon_config.printConfig(); + //------------------------------compute machine views ------------------ + int num_devices = ff.config.workersPerNode * ff.config.numNodes; + std::vector machine_views; + for (int i = 0; i < num_devices; i++) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = 1; + view.stride[0] = 0; + view.start_device_id = i; + machine_views.push_back(view); + } + + std::unordered_map> mapping; + std::unordered_map weights_layers; + + Tensor input; + { + assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); + int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + input = ff.create_tensor<2>(token_dims, DT_INT32); + } + mapping[input].push_back(machine_views[0]); + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + + Tensor token; + std::vector axes = {0}; + + if (use_full_precision) { + token = ff.embedding(input, + falcon_config.vocab_size, + falcon_config.dim, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + } else { + token = ff.embedding(input, + falcon_config.vocab_size, + falcon_config.dim, + AGGR_MODE_NONE, + DT_HALF, + NULL, + embed_init); + } + + Layer *embedding = ff.layers.back(); + weights_layers.emplace("tok_embeddings_weight", embedding); + + int num_transformer_layers = falcon_config.n_layers; + int num_transformer_layers_per_stage = + (num_transformer_layers + num_pipeline_stages - 1) / num_pipeline_stages; + + for (int i = 0; i < num_transformer_layers; i++) { + // step 1: attention + Tensor att_norm = ff.layer_norm(token, axes, true, falcon_config.norm_eps); + Layer *attention_norm = ff.layers.back(); + + if (i % num_transformer_layers_per_stage == 0) { + // Map att_norm to the next GPU + // since the size of att_norm is minimum across + // all tensors + mapping[att_norm].push_back( + machine_views[i / num_transformer_layers_per_stage]); + } + + weights_layers.emplace("layers_" + std::to_string(i) + + "_input_layernorm_weight", + attention_norm); + Tensor mha; + switch (mode) { + case INC_DECODING_MODE: { + mha = ff.inc_multiquery_self_attention( + att_norm, + falcon_config.dim, + falcon_config.n_heads, + falcon_config.dim / falcon_config.n_heads, + falcon_config.dim / falcon_config.n_heads, + 0.0f, /*dropout*/ + false, /*bias*/ + false, /*add_bias_kv*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr /*kernel_initializer*/ + ); + break; + } + default: { + assert(false); + } + } + Layer *attention_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + + "_self_attention_dense_weight", + attention_layer); + + Tensor dense_h_to_4h = + ff.dense(att_norm, falcon_config.dim * 4, AC_MODE_NONE, false); + Layer *dense_h_to_4h_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + + "_mlp_dense_h_to_4layers_weight", + dense_h_to_4h_layer); + dense_h_to_4h = ff.gelu(dense_h_to_4h); + Tensor mlp_output = + ff.dense(dense_h_to_4h, falcon_config.dim, AC_MODE_NONE, false); + Layer *dense_4h_to_h_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + + "_mlp_dense_4h_to_layers_weight", + dense_4h_to_h_layer); + + token = 
ff.add(token, mha); + token = ff.add(token, mlp_output); + } + // final normalization and linear + Tensor ln_f = ff.layer_norm(token, axes, true, falcon_config.norm_eps); + Layer *ln_f_layer = ff.layers.back(); + weights_layers.emplace("ln_f_weight", ln_f_layer); + + Tensor lm_head = + ff.dense(ln_f, falcon_config.vocab_size, AC_MODE_NONE, false); + Layer *lm_head_layer = ff.layers.back(); + weights_layers.emplace("lm_head_weight", lm_head_layer); + + Tensor output; + if (mode == BEAM_SEARCH_MODE) { + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.beam_top_k(softmax, falcon_config.max_beam_width, false); + } else { + output = ff.arg_top_k(lm_head, /*k=*/1, false); + } + + // Compile the model + std::cout << "------start compile ----------" << std::endl; + im.compile_model_and_allocate_buffer(&ff, mapping); + FileDataLoader fileloader("", + weight_file_path, + falcon_config.n_heads, + falcon_config.dim, + falcon_config.dim / falcon_config.n_heads); + fileloader.load_weights(&ff, weights_layers); + std::cout << "------load weight finished----------" << std::endl; + + // init operators + im.init_operators_inference(&ff); +} + +}; // namespace FlexFlow diff --git a/inference/models/falcon.h b/inference/models/falcon.h new file mode 100644 index 0000000000..03cef07e58 --- /dev/null +++ b/inference/models/falcon.h @@ -0,0 +1,115 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "file_loader.h" +#include "flexflow/batch_config.h" +#include "flexflow/inference.h" +#include +#include +using json = nlohmann::json; + +namespace FlexFlow { + +class FALCON { +public: + struct Config { + Config(void) { + // todo read from config/param file + n_layers = 32; + vocab_size = 32000; + n_heads = 32; + dim = 4096; + multiple_of = 256; + norm_eps = 1e-6; + total_requests = 2560; + incremental_mode = true; + hidden_dim = 11008; + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + } + + Config(std::string config_filepath) { + std::ifstream config_file(config_filepath); + if (config_file.is_open()) { + try { + json config_json; + config_file >> config_json; + + n_layers = config_json["n_layers"]; + vocab_size = config_json["vocab_size"]; + n_heads = config_json["n_heads"]; + dim = config_json["dim"]; + multiple_of = config_json["multiple_of"]; + norm_eps = config_json["norm_eps"]; + total_requests = config_json["total_requests"]; + incremental_mode = config_json["incremental_mode"]; + hidden_dim = config_json["hidden_dim"]; + head_dim = dim / n_heads; + // Override values below + /* max_seq_len = config_json["max_seq_len"]; + max_num_tokens = config_json["max_num_tokens"]; + max_beam_width = config_json["max_beam_width"]; + max_beam_depth = config_json["max_beam_depth"]; + hidden_dim = config_json["hidden_dim"]; */ + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + } catch (json::exception const &e) { + std::cerr << "Error parsing JSON file: " << e.what() << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file." 
<< std::endl; + assert(false); + } + } + + void printConfig() const { + std::cout << "Falcon Config:" << std::endl; + std::cout << "n_layers: " << n_layers << std::endl; + std::cout << "vocab_size: " << vocab_size << std::endl; + std::cout << "n_heads: " << n_heads << std::endl; + std::cout << "dim: " << dim << std::endl; + std::cout << "multiple_of: " << multiple_of << std::endl; + std::cout << "norm_eps: " << norm_eps << std::endl; + std::cout << "total_requests: " << total_requests << std::endl; + std::cout << "incremental_mode: " << incremental_mode << std::endl; + std::cout << "max_seq_len: " << max_seq_len << std::endl; + std::cout << "max_num_tokens: " << max_num_tokens << std::endl; + std::cout << "max_beam_width: " << max_beam_width << std::endl; + std::cout << "max_beam_depth: " << max_beam_depth << std::endl; + std::cout << "hidden_dim: " << hidden_dim << std::endl; + } + + int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, + total_requests, incremental_mode, max_seq_len, max_num_tokens, + max_beam_width, max_beam_depth, head_dim; + float norm_eps; + }; + + static void create_falcon_model(FFModel &ff, + InferenceManager &im, + std::string const &model_config_file_path, + std::string const &weight_file_path, + int num_pipeline_stages, + InferenceMode mode, + bool use_full_precision = false); +}; + +}; // namespace FlexFlow diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 2e38723036..ae577dd02e 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -14,7 +14,6 @@ */ #include "flexflow/inference.h" -#include "flexflow/tokenizers.h" #include "models/llama.h" #include "models/opt.h" #include @@ -34,8 +33,6 @@ struct FilePaths { std::string output_file_path; }; -enum ModelType { UNKNOWN, LLAMA, OPT }; - struct ModelTypes { ModelType llm_model_type; std::vector ssm_model_types; @@ -168,31 +165,9 @@ void FlexFlow::top_level_task(Task const *task, } // Create SentencePiece tokenizer or OPT tokenizer - SentencePieceTokenizer *sp_tokenizer = nullptr; - OptTokenizer *opt_tokenizer = nullptr; - if (model_types.llm_model_type == ModelType::LLAMA) { - sp_tokenizer = new SentencePieceTokenizer(file_paths.tokenizer_file_path); - } else { - std::string tokenizer_folder = - (!file_paths.tokenizer_file_path.empty() && - file_paths.tokenizer_file_path.back() != '/') - ? file_paths.tokenizer_file_path + '/' - : file_paths.tokenizer_file_path; - std::string vocab_file = tokenizer_folder + "gpt2-vocab.json"; - std::string merges_file = tokenizer_folder + "gpt2-merges.txt"; - std::filesystem::path path1(vocab_file); - std::filesystem::path path2(merges_file); - assert(std::filesystem::exists(path1) && - "Vocab file gpt2-vocab.json does not exist at the specified path"); - assert(std::filesystem::exists(path2) && - "Merge file gpt2-merges.txt does not exist at the specified path"); - opt_tokenizer = new OptTokenizer(vocab_file, merges_file); - } - InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); - RequestManager rm((model_types.llm_model_type == ModelType::LLAMA) - ? 
(Tokenizer *)sp_tokenizer - : (Tokenizer *)opt_tokenizer, + RequestManager rm(model_types.llm_model_type, + file_paths.tokenizer_file_path, /*verbose*/ verbose, file_paths.output_file_path); @@ -335,13 +310,6 @@ void FlexFlow::top_level_task(Task const *task, // float* data std::cout << "----------inference finished--------------" << std::endl; - - // free tokenizer space in memory - if (model_types.llm_model_type == ModelType::LLAMA) { - delete sp_tokenizer; - } else { - delete opt_tokenizer; - } } void FlexFlow::register_custom_tasks() {} \ No newline at end of file diff --git a/inference/utils/download_opt_weights.py b/inference/utils/download_opt_weights.py index de42689202..747d471d1a 100644 --- a/inference/utils/download_opt_weights.py +++ b/inference/utils/download_opt_weights.py @@ -56,3 +56,7 @@ def convert_hf_model(model, dst_folder): url = 'https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt' r = requests.get(url) open(tokenizer_filepath , 'wb').write(r.content) +tokenizer_filepath = '../tokenizer/added_tokens.json' +url = 'https://huggingface.co/truongpdd/vietnews-gpt2/raw/main/added_tokens.json' +r = requests.get(url) +open(tokenizer_filepath , 'wb').write(r.content) \ No newline at end of file diff --git a/python/Makefile b/python/Makefile index 10b300841d..ad43bb8890 100644 --- a/python/Makefile +++ b/python/Makefile @@ -28,6 +28,8 @@ USE_GPU_REDUCTIONS ?= 0 FF_USE_NCCL ?= 0 # FF_PYTHON_USE_INDEX_LOADER = 1 +INSTALL_TOKENIZERS := $(shell $(FF_HOME)/scripts/install_tokenizer.sh) + ifeq ($(strip $(DARWIN)),1) PYTHON_EXT := dylib else diff --git a/scripts/install_tokenizer.sh b/scripts/install_tokenizer.sh new file mode 100755 index 0000000000..4632b7e818 --- /dev/null +++ b/scripts/install_tokenizer.sh @@ -0,0 +1,9 @@ +#! /usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" +cd ../deps/tokenizers-cpp/example +cmake -D CMAKE_CXX_FLAGS=-fPIC +make -j diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 2c5ceda548..934353d8e8 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -352,12 +352,12 @@ __device__ void mergeBeamShards(int num_shards, T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + ((next_shard_index % max_heap_size) / k)]; - if (batch_index == 0) { - printf("next_shard_index %d, value %.15f, prob %.15f\n", - next_shard_index, - entries[next_shard_index].value, - prob); - } + // if (batch_index == 0) { + // printf("next_shard_index %d, value %.15f, prob %.15f\n", + // next_shard_index, + // entries[next_shard_index].value, + // prob); + // } max_heap.replace_root( {next_shard_index, entries[next_shard_index].value * prob}, diff --git a/src/ops/inc_multiquery_self_attention.cc b/src/ops/inc_multiquery_self_attention.cc new file mode 100644 index 0000000000..eae98cd7d5 --- /dev/null +++ b/src/ops/inc_multiquery_self_attention.cc @@ -0,0 +1,1434 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/inc_multiquery_self_attention.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#ifdef INFERENCE_TESTS +#include +using namespace at::indexing; +#endif + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +LegionRuntime::Logger::Category log_inc_mqa("IncrementalMQA"); + +bool IncMultiQuerySelfAttentionParams::is_valid( + ParallelTensorShape const &input) const { + bool is_valid = input.is_valid(); + return is_valid; +} + +Tensor FFModel::inc_multiquery_self_attention(const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + char const *name) { + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; + int weight_num = bias ? 2 : 1; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for IncMQA"); + li = new Layer(this, + OP_INC_MULTIQUERY_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_INC_MULTIQUERY_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + input); + } + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = embed_dim; + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, data_type, li, 0, true /*create_grad*/); + } + { + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + // int qSize = input->dims[0], kSize = input->dims[0], vSize = + // input->dims[0]; int qParas = qSize; int kParas = kProjSize; int vParas = + // vProjSize; int oParas = oProjSize; int dims[2] = {qParas + kParas + + // vParas + oParas, num_heads}; + + int dims[2] = {embed_dim + kProjSize + vProjSize + oProjSize, embed_dim}; + + li->weights[0] = create_weight_legion_ordering(2, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->data_type = data_type; + li->add_int_property("embed_dim", embed_dim); + li->add_int_property("num_heads", num_heads); + li->add_int_property("kdim", kdim); + li->add_int_property("vdim", vdim); + li->add_int_property("bias", bias); + li->add_int_property("add_bias_kv", add_bias_kv); + li->add_int_property("add_zero_attn", add_zero_attn); + li->add_float_property("dropout", dropout); + layers.push_back(li); + + return li->outputs[0]; +} + +Op *IncMultiQuerySelfAttention::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + 
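+  // Read back the attention hyper-parameters that were stored on the Layer as
+  // int/float properties when the tensor-level op was created; the boolean
+  // flags were stored as ints and are cast back to bool below.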
layer->get_int_property("embed_dim", value); + int embed_dim = value; + layer->get_int_property("num_heads", value); + int num_heads = value; + layer->get_int_property("kdim", value); + int kdim = value; + layer->get_int_property("vdim", value); + int vdim = value; + float dropout; + layer->get_float_property("dropout", dropout); + layer->get_int_property("bias", value); + bool bias = (bool)value; + layer->get_int_property("add_bias_kv", value); + bool add_bias_kv = (bool)value; + layer->get_int_property("add_zero_attn", value); + bool add_zero_attn = (bool)value; + return new IncMultiQuerySelfAttention(model, + layer->layer_guid, + inputs[0], + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + true /*allocate_weights*/, + layer->name); +} + +IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( + FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_INC_MULTIQUERY_SELF_ATTENTION, + _input->data_type, + name, + 1 /*inputs*/, + (_bias ? 2 : 1), /*weights*/ + 1 /*outputs*/, + _input), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), embed_dim(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { + // overwrite layer_guid + layer_guid = _layer_guid; + + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + // int qParas = this->qProjSize * this->qSize; + // int kParas = this->kProjSize * this->kSize; + // int vParas = this->vProjSize * this->vSize; + // int oParas = + // this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : + // this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->embed_dim; + dims[2].size = + this->embed_dim + this->kProjSize + this->vProjSize + this->oProjSize; + dims[2].degree = 1; + dims[2].parallel_idx = -1; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, this->data_type, this); + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* // Check correctness */ + /* assert(check_output_input_weight_parallel_dims()); */ +} + +IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( + FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_heads, + int _kdim, + int _vdim, + float _dropout, + bool _bias, + bool _add_bias_kv, + bool _add_zero_attn, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_INC_MULTIQUERY_SELF_ATTENTION, + _input->data_type, + name, + 1 /*inputs*/, + (_bias ? 2 : 1), /*weights*/ + 1 /*outputs*/, + _input, + _weight), + num_heads(_num_heads), dropout(_dropout), bias(_bias), + add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), embed_dim(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) +// bias_initializer(_bias_initializer) +{ + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + // int qParas = this->qProjSize * this->qSize; + // int kParas = this->kProjSize * this->kSize; + // int vParas = this->vProjSize * this->vSize; + // int oParas = + // this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : + // this->vSize); + ParallelDim dims[3]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->embed_dim; + dims[2].size = + this->embed_dim + this->kProjSize + this->vProjSize + this->oProjSize; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); +#ifdef USE_NCCL + ParameterSyncType comm_type = ParameterSyncType::NCCL; +#else + ParameterSyncType comm_type = ParameterSyncType::PS; +#endif + weights[0] = model.create_parallel_weight<3>(dims, + this->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, this->data_type, this); + + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ + /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ + // Check correctness + /* assert(check_output_input_weight_parallel_dims()); */ +} + +IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( + FFModel &model, + IncMultiQuerySelfAttention const &other, + const ParallelTensor input, + bool allocate_weights) + : IncMultiQuerySelfAttention(model, + other.layer_guid, + input, + other.oProjSize, + other.num_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.bias, + other.add_bias_kv, + other.add_zero_attn, + allocate_weights, + other.name) {} + +IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( + FFModel &model, + IncMultiQuerySelfAttentionParams const ¶ms, + ParallelTensor const &input, + bool allocate_weights, + char const *name) + : IncMultiQuerySelfAttention(model, + params.layer_guid, + input, + params.embed_dim, + params.num_heads, + params.kdim, + params.vdim, + params.dropout, + params.bias, + params.add_bias_kv, + params.add_zero_attn, + allocate_weights, + name) {} + +void IncMultiQuerySelfAttention::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(IncMultiQuerySelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void IncMultiQuerySelfAttention::init(FFModel const &ff) { + + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(IncMultiQuerySelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): input + regions[1](I): weight + regions[2](O): output +*/ +OpMeta *IncMultiQuerySelfAttention::init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + IncMultiQuerySelfAttention const *attn = + (IncMultiQuerySelfAttention *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + + int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; + // assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + + // 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + // + 1); + int num_heads = (weight.domain.hi()[1] - weight.domain.lo()[1] + 1); + // assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + + // 1); + + 
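+  // Pick the GPU framebuffer memory with the best affinity to the processor
+  // executing this task; it is passed to the IncMultiQuerySelfAttentionMeta
+  // constructed just below.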
Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + IncMultiQuerySelfAttentionMeta *m = new IncMultiQuerySelfAttentionMeta( + handle, attn, weight, gpu_mem, num_samples); + + m->profiling = attn->profiling; + assert(weight.domain.get_volume() * data_type_size(weight.data_type) == + m->weightSize); + return m; +} + +void IncMultiQuerySelfAttention::forward(FFModel const &ff) { + // IncMultiQuerySelfAttention doesn't support forward + assert(false); +} + +FutureMap IncMultiQuerySelfAttention::inference( + FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + log_inc_mqa.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc.num_tokens, + bc.num_active_requests()); + IndexLauncher launcher(INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID, + parallel_is, + TaskArgument(&bc, sizeof(BatchConfig)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiQuerySelfAttention::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + + BatchConfig const *bc = (BatchConfig *)task->args; + IncMultiQuerySelfAttentionMeta const *m = + *((IncMultiQuerySelfAttentionMeta **)task->local_args); + + assert(regions.size() == 3); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 3); + assert(output_domain.get_dim() == 4); + + IncMultiQuerySelfAttention::inference_kernel_wrapper( + m, bc, input, weight, output); +#ifdef INFERENCE_TESTS + printf("Checking IncMultiQuerySelfAttention 
computations...\n"); + + // ============================================================================= + // Define helper functions to handle row-major arrays + // ============================================================================= + + auto set_value_row_major = [](float *arr, + std::vector const &shape, + std::vector const &indices, + float value) -> void { + int offset = 0; + for (int i = 0; i < shape.size(); i++) { + int index = indices[i]; + int stride = 1; + for (int j = i + 1; j < shape.size(); j++) { + stride *= shape[j]; + } + offset += index * stride; + } + *(arr + offset) = value; + }; + + // ============================================================================= + // Load input/output/weights and parse general configs + // ============================================================================= + + float *input_cpu = + download_tensor(input.get_float_ptr(), input_domain.get_volume()); + assert(input_cpu != nullptr); + float *weight_cpu = download_tensor(weight.get_float_ptr(), + weight_domain.get_volume()); + assert(weight_cpu != nullptr); + float *output_cpu = download_tensor(output.get_float_ptr(), + output_domain.get_volume()); + assert(output_cpu != nullptr); + + // Input tensor dimensions + coord_t data_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + coord_t max_sequence_length = input_domain.hi()[1] - input_domain.lo()[1] + 1; + coord_t batch_size = input_domain.hi()[2] - input_domain.lo()[2] + 1; + coord_t replica_dim = input_domain.hi()[3] - input_domain.lo()[3] + 1; + assert(replica_dim == 1); + + size_t effective_batch_size = max_sequence_length * batch_size; + float inputs_arr[data_dim][effective_batch_size] = {0}; + for (size_t i = 0; i < data_dim * bc->num_active_tokens(); i++) { + size_t data_index = i % data_dim; + size_t token_index = i / data_dim; + assert(data_index < data_dim); + assert(token_index < effective_batch_size); + inputs_arr[data_index][token_index] = input_cpu[i]; + } + torch::Tensor torch_input = torch::from_blob( + inputs_arr, {data_dim, (long int)effective_batch_size}, torch::kFloat32); + + // Weight tensor dimensions + coord_t all_weight_params = weight_domain.hi()[0] - weight_domain.lo()[0] + 1; + coord_t num_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; + replica_dim = weight_domain.hi()[2] - weight_domain.lo()[2] + 1; + size_t qParas = m->qProjSize * m->qSize; + size_t kParas = m->kProjSize * m->kSize; + size_t vParas = m->vProjSize * m->vSize; + size_t oParas = m->oProjSize * (m->vProjSize > 0 ? m->vProjSize : m->vSize); + + assert(all_weight_params == qParas + kParas + vParas + oParas); + assert(num_heads == m->num_heads); + assert(replica_dim == 1); + + assert(m->qSize == m->kSize && m->kSize == m->vSize); + // printf("m->qSize: %i\n", m->qSize); + // keep things simple for now + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + long int proj_sum = m->qProjSize + m->kProjSize + m->vProjSize; + // load weight manually because Torch can't easily read a tensor serialized in + // column-major order. 
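+  // Illustration of the column-major unflattening used below (the numbers are
+  // hypothetical and only for the example): a 2 x 3 matrix stored column-major
+  // lays out its entries as
+  //   flat index:  0      1      2      3      4      5
+  //   element:    (0,0)  (1,0)  (0,1)  (1,1)  (0,2)  (1,2)
+  // so row = i % num_rows and col = i / num_rows recover the coordinates of
+  // flat index i. The loops below apply this mapping (num_rows = m->oProjSize
+  // for the output copy, num_rows = m->qSize for the Q/K/V weights) before
+  // handing the arrays to torch::from_blob, which assumes row-major storage.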
+ + // printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " + // "bc->num_active_tokens(): %i, num_heads: %lli, + // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", + // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), + // num_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); + // for (int t=0; t < bc->num_active_tokens(); t++) { + // printf("token %i has request_index: %li and token_position: %li\n", + // t, bc->token2ids.token_indexes[t].request_index, + // bc->token2ids.token_indexes[t].token_position); + // } + + // ============================================================================= + // Load the output tensor (with CUDA results), and create a Torch tensor + // ============================================================================= + + float output_cuda[m->oProjSize][effective_batch_size] = {0}; + for (int i = 0; i < m->oProjSize * effective_batch_size; i++) { + int row_idx = i % m->oProjSize; + int col_idx = i / m->oProjSize; + assert(row_idx < m->oProjSize && col_idx < effective_batch_size); + output_cuda[row_idx][col_idx] = output_cpu[i]; + } + torch::Tensor torch_out_cuda = + torch::from_blob(output_cuda, + {m->oProjSize, (int64_t)effective_batch_size}, + torch::kFloat32); + + // ============================================================================= + // Load the Q/K/V projection weights, and create a Torch tensor + // ============================================================================= + std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_heads}; + float *w_qkv = + (float *)calloc(m->qSize * m->qProjSize * 3 * num_heads, sizeof(float)); + assert(w_qkv[0] == 0.0f); + + for (int h = 0; h < num_heads; h++) { + for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { + int row_index = i % m->qSize; + int column_index = i / m->qSize; + // Q + set_value_row_major(w_qkv, + w_qkv_shape, + {row_index, column_index, 0, h}, + weight_cpu[all_weight_params * h + + m->qSize * column_index + row_index]); + // K + set_value_row_major( + w_qkv, + w_qkv_shape, + {row_index, column_index, 1, h}, + weight_cpu[all_weight_params * h + m->qProjSize * m->qSize + + m->qSize * column_index + row_index]); + // V + set_value_row_major( + w_qkv, + w_qkv_shape, + {row_index, column_index, 2, h}, + weight_cpu[all_weight_params * h + 2 * m->qProjSize * m->qSize + + m->qSize * column_index + row_index]); + } + } + // convert weights to torch tensor + torch::Tensor torch_w_qkv = torch::from_blob( + w_qkv, {m->qSize, m->qProjSize, 3, (int)num_heads}, torch::kFloat32); + + /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() + << std::endl; + std::cout << "Torch input size: " << torch_input.sizes() << std::endl; + std::cout << "Number of active tokens: " << bc->num_active_tokens() + << std::endl; */ + // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; + + // ============================================================================= + // Compute the Q/K/V projections, and compare the results with CUDA + // ============================================================================= + + // ----------------------- C++ computations & checks ------------------------ + torch::Tensor qkv_projs = torch::einsum( + "ijkl,im->jmkl", + {torch_w_qkv, + torch_input.index({Slice(), Slice(0, bc->num_active_tokens())})}); + // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; + assert(qkv_projs.sizes()[0] == m->qProjSize); + assert(qkv_projs.sizes()[1] == 
bc->num_active_tokens() && + qkv_projs.sizes()[1] <= effective_batch_size); + assert(qkv_projs.sizes()[2] == 3); + assert(qkv_projs.sizes()[3] == num_heads); + free(w_qkv); + + // ----------------------- Loading CUDA results for this step --------------- + float *QKVProjArray_cpu = download_tensor(m->devQKVProjArray, + BatchConfig::MAX_NUM_TOKENS * + proj_sum * m->num_heads); + assert(QKVProjArray_cpu != nullptr); + + std::vector QKVProjArray_converted_shape = { + m->qProjSize, bc->num_active_tokens(), 3, (int)num_heads}; + float *QKVProjArray_converted = (float *)calloc( + m->qProjSize * bc->num_active_tokens() * 3 * num_heads, sizeof(float)); + + // skip over padding at the end of QKVProjArray_cpu + // convert from column order to 3D matrix because torch cannot automatically + // import matrices flattened in column order + for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_heads; i++) { + int proj_size_index = i % m->qProjSize; + int head_index = i / (proj_sum * bc->num_active_tokens()); + int token_index = + ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % + bc->num_active_tokens(); + int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / + (m->qProjSize * bc->num_active_tokens()); + assert(proj_size_index < proj_sum); + assert(head_index < num_heads); + assert(token_index < bc->num_active_tokens()); + assert(qkv_offset < 3); + set_value_row_major(QKVProjArray_converted, + QKVProjArray_converted_shape, + {proj_size_index, token_index, qkv_offset, head_index}, + QKVProjArray_cpu[i]); + } + torch::Tensor QKVProjArray_torch = + torch::from_blob(QKVProjArray_converted, + {m->qProjSize, bc->num_active_tokens(), 3, num_heads}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + // std::cout << "QKVProjArray_torch" << std::endl; + // for (int i=0; inum_active_tokens(); t++) { + for (size_t d = 0; d < m->kProjSize; d++) { + size_t kcache_idx = + d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + + bc->tokensInfo[t].abs_depth_in_request * m->num_heads * + BatchConfig::MAX_NUM_REQUESTS + + h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; + m->kcache[kcache_idx] = + qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) + .item(); + } + for (size_t d = 0; d < m->vProjSize; d++) { + size_t vcache_idx = + d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + + bc->tokensInfo[t].abs_depth_in_request * m->num_heads * + BatchConfig::MAX_NUM_REQUESTS + + h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; + m->vcache[vcache_idx] = + qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) + .item(); + } + } + } + // Create torch tensors from the arrays + torch::Tensor K_t = torch::from_blob( + m->kcache, + {m->kProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + torch::Tensor V_t = torch::from_blob( + m->vcache, + {m->vProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + + // Compute useful indices + std::vector req_idxs; + std::vector r_first_idx; + std::vector r_num_tokens; + for (size_t t = 0; t < bc->num_active_tokens(); t++) { + size_t rid = bc->tokensInfo[t].request_index; + if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { + req_idxs.push_back(rid); + r_first_idx.push_back(t); + r_num_tokens.push_back(1); + } else { + r_num_tokens[r_num_tokens.size() - 1]++; + } + assert(req_idxs.size() == r_first_idx.size() && + r_first_idx.size() == 
r_num_tokens.size()); + } + assert(req_idxs.size() == bc->num_active_requests()); + assert(std::accumulate(r_num_tokens.begin(), + r_num_tokens.end(), + decltype(r_num_tokens)::value_type(0)) == + bc->num_active_tokens()); + + // ----------------------- Loading CUDA results for this step --------------- + float *keyCache_cpu = + download_tensor(m->keyCache, + m->num_heads * m->kProjSize * + BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); + float *valueCache_cpu = + download_tensor(m->valueCache, + m->num_heads * m->vProjSize * + BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); + assert(keyCache_cpu != nullptr); + assert(valueCache_cpu != nullptr); + + float *kcache_cuda = (float *)calloc( + m->kProjSize * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + float *vcache_cuda = (float *)calloc( + m->vProjSize * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + int index = 0; + for (int i = 0; i < m->kProjSize; i++) { + for (int j = 0; j < MAX_SEQ_LEN; j++) { + for (int k = 0; k < m->num_heads; k++) { + for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { + int col_major_index = l * m->kProjSize * MAX_SEQ_LEN * m->num_heads + + k * m->kProjSize * MAX_SEQ_LEN + + j * m->kProjSize + i; + kcache_cuda[index++] = keyCache_cpu[col_major_index]; + } + } + } + } + index = 0; + for (int i = 0; i < m->vProjSize; i++) { + for (int j = 0; j < MAX_SEQ_LEN; j++) { + for (int k = 0; k < m->num_heads; k++) { + for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { + int col_major_index = l * m->vProjSize * MAX_SEQ_LEN * m->num_heads + + k * m->vProjSize * MAX_SEQ_LEN + + j * m->vProjSize + i; + vcache_cuda[index++] = valueCache_cpu[col_major_index]; + } + } + } + } + torch::Tensor K_t_cuda = torch::from_blob( + kcache_cuda, + {m->kProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + torch::Tensor V_t_cuda = torch::from_blob( + vcache_cuda, + {m->vProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + + // std::cout << "kcache differences:" << std::endl; + // for (int i=0; i < bc->num_active_requests() + 1; i++) { + // for (int j=0; j < num_heads; j++) { + // for (int l=0; l < m->kProjSize; l++) { + // for (int k=0; k < MAX_SEQ_LEN; k++) { + // size_t kcache_idx = + // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // j * BatchConfig::MAX_NUM_REQUESTS + + // i; + // if ( abs(m->kcache[kcache_idx] - keyCache_cpu[ + // i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // j * m->kProjSize * MAX_SEQ_LEN + + // k * m->kProjSize + + // l + // ]) > 0.00001) { + // printf("req: %i (rid: %i), head: %i, data_dim: %i, token_pos: + // %i\n", + // i, req_idxs[i], j, l, k); + // } + // } + // } + // } + // } + + // std::cout << "keyCache from CUDA:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jkProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // printf("%f ", + // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // j * m->kProjSize * MAX_SEQ_LEN + + // k * m->kProjSize + + // l + // ]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // std::cout << "valueCache from CUDA:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jvProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // printf("%f ", + // 
valueCache_cpu[ + // i * m->vProjSize * MAX_SEQ_LEN * num_heads + + // j * m->vProjSize * MAX_SEQ_LEN + + // k * m->vProjSize + + // l]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // printf("\n"); + + // std::cout << "C++ kcache:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; j < num_heads; j++) { + // for (int l=0; l < m->kProjSize; l++) { + // for (int k=0; k < MAX_SEQ_LEN; k++) { + // size_t kcache_idx = + // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // j * BatchConfig::MAX_NUM_REQUESTS + + // i; + // printf("%f ", m->kcache[kcache_idx]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + // std::cout << "C++ vcache:" << std::endl; + // for (int i=0; inum_active_requests()+1; i++) { + // for (int j=0; jvProjSize; l++) { + // for (int k=0; k< MAX_SEQ_LEN; k++) { + // size_t vcache_idx = + // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // j * BatchConfig::MAX_NUM_REQUESTS + + // i; + // printf("%f ", m->vcache[vcache_idx]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // } + + assert(torch::allclose(K_t_cuda, K_t, 1e-05, 1e-05)); + assert(torch::allclose(V_t_cuda, V_t, 1e-05, 1e-05)); + free(kcache_cuda); + free(vcache_cuda); + + // ============================================================================= + // Load the W_out projection weights + // ============================================================================= + + // ----------------------- C++ operations & checks -------------------------- + float *w_out = (float *)calloc(m->vProjSize * m->num_heads * m->oProjSize, + sizeof(float)); + std::vector w_out_shape = {m->vProjSize, m->num_heads, m->oProjSize}; + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + for (int h = 0; h < num_heads; h++) { + for (int v = 0; v < m->vProjSize; v++) { + for (int o = 0; o < m->oProjSize; o++) { + set_value_row_major( + w_out, + w_out_shape, + {v, h, o}, + weight_cpu[all_weight_params * h + 3 * m->qProjSize * m->qSize + + m->vProjSize * o + v]); + } + } + } + // convert weights to torch tensor + torch::Tensor torch_w_out = torch::from_blob( + w_out, {m->vProjSize, m->num_heads, m->oProjSize}, torch::kFloat32); + + // ----------------------- Loading CUDA results for this step --------------- + float *w_out_cuda = download_tensor( + m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_heads); + assert(w_out_cuda != nullptr); + float *converted_wout_tensor = (float *)calloc( + m->vProjSize * m->num_heads * m->oProjSize, sizeof(float)); + std::vector converted_wout_tensor_shape = { + m->vProjSize, m->num_heads, m->oProjSize}; + + for (int i = 0; i < m->vProjSize * m->num_heads * m->oProjSize; i++) { + int v_idx = i % m->vProjSize; + int h_idx = (i / m->vProjSize) % m->num_heads; + int o_idx = i / (m->vProjSize * m->num_heads); + assert(v_idx < m->vProjSize && h_idx < m->num_heads && + o_idx < m->oProjSize); + set_value_row_major(converted_wout_tensor, + converted_wout_tensor_shape, + {v_idx, h_idx, o_idx}, + w_out_cuda[i]); + } + torch::Tensor w_out_cuda_tensor = + torch::from_blob(converted_wout_tensor, + {m->vProjSize, m->num_heads, m->oProjSize}, + torch::kFloat32); + + // ----------------------- Comparing C++ & CUDA results --------------------- + assert(torch::allclose(w_out_cuda_tensor, torch_w_out, 1e-05, 
1e-05)); + free(converted_wout_tensor); + + // ============================================================================= + // Compute the softmax(QK^T/sqrt(d_k))V product, request by request + // ============================================================================= + + // ----------------------- C++ initialization steps ------------------------- + torch::Tensor Q_projs = qkv_projs.index({Slice(), Slice(), 0, Slice()}) + .reshape({qkv_projs.sizes()[0], + qkv_projs.sizes()[1], + qkv_projs.sizes()[3]}); + + torch::Tensor qk_products[bc->num_active_requests()]; + torch::Tensor qk_softmax[bc->num_active_requests()]; + torch::Tensor attn_heads[bc->num_active_requests()]; + + torch::Tensor cpp_output = + torch::zeros({m->oProjSize, bc->num_active_tokens()}); + + // ----------------------- Loading CUDA results for this step --------------- + float *qk_prods_cpu = download_tensor( + m->qk_prods, + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads); + assert(qk_prods_cpu != nullptr); + + float *qk_prods_softmax_cpu = download_tensor( + m->qk_prods_softmax, + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads); + assert(qk_prods_softmax_cpu != nullptr); + + float *attn_heads_cpu = download_tensor( + m->attn_heads, BatchConfig::MAX_NUM_TOKENS * m->num_heads * m->vProjSize); + assert(attn_heads_cpu != nullptr); + + // ----------------------- Main loop (request by request) ------------------- + size_t qk_prods_cpu_offset = 0; + + for (size_t r = 0; r < bc->num_active_requests(); r++) { + // Compute pre-request parameters + size_t num_new_tokens = r_num_tokens[r]; + int64_t rid = (int64_t)(req_idxs[r]); + int64_t num_tokens_received_so_far = + (int64_t)(bc->requestsInfo[rid].token_start_offset + + bc->requestsInfo[rid].num_tokens_in_batch); + assert(num_new_tokens == bc->requestsInfo[rid].num_tokens_in_batch); + assert(num_tokens_received_so_far >= (int64_t)num_new_tokens); + + // ----------------------- C++ computations ------------------------------- + // Get the slice of the Q projection tensor with the tokens in the current + // request + torch::Tensor Q_req = + Q_projs.index({Slice(), + Slice(r_first_idx[r], r_first_idx[r] + num_new_tokens), + Slice()}); + // std::cout << "Q_req.sizes(): " << Q_req.sizes() << std::endl; + assert(Q_req.sizes()[0] == m->qProjSize); + assert(Q_req.sizes()[1] == num_new_tokens); + assert(Q_req.sizes()[2] == num_heads); + + /*printf("\n------------ QK multiplication (C++) -------------\n"); + printf("Request r=%lu. num_new_tokens: %lu, num_tokens_received_so_far: %li, + rid: %li, Qproj slice: (%i, %i)\n", r, num_new_tokens, + num_tokens_received_so_far, rid, r_first_idx[r], r_first_idx[r] + + num_new_tokens); + + std::cout << "Q_req matrix (idk dims):" << std::endl << + Q_req.index({Slice(), Slice(), 0}) << std::endl << std::endl; std::cout << + "K_t matrix (ilk dims):" << std::endl << K_t.index({Slice(), Slice(0, + num_tokens_received_so_far), 0, rid}) << std::endl << std::endl; std::cout + << "C++ alpha: " << (1.0f / sqrt(m->kProjSize)) << std::endl;*/ + + // Compute (Q*K^T)/sqrt(d_k) matmul + qk_products[r] = + torch::einsum("ijk,ilk->jlk", + {Q_req, + K_t.index({Slice(), + Slice(0, num_tokens_received_so_far), + Slice(), + rid})}) * + (1.0f / sqrt(m->kProjSize)); + + // Set entries above diagonal to -inf to make attention causal. 
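+    // Worked illustration with hypothetical sizes: for
+    // num_tokens_received_so_far = 4 and num_new_tokens = 2, each head's score
+    // block is 2 x 4. The first 4 - 2 = 2 key columns belong to tokens that
+    // were already processed, so both new queries may attend to them and those
+    // columns are left untouched. Only the trailing 2 x 2 block (the new
+    // tokens attending to each other) is masked: adding a zero-diagonal,
+    // upper-triangular matrix of -INFINITY to its tril() turns
+    //   [s00 s01]        [s00  -inf]
+    //   [s10 s11]  into  [s10  s11 ]
+    // so query i never attends to a newer key j > i.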
+ for (int h = 0; h < num_heads; h++) { + qk_products[r].index( + {Slice(), Slice(num_tokens_received_so_far - num_new_tokens), h}) = + qk_products[r] + .index({Slice(), + Slice(num_tokens_received_so_far - num_new_tokens), + h}) + .tril() + + torch::full({(int64_t)num_new_tokens, (int64_t)num_new_tokens}, + -INFINITY) + .triu() + .fill_diagonal_(0); + } + // Compute softmax for each request block + qk_softmax[r] = torch::softmax(qk_products[r], -2); + assert(qk_softmax[r].sizes()[0] == num_new_tokens); + assert(qk_softmax[r].sizes()[1] == num_tokens_received_so_far); + assert(qk_softmax[r].sizes()[2] == m->num_heads); + + // ------------------- Loading CUDA results for this step --------------- + float *converted_qk_prod = (float *)calloc( + num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + float *converted_qk_prod_softmax = (float *)calloc( + num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + std::vector converted_qk_prod_shape = { + (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_heads}; + + for (size_t i = 0; + i < num_new_tokens * num_tokens_received_so_far * num_heads; + i++) { + size_t new_t_idx = i % num_new_tokens; + size_t all_t_idx = (i / num_new_tokens) % num_tokens_received_so_far; + size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); + assert(new_t_idx < num_new_tokens && + all_t_idx < num_tokens_received_so_far && head_idx < num_heads); + set_value_row_major(converted_qk_prod, + converted_qk_prod_shape, + {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, + qk_prods_cpu[i + qk_prods_cpu_offset]); + set_value_row_major(converted_qk_prod_softmax, + converted_qk_prod_shape, + {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, + qk_prods_softmax_cpu[i + qk_prods_cpu_offset]); + } + torch::Tensor qk_prods_cuda = torch::from_blob( + converted_qk_prod, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + torch::kFloat32); + torch::Tensor qk_prods_softmax_cuda = torch::from_blob( + converted_qk_prod_softmax, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + torch::kFloat32); + + // ------------------- Comparing C++ & CUDA results ------------------ + /* std::cout << "C++:" <vProjSize); + assert( + V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) + .sizes()[1] == num_tokens_received_so_far); + assert( + V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) + .sizes()[2] == m->num_heads); + attn_heads[r] = torch::einsum( + "ijk,ljk->ilk", + {qk_softmax[r], + V_t.index( + {Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid})}); + assert(attn_heads[r].sizes()[0] == num_new_tokens); + assert(attn_heads[r].sizes()[1] == m->vProjSize); + assert(attn_heads[r].sizes()[2] == m->num_heads); + + // ------------------- Loading CUDA results for this step --------------- + float converted_attn_heads_cpu[num_new_tokens][m->vProjSize][m->num_heads] = + {0}; + for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_heads; i++) { + int token_ix = i % num_new_tokens; + int vproj_idx = (i / num_new_tokens) % m->vProjSize; + int head_idx = i / (num_new_tokens * m->vProjSize); + assert(token_ix < num_new_tokens && vproj_idx < m->vProjSize && + head_idx < m->num_heads); + converted_attn_heads_cpu[token_ix][vproj_idx][head_idx] = + attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_heads + i]; + } + torch::Tensor converted_attn_heads_cuda = + torch::from_blob(converted_attn_heads_cpu, + {(int64_t)num_new_tokens, m->vProjSize, 
m->num_heads}, + torch::kFloat32); + + // -------------------- Comparing C++ & CUDA results ------------------- + /* std::cout << "CUDA attn head for req " << r << ":" <num_heads; h++) { + std::cout << converted_attn_heads_cuda.index({Slice(), Slice(), h}) << + std::endl; + } + std::cout << "C++ attn head for req " << r << ":" <num_heads; h++) { + std::cout << attn_heads[r].index({Slice(), Slice(), h}) << std::endl; + } */ + assert(torch::allclose( + converted_attn_heads_cuda, attn_heads[r], 1e-05, 1e-05)); + + // ----------------------- C++ computations ---------------------------- + // Compute output values by projecting all heads to output space + cpp_output.index( + {Slice(), + Slice(r_first_idx[r], r_first_idx[r] + (int64_t)num_new_tokens)}) = + torch::einsum("jkl,ijk->li", {torch_w_out, attn_heads[r]}); + + // increment main loop's auxiliary index + qk_prods_cpu_offset += + num_new_tokens * num_tokens_received_so_far * num_heads; + } + + // ----------------------- Comparing C++ & CUDA results --------------------- + /* std::cout << "C++:" <oProjSize; i++) { + std::cout << cpp_output.index({i, Slice()}) << std::endl; + } + std::cout << "CUDA:" <oProjSize; i++) { + std::cout << torch_out_cuda.index({i, Slice(0, + (int64_t)bc->num_active_tokens())}) << std::endl; + } */ + + assert(torch::allclose( + torch_out_cuda.index( + {Slice(), Slice(0, (int64_t)bc->num_active_tokens())}), + cpp_output, + 1e-05, + 1e-05)); + + // ============================================================================= + // Cleanup + // ============================================================================= + free(w_out); + checkCUDA(cudaFreeHost(input_cpu)); + checkCUDA(cudaFreeHost(weight_cpu)); + checkCUDA(cudaFreeHost(output_cpu)); + checkCUDA(cudaFreeHost(QKVProjArray_cpu)); + checkCUDA(cudaFreeHost(keyCache_cpu)); + checkCUDA(cudaFreeHost(valueCache_cpu)); + checkCUDA(cudaFreeHost(qk_prods_cpu)); + checkCUDA(cudaFreeHost(qk_prods_softmax_cpu)); + checkCUDA(cudaFreeHost(attn_heads_cpu)); + checkCUDA(cudaFreeHost(w_out_cuda)); + // assert(false && "All good if you see this assert failure! 
:)"); +#endif + // Done with INFERENCE_TESTS block +} + +void IncMultiQuerySelfAttention::backward(FFModel const &ff) { + // IncMultiQuerySelfAttention does not support backward + assert(false); +} + +bool IncMultiQuerySelfAttention::get_int_parameter(PMParameter para, + int *value) const { + switch (para) { + case PM_NUM_HEADS: + *value = num_heads; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool IncMultiQuerySelfAttention::measure_operator_cost( + Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(IncMultiQuerySelfAttentionParams const &lhs, + IncMultiQuerySelfAttentionParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && + lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && + lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && + lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && + lhs.add_zero_attn == rhs.add_zero_attn; +} + +IncMultiQuerySelfAttentionParams + IncMultiQuerySelfAttention::get_params() const { + IncMultiQuerySelfAttentionParams params; + params.layer_guid = this->layer_guid; + params.embed_dim = this->oProjSize; + params.num_heads = this->num_heads; + params.kdim = this->kProjSize; + params.vdim = this->vProjSize; + params.dropout = this->dropout; + params.bias = this->bias; + params.add_bias_kv = this->add_bias_kv; + params.add_zero_attn = this->add_zero_attn; + + return params; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::IncMultiQuerySelfAttentionParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.embed_dim); + hash_combine(key, params.num_heads); + hash_combine(key, params.kdim); + hash_combine(key, params.vdim); + hash_combine(key, params.dropout); + hash_combine(key, params.bias); + hash_combine(key, params.add_bias_kv); + hash_combine(key, params.add_zero_attn); + return key; +} +}; // namespace std diff --git a/src/ops/inc_multiquery_self_attention.cpp b/src/ops/inc_multiquery_self_attention.cpp new file mode 100644 index 0000000000..c032e887a7 --- /dev/null +++ b/src/ops/inc_multiquery_self_attention.cpp @@ -0,0 +1,96 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/inc_multiquery_self_attention.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +/*static*/ +void IncMultiQuerySelfAttention::inference_kernel_wrapper( + IncMultiQuerySelfAttentionMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + + handle_unimplemented_hip_kernel(OP_INC_MULTIQUERY_SELF_ATTENTION); + + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + printf("IncMultiQuerySelfAttention forward time = %.2fms\n", elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( + FFHandler handler, + IncMultiQuerySelfAttention const *attn, + GenericTensorAccessorR const &weight, + Memory gpu_mem, + int num_samples) + : OpMeta(handler, attn) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(handler.dnn, stream)); +} + +IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( + FFHandler handler, + InferenceMode infer_mode, + Op const *attn, + int _qSize, + int _kSize, + int _vSize, + int _qProjSize, + int _kProjSize, + int _vProjSize, + int _oProjSize, + int _embed_dim, + bool _bias, + bool _add_bias_kv, + GenericTensorAccessorR const &weight, + Legion::Memory gpu_mem, + int num_samples) + : OpMeta(handler, attn) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(handler.dnn, stream)); +} + +IncMultiQuerySelfAttentionMeta::~IncMultiQuerySelfAttentionMeta(void) {} + +}; // namespace FlexFlow diff --git a/src/ops/inc_multiquery_self_attention.cu b/src/ops/inc_multiquery_self_attention.cu new file mode 100644 index 0000000000..ed6153de2d --- /dev/null +++ b/src/ops/inc_multiquery_self_attention.cu @@ -0,0 +1,797 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/inc_multiquery_self_attention.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +namespace Kernels { +namespace IncMultiHeadAttention { + +template +__global__ void apply_rotary_embedding_multi_query( + DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int q_block_size, + int k_block_size, + int v_block_size, + bool q_tensor) { + int proj_size = q_tensor ? qProjSize : kProjSize; + int real_num_heads = q_tensor ? num_heads : 1; + + CUDA_KERNEL_LOOP(i, num_tokens * proj_size * real_num_heads / 2) { + // create complex number + int head_idx = q_tensor ? i / (num_tokens * proj_size / 2) : 0; + int idx = i % (num_tokens * proj_size / 2); + int token_idx = + (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + + int real_part_index = + idx + token_idx * (proj_size / 2) + + (q_tensor ? head_idx * q_block_size : num_heads * q_block_size); + int complex_part_index = real_part_index + (proj_size / 2); + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + // int head_idx = i / (num_tokens * proj_size); + + // size_t pos = id_map[token_idx].token_position; + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + int pos_i = i % (proj_size / 2); + float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +void compute_qkv_kernel(IncMultiQuerySelfAttentionMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + cudaStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + assert(m->qSize == m->vSize && m->qSize == m->kSize); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; +#endif + // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) + // Weights: qSize x qProjSize x 3 x num_heads + // Input: qSize x num_tokens + // Output >>> qProjSize x num_tokens x 3 x num_heads + int num_tokens = bc->num_active_tokens(); + int m_q = m->qProjSize; + int n = bc->num_active_tokens(); + int k = m->qSize; + int lda = k, ldb = k, ldc = m_q; + size_t strideA = m_q * k; + size_t strideB = 0; + size_t strideC = m_q * n; // size of the output block for each head. 
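For intuition, here is a minimal host-side sketch of the per-pair rotation performed by the apply_rotary_embedding_multi_query kernel defined above (which is launched after the Q/K/V projections below). It uses std::complex in place of cuFloatComplex; rotate_pair and the example values are illustrative only and are not part of FlexFlow.

```cpp
// Host-side sketch of the rotary-embedding update: element i is paired with
// element i + proj_size/2, interpreted as a complex number, and rotated by an
// angle that depends on the token position and the pair index.
#include <cmath>
#include <complex>
#include <cstdio>

void rotate_pair(float &x_re, float &x_im, int pair_idx, int proj_size, int pos) {
  // theta = pos / 10000^(2 * pair_idx / proj_size), as in the kernel above
  float freq = pos * (1.0f / std::pow(10000.0f, 2.0f * pair_idx / proj_size));
  std::complex<float> value(x_re, x_im);
  std::complex<float> rotation(std::cos(freq), std::sin(freq));
  std::complex<float> rotated = value * rotation;
  x_re = rotated.real();
  x_im = rotated.imag();
}

int main() {
  // element 0 and element proj_size/2 of a 128-dim projection, at position 2
  float re = 1.0f, im = 0.0f;
  rotate_pair(re, im, /*pair_idx=*/0, /*proj_size=*/128, /*pos=*/2);
  std::printf("%.4f %.4f\n", re, im); // prints cos(2) and sin(2)
  return 0;
}
```

In the kernel, the same in-place update is applied to the query block for every head and to the single shared key head of the fused QKV buffer.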
+ // q + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_q, + n, + k, + &alpha, + weight_ptr, + cublas_data_type, + lda, + strideA, + input_ptr, + cublas_data_type, + ldb, + strideB, + &beta, + output_ptr, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // k + int m_ = m->kProjSize; + int k_ = m->embed_dim; + int n_ = num_tokens; + lda = k_, ldb = k_, ldc = m_; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + weight_ptr + m->embed_dim * m->embed_dim, + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr + num_tokens * m->embed_dim, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // v + checkCUDA( + cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + weight_ptr + m->embed_dim * (m->embed_dim + m->kProjSize), + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr + num_tokens * (m->embed_dim + m->kProjSize), + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save_tensor
(output_ptr, 4544 * + // 7,"/home/ubuntu/FlexFlow/inference/q_before.txt"); + int q_block_size = m->qProjSize * num_tokens; + int k_block_size = m->kProjSize * num_tokens; + int v_block_size = m->vProjSize * num_tokens; + int parallelism = m->qProjSize * num_tokens * m->num_heads / 2; + apply_rotary_embedding_multi_query<<>>(output_ptr, + m->complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + true); + parallelism = m->kProjSize * num_tokens / 2; + apply_rotary_embedding_multi_query<<>>(output_ptr, + m->complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + q_block_size, + k_block_size, + v_block_size, + false); + + // save_tensor
<DT>(output_ptr, 64 * 7 * 2, + // "/home/ubuntu/FlexFlow/inference/query.txt"); + // save_tensor<DT>
(output_ptr, 4544 * + // 7,"/home/ubuntu/FlexFlow/inference/q.txt"); print_tensor<DT>
(output_ptr + // + num_new_tokens * (m->embed_dim + m->kProjSize), 32, "vvvvvvvvv"); +} + +template <typename DT> +void update_kv_cache_kernel(IncMultiQuerySelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_tokens(); + if (num_tokens > 0) { + int parallelism = m->kProjSize * num_tokens; + store_kv_cache_multi_query<<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( + static_cast<DT *>
(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + BatchConfig::MAX_SEQ_LENGTH, + /* k_cache = */ true); + + parallelism = m->vProjSize * num_tokens; + store_kv_cache_multi_query<<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( + static_cast<DT *>
(m->devQKVProjArray), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_heads, + BatchConfig::MAX_SEQ_LENGTH, + /* k_cache = */ false); + } +} + +template <typename DT> +void inference_kernel(IncMultiQuerySelfAttentionMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + cudaStream_t stream) { + // copy the token metadata to the device here because we need the position info during inference + cudaMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, + bc, + input_ptr, + weight_ptr, + static_cast
<DT *>(m->devQKVProjArray), + stream); + + // phase 2: Update key/val cache + update_kv_cache_kernel<DT>
(m, bc, stream); + + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + compute_attention_kernel(m, bc, output_ptr, weight_ptr, stream); +} + +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + +template +__global__ void + store_kv_cache_multi_query(DT const *devQKVProjArray, + DT *cache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int num_heads, + int max_seq_len, + bool k_cache) { + CUDA_KERNEL_LOOP(i, num_tokens * (k_cache ? kProjSize : vProjSize)) { + int proj_size = k_cache ? kProjSize : vProjSize; + // int head_idx = i / (num_tokens * proj_size); + // int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; + int token_idx = i / proj_size; + int data_idx = i % proj_size; + + // int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + // int current_head_block_size = + // num_tokens * (k_cache ? qProjSize : qProjSize + kProjSize); + + // |q|k|v| + int pre_size = num_tokens * qProjSize * num_heads + + (k_cache ? 0 : kProjSize * num_tokens); + + DT val = devQKVProjArray[pre_size + token_idx * proj_size + data_idx]; + // int const req_id = id_map[token_idx].request_index; + // int const tok_id = id_map[token_idx].token_position; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + cache_ptr[req_id * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } +} + +template +__global__ void + fill_entries_above_diagonal_multi_query(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + +template +void compute_attention_kernel(IncMultiQuerySelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + DT const *weight_ptr, + cudaStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; +#endif + // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int qkv_block_size = (m->qProjSize) * num_tokens; + int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_req_block_size = kt_block_size; + int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_req_block_size = vt_block_size; + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].token_start_offset + + 
bc->requestsInfo[i].num_tokens_in_batch; + // bc->token_last_available_idx[i] + 1; + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k, ldb = k, ldc = m_; + int strideA = qkv_block_size; + int strideB = 0; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); + // To get A, skip over Q entries from previous requests (same head) + void const *A = static_cast<DT *>
(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize; + // To get B, skip over K entries from previous requests (all heads + + // padding) + void const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; + // To get C, skip over QK^T products from previous requests + void *C = (void *)(m->qk_prods); + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save_tensor
<DT>( + // (DT *)A, 64 * 7 * 2, "/home/ubuntu/FlexFlow/inference/query.txt"); + // save_tensor<DT>
((DT *)B, 64 * 7, + // "/home/ubuntu/FlexFlow/inference/key.txt"); print_tensor<DT>
((DT + // *)m->qk_prods, 32, "output qkprod"); + + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_heads * entries_above_diagonal; + fill_entries_above_diagonal_multi_query<<>>( + static_cast
<DT *>(C), + num_new_tokens, + total_tokens, + m->num_heads, + entries_above_diagonal, + static_cast<DT>
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + cudnnTensorDescriptor_t qk_tensor; + checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + void *C_softmax = (void *)(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + qk_tensor, + C, + &softmax_beta, + qk_tensor, + C_softmax)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = num_new_tokens; + n = m->vProjSize; + k = total_tokens; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = 0; + strideC = num_new_tokens * m->vProjSize; + // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + A = static_cast
<DT *>(C_softmax); + // To get B, skip over V^T entries from previous requests (all heads + + // padding) + B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + C = static_cast<DT *>
(m->attn_heads) + + tokens_previous_requests * m->num_heads * m->vProjSize; + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Project to output, save result directly on output tensor + alpha = 1.0f, beta = 0.0f; + m_ = m->oProjSize; + k = m->vProjSize * m->num_heads; + n = num_new_tokens; + lda = k, ldb = n, ldc = m_; + A = weight_ptr + + m->embed_dim * (m->embed_dim + m->kProjSize + m->vProjSize); + B = C; + C = (output_ptr + tokens_previous_requests * m->oProjSize); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + tokens_previous_requests += num_new_tokens; + } + + // print_tensor
<DT>(output_ptr, 32, "output 3"); + // save_tensor<DT>
( + // output_ptr, 7 * 4544, "/home/ubuntu/FlexFlow/inference/op.txt"); + // assert(false); + + assert(tokens_previous_requests == num_tokens); +} + +/*static*/ +void IncMultiQuerySelfAttention::inference_kernel_wrapper( + IncMultiQuerySelfAttentionMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + // bool use_bias = *m->bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (input.data_type == DT_HALF) { + Kernels::IncMultiHeadAttention::inference_kernel(m, + bc, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (input.data_type == DT_FLOAT) { + Kernels::IncMultiHeadAttention::inference_kernel(m, + bc, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiQuerySelfAttention forward time = %.2fms\n", elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( + FFHandler handler, + IncMultiQuerySelfAttention const *attn, + GenericTensorAccessorR const &weight, + Memory gpu_mem, + int num_samples) + : IncMultiQuerySelfAttentionMeta(handler, + INC_DECODING_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->embed_dim, + attn->bias, + attn->add_bias_kv, + weight, + gpu_mem, + num_samples) {} + +IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( + FFHandler handler, + InferenceMode infer_mode, + Op const *attn, + int _qSize, + int _kSize, + int _vSize, + int _qProjSize, + int _kProjSize, + int _vProjSize, + int _oProjSize, + int _embed_dim, + bool _bias, + bool _add_bias_kv, + GenericTensorAccessorR const &weight, + Memory gpu_mem, + int num_samples) + : OpMeta(handler, attn) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(cudnnSetStream(handler.dnn, stream)); + qSize = _qSize; + kSize = _kSize; + vSize = _vSize; + embed_dim = _embed_dim; + // assume dimensions match for now + assert(qSize == kSize); + assert(kSize == vSize); + qProjSize = _qProjSize; + kProjSize = _kProjSize; + assert(qProjSize == kProjSize); // required for attention QK^T matmul + vProjSize = _vProjSize; + oProjSize = _oProjSize; + size_t size_of_dt = data_type_size(attn->data_type); + + num_heads = _embed_dim / qProjSize; + weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)); + weightSize = (_embed_dim + _embed_dim + kProjSize + vProjSize) * _embed_dim * + size_of_dt; + has_load_weights = (bool *)calloc(1, sizeof(bool)); + *has_load_weights = false; + bias = (bool *)calloc(1, sizeof(bool)); + *bias = _bias; + assert(!_add_bias_kv); + +#ifdef INFERENCE_TESTS + kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * + BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * + BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); +#endif + + // allocate memory for the seqArray and reserve space + { + // size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; + + size_t qkv_max_proj_size = BatchConfig::MAX_NUM_TOKENS * + (qProjSize * num_heads + kProjSize + vProjSize); + size_t key_cache_size = 0, value_cache_size = 0; + switch (infer_mode) { + case INC_DECODING_MODE: + case TREE_VERIFY_MODE: { + key_cache_size = kProjSize * BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH; + value_cache_size = vProjSize * BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH; + break; + } + case BEAM_SEARCH_MODE: { + key_cache_size = kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + value_cache_size = vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + break; + } + default: + assert(false && "Unkown inference mode"); + } + size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; + size_t qk_prod_size = + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_heads; + size_t attn_heads_size = + BatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; + size_t complex_size = + (BatchConfig::MAX_NUM_TOKENS * qProjSize * num_heads) / 2; + size_t totalSize = + (qkv_max_proj_size + key_cache_size + value_cache_size + + 2 * qk_prod_size + attn_heads_size) * + size_of_dt + + tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + + complex_size * sizeof(cuFloatComplex); // more components will + // be added here later + + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(totalSize - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(reserveInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + off_t offset = 0; + devQKVProjArray = reserveInst.pointer_untyped(offset, 0); + offset += qkv_max_proj_size * size_of_dt; + keyCache = reserveInst.pointer_untyped(offset, 0); + offset += key_cache_size * size_of_dt; + valueCache = reserveInst.pointer_untyped(offset, 0); + offset += value_cache_size * size_of_dt; + token_infos = reserveInst.pointer(offset); + offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; + qk_prods = reserveInst.pointer_untyped(offset, 0); + offset += qk_prod_size * size_of_dt; + qk_prods_softmax = reserveInst.pointer_untyped(offset, 0); + offset += qk_prod_size * size_of_dt; + attn_heads = reserveInst.pointer_untyped(offset, 0); + offset += attn_heads_size * size_of_dt; + complex_input = reserveInst.pointer(offset); + offset += complex_size * sizeof(cuFloatComplex); + assert(offset == totalSize); + } + cudaStreamSynchronize(stream); +} + +IncMultiQuerySelfAttentionMeta::~IncMultiQuerySelfAttentionMeta(void) { + reserveInst.destroy(); +#ifdef INFERENCE_TESTS + free(kcache); + free(vcache); +#endif +} + +}; // namespace FlexFlow diff --git 
a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 7f50e4b69f..63b625edac 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -154,6 +154,8 @@ std::string get_operator_type_name(OperatorType type) { return "SpecIncMultiHeadSelfAttention"; case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: return "TreeIncMultiHeadSelfAttention"; + case OP_INC_MULTIQUERY_SELF_ATTENTION: + return "IncMultiQuerySelfAttention"; case OP_INPUT: return "Input"; case OP_WEIGHT: @@ -172,6 +174,8 @@ std::string get_operator_type_name(OperatorType type) { return "LayerNorm"; case OP_RMS_NORM: return "RMSNorm"; + case OP_GELU: + return "GELU"; case OP_IDENTITY: return "Identity"; // Parallel Ops diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index bcead0bfaf..c43fe140b9 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -32,6 +32,7 @@ #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/inc_multiquery_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -2316,6 +2317,19 @@ GraphOptimalViewSerialized sez.serialize(attn->qk_prod_scaling); break; } + case OP_INC_MULTIQUERY_SELF_ATTENTION: { + IncMultiQuerySelfAttention *attn = (IncMultiQuerySelfAttention *)op; + sez.serialize(attn->layer_guid.id); + sez.serialize(attn->oProjSize); + sez.serialize(attn->num_heads); + sez.serialize(attn->qProjSize); + sez.serialize(attn->vProjSize); + sez.serialize(attn->dropout); + sez.serialize(attn->bias); + sez.serialize(attn->add_bias_kv); + sez.serialize(attn->add_zero_attn); + break; + } case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); @@ -2794,6 +2808,38 @@ void FFModel::deserialize_graph_optimal_view( params); break; } + case OP_INC_MULTIQUERY_SELF_ATTENTION: { + assert(num_inputs == 1); + int embed_dim, num_heads, k_dim, v_dim; + float dropout, scaling_factor; + bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling; + size_t id; + dez.deserialize(id); + LayerID layer_guid(id); + dez.deserialize(embed_dim); + dez.deserialize(num_heads); + dez.deserialize(k_dim); + dez.deserialize(v_dim); + dez.deserialize(dropout); + dez.deserialize(bias); + dez.deserialize(add_bias_kv); + dez.deserialize(add_zero_attn); + + IncMultiQuerySelfAttentionParams params; + params.embed_dim = embed_dim; + params.num_heads = num_heads; + params.kdim = k_dim; + params.vdim = v_dim; + params.dropout = dropout; + params.bias = bias; + params.add_bias_kv = add_bias_kv; + params.add_zero_attn = add_zero_attn; + params.layer_guid = layer_guid; + node = + get_or_create_node(inputs[0], params); + break; + } case OP_TOPK: { node = TopK::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index b27d2f3421..8d1133f7c9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -43,6 +43,7 @@ #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/inc_multiquery_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -2783,6 +2784,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_INC_MULTIQUERY_SELF_ATTENTION: { + Op *op = IncMultiQuerySelfAttention::create_operator_from_layer( + *this, layer, inputs); + 
operators.push_back(op); + return op; + } case OP_BATCHMATMUL: { Op *op = BatchMatmul::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -4656,6 +4663,25 @@ void register_flexflow_internal_tasks() { IncMultiHeadSelfAttention::inference_task>( registrar, "IncMultiHeadSelfAttention Inference Task"); } + // MultiQueryAttention task + { + TaskVariantRegistrar registrar(INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID, + "IncMultiQuerySelfAttention Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "IncMultiQuerySelfAttention Init Task"); + } + { + TaskVariantRegistrar registrar(INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID, + "IncMultiQuerySelfAttention Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant< + IncMultiQuerySelfAttention::inference_task>( + registrar, "IncMultiQuerySelfAttention Inference Task"); + } // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index cd5e68d750..8fdeacc623 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -18,6 +18,7 @@ #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/inc_multiquery_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/mean.h" diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bceb6e5953..6cbd92dee6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -15,26 +15,71 @@ #include "flexflow/inference.h" #include "flexflow/parallel_ops/parallel_op.h" -#include "flexflow/tokenizers.h" +// #include "flexflow/tokenizers.h" +#include #include #include +#include #include namespace FlexFlow { using namespace Legion; +using tokenizers::Tokenizer; LegionRuntime::Logger::Category log_req_mgr("RequestManager"); +std::string LoadBytesFromFile(std::string const &path) { + std::ifstream fs(path, std::ios::in | std::ios::binary); + assert(!fs.fail() && "no such file"); + std::string data; + fs.seekg(0, std::ios::end); + size_t size = static_cast(fs.tellg()); + fs.seekg(0, std::ios::beg); + data.resize(size); + fs.read(data.data(), size); + return data; +} + RequestManager::RequestManager() - : tokenizer(nullptr), verbose(false), next_available_guid(1000000), - num_processed_requests(0) {} + : verbose(false), next_available_guid(1000000), num_processed_requests(0) {} -RequestManager::RequestManager(Tokenizer *_tokenizer, +RequestManager::RequestManager(ModelType model_type, + std::string const &path, bool _verbose, std::string _output_filepath) - : tokenizer(_tokenizer), verbose(_verbose), next_available_guid(1000000), - num_processed_requests(0), output_filepath(_output_filepath) {} + : verbose(_verbose), next_available_guid(1000000), + num_processed_requests(0), output_filepath(_output_filepath) { + + // bos id + this->model_type = model_type; + if (model_type == ModelType::LLAMA) { + this->tokenizer_ = + Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(path)); + } else if (model_type == ModelType::OPT) { + std::string tokenizer_folder = + (!path.empty() && path.back() != '/') ? 
path + '/' : path; + std::string vocab_file = tokenizer_folder + "gpt2-vocab.json"; + std::string merges_file = tokenizer_folder + "gpt2-merges.txt"; + std::string added_tokens_file = tokenizer_folder + "added_tokens.json"; + std::filesystem::path path1(vocab_file); + std::filesystem::path path2(merges_file); + std::filesystem::path path3(added_tokens_file); + assert(std::filesystem::exists(path1) && + "Vocab file gpt2-vocab.json does not exist at the specified path"); + assert(std::filesystem::exists(path2) && + "Merge file gpt2-merges.txt does not exist at the specified path"); + // opt_tokenizer = new OptTokenizer(vocab_file, merges_file); + std::string vocab = LoadBytesFromFile(path1.string()); + std::string merges = LoadBytesFromFile(path2.string()); + std::string added_tokens = LoadBytesFromFile(path3.string()); + + this->tokenizer_ = + Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); + } else if (model_type == ModelType::FALCON) { + this->tokenizer_ = Tokenizer::FromBlobJSON(LoadBytesFromFile(path)); + } +} int RequestManager::register_new_model(FFModel *model) { int model_id = models.size(); @@ -94,8 +139,14 @@ RequestManager::RequestGuid Request request; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - request.tokens.push_back(tokenizer->bos_token_id); - std::vector tokens = tokenizer->Encode(prompt); + request.tokens.push_back(this->model_bos_map.at(this->model_type)); + std::vector tokens = this->tokenizer_->Encode(prompt); + + for (int i = 0; i < tokens.size(); i++) { + std::cout << tokens.at(i) << "\n"; + } + + // assert(false); request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); request.initial_len = request.tokens.size(); @@ -143,7 +194,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // This is a decoding token log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); - std::string output = tokenizer->Decode(request.tokens); + std::string output = this->tokenizer_->Decode(request.tokens); log_req_mgr.print("Output: %s", output.c_str()); } } @@ -165,7 +216,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", old_bc.requestsInfo[i].request_guid, request.tokens.size()); - std::string output = tokenizer->Decode(request.tokens); + std::string output = this->tokenizer_->Decode(request.tokens); + + for (int i = 0; i < request.tokens.size(); i++) { + std::cout << request.tokens.at(i) << "\n"; + } log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; @@ -481,7 +536,7 @@ BeamSearchBatchConfig log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", request.guid, request.tokens.size()); - std::string output = tokenizer->Decode(request.tokens); + std::string output = this->tokenizer_->Decode(request.tokens); log_req_mgr.print("Final output: %s", output.c_str()); new_bc.request_completed[i] = true; num_processed_requests++; @@ -574,7 +629,7 @@ BeamSearchBatchConfig break; } } - std::string output = tokenizer->Decode(request.tokens); + std::string output = this->tokenizer_->Decode(request.tokens); log_req_mgr.print("Output: %s", output.c_str()); } diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 0c2a2e3f84..58623258f1 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -29,6 +29,7 @@ #include 
"flexflow/ops/experts.h" #include "flexflow/ops/flat.h" #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/inc_multiquery_self_attention.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" @@ -3715,6 +3716,13 @@ bool FFModel::convert_graph_to_operators( new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0], true); break; } + case OP_INC_MULTIQUERY_SELF_ATTENTION: { + assert(inList.size() == 1); + IncMultiQuerySelfAttention *attn = + (IncMultiQuerySelfAttention *)node.ptr; + new_op = new IncMultiQuerySelfAttention(*this, *attn, inputs[0], true); + break; + } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(inList.size() == 1); TreeIncMultiHeadSelfAttention *attn = diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 3442fe6120..788d001dd8 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -3,7 +3,6 @@ import os from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer - def main(): # Change working dir to folder storing this script abspath = os.path.abspath(__file__) diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index c04000c72b..8971fc206e 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -13,7 +13,8 @@ cd "${BASH_SOURCE[0]%/*}" cleanup # Update the transformers library to support the LLAMA model -pip3 install --upgrade transformers sentencepiece + +pip3 install --upgrade transformers sentencepiece # Download the weights in both half and full precision python3 ../inference/utils/download_llama_weights.py @@ -126,10 +127,10 @@ python3 ./inference/huggingface_inference.py --model-name "decapoda-research/lla python3 ./inference/huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu # OPT (small model, full precision) -python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 127 +python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 # OPT (small model, half precision) -python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 127 +python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 # OPT (big model, full precision) #python3 ./inference/huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 127 From 2fd3d69361726156b7995cd66a023faa937fc62c Mon 
Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Jun 2023 17:41:45 +0800 Subject: [PATCH 152/344] [Inference] - Fix build issues (#779) * fix gpu-ci * add check for rust in cmake --- .github/workflows/gpu-ci.yml | 9 +++++++++ CMakeLists.txt | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 95983f889b..9797670c77 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -194,10 +194,19 @@ jobs: steps: - name: Install updated git version run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git + - name: Checkout Git Repository uses: actions/checkout@v3 with: submodules: recursive + + - name: Install conda and FlexFlow dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + miniconda-version: "latest" + activate-environment: flexflow + environment-file: conda/flexflow-cpu.yml + auto-activate-base: false - name: Build and Install FlexFlow run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index ae25c851a9..aeec820452 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -482,6 +482,14 @@ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) if (FF_GPU_BACKEND STREQUAL "hip_rocm") SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") endif() + # Ensure Rust is installed + execute_process(COMMAND rustc --version + RESULT_VARIABLE RUST_COMMAND_RESULT + OUTPUT_VARIABLE RUSTC_OUTPUT + ERROR_QUIET) + if(NOT RUST_COMMAND_RESULT EQUAL 0) + message(FATAL_ERROR "Rust is not installed on the system. Cannot build the tokenizers.") + endif() add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) target_link_libraries(flexflow tokenizers_cpp) From c44a64b00b17a84094d729f7f97798ae18506b31 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 22 Jun 2023 16:16:51 -0500 Subject: [PATCH 153/344] Support CPU Offload in SpecInfer (#765) * decomp * initial implementation * add missing file * checkpoint * more bug fixes * update default offload size * fix non-offload * undo changes to spec_inc_mha * fix a parallel tensor reuse bug * prepare_next_batch for offload(inc_decode) * format * int4&int8 offload * fix merge issue * fix build * spec_infer offload&quantize * fix, update readme. 
* remove redundant * hip build * hip * model param --------- Co-authored-by: xinhaoc --- .github/README.md | 15 +- include/flexflow/accessor.h | 2 + include/flexflow/batch_config.h | 4 +- include/flexflow/config.h | 8 + include/flexflow/ffconst.h | 2 + include/flexflow/ffconst_utils.h | 6 + include/flexflow/model.h | 3 +- .../ops/inc_multihead_self_attention.h | 22 +- .../ops/inc_multihead_self_attention_params.h | 3 +- .../flexflow/ops/kernels/decompress_kernels.h | 43 +++ .../inc_multihead_self_attention_kernels.h | 6 + include/flexflow/ops/kernels/linear_kernels.h | 12 +- include/flexflow/ops/linear.h | 9 +- include/flexflow/ops/linear_params.h | 2 + .../ops/spec_inc_multihead_self_attention.h | 2 +- .../ops/tree_inc_multihead_self_attention.h | 8 +- ...tree_inc_multihead_self_attention_params.h | 3 +- include/flexflow/utils/memory_allocator.h | 67 ++++ inference/file_loader.cc | 327 +++++++++++++++++- inference/file_loader.h | 9 +- inference/incr_decoding/incr_decoding.cc | 5 +- inference/models/falcon.cc | 2 +- inference/models/llama.cc | 3 +- inference/models/opt.cc | 2 +- inference/spec_infer/spec_infer.cc | 2 +- inference/utils/compress_llama_weights.py | 117 +++++++ src/ops/beam_topk.cu | 16 +- src/ops/inc_multihead_self_attention.cc | 146 +++++--- src/ops/inc_multihead_self_attention.cpp | 8 +- src/ops/inc_multihead_self_attention.cu | 290 +++++++++++++--- src/ops/kernels/decompress_kernels.cu | 261 ++++++++++++++ src/ops/kernels/linear_kernels.cpp | 6 +- src/ops/kernels/linear_kernels.cu | 69 +++- src/ops/linear.cc | 198 ++++++++--- src/ops/spec_inc_multihead_self_attention.cc | 7 +- src/ops/spec_inc_multihead_self_attention.cpp | 8 +- src/ops/spec_inc_multihead_self_attention.cu | 54 +-- src/ops/tree_inc_multihead_self_attention.cc | 141 +++++--- src/ops/tree_inc_multihead_self_attention.cpp | 8 +- src/ops/tree_inc_multihead_self_attention.cu | 125 +++++-- src/runtime/accessor.cc | 60 ++++ src/runtime/ffconst_utils.cc | 9 + src/runtime/graph.cc | 18 +- src/runtime/inference_manager.cc | 94 ++++- src/runtime/memory_allocator.cc | 54 +++ src/runtime/model.cc | 36 +- src/runtime/model.cu | 27 ++ src/runtime/parallel_tensor.cc | 8 + 48 files changed, 2008 insertions(+), 319 deletions(-) create mode 100644 include/flexflow/ops/kernels/decompress_kernels.h create mode 100644 include/flexflow/utils/memory_allocator.h create mode 100644 inference/utils/compress_llama_weights.py create mode 100644 src/ops/kernels/decompress_kernels.cu create mode 100644 src/runtime/memory_allocator.cc diff --git a/.github/README.md b/.github/README.md index 59377e308e..010d7c07bb 100644 --- a/.github/README.md +++ b/.github/README.md @@ -63,9 +63,22 @@ SpecInfer supports two tokenizers: * The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentencepiece tokenizer from LLAMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). If you are using our LLAMA-160M weights for the demo, however, you should use the tokenizer from the [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model) HuggingFace repo. * The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). 
To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. -### Mixed-precision support +### Mixed-precision Support SpecInfer now supports single-precision floating points and half-precision floating points. By default we use half-precision. Add `--use-full-precision` to the command line to run the demo with single-precision, please make sure to use the correct weight files in the form below. +### CPU Offloading +SpecInfer offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading keeps selected tensors in CPU memory and copies them to the GPU only when they are needed for computation. Note that we currently offload only the largest weight tensors (the weight tensors of the Linear and Attention operators). Because the small speculative model occupies considerably less space and does not pose a bottleneck for GPU memory, while offloading adds extra runtime data movement and computational cost, we only offload the large model. You can enable offloading by adding the `-offload` and `-offload-reserve-space-size` flags. +#### Quantization +To reduce the data transferred between the CPU and GPU, SpecInfer provides int4 and int8 quantization. The compressed tensors are stored on the CPU side; once copied to the GPU, they are decompressed and converted back to their original precision. Please find the compressed weight files in our S3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from the [FlexGen](https://github.com/FMInference/FlexGen) project to compress the weights manually. The quantization method can be selected using the `--4bit-quantization` and `--8bit-quantization` flags. + +Below is an example command line that uses offloading and quantization in SpecInfer. + +```bash +./inference/spec_infer/spec_infer -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -ssm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -offload -offload-reserve-space-size 6000 --8bit-quantization +``` + + + ### LLM Weights The weight files used in our demo are extracted from HuggingFace, and stored in our AWS S3 bucket. 
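The quantization scheme described above stores each group of quantized values together with one per-group scaling factor and one offset, and reconstructs the original value as quantized value * scale + offset. Below is a minimal host-side sketch of int4 dequantization under that scheme, assuming a group size of 32 (following INT4_NUM_OF_ELEMENTS_PER_GROUP in this patch) and two values packed per byte with the first value in the high nibble (mirroring load_attention_weights_quantized later in the patch). dequantize_int4_group and kGroupSize are illustrative names, not part of the SpecInfer API, and treating each nibble as an unsigned 4-bit value is an assumption of this sketch.

```cpp
// Illustrative "quantized value * scale + offset" decompression for one int4 group.
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int kGroupSize = 32; // 32 quantized values share one scale and one offset

std::vector<float> dequantize_int4_group(const uint8_t *packed, // kGroupSize / 2 bytes
                                         float scale,
                                         float offset) {
  std::vector<float> out(kGroupSize);
  for (int i = 0; i < kGroupSize; i += 2) {
    uint8_t byte = packed[i / 2];
    int first = (byte >> 4) & 0xF; // first value lives in the high nibble
    int second = byte & 0xF;       // second value lives in the low nibble
    out[i] = first * scale + offset;
    out[i + 1] = second * scale + offset;
  }
  return out;
}

int main() {
  uint8_t packed[kGroupSize / 2] = {0};
  packed[0] = (0x3 << 4) | 0x7; // quantized values 3 and 7
  std::vector<float> vals = dequantize_int4_group(packed, /*scale=*/0.5f, /*offset=*/-1.0f);
  std::printf("%.2f %.2f\n", vals[0], vals[1]); // 0.50 and 2.50
  return 0;
}
```

The int8 path is analogous but stores one value per byte, which is why the loader later computes one_head_size without the divide-by-two used for int4.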
diff --git a/include/flexflow/accessor.h b/include/flexflow/accessor.h index 6f95354823..65ab33b513 100644 --- a/include/flexflow/accessor.h +++ b/include/flexflow/accessor.h @@ -61,6 +61,7 @@ class GenericTensorAccessorW { float *get_float_ptr() const; double *get_double_ptr() const; half *get_half_ptr() const; + char *get_byte_ptr() const; DataType data_type; Legion::Domain domain; void *ptr; @@ -79,6 +80,7 @@ class GenericTensorAccessorR { float const *get_float_ptr() const; double const *get_double_ptr() const; half const *get_half_ptr() const; + char const *get_byte_ptr() const; DataType data_type; Legion::Domain domain; void const *ptr; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 39fcc49c68..b56466bfe5 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -112,8 +112,8 @@ class BeamSearchBatchConfig : public BatchConfig { size_t beam_width; size_t target_iterations; - static int const MAX_BEAM_WIDTH = 1; - static int const MAX_BEAM_DEPTH = 8; + inline static int const MAX_BEAM_WIDTH = 1; + inline static int const MAX_BEAM_DEPTH = 8; int model_id; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index c8a9f50aa2..f5eb2e069a 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -70,6 +70,9 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; + void *offload_reserve_space; + size_t offload_reserve_space_size; + DataType quantization_type; bool allowTensorOpMathConversion; #ifdef FF_USE_NCCL ncclComm_t ncclComm; @@ -78,6 +81,8 @@ struct FFHandler { struct FFInitInfo { size_t workSpaceSize; + size_t offload_reserve_space_size; + DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; }; @@ -129,6 +134,9 @@ class FFConfig { float search_alpha; bool search_overlap_backward_update; CompMode computationMode; + bool cpu_offload; + size_t offload_reserve_space_size; + DataType quantization_type; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 3cd42ccffe..0b572a9674 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -33,6 +33,8 @@ enum DataType { DT_HALF = 43, DT_FLOAT = 44, DT_DOUBLE = 45, + DT_INT4 = 46, + DT_INT8 = 47, DT_NONE = 49, }; diff --git a/include/flexflow/ffconst_utils.h b/include/flexflow/ffconst_utils.h index e2d04ad5a5..421a139d57 100644 --- a/include/flexflow/ffconst_utils.h +++ b/include/flexflow/ffconst_utils.h @@ -10,6 +10,12 @@ std::string get_operator_type_name(OperatorType type); size_t data_type_size(DataType type); +#define INT4_NUM_OF_ELEMENTS_PER_GROUP 32 + +size_t get_quantization_to_byte_size(DataType type, + DataType quantization_type, + size_t num_elements); + std::ostream &operator<<(std::ostream &, OperatorType); }; // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 1f06fb7994..1277b29b3d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -360,12 +360,13 @@ std::vector class FFModel { public: - FFModel(FFConfig &config); + FFModel(FFConfig &config, bool cpu_offload = false); static constexpr float PROPAGATION_CHANCE = 0.25; static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75; static constexpr float PROPAGATION_SIZE_WEIGHT = 1.0; + bool cpu_offload; // C++ APIs for constructing models // Add an exp layer Tensor exp(const Tensor x, char const *name = NULL); diff --git 
a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index baf126f41e..a178dad577 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -9,6 +9,7 @@ #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/ops/inc_multihead_self_attention_params.h" +#include "flexflow/utils/memory_allocator.h" #include "math.h" #include #include @@ -38,6 +39,8 @@ class IncMultiHeadSelfAttention : public Op { float _scaling_factor, bool _qk_prod_scaling, bool allocate_weights, + DataType _quantization_type, + bool _offload, char const *name); IncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, @@ -55,6 +58,8 @@ class IncMultiHeadSelfAttention : public Op { float _scaling_factor, bool _qk_prod_scaling, bool allocate_weights, + DataType _quantization_type, + bool _offload, char const *name); IncMultiHeadSelfAttention(FFModel &model, IncMultiHeadSelfAttention const &other, @@ -114,6 +119,8 @@ class IncMultiHeadSelfAttention : public Op { qk_prod_scaling; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; + DataType quantization_type; + bool offload; }; class IncMultiHeadSelfAttentionMeta : public OpMeta { @@ -121,7 +128,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { IncMultiHeadSelfAttentionMeta(FFHandler handler, IncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Legion::Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads); IncMultiHeadSelfAttentionMeta(FFHandler handler, @@ -141,14 +148,17 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { bool _add_bias_kv, float _scaling_factor, GenericTensorAccessorR const &weight, - Legion::Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads); + int _num_heads, + DataType _quantization_type, + bool _offload); ~IncMultiHeadSelfAttentionMeta(void); public: Realm::RegionInstance reserveInst; - size_t weights_params, weightSize, reserveSpaceSize; + size_t weights_params, weightSize, biasSize, reserveSpaceSize, + quantized_weightSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int num_heads; bool *has_load_weights; @@ -160,10 +170,14 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { #ifdef INFERENCE_TESTS float *kcache, *vcache; #endif + void *weight_ptr, *bias_ptr; // for weight offload void *devQKVProjArray, *keyCache, *valueCache; void *qk_prods, *qk_prods_softmax; void *attn_heads, *W_out_contiguous; + char *quantized_weight_ptr; BatchConfig::PerTokenInfo *token_infos; + DataType quantization_type; + bool offload; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cuFloatComplex *complex_input; #endif diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 66aed3bf3b..d95aaf2e05 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -12,7 +12,8 @@ struct IncMultiHeadSelfAttentionParams { float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; - + DataType quantization_type; + bool offload; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/kernels/decompress_kernels.h b/include/flexflow/ops/kernels/decompress_kernels.h new file mode 100644 
index 0000000000..7cfedd6265 --- /dev/null +++ b/include/flexflow/ops/kernels/decompress_kernels.h @@ -0,0 +1,43 @@ +#ifndef _FLEXFLOW_DECOMPRESS_KERNELS_H +#define _FLEXFLOW_DECOMPRESS_KERNELS_H + +#include "flexflow/device.h" + +namespace FlexFlow { +namespace Kernels { + +template +__global__ void decompress_int4_general_weights(char const *input_weight_ptr, + DT *weight_ptr, + int in_dim, + int valueSize); +template +__global__ void decompress_int8_general_weights(char const *input_weight_ptr, + DT *weight_ptr, + int in_dim, + int valueSize); + +template +__global__ void decompress_int4_attention_weights(char *input_weight_ptr, + DT *weight_ptr, + int qProjSize, + int qSize, + int num_heads); + +template +__global__ void decompress_int8_attention_weights(char *input_weight_ptr, + DT *weight_ptr, + int qProjSize, + int qSize, + int num_heads); +// template +// void decompress_weight_bias(T1 *input_weight_ptr, +// T2 *weight_ptr, +// T2 *params, +// int group_size, +// int tensor_size); + +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_DECOMPRESS_KERNELS_H diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 0e0b1b4da9..a35cf9d7f2 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -58,6 +58,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream); + +template +void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); } // namespace IncMultiHeadAttention } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 9ed99c93a1..ed4864b1ab 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -10,7 +10,11 @@ namespace FlexFlow { class LinearMeta : public OpMeta { public: - LinearMeta(FFHandler handle, int batch_size, Linear const *li); + LinearMeta(FFHandler handle, + int batch_size, + Linear const *li, + MemoryAllocator gpu_mem_allocator, + int weightSize); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t outputTensor; cudnnActivationDescriptor_t actiDesc; @@ -19,6 +23,12 @@ class LinearMeta : public OpMeta { miopenActivationDescriptor_t actiDesc; #endif void *one_ptr; + void *weight_ptr; + DataType weight_ptr_type; + DataType quantization_type; + bool offload; + char *quantized_weight_ptr; + size_t quantized_weightSize; ActiMode activation; RegularizerMode kernel_reg_type; float kernel_reg_lambda; diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index bbc40ef320..7b134502b7 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -5,6 +5,7 @@ #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/linear_params.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { @@ -25,6 +26,8 @@ class Linear : public Op { float kernel_reg_lambda, bool _use_bias, DataType _data_type, + DataType _quantization_type, + bool offload, bool allocate_weights, char const *name); Linear(FFModel &model, @@ -96,13 +99,13 @@ class Linear : public Op { bool allocate_weights, char const *name); - template + template static OpMeta * 
init_task_with_dim(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - template + template static void forward_task_with_dim(Legion::Task const *task, std::vector const ®ions, @@ -126,6 +129,8 @@ class Linear : public Op { float kernel_reg_lambda; bool use_bias; ParallelTensor replica; + DataType quantization_type; + bool offload; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/linear_params.h b/include/flexflow/ops/linear_params.h index 2c41694960..563304e89f 100644 --- a/include/flexflow/ops/linear_params.h +++ b/include/flexflow/ops/linear_params.h @@ -18,6 +18,8 @@ class LinearParams { ActiMode activation; RegularizerMode kernel_reg_type; float kernel_reg_lambda; + DataType quantization_type; + bool offload; bool is_valid(ParallelTensorShape const &input_shape) const; void solve_dims(const ParallelTensor input, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 92d52b43b1..db12be5792 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -126,7 +126,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { SpecIncMultiHeadSelfAttentionMeta(FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Legion::Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads); ~SpecIncMultiHeadSelfAttentionMeta(void); diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index eef51bc21f..328ab128b2 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -38,6 +38,8 @@ class TreeIncMultiHeadSelfAttention : public Op { float _scaling_factor, bool _qk_prod_scaling, bool allocate_weights, + DataType _quantization_type, + bool _offload, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, @@ -55,6 +57,8 @@ class TreeIncMultiHeadSelfAttention : public Op { float _scaling_factor, bool _qk_prod_scaling, bool allocate_weights, + DataType _quantization_type, + bool _offload, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, TreeIncMultiHeadSelfAttention const &other, @@ -115,6 +119,8 @@ class TreeIncMultiHeadSelfAttention : public Op { qk_prod_scaling; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; + DataType quantization_type; + bool offload; }; class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { @@ -122,7 +128,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { TreeIncMultiHeadSelfAttentionMeta(FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Legion::Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads); ~TreeIncMultiHeadSelfAttentionMeta(void); diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index f8fbac7e8e..0eede3bd2f 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -12,7 +12,8 @@ struct TreeIncMultiHeadSelfAttentionParams { float dropout, scaling_factor; bool bias, add_bias_kv, 
add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; - + DataType quantization_type; + bool offload; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h new file mode 100644 index 0000000000..8e50a4c3b3 --- /dev/null +++ b/include/flexflow/utils/memory_allocator.h @@ -0,0 +1,67 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_UTILS_MEMORY_ALLOCATOR_H_ +#define _FLEXFLOW_UTILS_MEMORY_ALLOCATOR_H_ + +#include "flexflow/config.h" + +namespace FlexFlow { + +class MemoryAllocator { +public: + MemoryAllocator(Legion::Memory memory); + void create_legion_instance(Realm::RegionInstance &inst, size_t size); + void register_reserved_work_space(void *base, size_t size); + inline void *allocate_reserved_untyped(size_t datalen) { + void *ptr = static_cast(reserved_ptr) + reserved_allocated_size; + reserved_allocated_size += datalen; + assert(reserved_allocated_size <= reserved_total_size); + return ptr; + } + template + inline DT *allocate_reserved(size_t count) { + void *ptr = static_cast(reserved_ptr) + reserved_allocated_size; + reserved_allocated_size += sizeof(DT) * count; + assert(reserved_allocated_size <= reserved_total_size); + return static_cast
(ptr); + } + + inline void *allocate_instance_untyped(size_t datalen) { + void *ptr = static_cast(instance_ptr) + instance_allocated_size; + instance_allocated_size += datalen; + assert(instance_allocated_size <= instance_total_size); + return ptr; + } + + template + inline DT *allocate_instance(size_t count) { + void *ptr = static_cast(instance_ptr) + instance_allocated_size; + instance_allocated_size += sizeof(DT) * count; + assert(instance_allocated_size <= instance_total_size); + return static_cast
(ptr); + } + +public: + Legion::Memory memory; + void *reserved_ptr; + void *instance_ptr; + size_t reserved_total_size, reserved_allocated_size; + size_t instance_total_size, instance_allocated_size; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_RUNTIME_H_ diff --git a/inference/file_loader.cc b/inference/file_loader.cc index d7bb8a7b4c..a7386d6597 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -85,7 +85,6 @@ void load_attention_bias(DT *ptr, int file_index = 0; for (auto file : bias_files) { size_t partial_size = hidden_dim; - // std::cout << "Loading filename: " << file << std::endl; std::ifstream in(file, std::ios::in | std::ios::binary); assert(in.good() && "incorrect bias file path"); std::vector
host_array(partial_size); @@ -199,7 +198,6 @@ void load_attention_weights(DT *ptr, size_t partial_size = one_weight_file_size; std::ifstream in(file, std::ios::in | std::ios::binary); - // std::cout << "Loading filename: " << file << std::endl; if (!in.good()) { std::cout << "Could not open file: " << file << std::endl; } @@ -235,7 +233,6 @@ void load_attention_weights(DT *ptr, template void load_from_file(DT *ptr, size_t size, std::string filename) { - // std::cout << "Loading filename: " << filename << std::endl; std::ifstream in(filename, std::ios::in | std::ios::binary); if (!in.good()) { std::cout << "Could not open file: " << filename << std::endl; @@ -288,6 +285,320 @@ void FileDataLoader::load_positions(FFModel *ff, position_pt->set_tensor(ff, dims_vec, data); } +//--------------------- quantization functions ---------------------- +// the data layout is 32 * quantized data + 1 scaling factor + 1 offset factor +// in the decompression mode, the real data = quantized data * scaling factor + +// offset + +void load_attention_weights_quantized(char *ptr, + int num_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weight_path, + DataType data_type, + bool use_full_precision) { + // layers_0_attention_wq_weight + // layers_0_self_attn_q_proj_weight + std::string q_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wq_weight"; + std::string k_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wk_weight"; + std::string v_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wv_weight"; + std::string o_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wo_weight"; + std::vector weight_files = {q_file, k_file, v_file, o_file}; + + int file_index = 0; + + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_weight_file_size = + num_heads * single_proj_size; // size of each of Q/K/V/O for all heads + + // q, k, v, o -> 0, 1, 2, 3 + for (auto file : weight_files) { + size_t partial_size = one_weight_file_size; + std::ifstream in(file, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << file << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(char) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error"; + return; + } + assert(partial_size == host_array.size()); + + size_t one_head_size = data_type == DT_INT8 + ? hidden_dim * (hidden_dim / num_heads) + : hidden_dim * (hidden_dim / num_heads) / 2; + + size_t data_index = 0; + for (int i = 0; i < num_heads; i++) { + size_t start_index = i * one_head_size * 4 + file_index * one_head_size; + for (size_t j = start_index; j < start_index + one_head_size; j++) { + if (data_type == DT_INT4) { + char v1 = host_array.at(data_index); + char v2 = host_array.at(data_index + 1); + ptr[j] = (v2 & 0XF) | (v1 << 4); + data_index += 2; + } else { + ptr[j] = host_array.at(data_index); + data_index += 1; + } + } + } + file_index++; + in.close(); + } + + // load scale and offset to the end of weight tensor + // the layout is like |values * 32 heads|offset|scale| + size_t offset = data_type == DT_INT8 ? 
one_weight_file_size * 4 + : (one_weight_file_size * 4) / 2; + for (auto file : weight_files) { + for (int i = 0; i < 2; i++) { + std::string meta_file = i == 0 ? (file + "_offset") : (file + "_scale"); + size_t partial_size = + one_weight_file_size / INT4_NUM_OF_ELEMENTS_PER_GROUP; + std::ifstream in(meta_file, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << meta_file << std::endl; + } + assert(in.good() && "incorrect weight file path"); + + if (use_full_precision) { + // float + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(float) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error"; + return; + } + assert(partial_size == host_array.size()); + + for (auto v : host_array) { + *(float *)(ptr + offset) = v; + offset += sizeof(float); + } + } else { + // half + std::vector host_array(partial_size); + size_t loaded_data_size = sizeof(half) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error"; + return; + } + assert(partial_size == host_array.size()); + for (auto v : host_array) { + *(half *)(ptr + offset) = v; + offset += sizeof(half); + } + } + } + } +} + +void load_from_quantized_file(char *ptr, + size_t size, + std::string filename, + DataType data_type, + bool use_full_precision) { + assert(data_type == DT_INT4 || data_type == DT_INT8); + + std::string value_file = filename; + std::string offset_file = filename + "_offset"; + std::string scaling_file = filename + "_scale"; + size_t value_size = 0, offset_size = 0, scaling_size = 0; + + if (data_type == DT_INT4) { + // float/half + 4bit quantization + // size1 = volume / 2, size2 = volume / 32 * (sizeof(DT)), size3 = size2 + value_size = 2 * (use_full_precision ? (size * 2 / 3) : (size * 4 / 5)); + offset_size = use_full_precision ? (size / 6) : (size / 10); + scaling_size = use_full_precision ? (size / 6) : (size / 10); + } else if (data_type == DT_INT8) { + // float/half + 8bit quantization + // size1 = volume * 1, size2 = volume / 32 * (sizeof(DT)), size3 = size2 + value_size = use_full_precision ? (size * 4 / 5) : (size * 8 / 9); + offset_size = use_full_precision ? (size / 10) : (size / 18); + scaling_size = use_full_precision ? 
(size / 10) : (size / 18); + } + + std::vector quantized_files = { + value_file, offset_file, scaling_file}; + std::vector quantized_sizes = {value_size, offset_size, scaling_size}; + + int file_idx = 0; + long data_index = 0; + for (auto file : quantized_files) { + std::ifstream in(file, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << file << std::endl; + } + assert(in.good() && "incorrect weight file path"); + + // value file, every element is in one byte + if (file_idx == 0) { + size = quantized_sizes.at(file_idx); + std::vector host_array(size); + size_t loaded_data_size = size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error quantized" << in_get_size << ", " + << loaded_data_size << ", " << sizeof(char) << std::endl; + return; + } + assert(size == host_array.size()); + + // normal + size_t idx = 0; + while (idx < host_array.size()) { + if (data_type == DT_INT4) { + // pack 2 elements into one byte + char v1 = host_array.at(idx); + char v2 = host_array.at(idx + 1); + // v1 in first 4 bit and v2 in last 4 bit; + ptr[data_index++] = (v2 & 0XF) | (v1 << 4); + idx += 2; + } else { + ptr[data_index++] = host_array.at(idx++); + } + } + } else if (use_full_precision) { + // load offset/scale in float type; + size = quantized_sizes.at(file_idx); + std::vector host_array(size / sizeof(float)); + size_t loaded_data_size = size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error scale/offset" << in_get_size + << ", " << loaded_data_size << ", " << sizeof(float) << ", " + << file << ", " << size << std::endl; + return; + } + assert(size / sizeof(float) == host_array.size()); + for (auto v : host_array) { + *(float *)(ptr + data_index) = v; + data_index += sizeof(float); + } + + } else { + // load offset/scale in half type; + size = quantized_sizes.at(file_idx); + std::vector host_array(size / sizeof(half)); + size_t loaded_data_size = size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error " << in_get_size << ", " + << loaded_data_size << ", " << sizeof(half) << std::endl; + return; + } + assert(size / sizeof(half) == host_array.size()); + // normal + for (auto v : host_array) { + *(half *)(ptr + data_index) = v; + data_index += sizeof(half); + } + } + in.close(); + file_idx++; + } +} + +void FileDataLoader::load_quantization_weight(FFModel *ff, + Tensor weight, + int weight_idx, + std::string const &layername, + bool use_full_precision) { + size_t volume = 1; + std::vector dims_vec; + for (int i = 0; i < weight->num_dims; i++) { + dims_vec.push_back(weight->dims[i]); + volume *= weight->dims[i]; + } + + char *data = (char *)malloc(sizeof(char) * volume); + + std::string file_path = + (layername.back() == '/') ? 
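load_from_quantized_file above packs two 4-bit values into one byte with (v2 & 0XF) | (v1 << 4), and the decompress kernels later in this patch undo it with (byte >> 4) & 0xF and byte & 0xF. The size fractions also follow from the group layout: for DT_INT4 with float metadata, n elements pack into n/2 value bytes plus two (n/32)*4-byte metadata arrays, i.e. 3n/4 bytes in total, so the on-disk value file is 2/3 of the packed size and each metadata file 1/6 of it. A self-contained round-trip sketch of the nibble packing (hypothetical helper names):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Pack pairs of 4-bit values (0..15) into bytes: v1 goes to the high
// nibble, v2 to the low nibble, matching (v2 & 0XF) | (v1 << 4) above.
std::vector<uint8_t> pack_int4(std::vector<uint8_t> const &vals) {
  std::vector<uint8_t> out(vals.size() / 2);
  for (size_t i = 0; i < out.size(); i++) {
    uint8_t v1 = vals[2 * i];
    uint8_t v2 = vals[2 * i + 1];
    out[i] = (v2 & 0xF) | (v1 << 4);
  }
  return out;
}

// Unpack one byte back into its two 4-bit values, as the
// decompress_int4_* kernels do with >> 4 and & 0xF.
void unpack_int4(uint8_t byte, uint8_t &v1, uint8_t &v2) {
  v1 = (byte >> 4) & 0xF;
  v2 = byte & 0xF;
}

int main() {
  std::vector<uint8_t> vals = {1, 15, 7, 0, 12, 3};
  std::vector<uint8_t> packed = pack_int4(vals);
  for (size_t i = 0; i < packed.size(); i++) {
    uint8_t a, b;
    unpack_int4(packed[i], a, b);
    assert(a == vals[2 * i] && b == vals[2 * i + 1]);
  }
  return 0;
}
```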
layername : "/" + layername; + + if (file_path.find("attention_w") != std::string::npos) { + if (weight_idx == 0) { + load_attention_weights_quantized(data, + num_heads, + hidden_dim, + qkv_inner_dim, + file_path, + weight_file_path, + weight->data_type, + use_full_precision); + } + // else { + // load_attention_bias_quantized(data, + // num_heads, + // hidden_dim, + // qkv_inner_dim, + // file_path, + // weight_file_path); + // } + + } else { + if (weight_idx > 0) { + int index = file_path.find("_weight"); + assert(index != std::string::npos); + file_path = file_path.substr(0, index) + "_bias"; + } + load_from_quantized_file(data, + volume, + weight_file_path + file_path, + weight->data_type, + use_full_precision); + } + + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + weight_pt->set_tensor(ff, dims_vec, data); + + delete data; +} + template void FileDataLoader::load_single_weight_tensor(FFModel *ff, Tensor weight, @@ -344,7 +655,9 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, } void FileDataLoader::load_weights( - FFModel *ff, std::unordered_map weights_layers) { + FFModel *ff, + std::unordered_map weights_layers, + bool use_full_precision) { for (auto &v : weights_layers) { int weights_num = v.second->numWeights; for (int i = 0; i < weights_num; i++) { @@ -352,6 +665,7 @@ void FileDataLoader::load_weights( if (weight == NULL) { continue; } + switch (weight->data_type) { case DT_HALF: load_single_weight_tensor(ff, weight, i, v.first); @@ -359,6 +673,11 @@ void FileDataLoader::load_weights( case DT_FLOAT: load_single_weight_tensor(ff, weight, i, v.first); break; + case DT_INT4: + case DT_INT8: + // load weights in quantization + load_quantization_weight(ff, weight, i, v.first, use_full_precision); + break; default: assert(false && "Unsupported data type"); } diff --git a/inference/file_loader.h b/inference/file_loader.h index 7ca94a8893..8be820b1bd 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -37,8 +37,15 @@ class FileDataLoader { Tensor weight, int weight_idx, std::string const &layername); + + void load_quantization_weight(FFModel *ff, + Tensor weight, + int weight_idx, + std::string const &layername, + bool use_full_precision); void load_weights(FFModel *ff, - std::unordered_map weights_layers); + std::unordered_map weights_layers, + bool use_full_precision); void load_positions(FFModel *ff, Tensor pt, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index a9ec63bc00..b59586de07 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -99,6 +99,9 @@ void FlexFlow::top_level_task(Task const *task, Context ctx, Runtime *runtime) { FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } FilePaths file_paths; ModelType model_type; bool use_full_precision = false; @@ -119,7 +122,7 @@ void FlexFlow::top_level_task(Task const *task, /*verbose*/ verbose, file_paths.output_file_path); - FFModel model(ffconfig); + FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { LLAMA::create_llama_model(model, im, diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index b137bf6944..7fc3124278 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -168,7 +168,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_heads, falcon_config.dim, 
falcon_config.dim / falcon_config.n_heads); - fileloader.load_weights(&ff, weights_layers); + fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 0344f19b8a..f7c1563095 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -26,6 +26,7 @@ void LLAMA::create_llama_model(FFModel &ff, int num_pipeline_stages, InferenceMode mode, bool use_full_precision) { + // do not apply cpu offload in beam search model. Config llama_config(model_config_file_path); llama_config.printConfig(); //------------------------------compute machine views ------------------ @@ -214,7 +215,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.n_heads, llama_config.dim, llama_config.dim / llama_config.n_heads); - fileloader.load_weights(&ff, weights_layers); + fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/opt.cc b/inference/models/opt.cc index dd4bb18e85..1e81e4eba7 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -249,7 +249,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.hidden_size, opt_config.hidden_size / opt_config.num_attention_heads); - fileloader.load_weights(&ff, weights_layers); + fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------finished loading weights----------" << std::endl; im.init_operators_inference(&ff); } diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index ae577dd02e..ec0b222075 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -172,7 +172,7 @@ void FlexFlow::top_level_task(Task const *task, file_paths.output_file_path); // Create LLM model - FFModel tree_model(ffconfig); + FFModel tree_model(ffconfig, ffconfig.cpu_offload); if (model_types.llm_model_type == ModelType::LLAMA) { LLAMA::create_llama_model(tree_model, im, diff --git a/inference/utils/compress_llama_weights.py b/inference/utils/compress_llama_weights.py new file mode 100644 index 0000000000..c92ae6aca9 --- /dev/null +++ b/inference/utils/compress_llama_weights.py @@ -0,0 +1,117 @@ +import torch +import numpy as np +from transformers import AutoModelForCausalLM +import dataclasses + +@dataclasses.dataclass +class CompressionConfig: + """Group-wise quantization.""" + num_bits: int + group_size: int + group_dim: int + symmetric: bool + enabled: bool = True + +def compress(tensor, config): + """Simulate group-wise quantization.""" + if not config.enabled: + return tensor + + group_size, num_bits, group_dim, symmetric = ( + config.group_size, config.num_bits, config.group_dim, config.symmetric) + assert num_bits <= 8 + + original_shape = tensor.shape + num_groups = (original_shape[group_dim] + group_size - 1) // group_size + new_shape = (original_shape[:group_dim] + (num_groups, group_size) + + original_shape[group_dim+1:]) + + # Pad + pad_len = (group_size - original_shape[group_dim] % group_size) % group_size + if pad_len != 0: + pad_shape = original_shape[:group_dim] + (pad_len,) + original_shape[group_dim+1:] + tensor = torch.cat([ + tensor, + torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)], + dim=group_dim) + data = tensor.view(new_shape) + + # Quantize + if symmetric: + B = 2 ** (num_bits - 1) - 1 + scale = B / torch.max(data.abs(), dim=group_dim + 
1, keepdim=True)[0] + data = data * scale + data = data.clamp_(-B, B).round_().to(torch.int8) + return data, scale, original_shape + else: + B = 2 ** num_bits - 1 + # print('max value') + # print(B) + mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0] + mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0] + + scale = B / (mx - mn) + data = data - mn + data.mul_(scale) + + data = data.clamp_(0, B).round_().to(torch.uint8) + return data, mn, scale, original_shape + + +def decompress(packed_data, config): + """Simulate group-wise dequantization.""" + if not config.enabled: + return packed_data + + group_size, num_bits, group_dim, symmetric = ( + config.group_size, config.num_bits, config.group_dim, config.symmetric) + + # Dequantize + if symmetric: + data, scale, original_shape = packed_data + data = data / scale + else: + data, mn, scale, original_shape = packed_data + data = data / scale + data.add_(mn) + + # Unpad + pad_len = (group_size - original_shape[group_dim] % group_size) % group_size + if pad_len: + padded_original_shape = ( + original_shape[:group_dim] + + (original_shape[group_dim] + pad_len,) + + original_shape[group_dim+1:]) + data = data.reshape(padded_original_shape) + indices = [slice(0, x) for x in original_shape] + return data[indices].contiguous() + else: + return data.view(original_shape) + +if __name__ == "__main__": + # torch.set_default_tensor_type(torch.HalfTensor) + # torch.set_default_tensor_type(torch.cuda.HalfTensor) + model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") + config = CompressionConfig( + num_bits=8, group_size=32, group_dim=0, symmetric=False) + for name, params in model.named_parameters(): + name = ( + name.replace(".", "_") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("o_proj", "wo") + .replace("mlp", "feed_forward") + .replace("gate_proj", "w1") + .replace("down_proj", "w2") + .replace("up_proj", "w3") + .replace("input_layernorm", "attention_norm") + .replace("post_attention_layernorm", "ffn_norm") + .replace("embed_tokens", "tok_embeddings") + .replace("lm_head", "output") + .replace("model_", "") + ) + if "feed_forward" in name or "output" in name or "attention_w" in name: + data, mn, scale, original_shape = compress(params, config) + \ No newline at end of file diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 934353d8e8..9a5cd86486 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -295,8 +295,8 @@ __device__ void mergeBeamShards(int num_shards, if (verbose && batch_index == 0) { printf("slot %d, value %.15f, prob %15f\n", slot, - entries[slot].value, - prob); + static_cast(entries[slot].value), + static_cast(prob)); } } min_heap.build(heap_size); @@ -312,8 +312,8 @@ __device__ void mergeBeamShards(int num_shards, printf("shard %d, index %d, value %.15f, prob %.15f\n", shard, entry.index, - entry.value, - prob); + static_cast(entry.value), + static_cast(prob)); } if (entry.value * prob < root.value) { continue; @@ -358,7 +358,6 @@ __device__ void mergeBeamShards(int num_shards, // entries[next_shard_index].value, // prob); // } - max_heap.replace_root( {next_shard_index, entries[next_shard_index].value * prob}, heap_size); @@ -435,8 +434,9 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input, (sub_request_id * token_nums * length), batch_input, request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id, - acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - 
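The compress() function in compress_llama_weights.py above quantizes each group asymmetrically: B = 2**num_bits - 1, scale = B / (max - min), q = round((x - min) * scale) clamped to [0, B]; the CUDA decompress kernels added elsewhere in this patch reverse it as x ≈ q / scale + offset, with the group minimum stored as the offset. A small C++ sketch of that round trip for one group (toy values, illustrative names):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  int const num_bits = 8;
  float const B = static_cast<float>((1 << num_bits) - 1); // 255 for 8-bit
  std::vector<float> group = {-0.8f, 0.1f, 0.25f, 0.9f};   // one (shortened) group

  float mn = *std::min_element(group.begin(), group.end());
  float mx = *std::max_element(group.begin(), group.end());
  float scale = B / (mx - mn); // per-group scaling factor

  // Quantize: shift by the group minimum, scale, clamp, round.
  std::vector<uint8_t> q(group.size());
  for (size_t i = 0; i < group.size(); i++) {
    float v = std::round((group[i] - mn) * scale);
    q[i] = static_cast<uint8_t>(std::min(std::max(v, 0.0f), B));
  }

  // Dequantize the way the decompress kernels do: q / scale + offset.
  for (size_t i = 0; i < group.size(); i++) {
    float rec = static_cast<float>(q[i]) / scale + mn;
    std::printf("%f -> %u -> %f\n", group[i], static_cast<unsigned>(q[i]), rec);
  }
  return 0;
}
```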
sub_request_id], + static_cast( + acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + + sub_request_id]), thread_count, request_id); } @@ -716,7 +716,7 @@ BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op) : OpMeta(handler) { sizeof(int) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); checkCUDA(cudaMalloc(&acc_probs, - sizeof(data_type_size(data_type)) * + data_type_size(data_type) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); checkCUDA(cudaMalloc(&block_start_index, diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 6027a2cd21..b2528a7c14 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -76,6 +76,8 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, if (data_type == DT_NONE) { data_type = input->data_type; } + DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; + bool offload = cpu_offload; Layer *li = nullptr; int weight_num = bias ? 2 : 1; if (data_type != input->data_type) { @@ -117,14 +119,22 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, int kParas = kProjSize * kSize; int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; - li->weights[0] = create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); + int one_head_size = qParas + kParas + vParas + oParas; + + // compress the weight size if quantization. + if (quantization_type != DT_NONE) { + one_head_size = get_quantization_to_byte_size( + data_type, quantization_type, one_head_size); + } + int dims[2] = {one_head_size, num_heads}; + li->weights[0] = create_weight_legion_ordering( + 2, + dims, + quantization_type == DT_NONE ? 
data_type : quantization_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); } if (bias) { // q, k, v, o @@ -150,6 +160,8 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_int_property("quantization_type", quantization_type); + li->add_int_property("offload", offload); layers.push_back(li); return li->outputs[0]; @@ -184,6 +196,10 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( layer->get_float_property("scaling_factor", scaling_factor); layer->get_int_property("qk_prod_scaling", value); bool qk_prod_scaling = (bool)value; + layer->get_int_property("quantization_type", value); + DataType quantization_type = (DataType)value; + layer->get_int_property("offload", value); + bool offload = (bool)value; return new IncMultiHeadSelfAttention(model, layer->layer_guid, @@ -201,6 +217,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( scaling_factor, qk_prod_scaling, false /*allocate_weights*/, + quantization_type, + offload, layer->name); } @@ -221,6 +239,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( float _scaling_factor, bool _qk_prod_scaling, bool allocate_weights, + DataType _quantization_type, + bool _offload, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -239,10 +259,10 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling) { + qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), + offload(_offload) { // overwrite layer_guid layer_guid = _layer_guid; - numOutputs = 1; int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -267,6 +287,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; dims[2].size = qParas + kParas + vParas + oParas; + + if (quantization_type != DT_NONE) { + dims[2].size = get_quantization_to_byte_size( + data_type, quantization_type, dims[2].size); + } dims[2].degree = 1; dims[2].parallel_idx = -1; int seed = std::rand(); @@ -276,12 +301,13 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( #else ParameterSyncType comm_type = ParameterSyncType::PS; #endif - weights[0] = model.create_parallel_weight<3>(dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - comm_type); + weights[0] = model.create_parallel_weight<3>( + dims, + quantization_type == DT_NONE ? 
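Several call sites in this patch shrink weight dimensions with get_quantization_to_byte_size(data_type, quantization_type, num_elements) when a quantization type is set. The body of that helper is not part of this excerpt; based on the group layout documented in file_loader.cc (32 packed values followed by one offset and one scale per group), it plausibly computes something like the sketch below. Treat the formula as an assumption, not the actual implementation:

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical reconstruction: bytes needed to hold `num_elements`
// quantized values plus one offset and one scale (stored in the
// original data type) per group of 32 elements.
size_t packed_byte_size(size_t num_elements,
                        bool int4,               // true: DT_INT4, false: DT_INT8
                        size_t sizeof_data_type) // e.g. 4 for float, 2 for half
{
  size_t value_bytes = int4 ? num_elements / 2 : num_elements;
  size_t groups = num_elements / 32; // INT4_NUM_OF_ELEMENTS_PER_GROUP
  size_t meta_bytes = 2 * groups * sizeof_data_type; // offset + scale per group
  return value_bytes + meta_bytes;
}

int main() {
  // 8192 float weights, int4: 4096 value bytes + 2 * 256 * 4 metadata bytes.
  assert(packed_byte_size(8192, true, sizeof(float)) == 4096 + 2048);
  return 0;
}
```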
this->data_type : quantization_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); } if (bias) { ParallelDim dims[2]; @@ -330,6 +356,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( float _scaling_factor, bool _qk_prod_scaling, bool allocate_weights, + DataType _quantization_type, + bool _offload, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -349,7 +377,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling) + qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), + offload(_offload) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -376,6 +405,10 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; dims[2].size = qParas + kParas + vParas + oParas; + if (quantization_type != DT_NONE) { + dims[2].size = get_quantization_to_byte_size( + data_type, quantization_type, dims[2].size); + } int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); #ifdef USE_NCCL @@ -383,12 +416,13 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( #else ParameterSyncType comm_type = ParameterSyncType::PS; #endif - weights[0] = model.create_parallel_weight<3>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - comm_type); + weights[0] = model.create_parallel_weight<3>( + dims, + quantization_type == DT_NONE ? this->data_type : quantization_type, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); } if (bias) { ParallelDim dims[2]; @@ -441,6 +475,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.scaling_factor, other.qk_prod_scaling, allocate_weights, + other.quantization_type, + other.offload, other.name) {} IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( @@ -465,6 +501,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.scaling_factor, params.qk_prod_scaling, allocate_weights, + params.quantization_type, + params.offload, name) {} void IncMultiHeadSelfAttention::init_inference( @@ -494,11 +532,13 @@ void IncMultiHeadSelfAttention::init_inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -596,12 +636,26 @@ OpMeta *IncMultiHeadSelfAttention::init_task( .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + if (attn->offload) { + // cpu-offload enabled + // use offload_reserved_space + gpu_mem_allocator.register_reserved_work_space( + handle.offload_reserve_space, handle.offload_reserve_space_size); + } IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( - handle, attn, weight, gpu_mem, num_samples, num_heads); - + handle, attn, weight, gpu_mem_allocator, num_samples, num_heads); + if (handle.offload_reserve_space == nullptr) { + // assert that we didn't over allocate memory + assert(gpu_mem_allocator.reserved_allocated_size == + gpu_mem_allocator.reserved_total_size); + } m->profiling = attn->profiling; - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); + if (attn->quantization_type == DT_NONE) { + assert(weight.domain.get_volume() * data_type_size(weight.data_type) == + m->weightSize); + } + return m; } @@ -641,11 +695,13 @@ FutureMap IncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -655,11 +711,13 @@ FutureMap IncMultiHeadSelfAttention::inference( launcher.add_field(idx++, FID_DATA); if (bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); @@ -1524,6 +1582,8 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; + params.quantization_type = this->quantization_type; + params.offload = this->offload; return params; } @@ -1547,6 +1607,8 @@ size_t hash::operator()( hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); + hash_combine(key, params.quantization_type); + hash_combine(key, params.offload); return key; } }; // namespace std diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index a627e0ef08..669cbd2636 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -61,7 +61,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads) : OpMeta(handler, attn) { @@ -88,9 +88,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bool _add_bias_kv, float _scaling_factor, GenericTensorAccessorR const &weight, - Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads) + int _num_heads, + DataType _quantization_type, + bool _offload) : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 33bc32224b..408c1ab012 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -17,6 +17,7 @@ #endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -349,6 +350,79 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream) { + // additional processing for weight uploading + // Note that we update weight_ptr and bias_ptr when uploading weight and + // bias + if (m->quantization_type != DT_NONE) { + // copy weight_ptr to quantized_weight_ptr, do compression and store in + // m->weight_ptr + cudaMemcpyAsync(m->quantized_weight_ptr, + weight.get_byte_ptr(), + m->quantized_weightSize, + cudaMemcpyHostToDevice, + stream); + + if (m->quantization_type == DT_INT4) { + int parallelism = m->qProjSize * m->qSize * m->num_heads / 2; + decompress_int4_attention_weights<<>>( + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_heads); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = m->qProjSize * m->qSize * m->num_heads; + decompress_int8_attention_weights<<>>( + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_heads); + } + } else { + if (data_type == DT_FLOAT) { + cudaMemcpyAsync(m->weight_ptr, + weight.get_float_ptr(), + m->weightSize, + cudaMemcpyHostToDevice, + stream); + } else if (data_type == DT_HALF) { + cudaMemcpyAsync(m->weight_ptr, + weight.get_half_ptr(), + m->weightSize, + cudaMemcpyHostToDevice, + stream); + } else { + assert(false); + } + } + // reload weight_o for offloading case + int parallelism = m->vProjSize * m->oProjSize * m->num_heads; + build_w_out_tensor<<>>(static_cast
<DT *>(m->weight_ptr), + static_cast<DT *>
(m->W_out_contiguous), + m->vProjSize, + m->oProjSize, + m->num_heads, + (m->qSize * m->qProjSize + + m->kSize * m->kProjSize + + m->vSize * m->vProjSize)); +} + template void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -358,6 +432,12 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, cudaStream_t stream) { // here because we need postion info in infernece 1 + + if (m->offload && m->biasSize > 0) { + cudaMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); + bias_ptr = static_cast
(m->bias_ptr); + } cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), @@ -681,31 +761,41 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - assert(input.data_type == weight.data_type); + // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); if (use_bias) { assert(input.data_type == bias.data_type); } + if (input.data_type == DT_HALF) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::IncMultiHeadAttention::inference_kernel(m, - bc, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + Kernels::IncMultiHeadAttention::inference_kernel( + m, + bc, + input.get_half_ptr(), + m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); } else if (input.data_type == DT_FLOAT) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::IncMultiHeadAttention::inference_kernel(m, - bc, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + Kernels::IncMultiHeadAttention::inference_kernel( + m, + bc, + input.get_float_ptr(), + m->offload ? static_cast(m->weight_ptr) + : weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -727,7 +817,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads) : IncMultiHeadSelfAttentionMeta(handler, @@ -747,9 +837,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->add_bias_kv, attn->scaling_factor, weight, - gpu_mem, + gpu_mem_allocator, num_samples, - _num_heads) {} + _num_heads, + attn->quantization_type, + attn->offload) {} IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, @@ -769,10 +861,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bool _add_bias_kv, float _scaling_factor, GenericTensorAccessorR const &weight, - Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads) - : OpMeta(handler, attn) { + int _num_heads, + DataType _quantization_type, + bool _offload) + : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); @@ -788,13 +882,20 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( vProjSize = _vProjSize; oProjSize = _oProjSize; size_t size_of_dt = data_type_size(attn->data_type); + quantization_type = _quantization_type; + offload = _offload; num_heads = _num_heads; weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)); weightSize = weights_params * num_heads * size_of_dt; - has_load_weights = (bool *)calloc(1, sizeof(bool)); - *has_load_weights = false; + if (quantization_type != DT_NONE) { + quantized_weightSize = get_quantization_to_byte_size( + attn->data_type, quantization_type, weightSize); + } + biasSize = _bias ? 
oProjSize * size_of_dt * 4 : 0; + // has_load_weights = (bool *)calloc(1, sizeof(bool)); + //*has_load_weights = false; apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); *apply_rotary_embedding = _apply_rotary_embedding; bias = (bool *)calloc(1, sizeof(bool)); @@ -807,6 +908,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // Currently do not support adding bias to key/value projection assert(!_add_bias_kv); + // allocate weight and bias in the reserve space for cpu offloading + if (offload) { + weight_ptr = gpu_mem_allocator.allocate_reserved_untyped(weightSize); + bias_ptr = gpu_mem_allocator.allocate_reserved_untyped(biasSize); + } + #ifdef INFERENCE_TESTS kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * BatchConfig::MAX_NUM_REQUESTS, @@ -860,37 +967,91 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + complex_size * sizeof(cuFloatComplex); // more components will // be added here later + if (offload) { + // assert that we have enough reserved work space left + size_t totalSharedSize = + infer_mode == TREE_VERIFY_MODE + ? totalSize - + (key_cache_size + value_cache_size + qkv_max_proj_size) * + size_of_dt + : totalSize - (key_cache_size + value_cache_size) * size_of_dt; + + size_t instance_size = + size_of_dt * + (infer_mode == TREE_VERIFY_MODE + ? key_cache_size + value_cache_size + qkv_max_proj_size + : key_cache_size + value_cache_size); + + if (quantization_type != DT_NONE) { + totalSharedSize += quantized_weightSize; + } + assert(gpu_mem_allocator.reserved_total_size - + gpu_mem_allocator.reserved_allocated_size >= + totalSharedSize); + gpu_mem_allocator.create_legion_instance(reserveInst, instance_size); + } else { + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + } + + // in tree_verify, enable devQKVProjArray; + if (!offload || infer_mode == TREE_VERIFY_MODE) { + devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped( + qkv_max_proj_size * size_of_dt); + } else { + devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped( + qkv_max_proj_size * size_of_dt); + // offset += qkv_max_proj_size * size_of_dt; + } + + // use key value cache in all mode. 
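With offloading enabled, the surrounding IncMultiHeadSelfAttentionMeta constructor splits the buffers: the key/value cache (plus the QKV projection array in tree-verify mode) still gets its own Legion instance, while the per-step scratch buffers, the uploaded weights, and the quantized staging buffer come out of the pre-reserved work space, guarded by an assert that enough reserve remains. A toy version of that size accounting, with made-up sizes:

```cpp
#include <cassert>
#include <cstddef>

int main() {
  bool const tree_verify_mode = false;
  size_t const size_of_dt = 2; // half precision
  // Made-up element counts standing in for the real buffer sizes.
  size_t const qkv_max_proj_size = 1 << 16;
  size_t const key_cache_size = 1 << 20;
  size_t const value_cache_size = 1 << 20;
  size_t const scratch_size = 1 << 18; // token infos, qk_prods, attn_heads, ...
  size_t const total_size =
      (qkv_max_proj_size + key_cache_size + value_cache_size + scratch_size) *
      size_of_dt;

  // Buffers that keep a dedicated Legion instance in offload mode.
  size_t const instance_size =
      size_of_dt * (tree_verify_mode
                        ? key_cache_size + value_cache_size + qkv_max_proj_size
                        : key_cache_size + value_cache_size);
  // Everything else must fit in the pre-reserved offload work space.
  size_t const shared_size = total_size - instance_size;

  size_t const reserved_total = 512u * 1024 * 1024; // hypothetical reserve size
  size_t const reserved_used = 0;
  assert(reserved_total - reserved_used >= shared_size);
  return 0;
}
```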
+ keyCache = gpu_mem_allocator.allocate_instance_untyped(key_cache_size * + size_of_dt); + valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * + size_of_dt); + + if (offload) { + token_infos = + gpu_mem_allocator.allocate_reserved( + tokeninfo_size); + // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; + qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * + size_of_dt); + // offset += qk_prod_size * size_of_dt; + qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( + qk_prod_size * size_of_dt); + // offset += qk_prod_size * size_of_dt; + attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * + size_of_dt); + // offset += attn_heads_size * size_of_dt; + W_out_contiguous = gpu_mem_allocator.allocate_reserved_untyped( + W_out_contiguous_size * size_of_dt); + // offset += W_out_contiguous_size * size_of_dt; + complex_input = + gpu_mem_allocator.allocate_reserved(complex_size); + // offset += complex_size * sizeof(cuFloatComplex); + } else { + token_infos = + gpu_mem_allocator.allocate_instance( + tokeninfo_size); + qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * + size_of_dt); + qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( + qk_prod_size * size_of_dt); + attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * + size_of_dt); + W_out_contiguous = gpu_mem_allocator.allocate_instance_untyped( + W_out_contiguous_size * size_of_dt); + complex_input = + gpu_mem_allocator.allocate_instance(complex_size); + } + + // allocate more size for quantization data + if (quantization_type != DT_NONE) { + assert(offload); + quantized_weight_ptr = + gpu_mem_allocator.allocate_reserved(quantized_weightSize); + } - Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(totalSize - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(reserveInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - off_t offset = 0; - devQKVProjArray = reserveInst.pointer_untyped(offset, 0); - offset += qkv_max_proj_size * size_of_dt; - keyCache = reserveInst.pointer_untyped(offset, 0); - offset += key_cache_size * size_of_dt; - valueCache = reserveInst.pointer_untyped(offset, 0); - offset += value_cache_size * size_of_dt; - token_infos = reserveInst.pointer(offset); - offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; - qk_prods = reserveInst.pointer_untyped(offset, 0); - offset += qk_prod_size * size_of_dt; - qk_prods_softmax = reserveInst.pointer_untyped(offset, 0); - offset += qk_prod_size * size_of_dt; - attn_heads = reserveInst.pointer_untyped(offset, 0); - offset += attn_heads_size * size_of_dt; - W_out_contiguous = reserveInst.pointer_untyped(offset, 0); - offset += W_out_contiguous_size * size_of_dt; - complex_input = reserveInst.pointer(offset); - offset += complex_size * sizeof(cuFloatComplex); if (weight.data_type == DT_FLOAT) { int parallelism = vProjSize * oProjSize * num_heads; build_w_out_tensor<<( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + }; // namespace FlexFlow diff --git a/src/ops/kernels/decompress_kernels.cu b/src/ops/kernels/decompress_kernels.cu new 
file mode 100644 index 0000000000..2e02ce1eec --- /dev/null +++ b/src/ops/kernels/decompress_kernels.cu @@ -0,0 +1,261 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +namespace Kernels { + +template +__global__ void decompress_int4_general_weights(char const *input_weight_ptr, + DT *weight_ptr, + int in_dim, + int valueSize) { + // eg. in dim = 3072, out dim = 768 + CUDA_KERNEL_LOOP(i, valueSize / 2) { + size_t real_idx_first = i * 2; + size_t real_idx_second = i * 2 + 1; + size_t group_idx = + (real_idx_first / (in_dim * INT4_NUM_OF_ELEMENTS_PER_GROUP)) * in_dim + + real_idx_first % in_dim; + size_t idx = i; + size_t offset_idx = (valueSize / 2) + group_idx * sizeof(DT); + size_t scale_idx = offset_idx + sizeof(DT) * (valueSize / 32); + + weight_ptr[real_idx_first] = + static_cast
((input_weight_ptr[idx] >> 4) & 0xF) / + (*(DT *)(input_weight_ptr + scale_idx)) + + (*(DT *)(input_weight_ptr + offset_idx)); + weight_ptr[real_idx_second] = + static_cast
(input_weight_ptr[idx] & 0xF) / + (*(DT *)(input_weight_ptr + scale_idx + sizeof(DT))) + + (*(DT *)(input_weight_ptr + offset_idx + sizeof(DT))); + } +} + +template +__global__ void decompress_int8_general_weights(char const *input_weight_ptr, + DT *weight_ptr, + int in_dim, + int valueSize) { + CUDA_KERNEL_LOOP(i, valueSize) { + size_t idx = i; + size_t group_idx = + (idx / (in_dim * INT4_NUM_OF_ELEMENTS_PER_GROUP)) * in_dim + + idx % in_dim; + size_t offset_idx = valueSize + group_idx * sizeof(DT); + size_t scale_idx = offset_idx + sizeof(DT) * (valueSize / 32); + weight_ptr[idx] = static_cast
(input_weight_ptr[idx] & 0xFF) / + (*(DT *)(input_weight_ptr + scale_idx)) + + (*(DT *)(input_weight_ptr + offset_idx)); + } +} + +template +__global__ void decompress_int4_attention_weights(char *input_weight_ptr, + DT *weight_ptr, + int qProjSize, + int qSize, + int num_heads) { + // TODO this is because in top level function we assume q,k,v in same size + CUDA_KERNEL_LOOP(i, qProjSize * num_heads * qSize / 2) { + int q_block_size = (qProjSize * qSize) / 2; + int real_q_block_size = q_block_size * 2; + size_t qkvo_block_size = q_block_size * 4; + size_t real_qkvo_block_size = qkvo_block_size * 2; + + int group_idx = (i * 2 / (INT4_NUM_OF_ELEMENTS_PER_GROUP * qSize)) * qSize + + (i * 2) % qSize; + // i * 2 / (INT4_NUM_OF_ELEMENTS_PER_GROUP); + int head_idx = i / q_block_size; + int data_idx = i % q_block_size; + + size_t idx_q = head_idx * qkvo_block_size + data_idx; + size_t idx_k = idx_q + q_block_size; + size_t idx_v = idx_k + q_block_size; + size_t idx_o = idx_v + q_block_size; + + size_t real_idx_q_first = head_idx * real_qkvo_block_size + data_idx * 2; + size_t real_idx_q_second = real_idx_q_first + 1; + size_t real_idx_k_first = + head_idx * real_qkvo_block_size + real_q_block_size + data_idx * 2; + size_t real_idx_k_second = real_idx_k_first + 1; + size_t real_idx_v_first = + head_idx * real_qkvo_block_size + real_q_block_size * 2 + data_idx * 2; + size_t real_idx_v_second = real_idx_v_first + 1; + size_t real_idx_o_first = + head_idx * real_qkvo_block_size + real_q_block_size * 3 + data_idx * 2; + size_t real_idx_o_second = real_idx_o_first + 1; + + size_t meta_offset = num_heads * qkvo_block_size; + size_t one_meta_size = sizeof(DT) * (qProjSize * num_heads * qSize / 32); + size_t q_offset_idx = meta_offset + group_idx * sizeof(DT); + size_t q_scaling_idx = q_offset_idx + one_meta_size; + + size_t k_offset_idx = q_scaling_idx + one_meta_size; + size_t k_scaling_idx = k_offset_idx + one_meta_size; + + size_t v_offset_idx = k_scaling_idx + one_meta_size; + size_t v_scaling_idx = v_offset_idx + one_meta_size; + + size_t o_offset_idx = v_scaling_idx + one_meta_size; + size_t o_scaling_idx = o_offset_idx + one_meta_size; + + weight_ptr[real_idx_q_first] = + static_cast
((input_weight_ptr[idx_q] >> 4) & 0xF) / + (*(DT *)(input_weight_ptr + q_scaling_idx)) + + (*(DT *)(input_weight_ptr + q_offset_idx)); + weight_ptr[real_idx_q_second] = + static_cast
((input_weight_ptr[idx_q] & 0xF)) / + (*(DT *)(input_weight_ptr + q_scaling_idx + sizeof(DT))) + + (*(DT *)(input_weight_ptr + q_offset_idx + sizeof(DT))); + weight_ptr[real_idx_k_first] = + static_cast
((input_weight_ptr[idx_k] >> 4) & 0xF) / + (*(DT *)(input_weight_ptr + k_scaling_idx)) + + (*(DT *)(input_weight_ptr + k_offset_idx)); + weight_ptr[real_idx_k_second] = + static_cast
((input_weight_ptr[idx_k] & 0xF)) / + (*(DT *)(input_weight_ptr + k_scaling_idx + sizeof(DT))) + + (*(DT *)(input_weight_ptr + k_offset_idx + sizeof(DT))); + weight_ptr[real_idx_v_first] = + static_cast
((input_weight_ptr[idx_v] >> 4) & 0xF) / + (*(DT *)(input_weight_ptr + v_scaling_idx)) + + (*(DT *)(input_weight_ptr + v_offset_idx)); + weight_ptr[real_idx_v_second] = + static_cast
((input_weight_ptr[idx_v] & 0xF)) / + (*(DT *)(input_weight_ptr + v_scaling_idx + sizeof(DT))) + + (*(DT *)(input_weight_ptr + v_offset_idx + sizeof(DT))); + weight_ptr[real_idx_o_first] = + static_cast
((input_weight_ptr[idx_o] >> 4) & 0xF) / + (*(DT *)(input_weight_ptr + o_scaling_idx)) + + (*(DT *)(input_weight_ptr + o_offset_idx)); + weight_ptr[real_idx_o_second] = + static_cast
((input_weight_ptr[idx_o] & 0xF)) / + (*(DT *)(input_weight_ptr + o_scaling_idx + sizeof(DT))) + + (*(DT *)(input_weight_ptr + o_offset_idx + sizeof(DT))); + } +} + +template +__global__ void decompress_int8_attention_weights(char *input_weight_ptr, + DT *weight_ptr, + int qProjSize, + int qSize, + int num_heads) { + // TODO this is because in top level function we assume q,k,v in same size + CUDA_KERNEL_LOOP(i, qProjSize * num_heads * qSize) { + int q_block_size = qProjSize * qSize; + size_t qkvo_block_size = q_block_size * 4; + + int group_idx = + (i / (INT4_NUM_OF_ELEMENTS_PER_GROUP * qSize)) * qSize + i % qSize; + // i * 2 / (INT4_NUM_OF_ELEMENTS_PER_GROUP); + int head_idx = i / q_block_size; + int data_idx = i % q_block_size; + + size_t idx_q = head_idx * qkvo_block_size + data_idx; + size_t idx_k = idx_q + q_block_size; + size_t idx_v = idx_k + q_block_size; + size_t idx_o = idx_v + q_block_size; + + size_t meta_offset = num_heads * qkvo_block_size; + size_t one_meta_size = sizeof(DT) * (qProjSize * num_heads * qSize / 32); + size_t q_offset_idx = meta_offset + group_idx * sizeof(DT); + size_t q_scaling_idx = q_offset_idx + one_meta_size; + + size_t k_offset_idx = q_scaling_idx + one_meta_size; + size_t k_scaling_idx = k_offset_idx + one_meta_size; + + size_t v_offset_idx = k_scaling_idx + one_meta_size; + size_t v_scaling_idx = v_offset_idx + one_meta_size; + + size_t o_offset_idx = v_scaling_idx + one_meta_size; + size_t o_scaling_idx = o_offset_idx + one_meta_size; + + weight_ptr[idx_q] = static_cast
(input_weight_ptr[idx_q] & 0xFF) / + (*(DT *)(input_weight_ptr + q_scaling_idx)) + + (*(DT *)(input_weight_ptr + q_offset_idx)); + weight_ptr[idx_k] = static_cast
(input_weight_ptr[idx_k] & 0xFF) / + (*(DT *)(input_weight_ptr + k_scaling_idx)) + + (*(DT *)(input_weight_ptr + k_offset_idx)); + weight_ptr[idx_v] = static_cast
(input_weight_ptr[idx_v] & 0xFF) / + (*(DT *)(input_weight_ptr + v_scaling_idx)) + + (*(DT *)(input_weight_ptr + v_offset_idx)); + weight_ptr[idx_o] = static_cast
(input_weight_ptr[idx_o] & 0xFF) / + (*(DT *)(input_weight_ptr + o_scaling_idx)) + + (*(DT *)(input_weight_ptr + o_offset_idx)); + } +} + +template __global__ void decompress_int4_general_weights( + char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int4_general_weights( + char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int8_general_weights( + char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int8_general_weights( + char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void + decompress_int4_attention_weights(char *input_weight_ptr, + float *weight_ptr, + int qProjSize, + int qSize, + int num_heads); + +template __global__ void + decompress_int4_attention_weights(char *input_weight_ptr, + half *weight_ptr, + int qProjSize, + int qSize, + int num_heads); + +template __global__ void + decompress_int8_attention_weights(char *input_weight_ptr, + float *weight_ptr, + int qProjSize, + int qSize, + int num_heads); + +template __global__ void + decompress_int8_attention_weights(char *input_weight_ptr, + half *weight_ptr, + int qProjSize, + int qSize, + int num_heads); +// template +// void decompress_weight_bias(T1 *input_weight_ptr, +// T2 *weight_ptr, +// T2 *params, +// int group_size, +// int tensor_size) { + +// // convert to DT, scaling, add offset; +// cudaStream_t stream; +// checkCUDA(get_legion_stream(&stream)); +// int parallelism = tensor_size; +// decompress_kernel<<>>( +// input_weight_ptr, weight_ptr, params, group_size); +// } +} // namespace Kernels +}; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index a041f008bc..55a47d7108 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -19,7 +19,11 @@ namespace FlexFlow { -LinearMeta::LinearMeta(FFHandler handler, int batch_size, Linear const *li) +LinearMeta::LinearMeta(FFHandler handler, + int batch_size, + Linear const *li, + MemoryAllocator gpu_mem_allocator, + int weightSize) : OpMeta(handler, li) { // Allocate an all-one's vector float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size); diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 02b018566e..3f806dd4f5 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -14,15 +14,31 @@ */ #include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -LinearMeta::LinearMeta(FFHandler handler, int batch_size, Linear const *li) - : OpMeta(handler, li) { - // Allocate an all-one's vector +LinearMeta::LinearMeta(FFHandler handler, + int batch_size, + Linear const *li, + MemoryAllocator gpu_mem_allocator, + int weightSize) + : OpMeta(handler, li), weight_ptr(nullptr) { DataType data_type = li->data_type; + // allocate weight and bias in the reserve space for cpu offloading + if (li->offload) { + weight_ptr = gpu_mem_allocator.allocate_reserved_untyped( + weightSize * data_type_size(data_type)); + if (li->quantization_type != DT_NONE) { + quantized_weightSize = get_quantization_to_byte_size( + data_type, li->quantization_type, weightSize); + quantized_weight_ptr = + gpu_mem_allocator.allocate_reserved(quantized_weightSize); + } + } + // Allocate an 
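decompress_int4_general_weights above reads each packed byte, recovers the high and low nibbles, and applies that group's offset and scale, which are stored after the packed values. A plain C++ reference of the per-byte arithmetic (ignoring the group/stride indexing, which depends on the tensor layout) can be useful for spot-checking kernel output on small tensors; names and values here are illustrative:

```cpp
#include <cstdint>
#include <vector>

// Reference dequantization for int4-packed weights within one group:
// high nibble first, low nibble second, x ≈ q / scale + offset,
// mirroring the arithmetic in decompress_int4_general_weights.
std::vector<float> dequant_int4_group(std::vector<uint8_t> const &packed,
                                      float offset,
                                      float scale) {
  std::vector<float> out;
  out.reserve(packed.size() * 2);
  for (uint8_t byte : packed) {
    float hi = static_cast<float>((byte >> 4) & 0xF);
    float lo = static_cast<float>(byte & 0xF);
    out.push_back(hi / scale + offset);
    out.push_back(lo / scale + offset);
  }
  return out;
}

int main() {
  std::vector<uint8_t> packed = {0x1F, 0x70, 0xC3};
  // 4-bit values span 0..15, so a toy scale of 15 / (max - min) is used here.
  std::vector<float> weights =
      dequant_int4_group(packed, /*offset=*/-0.8f, /*scale=*/15.0f / 1.7f);
  return weights.empty(); // 0 on success
}
```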
all-one's vector checkCUDA(cudaMalloc(&one_ptr, data_type_size(data_type) * batch_size)); int parallelism = batch_size; cudaStream_t stream; @@ -100,7 +116,6 @@ void forward_kernel_wrapper(LinearMeta const *m, int batch_size) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -237,11 +252,53 @@ void forward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream) { + // additional processing for uploading weights + if (m->offload) { + // Note that we update weight_ptr when uploading weight + if (m->quantization_type != DT_NONE) { + cudaMemcpyAsync(m->quantized_weight_ptr, + weight_ptr, + m->quantized_weightSize, + cudaMemcpyHostToDevice, + stream); + if (m->quantization_type == DT_INT4) { + int parallelism = in_dim * out_dim / 2; + decompress_int4_general_weights
+ <<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = in_dim * out_dim; + decompress_int8_general_weights<DT>
+ <<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } + + } else { + cudaMemcpyAsync(m->weight_ptr, + weight_ptr, + in_dim * out_dim * sizeof(DT), + cudaMemcpyHostToDevice, + stream); + } + } checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type); - cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type); + cudaDataType_t weight_type = m->offload + ? ff_to_cuda_datatype(m->weight_ptr_type) + : ff_to_cuda_datatype(m->weight_type); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type); assert(input_type == weight_type && weight_type == output_type); #if CUDA_VERSION >= 11000 @@ -257,7 +314,7 @@ void forward_kernel(LinearMeta const *m, batch_size, in_dim, &alpha, - weight_ptr, + m->offload ? m->weight_ptr : weight_ptr, weight_type, in_dim, input_ptr, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 19845214e2..6ab99e6892 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -1,4 +1,5 @@ #include "flexflow/ops/linear.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/layer.h" #include "flexflow/model.h" #include "flexflow/ops/kernels/linear_kernels.h" @@ -15,6 +16,8 @@ using Legion::Domain; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; using Legion::PhysicalRegion; using Legion::Predicate; using Legion::Rect; @@ -43,6 +46,8 @@ Tensor FFModel::dense(const Tensor input, if (data_type == DT_NONE) { data_type = input->data_type; } + DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; + bool offload = cpu_offload; Layer *li = nullptr; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for dense"); @@ -77,14 +82,18 @@ Tensor FFModel::dense(const Tensor input, } { int dims[2] = {input->dims[0], outDim}; - li->weights[KERNEL_IDX] = - create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); + if (quantization_type != DT_NONE) { + dims[0] = + get_quantization_to_byte_size(data_type, quantization_type, dims[0]); + } + li->weights[KERNEL_IDX] = create_weight_legion_ordering( + 2, + dims, + quantization_type == DT_NONE ? 
data_type : quantization_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); } if (use_bias) { int dims[1] = {outDim}; @@ -101,6 +110,8 @@ Tensor FFModel::dense(const Tensor input, li->add_int_property("activation", activation); li->add_int_property("kernel_reg_type", kernel_reg_type); li->add_float_property("kernel_reg_lambda", kernel_reg_lambda); + li->add_int_property("quantization_type", quantization_type); + li->add_int_property("offload", offload); layers.push_back(li); return li->outputs[0]; } @@ -120,6 +131,10 @@ Op *Linear::create_operator_from_layer( RegularizerMode kernel_reg_type = (RegularizerMode)value; float kernel_reg_lambda; layer->get_float_property("kernel_reg_lambda", kernel_reg_lambda); + layer->get_int_property("quantization_type", value); + DataType quantization_type = (DataType)value; + layer->get_int_property("offload", value); + bool offload = (bool)value; return new Linear(model, layer->layer_guid, inputs[0], @@ -129,6 +144,8 @@ Op *Linear::create_operator_from_layer( kernel_reg_lambda, use_bias, layer->data_type, + quantization_type, + offload, false /*allocate_weights*/, layer->name); } @@ -150,6 +167,8 @@ Linear::Linear(FFModel &model, other.kernel_reg_lambda, other.use_bias, other.data_type, + other.quantization_type, + other.offload, allocate_weights, other.name) {} @@ -167,6 +186,8 @@ Linear::Linear(FFModel &model, params.kernel_reg_lambda, params.use_bias, params.data_type, + params.quantization_type, + params.offload, allocate_weights, name) {} @@ -179,6 +200,8 @@ Linear::Linear(FFModel &model, float _kernel_reg_lambda, bool _use_bias, DataType _data_type, + DataType _quantization_type, + bool _offload, bool allocate_weights, char const *name) : Op(model, @@ -192,6 +215,7 @@ Linear::Linear(FFModel &model, _input), out_channels(out_dim), activation(_activation), use_bias(_use_bias), kernel_reg_type(_kernel_reg_type), kernel_reg_lambda(_kernel_reg_lambda), + quantization_type(_quantization_type), offload(_offload), replica(ParallelTensorBase::NO_TENSOR) { // overwrite layer_guid layer_guid = _layer_guid; @@ -206,18 +230,20 @@ Linear::Linear(FFModel &model, LinearParams params = this->get_params(); params.construct_mappings(*this->parallel_dims_mapping, input_shape); params.solve_dims(input_shape, output_shape, kernel_shape, bias_shape); - if (allocate_weights) { Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); - - weights[KERNEL_IDX] = - model.create_parallel_weight_legion_ordering(kernel_shape.num_dims, - kernel_shape.dims, - _data_type, - NULL /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); + if (quantization_type != DT_NONE) { + kernel_shape.dims[0].size = get_quantization_to_byte_size( + data_type, quantization_type, kernel_shape.dims[0].size); + } + weights[KERNEL_IDX] = model.create_parallel_weight_legion_ordering( + kernel_shape.num_dims, + kernel_shape.dims, + quantization_type == DT_NONE ? 
_data_type : quantization_type, + NULL /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); if (use_bias) { Initializer *bias_initializer = new ZeroInitializer(); @@ -260,18 +286,24 @@ void Linear::init(FFModel const &ff) { // RegionRequirement(input_lps[0], 0/*projection id*/, // READ_ONLY, EXCLUSIVE, inputs[0]->region)); // launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[0]->region)); - launcher.add_field(0, FID_DATA); + launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(2, FID_DATA); // launcher.add_region_requirement( // RegionRequirement(weights[1]->part, 0/*projection id*/, // READ_ONLY, EXCLUSIVE, weights[1]->region)); @@ -313,18 +345,26 @@ void Linear::init_inference(FFModel const &ff, // RegionRequirement(input_lps[0], 0/*projection id*/, // READ_ONLY, EXCLUSIVE, inputs[0]->region)); // launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - WRITE_ONLY, + READ_ONLY, EXCLUSIVE, - batch_outputs[0]->region)); + batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, - READ_ONLY, + WRITE_ONLY, EXCLUSIVE, - weights[0]->region)); + batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(2, FID_DATA); // launcher.add_region_requirement( // RegionRequirement(weights[1]->part, 0/*projection id*/, // READ_ONLY, EXCLUSIVE, weights[1]->region)); @@ -351,6 +391,7 @@ OpMeta *Linear::init_task(Task const *task, Context ctx, Runtime *runtime) { Linear const *linear = (Linear *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(linear->inputs[0]->data_type, regions[0], @@ -362,9 +403,21 @@ OpMeta *Linear::init_task(Task const *task, #define DIMFUNC(DIM) \ case DIM: \ if (output.data_type == DT_HALF) { \ - return init_task_with_dim(task, regions, ctx, runtime); \ + if (linear->quantization_type != DT_NONE) { \ + return init_task_with_dim( \ + task, regions, ctx, runtime); \ + } else { \ + return init_task_with_dim( \ + task, regions, ctx, runtime); \ + } \ } else if (output.data_type == DT_FLOAT) { \ - return init_task_with_dim(task, regions, ctx, runtime); \ + if (linear->quantization_type != DT_NONE) { \ + return init_task_with_dim( \ + task, regions, ctx, runtime); \ + } else { \ + return init_task_with_dim( \ + task, regions, ctx, runtime); \ + } \ } else { \ assert(false && "Unsupported data type"); \ } @@ -376,7 +429,7 @@ OpMeta *Linear::init_task(Task const *task, return NULL; } -template +template OpMeta *Linear::init_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, @@ -387,29 +440,45 @@ OpMeta *Linear::init_task_with_dim(Task const *task, FFHandler handle = *((FFHandler const *)task->local_args); // TensorAccessorR acc_input( // regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_output(regions[0], - task->regions[0], + TensorAccessorR acc_input( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + TensorAccessorW acc_output(regions[1], + task->regions[1], FID_DATA, ctx, runtime, false /*readOutput*/); - TensorAccessorW acc_kernel(regions[1], - task->regions[1], + TensorAccessorW acc_kernel(regions[2], + task->regions[2], FID_DATA, ctx, runtime, false /*readOutput*/); + // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); - // int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; - int in_dim = acc_kernel.rect.hi[0] - acc_kernel.rect.lo[0] + 1; + int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; + // int in_dim = acc_kernel.rect.hi[0] - acc_kernel.rect.lo[0] + 1; int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1; int batch_size = acc_output.rect.volume() / out_dim; printf("init linear (input): in_dim(%d) out_dim(%d) batch_size(%d)\n", in_dim, out_dim, batch_size); - LinearMeta *m = new LinearMeta(handle, batch_size, linear); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + if (linear->offload) { + // cpu-offload enabled + // use offload_reserved_space + gpu_mem_allocator.register_reserved_work_space( + handle.offload_reserve_space, handle.offload_reserve_space_size); + } + + LinearMeta *m = new LinearMeta( + handle, batch_size, linear, gpu_mem_allocator, in_dim * out_dim); m->activation = linear->activation; m->kernel_reg_type = linear->kernel_reg_type; m->kernel_reg_lambda = linear->kernel_reg_lambda; @@ -419,6 +488,9 @@ OpMeta *Linear::init_task_with_dim(Task const *task, m->input_type = linear->inputs[0]->data_type; m->weight_type = linear->weights[0]->data_type; m->output_type = 
linear->outputs[0]->data_type; + m->weight_ptr_type = m->input_type; + m->quantization_type = linear->quantization_type; + m->offload = linear->offload; std::strcpy(m->op_name, linear->name); init_kernel(m, batch_size, out_dim); @@ -502,11 +574,13 @@ FutureMap Linear::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(2, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, @@ -526,15 +600,29 @@ void Linear::forward_task(Task const *task, Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); LinearMeta const *m = *((LinearMeta **)task->local_args); - assert(m->input_type == m->weight_type); + if (m->quantization_type == DT_NONE) { + assert(m->input_type == m->weight_type); + } assert(m->input_type == m->output_type); switch (input_domain.get_dim()) { #define DIMFUNC(DIM) \ case DIM: \ if (m->output_type == DT_HALF) { \ - return forward_task_with_dim(task, regions, ctx, runtime); \ + if (m->quantization_type != DT_NONE) { \ + return forward_task_with_dim( \ + task, regions, ctx, runtime); \ + } else { \ + return forward_task_with_dim( \ + task, regions, ctx, runtime); \ + } \ } else if (m->output_type == DT_FLOAT) { \ - return forward_task_with_dim(task, regions, ctx, runtime); \ + if (m->quantization_type != DT_NONE) { \ + return forward_task_with_dim( \ + task, regions, ctx, runtime); \ + } else { \ + return forward_task_with_dim( \ + task, regions, ctx, runtime); \ + } \ } else { \ assert(false && "Unsupported data type"); \ } @@ -551,7 +639,7 @@ void Linear::forward_task(Task const *task, regions[2](I): kernel regions[3](I): bias */ -template +template void Linear::forward_task_with_dim(Task const *task, std::vector const ®ions, Context ctx, @@ -569,14 +657,14 @@ void Linear::forward_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - TensorAccessorR acc_kernel( + TensorAccessorR acc_kernel( regions[2], task->regions[2], FID_DATA, ctx, runtime); int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1; int batch_size = acc_output.rect.volume() / out_dim; assert(acc_output.rect.volume() == static_cast(out_dim * batch_size)); assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); - assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); + // assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); DT const *acc_bias_ptr = nullptr; if (m->use_bias) { TensorAccessorR acc_bias( @@ -678,7 +766,9 @@ void Linear::backward_task(Task const *task, Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); LinearMeta const *m = *((LinearMeta **)task->local_args); - assert(m->input_type == m->weight_type); + if (m->quantization_type == DT_NONE) { + assert(m->input_type == m->weight_type); + } assert(m->input_type == m->output_type); switch (in_domain.get_dim()) { #define DIMFUNC(DIM) \ @@ -1082,6 +1172,8 @@ void Linear::serialize(Legion::Serializer &sez) const { sez.serialize(this->kernel_reg_lambda); sez.serialize(this->use_bias); 
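// Note: the fields written by Linear::serialize() must be read back in
// exactly the same order by Linear::deserialize() below; the two fields
// this patch appends (quantization_type, offload) are therefore added at
// the end of both the serialize and deserialize sequences.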
sez.serialize(this->data_type); + sez.serialize(this->quantization_type); + sez.serialize(this->offload); } /* static */ @@ -1097,6 +1189,8 @@ Node Linear::deserialize(FFModel &ff, float kernel_reg_lambda; bool use_bias; DataType data_type; + DataType quantization_type; + bool offload; size_t id; dez.deserialize(id); LayerID layer_guid(id); @@ -1106,6 +1200,8 @@ Node Linear::deserialize(FFModel &ff, dez.deserialize(kernel_reg_lambda); dez.deserialize(use_bias); dez.deserialize(data_type); + dez.deserialize(quantization_type); + dez.deserialize(offload); LinearParams params; params.activation = activation; @@ -1115,6 +1211,8 @@ Node Linear::deserialize(FFModel &ff, params.use_bias = use_bias; params.data_type = data_type; params.layer_guid = layer_guid; + params.quantization_type = quantization_type; + params.offload = offload; return ff.get_or_create_node(inputs[0], params); } @@ -1127,6 +1225,8 @@ LinearParams Linear::get_params() const { params.activation = this->activation; params.kernel_reg_type = this->kernel_reg_type; params.kernel_reg_lambda = this->kernel_reg_lambda; + params.quantization_type = this->quantization_type; + params.offload = this->offload; return params; } @@ -1330,6 +1430,8 @@ size_t hash::operator()( hash_combine(key, params.activation); hash_combine(key, params.kernel_reg_type); hash_combine(key, params.kernel_reg_lambda); + hash_combine(key, params.quantization_type); + hash_combine(key, params.offload); return key; } }; // namespace std diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index e4c2837e87..db3e83847f 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -595,8 +595,13 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + // We don't do offloading for SSMs (small speculative models) SpecIncMultiHeadSelfAttentionMeta *m = new SpecIncMultiHeadSelfAttentionMeta( - handle, attn, weight, gpu_mem, num_samples, num_heads); + handle, attn, weight, gpu_mem_allocator, num_samples, num_heads); + // assert that we didn't over allocate memory + assert(gpu_mem_allocator.instance_allocated_size == + gpu_mem_allocator.instance_total_size); m->profiling = attn->profiling; assert(weight.domain.get_volume() * data_type_size(weight.data_type) == m->weightSize); diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 85bd71b205..90a4c89a39 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -61,7 +61,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads) : IncMultiHeadSelfAttentionMeta(handler, @@ -81,9 +81,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->add_bias_kv, attn->scaling_factor, weight, - gpu_mem, + gpu_mem_allocator, num_samples, - _num_heads) { + _num_heads, + DT_NONE, + false) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 7370ff446e..867ef4b6b9 100644 --- 
a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -582,7 +582,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads) : IncMultiHeadSelfAttentionMeta(handler, @@ -602,9 +602,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->add_bias_kv, attn->scaling_factor, weight, - gpu_mem, + gpu_mem_allocator, num_samples, - _num_heads) { + _num_heads, + DT_NONE, + false) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); @@ -624,39 +626,37 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( BeamSearchPerRequestInfo); // more components will // be added here later - Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(total_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(beam_search_reserve_inst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - off_t offset = 0; + // We always directly allocate memory for small speculative models + gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + total_size); beam_token_infos = - beam_search_reserve_inst - .pointer(offset); - offset += beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); + gpu_mem_allocator + .allocate_instance( + beam_tokeninfo_size); + // offset += beam_tokeninfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); request_infos = - beam_search_reserve_inst.pointer(offset); - offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); + gpu_mem_allocator.allocate_instance( + requestinfo_size); + // offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); beam_request_infos = - beam_search_reserve_inst - .pointer(offset); - offset += beam_requestinfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - assert(offset == total_size); + gpu_mem_allocator + .allocate_instance( + beam_requestinfo_size); + // offset += beam_requestinfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); + // assert(offset == total_size); + assert(gpu_mem_allocator.instance_total_size == + gpu_mem_allocator.instance_allocated_size); } cudaStreamSynchronize(stream); } SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { - beam_search_reserve_inst.destroy(); + if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { + beam_search_reserve_inst.destroy(); + } } }; // namespace FlexFlow diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 0df7e36f32..4fe218e06c 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -77,6 +77,8 @@ Tensor FFModel::inc_multihead_self_attention_verify( if (data_type == DT_NONE) { data_type = input->data_type; } + DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; + bool offload = cpu_offload; Layer *li = nullptr; int weight_num = bias ? 2 : 1; if (data_type != input->data_type) { @@ -118,14 +120,22 @@ Tensor FFModel::inc_multihead_self_attention_verify( int kParas = kProjSize * kSize; int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; - li->weights[0] = create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); + int one_head_size = qParas + kParas + vParas + oParas; + // compress the weight size if quantization. + if (quantization_type != DT_NONE) { + one_head_size = get_quantization_to_byte_size( + data_type, quantization_type, one_head_size); + } + + int dims[2] = {one_head_size, num_heads}; + li->weights[0] = create_weight_legion_ordering( + 2, + dims, + quantization_type == DT_NONE ? data_type : quantization_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); } if (bias) { // q, k, v, o @@ -151,6 +161,8 @@ Tensor FFModel::inc_multihead_self_attention_verify( li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_int_property("quantization_type", quantization_type); + li->add_int_property("offload", offload); layers.push_back(li); return li->outputs[0]; } @@ -184,6 +196,10 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( layer->get_float_property("scaling_factor", scaling_factor); layer->get_int_property("qk_prod_scaling", value); bool qk_prod_scaling = (bool)value; + layer->get_int_property("quantization_type", value); + DataType quantization_type = (DataType)value; + layer->get_int_property("offload", value); + bool offload = (bool)value; return new TreeIncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], @@ -200,6 +216,8 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( scaling_factor, qk_prod_scaling, false /*allocate_weights*/, + quantization_type, + offload, layer->name); } @@ -220,6 +238,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( float _scaling_factor, bool _qk_prod_scaling, bool allocate_weights, + DataType _quantization_type, + bool _offload, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -238,7 +258,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling) { + qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), + offload(_offload) { // overwrite layer_guid layer_guid = _layer_guid; @@ -266,6 +287,10 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; dims[2].size = qParas + kParas + vParas + oParas; + if (quantization_type != DT_NONE) { + dims[2].size = get_quantization_to_byte_size( + data_type, quantization_type, dims[2].size); + } dims[2].degree = 1; dims[2].parallel_idx = -1; int seed = std::rand(); @@ -275,12 +300,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( #else ParameterSyncType comm_type = ParameterSyncType::PS; #endif - weights[0] = model.create_parallel_weight<3>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - comm_type); + weights[0] = model.create_parallel_weight<3>( + dims, + quantization_type == DT_NONE ? 
this->data_type : quantization_type, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); } if (bias) { ParallelDim dims[2]; @@ -329,6 +355,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( float _scaling_factor, bool _qk_prod_scaling, bool allocate_weights, + DataType _quantization_type, + bool _offload, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -348,7 +376,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling) + qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), + offload(_offload) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -375,6 +404,10 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; dims[2].size = qParas + kParas + vParas + oParas; + if (quantization_type != DT_NONE) { + dims[2].size = get_quantization_to_byte_size( + data_type, quantization_type, dims[2].size); + } int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); #ifdef USE_NCCL @@ -382,12 +415,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( #else ParameterSyncType comm_type = ParameterSyncType::PS; #endif - weights[0] = model.create_parallel_weight<3>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - comm_type); + weights[0] = model.create_parallel_weight<3>( + dims, + quantization_type == DT_NONE ? this->data_type : quantization_type, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + comm_type); } if (bias) { ParallelDim dims[2]; @@ -440,6 +474,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( other.scaling_factor, other.qk_prod_scaling, allocate_weights, + other.quantization_type, + other.offload, other.name) {} TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( @@ -464,6 +500,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.scaling_factor, params.qk_prod_scaling, allocate_weights, + params.quantization_type, + params.offload, name) {} void TreeIncMultiHeadSelfAttention::init_inference( @@ -494,11 +532,13 @@ void TreeIncMultiHeadSelfAttention::init_inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -596,11 +636,26 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + if (attn->offload) { + // cpu-offload enabled + // use offload_reserved_space + gpu_mem_allocator.register_reserved_work_space( + handle.offload_reserve_space, handle.offload_reserve_space_size); + } TreeIncMultiHeadSelfAttentionMeta *m = new TreeIncMultiHeadSelfAttentionMeta( - handle, attn, weight, gpu_mem, num_samples, num_heads); + handle, attn, weight, gpu_mem_allocator, num_samples, num_heads); + if (!attn->offload) { + // assert that we didn't over allocate memory + assert(gpu_mem_allocator.reserved_allocated_size == + gpu_mem_allocator.reserved_total_size); + } m->profiling = attn->profiling; - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); + + if (attn->quantization_type == DT_NONE) { + assert(weight.domain.get_volume() * data_type_size(weight.data_type) == + m->weightSize); + } return m; } @@ -641,11 +696,13 @@ FutureMap TreeIncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -654,11 +711,13 @@ FutureMap TreeIncMultiHeadSelfAttention::inference( batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); if (bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); @@ -1571,6 +1630,8 @@ size_t hash::operator()( hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); + hash_combine(key, params.quantization_type); + hash_combine(key, params.offload); return key; } }; // namespace std diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 6e63860cd0..eddf20cc08 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -61,7 +61,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads) : IncMultiHeadSelfAttentionMeta(handler, @@ -81,9 +81,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->add_bias_kv, attn->scaling_factor, weight, - gpu_mem, + gpu_mem_allocator, num_samples, - _num_heads), + _num_heads, + attn->quantization_type, + attn->offload), num_active_tokens(0) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index b591e19ed8..1458085d38 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -137,7 +137,7 @@ __global__ void update_tree_branch_kv_cache( (i / proj_size) % num_tokens_in_branch; // index in the tree branch int head_idx = i / (proj_size * num_tokens_in_branch); - token_idx += processed_tokens_in_batch; // get index in the whole batch + token_idx += processed_tokens_in_batch; // get index in the whole batch int qkv_block_size = (qProjSize + kProjSize + vProjSize) * total_tokens_in_batch; // skip over previous heads int current_head_block_size = @@ -459,6 +459,36 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { + // additional processing for weight uploading + // if (m->handle.offload_reserve_space != nullptr) { + // // Note that we update weight_ptr and bias_ptr when uploading weight and + // // bias + // cudaMemcpyAsync(m->weight_ptr, + // weight_ptr, + // m->weightSize, + // cudaMemcpyHostToDevice, + // stream); + // weight_ptr = static_cast
(m->weight_ptr); + // if (m->biasSize > 0) { + // cudaMemcpyAsync( + // m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, + // stream); + // bias_ptr = static_cast
(m->bias_ptr); + // } + // // reload weight_o for offloading case + // int parallelism = m->vProjSize * m->oProjSize * m->num_heads; + // build_w_out_tensor<<>>(weight_ptr, + // static_cast
(m->W_out_contiguous), + // m->vProjSize, + // m->oProjSize, + // m->num_heads, + // (m->qSize * m->qProjSize + + // m->kSize * m->kProjSize + + // m->vSize * m->vProjSize)); + // } // copy committed tokens info to GPU for the commit_tokens kernel // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing @@ -476,6 +506,11 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->num_active_tokens = bc->num_active_tokens(); // here because we need postion info in infernece 1 + if (m->offload && m->biasSize > 0) { + cudaMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); + bias_ptr = static_cast
(m->bias_ptr); + } cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), bc->MAX_NUM_TOKENS * @@ -522,32 +557,42 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - assert(input.data_type == weight.data_type); + // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); if (use_bias) { assert(input.data_type == bias.data_type); } if (input.data_type == DT_HALF) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::TreeIncMultiHeadAttention::inference_kernel(m, - bc, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + Kernels::TreeIncMultiHeadAttention::inference_kernel( + m, + bc, + input.get_half_ptr(), + m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); } else if (input.data_type == DT_FLOAT) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::TreeIncMultiHeadAttention::inference_kernel(m, - bc, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + Kernels::TreeIncMultiHeadAttention::inference_kernel( + m, + bc, + input.get_float_ptr(), + m->offload ? static_cast(m->weight_ptr) + : weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -570,7 +615,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, GenericTensorAccessorR const &weight, - Memory gpu_mem, + MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_heads) : IncMultiHeadSelfAttentionMeta(handler, @@ -590,9 +635,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->add_bias_kv, attn->scaling_factor, weight, - gpu_mem, + gpu_mem_allocator, num_samples, - _num_heads), + _num_heads, + attn->quantization_type, + attn->offload), num_active_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -601,30 +648,34 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { size_t committed_tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; - size_t totalSize = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); - - Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(totalSize - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(committed_token_reserve_inst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - committed_token_infos = - committed_token_reserve_inst - .pointer(0); + size_t total_size = committed_tokeninfo_size * + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); + if (offload) { + // assert that we have enough reserved work space left + assert(gpu_mem_allocator.reserved_total_size - + gpu_mem_allocator.reserved_allocated_size >= + total_size); + committed_token_infos = + gpu_mem_allocator + .allocate_reserved( + committed_tokeninfo_size); + } else { + gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, + total_size); + committed_token_infos = + gpu_mem_allocator + 
.allocate_instance( + committed_tokeninfo_size); + } } cudaStreamSynchronize(stream); } TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) { - committed_token_reserve_inst.destroy(); + if (committed_token_reserve_inst != Realm::RegionInstance::NO_INST) { + committed_token_reserve_inst.destroy(); + } } }; // namespace FlexFlow diff --git a/src/runtime/accessor.cc b/src/runtime/accessor.cc index 100262e85a..d3b94bf14a 100644 --- a/src/runtime/accessor.cc +++ b/src/runtime/accessor.cc @@ -77,6 +77,15 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } } +char const *GenericTensorAccessorR::get_byte_ptr() const { + if (data_type == DT_INT4 || data_type == DT_INT8) { + return static_cast(ptr); + } else { + assert(false && "Invalid Accessor Type"); + return static_cast(nullptr); + } +} + template TensorAccessorW::TensorAccessorW(PhysicalRegion region, RegionRequirement req, @@ -156,6 +165,15 @@ half *GenericTensorAccessorW::get_half_ptr() const { } } +char *GenericTensorAccessorW::get_byte_ptr() const { + if (data_type == DT_INT4 || data_type == DT_INT8) { + return static_cast(ptr); + } else { + assert(false && "Invalid Accessor Type"); + return static_cast(nullptr); + } +} + template const DT *helperGetTensorPointerRO(PhysicalRegion region, RegionRequirement req, @@ -261,6 +279,14 @@ GenericTensorAccessorR ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); break; } + case DT_INT4: { + ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); + break; + } + case DT_INT8: { + ptr = helperGetTensorPointerRO(region, req, fid, ctx, runtime); + break; + } default: { assert(false); } @@ -299,6 +325,14 @@ GenericTensorAccessorW ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); break; } + case DT_INT4: { + ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); + break; + } + case DT_INT8: { + ptr = helperGetTensorPointerWO(region, req, fid, ctx, runtime); + break; + } default: { assert(false); } @@ -337,6 +371,14 @@ GenericTensorAccessorW ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); break; } + case DT_INT4: { + ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); + break; + } + case DT_INT8: { + ptr = helperGetTensorPointerRW(region, req, fid, ctx, runtime); + break; + } default: { assert(false); } @@ -345,11 +387,13 @@ GenericTensorAccessorW } #define DIMFUNC(DIM) \ + template class TensorAccessorR; \ template class TensorAccessorR; \ template class TensorAccessorR; \ template class TensorAccessorR; \ template class TensorAccessorR; \ template class TensorAccessorR; \ + template class TensorAccessorW; \ template class TensorAccessorW; \ template class TensorAccessorW; \ template class TensorAccessorW; \ @@ -373,6 +417,22 @@ template half *helperGetTensorPointerWO(PhysicalRegion region, Context ctx, Runtime *runtime); +template char const *helperGetTensorPointerRO(PhysicalRegion region, + RegionRequirement req, + FieldID fid, + Context ctx, + Runtime *runtime); +template char *helperGetTensorPointerRW(PhysicalRegion region, + RegionRequirement req, + FieldID fid, + Context ctx, + Runtime *runtime); +template char *helperGetTensorPointerWO(PhysicalRegion region, + RegionRequirement req, + FieldID fid, + Context ctx, + Runtime *runtime); + template float const *helperGetTensorPointerRO(PhysicalRegion region, RegionRequirement req, FieldID fid, diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 63b625edac..d2b68595bd 100644 --- a/src/runtime/ffconst_utils.cc +++ 
b/src/runtime/ffconst_utils.cc @@ -216,6 +216,15 @@ size_t data_type_size(DataType type) { } } +size_t get_quantization_to_byte_size(DataType type, + DataType quantization_type, + size_t num_elements) { + assert(quantization_type == DT_INT4 || quantization_type == DT_INT8); + return (num_elements / (quantization_type == DT_INT4 ? 2 : 1)) + + (num_elements / INT4_NUM_OF_ELEMENTS_PER_GROUP) * 2 * + data_type_size(type); +} + std::ostream &operator<<(std::ostream &s, OperatorType op_type) { s << get_operator_type_name(op_type); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index c43fe140b9..e8a1b6f9f1 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2279,6 +2279,8 @@ GraphOptimalViewSerialized sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); + sez.serialize(attn->quantization_type); + sez.serialize(attn->offload); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2315,6 +2317,8 @@ GraphOptimalViewSerialized sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); + sez.serialize(attn->quantization_type); + sez.serialize(attn->offload); break; } case OP_INC_MULTIQUERY_SELF_ATTENTION: { @@ -2694,7 +2698,8 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_heads, k_dim, v_dim; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling; + scaling_query, qk_prod_scaling, offload; + DataType quantization_type; size_t id; dez.deserialize(id); LayerID layer_guid(id); @@ -2710,6 +2715,8 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); + dez.deserialize(quantization_type); + dez.deserialize(offload); IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2725,6 +2732,8 @@ void FFModel::deserialize_graph_optimal_view( params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; + params.quantization_type = quantization_type; + params.offload = offload; node = get_or_create_node(inputs[0], params); break; } @@ -2773,7 +2782,8 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_heads, k_dim, v_dim; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling; + scaling_query, qk_prod_scaling, offload; + DataType quantization_type; size_t id; dez.deserialize(id); LayerID layer_guid(id); @@ -2789,6 +2799,8 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); + dez.deserialize(quantization_type); + dez.deserialize(offload); TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2804,6 +2816,8 @@ void FFModel::deserialize_graph_optimal_view( params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; + params.quantization_type = quantization_type; + params.offload = offload; node = get_or_create_node(inputs[0], params); break; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 5a5c57bfea..c7fe4d6a82 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -26,6 +26,7 @@ namespace FlexFlow { using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); 
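For reference, the byte count returned by get_quantization_to_byte_size() above is the packed weight values (two DT_INT4 values per byte, one DT_INT8 value per byte) plus one scaling factor and one offset per quantization group, both stored in the original data type. A minimal standalone sketch of the same arithmetic follows; the group size is taken as a parameter because only the name of the INT4_NUM_OF_ELEMENTS_PER_GROUP constant, not its value, appears in this patch.

    #include <cstddef>

    // Illustrative restatement of the formula above, not part of the patch.
    // group_size stands in for INT4_NUM_OF_ELEMENTS_PER_GROUP, and elem_size
    // for data_type_size(type) of the original (unquantized) data type.
    size_t quantized_byte_size(size_t num_elements, bool is_int4,
                               size_t group_size, size_t elem_size) {
      size_t packed_values = num_elements / (is_int4 ? 2 : 1);
      size_t group_params = (num_elements / group_size) * 2 * elem_size;
      return packed_values + group_params;
    }

    // Example: 8192 half-precision weights quantized to DT_INT4 with
    // 64-element groups -> 8192 / 2 + (8192 / 64) * 2 * 2 = 4608 bytes.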
+LegionRuntime::Logger::Category log_offload("Offloading"); InferenceManager::InferenceManager(FFConfig const &_config, int _max_num_tokens_per_batch, @@ -45,6 +46,18 @@ InferenceManager::InferenceManager(FFConfig const &_config, } } +bool parallel_tensor_list_overlaps(std::vector const &list1, + std::vector const &list2) { + for (auto const &pt1 : list1) { + for (auto const &pt2 : list2) { + if (pt1 == pt2) { + return true; + } + } + } + return false; +} + void InferenceManager::compile_model_and_allocate_buffer( FFModel *model, std::unordered_map> const @@ -61,7 +74,8 @@ void InferenceManager::compile_model_and_allocate_buffer( assert(pt->owner_op != nullptr); mapping[pt->owner_op] = it.second; } - for (auto const &op : model->operators) { + for (int op_idx = 0; op_idx < model->operators.size(); op_idx++) { + Op const *op = model->operators[op_idx]; // Skip weight operators if (op->op_type == OP_WEIGHT) { continue; @@ -99,20 +113,70 @@ void InferenceManager::compile_model_and_allocate_buffer( ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); std::vector list; - for (int j = 0; j < max_num_inflight_batches; j++) { - // Copy the metadata from pt_base to pt - ParallelTensor pt = new ParallelTensorBase(*pt_base); - pt->region = - runtime->create_logical_region(ctx, - pt_base->region.get_index_space(), - pt_base->region.get_field_space()); - pt->part = runtime->get_logical_partition( - ctx, pt->region, pt_base->part.get_index_partition()); - pt->machine_view = machine_views[j]; - Domain part_domain = - runtime->get_index_space_domain(ctx, pt_base->parallel_is); - assert(pt->machine_view.get_domain() == part_domain); - list.push_back(pt); + bool found_parallel_tensor = false; + if (model->cpu_offload) { + for (auto const &pre_pt : tensor_buffer) { + bool used_by_future_operator = false; + bool used_by_current_operator = false; + if (pre_pt.first->get_shape() != pt_base->get_shape()) { + // Continue if shape mismatches + continue; + } + // Check that pt cannot be used as an input to the current operator + for (int j = 0; j < op->numInputs; j++) { + if (parallel_tensor_list_overlaps(tensor_buffer[op->inputs[j]], + pre_pt.second)) { + used_by_current_operator = true; + } + } + for (int j = 0; j < i; j++) { + assert(tensor_buffer.find(op->outputs[j]) != tensor_buffer.end()); + if (parallel_tensor_list_overlaps(tensor_buffer[op->outputs[j]], + pre_pt.second)) { + used_by_current_operator = true; + } + } + // Check that pt cannot be used by any subsequent operators + for (int op_idx2 = op_idx; op_idx2 < model->operators.size(); + op_idx2++) { + Op const *op2 = model->operators[op_idx2]; + for (int j = 0; j < op2->numInputs; j++) { + if (tensor_buffer.find(op2->inputs[j]) != tensor_buffer.end()) { + if (parallel_tensor_list_overlaps(tensor_buffer[op2->inputs[j]], + pre_pt.second)) { + used_by_future_operator = true; + } + } + } + } + if (!used_by_future_operator && !used_by_current_operator) { + found_parallel_tensor = true; + list = pre_pt.second; + } + } + if (!found_parallel_tensor) { + log_offload.print( + "Cannot find a previous tensor for operator(%d) output_idx(%d)", + op_idx, + i); + } + } + if (!found_parallel_tensor) { + for (int j = 0; j < max_num_inflight_batches; j++) { + // Copy the metadata from pt_base to pt + ParallelTensor pt = new ParallelTensorBase(*pt_base); + pt->region = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part = runtime->get_logical_partition( + 
ctx, pt->region, pt_base->part.get_index_partition()); + pt->machine_view = machine_views[j]; + Domain part_domain = + runtime->get_index_space_domain(ctx, pt_base->parallel_is); + assert(pt->machine_view.get_domain() == part_domain); + list.push_back(pt); + } } assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); tensor_buffer[pt_base] = list; diff --git a/src/runtime/memory_allocator.cc b/src/runtime/memory_allocator.cc new file mode 100644 index 0000000000..06a7c468a4 --- /dev/null +++ b/src/runtime/memory_allocator.cc @@ -0,0 +1,54 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/utils/memory_allocator.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; +using Realm::RegionInstance; + +MemoryAllocator::MemoryAllocator(Memory _memory) + : memory(_memory), reserved_ptr(nullptr), instance_ptr(nullptr), + reserved_total_size(0), reserved_allocated_size(0), + instance_total_size(0), instance_allocated_size(0) {} + +void MemoryAllocator::create_legion_instance(RegionInstance &inst, + size_t size) { + // Assert that we have used up previously created region instance + assert(instance_total_size == instance_allocated_size); + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance( + inst, memory, bounds, field_sizes, 0, Realm::ProfilingRequestSet()) + .wait(); + instance_ptr = inst.pointer_untyped(0, 0); + instance_total_size = size; + instance_allocated_size = 0; +} + +void MemoryAllocator::register_reserved_work_space(void *base, size_t size) { + // Assert that we haven't allocated anything before + assert(reserved_total_size == 0); + reserved_ptr = base; + reserved_total_size = size; + reserved_allocated_size = 0; +} + +}; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 8d1133f7c9..e6e432040f 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1271,6 +1271,9 @@ FFRuntime::FFRuntime(FFConfig &config) { // info.myRank = rank++; // info.allRanks = config.workersPerNode * config.numNodes; info.workSpaceSize = config.workSpaceSize; + info.offload_reserve_space_size = + config.cpu_offload ? 
config.offload_reserve_space_size : 0; + info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); } @@ -1294,7 +1297,7 @@ FFRuntime::FFRuntime(FFConfig &config) { FFRuntime *ffruntime_singleton = nullptr; -FFModel::FFModel(FFConfig &_config) +FFModel::FFModel(FFConfig &_config, bool cpu_offload) : op_global_guid(OP_GUID_FIRST_VALID), layer_global_guid(LAYER_GUID_FIRST_VALID), tensor_global_guid(TENSOR_GUID_FIRST_VALID), @@ -1303,6 +1306,7 @@ FFModel::FFModel(FFConfig &_config) loss_op(NULL), metrics_op(NULL), simulator(NULL) { this->search = new PCG::SearchHelper(this); this->graph_search = new PCG::GraphSearchHelper(this); + this->cpu_offload = cpu_offload; if (ffruntime_singleton == nullptr) { ffruntime_singleton = new FFRuntime(_config); @@ -1715,6 +1719,12 @@ void FFModel::map_tensor_with_dim2(ParallelTensor tensor, case DT_INT64: allocator.allocate_field(sizeof(int64_t), FID_DATA); break; + case DT_INT4: + allocator.allocate_field(sizeof(char), FID_DATA); + break; + case DT_INT8: + allocator.allocate_field(sizeof(char), FID_DATA); + break; default: assert(false); } @@ -3648,9 +3658,12 @@ struct DefaultConfig { const static int cpusPerNode = 0; const static size_t searchBudget = -1; const static size_t simulatorWorkSpaceSize = - (size_t)2 * 1024 * 1024 * 1024; // 2GB + (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; const static bool searchOverlapBackwardUpdate = false; + const static size_t offloadReserveSpaceSize = + (size_t)8 * 1024 * 1024 * 1024; // 8 GB + const static bool cpuOffload = false; const static bool onlyDataParallel = true; const static bool enableSampleParallel = true; const static bool enableParameterParallel = false; @@ -3682,6 +3695,9 @@ FFConfig::FFConfig() { search_alpha = DefaultConfig::searchAlpha; search_overlap_backward_update = DefaultConfig::searchOverlapBackwardUpdate; computationMode = COMP_MODE_TRAINING; + cpu_offload = DefaultConfig::cpuOffload; + offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; + quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; enable_sample_parallel = DefaultConfig::enableSampleParallel; enable_parameter_parallel = DefaultConfig::enableParameterParallel; @@ -3775,6 +3791,22 @@ void FFConfig::parse_args(char **argv, int argc) { export_strategy_file = std::string(argv[++i]); continue; } + if ((!strcmp(argv[i], "-offload"))) { + cpu_offload = true; + continue; + } + if (!strcmp(argv[i], "-offload-reserve-space-size")) { + offload_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } + if ((!strcmp(argv[i], "--4bit-quantization"))) { + quantization_type = DT_INT4; + continue; + } + if ((!strcmp(argv[i], "--8bit-quantization"))) { + quantization_type = DT_INT8; + continue; + } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index e07a7465a9..17401a0f14 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -86,6 +86,8 @@ FFHandler printf("workSpaceSize (%zu MB)\n", info->workSpaceSize / 1024 / 1024); FFHandler handle; handle.workSpaceSize = info->workSpaceSize; + handle.offload_reserve_space_size = info->offload_reserve_space_size; + handle.quantization_type = info->quantization_type; handle.allowTensorOpMathConversion = info->allowTensorOpMathConversion; checkCUDA(cublasCreate(&handle.blas)); if 
(handle.allowTensorOpMathConversion) { @@ -125,6 +127,31 @@ FFHandler .wait(); handle.workSpace = workspaceInst.pointer_untyped(0, sizeof(char)); } + if (handle.offload_reserve_space_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.offload_reserve_space = + workspaceInst.pointer_untyped(0, sizeof(char)); + } else { + handle.offload_reserve_space = nullptr; + } + // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 0fb98e05ea..7bce941c68 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -841,6 +841,14 @@ template bool ParallelTensorBase::set_tensor(FFModel const *ff, template bool ParallelTensorBase::get_tensor(FFModel const *ff, half *data, bool get_gradients); + +template bool ParallelTensorBase::set_tensor(FFModel const *ff, + std::vector const &dims, + char const *data); +template bool ParallelTensorBase::get_tensor(FFModel const *ff, + char *data, + bool get_gradients); + template bool ParallelTensorBase::set_tensor( FFModel const *ff, std::vector const &dims, float const *data); template bool ParallelTensorBase::get_tensor(FFModel const *ff, From 0f3be1f05910d91b043ec44cc56c40e5254358c2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 25 Jun 2023 22:45:19 +0800 Subject: [PATCH 154/344] [Inference] Tensor model parallelism (#778) * add parallel operators * add cmd line param * setting machine views * move bias blocks * comment out print of partitions * add unimplemented methods * add impl of inference functions to replicate and reduce ops * replicate bias in file loader * fixes, now works * only add bias once * load and use weights according to partition * fix wout weight * cleanup * add support for mixed precision in parallel ops * cleanup * rocm build fix * hip rocm fix 2 * fix machine views * fix rocm build * adjust numbe of pipeline stages * add model parallelism to opt linear layers * fix * fxi multi gpu test * fix * add tensor parallelism tests to inference test script * enable tensor parallelism for dense layers in llama * fix * fix set_tensor-related issues * fix and linting --- .github/workflows/gpu-ci-skip.yml | 2 +- .github/workflows/gpu-ci.yml | 9 +- config/config.linux | 2 +- include/flexflow/config.h | 2 + include/flexflow/operator.h | 1 + .../ops/inc_multihead_self_attention.h | 4 +- .../inc_multihead_self_attention_kernels.h | 2 + include/flexflow/ops/kernels/linear_kernels.h | 2 +- .../ops/spec_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + .../parallel_ops/kernels/reduction_kernels.h | 8 + .../parallel_ops/kernels/replicate_kernels.h | 8 + include/flexflow/parallel_ops/reduction.h | 17 ++ include/flexflow/parallel_ops/replicate.h | 24 +++ inference/file_loader.cc | 70 ++++++-- inference/incr_decoding/incr_decoding.cc | 25 ++- inference/spec_infer/spec_infer.cc | 25 ++- src/ops/attention.cc | 14 +- 
src/ops/experts.cc | 9 +- src/ops/inc_multihead_self_attention.cc | 108 +++++------ src/ops/inc_multihead_self_attention.cpp | 2 + src/ops/inc_multihead_self_attention.cu | 44 +++-- src/ops/inc_multiquery_self_attention.cc | 14 +- src/ops/inc_multiquery_self_attention.cu | 30 ++-- src/ops/layer_norm.cc | 9 +- src/ops/linear.cc | 24 ++- src/ops/spec_inc_multihead_self_attention.cc | 105 +++++------ src/ops/spec_inc_multihead_self_attention.cpp | 2 + src/ops/spec_inc_multihead_self_attention.cu | 18 +- src/ops/tree_inc_multihead_self_attention.cc | 107 +++++------ src/ops/tree_inc_multihead_self_attention.cpp | 2 + src/ops/tree_inc_multihead_self_attention.cu | 18 +- .../kernels/reduction_kernels.cpp | 12 ++ src/parallel_ops/kernels/reduction_kernels.cu | 12 ++ .../kernels/replicate_kernels.cpp | 7 + src/parallel_ops/kernels/replicate_kernels.cu | 7 + src/parallel_ops/reduction.cc | 169 +++++++++++++++++- src/parallel_ops/replicate.cc | 147 +++++++++++++-- src/runtime/cuda_helper.cu | 4 + src/runtime/inference_manager.cc | 49 +++++ src/runtime/model.cc | 76 +++++++- src/runtime/parallel_tensor.cc | 9 +- src/runtime/request_manager.cc | 4 +- tests/inference_tests.sh | 42 +++++ 44 files changed, 931 insertions(+), 316 deletions(-) diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index 766bd8d790..a9b5132ef6 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -45,7 +45,7 @@ jobs: gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests runs-on: ubuntu-20.04 - if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} + # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} needs: inference-tests steps: - run: 'echo "No gpu-ci required"' diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 9797670c77..7f83fb2691 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -185,8 +185,13 @@ jobs: gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests runs-on: self-hosted - #skip this time-consuming test for PRs to the inference branch - if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} + # skip this time-consuming test for PRs to the inference branch + # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" needs: inference-tests container: image: ghcr.io/flexflow/flexflow-environment-cuda:latest diff --git a/config/config.linux b/config/config.linux index 482a154145..8039402a13 100755 --- a/config/config.linux +++ b/config/config.linux @@ -72,7 +72,7 @@ FF_USE_AVX2=${FF_USE_AVX2:-OFF} FF_MAX_DIM=${FF_MAX_DIM:-5} # set LEGION_MAX_RETURN_SIZE -LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-65536} +LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-131072} # set ROCM path ROCM_PATH=${ROCM_PATH:-"/opt/rocm"} diff --git a/include/flexflow/config.h b/include/flexflow/config.h index f5eb2e069a..f7c59f7b58 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -143,6 +143,8 @@ class FFConfig { bool enable_parameter_parallel; bool enable_attribute_parallel; bool enable_inplace_optimizations; + // Control tensor model parallelism degree in inference + int tensor_parallelism_degree; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion; std::string dataset_path; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h 
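
PR #778 adds `tensor_parallelism_degree` to FFConfig together with a `-tensor-parallelism-degree` command-line flag; as the incr_decoding.cc and spec_infer.cc hunks later in this patch show, the device count handed to the model builders becomes `workersPerNode * numNodes / tensor_parallelism_degree`, so the remaining (data/pipeline) parallelism operates across tensor-parallel groups. A small arithmetic sketch, assuming the config field names from the diff (the derived variable name is illustrative):

// Sketch of the worker-count arithmetic used in incr_decoding.cc / spec_infer.cc.
int workersPerNode = 4, numNodes = 1;              // e.g. one node with 4 GPUs
int tensor_parallelism_degree = 2;                 // -tensor-parallelism-degree 2
int num_devices = workersPerNode * numNodes;       // 4
// value passed to the model builder in place of num_devices:
int devices_after_tp_split = num_devices / tensor_parallelism_degree; // 2
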
index 122850ec05..13b2bb8ba8 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -294,6 +294,7 @@ class Op { std::map inference_meta; int numInputs, numWeights, numOutputs; bool profiling; + bool add_bias_only_once; #ifdef FF_USE_NCCL ncclUniqueId ncclId; #endif diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index a178dad577..b5d441713f 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -105,6 +105,7 @@ class IncMultiHeadSelfAttention : public Op { static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, @@ -150,6 +151,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, + int _global_num_heads, int _num_heads, DataType _quantization_type, bool _offload); @@ -160,7 +162,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { size_t weights_params, weightSize, biasSize, reserveSpaceSize, quantized_weightSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int num_heads; + int global_num_heads, num_heads; bool *has_load_weights; bool *apply_rotary_embedding; bool *bias; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index a35cf9d7f2..5b40136524 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -28,6 +28,7 @@ __global__ void apply_proj_bias_w(DT *input_ptr, template __global__ void apply_proj_bias_qkv(DT *input_ptr, DT const *bias_ptr, + int shard_id, int num_tokens, int qProjSize, int kProjSize, @@ -53,6 +54,7 @@ __global__ void template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, + int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index ed4864b1ab..9644fd9c8f 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -32,7 +32,7 @@ class LinearMeta : public OpMeta { ActiMode activation; RegularizerMode kernel_reg_type; float kernel_reg_lambda; - bool use_bias; + bool use_bias, add_bias_only_once; DataType input_type, weight_type, output_type; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index db12be5792..eef684cdb7 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -105,6 +105,7 @@ class SpecIncMultiHeadSelfAttention : public Op { static void inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, + int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 328ab128b2..a9e584aa2b 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ 
b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -104,6 +104,7 @@ class TreeIncMultiHeadSelfAttention : public Op { static void inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, + int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, diff --git a/include/flexflow/parallel_ops/kernels/reduction_kernels.h b/include/flexflow/parallel_ops/kernels/reduction_kernels.h index e9f6a9d070..51ddced227 100644 --- a/include/flexflow/parallel_ops/kernels/reduction_kernels.h +++ b/include/flexflow/parallel_ops/kernels/reduction_kernels.h @@ -3,8 +3,16 @@ #include "flexflow/device.h" #include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/reduction.h" namespace FlexFlow { + +class ReductionMeta : public OpMeta { +public: + ReductionMeta(FFHandler handle, Reduction const *reduct); +}; + namespace Kernels { namespace Reduction { diff --git a/include/flexflow/parallel_ops/kernels/replicate_kernels.h b/include/flexflow/parallel_ops/kernels/replicate_kernels.h index 619d06efef..d5d52797c3 100644 --- a/include/flexflow/parallel_ops/kernels/replicate_kernels.h +++ b/include/flexflow/parallel_ops/kernels/replicate_kernels.h @@ -3,8 +3,16 @@ #include "flexflow/device.h" #include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/replicate.h" namespace FlexFlow { + +class ReplicateMeta : public OpMeta { +public: + ReplicateMeta(FFHandler handle, Replicate const *repl); +}; + namespace Kernels { namespace Replicate { diff --git a/include/flexflow/parallel_ops/reduction.h b/include/flexflow/parallel_ops/reduction.h index fed5f049c7..1918c3b587 100644 --- a/include/flexflow/parallel_ops/reduction.h +++ b/include/flexflow/parallel_ops/reduction.h @@ -25,12 +25,29 @@ class Reduction : public ParallelOp { Input const input, char const *name = nullptr); void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) override; void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 381f690cdc..f8f2c42559 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -10,6 +10,8 @@ namespace FlexFlow { +class ReplicateMeta; + class Replicate : public ParallelOp { public: using Params = ReplicateParams; @@ -25,12 +27,29 @@ class Replicate : public ParallelOp { Input const input, char const *name = nullptr); void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector 
const &batch_outputs) override; void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -39,6 +58,11 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void forward_kernel_wrapper(ReplicateMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + size_t num_elements, + size_t num_replicas); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index a7386d6597..071124fc0d 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -84,7 +84,11 @@ void load_attention_bias(DT *ptr, int file_index = 0; for (auto file : bias_files) { - size_t partial_size = hidden_dim; + size_t qkv_partial_size = qkv_inner_dim * num_heads; + size_t out_partial_size = hidden_dim; + size_t partial_size = + (file_index < 3) ? qkv_partial_size : out_partial_size; + // std::cout << "Loading filename: " << file << std::endl; std::ifstream in(file, std::ios::in | std::ios::binary); assert(in.good() && "incorrect bias file path"); std::vector
host_array(partial_size); @@ -95,15 +99,18 @@ void load_attention_bias(DT *ptr, size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { - std::cout << "load bias data error"; - return; + printf( + "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", + in_get_size, + loaded_data_size); + assert(false); } assert(partial_size == host_array.size()); size_t data_index = 0; - for (int i = 0; i < hidden_dim; i++) { - ptr[file_index * hidden_dim + i] = host_array.at(data_index); + for (int i = 0; i < partial_size; i++) { + ptr[file_index * qkv_partial_size + i] = host_array.at(data_index); data_index++; } @@ -183,7 +190,7 @@ void load_attention_weights(DT *ptr, std::string o_file = weight_path + layer_name.substr(0, layer_name.find("attention")) + "attention_wo_weight"; - std::vector weight_files = {q_file, k_file, v_file, o_file}; + std::vector weight_files = {q_file, k_file, v_file}; int file_index = 0; @@ -193,7 +200,7 @@ void load_attention_weights(DT *ptr, size_t one_weight_file_size = num_heads * single_proj_size; // size of each of Q/K/V/O for all heads - // q, k, v, o -> 0, 1, 2, 3 + // q, k, v -> 0, 1, 2 for (auto file : weight_files) { size_t partial_size = one_weight_file_size; @@ -210,23 +217,60 @@ void load_attention_weights(DT *ptr, size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { - std::cout << "load data error"; + std::cout << "load data error" << std::endl; + assert(false); return; } assert(partial_size == host_array.size()); - size_t one_head_size = hidden_dim * (hidden_dim / num_heads); size_t data_index = 0; - for (int i = 0; i < num_heads; i++) { - size_t start_index = i * one_head_size * 4 + file_index * one_head_size; - for (size_t j = start_index; j < start_index + one_head_size; j++) { + size_t start_index = + i * single_proj_size * 4 + file_index * single_proj_size; + for (size_t j = start_index; j < start_index + single_proj_size; j++) { ptr[j] = host_array.at(data_index); data_index += 1; } } + assert(data_index == partial_size); file_index++; + in.close(); + } + // output weight file gets special treatment + { + std::ifstream in(o_file, std::ios::in | std::ios::binary); + std::cout << "Loading attention filename: " << o_file << std::endl; + if (!in.good()) { + std::cout << "Could not open file: " << o_file << std::endl; + } + assert(in.good() && "incorrect weight file path"); + size_t full_output_weight_size = num_heads * single_proj_size; + std::vector
host_array(full_output_weight_size); + size_t loaded_data_size = sizeof(DT) * full_output_weight_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error" << std::endl; + assert(false); + } + assert(full_output_weight_size == host_array.size()); + + for (int i = 0; i < num_heads; i++) { + size_t start_index = i * single_proj_size * 4 + 3 * single_proj_size; + for (size_t j = 0; j < single_proj_size; j++) { + int ff_row_idx = j % hidden_dim; + int ff_col_idx = j / hidden_dim; + assert(ff_row_idx < hidden_dim && ff_col_idx < qkv_inner_dim); + size_t data_index = ff_row_idx * (qkv_inner_dim * num_heads) + + qkv_inner_dim * i + ff_col_idx; + ptr[j + start_index] = host_array.at(data_index); + } + } + in.close(); } } @@ -248,7 +292,7 @@ void load_from_file(DT *ptr, size_t size, std::string filename) { if (in_get_size != loaded_data_size) { std::cout << "load weight data error " << in_get_size << ", " << loaded_data_size << ", " << sizeof(DT) << std::endl; - return; + assert(false); } assert(size == host_array.size()); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index b59586de07..a281f52853 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -37,7 +37,8 @@ void parse_input_args(char **argv, FilePaths &paths, ModelType &llm_model_type, bool &use_full_precision, - bool &verbose) { + bool &verbose, + int &tensor_parallelism_degree) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -82,6 +83,11 @@ void parse_input_args(char **argv, paths.output_file_path = std::string(argv[++i]); continue; } + // tensor parallelism degree + if (!strcmp(argv[i], "-tensor-parallelism-degree")) { + tensor_parallelism_degree = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; @@ -106,12 +112,19 @@ void FlexFlow::top_level_task(Task const *task, ModelType model_type; bool use_full_precision = false; bool verbose = false; + int tensor_parallelism_degree = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args( - argv, argc, file_paths, model_type, use_full_precision, verbose); + parse_input_args(argv, + argc, + file_paths, + model_type, + use_full_precision, + verbose, + tensor_parallelism_degree); + ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -128,7 +141,8 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes, + ffconfig.workersPerNode * ffconfig.numNodes / + tensor_parallelism_degree, INC_DECODING_MODE, use_full_precision); } else if (model_type == ModelType::OPT) { @@ -136,7 +150,8 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes, + ffconfig.workersPerNode * ffconfig.numNodes / + tensor_parallelism_degree, INC_DECODING_MODE, use_full_precision); } else if (model_type == ModelType::FALCON) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 
ec0b222075..72666ed312 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -43,7 +43,8 @@ void parse_input_args(char **argv, FilePaths &paths, ModelTypes &model_types, bool &use_full_precision, - bool &verbose) { + bool &verbose, + int &tensor_parallelism_degree) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -114,6 +115,11 @@ void parse_input_args(char **argv, paths.output_file_path = std::string(argv[++i]); continue; } + // tensor parallelism degree + if (!strcmp(argv[i], "-tensor-parallelism-degree")) { + tensor_parallelism_degree = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; @@ -135,12 +141,19 @@ void FlexFlow::top_level_task(Task const *task, ModelTypes model_types; bool use_full_precision = false; bool verbose = false; + int tensor_parallelism_degree = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args( - argv, argc, file_paths, model_types, use_full_precision, verbose); + parse_input_args(argv, + argc, + file_paths, + model_types, + use_full_precision, + verbose, + tensor_parallelism_degree); + ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; if (file_paths.ssm_weight_file_paths.size() == 0) { assert(false && @@ -178,7 +191,8 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes, + ffconfig.workersPerNode * ffconfig.numNodes / + tensor_parallelism_degree, TREE_VERIFY_MODE, use_full_precision); } else if (model_types.llm_model_type == ModelType::OPT) { @@ -186,7 +200,8 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes, + ffconfig.workersPerNode * ffconfig.numNodes / + tensor_parallelism_degree, TREE_VERIFY_MODE, use_full_precision); } else { diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 8494981cf6..ca709bdc51 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -220,17 +220,12 @@ MultiHeadAttention::MultiHeadAttention(FFModel &model, dims[2].parallel_idx = -1; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>(dims, DT_FLOAT, NULL /*owner_op*/, true /*create_grad*/, initializer, - comm_type); + CHOSEN_SYNC_TYPE); } outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -307,17 +302,12 @@ MultiHeadAttention::MultiHeadAttention(FFModel &model, dims[2].size = qParas + kParas + vParas + oParas; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>(dims, DT_FLOAT, NULL /*owner_op*/, true /*create_grad*/, initializer, - comm_type); + CHOSEN_SYNC_TYPE); } outputs[0] = model.create_parallel_tensor_legion_ordering( _query->num_dims, dims, DT_FLOAT, this); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 8ec77131a9..77cd748f9c 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -340,11 +340,6 @@ Experts::Experts(FFModel 
&model, assert(outputs[0] != nullptr); if (allocate_weights) { -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif { ParallelDim dims[3]; int nparams = (experts_num_layers == 1) @@ -367,7 +362,7 @@ Experts::Experts(FFModel &model, NULL /*owner_op*/, true /*create_grad*/, kernel_initializer, - comm_type); + CHOSEN_SYNC_TYPE); assert(weights[0] != nullptr); } if (use_bias) { @@ -391,7 +386,7 @@ Experts::Experts(FFModel &model, NULL /*owner_op*/, true /*create_grad*/, bias_initializer, - comm_type); + CHOSEN_SYNC_TYPE); assert(weights[1] != nullptr); } } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index b2528a7c14..765b3c5bfc 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -110,17 +110,16 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int one_head_size = qParas + kParas + vParas + oParas; { - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int one_head_size = qParas + kParas + vParas + oParas; - // compress the weight size if quantization. if (quantization_type != DT_NONE) { one_head_size = get_quantization_to_byte_size( @@ -138,7 +137,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, } if (bias) { // q, k, v, o - int dims[1] = {embed_dim * 4}; + int dims[1] = {(qProjSize + kProjSize + vProjSize) * num_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -296,38 +295,27 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[2].parallel_idx = -1; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>( dims, quantization_type == DT_NONE ? 
this->data_type : quantization_type, nullptr /*owner_op*/, true /*create_grad*/, initializer, - comm_type); - } - if (bias) { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[0] = inputs[0]->dims[num_dims - 1]; - dims[0].size = dims[0].degree; - dims[1].size = oProjSize * 4; - dims[1].degree = 1; - dims[1].parallel_idx = -1; -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif - weights[1] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - NULL, - comm_type); + CHOSEN_SYNC_TYPE); + if (bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + bias_shape.dims[0].size = + (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } } outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -411,37 +399,29 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( } int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>( dims, quantization_type == DT_NONE ? this->data_type : quantization_type, NULL /*owner_op*/, true /*create_grad*/, initializer, - comm_type); - } - if (bias) { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[0] = inputs[0]->dims[num_dims - 1]; - dims[0].size = dims[0].degree; - dims[1].size = oProjSize * 4; -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif - weights[1] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - NULL, - comm_type); + CHOSEN_SYNC_TYPE); + if (bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + bias_shape.dims[0].size = + (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } } + outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); @@ -758,7 +738,7 @@ void IncMultiHeadSelfAttention::inference_task( runtime); Domain bias_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 2); + assert(bias_domain.get_dim() == 4); } Domain input_domain = runtime->get_index_space_domain( @@ -772,8 +752,10 @@ void IncMultiHeadSelfAttention::inference_task( assert(weight_domain.get_dim() == 3); assert(output_domain.get_dim() == 4); + assert(task->index_point.get_dim() == 1); + IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, bc, input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, weight, output, biases); #ifdef INFERENCE_TESTS printf("Checking IncMultiHeadSelfAttention computations...\n"); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 669cbd2636..9fce37fc30 100644 --- 
a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -27,6 +27,7 @@ using Legion::Memory; void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, @@ -90,6 +91,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, + int _global_num_heads, int _num_heads, DataType _quantization_type, bool _offload) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 408c1ab012..991b6d2236 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -38,12 +38,16 @@ __global__ void build_w_out_tensor(DT const *weight_ptr, int num_heads, int qkv_weight_block_size) { CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { - int row_idx = i % vProjSize; - int col_idx = (i / vProjSize) % oProjSize; - int head_idx = i / (vProjSize * oProjSize); - contiguous_weight_ptr[i] = - weight_ptr[head_idx * (qkv_weight_block_size + vProjSize * oProjSize) + - qkv_weight_block_size + col_idx * vProjSize + row_idx]; + // Each slice (one per head) in the weight_ptr has shape (oProjSize, + // vProjSize) + int row_idx = i % oProjSize; + int col_idx = (i / oProjSize) % vProjSize; + int head_idx = i / (oProjSize * vProjSize); + // The contiguous_weight_ptr has shape (vProjSize * num_heads, oProjSize) + int idx = row_idx * vProjSize * num_heads + vProjSize * head_idx + col_idx; + contiguous_weight_ptr[idx] = + weight_ptr[(qkv_weight_block_size + vProjSize * oProjSize) * head_idx + + qkv_weight_block_size + col_idx * oProjSize + row_idx]; } } @@ -61,10 +65,12 @@ __global__ void apply_proj_bias_w(DT *input_ptr, template __global__ void apply_proj_bias_qkv(DT *input_ptr, DT const *bias_ptr, + int shard_id, int num_tokens, int qProjSize, int kProjSize, int vProjSize, + int global_num_heads, int num_heads, bool scaling_query, float scaling_factor) { @@ -82,8 +88,10 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int real_part_index = head_idx * qkv_block_size + qkv_index * q_block_size + idx; - int bias_idx = qkv_index * qProjSize * num_heads + head_idx * qProjSize + - (idx % qProjSize); + + int global_head_idx = head_idx + shard_id * num_heads; + int bias_idx = qkv_index * qProjSize * global_num_heads + + global_head_idx * qProjSize + (idx % qProjSize); input_ptr[real_part_index] += bias_ptr[bias_idx]; if (scaling_query && qkv_index == 0) { @@ -148,6 +156,7 @@ __global__ void template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, + int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, @@ -269,10 +278,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, 0, stream>>>(output_ptr, bias_ptr, + shard_id, num_tokens, m->qProjSize, m->kProjSize, m->vProjSize, + m->global_num_heads, m->num_heads, *m->scaling_query, m->scaling_factor); @@ -426,6 +437,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, template void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, + int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, @@ -446,6 +458,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, // phase 1: Implement kernel to compute KQV for input tokens 
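// Illustrative note (names as in the hunks above, not part of the patch):
// shard_id is threaded through the QKV kernels so that each tensor-parallel
// shard indexes its own slice of the replicated bias. apply_proj_bias_qkv
// computes
//   global_head_idx = head_idx + shard_id * num_heads;
//   bias_idx = qkv_index * qProjSize * global_num_heads
//            + global_head_idx * qProjSize + (idx % qProjSize);
// i.e. a shard holding heads [shard_id*num_heads, (shard_id+1)*num_heads)
// reads the matching rows of the full-model bias, while the output-projection
// bias is applied only once (see the `*m->bias && shard_id == 0` checks below).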
compute_qkv_kernel(m, bc, + shard_id, input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), @@ -457,7 +470,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); + compute_attention_kernel(m, bc, shard_id, output_ptr, bias_ptr, stream); } } // namespace IncMultiHeadAttention @@ -520,6 +533,7 @@ __global__ void fill_entries_above_diagonal(DT *matrix, template void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, + int shard_id, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { @@ -703,9 +717,9 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, k = m->vProjSize * m->num_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = m->W_out_contiguous; + A = static_cast
(m->W_out_contiguous); B = C; - C = (output_ptr + tokens_previous_requests * m->oProjSize); + C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -730,7 +744,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, tokens_previous_requests += num_new_tokens; } - if (*m->bias) { + if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; apply_proj_bias_w<<offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), output.get_half_ptr(), @@ -790,6 +806,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( Kernels::IncMultiHeadAttention::inference_kernel( m, bc, + shard_id, input.get_float_ptr(), m->offload ? static_cast(m->weight_ptr) : weight.get_float_ptr(), @@ -839,6 +856,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( weight, gpu_mem_allocator, num_samples, + attn->num_heads, _num_heads, attn->quantization_type, attn->offload) {} @@ -863,6 +881,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, + int _global_num_heads, int _num_heads, DataType _quantization_type, bool _offload) @@ -885,6 +904,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( quantization_type = _quantization_type; offload = _offload; + global_num_heads = _global_num_heads; num_heads = _num_heads; weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)); diff --git a/src/ops/inc_multiquery_self_attention.cc b/src/ops/inc_multiquery_self_attention.cc index eae98cd7d5..05c57af2ff 100644 --- a/src/ops/inc_multiquery_self_attention.cc +++ b/src/ops/inc_multiquery_self_attention.cc @@ -235,17 +235,12 @@ IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( dims[2].parallel_idx = -1; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>(dims, this->data_type, nullptr /*owner_op*/, true /*create_grad*/, initializer, - comm_type); + CHOSEN_SYNC_TYPE); } outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -317,17 +312,12 @@ IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( this->embed_dim + this->kProjSize + this->vProjSize + this->oProjSize; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>(dims, this->data_type, NULL /*owner_op*/, true /*create_grad*/, initializer, - comm_type); + CHOSEN_SYNC_TYPE); } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); diff --git a/src/ops/inc_multiquery_self_attention.cu b/src/ops/inc_multiquery_self_attention.cu index ed6153de2d..1193219c9c 100644 --- a/src/ops/inc_multiquery_self_attention.cu +++ b/src/ops/inc_multiquery_self_attention.cu @@ -639,21 +639,21 @@ IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( Memory gpu_mem, int num_samples) : IncMultiQuerySelfAttentionMeta(handler, - INC_DECODING_MODE, - attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->embed_dim, - attn->bias, - attn->add_bias_kv, - weight, - gpu_mem, - num_samples) {} + INC_DECODING_MODE, + attn, + attn->qSize, + 
attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->embed_dim, + attn->bias, + attn->add_bias_kv, + weight, + gpu_mem, + num_samples) {} IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( FFHandler handler, diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 3b20f932e2..0c08a2426f 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -227,11 +227,6 @@ LayerNorm::LayerNorm(FFModel &model, int seed = std::rand(); Initializer *gamma_initializer = new UniformInitializer(seed, 1.0f, 1.0f); Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 0.0f); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight_legion_ordering(axes.size(), dims, @@ -239,7 +234,7 @@ LayerNorm::LayerNorm(FFModel &model, NULL /*owner_op*/, true /*create_grad*/, gamma_initializer, - comm_type); + CHOSEN_SYNC_TYPE); weights[1] = model.create_parallel_weight_legion_ordering(axes.size(), dims, @@ -247,7 +242,7 @@ LayerNorm::LayerNorm(FFModel &model, NULL /*owner_op*/, true /*create_grad*/, beta_initializer, - comm_type); + CHOSEN_SYNC_TYPE); } } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 6ab99e6892..e3204c01d9 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -230,6 +230,23 @@ Linear::Linear(FFModel &model, LinearParams params = this->get_params(); params.construct_mappings(*this->parallel_dims_mapping, input_shape); params.solve_dims(input_shape, output_shape, kernel_shape, bias_shape); + kernel_shape.dims[0].size = this->in_channels; + bias_shape.dims[0].degree = _input->dims[_input->num_dims - 1].degree; + bias_shape.dims[0].parallel_idx = + _input->dims[_input->num_dims - 1].parallel_idx; + bias_shape.dims[1].size = bias_shape.dims[1].degree = 1; + bias_shape.dims[1].parallel_idx = -1; + bias_shape.dims[bias_shape.num_dims - 1].size = + bias_shape.dims[bias_shape.num_dims - 1].degree = 1; + for (int i = 0; i < input_shape.num_dims - 1; i++) { + if (_input->dims[i].degree > 1) { + bias_shape.dims[bias_shape.num_dims - 1].size *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].degree *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].parallel_idx = + _input->dims[i].parallel_idx; + } + } + if (allocate_weights) { Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); if (quantization_type != DT_NONE) { @@ -256,6 +273,7 @@ Linear::Linear(FFModel &model, true /*create_grad*/, bias_initializer, CHOSEN_SYNC_TYPE); + add_bias_only_once = _input->dims[0].degree > 1; } } @@ -263,7 +281,7 @@ Linear::Linear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( output_shape.num_dims, output_shape.dims, _data_type, this); - assert(check_output_input_weight_parallel_dims(allocate_weights)); + // assert(check_output_input_weight_parallel_dims(allocate_weights)); } void Linear::init(FFModel const &ff) { @@ -483,6 +501,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, m->kernel_reg_type = linear->kernel_reg_type; m->kernel_reg_lambda = linear->kernel_reg_lambda; m->use_bias = linear->use_bias; + m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; m->trainableInputs[0] = linear->trainableInputs[0]; m->input_type = linear->inputs[0]->data_type; @@ -666,7 +685,8 @@ void Linear::forward_task_with_dim(Task const *task, assert(acc_input.rect.volume() == 
static_cast(in_dim * batch_size)); // assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); DT const *acc_bias_ptr = nullptr; - if (m->use_bias) { + if (m->use_bias && + !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { TensorAccessorR acc_bias( regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(acc_bias.rect.volume() == static_cast(out_dim)); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index db3e83847f..b9dedda418 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -107,15 +107,15 @@ Tensor li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); { - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; li->weights[0] = create_weight_legion_ordering(2, dims, @@ -127,7 +127,7 @@ Tensor } if (bias) { // q, k, v, o - int dims[1] = {embed_dim * 4}; + int dims[1] = {(qProjSize + kProjSize + vProjSize) * num_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -270,38 +270,28 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[2].parallel_idx = -1; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>(dims, this->data_type, NULL /*owner_op*/, true /*create_grad*/, initializer, - comm_type); - } - if (bias) { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[0] = inputs[0]->dims[num_dims - 1]; - dims[0].size = dims[0].degree; - dims[1].size = oProjSize * 4; - dims[1].degree = 1; - dims[1].parallel_idx = -1; -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif - weights[1] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - NULL, - comm_type); + CHOSEN_SYNC_TYPE); + if (bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + bias_shape.dims[0].size = + (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } } + outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); /* for (int i = 0; i < numdim; i++) { */ @@ -376,36 +366,28 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[2].size = qParas + kParas + vParas + oParas; int 
seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>(dims, this->data_type, NULL /*owner_op*/, true /*create_grad*/, initializer, - comm_type); - } - if (bias) { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[0] = inputs[0]->dims[num_dims - 1]; - dims[0].size = dims[0].degree; - dims[1].size = oProjSize * 4; -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif - weights[1] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - NULL, - comm_type); + CHOSEN_SYNC_TYPE); + if (bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + bias_shape.dims[0].size = + (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } } + outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); @@ -700,7 +682,7 @@ void SpecIncMultiHeadSelfAttention::inference_task( runtime); Domain bias_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 2); + assert(bias_domain.get_dim() == 4); } Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -713,8 +695,9 @@ void SpecIncMultiHeadSelfAttention::inference_task( assert(weight_domain.get_dim() == 3); assert(output_domain.get_dim() == 4); + assert(task->index_point.get_dim() == 1); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, bc, input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, weight, output, biases); // print_tensor(input.get_float_ptr(), 20, "attention input"); // print_tensor(output.get_float_ptr(), 20, "attention output"); diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 90a4c89a39..1caf1c1d1b 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -27,6 +27,7 @@ using Legion::Memory; void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, + int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, @@ -83,6 +84,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( weight, gpu_mem_allocator, num_samples, + attn->num_heads, _num_heads, DT_NONE, false) { diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 867ef4b6b9..44080b7c5c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -229,6 +229,7 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, template void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, + int shard_id, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { @@ -425,9 +426,10 @@ void 
compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, k = m->vProjSize * m->num_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = (void const *)m->W_out_contiguous; - B = (void const *)C; - C = (void *)(output_ptr + tokens_previous_requests * m->oProjSize); + A = static_cast
(m->W_out_contiguous); + B = static_cast
(C); + C = static_cast
(output_ptr) + + tokens_previous_requests * m->oProjSize; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -451,7 +453,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_previous_requests += num_new_tokens; tokens_prev_requests_squares += num_new_tokens * total_tokens; } - if (*m->bias) { + if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; apply_proj_bias_w<< void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, + int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, @@ -498,6 +501,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, + shard_id, input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), @@ -508,7 +512,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); + compute_attention_kernel(m, bc, shard_id, output_ptr, bias_ptr, stream); } } // namespace SpecIncMultiHeadAttention @@ -518,6 +522,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, + int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, @@ -544,6 +549,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::SpecIncMultiHeadAttention::inference_kernel(m, bc, + shard_id, input.get_half_ptr(), weight.get_half_ptr(), output.get_half_ptr(), @@ -554,6 +560,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::SpecIncMultiHeadAttention::inference_kernel(m, bc, + shard_id, input.get_float_ptr(), weight.get_float_ptr(), output.get_float_ptr(), @@ -604,6 +611,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( weight, gpu_mem_allocator, num_samples, + attn->num_heads, _num_heads, DT_NONE, false) { diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 4fe218e06c..d0bf1d5675 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -111,16 +111,16 @@ Tensor FFModel::inc_multihead_self_attention_verify( li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int one_head_size = qParas + kParas + vParas + oParas; { - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int one_head_size = qParas + kParas + vParas + oParas; // compress the weight size if quantization. if (quantization_type != DT_NONE) { one_head_size = get_quantization_to_byte_size( @@ -139,7 +139,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( } if (bias) { // q, k, v, o - int dims[1] = {embed_dim * 4}; + int dims[1] = {(qProjSize + kProjSize + vProjSize) * num_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -295,38 +295,27 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[2].parallel_idx = -1; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>( dims, quantization_type == DT_NONE ? 
this->data_type : quantization_type, NULL /*owner_op*/, true /*create_grad*/, initializer, - comm_type); - } - if (bias) { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[0] = inputs[0]->dims[num_dims - 1]; - dims[0].size = dims[0].degree; - dims[1].size = oProjSize * 4; - dims[1].degree = 1; - dims[1].parallel_idx = -1; -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif - weights[1] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - NULL, - comm_type); + CHOSEN_SYNC_TYPE); + if (bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + bias_shape.dims[0].size = + (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } } outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -410,37 +399,29 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( } int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif weights[0] = model.create_parallel_weight<3>( dims, quantization_type == DT_NONE ? this->data_type : quantization_type, NULL /*owner_op*/, true /*create_grad*/, initializer, - comm_type); - } - if (bias) { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[0] = inputs[0]->dims[num_dims - 1]; - dims[0].size = dims[0].degree; - dims[1].size = oProjSize * 4; -#ifdef USE_NCCL - ParameterSyncType comm_type = ParameterSyncType::NCCL; -#else - ParameterSyncType comm_type = ParameterSyncType::PS; -#endif - weights[1] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - NULL, - comm_type); + CHOSEN_SYNC_TYPE); + if (bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + bias_shape.dims[0].size = + (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } } + outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); @@ -756,7 +737,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( runtime); Domain bias_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 2); + assert(bias_domain.get_dim() == 4); } Domain input_domain = runtime->get_index_space_domain( @@ -774,8 +755,10 @@ void TreeIncMultiHeadSelfAttention::inference_task( input_domain.get_volume(), "[Attention:forward:query]"); */ + assert(task->index_point.get_dim() == 1); + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, bc, input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, weight, output, biases); #ifdef INFERENCE_TESTS printf("Checking TreeIncMultiHeadSelfAttention computations...\n"); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index eddf20cc08..74cea451c4 
100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -27,6 +27,7 @@ using Legion::Memory; void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, + int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, @@ -83,6 +84,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( weight, gpu_mem_allocator, num_samples, + attn->num_heads, _num_heads, attn->quantization_type, attn->offload), diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 1458085d38..541322efc4 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -137,7 +137,7 @@ __global__ void update_tree_branch_kv_cache( (i / proj_size) % num_tokens_in_branch; // index in the tree branch int head_idx = i / (proj_size * num_tokens_in_branch); - token_idx += processed_tokens_in_batch; // get index in the whole batch + token_idx += processed_tokens_in_batch; // get index in the whole batch int qkv_block_size = (qProjSize + kProjSize + vProjSize) * total_tokens_in_batch; // skip over previous heads int current_head_block_size = @@ -177,6 +177,7 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix, template void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, + int shard_id, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { @@ -410,9 +411,10 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, k = m->vProjSize * m->num_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = m->W_out_contiguous; + A = static_cast
(m->W_out_contiguous); B = C; - C = (output_ptr + processed_tokens_in_batch * m->oProjSize); + C = static_cast
(output_ptr) + + processed_tokens_in_batch * m->oProjSize; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -439,7 +441,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } - if (*m->bias) { + if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; apply_proj_bias_w<< void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, + int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, @@ -520,6 +523,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, + shard_id, input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), @@ -532,7 +536,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, output_ptr, bias_ptr, stream); + compute_attention_kernel(m, bc, shard_id, output_ptr, bias_ptr, stream); } } // namespace TreeIncMultiHeadAttention @@ -542,6 +546,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, + int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, @@ -573,6 +578,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( Kernels::TreeIncMultiHeadAttention::inference_kernel( m, bc, + shard_id, input.get_half_ptr(), m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), output.get_half_ptr(), @@ -587,6 +593,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( Kernels::TreeIncMultiHeadAttention::inference_kernel( m, bc, + shard_id, input.get_float_ptr(), m->offload ? static_cast(m->weight_ptr) : weight.get_float_ptr(), @@ -637,6 +644,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( weight, gpu_mem_allocator, num_samples, + attn->num_heads, _num_heads, attn->quantization_type, attn->offload), diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 9143fee936..2a3fe5cca1 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -18,6 +18,10 @@ #include namespace FlexFlow { + +ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) + : OpMeta(handle) {} + namespace Kernels { namespace Reduction { @@ -70,10 +74,18 @@ template __global__ void reduction_forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements, size_t num_replicas); +template __global__ void reduction_forward_kernel(half const *input_ptr, + half *output_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(half const *input_ptr, + half *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 8496a107e3..34ae8007da 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -17,6 +17,10 @@ #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { + +ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) + : OpMeta(handle) {} + namespace Kernels { namespace Reduction { @@ -63,10 +67,18 @@ template __global__ void reduction_forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements, size_t num_replicas); +template __global__ void reduction_forward_kernel(half const *input_ptr, + half *output_ptr, + size_t num_elements, + size_t num_replicas); template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements, size_t num_replicas); +template void forward_kernel(half const *input_ptr, + half *output_ptr, + size_t num_elements, + size_t num_replicas); template void backward_kernel(float const 
*output_grad_ptr, float *input_grad_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 29f1d30d1f..1647f014be 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -18,6 +18,10 @@ #include namespace FlexFlow { + +ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) + : OpMeta(handle) {} + namespace Kernels { namespace Replicate { @@ -66,6 +70,9 @@ void backward_kernel(T const *output_grad_ptr, template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements); +template void forward_kernel(half const *input_ptr, + half *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(float const *input_ptr, float *output_ptr, diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index de208d2aed..35bc109bd3 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -17,6 +17,10 @@ #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { + +ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) + : OpMeta(handle) {} + namespace Kernels { namespace Replicate { @@ -59,6 +63,9 @@ void backward_kernel(T const *output_grad_ptr, template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements); +template void forward_kernel(half const *input_ptr, + half *output_ptr, + size_t num_elements); template __global__ void replicate_backward_kernel(float const *input_ptr, float *output_ptr, diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 737f86239c..1d6130d6a6 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -14,6 +14,7 @@ */ #include "flexflow/parallel_ops/reduction.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/parallel_ops/kernels/reduction_kernels.h" #include "flexflow/utils/hash_utils.h" @@ -77,7 +78,7 @@ Reduction::Reduction(FFModel &model, dims[reduction_dim].size /= reduction_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_FLOAT, this); + numdim, dims, _input->data_type, this); } Reduction::Reduction(FFModel &model, @@ -108,16 +109,153 @@ void Reduction::create_input_partition(FFModel &ff) { output_grad_lp); } +void Reduction::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + // input_lp is a disjoint partition + ff.create_disjoint_partition(batch_outputs[0]->num_dims, + batch_outputs[0]->dims, + batch_outputs[0]->parallel_is, + batch_inputs[0]->region, + inference_input_lps[batch_inputs[0]]); +} + +OpMeta *Reduction::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Reduction *reduct = (Reduction *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + ReductionMeta *meta = new ReductionMeta(handle, reduct); + meta->input_type[0] = reduct->inputs[0]->data_type; + meta->output_type[0] = reduct->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + return meta; +} + void Reduction::init(FFModel 
const &ff) { - forward(ff); + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(REDUCTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Reduction)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement( + input_lp, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void Reduction::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + parallel_is = batch_outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(REDUCTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Reduction)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + assert(inference_input_lps.find(batch_inputs[0]) != + inference_input_lps.end()); + launcher.add_region_requirement( + RegionRequirement(inference_input_lps[batch_inputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +FutureMap Reduction::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(REDUCTION_FWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_input_lps[batch_inputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); } void Reduction::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(REDUCTION_FWD_TASK_ID, outputs[0]->parallel_is, TaskArgument(NULL, 0), @@ -211,6 +349,9 @@ void Reduction::forward_task(Task const *task, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); + + ReductionMeta const *m = *((ReductionMeta **)task->local_args); + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( @@ -222,12 +363,26 @@ void Reduction::forward_task(Task const *task, } size_t num_elements = output_domain.get_volume(); size_t num_replicas = input_domain.get_volume() / num_elements; - float const *input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel(input_ptr, output_ptr, num_elements, num_replicas); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + if (input.data_type == DT_HALF) { + forward_kernel(input.get_half_ptr(), + output.get_half_ptr(), + num_elements, + num_replicas); + } else if (input.data_type == DT_FLOAT) { + forward_kernel(input.get_float_ptr(), + output.get_float_ptr(), + num_elements, + num_replicas); + } else { + assert(false && "Unspported data type"); + } } void Reduction::backward_task(Task const *task, diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index fee78043bd..794db0f67f 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -75,7 +75,7 @@ Replicate::Replicate(FFModel &model, dims[replicate_dim].degree *= replicate_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_FLOAT, this); + numdim, dims, _input->data_type, this); // inputs[0]->print("Replicate::input"); // outputs[0]->print("Replicate::output"); } @@ -108,16 +108,85 @@ void Replicate::create_input_partition(FFModel &ff) { output_grad_lp); } +void Replicate::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + 
assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + // input_lp is an aliased partitioning along the replica dim + ff.create_aliased_partition(batch_outputs[0]->num_dims, + batch_outputs[0]->dims, + replicate_dim, + batch_outputs[0]->parallel_is, + batch_inputs[0]->region, + inference_input_lps[batch_inputs[0]]); +} + +OpMeta *Replicate::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Replicate *repl = (Replicate *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + ReplicateMeta *meta = new ReplicateMeta(handle, repl); + meta->input_type[0] = repl->inputs[0]->data_type; + meta->output_type[0] = repl->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + return meta; +} + +void Replicate::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(REPLICATE_INIT_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(this, sizeof(Replicate)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_input_lps[batch_inputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + void Replicate::init(FFModel const &ff) { - // Do nothing + parallel_is = outputs[0]->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(REPLICATE_FWD_TASK_ID, + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(REPLICATE_INIT_TASK_ID, outputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(Replicate)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -132,15 +201,58 @@ void Replicate::init(FFModel const &ff) { EXCLUSIVE, outputs[0]->region)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +FutureMap Replicate::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(REPLICATE_FWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_input_lps[batch_inputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); } void Replicate::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); IndexLauncher launcher(REPLICATE_FWD_TASK_ID, outputs[0]->parallel_is, TaskArgument(NULL, 0), @@ -233,6 +345,9 @@ void Replicate::forward_task(Task const *task, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); + + ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( @@ -243,12 +358,24 @@ void Replicate::forward_task(Task const *task, assert(output_domain.hi()[i] == input_domain.hi()[i]); } assert(input_domain.get_volume() == output_domain.get_volume()); - float const *input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel(input_ptr, output_ptr, input_domain.get_volume()); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + + if (input.data_type == DT_HALF) { + forward_kernel( + input.get_half_ptr(), output.get_half_ptr(), input_domain.get_volume()); + } else if (input.data_type == DT_FLOAT) { + forward_kernel(input.get_float_ptr(), + output.get_float_ptr(), + input_domain.get_volume()); + } else { + assert(false && "Unspported data type"); + } } void Replicate::backward_task(Task const *task, diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 434ef1d5e1..6ef06e1f65 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -579,6 +579,8 @@ template __host__ void template __host__ float *download_tensor(float const *ptr, size_t num_elements); +template __host__ half *download_tensor(half const *ptr, + size_t num_elements); template __host__ double *download_tensor(double const *ptr, size_t num_elements); template __host__ int32_t *download_tensor(int32_t const *ptr, @@ -587,6 +589,8 @@ template __host__ int64_t *download_tensor(int64_t const *ptr, size_t num_elements); template __host__ bool download_tensor(float const *ptr, float *dst, size_t num_elements); +template __host__ bool + download_tensor(half const *ptr, half *dst, size_t num_elements); template __host__ bool download_tensor(double 
const *ptr, double *dst, size_t num_elements); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index c7fe4d6a82..f844834761 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -42,8 +42,24 @@ InferenceManager::InferenceManager(FFConfig const &_config, view.dim[0] = 1; view.stride[0] = 0; view.start_device_id = i; + // std::cout << "Registering machine view: " << view << std::endl; machine_views.push_back(view); } + // multiple-device machine views + if (ff_config.tensor_parallelism_degree > 1) { + for (int i = 0; i < num_devices; i++) { + if (i + ff_config.tensor_parallelism_degree <= num_devices) { + MachineView view; + view.device_type = MachineView::GPU; + view.ndims = 1; + view.dim[0] = ff_config.tensor_parallelism_degree; + view.stride[0] = 1; + view.start_device_id = i; + // std::cout << "Registering machine view: " << view << std::endl; + machine_views.push_back(view); + } + } + } } bool parallel_tensor_list_overlaps(std::vector const &list1, @@ -105,13 +121,46 @@ void InferenceManager::compile_model_and_allocate_buffer( } } } + if (op->op_type == OP_REPLICATE) { + // std::cout << "Replicate operator got machine view: " << mv + // << std::endl; + assert(model->config.tensor_parallelism_degree > 1); + mv.dim[0] = ff_config.tensor_parallelism_degree; + mv.stride[0] = 1; + if (mv.start_device_id + mv.dim[0] > num_devices) { + mv.start_device_id -= + (mv.start_device_id + mv.dim[0]) - num_devices; + } + // std::cout << "Corrected machine view: " << mv << std::endl; + } else if (op->op_type == OP_REDUCTION) { + // std::cout << "Reduction operator got machine view: " << mv + // << std::endl; + assert(model->config.tensor_parallelism_degree > 1); + mv.dim[0] = 1; + mv.stride[0] = 0; + // std::cout << "Corrected machine view: " << mv << std::endl; + } + assert(mv.start_device_id + mv.dim[0] <= num_devices); machine_views.push_back(mv); } assert(machine_views.size() == max_num_inflight_batches); } + // std::cout << "operator: " << op->name << std::endl; + // for (int i = 0; i < op->numInputs; i++) { + // op->inputs[i]->print("input pt"); + // std::cout << "input mv: " << op->inputs[i]->machine_view << std::endl; + // } + for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); + + if (op->op_type == OP_REPLICATE) { + assert(op->numInputs == 1 && op->numOutputs == 1); + } + // pt_base->print("output pt"); + // std::cout << "output mv: " << pt_base->machine_view << std::endl; + std::vector list; bool found_parallel_tensor = false; if (model->cpu_offload) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index e6e432040f..64c3a2eb61 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2956,7 +2956,9 @@ Op *FFModel::create_operator_from_layer( void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; - for (auto const &l : layers) { + // for (auto const &l : layers) { + for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { + auto const &l = layers[layer_idx]; std::vector inputs; for (int i = 0; i < l->numInputs; i++) { // create new input tensors @@ -2964,7 +2966,63 @@ void FFModel::create_operators_from_layers() { tensors_to_parallel_tensors.end()); inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } - Op *op = create_operator_from_layer(l, inputs); + Op *op = nullptr; + // add replicate operators if needed + if (config.computationMode == COMP_MODE_INFERENCE && + 
config.tensor_parallelism_degree > 1 && + (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + (l->op_type == OP_LINEAR && layer_idx + 3 <= layers.size() && + layers[layer_idx + 1]->op_type == OP_RELU && + layers[layer_idx + 2]->op_type == OP_LINEAR) || + (l->op_type == OP_LINEAR && layer_idx + 6 <= layers.size() && + layers[layer_idx + 1]->op_type == OP_LINEAR && + layers[layer_idx + 2]->op_type == OP_SIGMOID && + layers[layer_idx + 3]->op_type == OP_EW_MUL && + layers[layer_idx + 4]->op_type == OP_EW_MUL && + layers[layer_idx + 5]->op_type == OP_LINEAR) || + (l->op_type == OP_LINEAR && layer_idx + 5 <= layers.size() && + layer_idx >= 1 && layers[layer_idx - 1]->op_type == OP_LINEAR && + layers[layer_idx + 1]->op_type == OP_SIGMOID && + layers[layer_idx + 2]->op_type == OP_EW_MUL && + layers[layer_idx + 3]->op_type == OP_EW_MUL && + layers[layer_idx + 4]->op_type == OP_LINEAR))) { + std::vector partitioned_inputs; + assert(inputs.size() == 1); + Replicate *repl = new Replicate(*this, + inputs[0], + inputs[0]->num_dims - 1, + config.tensor_parallelism_degree); + partitioned_inputs.push_back(repl->outputs[0]); + operators.push_back(repl); + op = create_operator_from_layer(l, partitioned_inputs); + } else { + op = create_operator_from_layer(l, inputs); + } + // Op *op = create_operator_from_layer(l, inputs); + // add reduce operators if needed + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR))) { + assert(op->numOutputs == 1); + Reduction *reduct = new Reduction(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + config.tensor_parallelism_degree); + operators.push_back(reduct); + op = reduct; + } + assert(op->numOutputs == l->numOutputs); for (int i = 0; i < op->numOutputs; i++) { tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; @@ -4834,6 +4892,13 @@ void register_flexflow_internal_tasks() { registrar, "Combine Backward Task"); } // Replicate + { + TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Replicate init Task"); + } { TaskVariantRegistrar registrar(REPLICATE_FWD_TASK_ID, "Replicate Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -4849,6 +4914,13 @@ void register_flexflow_internal_tasks() { registrar, "Replicate Backward Task"); } // Reduction + { + TaskVariantRegistrar registrar(REDUCTION_INIT_TASK_ID, "Reduction Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Reduction init Task"); + } { TaskVariantRegistrar registrar(REDUCTION_FWD_TASK_ID, "Reduction Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 
7bce941c68..0ed594fd7e 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -656,10 +656,13 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, // TODO: check data type matches // TODO: Currently we use a task launch, change to index launch for NCCL // parameter - size_t volume = 1, num_replicas = 0; + size_t volume = 1, num_replicas = 1; if (sync_type == ParameterSyncType::NCCL) { - Domain domain = runtime->get_index_space_domain(ctx, parallel_is); - num_replicas = domain.get_volume(); + // Domain domain = runtime->get_index_space_domain(ctx, parallel_is); + // num_replicas = domain.get_volume(); + if (this->num_dims >= 2 && this->dims[this->num_dims - 1].is_replica_dim) { + num_replicas = this->dims[this->num_dims - 1].size; + } } else if (sync_type == ParameterSyncType::PS) { num_replicas = 1; } else { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 6cbd92dee6..56b9bf6241 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -108,7 +108,7 @@ RequestManager::RequestGuid request.tokens = prompt; if (num_ssms == 0) { - std::cout << "No small spective model registered yet, using increamental " + std::cout << "No small speculative model registered yet, using incremental " "decoding." << std::endl; } else { @@ -151,7 +151,7 @@ RequestManager::RequestGuid request.initial_len = request.tokens.size(); if (num_ssms == 0) { - std::cout << "No small spective model registered yet, using increamental " + std::cout << "No small speculative model registered yet, using incremental " "decoding." << std::endl; } else { diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 8971fc206e..1262ec21d5 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -9,6 +9,9 @@ cleanup() { # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" +# Enable model parallelism tests, if desired +TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} + # Clean up before test (just in case) cleanup @@ -43,6 +46,15 @@ mkdir -p ../inference/output # OPT (half precision) ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half.txt +# Tensor parallelism tests +if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then + # LLAMA (half precision) + ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_half_tp.txt -tensor-parallelism-degree 2 + + # OPT (half precision) + ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config 
../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half_tp.txt -tensor-parallelism-degree 2 +fi + ############################################################################################### ############################ Incremental decoding tests ####################################### ############################################################################################### @@ -67,6 +79,24 @@ mkdir -p ../inference/output # OPT (big model, half precision) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half.txt +# Tensor parallelism tests +if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then + # LLAMA (small model) + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp.txt -tensor-parallelism-degree 2 + # LLAMA (small model, half precision) + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp.txt -tensor-parallelism-degree 2 + + # LLAMA (big model, half precision) + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half_tp.txt -tensor-parallelism-degree 2 + + # OPT (small model) + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp.txt -tensor-parallelism-degree 2 + # OPT (small model, half precision) + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -tensor-parallelism-degree 2 + + # OPT (big model, half precision) + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file 
../inference/output/incr_decoding_opt_6B_half_tp.txt -tensor-parallelism-degree 2 +fi ############################################################################################### ############################### Alignment and Speed tests ##################################### @@ -111,6 +141,18 @@ compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B #compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B_half.txt" "../inference/output/spec_inference_llama_half.txt" #compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B_half.txt" "../inference/output/spec_inference_opt_half.txt" +############ Alignment between tensor model parallelism and pipeline parallelism only ################# +if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then + # diff <(tail -n +2 "../inference/output/spec_inference_llama_half_tp.txt") <(tail -n +2 "../inference/output/spec_inference_llama_half.txt") + diff <(tail -n +2 "../inference/output/spec_inference_opt_half_tp.txt") <(tail -n +2 "../inference/output/spec_inference_opt_half.txt") + diff <(tail -n +2 "../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_160M.txt") + diff <(tail -n +2 "../inference/output/incr_decoding_llama_160M_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_160M_half.txt") + diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_7B_half.txt") + diff <(tail -n +2 "../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_125M.txt") + diff <(tail -n +2 "../inference/output/incr_decoding_opt_125M_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_125M_half.txt") + diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_6B_half.txt") +fi + ######################### Alignment tests with HuggingFace #################################### pip3 install protobuf==3.20.3 From f74377afa8c029fe0b87e8efb08bc36adbde7237 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 27 Jun 2023 01:43:12 +0000 Subject: [PATCH 155/344] Formatting. 
--- examples/cpp/inference/mixture_of_experts/moe.h | 4 ++-- include/flexflow/inference.h | 2 +- inference/spec_infer/spec_infer.cc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index 183229bc07..4fdd3b2e3f 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ b/examples/cpp/inference/mixture_of_experts/moe.h @@ -22,9 +22,9 @@ struct MoeConfig : InferenceConfig { MoeConfig(void) : InferenceConfig() { //----------------------- MoE layer -------------------------------- // total number of experts - num_exp = 128; + num_exp = 64; // number of experts in each block of fused experts - experts_per_block = 32; + experts_per_block = 16; // number of experts to route each token to num_select = 2; // expert capacity parameters diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 1d3b62fb00..5cf9926cff 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -133,7 +133,7 @@ class RequestManager { &inputSerializedTree, std::vector> const &outputSerializedTree); - int get_requests_init_length(BeamSearchBatchConfig const &old_bc); + int get_requests_init_length(BeamSearchBatchConfig const &old_bc); static void load_tokens_task(Legion::Task const *task, diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 3cf568fe17..b532f7318d 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -294,7 +294,7 @@ void FlexFlow::top_level_task(Task const *task, if (depth - 1 >= beam_bc_vec[i].max_beam_depth_all_requests() || depth + 1 + rm.get_requests_init_length(beam_bc_vec[i]) >= - BatchConfig::MAX_NUM_TOKENS) { + BatchConfig::MAX_NUM_TOKENS) { break; } else { beam_bc_vec[i] = rm.prepare_next_batch_beam(beam_bc_vec[i], beam_ir); From 95e09ebc01d88a906ff86bd4a2702533321a296c Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Tue, 27 Jun 2023 05:22:12 -0400 Subject: [PATCH 156/344] Docker-build and Publish Modification (#776) * Docker-build and Publish Modification **Description of changes:** Add code in docker-build.yml that allows automatic build and publish process when push happens to inference branch. Moreover, modifies publish.sh so that image name will be created as "image" and "branch" name to distinguish from those created in master branch. **Related Issues:** Linked Issues: - Issue # Issues closed by this PR: - Closes # **Before merging:** - [ ] Did you update the [flexflow-third-party](https://github.com/flexflow/flexflow-third-party) repo, if modifying any of the Cmake files, the build configs, or the submodules? 
* update container name * specinfer env publish * tag specinfer * add spaces * newline * fix * fix gpu ci workflow --------- Co-authored-by: Gabriele Oliaro --- .github/workflows/docker-build.yml | 5 ++++- .github/workflows/gpu-ci-skip.yml | 2 +- docker/publish.sh | 26 +++++++++++++++++++++++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 14f64f5fe6..51ac4a1bd3 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -7,6 +7,7 @@ on: - ".github/workflows/docker-build.yml" push: branches: + - "inference" - "master" schedule: # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated @@ -42,7 +43,7 @@ jobs: # On push to master, build for all compatible architectures, so that we can publish # a pre-built general-purpose image. On all other cases, only build for one architecture # to save time. - if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then + if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ( ${GITHUB_REF#refs/heads/} == "master" || ${GITHUB_REF#refs/heads/} == "inference" ) ]]; then export FF_CUDA_ARCH=all else export FF_CUDA_ARCH=70 @@ -67,6 +68,8 @@ jobs: if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then ./docker/publish.sh "flexflow-environment-${FF_GPU_BACKEND}" ./docker/publish.sh "flexflow-${FF_GPU_BACKEND}" + elif [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "inference" ]]; then + ./docker/publish.sh "specinfer-${FF_GPU_BACKEND}" else echo "No need to update Docker containers in ghrc.io registry at this time." fi diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index a9b5132ef6..b95f337760 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -1,7 +1,7 @@ name: "gpu-ci" on: pull_request: - paths: + paths-ignore: - "cmake/**" - "config/**" - "python/**" diff --git a/docker/publish.sh b/docker/publish.sh index 6f0ac19f7d..844c616091 100755 --- a/docker/publish.sh +++ b/docker/publish.sh @@ -5,13 +5,25 @@ set -euo pipefail cd "${BASH_SOURCE[0]%/*}" image=${1:-"flexflow-cuda"} -if [[ "${image}" != @(flexflow-environment-cuda|flexflow-environment-hip_cuda|flexflow-environment-hip_rocm|flexflow-environment-intel|flexflow-cuda|flexflow-hip_cuda|flexflow-hip_rocm|flexflow-intel) ]]; then +# Check publish specinfer environment image +if [[ "${image}" == @(specinfer-environment-cuda|specinfer-environment-hip_cuda|specinfer-environment-hip_rocm|specinfer-environment-intel) ]]; then + echo "specinfer does not publish environment images" + exit 1 +fi + +# Check valid image name +if [[ "${image}" != @(flexflow-environment-cuda|flexflow-environment-hip_cuda|flexflow-environment-hip_rocm|flexflow-environment-intel|flexflow-cuda|flexflow-hip_cuda|flexflow-hip_rocm|flexflow-intel|specinfer-cuda|specinfer-hip_cuda|specinfer-hip_rocm|specinfer-intel) ]]; then echo "Error, image name ${image} is invalid. Choose between 'flexflow-environment-{cuda,hip_cuda,hip_rocm,intel}' and 'flexflow-{cuda,hip_cuda,hip_rocm,intel}'." 
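# Not part of the patch: a minimal sketch of the image-name mapping that the
# publish.sh changes in this commit rely on. The "specinfer-" prefix is 10
# characters long, so "${image:10}" keeps only the backend suffix; the concrete
# values below are illustrative assumptions, not commands from the repo.
image="specinfer-cuda"
SUBSTR="${image:10}"                                   # -> "cuda"
echo "locally built image: flexflow-${SUBSTR}:latest"  # name produced by the build
echo "published image:     ghcr.io/flexflow/${image}:latest"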
exit 1 fi # Check that image exists -docker image inspect "${image}":latest > /dev/null +if [[ "${image}" == @(specinfer-cuda|specinfer-hip_cuda|specinfer-hip_rocm|specinfer-intel) ]]; then + SUBSTR="${image:10}" + docker image inspect "flexflow-${SUBSTR}":latest > /dev/null +else + docker image inspect "${image}":latest > /dev/null +fi # Log into container registry FLEXFLOW_CONTAINER_TOKEN=${FLEXFLOW_CONTAINER_TOKEN:-} @@ -21,7 +33,15 @@ echo "$FLEXFLOW_CONTAINER_TOKEN" | docker login ghcr.io -u flexflow --password-s # Tag image to be uploaded git_sha=${GITHUB_SHA:-$(git rev-parse HEAD)} if [ -z "$git_sha" ]; then echo "Commit hash cannot be detected, cannot publish the docker image to ghrc.io"; exit; fi -docker tag "$image":latest ghcr.io/flexflow/"$image":latest + +# If in "inference" branch, which tries to publish "specinfer" images, +# tags the all images as "specinfer-{cuda, hip_cuda, hip_rocm, intel}"; if in others, do as orginal +if [[ "${image}" == @(specinfer-cuda|specinfer-hip_cuda|specinfer-hip_rocm|specinfer-intel) ]]; then + SUBSTR="${image:10}" + docker tag flexflow-"$SUBSTR":latest ghcr.io/flexflow/specinfer-"$SUBSTR":latest +else + docker tag "$image":latest ghcr.io/flexflow/"$image":latest +fi # Upload image docker push ghcr.io/flexflow/"$image":latest From c40c3f11f778cdbf36ce402698f09900ee14a410 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 28 Jun 2023 23:12:06 +0800 Subject: [PATCH 157/344] add check for cargo (#812) --- CMakeLists.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aeec820452..4e64fb2ed7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -488,7 +488,14 @@ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) OUTPUT_VARIABLE RUSTC_OUTPUT ERROR_QUIET) if(NOT RUST_COMMAND_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is not installed on the system. Cannot build the tokenizers.") + message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + endif() + # Ensure Cargo is installed + execute_process(COMMAND cargo --version + RESULT_VARIABLE CARGO_RESULT + OUTPUT_QUIET ERROR_QUIET) + if(NOT CARGO_RESULT EQUAL 0) + message(FATAL_ERROR "Rust is installed, but cargo is not. 
Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") endif() add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) From 3a87e02a9e6ffca9ebfc4dc3694dba4aeea929aa Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 29 Jun 2023 15:55:03 +0800 Subject: [PATCH 158/344] [Inference] - Fix Multiple-GPUs CI test (#804) * fix linear region requirement * fix set tensor issue --- src/ops/inc_multihead_self_attention.cc | 2 ++ src/ops/inc_multiquery_self_attention.cc | 2 ++ src/ops/linear.cc | 4 ++-- src/ops/spec_inc_multihead_self_attention.cc | 2 ++ src/ops/tree_inc_multihead_self_attention.cc | 2 ++ src/runtime/parallel_tensor.cc | 6 ++++-- 6 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 765b3c5bfc..07598f99ea 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -285,6 +285,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; + dims[1].is_replica_dim = false; dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { @@ -392,6 +393,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; + dims[1].is_replica_dim = false; dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { dims[2].size = get_quantization_to_byte_size( diff --git a/src/ops/inc_multiquery_self_attention.cc b/src/ops/inc_multiquery_self_attention.cc index 05c57af2ff..6ce448c9ec 100644 --- a/src/ops/inc_multiquery_self_attention.cc +++ b/src/ops/inc_multiquery_self_attention.cc @@ -228,6 +228,7 @@ IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].is_replica_dim = false; dims[1].size = this->embed_dim; dims[2].size = this->embed_dim + this->kProjSize + this->vProjSize + this->oProjSize; @@ -308,6 +309,7 @@ IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->embed_dim; + dims[1].is_replica_dim = false; dims[2].size = this->embed_dim + this->kProjSize + this->vProjSize + this->oProjSize; int seed = std::rand(); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index e3204c01d9..cca92f014f 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -306,7 +306,7 @@ void Linear::init(FFModel const &ff) { // launcher.add_field(0, FID_DATA); launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + WRITE_ONLY, EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -365,7 +365,7 @@ void Linear::init_inference(FFModel const &ff, // launcher.add_field(0, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + WRITE_ONLY, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index b9dedda418..e765960985 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ 
b/src/ops/spec_inc_multihead_self_attention.cc @@ -265,6 +265,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; + dims[1].is_replica_dim = false; dims[2].size = qParas + kParas + vParas + oParas; dims[2].degree = 1; dims[2].parallel_idx = -1; @@ -363,6 +364,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; + dims[1].is_replica_dim = false; dims[2].size = qParas + kParas + vParas + oParas; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d0bf1d5675..105bd41647 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -286,6 +286,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; + dims[1].is_replica_dim = false; dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { dims[2].size = get_quantization_to_byte_size( @@ -392,6 +393,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_heads; + dims[1].is_replica_dim = false; dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { dims[2].size = get_quantization_to_byte_size( diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 0ed594fd7e..8f1be15fd1 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -660,8 +660,10 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, if (sync_type == ParameterSyncType::NCCL) { // Domain domain = runtime->get_index_space_domain(ctx, parallel_is); // num_replicas = domain.get_volume(); - if (this->num_dims >= 2 && this->dims[this->num_dims - 1].is_replica_dim) { - num_replicas = this->dims[this->num_dims - 1].size; + for (int i = 0; i < this->num_dims; i++) { + if (this->dims[i].is_replica_dim) { + num_replicas *= this->dims[i].size; + } } } else if (sync_type == ParameterSyncType::PS) { num_replicas = 1; From f02c9a0e870129c2cde0ef064405883a06f8d4ac Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Thu, 29 Jun 2023 04:12:56 -0400 Subject: [PATCH 159/344] Update README.md (#814) Update links/names of docker container from flexflow-{cuda, hip_rocm} to specinfer-{cuda, hip_rocm} with the disclaimer of CUDA version. Co-authored-by: Gabriele Oliaro --- .github/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/README.md b/.github/README.md index 010d7c07bb..576b1ca84e 100644 --- a/.github/README.md +++ b/.github/README.md @@ -29,7 +29,7 @@ for serving generative LLMs while provably preserving model quality.

## Build/Install SpecInfer -SpecInfer is built on top of FlexFlow. You can build/install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](../INSTALL.md) for building/installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. +SpecInfer is built on top of FlexFlow. You can build/install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](../INSTALL.md) for building/installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([specinfer-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-cuda) with a CUDA backend, [specinfer-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. ## Run SpecInfer The source code of the SpecInfer pipeline is available at [this folder](../inference/spec_infer/). The SpecInfer executable will be available at `/build_dir/inference/spec_infer/spec_infer` at compilation. 
You can use the following command-line arguments to run SpecInfer: From 08bda773c8dd968e75c6fbbf2bfa8a902197874e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 29 Jun 2023 21:57:02 +0800 Subject: [PATCH 160/344] [Inference] - Better device placement in tensor model parallelism (#805) * add data parallelism degree setting * compute multi-device machines views * fix bugs * fix and linting * update inference test, comment out print statements * fix --- .github/workflows/gpu-ci.yml | 1 + .../cpp/inference/mixture_of_experts/moe.cc | 9 ++- .../inference/transformers/transformers.cc | 10 ++-- include/flexflow/config.h | 4 +- include/flexflow/inference.h | 5 +- inference/incr_decoding/incr_decoding.cc | 29 +++++++-- inference/models/llama.cc | 59 +++++++++++++++---- inference/models/opt.cc | 49 +++++++++++++-- inference/spec_infer/spec_infer.cc | 29 +++++++-- src/runtime/inference_manager.cc | 47 +++++++++++---- tests/inference_tests.sh | 16 ++++- 11 files changed, 203 insertions(+), 55 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 7f83fb2691..bdbb8a751b 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -180,6 +180,7 @@ jobs: ./tests/gpt_tokenizer_test.sh # Inference tests + export TENSOR_PARALLELISM_TESTS=ON ./tests/inference_tests.sh gpu-ci-flexflow: diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 0c94452ec1..39459d63ac 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -139,8 +139,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor output = ff.arg_top_k(t, /*k=*/1, /*sorted=*/false); //------------------- Initialize the inference manager ------------------ - InferenceManager im( - ff.config, moeConfig.batch_size, moeConfig.num_inflight_batches); + InferenceManager im(ff.config, moeConfig.batch_size); std::unordered_map> mapping; im.compile_model_and_allocate_buffer(&ff, mapping); im.init_operators_inference(&ff); @@ -162,7 +161,7 @@ void FlexFlow::top_level_task(Task const *task, ParallelTensor input_pt; ff.get_parallel_tensor_from_tensor(input, input_pt); assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); - assert(im.tensor_buffer[input_pt].size() == im.max_num_inflight_batches); + assert(im.tensor_buffer[input_pt].size() == ffConfig.data_parallelism_degree); DataLoader data_loader( ff, moeConfig, data_generator, im.tensor_buffer[input_pt]); @@ -184,13 +183,13 @@ void FlexFlow::top_level_task(Task const *task, std::map batch_configs; std::pair new_prompts; BatchConfig *bc = nullptr; - std::map batch_predictions[im.max_num_inflight_batches]; + std::map batch_predictions[ffConfig.data_parallelism_degree]; assert(im.max_num_tokens_per_batch == moeConfig.batch_size); // simulation loop. For deployment, we will use a while(true) while (processed_requests < moeConfig.total_requests) { - for (int bid = 0; bid < im.max_num_inflight_batches; bid++) { + for (int bid = 0; bid < ffConfig.data_parallelism_degree; bid++) { size_t max_reqs, max_tkns; if (future_handlers.find(bid) == future_handlers.end()) { max_reqs = moeConfig.incremental_mode ? 
bc->MAX_NUM_REQUESTS diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index d416fdca3c..d56473c8bd 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -114,9 +114,7 @@ void FlexFlow::top_level_task(Task const *task, Tensor output = ff.arg_top_k(t, /*k=*/1, false); //------------------- Initialize the inference manager ------------------ - InferenceManager im(ff.config, - transformerConfig.batch_size, - transformerConfig.num_inflight_batches); + InferenceManager im(ff.config, transformerConfig.batch_size); std::unordered_map> mapping; im.compile_model_and_allocate_buffer(&ff, mapping); im.init_operators_inference(&ff); @@ -138,7 +136,7 @@ void FlexFlow::top_level_task(Task const *task, ParallelTensor input_pt; ff.get_parallel_tensor_from_tensor(input, input_pt); assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); - assert(im.tensor_buffer[input_pt].size() == im.max_num_inflight_batches); + assert(im.tensor_buffer[input_pt].size() == ffConfig.data_parallelism_degree); DataLoader data_loader( ff, transformerConfig, data_generator, im.tensor_buffer[input_pt]); @@ -160,14 +158,14 @@ void FlexFlow::top_level_task(Task const *task, std::map batch_configs; std::pair new_prompts; BatchConfig *bc = nullptr; - std::map batch_predictions[im.max_num_inflight_batches]; + std::map batch_predictions[ffConfig.data_parallelism_degree]; assert(im.max_num_tokens_per_batch == transformerConfig.batch_size); // assert(transformerConfig.batch_size <= BatchConfig::MAX_NUM_REQUESTS); // simulation loop. For deployment, we will use a while(true) while (processed_requests < transformerConfig.total_requests) { - for (int bid = 0; bid < im.max_num_inflight_batches; bid++) { + for (int bid = 0; bid < ffConfig.data_parallelism_degree; bid++) { size_t max_reqs, max_tkns; if (future_handlers.find(bid) == future_handlers.end()) { max_reqs = transformerConfig.incremental_mode diff --git a/include/flexflow/config.h b/include/flexflow/config.h index f7c59f7b58..f1b218e50f 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -143,8 +143,10 @@ class FFConfig { bool enable_parameter_parallel; bool enable_attribute_parallel; bool enable_inplace_optimizations; - // Control tensor model parallelism degree in inference + // Control parallelism degrees in inference + int data_parallelism_degree; int tensor_parallelism_degree; + int pipeline_parallelism_degree; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion; std::string dataset_path; diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 4da8dbaf20..1fd2fdff78 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -28,9 +28,7 @@ using tokenizers::Tokenizer; class InferenceManager { public: - InferenceManager(FFConfig const &config, - int max_num_tokens_per_batch, - int max_num_inflight_batches); + InferenceManager(FFConfig const &config, int max_num_tokens_per_batch); void compile_model_and_allocate_buffer( FFModel *model, std::unordered_map> const &mapping); @@ -45,7 +43,6 @@ class InferenceManager { FFConfig ff_config; std::unordered_map> tensor_buffer; int max_num_tokens_per_batch; - int max_num_inflight_batches; int num_devices; std::vector machine_views; }; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index a281f52853..d43cab17f9 100644 --- 
a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -38,7 +38,9 @@ void parse_input_args(char **argv, ModelType &llm_model_type, bool &use_full_precision, bool &verbose, - int &tensor_parallelism_degree) { + int &data_parallelism_degree, + int &tensor_parallelism_degree, + int &pipeline_parallelism_degree) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -83,11 +85,21 @@ void parse_input_args(char **argv, paths.output_file_path = std::string(argv[++i]); continue; } + // data parallelism degree + if (!strcmp(argv[i], "-data-parallelism-degree")) { + data_parallelism_degree = std::stoi(argv[++i]); + continue; + } // tensor parallelism degree if (!strcmp(argv[i], "-tensor-parallelism-degree")) { tensor_parallelism_degree = std::stoi(argv[++i]); continue; } + // pipeline parallelism degree + if (!strcmp(argv[i], "-pipeline-parallelism-degree")) { + pipeline_parallelism_degree = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; @@ -112,7 +124,9 @@ void FlexFlow::top_level_task(Task const *task, ModelType model_type; bool use_full_precision = false; bool verbose = false; - int tensor_parallelism_degree = 1; + size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; + int data_parallelism_degree = 1, tensor_parallelism_degree = 1, + pipeline_parallelism_degree = -1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -123,13 +137,20 @@ void FlexFlow::top_level_task(Task const *task, model_type, use_full_precision, verbose, - tensor_parallelism_degree); + data_parallelism_degree, + tensor_parallelism_degree, + pipeline_parallelism_degree); + ffconfig.data_parallelism_degree = data_parallelism_degree; ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; + ffconfig.pipeline_parallelism_degree = + pipeline_parallelism_degree == -1 + ? num_devices / (tensor_parallelism_degree * data_parallelism_degree) + : pipeline_parallelism_degree; assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); - InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); + InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); RequestManager rm(model_type, file_paths.tokenizer_file_path, /*verbose*/ verbose, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index f7c1563095..1e61f43a98 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -29,8 +29,27 @@ void LLAMA::create_llama_model(FFModel &ff, // do not apply cpu offload in beam search model. 
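The device-placement arithmetic this patch introduces in `llama.cc` and `opt.cc` (and validates in `inference_manager.cc`) can be illustrated with a minimal standalone sketch. The degree and layer-count values below are made-up examples rather than defaults from the patch; the variable names mirror the ones in the diff, and the default pipeline-parallelism degree follows the same `num_devices / (tensor_parallelism_degree * data_parallelism_degree)` rule as `incr_decoding.cc`:

```cpp
#include <cassert>
#include <cstdio>

// Illustrative sketch (not part of the patch): reproduce the placement
// arithmetic that llama.cc/opt.cc use to pick the first GPU of each
// (data-parallel replica, pipeline stage) pair. Plain C++, no FlexFlow types.
int main() {
  int num_devices = 8;              // example: workersPerNode * numNodes
  int num_transformer_layers = 32;  // example: llama_config.n_layers
  int data_parallelism_degree = 2;
  int tensor_parallelism_degree = 2;
  int pipeline_parallelism_degree = // default rule from incr_decoding.cc
      num_devices / (tensor_parallelism_degree * data_parallelism_degree);

  // Same sanity checks the InferenceManager constructor performs.
  assert(data_parallelism_degree * tensor_parallelism_degree *
             pipeline_parallelism_degree ==
         num_devices);
  assert(num_transformer_layers % pipeline_parallelism_degree == 0);

  int num_layers_per_pp_block =
      num_transformer_layers / pipeline_parallelism_degree;
  int num_devices_per_data_parallelism_line =
      num_devices / data_parallelism_degree;

  for (int layer = 0; layer < num_transformer_layers; layer++) {
    for (int dp_index = 0; dp_index < data_parallelism_degree; dp_index++) {
      int pp_block_idx = layer / num_layers_per_pp_block;
      int first_device_idx = dp_index * num_devices_per_data_parallelism_line +
                             tensor_parallelism_degree * pp_block_idx;
      assert(first_device_idx < num_devices);
      // The layer is sharded across tensor_parallelism_degree consecutive
      // GPUs starting at first_device_idx.
      printf("replica %d, layer %2d -> GPUs %d-%d\n",
             dp_index,
             layer,
             first_device_idx,
             first_device_idx + tensor_parallelism_degree - 1);
    }
  }
  return 0;
}
```

With these example degrees (dp=2, tp=2, pp=2 on 8 GPUs), each data-parallel replica owns a contiguous block of 4 GPUs, and within a replica each pipeline stage of 16 layers is sharded across 2 consecutive GPUs.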
Config llama_config(model_config_file_path); llama_config.printConfig(); - //------------------------------compute machine views ------------------ + //---------------------- parallelization setup work ---------------------- int num_devices = ff.config.workersPerNode * ff.config.numNodes; + int num_transformer_layers = llama_config.n_layers; + assert(num_transformer_layers % ff.config.pipeline_parallelism_degree == 0); + int num_layers_per_pp_block = + num_transformer_layers / ff.config.pipeline_parallelism_degree; + int num_devices_per_data_parallelism_line = + num_devices / ff.config.data_parallelism_degree; + + // std::cout << "dp: " << ff.config.data_parallelism_degree + // << " tp: " << ff.config.tensor_parallelism_degree + // << " pp: " << ff.config.pipeline_parallelism_degree << std::endl; + // std::cout << "num_devices: " << num_devices << std::endl; + // std::cout << "num_transformer_layers: " << num_transformer_layers + // << std::endl; + // std::cout << "num_devices_per_data_parallelism_line: " + // << num_devices_per_data_parallelism_line << std::endl; + // std::cout << "num layers: " << llama_config.n_layers << std::endl; + + //------------------------------compute machine views ------------------ + // single device std::vector machine_views; for (int i = 0; i < num_devices; i++) { MachineView view; @@ -41,6 +60,7 @@ void LLAMA::create_llama_model(FFModel &ff, view.start_device_id = i; machine_views.push_back(view); } + assert(machine_views.size() == num_devices); std::unordered_map> mapping; std::unordered_map weights_layers; @@ -51,7 +71,10 @@ void LLAMA::create_llama_model(FFModel &ff, int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } - mapping[input].push_back(machine_views[0]); + for (int i = 0; i < ff.config.data_parallelism_degree; i++) { + mapping[input].push_back( + machine_views[i * num_devices_per_data_parallelism_line]); + } Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); @@ -78,9 +101,10 @@ void LLAMA::create_llama_model(FFModel &ff, Layer *embedding = ff.layers.back(); weights_layers.emplace("tok_embeddings_weight", embedding); - int num_transformer_layers = llama_config.n_layers; - int num_transformer_layers_per_stage = - (num_transformer_layers + num_pipeline_stages - 1) / num_pipeline_stages; + // int num_transformer_layers = llama_config.n_layers; + // int num_transformer_layers_per_stage = + // (num_transformer_layers + num_pipeline_stages - 1) / + // num_pipeline_stages; for (int i = 0; i < num_transformer_layers; i++) { // step 1: attention @@ -89,12 +113,25 @@ void LLAMA::create_llama_model(FFModel &ff, ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); Layer *attention_norm = ff.layers.back(); - if (i % num_transformer_layers_per_stage == 0) { - // Map att_norm to the next GPU - // since the size of att_norm is minimum across - // all tensors - mapping[att_norm].push_back( - machine_views[i / num_transformer_layers_per_stage]); + // if (i % num_transformer_layers_per_stage == 0) { + // // Map att_norm to the next GPU + // // since the size of att_norm is minimum across + // // all tensors + // mapping[att_norm].push_back( + // machine_views[i / num_transformer_layers_per_stage]); + // } + for (int dp_index = 0; dp_index < ff.config.data_parallelism_degree; + dp_index++) { + int pp_block_idx = i / num_layers_per_pp_block; + int first_device_idx = dp_index * num_devices_per_data_parallelism_line + + ff.config.tensor_parallelism_degree * pp_block_idx; + // std::cout 
<< "assigning layer " << i << " to devices " << + // first_device_idx + // << "-" + // << first_device_idx + ff.config.tensor_parallelism_degree - 1 + // << std::endl; + assert(first_device_idx < num_devices); + mapping[att_norm].push_back(machine_views[first_device_idx]); } weights_layers.emplace("layers_" + std::to_string(i) + diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 1e81e4eba7..499eb92642 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -28,8 +28,27 @@ void OPT::create_opt_model(FFModel &ff, bool use_full_precision) { Config opt_config(model_config_file_path); opt_config.printConfig(); - //------------------------------compute machine views ------------------ + //---------------------- parallelization setup work ---------------------- int num_devices = ff.config.workersPerNode * ff.config.numNodes; + int num_transformer_layers = opt_config.num_hidden_layers; + assert(num_transformer_layers % ff.config.pipeline_parallelism_degree == 0); + int num_layers_per_pp_block = + num_transformer_layers / ff.config.pipeline_parallelism_degree; + int num_devices_per_data_parallelism_line = + num_devices / ff.config.data_parallelism_degree; + + // std::cout << "dp: " << ff.config.data_parallelism_degree + // << " tp: " << ff.config.tensor_parallelism_degree + // << " pp: " << ff.config.pipeline_parallelism_degree << std::endl; + // std::cout << "num_devices: " << num_devices << std::endl; + // std::cout << "num_transformer_layers: " << num_transformer_layers + // << std::endl; + // std::cout << "num_devices_per_data_parallelism_line: " + // << num_devices_per_data_parallelism_line << std::endl; + // std::cout << "num layers: " << opt_config.num_hidden_layers << std::endl; + + //------------------------------compute machine views ------------------ + // single device std::vector machine_views; for (int i = 0; i < num_devices; i++) { MachineView view; @@ -40,6 +59,7 @@ void OPT::create_opt_model(FFModel &ff, view.start_device_id = i; machine_views.push_back(view); } + assert(machine_views.size() == num_devices); std::unordered_map> mapping; std::unordered_map weights_layers; @@ -52,8 +72,12 @@ void OPT::create_opt_model(FFModel &ff, input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } - mapping[input].push_back(machine_views[0]); - mapping[position_input].push_back(machine_views[0]); + for (int i = 0; i < ff.config.data_parallelism_degree; i++) { + mapping[input].push_back( + machine_views[i * num_devices_per_data_parallelism_line]); + mapping[position_input].push_back( + machine_views[i * num_devices_per_data_parallelism_line]); + } Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); std::vector axes = {0}; @@ -118,10 +142,23 @@ void OPT::create_opt_model(FFModel &ff, "_attention_layer_norm_weight", self_attn_layer_norm); - if (i % num_transformer_layers_per_stage == 0) { - mapping[hidden_states].push_back( - machine_views[i / num_transformer_layers_per_stage]); + for (int dp_index = 0; dp_index < ff.config.data_parallelism_degree; + dp_index++) { + int pp_block_idx = i / num_layers_per_pp_block; + int first_device_idx = dp_index * num_devices_per_data_parallelism_line + + ff.config.tensor_parallelism_degree * pp_block_idx; + // std::cout << "assigning layer " << i << " to devices " << + // first_device_idx + // << "-" + // << first_device_idx + ff.config.tensor_parallelism_degree - 1 + // << std::endl; + assert(first_device_idx < num_devices); + 
mapping[hidden_states].push_back(machine_views[first_device_idx]); } + // if (i % num_transformer_layers_per_stage == 0) { + // mapping[hidden_states].push_back( + // machine_views[i / num_transformer_layers_per_stage]); + // } Tensor mha; switch (mode) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 72666ed312..fbb07b2b25 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -44,7 +44,9 @@ void parse_input_args(char **argv, ModelTypes &model_types, bool &use_full_precision, bool &verbose, - int &tensor_parallelism_degree) { + int &data_parallelism_degree, + int &tensor_parallelism_degree, + int &pipeline_parallelism_degree) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -115,11 +117,21 @@ void parse_input_args(char **argv, paths.output_file_path = std::string(argv[++i]); continue; } + // data parallelism degree + if (!strcmp(argv[i], "-data-parallelism-degree")) { + data_parallelism_degree = std::stoi(argv[++i]); + continue; + } // tensor parallelism degree if (!strcmp(argv[i], "-tensor-parallelism-degree")) { tensor_parallelism_degree = std::stoi(argv[++i]); continue; } + // pipeline parallelism degree + if (!strcmp(argv[i], "-pipeline-parallelism-degree")) { + pipeline_parallelism_degree = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; @@ -141,7 +153,9 @@ void FlexFlow::top_level_task(Task const *task, ModelTypes model_types; bool use_full_precision = false; bool verbose = false; - int tensor_parallelism_degree = 1; + size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; + int data_parallelism_degree = 1, tensor_parallelism_degree = 1, + pipeline_parallelism_degree = -1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -152,8 +166,15 @@ void FlexFlow::top_level_task(Task const *task, model_types, use_full_precision, verbose, - tensor_parallelism_degree); + data_parallelism_degree, + tensor_parallelism_degree, + pipeline_parallelism_degree); + ffconfig.data_parallelism_degree = data_parallelism_degree; ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; + ffconfig.pipeline_parallelism_degree = + pipeline_parallelism_degree == -1 + ? 
num_devices / (tensor_parallelism_degree * data_parallelism_degree) + : pipeline_parallelism_degree; if (file_paths.ssm_weight_file_paths.size() == 0) { assert(false && @@ -178,7 +199,7 @@ void FlexFlow::top_level_task(Task const *task, } // Create SentencePiece tokenizer or OPT tokenizer - InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS, 1); + InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); RequestManager rm(model_types.llm_model_type, file_paths.tokenizer_file_path, /*verbose*/ verbose, diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index f844834761..67a78f9700 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -29,12 +29,32 @@ LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); InferenceManager::InferenceManager(FFConfig const &_config, - int _max_num_tokens_per_batch, - int _max_num_inflight_batches) - : ff_config(_config), max_num_tokens_per_batch(_max_num_tokens_per_batch), - max_num_inflight_batches(_max_num_inflight_batches) { - // populate array of valid single-device machine views + int _max_num_tokens_per_batch) + : ff_config(_config), max_num_tokens_per_batch(_max_num_tokens_per_batch) { num_devices = ff_config.workersPerNode * ff_config.numNodes; + // Check parallelization degrees + assert(ff_config.data_parallelism_degree <= num_devices && + "Data parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.data_parallelism_degree == 0 && + "Number of available devices is not divisible by data parallelism " + "degree"); + assert(ff_config.tensor_parallelism_degree <= num_devices && + "Tensor parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.tensor_parallelism_degree == 0 && + "Number of available devices is not divisible by tensor parallelism " + "degree"); + assert(ff_config.pipeline_parallelism_degree <= num_devices && + "Pipeline parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.pipeline_parallelism_degree == 0 && + "Number of available devices is not divisible by pipeline parallelism " + "degree"); + assert(ff_config.data_parallelism_degree * + ff_config.tensor_parallelism_degree * + ff_config.pipeline_parallelism_degree == + num_devices && + "Product of data, tensor, and pipeline parallelism degrees does not " + "match the number of available devices"); + // populate array of valid single-device machine views for (int i = 0; i < num_devices; i++) { MachineView view; view.device_type = MachineView::GPU; @@ -90,6 +110,7 @@ void InferenceManager::compile_model_and_allocate_buffer( assert(pt->owner_op != nullptr); mapping[pt->owner_op] = it.second; } + // std::cout << std::endl << std::endl << "Operators MVs:" << std::endl; for (int op_idx = 0; op_idx < model->operators.size(); op_idx++) { Op const *op = model->operators[op_idx]; // Skip weight operators @@ -100,12 +121,12 @@ void InferenceManager::compile_model_and_allocate_buffer( std::vector machine_views; if (mapping.find(op) != mapping.end()) { machine_views = mapping[op]; - assert(machine_views.size() == max_num_inflight_batches); + assert(machine_views.size() == ff_config.data_parallelism_degree); } else { // Mapping the current operator using the same machine // view as the inputs assert(op->numInputs > 0); - for (int j = 0; j < max_num_inflight_batches; j++) { + for (int j = 0; j < ff_config.data_parallelism_degree; j++) { MachineView mv = 
tensor_buffer[op->inputs[0]][j]->machine_view; for (int k = 1; k < op->numInputs; k++) { if (mv != tensor_buffer[op->inputs[k]][j]->machine_view) { @@ -143,14 +164,14 @@ void InferenceManager::compile_model_and_allocate_buffer( assert(mv.start_device_id + mv.dim[0] <= num_devices); machine_views.push_back(mv); } - assert(machine_views.size() == max_num_inflight_batches); + assert(machine_views.size() == ff_config.data_parallelism_degree); } // std::cout << "operator: " << op->name << std::endl; // for (int i = 0; i < op->numInputs; i++) { // op->inputs[i]->print("input pt"); // std::cout << "input mv: " << op->inputs[i]->machine_view << std::endl; // } - + // std::cout << "Op " << op->name << ": "; for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); @@ -211,7 +232,7 @@ void InferenceManager::compile_model_and_allocate_buffer( } } if (!found_parallel_tensor) { - for (int j = 0; j < max_num_inflight_batches; j++) { + for (int j = 0; j < ff_config.data_parallelism_degree; j++) { // Copy the metadata from pt_base to pt ParallelTensor pt = new ParallelTensorBase(*pt_base); pt->region = @@ -221,6 +242,7 @@ void InferenceManager::compile_model_and_allocate_buffer( pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; + // std::cout << "output mv: " << pt->machine_view << std::endl; Domain part_domain = runtime->get_index_space_domain(ctx, pt_base->parallel_is); assert(pt->machine_view.get_domain() == part_domain); @@ -230,11 +252,12 @@ void InferenceManager::compile_model_and_allocate_buffer( assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); tensor_buffer[pt_base] = list; } + // std::cout << std::endl; } } void InferenceManager::init_operators_inference(FFModel *model) { - for (int batch_index = 0; batch_index < max_num_inflight_batches; + for (int batch_index = 0; batch_index < ff_config.data_parallelism_degree; batch_index++) { int expert_device_index = 0; int device_index = batch_index % num_devices; @@ -290,7 +313,7 @@ FutureMap InferenceManager::inference(FFModel *model, assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) - int batch_index = index % max_num_inflight_batches; + int batch_index = index % ff_config.data_parallelism_degree; FutureMap fm; bool found_input_operator = false; for (size_t o = 0; o < model->operators.size(); o++) { diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 1262ec21d5..3e0d7cac53 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -48,9 +48,13 @@ mkdir -p ../inference/output # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then + # LLAMA + ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_tp.txt -tensor-parallelism-degree 2 # LLAMA (half precision) ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model 
llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_half_tp.txt -tensor-parallelism-degree 2 + # OPT + ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_tp.txt -tensor-parallelism-degree 2 # OPT (half precision) ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half_tp.txt -tensor-parallelism-degree 2 fi @@ -86,6 +90,8 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model, half precision) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp.txt -tensor-parallelism-degree 2 + # LLAMA (big model) + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_tp.txt -tensor-parallelism-degree 2 # LLAMA (big model, half precision) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half_tp.txt -tensor-parallelism-degree 2 @@ -94,6 +100,8 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # OPT (small model, half precision) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -tensor-parallelism-degree 2 + # OPT (big model) + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config 
../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_tp.txt -tensor-parallelism-degree 2 # OPT (big model, half precision) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half_tp.txt -tensor-parallelism-degree 2 fi @@ -143,13 +151,17 @@ compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B ############ Alignment between tensor model parallelism and pipeline parallelism only ################# if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then - # diff <(tail -n +2 "../inference/output/spec_inference_llama_half_tp.txt") <(tail -n +2 "../inference/output/spec_inference_llama_half.txt") + diff <(tail -n +2 "../inference/output/spec_inference_llama_tp.txt") <(tail -n +2 "../inference/output/spec_inference_llama.txt") + diff <(tail -n +2 "../inference/output/spec_inference_opt_tp.txt") <(tail -n +2 "../inference/output/spec_inference_opt.txt") + diff <(tail -n +2 "../inference/output/spec_inference_llama_half_tp.txt") <(tail -n +2 "../inference/output/spec_inference_llama_half.txt") diff <(tail -n +2 "../inference/output/spec_inference_opt_half_tp.txt") <(tail -n +2 "../inference/output/spec_inference_opt_half.txt") diff <(tail -n +2 "../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_160M.txt") - diff <(tail -n +2 "../inference/output/incr_decoding_llama_160M_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_160M_half.txt") + # diff <(tail -n +2 "../inference/output/incr_decoding_llama_160M_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_160M_half.txt") + diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_7B.txt") diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_7B_half.txt") diff <(tail -n +2 "../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_125M.txt") diff <(tail -n +2 "../inference/output/incr_decoding_opt_125M_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_125M_half.txt") + diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_6B.txt") diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_6B_half.txt") fi From e47a1795045c2fc4a0fe4fe54ab87bd601069d55 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 30 Jun 2023 15:01:33 +0800 Subject: [PATCH 161/344] Revert "[Inference] fix bug when init_length + beam_depth > max_num_tokens" (#821) --- .../cpp/inference/mixture_of_experts/moe.h | 4 ++-- include/flexflow/inference.h | 1 - inference/spec_infer/spec_infer.cc | 7 +++--- src/runtime/request_manager.cc | 23 ++----------------- 4 files changed, 7 insertions(+), 28 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h index 4fdd3b2e3f..183229bc07 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ 
b/examples/cpp/inference/mixture_of_experts/moe.h @@ -22,9 +22,9 @@ struct MoeConfig : InferenceConfig { MoeConfig(void) : InferenceConfig() { //----------------------- MoE layer -------------------------------- // total number of experts - num_exp = 64; + num_exp = 128; // number of experts in each block of fused experts - experts_per_block = 16; + experts_per_block = 32; // number of experts to route each token to num_select = 2; // expert capacity parameters diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index ca3a61592f..1fd2fdff78 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -130,7 +130,6 @@ class RequestManager { &inputSerializedTree, std::vector> const &outputSerializedTree); - int get_requests_init_length(BeamSearchBatchConfig const &old_bc); static void load_tokens_task(Legion::Task const *task, diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 2f581b7c34..fbb07b2b25 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -306,16 +306,15 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < num_ssms; i++) { while (true) { - depth = beam_bc_vec[i].current_depth_all_requests(); + beam_bc = beam_bc_vec[i]; + depth = beam_bc.beamRequestsInfo[0].current_depth; FutureMap fm = im.inference(rm.get_model(0), 0, beam_bc_vec[i]); assert(fm.get_future_map_domain().get_volume() == 1); Future future = fm.get_future(0); BeamInferenceResult beam_ir = future.get_result(); - if (depth - 1 >= beam_bc_vec[i].max_beam_depth_all_requests() || - depth + 1 + rm.get_requests_init_length(beam_bc_vec[i]) >= - BatchConfig::MAX_NUM_TOKENS) { + if (depth - 1 >= BeamSearchBatchConfig::MAX_BEAM_DEPTH) { break; } else { beam_bc_vec[i] = rm.prepare_next_batch_beam(beam_bc_vec[i], beam_ir); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2211a8df78..56b9bf6241 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -334,25 +334,6 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, /* ----- Speculative Inference Specific functions ----- */ -int RequestManager::get_requests_init_length( - BeamSearchBatchConfig const &old_bc) { - int init_length = 0; - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { - if (old_bc.request_completed[i]) { - continue; - } - Request &request = - running_request_queue[old_bc.requestsInfo[i].request_guid]; - if (old_bc.requestsInfo[i].token_start_offset + 1 >= - request.tokens.size()) { - init_length = 0; - } else if (request.initial_len > init_length) { - init_length = request.initial_len; - } - } - return init_length; -} - // update beam search metadata BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, @@ -364,7 +345,7 @@ BeamSearchBatchConfig if (verbose) { std::cout << "print all results" << "\n"; - for (int i = 0; i < 64; i++) { + for (int i = 0; i < 40; i++) { std::cout << result.token_ids[i] << ", "; } std::cout << "Current Beam Depth: " @@ -423,7 +404,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].beam_size = old_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].current_depth; + old_bc.beamRequestsInfo[i].max_depth; // do the slot exchange to minimize the cache exchange in kernel. 
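The control flow this revert restores in `spec_infer.cc` is easier to see outside the diff: the small speculative model keeps expanding the beam until the depth cap is reached, without the per-request `init_length` correction that the reverted commit had added (an init-length-aware cap reappears later in this series via `max_init_length`). The sketch below uses an illustrative stand-in type, not FlexFlow's real `BeamSearchBatchConfig`:

```cpp
#include <cstdio>

// Stand-in for BeamSearchBatchConfig, for illustration only.
struct BeamBatchSketch {
  static constexpr int MAX_BEAM_DEPTH = 8;
  int current_depth = 1;
};

int main() {
  BeamBatchSketch beam_bc;
  while (true) {
    int depth = beam_bc.current_depth;
    // ... run one step of SSM inference on the current beam batch ...
    printf("speculation step at depth %d\n", depth);
    if (depth - 1 >= BeamBatchSketch::MAX_BEAM_DEPTH) {
      break; // speculation tree is deep enough; hand off to the LLM verifier
    }
    // prepare_next_batch_beam(): advance the beam by one level
    beam_bc.current_depth++;
  }
  return 0;
}
```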
std::cout << "update metadata" << std::endl; From d038e946e4e0dd5fdf4048e698767b447425dda0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 5 Jul 2023 14:41:31 -0400 Subject: [PATCH 162/344] Merge `master` branch into `inference` (#835) * Fix directory in python example in INSTALL.md (#783) * Remove incomplete sentence in readme (#784) * Fix Code Color in README (#822) Specify code block is Python to have correct coloring in second code block in README.md * Update README.md (#824) Co-authored-by: Zhihao Jia * fix-link (#829) Co-authored-by: Kate Unger * Fix CUDA version in Docker image (11.7.0 to 11.7.1) (#833) --------- Co-authored-by: Colin Unger Co-authored-by: Kate Unger <32380357+KateUnger@users.noreply.github.com> Co-authored-by: Zhihao Jia Co-authored-by: Kate Unger Co-authored-by: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> --- INSTALL.md | 6 +++--- README.md | 9 ++++----- docker/flexflow-environment/Dockerfile | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index b0f8133483..4165683370 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -90,11 +90,11 @@ To run the Python examples, you have two options: you can use the `flexflow_pyth * `export PYTHONPATH="${FF_HOME}/python:${FF_HOME}/build/python"` * `export FF_USE_NATIVE_PYTHON=1` -**We recommend that you run the `mnist_mlp` test under `native` using the following cmd to check if FlexFlow has been installed correctly:** +**We recommend that you run the** `mnist_mlp` **test under** `native` **using the following cmd to check if FlexFlow has been installed correctly:** ``` -cd python -./flexflow_python examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize -ll:zsize +cd "$FF_HOME" +./python/flexflow_python examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize -ll:zsize ``` A script to run all the Python examples is available at `tests/multi_gpu_tests.sh` diff --git a/README.md b/README.md index 0420f8f902..c26904749d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras). ## Install FlexFlow -To install FlexFlow from source code, please read the [instructions](INSTALL.md). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. You can also use `conda` to install the FlexFlow Python package (coming soon). +To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). 
If you would like to quickly try FlexFlow, we also provide pre-built Docker packages ([flexflow-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-cuda) with a CUDA backend, [flexflow-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. You can also use `conda` to install the FlexFlow Python package (coming soon). ## PyTorch Support Users can also use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`. @@ -18,7 +18,7 @@ fx.torch_to_flexflow(model, "mymodel.ff") Second, a FlexFlow program can directly import a previously saved PyTorch model and [autotune](https://www.usenix.org/conference/osdi22/presentation/unger) the parallelization performance for a given parallel machine. -``` +```python from flexflow.pytorch.model import PyTorchModel def top_level_task(): @@ -39,7 +39,7 @@ FlexFlow prioritizes PyTorch compatibility, but also includes frontends for [Ten ## C++ Interface For users that prefer to program in C/C++. FlexFlow supports a C++ program inference that is equivalent to its Python APIs. -**More FlexFlow C++ examples**: see the [C++ examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/c++). +**More FlexFlow C++ examples**: see the [C++ examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp). ## Command-Line Flags @@ -69,12 +69,11 @@ Performance auto-tuning flags: For performance tuning related flags: see [performance autotuning](https://flexflow.ai/search). ## Contributing + Please let us know if you encounter any bugs or have any suggestions by [submitting an issue](https://github.com/flexflow/flexflow/issues). We welcome all contributions to FlexFlow from bug fixes to new features and extensions. -Please subscribe to the FlexFlow users mailing list for - ## Citations * Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. [Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization](https://www.usenix.org/conference/osdi22/presentation/unger). In Proceedings of the Symposium on Operating Systems Design and Implementation (OSDI), July 2022. diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 598690a8a7..43c1599d0f 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.7.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow LABEL org.opencontainers.image.description="FlexFlow environment container" From 869d166916c7167eb9dea39d63419e4163990453 Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Fri, 7 Jul 2023 20:11:07 -0400 Subject: [PATCH 163/344] Fixation. 
(#840) --- include/flexflow/batch_config.h | 1 + inference/spec_infer/spec_infer.cc | 6 +++++- src/runtime/request_manager.cc | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index b56466bfe5..61a1e345ae 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -116,6 +116,7 @@ class BeamSearchBatchConfig : public BatchConfig { inline static int const MAX_BEAM_DEPTH = 8; int model_id; + int max_init_length = 0; struct BeamSearchPerRequestInfo { int beam_size; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index fbb07b2b25..e5a6c8d5e6 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -314,7 +314,11 @@ void FlexFlow::top_level_task(Task const *task, Future future = fm.get_future(0); BeamInferenceResult beam_ir = future.get_result(); - if (depth - 1 >= BeamSearchBatchConfig::MAX_BEAM_DEPTH) { + int iteration = + std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, + BatchConfig::MAX_SEQ_LENGTH - beam_bc.max_init_length); + + if (depth - 1 >= iteration) { break; } else { beam_bc_vec[i] = rm.prepare_next_batch_beam(beam_bc_vec[i], beam_ir); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 56b9bf6241..b47b17ad12 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -357,6 +357,7 @@ BeamSearchBatchConfig // Step 2: preparing the next batch for existing requests BeamSearchBatchConfig new_bc; + new_bc.max_init_length = 0; new_bc.model_id = old_bc.model_id; std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; @@ -634,12 +635,15 @@ BeamSearchBatchConfig } // Step 2: Initialize new request + new_bc.max_init_length = 0; for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { if (!pending_request_queue.empty() && new_bc.num_tokens < BeamSearchBatchConfig::MAX_NUM_TOKENS) { Request new_request = pending_request_queue.front(); pending_request_queue.pop(); + new_bc.max_init_length = + std::max(new_bc.max_init_length, new_request.initial_len); running_request_queue[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; From 93e3896d219496fee4b2b3c4518e20b32c51748f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 8 Jul 2023 09:21:04 -0400 Subject: [PATCH 164/344] [Inference] - Save output of inference test as an artifact (#845) --- .github/workflows/gpu-ci.yml | 9 +++++++++ tests/inference_tests.sh | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index bdbb8a751b..699ca9fc11 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -182,6 +182,15 @@ jobs: # Inference tests export TENSOR_PARALLELISM_TESTS=ON ./tests/inference_tests.sh + cd inference + tar -zcvf output.tar.gz ./output + cd .. 
+ + - name: Save inference output as an artifact + uses: actions/upload-artifact@v3 + with: + name: output + path: inference/output.tar.gz gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 3e0d7cac53..761c6cf332 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -207,4 +207,4 @@ diff <(tail -n +2 "../inference/output/huggingface_opt_125M_half.txt") <(tail -n ############################################################################################### # Clean up after test -cleanup +# cleanup From 53c5617a8e5149ca1475978f391e4eb73c3434c5 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 9 Jul 2023 21:53:47 -0500 Subject: [PATCH 165/344] Using AllReduce instead of Reduce + Replicate when tensor model parallelism is enabled (#813) * [AllReduce] initial implementation * checkpoint * format * fusion * support half precision in fusedop * format * checkpoint * bug fixes * fix a performance issue in linear inference * fix * fix * fix specinfer and incr decoding * update readme * default data_parallelism_degree=1 * fix fusion * reduce unnecessary calculation. * makefile & rocm cmake fixes * only compare first 30 tokens in half precision * fix test script * check incr decoding steps instead of latency * hip rocm fix * makefile fix * more inference test fixes * update fusedop to support specinfer * fix rocm linking issue --------- Co-authored-by: Gabriele Oliaro Co-authored-by: xinhaoc --- .github/README.md | 5 +- .../cpp/inference/mixture_of_experts/moe.cc | 3 +- .../inference/transformers/transformers.cc | 3 +- include/flexflow/config.h | 11 +- include/flexflow/ffconst.h | 2 + include/flexflow/fftype.h | 7 +- include/flexflow/inference.h | 4 +- include/flexflow/model.h | 13 + include/flexflow/operator_params.h | 2 + include/flexflow/ops/arg_topk.h | 9 +- include/flexflow/ops/arg_topk_params.h | 2 + include/flexflow/ops/element_binary.h | 10 +- include/flexflow/ops/element_binary_params.h | 2 + include/flexflow/ops/fused.h | 13 + include/flexflow/ops/kernels/linear_kernels.h | 1 - include/flexflow/ops/layer_norm.h | 8 +- include/flexflow/ops/linear.h | 4 + include/flexflow/parallel_ops/allreduce.h | 70 +++ .../flexflow/parallel_ops/allreduce_params.h | 21 + include/flexflow/parallel_ops/combine.h | 14 + .../parallel_ops/kernels/allreduce_kernels.h | 31 ++ .../parallel_ops/kernels/combine_kernels.h | 1 + include/flexflow/utils/cuda_helper.h | 6 + inference/incr_decoding/incr_decoding.cc | 14 +- inference/models/falcon.cc | 27 +- inference/models/llama.cc | 37 +- inference/models/llama.h | 1 - inference/models/opt.cc | 47 +- inference/models/opt.h | 1 - inference/spec_infer/spec_infer.cc | 23 +- src/ops/arg_topk.cc | 27 +- src/ops/arg_topk.cpp | 7 +- src/ops/arg_topk.cu | 8 +- src/ops/beam_topk.cc | 6 +- src/ops/conv_2d.cc | 6 +- src/ops/element_binary.cc | 61 ++- src/ops/element_unary.cc | 6 +- src/ops/experts.cc | 6 +- src/ops/fused.cc | 155 +++++- src/ops/fused.cpp | 417 +++++++++++++++++ src/ops/fused.cu | 442 +++++++++++++++++- src/ops/gather.cc | 6 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/kernels/linear_kernels.cpp | 37 +- src/ops/kernels/linear_kernels.cu | 39 +- src/ops/layer_norm.cc | 74 +-- src/ops/layer_norm.cpp | 8 +- src/ops/layer_norm.cu | 8 +- src/ops/linear.cc | 83 +++- src/ops/reduce.cc | 6 +- src/ops/reshape.cc | 6 +- src/ops/rms_norm.cc | 23 +- src/ops/tree_inc_multihead_self_attention.cu | 2 +- src/parallel_ops/allreduce.cc | 362 ++++++++++++++ 
src/parallel_ops/combine.cc | 121 ++++- .../kernels/allreduce_kernels.cpp | 46 ++ src/parallel_ops/kernels/allreduce_kernels.cu | 56 +++ src/parallel_ops/kernels/combine_kernels.cpp | 6 + src/parallel_ops/kernels/combine_kernels.cu | 6 + src/runtime/cuda_helper.cu | 24 + src/runtime/ffconst_utils.cc | 2 + src/runtime/fftype.cc | 14 +- src/runtime/graph.cc | 124 +++-- src/runtime/hip_helper.cpp | 17 +- src/runtime/inference_manager.cc | 151 +++--- src/runtime/layer.cc | 10 +- src/runtime/model.cc | 128 ++++- src/runtime/operator_params.cc | 3 + src/runtime/request_manager.cc | 4 + src/runtime/substitution.cc | 21 +- tests/inference_tests.sh | 169 ++++--- 71 files changed, 2605 insertions(+), 486 deletions(-) create mode 100644 include/flexflow/parallel_ops/allreduce.h create mode 100644 include/flexflow/parallel_ops/allreduce_params.h create mode 100644 include/flexflow/parallel_ops/kernels/allreduce_kernels.h create mode 100644 src/parallel_ops/allreduce.cc create mode 100644 src/parallel_ops/kernels/allreduce_kernels.cpp create mode 100644 src/parallel_ops/kernels/allreduce_kernels.cu diff --git a/.github/README.md b/.github/README.md index 576b1ca84e..c4f220e222 100644 --- a/.github/README.md +++ b/.github/README.md @@ -44,7 +44,10 @@ The source code of the SpecInfer pipeline is available at [this folder](../infer * `-ssm-weight`: path to the folder that stores the small speculative models' weights. The number of `-ssm-weight`s must match the number of `-ssm-model`s and `-ssm-config`s. * `-ssm-config`: path to the json file that stores the SSM model configs. The number of `-ssm-config`s must match the number of `-ssm-model`s and `-ssm-weight`s. * `-tokenizer`: path to the tokenizer file (see [Tokenizers](#tokenizers) for preparing a tokenizer for SpecInfer). +* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. * `-prompt`: (optional) path to the prompt file. SpecInfer expects a json format file for prompts, all of which will be served by SpecInfer. In addition, users can also use the following API for registering requests: +* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency + ```c++ class RequestManager { @@ -54,7 +57,7 @@ class RequestManager { For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. 
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -tensor-parallelism-degree 2 -pipeline-parallelism-degree 2 ``` ### Tokenizers diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 39459d63ac..ff3f6bb53a 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -140,8 +140,7 @@ void FlexFlow::top_level_task(Task const *task, //------------------- Initialize the inference manager ------------------ InferenceManager im(ff.config, moeConfig.batch_size); - std::unordered_map> mapping; - im.compile_model_and_allocate_buffer(&ff, mapping); + im.compile_model_and_allocate_buffer(&ff); im.init_operators_inference(&ff); //------------ Initialize the data loader and data generator ------------ diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index d56473c8bd..074e832d47 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -115,8 +115,7 @@ void FlexFlow::top_level_task(Task const *task, //------------------- Initialize the inference manager ------------------ InferenceManager im(ff.config, transformerConfig.batch_size); - std::unordered_map> mapping; - im.compile_model_and_allocate_buffer(&ff, mapping); + im.compile_model_and_allocate_buffer(&ff); im.init_operators_inference(&ff); //------------ Initialize the data loader and data generator ------------ diff --git a/include/flexflow/config.h b/include/flexflow/config.h index f1b218e50f..be6c0d21da 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -37,14 +37,15 @@ namespace FlexFlow { // ======================================================== // Define Runtime Constants // ======================================================== -#define MAX_NUM_INPUTS 256 -#define MAX_NUM_WEIGHTS 64 -#define MAX_NUM_OUTPUTS 256 -#define MAX_NUM_FUSED_OPERATORS 64 -#define MAX_NUM_FUSED_TENSORS 64 +#define MAX_NUM_INPUTS 2048 +#define MAX_NUM_WEIGHTS 2048 +#define MAX_NUM_OUTPUTS 2048 +#define MAX_NUM_FUSED_OPERATORS 2048 +#define MAX_NUM_FUSED_TENSORS 2048 #define MAX_NUM_WORKERS 1024 #define MAX_FILENAME 200 #define MAX_OPNAME 128 +#define MAX_NUM_TRANSFORMER_LAYERS 100 // DataLoader #define MAX_SAMPLES_PER_LOAD 64 #define MAX_FILE_LENGTH 128 diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 0b572a9674..3d899ac91d 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -173,6 +173,7 @@ enum OperatorType { OP_REPLICATE, OP_REDUCTION, OP_PIPELINE, + OP_ALLREDUCE, OP_FUSED_PARALLEL, OP_INVALID, }; @@ -207,6 +208,7 @@ enum PMParameter { 
PM_COMBINE_DEGREE, // Combine PM_REDUCTION_DIM, // Reduction PM_REDUCTION_DEGREE, // Reduction + PM_ALLREDUCE_DIM, // AllReduce PM_SOFTMAX_DIM, // Softmax PM_NUM_HEADS, // MultiHeadAttention PM_INVALID, diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index a71c85dbc8..18ed6b8100 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -8,15 +8,16 @@ namespace FlexFlow { class LayerID { public: + static const LayerID NO_ID; LayerID(); - LayerID(size_t id); + LayerID(size_t id, size_t transformer_layer_id); bool is_valid_id() const; friend bool operator==(LayerID const &lhs, LayerID const &rhs); public: - size_t id; + size_t id, transformer_layer_id; }; }; // namespace FlexFlow -#endif // _FF_TYPE_H \ No newline at end of file +#endif // _FF_TYPE_H diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 1fd2fdff78..a1846c96dc 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -29,9 +29,7 @@ using tokenizers::Tokenizer; class InferenceManager { public: InferenceManager(FFConfig const &config, int max_num_tokens_per_batch); - void compile_model_and_allocate_buffer( - FFModel *model, - std::unordered_map> const &mapping); + void compile_model_and_allocate_buffer(FFModel *model); void init_operators_inference(FFModel *model); MachineView *get_machine_view(int mv_id); Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 1277b29b3d..2b95eecac0 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -104,6 +104,7 @@ enum TaskIDs { LAYERNORM_BWD_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, + LINEAR_INF_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, @@ -159,6 +160,7 @@ enum TaskIDs { FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, FUSEDOP_BWD_TASK_ID, + FUSEDOP_INF_TASK_ID, NOOP_INIT_TASK_ID, // Metrics tasks METRICS_COMP_TASK_ID, @@ -212,6 +214,9 @@ enum TaskIDs { PIPELINE_INIT_TASK_ID, PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, + ALLREDUCE_INIT_TASK_ID, + ALLREDUCE_FWD_TASK_ID, + ALLREDUCE_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, @@ -311,6 +316,7 @@ class Combine; class Repartition; class Reduction; class Replicate; +class AllReduce; class FusedParallelOp; class ParallelOpInfo; @@ -897,6 +903,9 @@ class FFModel { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + // ======================================== + // Internal APIs that should not be invoked from applications + // ======================================== void reset_metrics(); void init_operators(); void init_operators_inference( @@ -919,6 +928,7 @@ class FFModel { std::vector const &metrics, CompMode comp_mode = COMP_MODE_TRAINING); void compile_inference(); + void set_transformer_layer_id(int id); void graph_optimize(size_t budget, bool only_data_parallel, std::unique_ptr &best_graph, @@ -975,6 +985,7 @@ class FFModel { public: size_t op_global_guid, layer_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; + size_t current_transformer_layer_id; FFConfig config; FFIterationConfig iter_config; Optimizer *optimizer; @@ -1078,6 +1089,8 @@ class FFModel { Reduction *>, std::unordered_map, Combine *>, + std::unordered_map, + AllReduce *>, std::unordered_map, FusedParallelOp *>> cached_ops; diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 
8c52dfb584..f6918ff581 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -32,6 +32,7 @@ #include "flexflow/ops/topk_params.h" #include "flexflow/ops/transpose_params.h" #include "flexflow/ops/tree_inc_multihead_self_attention_params.h" +#include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" #include "flexflow/parallel_ops/partition_params.h" @@ -76,6 +77,7 @@ using OperatorParameters = mp::variant; tl::optional get_op_parameters(Op const *op); diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h index a00ab76385..ed92200fbe 100644 --- a/include/flexflow/ops/arg_topk.h +++ b/include/flexflow/ops/arg_topk.h @@ -19,11 +19,15 @@ class ArgTopK : public Op { using Params = ArgTopKParams; using Input = ParallelTensor; ArgTopK(FFModel &model, + LayerID const &layer_guid, const ParallelTensor input, int k, bool sorted, char const *name); - ArgTopK(FFModel &model, ArgTopK const &other, const ParallelTensor input); + ArgTopK(FFModel &model, + LayerID const &layer_guid, + ArgTopK const &other, + const ParallelTensor input); ArgTopK(FFModel &model, Params const ¶ms, Input const input, @@ -80,7 +84,8 @@ class ArgTopK : public Op { ffStream_t stream); static void forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &indices); + GenericTensorAccessorW const &indices, + int batch_size); Params get_params() const; public: diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index ca88a5b9be..9d2a21034f 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -2,11 +2,13 @@ #define _FLEXFLOW_ARG_TOPK_PARAMS_H #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { struct ArgTopKParams { + LayerID layer_guid; int k; bool sorted; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 9c2e6c1252..fe7dc2602c 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -15,6 +15,7 @@ class ElementBinary : public Op { using Input = std::pair; ElementBinary(FFModel &model, + LayerID const &layer_guid, OperatorType type, const ParallelTensor x, const ParallelTensor y, @@ -23,8 +24,7 @@ class ElementBinary : public Op { ElementBinary(FFModel &model, Params const ¶ms, Input const &inputs, - char const *name = nullptr, - bool inplace_a = false); + char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -63,6 +63,12 @@ class ElementBinary : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; + + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); Params get_params() const; public: diff --git a/include/flexflow/ops/element_binary_params.h b/include/flexflow/ops/element_binary_params.h index 5aa20e25a5..8b26877af2 100644 --- a/include/flexflow/ops/element_binary_params.h +++ b/include/flexflow/ops/element_binary_params.h @@ -7,7 +7,9 @@ namespace FlexFlow { struct ElementBinaryParams { + LayerID layer_guid; OperatorType type; + bool inplace_a; bool is_valid( std::pair 
const &) const; diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index 87d35da902..87e562d143 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -29,8 +29,17 @@ class FusedOp : public Op { return ParallelTensor(); } void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -38,6 +47,10 @@ class FusedOp : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 9644fd9c8f..29791b53ff 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -33,7 +33,6 @@ class LinearMeta : public OpMeta { RegularizerMode kernel_reg_type; float kernel_reg_lambda; bool use_bias, add_bias_only_once; - DataType input_type, weight_type, output_type; char op_name[MAX_OPNAME]; }; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index b962edf326..b5a36262b4 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -72,14 +72,14 @@ class LayerNorm : public Op { static void forward_kernel(LayerNormMeta const *m, T const *input_ptr, T *output_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, ffStream_t stream); static void forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta); + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index 7b134502b7..ff6ba1ef90 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -62,6 +62,10 @@ class Linear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h new file mode 100644 index 0000000000..2faf128d93 --- /dev/null +++ b/include/flexflow/parallel_ops/allreduce.h @@ -0,0 +1,70 @@ +#ifndef _FLEXFLOW_ALLREDUCE_H +#define _FLEXFLOW_ALLREDUCE_H + +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_ops/allreduce_params.h" +#include "parallel_op.h" + +namespace FlexFlow { + +class AllReduce : public ParallelOp { +public: + using Params = AllReduceParams; + using Input = ParallelTensor; + + AllReduce(FFModel &model, + const 
ParallelTensor input, + int allreduce_legion_dim, + char const *name = NULL); + AllReduce(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) override; + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void backward(FFModel const &) override; + bool get_int_parameter(PMParameter, int *) const override; + bool append_parallel_op_info( + std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + + Params get_params() const; + +public: + int allreduce_dim; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_ALLREDUCE_H diff --git a/include/flexflow/parallel_ops/allreduce_params.h b/include/flexflow/parallel_ops/allreduce_params.h new file mode 100644 index 0000000000..c04676ffeb --- /dev/null +++ b/include/flexflow/parallel_ops/allreduce_params.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_ALLREDUCE_PARAMS_H +#define _FLEXFLOW_ALLREDUCE_PARAMS_H + +namespace FlexFlow { + +struct AllReduceParams { + int allreduce_legion_dim; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(AllReduceParams const &, AllReduceParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::AllReduceParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_ALLREDUCE_PARAMS_H diff --git a/include/flexflow/parallel_ops/combine.h b/include/flexflow/parallel_ops/combine.h index 310e599f54..d09a789de2 100644 --- a/include/flexflow/parallel_ops/combine.h +++ b/include/flexflow/parallel_ops/combine.h @@ -3,6 +3,7 @@ #include "flexflow/layer.h" #include "flexflow/node.h" +#include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/parallel_ops/combine_params.h" #include "parallel_op.h" @@ -24,8 +25,21 @@ class Combine : public ParallelOp { Input const input, char const *name = nullptr); void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) override; void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( diff --git 
a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h new file mode 100644 index 0000000000..02a5026fcf --- /dev/null +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/allreduce.h" + +namespace FlexFlow { + +class AllReduceMeta : public OpMeta { +public: + AllReduceMeta(FFHandler handle, AllReduce const *reduct); +}; + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H diff --git a/include/flexflow/parallel_ops/kernels/combine_kernels.h b/include/flexflow/parallel_ops/kernels/combine_kernels.h index 6f540679a2..456013cd81 100644 --- a/include/flexflow/parallel_ops/kernels/combine_kernels.h +++ b/include/flexflow/parallel_ops/kernels/combine_kernels.h @@ -4,6 +4,7 @@ #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/combine.h" namespace FlexFlow { diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 5ac4571118..1787c5a0b7 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -4,6 +4,9 @@ #include "legion.h" #include #include +#ifdef FF_USE_NCCL +#include +#endif #define FatalError(s) \ do { \ @@ -165,6 +168,9 @@ cudnnStatus_t cudaDataType_t ff_to_cuda_datatype(DataType type); cudnnDataType_t ff_to_cudnn_datatype(DataType type); +#ifdef FF_USE_NCCL +ncclDataType_t ff_to_nccl_datatype(DataType type); +#endif cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index d43cab17f9..68a8e10042 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -126,7 +126,7 @@ void FlexFlow::top_level_task(Task const *task, bool verbose = false; size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; int data_parallelism_degree = 1, tensor_parallelism_degree = 1, - pipeline_parallelism_degree = -1; + pipeline_parallelism_degree = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -142,10 +142,10 @@ void FlexFlow::top_level_task(Task const *task, pipeline_parallelism_degree); ffconfig.data_parallelism_degree = data_parallelism_degree; ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; - ffconfig.pipeline_parallelism_degree = - pipeline_parallelism_degree == -1 - ? 
num_devices / (tensor_parallelism_degree * data_parallelism_degree) - : pipeline_parallelism_degree; + ffconfig.pipeline_parallelism_degree = pipeline_parallelism_degree; + assert(data_parallelism_degree * tensor_parallelism_degree * + pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -162,8 +162,6 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes / - tensor_parallelism_degree, INC_DECODING_MODE, use_full_precision); } else if (model_type == ModelType::OPT) { @@ -171,8 +169,6 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes / - tensor_parallelism_degree, INC_DECODING_MODE, use_full_precision); } else if (model_type == ModelType::FALCON) { diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 7fc3124278..bced5dc1e0 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -28,20 +28,6 @@ void FALCON::create_falcon_model(FFModel &ff, bool use_full_precision) { Config falcon_config(model_config_file_path); falcon_config.printConfig(); - //------------------------------compute machine views ------------------ - int num_devices = ff.config.workersPerNode * ff.config.numNodes; - std::vector machine_views; - for (int i = 0; i < num_devices; i++) { - MachineView view; - view.device_type = MachineView::GPU; - view.ndims = 1; - view.dim[0] = 1; - view.stride[0] = 0; - view.start_device_id = i; - machine_views.push_back(view); - } - - std::unordered_map> mapping; std::unordered_map weights_layers; Tensor input; @@ -50,7 +36,6 @@ void FALCON::create_falcon_model(FFModel &ff, int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } - mapping[input].push_back(machine_views[0]); Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); @@ -83,18 +68,12 @@ void FALCON::create_falcon_model(FFModel &ff, (num_transformer_layers + num_pipeline_stages - 1) / num_pipeline_stages; for (int i = 0; i < num_transformer_layers; i++) { + // set transformer layer id + ff.set_transformer_layer_id(i); // step 1: attention Tensor att_norm = ff.layer_norm(token, axes, true, falcon_config.norm_eps); Layer *attention_norm = ff.layers.back(); - if (i % num_transformer_layers_per_stage == 0) { - // Map att_norm to the next GPU - // since the size of att_norm is minimum across - // all tensors - mapping[att_norm].push_back( - machine_views[i / num_transformer_layers_per_stage]); - } - weights_layers.emplace("layers_" + std::to_string(i) + "_input_layernorm_weight", attention_norm); @@ -162,7 +141,7 @@ void FALCON::create_falcon_model(FFModel &ff, // Compile the model std::cout << "------start compile ----------" << std::endl; - im.compile_model_and_allocate_buffer(&ff, mapping); + im.compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, falcon_config.n_heads, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 1e61f43a98..e54ec13147 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -23,7 +23,6 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, - int num_pipeline_stages, InferenceMode 
mode, bool use_full_precision) { // do not apply cpu offload in beam search model. @@ -62,7 +61,6 @@ void LLAMA::create_llama_model(FFModel &ff, } assert(machine_views.size() == num_devices); - std::unordered_map> mapping; std::unordered_map weights_layers; Tensor input; @@ -71,10 +69,6 @@ void LLAMA::create_llama_model(FFModel &ff, int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } - for (int i = 0; i < ff.config.data_parallelism_degree; i++) { - mapping[input].push_back( - machine_views[i * num_devices_per_data_parallelism_line]); - } Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); @@ -101,39 +95,14 @@ void LLAMA::create_llama_model(FFModel &ff, Layer *embedding = ff.layers.back(); weights_layers.emplace("tok_embeddings_weight", embedding); - // int num_transformer_layers = llama_config.n_layers; - // int num_transformer_layers_per_stage = - // (num_transformer_layers + num_pipeline_stages - 1) / - // num_pipeline_stages; - for (int i = 0; i < num_transformer_layers; i++) { + // set transformer layer id + ff.set_transformer_layer_id(i); // step 1: attention std::vector axes = {2}; Tensor att_norm = ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); Layer *attention_norm = ff.layers.back(); - - // if (i % num_transformer_layers_per_stage == 0) { - // // Map att_norm to the next GPU - // // since the size of att_norm is minimum across - // // all tensors - // mapping[att_norm].push_back( - // machine_views[i / num_transformer_layers_per_stage]); - // } - for (int dp_index = 0; dp_index < ff.config.data_parallelism_degree; - dp_index++) { - int pp_block_idx = i / num_layers_per_pp_block; - int first_device_idx = dp_index * num_devices_per_data_parallelism_line + - ff.config.tensor_parallelism_degree * pp_block_idx; - // std::cout << "assigning layer " << i << " to devices " << - // first_device_idx - // << "-" - // << first_device_idx + ff.config.tensor_parallelism_degree - 1 - // << std::endl; - assert(first_device_idx < num_devices); - mapping[att_norm].push_back(machine_views[first_device_idx]); - } - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_norm_weight", attention_norm); @@ -246,7 +215,7 @@ void LLAMA::create_llama_model(FFModel &ff, // Compile the model std::cout << "------start compile ----------" << std::endl; - im.compile_model_and_allocate_buffer(&ff, mapping); + im.compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, llama_config.n_heads, diff --git a/inference/models/llama.h b/inference/models/llama.h index 11fc354a2c..ab9bd4c7f3 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -106,7 +106,6 @@ class LLAMA { InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, - int num_pipeline_stages, InferenceMode mode, bool use_full_precision = false); }; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 499eb92642..503be39672 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -23,7 +23,6 @@ void OPT::create_opt_model(FFModel &ff, InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, - int num_pipeline_stages, InferenceMode mode, bool use_full_precision) { Config opt_config(model_config_file_path); @@ -47,21 +46,6 @@ void OPT::create_opt_model(FFModel &ff, // << num_devices_per_data_parallelism_line << std::endl; // std::cout << "num layers: " << opt_config.num_hidden_layers << 
std::endl; - //------------------------------compute machine views ------------------ - // single device - std::vector machine_views; - for (int i = 0; i < num_devices; i++) { - MachineView view; - view.device_type = MachineView::GPU; - view.ndims = 1; - view.dim[0] = 1; - view.stride[0] = 0; - view.start_device_id = i; - machine_views.push_back(view); - } - assert(machine_views.size() == num_devices); - - std::unordered_map> mapping; std::unordered_map weights_layers; //------------------------------ build the model -------------------------- @@ -72,12 +56,6 @@ void OPT::create_opt_model(FFModel &ff, input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } - for (int i = 0; i < ff.config.data_parallelism_degree; i++) { - mapping[input].push_back( - machine_views[i * num_devices_per_data_parallelism_line]); - mapping[position_input].push_back( - machine_views[i * num_devices_per_data_parallelism_line]); - } Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); std::vector axes = {0}; @@ -127,9 +105,10 @@ void OPT::create_opt_model(FFModel &ff, Tensor residual = ff.add(token, positional_embedding); - int num_transformer_layers_per_stage = - (32 + num_pipeline_stages - 1) / num_pipeline_stages; for (int i = 0; i < opt_config.num_hidden_layers; i++) { + // set transformer layer id + ff.set_transformer_layer_id(i); + // 125m, 1.7B, ..., 175B applies layer norm BEFORE attention, // 350m applies layer norm AFTER attention // https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#LL324C1-L325C1 @@ -142,24 +121,6 @@ void OPT::create_opt_model(FFModel &ff, "_attention_layer_norm_weight", self_attn_layer_norm); - for (int dp_index = 0; dp_index < ff.config.data_parallelism_degree; - dp_index++) { - int pp_block_idx = i / num_layers_per_pp_block; - int first_device_idx = dp_index * num_devices_per_data_parallelism_line + - ff.config.tensor_parallelism_degree * pp_block_idx; - // std::cout << "assigning layer " << i << " to devices " << - // first_device_idx - // << "-" - // << first_device_idx + ff.config.tensor_parallelism_degree - 1 - // << std::endl; - assert(first_device_idx < num_devices); - mapping[hidden_states].push_back(machine_views[first_device_idx]); - } - // if (i % num_transformer_layers_per_stage == 0) { - // mapping[hidden_states].push_back( - // machine_views[i / num_transformer_layers_per_stage]); - // } - Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { @@ -279,7 +240,7 @@ void OPT::create_opt_model(FFModel &ff, //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; - im.compile_model_and_allocate_buffer(&ff, mapping); + im.compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, opt_config.num_attention_heads, diff --git a/inference/models/opt.h b/inference/models/opt.h index 77d9aae962..d5fa845cd5 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -108,7 +108,6 @@ class OPT { InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, - int num_pipeline_stages, InferenceMode mode, bool use_full_precision = false); }; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index e5a6c8d5e6..9cdcb454a2 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -155,7 +155,7 @@ void FlexFlow::top_level_task(Task const *task, bool verbose = 
false; size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; int data_parallelism_degree = 1, tensor_parallelism_degree = 1, - pipeline_parallelism_degree = -1; + pipeline_parallelism_degree = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -171,10 +171,10 @@ void FlexFlow::top_level_task(Task const *task, pipeline_parallelism_degree); ffconfig.data_parallelism_degree = data_parallelism_degree; ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; - ffconfig.pipeline_parallelism_degree = - pipeline_parallelism_degree == -1 - ? num_devices / (tensor_parallelism_degree * data_parallelism_degree) - : pipeline_parallelism_degree; + ffconfig.pipeline_parallelism_degree = pipeline_parallelism_degree; + assert(data_parallelism_degree * tensor_parallelism_degree * + pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); if (file_paths.ssm_weight_file_paths.size() == 0) { assert(false && @@ -212,8 +212,6 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes / - tensor_parallelism_degree, TREE_VERIFY_MODE, use_full_precision); } else if (model_types.llm_model_type == ModelType::OPT) { @@ -221,8 +219,6 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes / - tensor_parallelism_degree, TREE_VERIFY_MODE, use_full_precision); } else { @@ -233,8 +229,11 @@ void FlexFlow::top_level_task(Task const *task, int num_ssms = model_types.ssm_model_types.size(); std::vector ssm_model_ids; std::vector ssm_models; + FFConfig bm_config = ffconfig; + bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = + bm_config.pipeline_parallelism_degree = 1; for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { - FFModel beam_model(ffconfig); + FFModel beam_model(bm_config); ssm_models.push_back(beam_model); } @@ -245,7 +244,6 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.ssm_config_file_paths[ssm_id], file_paths.ssm_weight_file_paths[ssm_id], - 1, BEAM_SEARCH_MODE, use_full_precision); } else if (model_types.ssm_model_types[ssm_id] == ModelType::OPT) { @@ -253,7 +251,6 @@ void FlexFlow::top_level_task(Task const *task, im, file_paths.ssm_config_file_paths[ssm_id], file_paths.ssm_weight_file_paths[ssm_id], - 1, BEAM_SEARCH_MODE, use_full_precision); } else { @@ -352,4 +349,4 @@ void FlexFlow::top_level_task(Task const *task, std::cout << "----------inference finished--------------" << std::endl; } -void FlexFlow::register_custom_tasks() {} \ No newline at end of file +void FlexFlow::register_custom_tasks() {} diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index eedd89bd5f..a604c016d2 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -88,7 +88,8 @@ Op *ArgTopK::create_operator_from_layer( int k = value; layer->get_int_property("sorted", value); bool sorted = (bool)value; - return new ArgTopK(model, inputs[0], k, sorted, layer->name); + return new ArgTopK( + model, layer->layer_guid, inputs[0], k, sorted, layer->name); } ArgTopKParams ArgTopK::get_params() const { @@ -108,6 +109,7 @@ bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) { } ArgTopK::ArgTopK(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor _input, int _k, bool _sorted, @@ -121,6 +123,8 @@ ArgTopK::ArgTopK(FFModel &model, 1 /*outputs*/, _input), 
k(_k), sorted(_sorted) { + // overwrite layer_guid + layer_guid = _layer_guid; int numdim = inputs[0]->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; for (int i = 0; i < numdim; i++) { @@ -136,15 +140,16 @@ ArgTopK::ArgTopK(FFModel &model, } ArgTopK::ArgTopK(FFModel &model, + LayerID const &layer_guid, ArgTopK const &other, const ParallelTensor input) - : ArgTopK(model, input, other.k, other.sorted, other.name) {} + : ArgTopK(model, layer_guid, input, other.k, other.sorted, other.name) {} ArgTopK::ArgTopK(FFModel &model, ArgTopKParams const ¶ms, const ParallelTensor input, char const *name) - : ArgTopK(model, input, params.k, params.sorted, name) {} + : ArgTopK(model, params.layer_guid, input, params.k, params.sorted, name) {} void ArgTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -260,7 +265,7 @@ FutureMap ArgTopK::inference(FFModel const &ff, << std::endl; */ IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&bc, sizeof(BatchConfig)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -295,6 +300,7 @@ InferenceResult assert(regions.size() == 2); assert(task->regions.size() == 2); // const ArgTopK* topk = (const ArgTopK*) task->args; + BatchConfig const *bc = (BatchConfig *)task->args; ArgTopKMeta const *m = *((ArgTopKMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( @@ -302,10 +308,11 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - ArgTopK::forward_kernel_wrapper(m, input, indices); + int batch_size = bc->num_active_tokens(); + ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; - int batch_size = input.domain.get_volume() / length; + batch_size = input.domain.get_volume() / length; InferenceResult ir; download_tensor( @@ -319,6 +326,8 @@ void ArgTopK::backward(FFModel const &ff) { } void ArgTopK::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->k); sez.serialize(this->sorted); } @@ -328,11 +337,16 @@ Node ArgTopK::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 1); + size_t id, transformer_layer_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); int k; bool sorted; dez.deserialize(k); dez.deserialize(sorted); ArgTopKParams params; + params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; return ff.get_or_create_node(inputs[0], params); @@ -357,6 +371,7 @@ namespace std { size_t hash::operator()( FlexFlow::ArgTopKParams const ¶ms) const { size_t key = 0; + hash_combine(key, params.layer_guid.id); hash_combine(key, params.k); hash_combine(key, params.sorted); return key; diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index d055e09def..4937166b66 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -411,7 +411,8 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, // float *output_ptr, - GenericTensorAccessorW const &indices) { + GenericTensorAccessorW const &indices, + int batch_size) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // Domain in1_domain = runtime->get_index_space_domain( @@ -442,8 +443,8 @@ void 
ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; int k = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; /*TODO: This prints to 5*/ - size_t batch_size = input.domain.get_volume() / length; - assert(indices.domain.get_volume() / k == batch_size); + // size_t batch_size = input.domain.get_volume() / length; + // assert(indices.domain.get_volume() / k == batch_size); hipEvent_t t_start, t_end; if (m->profiling) { diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 9583af525e..575e0183b4 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -406,7 +406,8 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, // float *output_ptr, - GenericTensorAccessorW const &indices) { + GenericTensorAccessorW const &indices, + int batch_size) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -438,9 +439,8 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; int k = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; /*TODO: This prints to 5*/ - size_t batch_size = input.domain.get_volume() / length; - assert(indices.domain.get_volume() / k == batch_size); - + // batch_size = input.domain.get_volume() / length; + // assert(indices.domain.get_volume() / k == batch_size); cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index d67c84a9df..db507c1729 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -429,6 +429,7 @@ void BeamTopK::backward(FFModel const &ff) { void BeamTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->sorted); sez.serialize(this->max_beam_width); } @@ -439,10 +440,11 @@ Node BeamTopK::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 1); bool sorted; - size_t id; + size_t id, transformer_layer_id; int max_beam_width; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(sorted); dez.deserialize(max_beam_width); BeamTopKParams params; diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 786c3427e9..ce7b6ebc01 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -1012,6 +1012,7 @@ bool Conv2D::estimate_sync_cost(Simulator *sim, void Conv2D::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->out_channels); sez.serialize(this->kernel_h); sez.serialize(this->kernel_w); @@ -1036,9 +1037,10 @@ Node Conv2D::deserialize(FFModel &ff, padding_w, groups; bool use_bias; ActiMode activation; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(out_channels); dez.deserialize(kernel_h); dez.deserialize(kernel_w); diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index cf90919e6b..7562a727d7 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -97,8 +97,13 @@ Op *ElementBinary::create_operator_from_layer( long long value; layer->get_int_property("inplace_a", value); bool inplace_a = (bool)value; - return new ElementBinary( - model, 
layer->op_type, inputs[0], inputs[1], inplace_a, layer->name); + return new ElementBinary(model, + layer->layer_guid, + layer->op_type, + inputs[0], + inputs[1], + inplace_a, + layer->name); } Tensor FFModel::add(const Tensor in1, @@ -166,10 +171,12 @@ bool ElementBinaryParams::is_valid( bool operator==(ElementBinaryParams const &lhs, ElementBinaryParams const &rhs) { - return lhs.type == rhs.type; + return lhs.type == rhs.type && lhs.layer_guid == rhs.layer_guid && + lhs.inplace_a == rhs.inplace_a; } ElementBinary::ElementBinary(FFModel &model, + LayerID const &_layer_guid, OperatorType _op_type, const ParallelTensor in1, const ParallelTensor in2, @@ -185,6 +192,8 @@ ElementBinary::ElementBinary(FFModel &model, in1, in2), inplace_a(_inplace_a) { + // overwrite layer_guid + layer_guid = _layer_guid; numOutputs = 1; numWeights = 0; assert(in1->data_type == in2->data_type); @@ -217,10 +226,14 @@ ElementBinary::ElementBinary( FFModel &model, ElementBinaryParams const ¶ms, std::pair const &inputs, - char const *name, - bool inplace_a) - : ElementBinary( - model, params.type, inputs.first, inputs.second, inplace_a, name) {} + char const *name) + : ElementBinary(model, + params.layer_guid, + params.type, + inputs.first, + inputs.second, + params.inplace_a, + name) {} void ElementBinary::map_output_tensors(FFModel &ff) { if (has_inplace_output()) { @@ -975,9 +988,41 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, return true; } +void ElementBinary::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->op_type); + sez.serialize(this->inplace_a); +} + +using PCG::Node; +/*static*/ +Node ElementBinary::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + OperatorType op_type; + size_t id, transformer_layer_id; + bool inplace_a; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(op_type); + dez.deserialize(inplace_a); + + ElementBinaryParams params; + params.layer_guid = layer_guid; + params.type = op_type; + params.inplace_a = inplace_a; + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + ElementBinaryParams ElementBinary::get_params() const { ElementBinaryParams params; + params.layer_guid = this->layer_guid; params.type = this->op_type; + params.inplace_a = this->inplace_a; return params; } @@ -987,7 +1032,9 @@ namespace std { size_t hash::operator()( FlexFlow::ElementBinaryParams const ¶ms) const { size_t key = 0; + hash_combine(key, params.layer_guid.id); hash_combine(key, params.type); + hash_combine(key, params.inplace_a); return key; } }; // namespace std diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index f0713dd0a1..69533db53d 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -672,6 +672,7 @@ void ElementUnary::serialize(Legion::Serializer &sez) const { sez.serialize(this->inplace); sez.serialize(scalar); sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); } bool ElementUnary::measure_operator_cost(Simulator *sim, @@ -782,9 +783,10 @@ Node ElementUnary::deserialize(FFModel &ff, dez.deserialize(op_type); dez.deserialize(inplace); dez.deserialize(scalar); - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, 
transformer_layer_id); ElementUnaryParams params; params.op_type = op_type; diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 77cd748f9c..06e007abef 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -396,6 +396,7 @@ Experts::Experts(FFModel &model, void Experts::serialize(Legion::Serializer &sez) const { ExpertsParams params = get_params(); sez.serialize(params.layer_guid.id); + sez.serialize(params.layer_guid.transformer_layer_id); sez.serialize(params.num_experts); sez.serialize(params.experts_start_idx); sez.serialize(params.experts_output_dim_size); @@ -416,9 +417,10 @@ Node Experts::deserialize(FFModel &ff, float alpha; ActiMode activation; bool use_bias; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(num_experts); dez.deserialize(experts_start_idx); dez.deserialize(experts_output_dim_size); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 3dc442708f..cf01f5bd1e 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -100,6 +100,7 @@ FusedOp::FusedOp(FFModel &model, Op *op) op_num_outputs[0] = op->numOutputs; op_op_type[0] = op->op_type; operators[0] = op; + layer_guid = op->layer_guid; // for (int i = 0; i < numInputs; i++) { // op_input_source[i] = SOURCE_INPUT; // op_input_idx[i] = i; @@ -127,9 +128,9 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { // assert(model.config.find_parallel_config(my_domain.get_dim(), name, // my_config)); assert(model.config.find_parallel_config(op_domain.get_dim(), // op->name, op_config)); - // Cannot fuse parallel operators since they have different paralel_is - // in forward and backward - assert(!op->is_parallel_op()); + // Cannot fuse parallel operators (except allreduce) since they have different + // paralel_is in forward and backward + assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE); // Currently don't consider nested fusion assert(op->op_type != OP_FUSED); MachineView my_view = outputs[0]->machine_view; @@ -149,12 +150,14 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { (weight_offset + op->numWeights > MAX_NUM_FUSED_TENSORS) || (output_offset + op->numOutputs > MAX_NUM_FUSED_TENSORS)) { fprintf(stderr, "Cannot fuse. Consider increase MAX_NUM_FUSED_TENSORS\n"); + assert(false); return false; } if (numOperators + 1 > MAX_NUM_FUSED_OPERATORS) { fprintf( stderr, "Reach to the fusion limit. 
Consider increase MAX_NUM_FUSED_OPERATORS"); + assert(false); return false; } // Set inputs @@ -331,6 +334,92 @@ void FusedOp::init(FFModel const &ff) { } } +void FusedOp::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + // Call init methods in individual operators + Domain domain = runtime->get_index_space_domain(ctx, parallel_is); + int ioff = 0, ooff = 0; + for (int op = 0; op < numOperators; op++) { + // prepare batch_inputs, batch_outputs for operators[i] + std::vector my_batch_inputs; + std::vector my_batch_outputs; + for (int i = 0; i < op_num_inputs[op]; i++) { + int my_off = op_input_idx[i + ioff]; + if (op_input_source[i + ioff] == SOURCE_INPUT) { + my_batch_inputs.push_back(batch_inputs[my_off]); + } else if (op_input_source[i + ioff] == SOURCE_OUTPUT) { + my_batch_inputs.push_back(batch_outputs[my_off]); + } else { + assert(false); + } + } + for (int i = 0; i < op_num_outputs[op]; i++) { + assert(op_output_source[i + ooff] == SOURCE_OUTPUT); + my_batch_outputs.push_back(batch_outputs[i + ooff]); + } + ioff += op_num_inputs[op]; + ooff += op_num_outputs[op]; + operators[op]->init_inference(ff, my_batch_inputs, my_batch_outputs, mv); + for (size_t j = 0; j < domain.get_volume(); j++) { + fused_meta[j].meta[op] = + operators[op]->inference_meta[my_batch_outputs[0]][j]; + } + } + for (size_t j = 0; j < domain.get_volume(); j++) { + fused_meta[j].numOperators = numOperators; + } + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + argmap.set_point(*it, \ + TaskArgument(&fused_meta[idx++], sizeof(FusedOpMeta))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(FUSEDOP_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(FusedOp)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + inference_meta[batch_outputs[0]][idx++] = fm.get_result(*it); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } +} + void FusedOp::forward(FFModel const &ff) { // Set iter_config iter_config = ff.iter_config; @@ -380,6 +469,66 @@ void FusedOp::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap FusedOp::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // Set iter_config + iter_config = ff.iter_config; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig + // so we transfer the maximum of them + size_t batch_config_size = + std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + IndexLauncher launcher(FUSEDOP_INF_TASK_ID, + parallel_is, + TaskArgument(&bc, batch_config_size), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + int offset = 0; + for (int i = 0; i < numInputs; i++) { + assert(inputs[i]->part != LogicalPartition::NO_PART); + assert(inputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numInputs; + for (int i = 0; i < numWeights; i++) { + assert(weights[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numWeights; + for (int i = 0; i < numOutputs; i++) { + assert(outputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + void FusedOp::backward(FFModel const &ff) { // Set iter_config iter_config = ff.iter_config; diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 712ed143b1..c717881e66 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -14,20 +14,29 @@ */ #include "flexflow/ops/fused.h" +#include "flexflow/accessor.h" #include "flexflow/model.h" #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" +#include "flexflow/ops/embedding.h" +#include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/ops/kernels/dropout_kernels.h" #include "flexflow/ops/kernels/element_binary_kernels.h" +#include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/rms_norm_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -373,6 +382,414 @@ __host__ void FusedOp::forward_task(Task const *task, // "[Fused:forward:output]"); } +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](I): outputs +*/ +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = (BatchConfig *)task->args; + 
assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + input_accessor[i] = + helperGetGenericTensorAccessorRO(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + output_accessor[i] = + helperGetGenericTensorAccessorWO(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + hipStream_t stream; + if (start < fused->numOperators) { + checkCUDA(get_legion_stream(&stream)); + } + + int ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + my_input_accessor[i] = input_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + my_input_accessor[i] = output_accessor[my_off]; + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + my_output_accessor[i] = output_accessor[i + ooff]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + Kernels::Concat::forward_kernel_wrapper(m, + my_output_accessor[0], + my_input_accessor, + num_inputs, + m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::forward_kernel(m, + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + void const *bias_ptr = nullptr; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + bias_ptr = my_weight_accessor[1].ptr; + } else { + assert(fused->op_num_weights[op] == 1); + } + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::forward_kernel_wrapper( + meta, + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + (float const *)nullptr, + m, + n, + k, + batch, + meta->a_seq_length_dim, + meta->b_seq_length_dim, + fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == 
AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta const *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + GenericTensorAccessorR biases; + if (*m->bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + 
task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const *tree_bc = + (TreeVerifyBatchConfig *)task->args; + assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + GenericTensorAccessorR biases; + if (*m->bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + BeamSearchBatchConfig const *beam_bc = + (BeamSearchBatchConfig *)task->args; + assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + GenericTensorAccessorR biases; + if (*m->bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} + /* regions[...](I): input regions[...](I): weight diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 17b0f9616d..2f84100554 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -20,6 +20,7 @@ #include "flexflow/ops/embedding.h" #include "flexflow/ops/flat.h" #include "flexflow/ops/fused.h" +#include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" @@ -30,7 +31,12 @@ #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/rms_norm_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include 
"flexflow/ops/layer_norm.h" +#include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -62,7 +68,7 @@ OpMeta *FusedOp::init_task(Task const *task, /* regions[...](I): inputs regions[...](I): weights - regions[...](I): outputs + regions[...](O): outputs */ __host__ void FusedOp::forward_task(Task const *task, std::vector const ®ions, @@ -357,7 +363,8 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); } - assert(my_input_accessor[0].data_type == DT_INT64); + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); Kernels::Embedding::forward_kernel_wrapper(m, my_input_accessor[0], my_output_accessor[0], @@ -450,6 +457,436 @@ __host__ void FusedOp::forward_task(Task const *task, // "[Fused:forward:output]"); } +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = (BatchConfig *)task->args; + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_accessor[i] = + helperGetGenericTensorAccessorRO(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_accessor[i] = + helperGetGenericTensorAccessorWO(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int 
ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_accessor[i] = input_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_accessor[i] = output_accessor[my_off]; + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_accessor[i] = output_accessor[i + ooff]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + Kernels::Concat::forward_kernel_wrapper(m, + my_output_accessor[0], + my_input_accessor, + num_inputs, + m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::forward_kernel(m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + void const *bias_ptr = nullptr; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + bias_ptr = my_weight_accessor[1].ptr; + } else { + assert(fused->op_num_weights[op] == 1); + } + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case 
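Two details of the dispatch loop above deserve a gloss. First, each operator in the fused group resolves its inputs either from the task's real inputs (SOURCE_INPUT) or from outputs already written by earlier operators in the same group (SOURCE_OUTPUT); the ioff/woff/ooff counters advance past each operator's slots so that op_input_idx and op_weight_idx index the correct window. Second, in the OP_LINEAR case the GEMM batch is taken from the BatchConfig rather than from the tensor shape, so only the tokens active in this decoding step are multiplied. A small sketch of the input-resolution step (helper name hypothetical, fields as used above):

  // Returns the flat accessor index for operator-local input i and reports
  // whether it refers to a task input or to an intermediate fused output.
  int resolve_op_input(FusedOp const *fused, int ioff, int i, bool &from_output) {
    from_output = (fused->op_input_source[i + ioff] == SOURCE_OUTPUT);
    return fused->op_input_idx[i + ioff];
  }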
OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::forward_kernel_wrapper( + meta, + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + (float const *)nullptr, + m, + n, + k, + batch, + meta->a_seq_length_dim, + meta->b_seq_length_dim, + fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + 
my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta const *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + GenericTensorAccessorR biases; + if (*m->bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const *tree_bc = + (TreeVerifyBatchConfig *)task->args; + assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + GenericTensorAccessorR biases; + if (*m->bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + SpecIncMultiHeadSelfAttentionMeta const *m = + 
(SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + BeamSearchBatchConfig const *beam_bc = + (BeamSearchBatchConfig *)task->args; + assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + GenericTensorAccessorR biases; + if (*m->bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + beta = my_weight_accessor[1]; + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} + /* regions[...](I): input regions[...](I): weight @@ -458,7 +895,6 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I/O): weight_grad regions[...](I/O): output_grad */ - __host__ void FusedOp::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/gather.cc b/src/ops/gather.cc index f094fe38b0..635c741d8b 100644 --- a/src/ops/gather.cc +++ b/src/ops/gather.cc @@ -166,6 +166,7 @@ void Gather::serialize(Legion::Serializer &sez) const { GatherParams params = get_params(); sez.serialize(params.legion_dim); sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); } using PCG::Node; @@ -177,9 +178,10 @@ Node Gather::deserialize(FFModel &ff, assert(num_inputs == 2); int legion_dim; dez.deserialize(legion_dim); - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); GatherParams params; params.legion_dim = legion_dim; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 991b6d2236..f5b72b9ac8 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -452,7 +452,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), + bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 55a47d7108..41b9912702 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ 
b/src/ops/kernels/linear_kernels.cpp @@ -75,12 +75,13 @@ void Linear::init_kernel(LinearMeta *m, int batch_size, int channel) { assert(false); } checkCUDNN(miopenSetActivationDescriptor(m->actiDesc, mode, 0.0, 0.0, 0.0)); - checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor, - ff_to_cudnn_datatype(m->output_type), - batch_size, - channel, - 1, - 1)); + checkCUDNN( + miopenSet4dTensorDescriptor(m->outputTensor, + ff_to_cudnn_datatype(m->output_type[0]), + batch_size, + channel, + 1, + 1)); } } @@ -102,7 +103,7 @@ void forward_kernel_wrapper(LinearMeta const *m, hipEventRecord(t_start, stream); } - if (m->input_type == DT_FLOAT) { + if (m->input_type[0] == DT_FLOAT) { Internal::forward_kernel(m, input_ptr, output_ptr, @@ -112,7 +113,7 @@ void forward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); - } else if (m->input_type == DT_HALF) { + } else if (m->input_type[0] == DT_HALF) { Internal::forward_kernel(m, input_ptr, output_ptr, @@ -161,7 +162,7 @@ void backward_kernel_wrapper(LinearMeta const *m, hipEventCreate(&t_end); hipEventRecord(t_start, stream); } - if (m->input_type == DT_FLOAT) { + if (m->input_type[0] == DT_FLOAT) { Internal::backward_kernel(m, input_ptr, input_grad_ptr, @@ -174,7 +175,7 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); - } else if (m->input_type == DT_HALF) { + } else if (m->input_type[0] == DT_HALF) { Internal::backward_kernel(m, input_ptr, input_grad_ptr, @@ -236,9 +237,9 @@ void forward_kernel(LinearMeta const *m, checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; - hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type); - hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type); - hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type); + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; @@ -332,9 +333,9 @@ void backward_kernel(LinearMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f; - hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type); - hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type); - hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type); + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; @@ -344,10 +345,10 @@ void backward_kernel(LinearMeta const *m, int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( - m->output_type, output_grad_ptr, output_ptr, output_size, stream); + m->output_type[0], output_grad_ptr, output_ptr, output_size, stream); } else if (m->activation == AC_MODE_SIGMOID) { sigmoid_backward_kernel( - m->output_type, output_grad_ptr, output_ptr, output_size, stream); + m->output_type[0], output_grad_ptr, output_ptr, output_size, stream); } else { // TODO: only support relu and sigmoid for now 
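For context on the relu_backward_kernel and sigmoid_backward_kernel calls above: both activations allow the backward pass to be computed from the forward output alone, which is why only output_ptr and output_grad_ptr are needed. A scalar reference version of the math (the real kernels are GPU kernels; this sketch is only illustrative):

  // In place: output_grad[i] becomes the gradient w.r.t. the pre-activation.
  template <typename T>
  void activation_backward_ref(T *output_grad, T const *output, size_t n, bool is_relu) {
    for (size_t i = 0; i < n; i++) {
      if (is_relu) {
        output_grad[i] = (output[i] > T(0)) ? output_grad[i] : T(0);  // relu'
      } else {
        output_grad[i] *= output[i] * (T(1) - output[i]);             // sigmoid'
      }
    }
  }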
assert(m->activation == AC_MODE_NONE); diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 3f806dd4f5..06677f86e6 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -96,13 +96,14 @@ void init_kernel(LinearMeta *m, int batch_size, int channel) { } checkCUDNN(cudnnSetActivationDescriptor( m->actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); - checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor, - CUDNN_TENSOR_NCHW, - ff_to_cudnn_datatype(m->output_type), - batch_size, - channel, - 1, - 1)); + checkCUDNN( + cudnnSetTensor4dDescriptor(m->outputTensor, + CUDNN_TENSOR_NCHW, + ff_to_cudnn_datatype(m->output_type[0]), + batch_size, + channel, + 1, + 1)); } } @@ -122,7 +123,7 @@ void forward_kernel_wrapper(LinearMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - if (m->input_type == DT_FLOAT) { + if (m->input_type[0] == DT_FLOAT) { Internal::forward_kernel(m, input_ptr, output_ptr, @@ -132,7 +133,7 @@ void forward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); - } else if (m->input_type == DT_HALF) { + } else if (m->input_type[0] == DT_HALF) { Internal::forward_kernel(m, input_ptr, output_ptr, @@ -180,7 +181,7 @@ void backward_kernel_wrapper(LinearMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - if (m->input_type == DT_FLOAT) { + if (m->input_type[0] == DT_FLOAT) { Internal::backward_kernel(m, input_ptr, input_grad_ptr, @@ -193,7 +194,7 @@ void backward_kernel_wrapper(LinearMeta const *m, out_dim, batch_size, stream); - } else if (m->input_type == DT_HALF) { + } else if (m->input_type[0] == DT_HALF) { Internal::backward_kernel(m, input_ptr, input_grad_ptr, @@ -295,11 +296,11 @@ void forward_kernel(LinearMeta const *m, checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; - cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type); + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = m->offload ? 
ff_to_cuda_datatype(m->weight_ptr_type) - : ff_to_cuda_datatype(m->weight_type); - cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type); + : ff_to_cuda_datatype(m->weight_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance @@ -388,9 +389,9 @@ void backward_kernel(LinearMeta const *m, DT alpha = 1.0f; float sgeam_alpha = 1.0f; - cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type); - cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type); - cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type); + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; @@ -400,10 +401,10 @@ void backward_kernel(LinearMeta const *m, int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( - m->output_type, output_grad_ptr, output_ptr, output_size, stream); + m->output_type[0], output_grad_ptr, output_ptr, output_size, stream); } else if (m->activation == AC_MODE_SIGMOID) { sigmoid_backward_kernel( - m->output_type, output_grad_ptr, output_ptr, output_size, stream); + m->output_type[0], output_grad_ptr, output_ptr, output_size, stream); } else { // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 0c08a2426f..0124c827f3 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -216,33 +216,39 @@ LayerNorm::LayerNorm(FFModel &model, for (int i = 0; i < axes.size(); i++) { M *= inputs[0]->dims[axes[i]].size; } + int num_replicas = 1; + for (int i = 0; i < inputs[0]->num_dims; i++) { + if (inputs[0]->dims[i].is_replica_dim) { + num_replicas *= inputs[0]->dims[i].size; + } + } effective_num_elements = M; - effective_batch_size = inputs[0]->get_volume() / M; + effective_batch_size = (inputs[0]->get_volume() / num_replicas) / M; assert(elementwise_affine == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { - ParallelDim dims[axes.size()]; - for (int i = 0; i < axes.size(); i++) { - dims[i] = inputs[0]->dims[i]; + ParallelTensorShape beta_gamma_shape = _input->get_shape(); + for (int i = axes.size(); i < beta_gamma_shape.num_dims - 1; i++) { + beta_gamma_shape.dims[i].size = 1; } int seed = std::rand(); Initializer *gamma_initializer = new UniformInitializer(seed, 1.0f, 1.0f); Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 0.0f); - weights[0] = - model.create_parallel_weight_legion_ordering(axes.size(), - dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - gamma_initializer, - CHOSEN_SYNC_TYPE); - weights[1] = - model.create_parallel_weight_legion_ordering(axes.size(), - dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - beta_initializer, - CHOSEN_SYNC_TYPE); + weights[0] = model.create_parallel_weight_legion_ordering( + beta_gamma_shape.num_dims, // axes.size(), + beta_gamma_shape.dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + weights[1] = model.create_parallel_weight_legion_ordering( 
+ beta_gamma_shape.num_dims, //.size(), + beta_gamma_shape.dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); } } @@ -383,13 +389,13 @@ void LayerNorm::forward(FFModel const &ff) { if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[1]->region)); launcher.add_field(3, FID_DATA); @@ -434,13 +440,13 @@ FutureMap LayerNorm::inference(FFModel const &ff, if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[1]->region)); launcher.add_field(3, FID_DATA); @@ -462,8 +468,8 @@ void LayerNorm::forward_task(Task const *task, assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; - GenericTensorAccessorR in; - GenericTensorAccessorW out, gamma, beta; + GenericTensorAccessorR in, gamma, beta; + GenericTensorAccessorW out; Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -486,21 +492,25 @@ void LayerNorm::forward_task(Task const *task, ctx, task->regions[2].region.get_index_space()); // gamma_ptr = helperGetTensorPointerRW( // regions[2], task->regions[2], FID_DATA, ctx, runtime); - gamma = helperGetGenericTensorAccessorRW( + gamma = helperGetGenericTensorAccessorRO( m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); Domain beta_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); // beta_ptr = helperGetTensorPointerRW( // regions[3], task->regions[3], FID_DATA, ctx, runtime); - beta = helperGetGenericTensorAccessorRW( + beta = helperGetGenericTensorAccessorRO( m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); assert(gamma_domain == beta_domain); assert(gamma_domain.get_volume() == m->effective_num_elements); int numdims = gamma_domain.get_dim(); - for (int i = 0; i < numdims; i++) { + size_t vol = 1; + int i = 0; + while (vol < gamma_domain.get_volume()) { int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; assert(g_d == in_d); + vol *= g_d; + i++; } } else { assert(regions.size() == 2); @@ -730,6 +740,7 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, void LayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->axes.size()); for (size_t i = 0; i < this->axes.size(); i++) { sez.serialize(this->axes[i]); @@ -749,9 +760,10 @@ Node LayerNorm::deserialize(FFModel &ff, std::vector axes; bool elementwise_affine; float eps; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(num_axes); for (size_t i = 0; i < num_axes; i++) { int axis_idx; diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 3f1c621e71..fc6be70c74 100644 --- 
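The LayerNorm changes above account for parallel (replica) dimensions: gamma/beta now take the input's full parallel shape with the non-normalized dimensions collapsed to size 1, and effective_batch_size divides out the replica degree before dividing by the normalized element count M. A worked example with assumed numbers (input of 2048 x 768 replicated twice, normalizing over the last dimension):

  int num_replicas = 2;                                     // product of replica dims
  int M = 768;                                              // elements normalized per row
  long long volume = 2048LL * 768 * num_replicas;           // parallel-tensor volume
  int effective_batch_size = (volume / num_replicas) / M;   // = 2048 rows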
a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -129,8 +129,8 @@ template void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *in_ptr, T *out_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, hipStream_t stream) { hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), m->effective_batch_size, @@ -160,8 +160,8 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta) { + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); if (m->input_type[0] == DT_FLOAT) { diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 35616de980..1f4e7d3933 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -135,8 +135,8 @@ template void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *in_ptr, T *out_ptr, - T *gamma_ptr, - T *beta_ptr, + T const *gamma_ptr, + T const *beta_ptr, cudaStream_t stream) { RowwiseMomentsCUDAKernel <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( @@ -160,8 +160,8 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW &output, - GenericTensorAccessorW &gamma, - GenericTensorAccessorW &beta) { + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index cca92f014f..c5903c1e74 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -504,10 +504,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; m->trainableInputs[0] = linear->trainableInputs[0]; - m->input_type = linear->inputs[0]->data_type; - m->weight_type = linear->weights[0]->data_type; - m->output_type = linear->outputs[0]->data_type; - m->weight_ptr_type = m->input_type; + m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; std::strcpy(m->op_name, linear->name); @@ -573,9 +570,9 @@ FutureMap Linear::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LINEAR_FWD_TASK_ID, + IndexLauncher launcher(LINEAR_INF_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), + TaskArgument(&bc, sizeof(BatchConfig)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -612,6 +609,52 @@ FutureMap Linear::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } +void Linear::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LinearMeta const *m = *((LinearMeta **)task->local_args); + BatchConfig const *bc = (BatchConfig *)task->args; + assert(regions.size() == (3 + static_cast(m->use_bias))); + assert(task->regions.size() == (3 + static_cast(m->use_bias))); + if (m->quantization_type == DT_NONE) { + assert(m->input_type[0] == m->weight_type[0]); + } + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorR input = 
helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + int batch_size = bc->num_active_tokens(); + GenericTensorAccessorR bias; + if (m->use_bias && + !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { + bias = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + assert(bias.domain.get_volume() == static_cast(out_dim)); + } + forward_kernel_wrapper(m, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); +} + void Linear::forward_task(Task const *task, std::vector const ®ions, Context ctx, @@ -620,13 +663,13 @@ void Linear::forward_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta const *m = *((LinearMeta **)task->local_args); if (m->quantization_type == DT_NONE) { - assert(m->input_type == m->weight_type); + assert(m->input_type[0] == m->weight_type[0]); } - assert(m->input_type == m->output_type); + assert(m->input_type[0] == m->output_type[0]); switch (input_domain.get_dim()) { #define DIMFUNC(DIM) \ case DIM: \ - if (m->output_type == DT_HALF) { \ + if (m->output_type[0] == DT_HALF) { \ if (m->quantization_type != DT_NONE) { \ return forward_task_with_dim( \ task, regions, ctx, runtime); \ @@ -634,7 +677,7 @@ void Linear::forward_task(Task const *task, return forward_task_with_dim( \ task, regions, ctx, runtime); \ } \ - } else if (m->output_type == DT_FLOAT) { \ + } else if (m->output_type[0] == DT_FLOAT) { \ if (m->quantization_type != DT_NONE) { \ return forward_task_with_dim( \ task, regions, ctx, runtime); \ @@ -787,15 +830,15 @@ void Linear::backward_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta const *m = *((LinearMeta **)task->local_args); if (m->quantization_type == DT_NONE) { - assert(m->input_type == m->weight_type); + assert(m->input_type[0] == m->weight_type[0]); } - assert(m->input_type == m->output_type); + assert(m->input_type[0] == m->output_type[0]); switch (in_domain.get_dim()) { #define DIMFUNC(DIM) \ case DIM: \ - if (m->output_type == DT_HALF) { \ + if (m->output_type[0] == DT_HALF) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_FLOAT) { \ + } else if (m->output_type[0] == DT_FLOAT) { \ return backward_task_with_dim(task, regions, ctx, runtime); \ } else { \ assert(false && "Unsupported data type"); \ @@ -1068,9 +1111,9 @@ bool Linear::measure_operator_cost(Simulator *sim, m->activation = activation; m->kernel_reg_type = kernel_reg_type; m->kernel_reg_lambda = kernel_reg_lambda; - m->input_type = inputs[0]->data_type; - m->weight_type = this->data_type; - m->output_type = outputs[0]->data_type; + m->input_type[0] = inputs[0]->data_type; + m->weight_type[0] = this->data_type; + m->output_type[0] = outputs[0]->data_type; assert(m->profiling == false); init_kernel(m, output_n, output_c); @@ -1186,6 +1229,7 @@ bool operator==(LinearParams const &lhs, LinearParams const &rhs) { void Linear::serialize(Legion::Serializer &sez) const { 
sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->out_channels); sez.serialize(this->activation); sez.serialize(this->kernel_reg_type); @@ -1211,9 +1255,10 @@ Node Linear::deserialize(FFModel &ff, DataType data_type; DataType quantization_type; bool offload; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(out_channels); dez.deserialize(activation); dez.deserialize(kernel_reg_type); diff --git a/src/ops/reduce.cc b/src/ops/reduce.cc index 5761281686..36112b0812 100644 --- a/src/ops/reduce.cc +++ b/src/ops/reduce.cc @@ -374,6 +374,7 @@ void Reduce::serialize(Legion::Serializer &sez) const { } sez.serialize(params.keepdims); sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); } using PCG::Node; @@ -392,9 +393,10 @@ Node Reduce::deserialize(FFModel &ff, axes.push_back(dim_idx); } dez.deserialize(keepdims); - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); return ff.get_or_create_node(inputs[0], {axes, keepdims, layer_guid}); } diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 2b8a60bf21..41c3fcdbf1 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -410,6 +410,7 @@ void Reshape::serialize(Legion::Serializer &sez) const { sez.serialize(this->shape_array[i]); } sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); } using PCG::Node; @@ -427,9 +428,10 @@ Node Reshape::deserialize(FFModel &ff, dez.deserialize(value); shape.push_back(value); } - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); ReshapeParams params; params.shape = shape; diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index a926fd3b22..e0076b5202 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -165,7 +165,11 @@ RMSNorm::RMSNorm(FFModel &model, for (int i = 1; i <= num_dims - 2; i++) { effective_batch_size *= _input->dims[i].size; } - + // Currently assert that all non-replica dims are not parallelized + // We only support parallelism along the replica dim now + for (int i = 0; i < _input->num_dims - 1; i++) { + assert(_input->dims[i].degree == 1); + } // output has the same parallel dims as input ParallelDim output_dims[MAX_TENSOR_DIM]; for (int i = 0; i < _input->num_dims; i++) { @@ -173,15 +177,14 @@ RMSNorm::RMSNorm(FFModel &model, } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, output_dims, _input->data_type, this); - if (allocate_weights) { // weights should have the shape of (data_dim, data_dim) ParallelDim new_weight_dims[MAX_TENSOR_DIM]; - new_weight_dims[0] = _input->dims[_input->num_dims - 1]; - new_weight_dims[1].size = dim; - new_weight_dims[1].degree = 1; - new_weight_dims[1].parallel_idx = -1; + new_weight_dims[0].size = dim; + new_weight_dims[0].degree = 1; + new_weight_dims[0].parallel_idx = -1; + new_weight_dims[1] = _input->dims[_input->num_dims - 1]; // replica dim // weights Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); @@ -189,7 +192,7 @@ RMSNorm::RMSNorm(FFModel &model, model.create_parallel_weight_legion_ordering(2, new_weight_dims, _input->data_type, - NULL 
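The serializer edits repeated across Gather, LayerNorm, Linear, Reduce, Reshape, and RMSNorm all follow one pattern: LayerID now carries a transformer_layer_id alongside id, so serialize() must write both fields and deserialize() must read them back in exactly the same order before rebuilding the LayerID. The paired calls, condensed:

  // serialize side
  sez.serialize(this->layer_guid.id);
  sez.serialize(this->layer_guid.transformer_layer_id);

  // deserialize side (order must match the writes)
  size_t id, transformer_layer_id;
  dez.deserialize(id);
  dez.deserialize(transformer_layer_id);
  LayerID layer_guid(id, transformer_layer_id);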
/*owner_op*/, + nullptr /*owner_op*/, false /*create_grad*/, kernel_initializer, CHOSEN_SYNC_TYPE); @@ -389,6 +392,7 @@ void RMSNorm::forward_task(Task const *task, void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->eps); sez.serialize(this->dim); } @@ -401,11 +405,12 @@ Node RMSNorm::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 1); float eps; - size_t id; + size_t id, transformer_layer_id; int dim; dez.deserialize(id); + dez.deserialize(transformer_layer_id); - LayerID layer_guid(id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(eps); dez.deserialize(dim); RMSNormParams params; diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 541322efc4..b46ccb4853 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -498,7 +498,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // keys/values to the key-value cache cudaMemcpyAsync(m->committed_token_infos, &(bc->committed_tokens), - bc->MAX_NUM_TOKENS * + bc->num_tokens_to_commit * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc new file mode 100644 index 0000000000..123e85c7c5 --- /dev/null +++ b/src/parallel_ops/allreduce.cc @@ -0,0 +1,362 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/parallel_ops/allreduce.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::AllReduce; + +/* Params */ +bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { + return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim; +} + +bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +AllReduceParams AllReduce::get_params() const { + AllReduceParams params; + params.allreduce_legion_dim = this->allreduce_dim; + return params; +} + +AllReduce::AllReduce(FFModel &model, + const ParallelTensor _input, + int _allreduce_legion_dim, + char const *name) + : ParallelOp(model, OP_ALLREDUCE, name, _input), + allreduce_dim(_allreduce_legion_dim) { + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + assert(dims[allreduce_dim].degree > 1); + // ParallelTensorBase::update_parallel_ids(numdim, dims); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this); +} + +AllReduce::AllReduce(FFModel &model, + AllReduceParams const ¶ms, + ParallelTensor const input, + char const *name) + : AllReduce(model, input, params.allreduce_legion_dim, name) {} + +void AllReduce::create_input_partition(FFModel &ff) { + // Do nothing + return; +} + +void AllReduce::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + // Do nothing + return; +} + +OpMeta *AllReduce::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + AllReduce *ar = (AllReduce *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + AllReduceMeta *meta = new AllReduceMeta(handle, ar); + meta->input_type[0] = ar->inputs[0]->data_type; + meta->output_type[0] = ar->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + return meta; +} + +void AllReduce::init(FFModel const &ff) { + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ALLREDUCE_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AllReduce)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + 
launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void AllReduce::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + parallel_is = batch_outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ALLREDUCE_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AllReduce)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +FutureMap AllReduce::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +bool AllReduce::measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const { + cost_metrics = CostMetrics(); + cost_metrics.forward_time = 0.0f; + cost_metrics.backward_time = 0.0f; + + cost_metrics.sync_time = 0; + cost_metrics.inputs_memory = 0; + cost_metrics.outputs_memory = 0; + cost_metrics.weights_memory = 0; + return true; +} + +bool AllReduce::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_ALLREDUCE_DIM: + *value = allreduce_dim; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool AllReduce::append_parallel_op_info( + std::vector ¶llel_ops) const { + ParallelOpInfo ret; + ret.op_type = op_type; + ret.parallel_dim = allreduce_dim; + ret.parallel_degree = -1; // AllReduce does not affect parallel degree + parallel_ops.push_back(ret); + return true; +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + 
AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::AllReduceParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.allreduce_legion_dim); + return key; +} + +} // namespace std diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index a4169ea306..198f450636 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -88,7 +88,7 @@ Combine::Combine(FFModel &model, dims[combine_dim].degree /= combine_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_FLOAT, this); + numdim, dims, _input->data_type, this); // inputs[0]->print("Combine::input"); // outputs[0]->print("Combine::output"); } @@ -97,11 +97,13 @@ OpMeta *Combine::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - Combine *rep = (Combine *)task->args; - // FFHandler handle = *((FFHandler *)task->local_args); - // CombineMeta* m = new CombineMeta(handle); - // m->data_type = rep->outputs[0]->data_type; - return nullptr; + Combine *cmb = (Combine *)task->args; + FFHandler handle = *((FFHandler *)task->local_args); + CombineMeta *m = new CombineMeta(handle); + m->input_type[0] = cmb->inputs[0]->data_type; + m->output_type[0] = cmb->outputs[0]->data_type; + assert(m->input_type[0] == m->output_type[0]); + return m; } void Combine::init(FFModel const &ff) { @@ -111,6 +113,7 @@ void Combine::init(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); IndexLauncher launcher(COMBINE_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(Combine)), @@ -130,6 +133,48 @@ void Combine::init(FFModel const &ff) { launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void Combine::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + parallel_is = batch_outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(COMBINE_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Combine)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + assert(inference_input_lps.find(batch_inputs[0]) != + inference_input_lps.end()); + launcher.add_region_requirement( + RegionRequirement(inference_input_lps[batch_inputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); } void Combine::create_input_partition(FFModel &ff) { @@ -147,6 +192,61 @@ void Combine::create_input_partition(FFModel &ff) { output_grad_lp); } +void Combine::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + // input_lp is a disjoint partition + ff.create_disjoint_partition(batch_outputs[0]->num_dims, + batch_outputs[0]->dims, + batch_outputs[0]->parallel_is, + batch_inputs[0]->region, + inference_input_lps[batch_inputs[0]]); +} + +FutureMap Combine::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(COMBINE_FWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_input_lps[batch_inputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Combine::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -157,7 +257,7 @@ void Combine::forward(FFModel const &ff) { DataType data_type = inputs[0]->data_type; IndexLauncher launcher(COMBINE_FWD_TASK_ID, outputs[0]->parallel_is, - TaskArgument(&data_type, sizeof(data_type)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -261,8 +361,11 @@ void Combine::forward_task(Task const *task, Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); - DataType data_type = *((DataType *)task->args); - if (data_type == DT_FLOAT) { + CombineMeta const *m = *((CombineMeta **)task->local_args); + DataType data_type = m->input_type[0]; + if (data_type == DT_HALF) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_FLOAT) { forward_task_with_type(task, regions, ctx, runtime); } else if (data_type == DT_DOUBLE) { forward_task_with_type(task, regions, ctx, runtime); diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp new file mode 100644 index 0000000000..78742568c6 --- /dev/null +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -0,0 +1,46 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
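// Sketch of the data-type dispatch used by Combine::forward_task above, with the
// template arguments spelled out (forward_task_with_type<T> is templated on the
// element type; the integer cases follow the same shape):
//   DataType data_type = m->input_type[0];
//   if (data_type == DT_HALF) {
//     forward_task_with_type<half>(task, regions, ctx, runtime);
//   } else if (data_type == DT_FLOAT) {
//     forward_task_with_type<float>(task, regions, ctx, runtime);
//   } else if (data_type == DT_DOUBLE) {
//     forward_task_with_type<double>(task, regions, ctx, runtime);
//   } // ... and likewise for DT_INT32 / DT_INT64.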
+ */ + +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) + : OpMeta(handle) {} + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + assert(false && "To be implemented"); +} + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu new file mode 100644 index 0000000000..1ae9ee27b8 --- /dev/null +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -0,0 +1,56 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) + : OpMeta(handle) {} + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + input.domain.get_volume(), + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index 2d748cfab3..d6e9568223 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -51,6 +51,9 @@ void backward_kernel(T const *output_grad_ptr, num_elements); } +template void forward_kernel(half const *input_ptr, + half *output_ptr, + size_t num_elements); template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements); @@ -63,6 +66,9 @@ template void forward_kernel(int32_t const *input_ptr, template void forward_kernel(int64_t 
const *input_ptr, int64_t *output_ptr, size_t num_elements); +template void backward_kernel(half const *output_grad_ptr, + half *input_grad_ptr, + size_t num_elements); template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index d8f414ef0f..1ab79a7944 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -44,6 +44,9 @@ void backward_kernel(T const *output_grad_ptr, input_grad_ptr, output_grad_ptr, num_elements); } +template void forward_kernel(half const *input_ptr, + half *output_ptr, + size_t num_elements); template void forward_kernel(float const *input_ptr, float *output_ptr, size_t num_elements); @@ -56,6 +59,9 @@ template void forward_kernel(int32_t const *input_ptr, template void forward_kernel(int64_t const *input_ptr, int64_t *output_ptr, size_t num_elements); +template void backward_kernel(half const *output_grad_ptr, + half *input_grad_ptr, + size_t num_elements); template void backward_kernel(float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 6ef06e1f65..1aa216e5c9 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -461,6 +461,24 @@ cudaDataType_t ff_to_cuda_datatype(DataType type) { return CUDA_R_32F; } +#ifdef FF_USE_NCCL +ncclDataType_t ff_to_nccl_datatype(DataType type) { + switch (type) { + case DT_HALF: + return ncclHalf; + case DT_FLOAT: + return ncclFloat; + case DT_DOUBLE: + return ncclDouble; + case DT_INT32: + return ncclInt; + default: + assert(false && "Unspoorted nccl data type"); + } + return ncclFloat; +} +#endif + cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type) { switch (type) { case CUDNN_DATA_FLOAT: @@ -500,6 +518,8 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + add_kernel(half *dst, half const *src, size_t size); template __global__ void add_kernel(float *dst, float const *src, size_t size); template __global__ void @@ -509,8 +529,12 @@ template __global__ void template __global__ void add_kernel(int64_t *dst, int64_t const *src, size_t size); +template __global__ void + copy_kernel(half *dst, half const *src, coord_t size); template __global__ void copy_kernel(float *dst, float const *src, coord_t size); +template __global__ void + copy_kernel(double *dst, double const *src, coord_t size); template __global__ void copy_kernel(int32_t *dst, int32_t const *src, coord_t size); template __global__ void diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index d2b68595bd..39f9d1dd0d 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -187,6 +187,8 @@ std::string get_operator_type_name(OperatorType type) { return "Replicate"; case OP_REDUCTION: return "Reduction"; + case OP_ALLREDUCE: + return "AllReduce"; case OP_PIPELINE: return "Pipeline"; case OP_FUSED_PARALLEL: diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 91e0d077c4..2b94f07999 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -1,11 +1,15 @@ #include "flexflow/fftype.h" +#include "flexflow/config.h" #include namespace FlexFlow { -LayerID::LayerID() : id(0) {} +const LayerID LayerID::NO_ID = LayerID(); -LayerID::LayerID(size_t _id) : id(_id) { +LayerID::LayerID() : id(0), 
transformer_layer_id(MAX_NUM_TRANSFORMER_LAYERS) {} + +LayerID::LayerID(size_t _id, size_t _transformer_layer_id) + : id(_id), transformer_layer_id(_transformer_layer_id) { assert(is_valid_id()); } @@ -14,7 +18,11 @@ bool LayerID::is_valid_id() const { } bool operator==(LayerID const &lhs, LayerID const &rhs) { + // id should be sufficient to distinguish different layers + if (lhs.id == rhs.id) { + assert(lhs.transformer_layer_id == rhs.transformer_layer_id); + } return lhs.id == rhs.id; } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index e8a1b6f9f1..5c0513baa8 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -46,6 +46,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -1961,14 +1962,61 @@ std::pair, std::unordered_map> } curr_best_graph = std::unique_ptr(graph); MachineView data_parallel_view; - data_parallel_view.device_type = MachineView::GPU; - data_parallel_view.ndims = 1; - data_parallel_view.dim[0] = - model->config.numNodes * model->config.workersPerNode; - data_parallel_view.stride[0] = 1; - data_parallel_view.start_device_id = 0; + int degree, num_transformer_layers_per_stage; + if (model->config.computationMode == COMP_MODE_TRAINING) { + data_parallel_view.device_type = MachineView::GPU; + data_parallel_view.ndims = 1; + data_parallel_view.dim[0] = + model->config.numNodes * model->config.workersPerNode; + data_parallel_view.stride[0] = 1; + data_parallel_view.start_device_id = 0; + } else { + // Currently assume a 1D machine view is needed + assert(model->config.data_parallelism_degree == 1 || + model->config.tensor_parallelism_degree == 1); + degree = model->config.data_parallelism_degree * + model->config.tensor_parallelism_degree; + num_transformer_layers_per_stage = + model->current_transformer_layer_id / + model->config.pipeline_parallelism_degree + + 1; + } for (auto const &node : curr_best_graph->inEdges) { - curr_optimal_views[node.first] = data_parallel_view; + Op const *op = node.first.ptr; + if (model->config.computationMode == COMP_MODE_TRAINING) { + curr_optimal_views[node.first] = data_parallel_view; + } else { + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + int total_parallel_degree = 1; + for (int i = 0; i < op->outputs[0]->num_dims; i++) { + total_parallel_degree *= op->outputs[0]->dims[i].degree; + } + mv.dim[0] = total_parallel_degree; + mv.stride[0] = 1; + LayerID layer_guid = op->layer_guid; + if (op->op_type == OP_INPUT) { + // All inputs are assigned to the first stage + layer_guid.transformer_layer_id = 0; + } else if (layer_guid == LayerID::NO_ID) { + // Assert that we only have a single input + while (op->layer_guid == LayerID::NO_ID) { + assert(op->numInputs == 1); + op = op->inputs[0]->owner_op; + assert(op != nullptr); + } + layer_guid = op->layer_guid; + } + mv.start_device_id = degree * (layer_guid.transformer_layer_id / + num_transformer_layers_per_stage); + assert(mv.start_device_id + degree - 1 < + model->config.numNodes * model->config.workersPerNode); + curr_optimal_views[node.first] = mv; + for (int i = 0; i < node.first.ptr->numOutputs; i++) { + assert(node.first.ptr->outputs[i]->is_valid_machine_view(mv)); + } + } } } else { // Main step to 
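// A toy, self-contained illustration of the pipeline-stage mapping above. All numbers
// are hypothetical: a 32-layer model (current_transformer_layer_id == 31), combined
// data*tensor parallel degree of 2, and pipeline_parallelism_degree of 4.
int demo_start_device_id(int transformer_layer_id) {
  int const degree = 2;                        // data_parallelism * tensor_parallelism
  int const pipeline_parallelism_degree = 4;
  int const current_transformer_layer_id = 31; // highest transformer layer id seen
  int const num_transformer_layers_per_stage =
      current_transformer_layer_id / pipeline_parallelism_degree + 1; // 31/4 + 1 == 8
  // Layers 0-7 start at device 0, 8-15 at device 2, 16-23 at device 4, 24-31 at device 6.
  return degree * (transformer_layer_id / num_transformer_layers_per_stage);
}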
optimize the PCG of an FFModel @@ -2237,23 +2285,17 @@ GraphOptimalViewSerialized case OP_EMBEDDING: { Embedding *embed = (Embedding *)op; sez.serialize(embed->layer_guid.id); + sez.serialize(embed->layer_guid.transformer_layer_id); sez.serialize(embed->num_entries); sez.serialize(embed->out_channels); sez.serialize(embed->aggr); sez.serialize(embed->data_type); break; } - case OP_EW_ADD: - case OP_EW_SUB: - case OP_EW_MUL: - case OP_EW_MAX: - case OP_EW_MIN: { - sez.serialize(op->op_type); - break; - } case OP_MULTIHEAD_ATTENTION: { MultiHeadAttention *attn = (MultiHeadAttention *)op; sez.serialize(attn->layer_guid.id); + sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_heads); sez.serialize(attn->qProjSize); @@ -2267,6 +2309,7 @@ GraphOptimalViewSerialized case OP_INC_MULTIHEAD_SELF_ATTENTION: { IncMultiHeadSelfAttention *attn = (IncMultiHeadSelfAttention *)op; sez.serialize(attn->layer_guid.id); + sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_heads); sez.serialize(attn->qProjSize); @@ -2287,6 +2330,7 @@ GraphOptimalViewSerialized SpecIncMultiHeadSelfAttention *attn = (SpecIncMultiHeadSelfAttention *)op; sez.serialize(attn->layer_guid.id); + sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_heads); sez.serialize(attn->qProjSize); @@ -2305,6 +2349,7 @@ GraphOptimalViewSerialized TreeIncMultiHeadSelfAttention *attn = (TreeIncMultiHeadSelfAttention *)op; sez.serialize(attn->layer_guid.id); + sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_heads); sez.serialize(attn->qProjSize); @@ -2324,6 +2369,7 @@ GraphOptimalViewSerialized case OP_INC_MULTIQUERY_SELF_ATTENTION: { IncMultiQuerySelfAttention *attn = (IncMultiQuerySelfAttention *)op; sez.serialize(attn->layer_guid.id); + sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_heads); sez.serialize(attn->qProjSize); @@ -2363,6 +2409,11 @@ GraphOptimalViewSerialized sez.serialize(combine->combine_degree); break; } + case OP_ALLREDUCE: { + AllReduce *allreduce = (AllReduce *)op; + sez.serialize(allreduce->allreduce_dim); + break; + } case OP_FUSED_PARALLEL: { FusedParallelOp *fused = (FusedParallelOp *)op; sez.serialize(fused->num_parallel_ops); @@ -2589,10 +2640,11 @@ void FFModel::deserialize_graph_optimal_view( assert(num_inputs == 1); AggrMode aggr; int num_entries, out_channels; - size_t id; + size_t id, transformer_layer_id; DataType data_type; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(num_entries); dez.deserialize(out_channels); dez.deserialize(aggr); @@ -2612,11 +2664,7 @@ void FFModel::deserialize_graph_optimal_view( case OP_EW_MUL: case OP_EW_MAX: case OP_EW_MIN: { - assert(num_inputs == 2); - OperatorType op_type; - dez.deserialize(op_type); - node = get_or_create_node({inputs[0], inputs[1]}, - {op_type}); + node = ElementBinary::deserialize(*this, dez, inputs, num_inputs); break; } case OP_CONV2D: { @@ -2667,9 +2715,10 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_heads, k_dim, v_dim; float dropout; bool bias, add_bias_kv, add_zero_attn; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, 
transformer_layer_id); dez.deserialize(embed_dim); dez.deserialize(num_heads); dez.deserialize(k_dim); @@ -2700,9 +2749,10 @@ void FFModel::deserialize_graph_optimal_view( bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, offload; DataType quantization_type; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(embed_dim); dez.deserialize(num_heads); dez.deserialize(k_dim); @@ -2743,9 +2793,10 @@ void FFModel::deserialize_graph_optimal_view( float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(embed_dim); dez.deserialize(num_heads); dez.deserialize(k_dim); @@ -2784,9 +2835,10 @@ void FFModel::deserialize_graph_optimal_view( bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, offload; DataType quantization_type; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(embed_dim); dez.deserialize(num_heads); dez.deserialize(k_dim); @@ -2828,9 +2880,10 @@ void FFModel::deserialize_graph_optimal_view( float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; - size_t id; + size_t id, transformer_layer_id; dez.deserialize(id); - LayerID layer_guid(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); dez.deserialize(embed_dim); dez.deserialize(num_heads); dez.deserialize(k_dim); @@ -2949,6 +3002,13 @@ void FFModel::deserialize_graph_optimal_view( {reduction_dim, reduction_degree}); break; } + case OP_ALLREDUCE: { + assert(num_inputs == 1); + int allreduce_dim; + dez.deserialize(allreduce_dim); + node = get_or_create_node(inputs[0], {allreduce_dim}); + break; + } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); std::vector parallel_ops; diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 6354c5d737..9bcccb041a 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -372,16 +372,23 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + add_kernel(half *dst, half const *src, size_t size); template __global__ void add_kernel(float *dst, float const *src, size_t size); template __global__ void add_kernel(double *dst, double const *src, size_t size); -template __global__ void add_kernel(int *dst, int const *src, size_t size); template __global__ void - add_kernel(long *dst, long const *src, size_t size); + add_kernel(int32_t *dst, int32_t const *src, size_t size); +template __global__ void + add_kernel(int64_t *dst, int64_t const *src, size_t size); +template __global__ void + copy_kernel(half *dst, half const *src, coord_t size); template __global__ void copy_kernel(float *dst, float const *src, coord_t size); +template __global__ void + copy_kernel(double *dst, double const *src, coord_t size); template __global__ void copy_kernel(int32_t *dst, int32_t const *src, coord_t size); template __global__ void @@ -406,13 +413,19 @@ 
template __global__ void apply_add_with_scale(int64_t *data_ptr, template __host__ void print_tensor(float const *ptr, size_t rect, char const *prefix); +template __host__ void + print_tensor(double const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int32_t const *ptr, size_t rect, char const *prefix); template __host__ void print_tensor(int64_t const *ptr, size_t rect, char const *prefix); +template __host__ void + print_tensor(half const *ptr, size_t rect, char const *prefix); template __host__ float *download_tensor(float const *ptr, size_t num_elements); +template __host__ half *download_tensor(half const *ptr, + size_t num_elements); template __host__ double *download_tensor(double const *ptr, size_t num_elements); template __host__ int32_t *download_tensor(int32_t const *ptr, diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 67a78f9700..b6be945a94 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,6 +54,7 @@ InferenceManager::InferenceManager(FFConfig const &_config, num_devices && "Product of data, tensor, and pipeline parallelism degrees does not " "match the number of available devices"); + // Deprecated logic below // populate array of valid single-device machine views for (int i = 0; i < num_devices; i++) { MachineView view; @@ -94,23 +95,23 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, return false; } -void InferenceManager::compile_model_and_allocate_buffer( - FFModel *model, - std::unordered_map> const - &tensor_mapping) { +void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + // TODO: currently assume there is a single data-parallel pipeline + // (i.e., data-parallel-degree == 1) + assert(model->config.data_parallelism_degree == 1); model->config.batchSize = max_num_tokens_per_batch; model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; - std::unordered_map> mapping; - for (auto const &it : tensor_mapping) { - ParallelTensor pt; - model->get_parallel_tensor_from_tensor(it.first, pt); - assert(pt->owner_op != nullptr); - mapping[pt->owner_op] = it.second; - } // std::cout << std::endl << std::endl << "Operators MVs:" << std::endl; + int num_transformer_layers_per_stage = + model->current_transformer_layer_id / + model->config.pipeline_parallelism_degree + + 1; + int degree = model->config.data_parallelism_degree * + model->config.tensor_parallelism_degree; + for (int op_idx = 0; op_idx < model->operators.size(); op_idx++) { Op const *op = model->operators[op_idx]; // Skip weight operators @@ -119,52 +120,35 @@ void InferenceManager::compile_model_and_allocate_buffer( } // Get machine views std::vector machine_views; - if (mapping.find(op) != mapping.end()) { - machine_views = mapping[op]; - assert(machine_views.size() == ff_config.data_parallelism_degree); - } else { - // Mapping the current operator using the same machine - // view as the inputs - assert(op->numInputs > 0); - for (int j = 0; j < ff_config.data_parallelism_degree; j++) { - MachineView mv = tensor_buffer[op->inputs[0]][j]->machine_view; - for (int k = 1; k < op->numInputs; k++) { - if (mv != tensor_buffer[op->inputs[k]][j]->machine_view) { - fprintf(stderr, - "[Warning] a potentially unnecessary " - " inter-GPU copy of size %zu\n", - op->inputs[k]->get_volume()); - // Heuristics: we use the mv with a larger start_device_id - // to promote load balancing - if (mv.start_device_id < - 
tensor_buffer[op->inputs[k]][j]->machine_view.start_device_id) { - mv = tensor_buffer[op->inputs[k]][j]->machine_view; - } - } - if (op->op_type == OP_REPLICATE) { - // std::cout << "Replicate operator got machine view: " << mv - // << std::endl; - assert(model->config.tensor_parallelism_degree > 1); - mv.dim[0] = ff_config.tensor_parallelism_degree; - mv.stride[0] = 1; - if (mv.start_device_id + mv.dim[0] > num_devices) { - mv.start_device_id -= - (mv.start_device_id + mv.dim[0]) - num_devices; - } - // std::cout << "Corrected machine view: " << mv << std::endl; - } else if (op->op_type == OP_REDUCTION) { - // std::cout << "Reduction operator got machine view: " << mv - // << std::endl; - assert(model->config.tensor_parallelism_degree > 1); - mv.dim[0] = 1; - mv.stride[0] = 0; - // std::cout << "Corrected machine view: " << mv << std::endl; + for (int j = 0; j < model->config.data_parallelism_degree; j++) { + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + // mv.start_device_id = 0; + mv.stride[0] = 1; + int parallel_degree = 1; + for (int k = 0; k < op->outputs[0]->num_dims; k++) { + parallel_degree *= op->outputs[0]->dims[k].degree; + } + mv.dim[0] = parallel_degree; + LayerID layer_guid = op->layer_guid; + if (op->op_type == OP_INPUT) { + // All inputs are assigned to the first stage + layer_guid.transformer_layer_id = 0; + } else if (layer_guid == LayerID::NO_ID) { + Op const *op_with_guid = op; + // Assert that we only have a single input + while (op_with_guid->layer_guid == LayerID::NO_ID) { + assert(op_with_guid->numInputs == 1); + op_with_guid = op_with_guid->inputs[0]->owner_op; + assert(op_with_guid != nullptr); } - assert(mv.start_device_id + mv.dim[0] <= num_devices); - machine_views.push_back(mv); + layer_guid = op_with_guid->layer_guid; } - assert(machine_views.size() == ff_config.data_parallelism_degree); + mv.start_device_id = degree * (layer_guid.transformer_layer_id / + num_transformer_layers_per_stage); + assert(mv == op->outputs[0]->machine_view); + machine_views.push_back(mv); } // std::cout << "operator: " << op->name << std::endl; // for (int i = 0; i < op->numInputs; i++) { @@ -232,7 +216,7 @@ void InferenceManager::compile_model_and_allocate_buffer( } } if (!found_parallel_tensor) { - for (int j = 0; j < ff_config.data_parallelism_degree; j++) { + for (int j = 0; j < model->config.data_parallelism_degree; j++) { // Copy the metadata from pt_base to pt ParallelTensor pt = new ParallelTensorBase(*pt_base); pt->region = @@ -257,7 +241,7 @@ void InferenceManager::compile_model_and_allocate_buffer( } void InferenceManager::init_operators_inference(FFModel *model) { - for (int batch_index = 0; batch_index < ff_config.data_parallelism_degree; + for (int batch_index = 0; batch_index < model->config.data_parallelism_degree; batch_index++) { int expert_device_index = 0; int device_index = batch_index % num_devices; @@ -313,7 +297,7 @@ FutureMap InferenceManager::inference(FFModel *model, assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) - int batch_index = index % ff_config.data_parallelism_degree; + int batch_index = index % model->config.data_parallelism_degree; FutureMap fm; bool found_input_operator = false; for (size_t o = 0; o < model->operators.size(); o++) { @@ -410,15 +394,19 @@ void InferenceManager::load_positions(BatchConfig const &bc, runtime->execute_index_space(ctx, launcher); } +void
FFModel::set_transformer_layer_id(int id) { + // We assume that users call this function with + // monotonically increasing ids + assert(id == current_transformer_layer_id + 1 || + (id == 0 && current_transformer_layer_id == 0)); + current_transformer_layer_id = id; + assert(id < MAX_NUM_TRANSFORMER_LAYERS); +} + void FFModel::compile_inference() { Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; - { - fprintf( - stderr, - "Note: inference currently only supports data/pipeline parallel.\n"); - } create_operators_from_layers(); // Launch the graph optimize task { @@ -651,5 +639,42 @@ void FFModel::compile_inference() { handle.get_tree_id()); } } +#ifdef FF_USE_NCCL + for (size_t l = 0; l < operators.size(); l++) { + // Only create nccl for allreduce and fusedop for inference + // (fusedop may include allreduces) + if (operators[l]->op_type == OP_ALLREDUCE || + operators[l]->op_type == OP_FUSED) { + MachineView view = operators[l]->outputs[0]->machine_view; + if (view_hash_to_nccl_comms.find(view.hash()) == + view_hash_to_nccl_comms.end()) { + TaskLauncher launcher(NCCL_GETUNIQUEID_TASK_ID, TaskArgument(NULL, 0)); + Future future = runtime->execute_task(ctx, launcher); + ncclUniqueId ncclId = future.get_result(); + IndexSpace task_is = get_or_create_task_is(view); + ArgumentMap argmap; + IndexLauncher index_launcher( + NCCL_INIT_COMMS_TASK_ID, + task_is, + TaskArgument(&ncclId, sizeof(ncclUniqueId)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + view.hash() /*MappingTagID*/); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + int idx = 0; + Domain task_domain = runtime->get_index_space_domain(ctx, task_is); + ncclComm_t *nccl_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * task_domain.get_volume()); + for (Domain::DomainPointIterator it(task_domain); it; it++, idx++) { + nccl_comms[idx] = fm.get_result(*it); + } + view_hash_to_nccl_comms[view.hash()] = nccl_comms; + } + } + } +#endif } }; // namespace FlexFlow diff --git a/src/runtime/layer.cc b/src/runtime/layer.cc index 6dfd5f2f35..d2473f4b2b 100644 --- a/src/runtime/layer.cc +++ b/src/runtime/layer.cc @@ -16,8 +16,9 @@ Layer::Layer(FFModel *model, const Tensor _input3, const Tensor _input4) : op_type(_otype), data_type(_dtype), - layer_guid(model->layer_global_guid++), numInputs(_numInputs), - numWeights(_numWeights), numOutputs(_numOutputs) { + layer_guid(model->layer_global_guid++, + model->current_transformer_layer_id), + numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs) { std::string pcname; if (_name == nullptr) { pcname = get_operator_type_name(op_type); @@ -50,8 +51,9 @@ Layer::Layer(FFModel *model, int _numOutputs, Tensor const *_tensors) : op_type(_otype), data_type(_dtype), - layer_guid(model->layer_global_guid++), numInputs(_numInputs), - numWeights(_numWeights), numOutputs(_numOutputs) { + layer_guid(model->layer_global_guid++, + model->current_transformer_layer_id), + numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs) { std::string pcname; if (_name == nullptr) { pcname = get_operator_type_name(op_type); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 64c3a2eb61..763a5bcfd5 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -58,6 +58,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/parallel_ops/allreduce.h" #include 
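// A minimal usage sketch for set_transformer_layer_id above (hypothetical model
// builder; num_layers and add_transformer_block are placeholders, not FlexFlow APIs):
//   for (int i = 0; i < num_layers; i++) {
//     ff.set_transformer_layer_id(i);   // ids must be monotonically increasing
//     add_transformer_block(ff, i);     // layers created here inherit this id,
//   }                                   // which later drives pipeline-stage placement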
"flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -990,6 +991,7 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, Runtime *runtime = ff.config.lg_hlr; Domain domain = runtime->get_index_space_domain(ctx, this->parallel_is); MachineView const view = output0->machine_view; + assert(ff.config.computationMode == COMP_MODE_INFERENCE); switch (domain.get_dim()) { #ifdef FF_USE_NCCL #define DIMFUNC(DIM) \ @@ -998,8 +1000,7 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, int idx = 0; \ for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ - if (ff.config.computationMode == COMP_MODE_TRAINING && \ - op_type == OP_WEIGHT) { \ + if (op_type == OP_ALLREDUCE) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1302,8 +1303,9 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) layer_global_guid(LAYER_GUID_FIRST_VALID), tensor_global_guid(TENSOR_GUID_FIRST_VALID), parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), - node_global_guid(NODE_GUID_FIRST_VALID), config(_config), optimizer(NULL), - loss_op(NULL), metrics_op(NULL), simulator(NULL) { + node_global_guid(NODE_GUID_FIRST_VALID), current_transformer_layer_id(0), + config(_config), optimizer(NULL), loss_op(NULL), metrics_op(NULL), + simulator(NULL) { this->search = new PCG::SearchHelper(this); this->graph_search = new PCG::GraphSearchHelper(this); this->cpu_offload = cpu_offload; @@ -1348,7 +1350,7 @@ ncclComm_t *FFModel::find_nccl_comms(MachineView const &view) const { auto const &it = view_hash_to_nccl_comms.find(view.hash()); if (it == view_hash_to_nccl_comms.end()) { assert(config.computationMode == COMP_MODE_INFERENCE); - return NULL; + return nullptr; } else { return it->second; } @@ -2630,9 +2632,14 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[l]->op_type == OP_WEIGHT) { continue; } - // don't fuse parallel op since they have different parallel_is in - // forward/backward - if (operators[l]->is_parallel_op()) { + // don't fuse parallel op except allReduce since they have different + // parallel_is in forward/backward + if (operators[l]->is_parallel_op() && + operators[l]->op_type != OP_ALLREDUCE) { + continue; + } + // don't fuse softmax since it returns inference results + if (operators[l]->op_type == OP_SOFTMAX) { continue; } size_t start = 0; @@ -2675,9 +2682,10 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[i]->op_type == OP_WEIGHT) { continue; } - // don't fuse parallel op since they have different parallel_is in - // forward/backward - if (operators[i]->is_parallel_op()) { + // don't fuse parallel op except allReduce since they have different + // parallel_is in forward/backward + if (operators[i]->is_parallel_op() && + operators[i]->op_type != OP_ALLREDUCE) { continue; } fused_op = new FusedOp(*this, operators[i]); @@ -2967,7 +2975,51 @@ void FFModel::create_operators_from_layers() { inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } Op *op = nullptr; - // add replicate operators if needed + // add a combine before arg_topk + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && l->op_type == OP_ARG_TOPK) { + std::vector partitioned_inputs; + assert(inputs.size() == 1); + Combine *comb = new Combine(*this, + inputs[0], + 0 /*inner most dim*/, + config.tensor_parallelism_degree); + 
partitioned_inputs.push_back(comb->outputs[0]); + operators.push_back(comb); + op = create_operator_from_layer(l, partitioned_inputs); + } else { + op = create_operator_from_layer(l, inputs); + } + // add replicate operators after op if needed + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && l->op_type == OP_EMBEDDING) { + assert(op->numOutputs == 1); + Replicate *repl = new Replicate(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + config.tensor_parallelism_degree); + operators.push_back(repl); + op = repl; + } else if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR))) { + assert(op->numOutputs == 1); + AllReduce *allreduce = + new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + operators.push_back(allreduce); + op = allreduce; + } +#ifdef DEADCODE if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || @@ -3022,7 +3074,7 @@ void FFModel::create_operators_from_layers() { operators.push_back(reduct); op = reduct; } - +#endif assert(op->numOutputs == l->numOutputs); for (int i = 0; i < op->numOutputs; i++) { tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; @@ -3364,13 +3416,10 @@ void FFModel::compile(LossType loss_type, } #ifdef FF_USE_NCCL - if (config.computationMode == COMP_MODE_TRAINING) { - // init all nccl communicators - for (size_t l = 0; l < operators.size(); l++) { - // Only create nccl for weights - if (operators[l]->op_type != OP_WEIGHT) { - continue; - } + for (size_t l = 0; l < operators.size(); l++) { + // Only create nccl for weights in training + if ((operators[l]->op_type == OP_WEIGHT && + config.computationMode == COMP_MODE_TRAINING)) { MachineView view = operators[l]->outputs[0]->machine_view; if (view_hash_to_nccl_comms.find(view.hash()) == view_hash_to_nccl_comms.end()) { @@ -3789,6 +3838,9 @@ FFConfig::FFConfig() { } // Use Real::Machine::get_address_space_count() to obtain the number of nodes numNodes = Realm::Machine::get_machine().get_address_space_count(); + data_parallelism_degree = 1; + tensor_parallelism_degree = 1; + pipeline_parallelism_degree = 1; Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; @@ -4426,6 +4478,13 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "Linear Init Task"); } + { + TaskVariantRegistrar registrar(LINEAR_INF_TASK_ID, "Linear Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "Linear Inference Task"); + } { TaskVariantRegistrar registrar(LINEAR_FWD_TASK_ID, "Linear Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -4836,6 +4895,13 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); } + { + TaskVariantRegistrar 
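// Illustrative only: with tensor_parallelism_degree == 2, the operator-insertion
// rules above turn a standard transformer MLP (Linear -> ReLU -> Linear) into
// roughly the following parallel pattern (Megatron-style sharding is assumed):
//
//   Embedding -> Replicate(2) -> Linear (shard 0|1) -> ReLU -> Linear (shard 0|1)
//             -> AllReduce(2) -> next layer
//
// Each GPU produces a partial result of the second Linear on its shard, and the
// inserted AllReduce sums the two partials so downstream operators see the full
// activation; the same AllReduce insertion applies after the attention operators.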
registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "FusedOp Inference Task"); + } { TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -4935,6 +5001,28 @@ void register_flexflow_internal_tasks() { Runtime::preregister_task_variant( registrar, "Reduction Backward Task"); } + // AllReduce + { + TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "AllReduce init Task"); + } + { + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } + { + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 8fdeacc623..6b61d5ac7a 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -34,6 +34,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -105,6 +106,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Reduction *)op)->get_params(); case OP_COMBINE: return ((Combine *)op)->get_params(); + case OP_ALLREDUCE: + return ((AllReduce *)op)->get_params(); case OP_FUSED_PARALLEL: return ((FusedParallelOp *)op)->get_params(); case OP_TRANSPOSE: diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b47b17ad12..478092727f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -243,6 +243,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; + outputFile << "num decoding steps: " << profile_info.decoding_steps + << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -562,6 +564,8 @@ BeamSearchBatchConfig outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; + outputFile << "num decoding steps: " << profile_info.decoding_steps + << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 58623258f1..6a61e70fc6 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -37,6 +37,7 @@ #include "flexflow/ops/softmax.h" #include "flexflow/ops/split.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/parallel_ops/allreduce.h" #include 
"flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" @@ -898,8 +899,11 @@ bool GraphXfer::create_new_operator(OpX const *opx, Node &op) { case OP_EW_MUL: case OP_EW_MAX: case OP_EW_MIN: { + ElementBinaryParams params; + params.type = opx->type; + params.inplace_a = false; op = model->get_or_create_node({inputs[0], inputs[1]}, - {opx->type}); + params); break; } case OP_RELU: { @@ -3683,8 +3687,13 @@ bool FFModel::convert_graph_to_operators( case OP_EW_MIN: { assert(inList.size() == 2); ElementBinary *eb = (ElementBinary *)node.ptr; - new_op = new ElementBinary( - *this, eb->op_type, inputs[0], inputs[1], eb->inplace_a, NULL); + new_op = new ElementBinary(*this, + eb->layer_guid, + eb->op_type, + inputs[0], + inputs[1], + eb->inplace_a, + NULL); break; } case OP_POOL2D: { @@ -3777,6 +3786,12 @@ bool FFModel::convert_graph_to_operators( reduction->reduction_degree); break; } + case OP_ALLREDUCE: { + assert(inList.size() == 1); + AllReduce *allreduce = (AllReduce *)node.ptr; + new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim); + break; + } case OP_FUSED_PARALLEL: { assert(inList.size() == 1); FusedParallelOp *fused = (FusedParallelOp *)node.ptr; diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 761c6cf332..f50d374633 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -37,26 +37,26 @@ mkdir -p ../inference/output ############################################################################################### # LLAMA -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama.txt +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_half.txt +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json 
-output-file ../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt.txt +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 # OPT (half precision) -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half.txt +../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_tp.txt -tensor-parallelism-degree 2 + ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config 
../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_half_tp.txt -tensor-parallelism-degree 2 + ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT - ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_tp.txt -tensor-parallelism-degree 2 + ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (half precision) - ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half_tp.txt -tensor-parallelism-degree 2 + ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -64,61 +64,80 @@ fi ############################################################################################### # LLAMA (small model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M.txt +../build/inference/incr_decoding/incr_decoding 
-ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 # LLAMA (small model, half precision) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half.txt +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B.txt +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half.txt +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M.txt +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file 
../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 # OPT (small model, half precision) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half.txt +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 # OPT (big model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B.txt +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 # OPT (big model, half precision) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half.txt +../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp.txt -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (small model, half precision) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ 
-llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp.txt -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_tp.txt -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half_tp.txt -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp.txt -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model, half precision) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer 
../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_tp.txt -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model, half precision) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half_tp.txt -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### ############################### Alignment and Speed tests ##################################### ############################################################################################### -############ Alignment between speculative inference and incremental decoding ################# -# Full precision -diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B.txt") <(tail -n +2 "../inference/output/spec_inference_llama.txt") -diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B.txt") <(tail -n +2 "../inference/output/spec_inference_opt.txt") -# Half precision -#diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B_half.txt") <(tail -n +2 "../inference/output/spec_inference_llama_half.txt") -#diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B_half.txt" ) <(tail -n +2 "../inference/output/spec_inference_opt_half.txt") +##################################### Helper functions ####################################### +function check_partial_token_match { + local file1="$1" + local file2="$2" + local num_tokens_to_match=30 + + # Read the second line of the first file + third_line=$(sed -n '3p' "$file1") + read -r line1 <<< "$third_line" + tokens1=${line1#*: } + IFS=',' read -ra 
arr1 <<< "$tokens1"
+
+  # Read the second line of the second file
+  third_line=$(sed -n '3p' "$file2")
+  read -r line2 <<< "$third_line"
+  tokens2=${line2#*: }
+  IFS=',' read -ra arr2 <<< "$tokens2"
+
+  # Compare the first few integers in the two lists
+  for ((i = 0; i < num_tokens_to_match; i++)); do
+    if [[ "${arr1[$i]}" != "${arr2[$i]}" ]]; then
+      echo "The first $num_tokens_to_match tokens in files $file1 and $file2 are not identical."
+      exit 1
+    fi
+  done
+  #echo "The first $num_tokens_to_match integers are identical."
+}
-# Speed test: speculative inference should be at very least 1.5x faster than incremental decoding
 function compare_speed_spec_infer_incr_decoding {
   local incrDec_file="$1"
   local specInf_file="$2"
@@ -142,27 +161,69 @@ function compare_speed_spec_infer_incr_decoding {
     exit 1
   fi
 }
+
+function compare_decoding_steps_spec_infer_incr_decoding {
+  local incrDec_file="$1"
+  local specInf_file="$2"
+
+  # Read the number of decoding steps from the second line of the files
+  second_line=$(sed -n '2p' "$incrDec_file")
+  read -r line <<< "$second_line"
+  incrDec=${line#*: }
+  second_line=$(sed -n '2p' "$specInf_file")
+  read -r line <<< "$second_line"
+  specInf=${line#*: }
+
+  if ! command -v bc &> /dev/null; then
+    echo "bc is not installed. Installing..."
+    sudo apt-get install -y bc
+  fi
+
+  # Perform the comparison
+  threshold=$(bc <<< "$specInf * 1.5")
+  if (( $(echo "$incrDec >= $threshold" | bc -l) )); then
+    #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file."
+    :
+  else
+    echo "Error: The decoding steps in $specInf_file are not at least 1.5x less than those in $incrDec_file!"
+    exit 1
+  fi
+}
+
+############ Alignment between speculative inference and incremental decoding #################
+# Full precision
+diff <(tail -n +3 "../inference/output/incr_decoding_llama_7B.txt") <(tail -n +3 "../inference/output/spec_inference_llama.txt")
+diff <(tail -n +3 "../inference/output/incr_decoding_opt_6B.txt") <(tail -n +3 "../inference/output/spec_inference_opt.txt")
+# Half precision
+check_partial_token_match "../inference/output/incr_decoding_llama_7B_half.txt" "../inference/output/spec_inference_llama_half.txt"
+check_partial_token_match "../inference/output/incr_decoding_opt_6B_half.txt" "../inference/output/spec_inference_opt_half.txt"
+
+# Speed test: speculative inference should be at very least 1.5x faster than incremental decoding
 # Full precision
-compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B.txt" "../inference/output/spec_inference_llama.txt"
-compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B.txt" "../inference/output/spec_inference_opt.txt"
+#compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B.txt" "../inference/output/spec_inference_llama.txt"
+#compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B.txt" "../inference/output/spec_inference_opt.txt"
+compare_decoding_steps_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B.txt" "../inference/output/spec_inference_llama.txt"
+compare_decoding_steps_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B.txt" "../inference/output/spec_inference_opt.txt"
 # Half precision
 #compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B_half.txt" "../inference/output/spec_inference_llama_half.txt"
 #compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B_half.txt"
"../inference/output/spec_inference_opt_half.txt" +compare_decoding_steps_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B_half.txt" "../inference/output/spec_inference_llama_half.txt" +compare_decoding_steps_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B_half.txt" "../inference/output/spec_inference_opt_half.txt" ############ Alignment between tensor model parallelism and pipeline parallelism only ################# if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then - diff <(tail -n +2 "../inference/output/spec_inference_llama_tp.txt") <(tail -n +2 "../inference/output/spec_inference_llama.txt") - diff <(tail -n +2 "../inference/output/spec_inference_opt_tp.txt") <(tail -n +2 "../inference/output/spec_inference_opt.txt") - diff <(tail -n +2 "../inference/output/spec_inference_llama_half_tp.txt") <(tail -n +2 "../inference/output/spec_inference_llama_half.txt") - diff <(tail -n +2 "../inference/output/spec_inference_opt_half_tp.txt") <(tail -n +2 "../inference/output/spec_inference_opt_half.txt") - diff <(tail -n +2 "../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_160M.txt") - # diff <(tail -n +2 "../inference/output/incr_decoding_llama_160M_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_160M_half.txt") - diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_7B.txt") - diff <(tail -n +2 "../inference/output/incr_decoding_llama_7B_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_llama_7B_half.txt") - diff <(tail -n +2 "../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_125M.txt") - diff <(tail -n +2 "../inference/output/incr_decoding_opt_125M_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_125M_half.txt") - diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_6B.txt") - diff <(tail -n +2 "../inference/output/incr_decoding_opt_6B_half_tp.txt") <(tail -n +2 "../inference/output/incr_decoding_opt_6B_half.txt") + diff <(tail -n +3 "../inference/output/spec_inference_llama_tp.txt") <(tail -n +3 "../inference/output/spec_inference_llama.txt") + diff <(tail -n +3 "../inference/output/spec_inference_opt_tp.txt") <(tail -n +3 "../inference/output/spec_inference_opt.txt") + check_partial_token_match "../inference/output/spec_inference_llama_half_tp.txt" "../inference/output/spec_inference_llama_half.txt" + check_partial_token_match "../inference/output/spec_inference_opt_half_tp.txt" "../inference/output/spec_inference_opt_half.txt" + diff <(tail -n +3 "../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +3 "../inference/output/incr_decoding_llama_160M.txt") + check_partial_token_match "../inference/output/incr_decoding_llama_160M_half_tp.txt" "../inference/output/incr_decoding_llama_160M_half.txt" + diff <(tail -n +3 "../inference/output/incr_decoding_llama_7B_tp.txt") <(tail -n +3 "../inference/output/incr_decoding_llama_7B.txt") + check_partial_token_match "../inference/output/incr_decoding_llama_7B_half_tp.txt" "../inference/output/incr_decoding_llama_7B_half.txt" + diff <(tail -n +3 "../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +3 "../inference/output/incr_decoding_opt_125M.txt") + check_partial_token_match "../inference/output/incr_decoding_opt_125M_half_tp.txt" "../inference/output/incr_decoding_opt_125M_half.txt" + diff <(tail -n 
+3 "../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +3 "../inference/output/incr_decoding_opt_6B.txt") + check_partial_token_match "../inference/output/incr_decoding_opt_6B_half_tp.txt" "../inference/output/incr_decoding_opt_6B_half.txt" fi ######################### Alignment tests with HuggingFace #################################### @@ -192,15 +253,15 @@ python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" -- # OPT (big model, half precision) #python3 ./inference/huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 127 -diff <(tail -n +2 "../inference/output/huggingface_llama_160M.txt") <(tail -n +4 "../inference/output/incr_decoding_llama_160M.txt") -diff <(tail -n +2 "../inference/output/huggingface_llama_160M_half.txt") <(tail -n +4 "../inference/output/incr_decoding_llama_160M_half.txt") -diff <(tail -n +2 "../inference/output/huggingface_llama_7B.txt") <(tail -n +4 "../inference/output/incr_decoding_llama_7B.txt") -diff <(tail -n +2 "../inference/output/huggingface_llama_7B_half.txt") <(tail -n +4 "../inference/output/incr_decoding_llama_7B_half.txt") +diff <(tail -n +2 "../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../inference/output/incr_decoding_llama_160M.txt") +diff <(tail -n +2 "../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff <(tail -n +2 "../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../inference/output/incr_decoding_llama_7B.txt") +diff <(tail -n +2 "../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../inference/output/incr_decoding_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../inference/output/huggingface_opt_125M.txt") <(tail -n +4 "../inference/output/incr_decoding_opt_125M.txt") -diff <(tail -n +2 "../inference/output/huggingface_opt_125M_half.txt") <(tail -n +4 "../inference/output/incr_decoding_opt_125M_half.txt") -#diff <(tail -n +2 "../inference/output/huggingface_opt_6B.txt") <(tail -n +4 "../inference/output/incr_decoding_opt_6B.txt") -#diff <(tail -n +2 "../inference/output/huggingface_opt_6B_half.txt") <(tail -n +4 "../inference/output/incr_decoding_opt_6B_half.txt") +diff <(tail -n +2 "../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../inference/output/incr_decoding_opt_125M.txt") +diff <(tail -n +2 "../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +#diff <(tail -n +2 "../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../inference/output/incr_decoding_opt_6B.txt") +#diff <(tail -n +2 "../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../inference/output/incr_decoding_opt_6B_half.txt") ############################################################################################### ###################################### Cleanup ################################################ From ae67898b00405a130e8197b0b7808b5fc27d4867 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Sun, 16 Jul 2023 14:45:58 -0400 Subject: [PATCH 166/344] change batch_size to 
num_active_tokens (#861) --- include/flexflow/ops/beam_topk.h | 4 ++-- src/ops/arg_topk.cc | 3 --- src/ops/beam_topk.cc | 5 +---- src/ops/beam_topk.cpp | 4 ++-- src/ops/beam_topk.cu | 4 ++-- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h index 76404bfb6d..57ab5c1074 100644 --- a/include/flexflow/ops/beam_topk.h +++ b/include/flexflow/ops/beam_topk.h @@ -82,7 +82,7 @@ class BeamTopK : public Op { float *output_ptr, int *indices_ptr, int *parent_ptr, - size_t batch_size, + int batch_size, int length, bool sorted, ffStream_t stream); @@ -92,7 +92,7 @@ class BeamTopK : public Op { float *output_ptr, int *indices_ptr, int *parent_ptr, - size_t batch_size, + int batch_size, int length, bool sorted); Params get_params() const; diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index a604c016d2..c1bbb65f1e 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -311,9 +311,6 @@ InferenceResult int batch_size = bc->num_active_tokens(); ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); - int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; - batch_size = input.domain.get_volume() / length; - InferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index db507c1729..0920105acc 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -379,12 +379,9 @@ BeamInferenceResult // total token nums size_t tokens_per_request = in1_domain.hi()[1] - in1_domain.lo()[1] + 1; - size_t batch_size = in1_domain.get_volume() / length; - + int batch_size = bc->num_active_tokens(); // std::cout << "beam search topk params: " << length << ", " << k << ", " // << batch_size << "\n"; - assert(out2_domain.get_volume() / k == batch_size); - // std::vector beam_width; // std::unordered_map sub_requests = bc->sub_requests; // for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 1817eae4da..248ab188da 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -479,7 +479,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, float *output_ptr, int *indices_ptr, int *parent_ptr, - size_t batch_size, + int batch_size, int length, bool sorted, hipStream_t stream) { @@ -630,7 +630,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, float *output_ptr, int *indices_ptr, int *parent_ptr, - size_t batch_size, + int batch_size, int length, bool sorted) { hipStream_t stream; diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 9a5cd86486..ceddb55f2d 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -511,7 +511,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, float *output_ptr, int *indices_ptr, int *parent_ptr, - size_t batch_size, + int batch_size, int length, bool sorted, cudaStream_t stream) { @@ -662,7 +662,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, float *output_ptr, int *indices_ptr, int *parent_ptr, - size_t batch_size, + int batch_size, int length, bool sorted) { cudaStream_t stream; From 58b745d04c67a85fb42392ecd692fda30b8e80ae Mon Sep 17 00:00:00 2001 From: lambda shi Date: Mon, 17 Jul 2023 05:03:50 +0800 Subject: [PATCH 167/344] Add opt-13B config (#841) Co-authored-by: Zhihao Jia --- inference/models/configs/opt_13B.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 inference/models/configs/opt_13B.json diff --git a/inference/models/configs/opt_13B.json 
b/inference/models/configs/opt_13B.json new file mode 100644 index 0000000000..96cad5c99b --- /dev/null +++ b/inference/models/configs/opt_13B.json @@ -0,0 +1,15 @@ +{ + "vocab_size": 50272, + "word_embed_proj_dim": 5120, + "hidden_size": 5120, + "num_attention_heads": 40, + "max_position_embeddings": 2048, + "layer_norm_elementwise_affine": true, + "num_hidden_layers": 40, + "dropout": 0.1, + "ffn_dim": 20480, + "max_beam_width": 1, + "batchSize": 8, + "sentence_len": 100, + "max_beam_depth": 4 +} From b359ce9294d9b72d7fd411edfc0ac9780d206e90 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 16 Jul 2023 23:00:01 -0400 Subject: [PATCH 168/344] temp fix to bug --- src/ops/arg_topk.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 575e0183b4..1011f42d51 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -411,6 +411,9 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + // TODO: remove this + cudaStreamSynchronize(stream); + // Domain in1_domain = runtime->get_index_space_domain( // ctx, task->regions[0].region.get_index_space()); // Domain out1_domain = runtime->get_index_space_domain( @@ -481,6 +484,10 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, cudaEventDestroy(t_end); printf("[ArgTopK] forward time = %.2lfms\n", elapsed); } + + // TODO: remove this + cudaStreamSynchronize(stream); + } ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op) From 28fd257bcd48a8336b5fea613080c6110b59c45d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 16 Jul 2023 23:00:24 -0400 Subject: [PATCH 169/344] linting --- src/ops/arg_topk.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 1011f42d51..8bd8222fa3 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -487,7 +487,6 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, // TODO: remove this cudaStreamSynchronize(stream); - } ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op) From 96e41380326eec9ae014f768bb843793da971dbc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 17 Jul 2023 14:19:39 -0400 Subject: [PATCH 170/344] replaced cudamemcpy with cudamemcpyasync --- src/ops/arg_topk.cu | 6 +++--- src/runtime/cuda_helper.cu | 41 ++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 8bd8222fa3..3ac19dec8e 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -411,9 +411,6 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - // TODO: remove this - cudaStreamSynchronize(stream); - // Domain in1_domain = runtime->get_index_space_domain( // ctx, task->regions[0].region.get_index_space()); // Domain out1_domain = runtime->get_index_space_domain( @@ -484,9 +481,12 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, cudaEventDestroy(t_end); printf("[ArgTopK] forward time = %.2lfms\n", elapsed); } +<<<<<<< HEAD // TODO: remove this cudaStreamSynchronize(stream); +======= +>>>>>>> parent of b359ce92 (temp fix to bug) } ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op) diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index aa87a383af..d36413f993 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -211,15 +211,14 @@ __host__ void updateGAS(float *para_ptr, template __host__ void 
print_tensor(T const *ptr, size_t num_elements, char const *prefix) { - // device synchronize to make sure the data are ready - // checkCUDA(cudaDeviceSynchronize()); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); T *host_ptr; checkCUDA(cudaHostAlloc(&host_ptr, sizeof(T) * num_elements, cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); - // checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpyAsync( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { @@ -238,14 +237,14 @@ __host__ void print_beam_tensor(T const *ptr, int skip, int channel, char const *prefix) { - // device synchronize to make sure the data are ready - // checkCUDA(cudaDeviceSynchronize()); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); T *host_ptr; checkCUDA(cudaHostAlloc(&host_ptr, sizeof(T) * channel * skip, cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpy( - host_ptr, ptr, sizeof(T) * channel * skip, cudaMemcpyDeviceToHost)); + checkCUDA(cudaMemcpyAsync( + host_ptr, ptr, sizeof(T) * channel * skip, cudaMemcpyDeviceToHost, stream)); // checkCUDA(cudaDeviceSynchronize()); int idx = 0; printf("%s", prefix); @@ -266,14 +265,14 @@ __host__ void print_beam_tensor(T const *ptr, template __host__ void save_tensor(T const *ptr, size_t num_elements, char const *file_name) { - // device synchronize to make sure the data are ready - // checkCUDA(cudaDeviceSynchronize()); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); T *host_ptr; checkCUDA(cudaHostAlloc(&host_ptr, sizeof(T) * num_elements, cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); + checkCUDA(cudaMemcpyAsync( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); // checkCUDA(cudaDeviceSynchronize()); FILE *tensor_file; @@ -288,26 +287,24 @@ __host__ void template __host__ T *download_tensor(T const *ptr, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(cudaDeviceSynchronize()); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); T *host_ptr; checkCUDA(cudaHostAlloc(&host_ptr, sizeof(T) * num_elements, cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); - // checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpyAsync( + host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); return host_ptr; } template __host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(cudaDeviceSynchronize()); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA( - cudaMemcpy(dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost)); - // checkCUDA(cudaDeviceSynchronize()); + cudaMemcpyAsync(dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); return true; } cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( From f6e4c5dfeba84e96eb59f4209b61cd2291d36a6d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 17 Jul 2023 14:21:03 -0400 Subject: [PATCH 171/344] linting --- src/runtime/cuda_helper.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/runtime/cuda_helper.cu 
b/src/runtime/cuda_helper.cu index d36413f993..dff5157a8a 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -243,8 +243,11 @@ __host__ void print_beam_tensor(T const *ptr, checkCUDA(cudaHostAlloc(&host_ptr, sizeof(T) * channel * skip, cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync( - host_ptr, ptr, sizeof(T) * channel * skip, cudaMemcpyDeviceToHost, stream)); + checkCUDA(cudaMemcpyAsync(host_ptr, + ptr, + sizeof(T) * channel * skip, + cudaMemcpyDeviceToHost, + stream)); // checkCUDA(cudaDeviceSynchronize()); int idx = 0; printf("%s", prefix); @@ -303,8 +306,8 @@ __host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); - checkCUDA( - cudaMemcpyAsync(dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); + checkCUDA(cudaMemcpyAsync( + dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); return true; } cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( From 319c69ddf163e50a819ed040cfd65b87905eb3e3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 17 Jul 2023 14:23:15 -0400 Subject: [PATCH 172/344] fix merge issue --- src/ops/arg_topk.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 3ac19dec8e..575e0183b4 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -481,12 +481,6 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, cudaEventDestroy(t_end); printf("[ArgTopK] forward time = %.2lfms\n", elapsed); } -<<<<<<< HEAD - - // TODO: remove this - cudaStreamSynchronize(stream); -======= ->>>>>>> parent of b359ce92 (temp fix to bug) } ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op) From 3d494a1a58085c081e5585d0f186012c5754896e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 17 Jul 2023 23:01:43 +0000 Subject: [PATCH 173/344] fix bugs --- docker/run.sh | 4 +- src/runtime/model.cc | 363 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 297 insertions(+), 70 deletions(-) diff --git a/docker/run.sh b/docker/run.sh index e04e7d68c1..aad3c1da27 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -28,9 +28,9 @@ fi if [[ "$image" == "flexflow-environment" ]]; then - eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-environment-${FF_GPU_BACKEND}:latest" + eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-environment-${FF_GPU_BACKEND}-11.8.0:latest" elif [[ "$image" == "flexflow" ]]; then - eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-${FF_GPU_BACKEND}:latest" + eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-${FF_GPU_BACKEND}-11.8.0:latest" elif [[ "$image" == "mt5" ]]; then # Backward compatibility eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" \ diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f1c794cef7..5179178cd9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4069,8 +4069,16 @@ void register_flexflow_internal_tasks(Runtime *runtime, "RequestManager Load Tokens"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "RequestManager Load Tokens Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RequestManager Load Tokens Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } // 
RequestManager load position tokens { @@ -4078,8 +4086,16 @@ void register_flexflow_internal_tasks(Runtime *runtime, "RequestManager Load Position tokens"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "RequestManager Load Position Tokens Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RequestManager Load Position Tokens Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } // ElementUnary task { @@ -4180,29 +4196,57 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(EXPERTS_INIT_TASK_ID, "Experts Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Experts Init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Experts Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(EXPERTS_FWD_TASK_ID, "Experts Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Experts Forward Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Experts Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(EXPERTS_BWD_TASK_ID, "Experts Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Experts Backward Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Experts Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(EXPERTS_INF_TASK_ID, "Experts Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Experts Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Experts Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } // Cast { @@ -4789,15 +4833,29 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(RMSNROM_INIT_TASK_ID, "rmsnorm_init_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "rmsnorm_init_task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "rmsnorm_init_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(RMSNROM_FWD_TASK_ID, "rmsnorm_fwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "rmsnorm_fwd_task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "rmsnorm_fwd_task"); + } else { + if 
(enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); @@ -4832,8 +4890,15 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(LINEAR_INF_TASK_ID, "Linear Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Linear Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Linear Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(LINEAR_FWD_TASK_ID, "Linear Forward"); @@ -4953,8 +5018,17 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(SOFTMAX_INF_TASK_ID, "softmax_inf_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "softmax_inf_task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "softmax_inf_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } // compute Loss { @@ -5275,31 +5349,62 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ARG_TOPK_INIT_TASK_ID, "ArgTopK Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "ArgTopK Init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgTopK Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(ARG_TOPK_INF_TASK_ID, "ArgTopK Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "ArgTopK Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgTopK Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } // BeamTopk task { TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "BeamTopK Init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "BeamTopK Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(BEAM_TOPK_INF_TASK_ID, "BeamTopK Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "BeamTopK Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "BeamTopK Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } // Transpose task { @@ -5400,18 +5505,35 @@ void register_flexflow_internal_tasks(Runtime *runtime, "IncMultiHeadSelfAttention 
Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "IncMultiHeadSelfAttention Init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "IncMultiHeadSelfAttention Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } { TaskVariantRegistrar registrar(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, "IncMultiHeadSelfAttention Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant< - IncMultiHeadSelfAttention::inference_task>( - registrar, "IncMultiHeadSelfAttention Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant< + IncMultiHeadSelfAttention::inference_task>( + registrar, "IncMultiHeadSelfAttention Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } // MultiQueryAttention task { @@ -5419,18 +5541,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, "IncMultiQuerySelfAttention Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "IncMultiQuerySelfAttention Init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "IncMultiQuerySelfAttention Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } { TaskVariantRegistrar registrar(INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID, "IncMultiQuerySelfAttention Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant< - IncMultiQuerySelfAttention::inference_task>( - registrar, "IncMultiQuerySelfAttention Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant< + IncMultiQuerySelfAttention::inference_task>( + registrar, "IncMultiQuerySelfAttention Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime + ->register_task_variant( + registrar); + } } // speculative MultiHeadAttention task { @@ -5439,9 +5579,19 @@ void register_flexflow_internal_tasks(Runtime *runtime, "Speculative IncMultiHeadSelfAttention Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Speculative IncMultiHeadSelfAttention Init Task"); + if (pre_register) { + Runtime::preregister_task_variant< + OpMeta *, + SpecIncMultiHeadSelfAttention::init_task>( + registrar, "Speculative IncMultiHeadSelfAttention Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } { TaskVariantRegistrar registrar( @@ -5449,9 +5599,17 @@ void register_flexflow_internal_tasks(Runtime *runtime, "Speculative IncMultiHeadSelfAttention Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant< - SpecIncMultiHeadSelfAttention::inference_task>( - registrar, "Speculative IncMultiHeadSelfAttention Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant< + 
SpecIncMultiHeadSelfAttention::inference_task>( + registrar, "Speculative IncMultiHeadSelfAttention Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + SpecIncMultiHeadSelfAttention::inference_task>(registrar); + } } { TaskVariantRegistrar registrar( @@ -5459,9 +5617,19 @@ void register_flexflow_internal_tasks(Runtime *runtime, "TreeIncMultiHeadSelfAttention Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "TreeIncMultiHeadSelfAttention Init Task"); + if (pre_register) { + Runtime::preregister_task_variant< + OpMeta *, + TreeIncMultiHeadSelfAttention::init_task>( + registrar, "TreeIncMultiHeadSelfAttention Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } } { TaskVariantRegistrar registrar( @@ -5469,9 +5637,17 @@ void register_flexflow_internal_tasks(Runtime *runtime, "TreeIncMultiHeadSelfAttention Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant< - TreeIncMultiHeadSelfAttention::inference_task>( - registrar, "TreeIncMultiHeadSelfAttention Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant< + TreeIncMultiHeadSelfAttention::inference_task>( + registrar, "TreeIncMultiHeadSelfAttention Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + TreeIncMultiHeadSelfAttention::inference_task>(registrar); + } } // NoOp { @@ -5521,8 +5697,15 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "FusedOp Inference Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "FusedOp Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward"); @@ -5634,8 +5817,15 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Replicate init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(REPLICATE_FWD_TASK_ID, "Replicate Forward"); @@ -5670,8 +5860,15 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(REDUCTION_INIT_TASK_ID, "Reduction Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Reduction init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Reduction init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + 
runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(REDUCTION_FWD_TASK_ID, "Reduction Forward"); @@ -5706,22 +5903,43 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "AllReduce init Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "AllReduce Forward Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } { TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "AllReduce Backward Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } } // FusedParallelOp { @@ -6006,9 +6224,18 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(TENSOR_EQUAL_TASK_ID, "Tensor Equal"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Tensor Equal Task"); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Tensor Equal Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime + ->register_task_variant( + registrar); + } } } From b483b6698fcf4e004c49ef92bfd5d9d7b4d3f223 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 17 Jul 2023 23:05:44 +0000 Subject: [PATCH 174/344] undo accidental change --- docker/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/run.sh b/docker/run.sh index aad3c1da27..e04e7d68c1 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -28,9 +28,9 @@ fi if [[ "$image" == "flexflow-environment" ]]; then - eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-environment-${FF_GPU_BACKEND}-11.8.0:latest" + eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-environment-${FF_GPU_BACKEND}:latest" elif [[ "$image" == "flexflow" ]]; then - eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-${FF_GPU_BACKEND}-11.8.0:latest" + eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "flexflow-${FF_GPU_BACKEND}:latest" elif [[ "$image" == "mt5" ]]; then # Backward compatibility eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" \ From d3cd3709a35dc939a60dc6e153cb9ccb2c3ef4f3 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Tue, 18 Jul 2023 21:02:13 -0400 Subject: [PATCH 175/344] Inference: Sampling result (#854) * init * sort * . * del * . * finish impl. 
* clean up, format, hip_rocm
* format
* .
* fix half precision.
* try torch1.
* .
* batch size
* fix
* rename GenerationConfig SamplingConfig

---------

Co-authored-by: Zhihao Jia
---
 include/flexflow/ffconst.h               |   1 +
 include/flexflow/inference.h             |  12 +
 include/flexflow/model.h                 |   6 +
 include/flexflow/operator_params.h       |   2 +
 include/flexflow/ops/sampling.h          | 108 +++++++
 include/flexflow/ops/sampling_params.h   |  24 ++
 inference/incr_decoding/incr_decoding.cc |  25 ++
 inference/models/llama.cc                |  10 +-
 inference/models/llama.h                 |   1 +
 inference/spec_infer/spec_infer.cc       |   3 +
 src/ops/fused.cu                         |   3 +-
 src/ops/sampling.cc                      | 343 +++++++++++++++++++++++
 src/ops/sampling.cpp                     |  67 +++++
 src/ops/sampling.cu                      | 267 ++++++++++++++++++
 src/runtime/ffconst_utils.cc             |   2 +
 src/runtime/graph.cc                     |   5 +
 src/runtime/model.cc                     |  40 ++-
 src/runtime/operator_params.cc           |   3 +
 18 files changed, 919 insertions(+), 3 deletions(-)
 create mode 100644 include/flexflow/ops/sampling.h
 create mode 100644 include/flexflow/ops/sampling_params.h
 create mode 100644 src/ops/sampling.cc
 create mode 100644 src/ops/sampling.cpp
 create mode 100644 src/ops/sampling.cu

diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h
index 3d899ac91d..65fa23569b 100644
--- a/include/flexflow/ffconst.h
+++ b/include/flexflow/ffconst.h
@@ -167,6 +167,7 @@ enum OperatorType {
   OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
   OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
   OP_INC_MULTIQUERY_SELF_ATTENTION,
+  OP_SAMPLING,
   // Parallel Ops
   OP_REPARTITION,
   OP_COMBINE,
diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index a1846c96dc..0c5274e15b 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -65,6 +65,18 @@ struct BeamTree {
   treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1];
 };
 
+struct SamplingConfig {
+  bool do_sample = false;
+  float temperature = 0.8;
+  float topp = 0.6;
+  SamplingConfig(bool _do_sample, float _temperature, float _topp) {
+    temperature = _temperature > 0 ? _temperature : temperature;
+    topp = _topp > 0 ?
_topp : topp; + do_sample = _do_sample; + } + SamplingConfig() {} +}; + // struct BeamTree_v2 { // std::vector tokens; // std::vector parent_ids; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 38c1cec838..3a76209b98 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -136,6 +136,8 @@ enum TaskIDs { TOPK_BWD_TASK_ID, ARG_TOPK_INIT_TASK_ID, ARG_TOPK_INF_TASK_ID, + SAMPLING_INIT_TASK_ID, + SAMPLING_INF_TASK_ID, TRANSPOSE_INIT_TASK_ID, TRANSPOSE_FWD_TASK_ID, TRANSPOSE_BWD_TASK_ID, @@ -312,6 +314,7 @@ class RMSNorm; class BeamTopK; class SpecIncMultiHeadSelfAttention; class IncMultiQuerySelfAttention; +class Sampling; class Combine; class Repartition; class Reduction; @@ -612,6 +615,7 @@ class FFModel { int k, bool sorted, char const *name = NULL); + Tensor sampling(const Tensor input, float top_p, char const *name = NULL); Tensor multihead_attention(const Tensor query, const Tensor key, const Tensor value, @@ -1061,6 +1065,8 @@ class FFModel { IncMultiQuerySelfAttention *>, std::unordered_map, BeamTopK *>, + std::unordered_map, + Sampling *>, std::unordered_map< std::pair, SpecIncMultiHeadSelfAttention *>, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index f6918ff581..5c2101d190 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -26,6 +26,7 @@ #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" #include "flexflow/ops/rms_norm_params.h" +#include "flexflow/ops/sampling_params.h" #include "flexflow/ops/softmax_params.h" #include "flexflow/ops/spec_inc_multihead_self_attention_params.h" #include "flexflow/ops/split_params.h" @@ -71,6 +72,7 @@ using OperatorParameters = mp::variant +#include +#endif + +namespace FlexFlow { + +class SamplingMeta : public OpMeta { +public: + float top_p; + void *sorted_logits; + int *sorted_idx; + int *begin_offset; + int *end_offset; + int *idx; + void *d_temp_storage; + size_t temp_storage_bytes; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + curandState *state; +#endif + SamplingMeta(FFHandler handle, + Op const *op, + int batch_size, + int total_ele, + GenericTensorAccessorW input); +}; + +class Sampling : public Op { +public: + using Params = SamplingParams; + using Input = ParallelTensor; + Sampling(FFModel &model, + const ParallelTensor input, + float top_p, + char const *name); + Sampling(FFModel &model, Sampling const &other, const ParallelTensor input); + Sampling(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static InferenceResult + inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void serialize(Legion::Serializer &s) const override; + static PCG::Node 
deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + template + static void forward_kernel(SamplingMeta const *m, + DT *input_ptr, + int *indices_ptr, + float top_p, + int length, + int batch_size, + ffStream_t stream); + static void forward_kernel_wrapper(SamplingMeta const *m, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &indices, + int batch_size); + Params get_params() const; + +public: + float top_p; +}; + +}; // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/include/flexflow/ops/sampling_params.h b/include/flexflow/ops/sampling_params.h new file mode 100644 index 0000000000..1449ddbf54 --- /dev/null +++ b/include/flexflow/ops/sampling_params.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_SAMPLING_PARAMS_H +#define _FLEXFLOW_SAMPLING_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct SamplingParams { + float top_p; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(SamplingParams const &, SamplingParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::SamplingParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_SAMPLING_PARAMS_H \ No newline at end of file diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 68a8e10042..17fc58c53a 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -38,6 +38,9 @@ void parse_input_args(char **argv, ModelType &llm_model_type, bool &use_full_precision, bool &verbose, + bool &do_sample, + float &temperature, + float &topp, int &data_parallelism_degree, int &tensor_parallelism_degree, int &pipeline_parallelism_degree) { @@ -109,6 +112,18 @@ void parse_input_args(char **argv, verbose = true; continue; } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } } } @@ -124,6 +139,9 @@ void FlexFlow::top_level_task(Task const *task, ModelType model_type; bool use_full_precision = false; bool verbose = false; + bool do_sample = false; + float temperature = 0.0f; + float topp = 0.0f; size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; int data_parallelism_degree = 1, tensor_parallelism_degree = 1, pipeline_parallelism_degree = 1; @@ -137,12 +155,16 @@ void FlexFlow::top_level_task(Task const *task, model_type, use_full_precision, verbose, + do_sample, + temperature, + topp, data_parallelism_degree, tensor_parallelism_degree, pipeline_parallelism_degree); ffconfig.data_parallelism_degree = data_parallelism_degree; ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; ffconfig.pipeline_parallelism_degree = pipeline_parallelism_degree; + assert(data_parallelism_degree * tensor_parallelism_degree * pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -150,6 +172,7 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); + SamplingConfig samplingConfig(do_sample, 
temperature, topp); InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); RequestManager rm(model_type, file_paths.tokenizer_file_path, @@ -163,6 +186,7 @@ void FlexFlow::top_level_task(Task const *task, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, INC_DECODING_MODE, + samplingConfig, use_full_precision); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, @@ -211,6 +235,7 @@ void FlexFlow::top_level_task(Task const *task, assert(fm.get_future_map_domain().get_volume() == 1); Future future = fm.get_future(0); ir = future.get_result(); + // assert(false); } // Execution fence diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e54ec13147..06dfaebcb1 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -24,6 +24,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, + SamplingConfig samplingConfig, bool use_full_precision) { // do not apply cpu offload in beam search model. Config llama_config(model_config_file_path); @@ -210,7 +211,14 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor softmax = ff.softmax(dense, -1); output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); } else { - output = ff.arg_top_k(dense, /*k=*/1, false); + // Tensor softmax = ff.softmax(dense, -1); + if (samplingConfig.do_sample) { + dense = ff.scalar_truediv(dense, samplingConfig.temperature, false); + Tensor softmax = ff.softmax(dense, -1); + output = ff.sampling(softmax, samplingConfig.topp); + } else { + output = ff.arg_top_k(dense, /*k=*/1, false); + } } // Compile the model diff --git a/inference/models/llama.h b/inference/models/llama.h index ab9bd4c7f3..6f80194d72 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -107,6 +107,7 @@ class LLAMA { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, + SamplingConfig samplingConfig, bool use_full_precision = false); }; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9cdcb454a2..a4c3dc64f9 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -199,6 +199,7 @@ void FlexFlow::top_level_task(Task const *task, } // Create SentencePiece tokenizer or OPT tokenizer + SamplingConfig samplingConfig; InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); RequestManager rm(model_types.llm_model_type, file_paths.tokenizer_file_path, @@ -213,6 +214,7 @@ void FlexFlow::top_level_task(Task const *task, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, TREE_VERIFY_MODE, + samplingConfig, use_full_precision); } else if (model_types.llm_model_type == ModelType::OPT) { OPT::create_opt_model(tree_model, @@ -245,6 +247,7 @@ void FlexFlow::top_level_task(Task const *task, file_paths.ssm_config_file_paths[ssm_id], file_paths.ssm_weight_file_paths[ssm_id], BEAM_SEARCH_MODE, + samplingConfig, use_full_precision); } else if (model_types.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 2f84100554..ef6c856871 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -748,7 +748,8 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); diff --git 
a/src/ops/sampling.cc b/src/ops/sampling.cc new file mode 100644 index 0000000000..8c01464042 --- /dev/null +++ b/src/ops/sampling.cc @@ -0,0 +1,343 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/sampling.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +// For an input tensor, computes the top k entries in each row +// (resp. vector along the last dimension). Thus, +// values.shape = indices.shape = input.shape[:-1] + [k] +Tensor FFModel::sampling(const Tensor input, float top_p, char const *name) { + Layer *li = new Layer(this, + OP_SAMPLING, + input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + // now just support 1 output + dims[0] = 1; + // li->outputs[0] = create_tensor_legion_ordering( + // numdims, dims, input->data_type, li, 0, true /*create_grad*/); + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + } + layers.push_back(li); + li->add_float_property("top_p", top_p); + // outputs[0] = li->outputs[0]; + // outputs[1] = li->outputs[1]; + return li->outputs[0]; +} + +Op *Sampling::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + float top_p; + layer->get_float_property("top_p", top_p); + return new Sampling(model, inputs[0], top_p, layer->name); +} + +SamplingParams Sampling::get_params() const { + SamplingParams params; + params.top_p = this->top_p; + return params; +} + +bool SamplingParams::is_valid(ParallelTensorShape const &) const { + return true; +} + +bool operator==(SamplingParams const &lhs, SamplingParams const &rhs) { + return lhs.top_p == rhs.top_p; +} + +Sampling::Sampling(FFModel &model, + const ParallelTensor _input, + float _top_p, + char const *name) + : Op(model, + OP_SAMPLING, + _input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + _input), + top_p(_top_p) { + int numdim = inputs[0]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[0]->dims[i]; + } + dims[0].size = 1; + 
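For intuition, the decoding pipeline this operator plugs into (as wired in llama.cc later in this patch: scalar_truediv by the temperature, softmax, then sampling(softmax, topp)) amounts to top-p ("nucleus") sampling: sort the probabilities in descending order, accumulate them, and return the original vocabulary index of the first entry whose cumulative mass reaches a uniformly random threshold scaled by top_p. The NumPy sketch below is illustrative only; the function name sample_top_p and the use of NumPy are editorial assumptions, not part of the patch.

import numpy as np

def sample_top_p(logits, top_p, temperature=1.0):
    # Temperature-scaled softmax (max subtracted for numerical stability).
    probs = np.exp((logits - np.max(logits)) / temperature)
    probs /= probs.sum()
    # Sort token ids from most to least probable and accumulate their mass.
    order = np.argsort(-probs)
    cumulative = np.cumsum(probs[order])
    # First position whose cumulative probability reaches a random cut in [0, top_p).
    threshold = np.random.uniform() * top_p
    cut = int(np.searchsorted(cumulative, threshold))
    return int(order[min(cut, len(order) - 1)])

# e.g. sample_top_p(np.array([2.0, 1.0, 0.1]), top_p=0.8, temperature=0.7)

The CUDA kernel added further below performs the same sort and running-sum steps with cub primitives, using one thread block per row of the batch.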
std::cout << "degree: " << inputs[0]->dims[0].degree << "\n"; + assert(inputs[0]->dims[0].degree == 1); + assert(inputs[0]->dims[0].parallel_idx == -1); + // outputs[0] = model.create_parallel_tensor_legion_ordering( + // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_INT32, this, 0 /*owner_idx*/); +} + +Sampling::Sampling(FFModel &model, + Sampling const &other, + const ParallelTensor input) + : Sampling(model, input, other.top_p, other.name) {} + +Sampling::Sampling(FFModel &model, + SamplingParams const ¶ms, + const ParallelTensor input, + char const *name) + : Sampling(model, input, params.top_p, name) {} + +void Sampling::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(SAMPLING_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Sampling)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void Sampling::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(SAMPLING_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(Sampling)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *Sampling::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Sampling *s = (Sampling *)task->args; + FFHandler handle = *((FFHandler *)task->local_args); + GenericTensorAccessorW acc_input = + helperGetGenericTensorAccessorRW(s->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + + int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; + int batch_size = acc_input.domain.get_volume() / length; + + SamplingMeta *m = + new SamplingMeta(handle, s, batch_size, length * batch_size, acc_input); + m->profiling = 
s->profiling; + m->top_p = s->top_p; + return m; +} + +void Sampling::forward(FFModel const &ff) { + // Sampling does not support forward + assert(false); +} + +FutureMap Sampling::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Sampling op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SAMPLING_INF_TASK_ID, + parallel_is, + TaskArgument(&bc, sizeof(BatchConfig)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +InferenceResult + Sampling::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + BatchConfig const *bc = (BatchConfig *)task->args; + SamplingMeta const *m = *((SamplingMeta **)task->local_args); + + GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int batch_size = bc->num_active_tokens(); + Sampling::forward_kernel_wrapper(m, input, indices, batch_size); + + InferenceResult ir; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size); + return ir; +} + +void Sampling::backward(FFModel const &ff) { + // Sampling does not support backward + assert(false); +} + +void Sampling::serialize(Legion::Serializer &sez) const { + sez.serialize(this->top_p); +} + +Node Sampling::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 1); + float top_p; + dez.deserialize(top_p); + SamplingParams params; + params.top_p = top_p; + return ff.get_or_create_node(inputs[0], params); +} + +Op *Sampling::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + SamplingParams params = get_params(); + return new Sampling(ff, params, inputs[0], this->name); +} + +bool Sampling::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::SamplingParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.top_p); + return key; +} +}; // namespace std \ No newline at end of file diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp new file mode 100644 index 0000000000..4901fe400c --- /dev/null +++ b/src/ops/sampling.cpp @@ -0,0 +1,67 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/sampling.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +/*static*/ +template <typename DT> +void Sampling::forward_kernel(SamplingMeta const *m, + DT *input_ptr, + int *indices_ptr, + float const top_p, + int const length, + int const batch_size, + hipStream_t stream) {} + +/*static*/ +void Sampling::forward_kernel_wrapper(SamplingMeta const *m, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &indices, + int batch_size) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + hipEventCreate(&t_start); + hipEventCreate(&t_end); + hipEventRecord(t_start, stream); + } + + handle_unimplemented_hip_kernel(OP_SAMPLING); + + if (m->profiling) { + hipEventRecord(t_end, stream); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + hipEventDestroy(t_start); + hipEventDestroy(t_end); + } +} + +SamplingMeta::SamplingMeta(FFHandler handler, + Op const *op, + int batch_size, + int total_ele, + GenericTensorAccessorW input) + : OpMeta(handler, op) {} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu new file mode 100644 index 0000000000..a91263a621 --- /dev/null +++ b/src/ops/sampling.cu @@ -0,0 +1,267 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cub/cub.cuh" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/sampling.h" +#include "flexflow/utils/cuda_helper.h" +#include +#include + +namespace FlexFlow { + +constexpr int SamplingNumThreads = 1024; +struct BlockPrefixCallbackOp { + // Running prefix + float running_total; + // Constructor + __device__ BlockPrefixCallbackOp(float running_total) + : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan.
+ __device__ float operator()(float block_aggregate) { + float old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +__global__ void init_idxs(int batch_size, + int vocab_size, + int total_eles, + int *idx, + int *begin_offset, + int *end_offset) { + CUDA_KERNEL_LOOP(i, total_eles) { + idx[i] = i % vocab_size; + if (i % vocab_size == 0) { + begin_offset[i / vocab_size] = i; + end_offset[i / vocab_size] = i; + } + } +} + +__global__ void + init_random_kernel(curandState *state, int batch_size, long rand) { + CUDA_KERNEL_LOOP(i, batch_size) { + curand_init(rand, i, 0, &state[i]); + } +} + +// multinomial and gather +template <typename DT, int BLOCK_SIZE> +__global__ void sampling_topp_kernel(int batch_size, + int const vocab_size, + curandState *state, + DT *sorted_logits, + int *sorted_idx, + int *indices_ptr, + float topp) { + // int const vocab_id = threadIdx.x; + int const batch_idx = blockIdx.x; + __shared__ float random_n; + __shared__ long long result_idx; + + // random num + if (threadIdx.x == 0) { + // number must be < topp + random_n = curand_uniform(state + batch_idx) * topp; + // printf("batch idx: %d, random num%f\n", batch_idx, random_n); + } + + __syncthreads(); + + // cumsum; + typedef cub::BlockScan<float, BLOCK_SIZE> BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + int offset = batch_idx * vocab_size; + float prefix_sum = 0.0f; + BlockPrefixCallbackOp prefix_op(0); + result_idx = vocab_size - 1; + + for (long long j = threadIdx.x; j < vocab_size; j += blockDim.x) { + float logit = (float)(sorted_logits[offset + j]); + BlockScan(temp_storage).InclusiveSum(logit, prefix_sum, prefix_op); + prefix_sum /= topp; + if (prefix_sum >= random_n) { + atomicMin(&result_idx, j); + } + } + indices_ptr[batch_idx] = sorted_idx[offset + result_idx]; + + // if (threadIdx.x == 0) { + // printf("selected idx: %d, %d\n", blockIdx.x, result_idx); + // } +} + +/*static*/ +template <typename DT> +void Sampling::forward_kernel(SamplingMeta const *m, + DT *input_ptr, + int *indices_ptr, + float const top_p, + int const length, + int const batch_size, + cudaStream_t stream) { + // 1. sort + size_t temp_storage_bytes = m->temp_storage_bytes; + checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( + m->d_temp_storage, + temp_storage_bytes, + input_ptr, + static_cast<DT *>
(m->sorted_logits), + m->idx, + m->sorted_idx, + length * batch_size, + batch_size, + m->begin_offset, + m->end_offset + 1, + 0, // begin_bit + sizeof(DT) * 8, // end_bit = sizeof(KeyT) * 8 + stream)); + int parallelism = batch_size; + init_random_kernel<<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(m->state, batch_size, rand()); + // sampling + sampling_topp_kernel<DT, SamplingNumThreads> + <<<batch_size, SamplingNumThreads, 0, stream>>>( + batch_size, + length, + m->state, + static_cast<DT *>
(m->sorted_logits), + m->sorted_idx, + indices_ptr, + top_p); +} + +/*static*/ +void Sampling::forward_kernel_wrapper(SamplingMeta const *m, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &indices, + int batch_size) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + + if (input.data_type == DT_HALF) { + Sampling::forward_kernel(m, + input.get_half_ptr(), + indices.get_int32_ptr(), + m->top_p, + length, + batch_size, + stream); + } else if (input.data_type == DT_FLOAT) { + Sampling::forward_kernel(m, + input.get_float_ptr(), + indices.get_int32_ptr(), + m->top_p, + length, + batch_size, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[Sampling] forward time = %.2lfms\n", elapsed); + } +} + +SamplingMeta::SamplingMeta(FFHandler handler, + Op const *op, + int batch_size, + int total_ele, + GenericTensorAccessorW input) + : OpMeta(handler, op) { + DataType data_type = op->data_type; + checkCUDA(cudaMalloc(&begin_offset, (batch_size + 1) * sizeof(int))); + checkCUDA(cudaMalloc(&end_offset, (batch_size + 1) * sizeof(int))); + checkCUDA(cudaMalloc(&idx, total_ele * sizeof(int))); + + checkCUDA(cudaMalloc(&sorted_idx, total_ele * sizeof(int))); + checkCUDA(cudaMalloc(&sorted_logits, total_ele * data_type_size(data_type))); + cudaMalloc(&state, sizeof(curandState) * batch_size); + + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // init offset + int parallelism = total_ele; + init_idxs<<>>(batch_size, + total_ele / batch_size, + total_ele, + idx, + begin_offset, + end_offset); + + // init sort function + if (data_type == DT_FLOAT) { + checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, + temp_storage_bytes, + input.get_float_ptr(), + input.get_float_ptr(), + idx, + idx, + total_ele, + batch_size, + begin_offset, + end_offset + 1, + 0, // begin_bit + data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 + stream)); + } else if (data_type == DT_HALF) { + checkCUDA(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, + temp_storage_bytes, + input.get_half_ptr(), + input.get_half_ptr(), + idx, + idx, + total_ele, + batch_size, + begin_offset, + end_offset + 1, + 0, // begin_bit + data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 + stream)); + } else { + assert(false && "input type in float and half"); + } + checkCUDA(cudaMalloc(&d_temp_storage, temp_storage_bytes)); +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 39f9d1dd0d..a777605daf 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -178,6 +178,8 @@ std::string get_operator_type_name(OperatorType type) { return "GELU"; case OP_IDENTITY: return "Identity"; + case OP_SAMPLING: + return "Sampling"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5c0513baa8..16bccc25df 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -40,6 +40,7 @@ #include 
"flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" #include "flexflow/ops/rms_norm.h" +#include "flexflow/ops/sampling.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -2919,6 +2920,10 @@ void FFModel::deserialize_graph_optimal_view( node = BeamTopK::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_SAMPLING: { + node = Sampling::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_GROUP_BY: { node = Group_by::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5179178cd9..22515a2bb0 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -52,6 +52,7 @@ #include "flexflow/ops/reshape.h" #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" +#include "flexflow/ops/sampling.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -2937,6 +2938,11 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_SAMPLING: { + Op *op = Sampling::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_GROUP_BY: { Op *op = Group_by::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -2977,7 +2983,8 @@ void FFModel::create_operators_from_layers() { Op *op = nullptr; // add a combine before arg_topk if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && l->op_type == OP_ARG_TOPK) { + config.tensor_parallelism_degree > 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -5406,6 +5413,37 @@ void register_flexflow_internal_tasks(Runtime *runtime, BeamTopK::inference_task>(registrar); } } + // Sampling task + { + TaskVariantRegistrar registrar(SAMPLING_INIT_TASK_ID, "Sampling Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Sampling Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(SAMPLING_INF_TASK_ID, "Sampling Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Sampling Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // Transpose task { TaskVariantRegistrar registrar(TRANSPOSE_INIT_TASK_ID, "Transpose Init"); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 6b61d5ac7a..8fb8c89b10 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -28,6 +28,7 @@ #include "flexflow/ops/reshape.h" #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" +#include "flexflow/ops/sampling.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -130,6 +131,8 @@ tl::optional get_op_parameters(Op const *op) { return ((ArgTopK *)op)->get_params(); case OP_BEAM_TOPK: return ((BeamTopK *)op)->get_params(); + case OP_SAMPLING: + return ((Sampling 
*)op)->get_params(); // TODO: implement the get_params() function for the operators below and // uncomment the lines below From 3e23dd8444af7ff24120a0cdc2e95a4afec592d5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 19 Jul 2023 16:07:26 -0400 Subject: [PATCH 176/344] fix --- .github/workflows/docker-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 280539eb5f..d059a0605f 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -44,7 +44,7 @@ jobs: env: FF_GPU_BACKEND: ${{ matrix.gpu_backend }} cuda_version: ${{ matrix.cuda_version }} - branch_name: ${GITHUB_REF#refs/heads/} + branch_name: ${{ github.head_ref || github.ref_name }} steps: - name: Checkout Git Repository uses: actions/checkout@v3 From 02d4b2058dd145e10427ce5edb931149f1e43833 Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Fri, 21 Jul 2023 00:17:21 -0400 Subject: [PATCH 177/344] update new models weights (#837) * update new models weights * update public model --------- Co-authored-by: Zhihao Jia --- inference/utils/download_llama_weights.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/inference/utils/download_llama_weights.py b/inference/utils/download_llama_weights.py index 1cd6928080..0cf4453aa0 100644 --- a/inference/utils/download_llama_weights.py +++ b/inference/utils/download_llama_weights.py @@ -8,6 +8,9 @@ # You can pass the --use-full-precision flag to use the full-precision weight. By default, we use half precision. parser = argparse.ArgumentParser() parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") +parser.add_argument("--use_13B", action="store_true", help="Use full precision") +parser.add_argument("--use_30B", action="store_true", help="Use full precision") +parser.add_argument("--use_65B", action="store_true", help="Use full precision") args = parser.parse_args() if not args.use_full_precision: import torch @@ -45,6 +48,22 @@ def convert_hf_model(model, dst_folder): dst_folder="../weights/llama_7B_weights" if args.use_full_precision else "../weights/llama_7B_weights_half" convert_hf_model(model, dst_folder) +# Download and convert model weights only for hf +if args.use_13B: + model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-13b-hf") + dst_folder="../weights/llama_13B_weights_half" + convert_hf_model(model, dst_folder) + +if args.use_30B: + model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-30b-hf") + dst_folder="../weights/llama_30B_weights_half" + convert_hf_model(model, dst_folder) + +if args.use_65B: + model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-65b-hf") + dst_folder="../weights/llama_65B_weights_half" + convert_hf_model(model, dst_folder) + # Download and convert small model weights model = AutoModelForCausalLM.from_pretrained("JackFram/llama-160m") dst_folder="../weights/llama_160M_weights" if args.use_full_precision else "../weights/llama_160M_weights_half" From 8caa8032a3dfef2b7b9d277d7fa192fe3fa52c96 Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Fri, 21 Jul 2023 10:52:44 -0400 Subject: [PATCH 178/344] Model weight flag explanation (#880) * update new models weights * update public model * update optional flag * add flag explanation --------- Co-authored-by: Zhihao Jia Co-authored-by: Gabriele Oliaro --- 
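The hunks above only show how the new CLI flags pick a checkpoint; the body of convert_hf_model is outside this diff. A rough, self-contained sketch of the flow the script follows is given below. It is illustrative only: the half-precision cast is assumed to be what --use-full-precision toggles off, and the final comment stands in for whatever convert_hf_model actually writes to disk.

import argparse
from transformers import AutoModelForCausalLM

parser = argparse.ArgumentParser()
parser.add_argument("--use-full-precision", action="store_true")
parser.add_argument("--use_13B", action="store_true")
args = parser.parse_args()

# Checkpoint names mirror the ones referenced in this commit.
name = "decapoda-research/llama-13b-hf" if args.use_13B else "JackFram/llama-160m"
model = AutoModelForCausalLM.from_pretrained(name)
if not args.use_full_precision:
    model = model.half()
# convert_hf_model(model, dst_folder) would then write the converted weights
# into dst_folder in the format FlexFlow's weight loader expects.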
inference/utils/download_llama_weights.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/inference/utils/download_llama_weights.py b/inference/utils/download_llama_weights.py index 0cf4453aa0..d2b11453e6 100644 --- a/inference/utils/download_llama_weights.py +++ b/inference/utils/download_llama_weights.py @@ -6,11 +6,12 @@ from transformers import AutoModelForCausalLM # You can pass the --use-full-precision flag to use the full-precision weight. By default, we use half precision. +# and pass "--use_13B", "--use_30B", and "--use_65B" to use the corresponding "llama-13B/30B/65B" model weights parser = argparse.ArgumentParser() parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") -parser.add_argument("--use_13B", action="store_true", help="Use full precision") -parser.add_argument("--use_30B", action="store_true", help="Use full precision") -parser.add_argument("--use_65B", action="store_true", help="Use full precision") +parser.add_argument("--use_13B", action="store_true", help="choose to use llama-13B") +parser.add_argument("--use_30B", action="store_true", help="choose to use llama-30B") +parser.add_argument("--use_65B", action="store_true", help="choose to use llama-65B") args = parser.parse_args() if not args.use_full_precision: import torch From 2ba481b2aec29dcc9fa34eb140cb41a3651443c2 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Fri, 21 Jul 2023 10:53:53 -0400 Subject: [PATCH 179/344] Inference: fix batch_size issue. (#863) Co-authored-by: Gabriele Oliaro --- include/flexflow/batch_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 61a1e345ae..e3ce1c5b95 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -44,7 +44,7 @@ class BatchConfig { int num_active_tokens() const; void print() const; virtual InferenceMode get_mode() const; - static int const MAX_NUM_REQUESTS = 1; + static int const MAX_NUM_REQUESTS = 16; static int const MAX_NUM_TOKENS = 64; static int const MAX_SEQ_LENGTH = 256; From d047aa6d6d84d4fa217c9c67cc218c3d800d0cac Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 22 Jul 2023 17:49:02 -0400 Subject: [PATCH 180/344] Python interface for inference (part 1) (#878) * first commit * add flexflow inference stub * some updates * fix * add stubs for models * fix * add batch config support in cffi * checkpoint * updates * finished stub of llama model * cleanup c++ models * moved inference loop into inference manager, begin surfacing to python * checkpoint * checkpoint * linting --- include/flexflow/flexflow_c.h | 137 +++++++++- include/flexflow/inference.h | 8 + include/flexflow/model.h | 2 +- inference/flexflow_inference.py | 43 +++ inference/incr_decoding/incr_decoding.cc | 14 +- inference/models/llama.cc | 35 +-- inference/models/opt.cc | 18 -- inference/spec_infer/spec_infer.cc | 60 +---- python/flexflow/core/flexflow_cffi.py | 322 ++++++++++++++++++++++- python/flexflow/serve/__init__.py | 15 ++ python/flexflow/serve/models/__init__.py | 17 ++ python/flexflow/serve/models/falcon.py | 19 ++ python/flexflow/serve/models/llama.py | 130 +++++++++ python/flexflow/serve/models/opt.py | 19 ++ python/flexflow/serve/serve.py | 85 ++++++ python/flexflow/type.py | 13 + src/c/flexflow_c.cc | 290 +++++++++++++++++++- src/ops/element_binary.cc | 14 +- src/runtime/inference_manager.cc | 84 ++++++ 19 files changed, 1181 insertions(+), 144 
deletions(-) create mode 100644 inference/flexflow_inference.py create mode 100644 python/flexflow/serve/__init__.py create mode 100644 python/flexflow/serve/models/__init__.py create mode 100644 python/flexflow/serve/models/falcon.py create mode 100644 python/flexflow/serve/models/llama.py create mode 100644 python/flexflow/serve/models/opt.py create mode 100644 python/flexflow/serve/serve.py diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 93b0444cb9..f8f9b97aad 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -47,6 +47,12 @@ FF_NEW_OPAQUE_TYPE(flexflow_dlrm_config_t); FF_NEW_OPAQUE_TYPE(flexflow_dataloader_4d_t); FF_NEW_OPAQUE_TYPE(flexflow_dataloader_2d_t); FF_NEW_OPAQUE_TYPE(flexflow_single_dataloader_t); +// Inference +FF_NEW_OPAQUE_TYPE(flexflow_batch_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_tree_verify_batch_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_beam_search_batch_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); +FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); // ----------------------------------------------------------------------- // FFConfig @@ -73,6 +79,7 @@ int flexflow_config_get_epochs(flexflow_config_t handle); bool flexflow_config_get_enable_control_replication(flexflow_config_t handle); int flexflow_config_get_python_data_loader_type(flexflow_config_t handle); + // ----------------------------------------------------------------------- // FFModel // ----------------------------------------------------------------------- @@ -197,9 +204,10 @@ flexflow_tensor_t flexflow_tensor_t flexflow_model_add_embedding(flexflow_model_t handle, const flexflow_tensor_t input, - int num_entires, + int num_entries, int out_dim, enum AggrMode aggr, + enum DataType dtype, flexflow_op_t shared_op, flexflow_initializer_t kernel_initializer, char const *name); @@ -383,8 +391,62 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( bool add_bias_kv, bool add_zero_attn, flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + char const *name); + +flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + char const *name); + +flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, char const *name); +flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, + const flexflow_tensor_t input_, + float eps, + int dim, + char const *name); + +flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, + const flexflow_tensor_t input_, + int k, + bool sorted, + char const *name); + +flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, + const flexflow_tensor_t input_, + int max_beam_size, + bool sorted, + char const *name); + +flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, + const flexflow_tensor_t input_, + float top_p, + char const *name); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); @@ -404,6 +466,8 @@ 
flexflow_tensor_t flexflow_model_get_parameter_by_id(flexflow_model_t handle, flexflow_perf_metrics_t flexflow_model_get_perf_metrics(flexflow_model_t handle); +void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); + // ----------------------------------------------------------------------- // Tensor // ----------------------------------------------------------------------- @@ -713,6 +777,77 @@ void flexflow_op_forward(flexflow_op_t handle, flexflow_model_t model); void flexflow_perform_registration(void); +// ----------------------------------------------------------------------- +// BatchConfig +// ----------------------------------------------------------------------- + +flexflow_batch_config_t flexflow_batch_config_create(void); + +void flexflow_batch_config_destroy(flexflow_batch_config_t handle); + +// ----------------------------------------------------------------------- +// TreeVerifyBatchConfig +// ----------------------------------------------------------------------- + +flexflow_tree_verify_batch_config_t + flexflow_tree_verify_batch_config_create(void); + +void flexflow_tree_verify_batch_config_destroy( + flexflow_tree_verify_batch_config_t handle); + +// ----------------------------------------------------------------------- +// BeamSearchBatchConfig +// ----------------------------------------------------------------------- + +flexflow_beam_search_batch_config_t + flexflow_beam_search_batch_config_create(void); + +void flexflow_beam_search_batch_config_destroy( + flexflow_beam_search_batch_config_t handle); + +// ----------------------------------------------------------------------- +// RequestManager +// ----------------------------------------------------------------------- + +flexflow_request_manager_t flexflow_request_manager_create(void); + +void flexflow_request_manager_destroy(flexflow_request_manager_t handle); + +long unsigned int flexflow_request_manager_register_new_request( + flexflow_request_manager_t handle, + char const *prompt, + int max_sequence_length); + +// ----------------------------------------------------------------------- +// InferenceManager +// ----------------------------------------------------------------------- + +flexflow_inference_manager_t + flexflow_inference_manager_create(flexflow_config_t config_handle, + int max_num_tokens_per_batch); + +void flexflow_inference_manager_destroy(flexflow_inference_manager_t handle); + +void flexflow_inference_manager_compile_model_and_allocate_buffer( + flexflow_inference_manager_t handle, flexflow_model_t model_handle); + +void flexflow_inference_manager_init_operators_inference( + flexflow_inference_manager_t handle, flexflow_model_t model_handle); + +void flexflow_inference_manager_incr_decoding_loop( + flexflow_inference_manager_t handle, + flexflow_model_t model_handle, + flexflow_request_manager_t rm_handle, + int total_num_requests); + +void flexflow_inference_manager_spec_inference_loop( + flexflow_inference_manager_t handle, + flexflow_model_t model_handle, + flexflow_request_manager_t rm_handle, + int total_num_requests, + int num_ssms, + int *ssm_model_ids); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 0c5274e15b..823bac9fd6 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -24,6 +24,7 @@ namespace FlexFlow { class FFModel; class BeamTree; +class RequestManager; using tokenizers::Tokenizer; class InferenceManager { @@ -36,6 +37,13 @@ class InferenceManager { void 
load_input_tokens_from_batch_config(BatchConfig const &bc, ParallelTensor const input); void load_positions(BatchConfig const &bc, ParallelTensor position_input); + void incr_decoding_loop(FFModel *model, + RequestManager &rm, + int total_num_requests); + void spec_inference_loop(FFModel *model, + RequestManager &rm, + int total_num_requests, + std::vector ssm_model_ids); public: FFConfig ff_config; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 3a76209b98..a95c229a08 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -467,7 +467,7 @@ class FFModel { char const *name = NULL); // Add an embedding layer Tensor embedding(const Tensor input, - int num_entires, + int num_entries, int outDim, AggrMode aggr, DataType dtype = DT_FLOAT, diff --git a/inference/flexflow_inference.py b/inference/flexflow_inference.py new file mode 100644 index 0000000000..6caace0f2d --- /dev/null +++ b/inference/flexflow_inference.py @@ -0,0 +1,43 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from flexflow.serve import LLM, SamplingConfig +from flexflow.core import * + +def get_prompts(json_filepath): + json_obj = None + return json_obj + +def top_level_task(): + # Incremental decoding + llama = LLM("decapoda-research/llama-30b-hf", data_type = "half") + sampling_config = SamplingConfig(do_sample=False, temperature = 0.9, topp = 0.8, topk = 1) + llama.compile(InferenceMode.INC_DECODING_MODE, sampling_config, use_full_precision=False, max_batch_size = 1, max_seq_length = 256, max_tokens_per_batch=64, tensor_parallel_degree = 4, pipeline_parallel_degree = 2) + + prompts = llama.generate(prompts, sampling=sampling_config) + # result = llama.generate("What's the best xxx in yyy?", sampling = sampling_config) + # print(result) + + # # Speculative inference + # llama = LLM("decapoda-research/llama-30b-hf", data_type = "half") + # ssm1 = LLM("Jackfram/llama-160m", data_type = "half") + # ssm2 = LLM("facebook/opt-125m", data_type = "half") + # sampling_config = SamplingConfig(temperature = 0.9, topp = 0.8, topk = 1) + # llama.serve(max_batch_size = 1, max_seq_length = 256, max_tokens_per_batch=64, tensor_parallel_degree = 4, pipeline_parallel_degree = 2, ssms = {ssm1, ssm2}) + # result = llama.generate("What's the best xxx in yyy?", sampling = sampling_config) + # print(result) + +if __name__ == "__main__": + print("flexflow inference") + top_level_task() diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 17fc58c53a..0059b30ae0 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -224,19 +224,7 @@ void FlexFlow::top_level_task(Task const *task, } } - BatchConfig bc; - InferenceResult ir; - while (rm.get_num_processed_requests() < total_num_requests) { - bc = rm.prepare_next_batch(bc, ir); - if (rm.get_num_processed_requests() >= total_num_requests) { - break; - } - FutureMap fm = 
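The flexflow_inference.py stub above leaves get_prompts unimplemented and uses prompts before assigning it; the flow it sketches, reconstructed from the stub's own calls, would look roughly like the following. The prompt list is invented for illustration, and every keyword argument is copied from the stub rather than from a finalized API, so treat this as a sketch of the intended usage only.

from flexflow.serve import LLM, SamplingConfig
from flexflow.core import *

def top_level_task():
    llama = LLM("decapoda-research/llama-30b-hf", data_type="half")
    sampling_config = SamplingConfig(do_sample=True, temperature=0.9, topp=0.8, topk=1)
    llama.compile(InferenceMode.INC_DECODING_MODE, sampling_config,
                  use_full_precision=False, max_batch_size=1, max_seq_length=256,
                  max_tokens_per_batch=64, tensor_parallel_degree=4,
                  pipeline_parallel_degree=2)
    prompts = ["What is the capital of France?"]  # invented example input
    results = llama.generate(prompts, sampling=sampling_config)
    print(results)

if __name__ == "__main__":
    top_level_task()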
im.inference(&model, 0, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - ir = future.get_result(); - // assert(false); - } + im.incr_decoding_loop(&model, rm, total_num_requests); // Execution fence { diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 06dfaebcb1..588d6d264c 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -29,38 +29,6 @@ void LLAMA::create_llama_model(FFModel &ff, // do not apply cpu offload in beam search model. Config llama_config(model_config_file_path); llama_config.printConfig(); - //---------------------- parallelization setup work ---------------------- - int num_devices = ff.config.workersPerNode * ff.config.numNodes; - int num_transformer_layers = llama_config.n_layers; - assert(num_transformer_layers % ff.config.pipeline_parallelism_degree == 0); - int num_layers_per_pp_block = - num_transformer_layers / ff.config.pipeline_parallelism_degree; - int num_devices_per_data_parallelism_line = - num_devices / ff.config.data_parallelism_degree; - - // std::cout << "dp: " << ff.config.data_parallelism_degree - // << " tp: " << ff.config.tensor_parallelism_degree - // << " pp: " << ff.config.pipeline_parallelism_degree << std::endl; - // std::cout << "num_devices: " << num_devices << std::endl; - // std::cout << "num_transformer_layers: " << num_transformer_layers - // << std::endl; - // std::cout << "num_devices_per_data_parallelism_line: " - // << num_devices_per_data_parallelism_line << std::endl; - // std::cout << "num layers: " << llama_config.n_layers << std::endl; - - //------------------------------compute machine views ------------------ - // single device - std::vector machine_views; - for (int i = 0; i < num_devices; i++) { - MachineView view; - view.device_type = MachineView::GPU; - view.ndims = 1; - view.dim[0] = 1; - view.stride[0] = 0; - view.start_device_id = i; - machine_views.push_back(view); - } - assert(machine_views.size() == num_devices); std::unordered_map weights_layers; @@ -96,11 +64,10 @@ void LLAMA::create_llama_model(FFModel &ff, Layer *embedding = ff.layers.back(); weights_layers.emplace("tok_embeddings_weight", embedding); - for (int i = 0; i < num_transformer_layers; i++) { + for (int i = 0; i < llama_config.n_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); // step 1: attention - std::vector axes = {2}; Tensor att_norm = ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); Layer *attention_norm = ff.layers.back(); diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 503be39672..94aeb7f2bd 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -27,24 +27,6 @@ void OPT::create_opt_model(FFModel &ff, bool use_full_precision) { Config opt_config(model_config_file_path); opt_config.printConfig(); - //---------------------- parallelization setup work ---------------------- - int num_devices = ff.config.workersPerNode * ff.config.numNodes; - int num_transformer_layers = opt_config.num_hidden_layers; - assert(num_transformer_layers % ff.config.pipeline_parallelism_degree == 0); - int num_layers_per_pp_block = - num_transformer_layers / ff.config.pipeline_parallelism_degree; - int num_devices_per_data_parallelism_line = - num_devices / ff.config.data_parallelism_degree; - - // std::cout << "dp: " << ff.config.data_parallelism_degree - // << " tp: " << ff.config.tensor_parallelism_degree - // << " pp: " << ff.config.pipeline_parallelism_degree << std::endl; - // std::cout << "num_devices: " 
<< num_devices << std::endl; - // std::cout << "num_transformer_layers: " << num_transformer_layers - // << std::endl; - // std::cout << "num_devices_per_data_parallelism_line: " - // << num_devices_per_data_parallelism_line << std::endl; - // std::cout << "num layers: " << opt_config.num_hidden_layers << std::endl; std::unordered_map weights_layers; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index a4c3dc64f9..a34c1b6a84 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -282,65 +282,7 @@ void FlexFlow::top_level_task(Task const *task, } } - TreeVerifyBatchConfig tree_bc; - BeamSearchBatchConfig beam_bc; - std::vector beam_bc_vec; - for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { - beam_bc_vec.push_back(BeamSearchBatchConfig(ssm_model_ids[ssm_id])); - } - - InferenceResult tree_ir; - - while (rm.get_num_processed_requests() < total_num_requests) { - int depth = 0; - // Beam Search - beam_bc = rm.prepare_next_batch_init(tree_bc, tree_ir, 0); - for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { - beam_bc_vec[ssm_id] = beam_bc; - beam_bc_vec[ssm_id].model_id = ssm_id; - } - - if (rm.get_num_processed_requests() >= total_num_requests) { - break; - } - - for (int i = 0; i < num_ssms; i++) { - while (true) { - beam_bc = beam_bc_vec[i]; - depth = beam_bc.beamRequestsInfo[0].current_depth; - - FutureMap fm = im.inference(rm.get_model(0), 0, beam_bc_vec[i]); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - BeamInferenceResult beam_ir = future.get_result(); - - int iteration = - std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, - BatchConfig::MAX_SEQ_LENGTH - beam_bc.max_init_length); - - if (depth - 1 >= iteration) { - break; - } else { - beam_bc_vec[i] = rm.prepare_next_batch_beam(beam_bc_vec[i], beam_ir); - if (beam_bc_vec[i].num_active_tokens() == 0 && - beam_bc_vec[i].num_active_requests() != 0) { - break; - } - } - } - std::cout << "----------beam search finished for model " - << beam_bc_vec[i].model_id << "------------" << std::endl; - } - // Token Tree Verification - { - tree_bc = rm.prepare_next_batch_verify(beam_bc_vec); - FutureMap fm = im.inference(&tree_model, 0, tree_bc); - - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - tree_ir = future.get_result(); - } - } + im.spec_inference_loop(&tree_model, rm, total_num_requests, ssm_model_ids); // Execution fence { diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index bb63dc153e..52ae0d9ef9 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -22,7 +22,7 @@ import warnings import numpy as np from .flexflow_logger import fflogger -from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, OpType, ParameterSyncType, enum_to_int, int_to_enum +from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, InferenceMode, OpType, ParameterSyncType, enum_to_int, int_to_enum _FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS")) if not _FF_BUILD_DOCS: from .flexflowlib import ffi, flexflow_library @@ -429,12 +429,40 @@ def __init__(self, handle, idx=None, name=None): super(MultiHeadAttention, self).__init__(handle, idx, name) # ----------------------------------------------------------------------- -# Increamental MultiHeadAttention +# Incremental 
MultiHeadAttention # ----------------------------------------------------------------------- class IncMultiHeadAttention(Op): def __init__(self, handle, idx=None, name=None): super(IncMultiHeadAttention, self).__init__(handle, idx, name) +# ----------------------------------------------------------------------- +# RMS Norm +# ----------------------------------------------------------------------- +class RMSNorm(Op): + def __init__(self, handle, idx=None, name=None): + super(RMSNorm, self).__init__(handle, idx, name) + +# ----------------------------------------------------------------------- +# ArgTopK +# ----------------------------------------------------------------------- +class ArgTopK(Op): + def __init__(self, handle, idx=None, name=None): + super(ArgTopK, self).__init__(handle, idx, name) + +# ----------------------------------------------------------------------- +# BeamTopK +# ----------------------------------------------------------------------- +class BeamTopK(Op): + def __init__(self, handle, idx=None, name=None): + super(BeamTopK, self).__init__(handle, idx, name) + +# ----------------------------------------------------------------------- +# Sampling +# ----------------------------------------------------------------------- +class Sampling(Op): + def __init__(self, handle, idx=None, name=None): + super(Sampling, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # flexflow_op_t handle to Op # ----------------------------------------------------------------------- @@ -516,7 +544,15 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): elif op_type == OpType.MULTIHEAD_ATTENTION: return MultiHeadAttention(handle, idx, name) elif op_type == OpType.INC_MULTIHEAD_ATTENTION: - return MultiHeadAttention(handle, idx, name) + return IncMultiHeadAttention(handle, idx, name) + elif op_type == OpType.RMS_NORM: + return RMSNorm(handle, idx, name) + elif op_type == OpType.ARG_TOPK: + return ArgTopK(handle, idx, name) + elif op_type == OpType.BEAM_TOPK: + return BeamTopK(handle, idx, name) + elif op_type == OpType.SAMPLING: + return Sampling(handle, idx, name) elif op_type == OpType.RSQRT: return Rsqrt(handle, idx, name) elif op_type == OpType.POW: @@ -1299,7 +1335,7 @@ def conv2d(self, input, out_channels, return Tensor(handle, owner_op_type=OpType.CONV2D) def embedding(self, input, num_embeddings, embedding_dim, - aggr, shared_op=None, kernel_initializer=None, name=None): + aggr, dtype=DataType.DT_FLOAT, shared_op=None, kernel_initializer=None, name=None): """Layer that turns positive integers into dense vectors of fixed size :param input: the input Tensor. @@ -1313,6 +1349,9 @@ def embedding(self, input, num_embeddings, embedding_dim, :param aggr: aggregation mode. Options are AGGR_MODE_NONE, AGGR_MODE_SUM and AGGR_MODE_AVG. :type aggr: AggrMode + + :param dtype: the tensor data type. Options are DT_BOOLEAN, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT4, DT_INT8, DT_NONE + :type dtype: DataType :param shared_op: the layer whose parameters are shared with. Default is None. 
:type shared_op: Op @@ -1328,6 +1367,7 @@ def embedding(self, input, num_embeddings, embedding_dim, c_name = get_c_name(name) shared_op_handle = self.__get_op_handle(shared_op) c_aggr = enum_to_int(AggrMode, aggr) + c_dtype = enum_to_int(DataType, dtype) if kernel_initializer is None: kernel_initializer = GlorotUniformInitializer(42) assert (type(kernel_initializer) is GlorotUniformInitializer) or \ @@ -1336,7 +1376,7 @@ def embedding(self, input, num_embeddings, embedding_dim, (type(kernel_initializer) is NormInitializer), \ f"Unknown initializer type: {kernel_initializer}" handle = ffc.flexflow_model_add_embedding( - self.handle, input.handle, num_embeddings, embedding_dim, c_aggr, + self.handle, input.handle, num_embeddings, embedding_dim, c_aggr, c_dtype, shared_op_handle, kernel_initializer.handle, c_name, ) # NOTE: We must keep a reference to the initializer or else it will be @@ -1977,17 +2017,18 @@ def multihead_attention(self, query, key, value, handle = ffc.flexflow_model_add_multihead_attention(self.handle, query.handle, key.handle, value.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, c_name) self.add_layer(OpType.MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) + def inc_multihead_attention(self, input, embed_dim, num_heads, kdim=0, vdim=0, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, - kernel_initializer=None, name=None): + kernel_initializer=None, apply_rotary_embedding=False, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, and returns the dot-product attention between them:. :param input: the input Tensor. - :type query: Tensor + :type input: Tensor :param embed_dim: total dimension of the model :type embed_dim: int @@ -2015,6 +2056,9 @@ def inc_multihead_attention(self, input, :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool :param name: the name of the layer. Default is None. :type name: string @@ -2023,9 +2067,201 @@ def inc_multihead_attention(self, input, """ c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc.flexflow_model_add_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, c_name) + handle = ffc.flexflow_model_add_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, apply_rotary_embedding, c_name) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multihead_attention(self, input, + embed_dim, num_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + kernel_initializer=None, apply_rotary_embedding=False, name=None): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, + and returns the dot-product attention between them:. + + :param input: the input Tensor. 
+ :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + handle = ffc.flexflow_model_add_spec_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, apply_rotary_embedding, c_name) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multihead_self_attention_verify(self, input, + embed_dim, num_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + kernel_initializer=None, apply_rotary_embedding=False, name=None): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, + and returns the dot-product attention between them:. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
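All three incremental attention variants (inc_multihead_attention, spec_inc_multihead_attention, and inc_multihead_self_attention_verify) now expose apply_rotary_embedding. For intuition: rotary position embeddings rotate each consecutive (even, odd) pair of query/key features by a position-dependent angle. The NumPy sketch below shows only the math; it is not the FlexFlow kernel, which applies the rotation inside the attention op on device tensors.

```python
import numpy as np

def apply_rope(x, positions, theta=10000.0):
    # x: (seq_len, head_dim) with head_dim even; rotates consecutive feature pairs.
    seq_len, head_dim = x.shape
    half = head_dim // 2
    freqs = theta ** (-np.arange(half) * 2.0 / head_dim)   # (half,)
    angles = positions[:, None] * freqs[None, :]            # (seq_len, half)
    cos, sin = np.cos(angles), np.sin(angles)
    x1, x2 = x[:, 0::2], x[:, 1::2]
    out = np.empty_like(x)
    out[:, 0::2] = x1 * cos - x2 * sin
    out[:, 1::2] = x1 * sin + x2 * cos
    return out
```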
+ """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + handle = ffc.flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, apply_rotary_embedding, c_name) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + + def rms_norm(self, input, eps, dim, name=None): + """Defines the RMS Norm layer. + + :param input: the input Tensor. + :type input: Tensor + + :param eps: a value added to the denominator for numerical stability + :type eps: float + + :param dim: The dimension with respect to which to take the norm + :type dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc.flexflow_model_add_rms_norm(self.handle, input.handle, eps, dim, c_name) + self.add_layer(OpType.RMS_NORM, name) + return Tensor(handle, owner_op_type=OpType.RMS_NORM) + + def arg_top_k(self, input, k, sorted, name=None): + """Defines the Arg TopK layer. + + :param input: the input Tensor. + :type input: Tensor + + :param k: the top k indices to select + :type k: int + + :param sorted: Whether the entries should be sorted + :type sorted: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc.flexflow_model_add_arg_top_k(self.handle, input.handle, k, sorted, c_name) + self.add_layer(OpType.ARG_TOPK, name) + return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + + def beam_top_k(self, input, max_beam_size, sorted, name=None): + """Defines the Beam TopK layer. + + :param input: the input Tensor. + :type input: Tensor + + :param max_beam_size: the top max_beam_size indices to select + :type max_beam_size: int + + :param sorted: Whether the entries should be sorted + :type sorted: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc.flexflow_model_add_beam_top_k(self.handle, input.handle, max_beam_size, sorted, c_name) + self.add_layer(OpType.BEAM_TOPK, name) + return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + + def sampling(self, input, top_p, name=None): + """Defines the Sampling layer. + + :param input: the input Tensor. + :type input: Tensor + + :param top_p: The top_p parameter of the sampling + :type top_p: float + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc.flexflow_model_add_sampling(self.handle, input.handle, top_p, c_name) + self.add_layer(OpType.SAMPLING, name) + return Tensor(handle, owner_op_type=OpType.SAMPLING) + def reset_metrics(self): """Reset performance metrics. @@ -2249,6 +2485,9 @@ def label_tensor(self): def get_perf_metrics(self): handle = ffc.flexflow_model_get_perf_metrics(self.handle) return PerfMetrics(handle) + + def set_transformer_layer_id(self, id): + ffc.flexflow_model_set_transformer_layer_id(self.handle, id) def create_data_loader(self, batch_tensor, full_array): """Create a SingleDataloader instance. 
@@ -2566,3 +2805,70 @@ def __init__(self, shape, data_type, base_ptr, strides, read_only): 'data': (base_ptr, read_only), 'strides': strides, } + +# ----------------------------------------------------------------------- +# BatchConfig +# ----------------------------------------------------------------------- + +class BatchConfig(object): + __slots__ = ['handle', '_handle'] + def __init__(self): + self.handle = ffc.flexflow_batch_config_create() + self._handle = ffi.gc(self.handle, ffc.flexflow_batch_config_destroy) + +# ----------------------------------------------------------------------- +# TreeVerifyBatchConfig +# ----------------------------------------------------------------------- + +class TreeVerifyBatchConfig(object): + __slots__ = ['handle', '_handle'] + def __init__(self): + self.handle = ffc.flexflow_tree_verify_batch_config_create() + self._handle = ffi.gc(self.handle, ffc.flexflow_tree_verify_batch_config_destroy) + +# ----------------------------------------------------------------------- +# BeamSearchBatchConfig +# ----------------------------------------------------------------------- + +class BatchConfig(object): + __slots__ = ['handle', '_handle'] + def __init__(self): + self.handle = ffc.flexflow_beam_search_batch_config_create() + self._handle = ffi.gc(self.handle, ffc.flexflow_beam_search_batch_config_destroy) + +# ----------------------------------------------------------------------- +# RequestManager +# ----------------------------------------------------------------------- + +class RequestManager(object): + __slots__ = ['handle', '_handle'] + def __init__(self): + self.handle = ffc.flexflow_request_manager_create() + self._handle = ffi.gc(self.handle, ffc.flexflow_request_manager_destroy) + + def flexflow_request_manager_register_new_request(self, prompt, max_sequence_length): + return ffc.flexflow_request_manager_register_new_request(self.handle, prompt, max_sequence_length) + +# ----------------------------------------------------------------------- +# InferenceManager +# ----------------------------------------------------------------------- + +class InferenceManager(object): + __slots__ = ['handle', '_handle', 'max_num_tokens_per_batch'] + def __init__(self, ffconfig, max_num_tokens_per_batch): + self.max_num_tokens_per_batch = max_num_tokens_per_batch + self.handle = ffc.flexflow_inference_manager_create(ffconfig.handle, max_num_tokens_per_batch) + self._handle = ffi.gc(self.handle, ffc.flexflow_inference_manager_destroy) + + def compile_model_and_allocate_buffer(self, model): + ffc.flexflow_inference_manager_compile_model_and_allocate_buffer(self.handle, model.handle) + + def init_operators_inference(self, model): + ffc.flexflow_inference_manager_init_operators_inference(self.handle, model.handle) + + def incr_decoding_loop(self, model, request_manager, total_num_requests): + ffc.flexflow_inference_manager_incr_decoding_loop(self.handle, model.handle, request_manager.handle, total_num_requests) + + def spec_inference_loop(self, model, request_manager, total_num_requests, ssm_model_ids): + c_ssm_model_ids = ffi.new("int[]", ssm_model_ids) + ffc.flexflow_inference_manager_spec_inference_loop(self.handle, model.handle, request_manager.handle, total_num_requests, len(ssm_model_ids), c_ssm_model_ids) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py new file mode 100644 index 0000000000..e832bc7de4 --- /dev/null +++ b/python/flexflow/serve/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and 
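Two small points on the wrappers above: the beam-search wrapper is declared with the name BatchConfig, which shadows the BatchConfig class defined a few lines earlier (a BeamSearchBatchConfig name would avoid the clash), and RequestManager keeps the full C symbol name flexflow_request_manager_register_new_request as its method name rather than a shorter register_new_request. A hypothetical driver using these wrappers as committed might look like the sketch below; the prompts, sizes, and request count are placeholders, and the ffmodel object is assumed to come from one of the serve model builders introduced later in this patch.

```python
# Hypothetical driver sketch; ffmodel is assumed to have been built elsewhere
# (see the FlexFlowLLAMA builder below), and all values are placeholders.
ffconfig = FFConfig()
im = InferenceManager(ffconfig, max_num_tokens_per_batch=64)
rm = RequestManager()

im.compile_model_and_allocate_buffer(ffmodel)
im.init_operators_inference(ffmodel)

prompts = ["Hello, world!", "FlexFlow serves LLMs"]
for p in prompts:
    # method name follows the wrapper exactly as committed in this patch
    rm.flexflow_request_manager_register_new_request(p, max_sequence_length=128)

im.incr_decoding_loop(ffmodel, rm, total_num_requests=len(prompts))
```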
Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .serve import LLM, SamplingConfig diff --git a/python/flexflow/serve/models/__init__.py b/python/flexflow/serve/models/__init__.py new file mode 100644 index 0000000000..3b4087203b --- /dev/null +++ b/python/flexflow/serve/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .llama import FlexFlowLLAMA +from .opt import FlexFlowOPT +from .falcon import FlexFlowFalcon diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py new file mode 100644 index 0000000000..6fb4a13320 --- /dev/null +++ b/python/flexflow/serve/models/falcon.py @@ -0,0 +1,19 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from flexflow.core import * + +class FlexFlowFalcon: + def __init__(self, max_batch_size=1, max_seq_length=256, max_tokens_per_batch=64): + pass diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py new file mode 100644 index 0000000000..9d5953c773 --- /dev/null +++ b/python/flexflow/serve/models/llama.py @@ -0,0 +1,130 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
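flexflow/serve/__init__.py establishes the intended user-facing entry point: from flexflow.serve import LLM, SamplingConfig. In this patch both LLM.compile and LLM.generate still end in assert False, so the flow below is only the intended call pattern, not working behavior; the HuggingFace model name is a placeholder, and InferenceMode is imported the same way serve.py itself assumes it is exported.

```python
# Intended usage sketch only; compile()/generate() are still stubs in this patch.
from flexflow.core import InferenceMode
from flexflow.serve import LLM, SamplingConfig

llm = LLM("decapoda-research/llama-7b-hf", data_type="half")   # placeholder model name
cfg = SamplingConfig(do_sample=True, temperature=0.9, topp=0.8)
llm.compile(mode=InferenceMode.INC_DECODING_MODE, sampling_config=cfg,
            tensor_parallel_degree=4, pipeline_parallel_degree=2)
result = llm.generate("What is speculative inference?", sampling=cfg)
```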
+ +from flexflow.core import * +import random + +class LLAMAConfig: + def __init__(self): + self.n_layers = 32 + self.vocab_size = 3200 + self.n_heads = 32 + self.dim = 4096 + self.multiple_of = 256 + self.norm_eps = 1e-6 + self.total_requests = 2560 + self.incremental_mode = True + self.hidden_dim = 11008 + self.max_seq_len = 256 + self.max_num_tokens = 64 + self.max_beam_width = 1 + self.max_beam_depth = 8 + +class FlexFlowLLAMA: + def __init__(self, mode, sampling_config, ffconfig, max_batch_size=1, max_seq_length=256, max_tokens_per_batch=64, use_full_precision=False): + self.mode = mode + self.sampling_config = sampling_config + self.ffconfig = ffconfig + self.max_batch_size = max_batch_size + self.use_full_precision = use_full_precision + self.llama_config = LLAMAConfig() + self.llama_config.max_seq_length = max_seq_length + self.llama_config.max_num_tokens = max_tokens_per_batch + + self.build_model() + + def build_model(self): + ffmodel = FFModel(self.ffconfig) + + tokens_dims = [self.llama_config.max_num_tokens, 1] + input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) + + embed_init = UniformInitializer(random.randint(0, 2**31-1), 0, 0) + token = ffmodel.embedding(input_tensor, self.llama_config.vocab_size, self.llama_config.dim, AggrMode.AGGR_MODE_NONE, DataType.DT_FLOAT if self.use_full_precision else DataType.DT_HALF, None, embed_init) + + for i in range(self.llama_config.n_layers): + ffmodel.set_transformer_layer_id(i) + + attn_norm = ffmodel.rms_norm(token, self.llama_config.norm_eps, self.llama_config.dim) + + if self.mode == InferenceMode.BEAM_SEARCH_MODE: + mha = ffmodel.spec_inc_multihead_attention( + attn_norm, + self.llama_config.dim, + self.llama_config.n_heads, + self.llama_config.dim // self.llama_config.n_heads, + self.llama_config.dim // self.llama_config.n_heads, + 0.0, # dropout + False, # bias + False, # add_bias_kv + False, # add_zero_attn + None, # kernel initializer + True # apply_rotary_embedding + ) + elif self.mode == InferenceMode.TREE_VERIFY_MODE: + mha = ffmodel.inc_multihead_self_attention_verify( + attn_norm, + self.llama_config.dim, + self.llama_config.n_heads, + self.llama_config.dim // self.llama_config.n_heads, + self.llama_config.dim // self.llama_config.n_heads, + 0.0, # dropout + False, # bias + False, # add_bias_kv + False, # add_zero_attn + None, # kernel initializer + True # apply_rotary_embedding + ) + elif self.mode == InferenceMode.INC_DECODING_MODE: + mha = ffmodel.inc_multihead_attention( + attn_norm, + self.llama_config.dim, + self.llama_config.n_heads, + self.llama_config.dim // self.llama_config.n_heads, + self.llama_config.dim // self.llama_config.n_heads, + 0.0, # dropout + False, # bias + False, # add_bias_kv + False, # add_zero_attn + None, # kernel initializer + True # apply_rotary_embedding + ) + else: + assert(False) + + token = ffmodel.add(token, mha) + ff_norm = ffmodel.rms_norm(token, self.llama_config.norm_eps, self.llama_config.dim) + w1 = ffmodel.dense(ff_norm, self.llama_config.hidden_dim, ActiMode.AC_MODE_NONE, False) + w3 = ffmodel.dense(ff_norm, self.llama_config.hidden_dim, ActiMode.AC_MODE_NONE, False) + sigmoid = ffmodel.sigmoid(w1) + silu = ffmodel.multiply(w1, sigmoid) + multi = ffmodel.multiply(silu, w3) + w2 = ffmodel.dense(multi, self.llama_config.dim, ActiMode.AC_MODE_NONE, False) + token = ffmodel.add(token, w2) + + token = ffmodel.rms_norm(token, self.llama_config.norm_eps, self.llama_config.dim) + dense = ffmodel.dense(token, self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False) + + 
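The w1/sigmoid/multiply/w3/w2 sequence above is the LLaMA SwiGLU feed-forward written out with elementary ops: since no fused SiLU is used here, SiLU(x) is built as x * sigmoid(x). A NumPy reference of the same computation, for intuition only:

```python
import numpy as np

def swiglu_ffn_reference(x, W1, W3, W2):
    # Matches the graph built above: SiLU(x @ W1) * (x @ W3), projected by W2.
    a = x @ W1
    silu = a / (1.0 + np.exp(-a))        # SiLU(a) = a * sigmoid(a)
    return (silu * (x @ W3)) @ W2
```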
if self.mode == InferenceMode.BEAM_SEARCH_MODE: + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.beam_top_k(softmax, self.llama_config.max_beam_width, False) + else: + if self.sampling_config.do_sample: + dense = ffmodel.scalar_true_divide(dense, self.sampling_config.temperature, False) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.sampling(softmax, self.sampling_config.topp) + else: + output = ffmodel.arg_top_k(dense, 1, False) + + diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py new file mode 100644 index 0000000000..50ac3611d9 --- /dev/null +++ b/python/flexflow/serve/models/opt.py @@ -0,0 +1,19 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from flexflow.core import * + +class FlexFlowOPT: + def __init__(self, max_batch_size=1, max_seq_length=256, max_tokens_per_batch=64): + pass diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py new file mode 100644 index 0000000000..2c33cff58d --- /dev/null +++ b/python/flexflow/serve/serve.py @@ -0,0 +1,85 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
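In the non-beam branch above, do_sample routes logits through scalar_true_divide (temperature scaling), softmax, and the new sampling op (nucleus/top-p selection); otherwise arg_top_k with k=1 gives greedy decoding. A NumPy reference for the sampling path, for intuition only; the actual op runs on the GPU and its ordering and tie-breaking details may differ.

```python
import numpy as np

def sample_top_p_reference(logits, temperature=0.8, top_p=0.6, rng=None):
    rng = rng or np.random.default_rng()
    z = logits / temperature
    probs = np.exp(z - np.max(z))
    probs /= probs.sum()
    order = np.argsort(-probs)                       # descending probability
    cum = np.cumsum(probs[order])
    keep = order[: np.searchsorted(cum, top_p) + 1]  # smallest set with mass >= top_p
    kept = probs[keep] / probs[keep].sum()
    return int(rng.choice(keep, p=kept))
```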
+ +from flexflow.serve.models import FlexFlowLLAMA, FlexFlowOPT, FlexFlowFalcon +from flexflow.core import * +from transformers import AutoConfig +import sys + +class SamplingConfig: + def __init__(self, do_sample = False, temperature=0.9, topp=0.8, topk=1): + self.do_sample = False + self.temperature = 0.8 + self.topp = 0.6 + self.topk = 1 + +class LLM: + def __init__(self, model_name, data_type="half"): + self.model_name = model_name + self.supported_models = { + "LlamaForCausalLM": FlexFlowLLAMA, + "LLaMAForCausalLM": FlexFlowLLAMA, + "OPTForCausalLM": FlexFlowOPT, + "RWForCausalLM": FlexFlowFalcon # falcon + } + self.model_type = self.__get_ff_model_type(model_name) + self.data_type = data_type + self.ffconfig = FFConfig() + + def __get_ff_model_type(self, model_name): + hf_config = AutoConfig.from_pretrained(model_name) + architectures = getattr(hf_config, "architectures", []) + ff_arch = None + if next(iter(architectures), None) is not None: + ff_arch = self.supported_models.get(architectures[0]) + if ff_arch is None: + print("Huggingface model of type {architectures} is not yet supported by FlexFlow") + sys.exit(1) + return ff_arch + + def compile( + self, + mode = InferenceMode.INC_DECODING_MODE, + sampling_config = SamplingConfig(), + use_full_precision = False, + max_batch_size=1, + max_seq_length=256, + max_tokens_per_batch=64, + tensor_parallel_degree=4, + pipeline_parallel_degree=2, + ssms=[], + ): + self.max_batch_size = max_batch_size + self.max_seq_length = max_seq_length + self.max_tokens_per_batch = max_tokens_per_batch + self.tensor_parallel_degree = tensor_parallel_degree + self.pipeline_parallel_degree = pipeline_parallel_degree + self.ssms = ssms + self.sampling_config = SamplingConfig() + assert((mode == InferenceMode.INC_DECODING_MODE or mode == InferenceMode.BEAM_SEARCH_MODE) == (len(ssms) == 0)) + + # Create model + self.model = self.model_type(mode, sampling_config, self.ffconfig, max_batch_size, max_seq_length, max_tokens_per_batch, use_full_precision) + + # Create inference manager + self.im = InferenceManager(self.ffconfig, max_tokens_per_batch) + + # Create request manager + self.rm = RequestManager() + + assert False and "Not implemented yet" + + def generate(self, prompt, sampling=None): + self.sampling = sampling if sampling is not None else self.default_config + assert False and "Not implemented yet" diff --git a/python/flexflow/type.py b/python/flexflow/type.py index cf2706f746..dd1d40baf0 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -56,6 +56,11 @@ class MetricsType(Enum): METRICS_ROOT_MEAN_SQUARED_ERROR = 1016 METRICS_MEAN_ABSOLUTE_ERROR=1032 +class InferenceMode(Enum): + INC_DECODING_MODE = 2001 + BEAM_SEARCH_MODE = 2002 + TREE_VERIFY_MODE = 2003 + class OpType(Enum): CONV2D = 2011 EMBEDDING = 2012 @@ -93,6 +98,10 @@ class OpType(Enum): MIN = 2054 MULTIHEAD_ATTENTION = 2060 INC_MULTIHEAD_ATTENTION = 2061 + SPEC_INC_MULTIHEAD_SELF_ATTENTION = 2062 + TREE_INC_MULTIHEAD_SELF_ATTENTION = 2063 + INC_MULTIQUERY_SELF_ATTENTION = 2064 + SAMPLING = 2065 GETITEM = 2070 GETATTR = 2080 EXPAND = 2081 @@ -115,6 +124,10 @@ class OpType(Enum): VIEW = 2105 GATHER = 2106 ATTRIBUTE = 2200 + RMS_NORM = 2300 + ARG_TOPK = 2301 + BEAM_TOPK = 2302 + def enum_to_int(enum, enum_item): for item in enum: if (enum_item == item): diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 09258d8206..d7f1b70232 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -15,6 +15,7 @@ #include "flexflow/flexflow_c.h" #include "flexflow/dataloader.h" 
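Two things in serve.py above deserve a closer look: SamplingConfig.__init__ accepts do_sample, temperature, topp, and topk but then assigns hard-coded constants, and LLM.compile stores a fresh SamplingConfig() instead of the sampling_config argument (the unsupported-architecture message is also missing its f-string prefix). A corrected constructor sketch, not part of the committed patch:

```python
class SamplingConfig:
    def __init__(self, do_sample=False, temperature=0.9, topp=0.8, topk=1):
        # Keep the caller's values rather than overwriting them with constants.
        self.do_sample = do_sample
        self.temperature = temperature
        self.topp = topp
        self.topk = topk
```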
+#include "flexflow/inference.h" #include "flexflow/mapper.h" using namespace Legion; @@ -55,6 +56,14 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_net_config_t, NetConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_dlrm_config_t, DLRMConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_single_dataloader_t, SingleDataLoader *); + // inference + FF_NEW_OPAQUE_WRAPPER(flexflow_batch_config_t, BatchConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, + TreeVerifyBatchConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_beam_search_batch_config_t, + BeamSearchBatchConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_inference_manager_t, InferenceManager *); + FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); }; Logger ffc_log("flexflow_c"); @@ -456,9 +465,10 @@ flexflow_tensor_t flexflow_tensor_t flexflow_model_add_embedding(flexflow_model_t handle_, const flexflow_tensor_t input_, - int num_entires, + int num_entries, int out_dim, enum AggrMode aggr, + DataType dtype, flexflow_op_t shared_op_, flexflow_initializer_t kernel_initializer_, char const *name) { @@ -470,20 +480,21 @@ flexflow_tensor_t // TODO: update the flexflow_c and Python API to support other data types // Currently we assume it's float Tensor tensor = handle->embedding(input, - num_entires, + num_entries, out_dim, aggr, - DT_FLOAT, + dtype, shared_op, kernel_initializer, name); - DEBUG_PRINT("[Embedding] new Tensor %p, input %p, num_entires %d, out_dim " - "%d, aggr %d, shared_op %p, kernel_init %p, name %s", + DEBUG_PRINT("[Embedding] new Tensor %p, input %p, num_entries %d, out_dim " + "%d, aggr %d, dtype %d, shared_op %p, kernel_init %p, name %s", tensor, input, - num_entires, + num_entries, out_dim, aggr, + dtype, shared_op, kernel_initializer, name); @@ -1014,6 +1025,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( bool add_bias_kv, bool add_zero_attn, flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1030,10 +1042,124 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( add_zero_attn, input->data_type, kernel_initializer, + apply_rotary_embedding, name); return FFCObjectWrapper::wrap(tensor); } +flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Initializer *kernel_initializer = + FFCObjectWrapper::unwrap(kernel_initializer_); + Tensor tensor = + handle->spec_inc_multihead_self_attention(input, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + input->data_type, + kernel_initializer, + apply_rotary_embedding, + name); + return FFCObjectWrapper::wrap(tensor); +} + +flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + 
Tensor input = FFCObjectWrapper::unwrap(input_); + Initializer *kernel_initializer = + FFCObjectWrapper::unwrap(kernel_initializer_); + Tensor tensor = + handle->inc_multihead_self_attention_verify(input, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + input->data_type, + kernel_initializer, + apply_rotary_embedding, + name); + return FFCObjectWrapper::wrap(tensor); +} + +flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, + const flexflow_tensor_t input_, + float eps, + int dim, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor tensor = handle->rms_norm(input, eps, dim, input->data_type, name); + return FFCObjectWrapper::wrap(tensor); +} + +flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, + const flexflow_tensor_t input_, + int k, + bool sorted, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor tensor = handle->arg_top_k(input, k, sorted, name); + return FFCObjectWrapper::wrap(tensor); +} + +flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, + const flexflow_tensor_t input_, + int max_beam_size, + bool sorted, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor tensor = handle->beam_top_k(input, max_beam_size, sorted, name); + return FFCObjectWrapper::wrap(tensor); +} + +flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, + const flexflow_tensor_t input_, + float top_p, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor tensor = handle->sampling(input, top_p, name); + return FFCObjectWrapper::wrap(tensor); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1082,6 +1208,11 @@ flexflow_perf_metrics_t return FFCObjectWrapper::wrap(perf_metrics); } +void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_transformer_layer_id(id); +} + // ----------------------------------------------------------------------- // Tensor // ----------------------------------------------------------------------- @@ -1961,3 +2092,150 @@ void flexflow_perform_registration(void) { Runtime::perform_registration_callback(FFMapper::update_mappers, true /*global*/); } + +// ----------------------------------------------------------------------- +// BatchConfig +// ----------------------------------------------------------------------- + +flexflow_batch_config_t flexflow_batch_config_create(void) { + BatchConfig *config = new BatchConfig(); + DEBUG_PRINT("[BatchConfig] new %p", config); + return FFCObjectWrapper::wrap(config); +} + +void flexflow_batch_config_destroy(flexflow_batch_config_t handle_) { + BatchConfig *handle = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[BatchConfig] delete %p", handle); + delete handle; +} + +// ----------------------------------------------------------------------- +// TreeVerifyBatchConfig +// ----------------------------------------------------------------------- + +flexflow_tree_verify_batch_config_t + flexflow_tree_verify_batch_config_create(void) { + TreeVerifyBatchConfig *config = new 
TreeVerifyBatchConfig(); + DEBUG_PRINT("[TreeVerifyBatchConfig] new %p", config); + return FFCObjectWrapper::wrap(config); +} + +void flexflow_tree_verify_batch_config_destroy( + flexflow_tree_verify_batch_config_t handle_) { + TreeVerifyBatchConfig *handle = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[TreeVerifyBatchConfig] delete %p", handle); + delete handle; +} + +// ----------------------------------------------------------------------- +// BeamSearchBatchConfig +// ----------------------------------------------------------------------- + +flexflow_beam_search_batch_config_t + flexflow_beam_search_batch_config_create(void) { + BeamSearchBatchConfig *config = new BeamSearchBatchConfig(); + DEBUG_PRINT("[BeamSearchBatchConfig] new %p", config); + return FFCObjectWrapper::wrap(config); +} + +void flexflow_beam_search_batch_config_destroy( + flexflow_beam_search_batch_config_t handle_) { + BeamSearchBatchConfig *handle = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[BeamSearchBatchConfig] delete %p", handle); + delete handle; +} + +// ----------------------------------------------------------------------- +// RequestManager +// ----------------------------------------------------------------------- + +flexflow_request_manager_t flexflow_request_manager_create(void) { + RequestManager *rm = new RequestManager(); + DEBUG_PRINT("[RequestManager] new %p", rm); + return FFCObjectWrapper::wrap(rm); +} + +void flexflow_request_manager_destroy(flexflow_request_manager_t handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[RequestManager] delete %p", handle); + delete handle; +} + +long unsigned int flexflow_request_manager_register_new_request( + flexflow_request_manager_t handle_, + char const *prompt, + int max_sequence_length) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + assert(prompt != nullptr && "Cannot convert nullptr char * to std::string"); + std::string const prompt_str(prompt); + DEBUG_PRINT("[RequestManager] register_new_request %p %s", handle, prompt); + return handle->register_new_request(prompt_str, max_sequence_length); +} + +// ----------------------------------------------------------------------- +// InferenceManager +// ----------------------------------------------------------------------- + +flexflow_inference_manager_t + flexflow_inference_manager_create(flexflow_config_t config_handle, + int max_num_tokens_per_batch) { + FFConfig *config = FFCObjectWrapper::unwrap(config_handle); + InferenceManager *im = + new InferenceManager(*config, max_num_tokens_per_batch); + DEBUG_PRINT("[InferenceManager] new %p", im); + return FFCObjectWrapper::wrap(im); +} + +void flexflow_inference_manager_destroy(flexflow_inference_manager_t handle_) { + InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[InferenceManager] delete %p", handle); + delete handle; +} + +void flexflow_inference_manager_compile_model_and_allocate_buffer( + flexflow_inference_manager_t handle_, flexflow_model_t model_handle_) { + InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + DEBUG_PRINT("[InferenceManager] compile_model_and_allocate_buffer %p", + handle); + handle->compile_model_and_allocate_buffer(model_handle); +} + +void flexflow_inference_manager_init_operators_inference( + flexflow_inference_manager_t handle_, flexflow_model_t model_handle_) { + InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model_handle = 
FFCObjectWrapper::unwrap(model_handle_); + DEBUG_PRINT("[InferenceManager] init_operators_inference %p", handle); + handle->init_operators_inference(model_handle); +} + +void flexflow_inference_manager_incr_decoding_loop( + flexflow_inference_manager_t handle_, + flexflow_model_t model_handle_, + flexflow_request_manager_t rm_handle_, + int total_num_requests) { + InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + RequestManager *rm_handle = FFCObjectWrapper::unwrap(rm_handle_); + DEBUG_PRINT("[InferenceManager] incr_decoding_loop %p", handle); + handle->incr_decoding_loop(model_handle, *rm_handle, total_num_requests); +} + +void flexflow_inference_manager_spec_inference_loop( + flexflow_inference_manager_t handle_, + flexflow_model_t model_handle_, + flexflow_request_manager_t rm_handle_, + int total_num_requests, + int num_ssms, + int *ssm_model_ids) { + InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + RequestManager *rm_handle = FFCObjectWrapper::unwrap(rm_handle_); + std::vector ssm_model_ids_vec; + for (int i = 0; i < num_ssms; i++) { + ssm_model_ids_vec.push_back(ssm_model_ids[i]); + } + DEBUG_PRINT("[InferenceManager] spec_inference_loop %p", handle); + handle->spec_inference_loop( + model_handle, *rm_handle, total_num_requests, ssm_model_ids_vec); +} diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 7562a727d7..1535cfcd77 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -45,8 +45,11 @@ Tensor FFModel::binary(OperatorType op, assert(broadcastable(in1, in2)); if (in1->data_type < in2->data_type) { dtype = in2->data_type; - std::string str(name); - Tensor new_in1 = cast(in1, dtype, (str + "input1_pre_cast").c_str()); + std::string str; + if (name != nullptr) { + str = std::string(name) + "input1_pre_cast"; + } + Tensor new_in1 = cast(in1, dtype, str.c_str()); ele = new Layer(this, op, dtype, @@ -58,8 +61,11 @@ Tensor FFModel::binary(OperatorType op, in2); } else if (in1->data_type > in2->data_type) { dtype = in1->data_type; - std::string str(name); - Tensor new_in2 = cast(in2, dtype, (str + "input2_pre_cast").c_str()); + std::string str; + if (name != nullptr) { + str = std::string(name) + "input2_pre_cast"; + } + Tensor new_in2 = cast(in2, dtype, str.c_str()); ele = new Layer(this, op, dtype, diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index b6be945a94..adb8d9d706 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -347,6 +347,90 @@ FutureMap InferenceManager::inference(FFModel *model, return fm; }; +void InferenceManager::incr_decoding_loop(FFModel *model, + RequestManager &rm, + int total_num_requests) { + BatchConfig bc; + InferenceResult ir; + while (rm.get_num_processed_requests() < total_num_requests) { + bc = rm.prepare_next_batch(bc, ir); + if (rm.get_num_processed_requests() >= total_num_requests) { + break; + } + FutureMap fm = inference(model, 0, bc); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + ir = future.get_result(); + // assert(false); + } +} + +void InferenceManager::spec_inference_loop(FFModel *model, + RequestManager &rm, + int total_num_requests, + std::vector ssm_model_ids) { + TreeVerifyBatchConfig tree_bc; + BeamSearchBatchConfig beam_bc; + std::vector beam_bc_vec; + int num_ssms = ssm_model_ids.size(); + for (int ssm_id = 0; ssm_id < 
num_ssms; ssm_id++) { + beam_bc_vec.push_back(BeamSearchBatchConfig(ssm_model_ids[ssm_id])); + } + + InferenceResult tree_ir; + + while (rm.get_num_processed_requests() < total_num_requests) { + int depth = 0; + // Beam Search + beam_bc = rm.prepare_next_batch_init(tree_bc, tree_ir, 0); + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + beam_bc_vec[ssm_id] = beam_bc; + beam_bc_vec[ssm_id].model_id = ssm_id; + } + + if (rm.get_num_processed_requests() >= total_num_requests) { + break; + } + + for (int i = 0; i < num_ssms; i++) { + while (true) { + beam_bc = beam_bc_vec[i]; + depth = beam_bc.beamRequestsInfo[0].current_depth; + + FutureMap fm = inference(rm.get_model(0), 0, beam_bc_vec[i]); + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + BeamInferenceResult beam_ir = future.get_result(); + + int iteration = + std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, + BatchConfig::MAX_SEQ_LENGTH - beam_bc.max_init_length); + + if (depth - 1 >= iteration) { + break; + } else { + beam_bc_vec[i] = rm.prepare_next_batch_beam(beam_bc_vec[i], beam_ir); + if (beam_bc_vec[i].num_active_tokens() == 0 && + beam_bc_vec[i].num_active_requests() != 0) { + break; + } + } + } + std::cout << "----------beam search finished for model " + << beam_bc_vec[i].model_id << "------------" << std::endl; + } + // Token Tree Verification + { + tree_bc = rm.prepare_next_batch_verify(beam_bc_vec); + FutureMap fm = inference(model, 0, tree_bc); + + assert(fm.get_future_map_domain().get_volume() == 1); + Future future = fm.get_future(0); + tree_ir = future.get_result(); + } + } +} + void InferenceManager::load_input_tokens_from_batch_config( BatchConfig const &bc, ParallelTensor const input) { Context ctx = ff_config.lg_ctx; From aef158aec228fa91b8b62d7ac0092785f90dabb2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 27 Jul 2023 02:10:54 -0400 Subject: [PATCH 181/344] Fix fusion bug (#889) --- src/ops/fused.cu | 12 ++++++++---- tests/inference_tests.sh | 4 ++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index ef6c856871..02a4995b0f 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -235,13 +235,15 @@ __host__ void FusedOp::forward_task(Task const *task, out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); float const *bias_ptr = nullptr; + LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); - bias_ptr = my_weight_accessor[1].get_float_ptr(); + if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { + bias_ptr = my_weight_accessor[1].get_float_ptr(); + } } else { assert(fused->op_num_weights[op] == 1); } - LinearMeta *m = (LinearMeta *)metas->meta[op]; Kernels::Linear::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), @@ -604,13 +606,15 @@ __host__ void out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); void const *bias_ptr = nullptr; + LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); - bias_ptr = my_weight_accessor[1].ptr; + if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { + bias_ptr = my_weight_accessor[1].ptr; + } } else { assert(fused->op_num_weights[op] == 1); } - LinearMeta *m = (LinearMeta *)metas->meta[op]; assert(m->input_type[0] == my_input_accessor[0].data_type); 
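The add_bias_only_once guard in the fused kernel above appears, judging from the tensor-parallelism tests added in the same patch, to address a double-counting problem: the partial outputs of a row-parallel Linear are summed across shards, so if every shard also adds the bias, the bias is counted degree times. A toy NumPy illustration of that failure mode; the shard split and degree here are made up for the example.

```python
import numpy as np

x = np.array([1.0, 2.0])
W = np.array([[1.0], [1.0]])     # full weight; rows split across two shards
b = np.array([0.5])

buggy = (x[:1] @ W[:1] + b) + (x[1:] @ W[1:] + b)   # bias added on both shards
fixed = (x[:1] @ W[:1] + b) + (x[1:] @ W[1:])       # bias added only on shard 0

print(buggy)   # [4.0]  (bias counted twice)
print(fixed)   # [3.5]  (matches x @ W + b)
```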
assert(m->input_type[0] == my_output_accessor[0].data_type); batch_size = bc->num_active_tokens(); diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index f50d374633..8616bb845e 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -87,8 +87,10 @@ fi if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -97,8 +99,10 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # OPT (small model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 
-tensor-parallelism-degree 4 # OPT (small model, half precision) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (big model) ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 From 6b7e6f0ca158bb33685e6ed2fd77b9e867c2ab53 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Thu, 27 Jul 2023 14:36:09 -0400 Subject: [PATCH 182/344] Inference: add argmax operator (#888) * add argmax operator * support spec infer. * format * remove redundant * half precision * fix * fix * hip_rocm --- include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 7 + include/flexflow/operator_params.h | 2 + include/flexflow/ops/argmax.h | 109 +++++++ include/flexflow/ops/argmax_params.h | 24 ++ inference/models/llama.cc | 6 +- inference/models/opt.cc | 6 +- src/ops/argmax.cc | 442 +++++++++++++++++++++++++++ src/ops/argmax.cpp | 69 +++++ src/ops/argmax.cu | 151 +++++++++ src/runtime/cuda_helper.cu | 1 + src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 5 + src/runtime/model.cc | 59 +++- src/runtime/operator_params.cc | 3 + 15 files changed, 882 insertions(+), 5 deletions(-) create mode 100644 include/flexflow/ops/argmax.h create mode 100644 include/flexflow/ops/argmax_params.h create mode 100644 src/ops/argmax.cc create mode 100644 src/ops/argmax.cpp create mode 100644 src/ops/argmax.cu diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 65fa23569b..7521613477 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -163,6 +163,7 @@ enum OperatorType { OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html OP_RMS_NORM, OP_BEAM_TOPK, + OP_ARGMAX, OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index a95c229a08..0e98b6e8ad 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -138,6 +138,9 @@ enum TaskIDs { ARG_TOPK_INF_TASK_ID, SAMPLING_INIT_TASK_ID, SAMPLING_INF_TASK_ID, + ARGMAX_INIT_TASK_ID, + ARGMAX_BEAM_INF_TASK_ID, + ARGMAX_NORM_INF_TASK_ID, TRANSPOSE_INIT_TASK_ID, TRANSPOSE_FWD_TASK_ID, TRANSPOSE_BWD_TASK_ID, @@ -315,6 +318,7 @@ class BeamTopK; class SpecIncMultiHeadSelfAttention; class IncMultiQuerySelfAttention; class Sampling; +class ArgMax; class Combine; class Repartition; class Reduction; @@ -615,6 +619,7 @@ class 
FFModel { int k, bool sorted, char const *name = NULL); + Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL); Tensor sampling(const Tensor input, float top_p, char const *name = NULL); Tensor multihead_attention(const Tensor query, const Tensor key, @@ -1067,6 +1072,8 @@ class FFModel { BeamTopK *>, std::unordered_map, Sampling *>, + std::unordered_map, + ArgMax *>, std::unordered_map< std::pair, SpecIncMultiHeadSelfAttention *>, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5c2101d190..982d5482a0 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -4,6 +4,7 @@ #include "flexflow/ops/aggregate_params.h" #include "flexflow/ops/aggregate_spec_params.h" #include "flexflow/ops/arg_topk_params.h" +#include "flexflow/ops/argmax_params.h" #include "flexflow/ops/attention_params.h" #include "flexflow/ops/batch_matmul_params.h" #include "flexflow/ops/beam_topk_params.h" @@ -73,6 +74,7 @@ using OperatorParameters = mp::variant const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfig const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static BeamInferenceResult + inference_task_beam(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static InferenceResult + inference_task_norm(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void serialize(Legion::Serializer &s) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + template + static void forward_kernel(ArgMaxMeta const *m, + DT *input_ptr, + int *indices_ptr, + DT *prob_ptr, + int *parent_ptr, + int length, + int batch_size, + ffStream_t stream); + static void forward_kernel_wrapper(ArgMaxMeta const *m, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &indices, + GenericTensorAccessorW const &value, + GenericTensorAccessorW const &parent); + Params get_params() const; + +public: + bool beam_search; +}; + +}; // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/include/flexflow/ops/argmax_params.h b/include/flexflow/ops/argmax_params.h new file mode 100644 index 0000000000..a8f629619f --- /dev/null +++ b/include/flexflow/ops/argmax_params.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_ARGMAX_PARAMS_H +#define _FLEXFLOW_ARGMAX_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct ArgMaxParams { + bool beam_search; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(ArgMaxParams const &, ArgMaxParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t 
operator()(FlexFlow::ArgMaxParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_ARGMAX_PARAMS_H \ No newline at end of file diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 588d6d264c..e4cd54192d 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -176,7 +176,8 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor output; if (mode == BEAM_SEARCH_MODE) { Tensor softmax = ff.softmax(dense, -1); - output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + output = ff.argmax(softmax, /*beam_Search*/ true); } else { // Tensor softmax = ff.softmax(dense, -1); if (samplingConfig.do_sample) { @@ -184,7 +185,8 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor softmax = ff.softmax(dense, -1); output = ff.sampling(softmax, samplingConfig.topp); } else { - output = ff.arg_top_k(dense, /*k=*/1, false); + // output = ff.arg_top_k(dense, /*k=*/1, false); + output = ff.argmax(dense, /*beam_Search*/ false); } } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 94aeb7f2bd..05cee2bf9d 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -215,9 +215,11 @@ void OPT::create_opt_model(FFModel &ff, Tensor output; if (mode == BEAM_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); - output = ff.beam_top_k(softmax, opt_config.max_beam_width, false); + // output = ff.beam_top_k(softmax, opt_config.max_beam_width, false); + output = ff.argmax(softmax, /*beam_Search*/ true); } else { - output = ff.arg_top_k(lm_head, /*k=*/1, false); + // output = ff.arg_top_k(lm_head, /*k=*/1, false); + output = ff.argmax(lm_head, /*beam_Search*/ false); } //------------------- compile the model -------------------------------- diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc new file mode 100644 index 0000000000..754337448e --- /dev/null +++ b/src/ops/argmax.cc @@ -0,0 +1,442 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
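Patch 182 swaps the decoding heads in llama.cc and opt.cc: beam_top_k becomes argmax(softmax, beam_search=true) and arg_top_k(k=1) becomes argmax(logits, beam_search=false). In the non-beam case the two are functionally the same greedy selection; the NumPy line below is a reference for that selection only, not the CUDA kernel, which also returns the selected probability and, in beam mode, a parent id per token.

```python
import numpy as np

def greedy_token_reference(scores):
    # One best token id per request, as produced by argmax / arg_top_k(k=1).
    return np.argmax(scores, axis=-1).astype(np.int32)
```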
+ */ + +#include "flexflow/ops/argmax.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +Tensor FFModel::argmax(const Tensor input, bool beam_search, char const *name) { + Layer *li = new Layer(this, + OP_ARGMAX, + input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + beam_search ? 3 : 2 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + // now just support 1 output + dims[0] = 1; + // li->outputs[0] = create_tensor_legion_ordering( + // numdims, dims, input->data_type, li, 0, true /*create_grad*/); + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + // logits + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, input->data_type, li, 1, false /*create_grad*/); + + if (beam_search) { + // parent id + li->outputs[2] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 1, false /*create_grad*/); + } + } + li->add_int_property("beam_search", beam_search); + layers.push_back(li); + // outputs[0] = li->outputs[0]; + // outputs[1] = li->outputs[1]; + return li->outputs[0]; +} + +Op *ArgMax::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("beam_search", value); + bool beam_search = (bool)value; + return new ArgMax(model, inputs[0], beam_search, layer->name); +} + +ArgMaxParams ArgMax::get_params() const { + ArgMaxParams params; + params.beam_search = this->beam_search; + return params; +} + +bool ArgMaxParams::is_valid(ParallelTensorShape const &) const { + return true; +} + +bool operator==(ArgMaxParams const &lhs, ArgMaxParams const &rhs) { + return lhs.beam_search == rhs.beam_search; +} + +ArgMax::ArgMax(FFModel &model, + const ParallelTensor _input, + bool _beam_search, + char const *name) + : Op(model, + OP_ARGMAX, + _input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + _beam_search ? 
3 : 2 /*outputs*/, + _input), + beam_search(_beam_search) { + int numdim = inputs[0]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[0]->dims[i]; + } + dims[0].size = 1; + assert(inputs[0]->dims[0].degree == 1); + assert(inputs[0]->dims[0].parallel_idx == -1); + // outputs[0] = model.create_parallel_tensor_legion_ordering( + // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_INT32, this, 0 /*owner_idx*/); + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this, 1 /*owner_idx*/); + if (_beam_search) { + outputs[2] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_INT32, this, 2 /*owner_idx*/); + } +} + +ArgMax::ArgMax(FFModel &model, ArgMax const &other, const ParallelTensor input) + : ArgMax(model, input, other.beam_search, other.name) {} + +ArgMax::ArgMax(FFModel &model, + ArgMaxParams const ¶ms, + const ParallelTensor input, + char const *name) + : ArgMax(model, input, params.beam_search, name) {} + +void ArgMax::init_inference(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ARGMAX_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ArgMax)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void ArgMax::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ARGMAX_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ArgMax)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta 
*ArgMax::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ArgMax *s = (ArgMax *)task->args; + FFHandler handle = *((FFHandler *)task->local_args); + GenericTensorAccessorW acc_input = + helperGetGenericTensorAccessorRW(s->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + ArgMaxMeta *m = + new ArgMaxMeta(handle, s, input_domain, output_domain, acc_input); + m->profiling = s->profiling; + m->beam_search = s->beam_search; + return m; +} + +void ArgMax::forward(FFModel const &ff) { + // ArgMax does not support forward + assert(false); +} + +FutureMap ArgMax::inference(FFModel const &ff, + BatchConfig const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "ArgMax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + if (beam_search) { + IndexLauncher launcher(ARGMAX_BEAM_INF_TASK_ID, + parallel_is, + TaskArgument(&bc, sizeof(BatchConfig)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[2]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[2]->region)); + launcher.add_field(3, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + } else { + IndexLauncher launcher(ARGMAX_NORM_INF_TASK_ID, + parallel_is, + TaskArgument(&bc, sizeof(BatchConfig)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + } +} + +BeamInferenceResult + ArgMax::inference_task_beam(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 4); + assert(task->regions.size() == 4); + BatchConfig const *bc = 
(BatchConfig *)task->args;
+  ArgMaxMeta const *m = *((ArgMaxMeta **)task->local_args);
+
+  GenericTensorAccessorW input = helperGetGenericTensorAccessorRW(
+      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
+      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  int batch_size = bc->num_active_tokens();
+  GenericTensorAccessorW value = helperGetGenericTensorAccessorWO(
+      m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO(
+      DT_INT32, regions[3], task->regions[3], FID_DATA, ctx, runtime);
+  ArgMax::forward_kernel_wrapper(m, input, indices, value, parent);
+
+  BeamInferenceResult ir;
+  download_tensor<BatchConfig::TokenId>(
+      indices.get_int32_ptr(), ir.token_ids, batch_size);
+  if (m->input_type[0] == DT_FLOAT) {
+    download_tensor<float>(value.get_float_ptr(), ir.probs, batch_size);
+  } else if (m->input_type[0] == DT_HALF) {
+    download_tensor<float>(m->probs, ir.probs, batch_size);
+  }
+
+  download_tensor<int>(parent.get_int32_ptr(), ir.parent_id, batch_size);
+  return ir;
+}
+
+InferenceResult
+    ArgMax::inference_task_norm(Task const *task,
+                                std::vector<PhysicalRegion> const &regions,
+                                Context ctx,
+                                Runtime *runtime) {
+  assert(regions.size() == 3);
+  assert(task->regions.size() == 3);
+  BatchConfig const *bc = (BatchConfig *)task->args;
+  ArgMaxMeta const *m = *((ArgMaxMeta **)task->local_args);
+
+  GenericTensorAccessorW input = helperGetGenericTensorAccessorRW(
+      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
+      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW value = helperGetGenericTensorAccessorWO(
+      m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW parent;
+  int batch_size = bc->num_active_tokens();
+  ArgMax::forward_kernel_wrapper(m, input, indices, value, parent);
+  InferenceResult ir;
+  download_tensor<BatchConfig::TokenId>(
+      indices.get_int32_ptr(), ir.token_ids, batch_size);
+  return ir;
+}
+
+void ArgMax::backward(FFModel const &ff) {
+  // ArgMax does not support backward
+  assert(false);
+}
+
+void ArgMax::serialize(Legion::Serializer &sez) const {
+  sez.serialize(this->beam_search);
+}
+
+Node ArgMax::deserialize(FFModel &ff,
+                         Legion::Deserializer &dez,
+                         ParallelTensor inputs[],
+                         int num_inputs) {
+  assert(num_inputs == 1);
+  bool beam_search;
+  dez.deserialize(beam_search);
+  ArgMaxParams params;
+  params.beam_search = beam_search;
+  return ff.get_or_create_node<ArgMax>(inputs[0], params);
+}
+
+Op *ArgMax::materialize(FFModel &ff,
+                        ParallelTensor inputs[],
+                        int num_inputs) const {
+  ArgMaxParams params = get_params();
+  return new ArgMax(ff, params, inputs[0], this->name);
+}
+
+bool ArgMax::measure_operator_cost(Simulator *sim,
+                                   MachineView const &mv,
+                                   CostMetrics &cost_metrics) const {
+  return false;
+}
+
+}; // namespace FlexFlow
+
+namespace std {
+size_t hash<FlexFlow::ArgMaxParams>::operator()(
+    FlexFlow::ArgMaxParams const &params) const {
+  size_t key = 0;
+  hash_combine(key, params.beam_search);
+  return key;
+}
+}; // namespace std
\ No newline at end of file
diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp
new file mode 100644
index 0000000000..1395a1cdeb
--- /dev/null
+++ b/src/ops/argmax.cpp
@@ -0,0 +1,69 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/ops/argmax.h"
+#include "flexflow/ffconst_utils.h"
+#include "flexflow/utils/hip_helper.h"
+#include <hip/hip_runtime.h>
+
+namespace FlexFlow {
+
+/*static*/
+template <typename DT>
+void ArgMax::forward_kernel(ArgMaxMeta const *m,
+                            DT *input_ptr,
+                            int *indices_ptr,
+                            DT *prob_ptr,
+                            int *parent_ptr,
+                            int length,
+                            int batch_size,
+                            ffStream_t stream) {}
+
+/*static*/
+void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m,
+                                    GenericTensorAccessorW const &input,
+                                    GenericTensorAccessorW const &indices,
+                                    GenericTensorAccessorW const &value,
+                                    GenericTensorAccessorW const &parent) {
+  hipStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+
+  hipEvent_t t_start, t_end;
+  if (m->profiling) {
+    hipEventCreate(&t_start);
+    hipEventCreate(&t_end);
+    hipEventRecord(t_start, stream);
+  }
+
+  // ArgMax does not have a HIP/ROCm kernel yet; report it as unimplemented.
+  handle_unimplemented_hip_kernel(OP_ARGMAX);
+
+  if (m->profiling) {
+    hipEventRecord(t_end, stream);
+    checkCUDA(hipEventSynchronize(t_end));
+    float elapsed = 0;
+    checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end));
+    hipEventDestroy(t_start);
+    hipEventDestroy(t_end);
+  }
+}
+
+ArgMaxMeta::ArgMaxMeta(FFHandler handler,
+                       Op const *op,
+                       Legion::Domain const &input_domain,
+                       Legion::Domain const &output_domain,
+                       GenericTensorAccessorW input)
+    : OpMeta(handler, op) {}
+
+}; // namespace FlexFlow
\ No newline at end of file
diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
new file mode 100644
index 0000000000..99487ea380
--- /dev/null
+++ b/src/ops/argmax.cu
@@ -0,0 +1,151 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/ffconst_utils.h"
+#include "flexflow/ops/argmax.h"
+#include "flexflow/utils/cuda_helper.h"
+
+namespace FlexFlow {
+
+__global__ void
+    half_2_float_array(half *ptr, float *ptr_f, int num_of_elements) {
+  CUDA_KERNEL_LOOP(i, num_of_elements) {
+    ptr_f[i] = __half2float(ptr[i]);
+  }
+}
+
+/*static*/
+template <typename DT>
+void ArgMax::forward_kernel(ArgMaxMeta const *m,
+                            DT *input_ptr,
+                            int *indices_ptr,
+                            DT *prob_ptr,
+                            int *parent,
+                            int const length,
+                            int const batch_size,
+                            cudaStream_t stream) {
+
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  DT alpha = 1.0f, beta = 0.0f;
+  if (m->beam_search) {
+    // set all parents id zero in arg top1 case.
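+    // In greedy top-1 decoding there is only a single beam, so every sampled
+    // token trivially descends from beam slot 0 and zero-filling the parent
+    // buffer below is sufficient. The cudnnReduceTensor call that follows
+    // performs the actual argmax: with CUDNN_REDUCE_TENSOR_MAX and
+    // CUDNN_REDUCE_TENSOR_FLATTENED_INDICES (configured in ArgMaxMeta), it
+    // writes each row's maximum into prob_ptr and the 32-bit index of that
+    // maximum into indices_ptr.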
+ checkCUDA(cudaMemset(parent, 0, batch_size * sizeof(int))); + } + checkCUDNN(cudnnReduceTensor(m->handle.dnn, + m->reduceMaxDesc, + indices_ptr /*indices*/, + batch_size * sizeof(int) /*indicesSizeInBytes*/, + m->handle.workSpace, + m->handle.workSpaceSize, + &alpha, + m->inputTensor, + input_ptr, + &beta, + m->outputTensor, + prob_ptr)); +} + +/*static*/ +void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &indices, + GenericTensorAccessorW const &value, + GenericTensorAccessorW const &parent) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int batch_size = input.domain.get_volume() / length; + + if (input.data_type == DT_HALF) { + ArgMax::forward_kernel(m, + input.get_half_ptr(), + indices.get_int32_ptr(), + value.get_half_ptr(), + m->beam_search ? parent.get_int32_ptr() + : nullptr, + length, + batch_size, + stream); + if (m->beam_search) { + half_2_float_array<<>>( + value.get_half_ptr(), m->probs, batch_size); + } + + } else if (input.data_type == DT_FLOAT) { + ArgMax::forward_kernel(m, + input.get_float_ptr(), + indices.get_int32_ptr(), + value.get_float_ptr(), + m->beam_search ? parent.get_int32_ptr() + : nullptr, + length, + batch_size, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ArgMax] forward time = %.2lfms\n", elapsed); + } +} + +ArgMaxMeta::ArgMaxMeta(FFHandler handler, + Op const *op, + Legion::Domain const &input_domain, + Legion::Domain const &output_domain, + GenericTensorAccessorW input) + : OpMeta(handler, op) { + DataType data_type = op->data_type; + checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); + checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceMaxDesc)); + + // Float and Half use save type, according to + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnReduceTensor:~:text=not%20coordinate%20tuples.-,The%20data%20types%20of%20the%20tensors,.,-Note%3A + cudnnDataType_t cudnn_data_type = CUDNN_DATA_FLOAT; + + checkCUDNN( + cudnnSetReduceTensorDescriptor(reduceMaxDesc, + CUDNN_REDUCE_TENSOR_MAX, + cudnn_data_type, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES, + CUDNN_32BIT_INDICES)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain( + outputTensor, output_domain, data_type)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain, data_type)); + + checkCUDA(cudaMalloc(&probs, sizeof(float) * BatchConfig::MAX_NUM_TOKENS)); +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index dff5157a8a..da22a245f1 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -219,6 +219,7 @@ __host__ void cudaHostAllocPortable | cudaHostAllocMapped)); checkCUDA(cudaMemcpyAsync( host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); + cudaDeviceSynchronize(); int idx = 0; printf("%s", prefix); for (idx = 0; idx < num_elements; idx++) { diff --git a/src/runtime/ffconst_utils.cc 
b/src/runtime/ffconst_utils.cc index a777605daf..35ec59ce03 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -180,6 +180,8 @@ std::string get_operator_type_name(OperatorType type) { return "Identity"; case OP_SAMPLING: return "Sampling"; + case OP_ARGMAX: + return "ArgMax"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 16bccc25df..a82add4b62 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -17,6 +17,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/aggregate.h" #include "flexflow/ops/arg_topk.h" +#include "flexflow/ops/argmax.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/beam_topk.h" @@ -2924,6 +2925,10 @@ void FFModel::deserialize_graph_optimal_view( node = Sampling::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_ARGMAX: { + node = ArgMax::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_GROUP_BY: { node = Group_by::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 22515a2bb0..66cad1f248 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -25,6 +25,7 @@ #include "flexflow/ops/aggregate.h" #include "flexflow/ops/aggregate_spec.h" #include "flexflow/ops/arg_topk.h" +#include "flexflow/ops/argmax.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -2943,6 +2944,11 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_ARGMAX: { + Op *op = ArgMax::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_GROUP_BY: { Op *op = Group_by::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -2984,7 +2990,8 @@ void FFModel::create_operators_from_layers() { // add a combine before arg_topk if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX)) { + (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || + l->op_type == OP_ARGMAX)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -5444,6 +5451,56 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + // ArgMax task + { + TaskVariantRegistrar registrar(ARGMAX_INIT_TASK_ID, "ArgMax Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgMax Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ARGMAX_BEAM_INF_TASK_ID, + "ArgMax Beam Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgMax Inference Task Beam"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ARGMAX_NORM_INF_TASK_ID, + "ArgMax Norm Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgMax Inference Task 
Norm"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime + ->register_task_variant( + registrar); + } + } // Transpose task { TaskVariantRegistrar registrar(TRANSPOSE_INIT_TASK_ID, "Transpose Init"); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 8fb8c89b10..bf817f5351 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -2,6 +2,7 @@ #include "flexflow/ops/aggregate.h" #include "flexflow/ops/aggregate_spec.h" #include "flexflow/ops/arg_topk.h" +#include "flexflow/ops/argmax.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -133,6 +134,8 @@ tl::optional get_op_parameters(Op const *op) { return ((BeamTopK *)op)->get_params(); case OP_SAMPLING: return ((Sampling *)op)->get_params(); + case OP_ARGMAX: + return ((ArgMax *)op)->get_params(); // TODO: implement the get_params() function for the operators below and // uncomment the lines below From 821b32f25a19bfe51abdb6001dd3851cc3f1bdb7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 27 Jul 2023 16:52:19 -0400 Subject: [PATCH 183/344] [Docker] - Make it easier to attach inference weights to docker (#891) --- docker/run.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docker/run.sh b/docker/run.sh index 307628f4fd..f326db0a6c 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -17,6 +17,9 @@ ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi +# Whether to attach inference weights / files (make sure to download the weights first) +ATTACH_INFERENCE_FILES=${ATTACH_INFERENCE_FILES:-false} + # Amount of shared memory to give the Docker container access to # If you get a Bus Error, increase this value. If you don't have enough memory # on your machine, decrease this value. @@ -69,4 +72,11 @@ if [[ "$(docker images -q "$image"-"$FF_GPU_BACKEND""$cuda_version_hyphen":lates exit 1 fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${image}-${FF_GPU_BACKEND}${cuda_version_hyphen}:latest" +inference_volumes="" +if $ATTACH_INFERENCE_FILES ; then + inference_volumes="-v $(pwd)/../inference/weights:/usr/FlexFlow/inference/weights \ + -v $(pwd)/../inference/prompt:/usr/FlexFlow/inference/prompt \ + -v $(pwd)/../inference/tokenizer:/usr/FlexFlow/inference/tokenizer"; +fi + +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${cuda_version_hyphen}:latest" From bf0f30e9eae5d313b3164fb9bd7cb7e3b3f98167 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 27 Jul 2023 20:41:04 -0500 Subject: [PATCH 184/344] Make BatchConfig and InferenceResult Legion futures (#860) * checkpoint * format * bug fixes * fix python segfault * fix python segfault 2 * fix build error * compiled * make inc_decoding work * bug fix * disable early stop * fix future error * Fix edge cases with specific prompt lengths. * Discard long prompt request. 
* fix * include flexflow/accessor.h in the cuda_/hip_helper header files * hip rocm fix --------- Co-authored-by: Gabriele Oliaro Co-authored-by: Zeyu Wang Co-authored-by: zwang86 <46699021+zwang86@users.noreply.github.com> --- config/config.linux | 2 +- examples/cpp/inference/dataloader.cu | 1 + .../cpp/inference/mixture_of_experts/moe.cc | 1 + .../inference/transformers/transformers.cc | 1 + include/flexflow/batch_config.h | 12 +- include/flexflow/inference.h | 167 +------ include/flexflow/model.h | 10 + include/flexflow/operator.h | 2 +- include/flexflow/ops/aggregate.h | 2 +- include/flexflow/ops/aggregate_spec.h | 2 +- include/flexflow/ops/arg_topk.h | 2 +- include/flexflow/ops/argmax.h | 2 +- include/flexflow/ops/attention.h | 2 +- include/flexflow/ops/beam_topk.h | 2 +- include/flexflow/ops/cast.h | 2 +- include/flexflow/ops/element_binary.h | 2 +- include/flexflow/ops/element_unary.h | 2 +- include/flexflow/ops/embedding.h | 2 +- include/flexflow/ops/experts.h | 2 +- include/flexflow/ops/fused.h | 2 +- include/flexflow/ops/groupby.h | 2 +- .../ops/inc_multihead_self_attention.h | 3 +- .../ops/inc_multiquery_self_attention.h | 3 +- include/flexflow/ops/layer_norm.h | 2 +- include/flexflow/ops/linear.h | 2 +- include/flexflow/ops/noop.h | 2 +- include/flexflow/ops/rms_norm.h | 4 +- include/flexflow/ops/sampling.h | 2 +- include/flexflow/ops/softmax.h | 2 +- .../ops/spec_inc_multihead_self_attention.h | 3 +- include/flexflow/ops/split.h | 2 +- include/flexflow/ops/topk.h | 2 +- .../ops/tree_inc_multihead_self_attention.h | 4 +- include/flexflow/parallel_ops/allreduce.h | 6 +- include/flexflow/parallel_ops/combine.h | 2 +- .../parallel_ops/kernels/allreduce_kernels.h | 6 + include/flexflow/parallel_ops/partition.h | 2 +- include/flexflow/parallel_ops/reduction.h | 2 +- include/flexflow/parallel_ops/replicate.h | 2 +- include/flexflow/request_manager.h | 242 ++++++++++ include/flexflow/utils/cuda_helper.h | 1 + include/flexflow/utils/hip_helper.h | 1 + inference/incr_decoding/incr_decoding.cc | 58 +-- inference/models/falcon.cc | 6 +- inference/models/falcon.h | 2 +- inference/models/llama.cc | 6 +- inference/models/llama.h | 2 +- inference/models/opt.cc | 6 +- inference/models/opt.h | 2 +- inference/spec_infer/spec_infer.cc | 64 +-- src/c/flexflow_c.cc | 2 +- src/mapper/mapper.cc | 7 + src/ops/aggregate.cc | 2 +- src/ops/aggregate_spec.cc | 2 +- src/ops/arg_topk.cc | 13 +- src/ops/argmax.cc | 22 +- src/ops/attention.cc | 2 +- src/ops/beam_topk.cc | 43 +- src/ops/cast.cc | 2 +- src/ops/element_binary.cc | 2 +- src/ops/element_unary.cc | 2 +- src/ops/embedding.cc | 2 +- src/ops/experts.cc | 14 +- src/ops/fused.cc | 9 +- src/ops/fused.cu | 30 +- src/ops/group_by.cc | 2 +- src/ops/inc_multihead_self_attention.cc | 22 +- src/ops/inc_multiquery_self_attention.cc | 18 +- src/ops/layer_norm.cc | 2 +- src/ops/linear.cc | 8 +- src/ops/noop.cc | 2 +- src/ops/rms_norm.cc | 2 +- src/ops/sampling.cc | 2 +- src/ops/softmax.cc | 2 +- src/ops/spec_inc_multihead_self_attention.cc | 34 +- src/ops/split.cc | 2 +- src/ops/topk.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cc | 124 ++--- src/parallel_ops/allreduce.cc | 28 +- src/parallel_ops/combine.cc | 2 +- .../kernels/allreduce_kernels.cpp | 11 + src/parallel_ops/kernels/allreduce_kernels.cu | 24 + src/parallel_ops/partition.cc | 2 +- src/parallel_ops/reduction.cc | 2 +- src/parallel_ops/replicate.cc | 2 +- src/runtime/batch_config.cc | 19 + src/runtime/inference_manager.cc | 96 ++-- src/runtime/model.cc | 121 ++++- src/runtime/request_manager.cc | 
440 ++++++++++++++++-- src/runtime/request_manager.cpp | 2 +- src/runtime/request_manager.cu | 34 +- 91 files changed, 1283 insertions(+), 539 deletions(-) create mode 100644 include/flexflow/request_manager.h diff --git a/config/config.linux b/config/config.linux index dbfdf6f275..738812b8b5 100755 --- a/config/config.linux +++ b/config/config.linux @@ -75,7 +75,7 @@ FF_USE_AVX2=${FF_USE_AVX2:-OFF} FF_MAX_DIM=${FF_MAX_DIM:-5} # set LEGION_MAX_RETURN_SIZE -LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-131072} +LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144} # set ROCM path ROCM_PATH=${ROCM_PATH:-"/opt/rocm"} diff --git a/examples/cpp/inference/dataloader.cu b/examples/cpp/inference/dataloader.cu index 7fb3478020..434dc337c9 100644 --- a/examples/cpp/inference/dataloader.cu +++ b/examples/cpp/inference/dataloader.cu @@ -15,6 +15,7 @@ #include "dataloader.h" #include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" void DataLoader::load_input(Task const *task, diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index ff3f6bb53a..4a5c33c9b0 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -15,6 +15,7 @@ #include "moe.h" #include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include #include #include diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 074e832d47..0717ddc90f 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -15,6 +15,7 @@ #include "transformers.h" #include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include #include #include diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index e3ce1c5b95..5e68a65d8c 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/ffconst.h" +#include "legion.h" #include #include @@ -29,6 +30,12 @@ namespace FlexFlow { class InferenceResult; class BeamInferenceResult; +using BatchConfigFuture = Legion::Future; +using InferenceResultFuture = Legion::Future; +using BeamSearchBatchConfigFuture = Legion::Future; +using TreeVerifyBatchConfigFuture = Legion::Future; +using BeamInferenceResultFuture = Legion::Future; + class BatchConfig { public: using RequestGuid = size_t; @@ -44,8 +51,11 @@ class BatchConfig { int num_active_tokens() const; void print() const; virtual InferenceMode get_mode() const; - static int const MAX_NUM_REQUESTS = 16; + static BatchConfig const *from_future(BatchConfigFuture const &future); + static int const MAX_NUM_REQUESTS = 1; static int const MAX_NUM_TOKENS = 64; + static int const MAX_PROMPT_LENGTH = + 63; // should be MAX_NUM_TOKENS - 1 for SpecInfer static int const MAX_SEQ_LENGTH = 256; // These are set by update diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 823bac9fd6..c30b0c0be3 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -14,65 +14,10 @@ */ #pragma once - #include "flexflow/batch_config.h" -#include "flexflow/model.h" -#include -#include namespace FlexFlow { -class FFModel; -class BeamTree; -class RequestManager; -using tokenizers::Tokenizer; - -class InferenceManager { -public: - InferenceManager(FFConfig const &config, int max_num_tokens_per_batch); - void 
compile_model_and_allocate_buffer(FFModel *model); - void init_operators_inference(FFModel *model); - MachineView *get_machine_view(int mv_id); - Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); - void load_input_tokens_from_batch_config(BatchConfig const &bc, - ParallelTensor const input); - void load_positions(BatchConfig const &bc, ParallelTensor position_input); - void incr_decoding_loop(FFModel *model, - RequestManager &rm, - int total_num_requests); - void spec_inference_loop(FFModel *model, - RequestManager &rm, - int total_num_requests, - std::vector ssm_model_ids); - -public: - FFConfig ff_config; - std::unordered_map> tensor_buffer; - int max_num_tokens_per_batch; - int num_devices; - std::vector machine_views; -}; - -struct Request { - BatchConfig::RequestGuid guid; - int max_sequence_length; - int initial_len; - std::vector tokens; - - std::vector beam_trees; -}; - -// store the result of beam search -struct BeamTree { - struct treeLayer { - BeamSearchBatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - }; - treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; -}; - struct SamplingConfig { bool do_sample = false; float temperature = 0.8; @@ -85,114 +30,14 @@ struct SamplingConfig { SamplingConfig() {} }; -// struct BeamTree_v2 { -// std::vector tokens; -// std::vector parent_ids; -// std::vector probs; -// }; - -class RequestManager { -public: +struct GenerationResult { using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; - RequestManager(ModelType model_type, - std::string const &path, - bool verbose = false, - std::string output_filepath = ""); - RequestManager(); - size_t get_num_processed_requests(); - - int register_new_model(FFModel *model); - - FFModel *get_model(int model_id); - - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); - BatchConfig prepare_next_batch(BatchConfig const &bc, - InferenceResult const &result); - BeamSearchBatchConfig - prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result); - - BeamSearchBatchConfig - prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, - InferenceResult const &result, - int model_id); - - TreeVerifyBatchConfig prepare_next_batch_verify( - std::vector const &old_batches); - - void store_beam_metadata(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result); - void update_beam_metadata(BeamSearchBatchConfig &new_bc, - BeamTree &tree, - int request_index); - - std::vector> - traverse_beam_tree(BeamSearchBatchConfig const &old_bc, - int request_index, - int token_start_offset); - - // remove guid after put the cached tree in request - std::vector> merge_dfs_trees( - std::vector>> - input_trees, - int root_depth, - RequestGuid guid); - - std::vector> traverse_verify_tree( - size_t guid, - std::vector> const - &inputSerializedTree, - std::vector> const - &outputSerializedTree); - - static void - load_tokens_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - static void - load_positions_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - -private: - std::unique_ptr tokenizer_; - bool verbose; - ModelType model_type; - 
std::string output_filepath; - std::queue pending_request_queue; - std::unordered_map running_request_queue; - std::mutex request_queue_mutex; - RequestGuid next_available_guid; - const std::map model_bos_map = {{ModelType::LLAMA, 0}, - {ModelType::OPT, 2}}; - - // TODO: Move this two vector to request struct - std::unordered_map>> - dfs_tree_inputs; - std::unordered_map>> - committed_tokens; - - // Multi-model support - int num_ssms; - std::vector models; - - // Performance profiling - size_t num_processed_requests; - -private: - struct ProfileInfo { - int decoding_steps; - double start_time, finish_time; - }; - std::unordered_map profiling_requests; - double total_request_run_time; + RequestGuid guid; + std::string input_text; + std::string output_text; + std::vector input_tokens; + std::vector output_tokens; }; } // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 0e98b6e8ad..1f30d451ef 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -17,6 +17,7 @@ #include "accessor.h" #include "config.h" #include "device.h" +#include "flexflow/inference.h" #include "flexflow/memory_optimization.h" #include "flexflow/node.h" #include "flexflow/operator_params.h" @@ -220,6 +221,7 @@ enum TaskIDs { PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, ALLREDUCE_INIT_TASK_ID, + ALLREDUCE_INF_TASK_ID, ALLREDUCE_FWD_TASK_ID, ALLREDUCE_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, @@ -228,6 +230,10 @@ enum TaskIDs { // InferenceManager & RequestManager RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, + RM_PREPARE_NEXT_BATCH_TASK_ID, + RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, + RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, + RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, @@ -697,6 +703,10 @@ class FFModel { float scaling_factor = 1.0f, bool qk_prod_scaling = true, char const *name = NULL); + // ======================================== + // Inference APIs + // ======================================== + GenerationResult generate(std::string const &text, int max_seq_length); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 13b2bb8ba8..0ef7f6cbac 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -196,7 +196,7 @@ class Op { virtual void backward(FFModel const &) = 0; // Pure virtual functions for inference virtual Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) { diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 9200c4b123..3ba4f414d1 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -42,7 +42,7 @@ class Aggregate : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 2c17674181..4302dd0733 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -34,7 +34,7 @@ class AggregateSpec : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + 
BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h index ed92200fbe..8b2d2aa11c 100644 --- a/include/flexflow/ops/arg_topk.h +++ b/include/flexflow/ops/arg_topk.h @@ -40,7 +40,7 @@ class ArgTopK : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h index d6d15f2a3c..709861f51c 100644 --- a/include/flexflow/ops/argmax.h +++ b/include/flexflow/ops/argmax.h @@ -47,7 +47,7 @@ class ArgMax : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/attention.h b/include/flexflow/ops/attention.h index 3f4c14593f..7f52e0dad4 100644 --- a/include/flexflow/ops/attention.h +++ b/include/flexflow/ops/attention.h @@ -72,7 +72,7 @@ class MultiHeadAttention : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h index 57ab5c1074..639a8ead92 100644 --- a/include/flexflow/ops/beam_topk.h +++ b/include/flexflow/ops/beam_topk.h @@ -43,7 +43,7 @@ class BeamTopK : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/cast.h b/include/flexflow/ops/cast.h index b1e078f60e..a06f87b3c8 100644 --- a/include/flexflow/ops/cast.h +++ b/include/flexflow/ops/cast.h @@ -42,7 +42,7 @@ class Cast : public Op { void forward(FFModel const &); void backward(FFModel const &); Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index fe7dc2602c..4aa41ed9e4 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -33,7 +33,7 @@ class ElementBinary : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index db0c4b02ca..2df9ea61bc 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -53,7 +53,7 @@ class ElementUnary : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const 
&, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index bd7c15b2fe..ae93ef4d1d 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -56,7 +56,7 @@ class Embedding : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index c0a6c107aa..d68957d890 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -112,7 +112,7 @@ class Experts : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index 87e562d143..87c2201c28 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -36,7 +36,7 @@ class FusedOp : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index ae421751c3..ec6cdfb9ab 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -42,7 +42,7 @@ class Group_by : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index b5d441713f..244100bc6f 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H +#include "flexflow/accessor.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/inference.h" @@ -82,7 +83,7 @@ class IncMultiHeadSelfAttention : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/inc_multiquery_self_attention.h b/include/flexflow/ops/inc_multiquery_self_attention.h index ba6a1feeaf..1e36876c57 100644 --- a/include/flexflow/ops/inc_multiquery_self_attention.h +++ b/include/flexflow/ops/inc_multiquery_self_attention.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H #define _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H +#include "flexflow/accessor.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/inference.h" @@ -69,7 +70,7 @@ class IncMultiQuerySelfAttention : public Op { void forward(FFModel const &) override; void backward(FFModel const &) 
override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index b5a36262b4..058884cc9a 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -31,7 +31,7 @@ class LayerNorm : public Op { void forward(FFModel const &); void backward(FFModel const &); Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index ff6ba1ef90..025674c7ba 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -48,7 +48,7 @@ class Linear : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/noop.h b/include/flexflow/ops/noop.h index add4150e85..e07d10a05e 100644 --- a/include/flexflow/ops/noop.h +++ b/include/flexflow/ops/noop.h @@ -24,7 +24,7 @@ class NoOp : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index db18ebdd39..38ce983dd0 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -38,7 +38,7 @@ class RMSNorm : public Op { std::vector const &, MachineView const *mv = nullptr) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; @@ -79,4 +79,4 @@ class RMSNorm : public Op { int dim, data_dim; }; } // namespace FlexFlow -#endif // _FLEXFLOW_RMS_NORM_H \ No newline at end of file +#endif // _FLEXFLOW_RMS_NORM_H diff --git a/include/flexflow/ops/sampling.h b/include/flexflow/ops/sampling.h index 8ffa6a290a..da554d4f34 100644 --- a/include/flexflow/ops/sampling.h +++ b/include/flexflow/ops/sampling.h @@ -53,7 +53,7 @@ class Sampling : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 3f29de905b..1d5191d7ee 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -28,7 +28,7 @@ class Softmax : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index eef684cdb7..c8c1c4c9cf 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h 
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H #define _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_H +#include "flexflow/accessor.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/inference.h" @@ -78,7 +79,7 @@ class SpecIncMultiHeadSelfAttention : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/split.h b/include/flexflow/ops/split.h index cd40d73e18..cb9c6bdb57 100644 --- a/include/flexflow/ops/split.h +++ b/include/flexflow/ops/split.h @@ -27,7 +27,7 @@ class Split : public Op { std::vector const &, MachineView const *mv = nullptr) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index ec3691ea11..47144bf6d7 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -36,7 +36,7 @@ class TopK : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index a9e584aa2b..ba1d80dd60 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H +#include "flexflow/accessor.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/inference.h" @@ -9,6 +10,7 @@ #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/tree_inc_multihead_self_attention_params.h" #include "math.h" #include #include @@ -81,7 +83,7 @@ class TreeIncMultiHeadSelfAttention : public Op { void forward(FFModel const &) override; void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &, + BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 2faf128d93..045f9b36a0 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -35,7 +35,7 @@ class AllReduce : public ParallelOp { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; @@ -47,6 +47,10 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void 
forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/parallel_ops/combine.h b/include/flexflow/parallel_ops/combine.h index d09a789de2..2e4fdb86a9 100644 --- a/include/flexflow/parallel_ops/combine.h +++ b/include/flexflow/parallel_ops/combine.h @@ -36,7 +36,7 @@ class Combine : public ParallelOp { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h index 02a5026fcf..bdf7aae501 100644 --- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -16,6 +17,11 @@ class AllReduceMeta : public OpMeta { namespace Kernels { namespace AllReduce { +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + void forward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); diff --git a/include/flexflow/parallel_ops/partition.h b/include/flexflow/parallel_ops/partition.h index 21eda315ed..4b0013b11d 100644 --- a/include/flexflow/parallel_ops/partition.h +++ b/include/flexflow/parallel_ops/partition.h @@ -36,7 +36,7 @@ class Repartition : public ParallelOp { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/parallel_ops/reduction.h b/include/flexflow/parallel_ops/reduction.h index 1918c3b587..89f8bfbee0 100644 --- a/include/flexflow/parallel_ops/reduction.h +++ b/include/flexflow/parallel_ops/reduction.h @@ -36,7 +36,7 @@ class Reduction : public ParallelOp { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index f8f2c42559..65d69d8564 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -38,7 +38,7 @@ class Replicate : public ParallelOp { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h new file mode 100644 index 0000000000..9dd19ee7f9 --- /dev/null +++ b/include/flexflow/request_manager.h @@ -0,0 +1,242 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); 
+ * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "flexflow/batch_config.h" +#include "flexflow/inference.h" +#include "flexflow/model.h" +#include +#include + +namespace FlexFlow { + +class FFModel; +class BeamTree; +class RequestManager; +using tokenizers::Tokenizer; + +class InferenceManager { +public: + InferenceManager(FFConfig const &config, int max_num_tokens_per_batch); + static InferenceManager *get_inference_manager(); + void compile_model_and_allocate_buffer(FFModel *model); + void init_operators_inference(FFModel *model); + MachineView *get_machine_view(int mv_id); + Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); + Legion::FutureMap + inference(FFModel *model, int index, BatchConfigFuture const &bc); + void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, + ParallelTensor const input); + void load_positions(BatchConfigFuture const &bc, + ParallelTensor position_input); + void incr_decoding_loop(FFModel *model, + RequestManager &rm, + int total_num_requests); + void spec_inference_loop(FFModel *model, + RequestManager &rm, + int total_num_requests, + std::vector ssm_model_ids); + +public: + FFConfig ff_config; + std::unordered_map> tensor_buffer; + int max_num_tokens_per_batch; + int num_devices; + std::vector machine_views; +}; + +struct Request { + BatchConfig::RequestGuid guid; + int max_sequence_length; + int initial_len; + std::vector tokens; + + std::vector beam_trees; +}; + +// store the result of beam search +struct BeamTree { + struct treeLayer { + BeamSearchBatchConfig::TokenId + tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + }; + treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; +}; + +// struct BeamTree_v2 { +// std::vector tokens; +// std::vector parent_ids; +// std::vector probs; +// }; + +class RequestManager { +public: + using RequestGuid = BatchConfig::RequestGuid; + using TokenId = BatchConfig::TokenId; + // RequestManager(ModelType model_type, + // std::string const &path, + // bool verbose = false, + // std::string output_filepath = ""); + RequestManager(); + static RequestManager *get_request_manager(); + size_t get_num_processed_requests(); + size_t get_num_ssms(); + + int register_ssm_model(FFModel *model); + void register_tokenizer(ModelType model_type, std::string const &path); + void register_output_filepath(std::string const &); + + FFModel *get_model(int model_id); + static void serve(FFModel *model); + + GenerationResult generate_incr_decoding(FFModel *model, + std::string const &text, + int max_seq_length); + GenerationResult generate_spec_infer(FFModel *model, + std::string const &text, + int max_seq_length); + GenerationResult get_generation_result(RequestGuid const &guid); + RequestGuid register_new_request(std::string const &prompt, + int max_sequence_length); + RequestGuid register_new_request(std::vector const &prompt, + int max_sequence_length); + bool is_request_completed(RequestGuid const &guid); + 
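+  // Batch scheduling API: each prepare_next_batch* call consumes the
+  // BatchConfig of the previous decoding step together with the
+  // InferenceResult it produced, and assembles the BatchConfig for the next
+  // step (typically admitting newly arrived requests and retiring completed
+  // ones). The *Future overloads wrap the same logic in Legion tasks, so
+  // successive steps can be chained through futures instead of blocking the
+  // caller on every step.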
BatchConfig prepare_next_batch(BatchConfig const &bc, + InferenceResult const &result); + BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, + InferenceResultFuture const &result); + BeamSearchBatchConfig + prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, + BeamInferenceResult const &result); + BeamSearchBatchConfigFuture + prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc, + BeamInferenceResultFuture const &result); + BeamSearchBatchConfig + prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, + InferenceResult const &result, + int model_id); + BeamSearchBatchConfigFuture + prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc, + InferenceResultFuture const &result, + int model_id); + TreeVerifyBatchConfig prepare_next_batch_verify( + std::vector const &old_batches); + TreeVerifyBatchConfigFuture prepare_next_batch_verify( + std::vector const &old_batches); + + void store_beam_metadata(BeamSearchBatchConfig const &old_bc, + BeamInferenceResult const &result); + void update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamTree &tree, + int request_index); + + std::vector> + traverse_beam_tree(BeamSearchBatchConfig const &old_bc, + int request_index, + int token_start_offset); + + // remove guid after put the cached tree in request + std::vector> merge_dfs_trees( + std::vector>> + input_trees, + int root_depth, + RequestGuid guid); + + std::vector> traverse_verify_tree( + size_t guid, + std::vector> const + &inputSerializedTree, + std::vector> const + &outputSerializedTree); + + static void + load_tokens_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void + load_positions_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + static BatchConfig prepare_next_batch_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + static BeamSearchBatchConfig prepare_next_batch_beam_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + static BeamSearchBatchConfig prepare_next_batch_init_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + + static TreeVerifyBatchConfig prepare_next_batch_verify_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + +private: + std::unique_ptr tokenizer_; + bool verbose; + ModelType model_type; + std::string output_filepath; + std::queue pending_request_queue; + std::unordered_map all_requests; + std::unordered_map request_generation_results; + std::mutex request_queue_mutex; + RequestGuid next_available_guid; + // Legion futures for inc_decoding and spec_infer + BatchConfigFuture last_bcf; + InferenceResultFuture last_irf; + TreeVerifyBatchConfigFuture last_tree_bcf; + InferenceResultFuture last_tree_irf; + const std::map model_bos_map = {{ModelType::LLAMA, 0}, + {ModelType::OPT, 2}}; + + // TODO: Move this two vector to request struct + std::unordered_map>> + dfs_tree_inputs; + std::unordered_map>> + committed_tokens; + + // Multi-model support + std::vector models; + + // Performance profiling + size_t num_processed_requests; + +private: + struct ProfileInfo { + int decoding_steps; + double start_time, finish_time; + }; + std::unordered_map profiling_requests; + double total_request_run_time; +}; + +}; // namespace FlexFlow diff 
--git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 1787c5a0b7..e67e50484c 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -1,5 +1,6 @@ #ifndef _FLEXFLOW_CUDA_HELPER_H_ #define _FLEXFLOW_CUDA_HELPER_H_ +#include "flexflow/accessor.h" #include "flexflow/ffconst.h" #include "legion.h" #include diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index c2c4d94cc3..d16f353ade 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -1,5 +1,6 @@ #ifndef _FLEXFLOW_HIP_HELPER_H_ #define _FLEXFLOW_HIP_HELPER_H_ +#include "flexflow/accessor.h" #include "flexflow/ffconst.h" #include "legion.h" #include diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 0059b30ae0..957c41b103 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -14,6 +14,7 @@ */ #include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include "models/falcon.h" #include "models/llama.h" #include "models/opt.h" @@ -40,10 +41,7 @@ void parse_input_args(char **argv, bool &verbose, bool &do_sample, float &temperature, - float &topp, - int &data_parallelism_degree, - int &tensor_parallelism_degree, - int &pipeline_parallelism_degree) { + float &topp) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -88,21 +86,6 @@ void parse_input_args(char **argv, paths.output_file_path = std::string(argv[++i]); continue; } - // data parallelism degree - if (!strcmp(argv[i], "-data-parallelism-degree")) { - data_parallelism_degree = std::stoi(argv[++i]); - continue; - } - // tensor parallelism degree - if (!strcmp(argv[i], "-tensor-parallelism-degree")) { - tensor_parallelism_degree = std::stoi(argv[++i]); - continue; - } - // pipeline parallelism degree - if (!strcmp(argv[i], "-pipeline-parallelism-degree")) { - pipeline_parallelism_degree = std::stoi(argv[++i]); - continue; - } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; @@ -143,8 +126,6 @@ void FlexFlow::top_level_task(Task const *task, float temperature = 0.0f; float topp = 0.0f; size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; - int data_parallelism_degree = 1, tensor_parallelism_degree = 1, - pipeline_parallelism_degree = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -157,32 +138,28 @@ void FlexFlow::top_level_task(Task const *task, verbose, do_sample, temperature, - topp, - data_parallelism_degree, - tensor_parallelism_degree, - pipeline_parallelism_degree); - ffconfig.data_parallelism_degree = data_parallelism_degree; - ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; - ffconfig.pipeline_parallelism_degree = pipeline_parallelism_degree; - - assert(data_parallelism_degree * tensor_parallelism_degree * - pipeline_parallelism_degree == + topp); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); SamplingConfig samplingConfig(do_sample, temperature, topp); - InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); - RequestManager rm(model_type, - file_paths.tokenizer_file_path, - /*verbose*/ verbose, - file_paths.output_file_path); + 
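With the per-run parallelism flags removed and the manager constructors dropped above, the driver now reads its degrees from FFConfig and obtains the managers through the new singleton accessors, as the added lines that follow show. Pulled together, the post-patch flow reads roughly as below; this is a fragment that relies on the declarations added elsewhere in this patch, and the prompt, paths, and degree values are illustrative only:

FFConfig ffconfig;
// Degrees now come from FFConfig rather than CLI flags; their product must
// equal the device count, e.g. 2 * 2 * 2 == 2 nodes * 4 workers per node.
assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
           ffconfig.pipeline_parallelism_degree ==
       ffconfig.numNodes * ffconfig.workersPerNode);

RequestManager *rm = RequestManager::get_request_manager();
rm->register_tokenizer(model_type, "/path/to/tokenizer.model"); // hypothetical path
rm->register_output_filepath("/path/to/output.txt");            // hypothetical path

FFModel model(ffconfig, ffconfig.cpu_offload);
// ... create_llama_model / create_opt_model / create_falcon_model, as in the
//     hunks below (they no longer take an InferenceManager argument) ...
std::string prompt = "example prompt";                           // hypothetical input
GenerationResult result = model.generate(prompt, 128 /*max_sequence_length*/);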
RequestManager *rm = RequestManager::get_request_manager(); + rm->register_tokenizer(model_type, file_paths.tokenizer_file_path); + rm->register_output_filepath(file_paths.output_file_path); + // InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); + // RequestManager rm(model_type, + // file_paths.tokenizer_file_path, + // /*verbose*/ verbose, + // file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { LLAMA::create_llama_model(model, - im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, INC_DECODING_MODE, @@ -190,14 +167,12 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, - im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, INC_DECODING_MODE, use_full_precision); } else if (model_type == ModelType::FALCON) { FALCON::create_falcon_model(model, - im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, ffconfig.workersPerNode * ffconfig.numNodes, @@ -220,12 +195,11 @@ void FlexFlow::top_level_task(Task const *task, std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - rm.register_new_request(text, 128 /*max_sequence_length*/); + GenerationResult result = + model.generate(text, 128 /*max_sequence_length*/); } } - im.incr_decoding_loop(&model, rm, total_num_requests); - // Execution fence { Future future = runtime->issue_execution_fence(ctx); diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index bced5dc1e0..d0ec83508b 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -20,7 +20,6 @@ namespace FlexFlow { using namespace Legion; void FALCON::create_falcon_model(FFModel &ff, - InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, int num_pipeline_stages, @@ -141,7 +140,8 @@ void FALCON::create_falcon_model(FFModel &ff, // Compile the model std::cout << "------start compile ----------" << std::endl; - im.compile_model_and_allocate_buffer(&ff); + InferenceManager *im = InferenceManager::get_inference_manager(); + im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, falcon_config.n_heads, @@ -151,7 +151,7 @@ void FALCON::create_falcon_model(FFModel &ff, std::cout << "------load weight finished----------" << std::endl; // init operators - im.init_operators_inference(&ff); + im->init_operators_inference(&ff); } }; // namespace FlexFlow diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 03cef07e58..d9c330a8b9 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -17,6 +17,7 @@ #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include #include using json = nlohmann::json; @@ -104,7 +105,6 @@ class FALCON { }; static void create_falcon_model(FFModel &ff, - InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, int num_pipeline_stages, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e4cd54192d..fd2b7fe4f9 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -20,7 +20,6 @@ namespace FlexFlow { using namespace Legion; void LLAMA::create_llama_model(FFModel &ff, - InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, @@ 
-190,9 +189,10 @@ void LLAMA::create_llama_model(FFModel &ff, } } + InferenceManager *im = InferenceManager::get_inference_manager(); // Compile the model std::cout << "------start compile ----------" << std::endl; - im.compile_model_and_allocate_buffer(&ff); + im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, llama_config.n_heads, @@ -202,7 +202,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::cout << "------load weight finished----------" << std::endl; // init operators - im.init_operators_inference(&ff); + im->init_operators_inference(&ff); } }; // namespace FlexFlow diff --git a/inference/models/llama.h b/inference/models/llama.h index 6f80194d72..61d8908d0c 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -17,6 +17,7 @@ #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include #include using json = nlohmann::json; @@ -103,7 +104,6 @@ class LLAMA { }; static void create_llama_model(FFModel &ff, - InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 05cee2bf9d..2cdffe2715 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -20,7 +20,6 @@ namespace FlexFlow { using namespace Legion; void OPT::create_opt_model(FFModel &ff, - InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, @@ -224,7 +223,8 @@ void OPT::create_opt_model(FFModel &ff, //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; - im.compile_model_and_allocate_buffer(&ff); + InferenceManager *im = InferenceManager::get_inference_manager(); + im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, opt_config.num_attention_heads, @@ -233,7 +233,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.num_attention_heads); fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------finished loading weights----------" << std::endl; - im.init_operators_inference(&ff); + im->init_operators_inference(&ff); } }; // namespace FlexFlow diff --git a/inference/models/opt.h b/inference/models/opt.h index d5fa845cd5..45ee6e6181 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -17,6 +17,7 @@ #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include #include using json = nlohmann::json; @@ -105,7 +106,6 @@ class OPT { }; static void create_opt_model(FFModel &ff, - InferenceManager &im, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index a34c1b6a84..99131edb34 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -43,10 +43,7 @@ void parse_input_args(char **argv, FilePaths &paths, ModelTypes &model_types, bool &use_full_precision, - bool &verbose, - int &data_parallelism_degree, - int &tensor_parallelism_degree, - int &pipeline_parallelism_degree) { + bool &verbose) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -117,21 +114,6 @@ void parse_input_args(char **argv, paths.output_file_path = std::string(argv[++i]); 
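The same convention now applies to every model builder: instead of receiving an InferenceManager reference, the function fetches the process-wide singleton, compiles the graph, loads the weights, and initializes the inference operators. A new model added after this patch would end with the same tail; the sketch below is schematic, with the model name and the weight-loading step left as placeholders:

void MYMODEL::create_mymodel_model(FFModel &ff,
                                   std::string const &model_config_file_path,
                                   std::string const &weight_file_path,
                                   InferenceMode mode,
                                   bool use_full_precision) {
  // ... build the layer graph on `ff`, as falcon.cc / llama.cc / opt.cc do ...

  // Compile the model against the process-wide InferenceManager.
  InferenceManager *im = InferenceManager::get_inference_manager();
  im->compile_model_and_allocate_buffer(&ff);

  // Load the weights with FileDataLoader, mirroring the existing models
  // (constructor arguments are model specific and omitted here).

  // Initialize the per-operator inference state.
  im->init_operators_inference(&ff);
}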
continue; } - // data parallelism degree - if (!strcmp(argv[i], "-data-parallelism-degree")) { - data_parallelism_degree = std::stoi(argv[++i]); - continue; - } - // tensor parallelism degree - if (!strcmp(argv[i], "-tensor-parallelism-degree")) { - tensor_parallelism_degree = std::stoi(argv[++i]); - continue; - } - // pipeline parallelism degree - if (!strcmp(argv[i], "-pipeline-parallelism-degree")) { - pipeline_parallelism_degree = std::stoi(argv[++i]); - continue; - } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; @@ -160,20 +142,10 @@ void FlexFlow::top_level_task(Task const *task, InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args(argv, - argc, - file_paths, - model_types, - use_full_precision, - verbose, - data_parallelism_degree, - tensor_parallelism_degree, - pipeline_parallelism_degree); - ffconfig.data_parallelism_degree = data_parallelism_degree; - ffconfig.tensor_parallelism_degree = tensor_parallelism_degree; - ffconfig.pipeline_parallelism_degree = pipeline_parallelism_degree; - assert(data_parallelism_degree * tensor_parallelism_degree * - pipeline_parallelism_degree == + parse_input_args( + argv, argc, file_paths, model_types, use_full_precision, verbose); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); if (file_paths.ssm_weight_file_paths.size() == 0) { @@ -200,17 +172,21 @@ void FlexFlow::top_level_task(Task const *task, // Create SentencePiece tokenizer or OPT tokenizer SamplingConfig samplingConfig; - InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); - RequestManager rm(model_types.llm_model_type, - file_paths.tokenizer_file_path, - /*verbose*/ verbose, - file_paths.output_file_path); + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestManager *rm = RequestManager::get_request_manager(); + rm->register_tokenizer(model_types.llm_model_type, + file_paths.tokenizer_file_path); + rm->register_output_filepath(file_paths.output_file_path); + // InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); + // RequestManager rm(model_types.llm_model_type, + // file_paths.tokenizer_file_path, + // /*verbose*/ verbose, + // file_paths.output_file_path); // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); if (model_types.llm_model_type == ModelType::LLAMA) { LLAMA::create_llama_model(tree_model, - im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, TREE_VERIFY_MODE, @@ -218,7 +194,6 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision); } else if (model_types.llm_model_type == ModelType::OPT) { OPT::create_opt_model(tree_model, - im, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, TREE_VERIFY_MODE, @@ -243,7 +218,6 @@ void FlexFlow::top_level_task(Task const *task, FFModel &beam_model = ssm_models[ssm_id]; if (model_types.ssm_model_types[ssm_id] == ModelType::LLAMA) { LLAMA::create_llama_model(beam_model, - im, file_paths.ssm_config_file_paths[ssm_id], file_paths.ssm_weight_file_paths[ssm_id], BEAM_SEARCH_MODE, @@ -251,7 +225,6 @@ void FlexFlow::top_level_task(Task const *task, use_full_precision); } else if (model_types.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, - im, file_paths.ssm_config_file_paths[ssm_id], file_paths.ssm_weight_file_paths[ssm_id], BEAM_SEARCH_MODE, @@ -260,8 +233,7 
@@ void FlexFlow::top_level_task(Task const *task, assert(false && "Invalid SSM model type passed."); } - int beam_model_id = rm.register_new_model(&beam_model); - ssm_model_ids.push_back(beam_model_id); + rm->register_ssm_model(&beam_model); } // Register requests from prompt file @@ -278,12 +250,10 @@ void FlexFlow::top_level_task(Task const *task, std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - rm.register_new_request(text, 128 /*max_sequence_length*/); + tree_model.generate(text, 128 /*max_sequence_length*/); } } - im.spec_inference_loop(&tree_model, rm, total_num_requests, ssm_model_ids); - // Execution fence { Future future = runtime->issue_execution_fence(ctx); diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index d7f1b70232..1c3103683f 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -15,8 +15,8 @@ #include "flexflow/flexflow_c.h" #include "flexflow/dataloader.h" -#include "flexflow/inference.h" #include "flexflow/mapper.h" +#include "flexflow/request_manager.h" using namespace Legion; using namespace FlexFlow; diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index ca6d574501..3d08eb0bcc 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -283,6 +283,13 @@ void FFMapper::select_task_options(const MapperContext ctx, output.initial_proc = all_cpus[0]; return; } + if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID)) { + output.initial_proc = all_cpus[0]; + return; + } if (task.task_id == TOP_LEVEL_TASK_ID) { output.initial_proc = all_cpus[0]; // control replicate top level task diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index bb3eaf8f52..c7217bb700 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -285,7 +285,7 @@ void Aggregate::forward(FFModel const &ff) { } FutureMap Aggregate::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 5ec8ab6857..5190983148 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -260,7 +260,7 @@ void AggregateSpec::forward(FFModel const &ff) { FutureMap AggregateSpec::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index c1bbb65f1e..b877a9f96d 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -29,6 +29,7 @@ using Legion::ArgumentMap; using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::InlineLauncher; @@ -250,7 +251,7 @@ void ArgTopK::forward(FFModel const &ff) { } FutureMap ArgTopK::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -265,12 +266,13 @@ FutureMap ArgTopK::inference(FFModel const &ff, << std::endl; */ IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, parallel_is, - TaskArgument(&bc, sizeof(BatchConfig)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, 
machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -300,7 +302,12 @@ InferenceResult assert(regions.size() == 2); assert(task->regions.size() == 2); // const ArgTopK* topk = (const ArgTopK*) task->args; - BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + // Directly return for empty batch config + InferenceResult ir; + return ir; + } ArgTopKMeta const *m = *((ArgTopKMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 754337448e..8598a71d50 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -254,7 +254,7 @@ void ArgMax::forward(FFModel const &ff) { } FutureMap ArgMax::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -270,12 +270,13 @@ FutureMap ArgMax::inference(FFModel const &ff, if (beam_search) { IndexLauncher launcher(ARGMAX_BEAM_INF_TASK_ID, parallel_is, - TaskArgument(&bc, sizeof(BatchConfig)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_WRITE, @@ -307,12 +308,13 @@ FutureMap ArgMax::inference(FFModel const &ff, } else { IndexLauncher launcher(ARGMAX_NORM_INF_TASK_ID, parallel_is, - TaskArgument(&bc, sizeof(BatchConfig)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_WRITE, @@ -344,7 +346,12 @@ BeamInferenceResult Runtime *runtime) { assert(regions.size() == 4); assert(task->regions.size() == 4); - BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + // Directly return for empty batch config + BeamInferenceResult ir; + return ir; + } ArgMaxMeta const *m = *((ArgMaxMeta **)task->local_args); GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( @@ -378,8 +385,13 @@ InferenceResult Runtime *runtime) { assert(regions.size() == 3); assert(task->regions.size() == 3); - BatchConfig const *bc = (BatchConfig *)task->args; ArgMaxMeta const *m = *((ArgMaxMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + // Directly return for empty batch config + InferenceResult ir; + return ir; + } GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); diff --git a/src/ops/attention.cc b/src/ops/attention.cc index ca709bdc51..23bd98c648 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -574,7 +574,7 @@ void MultiHeadAttention::forward(FFModel const &ff) { FutureMap MultiHeadAttention::inference( FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 0920105acc..1c71e69c9d 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -29,6 +29,7 @@ 
using Legion::ArgumentMap; using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::InlineLauncher; @@ -283,7 +284,7 @@ void BeamTopK::forward(FFModel const &ff) { } FutureMap BeamTopK::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -295,17 +296,15 @@ FutureMap BeamTopK::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher( - BEAM_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument( - &bc, std::max(sizeof(BatchConfig), sizeof(BeamSearchBatchConfig))), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - + IndexLauncher launcher(BEAM_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -342,10 +341,16 @@ BeamInferenceResult assert(regions.size() == 4); assert(task->regions.size() == 4); - BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; + // BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); // std::cout << "beam search topk inference: " // << "\n"; + if (bc.num_tokens == 0) { + BeamInferenceResult ir; + return ir; + } BeamTopKMeta const *m = *((BeamTopKMeta **)task->local_args); Domain in1_domain = runtime->get_index_space_domain( @@ -374,14 +379,12 @@ BeamInferenceResult // embedding size: eg. 
4096 int length = in1_domain.hi()[0] - in1_domain.lo()[0] + 1; - int k = - out2_domain.hi()[0] - out2_domain.lo()[0] + 1; /*TODO: This prints to 5*/ + // int k = out2_domain.hi()[0] - out2_domain.lo()[0] + 1; // total token nums - size_t tokens_per_request = in1_domain.hi()[1] - in1_domain.lo()[1] + 1; - int batch_size = bc->num_active_tokens(); - // std::cout << "beam search topk params: " << length << ", " << k << ", " - // << batch_size << "\n"; + // size_t tokens_per_request = in1_domain.hi()[1] - in1_domain.lo()[1] + 1; + // size_t batch_size = in1_domain.get_volume() / length; + size_t batch_size = bc.num_active_tokens(); // std::vector beam_width; // std::unordered_map sub_requests = bc->sub_requests; // for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { @@ -395,7 +398,7 @@ BeamInferenceResult // need meta for: how many sub requests in a main request BeamTopK::forward_kernel_wrapper(m, - bc, + &bc, input, value_ptr, index_ptr, diff --git a/src/ops/cast.cc b/src/ops/cast.cc index 3adf85a435..d98a54fe62 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -225,7 +225,7 @@ void Cast::forward(FFModel const &ff) { } FutureMap Cast::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 1535cfcd77..0721009bbb 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -527,7 +527,7 @@ void ElementBinary::forward(FFModel const &ff) { FutureMap ElementBinary::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 69533db53d..30397830a7 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -420,7 +420,7 @@ void ElementUnary::forward(FFModel const &ff) { FutureMap ElementUnary::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 832e3e3deb..409dcb398e 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -459,7 +459,7 @@ void Embedding::forward(FFModel const &ff) { } FutureMap Embedding::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 06e007abef..c8b0ec0f26 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -26,6 +26,7 @@ using Legion::ArgumentMap; using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::PhysicalRegion; @@ -656,7 +657,7 @@ void Experts::forward(FFModel const &ff) { } FutureMap Experts::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -669,15 +670,16 @@ FutureMap Experts::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ - int num_active_tokens = bc.num_active_tokens(); + // int num_active_tokens = bc->num_active_tokens(); IndexLauncher 
launcher(EXPERTS_INF_TASK_ID, parallel_is, - TaskArgument(&num_active_tokens, sizeof(int)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); // expert predictions launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, @@ -731,8 +733,8 @@ void Experts::inference_task(Task const *task, assert(regions.size() == task->regions.size()); ExpertsMeta const *m = *((ExpertsMeta **)task->local_args); - int num_active_tokens = *(int *)task->args; - if (num_active_tokens == 0) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { return; } @@ -1056,7 +1058,7 @@ void Experts::inference_task(Task const *task, output_ptr, weights_ptr, bias_ptr, - num_active_tokens, + bc->num_active_tokens(), chosen_experts, batch_size, out_dim); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index cf01f5bd1e..1d5db2f461 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -470,7 +470,7 @@ void FusedOp::forward(FFModel const &ff) { } FutureMap FusedOp::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -484,16 +484,17 @@ FutureMap FusedOp::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig // so we transfer the maximum of them - size_t batch_config_size = - std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + // size_t batch_config_size = + // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, - TaskArgument(&bc, batch_config_size), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 02a4995b0f..9b81836de5 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -44,8 +44,10 @@ namespace FlexFlow { using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::LogicalPartition; using Legion::LogicalRegion; +using Legion::Memory; using Legion::PhysicalRegion; using Legion::Runtime; using Legion::Task; @@ -472,7 +474,13 @@ __host__ void // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; - BatchConfig const *bc = (BatchConfig *)task->args; + // BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == @@ -813,8 +821,10 @@ __host__ void assert(fused->op_num_outputs[op] == 1); TreeIncMultiHeadSelfAttentionMeta *m = (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - TreeVerifyBatchConfig const *tree_bc = - (TreeVerifyBatchConfig *)task->args; + // TreeVerifyBatchConfig const *tree_bc = + // (TreeVerifyBatchConfig *)task->args; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); 
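The same two-sided pattern recurs in every operator touched below: the batch config no longer travels as a by-value TaskArgument but as a Legion future attached to the launcher, and the task body recovers it and returns early when the batch is empty. Schematically, with the task id as a placeholder and relying on the Legion/FlexFlow declarations already in the tree:

// Launcher side: empty inline payload, batch config attached as a future.
IndexLauncher launcher(MY_OP_INF_TASK_ID,      // placeholder task id
                       parallel_is,
                       TaskArgument(nullptr, 0),
                       argmap,
                       Predicate::TRUE_PRED,
                       false /*must*/,
                       0 /*mapper_id*/,
                       machine_view_hash);
launcher.add_future(bc);                       // bc is a BatchConfigFuture

// Task side: recover the config from the first future, skip empty batches.
BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
if (bc->num_tokens == 0) {
  return;                                      // nothing scheduled this step
}
// Speculative and verify operators instead materialize the derived config:
//   BeamSearchBatchConfig const &beam_bc =
//       Future(task->futures[0]).get_result<BeamSearchBatchConfig>();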
GenericTensorAccessorR biases; if (*m->bias) { @@ -823,7 +833,7 @@ __host__ void } TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - tree_bc, + &tree_bc, task->index_point.point_data[0], my_input_accessor[0], my_weight_accessor[0], @@ -836,8 +846,10 @@ __host__ void assert(fused->op_num_outputs[op] == 1); SpecIncMultiHeadSelfAttentionMeta const *m = (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - BeamSearchBatchConfig const *beam_bc = - (BeamSearchBatchConfig *)task->args; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); GenericTensorAccessorR biases; if (*m->bias) { @@ -846,7 +858,7 @@ __host__ void } SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - beam_bc, + &beam_bc, task->index_point.point_data[0], my_input_accessor[0], my_weight_accessor[0], @@ -872,8 +884,8 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::forward_kernel_wrapper( - m, my_input_accessor[0], my_output_accessor[0]); + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index c805b5fb29..f2f94234c3 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -312,7 +312,7 @@ void Group_by::forward(FFModel const &ff) { } FutureMap Group_by::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 07598f99ea..aa2310b0f2 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -35,6 +35,7 @@ using Legion::ArgumentMap; using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::Machine; @@ -648,7 +649,7 @@ void IncMultiHeadSelfAttention::forward(FFModel const &ff) { FutureMap IncMultiHeadSelfAttention::inference( FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -660,17 +661,18 @@ FutureMap IncMultiHeadSelfAttention::inference( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); int idx = 0; - log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", - bc.num_tokens, - bc.num_active_requests()); + // log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + // bc->num_tokens, + // bc->num_active_requests()); IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, - TaskArgument(&bc, sizeof(BatchConfig)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -718,7 +720,15 @@ void IncMultiHeadSelfAttention::inference_task( assert(task->regions.size() == regions.size()); - BatchConfig const *bc = (BatchConfig *)task->args; + // BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = 
BatchConfig::from_future(task->futures[0]); + log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_tokens == 0) { + return; + } + IncMultiHeadSelfAttentionMeta const *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); diff --git a/src/ops/inc_multiquery_self_attention.cc b/src/ops/inc_multiquery_self_attention.cc index 6ce448c9ec..6aa6042b1a 100644 --- a/src/ops/inc_multiquery_self_attention.cc +++ b/src/ops/inc_multiquery_self_attention.cc @@ -35,6 +35,7 @@ using Legion::ArgumentMap; using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::Machine; @@ -518,7 +519,7 @@ void IncMultiQuerySelfAttention::forward(FFModel const &ff) { FutureMap IncMultiQuerySelfAttention::inference( FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -530,17 +531,18 @@ FutureMap IncMultiQuerySelfAttention::inference( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); int idx = 0; - log_inc_mqa.debug("BatchConfig, num_tokens: %d, num_requests: %d", - bc.num_tokens, - bc.num_active_requests()); + // log_inc_mqa.debug("BatchConfig, num_tokens: %d, num_requests: %d", + // bc->num_tokens, + // bc->num_active_requests()); IndexLauncher launcher(INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID, parallel_is, - TaskArgument(&bc, sizeof(BatchConfig)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -575,7 +577,11 @@ void IncMultiQuerySelfAttention::inference_task( assert(task->regions.size() == regions.size()); - BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + IncMultiQuerySelfAttentionMeta const *m = *((IncMultiQuerySelfAttentionMeta **)task->local_args); diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 0124c827f3..b9b3abe0f1 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -404,7 +404,7 @@ void LayerNorm::forward(FFModel const &ff) { } FutureMap LayerNorm::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/linear.cc b/src/ops/linear.cc index c5903c1e74..2376f80bec 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -13,6 +13,7 @@ using Legion::ArgumentMap; using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::InlineLauncher; @@ -557,7 +558,7 @@ void Linear::forward(FFModel const &ff) { } FutureMap Linear::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -572,12 +573,13 @@ FutureMap Linear::inference(FFModel const &ff, << std::endl; */ IndexLauncher launcher(LINEAR_INF_TASK_ID, parallel_is, - TaskArgument(&bc, sizeof(BatchConfig)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + 
launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -616,7 +618,7 @@ void Linear::inference_task(Task const *task, Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); LinearMeta const *m = *((LinearMeta **)task->local_args); - BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); assert(regions.size() == (3 + static_cast(m->use_bias))); assert(task->regions.size() == (3 + static_cast(m->use_bias))); if (m->quantization_type == DT_NONE) { diff --git a/src/ops/noop.cc b/src/ops/noop.cc index 2b54bdf302..da2d4922e3 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -259,7 +259,7 @@ void NoOp::init(FFModel const &ff) { void NoOp::forward(FFModel const &ff) {} FutureMap NoOp::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index e0076b5202..5529abba20 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -328,7 +328,7 @@ void RMSNorm::forward(FFModel const &ff) { } FutureMap RMSNorm::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 8c01464042..66b3420a39 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -237,7 +237,7 @@ void Sampling::forward(FFModel const &ff) { } FutureMap Sampling::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 90aef807e2..450f7c009a 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -234,7 +234,7 @@ OpMeta *Softmax::init_task(Task const *task, } FutureMap Softmax::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index e765960985..fe241bb8de 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -35,6 +35,7 @@ using Legion::ArgumentMap; using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::Machine; @@ -599,7 +600,7 @@ void SpecIncMultiHeadSelfAttention::forward(FFModel const &ff) { FutureMap SpecIncMultiHeadSelfAttention::inference( FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -611,16 +612,15 @@ FutureMap SpecIncMultiHeadSelfAttention::inference( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); int idx = 0; - IndexLauncher launcher( - SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - parallel_is, - TaskArgument( - &bc, std::max(sizeof(BatchConfig), sizeof(BeamSearchBatchConfig))), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher 
launcher(SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -663,7 +663,13 @@ void SpecIncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(task->regions.size() == regions.size()); - BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; + // BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + if (bc.num_tokens == 0) { + return; + } + SpecIncMultiHeadSelfAttentionMeta const *m = *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args); assert((*m->bias ? regions.size() == 4 : regions.size() == 3)); @@ -699,11 +705,11 @@ void SpecIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, bc, task->index_point.point_data[0], input, weight, output, biases); + m, &bc, task->index_point.point_data[0], input, weight, output, biases); // print_tensor(input.get_float_ptr(), 20, "attention input"); // print_tensor(output.get_float_ptr(), 20, "attention output"); - // if(bc->beam_slots.at(0).current_depth == 1){ + // if(bc.beam_slots.at(0).current_depth == 1){ // print_beam_tensor(input.get_float_ptr(), 50, 4096, 40, "mha topk // input"); print_beam_tensor(output.get_float_ptr(), 50, 4096, 40, // "mha topk output"); diff --git a/src/ops/split.cc b/src/ops/split.cc index a9a5000f3d..9298850a99 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -247,7 +247,7 @@ void Split::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } FutureMap Split::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 45fdb7a3db..d76ad75167 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -264,7 +264,7 @@ void TopK::forward(FFModel const &ff) { } FutureMap TopK::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 105bd41647..7a7ea4f366 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -35,6 +35,7 @@ using Legion::ArgumentMap; using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::Machine; @@ -649,7 +650,7 @@ void TreeIncMultiHeadSelfAttention::forward(FFModel const &ff) { FutureMap TreeIncMultiHeadSelfAttention::inference( FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -661,18 +662,15 @@ FutureMap TreeIncMultiHeadSelfAttention::inference( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); int idx = 0; - log_tree_verify.debug( - "TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d", - bc.num_tokens, - bc.num_active_requests()); IndexLauncher 
launcher(TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, parallel_is, - TaskArgument(&bc, sizeof(TreeVerifyBatchConfig)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -718,7 +716,17 @@ void TreeIncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(task->regions.size() == regions.size()); - TreeVerifyBatchConfig const *bc = (TreeVerifyBatchConfig *)task->args; + // TreeVerifyBatchConfig const *bc = (TreeVerifyBatchConfig *)task->args; + TreeVerifyBatchConfig const &bc = + Future(task->futures[0]).get_result(); + log_tree_verify.debug( + "TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d", + bc.num_tokens, + bc.num_active_requests()); + if (bc.num_tokens == 0) { + return; + } + TreeIncMultiHeadSelfAttentionMeta *m = *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args); assert((*m->bias ? regions.size() == 4 : regions.size() == 3)); @@ -760,7 +768,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, bc, task->index_point.point_data[0], input, weight, output, biases); + m, &bc, task->index_point.point_data[0], input, weight, output, biases); #ifdef INFERENCE_TESTS printf("Checking TreeIncMultiHeadSelfAttention computations...\n"); @@ -807,7 +815,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( size_t effective_batch_size = max_sequence_length * batch_size; float inputs_arr[data_dim][effective_batch_size] = {0}; - for (size_t i = 0; i < data_dim * bc->num_active_tokens(); i++) { + for (size_t i = 0; i < data_dim * bc.num_active_tokens(); i++) { size_t data_index = i % data_dim; size_t token_index = i / data_dim; assert(data_index < data_dim); @@ -839,16 +847,16 @@ void TreeIncMultiHeadSelfAttention::inference_task( // column-major order. 
// printf("m->kProjSize: %i, TreeVerifyBatchConfig::MAX_NUM_TOKENS: %i, " - // "bc->num_active_tokens(): %i, num_heads: %lli, + // "bc.num_active_tokens(): %i, num_heads: %lli, // TreeVerifyBatchConfig::MAX_NUM_REQUESTS: %i, " - // "bc->num_active_requests(): %i\n", m->kProjSize, - // TreeVerifyBatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), + // "bc.num_active_requests(): %i\n", m->kProjSize, + // TreeVerifyBatchConfig::MAX_NUM_TOKENS, bc.num_active_tokens(), // num_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS, - // bc->num_active_requests()); - // for (int t=0; t < bc->num_active_tokens(); t++) { + // bc.num_active_requests()); + // for (int t=0; t < bc.num_active_tokens(); t++) { // printf("token %i has request_index: %li and token_position: %li\n", - // t, bc->token2ids.token_indexes[t].request_index, - // bc->token2ids.token_indexes[t].token_position); + // t, bc.token2ids.token_indexes[t].request_index, + // bc.token2ids.token_indexes[t].token_position); // } // ============================================================================= @@ -908,7 +916,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() << std::endl; std::cout << "Torch input size: " << torch_input.sizes() << std::endl; - std::cout << "Number of active tokens: " << bc->num_active_tokens() + std::cout << "Number of active tokens: " << bc.num_active_tokens() << std::endl; */ // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; @@ -920,10 +928,10 @@ void TreeIncMultiHeadSelfAttention::inference_task( torch::Tensor qkv_projs = torch::einsum( "ijkl,im->jmkl", {torch_w_qkv, - torch_input.index({Slice(), Slice(0, bc->num_active_tokens())})}); + torch_input.index({Slice(), Slice(0, bc.num_active_tokens())})}); // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; assert(qkv_projs.sizes()[0] == m->qProjSize); - assert(qkv_projs.sizes()[1] == bc->num_active_tokens() && + assert(qkv_projs.sizes()[1] == bc.num_active_tokens() && qkv_projs.sizes()[1] <= effective_batch_size); assert(qkv_projs.sizes()[2] == 3); assert(qkv_projs.sizes()[3] == num_heads); @@ -936,24 +944,24 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(QKVProjArray_cpu != nullptr); std::vector QKVProjArray_converted_shape = { - m->qProjSize, bc->num_active_tokens(), 3, (int)num_heads}; + m->qProjSize, bc.num_active_tokens(), 3, (int)num_heads}; float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc->num_active_tokens() * 3 * num_heads, sizeof(float)); + m->qProjSize * bc.num_active_tokens() * 3 * num_heads, sizeof(float)); // skip over padding at the end of QKVProjArray_cpu // convert from column order to 3D matrix because torch cannot automatically // import matrices flattened in column order - for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_heads; i++) { + for (size_t i = 0; i < proj_sum * bc.num_active_tokens() * num_heads; i++) { int proj_size_index = i % m->qProjSize; - int head_index = i / (proj_sum * bc->num_active_tokens()); + int head_index = i / (proj_sum * bc.num_active_tokens()); int token_index = - ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % - bc->num_active_tokens(); - int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / - (m->qProjSize * bc->num_active_tokens()); + ((i - head_index * proj_sum * bc.num_active_tokens()) / m->qProjSize) % + bc.num_active_tokens(); + int qkv_offset = (i - head_index * proj_sum * bc.num_active_tokens()) / + 
(m->qProjSize * bc.num_active_tokens()); assert(proj_size_index < proj_sum); assert(head_index < num_heads); - assert(token_index < bc->num_active_tokens()); + assert(token_index < bc.num_active_tokens()); assert(qkv_offset < 3); set_value_row_major(QKVProjArray_converted, QKVProjArray_converted_shape, @@ -962,7 +970,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( } torch::Tensor QKVProjArray_torch = torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc->num_active_tokens(), 3, num_heads}, + {m->qProjSize, bc.num_active_tokens(), 3, num_heads}, torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- @@ -989,15 +997,15 @@ void TreeIncMultiHeadSelfAttention::inference_task( // ----------------------- C++ operations & checks -------------------------- // Store projections into k/v cache arrays for (size_t h = 0; h < num_heads; h++) { - for (size_t t = 0; t < bc->num_active_tokens(); t++) { + for (size_t t = 0; t < bc.num_active_tokens(); t++) { for (size_t d = 0; d < m->kProjSize; d++) { size_t kcache_idx = d * MAX_SEQ_LEN * m->num_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].abs_depth_in_request * + bc.tokensInfo[t].abs_depth_in_request * m->num_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + h * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].request_index; + bc.tokensInfo[t].request_index; m->kcache[kcache_idx] = qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) .item(); @@ -1005,11 +1013,11 @@ void TreeIncMultiHeadSelfAttention::inference_task( for (size_t d = 0; d < m->vProjSize; d++) { size_t vcache_idx = d * MAX_SEQ_LEN * m->num_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].abs_depth_in_request * + bc.tokensInfo[t].abs_depth_in_request * m->num_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + h * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].request_index; + bc.tokensInfo[t].request_index; m->vcache[vcache_idx] = qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) .item(); @@ -1036,8 +1044,8 @@ void TreeIncMultiHeadSelfAttention::inference_task( std::vector req_idxs; std::vector r_first_idx; std::vector r_num_tokens; - for (size_t t = 0; t < bc->num_active_tokens(); t++) { - size_t rid = bc->tokensInfo[t].request_index; + for (size_t t = 0; t < bc.num_active_tokens(); t++) { + size_t rid = bc.tokensInfo[t].request_index; if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { req_idxs.push_back(rid); r_first_idx.push_back(t); @@ -1048,11 +1056,11 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(req_idxs.size() == r_first_idx.size() && r_first_idx.size() == r_num_tokens.size()); } - assert(req_idxs.size() == bc->num_active_requests()); + assert(req_idxs.size() == bc.num_active_requests()); assert(std::accumulate(r_num_tokens.begin(), r_num_tokens.end(), decltype(r_num_tokens)::value_type(0)) == - bc->num_active_tokens()); + bc.num_active_tokens()); // ----------------------- Loading CUDA results for this step --------------- float *keyCache_cpu = download_tensor( @@ -1118,7 +1126,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( // ----------------------- Comparing C++ & CUDA results --------------------- // std::cout << "kcache differences:" << std::endl; - // for (int i=0; i < bc->num_active_requests() + 1; i++) { + // for (int i=0; i < bc.num_active_requests() + 1; i++) { // for (int j=0; j < num_heads; j++) { // for (int l=0; l < m->kProjSize; l++) { // for (int k=0; k < MAX_SEQ_LEN; k++) { @@ -1143,7 
+1151,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( // } // std::cout << "keyCache from CUDA:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { + // for (int i=0; ikProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { @@ -1162,7 +1170,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( // } // std::cout << "valueCache from CUDA:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { + // for (int i=0; ivProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { @@ -1183,7 +1191,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( // printf("\n"); // std::cout << "C++ kcache:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { + // for (int i=0; ikProjSize; l++) { // for (int k=0; k < MAX_SEQ_LEN; k++) { @@ -1202,7 +1210,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( // } // std::cout << "C++ vcache:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { + // for (int i=0; ivProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { @@ -1289,12 +1297,12 @@ void TreeIncMultiHeadSelfAttention::inference_task( qkv_projs.sizes()[1], qkv_projs.sizes()[3]}); - torch::Tensor qk_products[bc->num_active_requests()]; - torch::Tensor qk_softmax[bc->num_active_requests()]; - torch::Tensor attn_heads[bc->num_active_requests()]; + torch::Tensor qk_products[bc.num_active_requests()]; + torch::Tensor qk_softmax[bc.num_active_requests()]; + torch::Tensor attn_heads[bc.num_active_requests()]; torch::Tensor cpp_output = - torch::zeros({m->oProjSize, bc->num_active_tokens()}); + torch::zeros({m->oProjSize, bc.num_active_tokens()}); // ----------------------- Loading CUDA results for this step --------------- float *qk_prods_cpu = download_tensor( @@ -1317,14 +1325,14 @@ void TreeIncMultiHeadSelfAttention::inference_task( // ----------------------- Main loop (request by request) ------------------- size_t qk_prods_cpu_offset = 0; - for (size_t r = 0; r < bc->num_active_requests(); r++) { + for (size_t r = 0; r < bc.num_active_requests(); r++) { // Compute pre-request parameters size_t num_new_tokens = r_num_tokens[r]; int64_t rid = (int64_t)(req_idxs[r]); int64_t num_tokens_received_so_far = - (int64_t)(bc->requestsInfo[rid].token_start_offset + - bc->requestsInfo[rid].num_tokens_in_batch); - assert(num_new_tokens == bc->requestsInfo[rid].num_tokens_in_batch); + (int64_t)(bc.requestsInfo[rid].token_start_offset + + bc.requestsInfo[rid].num_tokens_in_batch); + assert(num_new_tokens == bc.requestsInfo[rid].num_tokens_in_batch); assert(num_tokens_received_so_far >= (int64_t)num_new_tokens); // ----------------------- C++ computations ------------------------------- @@ -1514,15 +1522,15 @@ void TreeIncMultiHeadSelfAttention::inference_task( std::cout << "CUDA:" <oProjSize; i++) { std::cout << torch_out_cuda.index({i, Slice(0, - (int64_t)bc->num_active_tokens())}) << std::endl; + (int64_t)bc.num_active_tokens())}) << std::endl; } */ - assert(torch::allclose( - torch_out_cuda.index( - {Slice(), Slice(0, (int64_t)bc->num_active_tokens())}), - cpp_output, - 1e-05, - 1e-05)); + assert( + torch::allclose(torch_out_cuda.index( + {Slice(), Slice(0, (int64_t)bc.num_active_tokens())}), + cpp_output, + 1e-05, + 1e-05)); // ============================================================================= // Cleanup diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 123e85c7c5..027d15c929 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -25,6 +25,7 @@ using Legion::ArgumentMap; using 
Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::FutureMap; using Legion::IndexLauncher; using Legion::LogicalPartition; @@ -181,7 +182,7 @@ void AllReduce::init_inference(FFModel const &ff, } FutureMap AllReduce::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { @@ -196,14 +197,15 @@ FutureMap AllReduce::inference(FFModel const &ff, size_t machine_view_hash = mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + IndexLauncher launcher(ALLREDUCE_INF_TASK_ID, batch_outputs[0]->parallel_is, - TaskArgument(NULL, 0), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -313,6 +315,26 @@ bool AllReduce::append_parallel_op_info( return true; } +/*static*/ +void AllReduce::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); +} + /*static*/ void AllReduce::forward_task(Task const *task, std::vector const ®ions, diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 198f450636..7c266c5392 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -208,7 +208,7 @@ void Combine::create_input_partition_inference( } FutureMap Combine::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 78742568c6..8d0d5e97c5 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -25,6 +25,17 @@ AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) namespace Kernels { namespace AllReduce { +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + assert(false && "To be implemented"); +} + void forward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 1ae9ee27b8..2c000137a1 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -24,6 +24,30 @@ AllReduceMeta::AllReduceMeta(FFHandler 
handle, AllReduce const *reduct) namespace Kernels { namespace AllReduce { +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_tokens * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + void forward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index aaa28b7576..353b3ce398 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -201,7 +201,7 @@ void Repartition::create_input_partition_inference( FutureMap Repartition::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 1d6130d6a6..5dca591328 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -209,7 +209,7 @@ void Reduction::init_inference(FFModel const &ff, } FutureMap Reduction::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 794db0f67f..20face74e8 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -207,7 +207,7 @@ void Replicate::init(FFModel const &ff) { } FutureMap Replicate::inference(FFModel const &ff, - BatchConfig const &bc, + BatchConfigFuture const &bc, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index c0e665b613..52b1660e53 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -21,6 +21,8 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_bc("BatchConfig"); +using Legion::Future; +using Legion::Memory; BatchConfig::BatchConfig() : num_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { @@ -35,6 +37,23 @@ BatchConfig::BatchConfig() : num_tokens(0) { } } +/*static*/ +BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future) { + BatchConfig const *bc = static_cast( + Future(future).get_buffer(Memory::SYSTEM_MEM)); + // Check future size + if (bc->get_mode() == INC_DECODING_MODE) { + assert(Future(future).get_untyped_size() == sizeof(BatchConfig)); + } else if (bc->get_mode() == BEAM_SEARCH_MODE) { + assert(Future(future).get_untyped_size() == sizeof(BeamSearchBatchConfig)); + } else if (bc->get_mode() == TREE_VERIFY_MODE) { + assert(Future(future).get_untyped_size() == sizeof(TreeVerifyBatchConfig)); + } else { + assert(false && "Unsupported inference mode"); + } + return bc; +} + InferenceMode BatchConfig::get_mode() const { return INC_DECODING_MODE; } diff --git 
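The hunks above replace the by-value BatchConfig task argument with a BatchConfigFuture attached to the launcher, so the batch contents can be produced asynchronously by the RequestManager and read back inside the task. A minimal sketch of that pattern, mirroring the Future::from_value, add_future, and task->futures calls as they appear in this patch (the Payload struct and EXAMPLE_TASK_ID below are illustrative placeholders, not part of FlexFlow):

    #include "legion.h"
    #include <cassert>
    #include <vector>
    using namespace Legion;

    enum { EXAMPLE_TASK_ID = 1234 };     // illustrative task id
    struct Payload { int num_tokens; };  // illustrative payload

    Future launch_with_future(Runtime *runtime, Context ctx, Payload const &p) {
      // Wrap the host-side value in a Future instead of copying it into TaskArgument
      Future payload_future = Future::from_value<Payload>(p);
      TaskLauncher launcher(EXAMPLE_TASK_ID, TaskArgument(nullptr, 0));
      launcher.add_future(payload_future);  // becomes task->futures[0] in the task body
      return runtime->execute_task(ctx, launcher);
    }

    void example_task(Task const *task,
                      std::vector<PhysicalRegion> const &regions,
                      Context ctx,
                      Runtime *runtime) {
      // Read the payload back out of the attached future
      Payload p = Future(task->futures[0]).get_result<Payload>();
      assert(p.num_tokens >= 0);
    }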
a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index adb8d9d706..cfcc938204 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -15,11 +15,11 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/graph.h" -#include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/ops/fused.h" #include "flexflow/ops/noop.h" #include "flexflow/parallel_ops/parallel_op.h" +#include "flexflow/request_manager.h" namespace FlexFlow { @@ -83,6 +83,18 @@ InferenceManager::InferenceManager(FFConfig const &_config, } } +InferenceManager *inference_manager_singleton = nullptr; + +/*static*/ +InferenceManager *InferenceManager::get_inference_manager() { + if (inference_manager_singleton == nullptr) { + FFConfig ffconfig; + inference_manager_singleton = + new InferenceManager(ffconfig, BatchConfig::MAX_NUM_TOKENS); + } + return inference_manager_singleton; +} + bool parallel_tensor_list_overlaps(std::vector const &list1, std::vector const &list2) { for (auto const &pt1 : list1) { @@ -289,14 +301,38 @@ MachineView *InferenceManager::get_machine_view(int mv_id) { FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfig const &bc) { - log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", - bc.get_mode(), - bc.num_active_tokens(), - bc.num_active_requests()); - - assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); - // We currently assume that the index-th batch will be placed - // on the device_index-th device (except for the experts layers) + if (bc.get_mode() == INC_DECODING_MODE) { + BatchConfigFuture bcf = Future::from_value(bc); + return inference(model, index, bcf); + } else if (bc.get_mode() == BEAM_SEARCH_MODE) { + BatchConfig const *bc_ptr = &bc; + BeamSearchBatchConfig const *bsbc_ptr = + static_cast(bc_ptr); + BeamSearchBatchConfigFuture bcf = + Future::from_value(*bsbc_ptr); + return inference(model, index, bcf); + } else if (bc.get_mode() == TREE_VERIFY_MODE) { + BatchConfig const *bc_ptr = &bc; + TreeVerifyBatchConfig const *tvbc_ptr = + static_cast(bc_ptr); + TreeVerifyBatchConfigFuture bcf = + Future::from_value(*tvbc_ptr); + return inference(model, index, bcf); + } else { + assert(false && "Unsupported inference mode"); + } +} + +FutureMap InferenceManager::inference(FFModel *model, + int index, + BatchConfigFuture const &bc) { + // log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", + // bc.get_mode(), + // bc.num_active_tokens(), + // bc.num_active_requests()); + // assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); + // We currently assume that the index-th batch will be placed + // on the device_index-th device (except for the experts layers) int batch_index = index % model->config.data_parallelism_degree; FutureMap fm; bool found_input_operator = false; @@ -432,43 +468,41 @@ void InferenceManager::spec_inference_loop(FFModel *model, } void InferenceManager::load_input_tokens_from_batch_config( - BatchConfig const &bc, ParallelTensor const input) { + BatchConfigFuture const &bc, ParallelTensor const input) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; - IndexLauncher launcher( - RM_LOAD_TOKENS_TASK_ID, - input->parallel_is, - TaskArgument( - &bc, std::max(sizeof(BeamSearchBatchConfig), sizeof(BatchConfig))), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher 
launcher(RM_LOAD_TOKENS_TASK_ID, + input->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement( input->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, input->region)); launcher.add_field(0, FID_DATA); runtime->execute_index_space(ctx, launcher); } -void InferenceManager::load_positions(BatchConfig const &bc, +void InferenceManager::load_positions(BatchConfigFuture const &bc, ParallelTensor position_input) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = position_input->machine_view.hash(); ArgumentMap argmap; - IndexLauncher launcher( - RM_LOAD_POSITION_TASK_ID, - position_input->parallel_is, - TaskArgument( - &bc, std::max(sizeof(BeamSearchBatchConfig), sizeof(BatchConfig))), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(RM_LOAD_POSITION_TASK_ID, + position_input->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(position_input->part, 0 /*projection id*/, WRITE_ONLY, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 66cad1f248..0eafd979c1 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -66,6 +66,7 @@ #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" +#include "flexflow/request_manager.h" #include "flexflow/substitution.h" #include "flexflow/utils/random_utils.h" #include "flexflow/utils/test_utils.h" @@ -3821,6 +3822,9 @@ FFConfig::FFConfig() { offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; + data_parallelism_degree = 1; + tensor_parallelism_degree = 1; + pipeline_parallelism_degree = 1; enable_sample_parallel = DefaultConfig::enableSampleParallel; enable_parameter_parallel = DefaultConfig::enableParameterParallel; enable_attribute_parallel = DefaultConfig::enableAttributeParallel; @@ -3862,9 +3866,6 @@ FFConfig::FFConfig() { .local_address_space() .only_kind(Processor::LOC_PROC) .count(); - data_parallelism_degree = 1; - tensor_parallelism_degree = 1; - pipeline_parallelism_degree = 1; Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; @@ -3945,6 +3946,21 @@ void FFConfig::parse_args(char **argv, int argc) { only_data_parallel = true; continue; } + // data parallelism degree + if (!strcmp(argv[i], "-data-parallelism-degree")) { + data_parallelism_degree = std::stoi(argv[++i]); + continue; + } + // tensor parallelism degree + if (!strcmp(argv[i], "-tensor-parallelism-degree")) { + tensor_parallelism_degree = std::stoi(argv[++i]); + continue; + } + // pipeline parallelism degree + if (!strcmp(argv[i], "-pipeline-parallelism-degree")) { + pipeline_parallelism_degree = std::stoi(argv[++i]); + continue; + } if ((!strcmp(argv[i], "--enable-parameter-parallel"))) { enable_parameter_parallel = true; continue; @@ -4111,6 +4127,90 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + // RequestManager prepare_next_batch + { + TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, + "RequestManager Prepare Next Batch"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + 
registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + BatchConfig, + RequestManager::prepare_next_batch_task>( + registrar, "RequestManager Prepare Next Batch Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + // RequestManager prepare_next_batch_beam + { + TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, + "RequestManager Prepare Next Batch (Beam)"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + BeamSearchBatchConfig, + RequestManager::prepare_next_batch_beam_task>( + registrar, "RequestManager Prepare Next Batch (Beam) Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime + ->register_task_variant( + registrar); + } + } + // RequestManager prepare_next_batch_init + { + TaskVariantRegistrar registrar( + RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, + "RequestManager Prepare Next Batch (Init Beam)"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + BeamSearchBatchConfig, + RequestManager::prepare_next_batch_init_task>( + registrar, "RequestManager Prepare Next Batch (Init Beam) Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime + ->register_task_variant( + registrar); + } + } + // RequestManager prepare_next_batch_verify + { + TaskVariantRegistrar registrar( + RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, + "RequestManager Prepare Next Batch (Verify)"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + TreeVerifyBatchConfig, + RequestManager::prepare_next_batch_verify_task>( + registrar, "RequestManager Prepare Next Batch (Verify) Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + TreeVerifyBatchConfig, + RequestManager::prepare_next_batch_verify_task>(registrar); + } + } // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, @@ -6008,6 +6108,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, + "AllReduce Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 478092727f..0856c1663f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include @@ -42,17 +42,33 @@ std::string LoadBytesFromFile(std::string const &path) { } RequestManager::RequestManager() - : verbose(false), next_available_guid(1000000), num_processed_requests(0) {} - -RequestManager::RequestManager(ModelType model_type, - std::string const &path, - bool _verbose, - std::string _output_filepath) - : verbose(_verbose), next_available_guid(1000000), - num_processed_requests(0), output_filepath(_output_filepath) { + : verbose(false), next_available_guid(1000000), num_processed_requests(0) { + { + // Initialize futures for spec infer + TreeVerifyBatchConfig tree_bc; + InferenceResult tree_ir; + TreeVerifyBatchConfigFuture tree_bcf = + Future::from_value(tree_bc); + InferenceResultFuture tree_irf = + Future::from_value(tree_ir); + last_tree_bcf = tree_bcf; + last_tree_irf = tree_irf; + } + { + // Initialize futures for incr decoding + BatchConfig bc; + InferenceResult ir; + BatchConfigFuture bcf = Future::from_value(bc); + InferenceResultFuture irf = Future::from_value(ir); + last_bcf = bcf; + last_irf = irf; + } +} +void RequestManager::register_tokenizer(ModelType type, + std::string const &path) { // bos id - this->model_type = model_type; + this->model_type = type; if (model_type == ModelType::LLAMA) { this->tokenizer_ = Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(path)); @@ -81,12 +97,15 @@ RequestManager::RequestManager(ModelType model_type, } } -int RequestManager::register_new_model(FFModel *model) { +void RequestManager::register_output_filepath( + std::string const &_output_filepath) { + this->output_filepath = _output_filepath; +} + +int RequestManager::register_ssm_model(FFModel *model) { int model_id = models.size(); models.push_back(model); std::cout << "Register new model with id: " << model_id << std::endl; - num_ssms++; - assert(models.size() == num_ssms); return model_id; } @@ -95,6 +114,10 @@ FFModel *RequestManager::get_model(int model_id) { return models[model_id]; } +size_t RequestManager::get_num_ssms() { + return models.size(); +} + RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, int max_sequence_length) { @@ -104,22 +127,38 @@ RequestManager::RequestGuid Request request; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - request.initial_len = prompt.size(); - request.tokens = prompt; - if (num_ssms == 0) { + if (prompt.size() > BatchConfig::MAX_PROMPT_LENGTH) { + std::cout << "Warning: too many tokens in prompt, only load up to " + << BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " + << prompt.size() << ".\n"; + // Truncate the prompt to MAX_NUM_TOKENS + // request.tokens.insert(request.tokens.end(), + // prompt.begin(), + // prompt.begin() + BatchConfig::MAX_PROMPT_LENGTH); + // request.initial_len = BatchConfig::MAX_PROMPT_LENGTH; + printf("tokens size: %zu\n", request.tokens.size()); + // assert(false); + return 0; + } else { + request.initial_len = prompt.size(); + request.tokens = prompt; + } + + if (get_num_ssms() == 0) { std::cout << "No small speculative model registered yet, using incremental " "decoding." 
<< std::endl; } else { - std::cout << "Num of models: " << num_ssms << std::endl; - for (int i = 0; i < num_ssms; i++) { + std::cout << "Num of models: " << get_num_ssms() << std::endl; + for (int i = 0; i < get_num_ssms(); i++) { BeamTree beam_tree = BeamTree{}; request.beam_trees.push_back(beam_tree); } } pending_request_queue.push(request); + all_requests[request.guid] = request; if (verbose) { std::cout << "new req: " << request.tokens.size() << std::endl; @@ -127,6 +166,15 @@ RequestManager::RequestGuid std::cout << i << " : " << request.tokens[i] << std::endl; } } + + GenerationResult gr; + gr.guid = request.guid; + gr.input_text = ""; + gr.input_tokens = prompt; + gr.output_text = ""; + gr.output_tokens = prompt; + request_generation_results[request.guid] = gr; + return request.guid; } @@ -142,27 +190,39 @@ RequestManager::RequestGuid request.tokens.push_back(this->model_bos_map.at(this->model_type)); std::vector tokens = this->tokenizer_->Encode(prompt); + if (tokens.size() > BatchConfig::MAX_PROMPT_LENGTH) { + std::cout << "Warning: too many tokens in prompt, only load up to " + << BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " + << tokens.size() << ".\n"; + // Truncate the prompt to MAX_NUM_TOKENS + // tokens.resize(BatchConfig::MAX_PROMPT_LENGTH); + printf("tokens size: %zu\n", tokens.size()); + // assert(false); + return 0; + } + for (int i = 0; i < tokens.size(); i++) { - std::cout << tokens.at(i) << "\n"; + std::cout << "[" << i << "]" << tokens.at(i) << "\n"; } // assert(false); request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); request.initial_len = request.tokens.size(); - if (num_ssms == 0) { + if (get_num_ssms() == 0) { std::cout << "No small speculative model registered yet, using incremental " "decoding." 
<< std::endl; } else { - std::cout << "Num of models: " << num_ssms << std::endl; - for (int i = 0; i < num_ssms; i++) { + std::cout << "Num of models: " << get_num_ssms() << std::endl; + for (int i = 0; i < get_num_ssms(); i++) { BeamTree beam_tree = BeamTree{}; request.beam_trees.push_back(beam_tree); } } pending_request_queue.push(request); + all_requests[request.guid] = request; { std::string output = "New request tokens:"; for (int i = 0; i < request.tokens.size(); i++) { @@ -170,13 +230,62 @@ RequestManager::RequestGuid } log_req_mgr.print("%s", output.c_str()); } + + GenerationResult gr; + gr.guid = request.guid; + gr.input_text = prompt; + gr.input_tokens = request.tokens; + gr.output_text = prompt; + gr.output_tokens = request.tokens; + request_generation_results[request.guid] = gr; return request.guid; } +bool RequestManager::is_request_completed(RequestGuid const &guid) { + const std::lock_guard lock(request_queue_mutex); + assert(all_requests.find(guid) != all_requests.end()); + Request const &request = all_requests[guid]; + return request.tokens.size() >= request.max_sequence_length; +} + +GenerationResult + RequestManager::get_generation_result(RequestGuid const &guid) { + const std::lock_guard lock(request_queue_mutex); + assert(request_generation_results.find(guid) != + request_generation_results.end()); + return request_generation_results[guid]; +} + size_t RequestManager::get_num_processed_requests() { return num_processed_requests; } +BatchConfigFuture + RequestManager::prepare_next_batch(BatchConfigFuture const &old_bc, + InferenceResultFuture const &result) { + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + launcher.add_future(old_bc); + launcher.add_future(result); + return runtime->execute_task(ctx, launcher); +} + +BatchConfig RequestManager::prepare_next_batch_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + InferenceResult const &result = + Future(task->futures[1]).get_result(); + return rm->prepare_next_batch(*bc, result); +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); @@ -184,7 +293,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, for (int i = 0; i < old_bc.num_tokens; i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; - Request &request = running_request_queue[guid]; + Request &request = all_requests[guid]; if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; @@ -205,8 +314,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, continue; } assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = - running_request_queue[old_bc.requestsInfo[i].request_guid]; + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; int processed_tokens = old_bc.requestsInfo[i].token_start_offset + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); @@ -221,6 +329,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, for (int i = 0; i < request.tokens.size(); i++) { std::cout << 
request.tokens.at(i) << "\n"; } + { + // update generation result and trigger future + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.output_tokens = request.tokens; + gr.output_text = output; + } log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; @@ -302,7 +417,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens < BatchConfig::MAX_NUM_TOKENS) { Request new_request = pending_request_queue.front(); pending_request_queue.pop(); - running_request_queue[new_request.guid] = new_request; + // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = @@ -335,6 +450,32 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } /* ----- Speculative Inference Specific functions ----- */ +BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( + BeamSearchBatchConfigFuture const &old_bc, + BeamInferenceResultFuture const &result) { + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + launcher.add_future(old_bc); + launcher.add_future(result); + return runtime->execute_task(ctx, launcher); +} + +BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + BeamInferenceResult const &result = + Future(task->futures[1]).get_result(); + return rm->prepare_next_batch_beam(bc, result); +} // update beam search metadata BeamSearchBatchConfig @@ -361,15 +502,16 @@ BeamSearchBatchConfig BeamSearchBatchConfig new_bc; new_bc.max_init_length = 0; new_bc.model_id = old_bc.model_id; - std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; + // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { if (old_bc.request_completed[i]) { continue; } - assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = - running_request_queue[old_bc.requestsInfo[i].request_guid]; + // Comment out this assertion since num_tokens_in_batch can be + // zero when beam search has reached required sequence length + // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; int processed_tokens = old_bc.requestsInfo[i].token_start_offset + old_bc.requestsInfo[i].num_tokens_in_batch; @@ -451,6 +593,36 @@ BeamSearchBatchConfig return new_bc; } +BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init( + TreeVerifyBatchConfigFuture const &old_bc, + InferenceResultFuture const &result, + int model_id) { + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + launcher.add_future(old_bc); + launcher.add_future(result); + launcher.add_future(Future::from_value(model_id)); + return runtime->execute_task(ctx, launcher); +} + +BeamSearchBatchConfig 
RequestManager::prepare_next_batch_init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + TreeVerifyBatchConfig const &bc = + Future(task->futures[0]).get_result(); + InferenceResult const &result = + Future(task->futures[1]).get_result(); + int model_id = Future(task->futures[2]).get_result(); + return rm->prepare_next_batch_init(bc, result, model_id); +} + BeamSearchBatchConfig RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, InferenceResult const &result, @@ -470,7 +642,7 @@ BeamSearchBatchConfig continue; } size_t guid = old_bc.requestsInfo[i].request_guid; - Request &request = running_request_queue[guid]; + Request &request = all_requests[guid]; // Verify this: get verified tokens from result std::vector> tree_outputs = @@ -540,6 +712,13 @@ BeamSearchBatchConfig request.guid, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); + { + // update generation result and trigger future + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.output_tokens = request.tokens; + gr.output_text = output; + } log_req_mgr.print("Final output: %s", output.c_str()); new_bc.request_completed[i] = true; num_processed_requests++; @@ -648,7 +827,7 @@ BeamSearchBatchConfig pending_request_queue.pop(); new_bc.max_init_length = std::max(new_bc.max_init_length, new_request.initial_len); - running_request_queue[new_request.guid] = new_request; + // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = @@ -656,6 +835,7 @@ BeamSearchBatchConfig (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + // add profile_info for the new request ProfileInfo profile_info; profile_info.decoding_steps = 0; @@ -665,6 +845,10 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].beam_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH; new_bc.beamRequestsInfo[i].current_depth = 1; + new_bc.beamRequestsInfo[i].max_depth = + std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, + BatchConfig::MAX_NUM_TOKENS - + new_bc.requestsInfo[i].num_tokens_in_batch - 1); for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; @@ -702,6 +886,33 @@ BeamSearchBatchConfig return new_bc; } +TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( + std::vector const &old_batches) { + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + for (auto const &bcf : old_batches) { + launcher.add_future(bcf); + } + return runtime->execute_task(ctx, launcher); +} + +TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + std::vector old_batches; + for (auto const &bcf : task->futures) { + old_batches.push_back(Future(bcf).get_result()); + } + return rm->prepare_next_batch_verify(old_batches); +} + TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { const std::lock_guard 
lock(request_queue_mutex); @@ -721,7 +932,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( continue; } size_t guid = old_batches.at(0).requestsInfo[i].request_guid; - Request &request = running_request_queue[guid]; + Request &request = all_requests[guid]; // Get the dfs tree std::vector>> @@ -767,7 +978,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; } - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -885,6 +1098,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } } + + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; } if (verbose) { @@ -937,8 +1152,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, int beam_size = old_bc.beamRequestsInfo[index].beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; - Request &request = - running_request_queue[old_bc.requestsInfo[index].request_guid]; + Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; if (depth == 1) { // store the last input into the tree; @@ -1009,7 +1223,8 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // new_bc.beamRequestsInfo[request_index].tokens[j] = // tree.treeLayers[depth].tokens[j]; // ? // } - assert(false); + // Do nothing + // assert(false); } else { std::set parents; std::set childs; @@ -1166,9 +1381,11 @@ std::vector> log_req_mgr.print("(%d, %d)", pair.first, pair.second); } - assert(inputSerializedTree.size() == outputSerializedTree.size()); + // It's safe to have inputSerializedTree.size() > outputSerializedTree.size() + // In this case the inputSeriedTree ends with padding 0s + assert(inputSerializedTree.size() >= outputSerializedTree.size()); - for (int i = 0; i < inputSerializedTree.size(); i++) { + for (int i = 0; i < outputSerializedTree.size(); i++) { auto input = inputSerializedTree.at(i); auto output = outputSerializedTree.at(i); @@ -1226,7 +1443,7 @@ std::vector> } auto guid = old_bc.requestsInfo[request_index].request_guid; - Request &request = running_request_queue[guid]; + Request &request = all_requests[guid]; std::cout << "request.beam_trees.size(): " << request.beam_trees.size() << std::endl; BeamTree tree = request.beam_trees.at(old_bc.model_id); @@ -1329,4 +1546,147 @@ std::vector> return merged_tree; } +GenerationResult FFModel::generate(std::string const &text, + int max_seq_length) { + RequestManager *rm = RequestManager::get_request_manager(); + if (rm->get_num_ssms() == 0) { + // No SSMs: perform incremental decoding + return rm->generate_incr_decoding(this, text, max_seq_length); + } else { + // Registered SSMs: perform speculative inference + return rm->generate_spec_infer(this, text, max_seq_length); + } +} + +/*static*/ +GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, + std::string const &text, + int max_seq_length) { + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestGuid guid = register_new_request(text, max_seq_length); + if (guid == 0) { + std::cout + << "=========== Discard request exceed prompt maximum... 
===========" + << std::endl; + return GenerationResult(); + } + + int tokens_to_generate = max_seq_length - all_requests[guid].tokens.size(); + std::queue> + batch_pipeline; + { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } + while (!is_request_completed(guid)) { + if (batch_pipeline.size() >= 4) { + // Block here to avoid launching too many batches + auto const &batch = batch_pipeline.front(); + batch.second.get_void_result(); + } + // deque finished batches + while (batch_pipeline.size() > 1) { + auto const &batch = batch_pipeline.front(); + if (batch.second.is_ready()) { + batch_pipeline.pop(); + } else { + break; + } + } + if (is_request_completed(guid)) { + break; + } + auto const &next_batch = batch_pipeline.back(); + BatchConfigFuture bcf = + prepare_next_batch(next_batch.first, next_batch.second); + FutureMap fm = im->inference(llm, 0, bcf); + assert(fm.get_future_map_domain().get_volume() == 1); + InferenceResultFuture irf = fm.get_future(0); + batch_pipeline.push(std::make_pair(bcf, irf)); + last_bcf = bcf; + last_irf = irf; + } + GenerationResult gr = get_generation_result(guid); + assert(gr.output_tokens.size() >= max_seq_length); + return gr; +} + +/*static*/ +GenerationResult RequestManager::generate_spec_infer(FFModel *llm, + std::string const &text, + int max_seq_length) { + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestGuid guid = register_new_request(text, max_seq_length); + if (guid == 0) { + std::cout + << "=========== Discard request exceed prompt maximum... ===========" + << std::endl; + return GenerationResult(); + } + + std::queue> + batch_pipeline; + batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); + while (!is_request_completed(guid)) { + if (batch_pipeline.size() >= 4) { + // Block here to avoid launching too many batches + auto const &batch = batch_pipeline.front(); + batch.second.get_void_result(); + } + // deque finished batches + while (batch_pipeline.size() > 1) { + auto const &batch = batch_pipeline.front(); + if (batch.second.is_ready()) { + batch_pipeline.pop(); + } else { + break; + } + } + auto const &next_batch = batch_pipeline.back(); + BeamSearchBatchConfigFuture beam_bcf = + prepare_next_batch_init(next_batch.first, next_batch.second, 0); + std::vector beam_bcf_vec(get_num_ssms()); + for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) { + beam_bcf_vec[ssm_id] = beam_bcf; + } + // if (is_request_completed(guid)) { + // break; + // } + + for (size_t i = 0; i < get_num_ssms(); i++) { + for (int depth = 0; depth < BeamSearchBatchConfig::MAX_BEAM_DEPTH; + depth++) { + beam_bcf = beam_bcf_vec[i]; + + FutureMap fm = im->inference(get_model(i), 0, beam_bcf_vec[i]); + assert(fm.get_future_map_domain().get_volume() == 1); + BeamInferenceResultFuture beam_irf = fm.get_future(0); + beam_bcf_vec[i] = prepare_next_batch_beam(beam_bcf_vec[i], beam_irf); + } + } + // Token Tree Verification + { + TreeVerifyBatchConfigFuture tree_bcf = + prepare_next_batch_verify(beam_bcf_vec); + FutureMap fm = im->inference(llm, 0, tree_bcf); + assert(fm.get_future_map_domain().get_volume() == 1); + InferenceResultFuture tree_irf = fm.get_future(0); + batch_pipeline.push(std::make_pair(tree_bcf, tree_irf)); + last_tree_bcf = tree_bcf; + last_tree_irf = tree_irf; + } + } + + GenerationResult gr = get_generation_result(guid); + assert(gr.output_tokens.size() >= max_seq_length); + return gr; +} + +RequestManager *request_manager_singleton = nullptr; + +/*static*/ +RequestManager *RequestManager::get_request_manager() { 
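Both generate loops above keep a small queue of in-flight (batch-config future, result future) pairs: the next batch is prepared off the newest futures while older batches are still executing, and the loop only blocks on the oldest result once four batches are outstanding. A self-contained sketch of that bounded pipeline, with StepFuture and run_step standing in for the Legion futures and the prepare-next-batch/inference launches (both are illustrative assumptions, not FlexFlow types):

    #include <functional>
    #include <queue>
    #include <utility>

    // Illustrative stand-in for a Legion future.
    struct StepFuture {
      bool ready = false;
      bool is_ready() const { return ready; }
      void wait() { ready = true; }  // blocking wait
    };
    using Step = std::pair<StepFuture, StepFuture>;  // (batch-config, result)

    void pipelined_decode(std::function<Step(Step &)> run_step,
                          std::function<bool()> done,
                          Step initial,
                          size_t max_in_flight = 4) {
      std::queue<Step> pipeline;
      pipeline.push(initial);
      while (!done()) {
        if (pipeline.size() >= max_in_flight) {
          pipeline.front().second.wait();  // throttle: block on the oldest result
        }
        while (pipeline.size() > 1 && pipeline.front().second.is_ready()) {
          pipeline.pop();  // retire batches that already finished
        }
        pipeline.push(run_step(pipeline.back()));  // issue the next batch off the newest one
      }
    }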
+ if (request_manager_singleton == nullptr) { + request_manager_singleton = new RequestManager(); + } + return request_manager_singleton; +} + }; // namespace FlexFlow diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index ffbdac68cd..80554c2add 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include "flexflow/utils/hip_helper.h" #include diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index a50ca5ad95..abfcd72a38 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "flexflow/inference.h" +#include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -28,22 +28,32 @@ void RequestManager::load_tokens_task( assert(regions.size() == 1); assert(task->regions.size() == 1); - BatchConfig const batch_config = *((BatchConfig *)task->args); + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; - assert(batch_config.num_tokens <= BatchConfig::MAX_NUM_TOKENS); - for (int i = 0; i < batch_config.num_tokens; i++) { - dram_copy[i] = batch_config.tokensInfo[i].token_id; + + // Extreme long prompts are not supported, only load up to MAX_NUM_TOKENS as + // prompt + if (batch_config->num_tokens > BatchConfig::MAX_NUM_TOKENS) { + printf("Warning: too many tokens in prompt, only load up to %d tokens\n", + BatchConfig::MAX_NUM_TOKENS); + printf("Got: %d tokens\n", batch_config->num_tokens); + } + // assert(batch_config->num_tokens <= BatchConfig::MAX_NUM_TOKENS); + + for (int i = 0; i < batch_config->num_tokens; i++) { + dram_copy[i] = batch_config->tokensInfo[i].token_id; } TokenId *fb_ptr = helperGetTensorPointerWO( regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - assert(batch_config.num_tokens <= domain.get_volume()); + assert(batch_config->num_tokens <= domain.get_volume()); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDA(cudaMemcpyAsync(fb_ptr, dram_copy, - sizeof(TokenId) * batch_config.num_tokens, + sizeof(TokenId) * batch_config->num_tokens, cudaMemcpyHostToDevice, stream)); } @@ -56,7 +66,9 @@ void RequestManager::load_positions_task( assert(regions.size() == 1); assert(task->regions.size() == 1); - BatchConfig const batch_config = *((BatchConfig *)task->args); + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + int offset = 2; int *pos_ptr = helperGetTensorPointerWO( regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -64,15 +76,15 @@ void RequestManager::load_positions_task( ctx, task->regions[0].region.get_index_space()); int dram_copy[BatchConfig::MAX_NUM_TOKENS]; - for (int i = 0; i < batch_config.num_tokens; i++) { - dram_copy[i] = batch_config.tokensInfo[i].abs_depth_in_request + offset; + for (int i = 0; i < batch_config->num_tokens; i++) { + dram_copy[i] = batch_config->tokensInfo[i].abs_depth_in_request + offset; } cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDA(cudaMemcpyAsync(pos_ptr, dram_copy, - sizeof(int) * batch_config.num_tokens, + 
sizeof(int) * batch_config->num_tokens, cudaMemcpyHostToDevice, stream)); } From 664667ecee080f3375f9105bc0eed9d60702f666 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:50:24 -0400 Subject: [PATCH 185/344] change argmax to DeviceSegmentedReduce::ArgMax && replace cudamalloc with legion instance (#896) * change argmax to DeviceSegmentedReduce::ArgMax * replace argmax, beam_topk, rms_norm cudamalloc * replace layernorm, linear, sampling. * destructor * format --- include/flexflow/ops/argmax.h | 25 +-- include/flexflow/ops/beam_topk.h | 7 +- include/flexflow/ops/kernels/linear_kernels.h | 2 + .../flexflow/ops/kernels/rms_norm_kernels.h | 7 +- include/flexflow/ops/layer_norm.h | 7 +- include/flexflow/ops/rms_norm.h | 1 + include/flexflow/ops/sampling.h | 6 +- include/flexflow/simulator.h | 2 + src/ops/argmax.cc | 78 +++------ src/ops/argmax.cpp | 13 +- src/ops/argmax.cu | 165 ++++++++++++------ src/ops/beam_topk.cc | 7 +- src/ops/beam_topk.cpp | 7 +- src/ops/beam_topk.cu | 58 ++++-- src/ops/kernels/linear_kernels.cpp | 1 + src/ops/kernels/linear_kernels.cu | 11 +- src/ops/kernels/rms_norm_kernels.cpp | 6 +- src/ops/kernels/rms_norm_kernels.cu | 19 +- src/ops/layer_norm.cc | 9 +- src/ops/layer_norm.cpp | 6 +- src/ops/layer_norm.cu | 36 ++-- src/ops/rms_norm.cc | 9 +- src/ops/sampling.cc | 10 +- src/ops/sampling.cpp | 4 +- src/ops/sampling.cu | 36 +++- 25 files changed, 357 insertions(+), 175 deletions(-) diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h index 709861f51c..298059e3ed 100644 --- a/include/flexflow/ops/argmax.h +++ b/include/flexflow/ops/argmax.h @@ -5,6 +5,7 @@ #include "flexflow/model.h" #include "flexflow/node.h" #include "flexflow/ops/argmax_params.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { @@ -12,18 +13,20 @@ class ArgMaxMeta : public OpMeta { public: bool beam_search; float *probs; -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnTensorDescriptor_t inputTensor, outputTensor; - cudnnReduceTensorDescriptor_t reduceMaxDesc; -#else - miopenTensorDescriptor_t inputTensor, outputTensor; - miopenReduceTensorDescriptor_t reduceMaxDesc; -#endif + void *d_temp_storage; + size_t temp_storage_bytes = 0; + int *d_offsets; + void *d_out; + Realm::RegionInstance reserveInst; ArgMaxMeta(FFHandler handler, Op const *op, Legion::Domain const &input_domain, Legion::Domain const &output_domain, - GenericTensorAccessorW input); + GenericTensorAccessorW input, + int batch_size, + int total_ele, + MemoryAllocator &gpu_mem_allocator); + ~ArgMaxMeta(void); }; class ArgMax : public Op { @@ -88,7 +91,7 @@ class ArgMax : public Op { static void forward_kernel(ArgMaxMeta const *m, DT *input_ptr, int *indices_ptr, - DT *prob_ptr, + float *prob_ptr, int *parent_ptr, int length, int batch_size, @@ -96,8 +99,8 @@ class ArgMax : public Op { static void forward_kernel_wrapper(ArgMaxMeta const *m, GenericTensorAccessorW const &input, GenericTensorAccessorW const &indices, - GenericTensorAccessorW const &value, - GenericTensorAccessorW const &parent); + GenericTensorAccessorW const &parent, + int batch_size); Params get_params() const; public: diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h index 639a8ead92..9466ba2a3b 100644 --- a/include/flexflow/ops/beam_topk.h +++ b/include/flexflow/ops/beam_topk.h @@ -5,12 +5,16 @@ #include "flexflow/model.h" #include "flexflow/node.h" #include "flexflow/ops/beam_topk_params.h" +#include 
"flexflow/utils/memory_allocator.h" namespace FlexFlow { class BeamTopKMeta : public OpMeta { public: - BeamTopKMeta(FFHandler handle, Op const *op); + BeamTopKMeta(FFHandler handle, + Op const *op, + MemoryAllocator &gpu_mem_allocator); + ~BeamTopKMeta(void); bool sorted; int max_beam_width; int *parent_ids; @@ -18,6 +22,7 @@ class BeamTopKMeta : public OpMeta { int *block_start_index; int *request_id; int *tokens_per_request; + Realm::RegionInstance reserveInst; }; class BeamTopK : public Op { diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 29791b53ff..bbebe3c79b 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -15,6 +15,7 @@ class LinearMeta : public OpMeta { Linear const *li, MemoryAllocator gpu_mem_allocator, int weightSize); + ~LinearMeta(void); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t outputTensor; cudnnActivationDescriptor_t actiDesc; @@ -34,6 +35,7 @@ class LinearMeta : public OpMeta { float kernel_reg_lambda; bool use_bias, add_bias_only_once; char op_name[MAX_OPNAME]; + Realm::RegionInstance reserveInst; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index f38e55ae39..2063777ef1 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -5,6 +5,7 @@ #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { using Legion::coord_t; @@ -13,7 +14,10 @@ class RMSNorm; class RMSNormMeta : public OpMeta { public: - RMSNormMeta(FFHandler handler, RMSNorm const *rms); + RMSNormMeta(FFHandler handler, + RMSNorm const *rms, + MemoryAllocator &gpu_mem_allocator); + ~RMSNormMeta(void); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnReduceTensorDescriptor_t reduceDesc; @@ -34,6 +38,7 @@ class RMSNormMeta : public OpMeta { int batch_size; int num_elements; char op_name[MAX_OPNAME]; + Realm::RegionInstance reserveInst; }; namespace Kernels { diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 058884cc9a..cb977fc6a6 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -2,6 +2,7 @@ #include "flexflow/inference.h" #include "flexflow/model.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { class LayerNormMeta; @@ -107,7 +108,10 @@ class LayerNorm : public Op { class LayerNormMeta : public OpMeta { public: - LayerNormMeta(FFHandler handle, LayerNorm const *ln); + LayerNormMeta(FFHandler handle, + LayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator); + ~LayerNormMeta(void); public: bool elementwise_affine; @@ -115,6 +119,7 @@ class LayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; char op_name[MAX_OPNAME]; + Realm::RegionInstance reserveInst; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 38ce983dd0..979a20976c 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -4,6 +4,7 @@ #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/ops/rms_norm_params.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { diff --git a/include/flexflow/ops/sampling.h 
b/include/flexflow/ops/sampling.h index da554d4f34..789904df32 100644 --- a/include/flexflow/ops/sampling.h +++ b/include/flexflow/ops/sampling.h @@ -9,6 +9,7 @@ #include #include #endif +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { @@ -22,6 +23,7 @@ class SamplingMeta : public OpMeta { int *idx; void *d_temp_storage; size_t temp_storage_bytes; + Realm::RegionInstance reserveInst; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) curandState *state; #endif @@ -29,7 +31,9 @@ class SamplingMeta : public OpMeta { Op const *op, int batch_size, int total_ele, - GenericTensorAccessorW input); + GenericTensorAccessorW input, + MemoryAllocator &gpu_mem_allocator); + ~SamplingMeta(void); }; class Sampling : public Op { diff --git a/include/flexflow/simulator.h b/include/flexflow/simulator.h index 6f0f327110..e410f66325 100644 --- a/include/flexflow/simulator.h +++ b/include/flexflow/simulator.h @@ -38,6 +38,7 @@ class LinearMeta; class Pool2DMeta; class ElementUnaryMeta; class ElementBinaryMeta; +class LayerNormMeta; // class EmbeddingMeta; // class SoftmaxMeta; class BatchMatmulMeta; @@ -754,6 +755,7 @@ class Simulator { LinearMeta *linear_meta; Pool2DMeta *pool2d_meta; ElementUnaryMeta *ele_unary_meta; + LayerNormMeta *layernorm_meta; // ElementBinaryMeta *ele_binary_meta; // EmbeddingMeta *embedding_meta; // SoftmaxMeta *softmax_meta; diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 8598a71d50..a7476928ba 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -51,7 +51,7 @@ Tensor FFModel::argmax(const Tensor input, bool beam_search, char const *name) { name, 1 /*inputs*/, 0 /*weights*/, - beam_search ? 3 : 2 /*outputs*/, + beam_search ? 2 : 1 /*outputs*/, input); { int numdims = input->num_dims; @@ -65,13 +65,9 @@ Tensor FFModel::argmax(const Tensor input, bool beam_search, char const *name) { // numdims, dims, input->data_type, li, 0, true /*create_grad*/); li->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_INT32, li, 0, false /*create_grad*/); - // logits - li->outputs[1] = create_tensor_legion_ordering( - numdims, dims, input->data_type, li, 1, false /*create_grad*/); - if (beam_search) { // parent id - li->outputs[2] = create_tensor_legion_ordering( + li->outputs[1] = create_tensor_legion_ordering( numdims, dims, DT_INT32, li, 1, false /*create_grad*/); } } @@ -116,7 +112,7 @@ ArgMax::ArgMax(FFModel &model, name, 1 /*inputs*/, 0 /*weights*/, - _beam_search ? 3 : 2 /*outputs*/, + _beam_search ? 
2 : 1 /*outputs*/, _input), beam_search(_beam_search) { int numdim = inputs[0]->num_dims; @@ -131,11 +127,9 @@ ArgMax::ArgMax(FFModel &model, // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, DT_INT32, this, 0 /*owner_idx*/); - outputs[1] = model.create_parallel_tensor_legion_ordering( - numdim, dims, _input->data_type, this, 1 /*owner_idx*/); if (_beam_search) { - outputs[2] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_INT32, this, 2 /*owner_idx*/); + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_INT32, this, 1 /*owner_idx*/); } } @@ -180,12 +174,6 @@ void ArgMax::init_inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -240,9 +228,22 @@ OpMeta *ArgMax::init_task(Task const *task, ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); + int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; + int batch_size = acc_input.domain.get_volume() / length; + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); - ArgMaxMeta *m = - new ArgMaxMeta(handle, s, input_domain, output_domain, acc_input); + ArgMaxMeta *m = new ArgMaxMeta(handle, + s, + input_domain, + output_domain, + acc_input, + batch_size, + length * batch_size, + gpu_mem_allocator); m->profiling = s->profiling; m->beam_search = s->beam_search; return m; @@ -297,13 +298,6 @@ FutureMap ArgMax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[1]->region)); launcher.add_field(2, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_outputs[2]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[2]->region)); - launcher.add_field(3, FID_DATA); return runtime->execute_index_space(ctx, launcher); } else { IndexLauncher launcher(ARGMAX_NORM_INF_TASK_ID, @@ -328,13 +322,6 @@ FutureMap ArgMax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } } @@ -344,8 +331,8 @@ BeamInferenceResult std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 4); - assert(task->regions.size() == 4); + assert(regions.size() == 3); + assert(task->regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { // Directly return for empty batch config @@ -359,21 +346,14 @@ BeamInferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); int batch_size = bc->num_active_tokens(); - GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( - m->input_type[0], regions[2], task->regions[1], FID_DATA, 
ctx, runtime); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( - DT_INT32, regions[3], task->regions[1], FID_DATA, ctx, runtime); - ArgMax::forward_kernel_wrapper(m, input, indices, value, parent); + DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); + ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); BeamInferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); - if (m->input_type[0] == DT_FLOAT) { - download_tensor(value.get_float_ptr(), ir.probs, batch_size); - } else if (m->input_type[0] == DT_HALF) { - download_tensor(m->probs, ir.probs, batch_size); - } - + download_tensor(m->probs, ir.probs, batch_size); download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); return ir; } @@ -383,8 +363,8 @@ InferenceResult std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); + assert(regions.size() == 2); + assert(task->regions.size() == 2); ArgMaxMeta const *m = *((ArgMaxMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { @@ -397,11 +377,9 @@ InferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( - m->input_type[0], regions[2], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW parent; int batch_size = bc->num_active_tokens(); - ArgMax::forward_kernel_wrapper(m, input, indices, value, parent); + ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); InferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 1395a1cdeb..778ddf3c9d 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -25,7 +25,7 @@ template void ArgMax::forward_kernel(ArgMaxMeta const *m, DT *input_ptr, int *indices_ptr, - DT *prob_ptr, + float *prob_ptr, int *parent_ptr, int length, int batch_size, @@ -35,8 +35,8 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, GenericTensorAccessorW const &input, GenericTensorAccessorW const &indices, - GenericTensorAccessorW const &value, - GenericTensorAccessorW const &parent) { + GenericTensorAccessorW const &parent, + int batch_size) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -63,7 +63,12 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, Op const *op, Legion::Domain const &input_domain, Legion::Domain const &output_domain, - GenericTensorAccessorW input) + GenericTensorAccessorW input, + int batch_size, + int total_ele, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, op) {} +ArgMaxMeta::~ArgMaxMeta(void) {} + }; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 99487ea380..37e067006c 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -12,17 +12,35 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "flexflow/ffconst_utils.h" #include "flexflow/ops/argmax.h" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { -__global__ void - half_2_float_array(half *ptr, float *ptr_f, int num_of_elements) { - CUDA_KERNEL_LOOP(i, num_of_elements) { - ptr_f[i] = __half2float(ptr[i]); +__global__ void init_offset(int batch_size, + int vocab_size, + int total_eles, + int *d_offsets) { + CUDA_KERNEL_LOOP(i, total_eles) { + if (i % vocab_size == 0) { + d_offsets[i / vocab_size] = i; + } + } +} + +template +__global__ void copy_result(cub::KeyValuePair *d_out, + int *indices, + float *prob_ptr, + int batch_size, + bool beam_search) { + CUDA_KERNEL_LOOP(i, batch_size) { + indices[i] = d_out[i].key; + if (beam_search) { + prob_ptr[i] = static_cast(d_out[i].value); + } } } @@ -31,7 +49,7 @@ template void ArgMax::forward_kernel(ArgMaxMeta const *m, DT *input_ptr, int *indices_ptr, - DT *prob_ptr, + float *prob_ptr, int *parent, int const length, int const batch_size, @@ -43,26 +61,36 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, // set all parents id zero in arg top1 case. checkCUDA(cudaMemset(parent, 0, batch_size * sizeof(int))); } - checkCUDNN(cudnnReduceTensor(m->handle.dnn, - m->reduceMaxDesc, - indices_ptr /*indices*/, - batch_size * sizeof(int) /*indicesSizeInBytes*/, - m->handle.workSpace, - m->handle.workSpaceSize, - &alpha, - m->inputTensor, - input_ptr, - &beta, - m->outputTensor, - prob_ptr)); + size_t temp_storage_bytes = m->temp_storage_bytes; + // use cub + checkCUDA(cub::DeviceSegmentedReduce::ArgMax( + m->d_temp_storage, + temp_storage_bytes, + input_ptr, + static_cast *>(m->d_out), + batch_size, + m->d_offsets, + m->d_offsets + 1, + stream)); + + // copy dout to incides + int parallelism = batch_size; + copy_result<<>>(static_cast *>(m->d_out), + indices_ptr, + prob_ptr, + batch_size, + m->beam_search); } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, GenericTensorAccessorW const &input, GenericTensorAccessorW const &indices, - GenericTensorAccessorW const &value, - GenericTensorAccessorW const &parent) { + GenericTensorAccessorW const &parent, + int batch_size) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -73,31 +101,23 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, cudaEventRecord(t_start, stream); } int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; - int batch_size = input.domain.get_volume() / length; if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, input.get_half_ptr(), indices.get_int32_ptr(), - value.get_half_ptr(), + m->probs, m->beam_search ? parent.get_int32_ptr() : nullptr, length, batch_size, stream); - if (m->beam_search) { - half_2_float_array<<>>( - value.get_half_ptr(), m->probs, batch_size); - } } else if (input.data_type == DT_FLOAT) { ArgMax::forward_kernel(m, input.get_float_ptr(), indices.get_int32_ptr(), - value.get_float_ptr(), + m->probs, m->beam_search ? 
parent.get_int32_ptr() : nullptr, length, @@ -122,30 +142,71 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, Op const *op, Legion::Domain const &input_domain, Legion::Domain const &output_domain, - GenericTensorAccessorW input) + GenericTensorAccessorW input, + int batch_size, + int total_ele, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, op) { DataType data_type = op->data_type; - checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); - checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceMaxDesc)); - - // Float and Half use save type, according to - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnReduceTensor:~:text=not%20coordinate%20tuples.-,The%20data%20types%20of%20the%20tensors,.,-Note%3A - cudnnDataType_t cudnn_data_type = CUDNN_DATA_FLOAT; - - checkCUDNN( - cudnnSetReduceTensorDescriptor(reduceMaxDesc, - CUDNN_REDUCE_TENSOR_MAX, - cudnn_data_type, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_FLATTENED_INDICES, - CUDNN_32BIT_INDICES)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain( - outputTensor, output_domain, data_type)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain, data_type)); - - checkCUDA(cudaMalloc(&probs, sizeof(float) * BatchConfig::MAX_NUM_TOKENS)); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + size_t d_offsets_size = batch_size; + size_t prob_size = batch_size; + assert(data_type == DT_FLOAT || data_type == DT_HALF); + size_t total_size = + d_offsets_size * sizeof(int) + + (data_type == DT_FLOAT + ? sizeof(cub::KeyValuePair) * batch_size + : sizeof(cub::KeyValuePair) * batch_size) + + prob_size * sizeof(float); + gpu_mem_allocator.create_legion_instance(reserveInst, total_size); + d_offsets = gpu_mem_allocator.allocate_instance(d_offsets_size); + d_out = data_type == DT_FLOAT + ? 
gpu_mem_allocator.allocate_instance_untyped( + batch_size * sizeof(cub::KeyValuePair)) + : gpu_mem_allocator.allocate_instance_untyped( + batch_size * sizeof(cub::KeyValuePair)); + probs = gpu_mem_allocator.allocate_instance(prob_size); + // init offset + int parallelism = total_ele; + init_offset<<>>( + batch_size, total_ele / batch_size, total_ele, d_offsets); + + if (data_type == DT_FLOAT) { + checkCUDA(cub::DeviceSegmentedReduce::ArgMax( + d_temp_storage, + temp_storage_bytes, + input.get_float_ptr(), + static_cast *>(d_out), + batch_size, + d_offsets, + d_offsets + 1, + stream)); + + } else if (data_type == DT_HALF) { + checkCUDA(cub::DeviceSegmentedReduce::ArgMax( + d_temp_storage, + temp_storage_bytes, + input.get_half_ptr(), + static_cast *>(d_out), + batch_size, + d_offsets, + d_offsets + 1, + stream)); + } + + gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + d_temp_storage = + gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); } +ArgMaxMeta::~ArgMaxMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} }; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 1c71e69c9d..93a6de5a8f 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -271,7 +271,12 @@ OpMeta *BeamTopK::init_task(Task const *task, Runtime *runtime) { BeamTopK *topk = (BeamTopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - BeamTopKMeta *m = new BeamTopKMeta(handle, topk); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator); m->profiling = topk->profiling; m->sorted = topk->sorted; m->max_beam_width = topk->max_beam_width; diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 248ab188da..293feecff0 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -678,7 +678,10 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, } } -BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op) : OpMeta(handler) { +BeamTopKMeta::BeamTopKMeta(FFHandler handler, + Op const *op, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handler) { DataType data_type = op->inputs[0]->data_type; checkCUDA(hipMalloc(&parent_ids, sizeof(int) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * @@ -697,4 +700,6 @@ BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op) : OpMeta(handler) { sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_NUM_REQUESTS)); } + +BeamTopKMeta::~BeamTopKMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index ceddb55f2d..42fa7a5ab5 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -710,23 +710,47 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, } } -BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op) : OpMeta(handler) { +BeamTopKMeta::BeamTopKMeta(FFHandler handler, + Op const *op, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handler) { DataType data_type = op->inputs[0]->data_type; - checkCUDA(cudaMalloc(&parent_ids, - sizeof(int) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); - checkCUDA(cudaMalloc(&acc_probs, - data_type_size(data_type) * - BeamSearchBatchConfig::MAX_BEAM_WIDTH * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); - 
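Aside (illustration, not part of the patch): the ArgMax rewrite above swaps the cuDNN reduction for CUB's segmented argmax. The usual pattern is a two-pass call: the first call with a null scratch pointer only reports temp_storage_bytes, the second performs the per-segment reduction and writes one cub::KeyValuePair (index, max value) per segment. A minimal self-contained sketch of that idiom, with placeholder names:

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Standalone sketch of the CUB two-pass idiom used by ArgMaxMeta above.
void segmented_argmax(float const *d_logits,                // [batch_size * vocab_size]
                      int const *d_offsets,                 // [batch_size + 1] segment bounds
                      cub::KeyValuePair<int, float> *d_out, // one (index, max) pair per segment
                      int batch_size,
                      cudaStream_t stream) {
  void *d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // Pass 1: with d_temp_storage == nullptr, CUB only computes the scratch size.
  cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
                                     d_logits, d_out, batch_size,
                                     d_offsets, d_offsets + 1, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Pass 2: the actual reduction; d_out[i].key is the argmax position within segment i.
  cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
                                     d_logits, d_out, batch_size,
                                     d_offsets, d_offsets + 1, stream);
  cudaFree(d_temp_storage);
}

In the patch, the size query runs once in the ArgMaxMeta constructor and the scratch buffer is carved out of the reserved Legion instance, so forward_kernel only issues the second call, followed by the small copy_result kernel that unpacks the key/value pairs into the indices and probability buffers.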
checkCUDA(cudaMalloc(&block_start_index, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); - checkCUDA(cudaMalloc(&request_id, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); - checkCUDA(cudaMalloc(&tokens_per_request, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + size_t parent_id_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t acc_probs_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t block_start_index_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t request_id_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t tokens_per_request_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t totalSize = sizeof(int) * parent_id_size + + data_type_size(data_type) * acc_probs_size + + sizeof(int) * block_start_index_size + + sizeof(int) * request_id_size + + sizeof(int) * tokens_per_request_size; + + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + parent_ids = gpu_mem_allocator.allocate_instance(parent_id_size); + if (data_type == DT_FLOAT) { + acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); + } else if (data_type == DT_HALF) { + acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); + } else { + assert(false); + } + + block_start_index = + gpu_mem_allocator.allocate_instance(block_start_index_size); + request_id = gpu_mem_allocator.allocate_instance(request_id_size); + tokens_per_request = + gpu_mem_allocator.allocate_instance(tokens_per_request_size); +} + +BeamTopKMeta::~BeamTopKMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } } }; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 41b9912702..0d70e91d47 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -41,6 +41,7 @@ LinearMeta::LinearMeta(FFHandler handler, checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); } +LinearMeta::~LinearMeta(void) {} namespace Kernels { namespace Linear { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 06677f86e6..8a93357dcf 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -39,7 +39,10 @@ LinearMeta::LinearMeta(FFHandler handler, } } // Allocate an all-one's vector - checkCUDA(cudaMalloc(&one_ptr, data_type_size(data_type) * batch_size)); + gpu_mem_allocator.create_legion_instance( + reserveInst, data_type_size(data_type) * batch_size); + one_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * batch_size); int parallelism = batch_size; cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -62,6 +65,12 @@ LinearMeta::LinearMeta(FFHandler handler, checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); } +LinearMeta::~LinearMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + namespace Kernels { namespace Linear { diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 5de12b3f1f..b2e2648785 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ 
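Aside (illustration, not FlexFlow code): a pattern repeated across these hunks (ArgMaxMeta, BeamTopKMeta, LinearMeta, RMSNormMeta, LayerNormMeta, SamplingMeta) is to sum all per-op buffer sizes, reserve one Legion framebuffer instance through MemoryAllocator::create_legion_instance, carve the individual pointers out of it with allocate_instance / allocate_instance_untyped, and release everything by destroying the single reserveInst in the destructor instead of pairing every cudaMalloc with a cudaFree. The self-contained bump allocator below is only an analogy for that reserve-once / sub-allocate / free-once idea; all names in it are invented.

#include <cuda_runtime.h>
#include <cassert>
#include <cstddef>

// Reserve one device buffer up front, hand out sub-ranges, free once.
struct DeviceArena {
  char *base = nullptr;
  size_t capacity = 0;
  size_t offset = 0;

  explicit DeviceArena(size_t total_bytes) : capacity(total_bytes) {
    cudaMalloc(&base, total_bytes); // one allocation per op's metadata
  }
  void *allocate(size_t bytes) {    // analogous to allocate_instance_untyped
    assert(offset + bytes <= capacity);
    void *ptr = base + offset;
    offset += bytes;
    return ptr;
  }
  ~DeviceArena() {                  // analogous to reserveInst.destroy()
    if (base != nullptr) {
      cudaFree(base);
    }
  }
};

A visible consequence in the diffs is that the new destructors reduce to a single reserveInst.destroy() guarded by a Realm::RegionInstance::NO_INST check.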
b/src/ops/kernels/rms_norm_kernels.cpp @@ -22,9 +22,11 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; -RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms) +RMSNormMeta::RMSNormMeta(FFHandler handler, + RMSNorm const *rms, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) {} - +RMSNormMeta::~RMSNormMeta(void) {} namespace Kernels { namespace RMSNorm { diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 44e6288529..234bf73150 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -27,7 +27,9 @@ using Legion::coord_t; constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; -RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms) +RMSNormMeta::RMSNormMeta(FFHandler handler, + RMSNorm const *rms, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; alpha = 1.0f; @@ -38,8 +40,19 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms) num_elements = in_dim * batch_size; DataType data_type = rms->weights[0]->data_type; - checkCUDA(cudaMalloc(&rms_ptr, batch_size * data_type_size(data_type))); - checkCUDA(cudaMalloc(&norm_ptr, num_elements * data_type_size(data_type))); + size_t rms_ptr_size = batch_size; + size_t norm_ptr_size = num_elements; + size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + rms_ptr = gpu_mem_allocator.allocate_instance_untyped( + rms_ptr_size * data_type_size(data_type)); + norm_ptr = gpu_mem_allocator.allocate_instance_untyped( + norm_ptr_size * data_type_size(data_type)); +} +RMSNormMeta::~RMSNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } } namespace Kernels { diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index b9b3abe0f1..dcbb0cc1e5 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -355,7 +355,12 @@ OpMeta *LayerNorm::init_task(Task const *task, Runtime *runtime) { LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - LayerNormMeta *meta = new LayerNormMeta(handle, ln); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + LayerNormMeta *meta = new LayerNormMeta(handle, ln, gpu_mem_allocator); meta->input_type[0] = ln->inputs[0]->data_type; meta->output_type[0] = ln->outputs[0]->data_type; return meta; @@ -651,7 +656,7 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } Domain input_domain = sub_input.get_domain(); Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = new LayerNormMeta(sim->handler, this); + LayerNormMeta *m = sim->layernorm_meta; sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index fc6be70c74..855f7296e8 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -24,7 +24,9 @@ constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; constexpr int kColwiseReduceTileSize = 32; -LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) +LayerNormMeta::LayerNormMeta(FFHandler handle, + LayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handle) { elementwise_affine = ln->elementwise_affine; effective_batch_size = 
ln->effective_batch_size; @@ -38,6 +40,8 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); } +LayerNormMeta::~LayerNormMeta(void) {} + template __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 1f4e7d3933..f594f8f7a8 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -24,7 +24,9 @@ constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; constexpr int kColwiseReduceTileSize = 32; -LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) +LayerNormMeta::LayerNormMeta(FFHandler handle, + LayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handle) { elementwise_affine = ln->elementwise_affine; effective_batch_size = ln->effective_batch_size; @@ -32,18 +34,26 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln) profiling = ln->profiling; eps = ln->eps; DataType data_type = ln->data_type; - checkCUDA( - cudaMalloc(&mean_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&rstd_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&ds_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&db_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&scale_ptr, data_type_size(data_type) * effective_batch_size)); - checkCUDA( - cudaMalloc(&bias_ptr, data_type_size(data_type) * effective_batch_size)); + size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + ds_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + db_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + scale_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); +} + +LayerNormMeta::~LayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } } template diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 5529abba20..1f21591130 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -27,6 +27,8 @@ using Legion::Context; using Legion::Domain; using Legion::FutureMap; using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; using Legion::PhysicalRegion; using Legion::Predicate; using Legion::Rect; @@ -289,7 +291,12 @@ OpMeta *RMSNorm::init_task(Task const *task, Runtime *runtime) { RMSNorm *rn = (RMSNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - RMSNormMeta *meta = new RMSNormMeta(handle, rn); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + RMSNormMeta *meta = new RMSNormMeta(handle, rn, gpu_mem_allocator); return meta; } diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 66b3420a39..79e6027b7c 100644 --- a/src/ops/sampling.cc +++ 
b/src/ops/sampling.cc @@ -223,9 +223,13 @@ OpMeta *Sampling::init_task(Task const *task, int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; int batch_size = acc_input.domain.get_volume() / length; - - SamplingMeta *m = - new SamplingMeta(handle, s, batch_size, length * batch_size, acc_input); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + SamplingMeta *m = new SamplingMeta( + handle, s, batch_size, length * batch_size, acc_input, gpu_mem_allocator); m->profiling = s->profiling; m->top_p = s->top_p; return m; diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp index 4901fe400c..56f3f604d5 100644 --- a/src/ops/sampling.cpp +++ b/src/ops/sampling.cpp @@ -61,7 +61,9 @@ SamplingMeta::SamplingMeta(FFHandler handler, Op const *op, int batch_size, int total_ele, - GenericTensorAccessorW input) + GenericTensorAccessorW input, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, op) {} +SamplingMeta::~SamplingMeta(void) {} }; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu index a91263a621..461d72ec71 100644 --- a/src/ops/sampling.cu +++ b/src/ops/sampling.cu @@ -201,17 +201,29 @@ SamplingMeta::SamplingMeta(FFHandler handler, Op const *op, int batch_size, int total_ele, - GenericTensorAccessorW input) + GenericTensorAccessorW input, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, op) { DataType data_type = op->data_type; - checkCUDA(cudaMalloc(&begin_offset, (batch_size + 1) * sizeof(int))); - checkCUDA(cudaMalloc(&end_offset, (batch_size + 1) * sizeof(int))); - checkCUDA(cudaMalloc(&idx, total_ele * sizeof(int))); - checkCUDA(cudaMalloc(&sorted_idx, total_ele * sizeof(int))); - checkCUDA(cudaMalloc(&sorted_logits, total_ele * data_type_size(data_type))); - cudaMalloc(&state, sizeof(curandState) * batch_size); + size_t begin_offset_size, end_offset_size; + begin_offset_size = end_offset_size = batch_size + 1; + size_t idx_size, sorted_idx_size, sorted_logits_size; + idx_size = sorted_idx_size = sorted_logits_size = total_ele; + size_t state_size = batch_size; + size_t totalSize = sizeof(int) * (begin_offset_size + end_offset_size + + idx_size + sorted_idx_size) + + data_type_size(data_type) * sorted_logits_size + + sizeof(curandState) * state_size; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + begin_offset = gpu_mem_allocator.allocate_instance(begin_offset_size); + end_offset = gpu_mem_allocator.allocate_instance(end_offset_size); + idx = gpu_mem_allocator.allocate_instance(idx_size); + sorted_idx = gpu_mem_allocator.allocate_instance(sorted_idx_size); + sorted_logits = gpu_mem_allocator.allocate_instance_untyped( + sorted_logits_size * data_type_size(data_type)); + state = gpu_mem_allocator.allocate_instance(state_size); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -261,7 +273,15 @@ SamplingMeta::SamplingMeta(FFHandler handler, } else { assert(false && "input type in float and half"); } - checkCUDA(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + + gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + d_temp_storage = + gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); } +SamplingMeta::~SamplingMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} }; // namespace FlexFlow \ No newline at end of file From 
0f8b4868edb190c0d09b30fa9a762e860a4d5f29 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 30 Jul 2023 17:13:54 -0500 Subject: [PATCH 186/344] enable tracing (#901) --- src/runtime/request_manager.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0856c1663f..eea02601e8 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1593,6 +1593,9 @@ GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, if (is_request_completed(guid)) { break; } + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + runtime->begin_trace(ctx, 12346 /*trace_id*/); auto const &next_batch = batch_pipeline.back(); BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); @@ -1602,6 +1605,7 @@ GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, batch_pipeline.push(std::make_pair(bcf, irf)); last_bcf = bcf; last_irf = irf; + runtime->end_trace(ctx, 12346 /*trace_id*/); } GenerationResult gr = get_generation_result(guid); assert(gr.output_tokens.size() >= max_seq_length); @@ -1649,6 +1653,9 @@ GenerationResult RequestManager::generate_spec_infer(FFModel *llm, // if (is_request_completed(guid)) { // break; // } + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + runtime->begin_trace(ctx, 12345 /*trace_id*/); for (size_t i = 0; i < get_num_ssms(); i++) { for (int depth = 0; depth < BeamSearchBatchConfig::MAX_BEAM_DEPTH; @@ -1672,6 +1679,7 @@ GenerationResult RequestManager::generate_spec_infer(FFModel *llm, last_tree_bcf = tree_bcf; last_tree_irf = tree_irf; } + runtime->end_trace(ctx, 12345 /*trace_id*/); } GenerationResult gr = get_generation_result(guid); From f07de46bbc5dd23e0267d142b96e511fdce2cbd4 Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Mon, 31 Jul 2023 09:06:01 -0400 Subject: [PATCH 187/344] Fixed edge case. 
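Aside on the tracing commit above (not part of the patch): Legion's begin_trace / end_trace brackets mark a region whose task launches repeat identically on every iteration, letting the runtime capture the dependence analysis once and replay it on later iterations. A hedged sketch of the idiom, with a made-up loop bound and the trace id taken from the diff:

// Sketch only; assumes a Legion top-level context and `using namespace Legion`.
Runtime *runtime = Runtime::get_runtime();
Context ctx = Runtime::get_context();
for (int step = 0; step < num_decoding_steps; step++) { // num_decoding_steps is hypothetical
  runtime->begin_trace(ctx, 12346 /*trace_id*/);
  // ... launch the same batch-preparation and inference tasks as every other iteration ...
  runtime->end_trace(ctx, 12346 /*trace_id*/);
}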
(#903) --- include/flexflow/batch_config.h | 3 +-- inference/models/configs/llama_68M.json | 11 +++++++++ inference/utils/convert_llama_config.py | 32 +++++++++++++++++++++++++ src/runtime/request_manager.cc | 3 ++- 4 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 inference/models/configs/llama_68M.json create mode 100644 inference/utils/convert_llama_config.py diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 5e68a65d8c..bae847106a 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -54,8 +54,7 @@ class BatchConfig { static BatchConfig const *from_future(BatchConfigFuture const &future); static int const MAX_NUM_REQUESTS = 1; static int const MAX_NUM_TOKENS = 64; - static int const MAX_PROMPT_LENGTH = - 63; // should be MAX_NUM_TOKENS - 1 for SpecInfer + static int const MAX_PROMPT_LENGTH = 62; static int const MAX_SEQ_LENGTH = 256; // These are set by update diff --git a/inference/models/configs/llama_68M.json b/inference/models/configs/llama_68M.json new file mode 100644 index 0000000000..11e21531c4 --- /dev/null +++ b/inference/models/configs/llama_68M.json @@ -0,0 +1,11 @@ +{ + "n_layers": 2, + "vocab_size": 32000, + "n_heads": 12, + "dim": 768, + "multiple_of": 256, + "norm_eps": 1e-06, + "total_requests": 2560, + "hidden_dim": 3072, + "incremental_mode": true +} \ No newline at end of file diff --git a/inference/utils/convert_llama_config.py b/inference/utils/convert_llama_config.py new file mode 100644 index 0000000000..dfae42f841 --- /dev/null +++ b/inference/utils/convert_llama_config.py @@ -0,0 +1,32 @@ +import argparse +import json + +def convert_json(input_file, output_file): + # Load the input JSON data from the file + with open(input_file, 'r') as file: + input_data = json.load(file) + + # Extract the required fields and create the output JSON object + output_data = { + "n_layers": input_data["num_hidden_layers"], + "vocab_size": input_data["vocab_size"], + "n_heads": input_data["num_attention_heads"], + "dim": input_data["hidden_size"], + "multiple_of": 256, + "norm_eps": input_data["rms_norm_eps"], + "total_requests": 2560, + "hidden_dim": input_data["intermediate_size"], + "incremental_mode": input_data["use_cache"] + } + + # Save the output JSON data to the file + with open(output_file, 'w') as file: + json.dump(output_data, file, indent=4) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert JSON file to a different format.") + parser.add_argument("input_file", help="Path to the input JSON file.") + parser.add_argument("output_file", help="Path to the output JSON file.") + args = parser.parse_args() + + convert_json(args.input_file, args.output_file) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index eea02601e8..189cf08a39 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1094,7 +1094,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS - 1) { break; } } @@ -1148,6 +1148,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, std::cout << "i = " << i << ", result index = " << result_index << ", value: " << result.token_ids[result_index] << "\n"; } + int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; int 
depth = old_bc.beamRequestsInfo[index].current_depth; From ba91733352a29cb47c912360c9e75f2c67154a97 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 2 Aug 2023 13:52:37 -0400 Subject: [PATCH 188/344] Python interface for inference (part 2) (#893) * add argmax, add default args to test file * updates * comment out print * updates * added code to get configs and weights from hf * added FileDataLoader to cffi * remove aggressive reformatting * update * fix * add code to load weights from python * fix half precision weight loading from python * fixed loading weights * fixed loading weights * checkpoint * generation from python now works * make it easier to set flags needed to run native python * downloading tokenizers from hf * add support for opt * implement falcon * add support for multiple prompts and prompts from json file * implement speculative inference * finished specinfer implementation * updated arguments parsing * remove unnecessary args from compile func * . * update interface examples * fix ssm bug * fix fusion-related bugs * standardize argument parsing in python examples * docstrings * update * moved c++ inference tests --- CMakeLists.txt | 3 + FlexFlow.mk | 5 +- INSTALL.md | 7 +- include/flexflow/ffconst.h | 2 +- include/flexflow/flexflow_c.h | 114 ++++++-- inference/file_loader.cc | 1 - inference/flexflow_inference.py | 43 --- inference/incr_decoding/incr_decoding.cc | 5 - inference/models/falcon.cc | 6 +- inference/models/opt.cc | 6 +- inference/python/incr_decoding.py | 118 ++++++++ inference/python/spec_infer.py | 168 +++++++++++ inference/utils/download_opt_weights.py | 42 ++- python/flexflow/core/flexflow_cffi.py | 332 ++++++++++++++++++--- python/flexflow/serve/__init__.py | 129 ++++++++- python/flexflow/serve/models/base.py | 39 +++ python/flexflow/serve/models/falcon.py | 183 +++++++++++- python/flexflow/serve/models/llama.py | 263 ++++++++++++----- python/flexflow/serve/models/opt.py | 285 +++++++++++++++++- python/flexflow/serve/serve.py | 353 ++++++++++++++++++++--- python/flexflow/type.py | 273 ++++++++++-------- python/flexflow_python_build.py | 31 +- src/c/flexflow_c.cc | 254 ++++++++++++---- src/ops/argmax.cc | 2 +- src/ops/fused.cu | 41 +++ src/runtime/model.cc | 4 - src/runtime/request_manager.cc | 23 +- tests/inference/cpp_inference_tests.sh | 275 ++++++++++++++++++ tests/inference_tests.sh | 270 +---------------- 29 files changed, 2541 insertions(+), 736 deletions(-) delete mode 100644 inference/flexflow_inference.py create mode 100644 inference/python/incr_decoding.py create mode 100644 inference/python/spec_infer.py create mode 100644 python/flexflow/serve/models/base.py create mode 100755 tests/inference/cpp_inference_tests.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index f7e58bf0aa..b35fb1613d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,11 +321,13 @@ list(APPEND FLEXFLOW_INCLUDE_DIRS file(GLOB_RECURSE FLEXFLOW_HDR LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/include/*.h) + list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) file(GLOB_RECURSE FLEXFLOW_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cc) list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") +list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) @@ -460,6 +462,7 @@ if (FF_USE_PYTHON) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. 
When building from pip, we install the FF_HOME/python/flexflow_python script instead. + # create set_python_envs.sh script to set up the environment variables to run flexflow_python if (NOT FF_BUILD_FROM_PYPI) add_custom_command(TARGET flexflow PRE_BUILD diff --git a/FlexFlow.mk b/FlexFlow.mk index 8dc6a017b4..14f32a7639 100644 --- a/FlexFlow.mk +++ b/FlexFlow.mk @@ -59,7 +59,8 @@ GEN_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cc')\ $(shell find $(FF_HOME)/src/runtime/ -name '*.cc')\ $(shell find $(FF_HOME)/src/utils/dot/ -name '*.cc')\ $(shell find $(FF_HOME)/src/dataloader/ -name '*.cc')\ - $(shell find $(FF_HOME)/src/c/ -name '*.cc') + $(shell find $(FF_HOME)/src/c/ -name '*.cc')\ + $(shell find $(FF_HOME)/inference/ -name 'file_loader.cc') GEN_SRC := $(filter-out $(FF_HOME)/src/runtime/cpp_driver.cc, $(GEN_SRC)) FF_CUDA_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cu')\ @@ -94,7 +95,7 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1) endif -INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src +INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 diff --git a/INSTALL.md b/INSTALL.md index d2e3c1d2f6..cdc2a2abbd 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -85,10 +85,11 @@ export FF_HOME=/path/to/FlexFlow ### Run FlexFlow Python examples The Python examples are in the [examples/python](https://github.com/flexflow/FlexFlow/tree/master/examples/python). The native, Keras integration and PyTorch integration examples are listed in `native`, `keras` and `pytorch` respectively. -To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the following flags: +To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. 
If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the required environment flags by running the following command (edit the path if your build folder is not named `build`): -* `export PYTHONPATH="${FF_HOME}/python:${FF_HOME}/build/deps/legion/bindings/python:${PYTHONPATH}"` -* `export LD_LIBRARY_PATH="${FF_HOME}/build:${FF_HOME}/build/deps/legion/lib:${LD_LIBRARY_PATH}"` +``` +source ./build/set_python_envs.sh +``` **We recommend that you run the** `mnist_mlp` **test under** `native` **using the following cmd to check if FlexFlow has been installed correctly:** diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 7521613477..170180aeaf 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -180,7 +180,7 @@ enum OperatorType { OP_INVALID, }; -enum ModelType { UNKNOWN, LLAMA, OPT, FALCON }; +enum ModelType { UNKNOWN = 3001, LLAMA = 3002, OPT = 3003, FALCON = 3004 }; enum PMParameter { PM_OP_TYPE, // AnyOp diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index f8f9b97aad..9983898130 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -53,6 +53,8 @@ FF_NEW_OPAQUE_TYPE(flexflow_tree_verify_batch_config_t); FF_NEW_OPAQUE_TYPE(flexflow_beam_search_batch_config_t); FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); +FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); +FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); // ----------------------------------------------------------------------- // FFConfig @@ -78,6 +80,21 @@ int flexflow_config_get_epochs(flexflow_config_t handle); bool flexflow_config_get_enable_control_replication(flexflow_config_t handle); +int flexflow_config_get_data_parallelism_degree(flexflow_config_t handle_); + +int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_); + +int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_); + +void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_, + int value); + +void flexflow_config_set_tensor_parallelism_degree(flexflow_config_t handle_, + int value); + +void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_, + int value); + int flexflow_config_get_python_data_loader_type(flexflow_config_t handle); // ----------------------------------------------------------------------- @@ -390,8 +407,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( bool bias, bool add_bias_kv, bool add_zero_attn, + enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, char const *name); flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( @@ -405,8 +426,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( bool bias, bool add_bias_kv, bool add_zero_attn, + enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, char const *name); flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( @@ -420,8 +445,27 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( bool bias, bool add_bias_kv, bool add_zero_attn, + enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + bool scaling_query, + float 
scaling_factor, + bool qk_prod_scaling, + char const *name); + +flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + enum DataType data_type, + flexflow_initializer_t kernel_initializer_, char const *name); flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, @@ -447,6 +491,11 @@ flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, float top_p, char const *name); +flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, + const flexflow_tensor_t input_, + bool beam_search, + char const *name); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); @@ -468,6 +517,10 @@ flexflow_perf_metrics_t void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); +flexflow_generation_result_t flexflow_model_generate(flexflow_model_t handle_, + char const *text, + int max_seq_length); + // ----------------------------------------------------------------------- // Tensor // ----------------------------------------------------------------------- @@ -809,44 +862,55 @@ void flexflow_beam_search_batch_config_destroy( // RequestManager // ----------------------------------------------------------------------- -flexflow_request_manager_t flexflow_request_manager_create(void); +flexflow_request_manager_t flexflow_request_manager_get_request_manager(void); + +// void flexflow_request_manager_destroy(flexflow_request_manager_t handle_); -void flexflow_request_manager_destroy(flexflow_request_manager_t handle); +void flexflow_request_manager_register_tokenizer( + flexflow_request_manager_t handle_, + enum ModelType model_type, + char const *tokenizer_filepath); -long unsigned int flexflow_request_manager_register_new_request( - flexflow_request_manager_t handle, - char const *prompt, - int max_sequence_length); +void flexflow_request_manager_register_output_filepath( + flexflow_request_manager_t handle_, char const *output_filepath); + +int flexflow_request_manager_register_ssm_model( + flexflow_request_manager_t handle_, flexflow_model_t model_handle_); // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- flexflow_inference_manager_t - flexflow_inference_manager_create(flexflow_config_t config_handle, - int max_num_tokens_per_batch); + flexflow_inference_manager_get_inference_manager(void); -void flexflow_inference_manager_destroy(flexflow_inference_manager_t handle); +// void flexflow_inference_manager_destroy(flexflow_inference_manager_t +// handle_); void flexflow_inference_manager_compile_model_and_allocate_buffer( - flexflow_inference_manager_t handle, flexflow_model_t model_handle); + flexflow_inference_manager_t handle_, flexflow_model_t model_handle); void flexflow_inference_manager_init_operators_inference( - flexflow_inference_manager_t handle, flexflow_model_t model_handle); - -void flexflow_inference_manager_incr_decoding_loop( - flexflow_inference_manager_t handle, - flexflow_model_t model_handle, - flexflow_request_manager_t rm_handle, - int total_num_requests); - -void flexflow_inference_manager_spec_inference_loop( - flexflow_inference_manager_t handle, - flexflow_model_t model_handle, - flexflow_request_manager_t rm_handle, - int total_num_requests, - int 
num_ssms, - int *ssm_model_ids); + flexflow_inference_manager_t handle_, flexflow_model_t model_handle); + +// ----------------------------------------------------------------------- +// FileDataLoader +// ----------------------------------------------------------------------- + +flexflow_file_data_loader_t + flexflow_file_data_loader_create(char const *weight_file_path, + int num_heads, + int hidden_dim, + int qkv_inner_dim); + +void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); + +void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, + flexflow_model_t model_handle_, + int num_layers, + char const **layer_names, + flexflow_op_t *layers, + bool use_full_precision); #ifdef __cplusplus } diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 071124fc0d..e89c3eb622 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -709,7 +709,6 @@ void FileDataLoader::load_weights( if (weight == NULL) { continue; } - switch (weight->data_type) { case DT_HALF: load_single_weight_tensor(ff, weight, i, v.first); diff --git a/inference/flexflow_inference.py b/inference/flexflow_inference.py deleted file mode 100644 index 6caace0f2d..0000000000 --- a/inference/flexflow_inference.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
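Aside (illustration): the flexflow_c.h additions above expose the inference path through the C API. Based only on the declarations in this hunk, a caller might use them as sketched below; the model and logits handles are assumed to come from earlier C API calls, the file paths are placeholders, and error handling is omitted.

// Hypothetical caller-side use of the new C bindings; not taken from the patch.
flexflow_request_manager_t rm = flexflow_request_manager_get_request_manager();
flexflow_request_manager_register_tokenizer(rm, LLAMA, "/path/to/tokenizer.model");
flexflow_request_manager_register_output_filepath(rm, "/path/to/output.txt");

flexflow_tensor_t out =
    flexflow_model_add_argmax(model, logits, /*beam_search=*/false, "argmax");

flexflow_generation_result_t result =
    flexflow_model_generate(model, "Here are some travel tips for Tokyo:\n", 128);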
- -from flexflow.serve import LLM, SamplingConfig -from flexflow.core import * - -def get_prompts(json_filepath): - json_obj = None - return json_obj - -def top_level_task(): - # Incremental decoding - llama = LLM("decapoda-research/llama-30b-hf", data_type = "half") - sampling_config = SamplingConfig(do_sample=False, temperature = 0.9, topp = 0.8, topk = 1) - llama.compile(InferenceMode.INC_DECODING_MODE, sampling_config, use_full_precision=False, max_batch_size = 1, max_seq_length = 256, max_tokens_per_batch=64, tensor_parallel_degree = 4, pipeline_parallel_degree = 2) - - prompts = llama.generate(prompts, sampling=sampling_config) - # result = llama.generate("What's the best xxx in yyy?", sampling = sampling_config) - # print(result) - - # # Speculative inference - # llama = LLM("decapoda-research/llama-30b-hf", data_type = "half") - # ssm1 = LLM("Jackfram/llama-160m", data_type = "half") - # ssm2 = LLM("facebook/opt-125m", data_type = "half") - # sampling_config = SamplingConfig(temperature = 0.9, topp = 0.8, topk = 1) - # llama.serve(max_batch_size = 1, max_seq_length = 256, max_tokens_per_batch=64, tensor_parallel_degree = 4, pipeline_parallel_degree = 2, ssms = {ssm1, ssm2}) - # result = llama.generate("What's the best xxx in yyy?", sampling = sampling_config) - # print(result) - -if __name__ == "__main__": - print("flexflow inference") - top_level_task() diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 957c41b103..4246a78824 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -151,11 +151,6 @@ void FlexFlow::top_level_task(Task const *task, RequestManager *rm = RequestManager::get_request_manager(); rm->register_tokenizer(model_type, file_paths.tokenizer_file_path); rm->register_output_filepath(file_paths.output_file_path); - // InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); - // RequestManager rm(model_type, - // file_paths.tokenizer_file_path, - // /*verbose*/ verbose, - // file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index d0ec83508b..2d79040f5f 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -62,11 +62,7 @@ void FALCON::create_falcon_model(FFModel &ff, Layer *embedding = ff.layers.back(); weights_layers.emplace("tok_embeddings_weight", embedding); - int num_transformer_layers = falcon_config.n_layers; - int num_transformer_layers_per_stage = - (num_transformer_layers + num_pipeline_stages - 1) / num_pipeline_stages; - - for (int i = 0; i < num_transformer_layers; i++) { + for (int i = 0; i < falcon_config.n_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); // step 1: attention diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 2cdffe2715..68b931716f 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -119,7 +119,7 @@ void OPT::create_opt_model(FFModel &ff, NULL, false, /*scaling query*/ true, - /*sacling factor*/ + /*scaling factor*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*qk_prod_scaling*/ false); @@ -140,7 +140,7 @@ void OPT::create_opt_model(FFModel &ff, NULL, false, /*scaling query*/ true, - /*sacling factor*/ + /*scaling factor*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*qk_prod_scaling*/ false); @@ -161,7 +161,7 @@ void OPT::create_opt_model(FFModel &ff, NULL, false, 
/*scaling query*/ true, - /*sacling factor*/ + /*scaling factor*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*qk_prod_scaling*/ false); diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py new file mode 100644 index 0000000000..6db7d09c56 --- /dev/null +++ b/inference/python/incr_decoding.py @@ -0,0 +1,118 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_gpu": 30000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "decapoda-research/llama-7b-hf", + # optional parameters + "llm_weight": "", + "llm_tokenizer": "", + "clean_model_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + tokenizer_path=configs.llm_tokenizer, + weights_path=configs.llm_weight, + clean_cache=configs.clean_model_cache, + output_file=configs.output_file, + ) + + # Compile the LLM for inference and load the weights into memory + sampling_config = ff.SamplingConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + ff.InferenceMode.INC_DECODING_MODE, + sampling_config, + max_batch_size=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + # Generation begins! 
+ if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + results = llm.generate(prompts) + else: + result = llm.generate("Here are some travel tips for Tokyo:\n") + + +if __name__ == "__main__": + print("flexflow inference example (incremental decoding)") + main() diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py new file mode 100644 index 0000000000..7a0cd1dc64 --- /dev/null +++ b/inference/python/spec_infer.py @@ -0,0 +1,168 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_gpu": 30000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "decapoda-research/llama-7b-hf", + # optional llm parameters + "llm_weight": "", + "llm_tokenizer": "", + "clean_model_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "ssm_weight": "", + "ssm_tokenizer": "", + "clean_model_cache": False, + "full_precision": False, + }, + { + # required ssm parameter + "ssm_model": "facebook/opt-125m", + # optional ssm parameters + "ssm_weight": "", + "ssm_tokenizer": "", + "clean_model_cache": False, + "full_precision": False, + }, + ], + "prompt": "../prompt/test.json", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + tokenizer_path=configs.llm_tokenizer, + weights_path=configs.llm_weight, + clean_cache=configs.clean_model_cache, + output_file=configs.output_file, + ) + + # Create the SSMs + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + tokenizer_path=ssm_config.ssm_tokenizer, + weights_path=ssm_config.ssm_weight, + clean_cache=ssm_config.clean_model_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + + # Create the sampling configs + sampling_config = ff.SamplingConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + + # Compile the SSMs for inference and load the weights into memory + for ssm in ssms: + ssm.compile( + ff.InferenceMode.BEAM_SEARCH_MODE, + sampling_config, + max_batch_size=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + # Compile the LLM for inference and load the weights into memory + llm.compile( + ff.InferenceMode.TREE_VERIFY_MODE, + sampling_config, + max_batch_size=1, + max_seq_length=256, + max_tokens_per_batch=64, + ssms=ssms, + ) + + # Generation begins! + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + results = llm.generate(prompts) + else: + result = llm.generate("Here are some travel tips for Tokyo:\n") + + +if __name__ == "__main__": + print("flexflow inference example (speculative inference)") + main() diff --git a/inference/utils/download_opt_weights.py b/inference/utils/download_opt_weights.py index 747d471d1a..c3707df304 100644 --- a/inference/utils/download_opt_weights.py +++ b/inference/utils/download_opt_weights.py @@ -8,10 +8,13 @@ # You can pass the --use-full-precision flag to use the full-precision weight. By default, we use half precision. 
parser = argparse.ArgumentParser() -parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") +parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" +) args = parser.parse_args() if not args.use_full_precision: import torch + torch.set_default_tensor_type(torch.HalfTensor) # Change working dir to folder storing this script @@ -19,6 +22,7 @@ dname = os.path.dirname(abspath) os.chdir(dname) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): @@ -34,29 +38,41 @@ def convert_hf_model(model, dst_folder): ) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights - shutil.copy(os.path.join(dst_folder, "embed_tokens_weight"), os.path.join(dst_folder, "embed_tokens_weight_lm_head")) + shutil.copy( + os.path.join(dst_folder, "embed_tokens_weight"), + os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + ) + # Download and convert big model weights model = AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b") -dst_folder="../weights/opt_6B_weights" if args.use_full_precision else "../weights/opt_6B_weights_half" +dst_folder = ( + "../weights/opt_6B_weights" + if args.use_full_precision + else "../weights/opt_6B_weights_half" +) convert_hf_model(model, dst_folder) # Download and convert small model weights model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") -dst_folder="../weights/opt_125M_weights" if args.use_full_precision else "../weights/opt_125M_weights_half" +dst_folder = ( + "../weights/opt_125M_weights" + if args.use_full_precision + else "../weights/opt_125M_weights_half" +) convert_hf_model(model, dst_folder) # Download tokenizer files os.makedirs("../tokenizer", exist_ok=True) -tokenizer_filepath = '../tokenizer/gpt2-vocab.json' -url = 'https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json' +tokenizer_filepath = "../tokenizer/vocab.json" +url = "https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json" r = requests.get(url) -open(tokenizer_filepath , 'wb').write(r.content) -tokenizer_filepath = '../tokenizer/gpt2-merges.txt' -url = 'https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt' +open(tokenizer_filepath, "wb").write(r.content) +tokenizer_filepath = "../tokenizer/merges.txt" +url = "https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt" r = requests.get(url) -open(tokenizer_filepath , 'wb').write(r.content) -tokenizer_filepath = '../tokenizer/added_tokens.json' -url = 'https://huggingface.co/truongpdd/vietnews-gpt2/raw/main/added_tokens.json' +open(tokenizer_filepath, "wb").write(r.content) +tokenizer_filepath = "../tokenizer/special_tokens_map.json" +url = "https://huggingface.co/truongpdd/vietnews-gpt2/raw/main/added_tokens.json" r = requests.get(url) -open(tokenizer_filepath , 'wb').write(r.content) \ No newline at end of file +open(tokenizer_filepath, "wb").write(r.content) diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 52ae0d9ef9..db36090587 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -22,7 +22,7 @@ import warnings import numpy as np from .flexflow_logger import fflogger -from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, InferenceMode, 
OpType, ParameterSyncType, enum_to_int, int_to_enum +from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, InferenceMode, ModelType, OpType, ParameterSyncType, enum_to_int, int_to_enum _FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS")) if not _FF_BUILD_DOCS: from .flexflowlib import ffi, flexflow_library @@ -39,6 +39,8 @@ def get_c_name(name): return ffi.new("char[]", name.encode('ascii')) def get_datatype_size(datatype): + if (datatype == DataType.DT_HALF): + return 2 if (datatype == DataType.DT_FLOAT): return 4 elif (datatype == DataType.DT_DOUBLE): @@ -435,6 +437,27 @@ class IncMultiHeadAttention(Op): def __init__(self, handle, idx=None, name=None): super(IncMultiHeadAttention, self).__init__(handle, idx, name) +# ----------------------------------------------------------------------- +# Speculative Incremental MultiHeadAttention +# ----------------------------------------------------------------------- +class SpecIncMultiHeadSelfAttention(Op): + def __init__(self, handle, idx=None, name=None): + super(SpecIncMultiHeadSelfAttention, self).__init__(handle, idx, name) + +# ----------------------------------------------------------------------- +# TreeVerify Incremental MultiHeadAttention +# ----------------------------------------------------------------------- +class TreeIncMultiHeadSelfAttention(Op): + def __init__(self, handle, idx=None, name=None): + super(TreeIncMultiHeadSelfAttention, self).__init__(handle, idx, name) + +# ----------------------------------------------------------------------- +# Multi-query Incremental MultiHeadAttention +# ----------------------------------------------------------------------- +class IncMultiQuerySelfAttention(Op): + def __init__(self, handle, idx=None, name=None): + super(IncMultiQuerySelfAttention, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # RMS Norm # ----------------------------------------------------------------------- @@ -463,6 +486,13 @@ class Sampling(Op): def __init__(self, handle, idx=None, name=None): super(Sampling, self).__init__(handle, idx, name) +# ----------------------------------------------------------------------- +# ArgMax +# ----------------------------------------------------------------------- +class ArgMax(Op): + def __init__(self, handle, idx=None, name=None): + super(ArgMax, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # flexflow_op_t handle to Op # ----------------------------------------------------------------------- @@ -545,6 +575,12 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): return MultiHeadAttention(handle, idx, name) elif op_type == OpType.INC_MULTIHEAD_ATTENTION: return IncMultiHeadAttention(handle, idx, name) + elif op_type == OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION: + return SpecIncMultiHeadSelfAttention(handle, idx, name) + elif op_type == OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION: + return TreeIncMultiHeadSelfAttention(handle, idx, name) + elif op_type == OpType.INC_MULTIQUERY_SELF_ATTENTION: + return IncMultiQuerySelfAttention(handle, idx, name) elif op_type == OpType.RMS_NORM: return RMSNorm(handle, idx, name) elif op_type == OpType.ARG_TOPK: @@ -553,6 +589,8 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): return BeamTopK(handle, idx, name) elif op_type == OpType.SAMPLING: return Sampling(handle, idx, name) + elif op_type == 
OpType.ARGMAX: + return ArgMax(handle, idx, name) elif op_type == OpType.RSQRT: return Rsqrt(handle, idx, name) elif op_type == OpType.POW: @@ -598,6 +636,42 @@ def epochs(self): @property def enable_control_replication(self): return ffc.flexflow_config_get_enable_control_replication(self.handle) + + @property + def data_parallelism_degree(self): + return ffc.flexflow_config_get_data_parallelism_degree(self.handle) + + @data_parallelism_degree.setter + def data_parallelism_degree(self, value): + if type(value) is not int: + raise ValueError("The data parallelism degree must be specified as an integer number") + elif value < 1: + raise ValueError("The data parallelism degree cannot be lower than 1") + ffc.flexflow_config_set_data_parallelism_degree(self.handle, value) + + @property + def tensor_parallelism_degree(self): + return ffc.flexflow_config_get_tensor_parallelism_degree(self.handle) + + @tensor_parallelism_degree.setter + def tensor_parallelism_degree(self, value): + if type(value) is not int: + raise ValueError("The tensor parallelism degree must be specified as an integer number") + elif value < 1: + raise ValueError("The tensor parallelism degree cannot be lower than 1") + ffc.flexflow_config_set_tensor_parallelism_degree(self.handle, value) + + @property + def pipeline_parallelism_degree(self): + return ffc.flexflow_config_get_pipeline_parallelism_degree(self.handle) + + @pipeline_parallelism_degree.setter + def pipeline_parallelism_degree(self, value): + if type(value) is not int: + raise ValueError("The pipeline parallelism degree must be specified as an integer number") + elif value < 1: + raise ValueError("The pipeline parallelism degree cannot be lower than 1") + ffc.flexflow_config_set_pipeline_parallelism_degree(self.handle, value) @property def python_data_loader_type(self): @@ -715,7 +789,11 @@ def set_tensor(self, ffmodel, np_array): assert np_shape[i] == self.dims[i], "please check shape dim %d (%d == %d)" %(i, np_shape[i], self.dims[i]) c_dims = ffi.new("int[]", self.dims) np_raw_ptr = np_array.__array_interface__['data'] - if np_array.dtype == np.float32: + if np_array.dtype == np.float16: + assert self.data_type == DataType.DT_HALF, "Wrong datatype" + raw_ptr = ffi.cast("half*", np_raw_ptr[0]) + ret_val = ffc.flexflow_tensor_set_tensor_float(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) + elif np_array.dtype == np.float32: assert self.data_type == DataType.DT_FLOAT, "Wrong datatype" raw_ptr = ffi.cast("float*", np_raw_ptr[0]) ret_val = ffc.flexflow_tensor_set_tensor_float(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) @@ -730,7 +808,9 @@ def set_tensor(self, ffmodel, np_array): def get_tensor(self, ffmodel): shape = self.dims - if self.data_type == DataType.DT_FLOAT: + if self.data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif self.data_type == DataType.DT_FLOAT: np_array = np.empty(shape, dtype=np.float32) elif self.data_type == DataType.DT_INT32: np_array = np.empty(shape, dtype=np.int32) @@ -754,7 +834,9 @@ def get_tensor(self, ffmodel): def get_gradients(self, ffmodel, comm_type): shape = self.dims - if self.data_type == DataType.DT_FLOAT: + if self.data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif self.data_type == DataType.DT_FLOAT: np_array = np.empty(shape, dtype=np.float32) elif self.data_type == DataType.DT_INT32: np_array = np.empty(shape, dtype=np.int32) @@ -779,7 +861,9 @@ def get_gradients(self, ffmodel, comm_type): def get_model_output_gradients(self, ffmodel, 
comm_type): shape = self.dims - if self.data_type == DataType.DT_FLOAT: + if self.data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif self.data_type == DataType.DT_FLOAT: np_array = np.empty(shape, dtype=np.float32) elif self.data_type == DataType.DT_INT32: np_array = np.empty(shape, dtype=np.int32) @@ -800,7 +884,9 @@ def get_model_output_gradients(self, ffmodel, comm_type): def get_model_output_tensor(self, ffmodel): shape = self.dims - if self.data_type == DataType.DT_FLOAT: + if self.data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif self.data_type == DataType.DT_FLOAT: np_array = np.empty(shape, dtype=np.float32) elif self.data_type == DataType.DT_INT32: np_array = np.empty(shape, dtype=np.int32) @@ -820,7 +906,9 @@ def get_model_output_tensor(self, ffmodel): def __get_raw_ptr(self, ffmodel, ffconfig, data_type): assert data_type == self.data_type, "Tensor check data type" - if (data_type == DataType.DT_FLOAT): + if (data_type == DataType.DT_HALF): + return ffc.flexflow_tensor_get_raw_ptr_float(self.handle, ffmodel.handle, ffconfig.handle) + elif (data_type == DataType.DT_FLOAT): return ffc.flexflow_tensor_get_raw_ptr_float(self.handle, ffmodel.handle, ffconfig.handle) elif (data_type == DataType.DT_INT32): return ffc.flexflow_tensor_get_raw_ptr_int32(self.handle, ffmodel.handle, ffconfig.handle) @@ -1520,7 +1608,7 @@ def batch_matmul(self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name= def dense(self, input, out_dim, activation=ActiMode.AC_MODE_NONE, use_bias=True, - datatype=DataType.DT_FLOAT, + datatype=DataType.DT_NONE, shared_op=None, kernel_initializer=None, bias_initializer=None, kernel_regularizer=None, name=None): @@ -2019,10 +2107,12 @@ def multihead_attention(self, query, key, value, return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) def inc_multihead_attention(self, input, - embed_dim, num_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - kernel_initializer=None, apply_rotary_embedding=False, name=None): + embed_dim, num_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + data_type=DataType.DT_NONE, kernel_initializer=None, + apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, + qk_prod_scaling=True, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, and returns the dot-product attention between them:. @@ -2053,12 +2143,24 @@ def inc_multihead_attention(self, input, :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. 
+ :type qk_prod_scaling: bool :param name: the name of the layer. Default is None. :type name: string @@ -2067,15 +2169,18 @@ def inc_multihead_attention(self, input, """ c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc.flexflow_model_add_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, apply_rotary_embedding, c_name) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc.flexflow_model_add_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) def spec_inc_multihead_attention(self, input, - embed_dim, num_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - kernel_initializer=None, apply_rotary_embedding=False, name=None): + embed_dim, num_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + data_type=DataType.DT_NONE, kernel_initializer=None, + apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, + qk_prod_scaling=True, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, and returns the dot-product attention between them:. @@ -2106,12 +2211,24 @@ def spec_inc_multihead_attention(self, input, :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool :param name: the name of the layer. Default is None. 
:type name: string @@ -2120,15 +2237,18 @@ def spec_inc_multihead_attention(self, input, """ c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc.flexflow_model_add_spec_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, apply_rotary_embedding, c_name) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc.flexflow_model_add_spec_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) def inc_multihead_self_attention_verify(self, input, - embed_dim, num_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - kernel_initializer=None, apply_rotary_embedding=False, name=None): + embed_dim, num_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + data_type=DataType.DT_NONE, kernel_initializer=None, + apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, + qk_prod_scaling=True, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, and returns the dot-product attention between them:. @@ -2159,12 +2279,24 @@ def inc_multihead_self_attention_verify(self, input, :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool :param name: the name of the layer. Default is None. 
:type name: string @@ -2173,10 +2305,64 @@ def inc_multihead_self_attention_verify(self, input, """ c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc.flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, apply_rotary_embedding, c_name) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc.flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + def inc_multiquery_self_attention(self, input, + embed_dim, num_heads, + kdim=0, vdim=0, dropout=0.0, + bias=False, add_bias_kv=False, add_zero_attn=False, + data_type=DataType.DT_NONE, kernel_initializer=None, + name=None): + """Defines the Multi-query self attention operation + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc.flexflow_model_add_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, c_name) + self.add_layer(OpType.INC_MULTIQUERY_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIQUERY_SELF_ATTENTION) + def rms_norm(self, input, eps, dim, name=None): """Defines the RMS Norm layer. @@ -2261,6 +2447,25 @@ def sampling(self, input, top_p, name=None): handle = ffc.flexflow_model_add_sampling(self.handle, input.handle, top_p, c_name) self.add_layer(OpType.SAMPLING, name) return Tensor(handle, owner_op_type=OpType.SAMPLING) + + def argmax(self, input, beam_search, name=None): + """Defines the Sampling layer. + + :param input: the input Tensor. 
+ :type input: Tensor + + :param beam_search: Whether you need to perform beam search + :type beam_search: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc.flexflow_model_add_argmax(self.handle, input.handle, beam_search, c_name) + self.add_layer(OpType.ARGMAX, name) + return Tensor(handle, owner_op_type=OpType.ARGMAX) def reset_metrics(self): """Reset performance metrics. @@ -2514,7 +2719,9 @@ def __create_data_loader_attach(self, batch_tensor, full_array): full_array_shape = full_array.shape num_samples = full_array_shape[0] num_dim = len(full_array_shape) - if (full_array.dtype == "float32"): + if (full_array.dtype == "float16"): + datatype = DataType.DT_HALF + elif (full_array.dtype == "float32"): datatype = DataType.DT_FLOAT elif (full_array.dtype == "int32"): datatype = DataType.DT_INT32 @@ -2541,7 +2748,9 @@ def __create_data_loader_attach(self, batch_tensor, full_array): def __create_data_loader_ptr(self, batch_tensor, full_array): full_array_shape = full_array.shape num_samples = full_array_shape[0] - if (full_array.dtype == "float32"): + if (full_array.dtype == "float16"): + datatype = DataType.DT_HALF + elif (full_array.dtype == "float32"): datatype = DataType.DT_FLOAT elif (full_array.dtype == "int32"): datatype = DataType.DT_INT32 @@ -2574,7 +2783,9 @@ def __get_op_handle(self, shared_op): def get_output_tensor(self, ffmodel, data_type): shape = self.dims - if data_type == DataType.DT_FLOAT: + if data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif data_type == DataType.DT_FLOAT: np_array = np.empty(shape, dtype=np.float32) elif self.data_type == DataType.DT_INT32: np_array = np.empty(shape, dtype=np.int32) @@ -2595,6 +2806,10 @@ def get_output_tensor(self, ffmodel, data_type): fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) assert ret_val == True return np_array + + def generate(self, text, max_sequence_length): + c_text = get_c_name(text) + return ffc.flexflow_model_generate(self.handle, c_text, max_sequence_length) # ----------------------------------------------------------------------- # SGDOptimizer @@ -2791,7 +3006,9 @@ class RegionNdarray(object): __slots__ = ['__array_interface__'] def __init__(self, shape, data_type, base_ptr, strides, read_only): # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html - if (data_type == DataType.DT_FLOAT): + if (data_type == DataType.DT_HALF): + field_type = " 0: + print(f"Using tokenizer from {self.tokenizer_path}") + # check that tokenizer exist + if not os.path.exists(self.tokenizer_path): + raise FileNotFoundError(f"Path {self.tokenizer_path} does not exist") + elif ( + os.path.isdir(self.tokenizer_path) + and len(os.listdir(self.tokenizer_path)) == 0 + ): + raise FileNotFoundError(f"Folder {self.tokenizer_path} is empty") + return + + # Download tokenizer + + # Use local cache, or download new version + self.tokenizer_path = os.path.expanduser( + f"~/.cache/flexflow/tokenizers/{self.hf_config._name_or_path}/" + ) + if self.clean_cache: + print( + f"Discarding cached tokenizer files (if they exist) for model {self.hf_config._name_or_path}..." 
+ ) + if os.path.exists(self.tokenizer_path): + shutil.rmtree(self.tokenizer_path) + if not os.path.exists(self.tokenizer_path): + print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") + os.makedirs(self.tokenizer_path, exist_ok=True) + + # Get local revision SHA, check if it matches latest one on huggingface + local_revision = None + local_revision_file = os.path.join(self.tokenizer_path, "rev_sha.txt") + if os.path.exists(local_revision_file): + local_revision = "".join(open(local_revision_file).read().split()) + hf_api = HfApi() + latest_revision = hf_api.model_info(self.hf_config._name_or_path).sha + + # Download if needed + if local_revision != latest_revision: + print( + f"'{self.hf_config._name_or_path}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." + ) + if self.model_type == ModelType.LLAMA: + hf_tokenizer = LlamaTokenizer.from_pretrained( + self.hf_config._name_or_path, use_fast=True + ) + else: + hf_tokenizer = AutoTokenizer.from_pretrained( + self.hf_config._name_or_path + ) + hf_tokenizer.save_pretrained(self.tokenizer_path) + print("Done downloading HF tokenizer.") + with open(local_revision_file, "w+") as f: + f.write(latest_revision) + print("Loading the tokenizer...") + else: + print( + f"Loading '{self.hf_config._name_or_path}' tokenizer from the cache..." + ) + + def __load_hf_weights(self): + print("Loading hf weights...") + + if self.data_type == DataType.DT_HALF: + torch.set_default_tensor_type(torch.HalfTensor) + + if len(self.weights_path) > 0: + print(f"Using weights from {self.weights_path}") + # check that weights exist + if not os.path.exists(self.weights_path) or not os.path.isdir( + self.weights_path + ): + raise FileNotFoundError( + f"Path {self.weights_path} does not exist or is not a directory" + ) + elif len(os.listdir(self.weights_path)) == 0: + raise FileNotFoundError(f"Folder {self.weights_path} is empty") + else: + self.__download_hf_weights() + + # Create file data loader, load weights into tensors + self.fileloader = FileDataLoader( + self.weights_path, + self.hf_config.num_attention_heads, + self.hf_config.hidden_size, + self.hf_config.hidden_size // self.hf_config.num_attention_heads, + ) + + model_layers_with_weights = self.model.get_layers_with_weights() + self.fileloader.load_weights( + self.model.ffmodel, model_layers_with_weights, self.data_type + ) + def compile( self, - mode = InferenceMode.INC_DECODING_MODE, - sampling_config = SamplingConfig(), - use_full_precision = False, - max_batch_size=1, - max_seq_length=256, - max_tokens_per_batch=64, - tensor_parallel_degree=4, - pipeline_parallel_degree=2, - ssms=[], + mode: InferenceMode = InferenceMode.INC_DECODING_MODE, + sampling_config: SamplingConfig = SamplingConfig(), + max_batch_size: int = 1, + max_seq_length: int = 256, + max_tokens_per_batch: int = 64, + ssms: list = [], ): + """Compile the LLM for inference and load the weights into memory + + :param mode: The LLM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE + :type mode: InferenceMode, optional + :param sampling_config: The SamplingConfig object with the configurations to use for sampling, defaults to SamplingConfig() + :type sampling_config: SamplingConfig, optional + :param max_batch_size: The maximum batch size to allow, defaults to 1 + :type max_batch_size: int, optional + :param 
max_seq_length: The maximum sequence length to allow per batch, defaults to 256 + :type max_seq_length: int, optional + :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 + :type max_tokens_per_batch: int, optional + :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] + :type ssms: list, optional + """ self.max_batch_size = max_batch_size self.max_seq_length = max_seq_length self.max_tokens_per_batch = max_tokens_per_batch - self.tensor_parallel_degree = tensor_parallel_degree - self.pipeline_parallel_degree = pipeline_parallel_degree self.ssms = ssms self.sampling_config = SamplingConfig() - assert((mode == InferenceMode.INC_DECODING_MODE or mode == InferenceMode.BEAM_SEARCH_MODE) == (len(ssms) == 0)) - - # Create model - self.model = self.model_type(mode, sampling_config, self.ffconfig, max_batch_size, max_seq_length, max_tokens_per_batch, use_full_precision) + assert ( + mode == InferenceMode.INC_DECODING_MODE + or mode == InferenceMode.BEAM_SEARCH_MODE + ) == (len(ssms) == 0) + + # Instantiate the relevant model + self.model = self.model_class( + mode, + sampling_config, + self.ffconfig, + self.hf_config, + self.data_type, + max_batch_size, + max_seq_length, + max_tokens_per_batch, + ) # Create inference manager - self.im = InferenceManager(self.ffconfig, max_tokens_per_batch) + self.im = InferenceManager() + self.im.compile_model_and_allocate_buffer(self.model.ffmodel) + + # Download the weights and tokenizer from huggingface (if needed) and load them + self.__load_hf_weights() + self.__load_hf_tokenizer() # Create request manager self.rm = RequestManager() - - assert False and "Not implemented yet" + self.rm.register_tokenizer(self.model_type, self.tokenizer_path) + self.rm.register_output_filepath(self.output_file) + + self.im.init_operators_inference(self.model.ffmodel) + + for ssm in self.ssms: + self.rm.register_ssm_model(ssm.model.ffmodel) + + def generate(self, prompts: Union[str, List[str]]): + """Generate tokens based on the input prompt(s) + + :param prompts: The generation prompt(s) in the form of a string, or list of strings + :type prompts: Union[str, List[str]] + :return: the generation results + :rtype: GenerationResult + """ + if type(prompts) == str: + if len(prompts) == 0: + return None + return self.model.ffmodel.generate(prompts, 128) + elif type(prompts) == list: + if len(prompts) == 0: + return [] + return [self.model.ffmodel.generate(prompt, 128) for prompt in prompts] + else: + assert False, "Please pass a non-empty string or list of strings" + + +class SSM(LLM): + """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" + + def __init__( + self, + model_name: str, + data_type: DataType = DataType.DT_HALF, + tokenizer_path: str = "", + weights_path: str = "", + clean_cache: bool = False, + output_file: str = "", + ): + """Create the SSM object - def generate(self, prompt, sampling=None): - self.sampling = sampling if sampling is not None else self.default_config - assert False and "Not implemented yet" + :param model_name: The name of the HuggingFace model to use. E.g. 'decapoda-research/llama-7b-hf' + :type model_name: str + :param data_type: The data type to use for the tensors (e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF + :type data_type: DataType, optional + :param tokenizer_path: Path to the tokenizer file or folder for the LLM. 
If left blank, FlexFlow will download (and cache) the relevant tokenizer from HuggingFace, defaults to "" + :type tokenizer_path: str, optional + :param weights_path: Path to the weights for the LLM. If left blank, FlexFlow will download (and cache) the weights from HuggingFace, defaults to "" + :type weights_path: str, optional + :param clean_cache: Use this flag to discard previous weights/tokenizer cache for this LLM, defaults to False + :type clean_cache: bool, optional + :param output_file: Path to the output file. If left blank, the output will not be written to file, defaults to "" + :type output_file: str, optional + """ + super().__init__( + model_name, + data_type, + tokenizer_path, + weights_path, + clean_cache, + output_file, + ) + self.ffconfig.data_parallelism_degree = 1 + self.ffconfig.tensor_parallelism_degree = 1 + self.ffconfig.pipeline_parallelism_degree = 1 diff --git a/python/flexflow/type.py b/python/flexflow/type.py index dd1d40baf0..94a0b6085c 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -2,156 +2,179 @@ from enum import Enum + class ActiMode(Enum): - AC_MODE_NONE = 10 - AC_MODE_RELU = 11 - AC_MODE_SIGMOID = 12 - AC_MODE_TANH = 13 - AC_MODE_GELU = 14 + AC_MODE_NONE = 10 + AC_MODE_RELU = 11 + AC_MODE_SIGMOID = 12 + AC_MODE_TANH = 13 + AC_MODE_GELU = 14 + class RegularizerMode(Enum): - REG_MODE_NONE = 17 - REG_MODE_L1 = 18 - REG_MODE_L2 = 19 + REG_MODE_NONE = 17 + REG_MODE_L1 = 18 + REG_MODE_L2 = 19 + class AggrMode(Enum): - AGGR_MODE_NONE = 20 - AGGR_MODE_SUM = 21 - AGGR_MODE_AVG = 22 + AGGR_MODE_NONE = 20 + AGGR_MODE_SUM = 21 + AGGR_MODE_AVG = 22 + class PoolType(Enum): - POOL_MAX = 30 - POOL_AVG = 31 + POOL_MAX = 30 + POOL_AVG = 31 + class DataType(Enum): - DT_BOOLEAN = 40 - DT_INT32 = 41 - DT_INT64 = 42 - DT_HALF = 43 - DT_FLOAT = 44 - DT_DOUBLE = 45 - DT_NONE = 49 + DT_BOOLEAN = 40 + DT_INT32 = 41 + DT_INT64 = 42 + DT_HALF = 43 + DT_FLOAT = 44 + DT_DOUBLE = 45 + DT_NONE = 49 + class LossType(Enum): - LOSS_CATEGORICAL_CROSSENTROPY = 50 - LOSS_SPARSE_CATEGORICAL_CROSSENTROPY = 51 - LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE = 52 - LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE = 53 - LOSS_IDENTITY = 54 + LOSS_CATEGORICAL_CROSSENTROPY = 50 + LOSS_SPARSE_CATEGORICAL_CROSSENTROPY = 51 + LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE = 52 + LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE = 53 + LOSS_IDENTITY = 54 + class CompMode(Enum): - TRAINING = 70 - INFERENCE = 71 - + TRAINING = 70 + INFERENCE = 71 + + class ParameterSyncType(Enum): - NONE = 80 - PS = 81 - NCCL = 82 - + NONE = 80 + PS = 81 + NCCL = 82 + + class MetricsType(Enum): - METRICS_ACCURACY = 1001 - METRICS_CATEGORICAL_CROSSENTROPY = 1002 - METRICS_SPARSE_CATEGORICAL_CROSSENTROPY = 1004 - METRICS_MEAN_SQUARED_ERROR = 1008 - METRICS_ROOT_MEAN_SQUARED_ERROR = 1016 - METRICS_MEAN_ABSOLUTE_ERROR=1032 + METRICS_ACCURACY = 1001 + METRICS_CATEGORICAL_CROSSENTROPY = 1002 + METRICS_SPARSE_CATEGORICAL_CROSSENTROPY = 1004 + METRICS_MEAN_SQUARED_ERROR = 1008 + METRICS_ROOT_MEAN_SQUARED_ERROR = 1016 + METRICS_MEAN_ABSOLUTE_ERROR = 1032 + class InferenceMode(Enum): - INC_DECODING_MODE = 2001 - BEAM_SEARCH_MODE = 2002 - TREE_VERIFY_MODE = 2003 + INC_DECODING_MODE = 2001 + BEAM_SEARCH_MODE = 2002 + TREE_VERIFY_MODE = 2003 + + +class ModelType(Enum): + UNKNOWN = 3001 + LLAMA = 3002 + OPT = 3003 + FALCON = 3004 + class OpType(Enum): - CONV2D = 2011 - EMBEDDING = 2012 - POOL2D = 2013 - LINEAR = 2014 - SOFTMAX = 2015 - CONCAT = 2016 - FLAT = 2017 - MSELOSS = 2020 - BATCH_NORM = 2021 - RELU = 2022 - SIGMOID = 2023 - TANH = 2024 - ELU = 2025 - 
DROPOUT = 2026 - BATCH_MATMUL = 2027 - SPLIT = 2028 - RESHAPE = 2029 - TRANSPOSE = 2030 - REVERSE = 2031 - EXP = 2040 - ADD = 2041 - SUBTRACT = 2042 - MULTIPLY = 2043 - DIVIDE = 2044 - POW = 2045 - MEAN = 2046 - RSQRT = 2047 - SIN = 2048 - COS = 2049 - INPUT = 2050 - OUTPUT = 2051 - REDUCE_SUM = 2052 - MAX = 2053 - MIN = 2054 - MULTIHEAD_ATTENTION = 2060 - INC_MULTIHEAD_ATTENTION = 2061 - SPEC_INC_MULTIHEAD_SELF_ATTENTION = 2062 - TREE_INC_MULTIHEAD_SELF_ATTENTION = 2063 - INC_MULTIQUERY_SELF_ATTENTION = 2064 - SAMPLING = 2065 - GETITEM = 2070 - GETATTR = 2080 - EXPAND = 2081 - LAYER_NORM = 2082 - FLOOR_DIVIDE = 2083 - IDENTITY = 2084 - GELU = 2085 - PERMUTE = 2086 - SCALAR_MULTIPLY = 2087 - SCALAR_FLOORDIV = 2088 - SCALAR_ADD = 2089 - SCALAR_SUB = 2090 - SCALAR_TRUEDIV = 2091 - INIT_PARAM = 2092 - FLOAT = 2100 - CONTIGUOUS = 2101 - TO = 2102 - UNSQUEEZE = 2103 - TYPE_AS = 2104 - VIEW = 2105 - GATHER = 2106 - ATTRIBUTE = 2200 - RMS_NORM = 2300 - ARG_TOPK = 2301 - BEAM_TOPK = 2302 + CONV2D = 2011 + EMBEDDING = 2012 + POOL2D = 2013 + LINEAR = 2014 + SOFTMAX = 2015 + CONCAT = 2016 + FLAT = 2017 + MSELOSS = 2020 + BATCH_NORM = 2021 + RELU = 2022 + SIGMOID = 2023 + TANH = 2024 + ELU = 2025 + DROPOUT = 2026 + BATCH_MATMUL = 2027 + SPLIT = 2028 + RESHAPE = 2029 + TRANSPOSE = 2030 + REVERSE = 2031 + EXP = 2040 + ADD = 2041 + SUBTRACT = 2042 + MULTIPLY = 2043 + DIVIDE = 2044 + POW = 2045 + MEAN = 2046 + RSQRT = 2047 + SIN = 2048 + COS = 2049 + INPUT = 2050 + OUTPUT = 2051 + REDUCE_SUM = 2052 + MAX = 2053 + MIN = 2054 + MULTIHEAD_ATTENTION = 2060 + INC_MULTIHEAD_ATTENTION = 2061 + SPEC_INC_MULTIHEAD_SELF_ATTENTION = 2062 + TREE_INC_MULTIHEAD_SELF_ATTENTION = 2063 + INC_MULTIQUERY_SELF_ATTENTION = 2064 + SAMPLING = 2065 + ARGMAX = 2066 + GETITEM = 2070 + GETATTR = 2080 + EXPAND = 2081 + LAYER_NORM = 2082 + FLOOR_DIVIDE = 2083 + IDENTITY = 2084 + GELU = 2085 + PERMUTE = 2086 + SCALAR_MULTIPLY = 2087 + SCALAR_FLOORDIV = 2088 + SCALAR_ADD = 2089 + SCALAR_SUB = 2090 + SCALAR_TRUEDIV = 2091 + INIT_PARAM = 2092 + FLOAT = 2100 + CONTIGUOUS = 2101 + TO = 2102 + UNSQUEEZE = 2103 + TYPE_AS = 2104 + VIEW = 2105 + GATHER = 2106 + ATTRIBUTE = 2200 + RMS_NORM = 2300 + ARG_TOPK = 2301 + BEAM_TOPK = 2302 + def enum_to_int(enum, enum_item): - for item in enum: - if (enum_item == item): - return item.value + for item in enum: + if enum_item == item: + return item.value + + print(enum_item) + print(enum) + assert 0, "unknown enum type " + str(enum_item) + " " + str(enum) + return -1 - print(enum_item) - print(enum) - assert 0, "unknown enum type " + str(enum_item) + " " + str(enum) - return -1 def int_to_enum(enum, value): - for item in enum: - if (item.value == value): - return item + for item in enum: + if item.value == value: + return item + + assert 0, "unknown enum value " + str(value) + " " + str(enum) + - assert 0, "unknown enum value " + str(value) + " " + str(enum) - def enum_to_str(enum, enum_item): - name = enum(enum_item).name - return name - + name = enum(enum_item).name + return name + + def str_to_enum(enum, value): - for item in enum: - if (item.name == value): - return item + for item in enum: + if item.name == value: + return item - assert 0, "unknown enum value " + value + " " + str(enum) + assert 0, "unknown enum value " + value + " " + str(enum) diff --git a/python/flexflow_python_build.py b/python/flexflow_python_build.py index 0e58193ef7..c9749d8369 100755 --- a/python/flexflow_python_build.py +++ b/python/flexflow_python_build.py @@ -29,14 +29,15 @@ sys.exit(1) build_dir = 
os.path.abspath(build_dir) script_dir = os.path.abspath(os.path.dirname(__file__)) -script_path = os.path.join(build_dir, "flexflow_python") if not os.path.isdir(build_dir): print(f"Folder {build_dir} does not exist") sys.exit(1) if not os.path.isdir(script_dir): print(f"Folder {script_dir} does not exist") sys.exit(1) -script_path = os.path.abspath(script_path) +# Build flexflow_python script +flexflow_python_path = os.path.join(build_dir, "flexflow_python") +flexflow_python_path = os.path.abspath(flexflow_python_path) lines = [ '#! /usr/bin/env bash', f'BUILD_FOLDER="{build_dir}"', @@ -52,10 +53,26 @@ '\tlegion_python "$@"', 'fi' ] - -with open(script_path, "w+") as script_file: +with open(flexflow_python_path, "w+") as flexflow_python_file: for line in lines: - script_file.write(line + "\n") + flexflow_python_file.write(line + "\n") +cur_stat = os.stat(flexflow_python_path) +os.chmod(flexflow_python_path, cur_stat.st_mode | stat.S_IEXEC) -cur_stat = os.stat(script_path) -os.chmod(script_path, cur_stat.st_mode | stat.S_IEXEC) +# Build set_python_envs.sh +python_envs_path = os.path.join(build_dir, "set_python_envs.sh") +python_envs_path = os.path.abspath(python_envs_path) +lines = [ + '#! /usr/bin/env bash', + f'BUILD_FOLDER="{build_dir}"', + f'PYTHON_FOLDER="{script_dir}"', + 'PYLIB_PATH="$("$PYTHON_FOLDER"/flexflow/findpylib.py)"', + 'PYLIB_DIR="$(dirname "$PYLIB_PATH")"', + 'export LD_LIBRARY_PATH="$BUILD_FOLDER:$BUILD_FOLDER/deps/legion/lib:$PYLIB_DIR:$LD_LIBRARY_PATH"', + 'export PYTHONPATH="$PYTHON_FOLDER:$BUILD_FOLDER/deps/legion/bindings/python:$PYTHONPATH"', +] +with open(python_envs_path, "w+") as python_envs_file: + for line in lines: + python_envs_file.write(line + "\n") +cur_stat = os.stat(python_envs_path) +os.chmod(python_envs_path, cur_stat.st_mode | stat.S_IEXEC) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 1c3103683f..c210836d9b 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -17,6 +17,7 @@ #include "flexflow/dataloader.h" #include "flexflow/mapper.h" #include "flexflow/request_manager.h" +#include "inference/file_loader.h" using namespace Legion; using namespace FlexFlow; @@ -64,6 +65,8 @@ class FFCObjectWrapper { BeamSearchBatchConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_inference_manager_t, InferenceManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); + FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); + FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); }; Logger ffc_log("flexflow_c"); @@ -130,6 +133,39 @@ bool flexflow_config_get_enable_control_replication(flexflow_config_t handle_) { return handle->enable_control_replication; } +int flexflow_config_get_data_parallelism_degree(flexflow_config_t handle_) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->data_parallelism_degree; +} + +int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->tensor_parallelism_degree; +} + +int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->pipeline_parallelism_degree; +} + +void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_, + int value) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->data_parallelism_degree = value; +} + +void flexflow_config_set_tensor_parallelism_degree(flexflow_config_t handle_, + int value) { + FFConfig 
*handle = FFCObjectWrapper::unwrap(handle_); + handle->tensor_parallelism_degree = value; +} + +void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_, + int value) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->pipeline_parallelism_degree = value; +} + int flexflow_config_get_python_data_loader_type(flexflow_config_t handle_) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); return handle->python_data_loader_type; @@ -1024,8 +1060,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( bool bias, bool add_bias_kv, bool add_zero_attn, + enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1040,9 +1080,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( bias, add_bias_kv, add_zero_attn, - input->data_type, + data_type, kernel_initializer, apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, name); return FFCObjectWrapper::wrap(tensor); } @@ -1058,8 +1101,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( bool bias, bool add_bias_kv, bool add_zero_attn, + enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1075,9 +1122,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( bias, add_bias_kv, add_zero_attn, - input->data_type, + data_type, kernel_initializer, apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, name); return FFCObjectWrapper::wrap(tensor); } @@ -1093,8 +1143,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( bool bias, bool add_bias_kv, bool add_zero_attn, + enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1110,9 +1164,46 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( bias, add_bias_kv, add_zero_attn, - input->data_type, + data_type, kernel_initializer, apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + name); + return FFCObjectWrapper::wrap(tensor); +} + +flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + enum DataType data_type, + flexflow_initializer_t kernel_initializer_, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Initializer *kernel_initializer = + FFCObjectWrapper::unwrap(kernel_initializer_); + Tensor tensor = + handle->inc_multihead_self_attention_verify(input, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, name); return FFCObjectWrapper::wrap(tensor); } @@ -1160,6 +1251,16 @@ flexflow_tensor_t 
flexflow_model_add_sampling(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, + const flexflow_tensor_t input_, + bool beam_search, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor tensor = handle->argmax(input, beam_search, name); + return FFCObjectWrapper::wrap(tensor); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1213,6 +1314,16 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } +flexflow_generation_result_t flexflow_model_generate(flexflow_model_t handle_, + char const *text, + int max_seq_length) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::string const text_str(text); + GenerationResult result = handle->generate(text_str, max_seq_length); + DEBUG_PRINT("[Model] generate %p %s %i", handle, text, max_seq_length); + return FFCObjectWrapper::wrap(&result); +} + // ----------------------------------------------------------------------- // Tensor // ----------------------------------------------------------------------- @@ -2149,27 +2260,43 @@ void flexflow_beam_search_batch_config_destroy( // RequestManager // ----------------------------------------------------------------------- -flexflow_request_manager_t flexflow_request_manager_create(void) { - RequestManager *rm = new RequestManager(); - DEBUG_PRINT("[RequestManager] new %p", rm); +flexflow_request_manager_t flexflow_request_manager_get_request_manager(void) { + RequestManager *rm = RequestManager::get_request_manager(); + DEBUG_PRINT("[RequestManager] get %p", rm); return FFCObjectWrapper::wrap(rm); } -void flexflow_request_manager_destroy(flexflow_request_manager_t handle_) { +void flexflow_request_manager_register_tokenizer( + flexflow_request_manager_t handle_, + enum ModelType model_type, + char const *tokenizer_filepath) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); - DEBUG_PRINT("[RequestManager] delete %p", handle); - delete handle; + assert(tokenizer_filepath != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const tokenizer_filepath_str(tokenizer_filepath); + handle->register_tokenizer(model_type, tokenizer_filepath_str); + DEBUG_PRINT( + "[RequestManager] register tokenizer %p %s", handle, tokenizer_filepath); } -long unsigned int flexflow_request_manager_register_new_request( - flexflow_request_manager_t handle_, - char const *prompt, - int max_sequence_length) { +void flexflow_request_manager_register_output_filepath( + flexflow_request_manager_t handle_, char const *output_filepath) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); - assert(prompt != nullptr && "Cannot convert nullptr char * to std::string"); - std::string const prompt_str(prompt); - DEBUG_PRINT("[RequestManager] register_new_request %p %s", handle, prompt); - return handle->register_new_request(prompt_str, max_sequence_length); + assert(output_filepath != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const output_filepath_str(output_filepath); + handle->register_output_filepath(output_filepath_str); + DEBUG_PRINT("[RequestManager] register output filepath %p %s", + handle, + output_filepath); +} + +int flexflow_request_manager_register_ssm_model( + flexflow_request_manager_t handle_, 
flexflow_model_t model_handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + DEBUG_PRINT("[RequestManager] register ssm %p %p", handle, model_handle); + return handle->register_ssm_model(model_handle); } // ----------------------------------------------------------------------- @@ -2177,65 +2304,66 @@ long unsigned int flexflow_request_manager_register_new_request( // ----------------------------------------------------------------------- flexflow_inference_manager_t - flexflow_inference_manager_create(flexflow_config_t config_handle, - int max_num_tokens_per_batch) { - FFConfig *config = FFCObjectWrapper::unwrap(config_handle); - InferenceManager *im = - new InferenceManager(*config, max_num_tokens_per_batch); - DEBUG_PRINT("[InferenceManager] new %p", im); + flexflow_inference_manager_get_inference_manager() { + InferenceManager *im = InferenceManager::get_inference_manager(); + DEBUG_PRINT("[InferenceManager] get %p", im); return FFCObjectWrapper::wrap(im); } -void flexflow_inference_manager_destroy(flexflow_inference_manager_t handle_) { - InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); - DEBUG_PRINT("[InferenceManager] delete %p", handle); - delete handle; -} - void flexflow_inference_manager_compile_model_and_allocate_buffer( - flexflow_inference_manager_t handle_, flexflow_model_t model_handle_) { + flexflow_inference_manager_t handle_, flexflow_model_t model_handle) { InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); - FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + FFModel *model = FFCObjectWrapper::unwrap(model_handle); DEBUG_PRINT("[InferenceManager] compile_model_and_allocate_buffer %p", handle); - handle->compile_model_and_allocate_buffer(model_handle); + handle->compile_model_and_allocate_buffer(model); } void flexflow_inference_manager_init_operators_inference( - flexflow_inference_manager_t handle_, flexflow_model_t model_handle_) { + flexflow_inference_manager_t handle_, flexflow_model_t model_handle) { InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); - FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + FFModel *model = FFCObjectWrapper::unwrap(model_handle); DEBUG_PRINT("[InferenceManager] init_operators_inference %p", handle); - handle->init_operators_inference(model_handle); + handle->init_operators_inference(model); } -void flexflow_inference_manager_incr_decoding_loop( - flexflow_inference_manager_t handle_, - flexflow_model_t model_handle_, - flexflow_request_manager_t rm_handle_, - int total_num_requests) { - InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); - FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); - RequestManager *rm_handle = FFCObjectWrapper::unwrap(rm_handle_); - DEBUG_PRINT("[InferenceManager] incr_decoding_loop %p", handle); - handle->incr_decoding_loop(model_handle, *rm_handle, total_num_requests); -} - -void flexflow_inference_manager_spec_inference_loop( - flexflow_inference_manager_t handle_, - flexflow_model_t model_handle_, - flexflow_request_manager_t rm_handle_, - int total_num_requests, - int num_ssms, - int *ssm_model_ids) { - InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); - FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); - RequestManager *rm_handle = FFCObjectWrapper::unwrap(rm_handle_); - std::vector ssm_model_ids_vec; - for (int i = 0; i < num_ssms; i++) { - ssm_model_ids_vec.push_back(ssm_model_ids[i]); +// 
----------------------------------------------------------------------- +// FileDataLoader +// ----------------------------------------------------------------------- + +flexflow_file_data_loader_t + flexflow_file_data_loader_create(char const *weight_file_path, + int num_heads, + int hidden_dim, + int qkv_inner_dim) { + assert(weight_file_path != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const weight_file_path_str(weight_file_path); + FileDataLoader *handle = new FileDataLoader( + "", weight_file_path_str, num_heads, hidden_dim, qkv_inner_dim); + DEBUG_PRINT("[FileDataLoader] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_) { + FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[FileDataLoader] delete %p", handle); + delete handle; +} + +void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, + flexflow_model_t model_handle_, + int num_layers, + char const **layer_names, + flexflow_op_t *layers, + bool use_full_precision) { + FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model = FFCObjectWrapper::unwrap(model_handle_); + std::unordered_map weights_layers; + for (int i = 0; i < num_layers; i++) { + std::string const layer_name(layer_names[i]); + Layer *layer_ptr = FFCObjectWrapper::unwrap(layers[i]); + weights_layers.emplace(layer_name, layer_ptr); } - DEBUG_PRINT("[InferenceManager] spec_inference_loop %p", handle); - handle->spec_inference_loop( - model_handle, *rm_handle, total_num_requests, ssm_model_ids_vec); + handle->load_weights(model, weights_layers, use_full_precision); } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index a7476928ba..7863931c82 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -227,7 +227,7 @@ OpMeta *ArgMax::init_task(Task const *task, Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; int batch_size = acc_input.domain.get_volume() / length; Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 9b81836de5..02853bbf09 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -32,6 +32,7 @@ #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/rms_norm_kernels.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" @@ -418,6 +419,26 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); break; } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; + } case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -880,6 +901,26 @@ __host__ void m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; + } case OP_ALLREDUCE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 0eafd979c1..2a7ece3c06 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2641,10 +2641,6 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[l]->op_type != OP_ALLREDUCE) { continue; } - // don't fuse softmax since it returns inference results - if (operators[l]->op_type == OP_SOFTMAX) { - continue; - } size_t start = 0; { Op *opl = operators[l]; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 189cf08a39..2712d21c3f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -69,22 +69,29 @@ void RequestManager::register_tokenizer(ModelType type, std::string const &path) { // bos id this->model_type = type; + std::string tokenizer_folder = + (!path.empty() && path.back() != '/') ? path + '/' : path; if (model_type == ModelType::LLAMA) { + bool path_to_file = !path.empty() && + (path.size() >= strlen("tokenizer.model")) && + path.find("tokenizer.model") == + (path.size() - strlen("tokenizer.model")); + std::string tokenizer_filepath = + path_to_file ? path : tokenizer_folder + "tokenizer.model"; this->tokenizer_ = - Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(path)); + Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath)); } else if (model_type == ModelType::OPT) { - std::string tokenizer_folder = - (!path.empty() && path.back() != '/') ? 
path + '/' : path; - std::string vocab_file = tokenizer_folder + "gpt2-vocab.json"; - std::string merges_file = tokenizer_folder + "gpt2-merges.txt"; - std::string added_tokens_file = tokenizer_folder + "added_tokens.json"; + std::string vocab_file = tokenizer_folder + "vocab.json"; + std::string merges_file = tokenizer_folder + "merges.txt"; + std::string added_tokens_file = + tokenizer_folder + "special_tokens_map.json"; std::filesystem::path path1(vocab_file); std::filesystem::path path2(merges_file); std::filesystem::path path3(added_tokens_file); assert(std::filesystem::exists(path1) && - "Vocab file gpt2-vocab.json does not exist at the specified path"); + "Vocab file vocab.json does not exist at the specified path"); assert(std::filesystem::exists(path2) && - "Merge file gpt2-merges.txt does not exist at the specified path"); + "Merge file merges.txt does not exist at the specified path"); // opt_tokenizer = new OptTokenizer(vocab_file, merges_file); std::string vocab = LoadBytesFromFile(path1.string()); std::string merges = LoadBytesFromFile(path2.string()); diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh new file mode 100755 index 0000000000..dc8cc1f78a --- /dev/null +++ b/tests/inference/cpp_inference_tests.sh @@ -0,0 +1,275 @@ +#! /usr/bin/env bash +set -x +set -e + +cleanup() { + rm -rf ../../inference/prompt ../../inference/weights ../../inference/tokenizer ../../inference/output +} + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Enable model parallelism tests, if desired +TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} + +# Clean up before test (just in case) +cleanup + +# Update the transformers library to support the LLAMA model + +pip3 install --upgrade transformers sentencepiece + +# Download the weights in both half and full precision +python3 ../../inference/utils/download_llama_weights.py +python3 ../../inference/utils/download_llama_weights.py --use-full-precision +python3 ../../inference/utils/download_opt_weights.py +python3 ../../inference/utils/download_opt_weights.py --use-full-precision + +# Create test prompt file +mkdir -p ../../inference/prompt +echo '["Give three tips for staying healthy."]' > ../../inference/prompt/test.json + +# Create output folder +mkdir -p ../../inference/output + +############################################################################################### +############################ Speculative inference tests ###################################### +############################################################################################### + +# LLAMA +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights/ -llm-config ../../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../../inference/weights/llama_160M_weights/ -ssm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +# LLAMA (half precision) +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights_half/ -llm-config ../../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../../inference/weights/llama_160M_weights_half/ -ssm-config 
../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 + +# OPT +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights/ -llm-config ../../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../../inference/weights/opt_125M_weights/ -ssm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 +# OPT (half precision) +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights_half/ -llm-config ../../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../../inference/weights/opt_125M_weights_half/ -ssm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 + +# Tensor parallelism tests +if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then + # LLAMA + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights/ -llm-config ../../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../../inference/weights/llama_160M_weights/ -ssm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + # LLAMA (half precision) + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights_half/ -llm-config ../../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../../inference/weights/llama_160M_weights_half/ -ssm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + + # OPT + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights/ -llm-config ../../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../../inference/weights/opt_125M_weights/ -ssm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + # OPT (half precision) + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights_half/ -llm-config ../../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../../inference/weights/opt_125M_weights_half/ -ssm-config ../../inference/models/configs/opt_125M.json -tokenizer 
../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 +fi + +############################################################################################### +############################ Incremental decoding tests ####################################### +############################################################################################### + +# LLAMA (small model) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +# LLAMA (small model, half precision) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights_half/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 + +# LLAMA (big model) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights/ -llm-config ../../inference/models/configs/llama_7B.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 +# LLAMA (big model, half precision) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights_half/ -llm-config ../../inference/models/configs/llama_7B.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half.txt -pipeline-parallelism-degree 4 + +# OPT (small model) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 +# OPT (small model, half precision) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights_half/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 + +# OPT (big model) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights/ -llm-config ../../inference/models/configs/opt_6B.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 +# OPT (big model, half precision) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights_half/ -llm-config ../../inference/models/configs/opt_6B.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 + +# Tensor parallelism tests +if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then + # LLAMA (small model) + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + # LLAMA (small model, half precision) + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights_half/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights_half/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + + # LLAMA (big model) + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights/ -llm-config ../../inference/models/configs/llama_7B.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + # LLAMA (big model, half precision) + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights_half/ -llm-config ../../inference/models/configs/llama_7B.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + + # OPT (small model) + 
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + # OPT (small model, half precision) + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights_half/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights_half/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + + # OPT (big model) + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights/ -llm-config ../../inference/models/configs/opt_6B.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + # OPT (big model, half precision) + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights_half/ -llm-config ../../inference/models/configs/opt_6B.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 +fi + +############################################################################################### +############################### Alignment and Speed tests ##################################### +############################################################################################### + +##################################### Helper functions ####################################### +function check_partial_token_match { + local file1="$1" + local file2="$2" + local num_tokens_to_match=30 + + # Read the third line of the first file + third_line=$(sed -n '3p' "$file1") + read -r line1 <<< "$third_line" + tokens1=${line1#*: } + IFS=',' read -ra arr1 <<< "$tokens1" + + # Read the third line of the second file + third_line=$(sed -n '3p' "$file2") + read -r line2 <<< "$third_line" + tokens2=${line2#*: } +
IFS=',' read -ra arr2 <<< "$tokens2" + + # Compare the first few integers in the two lists + for ((i = 0; i < num_tokens_to_match; i++)); do + if [[ "${arr1[$i]}" != "${arr2[$i]}" ]]; then + echo "The first $num_tokens_to_match tokens in files $file1 and $file2 are not identical." + exit 1 + fi + done + #echo "The first $num_tokens_to_match integers are identical." +} + +function compare_speed_spec_infer_incr_decoding { + local incrDec_file="$1" + local specInf_file="$2" + + # Read the float numbers from the first line of the files + incrDec=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$incrDec_file") + specInf=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$specInf_file") + + if ! command -v bc &> /dev/null; then + echo "bc is not installed. Installing..." + sudo apt-get install -y bc + fi + + # Perform the comparison + threshold=$(bc <<< "$specInf * 1.5") + if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + #echo "The latency in $specInf_file is at least 1.5x smaller than the latency from $incrDec_file." + : + else + echo "Error: The latency in $specInf_file is not at least 1.5x smaller than the latency in $incrDec_file!" + exit 1 + fi +} + +function compare_decoding_steps_spec_infer_incr_decoding { + local incrDec_file="$1" + local specInf_file="$2" + + # Read the number of decoding steps from the second line of the files + second_line=$(sed -n '2p' "$incrDec_file") + read -r line <<< "$second_line" + incrDec=${line#*: } + second_line=$(sed -n '2p' "$specInf_file") + read -r line <<< "$second_line" + specInf=${line#*: } + + if ! command -v bc &> /dev/null; then + echo "bc is not installed. Installing..." + sudo apt-get install -y bc + fi + + # Perform the comparison + threshold=$(bc <<< "$specInf * 1.5") + if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file." + : + else + echo "Error: The decoding steps in $specInf_file are not at least 1.5x less than those in $incrDec_file!" 
+ exit 1 + fi +} + +############ Alignment between speculative inference and incremental decoding ################# +# Full precision +diff <(tail -n +3 "../../inference/output/incr_decoding_llama_7B.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") +diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") <(tail -n +3 "../../inference/output/spec_inference_opt.txt") +# Half precision +check_partial_token_match "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +check_partial_token_match "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" + +# Speed test: speculative inference should be at very least 1.5x faster than incremental decoding +# Full precision +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B.txt" "../../inference/output/spec_inference_llama.txt" +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B.txt" "../../inference/output/spec_inference_llama.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" +# Half precision +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" + +############ Alignment between tensor model parallelism and pipeline parallelism only ################# +if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then + diff <(tail -n +3 "../../inference/output/spec_inference_llama_tp.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") + diff <(tail -n +3 "../../inference/output/spec_inference_opt_tp.txt") <(tail -n +3 "../../inference/output/spec_inference_opt.txt") + check_partial_token_match "../../inference/output/spec_inference_llama_half_tp.txt" "../../inference/output/spec_inference_llama_half.txt" + check_partial_token_match "../../inference/output/spec_inference_opt_half_tp.txt" "../../inference/output/spec_inference_opt_half.txt" + diff <(tail -n +3 "../../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_160M.txt") + check_partial_token_match "../../inference/output/incr_decoding_llama_160M_half_tp.txt" "../../inference/output/incr_decoding_llama_160M_half.txt" + diff <(tail -n +3 "../../inference/output/incr_decoding_llama_7B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_7B.txt") + check_partial_token_match "../../inference/output/incr_decoding_llama_7B_half_tp.txt" "../../inference/output/incr_decoding_llama_7B_half.txt" + diff <(tail -n +3 "../../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_125M.txt") + check_partial_token_match "../../inference/output/incr_decoding_opt_125M_half_tp.txt" 
"../../inference/output/incr_decoding_opt_125M_half.txt" + diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") + check_partial_token_match "../../inference/output/incr_decoding_opt_6B_half_tp.txt" "../../inference/output/incr_decoding_opt_6B_half.txt" +fi + +######################### Alignment tests with HuggingFace #################################### +pip3 install protobuf==3.20.3 + +# LLAMA (small model, full precision) +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu + +# LLAMA (small model, half precision) +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu + +# LLAMA (big model, full precision) +python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" + +# LLAMA (big model, half precision) +python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu + +# OPT (small model, full precision) +python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 + +# OPT (small model, half precision) +python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 + +# OPT (big model, full precision) +#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 127 + +# OPT (big model, half precision) +#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 127 + +diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_160M.txt") +diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_7B.txt") +diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) 
+ +diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_125M.txt") +diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt") +#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt") + +############################################################################################### +###################################### Cleanup ################################################ +############################################################################################### + +# Clean up after test +# cleanup diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 8616bb845e..ca95acc785 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -2,274 +2,8 @@ set -x set -e -cleanup() { - rm -rf ../inference/prompt ../inference/weights ../inference/tokenizer ../inference/output -} - # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -# Enable model parallelism tests, if desired -TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} - -# Clean up before test (just in case) -cleanup - -# Update the transformers library to support the LLAMA model - -pip3 install --upgrade transformers sentencepiece - -# Download the weights in both half and full precision -python3 ../inference/utils/download_llama_weights.py -python3 ../inference/utils/download_llama_weights.py --use-full-precision -python3 ../inference/utils/download_opt_weights.py -python3 ../inference/utils/download_opt_weights.py --use-full-precision - -# Create test prompt file -mkdir -p ../inference/prompt -echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json - -# Create output folder -mkdir -p ../inference/output - -############################################################################################### -############################ Speculative inference tests ###################################### -############################################################################################### - -# LLAMA -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 -# LLAMA (half precision) -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 - -# OPT -../build/inference/spec_infer/spec_infer 
-ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 -# OPT (half precision) -../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 - -# Tensor parallelism tests -if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then - # LLAMA - ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - # LLAMA (half precision) - ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../inference/weights/llama_160M_weights_half/ -ssm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - - # OPT - ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - # OPT (half precision) - ../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../inference/weights/opt_125M_weights_half/ -ssm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 -fi - -############################################################################################### -############################ Incremental decoding tests ####################################### 
-############################################################################################### - -# LLAMA (small model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 -# LLAMA (small model, half precision) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 - -# LLAMA (big model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 -# LLAMA (big model, half precision) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half.txt -pipeline-parallelism-degree 4 - -# OPT (small model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 -# OPT (small model, half precision) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 - -# OPT (big model) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 -# OPT (big model, half precision) -../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 - -# Tensor parallelism tests -if 
[ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then - # LLAMA (small model) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_160M_weights/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 - # LLAMA (small model, half precision) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_160M_weights_half/ -llm-config ../inference/models/configs/llama_160M.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 - - # LLAMA (big model) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../inference/weights/llama_7B_weights/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - # LLAMA (big model, half precision) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../inference/weights/llama_7B_weights_half/ -llm-config ../inference/models/configs/llama_7B.json -tokenizer ../inference/tokenizer/tokenizer.model -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_llama_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - - # OPT (small model) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_125M_weights/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ 
-prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 - # OPT (small model, half precision) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_125M_weights_half/ -llm-config ../inference/models/configs/opt_125M.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 - - # OPT (big model) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../inference/weights/opt_6B_weights/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - # OPT (big model, half precision) - ../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../inference/weights/opt_6B_weights_half/ -llm-config ../inference/models/configs/opt_6B.json -tokenizer ../inference/tokenizer/ -prompt ../inference/prompt/test.json -output-file ../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 -fi - -############################################################################################### -############################### Alignment and Speed tests ##################################### -############################################################################################### - -##################################### Helper functions ####################################### -function check_partial_token_match { - local file1="$1" - local file2="$2" - local num_tokens_to_match=30 - - # Read the second line of the first file - third_line=$(sed -n '3p' "$file1") - read -r line1 <<< "$third_line" - tokens1=${line1#*: } - IFS=',' read -ra arr1 <<< "$tokens1" - - # Read the second line of the second file - third_line=$(sed -n '3p' "$file2") - read -r line2 <<< "$third_line" - tokens2=${line2#*: } - IFS=',' read -ra arr2 <<< "$tokens2" - - # Compare the first few integers in the two lists - for ((i = 0; i < num_tokens_to_match; i++)); do - if [[ "${arr1[$i]}" != "${arr2[$i]}" ]]; then - echo "The first $num_tokens_to_match tokens in files $file1 and $file2 are not identical." - exit 1 - fi - done - #echo "The first $num_tokens_to_match integers are identical." -} - -function compare_speed_spec_infer_incr_decoding { - local incrDec_file="$1" - local specInf_file="$2" - - # Read the float numbers from the first line of the files - incrDec=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$incrDec_file") - specInf=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$specInf_file") - - if ! command -v bc &> /dev/null; then - echo "bc is not installed. Installing..." 
- sudo apt-get install -y bc - fi - - # Perform the comparison - threshold=$(bc <<< "$specInf * 1.5") - if (( $(echo "$incrDec >= $threshold" | bc -l) )); then - #echo "The latency in $specInf_file is at least 1.5x smaller than the latency from $incrDec_file." - : - else - echo "Error: The latency in $specInf_file is not at least 1.5x smaller than the latency in $incrDec_file!" - exit 1 - fi -} - -function compare_decoding_steps_spec_infer_incr_decoding { - local incrDec_file="$1" - local specInf_file="$2" - - # Read the number of decoding steps from the second line of the files - second_line=$(sed -n '2p' "$incrDec_file") - read -r line <<< "$second_line" - incrDec=${line#*: } - second_line=$(sed -n '2p' "$specInf_file") - read -r line <<< "$second_line" - specInf=${line#*: } - - if ! command -v bc &> /dev/null; then - echo "bc is not installed. Installing..." - sudo apt-get install -y bc - fi - - # Perform the comparison - threshold=$(bc <<< "$specInf * 1.5") - if (( $(echo "$incrDec >= $threshold" | bc -l) )); then - #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file." - : - else - echo "Error: The decoding steps in $specInf_file are not at least 1.5x less than those in $incrDec_file!" - exit 1 - fi -} - -############ Alignment between speculative inference and incremental decoding ################# -# Full precision -diff <(tail -n +3 "../inference/output/incr_decoding_llama_7B.txt") <(tail -n +3 "../inference/output/spec_inference_llama.txt") -diff <(tail -n +3 "../inference/output/incr_decoding_opt_6B.txt") <(tail -n +3 "../inference/output/spec_inference_opt.txt") -# Half precision -check_partial_token_match "../inference/output/incr_decoding_llama_7B_half.txt" "../inference/output/spec_inference_llama_half.txt" -check_partial_token_match "../inference/output/incr_decoding_opt_6B_half.txt" "../inference/output/spec_inference_opt_half.txt" - -# Speed test: speculative inference should be at very least 1.5x faster than incremental decoding -# Full precision -#compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B.txt" "../inference/output/spec_inference_llama.txt" -#compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B.txt" "../inference/output/spec_inference_opt.txt" -compare_decoding_steps_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B.txt" "../inference/output/spec_inference_llama.txt" -compare_decoding_steps_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B.txt" "../inference/output/spec_inference_opt.txt" -# Half precision -#compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B_half.txt" "../inference/output/spec_inference_llama_half.txt" -#compare_speed_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B_half.txt" "../inference/output/spec_inference_opt_half.txt" -compare_decoding_steps_spec_infer_incr_decoding "../inference/output/incr_decoding_llama_7B_half.txt" "../inference/output/spec_inference_llama_half.txt" -compare_decoding_steps_spec_infer_incr_decoding "../inference/output/incr_decoding_opt_6B_half.txt" "../inference/output/spec_inference_opt_half.txt" - -############ Alignment between tensor model parallelism and pipeline parallelism only ################# -if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then - diff <(tail -n +3 "../inference/output/spec_inference_llama_tp.txt") <(tail -n +3 "../inference/output/spec_inference_llama.txt") - diff <(tail -n +3 
"../inference/output/spec_inference_opt_tp.txt") <(tail -n +3 "../inference/output/spec_inference_opt.txt") - check_partial_token_match "../inference/output/spec_inference_llama_half_tp.txt" "../inference/output/spec_inference_llama_half.txt" - check_partial_token_match "../inference/output/spec_inference_opt_half_tp.txt" "../inference/output/spec_inference_opt_half.txt" - diff <(tail -n +3 "../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +3 "../inference/output/incr_decoding_llama_160M.txt") - check_partial_token_match "../inference/output/incr_decoding_llama_160M_half_tp.txt" "../inference/output/incr_decoding_llama_160M_half.txt" - diff <(tail -n +3 "../inference/output/incr_decoding_llama_7B_tp.txt") <(tail -n +3 "../inference/output/incr_decoding_llama_7B.txt") - check_partial_token_match "../inference/output/incr_decoding_llama_7B_half_tp.txt" "../inference/output/incr_decoding_llama_7B_half.txt" - diff <(tail -n +3 "../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +3 "../inference/output/incr_decoding_opt_125M.txt") - check_partial_token_match "../inference/output/incr_decoding_opt_125M_half_tp.txt" "../inference/output/incr_decoding_opt_125M_half.txt" - diff <(tail -n +3 "../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +3 "../inference/output/incr_decoding_opt_6B.txt") - check_partial_token_match "../inference/output/incr_decoding_opt_6B_half_tp.txt" "../inference/output/incr_decoding_opt_6B_half.txt" -fi - -######################### Alignment tests with HuggingFace #################################### -pip3 install protobuf==3.20.3 - -# LLAMA (small model, full precision) -python3 ./inference/huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu - -# LLAMA (small model, half precision) -python3 ./inference/huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu - -# LLAMA (big model, full precision) -python3 ./inference/huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" - -# LLAMA (big model, half precision) -python3 ./inference/huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu - -# OPT (small model, full precision) -python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 - -# OPT (small model, half precision) -python3 ./inference/huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 - -# OPT (big model, full precision) -#python3 ./inference/huggingface_inference.py --model-name "facebook/opt-6.7b" 
--tokenizer-model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 127 - -# OPT (big model, half precision) -#python3 ./inference/huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 127 - -diff <(tail -n +2 "../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../inference/output/incr_decoding_llama_160M.txt") -diff <(tail -n +2 "../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../inference/output/incr_decoding_llama_7B.txt") -diff <(tail -n +2 "../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../inference/output/incr_decoding_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) - -diff <(tail -n +2 "../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../inference/output/incr_decoding_opt_125M.txt") -diff <(tail -n +2 "../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -#diff <(tail -n +2 "../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../inference/output/incr_decoding_opt_6B.txt") -#diff <(tail -n +2 "../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../inference/output/incr_decoding_opt_6B_half.txt") - -############################################################################################### -###################################### Cleanup ################################################ -############################################################################################### - -# Clean up after test -# cleanup +# replace this with python tests +./inference/cpp_inference_tests.sh From d1ef0ed8ce6876c37bbd99dd0a8f46728b7c25d2 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Thu, 3 Aug 2023 00:30:10 -0400 Subject: [PATCH 189/344] Support Group Attention (Llama 2) (#883) * n_kv_heads in inc_mha * . * . * . * . * . * fix * fix * tensor parallelism * change weight layout * tensor parallelism * merge multiquery attention into inc_mha * llama2 70B config * spec infer change 1 * fix. * spec infer. * falcon spec infer. * fix llama 70B * fix * fix & cleanup * fix * hip rocm * issue 908 * clean debug code. * format. * remove multiquery. remove warning, fix python. 
--------- Co-authored-by: goliaro --- .../cpp/inference/mixture_of_experts/moe.cc | 1 + .../inference/transformers/transformers.cc | 1 + include/flexflow/ffconst.h | 9 +- include/flexflow/flexflow_c.h | 21 +- include/flexflow/model.h | 21 +- include/flexflow/operator_params.h | 1 - .../ops/inc_multihead_self_attention.h | 14 +- .../ops/inc_multihead_self_attention_params.h | 2 +- .../ops/inc_multiquery_attention_params.h | 30 - .../ops/inc_multiquery_self_attention.h | 159 -- .../inc_multihead_self_attention_kernels.h | 13 +- .../ops/spec_inc_multihead_self_attention.h | 7 +- ...spec_inc_multihead_self_attention_params.h | 2 +- .../ops/tree_inc_multihead_self_attention.h | 9 +- ...tree_inc_multihead_self_attention_params.h | 2 +- include/flexflow/request_manager.h | 4 +- include/flexflow/utils/cuda_helper.h | 5 +- inference/file_loader.cc | 237 +-- inference/file_loader.h | 6 +- inference/incr_decoding/incr_decoding.cc | 5 +- inference/models/configs/falcon_7B.json | 1 + inference/models/configs/llama2_70B.json | 12 + inference/models/configs/llama2_7B.json | 12 + inference/models/configs/llama_160M.json | 1 + inference/models/configs/llama_7B.json | 1 + inference/models/falcon.cc | 75 +- inference/models/falcon.h | 5 +- inference/models/llama.cc | 8 +- inference/models/llama.h | 4 +- inference/models/opt.cc | 8 +- inference/spec_infer/CMakeLists.txt | 3 +- inference/spec_infer/spec_infer.cc | 27 +- inference/utils/download_falcon_weights.py | 45 + python/flexflow/core/flexflow_cffi.py | 74 +- python/flexflow/serve/models/falcon.py | 3 +- python/flexflow/serve/models/llama.py | 3 + python/flexflow/serve/models/opt.py | 3 + python/flexflow/serve/serve.py | 2 + python/flexflow/type.py | 6 +- src/c/flexflow_c.cc | 53 +- src/ops/inc_multihead_self_attention.cc | 100 +- src/ops/inc_multihead_self_attention.cpp | 5 +- src/ops/inc_multihead_self_attention.cu | 672 ++++---- src/ops/inc_multiquery_self_attention.cc | 1432 ----------------- src/ops/inc_multiquery_self_attention.cpp | 96 -- src/ops/inc_multiquery_self_attention.cu | 797 --------- src/ops/spec_inc_multihead_self_attention.cc | 73 +- src/ops/spec_inc_multihead_self_attention.cpp | 5 +- src/ops/spec_inc_multihead_self_attention.cu | 296 ++-- src/ops/tree_inc_multihead_self_attention.cc | 96 +- src/ops/tree_inc_multihead_self_attention.cpp | 5 +- src/ops/tree_inc_multihead_self_attention.cu | 379 +++-- src/runtime/cuda_helper.cu | 40 +- src/runtime/ffconst_utils.cc | 2 - src/runtime/graph.cc | 71 +- src/runtime/model.cc | 44 - src/runtime/operator_params.cc | 1 - src/runtime/request_manager.cc | 12 +- src/runtime/substitution.cc | 8 - 59 files changed, 1412 insertions(+), 3617 deletions(-) delete mode 100644 include/flexflow/ops/inc_multiquery_attention_params.h delete mode 100644 include/flexflow/ops/inc_multiquery_self_attention.h create mode 100644 inference/models/configs/llama2_70B.json create mode 100644 inference/models/configs/llama2_7B.json create mode 100644 inference/utils/download_falcon_weights.py delete mode 100644 src/ops/inc_multiquery_self_attention.cc delete mode 100644 src/ops/inc_multiquery_self_attention.cpp delete mode 100644 src/ops/inc_multiquery_self_attention.cu diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 4a5c33c9b0..5125e5d98e 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -79,6 +79,7 @@ Tensor create_moe_encoder(FFModel *model, x, moeConfig->hidden_size, 
moeConfig->num_attention_heads, + moeConfig->num_attention_heads, moeConfig->attention_kdim, moeConfig->attention_vdim) : model->multihead_attention(x, diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 0717ddc90f..2d818e8e4e 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -47,6 +47,7 @@ Tensor create_inc_multihead_attention_decoder( input, transformerConfig->hidden_size, transformerConfig->num_attention_heads, + transformerConfig->num_attention_heads, transformerConfig->attention_kdim, transformerConfig->attention_vdim) : model->multihead_attention(input, diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 170180aeaf..1694041163 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -167,7 +167,6 @@ enum OperatorType { OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, - OP_INC_MULTIQUERY_SELF_ATTENTION, OP_SAMPLING, // Parallel Ops OP_REPARTITION, @@ -180,7 +179,13 @@ enum OperatorType { OP_INVALID, }; -enum ModelType { UNKNOWN = 3001, LLAMA = 3002, OPT = 3003, FALCON = 3004 }; +enum ModelType { + UNKNOWN = 3001, + LLAMA = 3002, + LLAMA2 = 3003, + OPT = 3004, + FALCON = 3005 +}; enum PMParameter { PM_OP_TYPE, // AnyOp diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 9983898130..949c0f7885 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -401,6 +401,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( const flexflow_tensor_t input_, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -420,6 +421,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( const flexflow_tensor_t input_, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -439,6 +441,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( const flexflow_tensor_t input_, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -453,21 +456,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( bool qk_prod_scaling, char const *name); -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( - flexflow_model_t handle_, - const flexflow_tensor_t input_, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool bias, - bool add_bias_kv, - bool add_zero_attn, - enum DataType data_type, - flexflow_initializer_t kernel_initializer_, - char const *name); - flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, const flexflow_tensor_t input_, float eps, @@ -901,7 +889,8 @@ flexflow_file_data_loader_t flexflow_file_data_loader_create(char const *weight_file_path, int num_heads, int hidden_dim, - int qkv_inner_dim); + int qkv_inner_dim, + int tensor_partition_num); void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 1f30d451ef..d34cf14a76 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -156,8 +156,6 @@ enum TaskIDs { INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID, - INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, 
SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, @@ -322,7 +320,6 @@ class Transpose; class RMSNorm; class BeamTopK; class SpecIncMultiHeadSelfAttention; -class IncMultiQuerySelfAttention; class Sampling; class ArgMax; class Combine; @@ -644,6 +641,7 @@ class FFModel { Tensor inc_multihead_self_attention(const Tensor input, int embed_dim, int num_heads, + int num_kv_heads, int kdim = 0, int vdim = 0, float dropout = 0.0f, @@ -657,22 +655,11 @@ class FFModel { float scaling_factor = 1.0f, bool qk_prod_scaling = true, char const *name = NULL); - Tensor inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - char const *name = NULL); Tensor spec_inc_multihead_self_attention(const Tensor input, int embed_dim, int num_heads, + int num_kv_heads, int kdim = 0, int vdim = 0, float dropout = 0.0f, @@ -690,6 +677,7 @@ class FFModel { const Tensor input, int embed_dim, int num_heads, + int num_kv_heads, int kdim = 0, int vdim = 0, float dropout = 0.0f, @@ -1075,9 +1063,6 @@ class FFModel { std::unordered_map< std::pair, IncMultiHeadSelfAttention *>, - std::unordered_map< - std::pair, - IncMultiQuerySelfAttention *>, std::unordered_map, BeamTopK *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 982d5482a0..4f0432cb93 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -20,7 +20,6 @@ #include "flexflow/ops/gather_params.h" #include "flexflow/ops/groupby_params.h" #include "flexflow/ops/inc_multihead_self_attention_params.h" -#include "flexflow/ops/inc_multiquery_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" #include "flexflow/ops/pool_2d_params.h" diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 244100bc6f..ce1ef6f37c 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -29,6 +29,7 @@ class IncMultiHeadSelfAttention : public Op { const ParallelTensor _input, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -42,12 +43,14 @@ class IncMultiHeadSelfAttention : public Op { bool allocate_weights, DataType _quantization_type, bool _offload, + int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, const ParallelTensor _weight, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -61,6 +64,7 @@ class IncMultiHeadSelfAttention : public Op { bool allocate_weights, DataType _quantization_type, bool _offload, + int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, IncMultiHeadSelfAttention const &other, @@ -114,7 +118,7 @@ class IncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int num_heads; + int num_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias; bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -132,7 +136,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int 
num_samples, - int _num_heads); + int _num_heads, + int _num_kv_heads); IncMultiHeadSelfAttentionMeta(FFHandler handler, InferenceMode infer_mode, Op const *attn, @@ -153,7 +158,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_heads, + int _global_num_kv_heads, int _num_heads, + int _num_kv_heads, DataType _quantization_type, bool _offload); ~IncMultiHeadSelfAttentionMeta(void); @@ -163,7 +170,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { size_t weights_params, weightSize, biasSize, reserveSpaceSize, quantized_weightSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int global_num_heads, num_heads; + int global_num_heads, global_num_kv_heads, num_heads, num_kv_heads; bool *has_load_weights; bool *apply_rotary_embedding; bool *bias; @@ -182,6 +189,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { DataType quantization_type; bool offload; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + cudnnTensorDescriptor_t qk_tensor; cuFloatComplex *complex_input; #endif }; diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index d95aaf2e05..84fedb45a7 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -8,7 +8,7 @@ namespace FlexFlow { struct IncMultiHeadSelfAttentionParams { LayerID layer_guid; - int embed_dim, num_heads, kdim, vdim; + int embed_dim, num_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; diff --git a/include/flexflow/ops/inc_multiquery_attention_params.h b/include/flexflow/ops/inc_multiquery_attention_params.h deleted file mode 100644 index b781669473..0000000000 --- a/include/flexflow/ops/inc_multiquery_attention_params.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _FLEXFLOW_INC_MULTIQUERY_ATTENTION_PARAMS_H -#define _FLEXFLOW_INC_MULTIQUERY_ATTENTION_PARAMS_H - -#include "flexflow/fftype.h" -#include "flexflow/parallel_tensor.h" - -namespace FlexFlow { - -struct IncMultiQuerySelfAttentionParams { - LayerID layer_guid; - int embed_dim, num_heads, kdim, vdim; - float dropout; - bool bias, add_bias_kv, add_zero_attn; - - bool is_valid(ParallelTensorShape const &) const; -}; - -bool operator==(IncMultiQuerySelfAttentionParams const &, - IncMultiQuerySelfAttentionParams const &); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash { - size_t operator()(FlexFlow::IncMultiQuerySelfAttentionParams const &) const; -}; -} // namespace std - -#endif // _FLEXFLOW_INC_MULTIQUERY_ATTENTION_PARAMS_H diff --git a/include/flexflow/ops/inc_multiquery_self_attention.h b/include/flexflow/ops/inc_multiquery_self_attention.h deleted file mode 100644 index 1e36876c57..0000000000 --- a/include/flexflow/ops/inc_multiquery_self_attention.h +++ /dev/null @@ -1,159 +0,0 @@ -#ifndef _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H -#define _FLEXFLOW_INC_MULTIQUERY_ATTENTION_H - -#include "flexflow/accessor.h" -#include "flexflow/device.h" -#include "flexflow/fftype.h" -#include "flexflow/inference.h" -#include "flexflow/layer.h" -#include "flexflow/node.h" -#include "flexflow/op_meta.h" -#include "flexflow/operator.h" -#include "flexflow/ops/inc_multiquery_attention_params.h" -#include "math.h" -#include -#include - -namespace FlexFlow { - -class IncMultiQuerySelfAttentionMeta; 
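IncMultiHeadSelfAttentionParams above now carries num_kv_heads and tensor_parallelism_degree, and the meta struct keeps both global and per-shard head counts. The sketch below (names and the even divisibility of heads by the parallelism degree are assumptions of this illustration) follows the flat weight layout that load_attention_weights_v2 in inference/file_loader.cc, further down in this patch, uses when slicing the fused Q/K/V/O weight across tensor-parallel partitions.

# Sketch only: each tensor-parallel shard receives one contiguous [Q | K | V | O] slice
# of the flattened attention weight, at the stride computed by load_attention_weights_v2
# (stride = (q_size + k_size + v_size + o_size) / tensor_partition_num).
def partition_offsets(dim, n_heads, n_kv_heads, tp_degree):
    head_dim = dim // n_heads
    q_size = o_size = head_dim * dim * n_heads     # all query / output heads
    k_size = v_size = head_dim * dim * n_kv_heads  # shared key / value heads
    stride = (q_size + k_size + v_size + o_size) // tp_degree
    return [shard * stride for shard in range(tp_degree)]

Each shard then holds n_heads / tp_degree query heads and n_kv_heads / tp_degree KV heads, which is why the falcon.cc change later in this patch rejects a tensor parallelism degree larger than either head count.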
- -class IncMultiQuerySelfAttention : public Op { -public: - using Params = IncMultiQuerySelfAttentionParams; - using Input = ParallelTensor; - - IncMultiQuerySelfAttention(FFModel &model, - LayerID const &layer_guid, - const ParallelTensor _input, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name); - IncMultiQuerySelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name); - IncMultiQuerySelfAttention(FFModel &model, - IncMultiQuerySelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); - IncMultiQuerySelfAttention(FFModel &model, - Params const ¶ms, - Input const &inputs, - bool allocate_weights = false, - char const *name = nullptr); - static Op * - create_operator_from_layer(FFModel &model, - Layer const *layer, - std::vector const &inputs); - void init(FFModel const &) override; - void init_inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; - void forward(FFModel const &) override; - void backward(FFModel const &) override; - Legion::FutureMap inference(FFModel const &, - BatchConfigFuture const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; - void print_layer(FFModel const &model) override { - assert(0); - } - bool get_int_parameter(PMParameter, int *) const override; - - static OpMeta *init_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - bool measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const override; - - static void inference_kernel_wrapper(IncMultiQuerySelfAttentionMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output); - Params get_params() const; - -public: - int num_heads; - float dropout; - bool bias; - bool add_bias_kv, add_zero_attn; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, - embed_dim; - int qoSeqLength, kvSeqLength; -}; - -class IncMultiQuerySelfAttentionMeta : public OpMeta { -public: - IncMultiQuerySelfAttentionMeta(FFHandler handler, - IncMultiQuerySelfAttention const *attn, - GenericTensorAccessorR const &weight, - Legion::Memory gpu_mem, - int num_samples); - IncMultiQuerySelfAttentionMeta(FFHandler handler, - InferenceMode infer_mode, - Op const *attn, - int _qSize, - int _kSize, - int _vSize, - int _qProjSize, - int _kProjSize, - int _vProjSize, - int _oProjSize, - int _embed_dim, - bool _bias, - bool _add_bias_kv, - GenericTensorAccessorR const &weight, - Legion::Memory gpu_mem, - int num_samples); - ~IncMultiQuerySelfAttentionMeta(void); - -public: - Realm::RegionInstance reserveInst; - size_t weights_params, weightSize, reserveSpaceSize; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, - embed_dim; - int num_heads; - bool *has_load_weights; - bool *bias; - bool *multi_query_attention; -#ifdef INFERENCE_TESTS - float *kcache, *vcache; -#endif - void *devQKVProjArray, *keyCache, 
*valueCache; - void *qk_prods, *qk_prods_softmax; - void *attn_heads, *W_out_contiguous; - BatchConfig::PerTokenInfo *token_infos; -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cuFloatComplex *complex_input; -#endif -}; - -}; // namespace FlexFlow - -#endif // _FLEXFLOW_ATTENTION_H diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 5b40136524..6b294bc211 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -11,18 +11,11 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { -template -__global__ void build_w_out_tensor(DT const *weight_ptr, - DT *contiguous_weight_ptr, - int vProjSize, - int oProjSize, - int num_heads, - int qkv_weight_block_size); - template __global__ void apply_proj_bias_w(DT *input_ptr, DT const *bias_ptr, int num_tokens, + int qkv_weight_size, int oProjSize); template @@ -34,6 +27,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int kProjSize, int vProjSize, int num_heads, + int num_kv_heads, bool scaling_query, float scaling_factor); @@ -46,9 +40,10 @@ __global__ void int kProjSize, int num_heads, int num_tokens, + int num_kv_heads, int q_block_size, int k_block_size, - int v_block_size, + int q_array_size, bool q_tensor); template diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index c8c1c4c9cf..f5b06c830e 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -29,6 +29,7 @@ class SpecIncMultiHeadSelfAttention : public Op { const ParallelTensor _input, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -46,6 +47,7 @@ class SpecIncMultiHeadSelfAttention : public Op { const ParallelTensor _weight, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -114,7 +116,7 @@ class SpecIncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int num_heads; + int num_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias; bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -130,7 +132,8 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads); + int _num_heads, + int _num_kv_heads); ~SpecIncMultiHeadSelfAttentionMeta(void); public: diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 5995e95fe1..1741b23745 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -8,7 +8,7 @@ namespace FlexFlow { struct SpecIncMultiHeadSelfAttentionParams { LayerID layer_guid; - int embed_dim, num_heads, kdim, vdim; + int embed_dim, num_heads, num_kv_heads, kdim, vdim; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index ba1d80dd60..cf714fe515 100644 --- 
a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -29,6 +29,7 @@ class TreeIncMultiHeadSelfAttention : public Op { const ParallelTensor _input, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -42,12 +43,14 @@ class TreeIncMultiHeadSelfAttention : public Op { bool allocate_weights, DataType _quantization_type, bool _offload, + int _tensor_parallelism_degree, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, const ParallelTensor _weight, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -61,6 +64,7 @@ class TreeIncMultiHeadSelfAttention : public Op { bool allocate_weights, DataType _quantization_type, bool _offload, + int _tensor_parallelism_degree, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, TreeIncMultiHeadSelfAttention const &other, @@ -115,7 +119,7 @@ class TreeIncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int num_heads; + int num_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias; bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -133,7 +137,8 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads); + int _num_heads, + int _num_kv_heads); ~TreeIncMultiHeadSelfAttentionMeta(void); public: diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index 0eede3bd2f..a00e56bda6 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -8,7 +8,7 @@ namespace FlexFlow { struct TreeIncMultiHeadSelfAttentionParams { LayerID layer_guid; - int embed_dim, num_heads, kdim, vdim; + int embed_dim, num_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 9dd19ee7f9..446f884eac 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -214,8 +214,8 @@ class RequestManager { InferenceResultFuture last_irf; TreeVerifyBatchConfigFuture last_tree_bcf; InferenceResultFuture last_tree_irf; - const std::map model_bos_map = {{ModelType::LLAMA, 0}, - {ModelType::OPT, 2}}; + const std::map model_bos_map = { + {ModelType::LLAMA, 0}, {ModelType::OPT, 2}, {ModelType::LLAMA2, 1}}; // TODO: Move this two vector to request struct std::unordered_map -void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id = 0); template void print_beam_tensor(T const *ptr, size_t num_elements, diff --git a/inference/file_loader.cc b/inference/file_loader.cc index e89c3eb622..94e604ac2d 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -25,11 +25,14 @@ using namespace Legion; FileDataLoader::FileDataLoader(std::string _input_path, std::string _weight_file_path, int _num_heads, + int _num_kv_heads, size_t _hidden_dim, - size_t _qkv_inner_dim) + size_t _qkv_inner_dim, + int _tensor_partition_num) : input_path(_input_path), 
weight_file_path(_weight_file_path), - num_heads(_num_heads), hidden_dim(_hidden_dim), - qkv_inner_dim(_qkv_inner_dim){}; + num_heads(_num_heads), num_kv_heads(_num_kv_heads), + hidden_dim(_hidden_dim), qkv_inner_dim(_qkv_inner_dim), + tensor_partition_num(_tensor_partition_num){}; BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { @@ -61,65 +64,6 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { return prompts; }; -template -void load_attention_bias(DT *ptr, - int num_heads, - size_t hidden_dim, - size_t qkv_inner_dim, - std::string layer_name, - std::string weight_path) { - std::string q_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wq_bias"; - std::string k_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wk_bias"; - std::string v_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wv_bias"; - std::string o_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wo_bias"; - std::vector bias_files = {q_file, k_file, v_file, o_file}; - - int file_index = 0; - for (auto file : bias_files) { - size_t qkv_partial_size = qkv_inner_dim * num_heads; - size_t out_partial_size = hidden_dim; - size_t partial_size = - (file_index < 3) ? qkv_partial_size : out_partial_size; - // std::cout << "Loading filename: " << file << std::endl; - std::ifstream in(file, std::ios::in | std::ios::binary); - assert(in.good() && "incorrect bias file path"); - std::vector
host_array(partial_size); - size_t loaded_data_size = sizeof(DT) * partial_size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - size_t in_get_size = in.gcount(); - - if (in_get_size != loaded_data_size) { - printf( - "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", - in_get_size, - loaded_data_size); - assert(false); - } - assert(partial_size == host_array.size()); - - size_t data_index = 0; - - for (int i = 0; i < partial_size; i++) { - ptr[file_index * qkv_partial_size + i] = host_array.at(data_index); - data_index++; - } - - file_index++; - - in.close(); - } -} - template void load_attention_weights_multi_query(DT *ptr, std::string layer_name, @@ -169,13 +113,79 @@ void load_attention_weights_multi_query(DT *ptr, } template -void load_attention_weights(DT *ptr, +void load_attention_bias_v2(DT *ptr, int num_heads, + int num_kv_heads, size_t hidden_dim, size_t qkv_inner_dim, std::string layer_name, - std::string weight_path, - size_t volume) { + std::string weight_path) { + std::string q_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wq_bias"; + std::string k_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wk_bias"; + std::string v_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wv_bias"; + std::string o_file = weight_path + + layer_name.substr(0, layer_name.find("attention")) + + "attention_wo_bias"; + std::vector bias_files = {q_file, k_file, v_file, o_file}; + + int file_index = 0; + + // now only opt use this. + assert(num_heads == num_kv_heads); + + for (auto file : bias_files) { + int n_heads = file_index == 0 ? num_heads : num_kv_heads; + size_t qkv_partial_size = qkv_inner_dim * n_heads; + size_t out_partial_size = hidden_dim; + size_t partial_size = + (file_index < 3) ? qkv_partial_size : out_partial_size; + std::ifstream in(file, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect bias file path"); + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + printf( + "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", + in_get_size, + loaded_data_size); + assert(false); + } + assert(partial_size == host_array.size()); + + size_t data_index = 0; + + for (int i = 0; i < partial_size; i++) { + ptr[file_index * qkv_partial_size + i] = host_array.at(data_index); + data_index++; + } + + file_index++; + + in.close(); + } +} + +template +void load_attention_weights_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weight_path, + size_t volume, + int tensor_partition_num) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight std::string q_file = weight_path + @@ -191,18 +201,28 @@ void load_attention_weights(DT *ptr, layer_name.substr(0, layer_name.find("attention")) + "attention_wo_weight"; std::vector weight_files = {q_file, k_file, v_file}; - int file_index = 0; + int base_index = 0; size_t single_proj_size = hidden_dim * qkv_inner_dim; // size of each of Q,K,V,O weights for a single head size_t one_weight_file_size = num_heads * single_proj_size; // size of each of Q/K/V/O for all heads - // q, k, v -> 0, 1, 2 + size_t q_size = one_weight_file_size, o_size = one_weight_file_size; + size_t k_size = single_proj_size * num_kv_heads, + v_size = single_proj_size * num_kv_heads; + + // stride for q, k, v, o + size_t stride_size = + (q_size + v_size + k_size + o_size) / tensor_partition_num; for (auto file : weight_files) { - size_t partial_size = one_weight_file_size; + int data_index = 0; + size_t partial_size = (file_index == 0 || file_index == 3) + ? one_weight_file_size + : single_proj_size * num_kv_heads; + size_t one_partition_size = partial_size / tensor_partition_num; std::ifstream in(file, std::ios::in | std::ios::binary); if (!in.good()) { @@ -217,37 +237,30 @@ void load_attention_weights(DT *ptr, size_t in_get_size = in.gcount(); if (in_get_size != loaded_data_size) { - std::cout << "load data error" << std::endl; - assert(false); - return; + std::cout << "load attention data error " << in_get_size << ", " + << loaded_data_size; + assert(false && "data size mismatch"); } - assert(partial_size == host_array.size()); - - size_t data_index = 0; - for (int i = 0; i < num_heads; i++) { - size_t start_index = - i * single_proj_size * 4 + file_index * single_proj_size; - for (size_t j = start_index; j < start_index + single_proj_size; j++) { - ptr[j] = host_array.at(data_index); - data_index += 1; + // wq, wk, wo + for (int i = 0; i < tensor_partition_num; i++) { + for (int j = 0; j < one_partition_size; j++) { + ptr[base_index + i * stride_size + j] = host_array.at(data_index++); } } assert(data_index == partial_size); + base_index += one_partition_size; file_index++; - - in.close(); } - // output weight file gets special treatment + assert(base_index == (q_size + k_size + v_size) / tensor_partition_num); + { std::ifstream in(o_file, std::ios::in | std::ios::binary); - std::cout << "Loading attention filename: " << o_file << std::endl; if (!in.good()) { std::cout << "Could not open file: " << o_file << std::endl; } assert(in.good() && "incorrect weight file path"); - size_t full_output_weight_size = num_heads * single_proj_size; - std::vector
host_array(full_output_weight_size); - size_t loaded_data_size = sizeof(DT) * full_output_weight_size; + std::vector
host_array(one_weight_file_size); + size_t loaded_data_size = sizeof(DT) * one_weight_file_size; in.seekg(0, in.end); in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); @@ -257,21 +270,22 @@ void load_attention_weights(DT *ptr, std::cout << "load data error" << std::endl; assert(false); } - assert(full_output_weight_size == host_array.size()); - - for (int i = 0; i < num_heads; i++) { - size_t start_index = i * single_proj_size * 4 + 3 * single_proj_size; - for (size_t j = 0; j < single_proj_size; j++) { - int ff_row_idx = j % hidden_dim; - int ff_col_idx = j / hidden_dim; - assert(ff_row_idx < hidden_dim && ff_col_idx < qkv_inner_dim); - size_t data_index = ff_row_idx * (qkv_inner_dim * num_heads) + - qkv_inner_dim * i + ff_col_idx; - ptr[j + start_index] = host_array.at(data_index); - } + assert(one_weight_file_size == host_array.size()); + int data_index = 0; + + int one_partition_size = qkv_inner_dim * (num_heads / tensor_partition_num); + for (int i = 0; i < one_weight_file_size; i++) { + int part_idx = (i / one_partition_size) % tensor_partition_num; + int block_num = (i / one_partition_size); + int offset = block_num / tensor_partition_num * one_partition_size + + (i % one_partition_size); + ptr[base_index + part_idx * stride_size + offset] = + host_array.at(data_index++); } in.close(); + + assert(data_index == one_weight_file_size); } } @@ -655,6 +669,8 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, volume *= weight->dims[i]; } + std::cout << "load weights: " << layername << "\n"; + assert(data_type_size(weight->data_type) == sizeof(DT)); DT *data = (DT *)malloc(sizeof(DT) * volume); @@ -663,20 +679,23 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (file_path.find("attention_w") != std::string::npos) { if (weight_idx == 0) { - load_attention_weights(data, + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + file_path, + weight_file_path, + volume, + tensor_partition_num); + } else { + load_attention_bias_v2(data, num_heads, + num_kv_heads, hidden_dim, qkv_inner_dim, file_path, - weight_file_path, - volume); - } else { - load_attention_bias(data, - num_heads, - hidden_dim, - qkv_inner_dim, - file_path, - weight_file_path); + weight_file_path); } } else if (file_path.find("self_attention") != std::string::npos) { diff --git a/inference/file_loader.h b/inference/file_loader.h index 8be820b1bd..0c9dfa56cd 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -27,8 +27,10 @@ class FileDataLoader { FileDataLoader(std::string _input_path, std::string _weight_file_path, int _num_heads, + int _num_kv_heads, size_t _hidden_dim, - size_t _qkv_inner_dim); + size_t _qkv_inner_dim, + int _tensor_partition_num); BatchConfig::TokenId *generate_requests(int num, int length); @@ -54,7 +56,7 @@ class FileDataLoader { int offset); private: - int num_heads; + int num_heads, num_kv_heads, tensor_partition_num; size_t hidden_dim, qkv_inner_dim; std::string input_path; std::string weight_file_path; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 4246a78824..84217a22f1 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -52,6 +52,8 @@ void parse_input_args(char **argv, [](unsigned char c) { return std::tolower(c); }); if (model_type_str == "llama") { llm_model_type = ModelType::LLAMA; + } else if (model_type_str == "llama2") { + llm_model_type = ModelType::LLAMA2; } else if 
(model_type_str == "opt") { llm_model_type = ModelType::OPT; } else if (model_type_str == "falcon") { @@ -153,7 +155,7 @@ void FlexFlow::top_level_task(Task const *task, rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); - if (model_type == ModelType::LLAMA) { + if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { LLAMA::create_llama_model(model, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, @@ -170,7 +172,6 @@ void FlexFlow::top_level_task(Task const *task, FALCON::create_falcon_model(model, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, - ffconfig.workersPerNode * ffconfig.numNodes, INC_DECODING_MODE, use_full_precision); } else { diff --git a/inference/models/configs/falcon_7B.json b/inference/models/configs/falcon_7B.json index 445da54272..d89564557a 100644 --- a/inference/models/configs/falcon_7B.json +++ b/inference/models/configs/falcon_7B.json @@ -2,6 +2,7 @@ "n_layers": 32, "vocab_size": 65024, "n_heads": 71, + "n_kv_heads" : 1, "dim": 4544, "multiple_of": 256, "norm_eps": 1e-05, diff --git a/inference/models/configs/llama2_70B.json b/inference/models/configs/llama2_70B.json new file mode 100644 index 0000000000..45751787e6 --- /dev/null +++ b/inference/models/configs/llama2_70B.json @@ -0,0 +1,12 @@ +{ + "n_layers": 80, + "vocab_size": 32000, + "n_heads": 64, + "n_kv_heads": 8, + "dim": 8192, + "multiple_of": 256, + "norm_eps": 1e-5, + "total_requests": 2560, + "hidden_dim": 28672, + "incremental_mode": true +} diff --git a/inference/models/configs/llama2_7B.json b/inference/models/configs/llama2_7B.json new file mode 100644 index 0000000000..5796f70db7 --- /dev/null +++ b/inference/models/configs/llama2_7B.json @@ -0,0 +1,12 @@ +{ + "n_layers": 32, + "vocab_size": 32000, + "n_heads": 32, + "n_kv_heads": 32, + "dim": 4096, + "multiple_of": 256, + "norm_eps": 1e-5, + "total_requests": 2560, + "hidden_dim": 11008, + "incremental_mode": true +} diff --git a/inference/models/configs/llama_160M.json b/inference/models/configs/llama_160M.json index d912c64ab7..85e83804aa 100644 --- a/inference/models/configs/llama_160M.json +++ b/inference/models/configs/llama_160M.json @@ -2,6 +2,7 @@ "n_layers": 12, "vocab_size": 32000, "n_heads": 12, + "n_kv_heads": 12, "dim": 768, "multiple_of": 256, "norm_eps": 1e-6, diff --git a/inference/models/configs/llama_7B.json b/inference/models/configs/llama_7B.json index 0c32ed320d..f0ef126096 100644 --- a/inference/models/configs/llama_7B.json +++ b/inference/models/configs/llama_7B.json @@ -2,6 +2,7 @@ "n_layers": 32, "vocab_size": 32000, "n_heads": 32, + "n_kv_heads": 32, "dim": 4096, "multiple_of": 256, "norm_eps": 1e-6, diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 2d79040f5f..2846549d28 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -22,11 +22,25 @@ using namespace Legion; void FALCON::create_falcon_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, - int num_pipeline_stages, InferenceMode mode, bool use_full_precision) { Config falcon_config(model_config_file_path); falcon_config.printConfig(); + + if (ff.config.tensor_parallelism_degree > falcon_config.n_heads || + ff.config.tensor_parallelism_degree > falcon_config.n_kv_heads) { + assert(false && "The degree of tensor parallelism should be greater than " + "or equal to the number of heads"); + } + + int num_devices = ff.config.workersPerNode * ff.config.numNodes; + int 
num_transformer_layers = falcon_config.n_layers; + assert(num_transformer_layers % ff.config.pipeline_parallelism_degree == 0); + int num_layers_per_pp_block = + num_transformer_layers / ff.config.pipeline_parallelism_degree; + int num_devices_per_data_parallelism_line = + num_devices / ff.config.data_parallelism_degree; + std::unordered_map weights_layers; Tensor input; @@ -74,11 +88,49 @@ void FALCON::create_falcon_model(FFModel &ff, attention_norm); Tensor mha; switch (mode) { + case BEAM_SEARCH_MODE: { + mha = ff.spec_inc_multihead_self_attention( + att_norm, + falcon_config.dim, + falcon_config.n_heads, + falcon_config.n_kv_heads, + falcon_config.dim / falcon_config.n_heads, + falcon_config.dim / falcon_config.n_heads, + 0.0f, + false, + false, + false, + DT_NONE, + NULL, + true); + break; + } + + case TREE_VERIFY_MODE: { + mha = ff.inc_multihead_self_attention_verify( + att_norm, + falcon_config.dim, + falcon_config.n_heads, + falcon_config.n_kv_heads, + falcon_config.dim / falcon_config.n_heads, + falcon_config.dim / falcon_config.n_heads, + 0.0f, /*dropout*/ + false, /*bias*/ + false, /*add_bias_kv*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true /*apply_rotary_embedding*/ + ); + break; + } + case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( + mha = ff.inc_multihead_self_attention( att_norm, falcon_config.dim, falcon_config.n_heads, + falcon_config.n_kv_heads, falcon_config.dim / falcon_config.n_heads, falcon_config.dim / falcon_config.n_heads, 0.0f, /*dropout*/ @@ -86,7 +138,8 @@ void FALCON::create_falcon_model(FFModel &ff, false, /*add_bias_kv*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ - nullptr /*kernel_initializer*/ + nullptr, /*kernel_initializer*/ + true /*apply_rotary_embedding*/ ); break; } @@ -95,10 +148,14 @@ void FALCON::create_falcon_model(FFModel &ff, } } Layer *attention_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + - "_self_attention_dense_weight", - attention_layer); + // multi query + // weights_layers.emplace("layers_" + std::to_string(i) + + // "_self_attention_dense_weight", + // attention_layer); + + weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", + attention_layer); Tensor dense_h_to_4h = ff.dense(att_norm, falcon_config.dim * 4, AC_MODE_NONE, false); Layer *dense_h_to_4h_layer = ff.layers.back(); @@ -136,13 +193,17 @@ void FALCON::create_falcon_model(FFModel &ff, // Compile the model std::cout << "------start compile ----------" << std::endl; + int tensor_partition_num = ff.config.tensor_parallelism_degree; InferenceManager *im = InferenceManager::get_inference_manager(); im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, falcon_config.n_heads, + falcon_config.n_kv_heads, falcon_config.dim, - falcon_config.dim / falcon_config.n_heads); + falcon_config.dim / falcon_config.n_heads, + tensor_partition_num); + std::cout << "------laod weights ----------" << std::endl; fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------load weight finished----------" << std::endl; diff --git a/inference/models/falcon.h b/inference/models/falcon.h index d9c330a8b9..d37ffbc713 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -32,6 +32,7 @@ class FALCON { n_layers = 32; vocab_size = 32000; n_heads = 32; + n_kv_heads = 1; dim = 4096; multiple_of = 256; norm_eps = 1e-6; @@ -54,6 +55,7 @@ class FALCON { n_layers = config_json["n_layers"]; vocab_size = 
config_json["vocab_size"]; n_heads = config_json["n_heads"]; + n_kv_heads = config_json["n_kv_heads"]; dim = config_json["dim"]; multiple_of = config_json["multiple_of"]; norm_eps = config_json["norm_eps"]; @@ -100,14 +102,13 @@ class FALCON { int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, total_requests, incremental_mode, max_seq_len, max_num_tokens, - max_beam_width, max_beam_depth, head_dim; + max_beam_width, max_beam_depth, head_dim, n_kv_heads; float norm_eps; }; static void create_falcon_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, - int num_pipeline_stages, InferenceMode mode, bool use_full_precision = false); }; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index fd2b7fe4f9..0cd53fb141 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -81,6 +81,7 @@ void LLAMA::create_llama_model(FFModel &ff, att_norm, llama_config.dim, llama_config.n_heads, + llama_config.n_kv_heads, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, @@ -97,6 +98,7 @@ void LLAMA::create_llama_model(FFModel &ff, att_norm, llama_config.dim, llama_config.n_heads, + llama_config.n_kv_heads, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, /*dropout*/ @@ -114,6 +116,7 @@ void LLAMA::create_llama_model(FFModel &ff, att_norm, llama_config.dim, llama_config.n_heads, + llama_config.n_kv_heads, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, /*dropout*/ @@ -192,12 +195,15 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); // Compile the model std::cout << "------start compile ----------" << std::endl; + int tensor_partition_num = ff.config.tensor_parallelism_degree; im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, llama_config.n_heads, + llama_config.n_kv_heads, llama_config.dim, - llama_config.dim / llama_config.n_heads); + llama_config.dim / llama_config.n_heads, + tensor_partition_num); fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------load weight finished----------" << std::endl; diff --git a/inference/models/llama.h b/inference/models/llama.h index 61d8908d0c..46a22954e0 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -38,6 +38,7 @@ class LLAMA { total_requests = 2560; incremental_mode = true; hidden_dim = 11008; + n_kv_heads = 32; max_seq_len = BatchConfig::MAX_SEQ_LENGTH; max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -54,6 +55,7 @@ class LLAMA { n_layers = config_json["n_layers"]; vocab_size = config_json["vocab_size"]; n_heads = config_json["n_heads"]; + n_kv_heads = config_json["n_kv_heads"]; dim = config_json["dim"]; multiple_of = config_json["multiple_of"]; norm_eps = config_json["norm_eps"]; @@ -99,7 +101,7 @@ class LLAMA { int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, total_requests, incremental_mode, max_seq_len, max_num_tokens, - max_beam_width, max_beam_depth; + max_beam_width, max_beam_depth, n_kv_heads; float norm_eps; }; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 68b931716f..86ced698f0 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -109,6 +109,7 @@ void OPT::create_opt_model(FFModel &ff, hidden_states, opt_config.hidden_size, opt_config.num_attention_heads, + opt_config.num_attention_heads, 
opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, @@ -130,6 +131,7 @@ void OPT::create_opt_model(FFModel &ff, hidden_states, opt_config.hidden_size, opt_config.num_attention_heads, + opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, @@ -151,6 +153,7 @@ void OPT::create_opt_model(FFModel &ff, hidden_states, opt_config.hidden_size, opt_config.num_attention_heads, + opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, @@ -223,14 +226,17 @@ void OPT::create_opt_model(FFModel &ff, //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; + int tensor_partition_num = ff.config.tensor_parallelism_degree; InferenceManager *im = InferenceManager::get_inference_manager(); im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, opt_config.num_attention_heads, + opt_config.num_attention_heads, opt_config.hidden_size, opt_config.hidden_size / - opt_config.num_attention_heads); + opt_config.num_attention_heads, + tensor_partition_num); fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------finished loading weights----------" << std::endl; im->init_operators_inference(&ff); diff --git a/inference/spec_infer/CMakeLists.txt b/inference/spec_infer/CMakeLists.txt index c374d47506..9697bd8256 100644 --- a/inference/spec_infer/CMakeLists.txt +++ b/inference/spec_infer/CMakeLists.txt @@ -9,7 +9,8 @@ set(CPU_SRC spec_infer.cc ../file_loader.cc ../models/llama.cc - ../models/opt.cc) + ../models/opt.cc + ../models/falcon.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 99131edb34..6b218e107c 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -14,6 +14,7 @@ */ #include "flexflow/inference.h" +#include "models/falcon.h" #include "models/llama.h" #include "models/opt.h" #include @@ -54,8 +55,12 @@ void parse_input_args(char **argv, [](unsigned char c) { return std::tolower(c); }); if (model_type_str == "llama") { model_types.llm_model_type = ModelType::LLAMA; + } else if (model_type_str == "llama2") { + model_types.llm_model_type = ModelType::LLAMA2; } else if (model_type_str == "opt") { model_types.llm_model_type = ModelType::OPT; + } else if (model_type_str == "falcon") { + model_types.llm_model_type = ModelType::FALCON; } else { model_types.llm_model_type = ModelType::UNKNOWN; } @@ -80,8 +85,12 @@ void parse_input_args(char **argv, [](unsigned char c) { return std::tolower(c); }); if (model_type_str == "llama") { model_types.ssm_model_types.push_back(ModelType::LLAMA); + } else if (model_type_str == "llama2") { + model_types.ssm_model_types.push_back(ModelType::LLAMA2); } else if (model_type_str == "opt") { model_types.ssm_model_types.push_back(ModelType::OPT); + } else if (model_type_str == "falcon") { + model_types.ssm_model_types.push_back(ModelType::FALCON); } else { model_types.ssm_model_types.push_back(ModelType::UNKNOWN); } @@ -185,7 +194,8 @@ void FlexFlow::top_level_task(Task const *task, // Create LLM model FFModel tree_model(ffconfig, 
ffconfig.cpu_offload); - if (model_types.llm_model_type == ModelType::LLAMA) { + if (model_types.llm_model_type == ModelType::LLAMA || + model_types.llm_model_type == ModelType::LLAMA2) { LLAMA::create_llama_model(tree_model, file_paths.llm_config_file_path, file_paths.llm_weight_file_path, @@ -198,6 +208,12 @@ void FlexFlow::top_level_task(Task const *task, file_paths.llm_weight_file_path, TREE_VERIFY_MODE, use_full_precision); + } else if (model_types.llm_model_type == ModelType::FALCON) { + FALCON::create_falcon_model(tree_model, + file_paths.llm_config_file_path, + file_paths.llm_weight_file_path, + TREE_VERIFY_MODE, + use_full_precision); } else { assert(false && "Invalid LLM model type passed (or no type was passed)."); } @@ -216,7 +232,8 @@ void FlexFlow::top_level_task(Task const *task, for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { FFModel &beam_model = ssm_models[ssm_id]; - if (model_types.ssm_model_types[ssm_id] == ModelType::LLAMA) { + if (model_types.ssm_model_types[ssm_id] == ModelType::LLAMA || + model_types.ssm_model_types[ssm_id] == ModelType::LLAMA2) { LLAMA::create_llama_model(beam_model, file_paths.ssm_config_file_paths[ssm_id], file_paths.ssm_weight_file_paths[ssm_id], @@ -229,6 +246,12 @@ void FlexFlow::top_level_task(Task const *task, file_paths.ssm_weight_file_paths[ssm_id], BEAM_SEARCH_MODE, use_full_precision); + } else if (model_types.ssm_model_types[ssm_id] == ModelType::FALCON) { + FALCON::create_falcon_model(beam_model, + file_paths.ssm_config_file_paths[ssm_id], + file_paths.ssm_weight_file_paths[ssm_id], + BEAM_SEARCH_MODE, + use_full_precision); } else { assert(false && "Invalid SSM model type passed."); } diff --git a/inference/utils/download_falcon_weights.py b/inference/utils/download_falcon_weights.py new file mode 100644 index 0000000000..a9a094f327 --- /dev/null +++ b/inference/utils/download_falcon_weights.py @@ -0,0 +1,45 @@ +# from transformer import RWForCausalLM +# from configuration_RW import RWConfig +from transformers import AutoModel +import torch +from transformers import AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True) +# model = AutoModel.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True) + + +# model = RWForCausalLM.from_pretrained("tiiuae/falcon-7b") +# print(model.config) + +#lm_head +lm_head_weight = model.lm_head.weight +lm_head_weight.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/lm_head_weight') + +for name, params in model.named_parameters(): + name = ( + name.replace("h.", "layers_") + .replace(".", "_").replace("word_embeddings", "tok_embeddings") + .replace("self_attn", "attention").replace("transformer_", "").replace("self_attention_dense", "attention_wo")) + # name = ( + # name.replace("h.", "layers_") + # .replace(".", "_").replace("word_embeddings", "tok_embeddings") + # .replace("self_attn", "attention").replace("transformer_", "")) + + print(name) + print(params.shape) + + #split q, k, v + if "self_attention_query_key_value" in name: + name_q = name.replace("self_attention_query_key_value", "attention_wq") + name_k = name.replace("self_attention_query_key_value", "attention_wk") + name_v = name.replace("self_attention_query_key_value", "attention_wv") + q, k, v = torch.split(params, [4544, 64, 64], 0) + print(q.shape) + print(k.shape) + print(v.shape) + q.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/' + name_q) + 
k.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/' + name_k) + v.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/' + name_v) + + else: + params.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/' + name) + diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index db36090587..55ece74bc1 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -451,13 +451,6 @@ class TreeIncMultiHeadSelfAttention(Op): def __init__(self, handle, idx=None, name=None): super(TreeIncMultiHeadSelfAttention, self).__init__(handle, idx, name) -# ----------------------------------------------------------------------- -# Multi-query Incremental MultiHeadAttention -# ----------------------------------------------------------------------- -class IncMultiQuerySelfAttention(Op): - def __init__(self, handle, idx=None, name=None): - super(IncMultiQuerySelfAttention, self).__init__(handle, idx, name) - # ----------------------------------------------------------------------- # RMS Norm # ----------------------------------------------------------------------- @@ -579,8 +572,6 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): return SpecIncMultiHeadSelfAttention(handle, idx, name) elif op_type == OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION: return TreeIncMultiHeadSelfAttention(handle, idx, name) - elif op_type == OpType.INC_MULTIQUERY_SELF_ATTENTION: - return IncMultiQuerySelfAttention(handle, idx, name) elif op_type == OpType.RMS_NORM: return RMSNorm(handle, idx, name) elif op_type == OpType.ARG_TOPK: @@ -2107,7 +2098,7 @@ def multihead_attention(self, query, key, value, return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) def inc_multihead_attention(self, input, - embed_dim, num_heads, + embed_dim, num_heads, num_kv_heads, kdim=0, vdim=0, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, @@ -2170,12 +2161,12 @@ def inc_multihead_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc.flexflow_model_add_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) def spec_inc_multihead_attention(self, input, - embed_dim, num_heads, + embed_dim, num_heads, num_kv_heads, kdim=0, vdim=0, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, @@ -2238,12 +2229,12 @@ def spec_inc_multihead_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_spec_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, 
kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc.flexflow_model_add_spec_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) def inc_multihead_self_attention_verify(self, input, - embed_dim, num_heads, + embed_dim, num_heads, num_kv_heads, kdim=0, vdim=0, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, @@ -2306,63 +2297,10 @@ def inc_multihead_self_attention_verify(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc.flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) - def inc_multiquery_self_attention(self, input, - embed_dim, num_heads, - kdim=0, vdim=0, dropout=0.0, - bias=False, add_bias_kv=False, add_zero_attn=False, - data_type=DataType.DT_NONE, kernel_initializer=None, - name=None): - """Defines the Multi-query self attention operation - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
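The Python wrapper being removed here is superseded by the num_kv_heads argument on the remaining attention calls. A minimal usage sketch follows, mirroring the python/flexflow/serve/models/falcon.py hunk further down in this patch; ffmodel, att_norm, hidden_size, and n_head are assumed to be in scope, and the trailing optional arguments (bias, data type, initializer, rotary embedding) are left at the defaults shown in the binding.

# Sketch: Falcon-style multi-query attention through the unified operator,
# i.e. a single shared key/value head (num_kv_heads = 1).
mha = ffmodel.inc_multihead_self_attention(
    att_norm,                # normalized hidden states
    hidden_size,             # embed_dim
    n_head,                  # number of query heads
    1,                       # num_kv_heads: one shared K/V head
    hidden_size // n_head,   # kdim
    hidden_size // n_head,   # vdim
    0.0,                     # dropout
)

Passing num_kv_heads equal to the query head count, as the OPT and LLaMA model files do, reproduces ordinary multi-head attention through the same call.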
- """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, c_name) - self.add_layer(OpType.INC_MULTIQUERY_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIQUERY_SELF_ATTENTION) - def rms_norm(self, input, eps, dim, name=None): """Defines the RMS Norm layer. diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index eeee0ba19d..81f80474dd 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -94,10 +94,11 @@ def build_model(self): ) if self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multiquery_self_attention( + mha = ffmodel.inc_multihead_self_attention( att_norm, self.falcon_config.hidden_size, self.falcon_config.n_head, + 1, self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 13245af6ff..3c83905d61 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -92,6 +92,7 @@ def build_model(self): attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, + self.llama_config.num_attention_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size @@ -110,6 +111,7 @@ def build_model(self): attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, + self.llama_config.num_attention_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size @@ -128,6 +130,7 @@ def build_model(self): attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, + self.llama_config.num_attention_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index d7d89f5fda..deb7a304ff 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -118,6 +118,7 @@ def build_model(self): hidden_states, self.opt_config.hidden_size, self.opt_config.num_attention_heads, + self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout @@ -138,6 +139,7 @@ def build_model(self): hidden_states, self.opt_config.hidden_size, self.opt_config.num_attention_heads, + self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout @@ -158,6 +160,7 @@ def build_model(self): hidden_states, self.opt_config.hidden_size, self.opt_config.num_attention_heads, + self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index d1f13e17de..a6723b38a2 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ 
-227,8 +227,10 @@ def __load_hf_weights(self): self.fileloader = FileDataLoader( self.weights_path, self.hf_config.num_attention_heads, + self.hf_config.num_attention_heads, self.hf_config.hidden_size, self.hf_config.hidden_size // self.hf_config.num_attention_heads, + self.ffconfig.tensor_parallelism_degree, ) model_layers_with_weights = self.model.get_layers_with_weights() diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 94a0b6085c..137c8a872a 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -75,8 +75,9 @@ class InferenceMode(Enum): class ModelType(Enum): UNKNOWN = 3001 LLAMA = 3002 - OPT = 3003 - FALCON = 3004 + LLAMA2 = 3003 + OPT = 3004 + FALCON = 3005 class OpType(Enum): @@ -118,7 +119,6 @@ class OpType(Enum): INC_MULTIHEAD_ATTENTION = 2061 SPEC_INC_MULTIHEAD_SELF_ATTENTION = 2062 TREE_INC_MULTIHEAD_SELF_ATTENTION = 2063 - INC_MULTIQUERY_SELF_ATTENTION = 2064 SAMPLING = 2065 ARGMAX = 2066 GETITEM = 2070 diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index c210836d9b..788b95bfcc 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1054,6 +1054,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( const flexflow_tensor_t input_, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -1074,6 +1075,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( Tensor tensor = handle->inc_multihead_self_attention(input, embed_dim, num_heads, + num_kv_heads, kdim, vdim, dropout, @@ -1095,6 +1097,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( const flexflow_tensor_t input_, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -1116,6 +1119,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( handle->spec_inc_multihead_self_attention(input, embed_dim, num_heads, + num_kv_heads, kdim, vdim, dropout, @@ -1137,6 +1141,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( const flexflow_tensor_t input_, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -1158,6 +1163,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( handle->inc_multihead_self_attention_verify(input, embed_dim, num_heads, + num_kv_heads, kdim, vdim, dropout, @@ -1174,40 +1180,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( - flexflow_model_t handle_, - const flexflow_tensor_t input_, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool bias, - bool add_bias_kv, - bool add_zero_attn, - enum DataType data_type, - flexflow_initializer_t kernel_initializer_, - char const *name) { - FFModel *handle = FFCObjectWrapper::unwrap(handle_); - Tensor input = FFCObjectWrapper::unwrap(input_); - Initializer *kernel_initializer = - FFCObjectWrapper::unwrap(kernel_initializer_); - Tensor tensor = - handle->inc_multihead_self_attention_verify(input, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - data_type, - kernel_initializer, - name); - return FFCObjectWrapper::wrap(tensor); -} - flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, const flexflow_tensor_t input_, float eps, @@ -2334,13 +2306,20 @@ void flexflow_inference_manager_init_operators_inference( flexflow_file_data_loader_t flexflow_file_data_loader_create(char const 
*weight_file_path, int num_heads, + int num_kv_heads, int hidden_dim, - int qkv_inner_dim) { + int qkv_inner_dim, + int tensor_partition_num) { assert(weight_file_path != nullptr && "Cannot convert nullptr char * to std::string"); std::string const weight_file_path_str(weight_file_path); - FileDataLoader *handle = new FileDataLoader( - "", weight_file_path_str, num_heads, hidden_dim, qkv_inner_dim); + FileDataLoader *handle = new FileDataLoader("", + weight_file_path_str, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + tensor_partition_num); DEBUG_PRINT("[FileDataLoader] new %p", handle); return FFCObjectWrapper::wrap(handle); } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index aa2310b0f2..e3ee54f4a2 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -61,6 +61,7 @@ bool IncMultiHeadSelfAttentionParams::is_valid( Tensor FFModel::inc_multihead_self_attention(const Tensor input, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -119,16 +120,19 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, int kParas = kProjSize * kSize; int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int weight_size = qParas * num_heads + kParas * num_kv_heads + + vParas * num_kv_heads + oParas * num_heads; int one_head_size = qParas + kParas + vParas + oParas; + { // compress the weight size if quantization. if (quantization_type != DT_NONE) { one_head_size = get_quantization_to_byte_size( data_type, quantization_type, one_head_size); } - int dims[2] = {one_head_size, num_heads}; + int dims[1] = {weight_size}; li->weights[0] = create_weight_legion_ordering( - 2, + 1, dims, quantization_type == DT_NONE ? 
data_type : quantization_type, li, @@ -138,7 +142,8 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, } if (bias) { // q, k, v, o - int dims[1] = {(qProjSize + kProjSize + vProjSize) * num_heads + oProjSize}; + int dims[1] = {qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -150,6 +155,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); + li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); li->add_int_property("bias", bias); @@ -162,6 +168,8 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); + li->add_int_property("tensor_parallelism_degree", + config.tensor_parallelism_degree); layers.push_back(li); return li->outputs[0]; @@ -176,6 +184,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( int embed_dim = value; layer->get_int_property("num_heads", value); int num_heads = value; + layer->get_int_property("num_kv_heads", value); + int num_kv_heads = value; layer->get_int_property("kdim", value); int kdim = value; layer->get_int_property("vdim", value); @@ -200,12 +210,15 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( DataType quantization_type = (DataType)value; layer->get_int_property("offload", value); bool offload = (bool)value; + layer->get_int_property("tensor_parallelism_degree", value); + int tensor_parallelism_degree = (int)value; return new IncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], embed_dim, num_heads, + num_kv_heads, kdim, vdim, dropout, @@ -219,6 +232,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( false /*allocate_weights*/, quantization_type, offload, + tensor_parallelism_degree, layer->name); } @@ -228,6 +242,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( const ParallelTensor _input, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -241,6 +256,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool allocate_weights, DataType _quantization_type, bool _offload, + int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -251,8 +267,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( (_bias ? 
2 : 1), /*weights*/ 1 /*outputs*/, _input), - num_heads(_num_heads), dropout(_dropout), bias(_bias), - add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -260,14 +276,16 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), - offload(_offload) { + offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; numOutputs = 1; int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; + size_t x = 1; for (int i = 0; i < numdim; i++) { dims[i] = _input->dims[i]; + x *= _input->dims[i].size; } dims[0].size = _embed_dim; // Currently require no parallelism along this dim @@ -281,23 +299,21 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( int vParas = this->vProjSize * this->vSize; int oParas = this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[3]; + ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads; + dims[1].size = this->num_heads * (qParas + oParas) + + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; - dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { - dims[2].size = get_quantization_to_byte_size( - data_type, quantization_type, dims[2].size); + dims[1].size = get_quantization_to_byte_size( + data_type, quantization_type, (qParas + kParas + vParas + oParas)); } - dims[2].degree = 1; - dims[2].parallel_idx = -1; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<3>( + weights[0] = model.create_parallel_weight<2>( dims, quantization_type == DT_NONE ? 
this->data_type : quantization_type, nullptr /*owner_op*/, @@ -306,8 +322,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = - (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[0].size = qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -335,6 +352,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( const ParallelTensor _weight, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -348,6 +366,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool allocate_weights, DataType _quantization_type, bool _offload, + int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -359,8 +378,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( 1 /*outputs*/, _input, _weight), - num_heads(_num_heads), dropout(_dropout), bias(_bias), - add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -368,7 +387,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), - offload(_offload) + offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -389,20 +408,22 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( int vParas = this->vProjSize * this->vSize; int oParas = this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[3]; + ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads; + dims[1].size = this->num_heads * (qParas + oParas) + + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; - dims[2].size = qParas + kParas + vParas + oParas; + // dims[2].size = this->num_heads * (qParas + oParas) + this->num_kv_heads * + // (kParas + vParas); if (quantization_type != DT_NONE) { - dims[2].size = get_quantization_to_byte_size( - data_type, quantization_type, dims[2].size); + dims[1].size = get_quantization_to_byte_size( + data_type, quantization_type, (qParas + kParas + vParas + oParas)); } int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<3>( + weights[0] = model.create_parallel_weight<2>( dims, quantization_type == DT_NONE ? 
this->data_type : quantization_type, NULL /*owner_op*/, @@ -411,8 +432,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = - (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[0].size = qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -447,6 +469,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( input, other.oProjSize, other.num_heads, + other.num_kv_heads, other.qProjSize, other.vProjSize, other.dropout, @@ -460,6 +483,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( allocate_weights, other.quantization_type, other.offload, + other.tensor_parallelism_degree, other.name) {} IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( @@ -473,6 +497,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( input, params.embed_dim, params.num_heads, + params.num_kv_heads, params.kdim, params.vdim, params.dropout, @@ -486,6 +511,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( allocate_weights, params.quantization_type, params.offload, + params.tensor_parallelism_degree, name) {} void IncMultiHeadSelfAttention::init_inference( @@ -612,7 +638,9 @@ OpMeta *IncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + int num_heads = attn->num_heads / attn->tensor_parallelism_degree; + int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree; + assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -626,8 +654,14 @@ OpMeta *IncMultiHeadSelfAttention::init_task( gpu_mem_allocator.register_reserved_work_space( handle.offload_reserve_space, handle.offload_reserve_space_size); } - IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( - handle, attn, weight, gpu_mem_allocator, num_samples, num_heads); + IncMultiHeadSelfAttentionMeta *m = + new IncMultiHeadSelfAttentionMeta(handle, + attn, + weight, + gpu_mem_allocator, + num_samples, + num_heads, + num_kv_heads); if (handle.offload_reserve_space == nullptr) { // assert that we didn't over allocate memory assert(gpu_mem_allocator.reserved_allocated_size == @@ -761,7 +795,7 @@ void IncMultiHeadSelfAttention::inference_task( ctx, task->regions[2].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 3); + assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); assert(task->index_point.get_dim() == 1); @@ -1576,8 +1610,10 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; + params.tensor_parallelism_degree = this->tensor_parallelism_degree, params.quantization_type = this->quantization_type; params.offload = this->offload; + params.num_kv_heads = this->num_kv_heads; return params; } @@ -1591,6 +1627,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.embed_dim); 
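A rough sketch of the weight and bias size arithmetic introduced in the hunks above, where the flat buffer replaces the old (one_head_size, num_heads) layout; plain Python, with square projections (proj_size = embed_dim // num_heads) assumed as in the models touched by this patch, and the function name is mine:

    def attention_param_sizes(embed_dim, num_heads, num_kv_heads, proj_size):
        # mirrors qParas/kParas/vParas/oParas in inc_multihead_self_attention.cc
        q_paras = proj_size * embed_dim   # qProjSize * qSize
        k_paras = proj_size * embed_dim   # kProjSize * kSize
        v_paras = proj_size * embed_dim   # vProjSize * vSize
        o_paras = embed_dim * proj_size   # oProjSize * vProjSize
        weight_size = ((q_paras + o_paras) * num_heads
                       + (k_paras + v_paras) * num_kv_heads)
        bias_size = (proj_size * num_heads
                     + 2 * proj_size * num_kv_heads + embed_dim)
        return weight_size, bias_size

    # e.g. a Falcon-style multi-query layer: 71 query heads sharing 1 KV head
    # attention_param_sizes(4544, 71, 1, 64)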
hash_combine(key, params.num_heads); + hash_combine(key, params.num_kv_heads); hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); @@ -1603,6 +1640,7 @@ size_t hash::operator()( hash_combine(key, params.qk_prod_scaling); hash_combine(key, params.quantization_type); hash_combine(key, params.offload); + hash_combine(key, params.tensor_parallelism_degree); return key; } }; // namespace std diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 9fce37fc30..c68df398df 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -64,7 +64,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads) + int _num_heads, + int _num_kv_heads) : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -92,7 +93,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_heads, + int _global_num_kv_heads, int _num_heads, + int _num_kv_heads, DataType _quantization_type, bool _offload) : OpMeta(handler, attn) { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index f5b72b9ac8..a014b684f3 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -30,34 +30,14 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { -template -__global__ void build_w_out_tensor(DT const *weight_ptr, - DT *contiguous_weight_ptr, - int vProjSize, - int oProjSize, - int num_heads, - int qkv_weight_block_size) { - CUDA_KERNEL_LOOP(i, vProjSize * oProjSize * num_heads) { - // Each slice (one per head) in the weight_ptr has shape (oProjSize, - // vProjSize) - int row_idx = i % oProjSize; - int col_idx = (i / oProjSize) % vProjSize; - int head_idx = i / (oProjSize * vProjSize); - // The contiguous_weight_ptr has shape (vProjSize * num_heads, oProjSize) - int idx = row_idx * vProjSize * num_heads + vProjSize * head_idx + col_idx; - contiguous_weight_ptr[idx] = - weight_ptr[(qkv_weight_block_size + vProjSize * oProjSize) * head_idx + - qkv_weight_block_size + col_idx * oProjSize + row_idx]; - } -} - template __global__ void apply_proj_bias_w(DT *input_ptr, DT const *bias_ptr, int num_tokens, + int qkv_weight_size, int oProjSize) { CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = 3 * oProjSize + i % oProjSize; + int bias_idx = qkv_weight_size + i % oProjSize; input_ptr[i] += bias_ptr[bias_idx]; } } @@ -71,60 +51,144 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int kProjSize, int vProjSize, int global_num_heads, + int global_num_kv_heads, int num_heads, + int num_kv_heads, bool scaling_query, float scaling_factor) { - CUDA_KERNEL_LOOP( - i, num_tokens * (qProjSize + kProjSize + vProjSize) * num_heads) { + CUDA_KERNEL_LOOP(i, + num_tokens * + (qProjSize * num_heads + kProjSize * num_kv_heads + + vProjSize * num_kv_heads)) { // for simplicity, assume q, k, v is in same shape // 0->q, 1->k, 2->v - int qkv_index = i / (num_tokens * qProjSize) % 3; + // int qkv_index = i / (num_tokens * qProjSize) % 3; + + int qkv_index = i < num_tokens * qProjSize * num_heads + ? 0 + : (i < num_tokens * (qProjSize * num_heads + + kProjSize * num_kv_heads) + ? 
1 + : 2); + + // int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); + // int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; + int q_block_size = qProjSize * num_tokens * num_heads; + int k_block_size = kProjSize * num_tokens * num_kv_heads; + + // int idx = i % (num_tokens * (qProjSize)); + + // int real_part_index = + // head_idx * qkv_block_size + qkv_index * q_block_size + idx; + int bias_idx = 0; + if (qkv_index == 0) { + int head_idx = i / (num_tokens * qProjSize); + int global_head_idx = head_idx + shard_id * num_heads; + int global_i = i + shard_id * num_heads * num_tokens * qProjSize; + bias_idx = global_head_idx * qProjSize + + (global_i % (num_tokens * (qProjSize)) % qProjSize); + } else { - int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); - int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int q_block_size = qProjSize * num_tokens; + int idx = + qkv_index == 1 ? i - q_block_size : i - q_block_size - k_block_size; + int pre_length = qkv_index == 1 ? qProjSize * global_num_heads + : qProjSize * global_num_heads + + kProjSize * global_num_kv_heads; - int idx = i % (num_tokens * (qProjSize)); + int head_idx = idx / (num_tokens * kProjSize); + int global_head_idx = head_idx + shard_id * num_kv_heads; + int global_idx = idx + shard_id * num_tokens * num_kv_heads * kProjSize; - int real_part_index = - head_idx * qkv_block_size + qkv_index * q_block_size + idx; + bias_idx = pre_length + global_head_idx * kProjSize + + (global_idx % (num_tokens * (qProjSize)) % qProjSize); + } + // int bias_idx = qkv_index * qProjSize * global_num_heads + + // global_head_idx * qProjSize + (idx % qProjSize); - int global_head_idx = head_idx + shard_id * num_heads; - int bias_idx = qkv_index * qProjSize * global_num_heads + - global_head_idx * qProjSize + (idx % qProjSize); - input_ptr[real_part_index] += bias_ptr[bias_idx]; + input_ptr[i] += bias_ptr[bias_idx]; if (scaling_query && qkv_index == 0) { - input_ptr[real_part_index] *= scaling_factor; + input_ptr[i] *= scaling_factor; } } } template __global__ void - apply_rotary_embedding(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int q_block_size, - int k_block_size, - int v_block_size, - bool q_tensor) { - int proj_size = q_tensor ? qProjSize : kProjSize; - CUDA_KERNEL_LOOP(i, num_tokens * proj_size * num_heads / 2) { + apply_rotary_embedding_native(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int num_kv_heads, + int q_block_size, + int k_block_size, + int q_array_size) { + CUDA_KERNEL_LOOP( + i, num_tokens * (qProjSize * num_heads + kProjSize * num_kv_heads) / 2) { + // create complex number + bool q_tensor = i < (q_array_size / 2); + int proj_size = q_tensor ? qProjSize : kProjSize; + int real_i = q_tensor ? i : i - q_array_size / 2; + + int head_idx = real_i / (num_tokens * proj_size / 2); + int idx = real_i % (num_tokens * proj_size / 2); + int real_part_index = idx * 2 + + head_idx * (q_tensor ? q_block_size : k_block_size) + + (q_tensor ? 
0 : q_array_size); + + int complex_part_index = real_part_index + 1; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + int token_idx = + (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + + int pos_i = real_i % (proj_size / 2); + float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void + apply_rotary_embedding_hf(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int num_kv_heads, + int q_block_size, + int k_block_size, + int q_array_size) { + CUDA_KERNEL_LOOP( + i, num_tokens * (qProjSize * num_heads + kProjSize * num_kv_heads) / 2) { // create complex number - int head_idx = i / (num_tokens * proj_size / 2); - int idx = i % (num_tokens * proj_size / 2); + bool q_tensor = i < (q_array_size / 2); + int proj_size = q_tensor ? qProjSize : kProjSize; + int real_i = q_tensor ? i : i - q_array_size / 2; + + int head_idx = real_i / (num_tokens * proj_size / 2); + int idx = real_i % (num_tokens * proj_size / 2); int token_idx = - (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - int real_part_index = - idx + token_idx * (proj_size / 2) + - head_idx * (q_block_size + k_block_size + v_block_size) + - (q_tensor ? 0 : q_block_size); + int real_part_index = idx + token_idx * (proj_size / 2) + + head_idx * (q_tensor ? q_block_size : k_block_size) + + (q_tensor ? 0 : q_array_size); int complex_part_index = real_part_index + (proj_size / 2); complex_input[i] = {input_ptr[real_part_index], @@ -135,19 +199,16 @@ __global__ void // multiple with input & /copy back to q/k // get position of token - // int head_idx = i / (num_tokens * proj_size); // size_t pos = id_map[token_idx].token_position; size_t pos = tokenInfos[token_idx].abs_depth_in_request; // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - int pos_i = i % (proj_size / 2); + int pos_i = real_i % (proj_size / 2); float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); cuFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; input_ptr[complex_part_index] = complex_input[i].y; } @@ -184,18 +245,18 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m_q == m_k && m_k == m_v); // keep things simple for now int n = bc->num_active_tokens(); int k = m->qSize; - int lda = k, ldb = k, ldc_q = m_q, ldc_k = m_k, ldc_v = m_v; - size_t strideA = - m->weights_params; // need to also skip over all the parameters for each - // head, plus the unused W_o weights - size_t strideB = 0; // input stays the same for all heads. - size_t strideC = - (m_q + m_k + m_v) * n; // size of the output block for each head. - // Q + int m_ = m_q; + int lda = k, ldb = k, ldc = m_q; + + size_t strideA = m_q * k; // query weight head size + size_t strideB = 0; // input stays the same for all heads. 
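To make the index arithmetic in the rewritten kernels easier to follow: the fused QKV projection output is treated as one contiguous buffer holding all query heads first, then the key heads, then the value heads, which is what q_array_size and the qkv_index test in apply_proj_bias_qkv encode. A small illustrative sketch of that layout, in plain Python and with a name of my own choosing:

    def qkv_regions(num_tokens, q_proj, k_proj, v_proj, num_heads, num_kv_heads):
        # region sizes: all query heads, then key heads, then value heads
        q_size = q_proj * num_tokens * num_heads
        k_size = k_proj * num_tokens * num_kv_heads
        v_size = v_proj * num_tokens * num_kv_heads
        # (start, end) offsets of each region in the fused projection buffer
        return {"q": (0, q_size),
                "k": (q_size, q_size + k_size),
                "v": (q_size + k_size, q_size + k_size + v_size)}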
+ size_t strideC = m_q * n; // size of the output block for each head. + + // compute QKV checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, - m_q, + m_, n, k, &alpha, @@ -210,67 +271,20 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, &beta, output_ptr, cublas_data_type, - ldc_q, + ldc, strideC, - m->num_heads, + m->num_heads + m->num_kv_heads + + m->num_kv_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_k, - n, - k, - &alpha, - weight_ptr + m_q * k, - cublas_data_type, - lda, - strideA, - input_ptr, - cublas_data_type, - ldb, - strideB, - &beta, - output_ptr + m_q * n, - cublas_data_type, - ldc_k, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // V - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_v, - n, - k, - &alpha, - weight_ptr + (m_q + m_k) * k, - cublas_data_type, - lda, - strideA, - input_ptr, - cublas_data_type, - ldb, - strideB, - &beta, - output_ptr + (m_q + m_k) * n, - cublas_data_type, - ldc_v, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - // apply rotary emmmbedding for k and v + // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_heads; int q_block_size = m->qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; - int v_block_size = m->vProjSize * num_tokens; + int q_array_size = m->qProjSize * num_tokens * m->num_heads; // apply bias for q, k, v if (*m->bias) { apply_proj_bias_qkv<<kProjSize, m->vProjSize, m->global_num_heads, + m->global_num_kv_heads, m->num_heads, + m->num_kv_heads, *m->scaling_query, m->scaling_factor); } - if (*m->apply_rotary_embedding) { - /*q*/ - apply_rotary_embedding<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - true); - /*k*/ - apply_rotary_embedding<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - false); + /*q&k*/ + parallelism = + num_tokens * + (m->qProjSize * m->num_heads + m->kProjSize * m->num_kv_heads) / 2; + apply_rotary_embedding_hf<<>>(output_ptr, + m->complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->num_heads, + num_tokens, + m->num_kv_heads, + q_block_size, + k_block_size, + q_array_size); } } @@ -329,26 +332,13 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); if (num_tokens > 0) { - int parallelism = m->kProjSize * num_tokens * m->num_heads; + int parallelism = + (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->keyCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ true); - - parallelism = m->vProjSize * num_tokens * m->num_heads; - store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->valueCache), m->token_infos, m->qProjSize, @@ -356,8 +346,8 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ false); + m->num_kv_heads, + BatchConfig::MAX_SEQ_LENGTH); } } @@ -419,19 +409,6 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(false); } } - // reload weight_o for offloading case - int parallelism = m->vProjSize * m->oProjSize * m->num_heads; - build_w_out_tensor<<>>(static_cast
<DT *>(m->weight_ptr), - static_cast<DT *>
(m->W_out_contiguous), - m->vProjSize, - m->oProjSize, - m->num_heads, - (m->qSize * m->qProjSize + - m->kSize * m->kProjSize + - m->vSize * m->vProjSize)); } template @@ -470,7 +447,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, shard_id, output_ptr, bias_ptr, stream); + compute_attention_kernel( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } } // namespace IncMultiHeadAttention @@ -480,34 +458,36 @@ using namespace Kernels::IncMultiHeadAttention; template __global__ void store_kv_cache(DT const *devQKVProjArray, - DT *cache_ptr, + DT *kCache_ptr, + DT *vCache_ptr, BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, int vProjSize, int num_tokens, int num_heads, - int max_seq_len, - bool k_cache) { - CUDA_KERNEL_LOOP(i, - num_tokens * (k_cache ? kProjSize : vProjSize) * num_heads) { + int num_kv_heads, + int max_seq_len) { + CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { + int q_array_size = qProjSize * num_tokens * num_heads; + int k_array_size = kProjSize * num_tokens * num_kv_heads; + + bool k_cache = i < k_array_size; + int real_i = k_cache ? i : i - k_array_size; + int proj_size = k_cache ? kProjSize : vProjSize; - int head_idx = i / (num_tokens * proj_size); - int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; - int data_idx = i % proj_size; - - int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int current_head_block_size = - num_tokens * (k_cache ? qProjSize : qProjSize + kProjSize); - DT val = - devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + - token_idx * proj_size + data_idx]; - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; + int head_idx = real_i / (num_tokens * proj_size); + int token_idx = (real_i - head_idx * (num_tokens * proj_size)) / proj_size; + int data_idx = real_i % proj_size; + + DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * num_tokens + + token_idx * proj_size + data_idx]; int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + + DT *cache_ptr = k_cache ? 
kCache_ptr : vCache_ptr; + cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + data_idx] = val; } @@ -536,6 +516,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int shard_id, DT *output_ptr, DT const *bias_ptr, + DT const *weight_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -551,12 +532,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; - int qkv_block_size = - (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + int q_block_size = m->qProjSize * num_tokens; int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; - int kt_req_block_size = kt_block_size * m->num_heads; + int kt_req_block_size = kt_block_size * m->num_kv_heads; int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; - int vt_req_block_size = vt_block_size * m->num_heads; + int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { @@ -568,51 +548,86 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) + // a flag of using this scaling alpha int m_ = num_new_tokens; int n = total_tokens; int k = m->qProjSize; int lda = k, ldb = k, ldc = m_; - int strideA = qkv_block_size; + int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha DT alpha = 1.0f, beta = 0.0f; if (*m->qk_prod_scaling) { alpha = static_cast
(1.0f / sqrt(m->kProjSize)); } // To get A, skip over Q entries from previous requests (same head) - void const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize; + DT const *A = static_cast
(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize; // To get B, skip over K entries from previous requests (all heads + // padding) - void const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests - void *C = (void *)(m->qk_prods); - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + DT *C = static_cast
(m->qk_prods); + if (m->num_kv_heads == m->num_heads) { + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + } else { + strideB = 0; + // use cublasGemmStridedBatchedEx + int one_step_heads = m->num_heads / m->num_kv_heads; + m_ = num_new_tokens; + n = total_tokens; + k = m->qProjSize; + lda = k, ldb = k, ldc = m_; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A + step * strideA * one_step_heads, + cublas_data_type, + lda, + strideA, + B + step * kt_block_size, + cublas_data_type, + ldb, + strideB, + &beta, + C + step * strideC * one_step_heads, + cublas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. @@ -623,7 +638,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, fill_entries_above_diagonal<<>>(static_cast
(C), + stream>>>(C, num_new_tokens, total_tokens, m->num_heads, @@ -631,8 +646,6 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, static_cast
(-INFINITY)); } // Compute Softmax(QK^T/sqrt(d_k)) - cudnnTensorDescriptor_t qk_tensor; - checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); // Before modifying the parameters below, make sure to read the following // description of the CUDNN_TENSOR_NCHW tensor layout, from // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: @@ -646,7 +659,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int c_param = total_tokens; int h_param = 1; int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, n_param, @@ -654,7 +667,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, h_param, w_param)); float softmax_alpha = 1.0f, softmax_beta = 0.0f; - void *C_softmax = (void *)(m->qk_prods_softmax); + DT *C_softmax = static_cast
(m->qk_prods_softmax); // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The // softmax operation is computed per spatial location (H,W) per image (N) @@ -663,10 +676,10 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &softmax_alpha, - qk_tensor, + m->qk_tensor, C, &softmax_beta, - qk_tensor, + m->qk_tensor, C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; @@ -679,7 +692,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, strideC = num_new_tokens * m->vProjSize; // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous // requests (all heads) - A = static_cast
(C_softmax); + A = C_softmax; // To get B, skip over V^T entries from previous requests (all heads + // padding) B = static_cast
(m->valueCache) + i * vt_req_block_size; @@ -688,36 +701,73 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, C = static_cast
(m->attn_heads) + tokens_previous_requests * m->num_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->num_heads == m->num_kv_heads) { + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + int one_step_heads = m->num_heads / m->num_kv_heads; + n = m->vProjSize; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = 0; + strideC = num_new_tokens * m->vProjSize; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A + step * one_step_heads * strideA, + cublas_data_type, + lda, + strideA, + B + step * vt_block_size, + cublas_data_type, + ldb, + strideB, + &beta, + C + step * one_step_heads * strideC, + cublas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; k = m->vProjSize * m->num_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = static_cast
(m->W_out_contiguous); + A = weight_ptr + m->qSize * (m->qProjSize * m->num_heads + + m->kProjSize * m->num_kv_heads + + m->vProjSize * m->num_kv_heads); B = C; C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; @@ -746,11 +796,15 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_heads + + m->kProjSize * m->global_num_kv_heads + + m->vProjSize * m->global_num_kv_heads; + apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, m->oProjSize); + output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); } assert(tokens_previous_requests == num_tokens); @@ -836,7 +890,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads) + int _num_heads, + int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, INC_DECODING_MODE, attn, @@ -857,7 +912,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator, num_samples, attn->num_heads, + attn->num_kv_heads, _num_heads, + _num_kv_heads, attn->quantization_type, attn->offload) {} @@ -882,13 +939,16 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_heads, + int _global_num_kv_heads, int _num_heads, + int _num_kv_heads, DataType _quantization_type, bool _offload) : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); + checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); qSize = _qSize; kSize = _kSize; vSize = _vSize; @@ -905,10 +965,19 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( offload = _offload; global_num_heads = _global_num_heads; + global_num_kv_heads = _global_num_kv_heads; num_heads = _num_heads; - weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + - oProjSize * (vProjSize > 0 ? vProjSize : vSize)); - weightSize = weights_params * num_heads * size_of_dt; + num_kv_heads = _num_kv_heads; + // weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + // + + // oProjSize * (vProjSize > 0 ? vProjSize : vSize)); + // weightSize = weights_params * num_heads * size_of_dt; + + weightSize = + ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)) * + num_heads + + (kSize * kProjSize + vSize * vProjSize) * num_kv_heads) * + size_of_dt; if (quantization_type != DT_NONE) { quantized_weightSize = get_quantization_to_byte_size( attn->data_type, quantization_type, weightSize); @@ -945,26 +1014,38 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; + // size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; + // size_t qkv_max_proj_size = + // BatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; + size_t qkv_max_proj_size = - BatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; + BatchConfig::MAX_NUM_TOKENS * + (qProjSize * num_heads + kProjSize * num_kv_heads + + vProjSize * num_kv_heads); + // std::cout << "num_kv_heads: " << BatchConfig::MAX_NUM_TOKENS << ", " + // << qProjSize << ", " << kProjSize << ", " << vProjSize << ", " + // << num_heads << ", " << num_kv_heads << ", " << + // qkv_max_proj_size + // << std::endl; + // assert(false); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { case INC_DECODING_MODE: case TREE_VERIFY_MODE: { - key_cache_size = num_heads * kProjSize * BatchConfig::MAX_NUM_REQUESTS * + key_cache_size = num_kv_heads * kProjSize * + BatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_SEQ_LENGTH; - value_cache_size = num_heads * vProjSize * + value_cache_size = num_kv_heads * vProjSize * BatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_SEQ_LENGTH; break; } case BEAM_SEARCH_MODE: { key_cache_size = - num_heads * kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + num_kv_heads * kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; value_cache_size = - num_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + num_kv_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; break; } @@ -978,8 +1059,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); size_t W_out_contiguous_size = W_out_block_size * num_heads; - size_t complex_size = - (BatchConfig::MAX_NUM_TOKENS * qProjSize * num_heads) / 2; + size_t complex_size = (BatchConfig::MAX_NUM_TOKENS * + (qProjSize * num_heads + kProjSize * num_kv_heads)) / + 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * @@ -1071,34 +1153,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( quantized_weight_ptr = gpu_mem_allocator.allocate_reserved(quantized_weightSize); } - - if (weight.data_type == DT_FLOAT) { - int parallelism = vProjSize * oProjSize * num_heads; - build_w_out_tensor<<>>( - weight.get_float_ptr(), - (float *)W_out_contiguous, - vProjSize, - oProjSize, - num_heads, - (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); - } else if (weight.data_type == DT_HALF) { - int parallelism = vProjSize * oProjSize * num_heads; - build_w_out_tensor<<>>( - weight.get_half_ptr(), - (half *)W_out_contiguous, - vProjSize, - oProjSize, - num_heads, - (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize)); - } else { - assert(weight.data_type == DT_INT4 || weight.data_type == DT_INT8); - } if (!offload) { assert(gpu_mem_allocator.reserved_total_size == gpu_mem_allocator.reserved_allocated_size); diff --git a/src/ops/inc_multiquery_self_attention.cc b/src/ops/inc_multiquery_self_attention.cc deleted file mode 100644 index 6aa6042b1a..0000000000 --- a/src/ops/inc_multiquery_self_attention.cc +++ /dev/null @@ -1,1432 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/ops/inc_multiquery_self_attention.h" -#include "flexflow/ffconst_utils.h" -#include "flexflow/model.h" -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif -#include "flexflow/utils/hash_utils.h" -#include "legion/legion_utilities.h" -#ifdef INFERENCE_TESTS -#include -using namespace at::indexing; -#endif - -namespace FlexFlow { - -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::Future; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -LegionRuntime::Logger::Category log_inc_mqa("IncrementalMQA"); - -bool IncMultiQuerySelfAttentionParams::is_valid( - ParallelTensorShape const &input) const { - bool is_valid = input.is_valid(); - return is_valid; -} - -Tensor FFModel::inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool bias, - bool add_bias_kv, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - char const *name) { - if (data_type == DT_NONE) { - data_type = input->data_type; - } - Layer *li = nullptr; - int weight_num = bias ? 2 : 1; - if (data_type != input->data_type) { - Tensor casted_input = cast(input, data_type, "type cast for IncMQA"); - li = new Layer(this, - OP_INC_MULTIQUERY_SELF_ATTENTION, - data_type, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - casted_input); - } else { - li = new Layer(this, - OP_INC_MULTIQUERY_SELF_ATTENTION, - data_type, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - input); - } - { - int numdims = input->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = input->dims[i]; - } - dims[0] = embed_dim; - li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, data_type, li, 0, true /*create_grad*/); - } - { - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - // int qSize = input->dims[0], kSize = input->dims[0], vSize = - // input->dims[0]; int qParas = qSize; int kParas = kProjSize; int vParas = - // vProjSize; int oParas = oProjSize; int dims[2] = {qParas + kParas + - // vParas + oParas, num_heads}; - - int dims[2] = {embed_dim + kProjSize + vProjSize + oProjSize, embed_dim}; - - li->weights[0] = create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - li->data_type = data_type; - li->add_int_property("embed_dim", embed_dim); - li->add_int_property("num_heads", num_heads); - li->add_int_property("kdim", kdim); - li->add_int_property("vdim", vdim); - li->add_int_property("bias", bias); - li->add_int_property("add_bias_kv", add_bias_kv); - li->add_int_property("add_zero_attn", add_zero_attn); - li->add_float_property("dropout", dropout); - layers.push_back(li); - - return li->outputs[0]; -} - -Op *IncMultiQuerySelfAttention::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - layer->get_int_property("embed_dim", value); - int embed_dim = value; - 
layer->get_int_property("num_heads", value); - int num_heads = value; - layer->get_int_property("kdim", value); - int kdim = value; - layer->get_int_property("vdim", value); - int vdim = value; - float dropout; - layer->get_float_property("dropout", dropout); - layer->get_int_property("bias", value); - bool bias = (bool)value; - layer->get_int_property("add_bias_kv", value); - bool add_bias_kv = (bool)value; - layer->get_int_property("add_zero_attn", value); - bool add_zero_attn = (bool)value; - return new IncMultiQuerySelfAttention(model, - layer->layer_guid, - inputs[0], - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - true /*allocate_weights*/, - layer->name); -} - -IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( - FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor _input, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name) - // Initializer* _bias_initializer) - : Op(model, - OP_INC_MULTIQUERY_SELF_ATTENTION, - _input->data_type, - name, - 1 /*inputs*/, - (_bias ? 2 : 1), /*weights*/ - 1 /*outputs*/, - _input), - num_heads(_num_heads), dropout(_dropout), bias(_bias), - add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), embed_dim(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) { - // overwrite layer_guid - layer_guid = _layer_guid; - - numOutputs = 1; - int numdim = _input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i] = _input->dims[i]; - } - dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - // int qParas = this->qProjSize * this->qSize; - // int kParas = this->kProjSize * this->kSize; - // int vParas = this->vProjSize * this->vSize; - // int oParas = - // this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : - // this->vSize); - ParallelDim dims[3]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].is_replica_dim = false; - dims[1].size = this->embed_dim; - dims[2].size = - this->embed_dim + this->kProjSize + this->vProjSize + this->oProjSize; - dims[2].degree = 1; - dims[2].parallel_idx = -1; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<3>(dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - - outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* // Check correctness */ - /* assert(check_output_input_weight_parallel_dims()); */ -} - -IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( - FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, - int _embed_dim, - int _num_heads, - int _kdim, - int _vdim, - float _dropout, - bool _bias, - bool _add_bias_kv, - bool _add_zero_attn, - bool allocate_weights, - char const *name) - // Initializer* _bias_initializer) - : Op(model, - OP_INC_MULTIQUERY_SELF_ATTENTION, - _input->data_type, - name, - 1 /*inputs*/, - (_bias ? 2 : 1), /*weights*/ - 1 /*outputs*/, - _input, - _weight), - num_heads(_num_heads), dropout(_dropout), bias(_bias), - add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), embed_dim(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size) -// bias_initializer(_bias_initializer) -{ - numOutputs = 1; - int numdim = _input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i] = _input->dims[i]; - } - dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - // int qParas = this->qProjSize * this->qSize; - // int kParas = this->kProjSize * this->kSize; - // int vParas = this->vProjSize * this->vSize; - // int oParas = - // this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : - // this->vSize); - ParallelDim dims[3]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->embed_dim; - dims[1].is_replica_dim = false; - dims[2].size = - this->embed_dim + this->kProjSize + this->vProjSize + this->oProjSize; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<3>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, this->data_type, this); - - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ - // Check correctness - /* assert(check_output_input_weight_parallel_dims()); */ -} - -IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( - FFModel &model, - IncMultiQuerySelfAttention const &other, - const ParallelTensor input, - bool allocate_weights) - : IncMultiQuerySelfAttention(model, - other.layer_guid, - input, - other.oProjSize, - other.num_heads, - other.qProjSize, - other.vProjSize, - other.dropout, - other.bias, - other.add_bias_kv, - other.add_zero_attn, - allocate_weights, - other.name) {} - -IncMultiQuerySelfAttention::IncMultiQuerySelfAttention( - FFModel &model, - IncMultiQuerySelfAttentionParams const ¶ms, - ParallelTensor const &input, - bool allocate_weights, - char const *name) - : IncMultiQuerySelfAttention(model, - params.layer_guid, - input, - params.embed_dim, - params.num_heads, - params.kdim, - params.vdim, - params.dropout, - params.bias, - params.add_bias_kv, - params.add_zero_attn, - allocate_weights, - name) {} - -void IncMultiQuerySelfAttention::init_inference( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = batch_outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(IncMultiQuerySelfAttention)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); -} - -void IncMultiQuerySelfAttention::init(FFModel const &ff) { - - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(IncMultiQuerySelfAttention)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -/* - regions[0](I): input - regions[1](I): weight - regions[2](O): output -*/ -OpMeta *IncMultiQuerySelfAttention::init_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - IncMultiQuerySelfAttention const *attn = - (IncMultiQuerySelfAttention *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - GenericTensorAccessorR input = - helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, - regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorW output = - helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], - FID_DATA, - ctx, - runtime); - - int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; - // assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + - // 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] - // + 1); - int num_heads = (weight.domain.hi()[1] - weight.domain.lo()[1] + 1); - // assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + - // 1); - - 
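For reference, the extent arithmetic above relies on Legion domains being inclusive rectangles: the size along dimension d is hi()[d] - lo()[d] + 1, with the batch size read from dimension 2 of the input and the head count from dimension 1 of the weight. A minimal sketch (the helper below is hypothetical and only makes that arithmetic explicit; it is not part of this file):

    // Hypothetical helper illustrating the inclusive-domain extent arithmetic.
    static inline int domain_extent(Legion::Domain const &dom, int d) {
      return dom.hi()[d] - dom.lo()[d] + 1;
    }
    // num_samples == domain_extent(input.domain, 2);   // batch dimension
    // num_heads   == domain_extent(weight.domain, 1);  // one weight block per head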
Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - IncMultiQuerySelfAttentionMeta *m = new IncMultiQuerySelfAttentionMeta( - handle, attn, weight, gpu_mem, num_samples); - - m->profiling = attn->profiling; - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); - return m; -} - -void IncMultiQuerySelfAttention::forward(FFModel const &ff) { - // IncMultiQuerySelfAttention doesn't support forward - assert(false); -} - -FutureMap IncMultiQuerySelfAttention::inference( - FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - int idx = 0; - // log_inc_mqa.debug("BatchConfig, num_tokens: %d, num_requests: %d", - // bc->num_tokens, - // bc->num_active_requests()); - IndexLauncher launcher(INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(idx++, FID_DATA); - return runtime->execute_index_space(ctx, launcher); -} - -/* - regions[0](I): input - regions[3](I): weight - regions[4](O): output -*/ -void IncMultiQuerySelfAttention::inference_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - - assert(task->regions.size() == regions.size()); - - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { - return; - } - - IncMultiQuerySelfAttentionMeta const *m = - *((IncMultiQuerySelfAttentionMeta **)task->local_args); - - assert(regions.size() == 3); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - - Domain input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - - assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 3); - assert(output_domain.get_dim() == 4); - - IncMultiQuerySelfAttention::inference_kernel_wrapper( - m, bc, input, weight, 
output); -#ifdef INFERENCE_TESTS - printf("Checking IncMultiQuerySelfAttention computations...\n"); - - // ============================================================================= - // Define helper functions to handle row-major arrays - // ============================================================================= - - auto set_value_row_major = [](float *arr, - std::vector const &shape, - std::vector const &indices, - float value) -> void { - int offset = 0; - for (int i = 0; i < shape.size(); i++) { - int index = indices[i]; - int stride = 1; - for (int j = i + 1; j < shape.size(); j++) { - stride *= shape[j]; - } - offset += index * stride; - } - *(arr + offset) = value; - }; - - // ============================================================================= - // Load input/output/weights and parse general configs - // ============================================================================= - - float *input_cpu = - download_tensor(input.get_float_ptr(), input_domain.get_volume()); - assert(input_cpu != nullptr); - float *weight_cpu = download_tensor(weight.get_float_ptr(), - weight_domain.get_volume()); - assert(weight_cpu != nullptr); - float *output_cpu = download_tensor(output.get_float_ptr(), - output_domain.get_volume()); - assert(output_cpu != nullptr); - - // Input tensor dimensions - coord_t data_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; - coord_t max_sequence_length = input_domain.hi()[1] - input_domain.lo()[1] + 1; - coord_t batch_size = input_domain.hi()[2] - input_domain.lo()[2] + 1; - coord_t replica_dim = input_domain.hi()[3] - input_domain.lo()[3] + 1; - assert(replica_dim == 1); - - size_t effective_batch_size = max_sequence_length * batch_size; - float inputs_arr[data_dim][effective_batch_size] = {0}; - for (size_t i = 0; i < data_dim * bc->num_active_tokens(); i++) { - size_t data_index = i % data_dim; - size_t token_index = i / data_dim; - assert(data_index < data_dim); - assert(token_index < effective_batch_size); - inputs_arr[data_index][token_index] = input_cpu[i]; - } - torch::Tensor torch_input = torch::from_blob( - inputs_arr, {data_dim, (long int)effective_batch_size}, torch::kFloat32); - - // Weight tensor dimensions - coord_t all_weight_params = weight_domain.hi()[0] - weight_domain.lo()[0] + 1; - coord_t num_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; - replica_dim = weight_domain.hi()[2] - weight_domain.lo()[2] + 1; - size_t qParas = m->qProjSize * m->qSize; - size_t kParas = m->kProjSize * m->kSize; - size_t vParas = m->vProjSize * m->vSize; - size_t oParas = m->oProjSize * (m->vProjSize > 0 ? m->vProjSize : m->vSize); - - assert(all_weight_params == qParas + kParas + vParas + oParas); - assert(num_heads == m->num_heads); - assert(replica_dim == 1); - - assert(m->qSize == m->kSize && m->kSize == m->vSize); - // printf("m->qSize: %i\n", m->qSize); - // keep things simple for now - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - long int proj_sum = m->qProjSize + m->kProjSize + m->vProjSize; - // load weight manually because Torch can't easily read a tensor serialized in - // column-major order. 
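For context on the conversion loops that follow: the test code rebuilds tensors element by element because the buffers copied back from the GPU are flattened in column-major order. A minimal sketch of the general trick, assuming <torch/torch.h> (this helper is illustrative only and is not the code used below), is to read the buffer with the dimensions reversed and then transpose:

    // Illustrative only: view a column-major (rows x cols) buffer as a torch tensor.
    torch::Tensor col_major_to_torch(float *buf, int64_t rows, int64_t cols) {
      // The same memory viewed row-major with shape (cols, rows) is exactly the
      // transpose of the column-major (rows, cols) matrix, so .t() recovers it.
      return torch::from_blob(buf, {cols, rows}, torch::kFloat32).t();
    }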
- - // printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " - // "bc->num_active_tokens(): %i, num_heads: %lli, - // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", - // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), - // num_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); - // for (int t=0; t < bc->num_active_tokens(); t++) { - // printf("token %i has request_index: %li and token_position: %li\n", - // t, bc->token2ids.token_indexes[t].request_index, - // bc->token2ids.token_indexes[t].token_position); - // } - - // ============================================================================= - // Load the output tensor (with CUDA results), and create a Torch tensor - // ============================================================================= - - float output_cuda[m->oProjSize][effective_batch_size] = {0}; - for (int i = 0; i < m->oProjSize * effective_batch_size; i++) { - int row_idx = i % m->oProjSize; - int col_idx = i / m->oProjSize; - assert(row_idx < m->oProjSize && col_idx < effective_batch_size); - output_cuda[row_idx][col_idx] = output_cpu[i]; - } - torch::Tensor torch_out_cuda = - torch::from_blob(output_cuda, - {m->oProjSize, (int64_t)effective_batch_size}, - torch::kFloat32); - - // ============================================================================= - // Load the Q/K/V projection weights, and create a Torch tensor - // ============================================================================= - std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_heads}; - float *w_qkv = - (float *)calloc(m->qSize * m->qProjSize * 3 * num_heads, sizeof(float)); - assert(w_qkv[0] == 0.0f); - - for (int h = 0; h < num_heads; h++) { - for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { - int row_index = i % m->qSize; - int column_index = i / m->qSize; - // Q - set_value_row_major(w_qkv, - w_qkv_shape, - {row_index, column_index, 0, h}, - weight_cpu[all_weight_params * h + - m->qSize * column_index + row_index]); - // K - set_value_row_major( - w_qkv, - w_qkv_shape, - {row_index, column_index, 1, h}, - weight_cpu[all_weight_params * h + m->qProjSize * m->qSize + - m->qSize * column_index + row_index]); - // V - set_value_row_major( - w_qkv, - w_qkv_shape, - {row_index, column_index, 2, h}, - weight_cpu[all_weight_params * h + 2 * m->qProjSize * m->qSize + - m->qSize * column_index + row_index]); - } - } - // convert weights to torch tensor - torch::Tensor torch_w_qkv = torch::from_blob( - w_qkv, {m->qSize, m->qProjSize, 3, (int)num_heads}, torch::kFloat32); - - /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() - << std::endl; - std::cout << "Torch input size: " << torch_input.sizes() << std::endl; - std::cout << "Number of active tokens: " << bc->num_active_tokens() - << std::endl; */ - // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; - - // ============================================================================= - // Compute the Q/K/V projections, and compare the results with CUDA - // ============================================================================= - - // ----------------------- C++ computations & checks ------------------------ - torch::Tensor qkv_projs = torch::einsum( - "ijkl,im->jmkl", - {torch_w_qkv, - torch_input.index({Slice(), Slice(0, bc->num_active_tokens())})}); - // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; - assert(qkv_projs.sizes()[0] == m->qProjSize); - assert(qkv_projs.sizes()[1] == 
bc->num_active_tokens() && - qkv_projs.sizes()[1] <= effective_batch_size); - assert(qkv_projs.sizes()[2] == 3); - assert(qkv_projs.sizes()[3] == num_heads); - free(w_qkv); - - // ----------------------- Loading CUDA results for this step --------------- - float *QKVProjArray_cpu = download_tensor(m->devQKVProjArray, - BatchConfig::MAX_NUM_TOKENS * - proj_sum * m->num_heads); - assert(QKVProjArray_cpu != nullptr); - - std::vector QKVProjArray_converted_shape = { - m->qProjSize, bc->num_active_tokens(), 3, (int)num_heads}; - float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc->num_active_tokens() * 3 * num_heads, sizeof(float)); - - // skip over padding at the end of QKVProjArray_cpu - // convert from column order to 3D matrix because torch cannot automatically - // import matrices flattened in column order - for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_heads; i++) { - int proj_size_index = i % m->qProjSize; - int head_index = i / (proj_sum * bc->num_active_tokens()); - int token_index = - ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % - bc->num_active_tokens(); - int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / - (m->qProjSize * bc->num_active_tokens()); - assert(proj_size_index < proj_sum); - assert(head_index < num_heads); - assert(token_index < bc->num_active_tokens()); - assert(qkv_offset < 3); - set_value_row_major(QKVProjArray_converted, - QKVProjArray_converted_shape, - {proj_size_index, token_index, qkv_offset, head_index}, - QKVProjArray_cpu[i]); - } - torch::Tensor QKVProjArray_torch = - torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc->num_active_tokens(), 3, num_heads}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - // std::cout << "QKVProjArray_torch" << std::endl; - // for (int i=0; inum_active_tokens(); t++) { - for (size_t d = 0; d < m->kProjSize; d++) { - size_t kcache_idx = - d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].abs_depth_in_request * m->num_heads * - BatchConfig::MAX_NUM_REQUESTS + - h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; - m->kcache[kcache_idx] = - qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) - .item(); - } - for (size_t d = 0; d < m->vProjSize; d++) { - size_t vcache_idx = - d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].abs_depth_in_request * m->num_heads * - BatchConfig::MAX_NUM_REQUESTS + - h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; - m->vcache[vcache_idx] = - qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) - .item(); - } - } - } - // Create torch tensors from the arrays - torch::Tensor K_t = torch::from_blob( - m->kcache, - {m->kProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - torch::Tensor V_t = torch::from_blob( - m->vcache, - {m->vProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - - // Compute useful indices - std::vector req_idxs; - std::vector r_first_idx; - std::vector r_num_tokens; - for (size_t t = 0; t < bc->num_active_tokens(); t++) { - size_t rid = bc->tokensInfo[t].request_index; - if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { - req_idxs.push_back(rid); - r_first_idx.push_back(t); - r_num_tokens.push_back(1); - } else { - r_num_tokens[r_num_tokens.size() - 1]++; - } - assert(req_idxs.size() == r_first_idx.size() && - r_first_idx.size() == 
r_num_tokens.size()); - } - assert(req_idxs.size() == bc->num_active_requests()); - assert(std::accumulate(r_num_tokens.begin(), - r_num_tokens.end(), - decltype(r_num_tokens)::value_type(0)) == - bc->num_active_tokens()); - - // ----------------------- Loading CUDA results for this step --------------- - float *keyCache_cpu = - download_tensor(m->keyCache, - m->num_heads * m->kProjSize * - BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); - float *valueCache_cpu = - download_tensor(m->valueCache, - m->num_heads * m->vProjSize * - BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); - assert(keyCache_cpu != nullptr); - assert(valueCache_cpu != nullptr); - - float *kcache_cuda = (float *)calloc( - m->kProjSize * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - float *vcache_cuda = (float *)calloc( - m->vProjSize * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - int index = 0; - for (int i = 0; i < m->kProjSize; i++) { - for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_heads; k++) { - for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { - int col_major_index = l * m->kProjSize * MAX_SEQ_LEN * m->num_heads + - k * m->kProjSize * MAX_SEQ_LEN + - j * m->kProjSize + i; - kcache_cuda[index++] = keyCache_cpu[col_major_index]; - } - } - } - } - index = 0; - for (int i = 0; i < m->vProjSize; i++) { - for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_heads; k++) { - for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { - int col_major_index = l * m->vProjSize * MAX_SEQ_LEN * m->num_heads + - k * m->vProjSize * MAX_SEQ_LEN + - j * m->vProjSize + i; - vcache_cuda[index++] = valueCache_cpu[col_major_index]; - } - } - } - } - torch::Tensor K_t_cuda = torch::from_blob( - kcache_cuda, - {m->kProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - torch::Tensor V_t_cuda = torch::from_blob( - vcache_cuda, - {m->vProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - - // std::cout << "kcache differences:" << std::endl; - // for (int i=0; i < bc->num_active_requests() + 1; i++) { - // for (int j=0; j < num_heads; j++) { - // for (int l=0; l < m->kProjSize; l++) { - // for (int k=0; k < MAX_SEQ_LEN; k++) { - // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // j * BatchConfig::MAX_NUM_REQUESTS + - // i; - // if ( abs(m->kcache[kcache_idx] - keyCache_cpu[ - // i * m->kProjSize * MAX_SEQ_LEN * num_heads + - // j * m->kProjSize * MAX_SEQ_LEN + - // k * m->kProjSize + - // l - // ]) > 0.00001) { - // printf("req: %i (rid: %i), head: %i, data_dim: %i, token_pos: - // %i\n", - // i, req_idxs[i], j, l, k); - // } - // } - // } - // } - // } - - // std::cout << "keyCache from CUDA:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jkProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // printf("%f ", - // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_heads + - // j * m->kProjSize * MAX_SEQ_LEN + - // k * m->kProjSize + - // l - // ]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // std::cout << "valueCache from CUDA:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jvProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // printf("%f ", - // 
valueCache_cpu[ - // i * m->vProjSize * MAX_SEQ_LEN * num_heads + - // j * m->vProjSize * MAX_SEQ_LEN + - // k * m->vProjSize + - // l]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // printf("\n"); - - // std::cout << "C++ kcache:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; j < num_heads; j++) { - // for (int l=0; l < m->kProjSize; l++) { - // for (int k=0; k < MAX_SEQ_LEN; k++) { - // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // j * BatchConfig::MAX_NUM_REQUESTS + - // i; - // printf("%f ", m->kcache[kcache_idx]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // std::cout << "C++ vcache:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jvProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // size_t vcache_idx = - // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // j * BatchConfig::MAX_NUM_REQUESTS + - // i; - // printf("%f ", m->vcache[vcache_idx]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - assert(torch::allclose(K_t_cuda, K_t, 1e-05, 1e-05)); - assert(torch::allclose(V_t_cuda, V_t, 1e-05, 1e-05)); - free(kcache_cuda); - free(vcache_cuda); - - // ============================================================================= - // Load the W_out projection weights - // ============================================================================= - - // ----------------------- C++ operations & checks -------------------------- - float *w_out = (float *)calloc(m->vProjSize * m->num_heads * m->oProjSize, - sizeof(float)); - std::vector w_out_shape = {m->vProjSize, m->num_heads, m->oProjSize}; - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - for (int h = 0; h < num_heads; h++) { - for (int v = 0; v < m->vProjSize; v++) { - for (int o = 0; o < m->oProjSize; o++) { - set_value_row_major( - w_out, - w_out_shape, - {v, h, o}, - weight_cpu[all_weight_params * h + 3 * m->qProjSize * m->qSize + - m->vProjSize * o + v]); - } - } - } - // convert weights to torch tensor - torch::Tensor torch_w_out = torch::from_blob( - w_out, {m->vProjSize, m->num_heads, m->oProjSize}, torch::kFloat32); - - // ----------------------- Loading CUDA results for this step --------------- - float *w_out_cuda = download_tensor( - m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_heads); - assert(w_out_cuda != nullptr); - float *converted_wout_tensor = (float *)calloc( - m->vProjSize * m->num_heads * m->oProjSize, sizeof(float)); - std::vector converted_wout_tensor_shape = { - m->vProjSize, m->num_heads, m->oProjSize}; - - for (int i = 0; i < m->vProjSize * m->num_heads * m->oProjSize; i++) { - int v_idx = i % m->vProjSize; - int h_idx = (i / m->vProjSize) % m->num_heads; - int o_idx = i / (m->vProjSize * m->num_heads); - assert(v_idx < m->vProjSize && h_idx < m->num_heads && - o_idx < m->oProjSize); - set_value_row_major(converted_wout_tensor, - converted_wout_tensor_shape, - {v_idx, h_idx, o_idx}, - w_out_cuda[i]); - } - torch::Tensor w_out_cuda_tensor = - torch::from_blob(converted_wout_tensor, - {m->vProjSize, m->num_heads, m->oProjSize}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - assert(torch::allclose(w_out_cuda_tensor, torch_w_out, 1e-05, 
1e-05)); - free(converted_wout_tensor); - - // ============================================================================= - // Compute the softmax(QK^T/sqrt(d_k))V product, request by request - // ============================================================================= - - // ----------------------- C++ initialization steps ------------------------- - torch::Tensor Q_projs = qkv_projs.index({Slice(), Slice(), 0, Slice()}) - .reshape({qkv_projs.sizes()[0], - qkv_projs.sizes()[1], - qkv_projs.sizes()[3]}); - - torch::Tensor qk_products[bc->num_active_requests()]; - torch::Tensor qk_softmax[bc->num_active_requests()]; - torch::Tensor attn_heads[bc->num_active_requests()]; - - torch::Tensor cpp_output = - torch::zeros({m->oProjSize, bc->num_active_tokens()}); - - // ----------------------- Loading CUDA results for this step --------------- - float *qk_prods_cpu = download_tensor( - m->qk_prods, - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads); - assert(qk_prods_cpu != nullptr); - - float *qk_prods_softmax_cpu = download_tensor( - m->qk_prods_softmax, - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads); - assert(qk_prods_softmax_cpu != nullptr); - - float *attn_heads_cpu = download_tensor( - m->attn_heads, BatchConfig::MAX_NUM_TOKENS * m->num_heads * m->vProjSize); - assert(attn_heads_cpu != nullptr); - - // ----------------------- Main loop (request by request) ------------------- - size_t qk_prods_cpu_offset = 0; - - for (size_t r = 0; r < bc->num_active_requests(); r++) { - // Compute pre-request parameters - size_t num_new_tokens = r_num_tokens[r]; - int64_t rid = (int64_t)(req_idxs[r]); - int64_t num_tokens_received_so_far = - (int64_t)(bc->requestsInfo[rid].token_start_offset + - bc->requestsInfo[rid].num_tokens_in_batch); - assert(num_new_tokens == bc->requestsInfo[rid].num_tokens_in_batch); - assert(num_tokens_received_so_far >= (int64_t)num_new_tokens); - - // ----------------------- C++ computations ------------------------------- - // Get the slice of the Q projection tensor with the tokens in the current - // request - torch::Tensor Q_req = - Q_projs.index({Slice(), - Slice(r_first_idx[r], r_first_idx[r] + num_new_tokens), - Slice()}); - // std::cout << "Q_req.sizes(): " << Q_req.sizes() << std::endl; - assert(Q_req.sizes()[0] == m->qProjSize); - assert(Q_req.sizes()[1] == num_new_tokens); - assert(Q_req.sizes()[2] == num_heads); - - /*printf("\n------------ QK multiplication (C++) -------------\n"); - printf("Request r=%lu. num_new_tokens: %lu, num_tokens_received_so_far: %li, - rid: %li, Qproj slice: (%i, %i)\n", r, num_new_tokens, - num_tokens_received_so_far, rid, r_first_idx[r], r_first_idx[r] + - num_new_tokens); - - std::cout << "Q_req matrix (idk dims):" << std::endl << - Q_req.index({Slice(), Slice(), 0}) << std::endl << std::endl; std::cout << - "K_t matrix (ilk dims):" << std::endl << K_t.index({Slice(), Slice(0, - num_tokens_received_so_far), 0, rid}) << std::endl << std::endl; std::cout - << "C++ alpha: " << (1.0f / sqrt(m->kProjSize)) << std::endl;*/ - - // Compute (Q*K^T)/sqrt(d_k) matmul - qk_products[r] = - torch::einsum("ijk,ilk->jlk", - {Q_req, - K_t.index({Slice(), - Slice(0, num_tokens_received_so_far), - Slice(), - rid})}) * - (1.0f / sqrt(m->kProjSize)); - - // Set entries above diagonal to -inf to make attention causal. 
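A small illustrative helper (assuming <torch/torch.h>; it is not the code used in the loop below) that builds the equivalent additive mask for a request with prev cached tokens and fresh new ones: zeros where attention is allowed, -inf wherever a new token would look at a position after its own.

    torch::Tensor causal_mask(int64_t prev, int64_t fresh) {
      // Columns 0..prev-1 (already cached tokens) stay fully visible; inside the
      // trailing fresh x fresh block, entries strictly above the diagonal get -inf.
      torch::Tensor mask = torch::zeros({fresh, prev + fresh});
      mask.index_put_({torch::indexing::Slice(),
                       torch::indexing::Slice(prev, prev + fresh)},
                      torch::full({fresh, fresh}, -INFINITY).triu(1));
      return mask; // conceptually added to the scaled QK^T block before the softmax
    }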
- for (int h = 0; h < num_heads; h++) { - qk_products[r].index( - {Slice(), Slice(num_tokens_received_so_far - num_new_tokens), h}) = - qk_products[r] - .index({Slice(), - Slice(num_tokens_received_so_far - num_new_tokens), - h}) - .tril() + - torch::full({(int64_t)num_new_tokens, (int64_t)num_new_tokens}, - -INFINITY) - .triu() - .fill_diagonal_(0); - } - // Compute softmax for each request block - qk_softmax[r] = torch::softmax(qk_products[r], -2); - assert(qk_softmax[r].sizes()[0] == num_new_tokens); - assert(qk_softmax[r].sizes()[1] == num_tokens_received_so_far); - assert(qk_softmax[r].sizes()[2] == m->num_heads); - - // ------------------- Loading CUDA results for this step --------------- - float *converted_qk_prod = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); - float *converted_qk_prod_softmax = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); - std::vector converted_qk_prod_shape = { - (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_heads}; - - for (size_t i = 0; - i < num_new_tokens * num_tokens_received_so_far * num_heads; - i++) { - size_t new_t_idx = i % num_new_tokens; - size_t all_t_idx = (i / num_new_tokens) % num_tokens_received_so_far; - size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); - assert(new_t_idx < num_new_tokens && - all_t_idx < num_tokens_received_so_far && head_idx < num_heads); - set_value_row_major(converted_qk_prod, - converted_qk_prod_shape, - {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, - qk_prods_cpu[i + qk_prods_cpu_offset]); - set_value_row_major(converted_qk_prod_softmax, - converted_qk_prod_shape, - {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, - qk_prods_softmax_cpu[i + qk_prods_cpu_offset]); - } - torch::Tensor qk_prods_cuda = torch::from_blob( - converted_qk_prod, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, - torch::kFloat32); - torch::Tensor qk_prods_softmax_cuda = torch::from_blob( - converted_qk_prod_softmax, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, - torch::kFloat32); - - // ------------------- Comparing C++ & CUDA results ------------------ - /* std::cout << "C++:" <vProjSize); - assert( - V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) - .sizes()[1] == num_tokens_received_so_far); - assert( - V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) - .sizes()[2] == m->num_heads); - attn_heads[r] = torch::einsum( - "ijk,ljk->ilk", - {qk_softmax[r], - V_t.index( - {Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid})}); - assert(attn_heads[r].sizes()[0] == num_new_tokens); - assert(attn_heads[r].sizes()[1] == m->vProjSize); - assert(attn_heads[r].sizes()[2] == m->num_heads); - - // ------------------- Loading CUDA results for this step --------------- - float converted_attn_heads_cpu[num_new_tokens][m->vProjSize][m->num_heads] = - {0}; - for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_heads; i++) { - int token_ix = i % num_new_tokens; - int vproj_idx = (i / num_new_tokens) % m->vProjSize; - int head_idx = i / (num_new_tokens * m->vProjSize); - assert(token_ix < num_new_tokens && vproj_idx < m->vProjSize && - head_idx < m->num_heads); - converted_attn_heads_cpu[token_ix][vproj_idx][head_idx] = - attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_heads + i]; - } - torch::Tensor converted_attn_heads_cuda = - torch::from_blob(converted_attn_heads_cpu, - {(int64_t)num_new_tokens, m->vProjSize, 
m->num_heads}, - torch::kFloat32); - - // -------------------- Comparing C++ & CUDA results ------------------- - /* std::cout << "CUDA attn head for req " << r << ":" <num_heads; h++) { - std::cout << converted_attn_heads_cuda.index({Slice(), Slice(), h}) << - std::endl; - } - std::cout << "C++ attn head for req " << r << ":" <num_heads; h++) { - std::cout << attn_heads[r].index({Slice(), Slice(), h}) << std::endl; - } */ - assert(torch::allclose( - converted_attn_heads_cuda, attn_heads[r], 1e-05, 1e-05)); - - // ----------------------- C++ computations ---------------------------- - // Compute output values by projecting all heads to output space - cpp_output.index( - {Slice(), - Slice(r_first_idx[r], r_first_idx[r] + (int64_t)num_new_tokens)}) = - torch::einsum("jkl,ijk->li", {torch_w_out, attn_heads[r]}); - - // increment main loop's auxiliary index - qk_prods_cpu_offset += - num_new_tokens * num_tokens_received_so_far * num_heads; - } - - // ----------------------- Comparing C++ & CUDA results --------------------- - /* std::cout << "C++:" <oProjSize; i++) { - std::cout << cpp_output.index({i, Slice()}) << std::endl; - } - std::cout << "CUDA:" <oProjSize; i++) { - std::cout << torch_out_cuda.index({i, Slice(0, - (int64_t)bc->num_active_tokens())}) << std::endl; - } */ - - assert(torch::allclose( - torch_out_cuda.index( - {Slice(), Slice(0, (int64_t)bc->num_active_tokens())}), - cpp_output, - 1e-05, - 1e-05)); - - // ============================================================================= - // Cleanup - // ============================================================================= - free(w_out); - checkCUDA(cudaFreeHost(input_cpu)); - checkCUDA(cudaFreeHost(weight_cpu)); - checkCUDA(cudaFreeHost(output_cpu)); - checkCUDA(cudaFreeHost(QKVProjArray_cpu)); - checkCUDA(cudaFreeHost(keyCache_cpu)); - checkCUDA(cudaFreeHost(valueCache_cpu)); - checkCUDA(cudaFreeHost(qk_prods_cpu)); - checkCUDA(cudaFreeHost(qk_prods_softmax_cpu)); - checkCUDA(cudaFreeHost(attn_heads_cpu)); - checkCUDA(cudaFreeHost(w_out_cuda)); - // assert(false && "All good if you see this assert failure! 
:)"); -#endif - // Done with INFERENCE_TESTS block -} - -void IncMultiQuerySelfAttention::backward(FFModel const &ff) { - // IncMultiQuerySelfAttention does not support backward - assert(false); -} - -bool IncMultiQuerySelfAttention::get_int_parameter(PMParameter para, - int *value) const { - switch (para) { - case PM_NUM_HEADS: - *value = num_heads; - return true; - default: - return Op::get_int_parameter(para, value); - } -} - -bool IncMultiQuerySelfAttention::measure_operator_cost( - Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - return false; -} - -bool operator==(IncMultiQuerySelfAttentionParams const &lhs, - IncMultiQuerySelfAttentionParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && - lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && - lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && - lhs.add_zero_attn == rhs.add_zero_attn; -} - -IncMultiQuerySelfAttentionParams - IncMultiQuerySelfAttention::get_params() const { - IncMultiQuerySelfAttentionParams params; - params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; - params.num_heads = this->num_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; - params.dropout = this->dropout; - params.bias = this->bias; - params.add_bias_kv = this->add_bias_kv; - params.add_zero_attn = this->add_zero_attn; - - return params; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::IncMultiQuerySelfAttentionParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.layer_guid.id); - hash_combine(key, params.embed_dim); - hash_combine(key, params.num_heads); - hash_combine(key, params.kdim); - hash_combine(key, params.vdim); - hash_combine(key, params.dropout); - hash_combine(key, params.bias); - hash_combine(key, params.add_bias_kv); - hash_combine(key, params.add_zero_attn); - return key; -} -}; // namespace std diff --git a/src/ops/inc_multiquery_self_attention.cpp b/src/ops/inc_multiquery_self_attention.cpp deleted file mode 100644 index c032e887a7..0000000000 --- a/src/ops/inc_multiquery_self_attention.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/ops/inc_multiquery_self_attention.h" -#include "flexflow/utils/hip_helper.h" -#include - -namespace FlexFlow { - -// declare Legion names -using Legion::coord_t; -using Legion::Memory; - -/*static*/ -void IncMultiQuerySelfAttention::inference_kernel_wrapper( - IncMultiQuerySelfAttentionMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - hipEvent_t t_start, t_end; - if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); - } - - handle_unimplemented_hip_kernel(OP_INC_MULTIQUERY_SELF_ATTENTION); - - if (m->profiling) { - hipEventRecord(t_end, stream); - checkCUDA(hipEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); - printf("IncMultiQuerySelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); - } -} - -IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( - FFHandler handler, - IncMultiQuerySelfAttention const *attn, - GenericTensorAccessorR const &weight, - Memory gpu_mem, - int num_samples) - : OpMeta(handler, attn) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkCUDNN(miopenSetStream(handler.dnn, stream)); -} - -IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( - FFHandler handler, - InferenceMode infer_mode, - Op const *attn, - int _qSize, - int _kSize, - int _vSize, - int _qProjSize, - int _kProjSize, - int _vProjSize, - int _oProjSize, - int _embed_dim, - bool _bias, - bool _add_bias_kv, - GenericTensorAccessorR const &weight, - Legion::Memory gpu_mem, - int num_samples) - : OpMeta(handler, attn) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkCUDNN(miopenSetStream(handler.dnn, stream)); -} - -IncMultiQuerySelfAttentionMeta::~IncMultiQuerySelfAttentionMeta(void) {} - -}; // namespace FlexFlow diff --git a/src/ops/inc_multiquery_self_attention.cu b/src/ops/inc_multiquery_self_attention.cu deleted file mode 100644 index 1193219c9c..0000000000 --- a/src/ops/inc_multiquery_self_attention.cu +++ /dev/null @@ -1,797 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "cuComplex.h" -#endif -#include "flexflow/ffconst_utils.h" -#include "flexflow/ops/inc_multiquery_self_attention.h" -#include "flexflow/utils/cuda_helper.h" - -namespace FlexFlow { - -// declare Legion names -using Legion::coord_t; -using Legion::Memory; - -namespace Kernels { -namespace IncMultiHeadAttention { - -template -__global__ void apply_rotary_embedding_multi_query( - DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int q_block_size, - int k_block_size, - int v_block_size, - bool q_tensor) { - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_num_heads = q_tensor ? num_heads : 1; - - CUDA_KERNEL_LOOP(i, num_tokens * proj_size * real_num_heads / 2) { - // create complex number - int head_idx = q_tensor ? i / (num_tokens * proj_size / 2) : 0; - int idx = i % (num_tokens * proj_size / 2); - int token_idx = - (i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - - int real_part_index = - idx + token_idx * (proj_size / 2) + - (q_tensor ? head_idx * q_block_size : num_heads * q_block_size); - int complex_part_index = real_part_index + (proj_size / 2); - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 - // apply a Cartesian coordinate transformation - // multiple with input & /copy back to q/k - - // get position of token - // int head_idx = i / (num_tokens * proj_size); - - // size_t pos = id_map[token_idx].token_position; - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - int pos_i = i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -void compute_qkv_kernel(IncMultiQuerySelfAttentionMeta const *m, - BatchConfig const *bc, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - cudaStream_t stream) { - - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; - assert(m->qSize == m->vSize && m->qSize == m->kSize); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; -#endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_heads - int num_tokens = bc->num_active_tokens(); - int m_q = m->qProjSize; - int n = bc->num_active_tokens(); - int k = m->qSize; - int lda = k, ldb = k, ldc = m_q; - size_t strideA = m_q * k; - size_t strideB = 0; - size_t strideC = m_q * n; // size of the output block for each head. 
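Before the cuBLAS call below, it may help to spell out what the strided-batched GEMM parameters encode. A naive reference loop with the same shape semantics (illustrative only; this helper is not part of the file, and it uses float where the kernel is templated on DT) would be:

    // Per head h: output_h (qProjSize x num_tokens) = W_h^T (qProjSize x qSize) * X (qSize x num_tokens),
    // with the input shared by all heads (strideB = 0) and the per-head weight and
    // output blocks advancing by strideA and strideC respectively.
    void q_projection_reference(float const *weight, float const *input, float *output,
                                int qSize, int qProjSize, int num_tokens, int num_heads) {
      for (int h = 0; h < num_heads; h++) {
        float const *W = weight + (size_t)h * qSize * qProjSize;   // strideA block
        float *out = output + (size_t)h * qProjSize * num_tokens;  // strideC block
        for (int t = 0; t < num_tokens; t++) {
          for (int p = 0; p < qProjSize; p++) {
            float acc = 0.0f;
            for (int q = 0; q < qSize; q++) {
              acc += W[p * qSize + q] * input[t * qSize + q];      // column-major W_h^T * x_t
            }
            out[t * qProjSize + p] = acc;
          }
        }
      }
    }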
- // q - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_q, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - strideA, - input_ptr, - cublas_data_type, - ldb, - strideB, - &beta, - output_ptr, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // k - int m_ = m->kProjSize; - int k_ = m->embed_dim; - int n_ = num_tokens; - lda = k_, ldb = k_, ldc = m_; - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n_, - k_, - &alpha, - weight_ptr + m->embed_dim * m->embed_dim, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr + num_tokens * m->embed_dim, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - // v - checkCUDA( - cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n_, - k_, - &alpha, - weight_ptr + m->embed_dim * (m->embed_dim + m->kProjSize), - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr + num_tokens * (m->embed_dim + m->kProjSize), - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save_tensor
(output_ptr, 4544 * - // 7,"/home/ubuntu/FlexFlow/inference/q_before.txt"); - int q_block_size = m->qProjSize * num_tokens; - int k_block_size = m->kProjSize * num_tokens; - int v_block_size = m->vProjSize * num_tokens; - int parallelism = m->qProjSize * num_tokens * m->num_heads / 2; - apply_rotary_embedding_multi_query<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - true); - parallelism = m->kProjSize * num_tokens / 2; - apply_rotary_embedding_multi_query<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - m->num_heads, - num_tokens, - q_block_size, - k_block_size, - v_block_size, - false); - - // save_tensor
(output_ptr, 64 * 7 * 2, - // "/home/ubuntu/FlexFlow/inference/query.txt"); - // save_tensor
(output_ptr, 4544 * - // 7,"/home/ubuntu/FlexFlow/inference/q.txt"); print_tensor
<DT>(output_ptr - // + num_new_tokens * (m->embed_dim + m->kProjSize), 32, "vvvvvvvvv"); -} - -template <typename DT> -void update_kv_cache_kernel(IncMultiQuerySelfAttentionMeta const *m, - BatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - if (num_tokens > 0) { - int parallelism = m->kProjSize * num_tokens; - store_kv_cache_multi_query<<>>( - static_cast<DT *>(m->devQKVProjArray), - static_cast<DT *>(m->keyCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ true); - - parallelism = m->vProjSize * num_tokens; - store_kv_cache_multi_query<<>>( - static_cast<DT *>(m->devQKVProjArray), - static_cast<DT *>(m->valueCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ false); - } -} - -template <typename DT> -void inference_kernel(IncMultiQuerySelfAttentionMeta const *m, - BatchConfig const *bc, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - cudaStream_t stream) { - // here because we need position info in inference 1 - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - input_ptr, - weight_ptr, - static_cast<DT *>(m->devQKVProjArray), - stream); - - // phase 2: Update key/val cache - update_kv_cache_kernel<DT>
(m, bc, stream); - - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, output_ptr, weight_ptr, stream); -} - -} // namespace IncMultiHeadAttention -} // namespace Kernels - -using namespace Kernels::IncMultiHeadAttention; - -template -__global__ void - store_kv_cache_multi_query(DT const *devQKVProjArray, - DT *cache_ptr, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens, - int num_heads, - int max_seq_len, - bool k_cache) { - CUDA_KERNEL_LOOP(i, num_tokens * (k_cache ? kProjSize : vProjSize)) { - int proj_size = k_cache ? kProjSize : vProjSize; - // int head_idx = i / (num_tokens * proj_size); - // int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; - int token_idx = i / proj_size; - int data_idx = i % proj_size; - - // int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - // int current_head_block_size = - // num_tokens * (k_cache ? qProjSize : qProjSize + kProjSize); - - // |q|k|v| - int pre_size = num_tokens * qProjSize * num_heads + - (k_cache ? 0 : kProjSize * num_tokens); - - DT val = devQKVProjArray[pre_size + token_idx * proj_size + data_idx]; - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - cache_ptr[req_id * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; - } -} - -template -__global__ void - fill_entries_above_diagonal_multi_query(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - -template -void compute_attention_kernel(IncMultiQuerySelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; -#endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int qkv_block_size = (m->qProjSize) * num_tokens; - int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; - int kt_req_block_size = kt_block_size; - int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; - int vt_req_block_size = vt_block_size; - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { - if (bc->request_completed[i]) { - continue; - } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + - 
bc->requestsInfo[i].num_tokens_in_batch; - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k, ldb = k, ldc = m_; - int strideA = qkv_block_size; - int strideB = 0; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - // To get A, skip over Q entries from previous requests (same head) - void const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize; - // To get B, skip over K entries from previous requests (all heads + - // padding) - void const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests - void *C = (void *)(m->qk_prods); - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save_tensor
( - // (DT *)A, 64 * 7 * 2, "/home/ubuntu/FlexFlow/inference/query.txt"); - // save_tensor
((DT *)B, 64 * 7, - // "/home/ubuntu/FlexFlow/inference/key.txt"); print_tensor
((DT - // *)m->qk_prods, 32, "output qkprod"); - - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_heads * entries_above_diagonal; - fill_entries_above_diagonal_multi_query<<>>( - static_cast
(C), - num_new_tokens, - total_tokens, - m->num_heads, - entries_above_diagonal, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - cudnnTensorDescriptor_t qk_tensor; - checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - void *C_softmax = (void *)(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - qk_tensor, - C, - &softmax_beta, - qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; - k = total_tokens; - lda = m_, ldb = n, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = 0; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = static_cast
(C_softmax); - // To get B, skip over V^T entries from previous requests (all heads + - // padding) - B = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_heads * m->vProjSize; - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + - m->embed_dim * (m->embed_dim + m->kProjSize + m->vProjSize); - B = C; - C = (output_ptr + tokens_previous_requests * m->oProjSize); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - } - - // print_tensor
(output_ptr, 32, "output 3"); - // save_tensor
( - // output_ptr, 7 * 4544, "/home/ubuntu/FlexFlow/inference/op.txt"); - // assert(false); - - assert(tokens_previous_requests == num_tokens); -} - -/*static*/ -void IncMultiQuerySelfAttention::inference_kernel_wrapper( - IncMultiQuerySelfAttentionMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // bool use_bias = *m->bias; - - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } - - assert(input.data_type == weight.data_type); - assert(input.data_type == output.data_type); - if (input.data_type == DT_HALF) { - Kernels::IncMultiHeadAttention::inference_kernel(m, - bc, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - stream); - } else if (input.data_type == DT_FLOAT) { - Kernels::IncMultiHeadAttention::inference_kernel(m, - bc, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - stream); - } else { - assert(false && "Unspported data type"); - } - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("IncMultiQuerySelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); - } -} - -IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( - FFHandler handler, - IncMultiQuerySelfAttention const *attn, - GenericTensorAccessorR const &weight, - Memory gpu_mem, - int num_samples) - : IncMultiQuerySelfAttentionMeta(handler, - INC_DECODING_MODE, - attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->embed_dim, - attn->bias, - attn->add_bias_kv, - weight, - gpu_mem, - num_samples) {} - -IncMultiQuerySelfAttentionMeta::IncMultiQuerySelfAttentionMeta( - FFHandler handler, - InferenceMode infer_mode, - Op const *attn, - int _qSize, - int _kSize, - int _vSize, - int _qProjSize, - int _kProjSize, - int _vProjSize, - int _oProjSize, - int _embed_dim, - bool _bias, - bool _add_bias_kv, - GenericTensorAccessorR const &weight, - Memory gpu_mem, - int num_samples) - : OpMeta(handler, attn) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkCUDNN(cudnnSetStream(handler.dnn, stream)); - qSize = _qSize; - kSize = _kSize; - vSize = _vSize; - embed_dim = _embed_dim; - // assume dimensions match for now - assert(qSize == kSize); - assert(kSize == vSize); - qProjSize = _qProjSize; - kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul - vProjSize = _vProjSize; - oProjSize = _oProjSize; - size_t size_of_dt = data_type_size(attn->data_type); - - num_heads = _embed_dim / qProjSize; - weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize + - oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)); - weightSize = (_embed_dim + _embed_dim + kProjSize + vProjSize) * _embed_dim * - size_of_dt; - has_load_weights = (bool *)calloc(1, sizeof(bool)); - *has_load_weights = false; - bias = (bool *)calloc(1, sizeof(bool)); - *bias = _bias; - assert(!_add_bias_kv); - -#ifdef INFERENCE_TESTS - kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * - BatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * - BatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); -#endif - - // allocate memory for the seqArray and reserve space - { - // size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; - - size_t qkv_max_proj_size = BatchConfig::MAX_NUM_TOKENS * - (qProjSize * num_heads + kProjSize + vProjSize); - size_t key_cache_size = 0, value_cache_size = 0; - switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { - key_cache_size = kProjSize * BatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; - value_cache_size = vProjSize * BatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; - break; - } - case BEAM_SEARCH_MODE: { - key_cache_size = kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; - value_cache_size = vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; - break; - } - default: - assert(false && "Unkown inference mode"); - } - size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; - size_t qk_prod_size = - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_heads; - size_t attn_heads_size = - BatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; - size_t complex_size = - (BatchConfig::MAX_NUM_TOKENS * qProjSize * num_heads) / 2; - size_t totalSize = - (qkv_max_proj_size + key_cache_size + value_cache_size + - 2 * qk_prod_size + attn_heads_size) * - size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + - complex_size * sizeof(cuFloatComplex); // more components will - // be added here later - - Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(totalSize - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(reserveInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - off_t offset = 0; - devQKVProjArray = reserveInst.pointer_untyped(offset, 0); - offset += qkv_max_proj_size * size_of_dt; - keyCache = reserveInst.pointer_untyped(offset, 0); - offset += key_cache_size * size_of_dt; - valueCache = reserveInst.pointer_untyped(offset, 0); - offset += value_cache_size * size_of_dt; - token_infos = reserveInst.pointer(offset); - offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; - qk_prods = reserveInst.pointer_untyped(offset, 0); - offset += qk_prod_size * size_of_dt; - qk_prods_softmax = reserveInst.pointer_untyped(offset, 0); - offset += qk_prod_size * size_of_dt; - attn_heads = reserveInst.pointer_untyped(offset, 0); - offset += attn_heads_size * size_of_dt; - complex_input = reserveInst.pointer(offset); - offset += complex_size * sizeof(cuFloatComplex); - assert(offset == totalSize); - } - cudaStreamSynchronize(stream); -} - -IncMultiQuerySelfAttentionMeta::~IncMultiQuerySelfAttentionMeta(void) { - reserveInst.destroy(); -#ifdef INFERENCE_TESTS - free(kcache); - free(vcache); -#endif -} - -}; // namespace FlexFlow diff --git 
a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index fe241bb8de..c4b4214f69 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -60,6 +60,7 @@ Tensor FFModel::spec_inc_multihead_self_attention(const Tensor input, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -116,9 +117,11 @@ Tensor int kParas = kProjSize * kSize; int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int weight_size = qParas * num_heads + kParas * num_kv_heads + + vParas * num_kv_heads + oParas * num_heads; { - int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; - li->weights[0] = create_weight_legion_ordering(2, + int dims[1] = {weight_size}; + li->weights[0] = create_weight_legion_ordering(1, dims, data_type, li, @@ -128,7 +131,8 @@ Tensor } if (bias) { // q, k, v, o - int dims[1] = {(qProjSize + kProjSize + vProjSize) * num_heads + oProjSize}; + int dims[1] = {qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -140,6 +144,7 @@ Tensor li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); + li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); li->add_int_property("bias", bias); @@ -165,6 +170,8 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( int embed_dim = value; layer->get_int_property("num_heads", value); int num_heads = value; + layer->get_int_property("num_kv_heads", value); + int num_kv_heads = value; layer->get_int_property("kdim", value); int kdim = value; layer->get_int_property("vdim", value); @@ -190,6 +197,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( inputs[0], embed_dim, num_heads, + num_kv_heads, kdim, vdim, dropout, @@ -210,6 +218,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( const ParallelTensor _input, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -231,8 +240,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( (_bias ? 2 : 1) /*weights*/, 1 /*outputs*/, _input), - num_heads(_num_heads), dropout(_dropout), bias(_bias), - add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -261,18 +270,16 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( int vParas = this->vProjSize * this->vSize; int oParas = this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); - ParallelDim dims[3]; + ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads; + dims[1].size = this->num_heads * (qParas + oParas) + + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; - dims[2].size = qParas + kParas + vParas + oParas; - dims[2].degree = 1; - dims[2].parallel_idx = -1; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<3>(dims, + weights[0] = model.create_parallel_weight<2>(dims, this->data_type, NULL /*owner_op*/, true /*create_grad*/, @@ -280,8 +287,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = - (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[0].size = qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -309,6 +317,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( const ParallelTensor _weight, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -331,8 +340,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( 1 /*outputs*/, _input, _weight), - num_heads(_num_heads), dropout(_dropout), bias(_bias), - add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -360,16 +369,17 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( int vParas = this->vProjSize * this->vSize; int oParas = this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); - ParallelDim dims[3]; + ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads; + dims[1].size = this->num_heads * (qParas + oParas) + + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; - dims[2].size = qParas + kParas + vParas + oParas; + // dims[2].size = qParas + kParas + vParas + oParas; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<3>(dims, + weights[0] = model.create_parallel_weight<2>(dims, this->data_type, NULL /*owner_op*/, true /*create_grad*/, @@ -377,8 +387,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = - (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[0].size = qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -413,6 +424,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( input, other.oProjSize, other.num_heads, + other.num_kv_heads, other.qProjSize, other.vProjSize, other.dropout, @@ -437,6 +449,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( input, params.embed_dim, params.num_heads, + params.num_kv_heads, params.kdim, params.vdim, params.dropout, @@ -573,7 +586,8 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + int num_heads = attn->num_heads; + int num_kv_heads = attn->num_kv_heads; assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -582,8 +596,14 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( .first(); MemoryAllocator gpu_mem_allocator(gpu_mem); // We don't do offloading for SSMs (small speculative models) - SpecIncMultiHeadSelfAttentionMeta *m = new SpecIncMultiHeadSelfAttentionMeta( - handle, attn, weight, gpu_mem_allocator, num_samples, num_heads); + SpecIncMultiHeadSelfAttentionMeta *m = + new SpecIncMultiHeadSelfAttentionMeta(handle, + attn, + weight, + gpu_mem_allocator, + num_samples, + num_heads, + num_kv_heads); // assert that we didn't over allocate memory assert(gpu_mem_allocator.instance_allocated_size == gpu_mem_allocator.instance_total_size); @@ -700,7 +720,7 @@ void SpecIncMultiHeadSelfAttention::inference_task( ctx, task->regions[2].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 3); + assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); assert(task->index_point.get_dim() == 1); @@ -764,6 +784,7 @@ SpecIncMultiHeadSelfAttentionParams params.layer_guid = this->layer_guid; params.embed_dim = this->oProjSize; params.num_heads = this->num_heads; + params.num_kv_heads = this->num_kv_heads; params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; @@ -774,6 +795,7 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_query = this->scaling_query; params.scaling_factor = 
this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; + return params; } @@ -786,6 +808,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.embed_dim); hash_combine(key, params.num_heads); + hash_combine(key, params.num_kv_heads); hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 1caf1c1d1b..b95b215b5b 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -64,7 +64,8 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads) + int _num_heads, + int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, BEAM_SEARCH_MODE, attn, @@ -85,7 +86,9 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( gpu_mem_allocator, num_samples, attn->num_heads, + attn->num_kv_heads, _num_heads, + _num_kv_heads, DT_NONE, false) { hipStream_t stream; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 44080b7c5c..664b746096 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -33,7 +33,8 @@ namespace SpecIncMultiHeadAttention { template __global__ void spec_store_kv_cache( DT const *devQKVProjArray, - DT *cache_ptr, + DT *kCache_ptr, + DT *vCache_ptr, BatchConfig::PerTokenInfo *tokenInfos, BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, @@ -43,23 +44,21 @@ __global__ void spec_store_kv_cache( int vProjSize, int num_tokens, int num_heads, + int num_kv_heads, int max_seq_len, int max_beam_width, - bool k_cache, bool is_root) { - CUDA_KERNEL_LOOP(i, - num_tokens * (k_cache ? kProjSize : vProjSize) * num_heads) { - int proj_size = k_cache ? kProjSize : vProjSize; - int head_idx = i / (num_tokens * proj_size); - int token_idx = (i - head_idx * (num_tokens * proj_size)) / proj_size; - int data_idx = i % proj_size; + CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { + int q_array_size = qProjSize * num_tokens * num_heads; + int k_array_size = kProjSize * num_tokens * num_kv_heads; + + bool k_cache = i < k_array_size; + int real_i = k_cache ? i : i - k_array_size; - int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int current_head_block_size = - num_tokens * (k_cache ? qProjSize : qProjSize + kProjSize); - DT val = - devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + - token_idx * proj_size + data_idx]; + int proj_size = k_cache ? kProjSize : vProjSize; + int head_idx = real_i / (num_tokens * proj_size); + int token_idx = (real_i - head_idx * (num_tokens * proj_size)) / proj_size; + int data_idx = real_i % proj_size; // above no need to be changed // int const req_id = id_map[token_idx].request_index; @@ -69,6 +68,10 @@ __global__ void spec_store_kv_cache( // int const beam_depth = id_map[token_idx].beam_depth; // int const beam_width = id_map[token_idx].beam_width; + DT val = devQKVProjArray[q_array_size + (k_cache ? 
0 : k_array_size) + + head_idx * proj_size * num_tokens + + token_idx * proj_size + data_idx]; + int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; @@ -78,16 +81,18 @@ __global__ void spec_store_kv_cache( // new token int new_token_cache_idx = (req_id * max_beam_width + sub_req_id) * - (num_heads * max_seq_len * proj_size) + + (num_kv_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + data_idx; + + DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; cache_ptr[new_token_cache_idx] = val; // replica in the root iteration if (beam_depth == 1) { for (int i = 1; i < beam_width; i++) { cache_ptr[(req_id * max_beam_width + i) * - (num_heads * max_seq_len * proj_size) + + (num_kv_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + data_idx] = val; } @@ -97,7 +102,8 @@ __global__ void spec_store_kv_cache( // // printf("token idx %d\n", token_idx); // printf("data idx: %d, tok_id %d, new_token_cache_idx %d, parent_id %d, // " - // "sub_req_id %d, num_tokens %d, kProjSize %d, num_heads %d, val " + // "sub_req_id %d, num_tokens %d, kProjSize %d, num_kv_heads %d, + // val " // "%f, beam_width %d\n", // data_idx, // tok_id, @@ -106,7 +112,7 @@ __global__ void spec_store_kv_cache( // sub_req_id, // num_tokens, // kProjSize, - // num_heads, + // num_kv_heads, // val, // beam_width); // } @@ -126,11 +132,11 @@ __global__ void spec_store_kv_cache( for (int depth = 0; depth < beam_depth; depth++) { int steal_token_idx = tok_id - beam_depth + depth; int steal_from_idx = (req_id * max_beam_width + parent_id) * - (num_heads * max_seq_len * proj_size) + + (num_kv_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + steal_token_idx * proj_size + data_idx; int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (num_heads * max_seq_len * proj_size) + + (num_kv_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + steal_token_idx * proj_size + data_idx; cache_ptr[steal_to_idx] = cache_ptr[steal_from_idx]; @@ -167,31 +173,13 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); if (num_tokens > 0) { - int parallelism = m->kProjSize * num_tokens * m->num_heads; + int parallelism = + (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; spec_store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->keyCache), - m->token_infos, - m->request_infos, - m->beam_token_infos, - m->beam_request_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - BeamSearchBatchConfig::MAX_BEAM_WIDTH, - /* k_cache = */ true, - /*root*/ curr_depth == 0); - - parallelism = m->vProjSize * num_tokens * m->num_heads; - spec_store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->valueCache), m->token_infos, m->request_infos, @@ -202,9 +190,9 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, m->num_heads, + m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH, BeamSearchBatchConfig::MAX_BEAM_WIDTH, - /* k_cache = */ false, /*root*/ curr_depth == 0); } } @@ -232,6 +220,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int shard_id, DT *output_ptr, DT const *bias_ptr, + DT const *weight_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -248,12 +237,14 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; - int qkv_block_size = - (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + // int qkv_block_size = + // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + int q_block_size = m->qProjSize * num_tokens; + int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; - int kt_req_block_size = kt_block_size * m->num_heads; + int kt_req_block_size = kt_block_size * m->num_kv_heads; int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; - int vt_req_block_size = vt_block_size * m->num_heads; + int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { @@ -273,7 +264,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int n = total_tokens; int k = m->qProjSize; int lda = k, ldb = k, ldc = m_; - int strideA = qkv_block_size; + int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; @@ -283,12 +274,12 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); } // To get A, skip over Q entries from previous requests (same head) - void const *A = static_cast
<DT *>(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize; + DT const *A = static_cast
<DT *>(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize; // To get B, skip over K entries from previous requests (all heads + // padding) - void const *B = static_cast
<DT *>(m->keyCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; + DT const *B = static_cast<DT *>
(m->keyCache) + + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; // if (i == 0 && sub_req_id == 0 && // bc->beam_slots.at(0).current_depth == 1) { @@ -296,31 +287,68 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // printf("key cache offset %d\n", kt_req_block_size); // } // To get C, skip over QK^T products from previous requests - void *C = static_cast
(m->qk_prods) + - m->num_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + DT *C = static_cast
(m->qk_prods) + + m->num_heads * tokens_prev_requests_squares; + + if (m->num_heads == m->num_kv_heads) { + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + strideB = 0; + int one_step_heads = m->num_heads / m->num_kv_heads; + m_ = num_new_tokens; + n = total_tokens; + k = m->qProjSize; + lda = k, ldb = k, ldc = m_; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A + step * strideA * one_step_heads, + cublas_data_type, + lda, + strideA, + B + step * kt_block_size, + cublas_data_type, + ldb, + strideB, + &beta, + C + step * strideC * one_step_heads, + cublas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens); @@ -331,15 +359,13 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, parallelism), 0, stream>>>( - static_cast
<DT *>(C), + C, num_new_tokens, total_tokens, m->num_heads, static_cast<DT>
(-INFINITY)); } // Compute Softmax(QK^T/sqrt(d_k)) - cudnnTensorDescriptor_t qk_tensor; - checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); // Before modifying the parameters below, make sure to read the following // description of the CUDNN_TENSOR_NCHW tensor layout, from // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: @@ -353,7 +379,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int c_param = total_tokens; int h_param = 1; int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, n_param, @@ -361,8 +387,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, h_param, w_param)); float softmax_alpha = 1.0f, softmax_beta = 0.0f; - void *C_softmax = static_cast
<DT *>(m->qk_prods_softmax) + - m->num_heads * tokens_prev_requests_squares; + DT *C_softmax = static_cast<DT *>
(m->qk_prods_softmax) + + m->num_heads * tokens_prev_requests_squares; // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The // softmax operation is computed per spatial location (H,W) per image (N) @@ -371,10 +397,10 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &softmax_alpha, - qk_tensor, + m->qk_tensor, C, &softmax_beta, - qk_tensor, + m->qk_tensor, C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; @@ -387,7 +413,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, strideC = num_new_tokens * m->vProjSize; // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous // requests (all heads) - A = (void const *)C_softmax; + A = C_softmax; // To get B, skip over V^T entries from previous requests (all heads + // padding) B = static_cast
<DT *>(m->valueCache) + @@ -397,37 +423,75 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, C = static_cast<DT *>
(m->attn_heads) + tokens_previous_requests * m->num_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->num_heads == m->num_kv_heads) { + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + int one_step_heads = m->num_heads / m->num_kv_heads; + n = m->vProjSize; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = 0; + strideC = num_new_tokens * m->vProjSize; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A + step * one_step_heads * strideA, + cublas_data_type, + lda, + strideA, + B + step * vt_block_size, + cublas_data_type, + ldb, + strideB, + &beta, + C + step * one_step_heads, + cublas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; k = m->vProjSize * m->num_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = static_cast
<DT *>(m->W_out_contiguous); - B = static_cast
<DT *>(C); + A = weight_ptr + m->qSize * (m->qProjSize * m->num_heads + + m->kProjSize * m->num_kv_heads + + m->vProjSize * m->num_kv_heads); + B = C; C = static_cast<DT *>
(output_ptr) + tokens_previous_requests * m->oProjSize; @@ -455,11 +519,14 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_heads + + m->kProjSize * m->global_num_kv_heads + + m->vProjSize * m->global_num_kv_heads; apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, m->oProjSize); + output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); } } @@ -512,7 +579,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, shard_id, output_ptr, bias_ptr, stream); + compute_attention_kernel( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } } // namespace SpecIncMultiHeadAttention @@ -582,7 +650,6 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } - // print_tensor(output.get_half_ptr(), 10000, "att output"); } SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( @@ -591,7 +658,8 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads) + int _num_heads, + int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, BEAM_SEARCH_MODE, attn, @@ -612,7 +680,9 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( gpu_mem_allocator, num_samples, attn->num_heads, + attn->num_kv_heads, _num_heads, + _num_kv_heads, DT_NONE, false) { cudaStream_t stream; diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 7a7ea4f366..133543650b 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -62,6 +62,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( const Tensor input, int embed_dim, int num_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -121,6 +122,8 @@ Tensor FFModel::inc_multihead_self_attention_verify( int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); int one_head_size = qParas + kParas + vParas + oParas; + int weight_size = qParas * num_heads + kParas * num_kv_heads + + vParas * num_kv_heads + oParas * num_heads; { // compress the weight size if quantization. if (quantization_type != DT_NONE) { @@ -128,9 +131,9 @@ Tensor FFModel::inc_multihead_self_attention_verify( data_type, quantization_type, one_head_size); } - int dims[2] = {one_head_size, num_heads}; + int dims[1] = {weight_size}; li->weights[0] = create_weight_legion_ordering( - 2, + 1, dims, quantization_type == DT_NONE ? 
data_type : quantization_type, li, @@ -140,7 +143,8 @@ Tensor FFModel::inc_multihead_self_attention_verify( } if (bias) { // q, k, v, o - int dims[1] = {(qProjSize + kProjSize + vProjSize) * num_heads + oProjSize}; + int dims[1] = {qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -152,6 +156,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_heads", num_heads); + li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); li->add_int_property("bias", bias); @@ -164,6 +169,8 @@ Tensor FFModel::inc_multihead_self_attention_verify( li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); + li->add_int_property("tensor_parallelism_degree", + config.tensor_parallelism_degree); layers.push_back(li); return li->outputs[0]; } @@ -177,6 +184,8 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( int embed_dim = value; layer->get_int_property("num_heads", value); int num_heads = value; + layer->get_int_property("num_kv_heads", value); + int num_kv_heads = value; layer->get_int_property("kdim", value); int kdim = value; layer->get_int_property("vdim", value); @@ -201,11 +210,14 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( DataType quantization_type = (DataType)value; layer->get_int_property("offload", value); bool offload = (bool)value; + layer->get_int_property("tensor_parallelism_degree", value); + int tensor_parallelism_degree = (int)value; return new TreeIncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], embed_dim, num_heads, + num_kv_heads, kdim, vdim, dropout, @@ -219,6 +231,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( false /*allocate_weights*/, quantization_type, offload, + tensor_parallelism_degree, layer->name); } @@ -228,6 +241,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( const ParallelTensor _input, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -241,6 +255,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool allocate_weights, DataType _quantization_type, bool _offload, + int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -251,8 +266,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( (_bias ? 
2 : 1) /*weights*/, 1 /*outputs*/, _input), - num_heads(_num_heads), dropout(_dropout), bias(_bias), - add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -260,7 +275,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), - offload(_offload) { + offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; @@ -282,22 +297,23 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( int vParas = this->vProjSize * this->vSize; int oParas = this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[3]; + ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads; + dims[1].size = this->num_heads * (qParas + oParas) + + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; - dims[2].size = qParas + kParas + vParas + oParas; + // dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { - dims[2].size = get_quantization_to_byte_size( + dims[1].size = get_quantization_to_byte_size( data_type, quantization_type, dims[2].size); } - dims[2].degree = 1; - dims[2].parallel_idx = -1; + // dims[2].degree = 1; + // dims[2].parallel_idx = -1; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<3>( + weights[0] = model.create_parallel_weight<2>( dims, quantization_type == DT_NONE ? 
this->data_type : quantization_type, NULL /*owner_op*/, @@ -306,8 +322,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = - (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[0].size = qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -335,6 +352,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( const ParallelTensor _weight, int _embed_dim, int _num_heads, + int _num_kv_heads, int _kdim, int _vdim, float _dropout, @@ -348,6 +366,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool allocate_weights, DataType _quantization_type, bool _offload, + int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -359,8 +378,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( 1 /*outputs*/, _input, _weight), - num_heads(_num_heads), dropout(_dropout), bias(_bias), - add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -368,7 +387,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), - offload(_offload) + offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -389,20 +408,21 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( int vParas = this->vProjSize * this->vSize; int oParas = this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[3]; + ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads; + dims[1].size = this->num_heads * (qParas + oParas) + + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; - dims[2].size = qParas + kParas + vParas + oParas; + // dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { - dims[2].size = get_quantization_to_byte_size( + dims[1].size = get_quantization_to_byte_size( data_type, quantization_type, dims[2].size); } int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<3>( + weights[0] = model.create_parallel_weight<2>( dims, quantization_type == DT_NONE ? 
this->data_type : quantization_type, NULL /*owner_op*/, @@ -411,8 +431,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = - (qProjSize + kProjSize + vProjSize) * num_heads + oProjSize; + bias_shape.dims[0].size = qProjSize * num_heads + + (kProjSize + vProjSize) * num_kv_heads + + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -447,6 +468,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( input, other.oProjSize, other.num_heads, + other.num_kv_heads, other.qProjSize, other.vProjSize, other.dropout, @@ -460,6 +482,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( allocate_weights, other.quantization_type, other.offload, + other.tensor_parallelism_degree, other.name) {} TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( @@ -473,6 +496,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( input, params.embed_dim, params.num_heads, + params.num_kv_heads, params.kdim, params.vdim, params.dropout, @@ -486,6 +510,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( allocate_weights, params.quantization_type, params.offload, + params.tensor_parallelism_degree, name) {} void TreeIncMultiHeadSelfAttention::init_inference( @@ -613,7 +638,10 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + // int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + int num_heads = attn->num_heads / attn->tensor_parallelism_degree; + int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree; + assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -627,8 +655,14 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( gpu_mem_allocator.register_reserved_work_space( handle.offload_reserve_space, handle.offload_reserve_space_size); } - TreeIncMultiHeadSelfAttentionMeta *m = new TreeIncMultiHeadSelfAttentionMeta( - handle, attn, weight, gpu_mem_allocator, num_samples, num_heads); + TreeIncMultiHeadSelfAttentionMeta *m = + new TreeIncMultiHeadSelfAttentionMeta(handle, + attn, + weight, + gpu_mem_allocator, + num_samples, + num_heads, + num_kv_heads); if (!attn->offload) { // assert that we didn't over allocate memory assert(gpu_mem_allocator.reserved_allocated_size == @@ -758,7 +792,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( ctx, task->regions[2].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 3); + assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); /* print_tensor(input.get_float_ptr(), @@ -1591,6 +1625,7 @@ TreeIncMultiHeadSelfAttentionParams params.layer_guid = this->layer_guid; params.embed_dim = this->oProjSize; params.num_heads = this->num_heads; + params.num_kv_heads = this->num_kv_heads; params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; @@ -1601,6 +1636,7 @@ TreeIncMultiHeadSelfAttentionParams params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; 
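For reference, the num_heads and num_kv_heads recorded into these params are what size the flattened 1-D weight and bias tensors created earlier in this patch. A minimal sketch of that arithmetic, reusing the operator's field names (the helper function itself is hypothetical, only the formulas mirror the diff):

static int gqa_weight_size(int qSize, int kSize, int vSize,
                           int qProjSize, int kProjSize, int vProjSize,
                           int oProjSize, int num_heads, int num_kv_heads) {
  int qParas = qProjSize * qSize; // parameters of one query head
  int kParas = kProjSize * kSize; // parameters of one key head
  int vParas = vProjSize * vSize; // parameters of one value head
  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); // output slice per head
  // Query and output projections are replicated per attention head; key/value
  // projections are replicated only per KV head (num_kv_heads <= num_heads).
  return num_heads * (qParas + oParas) + num_kv_heads * (kParas + vParas);
}
// The bias vector follows the same split:
//   qProjSize * num_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize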
params.qk_prod_scaling = this->qk_prod_scaling; + params.tensor_parallelism_degree = this->tensor_parallelism_degree; return params; } @@ -1613,6 +1649,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.embed_dim); hash_combine(key, params.num_heads); + hash_combine(key, params.num_kv_heads); hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); @@ -1625,6 +1662,7 @@ size_t hash::operator()( hash_combine(key, params.qk_prod_scaling); hash_combine(key, params.quantization_type); hash_combine(key, params.offload); + hash_combine(key, params.tensor_parallelism_degree); return key; } }; // namespace std diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 74cea451c4..9927ef7af0 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -64,7 +64,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads) + int _num_heads, + int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, TREE_VERIFY_MODE, attn, @@ -85,7 +86,9 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( gpu_mem_allocator, num_samples, attn->num_heads, + attn->num_kv_heads, _num_heads, + _num_kv_heads, attn->quantization_type, attn->offload), num_active_tokens(0) { diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index b46ccb4853..c19f4f37b3 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -34,7 +34,8 @@ namespace TreeIncMultiHeadAttention { template __global__ void commit_tokens_kernel( DT const *devQKVProjArray, - DT *cache_ptr, + DT *kCache_ptr, + DT *vCache_ptr, TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos, int qProjSize, int kProjSize, @@ -42,32 +43,36 @@ __global__ void commit_tokens_kernel( int num_tokens_to_commit, int num_active_tokens_in_last_batch, int num_heads, - int max_seq_len, - bool k_cache) { + int num_kv_heads, + int max_seq_len) { CUDA_KERNEL_LOOP( - i, num_tokens_to_commit * (k_cache ? kProjSize : vProjSize) * num_heads) { + i, num_tokens_to_commit * (kProjSize + vProjSize) * num_kv_heads) { + bool k_cache = i < (num_tokens_to_commit * kProjSize * num_kv_heads); + int real_i = + k_cache ? i : i - (num_tokens_to_commit * kProjSize * num_kv_heads); + int proj_size = k_cache ? kProjSize : vProjSize; - int data_idx = i % proj_size; - int head_idx = i / (num_tokens_to_commit * proj_size); + int data_idx = real_i % proj_size; + int head_idx = real_i / (num_tokens_to_commit * proj_size); int token_pos = - (i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; + (real_i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - int qkv_block_size = - (qProjSize + kProjSize + vProjSize) * num_active_tokens_in_last_batch; - int current_head_block_size = num_active_tokens_in_last_batch * - (k_cache ? 
qProjSize : qProjSize + kProjSize); + int q_array_size = qProjSize * num_active_tokens_in_last_batch * num_heads; + int k_array_size = + kProjSize * num_active_tokens_in_last_batch * num_kv_heads; + DT val = - devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + + devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * num_active_tokens_in_last_batch + token_idx_in_last_batch * proj_size + data_idx]; - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; - cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + + DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; + cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + data_idx] = val; } @@ -79,29 +84,14 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens_to_commit = bc->num_tokens_to_commit; if (num_tokens_to_commit > 0) { - int parallelism = m->kProjSize * num_tokens_to_commit * m->num_heads; + int parallelism = + (m->kProjSize + m->vProjSize) * num_tokens_to_commit * m->num_kv_heads; commit_tokens_kernel<<>>( static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->keyCache), - m->committed_token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ true); - - parallelism = m->vProjSize * num_tokens_to_commit * m->num_heads; - commit_tokens_kernel<<>>( - static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->valueCache), m->committed_token_infos, m->qProjSize, @@ -110,15 +100,16 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ false); + m->num_kv_heads, + BatchConfig::MAX_SEQ_LENGTH); } } template __global__ void update_tree_branch_kv_cache( DT const *devQKVProjArray, - DT *cache_ptr, + DT *kCache_ptr, + DT *vCache_ptr, TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, @@ -127,31 +118,34 @@ __global__ void update_tree_branch_kv_cache( int processed_tokens_in_batch, int total_tokens_in_batch, int num_heads, - int max_seq_len, - bool k_cache) { + int num_kv_heads, + int max_seq_len) { CUDA_KERNEL_LOOP( - i, num_tokens_in_branch * (k_cache ? kProjSize : vProjSize) * num_heads) { + i, num_tokens_in_branch * (kProjSize + vProjSize) * num_kv_heads) { + + int q_array_size = qProjSize * total_tokens_in_batch * num_heads; + int k_array_size = kProjSize * total_tokens_in_batch * num_kv_heads; + + bool k_cache = i < (num_tokens_in_branch * kProjSize * num_kv_heads); + int real_i = + k_cache ? i : i - (num_tokens_in_branch * kProjSize * num_kv_heads); + int proj_size = k_cache ? kProjSize : vProjSize; - int data_idx = i % proj_size; + int data_idx = real_i % proj_size; int token_idx = - (i / proj_size) % num_tokens_in_branch; // index in the tree branch - int head_idx = i / (proj_size * num_tokens_in_branch); + (real_i / proj_size) % num_tokens_in_branch; // index in the tree branch + int head_idx = real_i / (proj_size * num_tokens_in_branch); token_idx += processed_tokens_in_batch; // get index in the whole batch - int qkv_block_size = (qProjSize + kProjSize + vProjSize) * - total_tokens_in_batch; // skip over previous heads - int current_head_block_size = - total_tokens_in_batch * - (k_cache ? qProjSize - : qProjSize + kProjSize); // skip over Q entries (and K entries - // if we are working on the V cache) - DT val = - devQKVProjArray[head_idx * qkv_block_size + current_head_block_size + - token_idx * proj_size + data_idx]; + DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * total_tokens_in_batch + + token_idx * proj_size + data_idx]; + int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + DT *cache_ptr = k_cache ? 
kCache_ptr : vCache_ptr; - cache_ptr[req_id * (num_heads * max_seq_len * proj_size) + + cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + data_idx] = val; } @@ -180,6 +174,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int shard_id, DT *output_ptr, DT const *bias_ptr, + DT const *weight_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -194,12 +189,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; - int qkv_block_size = - (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); + // int qkv_block_size = + // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); + int q_block_size = m->qProjSize * bc->num_active_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; - int kt_req_block_size = kt_block_size * m->num_heads; + int kt_req_block_size = kt_block_size * m->num_kv_heads; int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; - int vt_req_block_size = vt_block_size * m->num_heads; + int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { @@ -222,30 +218,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { // update K-V cache - int parallelism = m->kProjSize * num_new_tokens * m->num_heads; + int parallelism = + (m->kProjSize + m->vProjSize) * num_new_tokens * m->num_kv_heads; update_tree_branch_kv_cache<<>>( static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->keyCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, // num_tokens_in_branch - processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch - m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ true); - - parallelism = m->vProjSize * num_new_tokens * m->num_heads; - update_tree_branch_kv_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->valueCache), m->token_infos, m->qProjSize, @@ -255,8 +235,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, processed_tokens_in_batch, // num_processed_tokens_in_batch m->num_active_tokens, // total_tokens_in_batch m->num_heads, - BatchConfig::MAX_SEQ_LENGTH, - /* k_cache = */ false); + m->num_kv_heads, + BatchConfig::MAX_SEQ_LENGTH); } // bc->token_last_available_idx[i] + 1; @@ -265,7 +245,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int n = total_tokens_in_request; int k = m->qProjSize; int lda = k, ldb = k, ldc = m_; - int strideA = qkv_block_size; + int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens_in_request; @@ -275,37 +255,68 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); } // To get A, skip over Q entries from previous requests (same head) - void const *A = static_cast
<DT *>(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize; + DT const *A = static_cast
<DT *>(m->devQKVProjArray) + + processed_tokens_in_batch * m->qProjSize; // To get B, skip over K entries from previous requests (all heads + // padding) - void const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size; + DT const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests - void *C = static_cast<DT *>
(m->qk_prods); - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + DT *C = static_cast
(m->qk_prods); + + if (m->num_heads == m->num_kv_heads) { + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + strideB = 0; + int one_step_heads = m->num_heads / m->num_kv_heads; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A + step * strideA * one_step_heads, + cublas_data_type, + lda, + strideA, + B + step * kt_block_size, + cublas_data_type, + ldb, + strideB, + &beta, + C + step * strideC * one_step_heads, + cublas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. @@ -318,15 +329,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, parallelism), 0, stream>>>( - static_cast
<DT *>(C), + C, num_new_tokens, total_tokens_in_request, m->num_heads, static_cast<DT>
(-INFINITY)); } // Compute Softmax(QK^T/sqrt(d_k)) - cudnnTensorDescriptor_t qk_tensor; - checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); // Before modifying the parameters below, make sure to read the following // description of the CUDNN_TENSOR_NCHW tensor layout, from // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: @@ -340,7 +349,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int c_param = total_tokens_in_request; int h_param = 1; int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(qk_tensor, + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, n_param, @@ -348,7 +357,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, h_param, w_param)); float softmax_alpha = 1.0f, softmax_beta = 0.0f; - void *C_softmax = (void *)(m->qk_prods_softmax); + DT *C_softmax = static_cast
(m->qk_prods_softmax); // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The // softmax operation is computed per spatial location (H,W) per image (N) @@ -357,10 +366,10 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &softmax_alpha, - qk_tensor, + m->qk_tensor, C, &softmax_beta, - qk_tensor, + m->qk_tensor, C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; @@ -373,7 +382,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, strideC = num_new_tokens * m->vProjSize; // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous // requests (all heads) - A = static_cast
<DT *>(C_softmax); + A = C_softmax; // To get B, skip over V^T entries from previous requests (all heads + // padding) B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; @@ -382,36 +391,70 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, C = static_cast<DT *>
(m->attn_heads) + processed_tokens_in_batch * m->num_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->num_heads == m->num_kv_heads) { + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + int one_step_heads = m->num_heads / m->num_kv_heads; + strideB = 0; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A + step * one_step_heads * strideA, + cublas_data_type, + lda, + strideA, + B + step * vt_block_size, + cublas_data_type, + ldb, + strideB, + &beta, + C + step * one_step_heads * strideC, + cublas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; k = m->vProjSize * m->num_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = static_cast
(m->W_out_contiguous); + A = weight_ptr + m->qSize * (m->qProjSize * m->num_heads + + m->kProjSize * m->num_kv_heads + + m->vProjSize * m->num_kv_heads); B = C; C = static_cast
(output_ptr) + processed_tokens_in_batch * m->oProjSize; @@ -443,11 +486,17 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, } if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; + int qkv_weight_size = m->qProjSize * m->global_num_heads + + m->kProjSize * m->global_num_kv_heads + + m->vProjSize * m->global_num_kv_heads; apply_proj_bias_w<<>>( - output_ptr, bias_ptr, processed_tokens_in_batch, m->oProjSize); + stream>>>(output_ptr, + bias_ptr, + processed_tokens_in_batch, + qkv_weight_size, + m->oProjSize); } assert(processed_tokens_in_batch == bc->num_active_tokens()); @@ -463,35 +512,21 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, DT const *bias_ptr, cudaStream_t stream) { // additional processing for weight uploading - // if (m->handle.offload_reserve_space != nullptr) { - // // Note that we update weight_ptr and bias_ptr when uploading weight and - // // bias - // cudaMemcpyAsync(m->weight_ptr, - // weight_ptr, - // m->weightSize, - // cudaMemcpyHostToDevice, - // stream); - // weight_ptr = static_cast
(m->weight_ptr); - // if (m->biasSize > 0) { - // cudaMemcpyAsync( - // m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, - // stream); - // bias_ptr = static_cast
(m->bias_ptr); - // } - // // reload weight_o for offloading case - // int parallelism = m->vProjSize * m->oProjSize * m->num_heads; - // build_w_out_tensor<<>>(weight_ptr, - // static_cast
(m->W_out_contiguous), - // m->vProjSize, - // m->oProjSize, - // m->num_heads, - // (m->qSize * m->qProjSize + - // m->kSize * m->kProjSize + - // m->vSize * m->vProjSize)); - // } + if (m->handle.offload_reserve_space != nullptr) { + // Note that we update weight_ptr and bias_ptr when uploading weight and + // bias + cudaMemcpyAsync(m->weight_ptr, + weight_ptr, + m->weightSize, + cudaMemcpyHostToDevice, + stream); + weight_ptr = static_cast
(m->weight_ptr); + if (m->biasSize > 0) { + cudaMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); + bias_ptr = static_cast
(m->bias_ptr); + } + } // copy committed tokens info to GPU for the commit_tokens kernel // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing @@ -536,7 +571,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel(m, bc, shard_id, output_ptr, bias_ptr, stream); + compute_attention_kernel( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } } // namespace TreeIncMultiHeadAttention @@ -624,7 +660,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads) + int _num_heads, + int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, TREE_VERIFY_MODE, attn, @@ -645,7 +682,9 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( gpu_mem_allocator, num_samples, attn->num_heads, + attn->num_kv_heads, _num_heads, + _num_kv_heads, attn->quantization_type, attn->offload), num_active_tokens(0) { diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index da22a245f1..e4728bdb88 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -209,8 +209,10 @@ __host__ void updateGAS(float *para_ptr, } template -__host__ void - print_tensor(T const *ptr, size_t num_elements, char const *prefix) { +__host__ void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -221,7 +223,7 @@ __host__ void host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); cudaDeviceSynchronize(); int idx = 0; - printf("%s", prefix); + printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { printf(" %.20lf", (float)host_ptr[idx]); if (idx >= 100) { @@ -278,7 +280,7 @@ __host__ void checkCUDA(cudaMemcpyAsync( host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); // checkCUDA(cudaDeviceSynchronize()); - + cudaDeviceSynchronize(); FILE *tensor_file; tensor_file = fopen(file_name, "w"); for (unsigned i = 0; i < num_elements; i++) { @@ -567,16 +569,26 @@ template __global__ void apply_add_with_scale(int64_t *data_ptr, size_t size, int64_t scale); -template __host__ void - print_tensor(float const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(double const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int32_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int64_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(half const *ptr, size_t rect, char const *prefix); +template __host__ void print_tensor(float const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(double const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int32_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int64_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(half const *ptr, + size_t rect, + char const *prefix, + int shard_id); template __host__ void print_beam_tensor(float const *ptr, size_t num_elements, diff --git a/src/runtime/ffconst_utils.cc 
b/src/runtime/ffconst_utils.cc index 35ec59ce03..0723ee136d 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -154,8 +154,6 @@ std::string get_operator_type_name(OperatorType type) { return "SpecIncMultiHeadSelfAttention"; case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: return "TreeIncMultiHeadSelfAttention"; - case OP_INC_MULTIQUERY_SELF_ATTENTION: - return "IncMultiQuerySelfAttention"; case OP_INPUT: return "Input"; case OP_WEIGHT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index a82add4b62..a3a9e5c4d9 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -33,7 +33,6 @@ #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" #include "flexflow/ops/inc_multihead_self_attention.h" -#include "flexflow/ops/inc_multiquery_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -2326,6 +2325,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->quantization_type); sez.serialize(attn->offload); + sez.serialize(attn->num_kv_heads); + sez.serialize(attn->tensor_parallelism_degree); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2345,6 +2346,7 @@ GraphOptimalViewSerialized sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); + sez.serialize(attn->num_kv_heads); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2366,20 +2368,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->quantization_type); sez.serialize(attn->offload); - break; - } - case OP_INC_MULTIQUERY_SELF_ATTENTION: { - IncMultiQuerySelfAttention *attn = (IncMultiQuerySelfAttention *)op; - sez.serialize(attn->layer_guid.id); - sez.serialize(attn->layer_guid.transformer_layer_id); - sez.serialize(attn->oProjSize); - sez.serialize(attn->num_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); - sez.serialize(attn->dropout); - sez.serialize(attn->bias); - sez.serialize(attn->add_bias_kv); - sez.serialize(attn->add_zero_attn); + sez.serialize(attn->num_kv_heads); + sez.serialize(attn->tensor_parallelism_degree); break; } case OP_SOFTMAX: { @@ -2746,7 +2736,8 @@ void FFModel::deserialize_graph_optimal_view( } case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_heads, k_dim, v_dim; + int embed_dim, num_heads, k_dim, v_dim, num_kv_heads, + tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, offload; @@ -2769,6 +2760,8 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qk_prod_scaling); dez.deserialize(quantization_type); dez.deserialize(offload); + dez.deserialize(num_kv_heads); + dez.deserialize(tensor_parallelism_degree); IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2786,12 +2779,14 @@ void FFModel::deserialize_graph_optimal_view( params.qk_prod_scaling = qk_prod_scaling; params.quantization_type = quantization_type; params.offload = offload; + params.num_kv_heads = num_kv_heads; + params.tensor_parallelism_degree = tensor_parallelism_degree; node = get_or_create_node(inputs[0], params); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_heads, k_dim, v_dim; + int embed_dim, num_heads, k_dim, v_dim, num_kv_heads; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, 
qk_prod_scaling; @@ -2811,6 +2806,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); + dez.deserialize(num_kv_heads); SpecIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2826,13 +2822,15 @@ void FFModel::deserialize_graph_optimal_view( params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; + params.num_kv_heads = num_kv_heads; node = get_or_create_node(inputs[0], params); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_heads, k_dim, v_dim; + int embed_dim, num_heads, k_dim, v_dim, num_kv_heads, + tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, offload; @@ -2855,6 +2853,8 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qk_prod_scaling); dez.deserialize(quantization_type); dez.deserialize(offload); + dez.deserialize(num_kv_heads); + dez.deserialize(tensor_parallelism_degree); TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2872,43 +2872,12 @@ void FFModel::deserialize_graph_optimal_view( params.qk_prod_scaling = qk_prod_scaling; params.quantization_type = quantization_type; params.offload = offload; + params.num_kv_heads = num_kv_heads; + params.tensor_parallelism_degree = tensor_parallelism_degree; node = get_or_create_node(inputs[0], params); break; } - case OP_INC_MULTIQUERY_SELF_ATTENTION: { - assert(num_inputs == 1); - int embed_dim, num_heads, k_dim, v_dim; - float dropout, scaling_factor; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling; - size_t id, transformer_layer_id; - dez.deserialize(id); - dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); - dez.deserialize(embed_dim); - dez.deserialize(num_heads); - dez.deserialize(k_dim); - dez.deserialize(v_dim); - dez.deserialize(dropout); - dez.deserialize(bias); - dez.deserialize(add_bias_kv); - dez.deserialize(add_zero_attn); - - IncMultiQuerySelfAttentionParams params; - params.embed_dim = embed_dim; - params.num_heads = num_heads; - params.kdim = k_dim; - params.vdim = v_dim; - params.dropout = dropout; - params.bias = bias; - params.add_bias_kv = add_bias_kv; - params.add_zero_attn = add_zero_attn; - params.layer_guid = layer_guid; - node = - get_or_create_node(inputs[0], params); - break; - } case OP_TOPK: { node = TopK::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2a7ece3c06..b54a58448e 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -44,7 +44,6 @@ #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" #include "flexflow/ops/inc_multihead_self_attention.h" -#include "flexflow/ops/inc_multiquery_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" @@ -2801,12 +2800,6 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } - case OP_INC_MULTIQUERY_SELF_ATTENTION: { - Op *op = IncMultiQuerySelfAttention::create_operator_from_layer( - *this, layer, inputs); - operators.push_back(op); - return op; - } case OP_BATCHMATMUL: { Op *op = BatchMatmul::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -5726,43 +5719,6 @@ void 
register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - // MultiQueryAttention task - { - TaskVariantRegistrar registrar(INC_MULTIQUERY_SELF_ATTENTION_INIT_TASK_ID, - "IncMultiQuerySelfAttention Init"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "IncMultiQuerySelfAttention Init Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } - { - TaskVariantRegistrar registrar(INC_MULTIQUERY_SELF_ATTENTION_INF_TASK_ID, - "IncMultiQuerySelfAttention Inference"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - IncMultiQuerySelfAttention::inference_task>( - registrar, "IncMultiQuerySelfAttention Inference Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime - ->register_task_variant( - registrar); - } - } // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index bf817f5351..5f9ae98936 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -19,7 +19,6 @@ #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" #include "flexflow/ops/inc_multihead_self_attention.h" -#include "flexflow/ops/inc_multiquery_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/mean.h" diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2712d21c3f..2041bdd9a7 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -71,7 +71,7 @@ void RequestManager::register_tokenizer(ModelType type, this->model_type = type; std::string tokenizer_folder = (!path.empty() && path.back() != '/') ? 
path + '/' : path; - if (model_type == ModelType::LLAMA) { + if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { bool path_to_file = !path.empty() && (path.size() >= strlen("tokenizer.model")) && path.find("tokenizer.model") == @@ -189,14 +189,15 @@ RequestManager::RequestGuid RequestManager::register_new_request(std::string const &prompt, int max_sequence_length) { const std::lock_guard lock(request_queue_mutex); - // Add a new request Request request; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - request.tokens.push_back(this->model_bos_map.at(this->model_type)); - std::vector tokens = this->tokenizer_->Encode(prompt); + if (this->model_bos_map.find(this->model_type) != this->model_bos_map.end()) { + request.tokens.push_back(this->model_bos_map.at(this->model_type)); + } + std::vector tokens = this->tokenizer_->Encode(prompt); if (tokens.size() > BatchConfig::MAX_PROMPT_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " << BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " @@ -207,12 +208,9 @@ RequestManager::RequestGuid // assert(false); return 0; } - for (int i = 0; i < tokens.size(); i++) { std::cout << "[" << i << "]" << tokens.at(i) << "\n"; } - - // assert(false); request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); request.initial_len = request.tokens.size(); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 6a61e70fc6..3a25d99b6f 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -29,7 +29,6 @@ #include "flexflow/ops/experts.h" #include "flexflow/ops/flat.h" #include "flexflow/ops/inc_multihead_self_attention.h" -#include "flexflow/ops/inc_multiquery_self_attention.h" #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" @@ -3725,13 +3724,6 @@ bool FFModel::convert_graph_to_operators( new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0], true); break; } - case OP_INC_MULTIQUERY_SELF_ATTENTION: { - assert(inList.size() == 1); - IncMultiQuerySelfAttention *attn = - (IncMultiQuerySelfAttention *)node.ptr; - new_op = new IncMultiQuerySelfAttention(*this, *attn, inputs[0], true); - break; - } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(inList.size() == 1); TreeIncMultiHeadSelfAttention *attn = From c19882ee29db820c79f88826856e8d38a2531d18 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 4 Aug 2023 04:12:23 +0000 Subject: [PATCH 190/344] api update --- .../cpp/inference/mixture_of_experts/moe.cc | 1 - .../inference/transformers/transformers.cc | 1 - include/flexflow/flexflow_c.h | 61 +++- include/flexflow/model.h | 54 +++- .../ops/inc_multihead_self_attention.h | 14 +- .../ops/inc_multihead_self_attention_params.h | 3 +- .../ops/spec_inc_multihead_self_attention.h | 8 +- ...spec_inc_multihead_self_attention_params.h | 2 +- .../ops/tree_inc_multihead_self_attention.h | 8 +- ...tree_inc_multihead_self_attention_params.h | 3 +- inference/models/configs/llama2_70B.json | 1 - inference/models/configs/llama2_7B.json | 1 - inference/models/configs/llama_160M.json | 1 - inference/models/configs/llama_7B.json | 1 - inference/models/falcon.cc | 6 +- inference/models/llama.cc | 5 +- inference/models/llama.h | 4 +- inference/models/opt.cc | 3 - src/c/flexflow_c.cc | 137 +++++++- src/ops/inc_multihead_self_attention.cc | 300 ++++++++++-------- src/ops/inc_multihead_self_attention.cpp | 6 +- src/ops/inc_multihead_self_attention.cu | 139 ++++---- 
src/ops/spec_inc_multihead_self_attention.cc | 84 +++-- src/ops/spec_inc_multihead_self_attention.cpp | 6 +- src/ops/spec_inc_multihead_self_attention.cu | 46 +-- src/ops/tree_inc_multihead_self_attention.cc | 278 +++++++++------- src/ops/tree_inc_multihead_self_attention.cpp | 6 +- src/ops/tree_inc_multihead_self_attention.cu | 49 +-- src/runtime/graph.cc | 24 +- 29 files changed, 795 insertions(+), 457 deletions(-) diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc index 5125e5d98e..4a5c33c9b0 100644 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ b/examples/cpp/inference/mixture_of_experts/moe.cc @@ -79,7 +79,6 @@ Tensor create_moe_encoder(FFModel *model, x, moeConfig->hidden_size, moeConfig->num_attention_heads, - moeConfig->num_attention_heads, moeConfig->attention_kdim, moeConfig->attention_vdim) : model->multihead_attention(x, diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc index 2d818e8e4e..0717ddc90f 100644 --- a/examples/cpp/inference/transformers/transformers.cc +++ b/examples/cpp/inference/transformers/transformers.cc @@ -47,7 +47,6 @@ Tensor create_inc_multihead_attention_decoder( input, transformerConfig->hidden_size, transformerConfig->num_attention_heads, - transformerConfig->num_attention_heads, transformerConfig->attention_kdim, transformerConfig->attention_vdim) : model->multihead_attention(input, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 949c0f7885..7f1374415c 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -401,7 +401,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( const flexflow_tensor_t input_, int embed_dim, int num_heads, - int num_kv_heads, int kdim, int vdim, float dropout, @@ -421,7 +420,6 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( const flexflow_tensor_t input_, int embed_dim, int num_heads, - int num_kv_heads, int kdim, int vdim, float dropout, @@ -441,6 +439,65 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( const flexflow_tensor_t input_, int embed_dim, int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + enum DataType data_type, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name); + +flexflow_tensor_t flexflow_model_add_inc_multiquery_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + enum DataType data_type, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name); + +flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + enum DataType data_type, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name); + +flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( + 
flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_q_heads, int num_kv_heads, int kdim, int vdim, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d34cf14a76..7bd13224df 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -641,7 +641,6 @@ class FFModel { Tensor inc_multihead_self_attention(const Tensor input, int embed_dim, int num_heads, - int num_kv_heads, int kdim = 0, int vdim = 0, float dropout = 0.0f, @@ -659,7 +658,6 @@ class FFModel { spec_inc_multihead_self_attention(const Tensor input, int embed_dim, int num_heads, - int num_kv_heads, int kdim = 0, int vdim = 0, float dropout = 0.0f, @@ -677,6 +675,58 @@ class FFModel { const Tensor input, int embed_dim, int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + char const *name = NULL); + Tensor inc_multiquery_self_attention(const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + char const *name = NULL); + Tensor + spec_inc_multiquery_self_attention(const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + char const *name = NULL); + Tensor inc_multiquery_self_attention_verify( + const Tensor input, + int embed_dim, + int num_q_heads, int num_kv_heads, int kdim = 0, int vdim = 0, diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index ce1ef6f37c..91621074b3 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -28,7 +28,7 @@ class IncMultiHeadSelfAttention : public Op { LayerID const &layer_guid, const ParallelTensor _input, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -49,7 +49,7 @@ class IncMultiHeadSelfAttention : public Op { const ParallelTensor _input, const ParallelTensor _weight, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -118,7 +118,7 @@ class IncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int num_heads, num_kv_heads, tensor_parallelism_degree; + int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias; bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -136,7 +136,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads, + int _num_q_heads, int _num_kv_heads); 
IncMultiHeadSelfAttentionMeta(FFHandler handler, InferenceMode infer_mode, @@ -157,9 +157,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _global_num_heads, + int _global_num_q_heads, int _global_num_kv_heads, - int _num_heads, + int _num_q_heads, int _num_kv_heads, DataType _quantization_type, bool _offload); @@ -170,7 +170,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { size_t weights_params, weightSize, biasSize, reserveSpaceSize, quantized_weightSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int global_num_heads, global_num_kv_heads, num_heads, num_kv_heads; + int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads; bool *has_load_weights; bool *apply_rotary_embedding; bool *bias; diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 84fedb45a7..be38b9ab1b 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -8,7 +8,8 @@ namespace FlexFlow { struct IncMultiHeadSelfAttentionParams { LayerID layer_guid; - int embed_dim, num_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; + int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, + tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index f5b06c830e..c6364805e3 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -28,7 +28,7 @@ class SpecIncMultiHeadSelfAttention : public Op { LayerID const &layer_guid, const ParallelTensor _input, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -46,7 +46,7 @@ class SpecIncMultiHeadSelfAttention : public Op { const ParallelTensor _input, const ParallelTensor _weight, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -116,7 +116,7 @@ class SpecIncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int num_heads, num_kv_heads, tensor_parallelism_degree; + int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias; bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -132,7 +132,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads, + int _num_q_heads, int _num_kv_heads); ~SpecIncMultiHeadSelfAttentionMeta(void); diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 1741b23745..d6f08dd9e6 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -8,7 +8,7 @@ namespace FlexFlow { struct SpecIncMultiHeadSelfAttentionParams { LayerID layer_guid; - int embed_dim, num_heads, num_kv_heads, kdim, vdim; + int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, 
qk_prod_scaling; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index cf714fe515..d5be344cca 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -28,7 +28,7 @@ class TreeIncMultiHeadSelfAttention : public Op { LayerID const &layer_guid, const ParallelTensor _input, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -49,7 +49,7 @@ class TreeIncMultiHeadSelfAttention : public Op { const ParallelTensor _input, const ParallelTensor _weight, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -119,7 +119,7 @@ class TreeIncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int num_heads, num_kv_heads, tensor_parallelism_degree; + int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias; bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -137,7 +137,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads, + int _num_q_heads, int _num_kv_heads); ~TreeIncMultiHeadSelfAttentionMeta(void); diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index a00e56bda6..3ba49dcbad 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -8,7 +8,8 @@ namespace FlexFlow { struct TreeIncMultiHeadSelfAttentionParams { LayerID layer_guid; - int embed_dim, num_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; + int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, + tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; diff --git a/inference/models/configs/llama2_70B.json b/inference/models/configs/llama2_70B.json index 45751787e6..017e71888d 100644 --- a/inference/models/configs/llama2_70B.json +++ b/inference/models/configs/llama2_70B.json @@ -2,7 +2,6 @@ "n_layers": 80, "vocab_size": 32000, "n_heads": 64, - "n_kv_heads": 8, "dim": 8192, "multiple_of": 256, "norm_eps": 1e-5, diff --git a/inference/models/configs/llama2_7B.json b/inference/models/configs/llama2_7B.json index 5796f70db7..46dd138e4f 100644 --- a/inference/models/configs/llama2_7B.json +++ b/inference/models/configs/llama2_7B.json @@ -2,7 +2,6 @@ "n_layers": 32, "vocab_size": 32000, "n_heads": 32, - "n_kv_heads": 32, "dim": 4096, "multiple_of": 256, "norm_eps": 1e-5, diff --git a/inference/models/configs/llama_160M.json b/inference/models/configs/llama_160M.json index 85e83804aa..d912c64ab7 100644 --- a/inference/models/configs/llama_160M.json +++ b/inference/models/configs/llama_160M.json @@ -2,7 +2,6 @@ "n_layers": 12, "vocab_size": 32000, "n_heads": 12, - "n_kv_heads": 12, "dim": 768, "multiple_of": 256, "norm_eps": 1e-6, diff --git a/inference/models/configs/llama_7B.json b/inference/models/configs/llama_7B.json index f0ef126096..0c32ed320d 100644 --- a/inference/models/configs/llama_7B.json +++ b/inference/models/configs/llama_7B.json @@ -2,7 +2,6 @@ "n_layers": 32, "vocab_size": 32000, "n_heads": 32, - "n_kv_heads": 32, "dim": 4096, "multiple_of": 256, "norm_eps": 1e-6, diff --git 
a/inference/models/falcon.cc b/inference/models/falcon.cc index 2846549d28..00f7864e7f 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -89,7 +89,7 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + mha = ff.spec_inc_multiquery_self_attention( att_norm, falcon_config.dim, falcon_config.n_heads, @@ -107,7 +107,7 @@ void FALCON::create_falcon_model(FFModel &ff, } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( + mha = ff.inc_multiquery_self_attention_verify( att_norm, falcon_config.dim, falcon_config.n_heads, @@ -126,7 +126,7 @@ void FALCON::create_falcon_model(FFModel &ff, } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( + mha = ff.inc_multiquery_self_attention( att_norm, falcon_config.dim, falcon_config.n_heads, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 0cd53fb141..79b042b97c 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -81,7 +81,6 @@ void LLAMA::create_llama_model(FFModel &ff, att_norm, llama_config.dim, llama_config.n_heads, - llama_config.n_kv_heads, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, @@ -98,7 +97,6 @@ void LLAMA::create_llama_model(FFModel &ff, att_norm, llama_config.dim, llama_config.n_heads, - llama_config.n_kv_heads, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, /*dropout*/ @@ -116,7 +114,6 @@ void LLAMA::create_llama_model(FFModel &ff, att_norm, llama_config.dim, llama_config.n_heads, - llama_config.n_kv_heads, llama_config.dim / llama_config.n_heads, llama_config.dim / llama_config.n_heads, 0.0f, /*dropout*/ @@ -200,7 +197,7 @@ void LLAMA::create_llama_model(FFModel &ff, FileDataLoader fileloader("", weight_file_path, llama_config.n_heads, - llama_config.n_kv_heads, + llama_config.n_heads, llama_config.dim, llama_config.dim / llama_config.n_heads, tensor_partition_num); diff --git a/inference/models/llama.h b/inference/models/llama.h index 46a22954e0..61d8908d0c 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -38,7 +38,6 @@ class LLAMA { total_requests = 2560; incremental_mode = true; hidden_dim = 11008; - n_kv_heads = 32; max_seq_len = BatchConfig::MAX_SEQ_LENGTH; max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -55,7 +54,6 @@ class LLAMA { n_layers = config_json["n_layers"]; vocab_size = config_json["vocab_size"]; n_heads = config_json["n_heads"]; - n_kv_heads = config_json["n_kv_heads"]; dim = config_json["dim"]; multiple_of = config_json["multiple_of"]; norm_eps = config_json["norm_eps"]; @@ -101,7 +99,7 @@ class LLAMA { int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, total_requests, incremental_mode, max_seq_len, max_num_tokens, - max_beam_width, max_beam_depth, n_kv_heads; + max_beam_width, max_beam_depth; float norm_eps; }; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 86ced698f0..8a1a17d3af 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -109,7 +109,6 @@ void OPT::create_opt_model(FFModel &ff, hidden_states, opt_config.hidden_size, opt_config.num_attention_heads, - opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, @@ -131,7 +130,6 @@ void OPT::create_opt_model(FFModel &ff, hidden_states, opt_config.hidden_size, 
opt_config.num_attention_heads, - opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, @@ -153,7 +151,6 @@ void OPT::create_opt_model(FFModel &ff, hidden_states, opt_config.hidden_size, opt_config.num_attention_heads, - opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 788b95bfcc..59f9046c57 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1054,7 +1054,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( const flexflow_tensor_t input_, int embed_dim, int num_heads, - int num_kv_heads, int kdim, int vdim, float dropout, @@ -1075,7 +1074,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( Tensor tensor = handle->inc_multihead_self_attention(input, embed_dim, num_heads, - num_kv_heads, kdim, vdim, dropout, @@ -1097,7 +1095,6 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( const flexflow_tensor_t input_, int embed_dim, int num_heads, - int num_kv_heads, int kdim, int vdim, float dropout, @@ -1119,7 +1116,6 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( handle->spec_inc_multihead_self_attention(input, embed_dim, num_heads, - num_kv_heads, kdim, vdim, dropout, @@ -1141,7 +1137,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( const flexflow_tensor_t input_, int embed_dim, int num_heads, - int num_kv_heads, int kdim, int vdim, float dropout, @@ -1163,7 +1158,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( handle->inc_multihead_self_attention_verify(input, embed_dim, num_heads, - num_kv_heads, kdim, vdim, dropout, @@ -1180,6 +1174,137 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( return FFCObjectWrapper::wrap(tensor); } +flexflow_tensor_t flexflow_model_add_inc_multiquery_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + enum DataType data_type, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Initializer *kernel_initializer = + FFCObjectWrapper::unwrap(kernel_initializer_); + Tensor tensor = handle->inc_multiquery_self_attention(input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + name); + return FFCObjectWrapper::wrap(tensor); +} + +flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_attention( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + enum DataType data_type, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = 
FFCObjectWrapper::unwrap(input_); + Initializer *kernel_initializer = + FFCObjectWrapper::unwrap(kernel_initializer_); + Tensor tensor = + handle->spec_inc_multiquery_self_attention(input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + name); + return FFCObjectWrapper::wrap(tensor); +} + +flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + enum DataType data_type, + flexflow_initializer_t kernel_initializer_, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input = FFCObjectWrapper::unwrap(input_); + Initializer *kernel_initializer = + FFCObjectWrapper::unwrap(kernel_initializer_); + Tensor tensor = + handle->inc_multiquery_self_attention_verify(input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + name); + return FFCObjectWrapper::wrap(tensor); +} + flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, const flexflow_tensor_t input_, float eps, diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index e3ee54f4a2..f4f64aee8a 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -61,7 +61,6 @@ bool IncMultiHeadSelfAttentionParams::is_valid( Tensor FFModel::inc_multihead_self_attention(const Tensor input, int embed_dim, int num_heads, - int num_kv_heads, int kdim, int vdim, float dropout, @@ -75,6 +74,42 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, float scaling_factor, bool qk_prod_scaling, char const *name) { + return inc_multiquery_self_attention(input, + embed_dim, + num_heads, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + name); +} + +Tensor FFModel::inc_multiquery_self_attention(const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } @@ -120,8 +155,8 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, int kParas = kProjSize * kSize; int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - int weight_size = qParas * num_heads + kParas * num_kv_heads + - vParas * num_kv_heads + oParas * num_heads; + int weight_size = qParas * num_q_heads + kParas * num_kv_heads + + vParas * num_kv_heads + oParas * num_q_heads; int one_head_size = qParas + kParas + vParas + oParas; { @@ -142,7 +177,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, } if (bias) { // q, k, v, o - int dims[1] = {qProjSize * num_heads + + int dims[1] = {qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, @@ -154,7 +189,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, } li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); - li->add_int_property("num_heads", num_heads); + li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); @@ -182,8 +217,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( long long value; layer->get_int_property("embed_dim", value); int embed_dim = value; - layer->get_int_property("num_heads", value); - int num_heads = value; + layer->get_int_property("num_q_heads", value); + int num_q_heads = value; layer->get_int_property("num_kv_heads", value); int num_kv_heads = value; layer->get_int_property("kdim", value); @@ -217,7 +252,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( layer->layer_guid, inputs[0], embed_dim, - num_heads, + num_q_heads, num_kv_heads, kdim, vdim, @@ -241,7 +276,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( LayerID const &_layer_guid, const ParallelTensor _input, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -267,7 +302,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( (_bias ? 
2 : 1), /*weights*/ 1 /*outputs*/, _input), - num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), @@ -303,7 +338,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads * (qParas + oParas) + + dims[1].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; @@ -322,7 +357,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_heads + + bias_shape.dims[0].size = qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -351,7 +386,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( const ParallelTensor _input, const ParallelTensor _weight, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -378,7 +413,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( 1 /*outputs*/, _input, _weight), - num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), @@ -412,11 +447,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads * (qParas + oParas) + + dims[1].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; - // dims[2].size = this->num_heads * (qParas + oParas) + this->num_kv_heads * - // (kParas + vParas); + // dims[2].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads + // * (kParas + vParas); if (quantization_type != DT_NONE) { dims[1].size = get_quantization_to_byte_size( data_type, quantization_type, (qParas + kParas + vParas + oParas)); @@ -432,7 +467,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_heads + + bias_shape.dims[0].size = qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -468,7 +503,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.layer_guid, input, other.oProjSize, - other.num_heads, + other.num_q_heads, other.num_kv_heads, other.qProjSize, other.vProjSize, @@ -496,7 +531,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.layer_guid, input, params.embed_dim, - params.num_heads, + params.num_q_heads, params.num_kv_heads, params.kdim, params.vdim, @@ -638,7 +673,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == 
input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_heads = attn->num_heads / attn->tensor_parallelism_degree; + int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree; assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); @@ -660,7 +695,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( weight, gpu_mem_allocator, num_samples, - num_heads, + num_q_heads, num_kv_heads); if (handle.offload_reserve_space == nullptr) { // assert that we didn't over allocate memory @@ -860,7 +895,7 @@ void IncMultiHeadSelfAttention::inference_task( // Weight tensor dimensions coord_t all_weight_params = weight_domain.hi()[0] - weight_domain.lo()[0] + 1; - coord_t num_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; + coord_t num_q_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; replica_dim = weight_domain.hi()[2] - weight_domain.lo()[2] + 1; size_t qParas = m->qProjSize * m->qSize; size_t kParas = m->kProjSize * m->kSize; @@ -868,7 +903,7 @@ void IncMultiHeadSelfAttention::inference_task( size_t oParas = m->oProjSize * (m->vProjSize > 0 ? m->vProjSize : m->vSize); assert(all_weight_params == qParas + kParas + vParas + oParas); - assert(num_heads == m->num_heads); + assert(num_q_heads == m->num_q_heads); assert(replica_dim == 1); assert(m->qSize == m->kSize && m->kSize == m->vSize); @@ -880,10 +915,10 @@ void IncMultiHeadSelfAttention::inference_task( // column-major order. // printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " - // "bc->num_active_tokens(): %i, num_heads: %lli, + // "bc->num_active_tokens(): %i, num_q_heads: %lli, // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), - // num_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); + // num_q_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); // for (int t=0; t < bc->num_active_tokens(); t++) { // printf("token %i has request_index: %li and token_position: %li\n", // t, bc->token2ids.token_indexes[t].request_index, @@ -909,12 +944,12 @@ void IncMultiHeadSelfAttention::inference_task( // ============================================================================= // Load the Q/K/V projection weights, and create a Torch tensor // ============================================================================= - std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_heads}; + std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_q_heads}; float *w_qkv = - (float *)calloc(m->qSize * m->qProjSize * 3 * num_heads, sizeof(float)); + (float *)calloc(m->qSize * m->qProjSize * 3 * num_q_heads, sizeof(float)); assert(w_qkv[0] == 0.0f); - for (int h = 0; h < num_heads; h++) { + for (int h = 0; h < num_q_heads; h++) { for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { int row_index = i % m->qSize; int column_index = i / m->qSize; @@ -942,7 +977,7 @@ void IncMultiHeadSelfAttention::inference_task( } // convert weights to torch tensor torch::Tensor torch_w_qkv = torch::from_blob( - w_qkv, {m->qSize, m->qProjSize, 3, (int)num_heads}, torch::kFloat32); + w_qkv, {m->qSize, m->qProjSize, 3, (int)num_q_heads}, torch::kFloat32); /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() << std::endl; @@ -965,24 +1000,25 @@ void IncMultiHeadSelfAttention::inference_task( assert(qkv_projs.sizes()[1] == bc->num_active_tokens() && qkv_projs.sizes()[1] <= 
effective_batch_size); assert(qkv_projs.sizes()[2] == 3); - assert(qkv_projs.sizes()[3] == num_heads); + assert(qkv_projs.sizes()[3] == num_q_heads); free(w_qkv); // ----------------------- Loading CUDA results for this step --------------- - float *QKVProjArray_cpu = download_tensor(m->devQKVProjArray, - BatchConfig::MAX_NUM_TOKENS * - proj_sum * m->num_heads); + float *QKVProjArray_cpu = download_tensor( + m->devQKVProjArray, + BatchConfig::MAX_NUM_TOKENS * proj_sum * m->num_q_heads); assert(QKVProjArray_cpu != nullptr); std::vector QKVProjArray_converted_shape = { - m->qProjSize, bc->num_active_tokens(), 3, (int)num_heads}; + m->qProjSize, bc->num_active_tokens(), 3, (int)num_q_heads}; float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc->num_active_tokens() * 3 * num_heads, sizeof(float)); + m->qProjSize * bc->num_active_tokens() * 3 * num_q_heads, sizeof(float)); // skip over padding at the end of QKVProjArray_cpu // convert from column order to 3D matrix because torch cannot automatically // import matrices flattened in column order - for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_heads; i++) { + for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_q_heads; + i++) { int proj_size_index = i % m->qProjSize; int head_index = i / (proj_sum * bc->num_active_tokens()); int token_index = @@ -991,7 +1027,7 @@ void IncMultiHeadSelfAttention::inference_task( int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / (m->qProjSize * bc->num_active_tokens()); assert(proj_size_index < proj_sum); - assert(head_index < num_heads); + assert(head_index < num_q_heads); assert(token_index < bc->num_active_tokens()); assert(qkv_offset < 3); set_value_row_major(QKVProjArray_converted, @@ -1001,19 +1037,19 @@ void IncMultiHeadSelfAttention::inference_task( } torch::Tensor QKVProjArray_torch = torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc->num_active_tokens(), 3, num_heads}, + {m->qProjSize, bc->num_active_tokens(), 3, num_q_heads}, torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- // std::cout << "QKVProjArray_torch" << std::endl; - // for (int i=0; inum_active_tokens(); t++) { for (size_t d = 0; d < m->kProjSize; d++) { size_t kcache_idx = - d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].abs_depth_in_request * m->num_heads * + d * MAX_SEQ_LEN * m->num_q_heads * BatchConfig::MAX_NUM_REQUESTS + + bc->tokensInfo[t].abs_depth_in_request * m->num_q_heads * BatchConfig::MAX_NUM_REQUESTS + h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; m->kcache[kcache_idx] = @@ -1041,8 +1077,8 @@ void IncMultiHeadSelfAttention::inference_task( } for (size_t d = 0; d < m->vProjSize; d++) { size_t vcache_idx = - d * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].abs_depth_in_request * m->num_heads * + d * MAX_SEQ_LEN * m->num_q_heads * BatchConfig::MAX_NUM_REQUESTS + + bc->tokensInfo[t].abs_depth_in_request * m->num_q_heads * BatchConfig::MAX_NUM_REQUESTS + h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; m->vcache[vcache_idx] = @@ -1054,11 +1090,11 @@ void IncMultiHeadSelfAttention::inference_task( // Create torch tensors from the arrays torch::Tensor K_t = torch::from_blob( m->kcache, - {m->kProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + {m->kProjSize, MAX_SEQ_LEN, num_q_heads, BatchConfig::MAX_NUM_REQUESTS}, torch::kFloat32); torch::Tensor V_t = torch::from_blob( m->vcache, - 
{m->vProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + {m->vProjSize, MAX_SEQ_LEN, num_q_heads, BatchConfig::MAX_NUM_REQUESTS}, torch::kFloat32); // Compute useful indices @@ -1086,29 +1122,31 @@ void IncMultiHeadSelfAttention::inference_task( // ----------------------- Loading CUDA results for this step --------------- float *keyCache_cpu = download_tensor(m->keyCache, - m->num_heads * m->kProjSize * + m->num_q_heads * m->kProjSize * BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); float *valueCache_cpu = download_tensor(m->valueCache, - m->num_heads * m->vProjSize * + m->num_q_heads * m->vProjSize * BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); assert(keyCache_cpu != nullptr); assert(valueCache_cpu != nullptr); - float *kcache_cuda = (float *)calloc( - m->kProjSize * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - float *vcache_cuda = (float *)calloc( - m->vProjSize * MAX_SEQ_LEN * m->num_heads * BatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); + float *kcache_cuda = + (float *)calloc(m->kProjSize * MAX_SEQ_LEN * m->num_q_heads * + BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); + float *vcache_cuda = + (float *)calloc(m->vProjSize * MAX_SEQ_LEN * m->num_q_heads * + BatchConfig::MAX_NUM_REQUESTS, + sizeof(float)); int index = 0; for (int i = 0; i < m->kProjSize; i++) { for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_heads; k++) { + for (int k = 0; k < m->num_q_heads; k++) { for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { - int col_major_index = l * m->kProjSize * MAX_SEQ_LEN * m->num_heads + - k * m->kProjSize * MAX_SEQ_LEN + - j * m->kProjSize + i; + int col_major_index = + l * m->kProjSize * MAX_SEQ_LEN * m->num_q_heads + + k * m->kProjSize * MAX_SEQ_LEN + j * m->kProjSize + i; kcache_cuda[index++] = keyCache_cpu[col_major_index]; } } @@ -1117,11 +1155,11 @@ void IncMultiHeadSelfAttention::inference_task( index = 0; for (int i = 0; i < m->vProjSize; i++) { for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_heads; k++) { + for (int k = 0; k < m->num_q_heads; k++) { for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { - int col_major_index = l * m->vProjSize * MAX_SEQ_LEN * m->num_heads + - k * m->vProjSize * MAX_SEQ_LEN + - j * m->vProjSize + i; + int col_major_index = + l * m->vProjSize * MAX_SEQ_LEN * m->num_q_heads + + k * m->vProjSize * MAX_SEQ_LEN + j * m->vProjSize + i; vcache_cuda[index++] = valueCache_cpu[col_major_index]; } } @@ -1129,27 +1167,27 @@ void IncMultiHeadSelfAttention::inference_task( } torch::Tensor K_t_cuda = torch::from_blob( kcache_cuda, - {m->kProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + {m->kProjSize, MAX_SEQ_LEN, num_q_heads, BatchConfig::MAX_NUM_REQUESTS}, torch::kFloat32); torch::Tensor V_t_cuda = torch::from_blob( vcache_cuda, - {m->vProjSize, MAX_SEQ_LEN, num_heads, BatchConfig::MAX_NUM_REQUESTS}, + {m->vProjSize, MAX_SEQ_LEN, num_q_heads, BatchConfig::MAX_NUM_REQUESTS}, torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- // std::cout << "kcache differences:" << std::endl; // for (int i=0; i < bc->num_active_requests() + 1; i++) { - // for (int j=0; j < num_heads; j++) { + // for (int j=0; j < num_q_heads; j++) { // for (int l=0; l < m->kProjSize; l++) { // for (int k=0; k < MAX_SEQ_LEN; k++) { // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // l * MAX_SEQ_LEN * num_q_heads * 
BatchConfig::MAX_NUM_REQUESTS + + // k * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + // j * BatchConfig::MAX_NUM_REQUESTS + // i; // if ( abs(m->kcache[kcache_idx] - keyCache_cpu[ - // i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + // j * m->kProjSize * MAX_SEQ_LEN + // k * m->kProjSize + // l @@ -1165,11 +1203,11 @@ void IncMultiHeadSelfAttention::inference_task( // std::cout << "keyCache from CUDA:" << std::endl; // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jkProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { // printf("%f ", - // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + // j * m->kProjSize * MAX_SEQ_LEN + // k * m->kProjSize + // l @@ -1184,12 +1222,12 @@ void IncMultiHeadSelfAttention::inference_task( // std::cout << "valueCache from CUDA:" << std::endl; // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jvProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { // printf("%f ", // valueCache_cpu[ - // i * m->vProjSize * MAX_SEQ_LEN * num_heads + + // i * m->vProjSize * MAX_SEQ_LEN * num_q_heads + // j * m->vProjSize * MAX_SEQ_LEN + // k * m->vProjSize + // l]); @@ -1205,12 +1243,12 @@ void IncMultiHeadSelfAttention::inference_task( // std::cout << "C++ kcache:" << std::endl; // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; j < num_heads; j++) { + // for (int j=0; j < num_q_heads; j++) { // for (int l=0; l < m->kProjSize; l++) { // for (int k=0; k < MAX_SEQ_LEN; k++) { // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + + // l * MAX_SEQ_LEN * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + + // k * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + // j * BatchConfig::MAX_NUM_REQUESTS + // i; // printf("%f ", m->kcache[kcache_idx]); @@ -1224,14 +1262,13 @@ void IncMultiHeadSelfAttention::inference_task( // std::cout << "C++ vcache:" << std::endl; // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jvProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { // size_t vcache_idx = - // l * MAX_SEQ_LEN * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // k * num_heads * BatchConfig::MAX_NUM_REQUESTS + - // j * BatchConfig::MAX_NUM_REQUESTS + - // i; + // l * MAX_SEQ_LEN * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + // + k * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + j * + // BatchConfig::MAX_NUM_REQUESTS + i; // printf("%f ", m->vcache[vcache_idx]); // } // printf("\n"); @@ -1251,11 +1288,11 @@ void IncMultiHeadSelfAttention::inference_task( // ============================================================================= // ----------------------- C++ operations & checks -------------------------- - float *w_out = (float *)calloc(m->vProjSize * m->num_heads * m->oProjSize, + float *w_out = (float *)calloc(m->vProjSize * m->num_q_heads * m->oProjSize, sizeof(float)); - std::vector w_out_shape = {m->vProjSize, m->num_heads, m->oProjSize}; + std::vector w_out_shape = {m->vProjSize, m->num_q_heads, m->oProjSize}; assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - for (int h = 0; h < num_heads; h++) { + for (int h = 0; h < num_q_heads; h++) { for (int v = 0; v < m->vProjSize; v++) { for (int o = 0; o < m->oProjSize; o++) { set_value_row_major( @@ -1269,22 +1306,22 @@ void IncMultiHeadSelfAttention::inference_task( } // convert weights to torch tensor torch::Tensor 
torch_w_out = torch::from_blob( - w_out, {m->vProjSize, m->num_heads, m->oProjSize}, torch::kFloat32); + w_out, {m->vProjSize, m->num_q_heads, m->oProjSize}, torch::kFloat32); // ----------------------- Loading CUDA results for this step --------------- float *w_out_cuda = download_tensor( - m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_heads); + m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_q_heads); assert(w_out_cuda != nullptr); float *converted_wout_tensor = (float *)calloc( - m->vProjSize * m->num_heads * m->oProjSize, sizeof(float)); + m->vProjSize * m->num_q_heads * m->oProjSize, sizeof(float)); std::vector converted_wout_tensor_shape = { - m->vProjSize, m->num_heads, m->oProjSize}; + m->vProjSize, m->num_q_heads, m->oProjSize}; - for (int i = 0; i < m->vProjSize * m->num_heads * m->oProjSize; i++) { + for (int i = 0; i < m->vProjSize * m->num_q_heads * m->oProjSize; i++) { int v_idx = i % m->vProjSize; - int h_idx = (i / m->vProjSize) % m->num_heads; - int o_idx = i / (m->vProjSize * m->num_heads); - assert(v_idx < m->vProjSize && h_idx < m->num_heads && + int h_idx = (i / m->vProjSize) % m->num_q_heads; + int o_idx = i / (m->vProjSize * m->num_q_heads); + assert(v_idx < m->vProjSize && h_idx < m->num_q_heads && o_idx < m->oProjSize); set_value_row_major(converted_wout_tensor, converted_wout_tensor_shape, @@ -1293,7 +1330,7 @@ void IncMultiHeadSelfAttention::inference_task( } torch::Tensor w_out_cuda_tensor = torch::from_blob(converted_wout_tensor, - {m->vProjSize, m->num_heads, m->oProjSize}, + {m->vProjSize, m->num_q_heads, m->oProjSize}, torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- @@ -1320,16 +1357,17 @@ void IncMultiHeadSelfAttention::inference_task( // ----------------------- Loading CUDA results for this step --------------- float *qk_prods_cpu = download_tensor( m->qk_prods, - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads); + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_q_heads); assert(qk_prods_cpu != nullptr); float *qk_prods_softmax_cpu = download_tensor( m->qk_prods_softmax, - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_heads); + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_q_heads); assert(qk_prods_softmax_cpu != nullptr); float *attn_heads_cpu = download_tensor( - m->attn_heads, BatchConfig::MAX_NUM_TOKENS * m->num_heads * m->vProjSize); + m->attn_heads, + BatchConfig::MAX_NUM_TOKENS * m->num_q_heads * m->vProjSize); assert(attn_heads_cpu != nullptr); // ----------------------- Main loop (request by request) ------------------- @@ -1355,7 +1393,7 @@ void IncMultiHeadSelfAttention::inference_task( // std::cout << "Q_req.sizes(): " << Q_req.sizes() << std::endl; assert(Q_req.sizes()[0] == m->qProjSize); assert(Q_req.sizes()[1] == num_new_tokens); - assert(Q_req.sizes()[2] == num_heads); + assert(Q_req.sizes()[2] == num_q_heads); /*printf("\n------------ QK multiplication (C++) -------------\n"); printf("Request r=%lu. num_new_tokens: %lu, num_tokens_received_so_far: %li, @@ -1380,7 +1418,7 @@ void IncMultiHeadSelfAttention::inference_task( (1.0f / sqrt(m->kProjSize)); // Set entries above diagonal to -inf to make attention causal. 
- for (int h = 0; h < num_heads; h++) { + for (int h = 0; h < num_q_heads; h++) { qk_products[r].index( {Slice(), Slice(num_tokens_received_so_far - num_new_tokens), h}) = qk_products[r] @@ -1397,24 +1435,26 @@ void IncMultiHeadSelfAttention::inference_task( qk_softmax[r] = torch::softmax(qk_products[r], -2); assert(qk_softmax[r].sizes()[0] == num_new_tokens); assert(qk_softmax[r].sizes()[1] == num_tokens_received_so_far); - assert(qk_softmax[r].sizes()[2] == m->num_heads); + assert(qk_softmax[r].sizes()[2] == m->num_q_heads); // ------------------- Loading CUDA results for this step --------------- float *converted_qk_prod = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + num_new_tokens * num_tokens_received_so_far * num_q_heads, + sizeof(float)); float *converted_qk_prod_softmax = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + num_new_tokens * num_tokens_received_so_far * num_q_heads, + sizeof(float)); std::vector converted_qk_prod_shape = { - (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_heads}; + (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_q_heads}; for (size_t i = 0; - i < num_new_tokens * num_tokens_received_so_far * num_heads; + i < num_new_tokens * num_tokens_received_so_far * num_q_heads; i++) { size_t new_t_idx = i % num_new_tokens; size_t all_t_idx = (i / num_new_tokens) % num_tokens_received_so_far; size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); assert(new_t_idx < num_new_tokens && - all_t_idx < num_tokens_received_so_far && head_idx < num_heads); + all_t_idx < num_tokens_received_so_far && head_idx < num_q_heads); set_value_row_major(converted_qk_prod, converted_qk_prod_shape, {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, @@ -1426,34 +1466,34 @@ void IncMultiHeadSelfAttention::inference_task( } torch::Tensor qk_prods_cuda = torch::from_blob( converted_qk_prod, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_q_heads}, torch::kFloat32); torch::Tensor qk_prods_softmax_cuda = torch::from_blob( converted_qk_prod_softmax, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_q_heads}, torch::kFloat32); // ------------------- Comparing C++ & CUDA results ------------------ /* std::cout << "C++:" <num_heads); + .sizes()[2] == m->num_q_heads); attn_heads[r] = torch::einsum( "ijk,ljk->ilk", {qk_softmax[r], @@ -1480,33 +1520,33 @@ void IncMultiHeadSelfAttention::inference_task( {Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid})}); assert(attn_heads[r].sizes()[0] == num_new_tokens); assert(attn_heads[r].sizes()[1] == m->vProjSize); - assert(attn_heads[r].sizes()[2] == m->num_heads); + assert(attn_heads[r].sizes()[2] == m->num_q_heads); // ------------------- Loading CUDA results for this step --------------- - float converted_attn_heads_cpu[num_new_tokens][m->vProjSize][m->num_heads] = - {0}; - for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_heads; i++) { + float converted_attn_heads_cpu[num_new_tokens][m->vProjSize] + [m->num_q_heads] = {0}; + for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_q_heads; i++) { int token_ix = i % num_new_tokens; int vproj_idx = (i / num_new_tokens) % m->vProjSize; int head_idx = i / (num_new_tokens * m->vProjSize); assert(token_ix < num_new_tokens && vproj_idx < m->vProjSize && - head_idx < m->num_heads); + head_idx < 
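The per-request reference computation above is ordinary causal scaled dot-product attention: QK^T scaled by 1/sqrt(kProjSize), entries above the diagonal forced to -inf, a softmax over the key dimension, then a weighted sum of the cached values. A minimal single-head CPU sketch of the same computation (a conceptual mirror of the torch reference, not a line-by-line port):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>
#include <vector>

// Q is (new_tokens x d); K and V are (total_tokens x d), whose last
// new_tokens rows correspond to Q. Returns (new_tokens x d).
std::vector<std::vector<float>>
causal_attention(std::vector<std::vector<float>> const &Q,
                 std::vector<std::vector<float>> const &K,
                 std::vector<std::vector<float>> const &V) {
  assert(!Q.empty() && K.size() == V.size() && K.size() >= Q.size());
  size_t const new_tokens = Q.size(), total_tokens = K.size(), d = Q[0].size();
  size_t const prev_tokens = total_tokens - new_tokens;
  float const scale = 1.0f / std::sqrt(static_cast<float>(d));
  std::vector<std::vector<float>> out(new_tokens, std::vector<float>(d, 0.0f));
  for (size_t i = 0; i < new_tokens; i++) {
    std::vector<float> scores(total_tokens);
    for (size_t j = 0; j < total_tokens; j++) {
      float dot = 0.0f;
      for (size_t c = 0; c < d; c++) {
        dot += Q[i][c] * K[j][c];
      }
      // Query i sits at absolute position prev_tokens + i; any key after it
      // is "above the diagonal" and is masked so softmax assigns it weight 0.
      scores[j] = (j > prev_tokens + i)
                      ? -std::numeric_limits<float>::infinity()
                      : dot * scale;
    }
    // Numerically stable softmax over the key dimension.
    float mx = scores[0];
    for (float s : scores) {
      mx = std::max(mx, s);
    }
    float sum = 0.0f;
    for (float &s : scores) {
      s = std::exp(s - mx);
      sum += s;
    }
    // Weighted sum of values.
    for (size_t j = 0; j < total_tokens; j++) {
      for (size_t c = 0; c < d; c++) {
        out[i][c] += (scores[j] / sum) * V[j][c];
      }
    }
  }
  return out;
}

With new_tokens == total_tokens this is the prefill case; with new_tokens == 1 it is a single decoding step against the cached keys and values.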
m->num_q_heads); converted_attn_heads_cpu[token_ix][vproj_idx][head_idx] = - attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_heads + i]; + attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_q_heads + i]; } - torch::Tensor converted_attn_heads_cuda = - torch::from_blob(converted_attn_heads_cpu, - {(int64_t)num_new_tokens, m->vProjSize, m->num_heads}, - torch::kFloat32); + torch::Tensor converted_attn_heads_cuda = torch::from_blob( + converted_attn_heads_cpu, + {(int64_t)num_new_tokens, m->vProjSize, m->num_q_heads}, + torch::kFloat32); // -------------------- Comparing C++ & CUDA results ------------------- /* std::cout << "CUDA attn head for req " << r << ":" <num_heads; h++) { + for (int h=0; hnum_q_heads; h++) { std::cout << converted_attn_heads_cuda.index({Slice(), Slice(), h}) << std::endl; } std::cout << "C++ attn head for req " << r << ":" <num_heads; h++) { + for (int h=0; hnum_q_heads; h++) { std::cout << attn_heads[r].index({Slice(), Slice(), h}) << std::endl; } */ assert(torch::allclose( @@ -1521,7 +1561,7 @@ void IncMultiHeadSelfAttention::inference_task( // increment main loop's auxiliary index qk_prods_cpu_offset += - num_new_tokens * num_tokens_received_so_far * num_heads; + num_new_tokens * num_tokens_received_so_far * num_q_heads; } // ----------------------- Comparing C++ & CUDA results --------------------- @@ -1570,7 +1610,7 @@ bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_NUM_HEADS: - *value = num_heads; + *value = num_q_heads; return true; default: return Op::get_int_parameter(para, value); @@ -1585,7 +1625,7 @@ bool IncMultiHeadSelfAttention::measure_operator_cost( bool operator==(IncMultiHeadSelfAttentionParams const &lhs, IncMultiHeadSelfAttentionParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && - lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && + lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && lhs.add_zero_attn == rhs.add_zero_attn && @@ -1599,7 +1639,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { IncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; params.embed_dim = this->oProjSize; - params.num_heads = this->num_heads; + params.num_q_heads = this->num_q_heads; params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; @@ -1626,7 +1666,7 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.embed_dim); - hash_combine(key, params.num_heads); + hash_combine(key, params.num_q_heads); hash_combine(key, params.num_kv_heads); hash_combine(key, params.kdim); hash_combine(key, params.vdim); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index c68df398df..b7ed189040 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -64,7 +64,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads, + int _num_q_heads, int _num_kv_heads) : OpMeta(handler, attn) { hipStream_t stream; @@ -92,9 +92,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int 
_global_num_heads, + int _global_num_q_heads, int _global_num_kv_heads, - int _num_heads, + int _num_q_heads, int _num_kv_heads, DataType _quantization_type, bool _offload) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a014b684f3..be6a4fd301 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -50,30 +50,30 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int qProjSize, int kProjSize, int vProjSize, - int global_num_heads, + int global_num_q_heads, int global_num_kv_heads, - int num_heads, + int num_q_heads, int num_kv_heads, bool scaling_query, float scaling_factor) { CUDA_KERNEL_LOOP(i, num_tokens * - (qProjSize * num_heads + kProjSize * num_kv_heads + + (qProjSize * num_q_heads + kProjSize * num_kv_heads + vProjSize * num_kv_heads)) { // for simplicity, assume q, k, v is in same shape // 0->q, 1->k, 2->v // int qkv_index = i / (num_tokens * qProjSize) % 3; - int qkv_index = i < num_tokens * qProjSize * num_heads + int qkv_index = i < num_tokens * qProjSize * num_q_heads ? 0 - : (i < num_tokens * (qProjSize * num_heads + + : (i < num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) ? 1 : 2); // int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); // int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int q_block_size = qProjSize * num_tokens * num_heads; + int q_block_size = qProjSize * num_tokens * num_q_heads; int k_block_size = kProjSize * num_tokens * num_kv_heads; // int idx = i % (num_tokens * (qProjSize)); @@ -83,16 +83,16 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int bias_idx = 0; if (qkv_index == 0) { int head_idx = i / (num_tokens * qProjSize); - int global_head_idx = head_idx + shard_id * num_heads; - int global_i = i + shard_id * num_heads * num_tokens * qProjSize; + int global_head_idx = head_idx + shard_id * num_q_heads; + int global_i = i + shard_id * num_q_heads * num_tokens * qProjSize; bias_idx = global_head_idx * qProjSize + (global_i % (num_tokens * (qProjSize)) % qProjSize); } else { int idx = qkv_index == 1 ? i - q_block_size : i - q_block_size - k_block_size; - int pre_length = qkv_index == 1 ? qProjSize * global_num_heads - : qProjSize * global_num_heads + + int pre_length = qkv_index == 1 ? qProjSize * global_num_q_heads + : qProjSize * global_num_q_heads + kProjSize * global_num_kv_heads; int head_idx = idx / (num_tokens * kProjSize); @@ -102,7 +102,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, bias_idx = pre_length + global_head_idx * kProjSize + (global_idx % (num_tokens * (qProjSize)) % qProjSize); } - // int bias_idx = qkv_index * qProjSize * global_num_heads + + // int bias_idx = qkv_index * qProjSize * global_num_q_heads + // global_head_idx * qProjSize + (idx % qProjSize); input_ptr[i] += bias_ptr[bias_idx]; @@ -120,14 +120,15 @@ __global__ void BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, - int num_heads, + int num_q_heads, int num_tokens, int num_kv_heads, int q_block_size, int k_block_size, int q_array_size) { CUDA_KERNEL_LOOP( - i, num_tokens * (qProjSize * num_heads + kProjSize * num_kv_heads) / 2) { + i, + num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { // create complex number bool q_tensor = i < (q_array_size / 2); int proj_size = q_tensor ? 
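The bias kernel above assumes the fused QKV bias vector is packed as [all q biases | all k biases | all v biases], with the q slice spanning global_num_q_heads heads and the k/v slices spanning global_num_kv_heads heads; a shard first converts its local head index to a global one (local + shard_id * heads_per_shard) before indexing. A small restatement of that offset arithmetic as a plain host-side function (placeholder sizes; qkv_bias_offset is a hypothetical helper, not a FlexFlow API):

#include <cassert>
#include <cstdio>

// which: 0 = q, 1 = k, 2 = v. Head counts are the *global* ones.
int qkv_bias_offset(int which, int global_head, int channel,
                    int qProjSize, int kProjSize, int vProjSize,
                    int global_num_q_heads, int global_num_kv_heads) {
  if (which == 0) {
    return global_head * qProjSize + channel;
  }
  int pre = qProjSize * global_num_q_heads;   // all q biases come first
  if (which == 2) {
    pre += kProjSize * global_num_kv_heads;   // then all k biases
  }
  int proj = (which == 1) ? kProjSize : vProjSize;
  return pre + global_head * proj + channel;
}

int main() {
  // Placeholder sizes: 8 global q heads split over 2 shards, 2 kv heads.
  int qProj = 4, kProj = 4, vProj = 4, gq = 8, gkv = 2;
  int heads_per_shard = 4, shard_id = 1, local_head = 3, channel = 2;
  int global_head = local_head + shard_id * heads_per_shard;   // = 7
  int q_off = qkv_bias_offset(0, global_head, channel, qProj, kProj, vProj, gq, gkv);
  assert(q_off < qProj * gq);                        // stays inside the q region
  int v_off = qkv_bias_offset(2, 1, 0, qProj, kProj, vProj, gq, gkv);
  assert(v_off == qProj * gq + kProj * gkv + 1 * vProj);   // lands after q and k
  std::printf("q bias offset %d, v bias offset %d\n", q_off, v_off);
  return 0;
}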
qProjSize : kProjSize; @@ -168,14 +169,15 @@ __global__ void BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, - int num_heads, + int num_q_heads, int num_tokens, int num_kv_heads, int q_block_size, int k_block_size, int q_array_size) { CUDA_KERNEL_LOOP( - i, num_tokens * (qProjSize * num_heads + kProjSize * num_kv_heads) / 2) { + i, + num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { // create complex number bool q_tensor = i < (q_array_size / 2); int proj_size = q_tensor ? qProjSize : kProjSize; @@ -236,9 +238,9 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_heads + // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_heads + // Output >>> qProjSize x num_tokens x 3 x num_q_heads int m_q = m->qProjSize; int m_k = m->kProjSize; int m_v = m->vProjSize; @@ -273,7 +275,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, cublas_data_type, ldc, strideC, - m->num_heads + m->num_kv_heads + + m->num_q_heads + m->num_kv_heads + m->num_kv_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -281,10 +283,10 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor int num_tokens = bc->num_active_tokens(); - int parallelism = m->kProjSize * num_tokens * m->num_heads; + int parallelism = m->kProjSize * num_tokens * m->num_q_heads; int q_block_size = m->qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; - int q_array_size = m->qProjSize * num_tokens * m->num_heads; + int q_array_size = m->qProjSize * num_tokens * m->num_q_heads; // apply bias for q, k, v if (*m->bias) { apply_proj_bias_qkv<<qProjSize, m->kProjSize, m->vProjSize, - m->global_num_heads, + m->global_num_q_heads, m->global_num_kv_heads, - m->num_heads, + m->num_q_heads, m->num_kv_heads, *m->scaling_query, m->scaling_factor); @@ -308,7 +310,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, /*q&k*/ parallelism = num_tokens * - (m->qProjSize * m->num_heads + m->kProjSize * m->num_kv_heads) / 2; + (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_kv_heads) / 2; apply_rotary_embedding_hf<<token_infos, m->qProjSize, m->kProjSize, - m->num_heads, + m->num_q_heads, num_tokens, m->num_kv_heads, q_block_size, @@ -345,7 +347,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens, - m->num_heads, + m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); } @@ -369,7 +371,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, stream); if (m->quantization_type == DT_INT4) { - int parallelism = m->qProjSize * m->qSize * m->num_heads / 2; + int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; decompress_int4_attention_weights<<(m->weight_ptr), m->qProjSize, m->qSize, - m->num_heads); + m->num_q_heads); } else { assert(m->quantization_type == DT_INT8); - int parallelism = m->qProjSize * m->qSize * m->num_heads; + int parallelism = m->qProjSize * m->qSize * m->num_q_heads; decompress_int8_attention_weights<<(m->weight_ptr), m->qProjSize, m->qSize, - m->num_heads); + m->num_q_heads); } } else { if (data_type == DT_FLOAT) { @@ -465,11 +467,11 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, int kProjSize, int vProjSize, 
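The rotary-embedding kernels touched above only have their head-count parameter renamed; the rotation itself treats consecutive (even, odd) components of each q/k projection as a complex number and rotates it by an angle that grows with the token's absolute position. The hunk does not show the angle formula, so the sketch below assumes the standard HF-style definition with base 10000:

#include <cmath>
#include <cstdio>
#include <vector>

// Apply rotary position embedding in place to one projected vector x of even
// length proj_size, for a token at absolute position pos. Each pair
// (x[2j], x[2j+1]) is treated as a complex number and multiplied by
// e^{i * pos * 10000^(-2j/proj_size)} (base 10000 is an assumption here).
void apply_rope(std::vector<float> &x, int pos) {
  int const proj_size = static_cast<int>(x.size());
  for (int j = 0; j < proj_size / 2; j++) {
    float freq = pos * std::pow(10000.0f, -2.0f * j / proj_size);
    float c = std::cos(freq), s = std::sin(freq);
    float re = x[2 * j], im = x[2 * j + 1];
    x[2 * j] = re * c - im * s;       // complex multiply by e^{i*freq}
    x[2 * j + 1] = re * s + im * c;
  }
}

int main() {
  std::vector<float> q = {1.0f, 0.0f, 1.0f, 0.0f};   // proj_size = 4
  apply_rope(q, /*pos=*/3);
  std::printf("%f %f %f %f\n", q[0], q[1], q[2], q[3]);
  return 0;
}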
int num_tokens, - int num_heads, + int num_q_heads, int num_kv_heads, int max_seq_len) { CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { - int q_array_size = qProjSize * num_tokens * num_heads; + int q_array_size = qProjSize * num_tokens * num_q_heads; int k_array_size = kProjSize * num_tokens * num_kv_heads; bool k_cache = i < k_array_size; @@ -497,10 +499,10 @@ template __global__ void fill_entries_above_diagonal(DT *matrix, size_t num_rows, size_t num_cols, - size_t num_heads, + size_t num_q_heads, size_t entries_above_diagonal, DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_heads) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { size_t head_idx = i / entries_above_diagonal; size_t entry_idx = i % entries_above_diagonal; size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; @@ -568,7 +570,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
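fill_entries_above_diagonal enumerates only the n*(n-1)/2 score entries strictly above the diagonal and recovers a 2D coordinate from the flat entry index via the inverse triangular-number formula y = (-1 + sqrt(8*entry_idx + 1)) / 2. The companion step x = entry_idx - y*(y+1)/2 is not visible in this hunk and is assumed below; the host-side sketch checks that the mapping covers every strictly-upper-triangular cell exactly once:

#include <cassert>
#include <cmath>
#include <cstdio>
#include <set>
#include <utility>

int main() {
  int const n = 7;   // placeholder score-block size
  int const entries_above_diagonal = n * (n - 1) / 2;
  std::set<std::pair<int, int>> seen;
  for (int entry_idx = 0; entry_idx < entries_above_diagonal; entry_idx++) {
    int y = static_cast<int>((-1 + std::sqrt(8.0 * entry_idx + 1)) / 2);
    int x = entry_idx - y * (y + 1) / 2;   // assumed companion step
    int row = x, col = y + 1;
    assert(row < col && col < n);          // strictly above the diagonal
    seen.insert({row, col});
  }
  assert(static_cast<int>(seen.size()) == entries_above_diagonal);
  std::printf("covered all %d cells above the diagonal exactly once\n",
              entries_above_diagonal);
  return 0;
}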
(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests DT *C = static_cast
(m->qk_prods); - if (m->num_kv_heads == m->num_heads) { + if (m->num_kv_heads == m->num_q_heads) { checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -589,14 +591,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cublas_data_type, ldc, strideC, - m->num_heads, + m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { strideB = 0; // use cublasGemmStridedBatchedEx - int one_step_heads = m->num_heads / m->num_kv_heads; + int one_step_heads = m->num_q_heads / m->num_kv_heads; m_ = num_new_tokens; n = total_tokens; k = m->qProjSize; @@ -634,14 +636,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(num_new_tokens <= total_tokens); size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; if (entries_above_diagonal > 0) { - size_t parallelism = m->num_heads * entries_above_diagonal; + size_t parallelism = m->num_q_heads * entries_above_diagonal; fill_entries_above_diagonal<<>>(C, num_new_tokens, total_tokens, - m->num_heads, + m->num_q_heads, entries_above_diagonal, static_cast
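The else branch above handles grouped-query attention: when num_q_heads != num_kv_heads, the code loops over the num_kv_heads key/value heads and, for each one, batches a GEMM over one_step_heads = num_q_heads / num_kv_heads query heads with strideB = 0, so all of those query heads read the same K block. A tiny sketch of that head grouping (placeholder head counts):

#include <cstdio>

int main() {
  int const num_q_heads = 32, num_kv_heads = 8;   // 4 query heads per kv head
  int const one_step_heads = num_q_heads / num_kv_heads;
  for (int kv = 0; kv < num_kv_heads; kv++) {
    // One "step" of the grouped GEMM: every query head mapped to kv head `kv`
    // multiplies against the same key block (strideB = 0 in the batched call).
    for (int step = 0; step < one_step_heads; step++) {
      int q_head = kv * one_step_heads + step;
      std::printf("q head %2d attends against kv head %d\n", q_head, kv);
    }
  }
  return 0;
}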
(-INFINITY)); } @@ -655,7 +657,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, // with no padding between images, feature maps, rows, and columns; the // columns are the inner dimension and the images are the outermost // dimension. - int n_param = m->num_heads; + int n_param = m->num_q_heads; int c_param = total_tokens; int h_param = 1; int w_param = num_new_tokens; @@ -699,9 +701,9 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_heads * m->vProjSize; + tokens_previous_requests * m->num_q_heads * m->vProjSize; - if (m->num_heads == m->num_kv_heads) { + if (m->num_q_heads == m->num_kv_heads) { checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -722,11 +724,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cublas_data_type, ldc, strideC, - m->num_heads, + m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - int one_step_heads = m->num_heads / m->num_kv_heads; + int one_step_heads = m->num_q_heads / m->num_kv_heads; n = m->vProjSize; lda = m_, ldb = n, ldc = m_; strideA = num_new_tokens * total_tokens; @@ -762,10 +764,10 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; - k = m->vProjSize * m->num_heads; + k = m->vProjSize * m->num_q_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_heads + + A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_kv_heads + m->vProjSize * m->num_kv_heads); B = C; @@ -796,7 +798,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_heads + + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + m->vProjSize * m->global_num_kv_heads; @@ -890,7 +892,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads, + int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, INC_DECODING_MODE, @@ -911,9 +913,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( weight, gpu_mem_allocator, num_samples, - attn->num_heads, + attn->num_q_heads, attn->num_kv_heads, - _num_heads, + _num_q_heads, _num_kv_heads, attn->quantization_type, attn->offload) {} @@ -938,9 +940,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _global_num_heads, + int _global_num_q_heads, int _global_num_kv_heads, - int _num_heads, + int _num_q_heads, int _num_kv_heads, DataType _quantization_type, bool _offload) @@ -964,18 +966,18 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( quantization_type = _quantization_type; offload = _offload; - global_num_heads = _global_num_heads; + global_num_q_heads = _global_num_q_heads; global_num_kv_heads = _global_num_kv_heads; - num_heads = _num_heads; + num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; // weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize // + // oProjSize * (vProjSize > 0 ? vProjSize : vSize)); - // weightSize = weights_params * num_heads * size_of_dt; + // weightSize = weights_params * num_q_heads * size_of_dt; weightSize = ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? 
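The output projection above starts reading weights at weight_ptr + qSize * (qProjSize*num_q_heads + kProjSize*num_kv_heads + vProjSize*num_kv_heads): the fused weight buffer packs every Q, K and V projection matrix first, so W_out begins immediately after them (qSize == kSize == vSize is asserted elsewhere in this operator). A quick arithmetic check with placeholder sizes:

#include <cassert>
#include <cstdio>

int main() {
  long const qSize = 4096, qProjSize = 128, kProjSize = 128, vProjSize = 128;
  long const num_q_heads = 32, num_kv_heads = 8;
  long w_out_offset = qSize * (qProjSize * num_q_heads +
                               kProjSize * num_kv_heads +
                               vProjSize * num_kv_heads);
  // Same number, obtained by summing the three projection blocks explicitly.
  long q_block = qSize * qProjSize * num_q_heads;
  long k_block = qSize * kProjSize * num_kv_heads;   // kSize == qSize
  long v_block = qSize * vProjSize * num_kv_heads;   // vSize == qSize
  assert(w_out_offset == q_block + k_block + v_block);
  std::printf("W_out starts at element %ld of the fused weight buffer\n",
              w_out_offset);
  return 0;
}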
vProjSize : vSize)) * - num_heads + + num_q_heads + (kSize * kProjSize + vSize * vProjSize) * num_kv_heads) * size_of_dt; if (quantization_type != DT_NONE) { @@ -1004,11 +1006,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( } #ifdef INFERENCE_TESTS - kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * - BatchConfig::MAX_NUM_REQUESTS, + kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * + num_q_heads * BatchConfig::MAX_NUM_REQUESTS, sizeof(float)); - vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * num_heads * - BatchConfig::MAX_NUM_REQUESTS, + vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * + num_q_heads * BatchConfig::MAX_NUM_REQUESTS, sizeof(float)); #endif @@ -1016,15 +1018,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( { // size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; // size_t qkv_max_proj_size = - // BatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_heads; + // BatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_q_heads; size_t qkv_max_proj_size = BatchConfig::MAX_NUM_TOKENS * - (qProjSize * num_heads + kProjSize * num_kv_heads + + (qProjSize * num_q_heads + kProjSize * num_kv_heads + vProjSize * num_kv_heads); // std::cout << "num_kv_heads: " << BatchConfig::MAX_NUM_TOKENS << ", " // << qProjSize << ", " << kProjSize << ", " << vProjSize << ", " - // << num_heads << ", " << num_kv_heads << ", " << + // << num_q_heads << ", " << num_kv_heads << ", " << // qkv_max_proj_size // << std::endl; // assert(false); @@ -1054,14 +1056,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( } size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; size_t qk_prod_size = - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_heads; + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_q_heads; size_t attn_heads_size = - BatchConfig::MAX_NUM_TOKENS * num_heads * vProjSize; + BatchConfig::MAX_NUM_TOKENS * num_q_heads * vProjSize; size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - size_t W_out_contiguous_size = W_out_block_size * num_heads; - size_t complex_size = (BatchConfig::MAX_NUM_TOKENS * - (qProjSize * num_heads + kProjSize * num_kv_heads)) / - 2; + size_t W_out_contiguous_size = W_out_block_size * num_q_heads; + size_t complex_size = + (BatchConfig::MAX_NUM_TOKENS * + (qProjSize * num_q_heads + kProjSize * num_kv_heads)) / + 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index c4b4214f69..9395c9aab4 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -60,7 +60,6 @@ Tensor FFModel::spec_inc_multihead_self_attention(const Tensor input, int embed_dim, int num_heads, - int num_kv_heads, int kdim, int vdim, float dropout, @@ -74,6 +73,43 @@ Tensor float scaling_factor, bool qk_prod_scaling, char const *name) { + return spec_inc_multiquery_self_attention(input, + embed_dim, + num_heads, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + name); +} + +Tensor + FFModel::spec_inc_multiquery_self_attention(const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } @@ -117,8 +153,8 @@ Tensor int kParas = kProjSize * kSize; int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? 
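The meta constructor sizes its workspace directly from the two head counts. A back-of-the-envelope version of the formulas visible above (weightSize assumes the vProjSize > 0 branch; all model sizes and batch limits below are placeholders, not FlexFlow's real defaults):

#include <cstdio>

int main() {
  long const qSize = 4096, kSize = 4096, vSize = 4096;
  long const qProjSize = 128, kProjSize = 128, vProjSize = 128, oProjSize = 4096;
  long const num_q_heads = 32, num_kv_heads = 8;
  long const MAX_NUM_TOKENS = 64, MAX_SEQ_LENGTH = 512;
  long const size_of_dt = 2;   // e.g. half precision

  long weightSize =
      ((qSize * qProjSize + oProjSize * vProjSize) * num_q_heads +
       (kSize * kProjSize + vSize * vProjSize) * num_kv_heads) * size_of_dt;
  long qkv_max_proj_size =
      MAX_NUM_TOKENS * (qProjSize * num_q_heads + kProjSize * num_kv_heads +
                        vProjSize * num_kv_heads);
  long qk_prod_size = MAX_NUM_TOKENS * MAX_SEQ_LENGTH * num_q_heads;
  long attn_heads_size = MAX_NUM_TOKENS * num_q_heads * vProjSize;
  long complex_size = MAX_NUM_TOKENS *
      (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2;

  std::printf("weightSize        = %ld bytes\n", weightSize);
  std::printf("qkv_max_proj_size = %ld elements\n", qkv_max_proj_size);
  std::printf("qk_prod_size      = %ld elements\n", qk_prod_size);
  std::printf("attn_heads_size   = %ld elements\n", attn_heads_size);
  std::printf("complex_size      = %ld complex pairs\n", complex_size);
  return 0;
}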
vProjSize : vSize); - int weight_size = qParas * num_heads + kParas * num_kv_heads + - vParas * num_kv_heads + oParas * num_heads; + int weight_size = qParas * num_q_heads + kParas * num_kv_heads + + vParas * num_kv_heads + oParas * num_q_heads; { int dims[1] = {weight_size}; li->weights[0] = create_weight_legion_ordering(1, @@ -131,7 +167,7 @@ Tensor } if (bias) { // q, k, v, o - int dims[1] = {qProjSize * num_heads + + int dims[1] = {qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, @@ -143,7 +179,7 @@ Tensor } li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); - li->add_int_property("num_heads", num_heads); + li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); @@ -168,8 +204,8 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( long long value; layer->get_int_property("embed_dim", value); int embed_dim = value; - layer->get_int_property("num_heads", value); - int num_heads = value; + layer->get_int_property("num_q_heads", value); + int num_q_heads = value; layer->get_int_property("num_kv_heads", value); int num_kv_heads = value; layer->get_int_property("kdim", value); @@ -196,7 +232,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( layer->layer_guid, inputs[0], embed_dim, - num_heads, + num_q_heads, num_kv_heads, kdim, vdim, @@ -217,7 +253,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( LayerID const &_layer_guid, const ParallelTensor _input, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -240,7 +276,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( (_bias ? 
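spec_inc_multihead_self_attention is now a thin wrapper that forwards to spec_inc_multiquery_self_attention with num_kv_heads = num_heads, i.e. classic multi-head attention is treated as the special case in which every query head owns its own K/V head; the fused weight then holds qParas*num_q_heads + kParas*num_kv_heads + vParas*num_kv_heads + oParas*num_q_heads parameters. A sketch of the same delegation, with hypothetical free functions standing in for the FFModel members:

#include <cstdio>

static long multiquery_weight_params(int num_q_heads, int num_kv_heads,
                                     int qParas, int kParas, int vParas,
                                     int oParas) {
  return (long)qParas * num_q_heads + (long)kParas * num_kv_heads +
         (long)vParas * num_kv_heads + (long)oParas * num_q_heads;
}

static long multihead_weight_params(int num_heads, int qParas, int kParas,
                                    int vParas, int oParas) {
  // Delegate with num_kv_heads = num_heads, exactly like the wrapper in the
  // patch does for the full operator constructor.
  return multiquery_weight_params(num_heads, num_heads, qParas, kParas,
                                  vParas, oParas);
}

int main() {
  // Placeholder per-head parameter counts (projSize * modelSize).
  int qParas = 4096 * 128, kParas = 4096 * 128, vParas = 4096 * 128,
      oParas = 4096 * 128;
  std::printf("multi-head weights:  %ld\n",
              multihead_weight_params(32, qParas, kParas, vParas, oParas));
  std::printf("multi-query weights: %ld\n",
              multiquery_weight_params(32, 8, qParas, kParas, vParas, oParas));
  return 0;
}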
2 : 1) /*weights*/, 1 /*outputs*/, _input), - num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), @@ -274,7 +310,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads * (qParas + oParas) + + dims[1].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; int seed = std::rand(); @@ -287,7 +323,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_heads + + bias_shape.dims[0].size = qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -316,7 +352,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( const ParallelTensor _input, const ParallelTensor _weight, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -340,7 +376,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( 1 /*outputs*/, _input, _weight), - num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), @@ -373,7 +409,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads * (qParas + oParas) + + dims[1].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; // dims[2].size = qParas + kParas + vParas + oParas; @@ -387,7 +423,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_heads + + bias_shape.dims[0].size = qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -423,7 +459,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( other.layer_guid, input, other.oProjSize, - other.num_heads, + other.num_q_heads, other.num_kv_heads, other.qProjSize, other.vProjSize, @@ -448,7 +484,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.layer_guid, input, params.embed_dim, - params.num_heads, + params.num_q_heads, params.num_kv_heads, params.kdim, params.vdim, @@ -586,7 +622,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_heads = attn->num_heads; + int num_q_heads = attn->num_q_heads; int num_kv_heads = attn->num_kv_heads; assert(attn->oProjSize == output.domain.hi()[0] - 
output.domain.lo()[0] + 1); @@ -602,7 +638,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( weight, gpu_mem_allocator, num_samples, - num_heads, + num_q_heads, num_kv_heads); // assert that we didn't over allocate memory assert(gpu_mem_allocator.instance_allocated_size == @@ -745,7 +781,7 @@ bool SpecIncMultiHeadSelfAttention::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_NUM_HEADS: - *value = num_heads; + *value = num_q_heads; return true; default: return Op::get_int_parameter(para, value); @@ -768,7 +804,7 @@ bool SpecIncMultiHeadSelfAttention::measure_operator_cost( bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, SpecIncMultiHeadSelfAttentionParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && - lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && + lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && lhs.add_zero_attn == rhs.add_zero_attn && @@ -783,7 +819,7 @@ SpecIncMultiHeadSelfAttentionParams SpecIncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; params.embed_dim = this->oProjSize; - params.num_heads = this->num_heads; + params.num_q_heads = this->num_q_heads; params.num_kv_heads = this->num_kv_heads; params.kdim = this->kProjSize; params.vdim = this->vProjSize; @@ -807,7 +843,7 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.embed_dim); - hash_combine(key, params.num_heads); + hash_combine(key, params.num_q_heads); hash_combine(key, params.num_kv_heads); hash_combine(key, params.kdim); hash_combine(key, params.vdim); diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b95b215b5b..09198c5751 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -64,7 +64,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads, + int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, BEAM_SEARCH_MODE, @@ -85,9 +85,9 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( weight, gpu_mem_allocator, num_samples, - attn->num_heads, + attn->num_q_heads, attn->num_kv_heads, - _num_heads, + _num_q_heads, _num_kv_heads, DT_NONE, false) { diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 664b746096..52a619a5d8 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -43,13 +43,13 @@ __global__ void spec_store_kv_cache( int kProjSize, int vProjSize, int num_tokens, - int num_heads, + int num_q_heads, int num_kv_heads, int max_seq_len, int max_beam_width, bool is_root) { CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { - int q_array_size = qProjSize * num_tokens * num_heads; + int q_array_size = qProjSize * num_tokens * num_q_heads; int k_array_size = kProjSize * num_tokens * num_kv_heads; bool k_cache = i < k_array_size; @@ -189,7 +189,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens, - m->num_heads, + m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH, BeamSearchBatchConfig::MAX_BEAM_WIDTH, @@ 
-201,9 +201,9 @@ template __global__ void spec_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, size_t total_tokens_in_request, - size_t num_heads, + size_t num_q_heads, DT value) { - CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_heads) { + CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { // size_t head_idx = i / (new_tokens * total_tokens_in_request); size_t src_idx = (i / new_tokens) % total_tokens_in_request; size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; @@ -288,9 +288,9 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // } // To get C, skip over QK^T products from previous requests DT *C = static_cast
(m->qk_prods) + - m->num_heads * tokens_prev_requests_squares; + m->num_q_heads * tokens_prev_requests_squares; - if (m->num_heads == m->num_kv_heads) { + if (m->num_q_heads == m->num_kv_heads) { checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -311,12 +311,12 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cublas_data_type, ldc, strideC, - m->num_heads, + m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { strideB = 0; - int one_step_heads = m->num_heads / m->num_kv_heads; + int one_step_heads = m->num_q_heads / m->num_kv_heads; m_ = num_new_tokens; n = total_tokens; k = m->qProjSize; @@ -353,7 +353,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // causal attention. assert(num_new_tokens <= total_tokens); if (num_new_tokens > 1) { - size_t parallelism = m->num_heads * num_new_tokens * total_tokens; + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; spec_fill_entries_above_diagonal<<num_heads, + m->num_q_heads, static_cast
(-INFINITY)); } // Compute Softmax(QK^T/sqrt(d_k)) @@ -375,7 +375,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // with no padding between images, feature maps, rows, and columns; the // columns are the inner dimension and the images are the outermost // dimension. - int n_param = m->num_heads; + int n_param = m->num_q_heads; int c_param = total_tokens; int h_param = 1; int w_param = num_new_tokens; @@ -388,7 +388,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, w_param)); float softmax_alpha = 1.0f, softmax_beta = 0.0f; DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_heads * tokens_prev_requests_squares; + m->num_q_heads * tokens_prev_requests_squares; // The softmax operation below is executed according to the // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The // softmax operation is computed per spatial location (H,W) per image (N) @@ -421,9 +421,9 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_heads * m->vProjSize; + tokens_previous_requests * m->num_q_heads * m->vProjSize; - if (m->num_heads == m->num_kv_heads) { + if (m->num_q_heads == m->num_kv_heads) { checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -444,11 +444,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cublas_data_type, ldc, strideC, - m->num_heads, + m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - int one_step_heads = m->num_heads / m->num_kv_heads; + int one_step_heads = m->num_q_heads / m->num_kv_heads; n = m->vProjSize; lda = m_, ldb = n, ldc = m_; strideA = num_new_tokens * total_tokens; @@ -485,10 +485,10 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; - k = m->vProjSize * m->num_heads; + k = m->vProjSize * m->num_q_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_heads + + A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_kv_heads + m->vProjSize * m->num_kv_heads); B = C; @@ -519,7 +519,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_heads + + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + m->vProjSize * m->global_num_kv_heads; apply_proj_bias_w<<num_heads, + attn->num_q_heads, attn->num_kv_heads, - _num_heads, + _num_q_heads, _num_kv_heads, DT_NONE, false) { diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 133543650b..875f38c77a 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -62,6 +62,42 @@ Tensor FFModel::inc_multihead_self_attention_verify( const Tensor input, int embed_dim, int num_heads, + int kdim, + int vdim, + float dropout, + bool bias, + bool add_bias_kv, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + char const *name) { + return inc_multiquery_self_attention_verify(input, + embed_dim, + num_heads, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + name); +} + +Tensor FFModel::inc_multiquery_self_attention_verify( + const Tensor input, + int embed_dim, + int num_q_heads, int num_kv_heads, int kdim, int vdim, @@ -122,8 +158,8 @@ Tensor FFModel::inc_multihead_self_attention_verify( int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); int one_head_size = qParas + kParas + vParas + oParas; - int weight_size = qParas * num_heads + kParas * num_kv_heads + - vParas * num_kv_heads + oParas * num_heads; + int weight_size = qParas * num_q_heads + kParas * num_kv_heads + + vParas * num_kv_heads + oParas * num_q_heads; { // compress the weight size if quantization. 
if (quantization_type != DT_NONE) { @@ -143,7 +179,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( } if (bias) { // q, k, v, o - int dims[1] = {qProjSize * num_heads + + int dims[1] = {qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize}; li->weights[1] = create_weight_legion_ordering(1, dims, @@ -155,7 +191,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( } li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); - li->add_int_property("num_heads", num_heads); + li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); @@ -182,8 +218,8 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( long long value; layer->get_int_property("embed_dim", value); int embed_dim = value; - layer->get_int_property("num_heads", value); - int num_heads = value; + layer->get_int_property("num_q_heads", value); + int num_q_heads = value; layer->get_int_property("num_kv_heads", value); int num_kv_heads = value; layer->get_int_property("kdim", value); @@ -216,7 +252,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( layer->layer_guid, inputs[0], embed_dim, - num_heads, + num_q_heads, num_kv_heads, kdim, vdim, @@ -240,7 +276,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( LayerID const &_layer_guid, const ParallelTensor _input, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -266,7 +302,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( (_bias ? 2 : 1) /*weights*/, 1 /*outputs*/, _input), - num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), @@ -301,7 +337,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads * (qParas + oParas) + + dims[1].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; // dims[2].size = qParas + kParas + vParas + oParas; @@ -322,7 +358,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_heads + + bias_shape.dims[0].size = qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -351,7 +387,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( const ParallelTensor _input, const ParallelTensor _weight, int _embed_dim, - int _num_heads, + int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, @@ -378,7 +414,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( 1 /*outputs*/, _input, _weight), - num_heads(_num_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), @@ -412,7 +448,7 @@ 
TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_heads * (qParas + oParas) + + dims[1].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads * (kParas + vParas); dims[1].is_replica_dim = false; // dims[2].size = qParas + kParas + vParas + oParas; @@ -431,7 +467,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_heads + + bias_shape.dims[0].size = qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads + oProjSize; bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -467,7 +503,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( other.layer_guid, input, other.oProjSize, - other.num_heads, + other.num_q_heads, other.num_kv_heads, other.qProjSize, other.vProjSize, @@ -495,7 +531,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.layer_guid, input, params.embed_dim, - params.num_heads, + params.num_q_heads, params.num_kv_heads, params.kdim, params.vdim, @@ -638,8 +674,8 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - // int num_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; - int num_heads = attn->num_heads / attn->tensor_parallelism_degree; + // int num_q_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree; assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); @@ -661,7 +697,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( weight, gpu_mem_allocator, num_samples, - num_heads, + num_q_heads, num_kv_heads); if (!attn->offload) { // assert that we didn't over allocate memory @@ -861,7 +897,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( // Weight tensor dimensions coord_t all_weight_params = weight_domain.hi()[0] - weight_domain.lo()[0] + 1; - coord_t num_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; + coord_t num_q_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; replica_dim = weight_domain.hi()[2] - weight_domain.lo()[2] + 1; size_t qParas = m->qProjSize * m->qSize; size_t kParas = m->kProjSize * m->kSize; @@ -869,7 +905,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( size_t oParas = m->oProjSize * (m->vProjSize > 0 ? m->vProjSize : m->vSize); assert(all_weight_params == qParas + kParas + vParas + oParas); - assert(num_heads == m->num_heads); + assert(num_q_heads == m->num_q_heads); assert(replica_dim == 1); assert(m->qSize == m->kSize && m->kSize == m->vSize); @@ -881,11 +917,11 @@ void TreeIncMultiHeadSelfAttention::inference_task( // column-major order. 
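init_task above divides both head counts by the tensor-parallelism degree, so each shard owns an equal, contiguous slice of the query heads and of the kv heads (the bias kernel's shard_id * num_q_heads offset is the matching local-to-global index conversion). A small sketch of the resulting per-shard ranges (placeholder head counts; both counts are assumed to divide evenly by the degree):

#include <cassert>
#include <cstdio>

int main() {
  int const global_num_q_heads = 32, global_num_kv_heads = 8, tp_degree = 4;
  assert(global_num_q_heads % tp_degree == 0);
  assert(global_num_kv_heads % tp_degree == 0);
  int const num_q_heads = global_num_q_heads / tp_degree;    // heads per shard
  int const num_kv_heads = global_num_kv_heads / tp_degree;
  for (int shard_id = 0; shard_id < tp_degree; shard_id++) {
    int first_q = shard_id * num_q_heads;
    int first_kv = shard_id * num_kv_heads;
    std::printf("shard %d: q heads [%d, %d), kv heads [%d, %d)\n", shard_id,
                first_q, first_q + num_q_heads, first_kv,
                first_kv + num_kv_heads);
  }
  return 0;
}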
// printf("m->kProjSize: %i, TreeVerifyBatchConfig::MAX_NUM_TOKENS: %i, " - // "bc.num_active_tokens(): %i, num_heads: %lli, + // "bc.num_active_tokens(): %i, num_q_heads: %lli, // TreeVerifyBatchConfig::MAX_NUM_REQUESTS: %i, " // "bc.num_active_requests(): %i\n", m->kProjSize, // TreeVerifyBatchConfig::MAX_NUM_TOKENS, bc.num_active_tokens(), - // num_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS, + // num_q_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS, // bc.num_active_requests()); // for (int t=0; t < bc.num_active_tokens(); t++) { // printf("token %i has request_index: %li and token_position: %li\n", @@ -912,12 +948,12 @@ void TreeIncMultiHeadSelfAttention::inference_task( // ============================================================================= // Load the Q/K/V projection weights, and create a Torch tensor // ============================================================================= - std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_heads}; + std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_q_heads}; float *w_qkv = - (float *)calloc(m->qSize * m->qProjSize * 3 * num_heads, sizeof(float)); + (float *)calloc(m->qSize * m->qProjSize * 3 * num_q_heads, sizeof(float)); assert(w_qkv[0] == 0.0f); - for (int h = 0; h < num_heads; h++) { + for (int h = 0; h < num_q_heads; h++) { for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { int row_index = i % m->qSize; int column_index = i / m->qSize; @@ -945,7 +981,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( } // convert weights to torch tensor torch::Tensor torch_w_qkv = torch::from_blob( - w_qkv, {m->qSize, m->qProjSize, 3, (int)num_heads}, torch::kFloat32); + w_qkv, {m->qSize, m->qProjSize, 3, (int)num_q_heads}, torch::kFloat32); /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() << std::endl; @@ -968,24 +1004,24 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(qkv_projs.sizes()[1] == bc.num_active_tokens() && qkv_projs.sizes()[1] <= effective_batch_size); assert(qkv_projs.sizes()[2] == 3); - assert(qkv_projs.sizes()[3] == num_heads); + assert(qkv_projs.sizes()[3] == num_q_heads); free(w_qkv); // ----------------------- Loading CUDA results for this step --------------- float *QKVProjArray_cpu = download_tensor( m->devQKVProjArray, - TreeVerifyBatchConfig::MAX_NUM_TOKENS * proj_sum * m->num_heads); + TreeVerifyBatchConfig::MAX_NUM_TOKENS * proj_sum * m->num_q_heads); assert(QKVProjArray_cpu != nullptr); std::vector QKVProjArray_converted_shape = { - m->qProjSize, bc.num_active_tokens(), 3, (int)num_heads}; + m->qProjSize, bc.num_active_tokens(), 3, (int)num_q_heads}; float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc.num_active_tokens() * 3 * num_heads, sizeof(float)); + m->qProjSize * bc.num_active_tokens() * 3 * num_q_heads, sizeof(float)); // skip over padding at the end of QKVProjArray_cpu // convert from column order to 3D matrix because torch cannot automatically // import matrices flattened in column order - for (size_t i = 0; i < proj_sum * bc.num_active_tokens() * num_heads; i++) { + for (size_t i = 0; i < proj_sum * bc.num_active_tokens() * num_q_heads; i++) { int proj_size_index = i % m->qProjSize; int head_index = i / (proj_sum * bc.num_active_tokens()); int token_index = @@ -994,7 +1030,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( int qkv_offset = (i - head_index * proj_sum * bc.num_active_tokens()) / (m->qProjSize * bc.num_active_tokens()); assert(proj_size_index < proj_sum); - assert(head_index < num_heads); + 
assert(head_index < num_q_heads); assert(token_index < bc.num_active_tokens()); assert(qkv_offset < 3); set_value_row_major(QKVProjArray_converted, @@ -1004,19 +1040,19 @@ void TreeIncMultiHeadSelfAttention::inference_task( } torch::Tensor QKVProjArray_torch = torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc.num_active_tokens(), 3, num_heads}, + {m->qProjSize, bc.num_active_tokens(), 3, num_q_heads}, torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- // std::cout << "QKVProjArray_torch" << std::endl; - // for (int i=0; ikProjSize; d++) { - size_t kcache_idx = d * MAX_SEQ_LEN * m->num_heads * + size_t kcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + bc.tokensInfo[t].abs_depth_in_request * - m->num_heads * + m->num_q_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + h * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + bc.tokensInfo[t].request_index; @@ -1045,10 +1081,10 @@ void TreeIncMultiHeadSelfAttention::inference_task( .item(); } for (size_t d = 0; d < m->vProjSize; d++) { - size_t vcache_idx = d * MAX_SEQ_LEN * m->num_heads * + size_t vcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + bc.tokensInfo[t].abs_depth_in_request * - m->num_heads * + m->num_q_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + h * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + bc.tokensInfo[t].request_index; @@ -1063,14 +1099,14 @@ void TreeIncMultiHeadSelfAttention::inference_task( torch::from_blob(m->kcache, {m->kProjSize, MAX_SEQ_LEN, - num_heads, + num_q_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, torch::kFloat32); torch::Tensor V_t = torch::from_blob(m->vcache, {m->vProjSize, MAX_SEQ_LEN, - num_heads, + num_q_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, torch::kFloat32); @@ -1099,31 +1135,31 @@ void TreeIncMultiHeadSelfAttention::inference_task( // ----------------------- Loading CUDA results for this step --------------- float *keyCache_cpu = download_tensor( m->keyCache, - m->num_heads * m->kProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * + m->num_q_heads * m->kProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); float *valueCache_cpu = download_tensor( m->valueCache, - m->num_heads * m->vProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * + m->num_q_heads * m->vProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); assert(keyCache_cpu != nullptr); assert(valueCache_cpu != nullptr); float *kcache_cuda = - (float *)calloc(m->kProjSize * MAX_SEQ_LEN * m->num_heads * + (float *)calloc(m->kProjSize * MAX_SEQ_LEN * m->num_q_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS, sizeof(float)); float *vcache_cuda = - (float *)calloc(m->vProjSize * MAX_SEQ_LEN * m->num_heads * + (float *)calloc(m->vProjSize * MAX_SEQ_LEN * m->num_q_heads * TreeVerifyBatchConfig::MAX_NUM_REQUESTS, sizeof(float)); int index = 0; for (int i = 0; i < m->kProjSize; i++) { for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_heads; k++) { + for (int k = 0; k < m->num_q_heads; k++) { for (int l = 0; l < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; l++) { - int col_major_index = l * m->kProjSize * MAX_SEQ_LEN * m->num_heads + - k * m->kProjSize * MAX_SEQ_LEN + - j * m->kProjSize + i; + int col_major_index = + l * m->kProjSize * MAX_SEQ_LEN * m->num_q_heads + + k * m->kProjSize * MAX_SEQ_LEN + j * m->kProjSize + i; kcache_cuda[index++] = keyCache_cpu[col_major_index]; } } @@ -1132,11 +1168,11 @@ void TreeIncMultiHeadSelfAttention::inference_task( index = 0; for (int i 
= 0; i < m->vProjSize; i++) { for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_heads; k++) { + for (int k = 0; k < m->num_q_heads; k++) { for (int l = 0; l < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; l++) { - int col_major_index = l * m->vProjSize * MAX_SEQ_LEN * m->num_heads + - k * m->vProjSize * MAX_SEQ_LEN + - j * m->vProjSize + i; + int col_major_index = + l * m->vProjSize * MAX_SEQ_LEN * m->num_q_heads + + k * m->vProjSize * MAX_SEQ_LEN + j * m->vProjSize + i; vcache_cuda[index++] = valueCache_cpu[col_major_index]; } } @@ -1146,14 +1182,14 @@ void TreeIncMultiHeadSelfAttention::inference_task( torch::from_blob(kcache_cuda, {m->kProjSize, MAX_SEQ_LEN, - num_heads, + num_q_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, torch::kFloat32); torch::Tensor V_t_cuda = torch::from_blob(vcache_cuda, {m->vProjSize, MAX_SEQ_LEN, - num_heads, + num_q_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, torch::kFloat32); @@ -1161,16 +1197,16 @@ void TreeIncMultiHeadSelfAttention::inference_task( // std::cout << "kcache differences:" << std::endl; // for (int i=0; i < bc.num_active_requests() + 1; i++) { - // for (int j=0; j < num_heads; j++) { + // for (int j=0; j < num_q_heads; j++) { // for (int l=0; l < m->kProjSize; l++) { // for (int k=0; k < MAX_SEQ_LEN; k++) { // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_heads * + // l * MAX_SEQ_LEN * num_q_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_q_heads * // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; if ( // abs(m->kcache[kcache_idx] - keyCache_cpu[ - // i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + // j * m->kProjSize * MAX_SEQ_LEN + // k * m->kProjSize + // l @@ -1186,11 +1222,11 @@ void TreeIncMultiHeadSelfAttention::inference_task( // std::cout << "keyCache from CUDA:" << std::endl; // for (int i=0; ikProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { // printf("%f ", - // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_heads + + // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + // j * m->kProjSize * MAX_SEQ_LEN + // k * m->kProjSize + // l @@ -1205,12 +1241,12 @@ void TreeIncMultiHeadSelfAttention::inference_task( // std::cout << "valueCache from CUDA:" << std::endl; // for (int i=0; ivProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { // printf("%f ", // valueCache_cpu[ - // i * m->vProjSize * MAX_SEQ_LEN * num_heads + + // i * m->vProjSize * MAX_SEQ_LEN * num_q_heads + // j * m->vProjSize * MAX_SEQ_LEN + // k * m->vProjSize + // l]); @@ -1226,12 +1262,12 @@ void TreeIncMultiHeadSelfAttention::inference_task( // std::cout << "C++ kcache:" << std::endl; // for (int i=0; ikProjSize; l++) { // for (int k=0; k < MAX_SEQ_LEN; k++) { // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_heads * + // l * MAX_SEQ_LEN * num_q_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_q_heads * // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; // printf("%f ", m->kcache[kcache_idx]); @@ -1245,12 +1281,12 @@ void TreeIncMultiHeadSelfAttention::inference_task( // std::cout << "C++ vcache:" << std::endl; // for (int i=0; ivProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { // size_t vcache_idx = - // l * MAX_SEQ_LEN * num_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_heads * + // l * MAX_SEQ_LEN * 
num_q_heads * + // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_q_heads * // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; // printf("%f ", m->vcache[vcache_idx]); @@ -1272,11 +1308,11 @@ void TreeIncMultiHeadSelfAttention::inference_task( // ============================================================================= // ----------------------- C++ operations & checks -------------------------- - float *w_out = (float *)calloc(m->vProjSize * m->num_heads * m->oProjSize, + float *w_out = (float *)calloc(m->vProjSize * m->num_q_heads * m->oProjSize, sizeof(float)); - std::vector w_out_shape = {m->vProjSize, m->num_heads, m->oProjSize}; + std::vector w_out_shape = {m->vProjSize, m->num_q_heads, m->oProjSize}; assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - for (int h = 0; h < num_heads; h++) { + for (int h = 0; h < num_q_heads; h++) { for (int v = 0; v < m->vProjSize; v++) { for (int o = 0; o < m->oProjSize; o++) { set_value_row_major( @@ -1290,22 +1326,22 @@ void TreeIncMultiHeadSelfAttention::inference_task( } // convert weights to torch tensor torch::Tensor torch_w_out = torch::from_blob( - w_out, {m->vProjSize, m->num_heads, m->oProjSize}, torch::kFloat32); + w_out, {m->vProjSize, m->num_q_heads, m->oProjSize}, torch::kFloat32); // ----------------------- Loading CUDA results for this step --------------- float *w_out_cuda = download_tensor( - m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_heads); + m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_q_heads); assert(w_out_cuda != nullptr); float *converted_wout_tensor = (float *)calloc( - m->vProjSize * m->num_heads * m->oProjSize, sizeof(float)); + m->vProjSize * m->num_q_heads * m->oProjSize, sizeof(float)); std::vector converted_wout_tensor_shape = { - m->vProjSize, m->num_heads, m->oProjSize}; + m->vProjSize, m->num_q_heads, m->oProjSize}; - for (int i = 0; i < m->vProjSize * m->num_heads * m->oProjSize; i++) { + for (int i = 0; i < m->vProjSize * m->num_q_heads * m->oProjSize; i++) { int v_idx = i % m->vProjSize; - int h_idx = (i / m->vProjSize) % m->num_heads; - int o_idx = i / (m->vProjSize * m->num_heads); - assert(v_idx < m->vProjSize && h_idx < m->num_heads && + int h_idx = (i / m->vProjSize) % m->num_q_heads; + int o_idx = i / (m->vProjSize * m->num_q_heads); + assert(v_idx < m->vProjSize && h_idx < m->num_q_heads && o_idx < m->oProjSize); set_value_row_major(converted_wout_tensor, converted_wout_tensor_shape, @@ -1314,7 +1350,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( } torch::Tensor w_out_cuda_tensor = torch::from_blob(converted_wout_tensor, - {m->vProjSize, m->num_heads, m->oProjSize}, + {m->vProjSize, m->num_q_heads, m->oProjSize}, torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- @@ -1342,18 +1378,18 @@ void TreeIncMultiHeadSelfAttention::inference_task( float *qk_prods_cpu = download_tensor( m->qk_prods, TreeVerifyBatchConfig::MAX_NUM_TOKENS * - TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads); + TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_q_heads); assert(qk_prods_cpu != nullptr); float *qk_prods_softmax_cpu = download_tensor( m->qk_prods_softmax, TreeVerifyBatchConfig::MAX_NUM_TOKENS * - TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_heads); + TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_q_heads); assert(qk_prods_softmax_cpu != nullptr); float *attn_heads_cpu = download_tensor( m->attn_heads, - TreeVerifyBatchConfig::MAX_NUM_TOKENS * m->num_heads * m->vProjSize); + 
TreeVerifyBatchConfig::MAX_NUM_TOKENS * m->num_q_heads * m->vProjSize); assert(attn_heads_cpu != nullptr); // ----------------------- Main loop (request by request) ------------------- @@ -1379,7 +1415,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( // std::cout << "Q_req.sizes(): " << Q_req.sizes() << std::endl; assert(Q_req.sizes()[0] == m->qProjSize); assert(Q_req.sizes()[1] == num_new_tokens); - assert(Q_req.sizes()[2] == num_heads); + assert(Q_req.sizes()[2] == num_q_heads); /*printf("\n------------ QK multiplication (C++) -------------\n"); printf("Request r=%lu. num_new_tokens: %lu, num_tokens_received_so_far: %li, @@ -1404,7 +1440,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( (1.0f / sqrt(m->kProjSize)); // Set entries above diagonal to -inf to make attention causal. - for (int h = 0; h < num_heads; h++) { + for (int h = 0; h < num_q_heads; h++) { qk_products[r].index( {Slice(), Slice(num_tokens_received_so_far - num_new_tokens), h}) = qk_products[r] @@ -1421,24 +1457,26 @@ void TreeIncMultiHeadSelfAttention::inference_task( qk_softmax[r] = torch::softmax(qk_products[r], -2); assert(qk_softmax[r].sizes()[0] == num_new_tokens); assert(qk_softmax[r].sizes()[1] == num_tokens_received_so_far); - assert(qk_softmax[r].sizes()[2] == m->num_heads); + assert(qk_softmax[r].sizes()[2] == m->num_q_heads); // ------------------- Loading CUDA results for this step --------------- float *converted_qk_prod = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + num_new_tokens * num_tokens_received_so_far * num_q_heads, + sizeof(float)); float *converted_qk_prod_softmax = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_heads, sizeof(float)); + num_new_tokens * num_tokens_received_so_far * num_q_heads, + sizeof(float)); std::vector converted_qk_prod_shape = { - (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_heads}; + (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_q_heads}; for (size_t i = 0; - i < num_new_tokens * num_tokens_received_so_far * num_heads; + i < num_new_tokens * num_tokens_received_so_far * num_q_heads; i++) { size_t new_t_idx = i % num_new_tokens; size_t all_t_idx = (i / num_new_tokens) % num_tokens_received_so_far; size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); assert(new_t_idx < num_new_tokens && - all_t_idx < num_tokens_received_so_far && head_idx < num_heads); + all_t_idx < num_tokens_received_so_far && head_idx < num_q_heads); set_value_row_major(converted_qk_prod, converted_qk_prod_shape, {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, @@ -1450,34 +1488,34 @@ void TreeIncMultiHeadSelfAttention::inference_task( } torch::Tensor qk_prods_cuda = torch::from_blob( converted_qk_prod, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_q_heads}, torch::kFloat32); torch::Tensor qk_prods_softmax_cuda = torch::from_blob( converted_qk_prod_softmax, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_heads}, + {(int64_t)num_new_tokens, num_tokens_received_so_far, num_q_heads}, torch::kFloat32); // ------------------- Comparing C++ & CUDA results ------------------ /* std::cout << "C++:" <num_heads); + .sizes()[2] == m->num_q_heads); attn_heads[r] = torch::einsum( "ijk,ljk->ilk", {qk_softmax[r], @@ -1504,33 +1542,33 @@ void TreeIncMultiHeadSelfAttention::inference_task( {Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid})}); assert(attn_heads[r].sizes()[0] == 
num_new_tokens); assert(attn_heads[r].sizes()[1] == m->vProjSize); - assert(attn_heads[r].sizes()[2] == m->num_heads); + assert(attn_heads[r].sizes()[2] == m->num_q_heads); // ------------------- Loading CUDA results for this step --------------- - float converted_attn_heads_cpu[num_new_tokens][m->vProjSize][m->num_heads] = - {0}; - for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_heads; i++) { + float converted_attn_heads_cpu[num_new_tokens][m->vProjSize] + [m->num_q_heads] = {0}; + for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_q_heads; i++) { int token_ix = i % num_new_tokens; int vproj_idx = (i / num_new_tokens) % m->vProjSize; int head_idx = i / (num_new_tokens * m->vProjSize); assert(token_ix < num_new_tokens && vproj_idx < m->vProjSize && - head_idx < m->num_heads); + head_idx < m->num_q_heads); converted_attn_heads_cpu[token_ix][vproj_idx][head_idx] = - attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_heads + i]; + attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_q_heads + i]; } - torch::Tensor converted_attn_heads_cuda = - torch::from_blob(converted_attn_heads_cpu, - {(int64_t)num_new_tokens, m->vProjSize, m->num_heads}, - torch::kFloat32); + torch::Tensor converted_attn_heads_cuda = torch::from_blob( + converted_attn_heads_cpu, + {(int64_t)num_new_tokens, m->vProjSize, m->num_q_heads}, + torch::kFloat32); // -------------------- Comparing C++ & CUDA results ------------------- /* std::cout << "CUDA attn head for req " << r << ":" <num_heads; h++) { + for (int h=0; hnum_q_heads; h++) { std::cout << converted_attn_heads_cuda.index({Slice(), Slice(), h}) << std::endl; } std::cout << "C++ attn head for req " << r << ":" <num_heads; h++) { + for (int h=0; hnum_q_heads; h++) { std::cout << attn_heads[r].index({Slice(), Slice(), h}) << std::endl; } */ assert(torch::allclose( @@ -1545,7 +1583,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( // increment main loop's auxiliary index qk_prods_cpu_offset += - num_new_tokens * num_tokens_received_so_far * num_heads; + num_new_tokens * num_tokens_received_so_far * num_q_heads; } // ----------------------- Comparing C++ & CUDA results --------------------- @@ -1594,7 +1632,7 @@ bool TreeIncMultiHeadSelfAttention::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_NUM_HEADS: - *value = num_heads; + *value = num_q_heads; return true; default: return Op::get_int_parameter(para, value); @@ -1609,7 +1647,7 @@ bool TreeIncMultiHeadSelfAttention::measure_operator_cost( bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs, TreeIncMultiHeadSelfAttentionParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && - lhs.num_heads == rhs.num_heads && lhs.kdim == rhs.kdim && + lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && lhs.add_zero_attn == rhs.add_zero_attn && @@ -1624,7 +1662,7 @@ TreeIncMultiHeadSelfAttentionParams TreeIncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; params.embed_dim = this->oProjSize; - params.num_heads = this->num_heads; + params.num_q_heads = this->num_q_heads; params.num_kv_heads = this->num_kv_heads; params.kdim = this->kProjSize; params.vdim = this->vProjSize; @@ -1648,7 +1686,7 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.embed_dim); - hash_combine(key, params.num_heads); + hash_combine(key, 
params.num_q_heads); hash_combine(key, params.num_kv_heads); hash_combine(key, params.kdim); hash_combine(key, params.vdim); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 9927ef7af0..a20077efb4 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -64,7 +64,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, - int _num_heads, + int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, TREE_VERIFY_MODE, @@ -85,9 +85,9 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( weight, gpu_mem_allocator, num_samples, - attn->num_heads, + attn->num_q_heads, attn->num_kv_heads, - _num_heads, + _num_q_heads, _num_kv_heads, attn->quantization_type, attn->offload), diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index c19f4f37b3..69f085d3eb 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -42,7 +42,7 @@ __global__ void commit_tokens_kernel( int vProjSize, int num_tokens_to_commit, int num_active_tokens_in_last_batch, - int num_heads, + int num_q_heads, int num_kv_heads, int max_seq_len) { @@ -60,7 +60,8 @@ __global__ void commit_tokens_kernel( int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - int q_array_size = qProjSize * num_active_tokens_in_last_batch * num_heads; + int q_array_size = + qProjSize * num_active_tokens_in_last_batch * num_q_heads; int k_array_size = kProjSize * num_active_tokens_in_last_batch * num_kv_heads; @@ -99,7 +100,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch - m->num_heads, + m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); } @@ -117,13 +118,13 @@ __global__ void update_tree_branch_kv_cache( int num_tokens_in_branch, int processed_tokens_in_batch, int total_tokens_in_batch, - int num_heads, + int num_q_heads, int num_kv_heads, int max_seq_len) { CUDA_KERNEL_LOOP( i, num_tokens_in_branch * (kProjSize + vProjSize) * num_kv_heads) { - int q_array_size = qProjSize * total_tokens_in_batch * num_heads; + int q_array_size = qProjSize * total_tokens_in_batch * num_q_heads; int k_array_size = kProjSize * total_tokens_in_batch * num_kv_heads; bool k_cache = i < (num_tokens_in_branch * kProjSize * num_kv_heads); @@ -155,9 +156,9 @@ template __global__ void tree_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, size_t total_tokens_in_request, - size_t num_heads, + size_t num_q_heads, DT value) { - CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_heads) { + CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { // size_t head_idx = i / (new_tokens * total_tokens_in_request); size_t src_idx = (i / new_tokens) % total_tokens_in_request; size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; @@ -234,7 +235,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch m->num_active_tokens, // total_tokens_in_batch - m->num_heads, + m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); } @@ 
-263,7 +264,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over QK^T products from previous requests DT *C = static_cast<DT *>
(m->qk_prods); - if (m->num_heads == m->num_kv_heads) { + if (m->num_q_heads == m->num_kv_heads) { checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -284,12 +285,12 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cublas_data_type, ldc, strideC, - m->num_heads, + m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { strideB = 0; - int one_step_heads = m->num_heads / m->num_kv_heads; + int one_step_heads = m->num_q_heads / m->num_kv_heads; for (int step = 0; step < m->num_kv_heads; step++) { checkCUDA( cublasGemmStridedBatchedEx(m->handle.blas, @@ -323,7 +324,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, assert(num_new_tokens <= total_tokens_in_request); if (num_new_tokens > 1) { size_t parallelism = - m->num_heads * num_new_tokens * total_tokens_in_request; + m->num_q_heads * num_new_tokens * total_tokens_in_request; tree_fill_entries_above_diagonal<<num_heads, + m->num_q_heads, static_cast
(-INFINITY)); } // Compute Softmax(QK^T/sqrt(d_k)) @@ -345,7 +346,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // with no padding between images, feature maps, rows, and columns; the // columns are the inner dimension and the images are the outermost // dimension. - int n_param = m->num_heads; + int n_param = m->num_q_heads; int c_param = total_tokens_in_request; int h_param = 1; int w_param = num_new_tokens; @@ -389,9 +390,9 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast
(m->attn_heads) + - processed_tokens_in_batch * m->num_heads * m->vProjSize; + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - if (m->num_heads == m->num_kv_heads) { + if (m->num_q_heads == m->num_kv_heads) { checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -412,11 +413,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cublas_data_type, ldc, strideC, - m->num_heads, + m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - int one_step_heads = m->num_heads / m->num_kv_heads; + int one_step_heads = m->num_q_heads / m->num_kv_heads; strideB = 0; for (int step = 0; step < m->num_kv_heads; step++) { checkCUDA( @@ -449,10 +450,10 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; - k = m->vProjSize * m->num_heads; + k = m->vProjSize * m->num_q_heads; n = num_new_tokens; lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_heads + + A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_kv_heads + m->vProjSize * m->num_kv_heads); B = C; @@ -486,7 +487,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, } if (*m->bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_heads + + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + m->vProjSize * m->global_num_kv_heads; apply_proj_bias_w<<num_heads, + attn->num_q_heads, attn->num_kv_heads, - _num_heads, + _num_q_heads, _num_kv_heads, attn->quantization_type, attn->offload), diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index a3a9e5c4d9..f348ca9016 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2312,7 +2312,7 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->oProjSize); - sez.serialize(attn->num_heads); + sez.serialize(attn->num_q_heads); sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); @@ -2335,7 +2335,7 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->oProjSize); - sez.serialize(attn->num_heads); + sez.serialize(attn->num_q_heads); sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); @@ -2355,7 +2355,7 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->oProjSize); - sez.serialize(attn->num_heads); + sez.serialize(attn->num_q_heads); sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); @@ -2736,7 +2736,7 @@ void FFModel::deserialize_graph_optimal_view( } case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_heads, k_dim, v_dim, num_kv_heads, + int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, @@ -2747,7 +2747,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(transformer_layer_id); LayerID layer_guid(id, transformer_layer_id); dez.deserialize(embed_dim); - dez.deserialize(num_heads); + dez.deserialize(num_q_heads); 
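The renames above reflect that the attention operators now track query heads and key/value heads separately. When the two counts match, the kernels issue a single strided-batched GEMM over all heads; when they differ, the compute_attention_kernel hunks loop over the KV heads and reuse each K/V block for one_step_heads = num_q_heads / num_kv_heads consecutive query heads (grouped-query attention). A minimal C++ sketch of that head-to-head mapping, with illustrative names only (not FlexFlow code), assuming num_q_heads is a multiple of num_kv_heads:

    #include <cassert>
    #include <cstdio>

    // Which KV head serves a given query head under grouped-query attention.
    int kv_head_for_q_head(int q_head, int num_q_heads, int num_kv_heads) {
      assert(num_q_heads % num_kv_heads == 0);
      int one_step_heads = num_q_heads / num_kv_heads; // query heads per KV head
      return q_head / one_step_heads;
    }

    int main() {
      int num_q_heads = 32, num_kv_heads = 8;
      for (int q = 0; q < num_q_heads; q++) {
        printf("q head %2d -> kv head %d\n",
               q, kv_head_for_q_head(q, num_q_heads, num_kv_heads));
      }
      return 0;
    }

With num_q_heads = 32 and num_kv_heads = 8, query heads 0-3 share KV head 0, heads 4-7 share KV head 1, and so on, which is the grouping the per-KV-head GEMM loop above iterates over.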
dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); @@ -2765,7 +2765,7 @@ void FFModel::deserialize_graph_optimal_view( IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; - params.num_heads = num_heads; + params.num_q_heads = num_q_heads; params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; @@ -2786,7 +2786,7 @@ void FFModel::deserialize_graph_optimal_view( } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_heads, k_dim, v_dim, num_kv_heads; + int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling; @@ -2795,7 +2795,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(transformer_layer_id); LayerID layer_guid(id, transformer_layer_id); dez.deserialize(embed_dim); - dez.deserialize(num_heads); + dez.deserialize(num_q_heads); dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); @@ -2810,7 +2810,7 @@ void FFModel::deserialize_graph_optimal_view( SpecIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; - params.num_heads = num_heads; + params.num_q_heads = num_q_heads; params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; @@ -2829,7 +2829,7 @@ void FFModel::deserialize_graph_optimal_view( } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_heads, k_dim, v_dim, num_kv_heads, + int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, @@ -2840,7 +2840,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(transformer_layer_id); LayerID layer_guid(id, transformer_layer_id); dez.deserialize(embed_dim); - dez.deserialize(num_heads); + dez.deserialize(num_q_heads); dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); @@ -2858,7 +2858,7 @@ void FFModel::deserialize_graph_optimal_view( TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; - params.num_heads = num_heads; + params.num_q_heads = num_q_heads; params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; From fafbbc2ff5f6d16fed5c1a9350c002b1a6f3a8f8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 4 Aug 2023 19:34:53 -0400 Subject: [PATCH 191/344] Cleanup (#914) --- CMakeLists.txt | 5 - examples/cpp/inference/.gitignore | 1 - examples/cpp/inference/data_generator.cc | 165 ----------- examples/cpp/inference/data_generator.cpp | 80 ------ examples/cpp/inference/data_generator.h | 76 ------ examples/cpp/inference/dataloader.cc | 218 --------------- examples/cpp/inference/dataloader.cu | 136 --------- examples/cpp/inference/dataloader.h | 68 ----- examples/cpp/inference/inference_config.h | 79 ------ .../mixture_of_experts/CMakeLists.txt | 23 -- .../cpp/inference/mixture_of_experts/Makefile | 35 --- .../cpp/inference/mixture_of_experts/moe.cc | 257 ------------------ .../cpp/inference/mixture_of_experts/moe.h | 43 --- .../cpp/inference/transformers/CMakeLists.txt | 23 -- .../inference/transformers/transformers.cc | 236 ---------------- .../cpp/inference/transformers/transformers.h | 25 -- include/flexflow/batch_config.h | 13 - include/flexflow/request_manager.h | 2 - src/runtime/batch_config.cc | 91 ------- src/runtime/inference_manager.cc | 34 --- 20 files changed, 1610 deletions(-) delete mode 100644 examples/cpp/inference/.gitignore 
delete mode 100644 examples/cpp/inference/data_generator.cc delete mode 100644 examples/cpp/inference/data_generator.cpp delete mode 100644 examples/cpp/inference/data_generator.h delete mode 100644 examples/cpp/inference/dataloader.cc delete mode 100644 examples/cpp/inference/dataloader.cu delete mode 100644 examples/cpp/inference/dataloader.h delete mode 100644 examples/cpp/inference/inference_config.h delete mode 100644 examples/cpp/inference/mixture_of_experts/CMakeLists.txt delete mode 100644 examples/cpp/inference/mixture_of_experts/Makefile delete mode 100644 examples/cpp/inference/mixture_of_experts/moe.cc delete mode 100644 examples/cpp/inference/mixture_of_experts/moe.h delete mode 100644 examples/cpp/inference/transformers/CMakeLists.txt delete mode 100644 examples/cpp/inference/transformers/transformers.cc delete mode 100644 examples/cpp/inference/transformers/transformers.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b35fb1613d..a5852406a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -593,11 +593,6 @@ if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) endif() -if(FF_BUILD_MOE OR FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/inference/mixture_of_experts) - add_subdirectory(examples/cpp/inference/transformers) -endif() - if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) diff --git a/examples/cpp/inference/.gitignore b/examples/cpp/inference/.gitignore deleted file mode 100644 index 05424f2a4c..0000000000 --- a/examples/cpp/inference/.gitignore +++ /dev/null @@ -1 +0,0 @@ -weights diff --git a/examples/cpp/inference/data_generator.cc b/examples/cpp/inference/data_generator.cc deleted file mode 100644 index 9d8fe1b7be..0000000000 --- a/examples/cpp/inference/data_generator.cc +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "data_generator.h" -#include "flexflow/batch_config.h" -#include -#include -#include -using namespace std; -using namespace FlexFlow; - -DataGenerator::DataGenerator(size_t _num_requests, - size_t _vocab_size, - size_t _min_input_tokens, - size_t _max_input_tokens, - size_t _min_tokens_to_generate, - size_t _max_tokens_to_generate, - bool _poisson_distr, - double _lambda) - : num_requests(_num_requests), vocab_size(_vocab_size), - min_input_tokens(_min_input_tokens), max_input_tokens(_max_input_tokens), - min_tokens_to_generate(_min_tokens_to_generate), - max_tokens_to_generate(_max_tokens_to_generate), - poisson_distr(_poisson_distr), lambda(_lambda), timer_started(false) { - assert(max_input_tokens >= min_input_tokens); - assert(max_tokens_to_generate >= min_tokens_to_generate); - assert(max_input_tokens + max_tokens_to_generate <= - BatchConfig::MAX_SEQ_LENGTH); - generate_requests_meta(); -}; - -// generate each request's arrival time and sequence length -void DataGenerator::generate_requests_meta() { - random_device rnd1, rnd2, rnd3; - mt19937 gen1(rnd1()), gen2(rnd2()), gen3(rnd3()); - // set up a uniform number generator with range [0,1) (in seconds) for the - // arrival times - uniform_real_distribution dist1{0, 1.0}; - double cur_arrival = 0; // assume first request comes in at time 0 - // set up a uniform number generator for the initial/generated sequence length - uniform_int_distribution dist2{min_input_tokens, - max_input_tokens}; - uniform_int_distribution dist3{min_tokens_to_generate, - max_tokens_to_generate}; - size_t cur_seq_len = dist2(gen2); - size_t tokens_to_generate = dist3(gen3); - - for (size_t i = 0; i < num_requests; i++) { - arrivals.push_back(cur_arrival); - if (poisson_distr) { - double u = dist1(gen1); - double interval = -(1 / lambda) * log(1 - u) * 1000; - cur_arrival += interval; - } else { - cur_arrival += (1000 / lambda); - } - seq_lengths.push_back(std::make_pair(cur_seq_len, tokens_to_generate)); - cur_seq_len = dist2(gen2); - tokens_to_generate = dist3(gen3); - } - // cout << "Arrivals : ["; - // copy(arrivals.begin(), arrivals.end(), ostream_iterator(cout, " ")); - // cout << "]" << endl; -}; - -void DataGenerator::generate_requests(int *req_ptr) { - assert(req_ptr != nullptr); - /* for (size_t i=0; i float_dist{0, 1.0}; - // auto gen = [&float_dist, &mersenne_engine]() { - // return float_dist(mersenne_engine); - // }; - std::uniform_int_distribution int_dist(0, vocab_size - 1); - auto gen = [&int_dist, &mersenne_engine]() { - return int_dist(mersenne_engine); - }; - std::generate(req_ptr, req_ptr + max_input_tokens * num_requests, gen); -}; - -void DataGenerator::start_timer(void) { - arrivals_ptr = arrivals.begin(); - start_time = Clock::now(); - timer_started = true; -}; - -// In non-incremental mode, the number of requests we want is limited by the -// tensor's batch size. As long as each request has a length that is shorter -// than the tensor's max sequence length, we do not need to impose any -// additional requirement on the max number of tokens across requests. We can -// thus pass max_tokens = max_requests * tensor max sequence length as a -// placeholder. In incremental mode, the max number of requests is only limited -// by the BatchConfig request capacity (for storing each request's metadata), -// whereas the total number number of tokens across requests will be limited by -// the tensor's batch_size * sequence length. 
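The generator deleted above schedules request arrivals as a Poisson process by inverse-transform sampling: for u drawn uniformly from [0, 1), the gap -ln(1 - u) / lambda is exponentially distributed with rate lambda (requests per second), and the factor of 1000 converts it to milliseconds. A self-contained sketch of the same sampling step, illustrative only and not the deleted class:

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <random>
    #include <vector>

    // Arrival times (ms) of a Poisson process with rate `lambda` requests/sec.
    std::vector<double> poisson_arrivals_ms(std::size_t n, double lambda) {
      std::mt19937 gen(std::random_device{}());
      std::uniform_real_distribution<double> u01(0.0, 1.0);
      std::vector<double> arrivals;
      double t = 0.0; // assume the first request arrives at time 0
      for (std::size_t i = 0; i < n; i++) {
        arrivals.push_back(t);
        double u = u01(gen);
        t += -(1.0 / lambda) * std::log(1.0 - u) * 1000.0; // exponential gap, ms
      }
      return arrivals;
    }

    int main() {
      for (double t : poisson_arrivals_ms(5, 250.0)) {
        printf("arrival at %.3f ms\n", t);
      }
      return 0;
    }

Either way, the mean gap is 1000 / lambda milliseconds, which matches the fixed spacing the deleted code uses when poisson_distr is false.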
-std::pair DataGenerator::get_requests(size_t max_requests, - size_t max_tokens) { - // printf("\nget_requests(%lu, %lu)\n\n", max_requests, max_tokens); - if (!timer_started) { - std::cout << "Warning: tried to get number of requests before the timer " - "was started." - << std::endl; - return std::make_pair(0, 0); - } - Clock::time_point cur_time = Clock::now(); - size_t ms_from_start = - chrono::duration_cast(cur_time - start_time).count(); - std::vector::iterator new_arrivals_ptr = - upper_bound(arrivals_ptr, arrivals.end(), ms_from_start); - // number of new requests received - size_t received_requests = 0; - // id of first received request - size_t first_request_guid = arrivals_ptr - arrivals.begin(); - size_t new_tokens = 0; - for (size_t j = 0; - j < std::min((size_t)(new_arrivals_ptr - arrivals_ptr), max_requests) && - new_tokens < max_tokens; - j++) { - if (seq_lengths[first_request_guid + j].first <= max_tokens - new_tokens) { - received_requests++; - new_tokens += seq_lengths[first_request_guid + j].first; - } else { - break; - } - } - std::advance(arrivals_ptr, received_requests); - - /* if (received_requests > 0) { - std::cout << "received " << received_requests - << " request(s) by arrival time +" << ms_from_start << "ms" - << "\n"; - } */ - - return std::make_pair(first_request_guid, received_requests); -} - -std::pair DataGenerator::get_request_length(size_t guid) { - assert(seq_lengths.size() > - guid); // make sure the guid is valid (seq_lengths has an entry for the - // sequence with given guid) - return seq_lengths[guid]; -} diff --git a/examples/cpp/inference/data_generator.cpp b/examples/cpp/inference/data_generator.cpp deleted file mode 100644 index 765e9813b9..0000000000 --- a/examples/cpp/inference/data_generator.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// -// main.cpp -// dataloader -// -// Created by User on 11/15/22. 
-// - -#include "data_generator.h" -#include -#include -#include -#include -using namespace std; - -// This is for testing the request generator standalone -int main(int argc, char const *argv[]) { - - cout << "Starting the Data DataGenerator!\n"; - - // DataGenerator parameters - size_t total_requests = 2560; - size_t vocab_size = 50257; - size_t max_sequence_length = 512 + 128; - bool use_poisson_distr = true; - // average number of request arrivals per second - double lambda = 250; - - size_t min_input_tokens = 32, max_input_tokens = 512, - min_tokens_to_generate = 1, max_tokens_to_generate = 128; - - int *requests = - (int *)calloc(max_sequence_length * total_requests, sizeof(int)); - - DataGenerator data_generator(total_requests, - vocab_size, - min_input_tokens, - max_input_tokens, - min_tokens_to_generate, - max_tokens_to_generate, - use_poisson_distr, - lambda); - data_generator.generate_requests(requests); - data_generator.start_timer(); - - size_t received_requests = 0; - std::pair reqs = data_generator.get_requests(0, 0); - size_t guid = reqs.first; - assert(reqs.second == 0); - this_thread::sleep_for(milliseconds(50)); - - reqs = data_generator.get_requests(2560, 2560 * (512)); - received_requests += reqs.second; - std::cout << "t=0ms: received " << received_requests << std::endl; - - this_thread::sleep_for(milliseconds(1200)); - reqs = data_generator.get_requests(2560, 2560 * (512)); - received_requests += reqs.second; - std::cout << "t=1200ms: received " << received_requests << std::endl; - - this_thread::sleep_for(milliseconds(10)); - reqs = data_generator.get_requests(2560, 2560 * (512)); - received_requests += reqs.second; - std::cout << "t=1210ms: received " << received_requests << std::endl; - - this_thread::sleep_for(milliseconds(4000)); - reqs = data_generator.get_requests(2560, 2560 * (512)); - received_requests += reqs.second; - std::cout << "t=5210ms: received " << received_requests << std::endl; - this_thread::sleep_for(milliseconds(5000)); - - reqs = data_generator.get_requests(2560, 2560 * (512)); - received_requests += reqs.second; - std::cout << "t=10210ms: received " << received_requests << std::endl; - - free(requests); - - assert(received_requests == total_requests); - - return 0; -} diff --git a/examples/cpp/inference/data_generator.h b/examples/cpp/inference/data_generator.h deleted file mode 100644 index 3ba3007123..0000000000 --- a/examples/cpp/inference/data_generator.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -typedef std::chrono::high_resolution_clock Clock; -typedef std::chrono::milliseconds milliseconds; - -class DataGenerator { -public: - DataGenerator(size_t _num_requests, - size_t _vocab_size, - size_t _min_input_tokens, - size_t _max_input_tokens, - size_t _min_tokens_to_generate, - size_t _max_tokens_to_generate, - bool _poisson_distr, - double _lambda); - - // Generate random requests by filling each tensor with random tokens. For - // now, assume all requests have the same sequence length. - void generate_requests(int *req_ptr); - void start_timer(void); - // Get number of requests that have arrived since the last time this function - // was called - std::pair get_requests(size_t max_requests, - size_t max_tokens); - std::pair get_request_length(size_t guid); - -private: - // Compute the arrival times of each request and save them in the arrivals - // vector. - // void generate_arrival_times(void); - void generate_requests_meta(); - - size_t num_requests; // total number of requests - size_t vocab_size; // number of words in the vocab - size_t min_input_tokens; - size_t max_input_tokens; - size_t min_tokens_to_generate; - size_t max_tokens_to_generate; - bool poisson_distr; // false implies uniform distribution - double lambda; // mean #num of arrivals per sec - bool timer_started; // whether timer was initiated - // time when get_requests() is called for the first time - Clock::time_point start_time; - // arrival times (ms) generated based on distribution - std::vector arrivals; - std::vector::iterator arrivals_ptr; - // sequence lengths generated based on uniform distribution - std::vector> seq_lengths; -}; diff --git a/examples/cpp/inference/dataloader.cc b/examples/cpp/inference/dataloader.cc deleted file mode 100644 index ce49086a92..0000000000 --- a/examples/cpp/inference/dataloader.cc +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "dataloader.h" -#include "flexflow/inference.h" -#include "inference_config.h" - -using namespace Legion; - -DataLoader::DataLoader(FFModel &ff, - InferenceConfig const &inferenceConfig, - DataGenerator &data_generator, - std::vector input) { - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - - assert(input.size() > 0); - int numdims = input[0]->num_dims; - for (int i = 1; i < input.size(); i++) { - assert(input[i]->num_dims == numdims); - for (int j = 0; j < numdims; j++) { - assert(input[i]->dims[j].size == input[0]->dims[j].size); - assert(input[i]->dims[j].degree == input[0]->dims[j].degree); - assert(input[i]->dims[j].parallel_idx == input[0]->dims[j].parallel_idx); - } - } - - int replica_idx = numdims - 1; - int batch_idx = numdims - 2; - num_samples = inferenceConfig.total_requests; - - // Create full input - { - batch_input = input; - - ParallelDim dims[numdims]; - for (int i = 0; i < numdims; i++) { - dims[i].size = input[0]->dims[i].size; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = input[0]->dims[i].is_replica_dim; - // Assume only the first dim can be the replica dim - assert(i == replica_idx || (!dims[i].is_replica_dim)); - } - assert(dims[batch_idx].size == inferenceConfig.batch_size); - dims[batch_idx].size = num_samples; - - full_input = - ff.create_parallel_tensor_legion_ordering(numdims, dims, DT_INT32); - ff.map_tensor(full_input, NULL /*parallel_op*/); - } - - // Load entire dataset - // TODO: Use index launcher instead of task launcher - assert(full_input != nullptr && "full_input is nullptr"); - - DataLoaderInput dataloader_input = {inferenceConfig, data_generator}; - DataLoaderInput const *ptr = &dataloader_input; - - TaskLauncher launcher(CUSTOM_CPU_TASK_ID_1, - TaskArgument(ptr, sizeof(DataLoaderInput))); - // regions[0]: full_input - launcher.add_region_requirement(RegionRequirement(full_input->region, - WRITE_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - - runtime->execute_task(ctx, launcher); -} - -void DataLoader::load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - DataLoaderInput const input_struct = *((DataLoaderInput *)task->args); - InferenceConfig const &conf = input_struct._inferenceConfig; - DataGenerator &datagen = input_struct._data_generator; - assert(regions.size() == 1); - assert(task->regions.size() == regions.size()); - - // get input pointer - int *input_ptr = helperGetTensorPointerWO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - Domain input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - int input_dims = input_domain.get_dim(); - for (int i = 0; i < input_dims; i++) { - int input_dim = input_domain.hi()[i] - input_domain.lo()[i] + 1; - } - - if (conf.dataset_path.length() == 0) { - printf("Input dataset path is empty, using random input samples\n"); - datagen.generate_requests(input_ptr); - } else { - // Load specific dataset - } -} - -void DataLoader::next_batch(FFModel &ff, - int bid, - BatchConfig *bc, - std::map &batch_predictions, - MachineView const *mv) { - size_t num_active_tokens = bc->num_active_tokens(); - if (num_active_tokens == 0) { - return; - } - assert(bid < batch_input.size()); - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - // Load input - { - Domain domain = - runtime->get_index_space_domain(ctx, batch_input[bid]->parallel_is); - ArgumentMap argmap; - // No 
partitioning of the batch input token in inference mode - int input_dims = batch_input[bid]->num_dims; - for (int i = 0; i < input_dims; i++) { - assert(batch_input[bid]->dims[i].degree == 1 && - "Dataloader does not support input token partitioning in " - "inference mode"); - } - int batch_size = batch_input[bid]->dims[input_dims - 2].size; - int seq_len = batch_input[bid]->dims[input_dims - 3].size; - - assert(ff.config.batchSize == batch_size && - batch_size * seq_len >= num_active_tokens); - - DataLoaderNextBatchInput next_batch_input = {bc, batch_predictions}; - DataLoaderNextBatchInput const *ptr = &next_batch_input; - size_t next_batch_input_sz = sizeof(next_batch_input); - assert(ptr->prev_batch_preds.size() == batch_predictions.size()); - MachineView const *view = mv ? mv : &batch_input[bid]->machine_view; - size_t machine_view_hash = view->hash(); - IndexLauncher launcher(CUSTOM_GPU_TASK_ID_1, - batch_input[bid]->parallel_is, - TaskArgument(ptr, next_batch_input_sz), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_region_requirement(RegionRequirement(full_input->region, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - full_input->region, - MAP_TO_ZC_MEMORY)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_input[bid]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_input[bid]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); - } -} - -void DataLoader::store_outputs(BatchConfig *bc, - InferenceResult const &ir, - std::map &batch_predictions) { - assert((bc->num_active_tokens() == 0) == (bc->num_active_requests() == 0)); - if (bc->num_active_tokens() == 0) { - return; - } - // there is no num_samples, replace it with num_active_tokens - batch_predictions.clear(); - for (size_t i = 0; i < bc->num_active_tokens(); i++) { - auto guid = bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; - if (i == bc->num_active_tokens() - 1 || - guid != bc->requestsInfo[bc->tokensInfo[i + 1].request_index] - .request_guid) { - if (outputs.find(guid) == outputs.end()) { - std::vector v{ir.token_ids[i]}; - outputs[guid] = v; - } else { - outputs[guid].push_back(ir.token_ids[i]); - } - batch_predictions[guid] = ir.token_ids[i]; - } - } - assert(batch_predictions.size() == bc->num_active_requests()); -} - -void FlexFlow::register_custom_tasks() { - // Load entire dataset - { - TaskVariantRegistrar registrar(CUSTOM_CPU_TASK_ID_1, "Load Entire Dataset"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Entire Dataset Task"); - } - // Load input - { - TaskVariantRegistrar registrar(CUSTOM_GPU_TASK_ID_1, "Load Inputs"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "Load Input Task"); - } -} diff --git a/examples/cpp/inference/dataloader.cu b/examples/cpp/inference/dataloader.cu deleted file mode 100644 index 434dc337c9..0000000000 --- a/examples/cpp/inference/dataloader.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "dataloader.h" -#include "flexflow/inference.h" -#include "flexflow/request_manager.h" -#include "flexflow/utils/cuda_helper.h" - -void DataLoader::load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - DataLoaderNextBatchInput const input_struct = - *((DataLoaderNextBatchInput *)task->args); - - BatchConfig *bc = input_struct.bc; - BatchConfig::PerRequestInfo *requestInfo = bc->requestsInfo; - BatchConfig::PerTokenInfo *tokensInfo = bc->tokensInfo; - std::map const &prev_batch_preds = input_struct.prev_batch_preds; - - if (bc->num_active_tokens() == 0) { - return; - } - int const *full_input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int *batch_input_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - - Domain full_input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain batch_input_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - - coord_t sequence_length = - batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; - coord_t batch_size = - batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; - - coord_t full_input_sequence_length = - batch_input_domain.hi()[0] - batch_input_domain.lo()[0] + 1; - coord_t full_input_batch_size = - batch_input_domain.hi()[1] - batch_input_domain.lo()[1] + 1; - - assert(sequence_length == full_input_sequence_length); - assert(batch_size <= full_input_batch_size); - - // Currently assume continous indices - assert(bc->num_active_tokens() <= batch_size * sequence_length); - for (int i = 1; i < bc->num_active_tokens(); i++) { - auto prev_guid = requestInfo[tokensInfo[i - 1].request_index].request_guid; - auto guid = requestInfo[tokensInfo[i].request_index].request_guid; - if (guid == prev_guid) { - assert(tokensInfo[i].abs_depth_in_request == - tokensInfo[i - 1].abs_depth_in_request + 1); - } - } - // keep things simple for now - assert(batch_input_domain.get_volume() == batch_size * sequence_length); - - // pad inputs if needed (this is really only useful for debugging) - checkCUDA(cudaMemset( - batch_input_ptr, 0, batch_input_domain.get_volume() * sizeof(int))); - - auto guid = requestInfo[tokensInfo[0].request_index].request_guid; - int start_idx = tokensInfo[0].abs_depth_in_request; - int dst_idx = 0; - int total_tokens = 0; - - for (size_t i = 1; i <= bc->num_active_tokens(); i++) { - auto current_guid = requestInfo[tokensInfo[i].request_index].request_guid; - if (i == bc->num_active_tokens() || current_guid != guid) { - - size_t tokens_to_copy = - (tokensInfo[i - 1].abs_depth_in_request - start_idx + 1); - assert(tokens_to_copy > 0); - - int request_index = tokensInfo[i - 1].request_index; - int token_start_offset = - bc->requestsInfo[request_index].token_start_offset; - int num_processing_tokens = - bc->requestsInfo[request_index].num_tokens_in_batch; - if (tokens_to_copy > 1 || token_start_offset == 0) { - // 
initialization phase - assert(tokensInfo[i - 1].abs_depth_in_request < - (token_start_offset + num_processing_tokens)); - int const *input_zc = - full_input_ptr + (guid * sequence_length) + start_idx; - int *dst_ptr = batch_input_ptr + dst_idx; - copy_kernel<<>>( - dst_ptr, input_zc, tokens_to_copy); - } else { - // incremental phase - assert(tokensInfo[i - 1].abs_depth_in_request >= token_start_offset); - assert(tokens_to_copy == 1); - - assert(prev_batch_preds.find(guid) != prev_batch_preds.end()); - int token = prev_batch_preds.at(guid); - int *dst_ptr = batch_input_ptr + dst_idx; - cudaMemcpy(dst_ptr, - &token, - sizeof(FlexFlow::RequestManager::TokenId), - cudaMemcpyHostToDevice); - } - total_tokens += tokens_to_copy; - - if (i < bc->num_active_tokens()) { - guid = bc->requestsInfo[bc->tokensInfo[i].request_index].request_guid; - start_idx = tokensInfo[i].abs_depth_in_request; - } - dst_idx = i; - } - } - assert(total_tokens == bc->num_active_tokens()); - /*printf("token_dim: %lli, sequence_length: %lli, batch_size: %lli\n", - token_dim, sequence_length, batch_size); printf("total_tokens: %lu\n", - total_tokens); printf("guid: %lu\n", guid); - print_tensor(batch_input_ptr, - batch_input_domain.get_volume(), - "[BatchInput]");*/ - checkCUDA(cudaDeviceSynchronize()); -} diff --git a/examples/cpp/inference/dataloader.h b/examples/cpp/inference/dataloader.h deleted file mode 100644 index c77c70502a..0000000000 --- a/examples/cpp/inference/dataloader.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "data_generator.h" -#include "flexflow/batch_config.h" -#include "flexflow/model.h" -#include "inference_config.h" - -#include -#include -#include -#include -#include -#include - -using namespace Legion; -using namespace FlexFlow; - -class DataLoader { -public: - DataLoader(FFModel &ff, - InferenceConfig const &inferenceConfig, - DataGenerator &data_generator, - std::vector input); - static void load_input(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - static void load_entire_dataset(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - void next_batch(FFModel &ff, - int bid, - BatchConfig *bc, - std::map &batch_predictions, - MachineView const *mv = nullptr); - void store_outputs(BatchConfig *bc, - InferenceResult const &ir, - std::map &batch_predictions); - -public: - size_t num_samples; - ParallelTensor full_input; - std::vector batch_input; - std::map> outputs; - struct DataLoaderInput { - InferenceConfig const &_inferenceConfig; - DataGenerator &_data_generator; - }; - struct DataLoaderNextBatchInput { - BatchConfig *bc; - std::map const &prev_batch_preds; - }; -}; diff --git a/examples/cpp/inference/inference_config.h b/examples/cpp/inference/inference_config.h deleted file mode 100644 index c6cdd5da6c..0000000000 --- a/examples/cpp/inference/inference_config.h +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "flexflow/batch_config.h" -#include -// #define MAX_SEQ_LEN 1024 -static int const MAX_SEQ_LEN = FlexFlow::BatchConfig::MAX_SEQ_LENGTH; -#define BATCH_SIZE 16 -#define MNIST_DIMS 28 * 28 -#define DATA_DIM MNIST_DIMS -// #define DATA_DIM 3 - -struct InferenceConfig { - InferenceConfig(void) { - //----------------------- Input/output data ------------------------ - token_dim = DATA_DIM; - sequence_length = MAX_SEQ_LEN; - batch_size = BATCH_SIZE; - out_dim = DATA_DIM; - num_labels = out_dim; - num_layers = 12; - - vocab_size = 50257; - block_size = 1024; - - //----------------------- Inference parameters --------------------- - // total number of requests processed as part of the simulation - total_requests = 2560; - poisson_distribution = true; - // average number of request arrivals per second - arrival_rate = 250; - num_inflight_batches = 4; - incremental_mode = true; - //----------------------- Rest of model parameters ------------------ - hidden_size = DATA_DIM; - // Encoder layer - num_attention_heads = 16; - attention_kdim = attention_vdim = hidden_size / num_attention_heads; - num_encoder_layers = 12; - } - - // Input/output data - int token_dim; - int sequence_length; - int batch_size; - int out_dim; - int num_labels; - int num_layers; - - int vocab_size; - int block_size; - - std::string dataset_path; - // Inference parameters - int total_requests; - bool poisson_distribution; - double arrival_rate; - int num_inflight_batches; - bool incremental_mode; - // Model parameters - int hidden_size; - int num_attention_heads; - int attention_kdim; - int attention_vdim; - int num_encoder_layers; -}; diff --git a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt b/examples/cpp/inference/mixture_of_experts/CMakeLists.txt deleted file mode 100644 index b943623857..0000000000 --- a/examples/cpp/inference/mixture_of_experts/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(FlexFlowExample_MoE) -set(project_target inference_moe) - -set(CPU_SRC - ${FLEXFLOW_CPP_DRV_SRC} - moe.cc - ../dataloader.cc - ../data_generator.cc - ${FLEXFLOW_ROOT}/src/runtime/gpt_tokenizer.cc) - -set(GPU_SRC - ../dataloader.cu) - -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) - -set(BIN_DEST "bin") -install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) - diff --git a/examples/cpp/inference/mixture_of_experts/Makefile b/examples/cpp/inference/mixture_of_experts/Makefile deleted file mode 100644 index 15fbf25b9a..0000000000 --- a/examples/cpp/inference/mixture_of_experts/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2020 Stanford University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Flags for directing the runtime makefile what to include -DEBUG ?= 1 # Include debugging symbols -MAX_DIM ?= 4 # Maximum number of dimensions -OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level -USE_CUDA ?= 1 # Include CUDA support (requires CUDA) -USE_GASNET ?= 0 # Include GASNet support (requires GASNet) -USE_HDF ?= 0 # Include HDF5 support (requires HDF5) -ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) - -# Put the binary file name here -OUTFILE ?= inference_moe -# List all the application source files here -GEN_SRC = moe.cc dataloader.cc ../data_generator.cc -GEN_GPU_SRC = dataloader.cu - -ifndef FF_HOME -$(error FF_HOME variable is not defined, aborting build) -endif - -include $(FF_HOME)/FlexFlow.mk diff --git a/examples/cpp/inference/mixture_of_experts/moe.cc b/examples/cpp/inference/mixture_of_experts/moe.cc deleted file mode 100644 index 4a5c33c9b0..0000000000 --- a/examples/cpp/inference/mixture_of_experts/moe.cc +++ /dev/null @@ -1,257 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "moe.h" -#include "flexflow/inference.h" -#include "flexflow/request_manager.h" -#include -#include -#include -#include -#include -#include - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("MoE"); - -void parse_input_args(char **argv, int argc, MoeConfig &config) { - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "--dataset")) { - config.dataset_path = std::string(argv[++i]); - continue; - } - } -} - -Tensor create_moe(FFModel *model, - MoeConfig const *moeConfig, - Tensor const &input) { - // MoE model - Tensor gate_preds = model->dense(input, moeConfig->num_exp, AC_MODE_RELU); - Tensor topK_output[2]; - model->top_k(gate_preds, topK_output, moeConfig->num_select, false); - - assert(moeConfig->num_exp % moeConfig->experts_per_block == 0); - int nblocks = moeConfig->num_exp / moeConfig->experts_per_block; - Tensor exp_preds; - Tensor expert_block_inputs[3] = {input, topK_output[1], topK_output[0]}; - for (int i = 0; i < nblocks /*number of experts layers*/; i++) { - Tensor block_preds = - model->experts(expert_block_inputs, - moeConfig->experts_per_block, /*number of experts*/ - moeConfig->experts_per_block * i, /*expert start index*/ - moeConfig->hidden_size, /*output_size*/ - moeConfig->alpha); - assert(block_preds != nullptr); - if (i == 0) { - exp_preds = block_preds; - } else { - assert(exp_preds != nullptr); - model->add(exp_preds, block_preds, /*inplace_a*/ true); - } - } - - // model->get_metrics(); - return exp_preds; -} - -Tensor create_moe_encoder(FFModel *model, - MoeConfig const *moeConfig, - Tensor const &input) { - std::vector axes = {0, 1, 2}; - Tensor x = input; - for (int i = 0; i < moeConfig->num_encoder_layers; i++) { - Tensor t = moeConfig->incremental_mode - ? 
model->inc_multihead_self_attention( - x, - moeConfig->hidden_size, - moeConfig->num_attention_heads, - moeConfig->attention_kdim, - moeConfig->attention_vdim) - : model->multihead_attention(x, - x, - x, - moeConfig->hidden_size, - moeConfig->num_attention_heads, - moeConfig->attention_kdim, - moeConfig->attention_vdim); - x = model->layer_norm(model->add(t, x), axes, true, 1e-05); - x = model->layer_norm( - model->add(create_moe(model, moeConfig, x), x), axes, true, 1e-05); - } - return x; -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - //----------------------- Initial configurations ------------------------ - MoeConfig moeConfig; - FFConfig ffConfig; - ffConfig.batchSize = moeConfig.batch_size; - { - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, moeConfig); - log_app.print("batchSize(%d) workersPerNodes(%d) numNodes(%d)", - ffConfig.batchSize, - ffConfig.workersPerNode, - ffConfig.numNodes); - } - FFModel ff(ffConfig); - - //----------------------- Create inputs -------------------------------- - Tensor input; - { - int const dims[] = {ffConfig.batchSize, moeConfig.sequence_length}; - input = ff.create_tensor<2>(dims, DT_INT32); - } - Tensor t = input; - Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - t = ff.embedding(t, - moeConfig.vocab_size, - moeConfig.token_dim, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - - //----------------------- Define the model ------------------------------ - t = create_moe_encoder(&ff, &moeConfig, t); - // Tensor t = create_moe(&ff, &moeConfig, input); - t = ff.dense(t, moeConfig.out_dim, AC_MODE_RELU); - t = ff.softmax(t); - // select most likely next token - Tensor output = ff.arg_top_k(t, /*k=*/1, /*sorted=*/false); - - //------------------- Initialize the inference manager ------------------ - InferenceManager im(ff.config, moeConfig.batch_size); - im.compile_model_and_allocate_buffer(&ff); - im.init_operators_inference(&ff); - - //------------ Initialize the data loader and data generator ------------ - /*size_t min_input_tokens = 32, max_input_tokens = 512, - min_tokens_to_generate = 1, max_tokens_to_generate = 128;*/ - size_t min_input_tokens = 5, max_input_tokens = 10, - min_tokens_to_generate = 1, - max_tokens_to_generate = MAX_SEQ_LEN - max_input_tokens; - DataGenerator data_generator(moeConfig.total_requests, - moeConfig.vocab_size, - min_input_tokens, - max_input_tokens, - min_tokens_to_generate, - max_tokens_to_generate, - moeConfig.poisson_distribution, - moeConfig.arrival_rate); - ParallelTensor input_pt; - ff.get_parallel_tensor_from_tensor(input, input_pt); - assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); - assert(im.tensor_buffer[input_pt].size() == ffConfig.data_parallelism_degree); - DataLoader data_loader( - ff, moeConfig, data_generator, im.tensor_buffer[input_pt]); - - //----------------------- Start timer ----------------------------------- - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_start = Realm::Clock::current_time_in_microseconds(); - - //----------------------- Begin inference! 
------------------------------- - int index = 0; - int processed_requests = 0; - int num_devices = ffConfig.workersPerNode * ffConfig.numNodes; - data_generator.start_timer(); - std::map future_handlers; - std::map batch_configs; - std::pair new_prompts; - BatchConfig *bc = nullptr; - std::map batch_predictions[ffConfig.data_parallelism_degree]; - - assert(im.max_num_tokens_per_batch == moeConfig.batch_size); - - // simulation loop. For deployment, we will use a while(true) - while (processed_requests < moeConfig.total_requests) { - for (int bid = 0; bid < ffConfig.data_parallelism_degree; bid++) { - size_t max_reqs, max_tkns; - if (future_handlers.find(bid) == future_handlers.end()) { - max_reqs = moeConfig.incremental_mode ? bc->MAX_NUM_REQUESTS - : im.max_num_tokens_per_batch; - max_tkns = moeConfig.sequence_length * moeConfig.batch_size; - new_prompts = data_generator.get_requests(max_reqs, max_tkns); - bc = new BatchConfig(); - } else { - Future future = future_handlers[bid]; - if (!future.is_ready(true /*subscribe*/)) { - continue; - } - InferenceResult ir = future.get_result(); - bc = batch_configs[bid]; - data_loader.store_outputs(bc, ir, batch_predictions[bid]); - processed_requests += bc->update_results(&ir); - max_reqs = moeConfig.incremental_mode - ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() - : im.max_num_tokens_per_batch; - max_tkns = moeConfig.sequence_length * moeConfig.batch_size - - (moeConfig.incremental_mode ? bc->num_active_tokens() : 0); - new_prompts = data_generator.get_requests(max_reqs, max_tkns); - } - assert(new_prompts.second <= max_reqs); - if (bc->num_active_tokens() == 0 && new_prompts.second == 0) { - continue; - } - for (size_t i = 0; i < new_prompts.second; i++) { - size_t guid = new_prompts.first + i; - std::pair seq_lens = - data_generator.get_request_length(guid); - assert(seq_lens.first >= min_input_tokens && - seq_lens.first <= max_input_tokens && - seq_lens.second >= min_tokens_to_generate && - seq_lens.second <= max_tokens_to_generate); - assert(bc->register_new_request(guid, seq_lens.first, seq_lens.second)); - } - bc->prepare_next_batch(); - MachineView *view = im.get_machine_view(bid % im.num_devices); - - // runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); - data_loader.next_batch(ff, bid, bc, batch_predictions[bid], view); - FutureMap fm = im.inference(&ff, bid, *bc); - // runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); - - assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; - } - } - //----------------------- End of inference! 
------------------------------ - - //----------------------- Stop timer ------------------------------------- - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_end = Realm::Clock::current_time_in_microseconds(); - double run_time = 1e-6 * (ts_end - ts_start); - printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f requests/s\n", - run_time, - moeConfig.total_requests / run_time); -} diff --git a/examples/cpp/inference/mixture_of_experts/moe.h b/examples/cpp/inference/mixture_of_experts/moe.h deleted file mode 100644 index 183229bc07..0000000000 --- a/examples/cpp/inference/mixture_of_experts/moe.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "dataloader.h" -#include "inference_config.h" - -struct MoeConfig : InferenceConfig { - MoeConfig(void) : InferenceConfig() { - //----------------------- MoE layer -------------------------------- - // total number of experts - num_exp = 128; - // number of experts in each block of fused experts - experts_per_block = 32; - // number of experts to route each token to - num_select = 2; - // expert capacity parameters - alpha = 2.0f; // factor overhead tensor size for imbalance - lambda = 0.04f; // multiplier for load balance term - // expert hidden size - hidden_size = DATA_DIM; - } - - // MoE layer - int num_exp; - int experts_per_block; - int num_select; - float alpha; - float lambda; -}; \ No newline at end of file diff --git a/examples/cpp/inference/transformers/CMakeLists.txt b/examples/cpp/inference/transformers/CMakeLists.txt deleted file mode 100644 index 0aa95f1058..0000000000 --- a/examples/cpp/inference/transformers/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(FlexFlowExample_Transformers) -set(project_target inference_transformers) - -set(CPU_SRC - ${FLEXFLOW_CPP_DRV_SRC} - transformers.cc - ../dataloader.cc - ../data_generator.cc - ${FLEXFLOW_ROOT}/src/runtime/gpt_tokenizer.cc) - -set(GPU_SRC - ../dataloader.cu) - -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/examples/cpp/inference) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) - -set(BIN_DEST "bin") -install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) - diff --git a/examples/cpp/inference/transformers/transformers.cc b/examples/cpp/inference/transformers/transformers.cc deleted file mode 100644 index 0717ddc90f..0000000000 --- a/examples/cpp/inference/transformers/transformers.cc +++ /dev/null @@ -1,236 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and 
Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "transformers.h" -#include "flexflow/inference.h" -#include "flexflow/request_manager.h" -#include -#include -#include -#include -#include -#include - -using namespace Legion; - -LegionRuntime::Logger::Category log_app("Transformers"); - -void parse_input_args(char **argv, int argc, TransformerConfig &config) { - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "--dataset")) { - config.dataset_path = std::string(argv[++i]); - continue; - } - } -} - -Tensor create_inc_multihead_attention_decoder( - FFModel *model, - TransformerConfig const *transformerConfig, - Tensor const &input) { - std::vector axes{0}; - Tensor t = - transformerConfig->incremental_mode - ? model->inc_multihead_self_attention( - input, - transformerConfig->hidden_size, - transformerConfig->num_attention_heads, - transformerConfig->attention_kdim, - transformerConfig->attention_vdim) - : model->multihead_attention(input, - input, - input, - transformerConfig->hidden_size, - transformerConfig->num_attention_heads, - transformerConfig->attention_kdim, - transformerConfig->attention_vdim); - t = model->layer_norm(model->add(t, input), axes, true, 1e-05); - Tensor x = model->dense( - model->dense( - t, transformerConfig->hidden_size, AC_MODE_RELU, false /*bias*/), - transformerConfig->hidden_size, - AC_MODE_NONE, - false /*bias*/); - t = model->layer_norm(model->add(x, t), axes, true, 1e-05); - return t; -} - -void FlexFlow::top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - //----------------------- Initial configurations ------------------------ - TransformerConfig transformerConfig; - FFConfig ffConfig; - ffConfig.batchSize = transformerConfig.batch_size; - { - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, argc, transformerConfig); - log_app.print("batchSize(%d) workersPerNodes(%d) numNodes(%d)", - ffConfig.batchSize, - ffConfig.workersPerNode, - ffConfig.numNodes); - } - FFModel ff(ffConfig); - - //----------------------- Create inputs -------------------------------- - Tensor input; - { - int const dims[] = {ffConfig.batchSize, transformerConfig.sequence_length}; - input = ff.create_tensor<2>(dims, DT_INT32); - } - - //----------------------- Define the model ------------------------------ - Tensor t = input; - - Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - t = ff.embedding(t, - transformerConfig.vocab_size, - transformerConfig.token_dim, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - - for (int i = 0; i < transformerConfig.num_layers; i++) { - t = create_inc_multihead_attention_decoder(&ff, &transformerConfig, t); - } - t = ff.dense(t, transformerConfig.out_dim, AC_MODE_RELU); - t = ff.softmax(t); - // select most likely next token - Tensor output = ff.arg_top_k(t, /*k=*/1, false); - - //------------------- Initialize the 
inference manager ------------------ - InferenceManager im(ff.config, transformerConfig.batch_size); - im.compile_model_and_allocate_buffer(&ff); - im.init_operators_inference(&ff); - - //------------ Initialize the data loader and data generator ------------ - /* size_t min_input_tokens = 32, max_input_tokens = 512, - min_tokens_to_generate = 1, max_tokens_to_generate = 128; */ - size_t min_input_tokens = 5, max_input_tokens = 10, - min_tokens_to_generate = 1, - max_tokens_to_generate = MAX_SEQ_LEN - max_input_tokens; - DataGenerator data_generator(transformerConfig.total_requests, - transformerConfig.vocab_size, - min_input_tokens, - max_input_tokens, - min_tokens_to_generate, - max_tokens_to_generate, - transformerConfig.poisson_distribution, - transformerConfig.arrival_rate); - ParallelTensor input_pt; - ff.get_parallel_tensor_from_tensor(input, input_pt); - assert(im.tensor_buffer.find(input_pt) != im.tensor_buffer.end()); - assert(im.tensor_buffer[input_pt].size() == ffConfig.data_parallelism_degree); - DataLoader data_loader( - ff, transformerConfig, data_generator, im.tensor_buffer[input_pt]); - - //----------------------- Start timer ----------------------------------- - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_start = Realm::Clock::current_time_in_microseconds(); - - //----------------------- Begin inference! ------------------------------- - int index = 0; - int processed_requests = 0; - int num_devices = ffConfig.workersPerNode * ffConfig.numNodes; - data_generator.start_timer(); - std::map future_handlers; - std::map batch_configs; - std::pair new_prompts; - BatchConfig *bc = nullptr; - std::map batch_predictions[ffConfig.data_parallelism_degree]; - - assert(im.max_num_tokens_per_batch == transformerConfig.batch_size); - // assert(transformerConfig.batch_size <= BatchConfig::MAX_NUM_REQUESTS); - - // simulation loop. For deployment, we will use a while(true) - while (processed_requests < transformerConfig.total_requests) { - for (int bid = 0; bid < ffConfig.data_parallelism_degree; bid++) { - size_t max_reqs, max_tkns; - if (future_handlers.find(bid) == future_handlers.end()) { - max_reqs = transformerConfig.incremental_mode - ? bc->MAX_NUM_REQUESTS - : im.max_num_tokens_per_batch; - max_tkns = - transformerConfig.sequence_length * transformerConfig.batch_size; - new_prompts = data_generator.get_requests(max_reqs, max_tkns); - bc = new BatchConfig(); - } else { - Future future = future_handlers[bid]; - if (!future.is_ready(true /*subscribe*/)) { - continue; - } - InferenceResult ir = future.get_result(); - bc = batch_configs[bid]; - data_loader.store_outputs(bc, ir, batch_predictions[bid]); - processed_requests += bc->update_results(&ir); - max_reqs = transformerConfig.incremental_mode - ? bc->MAX_NUM_REQUESTS - bc->num_active_requests() - : im.max_num_tokens_per_batch; - max_tkns = - transformerConfig.sequence_length * transformerConfig.batch_size - - (transformerConfig.incremental_mode ? 
bc->num_active_tokens() : 0); - new_prompts = data_generator.get_requests(max_reqs, max_tkns); - } - assert(new_prompts.second <= max_reqs); - if (bc->num_active_tokens() == 0 && new_prompts.second == 0) { - continue; - } - for (size_t i = 0; i < new_prompts.second; i++) { - size_t guid = new_prompts.first + i; - std::pair seq_lens = - data_generator.get_request_length(guid); - assert(seq_lens.first >= min_input_tokens && - seq_lens.first <= max_input_tokens && - seq_lens.second >= min_tokens_to_generate && - seq_lens.second <= max_tokens_to_generate); - assert(bc->register_new_request(guid, seq_lens.first, seq_lens.second)); - } - bc->prepare_next_batch(); - MachineView *view = im.get_machine_view(bid % im.num_devices); - - // runtime->begin_trace(ctx, 111 + bid % num_devices /*trace_id*/); - data_loader.next_batch(ff, bid, bc, batch_predictions[bid], view); - FutureMap fm = im.inference(&ff, bid, *bc); - // runtime->end_trace(ctx, 111 + bid % num_devices /*trace_id*/); - - assert(fm.get_future_map_domain().get_volume() == 1); - future_handlers[bid] = fm.get_future(0); - batch_configs[bid] = bc; - } - } - //----------------------- End of inference! ------------------------------ - - //----------------------- Stop timer ------------------------------------- - { - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - } - double ts_end = Realm::Clock::current_time_in_microseconds(); - double run_time = 1e-6 * (ts_end - ts_start); - printf("ELAPSED TIME = %.4fs, THROUGHPUT = %.2f requests/s\n", - run_time, - transformerConfig.total_requests / run_time); -} diff --git a/examples/cpp/inference/transformers/transformers.h b/examples/cpp/inference/transformers/transformers.h deleted file mode 100644 index fe474e7949..0000000000 --- a/examples/cpp/inference/transformers/transformers.h +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "dataloader.h" -#include "inference_config.h" - -struct TransformerConfig : InferenceConfig { - TransformerConfig(void) : InferenceConfig() { - hidden_size = DATA_DIM; - } -}; diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index bae847106a..ce331d3e41 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -41,12 +41,6 @@ class BatchConfig { using RequestGuid = size_t; using TokenId = int; BatchConfig(); - bool register_new_request(size_t guid, - int initial_len, - int tokens_to_generate); - void prepare_next_batch(); - int update_results(InferenceResult const *ir); - void update_num_active_requests_tokens(); int num_active_requests() const; int num_active_tokens() const; void print() const; @@ -74,7 +68,6 @@ class BatchConfig { PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; - // size_t max_sequence_length[MAX_NUM_REQUESTS]; bool request_completed[MAX_NUM_REQUESTS]; }; @@ -84,17 +77,12 @@ class TreeVerifyBatchConfig : public BatchConfig { ~TreeVerifyBatchConfig(); InferenceMode get_mode() const; void print() const; - // struct PerTokenInfo : BatchConfig::PerTokenInfo { - // int tree_branch_idx; - // }; struct CommittedTokensInfo { int token_index; // the index of the token in the previous batch int request_index; // request index in the batch int token_depth; // position of the token in the request's sequence }; - // void compute_tree_branch_indexes(); - int num_tokens_to_commit; CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS]; }; @@ -145,7 +133,6 @@ class BeamSearchBatchConfig : public BatchConfig { BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? 
int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; - // BeamSlot beam_slots[MAX_NUM_REQUESTS]; private: size_t current_iteration; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 446f884eac..1d03aa72ec 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -34,7 +34,6 @@ class InferenceManager { static InferenceManager *get_inference_manager(); void compile_model_and_allocate_buffer(FFModel *model); void init_operators_inference(FFModel *model); - MachineView *get_machine_view(int mv_id); Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); @@ -55,7 +54,6 @@ class InferenceManager { std::unordered_map> tensor_buffer; int max_num_tokens_per_batch; int num_devices; - std::vector machine_views; }; struct Request { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 52b1660e53..d658b6590f 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -58,109 +58,18 @@ InferenceMode BatchConfig::get_mode() const { return INC_DECODING_MODE; } -// Deprecated API; should use RequestManager::update_batch -int BatchConfig::update_results(InferenceResult const *ir) { - assert(false); - int completed = 0; - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - if (request_completed[i]) { - continue; - } - assert(requestsInfo[i].num_tokens_in_batch > 0); - int processed_tokens = requestsInfo[i].token_start_offset + - requestsInfo[i].num_tokens_in_batch; - if (processed_tokens >= requestsInfo[i].max_sequence_length - // || ir.results[t] == 0 TODO: replace this with - ) { - log_bc.print("[Done] guid(%zu) final_length(%d)", - requestsInfo[i].request_guid, - processed_tokens); - request_completed[i] = true; - requestsInfo[i].num_tokens_in_batch = 0; - requestsInfo[i].token_start_offset = 0; - completed++; - } else { - requestsInfo[i].token_start_offset += requestsInfo[i].num_tokens_in_batch; - requestsInfo[i].num_tokens_in_batch = 1; - } - } - return completed; -} - -// Deprecated API; RequestManager::new_batch and RequestManager::update_batch -// automatically register new requests. 
-bool BatchConfig::register_new_request(size_t guid, - int initial_len, - int tokens_to_generate) { - assert(false); - assert(initial_len > 0 && tokens_to_generate > 0); - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - if (request_completed[i]) { - log_bc.print("[NewRequest] guid(%zu) length(%d)", guid, initial_len); - requestsInfo[i].token_start_offset = 0; - requestsInfo[i].num_tokens_in_batch = initial_len; - requestsInfo[i].request_guid = guid; - requestsInfo[i].max_sequence_length = initial_len + tokens_to_generate; - request_completed[i] = false; - update_num_active_requests_tokens(); - return true; - } - } - update_num_active_requests_tokens(); - return false; -} - -// Deprecated API -void BatchConfig::prepare_next_batch() { - assert(false); - assert(num_tokens > 0); - log_bc.print("[NextBatch] num_tokens(%d)", num_tokens); -} - -// Deprecated API; cannot use this since we need to -// add token_id, which is missing in this API -void BatchConfig::update_num_active_requests_tokens() { - assert(false); - num_tokens = 0; - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - if (!request_completed[i]) { - int start_idx = requestsInfo[i].token_start_offset; - for (int j = 0; j < requestsInfo[i].num_tokens_in_batch; j++) { - tokensInfo[num_tokens].abs_depth_in_request = start_idx + j; - tokensInfo[num_tokens].request_index = i; - num_tokens++; - } - } - } -} - int BatchConfig::num_active_requests() const { int num_requests = 0; for (int i = 0; i < MAX_NUM_REQUESTS; i++) { if (!request_completed[i]) { num_requests++; - // } else { - // std::cout << "request " << i << " is completed" << std::endl; } } return num_requests; - // if (cached_results) { - // return num_requests; - // } else { - // assert(false && - // "some BatchConfig functions updated requests but didn't call " - // "() before exit"); - // } } int BatchConfig::num_active_tokens() const { - // if (cached_results) { return num_tokens; - //} else { - // assert(false && - // "some BatchConfig functions updated requests but didn't call " - // "update_num_active_requests_tokens() before exit"); - //} } void BatchConfig::print() const { diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index cfcc938204..39dee64ff1 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,33 +54,6 @@ InferenceManager::InferenceManager(FFConfig const &_config, num_devices && "Product of data, tensor, and pipeline parallelism degrees does not " "match the number of available devices"); - // Deprecated logic below - // populate array of valid single-device machine views - for (int i = 0; i < num_devices; i++) { - MachineView view; - view.device_type = MachineView::GPU; - view.ndims = 1; - view.dim[0] = 1; - view.stride[0] = 0; - view.start_device_id = i; - // std::cout << "Registering machine view: " << view << std::endl; - machine_views.push_back(view); - } - // multiple-device machine views - if (ff_config.tensor_parallelism_degree > 1) { - for (int i = 0; i < num_devices; i++) { - if (i + ff_config.tensor_parallelism_degree <= num_devices) { - MachineView view; - view.device_type = MachineView::GPU; - view.ndims = 1; - view.dim[0] = ff_config.tensor_parallelism_degree; - view.stride[0] = 1; - view.start_device_id = i; - // std::cout << "Registering machine view: " << view << std::endl; - machine_views.push_back(view); - } - } - } } InferenceManager *inference_manager_singleton = nullptr; @@ -291,13 +264,6 @@ void InferenceManager::init_operators_inference(FFModel *model) { } } -// Deprecated 
API -MachineView *InferenceManager::get_machine_view(int mv_id) { - assert(false); - assert(mv_id >= 0 && mv_id < machine_views.size()); - return &machine_views[mv_id]; -} - FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfig const &bc) { From 654095e9e3e00fc78509c67bff9ef37e6a284e43 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Mon, 7 Aug 2023 18:07:28 -0400 Subject: [PATCH 192/344] fix (#916) --- inference/models/configs/opt_30B.json | 15 +++++++++++++ src/ops/spec_inc_multihead_self_attention.cu | 22 ++++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 inference/models/configs/opt_30B.json diff --git a/inference/models/configs/opt_30B.json b/inference/models/configs/opt_30B.json new file mode 100644 index 0000000000..0618b81b77 --- /dev/null +++ b/inference/models/configs/opt_30B.json @@ -0,0 +1,15 @@ +{ + "vocab_size": 50272, + "word_embed_proj_dim": 7168, + "hidden_size": 7168, + "num_attention_heads": 56, + "max_position_embeddings": 2048, + "layer_norm_elementwise_affine": true, + "num_hidden_layers": 48, + "dropout": 0.1, + "ffn_dim": 28672, + "max_beam_width": 1, + "batchSize": 8, + "sentence_len": 100, + "max_beam_depth": 4 +} \ No newline at end of file diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 52a619a5d8..d1faba9c68 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -517,17 +517,17 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_previous_requests += num_new_tokens; tokens_prev_requests_squares += num_new_tokens * total_tokens; } - if (*m->bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_kv_heads + - m->vProjSize * m->global_num_kv_heads; - apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); - } + } + if (*m->bias && shard_id == 0) { + int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_kv_heads + + m->vProjSize * m->global_num_kv_heads; + apply_proj_bias_w<<>>( + output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); } assert(tokens_previous_requests == num_tokens); From 0bc2b01021c2d2458bc3e53912d197fa034bfd57 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 8 Aug 2023 16:36:25 -0400 Subject: [PATCH 193/344] [Inference] - Cleanup, C++/Python API update (#915) * update python api slightly * update * fix * update * add function to save huggingface configs to file * cleanup * backup * hf configs parsing update * replace legacy configs * update tests, delete old downloader scripts * fix path * fix * fix * fix * fixes * falcon fix * remove legacy file * fixes * linting * fix * fix * fix * fix * fix * added falcon to c++ tests * fix * update * fix * clear cache * fix * add support for gelu in fusion * replace c++ tests with python tests * removed config * shellcheck * enable fusion by default in c++ tests * enable all tests temporarily * fix * fix * fix * fix * fix * fix * ready to merge --------- Co-authored-by: Zhihao Jia --- .github/workflows/gpu-ci.yml | 15 +- .github/workflows/pip-deploy.yml | 2 - conda/flexflow-cpu.yml | 4 + docker/flexflow-environment/Dockerfile | 4 +- include/flexflow/flexflow_c.h | 13 +- include/flexflow/inference.h | 7 + 
include/flexflow/request_manager.h | 7 - inference/.gitignore | 3 +- inference/file_loader.cc | 23 +- inference/file_loader.h | 4 +- inference/incr_decoding/incr_decoding.cc | 111 +++--- inference/models/configs/falcon_7B.json | 12 - inference/models/configs/llama2_70B.json | 11 - inference/models/configs/llama2_7B.json | 11 - inference/models/configs/llama_160M.json | 11 - inference/models/configs/llama_68M.json | 11 - inference/models/configs/llama_7B.json | 11 - inference/models/configs/opt_125M.json | 15 - inference/models/configs/opt_13B.json | 15 - inference/models/configs/opt_30B.json | 15 - inference/models/configs/opt_6B.json | 15 - inference/models/falcon.cc | 88 +++-- inference/models/falcon.h | 107 +++--- inference/models/llama.cc | 69 ++-- inference/models/llama.h | 98 ++---- inference/models/opt.cc | 15 +- inference/models/opt.h | 109 +++--- inference/python/incr_decoding.py | 18 +- inference/python/spec_infer.py | 25 +- inference/spec_infer/spec_infer.cc | 315 +++++++++++------- inference/utils/convert_llama_config.py | 32 -- inference/utils/download_falcon_weights.py | 45 --- inference/utils/download_hf_model.py | 63 ++++ inference/utils/download_llama_weights.py | 78 ----- inference/utils/download_opt_weights.py | 78 ----- python/flexflow/core/__init__.py | 148 ++++---- python/flexflow/core/flexflow_cffi.py | 245 +++++++++++++- python/flexflow/serve/__init__.py | 18 +- python/flexflow/serve/models/falcon.py | 97 +++++- python/flexflow/serve/models/llama.py | 25 +- python/flexflow/serve/models/opt.py | 25 +- python/flexflow/serve/serve.py | 231 ++++++++----- src/c/flexflow_c.cc | 16 +- src/ops/fused.cu | 3 + src/runtime/inference_manager.cc | 104 ++---- src/runtime/request_manager.cc | 4 +- tests/.gitignore | 1 + tests/inference/cpp_inference_tests.sh | 96 ++---- tests/inference/python_inference_tests.sh | 191 +++++++++++ .../python_test_configs/generate_configs.py | 123 +++++++ tests/inference_tests.sh | 38 ++- 51 files changed, 1626 insertions(+), 1199 deletions(-) delete mode 100644 inference/models/configs/falcon_7B.json delete mode 100644 inference/models/configs/llama2_70B.json delete mode 100644 inference/models/configs/llama2_7B.json delete mode 100644 inference/models/configs/llama_160M.json delete mode 100644 inference/models/configs/llama_68M.json delete mode 100644 inference/models/configs/llama_7B.json delete mode 100644 inference/models/configs/opt_125M.json delete mode 100644 inference/models/configs/opt_13B.json delete mode 100644 inference/models/configs/opt_30B.json delete mode 100644 inference/models/configs/opt_6B.json delete mode 100644 inference/utils/convert_llama_config.py delete mode 100644 inference/utils/download_falcon_weights.py create mode 100644 inference/utils/download_hf_model.py delete mode 100644 inference/utils/download_llama_weights.py delete mode 100644 inference/utils/download_opt_weights.py create mode 100755 tests/inference/python_inference_tests.sh create mode 100644 tests/inference/python_test_configs/generate_configs.py diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 8064710d04..bdd2e4dbf5 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -183,17 +183,26 @@ jobs: ./tests/gpt_tokenizer_test.sh # Inference tests - export TENSOR_PARALLELISM_TESTS=ON + source ./build/set_python_envs.sh ./tests/inference_tests.sh + + - name: Save inference output as an artifact + if: always() + run: | cd inference tar -zcvf output.tar.gz ./output - cd .. 
- - name: Save inference output as an artifact + - name: Upload artifact uses: actions/upload-artifact@v3 + if: always() with: name: output path: inference/output.tar.gz + + # Github persists the .cache folder across different runs/containers + - name: Clear cache + if: always() + run: sudo rm -rf ~/.cache gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests diff --git a/.github/workflows/pip-deploy.yml b/.github/workflows/pip-deploy.yml index c3840a6671..c542b86a5e 100644 --- a/.github/workflows/pip-deploy.yml +++ b/.github/workflows/pip-deploy.yml @@ -3,8 +3,6 @@ on: push: tags: - '*' - branches: - - inference # only trigger on push to inference branch for now workflow_dispatch: concurrency: diff --git a/conda/flexflow-cpu.yml b/conda/flexflow-cpu.yml index b8d1c5dcdf..6435b19343 100644 --- a/conda/flexflow-cpu.yml +++ b/conda/flexflow-cpu.yml @@ -19,3 +19,7 @@ dependencies: - torchaudio --index-url https://download.pytorch.org/whl/cpu - torchvision --index-url https://download.pytorch.org/whl/cpu - regex + - onnx + - transformers + - sentencepiece + - einops diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index a53102fa73..2f970f272d 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -5,7 +5,7 @@ LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow LABEL org.opencontainers.image.description="FlexFlow environment container" # Install basic dependencies -RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano libhdf5-dev && \ +RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev && \ rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends build-essential apt-utils \ @@ -57,7 +57,7 @@ ENV CUDA_DIR /usr/local/cuda RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install CPU-only Pytorch and related dependencies RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch -RUN conda install -c conda-forge onnx transformers sentencepiece +RUN conda install -c conda-forge onnx transformers sentencepiece einops RUN pip3 install tensorflow # Install Rust diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 7f1374415c..1aa192f4e3 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -396,7 +396,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_initializer_t kernel_initializer, char const *name); -flexflow_tensor_t flexflow_model_add_inc_multihead_attention( +flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -415,7 +415,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( bool qk_prod_scaling, char const *name); -flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( +flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -453,7 +453,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( bool 
qk_prod_scaling, char const *name); -flexflow_tensor_t flexflow_model_add_inc_multiquery_attention( +flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -473,7 +473,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_attention( bool qk_prod_scaling, char const *name); -flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_attention( +flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -944,10 +944,11 @@ void flexflow_inference_manager_init_operators_inference( flexflow_file_data_loader_t flexflow_file_data_loader_create(char const *weight_file_path, - int num_heads, + int num_q_heads, + int num_kv_heads, int hidden_dim, int qkv_inner_dim, - int tensor_partition_num); + int tensor_parallelism_degree); void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index c30b0c0be3..4e7d9ffcbc 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -15,6 +15,8 @@ #pragma once #include "flexflow/batch_config.h" +#include +#include namespace FlexFlow { @@ -40,4 +42,9 @@ struct GenerationResult { std::vector output_tokens; }; +#include +#include + +std::string join_path(std::vector const &paths); + } // namespace FlexFlow diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 1d03aa72ec..6aa69786ca 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -41,13 +41,6 @@ class InferenceManager { ParallelTensor const input); void load_positions(BatchConfigFuture const &bc, ParallelTensor position_input); - void incr_decoding_loop(FFModel *model, - RequestManager &rm, - int total_num_requests); - void spec_inference_loop(FFModel *model, - RequestManager &rm, - int total_num_requests, - std::vector ssm_model_ids); public: FFConfig ff_config; diff --git a/inference/.gitignore b/inference/.gitignore index 05ccb57cd3..8ab99cb1eb 100644 --- a/inference/.gitignore +++ b/inference/.gitignore @@ -1,4 +1,5 @@ +configs weights -tokenizer +tokenizers prompt output diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 94e604ac2d..c15a3c0f2b 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -28,11 +28,11 @@ FileDataLoader::FileDataLoader(std::string _input_path, int _num_kv_heads, size_t _hidden_dim, size_t _qkv_inner_dim, - int _tensor_partition_num) + int _tensor_parallelism_degree) : input_path(_input_path), weight_file_path(_weight_file_path), num_heads(_num_heads), num_kv_heads(_num_kv_heads), hidden_dim(_hidden_dim), qkv_inner_dim(_qkv_inner_dim), - tensor_partition_num(_tensor_partition_num){}; + tensor_parallelism_degree(_tensor_parallelism_degree){}; BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { @@ -185,7 +185,7 @@ void load_attention_weights_v2(DT *ptr, std::string layer_name, std::string weight_path, size_t volume, - int tensor_partition_num) { + int tensor_parallelism_degree) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight std::string q_file = weight_path + @@ -216,13 +216,13 @@ void load_attention_weights_v2(DT *ptr, // stride for q, k, v, o size_t stride_size = - (q_size + v_size + k_size + o_size) / tensor_partition_num; + (q_size + v_size + k_size + o_size) / tensor_parallelism_degree; for (auto file : weight_files) { int 
data_index = 0; size_t partial_size = (file_index == 0 || file_index == 3) ? one_weight_file_size : single_proj_size * num_kv_heads; - size_t one_partition_size = partial_size / tensor_partition_num; + size_t one_partition_size = partial_size / tensor_parallelism_degree; std::ifstream in(file, std::ios::in | std::ios::binary); if (!in.good()) { @@ -242,7 +242,7 @@ void load_attention_weights_v2(DT *ptr, assert(false && "data size mismatch"); } // wq, wk, wo - for (int i = 0; i < tensor_partition_num; i++) { + for (int i = 0; i < tensor_parallelism_degree; i++) { for (int j = 0; j < one_partition_size; j++) { ptr[base_index + i * stride_size + j] = host_array.at(data_index++); } @@ -251,7 +251,7 @@ void load_attention_weights_v2(DT *ptr, base_index += one_partition_size; file_index++; } - assert(base_index == (q_size + k_size + v_size) / tensor_partition_num); + assert(base_index == (q_size + k_size + v_size) / tensor_parallelism_degree); { std::ifstream in(o_file, std::ios::in | std::ios::binary); @@ -273,11 +273,12 @@ void load_attention_weights_v2(DT *ptr, assert(one_weight_file_size == host_array.size()); int data_index = 0; - int one_partition_size = qkv_inner_dim * (num_heads / tensor_partition_num); + int one_partition_size = + qkv_inner_dim * (num_heads / tensor_parallelism_degree); for (int i = 0; i < one_weight_file_size; i++) { - int part_idx = (i / one_partition_size) % tensor_partition_num; + int part_idx = (i / one_partition_size) % tensor_parallelism_degree; int block_num = (i / one_partition_size); - int offset = block_num / tensor_partition_num * one_partition_size + + int offset = block_num / tensor_parallelism_degree * one_partition_size + (i % one_partition_size); ptr[base_index + part_idx * stride_size + offset] = host_array.at(data_index++); @@ -687,7 +688,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, file_path, weight_file_path, volume, - tensor_partition_num); + tensor_parallelism_degree); } else { load_attention_bias_v2(data, num_heads, diff --git a/inference/file_loader.h b/inference/file_loader.h index 0c9dfa56cd..aaef861d09 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -30,7 +30,7 @@ class FileDataLoader { int _num_kv_heads, size_t _hidden_dim, size_t _qkv_inner_dim, - int _tensor_partition_num); + int _tensor_parallelism_degree); BatchConfig::TokenId *generate_requests(int num, int length); @@ -56,7 +56,7 @@ class FileDataLoader { int offset); private: - int num_heads, num_kv_heads, tensor_partition_num; + int num_heads, num_kv_heads, tensor_parallelism_degree; size_t hidden_dim, qkv_inner_dim; std::string input_path; std::string weight_file_path; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 84217a22f1..cd84d6cb5c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -18,25 +18,25 @@ #include "models/falcon.h" #include "models/llama.h" #include "models/opt.h" +#include #include using namespace Legion; +using json = nlohmann::json; LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { - std::string llm_weight_file_path; - std::string llm_config_file_path; + std::string cache_folder_path; std::string prompt_file_path; - std::string tokenizer_file_path; std::string output_file_path; }; void parse_input_args(char **argv, int argc, FilePaths &paths, - ModelType &llm_model_type, + std::string &llm_model_name, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -45,32 +45,15 @@ void 
parse_input_args(char **argv, for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { - std::string model_type_str = std::string(argv[++i]); - std::transform(model_type_str.begin(), - model_type_str.end(), - model_type_str.begin(), - [](unsigned char c) { return std::tolower(c); }); - if (model_type_str == "llama") { - llm_model_type = ModelType::LLAMA; - } else if (model_type_str == "llama2") { - llm_model_type = ModelType::LLAMA2; - } else if (model_type_str == "opt") { - llm_model_type = ModelType::OPT; - } else if (model_type_str == "falcon") { - llm_model_type = ModelType::FALCON; - } else { - llm_model_type = ModelType::UNKNOWN; + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); } continue; } - // llm model weights - if (!strcmp(argv[i], "-llm-weight")) { - paths.llm_weight_file_path = std::string(argv[++i]); - continue; - } - // llm model configs - if (!strcmp(argv[i], "-llm-config")) { - paths.llm_config_file_path = std::string(argv[++i]); + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); continue; } // prompts @@ -78,11 +61,6 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } - // tokenizer - if (!strcmp(argv[i], "-tokenizer")) { - paths.tokenizer_file_path = std::string(argv[++i]); - continue; - } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -110,6 +88,14 @@ void parse_input_args(char **argv, continue; } } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); } void FlexFlow::top_level_task(Task const *task, @@ -121,13 +107,12 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "Doesn't support quantization in non-offload mode"); } FilePaths file_paths; - ModelType model_type; + std::string llm_model_name; bool use_full_precision = false; bool verbose = false; bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -135,7 +120,7 @@ void FlexFlow::top_level_task(Task const *task, parse_input_args(argv, argc, file_paths, - model_type, + llm_model_name, use_full_precision, verbose, do_sample, @@ -146,32 +131,74 @@ void FlexFlow::top_level_task(Task const *task, ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + std::string nameOrPath = model_config["_name_or_path"]; + // TODO: support LLAMA-2 models not from Meta + bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; + if (llama2) { + model_type = ModelType::LLAMA2; + } else { + model_type = ModelType::LLAMA; + } + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + model_type = ModelType::FALCON; + break; + } + } + assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); SamplingConfig samplingConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); - rm->register_tokenizer(model_type, file_paths.tokenizer_file_path); + rm->register_tokenizer(model_type, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { LLAMA::create_llama_model(model, - file_paths.llm_config_file_path, - file_paths.llm_weight_file_path, + config_filepath, + weights_filepath, INC_DECODING_MODE, samplingConfig, use_full_precision); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, - file_paths.llm_config_file_path, - file_paths.llm_weight_file_path, + config_filepath, + weights_filepath, INC_DECODING_MODE, use_full_precision); } else if (model_type == ModelType::FALCON) { FALCON::create_falcon_model(model, - file_paths.llm_config_file_path, - file_paths.llm_weight_file_path, + config_filepath, + weights_filepath, INC_DECODING_MODE, use_full_precision); } else { diff --git a/inference/models/configs/falcon_7B.json b/inference/models/configs/falcon_7B.json deleted file mode 100644 index d89564557a..0000000000 --- a/inference/models/configs/falcon_7B.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "n_layers": 32, - "vocab_size": 65024, - "n_heads": 71, - "n_kv_heads" : 1, - "dim": 4544, - "multiple_of": 256, - "norm_eps": 1e-05, - "total_requests": 2560, - "hidden_dim": 11008, - "incremental_mode": true -} \ No newline at end of file diff --git a/inference/models/configs/llama2_70B.json b/inference/models/configs/llama2_70B.json deleted file mode 100644 index 017e71888d..0000000000 --- a/inference/models/configs/llama2_70B.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "n_layers": 80, - "vocab_size": 32000, - "n_heads": 64, - "dim": 8192, - "multiple_of": 256, - "norm_eps": 1e-5, - "total_requests": 2560, - "hidden_dim": 28672, - "incremental_mode": true -} diff --git a/inference/models/configs/llama2_7B.json b/inference/models/configs/llama2_7B.json deleted file mode 100644 index 46dd138e4f..0000000000 --- a/inference/models/configs/llama2_7B.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "n_layers": 32, - "vocab_size": 32000, - "n_heads": 32, - "dim": 4096, - "multiple_of": 256, - "norm_eps": 1e-5, - "total_requests": 2560, - "hidden_dim": 11008, - "incremental_mode": true -} diff --git a/inference/models/configs/llama_160M.json b/inference/models/configs/llama_160M.json deleted file mode 100644 index d912c64ab7..0000000000 --- a/inference/models/configs/llama_160M.json +++ 
/dev/null @@ -1,11 +0,0 @@ -{ - "n_layers": 12, - "vocab_size": 32000, - "n_heads": 12, - "dim": 768, - "multiple_of": 256, - "norm_eps": 1e-6, - "total_requests": 2560, - "hidden_dim": 3072, - "incremental_mode": true -} diff --git a/inference/models/configs/llama_68M.json b/inference/models/configs/llama_68M.json deleted file mode 100644 index 11e21531c4..0000000000 --- a/inference/models/configs/llama_68M.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "n_layers": 2, - "vocab_size": 32000, - "n_heads": 12, - "dim": 768, - "multiple_of": 256, - "norm_eps": 1e-06, - "total_requests": 2560, - "hidden_dim": 3072, - "incremental_mode": true -} \ No newline at end of file diff --git a/inference/models/configs/llama_7B.json b/inference/models/configs/llama_7B.json deleted file mode 100644 index 0c32ed320d..0000000000 --- a/inference/models/configs/llama_7B.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "n_layers": 32, - "vocab_size": 32000, - "n_heads": 32, - "dim": 4096, - "multiple_of": 256, - "norm_eps": 1e-6, - "total_requests": 2560, - "hidden_dim": 11008, - "incremental_mode": true -} diff --git a/inference/models/configs/opt_125M.json b/inference/models/configs/opt_125M.json deleted file mode 100644 index 0b9feed922..0000000000 --- a/inference/models/configs/opt_125M.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "vocab_size": 50272, - "word_embed_proj_dim": 768, - "hidden_size": 768, - "num_attention_heads": 12, - "max_position_embeddings": 2048, - "layer_norm_elementwise_affine": true, - "num_hidden_layers": 12, - "dropout": 0.1, - "ffn_dim": 3072, - "max_beam_width": 1, - "batchSize": 8, - "sentence_len": 100, - "max_beam_depth": 4 -} diff --git a/inference/models/configs/opt_13B.json b/inference/models/configs/opt_13B.json deleted file mode 100644 index 96cad5c99b..0000000000 --- a/inference/models/configs/opt_13B.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "vocab_size": 50272, - "word_embed_proj_dim": 5120, - "hidden_size": 5120, - "num_attention_heads": 40, - "max_position_embeddings": 2048, - "layer_norm_elementwise_affine": true, - "num_hidden_layers": 40, - "dropout": 0.1, - "ffn_dim": 20480, - "max_beam_width": 1, - "batchSize": 8, - "sentence_len": 100, - "max_beam_depth": 4 -} diff --git a/inference/models/configs/opt_30B.json b/inference/models/configs/opt_30B.json deleted file mode 100644 index 0618b81b77..0000000000 --- a/inference/models/configs/opt_30B.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "vocab_size": 50272, - "word_embed_proj_dim": 7168, - "hidden_size": 7168, - "num_attention_heads": 56, - "max_position_embeddings": 2048, - "layer_norm_elementwise_affine": true, - "num_hidden_layers": 48, - "dropout": 0.1, - "ffn_dim": 28672, - "max_beam_width": 1, - "batchSize": 8, - "sentence_len": 100, - "max_beam_depth": 4 -} \ No newline at end of file diff --git a/inference/models/configs/opt_6B.json b/inference/models/configs/opt_6B.json deleted file mode 100644 index cc86ce0f8f..0000000000 --- a/inference/models/configs/opt_6B.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "vocab_size": 50272, - "word_embed_proj_dim": 4096, - "hidden_size": 4096, - "num_attention_heads": 32, - "max_position_embeddings": 2048, - "layer_norm_elementwise_affine": true, - "num_hidden_layers": 32, - "dropout": 0.1, - "ffn_dim": 16384, - "max_beam_width": 1, - "batchSize": 8, - "sentence_len": 100, - "max_beam_depth": 4 -} diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 00f7864e7f..d57504b8cf 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -18,29 +18,24 @@ namespace 
FlexFlow { using namespace Legion; +using json = nlohmann::json; void FALCON::create_falcon_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, bool use_full_precision) { - Config falcon_config(model_config_file_path); - falcon_config.printConfig(); - - if (ff.config.tensor_parallelism_degree > falcon_config.n_heads || - ff.config.tensor_parallelism_degree > falcon_config.n_kv_heads) { - assert(false && "The degree of tensor parallelism should be greater than " - "or equal to the number of heads"); + FalconConfig falcon_config(model_config_file_path); + falcon_config.print(); + + if (ff.config.tensor_parallelism_degree > falcon_config.n_head || + falcon_config.n_head % ff.config.tensor_parallelism_degree != 0 || + ff.config.tensor_parallelism_degree > falcon_config.n_head_kv || + falcon_config.n_head_kv % ff.config.tensor_parallelism_degree != 0) { + assert(false && "The number of attention heads is smaller, or it is not " + "divisible by the tensor parallelism degree"); } - int num_devices = ff.config.workersPerNode * ff.config.numNodes; - int num_transformer_layers = falcon_config.n_layers; - assert(num_transformer_layers % ff.config.pipeline_parallelism_degree == 0); - int num_layers_per_pp_block = - num_transformer_layers / ff.config.pipeline_parallelism_degree; - int num_devices_per_data_parallelism_line = - num_devices / ff.config.data_parallelism_degree; - std::unordered_map weights_layers; Tensor input; @@ -58,7 +53,7 @@ void FALCON::create_falcon_model(FFModel &ff, if (use_full_precision) { token = ff.embedding(input, falcon_config.vocab_size, - falcon_config.dim, + falcon_config.hidden_size, AGGR_MODE_NONE, DT_FLOAT, NULL, @@ -66,7 +61,7 @@ void FALCON::create_falcon_model(FFModel &ff, } else { token = ff.embedding(input, falcon_config.vocab_size, - falcon_config.dim, + falcon_config.hidden_size, AGGR_MODE_NONE, DT_HALF, NULL, @@ -74,13 +69,14 @@ void FALCON::create_falcon_model(FFModel &ff, } Layer *embedding = ff.layers.back(); - weights_layers.emplace("tok_embeddings_weight", embedding); + weights_layers.emplace("word_embeddings_weight", embedding); - for (int i = 0; i < falcon_config.n_layers; i++) { + for (int i = 0; i < falcon_config.n_layer; i++) { // set transformer layer id ff.set_transformer_layer_id(i); // step 1: attention - Tensor att_norm = ff.layer_norm(token, axes, true, falcon_config.norm_eps); + Tensor att_norm = + ff.layer_norm(token, axes, true, falcon_config.layer_norm_epsilon); Layer *attention_norm = ff.layers.back(); weights_layers.emplace("layers_" + std::to_string(i) + @@ -91,11 +87,11 @@ void FALCON::create_falcon_model(FFModel &ff, case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multiquery_self_attention( att_norm, - falcon_config.dim, - falcon_config.n_heads, - falcon_config.n_kv_heads, - falcon_config.dim / falcon_config.n_heads, - falcon_config.dim / falcon_config.n_heads, + falcon_config.hidden_size, + falcon_config.n_head, + falcon_config.n_head_kv, + falcon_config.hidden_size / falcon_config.n_head, + falcon_config.hidden_size / falcon_config.n_head, 0.0f, false, false, @@ -109,11 +105,11 @@ void FALCON::create_falcon_model(FFModel &ff, case TREE_VERIFY_MODE: { mha = ff.inc_multiquery_self_attention_verify( att_norm, - falcon_config.dim, - falcon_config.n_heads, - falcon_config.n_kv_heads, - falcon_config.dim / falcon_config.n_heads, - falcon_config.dim / falcon_config.n_heads, + falcon_config.hidden_size, + falcon_config.n_head, + falcon_config.n_head_kv, + falcon_config.hidden_size / 
falcon_config.n_head, + falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ false, /*bias*/ false, /*add_bias_kv*/ @@ -128,11 +124,11 @@ void FALCON::create_falcon_model(FFModel &ff, case INC_DECODING_MODE: { mha = ff.inc_multiquery_self_attention( att_norm, - falcon_config.dim, - falcon_config.n_heads, - falcon_config.n_kv_heads, - falcon_config.dim / falcon_config.n_heads, - falcon_config.dim / falcon_config.n_heads, + falcon_config.hidden_size, + falcon_config.n_head, + falcon_config.n_head_kv, + falcon_config.hidden_size / falcon_config.n_head, + falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ false, /*bias*/ false, /*add_bias_kv*/ @@ -157,24 +153,25 @@ void FALCON::create_falcon_model(FFModel &ff, weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", attention_layer); Tensor dense_h_to_4h = - ff.dense(att_norm, falcon_config.dim * 4, AC_MODE_NONE, false); + ff.dense(att_norm, falcon_config.hidden_size * 4, AC_MODE_NONE, false); Layer *dense_h_to_4h_layer = ff.layers.back(); weights_layers.emplace("layers_" + std::to_string(i) + - "_mlp_dense_h_to_4layers_weight", + "_mlp_dense_h_to_4h_weight", dense_h_to_4h_layer); dense_h_to_4h = ff.gelu(dense_h_to_4h); Tensor mlp_output = - ff.dense(dense_h_to_4h, falcon_config.dim, AC_MODE_NONE, false); + ff.dense(dense_h_to_4h, falcon_config.hidden_size, AC_MODE_NONE, false); Layer *dense_4h_to_h_layer = ff.layers.back(); weights_layers.emplace("layers_" + std::to_string(i) + - "_mlp_dense_4h_to_layers_weight", + "_mlp_dense_4h_to_h_weight", dense_4h_to_h_layer); token = ff.add(token, mha); token = ff.add(token, mlp_output); } // final normalization and linear - Tensor ln_f = ff.layer_norm(token, axes, true, falcon_config.norm_eps); + Tensor ln_f = + ff.layer_norm(token, axes, true, falcon_config.layer_norm_epsilon); Layer *ln_f_layer = ff.layers.back(); weights_layers.emplace("ln_f_weight", ln_f_layer); @@ -193,16 +190,15 @@ void FALCON::create_falcon_model(FFModel &ff, // Compile the model std::cout << "------start compile ----------" << std::endl; - int tensor_partition_num = ff.config.tensor_parallelism_degree; InferenceManager *im = InferenceManager::get_inference_manager(); im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, - falcon_config.n_heads, - falcon_config.n_kv_heads, - falcon_config.dim, - falcon_config.dim / falcon_config.n_heads, - tensor_partition_num); + falcon_config.n_head, + falcon_config.n_head_kv, + falcon_config.hidden_size, + falcon_config.hidden_size / falcon_config.n_head, + ff.config.tensor_parallelism_degree); std::cout << "------laod weights ----------" << std::endl; fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------load weight finished----------" << std::endl; diff --git a/inference/models/falcon.h b/inference/models/falcon.h index d37ffbc713..a822f9be34 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -26,84 +26,63 @@ namespace FlexFlow { class FALCON { public: - struct Config { - Config(void) { - // todo read from config/param file - n_layers = 32; - vocab_size = 32000; - n_heads = 32; - n_kv_heads = 1; - dim = 4096; - multiple_of = 256; - norm_eps = 1e-6; - total_requests = 2560; - incremental_mode = true; - hidden_dim = 11008; - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; - } - - 
Config(std::string config_filepath) { - std::ifstream config_file(config_filepath); + struct FalconConfig { + FalconConfig(std::string const &model_config_file_path) { + std::ifstream config_file(model_config_file_path); if (config_file.is_open()) { try { - json config_json; - config_file >> config_json; - - n_layers = config_json["n_layers"]; - vocab_size = config_json["vocab_size"]; - n_heads = config_json["n_heads"]; - n_kv_heads = config_json["n_kv_heads"]; - dim = config_json["dim"]; - multiple_of = config_json["multiple_of"]; - norm_eps = config_json["norm_eps"]; - total_requests = config_json["total_requests"]; - incremental_mode = config_json["incremental_mode"]; - hidden_dim = config_json["hidden_dim"]; - head_dim = dim / n_heads; - // Override values below - /* max_seq_len = config_json["max_seq_len"]; - max_num_tokens = config_json["max_num_tokens"]; - max_beam_width = config_json["max_beam_width"]; - max_beam_depth = config_json["max_beam_depth"]; - hidden_dim = config_json["hidden_dim"]; */ - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + json model_config; + config_file >> model_config; + bias = model_config["bias"]; + hidden_size = model_config["hidden_size"]; + layer_norm_epsilon = model_config["layer_norm_epsilon"]; + multi_query = model_config["multi_query"]; + n_head = model_config["n_head"]; + if (model_config.contains("n_head_kv")) { + n_head_kv = model_config["n_head_kv"]; + } else { + n_head_kv = 1; + } + n_layer = model_config["n_layer"]; + parallel_attn = model_config["parallel_attn"]; + vocab_size = model_config["vocab_size"]; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); } } else { - std::cerr << "Error opening JSON file." 
<< std::endl; + std::cerr << "Error opening JSON file " << model_config_file_path + << std::endl; assert(false); } + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } - void printConfig() const { + void print() const { std::cout << "Falcon Config:" << std::endl; - std::cout << "n_layers: " << n_layers << std::endl; - std::cout << "vocab_size: " << vocab_size << std::endl; - std::cout << "n_heads: " << n_heads << std::endl; - std::cout << "dim: " << dim << std::endl; - std::cout << "multiple_of: " << multiple_of << std::endl; - std::cout << "norm_eps: " << norm_eps << std::endl; - std::cout << "total_requests: " << total_requests << std::endl; - std::cout << "incremental_mode: " << incremental_mode << std::endl; - std::cout << "max_seq_len: " << max_seq_len << std::endl; - std::cout << "max_num_tokens: " << max_num_tokens << std::endl; - std::cout << "max_beam_width: " << max_beam_width << std::endl; - std::cout << "max_beam_depth: " << max_beam_depth << std::endl; - std::cout << "hidden_dim: " << hidden_dim << std::endl; + std::cout << "\tbias: " << bias << std::endl; + std::cout << "\thidden_size: " << hidden_size << std::endl; + std::cout << "\tlayer_norm_epsilon: " << layer_norm_epsilon << std::endl; + std::cout << "\tmulti_query: " << multi_query << std::endl; + std::cout << "\tn_head: " << n_head << std::endl; + std::cout << "\tn_head_kv: " << n_head << std::endl; + std::cout << "\tn_layer: " << n_layer << std::endl; + std::cout << "\tparallel_attn: " << parallel_attn << std::endl; + std::cout << "\tvocab_size: " << vocab_size << std::endl; + + std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; + std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; + std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } - int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, - total_requests, incremental_mode, max_seq_len, max_num_tokens, - max_beam_width, max_beam_depth, head_dim, n_kv_heads; - float norm_eps; + bool bias, multi_query, parallel_attn; + int hidden_size, n_head, n_head_kv, n_layer, vocab_size; + float layer_norm_epsilon; + int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; }; static void create_falcon_model(FFModel &ff, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 79b042b97c..5a607e4872 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -18,6 +18,7 @@ namespace FlexFlow { using namespace Legion; +using json = nlohmann::json; void LLAMA::create_llama_model(FFModel &ff, std::string const &model_config_file_path, @@ -26,8 +27,15 @@ void LLAMA::create_llama_model(FFModel &ff, SamplingConfig samplingConfig, bool use_full_precision) { // do not apply cpu offload in beam search model. 
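Aside (not part of this patch): the new FalconConfig/LLAMAConfig/OPTConfig structs above all follow the same pattern of reading a HuggingFace-style config.json with nlohmann::json instead of hard-coded defaults. A minimal standalone sketch of that pattern, using the LLaMA field names that appear in this diff; the struct name and its free-standing form here are illustrative only:

    #include <cassert>
    #include <fstream>
    #include <string>
    #include <nlohmann/json.hpp>

    struct MiniLlamaConfig {
      int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, intermediate_size;
      float rms_norm_eps;
      explicit MiniLlamaConfig(std::string const &path) {
        std::ifstream f(path);
        assert(f.good() && "config.json not found");
        // nullptr callback, allow_exceptions=true, ignore_comments=true,
        // matching the parse flags used elsewhere in this patch
        nlohmann::json j = nlohmann::json::parse(f, nullptr, true, true);
        num_hidden_layers = j["num_hidden_layers"];
        vocab_size = j["vocab_size"];
        num_attention_heads = j["num_attention_heads"];
        hidden_size = j["hidden_size"];
        intermediate_size = j["intermediate_size"];
        rms_norm_eps = j["rms_norm_eps"];
      }
    };
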
- Config llama_config(model_config_file_path); - llama_config.printConfig(); + LLAMAConfig llama_config(model_config_file_path); + llama_config.print(); + + if (ff.config.tensor_parallelism_degree > llama_config.num_attention_heads || + llama_config.num_attention_heads % ff.config.tensor_parallelism_degree != + 0) { + assert(false && "The number of attention heads is smaller, or it is not " + "divisible by the tensor parallelism degree"); + } std::unordered_map weights_layers; @@ -45,7 +53,7 @@ void LLAMA::create_llama_model(FFModel &ff, if (use_full_precision) { token = ff.embedding(input, llama_config.vocab_size, - llama_config.dim, + llama_config.hidden_size, AGGR_MODE_NONE, DT_FLOAT, NULL, @@ -53,7 +61,7 @@ void LLAMA::create_llama_model(FFModel &ff, } else { token = ff.embedding(input, llama_config.vocab_size, - llama_config.dim, + llama_config.hidden_size, AGGR_MODE_NONE, DT_HALF, NULL, @@ -63,12 +71,12 @@ void LLAMA::create_llama_model(FFModel &ff, Layer *embedding = ff.layers.back(); weights_layers.emplace("tok_embeddings_weight", embedding); - for (int i = 0; i < llama_config.n_layers; i++) { + for (int i = 0; i < llama_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); // step 1: attention Tensor att_norm = - ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); + ff.rms_norm(token, llama_config.rms_norm_eps, llama_config.hidden_size); Layer *attention_norm = ff.layers.back(); weights_layers.emplace("layers_" + std::to_string(i) + "_attention_norm_weight", @@ -79,10 +87,10 @@ void LLAMA::create_llama_model(FFModel &ff, case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( att_norm, - llama_config.dim, - llama_config.n_heads, - llama_config.dim / llama_config.n_heads, - llama_config.dim / llama_config.n_heads, + llama_config.hidden_size, + llama_config.num_attention_heads, + llama_config.hidden_size / llama_config.num_attention_heads, + llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, false, false, @@ -95,10 +103,10 @@ void LLAMA::create_llama_model(FFModel &ff, case TREE_VERIFY_MODE: { mha = ff.inc_multihead_self_attention_verify( att_norm, - llama_config.dim, - llama_config.n_heads, - llama_config.dim / llama_config.n_heads, - llama_config.dim / llama_config.n_heads, + llama_config.hidden_size, + llama_config.num_attention_heads, + llama_config.hidden_size / llama_config.num_attention_heads, + llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ false, /*bias*/ false, /*add_bias_kv*/ @@ -112,10 +120,10 @@ void LLAMA::create_llama_model(FFModel &ff, case INC_DECODING_MODE: { mha = ff.inc_multihead_self_attention( att_norm, - llama_config.dim, - llama_config.n_heads, - llama_config.dim / llama_config.n_heads, - llama_config.dim / llama_config.n_heads, + llama_config.hidden_size, + llama_config.num_attention_heads, + llama_config.hidden_size / llama_config.num_attention_heads, + llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ false, /*bias*/ false, /*add_bias_kv*/ @@ -137,17 +145,19 @@ void LLAMA::create_llama_model(FFModel &ff, // step 2: SILU activaion Tensor ff_norm = - ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); + ff.rms_norm(token, llama_config.rms_norm_eps, llama_config.hidden_size); Layer *ffn_layer = ff.layers.back(); weights_layers.emplace("layers_" + std::to_string(i) + "_ffn_norm_weight", ffn_layer); - Tensor w1 = ff.dense(ff_norm, llama_config.hidden_dim, AC_MODE_NONE, false); + Tensor w1 = + ff.dense(ff_norm, 
llama_config.intermediate_size, AC_MODE_NONE, false); Layer *w1_layer = ff.layers.back(); weights_layers.emplace( "layers_" + std::to_string(i) + "_feed_forward_w1_weight", w1_layer); - Tensor w3 = ff.dense(ff_norm, llama_config.hidden_dim, AC_MODE_NONE, false); + Tensor w3 = + ff.dense(ff_norm, llama_config.intermediate_size, AC_MODE_NONE, false); Layer *w3_layer = ff.layers.back(); weights_layers.emplace( "layers_" + std::to_string(i) + "_feed_forward_w3_weight", w3_layer); @@ -156,7 +166,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor silu = ff.multiply(w1, sigmoid); Tensor multi = ff.multiply(silu, w3); - Tensor w2 = ff.dense(multi, llama_config.dim, AC_MODE_NONE, false); + Tensor w2 = ff.dense(multi, llama_config.hidden_size, AC_MODE_NONE, false); Layer *w2_layer = ff.layers.back(); weights_layers.emplace( "layers_" + std::to_string(i) + "_feed_forward_w2_weight", w2_layer); @@ -164,7 +174,8 @@ void LLAMA::create_llama_model(FFModel &ff, } // final normalization and linear std::vector axes = {2}; - token = ff.rms_norm(token, llama_config.norm_eps, llama_config.dim); + token = + ff.rms_norm(token, llama_config.rms_norm_eps, llama_config.hidden_size); Layer *final_norm = ff.layers.back(); weights_layers.emplace("norm_weight", final_norm); @@ -192,15 +203,15 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); // Compile the model std::cout << "------start compile ----------" << std::endl; - int tensor_partition_num = ff.config.tensor_parallelism_degree; im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", weight_file_path, - llama_config.n_heads, - llama_config.n_heads, - llama_config.dim, - llama_config.dim / llama_config.n_heads, - tensor_partition_num); + llama_config.num_attention_heads, + llama_config.num_attention_heads, + llama_config.hidden_size, + llama_config.hidden_size / + llama_config.num_attention_heads, + ff.config.tensor_parallelism_degree); fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------load weight finished----------" << std::endl; diff --git a/inference/models/llama.h b/inference/models/llama.h index 61d8908d0c..311bdedbe6 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -26,81 +26,55 @@ namespace FlexFlow { class LLAMA { public: - struct Config { - Config(void) { - // todo read from config/param file - n_layers = 32; - vocab_size = 32000; - n_heads = 32; - dim = 4096; - multiple_of = 256; - norm_eps = 1e-6; - total_requests = 2560; - incremental_mode = true; - hidden_dim = 11008; - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; - } - - Config(std::string config_filepath) { - std::ifstream config_file(config_filepath); + struct LLAMAConfig { + LLAMAConfig(std::string const &model_config_file_path) { + std::ifstream config_file(model_config_file_path); if (config_file.is_open()) { try { - json config_json; - config_file >> config_json; - - n_layers = config_json["n_layers"]; - vocab_size = config_json["vocab_size"]; - n_heads = config_json["n_heads"]; - dim = config_json["dim"]; - multiple_of = config_json["multiple_of"]; - norm_eps = config_json["norm_eps"]; - total_requests = config_json["total_requests"]; - incremental_mode = config_json["incremental_mode"]; - hidden_dim = config_json["hidden_dim"]; - // Override values below - /* max_seq_len = 
config_json["max_seq_len"]; - max_num_tokens = config_json["max_num_tokens"]; - max_beam_width = config_json["max_beam_width"]; - max_beam_depth = config_json["max_beam_depth"]; - hidden_dim = config_json["hidden_dim"]; */ - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + json model_config; + config_file >> model_config; + num_hidden_layers = model_config["num_hidden_layers"]; + vocab_size = model_config["vocab_size"]; + num_attention_heads = model_config["num_attention_heads"]; + hidden_size = model_config["hidden_size"]; + rms_norm_eps = model_config["rms_norm_eps"]; + intermediate_size = model_config["intermediate_size"]; } catch (json::exception const &e) { - std::cerr << "Error parsing JSON file: " << e.what() << std::endl; + std::cerr << "Error parsing LLAMA config from JSON file: " << e.what() + << std::endl; assert(false); } } else { - std::cerr << "Error opening JSON file." << std::endl; + std::cerr << "Error opening JSON file " << model_config_file_path + << std::endl; assert(false); } + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } - void printConfig() const { + void print() const { std::cout << "LLAMA Config:" << std::endl; - std::cout << "n_layers: " << n_layers << std::endl; - std::cout << "vocab_size: " << vocab_size << std::endl; - std::cout << "n_heads: " << n_heads << std::endl; - std::cout << "dim: " << dim << std::endl; - std::cout << "multiple_of: " << multiple_of << std::endl; - std::cout << "norm_eps: " << norm_eps << std::endl; - std::cout << "total_requests: " << total_requests << std::endl; - std::cout << "incremental_mode: " << incremental_mode << std::endl; - std::cout << "max_seq_len: " << max_seq_len << std::endl; - std::cout << "max_num_tokens: " << max_num_tokens << std::endl; - std::cout << "max_beam_width: " << max_beam_width << std::endl; - std::cout << "max_beam_depth: " << max_beam_depth << std::endl; - std::cout << "hidden_dim: " << hidden_dim << std::endl; + std::cout << "\tnum_hidden_layers: " << num_hidden_layers << std::endl; + std::cout << "\tvocab_size: " << vocab_size << std::endl; + std::cout << "\tnum_attention_heads: " << num_attention_heads + << std::endl; + std::cout << "\thidden_size: " << hidden_size << std::endl; + std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; + std::cout << "\tintermediate_size: " << intermediate_size << std::endl; + + std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; + std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; + std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } - int n_heads, n_layers, vocab_size, dim, multiple_of, hidden_dim, - total_requests, incremental_mode, max_seq_len, max_num_tokens, - max_beam_width, max_beam_depth; - float norm_eps; + int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, + intermediate_size; + float rms_norm_eps; }; static void create_llama_model(FFModel &ff, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 8a1a17d3af..fc1d5512ba 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -18,14 +18,22 @@ namespace FlexFlow { using 
namespace Legion; +using json = nlohmann::json; void OPT::create_opt_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, bool use_full_precision) { - Config opt_config(model_config_file_path); - opt_config.printConfig(); + OPTConfig opt_config(model_config_file_path); + opt_config.print(); + + if (ff.config.tensor_parallelism_degree > opt_config.num_attention_heads || + opt_config.num_attention_heads % ff.config.tensor_parallelism_degree != + 0) { + assert(false && "The number of attention heads is smaller, or it is not " + "divisible by the tensor parallelism degree"); + } std::unordered_map weights_layers; @@ -223,7 +231,6 @@ void OPT::create_opt_model(FFModel &ff, //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; - int tensor_partition_num = ff.config.tensor_parallelism_degree; InferenceManager *im = InferenceManager::get_inference_manager(); im->compile_model_and_allocate_buffer(&ff); FileDataLoader fileloader("", @@ -233,7 +240,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.hidden_size, opt_config.hidden_size / opt_config.num_attention_heads, - tensor_partition_num); + ff.config.tensor_parallelism_degree); fileloader.load_weights(&ff, weights_layers, use_full_precision); std::cout << "------finished loading weights----------" << std::endl; im->init_operators_inference(&ff); diff --git a/inference/models/opt.h b/inference/models/opt.h index 45ee6e6181..ab972ae10c 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -26,83 +26,70 @@ namespace FlexFlow { class OPT { public: - struct Config { - Config(void) { - vocab_size = 50272; - word_embed_proj_dim = 4096; - hidden_size = 4096; - num_attention_heads = 32; - max_position_embeddings = 2048; - layer_norm_elementwise_affine = true; - dropout = 0.1; - ffn_dim = 16384; - num_hidden_layers = 32; - max_beam_width = 1; - batchSize = 8; - sentence_len = 100; - max_beam_depth = 4; - } - Config(std::string config_filepath) { - std::ifstream config_file(config_filepath); + struct OPTConfig { + OPTConfig(std::string const &model_config_file_path) { + std::ifstream config_file(model_config_file_path); if (config_file.is_open()) { try { - json config_json; - config_file >> config_json; - - vocab_size = config_json["vocab_size"]; - word_embed_proj_dim = config_json["word_embed_proj_dim"]; - hidden_size = config_json["hidden_size"]; - num_attention_heads = config_json["num_attention_heads"]; - max_position_embeddings = config_json["max_position_embeddings"]; + json model_config; + config_file >> model_config; + do_layer_norm_before = model_config["do_layer_norm_before"]; + dropout = model_config["dropout"]; + enable_bias = model_config["enable_bias"]; + ffn_dim = model_config["ffn_dim"]; + hidden_size = model_config["hidden_size"]; layer_norm_elementwise_affine = - config_json["layer_norm_elementwise_affine"]; - dropout = config_json["dropout"]; - ffn_dim = config_json["ffn_dim"]; - num_hidden_layers = config_json["num_hidden_layers"]; - max_beam_width = config_json["max_beam_width"]; - batchSize = config_json["batchSize"]; - sentence_len = config_json["sentence_len"]; - max_beam_depth = config_json["max_beam_depth"]; + model_config["layer_norm_elementwise_affine"]; + max_position_embeddings = model_config["max_position_embeddings"]; + num_attention_heads = model_config["num_attention_heads"]; + num_hidden_layers = model_config["num_hidden_layers"]; + vocab_size = 
model_config["vocab_size"]; + word_embed_proj_dim = model_config["word_embed_proj_dim"]; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); } } else { - std::cerr << "Error opening JSON file." << std::endl; + std::cerr << "Error opening JSON file " << model_config_file_path + << std::endl; assert(false); } + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } - void printConfig() const { + + void print() const { std::cout << "OPT Config:" << std::endl; - std::cout << "vocab_size: " << vocab_size << std::endl; - std::cout << "word_embed_proj_dim: " << word_embed_proj_dim << std::endl; - std::cout << "hidden_size: " << hidden_size << std::endl; - std::cout << "num_attention_heads: " << num_attention_heads << std::endl; - std::cout << "max_position_embeddings: " << max_position_embeddings + std::cout << "\tdo_layer_norm_before: " << do_layer_norm_before << std::endl; - std::cout << "layer_norm_elementwise_affine: " << std::boolalpha + std::cout << "\tdropout: " << dropout << std::endl; + std::cout << "\tenable_bias: " << enable_bias << std::endl; + std::cout << "\tffn_dim: " << ffn_dim << std::endl; + std::cout << "\thidden_size: " << hidden_size << std::endl; + std::cout << "\tlayer_norm_elementwise_affine: " << layer_norm_elementwise_affine << std::endl; - std::cout << "dropout: " << dropout << std::endl; - std::cout << "ffn_dim: " << ffn_dim << std::endl; - std::cout << "num_hidden_layers: " << num_hidden_layers << std::endl; - std::cout << "max_beam_width: " << max_beam_width << std::endl; - std::cout << "batchSize: " << batchSize << std::endl; - std::cout << "sentence_len: " << sentence_len << std::endl; - std::cout << "max_beam_depth: " << max_beam_depth << std::endl; + std::cout << "\tmax_position_embeddings: " << max_position_embeddings + << std::endl; + std::cout << "\tnum_attention_heads: " << num_attention_heads + << std::endl; + std::cout << "\tnum_hidden_layers: " << num_hidden_layers << std::endl; + std::cout << "\tvocab_size: " << vocab_size << std::endl; + std::cout << "\tword_embed_proj_dim: " << word_embed_proj_dim + << std::endl; + + std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; + std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; + std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } - int vocab_size; - int word_embed_proj_dim; - int hidden_size; - int num_attention_heads; - int max_position_embeddings; - bool layer_norm_elementwise_affine; + + int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + bool do_layer_norm_before, enable_bias, layer_norm_elementwise_affine; float dropout; - int ffn_dim; - int num_hidden_layers; - int max_beam_width; - int batchSize; - int sentence_len; - int max_beam_depth; + int ffn_dim, hidden_size, max_position_embeddings, num_attention_heads, + num_hidden_layers, vocab_size, word_embed_proj_dim; }; static void create_opt_model(FFModel &ff, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 6db7d09c56..b3cee48458 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -48,8 +48,8 @@ def get_configs(): "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 2, - 
"pipeline_parallelism_degree": 2, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 4, "offload": False, "offload_reserve_space_size": 1024**2, "use_4bit_quantization": False, @@ -59,12 +59,11 @@ def get_configs(): } llm_configs = { # required parameters - "llm_model": "decapoda-research/llama-7b-hf", + "llm_model": "tiiuae/falcon-7b", # optional parameters - "llm_weight": "", - "llm_tokenizer": "", - "clean_model_cache": False, - "full_precision": False, + "cache_path": "", + "refresh_cache": False, + "full_precision": True, "prompt": "", "output_file": "", } @@ -87,9 +86,8 @@ def main(): llm = ff.LLM( configs.llm_model, data_type=ff_data_type, - tokenizer_path=configs.llm_tokenizer, - weights_path=configs.llm_weight, - clean_cache=configs.clean_model_cache, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, output_file=configs.output_file, ) diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 7a0cd1dc64..a19a930f2d 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -61,27 +61,24 @@ def get_configs(): # required llm arguments "llm_model": "decapoda-research/llama-7b-hf", # optional llm parameters - "llm_weight": "", - "llm_tokenizer": "", - "clean_model_cache": False, + "cache_path": "", + "refresh_cache": False, "full_precision": False, "ssms": [ { # required ssm parameter "ssm_model": "JackFram/llama-160m", # optional ssm parameters - "ssm_weight": "", - "ssm_tokenizer": "", - "clean_model_cache": False, + "cache_path": "", + "refresh_cache": False, "full_precision": False, }, { # required ssm parameter "ssm_model": "facebook/opt-125m", # optional ssm parameters - "ssm_weight": "", - "ssm_tokenizer": "", - "clean_model_cache": False, + "cache_path": "", + "refresh_cache": False, "full_precision": False, }, ], @@ -107,9 +104,8 @@ def main(): llm = ff.LLM( configs.llm_model, data_type=ff_data_type, - tokenizer_path=configs.llm_tokenizer, - weights_path=configs.llm_weight, - clean_cache=configs.clean_model_cache, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, output_file=configs.output_file, ) @@ -123,9 +119,8 @@ def main(): ssm = ff.SSM( ssm_config.ssm_model, data_type=ff_data_type, - tokenizer_path=ssm_config.ssm_tokenizer, - weights_path=ssm_config.ssm_weight, - clean_cache=ssm_config.clean_model_cache, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, output_file=configs.output_file, ) ssms.append(ssm) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 6b218e107c..67faf98536 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -19,93 +19,64 @@ #include "models/opt.h" #include #include +#include using namespace Legion; +using json = nlohmann::json; LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { - std::string llm_weight_file_path; - std::string llm_config_file_path; - std::vector ssm_weight_file_paths; - std::vector ssm_config_file_paths; + std::string cache_folder_path; std::string prompt_file_path; - std::string tokenizer_file_path; std::string output_file_path; }; -struct ModelTypes { +struct ModelNames { + std::string llm_model_name; + std::vector ssm_model_names; +}; + +struct ModelMeta { + ModelNames model_names; + ModelType llm_model_type; + std::string llm_tokenizer_path; + std::string llm_weights_path; + std::string llm_model_config_path; + std::vector ssm_model_types; + std::vector ssm_model_config_paths; + std::vector 
ssm_model_weights_paths; }; void parse_input_args(char **argv, int argc, FilePaths &paths, - ModelTypes &model_types, + ModelNames &model_names, bool &use_full_precision, bool &verbose) { for (int i = 1; i < argc; i++) { - // llm model type + // llm model name if (!strcmp(argv[i], "-llm-model")) { - std::string model_type_str = std::string(argv[++i]); - std::transform(model_type_str.begin(), - model_type_str.end(), - model_type_str.begin(), - [](unsigned char c) { return std::tolower(c); }); - if (model_type_str == "llama") { - model_types.llm_model_type = ModelType::LLAMA; - } else if (model_type_str == "llama2") { - model_types.llm_model_type = ModelType::LLAMA2; - } else if (model_type_str == "opt") { - model_types.llm_model_type = ModelType::OPT; - } else if (model_type_str == "falcon") { - model_types.llm_model_type = ModelType::FALCON; - } else { - model_types.llm_model_type = ModelType::UNKNOWN; + model_names.llm_model_name = std::string(argv[++i]); + for (char &c : model_names.llm_model_name) { + c = std::tolower(c); } continue; } - // llm model weights - if (!strcmp(argv[i], "-llm-weight")) { - paths.llm_weight_file_path = std::string(argv[++i]); - continue; - } - // llm model configs - if (!strcmp(argv[i], "-llm-config")) { - paths.llm_config_file_path = std::string(argv[++i]); - continue; - } - // ssm models types + // ssm models names if (!strcmp(argv[i], "-ssm-model")) { - std::string model_type_str = std::string(argv[++i]); - std::transform(model_type_str.begin(), - model_type_str.end(), - model_type_str.begin(), - [](unsigned char c) { return std::tolower(c); }); - if (model_type_str == "llama") { - model_types.ssm_model_types.push_back(ModelType::LLAMA); - } else if (model_type_str == "llama2") { - model_types.ssm_model_types.push_back(ModelType::LLAMA2); - } else if (model_type_str == "opt") { - model_types.ssm_model_types.push_back(ModelType::OPT); - } else if (model_type_str == "falcon") { - model_types.ssm_model_types.push_back(ModelType::FALCON); - } else { - model_types.ssm_model_types.push_back(ModelType::UNKNOWN); + std::string ssm_model_name = std::string(argv[++i]); + for (char &c : ssm_model_name) { + c = std::tolower(c); } + model_names.ssm_model_names.push_back(ssm_model_name); continue; } - // ssm model weights - if (!strcmp(argv[i], "-ssm-weight")) { - std::string file_path = std::string(argv[++i]); - paths.ssm_weight_file_paths.push_back(file_path); - continue; - } - // ssm model configs - if (!strcmp(argv[i], "-ssm-config")) { - std::string file_path = std::string(argv[++i]); - paths.ssm_config_file_paths.push_back(file_path); + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); continue; } // prompts @@ -113,11 +84,6 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } - // tokenizer - if (!strcmp(argv[i], "-tokenizer")) { - paths.tokenizer_file_path = std::string(argv[++i]); - continue; - } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -133,6 +99,131 @@ void parse_input_args(char **argv, continue; } } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision) { + if 
(model_metadata.model_names.llm_model_name.empty() || + model_metadata.model_names.ssm_model_names.size() == 0) { + assert(false && "SpecInfer needs at least one LLM and one SSM for " + "speculative inference"); + } + model_metadata.llm_model_config_path = + join_path({file_paths.cache_folder_path, + "configs", + model_metadata.model_names.llm_model_name, + "config.json"}); + model_metadata.llm_tokenizer_path = + join_path({file_paths.cache_folder_path, + "tokenizers", + model_metadata.model_names.llm_model_name}); + model_metadata.llm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); + if (!llm_config_file_handle.good()) { + std::cout << "LLM Model config file " + << model_metadata.llm_model_config_path << " not found." + << std::endl; + assert(false); + } + json llm_model_config = json::parse(llm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + model_metadata.llm_model_type = ModelType::UNKNOWN; + auto architectures = llm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + std::string nameOrPath = llm_model_config["_name_or_path"]; + // TODO: support LLAMA-2 models not from Meta + bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; + if (llama2) { + model_metadata.llm_model_type = ModelType::LLAMA2; + } else { + model_metadata.llm_model_type = ModelType::LLAMA; + } + break; + } else if (str == "OPTForCausalLM") { + model_metadata.llm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + model_metadata.llm_model_type = ModelType::FALCON; + break; + } + } + + for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { + std::string ssm_config_path = join_path({file_paths.cache_folder_path, + "configs", + ssm_model_name, + "config.json"}); + std::string ssm_tokenizer_path = + join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); + std::string ssm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream ssm_config_file_handle(ssm_config_path); + if (!ssm_config_file_handle.good()) { + std::cout << "SSM Model config file " << ssm_config_path << " not found." 
+ << std::endl; + assert(false); + } + json ssm_model_config = json::parse(ssm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + ModelType ssm_model_type = ModelType::UNKNOWN; + auto architectures = ssm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + std::string nameOrPath = ssm_model_config["_name_or_path"]; + // TODO: support LLAMA-2 models not from Meta + bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; + if (llama2) { + ssm_model_type = ModelType::LLAMA2; + } else { + ssm_model_type = ModelType::LLAMA; + } + break; + } else if (str == "OPTForCausalLM") { + ssm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + ssm_model_type = ModelType::FALCON; + break; + } + } + model_metadata.ssm_model_types.push_back(ssm_model_type); + model_metadata.ssm_model_config_paths.push_back(ssm_config_path); + model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); + } + + assert(model_metadata.llm_model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + for (auto mt : model_metadata.ssm_model_types) { + if (mt == ModelType::UNKNOWN) { + assert(false && "One of the SSM model types passed is invalid."); + } + } } void FlexFlow::top_level_task(Task const *task, @@ -141,77 +232,54 @@ void FlexFlow::top_level_task(Task const *task, Runtime *runtime) { FFConfig ffconfig; FilePaths file_paths; - ModelTypes model_types; + ModelMeta model_metadata; bool use_full_precision = false; bool verbose = false; - size_t num_devices = ffconfig.workersPerNode * ffconfig.numNodes; - int data_parallelism_degree = 1, tensor_parallelism_degree = 1, - pipeline_parallelism_degree = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; int argc = command_args.argc; - parse_input_args( - argv, argc, file_paths, model_types, use_full_precision, verbose); + parse_input_args(argv, + argc, + file_paths, + model_metadata.model_names, + use_full_precision, + verbose); + + get_model_meta(file_paths, model_metadata, use_full_precision); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); - if (file_paths.ssm_weight_file_paths.size() == 0) { - assert(false && - "SpecInfer needs at least one SSM for speculative inference"); - } - if (file_paths.ssm_config_file_paths.size() != - file_paths.ssm_weight_file_paths.size()) { - assert(false && "Number of SSM config files passed does not match number " - "of SSM weights"); - } - assert(model_types.llm_model_type != ModelType::UNKNOWN && - "Invalid LLM model type passed (or no type was passed)."); - if (model_types.ssm_model_types.size() != - file_paths.ssm_weight_file_paths.size()) { - assert(false && "Number of valid SSM model types passed does not match " - "number of SSM weights"); - } - for (auto mt : model_types.ssm_model_types) { - if (mt == ModelType::UNKNOWN) { - assert(false && "One of the SSM model types passed is invalid."); - } - } - // Create SentencePiece tokenizer or OPT tokenizer SamplingConfig samplingConfig; InferenceManager *im = InferenceManager::get_inference_manager(); RequestManager *rm = RequestManager::get_request_manager(); - rm->register_tokenizer(model_types.llm_model_type, - file_paths.tokenizer_file_path); + rm->register_tokenizer(model_metadata.llm_model_type, 
+ model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); - // InferenceManager im(ffconfig, BatchConfig::MAX_NUM_TOKENS); - // RequestManager rm(model_types.llm_model_type, - // file_paths.tokenizer_file_path, - // /*verbose*/ verbose, - // file_paths.output_file_path); // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); - if (model_types.llm_model_type == ModelType::LLAMA || - model_types.llm_model_type == ModelType::LLAMA2) { + if (model_metadata.llm_model_type == ModelType::LLAMA || + model_metadata.llm_model_type == ModelType::LLAMA2) { LLAMA::create_llama_model(tree_model, - file_paths.llm_config_file_path, - file_paths.llm_weight_file_path, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, TREE_VERIFY_MODE, samplingConfig, use_full_precision); - } else if (model_types.llm_model_type == ModelType::OPT) { + } else if (model_metadata.llm_model_type == ModelType::OPT) { OPT::create_opt_model(tree_model, - file_paths.llm_config_file_path, - file_paths.llm_weight_file_path, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, TREE_VERIFY_MODE, use_full_precision); - } else if (model_types.llm_model_type == ModelType::FALCON) { + } else if (model_metadata.llm_model_type == ModelType::FALCON) { FALCON::create_falcon_model(tree_model, - file_paths.llm_config_file_path, - file_paths.llm_weight_file_path, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, TREE_VERIFY_MODE, use_full_precision); } else { @@ -219,7 +287,7 @@ void FlexFlow::top_level_task(Task const *task, } // Create SSM models - int num_ssms = model_types.ssm_model_types.size(); + int num_ssms = model_metadata.ssm_model_types.size(); std::vector ssm_model_ids; std::vector ssm_models; FFConfig bm_config = ffconfig; @@ -232,26 +300,27 @@ void FlexFlow::top_level_task(Task const *task, for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { FFModel &beam_model = ssm_models[ssm_id]; - if (model_types.ssm_model_types[ssm_id] == ModelType::LLAMA || - model_types.ssm_model_types[ssm_id] == ModelType::LLAMA2) { + if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA || + model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA2) { LLAMA::create_llama_model(beam_model, - file_paths.ssm_config_file_paths[ssm_id], - file_paths.ssm_weight_file_paths[ssm_id], + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, samplingConfig, use_full_precision); - } else if (model_types.ssm_model_types[ssm_id] == ModelType::OPT) { + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, - file_paths.ssm_config_file_paths[ssm_id], - file_paths.ssm_weight_file_paths[ssm_id], + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, use_full_precision); - } else if (model_types.ssm_model_types[ssm_id] == ModelType::FALCON) { - FALCON::create_falcon_model(beam_model, - file_paths.ssm_config_file_paths[ssm_id], - file_paths.ssm_weight_file_paths[ssm_id], - BEAM_SEARCH_MODE, - use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { + FALCON::create_falcon_model( + beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + use_full_precision); } else { assert(false && "Invalid SSM model type passed."); } diff --git 
a/inference/utils/convert_llama_config.py b/inference/utils/convert_llama_config.py deleted file mode 100644 index dfae42f841..0000000000 --- a/inference/utils/convert_llama_config.py +++ /dev/null @@ -1,32 +0,0 @@ -import argparse -import json - -def convert_json(input_file, output_file): - # Load the input JSON data from the file - with open(input_file, 'r') as file: - input_data = json.load(file) - - # Extract the required fields and create the output JSON object - output_data = { - "n_layers": input_data["num_hidden_layers"], - "vocab_size": input_data["vocab_size"], - "n_heads": input_data["num_attention_heads"], - "dim": input_data["hidden_size"], - "multiple_of": 256, - "norm_eps": input_data["rms_norm_eps"], - "total_requests": 2560, - "hidden_dim": input_data["intermediate_size"], - "incremental_mode": input_data["use_cache"] - } - - # Save the output JSON data to the file - with open(output_file, 'w') as file: - json.dump(output_data, file, indent=4) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Convert JSON file to a different format.") - parser.add_argument("input_file", help="Path to the input JSON file.") - parser.add_argument("output_file", help="Path to the output JSON file.") - args = parser.parse_args() - - convert_json(args.input_file, args.output_file) diff --git a/inference/utils/download_falcon_weights.py b/inference/utils/download_falcon_weights.py deleted file mode 100644 index a9a094f327..0000000000 --- a/inference/utils/download_falcon_weights.py +++ /dev/null @@ -1,45 +0,0 @@ -# from transformer import RWForCausalLM -# from configuration_RW import RWConfig -from transformers import AutoModel -import torch -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True) -# model = AutoModel.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True) - - -# model = RWForCausalLM.from_pretrained("tiiuae/falcon-7b") -# print(model.config) - -#lm_head -lm_head_weight = model.lm_head.weight -lm_head_weight.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/lm_head_weight') - -for name, params in model.named_parameters(): - name = ( - name.replace("h.", "layers_") - .replace(".", "_").replace("word_embeddings", "tok_embeddings") - .replace("self_attn", "attention").replace("transformer_", "").replace("self_attention_dense", "attention_wo")) - # name = ( - # name.replace("h.", "layers_") - # .replace(".", "_").replace("word_embeddings", "tok_embeddings") - # .replace("self_attn", "attention").replace("transformer_", "")) - - print(name) - print(params.shape) - - #split q, k, v - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") - q, k, v = torch.split(params, [4544, 64, 64], 0) - print(q.shape) - print(k.shape) - print(v.shape) - q.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/' + name_q) - k.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/' + name_k) - v.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/' + name_v) - - else: - params.detach().cpu().numpy().tofile('/home/ubuntu/FlexFlow/inference/weights/falcon_7B_weights_new/' + name) - diff --git 
a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py new file mode 100644 index 0000000000..689730f32b --- /dev/null +++ b/inference/utils/download_hf_model.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +import flexflow.serve as ff +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "model_names", type=str, nargs="+", help="Name of the model(s) to download" + ) + parser.add_argument( + "--cache-folder", + type=str, + help="Folder to use to store the model(s) assets in FlexFlow format", + default="", + ) + parser.add_argument( + "--refresh-cache", + action="store_true", + help="Use this flag to force the refresh of the model(s) weights/tokenizer cache", + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--full-precision-only", + action="store_true", + help="Only download the full precision version of the weights", + ) + group.add_argument( + "--half-precision-only", + action="store_true", + help="Only download the half precision version of the weights", + ) + args = parser.parse_args() + return args + + +def main(args): + # Initialize FF serve to gain access to its utils + ff.init_cpu() + + if args.full_precision_only: + data_types = ff.DataType.DT_FLOAT + elif args.half_precision_only: + data_types = ff.DataType.DT_HALF + else: + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + + for model_name in args.model_names: + for data_type in data_types: + llm = ff.LLM( + model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + llm.download_hf_config() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/inference/utils/download_llama_weights.py b/inference/utils/download_llama_weights.py deleted file mode 100644 index d2b11453e6..0000000000 --- a/inference/utils/download_llama_weights.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python - -import os -import requests -import argparse -from transformers import AutoModelForCausalLM - -# You can pass the --use-full-precision flag to use the full-precision weight. By default, we use half precision. 
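Aside (not part of this patch): a hypothetical invocation of the new download_hf_model.py utility added above. The flags come from the script's argparse definitions; the model names and cache folder below are placeholders:

    python inference/utils/download_hf_model.py tiiuae/falcon-7b facebook/opt-125m \
        --cache-folder ~/.cache/flexflow --half-precision-only
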
-# and pass "--use_13B", "--use_30B", and "--use_65B" to use the corresponding "llama-13B/30B/65B" model weights -parser = argparse.ArgumentParser() -parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") -parser.add_argument("--use_13B", action="store_true", help="choose to use llama-13B") -parser.add_argument("--use_30B", action="store_true", help="choose to use llama-30B") -parser.add_argument("--use_65B", action="store_true", help="choose to use llama-65B") -args = parser.parse_args() -if not args.use_full_precision: - import torch - torch.set_default_tensor_type(torch.HalfTensor) - -# Change working dir to folder storing this script -abspath = os.path.abspath(__file__) -dname = os.path.dirname(abspath) -os.chdir(dname) - -def convert_hf_model(model, dst_folder): - os.makedirs(dst_folder, exist_ok=True) - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) - params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") - -# Download and convert big model weights -model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") -dst_folder="../weights/llama_7B_weights" if args.use_full_precision else "../weights/llama_7B_weights_half" -convert_hf_model(model, dst_folder) - -# Download and convert model weights only for hf -if args.use_13B: - model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-13b-hf") - dst_folder="../weights/llama_13B_weights_half" - convert_hf_model(model, dst_folder) - -if args.use_30B: - model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-30b-hf") - dst_folder="../weights/llama_30B_weights_half" - convert_hf_model(model, dst_folder) - -if args.use_65B: - model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-65b-hf") - dst_folder="../weights/llama_65B_weights_half" - convert_hf_model(model, dst_folder) - -# Download and convert small model weights -model = AutoModelForCausalLM.from_pretrained("JackFram/llama-160m") -dst_folder="../weights/llama_160M_weights" if args.use_full_precision else "../weights/llama_160M_weights_half" -convert_hf_model(model, dst_folder) - -# Download tokenizer -os.makedirs("../tokenizer", exist_ok=True) -tokenizer_filepath = '../tokenizer/tokenizer.model' -url = 'https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model' -r = requests.get(url) -open(tokenizer_filepath , 'wb').write(r.content) diff --git a/inference/utils/download_opt_weights.py b/inference/utils/download_opt_weights.py deleted file mode 100644 index c3707df304..0000000000 --- a/inference/utils/download_opt_weights.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python - -import os -import requests -import argparse -import shutil -from transformers import AutoModelForCausalLM - -# You can pass the --use-full-precision flag to use the full-precision weight. By default, we use half precision. 
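Aside (not part of this patch): the per-model download scripts deleted here are superseded by the shared asset cache that get_model_meta() in spec_infer.cc expects. A small Python sketch of that layout, with a placeholder model name:

    import os

    cache = os.path.expanduser("~/.cache/flexflow")   # default when -cache-folder is not given
    model = "facebook/opt-125m"                       # placeholder model name
    config_path = os.path.join(cache, "configs", model, "config.json")
    tokenizer_path = os.path.join(cache, "tokenizers", model)
    weights_path = os.path.join(cache, "weights", model, "half-precision")  # or "full-precision"
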
-parser = argparse.ArgumentParser() -parser.add_argument( - "--use-full-precision", action="store_true", help="Use full precision" -) -args = parser.parse_args() -if not args.use_full_precision: - import torch - - torch.set_default_tensor_type(torch.HalfTensor) - -# Change working dir to folder storing this script -abspath = os.path.abspath(__file__) -dname = os.path.dirname(abspath) -os.chdir(dname) - - -def convert_hf_model(model, dst_folder): - os.makedirs(dst_folder, exist_ok=True) - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - ) - params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") - # copy embedding weights - shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), - ) - - -# Download and convert big model weights -model = AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b") -dst_folder = ( - "../weights/opt_6B_weights" - if args.use_full_precision - else "../weights/opt_6B_weights_half" -) -convert_hf_model(model, dst_folder) - -# Download and convert small model weights -model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") -dst_folder = ( - "../weights/opt_125M_weights" - if args.use_full_precision - else "../weights/opt_125M_weights_half" -) -convert_hf_model(model, dst_folder) - -# Download tokenizer files -os.makedirs("../tokenizer", exist_ok=True) -tokenizer_filepath = "../tokenizer/vocab.json" -url = "https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json" -r = requests.get(url) -open(tokenizer_filepath, "wb").write(r.content) -tokenizer_filepath = "../tokenizer/merges.txt" -url = "https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt" -r = requests.get(url) -open(tokenizer_filepath, "wb").write(r.content) -tokenizer_filepath = "../tokenizer/special_tokens_map.json" -url = "https://huggingface.co/truongpdd/vietnews-gpt2/raw/main/added_tokens.json" -r = requests.get(url) -open(tokenizer_filepath, "wb").write(r.content) diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index b0177be6fa..adb4fe1926 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -24,79 +24,89 @@ from flexflow.config import * from flexflow.jupyter import * + def rerun_if_needed(): - def update_ld_library_path_if_needed(path): - ld_lib_path = os.environ.get("LD_LIBRARY_PATH") or "" - if path not in ld_lib_path.split(":"): - os.environ["LD_LIBRARY_PATH"] = path + ":" + ld_lib_path - return True - return False - from distutils import sysconfig - # When installing FlexFlow with pip, the library files are installed within - # the pip package folder, instead of at /usr/local/lib - packages_dir = sysconfig.get_python_lib(plat_specific=False, standard_lib=False) - ff_lib_path = os.path.join(packages_dir, "flexflow", "lib") - # If the library exists at the ff_lib_path, rerun with the ff_lib_path in the LD_LIBRARY_PATH - rerun=False - if os.path.isdir(ff_lib_path): - rerun = update_ld_library_path_if_needed(ff_lib_path) - if rerun: - run_from_python_c = ((sys.argv or [''])[0] == '-c') - # re-running with os.execv only works with 'python -c' for python >= 3.10 - # (see https://bugs.python.org/issue23427) - if not run_from_python_c: - 
os.execv(sys.executable, ["python"] + sys.argv) - else: - if hasattr(sys, 'orig_argv'): - assert(len(sys.orig_argv) >= 3) - os.execv(sys.executable, ["python"] + sys.orig_argv[1:]) - else: - print(f'Error: Please export LD_LIBRARY_PATH={os.environ.get("LD_LIBRARY_PATH")} and rerun') - sys.exit(1) + def update_ld_library_path_if_needed(path): + ld_lib_path = os.environ.get("LD_LIBRARY_PATH") or "" + if path not in ld_lib_path.split(":"): + os.environ["LD_LIBRARY_PATH"] = path + ":" + ld_lib_path + return True + return False + + from distutils import sysconfig + + # When installing FlexFlow with pip, the library files are installed within + # the pip package folder, instead of at /usr/local/lib + packages_dir = sysconfig.get_python_lib(plat_specific=False, standard_lib=False) + ff_lib_path = os.path.join(packages_dir, "flexflow", "lib") + # If the library exists at the ff_lib_path, rerun with the ff_lib_path in the LD_LIBRARY_PATH + rerun = False + if os.path.isdir(ff_lib_path): + rerun = update_ld_library_path_if_needed(ff_lib_path) + if rerun: + run_from_python_c = (sys.argv or [""])[0] == "-c" + # re-running with os.execv only works with 'python -c' for python >= 3.10 + # (see https://bugs.python.org/issue23427) + if not run_from_python_c: + os.execv(sys.executable, ["python"] + sys.argv) + else: + if hasattr(sys, "orig_argv"): + assert len(sys.orig_argv) >= 3 + os.execv(sys.executable, ["python"] + sys.orig_argv[1:]) + else: + print( + f'Error: Please export LD_LIBRARY_PATH={os.environ.get("LD_LIBRARY_PATH")} and rerun' + ) + sys.exit(1) + if flexflow_init_import(): - os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" - from legion_cffi import ffi, is_legion_python - from .flexflowlib import flexflow_library - - # Default python mode - if is_legion_python == False: - os.environ["REALM_DEFAULT_ARGS"] = "-ll:gpu 1" - rerun_if_needed() - print("Using Default Python") - _FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS")) - _CPU_ONLY = bool(os.environ.get('CPU_ONLY_TEST')) - if not _FF_BUILD_DOCS and not _CPU_ONLY: - from legion_top import ( - legion_canonical_python_main, - legion_canonical_python_cleanup, - ) - import atexit, sys, os - # run from jupyter - if "ipykernel_launcher.py" in sys.argv[0]: - sys_argv = ["python", "dummy.py"] - argv_dict = load_jupyter_config() - for key, value in argv_dict.items(): - sys_argv.append(key) - sys_argv.append(str(value)) - else: - sys_argv = [ - "python", - ] + sys.argv - legion_canonical_python_main(sys_argv) - atexit.register(legion_canonical_python_cleanup) - else: - print("Using Legion Python") + os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" + from legion_cffi import ffi, is_legion_python + from .flexflowlib import flexflow_library - flexflow_library.initialize() + # Default python mode + if is_legion_python == False: + _FF_BUILD_DOCS = bool( + os.environ.get("READTHEDOCS") or os.environ.get("FF_BUILD_DOCS") + ) + _CPU_ONLY = bool(os.environ.get("CPU_ONLY_TEST")) + if not _CPU_ONLY: + os.environ["REALM_DEFAULT_ARGS"] = "-ll:gpu 1" + rerun_if_needed() + print("Using Default Python") + if not _FF_BUILD_DOCS and not _CPU_ONLY: + from legion_top import ( + legion_canonical_python_main, + legion_canonical_python_cleanup, + ) + import atexit, sys, os - # check which python binding to use - if flexflow_python_binding() == 'pybind11': - print("Using pybind11 flexflow bindings.") - from .flexflow_pybind11 import * - else: - print("Using cffi flexflow bindings.") - from .flexflow_cffi import * + # run from jupyter + if 
"ipykernel_launcher.py" in sys.argv[0]: + sys_argv = ["python", "dummy.py"] + argv_dict = load_jupyter_config() + for key, value in argv_dict.items(): + sys_argv.append(key) + sys_argv.append(str(value)) + else: + sys_argv = [ + "python", + ] + sys.argv + legion_canonical_python_main(sys_argv) + atexit.register(legion_canonical_python_cleanup) + else: + print("Using Legion Python") + + flexflow_library.initialize() + + # check which python binding to use + if flexflow_python_binding() == "pybind11": + print("Using pybind11 flexflow bindings.") + from .flexflow_pybind11 import * + else: + print("Using cffi flexflow bindings.") + from .flexflow_cffi import * else: - pass \ No newline at end of file + pass diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 55ece74bc1..b02ac5bdb9 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2097,16 +2097,16 @@ def multihead_attention(self, query, key, value, self.add_layer(OpType.MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) - def inc_multihead_attention(self, input, - embed_dim, num_heads, num_kv_heads, + def inc_multihead_self_attention(self, input, + embed_dim, num_heads, kdim=0, vdim=0, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, - and returns the dot-product attention between them:. + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. :param input: the input Tensor. :type input: Tensor @@ -2161,20 +2161,20 @@ def inc_multihead_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc.flexflow_model_add_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - def spec_inc_multihead_attention(self, input, - embed_dim, num_heads, num_kv_heads, + def spec_inc_multihead_self_attention(self, input, + embed_dim, num_heads, kdim=0, vdim=0, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, - and returns the dot-product attention between them:. 
+ which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. :param input: the input Tensor. :type input: Tensor @@ -2229,20 +2229,20 @@ def spec_inc_multihead_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_spec_inc_multihead_attention(self.handle, input.handle, embed_dim, num_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc.flexflow_model_add_spec_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) def inc_multihead_self_attention_verify(self, input, - embed_dim, num_heads, num_kv_heads, + embed_dim, num_heads, kdim=0, vdim=0, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, - and returns the dot-product attention between them:. + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. :param input: the input Tensor. :type input: Tensor @@ -2297,7 +2297,220 @@ def inc_multihead_self_attention_verify(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc.flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multiquery_self_attention(self, input, + embed_dim, num_q_heads, num_kv_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + data_type=DataType.DT_NONE, kernel_initializer=None, + apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, + qk_prod_scaling=True, name=None): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. 
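[Editor's illustrative sketch] As a usage sketch of the renamed bindings: the fragment below assumes an existing FFModel called ffmodel, an input tensor attn_norm produced by a preceding layer, and made-up dimensions; the positional arguments mirror the calls in python/flexflow/serve/models/*.py later in this patch.

# Hypothetical dimensions; only the call shapes matter here.
hidden_size, n_q_heads, n_kv_heads = 4096, 32, 8
head_dim = hidden_size // n_q_heads

# Multi-head variant: a single head count shared by Q, K and V.
mha = ffmodel.inc_multihead_self_attention(
    attn_norm,            # input
    hidden_size,          # embed_dim
    n_q_heads,            # num_heads
    head_dim,             # kdim
    head_dim,             # vdim
    0.0,                  # dropout
    False,                # bias
    False,                # add_bias_kv
    False,                # add_zero_attn
    DataType.DT_NONE,     # data_type (inherit from the input tensor)
    None,                 # kernel_initializer
    True,                 # apply_rotary_embedding
    name="layers_0_attention_weight",
)

# Multi-query variant: Q and K/V head counts may differ (e.g. Falcon).
mqa = ffmodel.inc_multiquery_self_attention(
    attn_norm, hidden_size, n_q_heads, n_kv_heads,
    head_dim, head_dim, 0.0, False, False, False,
    DataType.DT_NONE, None, True,
    name="layers_0_attention_weight",
)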
+ + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc.flexflow_model_add_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multiquery_self_attention(self, input, + embed_dim, num_q_heads, num_kv_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + data_type=DataType.DT_NONE, kernel_initializer=None, + apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, + qk_prod_scaling=True, name=None): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. 
Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc.flexflow_model_add_spec_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multiquery_self_attention_verify(self, input, + embed_dim, num_q_heads, num_kv_heads, + kdim=0, vdim=0, dropout=0.0, + bias=True, add_bias_kv=False, add_zero_attn=False, + data_type=DataType.DT_NONE, kernel_initializer=None, + apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, + qk_prod_scaling=True, name=None): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. 
+ :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc.flexflow_model_add_inc_multiquery_self_attention_verify(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) @@ -3035,9 +3248,9 @@ def init_operators_inference(self, model): class FileDataLoader(object): __slots__ = ['handle', '_handle'] - def __init__(self, weight_file_path, num_heads, hidden_dim, qkv_inner_dim): + def __init__(self, weight_file_path, num_q_heads, num_kv_heads, hidden_dim, qkv_inner_dim, tensor_parallelism_degree): c_weight_file_path = get_c_name(weight_file_path) - self.handle = ffc.flexflow_file_data_loader_create(c_weight_file_path, num_heads, hidden_dim, qkv_inner_dim) + self.handle = ffc.flexflow_file_data_loader_create(c_weight_file_path, num_q_heads, num_kv_heads, hidden_dim, qkv_inner_dim, tensor_parallelism_degree) self._handle = ffi.gc(self.handle, ffc.flexflow_file_data_loader_destroy) def load_weights(self, model, model_layers_with_weights, data_type): diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 4c28146d82..8b054f0120 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json, sys +import json, sys, os from typing import Union from ..type import * @@ -76,6 +76,9 @@ def init(configs: Union[str, dict]): "configs should be a dictionary or the path to a valid JSON file" ) + # Remove the arguments to avoid interferences + sys.argv = [sys.argv[0]] + # configs should contain the following mandatory keys with non-zero integer values: num_gpus = configs_dict.get("num_gpus") memory_per_gpu = configs_dict.get("memory_per_gpu") @@ -140,3 +143,16 @@ def init(configs: Union[str, dict]): global LLM, SSM, SamplingConfig from .serve import LLM, SSM, SamplingConfig + + +def init_cpu(): + """Start the FlexFlow runtime and import the inference package without access to GPU functionalities. + This is useful to access the utilies from the flexflow package without using up GPU memory. 
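[Editor's illustrative sketch] A short usage sketch of the new CPU-only entry point; the model name is illustrative, and the download helpers are the ones defined in python/flexflow/serve/serve.py further down in this patch.

import flexflow.serve as ff

# CPU-only start-up: no "-ll:gpu" arguments are injected and no GPU memory is
# reserved, so the cache/convert utilities can run on a machine without GPUs.
ff.init_cpu()

llm = ff.LLM("JackFram/llama-160m")      # illustrative model name
llm.download_hf_config()                 # save config.json into the cache
llm.download_hf_weights_if_needed()      # download + convert weights if missing or stale
llm.download_hf_tokenizer_if_needed()    # download tokenizer files if missing or stale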
+ """ + # Remove the arguments to avoid interferences + sys.argv = [sys.argv[0]] + # Ask the runtime to avoid using GPU/GPU memory + os.environ["CPU_ONLY_TEST"] = "1" + + global LLM, SSM, SamplingConfig + from .serve import LLM, SSM, SamplingConfig diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 81f80474dd..5cb5443f42 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -14,7 +14,7 @@ from flexflow.core import * from .base import FlexFlowModel -import random, shutil +import random, torch class FalconConfig: @@ -28,6 +28,7 @@ def __init__(self, hf_config): self.layer_norm_epsilon = hf_config.layer_norm_epsilon self.multi_query = hf_config.multi_query self.n_head = hf_config.n_head + self.n_head_kv = hf_config.n_head_kv if "n_head_kv" in hf_config.__dict__ else 1 self.n_layer = hf_config.n_layer self.parallel_attn = hf_config.parallel_attn self.vocab_size = hf_config.vocab_size @@ -59,6 +60,27 @@ def __init__( self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + # Sanity checks + if self.falcon_config.hidden_size % self.falcon_config.n_head != 0: + raise ValueError( + f"Hidden size ({self.falcon_config.hidden_size}) is not divisible by n_head ({self.falcon_config.n_head})" + ) + if ( + self.falcon_config.n_head < self.ffconfig.tensor_parallelism_degree + or self.falcon_config.n_head % self.ffconfig.tensor_parallelism_degree != 0 + ): + raise ValueError( + f"Number of q attention heads ({self.falcon_config.n_head}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" + ) + if ( + self.falcon_config.n_head_kv < self.ffconfig.tensor_parallelism_degree + or self.falcon_config.n_head_kv % self.ffconfig.tensor_parallelism_degree + != 0 + ): + raise ValueError( + f"Number of k/v attention heads ({self.falcon_config.n_head_kv}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" + ) + self.build_model() def build_model(self): @@ -93,12 +115,29 @@ def build_model(self): name=f"layers_{i}_input_layernorm_weight", ) - if self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multihead_self_attention( + if self.mode == InferenceMode.BEAM_SEARCH_MODE: + mha = ffmodel.spec_inc_multiquery_self_attention( + att_norm, + self.falcon_config.hidden_size, + self.falcon_config.n_head, + self.falcon_config.n_head_kv, + self.falcon_config.hidden_size // self.falcon_config.n_head, + self.falcon_config.hidden_size // self.falcon_config.n_head, + 0.0, # dropout + False, # bias + False, # add_bias_kv + False, # add_zero_attn + DataType.DT_NONE, # data_type + None, # kernel initializer + True, # apply_rotary_embedding + name=f"layers_{i}_attention_weight", + ) + elif self.mode == InferenceMode.TREE_VERIFY_MODE: + mha = ffmodel.inc_multiquery_self_attention_verify( att_norm, self.falcon_config.hidden_size, self.falcon_config.n_head, - 1, + self.falcon_config.n_head_kv, self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout @@ -107,7 +146,25 @@ def build_model(self): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - name=f"layers_{i}_self_attention_dense_weight", + True, # apply_rotary_embedding + name=f"layers_{i}_attention_weight", + ) + elif self.mode == InferenceMode.INC_DECODING_MODE: + mha = ffmodel.inc_multiquery_self_attention( + att_norm, + self.falcon_config.hidden_size, + 
self.falcon_config.n_head, + self.falcon_config.n_head_kv, + self.falcon_config.hidden_size // self.falcon_config.n_head, + self.falcon_config.hidden_size // self.falcon_config.n_head, + 0.0, # dropout + False, # bias + False, # add_bias_kv + False, # add_zero_attn + DataType.DT_NONE, # data_type + None, # kernel initializer + True, # apply_rotary_embedding + name=f"layers_{i}_attention_weight", ) else: assert False @@ -166,12 +223,30 @@ def convert_hf_model(model, dst_folder): name.replace(".", "_") .replace("transformer_h_", "layers_") .replace("transformer_", "") + .replace("self_attention_dense", "attention_wo") ) - params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") - # copy embedding weights - shutil.copy( - os.path.join(dst_folder, "word_embeddings_weight"), - os.path.join(dst_folder, "lm_head_weight"), + # Split Q,K,V attention weights + if "self_attention_query_key_value" in name: + name_q = name.replace("self_attention_query_key_value", "attention_wq") + name_k = name.replace("self_attention_query_key_value", "attention_wk") + name_v = name.replace("self_attention_query_key_value", "attention_wv") + q, k, v = torch.split( + params, + [ + model.config.hidden_size, + model.config.hidden_size // model.config.n_head, + model.config.hidden_size // model.config.n_head, + ], + 0, + ) + q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) + k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) + v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) + else: + params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) + # LM head weight + model.lm_head.weight.detach().cpu().numpy().tofile( + os.path.join(dst_folder, "lm_head_weight") ) def get_layers_with_weights(self): @@ -184,7 +259,7 @@ def get_layers_with_weights(self): for i in range(self.falcon_config.n_layer) for expr in ( f"layers_{i}_input_layernorm_weight", - f"layers_{i}_self_attention_dense_weight", + f"layers_{i}_attention_weight", f"layers_{i}_mlp_dense_h_to_4h_weight", f"layers_{i}_mlp_dense_4h_to_h_weight", ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 3c83905d61..cb707f3e57 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -57,6 +57,24 @@ def __init__( self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + # Sanity checks + if self.llama_config.hidden_size % self.llama_config.num_attention_heads != 0: + raise ValueError( + f"Hidden size ({self.llama_config.hidden_size}) is not divisible by number of attention heads ({self.llama_config.num_attention_heads})" + ) + + # Sanity checks + if ( + self.llama_config.num_attention_heads + < self.ffconfig.tensor_parallelism_degree + or self.llama_config.num_attention_heads + % self.ffconfig.tensor_parallelism_degree + != 0 + ): + raise ValueError( + f"Number of attention heads ({self.llama_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" + ) + self.build_model() def build_model(self): @@ -88,11 +106,10 @@ def build_model(self): ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: - mha = ffmodel.spec_inc_multihead_attention( + mha = ffmodel.spec_inc_multihead_self_attention( attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, - self.llama_config.num_attention_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size @@ -111,7 +128,6 @@ def 
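[Editor's illustrative sketch] For concreteness, the fused-QKV split introduced in convert_hf_model above yields, for a Falcon-7B-like configuration (hidden_size 4544, n_head 71, n_head_kv 1), one full-width query block plus a single-head key block and a single-head value block:

import torch

hidden_size, n_head = 4544, 71          # Falcon-7B-like numbers; head_dim = 64
head_dim = hidden_size // n_head

# Fused query_key_value projection: all Q heads plus one shared K and one shared V head.
fused_qkv = torch.empty(hidden_size + 2 * head_dim, hidden_size)

q, k, v = torch.split(fused_qkv, [hidden_size, head_dim, head_dim], dim=0)
print(q.shape, k.shape, v.shape)
# torch.Size([4544, 4544]) torch.Size([64, 4544]) torch.Size([64, 4544])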
build_model(self): attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, - self.llama_config.num_attention_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size @@ -126,11 +142,10 @@ def build_model(self): name=f"layers_{i}_attention_weight", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multihead_attention( + mha = ffmodel.inc_multihead_self_attention( attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, - self.llama_config.num_attention_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index deb7a304ff..7e38b5de85 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -62,6 +62,24 @@ def __init__( self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + # Sanity checks + if self.opt_config.hidden_size % self.opt_config.num_attention_heads != 0: + raise ValueError( + f"Hidden size ({self.opt_config.hidden_size}) is not divisible by n_head ({self.opt_config.num_attention_heads})" + ) + + # Sanity checks + if ( + self.opt_config.num_attention_heads + < self.ffconfig.tensor_parallelism_degree + or self.opt_config.num_attention_heads + % self.ffconfig.tensor_parallelism_degree + != 0 + ): + raise ValueError( + f"Number of attention heads ({self.opt_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" + ) + self.build_model() def build_model(self): @@ -114,11 +132,10 @@ def build_model(self): hidden_states = residual if self.mode == InferenceMode.BEAM_SEARCH_MODE: - mha = ffmodel.spec_inc_multihead_attention( + mha = ffmodel.spec_inc_multihead_self_attention( hidden_states, self.opt_config.hidden_size, self.opt_config.num_attention_heads, - self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout @@ -139,7 +156,6 @@ def build_model(self): hidden_states, self.opt_config.hidden_size, self.opt_config.num_attention_heads, - self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout @@ -156,11 +172,10 @@ def build_model(self): name=f"layers_{i}_attention_weight", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multihead_attention( + mha = ffmodel.inc_multihead_self_attention( hidden_states, self.opt_config.hidden_size, self.opt_config.num_attention_heads, - self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index a6723b38a2..bf2dcc84de 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -54,9 +54,8 @@ def __init__( self, model_name: str, data_type: DataType = DataType.DT_HALF, - tokenizer_path: str = "", - weights_path: str = "", - clean_cache: bool = False, + cache_path: str = "", + refresh_cache: bool = False, output_file: str = "", ): """Create the LLM object @@ -65,12 +64,10 @@ def __init__( :type model_name: str :param data_type: The data type to use for the tensors 
(e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF :type data_type: DataType, optional - :param tokenizer_path: Path to the tokenizer file or folder for the LLM. If left blank, FlexFlow will download (and cache) the relevant tokenizer from HuggingFace, defaults to "" + :param cache_path: Path to the folder (which will be created if it does not yet exist) to use for the FlexFlow weights/tokenizers cache, defaults to "~/.cache/flexflow" :type tokenizer_path: str, optional - :param weights_path: Path to the weights for the LLM. If left blank, FlexFlow will download (and cache) the weights from HuggingFace, defaults to "" - :type weights_path: str, optional - :param clean_cache: Use this flag to discard previous weights/tokenizer cache for this LLM, defaults to False - :type clean_cache: bool, optional + :param refresh_cache: Use this flag to force the refresh of the model's weights/tokenizer cache, defaults to False + :type refresh_cache: bool, optional :param output_file: Path to the output file. If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ @@ -81,14 +78,13 @@ def __init__( "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon), } self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + self.model_name = self.hf_config._name_or_path self.model_type, self.model_class = self.__get_ff_model_type() self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT - self.tokenizer_path = tokenizer_path - self.weights_path = weights_path - self.clean_cache = clean_cache + self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" + self.refresh_cache = refresh_cache self.output_file = output_file - self.ffconfig = FFConfig() def __get_ff_model_type(self): architectures = getattr(self.hf_config, "architectures", []) @@ -102,14 +98,40 @@ def __get_ff_model_type(self): sys.exit(1) return ff_arch - def __download_hf_weights(self): + def download_hf_config(self): + """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" + self.config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", self.model_name.lower() + ) + self.config_path = os.path.join(self.config_dir, "config.json") + os.makedirs(self.config_dir, exist_ok=True) + print(f"Creating directory {self.config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {self.config_path}...") + self.hf_config.to_json_file(self.config_path) + + def download_hf_weights_if_needed(self): + """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights. + """ + if self.data_type == DataType.DT_HALF: + torch.set_default_tensor_type(torch.HalfTensor) + elif self.data_type == DataType.DT_FLOAT: + torch.set_default_tensor_type(torch.FloatTensor) + else: + assert False, "Data type not yet supported -- cannot download weights!" 
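[Editor's illustrative sketch] The reorganized cache keys every artifact on the cache root, the lower-cased model name and, for weights, the precision. A minimal sketch of the resulting layout, mirroring the path construction in download_hf_config, download_hf_weights_if_needed and the tokenizer logic below (model name and flags are illustrative):

import os

cache_path = "~/.cache/flexflow"               # default; override via LLM(cache_path=...)
model_name = "decapoda-research/llama-7b-hf"   # illustrative
full_precision = False

weights_path = os.path.join(
    os.path.expanduser(cache_path), "weights", model_name.lower(),
    "full-precision" if full_precision else "half-precision",
)
config_path = os.path.join(
    os.path.expanduser(cache_path), "configs", model_name.lower(), "config.json"
)
tokenizer_path = os.path.join(
    os.path.expanduser(cache_path), "tokenizers", model_name.lower()
)
# With "~" expanded to the home directory:
# weights_path   -> ~/.cache/flexflow/weights/decapoda-research/llama-7b-hf/half-precision
# config_path    -> ~/.cache/flexflow/configs/decapoda-research/llama-7b-hf/config.json
# tokenizer_path -> ~/.cache/flexflow/tokenizers/decapoda-research/llama-7b-hf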
+ # Use local cache, or download new version - self.weights_path = os.path.expanduser( - f"~/.cache/flexflow/models/{self.hf_config._name_or_path}/{'full-precision' if self.data_type == DataType.DT_FLOAT else 'half-precision'}" + self.weights_path = os.path.join( + os.path.expanduser(self.cache_path), + "weights", + self.model_name.lower(), + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision", ) - if self.clean_cache: + if self.refresh_cache: print( - f"Discarding cached weights (if they exist) for model {self.hf_config._name_or_path}..." + f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." ) if os.path.exists(self.weights_path): shutil.rmtree(self.weights_path) @@ -122,15 +144,15 @@ def __download_hf_weights(self): if os.path.exists(local_revision_file): local_revision = "".join(open(local_revision_file).read().split()) hf_api = HfApi() - latest_revision = hf_api.model_info(self.hf_config._name_or_path).sha + latest_revision = hf_api.model_info(self.model_name).sha # Download if needed if local_revision != latest_revision: print( - f"'{self.hf_config._name_or_path}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." ) hf_model = AutoModelForCausalLM.from_pretrained( - self.hf_config._name_or_path, trust_remote_code=True + self.model_name, trust_remote_code=True ) print("Done downloading HF weights. Converting them now...") self.model_class.convert_hf_model(hf_model, self.weights_path) @@ -138,33 +160,23 @@ def __download_hf_weights(self): f.write(latest_revision) print("Done converting the weights...") else: - print( - f"Loading '{self.hf_config._name_or_path}' model weights from the cache..." - ) + print(f"Loading '{self.model_name}' model weights from the cache...") - def __load_hf_tokenizer(self): + def download_hf_tokenizer_if_needed(self): + """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new tokenizer files. + """ print("Loading tokenizer...") - if len(self.tokenizer_path) > 0: - print(f"Using tokenizer from {self.tokenizer_path}") - # check that tokenizer exist - if not os.path.exists(self.tokenizer_path): - raise FileNotFoundError(f"Path {self.tokenizer_path} does not exist") - elif ( - os.path.isdir(self.tokenizer_path) - and len(os.listdir(self.tokenizer_path)) == 0 - ): - raise FileNotFoundError(f"Folder {self.tokenizer_path} is empty") - return - - # Download tokenizer # Use local cache, or download new version - self.tokenizer_path = os.path.expanduser( - f"~/.cache/flexflow/tokenizers/{self.hf_config._name_or_path}/" + self.tokenizer_path = os.path.join( + os.path.expanduser(self.cache_path), + "tokenizers", + self.model_name.lower(), ) - if self.clean_cache: + if self.refresh_cache: print( - f"Discarding cached tokenizer files (if they exist) for model {self.hf_config._name_or_path}..." + f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." 
) if os.path.exists(self.tokenizer_path): shutil.rmtree(self.tokenizer_path) @@ -178,58 +190,47 @@ def __load_hf_tokenizer(self): if os.path.exists(local_revision_file): local_revision = "".join(open(local_revision_file).read().split()) hf_api = HfApi() - latest_revision = hf_api.model_info(self.hf_config._name_or_path).sha + latest_revision = hf_api.model_info(self.model_name).sha # Download if needed if local_revision != latest_revision: print( - f"'{self.hf_config._name_or_path}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." + f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." ) if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( - self.hf_config._name_or_path, use_fast=True + self.model_name, use_fast=True ) else: - hf_tokenizer = AutoTokenizer.from_pretrained( - self.hf_config._name_or_path - ) + hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) hf_tokenizer.save_pretrained(self.tokenizer_path) print("Done downloading HF tokenizer.") with open(local_revision_file, "w+") as f: f.write(latest_revision) print("Loading the tokenizer...") else: - print( - f"Loading '{self.hf_config._name_or_path}' tokenizer from the cache..." - ) + print(f"Loading '{self.model_name}' tokenizer from the cache...") def __load_hf_weights(self): print("Loading hf weights...") - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - - if len(self.weights_path) > 0: - print(f"Using weights from {self.weights_path}") - # check that weights exist - if not os.path.exists(self.weights_path) or not os.path.isdir( - self.weights_path - ): - raise FileNotFoundError( - f"Path {self.weights_path} does not exist or is not a directory" - ) - elif len(os.listdir(self.weights_path)) == 0: - raise FileNotFoundError(f"Folder {self.weights_path} is empty") - else: - self.__download_hf_weights() + self.download_hf_weights_if_needed() # Create file data loader, load weights into tensors + if self.model_type == ModelType.FALCON: + n_q_heads = self.hf_config.num_attention_heads + if "n_head_kv" in self.hf_config.__dict__: + n_kv_heads = self.hf_config.n_head_kv + else: + n_kv_heads = 1 + else: + n_q_heads = n_kv_heads = self.hf_config.num_attention_heads self.fileloader = FileDataLoader( self.weights_path, - self.hf_config.num_attention_heads, - self.hf_config.num_attention_heads, + n_q_heads, + n_kv_heads, self.hf_config.hidden_size, - self.hf_config.hidden_size // self.hf_config.num_attention_heads, + self.hf_config.hidden_size // n_q_heads, self.ffconfig.tensor_parallelism_degree, ) @@ -245,6 +246,9 @@ def compile( max_batch_size: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + model_specific_data_parallelism_degree: int = None, + model_specific_tensor_parallelism_degree: int = None, + model_specific_pipeline_parallelism_degree: int = None, ssms: list = [], ): """Compile the LLM for inference and load the weights into memory @@ -259,6 +263,12 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None + :type model_specific_data_parallelism_degree: int, optional + :param model_specific_tensor_parallelism_degree: 
Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None + :type model_specific_tensor_parallelism_degree: int, optional + :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the LLM a different pipeline parallelism degree than the one used to initialize the runtime, defaults to None + :type model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -267,11 +277,26 @@ def compile( self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.sampling_config = SamplingConfig() + self.ffconfig = FFConfig() assert ( mode == InferenceMode.INC_DECODING_MODE or mode == InferenceMode.BEAM_SEARCH_MODE ) == (len(ssms) == 0) + # Apply model-specific parallelism degrees, if needed + if model_specific_data_parallelism_degree: + self.ffconfig.data_parallelism_degree = ( + model_specific_data_parallelism_degree + ) + if model_specific_tensor_parallelism_degree: + self.ffconfig.tensor_parallelism_degree = ( + model_specific_tensor_parallelism_degree + ) + if model_specific_pipeline_parallelism_degree: + self.ffconfig.pipeline_parallelism_degree = ( + model_specific_pipeline_parallelism_degree + ) + # Instantiate the relevant model self.model = self.model_class( mode, @@ -290,7 +315,7 @@ def compile( # Download the weights and tokenizer from huggingface (if needed) and load them self.__load_hf_weights() - self.__load_hf_tokenizer() + self.download_hf_tokenizer_if_needed() # Create request manager self.rm = RequestManager() @@ -329,9 +354,8 @@ def __init__( self, model_name: str, data_type: DataType = DataType.DT_HALF, - tokenizer_path: str = "", - weights_path: str = "", - clean_cache: bool = False, + cache_path: str = "~/.cache/flexflow", + refresh_cache: bool = False, output_file: str = "", ): """Create the SSM object @@ -340,23 +364,62 @@ def __init__( :type model_name: str :param data_type: The data type to use for the tensors (e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF :type data_type: DataType, optional - :param tokenizer_path: Path to the tokenizer file or folder for the LLM. If left blank, FlexFlow will download (and cache) the relevant tokenizer from HuggingFace, defaults to "" + :param cache_path: Path to the folder (which will be created if it does not yet exist) to use for the FlexFlow weights/tokenizers cache, defaults to "~/.cache/flexflow" :type tokenizer_path: str, optional - :param weights_path: Path to the weights for the LLM. If left blank, FlexFlow will download (and cache) the weights from HuggingFace, defaults to "" - :type weights_path: str, optional - :param clean_cache: Use this flag to discard previous weights/tokenizer cache for this LLM, defaults to False - :type clean_cache: bool, optional + :param refresh_cache: Use this flag to force the refresh of the model's weights/tokenizer cache, defaults to False + :type refresh_cache: bool, optional :param output_file: Path to the output file. 
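[Editor's illustrative sketch] Since FFConfig is now created inside compile() rather than in the constructor, the model-specific parallelism degrees documented above are passed at compile time. A hedged usage sketch (model name, GPU count and memory size are illustrative, and init() may require further mandatory keys beyond the two shown):

import flexflow.serve as ff

ff.init({"num_gpus": 4, "memory_per_gpu": 14000})   # plus any other mandatory keys

llm = ff.LLM(
    "decapoda-research/llama-7b-hf",   # illustrative model name
    cache_path="~/.cache/flexflow",
    refresh_cache=False,
)
llm.compile(
    max_batch_size=1,
    max_seq_length=256,
    max_tokens_per_batch=64,
    # Override parallelism for this model only; omit to inherit the runtime-wide degrees.
    model_specific_tensor_parallelism_degree=2,
    model_specific_pipeline_parallelism_degree=2,
)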
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ super().__init__( model_name, data_type, - tokenizer_path, - weights_path, - clean_cache, + cache_path, + refresh_cache, output_file, ) - self.ffconfig.data_parallelism_degree = 1 - self.ffconfig.tensor_parallelism_degree = 1 - self.ffconfig.pipeline_parallelism_degree = 1 + + def compile( + self, + mode: InferenceMode = InferenceMode.INC_DECODING_MODE, + sampling_config: SamplingConfig = SamplingConfig(), + max_batch_size: int = 1, + max_seq_length: int = 256, + max_tokens_per_batch: int = 64, + model_specific_data_parallelism_degree: int = 1, + model_specific_tensor_parallelism_degree: int = 1, + model_specific_pipeline_parallelism_degree: int = 1, + ssms: list = [], + ): + """Compile the SSM for inference and load the weights into memory + + :param mode: The SSM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE + :type mode: InferenceMode, optional + :param sampling_config: The SamplingConfig object with the configurations to use for sampling, defaults to SamplingConfig() + :type sampling_config: SamplingConfig, optional + :param max_batch_size: The maximum batch size to allow, defaults to 1 + :type max_batch_size: int, optional + :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 + :type max_seq_length: int, optional + :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 + :type max_tokens_per_batch: int, optional + :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 + :type model_specific_data_parallelism_degree: int, optional + :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 + :type model_specific_tensor_parallelism_degree: int, optional + :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the SSM a different pipeline parallelism degree than the default one, defaults to 1 + :type model_specific_pipeline_parallelism_degree: int, optional + :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] + :type ssms: list, optional + """ + super().compile( + mode, + sampling_config, + max_batch_size, + max_seq_length, + max_tokens_per_batch, + model_specific_data_parallelism_degree, + model_specific_tensor_parallelism_degree, + model_specific_pipeline_parallelism_degree, + ssms, + ) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 59f9046c57..e84cd5db67 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1049,7 +1049,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_inc_multihead_attention( +flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -1090,7 +1090,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_attention( return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_spec_inc_multihead_attention( +flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( 
flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -1174,7 +1174,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_inc_multiquery_attention( +flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -1217,7 +1217,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_attention( return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_attention( +flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -2430,21 +2430,21 @@ void flexflow_inference_manager_init_operators_inference( flexflow_file_data_loader_t flexflow_file_data_loader_create(char const *weight_file_path, - int num_heads, + int num_q_heads, int num_kv_heads, int hidden_dim, int qkv_inner_dim, - int tensor_partition_num) { + int tensor_parallelism_degree) { assert(weight_file_path != nullptr && "Cannot convert nullptr char * to std::string"); std::string const weight_file_path_str(weight_file_path); FileDataLoader *handle = new FileDataLoader("", weight_file_path_str, - num_heads, + num_q_heads, num_kv_heads, hidden_dim, qkv_inner_dim, - tensor_partition_num); + tensor_parallelism_degree); DEBUG_PRINT("[FileDataLoader] new %p", handle); return FFCObjectWrapper::wrap(handle); } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 02853bbf09..b834073064 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -379,6 +379,7 @@ __host__ void FusedOp::forward_task(Task const *task, effective_batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: @@ -778,6 +779,7 @@ __host__ void effective_batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: @@ -1323,6 +1325,7 @@ __host__ void FusedOp::backward_task(Task const *task, batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 39dee64ff1..bd9079ec0c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -349,90 +349,6 @@ FutureMap InferenceManager::inference(FFModel *model, return fm; }; -void InferenceManager::incr_decoding_loop(FFModel *model, - RequestManager &rm, - int total_num_requests) { - BatchConfig bc; - InferenceResult ir; - while (rm.get_num_processed_requests() < total_num_requests) { - bc = rm.prepare_next_batch(bc, ir); - if (rm.get_num_processed_requests() >= total_num_requests) { - break; - } - FutureMap fm = inference(model, 0, bc); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - ir = future.get_result(); - // assert(false); - } -} - -void InferenceManager::spec_inference_loop(FFModel *model, - RequestManager &rm, - int total_num_requests, - std::vector ssm_model_ids) { - TreeVerifyBatchConfig tree_bc; - BeamSearchBatchConfig beam_bc; - std::vector beam_bc_vec; - int num_ssms = ssm_model_ids.size(); - for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { - beam_bc_vec.push_back(BeamSearchBatchConfig(ssm_model_ids[ssm_id])); - } - - InferenceResult tree_ir; - - while (rm.get_num_processed_requests() < total_num_requests) { - int depth = 0; - // Beam Search - beam_bc = rm.prepare_next_batch_init(tree_bc, tree_ir, 0); - for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { - 
beam_bc_vec[ssm_id] = beam_bc; - beam_bc_vec[ssm_id].model_id = ssm_id; - } - - if (rm.get_num_processed_requests() >= total_num_requests) { - break; - } - - for (int i = 0; i < num_ssms; i++) { - while (true) { - beam_bc = beam_bc_vec[i]; - depth = beam_bc.beamRequestsInfo[0].current_depth; - - FutureMap fm = inference(rm.get_model(0), 0, beam_bc_vec[i]); - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - BeamInferenceResult beam_ir = future.get_result(); - - int iteration = - std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, - BatchConfig::MAX_SEQ_LENGTH - beam_bc.max_init_length); - - if (depth - 1 >= iteration) { - break; - } else { - beam_bc_vec[i] = rm.prepare_next_batch_beam(beam_bc_vec[i], beam_ir); - if (beam_bc_vec[i].num_active_tokens() == 0 && - beam_bc_vec[i].num_active_requests() != 0) { - break; - } - } - } - std::cout << "----------beam search finished for model " - << beam_bc_vec[i].model_id << "------------" << std::endl; - } - // Token Tree Verification - { - tree_bc = rm.prepare_next_batch_verify(beam_bc_vec); - FutureMap fm = inference(model, 0, tree_bc); - - assert(fm.get_future_map_domain().get_volume() == 1); - Future future = fm.get_future(0); - tree_ir = future.get_result(); - } - } -} - void InferenceManager::load_input_tokens_from_batch_config( BatchConfigFuture const &bc, ParallelTensor const input) { Context ctx = ff_config.lg_ctx; @@ -761,4 +677,24 @@ void FFModel::compile_inference() { } #endif } + +std::string join_path(std::vector const &paths) { + std::string joined; + for (auto const &path : paths) { + if (joined.empty()) { + joined = path; + } else { + if (path[0] == '/') { + joined = path; + } else if (joined.back() != '/') { + joined += '/'; + joined += path; + } else { + joined += path; + } + } + } + return joined; +} + }; // namespace FlexFlow diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2041bdd9a7..a49f827482 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -100,7 +100,9 @@ void RequestManager::register_tokenizer(ModelType type, this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); } else if (model_type == ModelType::FALCON) { - this->tokenizer_ = Tokenizer::FromBlobJSON(LoadBytesFromFile(path)); + std::string falcon_tokenizer_path = join_path({path, "tokenizer.json"}); + this->tokenizer_ = + Tokenizer::FromBlobJSON(LoadBytesFromFile(falcon_tokenizer_path)); } } diff --git a/tests/.gitignore b/tests/.gitignore index e69de29bb2..f3732d54f4 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -0,0 +1 @@ +inference/python_test_configs/*.json diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index dc8cc1f78a..fba42538ef 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -2,61 +2,34 @@ set -x set -e -cleanup() { - rm -rf ../../inference/prompt ../../inference/weights ../../inference/tokenizer ../../inference/output -} - # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -# Enable model parallelism tests, if desired -TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} - -# Clean up before test (just in case) -cleanup - -# Update the transformers library to support the LLAMA model - -pip3 install --upgrade transformers sentencepiece - -# Download the weights in both half and full precision -python3 ../../inference/utils/download_llama_weights.py -python3 ../../inference/utils/download_llama_weights.py 
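[Editor's illustrative sketch] The join_path helper added to src/runtime/inference_manager.cc above inserts a single '/' between relative components and lets an absolute component restart the path, i.e. it follows the same rule as Python's os.path.join; a small illustration of that rule (inputs are illustrative):

import os

print(os.path.join("~/.cache/flexflow/tokenizers", "tokenizer.json"))
# ~/.cache/flexflow/tokenizers/tokenizer.json  (the Falcon tokenizer lookup above)
print(os.path.join("some/relative/dir", "/an/absolute/path"))
# /an/absolute/path  -- a later absolute component discards what came before it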
--use-full-precision -python3 ../../inference/utils/download_opt_weights.py -python3 ../../inference/utils/download_opt_weights.py --use-full-precision - -# Create test prompt file -mkdir -p ../../inference/prompt -echo '["Give three tips for staying healthy."]' > ../../inference/prompt/test.json - -# Create output folder -mkdir -p ../../inference/output - ############################################################################################### ############################ Speculative inference tests ###################################### ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights/ -llm-config ../../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../../inference/weights/llama_160M_weights/ -ssm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights_half/ -llm-config ../../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../../inference/weights/llama_160M_weights_half/ -ssm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights/ -llm-config ../../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../../inference/weights/opt_125M_weights/ -ssm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 # OPT (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights_half/ -llm-config ../../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight 
../../inference/weights/opt_125M_weights_half/ -ssm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights/ -llm-config ../../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../../inference/weights/llama_160M_weights/ -ssm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights_half/ -llm-config ../../inference/models/configs/llama_7B.json -ssm-model llama -ssm-weight ../../inference/weights/llama_160M_weights_half/ -ssm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights/ -llm-config ../../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../../inference/weights/opt_125M_weights/ -ssm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 
-llm-model opt -llm-weight ../../inference/weights/opt_6B_weights_half/ -llm-config ../../inference/models/configs/opt_6B.json -ssm-model opt -ssm-weight ../../inference/weights/opt_125M_weights_half/ -ssm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -64,50 +37,55 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights_half/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights/ -llm-config ../../inference/models/configs/llama_7B.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights_half/ -llm-config ../../inference/models/configs/llama_7B.json -tokenizer 
../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 # OPT (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights_half/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 # OPT (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights/ -llm-config ../../inference/models/configs/opt_6B.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 # OPT (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights_half/ -llm-config ../../inference/models/configs/opt_6B.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 + +# Falcon (full precision) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# Falcon (half precision) +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights_half/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_160M_weights_half/ -llm-config ../../inference/models/configs/llama_160M.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights/ -llm-config ../../inference/models/configs/llama_7B.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight ../../inference/weights/llama_7B_weights_half/ -llm-config ../../inference/models/configs/llama_7B.json -tokenizer ../../inference/tokenizer/tokenizer.model -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights_half/ -llm-config ../../inference/models/configs/opt_125M.json 
-tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_125M_weights_half/ -llm-config ../../inference/models/configs/opt_125M.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --use-full-precision -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights/ -llm-config ../../inference/models/configs/opt_6B.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model opt -llm-weight ../../inference/weights/opt_6B_weights_half/ -llm-config ../../inference/models/configs/opt_6B.json -tokenizer ../../inference/tokenizer/ -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -231,7 +209,6 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then fi ######################### Alignment tests with HuggingFace #################################### -pip3 install protobuf==3.20.3 # LLAMA (small model, full precision) python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu @@ -266,10 +243,3 @@ diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n + diff <(tail -n +2 
"../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) #diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt") #diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt") - -############################################################################################### -###################################### Cleanup ################################################ -############################################################################################### - -# Clean up after test -# cleanup diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh new file mode 100755 index 0000000000..800c0ad043 --- /dev/null +++ b/tests/inference/python_inference_tests.sh @@ -0,0 +1,191 @@ +#! /usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Generate test configs +python python_test_configs/generate_configs.py + +# Run all tests +# Loop through .json files in the ./python_test_configs dir +for file in ./python_test_configs/*.json; do + # Check filename prefix + if [[ $file == *"incr_dec"* ]]; then + script="../../inference/python/incr_decoding.py" + elif [[ $file == *"spec_infer"* ]]; then + script="../../inference/python/spec_infer.py" + fi + # Run script + python "$script" -config-file "$file" +done + + +############################################################################################### +############################### Alignment and Speed tests ##################################### +############################################################################################### + +##################################### Helper functions ####################################### +function check_partial_token_match { + local file1="$1" + local file2="$2" + local num_tokens_to_match=30 + + # Read the second line of the first file + third_line=$(sed -n '3p' "$file1") + read -r line1 <<< "$third_line" + tokens1=${line1#*: } + IFS=',' read -ra arr1 <<< "$tokens1" + + # Read the second line of the second file + third_line=$(sed -n '3p' "$file2") + read -r line2 <<< "$third_line" + tokens2=${line2#*: } + IFS=',' read -ra arr2 <<< "$tokens2" + + # Compare the first few integers in the two lists + for ((i = 0; i < num_tokens_to_match; i++)); do + if [[ "${arr1[$i]}" != "${arr2[$i]}" ]]; then + echo "The first $num_tokens_to_match tokens in files $file1 and $file2 are not identical." + exit 1 + fi + done + #echo "The first $num_tokens_to_match integers are identical." +} + +function compare_speed_spec_infer_incr_decoding { + local incrDec_file="$1" + local specInf_file="$2" + + # Read the float numbers from the first line of the files + incrDec=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$incrDec_file") + specInf=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$specInf_file") + + if ! command -v bc &> /dev/null; then + echo "bc is not installed. Installing..." + sudo apt-get install -y bc + fi + + # Perform the comparison + threshold=$(bc <<< "$specInf * 1.5") + if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + #echo "The latency in $specInf_file is at least 1.5x smaller than the latency from $incrDec_file." 
+ : + else + echo "Error: The latency in $specInf_file is not at least 1.5x smaller than the latency in $incrDec_file!" + exit 1 + fi +} + +function compare_decoding_steps_spec_infer_incr_decoding { + local incrDec_file="$1" + local specInf_file="$2" + + # Read the number of decoding steps from the second line of the files + second_line=$(sed -n '2p' "$incrDec_file") + read -r line <<< "$second_line" + incrDec=${line#*: } + second_line=$(sed -n '2p' "$specInf_file") + read -r line <<< "$second_line" + specInf=${line#*: } + + if ! command -v bc &> /dev/null; then + echo "bc is not installed. Installing..." + sudo apt-get install -y bc + fi + + # Perform the comparison + threshold=$(bc <<< "$specInf * 1.5") + if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file." + : + else + echo "Error: The decoding steps in $specInf_file are not at least 1.5x less than those in $incrDec_file!" + exit 1 + fi +} + +############ Alignment between speculative inference and incremental decoding ################# +# Full precision +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") +diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +# Half precision +check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" + +# Speed test: speculative inference should be at very least 1.5x faster than incremental decoding +# Full precision +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt" +# Half precision +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" + +############ Alignment between tensor model parallelism and pipeline parallelism only ################# +## Specinfer +# LLAMA +diff <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +# OPT +diff <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +check_partial_token_match 
"../../inference/output/spec_infer-python-opt-6.7b-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" + +## Incremental decoding +# Small LLAMA +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" +# Big LLAMA +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +#diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") +#check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +# Small OPT +diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-opt-125m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-opt-125m-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" +# Big OPT +diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" +#diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" + + +######################### Alignment tests with HuggingFace #################################### + +# LLAMA (small model, full precision) +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu + +# LLAMA (small 
model, half precision) +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu + +# LLAMA (big model, full precision) +python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" + +# LLAMA (big model, half precision) +python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu + +# OPT (small model, full precision) +python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 + +# OPT (small model, half precision) +python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 + +# OPT (big model, full precision) +#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 127 + +# OPT (big model, half precision) +#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 127 + +diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") +diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) + +diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 
"../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py new file mode 100644 index 0000000000..b4c3dd8039 --- /dev/null +++ b/tests/inference/python_test_configs/generate_configs.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +import os, json + +# Base configs dictionaries +ff_init_configs = { + # required parameters + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_gpu": 30000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 4, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "fusion": True, +} +llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": True, + "prompt": "", + "output_file": "", +} +ssm_configs = { + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + }, + ] +} +# Merge dictionaries +ff_init_configs.update(llm_configs) + +# Test parameters to fill in +llama_models = ["decapoda-research/llama-7b-hf", "JackFram/llama-160m"] +opt_models = ["facebook/opt-6.7b", "facebook/opt-125m"] +falcon_models = ["tiiuae/falcon-7b",] +parallelism_settings = [(1,4), (2,2), (4,1)] + +# The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) +prompt_file = "../../inference/prompt/test.json" +output_folder = "../../inference/output" + +# Change working dir to folder storing this script +abspath = os.path.abspath(__file__) +dname = os.path.dirname(abspath) +os.chdir(dname) + + +# Generate incremental decoding configs +all_models = llama_models + opt_models + falcon_models +for model_name in all_models: + for full_precision in (True, False): + for parallelism_degrees in parallelism_settings: + + tp, pp = parallelism_degrees + + # Tensor parallelism not supported by small Falcon model atm + if tp > 1 and "falcon" in model_name: + continue + # skip tp=4 for big models + if tp > 2 and ("7b" in model_name or "6.7b" in model_name): + continue + + _, after_slash = model_name.rsplit("/", maxsplit=1) + filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" + test_configs_file = "./" + filename + ".json" + output_file = os.path.join(output_folder, filename+".txt") + + ff_init_configs["tensor_parallelism_degree"] = tp + ff_init_configs["pipeline_parallelism_degree"] = pp + ff_init_configs["llm_model"] = model_name + ff_init_configs["full_precision"] = full_precision + ff_init_configs["output_file"] = output_file + ff_init_configs["prompt"] = prompt_file + + with open(test_configs_file, "w+") as outfile: + json.dump(ff_init_configs, outfile, indent=4) + +# Generate speculative inference configs +model_pairs = [llama_models, opt_models] +for model_pair in model_pairs: + for full_precision in (True, False): + for parallelism_degrees in parallelism_settings: + big_model, small_model = model_pair + tp, pp = parallelism_degrees + + # Skip fully tp tests + if tp > 2: + continue + + _, after_slash = big_model.rsplit("/", maxsplit=1) + filename = "spec_infer-" + "python-" + 
after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" + test_configs_file = "./" + filename + ".json" + output_file = os.path.join(output_folder, filename+".txt") + + ff_init_configs["tensor_parallelism_degree"] = tp + ff_init_configs["pipeline_parallelism_degree"] = pp + ff_init_configs["llm_model"] = big_model + ff_init_configs["full_precision"] = full_precision + ff_init_configs["output_file"] = output_file + ff_init_configs["prompt"] = prompt_file + + ssm_configs["ssms"][0]["ssm_model"] = small_model + ssm_configs["ssms"][0]["full_precision"] = full_precision + ff_init_configs.update(ssm_configs) + + with open(test_configs_file, "w+") as outfile: + json.dump(ff_init_configs, outfile, indent=4) diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index ca95acc785..b1d45853e2 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -2,8 +2,42 @@ set -x set -e +cleanup() { + rm -rf ../inference/prompt ../inference/output +} + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -# replace this with python tests -./inference/cpp_inference_tests.sh +# Enable Python tests (on by default) +PYTHON_INFERENCE_TESTS=${PYTHON_INFERENCE_TESTS:-ON} +# Enable C++ tests, (off by default) +CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-OFF} +# Enable model parallelism tests in C++, if desired +TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} + +# Clean up before test (just in case) +cleanup + +# Make sure supported version of protobuf is installed +pip3 install protobuf==3.20.3 + +# Download the weights in both half and full precision +python3 ../inference/utils/download_hf_model.py "decapoda-research/llama-7b-hf" "JackFram/llama-160m" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" + +# Create test prompt file +mkdir -p ../inference/prompt +echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json + +# Create output folder +mkdir -p ../inference/output + +if [[ "$PYTHON_INFERENCE_TESTS" == "ON" ]]; then + echo "Running Python inference tests..." + ./inference/python_inference_tests.sh +fi +if [[ "$CPP_INFERENCE_TESTS" == "ON" ]]; then + echo "Running C++ inference tests..." 
+ ./inference/cpp_inference_tests.sh +fi + From bcf14a731c3d29896cc80a6209063c02d2914700 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 13 Aug 2023 17:52:26 +0000 Subject: [PATCH 194/344] merge fix --- python/flexflow/core/__init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 25f4ec7575..5b421a74ed 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -59,15 +59,15 @@ else: print("Using Legion Python") - flexflow_library.initialize() + flexflow_library.initialize() - # check which python binding to use - if flexflow_python_binding() == "pybind11": - print("Using pybind11 flexflow bindings.") - from .flexflow_pybind11 import * - else: - print("Using cffi flexflow bindings.") - from .flexflow_cffi import * + # check which python binding to use + if flexflow_python_binding() == "pybind11": + print("Using pybind11 flexflow bindings.") + from .flexflow_pybind11 import * + else: + print("Using cffi flexflow bindings.") + from .flexflow_cffi import * else: - pass + pass From a78947c7864689b747eb634ee5f60e59e325bbf1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 15 Aug 2023 17:18:51 -0400 Subject: [PATCH 195/344] update tokenizers-cpp repo --- deps/tokenizers-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/tokenizers-cpp b/deps/tokenizers-cpp index c53bc0444d..4f42c9fa74 160000 --- a/deps/tokenizers-cpp +++ b/deps/tokenizers-cpp @@ -1 +1 @@ -Subproject commit c53bc0444dbe2ea1f66e364cd576a6c1e23539b4 +Subproject commit 4f42c9fa74946d70af86671a3804b6f2433e5dac From 77e4841e9e8c0df06754780783ef6a8757751054 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Wed, 16 Aug 2023 08:45:09 -0400 Subject: [PATCH 196/344] starcoder model. (#962) * starcoder model. * python inferface + add cpp test. * fix * . 
* remove test starcoder * remove * fix python opt * fix --- include/flexflow/ffconst.h | 3 +- include/flexflow/flexflow_c.h | 2 + include/flexflow/model.h | 3 + include/flexflow/request_manager.h | 3 +- inference/file_loader.cc | 9 +- inference/incr_decoding/CMakeLists.txt | 3 +- inference/incr_decoding/incr_decoding.cc | 11 + inference/models/opt.cc | 1 + inference/models/starcoder.cc | 216 ++++++++++++++ inference/models/starcoder.h | 76 +++++ python/flexflow/core/flexflow_cffi.py | 3 + python/flexflow/serve/models/__init__.py | 1 + python/flexflow/serve/models/opt.py | 2 + python/flexflow/serve/models/starcoder.py | 277 ++++++++++++++++++ python/flexflow/serve/serve.py | 13 +- python/flexflow/type.py | 1 + src/c/flexflow_c.cc | 6 + src/ops/inc_multihead_self_attention.cu | 6 - src/runtime/inference_manager.cc | 12 +- src/runtime/request_manager.cc | 3 +- src/runtime/request_manager.cu | 4 +- tests/inference/cpp_inference_tests.sh | 5 + .../python_test_configs/generate_configs.py | 3 +- 23 files changed, 642 insertions(+), 21 deletions(-) create mode 100644 inference/models/starcoder.cc create mode 100644 inference/models/starcoder.h create mode 100644 python/flexflow/serve/models/starcoder.py diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 1694041163..2f97d48997 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -184,7 +184,8 @@ enum ModelType { LLAMA = 3002, LLAMA2 = 3003, OPT = 3004, - FALCON = 3005 + FALCON = 3005, + STARCODER = 3006 }; enum PMParameter { diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 1aa192f4e3..f2deacd8d7 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -566,6 +566,8 @@ flexflow_generation_result_t flexflow_model_generate(flexflow_model_t handle_, char const *text, int max_seq_length); +void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); + // ----------------------------------------------------------------------- // Tensor // ----------------------------------------------------------------------- diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 7bd13224df..bc3c7e6545 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -986,6 +986,7 @@ class FFModel { CompMode comp_mode = COMP_MODE_TRAINING); void compile_inference(); void set_transformer_layer_id(int id); + void set_position_offset(int offset); void graph_optimize(size_t budget, bool only_data_parallel, std::unique_ptr &best_graph, @@ -1043,6 +1044,8 @@ class FFModel { size_t op_global_guid, layer_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; + // positional embedding start offset + int position_offset; FFConfig config; FFIterationConfig iter_config; Optimizer *optimizer; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 6aa69786ca..d6c30c6b78 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -40,7 +40,8 @@ class InferenceManager { void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, ParallelTensor const input); void load_positions(BatchConfigFuture const &bc, - ParallelTensor position_input); + ParallelTensor position_input, + int offset); public: FFConfig ff_config; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index c15a3c0f2b..78f190dad6 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -137,7 +137,8 @@ void 
load_attention_bias_v2(DT *ptr, int file_index = 0; // now only opt use this. - assert(num_heads == num_kv_heads); + // assert(num_heads == num_kv_heads); + int idx = 0; for (auto file : bias_files) { int n_heads = file_index == 0 ? num_heads : num_kv_heads; @@ -166,11 +167,12 @@ void load_attention_bias_v2(DT *ptr, size_t data_index = 0; for (int i = 0; i < partial_size; i++) { - ptr[file_index * qkv_partial_size + i] = host_array.at(data_index); + ptr[idx + i] = host_array.at(data_index); data_index++; } file_index++; + idx += qkv_partial_size; in.close(); } @@ -238,7 +240,8 @@ void load_attention_weights_v2(DT *ptr, if (in_get_size != loaded_data_size) { std::cout << "load attention data error " << in_get_size << ", " - << loaded_data_size; + << loaded_data_size << ", " << file_index << ", " << file + << "\n"; assert(false && "data size mismatch"); } // wq, wk, wo diff --git a/inference/incr_decoding/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt index d1313d328b..4ed60abb87 100644 --- a/inference/incr_decoding/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -10,7 +10,8 @@ set(CPU_SRC ../file_loader.cc ../models/llama.cc ../models/opt.cc - ../models/falcon.cc) + ../models/falcon.cc + ../models/starcoder.cc) cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index cd84d6cb5c..e94cb4a4e9 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -18,6 +18,7 @@ #include "models/falcon.h" #include "models/llama.h" #include "models/opt.h" +#include "models/starcoder.h" #include #include @@ -170,6 +171,9 @@ void FlexFlow::top_level_task(Task const *task, } else if (str == "RWForCausalLM") { model_type = ModelType::FALCON; break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; } } @@ -201,6 +205,13 @@ void FlexFlow::top_level_task(Task const *task, weights_filepath, INC_DECODING_MODE, use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + samplingConfig, + use_full_precision); } else { assert(false && "unknow model type"); } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index fc1d5512ba..9b3670ed89 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -40,6 +40,7 @@ void OPT::create_opt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; Tensor position_input; + ff.set_position_offset(2); { int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc new file mode 100644 index 0000000000..fcd41e21ea --- /dev/null +++ b/inference/models/starcoder.cc @@ -0,0 +1,216 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "starcoder.h" + +namespace FlexFlow { + +using namespace Legion; +using json = nlohmann::json; + +void STARCODER::create_starcoder_model( + FFModel &ff, + std::string const &model_config_file_path, + std::string const &weight_file_path, + InferenceMode mode, + SamplingConfig samplingConfig, + bool use_full_precision) { + // do not apply cpu offload in beam search model. + STARCODERConfig startcoder_config(model_config_file_path); + startcoder_config.print(); + + if (ff.config.tensor_parallelism_degree > + startcoder_config.num_attention_heads || + startcoder_config.num_attention_heads % + ff.config.tensor_parallelism_degree != + 0) { + assert(false && "The number of attention heads is smaller, or it is not " + "divisible by the tensor parallelism degree"); + } + + std::unordered_map weights_layers; + std::vector axes = {0}; + + Tensor input; + Tensor position_input; + ff.set_position_offset(0); + { + assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); + int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + input = ff.create_tensor<2>(token_dims, DT_INT32); + position_input = ff.create_tensor<2>(token_dims, DT_INT32); + } + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + + Tensor token; + + if (use_full_precision) { + token = ff.embedding(input, + startcoder_config.vocab_size, + startcoder_config.hidden_size, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + } else { + token = ff.embedding(input, + startcoder_config.vocab_size, + startcoder_config.hidden_size, + AGGR_MODE_NONE, + DT_HALF, + NULL, + embed_init); + } + + Layer *embedding = ff.layers.back(); + weights_layers.emplace("transformer_wte_weight", embedding); + + Tensor positional_embedding; + if (use_full_precision) { + positional_embedding = + ff.embedding(position_input, + startcoder_config.max_position_embeddings, + startcoder_config.hidden_size, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + } else { + positional_embedding = + ff.embedding(position_input, + startcoder_config.max_position_embeddings, + startcoder_config.hidden_size, + AGGR_MODE_NONE, + DT_HALF, + NULL, + embed_init); + } + Layer *pos_embedding = ff.layers.back(); + weights_layers.emplace("transformer_wpe_weight", pos_embedding); + + Tensor hidden_states = ff.add(token, positional_embedding); + + for (int i = 0; i < startcoder_config.num_hidden_layers; i++) { + // set transformer layer id + ff.set_transformer_layer_id(i); + // step 1: attention + Tensor ln_1 = ff.layer_norm( + hidden_states, axes, true, startcoder_config.layer_norm_epsilon); + Layer *layer_norm = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_ln_1_weight", + layer_norm); + + Tensor mha; + switch (mode) { + case INC_DECODING_MODE: { + mha = ff.inc_multiquery_self_attention( + ln_1, + startcoder_config.hidden_size, + startcoder_config.num_attention_heads, + 1, + startcoder_config.hidden_size / + startcoder_config.num_attention_heads, + startcoder_config.hidden_size / + startcoder_config.num_attention_heads, + startcoder_config.dropout_p, /*dropout*/ + true, /*bias*/ + 
false, /*add_bias_kv*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + false /*apply_rotary_embedding*/ + ); + break; + } + default: { + assert(false); + } + } + Layer *attention_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", + attention_layer); + Tensor residual = ff.add(hidden_states, mha); + + Tensor l2_norm = ff.layer_norm( + residual, axes, true, startcoder_config.layer_norm_epsilon); + Layer *l2_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_ln_2_weight", + l2_layer); + + // mlp + std::cout << "intermediate_size: " << startcoder_config.intermediate_size + << "\n"; + Tensor c_fc = ff.dense( + l2_norm, startcoder_config.intermediate_size, AC_MODE_NONE, true); + Layer *c_fc_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_mlp_c_fc_weight", + c_fc_layer); + c_fc = ff.gelu(c_fc); + + Tensor c_proj = + ff.dense(c_fc, startcoder_config.hidden_size, AC_MODE_NONE, true); + Layer *c_proj_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_mlp_c_proj_weight", + c_proj_layer); + + hidden_states = ff.add(residual, c_proj); + } + // final normalization and linear + Tensor ln_f = ff.layer_norm( + hidden_states, axes, true, startcoder_config.layer_norm_epsilon); + Layer *final_norm = ff.layers.back(); + weights_layers.emplace("transformer_ln_f_weight", final_norm); + + Tensor lm_head = + ff.dense(ln_f, startcoder_config.vocab_size, AC_MODE_NONE, false); + Layer *final_linear = ff.layers.back(); + weights_layers.emplace("lm_head_weight", final_linear); + + Tensor output; + if (mode == BEAM_SEARCH_MODE) { + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ true); + } else { + // Tensor softmax = ff.softmax(dense, -1); + if (samplingConfig.do_sample) { + lm_head = ff.scalar_truediv(lm_head, samplingConfig.temperature, false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.sampling(softmax, samplingConfig.topp); + } else { + output = ff.argmax(lm_head, /*beam_Search*/ false); + } + } + + InferenceManager *im = InferenceManager::get_inference_manager(); + // Compile the model + std::cout << "------start compile ----------" << std::endl; + im->compile_model_and_allocate_buffer(&ff); + FileDataLoader fileloader("", + weight_file_path, + startcoder_config.num_attention_heads, + 1, + startcoder_config.hidden_size, + startcoder_config.hidden_size / + startcoder_config.num_attention_heads, + ff.config.tensor_parallelism_degree); + fileloader.load_weights(&ff, weights_layers, use_full_precision); + std::cout << "------load weight finished----------" << std::endl; + + // init operators + im->init_operators_inference(&ff); +} + +}; // namespace FlexFlow diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h new file mode 100644 index 0000000000..af0732f3c4 --- /dev/null +++ b/inference/models/starcoder.h @@ -0,0 +1,76 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "file_loader.h" +#include "flexflow/batch_config.h" +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include +#include +using json = nlohmann::json; + +namespace FlexFlow { + +class STARCODER { +public: + struct STARCODERConfig { + STARCODERConfig(std::string const &model_config_file_path) { + std::ifstream config_file(model_config_file_path); + if (config_file.is_open()) { + try { + json model_config; + config_file >> model_config; + num_hidden_layers = model_config["n_layer"]; + vocab_size = model_config["vocab_size"]; + num_attention_heads = model_config["n_head"]; + hidden_size = model_config["n_embd"]; + layer_norm_epsilon = model_config["layer_norm_epsilon"]; + intermediate_size = model_config["n_inner"]; + dropout_p = model_config["attn_pdrop"]; + max_position_embeddings = model_config["n_positions"]; + } catch (json::exception const &e) { + std::cerr << "Error parsing STARCODER config from JSON file: " + << e.what() << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file " << model_config_file_path + << std::endl; + assert(false); + } + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + } + + void print() const {} + + int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, + intermediate_size, max_position_embeddings; + float layer_norm_epsilon, dropout_p; + }; + + static void create_starcoder_model(FFModel &ff, + std::string const &model_config_file_path, + std::string const &weight_file_path, + InferenceMode mode, + SamplingConfig samplingConfig, + bool use_full_precision = false); +}; + +}; // namespace FlexFlow diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b02ac5bdb9..48193720eb 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2961,6 +2961,9 @@ def get_output_tensor(self, ffmodel, data_type): def generate(self, text, max_sequence_length): c_text = get_c_name(text) return ffc.flexflow_model_generate(self.handle, c_text, max_sequence_length) + + def set_position_offset(self, offset): + ffc.flexflow_model_set_position_offset(self.handle, offset) # ----------------------------------------------------------------------- # SGDOptimizer diff --git a/python/flexflow/serve/models/__init__.py b/python/flexflow/serve/models/__init__.py index 3b4087203b..6b405b2f99 100644 --- a/python/flexflow/serve/models/__init__.py +++ b/python/flexflow/serve/models/__init__.py @@ -15,3 +15,4 @@ from .llama import FlexFlowLLAMA from .opt import FlexFlowOPT from .falcon import FlexFlowFalcon +from .starcoder import FlexFlowSTARCODER diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 7e38b5de85..a33b261cb7 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -89,6 +89,8 @@ def 
build_model(self): input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) position_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) + # OPT model positional embedding start offset is 2 + ffmodel.set_position_offset(2) embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) token = ffmodel.embedding( input_tensor, diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py new file mode 100644 index 0000000000..193f7c8e1a --- /dev/null +++ b/python/flexflow/serve/models/starcoder.py @@ -0,0 +1,277 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from flexflow.core import * +from .base import FlexFlowModel +import random, torch + + +class STARCODERConfig: + def __init__(self, hf_config): + self.max_seq_len = 256 + self.max_num_tokens = 64 + self.max_beam_width = 1 + self.max_beam_depth = 8 + self.dropout_p = hf_config.dropout_p + self.hidden_size = hf_config.hidden_size + self.layer_norm_epsilon = hf_config.layer_norm_epsilon + self.max_position_embeddings = hf_config.max_position_embeddings + self.num_attention_heads = hf_config.num_attention_heads + self.num_hidden_layers = hf_config.num_hidden_layers + self.vocab_size = hf_config.vocab_size + self.intermediate_size = hf_config.intermediate_size + + +class FlexFlowSTARCODER(FlexFlowModel): + def __init__( + self, + mode, + sampling_config, + ffconfig, + hf_config, + data_type, + max_batch_size=1, + max_seq_length=256, + max_tokens_per_batch=64, + weights_filepath="", + tokenizer_filepath="", + ): + self.mode = mode + self.sampling_config = sampling_config + self.ffconfig = ffconfig + self.max_batch_size = max_batch_size + self.data_type = data_type + self.starcoder_config = STARCODERConfig(hf_config) + self.starcoder_config.max_seq_length = max_seq_length + self.starcoder_config.max_num_tokens = max_tokens_per_batch + self.weights_filepath = weights_filepath + self.tokenizer_filepath = tokenizer_filepath + self.maxint = 2**31 - 1 + + # Sanity checks + if ( + self.starcoder_config.hidden_size + % self.starcoder_config.num_attention_heads + != 0 + ): + raise ValueError( + f"Hidden size ({self.starcoder_config.hidden_size}) is not divisible by n_head ({self.starcoder_config.num_attention_heads})" + ) + + # Sanity checks + if ( + self.starcoder_config.num_attention_heads + < self.ffconfig.tensor_parallelism_degree + or self.starcoder_config.num_attention_heads + % self.ffconfig.tensor_parallelism_degree + != 0 + ): + raise ValueError( + f"Number of attention heads ({self.starcoder_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" + ) + + self.build_model() + + def build_model(self): + ffmodel = FFModel(self.ffconfig) + + tokens_dims = [self.starcoder_config.max_num_tokens, 1] + input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) + position_tensor = ffmodel.create_tensor(tokens_dims, 
DataType.DT_INT32) + + embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) + ffmodel.set_position_offset(2) + token = ffmodel.embedding( + input_tensor, + self.starcoder_config.vocab_size, + self.starcoder_config.hidden_size, + AggrMode.AGGR_MODE_NONE, + self.data_type, + None, + embed_init, + name="transformer_wte_weight", + ) + positional_embedding = ffmodel.embedding( + position_tensor, + self.starcoder_config.max_position_embeddings, + self.starcoder_config.hidden_size, + AggrMode.AGGR_MODE_NONE, + self.data_type, + None, + embed_init, + name="transformer_wpe_weight", + ) + + hidden_states = ffmodel.add(token, positional_embedding) + + axes = [ + 0, + ] + + for i in range(self.starcoder_config.num_hidden_layers): + ffmodel.set_transformer_layer_id(i) + ln_1 = ffmodel.layer_norm( + hidden_states, + axes, + True, + self.starcoder_config.layer_norm_epsilon, + name=f"layers_{i}_ln_1_weight", + ) + + assert self.mode == InferenceMode.INC_DECODING_MODE + mha = ffmodel.inc_multiquery_self_attention( + ln_1, + self.starcoder_config.hidden_size, + self.starcoder_config.num_attention_heads, + 1, + self.starcoder_config.hidden_size + // self.starcoder_config.num_attention_heads, + self.starcoder_config.hidden_size + // self.starcoder_config.num_attention_heads, + 0.0, # dropout + True, # bias + False, # add_bias_kv + False, # add_zero_attn + DataType.DT_NONE, # data_type + None, # kernel initializer + False, # apply_rotary_embedding + name=f"layers_{i}_attention_weight", + ) + + residual = ffmodel.add(mha, hidden_states) + + l2_norm = ffmodel.layer_norm( + residual, + axes, + True, + self.starcoder_config.layer_norm_epsilon, + name=f"layers_{i}_ln_2_weight", + ) + + # mlp + + c_fc = ffmodel.dense( + l2_norm, + self.starcoder_config.intermediate_size, + ActiMode.AC_MODE_NONE, + True, + name=f"layers_{i}_mlp_c_fc_weight", + ) + activation = ffmodel.gelu(c_fc, False) + c_proj = ffmodel.dense( + activation, + self.starcoder_config.hidden_size, + ActiMode.AC_MODE_NONE, + True, + name=f"layers_{i}_mlp_c_proj_weight", + ) + hidden_states = ffmodel.add(residual, c_proj) + + ln_f = ffmodel.layer_norm( + hidden_states, + axes, + True, + self.starcoder_config.layer_norm_epsilon, + name=f"transformer_ln_f_weight", + ) + lm_head = ffmodel.dense( + ln_f, + self.starcoder_config.vocab_size, + ActiMode.AC_MODE_NONE, + False, + name="lm_head_weight", + ) + + if self.sampling_config.do_sample: + dense = ffmodel.scalar_true_divide( + lm_head, self.sampling_config.temperature, False + ) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.sampling(softmax, self.sampling_config.topp) + else: + output = ffmodel.argmax(lm_head, False) + + self.ffmodel = ffmodel + + def convert_hf_model(model, dst_folder): + os.makedirs(dst_folder, exist_ok=True) + for name, params in model.named_parameters(): + name = name.replace("transformer.h", "layers").replace(".", "_") + if "c_attn_weight" in name: + name_q = name.replace("attn_c_attn", "attention_wq") + name_k = name.replace("attn_c_attn", "attention_wk") + name_v = name.replace("attn_c_attn", "attention_wv") + q, k, v = torch.split( + params, + [ + model.config.hidden_size, + model.config.hidden_size // model.config.num_attention_heads, + model.config.hidden_size // model.config.num_attention_heads, + ], + 0, + ) + q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) + k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) + v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) + elif "c_attn_bias" in name: + name_q = 
name.replace("attn_c_attn", "attention_wq") + name_k = name.replace("attn_c_attn", "attention_wk") + name_v = name.replace("attn_c_attn", "attention_wv") + q, k, v = torch.split( + params, + [ + model.config.hidden_size, + model.config.hidden_size // model.config.num_attention_heads, + model.config.hidden_size // model.config.num_attention_heads, + ], + 0, + ) + q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) + k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) + v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) + elif "c_proj_bias" in name: + name = name.replace("attn_c_proj", "attention_wo") + params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) + elif "c_proj_weight" in name: + name = name.replace("attn_c_proj", "attention_wo") + params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) + else: + params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) + model.lm_head.weight.detach().cpu().numpy().tofile( + os.path.join(dst_folder, "lm_head_weight") + ) + + def get_layers_with_weights(self): + layer_names = [ + "transformer_wte_weight", + "transformer_wpe_weight", + "transformer_ln_f_weight", + "lm_head_weight", + ] + [ + expr + for i in range(self.starcoder_config.num_hidden_layers) + for expr in ( + f"layers_{i}_ln_1_weight", + f"layers_{i}_attention_weight", + f"layers_{i}_ln_2_weight", + f"layers_{i}_mlp_c_fc_weight", + f"layers_{i}_mlp_c_proj_weight", + ) + ] + layers_with_weights = { + layer_name: self.ffmodel.get_layer_by_name(layer_name) + for layer_name in layer_names + } + + return layers_with_weights diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index bf2dcc84de..7abea56a7d 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -12,7 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
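+# Note: StarCoder checkpoints report the "GPTBigCodeForCausalLM" architecture; the
+# import and model-class registration added below route them to FlexFlowSTARCODER.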
-from flexflow.serve.models import FlexFlowLLAMA, FlexFlowOPT, FlexFlowFalcon +from flexflow.serve.models import ( + FlexFlowLLAMA, + FlexFlowOPT, + FlexFlowFalcon, + FlexFlowSTARCODER, +) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer from huggingface_hub import HfApi @@ -76,6 +81,7 @@ def __init__( "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA), "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT), "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon), + "GPTBigCodeForCausalLM": (ModelType.STARCODER, FlexFlowSTARCODER), } self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path @@ -217,7 +223,10 @@ def __load_hf_weights(self): self.download_hf_weights_if_needed() # Create file data loader, load weights into tensors - if self.model_type == ModelType.FALCON: + if ( + self.model_type == ModelType.FALCON + or self.model_type == ModelType.STARCODER + ): n_q_heads = self.hf_config.num_attention_heads if "n_head_kv" in self.hf_config.__dict__: n_kv_heads = self.hf_config.n_head_kv diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 137c8a872a..5232ddd431 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -78,6 +78,7 @@ class ModelType(Enum): LLAMA2 = 3003 OPT = 3004 FALCON = 3005 + STARCODER = 3006 class OpType(Enum): diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e84cd5db67..b1363faacc 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1421,6 +1421,12 @@ flexflow_generation_result_t flexflow_model_generate(flexflow_model_t handle_, return FFCObjectWrapper::wrap(&result); } +void flexflow_model_set_position_offset(flexflow_model_t handle_, + int const offset) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_position_offset(offset); +} + // ----------------------------------------------------------------------- // Tensor // ----------------------------------------------------------------------- diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index be6a4fd301..b694797830 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -630,7 +630,6 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } - // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens); @@ -792,7 +791,6 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - tokens_previous_requests += num_new_tokens; } @@ -970,10 +968,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( global_num_kv_heads = _global_num_kv_heads; num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; - // weights_params = (qSize * qProjSize + kSize * kProjSize + vSize * vProjSize - // + - // oProjSize * (vProjSize > 0 ? vProjSize : vSize)); - // weightSize = weights_params * num_q_heads * size_of_dt; weightSize = ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) * diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index bd9079ec0c..62ab947f8f 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -315,7 +315,7 @@ FutureMap InferenceManager::inference(FFModel *model, // input. 
assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_positions(bc, pt); + load_positions(bc, pt, model->position_offset); } else { found_input_operator = true; assert(op->numOutputs == 1); @@ -371,14 +371,15 @@ void InferenceManager::load_input_tokens_from_batch_config( } void InferenceManager::load_positions(BatchConfigFuture const &bc, - ParallelTensor position_input) { + ParallelTensor position_input, + int offset) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = position_input->machine_view.hash(); ArgumentMap argmap; IndexLauncher launcher(RM_LOAD_POSITION_TASK_ID, position_input->parallel_is, - TaskArgument(nullptr, 0), + TaskArgument(&offset, sizeof(int)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -403,6 +404,11 @@ void FFModel::set_transformer_layer_id(int id) { assert(id < MAX_NUM_TRANSFORMER_LAYERS); } +void FFModel::set_position_offset(int offset) { + assert(offset == 0 || offset == 2); + position_offset = offset; +} + void FFModel::compile_inference() { Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a49f827482..0c32da3291 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -99,7 +99,8 @@ void RequestManager::register_tokenizer(ModelType type, this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); - } else if (model_type == ModelType::FALCON) { + } else if (model_type == ModelType::FALCON || + model_type == ModelType::STARCODER) { std::string falcon_tokenizer_path = join_path({path, "tokenizer.json"}); this->tokenizer_ = Tokenizer::FromBlobJSON(LoadBytesFromFile(falcon_tokenizer_path)); diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index abfcd72a38..7363e14cf0 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -68,8 +68,8 @@ void RequestManager::load_positions_task( // BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); - - int offset = 2; + + const int offset = *((const int*)task->args); int *pos_ptr = helperGetTensorPointerWO( regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain domain = runtime->get_index_space_domain( diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index fba42538ef..6a108303d6 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -61,6 +61,11 @@ fi # Falcon (half precision) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B_half.txt -pipeline-parallelism-degree 4 +# # StarCoder (full precision) +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 +# # StarCoder (half precision) +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt 
-pipeline-parallelism-degree 4 + # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index b4c3dd8039..91cf317db4 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -49,6 +49,7 @@ llama_models = ["decapoda-research/llama-7b-hf", "JackFram/llama-160m"] opt_models = ["facebook/opt-6.7b", "facebook/opt-125m"] falcon_models = ["tiiuae/falcon-7b",] +# starcoder_models = ["bigcode/starcoderbase-7b",] parallelism_settings = [(1,4), (2,2), (4,1)] # The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) @@ -70,7 +71,7 @@ tp, pp = parallelism_degrees # Tensor parallelism not supported by small Falcon model atm - if tp > 1 and "falcon" in model_name: + if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): continue # skip tp=4 for big models if tp > 2 and ("7b" in model_name or "6.7b" in model_name): From 1f0432831241c7ffb9035495619ceb80813fe065 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 16 Aug 2023 07:58:27 -0500 Subject: [PATCH 197/344] New README.md for FlexFlow Serve (#960) * New README.md for FlexFlow Serve * Update README.md * update performance.png * Update README.md * Update README.md * update install instructions * update cpp specinfer instructions * Update README.md * update readme * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * update requirements.txt, renamed conda env, updated readme * fix * Update README.md * Update README.md --------- Co-authored-by: Gabriele Oliaro --- .github/README.md | 262 ++++++++++++++++------- .github/workflows/gpu-ci.yml | 6 +- conda/{flexflow-cpu.yml => flexflow.yml} | 0 img/performance.png | Bin 18951 -> 58476 bytes requirements.txt | 10 + 5 files changed, 192 insertions(+), 86 deletions(-) rename conda/{flexflow-cpu.yml => flexflow.yml} (100%) diff --git a/.github/README.md b/.github/README.md index c4f220e222..e61c1648ba 100644 --- a/.github/README.md +++ b/.github/README.md @@ -1,129 +1,225 @@ -# SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification -![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) +# FlexFlow Serve: Low-Latency, High-Performance LLM Serving +![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=inference) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=inference) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) 
![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=inference) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=inference) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=inference) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=inference) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) -

-[figure: A SpecInfer Demo]

-## What is SpecInfer
+---

-[figure: An overview of SpecInfer]

+## News: + +* [08/14/2023] Released Dockerfile for different CUDA versions + +## What is FlexFlow Serve The high computational and memory requirements of generative large language models (LLMs) make it challenging to serve them quickly and cheaply. -SpecInfer is an open-source distributed multi-GPU system that accelerates generative LLM -inference with __speculative inference__ and __token tree verification__. A key insight -behind SpecInfer is to combine various collectively boost-tuned small speculative -models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a -token tree, whose nodes each represent a candidate token sequence. The correctness -of all candidate token sequences represented by a token tree is verified against the -LLM’s output in parallel using a novel tree-based parallel decoding mechanism. -SpecInfer uses an LLM as a token tree verifier instead of an incremental decoder, -which largely reduces the end-to-end inference latency and computational requirement -for serving generative LLMs while provably preserving model quality. +FlexFlow Serve is an open-source compiler and distributed system for +__low latency__, __high performance__ LLM serving. FlexFlow Serve outperforms +existing systems by 1.3-2.0x for single-node, multi-GPU inference and by +1.4-2.4x for multi-node, multi-GPU inference.

[figure: Performance comparison]

-## Build/Install SpecInfer -SpecInfer is built on top of FlexFlow. You can build/install SpecInfer by building the inference branch of FlexFlow. Please read the [instructions](../INSTALL.md) for building/installing FlexFlow from source code. If you would like to quickly try SpecInfer, we also provide pre-built Docker packages ([specinfer-cuda](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-cuda) with a CUDA backend, [specinfer-hip_rocm](https://github.com/flexflow/FlexFlow/pkgs/container/specinfer-hip_rocm) with a HIP-ROCM backend) with all dependencies pre-installed (N.B.: currently, the CUDA pre-built containers are only fully compatible with host machines that have CUDA 11.7 installed), together with [Dockerfiles](./docker) if you wish to build the containers manually. +## Install FlexFlow Serve -## Run SpecInfer -The source code of the SpecInfer pipeline is available at [this folder](../inference/spec_infer/). The SpecInfer executable will be available at `/build_dir/inference/spec_infer/spec_infer` at compilation. You can use the following command-line arguments to run SpecInfer: -* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) -* `-ll:fsize`: size of device memory on each GPU in MB -* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. SpecInfer keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama") -* `-llm-weight`: path to the folder that stores the LLM weights -* `-llm-config`: path to the json file that stores the LLM model configs -* `-ssm-model`: the LLM model type as a case-insensitive string (e.g. "opt" or "llama"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. -* `-ssm-weight`: path to the folder that stores the small speculative models' weights. The number of `-ssm-weight`s must match the number of `-ssm-model`s and `-ssm-config`s. -* `-ssm-config`: path to the json file that stores the SSM model configs. The number of `-ssm-config`s must match the number of `-ssm-model`s and `-ssm-weight`s. -* `-tokenizer`: path to the tokenizer file (see [Tokenizers](#tokenizers) for preparing a tokenizer for SpecInfer). -* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. -* `-prompt`: (optional) path to the prompt file. SpecInfer expects a json format file for prompts, all of which will be served by SpecInfer. 
In addition, users can also use the following API for registering requests: -* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency +### Requirements +* OS: Linux +* GPU backend: Hip-ROCm or CUDA + * CUDA version: 10.2 – 12.0 + * NVIDIA compute capability: 6.0 or higher +* Python: 3.6 or higher +* Package dependencies: [see here](https://github.com/flexflow/FlexFlow/blob/inference/requirements.txt) +### Install with pip +You can install FlexFlow Serve using pip: -```c++ -class RequestManager { - RequestGuid register_new_request(std::string const &prompt, int max_sequence_length); -} +```bash +pip install flexflow ``` -For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-190M models for speculative inference. + +### Try it in Docker +If you run into any issue during the install, or if you would like to use the C++ API without needing to install from source, you can also use our pre-built Docker package for different CUDA versions and the `hip_rocm` backend. To download and run our pre-built Docker container: ```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -tensor-parallelism-degree 2 -pipeline-parallelism-degree 2 +docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-11.8:latest ``` -### Tokenizers -SpecInfer supports two tokenizers: +To download a Docker container for a backend other than CUDA v11.8, you can replace the `cuda-11.8` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](../docker/README.md). -* The SentencePiece tokenizer is used to support the LLaMA model family (e.g., LLaMA-6B, LLaMA-13B, and LLaMA-190M in our demo). We used the pretrained sentencepiece tokenizer from LLAMA, which is also available on Hugging Face (model id: `decapoda-research/llama-7b-hf`). If you are using our LLAMA-160M weights for the demo, however, you should use the tokenizer from the [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m/resolve/main/tokenizer.model) HuggingFace repo. -* The GPT2 tokenizer is used to support the Open Pre-trained Transformer model family (e.g., OPT-13B and OPT-125M). To use it, download the [vocab](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-vocab.json) and [merges](https://raw.githubusercontent.com/facebookresearch/metaseq/main/projects/OPT/assets/gpt2-merges.txt) files and pass the folder containing them as a parameter. +### Build from source -### Mixed-precision Support -SpecInfer now supports single-precision floating points and half-precision floating points. By default we use half-precision. Add `--use-full-precision` to the command line to run the demo with single-precision, please make sure to use the correct weight files in the form below. 
+You can install FlexFlow Serve from source code by building the inference branch of FlexFlow. Please follow these [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). -### CPU Offloading -SpecInfer offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. You can run the offloading example by adding `-offload` and `-offload-reserve-space-size` flags. -#### Quantization -To reduce data transferred between the CPU and GPU, SpecInfer provides int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. The quantization method can be selected using the `--4bit-quantization` and `--8bit-quantization` flags. +## Quickstart +The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving. +```python +import flexflow.serve as ff -Below is an example command line to use offloading and quantization in SpecInfer. +ff.init( + { + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 30000, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, + } +) +``` +Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). +```python +# Specify the LLM +llm = ff.LLM("decapoda-research/llama-7b-hf") + +# Specify a list of SSMs (just one in this case) +ssms=[] +ssm = ff.SSM("JackFram/llama-68m") +ssms.append(ssm) +``` +Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. +```python +# Create the sampling configs +generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 +) + +# Compile the SSMs for inference and load the weights into memory +for ssm in ssms: + ssm.compile(generation_config) + +# Compile the LLM for inference and load the weights into memory +llm.compile(generation_config, ssms=ssms) +``` +Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult`, which include the output tokens and text. +```python +result = llm.generate("Here are some travel tips for Tokyo:\n") +``` + +### Incremental decoding +
+ +```python +import flexflow.serve as ff + +# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs +ff.init( + { + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_gpu": 30000, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, + } +) + +# Create the FlexFlow LLM +llm = ff.LLM("decapoda-research/llama-7b-hf") + +# Create the sampling configs +generation_config = ff.GenerationConfig( + do_sample=True, temperature=0.9, topp=0.8, topk=1 +) + +# Compile the LLM for inference and load the weights into memory +llm.compile(generation_config) + +# Generation begins! +result = llm.generate("Here are some travel tips for Tokyo:\n") +``` + +
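In both modes, the value returned by `generate` can be inspected directly. The snippet below is a minimal sketch, assuming the `GenerationResult` objects expose the output tokens and text described in the Quickstart; check the `flexflow.serve` API for the exact field names.

```python
# Sketch only: printing the generation output from the examples above. It assumes
# llm.generate() returns a GenerationResult (or a list of them) carrying the output
# tokens and text mentioned in the Quickstart; field names may differ slightly.
results = llm.generate("Here are some travel tips for Tokyo:\n")
if not isinstance(results, list):
    results = [results]
for r in results:
    print(f"generated {len(r.output_tokens)} tokens")
    print(r.output_text)
```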
+ +### C++ interface +If you'd like to use the C++ interface (mostly used for development and benchmarking purposes), you should install from source, and follow the instructions below. + +
+ +#### Downloading models +Before running FlexFlow Serve, you should manually download the LLM and SSM(s) model of interest using the [inference/utils/download_hf_model.py](https://github.com/flexflow/FlexFlow/blob/inference/inference/utils/download_hf_model.py) script (see example below). By default, the script will download all of a model's assets (weights, configs, tokenizer files, etc...) into the cache folder `~/.cache/flexflow`. If you would like to use a different folder, you can request that via the parameter `--cache-folder`. ```bash -./inference/spec_infer/spec_infer -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 -llm-model llama -llm-weight /path/to/llm/weights -llm-config /path/to/llm/config.json -ssm-model llama -ssm-weight /path/to/ssm1/weights -ssm-config /path/to/ssm/config.json -ssm-model llama -smm-weight /path/to/ssm2/weights -ssm-config /path/to/ssm2/config.json -tokenizer /path/to/tokenizer.model -prompt /path/to/prompt.json --use-full-precision -offload -offload-reserve-space-size 6000 --8bit-quantization +python3 ./inference/utils/download_hf_model.py ... ``` +#### Running the C++ examples +A C++ example is available at [this folder](../inference/spec_infer/). After building FlexFlow Serve, the executable will be available at `/build_dir/inference/spec_infer/spec_infer`. You can use the following command-line arguments to run FlexFlow Serve: +* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) +* `-ll:fsize`: size of device memory on each GPU in MB +* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. +* `-llm-model`: the LLM model ID from HuggingFace (e.g. "decapoda-research/llama-7b-hf") +* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-cache-folder`: the folder +* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. +* `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a json format file for prompts. In addition, users can also use the following API for registering requests: +* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency -### LLM Weights -The weight files used in our demo are extracted from HuggingFace, and stored in our AWS S3 bucket. +For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. 
-| Model | Model id on Hugging Face | Storage Location (single precision) | Storage Location (half precision) | -| :---- | :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | s3://specinfer/weights/llama_7B_weights.tar.gz | s3://specinfer/half_weights/llama_7B_weights.tar.gz -| LLaMA-190M | JackFram/llama-160m | s3://specinfer/weights/llama_160M_weights.tar.gz | s3://specinfer/half_weights/llama_160M_weights.tar.gz -| OPT-6.7B | facebook/opt-6.7b | s3://specinfer/weights/opt_6B_weights.tar.gz | s3://specinfer/half_weights/opt_6B_weights.tar.gz -| OPT-125M | facebook/opt-125m | s3://specinfer/weights/opt_125M_weights.tar.gz | s3://specinfer/half_weights/opt_125M_weights.tar.gz +```bash +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +``` +
-You can use [this script](../inference/utils/download_llama_weights.py) to automatically download and convert the weights of a HuggingFace LLAMA LLM and a LLAMA SSM to the SpecInfer weight format. The script also downloads the LLAMA tokenizer. If you would like to try the OPT model instead, use [this script](../inference/utils/download_opt_weights.py) to download (and convert) the OPT weights and tokenizer. +## Speculative Inference +A key technique that enables FlexFlow Serve to accelerate LLM serving is speculative +inference, which combines various collectively boost-tuned small speculative +models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a +token tree, whose nodes each represent a candidate token sequence. The correctness +of all candidate token sequences represented by a token tree is verified against the +LLM’s output in parallel using a novel tree-based parallel decoding mechanism. +FlexFlow Serve uses an LLM as a token tree verifier instead of an incremental decoder, +which largely reduces the end-to-end inference latency and computational requirement +for serving generative LLMs while provably preserving model quality. -### Prompt Datasets -We have evaluated SpecInfer on the following prompts datasets: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). +

+[figure: A Speculative Inference Demo]

-### Script to run the demo -You can take a look at [this script](../tests/inference_tests.sh), which is run in CI for each new commit, for an example of how to run the demo. +### Supported LLMs and SSMs -## Difference between SpecInfer and HuggingFace Assistant Model +FlexFlow Serve supports a variety of HuggingFace models: -There are two major differences between the two systems. +| Model | Model id on HuggingFace | Boost-tuned SSMs | +| :---- | :---- | :---- | +| LLaMA-7B | decapoda-research/llama-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | +| OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | +| OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | +| OPT-66B | facebook/opt-66b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | +| Falcon-7B | tiiuae/falcon-7b | | +| Falcon-40B | tiiuae/falcon-40b | | +| StarCoder-15.5B | bigcode/starcoder | | -* First, the HuggingFace assistant model produces a single candidate token sequence during speculation, while SpecInfer generates and verifies a speculated token tree, whose tokens each represent a candidate token sequence. To deal with the more complex verification task, SpecInfer includes a number of systems and algorithmic optimizations to quickly and efficiently verify all tokens of a token tree in parallel. - -* Second, instead of considering a single assistant model, SpecInfer combines a variety of collectively boost-tuned small speculative models (SSMs) to jointly predict the LLM's outputs. We observe that using multiple boost-tuned SSMs is critical for improving speculative performance. -## TODOs +### CPU Offloading +FlexFlow Serve also offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. [TODO: update instructions] You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags. -SpecInfer is under active development. 
We currently focus on the following tasks and strongly welcome all contributions to SpecInfer from bug fixes to new features and extensions. +### Quantization +FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. [TODO: update instructions for quantization]. -* Low-precision and mixed-precision support. The current version uses single-precision floating points for computing tree attention. We are actively working on support half-precision floating points, and int4 and int8 quantizations. -* Offloading-based generative LLM inference. Another promising avenue for future work is using speculative inference and token tree verification to reduce the end-to-end inference for offloading-based generative LLM inference. A potential application of this technique is enabling a single commodity GPU to serve LLMs for latency critical tasks. +### Prompt Datasets +We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). -## Acknowledgements -This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting SpecInfer and the underlying FlexFlow runtime system. The following paper describes design, implementation, and key optimizations of SpecInfer. +## TODOs -* Xupeng Miao*, Gabriele Oliaro*, Zhihao Zhang*, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. [SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). +FlexFlow Serve and FlexFlow are under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. -\* Denotes equal contribution +* AMD support. We are actively working on supporting FlexFlow Serve on AMD GPUs and welcome any contributions to this effort. -### Citation -Please cite as: +## Acknowledgements +This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as: ``` bibtex @misc{miao2023specinfer, @@ -137,4 +233,4 @@ Please cite as: ``` ## License -Both SpecInfer and FlexFlow use Apache License 2.0. +FlexFlow uses Apache License 2.0. 
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index bdd2e4dbf5..a6ea492bcf 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -83,7 +83,7 @@ jobs: with: miniconda-version: "latest" activate-environment: flexflow - environment-file: conda/flexflow-cpu.yml + environment-file: conda/flexflow.yml auto-activate-base: false auto-update-conda: false @@ -157,7 +157,7 @@ jobs: with: miniconda-version: "latest" activate-environment: flexflow - environment-file: conda/flexflow-cpu.yml + environment-file: conda/flexflow.yml auto-activate-base: false - name: Build FlexFlow @@ -232,7 +232,7 @@ jobs: with: miniconda-version: "latest" activate-environment: flexflow - environment-file: conda/flexflow-cpu.yml + environment-file: conda/flexflow.yml auto-activate-base: false - name: Build and Install FlexFlow diff --git a/conda/flexflow-cpu.yml b/conda/flexflow.yml similarity index 100% rename from conda/flexflow-cpu.yml rename to conda/flexflow.yml diff --git a/img/performance.png b/img/performance.png index a27d26dc198c351fdd0c9370f129465948f50daf..668e579197c42fc18211940941d6f5d757bbc0bf 100644 GIT binary patch literal 58476 zcmb@u2{e}NyEd+Qpd>^kN-{-;RHkS!W=bM+#taD|vna|`gfb+A3L!HIA%skYGG-pj z5He-@j+@@!{_VZj-s`{q-}kIlZ*O^?=f1D&yw3ADj`KKg?~C%1JGSrJPC`PmLt0Ag z5(&x177~(mU0XNfZ$b_8j^IB6HsY!_N*32_>~yURN#t~GEX^!z%uMtS+8SC}n^>41 zKX&REFULV+8yib&K`yQv|N4Pr7FI@FY+C)U_>gUuQfk&DB;>lpuXXXFaV8`ry)DvW zXD{178~JYM{BU-me4?pRP+IC-b+h{8nC#cP8f2yimit)Ua{D*~k`*w1HsD*dq3xbkXA8(12=r~CzD_&+~ap?@hrPVDcik0+smnep#yGW4Ilb^sLlW?z?yIR)igAahMt? zYiZf*UaT{ox%Pcb)UwQ!_$H70_jglKZG3(%+(2V^N<+zh4q1(k5*i_utku zy`ZDONuQf`>(;HQcI=FdCvQ&wPQ+hQ2QE0h?cw@&;_DSSn%QFz;hZ1aXIDcW*8*<#+iX~{O@f8*pM zF7OWEi>QuiHt&u*=e4wH<3=5BTH|j?YUw=7%ggw&%{%FgORCdDu>5MbQ-WGsvu|7< zr&9VIzZld?Z@V~m6F*QfRVV5n7sqn!*fA;&)khy6?mw>ok?hUY!pOc8Rl%Isee+n@ zr0kE`HwLTS3-@SxS{-Z|(P)m(4~tZi82X-fN+y`2xTz`RORVH^+YwbwEv?arVBzkO z`DX$9wvB5AwpsOmGR$eO?;WTKe?TV?IN4v-J2vK*mnT%_OB1JCa{mE?Xr$}PQd0+; zPsRCXBN1*=*L+Tu_3BhS&FV{frlBstA39$C;T~z2fc3SQC5>#e?oUsS@(T!1(9jtD z{9RvTz-w6Pa46kWdxp$nKC|4@M~5YMzR}A2cxS^F`<12Hy+XDeu1nJ)-dq1N6cKCaqT}&kV&u_`SrQVdke|$Zv$Q7Q{{*(EvtSp(3 z?Z|V*SP2e|?COr;3iqDzJMOUs8B*$-2ZFyw9e=HyD1o)5%rd&ye(5N}v^DFx_57qE z`;|AcQ6erCPmaSk1D@zJiJnvk%4GsF(uu)wF?U-y{9YPJ@HD)ZUzPgY!lufTwP4@?YH}WqtOWO ztbD`gOLxhHv{YPctp($W$Za`pCnTFR4s7WosKn^LYC zmE7N{fQt#a~X1i|my(?hS;=YsJo(#LGsh{*MvPsue z26G!2w7RBkc{ zdEX!oZHu7{8KC&W&i$NpDTrRz6^TjMvWgLtEPRGrs zFBsU9QA8`MnPwP%8LtR9Otz2DYdbZ2SAYM#@bLZAM^y|yJ!UKY_>tL1)NYK<<hqvdYTJNxCQB z${D+zQH#h92tGb{Z@VtvK8NmmG^z!Tw_@f#f4(TPJf|P&y6j*zQ0!=b0M8>1@JHTE3+^KvHZIzA9d`ROSO zbBATAn)l_mP!&5|X0@uORF@a$hZ_@=VmcetGOp=z)9zk6UAQoohadmZn4teP@|Wh< zIE5%GYktcv5)$_<)%5wPar9X;Esyw&^5fPE!W8d2*=_a3h%w9fw z_U!MGmY}SMR3%H4s*h%EZEcczmXGMl;_;fMmC>Bexsi-II9yIaqM z9Rii&9T>E#7J(?&1n z`AG^Ys?vA2ei}E%Q(SL)^Ag8OWg_ERjb3xA?%UklF9=5;>Z1}DFYYQ_oFqkl%FZ`A z6(*a%{O2df)%VYkTN{!!PHAdtHmB)t!B+SG9&SL+KCLAn`sq}sb4h{Ito>BYsmc(p zt3DI*@{V}{5iZkTub4HbuDg2mD)wI9z<>$Wb8dcqqRpZ(?OM$~nUN<)uf(fmFx!o_ zM~^Yi;)Z-4px+$w1-Y zE4gs1UtMRlZ_gV(J#qcMyZeTaV_H4s46ZLE{pqQ=w4Whxlya>Muhl#iN5pYhi!uw_ zDk>`GPkuT|Curk^0ysIAH$FLDxRP$wevFnCa!H+1=1}{9v&m5+ zOcVk^8&(0UzOG;0C0Lt%Zqh-_UouUU{OAPqhEGqXqO!55k7hR7*_M`YAT zNq6pi1*Gl#t9Xlwcs3=AJh$_lB??e;jzyF*7k5yln5AWE{|xR$^}VwI%9w6PzC$ph z4iQ6wHaZ_4>;({bf|UPqLeVMH!=g!P>d@JA&%Q`f#QDc~#n`#dm1QGk8=f%P-#?rD z5nn_U%A^dRxc&{G{sUlE?dRz8$+fqBe&duWzJp|O#c66l&hhuqS$ty) zF6V1!2Qtl2Yt~VmW1g$d6ZgK}MDg$K^%uQ{y&ZO8ed{>Q4(z7a`QF9bQ75iHP0F(W z6SZk${GGzW!k1l5NJRTin3h(D9p@C!#sR&FBMFFKMJ~FibIWgN$XG@p7u!y`YgZ9+ 
zcD!b;RZRJKMx976u5dM>sil`*a;D`q*qUYfRjuiHkARq*;#lQ+3bVj<%MeLU6bany{s#h)xBsvquV=BpiV(2Ae^}D*GvFP7)s5gf6wl(3mW=s< za0^-HGXem%Bt)=KX7|9%bfQc7M=!p_O}xYd|6X;IWM|sdfKJGpQhEx#ssS zdCCh&tGRaL^y3Aykze;4VRI$?=_7_7YF}`6aY^28rn9W2g#%x>G!y=^BK!7|BbY4$ z!s}(x<^gS0;=;17H_6ZJ!;MI%Z>h=~fA5=^cs*wQ{?NgI>z(gUBWyPzZ>CdGpsq+d zI}772CNa9fo85~|cfM-LFbV>xTAnRjQJYS;zIju(r_>vz@D8A|k$nKCZh!ZS0r$z- zS%pkNeXMdufBqGPARIP=GY$((9&|4*5*p7hK^kj8*u{LfbgAr7WsxZi^5N15g; z*}Z>e$LnxP1#bPg9CObgaN(aLAfBE1-nPFwEC}nNlxKV6o$cra@WPftS5a^inQY0$ zfWwmfR((=2GdK>S=@ok@DNC@Ul|d}zU|!j+P5BPi`%alx`yCKMhGRg|$s)_q%((Wj zU*?*tt1C1U{MZ0^Pj_6@?R56^(^J*QrFTv_H$Ok?{?>V(mjFHp8re|pb?(Jya;^G} zzrLmhu+4hvZ;J!IapOjA-47cRvE7nI1RcQ_Q0&=rO*tNW*^Nvh)993SIpGpf6oF6N z0jSgpSTqn&fN(0~Dc9;EcN8toPbEhl1z+Vqg+EnL(s>DiK}X-Z2=xt(Ur5QD`~8KlL;JiabUUU{RLw%Gn!`RL-} zqWxfm9f3Gd`eOltz3;QU>}$H}x~ckn)G+{9W7o@ucNIdj7lsfA;HWp!tDIZ_1A>doG4Pf zYtV@9<6oN7C}Zl>E%ehXhi0bY2OmgEk6*ts{p0I=UyuSVDlQh9g~ZU=YExtx8yqVZoGmIVMqgJtPXId|Ci?zm63_XDV; zTAH4C42_P%J0ziXOahNLNTvUpiPx_`POfmNgnjq+Yvhn=<9`bB^S@Y_ox`0qCaDEM z-Xw}mqDu0w-@h|uIr5F_=o4c!jEz}0yB{k1ddZ^m{X>YBtgJ%;1r+1$xj}F88D5^b zv;G-s!qF>ln0`=jo44~@CIZuZI`x;I!XvBzZ zyS+4h1!6-vva(v1DK!c1#1IEvx^`3WW3c~qzY15D#sh8wd#3$qtMY30xlJ2cdwP5K za_bc7(=-MJMixeI88nn(O`resr>Cz^41qI;SldcUD${WB%a;C{@DPBCdO*Y?O^*He zdekFb0G42`ZIsIdUY@C#I=$E|^24IgdayPh%$Y;$UE?JKA{9NFt6Ib@6LR6yEEA*J zLlqk8$^nR6cQZ0FUe2~6Xsb~^hd{qydj=c4z^ObC`$cNJqhK4pemdt?pbI}g|1O%4 z-|TTv!c=l7MVz(rY&k3Epjw_%Y>V}iW*3PS{_$>NVghjQu-4#Go$GSRXj_gNv@-~S zT@Y-Hu=o%5pE_1~=qPcP`DR;VCzDl@_b~^twTvnr4SjXx!{?T2{u~2Nl;ZuU;Lovc zM6jp4|KsCL$-;~5Xy9a_5`Xayz?3z>OA`q@>{O@pku61Cd2J?~b*#$@`{)dW_cJO0EPg(K(R1oD?Tlzi5G zGa3Q@fd1;QF4?y$uHQtu2O`lCs6@9{mh7rbtF{T;Xin`8@V`;uIGt2G8(+v{VPOGI zT-#mZIS;LyWXfwV7Z(@Efg$xz@OdCPdWTWy8w+V;c~Z8|8N;PchMcm{Ca`!);qs^X zsX@-NlTJIn{GOS4b)zkNp);y|mbP!kloVJ0p}6=CzCA8)jdz&?^#J`)^@y_C-HD!8887Yz}5LB%coSUi^iFZvA?nF4&vcJ_yN*HlVSNwW zZ*Ol;sNhhlT$x%K_U<(abFEOR|D*as`OKL!fvzdir)~Y~6R&XT>FZBSO=Udg8pA3X zK_Etd=?iYSy!KSG>9?dx(DVwxI5j9FF-u4rK=|bY4&I|Lr?aj@->MGfamN|p!BVMb zn^ghOOSujOn0D}|`Y5jy1~7amJxvd0Ju z04Ujur7HsbQ_Z-x1BtyKIl3`fBc!P43=Wt$ zN-0!3cV(WL+`&kKLdrhn^9F^iPf%h({Zhs0hj878H-%lwuR&7i3W$06>7Tc?`i$=Le&XBe*_= za4EE9-(b_u=LbpX2VNqGs52JP%|UPQs_W7dXqETzNnhd>4?{;UMG~t5GOPrvCTKHn z|M*RCA>u$_@v$AgDJfjw`X9c3k3dG9fOKv>^!X5UZIck~3s8ES>Kt9|S9m3#s4l^xAAhdnp#bzD(dl3K{4Ka|QA9HF1LqJ%+etmO>5jnvDn@WFXJ95{bIt|OtOpdo88|G^Ynks!F*Rf9^Pio%QY8x3ikIBfunyJ=|3ap>-OdY0?=Kh*{> zi$lS%8Lq#F0{jeuV9&t7M@YUU5ZvD0UJ`^_Xj>^~#=@DR1zh|bt4y3eh;nKJ3*jOz zg5Z9-;2eBr&7bRij~R`s zF?*j}(T>Kcfi{mIDuuuTI;QZHhJkzELn3YfPt%OQF#6DN`K6>-#JmM4L-L{(neU^- za*KmyPFjf5Lc-nPGA?rwnCDBf$xSG9f8bv#Ao)`uP*G+$GSRy&;#qa zU%Cj|I&5>U{RAVS5%7tAMjsAc^Q{Cp!0{r~IbuTqf|>V5DT1{s0N01{nFPKpmqczy zSCO3{&Mih!a#I3{mlHmq$4clsh}g0(Ujm@&XFv_Ry*N?M#6rRVm`2Thnb6eCTGHi}ccA@vzg)9xqRjP8sq3d{VD#FXZ@H)D@v6|0 zmNiMq-qBGxq?E9eH}+w(Wo^4jEC0|PIM4#v1=+*^*GT+eaAz_i?S!Ik0xnEUTs#dI z1u0g{#)ePahpIa{r=!Q6TqITf9wKEK95+?3Z2K#;(=|2eP}p~#($Z3cdU?XM@i~Ako#<`h5}$G`OK(@P zyYplvOA>lH1PXI>!%ciMoo?G`_~~&pq$1!WwfjEI_Nl4C!B4VjIMqwh@|nYtY0kc} zA1I5<&(Dug?{hjFeTk;lI~$WkedvwFK%k*mb3(6$dIdnYo#vP<@?S#%ts z`!;~3EnHJ9vY~FsW*``@EnvKd4jlq0^~5Oyh~?KtOWj86Dw7aTDE)}!vA~De9|CPC zr5jXA?u>4}<#PB?ns**Yo6@mCVZQ37`VJE*_C?!ckALM2hx;BqZvR$5G1jL3=vPBY zB0O5s4S|vcl{?K4C_!L6Q)s>NV!ihp7~3u{I$Q`jCI`_jkZyoUA^W)GdT%oSG`oE& z?WZ+auOxdFH0%m^ku<#JRBfve>2-&yGYgw$YR@r=CoDZ+2~Hs+92BCxm|P8|U8{AjDf=t*URuV#Qk=6clUjI;)_b$JvbQBXHbSR z@d~(Q1UrGtLLf_l>(Cy*MrKjCWc#WgG3!8xRrq?<9P3JQPX1S`w6Fym96bZW4_({_Pw?c7F&I#^Po`xVlEL&VPs^qY987g z^qMm8A*@X3ABq9|pwIZ>f3_&0r-rUOYH+5VI^!_xJO)m>y1HP8?kK8Gaewb-Sch+4 
diff --git a/requirements.txt b/requirements.txt index 4ac0a8a047..2e23b0a981 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,13 @@ pybind11 cmake-build-extension ninja requests +qualname +regex +requests +--index-url https://download.pytorch.org/whl/cpu torch +--index-url https://download.pytorch.org/whl/cpu torchaudio +--index-url https://download.pytorch.org/whl/cpu torchvision +onnx +transformers +sentencepiece +einops From 4fd369ae121131a98896562b10c30374fd592b28 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 16 Aug 2023 15:22:41 -0500 Subject: [PATCH 198/344] Fix CUDA Error in the sampling operator (#966) * fix sampling issue * rename SamplingConfig as GenerationConfig to be consistent with HuggingFace * no longer need to pass mode to compile * .
* prettier print * sampling_config -> generation_config * fix python interface --------- Co-authored-by: xinhaoc --- include/flexflow/flexflow_c.h | 5 +- include/flexflow/inference.h | 6 +- inference/incr_decoding/incr_decoding.cc | 6 +- inference/models/llama.cc | 8 +- inference/models/llama.h | 2 +- inference/models/starcoder.cc | 10 +-- inference/models/starcoder.h | 2 +- inference/python/incr_decoding.py | 7 +- inference/python/spec_infer.py | 10 +-- inference/spec_infer/spec_infer.cc | 6 +- python/flexflow/core/flexflow_cffi.py | 6 +- python/flexflow/serve/__init__.py | 20 ++--- python/flexflow/serve/models/base.py | 2 +- python/flexflow/serve/models/falcon.py | 10 +-- python/flexflow/serve/models/llama.py | 10 +-- python/flexflow/serve/models/opt.py | 10 +-- python/flexflow/serve/models/starcoder.py | 24 +++--- python/flexflow/serve/serve.py | 41 ++++----- src/c/flexflow_c.cc | 9 +- src/ops/linear.cc | 8 +- src/ops/sampling.cc | 13 ++- src/runtime/request_manager.cc | 86 ++++++++++++------- .../python_test_configs/generate_configs.py | 2 +- 23 files changed, 170 insertions(+), 133 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index f2deacd8d7..76cfd16ee1 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -97,11 +97,14 @@ void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_, int flexflow_config_get_python_data_loader_type(flexflow_config_t handle); +bool flexflow_config_get_offload(flexflow_config_t handle); + // ----------------------------------------------------------------------- // FFModel // ----------------------------------------------------------------------- -flexflow_model_t flexflow_model_create(flexflow_config_t config); +flexflow_model_t flexflow_model_create(flexflow_config_t config, + bool cpu_offload); void flexflow_model_destroy(flexflow_model_t handle); diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index 4e7d9ffcbc..f24a797ffd 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -20,16 +20,16 @@ namespace FlexFlow { -struct SamplingConfig { +struct GenerationConfig { bool do_sample = false; float temperature = 0.8; float topp = 0.6; - SamplingConfig(bool _do_sample, float _temperature, float _topp) { + GenerationConfig(bool _do_sample, float _temperature, float _topp) { temperature = _temperature > 0 ? _temperature : temperature; topp = _topp > 0 ? 
_topp : topp; do_sample = _do_sample; } - SamplingConfig() {} + GenerationConfig() {} }; struct GenerationResult { diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index e94cb4a4e9..e60e6a5283 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -180,7 +180,7 @@ void FlexFlow::top_level_task(Task const *task, assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); - SamplingConfig samplingConfig(do_sample, temperature, topp); + GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->register_tokenizer(model_type, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); @@ -191,7 +191,7 @@ void FlexFlow::top_level_task(Task const *task, config_filepath, weights_filepath, INC_DECODING_MODE, - samplingConfig, + generationConfig, use_full_precision); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, @@ -210,7 +210,7 @@ void FlexFlow::top_level_task(Task const *task, config_filepath, weights_filepath, INC_DECODING_MODE, - samplingConfig, + generationConfig, use_full_precision); } else { assert(false && "unknow model type"); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 5a607e4872..e2eabec341 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -24,7 +24,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - SamplingConfig samplingConfig, + GenerationConfig generation_config, bool use_full_precision) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); @@ -190,10 +190,10 @@ void LLAMA::create_llama_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // Tensor softmax = ff.softmax(dense, -1); - if (samplingConfig.do_sample) { - dense = ff.scalar_truediv(dense, samplingConfig.temperature, false); + if (generation_config.do_sample) { + dense = ff.scalar_truediv(dense, generation_config.temperature, false); Tensor softmax = ff.softmax(dense, -1); - output = ff.sampling(softmax, samplingConfig.topp); + output = ff.sampling(softmax, generation_config.topp); } else { // output = ff.arg_top_k(dense, /*k=*/1, false); output = ff.argmax(dense, /*beam_Search*/ false); diff --git a/inference/models/llama.h b/inference/models/llama.h index 311bdedbe6..f01a7dbd52 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -81,7 +81,7 @@ class LLAMA { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - SamplingConfig samplingConfig, + GenerationConfig generation_config, bool use_full_precision = false); }; diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index fcd41e21ea..4b27498cfd 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -25,7 +25,7 @@ void STARCODER::create_starcoder_model( std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - SamplingConfig samplingConfig, + GenerationConfig generationConfig, bool use_full_precision) { // do not apply cpu offload in beam search model. 
STARCODERConfig startcoder_config(model_config_file_path); @@ -151,8 +151,6 @@ void STARCODER::create_starcoder_model( l2_layer); // mlp - std::cout << "intermediate_size: " << startcoder_config.intermediate_size - << "\n"; Tensor c_fc = ff.dense( l2_norm, startcoder_config.intermediate_size, AC_MODE_NONE, true); Layer *c_fc_layer = ff.layers.back(); @@ -185,10 +183,10 @@ void STARCODER::create_starcoder_model( output = ff.argmax(softmax, /*beam_Search*/ true); } else { // Tensor softmax = ff.softmax(dense, -1); - if (samplingConfig.do_sample) { - lm_head = ff.scalar_truediv(lm_head, samplingConfig.temperature, false); + if (generationConfig.do_sample) { + lm_head = ff.scalar_truediv(lm_head, generationConfig.temperature, false); Tensor softmax = ff.softmax(lm_head, -1); - output = ff.sampling(softmax, samplingConfig.topp); + output = ff.sampling(softmax, generationConfig.topp); } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index af0732f3c4..9789a1c36e 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -69,7 +69,7 @@ class STARCODER { std::string const &model_config_file_path, std::string const &weight_file_path, InferenceMode mode, - SamplingConfig samplingConfig, + GenerationConfig generationConfig, bool use_full_precision = false); }; diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index b3cee48458..1ed7791143 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -43,7 +43,7 @@ def get_configs(): # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_gpu": 30000, + "zero_copy_memory_per_node": 30000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -92,12 +92,11 @@ def main(): ) # Compile the LLM for inference and load the weights into memory - sampling_config = ff.SamplingConfig( + generation_config = ff.GenerationConfig( do_sample=False, temperature=0.9, topp=0.8, topk=1 ) llm.compile( - ff.InferenceMode.INC_DECODING_MODE, - sampling_config, + generation_config, max_batch_size=1, max_seq_length=256, max_tokens_per_batch=64, diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index a19a930f2d..192960b533 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -43,7 +43,7 @@ def get_configs(): # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_gpu": 30000, + "zero_copy_memory_per_node": 30000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -126,15 +126,14 @@ def main(): ssms.append(ssm) # Create the sampling configs - sampling_config = ff.SamplingConfig( + generation_config = ff.GenerationConfig( do_sample=False, temperature=0.9, topp=0.8, topk=1 ) # Compile the SSMs for inference and load the weights into memory for ssm in ssms: ssm.compile( - ff.InferenceMode.BEAM_SEARCH_MODE, - sampling_config, + generation_config, max_batch_size=1, max_seq_length=256, max_tokens_per_batch=64, @@ -142,8 +141,7 @@ def main(): # Compile the LLM for inference and load the weights into memory llm.compile( - ff.InferenceMode.TREE_VERIFY_MODE, - sampling_config, + generation_config, max_batch_size=1, max_seq_length=256, max_tokens_per_batch=64, diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 67faf98536..975b66c82e 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -253,7 
+253,7 @@ void FlexFlow::top_level_task(Task const *task, ffconfig.numNodes * ffconfig.workersPerNode); // Create SentencePiece tokenizer or OPT tokenizer - SamplingConfig samplingConfig; + GenerationConfig generationConfig; InferenceManager *im = InferenceManager::get_inference_manager(); RequestManager *rm = RequestManager::get_request_manager(); rm->register_tokenizer(model_metadata.llm_model_type, @@ -268,7 +268,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, TREE_VERIFY_MODE, - samplingConfig, + generationConfig, use_full_precision); } else if (model_metadata.llm_model_type == ModelType::OPT) { OPT::create_opt_model(tree_model, @@ -306,7 +306,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, - samplingConfig, + generationConfig, use_full_precision); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 48193720eb..9fa3ab3aad 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -667,6 +667,10 @@ def pipeline_parallelism_degree(self, value): @property def python_data_loader_type(self): return ffc.flexflow_config_get_python_data_loader_type(self.handle) + + @property + def cpu_offload(self): + return ffc.flexflow_config_get_offload(self.handle) def get_current_time(self): return ffc.flexflow_get_current_time(self.handle) @@ -1020,7 +1024,7 @@ def __init__(self, ffconfig): :returns: FFModel -- the model. """ - self.handle = ffc.flexflow_model_create(ffconfig.handle) + self.handle = ffc.flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) self._handle = ffi.gc(self.handle, ffc.flexflow_model_destroy) self._layers = dict() self._nb_layers = 0 diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 8b054f0120..315a1e4317 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -41,7 +41,7 @@ def init(configs: Union[str, dict]): The init function takes three mandatory parameters, which cannot be changed after starting the runtime. 
These are: - num_gpus: the number of GPUs to reserve for the runtime - memory_per_gpu: the amount of memory (in MB) to pre-allocate on each GPU - - zero_copy_memory_per_gpu: the amount of zero-copy memory (in MB) to pre-allocate for each GPU + - zero_copy_memory_per_node: the amount of zero-copy memory (in MB) to pre-allocate for each node In addition, the following optional parameters can be passed: - num_cpus: the number of CPU processors to reserve for the runtime, defaults to 4 @@ -59,7 +59,7 @@ def init(configs: Union[str, dict]): :param configs: The runtime configs, in the form of a dictionary or the path to a JSON file :type configs: Union[str, dict] :raises ValueError: This function will raise an exception if the JSON file pointed to by the input string is not in the right format - :raises ValueError: This function will raise an exception if the mandatory FlexFlow initialization parameters are missing, or are not positive integers: num_gpus, memory_per_gpu, zero_copy_memory_per_gpu + :raises ValueError: This function will raise an exception if the mandatory FlexFlow initialization parameters are missing, or are not positive integers: num_gpus, memory_per_gpu, zero_copy_memory_per_node """ configs_dict = {} if type(configs) == str: @@ -82,15 +82,15 @@ def init(configs: Union[str, dict]): # configs should contain the following mandatory keys with non-zero integer values: num_gpus = configs_dict.get("num_gpus") memory_per_gpu = configs_dict.get("memory_per_gpu") - zero_copy_memory_per_gpu = configs_dict.get("zero_copy_memory_per_gpu") - if not num_gpus or not memory_per_gpu or not zero_copy_memory_per_gpu: + zero_copy_memory_per_node = configs_dict.get("zero_copy_memory_per_node") + if not num_gpus or not memory_per_gpu or not zero_copy_memory_per_node: raise ValueError( - "Missing one of the following configs: num_gpus, memory_per_gpu, zero_copy_memory_per_gpu" + "Missing one of the following configs: num_gpus, memory_per_gpu, zero_copy_memory_per_node" ) _parse_positive_int_config("num_gpus", num_gpus, "-ll:gpu") _parse_positive_int_config("memory_per_gpu", memory_per_gpu, "-ll:fsize") _parse_positive_int_config( - "zero_copy_memory_per_gpu", zero_copy_memory_per_gpu, "-ll:zsize" + "zero_copy_memory_per_node", zero_copy_memory_per_node, "-ll:zsize" ) # parse optional arguments @@ -141,8 +141,8 @@ def init(configs: Union[str, dict]): if fusion: sys.argv += ["--fusion"] - global LLM, SSM, SamplingConfig - from .serve import LLM, SSM, SamplingConfig + global LLM, SSM, GenerationConfig + from .serve import LLM, SSM, GenerationConfig def init_cpu(): @@ -154,5 +154,5 @@ def init_cpu(): # Ask the runtime to avoid using GPU/GPU memory os.environ["CPU_ONLY_TEST"] = "1" - global LLM, SSM, SamplingConfig - from .serve import LLM, SSM, SamplingConfig + global LLM, SSM, GenerationConfig + from .serve import LLM, SSM, GenerationConfig diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index 7538ed1676..b7f4e54fc1 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -17,7 +17,7 @@ class FlexFlowModel: def __init__( self, mode, - sampling_config, + generation_config, ffconfig, hf_config, data_type, diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 5cb5443f42..4fcaca6c33 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -38,7 +38,7 @@ class FlexFlowFalcon(FlexFlowModel): def __init__( self, mode, - sampling_config, + 
generation_config, ffconfig, hf_config, data_type, @@ -49,7 +49,7 @@ def __init__( tokenizer_filepath="", ): self.mode = mode - self.sampling_config = sampling_config + self.generation_config = generation_config self.ffconfig = ffconfig self.max_batch_size = max_batch_size self.data_type = data_type @@ -204,12 +204,12 @@ def build_model(self): # output = ffmodel.beam_top_k(softmax, self.falcon_config.max_beam_width, False) output = ffmodel.argmax(softmax, True) else: - if self.sampling_config.do_sample: + if self.generation_config.do_sample: dense = ffmodel.scalar_true_divide( - lm_head, self.sampling_config.temperature, False + lm_head, self.generation_config.temperature, False ) softmax = ffmodel.softmax(dense, -1) - output = ffmodel.sampling(softmax, self.sampling_config.topp) + output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) output = ffmodel.argmax(lm_head, False) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index cb707f3e57..c716bff34d 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -35,7 +35,7 @@ class FlexFlowLLAMA(FlexFlowModel): def __init__( self, mode, - sampling_config, + generation_config, ffconfig, hf_config, data_type, @@ -46,7 +46,7 @@ def __init__( tokenizer_filepath="", ): self.mode = mode - self.sampling_config = sampling_config + self.generation_config = generation_config self.ffconfig = ffconfig self.max_batch_size = max_batch_size self.data_type = data_type @@ -214,12 +214,12 @@ def build_model(self): # output = ffmodel.beam_top_k(softmax, self.llama_config.max_beam_width, False) output = ffmodel.argmax(softmax, True) else: - if self.sampling_config.do_sample: + if self.generation_config.do_sample: dense = ffmodel.scalar_true_divide( - dense, self.sampling_config.temperature, False + dense, self.generation_config.temperature, False ) softmax = ffmodel.softmax(dense, -1) - output = ffmodel.sampling(softmax, self.sampling_config.topp) + output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(dense, 1, False) output = ffmodel.argmax(dense, False) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index a33b261cb7..d18c0d4cc9 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -40,7 +40,7 @@ class FlexFlowOPT(FlexFlowModel): def __init__( self, mode, - sampling_config, + generation_config, ffconfig, hf_config, data_type, @@ -51,7 +51,7 @@ def __init__( tokenizer_filepath="", ): self.mode = mode - self.sampling_config = sampling_config + self.generation_config = generation_config self.ffconfig = ffconfig self.max_batch_size = max_batch_size self.data_type = data_type @@ -261,12 +261,12 @@ def build_model(self): # output = ffmodel.beam_top_k(softmax, self.opt_config.max_beam_width, False) output = ffmodel.argmax(softmax, True) else: - if self.sampling_config.do_sample: + if self.generation_config.do_sample: dense = ffmodel.scalar_true_divide( - lm_head, self.sampling_config.temperature, False + lm_head, self.generation_config.temperature, False ) softmax = ffmodel.softmax(dense, -1) - output = ffmodel.sampling(softmax, self.sampling_config.topp) + output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) output = ffmodel.argmax(lm_head, False) diff --git a/python/flexflow/serve/models/starcoder.py 
b/python/flexflow/serve/models/starcoder.py index 193f7c8e1a..ec02249841 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -23,21 +23,21 @@ def __init__(self, hf_config): self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 - self.dropout_p = hf_config.dropout_p - self.hidden_size = hf_config.hidden_size + self.dropout_p = hf_config.attn_pdrop + self.hidden_size = hf_config.n_embd self.layer_norm_epsilon = hf_config.layer_norm_epsilon - self.max_position_embeddings = hf_config.max_position_embeddings - self.num_attention_heads = hf_config.num_attention_heads - self.num_hidden_layers = hf_config.num_hidden_layers + self.max_position_embeddings = hf_config.n_positions + self.num_attention_heads = hf_config.n_head + self.num_hidden_layers = hf_config.n_layer self.vocab_size = hf_config.vocab_size - self.intermediate_size = hf_config.intermediate_size + self.intermediate_size = hf_config.n_inner class FlexFlowSTARCODER(FlexFlowModel): def __init__( self, mode, - sampling_config, + generation_config, ffconfig, hf_config, data_type, @@ -48,7 +48,7 @@ def __init__( tokenizer_filepath="", ): self.mode = mode - self.sampling_config = sampling_config + self.generation_config = generation_config self.ffconfig = ffconfig self.max_batch_size = max_batch_size self.data_type = data_type @@ -91,7 +91,7 @@ def build_model(self): position_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) - ffmodel.set_position_offset(2) + ffmodel.set_position_offset(0) token = ffmodel.embedding( input_tensor, self.starcoder_config.vocab_size, @@ -193,12 +193,12 @@ def build_model(self): name="lm_head_weight", ) - if self.sampling_config.do_sample: + if self.generation_config.do_sample: dense = ffmodel.scalar_true_divide( - lm_head, self.sampling_config.temperature, False + lm_head, self.generation_config.temperature, False ) softmax = ffmodel.softmax(dense, -1) - output = ffmodel.sampling(softmax, self.sampling_config.topp) + output = ffmodel.sampling(softmax, self.generation_config.topp) else: output = ffmodel.argmax(lm_head, False) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 7abea56a7d..ad4e8f594b 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -25,7 +25,7 @@ from typing import Union, List -class SamplingConfig: +class GenerationConfig: """A class to store the sampling configs.""" def __init__( @@ -250,8 +250,7 @@ def __load_hf_weights(self): def compile( self, - mode: InferenceMode = InferenceMode.INC_DECODING_MODE, - sampling_config: SamplingConfig = SamplingConfig(), + generation_config: GenerationConfig = GenerationConfig(), max_batch_size: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, @@ -264,8 +263,8 @@ def compile( :param mode: The LLM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE :type mode: InferenceMode, optional - :param sampling_config: The SamplingConfig object with the configurations to use for sampling, defaults to SamplingConfig() - :type sampling_config: SamplingConfig, optional + :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() + :type generation_config: GenerationConfig, optional :param 
max_batch_size: The maximum batch size to allow, defaults to 1 :type max_batch_size: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 @@ -285,12 +284,16 @@ def compile( self.max_seq_length = max_seq_length self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms - self.sampling_config = SamplingConfig() + self.generation_config = GenerationConfig() self.ffconfig = FFConfig() - assert ( - mode == InferenceMode.INC_DECODING_MODE - or mode == InferenceMode.BEAM_SEARCH_MODE - ) == (len(ssms) == 0) + if len(ssms) > 0: + assert type(self) == LLM + mode = InferenceMode.TREE_VERIFY_MODE + elif type(self) == SSM: + mode = InferenceMode.BEAM_SEARCH_MODE + else: + assert type(self) == LLM + mode = InferenceMode.INC_DECODING_MODE # Apply model-specific parallelism degrees, if needed if model_specific_data_parallelism_degree: @@ -309,7 +312,7 @@ def compile( # Instantiate the relevant model self.model = self.model_class( mode, - sampling_config, + generation_config, self.ffconfig, self.hf_config, self.data_type, @@ -336,7 +339,7 @@ def compile( for ssm in self.ssms: self.rm.register_ssm_model(ssm.model.ffmodel) - def generate(self, prompts: Union[str, List[str]]): + def generate(self, prompts: Union[str, List[str]], max_length: int = 128): """Generate tokens based on the input prompt(s) :param prompts: The generation prompt(s) in the form of a string, or list of strings @@ -347,11 +350,11 @@ def generate(self, prompts: Union[str, List[str]]): if type(prompts) == str: if len(prompts) == 0: return None - return self.model.ffmodel.generate(prompts, 128) + return self.model.ffmodel.generate(prompts, max_length) elif type(prompts) == list: if len(prompts) == 0: return [] - return [self.model.ffmodel.generate(prompt, 128) for prompt in prompts] + return [self.model.ffmodel.generate(prompt, max_length) for prompt in prompts] else: assert False, "Please pass a non-empty string or list of strings" @@ -390,8 +393,7 @@ def __init__( def compile( self, - mode: InferenceMode = InferenceMode.INC_DECODING_MODE, - sampling_config: SamplingConfig = SamplingConfig(), + generation_config: GenerationConfig = GenerationConfig(), max_batch_size: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, @@ -404,8 +406,8 @@ def compile( :param mode: The SSM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE :type mode: InferenceMode, optional - :param sampling_config: The SamplingConfig object with the configurations to use for sampling, defaults to SamplingConfig() - :type sampling_config: SamplingConfig, optional + :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() + :type generation_config: GenerationConfig, optional :param max_batch_size: The maximum batch size to allow, defaults to 1 :type max_batch_size: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 @@ -422,8 +424,7 @@ def compile( :type ssms: list, optional """ super().compile( - mode, - sampling_config, + generation_config, max_batch_size, max_seq_length, max_tokens_per_batch, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index b1363faacc..47c5d59f66 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -170,14 +170,19 @@ int 
flexflow_config_get_python_data_loader_type(flexflow_config_t handle_) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); return handle->python_data_loader_type; } +bool flexflow_config_get_offload(flexflow_config_t handle_) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->cpu_offload; +} // ----------------------------------------------------------------------- // FFModel // ----------------------------------------------------------------------- -flexflow_model_t flexflow_model_create(flexflow_config_t config_) { +flexflow_model_t flexflow_model_create(flexflow_config_t config_, + bool cpu_offload) { FFConfig *config = FFCObjectWrapper::unwrap(config_); - FFModel *model = new FFModel(*config); + FFModel *model = new FFModel(*config, cpu_offload); DEBUG_PRINT("[FFModel] new %p", model); return FFCObjectWrapper::wrap(model); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 2376f80bec..21436d24d6 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -480,10 +480,10 @@ OpMeta *Linear::init_task_with_dim(Task const *task, // int in_dim = acc_kernel.rect.hi[0] - acc_kernel.rect.lo[0] + 1; int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1; int batch_size = acc_output.rect.volume() / out_dim; - printf("init linear (input): in_dim(%d) out_dim(%d) batch_size(%d)\n", - in_dim, - out_dim, - batch_size); + // printf("init linear (input): in_dim(%d) out_dim(%d) batch_size(%d)\n", + // in_dim, + // out_dim, + // batch_size); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 79e6027b7c..6eb62b2933 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -256,12 +256,13 @@ FutureMap Sampling::inference(FFModel const &ff, << std::endl; */ IndexLauncher launcher(SAMPLING_INF_TASK_ID, parallel_is, - TaskArgument(&bc, sizeof(BatchConfig)), + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_WRITE, @@ -284,8 +285,14 @@ InferenceResult Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); - BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // BatchConfig const *bc = (BatchConfig *)task->args; SamplingMeta const *m = *((SamplingMeta **)task->local_args); + if (bc->num_tokens == 0) { + // Directly return for empty batch config + InferenceResult ir; + return ir; + } GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -344,4 +351,4 @@ size_t hash::operator()( hash_combine(key, params.top_p); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0c32da3291..348272a69b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -311,8 +311,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // This is a decoding token log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); - std::string output = this->tokenizer_->Decode(request.tokens); - log_req_mgr.print("Output: %s", output.c_str()); + // std::string output = 
this->tokenizer_->Decode(request.tokens); + // log_req_mgr.print("Output: %s", output.c_str()); } } // Step 2: prepare the next batch for existing requests @@ -334,9 +334,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); - for (int i = 0; i < request.tokens.size(); i++) { - std::cout << request.tokens.at(i) << "\n"; - } + // for (int i = 0; i < request.tokens.size(); i++) { + // std::cout << request.tokens.at(i) << "\n"; + // } { // update generation result and trigger future GenerationResult &gr = request_generation_results[request.guid]; @@ -560,7 +560,7 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].max_depth; // do the slot exchange to minimize the cache exchange in kernel. - std::cout << "update metadata" << std::endl; + // std::cout << "update metadata" << std::endl; update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= @@ -1368,26 +1368,39 @@ std::vector> std::vector> new_committed_tokens = std::vector>(); - log_req_mgr.print("Input size (%zu) Output size (%zu)", + log_req_mgr.print("Input tree size (%zu) Output tree size (%zu)", inputSerializedTree.size(), outputSerializedTree.size()); - - log_req_mgr.print("========Input============"); - // inputSerializedTree is the dfs_tree_inputs_map[guid] array og (token id, - // depth) pairs - for (auto const &pair : inputSerializedTree) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + { // Input tree + std::ostringstream oss; + // inputSerializedTree is the dfs_tree_inputs_map[guid] array og (token id, + // depth) pairs + for (auto const &pair : inputSerializedTree) { + oss << " " << pair.second << ":" << pair.first; + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + } + log_req_mgr.print("Input tree:%s", oss.str().c_str()); } - log_req_mgr.print("========Output============"); - // outputSerializedTree is an array of (token id, depth + 1) pairs - for (auto const &pair : outputSerializedTree) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + { // Output tree + // log_req_mgr.print("========Output============"); + // outputSerializedTree is an array of (token id, depth + 1) pairs + std::ostringstream oss; + for (auto const &pair : outputSerializedTree) { + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + oss << " " << pair.second << ":" << pair.first; + } + log_req_mgr.print("Output tree:%s", oss.str().c_str()); } - log_req_mgr.print("========Committed============"); - // committed_tokens[guid] is an array of (depth, result_index) pairs for the - // given request - for (auto const &pair : committed_tokens.at(guid)) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + { + // log_req_mgr.print("========Committed============"); + // committed_tokens[guid] is an array of (depth, result_index) pairs for + // the given request + std::ostringstream oss; + for (auto const &pair : committed_tokens.at(guid)) { + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + oss << " " << pair.second << ":" << pair.first; + } + log_req_mgr.print("Committed tokens:%s", oss.str().c_str()); } // It's safe to have inputSerializedTree.size() > outputSerializedTree.size() @@ -1423,14 +1436,23 @@ std::vector> } } committed_tokens[guid] = new_committed_tokens; - log_req_mgr.print("========Verified============"); - for (auto const &pair : verifiedTree) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + { + 
// log_req_mgr.print("========Verified============"); + std::ostringstream oss; + for (auto const &pair : verifiedTree) { + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + oss << " " << pair.second << ":" << pair.first; + } + log_req_mgr.print("Verified:%s", oss.str().c_str()); } - - log_req_mgr.print("========New Committed============"); - for (auto const &pair : committed_tokens.at(guid)) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + { + // log_req_mgr.print("========New Committed============"); + std::ostringstream oss; + for (auto const &pair : committed_tokens.at(guid)) { + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + oss << " " << pair.second << ":" << pair.first; + } + log_req_mgr.print("New committed:%s", oss.str().c_str()); } return verifiedTree; @@ -1453,10 +1475,10 @@ std::vector> auto guid = old_bc.requestsInfo[request_index].request_guid; Request &request = all_requests[guid]; - std::cout << "request.beam_trees.size(): " << request.beam_trees.size() - << std::endl; + // std::cout << "request.beam_trees.size(): " << request.beam_trees.size() + // << std::endl; BeamTree tree = request.beam_trees.at(old_bc.model_id); - std::cout << "\n\n"; + // std::cout << "\n\n"; // token, index // todo make this one global for different stages diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 91cf317db4..9c4c37b2e7 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -6,7 +6,7 @@ # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_gpu": 30000, + "zero_copy_memory_per_node": 30000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, From 1179a8e70ee01c74f8f08420aed1c0355ab46b4d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 17 Aug 2023 09:44:16 -0400 Subject: [PATCH 199/344] Fix `requirements.txt` (#969) * fix * update workflow --- .github/workflows/gpu-ci-skip.yml | 1 + .github/workflows/gpu-ci.yml | 2 ++ .github/workflows/pip-install-skip.yml | 1 + .github/workflows/pip-install.yml | 2 ++ conda/flexflow.yml | 8 ++++---- docker/flexflow-environment/Dockerfile | 2 +- requirements.txt | 8 ++++---- 7 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index 01ceba65dd..6a18e56bd1 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -11,6 +11,7 @@ on: - "inference/**" - "src/**" - "tests/inference/**" + - "conda/flexflow.yml" - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index a6ea492bcf..f732513e0d 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -11,6 +11,7 @@ on: - "inference/**" - "src/**" - "tests/inference/**" + - "conda/flexflow.yml" - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" @@ -29,6 +30,7 @@ on: - "inference/**" - "src/**" - "tests/inference/**" + - "conda/flexflow.yml" - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" diff --git a/.github/workflows/pip-install-skip.yml b/.github/workflows/pip-install-skip.yml index f2606b94d8..92c3223e32 100644 --- a/.github/workflows/pip-install-skip.yml +++ b/.github/workflows/pip-install-skip.yml @@ -7,6 +7,7 @@ on: - "deps/**" - "python/**" - 
"setup.py" + - "requirements.txt" - ".github/workflows/helpers/install_dependencies.sh" - ".github/workflows/pip-install.yml" workflow_dispatch: diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index 7d60d3bf52..d79834e31d 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -7,6 +7,7 @@ on: - "deps/**" - "python/**" - "setup.py" + - "requirements.txt" - ".github/workflows/helpers/install_dependencies.sh" - ".github/workflows/pip-install.yml" push: @@ -18,6 +19,7 @@ on: - "deps/**" - "python/**" - "setup.py" + - "requirements.txt" - ".github/workflows/helpers/install_dependencies.sh" - ".github/workflows/pip-install.yml" workflow_dispatch: diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 5987cae0a3..57101ac4ff 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -15,12 +15,12 @@ dependencies: - qualname>=0.1.0 - keras_preprocessing>=1.1.2 - numpy>=1.16.0 - - torch --index-url https://download.pytorch.org/whl/cpu - - torchaudio --index-url https://download.pytorch.org/whl/cpu - - torchvision --index-url https://download.pytorch.org/whl/cpu + - torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu + - torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cpu + - torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cpu - regex - onnx - - transformers + - transformers>=4.31.0 - sentencepiece - einops - requests diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 2f970f272d..09f8be51dc 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -57,7 +57,7 @@ ENV CUDA_DIR /usr/local/cuda RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install CPU-only Pytorch and related dependencies RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch -RUN conda install -c conda-forge onnx transformers sentencepiece einops +RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow # Install Rust diff --git a/requirements.txt b/requirements.txt index 2e23b0a981..2e8d4ad962 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,10 +10,10 @@ requests qualname regex requests ---index-url https://download.pytorch.org/whl/cpu torch ---index-url https://download.pytorch.org/whl/cpu torchaudio ---index-url https://download.pytorch.org/whl/cpu torchvision +torch==2.0.1 +torchaudio==2.0.2 +torchvision==0.15.2 onnx -transformers +transformers>=4.31.0 sentencepiece einops From 534adaf58663f7fa937aa026cccf537e479e6642 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Thu, 17 Aug 2023 11:55:54 -0400 Subject: [PATCH 200/344] check starcoder not run with tp (#971) * check starcoder * num_kv_head * fix --------- Co-authored-by: Zhihao Jia --- python/flexflow/serve/models/starcoder.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index ec02249841..922d0e4746 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -31,6 +31,7 @@ def __init__(self, hf_config): self.num_hidden_layers = hf_config.n_layer self.vocab_size = hf_config.vocab_size self.intermediate_size = hf_config.n_inner + self.n_head_kv = 1 if hf_config.multi_query else hf_config.n_head class 
FlexFlowSTARCODER(FlexFlowModel): @@ -80,7 +81,15 @@ def __init__( raise ValueError( f"Number of attention heads ({self.starcoder_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - + if ( + self.starcoder_config.n_head_kv < self.ffconfig.tensor_parallelism_degree + or self.starcoder_config.n_head_kv % self.ffconfig.tensor_parallelism_degree + != 0 + ): + raise ValueError( + f"Number of k/v attention heads ({self.starcoder_config.n_head_kv}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" + ) + self.build_model() def build_model(self): @@ -134,7 +143,7 @@ def build_model(self): ln_1, self.starcoder_config.hidden_size, self.starcoder_config.num_attention_heads, - 1, + self.starcoder_config.n_head_kv, self.starcoder_config.hidden_size // self.starcoder_config.num_attention_heads, self.starcoder_config.hidden_size From d5a1dccd148bdc633e4b998545496abaf328df80 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 17 Aug 2023 14:46:28 -0400 Subject: [PATCH 201/344] Docs update (#970) * update * group together training docs * C++ docs (#972) * added stub * cleanup doxyfile * added internals stub * cuda first * updated docs copyright * add comments * sphinx update * updated sphinx pages * small update * more docs * update makefile * more docs * integrating doxygen in rtd * formatting * added docs * more docs * update * updates * fix * add missing file * updated readmes * maybe fix * Revert "maybe fix" This reverts commit a73ec2ffb2267bf46b242440a00b705e020a24fb. * fix * fix * fix * fixed everything (for now) * Create github/README.md * Update README.md * update --------- Co-authored-by: Zhihao Jia --- .github/README.md | 37 ++- .gitignore | 5 +- CONTRIBUTING.md | 25 +- INSTALL.md | 2 +- README.md | 94 ++++--- SERVE.md | 209 ++++++++++++++++ TRAIN.md | 65 +++++ docs/Makefile | 2 +- docs/doxygen/Doxyfile | 17 +- docs/source/conf.py | 94 ++++--- docs/source/cpp_api.rst | 10 + .../developers_guide.rst | 2 +- docs/source/developers_guide/ff_internals.rst | 6 + docs/source/developers_guide/internals.md | 15 ++ docs/source/docker.rst | 1 + docs/source/index.rst | 34 +-- docs/source/installation.rst | 3 +- docs/source/keras.rst | 7 +- docs/source/mt5.rst | 6 +- docs/source/onnx.rst | 1 + docs/source/pytorch.rst | 7 +- docs/source/serve_overview.rst | 7 + docs/source/train_examples.rst | 6 + docs/source/train_interface.rst | 8 + docs/source/train_overview.rst | 7 + docs/source/train_python_api.rst | 11 + docs/source/welcome.rst | 1 + include/flexflow/operator.h | 22 ++ src/ops/linear.cc | 94 +++++++ src/runtime/model.cc | 230 +++++++++++++++++- 30 files changed, 873 insertions(+), 155 deletions(-) create mode 100644 SERVE.md create mode 100644 TRAIN.md create mode 100644 docs/source/cpp_api.rst rename docs/source/{ => developers_guide}/developers_guide.rst (64%) create mode 100644 docs/source/developers_guide/ff_internals.rst create mode 100644 docs/source/developers_guide/internals.md create mode 100644 docs/source/serve_overview.rst create mode 100644 docs/source/train_examples.rst create mode 100644 docs/source/train_interface.rst create mode 100644 docs/source/train_overview.rst create mode 100644 docs/source/train_python_api.rst diff --git a/.github/README.md b/.github/README.md index e61c1648ba..99e0f82b62 100644 --- a/.github/README.md +++ b/.github/README.md @@ -4,8 +4,9 @@ --- -## News: +## News🔥: +* [08/16/2023] Adding Starcoder model support * 
[08/14/2023] Released Dockerfile for different CUDA versions ## What is FlexFlow Serve @@ -18,9 +19,10 @@ existing systems by 1.3-2.0x for single-node, multi-GPU inference and by 1.4-2.4x for multi-node, multi-GPU inference.

-Performance comparison +Performance comparison

+ ## Install FlexFlow Serve @@ -178,12 +180,18 @@ which largely reduces the end-to-end inference latency and computational require for serving generative LLMs while provably preserving model quality.

-A Speculative Inference Demo +A Speculative Inference Demo

### Supported LLMs and SSMs -FlexFlow Serve supports a variety of HuggingFace models: +FlexFlow Serve currently supports all HuggingFace models with the following architectures: +* `LlamaForCausalLM` / `LLaMAForCausalLM` (e.g. LLaMA/LLaMA-2, Guanaco, Vicuna, Alpaca, ...) +* `OPTForCausalLM` (models from the OPT family) +* `RWForCausalLM` (models from the Falcon family) +* `GPTBigCodeForCausalLM` (models from the Starcoder family) + +Below is a list of models that we have explicitly tested and for which a SSM may be available: | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | @@ -200,37 +208,26 @@ FlexFlow Serve supports a variety of HuggingFace models: | OPT-66B | facebook/opt-66b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | Falcon-7B | tiiuae/falcon-7b | | | Falcon-40B | tiiuae/falcon-40b | | +| StarCoder-7B | bigcode/starcoderbase-7b | | | StarCoder-15.5B | bigcode/starcoder | | - ### CPU Offloading -FlexFlow Serve also offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. [TODO: update instructions] You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags. +FlexFlow Serve also offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags. ### Quantization -FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. [TODO: update instructions for quantization]. +FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. 
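To make the idea concrete, the snippet below sketches what group-wise int8 compression does to a weight tensor: the payload becomes low-precision integers plus one scale per group, and multiplying by the scales recovers an approximate full-precision tensor. This is only a conceptual sketch; the function names and the `group_size` of 64 are illustrative and do not reflect FlexFlow's or FlexGen's actual on-disk format.

```python
import numpy as np

def quantize_int8(weights: np.ndarray, group_size: int = 64):
    """Group-wise symmetric int8 quantization (illustrative only)."""
    # Assumes the number of elements is divisible by group_size; real tensors may need padding.
    flat = weights.astype(np.float32).reshape(-1, group_size)
    scale = np.maximum(np.abs(flat).max(axis=1, keepdims=True) / 127.0, 1e-8)
    q = np.clip(np.round(flat / scale), -127, 127).astype(np.int8)
    return q, scale  # int8 payload + one float32 scale per group

def dequantize_int8(q: np.ndarray, scale: np.ndarray, shape):
    """Recover an approximate float32 tensor from the int8 payload and scales."""
    return (q.astype(np.float32) * scale).reshape(shape)

w = np.random.randn(1024, 64).astype(np.float32)
q, s = quantize_int8(w)
w_hat = dequantize_int8(q, s, w.shape)
print("max abs reconstruction error:", np.abs(w - w_hat).max())
```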
### Prompt Datasets We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). ## TODOs -FlexFlow Serve and FlexFlow are under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. +FlexFlow Serve is under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. * AMD support. We are actively working on supporting FlexFlow Serve on AMD GPUs and welcome any contributions to this effort. ## Acknowledgements -This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as: - -``` bibtex -@misc{miao2023specinfer, - title={SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification}, - author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia}, - year={2023}, - eprint={2305.09781}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} -``` +This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting FlexFlow Serve. ## License FlexFlow uses Apache License 2.0. diff --git a/.gitignore b/.gitignore index 4aecfb11ff..be0266c9b5 100644 --- a/.gitignore +++ b/.gitignore @@ -88,10 +88,7 @@ docs/build/ # Doxygen documentation docs/doxygen/output/ - -# Exhale documentation -docs/source/_doxygen/ -docs/source/c++_api/ +docs/doxygen/cpp_api/ # PyBuilder .pybuilder/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e607fddb1a..c3c0b5173f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -119,7 +119,26 @@ After adding the DNN layers, the next step before compiling the model for traini #### Model compilation -TODO +Model compilation consists of the following steps: + +1. We initialize an operator for each layer in the model, via the function `create_operators_from_layers()`. Layers work with `Tensor` input/weights/outputs, and are created directly by the user when writing a FlexFlow program. Operators work with `ParallelTensor` objects and they are responsible for running computations by launching kernels on GPUs. +2. Launch the graph optimize task (`GRAPH_OPTIMIZE_TASK_ID`), implemented by`PCG::Graph::graph_optimize_task`, which returns `PCG::GraphOptimalViewSerialized` + 1. call `deserialize_graph_optimal_view(...)` to get `PCG::Graph *best_graph` and `std::unordered_map optimal_views` from deserialized `PCG::GraphOptimalViewSerialized` + 2. `convert_graph_to_operators()` + 3. print the dot of the best graph obtained + 4. map inputs to parallel tensor and weights to parallel tensor? -> strange for loop to understand better +3. Init performance metrics via the `FFModel::update_metrics_task` +4. Perform inplace optimizations (if enabled) +5. Loop through the operators to do the following (to be understood better): + 1. `parameters.push_back(op->weights[i]);` for each weight in each operator + 2. 
`op->map_output_tensors(*this);` + 3. `((ParallelOp *)op)->create_input_partition(*this);` if the operator is a parallel operator +6. Check correctness of the operator's input and output tensors' settings +7. Perform fusion optimizations, if enabled +8. Print all operators and their input and output regions +9. Create the tensor for the label +10. Initialize the optimizer +11. In training mode, if NCCL is enabled, initialize all the communicators and other objects ## Continuous Integration @@ -281,6 +300,10 @@ We want to make contributing to this project as easy and transparent as possible ### Formatting We use `clang-format` to format our C++ code. If you make changes to the code and the Clang format CI test is failing, you can lint your code by running: `./scripts/format.sh` from the main folder of this repo. +### Documenting the code +We follow the Python Docstring conventions for documenting the Python code. We document the C++ code using comments in any of the conventioned supported by Doxygen [see here](https://doxygen.nl/manual/docblocks.html). + + ### Pull Requests We actively welcome your pull requests. diff --git a/INSTALL.md b/INSTALL.md index cdc2a2abbd..8d33770c92 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,4 +1,4 @@ -# Installing FlexFlow +# Building from source To build and install FlexFlow, follow the instructions below. ## 1. Download the source code diff --git a/README.md b/README.md index 9ad900fb3c..e84bf20605 100644 --- a/README.md +++ b/README.md @@ -1,72 +1,53 @@ -# FlexFlow -![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) +# FlexFlow: Low-Latency, High-Performance Training and Serving +![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=inference) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=inference) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=inference) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=inference) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=inference) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=inference) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) -FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. 
Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras). -## Install FlexFlow -To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages for several versions of CUDA and for the `hip_rocm` backend, together with [Dockerfiles](./docker) if you wish to build the containers manually. More info on the Docker images can be found [here](./docker/README.md). You can also use `conda` to install the FlexFlow Python package (coming soon). +--- -## PyTorch Support -Users can also use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`. -```python -import torch -import flexflow.torch.fx as fx +## News 🔥: -model = MyPyTorchModule() -fx.torch_to_flexflow(model, "mymodel.ff") -``` +* [08/16/2023] Adding Starcoder model support +* [08/14/2023] Released Dockerfile for different CUDA versions + +## Install FlexFlow -Second, a FlexFlow program can directly import a previously saved PyTorch model and [autotune](https://www.usenix.org/conference/osdi22/presentation/unger) the parallelization performance for a given parallel machine. -```python -from flexflow.pytorch.model import PyTorchModel +### Requirements +* OS: Linux +* GPU backend: Hip-ROCm or CUDA + * CUDA version: 10.2 – 12.0 + * NVIDIA compute capability: 6.0 or higher +* Python: 3.6 or higher +* Package dependencies: [see here](https://github.com/flexflow/FlexFlow/blob/inference/requirements.txt) -def top_level_task(): - torch_model = PyTorchModel("mymodel.ff") - output_tensor = torch_model.apply(ffmodel, input_tensor) - ## Model compilation - ffmodel.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) - ## Model training - (x_train, y_train) = cifar10.load_data() - ffmodel.fit(x_train, y_train, epochs=30) +### Install with pip +You can install FlexFlow using pip: + +```bash +pip install flexflow ``` -**More FlexFlow PyTorch examples**: see the [pytorch examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/python/pytorch). +### Try it in Docker +If you run into any issue during the install, or if you would like to use the C++ API without needing to install from source, you can also use our pre-built Docker package for different CUDA versions and the `hip_rocm` backend. To download and run our pre-built Docker container: + +```bash +docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-11.8:latest +``` -## TensorFlow Keras and ONNX Support -FlexFlow prioritizes PyTorch compatibility, but also includes frontends for [Tensorflow Keras](./docs/source/keras.rst) and [ONNX](./docs/source/onnx.rst) models. +To download a Docker container for a backend other than CUDA v11.8, you can replace the `cuda-11.8` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](../docker/README.md). -## C++ Interface -For users that prefer to program in C/C++. FlexFlow supports a C++ program inference that is equivalent to its Python APIs. 
+### Build from source -**More FlexFlow C++ examples**: see the [C++ examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp). +You can install FlexFlow Serve from source code by building the inference branch of FlexFlow. Please follow these [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). -## Command-Line Flags -In addition to setting runtime configurations in a FlexFlow Python/C++ program, the FlexFlow runtime also accepts command-line arguments for various runtime parameters: +## Get Started! -FlexFlow training flags: -* `-e` or `--epochs`: number of total epochs to run (default: 1) -* `-b` or `--batch-size`: global batch size in each iteration (default: 64) -* `-p` or `--print-freq`: print frequency (default: 10) -* `-d` or `--dataset`: path to the training dataset. If not set, synthetic data is used to conduct training. +To get started, check out the quickstart guides below for the FlexFlow training and serving libraries. -Legion runtime flags: -* `-ll:gpu`: number of GPU processors to use on each node (default: 0) -* `-ll:fsize`: size of device memory on each GPU (in MB) -* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) on each node (in MB). This is used for prefecthing training images from disk. -* `-ll:cpu`: number of data loading workers (default: 4) -* `-ll:util`: number of utility threads to create per process (default: 1) -* `-ll:bgwork`: number of background worker threads to create per process (default: 1) +* [FlexFlow Train](./TRAIN.md) +* [FlexFlow Serve](./SERVE.md) -Performance auto-tuning flags: -* `--search-budget` or `--budget`: the number of iterations for the MCMC search (default: 0) -* `--search-alpha` or `--alpha`: a hyper-parameter for the search procedure (default: 0.05) -* `--export-strategy` or `--export`: path to export the best discovered strategy (default: None) -* `--import-strategy` or `--import`: path to import a previous saved strategy (default: None) -* `--enable-parameter-parallel`: allow FlexFlow to explore parameter parallelism for performance auto-tuning. (By default FlexFlow only considers data and model parallelism.) -* `--enable-attribute-parallel`: allow FlexFlow to explore attribute parallelism for performance auto-tuning. (By default FlexFlow only considers data and model parallelism.) -For performance tuning related flags: see [performance autotuning](https://flexflow.ai/search). ## Contributing @@ -75,6 +56,14 @@ Please let us know if you encounter any bugs or have any suggestions by [submitt We welcome all contributions to FlexFlow from bug fixes to new features and extensions. ## Citations + +**FlexFlow Serve:** + +* Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Alan Zhu, Lijie Yang, Xiaoxiang Shi, Chunan Shi, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, Zhihao Jia. [SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). In ArXiV, May 2023. + + +**FlexFlow Train:** + * Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. [Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization](https://www.usenix.org/conference/osdi22/presentation/unger). 
In Proceedings of the Symposium on Operating Systems Design and Implementation (OSDI), July 2022. * Zhihao Jia, Matei Zaharia, and Alex Aiken. [Beyond Data and Model Parallelism for Deep Neural Networks](https://cs.stanford.edu/~zhihao/papers/sysml19a.pdf). In Proceedings of the 2nd Conference on Machine Learning and Systems (MLSys), April 2019. @@ -86,3 +75,4 @@ FlexFlow is developed and maintained by teams at CMU, Facebook, Los Alamos Natio ## License FlexFlow uses Apache License 2.0. + diff --git a/SERVE.md b/SERVE.md new file mode 100644 index 0000000000..e716392b32 --- /dev/null +++ b/SERVE.md @@ -0,0 +1,209 @@ +# FlexFlow Serve: Low-Latency, High-Performance LLM Serving + + +## What is FlexFlow Serve + +The high computational and memory requirements of generative large language +models (LLMs) make it challenging to serve them quickly and cheaply. +FlexFlow Serve is an open-source compiler and distributed system for +__low latency__, __high performance__ LLM serving. FlexFlow Serve outperforms +existing systems by 1.3-2.0x for single-node, multi-GPU inference and by +1.4-2.4x for multi-node, multi-GPU inference. + +

+Performance comparison +

+ + +## Quickstart +The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving. +```python +import flexflow.serve as ff + +ff.init( + { + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 30000, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, + } +) +``` +Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). +```python +# Specify the LLM +llm = ff.LLM("decapoda-research/llama-7b-hf") + +# Specify a list of SSMs (just one in this case) +ssms=[] +ssm = ff.SSM("JackFram/llama-68m") +ssms.append(ssm) +``` +Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. +```python +# Create the sampling configs +generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 +) + +# Compile the SSMs for inference and load the weights into memory +for ssm in ssms: + ssm.compile(generation_config) + +# Compile the LLM for inference and load the weights into memory +llm.compile(generation_config, ssms=ssms) +``` +Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult`, which include the output tokens and text. +```python +result = llm.generate("Here are some travel tips for Tokyo:\n") +``` + +### Incremental decoding + +
+Expand here +
+ +```python + +import flexflow.serve as ff + +# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs +ff.init( + { + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_gpu": 30000, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, + } +) + +# Create the FlexFlow LLM +llm = ff.LLM("decapoda-research/llama-7b-hf") + +# Create the sampling configs +generation_config = ff.GenerationConfig( + do_sample=True, temperature=0.9, topp=0.8, topk=1 +) + +# Compile the LLM for inference and load the weights into memory +llm.compile(generation_config) + +# Generation begins! +result = llm.generate("Here are some travel tips for Tokyo:\n") + +``` + +
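The `GenerationConfig` used above controls how the next token is chosen from the final logits: with `do_sample=False` the model takes a plain argmax, while with `do_sample=True` the logits are divided by `temperature`, normalized with a softmax, and sampled from the smallest set of tokens whose cumulative probability reaches `topp`. The sketch below is a NumPy reference for that selection rule only, not the kernel FlexFlow actually runs on the GPU, and `pick_next_token` is a made-up helper name.

```python
import numpy as np

def pick_next_token(logits, do_sample=True, temperature=0.9, topp=0.8, rng=None):
    """Reference greedy vs. temperature + top-p (nucleus) token selection."""
    if not do_sample:
        return int(np.argmax(logits))                 # greedy decoding
    rng = rng or np.random.default_rng()
    z = logits / temperature
    z = z - z.max()                                   # for numerical stability
    probs = np.exp(z) / np.exp(z).sum()               # softmax over the vocabulary
    order = np.argsort(probs)[::-1]                   # most likely tokens first
    cutoff = np.searchsorted(np.cumsum(probs[order]), topp) + 1
    nucleus = order[:cutoff]                          # smallest set with mass >= topp
    p = probs[nucleus] / probs[nucleus].sum()
    return int(rng.choice(nucleus, p=p))

logits = np.random.randn(32000)                       # vocabulary-sized logit vector
print(pick_next_token(logits, do_sample=False))       # deterministic argmax
print(pick_next_token(logits, temperature=0.9, topp=0.8))
```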
+ +### C++ interface +If you'd like to use the C++ interface (mostly used for development and benchmarking purposes), you should install from source, and follow the instructions below. + +
+Expand here +
+ +#### Downloading models + +Before running FlexFlow Serve, you should manually download the LLM and SSM(s) model of interest using the [inference/utils/download_hf_model.py](https://github.com/flexflow/FlexFlow/blob/inference/inference/utils/download_hf_model.py) script (see example below). By default, the script will download all of a model's assets (weights, configs, tokenizer files, etc...) into the cache folder `~/.cache/flexflow`. If you would like to use a different folder, you can request that via the parameter `--cache-folder`. + +```bash +python3 ./inference/utils/download_hf_model.py ... +``` + +#### Running the C++ examples +A C++ example is available at [this folder](../inference/spec_infer/). After building FlexFlow Serve, the executable will be available at `/build_dir/inference/spec_infer/spec_infer`. You can use the following command-line arguments to run FlexFlow Serve: + +* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) +* `-ll:fsize`: size of device memory on each GPU in MB +* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. +* `-llm-model`: the LLM model ID from HuggingFace (e.g. "decapoda-research/llama-7b-hf") +* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-cache-folder`: the folder +* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. +* `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a json format file for prompts. In addition, users can also use the following API for registering requests: +* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency + +For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. + +```bash +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +``` +
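The file passed via the `-prompt` flag above is a JSON file of prompts. A minimal way to produce one from Python is sketched below; the flat list-of-strings schema is an assumption here, so double-check it against the prompt files linked in the Prompt Datasets section before relying on it.

```python
import json

# Hypothetical helper for building a prompt file to pass via the -prompt flag.
# Assumes the expected schema is a flat JSON list of prompt strings.
prompts = [
    "Here are some travel tips for Tokyo:\n",
    "Three tips for staying healthy are:",
]

with open("prompt.json", "w") as f:
    json.dump(prompts, f, indent=2)
```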
+ +## Speculative Inference +A key technique that enables FlexFlow Serve to accelerate LLM serving is speculative +inference, which combines various collectively boost-tuned small speculative +models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a +token tree, whose nodes each represent a candidate token sequence. The correctness +of all candidate token sequences represented by a token tree is verified against the +LLM’s output in parallel using a novel tree-based parallel decoding mechanism. +FlexFlow Serve uses an LLM as a token tree verifier instead of an incremental decoder, +which largely reduces the end-to-end inference latency and computational requirement +for serving generative LLMs while provably preserving model quality. + +

+A Speculative Inference Demo +

+ +### Supported LLMs and SSMs + +FlexFlow Serve currently supports all HuggingFace models with the following architectures: +* `LlamaForCausalLM` / `LLaMAForCausalLM` (e.g. LLaMA/LLaMA-2, Guanaco, Vicuna, Alpaca, ...) +* `OPTForCausalLM` (models from the OPT family) +* `RWForCausalLM` (models from the Falcon family) +* `GPTBigCodeForCausalLM` (models from the Starcoder family) + +Below is a list of models that we have explicitly tested and for which a SSM may be available: + +| Model | Model id on HuggingFace | Boost-tuned SSMs | +| :---- | :---- | :---- | +| LLaMA-7B | decapoda-research/llama-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | +| OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | +| OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | +| OPT-66B | facebook/opt-66b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | +| Falcon-7B | tiiuae/falcon-7b | | +| Falcon-40B | tiiuae/falcon-40b | | +| StarCoder-15.5B | bigcode/starcoder | | + + +### CPU Offloading +FlexFlow Serve also offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. [TODO: update instructions] You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags. + +### Quantization +FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. [TODO: update instructions for quantization]. 
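Since support is keyed on the architecture names listed in the Supported LLMs and SSMs section above rather than on specific checkpoints, a quick sanity check before serving an arbitrary HuggingFace model is to inspect which architecture its config reports. The sketch below uses the `transformers` package (already a FlexFlow Serve dependency); `is_supported` is a hypothetical helper, and the call needs network access or a local cache to fetch the model config.

```python
from transformers import AutoConfig

# Architecture strings copied from the support list above.
SUPPORTED_ARCHITECTURES = {
    "LlamaForCausalLM",
    "LLaMAForCausalLM",
    "OPTForCausalLM",
    "RWForCausalLM",
    "GPTBigCodeForCausalLM",
}

def is_supported(model_id: str) -> bool:
    """Check whether a HuggingFace checkpoint reports a supported architecture."""
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    return any(arch in SUPPORTED_ARCHITECTURES for arch in (config.architectures or []))

print(is_supported("facebook/opt-125m"))   # expected: True
print(is_supported("bigcode/starcoder"))   # expected: True
```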
+ +### Prompt Datasets +We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). + +## TODOs + +FlexFlow Serve is still under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. + +* AMD support. We are actively working on supporting FlexFlow Serve on AMD GPUs and welcome any contributions to this effort. + +## Acknowledgements +This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as: + +``` bibtex +@misc{miao2023specinfer, + title={SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification}, + author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Alan Zhu and Lijie Yang and Xiaoxiang Shi and Chunan Shi and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia}, + year={2023}, + eprint={2305.09781}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +## License +FlexFlow uses Apache License 2.0. diff --git a/TRAIN.md b/TRAIN.md new file mode 100644 index 0000000000..1595274a4c --- /dev/null +++ b/TRAIN.md @@ -0,0 +1,65 @@ +# FlexFlow Train: Distributed DNN Training with Flexible Parallelization Strategies. +FlexFlow Train is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow Train provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow oTrain nly requires [a few lines of changes to the program](https://flexflow.ai/keras). + + +## PyTorch Support +Users can also use FlexFlow Train to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`. +```python +import torch +import flexflow.torch.fx as fx + +model = MyPyTorchModule() +fx.torch_to_flexflow(model, "mymodel.ff") +``` + +Second, a FlexFlow Train program can directly import a previously saved PyTorch model and [autotune](https://www.usenix.org/conference/osdi22/presentation/unger) the parallelization performance for a given parallel machine. + +```python +from flexflow.pytorch.model import PyTorchModel + +def top_level_task(): + torch_model = PyTorchModel("mymodel.ff") + output_tensor = torch_model.apply(ffmodel, input_tensor) + ## Model compilation + ffmodel.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) + ## Model training + (x_train, y_train) = cifar10.load_data() + ffmodel.fit(x_train, y_train, epochs=30) +``` + +**More FlexFlow PyTorch examples**: see the [pytorch examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/python/pytorch). 
+ +## TensorFlow Keras and ONNX Support +FlexFlow Train prioritizes PyTorch compatibility, but also includes frontends for [Tensorflow Keras](./docs/source/keras.rst) and [ONNX](./docs/source/onnx.rst) models. + +## C++ Interface +For users that prefer to program in C/C++. FlexFlow Train supports a C++ program inference that is equivalent to its Python APIs. + +**More FlexFlow C++ examples**: see the [C++ examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp). + + +## Command-Line Flags +In addition to setting runtime configurations in a FlexFlow Train Python/C++ program, the FlexFlow Train runtime also accepts command-line arguments for various runtime parameters: + +FlexFlow training flags: +* `-e` or `--epochs`: number of total epochs to run (default: 1) +* `-b` or `--batch-size`: global batch size in each iteration (default: 64) +* `-p` or `--print-freq`: print frequency (default: 10) +* `-d` or `--dataset`: path to the training dataset. If not set, synthetic data is used to conduct training. + +Legion runtime flags: +* `-ll:gpu`: number of GPU processors to use on each node (default: 0) +* `-ll:fsize`: size of device memory on each GPU (in MB) +* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) on each node (in MB). This is used for prefecthing training images from disk. +* `-ll:cpu`: number of data loading workers (default: 4) +* `-ll:util`: number of utility threads to create per process (default: 1) +* `-ll:bgwork`: number of background worker threads to create per process (default: 1) + +Performance auto-tuning flags: +* `--search-budget` or `--budget`: the number of iterations for the MCMC search (default: 0) +* `--search-alpha` or `--alpha`: a hyper-parameter for the search procedure (default: 0.05) +* `--export-strategy` or `--export`: path to export the best discovered strategy (default: None) +* `--import-strategy` or `--import`: path to import a previous saved strategy (default: None) +* `--enable-parameter-parallel`: allow FlexFlow Train to explore parameter parallelism for performance auto-tuning. (By default FlexFlow Train only considers data and model parallelism.) +* `--enable-attribute-parallel`: allow FlexFlow Train to explore attribute parallelism for performance auto-tuning. (By default FlexFlow Train only considers data and model parallelism.) +For performance tuning related flags: see [performance autotuning](https://flexflow.ai/search). diff --git a/docs/Makefile b/docs/Makefile index 5424c5bc9f..d14c2ef91f 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -15,7 +15,7 @@ help: .PHONY: help Makefile clean clean: - rm -rf build source/_doxygen/ source/c++_api/ doxygen/output + rm -rf build doxygen/output doxygen/cpp_api @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) # Catch-all target: route all unknown targets to Sphinx using the new diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index b38bfc12b5..aafa65d79b 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -44,7 +44,7 @@ PROJECT_NUMBER = # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. -PROJECT_BRIEF = A distributed deep learning framework that supports flexible parallelization strategies. +PROJECT_BRIEF = "A distributed deep learning framework that supports flexible parallelization strategies." # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. 
The maximum height of the logo should not exceed 55 @@ -150,7 +150,7 @@ INLINE_INHERITED_MEMB = NO # shortest path that makes the file name unique will be used # The default value is: YES. -FULL_PATH_NAMES = YES +FULL_PATH_NAMES = NO # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand @@ -874,12 +874,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = $(FF_HOME)/align -INPUT += $(FF_HOME)/bootcamp_demo -INPUT += $(FF_HOME)/examples INPUT += $(FF_HOME)/include -INPUT += $(FF_HOME)/nmt -INPUT += $(FF_HOME)/python INPUT += $(FF_HOME)/src # This tag can be used to specify the character encoding of the source files @@ -911,12 +906,10 @@ INPUT_ENCODING = UTF-8 FILE_PATTERNS = *.c \ *.cc \ - *.cpp \ *.cu \ + *.cpp \ *.h \ - *.hpp \ - *.md \ - *.py + *.hpp # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. @@ -2110,7 +2103,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = YES +GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of diff --git a/docs/source/conf.py b/docs/source/conf.py index 0e614f37c2..f67c0dae01 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,28 +13,42 @@ import os import sys import subprocess +import shutil +import sphinx # only needed for the manual post processing +from pathlib import Path +from m2r2 import convert +from docutils.core import publish_string +import re def get_parent_dir_path(path): return os.path.abspath(os.path.join(path, "..")) docs_path = get_parent_dir_path(os.path.dirname(os.path.abspath(__file__))) doxygen_path = os.path.join(docs_path, "doxygen") +doxygen_output = os.path.join(doxygen_path, "output") +doxygen_cpp_api_out = os.path.join(doxygen_path, "cpp_api") FF_HOME = get_parent_dir_path(docs_path) python_package_path = os.path.join(FF_HOME, "python") sys.path.insert(0, os.path.abspath(python_package_path)) # Build the Doxygen docs -#subprocess.call(f'cd {doxygen_path}; FF_HOME={FF_HOME} doxygen', shell=True) +shutil.rmtree(doxygen_cpp_api_out, ignore_errors=True) +for gpu_backend in ("cuda", "hip"): + doxygen_dest = os.path.join(doxygen_cpp_api_out, f"{gpu_backend}_api") + os.makedirs(doxygen_dest, exist_ok=True) + exclude_extension = ".cu" if gpu_backend == "hip" else ".cpp" + doxygen_cmd = f'export FF_HOME={FF_HOME}; ( cat Doxyfile ; echo "EXCLUDE_PATTERNS+=*{exclude_extension}" ) | doxygen -' + subprocess.check_call(doxygen_cmd, cwd=doxygen_path, shell=True) + subprocess.check_call(f'mv {os.path.join(doxygen_output, "html")}/* {doxygen_dest}/', shell=True) import sphinx_rtd_theme # -- Project information ----------------------------------------------------- project = 'FlexFlow' -copyright = '2020, Stanford, LANL, CMU, Facebook' -author = 'Stanford, LANL, CMU, Facebook' - +copyright = '2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)' +author = 'CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)' # -- General configuration --------------------------------------------------- @@ -45,8 +59,6 @@ def get_parent_dir_path(path): 'sphinx_rtd_theme', 'sphinx.ext.autodoc', 'm2r2', - 'breathe', - 'exhale', ] # Theme options are theme-specific and 
customize the look and feel of a theme @@ -55,6 +67,7 @@ def get_parent_dir_path(path): html_theme_options = { "collapse_navigation" : False } +html_extra_path = [doxygen_cpp_api_out] # Add any paths that contain templates here, relative to this directory. # templates_path = ['_templates'] @@ -86,27 +99,50 @@ def get_parent_dir_path(path): # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] -# Breathe + Exhale configuration -# Setup the breathe extension -breathe_projects = { - "FlexFlow": "./_doxygen/xml" -} -breathe_default_project = "FlexFlow" - -c_plus_plus_src_dirs = " ".join([f"\"{os.path.join(FF_HOME, 'src', dirname)}\"" for dirname in ("loss_functions", "mapper", "metrics_functions", "ops", "parallel_ops", "recompile", "runtime", "utils")]) -# Setup the exhale extension -exhale_args = { - # These arguments are required - "containmentFolder": "./c++_api", - "rootFileName": "c++_api_root.rst", - "doxygenStripFromPath": "..", - # Heavily encouraged optional argument (see docs) - #"rootFileTitle": "Library API", - # Suggested optional arguments - "createTreeView": True, - # TIP: if using the sphinx-bootstrap-theme, you need - # "treeViewIsBootstrap": True, - "exhaleExecutesDoxygen": True, - "exhaleDoxygenStdin": f'INPUT = {c_plus_plus_src_dirs}' -} +def manual_post_processing(app, exception): + if exception is None and app.builder.name == 'html': # build succeeded + print(f'Post-processing HTML docs at path {app.outdir}') + build_dir = Path(app.outdir) + + # List of subfolders to search + folder_paths = [build_dir, build_dir / 'developers_guide'] + + for folder_path in folder_paths: + + # Only get HTML files in build dir, not subfolders + html_files = folder_path.glob('*.html') + + for html_file in html_files: + content = html_file.read_text() + + # Find dropdown menus, and manually convert their contents + pattern = r'
\nExpand here\n
(.*?)
' + blocks = re.findall(pattern, content, re.DOTALL) + + for block in blocks: + # Convert Markdown to HTML + rst = convert(block, github_markdown=True) + html = publish_string(rst, writer_name='html') + html_str = html.decode('utf-8') + + # Replace block with converted HTML + content = content.replace(block, html_str) + + # Add space after dropdown menu block + content = content.replace('', + '\n

') + + # Replace incorrect links + content = content.replace('href="../docker/README.md"', 'href="docker.html"') + content = content.replace('href="./TRAIN.md"', 'href="train_overview.html"') + content = content.replace('href="./SERVE.md"', 'href="serve_overview.html"') + content = content.replace('href="./docs/source/keras.rst"', 'href="keras.html"') + content = content.replace('href="./docs/source/onnx.rst"', 'href="onnx.html"') + + + html_file.write_text(content) + + +def setup(app): + app.connect('build-finished', manual_post_processing) diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst new file mode 100644 index 0000000000..b5d39be62e --- /dev/null +++ b/docs/source/cpp_api.rst @@ -0,0 +1,10 @@ +************* +C++ API +************* + +The FlexFlow backend is at the core of FlexFlow Train and FlexFlow Serve. It is written entirely in C/C++ and CUDA/HIP. This section documents the API, which is generated by Doxygen and it is available at the following links: + +* `CUDA version <./cuda_api/index.html>`_ (default version) +* `HIP version <./hip_api/index.html>`_ + +The two versions only differ when it comes to the GPU kernels, so the great majority of the entries are identical. If you are unsure which version to use, take a look at the CUDA version. diff --git a/docs/source/developers_guide.rst b/docs/source/developers_guide/developers_guide.rst similarity index 64% rename from docs/source/developers_guide.rst rename to docs/source/developers_guide/developers_guide.rst index 107135fae4..a125e60460 100644 --- a/docs/source/developers_guide.rst +++ b/docs/source/developers_guide/developers_guide.rst @@ -2,5 +2,5 @@ Developers Guide ****************** -.. mdinclude:: ../../CONTRIBUTING.md +.. mdinclude:: ../../../CONTRIBUTING.md :start-line: 2 diff --git a/docs/source/developers_guide/ff_internals.rst b/docs/source/developers_guide/ff_internals.rst new file mode 100644 index 0000000000..15c0804255 --- /dev/null +++ b/docs/source/developers_guide/ff_internals.rst @@ -0,0 +1,6 @@ +******************* +FlexFlow Internals +******************* + +.. mdinclude:: internals.md + :start-line: 2 diff --git a/docs/source/developers_guide/internals.md b/docs/source/developers_guide/internals.md new file mode 100644 index 0000000000..243b14a174 --- /dev/null +++ b/docs/source/developers_guide/internals.md @@ -0,0 +1,15 @@ +# FlexFlow Internals + +## The Parallel Computation Graph (PCG) + +FlexFlow uses a _Parallel Computation Graph (PCG)_ to simultaneously represent tensor operations, as well as parallelism choices and data movement across nodes. + +### Tensor representations + +There are two types of tensor representations in FlexFlow: a [Tensor](./cuda_api/de/da9/structFlexFlow_1_1TensorBase.html) and a [ParallelTensor](./cuda_api/d3/dfc/structFlexFlow_1_1ParallelTensorBase.html). The first variant is used when writing a FlexFlow DNN program, whereas the second is used by the runtime to run all the computations in a distributed fashion. `Tensor` and `ParallelTensor` are implemented as typedef-ed pointers to, respectively, the `TensorBase` (defined in `include/flexflow/tensor.h`) and `ParallelTensorBase` (defined in `include/flexflow/parallel_tensor.h`) structs. + +The `ParallelTensor` struct contains all the information that a `Tensor` also stores, but in addition, it also codifies how the tensor should be parallelized. 
For instance, a ParallelTensor records how each dimension is *partitioned*, how many *replicas* of the tensors have been created, and the *mapping* between the partitions of the tensors and the physical machines that will store them. + +## Transformation generation + +## Joint optimization diff --git a/docs/source/docker.rst b/docs/source/docker.rst index 4a457a8dcc..63f84e460c 100644 --- a/docs/source/docker.rst +++ b/docs/source/docker.rst @@ -1,3 +1,4 @@ +:tocdepth: 1 ************* Docker ************* diff --git a/docs/source/index.rst b/docs/source/index.rst index 7af62e417e..2b369ac8e6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -7,47 +7,37 @@ Welcome to FlexFlow's documentation! ==================================== .. toctree:: - :maxdepth: 2 :caption: Getting Started welcome installation docker - jupyter .. toctree:: - :maxdepth: 2 - :caption: Interoperability + :caption: FlexFlow Serve - keras - pytorch - onnx + serve_overview .. toctree:: - :maxdepth: 2 - :caption: Examples - - mt5 + :caption: FlexFlow Train -.. toctree:: - :maxdepth: 3 - :caption: Python API + train_overview + train_interface + train_examples - python/models - python/layers - python/dataloader + train_python_api .. toctree:: - :maxdepth: 2 - :caption: C++ API + :caption: FlexFlow Backend - c++_api/c++_api_root + cpp_api .. toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: Developers Guide - developers_guide + developers_guide/developers_guide.rst +.. developers_guide/ff_internals.rst .. Indices and tables diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 109b546834..95ec8596e6 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -1,5 +1,6 @@ +:tocdepth: 1 ************* -Installing FlexFlow +Building from source ************* .. mdinclude:: ../../INSTALL.md diff --git a/docs/source/keras.rst b/docs/source/keras.rst index eb4f2d7fa7..f1c0743c70 100644 --- a/docs/source/keras.rst +++ b/docs/source/keras.rst @@ -1,6 +1,7 @@ -************* -Keras Support -************* +:tocdepth: 1 +**************** +Keras Interface +**************** FlexFlow provides a drop-in replacement for TensorFlow Keras. Running an existing Keras program on the FlexFlow backend only requires a few lines of changes to the program. The detailed instructions are as follows: diff --git a/docs/source/mt5.rst b/docs/source/mt5.rst index c9c3af080a..8a632b90d6 100644 --- a/docs/source/mt5.rst +++ b/docs/source/mt5.rst @@ -1,6 +1,6 @@ -**************** -HuggingFace mT5 -**************** +************************ +mT5 Model +************************ .. mdinclude:: ../../examples/python/pytorch/mt5/README.md :start-line: 2 diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index 91b314ac96..b6bc49b146 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -1,3 +1,4 @@ +:tocdepth: 1 ************* ONNX Support ************* diff --git a/docs/source/pytorch.rst b/docs/source/pytorch.rst index a6d4e23311..3dbe337d55 100644 --- a/docs/source/pytorch.rst +++ b/docs/source/pytorch.rst @@ -1,6 +1,7 @@ -*************** -PyTorch Support -*************** +:tocdepth: 1 +****************** +PyTorch Interface +****************** Users can use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps. The PyTorch support requires the `PyTorch FX module `_, so make sure your PyTorch is up to date. 
diff --git a/docs/source/serve_overview.rst b/docs/source/serve_overview.rst new file mode 100644 index 0000000000..35c992a853 --- /dev/null +++ b/docs/source/serve_overview.rst @@ -0,0 +1,7 @@ +:tocdepth: 1 +************* +Serving Overview +************* + +.. mdinclude:: ../../SERVE.md + :start-line: 3 diff --git a/docs/source/train_examples.rst b/docs/source/train_examples.rst new file mode 100644 index 0000000000..84d58c3465 --- /dev/null +++ b/docs/source/train_examples.rst @@ -0,0 +1,6 @@ +************* +Training Examples +************* + +.. toctree:: + mt5 \ No newline at end of file diff --git a/docs/source/train_interface.rst b/docs/source/train_interface.rst new file mode 100644 index 0000000000..ce81fc1f3c --- /dev/null +++ b/docs/source/train_interface.rst @@ -0,0 +1,8 @@ +******************* +Training Interface +******************* + +.. toctree:: + keras + pytorch + onnx \ No newline at end of file diff --git a/docs/source/train_overview.rst b/docs/source/train_overview.rst new file mode 100644 index 0000000000..58898ad35c --- /dev/null +++ b/docs/source/train_overview.rst @@ -0,0 +1,7 @@ +:tocdepth: 1 +************* +Training Overview +************* + +.. mdinclude:: ../../TRAIN.md + :start-line: 3 diff --git a/docs/source/train_python_api.rst b/docs/source/train_python_api.rst new file mode 100644 index 0000000000..40451dedf9 --- /dev/null +++ b/docs/source/train_python_api.rst @@ -0,0 +1,11 @@ +******************* +Python API +******************* +This section documents the Python API for FlexFlow Train. + +.. toctree:: + :maxdepth: 3 + + python/models + python/layers + python/dataloader \ No newline at end of file diff --git a/docs/source/welcome.rst b/docs/source/welcome.rst index 8108b1dd67..7f73f15563 100644 --- a/docs/source/welcome.rst +++ b/docs/source/welcome.rst @@ -1,3 +1,4 @@ +:tocdepth: 1 ************* Overview ************* diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 0ef7f6cbac..1b2fc7bbfc 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -20,11 +20,33 @@ enum class MappingRecordType { INPUT_OUTPUT, INPUT_WEIGHT }; enum class MappingOperation { PARTITION, REPLICATE }; +/** @brief A class to keep track of a dimension relation between two tensors + * used by an operator. + * + * Dimension relations are one-to-one mappings between the dimensions of the + * input, weights, and output tensors of an operator. Introduced in the Unity + * paper, dimension relations allow FlexFlow to keep track of an operator's + * parallelization plans as part of the Parallel Computation Graph (PCG). + * + * Each ParallelDimMappingRecord only keeps track of a single dimension + * relation. + * + * ParallelDimMappingRecord objects must be initialized with a + * MappingRecordType, which can be INPUT_OUTPUT, if the ParallelDimMappingRecord + * is tracking a dimension relation between the input and the output tensor, or + * INPUT_WEIGHT, if the ParallelDimMappingRecord is tracking a dimension + * relation between the input tensor and the weights tensor. + * + */ class ParallelDimMappingRecord { private: ParallelDimMappingRecord(MappingRecordType); public: + /** + * @brief We disable this constructor because ParallelDimMappingRecord objects + * must specify the MappingRecordType upon creation. 
+ */ ParallelDimMappingRecord() = delete; static ParallelDimMappingRecord input_output_record( diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 21436d24d6..9e353fb374 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -1317,6 +1317,11 @@ bool LinearParams::is_valid(ParallelTensorShape const &input_shape) const { return is_valid; } +/** @brief A wrapper around the main version of the solve_dims function. + * + * It takes the input tensor as a parameter, instead of the input's + * ParallelTensorShape. + */ void LinearParams::solve_dims(const ParallelTensor input, ParallelDim output_dims[MAX_TENSOR_DIM], int *output_ndims, @@ -1333,6 +1338,13 @@ void LinearParams::solve_dims(const ParallelTensor input, bias_ndims); } +/** @brief A wrapper around the main version of the solve_dims function. + * + * For each of the output, weights, and bias tensors, it takes a + * ParallelTensorShape argument, instead of a pointer to an integer variable to + * record the number of dimensions, plus a ParallelDim array to record all the + * information regarding each dimension. + */ void LinearParams::solve_dims(ParallelTensorShape const &input_shape, ParallelTensorShape &output_shape, ParallelTensorShape &kernel_shape, @@ -1359,11 +1371,14 @@ void LinearParams::solve_dims(ParallelTensorShape const &input_shape, std::vector mapping; this->construct_mappings(mapping, input_shape); + // sets the is_replica_dim field to true for the dimensions that are used to + // record the number of replicas this->mark_replica_dims(input_shape, output_dims, kernel_dims, bias_dims); solve_parallel_dim_mappings( mapping, {input_shape.dims}, {kernel_dims, bias_dims}, {output_dims}); + // sets the dimension sizes of the output, weights, and bias tensors this->calculate_nonreplica_dim_sizes(input_shape, output_dims, output_ndims, @@ -1373,6 +1388,34 @@ void LinearParams::solve_dims(ParallelTensorShape const &input_shape, bias_ndims); } +/** @brief Create a map between each of a tensor's dimension names and its + * corresponding index + * + * The tensor dimension names are defined as follows. For the input tensor, the + * first dimension is called INPUT_CHANNEL, and generally corresponds to the number + * of floats needed to store a single element from the input dataset. For + * example, when each element in the dataset is a flattened MNIST image, the + * INPUT_CHANNEL dimension will have a size of 28x28=784. The second to last and + * last dimensions in the input tensor are, respectively, the INPUT_SAMPLE and + * INPUT_REPLICA dimensions. The size of the INPUT_SAMPLE dimension generally + * corresponds to the batch size used for training. The size of the + * INPUT_REPLICA dimension tells us how many replicas of the tensors have been created. + * The dimensions of the output tensor are named analogously: the first + * dimension is OUTPUT_CHANNEL, the second to last is OUTPUT_SAMPLE, and the + * last one is OUTPUT_REPLICA. Both the input and output tensor may have + * additional dimensions, without a name, between {INPUT,OUTPUT}_CHANNEL and + * {INPUT,OUTPUT}_SAMPLE. For instance, when the input data comes in textual + * form, it is common to have an additional dimension representing the sequence + * length.
When it comes to the weights, the dimensions are named simply as + * KERNEL_CHANNEL_IN (first dimension of a weight's tensor), KERNEL_CHANNEL_OUT + * (second dimension) and BIAS_CHANNEL_OUT (first dimension of the bias tensor) + * + * @param[in] input_shape A ParallelTensorShape object representing the shape + * of the ParallelTensor used for the input to the operator + * @return dimension_names A map from each LinearParams::NamedDimensions to the + * index corresponding to that dimension in the input, weight, (bias), or output + * tensor. + */ std::unordered_map LinearParams::get_dimension_names( ParallelTensorShape const &input_shape) const { @@ -1389,6 +1432,43 @@ std::unordered_map {BIAS_CHANNEL_OUT, 0}}; } +/** @brief Sets the size field of ParallelDim objects passed as arguments to + * the expected (non-replica) dimensions of the output, weights, and bias + * tensors. In addition, it sets the output_ndims, kernel_ndims and bias_ndims + * variables to the number of dimensions (including the replica dimensions) of, + * respectively, the output, weights, and bias tensors. + * + * The number of dimensions, and dimension sizes of the output, weights, and + * bias dimensions are set as follows. The number of dimensions of all three + * tensors is copied from the dimensions of the input tensor. The replica + * dimensions are not subtracted or otherwise excluded. The sizes of the output + * tensor dimensions are also copied from the input tensor, with the exception + * of the last dimension (replica dimension), which is not set, and the first + * dimension, whose size is set equal to the out_channels member of the + * LinearParams struct, which in turn is set by the outDim parameter of the + * FFModel::dense function. When it comes to the size of the weights dimensions, + * the first dimension is set to have size equal to the quotient of the size of + * the INPUT_CHANNEL dimension of the input (first dimension) and the degree + * (number of partitions) of the same input dimension. The second dimension of + * the weights tensor is set equal to out_channels, just like the first + * dimension of the output tensor. Finally, the size of the first dimension of + * the bias tensor is also set equal to the value of out_channels.
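+ *
+ * For example (an illustrative sketch, not taken from the source): for an input whose
+ * INPUT_CHANNEL dimension has size 784 and partition degree 2, whose INPUT_SAMPLE
+ * dimension has size 64, and with out_channels set to 512, the weights dimensions
+ * would be sized (784/2=392, 512), the first bias dimension would be sized 512, and
+ * the output dimensions would be sized (512, 64), with the replica dimension left
+ * unset.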
+ * + * @param[in] input_shape A required argument recording the dimensions of + * the input tensor + * @param[out] output_dims An array of ParallelDim objects representing the + * dimensions of the output tensor + * @param[out] output_ndims The number of dimensions (including the replica + * dimension(s)) of the output tensor + * @param[out] kernel_dims An array of ParallelDim objects representing the + * dimensions of the weights tensor + * @param[out] kernel_ndims The number of dimensions (including the replica + * dimension(s)) of the weights tensor + * @param[out] bias_dims An array of ParallelDim objects representing the + * dimensions of the bias tensor + * @param[out] bias_ndims The number of dimensions (including the replica + * dimension(s)) of the bias tensor + */ void LinearParams::calculate_nonreplica_dim_sizes( ParallelTensorShape const &input_shape, ParallelDim output_dims[MAX_TENSOR_DIM], @@ -1421,6 +1501,20 @@ void LinearParams::calculate_nonreplica_dim_sizes( } } +/** @brief Switch the is_replica_dim field to true in each ParallelDim of + * the output, weight and bias tensor, if the corresponding dimension + * is used to keep track of the number of replicas + * + * @param[in] input_shape A required argument recording the dimensions of + * the input tensor + * @param[out] output_dims An array of ParallelDim objects representing the + * dimensions of the output tensor + * @param[out] kernel_dims An array of ParallelDim objects representing the + * dimensions of the weights tensor + * @param[out] bias_dims An array of ParallelDim objects representing the + * dimensions of the bias tensor + * + */ void LinearParams::mark_replica_dims( ParallelTensorShape const &input_shape, ParallelDim output_dims[MAX_TENSOR_DIM], diff --git a/src/runtime/model.cc b/src/runtime/model.cc index b54a58448e..43b5df1f39 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -602,11 +602,35 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, } #endif +/** + * @brief The ParallelDimMappingRecord class's constructor. It sets the object's + * type field equal to the value passed as the constructor's argument, and + * initializes all other fields to -1. + * + * @param[in] type The MappingRecordType to use to initialize the + * ParallelDimMappingRecord. + */ ParallelDimMappingRecord::ParallelDimMappingRecord(MappingRecordType type) : type(type), output_dim(-1), input_dim(-1), weight_dim(-1), output_idx(-1), input_idx(-1), weight_idx(-1) {} /*static*/ +/** + * @brief Builds and initializes a ParallelDimMappingRecord object of + * INPUT_OUTPUT MappingRecordType. + * + * This function should be used to create a ParallelDimMappingRecord to track an + * operator's dimension relation between the input and the output tensor + * + * @param[in] input_idx The index of the input tensor (nonzero if there are + * multiple inputs) + * @param[in] input_dim The index of the input dimension part of the + * dimension relation + * @param[in] output_idx The index of the output tensor (nonzero if there are + * multiple outputs) + * @param[in] output_dim The index of the output dimension part of the + * dimension relation + */ ParallelDimMappingRecord ParallelDimMappingRecord::input_output_record( int input_idx, int input_dim, @@ -630,6 +654,22 @@ ParallelDimMappingRecord ParallelDimMappingRecord::input_output_record( } /*static*/ +/** + * @brief Builds and initializes a ParallelDimMappingRecord object of + * INPUT_WEIGHT MappingRecordType. 
+ * + * This function should be used to create a ParallelDimMappingRecord to track an + * operator's dimension relation between the input and the weights tensor + * + * @param[in] input_idx The index of the input tensor (nonzero if there are + * multiple inputs) + * @param[in] input_dim The index of the input dimension part of the + * dimension relation + * @param[in] weight_idx The index of the weight tensor (nonzero if there are + * multiple weights) + * @param[in] weight_dim The index of the weight dimension part of the + * dimension relation + */ ParallelDimMappingRecord ParallelDimMappingRecord::input_weight_record( int input_idx, int input_dim, @@ -630,6 +654,22 @@ ParallelDimMappingRecord ParallelDimMappingRecord::input_output_record( } /*static*/ +/** @brief A wrapper around the main version of the + * construct_weight_parallel_dims function. + * + * This wrapper allows you to append multiple dimension relations at once to a + * vector of ParallelDimMappingRecord entries. The relations must be between + * dimensions of the same pair of input and weight tensors. Unlike the other + * construct_weight_parallel_dims wrapper below, this function allows you to + * specify the MappingOperation for each pair of dimensions for which you will + * be creating a new ParallelDimMappingRecord. + * + * The function takes a vector of (int, MappingOperation, int) tuples, where the + * int members represent the indexes of the two dimensions in a relation, and + * the MappingOperation member specifies the type of mapping operation. Just + * like the other wrapper, this function simply calls the main version of + * construct_weight_parallel_dims for each pair, using the same values across + * all calls for all other parameters. + * + * This function should NOT be used to track dimension relations between the + * input and output tensors; construct_output_parallel_dims should be used + * instead. + * + * @param[out] records The (potentially empty) vector of existing + * ParallelDimMappingRecord entries + * @param[in] mappings A vector of tuples, each including a pair of + * integers (representing the indexes of the input and weight dimensions in a + * relation), and a MappingOperation, specifying the mapping operation for the + * pair of dimensions. + * @param[in] input_idx The index of the input tensor (nonzero if there are + * multiple inputs) + * @param[in] weight_idx The index of the weight tensor (nonzero if there are + * multiple weights) + * + */ void Op::construct_weight_parallel_dims( std::vector &records, std::vector> mappings, @@ -673,6 +746,30 @@ void Op::construct_weight_parallel_dims( } /*static*/ +/** @brief A wrapper around the main version of the + * construct_weight_parallel_dims function. + * + * This wrapper allows you to append multiple dimension relations at once to a + * vector of ParallelDimMappingRecord entries. The relations must be between + * dimensions of the same pair of input and weight tensors. The function takes a + * vector of (input, weight) dimension index pairs and simply calls the main + * version of construct_weight_parallel_dims for each such pair, using the same + * values across all calls for all other parameters. + * + * This function should NOT be used to track dimension relations between the + * input and output tensors; construct_output_parallel_dims should be used + * instead.
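+ *
+ * For example (an illustrative sketch, not taken from the source): a call such as
+ * construct_weight_parallel_dims(records, {{0, 0}, {2, 1}}, 0, 0) would append two
+ * INPUT_WEIGHT records to records, one relating input dimension 0 to weight
+ * dimension 0 and one relating input dimension 2 to weight dimension 1, both
+ * referring to input tensor 0 and weight tensor 0.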
+ * + * @param[out] records The (potentially empty) vector of existing + * ParallelDimMappingRecord entries + * @param[in] mappings A vector of integer pairs, each representing the + * indexes of the input and weight dimensions in a relation. + * @param[in] input_idx The index of the input tensor (nonzero if there are + * multiple inputs) + * @param[in] weight_idx The index of the weight tensor (nonzero if there are + * multiple weights) + * + */ void Op::construct_weight_parallel_dims( std::vector &records, std::vector> mappings, @@ -685,6 +782,30 @@ void Op::construct_weight_parallel_dims( } /*static*/ +/** + * @brief Creates a new ParallelDimMappingRecord (of the INPUT_WEIGHT + * MappingRecordType flavor) and appends it to an existing vector of + * ParallelDimMappingRecord entries. + * + * This function creates a new ParallelDimMappingRecord to track a dimension + * relation between a dimension from the input tensor and a dimension from the + * weight tensor. This function should NOT be used to track dimension relations + * between the input and output tensors; construct_output_parallel_dims should + * be used instead. + * + * @param[out] records The (potentially empty) vector of existing + * ParallelDimMappingRecord entries + * @param[in] input_dim The index of the input dimension part of the + * dimension relation + * @param[in] weight_dim The index of the weight dimension part of the + * dimension relation + * @param[in] input_idx The index of the input tensor (nonzero if there are + * multiple inputs) + * @param[in] weight_idx The index of the weight tensor (nonzero if there are + * multiple weights) + * @param[in] operation The parallelization operation (partition or + * replication) associated with the dimension relation + */ void Op::construct_weight_parallel_dims( std::vector &records, int input_dim, @@ -696,12 +817,20 @@ void Op::construct_weight_parallel_dims( input_idx, input_dim, weight_idx, weight_dim, operation)); } +/** @brief Calls the corresponding version of construct_weight_parallel_dims, + * and passes the Op class's parallel_dims_mapping vector, so that the resulting + * ParallelDimMappingRecord are appended to it + */ void Op::register_weight_parallel_dims( std::vector> mappings, int input_idx, int weight_idx) { Op::construct_weight_parallel_dims( *this->parallel_dims_mapping, mappings, input_idx, weight_idx); } +/** @brief Calls the corresponding version of construct_weight_parallel_dims, + * and passes the Op class's parallel_dims_mapping vector, so that the resulting + * ParallelDimMappingRecord are appended to it + */ void Op::register_weight_parallel_dims( std::vector> mappings, int input_idx, @@ -710,6 +839,10 @@ void Op::register_weight_parallel_dims( *this->parallel_dims_mapping, mappings, input_idx, weight_idx); } +/** @brief Calls the corresponding version of construct_weight_parallel_dims, + * and passes the Op class's parallel_dims_mapping vector, so that the resulting + * ParallelDimMappingRecord are appended to it + */ void Op::register_weight_parallel_dims( int input_dim, int weight_dim, @@ -725,6 +858,39 @@ void Op::register_weight_parallel_dims( } /*static*/ +/** @brief A wrapper around the main version of the + * construct_output_parallel_dims function. + * + * This wrapper allows you to append multiple dimension relations at once to a + * vector of ParallelDimMappingRecord entries. The relations must be between + * dimensions of the same pair of input and output tensors. 
Unlike the other + * construct_output_parallel_dims wrapper below, this function allows you to + * specify the MappingOperation for each pair of dimensions for which you will + * be creating a new ParallelDimMappingRecord. + * + * The function takes a vector of (int, MappingOperation, int) tuples, where the + * int members represent the indexes of the two dimensions in a relation, and + * the MappingOperation member specifies the type of mapping operation. Just + * like the other wrapper, this function simply calls the main version of + * construct_output_parallel_dims for each pair, using the same values across + * all calls for all other parameters. + * + * This function should NOT be used to track dimension relations between the + * input and weights tensors; construct_weight_parallel_dims should be used + * instead. + * + * @param[out] records The (potentially empty) vector of existing + * ParallelDimMappingRecord entries + * @param[in] mappings A vector of tuples, each including a pair of + * integers (representing the indexes of the input and output dimensions in a + * relation), and a MappingOperation, specifying the mapping operation for the + * pair of dimensions. + * @param[in] input_idx The index of the input tensor (nonzero if there are + * multiple inputs) + * @param[in] output_idx The index of the output tensor (nonzero if there are + * multiple outputs) + * + */ void Op::construct_output_parallel_dims( std::vector &records, std::vector> mappings, @@ -741,6 +907,30 @@ void Op::construct_output_parallel_dims( } /*static*/ +/** @brief A wrapper around the main version of the + * construct_output_parallel_dims function. + * + * This wrapper allows you to append multiple dimension relations at once to a + * vector of ParallelDimMappingRecord entries. The relations must be between + * dimensions of the same pair of input and output tensors. The function takes a + * vector of (input, output) dimension index pairs and simply calls the main + * version of construct_output_parallel_dims for each such pair, using the same + * values across all calls for all other parameters. + * + * This function should NOT be used to track dimension relations between the + * input and weights tensors; construct_weight_parallel_dims should be used + * instead. + * + * @param[out] records The (potentially empty) vector of existing + * ParallelDimMappingRecord entries + * @param[in] mappings A vector of integer pairs, each representing the + * indexes of the input and output dimensions in a relation. + * @param[in] input_idx The index of the input tensor (nonzero if there are + * multiple inputs) + * @param[in] output_idx The index of the output tensor (nonzero if there are + * multiple outputs) + * + */ void Op::construct_output_parallel_dims( std::vector &records, std::vector> mappings, @@ -753,6 +943,30 @@ void Op::construct_output_parallel_dims( } /*static*/ +/** + * @brief Creates a new ParallelDimMappingRecord (of the INPUT_OUTPUT + * MappingRecordType flavor) and appends it to an existing vector of + * ParallelDimMappingRecord entries. + * + * This function creates a new ParallelDimMappingRecord to track a dimension + * relation between a dimension from the input tensor and a dimension from the + * output tensor. This function should NOT be used to track dimension relations + * between the input and weights tensors; construct_weight_parallel_dims should + * be used instead. 
+ * + * @param[out] records The (potentially empty) vector of existing + * ParallelDimMappingRecord entries + * @param[in] input_dim The index of the input dimension part of the + * dimension relation + * @param[in] output_dim The index of the output dimension part of the + * dimension relation + * @param[in] input_idx The index of the input tensor (nonzero if there are + * multiple inputs) + * @param[in] output_idx The index of the output tensor (nonzero if there are + * multiple outputs) + * @param[in] operation The parallelization operation (partition or + * replication) associated with the dimension relation + */ void Op::construct_output_parallel_dims( std::vector &records, int input_dim, @@ -764,12 +978,20 @@ void Op::construct_output_parallel_dims( input_idx, input_dim, output_idx, output_dim, operation)); } +/** @brief Calls the corresponding version of construct_output_parallel_dims, + * and passes the Op class's parallel_dims_mapping vector, so that the resulting + * ParallelDimMappingRecord are appended to it + */ void Op::register_output_parallel_dims( std::vector> mappings, int input_idx, int output_idx) { Op::construct_output_parallel_dims( *this->parallel_dims_mapping, mappings, input_idx, output_idx); } +/** @brief Calls the corresponding version of construct_output_parallel_dims, + * and passes the Op class's parallel_dims_mapping vector, so that the resulting + * ParallelDimMappingRecord are appended to it + */ void Op::register_output_parallel_dims( std::vector> mappings, int input_idx, @@ -778,6 +1000,10 @@ void Op::register_output_parallel_dims( *this->parallel_dims_mapping, mappings, input_idx, output_idx); } +/** @brief Calls the corresponding version of construct_output_parallel_dims, + * and passes the Op class's parallel_dims_mapping vector, so that the resulting + * ParallelDimMappingRecord are appended to it + */ void Op::register_output_parallel_dims( int input_dim, int output_dim, @@ -6334,12 +6560,12 @@ void register_flexflow_internal_tasks(Runtime *runtime, #endif // Search { - TaskVariantRegistrar registrar(STRATEGY_SEARCH_TASK_ID, "Stretegy Search"); + TaskVariantRegistrar registrar(STRATEGY_SEARCH_TASK_ID, "Strategy Search"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "Stretegy Search Task"); + registrar, "Strategy Search Task"); } else { if (enable_control_replication) { registrar.global_registration = false; From 88f70e3946bdd518fb632fb57de0a259eccbb2d2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 17 Aug 2023 15:23:54 -0400 Subject: [PATCH 202/344] Fix conda in CI (#974) * maybe fix * fix --- conda/flexflow.yml | 6 +++--- requirements.txt | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 57101ac4ff..9ff7f3957a 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -15,9 +15,9 @@ dependencies: - qualname>=0.1.0 - keras_preprocessing>=1.1.2 - numpy>=1.16.0 - - torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu - - torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cpu - - torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cpu + - torch>=1.13.1 --index-url https://download.pytorch.org/whl/cpu + - torchaudio>=0.13.1 --index-url https://download.pytorch.org/whl/cpu + - torchvision>=0.14.1 --index-url https://download.pytorch.org/whl/cpu - regex - onnx - transformers>=4.31.0 diff --git a/requirements.txt b/requirements.txt 
index 2e8d4ad962..c6337dccfa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,9 +10,9 @@ requests qualname regex requests -torch==2.0.1 -torchaudio==2.0.2 -torchvision==0.15.2 +torch>=1.13.1 +torchaudio>=0.13.1 +torchvision>=0.14.1 onnx transformers>=4.31.0 sentencepiece From 97c62b1b92ffd87bae82e55375f71950c2fb9f40 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Thu, 17 Aug 2023 17:53:57 -0400 Subject: [PATCH 203/344] change ff.init interface to accept parameters (#973) * change init interface * fix. * update method signature * update defaults * update * fix * update docs --------- Co-authored-by: Gabriele Oliaro --- python/flexflow/serve/__init__.py | 171 +++++++++++++++++++++--------- 1 file changed, 121 insertions(+), 50 deletions(-) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 315a1e4317..19a2774a54 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json, sys, os -from typing import Union +import sys, os +from typing import Union, Optional from ..type import * @@ -33,17 +33,36 @@ def _parse_positive_int_config(name: str, variable: str, ff_cli_name: str = None sys.argv += [f"{ff_cli_name}", str(variable)] -def init(configs: Union[str, dict]): - """Configure FlexFlow for inference and start the FlexFlow runtime by importing the flexflow.core package. - - The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments. - - The init function takes three mandatory parameters, which cannot be changed after starting the runtime. These are: +def init(configs_dict: Optional[dict] = None, + *, + num_gpus: Optional[int] = None, + memory_per_gpu: Optional[int] = None, + zero_copy_memory_per_node: Optional[int] = None, + num_cpus: Optional[int] = None, + legion_utility_processors: Optional[int] = None, + data_parallelism_degree: Optional[int] = None, + tensor_parallelism_degree: Optional[int] = None, + pipeline_parallelism_degree: Optional[int] = None, + offload: Optional[bool] = None, + offload_reserve_space_size: Optional[int] = None, + use_4bit_quantization: Optional[bool] = None, + use_8bit_quantization: Optional[bool] = None, + profiling: Optional[bool] = None, + fusion: Optional[bool] = None): + """ + Configure FlexFlow Serve and start the runtime. + + The function takes, alternatively, configs_dict (a positional argument of type dictionary), + or three mandatory named parameters, plus some additional optional named parameters. When passing + a configs_dict, no named parameter should be specified, and the dictionary should have keys matching + at least the mandatory named parameters. 
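+
+    For example (an illustrative sketch, assuming the package has been imported as ``import flexflow.serve as ff``; the sizes are placeholders), the two calling conventions look like this:
+
+    .. code-block:: python
+
+        # Passing the mandatory configs as named parameters
+        ff.init(num_gpus=4, memory_per_gpu=14000, zero_copy_memory_per_node=30000)
+
+        # Passing a single configs_dict instead (no other named parameters allowed)
+        ff.init({"num_gpus": 4, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 30000})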
+ + The three mandatory parameters, which cannot be changed after starting the runtime, are: - num_gpus: the number of GPUs to reserve for the runtime - memory_per_gpu: the amount of memory (in MB) to pre-allocate on each GPU - zero_copy_memory_per_node: the amount of zero-copy memory (in MB) to pre-allocate for each node - - In addition, the following optional parameters can be passed: + + The optional parameters are: - num_cpus: the number of CPU processors to reserve for the runtime, defaults to 4 - legion_utility_processors: number of Legion utility threads to create per process, defaults to 1 - data_parallelism_degree: the degree of parallelization in the data parallel dimension, defaults to 1 @@ -55,38 +74,104 @@ def init(configs: Union[str, dict]): - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - profiling: whether to enable the FlexFlow profiling mode, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True + + The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments. - :param configs: The runtime configs, in the form of a dictionary or the path to a JSON file - :type configs: Union[str, dict] - :raises ValueError: This function will raise an exception if the JSON file pointed to by the input string is not in the right format - :raises ValueError: This function will raise an exception if the mandatory FlexFlow initialization parameters are missing, or are not positive integers: num_gpus, memory_per_gpu, zero_copy_memory_per_node + + :param configs_dict: A Python dictionary to pass all configurations as a single object + :type configs_dict: dict + :param num_gpus: the number of GPUs to reserve for the runtime + :type num_gpus: int + :param memory_per_gpu: memory_per_gpu: the amount of memory (in MB) to pre-allocate on each GPU + :type memory_per_gpu: int + :param zero_copy_memory_per_node: zero_copy_memory_per_node: the amount of zero-copy memory (in MB) to pre-allocate for each node + :type zero_copy_memory_per_node: int + :param num_cpus: the number of CPU processors to reserve for the runtime, defaults to 4 + :type num_cpus: Optional[int], optional + :param legion_utility_processors: number of Legion utility threads to create per process, defaults to 1 + :type legion_utility_processors: Optional[int], optional + :param data_parallelism_degree: the degree of parallelization in the data parallel dimension, defaults to 1 + :type data_parallelism_degree: Optional[int], optional + :param tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1 + :type tensor_parallelism_degree: Optional[int], optional + :param pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1 + :type pipeline_parallelism_degree: Optional[int], optional + :param offload: whether to enable offloading of the weights to CPU, defaults to False + :type offload: Optional[bool], optional + :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + :type offload_reserve_space_size: Optional[int], optional + :param use_4bit_quantization: whether to use 4-bit quantization, defaults to False + :type use_4bit_quantization: Optional[bool], optional + :param use_8bit_quantization: whether to use 8-bit quantization, defaults to False + :type use_8bit_quantization: Optional[bool], optional + :param profiling: whether to 
enable the FlexFlow profiling mode, defaults to False + :type profiling: Optional[bool], optional + :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True + :type fusion: Optional[bool], optional + + :raises ValueError: this function will raise an exception if the user passes both a configs_dict and some named parameters + :raises TypeError: this function will raise an exception if the configs_dict is not a dictionary + :raises ValueError: this function will raise an exception if the mandatory FlexFlow initialization parameters are missing, or are not positive integers: num_gpus, memory_per_gpu, zero_copy_memory_per_node """ - configs_dict = {} - if type(configs) == str: - try: - with open(configs) as f: - configs_dict = json.load(f) - except json.JSONDecodeError as e: - print("JSON format error:") - print(e) - elif type(configs) == dict: - configs_dict = configs + + # Check that either configs_dict or any of individual, non-positional arguments (after the *) is passed, but not both + if configs_dict is not None and any([ + num_gpus is not None, + memory_per_gpu is not None, + zero_copy_memory_per_node is not None, + num_cpus is not None, + legion_utility_processors is not None, + data_parallelism_degree is not None, + tensor_parallelism_degree is not None, + pipeline_parallelism_degree is not None, + offload is not None, + offload_reserve_space_size is not None, + use_4bit_quantization is not None, + use_8bit_quantization is not None, + profiling is not None, + fusion is not None, + ]): + raise ValueError("Cannot pass both configs_dict and individual args") + + if configs_dict is not None: + # If configs_dict is passed, check that the type is dictionary and that the mandatory key-value pairs are present (num_gpus, memory_per_gpu, zero_copy_memory_per_node) + if type(configs_dict) != dict: + raise TypeError("configs_dict is not a dictionary") + # configs should contain the following mandatory keys with non-zero integer values: + num_gpus = configs_dict.get("num_gpus") + memory_per_gpu = configs_dict.get("memory_per_gpu") + zero_copy_memory_per_node = configs_dict.get("zero_copy_memory_per_node") + if not num_gpus or not memory_per_gpu or not zero_copy_memory_per_node: + raise ValueError( + "Missing one of the following configs in config dict: num_gpus, memory_per_gpu, zero_copy_memory_per_node" + ) + num_cpus = configs_dict.get("num_cpus") + legion_utility_processors = configs_dict.get("legion_utility_processors", 8) + data_parallelism_degree = configs_dict.get("data_parallelism_degree") + tensor_parallelism_degree = configs_dict.get("tensor_parallelism_degree") + pipeline_parallelism_degree = configs_dict.get("pipeline_parallelism_degree") + offload = configs_dict.get("offload", False) + offload_reserve_space_size = configs_dict.get("offload_reserve_space_size") + use_4bit_quantization = configs_dict.get("use_4bit_quantization", False) + use_8bit_quantization = configs_dict.get("use_8bit_quantization", False) + profiling = configs_dict.get("profiling", False) + fusion = configs_dict.get("fusion", True) else: - raise ValueError( - "configs should be a dictionary or the path to a valid JSON file" + # If configs_dict is not passed, check that the mandatory parameters are passed directly as arguments + if not num_gpus or not memory_per_gpu or not zero_copy_memory_per_node: + raise ValueError( + "Missing one of the following configs in input params: num_gpus, memory_per_gpu, zero_copy_memory_per_node" ) - + offload = False if offload is None else offload + 
use_4bit_quantization = False if use_4bit_quantization is None else use_4bit_quantization + use_8bit_quantization = False if use_8bit_quantization is None else use_8bit_quantization + profiling = False if profiling is None else profiling + fusion = True if fusion is None else fusion + # Remove the arguments to avoid interferences sys.argv = [sys.argv[0]] - - # configs should contain the following mandatory keys with non-zero integer values: - num_gpus = configs_dict.get("num_gpus") - memory_per_gpu = configs_dict.get("memory_per_gpu") - zero_copy_memory_per_node = configs_dict.get("zero_copy_memory_per_node") - if not num_gpus or not memory_per_gpu or not zero_copy_memory_per_node: - raise ValueError( - "Missing one of the following configs: num_gpus, memory_per_gpu, zero_copy_memory_per_node" - ) + + # parse arguments _parse_positive_int_config("num_gpus", num_gpus, "-ll:gpu") _parse_positive_int_config("memory_per_gpu", memory_per_gpu, "-ll:fsize") _parse_positive_int_config( @@ -94,16 +179,10 @@ def init(configs: Union[str, dict]): ) # parse optional arguments - num_cpus = configs_dict.get("num_cpus") _parse_positive_int_config("num_cpus", num_cpus, "-ll:cpu") - legion_utility_processors = configs_dict.get("legion_utility_processors") _parse_positive_int_config( "legion_utility_processors", legion_utility_processors, "-ll:util" ) - - data_parallelism_degree = configs_dict.get("data_parallelism_degree") - tensor_parallelism_degree = configs_dict.get("tensor_parallelism_degree") - pipeline_parallelism_degree = configs_dict.get("pipeline_parallelism_degree") _parse_positive_int_config( "data_parallelism_degree", data_parallelism_degree, "-data-parallelism-degree" ) @@ -117,27 +196,19 @@ def init(configs: Union[str, dict]): pipeline_parallelism_degree, "-pipeline-parallelism-degree", ) - - offload = configs_dict.get("offload", False) if offload: sys.argv += ["-offload"] - offload_reserve_space_size = configs_dict.get("offload_reserve_space_size") _parse_positive_int_config( "offload_reserve_space_size", offload_reserve_space_size, "-offload-reserve-space-size", ) - use_4bit_quantization = configs_dict.get("use_4bit_quantization", False) if use_4bit_quantization: sys.argv += ["--4bit-quantization"] - use_8bit_quantization = configs_dict.get("use_8bit_quantization", False) if use_8bit_quantization: sys.argv += ["--8bit-quantization"] - - profiling = configs_dict.get("profiling", False) if profiling: sys.argv += ["--profiling"] - fusion = configs_dict.get("fusion", True) if fusion: sys.argv += ["--fusion"] From d2a0629c6ffdd946baa2b5b625588b51ea6fc20c Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Thu, 17 Aug 2023 18:21:06 -0400 Subject: [PATCH 204/344] Update README.md (#975) --- .github/README.md | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/.github/README.md b/.github/README.md index 99e0f82b62..b8039463a5 100644 --- a/.github/README.md +++ b/.github/README.md @@ -60,14 +60,12 @@ The following example shows how to deploy an LLM using FlexFlow Serve and accele import flexflow.serve as ff ff.init( - { - "num_gpus": 4, - "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, - "tensor_parallelism_degree": 4, - "pipeline_parallelism_degree": 1, - } -) + num_gpus=4, + memory_per_gpu=14000, + zero_copy_memory_per_node=30000, + tensor_parallelism_degree=4, + pipeline_parallelism_degree=1 + ) ``` Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. 
The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). ```python @@ -108,14 +106,12 @@ import flexflow.serve as ff # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs ff.init( - { - "num_gpus": 4, - "memory_per_gpu": 14000, - "zero_copy_memory_per_gpu": 30000, - "tensor_parallelism_degree": 4, - "pipeline_parallelism_degree": 1, - } -) + num_gpus=4, + memory_per_gpu=14000, + zero_copy_memory_per_node=30000, + tensor_parallelism_degree=4, + pipeline_parallelism_degree=1 + ) # Create the FlexFlow LLM llm = ff.LLM("decapoda-research/llama-7b-hf") From 66570c51e29e83194c349c986f60e6c98e49d119 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 17 Aug 2023 18:54:57 -0700 Subject: [PATCH 205/344] Update README.md --- .github/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/README.md b/.github/README.md index b8039463a5..56434f6bf9 100644 --- a/.github/README.md +++ b/.github/README.md @@ -55,7 +55,8 @@ To download a Docker container for a backend other than CUDA v11.8, you can repl You can install FlexFlow Serve from source code by building the inference branch of FlexFlow. Please follow these [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). ## Quickstart -The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving. +The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively. +We need to make sure the aggregated GPU memory and zero-copy memory are **both** sufficient to store LLM parameters in non-offloading serving. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving. 
```python import flexflow.serve as ff From 18946bac9549b1ddd1e9db32042d30aa5d3e25d2 Mon Sep 17 00:00:00 2001 From: Brian Yu <138826504+brianyu-nexusflowai@users.noreply.github.com> Date: Sat, 19 Aug 2023 08:13:26 -0700 Subject: [PATCH 206/344] adding f for fstring (#990) --- python/flexflow/serve/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index ad4e8f594b..3c26f7ab18 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -99,7 +99,7 @@ def __get_ff_model_type(self): ff_arch = self.supported_models.get(architectures[0]) if ff_arch is None: print( - "Huggingface model of type {architectures} is not yet supported by FlexFlow" + f"Huggingface model of type {architectures} is not yet supported by FlexFlow" ) sys.exit(1) return ff_arch From 68a5a541051cba114a6d8fa155842987c90ab9ae Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 19 Aug 2023 10:23:29 -0500 Subject: [PATCH 207/344] link to stdc++fs (#985) --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fced69cf8..71077d22ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,7 +307,8 @@ list(APPEND FF_NVCC_FLAGS list(APPEND FF_LD_FLAGS -lrt -ldl - -rdynamic) + -rdynamic + -lstdc++fs) # Set FF FLAGS add_compile_options(${FF_CC_FLAGS}) From 0ec418985c78ce7b1f045fa45a74bf510c26219b Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 21 Aug 2023 07:46:33 -0500 Subject: [PATCH 208/344] add GenerationResult to the Python interface (#1000) --- include/flexflow/flexflow_c.h | 10 +++++++--- python/flexflow/core/flexflow_cffi.py | 16 +++++++++++++--- python/flexflow/serve/__init__.py | 8 ++++---- python/flexflow/serve/serve.py | 5 +++++ src/c/flexflow_c.cc | 21 ++++++++++++++++----- src/runtime/request_manager.cu | 4 ++-- 6 files changed, 47 insertions(+), 17 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 76cfd16ee1..a0481ac702 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -565,9 +565,13 @@ flexflow_perf_metrics_t void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); -flexflow_generation_result_t flexflow_model_generate(flexflow_model_t handle_, - char const *text, - int max_seq_length); +flexflow_generation_result_t + flexflow_model_generate(flexflow_model_t handle_, + char const *input_text, + int max_num_chars, + char *output_text, + int max_seq_length, + int *output_length_and_tokens); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 9fa3ab3aad..a1d8e1434e 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -23,6 +23,7 @@ import numpy as np from .flexflow_logger import fflogger from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, InferenceMode, ModelType, OpType, ParameterSyncType, enum_to_int, int_to_enum + _FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS")) if not _FF_BUILD_DOCS: from .flexflowlib import ffi, flexflow_library @@ -2962,9 +2963,18 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate(self, text, max_sequence_length): - c_text = get_c_name(text) - return ffc.flexflow_model_generate(self.handle, c_text, 
max_sequence_length) + def generate(self, prompt, max_sequence_length): + c_input_text = get_c_name(prompt) + max_num_chars = 36000 + c_output_text = ffi.new("char[]", max_num_chars) + c_output_length_and_tokens = ffi.new("int[]", max_sequence_length + 100) + ffc.flexflow_model_generate(self.handle, c_input_text, max_num_chars, c_output_text, max_sequence_length, c_output_length_and_tokens) + output_length = c_output_length_and_tokens[0] + output_tokens = [] + for i in range(output_length): + output_tokens.append(c_output_length_and_tokens[i+1]) + from flexflow.serve import GenerationResult + return GenerationResult(ffi.string(c_output_text), output_tokens) def set_position_offset(self, offset): ffc.flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 19a2774a54..e45b9759a0 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -212,8 +212,8 @@ def init(configs_dict: Optional[dict] = None, if fusion: sys.argv += ["--fusion"] - global LLM, SSM, GenerationConfig - from .serve import LLM, SSM, GenerationConfig + global LLM, SSM, GenerationConfig, GenerationResult + from .serve import LLM, SSM, GenerationConfig, GenerationResult def init_cpu(): @@ -225,5 +225,5 @@ def init_cpu(): # Ask the runtime to avoid using GPU/GPU memory os.environ["CPU_ONLY_TEST"] = "1" - global LLM, SSM, GenerationConfig - from .serve import LLM, SSM, GenerationConfig + global LLM, SSM, GenerationConfig, GenerationResult + from .serve import LLM, SSM, GenerationConfig, GenerationResult diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 3c26f7ab18..17bb953c9e 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -51,6 +51,11 @@ def __init__( self.topp = topp self.topk = topk +class GenerationResult: + """A class to store the output of a generation request.""" + def __init__(self, text: str = None, tokens: list = None): + self.output_text = text + self.output_tokens = tokens class LLM: """This class creates a LLM (Large-Language Model) object based on a model from HuggingFace""" diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 47c5d59f66..35c76b025d 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1416,13 +1416,24 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } -flexflow_generation_result_t flexflow_model_generate(flexflow_model_t handle_, - char const *text, - int max_seq_length) { - FFModel *handle = FFCObjectWrapper::unwrap(handle_); - std::string const text_str(text); +flexflow_generation_result_t + flexflow_model_generate(flexflow_model_t handle_, + char const *input_text, + int max_num_chars, + char *output_text, + int max_seq_length, + int *output_length_and_tokens) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::string const text_str(input_text); GenerationResult result = handle->generate(text_str, max_seq_length); DEBUG_PRINT("[Model] generate %p %s %i", handle, text, max_seq_length); + assert(result.output_tokens.size() <= max_seq_length); + output_length_and_tokens[0] = result.output_tokens.size(); + std::copy(result.output_tokens.begin(), + result.output_tokens.end(), + output_length_and_tokens + 1); + std::memcpy( + output_text, result.output_text.c_str(), result.output_text.length()); return FFCObjectWrapper::wrap(&result); } diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu 
index 7363e14cf0..58e996629e 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -68,8 +68,8 @@ void RequestManager::load_positions_task( // BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); - - const int offset = *((const int*)task->args); + + int const offset = *((int const *)task->args); int *pos_ptr = helperGetTensorPointerWO( regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain domain = runtime->get_index_space_domain( From 2f6f864037f5a52d42326360ecae9e149a3ee1d1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 21 Aug 2023 12:08:42 -0400 Subject: [PATCH 209/344] update pr template --- .github/PULL_REQUEST_TEMPLATE.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 183028b022..e8177cd9b7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,6 +10,3 @@ Linked Issues: Issues closed by this PR: - Closes # -**Before merging:** - -- [ ] Did you update the [flexflow-third-party](https://github.com/flexflow/flexflow-third-party) repo, if modifying any of the Cmake files, the build configs, or the submodules? From a5ffc62e44f9da46f3f5400a43c0a7bfa41ed6b6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 21 Aug 2023 15:55:53 -0400 Subject: [PATCH 210/344] support loading local model (#1004) --- python/flexflow/serve/serve.py | 84 +++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 17bb953c9e..cc4886d1e2 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -21,7 +21,7 @@ from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer from huggingface_hub import HfApi -import sys, torch, shutil +import sys, torch, shutil, hashlib from typing import Union, List @@ -120,6 +120,23 @@ def download_hf_config(self): print(f"Saving {self.model_name} configs to file {self.config_path}...") self.hf_config.to_json_file(self.config_path) + def __get_revision_hashes(self, model_name: str, weights: bool): + ff_revision = None + ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") if weights else os.path.join(self.tokenizer_path, "rev_sha.txt") + if os.path.exists(ff_revision_file): + ff_revision = "".join(open(ff_revision_file).read().split()) + + if os.path.exists(model_name) and os.path.isdir(model_name): + # Local model + files = os.listdir(model_name) + state = files + [os.path.getmtime(os.path.join(model_name, f)) for f in files] + latest_revision = hashlib.md5(str(state).encode('utf-8')).hexdigest() + else: + # Remote HuggingFace model + hf_api = HfApi() + latest_revision = hf_api.model_info(self.model_name).sha + return ff_revision, ff_revision_file, latest_revision + def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. 
@@ -149,25 +166,27 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - # Get local revision SHA, check if it matches latest one on huggingface - local_revision = None - local_revision_file = os.path.join(self.weights_path, "rev_sha.txt") - if os.path.exists(local_revision_file): - local_revision = "".join(open(local_revision_file).read().split()) - hf_api = HfApi() - latest_revision = hf_api.model_info(self.model_name).sha + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=True) # Download if needed - if local_revision != latest_revision: - print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." - ) - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True - ) - print("Done downloading HF weights. Converting them now...") + if ff_revision != latest_revision: + if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): + # Local model + print( + f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + ) + else: + # Remote model + print(f"'{self.model_name}' local model weights were updated! Converting new weights now...") + # Download model from HuggingFace, or load it from the local folder + hf_model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True) + # Print log message to notify user download of model has finished + if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): + print("Done downloading HF weights. Converting them now...") + # Convert the model to FlexFlow format self.model_class.convert_hf_model(hf_model, self.weights_path) - with open(local_revision_file, "w+") as f: + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: f.write(latest_revision) print("Done converting the weights...") else: @@ -196,29 +215,32 @@ def download_hf_tokenizer_if_needed(self): os.makedirs(self.tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface - local_revision = None - local_revision_file = os.path.join(self.tokenizer_path, "rev_sha.txt") - if os.path.exists(local_revision_file): - local_revision = "".join(open(local_revision_file).read().split()) - hf_api = HfApi() - latest_revision = hf_api.model_info(self.model_name).sha + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=False) - # Download if needed - if local_revision != latest_revision: - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) + if ff_revision != latest_revision: + if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): + # Local model + print(f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ...") + else: + # Remote model + print(f"'{self.model_name}' local tokenizer was updated! 
Saving new tokenizer now...") + # Download tokenizer from HuggingFace, or load it from the local folder if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( self.model_name, use_fast=True ) else: hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) + # Print log message to notify user download of tokenizer has finished + if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): + print("Done downloading tokenizer. Saving it now...") + # Save tokenizer hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done downloading HF tokenizer.") - with open(local_revision_file, "w+") as f: + print("Done saving HF tokenizer.") + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: f.write(latest_revision) - print("Loading the tokenizer...") + else: print(f"Loading '{self.model_name}' tokenizer from the cache...") From cf13ee7dcdeff3ea10ee0e0cb8b333e5c7a09661 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 22 Aug 2023 21:37:22 -0400 Subject: [PATCH 211/344] Add multinode tutorial to readthedocs (#1019) * add multinode tutorial to readtheocs * add missing file * add missing extension * fix --- MULTI-NODE.md | 2 +- docs/source/index.rst | 1 + docs/source/multinode.rst | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 docs/source/multinode.rst diff --git a/MULTI-NODE.md b/MULTI-NODE.md index a8fd2fb705..4bae47cfa6 100644 --- a/MULTI-NODE.md +++ b/MULTI-NODE.md @@ -68,4 +68,4 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su Follow step 6 in [INSTALL.md](INSTALL.md) to set environment variables. -A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html). \ No newline at end of file +A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html). diff --git a/docs/source/index.rst b/docs/source/index.rst index 2b369ac8e6..a7ea2ff3ac 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -12,6 +12,7 @@ Welcome to FlexFlow's documentation! welcome installation docker + multinode .. toctree:: :caption: FlexFlow Serve diff --git a/docs/source/multinode.rst b/docs/source/multinode.rst new file mode 100644 index 0000000000..8827200582 --- /dev/null +++ b/docs/source/multinode.rst @@ -0,0 +1,8 @@ +:tocdepth: 1 +****************** +Multinode tutorial +****************** + + +.. 
mdinclude:: ../../MULTI-NODE.md + :start-line: 3 From 9d0bc56a7b53ee89cec9de3921b8e6da23d92b20 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 25 Aug 2023 16:58:42 -0500 Subject: [PATCH 212/344] Allow FlexFlow Serve to stop when EOS token is generated (#1026) * enable early stop * fix EOS token id --- include/flexflow/request_manager.h | 18 +++++++++++-- src/runtime/request_manager.cc | 41 ++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index d6c30c6b78..1eb5643483 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -51,9 +51,15 @@ class InferenceManager { }; struct Request { + enum Status { + PENDING = 101, + RUNNING = 102, + COMPLETED = 103, + }; BatchConfig::RequestGuid guid; int max_sequence_length; int initial_len; + Status status = PENDING; std::vector tokens; std::vector beam_trees; @@ -206,8 +212,16 @@ class RequestManager { InferenceResultFuture last_irf; TreeVerifyBatchConfigFuture last_tree_bcf; InferenceResultFuture last_tree_irf; - const std::map model_bos_map = { - {ModelType::LLAMA, 0}, {ModelType::OPT, 2}, {ModelType::LLAMA2, 1}}; + const std::map model_bos_map = {{ModelType::LLAMA, 0}, + {ModelType::OPT, 2}, + {ModelType::LLAMA2, 1}, + {ModelType::FALCON, 11}, + {ModelType::STARCODER, 0}}; + const std::map model_eos_map = {{ModelType::LLAMA, 1}, + {ModelType::OPT, 2}, + {ModelType::LLAMA2, 2}, + {ModelType::FALCON, 11}, + {ModelType::STARCODER, 0}}; // TODO: Move this two vector to request struct std::unordered_map lock(request_queue_mutex); // Add a new request Request request; + request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; if (this->model_bos_map.find(this->model_type) != this->model_bos_map.end()) { @@ -253,7 +255,8 @@ bool RequestManager::is_request_completed(RequestGuid const &guid) { const std::lock_guard lock(request_queue_mutex); assert(all_requests.find(guid) != all_requests.end()); Request const &request = all_requests[guid]; - return request.tokens.size() >= request.max_sequence_length; + // return request.tokens.size() >= request.max_sequence_length; + return request.status == Request::COMPLETED; } GenerationResult @@ -326,9 +329,23 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int processed_tokens = old_bc.requestsInfo[i].token_start_offset + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length - // || ir.results[t] == 0 TODO: replace this with - ) { + bool request_completed = false; + printf("model_type = %d\n", this->model_type); + if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + request_completed = true; + } else if (this->model_eos_map.find(this->model_type) != + this->model_eos_map.end()) { + TokenId eos_token_id = this->model_eos_map.at(this->model_type); + printf("request_tokens.back() == %d eos_token_id = %d\n", + request.tokens.back(), + eos_token_id); + // Encounter EOS token id + if (request.tokens.back() == eos_token_id) { + request_completed = true; + } + } + if (request_completed) { + request.status = Request::COMPLETED; log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", old_bc.requestsInfo[i].request_guid, request.tokens.size()); @@ -352,13 +369,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, 
profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf) acc_latency(%.1lf)", + "finish(%.1lf) latency(%.1lf)", request.guid, profile_info.decoding_steps, profile_info.start_time, profile_info.finish_time, - profile_info.finish_time - profile_info.start_time, - total_request_run_time); + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath); @@ -715,7 +731,7 @@ BeamSearchBatchConfig request.tokens.push_back(verified_tokens[j].first); } } - + request.status = Request::COMPLETED; log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", request.guid, request.tokens.size()); @@ -736,13 +752,12 @@ BeamSearchBatchConfig profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf) acc_latency(%.1lf)", + "finish(%.1lf) latency(%.1lf)", request.guid, profile_info.decoding_steps, profile_info.start_time, profile_info.finish_time, - profile_info.finish_time - profile_info.start_time, - total_request_run_time); + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { @@ -1639,7 +1654,7 @@ GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, runtime->end_trace(ctx, 12346 /*trace_id*/); } GenerationResult gr = get_generation_result(guid); - assert(gr.output_tokens.size() >= max_seq_length); + // assert(gr.output_tokens.size() >= max_seq_length); return gr; } @@ -1714,7 +1729,7 @@ GenerationResult RequestManager::generate_spec_infer(FFModel *llm, } GenerationResult gr = get_generation_result(guid); - assert(gr.output_tokens.size() >= max_seq_length); + // assert(gr.output_tokens.size() >= max_seq_length); return gr; } From dfbe55494801a6ed6ef426b6ecd2e8d7b64b6e17 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 27 Aug 2023 02:21:05 -0400 Subject: [PATCH 213/344] Build docker images in more cuda versions (#1030) --- .github/workflows/docker-build.yml | 6 +++++- .github/workflows/helpers/install_cudnn.sh | 3 +++ docker/build.sh | 8 +++++--- docker/publish.sh | 4 ++-- docker/pull.sh | 4 ++-- docker/run.sh | 4 ++-- 6 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 748d3365da..304b63d65c 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] - cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"] + cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"] # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported exclude: - gpu_backend: "hip_rocm" @@ -35,12 +35,16 @@ jobs: cuda_version: "11.2" - gpu_backend: "hip_rocm" cuda_version: "11.3" + - gpu_backend: "hip_rocm" + cuda_version: "11.4" - gpu_backend: "hip_rocm" cuda_version: "11.5" - gpu_backend: "hip_rocm" cuda_version: "11.6" - gpu_backend: "hip_rocm" cuda_version: "11.7" + - gpu_backend: "hip_rocm" + cuda_version: "12.0" fail-fast: false env: FF_GPU_BACKEND: ${{ matrix.gpu_backend }} diff --git 
a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh index 318134e331..75e59109eb 100755 --- a/.github/workflows/helpers/install_cudnn.sh +++ b/.github/workflows/helpers/install_cudnn.sh @@ -44,6 +44,9 @@ elif [[ "$cuda_version" == "11.7" ]]; then elif [[ "$cuda_version" == "11.8" ]]; then CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz +elif [[ "$cuda_version" == "11.8" ]]; then + echo "CUDNN support for CUDA version 12.0 not yet added" + exit 1 fi wget -c -q $CUDNN_LINK if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" == "11.8" ]]; then diff --git a/docker/build.sh b/docker/build.sh index 6ed5cbe00e..f0f2610b04 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -36,14 +36,16 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version=${cuda_version:1:4} fi # Check that CUDA version is supported, and modify cuda version to include default subsubversion - if [[ "$cuda_version" == @(11.1|11.3|11.7) ]]; then + if [[ "$cuda_version" == @(11.1|11.3|11.7|12.0|12.1) ]]; then cuda_version_input=${cuda_version}.1 elif [[ "$cuda_version" == @(11.2|11.5|11.6) ]]; then cuda_version_input=${cuda_version}.2 - elif [[ "$cuda_version" == @(11.8) ]]; then + elif [[ "$cuda_version" == @(11.4) ]]; then + cuda_version_input=${cuda_version}.3 + elif [[ "$cuda_version" == @(11.8|12.2) ]]; then cuda_version_input=${cuda_version}.0 else - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}" + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi # Set cuda version suffix to docker image name diff --git a/docker/publish.sh b/docker/publish.sh index b8668d3c0e..8073abdd72 100755 --- a/docker/publish.sh +++ b/docker/publish.sh @@ -36,8 +36,8 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version=${cuda_version:1:4} fi # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}" + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi # Set cuda version suffix to docker image name diff --git a/docker/pull.sh b/docker/pull.sh index f8624a1072..fb56a26902 100755 --- a/docker/pull.sh +++ b/docker/pull.sh @@ -36,8 +36,8 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version=${cuda_version:1:4} fi # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}" + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi # Set cuda version suffix to docker image name diff --git a/docker/run.sh b/docker/run.sh index f326db0a6c..2ebc19d834 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -49,8 +49,8 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || 
"${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version=${cuda_version:1:4} fi # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}" + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi # Set cuda version suffix to docker image name From 00be68d19fb8410aca13f5656dc074c8c2943489 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 28 Aug 2023 03:59:00 -0400 Subject: [PATCH 214/344] Automatic bos/eos token determination, plus docker fix (#1031) * automatic bos/eos token determination, plus docker fix * fix --- .github/workflows/gpu-ci.yml | 2 ++ docker/build.sh | 5 ++++ docker/pull.sh | 4 ++++ docker/run.sh | 4 ++++ include/flexflow/flexflow_c.h | 2 ++ include/flexflow/request_manager.h | 23 ++++++------------- inference/incr_decoding/incr_decoding.cc | 5 +++- inference/spec_infer/spec_infer.cc | 11 +++++++++ python/flexflow/core/flexflow_cffi.py | 4 ++-- python/flexflow/serve/serve.py | 4 ++-- src/c/flexflow_c.cc | 5 +++- src/runtime/request_manager.cc | 29 ++++++++---------------- 12 files changed, 57 insertions(+), 41 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index f732513e0d..d604a7cea9 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -174,6 +174,8 @@ jobs: make -j - name: Run inference tests + env: + CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} run: | export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) diff --git a/docker/build.sh b/docker/build.sh index f0f2610b04..e4f06a0ec1 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -48,6 +48,11 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi + # Use CUDA 12.0 for all versions greater or equal to 12.0 for now + if [[ "$cuda_version" == @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + cuda_version=12.0 + cuda_version_input=${cuda_version}.1 + fi # Set cuda version suffix to docker image name echo "Building $image docker image with CUDA $cuda_version" cuda_version="-${cuda_version}" diff --git a/docker/pull.sh b/docker/pull.sh index fb56a26902..989fc89ccb 100755 --- a/docker/pull.sh +++ b/docker/pull.sh @@ -40,6 +40,10 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi + # Use CUDA 12.0 for all versions greater or equal to 12.0 for now + if [[ "$cuda_version" == @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + cuda_version=12.0 + fi # Set cuda version suffix to docker image name echo "Downloading $image docker image with CUDA $cuda_version" cuda_version="-${cuda_version}" diff --git a/docker/run.sh b/docker/run.sh index 2ebc19d834..bb948d6b78 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -53,6 +53,10 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi + # Use CUDA 12.0 for all versions greater or equal to 12.0 for now + if [[ "$cuda_version" 
== @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + cuda_version=12.0 + fi # Set cuda version suffix to docker image name echo "Running $image docker image with CUDA $cuda_version" cuda_version_hyphen="-${cuda_version}" diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index a0481ac702..003533bb80 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -923,6 +923,8 @@ flexflow_request_manager_t flexflow_request_manager_get_request_manager(void); void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, + int bos_token_id, + int eos_token_id, char const *tokenizer_filepath); void flexflow_request_manager_register_output_filepath( diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 1eb5643483..e444402dd0 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -86,21 +86,20 @@ class RequestManager { public: using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; - // RequestManager(ModelType model_type, - // std::string const &path, - // bool verbose = false, - // std::string output_filepath = ""); + RequestManager(); static RequestManager *get_request_manager(); size_t get_num_processed_requests(); size_t get_num_ssms(); int register_ssm_model(FFModel *model); - void register_tokenizer(ModelType model_type, std::string const &path); + void register_tokenizer(ModelType model_type, + int bos_token_id, + int eos_token_id, + std::string const &path); void register_output_filepath(std::string const &); FFModel *get_model(int model_id); - static void serve(FFModel *model); GenerationResult generate_incr_decoding(FFModel *model, std::string const &text, @@ -201,6 +200,8 @@ class RequestManager { std::unique_ptr tokenizer_; bool verbose; ModelType model_type; + int bos_token_id; + int eos_token_id; std::string output_filepath; std::queue pending_request_queue; std::unordered_map all_requests; @@ -212,16 +213,6 @@ class RequestManager { InferenceResultFuture last_irf; TreeVerifyBatchConfigFuture last_tree_bcf; InferenceResultFuture last_tree_irf; - const std::map model_bos_map = {{ModelType::LLAMA, 0}, - {ModelType::OPT, 2}, - {ModelType::LLAMA2, 1}, - {ModelType::FALCON, 11}, - {ModelType::STARCODER, 0}}; - const std::map model_eos_map = {{ModelType::LLAMA, 1}, - {ModelType::OPT, 2}, - {ModelType::LLAMA2, 2}, - {ModelType::FALCON, 11}, - {ModelType::STARCODER, 0}}; // TODO: Move this two vector to request struct std::unordered_mapregister_tokenizer(model_type, tokenizer_filepath); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 975b66c82e..16eab8d077 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -45,6 +45,8 @@ struct ModelMeta { std::string llm_weights_path; std::string llm_model_config_path; + int bos_token_id, eos_token_id; + std::vector ssm_model_types; std::vector ssm_model_config_paths; std::vector ssm_model_weights_paths; @@ -165,6 +167,8 @@ void get_model_meta(FilePaths &file_paths, break; } } + model_metadata.bos_token_id = llm_model_config["bos_token_id"]; + model_metadata.eos_token_id = llm_model_config["eos_token_id"]; for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { 
std::string ssm_config_path = join_path({file_paths.cache_folder_path, @@ -211,6 +215,11 @@ void get_model_meta(FilePaths &file_paths, break; } } + if (ssm_model_config["bos_token_id"] != model_metadata.bos_token_id || + ssm_model_config["eos_token_id"] != model_metadata.eos_token_id) { + printf("Warning: bos/eos token id mismatch between LLM and one of the " + "SSMs!\n"); + } model_metadata.ssm_model_types.push_back(ssm_model_type); model_metadata.ssm_model_config_paths.push_back(ssm_config_path); model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); @@ -257,6 +266,8 @@ void FlexFlow::top_level_task(Task const *task, InferenceManager *im = InferenceManager::get_inference_manager(); RequestManager *rm = RequestManager::get_request_manager(); rm->register_tokenizer(model_metadata.llm_model_type, + model_metadata.bos_token_id, + model_metadata.eos_token_id, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index a1d8e1434e..1508371ae7 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3231,10 +3231,10 @@ def __init__(self): self.handle = ffc.flexflow_request_manager_get_request_manager() #self._handle = ffi.gc(self.handle, ffc.flexflow_request_manager_destroy) - def register_tokenizer(self, model_type, tokenizer_filepath): + def register_tokenizer(self, model_type, bos_token_id, eos_token_id, tokenizer_filepath): c_model_type = enum_to_int(ModelType, model_type) c_tokenizer_filepath = get_c_name(tokenizer_filepath) - return ffc.flexflow_request_manager_register_tokenizer(self.handle, c_model_type, c_tokenizer_filepath) + return ffc.flexflow_request_manager_register_tokenizer(self.handle, c_model_type, bos_token_id, eos_token_id, c_tokenizer_filepath) def register_output_filepath(self, output_filepath): c_output_filepath = get_c_name(output_filepath) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index cc4886d1e2..dea21389d1 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -212,7 +212,7 @@ def download_hf_tokenizer_if_needed(self): shutil.rmtree(self.tokenizer_path) if not os.path.exists(self.tokenizer_path): print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") - os.makedirs(self.tokenizer_path, exist_ok=True) + os.makedirs(self.tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=False) @@ -358,7 +358,7 @@ def compile( # Create request manager self.rm = RequestManager() - self.rm.register_tokenizer(self.model_type, self.tokenizer_path) + self.rm.register_tokenizer(self.model_type, self.hf_config.bos_token_id, self.hf_config.eos_token_id, self.tokenizer_path) self.rm.register_output_filepath(self.output_file) self.im.init_operators_inference(self.model.ffmodel) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 35c76b025d..96ff84c85f 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2388,12 +2388,15 @@ flexflow_request_manager_t flexflow_request_manager_get_request_manager(void) { void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, + int bos_token_id, + int eos_token_id, char const *tokenizer_filepath) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); 
assert(tokenizer_filepath != nullptr && "Cannot convert nullptr char * to std::string"); std::string const tokenizer_filepath_str(tokenizer_filepath); - handle->register_tokenizer(model_type, tokenizer_filepath_str); + handle->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath_str); DEBUG_PRINT( "[RequestManager] register tokenizer %p %s", handle, tokenizer_filepath); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5b4eae13cc..d75b0fbe0b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -66,9 +66,12 @@ RequestManager::RequestManager() } void RequestManager::register_tokenizer(ModelType type, + int bos_token_id, + int eos_token_id, std::string const &path) { - // bos id this->model_type = type; + this->bos_token_id = bos_token_id; + this->eos_token_id = eos_token_id; std::string tokenizer_folder = (!path.empty() && path.back() != '/') ? path + '/' : path; if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { @@ -157,7 +160,7 @@ RequestManager::RequestGuid } if (get_num_ssms() == 0) { - std::cout << "No small speculative model registered yet, using incremental " + std::cout << "No small speculative model registered, using incremental " "decoding." << std::endl; } else { @@ -198,9 +201,7 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - if (this->model_bos_map.find(this->model_type) != this->model_bos_map.end()) { - request.tokens.push_back(this->model_bos_map.at(this->model_type)); - } + request.tokens.push_back(bos_token_id); std::vector tokens = this->tokenizer_->Encode(prompt); if (tokens.size() > BatchConfig::MAX_PROMPT_LENGTH) { @@ -220,7 +221,7 @@ RequestManager::RequestGuid request.initial_len = request.tokens.size(); if (get_num_ssms() == 0) { - std::cout << "No small speculative model registered yet, using incremental " + std::cout << "No small speculative model registered, using incremental " "decoding." 
<< std::endl; } else { @@ -330,19 +331,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); bool request_completed = false; - printf("model_type = %d\n", this->model_type); + // printf("model_type = %d\n", this->model_type); if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { request_completed = true; - } else if (this->model_eos_map.find(this->model_type) != - this->model_eos_map.end()) { - TokenId eos_token_id = this->model_eos_map.at(this->model_type); - printf("request_tokens.back() == %d eos_token_id = %d\n", - request.tokens.back(), - eos_token_id); + } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id - if (request.tokens.back() == eos_token_id) { - request_completed = true; - } + request_completed = true; } if (request_completed) { request.status = Request::COMPLETED; @@ -351,9 +345,6 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); - // for (int i = 0; i < request.tokens.size(); i++) { - // std::cout << request.tokens.at(i) << "\n"; - // } { // update generation result and trigger future GenerationResult &gr = request_generation_results[request.guid]; From 5bf476c062e4808d2138889cee1b66ec6d75a338 Mon Sep 17 00:00:00 2001 From: raphaelauv Date: Mon, 28 Aug 2023 16:02:32 +0200 Subject: [PATCH 215/344] clean: duplicate in requirements.txt (#1034) --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c6337dccfa..1037661337 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,9 +7,7 @@ pybind11 cmake-build-extension ninja requests -qualname regex -requests torch>=1.13.1 torchaudio>=0.13.1 torchvision>=0.14.1 From e6763fa2aadcbb7ac98d800cf367b64498d93557 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 30 Aug 2023 22:47:24 -0400 Subject: [PATCH 216/344] Build Docker images for AMD gpus (#1041) * enable building docker images for different hip versions * ignore shellcheck error code * support hip compilation in inference cmake files * fix * fix * cmake fix * cmake fixes * cmake fixes * nested if condition fix * update docker workflow and config scripts * update scripts * fix * fix * cleanup * rocm 5.6 by default in workflow * update workflow * fix * fix * fix * fix permissions * fix workflow * fix * fix --- .github/workflows/build.yml | 53 ++++---- .github/workflows/docker-build.yml | 60 ++++++---- .../workflows/helpers/install_dependencies.sh | 38 ++++-- .github/workflows/pip-install.yml | 2 + CMakeLists.txt | 24 +++- cmake/hip.cmake | 11 ++ cmake/legion.cmake | 3 + config/config.inc | 7 +- config/config.linux | 5 +- docker/build.sh | 113 ++++++++++++------ docker/flexflow-environment/Dockerfile | 33 ++++- docker/flexflow/Dockerfile | 4 +- docker/publish.sh | 47 ++++++-- docker/pull.sh | 47 ++++++-- docker/run.sh | 71 ++++++++--- inference/incr_decoding/CMakeLists.txt | 17 ++- inference/spec_infer/CMakeLists.txt | 17 ++- 17 files changed, 412 insertions(+), 140 deletions(-) create mode 100644 cmake/hip.cmake diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1a9ad7186c..1c6eff4a12 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -40,6 +40,8 @@ jobs: matrix: gpu_backend: ["cuda", "hip_rocm"] fail-fast: false + env: + FF_GPU_BACKEND: ${{ matrix.gpu_backend }} steps: - name: 
Checkout Git Repository uses: actions/checkout@v3 @@ -51,6 +53,7 @@ jobs: - name: Install CUDA uses: Jimver/cuda-toolkit@v0.2.11 + if: ${{ matrix.gpu_backend == 'cuda' }} id: cuda-toolkit with: cuda: "11.8.0" @@ -58,7 +61,7 @@ jobs: use-github-cache: "false" - name: Install system dependencies - run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh + run: .github/workflows/helpers/install_dependencies.sh - name: Install conda and FlexFlow dependencies uses: conda-incubator/setup-miniconda@v2 @@ -72,22 +75,25 @@ jobs: export CUDNN_DIR="$CUDA_PATH" export CUDA_DIR="$CUDA_PATH" export FF_HOME=$(pwd) - export FF_GPU_BACKEND=${{ matrix.gpu_backend }} export FF_CUDA_ARCH=70 - cores_available=$(nproc --all) - n_build_cores=$(( cores_available -1 )) - if (( $n_build_cores < 1 )) ; then n_build_cores=1 ; fi - mkdir build - cd build + export FF_HIP_ARCH=gfx1100,gfx1036 + export hip_version=5.6 + export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then export FF_BUILD_ALL_EXAMPLES=ON - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else export FF_BUILD_ALL_EXAMPLES=OFF - export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi + + cores_available=$(nproc --all) + n_build_cores=$(( cores_available -1 )) + if (( $n_build_cores < 1 )) ; then n_build_cores=1 ; fi + mkdir build + cd build + ../config/config.linux make -j $n_build_cores @@ -96,30 +102,24 @@ jobs: export CUDNN_DIR="$CUDA_PATH" export CUDA_DIR="$CUDA_PATH" export FF_HOME=$(pwd) - export FF_GPU_BACKEND=${{ matrix.gpu_backend }} export FF_CUDA_ARCH=70 - cd build + export FF_HIP_ARCH=gfx1100,gfx1036 + export hip_version=5.6 + export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then export FF_BUILD_ALL_EXAMPLES=ON - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else export FF_BUILD_ALL_EXAMPLES=OFF - export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi + + cd build ../config/config.linux sudo make install sudo ldconfig - - name: Check availability of Python flexflow.core module - if: ${{ matrix.gpu_backend == 'cuda' }} - run: | - export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH" - sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1" - export CPU_ONLY_TEST=1 - python -c "import flexflow.core; exit()" - - name: Run C++ unit tests if: ${{ matrix.gpu_backend == 'cuda' }} run: | @@ -127,9 +127,20 @@ jobs: export CUDA_DIR="$CUDA_PATH" export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH" export FF_HOME=$(pwd) + sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1" cd build ./tests/unit/unit-test + - name: Check availability of Python flexflow.core module + run: | + if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then + export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH" + fi + # Remove build folder to check that the installed version can run independently of the build files + rm -rf build + export CPU_ONLY_TEST=1 + python -c "import flexflow.core; exit()" + makefile-build: name: Build FlexFlow with the Makefile runs-on: ubuntu-20.04 diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 304b63d65c..b0ca251510 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -26,29 +26,42 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] - cuda_version: ["11.1", "11.2", "11.3", "11.4", 
"11.5", "11.6", "11.7", "11.8", "12.0"] + gpu_backend_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0", "5.3", "5.4", "5.5", "5.6"] # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported exclude: + - gpu_backend: "cuda" + gpu_backend_version: "5.3" + - gpu_backend: "cuda" + gpu_backend_version: "5.4" + - gpu_backend: "cuda" + gpu_backend_version: "5.5" + - gpu_backend: "cuda" + gpu_backend_version: "5.6" - gpu_backend: "hip_rocm" - cuda_version: "11.1" + gpu_backend_version: "11.1" - gpu_backend: "hip_rocm" - cuda_version: "11.2" + gpu_backend_version: "11.2" - gpu_backend: "hip_rocm" - cuda_version: "11.3" + gpu_backend_version: "11.3" - gpu_backend: "hip_rocm" - cuda_version: "11.4" + gpu_backend_version: "11.4" - gpu_backend: "hip_rocm" - cuda_version: "11.5" + gpu_backend_version: "11.5" - gpu_backend: "hip_rocm" - cuda_version: "11.6" + gpu_backend_version: "11.6" - gpu_backend: "hip_rocm" - cuda_version: "11.7" + gpu_backend_version: "11.7" - gpu_backend: "hip_rocm" - cuda_version: "12.0" + gpu_backend_version: "11.8" + - gpu_backend: "hip_rocm" + gpu_backend_version: "12.0" fail-fast: false env: FF_GPU_BACKEND: ${{ matrix.gpu_backend }} - cuda_version: ${{ matrix.cuda_version }} + gpu_backend_version: ${{ matrix.gpu_backend_version }} + # one of the two variables below will be unused + cuda_version: ${{ matrix.gpu_backend_version }} + hip_version: ${{ matrix.gpu_backend_version }} branch_name: ${{ github.head_ref || github.ref_name }} steps: - name: Checkout Git Repository @@ -58,8 +71,8 @@ jobs: - name: Free additional space on runner env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }} - build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} + build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }} run: | if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then .github/workflows/helpers/free_space_on_runner.sh @@ -69,22 +82,19 @@ jobs: - name: Build Docker container env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }} - build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} + build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }} run: | - if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON - else - export FF_BUILD_ALL_INFERENCE_EXAMPLES=OFF - fi # On push to inference, build for all compatible architectures, so that we can publish # a pre-built general-purpose image. On all other cases, only build for one architecture # to save time. 
if [[ $deploy_needed == "true" ]] ; then export FF_CUDA_ARCH=all + export FF_HIP_ARCH=all ./docker/build.sh flexflow elif [[ $build_needed == "true" ]]; then export FF_CUDA_ARCH=70 + export FF_HIP_ARCH=gfx1100,gfx1036 ./docker/build.sh flexflow else echo "Skipping build to save time" @@ -93,11 +103,15 @@ jobs: - name: Check availability of Python flexflow.core module if: ${{ matrix.gpu_backend == 'cuda' }} env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }} - build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} + build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }} run: | if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then - docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'" + if [[ $FF_GPU_BACKEND == "cuda" ]]; then + docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'" + else + docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "python -c 'import flexflow.core; exit()'" + fi else echo "Skipping test to save time" fi @@ -106,7 +120,7 @@ jobs: if: github.repository_owner == 'flexflow' env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} run: | if [[ $deploy_needed == "true" ]]; then ./docker/publish.sh flexflow-environment diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh index cf37f3b820..1357882b5d 100755 --- a/.github/workflows/helpers/install_dependencies.sh +++ b/.github/workflows/helpers/install_dependencies.sh @@ -10,21 +10,41 @@ echo "Installing apt dependencies..." sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev && \ sudo rm -rf /var/lib/apt/lists/* -# Install CUDNN -./install_cudnn.sh - -# Install HIP dependencies if needed FF_GPU_BACKEND=${FF_GPU_BACKEND:-"cuda"} +hip_version=${hip_version:-"5.6"} if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid." 
exit 1 -elif [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then +fi +# Install CUDNN if needed +if [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then + # Install CUDNN + ./install_cudnn.sh +fi +# Install HIP dependencies if needed +if [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies" - wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/focal/amdgpu-install_22.20.50205-1_all.deb - sudo apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb - rm ./amdgpu-install_22.20.50205-1_all.deb + # Check that hip_version is one of 5.3,5.4,5.5,5.6 + if [[ "$hip_version" != "5.3" && "$hip_version" != "5.4" && "$hip_version" != "5.5" && "$hip_version" != "5.6" ]]; then + echo "hip_version '${hip_version}' is not supported, please choose among {5.3, 5.4, 5.5, 5.6}" + exit 1 + fi + # Compute script name and url given the version + AMD_GPU_SCRIPT_NAME=amdgpu-install_5.6.50600-1_all.deb + if [ "$hip_version" = "5.3" ]; then + AMD_GPU_SCRIPT_NAME=amdgpu-install_5.3.50300-1_all.deb + elif [ "$hip_version" = "5.4" ]; then + AMD_GPU_SCRIPT_NAME=amdgpu-install_5.4.50400-1_all.deb + elif [ "$hip_version" = "5.5" ]; then + AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb + fi + AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}" + # Download and install AMD GPU software with ROCM and HIP support + wget "$AMD_GPU_SCRIPT_URL" + sudo apt-get install -y ./${AMD_GPU_SCRIPT_NAME} + sudo rm ./${AMD_GPU_SCRIPT_NAME} sudo amdgpu-install -y --usecase=hip,rocm --no-dkms - sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk + sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs # Install protobuf v3.20.x manually sudo apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev unzip python autoconf automake libtool curl make diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index d79834e31d..695ed9857b 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -66,6 +66,8 @@ jobs: export FF_HOME=$(pwd) export FF_CUDA_ARCH=70 pip install . --verbose + # Remove build folder to check that the installed version can run independently of the build files + rm -rf build - name: Check availability of Python flexflow.core module run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 71077d22ef..2985732a8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -160,9 +160,14 @@ set_property(CACHE FF_GPU_BACKEND PROPERTY STRINGS ${FF_GPU_BACKENDS}) # option for cuda arch set(FF_CUDA_ARCH "autodetect" CACHE STRING "Target CUDA Arch") -if (FF_CUDA_ARCH STREQUAL "") +if ((FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") AND FF_CUDA_ARCH STREQUAL "") message(FATAL_ERROR "FF_CUDA_ARCH cannot be an empty string. Set it to `autodetect`, `all`, or pass one or multiple valid CUDA archs.") endif() +# option for hip arch +set(FF_HIP_ARCH "all" CACHE STRING "Target HIP Arch") +if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_CUDA_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH cannot be an empty string. 
Set it to `all`, or pass one or multiple valid HIP archs.") +endif() # option for nccl option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF) @@ -226,6 +231,11 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") include(cuda) endif() +# HIP +if (FF_GPU_BACKEND STREQUAL "hip_rocm" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + include(hip) +endif() + # CUDNN if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") include(cudnn) @@ -397,6 +407,18 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") add_compile_definitions(FF_USE_HIP_ROCM) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is undefined") + endif() + set_property(TARGET flexflow PROPERTY HIP_ARCHITECTURES "${HIP_ARCH_LIST}") + + message(STATUS "FF_GPU_BACKEND: ${FF_GPU_BACKEND}") + message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}") + message(STATUS "HIP_ARCH_LIST: ${HIP_ARCH_LIST}") + get_property(CHECK_HIP_ARCHS TARGET flexflow PROPERTY HIP_ARCHITECTURES) + message(STATUS "CHECK_HIP_ARCHS: ${CHECK_HIP_ARCHS}") + message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") + # The hip cmake config module defines three targets, # hip::amdhip64, hip::host, and hip::device. # diff --git a/cmake/hip.cmake b/cmake/hip.cmake new file mode 100644 index 0000000000..b32d68d608 --- /dev/null +++ b/cmake/hip.cmake @@ -0,0 +1,11 @@ +if (NOT FF_HIP_ARCH STREQUAL "") + if (FF_HIP_ARCH STREQUAL "all") + set(FF_HIP_ARCH "gfx900,gfx902,gfx904,gfx906,gfx908,gfx909,gfx90a,gfx90c,gfx940,gfx1010,gfx1011,gfx1012,gfx1013,gfx1030,gfx1031,gfx1032,gfx1033,gfx1034,gfx1035,gfx1036,gfx1100,gfx1101,gfx1102,gfx1103") + endif() + string(REPLACE "," " " HIP_ARCH_LIST "${FF_HIP_ARCH}") +endif() + +message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}") +if(FF_GPU_BACKEND STREQUAL "hip_rocm") + set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE) +endif() diff --git a/cmake/legion.cmake b/cmake/legion.cmake index b4cfad20e2..b83cbc52f2 100644 --- a/cmake/legion.cmake +++ b/cmake/legion.cmake @@ -142,8 +142,11 @@ else() set(Legion_USE_HIP ON CACHE BOOL "enable Legion_USE_HIP" FORCE) if (FF_GPU_BACKEND STREQUAL "hip_cuda") set(Legion_HIP_TARGET "CUDA" CACHE STRING "Legion_HIP_TARGET CUDA" FORCE) + set(Legion_CUDA_ARCH ${FF_CUDA_ARCH} CACHE STRING "Legion CUDA ARCH" FORCE) elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") set(Legion_HIP_TARGET "ROCM" CACHE STRING "Legion HIP_TARGET ROCM" FORCE) + set(Legion_HIP_ARCH ${FF_HIP_ARCH} CACHE STRING "Legion HIP ARCH" FORCE) + message(STATUS "Legion_HIP_ARCH: ${Legion_HIP_ARCH}") endif() endif() set(Legion_REDOP_COMPLEX OFF CACHE BOOL "disable complex") diff --git a/config/config.inc b/config/config.inc index 804757af78..eb1ad21fc0 100644 --- a/config/config.inc +++ b/config/config.inc @@ -50,6 +50,11 @@ if [ -n "$FF_CUDA_ARCH" ]; then SET_CUDA_ARCH="-DFF_CUDA_ARCH=${FF_CUDA_ARCH}" fi +# set HIP Arch +if [ -n "$FF_HIP_ARCH" ]; then + SET_HIP_ARCH="-DFF_HIP_ARCH=${FF_HIP_ARCH}" +fi + # set CUDA dir if [ -n "$CUDA_DIR" ]; then SET_CUDA="-DCUDA_PATH=${CUDA_DIR}" @@ -213,7 +218,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} 
${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 023ae1a183..e9c580220b 100755 --- a/config/config.linux +++ b/config/config.linux @@ -32,6 +32,9 @@ fi # Alternatively, set "FF_CUDA_ARCH=autodetect" to build FlexFlow for all architectures detected on the machine, # or set "FF_CUDA_ARCH=all" to build FlexFlow for all supported GPU architectures FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} +# FF_HIP_ARCH only supports building for a specific AMD architecture, a list of architectures separated by a comma +# or all available architectures. TODO: support autodetect +FF_HIP_ARCH=${FF_HIP_ARCH:-"all"} # set CUDNN dir in case cmake cannot autodetect a path CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} @@ -94,7 +97,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/docker/build.sh b/docker/build.sh index e4f06a0ec1..e72c23fcd8 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -2,7 +2,7 @@ set -euo pipefail # Usage: ./build.sh -# Optional environment variables: FF_GPU_BACKEND, cuda_version +# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version # Cd into $FF_HOME. Assumes this script is in $FF_HOME/docker cd "${BASH_SOURCE[0]%/*}/.." @@ -11,6 +11,7 @@ cd "${BASH_SOURCE[0]%/*}/.." 
image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} cuda_version=${cuda_version:-"empty"} +hip_version=${hip_version:-"empty"} # Check docker image name if [[ "$image" != @(flexflow-environment|flexflow) ]]; then @@ -28,12 +29,22 @@ else echo "Building $image docker image with default GPU backend: cuda" fi +# base image to use when building the flexflow environment docker image. +ff_environment_base_image="ubuntu:20.04" +# gpu backend version suffix for the docker image. +gpu_backend_version="" + if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then # Autodetect cuda version if not specified if [[ $cuda_version == "empty" ]]; then - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') + # shellcheck disable=SC2015 + cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) # Change cuda_version eg. V11.7.99 to 11.7 cuda_version=${cuda_version:1:4} + if [[ -z "$cuda_version" ]]; then + echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." + exit 1 + fi fi # Check that CUDA version is supported, and modify cuda version to include default subsubversion if [[ "$cuda_version" == @(11.1|11.3|11.7|12.0|12.1) ]]; then @@ -53,34 +64,57 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version=12.0 cuda_version_input=${cuda_version}.1 fi - # Set cuda version suffix to docker image name echo "Building $image docker image with CUDA $cuda_version" - cuda_version="-${cuda_version}" -else - # Empty cuda version suffix for non-CUDA images - cuda_version="" - # Pick a default CUDA version for the base docker image from NVIDIA - cuda_version_input="11.8.0" + ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04" + gpu_backend_version="-${cuda_version}" fi -docker build --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "cuda_version=${cuda_version_input}" -t "flexflow-environment-${FF_GPU_BACKEND}${cuda_version}" -f docker/flexflow-environment/Dockerfile . +if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Autodetect HIP version if not specified + if [[ $hip_version == "empty" ]]; then + # shellcheck disable=SC2015 + hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true) + # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6 + hip_version=${hip_version:0:3} + if [[ -z "$hip_version" ]]; then + echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env." + exit 1 + fi + fi + # Check that HIP version is supported + if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then + echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}" + exit 1 + fi + echo "Building $image docker image with HIP $hip_version" + if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then + gpu_backend_version="-${hip_version}" + fi +fi + +# Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking +cores_available=$(nproc --all) +n_build_cores=$(( cores_available -1 )) + +docker build --build-arg "ff_environment_base_image=${ff_environment_base_image}" --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "hip_version=${hip_version}" -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow-environment/Dockerfile . 
# If the user only wants to build the environment image, we are done if [[ "$image" == "flexflow-environment" ]]; then exit 0 fi -# Gather arguments needed to build the FlexFlow image -# Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking -cores_available=$(nproc --all) -n_build_cores=$(( cores_available -1 )) +# Done with flexflow-environment image + +########################################################################################### -# If FF_CUDA_ARCH is set to autodetect, we need to perform the autodetection here because the Docker -# image will not have access to GPUs during the build phase (due to a Docker restriction). In all other -# cases, we pass the value of FF_CUDA_ARCH directly to Cmake. -if [[ "${FF_CUDA_ARCH:-autodetect}" == "autodetect" ]]; then - # Get CUDA architecture(s), if GPUs are available - cat << EOF > ./get_gpu_arch.cu +# Build flexflow image if requested +if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # If FF_CUDA_ARCH is set to autodetect, we need to perform the autodetection here because the Docker + # image will not have access to GPUs during the build phase (due to a Docker restriction). In all other + # cases, we pass the value of FF_CUDA_ARCH directly to Cmake. + if [[ "${FF_CUDA_ARCH:-autodetect}" == "autodetect" ]]; then + # Get CUDA architecture(s), if GPUs are available + cat << EOF > ./get_gpu_arch.cu #include int main() { int count = 0; @@ -94,24 +128,25 @@ int main() { return 0; } EOF - gpu_arch_codes="" - if command -v nvcc &> /dev/null - then - nvcc ./get_gpu_arch.cu -o ./get_gpu_arch - gpu_arch_codes="$(./get_gpu_arch)" - fi - gpu_arch_codes="$(echo "$gpu_arch_codes" | xargs -n1 | sort -u | xargs)" - gpu_arch_codes="${gpu_arch_codes// /,}" - rm -f ./get_gpu_arch.cu ./get_gpu_arch - - if [[ -n "$gpu_arch_codes" ]]; then - echo "Host machine has GPUs with architecture codes: $gpu_arch_codes" - echo "Configuring FlexFlow to build for the $gpu_arch_codes code(s)." - FF_CUDA_ARCH="${gpu_arch_codes}" - export FF_CUDA_ARCH - else - echo "FF_CUDA_ARCH is set to 'autodetect', but the host machine does not have any compatible GPUs." - exit 1 + gpu_arch_codes="" + if command -v nvcc &> /dev/null + then + nvcc ./get_gpu_arch.cu -o ./get_gpu_arch + gpu_arch_codes="$(./get_gpu_arch)" + fi + gpu_arch_codes="$(echo "$gpu_arch_codes" | xargs -n1 | sort -u | xargs)" + gpu_arch_codes="${gpu_arch_codes// /,}" + rm -f ./get_gpu_arch.cu ./get_gpu_arch + + if [[ -n "$gpu_arch_codes" ]]; then + echo "Host machine has GPUs with architecture codes: $gpu_arch_codes" + echo "Configuring FlexFlow to build for the $gpu_arch_codes code(s)." + FF_CUDA_ARCH="${gpu_arch_codes}" + export FF_CUDA_ARCH + else + echo "FF_CUDA_ARCH is set to 'autodetect', but the host machine does not have any compatible GPUs." + exit 1 + fi fi fi @@ -121,4 +156,4 @@ fi # Set value of BUILD_CONFIGS get_build_configs -docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "cuda_version=${cuda_version}" -t "flexflow-${FF_GPU_BACKEND}${cuda_version}" -f docker/flexflow/Dockerfile . +docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "gpu_backend_version=${gpu_backend_version}" -t "flexflow-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow/Dockerfile . 
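A minimal usage sketch for the updated Docker scripts above, assuming the commands are run from the repository root, that hipcc is available on the PATH, and that an exported FF_HIP_ARCH is picked up through config/config.linux; gfx90a and 5.6 are example values taken from the supported lists in this patch, not requirements:
# Build the ROCm environment and FlexFlow images for one AMD arch and HIP 5.6
FF_GPU_BACKEND=hip_rocm FF_HIP_ARCH=gfx90a hip_version=5.6 ./docker/build.sh flexflow
# Run the resulting image; if hip_version is unset, run.sh autodetects it from hipcc
FF_GPU_BACKEND=hip_rocm hip_version=5.6 ./docker/run.sh flexflow   # expects flexflow-hip_rocm-5.6:latest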
diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 09f8be51dc..f1ebdcc28a 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -1,5 +1,5 @@ -ARG cuda_version -FROM nvidia/cuda:${cuda_version}-cudnn8-devel-ubuntu20.04 +ARG ff_environment_base_image +FROM ${ff_environment_base_image} LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow LABEL org.opencontainers.image.description="FlexFlow environment container" @@ -31,13 +31,34 @@ RUN wget -c -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_6 # in the container. It also attempts to install packages for a graphical install. # For our container, we don't need `hip-runtime-nvidia` ARG FF_GPU_BACKEND "cuda" +ARG hip_version "5.6" +ARG N_BUILD_CORES +# set MAKEFLAGS to speedup any dependency that uses make +ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" + RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \ - wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/bionic/amdgpu-install_22.20.50205-1_all.deb; \ - apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb; \ - rm ./amdgpu-install_22.20.50205-1_all.deb; \ + # Check that hip_version is one of 5.3,5.4,5.5,5.6 + if [ "$hip_version" != "5.3" ] && [ "$hip_version" != "5.4" ] && [ "$hip_version" != "5.5" ] && [ "$hip_version" != "5.6" ]; then \ + echo "hip_version '${hip_version}' is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"; \ + exit 1; \ + fi; \ + # Compute script name and url given the version + AMD_GPU_SCRIPT_NAME=amdgpu-install_5.6.50600-1_all.deb; \ + if [ "$hip_version" = "5.3" ]; then \ + AMD_GPU_SCRIPT_NAME=amdgpu-install_5.3.50300-1_all.deb; \ + elif [ "$hip_version" = "5.4" ]; then \ + AMD_GPU_SCRIPT_NAME=amdgpu-install_5.4.50400-1_all.deb; \ + elif [ "$hip_version" = "5.5" ]; then \ + AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb; \ + fi; \ + AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}"; \ + # Download and install AMD GPU software with ROCM and HIP support + wget $AMD_GPU_SCRIPT_URL; \ + apt-get install -y ./${AMD_GPU_SCRIPT_NAME}; \ + rm ./${AMD_GPU_SCRIPT_NAME}; \ amdgpu-install -y --usecase=hip,rocm --no-dkms; \ - apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk; \ + apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs; \ # Install protobuf v3.20.x manually apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev autoconf automake libtool make; \ git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git; cd protobuf/ ; git submodule update --init --recursive; \ diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile index 0cda5cbc18..a7d540bc71 100644 --- a/docker/flexflow/Dockerfile +++ b/docker/flexflow/Dockerfile @@ -1,6 +1,6 @@ ARG FF_GPU_BACKEND "cuda" -ARG cuda_version "" -FROM flexflow-environment-$FF_GPU_BACKEND$cuda_version:latest +ARG gpu_backend_version "" +FROM flexflow-environment-$FF_GPU_BACKEND$gpu_backend_version:latest LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow LABEL org.opencontainers.image.description="FlexFlow container" diff --git a/docker/publish.sh b/docker/publish.sh index 8073abdd72..c70419a9cc 100755 --- a/docker/publish.sh +++ b/docker/publish.sh @@ -2,7 +2,7 @@ set -euo pipefail # Usage: ./publish.sh -# Optional 
environment variables: FF_GPU_BACKEND, cuda_version +# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" @@ -11,6 +11,7 @@ cd "${BASH_SOURCE[0]%/*}" image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} cuda_version=${cuda_version:-"empty"} +hip_version=${hip_version:-"empty"} # Check docker image name if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then @@ -18,6 +19,9 @@ if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then exit 1 fi +# gpu backend version suffix for the docker image. +gpu_backend_version="" + # Check GPU backend if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'." @@ -31,9 +35,14 @@ fi if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then # Autodetect cuda version if not specified if [[ $cuda_version == "empty" ]]; then - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') + # shellcheck disable=SC2015 + cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) # Change cuda_version eg. V11.7.99 to 11.7 cuda_version=${cuda_version:1:4} + if [[ -z "$cuda_version" ]]; then + echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." + exit 1 + fi fi # Check that CUDA version is supported if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then @@ -42,14 +51,34 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the fi # Set cuda version suffix to docker image name echo "Publishing $image docker image with CUDA $cuda_version" - cuda_version="-${cuda_version}" -else - # Empty cuda version suffix for non-CUDA images - cuda_version="" + gpu_backend_version="-${cuda_version}" +fi + +if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Autodetect HIP version if not specified + if [[ $hip_version == "empty" ]]; then + # shellcheck disable=SC2015 + hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true) + # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6 + hip_version=${hip_version:0:3} + if [[ -z "$hip_version" ]]; then + echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env." 
+ exit 1 + fi + fi + # Check that HIP version is supported + if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then + echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}" + exit 1 + fi + echo "Publishing $image docker image with HIP $hip_version" + if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then + gpu_backend_version="-${hip_version}" + fi fi # Check that image exists -docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version}":latest > /dev/null +docker image inspect "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest > /dev/null # Log into container registry FLEXFLOW_CONTAINER_TOKEN=${FLEXFLOW_CONTAINER_TOKEN:-} @@ -59,8 +88,8 @@ echo "$FLEXFLOW_CONTAINER_TOKEN" | docker login ghcr.io -u flexflow --password-s # Tag image to be uploaded git_sha=${GITHUB_SHA:-$(git rev-parse HEAD)} if [ -z "$git_sha" ]; then echo "Commit hash cannot be detected, cannot publish the docker image to ghrc.io"; exit; fi -docker tag "${image}-${FF_GPU_BACKEND}${cuda_version}":latest ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${cuda_version}":latest +docker tag "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest # Upload image -docker push ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${cuda_version}":latest +docker push ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest diff --git a/docker/pull.sh b/docker/pull.sh index 989fc89ccb..e5b6f26f3c 100755 --- a/docker/pull.sh +++ b/docker/pull.sh @@ -2,7 +2,7 @@ set -euo pipefail # Usage: ./pull.sh -# Optional environment variables: FF_GPU_BACKEND, cuda_version +# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" @@ -11,6 +11,7 @@ cd "${BASH_SOURCE[0]%/*}" image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} cuda_version=${cuda_version:-"empty"} +hip_version=${hip_version:-"empty"} # Check docker image name if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then @@ -28,12 +29,20 @@ else echo "Downloading $image docker image with default GPU backend: cuda" fi +# gpu backend version suffix for the docker image. +gpu_backend_version="" + if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then # Autodetect cuda version if not specified if [[ $cuda_version == "empty" ]]; then - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') + # shellcheck disable=SC2015 + cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) # Change cuda_version eg. V11.7.99 to 11.7 cuda_version=${cuda_version:1:4} + if [[ -z "$cuda_version" ]]; then + echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1 + fi fi # Check that CUDA version is supported if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then @@ -46,17 +55,37 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the fi # Set cuda version suffix to docker image name echo "Downloading $image docker image with CUDA $cuda_version" - cuda_version="-${cuda_version}" -else - # Empty cuda version suffix for non-CUDA images - cuda_version="" + gpu_backend_version="-${cuda_version}" +fi + +if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Autodetect HIP version if not specified + if [[ $hip_version == "empty" ]]; then + # shellcheck disable=SC2015 + hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true) + # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6 + hip_version=${hip_version:0:3} + if [[ -z "$hip_version" ]]; then + echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env." + exit 1 + fi + fi + # Check that HIP version is supported + if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then + echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}" + exit 1 + fi + echo "Downloading $image docker image with HIP $hip_version" + if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then + gpu_backend_version="-${hip_version}" + fi fi # Download image -docker pull ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${cuda_version}" +docker pull ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${gpu_backend_version}" # Tag downloaded image -docker tag ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${cuda_version}":latest "$image-${FF_GPU_BACKEND}${cuda_version}":latest +docker tag ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${gpu_backend_version}":latest "$image-${FF_GPU_BACKEND}${gpu_backend_version}":latest # Check that image exists -docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version}":latest > /dev/null +docker image inspect "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest > /dev/null diff --git a/docker/run.sh b/docker/run.sh index bb948d6b78..76ec1e1ceb 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -2,7 +2,7 @@ set -euo pipefail # Usage: ./run.sh -# Optional environment variables: FF_GPU_BACKEND, cuda_version, ATTACH_GPUS, SHM_SIZE +# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version, ATTACH_GPUS, SHM_SIZE # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" @@ -11,6 +11,7 @@ cd "${BASH_SOURCE[0]%/*}" image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} cuda_version=${cuda_version:-"empty"} +hip_version=${hip_version:-"empty"} # Parameter controlling whether to attach GPUs to the Docker container ATTACH_GPUS=${ATTACH_GPUS:-true} @@ -41,12 +42,20 @@ else echo "Running $image docker image with default GPU backend: cuda" fi +# gpu backend version suffix for the docker image. +gpu_backend_version="" + if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then # Autodetect cuda version if not specified if [[ $cuda_version == "empty" ]]; then - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}') + # shellcheck disable=SC2015 + cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) # Change cuda_version eg. V11.7.99 to 11.7 cuda_version=${cuda_version:1:4} + if [[ -z "$cuda_version" ]]; then + echo "Could not detect CUDA version. 
Please specify one manually by setting the 'cuda_version' env." + exit 1 + fi fi # Check that CUDA version is supported if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then @@ -59,28 +68,56 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the fi # Set cuda version suffix to docker image name echo "Running $image docker image with CUDA $cuda_version" - cuda_version_hyphen="-${cuda_version}" -else - # Empty cuda version suffix for non-CUDA images - cuda_version_hyphen="" + gpu_backend_version="-${cuda_version}" +fi + +if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Autodetect HIP version if not specified + if [[ $hip_version == "empty" ]]; then + # shellcheck disable=SC2015 + hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true) + # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6 + hip_version=${hip_version:0:3} + if [[ -z "$hip_version" ]]; then + echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env." + exit 1 + fi + fi + # Check that HIP version is supported + if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then + echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}" + exit 1 + fi + echo "Running $image docker image with HIP $hip_version" + if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then + gpu_backend_version="-${hip_version}" + fi fi # Check that image exists, if fails, print the default error message. -if [[ "$(docker images -q "$image"-"$FF_GPU_BACKEND""$cuda_version_hyphen":latest 2> /dev/null)" == "" ]]; then - echo "" - echo "To download the docker image, run:" - echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/pull.sh $image" - echo "To build the docker image from source, run:" - echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/build.sh $image" - echo "" +if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest 2> /dev/null)" == "" ]]; then + echo "Error, ${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest does not exist!" 
+ if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then + echo "" + echo "To download the docker image, run:" + echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/pull.sh $image" + echo "To build the docker image from source, run:" + echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/build.sh $image" + echo "" + elif [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then + echo "" + echo "To download the docker image, run:" + echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} hip_version=${hip_version} $(pwd)/pull.sh $image" + echo "To build the docker image from source, run:" + echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} hip_version=${hip_version} $(pwd)/build.sh $image" + echo "" + fi exit 1 fi inference_volumes="" if $ATTACH_INFERENCE_FILES ; then - inference_volumes="-v $(pwd)/../inference/weights:/usr/FlexFlow/inference/weights \ - -v $(pwd)/../inference/prompt:/usr/FlexFlow/inference/prompt \ - -v $(pwd)/../inference/tokenizer:/usr/FlexFlow/inference/tokenizer"; + inference_volumes="-v ~/.cache/flexflow:/usr/FlexFlow/inference"; fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${cuda_version_hyphen}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/inference/incr_decoding/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt index 4ed60abb87..c3b97d094a 100644 --- a/inference/incr_decoding/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -13,7 +13,22 @@ set(CPU_SRC ../models/falcon.cc ../models/starcoder.cc) -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) diff --git a/inference/spec_infer/CMakeLists.txt b/inference/spec_infer/CMakeLists.txt index 9697bd8256..3d6b48b802 100644 --- a/inference/spec_infer/CMakeLists.txt +++ b/inference/spec_infer/CMakeLists.txt @@ -12,7 +12,22 @@ set(CPU_SRC ../models/opt.cc ../models/falcon.cc) -cuda_add_executable(${project_target} ${CPU_SRC} ${GPU_SRC}) +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET 
${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) From 85acb4108d99f9329eea70149e7f4472062e3076 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 1 Sep 2023 23:40:55 -0400 Subject: [PATCH 217/344] Remove zlib (#1086) --- CMakeLists.txt | 3 --- cmake/zlib.cmake | 8 ------ include/flexflow/config.h | 2 +- include/flexflow/ops/cast.h | 10 +++---- include/flexflow/ops/layer_norm.h | 10 +++---- include/flexflow/ops/mean.h | 10 +++---- include/flexflow/ops/rms_norm.h | 10 +++---- include/flexflow/utils/hip_helper.h | 2 +- src/ops/aggregate.cpp | 12 ++++----- src/ops/aggregate_spec.cpp | 16 ++++++------ src/ops/arg_topk.cpp | 12 ++++----- src/ops/argmax.cpp | 12 ++++----- src/ops/attention.cpp | 24 ++++++++--------- src/ops/batch_norm.cpp | 24 ++++++++--------- src/ops/beam_topk.cpp | 12 ++++----- src/ops/cache.cpp | 16 ++++++------ src/ops/element_unary.cc | 2 +- src/ops/group_by.cpp | 12 ++++----- src/ops/inc_multihead_self_attention.cpp | 12 ++++----- src/ops/kernels/batch_matmul.cpp | 24 ++++++++--------- src/ops/kernels/cast_kernels.cpp | 12 ++++----- src/ops/kernels/concat_kernels.cpp | 24 ++++++++--------- src/ops/kernels/conv_2d_kernels.cpp | 24 ++++++++--------- src/ops/kernels/element_binary_kernels.cpp | 24 ++++++++--------- src/ops/kernels/linear_kernels.cpp | 26 +++++++++---------- src/ops/kernels/pool_2d_kernels.cpp | 24 ++++++++--------- src/ops/kernels/rms_norm_kernels.cpp | 12 ++++----- src/ops/kernels/softmax.cpp | 24 ++++++++--------- src/ops/layer_norm.cpp | 6 ++--- src/ops/rms_norm.cc | 2 +- src/ops/sampling.cpp | 12 ++++----- src/ops/softmax.cc | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 12 ++++----- src/ops/topk.cpp | 18 ++++++------- src/ops/tree_inc_multihead_self_attention.cc | 4 +-- src/ops/tree_inc_multihead_self_attention.cpp | 12 ++++----- src/runtime/inference_manager.cc | 2 +- src/runtime/initializer_kernel.cpp | 2 +- src/runtime/model.cpp | 6 +++-- src/runtime/simulator.cpp | 4 +-- src/runtime/substitution.cc | 4 +++ 41 files changed, 241 insertions(+), 248 deletions(-) delete mode 100644 cmake/zlib.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 2985732a8c..59eb27df2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -223,9 +223,6 @@ if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") set(ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory.") endif() -# ZLIB -include(zlib) - # CUDA if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") include(cuda) diff --git a/cmake/zlib.cmake b/cmake/zlib.cmake deleted file mode 100644 index 0281e02b88..0000000000 --- a/cmake/zlib.cmake +++ /dev/null @@ -1,8 +0,0 @@ -find_package(ZLIB REQUIRED) -if(ZLIB_FOUND) - list(APPEND FLEXFLOW_EXT_LIBRARIES - ${ZLIB_LIBRARIES}) - message( STATUS "ZLIB libraries : ${ZLIB_LIBRARIES}" ) -else() - message( FATAL_ERROR "ZLIB package not found") -endif() \ No newline at end of file diff --git a/include/flexflow/config.h b/include/flexflow/config.h index be6c0d21da..a55b13ad6d 100644 --- 
a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -22,7 +22,7 @@ #include #include #elif defined(FF_USE_HIP_ROCM) -#include +#include #include #else #error "Unknown device" diff --git a/include/flexflow/ops/cast.h b/include/flexflow/ops/cast.h index a06f87b3c8..a88e7d6bb0 100644 --- a/include/flexflow/ops/cast.h +++ b/include/flexflow/ops/cast.h @@ -34,19 +34,19 @@ class Cast : public Op { Params const ¶ms, Input const &input, char const *name = nullptr); - void init(FFModel const &); + void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void forward(FFModel const &); - void backward(FFModel const &); + void forward(FFModel const &) override; + void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void print_layer(FFModel const &model) { + void print_layer(FFModel const &model) override { assert(0); } static Op * @@ -92,7 +92,7 @@ class Cast : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, - CostMetrics &cost_metrics) const; + CostMetrics &cost_metrics) const override; void serialize(Legion::Serializer &s) const override; static PCG::Node deserialize(FFModel &ff, Legion::Deserializer &d, diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index cb977fc6a6..a36e41a19f 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -24,19 +24,19 @@ class LayerNorm : public Op { float _eps, bool allocate_weights, char const *name); - void init(FFModel const &); + void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void forward(FFModel const &); - void backward(FFModel const &); + void forward(FFModel const &) override; + void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void print_layer(FFModel const &model) { + void print_layer(FFModel const &model) override { assert(0); } static Op * @@ -68,7 +68,7 @@ class LayerNorm : public Op { Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, - CostMetrics &cost_metrics) const; + CostMetrics &cost_metrics) const override; template static void forward_kernel(LayerNormMeta const *m, T const *input_ptr, diff --git a/include/flexflow/ops/mean.h b/include/flexflow/ops/mean.h index 3dc0ac9aa5..33d041031c 100644 --- a/include/flexflow/ops/mean.h +++ b/include/flexflow/ops/mean.h @@ -11,10 +11,10 @@ class Mean : public Op { std::vector const &dims, bool keepdims, char const *name); - void init(FFModel const &); - void forward(FFModel const &); - void backward(FFModel const &); - void print_layer(FFModel const &model) { + void init(FFModel const &) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + void print_layer(FFModel const &model) override { assert(0); } @@ -32,7 +32,7 @@ class Mean : public Op { Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, - CostMetrics &cost_metrics) const; + CostMetrics &cost_metrics) const override; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/rms_norm.h 
b/include/flexflow/ops/rms_norm.h index 979a20976c..36dde15b90 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -31,9 +31,9 @@ class RMSNorm : public Op { RMSNorm const &other, const ParallelTensor input, bool allocate_weights); - void init(FFModel const &); - void forward(FFModel const &); - void backward(FFModel const &); + void init(FFModel const &) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, @@ -43,7 +43,7 @@ class RMSNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void print_layer(FFModel const &model) { + void print_layer(FFModel const &model) override { assert(0); } @@ -71,7 +71,7 @@ class RMSNorm : public Op { Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, - CostMetrics &cost_metrics) const; + CostMetrics &cost_metrics) const override; public: float eps; diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index d16f353ade..374ea30064 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -3,7 +3,7 @@ #include "flexflow/accessor.h" #include "flexflow/ffconst.h" #include "legion.h" -#include +#include #include #define FatalError(s) \ diff --git a/src/ops/aggregate.cpp b/src/ops/aggregate.cpp index bc4391c426..d5ebdb0c22 100644 --- a/src/ops/aggregate.cpp +++ b/src/ops/aggregate.cpp @@ -216,8 +216,8 @@ void Aggregate::forward_kernel_wrapper(AggregateMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); // call forward_kernel - hipMemcpy( - m->dev_exp_preds, exp_preds, n * sizeof(float *), hipMemcpyHostToDevice); + checkCUDA(hipMemcpy( + m->dev_exp_preds, exp_preds, n * sizeof(float *), hipMemcpyHostToDevice)); hipLaunchKernelGGL(agg_forward_kernel, GET_BLOCKS(batch_size * k * out_dim), @@ -256,10 +256,10 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); // call backward kernel - hipMemcpy( - m->dev_exp_preds, exp_preds, n * sizeof(float *), hipMemcpyHostToDevice); - hipMemcpy( - m->dev_exp_grads, exp_grads, n * sizeof(float *), hipMemcpyHostToDevice); + checkCUDA(hipMemcpy( + m->dev_exp_preds, exp_preds, n * sizeof(float *), hipMemcpyHostToDevice)); + checkCUDA(hipMemcpy( + m->dev_exp_grads, exp_grads, n * sizeof(float *), hipMemcpyHostToDevice)); hipLaunchKernelGGL(agg_backward_kernel, GET_BLOCKS(batch_size * k * out_dim), diff --git a/src/ops/aggregate_spec.cpp b/src/ops/aggregate_spec.cpp index e961c3ae7b..314e20a59c 100644 --- a/src/ops/aggregate_spec.cpp +++ b/src/ops/aggregate_spec.cpp @@ -226,10 +226,10 @@ void AggregateSpec::forward_kernel_wrapper(AggregateSpecMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); // call forward kernel - hipMemcpy(m->dev_region_ptrs, - exp_preds, - n * sizeof(float *), - hipMemcpyHostToDevice); + checkCUDA(hipMemcpy(m->dev_region_ptrs, + exp_preds, + n * sizeof(float *), + hipMemcpyHostToDevice)); hipLaunchKernelGGL(aggspec_forward_kernel, GET_BLOCKS(batch_size * k * out_dim), @@ -266,10 +266,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); // call backward kernel - hipMemcpy(m->dev_region_ptrs, - exp_grads, - n * sizeof(float *), - hipMemcpyHostToDevice); + checkCUDA(hipMemcpy(m->dev_region_ptrs, + exp_grads, + n * sizeof(float *), 
+ hipMemcpyHostToDevice)); hipLaunchKernelGGL(aggspec_backward_kernel, GET_BLOCKS(batch_size * k * out_dim), diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index 4937166b66..6db8abb8c4 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -448,9 +448,9 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } if (input.data_type == DT_HALF) { @@ -477,12 +477,12 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, assert(false && "Unsupported data type"); } if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); } } diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 778ddf3c9d..17b8c9ad16 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -42,20 +42,20 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } handle_unimplemented_hip_kernel(OP_RMS_NORM); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); } } diff --git a/src/ops/attention.cpp b/src/ops/attention.cpp index 9b6ad6cb46..ee7f87a7fb 100644 --- a/src/ops/attention.cpp +++ b/src/ops/attention.cpp @@ -56,19 +56,19 @@ void MultiHeadAttention::forward_kernel_wrapper(MultiHeadAttentionMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } MultiHeadAttention::forward_kernel( m, query_ptr, key_ptr, value_ptr, weight_ptr, output_ptr, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("MultiHeadAttention forward time = %.2fms\n", elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, @@ -124,9 +124,9 @@ void MultiHeadAttention::backward_kernel_wrapper( hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } MultiHeadAttention::backward_kernel(m, @@ -141,12 +141,12 @@
output_grad_ptr, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("MultiHeadAttention backward time = %.2fms\n", elapsed); } } diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index a0a2d47e24..34a7fcbe72 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -133,9 +133,9 @@ __host__ void hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } forward_kernel(m, acc_input.ptr, @@ -143,12 +143,12 @@ __host__ void acc_scale.ptr, acc_bias.ptr /*, stream*/); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("BatchNorm forward time (BF) = %.2fms\n", elapsed); } } @@ -256,9 +256,9 @@ __host__ void hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } backward_kernel(m, acc_input.ptr, @@ -270,12 +270,12 @@ __host__ void acc_bias_grad.ptr, acc_output.rect.volume() /*, stream*/); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("BatchNorm backward time = %.2fms\n", elapsed); } } diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 293feecff0..134b0c6abe 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -638,9 +638,9 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } if (input.data_type == DT_HALF) { @@ -668,12 +668,12 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, } if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("[BeamTopK] forward time = %.2lfms\n", elapsed); } } diff --git a/src/ops/cache.cpp b/src/ops/cache.cpp index 8dd1e098c2..95c5995f9e 100644 --- a/src/ops/cache.cpp +++ b/src/ops/cache.cpp @@ -43,10 +43,10 @@ void Cache::cache_forward(Task const *task, checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - 
hipMemcpy(output_ptr, - batch_ptrs[batch_ctr], - c->inputs[0]->get_volume() * sizeof(T), - hipMemcpyHostToDevice); + checkCUDA(hipMemcpy(output_ptr, + batch_ptrs[batch_ctr], + c->inputs[0]->get_volume() * sizeof(T), + hipMemcpyHostToDevice)); } template @@ -61,10 +61,10 @@ float Cache::cache_update(Task const *task, T const *input_ptr = helperGetTensorPointerRW( regions[0], task->regions[0], FID_DATA, ctx, runtime); T *host_input = (T *)c->batch_cmp; - hipMemcpy(host_input, - input_ptr, - c->inputs[0]->get_volume() * sizeof(T), - hipMemcpyDeviceToHost); + checkCUDA(hipMemcpy(host_input, + input_ptr, + c->inputs[0]->get_volume() * sizeof(T), + hipMemcpyDeviceToHost)); float cache_score = c->score_f(&m->cache_score, host_input, c->batch_ptrs[batch_ctr], diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 5ecb812b68..c82c1196a2 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -31,7 +31,7 @@ Tensor FFModel::unary(OperatorType op, // if (x->data_type < DT_FLOAT) { if (false) { dtype = DT_FLOAT; - std::string str = nullptr ? "" : std::string(name); + std::string str = (name == nullptr) ? "" : std::string(name); Tensor new_x = cast(x, dtype, (str + "input_pre_cast").c_str()); ele = new Layer(this, op, diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index 51bcd7d7b4..761c35f182 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -134,8 +134,8 @@ void Group_by::forward_kernel_wrapper(GroupByMeta const *m, checkCUDA(get_legion_stream(&stream)); // call forward kernel - hipMemcpy( - m->dev_region_ptrs, outputs, n * sizeof(float *), hipMemcpyHostToDevice); + checkCUDA(hipMemcpy( + m->dev_region_ptrs, outputs, n * sizeof(float *), hipMemcpyHostToDevice)); hipLaunchKernelGGL(gb_forward_kernel, GET_BLOCKS(batch_size * k * data_dim), @@ -168,10 +168,10 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, checkCUDA(get_legion_stream(&stream)); // call forward kernel - hipMemcpy(m->dev_region_ptrs, - output_grads, - n * sizeof(float *), - hipMemcpyHostToDevice); + checkCUDA(hipMemcpy(m->dev_region_ptrs, + output_grads, + n * sizeof(float *), + hipMemcpyHostToDevice)); hipLaunchKernelGGL(gb_backward_kernel, GET_BLOCKS(batch_size * k * data_dim), diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index b7ed189040..99f8f49abd 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -37,20 +37,20 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("IncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, diff --git a/src/ops/kernels/batch_matmul.cpp b/src/ops/kernels/batch_matmul.cpp index 34468d28a1..7145af2108 100644 --- 
a/src/ops/kernels/batch_matmul.cpp +++ b/src/ops/kernels/batch_matmul.cpp @@ -41,9 +41,9 @@ void forward_kernel_wrapper(BatchMatmulMeta const *meta, hipEvent_t t_start, t_end; if (meta->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::forward_kernel(meta, o_ptr, @@ -59,12 +59,12 @@ void forward_kernel_wrapper(BatchMatmulMeta const *meta, b_seq_length_dim, seq_length); if (meta->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("BatchMatmul forward time = %.2lfms\n", elapsed); } } @@ -86,9 +86,9 @@ void backward_kernel_wrapper(BatchMatmulMeta const *meta, hipEvent_t t_start, t_end; if (meta->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::backward_kernel(meta, o_ptr, @@ -104,12 +104,12 @@ void backward_kernel_wrapper(BatchMatmulMeta const *meta, batch, stream); if (meta->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("BatchMatmul backward time = %.2lfms\n", elapsed); } } diff --git a/src/ops/kernels/cast_kernels.cpp b/src/ops/kernels/cast_kernels.cpp index f47bd0ed92..16b9b4cec0 100644 --- a/src/ops/kernels/cast_kernels.cpp +++ b/src/ops/kernels/cast_kernels.cpp @@ -34,19 +34,19 @@ void forward_kernel_wrapper(CastMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::forward_kernel(input_ptr, output_ptr, volume, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("[%s] forward time (CF) = %.2fms\n", "Cast", elapsed); // print_tensor(input_ptr, 32, "[Cast:forward:input]"); // print_tensor(output_ptr, 32, "[Cast:forward:output]"); diff --git a/src/ops/kernels/concat_kernels.cpp b/src/ops/kernels/concat_kernels.cpp index 5f6e04abc9..bf5d46b9cc 100644 --- a/src/ops/kernels/concat_kernels.cpp +++ b/src/ops/kernels/concat_kernels.cpp @@ -40,13 +40,13 @@ void forward_kernel_wrapper(ConcatMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::forward_kernel(output, inputs, num_inputs, axis, stream); 
if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); // print_tensor<4, float>(output - output_blk_size, output_rect, // "[Concat:forward:output]"); printf("output_blk_size=%zu\n", @@ -56,8 +56,8 @@ void forward_kernel_wrapper(ConcatMeta const *m, float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); printf("[%s] forward time = %.4f ms\n", m->op_name, elapsed); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); } } @@ -71,19 +71,19 @@ void backward_kernel_wrapper(ConcatMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::backward_kernel(output_grad, input_grads, num_inputs, axis, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); printf("[%s] forward time = %.4f ms\n", m->op_name, elapsed); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); } } diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index b4ec1545c3..7d2fa20c49 100644 --- a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -174,15 +174,15 @@ void forward_kernel_wrapper(Conv2DMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::forward_kernel( m, input_ptr, output_ptr, filter_ptr, bias_ptr, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); print_tensor(input_ptr, 16, "[Conv2D:forward:input]"); print_tensor(filter_ptr, 16, "[Conv2D:forward:kernel]"); @@ -190,8 +190,8 @@ void forward_kernel_wrapper(Conv2DMeta const *m, print_tensor(output_ptr, 16, "[Conv2D:forward:output]"); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("%s [Conv2D] forward time (CF) = %.2fms\n", m->op_name, elapsed); } } @@ -209,9 +209,9 @@ void backward_kernel_wrapper(Conv2DMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::backward_kernel(m, @@ -224,12 +224,12 @@ void backward_kernel_wrapper(Conv2DMeta const *m, bias_grad_ptr, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("%s [Conv2D] backward time = %.2fms\n", 
m->op_name, elapsed); // print_tensor<4, float>(acc_output_grad.ptr, acc_output_grad.rect, // "[Conv2D:backward:output_grad]"); print_tensor<4, diff --git a/src/ops/kernels/element_binary_kernels.cpp b/src/ops/kernels/element_binary_kernels.cpp index 3aef875d1f..a65372de85 100644 --- a/src/ops/kernels/element_binary_kernels.cpp +++ b/src/ops/kernels/element_binary_kernels.cpp @@ -76,9 +76,9 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } // print_tensor(in1_ptr, in1_domain.get_volume(), "input1:"); // print_tensor(in2_ptr, in2_domain.get_volume(), "input2:"); @@ -86,12 +86,12 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, m, in1.get_float_ptr(), in2.get_float_ptr(), out.get_float_ptr(), stream); // print_tensor(out_ptr, in1_domain.get_volume(), "output:"); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); char const *opName; switch (m->op_type) { case OP_EW_ADD: @@ -124,9 +124,9 @@ void backward_kernel_wrapper(ElementBinaryMeta const *m, checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::backward_kernel( @@ -135,12 +135,12 @@ void backward_kernel_wrapper(ElementBinaryMeta const *m, // CUDA_NUM_THREADS>>>( out_grad_domain.get_volume(), alpha, alpha, // ele->op_type, out_grad_ptr, in1_ptr, in2_ptr, in1_grad_ptr, in2_grad_ptr); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); char const *opName; switch (m->op_type) { case OP_EW_ADD: diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 0d70e91d47..231ca0f3d7 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -61,7 +61,7 @@ bool use_activation(ActiMode mode) { return false; } -void Linear::init_kernel(LinearMeta *m, int batch_size, int channel) { +void init_kernel(LinearMeta *m, int batch_size, int channel) { if (use_activation(m->activation)) { miopenActivationMode_t mode; switch (m->activation) { @@ -99,9 +99,9 @@ void forward_kernel_wrapper(LinearMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } if (m->input_type[0] == DT_FLOAT) { @@ -127,12 +127,12 @@ void forward_kernel_wrapper(LinearMeta const *m, } if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); 
float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); // print_tensor(acc_input.ptr, acc_input.rect.volume(), // "[Linear:forward:input]"); print_tensor(acc_kernel.ptr, @@ -159,9 +159,9 @@ void backward_kernel_wrapper(LinearMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } if (m->input_type[0] == DT_FLOAT) { Internal::backward_kernel(m, @@ -192,12 +192,12 @@ void backward_kernel_wrapper(LinearMeta const *m, } if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("%s Linear backward time = %.2lfms\n", m->op_name, elapsed); // print_tensor(acc_output_grad.ptr, acc_output_grad.rect.volume(), // "[Linear:backward:output_grad]"); diff --git a/src/ops/kernels/pool_2d_kernels.cpp b/src/ops/kernels/pool_2d_kernels.cpp index f302969559..8af85612ca 100644 --- a/src/ops/kernels/pool_2d_kernels.cpp +++ b/src/ops/kernels/pool_2d_kernels.cpp @@ -75,21 +75,21 @@ void forward_kernel_wrapper(Pool2DMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::forward_kernel(m, input_ptr, output_ptr, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); // print_tensor<4, float>(acc_input.ptr, acc_input.rect, // "[Pool2D:forward:input]"); print_tensor<4, float>(acc_output.ptr, // acc_output.rect, "[Pool2D:forward:output]"); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("%s [Pool2D] forward time = %.2fms\n", m->op_name, elapsed); } } @@ -104,19 +104,19 @@ void backward_kernel_wrapper(Pool2DMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::backward_kernel( m, input_ptr, input_grad_ptr, output_ptr, output_grad_ptr, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("Pool2D backward time = %.2fms\n", elapsed); } } diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index b2e2648785..03f49774c5 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp 
+++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -39,20 +39,20 @@ void forward_kernel_wrapper(RMSNormMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } handle_unimplemented_hip_kernel(OP_RMS_NORM); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); } } diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index d09a5aaf6d..8599f09244 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -45,21 +45,21 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::forward_kernel(m, input_ptr, output_ptr, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); // print_tensor(acc_input.ptr, acc_input.rect.volume(), // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, // acc_output.rect.volume(), "[Softmax:forward:output]"); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); log_measure.debug( "%s [Softmax] forward time = %.2fms\n", m->op_name, elapsed); } @@ -75,14 +75,14 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } Internal::backward_kernel( input_grad_ptr, output_grad_ptr, num_elements, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); // print_tensor(acc_output_grad.ptr, acc_output_grad.rect.volume(), // "[Softmax:backward:output_grad]"); @@ -90,8 +90,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, // "[Softmax:backward:input_grad]"); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); log_measure.debug("Softmax backward time = %.2fms\n", elapsed); } } diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 855f7296e8..ddbf96a493 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -47,12 +47,10 @@ __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) { -#if 0 #ifndef __HIP_PLATFORM_HCC__ - return __shfl_down_sync(mask, value, delta, width); + return __shfl_down_sync(mask, value, delta, width); #else - return __shfl_down(value, delta, width); -#endif + return __shfl_down(value, delta, width); #endif } diff --git 
a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 1f21591130..8fdfc7bc1e 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -431,7 +431,7 @@ Op *RMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { RMSNormParams params = get_params(); - return new RMSNorm(ff, params, inputs[0], this->name); + return new RMSNorm(ff, params, inputs[0], true, this->name); } void RMSNorm::backward(FFModel const &ff) {} diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp index 56f3f604d5..c4aa3a804a 100644 --- a/src/ops/sampling.cpp +++ b/src/ops/sampling.cpp @@ -40,20 +40,20 @@ void Sampling::forward_kernel_wrapper(SamplingMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } handle_unimplemented_hip_kernel(OP_RMS_NORM); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); } } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 450f7c009a..2d72151035 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -56,7 +56,7 @@ Tensor FFModel::softmax(const Tensor _input, int dim, DataType data_type, char const *name) { - if (data_type = DT_NONE) { + if (data_type == DT_NONE) { data_type = _input->data_type; } Layer *sm = new Layer(this, diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 09198c5751..8092ed8bd7 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -37,20 +37,20 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } handle_unimplemented_hip_kernel(OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, diff --git a/src/ops/topk.cpp b/src/ops/topk.cpp index 4bb32192ef..b6e898b654 100644 --- a/src/ops/topk.cpp +++ b/src/ops/topk.cpp @@ -421,9 +421,9 @@ void TopK::forward_kernel_wrapper(TopKMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } TopK::forward_kernel(m, @@ -437,12 +437,12 @@ void TopK::forward_kernel_wrapper(TopKMeta const *m, stream); if (m->profiling) { - hipEventRecord(t_end, stream); + 
checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); } } @@ -496,9 +496,9 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } TopK::backward_kernel(m, diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 875f38c77a..f10c9a8f0f 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -343,7 +343,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( // dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, dims[2].size); + data_type, quantization_type, dims[1].size); } // dims[2].degree = 1; // dims[2].parallel_idx = -1; @@ -454,7 +454,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( // dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, dims[2].size); + data_type, quantization_type, dims[1].size); } int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index a20077efb4..dc68a54bf2 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -37,20 +37,20 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( hipEvent_t t_start, t_end; if (m->profiling) { - hipEventCreate(&t_start); - hipEventCreate(&t_end); - hipEventRecord(t_start, stream); + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); } handle_unimplemented_hip_kernel(OP_TREE_INC_MULTIHEAD_SELF_ATTENTION); if (m->profiling) { - hipEventRecord(t_end, stream); + checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); float elapsed = 0; checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); printf("TreeIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 62ab947f8f..f36dcb2922 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -107,7 +107,7 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { std::vector machine_views; for (int j = 0; j < model->config.data_parallelism_degree; j++) { MachineView mv; - mv.device_type == MachineView::GPU; + mv.device_type = MachineView::GPU; mv.ndims = 1; // mv.start_device_id = 0; mv.stride[0] = 1; diff --git a/src/runtime/initializer_kernel.cpp b/src/runtime/initializer_kernel.cpp index 6a0ebe3ba9..1005d93cec 100644 --- a/src/runtime/initializer_kernel.cpp +++ 
b/src/runtime/initializer_kernel.cpp @@ -19,7 +19,7 @@ #include "flexflow/utils/hip_helper.h" #include #include -#include +#include #include namespace FlexFlow { diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index d6ae0ec948..6c482426eb 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -54,7 +54,8 @@ void Op::inner_measure_operator_cost(Simulator *sim, checkCUDA(hipEventRecord(sim->end_event, stream)); checkCUDA(hipEventSynchronize(sim->end_event)); float milliseconds; - hipEventElapsedTime(&milliseconds, sim->start_event, sim->end_event); + checkCUDA( + hipEventElapsedTime(&milliseconds, sim->start_event, sim->end_event)); cost_metrics.forward_time = milliseconds / sim->repeat_times; // measure backward time @@ -68,7 +69,8 @@ void Op::inner_measure_operator_cost(Simulator *sim, } checkCUDA(hipEventRecord(sim->end_event, stream)); checkCUDA(hipEventSynchronize(sim->end_event)); - hipEventElapsedTime(&milliseconds, sim->start_event, sim->end_event); + checkCUDA( + hipEventElapsedTime(&milliseconds, sim->start_event, sim->end_event)); cost_metrics.backward_time = milliseconds / sim->repeat_times; } else { cost_metrics.backward_time = 0.0f; diff --git a/src/runtime/simulator.cpp b/src/runtime/simulator.cpp index e10923cd8d..0daf151d2c 100644 --- a/src/runtime/simulator.cpp +++ b/src/runtime/simulator.cpp @@ -80,8 +80,8 @@ Simulator::Simulator(FFModel const *model, size_t max_num_tasks = 1024 * 1024; - hipEventCreate(&start_event); - hipEventCreate(&end_event); + checkCUDA(hipEventCreate(&start_event)); + checkCUDA(hipEventCreate(&end_event)); conv2d_meta = new Conv2DMeta(handler); // linear_meta = new LinearMeta(handler, 4096); pool2d_meta = new Pool2DMeta(handler); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 3a25d99b6f..5071b5dd66 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -1488,6 +1488,8 @@ OpX *create_opx(sl::Operator const &op, case OP_REPLICATE: degree_key = PM_REPLICATE_DEGREE; break; + default: + break; } if (degree_key.has_value()) { @@ -1510,6 +1512,8 @@ OpX *create_opx(sl::Operator const &op, case OP_REPLICATE: dim_key = PM_REPLICATE_DIM; break; + default: + break; } if (dim_key.has_value()) { From 3af422dee4fe47767fc601084883b818740d7182 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Sat, 2 Sep 2023 19:03:02 -0400 Subject: [PATCH 218/344] support AMD in inference branch (#996) * init * fix build issue. * fix * fix * rccl * fix fuse * fix fuse * fix specinfer * argmax * . 
* format * fix * remove print * minor fix * fix * fix --------- Co-authored-by: Gabriele Oliaro --- CMakeLists.txt | 17 +- cmake/hip.cmake | 1 + config/config.linux | 2 +- include/flexflow/config.h | 4 + include/flexflow/machine_view.h | 4 + .../ops/inc_multihead_self_attention.h | 7 + .../inc_multihead_self_attention_kernels.h | 21 +- include/flexflow/ops/sampling.h | 5 + include/flexflow/utils/hip_helper.h | 10 + inference/models/falcon.cc | 4 +- inference/models/starcoder.cc | 2 + src/ops/argmax.cpp | 448 ++++++- src/ops/beam_topk.cpp | 53 +- src/ops/fused.cpp | 104 +- src/ops/inc_multihead_self_attention.cpp | 1043 ++++++++++++++++- src/ops/inc_multihead_self_attention.cu | 10 - src/ops/kernels/decompress_kernels.cpp | 90 ++ src/ops/kernels/rms_norm_kernels.cpp | 163 ++- src/ops/kernels/softmax.cpp | 3 +- src/ops/sampling.cpp | 206 +++- src/ops/spec_inc_multihead_self_attention.cpp | 631 +++++++++- src/ops/tree_inc_multihead_self_attention.cpp | 633 +++++++++- .../kernels/allreduce_kernels.cpp | 29 +- src/runtime/hip_helper.cpp | 80 +- src/runtime/optimizer_kernel.cpp | 2 + src/runtime/request_manager.cpp | 31 +- 26 files changed, 3501 insertions(+), 102 deletions(-) create mode 100644 src/ops/kernels/decompress_kernels.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 59eb27df2d..ef571dc59c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.10) project(FlexFlow) + include(ExternalProject) # Set policy CMP0074 to eliminate cmake warnings @@ -172,10 +173,6 @@ endif() # option for nccl option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF) -if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_USE_NCCL STREQUAL "ON") - message(FATAL_ERROR "NCCL: ON for FF_GPU_BACKEND: hip_rocm. hip_rocm backend must have NCCL disabled.") -endif() - # option for avx2 option(FF_USE_AVX2 "Run FlexFlow with AVX2" OFF) @@ -240,7 +237,9 @@ endif() # NCCL if(FF_USE_NCCL) - include(nccl) + if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") + include(nccl) + endif() list(APPEND FF_CC_FLAGS -DFF_USE_NCCL) list(APPEND FF_NVCC_FLAGS @@ -399,6 +398,9 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") find_package(hipblas REQUIRED) find_package(miopen REQUIRED) + if(FF_USE_NCCL) + find_package(rccl REQUIRED) + endif() # find_package(rocrand REQUIRED) find_library(HIP_RAND_LIBRARY hiprand REQUIRED) @@ -429,12 +431,15 @@ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") # Docs (outdated): # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY}) + if(FF_USE_NCCL) + target_link_libraries(flexflow rccl) + endif() endif() else() message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}") endif() -if(FF_USE_NCCL) +if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")) add_dependencies(flexflow ${NCCL_NAME}) endif() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index b32d68d608..abcc82b03a 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -8,4 +8,5 @@ endif() message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}") if(FF_GPU_BACKEND STREQUAL "hip_rocm") set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE) + set(GPU_TARGETS "${FF_HIP_ARCH}" CACHE STRING "The GPU TARGETs") endif() diff --git a/config/config.linux b/config/config.linux index 
e9c580220b..3686237538 100755 --- a/config/config.linux +++ b/config/config.linux @@ -88,7 +88,7 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid." exit 1 -elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then +elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then # enable NCCL FF_USE_NCCL=${FF_USE_NCCL:-ON} else diff --git a/include/flexflow/config.h b/include/flexflow/config.h index a55b13ad6d..2479358bfb 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -29,7 +29,11 @@ #endif #include "tl/optional.hpp" #ifdef FF_USE_NCCL +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include +#else +#include +#endif #endif namespace FlexFlow { diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h index 8843dc4d6a..719792c10c 100644 --- a/include/flexflow/machine_view.h +++ b/include/flexflow/machine_view.h @@ -4,7 +4,11 @@ #include "legion.h" #include #ifdef FF_USE_NCCL +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include +#else +#include +#endif #endif #include "flexflow/config.h" diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 91621074b3..e48a8d4240 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -14,6 +14,9 @@ #include "math.h" #include #include +#if defined(FF_USE_HIP_ROCM) +#include +#endif namespace FlexFlow { @@ -191,6 +194,10 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t qk_tensor; cuFloatComplex *complex_input; +#elif defined(FF_USE_HIP_ROCM) + miopenTensorDescriptor_t qk_tensor; + // typedef hipFloatComplex attFloatComplex; + hipFloatComplex *complex_input; #endif }; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 6b294bc211..f578249045 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -31,6 +31,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, bool scaling_query, float scaling_factor); +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) template __global__ void apply_rotary_embedding(DT *input_ptr, @@ -45,6 +46,22 @@ __global__ void int k_block_size, int q_array_size, bool q_tensor); +#elif defined(FF_USE_HIP_ROCM) +template +__global__ void + apply_rotary_embedding(DT *input_ptr, + hipFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_heads, + int num_tokens, + int num_kv_heads, + int q_block_size, + int k_block_size, + int q_array_size, + bool q_tensor); +#endif template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, @@ -54,13 +71,13 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, - cudaStream_t stream); + ffStream_t stream); template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, DataType data_type, - cudaStream_t stream); + ffStream_t stream); } // namespace IncMultiHeadAttention } // namespace Kernels } // namespace FlexFlow diff 
--git a/include/flexflow/ops/sampling.h b/include/flexflow/ops/sampling.h index 789904df32..d690888a39 100644 --- a/include/flexflow/ops/sampling.h +++ b/include/flexflow/ops/sampling.h @@ -8,6 +8,9 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include #include +#elif defined(FF_USE_HIP_ROCM) +#include +#include #endif #include "flexflow/utils/memory_allocator.h" @@ -26,6 +29,8 @@ class SamplingMeta : public OpMeta { Realm::RegionInstance reserveInst; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) curandState *state; +#elif defined(FF_USE_HIP_ROCM) + hiprandState *state; #endif SamplingMeta(FFHandler handle, Op const *op, diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 374ea30064..7f6403c767 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -5,6 +5,9 @@ #include "legion.h" #include #include +#ifdef FF_USE_NCCL +#include +#endif #define FatalError(s) \ do { \ @@ -145,9 +148,16 @@ miopenStatus_t Legion::Domain domain, DataType data_type = DT_FLOAT); +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Legion::Domain domain); + hipblasDatatype_t ff_to_cuda_datatype(DataType type); miopenDataType_t ff_to_cudnn_datatype(DataType type); +#ifdef FF_USE_NCCL +ncclDataType_t ff_to_nccl_datatype(DataType type); +#endif void handle_unimplemented_hip_kernel(OperatorType op_type); #endif diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index d57504b8cf..a26a6eaf4b 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -183,9 +183,9 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor output; if (mode == BEAM_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); - output = ff.beam_top_k(softmax, falcon_config.max_beam_width, false); + output = ff.argmax(softmax, /*beam_Search*/ true); } else { - output = ff.arg_top_k(lm_head, /*k=*/1, false); + output = ff.argmax(lm_head, /*beam_Search*/ false); } // Compile the model diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 4b27498cfd..d32f5e9430 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -180,6 +180,7 @@ void STARCODER::create_starcoder_model( Tensor output; if (mode == BEAM_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); + // output = ff.beam_top_k(softmax, startcoder_config.max_beam_width, false); output = ff.argmax(softmax, /*beam_Search*/ true); } else { // Tensor softmax = ff.softmax(dense, -1); @@ -188,6 +189,7 @@ void STARCODER::create_starcoder_model( Tensor softmax = ff.softmax(lm_head, -1); output = ff.sampling(softmax, generationConfig.topp); } else { + // output = ff.arg_top_k(lm_head, /*k=*/1, false); output = ff.argmax(lm_head, /*beam_Search*/ false); } } diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 17b8c9ad16..ec5ea6c36a 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -17,19 +17,415 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include +#include namespace FlexFlow { +using Legion::coord_t; + +enum class HeapType { kMinHeap, kMaxHeap }; +enum class PreferIndices { kLower, kHigher }; + +template +struct Entry { + int index; + T value; +}; + +template +struct LinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return data[i].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + 
Entry *const data; +}; + +template +struct IndirectLinearData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index]; + } + + __device__ int get_index(int i) const { + return backing_data[data[i].index].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + + Entry *const data; + Entry *const backing_data; +}; + +template +struct StridedData { + typedef Entry Entry; + + __device__ Entry &operator[](std::size_t index) const { + return data[index * blockDim.x + threadIdx.x]; + } + + __device__ int get_index(int i) const { + return (*this)[i].index; + } + __device__ T get_value(int i) const { + return (*this)[i].value; + } + + Entry *const data; +}; + +// A heap of Entry that can either work as a min-heap or as a max-heap. +template + class Data, + typename T> +struct IndexedHeap { + typedef typename Data::Entry Entry; + Data const data; + __device__ IndexedHeap(Data const &d) : data(d) {} + + __device__ bool is_above(int left, int right) { + T left_value = data.get_value(left); + T right_value = data.get_value(right); + if (left_value == right_value) { + if (preferIndices == PreferIndices::kLower) { + return data.get_index(left) < data.get_index(right); + } else { + return data.get_index(left) > data.get_index(right); + } + } + if (heapType == HeapType::kMinHeap) { + return left_value < right_value; + } else { + return left_value > right_value; + } + } + + __device__ void assign(int i, Entry const &entry) { + data[i] = entry; + } + + __device__ void push_up(int i) { + int child = i; + int parent; + for (; child > 0; child = parent) { + parent = (child - 1) / 2; + if (!is_above(child, parent)) { + // Heap property satisfied. + break; + } + swap(child, parent); + } + } + + __device__ void swap(int a, int b) { + auto tmp = data[b]; + data[b] = data[a]; + data[a] = tmp; + } + + __device__ void push_root_down(int k) { + push_down(0, k); + } + + // MAX-HEAPIFY in Cormen + __device__ void push_down(int node, int k) { + while (true) { + int const left = 2 * node + 1; + int const right = left + 1; + int smallest = node; + if (left < k && is_above(left, smallest)) { + smallest = left; + } + if (right < k && is_above(right, smallest)) { + smallest = right; + } + if (smallest == node) { + break; + } + swap(smallest, node); + node = smallest; + } + } + + // BUILD-MAX-HEAPIFY in Cormen + __device__ void build(int k) { + for (int node = (k - 1) / 2; node >= 0; node--) { + push_down(node, k); + } + } + + // HEAP-EXTRACT-MAX in Cormen + __device__ void remove_root(int k) { + data[0] = data[k - 1]; + push_root_down(k - 1); + } + + // in-place HEAPSORT in Cormen + // This method destroys the heap property. + __device__ void sort(int k) { + for (int slot = k - 1; slot > 0; slot--) { + // This is like remove_root but we insert the element at the end. + swap(slot, 0); + // Heap is now an element smaller. + push_root_down(/*k=*/slot); + } + } + + __device__ void replace_root(Entry const &entry, int k) { + data[0] = entry; + push_root_down(k); + } + + __device__ Entry const &root() { + return data[0]; + } +}; + +template + class Data, + typename T> +__device__ IndexedHeap + make_indexed_heap(typename Data::Entry *data) { + return IndexedHeap{Data{data}}; +} + +// heapArgTopK walks over [input, input+length) with `step_size` stride starting +// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries` +// using `Accessor` to access elements in `heap_entries`. If sorted=true, the +// elements will be sorted at the end. 
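// As a concrete illustration of the striding (example numbers only): with
// length = 10, k = 2 and step_size = 4 (one shard per thread), shard 0 visits
// indices {0, 4, 8}, shard 1 visits {1, 5, 9}, shard 2 visits {2, 6}, and
// shard 3 visits {3, 7}. Each shard keeps a k-element min-heap of the largest
// values it has seen, replacing the heap root whenever a larger value
// arrives, and mergeShards below then combines the per-shard candidates into
// the overall top k, preferring lower indices on ties.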
+template class Data = LinearData> +__device__ void heapArgTopK(T const *__restrict__ input, + int length, + int k, + Entry *__restrict__ heap_entries, + bool sorted = false, + int start_index = 0, + int step_size = 1) { + assert(k <= length); + + auto heap = + make_indexed_heap( + heap_entries); + + int heap_end_index = start_index + k * step_size; + if (heap_end_index > length) { + heap_end_index = length; + } + // Initialize the min-heap. + for (int index = start_index, slot = 0; index < heap_end_index; + index += step_size, slot++) { + heap.assign(slot, {index, input[index]}); + } + + heap.build(k); + + // Now iterate over the remaining items. + // If an item is smaller than the min element, it is not amongst the top k. + // Otherwise, replace the min element with it and push upwards. + for (int index = heap_end_index; index < length; index += step_size) { + // We prefer elements with lower indices. This is given here. + // Later elements automatically have higher indices, so can be discarded. + if (input[index] > heap.root().value) { + // This element should replace the min. + heap.replace_root({index, input[index]}, k); + } + } + + // Sort if wanted. + if (sorted) { + heap.sort(k); + } +} + +// mergeShards performs a top-k merge on `num_shards` many sorted streams that +// are sorted and stored in `entries` in a strided way: +// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|... +// The overall top k elements are written to `top_k_values` and their indices +// to top_k_indices. +// `top_k_heap` is used as temporary storage for the merge heap. +template +__device__ void mergeShards(int num_shards, + int k, + Entry *__restrict__ entries, + Entry *__restrict__ top_k_heap, + float *top_k_values, + int *top_k_indices) { + // If k < num_shards, we can use a min-heap with k elements to get the top k + // of the sorted blocks. + // If k > num_shards, we can initialize a min-heap with the top element from + // each sorted block. + int const heap_size = k < num_shards ? k : num_shards; + + // Min-heap part. + { + auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Initialize the heap as a min-heap. + for (int slot = 0; slot < heap_size; slot++) { + min_heap.assign(slot, {slot, entries[slot].value}); + } + min_heap.build(heap_size); + + // Now perform top k with the remaining shards (if num_shards > heap_size). + for (int shard = heap_size; shard < num_shards; shard++) { + auto const entry = entries[shard]; + auto const root = min_heap.root(); + if (entry.value < root.value) { + continue; + } + if (entry.value == root.value && + entry.index > entries[root.index].index) { + continue; + } + // This element should replace the min. + min_heap.replace_root({shard, entry.value}, heap_size); + } + } + + // Max-part. + { + // Turn the min-heap into a max-heap in-place. + auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Heapify into a max heap. + max_heap.build(heap_size); + + // Now extract the minimum k-1 times. + // k is treated specially. + int const last_k = k - 1; + for (int rank = 0; rank < last_k; rank++) { + Entry const &max_element = max_heap.root(); + top_k_values[rank] = __half2float(max_element.value); + int shard_index = max_element.index; + top_k_indices[rank] = entries[shard_index].index; + int next_shard_index = shard_index + num_shards; + // For rank < k-1, each top k heap still contains at least 1 element, + // so we can draw a replacement. 
+ max_heap.replace_root({next_shard_index, entries[next_shard_index].value}, + heap_size); + } + + // rank == last_k. + Entry const &max_element = max_heap.root(); + top_k_values[last_k] = __half2float(max_element.value); + int shard_index = max_element.index; + top_k_indices[last_k] = entries[shard_index].index; + } +} + +template +__global__ void argmax_forward_kernel(T const *__restrict__ input, + size_t shared_memory_size, + int length, + int k, + float *__restrict__ output, + int *__restrict__ indices) { + __shared__ char shared_memory[48 << 10]; + int const batch_index = blockIdx.x; + T const *batch_input = input + batch_index * length; + int const thread_index = threadIdx.x; + int const thread_count = blockDim.x; + Entry *shared_entries = (Entry *)shared_memory; + heapArgTopK( + batch_input, length, k, shared_entries, true, thread_index, thread_count); + __syncthreads(); + if (thread_index == 0) { + int const offset = batch_index * k; + auto batch_output = output + offset; + auto batch_indices = indices + offset; + Entry *top_k_heap = shared_entries + thread_count * k; + mergeShards(thread_count, + k, + shared_entries, + top_k_heap, + batch_output, + batch_indices); + } +} + +template +__global__ void copy_result(hipcub::KeyValuePair *d_out, + int *indices, + float *prob_ptr, + int batch_size, + bool beam_search) { + CUDA_KERNEL_LOOP(i, batch_size) { + indices[i] = d_out[i].key; + if (beam_search) { + prob_ptr[i] = static_cast(d_out[i].value); + } + } +} + /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, DT *input_ptr, int *indices_ptr, float *prob_ptr, - int *parent_ptr, - int length, - int batch_size, - ffStream_t stream) {} + int *parent, + int const length, + int const batch_size, + hipStream_t stream) { + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + if (m->beam_search) { + // set all parents id zero in arg top1 case. + checkCUDA(hipMemset(parent, 0, batch_size * sizeof(int))); + } + int num_shards = 0; + int k = 1; + { + constexpr auto shared_memory_size = 48 << 10; + auto const heap_size = k * sizeof(Entry
); + // shared_memory_size = (num_shards + 1) * heap_size <=> + num_shards = shared_memory_size / heap_size - 1; + assert(num_shards > 0); + if (num_shards > CUDA_NUM_THREADS) { + num_shards = CUDA_NUM_THREADS; + } + } + // We are limited by the amount of shared memory we have per block. + size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); + // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; + size_t num_blocks = batch_size; + assert(num_shards >= (size_t)k); + num_shards = k; + + hipLaunchKernelGGL(argmax_forward_kernel, + num_blocks, + num_shards, + 0, + stream, + input_ptr, + shared_memory_size, + length, + k, + prob_ptr, + indices_ptr); +} /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, @@ -47,7 +443,32 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, checkCUDA(hipEventRecord(t_start, stream)); } - handle_unimplemented_hip_kernel(OP_RMS_NORM); + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + + if (input.data_type == DT_HALF) { + ArgMax::forward_kernel(m, + input.get_half_ptr(), + indices.get_int32_ptr(), + m->probs, + m->beam_search ? parent.get_int32_ptr() + : nullptr, + length, + batch_size, + stream); + + } else if (input.data_type == DT_FLOAT) { + ArgMax::forward_kernel(m, + input.get_float_ptr(), + indices.get_int32_ptr(), + m->probs, + m->beam_search ? parent.get_int32_ptr() + : nullptr, + length, + batch_size, + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); @@ -67,8 +488,17 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, int batch_size, int total_ele, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler, op) {} - -ArgMaxMeta::~ArgMaxMeta(void) {} - + : OpMeta(handler, op) { + DataType data_type = op->data_type; + size_t prob_size = batch_size; + assert(data_type == DT_FLOAT || data_type == DT_HALF); + size_t total_size = prob_size * sizeof(float); + gpu_mem_allocator.create_legion_instance(reserveInst, total_size); + probs = gpu_mem_allocator.allocate_instance(prob_size); +} +ArgMaxMeta::~ArgMaxMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} }; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 134b0c6abe..5ee260714d 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -683,23 +683,42 @@ BeamTopKMeta::BeamTopKMeta(FFHandler handler, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler) { DataType data_type = op->inputs[0]->data_type; - checkCUDA(hipMalloc(&parent_ids, - sizeof(int) * BeamSearchBatchConfig::MAX_BEAM_WIDTH * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); - checkCUDA(hipMalloc(&acc_probs, - sizeof(data_type_size(data_type)) * - BeamSearchBatchConfig::MAX_BEAM_WIDTH * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); - checkCUDA(hipMalloc(&block_start_index, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); - checkCUDA(hipMalloc(&request_id, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); - checkCUDA(hipMalloc(&tokens_per_request, - sizeof(int) * BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS)); + size_t parent_id_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t acc_probs_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t block_start_index_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t request_id_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t tokens_per_request_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t totalSize = sizeof(int) * parent_id_size + 
+ data_type_size(data_type) * acc_probs_size + + sizeof(int) * block_start_index_size + + sizeof(int) * request_id_size + + sizeof(int) * tokens_per_request_size; + + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + parent_ids = gpu_mem_allocator.allocate_instance(parent_id_size); + if (data_type == DT_FLOAT) { + acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); + } else if (data_type == DT_HALF) { + acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); + } else { + assert(false); + } + + block_start_index = + gpu_mem_allocator.allocate_instance(block_start_index_size); + request_id = gpu_mem_allocator.allocate_instance(request_id_size); + tokens_per_request = + gpu_mem_allocator.allocate_instance(tokens_per_request_size); } -BeamTopKMeta::~BeamTopKMeta(void) {} +BeamTopKMeta::~BeamTopKMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} }; // namespace FlexFlow diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index c717881e66..99c5bc2631 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -31,6 +31,7 @@ #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/rms_norm_kernels.h" +#include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -45,6 +46,7 @@ namespace FlexFlow { using Legion::Context; using Legion::coord_t; using Legion::Domain; +using Legion::Future; using Legion::LogicalPartition; using Legion::LogicalRegion; using Legion::PhysicalRegion; @@ -225,13 +227,16 @@ __host__ void FusedOp::forward_task(Task const *task, out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); float const *bias_ptr = nullptr; + LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); + if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { + bias_ptr = my_weight_accessor[1].get_float_ptr(); + } bias_ptr = my_weight_accessor[1].get_float_ptr(); } else { assert(fused->op_num_weights[op] == 1); } - LinearMeta *m = (LinearMeta *)metas->meta[op]; Kernels::Linear::forward_kernel_wrapper( m, my_input_accessor[0].get_float_ptr(), @@ -298,8 +303,8 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[1], my_output_accessor[0]); break; - break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: @@ -339,6 +344,26 @@ __host__ void FusedOp::forward_task(Task const *task, my_input_accessor[0].domain.get_volume()); break; } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; + } case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -395,7 +420,10 @@ __host__ void // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const 
*metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; - BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == @@ -516,13 +544,15 @@ __host__ void out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); void const *bias_ptr = nullptr; + LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); - bias_ptr = my_weight_accessor[1].ptr; + if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { + bias_ptr = my_weight_accessor[1].ptr; + } } else { assert(fused->op_num_weights[op] == 1); } - LinearMeta *m = (LinearMeta *)metas->meta[op]; assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->input_type[0] == my_output_accessor[0].data_type); batch_size = bc->num_active_tokens(); @@ -591,7 +621,6 @@ __host__ void my_input_accessor[1], my_output_accessor[0]); break; - break; } case OP_EMBEDDING: { assert(fused->op_num_inputs[op] == 1); @@ -658,20 +687,32 @@ __host__ void effective_batch_size); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } case OP_RMS_NORM: { @@ -711,8 +752,8 @@ __host__ void assert(fused->op_num_outputs[op] == 1); TreeIncMultiHeadSelfAttentionMeta *m = (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - TreeVerifyBatchConfig const *tree_bc = - (TreeVerifyBatchConfig *)task->args; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); GenericTensorAccessorR biases; if (*m->bias) { @@ -721,7 +762,7 @@ __host__ void } TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - tree_bc, + &tree_bc, task->index_point.point_data[0], my_input_accessor[0], my_weight_accessor[0], @@ -734,8 +775,10 @@ __host__ void assert(fused->op_num_outputs[op] == 1); SpecIncMultiHeadSelfAttentionMeta const *m = (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - BeamSearchBatchConfig const *beam_bc = - (BeamSearchBatchConfig *)task->args; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); GenericTensorAccessorR biases; if 
(*m->bias) { @@ -744,7 +787,7 @@ __host__ void } SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - beam_bc, + &beam_bc, task->index_point.point_data[0], my_input_accessor[0], my_weight_accessor[0], @@ -766,12 +809,32 @@ __host__ void m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; + } case OP_ALLREDUCE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::forward_kernel_wrapper( - m, my_input_accessor[0], my_output_accessor[0]); + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { @@ -1103,6 +1166,7 @@ __host__ void FusedOp::backward_task(Task const *task, my_input_grad_accessor[1].get_float_ptr()); break; } + case OP_GELU: case OP_RELU: case OP_SIGMOID: case OP_TANH: diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 99f8f49abd..b4102a7dba 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -14,7 +14,11 @@ */ #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/utils/hip_helper.h" +#include #include namespace FlexFlow { @@ -23,6 +27,784 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +namespace Kernels { +namespace IncMultiHeadAttention { + +template +__global__ void apply_proj_bias_w(DT *input_ptr, + DT const *bias_ptr, + int num_tokens, + int qkv_weight_size, + int oProjSize) { + CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { + int bias_idx = qkv_weight_size + i % oProjSize; + input_ptr[i] += bias_ptr[bias_idx]; + } +} + +template +__global__ void apply_proj_bias_qkv(DT *input_ptr, + DT const *bias_ptr, + int shard_id, + int num_tokens, + int qProjSize, + int kProjSize, + int vProjSize, + int global_num_q_heads, + int global_num_kv_heads, + int num_q_heads, + int num_kv_heads, + bool scaling_query, + float scaling_factor) { + CUDA_KERNEL_LOOP(i, + num_tokens * + (qProjSize * num_q_heads + kProjSize * num_kv_heads + + vProjSize * num_kv_heads)) { + // for simplicity, assume q, k, v is in same shape + // 0->q, 1->k, 2->v + // int qkv_index = i / (num_tokens * qProjSize) % 3; + + int qkv_index = i < num_tokens * qProjSize * num_q_heads + ? 0 + : (i < num_tokens * (qProjSize * num_q_heads + + kProjSize * num_kv_heads) + ? 
1 + : 2); + + int q_block_size = qProjSize * num_tokens * num_q_heads; + int k_block_size = kProjSize * num_tokens * num_kv_heads; + + int bias_idx = 0; + if (qkv_index == 0) { + int head_idx = i / (num_tokens * qProjSize); + int global_head_idx = head_idx + shard_id * num_q_heads; + int global_i = i + shard_id * num_q_heads * num_tokens * qProjSize; + bias_idx = global_head_idx * qProjSize + + (global_i % (num_tokens * (qProjSize)) % qProjSize); + } else { + + int idx = + qkv_index == 1 ? i - q_block_size : i - q_block_size - k_block_size; + int pre_length = qkv_index == 1 ? qProjSize * global_num_q_heads + : qProjSize * global_num_q_heads + + kProjSize * global_num_kv_heads; + + int head_idx = idx / (num_tokens * kProjSize); + int global_head_idx = head_idx + shard_id * num_kv_heads; + int global_idx = idx + shard_id * num_tokens * num_kv_heads * kProjSize; + + bias_idx = pre_length + global_head_idx * kProjSize + + (global_idx % (num_tokens * (qProjSize)) % qProjSize); + } + + input_ptr[i] += bias_ptr[bias_idx]; + + if (scaling_query && qkv_index == 0) { + input_ptr[i] *= scaling_factor; + } + } +} + +template +__global__ void + apply_rotary_embedding_native(DT *input_ptr, + hipFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_q_heads, + int num_tokens, + int num_kv_heads, + int q_block_size, + int k_block_size, + int q_array_size) { + CUDA_KERNEL_LOOP( + i, + num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { + // create complex number + bool q_tensor = i < (q_array_size / 2); + int proj_size = q_tensor ? qProjSize : kProjSize; + int real_i = q_tensor ? i : i - q_array_size / 2; + + int head_idx = real_i / (num_tokens * proj_size / 2); + int idx = real_i % (num_tokens * proj_size / 2); + int real_part_index = idx * 2 + + head_idx * (q_tensor ? q_block_size : k_block_size) + + (q_tensor ? 0 : q_array_size); + + int complex_part_index = real_part_index + 1; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + int token_idx = + (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + int pos_i = real_i % (proj_size / 2); + float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = hipCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void + apply_rotary_embedding_hf(DT *input_ptr, + hipFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int num_q_heads, + int num_tokens, + int num_kv_heads, + int q_block_size, + int k_block_size, + int q_array_size) { + CUDA_KERNEL_LOOP( + i, + num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { + // create complex number + bool q_tensor = i < (q_array_size / 2); + int proj_size = q_tensor ? qProjSize : kProjSize; + int real_i = q_tensor ? i : i - q_array_size / 2; + + int head_idx = real_i / (num_tokens * proj_size / 2); + int idx = real_i % (num_tokens * proj_size / 2); + int token_idx = + (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + + int real_part_index = idx + token_idx * (proj_size / 2) + + head_idx * (q_tensor ? q_block_size : k_block_size) + + (q_tensor ? 
0 : q_array_size); + int complex_part_index = real_part_index + (proj_size / 2); + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + + // size_t pos = id_map[token_idx].token_position; + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + int pos_i = real_i % (proj_size / 2); + float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = hipCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int num_q_heads, + int num_kv_heads, + int max_seq_len) { + CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { + int q_array_size = qProjSize * num_tokens * num_q_heads; + int k_array_size = kProjSize * num_tokens * num_kv_heads; + + bool k_cache = i < k_array_size; + int real_i = k_cache ? i : i - k_array_size; + + int proj_size = k_cache ? kProjSize : vProjSize; + int head_idx = real_i / (num_tokens * proj_size); + int token_idx = (real_i - head_idx * (num_tokens * proj_size)) / proj_size; + int data_idx = real_i % proj_size; + + DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * num_tokens + + token_idx * proj_size + data_idx]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; + cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } +} + +template +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + hipStream_t stream) { + + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + assert(m->qSize == m->vSize && m->qSize == m->kSize); + hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to HIPBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = HIPBLAS_COMPUTE_16F; +#else + hipblasDatatype_t compute_type = hipblas_data_type; +#endif + // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) + // Weights: qSize x qProjSize x 3 x num_q_heads + // Input: qSize x num_tokens + // Output >>> qProjSize x num_tokens x 3 x num_q_heads + int m_q = m->qProjSize; + int m_k = m->kProjSize; + int m_v = m->vProjSize; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_tokens(); + int k = m->qSize; + int m_ = m_q; + int lda = k, ldb = k, ldc = m_q; + + size_t strideA = m_q * k; // query weight head size + size_t strideB = 0; // input stays the same for all heads. + size_t strideC = m_q * n; // size of the output block for each head. 
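// Illustrative note on the strided batched GEMM issued below (reading off the
// values set up above): with HIPBLAS_OP_T on the weight operand, each batch
// computes one projection head,
//   C (m_ x n) = W_head^T (qProjSize x qSize) * X (qSize x num_tokens),
// where every head's weight is a contiguous qSize x qProjSize block, hence
// strideA = m_q * k; the activation matrix X is shared by all heads, hence
// strideB = 0; and each head writes its own qProjSize x num_tokens slab of
// the output, hence strideC = m_q * n. The batch count
// num_q_heads + 2 * num_kv_heads covers the Q heads plus the K and V heads.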
+ + // compute QKV + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + hipblas_data_type, + lda, + strideA, + input_ptr, + hipblas_data_type, + ldb, + strideB, + &beta, + output_ptr, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads + m->num_kv_heads + + m->num_kv_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + + // apply rotary emmmbedding for q and k + // step1 change the k, v to complex tensor + int num_tokens = bc->num_active_tokens(); + int parallelism = m->kProjSize * num_tokens * m->num_q_heads; + int q_block_size = m->qProjSize * num_tokens; + int k_block_size = m->kProjSize * num_tokens; + int q_array_size = m->qProjSize * num_tokens * m->num_q_heads; + // apply bias for q, k, v + if (*m->bias) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + bias_ptr, + shard_id, + num_tokens, + m->qProjSize, + m->kProjSize, + m->vProjSize, + m->global_num_q_heads, + m->global_num_kv_heads, + m->num_q_heads, + m->num_kv_heads, + *m->scaling_query, + m->scaling_factor); + } + if (*m->apply_rotary_embedding) { + /*q&k*/ + parallelism = + num_tokens * + (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_kv_heads) / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf<DT>
), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + m->complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + m->num_q_heads, + num_tokens, + m->num_kv_heads, + q_block_size, + k_block_size, + q_array_size); + } +} + +template +void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + hipStream_t stream) { + int num_tokens = bc->num_active_tokens(); + if (num_tokens > 0) { + int parallelism = + (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache
<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast<DT *>(m->devQKVProjArray), + static_cast<DT *>(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_q_heads, + m->num_kv_heads, + BatchConfig::MAX_SEQ_LENGTH); + } +} + +template +void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + hipStream_t stream) { + // additional processing for weight uploading + // Note that we update weight_ptr and bias_ptr when uploading weight and + // bias + if (m->quantization_type != DT_NONE) { + // copy weight_ptr to quantized_weight_ptr, do compression and store in + // m->weight_ptr + hipMemcpyAsync(m->quantized_weight_ptr, + weight.get_byte_ptr(), + m->quantized_weightSize, + hipMemcpyHostToDevice, + stream); + + if (m->quantization_type == DT_INT4) { + int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; + decompress_int4_attention_weights<<>>( + m->quantized_weight_ptr, + static_cast
<DT *>(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = m->qProjSize * m->qSize * m->num_q_heads; + decompress_int8_attention_weights<DT><<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( + m->quantized_weight_ptr, + static_cast<DT *>
<DT *>(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); + } + } else { + if (data_type == DT_FLOAT) { + hipMemcpyAsync(m->weight_ptr, + weight.get_float_ptr(), + m->weightSize, + hipMemcpyHostToDevice, + stream); + } else if (data_type == DT_HALF) { + hipMemcpyAsync(m->weight_ptr, + weight.get_half_ptr(), + m->weightSize, + hipMemcpyHostToDevice, + stream); + } else { + assert(false); + } + } +} + +template <typename DT> +void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + hipStream_t stream) { + // here because we need position info in inference + + if (m->offload && m->biasSize > 0) { + hipMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream); + bias_ptr = static_cast<DT *>
(m->bias_ptr); + } + hipMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), + hipMemcpyHostToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast<DT *>
(m->devQKVProjArray), + bias_ptr, + stream); + + // phase 2: Update key/val cache + update_kv_cache_kernel<DT>
(m, bc, stream); + + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + compute_attention_kernel( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); +} + +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + +template +void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + hipStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + hipblasDatatype_t compute_type = hipblas_data_type; +#endif + // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int q_block_size = m->qProjSize * num_tokens; + int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_req_block_size = kt_block_size * m->num_kv_heads; + int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_req_block_size = vt_block_size * m->num_kv_heads; + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].token_start_offset + + bc->requestsInfo[i].num_tokens_in_batch; + // bc->token_last_available_idx[i] + 1; + // Compute (QK^T/sqrt(d_k)) + // a flag of using this scaling alpha + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k, ldb = k, ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast<DT *>(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize; + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size; + // To get C, skip over QK^T products from previous requests + DT *C = static_cast<DT *>
(m->qk_prods); + if (m->num_kv_heads == m->num_q_heads) { + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + + } else { + strideB = 0; + // use hipblasGemmStridedBatchedEx + int one_step_heads = m->num_q_heads / m->num_kv_heads; + m_ = num_new_tokens; + n = total_tokens; + k = m->qProjSize; + lda = k, ldb = k, ldc = m_; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A + step * strideA * one_step_heads, + hipblas_data_type, + lda, + strideA, + B + step * kt_block_size, + hipblas_data_type, + ldb, + strideB, + &beta, + C + step * strideC * one_step_heads, + hipblas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal
<DT>), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + entries_above_diagonal, + static_cast<DT>
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = num_new_tokens; + n = m->vProjSize; + k = total_tokens; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = vt_block_size; + strideC = num_new_tokens * m->vProjSize; + // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + A = C_softmax; + // To get B, skip over V^T entries from previous requests (all heads + + // padding) + B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + C = static_cast<DT *>
(m->attn_heads) + + tokens_previous_requests * m->num_q_heads * m->vProjSize; + + if (m->num_q_heads == m->num_kv_heads) { + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } else { + int one_step_heads = m->num_q_heads / m->num_kv_heads; + n = m->vProjSize; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = 0; + strideC = num_new_tokens * m->vProjSize; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A + step * one_step_heads * strideA, + hipblas_data_type, + lda, + strideA, + B + step * vt_block_size, + hipblas_data_type, + ldb, + strideB, + &beta, + C + step * one_step_heads * strideC, + hipblas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + } + // Project to output, save result directly on output tensor + alpha = 1.0f, beta = 0.0f; + m_ = m->oProjSize; + k = m->vProjSize * m->num_q_heads; + n = num_new_tokens; + lda = k, ldb = n, ldc = m_; + A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_kv_heads + + m->vProjSize * m->num_kv_heads); + B = C; + C = static_cast
<DT *>(output_ptr) + tokens_previous_requests * m->oProjSize; + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + B, + hipblas_data_type, + ldb, + &beta, + C, + hipblas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + tokens_previous_requests += num_new_tokens; + } + + if (*m->bias && shard_id == 0) { + int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_kv_heads + + m->vProjSize * m->global_num_kv_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w<DT>
), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + bias_ptr, + num_tokens, + qkv_weight_size, + m->oProjSize); + } + + assert(tokens_previous_requests == num_tokens); +} + /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta const *m, @@ -34,6 +816,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -42,7 +825,46 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - handle_unimplemented_hip_kernel(OP_INC_MULTIHEAD_SELF_ATTENTION); + // assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (use_bias) { + assert(input.data_type == bias.data_type); + } + + if (input.data_type == DT_HALF) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_half_ptr(), + m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); + } else if (input.data_type == DT_FLOAT) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_float_ptr(), + m->offload ? static_cast(m->weight_ptr) + : weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); @@ -66,11 +888,31 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int num_samples, int _num_q_heads, int _num_kv_heads) - : OpMeta(handler, attn) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkCUDNN(miopenSetStream(handler.dnn, stream)); -} + : IncMultiHeadSelfAttentionMeta(handler, + INC_DECODING_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->apply_rotary_embedding, + attn->bias, + attn->scaling_query, + attn->qk_prod_scaling, + attn->add_bias_kv, + attn->scaling_factor, + weight, + gpu_mem_allocator, + num_samples, + attn->num_q_heads, + attn->num_kv_heads, + _num_q_heads, + _num_kv_heads, + attn->quantization_type, + attn->offload) {} IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, @@ -102,6 +944,195 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); + checkCUDNN(miopenCreateTensorDescriptor(&qk_tensor)); + qSize = _qSize; + kSize = _kSize; + vSize = _vSize; + // assume dimensions match for now + assert(qSize == kSize); + assert(kSize == vSize); + qProjSize = _qProjSize; + kProjSize = _kProjSize; + assert(qProjSize == kProjSize); // required for attention QK^T matmul + vProjSize = _vProjSize; + oProjSize = _oProjSize; + size_t size_of_dt = data_type_size(attn->data_type); + quantization_type = _quantization_type; + offload = _offload; + + global_num_q_heads = _global_num_q_heads; + global_num_kv_heads = _global_num_kv_heads; + 
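// [Editor's note: illustrative sketch only, not part of the original patch.] The
// weightSize expression a few lines below packs, per Q head, one
// (qSize x qProjSize) query projection plus one (vProjSize x oProjSize) slice of
// the output projection, and per KV head one key and one value projection. With
// hypothetical LLaMA-7B-like shapes (qSize = kSize = vSize = oProjSize = 4096,
// qProjSize = kProjSize = vProjSize = 128, num_q_heads = num_kv_heads = 32,
// size_of_dt = 2 for half precision):
//
//   per Q head : 4096*128 + 4096*128 = 1,048,576 elements
//   per KV head: 4096*128 + 4096*128 = 1,048,576 elements
//   total      : 32 * 1,048,576 + 32 * 1,048,576 = 67,108,864 elements
//                -> 67,108,864 * 2 bytes = 128 MiB of attention weights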
num_q_heads = _num_q_heads; + num_kv_heads = _num_kv_heads; + + weightSize = + ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) * + num_q_heads + + (kSize * kProjSize + vSize * vProjSize) * num_kv_heads) * + size_of_dt; + if (quantization_type != DT_NONE) { + quantized_weightSize = get_quantization_to_byte_size( + attn->data_type, quantization_type, weightSize); + } + biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; + // has_load_weights = (bool *)calloc(1, sizeof(bool)); + //*has_load_weights = false; + apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); + *apply_rotary_embedding = _apply_rotary_embedding; + bias = (bool *)calloc(1, sizeof(bool)); + *bias = _bias; + scaling_query = (bool *)calloc(1, sizeof(bool)); + *scaling_query = _scaling_query; + scaling_factor = _scaling_factor; + qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); + *qk_prod_scaling = _qk_prod_scaling; + // Currently do not support adding bias to key/value projection + assert(!_add_bias_kv); + + // allocate weight and bias in the reserve space for cpu offloading + if (offload) { + weight_ptr = gpu_mem_allocator.allocate_reserved_untyped(weightSize); + bias_ptr = gpu_mem_allocator.allocate_reserved_untyped(biasSize); + } + + // allocate memory for the seqArray and reserve space + { + size_t qkv_max_proj_size = + BatchConfig::MAX_NUM_TOKENS * + (qProjSize * num_q_heads + kProjSize * num_kv_heads + + vProjSize * num_kv_heads); + size_t key_cache_size = 0, value_cache_size = 0; + switch (infer_mode) { + case INC_DECODING_MODE: + case TREE_VERIFY_MODE: { + key_cache_size = num_kv_heads * kProjSize * + BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH; + value_cache_size = num_kv_heads * vProjSize * + BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH; + break; + } + case BEAM_SEARCH_MODE: { + key_cache_size = + num_kv_heads * kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + value_cache_size = + num_kv_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + break; + } + default: + assert(false && "Unkown inference mode"); + } + size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; + size_t qk_prod_size = + BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_q_heads; + size_t attn_heads_size = + BatchConfig::MAX_NUM_TOKENS * num_q_heads * vProjSize; + size_t W_out_block_size = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + size_t W_out_contiguous_size = W_out_block_size * num_q_heads; + size_t complex_size = + (BatchConfig::MAX_NUM_TOKENS * + (qProjSize * num_q_heads + kProjSize * num_kv_heads)) / + 2; + size_t totalSize = + (qkv_max_proj_size + key_cache_size + value_cache_size + + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * + size_of_dt + + tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + + complex_size * sizeof(hipFloatComplex); // more components will + // be added here later + if (offload) { + // assert that we have enough reserved work space left + size_t totalSharedSize = + infer_mode == TREE_VERIFY_MODE + ? totalSize - + (key_cache_size + value_cache_size + qkv_max_proj_size) * + size_of_dt + : totalSize - (key_cache_size + value_cache_size) * size_of_dt; + + size_t instance_size = + size_of_dt * + (infer_mode == TREE_VERIFY_MODE + ? 
key_cache_size + value_cache_size + qkv_max_proj_size + : key_cache_size + value_cache_size); + + if (quantization_type != DT_NONE) { + totalSharedSize += quantized_weightSize; + } + assert(gpu_mem_allocator.reserved_total_size - + gpu_mem_allocator.reserved_allocated_size >= + totalSharedSize); + gpu_mem_allocator.create_legion_instance(reserveInst, instance_size); + } else { + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + } + + // in tree_verify, enable devQKVProjArray; + if (!offload || infer_mode == TREE_VERIFY_MODE) { + devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped( + qkv_max_proj_size * size_of_dt); + } else { + devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped( + qkv_max_proj_size * size_of_dt); + // offset += qkv_max_proj_size * size_of_dt; + } + + // use key value cache in all mode. + keyCache = gpu_mem_allocator.allocate_instance_untyped(key_cache_size * + size_of_dt); + valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * + size_of_dt); + + if (offload) { + token_infos = + gpu_mem_allocator.allocate_reserved( + tokeninfo_size); + // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; + qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * + size_of_dt); + // offset += qk_prod_size * size_of_dt; + qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( + qk_prod_size * size_of_dt); + // offset += qk_prod_size * size_of_dt; + attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * + size_of_dt); + // offset += attn_heads_size * size_of_dt; + W_out_contiguous = gpu_mem_allocator.allocate_reserved_untyped( + W_out_contiguous_size * size_of_dt); + // offset += W_out_contiguous_size * size_of_dt; + complex_input = + gpu_mem_allocator.allocate_reserved(complex_size); + // offset += complex_size * sizeof(hipFloatComplex); + } else { + token_infos = + gpu_mem_allocator.allocate_instance( + tokeninfo_size); + qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * + size_of_dt); + qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( + qk_prod_size * size_of_dt); + attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * + size_of_dt); + W_out_contiguous = gpu_mem_allocator.allocate_instance_untyped( + W_out_contiguous_size * size_of_dt); + complex_input = + gpu_mem_allocator.allocate_instance(complex_size); + } + + // allocate more size for quantization data + if (quantization_type != DT_NONE) { + assert(offload); + quantized_weight_ptr = + gpu_mem_allocator.allocate_reserved(quantized_weightSize); + } + if (!offload) { + assert(gpu_mem_allocator.reserved_total_size == + gpu_mem_allocator.reserved_allocated_size); + } + } + hipStreamSynchronize(stream); } IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) {} diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b694797830..37223e11c9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1010,20 +1010,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - // size_t qkv_proj_dim = qProjSize + kProjSize + vProjSize; - // size_t qkv_max_proj_size = - // BatchConfig::MAX_NUM_TOKENS * qkv_proj_dim * num_q_heads; - size_t qkv_max_proj_size = BatchConfig::MAX_NUM_TOKENS * (qProjSize * num_q_heads + kProjSize * num_kv_heads + vProjSize * num_kv_heads); - // std::cout << "num_kv_heads: " << 
BatchConfig::MAX_NUM_TOKENS << ", " - // << qProjSize << ", " << kProjSize << ", " << vProjSize << ", " - // << num_q_heads << ", " << num_kv_heads << ", " << - // qkv_max_proj_size - // << std::endl; - // assert(false); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { case INC_DECODING_MODE: diff --git a/src/ops/kernels/decompress_kernels.cpp b/src/ops/kernels/decompress_kernels.cpp new file mode 100644 index 0000000000..22bf93d449 --- /dev/null +++ b/src/ops/kernels/decompress_kernels.cpp @@ -0,0 +1,90 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +namespace Kernels { + +template +__global__ void decompress_int4_general_weights(char const *input_weight_ptr, + DT *weight_ptr, + int in_dim, + int valueSize) {} + +template +__global__ void decompress_int8_general_weights(char const *input_weight_ptr, + DT *weight_ptr, + int in_dim, + int valueSize) {} + +template +__global__ void decompress_int4_attention_weights(char *input_weight_ptr, + DT *weight_ptr, + int qProjSize, + int qSize, + int num_heads) {} + +template +__global__ void decompress_int8_attention_weights(char *input_weight_ptr, + DT *weight_ptr, + int qProjSize, + int qSize, + int num_heads) {} + +template __global__ void decompress_int4_general_weights( + char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int4_general_weights( + char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int8_general_weights( + char const *input_weight_ptr, float *weight_ptr, int in_dim, int valueSize); +template __global__ void decompress_int8_general_weights( + char const *input_weight_ptr, half *weight_ptr, int in_dim, int valueSize); +template __global__ void + decompress_int4_attention_weights(char *input_weight_ptr, + float *weight_ptr, + int qProjSize, + int qSize, + int num_heads); + +template __global__ void + decompress_int4_attention_weights(char *input_weight_ptr, + half *weight_ptr, + int qProjSize, + int qSize, + int num_heads); + +template __global__ void + decompress_int8_attention_weights(char *input_weight_ptr, + float *weight_ptr, + int qProjSize, + int qSize, + int num_heads); + +template __global__ void + decompress_int8_attention_weights(char *input_weight_ptr, + half *weight_ptr, + int qProjSize, + int qSize, + int num_heads); + +} // namespace Kernels +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 03f49774c5..3f4952b4a6 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -14,6 +14,7 @@ */ #include 
"flexflow/ops/kernels/rms_norm_kernels.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/utils/hip_helper.h" #include @@ -21,15 +22,155 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; +#define C10_WARP_SIZE 32 +constexpr int kCUDABlockReduceNumThreads = 512; +constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler, rms) {} -RMSNormMeta::~RMSNormMeta(void) {} + : OpMeta(handler, rms) { + eps = rms->eps; + alpha = 1.0f; + beta = 0.0f; + + in_dim = rms->data_dim; + batch_size = rms->effective_batch_size; + num_elements = in_dim * batch_size; + + DataType data_type = rms->weights[0]->data_type; + size_t rms_ptr_size = batch_size; + size_t norm_ptr_size = num_elements; + size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + rms_ptr = gpu_mem_allocator.allocate_instance_untyped( + rms_ptr_size * data_type_size(data_type)); + norm_ptr = gpu_mem_allocator.allocate_instance_untyped( + norm_ptr_size * data_type_size(data_type)); +} +RMSNormMeta::~RMSNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} namespace Kernels { namespace RMSNorm { +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, + unsigned int delta, + int width = warpSize, + unsigned int mask = 0xffffffff) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +template +__global__ void + RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { + __shared__ float v_shared[C10_WARP_SIZE]; + long long const i = blockIdx.x; + float sum = 0.0f; + for (long long j = threadIdx.x; j < N; j += blockDim.x) { + long long const index = i * N + j; + sum += (static_cast(X[index]) * static_cast(X[index])); + } + sum = BlockReduceSum(sum, + v_shared); // use BlockReduceSum() to sum X_ij^2 + + if (threadIdx.x == 0) { + rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); + } +} + +template +__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { + using T_ACC = T; + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + Y[index] = static_cast(X[index]) * static_cast(rstd[i]); + } +} + +template +__global__ void elewise_apply_weights(int64_t batch_size, + int64_t in_dim, + T const *norm, + T const *weights, + T *output) { + CUDA_KERNEL_LOOP(i, batch_size * in_dim) { + output[i] = norm[i] * weights[i % in_dim]; + } +} + +template +void forward_kernel(RMSNormMeta const *m, + T const *input_ptr, + T const *weight_ptr, + T *output_ptr, + hipStream_t stream) { + int parallelism = m->batch_size * m->in_dim; + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseRootMeanSquareKernel), + m->batch_size, + kCUDABlockReduceNumThreads, + 0, + stream, + m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(NormKernel), + m->batch_size, + kCUDANumThreads, + 0, + stream, + m->in_dim, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(elewise_apply_weights), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->batch_size, + m->in_dim, + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); +} + void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -44,7 +185,23 @@ void forward_kernel_wrapper(RMSNormMeta const *m, checkCUDA(hipEventRecord(t_start, stream)); } - handle_unimplemented_hip_kernel(OP_RMS_NORM); + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 8599f09244..bd8b46116d 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -27,7 +27,8 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, Domain const &input_domain) : OpMeta(handler) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp index c4aa3a804a..f76acc8e71 100644 --- a/src/ops/sampling.cpp +++ b/src/ops/sampling.cpp @@ -17,9 +17,96 @@ #include "flexflow/ffconst_utils.h" 
#include "flexflow/utils/hip_helper.h" #include +#include namespace FlexFlow { +constexpr int SamplingNumThreads = 1024; +struct BlockPrefixCallbackOp { + // Running prefix + float running_total; + // Constructor + __device__ BlockPrefixCallbackOp(float running_total) + : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan. + __device__ float operator()(float block_aggregate) { + float old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +__global__ void init_idxs(int batch_size, + int vocab_size, + int total_eles, + int *idx, + int *begin_offset, + int *end_offset) { + CUDA_KERNEL_LOOP(i, total_eles) { + idx[i] = i % vocab_size; + if (i % vocab_size == 0) { + begin_offset[i / vocab_size] = i; + end_offset[i / vocab_size] = i; + } + } +} + +__global__ void + init_random_kernel(hiprandState *state, int batch_size, long rand) { + CUDA_KERNEL_LOOP(i, batch_size) { + hiprand_init(rand, i, 0, &state[i]); + } +} + +// multinominal and gather +template +__global__ void sampling_topp_kernel(int batch_size, + int const vocab_size, + hiprandState *state, + DT *sorted_logits, + int *sorted_idx, + int *indices_ptr, + float topp) { + // int const vocab_id = threadIdx.x; + int const batch_idx = blockIdx.x; + __shared__ float random_n; + __shared__ unsigned long long result_idx; + + // random num + if (threadIdx.x == 0) { + // number must < topp + random_n = hiprand_uniform(state + batch_idx) * topp; + // printf("batch idx: %d, random num%f\n", batch_idx, random_n); + } + + __syncthreads(); + + // cumsum; + typedef hipcub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + int offset = batch_idx * vocab_size; + float prefix_sum = 0.0f; + BlockPrefixCallbackOp prefix_op(0); + result_idx = vocab_size - 1; + + for (unsigned long long j = threadIdx.x; j < vocab_size; j += blockDim.x) { + float logit = (float)(sorted_logits[offset + j]); + BlockScan(temp_storage).InclusiveSum(logit, prefix_sum, prefix_op); + prefix_sum /= topp; + if (prefix_sum >= random_n) { + atomicMin(&result_idx, j); + } + } + indices_ptr[batch_idx] = sorted_idx[offset + result_idx]; + + // if (threadIdx.x == 0) { + // printf("selected idx: %d, %d\n", blockIdx.x, result_idx); + // } +} + /*static*/ template void Sampling::forward_kernel(SamplingMeta const *m, @@ -28,7 +115,48 @@ void Sampling::forward_kernel(SamplingMeta const *m, float const top_p, int const length, int const batch_size, - hipStream_t stream) {} + hipStream_t stream) { + + size_t temp_storage_bytes = m->temp_storage_bytes; + // checkCUDA(hipcub::DeviceSegmentedRadixSort::SortPairsDescending( + // m->d_temp_storage, + // temp_storage_bytes, + // input_ptr, + // static_cast
(m->sorted_logits), + // m->idx, + // m->sorted_idx, + // length * batch_size, + // batch_size, + // m->begin_offset, + // m->end_offset + 1, + // 0, // begin_bit + // sizeof(DT) * 8, // end_bit = sizeof(KeyT) * 8 + // stream)); + return; + int parallelism = batch_size; + hipLaunchKernelGGL(init_random_kernel, + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->state, + batch_size, + rand()); + // sampling + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sampling_topp_kernel), + batch_size, + SamplingNumThreads, + 0, + stream, + batch_size, + length, + m->state, + static_cast
(m->sorted_logits), + m->sorted_idx, + indices_ptr, + top_p); +} /*static*/ void Sampling::forward_kernel_wrapper(SamplingMeta const *m, @@ -63,7 +191,81 @@ SamplingMeta::SamplingMeta(FFHandler handler, int total_ele, GenericTensorAccessorW input, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler, op) {} + : OpMeta(handler, op) { + DataType data_type = op->data_type; + + size_t begin_offset_size, end_offset_size; + begin_offset_size = end_offset_size = batch_size + 1; + size_t idx_size, sorted_idx_size, sorted_logits_size; + idx_size = sorted_idx_size = sorted_logits_size = total_ele; + size_t state_size = batch_size; + + size_t totalSize = sizeof(int) * (begin_offset_size + end_offset_size + + idx_size + sorted_idx_size) + + data_type_size(data_type) * sorted_logits_size + + sizeof(hiprandState) * state_size; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + begin_offset = gpu_mem_allocator.allocate_instance(begin_offset_size); + end_offset = gpu_mem_allocator.allocate_instance(end_offset_size); + idx = gpu_mem_allocator.allocate_instance(idx_size); + sorted_idx = gpu_mem_allocator.allocate_instance(sorted_idx_size); + sorted_logits = gpu_mem_allocator.allocate_instance_untyped( + sorted_logits_size * data_type_size(data_type)); + state = gpu_mem_allocator.allocate_instance(state_size); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // init offset + int parallelism = total_ele; + init_idxs<<>>(batch_size, + total_ele / batch_size, + total_ele, + idx, + begin_offset, + end_offset); + + // init sort function + // if (data_type == DT_FLOAT) { + // checkCUDA(hipcub::DeviceSegmentedRadixSort::SortPairsDescending( + // d_temp_storage, + // temp_storage_bytes, + // input.get_float_ptr(), + // input.get_float_ptr(), + // idx, + // idx, + // total_ele, + // batch_size, + // begin_offset, + // end_offset + 1, + // 0, // begin_bit + // data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 + // stream)); + // } else if (data_type == DT_HALF) { + // checkCUDA(hipcub::DeviceSegmentedRadixSort::SortPairsDescending( + // d_temp_storage, + // temp_storage_bytes, + // input.get_half_ptr(), + // input.get_half_ptr(), + // idx, + // idx, + // total_ele, + // batch_size, + // begin_offset, + // end_offset + 1, + // 0, // begin_bit + // data_type_size(data_type) * 8, // end_bit = sizeof(KeyT) * 8 + // stream)); + // } else { + // assert(false && "input type in float and half"); + // } + + gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + d_temp_storage = + gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); +} SamplingMeta::~SamplingMeta(void) {} }; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 8092ed8bd7..c9ef952d3b 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -14,7 +14,10 @@ */ #include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/utils/hip_helper.h" +#include #include namespace FlexFlow { @@ -23,6 +26,551 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +using namespace Kernels::IncMultiHeadAttention; + +namespace Kernels { +namespace SpecIncMultiHeadAttention { + +template +__global__ void spec_store_kv_cache( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + 
BatchConfig::PerTokenInfo *tokenInfos, + BatchConfig::PerRequestInfo *requestInfo, + BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int num_q_heads, + int num_kv_heads, + int max_seq_len, + int max_beam_width, + bool is_root) { + CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { + int q_array_size = qProjSize * num_tokens * num_q_heads; + int k_array_size = kProjSize * num_tokens * num_kv_heads; + + bool k_cache = i < k_array_size; + int real_i = k_cache ? i : i - k_array_size; + + int proj_size = k_cache ? kProjSize : vProjSize; + int head_idx = real_i / (num_tokens * proj_size); + int token_idx = (real_i - head_idx * (num_tokens * proj_size)) / proj_size; + int data_idx = real_i % proj_size; + + // above no need to be changed + // int const req_id = id_map[token_idx].request_index; + // int const tok_id = id_map[token_idx].token_position; + // int const sub_req_id = id_map[token_idx].sub_request_index; + // int const parent_id = id_map[token_idx].parent_id; + // int const beam_depth = id_map[token_idx].beam_depth; + // int const beam_width = id_map[token_idx].beam_width; + + DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * num_tokens + + token_idx * proj_size + data_idx]; + + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; + int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; + int const beam_depth = beamRequestInfos[req_id].current_depth; + int const beam_width = beamRequestInfos[req_id].beam_size; + + // new token + int new_token_cache_idx = (req_id * max_beam_width + sub_req_id) * + (num_kv_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + + tok_id * proj_size + data_idx; + + DT *cache_ptr = k_cache ? 
kCache_ptr : vCache_ptr; + cache_ptr[new_token_cache_idx] = val; + + // replica in the root iteration + if (beam_depth == 1) { + for (int i = 1; i < beam_width; i++) { + cache_ptr[(req_id * max_beam_width + i) * + (num_kv_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } + } + + // naive cache stealing + if (sub_req_id != parent_id) { + if (data_idx == 0 && head_idx == 0 && k_cache) { + printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " + "%d, tok_id %d\n", + beam_depth, + req_id, + sub_req_id, + parent_id, + tok_id); + } + + for (int depth = 0; depth < beam_depth; depth++) { + int steal_token_idx = tok_id - beam_depth + depth; + int steal_from_idx = (req_id * max_beam_width + parent_id) * + (num_kv_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + + steal_token_idx * proj_size + data_idx; + int steal_to_idx = (req_id * max_beam_width + sub_req_id) * + (num_kv_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + + steal_token_idx * proj_size + data_idx; + cache_ptr[steal_to_idx] = cache_ptr[steal_from_idx]; + + // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ + // printf("cache stealing kernel!, steal_token_idx %d\n", + // steal_token_idx); + // } + } + } + + // parallel cache stealing not yet implemented + // logic shld be + // launch spec_store_kv_cache with parallelism * current depth + // from the i here, get depth index + // if depth index not the current one, check if we need to steal + // steal if needed + + // cache stealing theory + // identify which sub request does this token come from + // for initial token, 0 + // for other, may 0,0,1/ 0,1,2/ 1,1,1 to get which cache to be reuse and + // which to be delete copy beam_size bunch of blocks when sub_req_id == + // parent_id : like 0 -> 0, 1->1, 2->2, do nothing, just append the new k/v + } +} + +template +void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + hipStream_t stream) { + int num_tokens = bc->num_active_tokens(); + int curr_depth = bc->beamRequestsInfo[0].current_depth; + // printf("curr depth: %d\n", curr_depth); + // assert(curr_depth < 3); + if (num_tokens > 0) { + int parallelism = + (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_store_kv_cache
<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast<DT *>(m->devQKVProjArray), + static_cast<DT *>(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->request_infos, + m->beam_token_infos, + m->beam_request_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + m->num_q_heads, + m->num_kv_heads, + BatchConfig::MAX_SEQ_LENGTH, + BeamSearchBatchConfig::MAX_BEAM_WIDTH, + /*root*/ curr_depth == 0); + } +} + +template +__global__ void spec_fill_entries_above_diagonal(DT *matrix, + size_t new_tokens, + size_t total_tokens_in_request, + size_t num_q_heads, + DT value) { + CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { + // size_t head_idx = i / (new_tokens * total_tokens_in_request); + size_t src_idx = (i / new_tokens) % total_tokens_in_request; + size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; + // Casual Mask + if (src_idx > dst_idx) { + matrix[i] = value; + } + } +} + +template +void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + hipStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + hipblasDatatype_t compute_type = hipblas_data_type; +#endif + // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int tokens_prev_requests_squares = 0; + // int qkv_block_size = + // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + int q_block_size = m->qProjSize * num_tokens; + + int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_req_block_size = kt_block_size * m->num_kv_heads; + int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_req_block_size = vt_block_size * m->num_kv_heads; + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { + + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; + + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].token_start_offset + + bc->requestsInfo[i].num_tokens_in_batch; + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k, ldb = k, ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast<DT *>(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize; + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast<DT *>(m->keyCache) + + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; + + // if (i == 0 && sub_req_id == 0 && + // bc->beam_slots.at(0).current_depth == 1) { + // int offset = (float *)B - m->keyCache; + // printf("key cache offset %d\n", kt_req_block_size); + // } + // To get C, skip over QK^T products from previous requests + DT *C = static_cast<DT *>
(m->qk_prods) + + m->num_q_heads * tokens_prev_requests_squares; + + if (m->num_q_heads == m->num_kv_heads) { + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } else { + strideB = 0; + int one_step_heads = m->num_q_heads / m->num_kv_heads; + m_ = num_new_tokens; + n = total_tokens; + k = m->qProjSize; + lda = k, ldb = k, ldc = m_; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A + step * strideA * one_step_heads, + hipblas_data_type, + lda, + strideA, + B + step * kt_block_size, + hipblas_data_type, + ldb, + strideB, + &beta, + C + step * strideC * one_step_heads, + hipblas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + } + + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + hipLaunchKernelGGL( + HIP_KERNEL_NAME(spec_fill_entries_above_diagonal
<DT>), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast<DT>
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_q_heads * tokens_prev_requests_squares; + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = num_new_tokens; + n = m->vProjSize; + k = total_tokens; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = vt_block_size; + strideC = num_new_tokens * m->vProjSize; + // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + A = C_softmax; + // To get B, skip over V^T entries from previous requests (all heads + + // padding) + B = static_cast
<DT *>(m->valueCache) + + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + C = static_cast<DT *>
(m->attn_heads) + + tokens_previous_requests * m->num_q_heads * m->vProjSize; + + if (m->num_q_heads == m->num_kv_heads) { + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } else { + int one_step_heads = m->num_q_heads / m->num_kv_heads; + n = m->vProjSize; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens; + strideB = 0; + strideC = num_new_tokens * m->vProjSize; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A + step * one_step_heads * strideA, + hipblas_data_type, + lda, + strideA, + B + step * vt_block_size, + hipblas_data_type, + ldb, + strideB, + &beta, + C + step * one_step_heads, + hipblas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + } + + // Project to output, save result directly on output tensor + alpha = 1.0f, beta = 0.0f; + m_ = m->oProjSize; + k = m->vProjSize * m->num_q_heads; + n = num_new_tokens; + lda = k, ldb = n, ldc = m_; + A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_kv_heads + + m->vProjSize * m->num_kv_heads); + B = C; + C = static_cast
<DT *>(output_ptr) + + tokens_previous_requests * m->oProjSize; + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + B, + hipblas_data_type, + ldb, + &beta, + C, + hipblas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; + } + } + if (*m->bias && shard_id == 0) { + int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_kv_heads + + m->vProjSize * m->global_num_kv_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w<DT>
), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + bias_ptr, + num_tokens, + qkv_weight_size, + m->oProjSize); + } + + assert(tokens_previous_requests == num_tokens); +} + +template +void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + hipStream_t stream) { + // here because we need postion info in infernece 1 + hipMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), + hipMemcpyHostToDevice, + stream); + hipMemcpyAsync(m->request_infos, + &(bc->requestsInfo), + bc->MAX_NUM_REQUESTS * sizeof(BatchConfig::PerRequestInfo), + hipMemcpyHostToDevice, + stream); + hipMemcpyAsync(m->beam_token_infos, + &(bc->beamTokenInfo), + bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * + sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), + hipMemcpyHostToDevice, + stream); + hipMemcpyAsync(m->beam_request_infos, + &(bc->beamRequestsInfo), + bc->MAX_NUM_REQUESTS * + sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), + hipMemcpyHostToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
<DT *>(m->devQKVProjArray), + bias_ptr, + stream); + // phase 2: Update key/val cache + update_kv_cache_kernel<DT>
(m, bc, stream); + + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + compute_attention_kernel( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); +} + +} // namespace SpecIncMultiHeadAttention +} // namespace Kernels + /*static*/ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta const *m, @@ -34,6 +582,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -42,7 +591,37 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - handle_unimplemented_hip_kernel(OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION); + assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (use_bias) { + assert(input.data_type == bias.data_type); + } + + if (input.data_type == DT_HALF) { + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::SpecIncMultiHeadAttention::inference_kernel(m, + bc, + shard_id, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); + } else if (input.data_type == DT_FLOAT) { + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::SpecIncMultiHeadAttention::inference_kernel(m, + bc, + shard_id, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); @@ -52,9 +631,6 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); } } @@ -94,8 +670,53 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); + + // allocate memory for the seqArray and reserve space + { + size_t beam_tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + size_t requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t beam_requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; + size_t total_size = + requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + + beam_tokeninfo_size * + sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + + beam_requestinfo_size * + sizeof(BeamSearchBatchConfig:: + BeamSearchPerRequestInfo); // more components will + // be added here later + + // We always directly allocate memory for small speculative models + gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + total_size); + beam_token_infos = + gpu_mem_allocator + .allocate_instance( + beam_tokeninfo_size); + // offset += beam_tokeninfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); + request_infos = + gpu_mem_allocator.allocate_instance( + requestinfo_size); + // offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); + beam_request_infos = + gpu_mem_allocator + .allocate_instance( + beam_requestinfo_size); + // offset += 
beam_requestinfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); + // assert(offset == total_size); + assert(gpu_mem_allocator.instance_total_size == + gpu_mem_allocator.instance_allocated_size); + } + + hipStreamSynchronize(stream); } -SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) {} +SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { + if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { + beam_search_reserve_inst.destroy(); + } +} }; // namespace FlexFlow diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index dc68a54bf2..fbd6d1cc48 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -14,7 +14,10 @@ */ #include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/utils/hip_helper.h" +#include #include namespace FlexFlow { @@ -23,6 +26,561 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +using namespace Kernels::IncMultiHeadAttention; + +namespace Kernels { +namespace TreeIncMultiHeadAttention { + +template +__global__ void commit_tokens_kernel( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens_to_commit, + int num_active_tokens_in_last_batch, + int num_q_heads, + int num_kv_heads, + int max_seq_len) { + + CUDA_KERNEL_LOOP( + i, num_tokens_to_commit * (kProjSize + vProjSize) * num_kv_heads) { + bool k_cache = i < (num_tokens_to_commit * kProjSize * num_kv_heads); + int real_i = + k_cache ? i : i - (num_tokens_to_commit * kProjSize * num_kv_heads); + + int proj_size = k_cache ? kProjSize : vProjSize; + int data_idx = real_i % proj_size; + int head_idx = real_i / (num_tokens_to_commit * proj_size); + int token_pos = + (real_i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; + int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; + assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); + + int q_array_size = + qProjSize * num_active_tokens_in_last_batch * num_q_heads; + int k_array_size = + kProjSize * num_active_tokens_in_last_batch * num_kv_heads; + + DT val = + devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * num_active_tokens_in_last_batch + + token_idx_in_last_batch * proj_size + data_idx]; + int const req_id = committedTokenInfos[token_pos].request_index; + int const tok_id = committedTokenInfos[token_pos].token_depth; + + DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; + cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } +} + +template +void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + hipStream_t stream) { + int num_tokens_to_commit = bc->num_tokens_to_commit; + if (num_tokens_to_commit > 0) { + int parallelism = + (m->kProjSize + m->vProjSize) * num_tokens_to_commit * m->num_kv_heads; + hipLaunchKernelGGL( + HIP_KERNEL_NAME(commit_tokens_kernel
<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast<DT *>
(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->committed_token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens_to_commit, + m->num_active_tokens, // number of active tokens in previous batch + m->num_q_heads, + m->num_kv_heads, + BatchConfig::MAX_SEQ_LENGTH); + } +} + +template +__global__ void update_tree_branch_kv_cache( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens_in_branch, + int processed_tokens_in_batch, + int total_tokens_in_batch, + int num_q_heads, + int num_kv_heads, + int max_seq_len) { + CUDA_KERNEL_LOOP( + i, num_tokens_in_branch * (kProjSize + vProjSize) * num_kv_heads) { + + int q_array_size = qProjSize * total_tokens_in_batch * num_q_heads; + int k_array_size = kProjSize * total_tokens_in_batch * num_kv_heads; + + bool k_cache = i < (num_tokens_in_branch * kProjSize * num_kv_heads); + int real_i = + k_cache ? i : i - (num_tokens_in_branch * kProjSize * num_kv_heads); + + int proj_size = k_cache ? kProjSize : vProjSize; + int data_idx = real_i % proj_size; + int token_idx = + (real_i / proj_size) % num_tokens_in_branch; // index in the tree branch + int head_idx = real_i / (proj_size * num_tokens_in_branch); + + token_idx += processed_tokens_in_batch; // get index in the whole batch + DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * total_tokens_in_batch + + token_idx * proj_size + data_idx]; + + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; + + cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + + head_idx * (max_seq_len * proj_size) + tok_id * proj_size + + data_idx] = val; + } +} + +template +__global__ void tree_fill_entries_above_diagonal(DT *matrix, + size_t new_tokens, + size_t total_tokens_in_request, + size_t num_q_heads, + DT value) { + CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { + // size_t head_idx = i / (new_tokens * total_tokens_in_request); + size_t src_idx = (i / new_tokens) % total_tokens_in_request; + size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; + // Casual Mask + if (src_idx > dst_idx) { + matrix[i] = value; + } + } +} + +template +void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + hipStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + hipblasDatatype_t compute_type = hipblas_data_type; +#endif + // int num_requests = bc->num_active_requests(); + int processed_tokens_in_batch = 0; + // int qkv_block_size = + // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); + int q_block_size = m->qProjSize * bc->num_active_tokens(); + int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_req_block_size = kt_block_size * m->num_kv_heads; + int vt_block_size = 
m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_req_block_size = vt_block_size * m->num_kv_heads; + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + if (bc->request_completed[i]) { + continue; + } + int last_token_idx_of_the_request = + processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; + while (processed_tokens_in_batch <= last_token_idx_of_the_request) { + int num_new_tokens = 1; + int j = processed_tokens_in_batch; + while ((j + 1 <= last_token_idx_of_the_request) && + (bc->tokensInfo[j].abs_depth_in_request + 1 == + bc->tokensInfo[j + 1].abs_depth_in_request)) { + j++; + num_new_tokens++; + } + + int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; + assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); + { + // update K-V cache + int parallelism = + (m->kProjSize + m->vProjSize) * num_new_tokens * m->num_kv_heads; + hipLaunchKernelGGL( + HIP_KERNEL_NAME(update_tree_branch_kv_cache
<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast<DT *>
(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_new_tokens, // num_tokens_in_branch + processed_tokens_in_batch, // num_processed_tokens_in_batch + m->num_active_tokens, // total_tokens_in_batch + m->num_q_heads, + m->num_kv_heads, + BatchConfig::MAX_SEQ_LENGTH); + } + + // bc->token_last_available_idx[i] + 1; + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens_in_request; + int k = m->qProjSize; + int lda = k, ldb = k, ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens_in_request; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast<DT *>
(m->devQKVProjArray) + + processed_tokens_in_batch * m->qProjSize; + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; + // To get C, skip over QK^T products from previous requests + DT *C = static_cast<DT *>
(m->qk_prods); + + if (m->num_q_heads == m->num_kv_heads) { + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } else { + strideB = 0; + int one_step_heads = m->num_q_heads / m->num_kv_heads; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A + step * strideA * one_step_heads, + hipblas_data_type, + lda, + strideA, + B + step * kt_block_size, + hipblas_data_type, + ldb, + strideB, + &beta, + C + step * strideC * one_step_heads, + hipblas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + } + + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens_in_request); + if (num_new_tokens > 1) { + size_t parallelism = + m->num_q_heads * num_new_tokens * total_tokens_in_request; + hipLaunchKernelGGL( + HIP_KERNEL_NAME(tree_fill_entries_above_diagonal
<DT>), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens_in_request, + m->num_q_heads, + static_cast<DT>
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens_in_request; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = num_new_tokens; + n = m->vProjSize; + k = total_tokens_in_request; + lda = m_, ldb = n, ldc = m_; + strideA = num_new_tokens * total_tokens_in_request; + strideB = vt_block_size; + strideC = num_new_tokens * m->vProjSize; + // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + A = C_softmax; + // To get B, skip over V^T entries from previous requests (all heads + + // padding) + B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + C = static_cast<DT *>
(m->attn_heads) + + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; + + if (m->num_q_heads == m->num_kv_heads) { + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } else { + int one_step_heads = m->num_q_heads / m->num_kv_heads; + strideB = 0; + for (int step = 0; step < m->num_kv_heads; step++) { + checkCUDA( + hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A + step * one_step_heads * strideA, + hipblas_data_type, + lda, + strideA, + B + step * vt_block_size, + hipblas_data_type, + ldb, + strideB, + &beta, + C + step * one_step_heads * strideC, + hipblas_data_type, + ldc, + strideC, + one_step_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + } + + // Project to output, save result directly on output tensor + alpha = 1.0f, beta = 0.0f; + m_ = m->oProjSize; + k = m->vProjSize * m->num_q_heads; + n = num_new_tokens; + lda = k, ldb = n, ldc = m_; + A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_kv_heads + + m->vProjSize * m->num_kv_heads); + B = C; + C = static_cast
(output_ptr) + + processed_tokens_in_batch * m->oProjSize; + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + B, + hipblas_data_type, + ldb, + &beta, + C, + hipblas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + processed_tokens_in_batch += num_new_tokens; + } + // Before moving to the next request + // check that we have finished all tokens of the request + assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); + } + if (*m->bias && shard_id == 0) { + int parallelism = m->oProjSize * processed_tokens_in_batch; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_kv_heads + + m->vProjSize * m->global_num_kv_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + bias_ptr, + processed_tokens_in_batch, + qkv_weight_size, + m->oProjSize); + } + + assert(processed_tokens_in_batch == bc->num_active_tokens()); +} + +template <typename DT> +void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, + TreeVerifyBatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + hipStream_t stream) { + // additional processing for weight uploading + if (m->handle.offload_reserve_space != nullptr) { + // Note that we update weight_ptr and bias_ptr when uploading weight and + // bias + hipMemcpyAsync(m->weight_ptr, + weight_ptr, + m->weightSize, + hipMemcpyHostToDevice, + stream); + weight_ptr = static_cast<DT *>
(m->weight_ptr); + if (m->biasSize > 0) { + hipMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream); + bias_ptr = static_cast<DT *>
(m->bias_ptr); + } + } + // copy committed tokens info to GPU for the commit_tokens kernel + // Note that m->num_active_tokens stores the number of active + // tokens in the previous batch, which is needed for committing + // keys/values to the key-value cache + hipMemcpyAsync(m->committed_token_infos, + &(bc->committed_tokens), + bc->num_tokens_to_commit * + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), + hipMemcpyHostToDevice, + stream); + commit_tokens
<DT>(m, bc, stream); + + // After commit we update m->num_active_tokens to be the number of active + // tokens for the current batch + m->num_active_tokens = bc->num_active_tokens(); + + // here because we need position info in inference phase 1 + if (m->offload && m->biasSize > 0) { + hipMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream); + bias_ptr = static_cast<DT *>
(m->bias_ptr); + } + hipMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * + sizeof(TreeVerifyBatchConfig::PerTokenInfo), + hipMemcpyHostToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast<DT *>
(m->devQKVProjArray), + bias_ptr, + stream); + + // phase 2: No need to update key/val cache + // IncMultiHeadSelfAttention::update_kv_cache_kernel( + // m, bc, stream); + + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + compute_attention_kernel( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); +} + +} // namespace TreeIncMultiHeadAttention +} // namespace Kernels + /*static*/ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta *m, @@ -34,6 +592,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -42,7 +601,47 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - handle_unimplemented_hip_kernel(OP_TREE_INC_MULTIHEAD_SELF_ATTENTION); + // assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (use_bias) { + assert(input.data_type == bias.data_type); + } + + if (input.data_type == DT_HALF) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::TreeIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_half_ptr(), + m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); + } else if (input.data_type == DT_FLOAT) { + if (m->offload) { + pre_build_weight_kernel(m, weight, input.data_type, stream); + } + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::TreeIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_float_ptr(), + m->offload ? 
static_cast(m->weight_ptr) + : weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); @@ -95,8 +694,38 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); + + // allocate memory for the seqArray and reserve space + { + size_t committed_tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; + size_t total_size = committed_tokeninfo_size * + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); + if (offload) { + // assert that we have enough reserved work space left + assert(gpu_mem_allocator.reserved_total_size - + gpu_mem_allocator.reserved_allocated_size >= + total_size); + committed_token_infos = + gpu_mem_allocator + .allocate_reserved( + committed_tokeninfo_size); + } else { + gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, + total_size); + committed_token_infos = + gpu_mem_allocator + .allocate_instance( + committed_tokeninfo_size); + } + } + + hipStreamSynchronize(stream); } -TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {} +TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) { + if (committed_token_reserve_inst != Realm::RegionInstance::NO_INST) { + committed_token_reserve_inst.destroy(); + } +} }; // namespace FlexFlow diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d0d5e97c5..8d7e20e395 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -33,7 +33,20 @@ void inference_kernel_wrapper(AllReduceMeta const *m, checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - assert(false && "To be implemented"); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_tokens * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } void forward_kernel_wrapper(AllReduceMeta const *m, @@ -43,7 +56,19 @@ void forward_kernel_wrapper(AllReduceMeta const *m, checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - assert(false && "To be implemented"); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + input.domain.get_volume(), + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } void backward_kernel_wrapper(AllReduceMeta const *m, diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index fb570a33f5..1f27dc15e7 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -251,12 +251,14 @@ template __host__ T *download_tensor(T const *ptr, size_t num_elements) { // device synchronize to make sure the data are ready // checkCUDA(hipDeviceSynchronize()); + hipStream_t stream; + 
checkCUDA(get_legion_stream(&stream)); T *host_ptr; checkCUDA(hipHostMalloc(&host_ptr, sizeof(T) * num_elements, hipHostMallocPortable | hipHostMallocMapped)); - checkCUDA(hipMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost)); + checkCUDA(hipMemcpyAsync( + host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); // checkCUDA(hipDeviceSynchronize()); return host_ptr; } @@ -265,9 +267,11 @@ template __host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { // device synchronize to make sure the data are ready // checkCUDA(hipDeviceSynchronize()); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); - checkCUDA( - hipMemcpy(dst, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost)); + checkCUDA(hipMemcpyAsync( + dst, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); // checkCUDA(hipDeviceSynchronize()); return true; } @@ -324,6 +328,57 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain( return miopenStatusBadParm; } +miopenStatus_t + cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, + Domain domain) { + int dims[MAX_TENSOR_DIM]; + switch (domain.get_dim()) { + case 1: { + Rect<1> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + } + case 2: { + Rect<2> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[1], dims[0], 1, 1); + } + case 3: { + Rect<3> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + } + case 4: { + Rect<4> rect = domain; + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + } + case 5: { + Rect<5> rect = domain; + int leading_dim_size = rect.hi[4] - rect.lo[4] + 1; + assert(leading_dim_size == 1); + dims[0] = rect.hi[0] - rect.lo[0] + 1; + dims[1] = rect.hi[1] - rect.lo[1] + 1; + dims[2] = rect.hi[2] - rect.lo[2] + 1; + dims[3] = rect.hi[3] - rect.lo[3] + 1; + return miopenSet4dTensorDescriptor( + tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + } + default: + assert(false && "Unsupported dim number"); + } + return miopenStatusBadParm; +} + miopenDataType_t ff_to_cudnn_datatype(DataType type) { switch (type) { case DT_HALF: @@ -354,6 +409,23 @@ hipblasDatatype_t ff_to_cuda_datatype(DataType type) { } return HIPBLAS_R_32F; } +#ifdef FF_USE_NCCL +ncclDataType_t ff_to_nccl_datatype(DataType type) { + switch (type) { + case DT_HALF: + return ncclHalf; + case DT_FLOAT: + return ncclFloat; + case DT_DOUBLE: + return ncclDouble; + case DT_INT32: + return ncclInt; + default: + assert(false && "Unspoorted nccl data type"); + } + return ncclFloat; +} +#endif void handle_unimplemented_hip_kernel(OperatorType op_type) { throw std::runtime_error("Unimplemented hip kernel for Operator: " + diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 232799e027..e71adc87a8 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -87,6 +87,7 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer 
const *op, #ifdef FF_USE_NCCL __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, + OpMeta const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, @@ -208,6 +209,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, #ifdef FF_USE_NCCL __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index 80554c2add..f323f262f2 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -29,21 +29,31 @@ void RequestManager::load_tokens_task( assert(regions.size() == 1); assert(task->regions.size() == 1); - BatchConfig const batch_config = *((BatchConfig *)task->args); + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; - for (int i = 0; i < batch_config.num_tokens; i++) { - dram_copy[i] = batch_config.tokensInfo[i].token_id; + + // Extreme long prompts are not supported, only load up to MAX_NUM_TOKENS as + // prompt + if (batch_config->num_tokens > BatchConfig::MAX_NUM_TOKENS) { + printf("Warning: too many tokens in prompt, only load up to %d tokens\n", + BatchConfig::MAX_NUM_TOKENS); + printf("Got: %d tokens\n", batch_config->num_tokens); + } + + for (int i = 0; i < batch_config->num_tokens; i++) { + dram_copy[i] = batch_config->tokensInfo[i].token_id; } TokenId *fb_ptr = helperGetTensorPointerWO( regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - assert(batch_config.num_tokens <= domain.get_volume()); + assert(batch_config->num_tokens <= domain.get_volume()); hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDA(hipMemcpyAsync(fb_ptr, dram_copy, - sizeof(TokenId) * batch_config.num_tokens, + sizeof(TokenId) * batch_config->num_tokens, hipMemcpyHostToDevice, stream)); } @@ -55,22 +65,23 @@ void RequestManager::load_positions_task( Runtime *runtime) { assert(regions.size() == 1); assert(task->regions.size() == 1); - BatchConfig const batch_config = *((BatchConfig *)task->args); - int offset = 2; + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + + int const offset = *((int const *)task->args); int *pos_ptr = helperGetTensorPointerWO( regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); int dram_copy[BatchConfig::MAX_NUM_TOKENS]; - for (int i = 0; i < batch_config.num_tokens; i++) { - dram_copy[i] = batch_config.tokensInfo[i].abs_depth_in_request + offset; + for (int i = 0; i < batch_config->num_tokens; i++) { + dram_copy[i] = batch_config->tokensInfo[i].abs_depth_in_request + offset; } hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDA(hipMemcpyAsync(pos_ptr, dram_copy, - sizeof(int) * batch_config.num_tokens, + sizeof(int) * batch_config->num_tokens, hipMemcpyHostToDevice, stream)); } From 7aa18625831975297ac2285f8cbb1b18e30355f7 Mon Sep 17 00:00:00 2001 From: vincent163 Date: Sat, 2 Sep 2023 19:07:37 -0400 Subject: [PATCH 219/344] Fix compile error in debug mode (#1088) Co-authored-by: vincent-163 Co-authored-by: Zhihao Jia --- src/c/flexflow_c.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c/flexflow_c.cc 
b/src/c/flexflow_c.cc index 96ff84c85f..2ddb65fc9d 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1426,7 +1426,7 @@ flexflow_generation_result_t FFModel *handle = FFCObjectWrapper::unwrap(handle_); std::string const text_str(input_text); GenerationResult result = handle->generate(text_str, max_seq_length); - DEBUG_PRINT("[Model] generate %p %s %i", handle, text, max_seq_length); + DEBUG_PRINT("[Model] generate %p %s %i", handle, text_str, max_seq_length); assert(result.output_tokens.size() <= max_seq_length); output_length_and_tokens[0] = result.output_tokens.size(); std::copy(result.output_tokens.begin(), From 7adf106e64dc92a69d303338692cd943ed105645 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 2 Sep 2023 19:42:56 -0400 Subject: [PATCH 220/344] Update docs (#1091) * update docs * fix --- .github/README.md | 29 ++++++++++++++++++++++------- INSTALL.md | 2 +- MULTI-NODE.md | 4 ++-- README.md | 7 ++++--- SERVE.md | 38 +++++++++++++++++++------------------- docker/README.md | 33 ++++++++++++++++++++++----------- docs/source/jupyter.rst | 6 ------ jupyter_notebook/README.md | 2 +- 8 files changed, 71 insertions(+), 50 deletions(-) delete mode 100644 docs/source/jupyter.rst diff --git a/.github/README.md b/.github/README.md index 56434f6bf9..a8846260c8 100644 --- a/.github/README.md +++ b/.github/README.md @@ -6,8 +6,9 @@ ## News🔥: +* [09/02/2023] Adding AMD GPU support, released Docker images for ROCM 5.3->5.6 * [08/16/2023] Adding Starcoder model support -* [08/14/2023] Released Dockerfile for different CUDA versions +* [08/14/2023] Released Docker images for different CUDA versions ## What is FlexFlow Serve @@ -42,13 +43,13 @@ pip install flexflow ``` ### Try it in Docker -If you run into any issue during the install, or if you would like to use the C++ API without needing to install from source, you can also use our pre-built Docker package for different CUDA versions and the `hip_rocm` backend. To download and run our pre-built Docker container: +If you run into any issue during the install, or if you would like to use the C++ API without needing to install from source, you can also use our pre-built Docker package for different CUDA versions (NVIDIA backend) and multiple ROCM versions (AMD backend). To download and run our pre-built Docker container: ```bash -docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-11.8:latest +docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest ``` -To download a Docker container for a backend other than CUDA v11.8, you can replace the `cuda-11.8` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](../docker/README.md). +To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.4`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](../docker/README.md). 
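+For example (an illustrative choice; any of the suffixes listed above works the same way), you could pull and run the CUDA 11.8 variant with:
+
+```bash
+docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-11.8:latest
+```
+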
### Build from source @@ -209,7 +210,7 @@ Below is a list of models that we have explicitly tested and for which a SSM may | StarCoder-15.5B | bigcode/starcoder | | ### CPU Offloading -FlexFlow Serve also offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags. +FlexFlow Serve also offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading is a choice to save tensors in CPU memory, and only copy the tensor to GPU when doing calculation. Notice that now we selectively offload the largest weight tensors (weights tensor in Linear, Attention). Besides, since the small model occupies considerably less space, it it does not pose a bottleneck for GPU memory, the offloading will bring more runtime space and computational cost, so we only do the offloading for the large model. [TODO: update instructions] You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags. ### Quantization FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors undergo decompression and conversion back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. @@ -221,10 +222,24 @@ We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruct FlexFlow Serve is under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. -* AMD support. We are actively working on supporting FlexFlow Serve on AMD GPUs and welcome any contributions to this effort. +* AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs. +* Chatbot prompt templates and Multi-round conversations +* Support for FastAPI server +* Integration with LangChain for document question answering ## Acknowledgements -This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting FlexFlow Serve. +This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting FlexFlow Serve. 
Please cite FlexFlow Serve as: + +``` bibtex +@misc{miao2023specinfer, + title={SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification}, + author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Alan Zhu and Lijie Yang and Xiaoxiang Shi and Chunan Shi and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia}, + year={2023}, + eprint={2305.09781}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` ## License FlexFlow uses Apache License 2.0. diff --git a/INSTALL.md b/INSTALL.md index 8d33770c92..72993f3330 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ You can configure a FlexFlow build by running the `config/config.linux` file in 3. `FF_CUDA_ARCH` is used to set the architecture of targeted GPUs, for example, the value can be 60 if the GPU architecture is Pascal. To build for more than one architecture, pass a list of comma separated values (e.g. `FF_CUDA_ARCH=70,75`). To compile FlexFlow for all GPU architectures that are detected on the machine, pass `FF_CUDA_ARCH=autodetect` (this is the default value, so you can also leave `FF_CUDA_ARCH` unset. If you want to build for all GPU architectures compatible with FlexFlow, pass `FF_CUDA_ARCH=all`. **If your machine does not have any GPU, you have to set FF_CUDA_ARCH to at least one valid architecture code (or `all`)**, since the compiler won't be able to detect the architecture(s) automatically. 4. `FF_USE_PYTHON` controls whether to build the FlexFlow Python interface. 5. `FF_USE_NCCL` controls whether to build FlexFlow with NCCL support. By default, it is set to ON. -6. `FF_LEGION_NETWORKS` is used to enable distributed run of FlexFlow. If you want to run FlexFlow on multiple nodes, follow instructions in [MULTI-NODE.md](MULTI-NODE.md) and set the corresponding parameters as follows: +6. `FF_LEGION_NETWORKS` is used to enable distributed run of FlexFlow. If you want to run FlexFlow on multiple nodes, follow instructions in the [Multinode tutorial](https://flexflow.readthedocs.io/en/latest/multinode.html) and set the corresponding parameters as follows: * To build FlexFlow with GASNet, set `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT` as a specific conduit (e.g. `ibv`, `mpi`, `udp`, `ucx`) in `config/config.linux` when configuring the FlexFlow build. Set `FF_UCX_URL` when you want to customize the URL to download UCX. * To build FlexFlow with native UCX, set `FF_LEGION_NETWORKS=ucx` in `config/config.linux` when configuring the FlexFlow build. Set `FF_UCX_URL` when you want to customize the URL to download UCX. 8. `FF_BUILD_EXAMPLES` controls whether to build all C++ example programs. diff --git a/MULTI-NODE.md b/MULTI-NODE.md index 4bae47cfa6..9cf95976ac 100644 --- a/MULTI-NODE.md +++ b/MULTI-NODE.md @@ -19,7 +19,7 @@ You can also use your own GPU cluster, as long as all machines are interconnecte ## 2. Configure and build FlexFlow -Follow steps 1 to 5 in [INSTALL.md](INSTALL.md) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. +Follow steps 1 to 5 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. 
You can skip step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI, which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance. @@ -66,6 +66,6 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su ## 4. Test FlexFlow -Follow step 6 in [INSTALL.md](INSTALL.md) to set environment variables. +Follow step 6 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to set environment variables. A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html). diff --git a/README.md b/README.md index e84bf20605..318d2e38da 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,9 @@ ## News 🔥: +* [09/02/2023] Adding AMD GPU support, released Docker images for ROCM 5.3->5.6 * [08/16/2023] Adding Starcoder model support -* [08/14/2023] Released Dockerfile for different CUDA versions +* [08/14/2023] Released Docker image for different CUDA versions ## Install FlexFlow @@ -31,10 +32,10 @@ pip install flexflow If you run into any issue during the install, or if you would like to use the C++ API without needing to install from source, you can also use our pre-built Docker package for different CUDA versions and the `hip_rocm` backend. To download and run our pre-built Docker container: ```bash -docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-11.8:latest +docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest ``` -To download a Docker container for a backend other than CUDA v11.8, you can replace the `cuda-11.8` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](../docker/README.md). +To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.4`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md). ### Build from source diff --git a/SERVE.md b/SERVE.md index e716392b32..60d0b566f0 100644 --- a/SERVE.md +++ b/SERVE.md @@ -16,19 +16,18 @@ existing systems by 1.3-2.0x for single-node, multi-GPU inference and by ## Quickstart -The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving. +The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). 
First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively. +We need to make sure the aggregated GPU memory and zero-copy memory are **both** sufficient to store LLM parameters in non-offloading serving. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving. ```python import flexflow.serve as ff ff.init( - { - "num_gpus": 4, - "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, - "tensor_parallelism_degree": 4, - "pipeline_parallelism_degree": 1, - } -) + num_gpus=4, + memory_per_gpu=14000, + zero_copy_memory_per_node=30000, + tensor_parallelism_degree=4, + pipeline_parallelism_degree=1 + ) ``` Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). ```python @@ -69,16 +68,14 @@ result = llm.generate("Here are some travel tips for Tokyo:\n") import flexflow.serve as ff -# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs +# Initialize the FlexFlow runtime. ff.init() takes a dictionary (as a positional argument) or named key-value parameters ff.init( - { - "num_gpus": 4, - "memory_per_gpu": 14000, - "zero_copy_memory_per_gpu": 30000, - "tensor_parallelism_degree": 4, - "pipeline_parallelism_degree": 1, - } -) + num_gpus=4, + memory_per_gpu=14000, + zero_copy_memory_per_node=30000, + tensor_parallelism_degree=4, + pipeline_parallelism_degree=1 + ) # Create the FlexFlow LLM llm = ff.LLM("decapoda-research/llama-7b-hf") @@ -189,7 +186,10 @@ We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruct FlexFlow Serve is still under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. -* AMD support. We are actively working on supporting FlexFlow Serve on AMD GPUs and welcome any contributions to this effort. +* AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs. +* Chatbot prompt templates and Multi-round conversations +* Support for FastAPI server +* Integration with LangChain for document question answering ## Acknowledgements This project is initiated by members from CMU, Stanford, and UCSD. We will be continuing developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as: diff --git a/docker/README.md b/docker/README.md index 916b78acf6..b7ec7c3631 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,50 +2,61 @@ This folder contains the Dockerfiles and scripts that you can use to quickly run FlexFlow with no manual installation required. To use the containers, follow the steps below. ## Prerequisites -You will need a machine with a NVIDIA GPU, with drivers installed. You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine. +You can build and run the FlexFlow Docker images on any machine, but if you want to train or serve a model, you will need a machine with a NVIDIA or AMD GPU, with drivers installed. 
You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine. If using an AMD GPU, follow the [Deploy ROCm Docker containers](https://rocm.docs.amd.com/en/latest/deploy/docker.html) instructions. ## Downloading a pre-built package The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow): -* `flexflow`: the pre-built version of FlexFlow. We currently publish one version targeting GPUs with a `hip_rocm` backend (`flexflow-hip_rocm`), and several versions for CUDA GPUs (one for each of the following CUDA versions 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, and 11.8). The CUDA images are named `flexflow-cuda-`, e.g. [flexflow-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-11.8) -* `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. You may find them useful if you want to build FlexFlow yourself. We also publish one version of `flexflow-environment` for `hip_rocm` and one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 11.8 is tagged [flexflow-environment-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-11.8). +* `flexflow`: the pre-built version of FlexFlow. We currently publish four versions targeting AMD GPUs (ROCm versions: 5.3, 5.4, 5.5 and 5.6), and several versions for CUDA GPUs (CUDA versions: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0). The images are named `flexflow-<backend>-<version>`, e.g. [flexflow-hip_rocm-5.6](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm-5.6) or [flexflow-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-12.0) +* `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. You may find them useful if you want to build FlexFlow yourself. We also publish four versions of `flexflow-environment` for AMD GPUs and, for NVIDIA GPUs, one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 12.0 is tagged [flexflow-environment-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-12.0). The easiest way to download any of the Docker containers above is to call: ``` -FF_GPU_BACKEND= cuda_version= ./docker/pull.sh +./docker/pull.sh <CONTAINER_NAME> ``` -where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`), and `FF_GPU_BACKEND`/`cuda_version` are optional environment variables you can use if you wish to download the docker image for a GPU backend and/or cuda version other than those installed on your machine (leaving these variables unset will let the script autodetect which version to download depending on your setup). +where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`). 
By default, the script will assume a NVIDIA backend and attempt to detect the CUDA version on your machine, to download the relevant container. If your machine has AMD GPUs, or no GPUs, or if you want to specify the CUDA/ROCM version to download, set the environment variables below: + +* `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be downloaded. +* `cuda_version` (supported options: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored +* `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored. + After downloading a container you can use the `run.sh` script to run it by following the instructions in the section below. ## Building a Docker container from scratch -If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g.: `export cuda_version=11.8`. Leaving the `cuda_version` variable blank will let the script autodetect the CUDA version installed on the host machine, and build for that version. Setting the `cuda_version` env will have no effect when building for a GPU backend other than CUDA. +If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](https://flexflow.readthedocs.io/en/latest/installation.html) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g.: `export cuda_version=12.0`. Leaving the `cuda_version` variable blank will let the script autodetect the CUDA version installed on the host machine, and build for that version. Setting the `cuda_version` env will have no effect when building for a GPU backend other than CUDA. Similarly, you can pick the ROCm version by setting `hip_version` when the backend is `FF_GPU_BACKEND=hip_rocm`, whereas the env will be ignored for non-HIP backends. 
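+For example, a minimal sketch of a build for an AMD machine with ROCm 5.6 (a hypothetical setup; adjust the values to your hardware) could be:
+
+```
+export FF_GPU_BACKEND=hip_rocm
+export hip_version=5.6
+./docker/build.sh flexflow
+```
+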
To build the FlexFlow container, run (the `flexflow` argument of the build script can be omitted): ``` -FF_GPU_BACKEND= cuda_version= ./docker/build.sh flexflow +./docker/build.sh flexflow ``` If you only want to build the `flexflow-environment` image (the base layers of the `flexflow` container, used in CI and for other internal purposes), run: ``` -FF_GPU_BACKEND= cuda_version= ./docker/build.sh flexflow-environment +./docker/build.sh flexflow-environment ``` ## Running a Docker container -After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND` and `cuda_version` optional environment variables to run the docker image with the desired GPU backend and CUDA version. Leaving these variables unset will instruct the script to autodetect the GPU backend and CUDA version installed on the current machine and run the Docker container with it if available. +After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND`, `cuda_version` and `hip_version` optional environment variables to run the docker image with the desired GPU backend and CUDA/HIP version: + +* `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be run. +* `cuda_version` (supported options: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored +* `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored. + +Leaving these variables unset will assume a GPU backend, and instruct the script to autodetect the CUDA version installed on the current machine and run the Docker container with it if available. ``` -FF_GPU_BACKEND= cuda_version= ./docker/run.sh --image_name flexflow +./docker/run.sh --image_name flexflow ``` If you wish to run the `flexflow-environment` container, run: ``` -FF_GPU_BACKEND= cuda_version= ./docker/run.sh --image_name flexflow-environment +./docker/run.sh --image_name flexflow-environment ``` N.B.: If you don't have GPUs available on the machine, or you wish to run the docker image without attaching GPUs, you can set the environment variable `ATTACH_GPUS=false` before running the script. diff --git a/docs/source/jupyter.rst b/docs/source/jupyter.rst deleted file mode 100644 index 2e37bfb183..0000000000 --- a/docs/source/jupyter.rst +++ /dev/null @@ -1,6 +0,0 @@ -***************** -Jupyter Notebook -***************** - -.. mdinclude:: ../../jupyter_notebook/README.md - :start-line: 2 diff --git a/jupyter_notebook/README.md b/jupyter_notebook/README.md index 70d94f0f16..fe25df6dbf 100644 --- a/jupyter_notebook/README.md +++ b/jupyter_notebook/README.md @@ -9,7 +9,7 @@ the in-browser jupyter notebook UI. 
## Quick Start ### Pre-requisite * Python >= 3.6 -* FlexFlow Python binding needs to be installed, please check the [installation guide](https://github.com/flexflow/FlexFlow/blob/master/INSTALL.md) +* FlexFlow Python binding needs to be installed, please check the [installation guide](https://flexflow.readthedocs.io/en/latest/installation.html) * Install Jupyter notebook pip install notebook From b2ec6cb5d2b898db1ad4df32adf5699bc48aaac7 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Sun, 3 Sep 2023 10:15:07 -0400 Subject: [PATCH 221/344] fix group attention issue (#1062) * fix * tp degree > num_kv_heads --------- Co-authored-by: Gabriele Oliaro Co-authored-by: Zhihao Jia --- python/flexflow/serve/models/llama.py | 10 +++++++--- src/ops/inc_multihead_self_attention.cc | 4 +++- src/ops/tree_inc_multihead_self_attention.cc | 4 +++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index c716bff34d..9eacccfda6 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -26,6 +26,7 @@ def __init__(self, hf_config): self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.num_attention_heads = hf_config.num_attention_heads + self.num_key_value_heads = hf_config.num_attention_heads if hf_config.num_key_value_heads is None else hf_config.num_key_value_heads self.hidden_size = hf_config.hidden_size self.rms_norm_eps = hf_config.rms_norm_eps self.intermediate_size = hf_config.intermediate_size @@ -106,10 +107,11 @@ def build_model(self): ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: - mha = ffmodel.spec_inc_multihead_self_attention( + mha = ffmodel.spec_inc_multiquery_self_attention( attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, + self.llama_config.num_key_value_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size @@ -124,10 +126,11 @@ def build_model(self): name=f"layers_{i}_attention_weight", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - mha = ffmodel.inc_multihead_self_attention_verify( + mha = ffmodel.inc_multiquery_self_attention_verify( attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, + self.llama_config.num_key_value_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size @@ -142,10 +145,11 @@ def build_model(self): name=f"layers_{i}_attention_weight", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multihead_self_attention( + mha = ffmodel.inc_multiquery_self_attention( attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, + self.llama_config.num_key_value_heads, self.llama_config.hidden_size // self.llama_config.num_attention_heads, self.llama_config.hidden_size diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index f4f64aee8a..ec8bc8839e 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -674,7 +674,9 @@ OpMeta *IncMultiHeadSelfAttention::init_task( assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; - int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree; + int 
num_kv_heads = + attn->num_kv_heads / attn->tensor_parallelism_degree + + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index f10c9a8f0f..e26c306cf1 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -676,7 +676,9 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); // int num_q_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; - int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree; + int num_kv_heads = + attn->num_kv_heads / attn->tensor_parallelism_degree + + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); From 1f5fe029fafa649d1bb2bc18266d307596876f7a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Sep 2023 21:09:21 -0400 Subject: [PATCH 222/344] Add method to initialize FlexFlow runtime (#1089) * added init to flexflow core, native support of jupyter * cleanup * update workflows * fix * fixed warnings * fix * fix * update build * install nccl in ci for faster workflows * add jq to dependencies * cleanup * fix python tests * fix * fix legion python issue * fix * fix * fix * fix * fix * fix * fix * remove more warnings * remove redundant function * fix * fix * remove debug prints * install jupyter notebook in docker --------- Co-authored-by: Zhihao Jia --- .github/workflows/build.yml | 8 +- .github/workflows/docker-build.yml | 6 +- .github/workflows/helpers/install_cudnn.sh | 2 +- .../workflows/helpers/install_dependencies.sh | 4 +- .github/workflows/helpers/install_nccl.sh | 51 +++ .github/workflows/pip-install.yml | 5 +- INSTALL.md | 2 +- conda/environment.yml | 1 + conda/flexflow.yml | 1 + docker/flexflow-environment/Dockerfile | 4 +- examples/python/keras/callback.py | 3 + examples/python/keras/elementwise_max_min.py | 4 +- .../python/keras/elementwise_mul_broadcast.py | 5 +- examples/python/keras/func_cifar10_alexnet.py | 2 + examples/python/keras/func_cifar10_cnn.py | 3 + .../python/keras/func_cifar10_cnn_concat.py | 2 + .../keras/func_cifar10_cnn_concat_model.py | 3 + .../func_cifar10_cnn_concat_seq_model.py | 3 + .../python/keras/func_cifar10_cnn_nested.py | 3 + .../python/keras/func_cifar10_cnn_net2net.py | 2 + examples/python/keras/func_mnist_cnn.py | 3 + .../python/keras/func_mnist_cnn_concat.py | 3 + examples/python/keras/func_mnist_mlp.py | 3 + .../python/keras/func_mnist_mlp_concat.py | 3 + .../python/keras/func_mnist_mlp_concat2.py | 3 + .../python/keras/func_mnist_mlp_net2net.py | 5 +- examples/python/keras/gather.py | 4 +- examples/python/keras/identity_loss.py | 4 +- examples/python/keras/reduce_sum.py | 4 +- examples/python/keras/regularizer.py | 4 +- examples/python/keras/reshape.py | 3 + examples/python/keras/rsqrt.py | 4 +- examples/python/keras/seq_cifar10_cnn.py | 3 + examples/python/keras/seq_mnist_cnn.py | 3 + examples/python/keras/seq_mnist_cnn_nested.py | 3 + .../python/keras/seq_mnist_cnn_net2net.py | 3 + examples/python/keras/seq_mnist_mlp.py | 3 + .../python/keras/seq_mnist_mlp_net2net.py | 3 + examples/python/keras/seq_reuters_mlp.py | 4 + examples/python/keras/unary.py | 4 +- examples/python/native/alexnet.py | 13 +- 
examples/python/native/cifar10_cnn.py | 13 +- examples/python/native/cifar10_cnn_attach.py | 2 + examples/python/native/cifar10_cnn_concat.py | 4 + examples/python/native/mnist_cnn.py | 13 +- examples/python/native/mnist_mlp.py | 13 +- examples/python/native/mnist_mlp_attach.py | 2 + examples/python/native/print_layers.py | 3 + examples/python/native/split.py | 3 + include/flexflow/config.h | 2 +- include/flexflow/machine_view.h | 2 +- include/flexflow/ops/sampling.h | 2 +- include/flexflow/utils/hip_helper.h | 2 +- inference/utils/download_hf_model.py | 3 - python/flexflow/config.py | 69 ++- python/flexflow/core/__init__.py | 162 +++++-- python/flexflow/core/flexflow_cffi.py | 401 +++++++++--------- python/flexflow/flexflow_python | 3 +- python/flexflow/jupyter.py | 37 -- python/flexflow/serve/__init__.py | 229 +++++----- python/flexflow_python_build.py | 8 +- src/ops/inc_multihead_self_attention.cpp | 47 +- src/ops/spec_inc_multihead_self_attention.cpp | 50 ++- src/ops/tree_inc_multihead_self_attention.cpp | 45 +- tests/align/align_create_tensor_ff.py | 12 +- tests/align/align_utils.py | 7 +- tests/align/test_all_operators.sh | 2 +- tests/cpp_gpu_tests.sh | 10 - tests/multi_gpu_tests.sh | 101 +++-- tests/multinode_helpers/mpi_wrapper1.sh | 1 - tests/multinode_helpers/mpi_wrapper2.sh | 11 +- tests/python_interface_test.sh | 19 +- 72 files changed, 853 insertions(+), 621 deletions(-) create mode 100755 .github/workflows/helpers/install_nccl.sh delete mode 100644 python/flexflow/jupyter.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1c6eff4a12..d05856f1a9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -67,7 +67,7 @@ jobs: uses: conda-incubator/setup-miniconda@v2 with: activate-environment: flexflow - environment-file: conda/environment.yml + environment-file: conda/flexflow.yml auto-activate-base: false - name: Build FlexFlow @@ -131,15 +131,14 @@ jobs: cd build ./tests/unit/unit-test - - name: Check availability of Python flexflow.core module + - name: Check availability of flexflow modules in Python run: | if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH" fi # Remove build folder to check that the installed version can run independently of the build files rm -rf build - export CPU_ONLY_TEST=1 - python -c "import flexflow.core; exit()" + python -c "import flexflow.core; import flexflow.serve as ff; exit()" makefile-build: name: Build FlexFlow with the Makefile @@ -186,5 +185,4 @@ jobs: cd python make -j $n_build_cores - export CPU_ONLY_TEST=1 python -c 'import flexflow.core' diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index b0ca251510..2234ec60aa 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -100,7 +100,7 @@ jobs: echo "Skipping build to save time" fi - - name: Check availability of Python flexflow.core module + - name: Check availability of flexflow modules in Python if: ${{ matrix.gpu_backend == 'cuda' }} env: deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} @@ -108,9 +108,9 @@ jobs: run: | if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then if [[ $FF_GPU_BACKEND == "cuda" ]]; then - docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "export 
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'" + docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" else - docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "python -c 'import flexflow.core; exit()'" + docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" fi else echo "Skipping test to save time" diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh index 75e59109eb..7c11a4a420 100755 --- a/.github/workflows/helpers/install_cudnn.sh +++ b/.github/workflows/helpers/install_cudnn.sh @@ -44,7 +44,7 @@ elif [[ "$cuda_version" == "11.7" ]]; then elif [[ "$cuda_version" == "11.8" ]]; then CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "11.8" ]]; then +elif [[ "$cuda_version" == "12.0" ]]; then echo "CUDNN support for CUDA version 12.0 not yet added" exit 1 fi diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh index 1357882b5d..6435a37eea 100755 --- a/.github/workflows/helpers/install_dependencies.sh +++ b/.github/workflows/helpers/install_dependencies.sh @@ -7,7 +7,7 @@ cd "${BASH_SOURCE[0]%/*}" # General dependencies echo "Installing apt dependencies..." -sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev && \ +sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev jq && \ sudo rm -rf /var/lib/apt/lists/* FF_GPU_BACKEND=${FF_GPU_BACKEND:-"cuda"} @@ -20,6 +20,8 @@ fi if [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then # Install CUDNN ./install_cudnn.sh + # Install NCCL + ./install_nccl.sh fi # Install HIP dependencies if needed if [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh new file mode 100755 index 0000000000..ca88668d84 --- /dev/null +++ b/.github/workflows/helpers/install_nccl.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -euo pipefail +set -x + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Add NCCL key ring +ubuntu_version=$(lsb_release -rs) +ubuntu_version=${ubuntu_version//./} +wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb" +sudo dpkg -i cuda-keyring_1.0-1_all.deb +sudo apt update -y +rm -f cuda-keyring_1.0-1_all.deb + +# Install NCCL +cuda_version=${1:-11.8.0} +cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') +echo "Installing NCCL for CUDA version: ${cuda_version} ..." 
+ +# We need to run a different install command based on the CUDA version, otherwise running `sudo apt install libnccl2 libnccl-dev` +# will automatically upgrade CUDA to the latest version. + +if [[ "$cuda_version" == "11.0" ]]; then + sudo apt install libnccl2=2.15.5-1+cuda11.0 libnccl-dev=2.15.5-1+cuda11.0 +elif [[ "$cuda_version" == "11.1" ]]; then + sudo apt install libnccl2=2.8.4-1+cuda11.1 libnccl-dev=2.8.4-1+cuda11.1 +elif [[ "$cuda_version" == "11.2" ]]; then + sudo apt install libnccl2=2.8.4-1+cuda11.2 libnccl-dev=2.8.4-1+cuda11.2 +elif [[ "$cuda_version" == "11.3" ]]; then + sudo apt install libnccl2=2.9.9-1+cuda11.3 libnccl-dev=2.9.9-1+cuda11.3 +elif [[ "$cuda_version" == "11.4" ]]; then + sudo apt install libnccl2=2.11.4-1+cuda11.4 libnccl-dev=2.11.4-1+cuda11.4 +elif [[ "$cuda_version" == "11.5" ]]; then + sudo apt install libnccl2=2.11.4-1+cuda11.5 libnccl-dev=2.11.4-1+cuda11.5 +elif [[ "$cuda_version" == "11.6" ]]; then + sudo apt install libnccl2=2.12.12-1+cuda11.6 libnccl-dev=2.12.12-1+cuda11.6 +elif [[ "$cuda_version" == "11.7" ]]; then + sudo apt install libnccl2=2.14.3-1+cuda11.7 libnccl-dev=2.14.3-1+cuda11.7 +elif [[ "$cuda_version" == "11.8" ]]; then + sudo apt install libnccl2=2.16.5-1+cuda11.8 libnccl-dev=2.16.5-1+cuda11.8 +elif [[ "$cuda_version" == "12.0" ]]; then + sudo apt install libnccl2=2.18.3-1+cuda12.0 libnccl-dev=2.18.3-1+cuda12.0 +elif [[ "$cuda_version" == "12.1" ]]; then + sudo apt install libnccl2=2.18.3-1+cuda12.1 libnccl-dev=2.18.3-1+cuda12.1 +elif [[ "$cuda_version" == "12.2" ]]; then + sudo apt install libnccl2=2.18.3-1+cuda12.2 libnccl-dev=2.18.3-1+cuda12.2 +else + echo "Installing NCCL for CUDA version ${cuda_version} is not supported" + exit 1 +fi diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index 695ed9857b..3562134987 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -69,9 +69,8 @@ jobs: # Remove build folder to check that the installed version can run independently of the build files rm -rf build - - name: Check availability of Python flexflow.core module + - name: Check availability of flexflow modules in Python run: | export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH" sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1" - export CPU_ONLY_TEST=1 - python -c "import flexflow.core; exit()" + python -c 'import flexflow.core; import flexflow.serve as ff; exit()' diff --git a/INSTALL.md b/INSTALL.md index 72993f3330..a197df24ed 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -30,7 +30,7 @@ If you are planning to build the Python interface, you will need to install seve The `conda` environment can be created and activated as: ``` -conda env create -f conda/environment.yml +conda env create -f conda/flexflow.yml conda activate flexflow ``` diff --git a/conda/environment.yml b/conda/environment.yml index c1acd7b3da..9ae0dc9c7a 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -9,6 +9,7 @@ dependencies: - pybind11 - rust - cmake-build-extension + - jq - pip - pip: - qualname>=0.1.0 diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 9ff7f3957a..c9226269f2 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -9,6 +9,7 @@ dependencies: - pybind11 - rust - cmake-build-extension + - jq - pytest - pip - pip: diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index f1ebdcc28a..774c585b44 100644 --- a/docker/flexflow-environment/Dockerfile +++ 
b/docker/flexflow-environment/Dockerfile @@ -5,7 +5,7 @@ LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow LABEL org.opencontainers.image.description="FlexFlow environment container" # Install basic dependencies -RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev && \ +RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq && \ rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends build-essential apt-utils \ @@ -79,7 +79,7 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1 # Install CPU-only Pytorch and related dependencies RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops -RUN pip3 install tensorflow +RUN pip3 install tensorflow notebook # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/examples/python/keras/callback.py b/examples/python/keras/callback.py index f4ebc03d17..c647822957 100644 --- a/examples/python/keras/callback.py +++ b/examples/python/keras/callback.py @@ -20,6 +20,7 @@ from flexflow.keras.datasets import cifar10 from flexflow.keras import backend as K from accuracy import ModelAccuracy +import flexflow.core as ff import numpy as np @@ -68,4 +69,6 @@ def top_level_task(): if __name__ == "__main__": print("Functional API, cifar10 cnn callback") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/keras/elementwise_max_min.py b/examples/python/keras/elementwise_max_min.py index 95291f1273..52a80b431b 100644 --- a/examples/python/keras/elementwise_max_min.py +++ b/examples/python/keras/elementwise_max_min.py @@ -1,5 +1,6 @@ from flexflow.keras.layers import Dense, Input, Maximum, Minimum import flexflow.keras.optimizers +import flexflow.core as ff import numpy as np @@ -54,7 +55,8 @@ def elementwise_min(): epochs = 2 ) - if __name__ == '__main__': + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) elementwise_max() elementwise_min() diff --git a/examples/python/keras/elementwise_mul_broadcast.py b/examples/python/keras/elementwise_mul_broadcast.py index d68476a6cb..1405871a7a 100644 --- a/examples/python/keras/elementwise_mul_broadcast.py +++ b/examples/python/keras/elementwise_mul_broadcast.py @@ -1,6 +1,6 @@ from flexflow.keras.layers import Dense, Input, Reshape, Multiply import flexflow.keras.optimizers - +import flexflow.core as ff import numpy as np def broadcast1(): @@ -92,8 +92,9 @@ def broadcast_both(): epochs = 2 ) - if __name__ == '__main__': + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) broadcast1() broadcast2() broadcast_both() diff --git a/examples/python/keras/func_cifar10_alexnet.py b/examples/python/keras/func_cifar10_alexnet.py index c0ade0b722..a4f8dc61ac 100644 --- a/examples/python/keras/func_cifar10_alexnet.py +++ b/examples/python/keras/func_cifar10_alexnet.py @@ -77,5 +77,7 @@ def top_level_task(): if __name__ == "__main__": print("Functional API, cifar10 alexnet") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() 
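
The Keras example hunks below all apply the same boilerplate: read the runtime configs and call init_flexflow_runtime() before the example's top_level_task() runs. As a minimal sketch of the resulting entry point (the model-building body is elided and the printed label is just a placeholder, not taken from any one file):

    import flexflow.core as ff

    def top_level_task():
        # build, compile, and fit the model as before
        ...

    if __name__ == "__main__":
        print("Functional API, example")
        # Read optional configs (num_gpus, memory_per_gpu, ...) from a
        # -config-file JSON on the command line, then start the runtime
        # before any FlexFlow objects are created.
        configs = ff.get_configs()
        ff.init_flexflow_runtime(configs)
        top_level_task()
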
diff --git a/examples/python/keras/func_cifar10_cnn.py b/examples/python/keras/func_cifar10_cnn.py index 423541386f..ce0358da53 100644 --- a/examples/python/keras/func_cifar10_cnn.py +++ b/examples/python/keras/func_cifar10_cnn.py @@ -61,7 +61,10 @@ def top_level_task(): model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) + if __name__ == "__main__": print("Functional API, cifar10 cnn") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_concat.py b/examples/python/keras/func_cifar10_cnn_concat.py index 72dfdeffaf..4fe0f5ce18 100644 --- a/examples/python/keras/func_cifar10_cnn_concat.py +++ b/examples/python/keras/func_cifar10_cnn_concat.py @@ -75,5 +75,7 @@ def top_level_task(): if __name__ == "__main__": print("Functional API, cifar10 cnn concat") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_concat_model.py b/examples/python/keras/func_cifar10_cnn_concat_model.py index 39885bac8c..c8838de1eb 100644 --- a/examples/python/keras/func_cifar10_cnn_concat_model.py +++ b/examples/python/keras/func_cifar10_cnn_concat_model.py @@ -75,7 +75,10 @@ def top_level_task(): model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) + if __name__ == "__main__": print("Functional API, cifar10 cnn concat model") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_concat_seq_model.py b/examples/python/keras/func_cifar10_cnn_concat_seq_model.py index cda95beb49..3e4f939283 100644 --- a/examples/python/keras/func_cifar10_cnn_concat_seq_model.py +++ b/examples/python/keras/func_cifar10_cnn_concat_seq_model.py @@ -68,7 +68,10 @@ def top_level_task(): model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) + if __name__ == "__main__": print("Functional API, cifar10 cnn concat sequential model") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_nested.py b/examples/python/keras/func_cifar10_cnn_nested.py index def8a6bcf4..7391ba5a2b 100644 --- a/examples/python/keras/func_cifar10_cnn_nested.py +++ b/examples/python/keras/func_cifar10_cnn_nested.py @@ -67,7 +67,10 @@ def top_level_task(): model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) + if __name__ == "__main__": print("Functional API, cifar10 cnn nested") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_cifar10_cnn_net2net.py b/examples/python/keras/func_cifar10_cnn_net2net.py index 5434e28aca..695a1157dd 100644 --- a/examples/python/keras/func_cifar10_cnn_net2net.py +++ b/examples/python/keras/func_cifar10_cnn_net2net.py @@ -120,5 +120,7 @@ def top_level_task(): if __name__ == "__main__": print("Functional API, cifarf10 cnn teach student") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_mnist_cnn.py b/examples/python/keras/func_mnist_cnn.py index 
a81ddd0f94..8f2041dfe2 100644 --- a/examples/python/keras/func_mnist_cnn.py +++ b/examples/python/keras/func_mnist_cnn.py @@ -70,7 +70,10 @@ def top_level_task(): model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) + if __name__ == "__main__": print("Functional API, mnist cnn") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_mnist_cnn_concat.py b/examples/python/keras/func_mnist_cnn_concat.py index 54c1f32d36..64bb2cdbb0 100644 --- a/examples/python/keras/func_mnist_cnn_concat.py +++ b/examples/python/keras/func_mnist_cnn_concat.py @@ -61,7 +61,10 @@ def top_level_task(): model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) + if __name__ == "__main__": print("Functional API, mnist cnn concat") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_mnist_mlp.py b/examples/python/keras/func_mnist_mlp.py index 5521f193c1..ddf2022366 100644 --- a/examples/python/keras/func_mnist_mlp.py +++ b/examples/python/keras/func_mnist_mlp.py @@ -54,7 +54,10 @@ def top_level_task(): model.fit(x_train, y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) + if __name__ == "__main__": print("Functional API, mnist mlp") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_mnist_mlp_concat.py b/examples/python/keras/func_mnist_mlp_concat.py index 29b982cea8..6b282f65e6 100644 --- a/examples/python/keras/func_mnist_mlp_concat.py +++ b/examples/python/keras/func_mnist_mlp_concat.py @@ -76,7 +76,10 @@ def top_level_task(): model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) + if __name__ == "__main__": print("Functional API, mnist mlp concat") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_mnist_mlp_concat2.py b/examples/python/keras/func_mnist_mlp_concat2.py index 5a35bd9f8b..b309a00187 100644 --- a/examples/python/keras/func_mnist_mlp_concat2.py +++ b/examples/python/keras/func_mnist_mlp_concat2.py @@ -87,7 +87,10 @@ def top_level_task(): model.fit([x_train, x_train, x_train], y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) + if __name__ == "__main__": print("Functional API, mnist mlp concat with input") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/func_mnist_mlp_net2net.py b/examples/python/keras/func_mnist_mlp_net2net.py index ed8589e22e..0b44029938 100644 --- a/examples/python/keras/func_mnist_mlp_net2net.py +++ b/examples/python/keras/func_mnist_mlp_net2net.py @@ -88,7 +88,10 @@ def top_level_task(): student_model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) + if __name__ == "__main__": print("Functional API, mnist mlp teach student") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() - gc.collect() \ No newline at end of file + gc.collect() diff --git a/examples/python/keras/gather.py 
b/examples/python/keras/gather.py index 15ccd61579..f14d737d17 100644 --- a/examples/python/keras/gather.py +++ b/examples/python/keras/gather.py @@ -1,7 +1,7 @@ from flexflow.keras.layers import Dense, Input, Reshape from flexflow.keras.backend.internal import gather import flexflow.keras.optimizers - +import flexflow.core as ff import numpy as np @@ -42,4 +42,6 @@ def gather_example(): if __name__ == '__main__': + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) gather_example() diff --git a/examples/python/keras/identity_loss.py b/examples/python/keras/identity_loss.py index d0396c6d46..8e26fc246b 100644 --- a/examples/python/keras/identity_loss.py +++ b/examples/python/keras/identity_loss.py @@ -15,7 +15,7 @@ from flexflow.keras.layers import Dense, Input, Reshape, Multiply import flexflow.keras.optimizers - +import flexflow.core as ff import numpy as np def test_identity_loss(): @@ -36,4 +36,6 @@ def test_identity_loss(): if __name__ == "__main__": + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) test_identity_loss() diff --git a/examples/python/keras/reduce_sum.py b/examples/python/keras/reduce_sum.py index 3857738d4b..33030e2cec 100644 --- a/examples/python/keras/reduce_sum.py +++ b/examples/python/keras/reduce_sum.py @@ -15,7 +15,7 @@ from flexflow.keras.layers import Dense, Input, Reshape, Multiply import flexflow.keras.optimizers - +import flexflow.core as ff import numpy as np def test_reduce_sum1(): @@ -74,6 +74,8 @@ def test_reduce_sum3(): if __name__ == "__main__": + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) test_reduce_sum1() test_reduce_sum2() test_reduce_sum3() diff --git a/examples/python/keras/regularizer.py b/examples/python/keras/regularizer.py index 3b1e30d04d..3a24129db2 100644 --- a/examples/python/keras/regularizer.py +++ b/examples/python/keras/regularizer.py @@ -2,7 +2,7 @@ from flexflow.keras.layers import Dense, Input, Reshape from flexflow.keras.backend.internal import gather import flexflow.keras.optimizers - +import flexflow.core as ff import numpy as np @@ -26,4 +26,6 @@ def regularizer_example(): if __name__ == '__main__': + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) regularizer_example() diff --git a/examples/python/keras/reshape.py b/examples/python/keras/reshape.py index 1acce1b2b6..ae756a8f70 100644 --- a/examples/python/keras/reshape.py +++ b/examples/python/keras/reshape.py @@ -55,7 +55,10 @@ def top_level_task(): print(model.summary()) model.fit(x_train, y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) + if __name__ == "__main__": print("Functional API, mnist mlp") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() gc.collect() diff --git a/examples/python/keras/rsqrt.py b/examples/python/keras/rsqrt.py index be55c8a1fd..e33873ecd5 100644 --- a/examples/python/keras/rsqrt.py +++ b/examples/python/keras/rsqrt.py @@ -16,7 +16,7 @@ from flexflow.keras.layers import Dense, Input from flexflow.keras.backend.internal import rsqrt import flexflow.keras.optimizers - +import flexflow.core as ff import numpy as np def test_rsqrt(): @@ -40,4 +40,6 @@ def test_rsqrt(): if __name__ == "__main__": + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) test_rsqrt() diff --git a/examples/python/keras/seq_cifar10_cnn.py b/examples/python/keras/seq_cifar10_cnn.py index 80f4390d4c..281a09ed70 100644 --- a/examples/python/keras/seq_cifar10_cnn.py +++ b/examples/python/keras/seq_cifar10_cnn.py 
@@ -54,6 +54,9 @@ def top_level_task(): model.fit(x_train, y_train, epochs=80, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)]) + if __name__ == "__main__": print("Sequantial model, cifar10 cnn") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/keras/seq_mnist_cnn.py b/examples/python/keras/seq_mnist_cnn.py index eaf0fdfc16..09ad4ea4cf 100644 --- a/examples/python/keras/seq_mnist_cnn.py +++ b/examples/python/keras/seq_mnist_cnn.py @@ -55,6 +55,9 @@ def top_level_task(): model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) + if __name__ == "__main__": print("Sequential model, mnist cnn") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/keras/seq_mnist_cnn_nested.py b/examples/python/keras/seq_mnist_cnn_nested.py index 2c92349cd6..628129ddb9 100644 --- a/examples/python/keras/seq_mnist_cnn_nested.py +++ b/examples/python/keras/seq_mnist_cnn_nested.py @@ -65,6 +65,9 @@ def top_level_task(): model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)]) + if __name__ == "__main__": print("Sequential model, mnist cnn nested model") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/keras/seq_mnist_cnn_net2net.py b/examples/python/keras/seq_mnist_cnn_net2net.py index 4b9c9c16ba..e2a04ba686 100644 --- a/examples/python/keras/seq_mnist_cnn_net2net.py +++ b/examples/python/keras/seq_mnist_cnn_net2net.py @@ -98,6 +98,9 @@ def top_level_task(): create_student_model_cnn(teacher_model, num_classes, x_train, y_train) + if __name__ == "__main__": print("Sequential model, mnist mlp teacher student") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/keras/seq_mnist_mlp.py b/examples/python/keras/seq_mnist_mlp.py index 21c7435eb7..46b774a2e1 100644 --- a/examples/python/keras/seq_mnist_mlp.py +++ b/examples/python/keras/seq_mnist_mlp.py @@ -55,6 +55,9 @@ def top_level_task(): model.fit(x_train, y_train, epochs=20, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)]) model.evaluate(x=x_train, y=y_train) + if __name__ == "__main__": print("Sequential model, mnist mlp") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/keras/seq_mnist_mlp_net2net.py b/examples/python/keras/seq_mnist_mlp_net2net.py index 628f76db3a..c7a7d7a6f8 100644 --- a/examples/python/keras/seq_mnist_mlp_net2net.py +++ b/examples/python/keras/seq_mnist_mlp_net2net.py @@ -91,6 +91,9 @@ def top_level_task(): create_student_model_mlp(teacher_model, num_classes, x_train, y_train) + if __name__ == "__main__": print("Sequential model, mnist mlp teacher student") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/keras/seq_reuters_mlp.py b/examples/python/keras/seq_reuters_mlp.py index 5412ad0599..ed748f67d8 100644 --- a/examples/python/keras/seq_reuters_mlp.py +++ b/examples/python/keras/seq_reuters_mlp.py @@ -19,6 +19,7 @@ from flexflow.keras.datasets import reuters from flexflow.keras.preprocessing.text import Tokenizer from flexflow.keras.callbacks import Callback, VerifyMetrics +import flexflow.core as ff import numpy as np from accuracy 
import ModelAccuracy @@ -61,6 +62,9 @@ def top_level_task(): model.fit(x_train, y_train, epochs=epochs, callbacks=[VerifyMetrics(ModelAccuracy.REUTERS_MLP)]) + if __name__ == "__main__": print("Sequential model, reuters mlp") + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/keras/unary.py b/examples/python/keras/unary.py index 622e15dc2d..63c83b9af2 100644 --- a/examples/python/keras/unary.py +++ b/examples/python/keras/unary.py @@ -62,4 +62,6 @@ def top_level_task(): if __name__ == "__main__": print("alexnet keras") - top_level_task() \ No newline at end of file + configs = ff.get_configs() + ff.init_flexflow_runtime(configs) + top_level_task() diff --git a/examples/python/native/alexnet.py b/examples/python/native/alexnet.py index 61397cefc1..6d6e58a7f2 100644 --- a/examples/python/native/alexnet.py +++ b/examples/python/native/alexnet.py @@ -3,7 +3,7 @@ from accuracy import ModelAccuracy from PIL import Image -import argparse +import argparse, json import numpy as np @@ -133,7 +133,18 @@ def test_accuracy(): parser = argparse.ArgumentParser() parser.add_argument("-a", "--test_acc", action="store_true", help="Test accuracy flag") + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default=None, + ) args, unknown = parser.parse_known_args() + configs_dict = None + if args.config_file is not None: + with open(args.config_file) as f: + configs_dict = json.load(f) + init_flexflow_runtime(configs_dict) if args.test_acc: print("Testing cifar10 alexnet training accuracy") test_accuracy() diff --git a/examples/python/native/cifar10_cnn.py b/examples/python/native/cifar10_cnn.py index 44bdce4519..11bc936617 100644 --- a/examples/python/native/cifar10_cnn.py +++ b/examples/python/native/cifar10_cnn.py @@ -2,7 +2,7 @@ from flexflow.keras.datasets import cifar10 from accuracy import ModelAccuracy -import argparse +import argparse, json def top_level_task(): @@ -90,7 +90,18 @@ def test_accuracy(): parser = argparse.ArgumentParser() parser.add_argument("-a", "--test_acc", action="store_true", help="Test accuracy flag") + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default=None, + ) args, unknown = parser.parse_known_args() + configs_dict = None + if args.config_file is not None: + with open(args.config_file) as f: + configs_dict = json.load(f) + init_flexflow_runtime(configs_dict) if args.test_acc: print("Testing cifar10 cnn training accuracy") test_accuracy() diff --git a/examples/python/native/cifar10_cnn_attach.py b/examples/python/native/cifar10_cnn_attach.py index ba4288c8cd..e200cc03cf 100644 --- a/examples/python/native/cifar10_cnn_attach.py +++ b/examples/python/native/cifar10_cnn_attach.py @@ -144,4 +144,6 @@ def top_level_task(): if __name__ == "__main__": print("cifar10 cnn attach") + configs = get_configs() + init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/native/cifar10_cnn_concat.py b/examples/python/native/cifar10_cnn_concat.py index b177295ad6..7234116b3c 100644 --- a/examples/python/native/cifar10_cnn_concat.py +++ b/examples/python/native/cifar10_cnn_concat.py @@ -70,6 +70,10 @@ def top_level_task(): if accuracy < ModelAccuracy.CIFAR10_CNN.value: assert 0, 'Check Accuracy' + + if __name__ == "__main__": print("cifar10 cnn concat") + configs = get_configs() + init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/native/mnist_cnn.py b/examples/python/native/mnist_cnn.py index 6eabbe57db..f6787a4827 100644 --- a/examples/python/native/mnist_cnn.py +++ b/examples/python/native/mnist_cnn.py @@ -18,7 +18,7 @@ from flexflow.keras.datasets import mnist from accuracy import ModelAccuracy -import argparse +import argparse, json def top_level_task(): @@ -89,7 +89,18 @@ def test_accuracy(): parser = argparse.ArgumentParser() parser.add_argument("-a", "--test_acc", action="store_true", help="Test accuracy flag") + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default=None, + ) args, unknown = parser.parse_known_args() + configs_dict = None + if args.config_file is not None: + with open(args.config_file) as f: + configs_dict = json.load(f) + init_flexflow_runtime(configs_dict) if args.test_acc: print("Testing mnist cnn training accuracy") test_accuracy() diff --git a/examples/python/native/mnist_mlp.py b/examples/python/native/mnist_mlp.py index aefe7cfd57..8763eba40c 100644 --- a/examples/python/native/mnist_mlp.py +++ b/examples/python/native/mnist_mlp.py @@ -3,7 +3,7 @@ from flexflow.keras.datasets import mnist from accuracy import ModelAccuracy -import argparse +import argparse, json def top_level_task(): @@ -75,7 +75,18 @@ def test_accuracy(): parser = argparse.ArgumentParser() parser.add_argument("-a", "--test_acc", action="store_true", help="Test accuracy flag") + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default=None, + ) args, unknown = parser.parse_known_args() + configs_dict = None + if args.config_file is not None: + with open(args.config_file) as f: + configs_dict = json.load(f) + init_flexflow_runtime(configs_dict) if args.test_acc: print("Testing mnist mlp training accuracy") test_accuracy() diff --git a/examples/python/native/mnist_mlp_attach.py b/examples/python/native/mnist_mlp_attach.py index 6e7c8f8405..1294432ec5 100644 --- a/examples/python/native/mnist_mlp_attach.py +++ b/examples/python/native/mnist_mlp_attach.py @@ -134,4 +134,6 @@ def top_level_task(): if __name__ == "__main__": print("mnist mlp attach") + configs = get_configs() + init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/native/print_layers.py b/examples/python/native/print_layers.py index 22b87e0b86..481ecc3477 100644 --- a/examples/python/native/print_layers.py +++ b/examples/python/native/print_layers.py @@ -119,6 +119,9 @@ def top_level_task(): # ffmodel.print_layers(0) + if __name__ == "__main__": print("alexnet") + configs = get_configs() + init_flexflow_runtime(configs) top_level_task() diff --git a/examples/python/native/split.py b/examples/python/native/split.py index dfd8b0e572..f79ff04e14 100644 --- a/examples/python/native/split.py +++ b/examples/python/native/split.py @@ -77,6 +77,9 @@ def top_level_task(): # if accuracy < ModelAccuracy.CIFAR10_CNN.value: # assert 0, 'Check Accuracy' + if __name__ == "__main__": print("cifar10 cnn split") + configs = get_configs() + init_flexflow_runtime(configs) top_level_task() diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2479358bfb..9716060173 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -32,7 +32,7 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include #else -#include +#include #endif #endif diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h index 719792c10c..807b0c9c0d 100644 --- a/include/flexflow/machine_view.h +++ b/include/flexflow/machine_view.h @@ -7,7 +7,7 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include #else -#include +#include #endif #endif #include "flexflow/config.h" diff --git a/include/flexflow/ops/sampling.h b/include/flexflow/ops/sampling.h index d690888a39..1696582cc1 100644 --- a/include/flexflow/ops/sampling.h +++ b/include/flexflow/ops/sampling.h @@ -9,7 +9,7 @@ #include #include #elif defined(FF_USE_HIP_ROCM) -#include +#include #include #endif #include "flexflow/utils/memory_allocator.h" diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 7f6403c767..be6f4a713d 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -6,7 +6,7 @@ #include #include #ifdef FF_USE_NCCL -#include +#include #endif #define FatalError(s) \ diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 689730f32b..03fc8e1633 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -35,9 +35,6 @@ def parse_args(): def main(args): - # Initialize FF serve to gain access to its utils - ff.init_cpu() - if args.full_precision_only: data_types = ff.DataType.DT_FLOAT elif args.half_precision_only: diff --git a/python/flexflow/config.py b/python/flexflow/config.py index 44d460d832..d5f2131ae8 100644 --- a/python/flexflow/config.py +++ b/python/flexflow/config.py @@ -16,35 +16,60 @@ import os # python binding 
-_FF_PYTHON_BINDING = 'cffi' +_FF_PYTHON_BINDING = "cffi" -if 'FF_USE_CFFI' in os.environ: - use_pybind = not int(os.environ['FF_USE_CFFI']) +if "FF_USE_CFFI" in os.environ: + use_pybind = not int(os.environ["FF_USE_CFFI"]) else: - use_pybind = False + use_pybind = False if use_pybind: - _FF_PYTHON_BINDING = 'pybind11' + _FF_PYTHON_BINDING = "pybind11" else: - _FF_PYTHON_BINDING = 'cffi' - + _FF_PYTHON_BINDING = "cffi" + + def flexflow_python_binding(): - return _FF_PYTHON_BINDING - -# build docs -_FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS")) - -# init import -# It is used to run __init__.py in flexflow/core -# The following cases __init__.py is not needed: -# 1. build docs = True -_FF_INIT_IMPORT = _FF_BUILD_DOCS == False - -def flexflow_init_import(): - return _FF_INIT_IMPORT - + return _FF_PYTHON_BINDING + + +_FF_ALREADY_INITIALIZED = False + + +def flexflow_already_initialized(): + global _FF_ALREADY_INITIALIZED + return _FF_ALREADY_INITIALIZED + + +def set_flexflow_initialized(): + global _FF_ALREADY_INITIALIZED + if _FF_ALREADY_INITIALIZED == True: + raise RuntimeError( + "Attempting to set _FF_ALREADY_INITIALIZED=True, but _FF_ALREADY_INITIALIZED is already True" + ) + _FF_ALREADY_INITIALIZED = True + + # FlexFlow dir _FF_DIR = os.path.dirname(os.path.realpath(__file__)) + def flexflow_dir(): - return _FF_DIR + return _FF_DIR + +# Get runtime configs from the command line +def get_configs(): + import argparse,json + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default=None, + ) + args, unknown = parser.parse_known_args() + if args.config_file is not None: + with open(args.config_file) as f: + return json.load(f) + else: + return None diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 5b421a74ed..5e8e4ece81 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -20,54 +20,128 @@ import atexit import os import sys +import warnings +from typing import Optional from flexflow.config import * -from flexflow.jupyter import * +# check which python binding to use +if flexflow_python_binding() == "pybind11": + # print("Using pybind11 flexflow bindings.") + from .flexflow_pybind11 import * +else: + # print("Using cffi flexflow bindings.") + from .flexflow_cffi import * -if flexflow_init_import(): - os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" - from legion_cffi import ffi, is_legion_python - from .flexflowlib import flexflow_library - - # Default python mode - if is_legion_python == False: - print("Using Default Python") - _FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS")) - _CPU_ONLY = bool(os.environ.get('CPU_ONLY_TEST')) - if not _CPU_ONLY and not "-ll:gpu" in sys.argv: - os.environ["REALM_DEFAULT_ARGS"] = "-ll:gpu 1" - if not _FF_BUILD_DOCS and not _CPU_ONLY: - from legion_top import ( - legion_canonical_python_main, - legion_canonical_python_cleanup, - ) - import atexit, sys, os - # run from jupyter - if "ipykernel_launcher.py" in sys.argv[0]: - sys_argv = ["python", "dummy.py"] - argv_dict = load_jupyter_config() - for key, value in argv_dict.items(): - sys_argv.append(key) - sys_argv.append(str(value)) - else: - sys_argv = [ - "python", - ] + sys.argv - legion_canonical_python_main(sys_argv) - atexit.register(legion_canonical_python_cleanup) - else: - print("Using Legion Python") 
+ff_arg_to_sysarg = { + # General args + "num_gpus": "-ll:gpu", + "memory_per_gpu": "-ll:fsize", + "zero_copy_memory_per_node": "-ll:zsize", + "num_cpus": "-ll:cpu", + "legion_utility_processors": "-ll:util", + "profiling": "--profiling", + "fusion": "--fusion", + "disable_control_replication": "--disable-control-replication", + # Training args + "epochs": "--epochs", + "batch_size": "--batch-size", + "learning_rate": "--learning-rate", + "weight_decay": "--weight-decay", + "print_frequency": "--print-freq", + "dataset": "--dataset", + "budget": "--budget", + "search_budget": "--search-budget", + "alpha": "--alpha", + "search_alpha": "--search-alpha", + "simulator_workspace_size": "--simulator-workspace-size", + "import": "--import", + "import_strategy": "--import-strategy", + "export": "--export", + "export_strategy": "--export-strategy", + "only_data_parallel": "--only-data-parallel", + "enable_parameter_parallel": "--enable-parameter-parallel", + "enable_attribute_parallel": "--enable-attribute-parallel", + "allow_tensor_op_math_conversion": "--allow-tensor-op-math-conversion", + "search_overlap_backward_update": "--overlap", + "export_strategy_task_graph_file": "--taskgraph", + "include_costs_dot_graph": "--include-costs-dot-graph", + "export_strategy_computation_graph_file": "--compgraph", + "machine_model_version": "--machine-model-version", + "machine_model_file": "--machine-model-file", + "simulator_segment_size": "--simulator-segment-size", + "simulator_max_num_segments": "--simulator-max-num-segments", + "enable_propagation": "--enable-propagation", + "enable_inplace_optimizations": "--enable-inplace-optimization", + "search_num_nodes": "--search-num-nodes", + "search_num_workers": "--search-num-workers", + "base_optimize_threshold": "--base-optimize-threshold", + "python_data_loader_type": "--python-data-loader-type", + "substitution_json_path": "--substitution-json", + "perform_memory_search": "--memory-search", + # Inference args + "data_parallelism_degree": "-data-parallelism-degree", + "tensor_parallelism_degree": "-tensor-parallelism-degree", + "pipeline_parallelism_degree": "-pipeline-parallelism-degree", + "offload": "-offload", + "offload_reserve_space_size": "-offload-reserve-space-size", + "use_4bit_quantization": "--4bit-quantization", + "use_8bit_quantization": "--8bit-quantization" +} - flexflow_library.initialize() - # check which python binding to use - if flexflow_python_binding() == "pybind11": - print("Using pybind11 flexflow bindings.") - from .flexflow_pybind11 import * - else: - print("Using cffi flexflow bindings.") - from .flexflow_cffi import * +def init_flexflow_runtime(configs_dict: Optional[dict] = None, **kwargs): + if not flexflow_already_initialized(): + os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" + from legion_cffi import is_legion_python + from .flexflowlib import flexflow_library -else: - pass + # Default python mode + if is_legion_python == False: + # print("Using Default Python") + from legion_top import ( + legion_canonical_python_main, + legion_canonical_python_cleanup, + ) + + # Either a configs_dict dictionary, or individual key-value parameters should be passed. Not both. 
+ if configs_dict is not None and len(kwargs.items()) > 0: + raise ValueError("Cannot pass both configs_dict and individual args") + ff_args = configs_dict if configs_dict is not None else dict(kwargs.items()) + # Check presence of mandatory parameters + if ( + "num_gpus" not in ff_args + or "memory_per_gpu" not in ff_args + or "zero_copy_memory_per_node" not in ff_args + ): + raise ValueError( + "Missing one of the following required configs: num_gpus, memory_per_gpu, zero_copy_memory_per_node" + ) + + # Remove any existing arguments to avoid interferences + sys.argv = [sys.argv[0]] + + # Pass parameters to the FlexFlow C++ runtime via command line arguments + for arg in ff_args: + if arg not in ff_arg_to_sysarg: + warnings.warn(f"Ignoring parameter {arg}: not recognized.") + else: + sys_arg = [ff_arg_to_sysarg[arg]] + if type(ff_args[arg]) == bool: + if ff_args[arg] is not True: + continue + else: + sys_arg += [str(ff_args[arg])] + sys.argv += sys_arg + + legion_canonical_python_main(sys.argv) + atexit.register(legion_canonical_python_cleanup) + else: + # print("Using FlexFlow Python") + if configs_dict is not None or len(kwargs.items()) > 0: + warnings.warn("init_flexflow_runtime are ignored when using the FlexFlow Python interpreter") + + flexflow_library.initialize() + set_flexflow_initialized() + else: + warnings.warn("Attempting to initialize FlexFlow more than once") diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 1508371ae7..2d71dd18b3 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -15,19 +15,20 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import cffi -import os -import subprocess -import logging import warnings import numpy as np from .flexflow_logger import fflogger from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, InferenceMode, ModelType, OpType, ParameterSyncType, enum_to_int, int_to_enum +from flexflow.config import * +from .flexflowlib import ffi, flexflow_library -_FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS")) -if not _FF_BUILD_DOCS: - from .flexflowlib import ffi, flexflow_library +def ffc(): + if not flexflow_already_initialized(): + raise RuntimeError("Cannot use FlexFlow library before initializing FlexFlow") ffc = flexflow_library.lib + if ffc is None: + raise RuntimeError("FlexFlow library is None") + return ffc ff_tracing_id = 200 @@ -66,35 +67,35 @@ def __init__(self, handle, idx=None, name=None): self.name = name def get_number_parameters(self): - return ffc.flexflow_op_get_num_parameters(self.handle) + return ffc().flexflow_op_get_num_parameters(self.handle) def get_parameter_by_id(self, id): - handle = ffc.flexflow_op_get_parameter_by_id(self.handle, id) + handle = ffc().flexflow_op_get_parameter_by_id(self.handle, id) return Parameter(handle) def get_number_inputs(self): - return ffc.flexflow_op_get_num_inputs(self.handle) + return ffc().flexflow_op_get_num_inputs(self.handle) def get_input_by_id(self, id): - handle = ffc.flexflow_op_get_input_by_id(self.handle, id) + handle = ffc().flexflow_op_get_input_by_id(self.handle, id) return Tensor(handle, False) def get_number_outputs(self): - return ffc.flexflow_op_get_num_outputs(self.handle) + return ffc().flexflow_op_get_num_outputs(self.handle) def get_output_by_id(self, id): - handle = ffc.flexflow_op_get_output_by_id(self.handle, id) + handle = 
ffc().flexflow_op_get_output_by_id(self.handle, id) return Tensor(handle, False) def init(self, model): - ffc.flexflow_op_init(self.handle, model.handle) + ffc().flexflow_op_init(self.handle, model.handle) def forward(self, model): - ffc.flexflow_op_forward(self.handle, model.handle) + ffc().flexflow_op_forward(self.handle, model.handle) #return Tensor(handle) def _add_to_model(self, model): - ffc.flexflow_op_add_to_model(self.handle, model.handle) + ffc().flexflow_op_add_to_model(self.handle, model.handle) def get_output_tensor(self): return self.get_output_by_id(0) @@ -602,36 +603,36 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): class FFConfig(object): __slots__ = ['handle', '_handle', 'enable_tracing'] def __init__(self): - self.handle = ffc.flexflow_config_create() - self._handle = ffi.gc(self.handle, ffc.flexflow_config_destroy) + self.handle = ffc().flexflow_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_config_destroy) self.enable_tracing = False def parse_args(self): - ffc.flexflow_config_parse_args_default(self.handle) + ffc().flexflow_config_parse_args_default(self.handle) @property def batch_size(self): - return ffc.flexflow_config_get_batch_size(self.handle) + return ffc().flexflow_config_get_batch_size(self.handle) @property def workers_per_node(self): - return ffc.flexflow_config_get_workers_per_node(self.handle) + return ffc().flexflow_config_get_workers_per_node(self.handle) @property def num_nodes(self): - return ffc.flexflow_config_get_num_nodes(self.handle) + return ffc().flexflow_config_get_num_nodes(self.handle) @property def epochs(self): - return ffc.flexflow_config_get_epochs(self.handle) + return ffc().flexflow_config_get_epochs(self.handle) @property def enable_control_replication(self): - return ffc.flexflow_config_get_enable_control_replication(self.handle) + return ffc().flexflow_config_get_enable_control_replication(self.handle) @property def data_parallelism_degree(self): - return ffc.flexflow_config_get_data_parallelism_degree(self.handle) + return ffc().flexflow_config_get_data_parallelism_degree(self.handle) @data_parallelism_degree.setter def data_parallelism_degree(self, value): @@ -639,11 +640,11 @@ def data_parallelism_degree(self, value): raise ValueError("The data parallelism degree must be specified as an integer number") elif value < 1: raise ValueError("The data parallelism degree cannot be lower than 1") - ffc.flexflow_config_set_data_parallelism_degree(self.handle, value) + ffc().flexflow_config_set_data_parallelism_degree(self.handle, value) @property def tensor_parallelism_degree(self): - return ffc.flexflow_config_get_tensor_parallelism_degree(self.handle) + return ffc().flexflow_config_get_tensor_parallelism_degree(self.handle) @tensor_parallelism_degree.setter def tensor_parallelism_degree(self, value): @@ -651,11 +652,11 @@ def tensor_parallelism_degree(self, value): raise ValueError("The tensor parallelism degree must be specified as an integer number") elif value < 1: raise ValueError("The tensor parallelism degree cannot be lower than 1") - ffc.flexflow_config_set_tensor_parallelism_degree(self.handle, value) + ffc().flexflow_config_set_tensor_parallelism_degree(self.handle, value) @property def pipeline_parallelism_degree(self): - return ffc.flexflow_config_get_pipeline_parallelism_degree(self.handle) + return ffc().flexflow_config_get_pipeline_parallelism_degree(self.handle) @pipeline_parallelism_degree.setter def pipeline_parallelism_degree(self, value): @@ -663,26 +664,26 @@ def 
pipeline_parallelism_degree(self, value): raise ValueError("The pipeline parallelism degree must be specified as an integer number") elif value < 1: raise ValueError("The pipeline parallelism degree cannot be lower than 1") - ffc.flexflow_config_set_pipeline_parallelism_degree(self.handle, value) + ffc().flexflow_config_set_pipeline_parallelism_degree(self.handle, value) @property def python_data_loader_type(self): - return ffc.flexflow_config_get_python_data_loader_type(self.handle) + return ffc().flexflow_config_get_python_data_loader_type(self.handle) @property def cpu_offload(self): - return ffc.flexflow_config_get_offload(self.handle) + return ffc().flexflow_config_get_offload(self.handle) def get_current_time(self): - return ffc.flexflow_get_current_time(self.handle) + return ffc().flexflow_get_current_time(self.handle) def begin_trace(self, trace_id): if self.enable_tracing: - ffc.flexflow_begin_trace(self.handle, trace_id) + ffc().flexflow_begin_trace(self.handle, trace_id) def end_trace(self, trace_id): if self.enable_tracing: - ffc.flexflow_end_trace(self.handle, trace_id) + ffc().flexflow_end_trace(self.handle, trace_id) # ----------------------------------------------------------------------- # Tensor @@ -709,7 +710,7 @@ def __init__(self, handle, deallocate=True, owner_op_type=None, p_handle=None): self.__get_dims() self.__get_data_type() # if (deallocate == True): - # self._handle = ffi.gc(self.handle, ffc.flexflow_tensor_destroy) + # self._handle = ffi.gc(self.handle, ffc().flexflow_tensor_destroy) # if (self.is_mapped() == True): # self.mapped = True @@ -719,13 +720,13 @@ def __init__(self, handle, deallocate=True, owner_op_type=None, p_handle=None): def inline_map(self, ffmodel, ffconfig): assert self.mapped == False, "Tensor is already mapped." - ffc.flexflow_tensor_inline_map(self.handle, ffmodel.handle, ffconfig.handle); + ffc().flexflow_tensor_inline_map(self.handle, ffmodel.handle, ffconfig.handle); self.mapped = True assert self.num_dims > 0, "check dims" def inline_unmap(self, ffmodel, ffconfig): assert self.mapped == True, "Tensor is not inline mapped." 
- ffc.flexflow_tensor_inline_unmap(self.handle, ffmodel.handle, ffconfig.handle); + ffc().flexflow_tensor_inline_unmap(self.handle, ffmodel.handle, ffconfig.handle); self.mapped = False def get_array(self, ffmodel, ffconfig): @@ -774,7 +775,7 @@ def detach_numpy_array(self, ffconfig): self.__detach_raw_ptr(ffconfig) def is_mapped(self): - return ffc.flexflow_tensor_is_mapped(self.handle) + return ffc().flexflow_tensor_is_mapped(self.handle) def set_tensor(self, ffmodel, np_array): assert np_array.__array_interface__['strides'] == None, "Parameter set_weights, numpy array strides is not None" @@ -788,15 +789,15 @@ def set_tensor(self, ffmodel, np_array): if np_array.dtype == np.float16: assert self.data_type == DataType.DT_HALF, "Wrong datatype" raw_ptr = ffi.cast("half*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_set_tensor_float(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) + ret_val = ffc().flexflow_tensor_set_tensor_float(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) elif np_array.dtype == np.float32: assert self.data_type == DataType.DT_FLOAT, "Wrong datatype" raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_set_tensor_float(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) + ret_val = ffc().flexflow_tensor_set_tensor_float(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) elif np_array.dtype == np.int32: assert self.data_type == DataType.DT_INT32, "Wrong datatype" raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_set_tensor_int(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) + ret_val = ffc().flexflow_tensor_set_tensor_int(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) else: assert 0, "Unsupported datatype" fflogger.debug("set tensor raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(np_shape))) @@ -817,13 +818,13 @@ def get_tensor(self, ffmodel): np_raw_ptr = np_array.__array_interface__['data'] if np_array.dtype == np.float32: raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, False) + ret_val = ffc().flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, False) elif np_array.dtype == np.int32: raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_int(self.handle, ffmodel.handle, raw_ptr, False) + ret_val = ffc().flexflow_tensor_get_tensor_int(self.handle, ffmodel.handle, raw_ptr, False) elif np_array.dtype == np.int64: raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, False) + ret_val = ffc().flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, False) fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) assert ret_val == True return np_array @@ -844,13 +845,13 @@ def get_gradients(self, ffmodel, comm_type): c_comm_type = enum_to_int(ParameterSyncType, comm_type) if np_array.dtype == np.float32: raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, True) + ret_val = ffc().flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, True) elif np_array.dtype == np.int32: raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_int(self.handle, ffmodel.handle, raw_ptr, True) + ret_val = ffc().flexflow_tensor_get_tensor_int(self.handle, 
ffmodel.handle, raw_ptr, True) elif np_array.dtype == np.int64: raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, True) + ret_val = ffc().flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, True) fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) assert ret_val == True return np_array @@ -871,7 +872,7 @@ def get_model_output_gradients(self, ffmodel, comm_type): c_comm_type = enum_to_int(ParameterSyncType, comm_type) if np_array.dtype == np.float32: raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc.flexflow_model_get_output_tensor_float(ffmodel.handle, self.handle, raw_ptr, True) + ret_val = ffc().flexflow_model_get_output_tensor_float(ffmodel.handle, self.handle, raw_ptr, True) else: assert 0, "unknown data type" fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) @@ -893,7 +894,7 @@ def get_model_output_tensor(self, ffmodel): np_raw_ptr = np_array.__array_interface__['data'] if np_array.dtype == np.float32: raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc.flexflow_model_get_output_tensor_float(ffmodel.handle, self.handle, raw_ptr, False) + ret_val = ffc().flexflow_model_get_output_tensor_float(ffmodel.handle, self.handle, raw_ptr, False) else: assert 0, "unknown data type" fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) @@ -903,29 +904,29 @@ def get_model_output_tensor(self, ffmodel): def __get_raw_ptr(self, ffmodel, ffconfig, data_type): assert data_type == self.data_type, "Tensor check data type" if (data_type == DataType.DT_HALF): - return ffc.flexflow_tensor_get_raw_ptr_float(self.handle, ffmodel.handle, ffconfig.handle) + return ffc().flexflow_tensor_get_raw_ptr_float(self.handle, ffmodel.handle, ffconfig.handle) elif (data_type == DataType.DT_FLOAT): - return ffc.flexflow_tensor_get_raw_ptr_float(self.handle, ffmodel.handle, ffconfig.handle) + return ffc().flexflow_tensor_get_raw_ptr_float(self.handle, ffmodel.handle, ffconfig.handle) elif (data_type == DataType.DT_INT32): - return ffc.flexflow_tensor_get_raw_ptr_int32(self.handle, ffmodel.handle, ffconfig.handle) + return ffc().flexflow_tensor_get_raw_ptr_int32(self.handle, ffmodel.handle, ffconfig.handle) else: assert 0, "unknown data type" def __get_dims(self): - self.num_dims = ffc.flexflow_tensor_get_num_dims(self.handle) + self.num_dims = ffc().flexflow_tensor_get_num_dims(self.handle) # if (self.num_dims == 1): - # self.dims = (ffc.flexflow_tensor_get_dim(self.handle, 0),) + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 0),) # elif (self.num_dims == 2): - # self.dims = (ffc.flexflow_tensor_get_dim(self.handle, 1), ffc.flexflow_tensor_get_dim(self.handle, 0)) + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) # elif (self.num_dims == 3): - # self.dims = (ffc.flexflow_tensor_get_dim(self.handle, 2), ffc.flexflow_tensor_get_dim(self.handle, 1), ffc.flexflow_tensor_get_dim(self.handle, 0)) + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) # elif (self.num_dims == 4): - # self.dims = (ffc.flexflow_tensor_get_dim(self.handle, 3), ffc.flexflow_tensor_get_dim(self.handle, 2), ffc.flexflow_tensor_get_dim(self.handle, 1), 
ffc.flexflow_tensor_get_dim(self.handle, 0)) + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 3), ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) # elif (self.num_dims == 5): - # self.dims = (ffc.flexflow_tensor_get_dim(self.handle, 4), ffc.flexflow_tensor_get_dim(self.handle, 3), ffc.flexflow_tensor_get_dim(self.handle, 2), ffc.flexflow_tensor_get_dim(self.handle, 1), ffc.flexflow_tensor_get_dim(self.handle, 0)) + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 4), ffc().flexflow_tensor_get_dim(self.handle, 3), ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) # else: # assert 0, "unknown num_dims" - d = ffc.flexflow_tensor_get_dims(self.handle) + d = ffc().flexflow_tensor_get_dims(self.handle) if (self.num_dims == 1): self.dims = (d[0],) elif (self.num_dims == 2): @@ -940,7 +941,7 @@ def __get_dims(self): assert 0, "unknown num_dims" def __get_data_type(self): - dtype = ffc.flexflow_tensor_get_data_type(self.handle) + dtype = ffc().flexflow_tensor_get_data_type(self.handle) if (dtype == 40): self.data_type = DataType.DT_BOOLEAN elif (dtype == 41): @@ -957,7 +958,7 @@ def __get_data_type(self): assert 0, "unknown data type {}".format(dtype) def __get_owner_op(self, op_type): - op_handle = ffc.flexflow_tensor_get_owner_op(self.handle) + op_handle = ffc().flexflow_tensor_get_owner_op(self.handle) if op_handle.impl == ffi.NULL: self.owner_op = None else: @@ -965,12 +966,12 @@ def __get_owner_op(self, op_type): def __attach_raw_ptr(self, ffmodel, ffconfig, raw_ptr, column_major=True): assert self.mapped == False, "Tensor is already mapped." - ffc.flexflow_tensor_attach_raw_ptr(self.handle, ffmodel.handle, ffconfig.handle, raw_ptr, column_major) + ffc().flexflow_tensor_attach_raw_ptr(self.handle, ffmodel.handle, ffconfig.handle, raw_ptr, column_major) self.mapped = True def __detach_raw_ptr(self, ffconfig): assert self.mapped == True, "Tensor is not mapped." - ffc.flexflow_tensor_detach_raw_ptr(self.handle, ffconfig.handle) + ffc().flexflow_tensor_detach_raw_ptr(self.handle, ffconfig.handle) self.mapped = False # ----------------------------------------------------------------------- @@ -996,7 +997,7 @@ def set_weights(self, ffmodel, np_array): np_raw_ptr = np_array.__array_interface__['data'] raw_ptr = ffi.cast("float*", np_raw_ptr[0]) fflogger.debug("set weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(np_shape))) - ret_val = ffc.flexflow_tensor_set_tensor_float(self.parameter_handle, ffmodel.handle, num_dims, c_dims, raw_ptr) + ret_val = ffc().flexflow_tensor_set_tensor_float(self.parameter_handle, ffmodel.handle, num_dims, c_dims, raw_ptr) assert ret_val == True, ret_val def get_weights(self, ffmodel): @@ -1005,7 +1006,7 @@ def get_weights(self, ffmodel): np_raw_ptr = np_array.__array_interface__['data'] raw_ptr = ffi.cast("float*", np_raw_ptr[0]) fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) - ret_val = ffc.flexflow_tensor_get_tensor_float(self.parameter_handle, ffmodel.handle, raw_ptr, False) + ret_val = ffc().flexflow_tensor_get_tensor_float(self.parameter_handle, ffmodel.handle, raw_ptr, False) assert ret_val == True return np_array @@ -1025,8 +1026,8 @@ def __init__(self, ffconfig): :returns: FFModel -- the model. 
""" - self.handle = ffc.flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc.flexflow_model_destroy) + self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) + self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) self._layers = dict() self._nb_layers = 0 self._ffconfig = ffconfig @@ -1041,7 +1042,7 @@ def get_layers(self): def add_layer(self, op_type, name): layer_id = self._nb_layers - op_handle = ffc.flexflow_model_get_last_layer(self.handle) + op_handle = ffc().flexflow_model_get_last_layer(self.handle) self._layers[self._nb_layers] = convert_op_handle_to_op(op_type, op_handle, idx=layer_id, name=name) self._nb_layers += 1 @@ -1064,18 +1065,18 @@ def create_tensor(self, dims, data_type, create_grad=True): c_dims = ffi.new("int[]", dims) c_data_type = enum_to_int(DataType, data_type) num_dims = len(dims) - handle = ffc.flexflow_tensor_create(self.handle, num_dims, c_dims, c_data_type, create_grad); + handle = ffc().flexflow_tensor_create(self.handle, num_dims, c_dims, c_data_type, create_grad); return Tensor(handle) def map_tensor(self, tensor, parallel_op = None): op_handle = self.__get_op_handle(parallel_op) - ffc.flexflow_tensor_map(self.handle, tensor.handle, op_handle) + ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) def create_constant(self, dims, value, data_type): c_dims = ffi.new("int[]", dims) c_data_type = enum_to_int(DataType, data_type) num_dims = len(dims) - handle = ffc.flexflow_constant_create(self.handle, num_dims, c_dims, value, c_data_type); + handle = ffc().flexflow_constant_create(self.handle, num_dims, c_dims, value, c_data_type); return Tensor(handle) def exp(self, x, name=None): @@ -1090,7 +1091,7 @@ def exp(self, x, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_exp(self.handle, x.handle, c_name) + handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) self.add_layer(OpType.EXP, name) return Tensor(handle, owner_op_type=OpType.EXP) @@ -1106,7 +1107,7 @@ def sin(self, x, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_sin(self.handle, x.handle, c_name) + handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) self.add_layer(OpType.SIN, name) return Tensor(handle, owner_op_type=OpType.SIN) @@ -1122,7 +1123,7 @@ def cos(self, x, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_cos(self.handle, x.handle, c_name) + handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) self.add_layer(OpType.COS, name) return Tensor(handle, owner_op_type=OpType.COS) @@ -1142,7 +1143,7 @@ def add(self, x, y, inplace_a=False, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_add(self.handle, x.handle, y.handle, inplace_a, c_name) + handle = ffc().flexflow_model_add_add(self.handle, x.handle, y.handle, inplace_a, c_name) self.add_layer(OpType.ADD, name) return Tensor(handle, owner_op_type=OpType.ADD) @@ -1161,7 +1162,7 @@ def subtract(self, x, y, inplace_a=False, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_subtract(self.handle, x.handle, y.handle, inplace_a, c_name) + handle = ffc().flexflow_model_add_subtract(self.handle, x.handle, y.handle, inplace_a, c_name) self.add_layer(OpType.SUBTRACT, name) return Tensor(handle, owner_op_type=OpType.SUBTRACT) @@ -1180,7 +1181,7 @@ def multiply(self, x, y, inplace_a=False, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_multiply(self.handle, x.handle, y.handle, inplace_a, c_name) + handle = ffc().flexflow_model_add_multiply(self.handle, x.handle, y.handle, inplace_a, c_name) self.add_layer(OpType.MULTIPLY, name) return Tensor(handle, owner_op_type=OpType.MULTIPLY) @@ -1199,7 +1200,7 @@ def divide(self, x, y, inplace_a=False, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_divide(self.handle, x.handle, y.handle, inplace_a, c_name) + handle = ffc().flexflow_model_add_divide(self.handle, x.handle, y.handle, inplace_a, c_name) self.add_layer(OpType.DIVIDE, name) return Tensor(handle, owner_op_type=OpType.DIVIDE) @@ -1218,7 +1219,7 @@ def max(self, x, y, inplace_a=False, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_max(self.handle, x.handle, y.handle, inplace_a, c_name) + handle = ffc().flexflow_model_add_max(self.handle, x.handle, y.handle, inplace_a, c_name) self.add_layer(OpType.MAX, name) return Tensor(handle, owner_op_type=OpType.MAX) @@ -1237,7 +1238,7 @@ def min(self, x, y, inplace_a=False, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_min(self.handle, x.handle, y.handle, inplace_a, c_name) + handle = ffc().flexflow_model_add_min(self.handle, x.handle, y.handle, inplace_a, c_name) self.add_layer(OpType.MIN, name) return Tensor(handle, owner_op_type=OpType.MIN) @@ -1257,7 +1258,7 @@ def reduce_sum(self, input, axes, keepdims=False, name=None): """ c_name = get_c_name(name) c_axes = ffi.new("int[]", axes) - handle = ffc.flexflow_model_add_reduce_sum(self.handle, input.handle, c_axes, len(axes), keepdims, c_name) + handle = ffc().flexflow_model_add_reduce_sum(self.handle, input.handle, c_axes, len(axes), keepdims, c_name) self.add_layer(OpType.REDUCE_SUM, name) return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) @@ -1273,7 +1274,7 @@ def rsqrt(self, input, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_rsqrt(self.handle, input.handle, c_name) + handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) self.add_layer(OpType.RSQRT, name) return Tensor(handle, owner_op_type=OpType.RSQRT) @@ -1292,7 +1293,7 @@ def pow(self, input, exponent, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_pow(self.handle, input.handle, exponent, c_name) + handle = ffc().flexflow_model_add_pow(self.handle, input.handle, exponent, c_name) self.add_layer(OpType.POW, name) return Tensor(handle, owner_op_type=OpType.POW) @@ -1318,7 +1319,7 @@ def mean(self, input, dims, keepdims=False, name=None): dims = list(dims) c_dims = ffi.new("int[]", dims) c_name = get_c_name(name) - handle = ffc.flexflow_model_add_mean(self.handle, input.handle, c_dims, len(dims), keepdims, c_name) + handle = ffc().flexflow_model_add_mean(self.handle, input.handle, c_dims, len(dims), keepdims, c_name) self.add_layer(OpType.MEAN, name) return Tensor(handle, owner_op_type=OpType.MEAN) @@ -1414,7 +1415,7 @@ def conv2d(self, input, out_channels, kernel_init_handle = self.__get_initializer_handle(kernel_initializer) bias_init_handle = self.__get_initializer_handle(bias_initializer) c_name = get_c_name(name) - handle = ffc.flexflow_model_add_conv2d(self.handle, input.handle, out_channels, kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w, c_activation, groups, use_bias, shared_op_handle, kernel_init_handle, bias_init_handle, c_name) + handle = ffc().flexflow_model_add_conv2d(self.handle, input.handle, out_channels, kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w, c_activation, groups, use_bias, shared_op_handle, kernel_init_handle, bias_init_handle, c_name) self.add_layer(OpType.CONV2D, name) return Tensor(handle, owner_op_type=OpType.CONV2D) @@ -1459,7 +1460,7 @@ def embedding(self, input, num_embeddings, embedding_dim, (type(kernel_initializer) is UniformInitializer) or \ (type(kernel_initializer) is NormInitializer), \ f"Unknown initializer type: {kernel_initializer}" - handle = ffc.flexflow_model_add_embedding( + handle = ffc().flexflow_model_add_embedding( self.handle, input.handle, num_embeddings, embedding_dim, c_aggr, c_dtype, shared_op_handle, kernel_initializer.handle, c_name, ) @@ -1541,7 +1542,7 @@ def pool2d(self, input, kernel_h, kernel_w, c_name = get_c_name(name) c_pool_type = enum_to_int(PoolType, pool_type) c_activation = enum_to_int(ActiMode, activation) - handle = ffc.flexflow_model_add_pool2d(self.handle, input.handle, kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w, c_pool_type, c_activation, c_name) + handle = ffc().flexflow_model_add_pool2d(self.handle, input.handle, kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w, c_pool_type, c_activation, c_name) self.add_layer(OpType.POOL2D, name) return Tensor(handle, owner_op_type=OpType.POOL2D) @@ -1562,14 +1563,14 @@ def batch_norm(self, input, relu=True, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_batch_norm(self.handle, input.handle, relu, c_name) + handle = ffc().flexflow_model_add_batch_norm(self.handle, input.handle, relu, c_name) self.add_layer(OpType.BATCH_NORM, name) return Tensor(handle, owner_op_type=OpType.BATCH_NORM) def layer_norm(self, input, axes, elementwise_affine=True, eps=1e-5, name=None): c_name = get_c_name(name) c_axes = ffi.new("int[]", axes) - handle = ffc.flexflow_model_add_layer_norm(self.handle, input.handle, len(axes), c_axes, elementwise_affine, eps, c_name) + handle = ffc().flexflow_model_add_layer_norm(self.handle, input.handle, len(axes), c_axes, elementwise_affine, eps, c_name) self.add_layer(OpType.LAYER_NORM, name) return Tensor(handle, owner_op_type=OpType.LAYER_NORM) @@ -1597,7 +1598,7 @@ def batch_matmul(self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name= a_seq_length_dim = -1 if b_seq_length_dim is None: b_seq_length_dim = -1 - handle = ffc.flexflow_model_add_batch_matmul(self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim) + handle = ffc().flexflow_model_add_batch_matmul(self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim) self.add_layer(OpType.BATCH_MATMUL, name) return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) @@ -1658,7 +1659,7 @@ def dense(self, input, out_dim, c_kernel_reg_type = enum_to_int( RegularizerMode, RegularizerMode.REG_MODE_NONE) kernel_reg_lambda = 0.0 - handle = ffc.flexflow_model_add_dense( + handle = ffc().flexflow_model_add_dense( self.handle, input.handle, out_dim, c_activation, use_bias, c_datatype, shared_op_handle, kernel_init_handle, bias_init_handle, c_kernel_reg_type, kernel_reg_lambda, c_name) @@ -1689,7 +1690,7 @@ def concat(self, tensors, axis, name=None): tensor_handle_list.append(tensor.handle) c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) c_name = get_c_name(name) - handle = ffc.flexflow_model_add_concat(self.handle, n, c_tensor_handle_list, axis, c_name) + handle = ffc().flexflow_model_add_concat(self.handle, n, c_tensor_handle_list, axis, c_name) self.add_layer(OpType.CONCAT, name) return Tensor(handle, owner_op_type=OpType.CONCAT) @@ -1720,7 +1721,7 @@ def split(self, input, sizes, axis, name=None): c_split = ffi.new("int[]", split) c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") c_name = get_c_name(name) - ffc.flexflow_model_add_split(self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name) + ffc().flexflow_model_add_split(self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name) output_tensor_list = [] for i in range(n): tensor_p_handle = ffi.new("flexflow_tensor_t*") @@ -1742,7 +1743,7 @@ def flat(self, input, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_flat(self.handle, input.handle, c_name) + handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) self.add_layer(OpType.FLAT, name) return Tensor(handle, owner_op_type=OpType.FLAT) @@ -1758,7 +1759,7 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) + handle = ffc().flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) self.add_layer(OpType.SOFTMAX, name) return Tensor(handle, owner_op_type=OpType.SOFTMAX) @@ -1781,7 +1782,7 @@ def reshape(self, input, shape, name=None): """ c_name = get_c_name(name) c_shape = ffi.new("int[]", shape) - handle = ffc.flexflow_model_add_reshape(self.handle, input.handle, len(shape), c_shape, c_name) + handle = ffc().flexflow_model_add_reshape(self.handle, input.handle, len(shape), c_shape, c_name) self.add_layer(OpType.RESHAPE, name) return Tensor(handle, owner_op_type=OpType.RESHAPE) @@ -1803,7 +1804,7 @@ def gather(self, input, index, dim, name=None): :returns: Tensor -- the output tensor """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_gather(self.handle, input.handle, index.handle, dim, c_name) + handle = ffc().flexflow_model_add_gather(self.handle, input.handle, index.handle, dim, c_name) self.add_layer(OpType.GATHER, name) return Tensor(handle, owner_op_type=OpType.GATHER) @@ -1823,7 +1824,7 @@ def transpose(self, input, perm, name=None): """ c_name = get_c_name(name) c_perm = ffi.new("int[]", perm) - handle = ffc.flexflow_model_add_transpose(self.handle, input.handle, len(perm), c_perm, c_name) + handle = ffc().flexflow_model_add_transpose(self.handle, input.handle, len(perm), c_perm, c_name) self.add_layer(OpType.TRANSPOSE, name) return Tensor(handle, owner_op_type=OpType.TRANSPOSE) @@ -1844,7 +1845,7 @@ def reverse(self, input, axis, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_reverse(self.handle, input.handle, axis, c_name) + handle = ffc().flexflow_model_add_reverse(self.handle, input.handle, axis, c_name) self.add_layer(OpType.REVERSE, name) return Tensor(handle, owner_op_type=OpType.REVERSE) @@ -1863,7 +1864,7 @@ def scalar_multiply(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_scalar_multiply(self.handle, input.handle, scalar, inplace, c_name) + handle = ffc().flexflow_model_add_scalar_multiply(self.handle, input.handle, scalar, inplace, c_name) self.add_layer(OpType.SCALAR_MULTIPLY, name) return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) @@ -1882,7 +1883,7 @@ def scalar_add(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_scalar_add(self.handle, input.handle, scalar, inplace, c_name) + handle = ffc().flexflow_model_add_scalar_add(self.handle, input.handle, scalar, inplace, c_name) self.add_layer(OpType.SCALAR_ADD, name) return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) @@ -1901,7 +1902,7 @@ def scalar_sub(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_scalar_sub(self.handle, input.handle, scalar, inplace, c_name) + handle = ffc().flexflow_model_add_scalar_sub(self.handle, input.handle, scalar, inplace, c_name) self.add_layer(OpType.SCALAR_SUB, name) return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) @@ -1920,7 +1921,7 @@ def scalar_true_divide(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_scalar_truediv(self.handle, input.handle, scalar, inplace, c_name) + handle = ffc().flexflow_model_add_scalar_truediv(self.handle, input.handle, scalar, inplace, c_name) self.add_layer(OpType.SCALAR_TRUEDIV, name) return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) @@ -1936,7 +1937,7 @@ def gelu(self, input, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_gelu(self.handle, input.handle, c_name) + handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) self.add_layer(OpType.GELU, name) return Tensor(handle, owner_op_type=OpType.GELU) @@ -1952,7 +1953,7 @@ def relu(self, input, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_relu(self.handle, input.handle, inplace, c_name) + handle = ffc().flexflow_model_add_relu(self.handle, input.handle, inplace, c_name) self.add_layer(OpType.RELU, name) return Tensor(handle, owner_op_type=OpType.RELU) @@ -1968,7 +1969,7 @@ def identity(self, input, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_identity(self.handle, input.handle, c_name) + handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) self.add_layer(OpType.IDENTITY, name) return Tensor(handle, owner_op_type=OpType.IDENTITY) @@ -1984,7 +1985,7 @@ def sigmoid(self, input, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_sigmoid(self.handle, input.handle, c_name) + handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) self.add_layer(OpType.SIGMOID, name) return Tensor(handle, owner_op_type=OpType.SIGMOID) @@ -2000,7 +2001,7 @@ def tanh(self, input, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_tanh(self.handle, input.handle, c_name) + handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) self.add_layer(OpType.TANH, name) return Tensor(handle, owner_op_type=OpType.TANH) @@ -2016,7 +2017,7 @@ def elu(self, input, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_elu(self.handle, input.handle, inplace, c_name) + handle = ffc().flexflow_model_add_elu(self.handle, input.handle, inplace, c_name) self.add_layer(OpType.ELU, name) return Tensor(handle, owner_op_type=OpType.ELU) @@ -2042,7 +2043,7 @@ def dropout(self, input, rate, seed, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_dropout(self.handle, input.handle, rate, seed, c_name) + handle = ffc().flexflow_model_add_dropout(self.handle, input.handle, rate, seed, c_name) self.add_layer(OpType.DROPOUT, name) return Tensor(handle, owner_op_type=OpType.DROPOUT) @@ -2098,7 +2099,7 @@ def multihead_attention(self, query, key, value, """ c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc.flexflow_model_add_multihead_attention(self.handle, query.handle, key.handle, value.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, c_name) + handle = ffc().flexflow_model_add_multihead_attention(self.handle, query.handle, key.handle, value.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, c_name) self.add_layer(OpType.MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) @@ -2166,7 +2167,7 @@ def inc_multihead_self_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) @@ -2234,7 +2235,7 @@ def spec_inc_multihead_self_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_spec_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) @@ -2302,7 +2303,7 @@ def inc_multihead_self_attention_verify(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, 
apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) @@ -2373,7 +2374,7 @@ def inc_multiquery_self_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) @@ -2444,7 +2445,7 @@ def spec_inc_multiquery_self_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_spec_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) @@ -2515,7 +2516,7 @@ def inc_multiquery_self_attention_verify(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc.flexflow_model_add_inc_multiquery_self_attention_verify(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) @@ -2537,7 +2538,7 @@ def rms_norm(self, input, eps, dim, name=None): :returns: Tensor -- the output tensor. 
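The incremental-decoding attention variants above (plain, speculative, and verify, plus the multiquery versions) share the same argument list. A hedged call sketch with placeholder sizes, leaving every remaining parameter at its assumed default:

# `hidden` is an existing Tensor of hidden states; sizes are placeholders.
attn_out = ffmodel.inc_multihead_self_attention(
    hidden,
    embed_dim=4096,
    num_heads=32,
)
# The multiquery variants take num_q_heads / num_kv_heads instead of num_heads.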
""" c_name = get_c_name(name) - handle = ffc.flexflow_model_add_rms_norm(self.handle, input.handle, eps, dim, c_name) + handle = ffc().flexflow_model_add_rms_norm(self.handle, input.handle, eps, dim, c_name) self.add_layer(OpType.RMS_NORM, name) return Tensor(handle, owner_op_type=OpType.RMS_NORM) @@ -2559,7 +2560,7 @@ def arg_top_k(self, input, k, sorted, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_arg_top_k(self.handle, input.handle, k, sorted, c_name) + handle = ffc().flexflow_model_add_arg_top_k(self.handle, input.handle, k, sorted, c_name) self.add_layer(OpType.ARG_TOPK, name) return Tensor(handle, owner_op_type=OpType.ARG_TOPK) @@ -2581,7 +2582,7 @@ def beam_top_k(self, input, max_beam_size, sorted, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_beam_top_k(self.handle, input.handle, max_beam_size, sorted, c_name) + handle = ffc().flexflow_model_add_beam_top_k(self.handle, input.handle, max_beam_size, sorted, c_name) self.add_layer(OpType.BEAM_TOPK, name) return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) @@ -2600,7 +2601,7 @@ def sampling(self, input, top_p, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_sampling(self.handle, input.handle, top_p, c_name) + handle = ffc().flexflow_model_add_sampling(self.handle, input.handle, top_p, c_name) self.add_layer(OpType.SAMPLING, name) return Tensor(handle, owner_op_type=OpType.SAMPLING) @@ -2619,7 +2620,7 @@ def argmax(self, input, beam_search, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc.flexflow_model_add_argmax(self.handle, input.handle, beam_search, c_name) + handle = ffc().flexflow_model_add_argmax(self.handle, input.handle, beam_search, c_name) self.add_layer(OpType.ARGMAX, name) return Tensor(handle, owner_op_type=OpType.ARGMAX) @@ -2628,17 +2629,17 @@ def reset_metrics(self): :returns: None -- no returns. """ - ffc.flexflow_model_reset_metrics(self.handle) + ffc().flexflow_model_reset_metrics(self.handle) def init_layers(self): """Initialize layers. :returns: None -- no returns. """ - ffc.flexflow_model_init_layers(self.handle) + ffc().flexflow_model_init_layers(self.handle) def prefetch(self): - ffc.flexflow_model_prefetch(self.handle) + ffc().flexflow_model_prefetch(self.handle) def forward(self, seq_length=None): """Forward propagation of all layers. @@ -2647,7 +2648,7 @@ def forward(self, seq_length=None): """ if seq_length is None: seq_length = -1 - ffc.flexflow_model_forward(self.handle, seq_length) + ffc().flexflow_model_forward(self.handle, seq_length) #TODO: seperate compute_metrics from backward def backward(self, seq_length=None): @@ -2657,21 +2658,21 @@ def backward(self, seq_length=None): """ if seq_length is None: seq_length = -1 - ffc.flexflow_model_backward(self.handle, seq_length) + ffc().flexflow_model_backward(self.handle, seq_length) def compute_metrics(self): """Compute performance metrics. :returns: None -- no returns. """ - ffc.flexflow_model_compute_metrics(self.handle) + ffc().flexflow_model_compute_metrics(self.handle) def update(self): """Update weights and biases of all layers. :returns: None -- no returns. """ - ffc.flexflow_model_update(self.handle) + ffc().flexflow_model_update(self.handle) def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): """Configure the model for trainting. 
FlexFlow uses lazy initialization, @@ -2708,7 +2709,7 @@ def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): if comp_mode == None: comp_mode = CompMode.TRAINING c_comp_mode = enum_to_int(CompMode, comp_mode) - ffc.flexflow_model_compile(self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode) + ffc().flexflow_model_compile(self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode) for (ff_tensor, np_tensor) in self.attr_tensors.items(): ff_tensor.set_tensor(self, np_tensor) print("Compiled ffmodel!") @@ -2803,13 +2804,13 @@ def zero_gradients(self): :returns: None -- no returns. """ - ffc.flexflow_model_zero_gradients(self.handle) + ffc().flexflow_model_zero_gradients(self.handle) def set_optimizer(self, optimizer): if isinstance(optimizer, SGDOptimizer) == True: - ffc.flexflow_model_set_sgd_optimizer(self.handle, optimizer.handle) + ffc().flexflow_model_set_sgd_optimizer(self.handle, optimizer.handle) elif isinstance(optimizer, AdamOptimizer) == True: - ffc.flexflow_model_set_adam_optimizer(self.handle, optimizer.handle) + ffc().flexflow_model_set_adam_optimizer(self.handle, optimizer.handle) elif optimizer == None: pass else: @@ -2818,7 +2819,7 @@ def set_optimizer(self, optimizer): optimizer = property(fset=set_optimizer) def print_layers(self, id=-1): - ffc.flexflow_model_print_layers(self.handle, id) + ffc().flexflow_model_print_layers(self.handle, id) def get_layer_by_id(self, layer_id): return self._layers[layer_id] @@ -2835,20 +2836,20 @@ def get_layer_by_name(self, layer_name): return None def get_tensor_by_id(self, id): - handle = ffc.flexflow_model_get_parameter_by_id(self.handle, id) + handle = ffc().flexflow_model_get_parameter_by_id(self.handle, id) return Parameter(handle) @property def label_tensor(self): - handle = ffc.flexflow_model_get_label_tensor(self.handle) + handle = ffc().flexflow_model_get_label_tensor(self.handle) return Tensor(handle, deallocate=False) def get_perf_metrics(self): - handle = ffc.flexflow_model_get_perf_metrics(self.handle) + handle = ffc().flexflow_model_get_perf_metrics(self.handle) return PerfMetrics(handle) def set_transformer_layer_id(self, id): - ffc.flexflow_model_set_transformer_layer_id(self.handle, id) + ffc().flexflow_model_set_transformer_layer_id(self.handle, id) def create_data_loader(self, batch_tensor, full_array): """Create a SingleDataloader instance. 
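Taken together, the methods above (compile, init_layers, forward, backward, update, zero_gradients, reset_metrics, the optimizer setter, and the data-loader helpers) support the usual FlexFlow training loop. A hedged sketch, with the numpy data, enum members, and iteration counts all placeholders:

ffmodel.optimizer = SGDOptimizer(ffmodel, 0.01)
ffmodel.compile(loss_type=LossType.LOSS_CATEGORICAL_CROSSENTROPY,   # enum members assumed
                metrics=[MetricsType.METRICS_ACCURACY])

dl_input = ffmodel.create_data_loader(input_tensor, x_train)        # placeholder arrays
dl_label = ffmodel.create_data_loader(ffmodel.label_tensor, y_train)

ffmodel.init_layers()
for epoch in range(epochs):
    ffmodel.reset_metrics()
    dl_input.reset()
    dl_label.reset()
    for _ in range(iterations_per_epoch):
        dl_input.next_batch(ffmodel)
        dl_label.next_batch(ffmodel)
        ffmodel.forward()
        ffmodel.zero_gradients()
        ffmodel.backward()
        ffmodel.update()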
@@ -2952,13 +2953,13 @@ def get_output_tensor(self, ffmodel, data_type): np_raw_ptr = np_array.__array_interface__['data'] if np_array.dtype == np.float32: raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, False) + ret_val = ffc().flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, False) elif np_array.dtype == np.int32: raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_int(self.handle, ffmodel.handle, raw_ptr, False) + ret_val = ffc().flexflow_tensor_get_tensor_int(self.handle, ffmodel.handle, raw_ptr, False) elif np_array.dtype == np.int64: raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc.flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, False) + ret_val = ffc().flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, False) fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) assert ret_val == True return np_array @@ -2968,7 +2969,7 @@ def generate(self, prompt, max_sequence_length): max_num_chars = 36000 c_output_text = ffi.new("char[]", max_num_chars) c_output_length_and_tokens = ffi.new("int[]", max_sequence_length + 100) - ffc.flexflow_model_generate(self.handle, c_input_text, max_num_chars, c_output_text, max_sequence_length, c_output_length_and_tokens) + ffc().flexflow_model_generate(self.handle, c_input_text, max_num_chars, c_output_text, max_sequence_length, c_output_length_and_tokens) output_length = c_output_length_and_tokens[0] output_tokens = [] for i in range(output_length): @@ -2977,7 +2978,7 @@ def generate(self, prompt, max_sequence_length): return GenerationResult(ffi.string(c_output_text), output_tokens) def set_position_offset(self, offset): - ffc.flexflow_model_set_position_offset(self.handle, offset) + ffc().flexflow_model_set_position_offset(self.handle, offset) # ----------------------------------------------------------------------- # SGDOptimizer @@ -2986,11 +2987,11 @@ def set_position_offset(self, offset): class SGDOptimizer(object): __slots__ = ['handle', '_handle'] def __init__(self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0): - self.handle = ffc.flexflow_sgd_optimizer_create(ffmodel.handle, lr, momentum, nesterov, weight_decay) - self._handle = ffi.gc(self.handle, ffc.flexflow_sgd_optimizer_destroy) + self.handle = ffc().flexflow_sgd_optimizer_create(ffmodel.handle, lr, momentum, nesterov, weight_decay) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) def set_learning_rate(self, learning_rate): - ffc.flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) # ----------------------------------------------------------------------- # AdamOptimizer @@ -2999,11 +3000,11 @@ def set_learning_rate(self, learning_rate): class AdamOptimizer(object): __slots__ = ['handle', '_handle'] def __init__(self, ffmodel, alpha=0.001, beta1=0.9, beta2=0.999, weight_decay=0.0, epsilon=1e-8): - self.handle = ffc.flexflow_adam_optimizer_create(ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon) - self._handle = ffi.gc(self.handle, ffc.flexflow_adam_optimizer_destroy) + self.handle = ffc().flexflow_adam_optimizer_create(ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon) + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) def set_learning_rate(self, learning_rate): - 
ffc.flexflow_adam_optimizer_set_lr(self.handle, learning_rate) + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) # ----------------------------------------------------------------------- # Initializer @@ -3026,8 +3027,8 @@ def __init__(self, handle, p_handle=0): class GlorotUniformInitializer(Initializer): __slots__ = ['glorot_handle', '_glorot_handle'] def __init__(self, seed): - self.glorot_handle = ffc.flexflow_glorot_uniform_initializer_create(seed) - self._glorot_handle = ffi.gc(self.glorot_handle, ffc.flexflow_glorot_uniform_initializer_destroy) + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc(self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy) super(GlorotUniformInitializer, self).__init__(self.glorot_handle) # ----------------------------------------------------------------------- @@ -3037,8 +3038,8 @@ def __init__(self, seed): class ZeroInitializer(Initializer): __slots__ = ['zero_handle', '_zero_handle'] def __init__(self): - self.zero_handle = ffc.flexflow_zero_initializer_create() - self._zero_handle = ffi.gc(self.zero_handle, ffc.flexflow_zero_initializer_destroy) + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc(self.zero_handle, ffc().flexflow_zero_initializer_destroy) super(ZeroInitializer, self).__init__(self.zero_handle) # ----------------------------------------------------------------------- @@ -3048,8 +3049,8 @@ def __init__(self): class UniformInitializer(Initializer): __slots__ = ['uniform_handle', '_uniform_handle'] def __init__(self, seed, minv, maxv): - self.uniform_handle = ffc.flexflow_uniform_initializer_create(seed, minv, maxv) - self._uniform_handle = ffi.gc(self.uniform_handle, ffc.flexflow_uniform_initializer_destroy) + self.uniform_handle = ffc().flexflow_uniform_initializer_create(seed, minv, maxv) + self._uniform_handle = ffi.gc(self.uniform_handle, ffc().flexflow_uniform_initializer_destroy) super(UniformInitializer, self).__init__(self.uniform_handle) # ----------------------------------------------------------------------- @@ -3059,8 +3060,8 @@ def __init__(self, seed, minv, maxv): class NormInitializer(Initializer): __slots__ = ['norm_handle', '_norm_handle'] def __init__(self, seed, mean, stddev): - self.norm_handle = ffc.flexflow_norm_initializer_create(seed, mean, stddev) - self._norm_handle = ffi.gc(self.norm_handle, ffc.flexflow_norm_initializer_destroy) + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc(self.norm_handle, ffc().flexflow_norm_initializer_destroy) super(NormInitializer, self).__init__(self.norm_handle) # ----------------------------------------------------------------------- @@ -3071,10 +3072,10 @@ class PerfMetrics(object): __slots__= ['handle', '_handle'] def __init__(self, handle): self.handle = handle - self._handle = ffi.gc(self.handle, ffc.flexflow_per_metrics_destroy) + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) def get_accuracy(self): - return ffc.flexflow_per_metrics_get_accuracy(self.handle) + return ffc().flexflow_per_metrics_get_accuracy(self.handle) # ----------------------------------------------------------------------- # NetConfig @@ -3082,9 +3083,9 @@ def get_accuracy(self): class NetConfig(object): def __init__(self): - self.handle = ffc.flexflow_net_config_create() - self._handle = ffi.gc(self.handle, ffc.flexflow_net_config_destroy) - cpath = 
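A hedged example of passing the initializer wrappers above into a layer; the dense keyword names are assumed from the surrounding code rather than shown in this hunk:

kernel_init = GlorotUniformInitializer(seed=42)
bias_init = ZeroInitializer()
out = ffmodel.dense(x, 512,
                    kernel_initializer=kernel_init,  # assumed keyword name
                    bias_initializer=bias_init)      # assumed keyword name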
ffc.flexflow_net_config_get_dataset_path(self.handle) + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) self.dataset_path = ffi.string(cpath) # ----------------------------------------------------------------------- @@ -3093,32 +3094,32 @@ def __init__(self): class DLRMConfig(object): def __init__(self): - self.handle = ffc.flexflow_dlrm_config_create() - self._handle = ffi.gc(self.handle, ffc.flexflow_dlrm_config_destroy) + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - cstr = ffc.flexflow_dlrm_config_get_dataset_path(self.handle) + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) self.dataset_path = ffi.string(cstr) - cstr = ffc.flexflow_dlrm_config_get_arch_interaction_op(self.handle) + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) self.arch_interaction_op = ffi.string(cstr) - self.sparse_feature_size = ffc.flexflow_dlrm_config_get_sparse_feature_size(self.handle) - self.sigmoid_bot = ffc.flexflow_dlrm_config_get_sigmoid_bot(self.handle) - self.sigmoid_top = ffc.flexflow_dlrm_config_get_sigmoid_top(self.handle) - self.embedding_bag_size = ffc.flexflow_dlrm_config_get_embedding_bag_size(self.handle) - self.loss_threshold = ffc.flexflow_dlrm_config_get_loss_threshold(self.handle) + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size(self.handle) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size(self.handle) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - mlp_bot_c = ffc.flexflow_dlrm_config_get_mlp_bot(self.handle) + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) self.mlp_bot = [] for i in range(0, mlp_bot_c[0]): self.mlp_bot.append(mlp_bot_c[i+1]) - mlp_top_c = ffc.flexflow_dlrm_config_get_mlp_top(self.handle) + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) self.mlp_top = [] for i in range(0, mlp_top_c[0]): self.mlp_top.append(mlp_top_c[i+1]) - embedding_size_c = ffc.flexflow_dlrm_config_get_embedding_size(self.handle) + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) self.embedding_size = [] for i in range(0, embedding_size_c[0]): self.embedding_size.append(embedding_size_c[i+1]) @@ -3136,39 +3137,39 @@ def __init__(self, ffmodel, input, full_input, num_samples, data_type): self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) else: self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) - self._handle = ffi.gc(self.handle, ffc.flexflow_single_dataloader_destroy) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc.flexflow_single_dataloader_create(ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type) + self.handle = ffc().flexflow_single_dataloader_create(ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type) def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): # assert type(full_input) is 
Tensor, "SingleDataLoader full_input is wrong" c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc.flexflow_single_dataloader_create2(ffmodel.handle, input.handle, full_input, num_samples, c_data_type) + self.handle = ffc().flexflow_single_dataloader_create2(ffmodel.handle, input.handle, full_input, num_samples, c_data_type) @property def num_samples(self): - return ffc.flexflow_single_dataloader_get_num_samples(self.handle) + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) @num_samples.setter def num_samples(self, samples): - ffc.flexflow_single_dataloader_set_num_samples(self.handle, samples) + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) def next_batch(self, ffmodel): """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. :returns: None -- no returns. """ - ffc.flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) def reset(self): """Reset the current position of the dataloder to 0. :returns: None -- no returns. """ - ffc.flexflow_single_dataloader_reset(self.handle) + ffc().flexflow_single_dataloader_reset(self.handle) class RegionNdarray(object): __slots__ = ['__array_interface__'] @@ -3198,8 +3199,8 @@ def __init__(self, shape, data_type, base_ptr, strides, read_only): class BatchConfig(object): __slots__ = ['handle', '_handle'] def __init__(self): - self.handle = ffc.flexflow_batch_config_create() - self._handle = ffi.gc(self.handle, ffc.flexflow_batch_config_destroy) + self.handle = ffc().flexflow_batch_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_batch_config_destroy) # ----------------------------------------------------------------------- # TreeVerifyBatchConfig @@ -3208,8 +3209,8 @@ def __init__(self): class TreeVerifyBatchConfig(object): __slots__ = ['handle', '_handle'] def __init__(self): - self.handle = ffc.flexflow_tree_verify_batch_config_create() - self._handle = ffi.gc(self.handle, ffc.flexflow_tree_verify_batch_config_destroy) + self.handle = ffc().flexflow_tree_verify_batch_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_tree_verify_batch_config_destroy) # ----------------------------------------------------------------------- # BeamSearchBatchConfig @@ -3218,8 +3219,8 @@ def __init__(self): class BatchConfig(object): __slots__ = ['handle', '_handle'] def __init__(self): - self.handle = ffc.flexflow_beam_search_batch_config_create() - self._handle = ffi.gc(self.handle, ffc.flexflow_beam_search_batch_config_destroy) + self.handle = ffc().flexflow_beam_search_batch_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_beam_search_batch_config_destroy) # ----------------------------------------------------------------------- # RequestManager @@ -3228,20 +3229,20 @@ def __init__(self): class RequestManager(object): __slots__ = ['handle'] def __init__(self): - self.handle = ffc.flexflow_request_manager_get_request_manager() - #self._handle = ffi.gc(self.handle, ffc.flexflow_request_manager_destroy) + self.handle = ffc().flexflow_request_manager_get_request_manager() + #self._handle = ffi.gc(self.handle, ffc().flexflow_request_manager_destroy) def register_tokenizer(self, model_type, bos_token_id, eos_token_id, tokenizer_filepath): c_model_type = enum_to_int(ModelType, model_type) c_tokenizer_filepath = get_c_name(tokenizer_filepath) - return ffc.flexflow_request_manager_register_tokenizer(self.handle, c_model_type, bos_token_id, eos_token_id, 
c_tokenizer_filepath) + return ffc().flexflow_request_manager_register_tokenizer(self.handle, c_model_type, bos_token_id, eos_token_id, c_tokenizer_filepath) def register_output_filepath(self, output_filepath): c_output_filepath = get_c_name(output_filepath) - return ffc.flexflow_request_manager_register_output_filepath(self.handle, c_output_filepath) + return ffc().flexflow_request_manager_register_output_filepath(self.handle, c_output_filepath) def register_ssm_model(self, model): - return ffc.flexflow_request_manager_register_ssm_model(self.handle, model.handle) + return ffc().flexflow_request_manager_register_ssm_model(self.handle, model.handle) # ----------------------------------------------------------------------- # InferenceManager @@ -3250,14 +3251,14 @@ def register_ssm_model(self, model): class InferenceManager(object): __slots__ = ['handle'] def __init__(self): - self.handle = ffc.flexflow_inference_manager_get_inference_manager() - #self._handle = ffi.gc(self.handle, ffc.flexflow_inference_manager_destroy) + self.handle = ffc().flexflow_inference_manager_get_inference_manager() + #self._handle = ffi.gc(self.handle, ffc().flexflow_inference_manager_destroy) def compile_model_and_allocate_buffer(self, model): - ffc.flexflow_inference_manager_compile_model_and_allocate_buffer(self.handle, model.handle) + ffc().flexflow_inference_manager_compile_model_and_allocate_buffer(self.handle, model.handle) def init_operators_inference(self, model): - ffc.flexflow_inference_manager_init_operators_inference(self.handle, model.handle) + ffc().flexflow_inference_manager_init_operators_inference(self.handle, model.handle) # ----------------------------------------------------------------------- # FileDataLoader @@ -3267,8 +3268,8 @@ class FileDataLoader(object): __slots__ = ['handle', '_handle'] def __init__(self, weight_file_path, num_q_heads, num_kv_heads, hidden_dim, qkv_inner_dim, tensor_parallelism_degree): c_weight_file_path = get_c_name(weight_file_path) - self.handle = ffc.flexflow_file_data_loader_create(c_weight_file_path, num_q_heads, num_kv_heads, hidden_dim, qkv_inner_dim, tensor_parallelism_degree) - self._handle = ffi.gc(self.handle, ffc.flexflow_file_data_loader_destroy) + self.handle = ffc().flexflow_file_data_loader_create(c_weight_file_path, num_q_heads, num_kv_heads, hidden_dim, qkv_inner_dim, tensor_parallelism_degree) + self._handle = ffi.gc(self.handle, ffc().flexflow_file_data_loader_destroy) def load_weights(self, model, model_layers_with_weights, data_type): # Extract keys and values into arrays @@ -3287,4 +3288,4 @@ def load_weights(self, model, model_layers_with_weights, data_type): # Check data type and create use_full_precision boolean assert(data_type == DataType.DT_FLOAT or data_type == DataType.DT_HALF) use_full_precision = data_type == DataType.DT_FLOAT - ffc.flexflow_file_data_loader_load_weights(self.handle, model.handle, num_layers, layer_names_c, layer_handles_c, use_full_precision) + ffc().flexflow_file_data_loader_load_weights(self.handle, model.handle, num_layers, layer_names_c, layer_handles_c, use_full_precision) diff --git a/python/flexflow/flexflow_python b/python/flexflow/flexflow_python index 7fed992c6d..cf247b9ede 100644 --- a/python/flexflow/flexflow_python +++ b/python/flexflow/flexflow_python @@ -7,5 +7,6 @@ pylib_path="$(python "$python_packages"/flexflow/findpylib.py)" pylib_dir="$(dirname "$pylib_path")" export PATH="${python_packages}/flexflow/bin:${PATH}" export LD_LIBRARY_PATH="${python_packages}/flexflow/lib:${pylib_dir}:${PATH}" 
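The RequestManager, InferenceManager, and FileDataLoader wrappers above are the low-level serving entry points. A hedged setup sketch in which every path, token id, enum member, and the `llm_model` FFModel are placeholders invented for illustration:

rm = RequestManager()
rm.register_tokenizer(ModelType.LLAMA, 1, 2, "/path/to/tokenizer.model")  # bos=1, eos=2
rm.register_output_filepath("/tmp/inference_output.txt")

im = InferenceManager()
im.compile_model_and_allocate_buffer(llm_model)
im.init_operators_inference(llm_model)

result = llm_model.generate("A story about a dog.", 128)  # prompt, max_sequence_length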
+legion_python_args=("$@" "-ll:py" "1") -legion_python "$@" \ No newline at end of file +legion_python "${legion_python_args[@]}" diff --git a/python/flexflow/jupyter.py b/python/flexflow/jupyter.py deleted file mode 100644 index e2ed529c85..0000000000 --- a/python/flexflow/jupyter.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import json -from flexflow.config import flexflow_dir - -_CONFIG_FILENAME = None - -def set_jupyter_config(filename): - global _CONFIG_FILENAME - _CONFIG_FILENAME = filename - print("config file is set to:", _CONFIG_FILENAME) - -def load_jupyter_config(): - cmd_dict_key = ["cpus", "gpus", "utility", "sysmem", "fbmem", "zcmem"] - argv_dict = {} - global _CONFIG_FILENAME - if _CONFIG_FILENAME is None: - raise Exception("Sorry, jupyter configuration file is not set, please call set_jupyter_config to set the path to the configuration json file.") - with open(_CONFIG_FILENAME) as json_file: - cmd_dict = json.load(json_file) - for key in cmd_dict_key: - if key in cmd_dict and cmd_dict[key]["value"] is not None: - argv_dict[cmd_dict[key]["cmd"]] = cmd_dict[key]["value"] - return argv_dict diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index e45b9759a0..a8d0a0294c 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -12,43 +12,41 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys, os -from typing import Union, Optional +from typing import Optional from ..type import * +from flexflow.core import * +from .serve import LLM, SSM, GenerationConfig, GenerationResult -def _parse_positive_int_config(name: str, variable: str, ff_cli_name: str = None): - if variable is not None: - if type(variable) is not int: +def __check_positive_int(configs_dict: dict, key: str): + value = configs_dict.get(key, None) + if value is not None: + if type(value) is not int: + raise TypeError(f"Parameter {key} has value {value}, which is not an int!") + elif value <= 0: raise ValueError( - f"The following configs take positive integers only: {name}" + f"Parameter {key} has value {value}, which is not a positive number!" 
) - elif variable <= 0: - raise ValueError( - f"The following configs take positive integers only: {name}" - ) - if not ff_cli_name: - sys.argv += ["-{name}", str(variable)] - else: - sys.argv += [f"{ff_cli_name}", str(variable)] -def init(configs_dict: Optional[dict] = None, - *, - num_gpus: Optional[int] = None, - memory_per_gpu: Optional[int] = None, - zero_copy_memory_per_node: Optional[int] = None, - num_cpus: Optional[int] = None, - legion_utility_processors: Optional[int] = None, - data_parallelism_degree: Optional[int] = None, - tensor_parallelism_degree: Optional[int] = None, - pipeline_parallelism_degree: Optional[int] = None, - offload: Optional[bool] = None, - offload_reserve_space_size: Optional[int] = None, - use_4bit_quantization: Optional[bool] = None, - use_8bit_quantization: Optional[bool] = None, - profiling: Optional[bool] = None, - fusion: Optional[bool] = None): +def init( + configs_dict: Optional[dict] = None, + *, + num_gpus: Optional[int] = None, + memory_per_gpu: Optional[int] = None, + zero_copy_memory_per_node: Optional[int] = None, + num_cpus: Optional[int] = None, + legion_utility_processors: Optional[int] = None, + data_parallelism_degree: Optional[int] = None, + tensor_parallelism_degree: Optional[int] = None, + pipeline_parallelism_degree: Optional[int] = None, + offload: Optional[bool] = None, + offload_reserve_space_size: Optional[int] = None, + use_4bit_quantization: Optional[bool] = None, + use_8bit_quantization: Optional[bool] = None, + profiling: Optional[bool] = None, + fusion: Optional[bool] = None, +): """ Configure FlexFlow Serve and start the runtime. @@ -113,117 +111,86 @@ def init(configs_dict: Optional[dict] = None, :raises TypeError: this function will raise an exception if the configs_dict is not a dictionary :raises ValueError: this function will raise an exception if the mandatory FlexFlow initialization parameters are missing, or are not positive integers: num_gpus, memory_per_gpu, zero_copy_memory_per_node """ - - # Check that either configs_dict or any of individual, non-positional arguments (after the *) is passed, but not both - if configs_dict is not None and any([ - num_gpus is not None, - memory_per_gpu is not None, - zero_copy_memory_per_node is not None, - num_cpus is not None, - legion_utility_processors is not None, - data_parallelism_degree is not None, - tensor_parallelism_degree is not None, - pipeline_parallelism_degree is not None, - offload is not None, - offload_reserve_space_size is not None, - use_4bit_quantization is not None, - use_8bit_quantization is not None, - profiling is not None, - fusion is not None, - ]): + + # Check that if configs_dict is passed, no other key-value argument (after the *) is passed. 
+ if configs_dict is not None and any( + [ + num_gpus is not None, + memory_per_gpu is not None, + zero_copy_memory_per_node is not None, + num_cpus is not None, + legion_utility_processors is not None, + data_parallelism_degree is not None, + tensor_parallelism_degree is not None, + pipeline_parallelism_degree is not None, + offload is not None, + offload_reserve_space_size is not None, + use_4bit_quantization is not None, + use_8bit_quantization is not None, + profiling is not None, + fusion is not None, + ] + ): raise ValueError("Cannot pass both configs_dict and individual args") if configs_dict is not None: - # If configs_dict is passed, check that the type is dictionary and that the mandatory key-value pairs are present (num_gpus, memory_per_gpu, zero_copy_memory_per_node) if type(configs_dict) != dict: raise TypeError("configs_dict is not a dictionary") - # configs should contain the following mandatory keys with non-zero integer values: - num_gpus = configs_dict.get("num_gpus") - memory_per_gpu = configs_dict.get("memory_per_gpu") - zero_copy_memory_per_node = configs_dict.get("zero_copy_memory_per_node") - if not num_gpus or not memory_per_gpu or not zero_copy_memory_per_node: - raise ValueError( - "Missing one of the following configs in config dict: num_gpus, memory_per_gpu, zero_copy_memory_per_node" - ) - num_cpus = configs_dict.get("num_cpus") - legion_utility_processors = configs_dict.get("legion_utility_processors", 8) - data_parallelism_degree = configs_dict.get("data_parallelism_degree") - tensor_parallelism_degree = configs_dict.get("tensor_parallelism_degree") - pipeline_parallelism_degree = configs_dict.get("pipeline_parallelism_degree") - offload = configs_dict.get("offload", False) - offload_reserve_space_size = configs_dict.get("offload_reserve_space_size") - use_4bit_quantization = configs_dict.get("use_4bit_quantization", False) - use_8bit_quantization = configs_dict.get("use_8bit_quantization", False) - profiling = configs_dict.get("profiling", False) - fusion = configs_dict.get("fusion", True) else: - # If configs_dict is not passed, check that the mandatory parameters are passed directly as arguments - if not num_gpus or not memory_per_gpu or not zero_copy_memory_per_node: + # Add named key-value arguments into dictionary + configs_dict["num_gpus"] = num_gpus + configs_dict["memory_per_gpu"] = memory_per_gpu + configs_dict["zero_copy_memory_per_node"] = zero_copy_memory_per_node + configs_dict["legion_utility_processors"] = legion_utility_processors + configs_dict["data_parallelism_degree"] = data_parallelism_degree + configs_dict["tensor_parallelism_degree"] = tensor_parallelism_degree + configs_dict["pipeline_parallelism_degree"] = pipeline_parallelism_degree + configs_dict["offload"] = offload + configs_dict["offload_reserve_space_size"] = offload_reserve_space_size + configs_dict["use_4bit_quantization"] = use_4bit_quantization + configs_dict["use_8bit_quantization"] = use_8bit_quantization + configs_dict["profiling"] = profiling + configs_dict["fusion"] = fusion + + # Check that mandatory configs are present + required_keys = ["num_gpus", "memory_per_gpu", "zero_copy_memory_per_node"] + for required_key in required_keys: + if configs_dict.get(required_key, None) is None: raise ValueError( - "Missing one of the following configs in input params: num_gpus, memory_per_gpu, zero_copy_memory_per_node" - ) - offload = False if offload is None else offload - use_4bit_quantization = False if use_4bit_quantization is None else use_4bit_quantization - 
use_8bit_quantization = False if use_8bit_quantization is None else use_8bit_quantization - profiling = False if profiling is None else profiling - fusion = True if fusion is None else fusion - - # Remove the arguments to avoid interferences - sys.argv = [sys.argv[0]] - - # parse arguments - _parse_positive_int_config("num_gpus", num_gpus, "-ll:gpu") - _parse_positive_int_config("memory_per_gpu", memory_per_gpu, "-ll:fsize") - _parse_positive_int_config( - "zero_copy_memory_per_node", zero_copy_memory_per_node, "-ll:zsize" - ) + "Missing one of the following required configs: num_gpus, memory_per_gpu, zero_copy_memory_per_node" + ) - # parse optional arguments - _parse_positive_int_config("num_cpus", num_cpus, "-ll:cpu") - _parse_positive_int_config( - "legion_utility_processors", legion_utility_processors, "-ll:util" - ) - _parse_positive_int_config( - "data_parallelism_degree", data_parallelism_degree, "-data-parallelism-degree" - ) - _parse_positive_int_config( + # Sanity check parameters + positive_int_params = required_keys + [ + "legion_utility_processors", + "data_parallelism_degree", "tensor_parallelism_degree", - tensor_parallelism_degree, - "-tensor-parallelism-degree", - ) - _parse_positive_int_config( "pipeline_parallelism_degree", - pipeline_parallelism_degree, - "-pipeline-parallelism-degree", - ) - if offload: - sys.argv += ["-offload"] - _parse_positive_int_config( "offload_reserve_space_size", - offload_reserve_space_size, - "-offload-reserve-space-size", - ) - if use_4bit_quantization: - sys.argv += ["--4bit-quantization"] - if use_8bit_quantization: - sys.argv += ["--8bit-quantization"] - if profiling: - sys.argv += ["--profiling"] - if fusion: - sys.argv += ["--fusion"] - - global LLM, SSM, GenerationConfig, GenerationResult - from .serve import LLM, SSM, GenerationConfig, GenerationResult + ] + for param in positive_int_params: + __check_positive_int(configs_dict, param) + # Set default values + if configs_dict.get("legion_utility_processors", None) is None: + configs_dict["legion_utility_processors"] = 8 + if configs_dict.get("data_parallelism_degree", None) is None: + configs_dict["data_parallelism_degree"] = 1 + if configs_dict.get("tensor_parallelism_degree", None) is None: + configs_dict["tensor_parallelism_degree"] = 1 + if configs_dict.get("pipeline_parallelism_degree", None) is None: + configs_dict["pipeline_parallelism_degree"] = 1 + if configs_dict.get("offload", None) is None: + configs_dict["offload"] = False + if configs_dict.get("offload_reserve_space_size", None) is None: + configs_dict["offload_reserve_space_size"] = 1024 ** 2 + if configs_dict.get("use_4bit_quantization", None) is None: + configs_dict["use_4bit_quantization"] = False + if configs_dict.get("use_8bit_quantization", None) is None: + configs_dict["use_8bit_quantization"] = False + if configs_dict.get("profiling", None) is None: + configs_dict["profiling"] = False + if configs_dict.get("fusion", None) is None: + configs_dict["fusion"] = True -def init_cpu(): - """Start the FlexFlow runtime and import the inference package without access to GPU functionalities. - This is useful to access the utilies from the flexflow package without using up GPU memory. 
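With the refactor above, `flexflow.serve.init()` accepts either a configs dictionary or keyword arguments (passing both raises ValueError), checks that the three mandatory keys are present and positive integers, and fills in the defaults shown before starting the runtime. A hedged usage sketch; the numeric values below are illustrative only.

import flexflow.serve as ff

# Dictionary form: num_gpus, memory_per_gpu and zero_copy_memory_per_node are
# mandatory; everything else falls back to the defaults set above.
ff.init(
    {
        "num_gpus": 4,
        "memory_per_gpu": 14000,
        "zero_copy_memory_per_node": 30000,
        "tensor_parallelism_degree": 4,
    }
)

# Equivalent keyword form:
# ff.init(num_gpus=4, memory_per_gpu=14000, zero_copy_memory_per_node=30000,
#         tensor_parallelism_degree=4)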
- """ - # Remove the arguments to avoid interferences - sys.argv = [sys.argv[0]] - # Ask the runtime to avoid using GPU/GPU memory - os.environ["CPU_ONLY_TEST"] = "1" - - global LLM, SSM, GenerationConfig, GenerationResult - from .serve import LLM, SSM, GenerationConfig, GenerationResult + init_flexflow_runtime(configs_dict) diff --git a/python/flexflow_python_build.py b/python/flexflow_python_build.py index a9d8e8983e..65aff5af56 100755 --- a/python/flexflow_python_build.py +++ b/python/flexflow_python_build.py @@ -42,15 +42,19 @@ '#! /usr/bin/env bash', f'BUILD_FOLDER="{build_dir}"', 'SCRIPT_DIR="$(realpath "${BASH_SOURCE[0]%/*}")"', + 'legion_python_args=("$@" "-ll:py" "1")', 'if [[ "$SCRIPT_DIR" == "$BUILD_FOLDER" ]]; then', f'\tPYTHON_FOLDER="{script_dir}"', '\tPYLIB_PATH="$("$PYTHON_FOLDER"/flexflow/findpylib.py)"', '\tPYLIB_DIR="$(dirname "$PYLIB_PATH")"', '\texport LD_LIBRARY_PATH="$BUILD_FOLDER:$BUILD_FOLDER/deps/legion/lib:$PYLIB_DIR:$LD_LIBRARY_PATH"', '\texport PYTHONPATH="$PYTHON_FOLDER:$BUILD_FOLDER/deps/legion/bindings/python:$PYTHONPATH"', - '\t$BUILD_FOLDER/deps/legion/bin/legion_python "$@"', + '\t$BUILD_FOLDER/deps/legion/bin/legion_python "${legion_python_args[@]}"', 'else', - '\tlegion_python "$@"', + '\tPYLIB_PATH="$(python3 -m flexflow.findpylib)"', + '\tPYLIB_DIR="$(dirname "$PYLIB_PATH")"', + '\texport LD_LIBRARY_PATH="$PYLIB_DIR:$LD_LIBRARY_PATH"', + '\tlegion_python "${legion_python_args[@]}"', 'fi' ] with open(flexflow_python_path, "w+") as flexflow_python_file: diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index b4102a7dba..75e68a7332 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -395,11 +395,11 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, if (m->quantization_type != DT_NONE) { // copy weight_ptr to quantized_weight_ptr, do compression and store in // m->weight_ptr - hipMemcpyAsync(m->quantized_weight_ptr, - weight.get_byte_ptr(), - m->quantized_weightSize, - hipMemcpyHostToDevice, - stream); + checkCUDA(hipMemcpyAsync(m->quantized_weight_ptr, + weight.get_byte_ptr(), + m->quantized_weightSize, + hipMemcpyHostToDevice, + stream)); if (m->quantization_type == DT_INT4) { int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; @@ -427,17 +427,17 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, } } else { if (data_type == DT_FLOAT) { - hipMemcpyAsync(m->weight_ptr, - weight.get_float_ptr(), - m->weightSize, - hipMemcpyHostToDevice, - stream); + checkCUDA(hipMemcpyAsync(m->weight_ptr, + weight.get_float_ptr(), + m->weightSize, + hipMemcpyHostToDevice, + stream)); } else if (data_type == DT_HALF) { - hipMemcpyAsync(m->weight_ptr, - weight.get_half_ptr(), - m->weightSize, - hipMemcpyHostToDevice, - stream); + checkCUDA(hipMemcpyAsync(m->weight_ptr, + weight.get_half_ptr(), + m->weightSize, + hipMemcpyHostToDevice, + stream)); } else { assert(false); } @@ -456,15 +456,16 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { - hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream); + checkCUDA(hipMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
(m->bias_ptr); } - hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream); + checkCUDA(hipMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->num_active_tokens() * + sizeof(BatchConfig::PerTokenInfo), + hipMemcpyHostToDevice, + stream)); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -1132,7 +1133,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } - hipStreamSynchronize(stream); + checkCUDA(hipStreamSynchronize(stream)); } IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) {} diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index c9ef952d3b..e5e35a4c90 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -528,28 +528,32 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, hipStream_t stream) { // here because we need postion info in infernece 1 - hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream); - hipMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->MAX_NUM_REQUESTS * sizeof(BatchConfig::PerRequestInfo), - hipMemcpyHostToDevice, - stream); - hipMemcpyAsync(m->beam_token_infos, - &(bc->beamTokenInfo), - bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), - hipMemcpyHostToDevice, - stream); - hipMemcpyAsync(m->beam_request_infos, - &(bc->beamRequestsInfo), - bc->MAX_NUM_REQUESTS * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), - hipMemcpyHostToDevice, - stream); + checkCUDA( + hipMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), + hipMemcpyHostToDevice, + stream)); + checkCUDA( + hipMemcpyAsync(m->request_infos, + &(bc->requestsInfo), + bc->MAX_NUM_REQUESTS * sizeof(BatchConfig::PerRequestInfo), + hipMemcpyHostToDevice, + stream)); + checkCUDA( + hipMemcpyAsync(m->beam_token_infos, + &(bc->beamTokenInfo), + bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * + sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), + hipMemcpyHostToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + m->beam_request_infos, + &(bc->beamRequestsInfo), + bc->MAX_NUM_REQUESTS * + sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), + hipMemcpyHostToDevice, + stream)); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -710,7 +714,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( gpu_mem_allocator.instance_allocated_size); } - hipStreamSynchronize(stream); + checkCUDA(hipStreamSynchronize(stream)); } SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index fbd6d1cc48..b3a56f650a 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -518,15 +518,15 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, if (m->handle.offload_reserve_space != nullptr) { // Note that we update weight_ptr and bias_ptr when uploading weight and // bias - hipMemcpyAsync(m->weight_ptr, - weight_ptr, - m->weightSize, - hipMemcpyHostToDevice, - stream); + checkCUDA(hipMemcpyAsync(m->weight_ptr, + 
weight_ptr, + m->weightSize, + hipMemcpyHostToDevice, + stream)); weight_ptr = static_cast<DT *>
(m->weight_ptr); if (m->biasSize > 0) { - hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream); + checkCUDA(hipMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast<DT *>
(m->bias_ptr); } } @@ -534,12 +534,13 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - hipMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - hipMemcpyHostToDevice, - stream); + checkCUDA( + hipMemcpyAsync(m->committed_token_infos, + &(bc->committed_tokens), + bc->num_tokens_to_commit * + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), + hipMemcpyHostToDevice, + stream)); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -548,16 +549,16 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { - hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream); + checkCUDA(hipMemcpyAsync( + m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
(m->bias_ptr); } - hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream); + checkCUDA(hipMemcpyAsync(m->token_infos, + &(bc->tokensInfo), + bc->MAX_NUM_TOKENS * + sizeof(TreeVerifyBatchConfig::PerTokenInfo), + hipMemcpyHostToDevice, + stream)); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -719,7 +720,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( } } - hipStreamSynchronize(stream); + checkCUDA(hipStreamSynchronize(stream)); } TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) { diff --git a/tests/align/align_create_tensor_ff.py b/tests/align/align_create_tensor_ff.py index 2dbcb942d3..6c8774a33e 100644 --- a/tests/align/align_create_tensor_ff.py +++ b/tests/align/align_create_tensor_ff.py @@ -1,7 +1,7 @@ import os import sys import torch -import argparse +import json from flexflow.core import * from flexflow.core.flexflow_cffi import Linear, Op, Parameter from flexflow.type import AggrMode @@ -20,8 +20,14 @@ param_bias_op = {'conv2d': Conv2D, 'layernorm': LayerNorm, 'linear': Linear} -def create_single_operator_ff(): +def top_level_task(): args = parse_create_tensor_args() + configs_dict = None + if args.config_file is not None: + with open(args.config_file) as f: + configs_dict = json.load(f) + init_flexflow_runtime(configs_dict) + operator_name = args.operator OUT_DIR = os.path.join("tests", "align", "out", operator_name) @@ -669,4 +675,4 @@ def create_tensors_for_gather_ff(ffmodel): if __name__ == "__main__": - create_single_operator_ff() + top_level_task() diff --git a/tests/align/align_utils.py b/tests/align/align_utils.py index 368893c5eb..d53e5cbba9 100644 --- a/tests/align/align_utils.py +++ b/tests/align/align_utils.py @@ -112,7 +112,12 @@ def parse_create_tensor_args(): parser = ArgumentParser(description='Pytorch Aligment Test Suite') parser.add_argument("-o", "--operator", dest="operator", required=False, metavar="", help="operator needs to be test") - + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default=None, + ) args, unknown = parser.parse_known_args() return args diff --git a/tests/align/test_all_operators.sh b/tests/align/test_all_operators.sh index 484e121eee..3fb361f25c 100755 --- a/tests/align/test_all_operators.sh +++ b/tests/align/test_all_operators.sh @@ -4,7 +4,7 @@ eval "$(conda shell.bash hook)" rm -rf align/out function generate_ff_tensor(){ - ./build/flexflow_python tests/align/align_create_tensor_ff.py -ll:py 1 -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16 -o "$1" + ./build/flexflow_python tests/align/align_create_tensor_ff.py -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16 -o "$1" } function generate_torch_tensor(){ diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 29e377e5bc..1e8dd4298f 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -51,11 +51,6 @@ if [[ -f "$FF_HOME/build/examples/cpp/AlexNet/alexnet" ]]; then # TODO: fix split tests # "$FF_HOME"/build/examples/cpp/split_test/split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # "$FF_HOME"/build/examples/cpp/split_test_2/split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - # Inference examples - # if [ $(( GPU_AVAILABLE )) -lt $(( 4 )) ]; then echo "Skipping LLAMA test because it requires 4 GPUs, but only $GPU_AVAILABLE are available. " ; exit 1; fi - # "$FF_HOME"/build/examples/cpp/inference/LLAMA/LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize 30000 --only-data-parallel - #"$FF_HOME"/build/examples/cpp/inference/mixture_of_experts/inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel - #"$FF_HOME"/build/examples/cpp/inference/transformers/inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel else python_packages=$(python -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))") OLD_PATH="$PATH" @@ -84,11 +79,6 @@ else # TODO: fix split tests # split_test -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel # split_test_2 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel - # Inference examples - # if [ $(( GPU_AVAILABLE )) -lt $(( 4 )) ]; then echo "Skipping LLAMA test because it requires 4 GPUs, but only $GPU_AVAILABLE are available. 
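The test-suite changes in this patch move GPU and memory settings out of Legion command-line flags and into a JSON file: the shell drivers write the file with jq, and each Python test loads it and passes the dictionary to `init_flexflow_runtime`, as `align_create_tensor_ff.py` does above. A minimal sketch of the consumer side (paths and values are illustrative):

import argparse
import json

from flexflow.core import *  # provides init_flexflow_runtime, as in the tests

parser = argparse.ArgumentParser()
parser.add_argument("-config-file", type=str, default=None)
args, _ = parser.parse_known_args()

configs_dict = None
if args.config_file is not None:
    with open(args.config_file) as f:
        # e.g. {"num_gpus": 1, "memory_per_gpu": 14048, "zero_copy_memory_per_node": 12192, ...}
        configs_dict = json.load(f)

init_flexflow_runtime(configs_dict)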
" ; exit 1; fi - # LLAMA -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize 30000 --only-data-parallel - #inference_moe -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel - #inference_transformers -ll:gpu "$GPUS" -ll:util 8 -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --only-data-parallel fi done export PATH="$OLD_PATH" diff --git a/tests/multi_gpu_tests.sh b/tests/multi_gpu_tests.sh index 0321068641..3a6f6467df 100755 --- a/tests/multi_gpu_tests.sh +++ b/tests/multi_gpu_tests.sh @@ -8,75 +8,82 @@ NUM_NODES=${2:-1} # number of nodes BATCHSIZE=$(( NUM_NODES * GPUS * 64)) FSIZE=13800 ZSIZE=12192 +ONLY_DATA_PARALLEL=true FF_HOME="$(realpath "${BASH_SOURCE[0]%/*}/..")" export FF_HOME -# Edit the folder below if you did not build FlexFlow in $FF_HOME/build -BUILD_FOLDER="${FF_HOME}/build" -export BUILD_FOLDER if [[ $NUM_NODES -gt 1 ]]; then export GPUS export NUM_NODES EXE="$FF_HOME"/tests/multinode_helpers/mpi_wrapper1.sh else - if [[ -f "$BUILD_FOLDER/flexflow_python" ]]; then - EXE="$BUILD_FOLDER"/flexflow_python - else - EXE="flexflow_python" - fi + EXE="python" fi +# Check that number of GPUs requested is available echo "Running GPU tests with $NUM_NODES node(s) and $GPUS gpu(s)/node" GPU_AVAILABLE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) GPU_REQUESTED=$(( GPUS * NUM_NODES)) if [ $GPU_REQUESTED -gt $(( GPU_AVAILABLE )) ]; then echo "The test requires $GPU_REQUESTED GPUs, but only $GPU_AVAILABLE are available. Try reducing the number of nodes, or the number of gpus/node." ; exit; fi +# Generate configs JSON files +test_params=$(jq -n --arg num_gpus "$GPUS" --arg memory_per_gpu "$FSIZE" --arg zero_copy_memory_per_node "$ZSIZE" --arg batch_size "$BATCHSIZE" --arg only_data_parallel "$ONLY_DATA_PARALLEL" '{"num_gpus":$num_gpus,"memory_per_gpu":$memory_per_gpu,"zero_copy_memory_per_node":$zero_copy_memory_per_node,"batch_size":$batch_size,"only_data_parallel":$only_data_parallel}') +test_params_5_epochs=$(echo "$test_params" | jq '. + {"epochs": 5}') +test_params_40_epochs=$(echo "$test_params" | jq '. 
+ {"epochs": 40}') +test_params_5_epochs_no_batch_size=$(echo "$test_params_5_epochs" | jq 'del(.batch_size)') +test_params_40_epochs_no_batch_size=$(echo "$test_params_40_epochs" | jq 'del(.batch_size)') +mkdir -p /tmp/flexflow/multi_gpu_tests +echo "$test_params" > /tmp/flexflow/multi_gpu_tests/test_params.json +echo "$test_params_5_epochs" > /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json +echo "$test_params_5_epochs_no_batch_size" > /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json +echo "$test_params_40_epochs_no_batch_size" > /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json + #Sequential model tests -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -#$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +#$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json #Keras other -$EXE "$FF_HOME"/examples/python/keras/callback.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/unary.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/reshape.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} 
--only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/gather.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/regularizer.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel +$EXE "$FF_HOME"/examples/python/keras/callback.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/unary.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/reshape.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/gather.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/regularizer.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json #Functional API -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel +$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE 
"$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json #Python -$EXE "$FF_HOME"/examples/python/native/print_layers.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/split.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/alexnet.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 40 --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 40 --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 5 --only-data-parallel +$EXE "$FF_HOME"/examples/python/native/print_layers.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json +$EXE "$FF_HOME"/examples/python/native/split.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/native/alexnet.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json +$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json +$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json +$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json +$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json +$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json #Possible crash -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b 
${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" --epochs 40 --only-data-parallel +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json diff --git a/tests/multinode_helpers/mpi_wrapper1.sh b/tests/multinode_helpers/mpi_wrapper1.sh index 2e493f63e7..87d17d11a3 100755 --- a/tests/multinode_helpers/mpi_wrapper1.sh +++ b/tests/multinode_helpers/mpi_wrapper1.sh @@ -3,7 +3,6 @@ set -x set -e if [ -z "$FF_HOME" ]; then echo "FF_HOME variable is not defined, aborting tests"; exit; fi -if [ -z "$BUILD_FOLDER" ]; then echo "BUILD_FOLDER variable is not defined, aborting tests"; exit; fi if [ -z "$NUM_NODES" ]; then echo "NUM_NODES variable is not defined, aborting tests"; exit; fi if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exit; fi diff --git a/tests/multinode_helpers/mpi_wrapper2.sh b/tests/multinode_helpers/mpi_wrapper2.sh index a4e871d700..57812884dc 100755 --- a/tests/multinode_helpers/mpi_wrapper2.sh +++ b/tests/multinode_helpers/mpi_wrapper2.sh @@ -2,8 +2,6 @@ set -x set -e -if [ -z "$FF_HOME" ]; then echo "FF_HOME variable is not defined, aborting tests"; exit; fi -if [ -z "$BUILD_FOLDER" ]; then echo "BUILD_FOLDER variable is not defined, aborting tests"; exit; fi if [ -z "$NUM_NODES" ]; then echo "NUM_NODES variable is not defined, aborting tests"; exit; fi if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exit; fi @@ -13,11 +11,4 @@ if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exi CUDA_VISIBLE_DEVICES=$(seq -s, $((OMPI_COMM_WORLD_RANK * GPUS )) $(( OMPI_COMM_WORLD_RANK * GPUS +1 )) ) export CUDA_VISIBLE_DEVICES -if [[ -f "$BUILD_FOLDER/flexflow_python" ]]; then - EXE="$BUILD_FOLDER"/flexflow_python -else - EXE="flexflow_python" -fi - -$EXE "$@" - +python "$@" diff --git a/tests/python_interface_test.sh b/tests/python_interface_test.sh index 6c452bd10f..4f83918a49 100755 --- a/tests/python_interface_test.sh +++ b/tests/python_interface_test.sh @@ -8,12 +8,19 @@ check_python_interface() { BATCHSIZE=$((GPUS * 64)) FSIZE=14048 ZSIZE=12192 + ONLY_DATA_PARALLEL=true interpreter=${1:-python} installation_status=${2:-"before-installation"} + + # Generate configs JSON files + test_params=$(jq -n --arg num_gpus "$GPUS" --arg memory_per_gpu "$FSIZE" --arg zero_copy_memory_per_node "$ZSIZE" --arg batch_size "$BATCHSIZE" --arg only_data_parallel "$ONLY_DATA_PARALLEL" '{"num_gpus":$num_gpus,"memory_per_gpu":$memory_per_gpu,"zero_copy_memory_per_node":$zero_copy_memory_per_node,"batch_size":$batch_size,"only_data_parallel":$only_data_parallel}') + mkdir -p /tmp/flexflow/multi_gpu_tests + 
echo "$test_params" > /tmp/flexflow/multi_gpu_tests/test_params.json + if [[ "$interpreter" == "python" ]]; then EXE="python" echo "Running a single-GPU Python test to check the Python interface (native python interpreter)" - $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel + $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json elif [[ "$interpreter" == "flexflow_python" ]]; then if [[ "$installation_status" == "before-installation" ]]; then EXE="$BUILD_FOLDER"/flexflow_python @@ -21,7 +28,7 @@ check_python_interface() { EXE="flexflow_python" fi echo "Running a single-GPU Python test to check the Python interface (flexflow_python interpreter)" - $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -ll:py 1 -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel + $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -ll:gpu "$GPUS" -ll:fsize "$FSIZE" -ll:zsize "$ZSIZE" -b ${BATCHSIZE} --only-data-parallel else echo "Invalid Python interpreter" exit 1 @@ -38,10 +45,10 @@ export BUILD_FOLDER installation_status=${1:-"before-installation"} echo "Running Python interface tests (installation status: ${installation_status})" if [[ "$installation_status" == "before-installation" ]]; then - # Import flexflow.core module in Python + # Check availability of flexflow modules in Python export PYTHONPATH="${FF_HOME}/python:${BUILD_FOLDER}/deps/legion/bindings/python:${PYTHONPATH}" export LD_LIBRARY_PATH="${BUILD_FOLDER}:${LD_LIBRARY_PATH}" - python -c "import flexflow.core; exit()" + python -c "import flexflow.core; import flexflow.serve as ff; exit()" unset PYTHONPATH unset LD_LIBRARY_PATH # Run a single-gpu test using the flexflow_python interpreter @@ -53,8 +60,8 @@ if [[ "$installation_status" == "before-installation" ]]; then unset PYTHONPATH unset LD_LIBRARY_PATH elif [[ "$installation_status" == "after-installation" ]]; then - # Import flexflow.core module in Python - python -c "import flexflow.core; exit()" + # Check availability of flexflow modules in Python + python -c "import flexflow.core; import flexflow.serve as ff; exit()" # Run a single-gpu test using the flexflow_python interpreter check_python_interface flexflow_python after-installation # Run a single-gpu test using the native python interpreter From 1d5b4c6ce89806bddb857cf7bcc1729fd5764d34 Mon Sep 17 00:00:00 2001 From: xinhaoc <99570243+xinhaoc@users.noreply.github.com> Date: Wed, 6 Sep 2023 03:34:57 -0400 Subject: [PATCH 223/344] support MPT model (#1093) * init. * layernorm add option not use bias * add alibi kernel * finish cpp interface. 
* python interface * fix tensor parallelism issue * fix * clean up * fix * hip --------- Co-authored-by: Zhihao Jia --- include/flexflow/ffconst.h | 3 +- include/flexflow/flexflow_c.h | 7 + include/flexflow/model.h | 7 + .../ops/inc_multihead_self_attention.h | 6 +- .../ops/inc_multihead_self_attention_params.h | 2 +- .../inc_multihead_self_attention_kernels.h | 8 + include/flexflow/ops/layer_norm.h | 5 +- include/flexflow/ops/layer_norm_params.h | 1 + .../ops/spec_inc_multihead_self_attention.h | 4 +- ...spec_inc_multihead_self_attention_params.h | 2 +- .../ops/tree_inc_multihead_self_attention.h | 4 +- ...tree_inc_multihead_self_attention_params.h | 2 +- inference/incr_decoding/CMakeLists.txt | 3 +- inference/incr_decoding/incr_decoding.cc | 20 +- inference/models/mpt.cc | 215 +++++++++++++ inference/models/mpt.h | 75 +++++ inference/spec_infer/CMakeLists.txt | 3 +- inference/spec_infer/spec_infer.cc | 43 ++- python/flexflow/core/flexflow_cffi.py | 49 ++- python/flexflow/serve/models/__init__.py | 1 + python/flexflow/serve/models/mpt.py | 290 ++++++++++++++++++ python/flexflow/serve/serve.py | 6 +- python/flexflow/type.py | 1 + src/c/flexflow_c.cc | 22 +- src/ops/fused.cpp | 8 +- src/ops/fused.cu | 8 +- src/ops/inc_multihead_self_attention.cc | 27 +- src/ops/inc_multihead_self_attention.cpp | 65 ++++ src/ops/inc_multihead_self_attention.cu | 64 +++- src/ops/layer_norm.cc | 156 ++++++---- src/ops/layer_norm.cpp | 6 +- src/ops/layer_norm.cu | 6 +- src/ops/spec_inc_multihead_self_attention.cc | 21 +- src/ops/spec_inc_multihead_self_attention.cpp | 15 + src/ops/spec_inc_multihead_self_attention.cu | 16 +- src/ops/tree_inc_multihead_self_attention.cc | 26 +- src/ops/tree_inc_multihead_self_attention.cpp | 16 + src/ops/tree_inc_multihead_self_attention.cu | 16 + src/runtime/graph.cc | 15 +- src/runtime/model.cc | 3 + src/runtime/request_manager.cc | 8 +- .../python_test_configs/generate_configs.py | 3 +- 42 files changed, 1134 insertions(+), 124 deletions(-) create mode 100644 inference/models/mpt.cc create mode 100644 inference/models/mpt.h create mode 100644 python/flexflow/serve/models/mpt.py diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 2f97d48997..78d98284a4 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -185,7 +185,8 @@ enum ModelType { LLAMA2 = 3003, OPT = 3004, FALCON = 3005, - STARCODER = 3006 + STARCODER = 3006, + MPT = 3007 }; enum PMParameter { diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 003533bb80..7977a083cc 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -256,6 +256,7 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle, int *axes, bool elementwise_affine, float eps, + bool use_bias, char const *name); flexflow_tensor_t @@ -416,6 +417,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name); flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( @@ -435,6 +437,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name); flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( @@ -454,6 +457,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char 
const *name); flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( @@ -474,6 +478,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name); flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( @@ -494,6 +499,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name); flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( @@ -514,6 +520,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name); flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index bc3c7e6545..d4d829b019 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -525,6 +525,7 @@ class FFModel { std::vector const &axes, bool elementwise_affine, float eps, + bool use_bias = true, DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer @@ -653,6 +654,7 @@ class FFModel { bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, + bool position_bias = false, char const *name = NULL); Tensor spec_inc_multihead_self_attention(const Tensor input, @@ -670,6 +672,7 @@ class FFModel { bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, + bool position_bias = false, char const *name = NULL); Tensor inc_multihead_self_attention_verify( const Tensor input, @@ -687,6 +690,7 @@ class FFModel { bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, + bool position_bias = false, char const *name = NULL); Tensor inc_multiquery_self_attention(const Tensor input, int embed_dim, @@ -704,6 +708,7 @@ class FFModel { bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, + bool position_bias = false, char const *name = NULL); Tensor spec_inc_multiquery_self_attention(const Tensor input, @@ -722,6 +727,7 @@ class FFModel { bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, + bool position_bias = false, char const *name = NULL); Tensor inc_multiquery_self_attention_verify( const Tensor input, @@ -740,6 +746,7 @@ class FFModel { bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, + bool position_bias = false, char const *name = NULL); // ======================================== // Inference APIs diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index e48a8d4240..010c3f6085 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -43,6 +43,7 @@ class IncMultiHeadSelfAttention : public Op { bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, DataType _quantization_type, bool _offload, @@ -64,6 +65,7 @@ class IncMultiHeadSelfAttention : public Op { bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, DataType _quantization_type, bool _offload, @@ -125,7 +127,7 @@ class IncMultiHeadSelfAttention : public Op { float dropout, scaling_factor; bool bias; bool add_bias_kv, 
add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling; + qk_prod_scaling, position_bias; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; DataType quantization_type; @@ -155,6 +157,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { bool _bias, bool _scaling_query, bool _qk_prod_scaling, + bool _position_bias, bool _add_bias_kv, float _scaling_factor, GenericTensorAccessorR const &weight, @@ -179,6 +182,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { bool *bias; bool *scaling_query; bool *qk_prod_scaling; + bool *position_bias; float scaling_factor; #ifdef INFERENCE_TESTS float *kcache, *vcache; diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index be38b9ab1b..202ff70bc9 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -12,7 +12,7 @@ struct IncMultiHeadSelfAttentionParams { tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling; + qk_prod_scaling, position_bias; DataType quantization_type; bool offload; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index f578249045..4d2002d10b 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -11,6 +11,14 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { +template +__global__ void apply_position_bias_qkprd(DT *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id); + template __global__ void apply_proj_bias_w(DT *input_ptr, DT const *bias_ptr, diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index a36e41a19f..8e0b4f61ff 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -21,6 +21,7 @@ class LayerNorm : public Op { const ParallelTensor _input, std::vector const &axes, bool _elementwise_affine, + bool _use_bias, float _eps, bool allocate_weights, char const *name); @@ -100,7 +101,7 @@ class LayerNorm : public Op { T *beta_grad_ptr); public: - bool elementwise_affine; + bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; std::vector axes; @@ -114,7 +115,7 @@ class LayerNormMeta : public OpMeta { ~LayerNormMeta(void); public: - bool elementwise_affine; + bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; diff --git a/include/flexflow/ops/layer_norm_params.h b/include/flexflow/ops/layer_norm_params.h index 5d06428f4f..509593c285 100644 --- a/include/flexflow/ops/layer_norm_params.h +++ b/include/flexflow/ops/layer_norm_params.h @@ -9,6 +9,7 @@ struct LayerNormParams { std::vector axes; bool elementwise_affine; float eps; + bool use_bias; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index c6364805e3..fa9251c871 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ 
b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -40,6 +40,7 @@ class SpecIncMultiHeadSelfAttention : public Op { bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, @@ -58,6 +59,7 @@ class SpecIncMultiHeadSelfAttention : public Op { bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, @@ -120,7 +122,7 @@ class SpecIncMultiHeadSelfAttention : public Op { float dropout, scaling_factor; bool bias; bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling; + qk_prod_scaling, position_bias; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; }; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index d6f08dd9e6..b59a237e20 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -11,7 +11,7 @@ struct SpecIncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling; + qk_prod_scaling, position_bias; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index d5be344cca..c4d7ae17e9 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -40,6 +40,7 @@ class TreeIncMultiHeadSelfAttention : public Op { bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, DataType _quantization_type, bool _offload, @@ -61,6 +62,7 @@ class TreeIncMultiHeadSelfAttention : public Op { bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, DataType _quantization_type, bool _offload, @@ -123,7 +125,7 @@ class TreeIncMultiHeadSelfAttention : public Op { float dropout, scaling_factor; bool bias; bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling; + qk_prod_scaling, position_bias; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; DataType quantization_type; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index 3ba49dcbad..a897c76162 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -12,7 +12,7 @@ struct TreeIncMultiHeadSelfAttentionParams { tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling; + qk_prod_scaling, position_bias; DataType quantization_type; bool offload; bool is_valid(ParallelTensorShape const &) const; diff --git a/inference/incr_decoding/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt index c3b97d094a..e415835a79 100644 --- a/inference/incr_decoding/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -11,7 +11,8 @@ 
set(CPU_SRC ../models/llama.cc ../models/opt.cc ../models/falcon.cc - ../models/starcoder.cc) + ../models/starcoder.cc + ../models/mpt.cc) if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") cuda_add_executable(${project_target} ${CPU_SRC}) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 10b4744195..19cd8726e2 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -17,6 +17,7 @@ #include "flexflow/request_manager.h" #include "models/falcon.h" #include "models/llama.h" +#include "models/mpt.h" #include "models/opt.h" #include "models/starcoder.h" #include @@ -151,7 +152,6 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - ModelType model_type = ModelType::UNKNOWN; auto architectures = model_config["architectures"]; for (auto const &str : architectures) { @@ -174,10 +174,17 @@ void FlexFlow::top_level_task(Task const *task, } else if (str == "GPTBigCodeForCausalLM") { model_type = ModelType::STARCODER; break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; } } - int bos_token_id = model_config["bos_token_id"]; - int eos_token_id = model_config["eos_token_id"]; + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -215,6 +222,13 @@ void FlexFlow::top_level_task(Task const *task, INC_DECODING_MODE, generationConfig, use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); } else { assert(false && "unknow model type"); } diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc new file mode 100644 index 0000000000..d1ca03a335 --- /dev/null +++ b/inference/models/mpt.cc @@ -0,0 +1,215 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
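For reference, the config-probing logic that incr_decoding.cc gains above can be summarized as follows. The sketch mirrors the C++ control flow in Python and is not part of the patch; MPT configs ship no bos/eos token ids, hence the new -1 fallback.

import json

def detect_model(config_filepath):
    """Map a HuggingFace config.json to a FlexFlow model type (illustrative only)."""
    with open(config_filepath) as f:
        cfg = json.load(f)
    model_type = "UNKNOWN"
    for arch in cfg.get("architectures", []):
        if arch == "GPTBigCodeForCausalLM":
            model_type = "STARCODER"
            break
        elif arch == "MPTForCausalLM":
            model_type = "MPT"
            break
        # ... LLAMA, OPT, FALCON and the remaining cases elided ...
    bos_token_id = cfg.get("bos_token_id", -1)
    eos_token_id = cfg.get("eos_token_id", -1)
    return model_type, bos_token_id, eos_token_id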
+ */ + +#include "mpt.h" + +namespace FlexFlow { + +using namespace Legion; +using json = nlohmann::json; + +void MPT::create_mpt_model(FFModel &ff, + std::string const &model_config_file_path, + std::string const &weight_file_path, + InferenceMode mode, + GenerationConfig generationConfig, + bool use_full_precision) { + MPTConfig mpt_config(model_config_file_path); + mpt_config.print(); + + if (ff.config.tensor_parallelism_degree > mpt_config.n_heads || + mpt_config.n_heads % ff.config.tensor_parallelism_degree != 0) { + assert(false && "The number of attention heads is smaller, or it is not " + "divisible by the tensor parallelism degree"); + } + + std::unordered_map weights_layers; + + //------------------------------ build the model -------------------------- + Tensor input; + { + int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + input = ff.create_tensor<2>(token_dims, DT_INT32); + } + + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + std::vector axes = {0}; + + Tensor hidden_states; + if (use_full_precision) { + hidden_states = ff.embedding(input, + mpt_config.vocab_size, + mpt_config.hidden_size, + AGGR_MODE_NONE, + DT_FLOAT, + NULL, + embed_init); + } else { + hidden_states = ff.embedding(input, + mpt_config.vocab_size, + mpt_config.hidden_size, + AGGR_MODE_NONE, + DT_HALF, + NULL, + embed_init); + } + + Layer *embedding = ff.layers.back(); + weights_layers.emplace("transformer_wte_weight", embedding); + + for (int i = 0; i < mpt_config.n_layers; i++) { + ff.set_transformer_layer_id(i); + + Tensor residual = hidden_states; + + Tensor layernorm_output = + ff.layer_norm(hidden_states, axes, true, 1e-05, false); + Layer *norm_1 = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_norm_1_weight", + norm_1); + + Tensor attn_outputs; + switch (mode) { + case BEAM_SEARCH_MODE: { + attn_outputs = ff.spec_inc_multihead_self_attention( + layernorm_output, + mpt_config.hidden_size, + mpt_config.n_heads, + mpt_config.hidden_size / mpt_config.n_heads, + mpt_config.hidden_size / mpt_config.n_heads, + 0.0f, + false, + false, + false, + DT_NONE, /*data_type*/ + NULL, + false, + /*scaling query*/ true, + /*scaling factor*/ + pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), + /*qk_prod_scaling*/ false, + /*position_bias*/ true); + break; + } + case TREE_VERIFY_MODE: { + attn_outputs = ff.inc_multihead_self_attention_verify( + layernorm_output, + mpt_config.hidden_size, + mpt_config.n_heads, + mpt_config.hidden_size / mpt_config.n_heads, + mpt_config.hidden_size / mpt_config.n_heads, + 0.0f, + false, + false, + false, + DT_NONE, /*data_type*/ + NULL, + false, + /*scaling query*/ true, + /*scaling factor*/ + pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), + /*qk_prod_scaling*/ false, + /*position_bias*/ true); + break; + } + case INC_DECODING_MODE: { + attn_outputs = ff.inc_multihead_self_attention( + layernorm_output, + mpt_config.hidden_size, + mpt_config.n_heads, + mpt_config.hidden_size / mpt_config.n_heads, + mpt_config.hidden_size / mpt_config.n_heads, + 0.0f, + false, + false, + false, + DT_NONE, /*data_type*/ + NULL, + false, + /*scaling query*/ true, + /*scaling factor*/ + pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), + /*qk_prod_scaling*/ false, + /*position_bias*/ true); + break; + } + default: { + assert(false); + } + } + + Layer *attention_layer = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", + attention_layer); + + hidden_states = 
ff.add(attn_outputs, residual); + layernorm_output = ff.layer_norm(hidden_states, axes, true, 1e-05, false); + Layer *norm_2 = ff.layers.back(); + weights_layers.emplace("layers_" + std::to_string(i) + "_norm_2_weight", + norm_2); + + residual = hidden_states; + + // MLP + // output = self.ffn(layernorm_output, residual) + layernorm_output = ff.dense( + layernorm_output, 4 * mpt_config.hidden_size, AC_MODE_NONE, false); + Layer *up_proj = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_ffn_up_proj_weight", up_proj); + layernorm_output = ff.gelu(layernorm_output); + Tensor intermediate_output = + ff.dense(layernorm_output, mpt_config.hidden_size, AC_MODE_NONE, false); + Layer *down_proj = ff.layers.back(); + weights_layers.emplace( + "layers_" + std::to_string(i) + "_ffn_down_proj_weight", down_proj); + + hidden_states = ff.add(intermediate_output, residual); + } + + // final + Tensor all_final_norm = + ff.layer_norm(hidden_states, axes, true, 1e-05, false); + Layer *norm_f = ff.layers.back(); + weights_layers.emplace("transformer_norm_f_weight", norm_f); + + Tensor lm_head = + ff.dense(all_final_norm, mpt_config.vocab_size, AC_MODE_NONE, false); + Layer *lm_head_layer = ff.layers.back(); + weights_layers.emplace("lm_head_weight", lm_head_layer); + + Tensor output; + if (mode == BEAM_SEARCH_MODE) { + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ true); + } else { + output = ff.argmax(lm_head, /*beam_Search*/ false); + } + + //------------------- compile the model -------------------------------- + InferenceManager *im = InferenceManager::get_inference_manager(); + im->compile_model_and_allocate_buffer(&ff); + FileDataLoader fileloader("", + weight_file_path, + mpt_config.n_heads, + mpt_config.n_heads, + mpt_config.hidden_size, + mpt_config.hidden_size / mpt_config.n_heads, + ff.config.tensor_parallelism_degree); + fileloader.load_weights(&ff, weights_layers, use_full_precision); + im->init_operators_inference(&ff); +} + +}; // namespace FlexFlow diff --git a/inference/models/mpt.h b/inference/models/mpt.h new file mode 100644 index 0000000000..437e0cb247 --- /dev/null +++ b/inference/models/mpt.h @@ -0,0 +1,75 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
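Taken together, the graph that create_mpt_model() builds above is the standard MPT decoder block: a bias-free pre-norm, ALiBi self-attention with no QKV/output-projection biases and no rotary embedding, a residual add, a second bias-free pre-norm, and a 4x GELU feed-forward followed by another residual add. A rough PyTorch-style sketch of the same per-layer dataflow follows; it is illustrative only (module names are assumptions, and the ALiBi bias and causal mask inside the attention are omitted for brevity):

import torch
import torch.nn as nn
import torch.nn.functional as F

class MPTBlockSketch(nn.Module):
    # Mirrors the per-layer graph built by create_mpt_model(); note the FlexFlow
    # version passes use_bias=false to ff.layer_norm(), while nn.LayerNorm here
    # keeps its default affine parameters.
    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        self.norm_1 = nn.LayerNorm(d_model, eps=1e-5)
        self.attn = nn.MultiheadAttention(d_model, n_heads, bias=False, batch_first=True)
        self.norm_2 = nn.LayerNorm(d_model, eps=1e-5)
        self.ffn_up_proj = nn.Linear(d_model, 4 * d_model, bias=False)
        self.ffn_down_proj = nn.Linear(4 * d_model, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        h = self.norm_1(x)
        h, _ = self.attn(h, h, h, need_weights=False)
        x = residual + h  # hidden_states = ff.add(attn_outputs, residual)
        residual = x
        h = self.ffn_down_proj(F.gelu(self.ffn_up_proj(self.norm_2(x))))
        return residual + h  # ff.add(intermediate_output, residual)

# quick shape check
block = MPTBlockSketch(d_model=256, n_heads=8)
print(block(torch.randn(2, 16, 256)).shape)  # torch.Size([2, 16, 256])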
+ */ +#pragma once + +#include "file_loader.h" +#include "flexflow/batch_config.h" +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include +#include +using json = nlohmann::json; + +namespace FlexFlow { + +class MPT { +public: + struct MPTConfig { + MPTConfig(std::string const &model_config_file_path) { + std::ifstream config_file(model_config_file_path); + if (config_file.is_open()) { + try { + json model_config; + config_file >> model_config; + hidden_size = model_config["d_model"]; + n_heads = model_config["n_heads"]; + n_layers = model_config["n_layers"]; + vocab_size = model_config["vocab_size"]; + } catch (json::exception const &e) { + std::cerr << "Error parsing JSON file: " << e.what() << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file " << model_config_file_path + << std::endl; + assert(false); + } + max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; + max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + } + + void print() const { + std::cout << "MPT Config:" << std::endl; + std::cout << "\thidden_size: " << hidden_size << std::endl; + std::cout << "\tn_heads: " << n_heads << std::endl; + std::cout << "\tn_layers: " << n_layers << std::endl; + std::cout << "\tvocab_size: " << vocab_size << std::endl; + } + + int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + int hidden_size, n_heads, n_layers, vocab_size; + }; + + static void create_mpt_model(FFModel &ff, + std::string const &model_config_file_path, + std::string const &weight_file_path, + InferenceMode mode, + GenerationConfig generationConfig, + bool use_full_precision = false); +}; + +}; // namespace FlexFlow diff --git a/inference/spec_infer/CMakeLists.txt b/inference/spec_infer/CMakeLists.txt index 3d6b48b802..26d5bd1894 100644 --- a/inference/spec_infer/CMakeLists.txt +++ b/inference/spec_infer/CMakeLists.txt @@ -10,7 +10,8 @@ set(CPU_SRC ../file_loader.cc ../models/llama.cc ../models/opt.cc - ../models/falcon.cc) + ../models/falcon.cc + ../models/mpt.cc) if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") cuda_add_executable(${project_target} ${CPU_SRC}) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 16eab8d077..9d139997f7 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -16,6 +16,7 @@ #include "flexflow/inference.h" #include "models/falcon.h" #include "models/llama.h" +#include "models/mpt.h" #include "models/opt.h" #include #include @@ -165,10 +166,19 @@ void get_model_meta(FilePaths &file_paths, } else if (str == "RWForCausalLM") { model_metadata.llm_model_type = ModelType::FALCON; break; + } else if (str == "MPTForCausalLM") { + model_metadata.llm_model_type = ModelType::MPT; + break; } } - model_metadata.bos_token_id = llm_model_config["bos_token_id"]; - model_metadata.eos_token_id = llm_model_config["eos_token_id"]; + model_metadata.bos_token_id = + llm_model_config.find("bos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("bos_token_id"); + model_metadata.eos_token_id = + llm_model_config.find("eos_token_id") == llm_model_config.end() + ? 
-1 + : (int)llm_model_config.at("eos_token_id"); for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { std::string ssm_config_path = join_path({file_paths.cache_folder_path, @@ -213,10 +223,21 @@ void get_model_meta(FilePaths &file_paths, } else if (str == "RWForCausalLM") { ssm_model_type = ModelType::FALCON; break; + } else if (str == "MPTForCausalLM") { + ssm_model_type = ModelType::MPT; + break; } } - if (ssm_model_config["bos_token_id"] != model_metadata.bos_token_id || - ssm_model_config["eos_token_id"] != model_metadata.eos_token_id) { + int ssm_bos_id = + ssm_model_config.find("bos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("bos_token_id"); + int ssm_eos_id = + ssm_model_config.find("eos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("eos_token_id"); + if (ssm_bos_id != model_metadata.bos_token_id || + ssm_eos_id != model_metadata.eos_token_id) { printf("Warning: bos/eos token id mismatch between LLM and one of the " "SSMs!\n"); } @@ -293,6 +314,13 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_weights_path, TREE_VERIFY_MODE, use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::MPT) { + MPT::create_mpt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + generationConfig, + use_full_precision); } else { assert(false && "Invalid LLM model type passed (or no type was passed)."); } @@ -332,6 +360,13 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.ssm_model_weights_paths[ssm_id], BEAM_SEARCH_MODE, use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { + MPT::create_mpt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + generationConfig, + use_full_precision); } else { assert(false && "Invalid SSM model type passed."); } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 2d71dd18b3..5d9480280e 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1567,10 +1567,10 @@ def batch_norm(self, input, relu=True, name=None): self.add_layer(OpType.BATCH_NORM, name) return Tensor(handle, owner_op_type=OpType.BATCH_NORM) - def layer_norm(self, input, axes, elementwise_affine=True, eps=1e-5, name=None): + def layer_norm(self, input, axes, elementwise_affine=True, eps=1e-5, use_bias = True, name=None): c_name = get_c_name(name) c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_layer_norm(self.handle, input.handle, len(axes), c_axes, elementwise_affine, eps, c_name) + handle = ffc().flexflow_model_add_layer_norm(self.handle, input.handle, len(axes), c_axes, elementwise_affine, eps, use_bias, c_name) self.add_layer(OpType.LAYER_NORM, name) return Tensor(handle, owner_op_type=OpType.LAYER_NORM) @@ -1592,6 +1592,9 @@ def batch_matmul(self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name= :param name: the name of the layer. Default is None. :type name: string + :param name: Whether to add use bias in layer normalization + :type name: bool + :returns: Tensor -- the output tensor. 
""" if a_seq_length_dim is None: @@ -2109,7 +2112,7 @@ def inc_multihead_self_attention(self, input, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, name=None): + qk_prod_scaling=True, position_bias=False, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. In inference mode, the attention is computed using incremental decoding. @@ -2158,6 +2161,9 @@ def inc_multihead_self_attention(self, input, :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool :param name: the name of the layer. Default is None. :type name: string @@ -2167,7 +2173,7 @@ def inc_multihead_self_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) @@ -2177,7 +2183,7 @@ def spec_inc_multihead_self_attention(self, input, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, name=None): + qk_prod_scaling=True, position_bias=False, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. This operator only supports computing the attention in inference (beam search) mode. @@ -2226,6 +2232,9 @@ def spec_inc_multihead_self_attention(self, input, :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool :param name: the name of the layer. Default is None. 
:type name: string @@ -2235,7 +2244,7 @@ def spec_inc_multihead_self_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) @@ -2245,7 +2254,7 @@ def inc_multihead_self_attention_verify(self, input, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, name=None): + qk_prod_scaling=True, position_bias=False, name=None): """Defines the MultiHead Attention operation as described in Attention Is All You Need which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. This operator only supports computing the attention in inference (tree verify) mode. @@ -2294,6 +2303,9 @@ def inc_multihead_self_attention_verify(self, input, :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool :param name: the name of the layer. Default is None. :type name: string @@ -2303,7 +2315,7 @@ def inc_multihead_self_attention_verify(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) @@ -2313,7 +2325,7 @@ def inc_multiquery_self_attention(self, input, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, name=None): + qk_prod_scaling=True, position_bias=False, name=None): """Defines the multi-query head attention, which allows a different number of Q and KV heads, and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. In inference mode, the attention is computed using incremental decoding. 
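Both new Python-level knobs introduced by this patch default to the old behaviour (use_bias=True for layer_norm, position_bias=False for every attention variant), so existing models are unaffected; MPT is wired up by flipping them. A minimal usage sketch against the bindings above, assuming the script runs under an already-initialized FlexFlow runtime and using placeholder dimensions (4096 hidden size, 32 heads):

from flexflow.core import *  # FFConfig, FFModel, DataType, as used elsewhere in this patch

ffconfig = FFConfig()
ffmodel = FFModel(ffconfig)
x = ffmodel.create_tensor([64, 4096], DataType.DT_FLOAT)

# Bias-free LayerNorm: the fifth positional argument is the new use_bias flag.
ln_out = ffmodel.layer_norm(x, [0], True, 1e-5, False, name="norm_no_bias")

# Incremental-decoding attention with ALiBi: qk_prod_scaling is turned off and
# the 1/sqrt(head_dim) factor is applied to the queries instead.
attn_out = ffmodel.inc_multihead_self_attention(
    ln_out,
    4096,                 # embed_dim
    32,                   # num_heads
    128,                  # kdim
    128,                  # vdim
    0.0,                  # dropout
    False,                # bias
    False,                # add_bias_kv
    False,                # add_zero_attn
    DataType.DT_NONE,     # data_type
    None,                 # kernel_initializer
    False,                # apply_rotary_embedding
    True,                 # scaling_query
    (4096 / 32) ** -0.5,  # scaling_factor
    False,                # qk_prod_scaling
    True,                 # position_bias (new in this patch)
    name="attn_with_alibi",
)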
@@ -2365,6 +2377,9 @@ def inc_multiquery_self_attention(self, input, :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool :param name: the name of the layer. Default is None. :type name: string @@ -2374,7 +2389,7 @@ def inc_multiquery_self_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) @@ -2384,7 +2399,7 @@ def spec_inc_multiquery_self_attention(self, input, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, name=None): + qk_prod_scaling=True, position_bias=False, name=None): """Defines the multi-query head attention, which allows a different number of Q and KV heads, and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. This operator only supports computing the attention in inference (beam search) mode. @@ -2436,6 +2451,9 @@ def spec_inc_multiquery_self_attention(self, input, :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool :param name: the name of the layer. Default is None. 
:type name: string @@ -2445,7 +2463,7 @@ def spec_inc_multiquery_self_attention(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) @@ -2455,7 +2473,7 @@ def inc_multiquery_self_attention_verify(self, input, bias=True, add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, name=None): + qk_prod_scaling=True, position_bias=False, name=None): """Defines the multi-query head attention, which allows a different number of Q and KV heads, and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. This operator only supports computing the attention in inference (tree verify) mode. @@ -2507,6 +2525,9 @@ def inc_multiquery_self_attention_verify(self, input, :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool :param name: the name of the layer. Default is None. 
:type name: string @@ -2516,7 +2537,7 @@ def inc_multiquery_self_attention_verify(self, input, c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, c_name) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) diff --git a/python/flexflow/serve/models/__init__.py b/python/flexflow/serve/models/__init__.py index 6b405b2f99..a1ca9152ce 100644 --- a/python/flexflow/serve/models/__init__.py +++ b/python/flexflow/serve/models/__init__.py @@ -16,3 +16,4 @@ from .opt import FlexFlowOPT from .falcon import FlexFlowFalcon from .starcoder import FlexFlowSTARCODER +from .mpt import FlexFlowMPT diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py new file mode 100644 index 0000000000..a5a0c7da18 --- /dev/null +++ b/python/flexflow/serve/models/mpt.py @@ -0,0 +1,290 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
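The model file below is what enables that position_bias path: MPT uses ALiBi (Press et al., arXiv:2108.12409) instead of rotary or learned position embeddings, so every attention call passes apply_rotary_embedding=False, qk_prod_scaling=False, scaling_query=True with scaling_factor=head_dim**-0.5, and position_bias=True, and the apply_position_bias_qkprd kernels added later in this patch add a fixed per-head linear penalty to the query-key scores before the softmax. A small NumPy recomputation of the bias those kernels add is sketched here (alibi_bias_max=8 as in the MPT config; the function name and array shapes are illustrative, not part of the patch):

import numpy as np

def alibi_bias(num_new_tokens, total_tokens, n_heads, alibi_bias_max=8.0):
    # Per-head slopes: 1 / 2**((head_idx + 1) * alibi_bias_max / n_heads),
    # matching apply_position_bias_qkprd.
    slopes = 1.0 / 2.0 ** (np.arange(1, n_heads + 1) * alibi_bias_max / n_heads)
    # Relative position of each attended (key) token: ..., -2, -1, 0.
    rel_pos = np.arange(total_tokens) + 1 - total_tokens
    # bias[h, q, k] is added to the query-key score of head h.
    return np.broadcast_to(slopes[:, None, None] * rel_pos[None, None, :],
                           (n_heads, num_new_tokens, total_tokens))

bias = alibi_bias(num_new_tokens=1, total_tokens=8, n_heads=4)
print(bias.shape)      # (4, 1, 8)
print(bias[0, 0, :3])  # most negative penalty for the oldest tokens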
+ +from flexflow.core import * +from .base import FlexFlowModel +import random, torch, shutil + + +class MPTConfig: + def __init__(self, hf_config): + self.max_seq_len = 256 + self.max_num_tokens = 64 + self.max_beam_width = 1 + self.max_beam_depth = 8 + self.hidden_size = hf_config.d_model + self.n_heads = hf_config.n_heads + self.n_layers = hf_config.n_layers + self.vocab_size = hf_config.vocab_size + hf_config.num_attention_heads = hf_config.n_heads + hf_config.hidden_size = hf_config.d_model + +class FlexFlowMPT(FlexFlowModel): + def __init__( + self, + mode, + generation_config, + ffconfig, + hf_config, + data_type, + max_batch_size=1, + max_seq_length=256, + max_tokens_per_batch=64, + weights_filepath="", + tokenizer_filepath="", + ): + self.mode = mode + self.generation_config = generation_config + self.ffconfig = ffconfig + self.max_batch_size = max_batch_size + self.data_type = data_type + self.mpt_config = MPTConfig(hf_config) + self.mpt_config.max_seq_length = max_seq_length + self.mpt_config.max_num_tokens = max_tokens_per_batch + self.weights_filepath = weights_filepath + self.tokenizer_filepath = tokenizer_filepath + self.maxint = 2**31 - 1 + + # Sanity checks + if ( + self.mpt_config.hidden_size + % self.mpt_config.n_heads + != 0 + ): + raise ValueError( + f"Hidden size ({self.mpt_config.hidden_size}) is not divisible by n_head ({self.mpt_config.n_heads})" + ) + + # Sanity checks + if ( + self.mpt_config.n_heads + < self.ffconfig.tensor_parallelism_degree + or self.mpt_config.n_heads + % self.ffconfig.tensor_parallelism_degree + != 0 + ): + raise ValueError( + f"Number of attention heads ({self.mpt_config.n_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" + ) + self.build_model() + + def build_model(self): + ffmodel = FFModel(self.ffconfig) + + tokens_dims = [self.mpt_config.max_num_tokens, 1] + input = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) + + embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) + hidden_states = ffmodel.embedding( + input, + self.mpt_config.vocab_size, + self.mpt_config.hidden_size, + AggrMode.AGGR_MODE_NONE, + self.data_type, + None, + embed_init, + name="transformer_wte_weight", + ) + + axes = [ + 0, + ] + + for i in range(self.mpt_config.n_layers): + ffmodel.set_transformer_layer_id(i) + residual = hidden_states + layernorm_output = ffmodel.layer_norm( + hidden_states, + axes, + True, + 1e-05, + False, + name=f"layers_{i}_norm_1_weight", + ) + + if self.mode == InferenceMode.BEAM_SEARCH_MODE: + attn_outputs = ffmodel.spec_inc_multihead_self_attention( + layernorm_output, + self.mpt_config.hidden_size, + self.mpt_config.n_heads, + self.mpt_config.hidden_size // self.mpt_config.n_heads, + self.mpt_config.hidden_size // self.mpt_config.n_heads, + 0.0, # dropout + False, # bias + False, # add_bias_kv + False, # add_zero_attn + DataType.DT_NONE, # data_type + None, # kernel initializer + False, # apply_rotary_embedding + True, # scaling_query + (self.mpt_config.hidden_size / self.mpt_config.n_heads) + ** (-0.5), # scaling_factor + False, # qk_prod_scaling + True, # qk_prod_scaling + name=f"layers_{i}_attention_weight", + ) + elif self.mode == InferenceMode.TREE_VERIFY_MODE: + attn_outputs = ffmodel.inc_multihead_self_attention_verify( + layernorm_output, + self.mpt_config.hidden_size, + self.mpt_config.n_heads, + self.mpt_config.hidden_size // self.mpt_config.n_heads, + self.mpt_config.hidden_size // self.mpt_config.n_heads, + 0.0, # dropout + False, # bias + 
False, # add_bias_kv + False, # add_zero_attn + DataType.DT_NONE, # data_type + None, # kernel initializer + False, # apply_rotary_embedding + True, # scaling_query + (self.mpt_config.hidden_size / self.mpt_config.n_heads) + ** (-0.5), # scaling_factor + False, # qk_prod_scaling + True, # qk_prod_scaling + name=f"layers_{i}_attention_weight", + ) + elif self.mode == InferenceMode.INC_DECODING_MODE: + attn_outputs = ffmodel.inc_multihead_self_attention( + layernorm_output, + self.mpt_config.hidden_size, + self.mpt_config.n_heads, + self.mpt_config.hidden_size // self.mpt_config.n_heads, + self.mpt_config.hidden_size // self.mpt_config.n_heads, + 0.0, # dropout + False, # bias + False, # add_bias_kv + False, # add_zero_attn + DataType.DT_NONE, # data_type + None, # kernel initializer + False, # apply_rotary_embedding + True, # scaling_query + (self.mpt_config.hidden_size / self.mpt_config.n_heads) + ** (-0.5), # scaling_factor + False, # qk_prod_scaling + True, # qk_prod_scaling + name=f"layers_{i}_attention_weight", + ) + else: + assert False + + hidden_states = ffmodel.add(attn_outputs, residual) + + layernorm_output = ffmodel.layer_norm( + hidden_states, + axes, + True, + 1e-05, + False, + name=f"layers_{i}_norm_2_weight", + ) + residual = hidden_states + # mlp + + layernorm_output = ffmodel.dense( + layernorm_output, + 4 * self.mpt_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers_{i}_ffn_up_proj_weight", + ) + layernorm_output = ffmodel.gelu(layernorm_output) + intermediate_output = ffmodel.dense( + layernorm_output, + self.mpt_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers_{i}_ffn_down_proj_weight", + ) + hidden_states = ffmodel.add(intermediate_output, residual) + + all_final_norm = ffmodel.layer_norm( + hidden_states, + axes, + True, + 1e-05, + False, + name=f"transformer_norm_f_weight", + ) + lm_head = ffmodel.dense( + all_final_norm, + self.mpt_config.vocab_size, + ActiMode.AC_MODE_NONE, + False, + name="lm_head_weight", + ) + + if self.generation_config.do_sample: + dense = ffmodel.scalar_true_divide( + lm_head, self.generation_config.temperature, False + ) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.sampling(softmax, self.generation_config.topp) + else: + output = ffmodel.argmax(lm_head, False) + + self.ffmodel = ffmodel + + def convert_hf_model(model, dst_folder): + os.makedirs(dst_folder, exist_ok=True) + for name, params in model.named_parameters(): + name = name.replace("transformer.blocks.", "layers.").replace(".", "_") + if 'Wqkv' in name: + name_q = name.replace("attn_Wqkv", "attention_wq") + name_k = name.replace("attn_Wqkv", "attention_wk") + name_v = name.replace("attn_Wqkv", "attention_wv") + q, k, v = torch.split( + params, + [ + model.config.d_model, + model.config.d_model, + model.config.d_model, + ], + 0, + ) + q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) + k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) + v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) + elif 'out_proj' in name: + name = name.replace("attn_out_proj", "attention_wo") + params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) + else: + params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) + + shutil.copy( + os.path.join(dst_folder, "transformer_wte_weight"), + os.path.join(dst_folder, "lm_head_weight"), + ) + def get_layers_with_weights(self): + layer_names = [ + "transformer_wte_weight", + "transformer_norm_f_weight", + "lm_head_weight", + ] + [ + expr + for i 
in range(self.mpt_config.n_layers) + for expr in ( + f"layers_{i}_norm_1_weight", + f"layers_{i}_attention_weight", + f"layers_{i}_norm_2_weight", + f"layers_{i}_ffn_up_proj_weight", + f"layers_{i}_ffn_down_proj_weight", + ) + ] + layers_with_weights = { + layer_name: self.ffmodel.get_layer_by_name(layer_name) + for layer_name in layer_names + } + + return layers_with_weights diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index dea21389d1..58f7221082 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -17,6 +17,7 @@ FlexFlowOPT, FlexFlowFalcon, FlexFlowSTARCODER, + FlexFlowMPT, ) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer @@ -87,6 +88,7 @@ def __init__( "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT), "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon), "GPTBigCodeForCausalLM": (ModelType.STARCODER, FlexFlowSTARCODER), + "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT), } self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path @@ -358,7 +360,9 @@ def compile( # Create request manager self.rm = RequestManager() - self.rm.register_tokenizer(self.model_type, self.hf_config.bos_token_id, self.hf_config.eos_token_id, self.tokenizer_path) + bos_token_id = -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id + eos_token_id = -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id + self.rm.register_tokenizer(self.model_type, bos_token_id, eos_token_id, self.tokenizer_path) self.rm.register_output_filepath(self.output_file) self.im.init_operators_inference(self.model.ffmodel) diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 5232ddd431..d7cc145fde 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -79,6 +79,7 @@ class ModelType(Enum): OPT = 3004 FALCON = 3005 STARCODER = 3006 + MPT = 3007 class OpType(Enum): diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 2ddb65fc9d..0c1fad17df 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -613,6 +613,7 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, int *axes, bool elementwise_affine, float eps, + bool use_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -620,8 +621,13 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); } - Tensor tensor = handle->layer_norm( - input, axes_vec, elementwise_affine, eps, input->data_type, name); + Tensor tensor = handle->layer_norm(input, + axes_vec, + elementwise_affine, + eps, + use_bias, + input->data_type, + name); DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, elementwise_affine %d, eps " "%f, name %s", tensor, @@ -1071,6 +1077,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1091,6 +1098,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); return FFCObjectWrapper::wrap(tensor); } @@ -1112,6 +1120,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( bool scaling_query, float 
scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1133,6 +1142,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); return FFCObjectWrapper::wrap(tensor); } @@ -1154,6 +1164,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1175,6 +1186,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); return FFCObjectWrapper::wrap(tensor); } @@ -1197,6 +1209,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1218,6 +1231,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); return FFCObjectWrapper::wrap(tensor); } @@ -1240,6 +1254,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1262,6 +1277,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); return FFCObjectWrapper::wrap(tensor); } @@ -1284,6 +1300,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1306,6 +1323,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); return FFCObjectWrapper::wrap(tensor); } diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 99c5bc2631..9824e8469d 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -799,11 +799,15 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == 2 * (int)(m->elementwise_affine)); + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } GenericTensorAccessorR gamma, beta; if (m->elementwise_affine) { gamma = my_weight_accessor[0]; - beta = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } } LayerNorm::forward_kernel_wrapper( m, my_input_accessor[0], my_output_accessor[0], gamma, beta); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index b834073064..5cecbd168e 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -893,11 +893,15 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == 2 * 
(int)(m->elementwise_affine)); + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } GenericTensorAccessorR gamma, beta; if (m->elementwise_affine) { gamma = my_weight_accessor[0]; - beta = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } } LayerNorm::forward_kernel_wrapper( m, my_input_accessor[0], my_output_accessor[0], gamma, beta); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index ec8bc8839e..2c7518bae9 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -73,6 +73,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { return inc_multiquery_self_attention(input, embed_dim, @@ -90,6 +91,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); } @@ -109,6 +111,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; @@ -201,6 +204,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); li->add_int_property("tensor_parallelism_degree", @@ -241,6 +245,9 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( layer->get_float_property("scaling_factor", scaling_factor); layer->get_int_property("qk_prod_scaling", value); bool qk_prod_scaling = (bool)value; + layer->get_int_property("position_bias", value); + bool position_bias = (bool)value; + layer->get_int_property("quantization_type", value); DataType quantization_type = (DataType)value; layer->get_int_property("offload", value); @@ -264,6 +271,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, false /*allocate_weights*/, quantization_type, offload, @@ -288,6 +296,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, DataType _quantization_type, bool _offload, @@ -310,8 +319,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), - offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) { + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), + quantization_type(_quantization_type), offload(_offload), + tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; numOutputs = 1; @@ -398,6 +408,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, DataType _quantization_type, bool _offload, @@ -421,8 +432,9 @@ 
IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), - offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), + quantization_type(_quantization_type), offload(_offload), + tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -515,6 +527,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.scaling_query, other.scaling_factor, other.qk_prod_scaling, + other.position_bias, allocate_weights, other.quantization_type, other.offload, @@ -543,6 +556,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.scaling_query, params.scaling_factor, params.qk_prod_scaling, + params.position_bias, allocate_weights, params.quantization_type, params.offload, @@ -1634,7 +1648,8 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs, lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && - lhs.qk_prod_scaling == rhs.qk_prod_scaling; + lhs.qk_prod_scaling == rhs.qk_prod_scaling && + lhs.position_bias == rhs.position_bias; } IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { @@ -1652,6 +1667,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; + params.position_bias = this->position_bias, params.tensor_parallelism_degree = this->tensor_parallelism_degree, params.quantization_type = this->quantization_type; params.offload = this->offload; @@ -1680,6 +1696,7 @@ size_t hash<FlexFlow::IncMultiHeadSelfAttentionParams>::operator()( hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); + hash_combine(key, params.position_bias); hash_combine(key, params.quantization_type); hash_combine(key, params.offload); hash_combine(key, params.tensor_parallelism_degree); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 75e68a7332..a08114fec9 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -30,6 +30,31 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { +// only used by MPT model. https://arxiv.org/abs/2108.12409 +template <typename DT> +__global__ void apply_position_bias_qkprd(DT *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id) { + CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { + // get head_idx, + int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); + int position_idx = (i / num_tokens) % num_total_tokens; + position_idx = position_idx + 1 - num_total_tokens; + // 8 is alibi_bias_max in + // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json + float base = (float)(head_idx + 1) * 8 / global_num_q_heads; + float slopes = 1.0 / pow(2, base); + // if(i == 0){ + // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, + // position_idx * slopes); + // } + input_ptr[i] += static_cast<DT>
(position_idx * slopes); + } +} + template <typename DT> __global__ void apply_proj_bias_w(DT *input_ptr, DT const *bias_ptr, @@ -104,6 +129,16 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, } } } +template <typename DT> +__global__ void scaling_query_kernel(DT *input_ptr, + int qProjSize, + int num_tokens, + int num_q_heads, + float scaling_factor) { + CUDA_KERNEL_LOOP(i, num_tokens * (qProjSize * num_q_heads)) { + input_ptr[i] *= scaling_factor; + } +} template <typename DT> __global__ void @@ -332,6 +367,17 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_kv_heads, *m->scaling_query, m->scaling_factor); + } else if (m->scaling_query) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel<DT>
), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + num_tokens, + m->num_q_heads, + m->qProjSize, + m->scaling_factor); } if (*m->apply_rotary_embedding) { /*q&k*/ @@ -625,6 +671,21 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, HIPBLAS_GEMM_DEFAULT)); } } + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd<DT>
), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens); @@ -903,6 +964,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->bias, attn->scaling_query, attn->qk_prod_scaling, + attn->position_bias, attn->add_bias_kv, attn->scaling_factor, weight, @@ -930,6 +992,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bool _bias, bool _scaling_query, bool _qk_prod_scaling, + bool _position_bias, bool _add_bias_kv, float _scaling_factor, GenericTensorAccessorR const &weight, @@ -987,6 +1050,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( scaling_factor = _scaling_factor; qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); *qk_prod_scaling = _qk_prod_scaling; + position_bias = (bool *)calloc(1, sizeof(bool)); + *position_bias = _position_bias; // Currently do not support adding bias to key/value projection assert(!_add_bias_kv); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 37223e11c9..f0e6d9df1d 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -30,6 +30,31 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { +// only used by MPT model. https://arxiv.org/abs/2108.12409 +template <typename DT> +__global__ void apply_position_bias_qkprd(DT *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id) { + CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { + // get head_idx, + int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); + int position_idx = (i / num_tokens) % num_total_tokens; + position_idx = position_idx + 1 - num_total_tokens; + // 8 is alibi_bias_max in + // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json + float base = (float)(head_idx + 1) * 8 / global_num_q_heads; + float slopes = 1.0 / pow(2, base); + // if(i == 0){ + // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, + // position_idx * slopes); + // } + input_ptr[i] += static_cast<DT>
(position_idx * slopes); + } +} + template <typename DT> __global__ void apply_proj_bias_w(DT *input_ptr, DT const *bias_ptr, @@ -113,6 +138,17 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, } } +template <typename DT> +__global__ void scaling_query_kernel(DT *input_ptr, + int qProjSize, + int num_tokens, + int num_q_heads, + float scaling_factor) { + CUDA_KERNEL_LOOP(i, num_tokens * (qProjSize * num_q_heads)) { + input_ptr[i] *= scaling_factor; + } +} + template <typename DT> __global__ void apply_rotary_embedding_native(DT *input_ptr, @@ -279,7 +315,6 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_kv_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor int num_tokens = bc->num_active_tokens(); @@ -305,6 +340,15 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_kv_heads, *m->scaling_query, m->scaling_factor); + } else if (m->scaling_query) { + scaling_query_kernel<<<GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream>>>(output_ptr, + num_tokens, + m->num_q_heads, + m->qProjSize, + m->scaling_factor); } if (*m->apply_rotary_embedding) { /*q&k*/ @@ -630,6 +674,20 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<<GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream>>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens); @@ -906,6 +964,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->bias, attn->scaling_query, attn->qk_prod_scaling, + attn->position_bias, attn->add_bias_kv, attn->scaling_factor, weight, @@ -933,6 +992,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bool _bias, bool _scaling_query, bool _qk_prod_scaling, + bool _position_bias, bool _add_bias_kv, float _scaling_factor, GenericTensorAccessorR const &weight, @@ -990,6 +1050,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( scaling_factor = _scaling_factor; qk_prod_scaling = (bool *)calloc(1, sizeof(bool)); *qk_prod_scaling = _qk_prod_scaling; + position_bias = (bool *)calloc(1, sizeof(bool)); + *position_bias = _position_bias; // Currently do not support adding bias to key/value projection assert(!_add_bias_kv); diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 2dca38578f..09a5e40851 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -41,7 +41,8 @@ using Legion::TaskLauncher; bool operator==(LayerNormParams const &lhs, LayerNormParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && - lhs.elementwise_affine == rhs.elementwise_affine; + lhs.elementwise_affine == rhs.elementwise_affine && + lhs.use_bias == rhs.use_bias; } bool LayerNormParams::is_valid(ParallelTensorShape const &input) const { @@ -54,6 +55,7 @@ LayerNormParams LayerNorm::get_params() const { params.axes = this->axes; params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; + params.use_bias = this->use_bias; return params; } @@ -61,6 +63,7 @@ Tensor FFModel::layer_norm(const Tensor input, std::vector<int> const &axes, bool elementwise_affine, float eps, + bool use_bias, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -97,7 +100,7 @@ Tensor FFModel::layer_norm(const 
Tensor input, if (data_type == DT_NONE) { data_type = input->data_type; } - int num_weights = elementwise_affine ? 2 : 0; + int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; Layer *ln = nullptr; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for layer_norm"); @@ -126,7 +129,7 @@ Tensor FFModel::layer_norm(const Tensor input, ln, 0, true /*create_grad*/); - if (num_weights == 2) { + if (num_weights > 0) { int numdims = axes.size(); int dims[numdims]; for (int i = 0; i < numdims; i++) { @@ -139,15 +142,18 @@ Tensor FFModel::layer_norm(const Tensor input, true /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(numdims, - dims, - input->data_type, - ln, - true /*create_grad*/, - nullptr, - CHOSEN_SYNC_TYPE); + if (num_weights == 2) { + ln->weights[1] = create_weight_legion_ordering(numdims, + dims, + input->data_type, + ln, + true /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + } } ln->add_int_property("elementwise_affine", elementwise_affine); + ln->add_int_property("use_bias", use_bias); ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); layers.push_back(ln); @@ -161,6 +167,8 @@ Op *LayerNorm::create_operator_from_layer( long long value; layer->get_int_property("elementwise_affine", value); bool elementwise_affine = (bool)value; + layer->get_int_property("use_bias", value); + bool use_bias = (bool)value; std::vector axes; layer->get_int_vector_property("axes", axes); float eps; @@ -170,6 +178,7 @@ Op *LayerNorm::create_operator_from_layer( inputs[0], axes, elementwise_affine, + use_bias, eps, false, // allocate_weights layer->name); @@ -185,6 +194,7 @@ LayerNorm::LayerNorm(FFModel &model, input, params.axes, params.elementwise_affine, + params.use_bias, params.eps, allocate_weights, name) {} @@ -194,6 +204,7 @@ LayerNorm::LayerNorm(FFModel &model, const ParallelTensor _input, std::vector const &_axes, bool _elementwise_affine, + bool _use_bias, float _eps, bool allocate_weights, char const *name) @@ -202,10 +213,11 @@ LayerNorm::LayerNorm(FFModel &model, _input->data_type, name, 1 /*inputs*/, - _elementwise_affine ? 2 : 0 /*weights*/, + _elementwise_affine ? (_use_bias ? 
2 : 1) : 0 /*weights*/, 1 /*outputs*/, _input), - elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes) { + elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), + use_bias(_use_bias) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -224,7 +236,7 @@ LayerNorm::LayerNorm(FFModel &model, } effective_num_elements = M; effective_batch_size = (inputs[0]->get_volume() / num_replicas) / M; - assert(elementwise_affine == (numWeights == 2)); + assert(use_bias == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { ParallelTensorShape beta_gamma_shape = _input->get_shape(); for (int i = axes.size(); i < beta_gamma_shape.num_dims - 1; i++) { @@ -291,12 +303,15 @@ void LayerNorm::init_inference(FFModel const &ff, EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); + + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); @@ -337,12 +352,14 @@ void LayerNorm::init(FFModel const &ff) { EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } } FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); @@ -397,12 +414,15 @@ void LayerNorm::forward(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); + if (use_bias) { + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + } + launcher.add_field(3, FID_DATA); } runtime->execute_index_space(ctx, launcher); @@ -449,12 +469,15 @@ FutureMap LayerNorm::inference(FFModel const &ff, EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); + + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } } return runtime->execute_index_space(ctx, launcher); } @@ -492,20 +515,23 @@ void LayerNorm::forward_task(Task const *task, assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); if (m->elementwise_affine) { - assert(regions.size() == 4); + assert(m->use_bias == (regions.size() == 4)); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - // gamma_ptr = helperGetTensorPointerRW( - // regions[2], task->regions[2], FID_DATA, ctx, runtime); gamma = 
helperGetGenericTensorAccessorRO( m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - Domain beta_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - // beta_ptr = helperGetTensorPointerRW( - // regions[3], task->regions[3], FID_DATA, ctx, runtime); - beta = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); - assert(gamma_domain == beta_domain); + if (m->use_bias) { + Domain beta_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + beta = helperGetGenericTensorAccessorRO(m->input_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + assert(gamma_domain == beta_domain); + } + assert(gamma_domain.get_volume() == m->effective_num_elements); int numdims = gamma_domain.get_dim(); size_t vol = 1; @@ -573,12 +599,15 @@ void LayerNorm::backward(FFModel const &ff) { weights[0]->region_grad)); launcher.add_field(4, FID_DATA); // regions[5](I/O): beta_grad - launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[1]->region_grad)); - launcher.add_field(5, FID_DATA); + if (use_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(5, FID_DATA); + } } runtime->execute_index_space(ctx, launcher); } @@ -615,7 +644,7 @@ void LayerNorm::backward_task(Task const *task, assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); if (m->elementwise_affine) { - assert(regions.size() == 6); + assert(m->use_bias == (regions.size() == 6)); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); gamma_ptr = helperGetTensorPointerRO( @@ -624,12 +653,16 @@ void LayerNorm::backward_task(Task const *task, ctx, task->regions[4].region.get_index_space()); gamma_grad_ptr = helperGetTensorPointerRW( regions[4], task->regions[4], FID_DATA, ctx, runtime); - Domain beta_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); - beta_grad_ptr = helperGetTensorPointerRW( - regions[5], task->regions[5], FID_DATA, ctx, runtime); + if (m->use_bias) { + Domain beta_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[5].region.get_index_space()); + beta_grad_ptr = helperGetTensorPointerRW( + regions[5], task->regions[5], FID_DATA, ctx, runtime); + assert(gamma_domain == beta_grad_domain); + } + assert(gamma_domain == gamma_grad_domain); - assert(gamma_domain == beta_grad_domain); + assert(gamma_domain.get_volume() == m->effective_num_elements); } else { assert(regions.size() == 3); @@ -752,6 +785,7 @@ void LayerNorm::serialize(Legion::Serializer &sez) const { } sez.serialize(this->elementwise_affine); sez.serialize(this->eps); + sez.serialize(this->use_bias); } using PCG::Node; @@ -764,6 +798,7 @@ Node LayerNorm::deserialize(FFModel &ff, size_t num_axes; std::vector axes; bool elementwise_affine; + bool use_bias; float eps; size_t id, transformer_layer_id; dez.deserialize(id); @@ -777,12 +812,14 @@ Node LayerNorm::deserialize(FFModel &ff, } dez.deserialize(elementwise_affine); dez.deserialize(eps); + dez.deserialize(use_bias); LayerNormParams params; params.layer_guid = layer_guid; params.axes = axes; params.elementwise_affine = elementwise_affine; params.eps = eps; + params.use_bias = use_bias; return 
ff.get_or_create_node(inputs[0], params); } @@ -805,6 +842,7 @@ size_t hash::operator()( hash_combine(key, n); } hash_combine(key, params.elementwise_affine); + hash_combine(key, params.use_bias); return key; } }; // namespace std diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index ddbf96a493..d97c2f62ff 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -31,6 +31,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, elementwise_affine = ln->elementwise_affine; effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; + use_bias = ln->use_bias; eps = ln->eps; checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); @@ -171,14 +172,15 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, input.get_float_ptr(), output.get_float_ptr(), gamma.get_float_ptr(), - beta.get_float_ptr(), + m->use_bias ? beta.get_float_ptr() + : nullptr, stream); } else if (m->input_type[0] == DT_HALF) { LayerNorm::forward_kernel(m, input.get_half_ptr(), output.get_half_ptr(), gamma.get_half_ptr(), - beta.get_half_ptr(), + m->use_bias ? beta.get_half_ptr() : nullptr, stream); } else { assert(false && "unsupport datatype in layernorm"); diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index f594f8f7a8..97b5094a21 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -29,6 +29,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, MemoryAllocator &gpu_mem_allocator) : OpMeta(handle) { elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; @@ -186,14 +187,15 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, input.get_float_ptr(), output.get_float_ptr(), gamma.get_float_ptr(), - beta.get_float_ptr(), + m->use_bias ? beta.get_float_ptr() + : nullptr, stream); } else if (m->input_type[0] == DT_HALF) { LayerNorm::forward_kernel(m, input.get_half_ptr(), output.get_half_ptr(), gamma.get_half_ptr(), - beta.get_half_ptr(), + m->use_bias ? 
beta.get_half_ptr() : nullptr, stream); } else { assert(false && "unsupport datatype in layernorm"); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 9395c9aab4..01275c9875 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -72,6 +72,7 @@ Tensor bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { return spec_inc_multiquery_self_attention(input, embed_dim, @@ -89,6 +90,7 @@ Tensor scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); } @@ -109,6 +111,7 @@ Tensor bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; @@ -191,6 +194,7 @@ Tensor li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_int_property("position_bias", position_bias); layers.push_back(li); return li->outputs[0]; } @@ -228,6 +232,9 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( layer->get_float_property("scaling_factor", scaling_factor); layer->get_int_property("qk_prod_scaling", value); bool qk_prod_scaling = (bool)value; + layer->get_int_property("position_bias", value); + bool position_bias = (bool)value; + return new SpecIncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], @@ -244,6 +251,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, false /*allocate_weights*/, layer->name); } @@ -265,6 +273,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -284,7 +293,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling) { + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { // overwrite layer_guid layer_guid = _layer_guid; @@ -364,6 +373,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, char const *name) // Initializer* _bias_initializer) @@ -384,7 +394,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling) + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -471,6 +481,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( other.scaling_query, other.scaling_factor, other.qk_prod_scaling, + other.position_bias, allocate_weights, other.name) {} @@ -496,6 +507,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.scaling_query, params.scaling_factor, params.qk_prod_scaling, + params.position_bias, allocate_weights, name) {} @@ -811,7 +823,8 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, 
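The layer_norm hunks above make the beta (bias) weight optional: when use_bias is false only gamma is allocated, the launchers skip the second weight region, and the kernel wrappers hand the kernels a null beta pointer. The snippet below is a plain CPU sketch of that contract, not the actual FlexFlow kernels; layer_norm_ref and the 2-D row-major [batch x elems] layout are illustrative assumptions.

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference layer norm over each row of a [batch x elems] matrix.
// beta may be null (use_bias == false), in which case no bias is added.
void layer_norm_ref(float const *in, float *out,
                    float const *gamma, float const *beta,
                    size_t batch, size_t elems, float eps) {
  for (size_t b = 0; b < batch; b++) {
    float const *row = in + b * elems;
    float *dst = out + b * elems;
    float mean = 0.f, var = 0.f;
    for (size_t i = 0; i < elems; i++) {
      mean += row[i];
    }
    mean /= elems;
    for (size_t i = 0; i < elems; i++) {
      var += (row[i] - mean) * (row[i] - mean);
    }
    var /= elems;
    float rstd = 1.f / std::sqrt(var + eps);
    for (size_t i = 0; i < elems; i++) {
      float y = (row[i] - mean) * rstd;
      if (gamma != nullptr) {
        y *= gamma[i];
      }
      if (beta != nullptr) { // skipped entirely when use_bias == false
        y += beta[i];
      }
      dst[i] = y;
    }
  }
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> y(8);
  std::vector<float> gamma(4, 1.0f);
  layer_norm_ref(x.data(), y.data(), gamma.data(), /*beta=*/nullptr, 2, 4, 1e-5f);
  assert(std::abs(y[0] + y[3]) < 1e-4f); // each normalized row is centered at zero
  return 0;
}

This is why the updated task bodies above key their region-count asserts off m->use_bias instead of unconditionally expecting a beta region whenever elementwise_affine is set.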
lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && - lhs.qk_prod_scaling == rhs.qk_prod_scaling; + lhs.qk_prod_scaling == rhs.qk_prod_scaling && + lhs.position_bias == rhs.position_bias; } SpecIncMultiHeadSelfAttentionParams @@ -831,6 +844,7 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; + params.position_bias = this->position_bias; return params; } @@ -855,6 +869,7 @@ size_t hash::operator()( hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); + hash_combine(key, params.position_bias); return key; } }; // namespace std diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index e5e35a4c90..7e85a65e05 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -332,6 +332,20 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, HIPBLAS_GEMM_DEFAULT)); } } + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. @@ -660,6 +674,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->bias, attn->scaling_query, attn->qk_prod_scaling, + attn->position_bias, attn->add_bias_kv, attn->scaling_factor, weight, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index d1faba9c68..af70a07e83 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -348,7 +348,20 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } - + // add alibi position bias to qk production + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens); @@ -674,6 +687,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->bias, attn->scaling_query, attn->qk_prod_scaling, + attn->position_bias, attn->add_bias_kv, attn->scaling_factor, weight, diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index e26c306cf1..9597482ad2 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -74,6 +74,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { return inc_multiquery_self_attention_verify(input, embed_dim, @@ -91,6 +92,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, name); } @@ -111,6 +113,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( bool scaling_query, float scaling_factor, bool qk_prod_scaling, + bool position_bias, char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; @@ -203,6 +206,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); li->add_int_property("tensor_parallelism_degree", @@ -242,6 +246,8 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( layer->get_float_property("scaling_factor", scaling_factor); layer->get_int_property("qk_prod_scaling", value); bool qk_prod_scaling = (bool)value; + layer->get_int_property("position_bias", value); + bool position_bias = (bool)value; layer->get_int_property("quantization_type", value); DataType quantization_type = (DataType)value; layer->get_int_property("offload", value); @@ -264,6 +270,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( scaling_query, scaling_factor, qk_prod_scaling, + position_bias, false /*allocate_weights*/, quantization_type, offload, @@ -288,6 +295,7 @@ 
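Both the HIP and CUDA paths above launch apply_position_bias_qkprd when position_bias is set; this is the hook that lets ALiBi-style models (MPT, whose support is added further down in this same patch) bias the QK products directly rather than relying on rotary embeddings. The snippet below is only a CPU sketch of the usual ALiBi formulation; add_alibi_bias, the power-of-two slope formula, and the per-head [num_new_tokens x total_tokens] layout are assumptions for illustration, not the kernel's actual indexing or sharding.

#include <cmath>
#include <vector>

// Common ALiBi slope for head h (0-based) out of n heads, n a power of two.
float alibi_slope(int h, int n) {
  return std::pow(2.0f, -8.0f * (h + 1) / n);
}

// Add the bias to one head's score block of shape [num_new_tokens x total_tokens].
// Query i sits at absolute position total_tokens - num_new_tokens + i; keys at or
// before that position are penalized in proportion to their distance from it.
void add_alibi_bias(std::vector<float> &scores, int num_new_tokens,
                    int total_tokens, int head, int num_heads) {
  float slope = alibi_slope(head, num_heads);
  for (int i = 0; i < num_new_tokens; i++) {
    int q_pos = total_tokens - num_new_tokens + i;
    for (int j = 0; j <= q_pos; j++) {
      scores[i * total_tokens + j] += -slope * static_cast<float>(q_pos - j);
    }
  }
}

Positions after q_pos are left untouched here because the causal mask applied right after the launch overwrites them with -inf anyway.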
TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, DataType _quantization_type, bool _offload, @@ -310,8 +318,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), - offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) { + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), + quantization_type(_quantization_type), offload(_offload), + tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; @@ -399,6 +408,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, + bool _position_bias, bool allocate_weights, DataType _quantization_type, bool _offload, @@ -422,8 +432,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), quantization_type(_quantization_type), - offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), + quantization_type(_quantization_type), offload(_offload), + tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -515,6 +526,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( other.scaling_query, other.scaling_factor, other.qk_prod_scaling, + other.position_bias, allocate_weights, other.quantization_type, other.offload, @@ -543,6 +555,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.scaling_query, params.scaling_factor, params.qk_prod_scaling, + params.position_bias, allocate_weights, params.quantization_type, params.offload, @@ -1656,7 +1669,8 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs, lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && - lhs.qk_prod_scaling == rhs.qk_prod_scaling; + lhs.qk_prod_scaling == rhs.qk_prod_scaling && + lhs.position_bias == lhs.position_bias; } TreeIncMultiHeadSelfAttentionParams @@ -1676,6 +1690,7 @@ TreeIncMultiHeadSelfAttentionParams params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; + params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; return params; } @@ -1700,6 +1715,7 @@ size_t hash::operator()( hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); + hash_combine(key, params.position_bias); hash_combine(key, params.quantization_type); hash_combine(key, params.offload); hash_combine(key, params.tensor_parallelism_degree); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index b3a56f650a..1a9d1b6968 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ 
-322,6 +322,21 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, HIPBLAS_GEMM_DEFAULT)); } } + if (*m->position_bias) { + size_t parallelism = + m->num_q_heads * total_tokens_in_request * num_new_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens_in_request, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. @@ -680,6 +695,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->bias, attn->scaling_query, attn->qk_prod_scaling, + attn->position_bias, attn->add_bias_kv, attn->scaling_factor, weight, diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 69f085d3eb..f916bdb925 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -318,6 +318,21 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } + // add alibi position bias to qk production + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = + m->num_q_heads * total_tokens_in_request * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens_in_request, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. @@ -677,6 +692,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->bias, attn->scaling_query, attn->qk_prod_scaling, + attn->position_bias, attn->add_bias_kv, attn->scaling_factor, weight, diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index f348ca9016..0e957f0702 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2323,6 +2323,7 @@ GraphOptimalViewSerialized sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); + sez.serialize(attn->position_bias); sez.serialize(attn->quantization_type); sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); @@ -2346,6 +2347,7 @@ GraphOptimalViewSerialized sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); + sez.serialize(attn->position_bias); sez.serialize(attn->num_kv_heads); break; } @@ -2366,6 +2368,7 @@ GraphOptimalViewSerialized sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); + sez.serialize(attn->position_bias); sez.serialize(attn->quantization_type); sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); @@ -2740,7 +2743,7 @@ void FFModel::deserialize_graph_optimal_view( tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload; + scaling_query, qk_prod_scaling, offload, position_bias; DataType quantization_type; size_t id, transformer_layer_id; dez.deserialize(id); @@ -2758,6 +2761,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); + dez.deserialize(position_bias); dez.deserialize(quantization_type); dez.deserialize(offload); dez.deserialize(num_kv_heads); @@ -2777,6 +2781,7 @@ void FFModel::deserialize_graph_optimal_view( params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; + params.position_bias = position_bias; params.quantization_type = quantization_type; params.offload = offload; params.num_kv_heads = num_kv_heads; @@ -2789,7 +2794,7 @@ void 
FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling; + scaling_query, qk_prod_scaling, position_bias; size_t id, transformer_layer_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -2806,6 +2811,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); + dez.deserialize(position_bias); dez.deserialize(num_kv_heads); SpecIncMultiHeadSelfAttentionParams params; @@ -2822,6 +2828,7 @@ void FFModel::deserialize_graph_optimal_view( params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; + params.position_bias = position_bias; params.num_kv_heads = num_kv_heads; node = get_or_create_node(inputs[0], params); @@ -2833,7 +2840,7 @@ void FFModel::deserialize_graph_optimal_view( tensor_parallelism_degree; float dropout, scaling_factor; bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload; + scaling_query, qk_prod_scaling, offload, position_bias; DataType quantization_type; size_t id, transformer_layer_id; dez.deserialize(id); @@ -2851,6 +2858,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); + dez.deserialize(position_bias); dez.deserialize(quantization_type); dez.deserialize(offload); dez.deserialize(num_kv_heads); @@ -2870,6 +2878,7 @@ void FFModel::deserialize_graph_optimal_view( params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; + params.position_bias = position_bias; params.quantization_type = quantization_type; params.offload = offload; params.num_kv_heads = num_kv_heads; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 43b5df1f39..8c72cc51d1 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3237,6 +3237,9 @@ void FFModel::create_operators_from_layers() { (l->op_type == OP_LINEAR && layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_RELU && layers[layer_idx - 2]->op_type == OP_LINEAR) || + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || (l->op_type == OP_LINEAR && layer_idx >= 5 && layers[layer_idx - 1]->op_type == OP_EW_MUL && layers[layer_idx - 2]->op_type == OP_EW_MUL && diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d75b0fbe0b..d915a0e4aa 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -103,7 +103,8 @@ void RequestManager::register_tokenizer(ModelType type, this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); } else if (model_type == ModelType::FALCON || - model_type == ModelType::STARCODER) { + model_type == ModelType::STARCODER || + model_type == ModelType::MPT) { std::string falcon_tokenizer_path = join_path({path, "tokenizer.json"}); this->tokenizer_ = Tokenizer::FromBlobJSON(LoadBytesFromFile(falcon_tokenizer_path)); @@ -201,8 +202,9 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - request.tokens.push_back(bos_token_id); - + if (bos_token_id >= 0) { + request.tokens.push_back(bos_token_id); + } 
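The graph.cc changes above thread position_bias through both GraphOptimalViewSerialized and deserialize_graph_optimal_view for all three attention operators. The invariant being preserved is that the writer and the reader touch exactly the same fields in exactly the same order. The toy round-trip below illustrates that invariant with a plain byte buffer; AttnParams, put, and get are made-up names, and this is not the Legion serializer API.

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

struct AttnParams {
  int embed_dim;
  bool qk_prod_scaling;
  bool position_bias; // new field: must occupy the same slot on both sides
};

template <typename T>
void put(std::vector<char> &buf, T const &v) {
  char const *p = reinterpret_cast<char const *>(&v);
  buf.insert(buf.end(), p, p + sizeof(T));
}

template <typename T>
void get(std::vector<char> const &buf, size_t &off, T &v) {
  std::memcpy(&v, buf.data() + off, sizeof(T));
  off += sizeof(T);
}

int main() {
  AttnParams in{4096, true, true};
  AttnParams out{};
  std::vector<char> buf;
  // The write order and the read order must match, just as serialize and
  // deserialize_graph_optimal_view must above.
  put(buf, in.embed_dim);
  put(buf, in.qk_prod_scaling);
  put(buf, in.position_bias);
  size_t off = 0;
  get(buf, off, out.embed_dim);
  get(buf, off, out.qk_prod_scaling);
  get(buf, off, out.position_bias);
  assert(out.embed_dim == in.embed_dim && out.position_bias == in.position_bias);
  return 0;
}

The same discipline applies to the params structs themselves: any field that is serialized should also be compared in operator== (lhs against rhs on every line) and folded into the std::hash specialization, otherwise two operators that differ only in position_bias could be deduplicated into a single graph node.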
std::vector tokens = this->tokenizer_->Encode(prompt); if (tokens.size() > BatchConfig::MAX_PROMPT_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 9c4c37b2e7..e780bc17b0 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -49,6 +49,7 @@ llama_models = ["decapoda-research/llama-7b-hf", "JackFram/llama-160m"] opt_models = ["facebook/opt-6.7b", "facebook/opt-125m"] falcon_models = ["tiiuae/falcon-7b",] +mpt_models = ["mosaicml/mpt-7b", ] # starcoder_models = ["bigcode/starcoderbase-7b",] parallelism_settings = [(1,4), (2,2), (4,1)] @@ -63,7 +64,7 @@ # Generate incremental decoding configs -all_models = llama_models + opt_models + falcon_models +all_models = llama_models + opt_models + falcon_models + mpt_models for model_name in all_models: for full_precision in (True, False): for parallelism_degrees in parallelism_settings: From 4adad7d21da363c0fefb724a642e17c2a00925f8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 6 Sep 2023 11:45:58 -0400 Subject: [PATCH 224/344] Update docker-build.yml --- .github/workflows/docker-build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 2234ec60aa..899de4664e 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -63,6 +63,7 @@ jobs: cuda_version: ${{ matrix.gpu_backend_version }} hip_version: ${{ matrix.gpu_backend_version }} branch_name: ${{ github.head_ref || github.ref_name }} + timeout-minutes: 480 steps: - name: Checkout Git Repository uses: actions/checkout@v3 From 8f04bea0ca3abe521548b0e70695323284483f29 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 11 Sep 2023 11:10:52 -0400 Subject: [PATCH 225/344] bug fix --- python/flexflow/serve/__init__.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index a8d0a0294c..7531c006a8 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -138,19 +138,21 @@ def init( raise TypeError("configs_dict is not a dictionary") else: # Add named key-value arguments into dictionary - configs_dict["num_gpus"] = num_gpus - configs_dict["memory_per_gpu"] = memory_per_gpu - configs_dict["zero_copy_memory_per_node"] = zero_copy_memory_per_node - configs_dict["legion_utility_processors"] = legion_utility_processors - configs_dict["data_parallelism_degree"] = data_parallelism_degree - configs_dict["tensor_parallelism_degree"] = tensor_parallelism_degree - configs_dict["pipeline_parallelism_degree"] = pipeline_parallelism_degree - configs_dict["offload"] = offload - configs_dict["offload_reserve_space_size"] = offload_reserve_space_size - configs_dict["use_4bit_quantization"] = use_4bit_quantization - configs_dict["use_8bit_quantization"] = use_8bit_quantization - configs_dict["profiling"] = profiling - configs_dict["fusion"] = fusion + configs_dict = { + "num_gpus": num_gpus, + "memory_per_gpu": memory_per_gpu, + "zero_copy_memory_per_node": zero_copy_memory_per_node, + "legion_utility_processors": legion_utility_processors, + "data_parallelism_degree": data_parallelism_degree, + "tensor_parallelism_degree": tensor_parallelism_degree, + "pipeline_parallelism_degree": pipeline_parallelism_degree, + 
"offload": offload, + "offload_reserve_space_size": offload_reserve_space_size, + "use_4bit_quantization": use_4bit_quantization, + "use_8bit_quantization": use_8bit_quantization, + "profiling": profiling, + "fusion": fusion + } # Check that mandatory configs are present required_keys = ["num_gpus", "memory_per_gpu", "zero_copy_memory_per_node"] From c7cc6b4b906788126b5b833c1acad20454888520 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 17 Sep 2023 19:09:37 -0400 Subject: [PATCH 226/344] Fix Falcon model, inference test in CI (#1138) * . * gpu torch in docker * fix * add falcon to ci * re-enabled opt tests, linting --- docker/flexflow-environment/Dockerfile | 2 +- inference/python/spec_infer.py | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 24 +++++----- src/runtime/request_manager.cc | 2 +- tests/inference/cpp_inference_tests.sh | 46 +++++++++++-------- tests/inference/huggingface_inference.py | 16 ++++--- tests/inference/python_inference_tests.sh | 22 +++++---- .../python_test_configs/generate_configs.py | 4 +- tests/inference_tests.sh | 2 +- 9 files changed, 67 insertions(+), 53 deletions(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 774c585b44..a12f31c738 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -77,7 +77,7 @@ ENV CUDA_DIR /usr/local/cuda # Install python packages and other dependencies RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install CPU-only Pytorch and related dependencies -RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch +RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 192960b533..7dc6635819 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -67,7 +67,7 @@ def get_configs(): "ssms": [ { # required ssm parameter - "ssm_model": "JackFram/llama-160m", + "ssm_model": "JackFram/llama-160m-base", # optional ssm parameters "cache_path": "", "refresh_cache": False, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index af70a07e83..47e9941e1d 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -350,18 +350,18 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } // add alibi position bias to qk production // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. 
assert(num_new_tokens <= total_tokens); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d915a0e4aa..5eb3192e25 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -202,7 +202,7 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - if (bos_token_id >= 0) { + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } std::vector tokens = this->tokenizer_->Encode(prompt); diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 6a108303d6..8c8de22364 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,9 +10,9 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 @@ -22,9 +22,9 @@ cd "${BASH_SOURCE[0]%/*}" # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model 
decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -37,9 +37,9 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 @@ -69,11 +69,11 @@ fi # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -216,28 +216,32 @@ fi ######################### Alignment tests with HuggingFace #################################### # LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu # LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" +python3 ./huggingface_inference.py --model-name 
"decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" # LLAMA (big model, half precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu # OPT (small model, full precision) -python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 +python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 # OPT (small model, half precision) -python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 +python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 # OPT (big model, full precision) -#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 127 +python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 128 # OPT (big model, half precision) -#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 127 +# python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 128 + +# Falcon (full precision) +python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 + diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_160M.txt") diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) @@ -246,5 +250,7 @@ diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_125M.txt") diff 
<(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt") -#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt") +diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt") +# diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt") +diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_falcon_7B.txt") + diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 788d001dd8..072e8f2bdb 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -1,7 +1,7 @@ import argparse import json import os -from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer def main(): # Change working dir to folder storing this script @@ -12,7 +12,6 @@ def main(): # Parse command line arguments parser = argparse.ArgumentParser() parser.add_argument("--model-name", type=str, required=True) - parser.add_argument("--tokenizer-model-name", type=str, required=True) parser.add_argument("--max-length", type=int, default=128) parser.add_argument("--prompt-file", type=str, required=True) parser.add_argument("--output-file", type=str, required=True) @@ -46,15 +45,20 @@ def main(): # Run huggingface model device = "cuda" if args.gpu else "cpu" + # Get Model model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device) - if args.tokenizer_model_name == "JackFram/llama-160m": - tokenizer = LlamaTokenizer.from_pretrained("JackFram/llama-160m", use_fast=True) + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) else: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model_name) + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + # Generate output with open(args.output_file, "w") as f: for i, prompt in enumerate(prompt_list): batch = tokenizer( - prompt_list, return_tensors="pt", add_special_tokens=True + prompt, return_tensors="pt", add_special_tokens=True ).to(device) generated = model.generate(batch["input_ids"], max_length=args.max_length) out = tokenizer.decode(generated[0]) diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 800c0ad043..3618401c9d 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -157,28 +157,31 @@ check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_ ######################### Alignment tests with HuggingFace #################################### # LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision 
--prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu # LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" +python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" # LLAMA (big model, half precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --tokenizer-model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu # OPT (small model, full precision) -python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 +python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 # OPT (small model, half precision) -python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --tokenizer-model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 +python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 # OPT (big model, full precision) -#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 127 +python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 128 # OPT (big model, half precision) -#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --tokenizer-model-name "facebook/opt-6.7b" --prompt-file 
"../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 127 +#python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 128 + +# Falcon (full precision) +python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) @@ -187,5 +190,6 @@ diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") #diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") +diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index e780bc17b0..e683faa469 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -34,7 +34,7 @@ "ssms": [ { # required ssm parameter - "ssm_model": "JackFram/llama-160m", + "ssm_model": "JackFram/llama-160m-base", # optional ssm parameters "cache_path": "", "refresh_cache": False, @@ -46,7 +46,7 @@ ff_init_configs.update(llm_configs) # Test parameters to fill in -llama_models = ["decapoda-research/llama-7b-hf", "JackFram/llama-160m"] +llama_models = ["decapoda-research/llama-7b-hf", "JackFram/llama-160m-base"] opt_models = ["facebook/opt-6.7b", "facebook/opt-125m"] falcon_models = ["tiiuae/falcon-7b",] mpt_models = ["mosaicml/mpt-7b", ] diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index b1d45853e2..c01b0730b6 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -23,7 +23,7 @@ cleanup pip3 install protobuf==3.20.3 # Download the weights in both half and full precision -python3 ../inference/utils/download_hf_model.py "decapoda-research/llama-7b-hf" "JackFram/llama-160m" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" +python3 ../inference/utils/download_hf_model.py "decapoda-research/llama-7b-hf" "JackFram/llama-160m-base" "facebook/opt-6.7b" "facebook/opt-125m" 
"tiiuae/falcon-7b" # Create test prompt file mkdir -p ../inference/prompt From b1b44610b570aafb1fc1dc6e61864e5751b269f7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 19 Sep 2023 16:22:34 -0400 Subject: [PATCH 227/344] fix ci --- tests/inference/python_inference_tests.sh | 12 ++++++------ tests/inference_tests.sh | 8 +++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 3618401c9d..5cbcbc31f9 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -133,10 +133,10 @@ check_partial_token_match "../../inference/output/spec_infer-python-opt-6.7b-hal ## Incremental decoding # Small LLAMA -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" # Big LLAMA diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" @@ -183,8 +183,8 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p # Falcon (full precision) python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 
"../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") +diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index c01b0730b6..c757dd5ee6 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -22,9 +22,6 @@ cleanup # Make sure supported version of protobuf is installed pip3 install protobuf==3.20.3 -# Download the weights in both half and full precision -python3 ../inference/utils/download_hf_model.py "decapoda-research/llama-7b-hf" "JackFram/llama-160m-base" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" - # Create test prompt file mkdir -p ../inference/prompt echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json @@ -32,11 +29,16 @@ echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json # Create output folder mkdir -p ../inference/output +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + if [[ "$PYTHON_INFERENCE_TESTS" == "ON" ]]; then echo "Running Python inference tests..." ./inference/python_inference_tests.sh fi if [[ "$CPP_INFERENCE_TESTS" == "ON" ]]; then + # Manually download the weights in both half and full precision + python3 ../inference/utils/download_hf_model.py "decapoda-research/llama-7b-hf" "JackFram/llama-160m-base" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" echo "Running C++ inference tests..." ./inference/cpp_inference_tests.sh fi From 2ef52f881571808bdc3ada371aeefa64894e60cc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 20 Sep 2023 16:24:55 -0400 Subject: [PATCH 228/344] Do not run empty kernels (`num_tokens=0`) (#1141) * . 
* fix --- include/flexflow/model.h | 4 + include/flexflow/ops/element_binary.h | 4 + include/flexflow/ops/element_unary.h | 4 + include/flexflow/ops/layer_norm.h | 4 + include/flexflow/ops/rms_norm.h | 4 + include/flexflow/ops/softmax.h | 9 +-- src/ops/element_binary.cc | 108 +++++++++++++++++++++++++- src/ops/element_unary.cc | 28 ++++++- src/ops/layer_norm.cc | 76 +++++++++++++++++- src/ops/linear.cc | 3 + src/ops/rms_norm.cc | 28 ++++++- src/ops/softmax.cc | 18 +++-- src/runtime/model.cc | 69 ++++++++++++++-- 13 files changed, 336 insertions(+), 23 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d4d829b019..6a2bfdb666 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -52,10 +52,12 @@ enum TaskIDs { LOAD_IMAGES_TASK_ID, NORMALIZE_IMAGES_TASK_ID, ELEMENTBINARY_INIT_TASK_ID, + ELEMENTBINARY_INF_TASK_ID, ELEMENTBINARY_FWD_TASK_ID, ELEMENTBINARY_BWD_TASK_ID, ELEMENTUNARY_INIT_TASK_ID, ELEMENTUNARY_FWD_TASK_ID, + ELEMENTUNARY_INF_TASK_ID, ELEMENTUNARY_BWD_TASK_ID, EXPERTS_INIT_TASK_ID, EXPERTS_FWD_TASK_ID, @@ -102,6 +104,7 @@ enum TaskIDs { BATCHMATMUL_BWD_TASK_ID, LAYERNORM_INIT_TASK_ID, LAYERNORM_FWD_TASK_ID, + LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, @@ -150,6 +153,7 @@ enum TaskIDs { ATTENTION_BWD_TASK_ID, RMSNROM_INIT_TASK_ID, RMSNROM_FWD_TASK_ID, + RMSNROM_INF_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index 4aa41ed9e4..08747bb9a4 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -56,6 +56,10 @@ class ElementBinary : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index 2df9ea61bc..f82db5f910 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -77,6 +77,10 @@ class ElementUnary : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 8e0b4f61ff..c65370e0fd 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -63,6 +63,10 @@ class LayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 36dde15b90..a3074de015 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -69,6 +69,10 @@ class RMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + 
std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 1d5191d7ee..6fd1a434d4 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -53,11 +53,10 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static InferenceResult - inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 2cd5ba100e..21edad11e3 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -540,7 +540,7 @@ FutureMap size_t machine_view_hash = view->hash(); /* std::cout << "ElementBinary op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(ELEMENTBINARY_FWD_TASK_ID, + IndexLauncher launcher(ELEMENTBINARY_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -548,6 +548,7 @@ FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); if (inplace_a) { assert(batch_outputs[0]->part == batch_inputs[0]->part); assert(batch_outputs[0]->region == batch_inputs[0]->region); @@ -603,6 +604,111 @@ FutureMap return runtime->execute_index_space(ctx, launcher); } +/* + regions[0](I): in1 + regions[1](I): in2 + regions[2](O): output +*/ +__host__ void + ElementBinary::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + // const ElementBinary* ele = (const ElementBinary*) task->args; + ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); + GenericTensorAccessorR in1, in2; + GenericTensorAccessorW out; + Domain in1_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + + if (!m->has_same_operands) { + Domain in2_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // Currently only support broadcast for add and sub + if (in1_domain != in2_domain) { + assert(m->op_type == OP_EW_SUB || m->op_type == OP_EW_ADD || + m->op_type == OP_EW_MUL); + } + } + + if (m->inplace_a) { + if (m->has_same_operands) { + assert(regions.size() == 1); + assert(task->regions.size() == 1); + out = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + in2 = out; + in1 = out; + } else { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + out = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + in2 = helperGetGenericTensorAccessorRO(m->input_type[1], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + in1 = out; + } + } else { + if (m->has_same_operands) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + in1 = helperGetGenericTensorAccessorRO(m->input_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + 
in2 = in1; + out = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + } else { + assert(regions.size() == 3); + assert(task->regions.size() == 3); + in1 = helperGetGenericTensorAccessorRO(m->input_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + in2 = helperGetGenericTensorAccessorRO(m->input_type[1], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + out = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + } + } + + forward_kernel_wrapper(m, in1, in2, out); +} + /* regions[0](I): in1 regions[1](I): in2 diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index c82c1196a2..bdb594b0f6 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -432,7 +432,7 @@ FutureMap set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(ELEMENTUNARY_FWD_TASK_ID, + IndexLauncher launcher(ELEMENTUNARY_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -440,6 +440,7 @@ FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); if (inplace) { assert(batch_outputs[0]->part == batch_inputs[0]->part); assert(batch_outputs[0]->region == batch_inputs[0]->region); @@ -468,6 +469,31 @@ FutureMap return runtime->execute_index_space(ctx, launcher); } +void ElementUnary::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + ElementUnaryMeta const *m = *((ElementUnaryMeta **)task->local_args); + if (m->data_type == DT_HALF) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (m->data_type == DT_FLOAT) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (m->data_type == DT_DOUBLE) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (m->data_type == DT_INT32) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (m->data_type == DT_INT64) { + forward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Embedding forward"); + } +} + void ElementUnary::forward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 09a5e40851..758d7cfcce 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -442,7 +442,7 @@ FutureMap LayerNorm::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LAYERNORM_FWD_TASK_ID, + IndexLauncher launcher(LAYERNORM_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -450,6 +450,7 @@ FutureMap LayerNorm::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -482,6 +483,79 @@ FutureMap LayerNorm::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } +/* + regions[0](I): input + regions[1](O): output + regions[2](I/O): gamma + regions[3](I/O): beta +*/ +void LayerNorm::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + 
assert(task->regions.size() == regions.size()); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + + LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); + assert(task->regions.size() == regions.size()); + float const *in_ptr = NULL; + float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; + GenericTensorAccessorR in, gamma, beta; + GenericTensorAccessorW out; + + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + // in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + in = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain out_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // out_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + out = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + assert(in_domain == out_domain); + assert(in_domain.get_volume() == + m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 4)); + Domain gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + gamma = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + if (m->use_bias) { + Domain beta_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + beta = helperGetGenericTensorAccessorRO(m->input_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + assert(gamma_domain == beta_domain); + } + + assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim(); + size_t vol = 1; + int i = 0; + while (vol < gamma_domain.get_volume()) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + vol *= g_d; + i++; + } + } else { + assert(regions.size() == 2); + } + LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); +} + /* regions[0](I): input regions[1](O): output diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 8eb3db2869..a751ebcc57 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -619,6 +619,9 @@ void Linear::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta const *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } assert(regions.size() == (3 + static_cast(m->use_bias))); assert(task->regions.size() == (3 + static_cast(m->use_bias))); if (m->quantization_type == DT_NONE) { diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 8fdfc7bc1e..c3a4e9b58c 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -347,7 +347,7 @@ FutureMap RMSNorm::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RMSNROM_FWD_TASK_ID, + IndexLauncher launcher(RMSNROM_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -355,6 +355,7 @@ FutureMap RMSNorm::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); 
launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -397,6 +398,31 @@ void RMSNorm::forward_task(Task const *task, forward_kernel_wrapper(m, input, weight, output); } +/* + regions[0](I): input + regions[1](O): output + regions[2](I/O): weight +*/ +void RMSNorm::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 3); + assert(regions.size() == 3); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + forward_kernel_wrapper(m, input, weight, output); +} + void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 2d72151035..28c9ecea67 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -255,6 +255,7 @@ FutureMap Softmax::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -429,11 +430,15 @@ void Softmax::backward_task_with_dim(Task const *task, m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); } -InferenceResult - Softmax::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +void Softmax::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); @@ -454,9 +459,6 @@ InferenceResult default: assert(false); } - // FIXME: replace this with actual result - InferenceResult ir; - return ir; } bool Softmax::get_int_parameter(PMParameter para, int *value) const { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 8c72cc51d1..2768439117 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4446,6 +4446,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ELEMENTUNARY_INF_TASK_ID, + "ElementWiseUnary Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ElementWiseUnary Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(ELEMENTUNARY_FWD_TASK_ID, "ElementWiseUnary Forward"); @@ -4493,6 +4508,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ELEMENTBINARY_INF_TASK_ID, + 
"ElementWiseBinary Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ElementWiseBinary Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(ELEMENTBINARY_FWD_TASK_ID, "ElementWiseBinary Forward"); @@ -5146,6 +5176,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(LAYERNORM_INF_TASK_ID, + "LayerNorm Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LayerNorm Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_FWD_TASK_ID, "layernorm_fwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5189,6 +5234,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(RMSNROM_INF_TASK_ID, "RMS Norm Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5347,19 +5406,17 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(SOFTMAX_INF_TASK_ID, "softmax_inf_task"); + TaskVariantRegistrar registrar(SOFTMAX_INF_TASK_ID, "Softmax Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "softmax_inf_task"); + Runtime::preregister_task_variant( + registrar, "Softmax Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( - registrar); + runtime->register_task_variant(registrar); } } // compute Loss From a4f2588b37fc2476ed90bf8370e9843d0c5e3a47 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 21 Sep 2023 15:24:57 -0400 Subject: [PATCH 229/344] Fuse inference kernels to reduce kernel launch overhead (part 1) (#1128) * add add_bias_residual_layer_norm layer * progress * make it compile * fix bias param * . * . * . * file loader update * . * . * . * . * fixes * fix * fix * backup * finished debugging * . * gpu torch in docker * fix * add falcon to ci * re-enabled opt tests, linting * fix * cpu * temporary ci fix * . * fix * fix * fix * linting * Revert "linting" This reverts commit 5bbe29e4b059d759eb71392858350fd52c700167. * fix * fix * fix * fix * turn on backtrace * fix * fix rocm kernel * fix * . 
* fix * do not run empty AddBiasResidualLayerNorm * fix * fix rocm compilation * remove clutter --- include/flexflow/ffconst.h | 1 + include/flexflow/flexflow_c.h | 14 +- include/flexflow/model.h | 17 + include/flexflow/operator_params.h | 2 + .../ops/add_bias_residual_layer_norm.h | 113 +++ .../ops/add_bias_residual_layer_norm_params.h | 29 + include/flexflow/ops/beam_topk_params.h | 1 + include/flexflow/ops/element_binary_params.h | 1 + include/flexflow/ops/element_unary_params.h | 1 + include/flexflow/ops/experts_params.h | 2 + include/flexflow/ops/gather_params.h | 2 + .../ops/inc_multihead_self_attention.h | 19 +- .../ops/inc_multihead_self_attention_params.h | 5 +- include/flexflow/ops/layer_norm_params.h | 2 + include/flexflow/ops/reduce_params.h | 2 + include/flexflow/ops/reshape_params.h | 2 + include/flexflow/ops/rms_norm_params.h | 2 + .../ops/spec_inc_multihead_self_attention.h | 12 +- ...spec_inc_multihead_self_attention_params.h | 5 +- .../ops/tree_inc_multihead_self_attention.h | 12 +- ...tree_inc_multihead_self_attention_params.h | 5 +- include/flexflow/substitution_loader.h | 1 + inference/file_loader.cc | 298 +++--- inference/file_loader.h | 20 +- inference/models/falcon.cc | 25 +- inference/models/llama.cc | 25 +- inference/models/mpt.cc | 2 +- inference/models/opt.cc | 191 ++-- inference/models/starcoder.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 42 +- python/flexflow/serve/models/falcon.py | 30 +- python/flexflow/serve/models/llama.py | 34 +- python/flexflow/serve/models/mpt.py | 32 +- python/flexflow/serve/models/opt.py | 59 +- python/flexflow/serve/models/starcoder.py | 4 +- python/flexflow/serve/serve.py | 5 +- python/flexflow/type.py | 1 + src/c/flexflow_c.cc | 58 +- src/ops/add_bias_residual_layer_norm.cc | 829 ++++++++++++++++ src/ops/add_bias_residual_layer_norm.cpp | 262 ++++++ src/ops/add_bias_residual_layer_norm.cu | 299 ++++++ src/ops/fused.cpp | 348 +++++-- src/ops/fused.cu | 78 +- src/ops/inc_multihead_self_attention.cc | 99 +- src/ops/inc_multihead_self_attention.cpp | 51 +- src/ops/inc_multihead_self_attention.cu | 64 +- src/ops/kernels/element_binary_kernels.cu | 2 +- src/ops/layer_norm.cc | 23 +- src/ops/layer_norm.cu | 108 ++- src/ops/spec_inc_multihead_self_attention.cc | 103 +- src/ops/spec_inc_multihead_self_attention.cpp | 8 +- src/ops/spec_inc_multihead_self_attention.cu | 8 +- src/ops/tree_inc_multihead_self_attention.cc | 887 ++---------------- src/ops/tree_inc_multihead_self_attention.cpp | 8 +- src/ops/tree_inc_multihead_self_attention.cu | 8 +- src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 48 +- src/runtime/model.cc | 43 + src/runtime/operator_params.cc | 3 + src/runtime/substitution.cc | 17 + tests/inference/huggingface_inference.py | 6 +- 61 files changed, 2875 insertions(+), 1507 deletions(-) create mode 100644 include/flexflow/ops/add_bias_residual_layer_norm.h create mode 100644 include/flexflow/ops/add_bias_residual_layer_norm_params.h create mode 100644 src/ops/add_bias_residual_layer_norm.cc create mode 100644 src/ops/add_bias_residual_layer_norm.cpp create mode 100644 src/ops/add_bias_residual_layer_norm.cu diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 78d98284a4..f86cbff399 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -159,6 +159,7 @@ enum OperatorType { OP_POW, // https://pytorch.org/docs/stable/generated/torch.pow.html OP_MEAN, // https://pytorch.org/docs/stable/generated/torch.mean.html OP_LAYERNORM, + OP_ADD_BIAS_RESIDUAL_LAYERNORM, OP_EXPERTS, 
OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html OP_RMS_NORM, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 7977a083cc..e363ccf888 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -259,6 +259,17 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle, bool use_bias, char const *name); +flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( + flexflow_model_t handle, + const flexflow_tensor_t input, + const flexflow_tensor_t residual, + int n, + int *axes, + bool elementwise_affine, + float eps, + bool use_bias, + char const *name); + flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle, const flexflow_tensor_t a, @@ -972,9 +983,6 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_, - int num_layers, - char const **layer_names, - flexflow_op_t *layers, bool use_full_precision); #ifdef __cplusplus diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 6a2bfdb666..05a12bee31 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -106,6 +106,8 @@ enum TaskIDs { LAYERNORM_FWD_TASK_ID, LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, @@ -309,6 +311,7 @@ class Flat; class Gather; class Group_by; class LayerNorm; +class AddBiasResidualLayerNorm; class Linear; class MultiHeadAttention; class IncMultiHeadSelfAttention; @@ -532,6 +535,16 @@ class FFModel { bool use_bias = true, DataType data_type = DT_NONE, char const *name = NULL); + // Add a add_bias_residual_layer_norm layer + void add_bias_residual_layer_norm(const Tensor input, + const Tensor residual, + Tensor *outputs, + std::vector const &axes, + bool elementwise_affine, + float eps, + bool use_bias = true, + DataType data_type = DT_NONE, + char const *name = NULL); // Add a batch_norm layer Tensor batch_norm(const Tensor input, bool relu = true, char const *name = NULL); @@ -1115,6 +1128,10 @@ class FFModel { Group_by *>, std::unordered_map, LayerNorm *>, + std::unordered_map< + std::pair, + AddBiasResidualLayerNormParams>, + AddBiasResidualLayerNorm *>, std::unordered_map, Linear *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 4f0432cb93..514c70f2ec 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -1,6 +1,7 @@ #ifndef _OPERATOR_PARAMS_H #define _OPERATOR_PARAMS_H +#include "flexflow/ops/add_bias_residual_layer_norm_params.h" #include "flexflow/ops/aggregate_params.h" #include "flexflow/ops/aggregate_spec_params.h" #include "flexflow/ops/arg_topk_params.h" @@ -59,6 +60,7 @@ using OperatorParameters = mp::variant; + AddBiasResidualLayerNorm(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name = nullptr, + bool allocate_weights = false); + AddBiasResidualLayerNorm(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + const ParallelTensor _residual, + std::vector const &axes, + bool _elementwise_affine, + bool _use_bias, + float _eps, + bool allocate_weights, + char const *name); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + + AddBiasResidualLayerNormParams get_params() const; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + template + static void inference_kernel(AddBiasResidualLayerNormMeta const *m, + int attn_bias_dim, + int residual_volume, + T const *input_ptr, + T const *attn_bias_ptr, + T const *residual_ptr, + T *added_output_ptr, + T *output_ptr, + T const *gamma_ptr, + T const *beta_ptr, + ffStream_t stream); + static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + int attn_bias_dim, + int residual_volume, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &added_output, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &residual, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); + +public: + bool elementwise_affine, use_bias; + int64_t effective_batch_size, effective_num_elements; + float eps; + std::vector axes; +}; + +class AddBiasResidualLayerNormMeta : public OpMeta { +public: + AddBiasResidualLayerNormMeta(FFHandler handle, + AddBiasResidualLayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator); + ~AddBiasResidualLayerNormMeta(void); + +public: + bool elementwise_affine, use_bias; + int64_t effective_batch_size, effective_num_elements; + float eps; + void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + char op_name[MAX_OPNAME]; + Realm::RegionInstance reserveInst; +}; + +}; // namespace FlexFlow diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h new file mode 100644 index 0000000000..6f49983467 --- /dev/null +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -0,0 +1,29 @@ +#pragma once + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct AddBiasResidualLayerNormParams { + LayerID layer_guid; + std::vector axes; + bool elementwise_affine; + float eps; + bool use_bias; + bool is_valid( + std::pair const &) const; +}; + +bool operator==(AddBiasResidualLayerNormParams const &, + AddBiasResidualLayerNormParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::AddBiasResidualLayerNormParams const &) const; +}; +} // namespace std diff --git a/include/flexflow/ops/beam_topk_params.h b/include/flexflow/ops/beam_topk_params.h index c217b0f671..430f16e249 100644 --- a/include/flexflow/ops/beam_topk_params.h +++ b/include/flexflow/ops/beam_topk_params.h @@ -2,6 +2,7 @@ #define 
_FLEXFLOW_BEAM_TOPK_PARAMS_H #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { diff --git a/include/flexflow/ops/element_binary_params.h b/include/flexflow/ops/element_binary_params.h index 8b26877af2..9489b793a7 100644 --- a/include/flexflow/ops/element_binary_params.h +++ b/include/flexflow/ops/element_binary_params.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_ELEMENT_BINARY_PARAMS_H #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { diff --git a/include/flexflow/ops/element_unary_params.h b/include/flexflow/ops/element_unary_params.h index 00683c89a0..1aac85c43e 100644 --- a/include/flexflow/ops/element_unary_params.h +++ b/include/flexflow/ops/element_unary_params.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_ELEMENTARY_UNARY_PARAMS_H #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { diff --git a/include/flexflow/ops/experts_params.h b/include/flexflow/ops/experts_params.h index b6ba88a96e..7adced3c8c 100644 --- a/include/flexflow/ops/experts_params.h +++ b/include/flexflow/ops/experts_params.h @@ -1,5 +1,7 @@ #pragma once +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/operator.h" #include "flexflow/parallel_tensor.h" diff --git a/include/flexflow/ops/gather_params.h b/include/flexflow/ops/gather_params.h index 768d135e88..51f1184a72 100644 --- a/include/flexflow/ops/gather_params.h +++ b/include/flexflow/ops/gather_params.h @@ -1,6 +1,8 @@ #ifndef _FLEXFLOW_GATHER_PARAMS_H #define _FLEXFLOW_GATHER_PARAMS_H +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 010c3f6085..8290998f02 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -36,8 +36,8 @@ class IncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -58,8 +58,8 @@ class IncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -125,8 +125,8 @@ class IncMultiHeadSelfAttention : public Op { public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool bias; - bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + bool qkv_bias; + bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; @@ -154,11 +154,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { int _vProjSize, int _oProjSize, bool _apply_rotary_embedding, - bool _bias, + bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _add_bias_kv, + bool _final_bias, float _scaling_factor, GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, @@ -179,7 +179,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads; bool *has_load_weights; 
bool *apply_rotary_embedding; - bool *bias; + bool *qkv_bias; + bool *final_bias; bool *scaling_query; bool *qk_prod_scaling; bool *position_bias; diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 202ff70bc9..7ae39f1cfe 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H +#include "flexflow/ffconst.h" #include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" @@ -11,8 +12,8 @@ struct IncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/ops/layer_norm_params.h b/include/flexflow/ops/layer_norm_params.h index 509593c285..c9aa40048d 100644 --- a/include/flexflow/ops/layer_norm_params.h +++ b/include/flexflow/ops/layer_norm_params.h @@ -1,5 +1,7 @@ #pragma once +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { diff --git a/include/flexflow/ops/reduce_params.h b/include/flexflow/ops/reduce_params.h index a4777f2be9..b79ba9157a 100644 --- a/include/flexflow/ops/reduce_params.h +++ b/include/flexflow/ops/reduce_params.h @@ -1,5 +1,7 @@ #pragma once +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { diff --git a/include/flexflow/ops/reshape_params.h b/include/flexflow/ops/reshape_params.h index f6aa4f8c51..ffd88948ea 100644 --- a/include/flexflow/ops/reshape_params.h +++ b/include/flexflow/ops/reshape_params.h @@ -1,6 +1,8 @@ #ifndef _FLEXFLOW_RESHAPE_PARAMS_H #define _FLEXFLOW_RESHAPE_PARAMS_H +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { diff --git a/include/flexflow/ops/rms_norm_params.h b/include/flexflow/ops/rms_norm_params.h index 82a459009a..81295322f0 100644 --- a/include/flexflow/ops/rms_norm_params.h +++ b/include/flexflow/ops/rms_norm_params.h @@ -1,6 +1,8 @@ #ifndef _FLEXFLOW_RMSNORM_PARAMS_H #define _FLEXFLOW_RMSNORM_PARAMS_H +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index fa9251c871..363776cdb0 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -33,8 +33,8 @@ class SpecIncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -52,8 +52,8 @@ class SpecIncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -120,8 
+120,8 @@ class SpecIncMultiHeadSelfAttention : public Op { public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool bias; - bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + bool qkv_bias; + bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index b59a237e20..2f7a706bf1 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H #define _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H +#include "flexflow/ffconst.h" #include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" @@ -10,8 +11,8 @@ struct SpecIncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; float dropout, scaling_factor; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling, position_bias; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index c4d7ae17e9..6e2da19ce9 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -33,8 +33,8 @@ class TreeIncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -55,8 +55,8 @@ class TreeIncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -123,8 +123,8 @@ class TreeIncMultiHeadSelfAttention : public Op { public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool bias; - bool add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, + bool qkv_bias; + bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index a897c76162..14fcde74ba 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_PARAMS_H #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_PARAMS_H +#include "flexflow/ffconst.h" #include "flexflow/fftype.h" #include "flexflow/parallel_tensor.h" @@ -11,8 +12,8 @@ struct TreeIncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, scaling_query, 
- qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/substitution_loader.h b/include/flexflow/substitution_loader.h index 776fe2c78e..6dbb12e28b 100644 --- a/include/flexflow/substitution_loader.h +++ b/include/flexflow/substitution_loader.h @@ -125,6 +125,7 @@ NLOHMANN_JSON_SERIALIZE_ENUM( {OP_POW, "OP_POW"}, {OP_MEAN, "OP_MEAN"}, {OP_LAYERNORM, "OP_LAYERNORM"}, + {OP_ADD_BIAS_RESIDUAL_LAYERNORM, "OP_ADD_BIAS_RESIDUAL_LAYERNORM"}, {OP_RMS_NORM, "OP_RMS_NORM"}, {OP_REPARTITION, "OP_PARTITION"}, {OP_COMBINE, "OP_COMBINE"}, diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 78f190dad6..5b92f31552 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -22,14 +22,14 @@ using namespace std; using namespace Legion; -FileDataLoader::FileDataLoader(std::string _input_path, - std::string _weight_file_path, +FileDataLoader::FileDataLoader(std::string _prompts_filepath, + std::string _weights_folder, int _num_heads, int _num_kv_heads, size_t _hidden_dim, size_t _qkv_inner_dim, int _tensor_parallelism_degree) - : input_path(_input_path), weight_file_path(_weight_file_path), + : prompts_filepath(_prompts_filepath), weights_folder(_weights_folder), num_heads(_num_heads), num_kv_heads(_num_kv_heads), hidden_dim(_hidden_dim), qkv_inner_dim(_qkv_inner_dim), tensor_parallelism_degree(_tensor_parallelism_degree){}; @@ -38,7 +38,7 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { BatchConfig::TokenId *prompts = (BatchConfig::TokenId *)malloc(sizeof(BatchConfig::TokenId) * 40); - std::ifstream in(input_path, std::ios::in | std::ios::binary); + std::ifstream in(prompts_filepath, std::ios::in | std::ios::binary); int size = num * length; std::vector host_array(size); size_t loaded_data_size = sizeof(long) * size; @@ -64,33 +64,46 @@ BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { return prompts; }; +std::string removeGuidOperatorName(std::string const &input) { + // Find the last underscore in the string + size_t underscorePos = input.find_last_of('_'); + + if (underscorePos != std::string::npos) { + // Remove the underscore and the characters after it + return input.substr(0, underscorePos); + } else { + // No underscore found, return the original string + return input; + } +} + template void load_attention_weights_multi_query(DT *ptr, std::string layer_name, - std::string weight_path, + std::string weights_folder, size_t hidden_dim, int num_heads) { - std::string qkv_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + + std::string qkv_file = layer_name.substr(0, layer_name.find("attention")) + "attention_query_key_value_weight"; - std::string o_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + + std::string o_file = layer_name.substr(0, layer_name.find("attention")) + "attention_dense_weight"; // q has n_heads heads, k and v only have one head, o have n_head heads - std::vector weight_files = {qkv_file, o_file}; + std::vector weight_filenames = {qkv_file, o_file}; int file_index = 0; int data_index = 0; - for (auto file : weight_files) { + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); size_t partial_size = file_index 
== 0 ? (hidden_dim + 2 * hidden_dim / num_heads) * hidden_dim : hidden_dim * hidden_dim; - std::ifstream in(file, std::ios::in | std::ios::binary); - // std::cout << "Loading filename: " << file << std::endl; + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + // std::cout << "Loading filename: " << weight_filepath << std::endl; if (!in.good()) { - std::cout << "Could not open file: " << file << std::endl; + std::cout << "Could not open file: " << weight_filepath << std::endl; } assert(in.good() && "incorrect weight file path"); std::vector
host_array(partial_size); @@ -118,21 +131,17 @@ void load_attention_bias_v2(DT *ptr, int num_kv_heads, size_t hidden_dim, size_t qkv_inner_dim, + bool final_bias, std::string layer_name, - std::string weight_path) { - std::string q_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wq_bias"; - std::string k_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wk_bias"; - std::string v_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wv_bias"; - std::string o_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wo_bias"; - std::vector bias_files = {q_file, k_file, v_file, o_file}; + std::string weights_folder) { + std::string q_file = layer_name + "_wq_bias"; + std::string k_file = layer_name + "_wk_bias"; + std::string v_file = layer_name + "_wv_bias"; + std::vector bias_files = {q_file, k_file, v_file}; + if (final_bias) { + std::string o_file = layer_name + "_wo_bias"; + bias_files.push_back(o_file); + } int file_index = 0; @@ -140,13 +149,16 @@ void load_attention_bias_v2(DT *ptr, // assert(num_heads == num_kv_heads); int idx = 0; - for (auto file : bias_files) { + for (auto filename : bias_files) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + int n_heads = file_index == 0 ? num_heads : num_kv_heads; size_t qkv_partial_size = qkv_inner_dim * n_heads; size_t out_partial_size = hidden_dim; size_t partial_size = (file_index < 3) ? qkv_partial_size : out_partial_size; - std::ifstream in(file, std::ios::in | std::ios::binary); + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); assert(in.good() && "incorrect bias file path"); std::vector
host_array(partial_size); size_t loaded_data_size = sizeof(DT) * partial_size; @@ -185,24 +197,16 @@ void load_attention_weights_v2(DT *ptr, size_t hidden_dim, size_t qkv_inner_dim, std::string layer_name, - std::string weight_path, + std::string weights_folder, size_t volume, int tensor_parallelism_degree) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight - std::string q_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wq_weight"; - std::string k_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wk_weight"; - std::string v_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wv_weight"; - std::string o_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wo_weight"; - std::vector weight_files = {q_file, k_file, v_file}; + std::string q_file = layer_name + "_wq_weight"; + std::string k_file = layer_name + "_wk_weight"; + std::string v_file = layer_name + "_wv_weight"; + std::string o_file = layer_name + "_wo_weight"; + std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; int base_index = 0; @@ -219,16 +223,19 @@ void load_attention_weights_v2(DT *ptr, // stride for q, k, v, o size_t stride_size = (q_size + v_size + k_size + o_size) / tensor_parallelism_degree; - for (auto file : weight_files) { + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + int data_index = 0; size_t partial_size = (file_index == 0 || file_index == 3) ? one_weight_file_size : single_proj_size * num_kv_heads; size_t one_partition_size = partial_size / tensor_parallelism_degree; - std::ifstream in(file, std::ios::in | std::ios::binary); + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); if (!in.good()) { - std::cout << "Could not open file: " << file << std::endl; + std::cout << "Could not open file: " << weight_filepath << std::endl; } assert(in.good() && "incorrect weight file path"); std::vector
host_array(partial_size); @@ -240,8 +247,8 @@ void load_attention_weights_v2(DT *ptr, if (in_get_size != loaded_data_size) { std::cout << "load attention data error " << in_get_size << ", " - << loaded_data_size << ", " << file_index << ", " << file - << "\n"; + << loaded_data_size << ", " << file_index << ", " + << weight_filepath << "\n"; assert(false && "data size mismatch"); } // wq, wk, wo @@ -257,9 +264,12 @@ void load_attention_weights_v2(DT *ptr, assert(base_index == (q_size + k_size + v_size) / tensor_parallelism_degree); { - std::ifstream in(o_file, std::ios::in | std::ios::binary); + std::cout << "Loading weight file " << o_file << std::endl; + std::string weight_filepath = join_path({weights_folder, o_file}); + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); if (!in.good()) { - std::cout << "Could not open file: " << o_file << std::endl; + std::cout << "Could not open file: " << weight_filepath << std::endl; } assert(in.good() && "incorrect weight file path"); std::vector
host_array(one_weight_file_size); @@ -294,10 +304,10 @@ void load_attention_weights_v2(DT *ptr, } template -void load_from_file(DT *ptr, size_t size, std::string filename) { - std::ifstream in(filename, std::ios::in | std::ios::binary); +void load_from_file(DT *ptr, size_t size, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { - std::cout << "Could not open file: " << filename << std::endl; + std::cout << "Could not open file: " << filepath << std::endl; } assert(in.good() && "incorrect weight file path"); std::vector
host_array(size); @@ -357,24 +367,16 @@ void load_attention_weights_quantized(char *ptr, size_t hidden_dim, size_t qkv_inner_dim, std::string layer_name, - std::string weight_path, + std::string weights_folder, DataType data_type, bool use_full_precision) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight - std::string q_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wq_weight"; - std::string k_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wk_weight"; - std::string v_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wv_weight"; - std::string o_file = weight_path + - layer_name.substr(0, layer_name.find("attention")) + - "attention_wo_weight"; - std::vector weight_files = {q_file, k_file, v_file, o_file}; + std::string q_file = layer_name + "_wq_weight"; + std::string k_file = layer_name + "_wk_weight"; + std::string v_file = layer_name + "_wv_weight"; + std::string o_file = layer_name + "_wo_weight"; + std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -385,11 +387,14 @@ void load_attention_weights_quantized(char *ptr, num_heads * single_proj_size; // size of each of Q/K/V/O for all heads // q, k, v, o -> 0, 1, 2, 3 - for (auto file : weight_files) { + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + size_t partial_size = one_weight_file_size; - std::ifstream in(file, std::ios::in | std::ios::binary); + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); if (!in.good()) { - std::cout << "Could not open file: " << file << std::endl; + std::cout << "Could not open file: " << weight_filepath << std::endl; } assert(in.good() && "incorrect weight file path"); std::vector host_array(partial_size); @@ -432,9 +437,13 @@ void load_attention_weights_quantized(char *ptr, // the layout is like |values * 32 heads|offset|scale| size_t offset = data_type == DT_INT8 ? one_weight_file_size * 4 : (one_weight_file_size * 4) / 2; - for (auto file : weight_files) { + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + for (int i = 0; i < 2; i++) { - std::string meta_file = i == 0 ? (file + "_offset") : (file + "_scale"); + std::string meta_file = + i == 0 ? (weight_filepath + "_offset") : (weight_filepath + "_scale"); size_t partial_size = one_weight_file_size / INT4_NUM_OF_ELEMENTS_PER_GROUP; std::ifstream in(meta_file, std::ios::in | std::ios::binary); @@ -605,30 +614,30 @@ void load_from_quantized_file(char *ptr, } void FileDataLoader::load_quantization_weight(FFModel *ff, - Tensor weight, + Layer *l, int weight_idx, - std::string const &layername, bool use_full_precision) { + Tensor weight = l->weights[weight_idx]; size_t volume = 1; std::vector dims_vec; for (int i = 0; i < weight->num_dims; i++) { dims_vec.push_back(weight->dims[i]); volume *= weight->dims[i]; } - char *data = (char *)malloc(sizeof(char) * volume); - std::string file_path = - (layername.back() == '/') ? 
layername : "/" + layername; + std::string weight_filename = removeGuidOperatorName(std::string(l->name)); - if (file_path.find("attention_w") != std::string::npos) { + if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { if (weight_idx == 0) { load_attention_weights_quantized(data, num_heads, hidden_dim, qkv_inner_dim, - file_path, - weight_file_path, + weight_filename, + weights_folder, weight->data_type, use_full_precision); } @@ -637,19 +646,20 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, // num_heads, // hidden_dim, // qkv_inner_dim, - // file_path, - // weight_file_path); + // weight_filename, + // weights_folder); // } } else { if (weight_idx > 0) { - int index = file_path.find("_weight"); - assert(index != std::string::npos); - file_path = file_path.substr(0, index) + "_bias"; + assert(weight_idx == 0 || weight_idx == 1); + if (weight_filename != "embed_tokens_weight_lm_head") { + weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + } } load_from_quantized_file(data, volume, - weight_file_path + file_path, + join_path({weights_folder, weight_filename}), weight->data_type, use_full_precision); } @@ -663,86 +673,108 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, template void FileDataLoader::load_single_weight_tensor(FFModel *ff, - Tensor weight, - int weight_idx, - std::string const &layername) { + Layer *l, + int weight_idx) { + Tensor weight = l->weights[weight_idx]; + + // Create a buffer to store weight data from the file size_t volume = 1; std::vector dims_vec; for (int i = 0; i < weight->num_dims; i++) { dims_vec.push_back(weight->dims[i]); volume *= weight->dims[i]; } - - std::cout << "load weights: " << layername << "\n"; - assert(data_type_size(weight->data_type) == sizeof(DT)); DT *data = (DT *)malloc(sizeof(DT) * volume); - std::string file_path = - (layername.back() == '/') ? 
layername : "/" + layername; + std::string weight_filename = removeGuidOperatorName(std::string(l->name)); + + if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { + if (weight_filename.find("self_attention") != std::string::npos) { + load_attention_weights_multi_query( + data, weight_filename, weights_folder, hidden_dim, num_heads); + } else if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); + } else { + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); + } - if (file_path.find("attention_w") != std::string::npos) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - file_path, - weight_file_path, - volume, - tensor_parallelism_degree); } else { - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - file_path, - weight_file_path); + assert(false); } - - } else if (file_path.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, file_path, weight_file_path, hidden_dim, num_heads); + } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { + assert(weight_idx >= 0 || weight_idx <= 2); + weight_filename += (weight_idx == 0) + ? "_attn_bias" + : ((weight_idx == 1) ? "_weight" : "_bias"); + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } else { - if (weight_idx > 0) { - int index = file_path.find("_weight"); - assert(index != std::string::npos); - file_path = file_path.substr(0, index) + "_bias"; + // default op + assert(weight_idx == 0 || weight_idx == 1); + // handle exception + if (weight_filename != "embed_tokens_weight_lm_head") { + weight_filename += weight_idx == 0 ? "_weight" : "_bias"; } - load_from_file(data, volume, weight_file_path + file_path); + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } + // Copy the weight data from the buffer to the weight's ParallelTensor ParallelTensor weight_pt; ff->get_parallel_tensor_from_tensor(weight, weight_pt); weight_pt->set_tensor
(ff, dims_vec, data); + // Free buffer memory delete data; } -void FileDataLoader::load_weights( - FFModel *ff, - std::unordered_map weights_layers, - bool use_full_precision) { - for (auto &v : weights_layers) { - int weights_num = v.second->numWeights; - for (int i = 0; i < weights_num; i++) { - Tensor weight = v.second->weights[i]; +void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { + for (Layer *l : ff->layers) { + if (l->numWeights < 1 || !l->name || strlen(l->name) < 1) { + continue; + } + for (int i = 0; i < l->numWeights; i++) { + Tensor weight = l->weights[i]; if (weight == NULL) { continue; } switch (weight->data_type) { case DT_HALF: - load_single_weight_tensor(ff, weight, i, v.first); + load_single_weight_tensor(ff, l, i); break; case DT_FLOAT: - load_single_weight_tensor(ff, weight, i, v.first); + load_single_weight_tensor(ff, l, i); break; case DT_INT4: case DT_INT8: // load weights in quantization - load_quantization_weight(ff, weight, i, v.first, use_full_precision); + load_quantization_weight(ff, l, i, use_full_precision); break; default: assert(false && "Unsupported data type"); diff --git a/inference/file_loader.h b/inference/file_loader.h index aaef861d09..6f01a79b80 100644 --- a/inference/file_loader.h +++ b/inference/file_loader.h @@ -24,8 +24,8 @@ using namespace FlexFlow; class FileDataLoader { public: - FileDataLoader(std::string _input_path, - std::string _weight_file_path, + FileDataLoader(std::string _prompts_filepath, + std::string _weights_folder, int _num_heads, int _num_kv_heads, size_t _hidden_dim, @@ -35,19 +35,13 @@ class FileDataLoader { BatchConfig::TokenId *generate_requests(int num, int length); template - void load_single_weight_tensor(FFModel *ff, - Tensor weight, - int weight_idx, - std::string const &layername); + void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); void load_quantization_weight(FFModel *ff, - Tensor weight, + Layer *l, int weight_idx, - std::string const &layername, bool use_full_precision); - void load_weights(FFModel *ff, - std::unordered_map weights_layers, - bool use_full_precision); + void load_weights(FFModel *ff, bool use_full_precision); void load_positions(FFModel *ff, Tensor pt, @@ -58,6 +52,6 @@ class FileDataLoader { private: int num_heads, num_kv_heads, tensor_parallelism_degree; size_t hidden_dim, qkv_inner_dim; - std::string input_path; - std::string weight_file_path; + std::string prompts_filepath; + std::string weights_folder; }; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index a26a6eaf4b..72cbd8d551 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -92,13 +92,14 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, - 0.0f, - false, - false, - false, - DT_NONE, - NULL, - true); + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + NULL, /*kernel_initializer*/ + true /*apply_rotary_embedding*/ + ); break; } @@ -111,8 +112,8 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ - false, /*bias*/ - false, /*add_bias_kv*/ + false, /*qkv_bias*/ + false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ @@ -130,8 +131,8 @@ void FALCON::create_falcon_model(FFModel &ff, 
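For reference, the suffix convention that load_single_weight_tensor and load_quantization_weight now apply to the sanitized layer name can be summarized as a small pure function. This is a paraphrase for illustration, not the FlexFlow implementation (the function name and boolean flag are made up); attention layers, which read the separate per-projection files shown earlier, are handled before this fallback.

#include <cassert>
#include <string>

// Sketch of the filename suffix chosen per weight index (paraphrased from the
// loader logic above; illustrative only).
std::string weight_filename_for(std::string base,
                                bool is_add_bias_residual_layernorm,
                                int weight_idx) {
  if (is_add_bias_residual_layernorm) {
    // AddBiasResidualLayerNorm owns up to three weights.
    assert(weight_idx >= 0 && weight_idx <= 2);
    return base + (weight_idx == 0
                       ? "_attn_bias"
                       : (weight_idx == 1 ? "_weight" : "_bias"));
  }
  // Default operators: index 0 is the kernel, index 1 the bias; the shared
  // embedding / LM-head layer keeps its name verbatim.
  assert(weight_idx == 0 || weight_idx == 1);
  if (base == "embed_tokens_weight_lm_head") {
    return base;
  }
  return base + (weight_idx == 0 ? "_weight" : "_bias");
}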
falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ - false, /*bias*/ - false, /*add_bias_kv*/ + false, /*qkv_bias*/ + false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ @@ -200,7 +201,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.hidden_size / falcon_config.n_head, ff.config.tensor_parallelism_degree); std::cout << "------laod weights ----------" << std::endl; - fileloader.load_weights(&ff, weights_layers, use_full_precision); + fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e2eabec341..463c96527b 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -91,13 +91,14 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, - 0.0f, - false, - false, - false, - DT_NONE, - NULL, - true); + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + NULL, /*kernel_initializer*/ + true /*apply_rotary_embedding*/ + ); break; } case TREE_VERIFY_MODE: { @@ -108,8 +109,8 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ - false, /*bias*/ - false, /*add_bias_kv*/ + false, /*qkv_bias*/ + false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ @@ -125,8 +126,8 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ - false, /*bias*/ - false, /*add_bias_kv*/ + false, /*qkv_bias*/ + false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ @@ -212,7 +213,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, weights_layers, use_full_precision); + fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index d1ca03a335..1ef15654b3 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -208,7 +208,7 @@ void MPT::create_mpt_model(FFModel &ff, mpt_config.hidden_size, mpt_config.hidden_size / mpt_config.n_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, weights_layers, use_full_precision); + fileloader.load_weights(&ff, use_full_precision); im->init_operators_inference(&ff); } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9b3670ed89..5afef5e3a6 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -58,7 +58,8 @@ void OPT::create_opt_model(FFModel &ff, AGGR_MODE_NONE, DT_FLOAT, NULL, - embed_init); + embed_init, + "embed_tokens"); } else { token = ff.embedding(input, opt_config.vocab_size, @@ -66,12 +67,10 @@ void OPT::create_opt_model(FFModel &ff, AGGR_MODE_NONE, DT_HALF, NULL, - embed_init); + embed_init, + "embed_tokens"); } - Layer *embedding = ff.layers.back(); - 
weights_layers.emplace("embed_tokens_weight", embedding); - Tensor positional_embedding; if (use_full_precision) { positional_embedding = ff.embedding(position_input, @@ -80,7 +79,8 @@ void OPT::create_opt_model(FFModel &ff, AGGR_MODE_NONE, DT_FLOAT, NULL, - embed_init); + embed_init, + "embed_positions"); } else { positional_embedding = ff.embedding(position_input, opt_config.max_position_embeddings, @@ -88,10 +88,9 @@ void OPT::create_opt_model(FFModel &ff, AGGR_MODE_NONE, DT_HALF, NULL, - embed_init); + embed_init, + "embed_positions"); } - Layer *pos_embedding = ff.layers.back(); - weights_layers.emplace("embed_positions_weight", pos_embedding); Tensor residual = ff.add(token, positional_embedding); @@ -104,14 +103,19 @@ void OPT::create_opt_model(FFModel &ff, // https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#LL324C1-L325C1 // this version is before normalization - Tensor hidden_states = ff.layer_norm( - residual, axes, opt_config.layer_norm_elementwise_affine, 1e-05); - Layer *self_attn_layer_norm = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + - "_attention_layer_norm_weight", - self_attn_layer_norm); + std::string layer_name = + "layers_" + std::to_string(i) + "_attention_layer_norm"; + Tensor hidden_states = + ff.layer_norm(residual, + axes, + opt_config.layer_norm_elementwise_affine, + 1e-05, + true, + DT_NONE, + layer_name.c_str()); Tensor mha; + layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( @@ -120,18 +124,20 @@ void OPT::create_opt_model(FFModel &ff, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, - 0.0f, - true, - false, - false, + 0.0f, /*dropout*/ + true, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ DT_NONE, /*data_type*/ - NULL, - false, - /*scaling query*/ true, - /*scaling factor*/ + NULL, /*kernel_initializer*/ + false, /*apply_rotary_embedding*/ + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), - -0.5), - /*qk_prod_scaling*/ false); + -0.5), /*scaling factor*/ + false, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ + ); break; } case TREE_VERIFY_MODE: { @@ -141,18 +147,20 @@ void OPT::create_opt_model(FFModel &ff, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, - 0.0f, - true, - false, - false, + 0.0f, /*dropout*/ + true, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ DT_NONE, /*data_type*/ - NULL, - false, - /*scaling query*/ true, - /*scaling factor*/ + NULL, /*kernel_initializer*/ + false, /*apply_rotary_embedding*/ + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), - -0.5), - /*qk_prod_scaling*/ false); + -0.5), /*scaling factor*/ + false, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ + ); break; } case INC_DECODING_MODE: { @@ -162,18 +170,20 @@ void OPT::create_opt_model(FFModel &ff, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, - 0.0f, - true, - false, - false, + 0.0f, /*dropout*/ + true, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ DT_NONE, /*data_type*/ - NULL, - false, - /*scaling query*/ true, - /*scaling 
factor*/ + NULL, /*kernel_initializer*/ + false, /*apply_rotary_embedding*/ + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), - -0.5), - /*qk_prod_scaling*/ false); + -0.5), /*scaling factor*/ + false, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ + ); break; } default: { @@ -181,44 +191,75 @@ void OPT::create_opt_model(FFModel &ff, } } - Layer *attention_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", - attention_layer); - - Tensor added = ff.add(mha, residual); + // Tensor added = ff.add(mha, residual); + // Tensor final_norm = ff.layer_norm( + // added, axes, opt_config.layer_norm_elementwise_affine, 1e-05); - Tensor final_norm = ff.layer_norm( - added, axes, opt_config.layer_norm_elementwise_affine, 1e-05); - Layer *final_layer_norm = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + - "_final_layer_norm_weight", - final_layer_norm); + layer_name = + "layers_" + std::to_string(i) + "_add_bias_residual_layer_norm"; + Tensor added_final_norm[2]; + ff.add_bias_residual_layer_norm(mha, + residual, + added_final_norm, + axes, + opt_config.layer_norm_elementwise_affine, + 1e-05, + true, + DT_NONE, + layer_name.c_str()); + Tensor added = added_final_norm[0]; + Tensor final_norm = added_final_norm[1]; //--------linear fc1 fc2 ---------- - Tensor fc1 = ff.dense(final_norm, opt_config.ffn_dim, AC_MODE_NONE, true); - Layer *fc1_linear = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_fc1_weight", - fc1_linear); + layer_name = "layers_" + std::to_string(i) + "_fc1"; + Tensor fc1 = ff.dense(final_norm, + opt_config.ffn_dim, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); Tensor activation = ff.relu(fc1, false); - - Tensor fc2 = - ff.dense(activation, opt_config.hidden_size, AC_MODE_NONE, true); - Layer *fc2_linear = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_fc2_weight", - fc2_linear); + layer_name = "layers_" + std::to_string(i) + "_fc2"; + Tensor fc2 = ff.dense(activation, + opt_config.hidden_size, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); residual = ff.add(added, fc2); } // final - Tensor all_final_norm = ff.layer_norm( - residual, axes, opt_config.layer_norm_elementwise_affine, 1e-05); - Layer *all_final_norm_layer = ff.layers.back(); - weights_layers.emplace("final_layer_norm_weight", all_final_norm_layer); + Tensor all_final_norm = + ff.layer_norm(residual, + axes, + opt_config.layer_norm_elementwise_affine, + 1e-05, + true, + DT_NONE, + "final_layer_norm"); - Tensor lm_head = - ff.dense(all_final_norm, opt_config.vocab_size, AC_MODE_NONE, false); - Layer *lm_head_layer = ff.layers.back(); - weights_layers.emplace("embed_tokens_weight_lm_head", lm_head_layer); + Tensor lm_head = ff.dense(all_final_norm, + opt_config.vocab_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + "embed_tokens_weight_lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -242,7 +283,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.hidden_size / opt_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, weights_layers, use_full_precision); + fileloader.load_weights(&ff, use_full_precision); std::cout << "------finished loading weights----------" << std::endl; 
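The OPT C++ model above now emits the fused operator in place of the former ff.add followed by ff.layer_norm. Below is a minimal sketch of the call shape, assuming an FFModel &ff and the two producer tensors are in scope; the layer name, eps, and axes value are illustrative, mirroring the usage above, and the wrapper function itself is not part of the patch.

#include "flexflow/model.h"

#include <vector>

using namespace FlexFlow;

// Illustrative helper: shows how a decoder block obtains both outputs of the
// fused AddBiasResidualLayerNorm operator.
void fused_post_attention(FFModel &ff, Tensor mha, Tensor residual,
                          Tensor &added, Tensor &normed) {
  std::vector<int> axes = {0};  // normalize over the hidden (first) dimension
  Tensor outs[2];
  ff.add_bias_residual_layer_norm(mha,       // attention output
                                  residual,  // residual stream
                                  outs,
                                  axes,
                                  /*elementwise_affine=*/true,
                                  /*eps=*/1e-5f,
                                  /*use_bias=*/true,
                                  DT_NONE,
                                  "layers_0_add_bias_residual_layer_norm");
  added = outs[0];   // mha + final attention bias + residual (later added to fc2's output)
  normed = outs[1];  // layer_norm(added), fed to fc1
}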
im->init_operators_inference(&ff); } diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index d32f5e9430..982d58654b 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -206,7 +206,7 @@ void STARCODER::create_starcoder_model( startcoder_config.hidden_size / startcoder_config.num_attention_heads, ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, weights_layers, use_full_precision); + fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 5d9480280e..08b87856de 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -278,6 +278,22 @@ def get_weight_tensor(self): def get_bias_tensor(self): return self.get_parameter_by_id(1) +# ----------------------------------------------------------------------- +# AddBiasResidualLayerNorm +# ----------------------------------------------------------------------- +class AddBiasResidualLayerNorm(Op): + def __init__(self, handle, idx=None, name=None): + super(AddBiasResidualLayerNorm, self).__init__(handle, idx, name) + + def get_attn_bias_tensor(self): + return self.get_parameter_by_id(0) + + def get_weight_tensor(self): + return self.get_parameter_by_id(1) + + def get_bias_tensor(self): + return self.get_parameter_by_id(2) + # ----------------------------------------------------------------------- # Dropout # ----------------------------------------------------------------------- @@ -554,6 +570,8 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): return BatchNorm(handle, idx, name) elif op_type == OpType.LAYER_NORM: return LayerNorm(handle, idx, name) + elif op_type == OpType.ADD_BIAS_RESIDUAL_LAYERNORM: + return AddBiasResidualLayerNorm(handle, idx, name) elif op_type == OpType.BATCH_MATMUL: return Batch_Matmul(handle, idx, name) elif op_type == OpType.SPLIT: @@ -1573,6 +1591,13 @@ def layer_norm(self, input, axes, elementwise_affine=True, eps=1e-5, use_bias = handle = ffc().flexflow_model_add_layer_norm(self.handle, input.handle, len(axes), c_axes, elementwise_affine, eps, use_bias, c_name) self.add_layer(OpType.LAYER_NORM, name) return Tensor(handle, owner_op_type=OpType.LAYER_NORM) + + def add_bias_residual_layer_norm(self, input, residual, axes, elementwise_affine=True, eps=1e-5, use_bias = True, name=None): + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm(self.handle, input.handle, residual.handle, len(axes), c_axes, elementwise_affine, eps, use_bias, c_name) + self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) + return Tensor(handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) def batch_matmul(self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None): """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. 
@@ -3292,21 +3317,8 @@ def __init__(self, weight_file_path, num_q_heads, num_kv_heads, hidden_dim, qkv_ self.handle = ffc().flexflow_file_data_loader_create(c_weight_file_path, num_q_heads, num_kv_heads, hidden_dim, qkv_inner_dim, tensor_parallelism_degree) self._handle = ffi.gc(self.handle, ffc().flexflow_file_data_loader_destroy) - def load_weights(self, model, model_layers_with_weights, data_type): - # Extract keys and values into arrays - layer_names = list(model_layers_with_weights.keys()) - layers = list(model_layers_with_weights.values()) - - # Convert to char** and flexflow_op_t* for CFFI - layer_names_c = [ffi.new("char[]", x.encode('ascii')) for x in layer_names] - layer_handles_list = [layer.handle for layer in layers] - layer_handles_c = ffi.new("flexflow_op_t[]", layer_handles_list) - - # Compute number of layers (key-value pairs) - num_layers = len(layer_names) - assert(len(layer_names) == len(layers)) - + def load_weights(self, model, data_type): # Check data type and create use_full_precision boolean assert(data_type == DataType.DT_FLOAT or data_type == DataType.DT_HALF) use_full_precision = data_type == DataType.DT_FLOAT - ffc().flexflow_file_data_loader_load_weights(self.handle, model.handle, num_layers, layer_names_c, layer_handles_c, use_full_precision) + ffc().flexflow_file_data_loader_load_weights(self.handle, model.handle, use_full_precision) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 4fcaca6c33..6e161f8bf7 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -98,7 +98,7 @@ def build_model(self): self.data_type, None, embed_init, - name="word_embeddings_weight", + name="word_embeddings", ) axes = [ 0, @@ -112,7 +112,7 @@ def build_model(self): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm_weight", + name=f"layers_{i}_input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -124,13 +124,13 @@ def build_model(self): self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -141,13 +141,13 @@ def build_model(self): self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -158,13 +158,13 @@ def build_model(self): self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) else: assert False @@ -174,7 
+174,7 @@ def build_model(self): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h_weight", + name=f"layers_{i}_mlp_dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -182,21 +182,21 @@ def build_model(self): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h_weight", + name=f"layers_{i}_mlp_dense_4h_to_h", ) token = ffmodel.add(token, mha) token = ffmodel.add(token, mlp_output) ln_f = ffmodel.layer_norm( - token, axes, True, self.falcon_config.layer_norm_epsilon, name="ln_f_weight" + token, axes, True, self.falcon_config.layer_norm_epsilon, name="ln_f" ) lm_head = ffmodel.dense( ln_f, self.falcon_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="lm_head_weight", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 9eacccfda6..d1171cc3d3 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -93,7 +93,7 @@ def build_model(self): self.data_type, None, embed_init, - name="tok_embeddings_weight", + name="tok_embeddings", ) for i in range(self.llama_config.num_hidden_layers): @@ -103,7 +103,7 @@ def build_model(self): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm_weight", + name=f"layers_{i}_attention_norm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -117,13 +117,13 @@ def build_model(self): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -136,13 +136,13 @@ def build_model(self): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -155,13 +155,13 @@ def build_model(self): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) else: assert False @@ -171,21 +171,21 @@ def build_model(self): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm_weight", + name=f"layers_{i}_ffn_norm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1_weight", + name=f"layers_{i}_feed_forward_w1", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3_weight", + name=f"layers_{i}_feed_forward_w3", ) sigmoid = ffmodel.sigmoid(w1) silu = ffmodel.multiply(w1, 
sigmoid) @@ -195,7 +195,7 @@ def build_model(self): self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2_weight", + name=f"layers_{i}_feed_forward_w2", ) token = ffmodel.add(token, w2) @@ -203,14 +203,14 @@ def build_model(self): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name="norm_weight", + name="norm", ) dense = ffmodel.dense( token, self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output_weight", + name="output", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index a5a0c7da18..10353c5a96 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -94,7 +94,7 @@ def build_model(self): self.data_type, None, embed_init, - name="transformer_wte_weight", + name="transformer_wte", ) axes = [ @@ -110,7 +110,7 @@ def build_model(self): True, 1e-05, False, - name=f"layers_{i}_norm_1_weight", + name=f"layers_{i}_norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -121,8 +121,8 @@ def build_model(self): self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer @@ -132,7 +132,7 @@ def build_model(self): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -142,8 +142,8 @@ def build_model(self): self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer @@ -153,7 +153,7 @@ def build_model(self): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = ffmodel.inc_multihead_self_attention( @@ -163,8 +163,8 @@ def build_model(self): self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # bias - False, # add_bias_kv + False, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer @@ -174,7 +174,7 @@ def build_model(self): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) else: assert False @@ -187,7 +187,7 @@ def build_model(self): True, 1e-05, False, - name=f"layers_{i}_norm_2_weight", + name=f"layers_{i}_norm_2", ) residual = hidden_states # mlp @@ -197,7 +197,7 @@ def build_model(self): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj_weight", + name=f"layers_{i}_ffn_up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -205,7 +205,7 @@ def build_model(self): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj_weight", + name=f"layers_{i}_ffn_down_proj", ) hidden_states = 
ffmodel.add(intermediate_output, residual) @@ -215,14 +215,14 @@ def build_model(self): True, 1e-05, False, - name=f"transformer_norm_f_weight", + name=f"transformer_norm_f", ) lm_head = ffmodel.dense( all_final_norm, self.mpt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="lm_head_weight", + name="lm_head", ) if self.generation_config.do_sample: diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index d18c0d4cc9..d90dabad1d 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -100,7 +100,7 @@ def build_model(self): self.data_type, None, embed_init, - name="embed_tokens_weight", + name="embed_tokens", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -110,7 +110,7 @@ def build_model(self): self.data_type, None, embed_init, - name="embed_positions_weight", + name="embed_positions", ) residual = ffmodel.add(token, positional_embedding) @@ -128,7 +128,7 @@ def build_model(self): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm_weight", + name=f"layers_{i}_attention_layer_norm", ) else: hidden_states = residual @@ -141,8 +141,8 @@ def build_model(self): self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # bias - False, # add_bias_kv + True, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer @@ -151,7 +151,7 @@ def build_model(self): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -161,8 +161,8 @@ def build_model(self): self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # bias - False, # add_bias_kv + True, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer @@ -171,7 +171,7 @@ def build_model(self): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -181,8 +181,8 @@ def build_model(self): self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # bias - False, # add_bias_kv + True, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer @@ -191,26 +191,27 @@ def build_model(self): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) else: assert False - residual = ffmodel.add(mha, residual) + # residual = ffmodel.add(mha, residual) # This is either a before or after attention LayerNorm. In both cases, we need to compute the LN here. 
- norm_name = ( - f"layers_{i}_final_layer_norm_weight" + """ norm_name = ( + f"layers_{i}_final_layer_norm" if self.opt_config.do_layer_norm_before - else f"layers_{i}_attention_layer_norm_weight" - ) - ff_norm = ffmodel.layer_norm( - residual, - axes, - self.opt_config.layer_norm_elementwise_affine, - 1e-05, - name=norm_name, - ) + else f"layers_{i}_attention_layer_norm" + ) """ + # ff_norm = ffmodel.layer_norm( + # residual, + # axes, + # self.opt_config.layer_norm_elementwise_affine, + # 1e-05, + # name=norm_name, + # ) + residual, ff_norm = ffmodel.add_bias_residual_layer_norm(mha, residual, axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, name=f"layers_{i}_add_bias_residual_layer_norm") if not self.opt_config.do_layer_norm_before: residual = ff_norm @@ -220,7 +221,7 @@ def build_model(self): self.opt_config.ffn_dim, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc1_weight", + name=f"layers_{i}_fc1", ) activation = ffmodel.relu(fc1, False) fc2 = ffmodel.dense( @@ -228,7 +229,7 @@ def build_model(self): self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2_weight", + name=f"layers_{i}_fc2", ) residual = ffmodel.add(residual, fc2) @@ -238,7 +239,7 @@ def build_model(self): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm_weight", + name=f"layers_{i}_final_layer_norm", ) all_final_norm = ffmodel.layer_norm( @@ -246,7 +247,7 @@ def build_model(self): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"final_layer_norm_weight", + name=f"final_layer_norm", ) lm_head = ffmodel.dense( all_final_norm, @@ -285,6 +286,8 @@ def convert_hf_model(model, dst_folder): .replace("k_proj", "wk") .replace("v_proj", "wv") .replace("out_proj", "wo") + .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") + .replace("_final_layer_norm", "_add_bias_residual_layer_norm") # important to use the leading "_" to avoid matching the last LayerNorm ) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 922d0e4746..05594f81e5 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -149,8 +149,8 @@ def build_model(self): self.starcoder_config.hidden_size // self.starcoder_config.num_attention_heads, 0.0, # dropout - True, # bias - False, # add_bias_kv + True, # qkv_bias + False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 58f7221082..5cbe16b064 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -272,10 +272,7 @@ def __load_hf_weights(self): self.ffconfig.tensor_parallelism_degree, ) - model_layers_with_weights = self.model.get_layers_with_weights() - self.fileloader.load_weights( - self.model.ffmodel, model_layers_with_weights, self.data_type - ) + self.fileloader.load_weights(self.model.ffmodel, self.data_type) def compile( self, diff --git a/python/flexflow/type.py b/python/flexflow/type.py index d7cc145fde..a4785dba51 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -148,6 +148,7 @@ class OpType(Enum): RMS_NORM = 2300 ARG_TOPK = 2301 BEAM_TOPK = 2302 + ADD_BIAS_RESIDUAL_LAYERNORM = 2303 def enum_to_int(enum, enum_item): diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 0c1fad17df..dee030abee 100644 --- 
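The two replacements added to convert_hf_model for OPT are easiest to see on concrete names. The sketch below applies them to example parameter names as they would look after the earlier '.'-to-'_' and projection renames (the example inputs are assumptions, not strings taken from an actual checkpoint); note how the leading underscore keeps the model-level final_layer_norm untouched.

#include <iostream>
#include <string>

// Simple replace-all helper (the real mapping uses Python str.replace).
static std::string replace_all(std::string s, std::string const &from,
                               std::string const &to) {
  for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos;
       pos += to.size()) {
    s.replace(pos, from.size(), to);
  }
  return s;
}

int main() {
  for (std::string name : {"layers_0_attention_wo_bias",
                           "layers_0_final_layer_norm_weight",
                           "final_layer_norm_weight"}) {
    std::string mapped = replace_all(name,
                                     "attention_wo_bias",
                                     "add_bias_residual_layer_norm_attn_bias");
    mapped = replace_all(mapped,
                         "_final_layer_norm",
                         "_add_bias_residual_layer_norm");
    std::cout << name << " -> " << mapped << "\n";
  }
  return 0;
}

The resulting names line up with what the reworked FileDataLoader expects for the fused layer: its attention bias is read from <layer>_add_bias_residual_layer_norm_attn_bias and its gamma from <layer>_add_bias_residual_layer_norm_weight, while the model-level final_layer_norm_weight is left for the ordinary LayerNorm.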
a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -638,6 +638,53 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( + flexflow_model_t handle_, + const flexflow_tensor_t input_, + const flexflow_tensor_t residual_, + int n, + int *axes, + bool elementwise_affine, + float eps, + bool use_bias, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + const Tensor input = FFCObjectWrapper::unwrap(input_); + const Tensor residual = FFCObjectWrapper::unwrap(residual_); + Tensor tensor_outputs[2]; + std::vector axes_vec; + for (int i = 0; i < n; i++) { + axes_vec.push_back(axes[i]); + } + handle->add_bias_residual_layer_norm(input, + residual, + tensor_outputs, + axes_vec, + elementwise_affine, + eps, + use_bias, + input->data_type, + name); + assert(tensor_outputs[0] != nullptr); + assert(tensor_outputs[1] != nullptr); + DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, residual %p, output0: %p, " + "output1: %p, elementwise_affine %d, eps " + "%f, name %s", + tensor, + input, + residual, + tensor_outputs[0], + tensor_outputs[1], + elementwise_affine, + eps, + name); + flexflow_tensor_t *tensor_outputs_wrapped = + (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); + tensor_outputs_wrapped[0] = FFCObjectWrapper::wrap(tensor_outputs[0]); + tensor_outputs_wrapped[1] = FFCObjectWrapper::wrap(tensor_outputs[1]); + return tensor_outputs_wrapped; +} + flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_, const flexflow_tensor_t a_, const flexflow_tensor_t b_, @@ -2500,17 +2547,8 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_) { void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_, - int num_layers, - char const **layer_names, - flexflow_op_t *layers, bool use_full_precision) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - std::unordered_map weights_layers; - for (int i = 0; i < num_layers; i++) { - std::string const layer_name(layer_names[i]); - Layer *layer_ptr = FFCObjectWrapper::unwrap(layers[i]); - weights_layers.emplace(layer_name, layer_ptr); - } - handle->load_weights(model, weights_layers, use_full_precision); + handle->load_weights(model, use_full_precision); } diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc new file mode 100644 index 0000000000..ea770f2ac7 --- /dev/null +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -0,0 +1,829 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
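Because the fused operator has two outputs, the C wrapper above returns them through a heap-allocated two-element array, which is what the Python binding indexes into a tuple. A hedged usage sketch follows, assuming the C declarations come from flexflow_c.h and that the caller is responsible for freeing the calloc'd array; the handle values and function name here are placeholders, not part of the patch.

#include "flexflow/flexflow_c.h"  // assumed location of the C API declarations

#include <cstdlib>

void add_fused_layernorm_example(flexflow_model_t model,
                                 flexflow_tensor_t mha,
                                 flexflow_tensor_t residual) {
  int axes[1] = {0};  // normalize over the first (hidden) dimension
  flexflow_tensor_t *outs = flexflow_model_add_add_bias_residual_layer_norm(
      model, mha, residual, /*n=*/1, axes,
      /*elementwise_affine=*/true, /*eps=*/1e-5f, /*use_bias=*/true,
      "layers_0_add_bias_residual_layer_norm");
  flexflow_tensor_t added = outs[0];   // residual sum with the attention bias folded in
  flexflow_tensor_t normed = outs[1];  // layer-normed output
  (void)added;
  (void)normed;
  free(outs);  // the wrapper calloc()s the array it returns
}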
+ */ + +#include "flexflow/ops/add_bias_residual_layer_norm.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +bool operator==(AddBiasResidualLayerNormParams const &lhs, + AddBiasResidualLayerNormParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && + lhs.elementwise_affine == rhs.elementwise_affine && + lhs.use_bias == rhs.use_bias; +} + +bool AddBiasResidualLayerNormParams::is_valid( + std::pair const &input) const { + return input.first.is_valid() && input.second.is_valid(); +} + +AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { + AddBiasResidualLayerNormParams params; + params.layer_guid = this->layer_guid; + params.axes = this->axes; + params.elementwise_affine = this->elementwise_affine; + params.eps = this->eps; + params.use_bias = this->use_bias; + return params; +} + +void FFModel::add_bias_residual_layer_norm(const Tensor input, + const Tensor residual, + Tensor *outputs, + std::vector const &axes, + bool elementwise_affine, + float eps, + bool use_bias, + DataType data_type, + char const *name) { + // In PyTorch, axes must be the sizes of the last axes.size() dimensions of + // the input tensor. However, since the tensor dimensions are reversed in + // FlexFlow (batch size is the last dimension), we require that axes must be + // the sizes of the FIRST axes.size() dimensions of the input tensor. + + // Another difference is that in PyTorch, the axes vector should contain the + // sizes of the dimensions with respect to which you want to compute the + // layernorm. In FlexFlow, instead, axes should contain the INDICES of the + // dimensions in question. We do this because the size of a dimension might be + // different when splitting a tensor in model parallelism. + assert( + axes.size() <= input->num_dims && + "number of axes must be less than tensor dimensions"); // input does not + // have replica + // dimension here + for (int i = 0; i < axes.size(); i++) { + assert(axes[i] == i && "axes must be the first axes.size() dimensions"); + } + + // Check dims + assert(input->num_dims == residual->num_dims); + for (int i = 0; i < input->num_dims; i++) { + assert(input->dims[i] == residual->dims[i]); + } + + if (data_type == DT_NONE) { + data_type = input->data_type; + } + int num_weights = + 1 + (elementwise_affine ? (use_bias ? 2 : 1) + : 0); // attention bias + layernorm weights + Layer *ln = nullptr; + Tensor casted_input = + (data_type != input->data_type) + ? cast(input, data_type, "type cast for add_bias_residual_layer_norm") + : input; + Tensor casted_residual = + (data_type != residual->data_type) + ? cast(residual, + data_type, + "type cast for add_bias_residual_layer_norm") + : residual; + ln = new Layer(this, + OP_ADD_BIAS_RESIDUAL_LAYERNORM, + data_type, + name, + 2 /*inputs*/, + num_weights, + 2 /*outputs*/, + casted_input, + residual); + // added: attn_output + final attention bias + residual. 
To be added to the + // output of FC2 + ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, + input->dims, + input->data_type, + ln, + 0, + false /*create_grad*/); + // layer_norm(added) + ln->outputs[1] = create_tensor_legion_ordering(input->num_dims, + input->dims, + input->data_type, + ln, + 0, + false /*create_grad*/); + { + int numdims = axes.size(); + int dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[axes[i]]; + } + // Attention bias + int attn_bias_dims[1] = {dims[0]}; + ln->weights[0] = create_weight_legion_ordering(1, + attn_bias_dims, + input->data_type, + ln, + false /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + if (num_weights > 1) { + assert(elementwise_affine); + ln->weights[1] = create_weight_legion_ordering(numdims, + dims, + input->data_type, + ln, + false /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + if (num_weights == 3) { + assert(use_bias); + ln->weights[2] = create_weight_legion_ordering(numdims, + dims, + input->data_type, + ln, + false /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + } + } + } + ln->add_int_property("elementwise_affine", elementwise_affine); + ln->add_int_property("use_bias", use_bias); + ln->add_int_vector_property("axes", axes); + ln->add_float_property("eps", eps); + layers.push_back(ln); + outputs[0] = ln->outputs[0]; + outputs[1] = ln->outputs[1]; +} + +Op *AddBiasResidualLayerNorm::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("elementwise_affine", value); + bool elementwise_affine = (bool)value; + layer->get_int_property("use_bias", value); + bool use_bias = (bool)value; + std::vector axes; + layer->get_int_vector_property("axes", axes); + float eps; + layer->get_float_property("eps", eps); + return new AddBiasResidualLayerNorm(model, + layer->layer_guid, + inputs[0], + inputs[1], + axes, + elementwise_affine, + use_bias, + eps, + false, // allocate_weights + layer->name); +} + +AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( + FFModel &model, + AddBiasResidualLayerNormParams const ¶ms, + std::pair const &inputs, + char const *name, + bool allocate_weights) + : AddBiasResidualLayerNorm(model, + params.layer_guid, + inputs.first, + inputs.second, + params.axes, + params.elementwise_affine, + params.use_bias, + params.eps, + allocate_weights, + name) {} + +AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( + FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + const ParallelTensor _residual, + std::vector const &_axes, + bool _elementwise_affine, + bool _use_bias, + float _eps, + bool allocate_weights, + char const *name) + : Op(model, + OP_ADD_BIAS_RESIDUAL_LAYERNORM, + _input->data_type, + name, + 2 /*inputs*/, + 1 + (_elementwise_affine ? (_use_bias ? 
2 : 1) : 0) /*weights*/, + 2 /*outputs*/, + _input, + _residual), + elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), + use_bias(_use_bias) { + // overwrite layer_guid + layer_guid = _layer_guid; + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, _input->dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[1] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, _input->dims, _input->data_type, this, 1 /*owner_idx*/); + assert(check_output_input_weight_parallel_dims(allocate_weights)); + + int M = 1; + for (int i = 0; i < axes.size(); i++) { + M *= inputs[0]->dims[axes[i]].size; + } + int num_replicas = 1; + for (int i = 0; i < inputs[0]->num_dims; i++) { + if (inputs[0]->dims[i].is_replica_dim) { + num_replicas *= inputs[0]->dims[i].size; + } + } + effective_num_elements = M; + effective_batch_size = (inputs[0]->get_volume() / num_replicas) / M; + if (!elementwise_affine) { + assert(numWeights == 1); // attn bias + } else { + if (!use_bias) { + assert(numWeights == 2); // attn bias + weight + } else { + assert(numWeights == 3); // attn bias + weight + bias + } + } + + if (allocate_weights) { + // always need to allocate attn bias + ParallelTensorShape attention_bias_shape = _input->get_shape(); + for (int i = 1; i < attention_bias_shape.num_dims - 1; i++) { + attention_bias_shape.dims[i].size = 1; + } + + int seed = std::rand(); + Initializer *attn_bias_initializer = + new UniformInitializer(seed, 1.0f, 1.0f); + + weights[0] = model.create_parallel_weight_legion_ordering( + attention_bias_shape.num_dims, + attention_bias_shape.dims, + _input->data_type, + NULL /*owner_op*/, + false /*create_grad*/, + attn_bias_initializer, + CHOSEN_SYNC_TYPE); + + if (numWeights > 1) { + assert(elementwise_affine); + + ParallelTensorShape beta_gamma_shape = _input->get_shape(); + for (int i = axes.size(); i < beta_gamma_shape.num_dims - 1; i++) { + beta_gamma_shape.dims[i].size = 1; + } + + // weight + Initializer *gamma_initializer = new UniformInitializer(seed, 1.0f, 1.0f); + weights[1] = model.create_parallel_weight_legion_ordering( + beta_gamma_shape.num_dims, // axes.size(), + beta_gamma_shape.dims, + _input->data_type, + NULL /*owner_op*/, + false /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + + // bias + if (numWeights == 3) { + assert(use_bias); + Initializer *beta_initializer = + new UniformInitializer(seed, 0.0f, 0.0f); + weights[2] = model.create_parallel_weight_legion_ordering( + beta_gamma_shape.num_dims, //.size(), + beta_gamma_shape.dims, + _input->data_type, + NULL /*owner_op*/, + false /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); + } + } + } +} + +void AddBiasResidualLayerNorm::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AddBiasResidualLayerNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + // attn output + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // residual + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // added: attn_output + attn final bias + residual + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + // layer norm output + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(3, FID_DATA); + // attn final bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(5, FID_DATA); + + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[2]->region)); + launcher.add_field(6, FID_DATA); + } + } + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void AddBiasResidualLayerNorm::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AddBiasResidualLayerNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // attn output + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // residual + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // added: attn_output + attn final bias + residual + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + // layer norm output + launcher.add_region_requirement(RegionRequirement(outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[1]->region)); + launcher.add_field(3, FID_DATA); + // attn final bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, 
FID_DATA); + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(5, FID_DATA); + + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[2]->region)); + launcher.add_field(6, FID_DATA); + } + } + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): attn output + regions[1](I): residual + regions[2](O): added output (attn output + final attn bias + residual) + regions[3](O): layer norm output + regions[4](I): final attn bias + regions[5](I): gamma + regions[6](I): beta +*/ +OpMeta *AddBiasResidualLayerNorm::init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + AddBiasResidualLayerNorm *ln = (AddBiasResidualLayerNorm *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + AddBiasResidualLayerNormMeta *meta = + new AddBiasResidualLayerNormMeta(handle, ln, gpu_mem_allocator); + meta->input_type[0] = ln->inputs[0]->data_type; + meta->input_type[1] = ln->inputs[1]->data_type; + meta->weight_type[0] = ln->weights[0]->data_type; + if (ln->elementwise_affine) { + meta->weight_type[1] = ln->weights[1]->data_type; + if (ln->use_bias) { + meta->weight_type[2] = ln->weights[2]->data_type; + } + } + meta->output_type[0] = ln->outputs[0]->data_type; + meta->output_type[1] = ln->outputs[1]->data_type; + return meta; +} + +void AddBiasResidualLayerNorm::forward(FFModel const &ff) { + assert(false); +} + +void AddBiasResidualLayerNorm::backward(FFModel const &ff) { + assert(false); +} + +FutureMap AddBiasResidualLayerNorm::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "AddBiasResidualLayerNorm op machine_view: " << *(MachineView + const *)mv + << std::endl; */ + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // attn output + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // residual + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // added: attn_output + attn final bias + residual + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + // layer norm output + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(3, FID_DATA); + // attn final bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(5, FID_DATA); + + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[2]->region)); + launcher.add_field(6, FID_DATA); + } + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): attn output + regions[1](I): residual + regions[2](O): added output (attn output + final attn bias + residual) + regions[3](O): layer norm output + regions[4](I): final attn bias + regions[5](I): gamma + regions[6](I): beta +*/ +void AddBiasResidualLayerNorm::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + + AddBiasResidualLayerNormMeta const *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + + assert(regions.size() == + 5 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma, beta; + + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain residual_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + Domain out_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + Domain attn_bias_domain = runtime->get_index_space_domain( + ctx, task->regions[4].region.get_index_space()); + Domain gamma_domain, beta_domain; + + assert(in_domain.get_volume() == out_domain.get_volume()); + assert(out_domain.get_volume() == added_out_domain.get_volume()); + assert(in_domain.get_volume() == residual_domain.get_volume()); + assert(in_domain == out_domain); + assert(added_out_domain == out_domain); + assert(residual_domain == in_domain); + + coord_t attn_bias_dim = + attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; + assert((in_domain.hi()[0] - in_domain.lo()[0] + 1) == attn_bias_dim); + assert((residual_domain.hi()[0] - residual_domain.lo()[0] + 1) == + attn_bias_dim); + assert((out_domain.hi()[0] - out_domain.lo()[0] + 1) == attn_bias_dim); + assert((added_out_domain.hi()[0] - added_out_domain.lo()[0] + 1) == + attn_bias_dim); + + assert(in_domain.get_volume() == + m->effective_num_elements * m->effective_batch_size); + + // std::cout << std::endl << "INFERENCE task tensor dims:" << std::endl; + // std::cout << "input: "; + // for (int i=0; ielementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); + gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[5].region.get_index_space()); + + if (m->use_bias) { + beta = helperGetGenericTensorAccessorRO(m->weight_type[2], + regions[6], + task->regions[6], + FID_DATA, + ctx, + runtime); + beta_domain = runtime->get_index_space_domain( + ctx, task->regions[6].region.get_index_space()); + assert(gamma_domain == beta_domain); + } + + assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim(); + size_t vol = 1; + int i = 0; + while (vol < gamma_domain.get_volume()) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + vol *= g_d; + i++; + } + } + + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + (int)attn_bias_dim, + (int)residual_domain.get_volume(), + input, + added_output, + output, + residual, + attn_bias, + gamma, + beta); +} + +bool AddBiasResidualLayerNorm::measure_operator_cost( + Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { + return false; +} + 
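// --------------------------------------------------------------------------
// For reference, a minimal host-side C++ sketch of the math that the fused
// AddBiasResidualLayerNorm kernels below implement:
//   added_out = attn_out + broadcast(attn_final_bias) + residual
//   norm_out  = LayerNorm(added_out) with optional gamma/beta
// This is illustrative only and is not part of the patch; the function name
// `add_bias_residual_layer_norm_ref` and the `batch`/`hidden` parameters are
// assumptions standing in for effective_batch_size / effective_num_elements.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

void add_bias_residual_layer_norm_ref(int batch,
                                      int hidden,
                                      float eps,
                                      std::vector<float> const &attn_out,  // [batch, hidden]
                                      std::vector<float> const &attn_bias, // [hidden]
                                      std::vector<float> const &residual,  // [batch, hidden]
                                      std::vector<float> const *gamma,     // [hidden] or nullptr
                                      std::vector<float> const *beta,      // [hidden] or nullptr
                                      std::vector<float> &added_out,       // [batch, hidden]
                                      std::vector<float> &norm_out) {      // [batch, hidden]
  assert((int)attn_out.size() == batch * hidden);
  assert((int)attn_bias.size() == hidden);
  added_out.resize(attn_out.size());
  norm_out.resize(attn_out.size());
  for (int b = 0; b < batch; b++) {
    // 1) fused add: attention output + broadcast final attention bias + residual
    float sum = 0.0f, sq_sum = 0.0f;
    for (int j = 0; j < hidden; j++) {
      int idx = b * hidden + j;
      added_out[idx] = attn_out[idx] + attn_bias[j] + residual[idx];
      sum += added_out[idx];
      sq_sum += added_out[idx] * added_out[idx];
    }
    // 2) per-row mean and reciprocal std-dev, matching mean_ptr / rstd_ptr
    float mean = sum / hidden;
    float var = std::max(sq_sum / hidden - mean * mean, 0.0f);
    float rstd = 1.0f / std::sqrt(var + eps);
    // 3) normalize with optional elementwise affine (gamma) and bias (beta)
    for (int j = 0; j < hidden; j++) {
      int idx = b * hidden + j;
      float g = gamma ? (*gamma)[j] : 1.0f;
      float be = beta ? (*beta)[j] : 0.0f;
      norm_out[idx] = (added_out[idx] - mean) * rstd * g + be;
    }
  }
}
// Note: the operator returns both tensors; added_out corresponds to
// batch_outputs[0] (reused as the residual input of the next block) and
// norm_out to batch_outputs[1], which is why inference_task binds two
// WRITE_ONLY regions.
// --------------------------------------------------------------------------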
+void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->axes.size()); + for (size_t i = 0; i < this->axes.size(); i++) { + sez.serialize(this->axes[i]); + } + sez.serialize(this->elementwise_affine); + sez.serialize(this->eps); + sez.serialize(this->use_bias); +} + +using PCG::Node; +/*static*/ +Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + size_t num_axes; + std::vector axes; + bool elementwise_affine; + bool use_bias; + float eps; + size_t id, transformer_layer_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(num_axes); + for (size_t i = 0; i < num_axes; i++) { + int axis_idx; + dez.deserialize(axis_idx); + axes.push_back(axis_idx); + } + dez.deserialize(elementwise_affine); + dez.deserialize(eps); + dez.deserialize(use_bias); + + AddBiasResidualLayerNormParams params; + params.layer_guid = layer_guid; + params.axes = axes; + params.elementwise_affine = elementwise_affine; + params.eps = eps; + params.use_bias = use_bias; + return ff.get_or_create_node({inputs[0], inputs[1]}, + params); +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::AddBiasResidualLayerNormParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.axes.size()); + for (int n : params.axes) { + hash_combine(key, n); + } + hash_combine(key, params.elementwise_affine); + hash_combine(key, params.use_bias); + return key; +} +}; // namespace std diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp new file mode 100644 index 0000000000..3570ae42dc --- /dev/null +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -0,0 +1,262 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/add_bias_residual_layer_norm.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +#define C10_WARP_SIZE 32 +constexpr int kCUDABlockReduceNumThreads = 512; +constexpr int kCUDANumThreads = 256; + +AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( + FFHandler handle, + AddBiasResidualLayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handle) { + elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; + effective_batch_size = ln->effective_batch_size; + effective_num_elements = ln->effective_num_elements; + profiling = ln->profiling; + eps = ln->eps; + DataType data_type = ln->data_type; + size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); +} + +AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, + unsigned int delta, + int width = warpSize, + unsigned int mask = 0xffffffff) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +template +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + ? 
shared[lid] + : 0; + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +template +__global__ void LayerNormFusedForwardKernel(int attn_bias_dim, + int residual_volume, + int64_t effective_num_elements, + int64_t effective_batch_size, + float eps, + T const *input_ptr, + T const *attn_bias_ptr, + T const *residual_ptr, + T *added_output_ptr, + T *output_ptr, + T const *gamma_ptr, + T const *beta_ptr, + T *mean, + T *rstd) { + // Add attention bias and residual + CUDA_KERNEL_LOOP(i, residual_volume) { + int bias_idx = i % attn_bias_dim; + added_output_ptr[i] = + input_ptr[i] + attn_bias_ptr[bias_idx] + residual_ptr[i]; + } + + __syncthreads(); + + // LayerNorm + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + if (i >= effective_batch_size) { + return; + } + float sum1 = 0.0f; + float sum2 = 0.0f; + for (int64_t j = threadIdx.x; j < effective_num_elements; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + const int64_t index = i * effective_num_elements + j; + sum1 += static_cast(added_output_ptr[index]); + sum2 += static_cast(added_output_ptr[index]) * + static_cast(added_output_ptr[index]); + } + if (threadIdx.x < kCUDABlockReduceNumThreads) { + sum1 = BlockReduceSum( + sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + sum2 = BlockReduceSum( + sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + } + if (threadIdx.x == 0) { + float const scale = float(1) / static_cast(effective_num_elements); + sum1 *= scale; + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); + } + + __syncthreads(); + + using T_ACC = T; + for (int64_t j = threadIdx.x; j < effective_num_elements; + j += min(blockDim.x, kCUDANumThreads)) { + const int64_t index = i * effective_num_elements + j; + const T_ACC gamma_v = + gamma_ptr == nullptr ? T_ACC(1) : static_cast(gamma_ptr[j]); + const T_ACC beta_v = + beta_ptr == nullptr ? 
T_ACC(0) : static_cast(beta_ptr[j]); + output_ptr[index] = (static_cast(added_output_ptr[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::inference_kernel( + AddBiasResidualLayerNormMeta const *m, + int attn_bias_dim, + int residual_volume, + T const *input_ptr, + T const *attn_bias_ptr, + T const *residual_ptr, + T *added_output_ptr, + T *output_ptr, + T const *gamma_ptr, + T const *beta_ptr, + hipStream_t stream) { + + std::pair kernel1_parallelism = std::make_pair( + GET_BLOCKS(residual_volume), std::min(residual_volume, CUDA_NUM_THREADS)); + std::pair kernel2_parallelism = + std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); + std::pair kernel3_parallelism = + std::make_pair(m->effective_batch_size, kCUDANumThreads); + + int num_blocks = std::max({kernel1_parallelism.first, + kernel2_parallelism.first, + kernel3_parallelism.first}); + int num_threads = std::max({kernel1_parallelism.second, + kernel2_parallelism.second, + kernel3_parallelism.second}); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormFusedForwardKernel), + num_blocks, + num_threads, + 0, + stream, + attn_bias_dim, + residual_volume, + m->effective_num_elements, + m->effective_batch_size, + m->eps, + input_ptr, + attn_bias_ptr, + residual_ptr, + added_output_ptr, + output_ptr, + gamma_ptr, + beta_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr)); +} + +/*static*/ +void AddBiasResidualLayerNorm::inference_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + int attn_bias_dim, + int residual_volume, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &added_output, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &residual, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::inference_kernel( + m, + attn_bias_dim, + residual_volume, + input.get_float_ptr(), + attn_bias.get_float_ptr(), + residual.get_float_ptr(), + added_output.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + m->use_bias ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::inference_kernel( + m, + attn_bias_dim, + residual_volume, + input.get_half_ptr(), + attn_bias.get_half_ptr(), + residual.get_half_ptr(), + added_output.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + m->use_bias ? beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } +} + +}; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu new file mode 100644 index 0000000000..9ac440080f --- /dev/null +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -0,0 +1,299 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/add_bias_residual_layer_norm.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +#define C10_WARP_SIZE 32 +constexpr int kCUDABlockReduceNumThreads = 512; +constexpr int kCUDANumThreads = 256; + +AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( + FFHandler handle, + AddBiasResidualLayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handle) { + elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; + effective_batch_size = ln->effective_batch_size; + effective_num_elements = ln->effective_num_elements; + profiling = ln->profiling; + eps = ln->eps; + DataType data_type = ln->data_type; + size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); +} + +AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, + unsigned int delta, + int width = warpSize, + unsigned int mask = 0xffffffff) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +template +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + ? 
shared[lid] + : 0; + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +template +__global__ void LayerNormFusedForwardKernel(int64_t N, + int64_t attn_bias_dim, + float eps, + T const *input_ptr, + T const *attn_bias_ptr, + T const *residual_ptr, + T *X, + T *mean, + T *rstd, + T const *gamma, + T const *beta, + T *Y) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float sum1 = 0.0f; + float sum2 = 0.0f; + for (int64_t j = threadIdx.x; j < N; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + const int64_t index = i * N + j; + const int64_t bias_idx = index % attn_bias_dim; + X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); + } + if (threadIdx.x < kCUDABlockReduceNumThreads) { + sum1 = BlockReduceSum( + sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + sum2 = BlockReduceSum( + sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + } + if (threadIdx.x == 0) { + float const scale = float(1) / static_cast(N); + sum1 *= scale; + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); + } + + __syncthreads(); + + using T_ACC = T; + for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + const T_ACC beta_v = + beta == nullptr ? T_ACC(0) : static_cast(beta[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::inference_kernel( + AddBiasResidualLayerNormMeta const *m, + int attn_bias_dim, + int residual_volume, + T const *input_ptr, + T const *attn_bias_ptr, + T const *residual_ptr, + T *added_output_ptr, + T *output_ptr, + T const *gamma_ptr, + T const *beta_ptr, + cudaStream_t stream) { + + std::pair kernel1_parallelism = + std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); + std::pair kernel2_parallelism = + std::make_pair(m->effective_batch_size, kCUDANumThreads); + + int num_blocks = + std::max(kernel1_parallelism.first, kernel2_parallelism.first); + int num_threads = + std::max(kernel1_parallelism.second, kernel2_parallelism.second); + + LayerNormFusedForwardKernel + <<>>(m->effective_num_elements, + attn_bias_dim, + m->eps, + input_ptr, + attn_bias_ptr, + residual_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} + +/*static*/ +void AddBiasResidualLayerNorm::inference_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + int attn_bias_dim, + int residual_volume, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &added_output, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &residual, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::inference_kernel( + m, + attn_bias_dim, + residual_volume, + input.get_float_ptr(), + attn_bias.get_float_ptr(), + 
residual.get_float_ptr(), + added_output.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + m->use_bias ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::inference_kernel( + m, + attn_bias_dim, + residual_volume, + input.get_half_ptr(), + attn_bias.get_half_ptr(), + residual.get_half_ptr(), + added_output.get_half_ptr(), + output.get_half_ptr(), + gamma.get_half_ptr(), + m->use_bias ? beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] forward time (CF) = %.9fms\n", elapsed); + // if (m->input_type[0] == DT_FLOAT) { + // print_tensor(input.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor(gamma.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_float_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } else { + // print_tensor( + // input.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor( + // gamma.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } + // print_tensor(in_ptr, 32, "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + } +} + +}; // namespace FlexFlow diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 9824e8469d..c2780545f3 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -16,9 +16,11 @@ #include "flexflow/ops/fused.h" #include "flexflow/accessor.h" #include "flexflow/model.h" +#include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/embedding.h" +#include "flexflow/ops/flat.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/ops/kernels/concat_kernels.h" @@ -34,7 +36,6 @@ #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/ops/layer_norm.h" -#include "flexflow/ops/linear.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include 
"flexflow/parallel_ops/kernels/allreduce_kernels.h" @@ -49,9 +50,8 @@ using Legion::Domain; using Legion::Future; using Legion::LogicalPartition; using Legion::LogicalRegion; +using Legion::Memory; using Legion::PhysicalRegion; -using Legion::PointInRectIterator; -using Legion::Rect; using Legion::Runtime; using Legion::Task; @@ -73,7 +73,7 @@ OpMeta *FusedOp::init_task(Task const *task, /* regions[...](I): inputs regions[...](I): weights - regions[...](I): outputs + regions[...](O): outputs */ __host__ void FusedOp::forward_task(Task const *task, std::vector const ®ions, @@ -233,7 +233,6 @@ __host__ void FusedOp::forward_task(Task const *task, if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { bias_ptr = my_weight_accessor[1].get_float_ptr(); } - bias_ptr = my_weight_accessor[1].get_float_ptr(); } else { assert(fused->op_num_weights[op] == 1); } @@ -304,6 +303,71 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0]); break; } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } case OP_GELU: case OP_RELU: case OP_SIGMOID: @@ -391,6 +455,29 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0].domain); 
break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -410,7 +497,7 @@ __host__ void FusedOp::forward_task(Task const *task, /* regions[...](I): inputs regions[...](I): weights - regions[...](I): outputs + regions[...](O): outputs */ __host__ void FusedOp::inference_task(Task const *task, @@ -731,9 +818,10 @@ __host__ void assert(fused->op_num_outputs[op] == 1); IncMultiHeadSelfAttentionMeta const *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { assert(fused->op_num_weights[op] == 2); biases = my_weight_accessor[1]; } @@ -754,9 +842,10 @@ __host__ void (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; TreeVerifyBatchConfig const &tree_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { assert(fused->op_num_weights[op] == 2); biases = my_weight_accessor[1]; } @@ -779,9 +868,10 @@ __host__ void // (BeamSearchBatchConfig *)task->args; BeamSearchBatchConfig const &beam_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { assert(fused->op_num_weights[op] == 2); biases = my_weight_accessor[1]; } @@ -813,6 +903,45 @@ __host__ void m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + Domain attn_bias_domain = my_weight_accessor[0].domain; + Domain residual_domain = my_input_accessor[1].domain; + int attn_bias_dim = + attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; + int residual_volume = residual_domain.get_volume(); + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + attn_bias_dim, + residual_volume, + 
my_input_accessor[0], + my_output_accessor[0], + my_output_accessor[1], + my_input_accessor[1], + my_weight_accessor[0], + gamma, + beta); + break; + } case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -865,7 +994,6 @@ __host__ void regions[...](I/O): weight_grad regions[...](I/O): output_grad */ - __host__ void FusedOp::backward_task(Task const *task, std::vector const ®ions, Context ctx, @@ -1018,6 +1146,65 @@ __host__ void FusedOp::backward_task(Task const *task, assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + // check dims + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::backward_kernel_wrapper( + meta, + (float const *)my_output_accessor[0].get_float_ptr(), + (float const *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_input_accessor[1].get_float_ptr(), + (float *)my_input_grad_accessor[1].get_float_ptr(), + (float *)nullptr, + m, + n, + k, + batch); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + assert(my_output_accessor[0].domain.get_dim() == 5); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::backward_kernel( + m, + (float const *)my_input_accessor[0].get_float_ptr(), + (float *)my_output_grad_accessor[0].get_float_ptr(), + (float const *)my_output_accessor[0].get_float_ptr(), + (float *)my_input_grad_accessor[0].get_float_ptr(), + (float const *)my_weight_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[0].get_float_ptr(), + (float *)my_weight_grad_accessor[1].get_float_ptr(), + my_output_accessor[0].domain.get_volume()); + break; + } case OP_CONCAT: { assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); @@ -1048,26 +1235,6 @@ __host__ void FusedOp::backward_task(Task const *task, my_weight_grad_accessor[1].get_float_ptr()); break; } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - 
assert(my_output_accessor[0].domain.get_dim() == 5); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::backward_kernel( - m, - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_output_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_weight_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[0].get_float_ptr(), - (float *)my_weight_grad_accessor[1].get_float_ptr(), - my_output_accessor[0].domain.get_volume()); - break; - } case OP_DROPOUT: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -1078,6 +1245,61 @@ __host__ void FusedOp::backward_task(Task const *task, my_input_grad_accessor[0].get_float_ptr()); break; } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::backward_kernel_wrapper( + m, + my_output_grad_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + my_input_grad_accessor[0].get_float_ptr(), + my_input_grad_accessor[1].get_float_ptr()); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + assert(my_input_accessor[0].data_type == DT_INT64); + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_grad_accessor[0].domain.hi()[0] - + my_output_grad_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_grad_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + Kernels::Embedding::backward_kernel_wrapper(m, + my_input_accessor[0], + my_output_grad_accessor[0], + my_weight_grad_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -1110,66 +1332,6 @@ __host__ void FusedOp::backward_task(Task const *task, batch_size); break; } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; - // check dims - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == 
b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; - } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - Kernels::BatchMatmul::backward_kernel_wrapper( - meta, - (float const *)my_output_accessor[0].get_float_ptr(), - (float const *)my_output_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[0].get_float_ptr(), - (float *)my_input_grad_accessor[0].get_float_ptr(), - (float const *)my_input_accessor[1].get_float_ptr(), - (float *)my_input_grad_accessor[1].get_float_ptr(), - (float *)nullptr, - m, - n, - k, - batch); - break; - } - case OP_EW_ADD: - case OP_EW_SUB: - case OP_EW_MUL: - case OP_EW_DIV: - case OP_EW_MAX: - case OP_EW_MIN: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[1].get_float_ptr()); - break; - } case OP_GELU: case OP_RELU: case OP_SIGMOID: diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 5cecbd168e..011dd9be75 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -15,6 +15,7 @@ #include "flexflow/accessor.h" #include "flexflow/model.h" +#include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" #include "flexflow/ops/element_unary.h" #include "flexflow/ops/embedding.h" @@ -467,6 +468,29 @@ __host__ void FusedOp::forward_task(Task const *task, my_output_accessor[0].domain); break; } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -823,9 +847,10 @@ __host__ void assert(fused->op_num_outputs[op] == 1); IncMultiHeadSelfAttentionMeta const *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { assert(fused->op_num_weights[op] == 2); biases = my_weight_accessor[1]; } @@ -848,9 +873,10 @@ __host__ void // (TreeVerifyBatchConfig *)task->args; TreeVerifyBatchConfig const 
&tree_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { assert(fused->op_num_weights[op] == 2); biases = my_weight_accessor[1]; } @@ -873,9 +899,10 @@ __host__ void // (BeamSearchBatchConfig *)task->args; BeamSearchBatchConfig const &beam_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == (1 + (int)(*m->bias))); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { assert(fused->op_num_weights[op] == 2); biases = my_weight_accessor[1]; } @@ -907,6 +934,45 @@ __host__ void m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + Domain attn_bias_domain = my_weight_accessor[0].domain; + Domain residual_domain = my_input_accessor[1].domain; + int attn_bias_dim = + attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; + int residual_volume = residual_domain.get_volume(); + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + attn_bias_dim, + residual_volume, + my_input_accessor[0], + my_output_accessor[0], + my_output_accessor[1], + my_input_accessor[1], + my_weight_accessor[0], + gamma, + beta); + break; + } case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 2c7518bae9..7cb9867312 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -64,8 +64,8 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, + bool qkv_bias, + bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, @@ -82,8 +82,8 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, kdim, vdim, dropout, - bias, - add_bias_kv, + qkv_bias, + final_bias, add_zero_attn, data_type, kernel_initializer, @@ -102,8 +102,8 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, + bool qkv_bias, + bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, @@ -119,7 +119,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; bool offload = cpu_offload; Layer *li = nullptr; - int weight_num = bias ? 2 : 1; + int weight_num = (qkv_bias || final_bias) ? 
2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -178,10 +178,12 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, kernel_initializer, CHOSEN_SYNC_TYPE); } - if (bias) { + if (qkv_bias || final_bias) { // q, k, v, o - int dims[1] = {qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + oProjSize}; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + + (final_bias ? oProjSize : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -196,8 +198,8 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("bias", bias); - li->add_int_property("add_bias_kv", add_bias_kv); + li->add_int_property("qkv_bias", qkv_bias); + li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); @@ -231,10 +233,10 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("bias", value); - bool bias = (bool)value; - layer->get_int_property("add_bias_kv", value); - bool add_bias_kv = (bool)value; + layer->get_int_property("qkv_bias", value); + bool qkv_bias = (bool)value; + layer->get_int_property("final_bias", value); + bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; layer->get_int_property("apply_rotary_embedding", value); @@ -264,8 +266,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - bias, - add_bias_kv, + qkv_bias, + final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -289,8 +291,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -308,11 +310,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_bias ? 2 : 1), /*weights*/ + (_qkv_bias || _final_bias ? 2 : 1), /*weights*/ 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -365,11 +368,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); - if (bias) { + if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + - oProjSize; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -401,8 +405,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -420,12 +424,13 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_bias ? 2 : 1), /*weights*/ + (_qkv_bias || _final_bias ? 2 : 1), /*weights*/ 1 /*outputs*/, _input, _weight), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -477,11 +482,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); - if (bias) { + if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + - oProjSize; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -520,8 +526,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.bias, - other.add_bias_kv, + other.qkv_bias, + other.final_bias, other.add_zero_attn, other.apply_rotary_embedding, other.scaling_query, @@ -549,8 +555,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.bias, - params.add_bias_kv, + params.qkv_bias, + params.final_bias, params.add_zero_attn, params.apply_rotary_embedding, params.scaling_query, @@ -779,7 +785,7 @@ FutureMap IncMultiHeadSelfAttention::inference( batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (bias) { + if (qkv_bias || final_bias) { launcher.add_region_requirement( RegionRequirement(weights[1]->part, 0 /*projection id*/, @@ -817,7 +823,8 @@ void IncMultiHeadSelfAttention::inference_task( IncMultiHeadSelfAttentionMeta const *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); - assert((*m->bias ? regions.size() == 4 : regions.size() == 3)); + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -826,7 +833,7 @@ void IncMultiHeadSelfAttention::inference_task( GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { biases = helperGetGenericTensorAccessorRO(m->weight_type[1], regions[3], task->regions[3], @@ -1643,7 +1650,7 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && + lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && lhs.scaling_query == rhs.scaling_query && @@ -1660,8 +1667,8 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.bias = this->bias; - params.add_bias_kv = this->add_bias_kv; + params.qkv_bias = this->qkv_bias; + params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; params.apply_rotary_embedding = this->apply_rotary_embedding; params.scaling_query = this->scaling_query; @@ -1689,8 +1696,8 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.bias); - hash_combine(key, params.add_bias_kv); + hash_combine(key, params.qkv_bias); + hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); hash_combine(key, params.apply_rotary_embedding); hash_combine(key, params.scaling_query); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index a08114fec9..8fb635bace 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -348,7 +348,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int k_block_size = m->kProjSize * num_tokens; int q_array_size = m->qProjSize * num_tokens * m->num_q_heads; // apply bias for q, k, v - if (*m->bias) { + if (*m->qkv_bias) { hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -847,7 +847,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, tokens_previous_requests += num_new_tokens; } - if (*m->bias && shard_id == 0) { + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + @@ -878,7 +878,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->bias; + bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -961,11 +961,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->vProjSize, attn->oProjSize, attn->apply_rotary_embedding, - attn->bias, + attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->add_bias_kv, + attn->final_bias, attn->scaling_factor, weight, gpu_mem_allocator, @@ -989,11 +989,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _vProjSize, int _oProjSize, bool _apply_rotary_embedding, - bool _bias, + bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _add_bias_kv, + bool _final_bias, float _scaling_factor, GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, @@ -1004,7 +1004,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _num_kv_heads, DataType _quantization_type, bool _offload) - : OpMeta(handler, attn) { + : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); @@ -1038,13 +1038,20 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( quantized_weightSize = get_quantization_to_byte_size( attn->data_type, quantization_type, weightSize); } - biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; + // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; + + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + int final_bias_size = oProjSize; + biasSize = + (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
final_bias_size : 0); + // has_load_weights = (bool *)calloc(1, sizeof(bool)); //*has_load_weights = false; apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); *apply_rotary_embedding = _apply_rotary_embedding; - bias = (bool *)calloc(1, sizeof(bool)); - *bias = _bias; + qkv_bias = (bool *)calloc(1, sizeof(bool)); + *qkv_bias = _qkv_bias; scaling_query = (bool *)calloc(1, sizeof(bool)); *scaling_query = _scaling_query; scaling_factor = _scaling_factor; @@ -1052,8 +1059,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( *qk_prod_scaling = _qk_prod_scaling; position_bias = (bool *)calloc(1, sizeof(bool)); *position_bias = _position_bias; - // Currently do not support adding bias to key/value projection - assert(!_add_bias_kv); + final_bias = (bool *)calloc(1, sizeof(bool)); + *final_bias = _final_bias; // allocate weight and bias in the reserve space for cpu offloading if (offload) { @@ -1201,6 +1208,22 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( checkCUDA(hipStreamSynchronize(stream)); } -IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) {} +IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + +template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + hipStream_t stream); + +template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + hipStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index f0e6d9df1d..ec776f4cda 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -323,7 +323,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int k_block_size = m->kProjSize * num_tokens; int q_array_size = m->qProjSize * num_tokens * m->num_q_heads; // apply bias for q, k, v - if (*m->bias) { + if (*m->qkv_bias) { apply_proj_bias_qkv<<offload && m->biasSize > 0) { cudaMemcpyAsync( @@ -852,7 +852,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, tokens_previous_requests += num_new_tokens; } - if (*m->bias && shard_id == 0) { + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + @@ -879,7 +879,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->bias; + bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -935,7 +935,36 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); - printf("IncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); + printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); + + // if (input.data_type == DT_HALF) { + // print_tensor(input.get_half_ptr(), + // 32, + // "[IncMultiHeadSelfAttention:forward:input]"); + // print_tensor(weight.get_half_ptr(), + // 32, + // "[IncMultiHeadSelfAttention:forward:weight]"); + // print_tensor(output.get_half_ptr(), + // 32, + // 
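// ---------------------------------------------------------------------------
// Note (not part of the patch): the hunks above replace the single `bias`
// flag with separate `qkv_bias` (input projections) and `final_bias` (output
// projection) flags, and size the shared bias buffer from whichever flags are
// set. The sketch below mirrors that sizing logic under the assumption that
// the constructor arguments `_qkv_bias`/`_final_bias` are what should gate
// each term (the patch as written tests the member `final_bias` in the second
// term, which is only calloc'd a few lines later). Helper name and the
// concrete numbers are illustrative only.
// ---------------------------------------------------------------------------
#include <cassert>

// Illustrative helper mirroring the bias-buffer sizing in the patched
// IncMultiHeadSelfAttentionMeta constructor (element counts, not bytes).
static int attention_bias_elements(int qProjSize, int kProjSize, int vProjSize,
                                   int oProjSize, int num_q_heads,
                                   int num_kv_heads, bool qkv_bias,
                                   bool final_bias) {
  // One bias value per projected Q/K/V channel, across all heads.
  int qkv_bias_size =
      qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads;
  // One bias value per channel of the final (output) projection.
  int final_bias_size = oProjSize;
  return (qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0);
}

int main() {
  // Example: 64-dim head projections, 32 Q heads, 32 KV heads, 4096 output dim.
  int elems = attention_bias_elements(64, 64, 64, 4096, 32, 32,
                                      /*qkv_bias=*/true, /*final_bias=*/true);
  assert(elems == 2048 + 4096 + 4096);
  return 0;
}
// The same qkv/final split drives the shape of weights[1] in the attention
// operators further down in this patch, so the bias tensor shrinks to just
// oProjSize elements when only the output-projection bias is enabled.
// ---------------------------------------------------------------------------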
"[IncMultiHeadSelfAttention:forward:output]"); + // print_tensor( + // bias.get_half_ptr(), 32, + // "[IncMultiHeadSelfAttention:forward:bias]"); + // } else { + // print_tensor(input.get_float_ptr(), + // 32, + // "[IncMultiHeadSelfAttention:forward:input]"); + // print_tensor(weight.get_float_ptr(), + // 32, + // "[IncMultiHeadSelfAttention:forward:weight]"); + // print_tensor(output.get_float_ptr(), + // 32, + // "[IncMultiHeadSelfAttention:forward:output]"); + // print_tensor( + // bias.get_float_ptr(), 32, + // "[IncMultiHeadSelfAttention:forward:bias]"); + // } + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); @@ -961,11 +990,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->vProjSize, attn->oProjSize, attn->apply_rotary_embedding, - attn->bias, + attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->add_bias_kv, + attn->final_bias, attn->scaling_factor, weight, gpu_mem_allocator, @@ -989,11 +1018,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _vProjSize, int _oProjSize, bool _apply_rotary_embedding, - bool _bias, + bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _add_bias_kv, + bool _final_bias, float _scaling_factor, GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, @@ -1038,13 +1067,20 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( quantized_weightSize = get_quantization_to_byte_size( attn->data_type, quantization_type, weightSize); } - biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; + // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; + + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + int final_bias_size = oProjSize; + biasSize = + (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
final_bias_size : 0); + // has_load_weights = (bool *)calloc(1, sizeof(bool)); //*has_load_weights = false; apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); *apply_rotary_embedding = _apply_rotary_embedding; - bias = (bool *)calloc(1, sizeof(bool)); - *bias = _bias; + qkv_bias = (bool *)calloc(1, sizeof(bool)); + *qkv_bias = _qkv_bias; scaling_query = (bool *)calloc(1, sizeof(bool)); *scaling_query = _scaling_query; scaling_factor = _scaling_factor; @@ -1052,8 +1088,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( *qk_prod_scaling = _qk_prod_scaling; position_bias = (bool *)calloc(1, sizeof(bool)); *position_bias = _position_bias; - // Currently do not support adding bias to key/value projection - assert(!_add_bias_kv); + final_bias = (bool *)calloc(1, sizeof(bool)); + *final_bias = _final_bias; // allocate weight and bias in the reserve space for cpu offloading if (offload) { diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index 6d30ae690a..ff5d5a67e1 100644 --- a/src/ops/kernels/element_binary_kernels.cu +++ b/src/ops/kernels/element_binary_kernels.cu @@ -137,7 +137,7 @@ void forward_kernel_wrapper(ElementBinaryMeta const *m, default: assert(false); } - printf("[%s] forward time (CF) = %.2fms\n", opName, elapsed); + printf("[%s] forward time (CF) = %.9fms\n", opName, elapsed); // print_tensor(in1_ptr, 32, "[EWB:forward:input1]"); // print_tensor(in2_ptr, 32, "[EWB:forward:input2]"); // print_tensor(out_ptr, 32, "[EWB:forward:output]"); diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 758d7cfcce..cb519239c5 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -130,6 +130,7 @@ Tensor FFModel::layer_norm(const Tensor input, 0, true /*create_grad*/); if (num_weights > 0) { + assert(elementwise_affine); int numdims = axes.size(); int dims[numdims]; for (int i = 0; i < numdims; i++) { @@ -238,13 +239,13 @@ LayerNorm::LayerNorm(FFModel &model, effective_batch_size = (inputs[0]->get_volume() / num_replicas) / M; assert(use_bias == (numWeights == 2)); if (numWeights > 0 && allocate_weights) { + assert(elementwise_affine); ParallelTensorShape beta_gamma_shape = _input->get_shape(); for (int i = axes.size(); i < beta_gamma_shape.num_dims - 1; i++) { beta_gamma_shape.dims[i].size = 1; } int seed = std::rand(); Initializer *gamma_initializer = new UniformInitializer(seed, 1.0f, 1.0f); - Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 0.0f); weights[0] = model.create_parallel_weight_legion_ordering( beta_gamma_shape.num_dims, // axes.size(), beta_gamma_shape.dims, @@ -253,14 +254,18 @@ LayerNorm::LayerNorm(FFModel &model, true /*create_grad*/, gamma_initializer, CHOSEN_SYNC_TYPE); - weights[1] = model.create_parallel_weight_legion_ordering( - beta_gamma_shape.num_dims, //.size(), - beta_gamma_shape.dims, - _input->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - beta_initializer, - CHOSEN_SYNC_TYPE); + if (numWeights == 2) { + assert(use_bias); + Initializer *beta_initializer = new UniformInitializer(seed, 0.0f, 0.0f); + weights[1] = model.create_parallel_weight_legion_ordering( + beta_gamma_shape.num_dims, //.size(), + beta_gamma_shape.dims, + _input->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); + } } } diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 97b5094a21..202a8837ff 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -95,6 +95,26 @@ __inline__ __device__ T BlockReduceSum(T val, T 
*shared) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + ? shared[lid] + : 0; + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +#ifdef DEADCODE template __global__ void RowwiseMomentsCUDAKernel( int64_t N, float eps, T const *X, T *mean, T *rstd) { @@ -140,6 +160,56 @@ __global__ void LayerNormForwardCUDAKernel(int64_t N, beta_v; } } +#endif + +template +__global__ void LayerNormFusedForwardKernel(int64_t N, + float eps, + T const *X, + T *mean, + T *rstd, + T const *gamma, + T const *beta, + T *Y) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float sum1 = 0.0f; + float sum2 = 0.0f; + for (int64_t j = threadIdx.x; j < N; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + const int64_t index = i * N + j; + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); + } + if (threadIdx.x < kCUDABlockReduceNumThreads) { + sum1 = BlockReduceSum( + sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + sum2 = BlockReduceSum( + sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + } + if (threadIdx.x == 0) { + float const scale = float(1) / static_cast(N); + sum1 *= scale; + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); + } + + __syncthreads(); + + using T_ACC = T; + for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + const T_ACC beta_v = + beta == nullptr ? 
T_ACC(0) : static_cast(beta[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; + } +} /*static*/ template @@ -149,22 +219,26 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - RowwiseMomentsCUDAKernel - <<effective_batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); - LayerNormForwardCUDAKernel - <<effective_batch_size, kCUDANumThreads, 0, stream>>>( - m->effective_num_elements, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + + std::pair kernel1_parallelism = + std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); + std::pair kernel2_parallelism = + std::make_pair(m->effective_batch_size, kCUDANumThreads); + + int num_blocks = + std::max(kernel1_parallelism.first, kernel2_parallelism.first); + int num_threads = + std::max(kernel1_parallelism.second, kernel2_parallelism.second); + + LayerNormFusedForwardKernel + <<>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -208,7 +282,7 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); - printf("[LayerNorm] forward time (CF) = %.2fms\n", elapsed); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 01275c9875..350ab3c167 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -23,10 +23,6 @@ #endif #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" -#ifdef INFERENCE_TESTS -#include -using namespace at::indexing; -#endif namespace FlexFlow { @@ -63,8 +59,8 @@ Tensor int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, + bool qkv_bias, + bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, @@ -81,8 +77,8 @@ Tensor kdim, vdim, dropout, - bias, - add_bias_kv, + qkv_bias, + final_bias, add_zero_attn, data_type, kernel_initializer, @@ -102,8 +98,8 @@ Tensor int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, + bool qkv_bias, + bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, @@ -117,7 +113,7 @@ Tensor data_type = input->data_type; } Layer *li = nullptr; - int weight_num = bias ? 2 : 1; + int weight_num = (qkv_bias || final_bias) ? 2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -168,10 +164,12 @@ Tensor kernel_initializer, CHOSEN_SYNC_TYPE); } - if (bias) { + if (qkv_bias || final_bias) { // q, k, v, o - int dims[1] = {qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + oProjSize}; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + + (final_bias ? 
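// ---------------------------------------------------------------------------
// Note (not part of the patch): the layer_norm.cu hunk above fuses the former
// RowwiseMomentsCUDAKernel and LayerNormForwardCUDAKernel into a single
// LayerNormFusedForwardKernel launch: one block per row reduces sum(x) and
// sum(x*x) (via the new BlockReduceSum overload capped at max_num_threads),
// derives mean and rstd, then normalizes with optional gamma/beta. The CPU
// reference below is a sketch of the per-row math only; function name and
// buffer layout are illustrative, not part of the patch.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Reference LayerNorm forward over a row-major [B, N] buffer:
// per row, mean = sum(x)/N, var = max(sum(x^2)/N - mean^2, 0),
// rstd = 1/sqrt(var + eps), y = (x - mean) * rstd * gamma + beta.
void layer_norm_forward_ref(int64_t B, int64_t N, float eps,
                            std::vector<float> const &X,
                            std::vector<float> const &gamma, // size N or empty
                            std::vector<float> const &beta,  // size N or empty
                            std::vector<float> &Y) {
  Y.resize(static_cast<size_t>(B * N));
  for (int64_t i = 0; i < B; i++) {
    float sum1 = 0.0f, sum2 = 0.0f;
    for (int64_t j = 0; j < N; j++) {
      float x = X[i * N + j];
      sum1 += x;
      sum2 += x * x;
    }
    float scale = 1.0f / static_cast<float>(N);
    float mean = sum1 * scale;
    float var = std::max(sum2 * scale - mean * mean, 0.0f);
    float rstd = 1.0f / std::sqrt(var + eps);
    for (int64_t j = 0; j < N; j++) {
      // Like the kernel, a missing gamma defaults to 1 and a missing beta to 0.
      float g = gamma.empty() ? 1.0f : gamma[j];
      float b = beta.empty() ? 0.0f : beta[j];
      Y[i * N + j] = (X[i * N + j] - mean) * rstd * g + b;
    }
  }
}
// Design note: the patched launcher takes the max of the two old kernels'
// block/thread counts and pays one kernel launch instead of two, at the cost
// of some threads idling during the reduction phase of each row.
// ---------------------------------------------------------------------------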
oProjSize : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -186,8 +184,8 @@ Tensor li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("bias", bias); - li->add_int_property("add_bias_kv", add_bias_kv); + li->add_int_property("qkv_bias", qkv_bias); + li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); @@ -218,10 +216,10 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("bias", value); - bool bias = (bool)value; - layer->get_int_property("add_bias_kv", value); - bool add_bias_kv = (bool)value; + layer->get_int_property("qkv_bias", value); + bool qkv_bias = (bool)value; + layer->get_int_property("final_bias", value); + bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; layer->get_int_property("apply_rotary_embedding", value); @@ -244,8 +242,8 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - bias, - add_bias_kv, + qkv_bias, + final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -266,8 +264,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -282,11 +280,12 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_bias ? 2 : 1) /*weights*/, + (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -330,11 +329,12 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); - if (bias) { + if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + - oProjSize; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -366,8 +366,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -382,12 +382,13 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_bias ? 2 : 1) /*weights*/, + (_qkv_bias || _final_bias ? 
2 : 1) /*weights*/, 1 /*outputs*/, _input, _weight), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -431,11 +432,12 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); - if (bias) { + if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + - oProjSize; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -474,8 +476,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.bias, - other.add_bias_kv, + other.qkv_bias, + other.final_bias, other.add_zero_attn, other.apply_rotary_embedding, other.scaling_query, @@ -500,8 +502,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.bias, - params.add_bias_kv, + params.qkv_bias, + params.final_bias, params.add_zero_attn, params.apply_rotary_embedding, params.scaling_query, @@ -708,7 +710,7 @@ FutureMap SpecIncMultiHeadSelfAttention::inference( batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (bias) { + if (qkv_bias || final_bias) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, @@ -740,7 +742,8 @@ void SpecIncMultiHeadSelfAttention::inference_task( SpecIncMultiHeadSelfAttentionMeta const *m = *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args); - assert((*m->bias ? regions.size() == 4 : regions.size() == 3)); + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -749,7 +752,7 @@ void SpecIncMultiHeadSelfAttention::inference_task( GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { biases = helperGetGenericTensorAccessorRO(m->weight_type[1], regions[3], task->regions[3], @@ -818,7 +821,7 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && + lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && lhs.scaling_query == rhs.scaling_query && @@ -837,8 +840,8 @@ SpecIncMultiHeadSelfAttentionParams params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.bias = this->bias; - params.add_bias_kv = this->add_bias_kv; + params.qkv_bias = this->qkv_bias; + params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; params.apply_rotary_embedding = this->apply_rotary_embedding; params.scaling_query = this->scaling_query; @@ -862,8 +865,8 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.bias); - hash_combine(key, params.add_bias_kv); + hash_combine(key, params.qkv_bias); + hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); hash_combine(key, params.apply_rotary_embedding); hash_combine(key, params.scaling_query); diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 7e85a65e05..f983238198 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -512,7 +512,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_prev_requests_squares += num_new_tokens * total_tokens; } } - if (*m->bias && shard_id == 0) { + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + @@ -600,7 +600,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->bias; + bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -671,11 +671,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->vProjSize, attn->oProjSize, attn->apply_rotary_embedding, - attn->bias, + attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->add_bias_kv, + attn->final_bias, attn->scaling_factor, weight, gpu_mem_allocator, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 47e9941e1d..6ef5145654 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -531,7 +531,7 @@ void 
compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_prev_requests_squares += num_new_tokens * total_tokens; } } - if (*m->bias && shard_id == 0) { + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + @@ -610,7 +610,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->bias; + bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -684,11 +684,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->vProjSize, attn->oProjSize, attn->apply_rotary_embedding, - attn->bias, + attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->add_bias_kv, + attn->final_bias, attn->scaling_factor, weight, gpu_mem_allocator, diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 9597482ad2..207dae0785 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -23,10 +23,6 @@ #endif #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" -#ifdef INFERENCE_TESTS -#include -using namespace at::indexing; -#endif namespace FlexFlow { @@ -65,8 +61,8 @@ Tensor FFModel::inc_multihead_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, + bool qkv_bias, + bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, @@ -83,8 +79,8 @@ Tensor FFModel::inc_multihead_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, + qkv_bias, + final_bias, add_zero_attn, data_type, kernel_initializer, @@ -104,8 +100,8 @@ Tensor FFModel::inc_multiquery_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, + bool qkv_bias, + bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, @@ -121,7 +117,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; bool offload = cpu_offload; Layer *li = nullptr; - int weight_num = bias ? 2 : 1; + int weight_num = (qkv_bias || final_bias) ? 2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -180,10 +176,12 @@ Tensor FFModel::inc_multiquery_self_attention_verify( kernel_initializer, CHOSEN_SYNC_TYPE); } - if (bias) { + if (qkv_bias || final_bias) { // q, k, v, o - int dims[1] = {qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + oProjSize}; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + + (final_bias ? 
oProjSize : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -198,8 +196,8 @@ Tensor FFModel::inc_multiquery_self_attention_verify( li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("bias", bias); - li->add_int_property("add_bias_kv", add_bias_kv); + li->add_int_property("qkv_bias", qkv_bias); + li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); @@ -232,10 +230,10 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("bias", value); - bool bias = (bool)value; - layer->get_int_property("add_bias_kv", value); - bool add_bias_kv = (bool)value; + layer->get_int_property("qkv_bias", value); + bool qkv_bias = (bool)value; + layer->get_int_property("final_bias", value); + bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; layer->get_int_property("apply_rotary_embedding", value); @@ -263,8 +261,8 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - bias, - add_bias_kv, + qkv_bias, + final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, @@ -288,8 +286,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -307,11 +305,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_bias ? 2 : 1) /*weights*/, + (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -365,11 +364,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); - if (bias) { + if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + - oProjSize; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -401,8 +401,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _bias, - bool _add_bias_kv, + bool _qkv_bias, + bool _final_bias, bool _add_zero_attn, bool _apply_rotary_embedding, bool _scaling_query, @@ -420,12 +420,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_bias ? 2 : 1) /*weights*/, + (_qkv_bias || _final_bias ? 
2 : 1) /*weights*/, 1 /*outputs*/, _input, _weight), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - bias(_bias), add_bias_kv(_add_bias_kv), add_zero_attn(_add_zero_attn), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), apply_rotary_embedding(_apply_rotary_embedding), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), @@ -476,11 +477,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); - if (bias) { + if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - bias_shape.dims[0].size = qProjSize * num_q_heads + - (kProjSize + vProjSize) * num_kv_heads + - oProjSize; + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -519,8 +521,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.bias, - other.add_bias_kv, + other.qkv_bias, + other.final_bias, other.add_zero_attn, other.apply_rotary_embedding, other.scaling_query, @@ -548,8 +550,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.bias, - params.add_bias_kv, + params.qkv_bias, + params.final_bias, params.add_zero_attn, params.apply_rotary_embedding, params.scaling_query, @@ -776,7 +778,7 @@ FutureMap TreeIncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (bias) { + if (qkv_bias || final_bias) { launcher.add_region_requirement( RegionRequirement(weights[1]->part, 0 /*projection id*/, @@ -814,7 +816,8 @@ void TreeIncMultiHeadSelfAttention::inference_task( TreeIncMultiHeadSelfAttentionMeta *m = *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args); - assert((*m->bias ? regions.size() == 4 : regions.size() == 3)); + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -823,7 +826,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; - if (*m->bias) { + if (*m->qkv_bias || *m->final_bias) { biases = helperGetGenericTensorAccessorRO(m->weight_type[1], regions[3], task->regions[3], @@ -854,788 +857,6 @@ void TreeIncMultiHeadSelfAttention::inference_task( TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &bc, task->index_point.point_data[0], input, weight, output, biases); -#ifdef INFERENCE_TESTS - printf("Checking TreeIncMultiHeadSelfAttention computations...\n"); - - // ============================================================================= - // Define helper functions to handle row-major arrays - // ============================================================================= - - auto set_value_row_major = [](float *arr, - std::vector const &shape, - std::vector const &indices, - float value) -> void { - int offset = 0; - for (int i = 0; i < shape.size(); i++) { - int index = indices[i]; - int stride = 1; - for (int j = i + 1; j < shape.size(); j++) { - stride *= shape[j]; - } - offset += index * stride; - } - *(arr + offset) = value; - }; - - // ============================================================================= - // Load input/output/weights and parse general configs - // ============================================================================= - - float *input_cpu = - download_tensor(input.get_float_ptr(), input_domain.get_volume()); - assert(input_cpu != nullptr); - float *weight_cpu = download_tensor(weight.get_float_ptr(), - weight_domain.get_volume()); - assert(weight_cpu != nullptr); - float *output_cpu = download_tensor(output.get_float_ptr(), - output_domain.get_volume()); - assert(output_cpu != nullptr); - - // Input tensor dimensions - coord_t data_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; - coord_t max_sequence_length = input_domain.hi()[1] - input_domain.lo()[1] + 1; - coord_t batch_size = input_domain.hi()[2] - input_domain.lo()[2] + 1; - coord_t replica_dim = input_domain.hi()[3] - input_domain.lo()[3] + 1; - assert(replica_dim == 1); - - size_t effective_batch_size = max_sequence_length * batch_size; - float inputs_arr[data_dim][effective_batch_size] = {0}; - for (size_t i = 0; i < data_dim * bc.num_active_tokens(); i++) { - size_t data_index = i % data_dim; - size_t token_index = i / data_dim; - assert(data_index < data_dim); - assert(token_index < effective_batch_size); - inputs_arr[data_index][token_index] = input_cpu[i]; - } - torch::Tensor torch_input = torch::from_blob( - inputs_arr, {data_dim, (long int)effective_batch_size}, torch::kFloat32); - - // Weight tensor dimensions - coord_t all_weight_params = weight_domain.hi()[0] - weight_domain.lo()[0] + 1; - coord_t num_q_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; - replica_dim = weight_domain.hi()[2] - weight_domain.lo()[2] + 1; - size_t qParas = m->qProjSize * m->qSize; - size_t kParas = m->kProjSize * m->kSize; - size_t vParas = m->vProjSize * m->vSize; - size_t oParas = m->oProjSize * (m->vProjSize > 0 ? 
m->vProjSize : m->vSize); - - assert(all_weight_params == qParas + kParas + vParas + oParas); - assert(num_q_heads == m->num_q_heads); - assert(replica_dim == 1); - - assert(m->qSize == m->kSize && m->kSize == m->vSize); - // printf("m->qSize: %i\n", m->qSize); - // keep things simple for now - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - long int proj_sum = m->qProjSize + m->kProjSize + m->vProjSize; - // load weight manually because Torch can't easily read a tensor serialized in - // column-major order. - - // printf("m->kProjSize: %i, TreeVerifyBatchConfig::MAX_NUM_TOKENS: %i, " - // "bc.num_active_tokens(): %i, num_q_heads: %lli, - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS: %i, " - // "bc.num_active_requests(): %i\n", m->kProjSize, - // TreeVerifyBatchConfig::MAX_NUM_TOKENS, bc.num_active_tokens(), - // num_q_heads, TreeVerifyBatchConfig::MAX_NUM_REQUESTS, - // bc.num_active_requests()); - // for (int t=0; t < bc.num_active_tokens(); t++) { - // printf("token %i has request_index: %li and token_position: %li\n", - // t, bc.token2ids.token_indexes[t].request_index, - // bc.token2ids.token_indexes[t].token_position); - // } - - // ============================================================================= - // Load the output tensor (with CUDA results), and create a Torch tensor - // ============================================================================= - - float output_cuda[m->oProjSize][effective_batch_size] = {0}; - for (int i = 0; i < m->oProjSize * effective_batch_size; i++) { - int row_idx = i % m->oProjSize; - int col_idx = i / m->oProjSize; - assert(row_idx < m->oProjSize && col_idx < effective_batch_size); - output_cuda[row_idx][col_idx] = output_cpu[i]; - } - torch::Tensor torch_out_cuda = - torch::from_blob(output_cuda, - {m->oProjSize, (int64_t)effective_batch_size}, - torch::kFloat32); - - // ============================================================================= - // Load the Q/K/V projection weights, and create a Torch tensor - // ============================================================================= - std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_q_heads}; - float *w_qkv = - (float *)calloc(m->qSize * m->qProjSize * 3 * num_q_heads, sizeof(float)); - assert(w_qkv[0] == 0.0f); - - for (int h = 0; h < num_q_heads; h++) { - for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { - int row_index = i % m->qSize; - int column_index = i / m->qSize; - // Q - set_value_row_major(w_qkv, - w_qkv_shape, - {row_index, column_index, 0, h}, - weight_cpu[all_weight_params * h + - m->qSize * column_index + row_index]); - // K - set_value_row_major( - w_qkv, - w_qkv_shape, - {row_index, column_index, 1, h}, - weight_cpu[all_weight_params * h + m->qProjSize * m->qSize + - m->qSize * column_index + row_index]); - // V - set_value_row_major( - w_qkv, - w_qkv_shape, - {row_index, column_index, 2, h}, - weight_cpu[all_weight_params * h + 2 * m->qProjSize * m->qSize + - m->qSize * column_index + row_index]); - } - } - // convert weights to torch tensor - torch::Tensor torch_w_qkv = torch::from_blob( - w_qkv, {m->qSize, m->qProjSize, 3, (int)num_q_heads}, torch::kFloat32); - - /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() - << std::endl; - std::cout << "Torch input size: " << torch_input.sizes() << std::endl; - std::cout << "Number of active tokens: " << bc.num_active_tokens() - << std::endl; */ - // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; - - // 
============================================================================= - // Compute the Q/K/V projections, and compare the results with CUDA - // ============================================================================= - - // ----------------------- C++ computations & checks ------------------------ - torch::Tensor qkv_projs = torch::einsum( - "ijkl,im->jmkl", - {torch_w_qkv, - torch_input.index({Slice(), Slice(0, bc.num_active_tokens())})}); - // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; - assert(qkv_projs.sizes()[0] == m->qProjSize); - assert(qkv_projs.sizes()[1] == bc.num_active_tokens() && - qkv_projs.sizes()[1] <= effective_batch_size); - assert(qkv_projs.sizes()[2] == 3); - assert(qkv_projs.sizes()[3] == num_q_heads); - free(w_qkv); - - // ----------------------- Loading CUDA results for this step --------------- - float *QKVProjArray_cpu = download_tensor( - m->devQKVProjArray, - TreeVerifyBatchConfig::MAX_NUM_TOKENS * proj_sum * m->num_q_heads); - assert(QKVProjArray_cpu != nullptr); - - std::vector QKVProjArray_converted_shape = { - m->qProjSize, bc.num_active_tokens(), 3, (int)num_q_heads}; - float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc.num_active_tokens() * 3 * num_q_heads, sizeof(float)); - - // skip over padding at the end of QKVProjArray_cpu - // convert from column order to 3D matrix because torch cannot automatically - // import matrices flattened in column order - for (size_t i = 0; i < proj_sum * bc.num_active_tokens() * num_q_heads; i++) { - int proj_size_index = i % m->qProjSize; - int head_index = i / (proj_sum * bc.num_active_tokens()); - int token_index = - ((i - head_index * proj_sum * bc.num_active_tokens()) / m->qProjSize) % - bc.num_active_tokens(); - int qkv_offset = (i - head_index * proj_sum * bc.num_active_tokens()) / - (m->qProjSize * bc.num_active_tokens()); - assert(proj_size_index < proj_sum); - assert(head_index < num_q_heads); - assert(token_index < bc.num_active_tokens()); - assert(qkv_offset < 3); - set_value_row_major(QKVProjArray_converted, - QKVProjArray_converted_shape, - {proj_size_index, token_index, qkv_offset, head_index}, - QKVProjArray_cpu[i]); - } - torch::Tensor QKVProjArray_torch = - torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc.num_active_tokens(), 3, num_q_heads}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - // std::cout << "QKVProjArray_torch" << std::endl; - // for (int i=0; ikProjSize; d++) { - size_t kcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - bc.tokensInfo[t].abs_depth_in_request * - m->num_q_heads * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - h * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - bc.tokensInfo[t].request_index; - m->kcache[kcache_idx] = - qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) - .item(); - } - for (size_t d = 0; d < m->vProjSize; d++) { - size_t vcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - bc.tokensInfo[t].abs_depth_in_request * - m->num_q_heads * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - h * TreeVerifyBatchConfig::MAX_NUM_REQUESTS + - bc.tokensInfo[t].request_index; - m->vcache[vcache_idx] = - qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) - .item(); - } - } - } - // Create torch tensors from the arrays - torch::Tensor K_t = - torch::from_blob(m->kcache, - {m->kProjSize, - MAX_SEQ_LEN, - num_q_heads, - TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, - 
torch::kFloat32); - torch::Tensor V_t = - torch::from_blob(m->vcache, - {m->vProjSize, - MAX_SEQ_LEN, - num_q_heads, - TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - - // Compute useful indices - std::vector req_idxs; - std::vector r_first_idx; - std::vector r_num_tokens; - for (size_t t = 0; t < bc.num_active_tokens(); t++) { - size_t rid = bc.tokensInfo[t].request_index; - if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { - req_idxs.push_back(rid); - r_first_idx.push_back(t); - r_num_tokens.push_back(1); - } else { - r_num_tokens[r_num_tokens.size() - 1]++; - } - assert(req_idxs.size() == r_first_idx.size() && - r_first_idx.size() == r_num_tokens.size()); - } - assert(req_idxs.size() == bc.num_active_requests()); - assert(std::accumulate(r_num_tokens.begin(), - r_num_tokens.end(), - decltype(r_num_tokens)::value_type(0)) == - bc.num_active_tokens()); - - // ----------------------- Loading CUDA results for this step --------------- - float *keyCache_cpu = download_tensor( - m->keyCache, - m->num_q_heads * m->kProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * - MAX_SEQ_LEN); - float *valueCache_cpu = download_tensor( - m->valueCache, - m->num_q_heads * m->vProjSize * TreeVerifyBatchConfig::MAX_NUM_REQUESTS * - MAX_SEQ_LEN); - assert(keyCache_cpu != nullptr); - assert(valueCache_cpu != nullptr); - - float *kcache_cuda = - (float *)calloc(m->kProjSize * MAX_SEQ_LEN * m->num_q_heads * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - float *vcache_cuda = - (float *)calloc(m->vProjSize * MAX_SEQ_LEN * m->num_q_heads * - TreeVerifyBatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - int index = 0; - for (int i = 0; i < m->kProjSize; i++) { - for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_q_heads; k++) { - for (int l = 0; l < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; l++) { - int col_major_index = - l * m->kProjSize * MAX_SEQ_LEN * m->num_q_heads + - k * m->kProjSize * MAX_SEQ_LEN + j * m->kProjSize + i; - kcache_cuda[index++] = keyCache_cpu[col_major_index]; - } - } - } - } - index = 0; - for (int i = 0; i < m->vProjSize; i++) { - for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_q_heads; k++) { - for (int l = 0; l < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; l++) { - int col_major_index = - l * m->vProjSize * MAX_SEQ_LEN * m->num_q_heads + - k * m->vProjSize * MAX_SEQ_LEN + j * m->vProjSize + i; - vcache_cuda[index++] = valueCache_cpu[col_major_index]; - } - } - } - } - torch::Tensor K_t_cuda = - torch::from_blob(kcache_cuda, - {m->kProjSize, - MAX_SEQ_LEN, - num_q_heads, - TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - torch::Tensor V_t_cuda = - torch::from_blob(vcache_cuda, - {m->vProjSize, - MAX_SEQ_LEN, - num_q_heads, - TreeVerifyBatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - - // std::cout << "kcache differences:" << std::endl; - // for (int i=0; i < bc.num_active_requests() + 1; i++) { - // for (int j=0; j < num_q_heads; j++) { - // for (int l=0; l < m->kProjSize; l++) { - // for (int k=0; k < MAX_SEQ_LEN; k++) { - // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_q_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; if ( - // abs(m->kcache[kcache_idx] - keyCache_cpu[ - // i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + - // j * m->kProjSize * MAX_SEQ_LEN + - // k * m->kProjSize 
+ - // l - // ]) > 0.00001) { - // printf("req: %i (rid: %i), head: %i, data_dim: %i, token_pos: - // %i\n", - // i, req_idxs[i], j, l, k); - // } - // } - // } - // } - // } - - // std::cout << "keyCache from CUDA:" << std::endl; - // for (int i=0; ikProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // printf("%f ", - // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + - // j * m->kProjSize * MAX_SEQ_LEN + - // k * m->kProjSize + - // l - // ]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // std::cout << "valueCache from CUDA:" << std::endl; - // for (int i=0; ivProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // printf("%f ", - // valueCache_cpu[ - // i * m->vProjSize * MAX_SEQ_LEN * num_q_heads + - // j * m->vProjSize * MAX_SEQ_LEN + - // k * m->vProjSize + - // l]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // printf("\n"); - - // std::cout << "C++ kcache:" << std::endl; - // for (int i=0; ikProjSize; l++) { - // for (int k=0; k < MAX_SEQ_LEN; k++) { - // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_q_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; - // printf("%f ", m->kcache[kcache_idx]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // std::cout << "C++ vcache:" << std::endl; - // for (int i=0; ivProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // size_t vcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + k * num_q_heads * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + j * - // TreeVerifyBatchConfig::MAX_NUM_REQUESTS + i; - // printf("%f ", m->vcache[vcache_idx]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - assert(torch::allclose(K_t_cuda, K_t, 1e-05, 1e-05)); - assert(torch::allclose(V_t_cuda, V_t, 1e-05, 1e-05)); - free(kcache_cuda); - free(vcache_cuda); - - // ============================================================================= - // Load the W_out projection weights - // ============================================================================= - - // ----------------------- C++ operations & checks -------------------------- - float *w_out = (float *)calloc(m->vProjSize * m->num_q_heads * m->oProjSize, - sizeof(float)); - std::vector w_out_shape = {m->vProjSize, m->num_q_heads, m->oProjSize}; - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - for (int h = 0; h < num_q_heads; h++) { - for (int v = 0; v < m->vProjSize; v++) { - for (int o = 0; o < m->oProjSize; o++) { - set_value_row_major( - w_out, - w_out_shape, - {v, h, o}, - weight_cpu[all_weight_params * h + 3 * m->qProjSize * m->qSize + - m->vProjSize * o + v]); - } - } - } - // convert weights to torch tensor - torch::Tensor torch_w_out = torch::from_blob( - w_out, {m->vProjSize, m->num_q_heads, m->oProjSize}, torch::kFloat32); - - // ----------------------- Loading CUDA results for this step --------------- - float *w_out_cuda = download_tensor( - m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_q_heads); - assert(w_out_cuda != nullptr); - float *converted_wout_tensor = (float *)calloc( - m->vProjSize * m->num_q_heads * m->oProjSize, sizeof(float)); - std::vector converted_wout_tensor_shape = { - m->vProjSize, m->num_q_heads, m->oProjSize}; - - for (int i = 0; i < m->vProjSize * 
m->num_q_heads * m->oProjSize; i++) { - int v_idx = i % m->vProjSize; - int h_idx = (i / m->vProjSize) % m->num_q_heads; - int o_idx = i / (m->vProjSize * m->num_q_heads); - assert(v_idx < m->vProjSize && h_idx < m->num_q_heads && - o_idx < m->oProjSize); - set_value_row_major(converted_wout_tensor, - converted_wout_tensor_shape, - {v_idx, h_idx, o_idx}, - w_out_cuda[i]); - } - torch::Tensor w_out_cuda_tensor = - torch::from_blob(converted_wout_tensor, - {m->vProjSize, m->num_q_heads, m->oProjSize}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - assert(torch::allclose(w_out_cuda_tensor, torch_w_out, 1e-05, 1e-05)); - free(converted_wout_tensor); - - // ============================================================================= - // Compute the softmax(QK^T/sqrt(d_k))V product, request by request - // ============================================================================= - - // ----------------------- C++ initialization steps ------------------------- - torch::Tensor Q_projs = qkv_projs.index({Slice(), Slice(), 0, Slice()}) - .reshape({qkv_projs.sizes()[0], - qkv_projs.sizes()[1], - qkv_projs.sizes()[3]}); - - torch::Tensor qk_products[bc.num_active_requests()]; - torch::Tensor qk_softmax[bc.num_active_requests()]; - torch::Tensor attn_heads[bc.num_active_requests()]; - - torch::Tensor cpp_output = - torch::zeros({m->oProjSize, bc.num_active_tokens()}); - - // ----------------------- Loading CUDA results for this step --------------- - float *qk_prods_cpu = download_tensor( - m->qk_prods, - TreeVerifyBatchConfig::MAX_NUM_TOKENS * - TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_q_heads); - assert(qk_prods_cpu != nullptr); - - float *qk_prods_softmax_cpu = download_tensor( - m->qk_prods_softmax, - TreeVerifyBatchConfig::MAX_NUM_TOKENS * - TreeVerifyBatchConfig::MAX_NUM_TOKENS * num_q_heads); - assert(qk_prods_softmax_cpu != nullptr); - - float *attn_heads_cpu = download_tensor( - m->attn_heads, - TreeVerifyBatchConfig::MAX_NUM_TOKENS * m->num_q_heads * m->vProjSize); - assert(attn_heads_cpu != nullptr); - - // ----------------------- Main loop (request by request) ------------------- - size_t qk_prods_cpu_offset = 0; - - for (size_t r = 0; r < bc.num_active_requests(); r++) { - // Compute pre-request parameters - size_t num_new_tokens = r_num_tokens[r]; - int64_t rid = (int64_t)(req_idxs[r]); - int64_t num_tokens_received_so_far = - (int64_t)(bc.requestsInfo[rid].token_start_offset + - bc.requestsInfo[rid].num_tokens_in_batch); - assert(num_new_tokens == bc.requestsInfo[rid].num_tokens_in_batch); - assert(num_tokens_received_so_far >= (int64_t)num_new_tokens); - - // ----------------------- C++ computations ------------------------------- - // Get the slice of the Q projection tensor with the tokens in the current - // request - torch::Tensor Q_req = - Q_projs.index({Slice(), - Slice(r_first_idx[r], r_first_idx[r] + num_new_tokens), - Slice()}); - // std::cout << "Q_req.sizes(): " << Q_req.sizes() << std::endl; - assert(Q_req.sizes()[0] == m->qProjSize); - assert(Q_req.sizes()[1] == num_new_tokens); - assert(Q_req.sizes()[2] == num_q_heads); - - /*printf("\n------------ QK multiplication (C++) -------------\n"); - printf("Request r=%lu. 
num_new_tokens: %lu, num_tokens_received_so_far: %li, - rid: %li, Qproj slice: (%i, %i)\n", r, num_new_tokens, - num_tokens_received_so_far, rid, r_first_idx[r], r_first_idx[r] + - num_new_tokens); - - std::cout << "Q_req matrix (idk dims):" << std::endl << - Q_req.index({Slice(), Slice(), 0}) << std::endl << std::endl; std::cout << - "K_t matrix (ilk dims):" << std::endl << K_t.index({Slice(), Slice(0, - num_tokens_received_so_far), 0, rid}) << std::endl << std::endl; std::cout - << "C++ alpha: " << (1.0f / sqrt(m->kProjSize)) << std::endl;*/ - - // Compute (Q*K^T)/sqrt(d_k) matmul - qk_products[r] = - torch::einsum("ijk,ilk->jlk", - {Q_req, - K_t.index({Slice(), - Slice(0, num_tokens_received_so_far), - Slice(), - rid})}) * - (1.0f / sqrt(m->kProjSize)); - - // Set entries above diagonal to -inf to make attention causal. - for (int h = 0; h < num_q_heads; h++) { - qk_products[r].index( - {Slice(), Slice(num_tokens_received_so_far - num_new_tokens), h}) = - qk_products[r] - .index({Slice(), - Slice(num_tokens_received_so_far - num_new_tokens), - h}) - .tril() + - torch::full({(int64_t)num_new_tokens, (int64_t)num_new_tokens}, - -INFINITY) - .triu() - .fill_diagonal_(0); - } - // Compute softmax for each request block - qk_softmax[r] = torch::softmax(qk_products[r], -2); - assert(qk_softmax[r].sizes()[0] == num_new_tokens); - assert(qk_softmax[r].sizes()[1] == num_tokens_received_so_far); - assert(qk_softmax[r].sizes()[2] == m->num_q_heads); - - // ------------------- Loading CUDA results for this step --------------- - float *converted_qk_prod = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_q_heads, - sizeof(float)); - float *converted_qk_prod_softmax = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_q_heads, - sizeof(float)); - std::vector converted_qk_prod_shape = { - (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_q_heads}; - - for (size_t i = 0; - i < num_new_tokens * num_tokens_received_so_far * num_q_heads; - i++) { - size_t new_t_idx = i % num_new_tokens; - size_t all_t_idx = (i / num_new_tokens) % num_tokens_received_so_far; - size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); - assert(new_t_idx < num_new_tokens && - all_t_idx < num_tokens_received_so_far && head_idx < num_q_heads); - set_value_row_major(converted_qk_prod, - converted_qk_prod_shape, - {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, - qk_prods_cpu[i + qk_prods_cpu_offset]); - set_value_row_major(converted_qk_prod_softmax, - converted_qk_prod_shape, - {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, - qk_prods_softmax_cpu[i + qk_prods_cpu_offset]); - } - torch::Tensor qk_prods_cuda = torch::from_blob( - converted_qk_prod, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_q_heads}, - torch::kFloat32); - torch::Tensor qk_prods_softmax_cuda = torch::from_blob( - converted_qk_prod_softmax, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_q_heads}, - torch::kFloat32); - - // ------------------- Comparing C++ & CUDA results ------------------ - /* std::cout << "C++:" <vProjSize); - assert( - V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) - .sizes()[1] == num_tokens_received_so_far); - assert( - V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) - .sizes()[2] == m->num_q_heads); - attn_heads[r] = torch::einsum( - "ijk,ljk->ilk", - {qk_softmax[r], - V_t.index( - {Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid})}); - assert(attn_heads[r].sizes()[0] == num_new_tokens); 
- assert(attn_heads[r].sizes()[1] == m->vProjSize); - assert(attn_heads[r].sizes()[2] == m->num_q_heads); - - // ------------------- Loading CUDA results for this step --------------- - float converted_attn_heads_cpu[num_new_tokens][m->vProjSize] - [m->num_q_heads] = {0}; - for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_q_heads; i++) { - int token_ix = i % num_new_tokens; - int vproj_idx = (i / num_new_tokens) % m->vProjSize; - int head_idx = i / (num_new_tokens * m->vProjSize); - assert(token_ix < num_new_tokens && vproj_idx < m->vProjSize && - head_idx < m->num_q_heads); - converted_attn_heads_cpu[token_ix][vproj_idx][head_idx] = - attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_q_heads + i]; - } - torch::Tensor converted_attn_heads_cuda = torch::from_blob( - converted_attn_heads_cpu, - {(int64_t)num_new_tokens, m->vProjSize, m->num_q_heads}, - torch::kFloat32); - - // -------------------- Comparing C++ & CUDA results ------------------- - /* std::cout << "CUDA attn head for req " << r << ":" <num_q_heads; h++) { - std::cout << converted_attn_heads_cuda.index({Slice(), Slice(), h}) << - std::endl; - } - std::cout << "C++ attn head for req " << r << ":" <num_q_heads; h++) { - std::cout << attn_heads[r].index({Slice(), Slice(), h}) << std::endl; - } */ - assert(torch::allclose( - converted_attn_heads_cuda, attn_heads[r], 1e-05, 1e-05)); - - // ----------------------- C++ computations ---------------------------- - // Compute output values by projecting all heads to output space - cpp_output.index( - {Slice(), - Slice(r_first_idx[r], r_first_idx[r] + (int64_t)num_new_tokens)}) = - torch::einsum("jkl,ijk->li", {torch_w_out, attn_heads[r]}); - - // increment main loop's auxiliary index - qk_prods_cpu_offset += - num_new_tokens * num_tokens_received_so_far * num_q_heads; - } - - // ----------------------- Comparing C++ & CUDA results --------------------- - /* std::cout << "C++:" <oProjSize; i++) { - std::cout << cpp_output.index({i, Slice()}) << std::endl; - } - std::cout << "CUDA:" <oProjSize; i++) { - std::cout << torch_out_cuda.index({i, Slice(0, - (int64_t)bc.num_active_tokens())}) << std::endl; - } */ - - assert( - torch::allclose(torch_out_cuda.index( - {Slice(), Slice(0, (int64_t)bc.num_active_tokens())}), - cpp_output, - 1e-05, - 1e-05)); - - // ============================================================================= - // Cleanup - // ============================================================================= - free(w_out); - checkCUDA(cudaFreeHost(input_cpu)); - checkCUDA(cudaFreeHost(weight_cpu)); - checkCUDA(cudaFreeHost(output_cpu)); - checkCUDA(cudaFreeHost(QKVProjArray_cpu)); - checkCUDA(cudaFreeHost(keyCache_cpu)); - checkCUDA(cudaFreeHost(valueCache_cpu)); - checkCUDA(cudaFreeHost(qk_prods_cpu)); - checkCUDA(cudaFreeHost(qk_prods_softmax_cpu)); - checkCUDA(cudaFreeHost(attn_heads_cpu)); - checkCUDA(cudaFreeHost(w_out_cuda)); - // assert(false && "All good if you see this assert failure! 
:)"); -#endif - // Done with INFERENCE_TESTS block } void TreeIncMultiHeadSelfAttention::backward(FFModel const &ff) { @@ -1664,13 +885,13 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.bias == rhs.bias && lhs.add_bias_kv == rhs.add_bias_kv && + lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == lhs.position_bias; + lhs.position_bias == rhs.position_bias; } TreeIncMultiHeadSelfAttentionParams @@ -1683,8 +904,8 @@ TreeIncMultiHeadSelfAttentionParams params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.bias = this->bias; - params.add_bias_kv = this->add_bias_kv; + params.qkv_bias = this->qkv_bias; + params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; params.apply_rotary_embedding = this->apply_rotary_embedding; params.scaling_query = this->scaling_query; @@ -1708,8 +929,8 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.bias); - hash_combine(key, params.add_bias_kv); + hash_combine(key, params.qkv_bias); + hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); hash_combine(key, params.apply_rotary_embedding); hash_combine(key, params.scaling_query); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 1a9d1b6968..0fa68bed08 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -500,7 +500,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } - if (*m->bias && shard_id == 0) { + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + @@ -608,7 +608,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->bias; + bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -692,11 +692,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->vProjSize, attn->oProjSize, attn->apply_rotary_embedding, - attn->bias, + attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->add_bias_kv, + attn->final_bias, attn->scaling_factor, weight, gpu_mem_allocator, diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index f916bdb925..95ac93ad8a 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -500,7 +500,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 
1 == processed_tokens_in_batch); } - if (*m->bias && shard_id == 0) { + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + m->kProjSize * m->global_num_kv_heads + @@ -605,7 +605,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( GenericTensorAccessorR const &bias) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->bias; + bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -689,11 +689,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->vProjSize, attn->oProjSize, attn->apply_rotary_embedding, - attn->bias, + attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->add_bias_kv, + attn->final_bias, attn->scaling_factor, weight, gpu_mem_allocator, diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 0723ee136d..2b1910637f 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -170,6 +170,8 @@ std::string get_operator_type_name(OperatorType type) { return "Mean"; case OP_LAYERNORM: return "LayerNorm"; + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: + return "AddBiasResidualLayerNorm"; case OP_RMS_NORM: return "RMSNorm"; case OP_GELU: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 0e957f0702..5a5e267d96 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -15,6 +15,7 @@ #include "flexflow/graph.h" #include "flexflow/dominators.h" #include "flexflow/ffconst_utils.h" +#include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/aggregate.h" #include "flexflow/ops/arg_topk.h" #include "flexflow/ops/argmax.h" @@ -2316,8 +2317,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->bias); - sez.serialize(attn->add_bias_kv); + sez.serialize(attn->qkv_bias); + sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); sez.serialize(attn->apply_rotary_embedding); sez.serialize(attn->scaling_query); @@ -2340,8 +2341,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->bias); - sez.serialize(attn->add_bias_kv); + sez.serialize(attn->qkv_bias); + sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); sez.serialize(attn->apply_rotary_embedding); sez.serialize(attn->scaling_query); @@ -2361,8 +2362,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->bias); - sez.serialize(attn->add_bias_kv); + sez.serialize(attn->qkv_bias); + sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); sez.serialize(attn->apply_rotary_embedding); sez.serialize(attn->scaling_query); @@ -2701,6 +2702,11 @@ void FFModel::deserialize_graph_optimal_view( node = LayerNorm::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + node = AddBiasResidualLayerNorm::deserialize( + *this, dez, inputs, num_inputs); + break; + } case OP_LINEAR: { node = Linear::deserialize(*this, dez, inputs, num_inputs); break; @@ -2742,7 +2748,7 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool bias, add_bias_kv, add_zero_attn, 
apply_rotary_embedding, + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, offload, position_bias; DataType quantization_type; size_t id, transformer_layer_id; @@ -2754,8 +2760,8 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(bias); - dez.deserialize(add_bias_kv); + dez.deserialize(qkv_bias); + dez.deserialize(final_bias); dez.deserialize(add_zero_attn); dez.deserialize(apply_rotary_embedding); dez.deserialize(scaling_query); @@ -2773,8 +2779,8 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.bias = bias; - params.add_bias_kv = add_bias_kv; + params.qkv_bias = qkv_bias; + params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; params.apply_rotary_embedding = apply_rotary_embedding; @@ -2793,7 +2799,7 @@ void FFModel::deserialize_graph_optimal_view( assert(num_inputs == 1); int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; float dropout, scaling_factor; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; size_t id, transformer_layer_id; dez.deserialize(id); @@ -2804,8 +2810,8 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(bias); - dez.deserialize(add_bias_kv); + dez.deserialize(qkv_bias); + dez.deserialize(final_bias); dez.deserialize(add_zero_attn); dez.deserialize(apply_rotary_embedding); dez.deserialize(scaling_query); @@ -2820,8 +2826,8 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.bias = bias; - params.add_bias_kv = add_bias_kv; + params.qkv_bias = qkv_bias; + params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; params.apply_rotary_embedding = apply_rotary_embedding; @@ -2839,7 +2845,7 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool bias, add_bias_kv, add_zero_attn, apply_rotary_embedding, + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, offload, position_bias; DataType quantization_type; size_t id, transformer_layer_id; @@ -2851,8 +2857,8 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(bias); - dez.deserialize(add_bias_kv); + dez.deserialize(qkv_bias); + dez.deserialize(final_bias); dez.deserialize(add_zero_attn); dez.deserialize(apply_rotary_embedding); dez.deserialize(scaling_query); @@ -2870,8 +2876,8 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.bias = bias; - params.add_bias_kv = add_bias_kv; + params.qkv_bias = qkv_bias; + params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; params.apply_rotary_embedding = apply_rotary_embedding; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2768439117..4f8caaa570 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -22,6 +22,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/graph.h" #include 
"flexflow/mapper.h" +#include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/aggregate.h" #include "flexflow/ops/aggregate_spec.h" #include "flexflow/ops/arg_topk.h" @@ -3100,6 +3101,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + Op *op = AddBiasResidualLayerNorm::create_operator_from_layer( + *this, layer, inputs); + operators.push_back(op); + return op; + } case OP_RMS_NORM: { Op *op = RMSNorm::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -5205,6 +5212,42 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + // AddBiasResidualLayerNorm task + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, + "add_bias_residual_layernorm_init_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "add_bias_residual_layernorm_init_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + "add_bias_residual_layernorm_fwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::inference_task>( + registrar, "add_bias_residual_layernorm_inference_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // rms norm task { TaskVariantRegistrar registrar(RMSNROM_INIT_TASK_ID, "rmsnorm_init_task"); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 5f9ae98936..a983dcdb03 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -1,4 +1,5 @@ #include "flexflow/operator_params.h" +#include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/aggregate.h" #include "flexflow/ops/aggregate_spec.h" #include "flexflow/ops/arg_topk.h" @@ -93,6 +94,8 @@ tl::optional get_op_parameters(Op const *op) { return ((TreeIncMultiHeadSelfAttention *)op)->get_params(); case OP_LAYERNORM: return ((LayerNorm *)op)->get_params(); + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: + return ((AddBiasResidualLayerNorm *)op)->get_params(); case OP_REDUCE_SUM: return ((Reduce *)op)->get_params(); case OP_RESHAPE: diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 5071b5dd66..ae3be1222e 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -18,6 +18,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/graph.h" #include "flexflow/graph_structures.h" +#include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/aggregate.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/concat.h" @@ -3798,6 +3799,22 @@ bool FFModel::convert_graph_to_operators( new_op = new FusedParallelOp(*this, inputs[0], parallel_ops); break; } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(inList.size() == 2); + AddBiasResidualLayerNorm *abr_ln = (AddBiasResidualLayerNorm *)node.ptr; + AddBiasResidualLayerNormParams params = abr_ln->get_params(); + new_op = new AddBiasResidualLayerNorm(*this, + abr_ln->layer_guid, + inputs[0], + inputs[1], + abr_ln->axes, + abr_ln->elementwise_affine, + 
abr_ln->use_bias, + abr_ln->eps, + true, + NULL); + break; + } default: { new_op = node.ptr->materialize(*this, inputs, num_inputs); break; diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 072e8f2bdb..fee215f4c4 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -1,6 +1,7 @@ import argparse import json import os +import torch from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer def main(): @@ -39,12 +40,11 @@ def main(): # Set default tensor type depending on argument indicating the float type to use if not args.use_full_precision: - import torch - torch.set_default_tensor_type(torch.HalfTensor) # Run huggingface model - device = "cuda" if args.gpu else "cpu" + cuda_availble = torch.cuda.is_available() + device = "cuda" if args.gpu and cuda_availble else "cpu" # Get Model model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device) # Get Tokenizer From 322afa9d878b4b4a063e63a65f26d67f9c7a5aa0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 23 Sep 2023 17:14:25 -0400 Subject: [PATCH 230/344] Fuse inference kernels (part 2) (#1143) * fuse sigmoid,silu,multi in llama model * linting * fix * use new fused op * fix * fix * . * fuse rms norm kernels * cleanup * hip rocm fix * fix typo * add residual rms norm * fix * fix * add missing file * linting * fix * fix * fix * fix hip rocm * fix warning * removed deadcode * fix, apply fused op for rms norm and add * fix sigmoid-silu-multi with tensor parallelism * fix * fix * fix floating point issue * remove comment --- include/flexflow/ffconst.h | 2 + include/flexflow/flexflow_c.h | 14 + include/flexflow/model.h | 33 +- include/flexflow/operator_params.h | 4 + .../ops/kernels/residual_rms_norm_kernels.h | 56 ++ include/flexflow/ops/residual_rms_norm.h | 88 +++ .../flexflow/ops/residual_rms_norm_params.h | 29 + include/flexflow/ops/sigmoid_silu_multi.h | 86 +++ .../flexflow/ops/sigmoid_silu_multi_params.h | 24 + include/flexflow/substitution_loader.h | 2 + inference/file_loader.cc | 2 +- inference/models/falcon.cc | 183 ++++--- inference/models/llama.cc | 209 +++++--- inference/models/mpt.cc | 122 +++-- inference/models/opt.cc | 57 +- inference/models/starcoder.cc | 165 +++--- python/flexflow/core/flexflow_cffi.py | 49 ++ python/flexflow/serve/models/llama.py | 14 +- python/flexflow/serve/models/mpt.py | 25 +- python/flexflow/serve/models/opt.py | 27 +- python/flexflow/serve/models/starcoder.py | 20 +- python/flexflow/type.py | 2 + src/c/flexflow_c.cc | 40 ++ src/ops/add_bias_residual_layer_norm.cc | 24 +- src/ops/fused.cpp | 35 ++ src/ops/fused.cu | 35 ++ src/ops/kernels/residual_rms_norm_kernels.cpp | 227 ++++++++ src/ops/kernels/residual_rms_norm_kernels.cu | 224 ++++++++ src/ops/kernels/rms_norm_kernels.cpp | 85 ++- src/ops/kernels/rms_norm_kernels.cu | 93 +++- src/ops/layer_norm.cu | 6 +- src/ops/residual_rms_norm.cc | 505 ++++++++++++++++++ src/ops/rms_norm.cc | 8 +- src/ops/sampling.cpp | 2 +- src/ops/sigmoid_silu_multi.cc | 390 ++++++++++++++ src/ops/sigmoid_silu_multi.cpp | 118 ++++ src/ops/sigmoid_silu_multi.cu | 97 ++++ src/runtime/ffconst_utils.cc | 4 + src/runtime/graph.cc | 10 + src/runtime/model.cc | 150 +++--- src/runtime/operator_params.cc | 6 + src/runtime/substitution.cc | 10 + 42 files changed, 2741 insertions(+), 541 deletions(-) create mode 100644 include/flexflow/ops/kernels/residual_rms_norm_kernels.h create mode 100644 include/flexflow/ops/residual_rms_norm.h create 
mode 100644 include/flexflow/ops/residual_rms_norm_params.h create mode 100644 include/flexflow/ops/sigmoid_silu_multi.h create mode 100644 include/flexflow/ops/sigmoid_silu_multi_params.h create mode 100644 src/ops/kernels/residual_rms_norm_kernels.cpp create mode 100644 src/ops/kernels/residual_rms_norm_kernels.cu create mode 100644 src/ops/residual_rms_norm.cc create mode 100644 src/ops/sigmoid_silu_multi.cc create mode 100644 src/ops/sigmoid_silu_multi.cpp create mode 100644 src/ops/sigmoid_silu_multi.cu diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index f86cbff399..9dc68e21ed 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -160,9 +160,11 @@ enum OperatorType { OP_MEAN, // https://pytorch.org/docs/stable/generated/torch.mean.html OP_LAYERNORM, OP_ADD_BIAS_RESIDUAL_LAYERNORM, + OP_SIGMOID_SILU_MULTI, OP_EXPERTS, OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html OP_RMS_NORM, + OP_RESIDUAL_RMS_NORM, OP_BEAM_TOPK, OP_ARGMAX, OP_INC_MULTIHEAD_SELF_ATTENTION, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index e363ccf888..995c238a8c 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -270,6 +270,12 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool use_bias, char const *name); +flexflow_tensor_t + flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle, + const flexflow_tensor_t input1, + const flexflow_tensor_t input2, + char const *name); + flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle, const flexflow_tensor_t a, @@ -540,6 +546,14 @@ flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, int dim, char const *name); +flexflow_tensor_t * + flexflow_model_add_residual_rms_norm(flexflow_model_t handle_, + const flexflow_tensor_t input1_, + const flexflow_tensor_t input2_, + float eps, + int dim, + char const *name); + flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 05a12bee31..f88f96cd5a 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -108,6 +108,8 @@ enum TaskIDs { LAYERNORM_BWD_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + SIGMOID_SILU_MULTI_INIT_TASK_ID, + SIGMOID_SILU_MULTI_INF_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, @@ -153,9 +155,11 @@ enum TaskIDs { ATTENTION_INIT_TASK_ID, ATTENTION_FWD_TASK_ID, ATTENTION_BWD_TASK_ID, - RMSNROM_INIT_TASK_ID, - RMSNROM_FWD_TASK_ID, - RMSNROM_INF_TASK_ID, + RMSNORM_INIT_TASK_ID, + RMSNORM_FWD_TASK_ID, + RMSNORM_INF_TASK_ID, + RESIDUAL_RMSNORM_INIT_TASK_ID, + RESIDUAL_RMSNORM_INF_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, @@ -312,6 +316,7 @@ class Gather; class Group_by; class LayerNorm; class AddBiasResidualLayerNorm; +class SigmoidSiluMulti; class Linear; class MultiHeadAttention; class IncMultiHeadSelfAttention; @@ -325,6 +330,7 @@ class TopK; class ArgTopK; class Transpose; class RMSNorm; +class ResidualRMSNorm; class BeamTopK; class SpecIncMultiHeadSelfAttention; class Sampling; @@ -545,6 +551,11 @@ class FFModel { bool use_bias = true, DataType data_type = DT_NONE, char const *name = NULL); + // Add a sigmoid_silu_multi layer + Tensor sigmoid_silu_multi(const Tensor input1, + const Tensor input2, + DataType data_type = DT_NONE, + char const *name = 
NULL); // Add a batch_norm layer Tensor batch_norm(const Tensor input, bool relu = true, char const *name = NULL); @@ -560,6 +571,14 @@ class FFModel { int dim, DataType data_type = DT_NONE, char const *name = NULL); + // Add a residual root mean square layer + void residual_rms_norm(const Tensor input1, + const Tensor input2, + Tensor *outputs, + float eps, + int dim, + DataType data_type = DT_NONE, + char const *name = NULL); // Add a beam search top k layer Tensor beam_top_k(const Tensor input, int max_beam_size, @@ -1132,6 +1151,10 @@ class FFModel { std::pair, AddBiasResidualLayerNormParams>, AddBiasResidualLayerNorm *>, + std::unordered_map< + std::pair, + SigmoidSiluMultiParams>, + SigmoidSiluMulti *>, std::unordered_map, Linear *>, std::unordered_map, @@ -1170,6 +1193,10 @@ class FFModel { Transpose *>, std::unordered_map, RMSNorm *>, + std::unordered_map< + std::pair, + ResidualRMSNormParams>, + ResidualRMSNorm *>, std::unordered_map, Repartition *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 514c70f2ec..31f18049ff 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -26,8 +26,10 @@ #include "flexflow/ops/pool_2d_params.h" #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" +#include "flexflow/ops/residual_rms_norm_params.h" #include "flexflow/ops/rms_norm_params.h" #include "flexflow/ops/sampling_params.h" +#include "flexflow/ops/sigmoid_silu_multi_params.h" #include "flexflow/ops/softmax_params.h" #include "flexflow/ops/spec_inc_multihead_self_attention_params.h" #include "flexflow/ops/split_params.h" @@ -61,6 +63,7 @@ using OperatorParameters = mp::variant; + ResidualRMSNorm(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input1, + const ParallelTensor _input2, + float _eps, + int dim, + bool allocate_weights, + char const *name); + ResidualRMSNorm(FFModel &model, + ResidualRMSNormParams const ¶ms, + Input const &inputs, + bool allocate_weights, + char const *name = nullptr); + + ResidualRMSNorm(FFModel &model, + ResidualRMSNorm const &other, + Input const &inputs, + bool allocate_weights); + void init(FFModel const &) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + ResidualRMSNormParams get_params() const; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + 
MachineView const &pc, + CostMetrics &cost_metrics) const override; + +public: + float eps; + char op_name[MAX_OPNAME]; + int effective_batch_size; + int dim, data_dim; +}; +} // namespace FlexFlow +#endif // _FLEXFLOW_RESIDUAL_RMS_NORM_H diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h new file mode 100644 index 0000000000..64751a30b0 --- /dev/null +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_RESIDUAL_RMSNORM_PARAMS_H +#define _FLEXFLOW_RESIDUAL_RMSNORM_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct ResidualRMSNormParams { + LayerID layer_guid; + float eps; + int dim; + bool is_valid( + std::pair const &input) const; +}; + +bool operator==(ResidualRMSNormParams const &, ResidualRMSNormParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::ResidualRMSNormParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_RESIDUAL_RMSNORM_PARAMS_H \ No newline at end of file diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h new file mode 100644 index 0000000000..6a69288607 --- /dev/null +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -0,0 +1,86 @@ +#pragma once + +#include "flexflow/inference.h" +#include "flexflow/model.h" +#include "flexflow/utils/memory_allocator.h" +namespace FlexFlow { + +class SigmoidSiluMultiMeta; + +class SigmoidSiluMulti : public Op { +public: + using Params = SigmoidSiluMultiParams; + using Input = std::pair; + SigmoidSiluMulti(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name = nullptr); + SigmoidSiluMulti(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input1, + const ParallelTensor _input2, + char const *name = nullptr); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + + SigmoidSiluMultiParams get_params() const; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + template + static void inference_kernel(SigmoidSiluMultiMeta const *m, + int num_elements, + T const *input1_ptr, + T const *input2_ptr, + T *output_ptr, + ffStream_t stream); + static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &output); +}; + +class 
SigmoidSiluMultiMeta : public OpMeta { +public: + SigmoidSiluMultiMeta(FFHandler handle, + SigmoidSiluMulti const *ln, + MemoryAllocator &gpu_mem_allocator); + ~SigmoidSiluMultiMeta(void); + +public: + char op_name[MAX_OPNAME]; + Realm::RegionInstance reserveInst; +}; + +}; // namespace FlexFlow diff --git a/include/flexflow/ops/sigmoid_silu_multi_params.h b/include/flexflow/ops/sigmoid_silu_multi_params.h new file mode 100644 index 0000000000..c8182505b3 --- /dev/null +++ b/include/flexflow/ops/sigmoid_silu_multi_params.h @@ -0,0 +1,24 @@ +#pragma once + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct SigmoidSiluMultiParams { + LayerID layer_guid; + bool is_valid( + std::pair const &) const; +}; + +bool operator==(SigmoidSiluMultiParams const &, SigmoidSiluMultiParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::SigmoidSiluMultiParams const &) const; +}; +} // namespace std diff --git a/include/flexflow/substitution_loader.h b/include/flexflow/substitution_loader.h index 6dbb12e28b..ff2fd10446 100644 --- a/include/flexflow/substitution_loader.h +++ b/include/flexflow/substitution_loader.h @@ -126,7 +126,9 @@ NLOHMANN_JSON_SERIALIZE_ENUM( {OP_MEAN, "OP_MEAN"}, {OP_LAYERNORM, "OP_LAYERNORM"}, {OP_ADD_BIAS_RESIDUAL_LAYERNORM, "OP_ADD_BIAS_RESIDUAL_LAYERNORM"}, + {OP_SIGMOID_SILU_MULTI, "OP_SIGMOID_SILU_MULTI"}, {OP_RMS_NORM, "OP_RMS_NORM"}, + {OP_RESIDUAL_RMS_NORM, "OP_RESIDUAL_RMS_NORM"}, {OP_REPARTITION, "OP_PARTITION"}, {OP_COMBINE, "OP_COMBINE"}, {OP_REPLICATE, "OP_REPLICATE"}, diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 5b92f31552..dc724319d2 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -756,7 +756,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { for (Layer *l : ff->layers) { - if (l->numWeights < 1 || !l->name || strlen(l->name) < 1) { + if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { continue; } for (int i = 0; i < l->numWeights; i++) { diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 72cbd8d551..e89e22450c 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -45,44 +45,34 @@ void FALCON::create_falcon_model(FFModel &ff, input = ff.create_tensor<2>(token_dims, DT_INT32); } - Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - - Tensor token; std::vector axes = {0}; - if (use_full_precision) { - token = ff.embedding(input, - falcon_config.vocab_size, - falcon_config.hidden_size, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - } else { - token = ff.embedding(input, - falcon_config.vocab_size, - falcon_config.hidden_size, - AGGR_MODE_NONE, - DT_HALF, - NULL, - embed_init); - } - - Layer *embedding = ff.layers.back(); - weights_layers.emplace("word_embeddings_weight", embedding); + Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); + Tensor token = ff.embedding(input, + falcon_config.vocab_size, + falcon_config.hidden_size, + AGGR_MODE_NONE, + use_full_precision ? 
DT_FLOAT : DT_HALF, + NULL, + embed_init, + "word_embeddings"); for (int i = 0; i < falcon_config.n_layer; i++) { // set transformer layer id ff.set_transformer_layer_id(i); + // step 1: attention - Tensor att_norm = - ff.layer_norm(token, axes, true, falcon_config.layer_norm_epsilon); - Layer *attention_norm = ff.layers.back(); + std::string layer_name = "layers_" + std::to_string(i) + "_input_layernorm"; + Tensor att_norm = ff.layer_norm(token, + axes, + true, + falcon_config.layer_norm_epsilon, + true, + DT_NONE, + layer_name.c_str()); - weights_layers.emplace("layers_" + std::to_string(i) + - "_input_layernorm_weight", - attention_norm); Tensor mha; + layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multiquery_self_attention( @@ -92,13 +82,18 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - NULL, /*kernel_initializer*/ - true /*apply_rotary_embedding*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + NULL, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ ); break; } @@ -111,13 +106,18 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - true /*apply_rotary_embedding*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ ); break; } @@ -130,13 +130,18 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - true /*apply_rotary_embedding*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ ); break; } @@ -144,42 +149,58 @@ void FALCON::create_falcon_model(FFModel &ff, assert(false); } } - Layer *attention_layer = ff.layers.back(); - - // multi query - // weights_layers.emplace("layers_" + std::to_string(i) + - // "_self_attention_dense_weight", - // attention_layer); - - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", - attention_layer); - Tensor dense_h_to_4h = - ff.dense(att_norm, falcon_config.hidden_size * 4, AC_MODE_NONE, false); - Layer *dense_h_to_4h_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + - 
"_mlp_dense_h_to_4h_weight", - dense_h_to_4h_layer); + + layer_name = "layers_" + std::to_string(i) + "_mlp_dense_h_to_4h"; + Tensor dense_h_to_4h = ff.dense(att_norm, + falcon_config.hidden_size * 4, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); + dense_h_to_4h = ff.gelu(dense_h_to_4h); - Tensor mlp_output = - ff.dense(dense_h_to_4h, falcon_config.hidden_size, AC_MODE_NONE, false); - Layer *dense_4h_to_h_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + - "_mlp_dense_4h_to_h_weight", - dense_4h_to_h_layer); + + layer_name = "layers_" + std::to_string(i) + "_mlp_dense_4h_to_h"; + Tensor mlp_output = ff.dense(dense_h_to_4h, + falcon_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); token = ff.add(token, mha); token = ff.add(token, mlp_output); } // final normalization and linear - Tensor ln_f = - ff.layer_norm(token, axes, true, falcon_config.layer_norm_epsilon); - Layer *ln_f_layer = ff.layers.back(); - weights_layers.emplace("ln_f_weight", ln_f_layer); - - Tensor lm_head = - ff.dense(ln_f, falcon_config.vocab_size, AC_MODE_NONE, false); - Layer *lm_head_layer = ff.layers.back(); - weights_layers.emplace("lm_head_weight", lm_head_layer); + Tensor ln_f = ff.layer_norm(token, + axes, + true, + falcon_config.layer_norm_epsilon, + true, + DT_NONE, + "ln_f"); + + Tensor lm_head = ff.dense(ln_f, + falcon_config.vocab_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -200,7 +221,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.hidden_size, falcon_config.hidden_size / falcon_config.n_head, ff.config.tensor_parallelism_degree); - std::cout << "------laod weights ----------" << std::endl; + std::cout << "------load weights ----------" << std::endl; fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 463c96527b..c71755a3d3 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -48,41 +48,29 @@ void LLAMA::create_llama_model(FFModel &ff, Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - Tensor token; - - if (use_full_precision) { - token = ff.embedding(input, - llama_config.vocab_size, - llama_config.hidden_size, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - } else { - token = ff.embedding(input, - llama_config.vocab_size, - llama_config.hidden_size, - AGGR_MODE_NONE, - DT_HALF, - NULL, - embed_init); - } - - Layer *embedding = ff.layers.back(); - weights_layers.emplace("tok_embeddings_weight", embedding); + Tensor token = ff.embedding(input, + llama_config.vocab_size, + llama_config.hidden_size, + AGGR_MODE_NONE, + use_full_precision ? 
DT_FLOAT : DT_HALF, + NULL, + embed_init, + "tok_embeddings"); for (int i = 0; i < llama_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); + // step 1: attention - Tensor att_norm = - ff.rms_norm(token, llama_config.rms_norm_eps, llama_config.hidden_size); - Layer *attention_norm = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + - "_attention_norm_weight", - attention_norm); + std::string layer_name = "layers_" + std::to_string(i) + "_attention_norm"; + Tensor att_norm = ff.rms_norm(token, + llama_config.rms_norm_eps, + llama_config.hidden_size, + DT_NONE, + layer_name.c_str()); Tensor mha; + layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( @@ -91,13 +79,18 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - NULL, /*kernel_initializer*/ - true /*apply_rotary_embedding*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + NULL, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ ); break; } @@ -108,13 +101,18 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - true /*apply_rotary_embedding*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ ); break; } @@ -125,13 +123,18 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - true /*apply_rotary_embedding*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ ); break; } @@ -139,50 +142,82 @@ void LLAMA::create_llama_model(FFModel &ff, assert(false); } } - Layer *attention_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", - attention_layer); - token = ff.add(token, mha); // step 2: SILU activaion - Tensor ff_norm = - ff.rms_norm(token, llama_config.rms_norm_eps, llama_config.hidden_size); - Layer *ffn_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_ffn_norm_weight", - ffn_layer); - 
- Tensor w1 = - ff.dense(ff_norm, llama_config.intermediate_size, AC_MODE_NONE, false); - Layer *w1_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w1_weight", w1_layer); - - Tensor w3 = - ff.dense(ff_norm, llama_config.intermediate_size, AC_MODE_NONE, false); - Layer *w3_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w3_weight", w3_layer); - - Tensor sigmoid = ff.sigmoid(w1); - Tensor silu = ff.multiply(w1, sigmoid); - Tensor multi = ff.multiply(silu, w3); - - Tensor w2 = ff.dense(multi, llama_config.hidden_size, AC_MODE_NONE, false); - Layer *w2_layer = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_feed_forward_w2_weight", w2_layer); + layer_name = "layers_" + std::to_string(i) + "_ffn_norm"; + Tensor token_ff_norm[2]; + ff.residual_rms_norm(token, + mha, + token_ff_norm, + llama_config.rms_norm_eps, + llama_config.hidden_size, + DT_NONE, + layer_name.c_str()); + + token = token_ff_norm[0]; + Tensor ff_norm = token_ff_norm[1]; + + layer_name = "layers_" + std::to_string(i) + "_feed_forward_w1"; + Tensor w1 = ff.dense(ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); + + layer_name = "layers_" + std::to_string(i) + "_feed_forward_w3"; + Tensor w3 = ff.dense(ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); + + Tensor multi = ff.sigmoid_silu_multi(w1, w3); + + layer_name = "layers_" + std::to_string(i) + "_feed_forward_w2"; + Tensor w2 = ff.dense(multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); token = ff.add(token, w2); } // final normalization and linear std::vector axes = {2}; - token = - ff.rms_norm(token, llama_config.rms_norm_eps, llama_config.hidden_size); - Layer *final_norm = ff.layers.back(); - weights_layers.emplace("norm_weight", final_norm); - - Tensor dense = ff.dense(token, llama_config.vocab_size, AC_MODE_NONE, false); - Layer *final_linear = ff.layers.back(); - weights_layers.emplace("output_weight", final_linear); + token = ff.rms_norm(token, + llama_config.rms_norm_eps, + llama_config.hidden_size, + DT_NONE, + "norm"); + + Tensor dense = ff.dense(token, + llama_config.vocab_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + "output"); Tensor output; if (mode == BEAM_SEARCH_MODE) { diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 1ef15654b3..933d1a0b18 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -47,40 +47,26 @@ void MPT::create_mpt_model(FFModel &ff, Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); std::vector axes = {0}; - Tensor hidden_states; - if (use_full_precision) { - hidden_states = ff.embedding(input, - mpt_config.vocab_size, - mpt_config.hidden_size, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - } else { - hidden_states = ff.embedding(input, - mpt_config.vocab_size, - mpt_config.hidden_size, - AGGR_MODE_NONE, - DT_HALF, - NULL, - embed_init); - } - - Layer *embedding = ff.layers.back(); - weights_layers.emplace("transformer_wte_weight", embedding); + Tensor hidden_states = ff.embedding(input, + mpt_config.vocab_size, + mpt_config.hidden_size, + AGGR_MODE_NONE, + use_full_precision ? 
DT_FLOAT : DT_HALF, + NULL, + embed_init, + "transformer_wte"); for (int i = 0; i < mpt_config.n_layers; i++) { ff.set_transformer_layer_id(i); Tensor residual = hidden_states; - Tensor layernorm_output = - ff.layer_norm(hidden_states, axes, true, 1e-05, false); - Layer *norm_1 = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_norm_1_weight", - norm_1); + std::string layer_name = "layers_" + std::to_string(i) + "_norm_1"; + Tensor layernorm_output = ff.layer_norm( + hidden_states, axes, true, 1e-05, false, DT_NONE, layer_name.c_str()); Tensor attn_outputs; + layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case BEAM_SEARCH_MODE: { attn_outputs = ff.spec_inc_multihead_self_attention( @@ -100,7 +86,9 @@ void MPT::create_mpt_model(FFModel &ff, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, - /*position_bias*/ true); + /*position_bias*/ true, + layer_name.c_str() /*name*/ + ); break; } case TREE_VERIFY_MODE: { @@ -121,7 +109,9 @@ void MPT::create_mpt_model(FFModel &ff, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, - /*position_bias*/ true); + /*position_bias*/ true, + layer_name.c_str() /*name*/ + ); break; } case INC_DECODING_MODE: { @@ -142,7 +132,9 @@ void MPT::create_mpt_model(FFModel &ff, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, - /*position_bias*/ true); + /*position_bias*/ true, + layer_name.c_str() /*name*/ + ); break; } default: { @@ -150,45 +142,61 @@ void MPT::create_mpt_model(FFModel &ff, } } - Layer *attention_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", - attention_layer); - hidden_states = ff.add(attn_outputs, residual); - layernorm_output = ff.layer_norm(hidden_states, axes, true, 1e-05, false); - Layer *norm_2 = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_norm_2_weight", - norm_2); + + layer_name = "layers_" + std::to_string(i) + "_norm_2"; + layernorm_output = ff.layer_norm( + hidden_states, axes, true, 1e-05, false, DT_NONE, layer_name.c_str()); residual = hidden_states; // MLP - // output = self.ffn(layernorm_output, residual) - layernorm_output = ff.dense( - layernorm_output, 4 * mpt_config.hidden_size, AC_MODE_NONE, false); - Layer *up_proj = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_ffn_up_proj_weight", up_proj); + layer_name = "layers_" + std::to_string(i) + "_ffn_up_proj"; + layernorm_output = ff.dense(layernorm_output, + 4 * mpt_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); + layernorm_output = ff.gelu(layernorm_output); - Tensor intermediate_output = - ff.dense(layernorm_output, mpt_config.hidden_size, AC_MODE_NONE, false); - Layer *down_proj = ff.layers.back(); - weights_layers.emplace( - "layers_" + std::to_string(i) + "_ffn_down_proj_weight", down_proj); + + layer_name = "layers_" + std::to_string(i) + "_ffn_down_proj"; + Tensor intermediate_output = ff.dense(layernorm_output, + mpt_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); hidden_states = ff.add(intermediate_output, residual); } // final - Tensor all_final_norm = - ff.layer_norm(hidden_states, axes, true, 1e-05, false); - Layer *norm_f = ff.layers.back(); - 
weights_layers.emplace("transformer_norm_f_weight", norm_f); - - Tensor lm_head = - ff.dense(all_final_norm, mpt_config.vocab_size, AC_MODE_NONE, false); - Layer *lm_head_layer = ff.layers.back(); - weights_layers.emplace("lm_head_weight", lm_head_layer); + Tensor all_final_norm = ff.layer_norm( + hidden_states, axes, true, 1e-05, false, DT_NONE, "transformer_norm_f"); + + Tensor lm_head = ff.dense(all_final_norm, + mpt_config.vocab_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 5afef5e3a6..5f2494d0b2 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -50,47 +50,24 @@ void OPT::create_opt_model(FFModel &ff, Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); std::vector axes = {0}; - Tensor token; - if (use_full_precision) { - token = ff.embedding(input, - opt_config.vocab_size, - opt_config.word_embed_proj_dim, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init, - "embed_tokens"); - } else { - token = ff.embedding(input, - opt_config.vocab_size, - opt_config.word_embed_proj_dim, - AGGR_MODE_NONE, - DT_HALF, - NULL, - embed_init, - "embed_tokens"); - } + Tensor token = ff.embedding(input, + opt_config.vocab_size, + opt_config.word_embed_proj_dim, + AGGR_MODE_NONE, + use_full_precision ? DT_FLOAT : DT_HALF, + NULL, + embed_init, + "embed_tokens"); - Tensor positional_embedding; - if (use_full_precision) { - positional_embedding = ff.embedding(position_input, - opt_config.max_position_embeddings, - opt_config.hidden_size, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init, - "embed_positions"); - } else { - positional_embedding = ff.embedding(position_input, - opt_config.max_position_embeddings, - opt_config.hidden_size, - AGGR_MODE_NONE, - DT_HALF, - NULL, - embed_init, - "embed_positions"); - } + Tensor positional_embedding = + ff.embedding(position_input, + opt_config.max_position_embeddings, + opt_config.hidden_size, + AGGR_MODE_NONE, + use_full_precision ? 
DT_FLOAT : DT_HALF, + NULL, + embed_init, + "embed_positions"); Tensor residual = ff.add(token, positional_embedding); diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 982d58654b..f9659c7279 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -55,65 +55,43 @@ void STARCODER::create_starcoder_model( Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); - Tensor token; - - if (use_full_precision) { - token = ff.embedding(input, - startcoder_config.vocab_size, - startcoder_config.hidden_size, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - } else { - token = ff.embedding(input, - startcoder_config.vocab_size, - startcoder_config.hidden_size, - AGGR_MODE_NONE, - DT_HALF, - NULL, - embed_init); - } - - Layer *embedding = ff.layers.back(); - weights_layers.emplace("transformer_wte_weight", embedding); - - Tensor positional_embedding; - if (use_full_precision) { - positional_embedding = - ff.embedding(position_input, - startcoder_config.max_position_embeddings, - startcoder_config.hidden_size, - AGGR_MODE_NONE, - DT_FLOAT, - NULL, - embed_init); - } else { - positional_embedding = - ff.embedding(position_input, - startcoder_config.max_position_embeddings, - startcoder_config.hidden_size, - AGGR_MODE_NONE, - DT_HALF, - NULL, - embed_init); - } - Layer *pos_embedding = ff.layers.back(); - weights_layers.emplace("transformer_wpe_weight", pos_embedding); + Tensor token = ff.embedding(input, + startcoder_config.vocab_size, + startcoder_config.hidden_size, + AGGR_MODE_NONE, + use_full_precision ? DT_FLOAT : DT_HALF, + NULL, + embed_init, + "transformer_wte"); + + Tensor positional_embedding = + ff.embedding(position_input, + startcoder_config.max_position_embeddings, + startcoder_config.hidden_size, + AGGR_MODE_NONE, + use_full_precision ? 
DT_FLOAT : DT_HALF, + NULL, + embed_init, + "transformer_wpe"); Tensor hidden_states = ff.add(token, positional_embedding); for (int i = 0; i < startcoder_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); + // step 1: attention - Tensor ln_1 = ff.layer_norm( - hidden_states, axes, true, startcoder_config.layer_norm_epsilon); - Layer *layer_norm = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_ln_1_weight", - layer_norm); + std::string layer_name = "layers_" + std::to_string(i) + "_ln_1"; + Tensor ln_1 = ff.layer_norm(hidden_states, + axes, + true, + startcoder_config.layer_norm_epsilon, + true, + DT_NONE, + layer_name.c_str()); Tensor mha; + layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case INC_DECODING_MODE: { mha = ff.inc_multiquery_self_attention( @@ -131,7 +109,12 @@ void STARCODER::create_starcoder_model( false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - false /*apply_rotary_embedding*/ + false, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + layer_name.c_str() /*name*/ ); break; } @@ -139,43 +122,69 @@ void STARCODER::create_starcoder_model( assert(false); } } - Layer *attention_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_attention_weight", - attention_layer); + Tensor residual = ff.add(hidden_states, mha); - Tensor l2_norm = ff.layer_norm( - residual, axes, true, startcoder_config.layer_norm_epsilon); - Layer *l2_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_ln_2_weight", - l2_layer); + layer_name = "layers_" + std::to_string(i) + "_ln_2"; + Tensor l2_norm = ff.layer_norm(residual, + axes, + true, + startcoder_config.layer_norm_epsilon, + true, + DT_NONE, + layer_name.c_str()); // mlp - Tensor c_fc = ff.dense( - l2_norm, startcoder_config.intermediate_size, AC_MODE_NONE, true); - Layer *c_fc_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_mlp_c_fc_weight", - c_fc_layer); + layer_name = "layers_" + std::to_string(i) + "_mlp_c_fc"; + Tensor c_fc = ff.dense(l2_norm, + startcoder_config.intermediate_size, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); + c_fc = ff.gelu(c_fc); - Tensor c_proj = - ff.dense(c_fc, startcoder_config.hidden_size, AC_MODE_NONE, true); - Layer *c_proj_layer = ff.layers.back(); - weights_layers.emplace("layers_" + std::to_string(i) + "_mlp_c_proj_weight", - c_proj_layer); + layer_name = "layers_" + std::to_string(i) + "_mlp_c_proj"; + Tensor c_proj = ff.dense(c_fc, + startcoder_config.hidden_size, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + layer_name.c_str()); hidden_states = ff.add(residual, c_proj); } // final normalization and linear - Tensor ln_f = ff.layer_norm( - hidden_states, axes, true, startcoder_config.layer_norm_epsilon); - Layer *final_norm = ff.layers.back(); - weights_layers.emplace("transformer_ln_f_weight", final_norm); - - Tensor lm_head = - ff.dense(ln_f, startcoder_config.vocab_size, AC_MODE_NONE, false); - Layer *final_linear = ff.layers.back(); - weights_layers.emplace("lm_head_weight", final_linear); + Tensor ln_f = ff.layer_norm(hidden_states, + axes, + true, + startcoder_config.layer_norm_epsilon, + true, + DT_NONE, + "transformer_ln_f"); + + Tensor lm_head = ff.dense(ln_f, + 
startcoder_config.vocab_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 08b87856de..b2231b58e6 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -294,6 +294,13 @@ def get_weight_tensor(self): def get_bias_tensor(self): return self.get_parameter_by_id(2) +# ----------------------------------------------------------------------- +# SigmoidSiluMulti +# ----------------------------------------------------------------------- +class SigmoidSiluMulti(Op): + def __init__(self, handle, idx=None, name=None): + super(SigmoidSiluMulti, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Dropout # ----------------------------------------------------------------------- @@ -476,6 +483,13 @@ class RMSNorm(Op): def __init__(self, handle, idx=None, name=None): super(RMSNorm, self).__init__(handle, idx, name) +# ----------------------------------------------------------------------- +# Residual RMS Norm +# ----------------------------------------------------------------------- +class ResidualRMSNorm(Op): + def __init__(self, handle, idx=None, name=None): + super(ResidualRMSNorm, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # ArgTopK # ----------------------------------------------------------------------- @@ -572,6 +586,8 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): return LayerNorm(handle, idx, name) elif op_type == OpType.ADD_BIAS_RESIDUAL_LAYERNORM: return AddBiasResidualLayerNorm(handle, idx, name) + elif op_type == OpType.SIGMOID_SILU_MULTI: + return SigmoidSiluMulti(handle, idx, name) elif op_type == OpType.BATCH_MATMUL: return Batch_Matmul(handle, idx, name) elif op_type == OpType.SPLIT: @@ -594,6 +610,8 @@ def convert_op_handle_to_op(op_type, handle, idx=None, name=None): return TreeIncMultiHeadSelfAttention(handle, idx, name) elif op_type == OpType.RMS_NORM: return RMSNorm(handle, idx, name) + elif op_type == OpType.RESIDUAL_RMS_NORM: + return ResidualRMSNorm(handle, idx, name) elif op_type == OpType.ARG_TOPK: return ArgTopK(handle, idx, name) elif op_type == OpType.BEAM_TOPK: @@ -1598,6 +1616,12 @@ def add_bias_residual_layer_norm(self, input, residual, axes, elementwise_affine handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm(self.handle, input.handle, residual.handle, len(axes), c_axes, elementwise_affine, eps, use_bias, c_name) self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) return Tensor(handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) + + def sigmoid_silu_multi(self, input1, input2, name=None): + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid_silu_multi(self.handle, input1.handle, input2.handle, c_name) + self.add_layer(OpType.SIGMOID_SILU_MULTI, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) def batch_matmul(self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None): """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. 
@@ -2588,6 +2612,31 @@ def rms_norm(self, input, eps, dim, name=None): self.add_layer(OpType.RMS_NORM, name) return Tensor(handle, owner_op_type=OpType.RMS_NORM) + def residual_rms_norm(self, input1, input2, eps, dim, name=None): + """Defines the Residual RMS Norm layer. + + :param input1: the first input Tensor. + :type input1: Tensor + + :param input2: the second input Tensor. + :type input2: Tensor + + :param eps: a value added to the denominator for numerical stability + :type eps: float + + :param dim: The dimension with respect to which to take the norm + :type dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensors -- the residual sum tensor and the normalized output tensor. + """ + c_name = get_c_name(name) + handles_array = ffc().flexflow_model_add_residual_rms_norm(self.handle, input1.handle, input2.handle, eps, dim, c_name) + self.add_layer(OpType.RESIDUAL_RMS_NORM, name) + return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM) + def arg_top_k(self, input, k, sorted, name=None): """Defines the Arg TopK layer. diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index d1171cc3d3..5bf302f895 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -26,7 +26,11 @@ def __init__(self, hf_config): self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.num_attention_heads = hf_config.num_attention_heads - self.num_key_value_heads = hf_config.num_attention_heads if hf_config.num_key_value_heads is None else hf_config.num_key_value_heads + self.num_key_value_heads = ( + hf_config.num_attention_heads + if hf_config.num_key_value_heads is None + else hf_config.num_key_value_heads + ) self.hidden_size = hf_config.hidden_size self.rms_norm_eps = hf_config.rms_norm_eps self.intermediate_size = hf_config.intermediate_size @@ -166,9 +170,9 @@ def build_model(self): else: assert False - token = ffmodel.add(token, mha) - ff_norm = ffmodel.rms_norm( + token, ff_norm = ffmodel.residual_rms_norm( token, + mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, name=f"layers_{i}_ffn_norm", @@ -187,9 +191,7 @@ def build_model(self): False, name=f"layers_{i}_feed_forward_w3", ) - sigmoid = ffmodel.sigmoid(w1) - silu = ffmodel.multiply(w1, sigmoid) - multi = ffmodel.multiply(silu, w3) + multi = ffmodel.sigmoid_silu_multi(w1, w3) w2 = ffmodel.dense( multi, self.llama_config.hidden_size, diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 10353c5a96..8487a42c2a 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -30,6 +30,7 @@ def __init__(self, hf_config): hf_config.num_attention_heads = hf_config.n_heads hf_config.hidden_size = hf_config.d_model + class FlexFlowMPT(FlexFlowModel): def __init__( self, @@ -57,22 +58,15 @@ def __init__( self.maxint = 2**31 - 1 # Sanity checks - if ( - self.mpt_config.hidden_size - % self.mpt_config.n_heads - != 0 - ): + if self.mpt_config.hidden_size % self.mpt_config.n_heads != 0: raise ValueError( f"Hidden size ({self.mpt_config.hidden_size}) is not divisible by n_head ({self.mpt_config.n_heads})" ) # Sanity checks if ( - self.mpt_config.n_heads - < self.ffconfig.tensor_parallelism_degree - or self.mpt_config.n_heads - % self.ffconfig.tensor_parallelism_degree - != 0 + self.mpt_config.n_heads < self.ffconfig.tensor_parallelism_degree + or self.mpt_config.n_heads %
self.ffconfig.tensor_parallelism_degree != 0 ): raise ValueError( f"Number of attention heads ({self.mpt_config.n_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" @@ -131,7 +125,7 @@ def build_model(self): (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - True, # qk_prod_scaling + True, # qk_prod_scaling name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: @@ -152,7 +146,7 @@ def build_model(self): (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - True, # qk_prod_scaling + True, # qk_prod_scaling name=f"layers_{i}_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: @@ -173,7 +167,7 @@ def build_model(self): (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - True, # qk_prod_scaling + True, # qk_prod_scaling name=f"layers_{i}_attention", ) else: @@ -240,7 +234,7 @@ def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): name = name.replace("transformer.blocks.", "layers.").replace(".", "_") - if 'Wqkv' in name: + if "Wqkv" in name: name_q = name.replace("attn_Wqkv", "attention_wq") name_k = name.replace("attn_Wqkv", "attention_wk") name_v = name.replace("attn_Wqkv", "attention_wv") @@ -256,7 +250,7 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif 'out_proj' in name: + elif "out_proj" in name: name = name.replace("attn_out_proj", "attention_wo") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: @@ -266,6 +260,7 @@ def convert_hf_model(model, dst_folder): os.path.join(dst_folder, "transformer_wte_weight"), os.path.join(dst_folder, "lm_head_weight"), ) + def get_layers_with_weights(self): layer_names = [ "transformer_wte_weight", diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index d90dabad1d..5c7fff2dfe 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -196,22 +196,15 @@ def build_model(self): else: assert False - # residual = ffmodel.add(mha, residual) - # This is either a before or after attention LayerNorm. In both cases, we need to compute the LN here. 
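The fused add_bias_residual_layer_norm call in the opt.py hunk that follows returns two tensors: the raw sum (attention output, plus the attention output-projection bias that this patch folds into the op as a weight, plus the incoming residual) and the LayerNorm of that sum. A hedged single-token reference of that computation, with illustrative array arguments rather than FlexFlow types:

#include <cmath>
#include <cstddef>

// Reference semantics of AddBiasResidualLayerNorm over one token of width d
// (sketch only; argument names are hypothetical, not the FlexFlow kernel):
// added[j] = attn_out[j] + attn_bias[j] + residual[j]
// out[j]   = gamma[j] * (added[j] - mean) / sqrt(var + eps) + beta[j]
void add_bias_residual_layer_norm_ref(float const *attn_out,
                                      float const *attn_bias,
                                      float const *residual,
                                      float const *gamma, float const *beta,
                                      float eps, float *added, float *out,
                                      std::size_t d) {
  float mean = 0.0f;
  float var = 0.0f;
  for (std::size_t j = 0; j < d; ++j) {
    added[j] = attn_out[j] + attn_bias[j] + residual[j];
    mean += added[j];
  }
  mean /= static_cast<float>(d);
  for (std::size_t j = 0; j < d; ++j) {
    var += (added[j] - mean) * (added[j] - mean);
  }
  var /= static_cast<float>(d);
  for (std::size_t j = 0; j < d; ++j) {
    out[j] = gamma[j] * (added[j] - mean) / std::sqrt(var + eps) + beta[j];
  }
}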
- """ norm_name = ( - f"layers_{i}_final_layer_norm" - if self.opt_config.do_layer_norm_before - else f"layers_{i}_attention_layer_norm" - ) """ - # ff_norm = ffmodel.layer_norm( - # residual, - # axes, - # self.opt_config.layer_norm_elementwise_affine, - # 1e-05, - # name=norm_name, - # ) - residual, ff_norm = ffmodel.add_bias_residual_layer_norm(mha, residual, axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, name=f"layers_{i}_add_bias_residual_layer_norm") + residual, ff_norm = ffmodel.add_bias_residual_layer_norm( + mha, + residual, + axes, + self.opt_config.layer_norm_elementwise_affine, + 1e-05, + name=f"layers_{i}_add_bias_residual_layer_norm", + ) if not self.opt_config.do_layer_norm_before: residual = ff_norm @@ -287,7 +280,9 @@ def convert_hf_model(model, dst_folder): .replace("v_proj", "wv") .replace("out_proj", "wo") .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") - .replace("_final_layer_norm", "_add_bias_residual_layer_norm") # important to use the leading "_" to avoid matching the last LayerNorm + .replace( + "_final_layer_norm", "_add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm ) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 05594f81e5..6a80a31514 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -89,7 +89,7 @@ def __init__( raise ValueError( f"Number of k/v attention heads ({self.starcoder_config.n_head_kv}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - + self.build_model() def build_model(self): @@ -109,7 +109,7 @@ def build_model(self): self.data_type, None, embed_init, - name="transformer_wte_weight", + name="transformer_wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -119,7 +119,7 @@ def build_model(self): self.data_type, None, embed_init, - name="transformer_wpe_weight", + name="transformer_wpe", ) hidden_states = ffmodel.add(token, positional_embedding) @@ -135,7 +135,7 @@ def build_model(self): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1_weight", + name=f"layers_{i}_ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE @@ -155,7 +155,7 @@ def build_model(self): DataType.DT_NONE, # data_type None, # kernel initializer False, # apply_rotary_embedding - name=f"layers_{i}_attention_weight", + name=f"layers_{i}_attention", ) residual = ffmodel.add(mha, hidden_states) @@ -165,7 +165,7 @@ def build_model(self): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2_weight", + name=f"layers_{i}_ln_2", ) # mlp @@ -175,7 +175,7 @@ def build_model(self): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc_weight", + name=f"layers_{i}_mlp_c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -183,7 +183,7 @@ def build_model(self): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj_weight", + name=f"layers_{i}_mlp_c_proj", ) hidden_states = ffmodel.add(residual, c_proj) @@ -192,14 +192,14 @@ def build_model(self): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f_weight", + name=f"transformer_ln_f", ) lm_head = ffmodel.dense( ln_f, self.starcoder_config.vocab_size, ActiMode.AC_MODE_NONE, False, - 
name="lm_head_weight", + name="lm_head", ) if self.generation_config.do_sample: diff --git a/python/flexflow/type.py b/python/flexflow/type.py index a4785dba51..ec8a6bc432 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -149,6 +149,8 @@ class OpType(Enum): ARG_TOPK = 2301 BEAM_TOPK = 2302 ADD_BIAS_RESIDUAL_LAYERNORM = 2303 + SIGMOID_SILU_MULTI = 2304 + RESIDUAL_RMS_NORM = 2305 def enum_to_int(enum, enum_item): diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index dee030abee..0ebe29e3e9 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -685,6 +685,24 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( return tensor_outputs_wrapped; } +flexflow_tensor_t + flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle_, + const flexflow_tensor_t input1_, + const flexflow_tensor_t input2_, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + const Tensor input1 = FFCObjectWrapper::unwrap(input1_); + const Tensor input2 = FFCObjectWrapper::unwrap(input2_); + Tensor tensor = + handle->sigmoid_silu_multi(input1, input2, input1->data_type, name); + DEBUG_PRINT("[SigmoidSiluMulti] new Tensor %p, input1 %p, input2 %p, name %s", + tensor, + input1, + input2, + name); + return FFCObjectWrapper::wrap(tensor); +} + flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_, const flexflow_tensor_t a_, const flexflow_tensor_t b_, @@ -1386,6 +1404,28 @@ flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_tensor_t * + flexflow_model_add_residual_rms_norm(flexflow_model_t handle_, + const flexflow_tensor_t input1_, + const flexflow_tensor_t input2_, + float eps, + int dim, + char const *name) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + Tensor input1 = FFCObjectWrapper::unwrap(input1_); + Tensor input2 = FFCObjectWrapper::unwrap(input2_); + Tensor tensor_outputs[2]; + handle->residual_rms_norm( + input1, input2, tensor_outputs, eps, dim, input1->data_type, name); + assert(tensor_outputs[0] != nullptr); + assert(tensor_outputs[1] != nullptr); + flexflow_tensor_t *tensor_outputs_wrapped = + (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); + tensor_outputs_wrapped[0] = FFCObjectWrapper::wrap(tensor_outputs[0]); + tensor_outputs_wrapped[1] = FFCObjectWrapper::wrap(tensor_outputs[1]); + return tensor_outputs_wrapped; +} + flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index ea770f2ac7..159c82b346 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -123,19 +123,11 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, residual); // added: attn_output + final attention bias + residual. 
To be added to the // output of FC2 - ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, - input->dims, - input->data_type, - ln, - 0, - false /*create_grad*/); + ln->outputs[0] = create_tensor_legion_ordering( + input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); // layer_norm(added) - ln->outputs[1] = create_tensor_legion_ordering(input->num_dims, - input->dims, - input->data_type, - ln, - 0, - false /*create_grad*/); + ln->outputs[1] = create_tensor_legion_ordering( + input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -146,7 +138,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, int attn_bias_dims[1] = {dims[0]}; ln->weights[0] = create_weight_legion_ordering(1, attn_bias_dims, - input->data_type, + data_type, ln, false /*create_grad*/, nullptr, @@ -155,7 +147,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, assert(elementwise_affine); ln->weights[1] = create_weight_legion_ordering(numdims, dims, - input->data_type, + data_type, ln, false /*create_grad*/, nullptr, @@ -164,7 +156,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, assert(use_bias); ln->weights[2] = create_weight_legion_ordering(numdims, dims, - input->data_type, + data_type, ln, false /*create_grad*/, nullptr, @@ -818,6 +810,8 @@ namespace std { size_t hash::operator()( FlexFlow::AddBiasResidualLayerNormParams const ¶ms) const { size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.axes.size()); for (int n : params.axes) { hash_combine(key, n); diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index c2780545f3..f865c6dd2a 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -32,10 +32,12 @@ #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/residual_rms_norm_kernels.h" #include "flexflow/ops/kernels/rms_norm_kernels.h" #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/ops/layer_norm.h" +#include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" @@ -478,6 +480,16 @@ __host__ void FusedOp::forward_task(Task const *task, "the forward() task"); break; } + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(false && "Operator SigmoidSiluMulti does not support " + "the forward() task"); + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -813,6 +825,19 @@ __host__ void my_output_accessor[0]); break; } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); + break; + } case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -942,6 +967,16 @@ __host__ 
void beta); break; } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 011dd9be75..13927e8ee6 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -32,10 +32,12 @@ #include "flexflow/ops/kernels/linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/kernels/residual_rms_norm_kernels.h" #include "flexflow/ops/kernels/rms_norm_kernels.h" #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/ops/layer_norm.h" +#include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" @@ -491,6 +493,16 @@ __host__ void FusedOp::forward_task(Task const *task, "the forward() task"); break; } + case OP_SIGMOID_SILU_MULTI: { + assert(false && "Operator SigmoidSiluMulti does not support " + "the forward() task"); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); + break; + } default: { fprintf(stderr, "Fusion currently does not support type = %d\n", @@ -842,6 +854,19 @@ __host__ void my_output_accessor[0]); break; } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); + break; + } case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -973,6 +998,16 @@ __host__ void beta); break; } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp new file mode 100644 index 0000000000..6906556452 --- /dev/null +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -0,0 +1,227 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/kernels/residual_rms_norm_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/residual_rms_norm.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; +#define C10_WARP_SIZE 32 +constexpr int kCUDABlockReduceNumThreads = 512; +constexpr int kCUDANumThreads = 256; + +ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, + ResidualRMSNorm const *rms, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handler, rms) { + eps = rms->eps; + alpha = 1.0f; + beta = 0.0f; + + in_dim = rms->data_dim; + batch_size = rms->effective_batch_size; + num_elements = in_dim * batch_size; + + DataType data_type = rms->weights[0]->data_type; + size_t rms_ptr_size = batch_size; + size_t norm_ptr_size = num_elements; + size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + rms_ptr = gpu_mem_allocator.allocate_instance_untyped( + rms_ptr_size * data_type_size(data_type)); + norm_ptr = gpu_mem_allocator.allocate_instance_untyped( + norm_ptr_size * data_type_size(data_type)); +} +ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} +namespace Kernels { +namespace ResidualRMSNorm { + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, + unsigned int delta, + int width = warpSize, + unsigned int mask = 0xffffffff) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +template +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) + ? 
shared[lid] + : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +template +__global__ void ResidualRMSNormFusedForwardKernel(int64_t N, + float eps, + T const *X1, + T const *X2, + T *X_out, + T *rms, + T *Y, + T const *weights, + T *output) { + __shared__ float v_shared[C10_WARP_SIZE]; + int64_t const i = blockIdx.x; + float sum = 0.0f; + for (int64_t j = threadIdx.x; j < N; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + int64_t const index = i * N + j; + X_out[index] = X1[index] + X2[index]; + sum += + (static_cast(X_out[index]) * static_cast(X_out[index])); + } + sum = BlockReduceSum( + sum, + v_shared, + min(blockDim.x, + kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + + if (threadIdx.x == 0) { + rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); + } + + __syncthreads(); + + using T_ACC = T; + for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + const int64_t index = i * N + j; + Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); + output[index] = Y[index] * weights[index % N]; + } +} + +template +void forward_kernel(ResidualRMSNormMeta const *m, + T const *input1_ptr, + T const *input2_ptr, + T const *weight_ptr, + T *residual_output_ptr, + T *output_ptr, + hipStream_t stream) { + std::pair kernel1_parallelism = + std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); + std::pair kernel2_parallelism = + std::make_pair(m->batch_size, kCUDANumThreads); + + int num_blocks = + std::max(kernel1_parallelism.first, kernel2_parallelism.first); + int num_threads = + std::max(kernel1_parallelism.second, kernel2_parallelism.second); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualRMSNormFusedForwardKernel), + num_blocks, + num_threads, + 0, + stream, + m->in_dim, + m->eps, + input1_ptr, + input2_ptr, + residual_output_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); +} + +void forward_kernel_wrapper(ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +} // namespace ResidualRMSNorm +} // namespace Kernels +} // namespace 
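In summary, the fused HIP kernel above processes one row of width N per block: it materializes the residual sum, computes the row's root mean square with a block-wide reduction, then normalizes and applies the learned scale in a second pass. A serial host-side reference of the same math (a sketch mirroring ResidualRMSNormFusedForwardKernel, not a replacement for it):

#include <cmath>
#include <cstdint>

// Per-row reference of the fused residual RMS norm (illustrative sketch):
// x_out[j] = x1[j] + x2[j]
// inv_rms  = 1 / sqrt(mean_j(x_out[j]^2) + eps)
// out[j]   = (x_out[j] * inv_rms) * w[j]
void residual_rms_norm_row_ref(float const *x1, float const *x2,
                               float const *w, float eps, int64_t N,
                               float *x_out, float *out) {
  float sum_sq = 0.0f;
  for (int64_t j = 0; j < N; ++j) {
    x_out[j] = x1[j] + x2[j];
    sum_sq += x_out[j] * x_out[j];
  }
  float inv_rms = 1.0f / std::sqrt(sum_sq / static_cast<float>(N) + eps);
  for (int64_t j = 0; j < N; ++j) {
    out[j] = x_out[j] * inv_rms * w[j];
  }
}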
FlexFlow diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu new file mode 100644 index 0000000000..17ac14449b --- /dev/null +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -0,0 +1,224 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/residual_rms_norm_kernels.h" +#include "flexflow/ops/residual_rms_norm.h" +#include "flexflow/utils/cuda_helper.h" +#include + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +#define C10_WARP_SIZE 32 +constexpr int kCUDABlockReduceNumThreads = 512; +constexpr int kCUDANumThreads = 256; + +ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, + ResidualRMSNorm const *rms, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handler, rms) { + eps = rms->eps; + alpha = 1.0f; + beta = 0.0f; + + in_dim = rms->data_dim; + batch_size = rms->effective_batch_size; + num_elements = in_dim * batch_size; + + DataType data_type = rms->weights[0]->data_type; + size_t rms_ptr_size = batch_size; + size_t norm_ptr_size = num_elements; + size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + rms_ptr = gpu_mem_allocator.allocate_instance_untyped( + rms_ptr_size * data_type_size(data_type)); + norm_ptr = gpu_mem_allocator.allocate_instance_untyped( + norm_ptr_size * data_type_size(data_type)); +} +ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + +namespace Kernels { +namespace ResidualRMSNorm { + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, + unsigned int delta, + int width = warpSize, + unsigned int mask = 0xffffffff) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +template +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) + ? 
shared[lid] + : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +template +__global__ void ResidualRMSNormFusedForwardKernel(int64_t N, + float eps, + T const *X1, + T const *X2, + T *X_out, + T *rms, + T *Y, + T const *weights, + T *output) { + __shared__ float v_shared[C10_WARP_SIZE]; + int64_t const i = blockIdx.x; + float sum = 0.0f; + for (int64_t j = threadIdx.x; j < N; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + int64_t const index = i * N + j; + X_out[index] = X1[index] + X2[index]; + sum += + (static_cast(X_out[index]) * static_cast(X_out[index])); + } + sum = BlockReduceSum( + sum, + v_shared, + min(blockDim.x, + kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + + if (threadIdx.x == 0) { + rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); + } + + __syncthreads(); + + using T_ACC = T; + for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + const int64_t index = i * N + j; + Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); + output[index] = Y[index] * weights[index % N]; + } +} + +template +void forward_kernel(ResidualRMSNormMeta const *m, + T const *input1_ptr, + T const *input2_ptr, + T const *weight_ptr, + T *residual_output_ptr, + T *output_ptr, + cudaStream_t stream) { + + std::pair kernel1_parallelism = + std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); + std::pair kernel2_parallelism = + std::make_pair(m->batch_size, kCUDANumThreads); + + int num_blocks = + std::max(kernel1_parallelism.first, kernel2_parallelism.first); + int num_threads = + std::max(kernel1_parallelism.second, kernel2_parallelism.second); + + ResidualRMSNormFusedForwardKernel + <<>>(m->in_dim, + m->eps, + input1_ptr, + input2_ptr, + residual_output_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); +} + +void forward_kernel_wrapper(ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +} // namespace ResidualRMSNorm +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 3f4952b4a6..24ab7051e6 100644 --- 
a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -78,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared) { +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,7 +87,9 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) + ? shared[lid] + : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -95,41 +97,38 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void - RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { +__global__ void RMSNormFusedForwardKernel(int64_t N, + float eps, + T const *X, + T *rms, + T *Y, + T const *weights, + T *output) { __shared__ float v_shared[C10_WARP_SIZE]; - long long const i = blockIdx.x; + int64_t const i = blockIdx.x; float sum = 0.0f; - for (long long j = threadIdx.x; j < N; j += blockDim.x) { - long long const index = i * N + j; + for (int64_t j = threadIdx.x; j < N; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum(sum, - v_shared); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum( + sum, + v_shared, + min(blockDim.x, + kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); } -} -template -__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { + __syncthreads(); + using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void elewise_apply_weights(int64_t batch_size, - int64_t in_dim, - T const *norm, - T const *weights, - T *output) { - CUDA_KERNEL_LOOP(i, batch_size * in_dim) { - output[i] = norm[i] * weights[i % in_dim]; + Y[index] = static_cast(X[index]) * static_cast(rms[i]); + output[index] = Y[index] * weights[index % N]; } } @@ -139,33 +138,25 @@ void forward_kernel(RMSNormMeta const *m, T const *weight_ptr, T *output_ptr, hipStream_t stream) { - int parallelism = m->batch_size * m->in_dim; - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseRootMeanSquareKernel), - m->batch_size, - kCUDABlockReduceNumThreads, + std::pair kernel1_parallelism = + std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); + std::pair kernel2_parallelism = + std::make_pair(m->batch_size, kCUDANumThreads); + + int num_blocks = + std::max(kernel1_parallelism.first, kernel2_parallelism.first); + int num_threads = + std::max(kernel1_parallelism.second, kernel2_parallelism.second); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormFusedForwardKernel), + num_blocks, + num_threads, 0, stream, m->in_dim, m->eps, input_ptr, - static_cast(m->rms_ptr)); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(NormKernel), - m->batch_size, - kCUDANumThreads, - 0, - stream, - m->in_dim, - input_ptr, static_cast(m->rms_ptr), - static_cast(m->norm_ptr)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(elewise_apply_weights), - 
GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - m->batch_size, - m->in_dim, static_cast(m->norm_ptr), weight_ptr, output_ptr); diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 234bf73150..7c9f4a9f98 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -96,6 +96,26 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) + ? shared[lid] + : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +#ifdef DEADCODE template __global__ void RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { @@ -134,6 +154,43 @@ __global__ void elewise_apply_weights(int64_t batch_size, output[i] = norm[i] * weights[i % in_dim]; } } +#endif + +template +__global__ void RMSNormFusedForwardKernel(int64_t N, + float eps, + T const *X, + T *rms, + T *Y, + T const *weights, + T *output) { + __shared__ float v_shared[C10_WARP_SIZE]; + int64_t const i = blockIdx.x; + float sum = 0.0f; + for (int64_t j = threadIdx.x; j < N; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + int64_t const index = i * N + j; + sum += (static_cast(X[index]) * static_cast(X[index])); + } + sum = BlockReduceSum( + sum, + v_shared, + min(blockDim.x, + kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + + if (threadIdx.x == 0) { + rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); + } + + __syncthreads(); + + using T_ACC = T; + for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + const int64_t index = i * N + j; + Y[index] = static_cast(X[index]) * static_cast(rms[i]); + output[index] = Y[index] * weights[index % N]; + } +} template void forward_kernel(RMSNormMeta const *m, @@ -141,23 +198,25 @@ void forward_kernel(RMSNormMeta const *m, T const *weight_ptr, T *output_ptr, cudaStream_t stream) { - int parallelism = m->batch_size * m->in_dim; - RowwiseRootMeanSquareKernel - <<batch_size, kCUDABlockReduceNumThreads, 0, stream>>>( - m->in_dim, m->eps, input_ptr, static_cast(m->rms_ptr)); - NormKernel<<batch_size, kCUDANumThreads, 0, stream>>>( - m->in_dim, - input_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr)); - elewise_apply_weights<<>>(m->batch_size, - m->in_dim, - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + + std::pair kernel1_parallelism = + std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); + std::pair kernel2_parallelism = + std::make_pair(m->batch_size, kCUDANumThreads); + + int num_blocks = + std::max(kernel1_parallelism.first, kernel2_parallelism.first); + int num_threads = + std::max(kernel1_parallelism.second, kernel2_parallelism.second); + + RMSNormFusedForwardKernel + <<>>(m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(RMSNormMeta const *m, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 202a8837ff..4d04710b2a 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -88,7 +88,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { shared[wid] = val; } __syncthreads(); - val = 
(threadIdx.x < blockDim.x / C10_WARP_SIZE) ? shared[lid] : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -105,9 +105,9 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) ? shared[lid] - : 0; + : T(0); if (wid == 0) { val = WarpReduceSum(val); } diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc new file mode 100644 index 0000000000..3efb7274a0 --- /dev/null +++ b/src/ops/residual_rms_norm.cc @@ -0,0 +1,505 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/residual_rms_norm.h" +#include "flexflow/model.h" +#include "flexflow/ops/kernels/residual_rms_norm_kernels.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::ResidualRMSNorm; + +bool operator==(ResidualRMSNormParams const &lhs, + ResidualRMSNormParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps; +} + +bool ResidualRMSNormParams::is_valid( + std::pair const &input) const { + return input.first.is_valid() && input.second.is_valid(); +} + +ResidualRMSNormParams ResidualRMSNorm::get_params() const { + ResidualRMSNormParams params; + params.layer_guid = this->layer_guid; + params.eps = this->eps; + params.dim = this->dim; + return params; +} + +void FFModel::residual_rms_norm(const Tensor input1, + const Tensor input2, + Tensor *outputs, + float eps, + int dim, + DataType data_type, + char const *name) { + if (data_type == DT_NONE) { + data_type = input1->data_type; + } + Tensor casted_input1 = + (data_type != input1->data_type) + ? cast(input1, data_type, "type cast for residual_rms_norm") + : input1; + Tensor casted_input2 = + (data_type != input2->data_type) + ? 
cast(input2, data_type, "type cast for residual_rms_norm") + : input2; + Layer *rm = new Layer(this, + OP_RESIDUAL_RMS_NORM, + data_type, + name, + 2 /*inputs*/, + 1 /*weights*/, + 2 /*outputs*/, + casted_input1, + casted_input2); + + rm->outputs[0] = create_tensor_legion_ordering( + input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + rm->outputs[1] = create_tensor_legion_ordering( + input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + + // weights + int weight_dims[1] = {dim}; + rm->weights[0] = create_weight_legion_ordering(1, + weight_dims, + data_type, + rm, + true /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + + rm->add_float_property("eps", eps); + rm->add_int_property("dim", dim); + layers.push_back(rm); + outputs[0] = rm->outputs[0]; + outputs[1] = rm->outputs[1]; +} + +Op *ResidualRMSNorm::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + float eps; + layer->get_float_property("eps", eps); + long long value; + layer->get_int_property("dim", value); + int dim = value; + + return new ResidualRMSNorm(model, + layer->layer_guid, + inputs[0], + inputs[1], + eps, + dim, + false, + layer->name); +} + +ResidualRMSNorm::ResidualRMSNorm( + FFModel &model, + ResidualRMSNormParams const ¶ms, + std::pair const &inputs, + bool allocate_weights = false, + char const *name) + : ResidualRMSNorm(model, + params.layer_guid, + inputs.first, + inputs.second, + params.eps, + params.dim, + allocate_weights, + name) {} + +ResidualRMSNorm::ResidualRMSNorm( + FFModel &model, + ResidualRMSNorm const &other, + std::pair const &inputs, + bool allocate_weights) + : ResidualRMSNorm(model, + other.layer_guid, + inputs.first, + inputs.second, + other.eps, + other.dim, + allocate_weights, + other.name) {} +ResidualRMSNorm::ResidualRMSNorm(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input1, + const ParallelTensor _input2, + float _eps, + int dim, + bool allocate_weights, + char const *name) + : Op(model, + OP_RESIDUAL_RMS_NORM, + _input1->data_type, + name, + 2 /*num of inputs tensor */, + 1 /*num of weights tensor */, + 2 /*num of outputs tensor */, + _input1, + _input2) { + eps = _eps; + inputs[0] = _input1; + inputs[1] = _input2; + layer_guid = _layer_guid; + int num_dims = _input1->num_dims; + this->dim = dim; + data_dim = _input1->dims[0].size; + effective_batch_size = 1; + for (int i = 1; i <= num_dims - 2; i++) { + effective_batch_size *= _input1->dims[i].size; + } + // Currently assert that all non-replica dims are not parallelized + // We only support parallelism along the replica dim now + for (int i = 0; i < _input1->num_dims - 1; i++) { + assert(_input1->dims[i].degree == 1); + } + // Check that the two inputs have the same dimensions + for (int i = 0; i < _input1->num_dims; i++) { + assert(_input2->dims[i] == _input1->dims[i]); + } + // output has the same parallel dims as input + ParallelDim output_dims[MAX_TENSOR_DIM]; + for (int i = 0; i < _input1->num_dims; i++) { + output_dims[i] = _input1->dims[i]; + } + outputs[0] = model.create_parallel_tensor_legion_ordering(_input1->num_dims, + output_dims, + _input1->data_type, + this, + 0 /*owner_idx*/); + outputs[1] = model.create_parallel_tensor_legion_ordering(_input1->num_dims, + output_dims, + _input1->data_type, + this, + 1 /*owner_idx*/); + + if (allocate_weights) { + // weights should have the shape of (data_dim, data_dim) + ParallelDim new_weight_dims[MAX_TENSOR_DIM]; + + new_weight_dims[0].size = dim; + 
new_weight_dims[0].degree = 1; + new_weight_dims[0].parallel_idx = -1; + new_weight_dims[1] = _input1->dims[_input1->num_dims - 1]; // replica dim + + // weights + Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); + weights[0] = + model.create_parallel_weight_legion_ordering(2, + new_weight_dims, + _input1->data_type, + nullptr /*owner_op*/, + false /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } +} + +void ResidualRMSNorm::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(RESIDUAL_RMSNORM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ResidualRMSNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[1]->region)); + launcher.add_field(3, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void ResidualRMSNorm::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + + IndexLauncher launcher(RESIDUAL_RMSNORM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ResidualRMSNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(3, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +OpMeta *ResidualRMSNorm::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ResidualRMSNorm *rn = (ResidualRMSNorm *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + ResidualRMSNormMeta *meta = + new ResidualRMSNormMeta(handle, rn, gpu_mem_allocator); + return meta; +} + +void ResidualRMSNorm::forward(FFModel const &ff) { + assert(false); +} + +FutureMap + ResidualRMSNorm::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + + IndexLauncher launcher(RESIDUAL_RMSNORM_INF_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(3, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input1 + regions[1](I): input2 + regions[2](O): residual output + regions[3](O): output + regions[4](I/O): weight +*/ +void ResidualRMSNorm::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 5); + assert(regions.size() == 5); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); +} + +void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->eps); + sez.serialize(this->dim); +} + +using PCG::Node; +/*static*/ +Node ResidualRMSNorm::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + float eps; + size_t id, transformer_layer_id; + int dim; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + + LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(eps); + dez.deserialize(dim); + ResidualRMSNormParams params; + params.layer_guid = layer_guid; + params.eps = eps; + params.dim = dim; + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + +void ResidualRMSNorm::backward(FFModel const &ff) { + assert(false); 
+} +Op *ResidualRMSNorm::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + ResidualRMSNormParams params = get_params(); + return new ResidualRMSNorm( + ff, params, {inputs[0], inputs[1]}, true, this->name); +} + +bool ResidualRMSNorm::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +} // namespace FlexFlow +namespace std { +size_t hash::operator()( + FlexFlow::ResidualRMSNormParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.eps); + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.dim); + return key; +} +}; // namespace std diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index c3a4e9b58c..5b1634472d 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -208,7 +208,7 @@ void RMSNorm::init(FFModel const &ff) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(RMSNROM_INIT_TASK_ID, + IndexLauncher launcher(RMSNORM_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(RMSNorm)), argmap, @@ -253,7 +253,7 @@ void RMSNorm::init_inference(FFModel const &ff, size_t machine_view_hash = view->hash(); set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(RMSNROM_INIT_TASK_ID, + IndexLauncher launcher(RMSNORM_INIT_TASK_ID, parallel_is, TaskArgument(this, sizeof(RMSNorm)), argmap, @@ -305,7 +305,7 @@ void RMSNorm::forward(FFModel const &ff) { Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(RMSNROM_FWD_TASK_ID, + IndexLauncher launcher(RMSNORM_FWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -347,7 +347,7 @@ FutureMap RMSNorm::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RMSNROM_INF_TASK_ID, + IndexLauncher launcher(RMSNORM_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp index f76acc8e71..3d8f103524 100644 --- a/src/ops/sampling.cpp +++ b/src/ops/sampling.cpp @@ -173,7 +173,7 @@ void Sampling::forward_kernel_wrapper(SamplingMeta const *m, checkCUDA(hipEventRecord(t_start, stream)); } - handle_unimplemented_hip_kernel(OP_RMS_NORM); + handle_unimplemented_hip_kernel(OP_SAMPLING); if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc new file mode 100644 index 0000000000..031c7833a4 --- /dev/null +++ b/src/ops/sigmoid_silu_multi.cc @@ -0,0 +1,390 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/sigmoid_silu_multi.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +bool operator==(SigmoidSiluMultiParams const &lhs, + SigmoidSiluMultiParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid; +} + +bool SigmoidSiluMultiParams::is_valid( + std::pair const &input) const { + return input.first.is_valid() && input.second.is_valid(); +} + +SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { + SigmoidSiluMultiParams params; + params.layer_guid = this->layer_guid; + return params; +} + +Tensor FFModel::sigmoid_silu_multi(const Tensor input1, + const Tensor input2, + DataType data_type, + char const *name) { + + // Check dims + assert(input1->num_dims == input2->num_dims); + for (int i = 0; i < input1->num_dims; i++) { + assert(input1->dims[i] == input2->dims[i]); + } + // Tensor Data type + if (data_type == DT_NONE) { + data_type = input1->data_type; + assert(input2->data_type == input1->data_type); + } + Tensor casted_input1 = + (data_type != input1->data_type) + ? cast(input1, data_type, "type cast for sigmoid_silu_multi") + : input1; + Tensor casted_input2 = + (data_type != input2->data_type) + ? cast(input2, data_type, "type cast for sigmoid_silu_multi") + : input2; + + // Create layer + Layer *ssm = new Layer(this, + OP_SIGMOID_SILU_MULTI, + data_type, + name, + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + casted_input1, + casted_input2); + ssm->outputs[0] = create_tensor_legion_ordering( + input1->num_dims, input1->dims, data_type, ssm, 0, false /*create_grad*/); + layers.push_back(ssm); + return ssm->outputs[0]; +} + +Op *SigmoidSiluMulti::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + + return new SigmoidSiluMulti( + model, layer->layer_guid, inputs[0], inputs[1], layer->name); +} + +SigmoidSiluMulti::SigmoidSiluMulti( + FFModel &model, + SigmoidSiluMultiParams const ¶ms, + std::pair const &inputs, + char const *name) + : SigmoidSiluMulti( + model, params.layer_guid, inputs.first, inputs.second, name) {} + +SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input1, + const ParallelTensor _input2, + char const *name) + : Op(model, + OP_SIGMOID_SILU_MULTI, + _input1->data_type, + name, + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + _input1, + _input2) { + // overwrite layer_guid + layer_guid = _layer_guid; + outputs[0] = model.create_parallel_tensor_legion_ordering(_input1->num_dims, + _input1->dims, + _input1->data_type, + this, + 0 /*owner_idx*/); +} + +void SigmoidSiluMulti::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(SIGMOID_SILU_MULTI_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(SigmoidSiluMulti)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + // input 1 + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // output + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void SigmoidSiluMulti::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(SIGMOID_SILU_MULTI_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(SigmoidSiluMulti)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // input 1 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): input 1 + regions[1](I): input 2 + regions[2](O): output +*/ +OpMeta *SigmoidSiluMulti::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + SigmoidSiluMultiMeta *meta = + new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator); + meta->input_type[0] = ssm->inputs[0]->data_type; + meta->input_type[1] = ssm->inputs[1]->data_type; + meta->output_type[0] = ssm->outputs[0]->data_type; + return meta; +} + +void SigmoidSiluMulti::forward(FFModel const &ff) { + assert(false); +} + +void SigmoidSiluMulti::backward(FFModel const &ff) { + assert(false); +} + +FutureMap SigmoidSiluMulti::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + 
parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "SigmoidSiluMulti op machine_view: " << *(MachineView + const *)mv + << std::endl; */ + IndexLauncher launcher(SIGMOID_SILU_MULTI_INF_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // input 1 + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + // output + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input 1 + regions[1](I): input 2 + regions[2](O): output +*/ +void SigmoidSiluMulti::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 3); + + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + + SigmoidSiluMultiMeta const *m = *((SigmoidSiluMultiMeta **)task->local_args); + + GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + Domain input1_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain input2_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input1_domain.get_volume() == input2_domain.get_volume()); + assert(input1_domain.get_volume() == output_domain.get_volume()); + + assert(input1_domain == input2_domain); + assert(input1_domain == output_domain); + + SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); +} + +bool SigmoidSiluMulti::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); +} + +using PCG::Node; +/*static*/ +Node SigmoidSiluMulti::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + size_t id, transformer_layer_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); + + SigmoidSiluMultiParams params; + params.layer_guid = layer_guid; + return ff.get_or_create_node({inputs[0], inputs[1]}, + params); +} + +}; // namespace 
FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::SigmoidSiluMultiParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); + return key; +} +}; // namespace std diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp new file mode 100644 index 0000000000..fa73a55722 --- /dev/null +++ b/src/ops/sigmoid_silu_multi.cpp @@ -0,0 +1,118 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/sigmoid_silu_multi.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, + SigmoidSiluMulti const *ssm, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handle) { + profiling = ssm->profiling; +} + +SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + +__device__ __forceinline__ float sigmoid_float(float x) { + return 1.0 / (1.0 + expf(-x)); +} + +__device__ __forceinline__ half sigmoid_half(half x) { + return (half)1.0 / ((half)1.0 + hexp(-x)); +} + +__global__ void SigmoidSiluMultiKernelFloat(int num_elements, + float const *input1_ptr, + float const *input2_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + output_ptr[i] = + input1_ptr[i] * sigmoid_float(input1_ptr[i]) * input2_ptr[i]; + } +} + +__global__ void SigmoidSiluMultiKernelHalf(int num_elements, + half const *input1_ptr, + half const *input2_ptr, + half *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + output_ptr[i] = input1_ptr[i] * sigmoid_half(input1_ptr[i]) * input2_ptr[i]; + } +} + +/*static*/ +void SigmoidSiluMulti::inference_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = input1.domain.get_volume(); + assert(input2.domain.get_volume() == num_elements); + assert(output.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelFloat), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + input1.domain.get_volume(), + input1.get_float_ptr(), + input2.get_float_ptr(), + output.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelHalf), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + input1.domain.get_volume(), + input1.get_half_ptr(), + input2.get_half_ptr(), + output.get_half_ptr()); 
+ } else { + assert(false && "unsupported datatype in SigmoidSiluMulti"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] forward time (CF) = %.9fms\n", elapsed); + } +} + +}; // namespace FlexFlow diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu new file mode 100644 index 0000000000..fd69f6a8aa --- /dev/null +++ b/src/ops/sigmoid_silu_multi.cu @@ -0,0 +1,97 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/sigmoid_silu_multi.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, + SigmoidSiluMulti const *ssm, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handle) { + profiling = ssm->profiling; +} + +SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + +template <typename T> +__global__ void SigmoidSiluMultiKernel(int num_elements, + T const *input1_ptr, + T const *input2_ptr, + T *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float sigmoid_val = static_cast<float>(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + output_ptr[i] = input1_ptr[i] * T(sigmoid_val) * input2_ptr[i]; + } +} + +/*static*/ +void SigmoidSiluMulti::inference_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = input1.domain.get_volume(); + assert(input2.domain.get_volume() == num_elements); + assert(output.domain.get_volume() == num_elements); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiKernel<<<GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, stream>>>(input1.domain.get_volume(), + input1.get_float_ptr(), + input2.get_float_ptr(), + output.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiKernel<<<GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, stream>>>(input1.domain.get_volume(), + input1.get_half_ptr(), + input2.get_half_ptr(), + output.get_half_ptr()); + } else { + assert(false && "unsupported datatype in SigmoidSiluMulti"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] forward time (CF) = %.9fms\n", elapsed); + } +} + +}; // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc
index 2b1910637f..91b21e8d8f 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -172,8 +172,12 @@ std::string get_operator_type_name(OperatorType type) { return "LayerNorm"; case OP_ADD_BIAS_RESIDUAL_LAYERNORM: return "AddBiasResidualLayerNorm"; + case OP_SIGMOID_SILU_MULTI: + return "SigmoidSiluMulti"; case OP_RMS_NORM: return "RMSNorm"; + case OP_RESIDUAL_RMS_NORM: + return "ResidualRMSNorm"; case OP_GELU: return "GELU"; case OP_IDENTITY: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5a5e267d96..037be739e7 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -40,8 +40,10 @@ #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" +#include "flexflow/ops/residual_rms_norm.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/sampling.h" +#include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -2707,6 +2709,10 @@ void FFModel::deserialize_graph_optimal_view( *this, dez, inputs, num_inputs); break; } + case OP_SIGMOID_SILU_MULTI: { + node = SigmoidSiluMulti::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_LINEAR: { node = Linear::deserialize(*this, dez, inputs, num_inputs); break; @@ -2960,6 +2966,10 @@ void FFModel::deserialize_graph_optimal_view( node = RMSNorm::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_RESIDUAL_RMS_NORM: { + node = ResidualRMSNorm::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_COMBINE: { assert(num_inputs == 1); int combine_dim, combine_degree; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4f8caaa570..0cb50733a3 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -51,9 +51,11 @@ #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" +#include "flexflow/ops/residual_rms_norm.h" #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/sampling.h" +#include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -3107,11 +3109,23 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_SIGMOID_SILU_MULTI: { + Op *op = + SigmoidSiluMulti::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_RMS_NORM: { Op *op = RMSNorm::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; } + case OP_RESIDUAL_RMS_NORM: { + Op *op = + ResidualRMSNorm::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_LINEAR: { Op *op = Linear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3247,74 +3261,24 @@ void FFModel::create_operators_from_layers() { (l->op_type == OP_LINEAR && layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_GELU && layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion (l->op_type == OP_LINEAR && layer_idx >= 5 && layers[layer_idx - 1]->op_type == OP_EW_MUL && layers[layer_idx - 2]->op_type == OP_EW_MUL && layers[layer_idx - 3]->op_type == OP_SIGMOID && layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR))) { + layers[layer_idx - 5]->op_type == OP_LINEAR) || + // LLAMA with element-wise operator fusion + 
(l->op_type == OP_LINEAR && layer_idx >= 3 && + layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && + layers[layer_idx - 2]->op_type == OP_LINEAR && + layers[layer_idx - 3]->op_type == OP_LINEAR))) { assert(op->numOutputs == 1); AllReduce *allreduce = new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); operators.push_back(allreduce); op = allreduce; } -#ifdef DEADCODE - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (l->op_type == OP_LINEAR && layer_idx + 3 <= layers.size() && - layers[layer_idx + 1]->op_type == OP_RELU && - layers[layer_idx + 2]->op_type == OP_LINEAR) || - (l->op_type == OP_LINEAR && layer_idx + 6 <= layers.size() && - layers[layer_idx + 1]->op_type == OP_LINEAR && - layers[layer_idx + 2]->op_type == OP_SIGMOID && - layers[layer_idx + 3]->op_type == OP_EW_MUL && - layers[layer_idx + 4]->op_type == OP_EW_MUL && - layers[layer_idx + 5]->op_type == OP_LINEAR) || - (l->op_type == OP_LINEAR && layer_idx + 5 <= layers.size() && - layer_idx >= 1 && layers[layer_idx - 1]->op_type == OP_LINEAR && - layers[layer_idx + 1]->op_type == OP_SIGMOID && - layers[layer_idx + 2]->op_type == OP_EW_MUL && - layers[layer_idx + 3]->op_type == OP_EW_MUL && - layers[layer_idx + 4]->op_type == OP_LINEAR))) { - std::vector partitioned_inputs; - assert(inputs.size() == 1); - Replicate *repl = new Replicate(*this, - inputs[0], - inputs[0]->num_dims - 1, - config.tensor_parallelism_degree); - partitioned_inputs.push_back(repl->outputs[0]); - operators.push_back(repl); - op = create_operator_from_layer(l, partitioned_inputs); - } else { - op = create_operator_from_layer(l, inputs); - } - // Op *op = create_operator_from_layer(l, inputs); - // add reduce operators if needed - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_RELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR))) { - assert(op->numOutputs == 1); - Reduction *reduct = new Reduction(*this, - op->outputs[0], - op->outputs[0]->num_dims - 1, - config.tensor_parallelism_degree); - operators.push_back(reduct); - op = reduct; - } -#endif assert(op->numOutputs == l->numOutputs); for (int i = 0; i < op->numOutputs; i++) { tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; @@ -5248,9 +5212,42 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + // SigmoidSiluMulti task + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_INIT_TASK_ID, + "SigmoidSiluMulti Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_INF_TASK_ID, + "SigmoidSiluMulti Inference"); + 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // rms norm task { - TaskVariantRegistrar registrar(RMSNROM_INIT_TASK_ID, "rmsnorm_init_task"); + TaskVariantRegistrar registrar(RMSNORM_INIT_TASK_ID, "rmsnorm_init_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { @@ -5264,7 +5261,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(RMSNROM_FWD_TASK_ID, "rmsnorm_fwd_task"); + TaskVariantRegistrar registrar(RMSNORM_FWD_TASK_ID, "rmsnorm_fwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { @@ -5278,7 +5275,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(RMSNROM_INF_TASK_ID, "RMS Norm Inference"); + TaskVariantRegistrar registrar(RMSNORM_INF_TASK_ID, "RMS Norm Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { @@ -5291,6 +5288,39 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + // rms norm task + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_INIT_TASK_ID, + "Residual RMS Norm Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm Init"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_INF_TASK_ID, + "Residual RMS Norm Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index a983dcdb03..43f3dc7cf9 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -27,9 +27,11 @@ #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" +#include "flexflow/ops/residual_rms_norm.h" #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/sampling.h" +#include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" @@ -96,6 +98,8 @@ tl::optional get_op_parameters(Op const *op) { return ((LayerNorm *)op)->get_params(); case OP_ADD_BIAS_RESIDUAL_LAYERNORM: return ((AddBiasResidualLayerNorm *)op)->get_params(); + case OP_SIGMOID_SILU_MULTI: + return ((SigmoidSiluMulti *)op)->get_params(); case OP_REDUCE_SUM: return ((Reduce *)op)->get_params(); case OP_RESHAPE: @@ -130,6 +134,8 @@ tl::optional get_op_parameters(Op const *op) { return 
((AggregateSpec *)op)->get_params(); case OP_RMS_NORM: return ((RMSNorm *)op)->get_params(); + case OP_RESIDUAL_RMS_NORM: + return ((ResidualRMSNorm *)op)->get_params(); case OP_ARG_TOPK: return ((ArgTopK *)op)->get_params(); case OP_BEAM_TOPK: diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index ae3be1222e..8618c41129 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -33,7 +33,9 @@ #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" +#include "flexflow/ops/residual_rms_norm.h" #include "flexflow/ops/rms_norm.h" +#include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/split.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" @@ -3815,6 +3817,14 @@ bool FFModel::convert_graph_to_operators( NULL); break; } + case OP_SIGMOID_SILU_MULTI: { + assert(inList.size() == 2); + SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr; + SigmoidSiluMultiParams params = ssm->get_params(); + new_op = new SigmoidSiluMulti( + *this, ssm->layer_guid, inputs[0], inputs[1], NULL); + break; + } default: { new_op = node.ptr->materialize(*this, inputs, num_inputs); break; From f2f97117b4a81897b2e11af200a325584bc9848f Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Sat, 23 Sep 2023 22:26:27 -0400 Subject: [PATCH 231/344] Build ROCm Docker images on Oracle instance (#1144) * added oracle machine start/close and incorporate building rocm with oracle runner if needed in docker-build workflow * bug fix * delete building legion * bug fix * bug fix * cleanup * fix * rm unnecessary workflow * newline * fix * fix * fix * fix * fix * update docker skip workflow --------- Co-authored-by: Gabriele Oliaro --- .github/workflows/docker-build-skip.yml | 33 ++-- .github/workflows/docker-build.yml | 210 +++++++++++++++--------- .github/workflows/gpu-ci.yml | 6 +- .github/workflows/helpers/oracle_con.py | 38 +++++ 4 files changed, 187 insertions(+), 100 deletions(-) create mode 100644 .github/workflows/helpers/oracle_con.py diff --git a/.github/workflows/docker-build-skip.yml b/.github/workflows/docker-build-skip.yml index 59b584c6c4..02b703467c 100644 --- a/.github/workflows/docker-build-skip.yml +++ b/.github/workflows/docker-build-skip.yml @@ -13,27 +13,22 @@ concurrency: cancel-in-progress: true jobs: - docker-build: - name: Build and Install FlexFlow in a Docker Container - runs-on: ubuntu-20.04 + docker-build-rocm: + name: Build and Install FlexFlow in a Docker Container (ROCm backend) + runs-on: ubuntu-latest strategy: matrix: - gpu_backend: ["cuda", "hip_rocm"] - cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"] - # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported - exclude: - - gpu_backend: "hip_rocm" - cuda_version: "11.1" - - gpu_backend: "hip_rocm" - cuda_version: "11.2" - - gpu_backend: "hip_rocm" - cuda_version: "11.3" - - gpu_backend: "hip_rocm" - cuda_version: "11.5" - - gpu_backend: "hip_rocm" - cuda_version: "11.6" - - gpu_backend: "hip_rocm" - cuda_version: "11.7" + hip_version: ["5.3", "5.4", "5.5", "5.6"] + fail-fast: false + steps: + - run: 'echo "No docker-build required"' + + docker-build-cuda: + name: Build and Install FlexFlow in a Docker Container (CUDA backend) + runs-on: ubuntu-latest + strategy: + matrix: + cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", 
"11.6", "11.7", "11.8", "12.0"] fail-fast: false steps: - run: 'echo "No docker-build required"' diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 899de4664e..655310e141 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -9,9 +9,9 @@ on: branches: - "inference" - "master" - schedule: - # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated - - cron: "0 8 * * 0" + # schedule: + # # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated + # - cron: "0 8 * * 0" workflow_dispatch: # Cancel outdated workflows if they are still running @@ -20,120 +20,174 @@ concurrency: cancel-in-progress: true jobs: - docker-build: - name: Build and Install FlexFlow in a Docker Container - runs-on: ubuntu-20.04 + oracle-runner-start: + name: Start a self-hosted Oracle machine to build the ROCM Docker images + runs-on: ubuntu-latest + if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + env: + OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} + OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} + OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} + OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} + OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} + OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }} + steps: + - name: Checkout Git Repository + uses: actions/checkout@v3 + + - name: Install Oracle Cloud Infrastructure library + run: pip install oci + + - name: Start Oracle Machine + run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID + + # 1. in docker-build cases: if there are push/ workflow_dispatch to inference, + # the job should be run on oracle machine to build rocm and 11.8 cuda version + # 2. 
add a job to turn off the oracle machine if not needed + rocm_runner_choice: + name: Determine what runner to use to build the ROCm Docker image(s) + runs-on: ubuntu-latest + outputs: + rocm_runner: ${{ steps.step1.outputs.runner }} + steps: + - name: Determine the runner + id: step1 + env: + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + run: | + if [ $deploy_needed == "true" ]; then + echo "::set-output name=runner::[self-hosted, cpu_only]" + else + echo "::set-output name=runner::ubuntu-20.04" + fi + + docker-build-rocm: + needs: rocm_runner_choice + name: Build and Install FlexFlow in a Docker Container (ROCm backend) + runs-on: ${{ needs.rocm_runner_choice.outputs.rocm_runner }} strategy: matrix: - gpu_backend: ["cuda", "hip_rocm"] - gpu_backend_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0", "5.3", "5.4", "5.5", "5.6"] - # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported - exclude: - - gpu_backend: "cuda" - gpu_backend_version: "5.3" - - gpu_backend: "cuda" - gpu_backend_version: "5.4" - - gpu_backend: "cuda" - gpu_backend_version: "5.5" - - gpu_backend: "cuda" - gpu_backend_version: "5.6" - - gpu_backend: "hip_rocm" - gpu_backend_version: "11.1" - - gpu_backend: "hip_rocm" - gpu_backend_version: "11.2" - - gpu_backend: "hip_rocm" - gpu_backend_version: "11.3" - - gpu_backend: "hip_rocm" - gpu_backend_version: "11.4" - - gpu_backend: "hip_rocm" - gpu_backend_version: "11.5" - - gpu_backend: "hip_rocm" - gpu_backend_version: "11.6" - - gpu_backend: "hip_rocm" - gpu_backend_version: "11.7" - - gpu_backend: "hip_rocm" - gpu_backend_version: "11.8" - - gpu_backend: "hip_rocm" - gpu_backend_version: "12.0" + hip_version: ["5.3", "5.4", "5.5", "5.6"] fail-fast: false env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} - gpu_backend_version: ${{ matrix.gpu_backend_version }} - # one of the two variables below will be unused - cuda_version: ${{ matrix.gpu_backend_version }} - hip_version: ${{ matrix.gpu_backend_version }} - branch_name: ${{ github.head_ref || github.ref_name }} - timeout-minutes: 480 + FF_GPU_BACKEND: "hip_rocm" + hip_version: ${{ matrix.hip_version }} steps: - name: Checkout Git Repository + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }} uses: actions/checkout@v3 with: submodules: recursive - name: Free additional space on runner - env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} - build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }} - run: | - if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then - .github/workflows/helpers/free_space_on_runner.sh - else - echo "Skipping this step to save time" - fi + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }} + run: .github/workflows/helpers/free_space_on_runner.sh - name: Build Docker container + if: ${{ ( ( github.event_name 
== 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }} env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} - build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + build_needed: ${{ matrix.hip_version == '5.6' }} run: | # On push to inference, build for all compatible architectures, so that we can publish # a pre-built general-purpose image. On all other cases, only build for one architecture # to save time. if [[ $deploy_needed == "true" ]] ; then - export FF_CUDA_ARCH=all export FF_HIP_ARCH=all ./docker/build.sh flexflow elif [[ $build_needed == "true" ]]; then - export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 ./docker/build.sh flexflow - else - echo "Skipping build to save time" fi - name: Check availability of flexflow modules in Python - if: ${{ matrix.gpu_backend == 'cuda' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }} + run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" + + - name: Publish Docker environment image (on push to inference) + if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} - build_needed: ${{ ( matrix.gpu_backend == 'hip_rocm' && matrix.gpu_backend_version == '5.6' ) || ( matrix.gpu_backend == 'cuda' && matrix.gpu_backend_version == '11.8' ) }} + FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} run: | - if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then - if [[ $FF_GPU_BACKEND == "cuda" ]]; then - docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - else - docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${gpu_backend_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - fi - else - echo "Skipping test to save time" + ./docker/publish.sh flexflow-environment + ./docker/publish.sh flexflow + + docker-build-cuda: + name: Build and Install FlexFlow in a Docker Container (CUDA backend) + runs-on: ubuntu-20.04 + strategy: + matrix: + cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"] + fail-fast: false + env: + FF_GPU_BACKEND: "cuda" + cuda_version: ${{ matrix.cuda_version }} + steps: + - name: Checkout Git Repository + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || 
github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }} + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Free additional space on runner + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }} + run: .github/workflows/helpers/free_space_on_runner.sh + + - name: Build Docker container + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }} + env: + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + build_needed: ${{ matrix.cuda_version == '11.8' }} + run: | + # On push to inference, build for all compatible architectures, so that we can publish + # a pre-built general-purpose image. On all other cases, only build for one architecture + # to save time. + if [[ $deploy_needed == "true" ]] ; then + export FF_CUDA_ARCH=all + ./docker/build.sh flexflow + elif [[ $build_needed == "true" ]]; then + export FF_CUDA_ARCH=86 + ./docker/build.sh flexflow fi + - name: Check availability of flexflow modules in Python + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }} + run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" + - name: Publish Docker environment image (on push to inference) - if: github.repository_owner == 'flexflow' + if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && env.branch_name == 'inference' }} run: | - if [[ $deploy_needed == "true" ]]; then - ./docker/publish.sh flexflow-environment - ./docker/publish.sh flexflow - else - echo "No need to update Docker containers in ghrc.io registry at this time." 
- fi + ./docker/publish.sh flexflow-environment + ./docker/publish.sh flexflow + + oracle-runner-stop: + needs: docker-build-rocm + if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + runs-on: ubuntu-latest + name: Turn off the self-hosted Oracle machine where we built the ROCM Docker images + env: + OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} + OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} + OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} + OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} + OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} + OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }} + steps: + - name: Checkout Git Repository + uses: actions/checkout@v3 + + - name: Install Oracle Cloud Infrastructure library + run: pip install oci + + - name: Stop Oracle Machine + run: python3 docker/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID notify-slack: name: Notify Slack in case of failure runs-on: ubuntu-20.04 - needs: docker-build + needs: [docker-build-cuda, docker-build-rocm] if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }} steps: - name: Send Slack message diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index d604a7cea9..aee16832f3 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -61,7 +61,7 @@ jobs: python-interface-check: name: Check Python Interface - runs-on: self-hosted + runs-on: [self-hosted, gpu] defaults: run: shell: bash -l {0} # required to use an activated conda environment @@ -135,7 +135,7 @@ jobs: inference-tests: name: Inference Tests - runs-on: self-hosted + runs-on: [self-hosted, gpu] defaults: run: shell: bash -l {0} # required to use an activated conda environment @@ -210,7 +210,7 @@ jobs: gpu-ci-flexflow: name: Single Machine, Multiple GPUs Tests - runs-on: self-hosted + runs-on: [self-hosted, gpu] # skip this time-consuming test for PRs to the inference branch # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} defaults: diff --git a/.github/workflows/helpers/oracle_con.py b/.github/workflows/helpers/oracle_con.py new file mode 100644 index 0000000000..4fd41930b7 --- /dev/null +++ b/.github/workflows/helpers/oracle_con.py @@ -0,0 +1,38 @@ +import oci +import argparse +import os + +parser = argparse.ArgumentParser(description="Program with optional flags") +group = parser.add_mutually_exclusive_group() +group.add_argument("--start", action="store_true", help="Start action") +group.add_argument("--stop", action="store_true", help="Stop action") +parser.add_argument("--instance_id", type=str, required=True, help="instance id required") +args = parser.parse_args() + +oci_key_content = os.getenv("OCI_CLI_KEY_CONTENT") + +config = { + "user": os.getenv("OCI_CLI_USER"), + "key_content": os.getenv("OCI_CLI_KEY_CONTENT"), + "fingerprint": os.getenv("OCI_CLI_FINGERPRINT"), + "tenancy": os.getenv("OCI_CLI_TENANCY"), + "region": os.getenv("OCI_CLI_REGION") +} + +# Initialize the OCI configuration +# config = oci.config.from_file() +oci.config.validate_config(config) + +# Initialize the ComputeClient to interact with VM instances +compute = oci.core.ComputeClient(config) + +# Replace 'your_instance_id' with the actual instance ID of your VM +instance_id = args.instance_id + +# Perform the action +if args.start: + # Start the VM + compute.instance_action(instance_id, "START") +else: + # Stop the VM + 
compute.instance_action(instance_id, "STOP") From 48cca2bf61c4cfe82b44adee375fe8de2ce3479a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 23 Sep 2023 22:31:55 -0400 Subject: [PATCH 232/344] fix (#1147) Co-authored-by: Zhihao Jia --- python/flexflow/serve/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 7531c006a8..2c6395aca1 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -141,6 +141,7 @@ def init( configs_dict = { "num_gpus": num_gpus, "memory_per_gpu": memory_per_gpu, + "num_cpus": num_cpus, "zero_copy_memory_per_node": zero_copy_memory_per_node, "legion_utility_processors": legion_utility_processors, "data_parallelism_degree": data_parallelism_degree, @@ -174,6 +175,8 @@ def init( __check_positive_int(configs_dict, param) # Set default values + if configs_dict.get("num_cpus", None) is None: + configs_dict["num_cpus"] = 4 if configs_dict.get("legion_utility_processors", None) is None: configs_dict["legion_utility_processors"] = 8 if configs_dict.get("data_parallelism_degree", None) is None: From 02326e04f0c8e574d00515b473cc9bebcfcd1862 Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Sun, 24 Sep 2023 00:28:11 -0400 Subject: [PATCH 233/344] Docker workflow cleanup (#1148) --- .github/workflows/docker-build.yml | 3 --- .github/workflows/helpers/oracle_con.py | 1 - 2 files changed, 4 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 655310e141..cce6b5e74f 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -41,9 +41,6 @@ jobs: - name: Start Oracle Machine run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID - # 1. in docker-build cases: if there are push/ workflow_dispatch to inference, - # the job should be run on oracle machine to build rocm and 11.8 cuda version - # 2. 
add a job to turn off the oracle machine if not needed rocm_runner_choice: name: Determine what runner to use to build the ROCm Docker image(s) runs-on: ubuntu-latest diff --git a/.github/workflows/helpers/oracle_con.py b/.github/workflows/helpers/oracle_con.py index 4fd41930b7..0891d66e99 100644 --- a/.github/workflows/helpers/oracle_con.py +++ b/.github/workflows/helpers/oracle_con.py @@ -20,7 +20,6 @@ } # Initialize the OCI configuration -# config = oci.config.from_file() oci.config.validate_config(config) # Initialize the ComputeClient to interact with VM instances From 191df5dcc821d45317723415e587a9df49a8ee67 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 24 Sep 2023 14:58:32 -0400 Subject: [PATCH 234/344] fix oracle instance script --- .github/workflows/docker-build.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index cce6b5e74f..4009f3c338 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -21,7 +21,7 @@ concurrency: jobs: oracle-runner-start: - name: Start a self-hosted Oracle machine to build the ROCM Docker images + name: Start an Oracle instance to build the ROCM Docker images runs-on: ubuntu-latest if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: @@ -30,7 +30,7 @@ jobs: OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} - OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }} + OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }} steps: - name: Checkout Git Repository uses: actions/checkout@v3 @@ -163,14 +163,14 @@ jobs: needs: docker-build-rocm if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} runs-on: ubuntu-latest - name: Turn off the self-hosted Oracle machine where we built the ROCM Docker images + name: Stop the Oracle instance we used to build the ROCM Docker images env: OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} - OCI_INSTANCE_ID: ${{ secrets.INSTANCE_ID_FFCI }} + OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }} steps: - name: Checkout Git Repository uses: actions/checkout@v3 @@ -179,7 +179,7 @@ jobs: run: pip install oci - name: Stop Oracle Machine - run: python3 docker/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID + run: python3 .github/workflows/helpers/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID notify-slack: name: Notify Slack in case of failure From dfbd0fbfee616b3924828ba78c422459d3a8e7b5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 24 Sep 2023 15:40:57 -0400 Subject: [PATCH 235/344] fix --- .github/workflows/docker-build.yml | 72 +++++++++++++----------------- 1 file changed, 32 insertions(+), 40 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 4009f3c338..db6553cd45 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -41,27 +41,34 @@ jobs: - name: Start Oracle Machine run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID - 
rocm_runner_choice: - name: Determine what runner to use to build the ROCm Docker image(s) - runs-on: ubuntu-latest - outputs: - rocm_runner: ${{ steps.step1.outputs.runner }} - steps: - - name: Determine the runner - id: step1 - env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} - run: | - if [ $deploy_needed == "true" ]; then - echo "::set-output name=runner::[self-hosted, cpu_only]" - else - echo "::set-output name=runner::ubuntu-20.04" - fi - docker-build-rocm: - needs: rocm_runner_choice name: Build and Install FlexFlow in a Docker Container (ROCm backend) - runs-on: ${{ needs.rocm_runner_choice.outputs.rocm_runner }} + runs-on: ubuntu-20.04 + if: ${{ ( github.event_name != 'push' && github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' ) || github.ref_name != 'inference' }} + env: + FF_GPU_BACKEND: "hip_rocm" + hip_version: 5.6 + steps: + - name: Checkout Git Repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Free additional space on runner + run: .github/workflows/helpers/free_space_on_runner.sh + + - name: Build Docker container + run: FF_HIP_ARCH="gfx1100,gfx1036" ./docker/build.sh flexflow + + - name: Check availability of flexflow modules in Python + run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" + + + docker-build-and-publish-rocm: + name: Build and Deploy FlexFlow Docker Containers (ROCm backend) + needs: oracle-runner-start + runs-on: [self-hosted, cpu_only] + if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} strategy: matrix: hip_version: ["5.3", "5.4", "5.5", "5.6"] @@ -71,38 +78,23 @@ jobs: hip_version: ${{ matrix.hip_version }} steps: - name: Checkout Git Repository - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }} uses: actions/checkout@v3 with: submodules: recursive - name: Free additional space on runner - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }} run: .github/workflows/helpers/free_space_on_runner.sh - name: Build Docker container - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }} - env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} - build_needed: ${{ matrix.hip_version == '5.6' }} - run: | - # On push to inference, build for all compatible architectures, so that we can publish - # a pre-built general-purpose image. On all other cases, only build for one architecture - # to save time. - if [[ $deploy_needed == "true" ]] ; then - export FF_HIP_ARCH=all - ./docker/build.sh flexflow - elif [[ $build_needed == "true" ]]; then - export FF_HIP_ARCH=gfx1100,gfx1036 - ./docker/build.sh flexflow - fi + # On push to inference, build for all compatible architectures, so that we can publish + # a pre-built general-purpose image. 
On all other cases, only build for one architecture + # to save time. + run: FF_HIP_ARCH=all ./docker/build.sh flexflow - name: Check availability of flexflow modules in Python - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.hip_version == '5.6' }} run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - name: Publish Docker environment image (on push to inference) - if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} run: | @@ -160,7 +152,7 @@ jobs: ./docker/publish.sh flexflow oracle-runner-stop: - needs: docker-build-rocm + needs: docker-build-and-publish-rocm if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} runs-on: ubuntu-latest name: Stop the Oracle instance we used to build the ROCM Docker images @@ -184,7 +176,7 @@ jobs: notify-slack: name: Notify Slack in case of failure runs-on: ubuntu-20.04 - needs: [docker-build-cuda, docker-build-rocm] + needs: [docker-build-cuda, docker-build-and-publish-rocm] if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }} steps: - name: Send Slack message From 5958971bc4b980e750b0a6e35be9359f1d3403d3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 24 Sep 2023 16:07:01 -0400 Subject: [PATCH 236/344] fix --- .github/workflows/docker-build.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index db6553cd45..05c94c7e84 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -82,9 +82,6 @@ jobs: with: submodules: recursive - - name: Free additional space on runner - run: .github/workflows/helpers/free_space_on_runner.sh - - name: Build Docker container # On push to inference, build for all compatible architectures, so that we can publish # a pre-built general-purpose image. On all other cases, only build for one architecture From 0a56d0170f1c8bed1c866eb33f480aa5ff57769c Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Mon, 25 Sep 2023 12:49:35 -0400 Subject: [PATCH 237/344] [SpecInfer] Update RequestManager (#1096) * Reorder pipeline. * refactor and small fixes. * Update * Refactor backup. * pipeline update. * Format. * fix * . * fix * fix * fix. * Fix reloading new request with long prompts. * Fix edge cases. * Fix edge case * fix * try a fix to CI * . * fix * Fix: clean up code and fix decoding_steps. * try 1 try * fix: allow parse 0 tokens for pending request. * format. * remove comment tests * remove print. * fix decoding steps * . * quick fix. * remove debugging prints. * fix store_beam_metadata. 
* hip --------- Co-authored-by: Zeyu Wang Co-authored-by: Zeyu Wang Co-authored-by: xinhaoc Co-authored-by: xinhaoc Co-authored-by: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Co-authored-by: Zhihao Jia --- include/flexflow/batch_config.h | 5 +- include/flexflow/model.h | 5 +- .../flexflow/ops/kernels/softmax_kernels.h | 2 + include/flexflow/request_manager.h | 14 +- inference/incr_decoding/incr_decoding.cc | 6 +- inference/spec_infer/spec_infer.cc | 6 +- src/c/flexflow_c.cc | 4 +- src/mapper/mapper.cc | 2 +- src/ops/argmax.cpp | 2 +- src/ops/argmax.cu | 4 +- src/ops/kernels/softmax.cpp | 5 +- src/ops/kernels/softmax.cu | 6 +- src/ops/spec_inc_multihead_self_attention.cu | 8 +- src/runtime/request_manager.cc | 1035 ++++++++++------- 14 files changed, 645 insertions(+), 459 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ce331d3e41..8aa69a3cad 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -46,13 +46,14 @@ class BatchConfig { void print() const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); - static int const MAX_NUM_REQUESTS = 1; + static int const MAX_NUM_REQUESTS = 4; static int const MAX_NUM_TOKENS = 64; static int const MAX_PROMPT_LENGTH = 62; static int const MAX_SEQ_LENGTH = 256; // These are set by update int num_tokens; + bool loading_prompt = false; struct PerRequestInfo { int token_start_offset; @@ -69,6 +70,7 @@ class BatchConfig { PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; + bool request_running[MAX_NUM_TOKENS]; }; class TreeVerifyBatchConfig : public BatchConfig { @@ -113,7 +115,6 @@ class BeamSearchBatchConfig : public BatchConfig { inline static int const MAX_BEAM_DEPTH = 8; int model_id; - int max_init_length = 0; struct BeamSearchPerRequestInfo { int beam_size; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index f88f96cd5a..177575e809 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -239,8 +239,8 @@ enum TaskIDs { RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, - RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, + RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, @@ -787,7 +787,8 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::string const &text, int max_seq_length); + GenerationResult generate(std::vector &prompts, + int max_seq_length); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 14c07414e9..987a546459 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -15,8 +15,10 @@ class SoftmaxMeta : public OpMeta { Legion::Domain const &input_domain); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor; + cudnnTensorDescriptor_t outputTensor; #else miopenTensorDescriptor_t inputTensor; + miopenTensorDescriptor_t outputTensor; #endif bool profiling; int dim; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e444402dd0..8515d8a04b 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -52,13 +52,17 
@@ class InferenceManager { struct Request { enum Status { - PENDING = 101, - RUNNING = 102, - COMPLETED = 103, + PENDING = 101, // loading prompt + RUNNING = 102, // running inference + COMPLETED = 103, // finished and verified + FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; int max_sequence_length; int initial_len; + int ssm_cache_size = 0; + int llm_cache_size = 0; + Status status = PENDING; std::vector tokens; @@ -102,10 +106,10 @@ class RequestManager { FFModel *get_model(int model_id); GenerationResult generate_incr_decoding(FFModel *model, - std::string const &text, + std::vector &prompts, int max_seq_length); GenerationResult generate_spec_infer(FFModel *model, - std::string const &text, + std::vector &prompts, int max_seq_length); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid register_new_request(std::string const &prompt, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 19cd8726e2..3f913e4573 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -242,13 +242,15 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); + std::vector prompts; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - GenerationResult result = - model.generate(text, 128 /*max_sequence_length*/); + prompts.push_back(text); } + GenerationResult result = + model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9d139997f7..2b1fb6e817 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -384,12 +384,16 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); + + std::vector prompts; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - tree_model.generate(text, 128 /*max_sequence_length*/); + prompts.push_back(text); + // tree_model.generate(text, 128 /*max_sequence_length*/); } + tree_model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 0ebe29e3e9..fcdae9cf33 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1529,8 +1529,10 @@ flexflow_generation_result_t int max_seq_length, int *output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::vector prompts; std::string const text_str(input_text); - GenerationResult result = handle->generate(text_str, max_seq_length); + prompts.push_back(input_text); + GenerationResult result = handle->generate(prompts, max_seq_length); DEBUG_PRINT("[Model] generate %p %s %i", handle, text_str, max_seq_length); assert(result.output_tokens.size() <= max_seq_length); output_length_and_tokens[0] = result.output_tokens.size(); diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index 3d08eb0bcc..a86a6167a6 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -284,8 +284,8 @@ void FFMapper::select_task_options(const MapperContext ctx, return; } if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) || - (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || 
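  // With this patch, FFModel::generate() accepts the whole batch of prompts at
  // once; the incr_decoding and spec_infer drivers shown above now collect
  // every prompt first and issue a single call, e.g. (prompt contents and the
  // 128-token limit are illustrative):
  //   std::vector<std::string> prompts = {"prompt 1", "prompt 2"};
  //   GenerationResult result = model.generate(prompts, 128 /*max_sequence_length*/);
  // rather than calling generate() once per prompt.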
(task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID)) { output.initial_proc = all_cpus[0]; return; diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index ec5ea6c36a..8a1cf0b3b0 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -393,7 +393,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, if (m->beam_search) { // set all parents id zero in arg top1 case. - checkCUDA(hipMemset(parent, 0, batch_size * sizeof(int))); + checkCUDA(hipMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); } int num_shards = 0; int k = 1; diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 37e067006c..05c84719c1 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -59,7 +59,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, DT alpha = 1.0f, beta = 0.0f; if (m->beam_search) { // set all parents id zero in arg top1 case. - checkCUDA(cudaMemset(parent, 0, batch_size * sizeof(int))); + checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); } size_t temp_storage_bytes = m->temp_storage_bytes; // use cub @@ -83,6 +83,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, prob_ptr, batch_size, m->beam_search); + // print_tensor(indices_ptr, 32, "argmax op"); } /*static*/ @@ -93,7 +94,6 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, int batch_size) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index bd8b46116d..ca4872d51b 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -29,6 +29,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); + checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); @@ -127,7 +130,7 @@ void forward_kernel(SoftmaxMeta const *m, m->inputTensor, input_ptr, &beta, - m->inputTensor, + m->outputTensor, output_ptr, MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_MODE_CHANNEL)); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 15130c19a7..67a9c21038 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -28,6 +28,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); + checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); @@ -42,7 +45,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, DT *output_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -127,7 +129,7 @@ void forward_kernel(SoftmaxMeta const *m, m->inputTensor, input_ptr, &beta, - m->inputTensor, + m->outputTensor, output_ptr)); } diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 6ef5145654..b4cdc77e2a 100644 --- 
a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -251,6 +251,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (bc->request_completed[i]) { continue; } + for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { // int num_new_tokens = bc->num_processing_tokens[i]; @@ -259,6 +260,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].token_start_offset + bc->requestsInfo[i].num_tokens_in_batch; + + if (num_new_tokens <= 0) { + continue; + } + // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; int n = total_tokens; @@ -543,7 +549,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); } - assert(tokens_previous_requests == num_tokens); + // assert(tokens_previous_requests == num_tokens); } template diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5eb3192e25..5489c9b06d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -143,17 +143,12 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - if (prompt.size() > BatchConfig::MAX_PROMPT_LENGTH) { + if (prompt.size() >= BatchConfig::MAX_SEQ_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " - << BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " + << BatchConfig::MAX_SEQ_LENGTH << " tokens, but got " << prompt.size() << ".\n"; - // Truncate the prompt to MAX_NUM_TOKENS - // request.tokens.insert(request.tokens.end(), - // prompt.begin(), - // prompt.begin() + BatchConfig::MAX_PROMPT_LENGTH); - // request.initial_len = BatchConfig::MAX_PROMPT_LENGTH; + printf("tokens size: %zu\n", request.tokens.size()); - // assert(false); return 0; } else { request.initial_len = prompt.size(); @@ -206,14 +201,12 @@ RequestManager::RequestGuid request.tokens.push_back(bos_token_id); } std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() > BatchConfig::MAX_PROMPT_LENGTH) { + if (tokens.size() >= BatchConfig::MAX_SEQ_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " - << BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " + << BatchConfig::MAX_SEQ_LENGTH << " tokens, but got " << tokens.size() << ".\n"; - // Truncate the prompt to MAX_NUM_TOKENS - // tokens.resize(BatchConfig::MAX_PROMPT_LENGTH); + printf("tokens size: %zu\n", tokens.size()); - // assert(false); return 0; } for (int i = 0; i < tokens.size(); i++) { @@ -238,6 +231,7 @@ RequestManager::RequestGuid all_requests[request.guid] = request; { std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; for (int i = 0; i < request.tokens.size(); i++) { output = output + " " + std::to_string(request.tokens[i]); } @@ -467,149 +461,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } /* ----- Speculative Inference Specific functions ----- */ -BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( - BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); - - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager 
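  // Note on the register_new_request() changes above: the length check now
  // compares against BatchConfig::MAX_SEQ_LENGTH instead of MAX_PROMPT_LENGTH,
  // and a prompt at or over that length is rejected rather than truncated: the
  // call returns guid 0. Callers are expected to test for this, e.g.
  // (128 is an illustrative max_sequence_length):
  //   RequestManager::RequestGuid guid = rm->register_new_request(text, 128);
  //   if (guid == 0) { /* request was discarded because the prompt is too long */ }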
*))); - launcher.add_future(old_bc); - launcher.add_future(result); - return runtime->execute_task(ctx, launcher); -} - -BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - BeamInferenceResult const &result = - Future(task->futures[1]).get_result(); - return rm->prepare_next_batch_beam(bc, result); -} - -// update beam search metadata -BeamSearchBatchConfig - RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); - if (verbose) { - std::cout << "\n############### prepare_next_batch_beam ###############\n"; - } - if (verbose) { - std::cout << "print all results" - << "\n"; - for (int i = 0; i < 40; i++) { - std::cout << result.token_ids[i] << ", "; - } - std::cout << "Current Beam Depth: " - << old_bc.beamRequestsInfo[0].current_depth << "\n"; - } - - // Step 1: Store result to the beam tree struct - store_beam_metadata(old_bc, result); - - // Step 2: preparing the next batch for existing requests - BeamSearchBatchConfig new_bc; - new_bc.max_init_length = 0; - new_bc.model_id = old_bc.model_id; - // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; - - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { - if (old_bc.request_completed[i]) { - continue; - } - // Comment out this assertion since num_tokens_in_batch can be - // zero when beam search has reached required sequence length - // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].token_start_offset + - old_bc.requestsInfo[i].num_tokens_in_batch; - - // assert(processed_tokens < request.tokens.size()); - log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - if (processed_tokens > - old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() - // || ir.results[t] == 0 TODO: replace this with - ) { - log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - old_bc.requestsInfo[i].request_guid, - old_bc.beamRequestsInfo[i].max_depth); - // new_bc.request_completed[i] = true; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - } else { - log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " - << new_bc.num_tokens; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - - // update the beam search metadata - // how many sub request in current request - // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? 
- new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - // update the parentid, accumalated_probs, depth, and token_ids - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; - new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; - new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; - - // do the slot exchange to minimize the cache exchange in kernel. - // std::cout << "update metadata" << std::endl; - update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); - - if (new_bc.requestsInfo[i].token_start_offset + 1 >= - request.tokens.size()) { - // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - } else { - // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); - } - - // register more tokens due to the beam width - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - - // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_bc.beamRequestsInfo[i].tokens[k]; - // request.tokens[depth]; - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; - new_bc.num_tokens++; - } - } - } - } - if (verbose) { - std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" - << std::endl; - old_bc.print(); - new_bc.print(); - } - return new_bc; -} +/***** Request Init Phase *****/ BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init( TreeVerifyBatchConfigFuture const &old_bc, InferenceResultFuture const &result, @@ -648,6 +501,9 @@ BeamSearchBatchConfig if (verbose) { std::cout << "\n############### prepare_next_batch_init ###############\n"; } + + std::cout << "\n############### prepare_next_batch_init ###############\n"; + // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; @@ -661,188 +517,226 @@ BeamSearchBatchConfig size_t guid = old_bc.requestsInfo[i].request_guid; Request &request = all_requests[guid]; + std::cout << "[ " << guid << " ]" << std::endl; + // Verify this: get verified tokens from result std::vector> tree_outputs = std::vector>(); assert(old_bc.num_tokens > 0); - int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - if (committed_tokens.find(guid) == committed_tokens.end()) { - committed_tokens[guid] = std::vector>(); + // reset committed_tokens + if (committed_tokens.count(guid) == 0) { + committed_tokens[guid] = {}; } else { - committed_tokens.at(guid).clear(); + committed_tokens[guid].clear(); } + // iterate through all the tokens that belong to request i + int root_abs_depth = request.tokens.size() - 1; + while (result_index < old_bc.num_tokens && old_bc.tokensInfo[result_index].request_index == i) { - // new tokens have not been appended yet, so the last appended token is - // the root of the beam search token tree - int root_abs_depth = request.tokens.size() - 1; - if (old_bc.tokensInfo[result_index].abs_depth_in_request >= - root_abs_depth) { - // append to tree_outputs a pair consisting of (token id, depth) - tree_outputs.push_back(std::make_pair( - result.token_ids[result_index], - 
old_bc.tokensInfo[result_index].abs_depth_in_request + 1)); - // append (depth, index of the token in result) to committed_tokens - // array - committed_tokens.at(guid).push_back( - std::make_pair(old_bc.tokensInfo[result_index].abs_depth_in_request, - result_index)); + int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; + int token_id = result.token_ids[result_index]; + + if (request.status == Request::PENDING) { + committed_tokens[guid].emplace_back(abs_depth, result_index); + } else if (abs_depth >= root_abs_depth) { + tree_outputs.emplace_back(token_id, abs_depth + 1); + committed_tokens[guid].emplace_back(abs_depth, result_index); if (verbose) { std::cout << "Index within old batch: " << result_index << std::endl; printf(" Input: [%d] %d ---> [%d] %d \n", - old_bc.tokensInfo[result_index].abs_depth_in_request, + abs_depth, old_bc.tokensInfo[result_index].token_id, tree_outputs.back().second, - tree_outputs.back().first); + token_id); } - // std::cout << " Input: " << old_bc.tokensInfo[result_index].token_id - // << "" - // << old_bc.tokensInfo[result_index].abs_depth_in_request << - // std::endl; - // std::cout << " Result: " << result.token_ids[result_index] << ", - // depth: " - // << old_bc.tokensInfo[result_index].abs_depth_in_request + 1 << - // std::endl; + std::cout << "Index within old batch: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + abs_depth, + old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, + token_id); } result_index++; } - std::vector> verified_tokens = - traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); - log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); - // check if the request is finished - if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { - // Append all verified tokens to the request - for (int j = 0; j < verified_tokens.size(); j++) { - if (verified_tokens[j].second < request.max_sequence_length) { - request.tokens.push_back(verified_tokens[j].first); + if (request.status == Request::RUNNING) { + std::vector> verified_tokens = + traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + log_req_mgr.print("Number of Verified Tokens = %zu", + verified_tokens.size()); + + // check if the request is finished + if (verified_tokens.size() + request.tokens.size() >= + request.max_sequence_length) { + // Append all verified tokens to the request + for (auto const &token_pair : verified_tokens) { + if (token_pair.second < request.max_sequence_length) { + request.tokens.push_back(token_pair.first); + } } - } - request.status = Request::COMPLETED; - log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", - request.guid, - request.tokens.size()); - std::string output = this->tokenizer_->Decode(request.tokens); - { - // update generation result and trigger future - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - log_req_mgr.print("Final output: %s", output.c_str()); - new_bc.request_completed[i] = true; - num_processed_requests++; - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - 
"finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); - - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; + request.status = Request::COMPLETED; + log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", + request.guid, + request.tokens.size()); + std::string output = this->tokenizer_->Decode(request.tokens); + { + // update generation result and trigger future + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.output_tokens = request.tokens; + gr.output_text = output; + } + log_req_mgr.print("Final output: %s", output.c_str()); + + new_bc.request_completed[i] = true; + new_bc.request_running[i] = false; + num_processed_requests++; + + // Log profiling info + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); + + // Write output to file if needed: + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath); + if (outputFile.is_open()) { + outputFile << "end-to-end latency: " << std::fixed + << std::setprecision(3) + << profile_info.finish_time - profile_info.start_time + << std::endl; + outputFile << "num decoding steps: " << profile_info.decoding_steps + << std::endl; + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } } + outputFile << std::endl; + outputFile << output; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); } - outputFile << std::endl; - outputFile << output; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); } - } - // delete the old input tree from cache - dfs_tree_inputs.erase(request.guid); + // delete the old input tree from cache + dfs_tree_inputs.erase(request.guid); - continue; - } + } else { // Request not finished, pass verified_tokens to next iteration - new_bc.request_completed[i] = false; - - // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - - // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = 
new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].token_start_offset - - verified_tokens.size(); - new_bc.beamRequestsInfo[i].current_depth = 1; - new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; - new_bc.beamRequestsInfo[i].max_depth = - std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } + new_bc.request_completed[i] = false; + new_bc.request_running[i] = true; + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = + verified_tokens.front().second; + new_bc.requestsInfo[i].request_guid = + old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - new_bc.sub_requests[i] = 1; + // TODO: Beam Request Info, missing from VerifyTreeBatchConfig + int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].token_start_offset - + verified_tokens.size(); + new_bc.beamRequestsInfo[i].current_depth = 1; + new_bc.beamRequestsInfo[i].beam_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + new_bc.beamRequestsInfo[i].max_depth = + std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); + for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + new_bc.beamRequestsInfo[i].parent_id[j] = 0; + new_bc.beamRequestsInfo[i].probs[j] = 1; + } - // Token Info - for (int j = 0; j < verified_tokens.size(); j++) { - auto token = verified_tokens.at(j); + new_bc.sub_requests[i] = 1; - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + // Token Info + for (int j = 0; j < verified_tokens.size(); j++) { + auto token = verified_tokens.at(j); - // Beam Token Info - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + token.second; - // Add verified token to request's token list - request.tokens.push_back(token.first); + // Beam Token Info + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; + new_bc.num_tokens++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - break; + // Add verified token to request's token list + request.tokens.push_back(token.first); + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } + } + std::string output = this->tokenizer_->Decode(request.tokens); + log_req_mgr.print("Output: %s", output.c_str()); } + } else if (request.status == Request::PENDING) { + new_bc.request_completed[i] = false; + new_bc.request_running[i] = false; + + std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " + << "initial_len: " << request.initial_len << std::endl; + assert(request.ssm_cache_size == request.initial_len); + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = request.ssm_cache_size; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].num_tokens_in_batch = 0; + + // 
TODO: Beam Request Info, missing from VerifyTreeBatchConfig + new_bc.beamRequestsInfo[i].current_depth = 1; + new_bc.beamRequestsInfo[i].beam_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + new_bc.beamRequestsInfo[i].max_depth = 0; + for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + new_bc.beamRequestsInfo[i].parent_id[j] = 0; + new_bc.beamRequestsInfo[i].probs[j] = 1; + } + + new_bc.sub_requests[i] = 1; + + // Token Info + std::string output = this->tokenizer_->Decode(request.tokens); + log_req_mgr.print("Output: %s", output.c_str()); + } else { + assert(false); } - std::string output = this->tokenizer_->Decode(request.tokens); - log_req_mgr.print("Output: %s", output.c_str()); } // Step 2: Initialize new request - new_bc.max_init_length = 0; for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { if (!pending_request_queue.empty() && new_bc.num_tokens < BeamSearchBatchConfig::MAX_NUM_TOKENS) { Request new_request = pending_request_queue.front(); pending_request_queue.pop(); - new_bc.max_init_length = - std::max(new_bc.max_init_length, new_request.initial_len); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -886,6 +780,33 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; } + + // if (new_bc.requestsInfo[i].num_tokens_in_batch < + // new_request.initial_len) { + // all_requests[new_request.guid].status = Request::PENDING; + // new_bc.request_running[i] = false; + // std::cout << "Request " << new_request.guid << " is pending" + // << std::endl; + // } else { + // all_requests[new_request.guid].status = Request::RUNNING; + // new_bc.request_running[i] = true; + // std::cout << "Request " << new_request.guid << " is running" + // << std::endl; + // } + all_requests[new_request.guid].status = Request::PENDING; + all_requests[new_request.guid].ssm_cache_size = + new_bc.requestsInfo[i].num_tokens_in_batch; + new_bc.request_running[i] = false; + std::cout << "SSM KV Cache Size init: " + << all_requests[new_request.guid].ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size init: " + << all_requests[new_request.guid].llm_cache_size << std::endl; + + std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch + << " tokens for request " << new_request.guid << std::endl; + std::cout << "total prompt in request: " << new_request.initial_len + << std::endl; + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { break; } @@ -902,6 +823,209 @@ BeamSearchBatchConfig return new_bc; } +/***** Beam Search Phase *****/ +BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( + BeamSearchBatchConfigFuture const &old_bc, + BeamInferenceResultFuture const &result) { + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + launcher.add_future(old_bc); + launcher.add_future(result); + return runtime->execute_task(ctx, launcher); +} + +BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + BeamInferenceResult const &result = + 
Future(task->futures[1]).get_result(); + return rm->prepare_next_batch_beam(bc, result); +} + +// update beam search metadata +BeamSearchBatchConfig + RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, + BeamInferenceResult const &result) { + const std::lock_guard lock(request_queue_mutex); + if (verbose) { + std::cout << "\n############### prepare_next_batch_beam ###############\n"; + } + if (verbose) { + std::cout << "print all results" + << "\n"; + for (int i = 0; i < 40; i++) { + std::cout << result.token_ids[i] << ", "; + } + std::cout << "Current Beam Depth: " + << old_bc.beamRequestsInfo[0].current_depth << "\n"; + } + // Step 1: Store result to the beam tree struct + store_beam_metadata(old_bc, result); + + // Step 2: preparing the next batch for existing requests + BeamSearchBatchConfig new_bc; + new_bc.model_id = old_bc.model_id; + // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; + + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_bc.request_completed[i]) { + continue; + } + // Comment out this assertion since num_tokens_in_batch can be + // zero when beam search has reached required sequence length + // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + int processed_tokens = old_bc.requestsInfo[i].token_start_offset + + old_bc.requestsInfo[i].num_tokens_in_batch; + + // assert(processed_tokens < request.tokens.size()); + log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; + // if (processed_tokens > + // old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() && + // request.status == Request::RUNNING + // // || ir.results[t] == 0 TODO: replace this with + // ) { + // // log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", + // // old_bc.requestsInfo[i].request_guid, + // // old_bc.beamRequestsInfo[i].max_depth); + // // // new_bc.request_completed[i] = true; + // // new_bc.request_completed[i] = false; + // // new_bc.requestsInfo[i].token_start_offset = processed_tokens; + // // new_bc.requestsInfo[i].request_guid = + // // old_bc.requestsInfo[i].request_guid; + // // new_bc.requestsInfo[i].max_sequence_length = + // // old_bc.requestsInfo[i].max_sequence_length; + // // new_bc.beamRequestsInfo[i].current_depth = + // // old_bc.beamRequestsInfo[i].current_depth; + // // new_bc.request_running[i] = false; + // std::cout << "beam search end:" << request.status << i << ", " + // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; + // } + // else + { + log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " + << new_bc.num_tokens; + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + + // update the beam search metadata + // how many sub request in current request + // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? 
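  // Worked example of the beam-width accounting handled by the
  // token-registration loop further down in this function (numbers are
  // illustrative): with beam_size = 3 and num_tokens_in_batch = 2, the loop
  // emits 3 copies of each of the 2 tokens, so new_bc.num_tokens grows by
  // 2 * 3 = 6, and beamTokenInfo[...].sub_request_index records which of the
  // 3 sub-requests each copy belongs to.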
+ new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + + // update the parentid, accumalated_probs, depth, and token_ids + new_bc.beamRequestsInfo[i].beam_size = + old_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].max_depth = + old_bc.beamRequestsInfo[i].max_depth; + if (request.status == Request::RUNNING) { + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth + 1; + new_bc.request_running[i] = true; + // do the slot exchange to minimize the cache exchange in kernel. + update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + } else { + // if the request is pending, we need to update the beam search + // metadata based on the initial length + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth; + new_bc.request_running[i] = false; + } + + // do the slot exchange to minimize the cache exchange in kernel. + // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), + // i); + if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { + // Incremental phase + if (request.status == Request::RUNNING) { + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + } else { + new_bc.requestsInfo[i].num_tokens_in_batch = 0; + } + + if (verbose) { + std::cout << "[ Beam Spec] " << request.guid << std::endl; + std::cout << "Incremental phase: " << request.tokens.size() + << ", num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + } + } else { + // Prompt phase + new_bc.requestsInfo[i].num_tokens_in_batch = + // std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens - + BatchConfig::MAX_NUM_REQUESTS + i, + (int)request.tokens.size() - + new_bc.requestsInfo[i].token_start_offset); + request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; + if (verbose) { + std::cout << "[ Beam Spec] " << request.guid << std::endl; + std::cout << "Prompt phase: " << request.tokens.size() + << ", num_tokens_in_batch:" + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "Update ssm cache size: " << request.ssm_cache_size + << std::endl; + } + } + + if (verbose) { + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size + << std::endl; + } + + // register more tokens due to the beam width + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].token_start_offset + j; + for (int k = 0; k < new_bc.sub_requests[i]; k++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + + // get value from requestinfo + if (request.status == Request::RUNNING) { + // std::cout << "[running ]Num of token in batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch + // << std::endl; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.beamRequestsInfo[i].tokens[k]; + } else { + // std::cout << "[pending ]Num of token in batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch + // << std::endl; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[request.tokens.size() - 1]; + } + + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; + new_bc.num_tokens++; + } + } + } + } + if (verbose) { + std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" + << std::endl; + old_bc.print(); + new_bc.print(); + } + return 
new_bc; +} + +/***** Verify Phase *****/ + TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { Runtime *runtime = Runtime::get_runtime(); @@ -943,6 +1067,17 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; + int max_prompt_load_size = BatchConfig::MAX_NUM_TOKENS; + for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_batches.at(0).request_completed[i]) { + continue; + } else if (old_batches.at(0).request_running[i]) { + max_prompt_load_size -= (BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); + } else { + max_prompt_load_size -= 1; + } + } + for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { if (old_batches.at(0).request_completed[i]) { continue; @@ -950,60 +1085,73 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( size_t guid = old_batches.at(0).requestsInfo[i].request_guid; Request &request = all_requests[guid]; - // Get the dfs tree - std::vector>> - all_dfs_trees; - - for (int j = 0; j < old_batches.size(); j++) { - std::vector> new_tree = - traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); - all_dfs_trees.push_back(new_tree); - } - assert(all_dfs_trees.size() == old_batches.size()); - std::vector> dfs_tree_inputs = - merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); + // Profiling + profiling_requests[request.guid].decoding_steps += 1; - if (verbose) { - std::cout << "Request Tokens Size: " << request.tokens.size() + if (request.status == Request::RUNNING) { + new_bc.request_running[i] = true; + std::cout << "[Verify] Request " << request.guid << " is running" << std::endl; - for (int k = 0; k < request.tokens.size(); k++) { - std::cout << k << ": " << request.tokens[k] << std::endl; - } - } - // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - // TODO: Check this - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.request_completed[i] = false; + // Get the dfs tree + std::vector>> + all_dfs_trees; - // Profiling - profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; - // TODO: Add prompt token first in first verify iteration - if (request.tokens.size() == request.initial_len) { - // Initialization (prompt) phase - for (int j = 0; j < request.initial_len; j++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; + for (int j = 0; j < old_batches.size(); j++) { + std::vector> new_tree = + traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); + all_dfs_trees.push_back(new_tree); + } + assert(all_dfs_trees.size() == old_batches.size()); + std::vector> dfs_tree_inputs = + merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; + if (verbose) { + std::cout << "Request Tokens Size: " << request.tokens.size() + << std::endl; + for (int k = 0; k < request.tokens.size(); k++) { + std::cout << k << ": " << request.tokens[k] << std::endl; + } } - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { - 
assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = + dfs_tree_inputs.front().second; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_batches.at(0).requestsInfo[i].max_sequence_length; + // TODO: Check this + new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.request_completed[i] = false; + + // Committed Tokens + if (committed_tokens.find(guid) != committed_tokens.end()) { + for (int j = 0; j < dfs_tree_inputs.size(); j++) { + if (j < committed_tokens.at(guid).size()) { + auto committed_token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second + << std::endl; + } + new_bc.num_tokens_to_commit++; + } + } + } + if (verbose) { + std::cout << "new_bc.num_tokens_to_commit: " + << new_bc.num_tokens_to_commit << std::endl; } - new_bc.requestsInfo[i].token_start_offset = 0; - } else { // Incremental phase: only add the last committed token new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); @@ -1013,116 +1161,124 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; } new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; - } - - if (verbose) { - std::cout << "dfs_tree_inputs.size(): " << dfs_tree_inputs.size() - << std::endl; - } - // add prompt to the dfs tree - if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == - request.initial_len + committed_tokens.at(guid).size() - 1) { - for (int j = 0; j < request.initial_len; j++) { - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " << j - << ", token_index: " << j << std::endl; - } - new_bc.num_tokens_to_commit++; - } - } else { - // only add the root token - auto committed_token = committed_tokens.at(guid).at(0); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; + // Add Tokens from the DFS Tree to the next batch + for (int j = 1; j < dfs_tree_inputs.size(); j++) { + auto token = dfs_tree_inputs.at(j); if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second << std::endl; + std::cout << "[" << j << "] Token: " << token.first + << ", Depth:" 
<< token.second << std::endl; } - new_bc.num_tokens_to_commit++; - } - if (verbose) { - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; - } - } + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + token.second; - // Token Info - for (int j = 1; j < dfs_tree_inputs.size(); j++) { - auto token = dfs_tree_inputs.at(j); - if (verbose) { - std::cout << "[" << j << "] Token: " << token.first - << ", Depth:" << token.second << std::endl; + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS - 1) { + break; + } } - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; - // TODO: Add committed token info + } else if (request.status == Request::PENDING) { + new_bc.request_running[i] = false; if (verbose) { - std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit + std::cout << "[Verify] Request " << request.guid + << " is pending in loading prompt phase" << std::endl; + std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size << std::endl; } + // Commit all tokens from the last loading batch if (committed_tokens.find(guid) != committed_tokens.end()) { - if (j < committed_tokens.at(guid).size()) { - auto committed_token = committed_tokens.at(guid).at(j); + for (int j = 0; j < committed_tokens.at(guid).size(); j++) { + auto token = committed_tokens.at(guid).at(j); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; + token.second; new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } + token.first; + new_bc.num_tokens_to_commit++; + request.llm_cache_size++; } - } - if (verbose) { - std::cout << "new_bc.num_tokens_to_commit: " + std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; } - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_batches.at(0).requestsInfo[i].max_sequence_length; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS - 1) { - break; - } - } + new_bc.request_completed[i] = false; - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - } + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + max_prompt_load_size, + (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; - if (verbose) { - std::cout << "prepare_next_batch_verify OLD vs NEW batchconfigs below:" - << std::endl; - // old_batches.print(); - // new_bc.print(); + std::cout << "max_prompt_load_size: " << max_prompt_load_size + << 
std::endl; + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << i << ", " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + + if (request.llm_cache_size < request.initial_len) { + // Initialization (prompt) phase + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[request.llm_cache_size + j]; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.llm_cache_size + j; + + new_bc.num_tokens++; + } + + if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; + } + } else { // launch the request into running phase after loading all prompt + if (BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens > 0) { + request.status = Request::RUNNING; + new_bc.request_running[i] = true; + + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.tokens.size() - 1; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + + dfs_tree_inputs[guid] = + std::vector>{std::make_pair( + request.tokens.back(), request.tokens.size() - 1)}; + } + } + + } else { + assert(false && "Request status is not RUNNING or PENDING"); + } } return new_bc; @@ -1145,14 +1301,16 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } for (int i = 0; i <= old_bc.num_tokens; i++) { - int request_index = old_bc.tokensInfo[i].request_index; - - // End of the request if (i == old_bc.num_tokens || - old_bc.requestsInfo[request_index].request_guid != guid) { + old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != + guid) { + + int index = old_bc.tokensInfo[i - 1].request_index; + int beam_size = old_bc.beamRequestsInfo[index].beam_size; + int depth = old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results - int beam_width = old_bc.beamRequestsInfo[request_index].beam_size; + int beam_width = old_bc.beamRequestsInfo[index].beam_size; // Count tokens sent to model in this request to find the final token's // index @@ -1165,10 +1323,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, << ", value: " << result.token_ids[result_index] << "\n"; } - int index = old_bc.tokensInfo[i - 1].request_index; - int beam_size = old_bc.beamRequestsInfo[index].beam_size; - int depth = old_bc.beamRequestsInfo[index].current_depth; - Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; if (depth == 1) { @@ -1212,7 +1366,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // update the guid and start_depth for current request if (i < old_bc.num_tokens) { - guid = old_bc.requestsInfo[request_index].request_guid; + guid = old_bc.requestsInfo[index].request_guid; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } @@ -1585,24 +1739,27 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::string const &text, +GenerationResult FFModel::generate(std::vector &prompts, int max_seq_length) { RequestManager *rm = RequestManager::get_request_manager(); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return 
rm->generate_incr_decoding(this, text, max_seq_length); + return rm->generate_incr_decoding(this, prompts, max_seq_length); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer(this, text, max_seq_length); + return rm->generate_spec_infer(this, prompts, max_seq_length); } } /*static*/ -GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, - std::string const &text, - int max_seq_length) { +GenerationResult RequestManager::generate_incr_decoding( + FFModel *llm, std::vector &prompts, int max_seq_length) { InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid = register_new_request(text, max_seq_length); + RequestGuid guid; + for (int i = 0; i < prompts.size(); i++) { + guid = register_new_request(prompts.at(i), max_seq_length); + } + if (guid == 0) { std::cout << "=========== Discard request exceed prompt maximum... ===========" @@ -1652,11 +1809,13 @@ GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, } /*static*/ -GenerationResult RequestManager::generate_spec_infer(FFModel *llm, - std::string const &text, - int max_seq_length) { +GenerationResult RequestManager::generate_spec_infer( + FFModel *llm, std::vector &prompts, int max_seq_length) { InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid = register_new_request(text, max_seq_length); + RequestGuid guid; + for (int i = 0; i < prompts.size(); i++) { + guid = register_new_request(prompts.at(i), max_seq_length); + } if (guid == 0) { std::cout << "=========== Discard request exceed prompt maximum... ===========" From 1d5e0c593a956b7fcc789a1b034e6ff920aad1d4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Sep 2023 18:01:58 -0400 Subject: [PATCH 238/344] Fuse inference kernels (part 3) (#1146) * cleanup * linting * fuse residual + rms_norm (across layers) * add ResidualLayerNorm operator * fix * bug fix, apply residual_layer_norm operator in opt model * fix bugs * mpt * cleanup opt * starcoder * falcon * fusion fix, falcon python * mpt python * fix * fix * python opt * starcoder python * formatting * rocm fix * fix rocm 2 * fix rocm 3 * linting --- include/flexflow/ffconst.h | 1 + include/flexflow/flexflow_c.h | 13 + include/flexflow/model.h | 20 + include/flexflow/operator_params.h | 2 + include/flexflow/ops/residual_layer_norm.h | 113 + .../flexflow/ops/residual_layer_norm_params.h | 31 + include/flexflow/substitution_loader.h | 1 + inference/models/falcon.cc | 190 +- inference/models/llama.cc | 218 +- inference/models/mpt.cc | 130 +- inference/models/opt.cc | 142 +- inference/models/starcoder.cc | 122 +- python/flexflow/core/flexflow_cffi.py | 6689 ++++++++++------- python/flexflow/serve/models/falcon.py | 40 +- python/flexflow/serve/models/llama.py | 25 +- python/flexflow/serve/models/mpt.py | 46 +- python/flexflow/serve/models/opt.py | 23 +- python/flexflow/serve/models/starcoder.py | 26 +- python/flexflow/type.py | 1 + src/c/flexflow_c.cc | 65 +- src/ops/add_bias_residual_layer_norm.cu | 8 +- src/ops/fused.cpp | 45 + src/ops/fused.cu | 45 + src/ops/layer_norm.cu | 27 +- src/ops/residual_layer_norm.cc | 823 ++ src/ops/residual_layer_norm.cpp | 247 + src/ops/residual_layer_norm.cu | 242 + src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 5 + src/runtime/model.cc | 40 + src/runtime/operator_params.cc | 3 + src/runtime/substitution.cc | 1 + 32 files changed, 6058 insertions(+), 3328 deletions(-) create mode 100644 include/flexflow/ops/residual_layer_norm.h create mode 100644 
include/flexflow/ops/residual_layer_norm_params.h create mode 100644 src/ops/residual_layer_norm.cc create mode 100644 src/ops/residual_layer_norm.cpp create mode 100644 src/ops/residual_layer_norm.cu diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 9dc68e21ed..124b46862a 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -159,6 +159,7 @@ enum OperatorType { OP_POW, // https://pytorch.org/docs/stable/generated/torch.pow.html OP_MEAN, // https://pytorch.org/docs/stable/generated/torch.mean.html OP_LAYERNORM, + OP_RESIDUAL_LAYERNORM, OP_ADD_BIAS_RESIDUAL_LAYERNORM, OP_SIGMOID_SILU_MULTI, OP_EXPERTS, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 995c238a8c..db034a78c9 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -259,6 +259,19 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle, bool use_bias, char const *name); +flexflow_tensor_t * + flexflow_model_add_residual_layer_norm(flexflow_model_t handle, + const flexflow_tensor_t input, + const flexflow_tensor_t residual1, + const flexflow_tensor_t residual2, + bool use_two_residuals, + int n, + int *axes, + bool elementwise_affine, + float eps, + bool use_bias, + char const *name); + flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_model_t handle, const flexflow_tensor_t input, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 177575e809..97ee553fb3 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -106,6 +106,8 @@ enum TaskIDs { LAYERNORM_FWD_TASK_ID, LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, + RESIDUAL_LAYERNORM_INIT_TASK_ID, + RESIDUAL_LAYERNORM_INF_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, @@ -315,6 +317,7 @@ class Flat; class Gather; class Group_by; class LayerNorm; +class ResidualLayerNorm; class AddBiasResidualLayerNorm; class SigmoidSiluMulti; class Linear; @@ -541,6 +544,18 @@ class FFModel { bool use_bias = true, DataType data_type = DT_NONE, char const *name = NULL); + // Add a layer_norm layer with residual(s) + void residual_layer_norm(const Tensor input, + const Tensor residual1, + const Tensor residual2, + Tensor *outputs, + bool use_two_residuals, + std::vector const &axes, + bool elementwise_affine, + float eps, + bool use_bias = true, + DataType data_type = DT_NONE, + char const *name = NULL); // Add a add_bias_residual_layer_norm layer void add_bias_residual_layer_norm(const Tensor input, const Tensor residual, @@ -1148,6 +1163,11 @@ class FFModel { Group_by *>, std::unordered_map, LayerNorm *>, + std::unordered_map, + ResidualLayerNormParams>, + ResidualLayerNorm *>, std::unordered_map< std::pair, AddBiasResidualLayerNormParams>, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 31f18049ff..5b187839ef 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -26,6 +26,7 @@ #include "flexflow/ops/pool_2d_params.h" #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" +#include "flexflow/ops/residual_layer_norm_params.h" #include "flexflow/ops/residual_rms_norm_params.h" #include "flexflow/ops/rms_norm_params.h" #include "flexflow/ops/sampling_params.h" @@ -62,6 +63,7 @@ using OperatorParameters = mp::variant; + ResidualLayerNorm(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const 
*name = nullptr); + ResidualLayerNorm(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + const ParallelTensor _residual1, + const ParallelTensor _residual2, + bool _use_two_residuals, + std::vector const &axes, + bool _elementwise_affine, + bool _use_bias, + float _eps, + bool allocate_weights, + char const *name); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + ResidualLayerNormParams get_params() const; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + template + static void inference_kernel(ResidualLayerNormMeta const *m, + T const *input_ptr, + T const *residual1_ptr, + T const *residual2_ptr, + T *added_output_ptr, + T *output_ptr, + T const *gamma_ptr, + T const *beta_ptr, + ffStream_t stream); + static void inference_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &residual1, + GenericTensorAccessorR const &residual2, + GenericTensorAccessorW &added_output, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); + +public: + bool elementwise_affine, use_bias, use_two_residuals; + int64_t effective_batch_size, effective_num_elements; + float eps; + std::vector axes; +}; + +class ResidualLayerNormMeta : public OpMeta { +public: + ResidualLayerNormMeta(FFHandler handle, + ResidualLayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator); + ~ResidualLayerNormMeta(void); + +public: + bool elementwise_affine, use_bias, use_two_residuals; + int64_t effective_batch_size, effective_num_elements; + float eps; + void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; + char op_name[MAX_OPNAME]; + Realm::RegionInstance reserveInst; +}; + +}; // namespace FlexFlow diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h new file mode 100644 index 0000000000..24da4a2c08 --- /dev/null +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -0,0 +1,31 @@ +#pragma once + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct ResidualLayerNormParams { + LayerID layer_guid; + std::vector axes; + bool elementwise_affine; + float eps; + bool use_bias; + bool use_two_residuals; + bool is_valid(std::tuple const &) const; +}; + +bool operator==(ResidualLayerNormParams const 
&, + ResidualLayerNormParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::ResidualLayerNormParams const &) const; +}; +} // namespace std diff --git a/include/flexflow/substitution_loader.h b/include/flexflow/substitution_loader.h index ff2fd10446..e0c252ffd8 100644 --- a/include/flexflow/substitution_loader.h +++ b/include/flexflow/substitution_loader.h @@ -125,6 +125,7 @@ NLOHMANN_JSON_SERIALIZE_ENUM( {OP_POW, "OP_POW"}, {OP_MEAN, "OP_MEAN"}, {OP_LAYERNORM, "OP_LAYERNORM"}, + {OP_RESIDUAL_LAYERNORM, "OP_RESIDUAL_LAYERNORM"}, {OP_ADD_BIAS_RESIDUAL_LAYERNORM, "OP_ADD_BIAS_RESIDUAL_LAYERNORM"}, {OP_SIGMOID_SILU_MULTI, "OP_SIGMOID_SILU_MULTI"}, {OP_RMS_NORM, "OP_RMS_NORM"}, diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index e89e22450c..3be92a953c 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -57,22 +57,43 @@ void FALCON::create_falcon_model(FFModel &ff, embed_init, "word_embeddings"); + Tensor mha = nullptr, mlp_output = nullptr; + Tensor res_ln_outputs[2] = {nullptr, nullptr}; + for (int i = 0; i < falcon_config.n_layer; i++) { // set transformer layer id ff.set_transformer_layer_id(i); // step 1: attention - std::string layer_name = "layers_" + std::to_string(i) + "_input_layernorm"; - Tensor att_norm = ff.layer_norm(token, - axes, - true, - falcon_config.layer_norm_epsilon, - true, - DT_NONE, - layer_name.c_str()); - - Tensor mha; - layer_name = "layers_" + std::to_string(i) + "_attention"; + Tensor att_norm = nullptr; + if (i == 0) { + att_norm = ff.layer_norm( + token, + axes, + true, + falcon_config.layer_norm_epsilon, + true, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_input_layernorm") + .c_str()); + } else { + ff.residual_layer_norm( + token, + mha, + mlp_output, + res_ln_outputs, + true, + axes, + true, + falcon_config.layer_norm_epsilon, + true, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_input_layernorm") + .c_str()); + token = res_ln_outputs[0]; + att_norm = res_ln_outputs[1]; + } + switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multiquery_self_attention( @@ -82,18 +103,19 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - layer_name.c_str() /*name*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + NULL, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -106,18 +128,19 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ 
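A condensed sketch of the fused residual + layer-norm pattern that the Falcon hunk above switches to. The helper name fused_input_layernorm is invented for illustration, the std::vector<int> element type for axes is inferred (template arguments are stripped in this patch text), and the layer name string is shortened; argument order follows the residual_layer_norm() declaration added to flexflow/model.h above. The two-element output buffer is the key design point: downstream code can read either the summed residual stream or its normalized view without recomputing the adds.

#include "flexflow/model.h"
#include <vector>
using namespace FlexFlow;

// Fused replacement for the unfused form removed by this patch:
//   token    = ff.add(token, mha);
//   token    = ff.add(token, mlp_output);
//   att_norm = ff.layer_norm(token, axes, true, eps, true, DT_NONE, name);
void fused_input_layernorm(FFModel &ff, Tensor &token, Tensor mha,
                           Tensor mlp_output, std::vector<int> const &axes,
                           float eps, Tensor &att_norm) {
  Tensor res_ln_outputs[2] = {nullptr, nullptr};
  ff.residual_layer_norm(token,          // running hidden state
                         mha,            // first residual (attention output)
                         mlp_output,     // second residual (MLP output)
                         res_ln_outputs,
                         true,           // use_two_residuals
                         axes,
                         true,           // elementwise_affine
                         eps,            // e.g. falcon_config.layer_norm_epsilon
                         true,           // use_bias
                         DT_NONE,
                         "input_layernorm");
  token = res_ln_outputs[0];             // summed residual stream
  att_norm = res_ln_outputs[1];          // normalized activations for attention
}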
- layer_name.c_str() /*name*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -130,18 +153,19 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - layer_name.c_str() /*name*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -150,45 +174,49 @@ void FALCON::create_falcon_model(FFModel &ff, } } - layer_name = "layers_" + std::to_string(i) + "_mlp_dense_h_to_4h"; - Tensor dense_h_to_4h = ff.dense(att_norm, - falcon_config.hidden_size * 4, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); + Tensor dense_h_to_4h = ff.dense( + att_norm, + falcon_config.hidden_size * 4, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); - layer_name = "layers_" + std::to_string(i) + "_mlp_dense_4h_to_h"; - Tensor mlp_output = ff.dense(dense_h_to_4h, - falcon_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); - - token = ff.add(token, mha); - token = ff.add(token, mlp_output); + mlp_output = ff.dense( + dense_h_to_4h, + falcon_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + .c_str()); } // final normalization and linear - Tensor ln_f = ff.layer_norm(token, - axes, - true, - falcon_config.layer_norm_epsilon, - true, - DT_NONE, - "ln_f"); + ff.residual_layer_norm(token, + mha, + mlp_output, + res_ln_outputs, + true, + axes, + true, + falcon_config.layer_norm_epsilon, + true, + DT_NONE, + "ln_f"); + Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, falcon_config.vocab_size, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index c71755a3d3..56f919ace1 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -57,20 +57,38 @@ void LLAMA::create_llama_model(FFModel &ff, embed_init, "tok_embeddings"); + Tensor w2 = nullptr; + for (int i = 0; i < llama_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); // step 1: attention - std::string layer_name = "layers_" + std::to_string(i) + "_attention_norm"; - Tensor att_norm = ff.rms_norm(token, - llama_config.rms_norm_eps, - llama_config.hidden_size, - DT_NONE, - 
layer_name.c_str()); + Tensor att_norm = nullptr; + Tensor token_att_norm[2] = {nullptr, nullptr}; + if (i == 0) { + att_norm = ff.rms_norm( + token, + llama_config.rms_norm_eps, + llama_config.hidden_size, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_attention_norm") + .c_str()); + } else { + ff.residual_rms_norm( + token, + w2, + token_att_norm, + llama_config.rms_norm_eps, + llama_config.hidden_size, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_attention_norm") + .c_str()); + token = token_att_norm[0]; + att_norm = token_att_norm[1]; + } Tensor mha; - layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( @@ -79,18 +97,19 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - layer_name.c_str() /*name*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + NULL, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -101,18 +120,19 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - layer_name.c_str() /*name*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -123,18 +143,19 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, - 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - layer_name.c_str() /*name*/ + 0.0f, /*dropout*/ + false, /*qkv_bias*/ + false, /*final_bias*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + true, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -144,70 +165,73 @@ 
void LLAMA::create_llama_model(FFModel &ff, } // step 2: SILU activaion - layer_name = "layers_" + std::to_string(i) + "_ffn_norm"; - Tensor token_ff_norm[2]; - ff.residual_rms_norm(token, - mha, - token_ff_norm, - llama_config.rms_norm_eps, - llama_config.hidden_size, - DT_NONE, - layer_name.c_str()); - + Tensor token_ff_norm[2] = {nullptr, nullptr}; + ff.residual_rms_norm( + token, + mha, + token_ff_norm, + llama_config.rms_norm_eps, + llama_config.hidden_size, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - layer_name = "layers_" + std::to_string(i) + "_feed_forward_w1"; - Tensor w1 = ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); + Tensor w1 = + ff.dense(ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_feed_forward_w1") + .c_str()); - layer_name = "layers_" + std::to_string(i) + "_feed_forward_w3"; - Tensor w3 = ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); + Tensor w3 = + ff.dense(ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_feed_forward_w3") + .c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - layer_name = "layers_" + std::to_string(i) + "_feed_forward_w2"; - Tensor w2 = ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); - token = ff.add(token, w2); + w2 = + ff.dense(multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_feed_forward_w2") + .c_str()); } // final normalization and linear - std::vector axes = {2}; - token = ff.rms_norm(token, - llama_config.rms_norm_eps, - llama_config.hidden_size, - DT_NONE, - "norm"); + Tensor final_rms_norm_output[2] = {nullptr, nullptr}; + ff.residual_rms_norm(token, + w2, + final_rms_norm_output, + llama_config.rms_norm_eps, + llama_config.hidden_size, + DT_NONE, + "norm"); - Tensor dense = ff.dense(token, + Tensor dense = ff.dense(final_rms_norm_output[1], llama_config.vocab_size, AC_MODE_NONE, false, diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 933d1a0b18..3bd1b912ed 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -56,17 +56,39 @@ void MPT::create_mpt_model(FFModel &ff, embed_init, "transformer_wte"); + Tensor intermediate_output = nullptr, layernorm_output = nullptr; + Tensor res_ln_outputs[2] = {nullptr, nullptr}; + for (int i = 0; i < mpt_config.n_layers; i++) { ff.set_transformer_layer_id(i); - Tensor residual = hidden_states; - - std::string layer_name = "layers_" + std::to_string(i) + "_norm_1"; - Tensor layernorm_output = ff.layer_norm( - hidden_states, axes, true, 1e-05, false, DT_NONE, layer_name.c_str()); + if (i == 0) { + layernorm_output = ff.layer_norm( + hidden_states, + axes, + true, + 1e-05, + false, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + } else { + ff.residual_layer_norm( + intermediate_output, + 
hidden_states, + nullptr, + res_ln_outputs, + false, + axes, + true, + 1e-05, + false, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + hidden_states = res_ln_outputs[0]; + layernorm_output = res_ln_outputs[1]; + } Tensor attn_outputs; - layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case BEAM_SEARCH_MODE: { attn_outputs = ff.spec_inc_multihead_self_attention( @@ -87,7 +109,8 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - layer_name.c_str() /*name*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -110,7 +133,8 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - layer_name.c_str() /*name*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -133,7 +157,8 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - layer_name.c_str() /*name*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -142,49 +167,62 @@ void MPT::create_mpt_model(FFModel &ff, } } - hidden_states = ff.add(attn_outputs, residual); - - layer_name = "layers_" + std::to_string(i) + "_norm_2"; - layernorm_output = ff.layer_norm( - hidden_states, axes, true, 1e-05, false, DT_NONE, layer_name.c_str()); - - residual = hidden_states; + ff.residual_layer_norm( + attn_outputs, + hidden_states, + nullptr, + res_ln_outputs, + false, + axes, + true, + 1e-05, + false, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + hidden_states = res_ln_outputs[0]; + layernorm_output = res_ln_outputs[1]; // MLP - layer_name = "layers_" + std::to_string(i) + "_ffn_up_proj"; - layernorm_output = ff.dense(layernorm_output, - 4 * mpt_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); - + layernorm_output = ff.dense( + layernorm_output, + 4 * mpt_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); - - layer_name = "layers_" + std::to_string(i) + "_ffn_down_proj"; - Tensor intermediate_output = ff.dense(layernorm_output, - mpt_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); - - hidden_states = ff.add(intermediate_output, residual); + intermediate_output = ff.dense( + layernorm_output, + mpt_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); } // final - Tensor all_final_norm = ff.layer_norm( - hidden_states, axes, true, 1e-05, false, DT_NONE, "transformer_norm_f"); + ff.residual_layer_norm(intermediate_output, + hidden_states, + nullptr, + res_ln_outputs, + false, + axes, + true, + 1e-05, + false, + DT_NONE, + "transformer_norm_f"); + Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, mpt_config.vocab_size, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 5f2494d0b2..cdab25bfca 100644 --- 
a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -69,7 +69,8 @@ void OPT::create_opt_model(FFModel &ff, embed_init, "embed_positions"); - Tensor residual = ff.add(token, positional_embedding); + Tensor fc2 = nullptr, added = nullptr; + Tensor res_ln_outputs[2] = {nullptr, nullptr}; for (int i = 0; i < opt_config.num_hidden_layers; i++) { // set transformer layer id @@ -79,20 +80,23 @@ void OPT::create_opt_model(FFModel &ff, // 350m applies layer norm AFTER attention // https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#LL324C1-L325C1 // this version is before normalization - - std::string layer_name = - "layers_" + std::to_string(i) + "_attention_layer_norm"; - Tensor hidden_states = - ff.layer_norm(residual, - axes, - opt_config.layer_norm_elementwise_affine, - 1e-05, - true, - DT_NONE, - layer_name.c_str()); + ff.residual_layer_norm( + (i == 0) ? token : added, + (i == 0) ? positional_embedding : fc2, + nullptr, + res_ln_outputs, + false, + axes, + opt_config.layer_norm_elementwise_affine, + 1e-05, + true, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + .c_str()); + Tensor residual = res_ln_outputs[0]; + Tensor hidden_states = res_ln_outputs[1]; Tensor mha; - layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( @@ -110,10 +114,11 @@ void OPT::create_opt_model(FFModel &ff, false, /*apply_rotary_embedding*/ true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), - -0.5), /*scaling factor*/ - false, /*qk_prod_scaling*/ - false, /*position_bias*/ - layer_name.c_str() /*name*/ + -0.5), /*scaling factor*/ + false, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -133,10 +138,11 @@ void OPT::create_opt_model(FFModel &ff, false, /*apply_rotary_embedding*/ true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), - -0.5), /*scaling factor*/ - false, /*qk_prod_scaling*/ - false, /*position_bias*/ - layer_name.c_str() /*name*/ + -0.5), /*scaling factor*/ + false, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -156,10 +162,11 @@ void OPT::create_opt_model(FFModel &ff, false, /*apply_rotary_embedding*/ true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), - -0.5), /*scaling factor*/ - false, /*qk_prod_scaling*/ - false, /*position_bias*/ - layer_name.c_str() /*name*/ + -0.5), /*scaling factor*/ + false, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -168,63 +175,60 @@ void OPT::create_opt_model(FFModel &ff, } } - // Tensor added = ff.add(mha, residual); - // Tensor final_norm = ff.layer_norm( - // added, axes, opt_config.layer_norm_elementwise_affine, 1e-05); - - layer_name = - "layers_" + std::to_string(i) + "_add_bias_residual_layer_norm"; - Tensor added_final_norm[2]; ff.add_bias_residual_layer_norm(mha, residual, - added_final_norm, + res_ln_outputs, axes, opt_config.layer_norm_elementwise_affine, 1e-05, true, DT_NONE, - layer_name.c_str()); - Tensor added = added_final_norm[0]; - Tensor final_norm = added_final_norm[1]; + std::string("layers_" + std::to_string(i) + + "_add_bias_residual_layer_norm") + .c_str()); + added = res_ln_outputs[0]; + 
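A skeletal sketch of the cross-layer part of the fusion visible in the OPT rewrite above: each layer's trailing residual add is deferred and only performed inside the next layer's (or the final) residual_layer_norm, which takes a nullptr second residual and use_two_residuals = false. The opt_style_block_sketch helper is invented, the attention/MLP bodies are elided, and the added/fc2 assignments are placeholders for the tensors the real model code produces; deferring the add is what lets the addition be fused into the norm kernel instead of running as a separate elementwise op.

#include "flexflow/model.h"
#include <string>
#include <vector>
using namespace FlexFlow;

void opt_style_block_sketch(FFModel &ff, Tensor token,
                            Tensor positional_embedding,
                            std::vector<int> const &axes, int num_layers) {
  Tensor fc2 = nullptr, added = nullptr;
  Tensor res_ln_outputs[2] = {nullptr, nullptr};
  for (int i = 0; i < num_layers; i++) {
    // Layer i's input norm also performs the residual add left over from
    // layer i-1 (or from the embeddings when i == 0).
    ff.residual_layer_norm(
        (i == 0) ? token : added,
        (i == 0) ? positional_embedding : fc2,
        nullptr,                       // no second residual
        res_ln_outputs,
        false,                         // use_two_residuals
        axes, true, 1e-05, true, DT_NONE,
        std::string("layers_" + std::to_string(i) + "_attention_layer_norm")
            .c_str());
    Tensor residual = res_ln_outputs[0];
    Tensor hidden_states = res_ln_outputs[1];
    // ... attention on hidden_states, add_bias_residual_layer_norm with
    //     residual, fc1 + relu, fc2 dense: see the model files above ...
    added = residual;       // placeholder for the real post-attention tensor
    fc2 = hidden_states;    // placeholder for the real fc2 output
  }
  // The last layer's fc2 is only added back here, inside the final norm.
  ff.residual_layer_norm(added, fc2, nullptr, res_ln_outputs,
                         false, axes, true, 1e-05, true, DT_NONE,
                         "final_layer_norm");
}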
Tensor final_norm = res_ln_outputs[1]; //--------linear fc1 fc2 ---------- - layer_name = "layers_" + std::to_string(i) + "_fc1"; - Tensor fc1 = ff.dense(final_norm, - opt_config.ffn_dim, - AC_MODE_NONE, - true, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); + Tensor fc1 = + ff.dense(final_norm, + opt_config.ffn_dim, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_fc1").c_str()); Tensor activation = ff.relu(fc1, false); - layer_name = "layers_" + std::to_string(i) + "_fc2"; - Tensor fc2 = ff.dense(activation, - opt_config.hidden_size, - AC_MODE_NONE, - true, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); - residual = ff.add(added, fc2); + fc2 = ff.dense(activation, + opt_config.hidden_size, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_fc2").c_str()); } // final - Tensor all_final_norm = - ff.layer_norm(residual, - axes, - opt_config.layer_norm_elementwise_affine, - 1e-05, - true, - DT_NONE, - "final_layer_norm"); + ff.residual_layer_norm(added, + fc2, + nullptr, + res_ln_outputs, + false, + axes, + opt_config.layer_norm_elementwise_affine, + 1e-05, + true, + DT_NONE, + "final_layer_norm"); + Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, opt_config.vocab_size, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index f9659c7279..89b53b1cf5 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -74,24 +74,30 @@ void STARCODER::create_starcoder_model( embed_init, "transformer_wpe"); - Tensor hidden_states = ff.add(token, positional_embedding); + Tensor residual = nullptr, c_proj = nullptr; + Tensor res_ln_outputs[2] = {nullptr, nullptr}; for (int i = 0; i < startcoder_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); // step 1: attention - std::string layer_name = "layers_" + std::to_string(i) + "_ln_1"; - Tensor ln_1 = ff.layer_norm(hidden_states, - axes, - true, - startcoder_config.layer_norm_epsilon, - true, - DT_NONE, - layer_name.c_str()); + ff.residual_layer_norm( + (i == 0) ? token : residual, + (i == 0) ? 
positional_embedding : c_proj, + nullptr, + res_ln_outputs, + false, + axes, + true, + startcoder_config.layer_norm_epsilon, + true, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + Tensor hidden_states = res_ln_outputs[0]; + Tensor ln_1 = res_ln_outputs[1]; Tensor mha; - layer_name = "layers_" + std::to_string(i) + "_attention"; switch (mode) { case INC_DECODING_MODE: { mha = ff.inc_multiquery_self_attention( @@ -114,7 +120,8 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - layer_name.c_str() /*name*/ + std::string("layers_" + std::to_string(i) + "_attention") + .c_str() /*name*/ ); break; } @@ -123,56 +130,63 @@ void STARCODER::create_starcoder_model( } } - Tensor residual = ff.add(hidden_states, mha); - - layer_name = "layers_" + std::to_string(i) + "_ln_2"; - Tensor l2_norm = ff.layer_norm(residual, - axes, - true, - startcoder_config.layer_norm_epsilon, - true, - DT_NONE, - layer_name.c_str()); + ff.residual_layer_norm( + hidden_states, + mha, + nullptr, + res_ln_outputs, + false, + axes, + true, + startcoder_config.layer_norm_epsilon, + true, + DT_NONE, + std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + residual = res_ln_outputs[0]; + Tensor l2_norm = res_ln_outputs[1]; // mlp - layer_name = "layers_" + std::to_string(i) + "_mlp_c_fc"; - Tensor c_fc = ff.dense(l2_norm, - startcoder_config.intermediate_size, - AC_MODE_NONE, - true, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); + Tensor c_fc = ff.dense( + l2_norm, + startcoder_config.intermediate_size, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); c_fc = ff.gelu(c_fc); - layer_name = "layers_" + std::to_string(i) + "_mlp_c_proj"; - Tensor c_proj = ff.dense(c_fc, - startcoder_config.hidden_size, - AC_MODE_NONE, - true, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - layer_name.c_str()); - - hidden_states = ff.add(residual, c_proj); + c_proj = ff.dense( + c_fc, + startcoder_config.hidden_size, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); } // final normalization and linear - Tensor ln_f = ff.layer_norm(hidden_states, - axes, - true, - startcoder_config.layer_norm_epsilon, - true, - DT_NONE, - "transformer_ln_f"); + ff.residual_layer_norm(residual, + c_proj, + nullptr, + res_ln_outputs, + false, + axes, + true, + startcoder_config.layer_norm_epsilon, + true, + DT_NONE, + "transformer_ln_f"); + Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, startcoder_config.vocab_size, diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b2231b58e6..b029f1e2ff 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -18,3356 +18,4231 @@ import warnings import numpy as np from .flexflow_logger import fflogger -from flexflow.type import ActiMode, RegularizerMode, AggrMode, PoolType, DataType, LossType, CompMode, MetricsType, InferenceMode, ModelType, OpType, ParameterSyncType, enum_to_int, int_to_enum +from flexflow.type import ( + ActiMode, + RegularizerMode, + AggrMode, + PoolType, + DataType, + LossType, + CompMode, + MetricsType, + InferenceMode, + ModelType, + OpType, + ParameterSyncType, + enum_to_int, + int_to_enum, +) from 
flexflow.config import * from .flexflowlib import ffi, flexflow_library + def ffc(): - if not flexflow_already_initialized(): - raise RuntimeError("Cannot use FlexFlow library before initializing FlexFlow") - ffc = flexflow_library.lib - if ffc is None: - raise RuntimeError("FlexFlow library is None") - return ffc + if not flexflow_already_initialized(): + raise RuntimeError("Cannot use FlexFlow library before initializing FlexFlow") + ffc = flexflow_library.lib + if ffc is None: + raise RuntimeError("FlexFlow library is None") + return ffc + ff_tracing_id = 200 -warnings.simplefilter('always', DeprecationWarning) +warnings.simplefilter("always", DeprecationWarning) + def get_c_name(name): - if name is None: - return ffi.NULL - else: - return ffi.new("char[]", name.encode('ascii')) + if name is None: + return ffi.NULL + else: + return ffi.new("char[]", name.encode("ascii")) + def get_datatype_size(datatype): - if (datatype == DataType.DT_HALF): - return 2 - if (datatype == DataType.DT_FLOAT): - return 4 - elif (datatype == DataType.DT_DOUBLE): - return 8 - elif (datatype == DataType.DT_INT32): - return 4 - elif (datatype == DataType.DT_INT64): - return 8 - else: - assert 0, "unknow datatype" + str(datatype) - return 0 + if datatype == DataType.DT_HALF: + return 2 + if datatype == DataType.DT_FLOAT: + return 4 + elif datatype == DataType.DT_DOUBLE: + return 8 + elif datatype == DataType.DT_INT32: + return 4 + elif datatype == DataType.DT_INT64: + return 8 + else: + assert 0, "unknow datatype" + str(datatype) + return 0 + # ----------------------------------------------------------------------- # Op # ----------------------------------------------------------------------- class Op(object): - __slots__ = ['handle', 'idx', 'name'] - def __init__(self, handle, idx=None, name=None): - assert ffi.typeof(handle) == ffi.typeof('flexflow_op_t'), "Op handle is wrong" - self.handle = handle - self.idx = idx - self.name = name + __slots__ = ["handle", "idx", "name"] + + def __init__(self, handle, idx=None, name=None): + assert ffi.typeof(handle) == ffi.typeof("flexflow_op_t"), "Op handle is wrong" + self.handle = handle + self.idx = idx + self.name = name + + def get_number_parameters(self): + return ffc().flexflow_op_get_num_parameters(self.handle) - def get_number_parameters(self): - return ffc().flexflow_op_get_num_parameters(self.handle) + def get_parameter_by_id(self, id): + handle = ffc().flexflow_op_get_parameter_by_id(self.handle, id) + return Parameter(handle) - def get_parameter_by_id(self, id): - handle = ffc().flexflow_op_get_parameter_by_id(self.handle, id) - return Parameter(handle) + def get_number_inputs(self): + return ffc().flexflow_op_get_num_inputs(self.handle) - def get_number_inputs(self): - return ffc().flexflow_op_get_num_inputs(self.handle) + def get_input_by_id(self, id): + handle = ffc().flexflow_op_get_input_by_id(self.handle, id) + return Tensor(handle, False) - def get_input_by_id(self, id): - handle = ffc().flexflow_op_get_input_by_id(self.handle, id) - return Tensor(handle, False) + def get_number_outputs(self): + return ffc().flexflow_op_get_num_outputs(self.handle) - def get_number_outputs(self): - return ffc().flexflow_op_get_num_outputs(self.handle) + def get_output_by_id(self, id): + handle = ffc().flexflow_op_get_output_by_id(self.handle, id) + return Tensor(handle, False) - def get_output_by_id(self, id): - handle = ffc().flexflow_op_get_output_by_id(self.handle, id) - return Tensor(handle, False) + def init(self, model): + ffc().flexflow_op_init(self.handle, 
model.handle) - def init(self, model): - ffc().flexflow_op_init(self.handle, model.handle) + def forward(self, model): + ffc().flexflow_op_forward(self.handle, model.handle) + # return Tensor(handle) - def forward(self, model): - ffc().flexflow_op_forward(self.handle, model.handle) - #return Tensor(handle) + def _add_to_model(self, model): + ffc().flexflow_op_add_to_model(self.handle, model.handle) - def _add_to_model(self, model): - ffc().flexflow_op_add_to_model(self.handle, model.handle) + def get_output_tensor(self): + return self.get_output_by_id(0) - def get_output_tensor(self): - return self.get_output_by_id(0) # ----------------------------------------------------------------------- # Exp # ----------------------------------------------------------------------- class Exp(Op): - def __init__(self, handle, idx=None, name=None): - super(Exp, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Exp, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Sin # ----------------------------------------------------------------------- class Sin(Op): - def __init__(self, handle, idx=None, name=None): - super(Sin, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Sin, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Cos # ----------------------------------------------------------------------- class Cos(Op): - def __init__(self, handle, idx=None, name=None): - super(Cos, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Cos, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Add # ----------------------------------------------------------------------- class Add(Op): - def __init__(self, handle, idx=None, name=None): - super(Add, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Add, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Subtract # ----------------------------------------------------------------------- class Subtract(Op): - def __init__(self, handle, idx=None, name=None): - super(Subtract, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Subtract, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Multiply # ----------------------------------------------------------------------- class Multiply(Op): - def __init__(self, handle, idx=None, name=None): - super(Multiply, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Multiply, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Divide # ----------------------------------------------------------------------- class Divide(Op): - def __init__(self, handle, idx=None, name=None): - super(Divide, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Divide, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Max # ----------------------------------------------------------------------- class Max(Op): - def __init__(self, handle, idx=None, name=None): - super(Max, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, 
name=None): + super(Max, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Min # ----------------------------------------------------------------------- class Min(Op): - def __init__(self, handle, idx=None, name=None): - super(Min, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Min, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # ReduceSum # ----------------------------------------------------------------------- class ReduceSum(Op): - def __init__(self, handle, idx=None, name=None): - super(ReduceSum, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(ReduceSum, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Conv2D # ----------------------------------------------------------------------- class Conv2D(Op): - def __init__(self, handle, idx=None, name=None): - super(Conv2D, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Conv2D, self).__init__(handle, idx, name) - def get_weight_tensor(self): - return self.get_parameter_by_id(0) + def get_weight_tensor(self): + return self.get_parameter_by_id(0) - def get_bias_tensor(self): - return self.get_parameter_by_id(1) + def get_bias_tensor(self): + return self.get_parameter_by_id(1) - def get_input_tensor(self): - return self.get_input_by_id(0) + def get_input_tensor(self): + return self.get_input_by_id(0) + + def get_output_tensor(self): + return self.get_output_by_id(0) - def get_output_tensor(self): - return self.get_output_by_id(0) # ----------------------------------------------------------------------- # Pool2D # ----------------------------------------------------------------------- class Pool2D(Op): - def __init__(self, handle, idx=None, name=None): - super(Pool2D, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Pool2D, self).__init__(handle, idx, name) + + def get_input_tensor(self): + return self.get_input_by_id(0) - def get_input_tensor(self): - return self.get_input_by_id(0) + def get_output_tensor(self): + return self.get_output_by_id(0) - def get_output_tensor(self): - return self.get_output_by_id(0) # ----------------------------------------------------------------------- # Linear # ----------------------------------------------------------------------- class Linear(Op): - def __init__(self, handle, idx=None, name=None): - super(Linear, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Linear, self).__init__(handle, idx, name) - def get_weight_tensor(self): - return self.get_parameter_by_id(0) + def get_weight_tensor(self): + return self.get_parameter_by_id(0) - def get_bias_tensor(self): - return self.get_parameter_by_id(1) + def get_bias_tensor(self): + return self.get_parameter_by_id(1) - def get_input_tensor(self): - return self.get_input_by_id(0) + def get_input_tensor(self): + return self.get_input_by_id(0) + + def get_output_tensor(self): + return self.get_output_by_id(0) - def get_output_tensor(self): - return self.get_output_by_id(0) # ----------------------------------------------------------------------- # Flat # ----------------------------------------------------------------------- class Flat(Op): - def __init__(self, handle, idx=None, name=None): - super(Flat, self).__init__(handle, idx, name) + def __init__(self, 
handle, idx=None, name=None): + super(Flat, self).__init__(handle, idx, name) + + def get_input_tensor(self): + return self.get_input_by_id(0) - def get_input_tensor(self): - return self.get_input_by_id(0) + def get_output_tensor(self): + return self.get_output_by_id(0) - def get_output_tensor(self): - return self.get_output_by_id(0) # ----------------------------------------------------------------------- # Softmax # ----------------------------------------------------------------------- class Softmax(Op): - def __init__(self, handle, idx=None, name=None): - super(Softmax, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Softmax, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Embedding # ----------------------------------------------------------------------- class Embedding(Op): - def __init__(self, handle, idx=None, name=None): - super(Embedding, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Embedding, self).__init__(handle, idx, name) + + def get_weight_tensor(self): + return self.get_parameter_by_id(0) - def get_weight_tensor(self): - return self.get_parameter_by_id(0) # ----------------------------------------------------------------------- # Concat # ----------------------------------------------------------------------- class Concat(Op): - def __init__(self, handle, idx=None, name=None): - super(Concat, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Concat, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # BatchNorm # ----------------------------------------------------------------------- class BatchNorm(Op): - def __init__(self, handle, idx=None, name=None): - super(BatchNorm, self).__init__(handle, idx, name) - + def __init__(self, handle, idx=None, name=None): + super(BatchNorm, self).__init__(handle, idx, name) + + # ----------------------------------------------------------------------- # LayerNorm # ----------------------------------------------------------------------- class LayerNorm(Op): - def __init__(self, handle, idx=None, name=None): - super(LayerNorm, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(LayerNorm, self).__init__(handle, idx, name) + + def get_weight_tensor(self): + return self.get_parameter_by_id(0) + + def get_bias_tensor(self): + return self.get_parameter_by_id(1) + - def get_weight_tensor(self): - return self.get_parameter_by_id(0) +# ----------------------------------------------------------------------- +# ResidualLayerNorm +# ----------------------------------------------------------------------- +class ResidualLayerNorm(Op): + def __init__(self, handle, idx=None, name=None): + super(ResidualLayerNorm, self).__init__(handle, idx, name) + + def get_weight_tensor(self): + return self.get_parameter_by_id(1) + + def get_bias_tensor(self): + return self.get_parameter_by_id(2) - def get_bias_tensor(self): - return self.get_parameter_by_id(1) # ----------------------------------------------------------------------- # AddBiasResidualLayerNorm # ----------------------------------------------------------------------- class AddBiasResidualLayerNorm(Op): - def __init__(self, handle, idx=None, name=None): - super(AddBiasResidualLayerNorm, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(AddBiasResidualLayerNorm, 
self).__init__(handle, idx, name) + + def get_attn_bias_tensor(self): + return self.get_parameter_by_id(0) - def get_attn_bias_tensor(self): - return self.get_parameter_by_id(0) - - def get_weight_tensor(self): - return self.get_parameter_by_id(1) + def get_weight_tensor(self): + return self.get_parameter_by_id(1) + + def get_bias_tensor(self): + return self.get_parameter_by_id(2) - def get_bias_tensor(self): - return self.get_parameter_by_id(2) # ----------------------------------------------------------------------- # SigmoidSiluMulti # ----------------------------------------------------------------------- class SigmoidSiluMulti(Op): - def __init__(self, handle, idx=None, name=None): - super(SigmoidSiluMulti, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(SigmoidSiluMulti, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Dropout # ----------------------------------------------------------------------- class Dropout(Op): - def __init__(self, handle, idx=None, name=None): - super(Dropout, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Dropout, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # ScalarMultiply # ----------------------------------------------------------------------- class ScalarMultiply(Op): - def __init__(self, handle, idx=None, name=None): - super(ScalarMultiply, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(ScalarMultiply, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # ScalarAdd # ----------------------------------------------------------------------- class ScalarAdd(Op): - def __init__(self, handle, idx=None, name=None): - super(ScalarAdd, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(ScalarAdd, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # ScalarSub # ----------------------------------------------------------------------- class ScalarSub(Op): - def __init__(self, handle, idx=None, name=None): - super(ScalarSub, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(ScalarSub, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # ScalarTrueDiv # ----------------------------------------------------------------------- class ScalarTrueDiv(Op): - def __init__(self, handle, idx=None, name=None): - super(ScalarTrueDiv, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(ScalarTrueDiv, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Rsqrt # ----------------------------------------------------------------------- class Rsqrt(Op): - def __init__(self, handle, idx=None, name=None): - super(Rsqrt, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Rsqrt, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Pow # ----------------------------------------------------------------------- class Pow(Op): - def __init__(self, handle, idx=None, name=None): - super(Pow, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + 
super(Pow, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Mean # ----------------------------------------------------------------------- class Mean(Op): - def __init__(self, handle, idx=None, name=None): - super(Mean, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Mean, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Relu # ----------------------------------------------------------------------- class Relu(Op): - def __init__(self, handle, idx=None, name=None): - super(Relu, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Relu, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Gelu # ----------------------------------------------------------------------- class Gelu(Op): - def __init__(self, handle, idx=None, name=None): - super(Gelu, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Gelu, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Sigmod # ----------------------------------------------------------------------- class Sigmoid(Op): - def __init__(self, handle, idx=None, name=None): - super(Sigmoid, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Sigmoid, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Tanh # ----------------------------------------------------------------------- class Tanh(Op): - def __init__(self, handle, idx=None, name=None): - super(Tanh, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Tanh, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Elu # ----------------------------------------------------------------------- class Elu(Op): - def __init__(self, handle, idx=None, name=None): - super(Elu, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Elu, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Batch_Norm # ----------------------------------------------------------------------- class Batch_Norm(Op): - def __init__(self, handle, idx=None, name=None): - super(Batch_Norm, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Batch_Norm, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Batch_Matmul # ----------------------------------------------------------------------- class Batch_Matmul(Op): - def __init__(self, handle, idx=None, name=None): - super(Batch_Matmul, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Batch_Matmul, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Split # ----------------------------------------------------------------------- class Split(Op): - def __init__(self, handle, idx=None, name=None): - super(Split, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Split, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Reshape # 
----------------------------------------------------------------------- class Reshape(Op): - def __init__(self, handle, idx=None, name=None): - super(Reshape, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Reshape, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Gather # ----------------------------------------------------------------------- class Gather(Op): - def __init__(self, handle, idx=None, name=None): - super(Gather, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Gather, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Identity # ----------------------------------------------------------------------- class Identity(Op): - def __init__(self, handle, idx=None, name=None): - super(Identity, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Identity, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Transpose # ----------------------------------------------------------------------- class Transpose(Op): - def __init__(self, handle, idx=None, name=None): - super(Transpose, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Transpose, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Reverse # ----------------------------------------------------------------------- class Reverse(Op): - def __init__(self, handle, idx=None, name=None): - super(Reverse, self).__init__(handle, idx, name) - + def __init__(self, handle, idx=None, name=None): + super(Reverse, self).__init__(handle, idx, name) + + # ----------------------------------------------------------------------- # MultiHeadAttention # ----------------------------------------------------------------------- class MultiHeadAttention(Op): - def __init__(self, handle, idx=None, name=None): - super(MultiHeadAttention, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(MultiHeadAttention, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Incremental MultiHeadAttention # ----------------------------------------------------------------------- class IncMultiHeadAttention(Op): - def __init__(self, handle, idx=None, name=None): - super(IncMultiHeadAttention, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(IncMultiHeadAttention, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Speculative Incremental MultiHeadAttention # ----------------------------------------------------------------------- class SpecIncMultiHeadSelfAttention(Op): - def __init__(self, handle, idx=None, name=None): - super(SpecIncMultiHeadSelfAttention, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(SpecIncMultiHeadSelfAttention, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # TreeVerify Incremental MultiHeadAttention # ----------------------------------------------------------------------- class TreeIncMultiHeadSelfAttention(Op): - def __init__(self, handle, idx=None, name=None): - super(TreeIncMultiHeadSelfAttention, self).__init__(handle, idx, 
name) + def __init__(self, handle, idx=None, name=None): + super(TreeIncMultiHeadSelfAttention, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # RMS Norm # ----------------------------------------------------------------------- class RMSNorm(Op): - def __init__(self, handle, idx=None, name=None): - super(RMSNorm, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(RMSNorm, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Residual RMS Norm # ----------------------------------------------------------------------- class ResidualRMSNorm(Op): - def __init__(self, handle, idx=None, name=None): - super(ResidualRMSNorm, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(ResidualRMSNorm, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # ArgTopK # ----------------------------------------------------------------------- class ArgTopK(Op): - def __init__(self, handle, idx=None, name=None): - super(ArgTopK, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(ArgTopK, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # BeamTopK # ----------------------------------------------------------------------- class BeamTopK(Op): - def __init__(self, handle, idx=None, name=None): - super(BeamTopK, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(BeamTopK, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # Sampling # ----------------------------------------------------------------------- class Sampling(Op): - def __init__(self, handle, idx=None, name=None): - super(Sampling, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(Sampling, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # ArgMax # ----------------------------------------------------------------------- class ArgMax(Op): - def __init__(self, handle, idx=None, name=None): - super(ArgMax, self).__init__(handle, idx, name) + def __init__(self, handle, idx=None, name=None): + super(ArgMax, self).__init__(handle, idx, name) + # ----------------------------------------------------------------------- # flexflow_op_t handle to Op # ----------------------------------------------------------------------- def convert_op_handle_to_op(op_type, handle, idx=None, name=None): - if op_type == OpType.CONV2D: - return Conv2D(handle, idx, name) - elif op_type == OpType.POOL2D: - return Pool2D(handle, idx, name) - elif op_type == OpType.LINEAR: - return Linear(handle, idx, name) - elif op_type == OpType.EMBEDDING: - return Embedding(handle, idx, name) - elif op_type == OpType.FLAT: - return Flat(handle, idx, name) - elif op_type == OpType.CONCAT: - return Concat(handle, idx, name) - elif op_type == OpType.SOFTMAX: - return Softmax(handle, idx, name) - elif op_type == OpType.EXP: - return Exp(handle, idx, name) - elif op_type == OpType.SIN: - return Sin(handle, idx, name) - elif op_type == OpType.COS: - return Cos(handle, idx, name) - elif op_type == OpType.ADD: - return Add(handle, idx, name) - elif op_type == OpType.SUBTRACT: - return Subtract(handle, idx, name) - elif op_type == OpType.MULTIPLY: - return 
Multiply(handle, idx, name) - elif op_type == OpType.DIVIDE: - return Divide(handle, idx, name) - elif op_type == OpType.MAX: - return Max(handle, idx, name) - elif op_type == OpType.MIN: - return Min(handle, idx, name) - elif op_type == OpType.REDUCE_SUM: - return ReduceSum(handle, idx, name) - elif op_type == OpType.MSELOSS: - return MSELoss(handle, idx, name) - elif op_type == OpType.SCALAR_MULTIPLY: - return ScalarMultiply(handle, idx, name) - elif op_type == OpType.SCALAR_ADD: - return ScalarAdd(handle, idx, name) - elif op_type == OpType.SCALAR_SUB: - return ScalarSub(handle, idx, name) - elif op_type == OpType.SCALAR_FLOORDIV: - return ScalarFloorDiv(handle, idx, name) - elif op_type == OpType.SCALAR_TRUEDIV: - return ScalarTrueDiv(handle, idx, name) - elif op_type == OpType.GELU: - return Gelu(handle, idx, name) - elif op_type == OpType.RELU: - return Relu(handle, idx, name) - elif op_type == OpType.SIGMOID: - return Sigmoid(handle, idx, name) - elif op_type == OpType.TANH: - return Tanh(handle, idx, name) - elif op_type == OpType.ELU: - return Elu(handle, idx, name) - elif op_type == OpType.DROPOUT: - return Dropout(handle, idx, name) - elif op_type == OpType.BATCH_NORM: - return BatchNorm(handle, idx, name) - elif op_type == OpType.LAYER_NORM: - return LayerNorm(handle, idx, name) - elif op_type == OpType.ADD_BIAS_RESIDUAL_LAYERNORM: - return AddBiasResidualLayerNorm(handle, idx, name) - elif op_type == OpType.SIGMOID_SILU_MULTI: - return SigmoidSiluMulti(handle, idx, name) - elif op_type == OpType.BATCH_MATMUL: - return Batch_Matmul(handle, idx, name) - elif op_type == OpType.SPLIT: - return Split(handle, idx, name) - elif op_type == OpType.RESHAPE: - return Reshape(handle, idx, name) - elif op_type == OpType.IDENTITY: - return Identity(handle,idx,name) - elif op_type == OpType.TRANSPOSE: - return Transpose(handle, idx, name) - elif op_type == OpType.REVERSE: - return Reverse(handle, idx, name) - elif op_type == OpType.MULTIHEAD_ATTENTION: - return MultiHeadAttention(handle, idx, name) - elif op_type == OpType.INC_MULTIHEAD_ATTENTION: - return IncMultiHeadAttention(handle, idx, name) - elif op_type == OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION: - return SpecIncMultiHeadSelfAttention(handle, idx, name) - elif op_type == OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION: - return TreeIncMultiHeadSelfAttention(handle, idx, name) - elif op_type == OpType.RMS_NORM: - return RMSNorm(handle, idx, name) - elif op_type == OpType.RESIDUAL_RMS_NORM: - return ResidualRMSNorm(handle, idx, name) - elif op_type == OpType.ARG_TOPK: - return ArgTopK(handle, idx, name) - elif op_type == OpType.BEAM_TOPK: - return BeamTopK(handle, idx, name) - elif op_type == OpType.SAMPLING: - return Sampling(handle, idx, name) - elif op_type == OpType.ARGMAX: - return ArgMax(handle, idx, name) - elif op_type == OpType.RSQRT: - return Rsqrt(handle, idx, name) - elif op_type == OpType.POW: - return Pow(handle, idx, name) - elif op_type == OpType.MEAN: - return Mean(handle, idx, name) - elif op_type == OpType.GATHER: - return Gather(handle, idx, name) - else: - assert 0, "unknown layer type {}".format(op_type) - return None + if op_type == OpType.CONV2D: + return Conv2D(handle, idx, name) + elif op_type == OpType.POOL2D: + return Pool2D(handle, idx, name) + elif op_type == OpType.LINEAR: + return Linear(handle, idx, name) + elif op_type == OpType.EMBEDDING: + return Embedding(handle, idx, name) + elif op_type == OpType.FLAT: + return Flat(handle, idx, name) + elif op_type == OpType.CONCAT: + return Concat(handle, idx, name) + 
elif op_type == OpType.SOFTMAX: + return Softmax(handle, idx, name) + elif op_type == OpType.EXP: + return Exp(handle, idx, name) + elif op_type == OpType.SIN: + return Sin(handle, idx, name) + elif op_type == OpType.COS: + return Cos(handle, idx, name) + elif op_type == OpType.ADD: + return Add(handle, idx, name) + elif op_type == OpType.SUBTRACT: + return Subtract(handle, idx, name) + elif op_type == OpType.MULTIPLY: + return Multiply(handle, idx, name) + elif op_type == OpType.DIVIDE: + return Divide(handle, idx, name) + elif op_type == OpType.MAX: + return Max(handle, idx, name) + elif op_type == OpType.MIN: + return Min(handle, idx, name) + elif op_type == OpType.REDUCE_SUM: + return ReduceSum(handle, idx, name) + elif op_type == OpType.MSELOSS: + return MSELoss(handle, idx, name) + elif op_type == OpType.SCALAR_MULTIPLY: + return ScalarMultiply(handle, idx, name) + elif op_type == OpType.SCALAR_ADD: + return ScalarAdd(handle, idx, name) + elif op_type == OpType.SCALAR_SUB: + return ScalarSub(handle, idx, name) + elif op_type == OpType.SCALAR_FLOORDIV: + return ScalarFloorDiv(handle, idx, name) + elif op_type == OpType.SCALAR_TRUEDIV: + return ScalarTrueDiv(handle, idx, name) + elif op_type == OpType.GELU: + return Gelu(handle, idx, name) + elif op_type == OpType.RELU: + return Relu(handle, idx, name) + elif op_type == OpType.SIGMOID: + return Sigmoid(handle, idx, name) + elif op_type == OpType.TANH: + return Tanh(handle, idx, name) + elif op_type == OpType.ELU: + return Elu(handle, idx, name) + elif op_type == OpType.DROPOUT: + return Dropout(handle, idx, name) + elif op_type == OpType.BATCH_NORM: + return BatchNorm(handle, idx, name) + elif op_type == OpType.LAYER_NORM: + return LayerNorm(handle, idx, name) + elif op_type == OpType.RESIDUAL_LAYERNORM: + return ResidualLayerNorm(handle, idx, name) + elif op_type == OpType.ADD_BIAS_RESIDUAL_LAYERNORM: + return AddBiasResidualLayerNorm(handle, idx, name) + elif op_type == OpType.SIGMOID_SILU_MULTI: + return SigmoidSiluMulti(handle, idx, name) + elif op_type == OpType.BATCH_MATMUL: + return Batch_Matmul(handle, idx, name) + elif op_type == OpType.SPLIT: + return Split(handle, idx, name) + elif op_type == OpType.RESHAPE: + return Reshape(handle, idx, name) + elif op_type == OpType.IDENTITY: + return Identity(handle, idx, name) + elif op_type == OpType.TRANSPOSE: + return Transpose(handle, idx, name) + elif op_type == OpType.REVERSE: + return Reverse(handle, idx, name) + elif op_type == OpType.MULTIHEAD_ATTENTION: + return MultiHeadAttention(handle, idx, name) + elif op_type == OpType.INC_MULTIHEAD_ATTENTION: + return IncMultiHeadAttention(handle, idx, name) + elif op_type == OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION: + return SpecIncMultiHeadSelfAttention(handle, idx, name) + elif op_type == OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION: + return TreeIncMultiHeadSelfAttention(handle, idx, name) + elif op_type == OpType.RMS_NORM: + return RMSNorm(handle, idx, name) + elif op_type == OpType.RESIDUAL_RMS_NORM: + return ResidualRMSNorm(handle, idx, name) + elif op_type == OpType.ARG_TOPK: + return ArgTopK(handle, idx, name) + elif op_type == OpType.BEAM_TOPK: + return BeamTopK(handle, idx, name) + elif op_type == OpType.SAMPLING: + return Sampling(handle, idx, name) + elif op_type == OpType.ARGMAX: + return ArgMax(handle, idx, name) + elif op_type == OpType.RSQRT: + return Rsqrt(handle, idx, name) + elif op_type == OpType.POW: + return Pow(handle, idx, name) + elif op_type == OpType.MEAN: + return Mean(handle, idx, name) + elif op_type == 
OpType.GATHER: + return Gather(handle, idx, name) + else: + assert 0, "unknown layer type {}".format(op_type) + return None + # ----------------------------------------------------------------------- # FFConfig # ----------------------------------------------------------------------- + class FFConfig(object): - __slots__ = ['handle', '_handle', 'enable_tracing'] - def __init__(self): - self.handle = ffc().flexflow_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_config_destroy) - self.enable_tracing = False - - def parse_args(self): - ffc().flexflow_config_parse_args_default(self.handle) - - @property - def batch_size(self): - return ffc().flexflow_config_get_batch_size(self.handle) - - @property - def workers_per_node(self): - return ffc().flexflow_config_get_workers_per_node(self.handle) - - @property - def num_nodes(self): - return ffc().flexflow_config_get_num_nodes(self.handle) - - @property - def epochs(self): - return ffc().flexflow_config_get_epochs(self.handle) - - @property - def enable_control_replication(self): - return ffc().flexflow_config_get_enable_control_replication(self.handle) - - @property - def data_parallelism_degree(self): - return ffc().flexflow_config_get_data_parallelism_degree(self.handle) - - @data_parallelism_degree.setter - def data_parallelism_degree(self, value): - if type(value) is not int: - raise ValueError("The data parallelism degree must be specified as an integer number") - elif value < 1: - raise ValueError("The data parallelism degree cannot be lower than 1") - ffc().flexflow_config_set_data_parallelism_degree(self.handle, value) - - @property - def tensor_parallelism_degree(self): - return ffc().flexflow_config_get_tensor_parallelism_degree(self.handle) - - @tensor_parallelism_degree.setter - def tensor_parallelism_degree(self, value): - if type(value) is not int: - raise ValueError("The tensor parallelism degree must be specified as an integer number") - elif value < 1: - raise ValueError("The tensor parallelism degree cannot be lower than 1") - ffc().flexflow_config_set_tensor_parallelism_degree(self.handle, value) - - @property - def pipeline_parallelism_degree(self): - return ffc().flexflow_config_get_pipeline_parallelism_degree(self.handle) - - @pipeline_parallelism_degree.setter - def pipeline_parallelism_degree(self, value): - if type(value) is not int: - raise ValueError("The pipeline parallelism degree must be specified as an integer number") - elif value < 1: - raise ValueError("The pipeline parallelism degree cannot be lower than 1") - ffc().flexflow_config_set_pipeline_parallelism_degree(self.handle, value) - - @property - def python_data_loader_type(self): - return ffc().flexflow_config_get_python_data_loader_type(self.handle) - - @property - def cpu_offload(self): - return ffc().flexflow_config_get_offload(self.handle) - - def get_current_time(self): - return ffc().flexflow_get_current_time(self.handle) - - def begin_trace(self, trace_id): - if self.enable_tracing: - ffc().flexflow_begin_trace(self.handle, trace_id) - - def end_trace(self, trace_id): - if self.enable_tracing: - ffc().flexflow_end_trace(self.handle, trace_id) + __slots__ = ["handle", "_handle", "enable_tracing"] + + def __init__(self): + self.handle = ffc().flexflow_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_config_destroy) + self.enable_tracing = False + + def parse_args(self): + ffc().flexflow_config_parse_args_default(self.handle) + + @property + def batch_size(self): + return 
ffc().flexflow_config_get_batch_size(self.handle) + + @property + def workers_per_node(self): + return ffc().flexflow_config_get_workers_per_node(self.handle) + + @property + def num_nodes(self): + return ffc().flexflow_config_get_num_nodes(self.handle) + + @property + def epochs(self): + return ffc().flexflow_config_get_epochs(self.handle) + + @property + def enable_control_replication(self): + return ffc().flexflow_config_get_enable_control_replication(self.handle) + + @property + def data_parallelism_degree(self): + return ffc().flexflow_config_get_data_parallelism_degree(self.handle) + + @data_parallelism_degree.setter + def data_parallelism_degree(self, value): + if type(value) is not int: + raise ValueError( + "The data parallelism degree must be specified as an integer number" + ) + elif value < 1: + raise ValueError("The data parallelism degree cannot be lower than 1") + ffc().flexflow_config_set_data_parallelism_degree(self.handle, value) + + @property + def tensor_parallelism_degree(self): + return ffc().flexflow_config_get_tensor_parallelism_degree(self.handle) + + @tensor_parallelism_degree.setter + def tensor_parallelism_degree(self, value): + if type(value) is not int: + raise ValueError( + "The tensor parallelism degree must be specified as an integer number" + ) + elif value < 1: + raise ValueError("The tensor parallelism degree cannot be lower than 1") + ffc().flexflow_config_set_tensor_parallelism_degree(self.handle, value) + + @property + def pipeline_parallelism_degree(self): + return ffc().flexflow_config_get_pipeline_parallelism_degree(self.handle) + + @pipeline_parallelism_degree.setter + def pipeline_parallelism_degree(self, value): + if type(value) is not int: + raise ValueError( + "The pipeline parallelism degree must be specified as an integer number" + ) + elif value < 1: + raise ValueError("The pipeline parallelism degree cannot be lower than 1") + ffc().flexflow_config_set_pipeline_parallelism_degree(self.handle, value) + + @property + def python_data_loader_type(self): + return ffc().flexflow_config_get_python_data_loader_type(self.handle) + + @property + def cpu_offload(self): + return ffc().flexflow_config_get_offload(self.handle) + + def get_current_time(self): + return ffc().flexflow_get_current_time(self.handle) + + def begin_trace(self, trace_id): + if self.enable_tracing: + ffc().flexflow_begin_trace(self.handle, trace_id) + + def end_trace(self, trace_id): + if self.enable_tracing: + ffc().flexflow_end_trace(self.handle, trace_id) + # ----------------------------------------------------------------------- # Tensor # ----------------------------------------------------------------------- -class Tensor(object): - __slots__ = ['p_handle', 'handle', '_handle', 'num_dims', 'dims', 'data_type', 'owner_op', 'mapped'] - def __init__(self, handle, deallocate=True, owner_op_type=None, p_handle=None): - if handle == None and ffi.typeof(p_handle) == ffi.typeof('flexflow_tensor_t*'): - self.p_handle = p_handle - self.handle = self.p_handle[0] - elif handle != None and ffi.typeof(handle) == ffi.typeof('flexflow_tensor_t'): - self.p_handle = 0 - self.handle = handle - #elif handle != None and ffi.typeof(handle) == ffi.typeof('flexflow_tensor_t'): - # self.p_handle = ffi.new('flexflow_tensor_t *') - # self.p_handle.impl = handle.impl - # self.handle = self.p_handle[0] - else: - assert 0, "Tensor handle is wrong" - self.num_dims = 0 - self.dims = 0 - self.mapped = False - self.__get_dims() - self.__get_data_type() - # if (deallocate == True): - # self._handle = 
ffi.gc(self.handle, ffc().flexflow_tensor_destroy) - # if (self.is_mapped() == True): - # self.mapped = True - - if owner_op_type != None: - self.__get_owner_op(owner_op_type) - assert self.owner_op != None - - def inline_map(self, ffmodel, ffconfig): - assert self.mapped == False, "Tensor is already mapped." - ffc().flexflow_tensor_inline_map(self.handle, ffmodel.handle, ffconfig.handle); - self.mapped = True - assert self.num_dims > 0, "check dims" - - def inline_unmap(self, ffmodel, ffconfig): - assert self.mapped == True, "Tensor is not inline mapped." - ffc().flexflow_tensor_inline_unmap(self.handle, ffmodel.handle, ffconfig.handle); - self.mapped = False - - def get_array(self, ffmodel, ffconfig): - assert self.mapped == True, "Tensor is not mapped." - raw_ptr = self.__get_raw_ptr(ffmodel, ffconfig, self.data_type) - raw_ptr_int = int(ffi.cast("uintptr_t", raw_ptr)) - fflogger.debug("raw_ptr: %s, %d" %( str(raw_ptr), raw_ptr_int)) - strides = None - if (self.num_dims >= 1 or self.num_dims <= 4): - shape = self.dims - else: - assert 0, "unknow num_dims" - initializer = RegionNdarray(shape, self.data_type, raw_ptr_int, strides, False) - array = np.asarray(initializer) - # print("stride", array.__array_interface__['strides']) - return array - - def get_flat_array(self, ffmodel, ffconfig): - assert self.mapped == True, "Tensor is not mapped." - raw_ptr = self.__get_raw_ptr(ffmodel, ffconfig, self.data_type) - raw_ptr_int = int(ffi.cast("uintptr_t", raw_ptr)) - fflogger.debug("raw_ptr: %s, %d" %( str(raw_ptr), raw_ptr_int)) - strides = None - if (self.num_dims >= 1 or self.num_dims <= 4): - shape_prod = np.prod(self.dims) - shape = (shape_prod,) - else: - assert 0, "unknown num_dims" - initializer = RegionNdarray(shape, self.data_type, raw_ptr_int, strides, False) - array = np.asarray(initializer) - return array - - def attach_numpy_array(self, ffmodel, ffconfig, np_array): - assert np_array.__array_interface__['strides'] == None, "numpy array strides is not None" - np_shape = np_array.shape - num_dims = len(np_shape) - assert num_dims == self.num_dims, "please check dims (%d == %d)" %(num_dims, self.num_dims) - for i in range(0, num_dims): - assert np_shape[i] == self.dims[i], "please check shape dim %d (%d == %d)" %(i, np_shape[i], self.dims[i]) - np_raw_ptr = np_array.__array_interface__['data'] - raw_ptr = ffi.cast("void*", np_raw_ptr[0]) - fflogger.debug("attach numpy array: %s, %s, %s" %( str(np_raw_ptr), str(raw_ptr), hex(np_raw_ptr[0]))) - self.__attach_raw_ptr(ffmodel, ffconfig, raw_ptr) - - def detach_numpy_array(self, ffconfig): - self.__detach_raw_ptr(ffconfig) - - def is_mapped(self): - return ffc().flexflow_tensor_is_mapped(self.handle) - - def set_tensor(self, ffmodel, np_array): - assert np_array.__array_interface__['strides'] == None, "Parameter set_weights, numpy array strides is not None" - np_shape = np_array.shape - num_dims = len(np_shape) - assert num_dims == self.num_dims, "please check dims (%d == %d)" %(num_dims, self.num_dims) - for i in range(0, num_dims): - assert np_shape[i] == self.dims[i], "please check shape dim %d (%d == %d)" %(i, np_shape[i], self.dims[i]) - c_dims = ffi.new("int[]", self.dims) - np_raw_ptr = np_array.__array_interface__['data'] - if np_array.dtype == np.float16: - assert self.data_type == DataType.DT_HALF, "Wrong datatype" - raw_ptr = ffi.cast("half*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_set_tensor_float(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) - elif np_array.dtype == np.float32: - assert self.data_type == 
DataType.DT_FLOAT, "Wrong datatype" - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_set_tensor_float(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) - elif np_array.dtype == np.int32: - assert self.data_type == DataType.DT_INT32, "Wrong datatype" - raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_set_tensor_int(self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr) - else: - assert 0, "Unsupported datatype" - fflogger.debug("set tensor raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(np_shape))) - assert ret_val == True, ret_val - - def get_tensor(self, ffmodel): - shape = self.dims - if self.data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif self.data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__['data'] - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, False) - elif np_array.dtype == np.int32: - raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int(self.handle, ffmodel.handle, raw_ptr, False) - elif np_array.dtype == np.int64: - raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, False) - fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) - assert ret_val == True - return np_array - - def get_gradients(self, ffmodel, comm_type): - shape = self.dims - if self.data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif self.data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__['data'] - c_comm_type = enum_to_int(ParameterSyncType, comm_type) - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, True) - elif np_array.dtype == np.int32: - raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int(self.handle, ffmodel.handle, raw_ptr, True) - elif np_array.dtype == np.int64: - raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, True) - fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) - assert ret_val == True - return np_array - - def get_model_output_gradients(self, ffmodel, comm_type): - shape = self.dims - if self.data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif self.data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - 
np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__['data'] - c_comm_type = enum_to_int(ParameterSyncType, comm_type) - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_model_get_output_tensor_float(ffmodel.handle, self.handle, raw_ptr, True) - else: - assert 0, "unknown data type" - fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) - assert ret_val == True - return np_array - - def get_model_output_tensor(self, ffmodel): - shape = self.dims - if self.data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif self.data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__['data'] - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_model_get_output_tensor_float(ffmodel.handle, self.handle, raw_ptr, False) - else: - assert 0, "unknown data type" - fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) - assert ret_val == True - return np_array - - def __get_raw_ptr(self, ffmodel, ffconfig, data_type): - assert data_type == self.data_type, "Tensor check data type" - if (data_type == DataType.DT_HALF): - return ffc().flexflow_tensor_get_raw_ptr_float(self.handle, ffmodel.handle, ffconfig.handle) - elif (data_type == DataType.DT_FLOAT): - return ffc().flexflow_tensor_get_raw_ptr_float(self.handle, ffmodel.handle, ffconfig.handle) - elif (data_type == DataType.DT_INT32): - return ffc().flexflow_tensor_get_raw_ptr_int32(self.handle, ffmodel.handle, ffconfig.handle) - else: - assert 0, "unknown data type" - - def __get_dims(self): - self.num_dims = ffc().flexflow_tensor_get_num_dims(self.handle) - # if (self.num_dims == 1): - # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 0),) - # elif (self.num_dims == 2): - # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) - # elif (self.num_dims == 3): - # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) - # elif (self.num_dims == 4): - # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 3), ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) - # elif (self.num_dims == 5): - # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 4), ffc().flexflow_tensor_get_dim(self.handle, 3), ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) - # else: - # assert 0, "unknown num_dims" - d = ffc().flexflow_tensor_get_dims(self.handle) - if (self.num_dims == 1): - self.dims = (d[0],) - elif (self.num_dims == 2): - self.dims = (d[1], d[0]) - elif (self.num_dims == 3): - self.dims = (d[2], d[1], d[0]) - elif (self.num_dims == 4): - self.dims = (d[3], d[2], d[1], d[0]) - elif (self.num_dims == 5): - self.dims = (d[4], d[3], d[2], d[1], d[0]) - else: - 
assert 0, "unknown num_dims" - - def __get_data_type(self): - dtype = ffc().flexflow_tensor_get_data_type(self.handle) - if (dtype == 40): - self.data_type = DataType.DT_BOOLEAN - elif (dtype == 41): - self.data_type = DataType.DT_INT32 - elif (dtype == 42): - self.data_type = DataType.DT_INT64 - elif (dtype == 43): - self.data_type = DataType.DT_HALF - elif (dtype == 44): - self.data_type = DataType.DT_FLOAT - elif (dtype == 45): - self.data_type = DataType.DT_DOUBLE - else: - assert 0, "unknown data type {}".format(dtype) - def __get_owner_op(self, op_type): - op_handle = ffc().flexflow_tensor_get_owner_op(self.handle) - if op_handle.impl == ffi.NULL: - self.owner_op = None - else: - self.owner_op = convert_op_handle_to_op(op_type, op_handle) - - def __attach_raw_ptr(self, ffmodel, ffconfig, raw_ptr, column_major=True): - assert self.mapped == False, "Tensor is already mapped." - ffc().flexflow_tensor_attach_raw_ptr(self.handle, ffmodel.handle, ffconfig.handle, raw_ptr, column_major) - self.mapped = True +class Tensor(object): + __slots__ = [ + "p_handle", + "handle", + "_handle", + "num_dims", + "dims", + "data_type", + "owner_op", + "mapped", + ] + + def __init__(self, handle, deallocate=True, owner_op_type=None, p_handle=None): + if handle == None and ffi.typeof(p_handle) == ffi.typeof("flexflow_tensor_t*"): + self.p_handle = p_handle + self.handle = self.p_handle[0] + elif handle != None and ffi.typeof(handle) == ffi.typeof("flexflow_tensor_t"): + self.p_handle = 0 + self.handle = handle + # elif handle != None and ffi.typeof(handle) == ffi.typeof('flexflow_tensor_t'): + # self.p_handle = ffi.new('flexflow_tensor_t *') + # self.p_handle.impl = handle.impl + # self.handle = self.p_handle[0] + else: + assert 0, "Tensor handle is wrong" + self.num_dims = 0 + self.dims = 0 + self.mapped = False + self.__get_dims() + self.__get_data_type() + # if (deallocate == True): + # self._handle = ffi.gc(self.handle, ffc().flexflow_tensor_destroy) + # if (self.is_mapped() == True): + # self.mapped = True + + if owner_op_type != None: + self.__get_owner_op(owner_op_type) + assert self.owner_op != None + + def inline_map(self, ffmodel, ffconfig): + assert self.mapped == False, "Tensor is already mapped." + ffc().flexflow_tensor_inline_map(self.handle, ffmodel.handle, ffconfig.handle) + self.mapped = True + assert self.num_dims > 0, "check dims" + + def inline_unmap(self, ffmodel, ffconfig): + assert self.mapped == True, "Tensor is not inline mapped." + ffc().flexflow_tensor_inline_unmap(self.handle, ffmodel.handle, ffconfig.handle) + self.mapped = False + + def get_array(self, ffmodel, ffconfig): + assert self.mapped == True, "Tensor is not mapped." + raw_ptr = self.__get_raw_ptr(ffmodel, ffconfig, self.data_type) + raw_ptr_int = int(ffi.cast("uintptr_t", raw_ptr)) + fflogger.debug("raw_ptr: %s, %d" % (str(raw_ptr), raw_ptr_int)) + strides = None + if self.num_dims >= 1 or self.num_dims <= 4: + shape = self.dims + else: + assert 0, "unknow num_dims" + initializer = RegionNdarray(shape, self.data_type, raw_ptr_int, strides, False) + array = np.asarray(initializer) + # print("stride", array.__array_interface__['strides']) + return array + + def get_flat_array(self, ffmodel, ffconfig): + assert self.mapped == True, "Tensor is not mapped." 
+ raw_ptr = self.__get_raw_ptr(ffmodel, ffconfig, self.data_type) + raw_ptr_int = int(ffi.cast("uintptr_t", raw_ptr)) + fflogger.debug("raw_ptr: %s, %d" % (str(raw_ptr), raw_ptr_int)) + strides = None + if self.num_dims >= 1 or self.num_dims <= 4: + shape_prod = np.prod(self.dims) + shape = (shape_prod,) + else: + assert 0, "unknown num_dims" + initializer = RegionNdarray(shape, self.data_type, raw_ptr_int, strides, False) + array = np.asarray(initializer) + return array + + def attach_numpy_array(self, ffmodel, ffconfig, np_array): + assert ( + np_array.__array_interface__["strides"] == None + ), "numpy array strides is not None" + np_shape = np_array.shape + num_dims = len(np_shape) + assert num_dims == self.num_dims, "please check dims (%d == %d)" % ( + num_dims, + self.num_dims, + ) + for i in range(0, num_dims): + assert ( + np_shape[i] == self.dims[i] + ), "please check shape dim %d (%d == %d)" % (i, np_shape[i], self.dims[i]) + np_raw_ptr = np_array.__array_interface__["data"] + raw_ptr = ffi.cast("void*", np_raw_ptr[0]) + fflogger.debug( + "attach numpy array: %s, %s, %s" + % (str(np_raw_ptr), str(raw_ptr), hex(np_raw_ptr[0])) + ) + self.__attach_raw_ptr(ffmodel, ffconfig, raw_ptr) + + def detach_numpy_array(self, ffconfig): + self.__detach_raw_ptr(ffconfig) + + def is_mapped(self): + return ffc().flexflow_tensor_is_mapped(self.handle) + + def set_tensor(self, ffmodel, np_array): + assert ( + np_array.__array_interface__["strides"] == None + ), "Parameter set_weights, numpy array strides is not None" + np_shape = np_array.shape + num_dims = len(np_shape) + assert num_dims == self.num_dims, "please check dims (%d == %d)" % ( + num_dims, + self.num_dims, + ) + for i in range(0, num_dims): + assert ( + np_shape[i] == self.dims[i] + ), "please check shape dim %d (%d == %d)" % (i, np_shape[i], self.dims[i]) + c_dims = ffi.new("int[]", self.dims) + np_raw_ptr = np_array.__array_interface__["data"] + if np_array.dtype == np.float16: + assert self.data_type == DataType.DT_HALF, "Wrong datatype" + raw_ptr = ffi.cast("half*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_set_tensor_float( + self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr + ) + elif np_array.dtype == np.float32: + assert self.data_type == DataType.DT_FLOAT, "Wrong datatype" + raw_ptr = ffi.cast("float*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_set_tensor_float( + self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr + ) + elif np_array.dtype == np.int32: + assert self.data_type == DataType.DT_INT32, "Wrong datatype" + raw_ptr = ffi.cast("int*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_set_tensor_int( + self.handle, ffmodel.handle, num_dims, c_dims, raw_ptr + ) + else: + assert 0, "Unsupported datatype" + fflogger.debug( + "set tensor raw_ptr: %s, %s, %s, %s" + % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(np_shape)) + ) + assert ret_val == True, ret_val + + def get_tensor(self, ffmodel): + shape = self.dims + if self.data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif self.data_type == DataType.DT_FLOAT: + np_array = np.empty(shape, dtype=np.float32) + elif self.data_type == DataType.DT_INT32: + np_array = np.empty(shape, dtype=np.int32) + elif self.data_type == DataType.DT_INT64: + np_array = np.empty(shape, dtype=np.int64) + else: + assert 0, f"Unsupported datatype: {self.data_type}" + np_raw_ptr = np_array.__array_interface__["data"] + if np_array.dtype == np.float32: + raw_ptr = ffi.cast("float*", np_raw_ptr[0]) + ret_val = 
ffc().flexflow_tensor_get_tensor_float( + self.handle, ffmodel.handle, raw_ptr, False + ) + elif np_array.dtype == np.int32: + raw_ptr = ffi.cast("int*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_get_tensor_int( + self.handle, ffmodel.handle, raw_ptr, False + ) + elif np_array.dtype == np.int64: + raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_get_tensor_int64( + self.handle, ffmodel.handle, raw_ptr, False + ) + fflogger.debug( + "get weights raw_ptr: %s, %s, %s, %s" + % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) + ) + assert ret_val == True + return np_array + + def get_gradients(self, ffmodel, comm_type): + shape = self.dims + if self.data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif self.data_type == DataType.DT_FLOAT: + np_array = np.empty(shape, dtype=np.float32) + elif self.data_type == DataType.DT_INT32: + np_array = np.empty(shape, dtype=np.int32) + elif self.data_type == DataType.DT_INT64: + np_array = np.empty(shape, dtype=np.int64) + else: + assert 0, f"Unsupported datatype: {self.data_type}" + np_raw_ptr = np_array.__array_interface__["data"] + c_comm_type = enum_to_int(ParameterSyncType, comm_type) + if np_array.dtype == np.float32: + raw_ptr = ffi.cast("float*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_get_tensor_float( + self.handle, ffmodel.handle, raw_ptr, True + ) + elif np_array.dtype == np.int32: + raw_ptr = ffi.cast("int*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_get_tensor_int( + self.handle, ffmodel.handle, raw_ptr, True + ) + elif np_array.dtype == np.int64: + raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_get_tensor_int64( + self.handle, ffmodel.handle, raw_ptr, True + ) + fflogger.debug( + "get weights raw_ptr: %s, %s, %s, %s" + % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) + ) + assert ret_val == True + return np_array + + def get_model_output_gradients(self, ffmodel, comm_type): + shape = self.dims + if self.data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif self.data_type == DataType.DT_FLOAT: + np_array = np.empty(shape, dtype=np.float32) + elif self.data_type == DataType.DT_INT32: + np_array = np.empty(shape, dtype=np.int32) + elif self.data_type == DataType.DT_INT64: + np_array = np.empty(shape, dtype=np.int64) + else: + assert 0, f"Unsupported datatype: {self.data_type}" + np_raw_ptr = np_array.__array_interface__["data"] + c_comm_type = enum_to_int(ParameterSyncType, comm_type) + if np_array.dtype == np.float32: + raw_ptr = ffi.cast("float*", np_raw_ptr[0]) + ret_val = ffc().flexflow_model_get_output_tensor_float( + ffmodel.handle, self.handle, raw_ptr, True + ) + else: + assert 0, "unknown data type" + fflogger.debug( + "get weights raw_ptr: %s, %s, %s, %s" + % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) + ) + assert ret_val == True + return np_array + + def get_model_output_tensor(self, ffmodel): + shape = self.dims + if self.data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif self.data_type == DataType.DT_FLOAT: + np_array = np.empty(shape, dtype=np.float32) + elif self.data_type == DataType.DT_INT32: + np_array = np.empty(shape, dtype=np.int32) + elif self.data_type == DataType.DT_INT64: + np_array = np.empty(shape, dtype=np.int64) + else: + assert 0, f"Unsupported datatype: {self.data_type}" + np_raw_ptr = np_array.__array_interface__["data"] + if np_array.dtype == np.float32: + raw_ptr = 
ffi.cast("float*", np_raw_ptr[0]) + ret_val = ffc().flexflow_model_get_output_tensor_float( + ffmodel.handle, self.handle, raw_ptr, False + ) + else: + assert 0, "unknown data type" + fflogger.debug( + "get weights raw_ptr: %s, %s, %s, %s" + % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) + ) + assert ret_val == True + return np_array + + def __get_raw_ptr(self, ffmodel, ffconfig, data_type): + assert data_type == self.data_type, "Tensor check data type" + if data_type == DataType.DT_HALF: + return ffc().flexflow_tensor_get_raw_ptr_float( + self.handle, ffmodel.handle, ffconfig.handle + ) + elif data_type == DataType.DT_FLOAT: + return ffc().flexflow_tensor_get_raw_ptr_float( + self.handle, ffmodel.handle, ffconfig.handle + ) + elif data_type == DataType.DT_INT32: + return ffc().flexflow_tensor_get_raw_ptr_int32( + self.handle, ffmodel.handle, ffconfig.handle + ) + else: + assert 0, "unknown data type" + + def __get_dims(self): + self.num_dims = ffc().flexflow_tensor_get_num_dims(self.handle) + # if (self.num_dims == 1): + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 0),) + # elif (self.num_dims == 2): + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) + # elif (self.num_dims == 3): + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) + # elif (self.num_dims == 4): + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 3), ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) + # elif (self.num_dims == 5): + # self.dims = (ffc().flexflow_tensor_get_dim(self.handle, 4), ffc().flexflow_tensor_get_dim(self.handle, 3), ffc().flexflow_tensor_get_dim(self.handle, 2), ffc().flexflow_tensor_get_dim(self.handle, 1), ffc().flexflow_tensor_get_dim(self.handle, 0)) + # else: + # assert 0, "unknown num_dims" + d = ffc().flexflow_tensor_get_dims(self.handle) + if self.num_dims == 1: + self.dims = (d[0],) + elif self.num_dims == 2: + self.dims = (d[1], d[0]) + elif self.num_dims == 3: + self.dims = (d[2], d[1], d[0]) + elif self.num_dims == 4: + self.dims = (d[3], d[2], d[1], d[0]) + elif self.num_dims == 5: + self.dims = (d[4], d[3], d[2], d[1], d[0]) + else: + assert 0, "unknown num_dims" + + def __get_data_type(self): + dtype = ffc().flexflow_tensor_get_data_type(self.handle) + if dtype == 40: + self.data_type = DataType.DT_BOOLEAN + elif dtype == 41: + self.data_type = DataType.DT_INT32 + elif dtype == 42: + self.data_type = DataType.DT_INT64 + elif dtype == 43: + self.data_type = DataType.DT_HALF + elif dtype == 44: + self.data_type = DataType.DT_FLOAT + elif dtype == 45: + self.data_type = DataType.DT_DOUBLE + else: + assert 0, "unknown data type {}".format(dtype) + + def __get_owner_op(self, op_type): + op_handle = ffc().flexflow_tensor_get_owner_op(self.handle) + if op_handle.impl == ffi.NULL: + self.owner_op = None + else: + self.owner_op = convert_op_handle_to_op(op_type, op_handle) + + def __attach_raw_ptr(self, ffmodel, ffconfig, raw_ptr, column_major=True): + assert self.mapped == False, "Tensor is already mapped." + ffc().flexflow_tensor_attach_raw_ptr( + self.handle, ffmodel.handle, ffconfig.handle, raw_ptr, column_major + ) + self.mapped = True + + def __detach_raw_ptr(self, ffconfig): + assert self.mapped == True, "Tensor is not mapped." 
+ ffc().flexflow_tensor_detach_raw_ptr(self.handle, ffconfig.handle) + self.mapped = False - def __detach_raw_ptr(self, ffconfig): - assert self.mapped == True, "Tensor is not mapped." - ffc().flexflow_tensor_detach_raw_ptr(self.handle, ffconfig.handle) - self.mapped = False # ----------------------------------------------------------------------- # Parameter # ----------------------------------------------------------------------- + class Parameter(Tensor): - __slots__ = ['parameter_handle'] - def __init__(self, handle): - assert ffi.typeof(handle) == ffi.typeof('flexflow_tensor_t'), "Parameter handle is wrong" - self.parameter_handle = handle - super(Parameter, self).__init__(self.parameter_handle, deallocate=False) - - def set_weights(self, ffmodel, np_array): - assert np_array.__array_interface__['strides'] == None, "Parameter set_weights, numpy array strides is not None" - np_shape = np_array.shape - num_dims = len(np_shape) - assert num_dims == self.num_dims, "please check dims (%d == %d)" %(num_dims, self.num_dims) - print(np_shape, self.dims) - for i in range(0, num_dims): - assert np_shape[i] == self.dims[i], "please check shape dim %d (%d == %d)" %(i, np_shape[i], self.dims[i]) - c_dims = ffi.new("int[]", self.dims) - np_raw_ptr = np_array.__array_interface__['data'] - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - fflogger.debug("set weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(np_shape))) - ret_val = ffc().flexflow_tensor_set_tensor_float(self.parameter_handle, ffmodel.handle, num_dims, c_dims, raw_ptr) - assert ret_val == True, ret_val - - def get_weights(self, ffmodel): - shape = self.dims - np_array = np.empty(shape, dtype=np.float32) - np_raw_ptr = np_array.__array_interface__['data'] - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) - ret_val = ffc().flexflow_tensor_get_tensor_float(self.parameter_handle, ffmodel.handle, raw_ptr, False) - assert ret_val == True - return np_array + __slots__ = ["parameter_handle"] + + def __init__(self, handle): + assert ffi.typeof(handle) == ffi.typeof( + "flexflow_tensor_t" + ), "Parameter handle is wrong" + self.parameter_handle = handle + super(Parameter, self).__init__(self.parameter_handle, deallocate=False) + + def set_weights(self, ffmodel, np_array): + assert ( + np_array.__array_interface__["strides"] == None + ), "Parameter set_weights, numpy array strides is not None" + np_shape = np_array.shape + num_dims = len(np_shape) + assert num_dims == self.num_dims, "please check dims (%d == %d)" % ( + num_dims, + self.num_dims, + ) + print(np_shape, self.dims) + for i in range(0, num_dims): + assert ( + np_shape[i] == self.dims[i] + ), "please check shape dim %d (%d == %d)" % (i, np_shape[i], self.dims[i]) + c_dims = ffi.new("int[]", self.dims) + np_raw_ptr = np_array.__array_interface__["data"] + raw_ptr = ffi.cast("float*", np_raw_ptr[0]) + fflogger.debug( + "set weights raw_ptr: %s, %s, %s, %s" + % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(np_shape)) + ) + ret_val = ffc().flexflow_tensor_set_tensor_float( + self.parameter_handle, ffmodel.handle, num_dims, c_dims, raw_ptr + ) + assert ret_val == True, ret_val + + def get_weights(self, ffmodel): + shape = self.dims + np_array = np.empty(shape, dtype=np.float32) + np_raw_ptr = np_array.__array_interface__["data"] + raw_ptr = ffi.cast("float*", np_raw_ptr[0]) + fflogger.debug( + "get weights raw_ptr: %s, 
%s, %s, %s" + % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) + ) + ret_val = ffc().flexflow_tensor_get_tensor_float( + self.parameter_handle, ffmodel.handle, raw_ptr, False + ) + assert ret_val == True + return np_array + # ----------------------------------------------------------------------- # FFModel # ----------------------------------------------------------------------- + class FFModel(object): - """ - """ - __slots__ = ['handle', '_handle', '_layers', '_nb_layers', '_ffconfig', '_tracing_id', 'initializers', 'attr_tensors'] - def __init__(self, ffconfig): - """Constructor of FFModel. - - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} - - def get_layers(self): - return self._layers - - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op(op_type, op_handle, idx=layer_id, name=name) - self._nb_layers += 1 - - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. - - :param x: a shape tuple/list (integers), including the batch size. - :type x: list of int - - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType - - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. - :type create_grad: bool - - :returns: Tensor -- the output tensor. - """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create(self.handle, num_dims, c_dims, c_data_type, create_grad); - return Tensor(handle) - - def map_tensor(self, tensor, parallel_op = None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) - - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create(self.handle, num_dims, c_dims, value, c_data_type); - return Tensor(handle) - - def exp(self, x, name=None): - """Exponential activation function. - - :param x: the input Tensor. - :type x: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) - - def sin(self, x, name=None): - """Elementwise sine function. - - :param x: the input Tensor. - :type x: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - - def cos(self, x, name=None): - """Elementwise cosine function. - - :param x: the input Tensor. - :type x: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) - - - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add(self.handle, x.handle, y.handle, inplace_a, c_name) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) - - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract(self.handle, x.handle, y.handle, inplace_a, c_name) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) - - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply(self.handle, x.handle, y.handle, inplace_a, c_name) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) - - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide(self.handle, x.handle, y.handle, inplace_a, c_name) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) - - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max(self.handle, x.handle, y.handle, inplace_a, c_name) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) - - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min(self.handle, x.handle, y.handle, inplace_a, c_name) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) - - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. - - :param input: the input Tensor. - :type input: Tensor - - :param axes: the axes along which reduction is applied - :type axes: List[int] - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum(self.handle, input.handle, c_axes, len(axes), keepdims, c_name) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) - - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. - - :param input: the input Tensor. - :type input: Tensor - - :param exponent: exponent to raise each element in the input tensor. - :type exponent: float - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_pow(self.handle, input.handle, exponent, c_name) - self.add_layer(OpType.POW, name) - return Tensor(handle, owner_op_type=OpType.POW) - - def mean(self, input, dims, keepdims=False, name=None): - """Layer that computes the mean of the input tensor across the given - dimensions. - - :param input: the input Tensor. - :type input: Tensor - - :param dims: dimensions to take the mean over. - :type dims: list - - :param keepdims: keeps the dimensions in :attr:`dims` as size 1 if True and - collapses the dimension if False. Default is False. - :type keepdims: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - dims = list(dims) - c_dims = ffi.new("int[]", dims) - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_mean(self.handle, input.handle, c_dims, len(dims), keepdims, c_name) - self.add_layer(OpType.MEAN, name) - return Tensor(handle, owner_op_type=OpType.MEAN) - - def conv2d(self, input, out_channels, - kernel_h, kernel_w, - stride_h, stride_w, - padding_h, padding_w, - activation=ActiMode.AC_MODE_NONE, - groups=1, use_bias=True, shared_op=None, - kernel_initializer=None, bias_initializer=None, name=None): - """This layer creates a 2D convolution kernel that is convolved with the layer :attr:`input` - to produce a tensor of :attr:`output`. - - The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor - is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - - .. math:: - C_{out} = out\_channels - - .. math:: - K_{H} = kernel\_h - - .. math:: - K_{W} = kernel\_w - - .. math:: - S_{H} = stride\_h - - .. math:: - S_{W} = stride\_w - - .. math:: - P_{H} = padding\_h - - .. math:: - P_{S} = padding\_s - - .. math:: - H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 - - .. math:: - W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 - - :param input: the input Tensor. - :type input: Tensor - - :param out\_channels: the dimensionality of the output space (i.e. the number of output filters in the convolution). - :type out\_channels: int - - :param kernel_h: the height of the 2D convolution window: :math:`K_{H}`. - :type kernel_h: int - - :param kernel_w: the width of the 2D convolution window: :math:`K_{W}`. - :type kernel_w: int - - :param stride_h: the stride of the convolution along the height: :math:`S_{H}`. - :type stride_h: int - - :param stride_w: the stride of the convolution along the width: :math:`S_{W}`. - :type stride_w: int - - :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. - :type padding_h: int - - :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. - :type padding_w: int - - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode - - :param groups: the number of groups in this convolution - :type groups: int - - :param use_bias: whether the layer uses a bias vector. Default is True. - :type use_bias: bool - - :param shared_op: the layer whose parameters are shared with. Default is None. - :type shared_op: Op - - :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. - :type bias_initializer: Initializer - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - shared_op_handle = self.__get_op_handle(shared_op) - c_activation = enum_to_int(ActiMode, activation) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - bias_init_handle = self.__get_initializer_handle(bias_initializer) - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_conv2d(self.handle, input.handle, out_channels, kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w, c_activation, groups, use_bias, shared_op_handle, kernel_init_handle, bias_init_handle, c_name) - self.add_layer(OpType.CONV2D, name) - return Tensor(handle, owner_op_type=OpType.CONV2D) - - def embedding(self, input, num_embeddings, embedding_dim, - aggr, dtype=DataType.DT_FLOAT, shared_op=None, kernel_initializer=None, name=None): - """Layer that turns positive integers into dense vectors of fixed size - - :param input: the input Tensor. - :type input: Tensor - - :param num_embeddings: size of the vocabulary, i.e. maximum integer index + 1 - :type num_embeddings: int - - :param embedding_dim: dimension of the dense embedding. - :type embedding_dim: int - - :param aggr: aggregation mode. Options are AGGR_MODE_NONE, AGGR_MODE_SUM and AGGR_MODE_AVG. - :type aggr: AggrMode - - :param dtype: the tensor data type. Options are DT_BOOLEAN, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT4, DT_INT8, DT_NONE - :type dtype: DataType - - :param shared_op: the layer whose parameters are shared with. Default is None. - :type shared_op: Op - - :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - shared_op_handle = self.__get_op_handle(shared_op) - c_aggr = enum_to_int(AggrMode, aggr) - c_dtype = enum_to_int(DataType, dtype) - if kernel_initializer is None: - kernel_initializer = GlorotUniformInitializer(42) - assert (type(kernel_initializer) is GlorotUniformInitializer) or \ - (type(kernel_initializer) is ZeroInitializer) or \ - (type(kernel_initializer) is UniformInitializer) or \ - (type(kernel_initializer) is NormInitializer), \ - f"Unknown initializer type: {kernel_initializer}" - handle = ffc().flexflow_model_add_embedding( - self.handle, input.handle, num_embeddings, embedding_dim, c_aggr, c_dtype, - shared_op_handle, kernel_initializer.handle, c_name, - ) - # NOTE: We must keep a reference to the initializer or else it will be - # immediately destructed - self.initializers[name] = kernel_initializer - self.add_layer(OpType.EMBEDDING, name) - return Tensor(handle, owner_op_type=OpType.EMBEDDING) - - def pool2d(self, input, kernel_h, kernel_w, - stride_h, stride_w, - padding_h, padding_w, - pool_type=PoolType.POOL_MAX, - activation=ActiMode.AC_MODE_NONE, name=None): - """Pooling operation for 2D spatial data. - - The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor - is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - - .. math:: - C_{out} = out\_channels - - .. math:: - K_{H} = kernel\_h - - .. math:: - K_{W} = kernel\_w - - .. math:: - S_{H} = stride\_h - - .. math:: - S_{W} = stride\_w - - .. math:: - P_{H} = padding\_h - - .. math:: - P_{S} = padding\_s - - .. math:: - H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 - - .. math:: - W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 - - :param input: the input Tensor. 
- :type input: Tensor - - :param kernel_h: the height of the 2D pooling window: :math:`K_{H}`. - :type kernel_h: int - - :param kernel_w: the width of the 2D pooling window: :math:`K_{W}`. - :type kernel_w: int - - :param stride_h: the stride of the pooling along the height: :math:`S_{H}`. - :type stride_h: int - - :param stride_w: the stride of the pooling along the width: :math:`S_{W}`. - :type stride_w: int - - :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. - :type padding_h: int - - :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. - :type padding_w: int - - :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. - :type activation: PoolType - - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_pool_type = enum_to_int(PoolType, pool_type) - c_activation = enum_to_int(ActiMode, activation) - handle = ffc().flexflow_model_add_pool2d(self.handle, input.handle, kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w, c_pool_type, c_activation, c_name) - self.add_layer(OpType.POOL2D, name) - return Tensor(handle, owner_op_type=OpType.POOL2D) - - def batch_norm(self, input, relu=True, name=None): - """Layer that normalizes its inputs. - - Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. - - :param input: the list of input Tensors. - :type input: Tensor - - :param relu: whether a ReLU function is applied. Default is True. - :type relu: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
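A small sketch of pool2d and batch_norm on a 4-D activation (import path and shapes assumed):

from flexflow.core import FFConfig, FFModel, DataType, PoolType

ffconfig = FFConfig()
model = FFModel(ffconfig)
feat = model.create_tensor([ffconfig.batch_size, 16, 32, 32], DataType.DT_FLOAT)
pooled = model.pool2d(feat, 2, 2, 2, 2, 0, 0, pool_type=PoolType.POOL_MAX)  # 2x2 max pool, stride 2
normed = model.batch_norm(pooled, relu=True)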
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_batch_norm(self.handle, input.handle, relu, c_name) - self.add_layer(OpType.BATCH_NORM, name) - return Tensor(handle, owner_op_type=OpType.BATCH_NORM) - - def layer_norm(self, input, axes, elementwise_affine=True, eps=1e-5, use_bias = True, name=None): - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_layer_norm(self.handle, input.handle, len(axes), c_axes, elementwise_affine, eps, use_bias, c_name) - self.add_layer(OpType.LAYER_NORM, name) - return Tensor(handle, owner_op_type=OpType.LAYER_NORM) - - def add_bias_residual_layer_norm(self, input, residual, axes, elementwise_affine=True, eps=1e-5, use_bias = True, name=None): - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm(self.handle, input.handle, residual.handle, len(axes), c_axes, elementwise_affine, eps, use_bias, c_name) - self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) - return Tensor(handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) - - def sigmoid_silu_multi(self, input1, input2, name=None): - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid_silu_multi(self.handle, input1.handle, input2.handle, c_name) - self.add_layer(OpType.SIGMOID_SILU_MULTI, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) - - def batch_matmul(self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None): - """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - - :param A: the first input Tensor. - :type A: Tensor - - :param B: the second input Tensor. - :type B: Tensor - - :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension - :type a_seq_length_dim: int - - :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension - :type b_seq_length_dim: int - - :param name: the name of the layer. Default is None. - :type name: string - - :param name: Whether to add use bias in layer normalization - :type name: bool - - :returns: Tensor -- the output tensor. - """ - if a_seq_length_dim is None: - a_seq_length_dim = -1 - if b_seq_length_dim is None: - b_seq_length_dim = -1 - handle = ffc().flexflow_model_add_batch_matmul(self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim) - self.add_layer(OpType.BATCH_MATMUL, name) - return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - - def dense(self, input, out_dim, - activation=ActiMode.AC_MODE_NONE, - use_bias=True, - datatype=DataType.DT_NONE, - shared_op=None, - kernel_initializer=None, bias_initializer=None, - kernel_regularizer=None, name=None): - """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where - :attr:`activation` is the element-wise activation function passed as the activation argument, - :attr:`kernel` is a weights matrix created by the layer, and - :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). - - The size of input tensor is :math:`(N, C_{in})` and the size of output tensor - is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` - - :param input: the input Tensor. - :type input: Tensor - - :param out\_dim: dimensionality of the output space. 
- :type out\_dim: int - - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode - - :param use_bias: whether the layer uses a bias vector. Default is True. - :type use_bias: bool - - :param shared_op: the layer whose parameters are shared with. Default is None. - :type shared_op: Op - - :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. - :type bias_initializer: Initializer - - :param kernel_regularizer: Regularizer for the kernel weights matrix - :type bias_initializer: Regularizer - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - shared_op_handle = self.__get_op_handle(shared_op) - c_activation = enum_to_int(ActiMode, activation) - c_datatype = enum_to_int(DataType, datatype) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - bias_init_handle = self.__get_initializer_handle(bias_initializer) - if kernel_regularizer: - c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) - kernel_reg_lambda = kernel_regularizer._lambda - else: - c_kernel_reg_type = enum_to_int( - RegularizerMode, RegularizerMode.REG_MODE_NONE) - kernel_reg_lambda = 0.0 - handle = ffc().flexflow_model_add_dense( - self.handle, input.handle, out_dim, c_activation, use_bias, c_datatype, - shared_op_handle, kernel_init_handle, bias_init_handle, - c_kernel_reg_type, kernel_reg_lambda, c_name) - self.add_layer(OpType.LINEAR, name) - return Tensor(handle, owner_op_type=OpType.LINEAR) - - def concat(self, tensors, axis, name=None): - """Layer that concatenates a list of inputs. - - It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. - - :param input: the list of input Tensors. - :type input: List of Tensors - - :param axis: the dimension along which to concatenate. - :type axis: int - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - assert type(tensors) is list, "tensors should be a list" - tensor_handle_list = [] - n = len(tensors) - assert n <= 256, "Please increase MAX_NUM_INPUTS" - for tensor in tensors: - tensor_handle_list.append(tensor.handle) - c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_concat(self.handle, n, c_tensor_handle_list, axis, c_name) - self.add_layer(OpType.CONCAT, name) - return Tensor(handle, owner_op_type=OpType.CONCAT) - - def split(self, input, sizes, axis, name=None): - """Layer that splits a :attr:`input` tensor into a list of tensors. - - :param input: the input Tensor. - :type input: Tensor - - :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. - :type sizes: int or list of int - - :param axis: the dimension along which to split. - :type axis: int - - :param name: the name of the layer. Default is None. 
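dense and concat in a minimal sketch; the AC_MODE_RELU constant, import path, and shapes are assumptions, everything else follows the signatures above.

from flexflow.core import FFConfig, FFModel, DataType, ActiMode

ffconfig = FFConfig()
model = FFModel(ffconfig)
x = model.create_tensor([ffconfig.batch_size, 256], DataType.DT_FLOAT)
h1 = model.dense(x, 512, activation=ActiMode.AC_MODE_RELU)   # linear layer with fused ReLU
h2 = model.dense(x, 512)                                     # plain linear layer
both = model.concat([h1, h2], 1)                             # concatenate along the feature axis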
- :type name: string - - :returns: list of Tensors -- the output tensors. - """ - if type(sizes) is list: - split = sizes - else: - assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" - split = [input.dims[axis] // sizes for i in range(sizes)] - n = len(split) - assert n <= 256, "Please increase MAX_NUM_OUTPUTS" - c_split = ffi.new("int[]", split) - c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") - c_name = get_c_name(name) - ffc().flexflow_model_add_split(self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name) - output_tensor_list = [] - for i in range(n): - tensor_p_handle = ffi.new("flexflow_tensor_t*") - tensor_p_handle.impl = c_outputs_handle_list[i].impl - output_tensor_list.append(Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle)) - self.add_layer(OpType.SPLIT, name) - del c_outputs_handle_list - return output_tensor_list - - def flat(self, input, name=None): - """Flattens the input. Does not affect the batch size. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) - self.add_layer(OpType.FLAT, name) - return Tensor(handle, owner_op_type=OpType.FLAT) - - def softmax(self, input, axis=-1, name=None): - """Softmax activation function. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_softmax(self.handle, input.handle, axis, c_name) - self.add_layer(OpType.SOFTMAX, name) - return Tensor(handle, owner_op_type=OpType.SOFTMAX) - - def reshape(self, input, shape, name=None): - """Layer that reshapes inputs into the given shape. - - Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, - except with a new shape given by :attr:`shape`. - - :param input: the input Tensor. - :type input: Tensor - - :param shape: A list defining the shape of the output tensor. - :type shape: list of int - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_shape = ffi.new("int[]", shape) - handle = ffc().flexflow_model_add_reshape(self.handle, input.handle, len(shape), c_shape, c_name) - self.add_layer(OpType.RESHAPE, name) - return Tensor(handle, owner_op_type=OpType.RESHAPE) - - def gather(self, input, index, dim, name=None): - """Layer that gathers values along the dim axis. - - :param input: the input tensor - :type input: Tensor - - :param index: the index tensor, which specifies the indices of elements to gather - :type index: Tensor - - :param dim: the axis along which to index - :type dim: int - - :param name: the name of the layer. Default is None - :type name: string - - :returns: Tensor -- the output tensor - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gather(self.handle, input.handle, index.handle, dim, c_name) - self.add_layer(OpType.GATHER, name) - return Tensor(handle, owner_op_type=OpType.GATHER) - - def transpose(self, input, perm, name=None): - """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm - - :param input: the input Tensor. 
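split, softmax, and reshape can be combined like this (import path and shapes assumed):

from flexflow.core import FFConfig, FFModel, DataType

ffconfig = FFConfig()
model = FFModel(ffconfig)
x = model.create_tensor([ffconfig.batch_size, 30], DataType.DT_FLOAT)
a, b, c = model.split(x, 3, 1)                      # three equal chunks of width 10 along axis 1
probs = model.softmax(a, axis=-1)
b2 = model.reshape(b, [ffconfig.batch_size, 2, 5])  # same elements, new shape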
- :type input: Tensor - - :param perm: A permutation of the dimensions of a. - :type perm: List of int - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_perm = ffi.new("int[]", perm) - handle = ffc().flexflow_model_add_transpose(self.handle, input.handle, len(perm), c_perm, c_name) - self.add_layer(OpType.TRANSPOSE, name) - return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - - def reverse(self, input, axis, name=None): - """Layer that reverses specific dimensions of a tensor. - - Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. - - :param input: the input Tensor. - :type input: Tensor - - :param axis: the dimension to reverse. - :type axis: int - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_reverse(self.handle, input.handle, axis, c_name) - self.add_layer(OpType.REVERSE, name) - return Tensor(handle, owner_op_type=OpType.REVERSE) - - def scalar_multiply(self, input, scalar, inplace=True, name=None): - """Scalar multiplication of a tensor by an scalar. - - :param input: the input Tensor. - :type input: Tensor - - :param input: the scalar - :type scalar: float - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_multiply(self.handle, input.handle, scalar, inplace, c_name) - self.add_layer(OpType.SCALAR_MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) - - def scalar_add(self, input, scalar, inplace=True, name=None): - """Scalar addition of a scalar to each entry of a tensor. - - :param input: the input Tensor. - :type input: Tensor - - :param input: the scalar - :type scalar: float - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_add(self.handle, input.handle, scalar, inplace, c_name) - self.add_layer(OpType.SCALAR_ADD, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) - - def scalar_sub(self, input, scalar, inplace=True, name=None): - """Scalar subtraction of a scalar to each entry of a tensor. - - :param input: the input Tensor. - :type input: Tensor - - :param input: the scalar - :type scalar: float - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_sub(self.handle, input.handle, scalar, inplace, c_name) - self.add_layer(OpType.SCALAR_SUB, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) - - def scalar_true_divide(self, input, scalar, inplace=True, name=None): - """Scalar regular division of a tensor by an scalar. - - :param input: the input Tensor. - :type input: Tensor - - :param input: the scalar - :type scalar: float - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_truediv(self.handle, input.handle, scalar, inplace, c_name) - self.add_layer(OpType.SCALAR_TRUEDIV, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) - - def gelu(self, input, inplace=True, name=None): - """Gaussian Error Linear Unit activation function. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) - self.add_layer(OpType.GELU, name) - return Tensor(handle, owner_op_type=OpType.GELU) - - def relu(self, input, inplace=True, name=None): - """Rectified Linear Unit activation function. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_relu(self.handle, input.handle, inplace, c_name) - self.add_layer(OpType.RELU, name) - return Tensor(handle, owner_op_type=OpType.RELU) - - def identity(self, input, name=None): - """Identity function. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) - self.add_layer(OpType.IDENTITY, name) - return Tensor(handle, owner_op_type=OpType.IDENTITY) - - def sigmoid(self, input, name=None): - """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) - self.add_layer(OpType.SIGMOID, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID) - - def tanh(self, input, name=None): - """Hyperbolic tangent activation function. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) - self.add_layer(OpType.TANH, name) - return Tensor(handle, owner_op_type=OpType.TANH) - - def elu(self, input, inplace=True, name=None): - """Exponential Linear Unit. activation function. - - :param input: the input Tensor. - :type input: Tensor - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_elu(self.handle, input.handle, inplace, c_name) - self.add_layer(OpType.ELU, name) - return Tensor(handle, owner_op_type=OpType.ELU) - - def dropout(self, input, rate, seed, name=None): - """The Dropout layer randomly sets input units to 0 with - a frequency of :attr:`rate` at each step during training time, - which helps prevent overfitting. - Inputs not set to 0 are scaled up by 1/(1 - rate) such that the - sum over all inputs is unchanged. - - :param input: the input Tensor. - :type input: Tensor - - :param rate: Fraction of the input units to drop. 
- :type rate: float(0-1) - - :param seed: random seed. - :type seed: int - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_dropout(self.handle, input.handle, rate, seed, c_name) - self.add_layer(OpType.DROPOUT, name) - return Tensor(handle, owner_op_type=OpType.DROPOUT) - - def multihead_attention(self, query, key, value, - embed_dim, num_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - kernel_initializer=None, name=None): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, - and returns the dot-product attention between them:. - - :param query: the query Tensor. - :type query: Tensor - - :param key: the key Tensor. - :type key: Tensor - - :param value: the value Tensor. - :type value: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc().flexflow_model_add_multihead_attention(self.handle, query.handle, key.handle, value.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, kernel_init_handle, c_name) - self.add_layer(OpType.MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) - - def inc_multihead_self_attention(self, input, - embed_dim, num_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, position_bias=False, name=None): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. 
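A hedged sketch of the multihead_attention call; the [batch, sequence, embed] tensor layout shown here is an assumption for illustration only.

from flexflow.core import FFConfig, FFModel, DataType

ffconfig = FFConfig()
model = FFModel(ffconfig)
seq = model.create_tensor([ffconfig.batch_size, 64, 512], DataType.DT_FLOAT)
attn = model.multihead_attention(seq, seq, seq,        # self-attention: query = key = value
                                 embed_dim=512, num_heads=8, dropout=0.1)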
Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - - def spec_inc_multihead_self_attention(self, input, - embed_dim, num_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, position_bias=False, name=None): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (beam search) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. 
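Incremental-decoding self-attention takes a single input tensor and reuses it for query, key, and value; a sketch under the same assumed layout as above:

from flexflow.core import FFConfig, FFModel, DataType

ffconfig = FFConfig()
model = FFModel(ffconfig)
seq = model.create_tensor([ffconfig.batch_size, 64, 512], DataType.DT_FLOAT)
dec = model.inc_multihead_self_attention(seq, embed_dim=512, num_heads=8,
                                         apply_rotary_embedding=True)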
Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) - - def inc_multihead_self_attention_verify(self, input, - embed_dim, num_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, position_bias=False, name=None): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. 
Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify(self.handle, input.handle, embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) - - def inc_multiquery_self_attention(self, input, - embed_dim, num_q_heads, num_kv_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, position_bias=False, name=None): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int - - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. 
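The multi-query variants differ only in taking separate query and key/value head counts; a sketch (hidden size, sequence length, and head counts are illustrative):

from flexflow.core import FFConfig, FFModel, DataType

ffconfig = FFConfig()
model = FFModel(ffconfig)
seq = model.create_tensor([ffconfig.batch_size, 64, 4096], DataType.DT_FLOAT)
mqa = model.inc_multiquery_self_attention(seq, embed_dim=4096,
                                          num_q_heads=32, num_kv_heads=8)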
Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - - def spec_inc_multiquery_self_attention(self, input, - embed_dim, num_q_heads, num_kv_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, position_bias=False, name=None): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (beam search) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int - - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) - - def inc_multiquery_self_attention_verify(self, input, - embed_dim, num_q_heads, num_kv_heads, - kdim=0, vdim=0, dropout=0.0, - bias=True, add_bias_kv=False, add_zero_attn=False, - data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, scaling_query=False, scaling_factor=1.0, - qk_prod_scaling=True, position_bias=False, name=None): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int - - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify(self.handle, input.handle, embed_dim, num_q_heads, num_kv_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, apply_rotary_embedding, scaling_query, scaling_factor, qk_prod_scaling, position_bias, c_name) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) - - def rms_norm(self, input, eps, dim, name=None): - """Defines the RMS Norm layer. - - :param input: the input Tensor. - :type input: Tensor - - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rms_norm(self.handle, input.handle, eps, dim, c_name) - self.add_layer(OpType.RMS_NORM, name) - return Tensor(handle, owner_op_type=OpType.RMS_NORM) - - def residual_rms_norm(self, input1, input2, eps, dim, name=None): - """Defines the Residual RMS Norm layer. - - :param input: the input 1 Tensor. - :type input: Tensor - - :param input: the input 2 Tensor. - :type input: Tensor - - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handles_array = ffc().flexflow_model_add_residual_rms_norm(self.handle, input1.handle, input2.handle, eps, dim, c_name) - self.add_layer(OpType.RESIDUAL_RMS_NORM, name) - return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM) - - def arg_top_k(self, input, k, sorted, name=None): - """Defines the Arg TopK layer. - - :param input: the input Tensor. - :type input: Tensor - - :param k: the top k indices to select - :type k: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_arg_top_k(self.handle, input.handle, k, sorted, c_name) - self.add_layer(OpType.ARG_TOPK, name) - return Tensor(handle, owner_op_type=OpType.ARG_TOPK) - - def beam_top_k(self, input, max_beam_size, sorted, name=None): - """Defines the Beam TopK layer. - - :param input: the input Tensor. - :type input: Tensor - - :param max_beam_size: the top max_beam_size indices to select - :type max_beam_size: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_beam_top_k(self.handle, input.handle, max_beam_size, sorted, c_name) - self.add_layer(OpType.BEAM_TOPK, name) - return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) - - def sampling(self, input, top_p, name=None): - """Defines the Sampling layer. 
- - :param input: the input Tensor. - :type input: Tensor - - :param top_p: The top_p parameter of the sampling - :type top_p: float - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sampling(self.handle, input.handle, top_p, c_name) - self.add_layer(OpType.SAMPLING, name) - return Tensor(handle, owner_op_type=OpType.SAMPLING) - - def argmax(self, input, beam_search, name=None): - """Defines the Sampling layer. - - :param input: the input Tensor. - :type input: Tensor - - :param beam_search: Whether you need to perform beam search - :type beam_search: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_argmax(self.handle, input.handle, beam_search, c_name) - self.add_layer(OpType.ARGMAX, name) - return Tensor(handle, owner_op_type=OpType.ARGMAX) - - def reset_metrics(self): - """Reset performance metrics. - - :returns: None -- no returns. - """ - ffc().flexflow_model_reset_metrics(self.handle) - - def init_layers(self): - """Initialize layers. - - :returns: None -- no returns. - """ - ffc().flexflow_model_init_layers(self.handle) - - def prefetch(self): - ffc().flexflow_model_prefetch(self.handle) - - def forward(self, seq_length=None): - """Forward propagation of all layers. - - :returns: None -- no returns. - """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_forward(self.handle, seq_length) - - #TODO: seperate compute_metrics from backward - def backward(self, seq_length=None): - """Backward propagation of all layers. - - :returns: None -- no returns. - """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_backward(self.handle, seq_length) - - def compute_metrics(self): - """Compute performance metrics. - - :returns: None -- no returns. - """ - ffc().flexflow_model_compute_metrics(self.handle) - - def update(self): - """Update weights and biases of all layers. - - :returns: None -- no returns. - """ - ffc().flexflow_model_update(self.handle) - - def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): - """Configure the model for trainting. FlexFlow uses lazy initialization, - so the actual creating of all operations (including creating and partitioning - of weight, bias and output tensors) happen during compile. - - :param optimizer: optimizer instance. - :type optimizer: Optimizer - - :param loss_type: Enum of LossType. - Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, - LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. - :type loss_type: LossType - - :param metrics: List of metrics to be evaluated by the model during training and testing. - Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, - METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, - METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR - :type metrics: MetricsType - - :param comp_mode: Enum of CompMode. - Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE - :type comp_mode: CompMode - - :returns: None -- no returns. 
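A compile call might look like the following; SGDOptimizer is an assumed optimizer class name not shown in this patch, while the loss and metric constants come from the options listed above.

from flexflow.core import FFConfig, FFModel, DataType, LossType, MetricsType, SGDOptimizer

ffconfig = FFConfig()
model = FFModel(ffconfig)
x = model.create_tensor([ffconfig.batch_size, 784], DataType.DT_FLOAT)
logits = model.dense(x, 10)
opt = SGDOptimizer(model, 0.01)              # hypothetical optimizer; name and signature assumed
model.compile(optimizer=opt,
              loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
              metrics=[MetricsType.METRICS_ACCURACY])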
- """ - self.optimizer = optimizer - - c_loss_type = enum_to_int(LossType, loss_type) - metrics_int = [] - for metric in metrics: - metrics_int.append(enum_to_int(MetricsType, metric)) - c_metrics = ffi.new("int[]", metrics_int) - if comp_mode == None: - comp_mode = CompMode.TRAINING - c_comp_mode = enum_to_int(CompMode, comp_mode) - ffc().flexflow_model_compile(self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode) - for (ff_tensor, np_tensor) in self.attr_tensors.items(): - ff_tensor.set_tensor(self, np_tensor) - print("Compiled ffmodel!") - - def fit(self, x=None, y=None, batch_size=None, epochs=1): - """Trains the model for a fixed number of epochs (iterations on a dataset). - - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader - - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader - - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int - - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int - - :returns: None -- no returns. - """ - if (isinstance(x, list) == False): - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - self._tracing_id += 1 # get a new tracing id - for epoch in range(0,epochs): - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - for iter in range(0, int(iterations)): - self._ffconfig.begin_trace(self._tracing_id) + """ """ + + __slots__ = [ + "handle", + "_handle", + "_layers", + "_nb_layers", + "_ffconfig", + "_tracing_id", + "initializers", + "attr_tensors", + ] + + def __init__(self, ffconfig): + """Constructor of FFModel. + + :param ffconfig: configurations of FlexFlow and the created model. + :type ffconfig: FFConfig + + :returns: FFModel -- the model. + """ + self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) + self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) + self._layers = dict() + self._nb_layers = 0 + self._ffconfig = ffconfig + global ff_tracing_id + self._tracing_id = ff_tracing_id + ff_tracing_id += 1 + self.initializers = {} + self.attr_tensors = {} + + def get_layers(self): + return self._layers + + def add_layer(self, op_type, name): + layer_id = self._nb_layers + op_handle = ffc().flexflow_model_get_last_layer(self.handle) + self._layers[self._nb_layers] = convert_op_handle_to_op( + op_type, op_handle, idx=layer_id, name=name + ) + self._nb_layers += 1 + + def create_tensor(self, dims, data_type, create_grad=True): + """Instantiate a FlexFlow tensor. + + :param x: a shape tuple/list (integers), including the batch size. + :type x: list of int + + :param data_type: the datatype of the created tensor. Options are + DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. + :type data_type: DataType + + :param create_grad: weather the tensor creates a gradients vector. + If you don't specify anything, a gradients vector is used. + :type create_grad: bool + + :returns: Tensor -- the output tensor. 
+ """ + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_tensor_create( + self.handle, num_dims, c_dims, c_data_type, create_grad + ) + return Tensor(handle) + + def map_tensor(self, tensor, parallel_op=None): + op_handle = self.__get_op_handle(parallel_op) + ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) + + def create_constant(self, dims, value, data_type): + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_constant_create( + self.handle, num_dims, c_dims, value, c_data_type + ) + return Tensor(handle) + + def exp(self, x, name=None): + """Exponential activation function. + + :param x: the input Tensor. + :type x: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) + self.add_layer(OpType.EXP, name) + return Tensor(handle, owner_op_type=OpType.EXP) + + def sin(self, x, name=None): + """Elementwise sine function. + + :param x: the input Tensor. + :type x: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) + self.add_layer(OpType.SIN, name) + return Tensor(handle, owner_op_type=OpType.SIN) + + def cos(self, x, name=None): + """Elementwise cosine function. + + :param x: the input Tensor. + :type x: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) + self.add_layer(OpType.COS, name) + return Tensor(handle, owner_op_type=OpType.COS) + + def add(self, x, y, inplace_a=False, name=None): + """Layer that adds two input Tensors, :attr:`output = x + y`. + + :param x: the first input Tensor. + :type x: Tensor + + :param y: the second input Tensor. + :type y: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_add( + self.handle, x.handle, y.handle, inplace_a, c_name + ) + self.add_layer(OpType.ADD, name) + return Tensor(handle, owner_op_type=OpType.ADD) + + def subtract(self, x, y, inplace_a=False, name=None): + """Layer that subtracts two input Tensors, :attr:`output = x * y`. + + :param x: the first input Tensor. + :type x: Tensor + + :param y: the second input Tensor. + :type y: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_subtract( + self.handle, x.handle, y.handle, inplace_a, c_name + ) + self.add_layer(OpType.SUBTRACT, name) + return Tensor(handle, owner_op_type=OpType.SUBTRACT) + + def multiply(self, x, y, inplace_a=False, name=None): + """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + + :param x: the first input Tensor. + :type x: Tensor + + :param y: the second input Tensor. + :type y: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
+ """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_multiply( + self.handle, x.handle, y.handle, inplace_a, c_name + ) + self.add_layer(OpType.MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.MULTIPLY) + + def divide(self, x, y, inplace_a=False, name=None): + """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. + + :param x: the first input Tensor. + :type x: Tensor + + :param y: the second input Tensor. + :type y: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_divide( + self.handle, x.handle, y.handle, inplace_a, c_name + ) + self.add_layer(OpType.DIVIDE, name) + return Tensor(handle, owner_op_type=OpType.DIVIDE) + + def max(self, x, y, inplace_a=False, name=None): + """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. + + :param x: the first input Tensor. + :type x: Tensor + + :param y: the second input Tensor. + :type y: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_max( + self.handle, x.handle, y.handle, inplace_a, c_name + ) + self.add_layer(OpType.MAX, name) + return Tensor(handle, owner_op_type=OpType.MAX) + + def min(self, x, y, inplace_a=False, name=None): + """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. + + :param x: the first input Tensor. + :type x: Tensor + + :param y: the second input Tensor. + :type y: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_min( + self.handle, x.handle, y.handle, inplace_a, c_name + ) + self.add_layer(OpType.MIN, name) + return Tensor(handle, owner_op_type=OpType.MIN) + + def reduce_sum(self, input, axes, keepdims=False, name=None): + """Layer that computes the sum of the input Tensor along given axes. + + :param input: the input Tensor. + :type input: Tensor + + :param axes: the axes along which reduction is applied + :type axes: List[int] + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_reduce_sum( + self.handle, input.handle, c_axes, len(axes), keepdims, c_name + ) + self.add_layer(OpType.REDUCE_SUM, name) + return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) + + def rsqrt(self, input, name=None): + """Layer that computes the element-wise reciprocal square-root. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) + self.add_layer(OpType.RSQRT, name) + return Tensor(handle, owner_op_type=OpType.RSQRT) + + def pow(self, input, exponent, name=None): + """Layer that computes the element-wise power. + + :param input: the input Tensor. + :type input: Tensor + + :param exponent: exponent to raise each element in the input tensor. + :type exponent: float + + :param name: the name of the layer. Default is None. 
+ :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_pow( + self.handle, input.handle, exponent, c_name + ) + self.add_layer(OpType.POW, name) + return Tensor(handle, owner_op_type=OpType.POW) + + def mean(self, input, dims, keepdims=False, name=None): + """Layer that computes the mean of the input tensor across the given + dimensions. + + :param input: the input Tensor. + :type input: Tensor + + :param dims: dimensions to take the mean over. + :type dims: list + + :param keepdims: keeps the dimensions in :attr:`dims` as size 1 if True and + collapses the dimension if False. Default is False. + :type keepdims: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + dims = list(dims) + c_dims = ffi.new("int[]", dims) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_mean( + self.handle, input.handle, c_dims, len(dims), keepdims, c_name + ) + self.add_layer(OpType.MEAN, name) + return Tensor(handle, owner_op_type=OpType.MEAN) + + def conv2d( + self, + input, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + activation=ActiMode.AC_MODE_NONE, + groups=1, + use_bias=True, + shared_op=None, + kernel_initializer=None, + bias_initializer=None, + name=None, + ): + """This layer creates a 2D convolution kernel that is convolved with the layer :attr:`input` + to produce a tensor of :attr:`output`. + + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: + + .. math:: + C_{out} = out\_channels + + .. math:: + K_{H} = kernel\_h + + .. math:: + K_{W} = kernel\_w + + .. math:: + S_{H} = stride\_h + + .. math:: + S_{W} = stride\_w + + .. math:: + P_{H} = padding\_h + + .. math:: + P_{S} = padding\_s + + .. math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 + + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 + + :param input: the input Tensor. + :type input: Tensor + + :param out\_channels: the dimensionality of the output space (i.e. the number of output filters in the convolution). + :type out\_channels: int + + :param kernel_h: the height of the 2D convolution window: :math:`K_{H}`. + :type kernel_h: int + + :param kernel_w: the width of the 2D convolution window: :math:`K_{W}`. + :type kernel_w: int + + :param stride_h: the stride of the convolution along the height: :math:`S_{H}`. + :type stride_h: int + + :param stride_w: the stride of the convolution along the width: :math:`S_{W}`. + :type stride_w: int + + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int + + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int + + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode + + :param groups: the number of groups in this convolution + :type groups: int + + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool + + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op + + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param bias_initializer: Initializer for the bias vector. 
If it is set to None, the ZeroInitializer is applied. + :type bias_initializer: Initializer + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + bias_init_handle = self.__get_initializer_handle(bias_initializer) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_conv2d( + self.handle, + input.handle, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_activation, + groups, + use_bias, + shared_op_handle, + kernel_init_handle, + bias_init_handle, + c_name, + ) + self.add_layer(OpType.CONV2D, name) + return Tensor(handle, owner_op_type=OpType.CONV2D) + + def embedding( + self, + input, + num_embeddings, + embedding_dim, + aggr, + dtype=DataType.DT_FLOAT, + shared_op=None, + kernel_initializer=None, + name=None, + ): + """Layer that turns positive integers into dense vectors of fixed size + + :param input: the input Tensor. + :type input: Tensor + + :param num_embeddings: size of the vocabulary, i.e. maximum integer index + 1 + :type num_embeddings: int + + :param embedding_dim: dimension of the dense embedding. + :type embedding_dim: int + + :param aggr: aggregation mode. Options are AGGR_MODE_NONE, AGGR_MODE_SUM and AGGR_MODE_AVG. + :type aggr: AggrMode + + :param dtype: the tensor data type. Options are DT_BOOLEAN, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT4, DT_INT8, DT_NONE + :type dtype: DataType + + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op + + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + shared_op_handle = self.__get_op_handle(shared_op) + c_aggr = enum_to_int(AggrMode, aggr) + c_dtype = enum_to_int(DataType, dtype) + if kernel_initializer is None: + kernel_initializer = GlorotUniformInitializer(42) + assert ( + (type(kernel_initializer) is GlorotUniformInitializer) + or (type(kernel_initializer) is ZeroInitializer) + or (type(kernel_initializer) is UniformInitializer) + or (type(kernel_initializer) is NormInitializer) + ), f"Unknown initializer type: {kernel_initializer}" + handle = ffc().flexflow_model_add_embedding( + self.handle, + input.handle, + num_embeddings, + embedding_dim, + c_aggr, + c_dtype, + shared_op_handle, + kernel_initializer.handle, + c_name, + ) + # NOTE: We must keep a reference to the initializer or else it will be + # immediately destructed + self.initializers[name] = kernel_initializer + self.add_layer(OpType.EMBEDDING, name) + return Tensor(handle, owner_op_type=OpType.EMBEDDING) + + def pool2d( + self, + input, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + pool_type=PoolType.POOL_MAX, + activation=ActiMode.AC_MODE_NONE, + name=None, + ): + """Pooling operation for 2D spatial data. + + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: + + .. math:: + C_{out} = out\_channels + + .. math:: + K_{H} = kernel\_h + + .. math:: + K_{W} = kernel\_w + + .. 
math:: + S_{H} = stride\_h + + .. math:: + S_{W} = stride\_w + + .. math:: + P_{H} = padding\_h + + .. math:: + P_{S} = padding\_s + + .. math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 + + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 + + :param input: the input Tensor. + :type input: Tensor + + :param kernel_h: the height of the 2D pooling window: :math:`K_{H}`. + :type kernel_h: int + + :param kernel_w: the width of the 2D pooling window: :math:`K_{W}`. + :type kernel_w: int + + :param stride_h: the stride of the pooling along the height: :math:`S_{H}`. + :type stride_h: int + + :param stride_w: the stride of the pooling along the width: :math:`S_{W}`. + :type stride_w: int + + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int + + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int + + :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. + :type activation: PoolType + + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_pool_type = enum_to_int(PoolType, pool_type) + c_activation = enum_to_int(ActiMode, activation) + handle = ffc().flexflow_model_add_pool2d( + self.handle, + input.handle, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_pool_type, + c_activation, + c_name, + ) + self.add_layer(OpType.POOL2D, name) + return Tensor(handle, owner_op_type=OpType.POOL2D) + + def batch_norm(self, input, relu=True, name=None): + """Layer that normalizes its inputs. + + Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. + + :param input: the list of input Tensors. + :type input: Tensor + + :param relu: whether a ReLU function is applied. Default is True. + :type relu: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
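# Usage sketch of a small convolutional block built from the conv2d / pool2d /
# batch_norm operators documented above; assumes `ffmodel` and `ffconfig` from the
# earlier sketch and an arbitrary NCHW input shape.
img = ffmodel.create_tensor([ffconfig.batch_size, 3, 32, 32], DataType.DT_FLOAT)
t = ffmodel.conv2d(img, 64, 3, 3, 1, 1, 1, 1)   # 64 output channels, 3x3 kernel, stride 1, padding 1
t = ffmodel.pool2d(t, 2, 2, 2, 2, 0, 0)         # 2x2 pooling, PoolType.POOL_MAX by default
t = ffmodel.batch_norm(t, relu=True)            # batch normalization followed by ReLU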
+ """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_batch_norm( + self.handle, input.handle, relu, c_name + ) + self.add_layer(OpType.BATCH_NORM, name) + return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + + def layer_norm( + self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None + ): + """Add a LayerNorm layer + + :param input: The input tensor + :type input: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: The LayerNorm output tensor + :rtype: Tensor + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_layer_norm( + self.handle, + input.handle, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + c_name, + ) + self.add_layer(OpType.LAYER_NORM, name) + return Tensor(handle, owner_op_type=OpType.LAYER_NORM) + + def residual_layer_norm( + self, + input, + residual1, + residual2, + use_two_residuals, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + name=None, + ): + """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in + better efficiency compared to using separate element-wise add and LayerNorm operators. + + :param input: The input tensor + :type input: Tensor + :param residual1: The residual tensor to add to the input before computing the LayerNorm + :type residual1: Tensor + :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm + :type residual2: Tensor + :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise + :type use_two_residuals: bool + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: List[int] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: str, optional + :return: A tensor with the sum of the input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + residual2_handle = ( + residual1.handle + ) # This is intentional. 
Data will be ignored, and we cannot pass None + if use_two_residuals: + assert residual2 is not None + residual2_handle = residual2.handle + handles_array = ffc().flexflow_model_add_residual_layer_norm( + self.handle, + input.handle, + residual1.handle, + residual2_handle, + use_two_residuals, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + c_name, + ) + self.add_layer(OpType.RESIDUAL_LAYERNORM, name) + return Tensor( + handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM + ), Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM) + + def add_bias_residual_layer_norm( + self, + input, + residual, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + name=None, + ): + """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, + resulting in better efficiency compared to using separate attention bias addition + + element-wise residual addition + LayerNorm operators. + + :param input: The input tensor + :type input: Tensor + :param residual: The residual tensor + :type residual: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( + self.handle, + input.handle, + residual.handle, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + c_name, + ) + self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) + return Tensor( + handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM + ), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) + + def sigmoid_silu_multi(self, input1, input2, name=None): + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid_silu_multi( + self.handle, input1.handle, input2.handle, c_name + ) + self.add_layer(OpType.SIGMOID_SILU_MULTI, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) + + def batch_matmul( + self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None + ): + """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. + + :param A: the first input Tensor. + :type A: Tensor + + :param B: the second input Tensor. + :type B: Tensor + + :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension + :type a_seq_length_dim: int + + :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension + :type b_seq_length_dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :param name: Whether to add use bias in layer normalization + :type name: bool + + :returns: Tensor -- the output tensor. 
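# Usage sketch of the fused residual + LayerNorm operators documented above; `hidden`
# and `attn_out` are assumed tensors whose last dimension is the hidden size, and
# axes=[0] follows the docstring convention of counting axes from the end.
ln_out = ffmodel.layer_norm(hidden, [0], elementwise_affine=True, eps=1e-5)
residual_sum, normed = ffmodel.residual_layer_norm(
    attn_out, hidden, None, False, [0], elementwise_affine=True, eps=1e-5
)
gated = ffmodel.sigmoid_silu_multi(normed, ln_out)   # fused SiLU-style gating of two tensors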
+ """ + if a_seq_length_dim is None: + a_seq_length_dim = -1 + if b_seq_length_dim is None: + b_seq_length_dim = -1 + handle = ffc().flexflow_model_add_batch_matmul( + self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim + ) + self.add_layer(OpType.BATCH_MATMUL, name) + return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) + + def dense( + self, + input, + out_dim, + activation=ActiMode.AC_MODE_NONE, + use_bias=True, + datatype=DataType.DT_NONE, + shared_op=None, + kernel_initializer=None, + bias_initializer=None, + kernel_regularizer=None, + name=None, + ): + """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where + :attr:`activation` is the element-wise activation function passed as the activation argument, + :attr:`kernel` is a weights matrix created by the layer, and + :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + + The size of input tensor is :math:`(N, C_{in})` and the size of output tensor + is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` + + :param input: the input Tensor. + :type input: Tensor + + :param out\_dim: dimensionality of the output space. + :type out\_dim: int + + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode + + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool + + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op + + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. + :type bias_initializer: Initializer + + :param kernel_regularizer: Regularizer for the kernel weights matrix + :type bias_initializer: Regularizer + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + c_datatype = enum_to_int(DataType, datatype) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + bias_init_handle = self.__get_initializer_handle(bias_initializer) + if kernel_regularizer: + c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) + kernel_reg_lambda = kernel_regularizer._lambda + else: + c_kernel_reg_type = enum_to_int( + RegularizerMode, RegularizerMode.REG_MODE_NONE + ) + kernel_reg_lambda = 0.0 + handle = ffc().flexflow_model_add_dense( + self.handle, + input.handle, + out_dim, + c_activation, + use_bias, + c_datatype, + shared_op_handle, + kernel_init_handle, + bias_init_handle, + c_kernel_reg_type, + kernel_reg_lambda, + c_name, + ) + self.add_layer(OpType.LINEAR, name) + return Tensor(handle, owner_op_type=OpType.LINEAR) + + def concat(self, tensors, axis, name=None): + """Layer that concatenates a list of inputs. + + It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. + + :param input: the list of input Tensors. + :type input: List of Tensors + + :param axis: the dimension along which to concatenate. + :type axis: int + + :param name: the name of the layer. Default is None. 
+ :type name: string + + :returns: Tensor -- the output tensor. + """ + assert type(tensors) is list, "tensors should be a list" + tensor_handle_list = [] + n = len(tensors) + assert n <= 256, "Please increase MAX_NUM_INPUTS" + for tensor in tensors: + tensor_handle_list.append(tensor.handle) + c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_concat( + self.handle, n, c_tensor_handle_list, axis, c_name + ) + self.add_layer(OpType.CONCAT, name) + return Tensor(handle, owner_op_type=OpType.CONCAT) + + def split(self, input, sizes, axis, name=None): + """Layer that splits a :attr:`input` tensor into a list of tensors. + + :param input: the input Tensor. + :type input: Tensor + + :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. + :type sizes: int or list of int + + :param axis: the dimension along which to split. + :type axis: int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: list of Tensors -- the output tensors. + """ + if type(sizes) is list: + split = sizes + else: + assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" + split = [input.dims[axis] // sizes for i in range(sizes)] + n = len(split) + assert n <= 256, "Please increase MAX_NUM_OUTPUTS" + c_split = ffi.new("int[]", split) + c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") + c_name = get_c_name(name) + ffc().flexflow_model_add_split( + self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name + ) + output_tensor_list = [] + for i in range(n): + tensor_p_handle = ffi.new("flexflow_tensor_t*") + tensor_p_handle.impl = c_outputs_handle_list[i].impl + output_tensor_list.append( + Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) + ) + self.add_layer(OpType.SPLIT, name) + del c_outputs_handle_list + return output_tensor_list + + def flat(self, input, name=None): + """Flattens the input. Does not affect the batch size. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) + self.add_layer(OpType.FLAT, name) + return Tensor(handle, owner_op_type=OpType.FLAT) + + def softmax(self, input, axis=-1, name=None): + """Softmax activation function. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_softmax( + self.handle, input.handle, axis, c_name + ) + self.add_layer(OpType.SOFTMAX, name) + return Tensor(handle, owner_op_type=OpType.SOFTMAX) + + def reshape(self, input, shape, name=None): + """Layer that reshapes inputs into the given shape. + + Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, + except with a new shape given by :attr:`shape`. + + :param input: the input Tensor. + :type input: Tensor + + :param shape: A list defining the shape of the output tensor. 
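# Usage sketch of concat / split / softmax as documented above; assumes `ffmodel` and
# the [batch, 256] tensors `x` and `y` from the first sketch.
joined = ffmodel.concat([x, y], 1)           # -> [batch, 512]
halves = ffmodel.split(joined, 2, 1)         # two [batch, 256] tensors along axis 1
probs = ffmodel.softmax(halves[0], axis=-1)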
+ :type shape: list of int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_shape = ffi.new("int[]", shape) + handle = ffc().flexflow_model_add_reshape( + self.handle, input.handle, len(shape), c_shape, c_name + ) + self.add_layer(OpType.RESHAPE, name) + return Tensor(handle, owner_op_type=OpType.RESHAPE) + + def gather(self, input, index, dim, name=None): + """Layer that gathers values along the dim axis. + + :param input: the input tensor + :type input: Tensor + + :param index: the index tensor, which specifies the indices of elements to gather + :type index: Tensor + + :param dim: the axis along which to index + :type dim: int + + :param name: the name of the layer. Default is None + :type name: string + + :returns: Tensor -- the output tensor + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_gather( + self.handle, input.handle, index.handle, dim, c_name + ) + self.add_layer(OpType.GATHER, name) + return Tensor(handle, owner_op_type=OpType.GATHER) + + def transpose(self, input, perm, name=None): + """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm + + :param input: the input Tensor. + :type input: Tensor + + :param perm: A permutation of the dimensions of a. + :type perm: List of int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_perm = ffi.new("int[]", perm) + handle = ffc().flexflow_model_add_transpose( + self.handle, input.handle, len(perm), c_perm, c_name + ) + self.add_layer(OpType.TRANSPOSE, name) + return Tensor(handle, owner_op_type=OpType.TRANSPOSE) + + def reverse(self, input, axis, name=None): + """Layer that reverses specific dimensions of a tensor. + + Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + + :param input: the input Tensor. + :type input: Tensor + + :param axis: the dimension to reverse. + :type axis: int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_reverse( + self.handle, input.handle, axis, c_name + ) + self.add_layer(OpType.REVERSE, name) + return Tensor(handle, owner_op_type=OpType.REVERSE) + + def scalar_multiply(self, input, scalar, inplace=True, name=None): + """Scalar multiplication of a tensor by an scalar. + + :param input: the input Tensor. + :type input: Tensor + + :param input: the scalar + :type scalar: float + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_scalar_multiply( + self.handle, input.handle, scalar, inplace, c_name + ) + self.add_layer(OpType.SCALAR_MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) + + def scalar_add(self, input, scalar, inplace=True, name=None): + """Scalar addition of a scalar to each entry of a tensor. + + :param input: the input Tensor. + :type input: Tensor + + :param input: the scalar + :type scalar: float + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
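# Usage sketch of reshape / transpose / reverse and scalar multiplication as documented
# above; assumes `ffmodel`, `ffconfig`, and the [batch, 256] tensor `x` from the first
# sketch.
r = ffmodel.reshape(x, [ffconfig.batch_size, 16, 16])
t2 = ffmodel.transpose(r, [0, 2, 1])                     # swap the two trailing axes
rev = ffmodel.reverse(x, 1)                              # reverse entries along axis 1
scaled = ffmodel.scalar_multiply(x, 0.5, inplace=False)  # multiply every entry by 0.5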
+ """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_scalar_add( + self.handle, input.handle, scalar, inplace, c_name + ) + self.add_layer(OpType.SCALAR_ADD, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) + + def scalar_sub(self, input, scalar, inplace=True, name=None): + """Scalar subtraction of a scalar to each entry of a tensor. + + :param input: the input Tensor. + :type input: Tensor + + :param input: the scalar + :type scalar: float + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_scalar_sub( + self.handle, input.handle, scalar, inplace, c_name + ) + self.add_layer(OpType.SCALAR_SUB, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) + + def scalar_true_divide(self, input, scalar, inplace=True, name=None): + """Scalar regular division of a tensor by an scalar. + + :param input: the input Tensor. + :type input: Tensor + + :param input: the scalar + :type scalar: float + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_scalar_truediv( + self.handle, input.handle, scalar, inplace, c_name + ) + self.add_layer(OpType.SCALAR_TRUEDIV, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) + + def gelu(self, input, inplace=True, name=None): + """Gaussian Error Linear Unit activation function. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) + self.add_layer(OpType.GELU, name) + return Tensor(handle, owner_op_type=OpType.GELU) + + def relu(self, input, inplace=True, name=None): + """Rectified Linear Unit activation function. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_relu( + self.handle, input.handle, inplace, c_name + ) + self.add_layer(OpType.RELU, name) + return Tensor(handle, owner_op_type=OpType.RELU) + + def identity(self, input, name=None): + """Identity function. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) + self.add_layer(OpType.IDENTITY, name) + return Tensor(handle, owner_op_type=OpType.IDENTITY) + + def sigmoid(self, input, name=None): + """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) + self.add_layer(OpType.SIGMOID, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID) + + def tanh(self, input, name=None): + """Hyperbolic tangent activation function. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. 
Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) + self.add_layer(OpType.TANH, name) + return Tensor(handle, owner_op_type=OpType.TANH) + + def elu(self, input, inplace=True, name=None): + """Exponential Linear Unit. activation function. + + :param input: the input Tensor. + :type input: Tensor + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_elu( + self.handle, input.handle, inplace, c_name + ) + self.add_layer(OpType.ELU, name) + return Tensor(handle, owner_op_type=OpType.ELU) + + def dropout(self, input, rate, seed, name=None): + """The Dropout layer randomly sets input units to 0 with + a frequency of :attr:`rate` at each step during training time, + which helps prevent overfitting. + Inputs not set to 0 are scaled up by 1/(1 - rate) such that the + sum over all inputs is unchanged. + + :param input: the input Tensor. + :type input: Tensor + + :param rate: Fraction of the input units to drop. + :type rate: float(0-1) + + :param seed: random seed. + :type seed: int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_dropout( + self.handle, input.handle, rate, seed, c_name + ) + self.add_layer(OpType.DROPOUT, name) + return Tensor(handle, owner_op_type=OpType.DROPOUT) + + def multihead_attention( + self, + query, + key, + value, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kernel_initializer=None, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, + and returns the dot-product attention between them:. + + :param query: the query Tensor. + :type query: Tensor + + :param key: the key Tensor. + :type key: Tensor + + :param value: the value Tensor. + :type value: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
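# Usage sketch of the unary activations and dropout documented above; assumes `ffmodel`
# and the tensor `h` from the MLP sketch; the dropout rate and seed are arbitrary.
g = ffmodel.gelu(h)
sgm = ffmodel.sigmoid(h)
th = ffmodel.tanh(h)
dropped = ffmodel.dropout(g, 0.1, 0)   # rate = 0.1, seed = 0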
+ """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + handle = ffc().flexflow_model_add_multihead_attention( + self.handle, + query.handle, + key.handle, + value.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + kernel_init_handle, + c_name, + ) + self.add_layer(OpType.MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) + + def inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
+ """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
+ """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multihead_self_attention_verify( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
+ """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
+ """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
+ """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multiquery_self_attention_verify( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
+ """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + + def rms_norm(self, input, eps, dim, name=None): + """Defines the RMS Norm layer. + + :param input: the input Tensor. + :type input: Tensor + + :param eps: a value added to the denominator for numerical stability + :type eps: float + + :param dim: The dimension with respect to which to take the norm + :type dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_rms_norm( + self.handle, input.handle, eps, dim, c_name + ) + self.add_layer(OpType.RMS_NORM, name) + return Tensor(handle, owner_op_type=OpType.RMS_NORM) + + def residual_rms_norm(self, input1, input2, eps, dim, name=None): + """Defines the Residual RMS Norm layer. + + :param input: the input 1 Tensor. + :type input: Tensor + + :param input: the input 2 Tensor. + :type input: Tensor + + :param eps: a value added to the denominator for numerical stability + :type eps: float + + :param dim: The dimension with respect to which to take the norm + :type dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handles_array = ffc().flexflow_model_add_residual_rms_norm( + self.handle, input1.handle, input2.handle, eps, dim, c_name + ) + self.add_layer(OpType.RESIDUAL_RMS_NORM, name) + return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor( + handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM + ) + + def arg_top_k(self, input, k, sorted, name=None): + """Defines the Arg TopK layer. + + :param input: the input Tensor. + :type input: Tensor + + :param k: the top k indices to select + :type k: int + + :param sorted: Whether the entries should be sorted + :type sorted: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_arg_top_k( + self.handle, input.handle, k, sorted, c_name + ) + self.add_layer(OpType.ARG_TOPK, name) + return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + + def beam_top_k(self, input, max_beam_size, sorted, name=None): + """Defines the Beam TopK layer. + + :param input: the input Tensor. + :type input: Tensor + + :param max_beam_size: the top max_beam_size indices to select + :type max_beam_size: int + + :param sorted: Whether the entries should be sorted + :type sorted: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. 
+ """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_beam_top_k( + self.handle, input.handle, max_beam_size, sorted, c_name + ) + self.add_layer(OpType.BEAM_TOPK, name) + return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + + def sampling(self, input, top_p, name=None): + """Defines the Sampling layer. + + :param input: the input Tensor. + :type input: Tensor + + :param top_p: The top_p parameter of the sampling + :type top_p: float + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sampling( + self.handle, input.handle, top_p, c_name + ) + self.add_layer(OpType.SAMPLING, name) + return Tensor(handle, owner_op_type=OpType.SAMPLING) + + def argmax(self, input, beam_search, name=None): + """Defines the Sampling layer. + + :param input: the input Tensor. + :type input: Tensor + + :param beam_search: Whether you need to perform beam search + :type beam_search: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_argmax( + self.handle, input.handle, beam_search, c_name + ) + self.add_layer(OpType.ARGMAX, name) + return Tensor(handle, owner_op_type=OpType.ARGMAX) + + def reset_metrics(self): + """Reset performance metrics. + + :returns: None -- no returns. + """ + ffc().flexflow_model_reset_metrics(self.handle) + + def init_layers(self): + """Initialize layers. + + :returns: None -- no returns. + """ + ffc().flexflow_model_init_layers(self.handle) + + def prefetch(self): + ffc().flexflow_model_prefetch(self.handle) + + def forward(self, seq_length=None): + """Forward propagation of all layers. + + :returns: None -- no returns. + """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_forward(self.handle, seq_length) + + # TODO: seperate compute_metrics from backward + def backward(self, seq_length=None): + """Backward propagation of all layers. + + :returns: None -- no returns. + """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_backward(self.handle, seq_length) + + def compute_metrics(self): + """Compute performance metrics. + + :returns: None -- no returns. + """ + ffc().flexflow_model_compute_metrics(self.handle) + + def update(self): + """Update weights and biases of all layers. + + :returns: None -- no returns. + """ + ffc().flexflow_model_update(self.handle) + + def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): + """Configure the model for trainting. FlexFlow uses lazy initialization, + so the actual creating of all operations (including creating and partitioning + of weight, bias and output tensors) happen during compile. + + :param optimizer: optimizer instance. + :type optimizer: Optimizer + + :param loss_type: Enum of LossType. + Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. + :type loss_type: LossType + + :param metrics: List of metrics to be evaluated by the model during training and testing. + Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, + METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, + METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR + :type metrics: MetricsType + + :param comp_mode: Enum of CompMode. 
+ Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE + :type comp_mode: CompMode + + :returns: None -- no returns. + """ + self.optimizer = optimizer + + c_loss_type = enum_to_int(LossType, loss_type) + metrics_int = [] + for metric in metrics: + metrics_int.append(enum_to_int(MetricsType, metric)) + c_metrics = ffi.new("int[]", metrics_int) + if comp_mode == None: + comp_mode = CompMode.TRAINING + c_comp_mode = enum_to_int(CompMode, comp_mode) + ffc().flexflow_model_compile( + self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + ) + for ff_tensor, np_tensor in self.attr_tensors.items(): + ff_tensor.set_tensor(self, np_tensor) + print("Compiled ffmodel!") + + def fit(self, x=None, y=None, batch_size=None, epochs=1): + """Trains the model for a fixed number of epochs (iterations on a dataset). + + :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. + :type x: Dataloader + + :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. + :type y: Dataloader + + :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` + or :attr:`--batch-size` from the command line. + :type batch_size: int + + :param epochs: Number of epochs to train the model. + An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. + The default value is 1. + :type epochs: int + + :returns: None -- no returns. + """ + if isinstance(x, list) == False: + dataloaders = [x] + else: + dataloaders = x + dataloaders.append(y) + + num_samples = y.num_samples + batch_size = self._ffconfig.batch_size + self._tracing_id += 1 # get a new tracing id + for epoch in range(0, epochs): + for d in dataloaders: + d.reset() + self.reset_metrics() + iterations = num_samples / batch_size + for iter in range(0, int(iterations)): + self._ffconfig.begin_trace(self._tracing_id) + for d in dataloaders: + d.next_batch(self) + self.forward() + self.zero_gradients() + self.backward() + self.update() + self._ffconfig.end_trace(self._tracing_id) + + def eval(self, x=None, y=None, batch_size=None): + """Returns the loss value & metrics values for the model in test mode. + + :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. + :type x: Dataloader + + :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. + :type y: Dataloader + + :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` + or :attr:`--batch-size` from the command line. + :type batch_size: int + + :param epochs: Number of epochs to train the model. + An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. + The default value is 1. + :type epochs: int + + :returns: None -- no returns. + """ + if isinstance(x, list) == False: + dataloaders = [x] + else: + dataloaders = x + dataloaders.append(y) + + num_samples = y.num_samples + batch_size = self._ffconfig.batch_size for d in dataloaders: - d.next_batch(self) - self.forward() - self.zero_gradients() - self.backward() - self.update() - self._ffconfig.end_trace(self._tracing_id) - - def eval(self, x=None, y=None, batch_size=None): - """Returns the loss value & metrics values for the model in test mode. - - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader - - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. 
- :type y: Dataloader - - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int - - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int - - :returns: None -- no returns. - """ - if (isinstance(x, list) == False): - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - self._tracing_id += 1 # get a new tracing id - for iter in range(0, int(iterations)): - for d in dataloaders: - d.next_batch(self) - self._ffconfig.begin_trace(self._tracing_id) - self.forward() - self.compute_metrics() - self._ffconfig.end_trace(self._tracing_id) - - def zero_gradients(self): - """Empty the gradients of all layers. - - :returns: None -- no returns. - """ - ffc().flexflow_model_zero_gradients(self.handle) - - def set_optimizer(self, optimizer): - if isinstance(optimizer, SGDOptimizer) == True: - ffc().flexflow_model_set_sgd_optimizer(self.handle, optimizer.handle) - elif isinstance(optimizer, AdamOptimizer) == True: - ffc().flexflow_model_set_adam_optimizer(self.handle, optimizer.handle) - elif optimizer == None: - pass - else: - assert 0, "[Model]: unknown optimizer" - - optimizer = property(fset=set_optimizer) - - def print_layers(self, id=-1): - ffc().flexflow_model_print_layers(self.handle, id) - - def get_layer_by_id(self, layer_id): - return self._layers[layer_id] - - def get_last_layer(self): - return self._layers[self._nb_layers-1] - - def get_layer_by_name(self, layer_name): - for layer_id in self._layers: - layer = self._layers[layer_id] - if layer.name == layer_name: - return layer - assert 0, f"Cannot find the layer with name {layer_name}" - return None - - def get_tensor_by_id(self, id): - handle = ffc().flexflow_model_get_parameter_by_id(self.handle, id) - return Parameter(handle) - - @property - def label_tensor(self): - handle = ffc().flexflow_model_get_label_tensor(self.handle) - return Tensor(handle, deallocate=False) - - def get_perf_metrics(self): - handle = ffc().flexflow_model_get_perf_metrics(self.handle) - return PerfMetrics(handle) - - def set_transformer_layer_id(self, id): - ffc().flexflow_model_set_transformer_layer_id(self.handle, id) - - def create_data_loader(self, batch_tensor, full_array): - """Create a SingleDataloader instance. - - :param batch_tensor: a batch-sized tensor. Usually it is a input tensor of the model. - :type batch_tensor: Tensor - - :param full_array: the entire data. - :type full_array: Numpy Array - - :returns: SingleDataloader -- returns a dataloader instance. 
- """ - - if (self._ffconfig.enable_control_replication): - assert self._ffconfig.python_data_loader_type != 1, 'To enable control replication, please set --python-data-loader-type 2' - return self.__create_data_loader_ptr(batch_tensor, full_array) - else: - if (self._ffconfig.python_data_loader_type == 1): - return self.__create_data_loader_attach(batch_tensor, full_array) - else: - return self.__create_data_loader_ptr(batch_tensor, full_array) - - def __create_data_loader_attach(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - num_dim = len(full_array_shape) - if (full_array.dtype == "float16"): - datatype = DataType.DT_HALF - elif (full_array.dtype == "float32"): - datatype = DataType.DT_FLOAT - elif (full_array.dtype == "int32"): - datatype = DataType.DT_INT32 - elif (full_array.dtype == "int64"): - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" - - if (num_dim == 2): - full_tensor = self.create_tensor([num_samples, full_array_shape[1]], datatype) - self.map_tensor(full_tensor) - elif (num_dim == 4): - full_tensor = self.create_tensor([num_samples, full_array_shape[1], full_array_shape[2], full_array_shape[3]], datatype) - self.map_tensor(full_tensor) - else: - assert 0, "unsupported dims" - - full_tensor.attach_numpy_array(self._ffconfig, full_array) - dataloader = SingleDataLoader(self, batch_tensor, full_tensor, num_samples, datatype) - full_tensor.detach_numpy_array(self._ffconfig) - - return dataloader - - def __create_data_loader_ptr(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - if (full_array.dtype == "float16"): - datatype = DataType.DT_HALF - elif (full_array.dtype == "float32"): - datatype = DataType.DT_FLOAT - elif (full_array.dtype == "int32"): - datatype = DataType.DT_INT32 - elif (full_array.dtype == "int64"): - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" - np_raw_ptr = full_array.__array_interface__['data'] - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - print("numpy array: %s, %s, %s" % (str(np_raw_ptr), str(raw_ptr), hex(np_raw_ptr[0]))) - dataloader = SingleDataLoader(self, batch_tensor, raw_ptr, num_samples, datatype) - - return dataloader - - def __get_initializer_handle(self, initializer): - if (initializer == None): - null_initializer = Initializer(None) - return null_initializer.handle - else: - return initializer.handle + d.reset() + self.reset_metrics() + iterations = num_samples / batch_size + self._tracing_id += 1 # get a new tracing id + for iter in range(0, int(iterations)): + for d in dataloaders: + d.next_batch(self) + self._ffconfig.begin_trace(self._tracing_id) + self.forward() + self.compute_metrics() + self._ffconfig.end_trace(self._tracing_id) + + def zero_gradients(self): + """Empty the gradients of all layers. + + :returns: None -- no returns. 
+ """ + ffc().flexflow_model_zero_gradients(self.handle) + + def set_optimizer(self, optimizer): + if isinstance(optimizer, SGDOptimizer) == True: + ffc().flexflow_model_set_sgd_optimizer(self.handle, optimizer.handle) + elif isinstance(optimizer, AdamOptimizer) == True: + ffc().flexflow_model_set_adam_optimizer(self.handle, optimizer.handle) + elif optimizer == None: + pass + else: + assert 0, "[Model]: unknown optimizer" + + optimizer = property(fset=set_optimizer) + + def print_layers(self, id=-1): + ffc().flexflow_model_print_layers(self.handle, id) + + def get_layer_by_id(self, layer_id): + return self._layers[layer_id] + + def get_last_layer(self): + return self._layers[self._nb_layers - 1] + + def get_layer_by_name(self, layer_name): + for layer_id in self._layers: + layer = self._layers[layer_id] + if layer.name == layer_name: + return layer + assert 0, f"Cannot find the layer with name {layer_name}" + return None + + def get_tensor_by_id(self, id): + handle = ffc().flexflow_model_get_parameter_by_id(self.handle, id) + return Parameter(handle) + + @property + def label_tensor(self): + handle = ffc().flexflow_model_get_label_tensor(self.handle) + return Tensor(handle, deallocate=False) + + def get_perf_metrics(self): + handle = ffc().flexflow_model_get_perf_metrics(self.handle) + return PerfMetrics(handle) + + def set_transformer_layer_id(self, id): + ffc().flexflow_model_set_transformer_layer_id(self.handle, id) + + def create_data_loader(self, batch_tensor, full_array): + """Create a SingleDataloader instance. + + :param batch_tensor: a batch-sized tensor. Usually it is a input tensor of the model. + :type batch_tensor: Tensor + + :param full_array: the entire data. + :type full_array: Numpy Array + + :returns: SingleDataloader -- returns a dataloader instance. 
+ """ + + if self._ffconfig.enable_control_replication: + assert ( + self._ffconfig.python_data_loader_type != 1 + ), "To enable control replication, please set --python-data-loader-type 2" + return self.__create_data_loader_ptr(batch_tensor, full_array) + else: + if self._ffconfig.python_data_loader_type == 1: + return self.__create_data_loader_attach(batch_tensor, full_array) + else: + return self.__create_data_loader_ptr(batch_tensor, full_array) + + def __create_data_loader_attach(self, batch_tensor, full_array): + full_array_shape = full_array.shape + num_samples = full_array_shape[0] + num_dim = len(full_array_shape) + if full_array.dtype == "float16": + datatype = DataType.DT_HALF + elif full_array.dtype == "float32": + datatype = DataType.DT_FLOAT + elif full_array.dtype == "int32": + datatype = DataType.DT_INT32 + elif full_array.dtype == "int64": + datatype = DataType.DT_INT64 + else: + assert 0, "unsupported datatype" + + if num_dim == 2: + full_tensor = self.create_tensor( + [num_samples, full_array_shape[1]], datatype + ) + self.map_tensor(full_tensor) + elif num_dim == 4: + full_tensor = self.create_tensor( + [ + num_samples, + full_array_shape[1], + full_array_shape[2], + full_array_shape[3], + ], + datatype, + ) + self.map_tensor(full_tensor) + else: + assert 0, "unsupported dims" + + full_tensor.attach_numpy_array(self._ffconfig, full_array) + dataloader = SingleDataLoader( + self, batch_tensor, full_tensor, num_samples, datatype + ) + full_tensor.detach_numpy_array(self._ffconfig) + + return dataloader + + def __create_data_loader_ptr(self, batch_tensor, full_array): + full_array_shape = full_array.shape + num_samples = full_array_shape[0] + if full_array.dtype == "float16": + datatype = DataType.DT_HALF + elif full_array.dtype == "float32": + datatype = DataType.DT_FLOAT + elif full_array.dtype == "int32": + datatype = DataType.DT_INT32 + elif full_array.dtype == "int64": + datatype = DataType.DT_INT64 + else: + assert 0, "unsupported datatype" + np_raw_ptr = full_array.__array_interface__["data"] + raw_ptr = ffi.cast("float*", np_raw_ptr[0]) + print( + "numpy array: %s, %s, %s" + % (str(np_raw_ptr), str(raw_ptr), hex(np_raw_ptr[0])) + ) + dataloader = SingleDataLoader( + self, batch_tensor, raw_ptr, num_samples, datatype + ) + + return dataloader + + def __get_initializer_handle(self, initializer): + if initializer == None: + null_initializer = Initializer(None) + return null_initializer.handle + else: + return initializer.handle + + def __get_op_handle(self, shared_op): + if shared_op == None: + op_handle = ffi.new("flexflow_op_t *") + op_handle.impl = ffi.NULL + op = Op(op_handle[0]) + else: + op = shared_op + return op.handle + + def get_output_tensor(self, ffmodel, data_type): + shape = self.dims + if data_type == DataType.DT_HALF: + np_array = np.empty(shape, dtype=np.float16) + elif data_type == DataType.DT_FLOAT: + np_array = np.empty(shape, dtype=np.float32) + elif self.data_type == DataType.DT_INT32: + np_array = np.empty(shape, dtype=np.int32) + elif self.data_type == DataType.DT_INT64: + np_array = np.empty(shape, dtype=np.int64) + else: + assert 0, f"Unsupported datatype: {self.data_type}" + np_raw_ptr = np_array.__array_interface__["data"] + if np_array.dtype == np.float32: + raw_ptr = ffi.cast("float*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_get_tensor_float( + self.handle, ffmodel.handle, raw_ptr, False + ) + elif np_array.dtype == np.int32: + raw_ptr = ffi.cast("int*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_get_tensor_int( + 
self.handle, ffmodel.handle, raw_ptr, False + ) + elif np_array.dtype == np.int64: + raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) + ret_val = ffc().flexflow_tensor_get_tensor_int64( + self.handle, ffmodel.handle, raw_ptr, False + ) + fflogger.debug( + "get weights raw_ptr: %s, %s, %s, %s" + % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) + ) + assert ret_val == True + return np_array + + def generate(self, prompt, max_sequence_length): + c_input_text = get_c_name(prompt) + max_num_chars = 36000 + c_output_text = ffi.new("char[]", max_num_chars) + c_output_length_and_tokens = ffi.new("int[]", max_sequence_length + 100) + ffc().flexflow_model_generate( + self.handle, + c_input_text, + max_num_chars, + c_output_text, + max_sequence_length, + c_output_length_and_tokens, + ) + output_length = c_output_length_and_tokens[0] + output_tokens = [] + for i in range(output_length): + output_tokens.append(c_output_length_and_tokens[i + 1]) + from flexflow.serve import GenerationResult + + return GenerationResult(ffi.string(c_output_text), output_tokens) + + def set_position_offset(self, offset): + ffc().flexflow_model_set_position_offset(self.handle, offset) - def __get_op_handle(self, shared_op): - if shared_op == None: - op_handle = ffi.new('flexflow_op_t *') - op_handle.impl = ffi.NULL - op = Op(op_handle[0]) - else: - op = shared_op - return op.handle - - def get_output_tensor(self, ffmodel, data_type): - shape = self.dims - if data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__['data'] - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_float(self.handle, ffmodel.handle, raw_ptr, False) - elif np_array.dtype == np.int32: - raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int(self.handle, ffmodel.handle, raw_ptr, False) - elif np_array.dtype == np.int64: - raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int64(self.handle, ffmodel.handle, raw_ptr, False) - fflogger.debug("get weights raw_ptr: %s, %s, %s, %s" %( str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape))) - assert ret_val == True - return np_array - - def generate(self, prompt, max_sequence_length): - c_input_text = get_c_name(prompt) - max_num_chars = 36000 - c_output_text = ffi.new("char[]", max_num_chars) - c_output_length_and_tokens = ffi.new("int[]", max_sequence_length + 100) - ffc().flexflow_model_generate(self.handle, c_input_text, max_num_chars, c_output_text, max_sequence_length, c_output_length_and_tokens) - output_length = c_output_length_and_tokens[0] - output_tokens = [] - for i in range(output_length): - output_tokens.append(c_output_length_and_tokens[i+1]) - from flexflow.serve import GenerationResult - return GenerationResult(ffi.string(c_output_text), output_tokens) - - def set_position_offset(self, offset): - ffc().flexflow_model_set_position_offset(self.handle, offset) # ----------------------------------------------------------------------- # SGDOptimizer # ----------------------------------------------------------------------- + class 
SGDOptimizer(object): - __slots__ = ['handle', '_handle'] - def __init__(self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0): - self.handle = ffc().flexflow_sgd_optimizer_create(ffmodel.handle, lr, momentum, nesterov, weight_decay) - self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) + __slots__ = ["handle", "_handle"] + + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) + + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - def set_learning_rate(self, learning_rate): - ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) # ----------------------------------------------------------------------- # AdamOptimizer # ----------------------------------------------------------------------- + class AdamOptimizer(object): - __slots__ = ['handle', '_handle'] - def __init__(self, ffmodel, alpha=0.001, beta1=0.9, beta2=0.999, weight_decay=0.0, epsilon=1e-8): - self.handle = ffc().flexflow_adam_optimizer_create(ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon) - self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) + + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - def set_learning_rate(self, learning_rate): - ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) # ----------------------------------------------------------------------- # Initializer # ----------------------------------------------------------------------- class Initializer(object): - __slots__ = ['handle', 'p_handle'] - def __init__(self, handle, p_handle=0): - self.p_handle = ffi.new('flexflow_initializer_t *') - if (handle == None): - self.p_handle.impl = ffi.NULL - else: - self.p_handle.impl = handle.impl - self.handle = self.p_handle[0] - assert ffi.typeof(self.handle) == ffi.typeof('flexflow_initializer_t'), "Initializer handle is wrong" + __slots__ = ["handle", "p_handle"] + + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" + # ----------------------------------------------------------------------- # GlorotUniform # ----------------------------------------------------------------------- + class GlorotUniformInitializer(Initializer): - __slots__ = ['glorot_handle', '_glorot_handle'] - def __init__(self, seed): - self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) - self._glorot_handle = ffi.gc(self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy) - super(GlorotUniformInitializer, self).__init__(self.glorot_handle) + __slots__ = ["glorot_handle", "_glorot_handle"] + + def __init__(self, seed): + self.glorot_handle = 
ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) + # ----------------------------------------------------------------------- # ZeroInitializer # ----------------------------------------------------------------------- + class ZeroInitializer(Initializer): - __slots__ = ['zero_handle', '_zero_handle'] - def __init__(self): - self.zero_handle = ffc().flexflow_zero_initializer_create() - self._zero_handle = ffi.gc(self.zero_handle, ffc().flexflow_zero_initializer_destroy) - super(ZeroInitializer, self).__init__(self.zero_handle) + __slots__ = ["zero_handle", "_zero_handle"] + + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) + # ----------------------------------------------------------------------- # UniformInitializer # ----------------------------------------------------------------------- + class UniformInitializer(Initializer): - __slots__ = ['uniform_handle', '_uniform_handle'] - def __init__(self, seed, minv, maxv): - self.uniform_handle = ffc().flexflow_uniform_initializer_create(seed, minv, maxv) - self._uniform_handle = ffi.gc(self.uniform_handle, ffc().flexflow_uniform_initializer_destroy) - super(UniformInitializer, self).__init__(self.uniform_handle) + __slots__ = ["uniform_handle", "_uniform_handle"] + + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) + # ----------------------------------------------------------------------- # NormInitializer # ----------------------------------------------------------------------- + class NormInitializer(Initializer): - __slots__ = ['norm_handle', '_norm_handle'] - def __init__(self, seed, mean, stddev): - self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) - self._norm_handle = ffi.gc(self.norm_handle, ffc().flexflow_norm_initializer_destroy) - super(NormInitializer, self).__init__(self.norm_handle) + __slots__ = ["norm_handle", "_norm_handle"] + + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy + ) + super(NormInitializer, self).__init__(self.norm_handle) + # ----------------------------------------------------------------------- # PerfMetrics # ----------------------------------------------------------------------- + class PerfMetrics(object): - __slots__= ['handle', '_handle'] - def __init__(self, handle): - self.handle = handle - self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) + __slots__ = ["handle", "_handle"] + + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) + + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - def get_accuracy(self): - return ffc().flexflow_per_metrics_get_accuracy(self.handle) # ----------------------------------------------------------------------- # NetConfig # 
----------------------------------------------------------------------- + class NetConfig(object): - def __init__(self): - self.handle = ffc().flexflow_net_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) - cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cpath) + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) + # ----------------------------------------------------------------------- # DLRMConfig # ----------------------------------------------------------------------- + class DLRMConfig(object): - def __init__(self): - self.handle = ffc().flexflow_dlrm_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) + + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cstr) + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) - self.arch_interaction_op = ffi.string(cstr) + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle + ) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size(self.handle) - self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) - self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) - self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size(self.handle) - self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) - self.mlp_bot = [] - for i in range(0, mlp_bot_c[0]): - self.mlp_bot.append(mlp_bot_c[i+1]) + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) - self.mlp_top = [] - for i in range(0, mlp_top_c[0]): - self.mlp_top.append(mlp_top_c[i+1]) + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) - self.embedding_size = [] - for i in range(0, embedding_size_c[0]): - self.embedding_size.append(embedding_size_c[i+1]) # ----------------------------------------------------------------------- # Single DataLoader # 
----------------------------------------------------------------------- + class SingleDataLoader(object): - __slots__ = ['handle', '_handle'] - def __init__(self, ffmodel, input, full_input, num_samples, data_type): - assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" - assert type(input) is Tensor, "SingleDataLoader input is wrong" - if type(full_input) is Tensor: - self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) - else: - self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) - self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) - - def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): - assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create(ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type) - - def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): - # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create2(ffmodel.handle, input.handle, full_input, num_samples, c_data_type) - - @property - def num_samples(self): - return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - - @num_samples.setter - def num_samples(self, samples): - ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - - def next_batch(self, ffmodel): - """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - - :returns: None -- no returns. - """ - ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - - def reset(self): - """Reset the current position of the dataloder to 0. - - :returns: None -- no returns. - """ - ffc().flexflow_single_dataloader_reset(self.handle) + __slots__ = ["handle", "_handle"] + + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) + + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type + ) + + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type + ) + + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) + + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) + + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. + + :returns: None -- no returns. 
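# Illustrative usage sketch (not taken from this patch): constructing a
# SingleDataLoader directly, as the class above allows, passing either a full
# Tensor or a raw host pointer as `full_input`. `ffmodel`, `batch_tensor`,
# `full_tensor` and `num_samples` are hypothetical pre-existing objects;
# DT_FLOAT is illustrative.
def _example_single_dataloader(ffmodel, batch_tensor, full_tensor, num_samples):
    loader = SingleDataLoader(ffmodel, batch_tensor, full_tensor,
                              num_samples, DataType.DT_FLOAT)
    loader.reset()              # rewind the loader to sample 0
    loader.next_batch(ffmodel)  # load the first batch into batch_tensor
    return loader.num_samples   # property backed by the C API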
+ """ + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) + + def reset(self): + """Reset the current position of the dataloder to 0. + + :returns: None -- no returns. + """ + ffc().flexflow_single_dataloader_reset(self.handle) + class RegionNdarray(object): - __slots__ = ['__array_interface__'] - def __init__(self, shape, data_type, base_ptr, strides, read_only): - # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html - if (data_type == DataType.DT_HALF): - field_type = " axes_vec; + for (int i = 0; i < n; i++) { + axes_vec.push_back(axes[i]); + } + if (use_two_residuals) { + assert(residual2 != nullptr); + } + handle->residual_layer_norm(input, + residual1, + residual2, + tensor_outputs, + use_two_residuals, + axes_vec, + elementwise_affine, + eps, + use_bias, + input->data_type, + name); + assert(tensor_outputs[0] != nullptr); + assert(tensor_outputs[1] != nullptr); + DEBUG_PRINT("[ResidualLayerNorm] input %p, residual1 %p, residual2 " + "%p, output0: %p, " + "output1: %p, use_two_residuals: %d, elementwise_affine %d, eps " + "%f, use_bias: %d, name %s", + input, + residual1, + residual2, + tensor_outputs[0], + tensor_outputs[1], + use_two_residuals, + elementwise_affine, + eps, + use_bias, + name); + flexflow_tensor_t *tensor_outputs_wrapped = + (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); + tensor_outputs_wrapped[0] = FFCObjectWrapper::wrap(tensor_outputs[0]); + tensor_outputs_wrapped[1] = FFCObjectWrapper::wrap(tensor_outputs[1]); + return tensor_outputs_wrapped; +} + flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_model_t handle_, const flexflow_tensor_t input_, @@ -667,16 +726,16 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); - DEBUG_PRINT("[LayerNorm] new Tensor %p, input %p, residual %p, output0: %p, " + DEBUG_PRINT("[AddBiasResidualLayerNorm] input %p, residual %p, output0: %p, " "output1: %p, elementwise_affine %d, eps " - "%f, name %s", - tensor, + "%f, use_bias %d, name %s", input, residual, tensor_outputs[0], tensor_outputs[1], elementwise_affine, eps, + use_bias, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 9ac440080f..626e56d64f 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -217,8 +217,8 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_float_ptr(), added_output.get_float_ptr(), output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, stream); } else if (m->input_type[0] == DT_HALF) { AddBiasResidualLayerNorm::inference_kernel( @@ -230,8 +230,8 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_half_ptr(), added_output.get_half_ptr(), output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); } else { assert(false && "unsupport datatype in layernorm"); diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index f865c6dd2a..3282bc57d9 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -37,6 +37,7 @@ #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/ops/layer_norm.h" +#include "flexflow/ops/residual_layer_norm.h" #include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" @@ -475,6 +476,11 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_RESIDUAL_LAYERNORM: { + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); + break; + } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { assert(false && "Operator AddBiasResidualLayerNorm does not support " "the forward() task"); @@ -928,6 +934,45 @@ __host__ void m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 2); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 13927e8ee6..5f2874e662 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -37,6 +37,7 @@ #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" #include "flexflow/ops/layer_norm.h" +#include "flexflow/ops/residual_layer_norm.h" #include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" @@ -488,6 +489,11 @@ __host__ void FusedOp::forward_task(Task const *task, m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_RESIDUAL_LAYERNORM: { + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); + break; + } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { assert(false && "Operator AddBiasResidualLayerNorm does not support " "the forward() task"); @@ -959,6 +965,45 @@ __host__ void m, my_input_accessor[0], my_output_accessor[0], gamma, beta); break; } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + 
assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 2); diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 4d04710b2a..12751113a2 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -257,20 +257,21 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, cudaEventRecord(t_start, stream); } if (m->input_type[0] == DT_FLOAT) { - LayerNorm::forward_kernel(m, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() - : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); } else if (m->input_type[0] == DT_HALF) { - LayerNorm::forward_kernel(m, - input.get_half_ptr(), - output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc new file mode 100644 index 0000000000..3bec09521a --- /dev/null +++ b/src/ops/residual_layer_norm.cc @@ -0,0 +1,823 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/residual_layer_norm.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +bool operator==(ResidualLayerNormParams const &lhs, + ResidualLayerNormParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && + lhs.elementwise_affine == rhs.elementwise_affine && + lhs.use_bias == rhs.use_bias && + lhs.use_two_residuals == rhs.use_two_residuals; +} + +bool ResidualLayerNormParams::is_valid( + std::tuple const &input) const { + return std::get<0>(input).is_valid() && std::get<1>(input).is_valid() && + (!use_two_residuals || std::get<2>(input).is_valid()); +} + +ResidualLayerNormParams ResidualLayerNorm::get_params() const { + ResidualLayerNormParams params; + params.layer_guid = this->layer_guid; + params.axes = this->axes; + params.elementwise_affine = this->elementwise_affine; + params.eps = this->eps; + params.use_bias = this->use_bias; + params.use_two_residuals = this->use_two_residuals; + return params; +} + +void FFModel::residual_layer_norm(const Tensor input, + const Tensor residual1, + const Tensor residual2, + Tensor *outputs, + bool use_two_residuals, + std::vector const &axes, + bool elementwise_affine, + float eps, + bool use_bias, + DataType data_type, + char const *name) { + // In PyTorch, axes must be the sizes of the last axes.size() dimensions of + // the input tensor. However, since the tensor dimensions are reversed in + // FlexFlow (batch size is the last dimension), we require that axes must be + // the sizes of the FIRST axes.size() dimensions of the input tensor. + + // Another difference is that in PyTorch, the axes vector should contain the + // sizes of the dimensions with respect to which you want to compute the + // layernorm. In FlexFlow, instead, axes should contain the INDICES of the + // dimensions in question. We do this because the size of a dimension might be + // different when splitting a tensor in model parallelism. + assert( + axes.size() <= input->num_dims && + "number of axes must be less than tensor dimensions"); // input does not + // have replica + // dimension here + for (int i = 0; i < axes.size(); i++) { + assert(axes[i] == i && "axes must be the first axes.size() dimensions"); + } + + // Check dims + assert(input->num_dims == residual1->num_dims); + if (use_two_residuals) { + assert(residual2 != nullptr); + assert(input->num_dims == residual2->num_dims); + } + for (int i = 0; i < input->num_dims; i++) { + assert(input->dims[i] == residual1->dims[i]); + if (use_two_residuals) { + assert(input->dims[i] == residual2->dims[i]); + } + } + + if (data_type == DT_NONE) { + data_type = input->data_type; + } + + int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; + Layer *ln = nullptr; + Tensor casted_input = + (data_type != input->data_type) + ? cast(input, data_type, "type cast for residual_layer_norm") + : input; + Tensor casted_residual1 = + (data_type != residual1->data_type) + ? 
cast(residual1, data_type, "type cast for residual1_layer_norm") + : residual1; + Tensor casted_residual2 = nullptr; + if (use_two_residuals) { + casted_residual2 = + (data_type != residual2->data_type) + ? cast(residual2, data_type, "type cast for residual2_layer_norm") + : residual2; + } + ln = new Layer(this, + OP_RESIDUAL_LAYERNORM, + data_type, + name, + 2 + use_two_residuals /*inputs*/, + num_weights, + 2 /*outputs*/, + casted_input, + casted_residual1, + casted_residual2); + ln->outputs[0] = create_tensor_legion_ordering( + input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + ln->outputs[1] = create_tensor_legion_ordering( + input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + { + int numdims = axes.size(); + int dims[numdims]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[axes[i]]; + } + if (num_weights >= 1) { + assert(elementwise_affine); + ln->weights[0] = create_weight_legion_ordering(numdims, + dims, + data_type, + ln, + false /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + if (num_weights == 2) { + assert(use_bias); + ln->weights[1] = create_weight_legion_ordering(numdims, + dims, + data_type, + ln, + false /*create_grad*/, + nullptr, + CHOSEN_SYNC_TYPE); + } + } + } + ln->add_int_property("elementwise_affine", elementwise_affine); + ln->add_int_property("use_bias", use_bias); + ln->add_int_vector_property("axes", axes); + ln->add_float_property("eps", eps); + ln->add_int_property("use_two_residuals", use_two_residuals); + layers.push_back(ln); + outputs[0] = ln->outputs[0]; + outputs[1] = ln->outputs[1]; +} + +Op *ResidualLayerNorm::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("elementwise_affine", value); + bool elementwise_affine = (bool)value; + layer->get_int_property("use_bias", value); + bool use_bias = (bool)value; + std::vector axes; + layer->get_int_vector_property("axes", axes); + float eps; + layer->get_float_property("eps", eps); + layer->get_int_property("use_two_residuals", value); + bool use_two_residuals = (bool)value; + return new ResidualLayerNorm(model, + layer->layer_guid, + inputs[0], + inputs[1], + use_two_residuals ? inputs[2] : nullptr, + use_two_residuals, + axes, + elementwise_affine, + use_bias, + eps, + false, // allocate_weights + layer->name); +} + +ResidualLayerNorm::ResidualLayerNorm( + FFModel &model, + ResidualLayerNormParams const ¶ms, + std::tuple const &inputs, + bool allocate_weights, + char const *name) + : ResidualLayerNorm(model, + params.layer_guid, + std::get<0>(inputs), + std::get<1>(inputs), + params.use_two_residuals ? std::get<2>(inputs) + : nullptr, + params.use_two_residuals, + params.axes, + params.elementwise_affine, + params.use_bias, + params.eps, + allocate_weights, + name) {} + +ResidualLayerNorm::ResidualLayerNorm(FFModel &model, + LayerID const &_layer_guid, + const ParallelTensor _input, + const ParallelTensor _residual1, + const ParallelTensor _residual2, + bool _use_two_residuals, + std::vector const &_axes, + bool _elementwise_affine, + bool _use_bias, + float _eps, + bool allocate_weights, + char const *name) + : Op(model, + OP_RESIDUAL_LAYERNORM, + _input->data_type, + name, + 2 + _use_two_residuals /*inputs*/, + _elementwise_affine ? (_use_bias ? 2 : 1) : 0 /*weights*/, + 2 /*outputs*/, + _input, + _residual1, + _use_two_residuals ? 
_residual2 : nullptr), + elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), + use_bias(_use_bias), use_two_residuals(_use_two_residuals) { + // overwrite layer_guid + layer_guid = _layer_guid; + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, _input->dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[1] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, _input->dims, _input->data_type, this, 1 /*owner_idx*/); + assert(check_output_input_weight_parallel_dims(allocate_weights)); + + int M = 1; + for (int i = 0; i < axes.size(); i++) { + M *= inputs[0]->dims[axes[i]].size; + } + int num_replicas = 1; + for (int i = 0; i < inputs[0]->num_dims; i++) { + if (inputs[0]->dims[i].is_replica_dim) { + num_replicas *= inputs[0]->dims[i].size; + } + } + effective_num_elements = M; + effective_batch_size = (inputs[0]->get_volume() / num_replicas) / M; + if (!elementwise_affine) { + assert(numWeights == 0); + } else { + if (!use_bias) { + assert(numWeights == 1); // weight + } else { + assert(numWeights == 2); // weight + bias + } + } + + if (allocate_weights) { + int seed = std::rand(); + if (numWeights >= 1) { + assert(elementwise_affine); + + ParallelTensorShape beta_gamma_shape = _input->get_shape(); + for (int i = axes.size(); i < beta_gamma_shape.num_dims - 1; i++) { + beta_gamma_shape.dims[i].size = 1; + } + + // weight + Initializer *gamma_initializer = new UniformInitializer(seed, 1.0f, 1.0f); + weights[0] = model.create_parallel_weight_legion_ordering( + beta_gamma_shape.num_dims, // axes.size(), + beta_gamma_shape.dims, + _input->data_type, + NULL /*owner_op*/, + false /*create_grad*/, + gamma_initializer, + CHOSEN_SYNC_TYPE); + + // bias + if (numWeights == 2) { + assert(use_bias); + Initializer *beta_initializer = + new UniformInitializer(seed, 0.0f, 0.0f); + weights[1] = model.create_parallel_weight_legion_ordering( + beta_gamma_shape.num_dims, //.size(), + beta_gamma_shape.dims, + _input->data_type, + NULL /*owner_op*/, + false /*create_grad*/, + beta_initializer, + CHOSEN_SYNC_TYPE); + } + } + } +} + +void ResidualLayerNorm::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(RESIDUAL_LAYERNORM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ResidualLayerNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + int field_id = 0; + // input + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // residual1 + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // residual2 + if (use_two_residuals) { + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[2]->region)); + launcher.add_field(field_id++, FID_DATA); + } + // added: input + residual(s) + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // layer norm output + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // weights + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + } + } + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void ResidualLayerNorm::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ResidualLayerNorm)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // input + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // residual1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // residual2 + if (use_two_residuals) { + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + } + // added: input + residual(s) + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // layer norm output + 
launcher.add_region_requirement(RegionRequirement(outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // weights + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + } + } + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *ResidualLayerNorm::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ResidualLayerNorm *ln = (ResidualLayerNorm *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + ResidualLayerNormMeta *meta = + new ResidualLayerNormMeta(handle, ln, gpu_mem_allocator); + meta->input_type[0] = ln->inputs[0]->data_type; + meta->input_type[1] = ln->inputs[1]->data_type; + if (ln->use_two_residuals) { + meta->input_type[2] = ln->inputs[2]->data_type; + } + if (ln->elementwise_affine) { + meta->weight_type[0] = ln->weights[0]->data_type; + if (ln->use_bias) { + meta->weight_type[1] = ln->weights[1]->data_type; + } + } + meta->output_type[0] = ln->outputs[0]->data_type; + meta->output_type[1] = ln->outputs[1]->data_type; + return meta; +} + +void ResidualLayerNorm::forward(FFModel const &ff) { + assert(false); +} + +void ResidualLayerNorm::backward(FFModel const &ff) { + assert(false); +} + +Op *ResidualLayerNorm::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + ResidualLayerNormParams params = get_params(); + return new ResidualLayerNorm( + ff, + params, + {inputs[0], inputs[1], params.use_two_residuals ? inputs[2] : nullptr}, + true, + this->name); +} + +FutureMap ResidualLayerNorm::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + + IndexLauncher launcher(RESIDUAL_LAYERNORM_INF_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // input + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // residual1 + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // residual2 + if (use_two_residuals) { + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[2]->region)); + launcher.add_field(field_id++, FID_DATA); + } + // added: input + residual(s) + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // layer norm output + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + } + } + return runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); + + int region_idx = 0, task_region_idx = 0; + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual1 = + helperGetGenericTensorAccessorRO(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = + helperGetGenericTensorAccessorRO(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorW added_output = + helperGetGenericTensorAccessorWO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + + task_region_idx = 0; + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + Domain residual1_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + Domain residual2_domain; + if (m->use_two_residuals) { + residual2_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + assert(in_domain.get_volume() == residual2_domain.get_volume()); + assert(residual2_domain == in_domain); + } + Domain added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + Domain out_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + Domain gamma_domain, beta_domain; + if (m->elementwise_affine) { + gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + assert(gamma_domain.get_volume() == m->effective_num_elements); + int numdims = gamma_domain.get_dim(); + size_t vol = 1; + int i = 0; + while (vol < gamma_domain.get_volume()) { + int g_d = gamma_domain.hi()[i] - gamma_domain.lo()[i] + 1; + int in_d = in_domain.hi()[i] - in_domain.lo()[i] + 1; + assert(g_d == in_d); + vol *= g_d; + i++; + } + if (m->use_bias) { + beta_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + assert(gamma_domain == beta_domain); + } + } + assert(in_domain.get_volume() == out_domain.get_volume()); + assert(out_domain.get_volume() == added_out_domain.get_volume()); + assert(in_domain.get_volume() == residual1_domain.get_volume()); + assert(in_domain == out_domain); + assert(added_out_domain == out_domain); + assert(residual1_domain == in_domain); + assert(in_domain.get_volume() == + m->effective_num_elements * m->effective_batch_size); + + ResidualLayerNorm::inference_kernel_wrapper( + m, input, residual1, residual2, added_output, output, gamma, beta); +} + +bool ResidualLayerNorm::measure_operator_cost(Simulator *sim, + MachineView 
const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->axes.size()); + for (size_t i = 0; i < this->axes.size(); i++) { + sez.serialize(this->axes[i]); + } + sez.serialize(this->elementwise_affine); + sez.serialize(this->eps); + sez.serialize(this->use_bias); + sez.serialize(this->use_two_residuals); +} + +using PCG::Node; +/*static*/ +Node ResidualLayerNorm::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + size_t num_axes; + std::vector axes; + bool elementwise_affine; + bool use_bias; + bool use_two_residuals; + float eps; + size_t id, transformer_layer_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(num_axes); + for (size_t i = 0; i < num_axes; i++) { + int axis_idx; + dez.deserialize(axis_idx); + axes.push_back(axis_idx); + } + dez.deserialize(elementwise_affine); + dez.deserialize(eps); + dez.deserialize(use_bias); + dez.deserialize(use_two_residuals); + if (use_two_residuals) { + assert(num_inputs == 3); + } else { + assert(num_inputs == 2); + } + + ResidualLayerNormParams params; + params.layer_guid = layer_guid; + params.axes = axes; + params.elementwise_affine = elementwise_affine; + params.eps = eps; + params.use_bias = use_bias; + params.use_two_residuals = use_two_residuals; + if (use_two_residuals) { + return ff.get_or_create_node( + {inputs[0], inputs[1], inputs[2]}, params); + } else { + return ff.get_or_create_node( + {inputs[0], inputs[1], inputs[1]}, params); + } +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::ResidualLayerNormParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.axes.size()); + for (int n : params.axes) { + hash_combine(key, n); + } + hash_combine(key, params.elementwise_affine); + hash_combine(key, params.use_bias); + hash_combine(key, params.use_two_residuals); + return key; +} +}; // namespace std diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp new file mode 100644 index 0000000000..7b42392326 --- /dev/null +++ b/src/ops/residual_layer_norm.cpp @@ -0,0 +1,247 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/residual_layer_norm.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +#define C10_WARP_SIZE 32 +constexpr int kCUDABlockReduceNumThreads = 512; +constexpr int kCUDANumThreads = 256; + +ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, + ResidualLayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handle) { + elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; + use_two_residuals = ln->use_two_residuals; + effective_batch_size = ln->effective_batch_size; + effective_num_elements = ln->effective_num_elements; + profiling = ln->profiling; + eps = ln->eps; + DataType data_type = ln->data_type; + size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); +} + +ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, + unsigned int delta, + int width = warpSize, + unsigned int mask = 0xffffffff) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +template +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + ? shared[lid] + : 0; + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +template +__global__ void ResidualLayerNormKernel(int64_t N, + float eps, + T const *input_ptr, + T const *residual1_ptr, + T const *residual2_ptr, + T *X, + T *mean, + T *rstd, + T const *gamma, + T const *beta, + T *Y) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float sum1 = 0.0f; + float sum2 = 0.0f; + for (int64_t j = threadIdx.x; j < N; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + const int64_t index = i * N + j; + const T residual2_val = (residual2_ptr == nullptr) + ? 
T(0) + : static_cast(residual2_ptr[index]); + X[index] = input_ptr[index] + residual1_ptr[index] + residual2_val; + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); + } + if (threadIdx.x < kCUDABlockReduceNumThreads) { + sum1 = BlockReduceSum( + sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + sum2 = BlockReduceSum( + sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + } + if (threadIdx.x == 0) { + float const scale = float(1) / static_cast(N); + sum1 *= scale; + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); + } + + __syncthreads(); + + using T_ACC = T; + for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + const T_ACC beta_v = + beta == nullptr ? T_ACC(0) : static_cast(beta[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; + } +} + +/*static*/ +template +void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, + T const *input_ptr, + T const *residual1_ptr, + T const *residual2_ptr, + T *added_output_ptr, + T *output_ptr, + T const *gamma_ptr, + T const *beta_ptr, + hipStream_t stream) { + + std::pair kernel1_parallelism = + std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); + std::pair kernel2_parallelism = + std::make_pair(m->effective_batch_size, kCUDANumThreads); + + int num_blocks = + std::max(kernel1_parallelism.first, kernel2_parallelism.first); + int num_threads = + std::max(kernel1_parallelism.second, kernel2_parallelism.second); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualLayerNormKernel), + num_blocks, + num_threads, + 0, + stream, + m->effective_num_elements, + m->eps, + input_ptr, + residual1_ptr, + residual2_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} + +/*static*/ +void ResidualLayerNorm::inference_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &residual1, + GenericTensorAccessorR const &residual2, + GenericTensorAccessorW &added_output, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + ResidualLayerNorm::inference_kernel( + m, + input.get_float_ptr(), + residual1.get_float_ptr(), + residual2.get_float_ptr(), + added_output.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + ResidualLayerNorm::inference_kernel( + m, + input.get_half_ptr(), + residual1.get_half_ptr(), + residual2.get_half_ptr(), + added_output.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] forward time (CF) = %.9fms\n", elapsed); + } +} + +}; // namespace FlexFlow diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu new file mode 100644 index 0000000000..3bd18217cb --- /dev/null +++ b/src/ops/residual_layer_norm.cu @@ -0,0 +1,242 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/residual_layer_norm.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +#define C10_WARP_SIZE 32 +constexpr int kCUDABlockReduceNumThreads = 512; +constexpr int kCUDANumThreads = 256; + +ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, + ResidualLayerNorm const *ln, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handle) { + elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; + use_two_residuals = ln->use_two_residuals; + effective_batch_size = ln->effective_batch_size; + effective_num_elements = ln->effective_num_elements; + profiling = ln->profiling; + eps = ln->eps; + DataType data_type = ln->data_type; + size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); +} + +ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, + unsigned int delta, + int width = warpSize, + unsigned int mask = 0xffffffff) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +template +__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + 
? shared[lid] + : 0; + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +template +__global__ void ResidualLayerNormKernel(int64_t N, + float eps, + T const *input_ptr, + T const *residual1_ptr, + T const *residual2_ptr, + T *X, + T *mean, + T *rstd, + T const *gamma, + T const *beta, + T *Y) { + __shared__ float m_shared[C10_WARP_SIZE]; + __shared__ float v_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float sum1 = 0.0f; + float sum2 = 0.0f; + for (int64_t j = threadIdx.x; j < N; + j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + const int64_t index = i * N + j; + const T residual2_val = (residual2_ptr == nullptr) + ? T(0) + : static_cast(residual2_ptr[index]); + X[index] = input_ptr[index] + residual1_ptr[index] + residual2_val; + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); + } + if (threadIdx.x < kCUDABlockReduceNumThreads) { + sum1 = BlockReduceSum( + sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + sum2 = BlockReduceSum( + sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + } + if (threadIdx.x == 0) { + float const scale = float(1) / static_cast(N); + sum1 *= scale; + sum2 = max(sum2 * scale - sum1 * sum1, float(0)); + mean[i] = static_cast(sum1); + rstd[i] = static_cast(rsqrt(sum2 + eps)); + } + + __syncthreads(); + + using T_ACC = T; + for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + const T_ACC beta_v = + beta == nullptr ? T_ACC(0) : static_cast(beta[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; + } +} + +/*static*/ +template +void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, + T const *input_ptr, + T const *residual1_ptr, + T const *residual2_ptr, + T *added_output_ptr, + T *output_ptr, + T const *gamma_ptr, + T const *beta_ptr, + cudaStream_t stream) { + + std::pair kernel1_parallelism = + std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); + std::pair kernel2_parallelism = + std::make_pair(m->effective_batch_size, kCUDANumThreads); + + int num_blocks = + std::max(kernel1_parallelism.first, kernel2_parallelism.first); + int num_threads = + std::max(kernel1_parallelism.second, kernel2_parallelism.second); + + ResidualLayerNormKernel + <<>>(m->effective_num_elements, + m->eps, + input_ptr, + residual1_ptr, + residual2_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} + +/*static*/ +void ResidualLayerNorm::inference_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &residual1, + GenericTensorAccessorR const &residual2, + GenericTensorAccessorW &added_output, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + ResidualLayerNorm::inference_kernel( + m, + input.get_float_ptr(), + residual1.get_float_ptr(), + m->use_two_residuals ? residual2.get_float_ptr() : nullptr, + added_output.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? 
gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + ResidualLayerNorm::inference_kernel( + m, + input.get_half_ptr(), + residual1.get_half_ptr(), + m->use_two_residuals ? residual2.get_half_ptr() : nullptr, + added_output.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] forward time (CF) = %.9fms\n", elapsed); + } +} + +}; // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 91b21e8d8f..c7b6e1257a 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -170,6 +170,8 @@ std::string get_operator_type_name(OperatorType type) { return "Mean"; case OP_LAYERNORM: return "LayerNorm"; + case OP_RESIDUAL_LAYERNORM: + return "ResidualLayerNorm"; case OP_ADD_BIAS_RESIDUAL_LAYERNORM: return "AddBiasResidualLayerNorm"; case OP_SIGMOID_SILU_MULTI: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 037be739e7..408de57c54 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -40,6 +40,7 @@ #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" +#include "flexflow/ops/residual_layer_norm.h" #include "flexflow/ops/residual_rms_norm.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/sampling.h" @@ -2704,6 +2705,10 @@ void FFModel::deserialize_graph_optimal_view( node = LayerNorm::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_RESIDUAL_LAYERNORM: { + node = ResidualLayerNorm::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { node = AddBiasResidualLayerNorm::deserialize( *this, dez, inputs, num_inputs); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 0cb50733a3..5ef55992ef 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -51,6 +51,7 @@ #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" +#include "flexflow/ops/residual_layer_norm.h" #include "flexflow/ops/residual_rms_norm.h" #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" @@ -3103,6 +3104,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_RESIDUAL_LAYERNORM: { + Op *op = + ResidualLayerNorm::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { Op *op = AddBiasResidualLayerNorm::create_operator_from_layer( *this, layer, inputs); @@ -5176,6 +5183,39 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + // ResidualLayerNorm task + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_INIT_TASK_ID, + "residual_layernorm_init_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_init_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = 
false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_INF_TASK_ID, + "residual_layernorm_fwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_inference_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // AddBiasResidualLayerNorm task { TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 43f3dc7cf9..6b2d223f54 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -27,6 +27,7 @@ #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" #include "flexflow/ops/reshape.h" +#include "flexflow/ops/residual_layer_norm.h" #include "flexflow/ops/residual_rms_norm.h" #include "flexflow/ops/reverse.h" #include "flexflow/ops/rms_norm.h" @@ -96,6 +97,8 @@ tl::optional get_op_parameters(Op const *op) { return ((TreeIncMultiHeadSelfAttention *)op)->get_params(); case OP_LAYERNORM: return ((LayerNorm *)op)->get_params(); + case OP_RESIDUAL_LAYERNORM: + return ((ResidualLayerNorm *)op)->get_params(); case OP_ADD_BIAS_RESIDUAL_LAYERNORM: return ((AddBiasResidualLayerNorm *)op)->get_params(); case OP_SIGMOID_SILU_MULTI: diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 8618c41129..e8b986582f 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -33,6 +33,7 @@ #include "flexflow/ops/linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" +#include "flexflow/ops/residual_layer_norm.h" #include "flexflow/ops/residual_rms_norm.h" #include "flexflow/ops/rms_norm.h" #include "flexflow/ops/sigmoid_silu_multi.h" From ee6090ebec6548ef5294acec91cd0bc59934a70c Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Fri, 29 Sep 2023 11:04:16 -0400 Subject: [PATCH 239/344] [SpecInfer] Reduce single request per batch overhead (#1155) * Initial commit. * Format * Update batch_config setup. 
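The hunks below raise BatchConfig::MAX_NUM_REQUESTS from 4 to 7, drop the separate MAX_PROMPT_LENGTH constant, and make the request manager advance request.llm_cache_size as prompt tokens are committed, so that a request is launched into the RUNNING phase once its whole prompt has been loaded. As a rough orientation only, a simplified Python sketch of that prompt-admission bookkeeping is given here; the names (Request, admit_prompt_tokens, max_tokens_per_batch) are illustrative and are not the C++ identifiers used in the patch.

from dataclasses import dataclass

@dataclass
class Request:
    prompt_length: int       # plays the role of request.initial_len in the patch
    cached_tokens: int = 0   # plays the role of request.llm_cache_size
    running: bool = False    # True once the request enters the decode (RUNNING) phase

def admit_prompt_tokens(req: Request, tokens_in_batch: int, max_tokens_per_batch: int) -> int:
    # Pack as many of the remaining prompt tokens as fit into the current batch.
    remaining = req.prompt_length - req.cached_tokens
    budget = max_tokens_per_batch - tokens_in_batch
    take = max(0, min(remaining, budget))
    req.cached_tokens += take
    # Once the full prompt has been loaded, flip the request into the running phase,
    # mirroring the `request.status = Request::RUNNING` branch added by the patch.
    if req.cached_tokens >= req.prompt_length:
        req.running = True
    return take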
--- include/flexflow/batch_config.h | 3 +-- src/runtime/request_manager.cc | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 8aa69a3cad..d2fbd6219a 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -46,9 +46,8 @@ class BatchConfig { void print() const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); - static int const MAX_NUM_REQUESTS = 4; + static int const MAX_NUM_REQUESTS = 7; static int const MAX_NUM_TOKENS = 64; - static int const MAX_PROMPT_LENGTH = 62; static int const MAX_SEQ_LENGTH = 256; // These are set by update diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5489c9b06d..6f0a1f3851 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1144,6 +1144,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << std::endl; } new_bc.num_tokens_to_commit++; + request.llm_cache_size++; } } } @@ -1255,6 +1256,19 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( "Exceeding the space available in the TreeVerify batch"); break; } + + if (new_bc.num_tokens + request.llm_cache_size >= request.initial_len) { + // launch the request into running phase after loading all prompt + request.status = Request::RUNNING; + new_bc.request_running[i] = true; + + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + + dfs_tree_inputs[guid] = + std::vector>{std::make_pair( + request.tokens.back(), request.tokens.size() - 1)}; + } } else { // launch the request into running phase after loading all prompt if (BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens > 0) { request.status = Request::RUNNING; From 426aa7d6f6a8627e7751a9a768b7e7c0f004c435 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 29 Sep 2023 17:27:22 -0400 Subject: [PATCH 240/344] Support new Falcon model (#1158) * support new falcon * maybe fix * . * . * . 
* fix * fix * cleanup --- .github/workflows/gpu-ci-daemon.yml | 1 + .github/workflows/gpu-ci.yml | 1 + .github/workflows/multinode-test.yml | 1 + python/flexflow/serve/__init__.py | 18 ++++---- python/flexflow/serve/models/falcon.py | 21 +++++++-- python/flexflow/serve/serve.py | 62 +++++++++++++++++++------- 6 files changed, 75 insertions(+), 29 deletions(-) diff --git a/.github/workflows/gpu-ci-daemon.yml b/.github/workflows/gpu-ci-daemon.yml index 603b44c34e..b36e7b49e1 100644 --- a/.github/workflows/gpu-ci-daemon.yml +++ b/.github/workflows/gpu-ci-daemon.yml @@ -34,5 +34,6 @@ jobs: run: | pip3 install pip --upgrade pip3 install pyopenssl --upgrade + pip3 install urllib3 --upgrade pip3 install pygithub python3 .github/workflows/helpers/gpu_ci_helper.py --daemon diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index aee16832f3..4a43a3dee7 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -56,6 +56,7 @@ jobs: run: | pip3 install pip --upgrade pip3 install pyopenssl --upgrade + pip3 install urllib3 --upgrade pip3 install pygithub python3 .github/workflows/helpers/gpu_ci_helper.py diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index 37f81b615f..ca2b47df27 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -25,6 +25,7 @@ jobs: run: | pip3 install pip --upgrade pip3 install pyopenssl --upgrade + pip3 install urllib3 --upgrade pip3 install pygithub python3 .github/workflows/helpers/gpu_ci_helper.py diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 2c6395aca1..9b282ae5f4 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -48,19 +48,19 @@ def init( fusion: Optional[bool] = None, ): """ - Configure FlexFlow Serve and start the runtime. - + Configure FlexFlow Serve and start the runtime. + The function takes, alternatively, configs_dict (a positional argument of type dictionary), or three mandatory named parameters, plus some additional optional named parameters. When passing a configs_dict, no named parameter should be specified, and the dictionary should have keys matching at least the mandatory named parameters. - + The three mandatory parameters, which cannot be changed after starting the runtime, are: - num_gpus: the number of GPUs to reserve for the runtime - memory_per_gpu: the amount of memory (in MB) to pre-allocate on each GPU - zero_copy_memory_per_node: the amount of zero-copy memory (in MB) to pre-allocate for each node - - The optional parameters are: + + The optional parameters are: - num_cpus: the number of CPU processors to reserve for the runtime, defaults to 4 - legion_utility_processors: number of Legion utility threads to create per process, defaults to 1 - data_parallelism_degree: the degree of parallelization in the data parallel dimension, defaults to 1 @@ -72,7 +72,7 @@ def init( - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - profiling: whether to enable the FlexFlow profiling mode, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True - + The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments. 
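The init() docstring in the hunk above names the three mandatory settings (num_gpus, memory_per_gpu, zero_copy_memory_per_node) and the optional ones that fall back to defaults. A minimal, hypothetical call using the positional configs_dict form could look like the sketch below; the numeric values are placeholders, and the import path assumes the package is used as flexflow.serve as elsewhere in this series.

import flexflow.serve as ff

# Start the FlexFlow Serve runtime with a configs_dict; any optional key that is
# omitted (num_cpus, pipeline_parallelism_degree, offload, ...) keeps its default.
ff.init(
    {
        "num_gpus": 4,
        "memory_per_gpu": 14000,             # MB pre-allocated on each GPU
        "zero_copy_memory_per_node": 30000,  # MB of zero-copy memory per node
        "tensor_parallelism_degree": 4,
        "fusion": True,
    }
)

Equivalently, the same settings can be passed as named parameters (ff.init(num_gpus=4, memory_per_gpu=14000, ...)); the docstring notes that mixing a configs_dict with named parameters raises a ValueError.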
@@ -106,7 +106,7 @@ def init( :type profiling: Optional[bool], optional :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True :type fusion: Optional[bool], optional - + :raises ValueError: this function will raise an exception if the user passes both a configs_dict and some named parameters :raises TypeError: this function will raise an exception if the configs_dict is not a dictionary :raises ValueError: this function will raise an exception if the mandatory FlexFlow initialization parameters are missing, or are not positive integers: num_gpus, memory_per_gpu, zero_copy_memory_per_node @@ -152,7 +152,7 @@ def init( "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, "profiling": profiling, - "fusion": fusion + "fusion": fusion, } # Check that mandatory configs are present @@ -188,7 +188,7 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024 ** 2 + configs_dict["offload_reserve_space_size"] = 1024**2 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 3d61349d67..2fd2f4953f 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -27,9 +27,17 @@ def __init__(self, hf_config): self.hidden_size = hf_config.hidden_size self.layer_norm_epsilon = hf_config.layer_norm_epsilon self.multi_query = hf_config.multi_query - self.n_head = hf_config.n_head + self.n_head = ( + hf_config.n_head + if "n_head" in hf_config.__dict__ + else hf_config.num_attention_heads + ) self.n_head_kv = hf_config.n_head_kv if "n_head_kv" in hf_config.__dict__ else 1 - self.n_layer = hf_config.n_layer + self.n_layer = ( + hf_config.n_layer + if "n_layer" in hf_config.__dict__ + else hf_config.num_hidden_layers + ) self.parallel_attn = hf_config.parallel_attn self.vocab_size = hf_config.vocab_size @@ -234,6 +242,11 @@ def build_model(self): def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) + n_head = ( + model.config.n_head + if "n_head" in model.config.__dict__ + else model.config.num_attention_heads + ) for name, params in model.named_parameters(): name = ( name.replace(".", "_") @@ -250,8 +263,8 @@ def convert_hf_model(model, dst_folder): params, [ model.config.hidden_size, - model.config.hidden_size // model.config.n_head, - model.config.hidden_size // model.config.n_head, + model.config.hidden_size // n_head, + model.config.hidden_size // n_head, ], 0, ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 5cbe16b064..7e340a04e2 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -52,12 +52,15 @@ def __init__( self.topp = topp self.topk = topk + class GenerationResult: """A class to store the output of a generation request.""" + def __init__(self, text: str = None, tokens: list = None): self.output_text = text self.output_tokens = tokens + class LLM: """This class creates a LLM (Large-Language Model) object based on a model from HuggingFace""" @@ -87,6 +90,7 @@ def __init__( "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA), "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT), "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon), + 
"FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon), "GPTBigCodeForCausalLM": (ModelType.STARCODER, FlexFlowSTARCODER), "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT), } @@ -124,21 +128,27 @@ def download_hf_config(self): def __get_revision_hashes(self, model_name: str, weights: bool): ff_revision = None - ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") if weights else os.path.join(self.tokenizer_path, "rev_sha.txt") + ff_revision_file = ( + os.path.join(self.weights_path, "rev_sha.txt") + if weights + else os.path.join(self.tokenizer_path, "rev_sha.txt") + ) if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) - + if os.path.exists(model_name) and os.path.isdir(model_name): # Local model files = os.listdir(model_name) - state = files + [os.path.getmtime(os.path.join(model_name, f)) for f in files] - latest_revision = hashlib.md5(str(state).encode('utf-8')).hexdigest() + state = files + [ + os.path.getmtime(os.path.join(model_name, f)) for f in files + ] + latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() else: # Remote HuggingFace model hf_api = HfApi() latest_revision = hf_api.model_info(self.model_name).sha return ff_revision, ff_revision_file, latest_revision - + def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. @@ -168,7 +178,9 @@ def download_hf_weights_if_needed(self): os.makedirs(self.weights_path, exist_ok=True) print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=True) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, weights=True + ) # Download if needed if ff_revision != latest_revision: @@ -179,9 +191,13 @@ def download_hf_weights_if_needed(self): ) else: # Remote model - print(f"'{self.model_name}' local model weights were updated! Converting new weights now...") + print( + f"'{self.model_name}' local model weights were updated! Converting new weights now..." + ) # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True) + hf_model = AutoModelForCausalLM.from_pretrained( + self.model_name, trust_remote_code=True + ) # Print log message to notify user download of model has finished if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): print("Done downloading HF weights. Converting them now...") @@ -217,15 +233,21 @@ def download_hf_tokenizer_if_needed(self): os.makedirs(self.tokenizer_path, exist_ok=True) # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes(self.model_name, weights=False) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, weights=False + ) if ff_revision != latest_revision: if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): # Local model - print(f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ...") + print( + f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." + ) else: # Remote model - print(f"'{self.model_name}' local tokenizer was updated! 
Saving new tokenizer now...") + print( + f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." + ) # Download tokenizer from HuggingFace, or load it from the local folder if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( @@ -242,7 +264,7 @@ def download_hf_tokenizer_if_needed(self): # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - + else: print(f"Loading '{self.model_name}' tokenizer from the cache...") @@ -357,9 +379,15 @@ def compile( # Create request manager self.rm = RequestManager() - bos_token_id = -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id - eos_token_id = -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id - self.rm.register_tokenizer(self.model_type, bos_token_id, eos_token_id, self.tokenizer_path) + bos_token_id = ( + -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id + ) + eos_token_id = ( + -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id + ) + self.rm.register_tokenizer( + self.model_type, bos_token_id, eos_token_id, self.tokenizer_path + ) self.rm.register_output_filepath(self.output_file) self.im.init_operators_inference(self.model.ffmodel) @@ -382,7 +410,9 @@ def generate(self, prompts: Union[str, List[str]], max_length: int = 128): elif type(prompts) == list: if len(prompts) == 0: return [] - return [self.model.ffmodel.generate(prompt, max_length) for prompt in prompts] + return [ + self.model.ffmodel.generate(prompt, max_length) for prompt in prompts + ] else: assert False, "Please pass a non-empty string or list of strings" From 0e68bb7070862a4f965ac643ef50067122e4ecbd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 30 Sep 2023 13:02:44 -0400 Subject: [PATCH 241/344] Fix `pip install` issues affecting some platforms (#1159) --- CMakeLists.txt | 2 +- cmake/pip_install/CMakeLists.txt | 2 +- python/flexflow/core/flexflowlib.py | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef571dc59c..32399ed4d8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -630,7 +630,7 @@ install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT FF_BUILD_FROM_PYPI) install( DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index b7795daf71..7ce38c4abc 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -1,6 +1,6 @@ # Use setup.py script to re-install the Python bindings library with the right library paths if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, 
os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install diff --git a/python/flexflow/core/flexflowlib.py b/python/flexflow/core/flexflowlib.py index 6fc8e52cf7..717696e4ae 100644 --- a/python/flexflow/core/flexflowlib.py +++ b/python/flexflow/core/flexflowlib.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os, platform +import site, os, platform from typing import Any, Union from .flexflow_cffi_header import flexflow_header @@ -47,14 +47,14 @@ def get_shared_library(self) -> str: libname = "libflexflow" + self.get_library_extension() # If we installed with pip, use the full path instead of just the library name, because the library will not be in the LD_LIBRARY_PATH - packages_dir = sysconfig.get_python_lib(plat_specific=False, standard_lib=False) - ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname) - installed_with_pip = os.path.exists(ff_lib_path) - - if installed_with_pip: - return ff_lib_path - else: - return libname + candidate_package_dirs = [pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func)] + candidate_package_dirs += sysconfig.get_python_lib(plat_specific=False, standard_lib=False) + for packages_dir in candidate_package_dirs: + ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname) + installed_with_pip = os.path.exists(ff_lib_path) + if installed_with_pip: + return ff_lib_path + return libname def get_c_header(self) -> str: return self._header From 65cb5706e74d380011db377429c6ed1fb911bde4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 30 Sep 2023 23:54:06 -0400 Subject: [PATCH 242/344] [Python] - Automatically install Rust with `pip install` if not available (#1161) * install rust if not available * fix * fix * fix * cleanup --- setup.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/setup.py b/setup.py index 5cc4d72b20..ad48fb9367 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,21 @@ def compute_version() -> str: with open(Path(__file__).parent / "requirements.txt", "r") as reqs_file: requirements = reqs_file.read().strip().split("\n") +# Install Rust if not yet available +try: + # Attempt to run a Rust command to check if Rust is installed + subprocess.check_output(['cargo', '--version']) +except FileNotFoundError: + print("Rust/Cargo not found, installing it...") + # Rust is not installed, so install it using rustup + try: + subprocess.run("curl https://sh.rustup.rs -sSf | sh -s -- -y", shell=True, check=True) + print("Rust and Cargo installed successfully.") + except subprocess.CalledProcessError as e: + print(f"Error: {e}") + # Add the cargo binary directory to the PATH + os.environ["PATH"] = f"{os.path.join(os.environ.get('HOME', '/root'), '.cargo', 'bin')}:{os.environ.get('PATH', '')}" + setup( name="flexflow", version=compute_version(), From 5919fff9099b50a492edc1a9ce2d94a5868bc779 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 1 Oct 2023 05:47:32 -0400 Subject: [PATCH 243/344] Fix model configs (Falcon in C++, LLAMA in Python) (#1162) * cleanup * fix * fix * 
fix * fix loading of weights * import configs in models init (python) * remove unnecessary warning --- inference/incr_decoding/incr_decoding.cc | 2 +- inference/models/falcon.h | 8 +++- inference/spec_infer/spec_infer.cc | 2 +- python/flexflow/core/__init__.py | 3 +- python/flexflow/serve/models/__init__.py | 10 ++--- python/flexflow/serve/models/base.py | 3 -- python/flexflow/serve/models/falcon.py | 25 ++--------- python/flexflow/serve/models/llama.py | 27 ++---------- python/flexflow/serve/models/mpt.py | 28 ++---------- python/flexflow/serve/models/opt.py | 28 ++---------- python/flexflow/serve/models/starcoder.py | 28 ++---------- python/flexflow/serve/serve.py | 52 +++++++++++++---------- 12 files changed, 60 insertions(+), 156 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 3f913e4573..f3fd32878f 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -168,7 +168,7 @@ void FlexFlow::top_level_task(Task const *task, } else if (str == "OPTForCausalLM") { model_type = ModelType::OPT; break; - } else if (str == "RWForCausalLM") { + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { model_type = ModelType::FALCON; break; } else if (str == "GPTBigCodeForCausalLM") { diff --git a/inference/models/falcon.h b/inference/models/falcon.h index a822f9be34..6c9124fe4c 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -37,13 +37,17 @@ class FALCON { hidden_size = model_config["hidden_size"]; layer_norm_epsilon = model_config["layer_norm_epsilon"]; multi_query = model_config["multi_query"]; - n_head = model_config["n_head"]; + n_head = (model_config.find("n_head") != model_config.end()) + ? model_config["n_head"] + : model_config["num_attention_heads"]; if (model_config.contains("n_head_kv")) { n_head_kv = model_config["n_head_kv"]; } else { n_head_kv = 1; } - n_layer = model_config["n_layer"]; + n_layer = (model_config.find("n_layer") != model_config.end()) + ? 
model_config["n_layer"] + : model_config["num_hidden_layers"]; parallel_attn = model_config["parallel_attn"]; vocab_size = model_config["vocab_size"]; } catch (json::exception const &e) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 2b1fb6e817..a95b26c930 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -163,7 +163,7 @@ void get_model_meta(FilePaths &file_paths, } else if (str == "OPTForCausalLM") { model_metadata.llm_model_type = ModelType::OPT; break; - } else if (str == "RWForCausalLM") { + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { model_metadata.llm_model_type = ModelType::FALCON; break; } else if (str == "MPTForCausalLM") { diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 5e8e4ece81..ace6030a1b 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -124,7 +124,8 @@ def init_flexflow_runtime(configs_dict: Optional[dict] = None, **kwargs): # Pass parameters to the FlexFlow C++ runtime via command line arguments for arg in ff_args: if arg not in ff_arg_to_sysarg: - warnings.warn(f"Ignoring parameter {arg}: not recognized.") + # warnings.warn(f"Ignoring parameter {arg}: not recognized.") + continue else: sys_arg = [ff_arg_to_sysarg[arg]] if type(ff_args[arg]) == bool: diff --git a/python/flexflow/serve/models/__init__.py b/python/flexflow/serve/models/__init__.py index a1ca9152ce..7b0e632f53 100644 --- a/python/flexflow/serve/models/__init__.py +++ b/python/flexflow/serve/models/__init__.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .llama import FlexFlowLLAMA -from .opt import FlexFlowOPT -from .falcon import FlexFlowFalcon -from .starcoder import FlexFlowSTARCODER -from .mpt import FlexFlowMPT +from .llama import FlexFlowLLAMA, LLAMAConfig +from .opt import FlexFlowOPT, OPTConfig +from .falcon import FlexFlowFalcon, FalconConfig +from .starcoder import FlexFlowSTARCODER, STARCODERConfig +from .mpt import FlexFlowMPT, MPTConfig diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index b7f4e54fc1..19affd9b47 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -34,6 +34,3 @@ def build_model(self): def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" - - def get_layers_with_weights(self): - assert False, "Not implemented yet" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 2fd2f4953f..96268f5347 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -40,6 +40,9 @@ def __init__(self, hf_config): ) self.parallel_attn = hf_config.parallel_attn self.vocab_size = hf_config.vocab_size + # Standardized FlexFlow num heads fields below + self.num_attention_heads = self.n_head + self.num_key_value_heads = self.n_head_kv class FlexFlowFalcon(FlexFlowModel): @@ -277,25 +280,3 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head_weight") ) - - def get_layers_with_weights(self): - layer_names = [ - "word_embeddings_weight", - "ln_f_weight", - "lm_head_weight", - ] + [ - expr - for i in range(self.falcon_config.n_layer) - for expr in ( - f"layers_{i}_input_layernorm_weight", - f"layers_{i}_attention_weight", - f"layers_{i}_mlp_dense_h_to_4h_weight", - 
f"layers_{i}_mlp_dense_4h_to_h_weight", - ) - ] - layers_with_weights = { - layer_name: self.ffmodel.get_layer_by_name(layer_name) - for layer_name in layer_names - } - - return layers_with_weights diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index b8ea85b287..ba2f6e0826 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -25,15 +25,16 @@ def __init__(self, hf_config): self.max_beam_depth = 8 self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size + self.hidden_size = hf_config.hidden_size + self.rms_norm_eps = hf_config.rms_norm_eps + self.intermediate_size = hf_config.intermediate_size + # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads self.num_key_value_heads = ( hf_config.num_attention_heads if hf_config.num_key_value_heads is None else hf_config.num_key_value_heads ) - self.hidden_size = hf_config.hidden_size - self.rms_norm_eps = hf_config.rms_norm_eps - self.intermediate_size = hf_config.intermediate_size class FlexFlowLLAMA(FlexFlowModel): @@ -262,23 +263,3 @@ def convert_hf_model(model, dst_folder): .replace("model_", "") ) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") - - def get_layers_with_weights(self): - layer_names = ["tok_embeddings_weight", "norm_weight", "output_weight"] + [ - expr - for i in range(self.llama_config.num_hidden_layers) - for expr in ( - f"layers_{i}_attention_norm_weight", - f"layers_{i}_attention_weight", - f"layers_{i}_ffn_norm_weight", - f"layers_{i}_feed_forward_w1_weight", - f"layers_{i}_feed_forward_w3_weight", - f"layers_{i}_feed_forward_w2_weight", - ) - ] - layers_with_weights = { - layer_name: self.ffmodel.get_layer_by_name(layer_name) - for layer_name in layer_names - } - - return layers_with_weights diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 6e1ca9fdfa..43a2514394 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -27,8 +27,9 @@ def __init__(self, hf_config): self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers self.vocab_size = hf_config.vocab_size - hf_config.num_attention_heads = hf_config.n_heads - hf_config.hidden_size = hf_config.d_model + # Standardized FlexFlow num heads fields below + self.num_attention_heads = hf_config.n_heads + self.num_key_value_heads = hf_config.n_heads class FlexFlowMPT(FlexFlowModel): @@ -274,26 +275,3 @@ def convert_hf_model(model, dst_folder): os.path.join(dst_folder, "transformer_wte_weight"), os.path.join(dst_folder, "lm_head_weight"), ) - - def get_layers_with_weights(self): - layer_names = [ - "transformer_wte_weight", - "transformer_norm_f_weight", - "lm_head_weight", - ] + [ - expr - for i in range(self.mpt_config.n_layers) - for expr in ( - f"layers_{i}_norm_1_weight", - f"layers_{i}_attention_weight", - f"layers_{i}_norm_2_weight", - f"layers_{i}_ffn_up_proj_weight", - f"layers_{i}_ffn_down_proj_weight", - ) - ] - layers_with_weights = { - layer_name: self.ffmodel.get_layer_by_name(layer_name) - for layer_name in layer_names - } - - return layers_with_weights diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 639be2d5c4..d51287a181 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -30,10 +30,12 @@ def __init__(self, hf_config): self.hidden_size = hf_config.hidden_size self.layer_norm_elementwise_affine = 
hf_config.layer_norm_elementwise_affine self.max_position_embeddings = hf_config.max_position_embeddings - self.num_attention_heads = hf_config.num_attention_heads self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.word_embed_proj_dim = hf_config.word_embed_proj_dim + # Standardized FlexFlow num heads fields below + self.num_attention_heads = hf_config.num_attention_heads + self.num_key_value_heads = hf_config.num_attention_heads class FlexFlowOPT(FlexFlowModel): @@ -297,27 +299,3 @@ def convert_hf_model(model, dst_folder): os.path.join(dst_folder, "embed_tokens_weight"), os.path.join(dst_folder, "embed_tokens_weight_lm_head"), ) - - def get_layers_with_weights(self): - layer_names = [ - "embed_tokens_weight", - "embed_positions_weight", - "final_layer_norm_weight", - "embed_tokens_weight_lm_head", - ] + [ - expr - for i in range(self.opt_config.num_hidden_layers) - for expr in ( - f"layers_{i}_attention_layer_norm_weight", - f"layers_{i}_attention_weight", - f"layers_{i}_final_layer_norm_weight", - f"layers_{i}_fc1_weight", - f"layers_{i}_fc2_weight", - ) - ] - layers_with_weights = { - layer_name: self.ffmodel.get_layer_by_name(layer_name) - for layer_name in layer_names - } - - return layers_with_weights diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index feb5be7d75..4eee3182d1 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -27,11 +27,13 @@ def __init__(self, hf_config): self.hidden_size = hf_config.n_embd self.layer_norm_epsilon = hf_config.layer_norm_epsilon self.max_position_embeddings = hf_config.n_positions - self.num_attention_heads = hf_config.n_head self.num_hidden_layers = hf_config.n_layer self.vocab_size = hf_config.vocab_size self.intermediate_size = hf_config.n_inner self.n_head_kv = 1 if hf_config.multi_query else hf_config.n_head + # Standardized FlexFlow num heads fields below + self.num_attention_heads = hf_config.n_head + self.num_key_value_heads = self.n_head_kv class FlexFlowSTARCODER(FlexFlowModel): @@ -266,27 +268,3 @@ def convert_hf_model(model, dst_folder): model.lm_head.weight.detach().cpu().numpy().tofile( os.path.join(dst_folder, "lm_head_weight") ) - - def get_layers_with_weights(self): - layer_names = [ - "transformer_wte_weight", - "transformer_wpe_weight", - "transformer_ln_f_weight", - "lm_head_weight", - ] + [ - expr - for i in range(self.starcoder_config.num_hidden_layers) - for expr in ( - f"layers_{i}_ln_1_weight", - f"layers_{i}_attention_weight", - f"layers_{i}_ln_2_weight", - f"layers_{i}_mlp_c_fc_weight", - f"layers_{i}_mlp_c_proj_weight", - ) - ] - layers_with_weights = { - layer_name: self.ffmodel.get_layer_by_name(layer_name) - for layer_name in layer_names - } - - return layers_with_weights diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 7e340a04e2..eace15f691 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -19,6 +19,13 @@ FlexFlowSTARCODER, FlexFlowMPT, ) +from flexflow.serve.models import ( + LLAMAConfig, + OPTConfig, + FalconConfig, + STARCODERConfig, + MPTConfig, +) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer from huggingface_hub import HfApi @@ -86,17 +93,25 @@ def __init__( :type output_file: str, optional """ self.supported_models = { - "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA), - "LLaMAForCausalLM": (ModelType.LLAMA, 
FlexFlowLLAMA), - "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT), - "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon), - "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon), - "GPTBigCodeForCausalLM": (ModelType.STARCODER, FlexFlowSTARCODER), - "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT), + "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), + "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "GPTBigCodeForCausalLM": ( + ModelType.STARCODER, + FlexFlowSTARCODER, + STARCODERConfig, + ), + "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), } self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path - self.model_type, self.model_class = self.__get_ff_model_type() + ( + self.model_type, + self.model_class, + self.config_class, + ) = self.__get_ff_model_type() self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" @@ -274,23 +289,14 @@ def __load_hf_weights(self): self.download_hf_weights_if_needed() # Create file data loader, load weights into tensors - if ( - self.model_type == ModelType.FALCON - or self.model_type == ModelType.STARCODER - ): - n_q_heads = self.hf_config.num_attention_heads - if "n_head_kv" in self.hf_config.__dict__: - n_kv_heads = self.hf_config.n_head_kv - else: - n_kv_heads = 1 - else: - n_q_heads = n_kv_heads = self.hf_config.num_attention_heads + model_configs = self.config_class(self.hf_config) + self.fileloader = FileDataLoader( self.weights_path, - n_q_heads, - n_kv_heads, - self.hf_config.hidden_size, - self.hf_config.hidden_size // n_q_heads, + model_configs.num_attention_heads, + model_configs.num_key_value_heads, + model_configs.hidden_size, + model_configs.hidden_size // model_configs.num_attention_heads, self.ffconfig.tensor_parallelism_degree, ) From d9a95ef5d722551046f93df40a884feb9c2959fe Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 1 Oct 2023 16:02:03 -0500 Subject: [PATCH 244/344] Make MAX_BATCH_SIZE, MAX_NUM_TOKENS, MAX_SEQ_LENGTH user-provided input arguments (#1018) * add max_tokens_per_batch, max_requests_per_batch, and max_sequence_length in RequestManager * initial implementation * fix c++ examples * fix * . 
* more tries to fix * remove MAX_SEQ_LENGTH --------- Co-authored-by: zwang86 <46699021+zwang86@users.noreply.github.com> Co-authored-by: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> --- include/flexflow/batch_config.h | 11 +- include/flexflow/flexflow_c.h | 9 ++ include/flexflow/request_manager.h | 14 +- inference/incr_decoding/incr_decoding.cc | 28 +++- inference/models/falcon.cc | 4 +- inference/models/falcon.h | 11 +- inference/models/llama.cc | 3 +- inference/models/llama.h | 11 +- inference/models/mpt.cc | 2 +- inference/models/mpt.h | 7 +- inference/models/opt.cc | 2 +- inference/models/opt.h | 11 +- inference/models/starcoder.cc | 4 +- inference/models/starcoder.h | 7 +- inference/python/incr_decoding.py | 2 +- inference/python/spec_infer.py | 4 +- inference/spec_infer/spec_infer.cc | 28 +++- python/flexflow/core/flexflow_cffi.py | 11 ++ python/flexflow/serve/models/base.py | 6 +- python/flexflow/serve/models/falcon.py | 22 +-- python/flexflow/serve/models/llama.py | 22 +-- python/flexflow/serve/models/mpt.py | 22 +-- python/flexflow/serve/models/opt.py | 22 +-- python/flexflow/serve/models/starcoder.py | 22 +-- python/flexflow/serve/serve.py | 33 +++-- src/c/flexflow_c.cc | 22 +++ src/ops/beam_topk.cpp | 22 +-- src/ops/beam_topk.cu | 23 +-- src/ops/inc_multihead_self_attention.cc | 135 ++++++++++-------- src/ops/inc_multihead_self_attention.cpp | 47 +++--- src/ops/inc_multihead_self_attention.cu | 61 ++++---- src/ops/spec_inc_multihead_self_attention.cpp | 37 ++--- src/ops/spec_inc_multihead_self_attention.cu | 27 ++-- src/ops/tree_inc_multihead_self_attention.cpp | 15 +- src/ops/tree_inc_multihead_self_attention.cu | 15 +- src/runtime/batch_config.cc | 25 +++- src/runtime/beam_search_batch_config.cc | 11 +- src/runtime/inference_manager.cc | 10 +- src/runtime/model.cc | 2 +- src/runtime/request_manager.cc | 99 +++++++++---- src/runtime/request_manager.cpp | 8 +- src/runtime/request_manager.cu | 9 +- src/runtime/tree_verify_batch_config.cc | 7 +- 43 files changed, 547 insertions(+), 346 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index d2fbd6219a..6dabc70f4b 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -43,12 +43,17 @@ class BatchConfig { BatchConfig(); int num_active_requests() const; int num_active_tokens() const; + static int max_requests_per_batch(); + static int max_tokens_per_batch(); + static int max_sequence_length(); void print() const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); - static int const MAX_NUM_REQUESTS = 7; - static int const MAX_NUM_TOKENS = 64; - static int const MAX_SEQ_LENGTH = 256; + // Maximum possible values for different parameters + // These maximum values are used for copying BatchConfig + // across workers + static int const MAX_NUM_REQUESTS = 64; + static int const MAX_NUM_TOKENS = 1024; // These are set by update int num_tokens; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index db034a78c9..01a2818a2b 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -965,6 +965,15 @@ flexflow_request_manager_t flexflow_request_manager_get_request_manager(void); // void flexflow_request_manager_destroy(flexflow_request_manager_t handle_); +void flexflow_request_manager_set_max_requests_per_batch( + flexflow_request_manager_t handle_, int max_num_requests); + +void flexflow_request_manager_set_max_tokens_per_batch( + 
flexflow_request_manager_t handle_, int max_num_tokens); + +void flexflow_request_manager_set_max_sequence_length( + flexflow_request_manager_t handle_, int max_seq_length); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 8515d8a04b..3081aaa1c2 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -30,7 +30,7 @@ using tokenizers::Tokenizer; class InferenceManager { public: - InferenceManager(FFConfig const &config, int max_num_tokens_per_batch); + InferenceManager(FFConfig const &config); static InferenceManager *get_inference_manager(); void compile_model_and_allocate_buffer(FFModel *model); void init_operators_inference(FFModel *model); @@ -46,7 +46,6 @@ class InferenceManager { public: FFConfig ff_config; std::unordered_map> tensor_buffer; - int max_num_tokens_per_batch; int num_devices; }; @@ -96,6 +95,12 @@ class RequestManager { size_t get_num_processed_requests(); size_t get_num_ssms(); + void set_max_requests_per_batch(int max_num_requests); + int get_max_requests_per_batch(); + void set_max_tokens_per_batch(int max_num_tokens); + int get_max_tokens_per_batch(); + void set_max_sequence_length(int max_seq_length); + int get_max_sequence_length(); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, @@ -201,6 +206,11 @@ class RequestManager { Legion::Runtime *runtime); private: + // configuration parameters + int max_requests_per_batch; + int max_tokens_per_batch; + int max_sequence_length; + // private fields std::unique_ptr tokenizer_; bool verbose; ModelType model_type; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index f3fd32878f..463bc10151 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -43,7 +43,10 @@ void parse_input_args(char **argv, bool &verbose, bool &do_sample, float &temperature, - float &topp) { + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -89,6 +92,18 @@ void parse_input_args(char **argv, topp = std::stof(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -115,6 +130,9 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -127,7 +145,10 @@ void FlexFlow::top_level_task(Task const *task, verbose, do_sample, temperature, - topp); + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == @@ -191,6 +212,9 @@ void FlexFlow::top_level_task(Task const *task, 
GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 3be92a953c..553a2f0d3d 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -40,8 +40,8 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { - assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); + int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 6c9124fe4c..01226a30dc 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -59,8 +59,8 @@ class FALCON { << std::endl; assert(false); } - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -77,8 +77,8 @@ class FALCON { std::cout << "\tparallel_attn: " << parallel_attn << std::endl; std::cout << "\tvocab_size: " << vocab_size << std::endl; - std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; + // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } @@ -86,7 +86,8 @@ class FALCON { bool bias, multi_query, parallel_attn; int hidden_size, n_head, n_head_kv, n_layer, vocab_size; float layer_norm_epsilon; - int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + // int max_seq_len, max_num_tokens; + int max_beam_width, max_beam_depth; }; static void create_falcon_model(FFModel &ff, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 56f919ace1..b8fe70526d 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,8 +41,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - assert(llama_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/llama.h b/inference/models/llama.h index f01a7dbd52..8ecfcd7155 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -49,8 +49,8 @@ class LLAMA { << std::endl; assert(false); } - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -65,13 +65,14 @@ class LLAMA { std::cout << "\trms_norm_eps: " << 
rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; - std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; + // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } - int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + // int max_seq_len, max_num_tokens; + int max_beam_width, max_beam_depth; int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, intermediate_size; float rms_norm_eps; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 3bd1b912ed..b074d332ed 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,7 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 437e0cb247..1969cd9c89 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -46,8 +46,8 @@ class MPT { << std::endl; assert(false); } - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -60,7 +60,8 @@ class MPT { std::cout << "\tvocab_size: " << vocab_size << std::endl; } - int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + // int max_seq_len, max_num_tokens; + int max_beam_width, max_beam_depth; int hidden_size, n_heads, n_layers, vocab_size; }; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index cdab25bfca..9b29ae5410 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/opt.h b/inference/models/opt.h index ab972ae10c..1ffe096bca 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -54,8 +54,8 @@ class OPT { << std::endl; assert(false); } - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -79,13 +79,14 @@ class OPT { std::cout << "\tword_embed_proj_dim: " << word_embed_proj_dim << std::endl; - std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; + // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: 
" << max_beam_depth << std::endl; } - int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + // int max_seq_len, max_num_tokens; + int max_beam_width, max_beam_depth; bool do_layer_norm_before, enable_bias, layer_norm_elementwise_affine; float dropout; int ffn_dim, hidden_size, max_position_embeddings, num_attention_heads, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 89b53b1cf5..ba7b2cb43a 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -47,8 +47,8 @@ void STARCODER::create_starcoder_model( Tensor position_input; ff.set_position_offset(0); { - assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::MAX_NUM_TOKENS, 1}; + // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); + int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index 9789a1c36e..bc113e4d52 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -51,15 +51,16 @@ class STARCODER { << std::endl; assert(false); } - max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - max_num_tokens = BatchConfig::MAX_NUM_TOKENS; + // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; + // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } void print() const {} - int max_seq_len, max_num_tokens, max_beam_width, max_beam_depth; + // int max_seq_len, max_num_tokens; + int max_beam_width, max_beam_depth; int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, intermediate_size, max_position_embeddings; float layer_norm_epsilon, dropout_p; diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 1ed7791143..d8a494b4d5 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -97,7 +97,7 @@ def main(): ) llm.compile( generation_config, - max_batch_size=1, + max_requests_per_batch=1, max_seq_length=256, max_tokens_per_batch=64, ) diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 7dc6635819..c9e87bd29f 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -134,7 +134,7 @@ def main(): for ssm in ssms: ssm.compile( generation_config, - max_batch_size=1, + max_requests_per_batch=1, max_seq_length=256, max_tokens_per_batch=64, ) @@ -142,7 +142,7 @@ def main(): # Compile the LLM for inference and load the weights into memory llm.compile( generation_config, - max_batch_size=1, + max_requests_per_batch=1, max_seq_length=256, max_tokens_per_batch=64, ssms=ssms, diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index a95b26c930..98b5ec4633 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -58,7 +58,10 @@ void parse_input_args(char **argv, FilePaths &paths, ModelNames &model_names, bool &use_full_precision, - bool &verbose) { + bool &verbose, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { for (int i = 1; i < argc; i++) { // llm model name if (!strcmp(argv[i], "-llm-model")) { @@ -101,6 +104,18 @@ void parse_input_args(char **argv, verbose = true; continue; } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = 
std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -265,6 +280,9 @@ void FlexFlow::top_level_task(Task const *task, ModelMeta model_metadata; bool use_full_precision = false; bool verbose = false; + int max_requests_per_batch = 16; + int max_tokens_per_batch = 256; + int max_sequence_length = 1024; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -274,7 +292,10 @@ void FlexFlow::top_level_task(Task const *task, file_paths, model_metadata.model_names, use_full_precision, - verbose); + verbose, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); get_model_meta(file_paths, model_metadata, use_full_precision); @@ -286,6 +307,9 @@ void FlexFlow::top_level_task(Task const *task, GenerationConfig generationConfig; InferenceManager *im = InferenceManager::get_inference_manager(); RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer(model_metadata.llm_model_type, model_metadata.bos_token_id, model_metadata.eos_token_id, diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b029f1e2ff..de3f7e6929 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -4187,6 +4187,17 @@ def register_ssm_model(self, model): self.handle, model.handle ) + def set_max_requests_per_batch(self, max_requests): + return ffc().flexflow_request_manager_set_max_requests_per_batch( + self.handle, max_requests) + + def set_max_tokens_per_batch(self, max_tokens): + return ffc().flexflow_request_manager_set_max_tokens_per_batch( + self.handle, max_tokens) + + def set_max_sequence_length(self, max_length): + return ffc().flexflow_request_manager_set_max_sequence_length( + self.handle, max_length) # ----------------------------------------------------------------------- # InferenceManager diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index 19affd9b47..025008ec78 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -21,9 +21,9 @@ def __init__( ffconfig, hf_config, data_type, - max_batch_size=1, - max_seq_length=256, - max_tokens_per_batch=64, + #max_batch_size=1, + #max_seq_length=256, + #max_tokens_per_batch=64, weights_filepath="", tokenizer_filepath="", ): diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 96268f5347..9a1bca48c4 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -19,8 +19,8 @@ class FalconConfig: def __init__(self, hf_config): - self.max_seq_len = 256 - self.max_num_tokens = 64 + #self.max_seq_len = 256 + #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.bias = hf_config.bias @@ -53,20 +53,20 @@ def __init__( ffconfig, hf_config, data_type, - max_batch_size=1, - max_seq_length=256, - max_tokens_per_batch=64, + #max_batch_size=1, + #max_seq_length=256, + max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): self.mode = mode 
self.generation_config = generation_config self.ffconfig = ffconfig - self.max_batch_size = max_batch_size + #self.max_batch_size = max_batch_size self.data_type = data_type self.falcon_config = FalconConfig(hf_config) - self.falcon_config.max_seq_length = max_seq_length - self.falcon_config.max_num_tokens = max_tokens_per_batch + #self.falcon_config.max_seq_length = max_seq_length + #self.falcon_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -92,12 +92,12 @@ def __init__( f"Number of k/v attention heads ({self.falcon_config.n_head_kv}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model() + self.build_model(max_tokens_per_batch) - def build_model(self): + def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) - tokens_dims = [self.falcon_config.max_num_tokens, 1] + tokens_dims = [max_tokens_per_batch, 1] input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index ba2f6e0826..7ba0e78a37 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,8 +19,8 @@ class LLAMAConfig: def __init__(self, hf_config): - self.max_seq_len = 256 - self.max_num_tokens = 64 + #self.max_seq_len = 256 + #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.num_hidden_layers = hf_config.num_hidden_layers @@ -45,20 +45,20 @@ def __init__( ffconfig, hf_config, data_type, - max_batch_size=1, - max_seq_length=256, - max_tokens_per_batch=64, + #max_batch_size=1, + #max_seq_length=256, + max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - self.max_batch_size = max_batch_size + #self.max_batch_size = max_batch_size self.data_type = data_type self.llama_config = LLAMAConfig(hf_config) - self.llama_config.max_seq_length = max_seq_length - self.llama_config.max_num_tokens = max_tokens_per_batch + #self.llama_config.max_seq_length = max_seq_length + #self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -81,12 +81,12 @@ def __init__( f"Number of attention heads ({self.llama_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model() + self.build_model(max_tokens_per_batch) - def build_model(self): + def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) - tokens_dims = [self.llama_config.max_num_tokens, 1] + tokens_dims = [max_tokens_per_batch, 1] input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 43a2514394..79a5bb940f 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,8 +19,8 @@ class MPTConfig: def __init__(self, hf_config): - self.max_seq_len = 256 - self.max_num_tokens = 64 + #self.max_seq_len = 256 + #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.hidden_size = hf_config.d_model @@ -40,20 
+40,20 @@ def __init__( ffconfig, hf_config, data_type, - max_batch_size=1, - max_seq_length=256, - max_tokens_per_batch=64, + #max_batch_size=1, + #max_seq_length=256, + max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - self.max_batch_size = max_batch_size + #self.max_batch_size = max_batch_size self.data_type = data_type self.mpt_config = MPTConfig(hf_config) - self.mpt_config.max_seq_length = max_seq_length - self.mpt_config.max_num_tokens = max_tokens_per_batch + #self.mpt_config.max_seq_length = max_seq_length + #self.mpt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -72,12 +72,12 @@ def __init__( raise ValueError( f"Number of attention heads ({self.mpt_config.n_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model() + self.build_model(max_tokens_per_batch) - def build_model(self): + def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) - tokens_dims = [self.mpt_config.max_num_tokens, 1] + tokens_dims = [max_tokens_per_batch, 1] input = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index d51287a181..dfd1cde7d4 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -19,8 +19,8 @@ class OPTConfig: def __init__(self, hf_config): - self.max_seq_len = 256 - self.max_num_tokens = 64 + #self.max_seq_len = 256 + #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.do_layer_norm_before = hf_config.do_layer_norm_before @@ -46,20 +46,20 @@ def __init__( ffconfig, hf_config, data_type, - max_batch_size=1, - max_seq_length=256, - max_tokens_per_batch=64, + #max_batch_size=1, + #max_seq_length=256, + max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - self.max_batch_size = max_batch_size + #self.max_batch_size = max_batch_size self.data_type = data_type self.opt_config = OPTConfig(hf_config) - self.opt_config.max_seq_length = max_seq_length - self.opt_config.max_num_tokens = max_tokens_per_batch + #self.opt_config.max_seq_length = max_seq_length + #self.opt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -82,12 +82,12 @@ def __init__( f"Number of attention heads ({self.opt_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model() + self.build_model(max_tokens_per_batch) - def build_model(self): + def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) - tokens_dims = [self.opt_config.max_num_tokens, 1] + tokens_dims = [max_tokens_per_batch, 1] input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) position_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 4eee3182d1..33b0b26ff8 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,8 +19,8 @@ class 
STARCODERConfig: def __init__(self, hf_config): - self.max_seq_len = 256 - self.max_num_tokens = 64 + #self.max_seq_len = 256 + #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.dropout_p = hf_config.attn_pdrop @@ -44,20 +44,20 @@ def __init__( ffconfig, hf_config, data_type, - max_batch_size=1, - max_seq_length=256, - max_tokens_per_batch=64, + #max_batch_size=1, + #max_seq_length=256, + max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - self.max_batch_size = max_batch_size + #self.max_batch_size = max_batch_size self.data_type = data_type self.starcoder_config = STARCODERConfig(hf_config) - self.starcoder_config.max_seq_length = max_seq_length - self.starcoder_config.max_num_tokens = max_tokens_per_batch + #self.starcoder_config.max_seq_length = max_seq_length + #self.starcoder_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -92,12 +92,12 @@ def __init__( f"Number of k/v attention heads ({self.starcoder_config.n_head_kv}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model() + self.build_model(max_tokens_per_batch) - def build_model(self): + def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) - tokens_dims = [self.starcoder_config.max_num_tokens, 1] + tokens_dims = [max_tokens_per_batch, 1] input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) position_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index eace15f691..549677d77a 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -305,7 +305,7 @@ def __load_hf_weights(self): def compile( self, generation_config: GenerationConfig = GenerationConfig(), - max_batch_size: int = 1, + max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, model_specific_data_parallelism_degree: int = None, @@ -319,8 +319,8 @@ def compile( :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional - :param max_batch_size: The maximum batch size to allow, defaults to 1 - :type max_batch_size: int, optional + :param max_requests_per_batch: The maximum batch size to allow, defaults to 1 + :type max_requests_per_batch: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 @@ -334,9 +334,9 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - self.max_batch_size = max_batch_size - self.max_seq_length = max_seq_length - self.max_tokens_per_batch = max_tokens_per_batch + #self.max_requests_per_batch = max_requests_per_batch + #self.max_seq_length = max_seq_length + #self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -363,6 +363,12 @@ def compile( model_specific_pipeline_parallelism_degree ) + # Create request manager and set serving 
configuration + self.rm = RequestManager() + self.rm.set_max_requests_per_batch(max_requests_per_batch) + self.rm.set_max_tokens_per_batch(max_tokens_per_batch) + self.rm.set_max_sequence_length(max_seq_length) + # Instantiate the relevant model self.model = self.model_class( mode, @@ -370,9 +376,7 @@ def compile( self.ffconfig, self.hf_config, self.data_type, - max_batch_size, - max_seq_length, - max_tokens_per_batch, + max_tokens_per_batch ) # Create inference manager @@ -383,8 +387,7 @@ def compile( self.__load_hf_weights() self.download_hf_tokenizer_if_needed() - # Create request manager - self.rm = RequestManager() + # Create tokenizer (this must be done after we have downloaded the tokenizer bos_token_id = ( -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id ) @@ -458,7 +461,7 @@ def __init__( def compile( self, generation_config: GenerationConfig = GenerationConfig(), - max_batch_size: int = 1, + max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, model_specific_data_parallelism_degree: int = 1, @@ -472,8 +475,8 @@ def compile( :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional - :param max_batch_size: The maximum batch size to allow, defaults to 1 - :type max_batch_size: int, optional + :param max_requests_per_batch: The maximum batch size to allow, defaults to 1 + :type max_requests_per_batch: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 @@ -489,7 +492,7 @@ def compile( """ super().compile( generation_config, - max_batch_size, + max_requests_per_batch, max_seq_length, max_tokens_per_batch, model_specific_data_parallelism_degree, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 3684725c0a..5bb5249f5d 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2551,6 +2551,28 @@ flexflow_request_manager_t flexflow_request_manager_get_request_manager(void) { return FFCObjectWrapper::wrap(rm); } +void flexflow_request_manager_set_max_requests_per_batch( + flexflow_request_manager_t handle_, int max_num_requests) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_requests_per_batch(max_num_requests); + DEBUG_PRINT("[RequestManager] set max_requests_per_batch %d", + max_num_requests); +} + +void flexflow_request_manager_set_max_tokens_per_batch( + flexflow_request_manager_t handle_, int max_num_tokens) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_tokens_per_batch(max_num_tokens); + DEBUG_PRINT("[RequestManager] set max_tokens_per_batch %d", max_num_tokens); +} + +void flexflow_request_manager_set_max_sequence_length( + flexflow_request_manager_t handle_, int max_seq_length) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_sequence_length(max_seq_length); + DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 5ee260714d..18534455a0 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -510,7 +510,7 @@ void 
BeamTopK::forward_kernel(BeamTopKMeta const *m, int parent_ids[max_total_requests]; DT acc_probs[max_total_requests]; - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } @@ -683,16 +683,16 @@ BeamTopKMeta::BeamTopKMeta(FFHandler handler, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler) { DataType data_type = op->inputs[0]->data_type; - size_t parent_id_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t acc_probs_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t block_start_index_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t request_id_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t tokens_per_request_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_requests_per_batch = BatchConfig::max_requests_per_batch(); + size_t parent_id_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; + size_t acc_probs_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; + size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch; + size_t request_id_size = max_tokens_per_batch * max_requests_per_batch; + size_t tokens_per_request_size = + max_tokens_per_batch * max_requests_per_batch; size_t totalSize = sizeof(int) * parent_id_size + data_type_size(data_type) * acc_probs_size + sizeof(int) * block_start_index_size + diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 42fa7a5ab5..72ab7862a6 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -15,6 +15,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/beam_topk.h" +#include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -542,7 +543,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int parent_ids[max_total_requests]; DT acc_probs[max_total_requests]; - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } @@ -715,16 +716,16 @@ BeamTopKMeta::BeamTopKMeta(FFHandler handler, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler) { DataType data_type = op->inputs[0]->data_type; - size_t parent_id_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t acc_probs_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t block_start_index_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t request_id_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t tokens_per_request_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_NUM_REQUESTS; + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_requests_per_batch = BatchConfig::max_requests_per_batch(); + size_t parent_id_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; + size_t acc_probs_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; + size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch; + size_t request_id_size = max_tokens_per_batch * max_requests_per_batch; + size_t 
tokens_per_request_size = + max_tokens_per_batch * max_requests_per_batch; size_t totalSize = sizeof(int) * parent_id_size + data_type_size(data_type) * acc_probs_size + sizeof(int) * block_start_index_size + diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 7cb9867312..68b5fa39a1 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -937,11 +937,13 @@ void IncMultiHeadSelfAttention::inference_task( // load weight manually because Torch can't easily read a tensor serialized in // column-major order. - // printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " + // printf("m->kProjSize: %i, BatchConfig::max_tokens_per_batch(): %i, " // "bc->num_active_tokens(): %i, num_q_heads: %lli, - // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", - // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), - // num_q_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); + // BatchConfig::max_requests_per_batch(): %i, " + // "bc->num_active_requests(): %i\n", m->kProjSize, + // BatchConfig::max_tokens_per_batch(), bc->num_active_tokens(), + // num_q_heads, BatchConfig::max_requests_per_batch(), + // bc->num_active_requests()); // for (int t=0; t < bc->num_active_tokens(); t++) { // printf("token %i has request_index: %li and token_position: %li\n", // t, bc->token2ids.token_indexes[t].request_index, @@ -1029,7 +1031,7 @@ void IncMultiHeadSelfAttention::inference_task( // ----------------------- Loading CUDA results for this step --------------- float *QKVProjArray_cpu = download_tensor( m->devQKVProjArray, - BatchConfig::MAX_NUM_TOKENS * proj_sum * m->num_q_heads); + BatchConfig::max_tokens_per_batch() * proj_sum * m->num_q_heads); assert(QKVProjArray_cpu != nullptr); std::vector QKVProjArray_converted_shape = { @@ -1089,21 +1091,25 @@ void IncMultiHeadSelfAttention::inference_task( for (size_t h = 0; h < num_q_heads; h++) { for (size_t t = 0; t < bc->num_active_tokens(); t++) { for (size_t d = 0; d < m->kProjSize; d++) { - size_t kcache_idx = - d * MAX_SEQ_LEN * m->num_q_heads * BatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].abs_depth_in_request * m->num_q_heads * - BatchConfig::MAX_NUM_REQUESTS + - h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; + size_t kcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * + BatchConfig::max_requests_per_batch() + + bc->tokensInfo[t].abs_depth_in_request * + m->num_q_heads * + BatchConfig::max_requests_per_batch() + + h * BatchConfig::max_requests_per_batch() + + bc->tokensInfo[t].request_index; m->kcache[kcache_idx] = qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) .item(); } for (size_t d = 0; d < m->vProjSize; d++) { - size_t vcache_idx = - d * MAX_SEQ_LEN * m->num_q_heads * BatchConfig::MAX_NUM_REQUESTS + - bc->tokensInfo[t].abs_depth_in_request * m->num_q_heads * - BatchConfig::MAX_NUM_REQUESTS + - h * BatchConfig::MAX_NUM_REQUESTS + bc->tokensInfo[t].request_index; + size_t vcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * + BatchConfig::max_requests_per_batch() + + bc->tokensInfo[t].abs_depth_in_request * + m->num_q_heads * + BatchConfig::max_requests_per_batch() + + h * BatchConfig::max_requests_per_batch() + + bc->tokensInfo[t].request_index; m->vcache[vcache_idx] = qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) .item(); @@ -1111,14 +1117,18 @@ void IncMultiHeadSelfAttention::inference_task( } } // Create torch tensors from the arrays - torch::Tensor K_t = 
torch::from_blob( - m->kcache, - {m->kProjSize, MAX_SEQ_LEN, num_q_heads, BatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - torch::Tensor V_t = torch::from_blob( - m->vcache, - {m->vProjSize, MAX_SEQ_LEN, num_q_heads, BatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); + torch::Tensor K_t = torch::from_blob(m->kcache, + {m->kProjSize, + MAX_SEQ_LEN, + num_q_heads, + BatchConfig::max_requests_per_batch()}, + torch::kFloat32); + torch::Tensor V_t = torch::from_blob(m->vcache, + {m->vProjSize, + MAX_SEQ_LEN, + num_q_heads, + BatchConfig::max_requests_per_batch()}, + torch::kFloat32); // Compute useful indices std::vector req_idxs; @@ -1143,30 +1153,30 @@ void IncMultiHeadSelfAttention::inference_task( bc->num_active_tokens()); // ----------------------- Loading CUDA results for this step --------------- - float *keyCache_cpu = - download_tensor(m->keyCache, - m->num_q_heads * m->kProjSize * - BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); - float *valueCache_cpu = - download_tensor(m->valueCache, - m->num_q_heads * m->vProjSize * - BatchConfig::MAX_NUM_REQUESTS * MAX_SEQ_LEN); + float *keyCache_cpu = download_tensor( + m->keyCache, + m->num_q_heads * m->kProjSize * BatchConfig::max_requests_per_batch() * + MAX_SEQ_LEN); + float *valueCache_cpu = download_tensor( + m->valueCache, + m->num_q_heads * m->vProjSize * BatchConfig::max_requests_per_batch() * + MAX_SEQ_LEN); assert(keyCache_cpu != nullptr); assert(valueCache_cpu != nullptr); float *kcache_cuda = (float *)calloc(m->kProjSize * MAX_SEQ_LEN * m->num_q_heads * - BatchConfig::MAX_NUM_REQUESTS, + BatchConfig::max_requests_per_batch(), sizeof(float)); float *vcache_cuda = (float *)calloc(m->vProjSize * MAX_SEQ_LEN * m->num_q_heads * - BatchConfig::MAX_NUM_REQUESTS, + BatchConfig::max_requests_per_batch(), sizeof(float)); int index = 0; for (int i = 0; i < m->kProjSize; i++) { for (int j = 0; j < MAX_SEQ_LEN; j++) { for (int k = 0; k < m->num_q_heads; k++) { - for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { + for (int l = 0; l < BatchConfig::max_requests_per_batch(); l++) { int col_major_index = l * m->kProjSize * MAX_SEQ_LEN * m->num_q_heads + k * m->kProjSize * MAX_SEQ_LEN + j * m->kProjSize + i; @@ -1179,7 +1189,7 @@ void IncMultiHeadSelfAttention::inference_task( for (int i = 0; i < m->vProjSize; i++) { for (int j = 0; j < MAX_SEQ_LEN; j++) { for (int k = 0; k < m->num_q_heads; k++) { - for (int l = 0; l < BatchConfig::MAX_NUM_REQUESTS; l++) { + for (int l = 0; l < BatchConfig::max_requests_per_batch(); l++) { int col_major_index = l * m->vProjSize * MAX_SEQ_LEN * m->num_q_heads + k * m->vProjSize * MAX_SEQ_LEN + j * m->vProjSize + i; @@ -1188,14 +1198,20 @@ void IncMultiHeadSelfAttention::inference_task( } } } - torch::Tensor K_t_cuda = torch::from_blob( - kcache_cuda, - {m->kProjSize, MAX_SEQ_LEN, num_q_heads, BatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); - torch::Tensor V_t_cuda = torch::from_blob( - vcache_cuda, - {m->vProjSize, MAX_SEQ_LEN, num_q_heads, BatchConfig::MAX_NUM_REQUESTS}, - torch::kFloat32); + torch::Tensor K_t_cuda = + torch::from_blob(kcache_cuda, + {m->kProjSize, + MAX_SEQ_LEN, + num_q_heads, + BatchConfig::max_requests_per_batch()}, + torch::kFloat32); + torch::Tensor V_t_cuda = + torch::from_blob(vcache_cuda, + {m->vProjSize, + MAX_SEQ_LEN, + num_q_heads, + BatchConfig::max_requests_per_batch()}, + torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- @@ -1205,11 +1221,11 @@ void IncMultiHeadSelfAttention::inference_task( // for (int l=0; l < 
m->kProjSize; l++) { // for (int k=0; k < MAX_SEQ_LEN; k++) { // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + - // k * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + - // j * BatchConfig::MAX_NUM_REQUESTS + - // i; - // if ( abs(m->kcache[kcache_idx] - keyCache_cpu[ + // l * MAX_SEQ_LEN * num_q_heads * + // BatchConfig::max_requests_per_batch() + k * num_q_heads * + // BatchConfig::max_requests_per_batch() + j * + // BatchConfig::max_requests_per_batch() + i; if ( + // abs(m->kcache[kcache_idx] - keyCache_cpu[ // i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + // j * m->kProjSize * MAX_SEQ_LEN + // k * m->kProjSize + @@ -1270,10 +1286,10 @@ void IncMultiHeadSelfAttention::inference_task( // for (int l=0; l < m->kProjSize; l++) { // for (int k=0; k < MAX_SEQ_LEN; k++) { // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + - // k * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + - // j * BatchConfig::MAX_NUM_REQUESTS + - // i; + // l * MAX_SEQ_LEN * num_q_heads * + // BatchConfig::max_requests_per_batch() + k * num_q_heads * + // BatchConfig::max_requests_per_batch() + j * + // BatchConfig::max_requests_per_batch() + i; // printf("%f ", m->kcache[kcache_idx]); // } // printf("\n"); @@ -1289,9 +1305,10 @@ void IncMultiHeadSelfAttention::inference_task( // for (int l=0; lvProjSize; l++) { // for (int k=0; k< MAX_SEQ_LEN; k++) { // size_t vcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * BatchConfig::MAX_NUM_REQUESTS - // + k * num_q_heads * BatchConfig::MAX_NUM_REQUESTS + j * - // BatchConfig::MAX_NUM_REQUESTS + i; + // l * MAX_SEQ_LEN * num_q_heads * + // BatchConfig::max_requests_per_batch() + // + k * num_q_heads * BatchConfig::max_requests_per_batch() + j + // * BatchConfig::max_requests_per_batch() + i; // printf("%f ", m->vcache[vcache_idx]); // } // printf("\n"); @@ -1380,17 +1397,19 @@ void IncMultiHeadSelfAttention::inference_task( // ----------------------- Loading CUDA results for this step --------------- float *qk_prods_cpu = download_tensor( m->qk_prods, - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_q_heads); + BatchConfig::max_tokens_per_batch() * + BatchConfig::max_tokens_per_batch() * num_q_heads); assert(qk_prods_cpu != nullptr); float *qk_prods_softmax_cpu = download_tensor( m->qk_prods_softmax, - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_NUM_TOKENS * num_q_heads); + BatchConfig::max_tokens_per_batch() * + BatchConfig::max_tokens_per_batch() * num_q_heads); assert(qk_prods_softmax_cpu != nullptr); float *attn_heads_cpu = download_tensor( m->attn_heads, - BatchConfig::MAX_NUM_TOKENS * m->num_q_heads * m->vProjSize); + BatchConfig::max_tokens_per_batch() * m->num_q_heads * m->vProjSize); assert(attn_heads_cpu != nullptr); // ----------------------- Main loop (request by request) ------------------- diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 8fb635bace..f09d905dd3 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -426,7 +426,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, num_tokens, m->num_q_heads, m->num_kv_heads, - BatchConfig::MAX_SEQ_LENGTH); + BatchConfig::max_sequence_length()); } } @@ -576,13 +576,13 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int q_block_size = m->qProjSize * num_tokens; - int kt_block_size = m->kProjSize * 
BatchConfig::MAX_SEQ_LENGTH; + int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } @@ -1070,8 +1070,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = - BatchConfig::MAX_NUM_TOKENS * + max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_kv_heads + vProjSize * num_kv_heads); size_t key_cache_size = 0, value_cache_size = 0; @@ -1079,36 +1080,36 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( case INC_DECODING_MODE: case TREE_VERIFY_MODE: { key_cache_size = num_kv_heads * kProjSize * - BatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; + BatchConfig::max_requests_per_batch() * + BatchConfig::max_sequence_length(); value_cache_size = num_kv_heads * vProjSize * - BatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; + BatchConfig::max_requests_per_batch() * + BatchConfig::max_sequence_length(); break; } case BEAM_SEARCH_MODE: { - key_cache_size = - num_kv_heads * kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - value_cache_size = - num_kv_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + key_cache_size = num_kv_heads * kProjSize * + BeamSearchBatchConfig::max_requests_per_batch() * + BatchConfig::max_sequence_length() * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + value_cache_size = num_kv_heads * vProjSize * + BeamSearchBatchConfig::max_requests_per_batch() * + BatchConfig::max_sequence_length() * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; break; } default: assert(false && "Unkown inference mode"); } - size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; + size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_q_heads; - size_t attn_heads_size = - BatchConfig::MAX_NUM_TOKENS * num_q_heads * vProjSize; + max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; + size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); size_t W_out_contiguous_size = W_out_block_size * num_q_heads; - size_t complex_size = - (BatchConfig::MAX_NUM_TOKENS * - (qProjSize * num_q_heads + kProjSize * num_kv_heads)) / - 2; + size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + + kProjSize * num_kv_heads)) / + 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ec776f4cda..eaaa398654 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -393,7 +393,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, num_tokens, m->num_q_heads, m->num_kv_heads, - BatchConfig::MAX_SEQ_LENGTH); + BatchConfig::max_sequence_length()); } } @@ -579,13 +579,13 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int q_block_size = m->qProjSize * num_tokens; - int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } @@ -1098,18 +1098,21 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( } #ifdef INFERENCE_TESTS - kcache = (float *)calloc(kProjSize * BatchConfig::MAX_SEQ_LENGTH * - num_q_heads * BatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); - vcache = (float *)calloc(vProjSize * BatchConfig::MAX_SEQ_LENGTH * - num_q_heads * BatchConfig::MAX_NUM_REQUESTS, - sizeof(float)); + kcache = + (float *)calloc(kProjSize * BatchConfig::max_sequence_length() * + num_q_heads * BatchConfig::max_requests_per_batch(), + sizeof(float)); + vcache = + (float *)calloc(vProjSize * BatchConfig::max_sequence_length() * + num_q_heads * BatchConfig::max_requests_per_batch(), + sizeof(float)); #endif // allocate memory for the seqArray and reserve space { + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = - BatchConfig::MAX_NUM_TOKENS * + max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_kv_heads + vProjSize * num_kv_heads); size_t key_cache_size = 0, value_cache_size = 0; @@ -1117,36 +1120,36 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( case INC_DECODING_MODE: case TREE_VERIFY_MODE: { key_cache_size = num_kv_heads * kProjSize * - BatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; + BatchConfig::max_requests_per_batch() * + BatchConfig::max_sequence_length(); value_cache_size = num_kv_heads * vProjSize * - BatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH; + BatchConfig::max_requests_per_batch() * + BatchConfig::max_sequence_length(); break; } case BEAM_SEARCH_MODE: { - key_cache_size = - num_kv_heads * kProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - value_cache_size = - num_kv_heads * vProjSize * BeamSearchBatchConfig::MAX_NUM_REQUESTS * - BatchConfig::MAX_SEQ_LENGTH * 
BeamSearchBatchConfig::MAX_BEAM_WIDTH; + key_cache_size = num_kv_heads * kProjSize * + BeamSearchBatchConfig::max_requests_per_batch() * + BatchConfig::max_sequence_length() * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + value_cache_size = num_kv_heads * vProjSize * + BeamSearchBatchConfig::max_requests_per_batch() * + BatchConfig::max_sequence_length() * + BeamSearchBatchConfig::MAX_BEAM_WIDTH; break; } default: assert(false && "Unkown inference mode"); } - size_t tokeninfo_size = BatchConfig::MAX_NUM_TOKENS; + size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = - BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_SEQ_LENGTH * num_q_heads; - size_t attn_heads_size = - BatchConfig::MAX_NUM_TOKENS * num_q_heads * vProjSize; + max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; + size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; size_t W_out_block_size = oProjSize * (vProjSize > 0 ? vProjSize : vSize); size_t W_out_contiguous_size = W_out_block_size * num_q_heads; - size_t complex_size = - (BatchConfig::MAX_NUM_TOKENS * - (qProjSize * num_q_heads + kProjSize * num_kv_heads)) / - 2; + size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + + kProjSize * num_kv_heads)) / + 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index f983238198..30b6f5cb84 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -175,7 +175,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, num_tokens, m->num_q_heads, m->num_kv_heads, - BatchConfig::MAX_SEQ_LENGTH, + BatchConfig::max_sequence_length(), BeamSearchBatchConfig::MAX_BEAM_WIDTH, /*root*/ curr_depth == 0); } @@ -225,13 +225,13 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; int q_block_size = m->qProjSize * num_tokens; - int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } @@ -542,29 +542,30 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, hipStream_t stream) { // here because we need postion info in infernece 1 + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); checkCUDA( hipMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); - checkCUDA( - hipMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->MAX_NUM_REQUESTS * sizeof(BatchConfig::PerRequestInfo), + max_tokens_per_batch * sizeof(BatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); + checkCUDA(hipMemcpyAsync(m->request_infos, + &(bc->requestsInfo), + bc->max_requests_per_batch() * + sizeof(BatchConfig::PerRequestInfo), + hipMemcpyHostToDevice, + stream)); checkCUDA( 
hipMemcpyAsync(m->beam_token_infos, &(bc->beamTokenInfo), - bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * + max_tokens_per_batch * bc->MAX_BEAM_WIDTH * sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), hipMemcpyHostToDevice, stream)); checkCUDA(hipMemcpyAsync( m->beam_request_infos, &(bc->beamRequestsInfo), - bc->MAX_NUM_REQUESTS * + bc->max_requests_per_batch() * sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), hipMemcpyHostToDevice, stream)); @@ -692,10 +693,12 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - size_t beam_tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t beam_requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + size_t beam_tokeninfo_size = + max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); + size_t beam_requestinfo_size = + BeamSearchBatchConfig::max_requests_per_batch(); size_t total_size = requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + beam_tokeninfo_size * diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b4cdc77e2a..b479528607 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -191,7 +191,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, num_tokens, m->num_q_heads, m->num_kv_heads, - BatchConfig::MAX_SEQ_LENGTH, + BatchConfig::max_sequence_length(), BeamSearchBatchConfig::MAX_BEAM_WIDTH, /*root*/ curr_depth == 0); } @@ -241,13 +241,13 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; int q_block_size = m->qProjSize * num_tokens; - int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } @@ -564,23 +564,24 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // here because we need postion info in infernece 1 cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * sizeof(BatchConfig::PerTokenInfo), + bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(m->request_infos, &(bc->requestsInfo), - bc->MAX_NUM_REQUESTS * sizeof(BatchConfig::PerRequestInfo), + bc->max_requests_per_batch() * + sizeof(BatchConfig::PerRequestInfo), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(m->beam_token_infos, &(bc->beamTokenInfo), - bc->MAX_NUM_TOKENS * bc->MAX_BEAM_WIDTH * + bc->num_active_tokens() * bc->MAX_BEAM_WIDTH * sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(m->beam_request_infos, &(bc->beamRequestsInfo), - bc->MAX_NUM_REQUESTS * + bc->max_requests_per_batch() * 
sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), cudaMemcpyHostToDevice, stream); @@ -711,10 +712,12 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - size_t beam_tokeninfo_size = BeamSearchBatchConfig::MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; - size_t beam_requestinfo_size = BeamSearchBatchConfig::MAX_NUM_REQUESTS; + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + size_t beam_tokeninfo_size = + max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); + size_t beam_requestinfo_size = + BeamSearchBatchConfig::max_requests_per_batch(); size_t total_size = requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + beam_tokeninfo_size * diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 0fa68bed08..c10cf9d0ca 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -104,7 +104,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_active_tokens, // number of active tokens in previous batch m->num_q_heads, m->num_kv_heads, - BatchConfig::MAX_SEQ_LENGTH); + BatchConfig::max_sequence_length()); } } @@ -195,13 +195,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int qkv_block_size = // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); int q_block_size = m->qProjSize * bc->num_active_tokens(); - int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } @@ -241,7 +241,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_active_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, - BatchConfig::MAX_SEQ_LENGTH); + BatchConfig::max_sequence_length()); } // bc->token_last_available_idx[i] + 1; @@ -570,7 +570,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } checkCUDA(hipMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * + bc->num_active_tokens() * sizeof(TreeVerifyBatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); @@ -714,7 +714,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - size_t committed_tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + size_t committed_tokeninfo_size = max_tokens_per_batch; size_t total_size = committed_tokeninfo_size * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); if (offload) { diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 95ac93ad8a..5901c0e3ab 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -102,7 +102,7 @@ void 
commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_active_tokens, // number of active tokens in previous batch m->num_q_heads, m->num_kv_heads, - BatchConfig::MAX_SEQ_LENGTH); + BatchConfig::max_sequence_length()); } } @@ -193,13 +193,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int qkv_block_size = // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); int q_block_size = m->qProjSize * bc->num_active_tokens(); - int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; + int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; + int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); int vt_req_block_size = vt_block_size * m->num_kv_heads; assert(m->qProjSize == m->kProjSize); - for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } @@ -237,7 +237,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_active_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, - BatchConfig::MAX_SEQ_LENGTH); + BatchConfig::max_sequence_length()); } // bc->token_last_available_idx[i] + 1; @@ -567,7 +567,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->MAX_NUM_TOKENS * + bc->num_active_tokens() * sizeof(TreeVerifyBatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); @@ -711,7 +711,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - size_t committed_tokeninfo_size = TreeVerifyBatchConfig::MAX_NUM_TOKENS; + int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + size_t committed_tokeninfo_size = max_tokens_per_batch; size_t total_size = committed_tokeninfo_size * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); if (offload) { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index d658b6590f..cbf839c6b2 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -14,6 +14,7 @@ */ #include "flexflow/batch_config.h" +#include "flexflow/request_manager.h" #include "legion.h" #include #include @@ -60,7 +61,7 @@ InferenceMode BatchConfig::get_mode() const { int BatchConfig::num_active_requests() const { int num_requests = 0; - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < max_requests_per_batch(); i++) { if (!request_completed[i]) { num_requests++; } @@ -72,17 +73,33 @@ int BatchConfig::num_active_tokens() const { return num_tokens; } +/*static*/ +int BatchConfig::max_requests_per_batch() { + return RequestManager::get_request_manager()->get_max_requests_per_batch(); +} + +/*static*/ +int BatchConfig::max_tokens_per_batch() { + return RequestManager::get_request_manager()->get_max_tokens_per_batch(); +} + +/*static*/ +int BatchConfig::max_sequence_length() { + return RequestManager::get_request_manager()->get_max_sequence_length(); +} + void BatchConfig::print() const { std::cout << "@@@@@@@@@@@@@@ Batch Config (mode " << get_mode() << ") @@@@@@@@@@@@@@" << std::endl; - std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; - std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; + std::cout << "Max number of requests: " << max_requests_per_batch() + << std::endl; + std::cout << "Max 
number of tokens: " << max_tokens_per_batch() << std::endl; std::cout << "Number of tokens: " << num_tokens << std::endl; std::cout << "Number of requests: " << num_active_requests() << std::endl; // std::cout << "Cached results: " << cached_results << std::endl; std::cout << "Per-request info:\n"; - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < max_requests_per_batch(); i++) { if (!request_completed[i]) { std::cout << " Request " << i << ":\n"; std::cout << " Token start offset: " diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index dc30d89d78..634d60a352 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -69,7 +69,7 @@ bool BeamSearchBatchConfig::done() const { int BeamSearchBatchConfig::max_beam_depth_all_requests() const { int max_depth_all_requests = 0; - for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (!request_completed[i] && beamRequestsInfo[i].max_depth > max_depth_all_requests) { /* printf("\treq %i has max_depth=%i. Increasing max_depth_all_requests " @@ -86,7 +86,7 @@ int BeamSearchBatchConfig::max_beam_depth_all_requests() const { int BeamSearchBatchConfig::current_depth_all_requests() const { int current_depth = 0; - for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (!request_completed[i] && beamRequestsInfo[i].current_depth > current_depth) { /* printf("\treq %i has current_depth=%i. Increasing " @@ -104,8 +104,9 @@ int BeamSearchBatchConfig::current_depth_all_requests() const { void BeamSearchBatchConfig::print() const { std::cout << "@@@@@@@@@@@@@@ BeamSearchBatchConfig (mode " << get_mode() << ") @@@@@@@@@@@@@@" << std::endl; - std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; - std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; + std::cout << "Max number of requests: " << max_requests_per_batch() + << std::endl; + std::cout << "Max number of tokens: " << max_tokens_per_batch() << std::endl; std::cout << "Number of tokens: " << num_tokens << std::endl; std::cout << "Number of requests: " << num_active_requests() << std::endl; std::cout << "Beam width: " << beam_width << std::endl; @@ -113,7 +114,7 @@ void BeamSearchBatchConfig::print() const { std::cout << "Current Iterations: " << current_iteration << std::endl; std::cout << "Per-request info:\n"; - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < max_requests_per_batch(); i++) { // assert(beamRequestsInfo[i].request_completed == request_completed[i]); if (!request_completed[i]) { std::cout << " Request " << i << ":\n"; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index f36dcb2922..eb045e8159 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,9 +28,8 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager(FFConfig const &_config, - int _max_num_tokens_per_batch) - : ff_config(_config), max_num_tokens_per_batch(_max_num_tokens_per_batch) { +InferenceManager::InferenceManager(FFConfig const &_config) + : ff_config(_config) { num_devices = ff_config.workersPerNode * ff_config.numNodes; // Check parallelization degrees 
assert(ff_config.data_parallelism_degree <= num_devices && @@ -62,8 +61,7 @@ InferenceManager *inference_manager_singleton = nullptr; InferenceManager *InferenceManager::get_inference_manager() { if (inference_manager_singleton == nullptr) { FFConfig ffconfig; - inference_manager_singleton = - new InferenceManager(ffconfig, BatchConfig::MAX_NUM_TOKENS); + inference_manager_singleton = new InferenceManager(ffconfig); } return inference_manager_singleton; } @@ -84,7 +82,7 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); - model->config.batchSize = max_num_tokens_per_batch; + model->config.batchSize = BatchConfig::max_tokens_per_batch(); model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5ef55992ef..3fa201e7ab 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3971,7 +3971,7 @@ struct DefaultConfig { const static bool profiling = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)1 * 1024 * 1024 * 1024; // 2GB + const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB const static int numNodes = 1; const static int workersPerNode = 0; const static int cpusPerNode = 0; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 6f0a1f3851..1b825318dd 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -43,6 +43,14 @@ std::string LoadBytesFromFile(std::string const &path) { RequestManager::RequestManager() : verbose(false), next_available_guid(1000000), num_processed_requests(0) { + // The following config parameters are set + // during ffmodel.compile() + // Initialize them to -1 to make sure no one + // gets an incorrect value of them before + // ffmodel.compile() + max_requests_per_batch = -1; + max_tokens_per_batch = -1; + max_sequence_length = -1; { // Initialize futures for spec infer TreeVerifyBatchConfig tree_bc; @@ -65,6 +73,39 @@ RequestManager::RequestManager() } } +void RequestManager::set_max_requests_per_batch(int max_num_requests) { + assert(max_requests_per_batch == -1 || + max_requests_per_batch == max_num_requests); + max_requests_per_batch = max_num_requests; + assert(max_requests_per_batch <= BatchConfig::MAX_NUM_REQUESTS); +} + +int RequestManager::get_max_requests_per_batch() { + assert(max_requests_per_batch > 0); + return max_requests_per_batch; +} + +void RequestManager::set_max_tokens_per_batch(int max_num_tokens) { + assert(max_tokens_per_batch == -1 || max_tokens_per_batch == max_num_tokens); + max_tokens_per_batch = max_num_tokens; + assert(max_tokens_per_batch <= BatchConfig::MAX_NUM_TOKENS); +} + +int RequestManager::get_max_tokens_per_batch() { + assert(max_tokens_per_batch > 0); + return max_tokens_per_batch; +} + +void RequestManager::set_max_sequence_length(int max_seq_length) { + assert(max_sequence_length == -1 || max_sequence_length == max_seq_length); + max_sequence_length = max_seq_length; +} + +int RequestManager::get_max_sequence_length() { + assert(max_sequence_length > 0); + return max_sequence_length; +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -143,9 +184,9 @@ RequestManager::RequestGuid request.guid = 
next_available_guid++; request.max_sequence_length = max_sequence_length; - if (prompt.size() >= BatchConfig::MAX_SEQ_LENGTH) { + if (prompt.size() >= get_max_sequence_length()) { std::cout << "Warning: too many tokens in prompt, only load up to " - << BatchConfig::MAX_SEQ_LENGTH << " tokens, but got " + << get_max_sequence_length() << " tokens, but got " << prompt.size() << ".\n"; printf("tokens size: %zu\n", request.tokens.size()); @@ -201,9 +242,9 @@ RequestManager::RequestGuid request.tokens.push_back(bos_token_id); } std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() >= BatchConfig::MAX_SEQ_LENGTH) { + if (tokens.size() >= get_max_sequence_length()) { std::cout << "Warning: too many tokens in prompt, only load up to " - << BatchConfig::MAX_SEQ_LENGTH << " tokens, but got " + << get_max_sequence_length() << " tokens, but got " << tokens.size() << ".\n"; printf("tokens size: %zu\n", tokens.size()); @@ -317,7 +358,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } // Step 2: prepare the next batch for existing requests BatchConfig new_bc; - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { continue; } @@ -405,7 +446,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); } @@ -422,17 +463,17 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } // Step 3: add new requests to the next batch - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { if (!pending_request_queue.empty() && - new_bc.num_tokens < BatchConfig::MAX_NUM_TOKENS) { + new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; @@ -451,7 +492,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_request.tokens[depth]; new_bc.num_tokens++; } - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens == get_max_tokens_per_batch()) { break; } } @@ -510,7 +551,7 @@ BeamSearchBatchConfig new_bc.model_id = model_id; int result_index = 0; - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { continue; } @@ -688,7 +729,7 @@ BeamSearchBatchConfig // Add verified token to request's token list request.tokens.push_back(token.first); - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens == get_max_tokens_per_batch()) { break; } } @@ -731,17 +772,17 @@ BeamSearchBatchConfig } // Step 2: Initialize new request - for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { + for 
(int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { if (!pending_request_queue.empty() && - new_bc.num_tokens < BeamSearchBatchConfig::MAX_NUM_TOKENS) { + new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(BeamSearchBatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; @@ -757,7 +798,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].max_depth = std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, - BatchConfig::MAX_NUM_TOKENS - + get_max_tokens_per_batch() - new_bc.requestsInfo[i].num_tokens_in_batch - 1); for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; @@ -807,7 +848,7 @@ BeamSearchBatchConfig std::cout << "total prompt in request: " << new_request.initial_len << std::endl; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens == get_max_tokens_per_batch()) { break; } } @@ -876,7 +917,7 @@ BeamSearchBatchConfig new_bc.model_id = old_bc.model_id; // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; - for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { continue; } @@ -922,7 +963,8 @@ BeamSearchBatchConfig // update the beam search metadata // how many sub request in current request - // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? + // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH + // entries? 
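// Aside (illustrative sketch, not a line of this diff): the pattern applied
// throughout these hunks replaces the compile-time limits (MAX_NUM_REQUESTS,
// MAX_NUM_TOKENS, MAX_SEQ_LENGTH) with runtime getters that delegate to the
// RequestManager singleton; the values are populated once (per the comment in
// the RequestManager constructor, during ffmodel.compile()) and guarded by
// asserts. A minimal usage sketch, with 256 chosen arbitrarily for the
// example (it must not exceed the compile-time cap BatchConfig::MAX_NUM_TOKENS):
//
//   RequestManager *rm = RequestManager::get_request_manager();
//   rm->set_max_tokens_per_batch(256);   // set once; re-setting to a different
//                                        // value trips the equality assert
//   int budget = BatchConfig::max_tokens_per_batch();  // forwards to rm -> 256
//
// Calling a getter before the corresponding set_* call fails its
// assert(value > 0), since the fields are initialized to -1.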
new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; // update the parentid, accumalated_probs, depth, and token_ids @@ -964,9 +1006,8 @@ BeamSearchBatchConfig } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = - // std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, - std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens - - BatchConfig::MAX_NUM_REQUESTS + i, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens - + BatchConfig::max_requests_per_batch() + i, (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; @@ -1067,8 +1108,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; - int max_prompt_load_size = BatchConfig::MAX_NUM_TOKENS; - for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { + int max_prompt_load_size = get_max_tokens_per_batch(); + for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; } else if (old_batches.at(0).request_running[i]) { @@ -1078,7 +1119,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; } @@ -1162,7 +1203,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens > get_max_tokens_per_batch()) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1186,7 +1227,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS - 1) { + if (new_bc.num_tokens == get_max_tokens_per_batch() - 1) { break; } } @@ -1251,7 +1292,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens > get_max_tokens_per_batch()) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1270,7 +1311,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens.back(), request.tokens.size() - 1)}; } } else { // launch the request into running phase after loading all prompt - if (BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens > 0) { + if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { request.status = Request::RUNNING; new_bc.request_running[i] = true; diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index f323f262f2..1e756606f8 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -33,11 +33,11 @@ void RequestManager::load_tokens_task( BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; - // Extreme long prompts are not supported, only load up to MAX_NUM_TOKENS as - // prompt - if (batch_config->num_tokens > BatchConfig::MAX_NUM_TOKENS) { + // Extreme long prompts are not supported, only load up to + // max_tokens_per_batch as prompt + if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch()) { printf("Warning: too many tokens in 
prompt, only load up to %d tokens\n", - BatchConfig::MAX_NUM_TOKENS); + BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); } diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 58e996629e..cd3e03fff6 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -32,14 +32,13 @@ void RequestManager::load_tokens_task( BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; - // Extreme long prompts are not supported, only load up to MAX_NUM_TOKENS as - // prompt - if (batch_config->num_tokens > BatchConfig::MAX_NUM_TOKENS) { + // Extreme long prompts are not supported, only load up to + // BatchConfig::max_tokens_per_batch() as prompt + if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch()) { printf("Warning: too many tokens in prompt, only load up to %d tokens\n", - BatchConfig::MAX_NUM_TOKENS); + BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); } - // assert(batch_config->num_tokens <= BatchConfig::MAX_NUM_TOKENS); for (int i = 0; i < batch_config->num_tokens; i++) { dram_copy[i] = batch_config->tokensInfo[i].token_id; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 78eff184c4..9efa06a2d5 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -33,14 +33,15 @@ InferenceMode TreeVerifyBatchConfig::get_mode() const { void TreeVerifyBatchConfig::print() const { std::cout << "@@@@@@@@@@@@@@ TreeVerifyBatchConfig (mode " << get_mode() << ") @@@@@@@@@@@@@@" << std::endl; - std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; - std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; + std::cout << "Max number of requests: " << max_requests_per_batch() + << std::endl; + std::cout << "Max number of tokens: " << max_tokens_per_batch() << std::endl; std::cout << "Number of tokens: " << num_tokens << std::endl; std::cout << "Number of requests: " << num_active_requests() << std::endl; // std::cout << "Cached results: " << cached_results << std::endl; std::cout << "Per-request info:\n"; - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { + for (int i = 0; i < max_requests_per_batch(); i++) { if (!request_completed[i]) { std::cout << " Request " << i << ":\n"; std::cout << " Token start offset: " From edc6c49e01d2796fb5a12440064b68a86bff6081 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 2 Oct 2023 14:23:36 -0400 Subject: [PATCH 245/344] [Cleanup] - Remove obsolete stuff (#1160) * remove obsolete stuff * fix --------- Co-authored-by: Zhihao Jia --- .github/workflows/clang-format-check.yml | 1 - bootcamp_demo/ff_alexnet_cifar10.py | 70 --- bootcamp_demo/keras_cnn_cifar10.py | 56 -- bootcamp_demo/torch_alexnet_cifar10.py | 44 -- conda/build.sh | 30 - conda/meta.yaml | 28 - jupyter_notebook/README.md | 89 --- jupyter_notebook/flexflow_jupyter.json | 67 -- jupyter_notebook/flexflow_kernel_nocr.py | 59 -- jupyter_notebook/install.py | 408 ------------ nmt/Makefile | 50 -- nmt/embed.cu | 373 ----------- nmt/linear.cu | 618 ------------------ nmt/lstm.cu | 652 ------------------- nmt/nmt.cc | 359 ----------- nmt/ops.h | 177 ------ nmt/rnn.cu | 770 ----------------------- nmt/rnn.h | 438 ------------- nmt/rnn_mapper.cc | 138 ---- nmt/rnn_mapper.h | 63 -- nmt/softmax_data_parallel.cu | 392 ------------ scripts/FC_env_setup.sh | 33 - 
scripts/FC_setup.sh | 34 - scripts/FC_setup.txt | 24 - scripts/Makefile | 2 - scripts/compile_protobuf.sh | 6 - scripts/osdi22ae/bert.sh | 7 - scripts/osdi22ae/candle_uno.sh | 7 - scripts/osdi22ae/dlrm.sh | 7 - scripts/osdi22ae/inception.sh | 7 - scripts/osdi22ae/mlp.sh | 7 - scripts/osdi22ae/resnext-50.sh | 7 - scripts/osdi22ae/xdl.sh | 7 - scripts/test_run.sh | 38 -- 34 files changed, 5068 deletions(-) delete mode 100644 bootcamp_demo/ff_alexnet_cifar10.py delete mode 100644 bootcamp_demo/keras_cnn_cifar10.py delete mode 100644 bootcamp_demo/torch_alexnet_cifar10.py delete mode 100755 conda/build.sh delete mode 100644 conda/meta.yaml delete mode 100644 jupyter_notebook/README.md delete mode 100644 jupyter_notebook/flexflow_jupyter.json delete mode 100644 jupyter_notebook/flexflow_kernel_nocr.py delete mode 100644 jupyter_notebook/install.py delete mode 100644 nmt/Makefile delete mode 100644 nmt/embed.cu delete mode 100644 nmt/linear.cu delete mode 100644 nmt/lstm.cu delete mode 100644 nmt/nmt.cc delete mode 100644 nmt/ops.h delete mode 100644 nmt/rnn.cu delete mode 100644 nmt/rnn.h delete mode 100644 nmt/rnn_mapper.cc delete mode 100644 nmt/rnn_mapper.h delete mode 100644 nmt/softmax_data_parallel.cu delete mode 100755 scripts/FC_env_setup.sh delete mode 100644 scripts/FC_setup.sh delete mode 100644 scripts/FC_setup.txt delete mode 100644 scripts/Makefile delete mode 100755 scripts/compile_protobuf.sh delete mode 100755 scripts/osdi22ae/bert.sh delete mode 100755 scripts/osdi22ae/candle_uno.sh delete mode 100755 scripts/osdi22ae/dlrm.sh delete mode 100755 scripts/osdi22ae/inception.sh delete mode 100755 scripts/osdi22ae/mlp.sh delete mode 100755 scripts/osdi22ae/resnext-50.sh delete mode 100755 scripts/osdi22ae/xdl.sh delete mode 100644 scripts/test_run.sh diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 1601da86b3..fdf53e8254 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -11,7 +11,6 @@ jobs: exclude: '\.proto$' - check: "include" - check: "inference" - - check: "nmt" - check: "python" - check: "scripts" - check: "tests" diff --git a/bootcamp_demo/ff_alexnet_cifar10.py b/bootcamp_demo/ff_alexnet_cifar10.py deleted file mode 100644 index cb0b0e99ad..0000000000 --- a/bootcamp_demo/ff_alexnet_cifar10.py +++ /dev/null @@ -1,70 +0,0 @@ -#./flexflow_python $FF_HOME/bootcamp_demo/ff_alexnet_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192 - -from flexflow.core import * -from flexflow.keras.datasets import cifar10 -from flexflow.torch.model import PyTorchModel -from PIL import Image - -def top_level_task(): - ffconfig = FFConfig() - ffconfig.parse_args() - print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.get_batch_size(), ffconfig.get_workers_per_node(), ffconfig.get_num_nodes())) - ffmodel = FFModel(ffconfig) - - dims_input = [ffconfig.get_batch_size(), 3, 229, 229] - input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) - - torch_model = PyTorchModel("alexnet.ff") - output_tensors = torch_model.apply(ffmodel, [input_tensor]) - - ffoptimizer = SGDOptimizer(ffmodel, 0.01) - ffmodel.set_sgd_optimizer(ffoptimizer) - ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) - label_tensor = ffmodel.get_label_tensor() - - num_samples = 10000 - - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - 
full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32) - - for i in range(0, num_samples): - image = x_train[i, :, :, :] - image = image.transpose(1, 2, 0) - pil_image = Image.fromarray(image) - pil_image = pil_image.resize((229,229), Image.NEAREST) - image = np.array(pil_image, dtype=np.float32) - image = image.transpose(2, 0, 1) - full_input_np[i, :, :, :] = image - - full_input_np /= 255 - - y_train = y_train.astype('int32') - full_label_np = y_train - - dataloader_input = ffmodel.create_data_loader(input_tensor, full_input_np) - dataloader_label = ffmodel.create_data_loader(label_tensor, full_label_np) - - num_samples = dataloader_input.num_samples - - ffmodel.init_layers() - - epochs = ffconfig.get_epochs() - - ts_start = ffconfig.get_current_time() - - ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) - - ts_end = ffconfig.get_current_time() - run_time = 1e-6 * (ts_end - ts_start); - print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time)); - - # perf_metrics = ffmodel.get_perf_metrics() - # accuracy = perf_metrics.get_accuracy() - # if accuracy < ModelAccuracy.CIFAR10_CNN.value: - # assert 0, 'Check Accuracy' - - -if __name__ == "__main__": - print("cifar10 cnn") - top_level_task() diff --git a/bootcamp_demo/keras_cnn_cifar10.py b/bootcamp_demo/keras_cnn_cifar10.py deleted file mode 100644 index a62f625449..0000000000 --- a/bootcamp_demo/keras_cnn_cifar10.py +++ /dev/null @@ -1,56 +0,0 @@ -#./flexflow_python $FF_HOME/bootcamp_demo/keras_cnn_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192 - -# from keras.models import Model, Sequential -# from keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Dropout -# from keras.optimizers import SGD -# from keras.datasets import cifar10 -# from keras import losses -# from keras import metrics - -from flexflow.keras.models import Model, Sequential -from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Dropout -from flexflow.keras.optimizers import SGD -from flexflow.keras.datasets import cifar10 -from flexflow.keras import losses -from flexflow.keras import metrics - -import numpy as np - -def top_level_task(): - num_classes = 10 - - num_samples = 10000 - - #(x_train, y_train), (x_test, y_test) = cifar10.load_data() - (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples) - - x_train = x_train.astype('float32') - x_train /= 255 - y_train = y_train.astype('int32') - print("shape: ", x_train.shape[1:]) - - model = Sequential() - - model.add(Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")) - model.add(Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")) - model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")) - model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")) - model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid")) - model.add(Activation("relu")) - model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid")) - model.add(Flatten()) - model.add(Dense(512)) - model.add(Activation("relu")) - model.add(Dropout(0.5)) - model.add(Dense(num_classes)) - model.add(Activation("softmax")) - - opt = SGD(learning_rate=0.01) - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy']) - 
print(model.summary()) - - model.fit(x_train, y_train, batch_size=64, epochs=4) - -if __name__ == "__main__": - print("Functional API, cifar10 cnn") - top_level_task() \ No newline at end of file diff --git a/bootcamp_demo/torch_alexnet_cifar10.py b/bootcamp_demo/torch_alexnet_cifar10.py deleted file mode 100644 index 394161c5a3..0000000000 --- a/bootcamp_demo/torch_alexnet_cifar10.py +++ /dev/null @@ -1,44 +0,0 @@ -#./flexflow_python $FF_HOME/bootcamp_demo/torch_alexnet_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192 - -# https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py - -import torch.nn as nn -import torch -import flexflow.torch.fx as fx -import torchvision.models as models - -class AlexNet(nn.Module): - def __init__(self, num_classes: int = 1000) -> None: - super(AlexNet, self).__init__() - self.features = nn.Sequential( - nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(64, 192, kernel_size=5, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(192, 384, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(384, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - ) - self.classifier = nn.Sequential( - nn.Linear(256 * 6 * 6, 4096), - nn.ReLU(inplace=True), - nn.Linear(4096, 4096), - nn.ReLU(inplace=True), - nn.Linear(4096, num_classes), - nn.Softmax(), - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.features(x) - x = torch.flatten(x, 1) - x = self.classifier(x) - return x - -model = AlexNet(num_classes=10) -fx.torch_to_flexflow(model, "alexnet.ff") \ No newline at end of file diff --git a/conda/build.sh b/conda/build.sh deleted file mode 100755 index 0e84b7489a..0000000000 --- a/conda/build.sh +++ /dev/null @@ -1,30 +0,0 @@ -#! /usr/bin/env bash -set -euo pipefail - -# Cd into FF_HOME -cd "${BASH_SOURCE[0]%/*}/../" - -# build flexflow -# "search and replace" bash syntax used below to make shellcheck happy. 
-# see here: https://wiki-dev.bash-hackers.org/syntax/pe -CXXFLAGS="${CXXFLAGS//-O2/}" -CXXFLAGS="${CXXFLAGS//-std=c++17/}" -CXXFLAGS="${CXXFLAGS//-DNDEBUG/}" -CXXFLAGS="${CXXFLAGS//-D_FORTIFY_SOURCE=2/}" -export CXXFLAGS -CPPFLAGS="${CPPFLAGS//-O2/}" -CPPFLAGS="${CPPFLAGS//-std=c++17/}" -CPPFLAGS="${CPPFLAGS//-DNDEBUG/}" -CPPFLAGS="${CPPFLAGS//-D_FORTIFY_SOURCE=2/}" -export CPPFLAGS - -#export CUDNN_HOME=/projects/opt/centos7/cuda/10.1 -#export CUDA_HOME=/projects/opt/centos7/cuda/10.1 -export PROTOBUF_DIR=$BUILD_PREFIX -export FF_HOME=$SRC_DIR -export LG_RT_DIR=$SRC_DIR/legion/runtime -#export FF_ENABLE_DEBUG=1 -#export DEBUG=0 - -cd python -make diff --git a/conda/meta.yaml b/conda/meta.yaml deleted file mode 100644 index b6e14b2957..0000000000 --- a/conda/meta.yaml +++ /dev/null @@ -1,28 +0,0 @@ -package: - name: flexflow - version: "1.0" - -source: - git_rev: master - git_url: https://github.com/flexflow/FlexFlow.git - -build: - number: 0 - -requirements: - build: - - make - - git - - zlib - - protobuf - - {{ compiler('c') }} - - {{ compiler('cxx') }} - host: - - python - - cffi - run: - - cffi - - numpy - - python - - zlib - - keras-preprocessing diff --git a/jupyter_notebook/README.md b/jupyter_notebook/README.md deleted file mode 100644 index fe25df6dbf..0000000000 --- a/jupyter_notebook/README.md +++ /dev/null @@ -1,89 +0,0 @@ -# Jupyter Notebook - -This directory contains Jupyter notebook support for -FlexFlow. -It allows user to run any FlexFlow Python -program (e.g., training models) on a single node using -the in-browser jupyter notebook UI. - -## Quick Start -### Pre-requisite -* Python >= 3.6 -* FlexFlow Python binding needs to be installed, please check the [installation guide](https://flexflow.readthedocs.io/en/latest/installation.html) -* Install Jupyter notebook - - pip install notebook - -### Install the FlexFlow IPython kernel -``` -python ./install.py --(configurations) -``` -Please refer to the [IPython Kernel Configurations](#kernel-configurations) section for the configuration details. - -If the installation is successed, the following log will be printed to the terminal. -The `flexflow_kernel_nocr` is the IPython kernel name, where `nocr` means control replication is not enabled. -The control replication can be enabled once multi-node jupyter notebook support is provided in the future. -The `FlexFlow_SM_GPU` is the display name -of the kernel, which can be modified by the configuration json file. -`FlexFlow` is the name entry in the json file, `SM` means the IPython kernel -is only for shared memory machine, and `GPU` means GPU execution is enabled. -``` -IPython kernel: flexflow_kernel_nocr(FlexFlow_SM_GPU) has been installed -``` -The installed IPython kernel can be also seen by using the following command: -``` -jupyter kernelspec list -``` - -### Create a turnel (Optional) -If you want to run the jupyter notebook server on a remote compute node instead of localhost, -you can create a turnel from localhost to the compute node. -``` -ssh -4 -t -L 8888:localhost:8002 username@login-node-hostname ssh -t -L 8002:localhost:8888 computing_node -``` - -### Start the Jupyter Notebook server -Launch jupyter notebook server on the compute node or localhost if the turnel is not created -``` -jupyter notebook --port=8888 --no-browser -``` - -### Use the Jupyter Notebook in the browser -* Open the browser, type the addredd http://localhost:8888/?token=xxx, the token will be -displayed in the terminal once the server is started. 
-* Once the webpage is loaded, click "New" on the right top corner, and click the kernel -just installed. It is shown as the display name of the kernel, e.g. `FlexFlow_SM_GPU`. - -### Uninstall the IPython kernel -``` -jupyter kernelspec uninstall flexflow_kernel_nocr -``` -If the IPython kernel is re-installed, the old one will be automatically uninstalled by the install.py - - -## IPython Kernel Configurations -The IPython kernel can be configured by either passing arguments to `install.py` or using a json file. -The accepted arguments can be listed with -``` -python ./install.py --help -``` - -It is always preferred to use a json file. -The `flexflow_python.json` is the template respect to the -flexflow_python. Most entries are using the following format: -``` -"cpus": { - "cmd": "--cpus", - "value": 1 -} -``` -* `cpus` is the name of the field. - -* `cmd` is used to tell how to pass the value to the field. -For example, flexflow uses `-ll:cpu` to set the number of CPUs, so the `cmd` in `flexflow_python.json` is `-ll:cpu`. - -* `value` is the value of the field. It can be set to `null`. In this case, the value is read -from the command line arguments. - -Other configuration options can be added by either appending them to the command line arguments or -using the `other_options` field of the json file. diff --git a/jupyter_notebook/flexflow_jupyter.json b/jupyter_notebook/flexflow_jupyter.json deleted file mode 100644 index 0ff79c7234..0000000000 --- a/jupyter_notebook/flexflow_jupyter.json +++ /dev/null @@ -1,67 +0,0 @@ -{ - "name": "FlexFlow", - "kernel_name": "flexflow_kernel_nocr", - "flexflow_python_prefix": null, - "exe": "flexflow_python", - "cpus": { - "cmd": "-ll:cpu", - "value": 1 - }, - "gpus": { - "cmd": "-ll:gpu", - "value": 1 - }, - "openmp": { - "cmd": "-ll:ocpu", - "value": 0 - }, - "ompthreads": { - "cmd": "-ll:othr", - "value": 0 - }, - "utility": { - "cmd": "-ll:util", - "value": 1 - }, - "sysmem": { - "cmd": "-ll:csize", - "value": null - }, - "fbmem": { - "cmd": "-ll:fsize", - "value": 4096 - }, - "zcmem": { - "cmd": "-ll:zsize", - "value": 10240 - }, - "regmem": { - "cmd": "-ll:rsize", - "value": null - }, - "not_control_replicable": { - "action": "store_true", - "cmd": "--nocr", - "value": null - }, - "nodes": { - "cmd": "-n", - "value": 1 - }, - "ranks_per_node": { - "cmd": "--npernode", - "value": 1 - }, - "launcher": { - "type": "generic", - "cmd": "--launcher", - "value": null, - "launcher_extra": null - }, - "other_options": [ - { - "cmd": "-ll:py", - "value": 1 - } - ] -} \ No newline at end of file diff --git a/jupyter_notebook/flexflow_kernel_nocr.py b/jupyter_notebook/flexflow_kernel_nocr.py deleted file mode 100644 index 8441db5d3a..0000000000 --- a/jupyter_notebook/flexflow_kernel_nocr.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function -from ipykernel.ipkernel import IPythonKernel -import sys - -__version__ = '0.1' - -class FlexFlowKernelNoCR(IPythonKernel): - implementation = 'flexflow_kernel_nocr' - implementation_version = __version__ - - banner = "FlexFlow IPython Kernel for SM" - language = 'python' - language_version = __version__ - language_info = {'name': 'flexflow_kernel_nocr', - 'mimetype': 'text/x-python', - 'codemirror_mode': { - 'name': 'ipython', - 'version': 3 - }, - 'pygments_lexer': 'ipython3', - 'nbconvert_exporter': 'python', - 'file_extension': '.py'} - - def __init__(self, **kwargs): - self.__stdout = None - self._set_stdout() - print("Init FlexFlow kernel for single node or multi-nodes without control replication.") - self._reset_stdout() - super().__init__(**kwargs) - - def _set_stdout(self): - assert(self.__stdout == None), "stdout should be None" - self.__stdout = sys.stdout - sys.stdout = open('/dev/stdout', 'w') - - def _reset_stdout(self): - assert(self.__stdout != None), "stdout should not be None" - sys.stdout = self.__stdout - -if __name__ == "__main__": - from ipykernel.kernelapp import IPKernelApp - IPKernelApp.launch_instance(kernel_class=FlexFlowKernelNoCR) diff --git a/jupyter_notebook/install.py b/jupyter_notebook/install.py deleted file mode 100644 index 9073620d26..0000000000 --- a/jupyter_notebook/install.py +++ /dev/null @@ -1,408 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import json -import os -import re -import sys -import argparse -from distutils import log -import json -import inspect -import shutil - -from jupyter_client.kernelspec import KernelSpecManager, NoSuchKernel -from IPython.utils.tempdir import TemporaryDirectory - -kernel_json = {"argv": [], - "display_name": "None", - "language": "python", -} - -kernel_json_suffix_nocr = ["flexflow_kernel_nocr.py", "-f", "{connection_file}"] - - -required_cmd_dict_key = ["name", "kernel_name", "flexflow_python_prefix", "exe", "cpus", "gpus", "openmp", "ompthreads", "utility", "sysmem", "fbmem", "zcmem", "regmem", "not_control_replicable"] - -# This internal method is used to delete a kernel specified by kernel_name -def _delete_kernel(ksm, kernel_name, mute=True): - try: - spec = ksm.get_kernel_spec(kernel_name) - shutil.rmtree(spec.resource_dir) - if mute == False: - print("Find existing kernel:" + kernel_name + ", delete it before installation.") - except NoSuchKernel: - if mute == False: - print("No existing kernel:" + kernel_name + " has been installed, continue to installation.") - -# This internal method is used to install a kernel -def _install_kernel(ksm, kernel_name, kernel_json, user, prefix, mute=True): - with TemporaryDirectory() as td: - os.chmod(td, 0o755) - with open(os.path.join(td, "kernel.json"), "w") as f: - json.dump(kernel_json, f, sort_keys=True) - try: - ksm.install_kernel_spec(td, kernel_name, user=user, prefix=prefix) - if mute == False: - print("IPython kernel: " + kernel_name + "(" + kernel_json["display_name"] + ") has been installed") - except Exception as e: - if mute == False: - log.error("Failed to install the IPython kernel: " + kernel_name + "(" + kernel_json["display_name"] + ") with error: " + str(e)) - -# This method parses the json file into a dict named cmd_dict -def parse_json(flexflow_python_prefix, - cpus, - gpus, - openmp, - ompthreads, - utility, - sysmem, - fbmem, - zcmem, - regmem, - launcher, - nodes, - ranks_per_node, - not_control_replicable, - kernel_name, - filename): - with open(filename) as json_file: - cmd_dict = json.load(json_file) - for key in required_cmd_dict_key: - if key not in cmd_dict: - assert 0, "Key: " + key + " is not existed." 
- # Criterion - # if entry in the json file is set to null, we load it from the cmd line - args = inspect.getfullargspec(parse_json) - keys = args.args[0: len(args.args)-1] - sig = inspect.signature(parse_json) - argv_dict = locals() - for key in keys: - if key == "launcher": - if cmd_dict[key]["value"] == None and argv_dict[key] != "none": - cmd_dict[key]["value"] = argv_dict[key] - if cmd_dict[key]["launcher_extra"] == None: - cmd_dict[key]["launcher_extra"] = list() - elif key == "flexflow_python_prefix" or key == "kernel_name": - if cmd_dict[key] == None: - cmd_dict[key] = argv_dict[key] - else: - if cmd_dict[key]["value"] == None: - cmd_dict[key]["value"] = argv_dict[key] - - return cmd_dict - -# This method is used to install the kernel for jupyter notebook support for single or -# multiple nodes runs without control replication -def install_kernel_nocr(user, prefix, cmd_opts, cmd_dict, verbose, kernel_file_dir): - if verbose: - print("cmd_dict is:\n" + str(cmd_dict)) - - # setup name and argv - kernel_json["argv"] = [cmd_dict["flexflow_python_prefix"] + "/" + cmd_dict["exe"]] + kernel_json["argv"] - kernel_json["display_name"] = cmd_dict["name"] - - # launcher - if cmd_dict["launcher"]["value"] == None: - kernel_json["display_name"] += "_SM" - else: - kernel_json["display_name"] += "_DM" - nodes = cmd_dict["nodes"]["value"] - ranks_per_node = cmd_dict["ranks_per_node"]["value"] - launcher = cmd_dict["launcher"]["value"] - if cmd_dict["launcher"]["type"] == "legate": - # use legate launcher - kernel_json["argv"] += cmd_dict["launcher"]["cmd"], launcher, \ - cmd_dict["nodes"]["cmd"], str(nodes), \ - cmd_dict["ranks_per_node"]["cmd"], str(ranks_per_node) - else: - # use mpirun, srun and jsrun launcher - ranks = nodes * ranks_per_node - if launcher == "mpirun": - kernel_json["argv"] = ["mpirun", "-n", str(ranks), "--npernode", str(ranks_per_node)] + cmd_dict["launcher"]["launcher_extra"] + kernel_json["argv"] - elif launcher == "srun": - kernel_json["argv"] = ["srun", "-n", str(ranks), "--ntasks-per-node", str(ranks_per_node)] + cmd_dict["launcher"]["launcher_extra"] + kernel_json["argv"] - elif launcher == "jsrun": - kernel_json["argv"] = ["jsrun", "-n", str(ranks // ranks_per_node), "-r", "1", "-a", str(ranks_per_node)] + cmd_dict["launcher"]["launcher_extra"] + kernel_json["argv"] - else: - assert 0, "Unknown launcher" - - # let's do not enable control replication because pygion has issue with cleaning up - # disable control replication - # assert cmd_dict["not_control_replicable"]["value"] == True - # kernel_json["argv"].append(cmd_dict["not_control_replicable"]["cmd"]) - - # cpu - if cmd_dict["cpus"]["value"] > 0: - kernel_json["argv"] += cmd_dict["cpus"]["cmd"], str(cmd_dict["cpus"]["value"]) - - # gpu - if cmd_dict["gpus"]["value"] > 0: - kernel_json["display_name"] += "_GPU" - kernel_json["argv"] += cmd_dict["gpus"]["cmd"], str(cmd_dict["gpus"]["value"]) - if cmd_dict["fbmem"]["value"] > 0: - kernel_json["argv"] += cmd_dict["fbmem"]["cmd"], str(cmd_dict["fbmem"]["value"]) - if cmd_dict["zcmem"]["value"] > 0: - kernel_json["argv"] += cmd_dict["zcmem"]["cmd"], str(cmd_dict["zcmem"]["value"]) - - # openmp - if cmd_dict["openmp"]["value"] > 0: - if cmd_dict["ompthreads"]["value"] > 0: - kernel_json["argv"] += cmd_dict["openmp"]["cmd"], str(cmd_dict["openmp"]["value"]) - kernel_json["argv"] += cmd_dict["ompthreads"]["cmd"], str(cmd_dict["ompthreads"]["value"]) - else: - print( - "WARNING: ignore request for " - + str(cmd_dict["openmp"]["value"]) - + "OpenMP processors with 0 threads" 
- ) - - # utility - if cmd_dict["utility"]["value"] > 0: - kernel_json["argv"] += cmd_dict["utility"]["cmd"], str(cmd_dict["utility"]["value"]) - - # system memory - if cmd_dict["sysmem"]["value"] > 0: - kernel_json["argv"] += cmd_dict["sysmem"]["cmd"], str(cmd_dict["sysmem"]["value"]) - - # register memory - if cmd_dict["regmem"]["value"] > 0: - kernel_json["argv"] += cmd_dict["regmem"]["cmd"], str(cmd_dict["regmem"]["value"]) - - # other options from json - if "other_options" in cmd_dict: - other_options = cmd_dict["other_options"] - for option in other_options: - if option["value"] == None: - kernel_json["argv"].append(option["cmd"]) - else: - kernel_json["argv"] += option["cmd"], str(option["value"]) - - # other options from cmd line - for option in cmd_opts: - kernel_json["argv"].append(option) - - ksm = KernelSpecManager() - - # we need the installation dir of kernel, so first install a fake one - tmp_kernel_name = "tmp_legion_kernel" - tmp_kernel_json = {"argv": [], "display_name": "Tmp", "language": "python"} - _install_kernel(ksm, tmp_kernel_name, tmp_kernel_json, user, prefix) - spec = ksm.get_kernel_spec(tmp_kernel_name) - kernel_install_dir = os.path.dirname(spec.resource_dir) - _delete_kernel(ksm, tmp_kernel_name) - - # Now start installation - kernel_name = cmd_dict["kernel_name"] - - # add installation dir to legin_kernel_nocr.py - kernel_install_dir = os.path.join(kernel_install_dir, kernel_name) - kernel_filename = kernel_json_suffix_nocr[0] - kernel_json_suffix_nocr[0] = os.path.join(kernel_install_dir, kernel_filename) - kernel_json["argv"] += kernel_json_suffix_nocr - if verbose: - print("The kernel_json is:\n" + str(kernel_json)) - - # check if kernel is existed, if yes, then delete the old one before installation. - _delete_kernel(ksm, kernel_name, False) - - # install the kernel - _install_kernel(ksm, kernel_name, kernel_json, user, prefix, False) - - # copy legion_kernel_nocr.py into kernel dir - if kernel_file_dir == None: - file_path = os.getcwd() + "/" + kernel_filename - else: - file_path = kernel_file_dir + "/" + kernel_filename - shutil.copy(file_path, kernel_install_dir) - -def parse_args(argv=None): - parser = argparse.ArgumentParser( - description="Install Legion IPython Kernel" - ) - - parser.add_argument( - "--user", - action="store_true", - default=True, - dest="user", - help="Install the kernel in user home directory", - ) - parser.add_argument( - "--kernel-name", - default="", - dest="kernel_name", - help="Install the kernel into prefix", - ) - parser.add_argument( - "--prefix", - default=None, - dest="prefix", - help="Install the kernel into prefix", - ) - parser.add_argument( - "--json", - default="flexflow_jupyter.json", - dest="json", - help="Configuration file of flexflow_python", - ) - parser.add_argument( - "--flexflow-python-prefix", - default=None, - dest="flexflow_python_prefix", - help="The dirctory where flexflow_python is installed", - ) - parser.add_argument( - "--cpus", - type=int, - default=1, - dest="cpus", - help="Number of CPUs to use per rank", - ) - parser.add_argument( - "--gpus", - type=int, - default=1, - dest="gpus", - help="Number of GPUs to use per rank", - ) - parser.add_argument( - "--omps", - type=int, - default=0, - dest="openmp", - help="Number of OpenMP groups to use per rank", - ) - parser.add_argument( - "--ompthreads", - type=int, - default=4, - dest="ompthreads", - help="Number of threads per OpenMP group", - ) - parser.add_argument( - "--utility", - type=int, - default=1, - dest="utility", - help="Number of Utility 
processors per rank to request for meta-work", - ) - parser.add_argument( - "--sysmem", - type=int, - default=4000, - dest="sysmem", - help="Amount of DRAM memory per rank (in MBs)", - ) - parser.add_argument( - "--fbmem", - type=int, - default=4000, - dest="fbmem", - help="Amount of framebuffer memory per GPU (in MBs)", - ) - parser.add_argument( - "--zcmem", - type=int, - default=32, - dest="zcmem", - help="Amount of zero-copy memory per rank (in MBs)", - ) - parser.add_argument( - "--regmem", - type=int, - default=0, - dest="regmem", - help="Amount of registered CPU-side pinned memory per rank (in MBs)", - ) - parser.add_argument( - "--no-replicate", - dest="not_control_replicable", - action="store_true", - required=False, - default=True, - help="Execute this program without control replication. Most of the " - "time, this is not recommended. This option should be used for " - "debugging. The -lg:safe_ctrlrepl Legion option may be helpful " - "with discovering issues with replicated control.", - ) - parser.add_argument( - "--launcher", - dest="launcher", - choices=["mpirun", "jsrun", "srun", "none"], - default="none", - help='launcher program to use (set to "none" for local runs, or if ' - "the launch has already happened by the time legate is invoked)", - ) - parser.add_argument( - "--nodes", - type=int, - default=1, - dest="nodes", - help="Number of nodes to use", - ) - parser.add_argument( - "--ranks-per-node", - type=int, - default=1, - dest="ranks_per_node", - help="Number of ranks (processes running copies of the program) to " - "launch per node. The default (1 rank per node) will typically result " - "in the best performance.", - ) - parser.add_argument( - "--verbose", - action="store_true", - default=False, - dest="verbose", - help="Display the detailed log of installation", - ) - - args, opts = parser.parse_known_args() - return args, opts - -def driver(args, opts, kernel_file_dir=None): - cmd_dict = parse_json(flexflow_python_prefix=args.flexflow_python_prefix, - cpus=args.cpus, - gpus=args.gpus, - openmp=args.openmp, - ompthreads=args.ompthreads, - utility=args.utility, - sysmem=args.sysmem, - fbmem=args.fbmem, - zcmem=args.zcmem, - regmem=args.regmem, - launcher=args.launcher, - nodes=args.nodes, - ranks_per_node=args.ranks_per_node, - not_control_replicable=args.not_control_replicable, - kernel_name=args.kernel_name, - filename=args.json) - - if cmd_dict["not_control_replicable"]: - install_kernel_nocr(user=args.user, - prefix=args.prefix, - cmd_opts=opts, - cmd_dict=cmd_dict, - verbose=args.verbose, - kernel_file_dir=kernel_file_dir) - else: - assert 0, "Control replication is not supported yet" - -if __name__ == '__main__': - args, opts = parse_args() - driver(args, opts) diff --git a/nmt/Makefile b/nmt/Makefile deleted file mode 100644 index 261da88655..0000000000 --- a/nmt/Makefile +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
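# Standalone build for the legacy NMT example: nmt.cc, rnn_mapper.cc, and the RNN operator
# .cu files listed below are compiled directly against the Legion runtime makefile
# (runtime.mk), so LG_RT_DIR must point at the Legion runtime directory.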
-# - -ifndef LG_RT_DIR -$(error LG_RT_DIR variable is not defined, aborting build) -endif - -# Flags for directing the runtime makefile what to include -DEBUG ?= 0 # Include debugging symbols -OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level -USE_CUDA ?= 1 # Include CUDA support (requires CUDA) -USE_GASNET ?= 1 # Include GASNet support (requires GASNet) -USE_HDF ?= 0 # Include HDF5 support (requires HDF5) -ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) - -# Put the binary file name here -OUTFILE ?= nmt -# List all the application source files here -GEN_SRC ?= nmt.cc rnn_mapper.cc # .cc files -GEN_GPU_SRC ?= lstm.cu linear.cu embed.cu rnn.cu softmax_data_parallel.cu ../cnn_helper.cu# .cu files - -# You can modify these variables, some will be appended to by the runtime makefile -INC_FLAGS ?= -CC_FLAGS ?= -NVCC_FLAGS ?= -GASNET_FLAGS ?= -LD_FLAGS ?= -lcudnn -lcublas -lcurand -# For Point and Rect typedefs -CC_FLAGS += -std=c++11 -NVCC_FLAGS += -std=c++11 -########################################################################### -# -# Don't change anything below here -# -########################################################################### - -include $(LG_RT_DIR)/runtime.mk - diff --git a/nmt/embed.cu b/nmt/embed.cu deleted file mode 100644 index 077c5ec565..0000000000 --- a/nmt/embed.cu +++ /dev/null @@ -1,373 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "../cnn_helper.h" -#include "rnn.h" -#include "rnn_mapper.h" - -struct EmbedInitParams { - DnnHandle handle; - int batchSize, outputSize, vocabSize; -}; - -Tensor RnnModel::add_embed_node(Tensor x, - int vocab_size, - int output_size, - ParallelConfig pc, - SharedVariable params) { - assert(x.numDim == 2); - assert(x.adim[1] == LSTM_PER_NODE_LENGTH); - assert(x.pdim[1] == LSTM_PER_NODE_LENGTH); - Embed *node = new Embed(config, x, vocab_size, output_size, pc, params); - layers.push_back(node); - return node->outputs[0]; -} - -Embed::Embed(RnnConfig config, - Tensor x, - int _vocab_size, - int _output_size, - ParallelConfig pc, - SharedVariable _params) - : RnnOp(x, pc, _params), batchSize(x.adim[0]), vocabSize(_vocab_size), - outputSize(_output_size) { - Context ctx = config.lg_ctx; - HighLevelRuntime *runtime = config.lg_hlr; - assert(pc.nDims == 1); - { - Rect<1> rect(Point<1>(0), Point<1>(pc.dim[0] - 1)); - part_rect = rect; - } - IndexSpaceT<1> part_is = runtime->create_index_space(ctx, part_rect); - FieldSpace fs = config.field_space; - Rect<3, coord_t> y_rect( - Point<3>(0, 0, 0), - Point<3>(outputSize - 1, batchSize - 1, LSTM_PER_NODE_LENGTH - 1)); - IndexSpaceT<3> y_is = runtime->create_index_space(ctx, y_rect); - LogicalRegion y_lr = runtime->create_logical_region(ctx, y_is, fs); - LogicalRegion y_grad_lr = runtime->create_logical_region(ctx, y_is, fs); - int num_par_n = part_rect.hi[0] - part_rect.lo[0] + 1; - assert(batchSize % num_par_n == 0); - int extent_n = batchSize / num_par_n; - int extent_c = outputSize; - Rect<3, coord_t> extent( - Point<3>(0, 0, 0), - Point<3>(extent_c - 1, extent_n - 1, LSTM_PER_NODE_LENGTH - 1)); - Transform<3, 1, coord_t> trans; - trans[0][0] = 0; - trans[1][0] = extent_n; - trans[2][0] = 0; - IndexPartition y_ip = runtime->create_partition_by_restriction( - ctx, y_is, part_is, trans, extent); - assert(runtime->is_index_partition_disjoint(ctx, y_ip)); - assert(runtime->is_index_partition_complete(ctx, y_ip)); - LogicalPartition y_lp = runtime->get_logical_partition(ctx, y_lr, y_ip); - LogicalPartition y_grad_lp = - runtime->get_logical_partition(ctx, y_grad_lr, y_ip); - outputs[0].region = y_lr; - outputs[0].region_grad = y_grad_lr; - outputs[0].partition = y_lp; - outputs[0].partition_grad = y_grad_lp; - outputs[0].numDim = 3; - outputs[0].adim[0] = outputSize; - outputs[0].adim[1] = batchSize; - outputs[0].adim[2] = LSTM_PER_NODE_LENGTH; - outputs[0].pdim[0] = extent_c; - outputs[0].pdim[1] = extent_n; - outputs[0].pdim[2] = LSTM_PER_NODE_LENGTH; -} - -/* - regions[0] (I): x - regions[1] (I): w - regions[2] (O): y - */ -OpMeta *Embed::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - EmbedInitParams const *embed = (EmbedInitParams *)task->args; - Rect<2> rect_x = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - assert(rect_x.hi[0] - rect_x.lo[0] + 1 == embed->batchSize); - assert(rect_x.hi[1] - rect_x.lo[1] + 1 == LSTM_PER_NODE_LENGTH); - assert(rect_w.hi[0] - rect_w.lo[0] + 1 == - embed->vocabSize * embed->outputSize); - assert(rect_y.hi[0] - rect_y.lo[0] + 1 == embed->outputSize); - assert(rect_y.hi[1] - rect_y.lo[1] + 1 == embed->batchSize); - assert(rect_y.hi[2] - rect_y.lo[2] + 
1 == LSTM_PER_NODE_LENGTH); - EmbedMeta *m = new EmbedMeta(embed->handle); - m->profiling_runtime = false; - return m; -} - -void Embed::init(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - EmbedInitParams initParams; - initParams.handle = model.dnn_handlers[paraConfig.gpu[idx]]; - initParams.batchSize = outputs[0].pdim[1]; - initParams.outputSize = outputs[0].pdim[0]; - initParams.vocabSize = vocabSize; - // batch is the first dim of input and the second dim of output - assert(inputs[0].pdim[0] == outputs[0].pdim[1]); - TaskLauncher launcher(EMBED_INIT_TASK_ID, - TaskArgument(&initParams, sizeof(initParams)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - { - LogicalRegion x = - runtime->get_logical_subregion_by_color(inputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[0].region)); - launcher.add_field(0, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.region, READ_ONLY, EXCLUSIVE, params.region)); - launcher.add_field(1, FID_DATA); - { - LogicalRegion y = - runtime->get_logical_subregion_by_color(outputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(y, WRITE_ONLY, EXCLUSIVE, outputs[0].region)); - launcher.add_field(2, FID_DATA); - } - Future f = runtime->execute_task(ctx, launcher); - meta[idx] = f.get_result(); - } -} - -__global__ void embedForward(int const *x_ptr, - float const *embed, - float *y_ptr, - coord_t numElements, - int shift, - int outputSize) { - CUDA_KERNEL_LOOP(i, numElements) { - int idx = i >> shift; - int off = i & (outputSize - 1); - int wordIdx = x_ptr[idx]; - y_ptr[i] = embed[(wordIdx << shift) + off]; - } -} - -__global__ void embedBackward(int const *x_ptr, - float *embed, - float const *y_ptr, - coord_t numElements, - int shift, - int outputSize) { - CUDA_KERNEL_LOOP(i, numElements) { - int idx = i >> shift; - int off = i & (outputSize - 1); - int wordIdx = x_ptr[idx]; - atomicAdd(embed + (wordIdx << shift) + off, y_ptr[i]); - } -} - -/* - regions[0](I): x - regions[1](I): w - regions[2](O): y -*/ -void Embed::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - assert(regions.size() == 3); - assert(task->regions.size() == 3); - EmbedMeta const *m = *((EmbedMeta **)task->args); - AccessorRO const acc_x(regions[0], FID_DATA); - AccessorRO const acc_w(regions[1], FID_DATA); - AccessorWO const acc_y(regions[2], FID_DATA); - Rect<2> rect_x = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - assert(acc_x.accessor.is_dense_arbitrary(rect_x)); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - int batch_size = rect_y.hi[1] - rect_y.lo[1] + 1; - int output_size = rect_y.hi[0] - rect_y.lo[0] + 1; - int const *x_ptr = acc_x.ptr(rect_x.lo); - float const *w_ptr = acc_w.ptr(rect_w.lo); - float *y_ptr = acc_y.ptr(rect_y.lo); - cudaEvent_t t_start, t_end; - if (m->profiling_runtime) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start); - } - int shift = 0; 
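// The embedForward/embedBackward kernels above index the flattened tensor as
// idx = i >> shift (which input token) and off = i & (output_size - 1) (which feature of its
// embedding vector), and the backward kernel scatters gradients with atomicAdd. This only
// works when output_size is a power of two; the loop below computes shift = log2(output_size)
// and the assert enforces that requirement.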
- int size = 1; - while (size < output_size) { - size = size * 2; - shift = shift + 1; - } - assert(size == output_size); - embedForward<<>>( - x_ptr, w_ptr, y_ptr, rect_y.volume(), shift, output_size); - if (m->profiling_runtime) { - cudaEventRecord(t_end); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("Embed forward time = %.2lfms\n", elapsed); - } -#endif -} - -void Embed::forward(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - TaskLauncher launcher(EMBED_FWD_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - { - LogicalRegion x = - runtime->get_logical_subregion_by_color(inputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[0].region)); - launcher.add_field(0, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.region, READ_ONLY, EXCLUSIVE, params.region)); - launcher.add_field(1, FID_DATA); - { - LogicalRegion y = - runtime->get_logical_subregion_by_color(outputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(y, WRITE_ONLY, EXCLUSIVE, outputs[0].region)); - launcher.add_field(2, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } -} - -/* - regions[0](I): x - regions[1](I/O): w_grad - regions[2](I): y_grad -*/ -void Embed::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - assert(regions.size() == 3); - assert(task->regions.size() == 3); - EmbedMeta const *m = *((EmbedMeta **)task->args); - AccessorRO const acc_x(regions[0], FID_DATA); - AccessorRW const acc_w(regions[1], FID_DATA); - AccessorRO const acc_y(regions[2], FID_DATA); - Rect<2> rect_x = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - assert(acc_x.accessor.is_dense_arbitrary(rect_x)); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - int batch_size = rect_y.hi[1] - rect_y.lo[1] + 1; - int output_size = rect_y.hi[0] - rect_y.lo[0] + 1; - int const *x_ptr = acc_x.ptr(rect_x.lo); - float *w_ptr = acc_w.ptr(rect_w.lo); - float const *y_ptr = acc_y.ptr(rect_y.lo); - cudaEvent_t t_start, t_end; - if (m->profiling_runtime) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start); - } - int shift = 0; - int size = 1; - while (size < output_size) { - size = size * 2; - shift = shift + 1; - } - assert(size == output_size); - embedBackward<<>>( - x_ptr, w_ptr, y_ptr, rect_y.volume(), shift, output_size); - if (m->profiling_runtime) { - cudaEventRecord(t_end); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("Embed backward time = %.2lfms\n", elapsed); - } -#endif -} - -void Embed::backward(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime 
= model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - TaskLauncher launcher(EMBED_BWD_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - { - LogicalRegion x = - runtime->get_logical_subregion_by_color(inputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[0].region)); - launcher.add_field(0, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.gradients[paraConfig.gpu[idx]], - READ_WRITE, - EXCLUSIVE, - params.gradients[paraConfig.gpu[idx]])); - launcher.add_field(1, FID_DATA); - { - LogicalRegion y_grad = runtime->get_logical_subregion_by_color( - outputs[0].partition_grad, dp); - launcher.add_region_requirement(RegionRequirement( - y_grad, READ_ONLY, EXCLUSIVE, outputs[0].region_grad)); - launcher.add_field(2, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } -} - -void Embed::update(RnnModel const &model) {} diff --git a/nmt/linear.cu b/nmt/linear.cu deleted file mode 100644 index 48a7290bf0..0000000000 --- a/nmt/linear.cu +++ /dev/null @@ -1,618 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "../cnn_helper.h" -#include "rnn.h" -#include "rnn_mapper.h" - -struct LinearInitParams { - DnnHandle handle; - int batchSize, inputSize, outputSize; -}; - -Tensor RnnModel::add_linear_node(Tensor x, - int output_size, - ParallelConfig pc, - SharedVariable params) { - assert(x.numDim == 3); - assert(x.adim[2] == LSTM_PER_NODE_LENGTH); - assert(x.pdim[2] == LSTM_PER_NODE_LENGTH); - Linear *node = new Linear(config, x, output_size, pc, params, part_is); - layers.push_back(node); - return node->outputs[0]; -} - -Linear::Linear(RnnConfig config, - Tensor input, - int _output_size, - ParallelConfig pc, - SharedVariable _params, - IndexSpaceT<1> input_part_is) - : RnnOp(input, pc, _params), input_size(input.adim[0]), - output_size(_output_size) { - Context ctx = config.lg_ctx; - HighLevelRuntime *runtime = config.lg_hlr; - assert(pc.nDims == 2); - int num_par_n = pc.dim[1]; - int num_par_c = pc.dim[0]; - input_part_rect = runtime->get_index_space_domain(ctx, input_part_is); - { - Rect<2> rect(Point<2>(0, 0), Point<2>(num_par_c - 1, num_par_n - 1)); - part_rect = rect; - } - IndexSpaceT<2> part_is = runtime->create_index_space(ctx, part_rect); - int batch_size = input.adim[1]; - FieldSpace fs = config.field_space; - Rect<3, coord_t> y_rect( - Point<3>(0, 0, 0), - Point<3>(output_size - 1, batch_size - 1, LSTM_PER_NODE_LENGTH - 1)); - IndexSpaceT<3> y_is = runtime->create_index_space(ctx, y_rect); - LogicalRegion y_lr = runtime->create_logical_region(ctx, y_is, fs); - LogicalRegion y_grad_lr = runtime->create_logical_region(ctx, y_is, fs); - assert(output_size % num_par_c == 0); - assert(batch_size % num_par_n == 0); - int extent_c = output_size / num_par_c; - int extent_n = batch_size / num_par_n; - Rect<3, coord_t> extent( - Point<3>(0, 0, 0), - Point<3>(extent_c - 1, extent_n - 1, LSTM_PER_NODE_LENGTH - 1)); - Transform<3, 2, coord_t> trans; - trans[0][0] = extent_c; - trans[0][1] = 0; - trans[1][0] = 0; - trans[1][1] = extent_n; - trans[2][0] = 0; - trans[2][1] = 0; - IndexPartition y_ip = runtime->create_partition_by_restriction( - ctx, y_is, part_is, trans, extent); - assert(runtime->is_index_partition_disjoint(ctx, y_ip)); - assert(runtime->is_index_partition_complete(ctx, y_ip)); - LogicalPartition y_lp = runtime->get_logical_partition(ctx, y_lr, y_ip); - LogicalPartition y_grad_lp = - runtime->get_logical_partition(ctx, y_grad_lr, y_ip); - - // Note: we only need replica's grad, so no need to create lr/lp for forward - Rect<3, coord_t> replica_rect(Point<3>(0, 0, 0), - Point<3>(input_size - 1, - batch_size - 1, - LSTM_PER_NODE_LENGTH * num_par_c - 1)); - IndexSpaceT<3> replica_is = runtime->create_index_space(ctx, replica_rect); - replica.region_grad = runtime->create_logical_region(ctx, replica_is, fs); - trans[0][0] = 0; - trans[0][1] = 0; - trans[1][0] = 0; - trans[1][1] = extent_n; - trans[2][0] = LSTM_PER_NODE_LENGTH; - trans[2][1] = 0; - Rect<3, coord_t> replica_ext( - Point<3>(0, 0, 0), - Point<3>(input_size - 1, extent_n - 1, LSTM_PER_NODE_LENGTH - 1)); - IndexPartition replica_ip = runtime->create_partition_by_restriction( - ctx, replica_is, part_is, trans, replica_ext); - assert(runtime->is_index_partition_disjoint(ctx, replica_ip)); - assert(runtime->is_index_partition_complete(ctx, replica_ip)); - replica.partition_grad = - runtime->get_logical_partition(ctx, replica.region_grad, replica_ip); - for (int i = 0; i < num_par_c; i++) { - Transform<3, 1, coord_t> input_trans; - input_trans[0][0] = 0; - input_trans[1][0] = inputs[0].pdim[1]; - input_trans[2][0] = 0; 
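// For each of the num_par_c output-channel partitions, carve out its own
// LSTM_PER_NODE_LENGTH-thick slab of the replica gradient region (offset along the third
// dimension by LSTM_PER_NODE_LENGTH * i); backward2_task later sums these per-partition
// contributions back into the real input gradient.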
- Rect<3, coord_t> ext(Point<3>(0, 0, LSTM_PER_NODE_LENGTH * i), - Point<3>(inputs[0].pdim[0] - 1, - inputs[0].pdim[1] - 1, - LSTM_PER_NODE_LENGTH * (i + 1) - 1)); - IndexPartition ip = runtime->create_partition_by_restriction( - ctx, replica_is, input_part_is, input_trans, ext); - assert(runtime->is_index_partition_disjoint(ctx, ip)); - replica_sub_lps[i] = - runtime->get_logical_partition(ctx, replica.region_grad, ip); - } - - outputs[0].numDim = 3; - outputs[0].adim[0] = output_size; - outputs[0].adim[1] = batch_size; - outputs[0].adim[2] = LSTM_PER_NODE_LENGTH; - outputs[0].pdim[0] = extent_c; - outputs[0].pdim[1] = extent_n; - outputs[0].pdim[2] = LSTM_PER_NODE_LENGTH; - outputs[0].region = y_lr; - outputs[0].partition = y_lp; - outputs[0].region_grad = y_grad_lr; - outputs[0].partition_grad = y_grad_lp; - - // Every partition reads all in_channels - trans[0][0] = 0; - trans[0][1] = 0; - trans[1][0] = 0; - trans[1][1] = extent_n; - trans[2][0] = 0; - trans[2][1] = 0; - Rect<3, coord_t> input_ext( - Point<3>(0, 0, 0), - Point<3>(input_size - 1, extent_n - 1, LSTM_PER_NODE_LENGTH)); - IndexSpaceT<3> input_is = IndexSpaceT<3>(inputs[0].region.get_index_space()); - IndexPartition input_ip = runtime->create_partition_by_restriction( - ctx, input_is, part_is, trans, input_ext); - input_lp = runtime->get_logical_partition(ctx, inputs[0].region, input_ip); -} - -/* - regions[0](I): x - regions[1](I): w - regions[2](O): y - */ -OpMeta *Linear::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - LinearInitParams const *linear = (LinearInitParams *)task->args; - Rect<3> rect_x = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - assert(rect_x.hi[0] - rect_x.lo[0] + 1 == linear->inputSize); - assert(rect_x.hi[1] - rect_x.lo[1] + 1 == linear->batchSize); - assert(rect_x.hi[2] - rect_x.lo[2] + 1 == LSTM_PER_NODE_LENGTH); - assert(rect_y.hi[0] - rect_y.lo[0] + 1 == linear->outputSize); - assert(rect_y.hi[1] - rect_y.lo[1] + 1 == linear->batchSize); - assert(rect_y.hi[2] - rect_y.lo[2] + 1 == LSTM_PER_NODE_LENGTH); - assert(rect_w.hi[0] - rect_w.lo[0] + 1 == - linear->outputSize * (linear->inputSize + 1)); - LinearMeta *m = new LinearMeta(linear->handle); - m->profiling_runtime = false; -#ifndef DISABLE_COMPUTATION - int batch_size = linear->batchSize * LSTM_PER_NODE_LENGTH; - float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size); - for (int i = 0; i < batch_size; i++) { - dram_one_ptr[i] = 1.0f; - } - checkCUDA(cudaMalloc(&m->one_ptr, sizeof(float) * batch_size)); - checkCUDA(cudaMemcpy(m->one_ptr, - dram_one_ptr, - sizeof(float) * batch_size, - cudaMemcpyHostToDevice)); -#endif - return m; -} - -void Linear::init(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - int num_par_c = part_rect.hi[0] - part_rect.lo[0] + 1; - for (PointInRectIterator<2> it(part_rect); it(); it++, idx++) { - LinearInitParams initParams; - initParams.handle = model.dnn_handlers[paraConfig.gpu[idx]]; - initParams.batchSize = outputs[0].pdim[1]; - initParams.inputSize = inputs[0].pdim[0]; - initParams.outputSize = outputs[0].pdim[0]; - TaskLauncher launcher(RNN_LINEAR_INIT_TASK_ID, - 
TaskArgument(&initParams, sizeof(initParams)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - // Add input - { - LogicalRegion x = runtime->get_logical_subregion_by_color(input_lp, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[0].region)); - launcher.add_field(0, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.subregions[num_par_c + dp[0]], - READ_ONLY, - EXCLUSIVE, - params.region)); - launcher.add_field(1, FID_DATA); - // Add output - { - LogicalRegion y = - runtime->get_logical_subregion_by_color(outputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(y, WRITE_ONLY, EXCLUSIVE, outputs[0].region)); - launcher.add_field(2, FID_DATA); - } - Future f = runtime->execute_task(ctx, launcher); - meta[idx] = f.get_result(); - } -} - -/* - regions[0] (I): x - regions[1] (I): w - regions[2] (O): y - */ -void Linear::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - assert(regions.size() == 3); - assert(task->regions.size() == 3); - float alpha = 1.0f, beta = 0.0f; - LinearMeta const *m = *((LinearMeta **)task->args); - AccessorRO const acc_x(regions[0], FID_DATA); - AccessorRO const acc_w(regions[1], FID_DATA); - AccessorWO const acc_y(regions[2], FID_DATA); - Rect<3> rect_x = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - assert(acc_x.accessor.is_dense_arbitrary(rect_x)); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - int input_size = rect_x.hi[0] - rect_x.lo[0] + 1; - int output_size = rect_y.hi[0] - rect_y.lo[0] + 1; - int batch_size = (rect_x.hi[1] - rect_x.lo[1] + 1) * LSTM_PER_NODE_LENGTH; - float const *x_ptr = acc_x.ptr(rect_x.lo); - float const *w_ptr = acc_w.ptr(rect_w.lo); - float const *bias_ptr = w_ptr + input_size; - float *y_ptr = acc_y.ptr(rect_y.lo); - cudaEvent_t t_start, t_end; - if (m->profiling_runtime) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start); - } - cudaStream_t stream; - checkCUDA(cudaStreamCreate(&stream)); - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDA(cublasSgemm(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - output_size, - batch_size, - input_size, - &alpha, - w_ptr, - input_size + 1, - x_ptr, - input_size, - &beta, - y_ptr, - output_size)); - checkCUDA(cublasSgemm(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - output_size, - batch_size, - 1, - &alpha, - bias_ptr, - input_size + 1, - m->one_ptr, - 1, - &alpha, - y_ptr, - output_size)); - if (m->profiling_runtime) { - cudaEventRecord(t_end); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("Linear forward time = %.2lfms\n", elapsed); - } -#ifdef PRINT_INTERMEDIATE_RESULT - print_tensor<3, float>(y_ptr, rect_y, "linear(fwd):y"); -#endif -#endif -} - -void Linear::forward(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - int num_par_c = part_rect.hi[0] - part_rect.lo[0] + 1; - for (PointInRectIterator<2> 
it(part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - TaskLauncher launcher(RNN_LINEAR_FWD_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - // Add input - { - LogicalRegion x = runtime->get_logical_subregion_by_color(input_lp, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[0].region)); - launcher.add_field(0, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.subregions[num_par_c + dp[0]], - READ_ONLY, - EXCLUSIVE, - params.region)); - launcher.add_field(1, FID_DATA); - // Add output - { - LogicalRegion y = - runtime->get_logical_subregion_by_color(outputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(y, WRITE_ONLY, EXCLUSIVE, outputs[0].region)); - launcher.add_field(2, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } -} - -/* - regions[0](I): x - regions[1](I): w - regions[2](I): y - regions[3](O); replica_grad - regions[4](I/O): w_grad - regions[5](I): y_grad -*/ -void Linear::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - assert(regions.size() == 6); - assert(task->regions.size() == 6); - float alpha = 1.0f, beta = 0.0f; - LinearMeta const *m = *((LinearMeta **)task->args); - AccessorRO const acc_x(regions[0], FID_DATA); - AccessorRO const acc_w(regions[1], FID_DATA); - AccessorRO const acc_y(regions[2], FID_DATA); - AccessorWO const acc_replica_grad(regions[3], FID_DATA); - AccessorRW const acc_w_grad(regions[4], FID_DATA); - AccessorRO const acc_y_grad(regions[5], FID_DATA); - - Rect<3> rect_x = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - Rect<3> rect_replica_grad = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - Rect<1> rect_w_grad = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); - Rect<3> rect_y_grad = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); - assert(acc_x.accessor.is_dense_arbitrary(rect_x)); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - assert(acc_replica_grad.accessor.is_dense_arbitrary(rect_replica_grad)); - assert(acc_w_grad.accessor.is_dense_arbitrary(rect_w_grad)); - assert(acc_y_grad.accessor.is_dense_arbitrary(rect_y_grad)); - int input_size = rect_x.hi[0] - rect_x.lo[0] + 1; - int output_size = rect_y.hi[0] - rect_y.lo[0] + 1; - int batch_size = (rect_x.hi[1] - rect_x.lo[1] + 1) * LSTM_PER_NODE_LENGTH; - float const *x_ptr = acc_x.ptr(rect_x.lo); - float const *w_ptr = acc_w.ptr(rect_w.lo); - float const *y_ptr = acc_y.ptr(rect_y.lo); - float *replica_grad_ptr = acc_replica_grad.ptr(rect_replica_grad.lo); - // Note that w_grad might be bigger than w - assert(rect_w_grad.contains(rect_w)); - float *w_grad_ptr = acc_w_grad.ptr(rect_w_grad.lo); - float *bias_grad_ptr = w_grad_ptr + input_size; - float const *y_grad_ptr = acc_y_grad.ptr(rect_y_grad.lo); - cudaEvent_t t_start, t_end; - if (m->profiling_runtime) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start); - } - - cudaStream_t stream; - 
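// Parameter layout used by the cuBLAS calls below: for every output column the buffer stores
// input_size weights followed by one bias entry, hence the leading dimension input_size + 1
// and the bias gradient at offset input_size. dW and db are accumulated into w_grad
// (beta = 1.0), while the data gradient overwrites replica_grad (beta = 0.0).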
checkCUDA(cudaStreamCreate(&stream)); - checkCUDA(cublasSetStream(m->handle.blas, stream)); - // Compute weight gradient - checkCUDA(cublasSgemm(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - input_size, - output_size, - batch_size, - &alpha, - x_ptr, - input_size, - y_grad_ptr, - output_size, - &alpha, - w_grad_ptr, - input_size + 1)); - // Compute bias gradient - checkCUDA(cublasSgemv(m->handle.blas, - CUBLAS_OP_N, - output_size, - batch_size, - &alpha, - y_grad_ptr, - output_size, - m->one_ptr, - 1, - &alpha, - bias_grad_ptr, - input_size + 1)); - // Compute data gradient - checkCUDA(cublasSgemm(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_N, - input_size, - batch_size, - output_size, - &alpha, - w_ptr, - input_size + 1, - y_grad_ptr, - output_size, - &beta, - replica_grad_ptr, - input_size)); - if (m->profiling_runtime) { - cudaEventRecord(t_end); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("Linear backward time = %.2lfms\n", elapsed); - } -#ifdef PRINT_INTERMEDIATE_RESULT - print_tensor<1, float>(w_grad_ptr, rect_w_grad, "linear(bwd):w_grad"); -#endif -#endif -} - -/* - regions[0](O): input - regions[1..num_par_c](I): replicas -*/ -void Linear::backward2_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - float alpha = 1.0f; - LinearMeta const *m = *((LinearMeta **)task->args); - AccessorWO const acc_input(regions[0], FID_DATA); - Rect<3> rect_input = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - assert(acc_input.accessor.is_dense_arbitrary(rect_input)); - float *input_ptr = acc_input.ptr(rect_input.lo); - cudaStream_t stream; - checkCUDA(cudaStreamCreate(&stream)); - checkCUDA(cublasSetStream(m->handle.blas, stream)); - for (int i = 1; i < task->regions.size(); i++) { - AccessorRO const acc_replica(regions[i], FID_DATA); - Rect<3> rect_replica = runtime->get_index_space_domain( - ctx, task->regions[i].region.get_index_space()); - assert(rect_replica.volume() == rect_input.volume()); - assert(acc_replica.accessor.is_dense_arbitrary(rect_replica)); - float const *replica_ptr = acc_replica.ptr(rect_replica.lo); - if (i == 1) { - checkCUDA(cublasScopy( - m->handle.blas, rect_input.volume(), replica_ptr, 1, input_ptr, 1)); - } else { - checkCUDA(cublasSaxpy(m->handle.blas, - rect_input.volume(), - &alpha, - replica_ptr, - 1, - input_ptr, - 1)); - } - } -#endif -} - -void Linear::backward(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - int num_par_c = part_rect.hi[0] - part_rect.lo[0] + 1; - for (PointInRectIterator<2> it(part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - TaskLauncher launcher(RNN_LINEAR_BWD_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - // Add x - { - LogicalRegion x = runtime->get_logical_subregion_by_color(input_lp, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[0].region)); - launcher.add_field(0, FID_DATA); - } - // Add w - launcher.add_region_requirement( - RegionRequirement(params.subregions[num_par_c + dp[0]], - READ_ONLY, - EXCLUSIVE, - params.region)); - launcher.add_field(1, FID_DATA); - // Add y - { - LogicalRegion y = - 
runtime->get_logical_subregion_by_color(outputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(y, READ_ONLY, EXCLUSIVE, outputs[0].region)); - launcher.add_field(2, FID_DATA); - } - // Add replica_grad - { - LogicalRegion replica_grad = - runtime->get_logical_subregion_by_color(replica.partition_grad, dp); - launcher.add_region_requirement(RegionRequirement( - replica_grad, WRITE_ONLY, EXCLUSIVE, replica.region_grad)); - launcher.add_field(3, FID_DATA); - } - // Add w_grad - launcher.add_region_requirement( - RegionRequirement(params.gradients[paraConfig.gpu[idx]], - READ_WRITE, - EXCLUSIVE, - params.gradients[paraConfig.gpu[idx]])); - launcher.add_field(4, FID_DATA); - // Add y_grad - { - LogicalRegion y_grad = runtime->get_logical_subregion_by_color( - outputs[0].partition_grad, dp); - launcher.add_region_requirement(RegionRequirement( - y_grad, READ_ONLY, EXCLUSIVE, outputs[0].region_grad)); - launcher.add_field(5, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } - - // We aggregate data from replica tensor to input tensor - idx = 0; - for (PointInRectIterator<1> it(input_part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - TaskLauncher launcher(RNN_LINEAR_BWD2_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - LogicalRegion input = - runtime->get_logical_subregion_by_color(inputs[0].partition_grad, dp); - launcher.add_region_requirement( - RegionRequirement(input, WRITE_ONLY, EXCLUSIVE, inputs[0].region_grad)); - launcher.add_field(0, FID_DATA); - int num_par_c = part_rect.hi[0] - part_rect.lo[0] + 1; - for (int i = 0; i < num_par_c; i++) { - LogicalRegion r = - runtime->get_logical_subregion_by_color(replica_sub_lps[i], dp); - launcher.add_region_requirement( - RegionRequirement(r, READ_ONLY, EXCLUSIVE, replica.region_grad)); - launcher.add_field(i + 1, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } -} - -void Linear::update_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) {} - -void Linear::update(RnnModel const &model) {} diff --git a/nmt/lstm.cu b/nmt/lstm.cu deleted file mode 100644 index 1a405bb1a0..0000000000 --- a/nmt/lstm.cu +++ /dev/null @@ -1,652 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "../cnn_helper.h" -#include "rnn.h" -#include "rnn_mapper.h" - -struct LSTMInitParams { - DnnHandle handle; - int batchSize, inputSize, outputSize; -}; - -LSTMTensors RnnModel::add_lstm_node( - Tensor x, Tensor hx, Tensor cx, ParallelConfig pc, SharedVariable params) { - assert(x.numDim == 3); - assert(hx.numDim == 2); - assert(cx.numDim == 2); - assert(x.adim[2] == LSTM_PER_NODE_LENGTH); - assert(x.pdim[2] == LSTM_PER_NODE_LENGTH); - int batch_size = x.adim[1]; - assert(hx.adim[1] == batch_size); - assert(cx.adim[1] == batch_size); - int input_size = x.adim[0]; - int output_size = hx.adim[0]; - assert(cx.adim[0] == output_size); - LSTM *node = new LSTM( - config, x, hx, cx, batch_size, input_size, output_size, pc, params); - layers.push_back(node); - LSTMTensors output; - output.x = node->outputs[0]; - output.hx = node->outputs[1]; - output.cx = node->outputs[2]; - return output; -} - -/* - output[0]: y - output[1]: hy - output[2]: cy - */ -LSTM::LSTM(RnnConfig config, - Tensor x, - Tensor hx, - Tensor cx, - int _batch_size, - int _input_size, - int _output_size, - ParallelConfig pc, - SharedVariable _params) - : RnnOp(x, hx, cx, pc, _params), batch_size(_batch_size), - input_size(_input_size), output_size(_output_size) { - printf("LSTM node: batch(%d) input(%d) output(%d)\n", - batch_size, - input_size, - output_size); - Context ctx = config.lg_ctx; - HighLevelRuntime *runtime = config.lg_hlr; - assert(pc.nDims == 1); - { - Rect<1> rect(Point<1>(0), Point<1>(pc.dim[0] - 1)); - part_rect = rect; - } - IndexSpaceT<1> part_is = runtime->create_index_space(ctx, part_rect); - FieldSpace fs = config.field_space; - Rect<3, coord_t> y_rect( - Point<3>(0, 0, 0), - Point<3>(output_size - 1, batch_size - 1, LSTM_PER_NODE_LENGTH - 1)); - IndexSpaceT<3> y_is = runtime->create_index_space(ctx, y_rect); - LogicalRegion y_lr = runtime->create_logical_region(ctx, y_is, fs); - LogicalRegion y_grad_lr = runtime->create_logical_region(ctx, y_is, fs); - int num_par_n = part_rect.hi[0] - part_rect.lo[0] + 1; - assert(batch_size % num_par_n == 0); - int extent_n = batch_size / num_par_n; - int extent_c = output_size; - Rect<3, coord_t> extent( - Point<3>(0, 0, 0), - Point<3>(extent_c - 1, extent_n - 1, LSTM_PER_NODE_LENGTH - 1)); - Transform<3, 1, coord_t> trans; - trans[0][0] = 0; - trans[1][0] = extent_n; - trans[2][0] = 0; - IndexPartition y_ip = runtime->create_partition_by_restriction( - ctx, y_is, part_is, trans, extent); - assert(runtime->is_index_partition_disjoint(ctx, y_ip)); - assert(runtime->is_index_partition_complete(ctx, y_ip)); - LogicalPartition y_lp = runtime->get_logical_partition(ctx, y_lr, y_ip); - LogicalPartition y_grad_lp = - runtime->get_logical_partition(ctx, y_grad_lr, y_ip); - outputs[0].region = y_lr; - outputs[0].region_grad = y_grad_lr; - outputs[0].partition = y_lp; - outputs[0].partition_grad = y_grad_lp; - outputs[0].numDim = 3; - outputs[0].adim[0] = output_size; - outputs[0].adim[1] = batch_size; - outputs[0].adim[2] = LSTM_PER_NODE_LENGTH; - outputs[0].pdim[0] = extent_c; - outputs[0].pdim[1] = extent_n; - outputs[0].pdim[2] = LSTM_PER_NODE_LENGTH; - - Rect<2, coord_t> hy_rect(Point<2>(0, 0), - Point<2>(output_size - 1, batch_size - 1)); - IndexSpaceT<2> hy_is = runtime->create_index_space(ctx, hy_rect); - LogicalRegion hy_lr = runtime->create_logical_region(ctx, hy_is, fs); - LogicalRegion hy_grad_lr = runtime->create_logical_region(ctx, hy_is, fs); - Rect<2, coord_t> hy_ext(Point<2>(0, 0), Point<2>(extent_c - 1, extent_n - 1)); - Transform<2, 1, coord_t> 
hy_trans; - hy_trans[0][0] = 0; - hy_trans[1][0] = extent_n; - IndexPartition hy_ip = runtime->create_partition_by_restriction( - ctx, hy_is, part_is, hy_trans, hy_ext); - assert(runtime->is_index_partition_disjoint(ctx, hy_ip)); - assert(runtime->is_index_partition_complete(ctx, hy_ip)); - LogicalPartition hy_lp = runtime->get_logical_partition(ctx, hy_lr, hy_ip); - LogicalPartition hy_grad_lp = - runtime->get_logical_partition(ctx, hy_grad_lr, hy_ip); - outputs[1].region = hy_lr; - outputs[1].region_grad = hy_grad_lr; - outputs[1].partition = hy_lp; - outputs[1].partition_grad = hy_grad_lp; - outputs[1].numDim = 2; - outputs[1].adim[0] = output_size; - outputs[1].adim[1] = batch_size; - outputs[1].pdim[0] = extent_c; - outputs[1].pdim[1] = extent_n; - - LogicalRegion cy_lr = runtime->create_logical_region(ctx, hy_is, fs); - LogicalRegion cy_grad_lr = runtime->create_logical_region(ctx, hy_is, fs); - LogicalPartition cy_lp = runtime->get_logical_partition(ctx, cy_lr, hy_ip); - LogicalPartition cy_grad_lp = - runtime->get_logical_partition(ctx, cy_grad_lr, hy_ip); - outputs[2] = outputs[1]; - outputs[2].region = cy_lr; - outputs[2].region_grad = cy_grad_lr; - outputs[2].partition = cy_lp; - outputs[2].partition_grad = cy_grad_lp; -} - -/* - regions[0] (I): x - regions[1] (I): hx - regions[2] (I): cx - regions[3] (I): w - regions[4] (O): y - regions[5] (O): hy - regions[6] (O): cy -*/ -OpMeta *LSTM::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - int const numLayers = 1; - int const seqLength = LSTM_PER_NODE_LENGTH; - float const dropoutRate = 0.2f; - assert(regions.size() == 7); - assert(task->regions.size() == 7); - Rect<1> para_rect = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - LSTMInitParams const *lstm = (LSTMInitParams *)task->args; - LSTMMeta *m = new LSTMMeta(lstm->handle); -#ifndef DISABLE_COMPUTATION - checkCUDNN(cudnnCreateRNNDescriptor(&m->rnnDesc)); - checkCUDNN(cudnnCreateDropoutDescriptor(&m->dropoutDesc)); - size_t dropoutSize; - void *dropoutStates; - checkCUDNN(cudnnDropoutGetStatesSize(m->handle.dnn, &dropoutSize)); - checkCUDA(cudaMalloc(&dropoutStates, dropoutSize)); - checkCUDNN(cudnnSetDropoutDescriptor(m->dropoutDesc, - m->handle.dnn, - dropoutRate, - dropoutStates, - dropoutSize, - 10 /*seed*/)); - checkCUDNN(cudnnSetRNNDescriptor_v5(m->rnnDesc, - lstm->outputSize, - numLayers, - m->dropoutDesc, - CUDNN_LINEAR_INPUT, - CUDNN_UNIDIRECTIONAL, - CUDNN_LSTM, - CUDNN_DATA_FLOAT)); - for (int i = 0; i < seqLength; i++) { - checkCUDNN(cudnnCreateTensorDescriptor(&m->xDescs[i])); - int dims[] = {lstm->batchSize, lstm->inputSize, 1}; - int strides[] = {dims[1] * dims[2], dims[2], 1}; - checkCUDNN(cudnnSetTensorNdDescriptor( - m->xDescs[i], CUDNN_DATA_FLOAT, 3, dims, strides)); - } - size_t workSpaceSize; - checkCUDNN(cudnnGetRNNWorkspaceSize( - m->handle.dnn, m->rnnDesc, seqLength, m->xDescs, &workSpaceSize)); - // Assert that we have enough work space - assert(workSpaceSize <= m->handle.workSpaceSize); - checkCUDNN(cudnnGetRNNTrainingReserveSize( - m->handle.dnn, m->rnnDesc, seqLength, m->xDescs, &m->reserveSpaceSize)); - checkCUDA(cudaMalloc(&m->reserveSpace, m->reserveSpaceSize)); - size_t paramsSize; - checkCUDNN(cudnnGetRNNParamsSize( - m->handle.dnn, m->rnnDesc, m->xDescs[0], ¶msSize, CUDNN_DATA_FLOAT)); - assert(paramsSize == sizeof(float) * para_rect.volume()); - { - int dims[] = {(int)paramsSize, 1, 1}; - checkCUDNN(cudnnCreateFilterDescriptor(&m->wDesc)); - 
checkCUDNN(cudnnSetFilterNdDescriptor( - m->wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dims)); - } - { - checkCUDNN(cudnnCreateTensorDescriptor(&m->hxDesc)); - checkCUDNN(cudnnCreateTensorDescriptor(&m->cxDesc)); - checkCUDNN(cudnnCreateTensorDescriptor(&m->hyDesc)); - checkCUDNN(cudnnCreateTensorDescriptor(&m->cyDesc)); - int dims[] = {numLayers, lstm->batchSize, lstm->outputSize}; - int strides[] = {dims[1] * dims[2], dims[2], 1}; - checkCUDNN(cudnnSetTensorNdDescriptor( - m->hxDesc, CUDNN_DATA_FLOAT, 3, dims, strides)); - checkCUDNN(cudnnSetTensorNdDescriptor( - m->cxDesc, CUDNN_DATA_FLOAT, 3, dims, strides)); - checkCUDNN(cudnnSetTensorNdDescriptor( - m->hyDesc, CUDNN_DATA_FLOAT, 3, dims, strides)); - checkCUDNN(cudnnSetTensorNdDescriptor( - m->cyDesc, CUDNN_DATA_FLOAT, 3, dims, strides)); - } - for (int i = 0; i < seqLength; i++) { - checkCUDNN(cudnnCreateTensorDescriptor(&m->yDescs[i])); - int dims[] = {lstm->batchSize, lstm->outputSize, 1}; - int strides[] = {dims[1] * dims[2], dims[2], 1}; - checkCUDNN(cudnnSetTensorNdDescriptor( - m->yDescs[i], CUDNN_DATA_FLOAT, 3, dims, strides)); - } - m->profiling_runtime = true; - return m; -#endif -} - -void LSTM::init(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - LSTMInitParams initParams; - initParams.handle = model.dnn_handlers[paraConfig.gpu[idx]]; - initParams.batchSize = outputs[0].pdim[1]; - initParams.inputSize = inputs[0].pdim[0]; - initParams.outputSize = outputs[0].pdim[0]; - // For now assume batch sizes equal - assert(inputs[0].pdim[1] == outputs[0].pdim[1]); - - TaskLauncher launcher(LSTM_INIT_TASK_ID, - TaskArgument(&initParams, sizeof(initParams)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - // add region requirements for x, hx, cx - for (int i = 0; i < 3; i++) { - LogicalRegion x = - runtime->get_logical_subregion_by_color(inputs[i].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[i].region)); - launcher.add_field(i, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.region, READ_ONLY, EXCLUSIVE, params.region)); - launcher.add_field(3, FID_DATA); - for (int i = 0; i < 3; i++) { - LogicalRegion x = - runtime->get_logical_subregion_by_color(outputs[i].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, WRITE_ONLY, EXCLUSIVE, outputs[i].region)); - launcher.add_field(4 + i, FID_DATA); - } - Future f = runtime->execute_task(ctx, launcher); - meta[idx] = f.get_result(); - } -} - -/* - regions[0] (I): x - regions[1] (I): hx - regions[2] (I): cx - regions[3] (I): w - regions[4] (O): y - regions[5] (O): hy - regions[6] (O): cy -*/ -void LSTM::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - assert(regions.size() == 7); - assert(task->regions.size() == 7); - LSTMMeta const *m = *((LSTMMeta **)task->args); - AccessorRO const acc_x(regions[0], FID_DATA); - AccessorRO const acc_hx(regions[1], FID_DATA); - AccessorRO const acc_cx(regions[2], FID_DATA); - AccessorRO const acc_w(regions[3], FID_DATA); - AccessorWO const acc_y(regions[4], FID_DATA); - AccessorWO const acc_hy(regions[5], FID_DATA); - AccessorWO const acc_cy(regions[6], FID_DATA); - Rect<3> rect_x, rect_y; - Rect<2> rect_hx, rect_cx, rect_hy, rect_cy; - Rect<1> rect_w; - rect_x = 
runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - rect_hx = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - rect_cx = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - rect_w = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - rect_y = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); - rect_hy = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); - rect_cy = runtime->get_index_space_domain( - ctx, task->regions[6].region.get_index_space()); - assert(acc_x.accessor.is_dense_arbitrary(rect_x)); - assert(acc_hx.accessor.is_dense_arbitrary(rect_hx)); - assert(acc_cx.accessor.is_dense_arbitrary(rect_cx)); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - assert(acc_hy.accessor.is_dense_arbitrary(rect_hy)); - assert(acc_cy.accessor.is_dense_arbitrary(rect_cy)); - assert(rect_hx == rect_cx); - assert(rect_hx == rect_hy); - assert(rect_hx == rect_cy); - float const *x_ptr = acc_x.ptr(rect_x.lo); - float const *hx_ptr = acc_hx.ptr(rect_hx.lo); - float const *cx_ptr = acc_cx.ptr(rect_cx.lo); - float const *w_ptr = acc_w.ptr(rect_w.lo); - float *y_ptr = acc_y.ptr(rect_y.lo); - float *hy_ptr = acc_hy.ptr(rect_hy.lo); - float *cy_ptr = acc_cy.ptr(rect_cy.lo); - cudaEvent_t t_start, t_end; - if (m->profiling_runtime) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start); - } - cudaStream_t stream; - checkCUDA(cudaStreamCreate(&stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - checkCUDNN(cudnnRNNForwardTraining(m->handle.dnn, - m->rnnDesc, - LSTM_PER_NODE_LENGTH /*seqLength*/, - m->xDescs, - x_ptr, - m->hxDesc, - hx_ptr, - m->cxDesc, - cx_ptr, - m->wDesc, - w_ptr, - m->yDescs, - y_ptr, - m->hyDesc, - hy_ptr, - m->cyDesc, - cy_ptr, - m->handle.workSpace, - m->handle.workSpaceSize, - m->reserveSpace, - m->reserveSpaceSize)); - if (m->profiling_runtime) { - cudaEventRecord(t_end); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("LSTM forward time = %.2fms\n", elapsed); - } -#ifdef PRINT_INTERMEDIATE_RESULT - print_tensor<3, float>(y_ptr, rect_y, "lstm_fwd:y"); -#endif -#endif -} - -void LSTM::forward(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - TaskLauncher launcher(LSTM_FWD_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - // add region requirements for x, hx, cx - for (int i = 0; i < 3; i++) { - LogicalRegion x = - runtime->get_logical_subregion_by_color(inputs[i].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[i].region)); - launcher.add_field(i, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.region, READ_ONLY, EXCLUSIVE, params.region)); - launcher.add_field(3, FID_DATA); - for (int i = 0; i < 3; i++) { - LogicalRegion x = - runtime->get_logical_subregion_by_color(outputs[i].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, WRITE_ONLY, EXCLUSIVE, outputs[i].region)); - 
launcher.add_field(4 + i, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } -} - -/* - regions[0] (I): x - regions[1] (I): hx - regions[2] (I): cx - regions[3] (I): w - regions[4] (I): y - regions[5] (I): hy - regions[6] (I): cy - regions[7] (O): x_grad - regions[8] (O): hx_grad - regions[9] (O): cx_grad - regions[10] (I/O): w_grad - regions[11] (I): y_grad - regions[12] (I): hy_grad - regions[13] (I): cy_grad -*/ -void LSTM::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - assert(regions.size() == 14); - assert(task->regions.size() == 14); - LSTMMeta const *m = *((LSTMMeta **)task->args); - AccessorRO const acc_x(regions[0], FID_DATA); - AccessorRO const acc_hx(regions[1], FID_DATA); - AccessorRO const acc_cx(regions[2], FID_DATA); - AccessorRO const acc_w(regions[3], FID_DATA); - AccessorRO const acc_y(regions[4], FID_DATA); - AccessorRO const acc_hy(regions[5], FID_DATA); - AccessorRO const acc_cy(regions[6], FID_DATA); - AccessorWO const acc_x_grad(regions[7], FID_DATA); - AccessorWO const acc_hx_grad(regions[8], FID_DATA); - AccessorWO const acc_cx_grad(regions[9], FID_DATA); - AccessorRW const acc_w_grad(regions[10], FID_DATA); - AccessorRO const acc_y_grad(regions[11], FID_DATA); - AccessorRO const acc_hy_grad(regions[12], FID_DATA); - AccessorRO const acc_cy_grad(regions[13], FID_DATA); - - Rect<3> rect_x, rect_y, rect_x_grad, rect_y_grad; - Rect<2> rect_hx, rect_cx, rect_hy, rect_cy, rect_hx_grad, rect_cx_grad, - rect_hy_grad, rect_cy_grad; - Rect<1> rect_w, rect_w_grad; - rect_x = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - rect_hx = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - rect_cx = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - rect_w = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - rect_y = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); - rect_hy = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); - rect_cy = runtime->get_index_space_domain( - ctx, task->regions[6].region.get_index_space()); - rect_x_grad = runtime->get_index_space_domain( - ctx, task->regions[7].region.get_index_space()); - rect_hx_grad = runtime->get_index_space_domain( - ctx, task->regions[8].region.get_index_space()); - rect_cx_grad = runtime->get_index_space_domain( - ctx, task->regions[9].region.get_index_space()); - rect_w_grad = runtime->get_index_space_domain( - ctx, task->regions[10].region.get_index_space()); - rect_y_grad = runtime->get_index_space_domain( - ctx, task->regions[11].region.get_index_space()); - rect_hy_grad = runtime->get_index_space_domain( - ctx, task->regions[12].region.get_index_space()); - rect_cy_grad = runtime->get_index_space_domain( - ctx, task->regions[13].region.get_index_space()); - - assert(acc_x.accessor.is_dense_arbitrary(rect_x)); - assert(acc_hx.accessor.is_dense_arbitrary(rect_hx)); - assert(acc_cx.accessor.is_dense_arbitrary(rect_cx)); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - assert(acc_hy.accessor.is_dense_arbitrary(rect_hy)); - assert(acc_cy.accessor.is_dense_arbitrary(rect_cy)); - assert(acc_x_grad.accessor.is_dense_arbitrary(rect_x_grad)); - assert(acc_hx_grad.accessor.is_dense_arbitrary(rect_hx_grad)); - assert(acc_cx_grad.accessor.is_dense_arbitrary(rect_cx_grad)); - 
assert(acc_w_grad.accessor.is_dense_arbitrary(rect_w_grad)); - assert(acc_y_grad.accessor.is_dense_arbitrary(rect_y_grad)); - assert(acc_hy_grad.accessor.is_dense_arbitrary(rect_hy_grad)); - assert(acc_cy_grad.accessor.is_dense_arbitrary(rect_cy_grad)); - - float const *x_ptr = acc_x.ptr(rect_x.lo); - float const *hx_ptr = acc_hx.ptr(rect_hx.lo); - float const *cx_ptr = acc_cx.ptr(rect_cx.lo); - float const *w_ptr = acc_w.ptr(rect_w.lo); - float const *y_ptr = acc_y.ptr(rect_y.lo); - float const *hy_ptr = acc_hy.ptr(rect_hy.lo); - float const *cy_ptr = acc_cy.ptr(rect_cy.lo); - float *x_grad_ptr = acc_x_grad.ptr(rect_x_grad.lo); - float *hx_grad_ptr = acc_hx_grad.ptr(rect_hx_grad.lo); - float *cx_grad_ptr = acc_cx_grad.ptr(rect_cx_grad.lo); - float *w_grad_ptr = acc_w_grad.ptr(rect_w_grad.lo); - float const *y_grad_ptr = acc_y_grad.ptr(rect_y_grad.lo); - float const *hy_grad_ptr = acc_hy_grad.ptr(rect_hy_grad.lo); - float const *cy_grad_ptr = acc_cy_grad.ptr(rect_cy_grad.lo); - - cudaEvent_t t_start, t_end; - if (m->profiling_runtime) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start); - } - cudaStream_t stream; - checkCUDA(cudaStreamCreate(&stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - checkCUDNN(cudnnRNNBackwardData(m->handle.dnn, - m->rnnDesc, - LSTM_PER_NODE_LENGTH /*seqLength*/, - m->yDescs, - y_ptr, - m->yDescs, - y_grad_ptr, - m->hyDesc, - hy_grad_ptr, - m->cyDesc, - cy_grad_ptr, - m->wDesc, - w_ptr, - m->hxDesc, - hx_ptr, - m->cxDesc, - cx_ptr, - m->xDescs, - x_grad_ptr, - m->hxDesc, - hx_grad_ptr, - m->cxDesc, - cx_grad_ptr, - m->handle.workSpace, - m->handle.workSpaceSize, - m->reserveSpace, - m->reserveSpaceSize)); - checkCUDNN(cudnnRNNBackwardWeights(m->handle.dnn, - m->rnnDesc, - LSTM_PER_NODE_LENGTH /*seqLength*/, - m->xDescs, - x_ptr, - m->hxDesc, - hx_ptr, - m->yDescs, - y_ptr, - m->handle.workSpace, - m->handle.workSpaceSize, - m->wDesc, - w_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); - if (m->profiling_runtime) { - cudaEventRecord(t_end); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("LSTM backward time = %.2fms\n", elapsed); - } -#ifdef PRINT_INTERMEDIATE_RESULT - print_tensor<1, float>(w_grad_ptr, rect_w_grad, "lstm_bwd:w_grad"); - print_tensor<3, float>(x_grad_ptr, rect_x_grad, "lstm_bwd:x_grad"); - print_tensor<2, float>(hx_grad_ptr, rect_hx_grad, "lstm_bwd:hx_grad"); - print_tensor<2, float>(cx_grad_ptr, rect_cx_grad, "lstm_bwd:cx_grad"); -#endif -#endif -} - -void LSTM::backward(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - DomainPoint dp(*it); - TaskLauncher launcher(LSTM_BWD_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - // add region requirements for x, hx, cx - for (int i = 0; i < 3; i++) { - LogicalRegion x = - runtime->get_logical_subregion_by_color(inputs[i].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[i].region)); - launcher.add_field(i, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.region, READ_ONLY, EXCLUSIVE, params.region)); - launcher.add_field(3, FID_DATA); - for (int i = 0; i < 3; i++) { - LogicalRegion x 
= - runtime->get_logical_subregion_by_color(outputs[i].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, outputs[i].region)); - launcher.add_field(4 + i, FID_DATA); - } - // add region requirements for gradients - for (int i = 0; i < 3; i++) { - LogicalRegion x = - runtime->get_logical_subregion_by_color(inputs[i].partition_grad, dp); - launcher.add_region_requirement( - RegionRequirement(x, WRITE_ONLY, EXCLUSIVE, inputs[i].region_grad)); - launcher.add_field(7 + i, FID_DATA); - } - launcher.add_region_requirement( - RegionRequirement(params.gradients[paraConfig.gpu[idx]], - READ_WRITE, - EXCLUSIVE, - params.gradients[paraConfig.gpu[idx]])); - launcher.add_field(10, FID_DATA); - for (int i = 0; i < 3; i++) { - LogicalRegion x = runtime->get_logical_subregion_by_color( - outputs[i].partition_grad, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, outputs[i].region_grad)); - launcher.add_field(11 + i, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } -} - -void LSTM::update(RnnModel const &model) {} diff --git a/nmt/nmt.cc b/nmt/nmt.cc deleted file mode 100644 index cc8c09024b..0000000000 --- a/nmt/nmt.cc +++ /dev/null @@ -1,359 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "rnn.h" -#include "rnn_mapper.h" -#include - -using namespace Legion; - -LegionRuntime::Logger::Category log_nmt("nmt"); - -void parse_input_args(char **argv, - int argc, - int &batch_size, - int &num_layers, - int &seq_length, - int &hidden_size, - int &embed_size); - -void set_global_config(GlobalConfig &global, - int num_layers, - int seq_length, - int workers_per_node, - int num_nodes); - -void top_level_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - int bs_per_worker = 64; - int num_layers = 2; - int seq_length = 20; - int hidden_size = 2048; - int embed_size = 2048; - int vocab_size = 20 * 1024; - int num_nodes = 1; - int workers_per_node = 1; - int num_parts = workers_per_node * num_nodes; - int batch_size = bs_per_worker * num_parts; - int num_iterations = 10; - { - InputArgs const &command_args = HighLevelRuntime::get_input_args(); - char **argv = command_args.argv; - int argc = command_args.argc; - parse_input_args(argv, - argc, - batch_size, - num_layers, - seq_length, - hidden_size, - embed_size); - } - GlobalConfig global; - set_global_config( - global, num_layers, seq_length, workers_per_node, num_nodes); - RnnModel model(batch_size, - num_layers, - seq_length, - hidden_size, - embed_size, - vocab_size, - num_parts, - num_nodes, - workers_per_node, - global, - ctx, - runtime); - ArgumentMap local_args; - size_t workSpaceSize = (size_t)2 * 1024 * 1024 * 1024; - Rect<1> workers_rect(Point<1>(0), Point<1>(num_nodes * workers_per_node - 1)); - int idx = 0; - for (PointInRectIterator<1> it(workers_rect); it(); it++) { - TaskLauncher launcher(CUDNN_INIT_TASK_ID, - TaskArgument(&workSpaceSize, sizeof(workSpaceSize)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(idx)); - Future f = runtime->execute_task(ctx, launcher); - model.dnn_handlers[idx++] = f.get_result(); - } - - model.init(); - double ts_start = Realm::Clock::current_time_in_microseconds(); - for (int i = 0; i < num_iterations; i++) { - model.forward(); - model.backward(); - model.update(); - } - runtime->issue_execution_fence(ctx); - TimingLauncher timer(MEASURE_MICRO_SECONDS); - Future future = runtime->issue_timing_measurement(ctx, timer); - future.get_void_result(); - double ts_end = Realm::Clock::current_time_in_microseconds(); - double run_time = 1e-6 * (ts_end - ts_start); - printf("time = %.4fs\n", run_time); -} - -int main(int argc, char **argv) { - Runtime::set_top_level_task_id(TOP_LEVEL_TASK_ID); - { - TaskVariantRegistrar registrar(TOP_LEVEL_TASK_ID, "top_level"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - // registrar.set_inner(); - Runtime::preregister_task_variant(registrar, "top_level"); - } - - // DNN_INIT_TASK - { - TaskVariantRegistrar registrar(CUDNN_INIT_TASK_ID, "cudnn_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant(registrar, - "cudnn_init_task"); - } - // - { - TaskVariantRegistrar registrar(WORD_INIT_TASK_ID, "word_init_task(dummy)"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "word_init_task(dummy)"); - } - // Word Embedding task - { - TaskVariantRegistrar registrar(EMBED_INIT_TASK_ID, "embed_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "embed_init_task"); - } - { - 
TaskVariantRegistrar registrar(EMBED_FWD_TASK_ID, "embed_fwd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant(registrar, - "embed_fwd_task"); - } - { - TaskVariantRegistrar registrar(EMBED_BWD_TASK_ID, "embed_bwd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant(registrar, - "embed_bwd_task"); - } - // LSTM task - { - TaskVariantRegistrar registrar(LSTM_INIT_TASK_ID, "lstm_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "lstm_init_task"); - } - { - TaskVariantRegistrar registrar(LSTM_FWD_TASK_ID, "lstm_fwd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant(registrar, - "lstm_fwd_task"); - } - { - TaskVariantRegistrar registrar(LSTM_BWD_TASK_ID, "lstm_bwd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant(registrar, - "lstm_bwd_task"); - } - // Rnn Linear task - { - TaskVariantRegistrar registrar(RNN_LINEAR_INIT_TASK_ID, "linear_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "linear_init_task"); - } - { - TaskVariantRegistrar registrar(RNN_LINEAR_FWD_TASK_ID, "linar_fwd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant(registrar, - "linear_fwd_task"); - } - { - TaskVariantRegistrar registrar(RNN_LINEAR_BWD_TASK_ID, "linear_bwd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant(registrar, - "linear_bwd_task"); - } - { - TaskVariantRegistrar registrar(RNN_LINEAR_BWD2_TASK_ID, "linear_bwd2_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "linear_bwd2_task"); - } - // Softmax (Data Parallel Implementation) task - { - TaskVariantRegistrar registrar(RNN_SOFTMAXDP_INIT_TASK_ID, - "softmaxDP_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "softmaxDP_init_task"); - } - { - TaskVariantRegistrar registrar(RNN_SOFTMAXDP_FWD_TASK_ID, - "softmaxDP_fwd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "softmaxDP_fwd_task"); - } - { - TaskVariantRegistrar registrar(RNN_SOFTMAXDP_BWD_TASK_ID, - "softmaxDP_bwd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "softmaxDP_bwd_task"); - } - // Params related tasks - { - TaskVariantRegistrar registrar(PARAMS_INIT_TASK_ID, "params_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "params_init_task"); - } - { - TaskVariantRegistrar registrar(ZERO_1D_INIT_TASK_ID, "zero_1d_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - 
registrar, "zero_1d_init_task"); - } - { - TaskVariantRegistrar registrar(ZERO_2D_INIT_TASK_ID, "zero_2d_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "zero_2d_init_task"); - } - { - TaskVariantRegistrar registrar(ZERO_3D_INIT_TASK_ID, "zero_3d_init_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "zero_3d_init_task"); - } - { - TaskVariantRegistrar registrar(PARAMS_UPD_TASK_ID, "params_upd_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant( - registrar, "params_upd_task"); - } - // Dummy tasks - { - TaskVariantRegistrar registrar(DUMMY_TASK_ID, "dummy_task"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - Runtime::preregister_task_variant(registrar, - "dummy_task"); - } - - Runtime::add_registration_callback(update_mappers); - return Runtime::start(argc, argv); -} - -void parse_input_args(char **argv, - int argc, - int &batch_size, - int &num_layers, - int &seq_length, - int &hidden_size, - int &embed_size) { - for (int i = 1; i < argc; i++) { - if (!strcmp(argv[i], "-b")) { - batch_size = atoi(argv[++i]); - continue; - } - if (!strcmp(argv[i], "-l")) { - num_layers = atoi(argv[++i]); - continue; - } - if (!strcmp(argv[i], "-s")) { - seq_length = atoi(argv[++i]); - continue; - } - if (!strcmp(argv[i], "-h")) { - hidden_size = atoi(argv[++i]); - continue; - } - if (!strcmp(argv[i], "-e")) { - embed_size = atoi(argv[++i]); - continue; - } - } -} - -void set_global_config(GlobalConfig &global, - int num_layers, - int seq_length, - int workers_per_node, - int num_nodes) { - int num_parts = workers_per_node * num_nodes; - for (int i = 0; i * LSTM_PER_NODE_LENGTH < 2 * seq_length; i++) { - ParallelConfig pc; - pc.nDims = 1; - pc.dim[0] = num_parts; - for (int j = 0; j < num_parts; j++) { - pc.gpu[j] = i * LSTM_PER_NODE_LENGTH < seq_length ? 0 : 1; - } - // pc.gpu[j] = j; - global.embed[i] = pc; - } - for (int i = 0; i < num_layers; i++) { - for (int j = 0; j * LSTM_PER_NODE_LENGTH < 2 * seq_length; j++) { - ParallelConfig pc; - pc.nDims = 1; - pc.dim[0] = num_parts; - for (int k = 0; k < num_parts; k++) { - pc.gpu[k] = k; - } - global.lstm[i][j] = pc; - } - } - for (int i = 0; i * LSTM_PER_NODE_LENGTH < seq_length; i++) { - ParallelConfig pc; - pc.nDims = 2; - pc.dim[0] = 1; - pc.dim[1] = num_parts; - for (int j = 0; j < num_parts; j++) { - pc.gpu[j] = j; - } - global.linear[i] = pc; - } - for (int i = 0; i * LSTM_PER_NODE_LENGTH < seq_length; i++) { - ParallelConfig pc; - pc.nDims = 1; - pc.dim[0] = num_parts; - for (int j = 0; j < num_parts; j++) { - pc.gpu[j] = j; - } - global.softmax[i] = pc; - } -} diff --git a/nmt/ops.h b/nmt/ops.h deleted file mode 100644 index d6faf662a0..0000000000 --- a/nmt/ops.h +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _LEGION_CNN_OPS_H_ -#define _LEGION_CNN_OPS_H_ - -// #define DISABLE_COMPUTATION -#include "legion.h" -#include -#include -#include -#include -#include -using namespace Legion; - -template -using AccessorRO = - FieldAccessor>; -template -using AccessorRW = - FieldAccessor>; -template -using AccessorWO = - FieldAccessor>; - -#define MAX_NUM_INPUTS 6 -#define MAX_NUM_OUTPUTS 6 -#define MAX_NUM_LOCALS 3 -#define MAX_NUM_WORKERS 16 -#define MAX_NUM_PARTS 16 -#define MAX_DIM 4 -#define MAX_FILENAME 200 - -enum TaskIDs { - TOP_LEVEL_TASK_ID, - CUDNN_INIT_TASK_ID, - IMAGE_INIT_TASK_ID, - LABEL_INIT_TASK_ID, - LOAD_IMAGES_TASK_ID, - NORMALIZE_IMAGES_TASK_ID, - CONV2D_INIT_TASK_ID, - CONV2D_INIT_PARA_TASK_ID, - CONV2D_FWD_TASK_ID, - CONV2D_BWD_TASK_ID, - CONV2D_UPD_TASK_ID, - POOL2D_INIT_TASK_ID, - POOL2D_FWD_TASK_ID, - POOL2D_BWD_TASK_ID, - LINEAR_INIT_TASK_ID, - LINEAR_INIT_PARA_TASK_ID, - LINEAR_FWD_TASK_ID, - LINEAR_BWD_TASK_ID, - LINEAR_BWD2_TASK_ID, - LINEAR_UPD_TASK_ID, - FLAT_INIT_TASK_ID, - FLAT_FWD_TASK_ID, - FLAT_BWD_TASK_ID, - SOFTMAX_INIT_TASK_ID, - SOFTMAX_FWD_TASK_ID, - SOFTMAX_BWD_TASK_ID, - CONCAT_INIT_TASK_ID, - CONCAT_FWD_TASK_ID, - CONCAT_BWD_TASK_ID, - // RNN Task IDs - LSTM_INIT_TASK_ID, - LSTM_FWD_TASK_ID, - LSTM_BWD_TASK_ID, - RNN_LINEAR_INIT_TASK_ID, - RNN_LINEAR_FWD_TASK_ID, - RNN_LINEAR_BWD_TASK_ID, - RNN_LINEAR_BWD2_TASK_ID, - EMBED_INIT_TASK_ID, - EMBED_FWD_TASK_ID, - EMBED_BWD_TASK_ID, - RNN_SOFTMAXDP_INIT_TASK_ID, - RNN_SOFTMAXDP_FWD_TASK_ID, - RNN_SOFTMAXDP_BWD_TASK_ID, - PARAMS_INIT_TASK_ID, - PARAMS_UPD_TASK_ID, - WORD_INIT_TASK_ID, // DUMMY_TASK_ID: To be removed - ZERO_1D_INIT_TASK_ID, - ZERO_2D_INIT_TASK_ID, - ZERO_3D_INIT_TASK_ID, - // Dummy task ID - DUMMY_TASK_ID, -}; - -enum Pool2DType { - POOL2D_MAX, - POOL2D_AVG, -}; - -enum FieldIDs { - FID_DATA, -}; - -struct DnnHandle { -#ifndef DISABLE_COMPUTATION - cudnnHandle_t dnn; - cublasHandle_t blas; -#endif - void *workSpace; - size_t workSpaceSize; -}; - -struct Tensor { - // Tensor(int _numDim, int* _dim, LogicalRegion lr, LogicalPartition lp) - // { - // numDim = _numDim; - // for (int i = 0; i < numDim; i++) - // dim[i] = _dim[i]; - // region = lr; - // partition = lp; - // } - int numDim, adim[MAX_DIM], pdim[MAX_DIM]; - LogicalRegion region, region_grad; - LogicalPartition partition, partition_grad; -}; - -struct TensorWithGrad { - // int dim[MAX_DIM]; - LogicalRegion region, region_grad; - LogicalPartition partition, partition_grad; -}; - -class OpMeta { -public: - OpMeta(DnnHandle _handle) : handle(_handle){}; - -public: - DnnHandle handle; -}; - -// Empty base class -class CnnModel; -class DataLoader; - -class Op { -public: - Op(Tensor input); - Op(int num, Tensor *inputs); - virtual void init(CnnModel const &) = 0; - - virtual void forward(CnnModel const &) = 0; - - virtual void backward(CnnModel const &) = 0; - - virtual void update(CnnModel const &) = 0; - -public: - Tensor output; - // Op* pre_ops[MAX_NUM_INPUTS]; - Tensor inputs[MAX_NUM_INPUTS]; - LogicalPartition input_lps[MAX_NUM_INPUTS]; - TensorWithGrad locals[MAX_NUM_LOCALS]; - OpMeta *meta[MAX_NUM_WORKERS]; - 
// std::vector inputs, grads; -}; - -DnnHandle init_cudnn(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - -#endif // _LEGION_OPS_H_ diff --git a/nmt/rnn.cu b/nmt/rnn.cu deleted file mode 100644 index 3d59116833..0000000000 --- a/nmt/rnn.cu +++ /dev/null @@ -1,770 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../cnn_helper.h" -#include "rnn.h" -#include "rnn_mapper.h" - -DnnHandle init_cudnn(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime) { - assert(regions.size() == 0); - assert(task->arglen == sizeof(size_t)); - size_t workSpaceSize = *(size_t const *)task->args; - DnnHandle handle; - handle.workSpaceSize = workSpaceSize; - printf("workSpaceSize = %zu\n", workSpaceSize); -#ifndef DISABLE_COMPUTATION - checkCUDA(cublasCreate(&handle.blas)); - checkCUDNN(cudnnCreate(&handle.dnn)); -#endif - checkCUDA(cudaMalloc(&handle.workSpace, workSpaceSize)); - return handle; -} - -const SharedVariable SharedVariable::NO_VARIABLE = SharedVariable(); - -RnnOp::RnnOp(Tensor input, ParallelConfig pc, SharedVariable _params) - : paraConfig(pc), params(_params) { - inputs[0] = input; -} - -RnnOp::RnnOp( - Tensor t1, Tensor t2, Tensor t3, ParallelConfig pc, SharedVariable _params) - : paraConfig(pc), params(_params) { - inputs[0] = t1; - inputs[1] = t2; - inputs[2] = t3; -} - -RnnOp::RnnOp(int n, Tensor *_inputs) { - for (int i = 0; i < n; i++) { - inputs[i] = _inputs[i]; - } -} - -RnnModel::RnnModel(int batch_size, - int numLayers, - int seqLength, - int hidden_size, - int embed_size, - int vocab_size, - int num_parts, - int num_nodes, - int num_gpus_per_node, - GlobalConfig global, - Context ctx, - Runtime *runtime) { - config.lg_ctx = ctx; - config.lg_hlr = runtime; - config.batchSize = batch_size; - config.hiddenSize = hidden_size; - config.embedSize = embed_size; - config.vocabSize = vocab_size; - config.numLayers = numLayers; - config.seqLength = seqLength; - config.numParts = num_parts; - config.numNodes = num_nodes; - config.workersPerNode = num_gpus_per_node; - config.field_space = runtime->create_field_space(ctx); - { - FieldAllocator allocator = - runtime->create_field_allocator(ctx, config.field_space); - allocator.allocate_field(sizeof(float), FID_DATA); - } - Rect<1> part_rect(Point<1>(0), Point<1>(num_parts - 1)); - part_is = runtime->create_index_space(ctx, part_rect); - assert(seqLength <= MAX_SEQ_LENGTH); - assert(numLayers <= MAX_NUM_LAYERS); - int nodes_per_layer = seqLength / LSTM_PER_NODE_LENGTH; - // Create srcs/dsts tensors - { - Rect<2> word_rect(Point<2>(0, 0), - Point<2>(batch_size - 1, LSTM_PER_NODE_LENGTH - 1)); - IndexSpaceT<2> word_is = runtime->create_index_space(ctx, word_rect); - int extent_n = batch_size / num_parts; - Rect<2, coord_t> extent(Point<2>(0, 0), - Point<2>(extent_n - 1, LSTM_PER_NODE_LENGTH - 1)); - Transform<2, 1, coord_t> trans; - trans[0][0] = extent_n; - trans[1][0] = 
0; - IndexPartition word_ip = runtime->create_partition_by_restriction( - ctx, word_is, part_is, trans, extent); - assert(runtime->is_index_partition_disjoint(ctx, word_ip)); - assert(runtime->is_index_partition_complete(ctx, word_ip)); - assert(seqLength % LSTM_PER_NODE_LENGTH == 0); - for (int i = 0; i < nodes_per_layer; i++) { - srcs[i].numDim = 2; - srcs[i].adim[0] = batch_size; - srcs[i].adim[1] = LSTM_PER_NODE_LENGTH; - srcs[i].pdim[0] = extent_n; - srcs[i].pdim[1] = LSTM_PER_NODE_LENGTH; - srcs[i].region = - runtime->create_logical_region(ctx, word_is, config.field_space); - srcs[i].partition = - runtime->get_logical_partition(ctx, srcs[i].region, word_ip); - srcs[i].region_grad = - runtime->create_logical_region(ctx, word_is, config.field_space); - srcs[i].partition_grad = - runtime->get_logical_partition(ctx, srcs[i].region_grad, word_ip); - dsts[i] = srcs[i]; - dsts[i].region = - runtime->create_logical_region(ctx, word_is, config.field_space); - dsts[i].partition = - runtime->get_logical_partition(ctx, dsts[i].region, word_ip); - dsts[i].region_grad = - runtime->create_logical_region(ctx, word_is, config.field_space); - dsts[i].partition_grad = - runtime->get_logical_partition(ctx, dsts[i].region_grad, word_ip); - } - } - // Create zeroed tensors - { - Rect<2> hx_rect(Point<2>(0, 0), Point<2>(hidden_size - 1, batch_size - 1)); - IndexSpaceT<2> hx_is = runtime->create_index_space(ctx, hx_rect); - int extent_c = hidden_size; - int extent_n = batch_size / num_parts; - Rect<2> hx_ext(Point<2>(0, 0), Point<2>(extent_c - 1, extent_n - 1)); - Transform<2, 1, coord_t> hx_trans; - hx_trans[0][0] = 0; - hx_trans[1][0] = extent_n; - IndexPartition hx_ip = runtime->create_partition_by_restriction( - ctx, hx_is, part_is, hx_trans, hx_ext); - assert(runtime->is_index_partition_disjoint(ctx, hx_ip)); - assert(runtime->is_index_partition_complete(ctx, hx_ip)); - for (int i = 0; i < numLayers; i++) { - for (int j = 0; j < 2; j++) { - Tensor t; - t.numDim = 2; - t.adim[0] = hidden_size; - t.adim[1] = batch_size; - t.pdim[0] = extent_c; - t.pdim[1] = extent_n; - t.region = - runtime->create_logical_region(ctx, hx_is, config.field_space); - t.partition = runtime->get_logical_partition(ctx, t.region, hx_ip); - t.region_grad = - runtime->create_logical_region(ctx, hx_is, config.field_space); - t.partition_grad = - runtime->get_logical_partition(ctx, t.region_grad, hx_ip); - if (j == 0) { - zero[i].hx = t; - } else { - zero[i].cx = t; - } - } - } - } - // Embedding - SharedVariable srcEmbed, dstEmbed; - { - int numParams = config.vocabSize * config.embedSize; - Rect<1> params_rect(Point<1>(0), Point<1>(numParams - 1)); - IndexSpaceT<1> params_is = runtime->create_index_space(ctx, params_rect); - srcEmbed.region = - runtime->create_logical_region(ctx, params_is, config.field_space); - dstEmbed.region = - runtime->create_logical_region(ctx, params_is, config.field_space); - for (int i = 0; i < 2 * nodes_per_layer; i++) { - ParallelConfig pc = global.embed[i]; - assert(pc.nDims == 1); - for (int j = 0; j < pc.dim[0]; j++) { - int gpuId = pc.gpu[j]; - if (i < nodes_per_layer) { - if (srcEmbed.gradients[gpuId] == LogicalRegion::NO_REGION) { - srcEmbed.gradients[gpuId] = runtime->create_logical_region( - ctx, params_is, config.field_space); - } - } else { - if (dstEmbed.gradients[gpuId] == LogicalRegion::NO_REGION) { - dstEmbed.gradients[gpuId] = runtime->create_logical_region( - ctx, params_is, config.field_space); - } - } - } - } - // Collect masterOnNode for srcEmbed/dstEmbed - for (int i = 0; i < 
config.numNodes; i++) { - for (int j = config.workersPerNode - 1; j >= 0; j--) { - int gpuId = i * config.workersPerNode + j; - if (srcEmbed.gradients[gpuId] != LogicalRegion::NO_REGION) { - srcEmbed.masterOnNode[i] = gpuId; - } - if (dstEmbed.gradients[gpuId] != LogicalRegion::NO_REGION) { - dstEmbed.masterOnNode[i] = gpuId; - } - } - } - } - - // Encoders/decoders - SharedVariable encoders[MAX_NUM_LAYERS], decoders[MAX_NUM_LAYERS]; - for (int i = 0; i < numLayers; i++) { - int input_size = (i == 0) ? embed_size : hidden_size; - int output_size = hidden_size; - int numParams = (input_size + 1 + output_size + 1) * output_size * 4; - Rect<1> params_rect(Point<1>(0), Point<1>(numParams - 1)); - IndexSpaceT<1> params_is = runtime->create_index_space(ctx, params_rect); - encoders[i].region = - runtime->create_logical_region(ctx, params_is, config.field_space); - decoders[i].region = - runtime->create_logical_region(ctx, params_is, config.field_space); - for (int j = 0; j < 2 * nodes_per_layer; j++) { - ParallelConfig pc = global.lstm[i][j]; - assert(pc.nDims == 1); - for (int k = 0; k < pc.dim[0]; k++) { - int gpuId = pc.gpu[k]; - if (j < nodes_per_layer) { - if (encoders[i].gradients[gpuId] == LogicalRegion::NO_REGION) { - encoders[i].gradients[gpuId] = runtime->create_logical_region( - ctx, params_is, config.field_space); - } - } else { - if (decoders[i].gradients[gpuId] == LogicalRegion::NO_REGION) { - decoders[i].gradients[gpuId] = runtime->create_logical_region( - ctx, params_is, config.field_space); - } - } - } - } - // Collect masterOnNode for encoders[i]/decoders[i] - for (int j = 0; j < config.numNodes; j++) { - for (int k = config.workersPerNode - 1; k >= 0; k--) { - int gpuId = j * config.workersPerNode + k; - if (encoders[i].gradients[gpuId] != LogicalRegion::NO_REGION) { - encoders[i].masterOnNode[j] = gpuId; - } - if (decoders[i].gradients[gpuId] != LogicalRegion::NO_REGION) { - decoders[i].masterOnNode[j] = gpuId; - } - } - } - } - SharedVariable linear; - { - int numParams = (hidden_size + 1) * vocab_size; - Rect<1> params_rect(Point<1>(0), Point<1>(numParams - 1)); - IndexSpaceT<1> params_is = runtime->create_index_space(ctx, params_rect); - linear.region = - runtime->create_logical_region(ctx, params_is, config.field_space); - linear.subregions[1] = linear.region; - // Create subregions for the shared variable linear - for (int parts = 2; parts <= MAX_NUM_PARTS; parts *= 2) { - Rect<1> rect(Point<1>(0), Point<1>(parts - 1)); - IndexSpaceT<1> is = runtime->create_index_space(ctx, rect); - IndexPartition ip = runtime->create_equal_partition(ctx, params_is, is); - LogicalPartition lp = - runtime->get_logical_partition(ctx, linear.region, ip); - int idx = 0; - for (PointInRectIterator<1> it(rect); it(); it++, idx++) { - DomainPoint dp(*it); - linear.subregions[parts + idx] = - runtime->get_logical_subregion_by_color(ctx, lp, dp); - } - } - // Compute bboxes for the shared variable linear - // Also compute masterOnNode which is the largest gradients on each node - std::map> bboxes; - for (int i = 0; i < nodes_per_layer; i++) { - ParallelConfig pc = global.linear[i]; - assert(pc.nDims == 2); - for (int j = 0; j < pc.dim[1]; j++) { - for (int k = 0; k < pc.dim[0]; k++) { - int gpuIdx = pc.gpu[j * pc.dim[0] + k]; - Rect<1> rect = runtime->get_index_space_domain( - ctx, linear.subregions[pc.dim[0] + k].get_index_space()); - if (bboxes.find(gpuIdx) == bboxes.end()) { - bboxes[gpuIdx] = rect; - } else { - bboxes[gpuIdx] = bboxes[gpuIdx].union_bbox(rect); - } - int nodeIdx = gpuIdx / 
config.workersPerNode; - if (linear.masterOnNode[nodeIdx] == MASTER_NOT_ASSIGNED) { - linear.masterOnNode[nodeIdx] = gpuIdx; - } else { - int masterIdx = linear.masterOnNode[nodeIdx]; - if (bboxes[gpuIdx].volume() > bboxes[masterIdx].volume()) { - linear.masterOnNode[nodeIdx] = gpuIdx; - } - } - } - } - } - // The first bbox on each node is a superset of all bboxes on that node - for (int n = 0; n < config.numNodes; n++) { - if (linear.masterOnNode[n] != MASTER_NOT_ASSIGNED) { - for (int j = 0; j < config.workersPerNode; j++) { - if (bboxes.find(n * config.workersPerNode + j) != bboxes.end()) { - Rect<1> rect = bboxes[n * config.workersPerNode + j]; - bboxes[linear.masterOnNode[n]] = - bboxes[linear.masterOnNode[n]].union_bbox(rect); - } - } - } - } - for (int i = 0; i < config.numNodes * config.workersPerNode; i++) { - if (bboxes.find(i) != bboxes.end()) { - IndexSpaceT<1> params_is = runtime->create_index_space(ctx, bboxes[i]); - linear.gradients[i] = - runtime->create_logical_region(ctx, params_is, config.field_space); - } else { - linear.gradients[i] = LogicalRegion::NO_REGION; - } - } - } - - Tensor embed[2 * MAX_SEQ_LENGTH]; - for (int i = 0; i < 2 * nodes_per_layer; i++) { - embed[i] = add_embed_node(i < nodes_per_layer ? srcs[i] - : dsts[i - nodes_per_layer], - config.vocabSize, - config.embedSize, - global.embed[i], - i < nodes_per_layer ? srcEmbed : dstEmbed); - } - for (int i = 0; i < numLayers; i++) { - // Add encoder lstm nodes - for (int j = 0; j < nodes_per_layer; j++) { - Tensor x = (i == 0) ? embed[j] : lstm[i - 1][j].x; - Tensor hx = (j == 0) ? zero[i].hx : lstm[i][j - 1].hx; - Tensor cx = (j == 0) ? zero[i].cx : lstm[i][j - 1].cx; - lstm[i][j] = add_lstm_node(x, hx, cx, global.lstm[i][j], encoders[i]); - } - // Add decoder lstm nodes - for (int j = nodes_per_layer; j < 2 * nodes_per_layer; j++) { - Tensor x = (i == 0) ? embed[j] : lstm[i - 1][j].x; - Tensor hx = lstm[i][j - 1].hx; - Tensor cx = lstm[i][j - 1].cx; - lstm[i][j] = add_lstm_node(x, hx, cx, global.lstm[i][j], decoders[i]); - } - } - // Add linear nodes - for (int j = nodes_per_layer; j < 2 * nodes_per_layer; j++) { - Tensor logit = add_linear_node(lstm[numLayers - 1][j].x, - vocab_size, - global.linear[j - nodes_per_layer], - linear); - add_softmaxDP_node( - logit, dsts[j - nodes_per_layer], global.softmax[j - nodes_per_layer]); - } - - // Add shared variables - sharedVariables.push_back(srcEmbed); - sharedVariables.push_back(dstEmbed); - for (int i = 0; i < config.numLayers; i++) { - sharedVariables.push_back(encoders[i]); - sharedVariables.push_back(decoders[i]); - } - sharedVariables.push_back(linear); -} - -void RnnModel::word_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Rect<2> rect0 = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - int *host_ptr; - bool same = *((bool *)task->args); - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int) * rect0.volume(), - cudaHostAllocPortable | cudaHostAllocMapped)); - for (int i = 0; i < rect0.volume(); i++) { - host_ptr[i] = same ? 
1 : i % 16; - } - for (int i = 0; i < regions.size(); i++) { - AccessorWO const acc(regions[i], FID_DATA); - Rect<2> rect = runtime->get_index_space_domain( - ctx, task->regions[i].region.get_index_space()); - assert(acc.accessor.is_dense_arbitrary(rect)); - assert(rect == rect0); - int *ptr = acc.ptr(rect.lo); - checkCUDA(cudaMemcpy( - ptr, host_ptr, sizeof(int) * rect0.volume(), cudaMemcpyHostToDevice)); - } - checkCUDA(cudaFreeHost(host_ptr)); -} - -void RnnModel::init() { - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - // Init words - Rect<1> part_rect = runtime->get_index_space_domain(ctx, part_is); - for (PointInRectIterator<1> it(part_rect); it(); it++) { - int idx = 0; - bool same = false; - TaskLauncher launcher(WORD_INIT_TASK_ID, - TaskArgument(&same, sizeof(same)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(0)); - DomainPoint dp(*it); - for (int i = 0; i * LSTM_PER_NODE_LENGTH < config.seqLength; i++) { - LogicalRegion x = - runtime->get_logical_subregion_by_color(srcs[i].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, WRITE_ONLY, EXCLUSIVE, srcs[i].region)); - launcher.add_field(idx++, FID_DATA); - } - for (int i = 0; i * LSTM_PER_NODE_LENGTH < config.seqLength; i++) { - LogicalRegion x = - runtime->get_logical_subregion_by_color(dsts[i].partition, dp); - launcher.add_region_requirement( - RegionRequirement(x, WRITE_ONLY, EXCLUSIVE, dsts[i].region)); - launcher.add_field(idx++, FID_DATA); - } - Future f = runtime->execute_task(ctx, launcher); - f.get_void_result(); - } - // Init zero tensors - for (PointInRectIterator<1> it(part_rect); it(); it++) { - int idx = 0; - TaskLauncher launcher(ZERO_2D_INIT_TASK_ID, - TaskArgument(NULL, 0), - Predicate::TRUE_PRED, - 0, - RnnMapper::assign_to_gpu(0)); - DomainPoint dp(*it); - for (int i = 0; i < config.numLayers; i++) { - LogicalRegion hx = - runtime->get_logical_subregion_by_color(zero[i].hx.partition, dp); - launcher.add_region_requirement( - RegionRequirement(hx, WRITE_ONLY, EXCLUSIVE, zero[i].hx.region)); - launcher.add_field(idx++, FID_DATA); - } - for (int i = 0; i < config.numLayers; i++) { - LogicalRegion cx = - runtime->get_logical_subregion_by_color(zero[i].cx.partition, dp); - launcher.add_region_requirement( - RegionRequirement(cx, WRITE_ONLY, EXCLUSIVE, zero[i].cx.region)); - launcher.add_field(idx++, FID_DATA); - } - Future f = runtime->execute_task(ctx, launcher); - f.get_void_result(); - } - // Init hx_grad/cx_grad for the last LSTM node on each layer - int nodes_per_layer = config.seqLength / LSTM_PER_NODE_LENGTH; - for (PointInRectIterator<1> it(part_rect); it(); it++) { - int idx = 0; - TaskLauncher launcher(ZERO_2D_INIT_TASK_ID, - TaskArgument(NULL, 0), - Predicate::TRUE_PRED, - 0, - RnnMapper::assign_to_gpu(0)); - DomainPoint dp(*it); - for (int i = 0; i < config.numLayers; i++) { - LSTMTensors last_lstm = lstm[i][2 * nodes_per_layer - 1]; - // hx - LogicalRegion hx_grad = runtime->get_logical_subregion_by_color( - last_lstm.hx.partition_grad, dp); - launcher.add_region_requirement(RegionRequirement( - hx_grad, WRITE_ONLY, EXCLUSIVE, last_lstm.hx.region_grad)); - launcher.add_field(idx++, FID_DATA); - // cx - LogicalRegion cx_grad = runtime->get_logical_subregion_by_color( - last_lstm.cx.partition_grad, dp); - launcher.add_region_requirement(RegionRequirement( - cx_grad, WRITE_ONLY, EXCLUSIVE, last_lstm.cx.region_grad)); - launcher.add_field(idx++, FID_DATA); - } - Future f = runtime->execute_task(ctx, launcher); - f.get_void_result(); - } - 
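
Every initialization step in this file repeats one Legion idiom: build a TaskLauncher with a task ID and a TaskArgument, attach each logical (sub)region through add_region_requirement plus add_field, call execute_task, and optionally block on the returned Future. The following is a minimal, self-contained sketch of that idiom only; the task IDs, field ID, and region shape are illustrative placeholders and do not come from this code.

    // Sketch: launch a task that fills a 1-D float region, using the same
    // launcher/region-requirement pattern as the init code in this file.
    #include "legion.h"
    using namespace Legion;

    enum TaskIDs { TOP_TASK_ID, FILL_TASK_ID };   // illustrative IDs
    enum FieldIDs { FID_VAL };                    // illustrative field

    void fill_task(Task const *task,
                   std::vector<PhysicalRegion> const &regions,
                   Context ctx,
                   Runtime *runtime) {
      float value = *(float const *)task->args;
      FieldAccessor<WRITE_DISCARD, float, 1> acc(regions[0], FID_VAL);
      Rect<1> rect = runtime->get_index_space_domain(
          ctx, task->regions[0].region.get_index_space());
      for (PointInRectIterator<1> pir(rect); pir(); pir++) {
        acc[*pir] = value;   // write every point of the mapped region
      }
    }

    void top_task(Task const *task,
                  std::vector<PhysicalRegion> const &regions,
                  Context ctx,
                  Runtime *runtime) {
      // Create a 1024-element region with a single float field.
      Rect<1> rect(Point<1>(0), Point<1>(1023));
      IndexSpaceT<1> is = runtime->create_index_space(ctx, rect);
      FieldSpace fs = runtime->create_field_space(ctx);
      {
        FieldAllocator allocator = runtime->create_field_allocator(ctx, fs);
        allocator.allocate_field(sizeof(float), FID_VAL);
      }
      LogicalRegion lr = runtime->create_logical_region(ctx, is, fs);
      // Launcher + region requirement + field, then execute and wait.
      float value = 0.0f;
      TaskLauncher launcher(FILL_TASK_ID, TaskArgument(&value, sizeof(value)));
      launcher.add_region_requirement(
          RegionRequirement(lr, WRITE_DISCARD, EXCLUSIVE, lr));
      launcher.add_field(0, FID_VAL);
      Future f = runtime->execute_task(ctx, launcher);
      f.get_void_result();   // block until the fill completes
    }

    int main(int argc, char **argv) {
      Runtime::set_top_level_task_id(TOP_TASK_ID);
      {
        TaskVariantRegistrar registrar(TOP_TASK_ID, "top_task");
        registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
        Runtime::preregister_task_variant<top_task>(registrar, "top_task");
      }
      {
        TaskVariantRegistrar registrar(FILL_TASK_ID, "fill_task");
        registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
        registrar.set_leaf();
        Runtime::preregister_task_variant<fill_task>(registrar, "fill_task");
      }
      return Runtime::start(argc, argv);
    }

The per-GPU placement in this file additionally passes RnnMapper::assign_to_gpu(...) as the launcher's MappingTagID, which the custom mapper decodes in select_task_options to pin the task to a specific processor.
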
// TODO: to be removed when we have attention layers - // Init y_grad for the decoder lstm nodes - for (PointInRectIterator<1> it(part_rect); it(); it++) { - int idx = 0; - TaskLauncher launcher(ZERO_3D_INIT_TASK_ID, - TaskArgument(NULL, 0), - Predicate::TRUE_PRED, - 0, - RnnMapper::assign_to_gpu(0)); - DomainPoint dp(*it); - for (int i = 0; i < nodes_per_layer; i++) { - LSTMTensors top_lstm = lstm[config.numLayers - 1][i]; - LogicalRegion y_grad = runtime->get_logical_subregion_by_color( - top_lstm.x.partition_grad, dp); - launcher.add_region_requirement(RegionRequirement( - y_grad, WRITE_ONLY, EXCLUSIVE, top_lstm.x.region_grad)); - launcher.add_field(idx++, FID_DATA); - } - Future f = runtime->execute_task(ctx, launcher); - f.get_void_result(); - } - // Init shared variables - for (int i = 0; i < sharedVariables.size(); i++) { - init_shared_variable(sharedVariables[i]); - } - for (size_t i = 0; i < layers.size(); i++) { - layers[i]->init(*this); - } -} - -void RnnModel::zero_3d_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - for (int i = 0; i < task->regions.size(); i++) { - AccessorWO const acc_w(regions[i], FID_DATA); - Rect<3> rect_w = runtime->get_index_space_domain( - ctx, task->regions[i].region.get_index_space()); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - float *w_ptr = acc_w.ptr(rect_w.lo); - assign_kernel<<>>( - w_ptr, rect_w.volume(), 0.0f); - } -} - -void RnnModel::zero_2d_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - for (int i = 0; i < task->regions.size(); i++) { - AccessorWO const acc_w(regions[i], FID_DATA); - Rect<2> rect_w = runtime->get_index_space_domain( - ctx, task->regions[i].region.get_index_space()); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - float *w_ptr = acc_w.ptr(rect_w.lo); - assign_kernel<<>>( - w_ptr, rect_w.volume(), 0.0f); - } -} - -void RnnModel::zero_1d_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - for (int i = 0; i < task->regions.size(); i++) { - AccessorWO const acc_w(regions[i], FID_DATA); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[i].region.get_index_space()); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - float *w_ptr = acc_w.ptr(rect_w.lo); - assign_kernel<<>>( - w_ptr, rect_w.volume(), 0.0f); - } -} - -void RnnModel::dummy_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) {} - -void RnnModel::forward() { - config.iterator++; - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - // Step 1: launch dummy tasks to prefetch shared variables - for (size_t i = 0; i < sharedVariables.size(); i++) { - for (int n = 0; n < config.numNodes; n++) { - if (sharedVariables[i].masterOnNode[n] != MASTER_NOT_ASSIGNED) { - int gpuId = sharedVariables[i].masterOnNode[n]; - TaskLauncher launcher(DUMMY_TASK_ID, - TaskArgument(NULL, 0), - Predicate::TRUE_PRED, - 0, - RnnMapper::assign_to_gpu(gpuId)); - launcher.add_region_requirement( - RegionRequirement(sharedVariables[i].region, - READ_ONLY, - EXCLUSIVE, - sharedVariables[i].region)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } - } - } - runtime->issue_mapping_fence(ctx); - // Step 2: zero gradients - for (size_t i = 0; i < sharedVariables.size(); i++) { - for (int j = 0; j < config.workersPerNode * config.numNodes; j++) { - if (sharedVariables[i].gradients[j] != LogicalRegion::NO_REGION) { - TaskLauncher 
launcher(ZERO_1D_INIT_TASK_ID, - TaskArgument(NULL, 0), - Predicate::TRUE_PRED, - 0, - RnnMapper::assign_to_gpu(j)); - LogicalRegion gradient = sharedVariables[i].gradients[j]; - launcher.add_region_requirement( - RegionRequirement(gradient, WRITE_ONLY, EXCLUSIVE, gradient)); - launcher.add_field(0, FID_DATA); - runtime->execute_task(ctx, launcher); - } - } - } - // Step 3: launch forward tasks - for (size_t i = 0; i < layers.size(); i++) { - layers[i]->forward(*this); - } -} - -void RnnModel::backward() { - for (int i = layers.size() - 1; i >= 0; i--) { - layers[i]->backward(*this); - } -} - -void RnnModel::update() { - for (int i = sharedVariables.size() - 1; i >= 0; i--) { - update_shared_variable(sharedVariables[i]); - } -} - -/* - regions[0](O): w -*/ -void RnnModel::params_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 1); - assert(task->regions.size() == 1); - float value = *((float *)task->args); - AccessorWO const acc_w(regions[0], FID_DATA); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - float *w_ptr = acc_w.ptr(rect_w.lo); - cudaStream_t stream; - checkCUDA(cudaStreamCreate(&stream)); - curandGenerator_t genGPU; - curandCreateGenerator(&genGPU, CURAND_RNG_PSEUDO_DEFAULT); - curandSetStream(genGPU, stream); - curandSetPseudoRandomGeneratorSeed(genGPU, 1234LL); - curandGenerateUniform(genGPU, w_ptr, rect_w.volume()); - checkCUDA(cudaDeviceSynchronize()); - scale_kernel<<>>( - w_ptr, rect_w.volume(), -value, value); - // assign_kernel<<>>( - // w_ptr, rect_w.volume(), value); -} - -void RnnModel::init_shared_variable(SharedVariable params) { - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - float value = 0.1f; - TaskLauncher launcher(PARAMS_INIT_TASK_ID, - TaskArgument(&value, sizeof(value)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(params.masterOnNode[0])); - launcher.add_region_requirement( - RegionRequirement(params.region, WRITE_ONLY, EXCLUSIVE, params.region)); - launcher.add_field(0, FID_DATA); - Future f = runtime->execute_task(ctx, launcher); - f.get_void_result(); -} - -/* - regions[0]: (I/O): w - regions[1..]: (O): w_grad - */ -void RnnModel::params_update_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == task->regions.size()); - float rate = *((float *)task->args); - AccessorRW const acc_w(regions[0], FID_DATA); - Rect<1> rect_w = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - assert(acc_w.accessor.is_dense_arbitrary(rect_w)); - for (int i = 1; i < regions.size(); i++) { - AccessorRO const acc_w_grad(regions[i], FID_DATA); - Rect<1> rect_w_grad = runtime->get_index_space_domain( - ctx, task->regions[i].region.get_index_space()); - assert(rect_w.contains(rect_w_grad)); - assert(acc_w_grad.accessor.is_dense_arbitrary(rect_w_grad)); - float *w_ptr = acc_w.ptr(rect_w_grad.lo); - float const *w_grad_ptr = acc_w_grad.ptr(rect_w_grad.lo); - apply_add_with_scale<<>>( - w_ptr, w_grad_ptr, rect_w_grad.volume(), rate); -#ifdef PRINT_INTERMEDIATE_RESULT - print_tensor<1, float>(w_grad_ptr, rect_w_grad, "partial_w"); -#endif - } -#ifdef PRINT_INTERMEDIATE_RESULT - float *w_ptr = acc_w.ptr(rect_w.lo); - print_tensor<1, float>(w_ptr, rect_w, "final_w"); -#endif -} - -void RnnModel::update_shared_variable(SharedVariable params) { - Context ctx = config.lg_ctx; 
- Runtime *runtime = config.lg_hlr; - // for (int i = 0; i < config.workersPerNode; i++) - // if (params.gradients[i] != LogicalRegion::NO_REGION) { - // Rect<1> rect = - // runtime->get_index_space_domain(ctx, - // params.gradients[i].get_index_space()); - // printf("rect[%d]: lo(%d) hi(%d)\n", i, rect.lo[0], rect.hi[0]); - // } - float rate = 1.0f; - for (int node = 0; node < config.numNodes; node++) { - if (params.masterOnNode[node] != MASTER_NOT_ASSIGNED) { - TaskLauncher launcher( - PARAMS_UPD_TASK_ID, - TaskArgument(&rate, sizeof(rate)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(params.masterOnNode[node])); - LogicalRegion masterGrad = params.gradients[params.masterOnNode[node]]; - assert(masterGrad != LogicalRegion::NO_REGION); - launcher.add_region_requirement( - RegionRequirement(masterGrad, READ_WRITE, EXCLUSIVE, masterGrad)); - launcher.add_field(0, FID_DATA); - int cnt = 1; - for (int idx = 0; idx < config.workersPerNode; idx++) { - int gpuIdx = node * config.workersPerNode + idx; - if (gpuIdx == params.masterOnNode[node]) { - continue; - } - LogicalRegion grad = params.gradients[gpuIdx]; - if (grad == LogicalRegion::NO_REGION) { - continue; - } - launcher.add_region_requirement( - RegionRequirement(grad, READ_ONLY, EXCLUSIVE, grad)); - launcher.add_field(cnt++, FID_DATA); - } - // printf("Step 1: cnt = %d\n", cnt); - runtime->execute_task(ctx, launcher); - } - } - rate = -0.1f; - TaskLauncher launcher(PARAMS_UPD_TASK_ID, - TaskArgument(&rate, sizeof(rate)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(params.masterOnNode[0])); - launcher.add_region_requirement( - RegionRequirement(params.region, READ_WRITE, EXCLUSIVE, params.region)); - launcher.add_field(0, FID_DATA); - int cnt = 1; - for (int node = 0; node < config.numNodes; node++) { - if (params.masterOnNode[node] != MASTER_NOT_ASSIGNED) { - int gpuIdx = params.masterOnNode[node]; - LogicalRegion grad = params.gradients[gpuIdx]; - assert(grad != LogicalRegion::NO_REGION); - launcher.add_region_requirement( - RegionRequirement(grad, READ_ONLY, EXCLUSIVE, grad)); - launcher.add_field(cnt++, FID_DATA); - } - } - // printf("Step 2: cnt = %d\n", cnt); - runtime->execute_task(ctx, launcher); -} diff --git a/nmt/rnn.h b/nmt/rnn.h deleted file mode 100644 index 001e7e06e2..0000000000 --- a/nmt/rnn.h +++ /dev/null @@ -1,438 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef _LEGION_RNN_H_ -#define _LEGION_RNN_H_ - -#include "ops.h" - -#define MAX_SEQ_LENGTH 100 -#define MAX_NUM_LAYERS 4 -#define LSTM_PER_NODE_LENGTH 10 -#define MASTER_NOT_ASSIGNED -1 -// #define PRINT_INTERMEDIATE_RESULT - -struct RnnConfig { - Context lg_ctx; - HighLevelRuntime *lg_hlr; - FieldSpace field_space; - int batchSize, hiddenSize, embedSize, vocabSize; - int numLayers, seqLength, numParts; - int numNodes, workersPerNode; - int iterator; -}; - -struct SharedVariable { - static const SharedVariable NO_VARIABLE; /*empty SharedVariable handle*/ - LogicalRegion region, gradients[MAX_NUM_WORKERS]; - LogicalRegion subregions[2 * MAX_NUM_PARTS]; - int masterOnNode[MAX_NUM_WORKERS]; - SharedVariable() { - region = LogicalRegion::NO_REGION; - for (int i = 0; i < MAX_NUM_WORKERS; i++) { - gradients[i] = LogicalRegion::NO_REGION; - } - for (int i = 0; i < 2 * MAX_NUM_PARTS; i++) { - subregions[i] = LogicalRegion::NO_REGION; - } - for (int i = 0; i < MAX_NUM_WORKERS; i++) { - masterOnNode[i] = MASTER_NOT_ASSIGNED; - } - } -}; - -struct ParallelConfig { - int nDims, dim[MAX_DIM]; - int gpu[MAX_NUM_WORKERS]; -}; - -struct GlobalConfig { - ParallelConfig linear[MAX_SEQ_LENGTH]; - ParallelConfig lstm[MAX_NUM_LAYERS][2 * MAX_SEQ_LENGTH]; - ParallelConfig embed[2 * MAX_SEQ_LENGTH]; - ParallelConfig softmax[MAX_SEQ_LENGTH]; -}; - -class RnnModel; - -class RnnOp { -public: - RnnOp(Tensor input, ParallelConfig pc, SharedVariable _params); - RnnOp(Tensor t1, - Tensor t2, - Tensor t3, - ParallelConfig pc, - SharedVariable _params); - RnnOp(int num, Tensor *inputs); - virtual void init(RnnModel const &) = 0; - - virtual void forward(RnnModel const &) = 0; - - virtual void backward(RnnModel const &) = 0; - - virtual void update(RnnModel const &) = 0; - -public: - Tensor outputs[MAX_NUM_OUTPUTS]; - Tensor inputs[MAX_NUM_INPUTS]; - OpMeta *meta[MAX_NUM_WORKERS]; - ParallelConfig paraConfig; - SharedVariable params; -}; - -struct LSTMTensors { - Tensor x, hx, cx; -}; - -class RnnModel { -public: - RnnModel(int batch_size, - int numLayers, - int seqLength, - int hidden_size, - int embed_size, - int vocab_size, - int num_parts, - int num_nodes, - int num_workers_per_node, - GlobalConfig global, - Context ctx, - Runtime *runtime); - - void init(); - - void forward(); - - void backward(); - - void update(); - - void init_shared_variable(SharedVariable params); - - void update_shared_variable(SharedVariable params); - - static void word_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void zero_1d_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void zero_2d_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void zero_3d_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void dummy_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void params_init_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void params_update_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - LSTMTensors add_lstm_node( - Tensor x, Tensor hx, Tensor cx, ParallelConfig pc, SharedVariable params); - - Tensor add_linear_node(Tensor x, - int output_size, - ParallelConfig pc, - SharedVariable params); - - Tensor add_embed_node(Tensor x, 
- int vocab_size, - int output_size, - ParallelConfig pc, - SharedVariable params); - - Tensor add_softmaxDP_node(Tensor x, Tensor label, ParallelConfig pc); - -public: - RnnConfig config; - std::vector layers; - std::vector sharedVariables; - DnnHandle dnn_handlers[MAX_NUM_WORKERS]; - Tensor srcs[MAX_SEQ_LENGTH], dsts[MAX_SEQ_LENGTH]; - LSTMTensors zero[MAX_NUM_LAYERS]; - LSTMTensors lstm[MAX_NUM_LAYERS][2 * MAX_SEQ_LENGTH]; - IndexSpaceT<1> part_is; -}; - -/* - * For now, every single LSTM cell with 1 word and 1 layer is a - * LSTM operation. - */ -class LSTM : public RnnOp { -public: - LSTM(RnnConfig config, - Tensor x, - Tensor hx, - Tensor cx, - int batch_size, - int input_size, - int output_size, - ParallelConfig pc, - SharedVariable params); - - void init(RnnModel const &); - - void forward(RnnModel const &); - - void backward(RnnModel const &); - - void update(RnnModel const &); - - static OpMeta *init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void update_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - -public: - int batch_size, input_size, output_size; - Rect<1> part_rect; -}; - -class LSTMMeta : public OpMeta { -public: - LSTMMeta(DnnHandle handle) : OpMeta(handle){}; - cudnnRNNDescriptor_t rnnDesc; - cudnnDropoutDescriptor_t dropoutDesc; - cudnnTensorDescriptor_t xDescs[LSTM_PER_NODE_LENGTH], - yDescs[LSTM_PER_NODE_LENGTH], cxDesc, hxDesc, cyDesc, hyDesc; - cudnnFilterDescriptor_t wDesc; - size_t reserveSpaceSize; - void *reserveSpace; - bool profiling_runtime; -}; - -class Linear : public RnnOp { -public: - Linear(RnnConfig config, - Tensor input, - int output_channels, - ParallelConfig pc, - SharedVariable params, - IndexSpaceT<1> input_part_is); - - void init(RnnModel const &); - - void forward(RnnModel const &); - - void backward(RnnModel const &); - - void update(RnnModel const &); - - static OpMeta *init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void backward2_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void update_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - -public: - int batch_size, input_size, output_size; - Tensor replica; - // each replica_sub_lps[i] is a disjoint partition - LogicalPartition replica_sub_lps[MAX_NUM_WORKERS]; - // input_lp may be an aliased partition if num_par_c > 1 - LogicalPartition input_lp; - Rect<2> part_rect; - Rect<1> input_part_rect; -}; - -class LinearMeta : public OpMeta { -public: - LinearMeta(DnnHandle handle) : OpMeta(handle){}; - float *one_ptr; - bool profiling_runtime; -}; - -class Embed : public RnnOp { -public: - Embed(RnnConfig config, - Tensor input, - int embed_size, - int output_size, - ParallelConfig pc, - SharedVariable params); - - void init(RnnModel const &); - - void forward(RnnModel const &); - - void backward(RnnModel const &); - - void update(RnnModel const &); - - static OpMeta *init_task(Task const 
*task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - - static void update_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - -public: - int batchSize, outputSize, vocabSize; - Rect<1> part_rect; -}; - -class EmbedMeta : public OpMeta { -public: - EmbedMeta(DnnHandle handle) : OpMeta(handle){}; - bool profiling_runtime; -}; - -/*class Softmax : public RnnOp { -public: - Softmax(RnnConfig config, Tensor input, Tensor output, - ParallelConfig pc); - - void init(const RnnModel&); - - void forward(const RnnModel&); - - void backward(const RnnModel&); - - void update(const RnnModel&); - - static OpMeta* init_task(const Task *task, - const std::vector ®ions, - Context ctx, Runtime *runtime); - - static void forward_task(const Task *task, - const std::vector ®ions, - Context ctx, Runtime *runtime); - - static void backward_task(const Task *task, - const std::vector ®ions, - Context ctx, HighLevelRuntime *runtime); -public: - Rect<1> part_rect; -}; - -class SoftmaxMeta : public OpMeta { -public: - SoftmaxMeta(DnnHandle handle) : OpMeta(handle) {}; - size_t storage_bytes; - void* storage; - int* offsets; - bool profiling_runtime; -}; -*/ -class SoftmaxDP : public RnnOp { -public: - SoftmaxDP(RnnConfig config, Tensor logit, Tensor label, ParallelConfig pc); - - void init(RnnModel const &); - - void forward(RnnModel const &); - - void backward(RnnModel const &); - - void update(RnnModel const &); - - static OpMeta *init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime); - - static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - HighLevelRuntime *runtime); - -public: - Rect<1> part_rect; - Tensor label; - LogicalPartition logit_lp, logit_grad_lp; -}; - -class SoftmaxDPMeta : public OpMeta { -public: - SoftmaxDPMeta(DnnHandle handle) : OpMeta(handle){}; -#ifndef DISABLE_COMPUTATION - cudnnTensorDescriptor_t inputTensor; -#endif - int batchSize; - bool profiling_runtime; -}; - -#endif //_LEGION_RNN_H_ diff --git a/nmt/rnn_mapper.cc b/nmt/rnn_mapper.cc deleted file mode 100644 index 9a50d2b3e0..0000000000 --- a/nmt/rnn_mapper.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "rnn_mapper.h" -#define ASSIGN_TO_GPU_MASK 0xABCD0000 - -RnnMapper::RnnMapper(MapperRuntime *rt, - Machine machine, - Processor local, - char const *mapper_name, - std::vector *_gpus, - std::map *_proc_fbmems, - std::vector *_cpus) - : DefaultMapper(rt, machine, local, mapper_name), gpus(*_gpus), - proc_fbmems(*_proc_fbmems), cpus(*_cpus) {} - -void RnnMapper::select_task_options(const MapperContext ctx, - Task const &task, - TaskOptions &output) { - if ((task.tag & ASSIGN_TO_GPU_MASK) == ASSIGN_TO_GPU_MASK) { - output.inline_task = false; - output.stealable = false; - output.map_locally = true; - unsigned long gpuId = task.tag ^ ASSIGN_TO_GPU_MASK; - output.initial_proc = gpus[gpuId % gpus.size()]; - } else { - DefaultMapper::select_task_options(ctx, task, output); - } -} - -#ifdef DEADCODE -void RnnMapper::map_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput &output) { - printf("Task(%s %zx):", task.get_task_name(), task.tag); - for (size_t i = 0; i < input.valid_instances.size(); i++) { - printf(" ("); - for (size_t j = 0; j < input.valid_instances[i].size(); j++) { - printf("%zx ", input.valid_instances[i][j].get_location().id); - } - printf(")"); - } - printf("\n"); - DefaultMapper::map_task(ctx, task, input, output); -} - -void RnnMapper::select_task_sources(const MapperContext ctx, - Task const &task, - SelectTaskSrcInput const &input, - SelectTaskSrcOutput &output) { - printf("Slct(%s %zx)[%d]:", - task.get_task_name(), - task.tag, - input.region_req_index); - for (size_t i = 0; i < input.source_instances.size(); i++) { - printf(" %zx", input.source_instances[i].get_location().id); - } - DefaultMapper::select_task_sources(ctx, task, input, output); - printf(" chosen = %zx\n", output.chosen_ranking.front().get_location().id); -} -#endif - -void update_mappers(Machine machine, - Runtime *runtime, - std::set const &local_procs) { - std::vector *gpus = new std::vector(); - std::map *proc_fbmems = new std::map(); - std::vector *cpus = new std::vector(); - // std::map* proc_zcmems = new std::map(); - std::vector proc_mem_affinities; - machine.get_proc_mem_affinity(proc_mem_affinities); - Machine::ProcessorQuery proc_query(machine); - for (Machine::ProcessorQuery::iterator it = proc_query.begin(); - it != proc_query.end(); - it++) { - if (it->kind() == Processor::TOC_PROC) { - gpus->push_back(*it); - Machine::MemoryQuery fb_query(machine); - fb_query.only_kind(Memory::GPU_FB_MEM); - fb_query.best_affinity_to(*it); - assert(fb_query.count() == 1); - (*proc_fbmems)[*it] = *(fb_query.begin()); - } else if (it->kind() == Processor::LOC_PROC) { - cpus->push_back(*it); - } - } - - /* - for (unsigned idx = 0; idx < proc_mem_affinities.size(); ++idx) { - Machine::ProcessorMemoryAffinity& affinity = proc_mem_affinities[idx]; - if (affinity.p.kind() == Processor::TOC_PROC) { - if (affinity.m.kind() == Memory::GPU_FB_MEM) { - (*proc_fbmems)[affinity.p] = affinity.m; - } - else if (affinity.m.kind() == Memory::Z_COPY_MEM) { - (*proc_zcmems)[affinity.p] = affinity.m; - } - } - } - - for (std::map::iterator it = proc_fbmems->begin(); - it != proc_fbmems->end(); it++) { - gpus->push_back(it->first); - } - */ - - for (std::set::const_iterator it = local_procs.begin(); - it != local_procs.end(); - it++) { - RnnMapper *mapper = new RnnMapper(runtime->get_mapper_runtime(), - machine, - *it, - "rnn_mapper", - gpus, - proc_fbmems, - cpus); - runtime->replace_default_mapper(mapper, *it); - } -} - -MappingTagID RnnMapper::assign_to_gpu(int idx) { - 
assert(idx <= 0xFFFF); - return (ASSIGN_TO_GPU_MASK | idx); -} diff --git a/nmt/rnn_mapper.h b/nmt/rnn_mapper.h deleted file mode 100644 index 357eab97ba..0000000000 --- a/nmt/rnn_mapper.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __RNN_MAPPER_H__ -#define __RNN_MAPPER_H__ - -#include "default_mapper.h" -#include "legion.h" -#include "ops.h" - -using namespace Legion; -using namespace Legion::Mapping; - -class RnnMapper : public DefaultMapper { -public: - RnnMapper(MapperRuntime *rt, - Machine machine, - Processor local, - char const *mapper_name, - std::vector *gpus, - std::map *proc_fbmems, - std::vector *cpus); - -public: - virtual void select_task_options(const MapperContext ctx, - Task const &task, - TaskOptions &output); - // virtual void slice_task(const MapperContext ctx, - // const Task& task, - // const SliceTaskInput& input, - // SliceTaskOutput& output); - // virtual void map_task(const MapperContext ctx, - // const Task& task, - // const MapTaskInput& input, - // MapTaskOutput& output); - // virtual void select_task_sources(const MapperContext ctx, - // const Task& task, - // const SelectTaskSrcInput& input, - // SelectTaskSrcOutput& output); - static MappingTagID assign_to_gpu(int gpuIdx); - -protected: - std::vector &gpus; - std::map &proc_fbmems; - std::vector &cpus; -}; - -void update_mappers(Machine machine, - Runtime *rt, - std::set const &local_procs); -#endif diff --git a/nmt/softmax_data_parallel.cu b/nmt/softmax_data_parallel.cu deleted file mode 100644 index 9b41a332ec..0000000000 --- a/nmt/softmax_data_parallel.cu +++ /dev/null @@ -1,392 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "../cnn_helper.h" -#include "rnn.h" -#include "rnn_mapper.h" - -struct SoftmaxDPInitParams { - DnnHandle handle; - int batchSize; - bool profiling; -}; - -Tensor RnnModel::add_softmaxDP_node(Tensor logit, - Tensor label, - ParallelConfig pc) { - assert(logit.numDim == 3); - assert(logit.adim[2] == LSTM_PER_NODE_LENGTH); - assert(logit.pdim[2] == LSTM_PER_NODE_LENGTH); - SoftmaxDP *node = new SoftmaxDP(config, logit, label, pc); - layers.push_back(node); - return node->outputs[0]; -} - -SoftmaxDP::SoftmaxDP(RnnConfig config, - Tensor logit, - Tensor _label, - ParallelConfig pc) - : RnnOp(logit, pc, SharedVariable::NO_VARIABLE), label(_label) { - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - assert(pc.nDims == 1); - int num_par_n = pc.dim[0]; - { - Rect<1> rect(Point<1>(0), Point<1>(num_par_n - 1)); - part_rect = rect; - } - IndexSpaceT<1> part_is = runtime->create_index_space(ctx, part_rect); - int batch_size = logit.adim[1]; - int output_size = logit.adim[0]; - FieldSpace fs = config.field_space; - Rect<3, coord_t> y_rect( - Point<3>(0, 0, 0), - Point<3>(output_size - 1, batch_size - 1, LSTM_PER_NODE_LENGTH - 1)); - IndexSpaceT<3> y_is = runtime->create_index_space(ctx, y_rect); - LogicalRegion y_lr = runtime->create_logical_region(ctx, y_is, fs); - LogicalRegion y_grad_lr = runtime->create_logical_region(ctx, y_is, fs); - assert(batch_size % num_par_n == 0); - int extent_n = batch_size / num_par_n; - Rect<3, coord_t> extent( - Point<3>(0, 0, 0), - Point<3>(output_size - 1, extent_n - 1, LSTM_PER_NODE_LENGTH - 1)); - Transform<3, 1, coord_t> trans; - trans[0][0] = 0; - trans[1][0] = extent_n; - trans[2][0] = 0; - IndexPartition y_ip = runtime->create_partition_by_restriction( - ctx, y_is, part_is, trans, extent); - assert(runtime->is_index_partition_disjoint(ctx, y_ip)); - assert(runtime->is_index_partition_complete(ctx, y_ip)); - LogicalPartition y_lp = runtime->get_logical_partition(ctx, y_lr, y_ip); - LogicalPartition y_grad_lp = - runtime->get_logical_partition(ctx, y_grad_lr, y_ip); - outputs[0].numDim = 3; - outputs[0].adim[0] = output_size; - outputs[0].adim[1] = batch_size; - outputs[0].adim[2] = LSTM_PER_NODE_LENGTH; - outputs[0].pdim[0] = output_size; - outputs[0].pdim[1] = extent_n; - outputs[0].pdim[2] = LSTM_PER_NODE_LENGTH; - outputs[0].region = y_lr; - outputs[0].partition = y_lp; - outputs[0].region_grad = y_grad_lr; - outputs[0].partition_grad = y_grad_lp; - // Every partition reads all input_channels - // Use the same partitioning as outputs - // if (inputs[0].pdim[0] == outputs[0].pdim[0] - // && inputs[0].pdim[1] == outputs[0].pdim[1]) { - // logit_lp = inputs[0].partition; - // logit_grad_lp = inputs[0].partition_grad; - //} else { - IndexSpaceT<3> logit_is(inputs[0].region.get_index_space()); - IndexPartition logit_ip = runtime->create_partition_by_restriction( - ctx, logit_is, part_is, trans, extent); - logit_lp = runtime->get_logical_partition(ctx, inputs[0].region, logit_ip); - logit_grad_lp = - runtime->get_logical_partition(ctx, inputs[0].region_grad, logit_ip); - //} -} - -/* - regions[0](I): x - regions[1](O): y -*/ -OpMeta *SoftmaxDP::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - SoftmaxDPInitParams const *softmaxDP = (SoftmaxDPInitParams *)task->args; - AccessorRO const acc_x(regions[0], FID_DATA); - AccessorWO const acc_y(regions[1], FID_DATA); - Rect<3> rect_x = runtime->get_index_space_domain( - ctx, 
task->regions[0].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(acc_x.accessor.is_dense_arbitrary(rect_x)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - SoftmaxDPMeta *m = new SoftmaxDPMeta(softmaxDP->handle); - m->profiling_runtime = softmaxDP->profiling; - m->batchSize = softmaxDP->batchSize; -#ifndef DISABLE_COMPUTATION - checkCUDNN(cudnnCreateTensorDescriptor(&m->inputTensor)); - assert(rect_x == rect_y); - int input_c = rect_x.hi[0] - rect_x.lo[0] + 1; - int input_n = (rect_x.hi[1] - rect_x.lo[1] + 1) * LSTM_PER_NODE_LENGTH; - checkCUDNN(cudnnSetTensor4dDescriptor(m->inputTensor, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - input_n, - input_c, - 1, - 1)); -#endif - return m; -} - -void SoftmaxDP::init(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - SoftmaxDPInitParams initParams; - initParams.handle = model.dnn_handlers[paraConfig.gpu[idx]]; - initParams.batchSize = model.config.batchSize; - initParams.profiling = false; - TaskLauncher launcher(RNN_SOFTMAXDP_INIT_TASK_ID, - TaskArgument(&initParams, sizeof(initParams)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - { - LogicalRegion x = runtime->get_logical_subregion_by_color(logit_lp, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[0].region)); - launcher.add_field(0, FID_DATA); - } - { - LogicalRegion y = - runtime->get_logical_subregion_by_color(outputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(y, WRITE_ONLY, EXCLUSIVE, outputs[0].region)); - launcher.add_field(1, FID_DATA); - } - Future f = runtime->execute_task(ctx, launcher); - meta[idx] = f.get_result(); - } -} - -/* - regions[0](I): x - regions[1](O): y -*/ -void SoftmaxDP::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - assert(regions.size() == 2); - assert(task->regions.size() == 2); - float alpha = 1.0f, beta = 0.0f; - SoftmaxDPMeta const *m = *((SoftmaxDPMeta **)task->args); - AccessorRO const acc_x(regions[0], FID_DATA); - AccessorWO const acc_y(regions[1], FID_DATA); - Rect<3> rect_x = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(acc_x.accessor.is_dense_arbitrary(rect_x)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - float const *x_ptr = acc_x.ptr(rect_x.lo); - float *y_ptr = acc_y.ptr(rect_y.lo); - - cudaEvent_t t_start, t_end; - if (m->profiling_runtime) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start); - } - cudaStream_t stream; - checkCUDA(cudaStreamCreate(&stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - m->inputTensor, - x_ptr, - &beta, - m->inputTensor, - y_ptr)); - if (m->profiling_runtime) { - cudaEventRecord(t_end); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("SoftmaxDP forward time = %.2fms\n", elapsed); - } -#ifdef PRINT_INTERMEDIATE_RESULT 
- print_tensor<3, float>(y_ptr, rect_y, "softmax"); -#endif -#endif -} - -void SoftmaxDP::forward(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - TaskLauncher launcher(RNN_SOFTMAXDP_FWD_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - { - LogicalRegion x = runtime->get_logical_subregion_by_color(logit_lp, dp); - launcher.add_region_requirement( - RegionRequirement(x, READ_ONLY, EXCLUSIVE, inputs[0].region)); - launcher.add_field(0, FID_DATA); - } - { - LogicalRegion y = - runtime->get_logical_subregion_by_color(outputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(y, WRITE_ONLY, EXCLUSIVE, outputs[0].region)); - launcher.add_field(1, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } -} - -__global__ void SoftmaxLossBackprop(float *input, - int const *label, - int vocab_size, - int batch_size) { - CUDA_KERNEL_LOOP(i, batch_size) { - int label_idx = label[i]; - input[i * vocab_size + label_idx] -= 1.0f; - } -} - -/* - regions[0](O): x_grad - regions[1](I): y - regions[2](I): labels -*/ -void SoftmaxDP::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { -#ifndef DISABLE_COMPUTATION - assert(regions.size() == 3); - assert(task->regions.size() == 3); - SoftmaxDPMeta const *m = *((SoftmaxDPMeta **)task->args); - AccessorWO const acc_x_grad(regions[0], FID_DATA); - AccessorRO const acc_y(regions[1], FID_DATA); - AccessorRO const acc_label(regions[2], FID_DATA); - Rect<3> rect_x_grad = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Rect<3> rect_y = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Rect<2> rect_label = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - assert(acc_x_grad.accessor.is_dense_arbitrary(rect_x_grad)); - assert(acc_y.accessor.is_dense_arbitrary(rect_y)); - assert(acc_label.accessor.is_dense_arbitrary(rect_label)); - float *x_grad_ptr = acc_x_grad.ptr(rect_x_grad.lo); - float const *y_ptr = acc_y.ptr(rect_y.lo); - int const *label_ptr = acc_label.ptr(rect_label.lo); - assert(rect_x_grad == rect_y); - assert(rect_y.hi[1] - rect_y.lo[1] == rect_label.hi[0] - rect_label.lo[0]); - assert(rect_y.hi[2] - rect_y.lo[2] == rect_label.hi[1] - rect_label.lo[1]); - int num_labels = rect_label.volume(); - int vocab_size = rect_y.hi[0] - rect_y.lo[0] + 1; - - cudaEvent_t t_start, t_end; - if (m->profiling_runtime) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start); - } - checkCUDA(cudaMemcpyAsync(x_grad_ptr, - y_ptr, - rect_x_grad.volume() * sizeof(float), - cudaMemcpyDeviceToDevice)); - SoftmaxLossBackprop<<>>( - x_grad_ptr, label_ptr, vocab_size, num_labels); - - // Accouting for batch size in SGD - float scalVal = 1.0f / static_cast(m->batchSize); - scale_kernel<<>>( - x_grad_ptr, rect_x_grad.volume(), 0.0f, scalVal); - // checkCUDA(cublasSscal(m->handle.blas, rect_x_grad.volume(), - // &scalVal, x_grad_ptr, 1)); - if (m->profiling_runtime) { - cudaEventRecord(t_end); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("Softmax backward time = 
%.2fms\n", elapsed); - } -#ifdef PRINT_INTERMEDIATE_RESULT - print_tensor<3, float>(x_grad_ptr, rect_x_grad, "softmax bwd:x_grad"); - float *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(float) * rect_x_grad.volume(), - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpy(host_ptr, - x_grad_ptr, - sizeof(float) * rect_x_grad.volume(), - cudaMemcpyDeviceToHost)); - int idx = 0; - float loss = 0.0f; - for (PointInRectIterator<3> it(rect_x_grad); it(); it++, idx++) { - if (host_ptr[idx] < 0) { - loss += -std::log(host_ptr[idx] + 1); - } - } - printf("lost = %.4lf\n", loss); - checkCUDA(cudaFreeHost(host_ptr)); -#endif -#endif -} - -void SoftmaxDP::backward(RnnModel const &model) { - Context ctx = model.config.lg_ctx; - Runtime *runtime = model.config.lg_hlr; - int idx = 0; - for (PointInRectIterator<1> it(part_rect); it(); it++, idx++) { - OpMeta *mp = meta[idx]; - TaskLauncher launcher(RNN_SOFTMAXDP_BWD_TASK_ID, - TaskArgument(&mp, sizeof(OpMeta *)), - Predicate::TRUE_PRED, - 0 /*MapperID*/, - RnnMapper::assign_to_gpu(paraConfig.gpu[idx])); - DomainPoint dp(*it); - { - LogicalRegion x = - runtime->get_logical_subregion_by_color(logit_grad_lp, dp); - launcher.add_region_requirement( - RegionRequirement(x, WRITE_ONLY, EXCLUSIVE, inputs[0].region_grad)); - launcher.add_field(0, FID_DATA); - } - { - LogicalRegion y = - runtime->get_logical_subregion_by_color(outputs[0].partition, dp); - launcher.add_region_requirement( - RegionRequirement(y, READ_ONLY, EXCLUSIVE, outputs[0].region)); - launcher.add_field(1, FID_DATA); - } - { - LogicalRegion l = - runtime->get_logical_subregion_by_color(label.partition, dp); - launcher.add_region_requirement( - RegionRequirement(l, READ_ONLY, EXCLUSIVE, label.region)); - launcher.add_field(2, FID_DATA); - } - runtime->execute_task(ctx, launcher); - } -} - -void SoftmaxDP::update(RnnModel const &model) {} diff --git a/scripts/FC_env_setup.sh b/scripts/FC_env_setup.sh deleted file mode 100755 index ad58118761..0000000000 --- a/scripts/FC_env_setup.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Cd into FF_HOME -cd "${BASH_SOURCE[0]%/*}/../" - -export GASNET=${PWD}/GASNet-2019.9.0 -export LEGION=${PWD}/legion -export PROTOBUF=${PWD}/protobuf - -module unload cuda cudnn NCCL - -#cuda v10 -#module load cuda/10.0 -#module load cudnn/v7.6-cuda.10.0 -#module load NCCL/2.4.8-1-cuda.10.0 -#export CUDA=/public/apps/cuda/10.1 -#export CUDNN=/public/apps/cudnn/v7.6/cuda -#export NCCL=/public/apps/NCCL/2.4.8-1 - -#cuda v9.2 -module load cuda/9.2 -module load cudnn/v7.3-cuda.9.2 -module load NCCL/2.2.13-1-cuda.9.2 -export CUDA=/public/apps/cuda/9.2 -export CUDNN=/public/apps/cudnn/v7.3/cuda -export NCCL=/public/apps/NCCL/2.2.13-1 - -module load cmake/3.15.3/gcc.7.3.0 -module load anaconda3/2019.07 - -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PROTOBUF/src/.libs -export PATH=$PATH:$PROTOBUF diff --git a/scripts/FC_setup.sh b/scripts/FC_setup.sh deleted file mode 100644 index 537d0c0b83..0000000000 --- a/scripts/FC_setup.sh +++ /dev/null @@ -1,34 +0,0 @@ -#! /usr/bin/env bash -set -euo pipefail - -# Cd into FF_HOME -cd "${BASH_SOURCE[0]%/*}/../" - -git submodule update --init --recursive -./scripts/FC_env_setup.sh - -cd "$PROTOBUF" -git submodule update --init --recursive -##git checkout 6d4e7fd #still cannot get the strategy compile to use the local runtime. So need to checkout v 3.10.0 -./autogen.sh -./configure -make -j -cd .. - -cd "$GASNET" -./FC.build_script.sh -cd .. - -cd src/runtime -../../protobuf/src/protoc --cpp_out=. 
strategy.proto -./gen_strategy.sh 8 8 1 # for 8 gpu per node, and 8 embeddings per node, and 1 node -cd ../.. - -cd "$LEGION" -git checkout control_replication -cd ../ - - -make app=examples/DLRM/dlrm -j -cd examples/DLRM -./run_random.sh 1 \ No newline at end of file diff --git a/scripts/FC_setup.txt b/scripts/FC_setup.txt deleted file mode 100644 index 0702815343..0000000000 --- a/scripts/FC_setup.txt +++ /dev/null @@ -1,24 +0,0 @@ -git clone --recursive -git submodule update --init --recursive -source FC_env_setup.sh - -cd $PROTOBUF -git submodule update --init --recursive -##git checkout 6d4e7fd #still cannot get the strategy compile to use the local runtime. So need to checkout v 3.10.0 -./autogen.sh -./configure -make -j -cd .. - -cd $GASNET -./FC.build_script.sh -cd .. - -cd src/runtime -../../protobuf/src/protoc --cpp_out=. strategy.proto -./gen_strategy.sh 8 8 # for 8 gpu and 8 embeddings -cd ../.. - -make app=examples/DLRM/dlrm -j -cd examples/DLRM -./run_random.sh 1 diff --git a/scripts/Makefile b/scripts/Makefile deleted file mode 100644 index 7fa21fb11a..0000000000 --- a/scripts/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -simulator: - nvcc simulator.cc -lcudnn -lcublas -std=c++11 -arch=compute_37 -code=sm_37 diff --git a/scripts/compile_protobuf.sh b/scripts/compile_protobuf.sh deleted file mode 100755 index bea26e6940..0000000000 --- a/scripts/compile_protobuf.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -euo pipefail - -cd src/runtime -protoc --cpp_out=. strategy.proto -cd ../.. diff --git a/scripts/osdi22ae/bert.sh b/scripts/osdi22ae/bert.sh deleted file mode 100755 index 18d2c3195c..0000000000 --- a/scripts/osdi22ae/bert.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running BERT with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/Transformer/transformer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 -b 8 --budget 30 - -echo "Running BERT Uno with data parallelism" -"$FF_HOME"/build/examples/cpp/Transformer/transformer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 -b 8 --budget 30 --only-data-parallel diff --git a/scripts/osdi22ae/candle_uno.sh b/scripts/osdi22ae/candle_uno.sh deleted file mode 100755 index 22458149f1..0000000000 --- a/scripts/osdi22ae/candle_uno.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running CANDLE Uno with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/candle_uno/candle_uno -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 - -echo "Running CANDLE Uno with data parallelism" -"$FF_HOME"/build/examples/cpp/candle_uno/candle_uno -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 --only-data-parallel diff --git a/scripts/osdi22ae/dlrm.sh b/scripts/osdi22ae/dlrm.sh deleted file mode 100755 index a75e78bc0a..0000000000 --- a/scripts/osdi22ae/dlrm.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running DLRM with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/DLRM/dlrm -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 - -echo "Running DLRM with data parallelism" -"$FF_HOME"/build/examples/cpp/DLRM/dlrm -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 --only-data-parallel diff --git a/scripts/osdi22ae/inception.sh b/scripts/osdi22ae/inception.sh deleted file mode 100755 index 7b6c079eab..0000000000 --- a/scripts/osdi22ae/inception.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! 
/usr/bin/env bash - -echo "Running Inception-v3 with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/InceptionV3/inception -ll:gpu 4 -ll:fsize 11000 -ll:zsize 14000 -b 64 --budget 10 - -echo "Running Inception-v3 with data parallelism" -"$FF_HOME"/build/examples/cpp/InceptionV3/inception -ll:gpu 4 -ll:fsize 11000 -ll:zsize 14000 -b 64 --budget 10 --only-data-parallel diff --git a/scripts/osdi22ae/mlp.sh b/scripts/osdi22ae/mlp.sh deleted file mode 100755 index fa84607983..0000000000 --- a/scripts/osdi22ae/mlp.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running MLP with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/MLP_Unify/mlp_unify -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 - -echo "Running MLP with data parallelism" -"$FF_HOME"/build/examples/cpp/MLP_Unify/mlp_unify -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 --only-data-parallel diff --git a/scripts/osdi22ae/resnext-50.sh b/scripts/osdi22ae/resnext-50.sh deleted file mode 100755 index c73e079361..0000000000 --- a/scripts/osdi22ae/resnext-50.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running ResNeXt-50 with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/resnext50/resnext50 -ll:gpu 4 -ll:fsize 12000 -ll:zsize 14000 -b 16 --budget 20 - -echo "Running ResNeXt-50 with data parallelism" -"$FF_HOME"/build/examples/cpp/resnext50/resnext50 -ll:gpu 4 -ll:fsize 12000 -ll:zsize 14000 -b 16 --budget 20 --only-data-parallel diff --git a/scripts/osdi22ae/xdl.sh b/scripts/osdi22ae/xdl.sh deleted file mode 100755 index fcb5172b30..0000000000 --- a/scripts/osdi22ae/xdl.sh +++ /dev/null @@ -1,7 +0,0 @@ -#! /usr/bin/env bash - -echo "Running XDL with a parallelization strategy discovered by Unity" -"$FF_HOME"/build/examples/cpp/XDL/xdl -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 - -echo "Running XDL with data parallelism" -"$FF_HOME"/build/examples/cpp/XDL/xdl -ll:gpu 4 -ll:fsize 14000 -ll:zsize 14000 --budget 20 --only-data-parallel diff --git a/scripts/test_run.sh b/scripts/test_run.sh deleted file mode 100644 index 9ff8f71129..0000000000 --- a/scripts/test_run.sh +++ /dev/null @@ -1,38 +0,0 @@ -#! /usr/bin/env bash -set -euo pipefail - -# Cd into FF_HOME -cd "${BASH_SOURCE[0]%/*}/../" - -# git checkout dcr # We are using the dcr branch by default -git submodule update --init --recursive -./scripts/FC_env_setup.sh - -cd "$PROTOBUF" -git submodule update --init --recursive -##git checkout 6d4e7fd #still cannot get the strategy compile to use the local runtime. So need to checkout v 3.10.0 -./autogen.sh -./configure -make -j -cd .. - -cd "$GASNET" -./FC.build_script.sh -cd .. - -cd src/runtime -../../protobuf/src/protoc --cpp_out=. strategy.proto -./gen_strategy.sh 8 8 1 # for 8 gpu per node, and 8 embeddings per node, and 1 node -./gen_strategy.sh 2 1 1 # for 2 gpu per node, testing purpose -cd ../.. - -cd "$LEGION" -git checkout control_replication -cd ../ - - -make app=src/ops/tests/concat_test -j -f Makefile -cd src/ops/tests -./test_run_FF_target.sh concat_test 2 && cp output.txt output_2gpus.txt -./test_run_FF_target.sh concat_test 1 && cp output.txt output_1gpus.txt - From de6933b57a8b3d37bb26fb5e8a0bd77326c28c6b Mon Sep 17 00:00:00 2001 From: Pinku Surana Date: Thu, 5 Oct 2023 15:52:51 -0400 Subject: [PATCH 246/344] Compare flle path reliably (#1173) Fixes #957. `-ef` is "True if file1 and file2 refer to the same device and inode numbers." 
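A minimal sketch of the difference, using hypothetical paths that are not part of this patch: `==` compares the two path strings literally, while `-ef` resolves both names and compares device and inode numbers, so it also matches when the same directory is reached through a symlink or a differently spelled path.

    # Hypothetical illustration only; /tmp/ff_build and /tmp/ff_link are made up for this example.
    mkdir -p /tmp/ff_build
    ln -sfn /tmp/ff_build /tmp/ff_link
    [[ "/tmp/ff_link" == "/tmp/ff_build" ]] && echo "string match" || echo "no string match"   # prints: no string match
    [[ "/tmp/ff_link" -ef "/tmp/ff_build" ]] && echo "same inode" || echo "different inodes"   # prints: same inode

The diff below applies exactly this change to the `$SCRIPT_DIR` / `$BUILD_FOLDER` check in the generated flexflow_python script.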
--- python/flexflow_python_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow_python_build.py b/python/flexflow_python_build.py index 65aff5af56..45b858b113 100755 --- a/python/flexflow_python_build.py +++ b/python/flexflow_python_build.py @@ -43,7 +43,7 @@ f'BUILD_FOLDER="{build_dir}"', 'SCRIPT_DIR="$(realpath "${BASH_SOURCE[0]%/*}")"', 'legion_python_args=("$@" "-ll:py" "1")', - 'if [[ "$SCRIPT_DIR" == "$BUILD_FOLDER" ]]; then', + 'if [[ "$SCRIPT_DIR" -ef "$BUILD_FOLDER" ]]; then', f'\tPYTHON_FOLDER="{script_dir}"', '\tPYLIB_PATH="$("$PYTHON_FOLDER"/flexflow/findpylib.py)"', '\tPYLIB_DIR="$(dirname "$PYLIB_PATH")"', From 50ff264ee02956bd464e575c70011ed9550bef56 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 8 Oct 2023 19:39:01 -0400 Subject: [PATCH 247/344] [Tool] - Add mechanism to save operators' tensors to file (#1174) * add model id, layer_id and op_name to opmeta * pass model id to opmeta * . * implement inference tensor save function * add calls to save tensors function in ops * more ops * done * fix bugs, implement batchconfig << operator, add function to save bc to file * fixes * hip_rocm fixes * fix * fix bug * fix ci * removed out of date incmha inference test * add save tensors function to fused.cu --- .gitignore | 2 + conda/pytorch-gpu.yml | 2 +- include/flexflow/batch_config.h | 13 +- include/flexflow/config.h | 1 + include/flexflow/fftype.h | 4 +- include/flexflow/layer.h | 1 + include/flexflow/model.h | 5 + include/flexflow/op_meta.h | 4 + include/flexflow/operator.h | 10 + .../ops/add_bias_residual_layer_norm.h | 1 - include/flexflow/ops/element_unary.h | 1 - .../ops/inc_multihead_self_attention.h | 3 - include/flexflow/ops/kernels/concat_kernels.h | 1 - .../flexflow/ops/kernels/conv_2d_kernels.h | 1 - .../ops/kernels/element_binary_kernels.h | 1 - include/flexflow/ops/kernels/linear_kernels.h | 1 - .../flexflow/ops/kernels/pool_2d_kernels.h | 1 - .../ops/kernels/residual_rms_norm_kernels.h | 1 - .../flexflow/ops/kernels/rms_norm_kernels.h | 1 - .../flexflow/ops/kernels/softmax_kernels.h | 2 +- include/flexflow/ops/layer_norm.h | 1 - include/flexflow/ops/linear.h | 1 + include/flexflow/ops/residual_layer_norm.h | 1 - include/flexflow/ops/residual_rms_norm.h | 1 - include/flexflow/ops/rms_norm.h | 1 - include/flexflow/ops/sigmoid_silu_multi.h | 1 - include/flexflow/utils/hip_helper.h | 3 + inference/python/incr_decoding.py | 1 + inference/python/spec_infer.py | 1 + python/flexflow/core/__init__.py | 1 + python/flexflow/serve/__init__.py | 8 + src/c/flexflow_c.cc | 3 +- src/ops/add_bias_residual_layer_norm.cc | 65 +- src/ops/add_bias_residual_layer_norm.cpp | 1 + src/ops/add_bias_residual_layer_norm.cu | 1 + src/ops/aggregate.cc | 3 + src/ops/aggregate_spec.cc | 3 + src/ops/arg_topk.cc | 18 +- src/ops/argmax.cc | 21 +- src/ops/attention.cc | 3 + src/ops/batch_matmul.cc | 3 + src/ops/batch_norm.cpp | 1 + src/ops/batch_norm.cu | 1 + src/ops/beam_topk.cc | 78 +- src/ops/cache.cc | 3 + src/ops/cast.cc | 2 + src/ops/concat.cc | 2 + src/ops/conv_2d.cc | 8 +- src/ops/dropout.cc | 2 + src/ops/element_binary.cc | 19 +- src/ops/element_unary.cc | 31 +- src/ops/embedding.cc | 77 +- src/ops/experts.cc | 39 +- src/ops/fused.cu | 25 + src/ops/gather.cc | 8 +- src/ops/group_by.cc | 41 +- src/ops/inc_multihead_self_attention.cc | 797 +----------------- src/ops/inc_multihead_self_attention.cu | 15 - src/ops/kernels/dropout_kernels.cpp | 1 + src/ops/kernels/dropout_kernels.cu | 1 + src/ops/kernels/element_binary_kernels.cu | 1 + 
src/ops/kernels/softmax.cpp | 1 + src/ops/kernels/softmax.cu | 1 + src/ops/layer_norm.cc | 25 +- src/ops/layer_norm.cu | 1 + src/ops/linear.cc | 22 +- src/ops/pool_2d.cc | 2 + src/ops/reduce.cc | 8 +- src/ops/reshape.cc | 8 +- src/ops/residual_layer_norm.cc | 36 +- src/ops/residual_layer_norm.cpp | 1 + src/ops/residual_layer_norm.cu | 1 + src/ops/residual_rms_norm.cc | 17 +- src/ops/rms_norm.cc | 16 +- src/ops/sampling.cc | 12 +- src/ops/sigmoid_silu_multi.cc | 17 +- src/ops/sigmoid_silu_multi.cpp | 1 + src/ops/sigmoid_silu_multi.cu | 1 + src/ops/softmax.cc | 89 +- src/ops/spec_inc_multihead_self_attention.cc | 25 +- src/ops/topk.cc | 3 + src/ops/transpose.cc | 3 + src/ops/tree_inc_multihead_self_attention.cc | 16 +- src/runtime/batch_config.cc | 84 +- src/runtime/beam_search_batch_config.cc | 140 +-- src/runtime/cuda_helper.cu | 104 ++- src/runtime/fftype.cc | 13 +- src/runtime/graph.cc | 30 +- src/runtime/hip_helper.cpp | 117 +++ src/runtime/layer.cc | 6 +- src/runtime/model.cc | 20 +- src/runtime/operator.cc | 109 +++ src/runtime/tree_verify_batch_config.cc | 100 ++- .../python_test_configs/generate_configs.py | 1 + 94 files changed, 1109 insertions(+), 1270 deletions(-) diff --git a/.gitignore b/.gitignore index be0266c9b5..8fcc105f01 100644 --- a/.gitignore +++ b/.gitignore @@ -185,3 +185,5 @@ gpt_tokenizer # pip version python/flexflow/version.txt + +inference_tensors \ No newline at end of file diff --git a/conda/pytorch-gpu.yml b/conda/pytorch-gpu.yml index 677e71d73f..85d24ced17 100644 --- a/conda/pytorch-gpu.yml +++ b/conda/pytorch-gpu.yml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6 + - python>=3.6,<3.12 - pip - pip: - numpy>=1.16.0 diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 6dabc70f4b..108bc8d172 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -46,7 +46,9 @@ class BatchConfig { static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_sequence_length(); + friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; + void save_to_file(std::string const &filename) const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); // Maximum possible values for different parameters @@ -55,9 +57,8 @@ class BatchConfig { static int const MAX_NUM_REQUESTS = 64; static int const MAX_NUM_TOKENS = 1024; - // These are set by update + // Set by update int num_tokens; - bool loading_prompt = false; struct PerRequestInfo { int token_start_offset; @@ -74,7 +75,7 @@ class BatchConfig { PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; - bool request_running[MAX_NUM_TOKENS]; + bool request_running[MAX_NUM_REQUESTS]; }; class TreeVerifyBatchConfig : public BatchConfig { @@ -82,7 +83,10 @@ class TreeVerifyBatchConfig : public BatchConfig { TreeVerifyBatchConfig(); ~TreeVerifyBatchConfig(); InferenceMode get_mode() const; + friend std::ostream &operator<<(std::ostream &os, + TreeVerifyBatchConfig const &bc); void print() const; + void save_to_file(std::string const &filename) const; struct CommittedTokensInfo { int token_index; // the index of the token in the previous batch int request_index; // request index in the batch @@ -108,7 +112,10 @@ class BeamSearchBatchConfig : public BatchConfig { ~BeamSearchBatchConfig(); + friend std::ostream &operator<<(std::ostream &os, + BeamSearchBatchConfig const &bc); void print() const; + void 
save_to_file(std::string const &filename) const; bool done() const; int max_beam_depth_all_requests() const; int current_depth_all_requests() const; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 9716060173..c2af6d707c 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -134,6 +134,7 @@ class FFConfig { Legion::Runtime *lg_hlr; // Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; + bool inference_debugging; size_t simulator_work_space_size; size_t search_budget; float search_alpha; diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 18ed6b8100..1cd90fda26 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -10,12 +10,12 @@ class LayerID { public: static const LayerID NO_ID; LayerID(); - LayerID(size_t id, size_t transformer_layer_id); + LayerID(size_t id, size_t transformer_layer_id, size_t model_id); bool is_valid_id() const; friend bool operator==(LayerID const &lhs, LayerID const &rhs); public: - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, model_id; }; }; // namespace FlexFlow diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 0c1d7a6092..69a57e4e1c 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -52,6 +52,7 @@ class Layer { bool trainableInputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; + bool inference_debugging; private: std::unordered_map int_properties; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 97ee553fb3..d8402ba622 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1234,6 +1234,8 @@ class FFModel { std::unordered_map cached_noop_ops; std::unordered_map cached_input_ops; std::vector all_valid_views; + int model_id; // unique incremental id assigned to each model. Used in the + // inference_debugging mode. #ifdef FF_USE_NCCL std::unordered_map view_hash_to_nccl_comms; #endif @@ -1262,6 +1264,9 @@ class FFModel { ElementUnary * unary(OperatorType op, char const *name = NULL, float scalar = 0.0); PCG::Node new_node(Op *); + static int model_counter; // number of instantiated FFModel objects. Used to + // assign a unique incremental id to each model. + // Used in the inference_debugging mode. 
}; class UtilityTasks { diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index 512844db92..60785a1e29 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -15,6 +15,10 @@ class OpMeta { public: FFHandler handle; bool profiling; // Measure the run time of the task + bool inference_debugging; + int decoding_step; + char op_name[MAX_OPNAME]; + LayerID layer_guid; bool trainableInputs[MAX_NUM_INPUTS]; DataType input_type[MAX_NUM_INPUTS]; DataType weight_type[MAX_NUM_WEIGHTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b2fc7bbfc..fd21436681 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -1,6 +1,7 @@ #ifndef _OPERATOR_H #define _OPERATOR_H +#include "flexflow/accessor.h" #include "flexflow/batch_config.h" #include "flexflow/fftype.h" #include "flexflow/machine_view.h" @@ -183,6 +184,7 @@ class Op { const ParallelTensor input4 = NULL); Op(int guid, bool profiling, + bool inference_debugging, OperatorType otype, DataType dtype, char const *name, @@ -225,6 +227,13 @@ class Op { assert(false); }; virtual void print_layer(FFModel const &model) = 0; + static void save_inference_tensors_to_file( + OpMeta *m, + int shard_id, + BatchConfig const *bc, + std::vector input_tensors, + std::vector weight_tensors, + std::vector output_tensors); virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const = 0; @@ -316,6 +325,7 @@ class Op { std::map inference_meta; int numInputs, numWeights, numOutputs; bool profiling; + bool inference_debugging; bool add_bias_only_once; #ifdef FF_USE_NCCL ncclUniqueId ncclId; diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 523f4d3b7c..bb470376c3 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -106,7 +106,6 @@ class AddBiasResidualLayerNormMeta : public OpMeta { int64_t effective_batch_size, effective_num_elements; float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; - char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; }; diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index f82db5f910..ddef59549c 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -26,7 +26,6 @@ class ElementUnaryMeta : public OpMeta { DataType data_type; bool inplace; float scalar; - char op_name[MAX_OPNAME]; }; class ElementUnary : public Op { diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 8290998f02..51a3b9fbe1 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -185,9 +185,6 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { bool *qk_prod_scaling; bool *position_bias; float scaling_factor; -#ifdef INFERENCE_TESTS - float *kcache, *vcache; -#endif void *weight_ptr, *bias_ptr; // for weight offload void *devQKVProjArray, *keyCache, *valueCache; void *qk_prods, *qk_prods_softmax; diff --git a/include/flexflow/ops/kernels/concat_kernels.h b/include/flexflow/ops/kernels/concat_kernels.h index 755e1800da..4da6aaf5e2 100644 --- a/include/flexflow/ops/kernels/concat_kernels.h +++ b/include/flexflow/ops/kernels/concat_kernels.h @@ -12,7 +12,6 @@ class ConcatMeta : public OpMeta { public: ConcatMeta(FFHandler handle) : OpMeta(handle){}; 
int legion_axis; - char op_name[MAX_OPNAME]; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/conv_2d_kernels.h b/include/flexflow/ops/kernels/conv_2d_kernels.h index a848d83d60..7b2a0fe135 100644 --- a/include/flexflow/ops/kernels/conv_2d_kernels.h +++ b/include/flexflow/ops/kernels/conv_2d_kernels.h @@ -28,7 +28,6 @@ class Conv2DMeta : public OpMeta { miopenConvBwdDataAlgorithm_t bwdDataAlgo; #endif bool relu, use_bias; - char op_name[MAX_OPNAME]; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/element_binary_kernels.h b/include/flexflow/ops/kernels/element_binary_kernels.h index b0c596301b..5a375fb661 100644 --- a/include/flexflow/ops/kernels/element_binary_kernels.h +++ b/include/flexflow/ops/kernels/element_binary_kernels.h @@ -23,7 +23,6 @@ class ElementBinaryMeta : public OpMeta { OperatorType op_type; bool inplace_a, has_same_operands; bool broadcast_input1, broadcast_input2; - char op_name[MAX_OPNAME]; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index bbebe3c79b..a5fdc7c602 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -34,7 +34,6 @@ class LinearMeta : public OpMeta { RegularizerMode kernel_reg_type; float kernel_reg_lambda; bool use_bias, add_bias_only_once; - char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; }; diff --git a/include/flexflow/ops/kernels/pool_2d_kernels.h b/include/flexflow/ops/kernels/pool_2d_kernels.h index ac86cb50c3..7f73a8295d 100644 --- a/include/flexflow/ops/kernels/pool_2d_kernels.h +++ b/include/flexflow/ops/kernels/pool_2d_kernels.h @@ -14,7 +14,6 @@ class Pool2DMeta : public OpMeta { ffActivationDescriptor_t actiDesc; ffPoolingDescriptor_t poolDesc; bool relu; - char op_name[MAX_OPNAME]; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index aa454711ec..0eef4ca72b 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -37,7 +37,6 @@ class ResidualRMSNormMeta : public OpMeta { int in_dim; int batch_size; int num_elements; - char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; }; diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 2063777ef1..35c5aa69fa 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -37,7 +37,6 @@ class RMSNormMeta : public OpMeta { int in_dim; int batch_size; int num_elements; - char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; }; diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 987a546459..8cfaf3c586 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -21,8 +21,8 @@ class SoftmaxMeta : public OpMeta { miopenTensorDescriptor_t outputTensor; #endif bool profiling; + bool inference_debugging; int dim; - char op_name[MAX_OPNAME]; DataType input_type, output_type; }; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index c65370e0fd..9e48d81190 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -123,7 +123,6 @@ class LayerNormMeta : public OpMeta { int64_t effective_batch_size, effective_num_elements; float eps; void 
*mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; - char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; }; diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index 025674c7ba..a32df80537 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -96,6 +96,7 @@ class Linear : public Op { private: Linear(int guid, bool profiling, + bool inference_debugging, const ParallelTensor input, int out_dim, ActiMode activation, diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 39f149554e..0e9be82125 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -106,7 +106,6 @@ class ResidualLayerNormMeta : public OpMeta { int64_t effective_batch_size, effective_num_elements; float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; - char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; }; diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index 783173c5eb..0d92a236e8 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -80,7 +80,6 @@ class ResidualRMSNorm : public Op { public: float eps; - char op_name[MAX_OPNAME]; int effective_batch_size; int dim, data_dim; }; diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index a3074de015..1dc940ebd3 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -79,7 +79,6 @@ class RMSNorm : public Op { public: float eps; - char op_name[MAX_OPNAME]; int effective_batch_size; int dim, data_dim; }; diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 6a69288607..604438260a 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -79,7 +79,6 @@ class SigmoidSiluMultiMeta : public OpMeta { ~SigmoidSiluMultiMeta(void); public: - char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; }; diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index be6f4a713d..5d3c831d4f 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -137,6 +137,9 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +template +void save_tensor(T const *ptr, size_t num_elements, char const *file_name); + template T *download_tensor(T const *ptr, size_t num_elements); diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index d8a494b4d5..3621ee83a3 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -55,6 +55,7 @@ def get_configs(): "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "inference_debugging": False, "fusion": True, } llm_configs = { diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index c9e87bd29f..3d0f1a1c0e 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -55,6 +55,7 @@ def get_configs(): "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "inference_debugging": False, "fusion": True, } llm_configs = { diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index ace6030a1b..d7b1a595d2 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py 
@@ -41,6 +41,7 @@ "num_cpus": "-ll:cpu", "legion_utility_processors": "-ll:util", "profiling": "--profiling", + "inference_debugging": "--inference-debugging", "fusion": "--fusion", "disable_control_replication": "--disable-control-replication", # Training args diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 9b282ae5f4..cf467280bd 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -45,6 +45,7 @@ def init( use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, profiling: Optional[bool] = None, + inference_debugging: Optional[bool] = None, fusion: Optional[bool] = None, ): """ @@ -71,6 +72,7 @@ def init( - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - profiling: whether to enable the FlexFlow profiling mode, defaults to False + - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments. @@ -104,6 +106,8 @@ def init( :type use_8bit_quantization: Optional[bool], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional + :param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False + :type inference_debugging: Optional[bool], optional :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True :type fusion: Optional[bool], optional @@ -128,6 +132,7 @@ def init( use_4bit_quantization is not None, use_8bit_quantization is not None, profiling is not None, + inference_debugging is not None, fusion is not None, ] ): @@ -152,6 +157,7 @@ def init( "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, "profiling": profiling, + "inference_debugging": inference_debugging, "fusion": fusion, } @@ -195,6 +201,8 @@ def init( configs_dict["use_8bit_quantization"] = False if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False + if configs_dict.get("inference_debugging", None) is None: + configs_dict["inference_debugging"] = False if configs_dict.get("fusion", None) is None: configs_dict["fusion"] = True diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 5bb5249f5d..80202f6f99 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1592,7 +1592,8 @@ flexflow_generation_result_t std::string const text_str(input_text); prompts.push_back(input_text); GenerationResult result = handle->generate(prompts, max_seq_length); - DEBUG_PRINT("[Model] generate %p %s %i", handle, text_str, max_seq_length); + DEBUG_PRINT( + "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length); assert(result.output_tokens.size() <= max_seq_length); output_length_and_tokens[0] = result.output_tokens.size(); std::copy(result.output_tokens.begin(), diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 159c82b346..42fbb3016a 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -508,6 +508,8 @@ OpMeta *AddBiasResidualLayerNorm::init_task( } meta->output_type[0] = ln->outputs[0]->data_type; 
meta->output_type[1] = ln->outputs[1]->data_type; + std::strcpy(meta->op_name, ln->name); + meta->layer_guid = ln->layer_guid; return meta; } @@ -620,7 +622,7 @@ void AddBiasResidualLayerNorm::inference_task( return; } - AddBiasResidualLayerNormMeta const *m = + AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == @@ -669,40 +671,6 @@ void AddBiasResidualLayerNorm::inference_task( assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); - // std::cout << std::endl << "INFERENCE task tensor dims:" << std::endl; - // std::cout << "input: "; - // for (int i=0; ielementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], regions[5], @@ -749,6 +717,26 @@ void AddBiasResidualLayerNorm::inference_task( attn_bias, gamma, beta); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(attn_bias); + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + if (m->use_bias) { + weights_accessors.push_back(beta); + } + } + AddBiasResidualLayerNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input, residual}, + weights_accessors, + {added_output, output}); + } } bool AddBiasResidualLayerNorm::measure_operator_cost( @@ -759,6 +747,7 @@ bool AddBiasResidualLayerNorm::measure_operator_cost( void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->axes.size()); for (size_t i = 0; i < this->axes.size(); i++) { sez.serialize(this->axes[i]); @@ -780,10 +769,11 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; float eps; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(num_axes); for (size_t i = 0; i < num_axes; i++) { int axis_idx; @@ -812,6 +802,7 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); hash_combine(key, params.axes.size()); for (int n : params.axes) { hash_combine(key, n); diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 3570ae42dc..1add43ecd9 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -34,6 +34,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 626e56d64f..ceb1a6514e 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -33,6 +33,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( effective_batch_size = 
ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index c7217bb700..67810d3f5b 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -235,6 +235,9 @@ OpMeta *Aggregate::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); AggregateMeta *m = new AggregateMeta(handle, agg->n); m->profiling = agg->profiling; + m->inference_debugging = agg->inference_debugging; + std::strcpy(m->op_name, agg->name); + m->layer_guid = agg->layer_guid; return m; } diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 5190983148..19b2edc14a 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -209,6 +209,9 @@ OpMeta *AggregateSpec::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg->n); m->profiling = agg->profiling; + m->inference_debugging = agg->inference_debugging; + std::strcpy(m->op_name, agg->name); + m->layer_guid = agg->layer_guid; return m; } diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index b877a9f96d..a06b89de07 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -241,7 +241,10 @@ OpMeta *ArgTopK::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); ArgTopKMeta *m = new ArgTopKMeta(handle, topk); m->profiling = topk->profiling; + m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; + std::strcpy(m->op_name, topk->name); + m->layer_guid = topk->layer_guid; return m; } @@ -308,7 +311,7 @@ InferenceResult InferenceResult ir; return ir; } - ArgTopKMeta const *m = *((ArgTopKMeta **)task->local_args); + ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -318,6 +321,13 @@ InferenceResult int batch_size = bc->num_active_tokens(); ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ArgTopK::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {indices}); + } + InferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); @@ -332,6 +342,7 @@ void ArgTopK::backward(FFModel const &ff) { void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->k); sez.serialize(this->sorted); } @@ -341,10 +352,11 @@ Node ArgTopK::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 1); - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); int k; bool sorted; dez.deserialize(k); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 7863931c82..f336c843e8 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -245,7 +245,10 @@ OpMeta 
*ArgMax::init_task(Task const *task, length * batch_size, gpu_mem_allocator); m->profiling = s->profiling; + m->inference_debugging = s->inference_debugging; m->beam_search = s->beam_search; + std::strcpy(m->op_name, s->name); + m->layer_guid = s->layer_guid; return m; } @@ -339,7 +342,7 @@ BeamInferenceResult BeamInferenceResult ir; return ir; } - ArgMaxMeta const *m = *((ArgMaxMeta **)task->local_args); + ArgMaxMeta *m = *((ArgMaxMeta **)task->local_args); GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -355,6 +358,14 @@ BeamInferenceResult indices.get_int32_ptr(), ir.token_ids, batch_size); download_tensor(m->probs, ir.probs, batch_size); download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ArgMax::save_inference_tensors_to_file( + m, shard_id, bc, {}, {}, {input, indices, parent}); + } + return ir; } @@ -365,7 +376,7 @@ InferenceResult Runtime *runtime) { assert(regions.size() == 2); assert(task->regions.size() == 2); - ArgMaxMeta const *m = *((ArgMaxMeta **)task->local_args); + ArgMaxMeta *m = *((ArgMaxMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { // Directly return for empty batch config @@ -381,6 +392,12 @@ InferenceResult int batch_size = bc->num_active_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); InferenceResult ir; + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ArgMax::save_inference_tensors_to_file( + m, shard_id, bc, {}, {}, {input, indices}); + } download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 027ea18634..1f71be07a8 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -521,6 +521,9 @@ OpMeta * MultiHeadAttentionMeta *m = new MultiHeadAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; + m->inference_debugging = attn->inference_debugging; + std::strcpy(m->op_name, attn->name); + m->layer_guid = attn->layer_guid; assert(acc_weight.rect.volume() * sizeof(float) == m->weightSize); return m; } diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index 977c5443b9..f4b06877e5 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -274,8 +274,11 @@ OpMeta *BatchMatmul::init_task(Task const *task, FFHandler handle = *((FFHandler const *)task->local_args); BatchMatmulMeta *m = new BatchMatmulMeta(handle); m->profiling = bmm->profiling; + m->inference_debugging = bmm->inference_debugging; m->a_seq_length_dim = bmm->a_seq_length_dim; m->b_seq_length_dim = bmm->b_seq_length_dim; + std::strcpy(m->op_name, bmm->name); + m->layer_guid = bmm->layer_guid; return m; } diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 34a7fcbe72..106e5ebad2 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -293,6 +293,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); relu = bn->relu; profiling = bn->profiling; + inference_debugging = bn->inference_debugging; mode = miopenBNSpatial; // #if HIPDNN_VERSION >= 7000 // mode = HIPDNN_BATCHNORM_SPATIAL_PERSISTENT; diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index 
c17244dce0..b77e9d489f 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -279,6 +279,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); relu = bn->relu; profiling = bn->profiling; + inference_debugging = bn->inference_debugging; mode = CUDNN_BATCHNORM_SPATIAL; #if CUDNN_VERSION >= 7000 mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 93a6de5a8f..2883428254 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -278,6 +278,9 @@ OpMeta *BeamTopK::init_task(Task const *task, MemoryAllocator gpu_mem_allocator(gpu_mem); BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator); m->profiling = topk->profiling; + m->inference_debugging = topk->inference_debugging; + std::strcpy(m->op_name, topk->name); + m->layer_guid = topk->layer_guid; m->sorted = topk->sorted; m->max_beam_width = topk->max_beam_width; m->input_type[0] = topk->inputs[0]->data_type; @@ -346,60 +349,36 @@ BeamInferenceResult assert(regions.size() == 4); assert(task->regions.size() == 4); - // BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; + BeamTopKMeta *m = *((BeamTopKMeta **)task->local_args); BeamSearchBatchConfig const &bc = Future(task->futures[0]).get_result(); - // std::cout << "beam search topk inference: " - // << "\n"; + if (bc.num_tokens == 0) { BeamInferenceResult ir; return ir; } - BeamTopKMeta const *m = *((BeamTopKMeta **)task->local_args); - Domain in1_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - // Domain out1_domain = runtime->get_index_space_domain( - // ctx, task->regions[1].region.get_index_space()); - Domain out2_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - int numdims = in1_domain.get_dim(); - - // float const *in_ptr = helperGetTensorPointerRO( - // regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - // float *value_ptr = helperGetTensorPointerWO( - // regions[1], task->regions[1], FID_DATA, ctx, runtime); - int *index_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW index = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime); + + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); - // ); - float *value_ptr = helperGetTensorPointerWO( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + int *index_ptr = index.get_int32_ptr(); + float *value_ptr = value.get_float_ptr(); + int *parent_ptr = parent.get_int32_ptr(); - int *parent_ptr = helperGetTensorPointerWO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); // embedding size: eg. 
4096 - int length = in1_domain.hi()[0] - in1_domain.lo()[0] + 1; - - // int k = out2_domain.hi()[0] - out2_domain.lo()[0] + 1; - + int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; // total token nums - // size_t tokens_per_request = in1_domain.hi()[1] - in1_domain.lo()[1] + 1; - // size_t batch_size = in1_domain.get_volume() / length; size_t batch_size = bc.num_active_tokens(); - // std::vector beam_width; - // std::unordered_map sub_requests = bc->sub_requests; - // for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { - // if (bc->request_completed[i]) { - // continue; - // } - // // add beam width for each main request - // beam_width.push_back(sub_requests[i]); - // std::cout << "sub req num: " <(index_ptr, ir.token_ids, batch_size * m->max_beam_width); download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - // if(m->output_type[0] == DT_FLOAT){ - // download_tensor(value.get_float_ptr(), ir.probs, batch_size * - // m->max_beam_width); - // }else if(m->output_type[0] == DT_HALF){ - // download_tensor(value.get_half_ptr(), ir.probs, batch_size * - // m->max_beam_width); - // } download_tensor( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + BeamTopK::save_inference_tensors_to_file( + m, shard_id, &bc, {input}, {}, {index, value, parent}); + } + return ir; } @@ -435,6 +415,7 @@ void BeamTopK::backward(FFModel const &ff) { void BeamTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->sorted); sez.serialize(this->max_beam_width); } @@ -445,11 +426,12 @@ Node BeamTopK::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 1); bool sorted; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; int max_beam_width; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(sorted); dez.deserialize(max_beam_width); BeamTopKParams params; diff --git a/src/ops/cache.cc b/src/ops/cache.cc index 339b2cab55..691e45b559 100644 --- a/src/ops/cache.cc +++ b/src/ops/cache.cc @@ -168,6 +168,9 @@ OpMeta *Cache::init_task(Task const *task, CacheMeta *m = new CacheMeta(handle); m->cache_score = 0.0f; m->profiling = c->profiling; + m->inference_debugging = c->inference_debugging; + std::strcpy(m->op_name, c->name); + m->layer_guid = c->layer_guid; return m; } diff --git a/src/ops/cast.cc b/src/ops/cast.cc index d98a54fe62..2a845cb303 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -193,6 +193,8 @@ OpMeta *Cast::init_task(Task const *task, CastMeta *m = new CastMeta(handler); m->input_data_type = cast->inputs[0]->data_type; m->output_data_type = cast->outputs[0]->data_type; + std::strcpy(m->op_name, cast->name); + m->layer_guid = cast->layer_guid; return m; } diff --git a/src/ops/concat.cc b/src/ops/concat.cc index 8014d1e145..80935e387b 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -201,7 +201,9 @@ OpMeta *Concat::init_task(Task const *task, // Note that our internal axis index ordering is opposite to other frameworks init_meta(m, cc->legion_axis); m->profiling = cc->profiling; + m->inference_debugging = cc->inference_debugging; std::strcpy(m->op_name, 
cc->name); + m->layer_guid = cc->layer_guid; return m; } diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index ce7b6ebc01..7d8fd32570 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -592,8 +592,10 @@ OpMeta *Conv2D::init_task(Task const *task, m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; + m->inference_debugging = conv->inference_debugging; m->trainableInputs[0] = conv->trainableInputs[0]; std::strcpy(m->op_name, conv->name); + m->layer_guid = conv->layer_guid; int input_w = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; int input_h = acc_input.rect.hi[1] - acc_input.rect.lo[1] + 1; @@ -1013,6 +1015,7 @@ bool Conv2D::estimate_sync_cost(Simulator *sim, void Conv2D::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->out_channels); sez.serialize(this->kernel_h); sez.serialize(this->kernel_w); @@ -1037,10 +1040,11 @@ Node Conv2D::deserialize(FFModel &ff, padding_w, groups; bool use_bias; ActiMode activation; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(out_channels); dez.deserialize(kernel_h); dez.deserialize(kernel_w); diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index e4021a404e..9b11c9d912 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -170,6 +170,8 @@ OpMeta *Dropout::init_task(Task const *task, .first(); assert(input_domain == output_domain); DropoutMeta *m = new DropoutMeta(handle, dropout, gpu_mem, output_domain); + std::strcpy(m->op_name, dropout->name); + m->layer_guid = dropout->layer_guid; return m; } diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 21edad11e3..aa31477815 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -420,11 +420,13 @@ OpMeta *ElementBinary::init_task(Task const *task, } m->op_type = eb->op_type; m->profiling = eb->profiling; + m->inference_debugging = eb->inference_debugging; m->inplace_a = eb->inplace_a; m->has_same_operands = eb->has_same_operands; m->broadcast_input1 = eb->broadcast_input1; m->broadcast_input2 = eb->broadcast_input2; std::strcpy(m->op_name, eb->name); + m->layer_guid = eb->layer_guid; Domain input1_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain input2_domain, output_domain; @@ -620,7 +622,7 @@ __host__ void return; } // const ElementBinary* ele = (const ElementBinary*) task->args; - ElementBinaryMeta const *m = *((ElementBinaryMeta **)task->local_args); + ElementBinaryMeta *m = *((ElementBinaryMeta **)task->local_args); GenericTensorAccessorR in1, in2; GenericTensorAccessorW out; Domain in1_domain = runtime->get_index_space_domain( @@ -705,8 +707,14 @@ __host__ void runtime); } } - forward_kernel_wrapper(m, in1, in2, out); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + ElementBinary::save_inference_tensors_to_file( + m, shard_id, bc, {in1, in2}, {}, {out}); + } } /* @@ -1011,6 +1019,7 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, ElementBinaryMeta *m = new ElementBinaryMeta(sim->handler, this); 
m->op_type = op_type; m->profiling = this->profiling; + m->inference_debugging = this->inference_debugging; m->inplace_a = this->inplace_a; m->has_same_operands = this->has_same_operands; m->broadcast_input1 = this->broadcast_input1; @@ -1103,6 +1112,7 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, void ElementBinary::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); sez.serialize(this->inplace_a); } @@ -1115,11 +1125,12 @@ Node ElementBinary::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); OperatorType op_type; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; bool inplace_a; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(op_type); dez.deserialize(inplace_a); diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index bdb594b0f6..9fb2e6dc1f 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -360,9 +360,11 @@ OpMeta *ElementUnary::init_task(Task const *task, // Input and output should have the same data type assert(eu->outputs[0]->data_type == eu->inputs[0]->data_type); m->profiling = eu->profiling; + m->inference_debugging = eu->inference_debugging; m->inplace = eu->inplace; m->scalar = eu->scalar; std::strcpy(m->op_name, eu->name); + m->layer_guid = eu->layer_guid; if (m->inplace) { assert(regions.size() == 1); assert(task->regions.size() == 1); @@ -525,7 +527,7 @@ void ElementUnary::forward_task_with_type( Context ctx, Runtime *runtime) { // const ElementUnary* ele = (const ElementUnary*) task->args; - ElementUnaryMeta const *m = *((ElementUnaryMeta **)task->local_args); + ElementUnaryMeta *m = *((ElementUnaryMeta **)task->local_args); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); const DT *input_ptr = NULL; @@ -550,6 +552,27 @@ void ElementUnary::forward_task_with_type( ElementUnary::forward_kernel_wrapper
( m, input_ptr, output_ptr, input_domain.get_volume()); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector input_accessors; + std::vector output_accessors; + if (m->inplace) { + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + output_accessors.push_back(output); + } else { + GenericTensorAccessorR input = helperGetGenericTensorAccessorWO( + m->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->data_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + input_accessors.push_back(input); + output_accessors.push_back(output); + } + ElementUnary::save_inference_tensors_to_file( + m, shard_id, nullptr, input_accessors, {}, output_accessors); + } } void ElementUnary::backward(FFModel const &ff) { @@ -699,6 +722,7 @@ void ElementUnary::serialize(Legion::Serializer &sez) const { sez.serialize(scalar); sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); } bool ElementUnary::measure_operator_cost(Simulator *sim, @@ -809,10 +833,11 @@ Node ElementUnary::deserialize(FFModel &ff, dez.deserialize(op_type); dez.deserialize(inplace); dez.deserialize(scalar); - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); ElementUnaryParams params; params.op_type = op_type; diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 409dcb398e..007e799fe0 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -416,7 +416,10 @@ OpMeta *Embedding::init_task(Task const *task, FFHandler handle = *((FFHandler const *)task->local_args); EmbeddingMeta *m = new EmbeddingMeta(handle, embed); m->profiling = embed->profiling; + m->inference_debugging = embed->inference_debugging; m->aggr = embed->aggr; + std::strcpy(m->op_name, embed->name); + m->layer_guid = embed->layer_guid; return m; } @@ -514,7 +517,7 @@ void Embedding::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - EmbeddingMeta const *m = *((EmbeddingMeta **)task->local_args); + EmbeddingMeta *m = *((EmbeddingMeta **)task->local_args); assert(regions.size() == 3); assert(task->regions.size() == 3); // Assert that weight and output must have the same data type @@ -561,75 +564,13 @@ void Embedding::forward_task(Task const *task, } forward_kernel_wrapper( m, input, output, kernel, in_dim, out_dim, effective_batch_size); -} - -#ifdef DEADCODE -template -void Embedding::forward_task_with_type( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - // const Embedding* embed = (Embedding*) task->args; - EmbeddingMeta const *m = *((EmbeddingMeta **)task->local_args); - Domain input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Domain kernel_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - if (m->aggr == AGGR_MODE_NONE) 
{ - // assert(kernel_domain.get_dim() == 2); - assert(input_domain.get_dim() + 1 == output_domain.get_dim()); - for (size_t i = 0; i < input_domain.get_dim(); i++) { - assert(input_domain.hi()[i] == output_domain.hi()[i + 1]); - assert(input_domain.lo()[i] == output_domain.lo()[i + 1]); - } - assert(kernel_domain.hi()[0] - kernel_domain.lo()[0] == - output_domain.hi()[0] - output_domain.lo()[0]); - } else { - // assert(kernel_domain.get_dim() == 2); - assert(input_domain.get_dim() == output_domain.get_dim()); - for (size_t i = 1; i < input_domain.get_dim(); i++) { - assert(input_domain.hi()[i] == output_domain.hi()[i]); - assert(input_domain.lo()[i] == output_domain.lo()[i]); - } - assert(kernel_domain.hi()[0] - kernel_domain.lo()[0] == - output_domain.hi()[0] - output_domain.lo()[0]); - } - const TI *input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - float const *kernel_ptr = helperGetTensorPointerRO( - regions[2], task->regions[2], FID_DATA, ctx, runtime); - - int in_dim, out_dim, effective_batch_size; - if (m->aggr == AGGR_MODE_NONE) { - in_dim = 1; - out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; - effective_batch_size = output_domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == input_domain.get_volume()); - } else { - in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; - out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; - effective_batch_size = output_domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == input_domain.get_volume()); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Embedding::save_inference_tensors_to_file( + m, shard_id, nullptr, {input}, {kernel}, {output}); } - - forward_kernel_wrapper(m, - input_ptr, - output_ptr, - kernel_ptr, - in_dim, - out_dim, - effective_batch_size, - m->aggr, - output_domain.get_volume()); } -#endif void Embedding::backward(FFModel const &ff) { ArgumentMap argmap; diff --git a/src/ops/experts.cc b/src/ops/experts.cc index c8b0ec0f26..6a7d622e51 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -398,6 +398,7 @@ void Experts::serialize(Legion::Serializer &sez) const { ExpertsParams params = get_params(); sez.serialize(params.layer_guid.id); sez.serialize(params.layer_guid.transformer_layer_id); + sez.serialize(params.layer_guid.model_id); sez.serialize(params.num_experts); sez.serialize(params.experts_start_idx); sez.serialize(params.experts_output_dim_size); @@ -418,10 +419,11 @@ Node Experts::deserialize(FFModel &ff, float alpha; ActiMode activation; bool use_bias; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(num_experts); dez.deserialize(experts_start_idx); dez.deserialize(experts_output_dim_size); @@ -593,6 +595,9 @@ OpMeta *Experts::init_task(Task const *task, exp->use_bias, exp->activation); m->profiling = exp->profiling; + m->inference_debugging = exp->inference_debugging; + std::strcpy(m->op_name, exp->name); + m->layer_guid = exp->layer_guid; return m; } @@ -732,7 +737,7 @@ void Experts::inference_task(Task const *task, Runtime *runtime) { 
assert(regions.size() == task->regions.size()); - ExpertsMeta const *m = *((ExpertsMeta **)task->local_args); + ExpertsMeta *m = *((ExpertsMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -743,14 +748,19 @@ void Experts::inference_task(Task const *task, assert(regions.size() - 4 == (1 + use_bias)); // get input, indices, topk_gate_preds, outputs - float const *input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int const *indices_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - float const *topk_gate_pred_ptr = helperGetTensorPointerRO( - regions[2], task->regions[2], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerWO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR indices = helperGetGenericTensorAccessorRO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR topk_gate_preds = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime); + + float const *input_ptr = input.get_float_ptr(); + int const *indices_ptr = indices.get_int32_ptr(); + float const *topk_gate_pred_ptr = topk_gate_preds.get_float_ptr(); + float *output_ptr = output.get_float_ptr(); assert(input_ptr != nullptr && indices_ptr != nullptr && topk_gate_pred_ptr != nullptr && output_ptr != nullptr); @@ -1107,6 +1117,13 @@ void Experts::inference_task(Task const *task, free(cpu_output_ptr); } #endif + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Experts::save_inference_tensors_to_file( + m, shard_id, bc, {input, indices, topk_gate_preds}, {}, {output}); + } } void Experts::forward_task(Task const *task, diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 5f2874e662..7d0d5841f0 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1088,6 +1088,31 @@ __host__ void assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + input_accessors_to_save.push_back(input_accessor[my_off]); + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + input_accessors_to_save.push_back(output_accessor[my_off]); + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + weight_accessors_to_save.push_back(weight_accessor[fused->op_weight_idx[i + woff]]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(output_accessor[i + ooff]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], shard_id, bc, input_accessors_to_save, weight_accessors_to_save, output_accessors_to_save); + } ioff += fused->op_num_inputs[op]; woff += 
fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; diff --git a/src/ops/gather.cc b/src/ops/gather.cc index 635c741d8b..d7c1dee44c 100644 --- a/src/ops/gather.cc +++ b/src/ops/gather.cc @@ -167,6 +167,7 @@ void Gather::serialize(Legion::Serializer &sez) const { sez.serialize(params.legion_dim); sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); } using PCG::Node; @@ -178,10 +179,11 @@ Node Gather::deserialize(FFModel &ff, assert(num_inputs == 2); int legion_dim; dez.deserialize(legion_dim); - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); GatherParams params; params.legion_dim = legion_dim; @@ -243,6 +245,8 @@ OpMeta *Gather::init_task(Task const *task, Gather const *gather = (Gather const *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); GatherMeta *m = new GatherMeta(handle, gather); + std::strcpy(m->op_name, gather->name); + m->layer_guid = gather->layer_guid; GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index f2f94234c3..50871983f5 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -266,6 +266,9 @@ OpMeta *Group_by::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); GroupByMeta *m = new GroupByMeta(handle, gb->n, gb->alpha); m->profiling = gb->profiling; + m->inference_debugging = gb->inference_debugging; + std::strcpy(m->op_name, gb->name); + m->layer_guid = gb->layer_guid; return m; } @@ -369,35 +372,39 @@ void Group_by::forward_task(Task const *task, int n = (int)regions.size() - 2; assert((int)task->regions.size() == n + 2); - GroupByMeta const *m = *((GroupByMeta **)task->local_args); + GroupByMeta *m = *((GroupByMeta **)task->local_args); // get input and assign regions. 
Each tensor has three dimensions: // (datapoint_dim, batch_size, replica_dim) - AccessorRO const acc_input(regions[0], FID_DATA); - AccessorRO const acc_assign(regions[1], FID_DATA); - - Rect<3> rect_input = runtime->get_index_space_domain( + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR assign = helperGetGenericTensorAccessorRO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Rect<3> rect_assign = runtime->get_index_space_domain( + Domain assign_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - coord_t input_rows = rect_input.hi[1] - rect_input.lo[1] + 1; - coord_t input_cols = rect_input.hi[0] - rect_input.lo[0] + 1; - assert(input_rows == rect_assign.hi[1] - rect_assign.lo[1] + 1); + coord_t input_rows = input_domain.hi()[1] - input_domain.lo()[1] + 1; + coord_t input_cols = input_domain.hi()[0] - input_domain.lo()[0] + 1; + assert(input_rows == assign_domain.hi()[1] - assign_domain.lo()[1] + 1); - int k = rect_assign.hi[0] - rect_assign.lo[0] + 1; + int k = assign_domain.hi()[0] - assign_domain.lo()[0] + 1; int batch_size = input_rows; int data_dim = input_cols; // Create a vector of n outputs, where n is the number of experts. // Each entry in the "outputs" vector points to the Legion tensor that will // contain the tockens dispatched to the corresponding expert + std::vector output_accessors; float *outputs[n]; for (int i = 0; i < n; i++) { + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[i + 2], task->regions[i + 2], FID_DATA, ctx, runtime); + output_accessors.push_back(output); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[i + 2].region.get_index_space()); - outputs[i] = helperGetTensorPointerWO( - regions[i + 2], task->regions[i + 2], FID_DATA, ctx, runtime); + outputs[i] = output.get_float_ptr(); coord_t output_rows = out_domain.hi()[1] - out_domain.lo()[1] + 1; coord_t output_cols = out_domain.hi()[0] - out_domain.lo()[0] + 1; @@ -405,13 +412,19 @@ void Group_by::forward_task(Task const *task, } Group_by::forward_kernel_wrapper(m, - acc_input.ptr(rect_input), - acc_assign.ptr(rect_assign), + input.get_float_ptr(), + assign.get_int32_ptr(), outputs, n, k, batch_size, data_dim); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Group_by::save_inference_tensors_to_file( + m, shard_id, nullptr, {input, assign}, {}, output_accessors); + } } void Group_by::backward(FFModel const &ff) { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 68b5fa39a1..2f72976d30 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -23,10 +23,6 @@ #endif #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" -#ifdef INFERENCE_TESTS -#include -using namespace at::indexing; -#endif namespace FlexFlow { @@ -725,6 +721,9 @@ OpMeta *IncMultiHeadSelfAttention::init_task( gpu_mem_allocator.reserved_total_size); } m->profiling = attn->profiling; + m->inference_debugging = attn->inference_debugging; + std::strcpy(m->op_name, attn->name); + m->layer_guid = attn->layer_guid; if (attn->quantization_type == DT_NONE) { assert(weight.domain.get_volume() * data_type_size(weight.data_type) 
== m->weightSize); @@ -811,7 +810,6 @@ void IncMultiHeadSelfAttention::inference_task( assert(task->regions.size() == regions.size()); - // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", bc->num_tokens, @@ -820,7 +818,7 @@ void IncMultiHeadSelfAttention::inference_task( return; } - IncMultiHeadSelfAttentionMeta const *m = + IncMultiHeadSelfAttentionMeta *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 @@ -860,787 +858,18 @@ void IncMultiHeadSelfAttention::inference_task( IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], input, weight, output, biases); -#ifdef INFERENCE_TESTS - printf("Checking IncMultiHeadSelfAttention computations...\n"); - - // ============================================================================= - // Define helper functions to handle row-major arrays - // ============================================================================= - - auto set_value_row_major = [](float *arr, - std::vector const &shape, - std::vector const &indices, - float value) -> void { - int offset = 0; - for (int i = 0; i < shape.size(); i++) { - int index = indices[i]; - int stride = 1; - for (int j = i + 1; j < shape.size(); j++) { - stride *= shape[j]; - } - offset += index * stride; - } - *(arr + offset) = value; - }; - - // ============================================================================= - // Load input/output/weights and parse general configs - // ============================================================================= - - float *input_cpu = - download_tensor(input.get_float_ptr(), input_domain.get_volume()); - assert(input_cpu != nullptr); - float *weight_cpu = download_tensor(weight.get_float_ptr(), - weight_domain.get_volume()); - assert(weight_cpu != nullptr); - float *output_cpu = download_tensor(output.get_float_ptr(), - output_domain.get_volume()); - assert(output_cpu != nullptr); - - // Input tensor dimensions - coord_t data_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; - coord_t max_sequence_length = input_domain.hi()[1] - input_domain.lo()[1] + 1; - coord_t batch_size = input_domain.hi()[2] - input_domain.lo()[2] + 1; - coord_t replica_dim = input_domain.hi()[3] - input_domain.lo()[3] + 1; - assert(replica_dim == 1); - - size_t effective_batch_size = max_sequence_length * batch_size; - float inputs_arr[data_dim][effective_batch_size] = {0}; - for (size_t i = 0; i < data_dim * bc->num_active_tokens(); i++) { - size_t data_index = i % data_dim; - size_t token_index = i / data_dim; - assert(data_index < data_dim); - assert(token_index < effective_batch_size); - inputs_arr[data_index][token_index] = input_cpu[i]; - } - torch::Tensor torch_input = torch::from_blob( - inputs_arr, {data_dim, (long int)effective_batch_size}, torch::kFloat32); - - // Weight tensor dimensions - coord_t all_weight_params = weight_domain.hi()[0] - weight_domain.lo()[0] + 1; - coord_t num_q_heads = weight_domain.hi()[1] - weight_domain.lo()[1] + 1; - replica_dim = weight_domain.hi()[2] - weight_domain.lo()[2] + 1; - size_t qParas = m->qProjSize * m->qSize; - size_t kParas = m->kProjSize * m->kSize; - size_t vParas = m->vProjSize * m->vSize; - size_t oParas = m->oProjSize * (m->vProjSize > 0 ? 
m->vProjSize : m->vSize); - - assert(all_weight_params == qParas + kParas + vParas + oParas); - assert(num_q_heads == m->num_q_heads); - assert(replica_dim == 1); - - assert(m->qSize == m->kSize && m->kSize == m->vSize); - // printf("m->qSize: %i\n", m->qSize); - // keep things simple for now - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - long int proj_sum = m->qProjSize + m->kProjSize + m->vProjSize; - // load weight manually because Torch can't easily read a tensor serialized in - // column-major order. - - // printf("m->kProjSize: %i, BatchConfig::max_tokens_per_batch(): %i, " - // "bc->num_active_tokens(): %i, num_q_heads: %lli, - // BatchConfig::max_requests_per_batch(): %i, " - // "bc->num_active_requests(): %i\n", m->kProjSize, - // BatchConfig::max_tokens_per_batch(), bc->num_active_tokens(), - // num_q_heads, BatchConfig::max_requests_per_batch(), - // bc->num_active_requests()); - // for (int t=0; t < bc->num_active_tokens(); t++) { - // printf("token %i has request_index: %li and token_position: %li\n", - // t, bc->token2ids.token_indexes[t].request_index, - // bc->token2ids.token_indexes[t].token_position); - // } - - // ============================================================================= - // Load the output tensor (with CUDA results), and create a Torch tensor - // ============================================================================= - - float output_cuda[m->oProjSize][effective_batch_size] = {0}; - for (int i = 0; i < m->oProjSize * effective_batch_size; i++) { - int row_idx = i % m->oProjSize; - int col_idx = i / m->oProjSize; - assert(row_idx < m->oProjSize && col_idx < effective_batch_size); - output_cuda[row_idx][col_idx] = output_cpu[i]; - } - torch::Tensor torch_out_cuda = - torch::from_blob(output_cuda, - {m->oProjSize, (int64_t)effective_batch_size}, - torch::kFloat32); - - // ============================================================================= - // Load the Q/K/V projection weights, and create a Torch tensor - // ============================================================================= - std::vector w_qkv_shape = {m->qSize, m->qProjSize, 3, (int)num_q_heads}; - float *w_qkv = - (float *)calloc(m->qSize * m->qProjSize * 3 * num_q_heads, sizeof(float)); - assert(w_qkv[0] == 0.0f); - - for (int h = 0; h < num_q_heads; h++) { - for (size_t i = 0; i < m->qProjSize * m->qSize; i++) { - int row_index = i % m->qSize; - int column_index = i / m->qSize; - // Q - set_value_row_major(w_qkv, - w_qkv_shape, - {row_index, column_index, 0, h}, - weight_cpu[all_weight_params * h + - m->qSize * column_index + row_index]); - // K - set_value_row_major( - w_qkv, - w_qkv_shape, - {row_index, column_index, 1, h}, - weight_cpu[all_weight_params * h + m->qProjSize * m->qSize + - m->qSize * column_index + row_index]); - // V - set_value_row_major( - w_qkv, - w_qkv_shape, - {row_index, column_index, 2, h}, - weight_cpu[all_weight_params * h + 2 * m->qProjSize * m->qSize + - m->qSize * column_index + row_index]); - } - } - // convert weights to torch tensor - torch::Tensor torch_w_qkv = torch::from_blob( - w_qkv, {m->qSize, m->qProjSize, 3, (int)num_q_heads}, torch::kFloat32); - - /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() - << std::endl; - std::cout << "Torch input size: " << torch_input.sizes() << std::endl; - std::cout << "Number of active tokens: " << bc->num_active_tokens() - << std::endl; */ - // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; - - // 
============================================================================= - // Compute the Q/K/V projections, and compare the results with CUDA - // ============================================================================= - // ----------------------- C++ computations & checks ------------------------ - torch::Tensor qkv_projs = torch::einsum( - "ijkl,im->jmkl", - {torch_w_qkv, - torch_input.index({Slice(), Slice(0, bc->num_active_tokens())})}); - // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; - assert(qkv_projs.sizes()[0] == m->qProjSize); - assert(qkv_projs.sizes()[1] == bc->num_active_tokens() && - qkv_projs.sizes()[1] <= effective_batch_size); - assert(qkv_projs.sizes()[2] == 3); - assert(qkv_projs.sizes()[3] == num_q_heads); - free(w_qkv); - - // ----------------------- Loading CUDA results for this step --------------- - float *QKVProjArray_cpu = download_tensor( - m->devQKVProjArray, - BatchConfig::max_tokens_per_batch() * proj_sum * m->num_q_heads); - assert(QKVProjArray_cpu != nullptr); - - std::vector QKVProjArray_converted_shape = { - m->qProjSize, bc->num_active_tokens(), 3, (int)num_q_heads}; - float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc->num_active_tokens() * 3 * num_q_heads, sizeof(float)); - - // skip over padding at the end of QKVProjArray_cpu - // convert from column order to 3D matrix because torch cannot automatically - // import matrices flattened in column order - for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_q_heads; - i++) { - int proj_size_index = i % m->qProjSize; - int head_index = i / (proj_sum * bc->num_active_tokens()); - int token_index = - ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % - bc->num_active_tokens(); - int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / - (m->qProjSize * bc->num_active_tokens()); - assert(proj_size_index < proj_sum); - assert(head_index < num_q_heads); - assert(token_index < bc->num_active_tokens()); - assert(qkv_offset < 3); - set_value_row_major(QKVProjArray_converted, - QKVProjArray_converted_shape, - {proj_size_index, token_index, qkv_offset, head_index}, - QKVProjArray_cpu[i]); - } - torch::Tensor QKVProjArray_torch = - torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc->num_active_tokens(), 3, num_q_heads}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - // std::cout << "QKVProjArray_torch" << std::endl; - // for (int i=0; inum_active_tokens(); t++) { - for (size_t d = 0; d < m->kProjSize; d++) { - size_t kcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * - BatchConfig::max_requests_per_batch() + - bc->tokensInfo[t].abs_depth_in_request * - m->num_q_heads * - BatchConfig::max_requests_per_batch() + - h * BatchConfig::max_requests_per_batch() + - bc->tokensInfo[t].request_index; - m->kcache[kcache_idx] = - qkv_projs.index({(int64_t)d, (int64_t)t, 1, (int64_t)h}) - .item(); - } - for (size_t d = 0; d < m->vProjSize; d++) { - size_t vcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * - BatchConfig::max_requests_per_batch() + - bc->tokensInfo[t].abs_depth_in_request * - m->num_q_heads * - BatchConfig::max_requests_per_batch() + - h * BatchConfig::max_requests_per_batch() + - bc->tokensInfo[t].request_index; - m->vcache[vcache_idx] = - qkv_projs.index({(int64_t)d, (int64_t)t, 2, (int64_t)h}) - .item(); - } - } - } - // Create torch tensors from the arrays - torch::Tensor K_t = torch::from_blob(m->kcache, - {m->kProjSize, - MAX_SEQ_LEN, - num_q_heads, - 
BatchConfig::max_requests_per_batch()}, - torch::kFloat32); - torch::Tensor V_t = torch::from_blob(m->vcache, - {m->vProjSize, - MAX_SEQ_LEN, - num_q_heads, - BatchConfig::max_requests_per_batch()}, - torch::kFloat32); - - // Compute useful indices - std::vector req_idxs; - std::vector r_first_idx; - std::vector r_num_tokens; - for (size_t t = 0; t < bc->num_active_tokens(); t++) { - size_t rid = bc->tokensInfo[t].request_index; - if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { - req_idxs.push_back(rid); - r_first_idx.push_back(t); - r_num_tokens.push_back(1); - } else { - r_num_tokens[r_num_tokens.size() - 1]++; + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(weight); + if (*m->qkv_bias || *m->final_bias) { + weights_accessors.push_back(biases); } - assert(req_idxs.size() == r_first_idx.size() && - r_first_idx.size() == r_num_tokens.size()); + IncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, bc, {input}, weights_accessors, {output}); } - assert(req_idxs.size() == bc->num_active_requests()); - assert(std::accumulate(r_num_tokens.begin(), - r_num_tokens.end(), - decltype(r_num_tokens)::value_type(0)) == - bc->num_active_tokens()); - - // ----------------------- Loading CUDA results for this step --------------- - float *keyCache_cpu = download_tensor( - m->keyCache, - m->num_q_heads * m->kProjSize * BatchConfig::max_requests_per_batch() * - MAX_SEQ_LEN); - float *valueCache_cpu = download_tensor( - m->valueCache, - m->num_q_heads * m->vProjSize * BatchConfig::max_requests_per_batch() * - MAX_SEQ_LEN); - assert(keyCache_cpu != nullptr); - assert(valueCache_cpu != nullptr); - - float *kcache_cuda = - (float *)calloc(m->kProjSize * MAX_SEQ_LEN * m->num_q_heads * - BatchConfig::max_requests_per_batch(), - sizeof(float)); - float *vcache_cuda = - (float *)calloc(m->vProjSize * MAX_SEQ_LEN * m->num_q_heads * - BatchConfig::max_requests_per_batch(), - sizeof(float)); - int index = 0; - for (int i = 0; i < m->kProjSize; i++) { - for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_q_heads; k++) { - for (int l = 0; l < BatchConfig::max_requests_per_batch(); l++) { - int col_major_index = - l * m->kProjSize * MAX_SEQ_LEN * m->num_q_heads + - k * m->kProjSize * MAX_SEQ_LEN + j * m->kProjSize + i; - kcache_cuda[index++] = keyCache_cpu[col_major_index]; - } - } - } - } - index = 0; - for (int i = 0; i < m->vProjSize; i++) { - for (int j = 0; j < MAX_SEQ_LEN; j++) { - for (int k = 0; k < m->num_q_heads; k++) { - for (int l = 0; l < BatchConfig::max_requests_per_batch(); l++) { - int col_major_index = - l * m->vProjSize * MAX_SEQ_LEN * m->num_q_heads + - k * m->vProjSize * MAX_SEQ_LEN + j * m->vProjSize + i; - vcache_cuda[index++] = valueCache_cpu[col_major_index]; - } - } - } - } - torch::Tensor K_t_cuda = - torch::from_blob(kcache_cuda, - {m->kProjSize, - MAX_SEQ_LEN, - num_q_heads, - BatchConfig::max_requests_per_batch()}, - torch::kFloat32); - torch::Tensor V_t_cuda = - torch::from_blob(vcache_cuda, - {m->vProjSize, - MAX_SEQ_LEN, - num_q_heads, - BatchConfig::max_requests_per_batch()}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - - // std::cout << "kcache differences:" << std::endl; - // for (int i=0; i < bc->num_active_requests() + 1; i++) { - // for (int j=0; j < num_q_heads; j++) { - // for (int l=0; l < m->kProjSize; l++) { - // for (int 
k=0; k < MAX_SEQ_LEN; k++) { - // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * - // BatchConfig::max_requests_per_batch() + k * num_q_heads * - // BatchConfig::max_requests_per_batch() + j * - // BatchConfig::max_requests_per_batch() + i; if ( - // abs(m->kcache[kcache_idx] - keyCache_cpu[ - // i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + - // j * m->kProjSize * MAX_SEQ_LEN + - // k * m->kProjSize + - // l - // ]) > 0.00001) { - // printf("req: %i (rid: %i), head: %i, data_dim: %i, token_pos: - // %i\n", - // i, req_idxs[i], j, l, k); - // } - // } - // } - // } - // } - - // std::cout << "keyCache from CUDA:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jkProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // printf("%f ", - // keyCache_cpu[i * m->kProjSize * MAX_SEQ_LEN * num_q_heads + - // j * m->kProjSize * MAX_SEQ_LEN + - // k * m->kProjSize + - // l - // ]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // std::cout << "valueCache from CUDA:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jvProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // printf("%f ", - // valueCache_cpu[ - // i * m->vProjSize * MAX_SEQ_LEN * num_q_heads + - // j * m->vProjSize * MAX_SEQ_LEN + - // k * m->vProjSize + - // l]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // printf("\n"); - - // std::cout << "C++ kcache:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; j < num_q_heads; j++) { - // for (int l=0; l < m->kProjSize; l++) { - // for (int k=0; k < MAX_SEQ_LEN; k++) { - // size_t kcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * - // BatchConfig::max_requests_per_batch() + k * num_q_heads * - // BatchConfig::max_requests_per_batch() + j * - // BatchConfig::max_requests_per_batch() + i; - // printf("%f ", m->kcache[kcache_idx]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - // std::cout << "C++ vcache:" << std::endl; - // for (int i=0; inum_active_requests()+1; i++) { - // for (int j=0; jvProjSize; l++) { - // for (int k=0; k< MAX_SEQ_LEN; k++) { - // size_t vcache_idx = - // l * MAX_SEQ_LEN * num_q_heads * - // BatchConfig::max_requests_per_batch() - // + k * num_q_heads * BatchConfig::max_requests_per_batch() + j - // * BatchConfig::max_requests_per_batch() + i; - // printf("%f ", m->vcache[vcache_idx]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // } - - assert(torch::allclose(K_t_cuda, K_t, 1e-05, 1e-05)); - assert(torch::allclose(V_t_cuda, V_t, 1e-05, 1e-05)); - free(kcache_cuda); - free(vcache_cuda); - - // ============================================================================= - // Load the W_out projection weights - // ============================================================================= - - // ----------------------- C++ operations & checks -------------------------- - float *w_out = (float *)calloc(m->vProjSize * m->num_q_heads * m->oProjSize, - sizeof(float)); - std::vector w_out_shape = {m->vProjSize, m->num_q_heads, m->oProjSize}; - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - for (int h = 0; h < num_q_heads; h++) { - for (int v = 0; v < m->vProjSize; v++) { - for (int o = 0; o < m->oProjSize; o++) { - set_value_row_major( - w_out, - w_out_shape, - {v, h, o}, - weight_cpu[all_weight_params * h + 3 * m->qProjSize * m->qSize + - m->vProjSize * 
o + v]); - } - } - } - // convert weights to torch tensor - torch::Tensor torch_w_out = torch::from_blob( - w_out, {m->vProjSize, m->num_q_heads, m->oProjSize}, torch::kFloat32); - - // ----------------------- Loading CUDA results for this step --------------- - float *w_out_cuda = download_tensor( - m->W_out_contiguous, m->vProjSize * m->oProjSize * m->num_q_heads); - assert(w_out_cuda != nullptr); - float *converted_wout_tensor = (float *)calloc( - m->vProjSize * m->num_q_heads * m->oProjSize, sizeof(float)); - std::vector converted_wout_tensor_shape = { - m->vProjSize, m->num_q_heads, m->oProjSize}; - - for (int i = 0; i < m->vProjSize * m->num_q_heads * m->oProjSize; i++) { - int v_idx = i % m->vProjSize; - int h_idx = (i / m->vProjSize) % m->num_q_heads; - int o_idx = i / (m->vProjSize * m->num_q_heads); - assert(v_idx < m->vProjSize && h_idx < m->num_q_heads && - o_idx < m->oProjSize); - set_value_row_major(converted_wout_tensor, - converted_wout_tensor_shape, - {v_idx, h_idx, o_idx}, - w_out_cuda[i]); - } - torch::Tensor w_out_cuda_tensor = - torch::from_blob(converted_wout_tensor, - {m->vProjSize, m->num_q_heads, m->oProjSize}, - torch::kFloat32); - - // ----------------------- Comparing C++ & CUDA results --------------------- - assert(torch::allclose(w_out_cuda_tensor, torch_w_out, 1e-05, 1e-05)); - free(converted_wout_tensor); - - // ============================================================================= - // Compute the softmax(QK^T/sqrt(d_k))V product, request by request - // ============================================================================= - - // ----------------------- C++ initialization steps ------------------------- - torch::Tensor Q_projs = qkv_projs.index({Slice(), Slice(), 0, Slice()}) - .reshape({qkv_projs.sizes()[0], - qkv_projs.sizes()[1], - qkv_projs.sizes()[3]}); - - torch::Tensor qk_products[bc->num_active_requests()]; - torch::Tensor qk_softmax[bc->num_active_requests()]; - torch::Tensor attn_heads[bc->num_active_requests()]; - - torch::Tensor cpp_output = - torch::zeros({m->oProjSize, bc->num_active_tokens()}); - - // ----------------------- Loading CUDA results for this step --------------- - float *qk_prods_cpu = download_tensor( - m->qk_prods, - BatchConfig::max_tokens_per_batch() * - BatchConfig::max_tokens_per_batch() * num_q_heads); - assert(qk_prods_cpu != nullptr); - - float *qk_prods_softmax_cpu = download_tensor( - m->qk_prods_softmax, - BatchConfig::max_tokens_per_batch() * - BatchConfig::max_tokens_per_batch() * num_q_heads); - assert(qk_prods_softmax_cpu != nullptr); - - float *attn_heads_cpu = download_tensor( - m->attn_heads, - BatchConfig::max_tokens_per_batch() * m->num_q_heads * m->vProjSize); - assert(attn_heads_cpu != nullptr); - - // ----------------------- Main loop (request by request) ------------------- - size_t qk_prods_cpu_offset = 0; - - for (size_t r = 0; r < bc->num_active_requests(); r++) { - // Compute pre-request parameters - size_t num_new_tokens = r_num_tokens[r]; - int64_t rid = (int64_t)(req_idxs[r]); - int64_t num_tokens_received_so_far = - (int64_t)(bc->requestsInfo[rid].token_start_offset + - bc->requestsInfo[rid].num_tokens_in_batch); - assert(num_new_tokens == bc->requestsInfo[rid].num_tokens_in_batch); - assert(num_tokens_received_so_far >= (int64_t)num_new_tokens); - - // ----------------------- C++ computations ------------------------------- - // Get the slice of the Q projection tensor with the tokens in the current - // request - torch::Tensor Q_req = - Q_projs.index({Slice(), - 
Slice(r_first_idx[r], r_first_idx[r] + num_new_tokens), - Slice()}); - // std::cout << "Q_req.sizes(): " << Q_req.sizes() << std::endl; - assert(Q_req.sizes()[0] == m->qProjSize); - assert(Q_req.sizes()[1] == num_new_tokens); - assert(Q_req.sizes()[2] == num_q_heads); - - /*printf("\n------------ QK multiplication (C++) -------------\n"); - printf("Request r=%lu. num_new_tokens: %lu, num_tokens_received_so_far: %li, - rid: %li, Qproj slice: (%i, %i)\n", r, num_new_tokens, - num_tokens_received_so_far, rid, r_first_idx[r], r_first_idx[r] + - num_new_tokens); - - std::cout << "Q_req matrix (idk dims):" << std::endl << - Q_req.index({Slice(), Slice(), 0}) << std::endl << std::endl; std::cout << - "K_t matrix (ilk dims):" << std::endl << K_t.index({Slice(), Slice(0, - num_tokens_received_so_far), 0, rid}) << std::endl << std::endl; std::cout - << "C++ alpha: " << (1.0f / sqrt(m->kProjSize)) << std::endl;*/ - - // Compute (Q*K^T)/sqrt(d_k) matmul - qk_products[r] = - torch::einsum("ijk,ilk->jlk", - {Q_req, - K_t.index({Slice(), - Slice(0, num_tokens_received_so_far), - Slice(), - rid})}) * - (1.0f / sqrt(m->kProjSize)); - - // Set entries above diagonal to -inf to make attention causal. - for (int h = 0; h < num_q_heads; h++) { - qk_products[r].index( - {Slice(), Slice(num_tokens_received_so_far - num_new_tokens), h}) = - qk_products[r] - .index({Slice(), - Slice(num_tokens_received_so_far - num_new_tokens), - h}) - .tril() + - torch::full({(int64_t)num_new_tokens, (int64_t)num_new_tokens}, - -INFINITY) - .triu() - .fill_diagonal_(0); - } - // Compute softmax for each request block - qk_softmax[r] = torch::softmax(qk_products[r], -2); - assert(qk_softmax[r].sizes()[0] == num_new_tokens); - assert(qk_softmax[r].sizes()[1] == num_tokens_received_so_far); - assert(qk_softmax[r].sizes()[2] == m->num_q_heads); - - // ------------------- Loading CUDA results for this step --------------- - float *converted_qk_prod = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_q_heads, - sizeof(float)); - float *converted_qk_prod_softmax = (float *)calloc( - num_new_tokens * num_tokens_received_so_far * num_q_heads, - sizeof(float)); - std::vector converted_qk_prod_shape = { - (int)num_new_tokens, (int)num_tokens_received_so_far, (int)num_q_heads}; - - for (size_t i = 0; - i < num_new_tokens * num_tokens_received_so_far * num_q_heads; - i++) { - size_t new_t_idx = i % num_new_tokens; - size_t all_t_idx = (i / num_new_tokens) % num_tokens_received_so_far; - size_t head_idx = i / (num_new_tokens * num_tokens_received_so_far); - assert(new_t_idx < num_new_tokens && - all_t_idx < num_tokens_received_so_far && head_idx < num_q_heads); - set_value_row_major(converted_qk_prod, - converted_qk_prod_shape, - {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, - qk_prods_cpu[i + qk_prods_cpu_offset]); - set_value_row_major(converted_qk_prod_softmax, - converted_qk_prod_shape, - {(int)new_t_idx, (int)all_t_idx, (int)head_idx}, - qk_prods_softmax_cpu[i + qk_prods_cpu_offset]); - } - torch::Tensor qk_prods_cuda = torch::from_blob( - converted_qk_prod, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_q_heads}, - torch::kFloat32); - torch::Tensor qk_prods_softmax_cuda = torch::from_blob( - converted_qk_prod_softmax, - {(int64_t)num_new_tokens, num_tokens_received_so_far, num_q_heads}, - torch::kFloat32); - - // ------------------- Comparing C++ & CUDA results ------------------ - /* std::cout << "C++:" <vProjSize); - assert( - V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) - 
.sizes()[1] == num_tokens_received_so_far); - assert( - V_t.index({Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid}) - .sizes()[2] == m->num_q_heads); - attn_heads[r] = torch::einsum( - "ijk,ljk->ilk", - {qk_softmax[r], - V_t.index( - {Slice(), Slice(0, num_tokens_received_so_far), Slice(), rid})}); - assert(attn_heads[r].sizes()[0] == num_new_tokens); - assert(attn_heads[r].sizes()[1] == m->vProjSize); - assert(attn_heads[r].sizes()[2] == m->num_q_heads); - - // ------------------- Loading CUDA results for this step --------------- - float converted_attn_heads_cpu[num_new_tokens][m->vProjSize] - [m->num_q_heads] = {0}; - for (int i = 0; i < num_new_tokens * m->vProjSize * m->num_q_heads; i++) { - int token_ix = i % num_new_tokens; - int vproj_idx = (i / num_new_tokens) % m->vProjSize; - int head_idx = i / (num_new_tokens * m->vProjSize); - assert(token_ix < num_new_tokens && vproj_idx < m->vProjSize && - head_idx < m->num_q_heads); - converted_attn_heads_cpu[token_ix][vproj_idx][head_idx] = - attn_heads_cpu[r_first_idx[r] * m->vProjSize * m->num_q_heads + i]; - } - torch::Tensor converted_attn_heads_cuda = torch::from_blob( - converted_attn_heads_cpu, - {(int64_t)num_new_tokens, m->vProjSize, m->num_q_heads}, - torch::kFloat32); - - // -------------------- Comparing C++ & CUDA results ------------------- - /* std::cout << "CUDA attn head for req " << r << ":" <num_q_heads; h++) { - std::cout << converted_attn_heads_cuda.index({Slice(), Slice(), h}) << - std::endl; - } - std::cout << "C++ attn head for req " << r << ":" <num_q_heads; h++) { - std::cout << attn_heads[r].index({Slice(), Slice(), h}) << std::endl; - } */ - assert(torch::allclose( - converted_attn_heads_cuda, attn_heads[r], 1e-05, 1e-05)); - - // ----------------------- C++ computations ---------------------------- - // Compute output values by projecting all heads to output space - cpp_output.index( - {Slice(), - Slice(r_first_idx[r], r_first_idx[r] + (int64_t)num_new_tokens)}) = - torch::einsum("jkl,ijk->li", {torch_w_out, attn_heads[r]}); - - // increment main loop's auxiliary index - qk_prods_cpu_offset += - num_new_tokens * num_tokens_received_so_far * num_q_heads; - } - - // ----------------------- Comparing C++ & CUDA results --------------------- - /* std::cout << "C++:" <oProjSize; i++) { - std::cout << cpp_output.index({i, Slice()}) << std::endl; - } - std::cout << "CUDA:" <oProjSize; i++) { - std::cout << torch_out_cuda.index({i, Slice(0, - (int64_t)bc->num_active_tokens())}) << std::endl; - } */ - - assert(torch::allclose( - torch_out_cuda.index( - {Slice(), Slice(0, (int64_t)bc->num_active_tokens())}), - cpp_output, - 1e-05, - 1e-05)); - - // ============================================================================= - // Cleanup - // ============================================================================= - free(w_out); - checkCUDA(cudaFreeHost(input_cpu)); - checkCUDA(cudaFreeHost(weight_cpu)); - checkCUDA(cudaFreeHost(output_cpu)); - checkCUDA(cudaFreeHost(QKVProjArray_cpu)); - checkCUDA(cudaFreeHost(keyCache_cpu)); - checkCUDA(cudaFreeHost(valueCache_cpu)); - checkCUDA(cudaFreeHost(qk_prods_cpu)); - checkCUDA(cudaFreeHost(qk_prods_softmax_cpu)); - checkCUDA(cudaFreeHost(attn_heads_cpu)); - checkCUDA(cudaFreeHost(w_out_cuda)); - // assert(false && "All good if you see this assert failure! 
:)"); -#endif - // Done with INFERENCE_TESTS block } void IncMultiHeadSelfAttention::backward(FFModel const &ff) { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index eaaa398654..5a2a14387e 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1097,17 +1097,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bias_ptr = gpu_mem_allocator.allocate_reserved_untyped(biasSize); } -#ifdef INFERENCE_TESTS - kcache = - (float *)calloc(kProjSize * BatchConfig::max_sequence_length() * - num_q_heads * BatchConfig::max_requests_per_batch(), - sizeof(float)); - vcache = - (float *)calloc(vProjSize * BatchConfig::max_sequence_length() * - num_q_heads * BatchConfig::max_requests_per_batch(), - sizeof(float)); -#endif - // allocate memory for the seqArray and reserve space { int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); @@ -1253,10 +1242,6 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } -#ifdef INFERENCE_TESTS - free(kcache); - free(vcache); -#endif } template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( diff --git a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index b0dd4c644e..14225f0bce 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -30,6 +30,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler) { profiling = dropout->profiling; + inference_debugging = dropout->inference_debugging; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateDropoutDescriptor(&dropoutDesc)); diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index 4a76301fd6..e142bba83b 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -29,6 +29,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler) { profiling = dropout->profiling; + inference_debugging = dropout->inference_debugging; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc)); diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu index ff5d5a67e1..42b31a664a 100644 --- a/src/ops/kernels/element_binary_kernels.cu +++ b/src/ops/kernels/element_binary_kernels.cu @@ -30,6 +30,7 @@ ElementBinaryMeta::ElementBinaryMeta(FFHandler handler, Op const *op) checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceAddDesc)); op_type = OP_NOOP; profiling = false; + inference_debugging = false; inplace_a = false; has_same_operands = false; broadcast_input1 = false; diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index ca4872d51b..89c9f14a01 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -34,6 +34,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); dim = softmax->dim; profiling = softmax->profiling; + inference_debugging = softmax->inference_debugging; std::strcpy(op_name, softmax->name); } diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 67a9c21038..e47006cc9d 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -33,6 
+33,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; + inference_debugging = softmax->inference_debugging; std::strcpy(op_name, softmax->name); } diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index cb519239c5..bc1358e49c 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -383,6 +383,8 @@ OpMeta *LayerNorm::init_task(Task const *task, .first(); MemoryAllocator gpu_mem_allocator(gpu_mem); LayerNormMeta *meta = new LayerNormMeta(handle, ln, gpu_mem_allocator); + std::strcpy(meta->op_name, ln->name); + meta->layer_guid = ln->layer_guid; meta->input_type[0] = ln->inputs[0]->data_type; meta->output_type[0] = ln->outputs[0]->data_type; return meta; @@ -504,7 +506,7 @@ void LayerNorm::inference_task(Task const *task, return; } - LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); + LayerNormMeta *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); float const *in_ptr = NULL; float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; @@ -558,7 +560,22 @@ void LayerNorm::inference_task(Task const *task, } else { assert(regions.size() == 2); } + LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + if (m->use_bias) { + weights_accessors.push_back(beta); + } + } + LayerNorm::save_inference_tensors_to_file( + m, shard_id, bc, {in}, weights_accessors, {out}); + } } /* @@ -858,6 +875,7 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, void LayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->axes.size()); for (size_t i = 0; i < this->axes.size(); i++) { sez.serialize(this->axes[i]); @@ -879,10 +897,11 @@ Node LayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; float eps; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(num_axes); for (size_t i = 0; i < num_axes; i++) { int axis_idx; diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 12751113a2..6d29071e38 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -33,6 +33,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; diff --git a/src/ops/linear.cc b/src/ops/linear.cc index a751ebcc57..63b26bfe7d 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -504,11 +504,13 @@ OpMeta *Linear::init_task_with_dim(Task const *task, m->use_bias = linear->use_bias; m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; + m->inference_debugging = linear->inference_debugging; m->trainableInputs[0] = linear->trainableInputs[0]; 
m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; std::strcpy(m->op_name, linear->name); + m->layer_guid = linear->layer_guid; init_kernel(m, batch_size, out_dim); @@ -617,7 +619,7 @@ void Linear::inference_task(Task const *task, Runtime *runtime) { Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - LinearMeta const *m = *((LinearMeta **)task->local_args); + LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -658,6 +660,18 @@ void Linear::inference_task(Task const *task, in_dim, out_dim, batch_size); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(weight); + if (m->use_bias && + !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { + weights_accessors.push_back(bias); + } + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input}, weights_accessors, {output}); + } } void Linear::forward_task(Task const *task, @@ -1235,6 +1249,7 @@ bool operator==(LinearParams const &lhs, LinearParams const &rhs) { void Linear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->out_channels); sez.serialize(this->activation); sez.serialize(this->kernel_reg_type); @@ -1260,10 +1275,11 @@ Node Linear::deserialize(FFModel &ff, DataType data_type; DataType quantization_type; bool offload; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(out_channels); dez.deserialize(activation); dez.deserialize(kernel_reg_type); diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index f56a60641d..e358448ddf 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -317,7 +317,9 @@ OpMeta *Pool2D::init_task(Task const *task, FFHandler handle = *((FFHandler const *)task->local_args); Pool2DMeta *m = new Pool2DMeta(handle); m->profiling = pool->profiling; + m->inference_debugging = pool->inference_debugging; std::strcpy(m->op_name, pool->name); + m->layer_guid = pool->layer_guid; TensorAccessorR acc_input( regions[0], task->regions[0], FID_DATA, ctx, runtime); TensorAccessorW acc_output(regions[1], diff --git a/src/ops/reduce.cc b/src/ops/reduce.cc index 6c999c8858..7a443e6ad0 100644 --- a/src/ops/reduce.cc +++ b/src/ops/reduce.cc @@ -210,6 +210,8 @@ OpMeta *Reduce::init_task(Task const *task, GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); ReduceMeta *m = new ReduceMeta(handle, rd, input.domain); + std::strcpy(m->op_name, rd->name); + m->layer_guid = rd->layer_guid; return m; } @@ -375,6 +377,7 @@ void Reduce::serialize(Legion::Serializer &sez) const { sez.serialize(params.keepdims); sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); } using PCG::Node; @@ -393,10 +396,11 @@ Node Reduce::deserialize(FFModel &ff, axes.push_back(dim_idx); } 
dez.deserialize(keepdims); - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); return ff.get_or_create_node(inputs[0], {axes, keepdims, layer_guid}); } diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 41c3fcdbf1..45da190680 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -181,6 +181,8 @@ OpMeta *Reshape::init_task(Task const *task, Reshape const *reshape = (Reshape *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); ReshapeMeta *m = new ReshapeMeta(handle); + std::strcpy(m->op_name, reshape->name); + m->layer_guid = reshape->layer_guid; m->data_type = reshape->outputs[0]->data_type; return m; } @@ -411,6 +413,7 @@ void Reshape::serialize(Legion::Serializer &sez) const { } sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); } using PCG::Node; @@ -428,10 +431,11 @@ Node Reshape::deserialize(FFModel &ff, dez.deserialize(value); shape.push_back(value); } - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); ReshapeParams params; params.shape = shape; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 3bec09521a..7de40fb389 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -493,6 +493,8 @@ OpMeta *ResidualLayerNorm::init_task(Task const *task, MemoryAllocator gpu_mem_allocator(gpu_mem); ResidualLayerNormMeta *meta = new ResidualLayerNormMeta(handle, ln, gpu_mem_allocator); + std::strcpy(meta->op_name, ln->name); + meta->layer_guid = ln->layer_guid; meta->input_type[0] = ln->inputs[0]->data_type; meta->input_type[1] = ln->inputs[1]->data_type; if (ln->use_two_residuals) { @@ -622,8 +624,7 @@ void ResidualLayerNorm::inference_task( return; } - ResidualLayerNormMeta const *m = - *((ResidualLayerNormMeta **)task->local_args); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); assert(regions.size() == 4 + m->use_two_residuals + @@ -734,6 +735,30 @@ void ResidualLayerNorm::inference_task( ResidualLayerNorm::inference_kernel_wrapper( m, input, residual1, residual2, added_output, output, gamma, beta); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector input_accessors; + input_accessors.push_back(input); + input_accessors.push_back(residual1); + if (m->use_two_residuals) { + input_accessors.push_back(residual2); + } + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + if (m->use_bias) { + weights_accessors.push_back(beta); + } + } + ResidualLayerNorm::save_inference_tensors_to_file(m, + shard_id, + bc, + input_accessors, + weights_accessors, + {added_output, output}); + } } bool ResidualLayerNorm::measure_operator_cost(Simulator *sim, @@ -745,6 +770,7 @@ bool ResidualLayerNorm::measure_operator_cost(Simulator *sim, void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); 
sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->axes.size()); for (size_t i = 0; i < this->axes.size(); i++) { sez.serialize(this->axes[i]); @@ -767,10 +793,11 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, bool use_bias; bool use_two_residuals; float eps; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(num_axes); for (size_t i = 0; i < num_axes; i++) { int axis_idx; @@ -811,6 +838,7 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); hash_combine(key, params.axes.size()); for (int n : params.axes) { hash_combine(key, n); diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index 7b42392326..f1b7a537b0 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -34,6 +34,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 3bd18217cb..e5ebdce6ed 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -33,6 +33,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 3efb7274a0..b447a2a3b5 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -351,6 +351,8 @@ OpMeta *ResidualRMSNorm::init_task(Task const *task, MemoryAllocator gpu_mem_allocator(gpu_mem); ResidualRMSNormMeta *meta = new ResidualRMSNormMeta(handle, rn, gpu_mem_allocator); + std::strcpy(meta->op_name, rn->name); + meta->layer_guid = rn->layer_guid; return meta; } @@ -431,7 +433,7 @@ void ResidualRMSNorm::inference_task(Task const *task, if (bc->num_tokens == 0) { return; } - ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( @@ -443,11 +445,18 @@ void ResidualRMSNorm::inference_task(Task const *task, GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + 
ResidualRMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input1, input2}, {weight}, {residual_output, output}); + } } void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); } @@ -460,12 +469,12 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); float eps; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; int dim; dez.deserialize(id); dez.deserialize(transformer_layer_id); - - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); ResidualRMSNormParams params; diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 5b1634472d..2a34f83be2 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -297,6 +297,8 @@ OpMeta *RMSNorm::init_task(Task const *task, .first(); MemoryAllocator gpu_mem_allocator(gpu_mem); RMSNormMeta *meta = new RMSNormMeta(handle, rn, gpu_mem_allocator); + std::strcpy(meta->op_name, rn->name); + meta->layer_guid = rn->layer_guid; return meta; } @@ -413,7 +415,7 @@ void RMSNorm::inference_task(Task const *task, if (bc->num_tokens == 0) { return; } - RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + RMSNormMeta *m = *((RMSNormMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( @@ -421,11 +423,18 @@ void RMSNorm::inference_task(Task const *task, GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); forward_kernel_wrapper(m, input, weight, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + RMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {weight}, {output}); + } } void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); } @@ -438,12 +447,13 @@ Node RMSNorm::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 1); float eps; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; int dim; dez.deserialize(id); dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); - LayerID layer_guid(id, transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); RMSNormParams params; diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 6eb62b2933..463b15aadb 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -231,6 +231,9 @@ OpMeta *Sampling::init_task(Task const *task, SamplingMeta *m = new SamplingMeta( handle, s, batch_size, length * batch_size, acc_input, gpu_mem_allocator); m->profiling = s->profiling; + m->inference_debugging = s->inference_debugging; + std::strcpy(m->op_name, s->name); + m->layer_guid = s->layer_guid; m->top_p = s->top_p; return m; } @@ -287,7 +290,7 @@ InferenceResult assert(task->regions.size() == 
2); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); // BatchConfig const *bc = (BatchConfig *)task->args; - SamplingMeta const *m = *((SamplingMeta **)task->local_args); + SamplingMeta *m = *((SamplingMeta **)task->local_args); if (bc->num_tokens == 0) { // Directly return for empty batch config InferenceResult ir; @@ -302,6 +305,13 @@ InferenceResult int batch_size = bc->num_active_tokens(); Sampling::forward_kernel_wrapper(m, input, indices, batch_size); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Sampling::save_inference_tensors_to_file( + m, shard_id, bc, {}, {}, {input, indices}); + } + InferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 031c7833a4..3b2ed7cef4 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -244,6 +244,8 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task, meta->input_type[0] = ssm->inputs[0]->data_type; meta->input_type[1] = ssm->inputs[1]->data_type; meta->output_type[0] = ssm->outputs[0]->data_type; + std::strcpy(meta->op_name, ssm->name); + meta->layer_guid = ssm->layer_guid; return meta; } @@ -323,7 +325,7 @@ void SigmoidSiluMulti::inference_task( return; } - SigmoidSiluMultiMeta const *m = *((SigmoidSiluMultiMeta **)task->local_args); + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -346,6 +348,12 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == output_domain); SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file( + m, shard_id, bc, {input1, input2}, {}, {output}); + } } bool SigmoidSiluMulti::measure_operator_cost(Simulator *sim, @@ -357,6 +365,7 @@ bool SigmoidSiluMulti::measure_operator_cost(Simulator *sim, void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); } using PCG::Node; @@ -366,10 +375,11 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 2); - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); SigmoidSiluMultiParams params; params.layer_guid = layer_guid; @@ -385,6 +395,7 @@ size_t hash::operator()( size_t key = 0; hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); return key; } }; // namespace std diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index fa73a55722..7b7f30a288 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -25,6 +25,7 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, MemoryAllocator &gpu_mem_allocator) : OpMeta(handle) { profiling = ssm->profiling; + inference_debugging = 
ssm->inference_debugging; } SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index fd69f6a8aa..590b641b5a 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -24,6 +24,7 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, MemoryAllocator &gpu_mem_allocator) : OpMeta(handle) { profiling = ssm->profiling; + inference_debugging = ssm->inference_debugging; } SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 28c9ecea67..ba0a1288d6 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -230,6 +230,8 @@ OpMeta *Softmax::init_task(Task const *task, m->input_type = softmax->inputs[0]->data_type; m->output_type = softmax->outputs[0]->data_type; // checkCUDNN(cudnnCreateTensorDescriptor(&m->outputTensor)); + std::strcpy(m->op_name, softmax->name); + m->layer_guid = softmax->layer_guid; return m; } @@ -303,50 +305,25 @@ void Softmax::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - switch (in_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: \ - if (m->output_type == DT_HALF) { \ - return forward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_FLOAT) { \ - return forward_task_with_dim(task, regions, ctx, runtime); \ - } else { \ - assert(false && "Unsupported data type"); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + + if (m->output_type == DT_HALF) { + forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); + } else if (m->output_type == DT_FLOAT) { + forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); + } else { + assert(false && "Unsupported data type"); } } -/* - regions[0](I): input - regions[1](O): output -*/ -template -void Softmax::forward_task_with_dim(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Softmax* softmax = (Softmax*) task->args; - SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorR acc_input( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - TensorAccessorW acc_output(regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); - forward_kernel_wrapper(m, acc_input.ptr, acc_output.ptr); -} - void Softmax::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -435,29 +412,31 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); + assert(regions.size() == 2); + assert(task->regions.size() == 2); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; } Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - switch 
(in_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: \ - if (m->output_type == DT_HALF) { \ - forward_task_with_dim(task, regions, ctx, runtime); \ - break; \ - } else if (m->output_type == DT_FLOAT) { \ - forward_task_with_dim(task, regions, ctx, runtime); \ - break; \ - } else { \ - assert(false && "Unsupported data type"); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + if (m->output_type == DT_HALF) { + forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); + } else if (m->output_type == DT_FLOAT) { + forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); + } else { + assert(false && "Unsupported data type"); + } + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Softmax::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); } } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 350ab3c167..4c78960d5f 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -658,6 +658,9 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( assert(gpu_mem_allocator.instance_allocated_size == gpu_mem_allocator.instance_total_size); m->profiling = attn->profiling; + m->inference_debugging = attn->inference_debugging; + std::strcpy(m->op_name, attn->name); + m->layer_guid = attn->layer_guid; assert(weight.domain.get_volume() * data_type_size(weight.data_type) == m->weightSize); return m; @@ -733,14 +736,13 @@ void SpecIncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(task->regions.size() == regions.size()); - // BeamSearchBatchConfig const *bc = (BeamSearchBatchConfig *)task->args; BeamSearchBatchConfig const &bc = Future(task->futures[0]).get_result(); if (bc.num_tokens == 0) { return; } - SpecIncMultiHeadSelfAttentionMeta const *m = + SpecIncMultiHeadSelfAttentionMeta *m = *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args); assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 : regions.size() == 3)); @@ -777,14 +779,17 @@ void SpecIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &bc, task->index_point.point_data[0], input, weight, output, biases); - - // print_tensor(input.get_float_ptr(), 20, "attention input"); - // print_tensor(output.get_float_ptr(), 20, "attention output"); - // if(bc.beam_slots.at(0).current_depth == 1){ - // print_beam_tensor(input.get_float_ptr(), 50, 4096, 40, "mha topk - // input"); print_beam_tensor(output.get_float_ptr(), 50, 4096, 40, - // "mha topk output"); - // } + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(weight); + if (*m->qkv_bias || *m->final_bias) { + weights_accessors.push_back(biases); + } + SpecIncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, &bc, {input}, weights_accessors, {output}); + } } void SpecIncMultiHeadSelfAttention::backward(FFModel const &ff) { diff --git a/src/ops/topk.cc b/src/ops/topk.cc index d76ad75167..b38ff85f90 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -225,7 +225,10 @@ OpMeta *TopK::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); TopKMeta *m = new TopKMeta(handle); m->profiling = topk->profiling; + m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; + std::strcpy(m->op_name, topk->name); + m->layer_guid = topk->layer_guid; return m; } diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 303948964b..500b7867af 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -193,6 +193,9 @@ OpMeta *Transpose::init_task(Task const *task, TransposeMeta *m = new TransposeMeta(handle); transpose->init_meta(m, in_domain, out_domain); m->profiling = transpose->profiling; + m->inference_debugging = transpose->inference_debugging; + std::strcpy(m->op_name, transpose->name); + m->layer_guid = transpose->layer_guid; return m; } diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 207dae0785..6b520aa37b 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -722,6 +722,9 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( gpu_mem_allocator.reserved_total_size); } m->profiling = attn->profiling; + m->inference_debugging = attn->inference_debugging; + std::strcpy(m->op_name, attn->name); + m->layer_guid = attn->layer_guid; if (attn->quantization_type == DT_NONE) { assert(weight.domain.get_volume() * data_type_size(weight.data_type) == @@ -803,7 +806,6 @@ void TreeIncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(task->regions.size() == regions.size()); - // TreeVerifyBatchConfig const *bc = (TreeVerifyBatchConfig *)task->args; TreeVerifyBatchConfig const &bc = Future(task->futures[0]).get_result(); log_tree_verify.debug( @@ -857,6 +859,18 @@ void TreeIncMultiHeadSelfAttention::inference_task( TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &bc, task->index_point.point_data[0], input, weight, output, biases); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(weight); + if (*m->qkv_bias || *m->final_bias) { + weights_accessors.push_back(biases); + } + 
TreeIncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, &bc, {input}, weights_accessors, {output}); + } } void TreeIncMultiHeadSelfAttention::backward(FFModel const &ff) { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index cbf839c6b2..72572c4e06 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -88,43 +88,61 @@ int BatchConfig::max_sequence_length() { return RequestManager::get_request_manager()->get_max_sequence_length(); } -void BatchConfig::print() const { - std::cout << "@@@@@@@@@@@@@@ Batch Config (mode " << get_mode() - << ") @@@@@@@@@@@@@@" << std::endl; - std::cout << "Max number of requests: " << max_requests_per_batch() - << std::endl; - std::cout << "Max number of tokens: " << max_tokens_per_batch() << std::endl; - std::cout << "Number of tokens: " << num_tokens << std::endl; - std::cout << "Number of requests: " << num_active_requests() << std::endl; - // std::cout << "Cached results: " << cached_results << std::endl; - - std::cout << "Per-request info:\n"; - for (int i = 0; i < max_requests_per_batch(); i++) { - if (!request_completed[i]) { - std::cout << " Request " << i << ":\n"; - std::cout << " Token start offset: " - << requestsInfo[i].token_start_offset << std::endl; - std::cout << " Number of tokens in batch: " - << requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << " GUID: " << requestsInfo[i].request_guid << std::endl; - std::cout << " Max sequence length: " - << requestsInfo[i].max_sequence_length << std::endl; - std::cout << " Request completed: " << request_completed[i] - << std::endl; +std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { + os << "@@@@@@@@@@@@@@ Batch Config (mode " << bc.get_mode() + << ") @@@@@@@@@@@@@@" << std::endl; + // Max values + os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; + os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; + os << "Max sequence length: " << bc.max_sequence_length() << std::endl; + // Current values + os << "Number of tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of requests: " << bc.num_active_requests() << std::endl; + + // Per-request info + os << "Per-request info:\n"; + for (int i = 0; i < bc.max_requests_per_batch(); i++) { + if (!bc.request_completed[i]) { + os << " Request " << i << ":\n"; + os << " Token start offset: " << bc.requestsInfo[i].token_start_offset + << std::endl; + os << " Number of tokens in batch: " + << bc.requestsInfo[i].num_tokens_in_batch << std::endl; + os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + os << " Max sequence length: " + << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Request completed: " << bc.request_completed[i] << std::endl; + os << " Request running: " << bc.request_running[i] << std::endl; } } - std::cout << "Per-token info:\n"; - for (int i = 0; i < num_tokens; i++) { - std::cout << " Token " << i << ":\n"; - std::cout << " Absolute depth in request: " - << tokensInfo[i].abs_depth_in_request << std::endl; - std::cout << " Request index: " << tokensInfo[i].request_index - << std::endl; - std::cout << " Token id: " << tokensInfo[i].token_id << std::endl; + // Per-token info + os << "Per-token info:\n"; + for (int i = 0; i < bc.num_tokens; i++) { + os << " Token " << i << ":\n"; + os << " Absolute depth in request: " + << bc.tokensInfo[i].abs_depth_in_request << std::endl; + os << " Request index: " << bc.tokensInfo[i].request_index << std::endl; + os << " Token 
id: " << bc.tokensInfo[i].token_id << std::endl; + } + os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; + return os; +} + +void BatchConfig::print() const { + std::cout << *this << std::endl; +} + +void BatchConfig::save_to_file(std::string const &filename) const { + std::ofstream outputFile(filename); + if (outputFile.is_open()) { + outputFile << *this << std::endl; + outputFile.close(); + } else { + std::cerr << "Error: Unable to open the batch config output file: " + << filename << std::endl; + assert(false); } - std::cout << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" - << std::endl; } }; // namespace FlexFlow diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 634d60a352..811ef00ba2 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -14,6 +14,7 @@ */ #include "flexflow/batch_config.h" +#include "flexflow/request_manager.h" #include "legion.h" #include #include @@ -101,72 +102,93 @@ int BeamSearchBatchConfig::current_depth_all_requests() const { return current_depth; } -void BeamSearchBatchConfig::print() const { - std::cout << "@@@@@@@@@@@@@@ BeamSearchBatchConfig (mode " << get_mode() - << ") @@@@@@@@@@@@@@" << std::endl; - std::cout << "Max number of requests: " << max_requests_per_batch() - << std::endl; - std::cout << "Max number of tokens: " << max_tokens_per_batch() << std::endl; - std::cout << "Number of tokens: " << num_tokens << std::endl; - std::cout << "Number of requests: " << num_active_requests() << std::endl; - std::cout << "Beam width: " << beam_width << std::endl; - std::cout << "Target Iterations: " << target_iterations << std::endl; - std::cout << "Current Iterations: " << current_iteration << std::endl; - - std::cout << "Per-request info:\n"; - for (int i = 0; i < max_requests_per_batch(); i++) { - // assert(beamRequestsInfo[i].request_completed == request_completed[i]); - if (!request_completed[i]) { - std::cout << " Request " << i << ":\n"; - std::cout << " Token start offset: " - << requestsInfo[i].token_start_offset << std::endl; - std::cout << " Number of tokens in batch: " - << requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << " GUID: " << requestsInfo[i].request_guid << std::endl; - std::cout << " Max sequence length: " - << requestsInfo[i].max_sequence_length << std::endl; - std::cout << " Beam Search Specific: " << std::endl; - std::cout << " beam_size: " << beamRequestsInfo[i].beam_size - << std::endl; - std::cout << " current_depth: " - << beamRequestsInfo[i].current_depth << std::endl; - std::cout << " max_depth: " << beamRequestsInfo[i].max_depth - << std::endl; - std::cout << " tokens: "; - for (int j = 0; j < MAX_BEAM_WIDTH; j++) { - std::cout << beamRequestsInfo[i].tokens[j] << ", "; +std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { + os << "@@@@@@@@@@@@@@ BeamSearchBatchConfig (mode " << bc.get_mode() + << ") @@@@@@@@@@@@@@" << std::endl; + // Max values + os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; + os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; + os << "Max sequence length: " << bc.max_sequence_length() << std::endl; + // Current values + os << "Number of tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of requests: " << bc.num_active_requests() << std::endl; + // BeamSearch-specific + os << "Model ID: " << bc.model_id << std::endl; + os << "Max Beam Depth (all requests): " << 
bc.max_beam_depth_all_requests() + << std::endl; + os << "Current depth (all requests): " << bc.current_depth_all_requests() + << std::endl; + os << "Beam width: " << bc.beam_width << std::endl; + os << "Target Iterations: " << bc.target_iterations << std::endl; + os << "Current Iterations: " << bc.current_iteration << std::endl; + + os << "Per-request info:\n"; + for (int i = 0; i < bc.max_requests_per_batch(); i++) { + if (!bc.request_completed[i]) { + os << " Request " << i << ":\n"; + os << " Token start offset: " << bc.requestsInfo[i].token_start_offset + << std::endl; + os << " Number of tokens in batch: " + << bc.requestsInfo[i].num_tokens_in_batch << std::endl; + os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + os << " Max sequence length: " + << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Request completed: " << bc.request_completed[i] << std::endl; + os << " Request running: " << bc.request_running[i] << std::endl; + os << " Beam Search Specific: " << std::endl; + os << " beam_size: " << bc.beamRequestsInfo[i].beam_size + << std::endl; + os << " current_depth: " << bc.beamRequestsInfo[i].current_depth + << std::endl; + os << " max_depth: " << bc.beamRequestsInfo[i].max_depth + << std::endl; + os << " tokens: "; + for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { + os << bc.beamRequestsInfo[i].tokens[j] << ", "; } - std::cout << std::endl; - std::cout << " probs: "; - for (int j = 0; j < MAX_BEAM_WIDTH; j++) { - std::cout << beamRequestsInfo[i].probs[j] << ", "; + os << std::endl; + os << " probs: "; + for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { + os << bc.beamRequestsInfo[i].probs[j] << ", "; } - std::cout << std::endl; - std::cout << " parent_id: "; - for (int j = 0; j < MAX_BEAM_WIDTH; j++) { - std::cout << beamRequestsInfo[i].parent_id[j] << ", "; + os << std::endl; + os << " parent_id: "; + for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { + os << bc.beamRequestsInfo[i].parent_id[j] << ", "; } - std::cout << std::endl; + os << std::endl; } } - std::cout << "Per-token info:\n"; - for (int i = 0; i < num_tokens; i++) { - std::cout << " Token " << i << ":\n"; - std::cout << " Absolute depth in request: " - << tokensInfo[i].abs_depth_in_request << std::endl; - std::cout << " Request index: " << tokensInfo[i].request_index - << std::endl; - std::cout << " Token id: " << tokensInfo[i].token_id << std::endl; - std::cout << " Beam Search Specific: " << std::endl; - std::cout << " beam_size: " << beamTokenInfo[i].sub_request_index - << std::endl; - // std::cout << " Parent token id: " << tokensInfo[i].parent_token_id << - // std::endl; std::cout << " Accumulated log prob: " - // << tokensInfo[i].cum_log_prob << std::endl; + os << "Per-token info:\n"; + for (int i = 0; i < bc.num_tokens; i++) { + os << " Token " << i << ":\n"; + os << " Absolute depth in request: " + << bc.tokensInfo[i].abs_depth_in_request << std::endl; + os << " Request index: " << bc.tokensInfo[i].request_index << std::endl; + os << " Token id: " << bc.tokensInfo[i].token_id << std::endl; + os << " Beam Search Specific: " << std::endl; + os << " beam_size: " << bc.beamTokenInfo[i].sub_request_index + << std::endl; + } + os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; + return os; +} + +void BeamSearchBatchConfig::print() const { + std::cout << *this << std::endl; +} + +void BeamSearchBatchConfig::save_to_file(std::string const &filename) const { + std::ofstream outputFile(filename); + if (outputFile.is_open()) { + outputFile << *this << std::endl; + 
outputFile.close(); + } else { + std::cerr << "Error: Unable to open the batch config output file: " + << filename << std::endl; + assert(false); } - std::cout << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" - << std::endl; } }; // namespace FlexFlow diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index e4728bdb88..fa6bf55fe5 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -268,23 +268,106 @@ __host__ void print_beam_tensor(T const *ptr, checkCUDA(cudaFreeHost(host_ptr)); } -template +template <> __host__ void - save_tensor(T const *ptr, size_t num_elements, char const *file_name) { + save_tensor(float const *ptr, size_t num_elements, char const *file_name) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - T *host_ptr; + float *host_ptr; checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(T) * num_elements, + sizeof(float) * num_elements, cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync( - host_ptr, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); - // checkCUDA(cudaDeviceSynchronize()); - cudaDeviceSynchronize(); + checkCUDA(cudaMemcpyAsync(host_ptr, + ptr, + sizeof(float) * num_elements, + cudaMemcpyDeviceToHost, + stream)); + checkCUDA(cudaDeviceSynchronize()); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.20f, ", (float)host_ptr[i]); + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + +template <> +__host__ void + save_tensor(half const *ptr, size_t num_elements, char const *file_name) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + half *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(half) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpyAsync(host_ptr, + ptr, + sizeof(half) * num_elements, + cudaMemcpyDeviceToHost, + stream)); + checkCUDA(cudaDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + +template <> +__host__ void save_tensor(int32_t const *ptr, + size_t num_elements, + char const *file_name) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + int32_t *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(int32_t) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpyAsync(host_ptr, + ptr, + sizeof(int32_t) * num_elements, + cudaMemcpyDeviceToHost, + stream)); + checkCUDA(cudaDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%d, ", host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(cudaFreeHost(host_ptr)); +} + +template <> +__host__ void save_tensor(int64_t const *ptr, + size_t num_elements, + char const *file_name) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + int64_t *host_ptr; + checkCUDA(cudaHostAlloc(&host_ptr, + sizeof(int64_t) * num_elements, + cudaHostAllocPortable | cudaHostAllocMapped)); + checkCUDA(cudaMemcpyAsync(host_ptr, + ptr, + sizeof(int64_t) * num_elements, + cudaMemcpyDeviceToHost, + stream)); + checkCUDA(cudaDeviceSynchronize()); + FILE *tensor_file; + tensor_file = 
fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%ld, ", host_ptr[i]); } fclose(tensor_file); @@ -608,6 +691,9 @@ template __host__ void print_beam_tensor(int64_t const *ptr, template __host__ void save_tensor(float const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor(int32_t const *ptr, + size_t rect, + char const *file_name); template __host__ void save_tensor(int64_t const *ptr, size_t rect, char const *file_name); diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 2b94f07999..819e6527e5 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -6,21 +6,26 @@ namespace FlexFlow { const LayerID LayerID::NO_ID = LayerID(); -LayerID::LayerID() : id(0), transformer_layer_id(MAX_NUM_TRANSFORMER_LAYERS) {} +LayerID::LayerID() + : id(0), transformer_layer_id(MAX_NUM_TRANSFORMER_LAYERS), model_id(0) {} -LayerID::LayerID(size_t _id, size_t _transformer_layer_id) - : id(_id), transformer_layer_id(_transformer_layer_id) { +LayerID::LayerID(size_t _id, size_t _transformer_layer_id, size_t _model_id) + : id(_id), transformer_layer_id(_transformer_layer_id), + model_id(_model_id) { assert(is_valid_id()); } bool LayerID::is_valid_id() const { - return (id >= LAYER_GUID_FIRST_VALID && id <= LAYER_GUID_LAST_VALID); + return (id >= LAYER_GUID_FIRST_VALID && id <= LAYER_GUID_LAST_VALID && + transformer_layer_id >= 0 && + transformer_layer_id < MAX_NUM_TRANSFORMER_LAYERS && model_id >= 0); } bool operator==(LayerID const &lhs, LayerID const &rhs) { // id should be sufficient to distinguish different layers if (lhs.id == rhs.id) { assert(lhs.transformer_layer_id == rhs.transformer_layer_id); + assert(lhs.model_id == rhs.model_id); } return lhs.id == rhs.id; } diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 408de57c54..6d33dd9f27 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2291,6 +2291,7 @@ GraphOptimalViewSerialized Embedding *embed = (Embedding *)op; sez.serialize(embed->layer_guid.id); sez.serialize(embed->layer_guid.transformer_layer_id); + sez.serialize(embed->layer_guid.model_id); sez.serialize(embed->num_entries); sez.serialize(embed->out_channels); sez.serialize(embed->aggr); @@ -2301,6 +2302,7 @@ GraphOptimalViewSerialized MultiHeadAttention *attn = (MultiHeadAttention *)op; sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); + sez.serialize(attn->layer_guid.model_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_heads); sez.serialize(attn->qProjSize); @@ -2315,6 +2317,7 @@ GraphOptimalViewSerialized IncMultiHeadSelfAttention *attn = (IncMultiHeadSelfAttention *)op; sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); + sez.serialize(attn->layer_guid.model_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_q_heads); sez.serialize(attn->qProjSize); @@ -2339,6 +2342,7 @@ GraphOptimalViewSerialized (SpecIncMultiHeadSelfAttention *)op; sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); + sez.serialize(attn->layer_guid.model_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_q_heads); sez.serialize(attn->qProjSize); @@ -2360,6 +2364,7 @@ GraphOptimalViewSerialized (TreeIncMultiHeadSelfAttention *)op; sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); + sez.serialize(attn->layer_guid.model_id); sez.serialize(attn->oProjSize); sez.serialize(attn->num_q_heads); 
sez.serialize(attn->qProjSize); @@ -2639,11 +2644,12 @@ void FFModel::deserialize_graph_optimal_view( assert(num_inputs == 1); AggrMode aggr; int num_entries, out_channels; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; DataType data_type; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(num_entries); dez.deserialize(out_channels); dez.deserialize(aggr); @@ -2727,10 +2733,11 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_heads, k_dim, v_dim; float dropout; bool bias, add_bias_kv, add_zero_attn; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(embed_dim); dez.deserialize(num_heads); dez.deserialize(k_dim); @@ -2762,10 +2769,11 @@ void FFModel::deserialize_graph_optimal_view( bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, offload, position_bias; DataType quantization_type; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(embed_dim); dez.deserialize(num_q_heads); dez.deserialize(k_dim); @@ -2812,10 +2820,11 @@ void FFModel::deserialize_graph_optimal_view( float dropout, scaling_factor; bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(embed_dim); dez.deserialize(num_q_heads); dez.deserialize(k_dim); @@ -2859,10 +2868,11 @@ void FFModel::deserialize_graph_optimal_view( bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, offload, position_bias; DataType quantization_type; - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(embed_dim); dez.deserialize(num_q_heads); dez.deserialize(k_dim); diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 1f27dc15e7..fb94135c8f 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -247,6 +247,112 @@ __host__ void checkCUDA(hipHostFree(host_ptr)); } +template <> +__host__ void + save_tensor(float const *ptr, size_t num_elements, char const *file_name) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + float *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(float) * num_elements, + hipHostMallocPortable | hipHostMallocMapped)); + 
checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(float) * num_elements, + hipMemcpyDeviceToHost, + stream)); + checkCUDA(hipDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(hipHostFree(host_ptr)); +} + +template <> +__host__ void + save_tensor(half const *ptr, size_t num_elements, char const *file_name) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + half *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(half) * num_elements, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(half) * num_elements, + hipMemcpyDeviceToHost, + stream)); + checkCUDA(hipDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(hipHostFree(host_ptr)); +} + +template <> +__host__ void save_tensor(int32_t const *ptr, + size_t num_elements, + char const *file_name) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + int32_t *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(int32_t) * num_elements, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(int32_t) * num_elements, + hipMemcpyDeviceToHost, + stream)); + checkCUDA(hipDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%d, ", host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(hipHostFree(host_ptr)); +} + +template <> +__host__ void save_tensor(int64_t const *ptr, + size_t num_elements, + char const *file_name) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + int64_t *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(int64_t) * num_elements, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(int64_t) * num_elements, + hipMemcpyDeviceToHost, + stream)); + checkCUDA(hipDeviceSynchronize()); + FILE *tensor_file; + tensor_file = fopen(file_name, "w"); + assert(tensor_file != NULL); + for (unsigned i = 0; i < num_elements; i++) { + fprintf(tensor_file, "%ld, ", host_ptr[i]); + } + + fclose(tensor_file); + checkCUDA(hipHostFree(host_ptr)); +} + template __host__ T *download_tensor(T const *ptr, size_t num_elements) { // device synchronize to make sure the data are ready @@ -493,6 +599,17 @@ template __host__ void template __host__ void print_tensor(half const *ptr, size_t rect, char const *prefix); +template __host__ void + save_tensor(float const *ptr, size_t rect, char const *file_name); +template __host__ void save_tensor(int32_t const *ptr, + size_t rect, + char const *file_name); +template __host__ void save_tensor(int64_t const *ptr, + size_t rect, + char const *file_name); +template __host__ void + save_tensor(half const *ptr, size_t rect, char const *file_name); + template __host__ float *download_tensor(float const *ptr, size_t num_elements); template __host__ half *download_tensor(half const *ptr, diff --git a/src/runtime/layer.cc b/src/runtime/layer.cc index d2473f4b2b..8f33f6db87 100644 --- a/src/runtime/layer.cc +++ b/src/runtime/layer.cc @@ -17,7 +17,8 @@ Layer::Layer(FFModel *model, const Tensor _input4) 
: op_type(_otype), data_type(_dtype), layer_guid(model->layer_global_guid++, - model->current_transformer_layer_id), + model->current_transformer_layer_id, + model->model_id), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs) { std::string pcname; if (_name == nullptr) { @@ -52,7 +53,8 @@ Layer::Layer(FFModel *model, Tensor const *_tensors) : op_type(_otype), data_type(_dtype), layer_guid(model->layer_global_guid++, - model->current_transformer_layer_id), + model->current_transformer_layer_id, + model->model_id), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs) { std::string pcname; if (_name == nullptr) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 3fa201e7ab..92f0cff472 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -122,7 +122,8 @@ Op::Op(FFModel &model, const ParallelTensor _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), - profiling(model.config.profiling) { + profiling(model.config.profiling), + inference_debugging(model.config.inference_debugging) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { inputs[i] = NULL; } @@ -167,7 +168,8 @@ Op::Op(FFModel &model, ParallelTensor const *_inputs) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), - profiling(model.config.profiling) { + profiling(model.config.profiling), + inference_debugging(model.config.inference_debugging) { std::string pcname; if (_name == NULL) { pcname = get_operator_type_name(op_type); @@ -1462,7 +1464,8 @@ bool Op::get_weight_parameter(TNParameter tnp, return true; } -OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false) { +OpMeta::OpMeta(FFHandler _handle) + : handle(_handle), profiling(false), inference_debugging(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { trainableInputs[i] = true; } @@ -1475,6 +1478,7 @@ OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false) { for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { output_type[i] = DT_NONE; } + decoding_step = 0; } OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { @@ -1487,6 +1491,7 @@ OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { for (int i = 0; i < op->numOutputs; i++) { output_type[i] = op->outputs[i]->data_type; } + decoding_step = 0; } FFRuntime::FFRuntime(FFConfig &config) { @@ -1530,6 +1535,8 @@ FFRuntime::FFRuntime(FFConfig &config) { FFRuntime *ffruntime_singleton = nullptr; +int FFModel::model_counter = 0; + FFModel::FFModel(FFConfig &_config, bool cpu_offload) : op_global_guid(OP_GUID_FIRST_VALID), layer_global_guid(LAYER_GUID_FIRST_VALID), @@ -1570,6 +1577,7 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) for (int idx = 0; idx < config.workersPerNode * config.numNodes; idx++) { handlers[idx] = ffruntime_singleton->handlers[idx]; } + model_id = model_counter++; } void FFModel::clear_graph_search_cache() { @@ -3969,6 +3977,7 @@ struct DefaultConfig { // const static int iterations = 1; const static int batchSize = 64; const static bool profiling = false; + const static bool inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB @@ -4003,6 +4012,7 @@ FFConfig::FFConfig() { // iterations = DefaultConfig::iterations; batchSize = DefaultConfig::batchSize; profiling = 
DefaultConfig::profiling; + inference_debugging = DefaultConfig::inference_debugging; learningRate = DefaultConfig::learningRate; weightDecay = DefaultConfig::weightDecay; workSpaceSize = DefaultConfig::workSpaceSize; @@ -4188,6 +4198,10 @@ void FFConfig::parse_args(char **argv, int argc) { profiling = true; continue; } + if (!strcmp(argv[i], "--inference-debugging")) { + inference_debugging = true; + continue; + } if (!strcmp(argv[i], "--allow-tensor-op-math-conversion")) { allow_tensor_op_math_conversion = true; continue; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 08b1af8ca5..0b3813f41c 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -3,6 +3,14 @@ #include "flexflow/simulator.h" #include +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + namespace FlexFlow { size_t Op::get_untyped_params_hash() const { @@ -17,4 +25,105 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } +/*static*/ +void Op::save_inference_tensors_to_file( + OpMeta *m, + int shard_id, + BatchConfig const *bc, + std::vector input_tensors, + std::vector weight_tensors, + std::vector output_tensors) { + + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + // output base filepath, shared by all tensors from the same operator + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + m->op_name + "_shard-id_" + std::to_string(shard_id); + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(base_filepath + "_batch-config"); + } + // save all inputs + for (int i = 0; i < input_tensors.size(); i++) { + std::string filename = base_filepath + "_input_" + std::to_string(i); + if (input_tensors[i].data_type == DT_FLOAT) { + save_tensor(input_tensors[i].get_float_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_HALF) { + save_tensor(input_tensors[i].get_half_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT32) { + save_tensor(input_tensors[i].get_int32_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT64) { + save_tensor(input_tensors[i].get_int64_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // only dump the weights once + if (m->decoding_step == 0) { + for (int i = 0; i < weight_tensors.size(); i++) { + std::string filename = base_filepath + "_weight_" + std::to_string(i); + if (weight_tensors[i].data_type == DT_FLOAT) { + save_tensor(weight_tensors[i].get_float_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_HALF) { + save_tensor(weight_tensors[i].get_half_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT32) { + save_tensor(weight_tensors[i].get_int32_ptr(), + weight_tensors[i].domain.get_volume(), + 
filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT64) { + save_tensor(weight_tensors[i].get_int64_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + } + // save all outputs + for (int i = 0; i < output_tensors.size(); i++) { + std::string filename = base_filepath + "_output_" + std::to_string(i); + if (output_tensors[i].data_type == DT_FLOAT) { + save_tensor(output_tensors[i].get_float_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_HALF) { + save_tensor(output_tensors[i].get_half_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT32) { + save_tensor(output_tensors[i].get_int32_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT64) { + save_tensor(output_tensors[i].get_int64_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // increase count of decoding steps + m->decoding_step++; +} + }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 9efa06a2d5..cb68ecc5f1 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -14,6 +14,7 @@ */ #include "flexflow/batch_config.h" +#include "flexflow/request_manager.h" #include "legion.h" #include #include @@ -30,55 +31,72 @@ InferenceMode TreeVerifyBatchConfig::get_mode() const { return TREE_VERIFY_MODE; } -void TreeVerifyBatchConfig::print() const { - std::cout << "@@@@@@@@@@@@@@ TreeVerifyBatchConfig (mode " << get_mode() - << ") @@@@@@@@@@@@@@" << std::endl; - std::cout << "Max number of requests: " << max_requests_per_batch() - << std::endl; - std::cout << "Max number of tokens: " << max_tokens_per_batch() << std::endl; - std::cout << "Number of tokens: " << num_tokens << std::endl; - std::cout << "Number of requests: " << num_active_requests() << std::endl; - // std::cout << "Cached results: " << cached_results << std::endl; +std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { + os << "@@@@@@@@@@@@@@ TreeVerifyBatchConfig (mode " << bc.get_mode() + << ") @@@@@@@@@@@@@@" << std::endl; + // Max values + os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; + os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; + os << "Max sequence length: " << bc.max_sequence_length() << std::endl; + // Current values + os << "Number of tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Number of tokens to commit: " << bc.num_tokens_to_commit << std::endl; - std::cout << "Per-request info:\n"; - for (int i = 0; i < max_requests_per_batch(); i++) { - if (!request_completed[i]) { - std::cout << " Request " << i << ":\n"; - std::cout << " Token start offset: " - << requestsInfo[i].token_start_offset << std::endl; - std::cout << " Number of tokens in batch: " - << requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << " GUID: " << requestsInfo[i].request_guid << std::endl; - std::cout << " Max sequence length: " - << requestsInfo[i].max_sequence_length << std::endl; - std::cout << " Request completed: " << request_completed[i] - << std::endl; + os << "Per-request 
info:\n"; + for (int i = 0; i < bc.max_requests_per_batch(); i++) { + if (!bc.request_completed[i]) { + os << " Request " << i << ":\n"; + os << " Token start offset: " << bc.requestsInfo[i].token_start_offset + << std::endl; + os << " Number of tokens in batch: " + << bc.requestsInfo[i].num_tokens_in_batch << std::endl; + os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + os << " Max sequence length: " + << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Request completed: " << bc.request_completed[i] << std::endl; + os << " Request running: " << bc.request_running[i] << std::endl; } } - std::cout << "Per-token info:\n"; - for (int i = 0; i < num_tokens; i++) { - std::cout << " Token " << i << ":\n"; - std::cout << " Absolute depth in request: " - << tokensInfo[i].abs_depth_in_request << std::endl; - std::cout << " Request index: " << tokensInfo[i].request_index - << std::endl; - std::cout << " Token id: " << tokensInfo[i].token_id << std::endl; + os << "Per-token info:\n"; + for (int i = 0; i < bc.num_tokens; i++) { + os << " Token " << i << ":\n"; + os << " Absolute depth in request: " + << bc.tokensInfo[i].abs_depth_in_request << std::endl; + os << " Request index: " << bc.tokensInfo[i].request_index << std::endl; + os << " Token id: " << bc.tokensInfo[i].token_id << std::endl; } - std::cout << "Tokens to commit info:\n"; - for (int i = 0; i < num_tokens_to_commit; i++) { - std::cout << " Token " << i << ":\n"; - std::cout << " token_index: " << committed_tokens[i].token_index - << std::endl; - std::cout << " request_index: " << committed_tokens[i].request_index - << std::endl; - std::cout << " token_depth: " << committed_tokens[i].token_depth - << std::endl; + os << "Tokens to commit info:\n"; + for (int i = 0; i < bc.num_tokens_to_commit; i++) { + os << " Token " << i << ":\n"; + os << " token_index: " << bc.committed_tokens[i].token_index + << std::endl; + os << " request_index: " << bc.committed_tokens[i].request_index + << std::endl; + os << " token_depth: " << bc.committed_tokens[i].token_depth + << std::endl; } - std::cout << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" - << std::endl; + os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; + return os; +} + +void TreeVerifyBatchConfig::print() const { + std::cout << *this << std::endl; +} + +void TreeVerifyBatchConfig::save_to_file(std::string const &filename) const { + std::ofstream outputFile(filename); + if (outputFile.is_open()) { + outputFile << *this << std::endl; + outputFile.close(); + } else { + std::cerr << "Error: Unable to open the batch config output file: " + << filename << std::endl; + assert(false); + } } }; // namespace FlexFlow diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index e683faa469..609e15de49 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -18,6 +18,7 @@ "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "inference_debugging": False, "fusion": True, } llm_configs = { From 5e34846c5904c11e2a8fd1d027c9a5aab43d0f92 Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Mon, 9 Oct 2023 23:51:45 -0400 Subject: [PATCH 248/344] fix backward gelu, layernorm (#1187) --- src/ops/element_unary.cpp | 5 ++- src/ops/element_unary.cu | 5 ++- src/ops/layer_norm.cpp | 94 +++++++++++++++++++++++++++++++++++++++ 
src/ops/layer_norm.cu | 89 ++++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 4 deletions(-) diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index 424e739e13..e20200420f 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -192,8 +192,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * static_cast(erfc(-input[i] * M_SQRT1_2)) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * static_cast(erfc(-input[i] * M_SQRT1_2)) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5)))); break; } case OP_RSQRT: { diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index 4a38dabe52..c7f5e90f4c 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -205,8 +205,9 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, case OP_GELU: { input_grad[i] = (T)(output_grad[i] * - (0.5 * static_cast(erfc(-input[i] * M_SQRT1_2)) - - 0.5 * M_SQRT1_2 * input[i] * exp(-input[i] * input[i] * 0.5))); + (0.5 * static_cast(erfc(-input[i] * M_SQRT1_2)) + + 0.5 * M_SQRT1_2 * input[i] * + ((2 / sqrt(M_PI)) * exp(-input[i] * input[i] * 0.5)))); break; } case OP_RSQRT: { diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index d97c2f62ff..07dbdb3dfb 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -364,6 +364,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
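A note on the element_unary GELU backward change above: the corrected expression is the analytic GELU derivative. Writing \Phi for the standard normal CDF and \varphi for its density,
  \mathrm{GELU}(x) = x\,\Phi(x), \qquad \frac{d}{dx}\,\mathrm{GELU}(x) = \Phi(x) + x\,\varphi(x),
  \Phi(x) = \tfrac{1}{2}\,\mathrm{erfc}\!\left(-x/\sqrt{2}\right), \qquad \varphi(x) = \tfrac{1}{\sqrt{2\pi}}\,e^{-x^{2}/2}.
In the kernel, 0.5 * M_SQRT1_2 * (2 / sqrt(M_PI)) equals 1/\sqrt{2\pi}, so flipping the sign to '+' and adding the (2 / sqrt(M_PI)) factor makes the second term exactly x\,\varphi(x); the previous expression subtracted a mis-scaled density term.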
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -401,6 +477,24 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, static_cast(m->db_ptr), static_cast(m->scale_ptr), static_cast(m->bias_ptr)); + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 6d29071e38..44979c48fe 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -467,6 +467,82 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } } +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
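// Reading of compute_gI (annotation, not part of the patch): with
// x_hat_i = (x_i - mean) * rstd and g_i = gamma_i * dy_i, the reductions give
//   stats_x1 = sum_j g_j,   stats_x2 = sum_j g_j * x_hat_j,
// and the per-element input gradient written to dX is
//   dL/dx_i = (rstd / N) * ( N * g_i - stats_x1 - x_hat_i * stats_x2 ),
// i.e. the standard layer-norm backward formula over the normalized dimension N.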
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] = f_grad_input; + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, N, buf); +} + /*static*/ template void LayerNorm::backward_kernel(LayerNormMeta const *m, @@ -497,6 +573,19 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, static_cast(m->db_ptr), static_cast(m->scale_ptr), static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly From 7b574630744fe3c8a80feb634f02e0395c43613d Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Sat, 14 Oct 2023 17:52:48 -0400 Subject: [PATCH 249/344] Optimize attention kernel v2 1.0, use Gemm replace GemmStridedBatch (#1190) * change layout * main change * fix * change spec&tree kernel * fix tp * fix * fix multi requests * replicate key&value * ci * cleanup&hip * more fix. * ci --- .../ops/inc_multihead_self_attention.h | 5 +- .../inc_multihead_self_attention_kernels.h | 3 + inference/file_loader.cc | 59 ++- inference/models/falcon.cc | 4 +- python/flexflow/serve/models/falcon.py | 8 - python/flexflow/serve/models/starcoder.py | 8 - src/ops/inc_multihead_self_attention.cc | 16 +- src/ops/inc_multihead_self_attention.cpp | 439 ++++++----------- src/ops/inc_multihead_self_attention.cu | 457 ++++++------------ src/ops/spec_inc_multihead_self_attention.cc | 14 +- src/ops/spec_inc_multihead_self_attention.cpp | 261 ++++------ src/ops/spec_inc_multihead_self_attention.cu | 264 ++++------ src/ops/tree_inc_multihead_self_attention.cc | 14 +- src/ops/tree_inc_multihead_self_attention.cpp | 280 ++++------- src/ops/tree_inc_multihead_self_attention.cu | 280 ++++------- tests/inference/cpp_inference_tests.sh | 4 +- tests/inference/python_inference_tests.sh | 2 +- .../python_test_configs/generate_configs.py | 5 +- 18 files changed, 777 insertions(+), 1346 deletions(-) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 51a3b9fbe1..5ff0942fff 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -176,7 +176,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { size_t weights_params, weightSize, biasSize, reserveSpaceSize, quantized_weightSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads; + int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads, + hidden_size; bool *has_load_weights; bool *apply_rotary_embedding; bool *qkv_bias; @@ -188,7 +189,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { void *weight_ptr, *bias_ptr; // for weight offload void 
*devQKVProjArray, *keyCache, *valueCache; void *qk_prods, *qk_prods_softmax; - void *attn_heads, *W_out_contiguous; + void *attn_heads; char *quantized_weight_ptr; BatchConfig::PerTokenInfo *token_infos; DataType quantization_type; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 4d2002d10b..763f654e28 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -1,6 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_ATTENTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_ATTENTION_KERNELS_H +#define QKV_WEIGHT_NUM 3 +#define KV_WEIGHT_NUM 2 + #include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" diff --git a/inference/file_loader.cc b/inference/file_loader.cc index dc724319d2..7c6870d439 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -154,7 +154,11 @@ void load_attention_bias_v2(DT *ptr, std::string weight_filepath = join_path({weights_folder, filename}); int n_heads = file_index == 0 ? num_heads : num_kv_heads; + + int replicate_num = num_heads / num_kv_heads; + size_t qkv_partial_size = qkv_inner_dim * n_heads; + size_t qkv_replicate_size = qkv_inner_dim * num_heads; size_t out_partial_size = hidden_dim; size_t partial_size = (file_index < 3) ? qkv_partial_size : out_partial_size; @@ -178,13 +182,24 @@ void load_attention_bias_v2(DT *ptr, size_t data_index = 0; - for (int i = 0; i < partial_size; i++) { - ptr[idx + i] = host_array.at(data_index); - data_index++; + // q, o + if (file_index == 0 || file_index == 3) { + for (int i = 0; i < partial_size; i++) { + ptr[idx + i] = host_array.at(data_index); + data_index++; + } + } else { + // k, v + for (int i = 0; i < partial_size; i++) { + for (int j = 0; j < replicate_num; j++) { + ptr[idx + j * partial_size + i] = host_array.at(data_index); + } + data_index++; + } } file_index++; - idx += qkv_partial_size; + idx += qkv_replicate_size; in.close(); } @@ -220,9 +235,14 @@ void load_attention_weights_v2(DT *ptr, size_t k_size = single_proj_size * num_kv_heads, v_size = single_proj_size * num_kv_heads; + size_t k_replicate_size = one_weight_file_size; + size_t v_replicate_size = one_weight_file_size; + + int replicate_num = num_heads / num_kv_heads; + // stride for q, k, v, o - size_t stride_size = - (q_size + v_size + k_size + o_size) / tensor_parallelism_degree; + size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) / + tensor_parallelism_degree; for (auto filename : weight_filenames) { std::cout << "Loading weight file " << filename << std::endl; std::string weight_filepath = join_path({weights_folder, filename}); @@ -231,7 +251,8 @@ void load_attention_weights_v2(DT *ptr, size_t partial_size = (file_index == 0 || file_index == 3) ? 
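// Annotation (illustrative numbers, not from the patch): the loader now
// replicates K/V projections so the fused QKV weight has one K and one V per
// query head. For example, with num_heads = 32 and num_kv_heads = 8:
//   int replicate_num = num_heads / num_kv_heads;   // 4
//   int kv_idx = i / replicate_num;                  // heads 0-3 read kv 0, heads 4-7 read kv 1, ...
// so each of the 8 K/V heads (weights and biases) is copied 4 times into the
// per-partition [Q | K | V] layout consumed by the attention kernels.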
one_weight_file_size : single_proj_size * num_kv_heads; - size_t one_partition_size = partial_size / tensor_parallelism_degree; + size_t one_partition_size = + one_weight_file_size / tensor_parallelism_degree; std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); if (!in.good()) { @@ -252,16 +273,30 @@ void load_attention_weights_v2(DT *ptr, assert(false && "data size mismatch"); } // wq, wk, wo - for (int i = 0; i < tensor_parallelism_degree; i++) { - for (int j = 0; j < one_partition_size; j++) { - ptr[base_index + i * stride_size + j] = host_array.at(data_index++); + if (file_index == 0) { + for (int i = 0; i < tensor_parallelism_degree; i++) { + for (int j = 0; j < one_partition_size; j++) { + ptr[base_index + i * stride_size + j] = host_array.at(data_index++); + } + } + } else { + for (int i = 0; i < num_heads; i++) { + int kv_idx = i / (num_heads / num_kv_heads); + int head_idx = i % (num_heads / tensor_parallelism_degree); + int tp_idx = (i / (num_heads / tensor_parallelism_degree)); + for (int j = 0; j < single_proj_size; j++) { + ptr[base_index + tp_idx * stride_size + single_proj_size * head_idx + + j] = host_array.at(kv_idx * single_proj_size + j); + } } } - assert(data_index == partial_size); + + // assert(data_index == partial_size); base_index += one_partition_size; file_index++; } - assert(base_index == (q_size + k_size + v_size) / tensor_parallelism_degree); + assert(base_index == (q_size + k_replicate_size + v_replicate_size) / + tensor_parallelism_degree); { std::cout << "Loading weight file " << o_file << std::endl; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 553a2f0d3d..bfcec847b9 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -29,9 +29,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.print(); if (ff.config.tensor_parallelism_degree > falcon_config.n_head || - falcon_config.n_head % ff.config.tensor_parallelism_degree != 0 || - ff.config.tensor_parallelism_degree > falcon_config.n_head_kv || - falcon_config.n_head_kv % ff.config.tensor_parallelism_degree != 0) { + falcon_config.n_head % ff.config.tensor_parallelism_degree != 0) { assert(false && "The number of attention heads is smaller, or it is not " "divisible by the tensor parallelism degree"); } diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 9a1bca48c4..2b114f09b3 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -83,14 +83,6 @@ def __init__( raise ValueError( f"Number of q attention heads ({self.falcon_config.n_head}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - if ( - self.falcon_config.n_head_kv < self.ffconfig.tensor_parallelism_degree - or self.falcon_config.n_head_kv % self.ffconfig.tensor_parallelism_degree - != 0 - ): - raise ValueError( - f"Number of k/v attention heads ({self.falcon_config.n_head_kv}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" - ) self.build_model(max_tokens_per_batch) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 33b0b26ff8..f4f28a70e1 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -83,14 +83,6 @@ def __init__( raise ValueError( f"Number of attention heads ({self.starcoder_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree 
({self.ffconfig.tensor_parallelism_degree})" ) - if ( - self.starcoder_config.n_head_kv < self.ffconfig.tensor_parallelism_degree - or self.starcoder_config.n_head_kv % self.ffconfig.tensor_parallelism_degree - != 0 - ): - raise ValueError( - f"Number of k/v attention heads ({self.starcoder_config.n_head_kv}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" - ) self.build_model(max_tokens_per_batch) diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 2f72976d30..8a3e9c96b1 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -154,8 +154,10 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, int kParas = kProjSize * kSize; int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int weight_size = qParas * num_q_heads + kParas * num_kv_heads + - vParas * num_kv_heads + oParas * num_q_heads; + + // allocate num_q_heads for key, value for replication + int weight_size = qParas * num_q_heads + kParas * num_q_heads + + vParas * num_q_heads + oParas * num_q_heads; int one_head_size = qParas + kParas + vParas + oParas; { @@ -177,7 +179,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, if (qkv_bias || final_bias) { // q, k, v, o int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0)}; li->weights[1] = create_weight_legion_ordering(1, @@ -348,7 +350,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_kv_heads * (kParas + vParas); + this->num_q_heads * (kParas + vParas); dims[1].is_replica_dim = false; if (quantization_type != DT_NONE) { @@ -367,7 +369,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; bias_shape.dims[0].size = (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -461,7 +463,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_kv_heads * (kParas + vParas); + this->num_q_heads * (kParas + vParas); dims[1].is_replica_dim = false; // dims[2].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads // * (kParas + vParas); @@ -481,7 +483,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; bias_shape.dims[0].size = (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index f09d905dd3..562898a220 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -76,51 +76,31 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int kProjSize, int vProjSize, int global_num_q_heads, - int global_num_kv_heads, int num_q_heads, - int num_kv_heads, bool scaling_query, - float scaling_factor) { - CUDA_KERNEL_LOOP(i, - num_tokens * - (qProjSize * num_q_heads + kProjSize * num_kv_heads + - vProjSize * num_kv_heads)) { + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { // for simplicity, assume q, k, v is in same shape // 0->q, 1->k, 2->v // int qkv_index = i / (num_tokens * qProjSize) % 3; - int qkv_index = i < num_tokens * qProjSize * num_q_heads - ? 0 - : (i < num_tokens * (qProjSize * num_q_heads + - kProjSize * num_kv_heads) - ? 1 - : 2); - - int q_block_size = qProjSize * num_tokens * num_q_heads; - int k_block_size = kProjSize * num_tokens * num_kv_heads; - - int bias_idx = 0; - if (qkv_index == 0) { - int head_idx = i / (num_tokens * qProjSize); - int global_head_idx = head_idx + shard_id * num_q_heads; - int global_i = i + shard_id * num_q_heads * num_tokens * qProjSize; - bias_idx = global_head_idx * qProjSize + - (global_i % (num_tokens * (qProjSize)) % qProjSize); - } else { + int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); + size_t in_token_idx = i - token_idx * hidden_size * 3; + int qkv_index = in_token_idx / hidden_size; + int proj_size = qkv_index == 0 ? qProjSize : kProjSize; - int idx = - qkv_index == 1 ? i - q_block_size : i - q_block_size - k_block_size; - int pre_length = qkv_index == 1 ? qProjSize * global_num_q_heads - : qProjSize * global_num_q_heads + - kProjSize * global_num_kv_heads; + int head_idx = + (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; + int global_head_idx = head_idx + shard_id * num_q_heads; - int head_idx = idx / (num_tokens * kProjSize); - int global_head_idx = head_idx + shard_id * num_kv_heads; - int global_idx = idx + shard_id * num_tokens * num_kv_heads * kProjSize; + size_t pre_length = + qkv_index == 0 + ? 0 + : (qkv_index == 1 ? 
qProjSize * global_num_q_heads + : qProjSize * global_num_q_heads * KV_WEIGHT_NUM); - bias_idx = pre_length + global_head_idx * kProjSize + - (global_idx % (num_tokens * (qProjSize)) % qProjSize); - } + size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; input_ptr[i] += bias_ptr[bias_idx]; @@ -134,9 +114,12 @@ __global__ void scaling_query_kernel(DT *input_ptr, int qProjSize, int num_tokens, int num_q_heads, - float scaling_factor) { - CUDA_KERNEL_LOOP(i, num_tokens * (qProjSize * num_q_heads)) { - input_ptr[i] *= scaling_factor; + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= + scaling_factor; } } @@ -192,28 +175,22 @@ __global__ void BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, - int num_q_heads, int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { + size_t q_array_size, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { // create complex number bool q_tensor = i < (q_array_size / 2); int proj_size = q_tensor ? qProjSize : kProjSize; int real_i = q_tensor ? i : i - q_array_size / 2; - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + int token_idx = real_i / (hidden_size / 2); + int idx = real_i % (proj_size / 2); + int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); - int real_part_index = idx + token_idx * (proj_size / 2) + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 0 : q_array_size); + int real_part_index = idx + head_idx * proj_size + + token_idx * hidden_size * 3 + + hidden_size * (q_tensor ? 0 : 1); int complex_part_index = real_part_index + (proj_size / 2); complex_input[i] = {input_ptr[real_part_index], @@ -244,35 +221,24 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, int num_tokens, - int num_q_heads, - int num_kv_heads, - int max_seq_len) { - CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { - int q_array_size = qProjSize * num_tokens * num_q_heads; - int k_array_size = kProjSize * num_tokens * num_kv_heads; - - bool k_cache = i < k_array_size; - int real_i = k_cache ? i : i - k_array_size; - - int proj_size = k_cache ? kProjSize : vProjSize; - int head_idx = real_i / (num_tokens * proj_size); - int token_idx = (real_i - head_idx * (num_tokens * proj_size)) / proj_size; - int data_idx = real_i % proj_size; - - DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_tokens + - token_idx * proj_size + data_idx]; + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - DT *cache_ptr = k_cache ? 
kCache_ptr : vCache_ptr; - cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; } } @@ -301,52 +267,39 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens // Output >>> qProjSize x num_tokens x 3 x num_q_heads - int m_q = m->qProjSize; - int m_k = m->kProjSize; - int m_v = m->vProjSize; + int m_q = m->qProjSize * m->num_q_heads; + int m_k = m->kProjSize * m->num_q_heads; + int m_v = m->vProjSize * m->num_q_heads; assert(m_q == m_k && m_k == m_v); // keep things simple for now int n = bc->num_active_tokens(); int k = m->qSize; - int m_ = m_q; - int lda = k, ldb = k, ldc = m_q; - - size_t strideA = m_q * k; // query weight head size - size_t strideB = 0; // input stays the same for all heads. - size_t strideC = m_q * n; // size of the output block for each head. - - // compute QKV - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - hipblas_data_type, - lda, - strideA, - input_ptr, - hipblas_data_type, - ldb, - strideB, - &beta, - output_ptr, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads + m->num_kv_heads + - m->num_kv_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + int m_ = m_q * QKV_WEIGHT_NUM; + int lda = k, ldb = k, ldc = m_; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + hipblas_data_type, + lda, + input_ptr, + hipblas_data_type, + ldb, + &beta, + output_ptr, + hipblas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; - int q_block_size = m->qProjSize * num_tokens; - int k_block_size = m->kProjSize * num_tokens; - int q_array_size = m->qProjSize * num_tokens * m->num_q_heads; + size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; // apply bias for q, k, v if (*m->qkv_bias) { hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
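// Annotation on the new layout (a reading of the code above, not part of the
// patch): devQKVProjArray now holds, for every token, one contiguous block of
// 3 * hidden_size values laid out as [Q | K | V], where
// hidden_size = num_q_heads * qProjSize. This is why compute_qkv_kernel can
// issue a single hipblasGemmEx with m_ = 3 * hidden_size instead of the old
// per-head GemmStridedBatched call, and why store_kv_cache reads K at offset
// hidden_size and V at offset 2 * hidden_size inside each token's block.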
), @@ -362,11 +315,10 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, m->global_num_q_heads, - m->global_num_kv_heads, m->num_q_heads, - m->num_kv_heads, *m->scaling_query, - m->scaling_factor); + m->scaling_factor, + m->hidden_size); } else if (m->scaling_query) { hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel
), GET_BLOCKS(parallelism), @@ -377,13 +329,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, num_tokens, m->num_q_heads, m->qProjSize, - m->scaling_factor); + m->scaling_factor, + m->hidden_size); } if (*m->apply_rotary_embedding) { /*q&k*/ - parallelism = - num_tokens * - (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_kv_heads) / 2; + parallelism = num_tokens * m->hidden_size; hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -394,12 +345,9 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->token_infos, m->qProjSize, m->kProjSize, - m->num_q_heads, num_tokens, - m->num_kv_heads, - q_block_size, - k_block_size, - q_array_size); + q_array_size, + m->hidden_size); } } @@ -409,8 +357,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, hipStream_t stream) { int num_tokens = bc->num_active_tokens(); if (num_tokens > 0) { - int parallelism = - (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; + int parallelism = m->hidden_size * num_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -420,13 +367,9 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, static_cast
(m->keyCache), static_cast
(m->valueCache), m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, num_tokens, - m->num_q_heads, - m->num_kv_heads, - BatchConfig::max_sequence_length()); + BatchConfig::max_sequence_length(), + m->hidden_size); } } @@ -575,11 +518,13 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; - int q_block_size = m->qProjSize * num_tokens; - int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); - int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); - int vt_req_block_size = vt_block_size * m->num_kv_heads; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -595,7 +540,8 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_ = num_new_tokens; int n = total_tokens; int k = m->qProjSize; - int lda = k, ldb = k, ldc = m_; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; @@ -605,72 +551,37 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, } // To get A, skip over Q entries from previous requests (same head) DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize; + tokens_previous_requests * m->qProjSize * m->num_q_heads * + QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests DT *C = static_cast
(m->qk_prods); - if (m->num_kv_heads == m->num_q_heads) { - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); - } else { - strideB = 0; - // use hipblasGemmStridedBatchedEx - int one_step_heads = m->num_q_heads / m->num_kv_heads; - m_ = num_new_tokens; - n = total_tokens; - k = m->qProjSize; - lda = k, ldb = k, ldc = m_; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A + step * strideA * one_step_heads, - hipblas_data_type, - lda, - strideA, - B + step * kt_block_size, - hipblas_data_type, - ldb, - strideB, - &beta, - C + step * strideC * one_step_heads, - hipblas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - } // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; @@ -740,7 +651,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m_ = num_new_tokens; n = m->vProjSize; k = total_tokens; - lda = m_, ldb = n, ldc = m_; + lda = m_, ldb = n * m->num_q_heads, ldc = m_; strideA = num_new_tokens * total_tokens; strideB = vt_block_size; strideC = num_new_tokens * m->vProjSize; @@ -755,64 +666,29 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, C = static_cast
(m->attn_heads) + tokens_previous_requests * m->num_q_heads * m->vProjSize; - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } else { - int one_step_heads = m->num_q_heads / m->num_kv_heads; - n = m->vProjSize; - lda = m_, ldb = n, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = 0; - strideC = num_new_tokens * m->vProjSize; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A + step * one_step_heads * strideA, - hipblas_data_type, - lda, - strideA, - B + step * vt_block_size, - hipblas_data_type, - ldb, - strideB, - &beta, - C + step * one_step_heads * strideC, - hipblas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - } + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; @@ -820,8 +696,8 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, n = num_new_tokens; lda = k, ldb = n, ldc = m_; A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_kv_heads + - m->vProjSize * m->num_kv_heads); + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); B = C; C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; @@ -850,8 +726,8 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_kv_heads + - m->vProjSize * m->global_num_kv_heads; + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -1028,11 +904,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( global_num_kv_heads = _global_num_kv_heads; num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; + hidden_size = num_q_heads * qProjSize; weightSize = ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) * num_q_heads + - (kSize * kProjSize + vSize * vProjSize) * num_kv_heads) * + (kSize * kProjSize + vSize * vProjSize) * num_q_heads) * size_of_dt; if (quantization_type != DT_NONE) { quantized_weightSize = get_quantization_to_byte_size( @@ -1041,7 +918,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; int final_bias_size = oProjSize; biasSize = (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0); @@ -1071,28 +948,27 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t qkv_max_proj_size = - max_tokens_per_batch * - (qProjSize * num_q_heads + kProjSize * num_kv_heads + - vProjSize * num_kv_heads); + size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + + kProjSize * num_q_heads + + vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { case INC_DECODING_MODE: case TREE_VERIFY_MODE: { - key_cache_size = num_kv_heads * kProjSize * + key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); - value_cache_size = num_kv_heads * vProjSize * + value_cache_size = num_q_heads * vProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); break; } case BEAM_SEARCH_MODE: { - key_cache_size = num_kv_heads * kProjSize * + key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - value_cache_size = num_kv_heads * vProjSize * + value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -1105,14 +981,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; - size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
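Note: with the key/value heads replicated, every size formula in the meta constructor switches from num_kv_heads to num_q_heads, and hidden_size = num_q_heads * qProjSize becomes the common unit. A small sketch of the resulting element counts, using a hypothetical helper that mirrors the formulas above (assumes one K and one V head per query head):

#include <cstddef>

struct AttnSizes {
  size_t qkv_proj, key_cache, value_cache, qkv_bias;
};

// Hypothetical helper: element counts (not bytes) for the main buffers.
AttnSizes attention_buffer_sizes(int num_q_heads, int qProjSize,
                                 int kProjSize, int vProjSize,
                                 int max_tokens_per_batch,
                                 int max_requests_per_batch,
                                 int max_sequence_length) {
  AttnSizes s;
  // Q, K and V activations for a batch, interleaved per token.
  s.qkv_proj = (size_t)max_tokens_per_batch *
               (qProjSize + kProjSize + vProjSize) * num_q_heads;
  // One hidden_size-wide cache line per (request, token) slot.
  s.key_cache = (size_t)num_q_heads * kProjSize *
                max_requests_per_batch * max_sequence_length;
  s.value_cache = (size_t)num_q_heads * vProjSize *
                  max_requests_per_batch * max_sequence_length;
  // One bias entry per projected dimension of Q, K and V.
  s.qkv_bias = (size_t)qProjSize * num_q_heads +
               (size_t)(kProjSize + vProjSize) * num_q_heads;
  return s;
}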
vProjSize : vSize); - size_t W_out_contiguous_size = W_out_block_size * num_q_heads; size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + - kProjSize * num_kv_heads)) / + kProjSize * num_q_heads)) / 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + - 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * + 2 * qk_prod_size + attn_heads_size) * size_of_dt + tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + complex_size * sizeof(hipFloatComplex); // more components will @@ -1173,9 +1047,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * size_of_dt); // offset += attn_heads_size * size_of_dt; - W_out_contiguous = gpu_mem_allocator.allocate_reserved_untyped( - W_out_contiguous_size * size_of_dt); - // offset += W_out_contiguous_size * size_of_dt; complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(hipFloatComplex); @@ -1189,8 +1060,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( qk_prod_size * size_of_dt); attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * size_of_dt); - W_out_contiguous = gpu_mem_allocator.allocate_instance_untyped( - W_out_contiguous_size * size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 5a2a14387e..00d45a9cfa 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -76,59 +76,33 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int kProjSize, int vProjSize, int global_num_q_heads, - int global_num_kv_heads, int num_q_heads, - int num_kv_heads, bool scaling_query, - float scaling_factor) { - CUDA_KERNEL_LOOP(i, - num_tokens * - (qProjSize * num_q_heads + kProjSize * num_kv_heads + - vProjSize * num_kv_heads)) { + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { // for simplicity, assume q, k, v is in same shape // 0->q, 1->k, 2->v // int qkv_index = i / (num_tokens * qProjSize) % 3; - int qkv_index = i < num_tokens * qProjSize * num_q_heads - ? 0 - : (i < num_tokens * (qProjSize * num_q_heads + - kProjSize * num_kv_heads) - ? 1 - : 2); - - // int head_idx = i / (num_tokens * (qProjSize + kProjSize + vProjSize)); - // int qkv_block_size = (qProjSize + kProjSize + vProjSize) * num_tokens; - int q_block_size = qProjSize * num_tokens * num_q_heads; - int k_block_size = kProjSize * num_tokens * num_kv_heads; - - // int idx = i % (num_tokens * (qProjSize)); - - // int real_part_index = - // head_idx * qkv_block_size + qkv_index * q_block_size + idx; - int bias_idx = 0; - if (qkv_index == 0) { - int head_idx = i / (num_tokens * qProjSize); - int global_head_idx = head_idx + shard_id * num_q_heads; - int global_i = i + shard_id * num_q_heads * num_tokens * qProjSize; - bias_idx = global_head_idx * qProjSize + - (global_i % (num_tokens * (qProjSize)) % qProjSize); - } else { + int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); + size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; - int idx = - qkv_index == 1 ? i - q_block_size : i - q_block_size - k_block_size; - int pre_length = qkv_index == 1 ? 
qProjSize * global_num_q_heads - : qProjSize * global_num_q_heads + - kProjSize * global_num_kv_heads; + int qkv_index = in_token_idx / hidden_size; - int head_idx = idx / (num_tokens * kProjSize); - int global_head_idx = head_idx + shard_id * num_kv_heads; - int global_idx = idx + shard_id * num_tokens * num_kv_heads * kProjSize; + int proj_size = qkv_index == 0 ? qProjSize : kProjSize; - bias_idx = pre_length + global_head_idx * kProjSize + - (global_idx % (num_tokens * (qProjSize)) % qProjSize); - } - // int bias_idx = qkv_index * qProjSize * global_num_q_heads + - // global_head_idx * qProjSize + (idx % qProjSize); + int head_idx = + (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; + int global_head_idx = head_idx + shard_id * num_q_heads; + + size_t pre_length = + qkv_index == 0 + ? 0 + : (qkv_index == 1 ? qProjSize * global_num_q_heads + : qProjSize * global_num_q_heads * KV_WEIGHT_NUM); + + size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; input_ptr[i] += bias_ptr[bias_idx]; @@ -143,9 +117,12 @@ __global__ void scaling_query_kernel(DT *input_ptr, int qProjSize, int num_tokens, int num_q_heads, - float scaling_factor) { - CUDA_KERNEL_LOOP(i, num_tokens * (qProjSize * num_q_heads)) { - input_ptr[i] *= scaling_factor; + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= + scaling_factor; } } @@ -205,28 +182,22 @@ __global__ void BatchConfig::PerTokenInfo const *tokenInfos, int qProjSize, int kProjSize, - int num_q_heads, int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { + size_t q_array_size, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { // create complex number bool q_tensor = i < (q_array_size / 2); int proj_size = q_tensor ? qProjSize : kProjSize; int real_i = q_tensor ? i : i - q_array_size / 2; - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); + int token_idx = real_i / (hidden_size / 2); + int idx = real_i % (proj_size / 2); + int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); - int real_part_index = idx + token_idx * (proj_size / 2) + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 0 : q_array_size); + int real_part_index = idx + head_idx * proj_size + + token_idx * hidden_size * QKV_WEIGHT_NUM + + hidden_size * (q_tensor ? 
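Note: the rewritten bias and rotary-embedding kernels assume devQKVProjArray is laid out token-major: for each token, hidden_size Q values, then hidden_size K values, then hidden_size V values (QKV_WEIGHT_NUM = 3), with each hidden_size block ordered head by head. A minimal sketch of the index decomposition those kernels perform, written as host code and assuming equal Q and K projection sizes:

// Decompose a flat index over the interleaved layout
// [proj_size, num_heads, 3, num_tokens]; hidden_size = num_heads * proj_size.
struct QkvCoord {
  int token_idx;  // which token in the batch
  int qkv_index;  // 0 = Q, 1 = K, 2 = V
  int head_idx;   // which head inside that block
  int dim_idx;    // position inside the head's projection
};

QkvCoord decompose(int i, int num_heads, int proj_size) {
  int hidden_size = num_heads * proj_size;
  QkvCoord c;
  c.token_idx = i / (hidden_size * 3);
  int in_token = i - c.token_idx * hidden_size * 3;
  c.qkv_index = in_token / hidden_size;
  int in_block = in_token - c.qkv_index * hidden_size;
  c.head_idx = in_block / proj_size;
  c.dim_idx = in_block % proj_size;
  return c;
}

// Inverse mapping, e.g. to locate the K value of (token, head, dim).
int flat_index(QkvCoord c, int num_heads, int proj_size) {
  int hidden_size = num_heads * proj_size;
  return c.token_idx * hidden_size * 3 + c.qkv_index * hidden_size +
         c.head_idx * proj_size + c.dim_idx;
}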
0 : 1); int complex_part_index = real_part_index + (proj_size / 2); complex_input[i] = {input_ptr[real_part_index], @@ -277,51 +248,38 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens // Output >>> qProjSize x num_tokens x 3 x num_q_heads - int m_q = m->qProjSize; - int m_k = m->kProjSize; - int m_v = m->vProjSize; + int m_q = m->qProjSize * m->num_q_heads; + int m_k = m->kProjSize * m->num_q_heads; + int m_v = m->vProjSize * m->num_q_heads; assert(m_q == m_k && m_k == m_v); // keep things simple for now int n = bc->num_active_tokens(); int k = m->qSize; - int m_ = m_q; - int lda = k, ldb = k, ldc = m_q; - - size_t strideA = m_q * k; // query weight head size - size_t strideB = 0; // input stays the same for all heads. - size_t strideC = m_q * n; // size of the output block for each head. - - // compute QKV - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - strideA, - input_ptr, - cublas_data_type, - ldb, - strideB, - &beta, - output_ptr, - cublas_data_type, - ldc, - strideC, - m->num_q_heads + m->num_kv_heads + - m->num_kv_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // apply rotary emmmbedding for q and k - // step1 change the k, v to complex tensor + int m_ = m_q * QKV_WEIGHT_NUM; + int lda = k, ldb = k, ldc = m_; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // apply rotary emmmbedding for q + // and k step1 change the k, v to complex tensor int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; - int q_block_size = m->qProjSize * num_tokens; - int k_block_size = m->kProjSize * num_tokens; - int q_array_size = m->qProjSize * num_tokens * m->num_q_heads; + size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; // apply bias for q, k, v if (*m->qkv_bias) { apply_proj_bias_qkv<<kProjSize, m->vProjSize, m->global_num_q_heads, - m->global_num_kv_heads, m->num_q_heads, - m->num_kv_heads, *m->scaling_query, - m->scaling_factor); + m->scaling_factor, + m->hidden_size); } else if (m->scaling_query) { scaling_query_kernel<<num_q_heads, m->qProjSize, - m->scaling_factor); + m->scaling_factor, + m->hidden_size); } if (*m->apply_rotary_embedding) { /*q&k*/ - parallelism = - num_tokens * - (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_kv_heads) / 2; + parallelism = num_tokens * m->hidden_size; apply_rotary_embedding_hf<<token_infos, m->qProjSize, m->kProjSize, - m->num_q_heads, num_tokens, - m->num_kv_heads, - q_block_size, - k_block_size, - q_array_size); + q_array_size, + m->hidden_size); } } @@ -378,8 +331,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); if (num_tokens > 0) { - int parallelism = - (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; + int parallelism = m->hidden_size * num_tokens; store_kv_cache<<(m->keyCache), static_cast
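Note: because the Q, K and V weight matrices for all heads are stored back to back along the output dimension, the per-head strided-batched projection collapses into one plain GEMM: a (qSize) x (3 * num_q_heads * qProjSize) weight times a (qSize) x (num_tokens) input gives the full interleaved QKV activation. A CPU sketch of that shape reasoning (float and the function name are assumptions):

// Reference for the fused QKV projection: Out = W^T * X, column-major,
// with m_ = 3 * num_q_heads * qProjSize, n = num_tokens, k = qSize.
void qkv_projection_reference(const float *W,   // k x m_, lda = k
                              const float *X,   // k x n,  ldb = k
                              float *Out,       // m_ x n, ldc = m_
                              int m_, int n, int k) {
  for (int col = 0; col < n; col++)       // one column per token
    for (int row = 0; row < m_; row++) {  // one row per projected dimension
      float acc = 0.0f;
      for (int p = 0; p < k; p++)
        acc += W[row * k + p] * X[col * k + p];  // op(W) = W^T
      Out[col * m_ + row] = acc;
    }
}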
(m->valueCache), m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, num_tokens, - m->num_q_heads, - m->num_kv_heads, - BatchConfig::max_sequence_length()); + BatchConfig::max_sequence_length(), + m->hidden_size); } } @@ -507,35 +455,26 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, int num_tokens, - int num_q_heads, - int num_kv_heads, - int max_seq_len) { - CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { - int q_array_size = qProjSize * num_tokens * num_q_heads; - int k_array_size = kProjSize * num_tokens * num_kv_heads; - - bool k_cache = i < k_array_size; - int real_i = k_cache ? i : i - k_array_size; - - int proj_size = k_cache ? kProjSize : vProjSize; - int head_idx = real_i / (num_tokens * proj_size); - int token_idx = (real_i - head_idx * (num_tokens * proj_size)) / proj_size; - int data_idx = real_i % proj_size; - - DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_tokens + - token_idx * proj_size + data_idx]; + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; - cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; } } @@ -578,11 +517,13 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; - int q_block_size = m->qProjSize * num_tokens; - int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); - int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); - int vt_req_block_size = vt_block_size * m->num_kv_heads; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -598,7 +539,8 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_ = num_new_tokens; int n = total_tokens; int k = m->qProjSize; - int lda = k, ldb = k, ldc = m_; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; @@ -608,72 +550,36 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, } // To get A, skip over Q entries from previous requests (same head) DT const *A = static_cast
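Note: store_kv_cache now treats the key and value caches as [hidden_size, max_seq_len, num_requests] arrays and copies one K value and one V value per loop index; the source offset skips the token's Q block (the + hidden_size term). A host-side sketch of the same copy, with assumed parameter names standing in for the token-info fields:

#include <cstddef>

// Mirror of the store_kv_cache indexing: qkv is the interleaved
// [Q | K | V] activation per token, caches are hidden_size-contiguous per slot.
void store_kv_cache_reference(const float *qkv, float *k_cache, float *v_cache,
                              const int *request_index,         // per token
                              const int *abs_depth_in_request,  // per token
                              int num_tokens, int hidden_size, int max_seq_len) {
  for (int token_idx = 0; token_idx < num_tokens; token_idx++)
    for (int offset = 0; offset < hidden_size; offset++) {
      size_t val_idx = (size_t)token_idx * 3 * hidden_size + hidden_size + offset;
      float kVal = qkv[val_idx];                // K block of this token
      float vVal = qkv[val_idx + hidden_size];  // V block of this token
      int req_id = request_index[token_idx];
      int tok_id = abs_depth_in_request[token_idx];
      size_t slot = (size_t)req_id * hidden_size * max_seq_len +
                    (size_t)tok_id * hidden_size + offset;
      k_cache[slot] = kVal;
      v_cache[slot] = vVal;
    }
}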
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize; + tokens_previous_requests * m->qProjSize * m->num_q_heads * + QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests DT *C = static_cast<DT *>
(m->qk_prods); - if (m->num_kv_heads == m->num_q_heads) { - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - } else { - strideB = 0; - // use cublasGemmStridedBatchedEx - int one_step_heads = m->num_q_heads / m->num_kv_heads; - m_ = num_new_tokens; - n = total_tokens; - k = m->qProjSize; - lda = k, ldb = k, ldc = m_; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A + step * strideA * one_step_heads, - cublas_data_type, - lda, - strideA, - B + step * kt_block_size, - cublas_data_type, - ldb, - strideB, - &beta, - C + step * strideC * one_step_heads, - cublas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - } + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; @@ -745,7 +651,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m_ = num_new_tokens; n = m->vProjSize; k = total_tokens; - lda = m_, ldb = n, ldc = m_; + lda = m_, ldb = n * m->num_q_heads, ldc = m_; strideA = num_new_tokens * total_tokens; strideB = vt_block_size; strideC = num_new_tokens * m->vProjSize; @@ -759,65 +665,29 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, // requests C = static_cast
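Note: the new leading dimensions encode the interleaved layouts directly. In the QK^T GEMM, a query column for the next token sits qProjSize * num_q_heads * QKV_WEIGHT_NUM elements further on (one whole Q/K/V record), while consecutive tokens in the key cache are hidden_size = kProjSize * num_q_heads apart. A small sketch of the offset arithmetic the lda/ldb/stride values rely on (illustrative, assuming qProjSize == kProjSize):

#include <cstddef>

// A (queries): devQKVProjArray, lda = qProjSize * num_q_heads * 3, strideA = qProjSize.
size_t q_offset(int token, int head, int dim, int qProjSize, int num_q_heads) {
  return (size_t)token * qProjSize * num_q_heads * 3  // next token's QKV record
         + (size_t)head * qProjSize                   // head block inside the Q part
         + dim;
}

// B (key cache): keyCache, ldb = kProjSize * num_q_heads, strideB = kProjSize.
size_t k_cache_offset(int token, int head, int dim, int kProjSize, int num_q_heads) {
  return (size_t)token * kProjSize * num_q_heads      // next cached token
         + (size_t)head * kProjSize                   // head block
         + dim;
}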
(m->attn_heads) + tokens_previous_requests * m->num_q_heads * m->vProjSize; - - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } else { - int one_step_heads = m->num_q_heads / m->num_kv_heads; - n = m->vProjSize; - lda = m_, ldb = n, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = 0; - strideC = num_new_tokens * m->vProjSize; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A + step * one_step_heads * strideA, - cublas_data_type, - lda, - strideA, - B + step * vt_block_size, - cublas_data_type, - ldb, - strideB, - &beta, - C + step * one_step_heads * strideC, - cublas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - } + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; m_ = m->oProjSize; @@ -825,8 +695,8 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, n = num_new_tokens; lda = k, ldb = n, ldc = m_; A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_kv_heads + - m->vProjSize * m->num_kv_heads); + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); B = C; C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; @@ -855,8 +725,8 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_kv_heads + - m->vProjSize * m->global_num_kv_heads; + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; apply_proj_bias_w<< 0 ? vProjSize : vSize)) * num_q_heads + - (kSize * kProjSize + vSize * vProjSize) * num_kv_heads) * + (kSize * kProjSize + vSize * vProjSize) * num_q_heads) * size_of_dt; if (quantization_type != DT_NONE) { quantized_weightSize = get_quantization_to_byte_size( @@ -1070,7 +941,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; int final_bias_size = oProjSize; biasSize = (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0); @@ -1100,28 +971,27 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t qkv_max_proj_size = - max_tokens_per_batch * - (qProjSize * num_q_heads + kProjSize * num_kv_heads + - vProjSize * num_kv_heads); + size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + + kProjSize * num_q_heads + + vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { case INC_DECODING_MODE: case TREE_VERIFY_MODE: { - key_cache_size = num_kv_heads * kProjSize * + key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); - value_cache_size = num_kv_heads * vProjSize * + value_cache_size = num_q_heads * vProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); break; } case BEAM_SEARCH_MODE: { - key_cache_size = num_kv_heads * kProjSize * + key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - value_cache_size = num_kv_heads * vProjSize * + value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -1134,14 +1004,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; - size_t W_out_block_size = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - size_t W_out_contiguous_size = W_out_block_size * num_q_heads; size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + - kProjSize * num_kv_heads)) / + kProjSize * num_q_heads)) / 2; size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + - 2 * qk_prod_size + attn_heads_size + W_out_contiguous_size) * + 2 * qk_prod_size + attn_heads_size) * size_of_dt + tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + complex_size * sizeof(cuFloatComplex); // more components will @@ -1202,9 +1070,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * size_of_dt); // offset += attn_heads_size * size_of_dt; - W_out_contiguous = gpu_mem_allocator.allocate_reserved_untyped( - W_out_contiguous_size * size_of_dt); - // offset += W_out_contiguous_size * size_of_dt; complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(cuFloatComplex); @@ -1218,8 +1083,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( qk_prod_size * size_of_dt); attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * size_of_dt); - W_out_contiguous = gpu_mem_allocator.allocate_instance_untyped( - W_out_contiguous_size * size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 4c78960d5f..eb6fd721e6 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -152,8 +152,8 @@ Tensor int kParas = kProjSize * kSize; int vParas = vProjSize * vSize; int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int weight_size = qParas * num_q_heads + kParas * num_kv_heads + - vParas * num_kv_heads + oParas * num_q_heads; + int weight_size = qParas * num_q_heads + kParas * num_q_heads + + vParas * num_q_heads + oParas * num_q_heads; { int dims[1] = {weight_size}; li->weights[0] = create_weight_legion_ordering(1, @@ -167,7 +167,7 @@ Tensor if (qkv_bias || final_bias) { // q, k, v, o int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0)}; li->weights[1] = create_weight_legion_ordering(1, @@ -319,7 +319,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_kv_heads * (kParas + vParas); + this->num_q_heads * (kParas + vParas); dims[1].is_replica_dim = false; int seed = std::rand(); Initializer *initializer = new GlorotUniform(seed); @@ -332,7 +332,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; bias_shape.dims[0].size = (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -421,7 +421,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_kv_heads * (kParas + vParas); + this->num_q_heads * (kParas + vParas); dims[1].is_replica_dim = false; // dims[2].size = qParas + kParas + vParas + oParas; int seed = std::rand(); @@ -435,7 +435,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; bias_shape.dims[0].size = (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 30b6f5cb84..173d4a5b1d 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -44,22 +44,19 @@ __global__ void spec_store_kv_cache( int kProjSize, int vProjSize, int num_tokens, - int num_q_heads, - int num_kv_heads, int max_seq_len, int max_beam_width, - bool is_root) { - CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { - int q_array_size = qProjSize * num_tokens * num_q_heads; - int k_array_size = kProjSize * num_tokens * num_kv_heads; + bool is_root, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { + int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int offset = i % hidden_size; - bool k_cache = i < k_array_size; - int real_i = k_cache ? i : i - k_array_size; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - int proj_size = k_cache ? kProjSize : vProjSize; - int head_idx = real_i / (num_tokens * proj_size); - int token_idx = (real_i - head_idx * (num_tokens * proj_size)) / proj_size; - int data_idx = real_i % proj_size; + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; // above no need to be changed // int const req_id = id_map[token_idx].request_index; @@ -69,10 +66,6 @@ __global__ void spec_store_kv_cache( // int const beam_depth = id_map[token_idx].beam_depth; // int const beam_width = id_map[token_idx].beam_width; - DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_tokens + - token_idx * proj_size + data_idx]; - int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; @@ -81,27 +74,26 @@ __global__ void spec_store_kv_cache( int const beam_width = beamRequestInfos[req_id].beam_size; // new token - int new_token_cache_idx = (req_id * max_beam_width + sub_req_id) * - (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + - tok_id * proj_size + data_idx; - - DT *cache_ptr = k_cache ? 
kCache_ptr : vCache_ptr; - cache_ptr[new_token_cache_idx] = val; + kCache_ptr[(req_id * max_beam_width + sub_req_id) * + (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = kVal; + vCache_ptr[(req_id * max_beam_width + sub_req_id) * + (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = vVal; // replica in the root iteration if (beam_depth == 1) { for (int i = 1; i < beam_width; i++) { - cache_ptr[(req_id * max_beam_width + i) * - (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; + kCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = kVal; + vCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = vVal; } } // naive cache stealing if (sub_req_id != parent_id) { - if (data_idx == 0 && head_idx == 0 && k_cache) { + if (offset == 0 && tok_id == 0) { printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " "%d, tok_id %d\n", beam_depth, @@ -114,14 +106,13 @@ __global__ void spec_store_kv_cache( for (int depth = 0; depth < beam_depth; depth++) { int steal_token_idx = tok_id - beam_depth + depth; int steal_from_idx = (req_id * max_beam_width + parent_id) * - (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + - steal_token_idx * proj_size + data_idx; + (hidden_size * max_seq_len) + + steal_token_idx * hidden_size + offset; int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + - steal_token_idx * proj_size + data_idx; - cache_ptr[steal_to_idx] = cache_ptr[steal_from_idx]; + (hidden_size * max_seq_len) + + steal_token_idx * hidden_size + offset; + kCache_ptr[steal_to_idx] = kCache_ptr[steal_from_idx]; + vCache_ptr[steal_to_idx] = vCache_ptr[steal_from_idx]; // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ // printf("cache stealing kernel!, steal_token_idx %d\n", @@ -155,8 +146,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); if (num_tokens > 0) { - int parallelism = - (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; + int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_store_kv_cache
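Note: the speculative (beam search) path uses the same hidden_size-contiguous cache layout, with one cache region per (request, sub-request) pair; "cache stealing" copies the parent beam's cached prefix into a child slot. A compact host-side sketch of that copy under the new indexing (parameter names assumed):

#include <cstddef>

// Copy the most recent `beam_depth` cached tokens of the parent sub-request
// into the child sub-request for one request, mirroring the kernel's index math.
void steal_beam_cache(float *k_cache, float *v_cache,
                      int req_id, int parent_id, int sub_req_id,
                      int tok_id, int beam_depth,
                      int max_beam_width, int max_seq_len, int hidden_size) {
  for (int depth = 0; depth < beam_depth; depth++) {
    int steal_token_idx = tok_id - beam_depth + depth;
    for (int offset = 0; offset < hidden_size; offset++) {
      size_t from = ((size_t)req_id * max_beam_width + parent_id) *
                        hidden_size * max_seq_len +
                    (size_t)steal_token_idx * hidden_size + offset;
      size_t to = ((size_t)req_id * max_beam_width + sub_req_id) *
                      hidden_size * max_seq_len +
                  (size_t)steal_token_idx * hidden_size + offset;
      k_cache[to] = k_cache[from];
      v_cache[to] = v_cache[from];
    }
  }
}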
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -173,11 +163,10 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens, - m->num_q_heads, - m->num_kv_heads, BatchConfig::max_sequence_length(), BeamSearchBatchConfig::MAX_BEAM_WIDTH, - /*root*/ curr_depth == 0); + /*root*/ curr_depth == 0, + m->hidden_size); } } @@ -223,12 +212,13 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int tokens_prev_requests_squares = 0; // int qkv_block_size = // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int q_block_size = m->qProjSize * num_tokens; - - int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); - int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); - int vt_req_block_size = vt_block_size * m->num_kv_heads; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -247,7 +237,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int m_ = num_new_tokens; int n = total_tokens; int k = m->qProjSize; - int lda = k, ldb = k, ldc = m_; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; @@ -259,7 +250,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } // To get A, skip over Q entries from previous requests (same head) DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize; + tokens_previous_requests * m->qProjSize * m->num_q_heads * + QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) DT const *B = static_cast
(m->keyCache) + @@ -274,64 +266,30 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT *C = static_cast<DT *>
(m->qk_prods) + m->num_q_heads * tokens_prev_requests_squares; - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } else { - strideB = 0; - int one_step_heads = m->num_q_heads / m->num_kv_heads; - m_ = num_new_tokens; - n = total_tokens; - k = m->qProjSize; - lda = k, ldb = k, ldc = m_; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A + step * strideA * one_step_heads, - hipblas_data_type, - lda, - strideA, - B + step * kt_block_size, - hipblas_data_type, - ldb, - strideB, - &beta, - C + step * strideC * one_step_heads, - hipblas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - } + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), @@ -401,7 +359,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m_ = num_new_tokens; n = m->vProjSize; k = total_tokens; - lda = m_, ldb = n, ldc = m_; + lda = m_, ldb = n * m->num_q_heads, ldc = m_; strideA = num_new_tokens * total_tokens; strideB = vt_block_size; strideC = num_new_tokens * m->vProjSize; @@ -417,64 +375,29 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, C = static_cast
(m->attn_heads) + tokens_previous_requests * m->num_q_heads * m->vProjSize; - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } else { - int one_step_heads = m->num_q_heads / m->num_kv_heads; - n = m->vProjSize; - lda = m_, ldb = n, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = 0; - strideC = num_new_tokens * m->vProjSize; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A + step * one_step_heads * strideA, - hipblas_data_type, - lda, - strideA, - B + step * vt_block_size, - hipblas_data_type, - ldb, - strideB, - &beta, - C + step * one_step_heads, - hipblas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - } + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; @@ -483,8 +406,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, n = num_new_tokens; lda = k, ldb = n, ldc = m_; A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_kv_heads + - m->vProjSize * m->num_kv_heads); + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); B = C; C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; @@ -515,8 +438,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_kv_heads + - m->vProjSize * m->global_num_kv_heads; + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b479528607..00eec96824 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -43,22 +43,19 @@ __global__ void spec_store_kv_cache( int kProjSize, int vProjSize, int num_tokens, - int num_q_heads, - int num_kv_heads, int max_seq_len, int max_beam_width, - bool is_root) { - CUDA_KERNEL_LOOP(i, num_tokens * (kProjSize + vProjSize) * num_kv_heads) { - int q_array_size = qProjSize * num_tokens * num_q_heads; - int k_array_size = kProjSize * num_tokens * num_kv_heads; + bool is_root, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { + int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int offset = i % hidden_size; - bool k_cache = i < k_array_size; - int real_i = k_cache ? i : i - k_array_size; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - int proj_size = k_cache ? kProjSize : vProjSize; - int head_idx = real_i / (num_tokens * proj_size); - int token_idx = (real_i - head_idx * (num_tokens * proj_size)) / proj_size; - int data_idx = real_i % proj_size; + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; // above no need to be changed // int const req_id = id_map[token_idx].request_index; @@ -68,10 +65,6 @@ __global__ void spec_store_kv_cache( // int const beam_depth = id_map[token_idx].beam_depth; // int const beam_width = id_map[token_idx].beam_width; - DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_tokens + - token_idx * proj_size + data_idx]; - int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; @@ -79,22 +72,20 @@ __global__ void spec_store_kv_cache( int const beam_depth = beamRequestInfos[req_id].current_depth; int const beam_width = beamRequestInfos[req_id].beam_size; - // new token - int new_token_cache_idx = (req_id * max_beam_width + sub_req_id) * - (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + - tok_id * proj_size + data_idx; - - DT *cache_ptr = k_cache ? 
kCache_ptr : vCache_ptr; - cache_ptr[new_token_cache_idx] = val; + kCache_ptr[(req_id * max_beam_width + sub_req_id) * + (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = kVal; + vCache_ptr[(req_id * max_beam_width + sub_req_id) * + (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = vVal; // replica in the root iteration if (beam_depth == 1) { for (int i = 1; i < beam_width; i++) { - cache_ptr[(req_id * max_beam_width + i) * - (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; + kCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = kVal; + vCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = vVal; } } @@ -119,7 +110,7 @@ __global__ void spec_store_kv_cache( // naive cache stealing if (sub_req_id != parent_id) { - if (data_idx == 0 && head_idx == 0 && k_cache) { + if (offset == 0 && tok_id == 0) { printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " "%d, tok_id %d\n", beam_depth, @@ -132,14 +123,13 @@ __global__ void spec_store_kv_cache( for (int depth = 0; depth < beam_depth; depth++) { int steal_token_idx = tok_id - beam_depth + depth; int steal_from_idx = (req_id * max_beam_width + parent_id) * - (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + - steal_token_idx * proj_size + data_idx; + (hidden_size * max_seq_len) + + steal_token_idx * hidden_size + offset; int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + - steal_token_idx * proj_size + data_idx; - cache_ptr[steal_to_idx] = cache_ptr[steal_from_idx]; + (hidden_size * max_seq_len) + + steal_token_idx * hidden_size + offset; + kCache_ptr[steal_to_idx] = kCache_ptr[steal_from_idx]; + vCache_ptr[steal_to_idx] = vCache_ptr[steal_from_idx]; // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ // printf("cache stealing kernel!, steal_token_idx %d\n", @@ -173,8 +163,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); if (num_tokens > 0) { - int parallelism = - (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; + int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; spec_store_kv_cache<<kProjSize, m->vProjSize, num_tokens, - m->num_q_heads, - m->num_kv_heads, BatchConfig::max_sequence_length(), BeamSearchBatchConfig::MAX_BEAM_WIDTH, - /*root*/ curr_depth == 0); + /*root*/ curr_depth == 0, + m->hidden_size); } } @@ -239,12 +227,14 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int tokens_prev_requests_squares = 0; // int qkv_block_size = // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int q_block_size = m->qProjSize * num_tokens; - - int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); - int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); - int vt_req_block_size = vt_block_size * m->num_kv_heads; + int q_block_size = m->qProjSize; + + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); assert(m->qProjSize == 
m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -269,7 +259,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int m_ = num_new_tokens; int n = total_tokens; int k = m->qProjSize; - int lda = k, ldb = k, ldc = m_; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens; @@ -281,7 +272,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } // To get A, skip over Q entries from previous requests (same head) DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize; + tokens_previous_requests * m->qProjSize * m->num_q_heads * + QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) DT const *B = static_cast
(m->keyCache) + @@ -295,65 +287,29 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over QK^T products from previous requests DT *C = static_cast<DT *>
(m->qk_prods) + m->num_q_heads * tokens_prev_requests_squares; - - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } else { - strideB = 0; - int one_step_heads = m->num_q_heads / m->num_kv_heads; - m_ = num_new_tokens; - n = total_tokens; - k = m->qProjSize; - lda = k, ldb = k, ldc = m_; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A + step * strideA * one_step_heads, - cublas_data_type, - lda, - strideA, - B + step * kt_block_size, - cublas_data_type, - ldb, - strideB, - &beta, - C + step * strideC * one_step_heads, - cublas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - } + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // add alibi position bias to qk production // add alibi position bias to qk production if (*m->position_bias) { @@ -426,7 +382,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m_ = num_new_tokens; n = m->vProjSize; k = total_tokens; - lda = m_, ldb = n, ldc = m_; + lda = m_, ldb = n * m->num_q_heads, ldc = m_; strideA = num_new_tokens * total_tokens; strideB = vt_block_size; strideC = num_new_tokens * m->vProjSize; @@ -441,65 +397,29 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // requests C = static_cast
(m->attn_heads) + tokens_previous_requests * m->num_q_heads * m->vProjSize; - - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } else { - int one_step_heads = m->num_q_heads / m->num_kv_heads; - n = m->vProjSize; - lda = m_, ldb = n, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = 0; - strideC = num_new_tokens * m->vProjSize; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A + step * one_step_heads * strideA, - cublas_data_type, - lda, - strideA, - B + step * vt_block_size, - cublas_data_type, - ldb, - strideB, - &beta, - C + step * one_step_heads, - cublas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - } + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; @@ -508,8 +428,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, n = num_new_tokens; lda = k, ldb = n, ldc = m_; A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_kv_heads + - m->vProjSize * m->num_kv_heads); + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); B = C; C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; @@ -540,8 +460,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_kv_heads + - m->vProjSize * m->global_num_kv_heads; + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; apply_proj_bias_w<< 0 ? vProjSize : vSize); int one_head_size = qParas + kParas + vParas + oParas; - int weight_size = qParas * num_q_heads + kParas * num_kv_heads + - vParas * num_kv_heads + oParas * num_q_heads; + int weight_size = qParas * num_q_heads + kParas * num_q_heads + + vParas * num_q_heads + oParas * num_q_heads; { // compress the weight size if quantization. if (quantization_type != DT_NONE) { @@ -179,7 +179,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( if (qkv_bias || final_bias) { // q, k, v, o int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0)}; li->weights[1] = create_weight_legion_ordering(1, @@ -346,7 +346,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_kv_heads * (kParas + vParas); + this->num_q_heads * (kParas + vParas); dims[1].is_replica_dim = false; // dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { @@ -367,7 +367,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; bias_shape.dims[0].size = (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; @@ -461,7 +461,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[0].size = dims[0].degree; dims[1] = inputs[0]->dims[num_dims - 1]; dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_kv_heads * (kParas + vParas); + this->num_q_heads * (kParas + vParas); dims[1].is_replica_dim = false; // dims[2].size = qParas + kParas + vParas + oParas; if (quantization_type != DT_NONE) { @@ -480,7 +480,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_kv_heads; + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; bias_shape.dims[0].size = (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index c10cf9d0ca..1d9ebf67e0 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -42,40 +42,29 @@ __global__ void commit_tokens_kernel( int vProjSize, int num_tokens_to_commit, int num_active_tokens_in_last_batch, - int num_q_heads, - int num_kv_heads, - int max_seq_len) { - - CUDA_KERNEL_LOOP( - i, num_tokens_to_commit * (kProjSize + vProjSize) * num_kv_heads) { - bool k_cache = i < (num_tokens_to_commit * kProjSize * num_kv_heads); - int real_i = - k_cache ? i : i - (num_tokens_to_commit * kProjSize * num_kv_heads); - - int proj_size = k_cache ? kProjSize : vProjSize; - int data_idx = real_i % proj_size; - int head_idx = real_i / (num_tokens_to_commit * proj_size); - int token_pos = - (real_i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; + int max_seq_len, + int hidden_size) { + + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + + int token_pos = i / (hidden_size * KV_WEIGHT_NUM); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; + int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - int q_array_size = - qProjSize * num_active_tokens_in_last_batch * num_q_heads; - int k_array_size = - kProjSize * num_active_tokens_in_last_batch * num_kv_heads; + size_t val_idx = + token_idx_in_last_batch * 3 * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; - DT val = - devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_active_tokens_in_last_batch + - token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; - DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; - cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; } } @@ -85,8 +74,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, hipStream_t stream) { int num_tokens_to_commit = bc->num_tokens_to_commit; if (num_tokens_to_commit > 0) { - int parallelism = - (m->kProjSize + m->vProjSize) * num_tokens_to_commit * m->num_kv_heads; + int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit; hipLaunchKernelGGL( HIP_KERNEL_NAME(commit_tokens_kernel
), GET_BLOCKS(parallelism), @@ -102,9 +90,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch - m->num_q_heads, - m->num_kv_heads, - BatchConfig::max_sequence_length()); + BatchConfig::max_sequence_length(), + m->hidden_size); } } @@ -120,37 +107,23 @@ __global__ void update_tree_branch_kv_cache( int num_tokens_in_branch, int processed_tokens_in_batch, int total_tokens_in_batch, - int num_q_heads, - int num_kv_heads, - int max_seq_len) { - CUDA_KERNEL_LOOP( - i, num_tokens_in_branch * (kProjSize + vProjSize) * num_kv_heads) { - - int q_array_size = qProjSize * total_tokens_in_batch * num_q_heads; - int k_array_size = kProjSize * total_tokens_in_batch * num_kv_heads; - - bool k_cache = i < (num_tokens_in_branch * kProjSize * num_kv_heads); - int real_i = - k_cache ? i : i - (num_tokens_in_branch * kProjSize * num_kv_heads); - - int proj_size = k_cache ? kProjSize : vProjSize; - int data_idx = real_i % proj_size; - int token_idx = - (real_i / proj_size) % num_tokens_in_branch; // index in the tree branch - int head_idx = real_i / (proj_size * num_tokens_in_branch); + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { + int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch - DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * total_tokens_in_batch + - token_idx * proj_size + data_idx]; + size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - DT *cache_ptr = k_cache ? 
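Note: for tree verification, commit_tokens_kernel moves the K/V activations of tokens accepted in the previous batch from devQKVProjArray into the persistent cache, addressed by the committed token's request and depth; update_tree_branch_kv_cache applies the same indexing to the current branch's tokens. A host-side sketch of the commit step under the new layout (parameter names assumed, standing in for the committed-token info fields):

#include <cstddef>

// Commit verified tokens from the last batch's interleaved QKV buffer into the cache.
void commit_tokens_reference(const float *qkv, float *k_cache, float *v_cache,
                             const int *committed_token_index,    // index in last batch
                             const int *committed_request_index,  // destination request
                             const int *committed_token_depth,    // destination depth
                             int num_tokens_to_commit,
                             int hidden_size, int max_seq_len) {
  for (int t = 0; t < num_tokens_to_commit; t++)
    for (int offset = 0; offset < hidden_size; offset++) {
      int src_token = committed_token_index[t];
      size_t val_idx = (size_t)src_token * 3 * hidden_size + hidden_size + offset;
      int req_id = committed_request_index[t];
      int tok_id = committed_token_depth[t];
      size_t dst = (size_t)req_id * hidden_size * max_seq_len +
                   (size_t)tok_id * hidden_size + offset;
      k_cache[dst] = qkv[val_idx];                // K block of the source token
      v_cache[dst] = qkv[val_idx + hidden_size];  // V block of the source token
    }
}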
kCache_ptr : vCache_ptr; - - cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; } } @@ -194,11 +167,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int processed_tokens_in_batch = 0; // int qkv_block_size = // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize * bc->num_active_tokens(); - int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); - int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); - int vt_req_block_size = vt_block_size * m->num_kv_heads; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -221,8 +196,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { // update K-V cache - int parallelism = - (m->kProjSize + m->vProjSize) * num_new_tokens * m->num_kv_heads; + int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; hipLaunchKernelGGL( HIP_KERNEL_NAME(update_tree_branch_kv_cache
), GET_BLOCKS(parallelism), @@ -239,9 +213,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch m->num_active_tokens, // total_tokens_in_batch - m->num_q_heads, - m->num_kv_heads, - BatchConfig::max_sequence_length()); + BatchConfig::max_sequence_length(), + m->hidden_size); } // bc->token_last_available_idx[i] + 1; @@ -249,7 +222,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int m_ = num_new_tokens; int n = total_tokens_in_request; int k = m->qProjSize; - int lda = k, ldb = k, ldc = m_; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens_in_request; @@ -261,67 +235,38 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, } // To get A, skip over Q entries from previous requests (same head) DT const *A = static_cast
<DT *>(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize; + processed_tokens_in_batch * m->qProjSize * m->num_q_heads * + QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests DT *C = static_cast<DT *>
(m->qk_prods); - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } else { - strideB = 0; - int one_step_heads = m->num_q_heads / m->num_kv_heads; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A + step * strideA * one_step_heads, - hipblas_data_type, - lda, - strideA, - B + step * kt_block_size, - hipblas_data_type, - ldb, - strideB, - &beta, - C + step * strideC * one_step_heads, - hipblas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - } + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens_in_request * num_new_tokens; @@ -392,7 +337,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m_ = num_new_tokens; n = m->vProjSize; k = total_tokens_in_request; - lda = m_, ldb = n, ldc = m_; + lda = m_, ldb = n * m->num_q_heads, ldc = m_; strideA = num_new_tokens * total_tokens_in_request; strideB = vt_block_size; strideC = num_new_tokens * m->vProjSize; @@ -407,60 +352,29 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, C = static_cast
(m->attn_heads) + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } else { - int one_step_heads = m->num_q_heads / m->num_kv_heads; - strideB = 0; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A + step * one_step_heads * strideA, - hipblas_data_type, - lda, - strideA, - B + step * vt_block_size, - hipblas_data_type, - ldb, - strideB, - &beta, - C + step * one_step_heads * strideC, - hipblas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - } + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; @@ -469,8 +383,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, n = num_new_tokens; lda = k, ldb = n, ldc = m_; A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_kv_heads + - m->vProjSize * m->num_kv_heads); + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); B = C; C = static_cast
(output_ptr) + processed_tokens_in_batch * m->oProjSize; @@ -503,8 +417,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_kv_heads + - m->vProjSize * m->global_num_kv_heads; + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 5901c0e3ab..0da432b732 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -42,40 +42,29 @@ __global__ void commit_tokens_kernel( int vProjSize, int num_tokens_to_commit, int num_active_tokens_in_last_batch, - int num_q_heads, - int num_kv_heads, - int max_seq_len) { - - CUDA_KERNEL_LOOP( - i, num_tokens_to_commit * (kProjSize + vProjSize) * num_kv_heads) { - bool k_cache = i < (num_tokens_to_commit * kProjSize * num_kv_heads); - int real_i = - k_cache ? i : i - (num_tokens_to_commit * kProjSize * num_kv_heads); - - int proj_size = k_cache ? kProjSize : vProjSize; - int data_idx = real_i % proj_size; - int head_idx = real_i / (num_tokens_to_commit * proj_size); - int token_pos = - (real_i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; + int max_seq_len, + int hidden_size) { + + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + + int token_pos = i / (hidden_size * KV_WEIGHT_NUM); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; + int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - int q_array_size = - qProjSize * num_active_tokens_in_last_batch * num_q_heads; - int k_array_size = - kProjSize * num_active_tokens_in_last_batch * num_kv_heads; + size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size + + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; - DT val = - devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_active_tokens_in_last_batch + - token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; - DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; - cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; } } @@ -85,8 +74,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens_to_commit = bc->num_tokens_to_commit; if (num_tokens_to_commit > 0) { - int parallelism = - (m->kProjSize + m->vProjSize) * num_tokens_to_commit * m->num_kv_heads; + int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit; commit_tokens_kernel<<vProjSize, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch - m->num_q_heads, - m->num_kv_heads, - BatchConfig::max_sequence_length()); + BatchConfig::max_sequence_length(), + m->hidden_size); } } @@ -118,37 +105,26 @@ __global__ void update_tree_branch_kv_cache( int num_tokens_in_branch, int processed_tokens_in_batch, int total_tokens_in_batch, - int num_q_heads, - int num_kv_heads, - int max_seq_len) { - CUDA_KERNEL_LOOP( - i, num_tokens_in_branch * (kProjSize + vProjSize) * num_kv_heads) { - - int q_array_size = qProjSize * total_tokens_in_batch * num_q_heads; - int k_array_size = kProjSize * total_tokens_in_batch * num_kv_heads; - - bool k_cache = i < (num_tokens_in_branch * kProjSize * num_kv_heads); - int real_i = - k_cache ? 
i : i - (num_tokens_in_branch * kProjSize * num_kv_heads); + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { - int proj_size = k_cache ? kProjSize : vProjSize; - int data_idx = real_i % proj_size; - int token_idx = - (real_i / proj_size) % num_tokens_in_branch; // index in the tree branch - int head_idx = real_i / (proj_size * num_tokens_in_branch); + int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch - DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * total_tokens_in_batch + - token_idx * proj_size + data_idx]; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - DT *cache_ptr = k_cache ? kCache_ptr : vCache_ptr; - - cache_ptr[req_id * (num_kv_heads * max_seq_len * proj_size) + - head_idx * (max_seq_len * proj_size) + tok_id * proj_size + - data_idx] = val; + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; } } @@ -192,11 +168,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int processed_tokens_in_batch = 0; // int qkv_block_size = // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize * bc->num_active_tokens(); - int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); - int kt_req_block_size = kt_block_size * m->num_kv_heads; - int vt_block_size = m->vProjSize * BatchConfig::max_sequence_length(); - int vt_req_block_size = vt_block_size * m->num_kv_heads; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -219,8 +197,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { // update K-V cache - int parallelism = - (m->kProjSize + m->vProjSize) * num_new_tokens * m->num_kv_heads; + int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; update_tree_branch_kv_cache<<num_active_tokens, // total_tokens_in_batch - m->num_q_heads, - m->num_kv_heads, - BatchConfig::max_sequence_length()); + BatchConfig::max_sequence_length(), + m->hidden_size); } // bc->token_last_available_idx[i] + 1; @@ -245,7 +221,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int m_ = num_new_tokens; int n = total_tokens_in_request; int k = m->qProjSize; - int lda = k, ldb = k, ldc = m_; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; int strideA = q_block_size; int strideB = kt_block_size; int strideC = num_new_tokens * total_tokens_in_request; @@ -257,67 +234,37 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, } // To get A, skip over Q entries from previous requests (same head) DT const *A = static_cast
<DT *>(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize; + processed_tokens_in_batch * m->qProjSize * m->num_q_heads * + QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; // To get C, skip over QK^T products from previous requests DT *C = static_cast<DT *>
(m->qk_prods); - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } else { - strideB = 0; - int one_step_heads = m->num_q_heads / m->num_kv_heads; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A + step * strideA * one_step_heads, - cublas_data_type, - lda, - strideA, - B + step * kt_block_size, - cublas_data_type, - ldb, - strideB, - &beta, - C + step * strideC * one_step_heads, - cublas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - } + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // add alibi position bias to qk production // add alibi position bias to qk production if (*m->position_bias) { @@ -392,7 +339,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m_ = num_new_tokens; n = m->vProjSize; k = total_tokens_in_request; - lda = m_, ldb = n, ldc = m_; + lda = m_, ldb = n * m->num_q_heads, ldc = m_; strideA = num_new_tokens * total_tokens_in_request; strideB = vt_block_size; strideC = num_new_tokens * m->vProjSize; @@ -407,60 +354,29 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, C = static_cast
(m->attn_heads) + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - if (m->num_q_heads == m->num_kv_heads) { - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } else { - int one_step_heads = m->num_q_heads / m->num_kv_heads; - strideB = 0; - for (int step = 0; step < m->num_kv_heads; step++) { - checkCUDA( - cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A + step * one_step_heads * strideA, - cublas_data_type, - lda, - strideA, - B + step * vt_block_size, - cublas_data_type, - ldb, - strideB, - &beta, - C + step * one_step_heads * strideC, - cublas_data_type, - ldc, - strideC, - one_step_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - } + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Project to output, save result directly on output tensor alpha = 1.0f, beta = 0.0f; @@ -469,8 +385,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, n = num_new_tokens; lda = k, ldb = n, ldc = m_; A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_kv_heads + - m->vProjSize * m->num_kv_heads); + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); B = C; C = static_cast
(output_ptr) + processed_tokens_in_batch * m->oProjSize; @@ -503,8 +419,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_kv_heads + - m->vProjSize * m->global_num_kv_heads; + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; apply_proj_bias_w<< 1 and ("falcon" in model_name or "starcoder" in model_name): + if tp > 1 and ("falcon" in model_name): continue # skip tp=4 for big models if tp > 2 and ("7b" in model_name or "6.7b" in model_name): continue + + if full_precision and ("falcon" in model_name or "starcoder" in model_name): + continue _, after_slash = model_name.rsplit("/", maxsplit=1) filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" From f243b40382304b618d9d2312ec94907eae4f4167 Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Mon, 16 Oct 2023 13:42:48 -0400 Subject: [PATCH 250/344] Allow token arrangement align with request index in batch (#1176) * arrange tokens by request index in incr decoding. * fix logic. * fix issues. * format. * undo output format change. * format. * remove empty line in end of the file. --- src/ops/fused.cu | 10 +- src/runtime/request_manager.cc | 229 ++++++++++++++++----------------- 2 files changed, 120 insertions(+), 119 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 7d0d5841f0..b157453035 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1104,14 +1104,20 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back(weight_accessor[fused->op_weight_idx[i + woff]]); + weight_accessors_to_save.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { output_accessors_to_save.push_back(output_accessor[i + ooff]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], shard_id, bc, input_accessors_to_save, weight_accessors_to_save, output_accessors_to_save); + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1b825318dd..b5688c07e6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -338,6 +338,7 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); + // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { size_t guid = @@ -356,115 +357,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // log_req_mgr.print("Output: %s", output.c_str()); } } + // Step 2: prepare the next batch for existing requests BatchConfig new_bc; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { - continue; - } - 
assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].token_start_offset + - old_bc.requestsInfo[i].num_tokens_in_batch; - assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } - if (request_completed) { - request.status = Request::COMPLETED; - log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", - old_bc.requestsInfo[i].request_guid, - request.tokens.size()); - std::string output = this->tokenizer_->Decode(request.tokens); - - { - // update generation result and trigger future - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - log_req_mgr.print("Final output: %s", output.c_str()); - num_processed_requests++; - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); - } - } - - // std::cout << "print results: " << std::endl; - // for (int i = 0; i < request.tokens.size(); i++) { - // std::cout << request.tokens.at(i) << ", "; - // } - } else { - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - if (new_bc.requestsInfo[i].token_start_offset + 1 == - request.tokens.size()) { - // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - } else { - // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); - } - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < request.tokens.size()); - 
new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; - new_bc.num_tokens++; - } - // Update profiling - profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps++; - } - } - // Step 3: add new requests to the next batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (new_bc.request_completed[i]) { + if (old_bc.request_completed[i]) { // add new requests to the next batch if (!pending_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_request_queue.front(); @@ -473,7 +370,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens - + BatchConfig::max_requests_per_batch() + (i + 1), (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; @@ -496,8 +394,107 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, break; } } + } else { + assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + int processed_tokens = old_bc.requestsInfo[i].token_start_offset + + old_bc.requestsInfo[i].num_tokens_in_batch; + assert(processed_tokens < request.tokens.size()); + bool request_completed = false; + // printf("model_type = %d\n", this->model_type); + if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + request_completed = true; + } else if (request.tokens.back() == eos_token_id) { + // Encounter EOS token id + request_completed = true; + } + if (request_completed) { + request.status = Request::COMPLETED; + log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", + old_bc.requestsInfo[i].request_guid, + request.tokens.size()); + std::string output = this->tokenizer_->Decode(request.tokens); + + { + // update generation result and trigger future + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.output_tokens = request.tokens; + gr.output_text = output; + } + log_req_mgr.print("Final output: %s", output.c_str()); + num_processed_requests++; + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); + // Write output to file if needed: + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + outputFile << "end-to-end latency: " << std::fixed + << std::setprecision(3) << total_request_run_time + << std::endl; + outputFile << "num decoding steps: " << profile_info.decoding_steps + << std::endl; + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } + } + outputFile << std::endl; + outputFile << output; + outputFile.close(); + } else { 
+ std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + + } else { + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = + old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + if (new_bc.requestsInfo[i].token_start_offset + 1 == + request.tokens.size()) { + // Incremental phase + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + } else { + // Prompt phase + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].token_start_offset); + } + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].token_start_offset + j; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + assert(depth < request.tokens.size()); + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.num_tokens++; + } + // Update profiling + profiling_requests[new_bc.requestsInfo[i].request_guid] + .decoding_steps++; + } } } + return new_bc; } @@ -654,11 +651,10 @@ BeamSearchBatchConfig // Write output to file if needed: if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath); + std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) - << profile_info.finish_time - profile_info.start_time + << std::setprecision(3) << total_request_run_time << std::endl; outputFile << "num decoding steps: " << profile_info.decoding_steps << std::endl; @@ -671,6 +667,7 @@ BeamSearchBatchConfig } outputFile << std::endl; outputFile << output; + outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -1098,10 +1095,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { const std::lock_guard lock(request_queue_mutex); - if (verbose) { - std::cout - << "\n############### prepare_next_batch_verify ###############\n"; - } + std::cout << "\n############### prepare_next_batch_verify ###############\n"; + assert(old_batches.size() > 0); TreeVerifyBatchConfig new_bc; @@ -1277,8 +1272,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "max_prompt_load_size: " << max_prompt_load_size << std::endl; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << i << ", " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase @@ -1298,7 +1291,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } - if (new_bc.num_tokens + request.llm_cache_size >= request.initial_len) { + if (new_bc.requestsInfo[i].num_tokens_in_batch + + request.llm_cache_size >= + request.initial_len) { // launch the request into running phase after loading all prompt request.status = Request::RUNNING; new_bc.request_running[i] = true; From 4c06a0907ec694b21a989a51120e846d0f0cfa74 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 14:21:13 -0500 Subject: [PATCH 251/344] variable renaming (#1194) --- include/flexflow/batch_config.h | 2 +- include/flexflow/request_manager.h | 2 +- src/ops/inc_multihead_self_attention.cpp | 2 +- 
src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 2 +- src/runtime/batch_config.cc | 6 +- src/runtime/beam_search_batch_config.cc | 4 +- src/runtime/request_manager.cc | 66 +++++++++++-------- src/runtime/tree_verify_batch_config.cc | 4 +- 10 files changed, 50 insertions(+), 42 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 108bc8d172..25bc206bf9 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { int num_tokens; struct PerRequestInfo { - int token_start_offset; + int first_token_depth_in_request; int num_tokens_in_batch; int max_sequence_length; RequestGuid request_guid; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3081aaa1c2..baf6844801 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -154,7 +154,7 @@ class RequestManager { std::vector> traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, - int token_start_offset); + int first_token_depth_in_request); // remove guid after put the cached tree in request std::vector> merge_dfs_trees( diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 562898a220..37cc986f5e 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -532,7 +532,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 00d45a9cfa..6ec077c328 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -531,7 +531,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 173d4a5b1d..1d81ae0c11 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -231,7 +231,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // int total_tokens = bc->token_last_available_idx[i] + 1; int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 00eec96824..8b89acf3b7 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -248,7 +248,7 @@ void 
compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // int total_tokens = bc->token_last_available_idx[i] + 1; int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; if (num_new_tokens <= 0) { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 72572c4e06..4781f09cab 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -27,7 +27,7 @@ using Legion::Memory; BatchConfig::BatchConfig() : num_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - requestsInfo[i].token_start_offset = 0; + requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].num_tokens_in_batch = 0; request_completed[i] = true; } @@ -104,8 +104,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 811ef00ba2..f785dc5b74 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -126,8 +126,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b5688c07e6..1c5a6ae5da 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -367,7 +367,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; - new_bc.requestsInfo[i].token_start_offset = 0; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens - @@ -382,7 +382,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < new_request.tokens.size()); @@ -397,8 +397,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig 
const &old_bc, } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].token_start_offset + - old_bc.requestsInfo[i].num_tokens_in_batch; + int processed_tokens = + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); bool request_completed = false; // printf("model_type = %d\n", this->model_type); @@ -464,12 +465,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - if (new_bc.requestsInfo[i].token_start_offset + 1 == + if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; @@ -478,10 +479,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].first_token_depth_in_request); } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < request.tokens.size()); @@ -685,7 +686,7 @@ BeamSearchBatchConfig new_bc.request_running[i] = true; // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = + new_bc.requestsInfo[i].first_token_depth_in_request = verified_tokens.front().second; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; @@ -694,9 +695,10 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].token_start_offset - - verified_tokens.size(); + int new_max_depth = + new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].first_token_depth_in_request - + verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].beam_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -742,7 +744,8 @@ BeamSearchBatchConfig assert(request.ssm_cache_size == request.initial_len); // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = request.ssm_cache_size; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.ssm_cache_size; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -776,7 +779,7 @@ BeamSearchBatchConfig Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; - new_bc.requestsInfo[i].token_start_offset = 0; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; 
new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, @@ -806,7 +809,7 @@ BeamSearchBatchConfig new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < new_request.tokens.size()); @@ -922,7 +925,7 @@ BeamSearchBatchConfig // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].token_start_offset + + int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; // assert(processed_tokens < request.tokens.size()); @@ -937,7 +940,8 @@ BeamSearchBatchConfig // // old_bc.beamRequestsInfo[i].max_depth); // // // new_bc.request_completed[i] = true; // // new_bc.request_completed[i] = false; - // // new_bc.requestsInfo[i].token_start_offset = processed_tokens; + // // new_bc.requestsInfo[i].first_token_depth_in_request = + // processed_tokens; // // new_bc.requestsInfo[i].request_guid = // // old_bc.requestsInfo[i].request_guid; // // new_bc.requestsInfo[i].max_sequence_length = @@ -953,7 +957,7 @@ BeamSearchBatchConfig log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -986,7 +990,8 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. 
// update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), // i); - if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { + if (new_bc.requestsInfo[i].first_token_depth_in_request >= + request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { new_bc.requestsInfo[i].num_tokens_in_batch = 1; @@ -1006,7 +1011,7 @@ BeamSearchBatchConfig std::min(get_max_tokens_per_batch() - new_bc.num_tokens - BatchConfig::max_requests_per_batch() + i, (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; if (verbose) { std::cout << "[ Beam Spec] " << request.guid << std::endl; @@ -1027,7 +1032,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.sub_requests[i]; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1151,7 +1156,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = + new_bc.requestsInfo[i].first_token_depth_in_request = dfs_tree_inputs.front().second; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; @@ -1204,7 +1209,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } - new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.tokens.size() - 1; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { @@ -1257,7 +1263,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.llm_cache_size; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = @@ -1265,9 +1272,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - max_prompt_load_size, - (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(max_prompt_load_size, + (int)request.initial_len - + new_bc.requestsInfo[i].first_token_depth_in_request); max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; std::cout << "max_prompt_load_size: " << max_prompt_load_size @@ -1673,7 +1681,7 @@ std::vector> std::vector> RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, - int token_start_offset) { + int first_token_depth_in_request) { if (verbose) { std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; @@ -1709,7 +1717,7 @@ std::vector> << serializedTree.size() << "\n"; } for (int k = 0; k < serializedTree.size(); k++) { - serializedTree.at(k).second += token_start_offset; + serializedTree.at(k).second += first_token_depth_in_request; if (verbose) { std::cout << "token id: " << serializedTree.at(k).first << ", depth: " << serializedTree.at(k).second << 
"\n"; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index cb68ecc5f1..6dbcaceaa4 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -47,8 +47,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; From fb0b21cf78f61d2a553f7940f9207469fe20696b Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 18 Oct 2023 16:54:35 -0500 Subject: [PATCH 252/344] Add `first_token_offset_in_batch` to indicate the offset of the request's first token in a `BatchConfig` (#1197) * Add first_token_offset_in_batch to indicate the offset of the request's first token in a BatchConfig * format --- include/flexflow/batch_config.h | 1 + src/ops/inc_multihead_self_attention.cu | 2 ++ src/ops/spec_inc_multihead_self_attention.cu | 3 ++- src/ops/tree_inc_multihead_self_attention.cu | 2 ++ src/runtime/batch_config.cc | 5 ++++- src/runtime/beam_search_batch_config.cc | 4 +++- src/runtime/request_manager.cc | 8 ++++++++ src/runtime/tree_verify_batch_config.cc | 4 +++- 8 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 25bc206bf9..d625985552 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -62,6 +62,7 @@ class BatchConfig { struct PerRequestInfo { int first_token_depth_in_request; + int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; RequestGuid request_guid; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 6ec077c328..ced1459b59 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -530,6 +530,8 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, if (bc->request_completed[i]) { continue; } + assert(tokens_previous_requests == + bc->requestsInfo[i].first_token_offset_in_batch); int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 8b89acf3b7..fddbd252b6 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -241,7 +241,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (bc->request_completed[i]) { continue; } - + assert(tokens_previous_requests == + bc->requestsInfo[i].first_token_offset_in_batch); for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { // int num_new_tokens = bc->num_processing_tokens[i]; diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 0da432b732..98a9c6557a 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -181,6 +181,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, if 
(bc->request_completed[i]) { continue; } + assert(processed_tokens_in_batch == + bc->requestsInfo[i].first_token_offset_in_batch); int last_token_idx_of_the_request = processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; while (processed_tokens_in_batch <= last_token_idx_of_the_request) { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 4781f09cab..d2fbc0883f 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -28,6 +28,7 @@ using Legion::Memory; BatchConfig::BatchConfig() : num_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].first_token_depth_in_request = 0; + requestsInfo[i].first_token_offset_in_batch = 0; requestsInfo[i].num_tokens_in_batch = 0; request_completed[i] = true; } @@ -104,8 +105,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " + os << " First token depth in request: " << bc.requestsInfo[i].first_token_depth_in_request << std::endl; + os << " First token offset in batch: " + << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index f785dc5b74..74843e9460 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -126,8 +126,10 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " + os << " First token depth in request: " << bc.requestsInfo[i].first_token_depth_in_request << std::endl; + os << " First token offset in batch: " + << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1c5a6ae5da..4d232b6d44 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -368,6 +368,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens - @@ -466,6 +467,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { new_bc.request_completed[i] = false; new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = @@ -688,6 +690,7 @@ BeamSearchBatchConfig // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = verified_tokens.front().second; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; 
new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = @@ -746,6 +749,7 @@ BeamSearchBatchConfig // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = request.ssm_cache_size; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -780,6 +784,7 @@ BeamSearchBatchConfig pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, @@ -958,6 +963,7 @@ BeamSearchBatchConfig << new_bc.num_tokens; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -1158,6 +1164,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = dfs_tree_inputs.front().second; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = @@ -1265,6 +1272,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = request.llm_cache_size; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 6dbcaceaa4..841c735f59 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -47,8 +47,10 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " + os << " First token depth in request: " << bc.requestsInfo[i].first_token_depth_in_request << std::endl; + os << " First token offset in batch: " + << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; From caf5d61a4fdfc71a10667aeb0bec8f841c67599d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 22 Oct 2023 13:26:48 -0500 Subject: [PATCH 253/344] Update the data layout of m->attn_heads (#1204) * Update the data layout of m->attn_heads to make it consistent with others * move output product out of loop --------- Co-authored-by: xinhaoc --- src/ops/inc_multihead_self_attention.cu | 85 ++++++++-------- src/ops/spec_inc_multihead_self_attention.cu | 100 +++++++++---------- src/ops/tree_inc_multihead_self_attention.cu | 87 ++++++++-------- 3 files changed, 134 insertions(+), 138 
deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ced1459b59..3b24a5a324 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -650,19 +650,19 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; + m_ = m->vProjSize; + n = num_new_tokens; k = total_tokens; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + // padding) - B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast<DT *>
(m->attn_heads) + @@ -690,40 +690,41 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); tokens_previous_requests += num_new_tokens; } + // Project to output, save result directly on output tensor + DT alpha = 1.0f, beta = 0.0f; + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = bc->num_active_tokens(); + int lda = k, ldb = k, ldc = m_; + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + DT const *B = static_cast
<DT *>(m->attn_heads); + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index fddbd252b6..ac74eb1c8f 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -223,7 +223,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; + // int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; @@ -241,10 +241,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (bc->request_completed[i]) { continue; } - assert(tokens_previous_requests == - bc->requestsInfo[i].first_token_offset_in_batch); for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - // int num_new_tokens = bc->num_processing_tokens[i]; // int total_tokens = bc->token_last_available_idx[i] + 1; @@ -273,8 +270,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } // To get A, skip over Q entries from previous requests (same head) DT const *A = static_cast
<DT *>(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) DT const *B = static_cast<DT *>
(m->keyCache) + @@ -380,24 +377,25 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; + m_ = m->vProjSize; + n = num_new_tokens; k = total_tokens; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + // padding) - B = static_cast
<DT *>(m->valueCache) + + A = static_cast
<DT *>(m->valueCache) + (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast<DT *>
(m->attn_heads) + - tokens_previous_requests * m->num_q_heads * m->vProjSize; + bc->requestsInfo[i].first_token_offset_in_batch * m->num_q_heads * + m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -422,42 +420,42 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + - tokens_previous_requests * m->oProjSize; - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - tokens_previous_requests += num_new_tokens; + // tokens_previous_requests += num_new_tokens; tokens_prev_requests_squares += num_new_tokens * total_tokens; } } + + // Project to output, save result directly on output tensor + DT alpha = 1.0f, beta = 0.0f; + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = bc->num_active_tokens(); + int lda = k, ldb = k, ldc = m_; + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + DT const *B = static_cast
<DT *>(m->attn_heads); + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 98a9c6557a..edf7a2d075 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -338,24 +338,23 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, C_softmax)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; + m_ = m->vProjSize; + n = num_new_tokens; k = total_tokens_in_request; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens_in_request; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens_in_request; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + // padding) - B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast<DT *>
(m->attn_heads) + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -379,45 +378,43 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + - processed_tokens_in_batch * m->oProjSize; - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); processed_tokens_in_batch += num_new_tokens; } // Before moving to the next request // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } + // Project to output, save result directly on output tensor + DT alpha = 1.0f, beta = 0.0f; + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = processed_tokens_in_batch; + int lda = k, ldb = k, ldc = m_; + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + DT const *B = static_cast
<DT *>(m->attn_heads); + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + From dd9f62d2122df87fcec96694fd41fc37abfcfb4c Mon Sep 17 00:00:00 2001 From: DerrickYLJ <99985904+DerrickYLJ@users.noreply.github.com> Date: Mon, 23 Oct 2023 00:03:33 -0400 Subject: [PATCH 254/344] Pre-build Legion library (#1042) * add optional flag for building legion only * added build path and legion-only flag * bug fix * pass new variable with config file * move nccl * bug fix * add cuda_arch list * export position move * cd into legion * quick fix * retrieve os version and cd directory * using ubuntu * directory fix * bug fix * add touch * create the release to flexflow-third-party * bug fix * bug fix * fix indentation * fix * bash launching * bug fix * bug fix * extract tar file` * bug fix * add parameter * bash fix * python version * bug fix * bug fix * bug fix * bug fix * bug fix * build bash * bug fix * bug fix * bug fix * bug fix * bug fix * auto running docker container * renew bash script * bug fix * bug fix * bug fix * non-running container * bug fix * make it easier to switch between inference and master branch * multiple fixes * bug fix * bug fix * add python version * bug fix * restore * enable building docker images for different hip versions * ignore shellcheck error code * support hip compilation in inference cmake files * fix * workflow and hardcode * bug fix * fix * cmake fix * python versions * cmake fixes * cmake fixes * move install * order * bug fix * nested if condition fix * update docker workflow and config scripts * update scripts * fix * fix * cleanup * rocm 5.6 by default in workflow * move outside * update workflow * incorp install.sh * bug fix * fix * fix * fix * bg fix * fix permissions * bug fix * bug fix * bug fix * bug fix * updated * bug fix * fix workflow * check * check * bug fix * fix * add python env * fix * cleanup * update workflow * newline * added runner * added endif * Code Cleanup * restore to self-hosted * bug fix * fix * fix * update workflow * fixes * fix cmake for hip rocm --------- Co-authored-by: Gabriele Oliaro --- .github/workflows/helpers/prebuild_legion.sh | 75 +++ .github/workflows/prebuild-legion.yml | 84 ++++ CMakeLists.txt | 468 ++++++++++--------- cmake/cuda.cmake | 2 +- config/config.inc | 11 +- config/config.linux | 5 +- docker/build.sh | 9 +- docker/flexflow-environment/Dockerfile | 31 +- docker/flexflow/Dockerfile | 9 + 9 files changed, 452 insertions(+), 242 deletions(-) create mode 100755 .github/workflows/helpers/prebuild_legion.sh create mode 100644 .github/workflows/prebuild-legion.yml diff --git a/.github/workflows/helpers/prebuild_legion.sh b/.github/workflows/helpers/prebuild_legion.sh new file mode 100755 index 0000000000..ccaa58383e --- /dev/null +++ b/.github/workflows/helpers/prebuild_legion.sh @@ -0,0 +1,75 @@ +#! /usr/bin/env bash +set -euo pipefail + +# Parse input params +python_version=${python_version:-"empty"} +gpu_backend=${gpu_backend:-"empty"} +gpu_backend_version=${gpu_backend_version:-"empty"} + +if [[ "${gpu_backend}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then + echo "Error, value of gpu_backend (${gpu_backend}) is invalid. 
Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'." + exit 1 +else + echo "Pre-building Legion with GPU backend: ${gpu_backend}" +fi + +if [[ "${gpu_backend}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then + # Check that CUDA version is supported. Versions above 12.0 not supported because we don't publish docker images for it yet. + if [[ "$gpu_backend_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0}" + exit 1 + fi + export cuda_version="$gpu_backend_version" +elif [[ "${gpu_backend}" == "hip_rocm" ]]; then + # Check that HIP version is supported + if [[ "$gpu_backend_version" != @(5.3|5.4|5.5|5.6) ]]; then + echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}" + exit 1 + fi + export hip_version="$gpu_backend_version" +else + echo "gpu backend: ${gpu_backend} and gpu_backend_version: ${gpu_backend_version} not yet supported." + exit 1 +fi + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +export FF_GPU_BACKEND="${gpu_backend}" +export FF_CUDA_ARCH=all +export FF_HIP_ARCH=all +export BUILD_LEGION_ONLY=ON +export INSTALL_DIR="/usr/legion" +export python_version="${python_version}" + +# Build Docker Flexflow Container +echo "building docker" +../../../docker/build.sh flexflow + +# Cleanup any existing container with the same name +docker rm prelegion || true + +# Create container to be able to copy data from the image +docker create --name prelegion flexflow-"${gpu_backend}"-"${gpu_backend_version}":latest + +# Copy legion libraries to host +echo "extract legion library assets" +mkdir -p ../../../prebuilt_legion_assets +rm -rf ../../../prebuilt_legion_assets/tmp || true +docker cp prelegion:$INSTALL_DIR ../../../prebuilt_legion_assets/tmp + + +# Create the tarball file +cd ../../../prebuilt_legion_assets/tmp +export LEGION_TARBALL="legion_ubuntu-20.04_${gpu_backend}-${gpu_backend_version}_py${python_version}.tar.gz" + +echo "Creating archive $LEGION_TARBALL" +tar -zcvf "../$LEGION_TARBALL" ./ +cd .. +echo "Checking the size of the Legion tarball..." 
+du -h "$LEGION_TARBALL" + + +# Cleanup +rm -rf tmp/* +docker rm prelegion diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml new file mode 100644 index 0000000000..00e7e78a77 --- /dev/null +++ b/.github/workflows/prebuild-legion.yml @@ -0,0 +1,84 @@ +name: "prebuild-legion" +on: + push: + branches: + - "inference" + paths: + - "cmake/**" + - "config/**" + - "deps/legion/**" + - ".github/workflows/helpers/install_dependencies.sh" + workflow_dispatch: +concurrency: + group: prebuild-legion-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + prebuild-legion: + name: Prebuild Legion with CMake + runs-on: ubuntu-20.04 + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + strategy: + matrix: + gpu_backend: ["cuda", "hip_rocm"] + gpu_backend_version: ["11.8", "5.6"] + python_version: "3.11" + exclude: + - gpu_backend: "cuda" + gpu_backend_version: "5.6" + - gpu_backend: "hip_rocm" + gpu_backend_version: "11.8" + fail-fast: false + steps: + - name: Checkout Git Repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Free additional space on runner + run: .github/workflows/helpers/free_space_on_runner.sh + + - name: Build Legion + env: + FF_GPU_BACKEND: ${{ matrix.gpu_backend }} + run: .github/workflows/helpers/prebuild_legion.sh + + - name: Archive compiled Legion library (CUDA) + env: + FF_GPU_BACKEND: ${{ matrix.gpu_backend }} + uses: actions/upload-artifact@v3 + with: + name: legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }} + path: prebuilt_legion_assets/legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }}.tar.gz + + create-release: + name: Create new release + runs-on: ubuntu-20.04 + needs: prebuild-legion + steps: + - name: Checkout Git Repository + uses: actions/checkout@v3 + - name: Free additional space on runner + run: .github/workflows/helpers/free_space_on_runner.sh + - name: Create folder for artifacts + run: mkdir artifacts unwrapped_artifacts + - name: Download artifacts + uses: actions/download-artifact@v3 + with: + path: ./artifacts + - name: Display structure of downloaded files + working-directory: ./artifacts + run: ls -R + - name: Unwrap all artifacts + working-directory: ./artifacts + run: find . 
-maxdepth 2 -mindepth 2 -type f -name "*.tar.gz" -exec mv {} ../unwrapped_artifacts/ \; + - name: Get datetime + run: echo "RELEASE_DATETIME=$(date '+%Y-%m-%dT%H-%M-%S')" >> $GITHUB_ENV + - name: Release + env: + NAME: ${{ env.RELEASE_DATETIME }} + TAG_NAME: ${{ env.RELEASE_DATETIME }} + GITHUB_TOKEN: ${{ secrets.FLEXFLOW_TOKEN }} + run: gh release create $TAG_NAME ./unwrapped_artifacts/*.tar.gz --repo flexflow/flexflow-third-party diff --git a/CMakeLists.txt b/CMakeLists.txt index 32399ed4d8..648b46b49e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,9 @@ option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" ON) # option for using Python option(FF_USE_PYTHON "Enable Python" ON) +# option for building legion only +option(BUILD_LEGION_ONLY "Build Legion only" OFF) + # option to download pre-compiled NCCL/Legion libraries option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if available" ON) option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON) @@ -235,266 +238,271 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") include(cudnn) endif() -# NCCL -if(FF_USE_NCCL) - if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") - include(nccl) - endif() - list(APPEND FF_CC_FLAGS - -DFF_USE_NCCL) - list(APPEND FF_NVCC_FLAGS - -DFF_USE_NCCL) -endif() - -# Inference tests -if(INFERENCE_TESTS) - list(APPEND FF_CC_FLAGS - -DINFERENCE_TESTS) - list(APPEND FF_NVCC_FLAGS - -DINFERENCE_TESTS) -endif() - # Legion include(legion) -# json -include(json) - -# variant -include(variant) - -# optional -include(optional) - -if (FF_GPU_BACKEND STREQUAL "cuda") - list(APPEND FF_CC_FLAGS - -DFF_USE_CUDA) - list(APPEND FF_NVCC_FLAGS - -DFF_USE_CUDA) -elseif (FF_GPU_BACKEND STREQUAL "hip_cuda") - list(APPEND FF_CC_FLAGS - -DFF_USE_HIP_CUDA) - list(APPEND FF_HIPCC_FLAGS - -DFF_USE_HIP_CUDA) -elseif (FF_GPU_BACKEND STREQUAL "hip_rocm") - list(APPEND FF_CC_FLAGS - -DFF_USE_HIP_ROCM) - list(APPEND FF_HIPCC_FLAGS - -DFF_USE_HIP_ROCM) -else() -endif() +# Not build FlexFlow if BUILD_LEGION_ONLY is ON +if(NOT BUILD_LEGION_ONLY) + # NCCL + if(FF_USE_NCCL) + if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") + include(nccl) + endif() + list(APPEND FF_CC_FLAGS + -DFF_USE_NCCL) + list(APPEND FF_NVCC_FLAGS + -DFF_USE_NCCL) + endif() -# Start build FlexFlow -if (CMAKE_BUILD_TYPE STREQUAL "Debug") + # Inference tests + if(INFERENCE_TESTS) list(APPEND FF_CC_FLAGS - -DFF_DEBUG) + -DINFERENCE_TESTS) list(APPEND FF_NVCC_FLAGS - -DFF_DEBUG) -endif() + -DINFERENCE_TESTS) + endif() + + # json + include(json) + + # variant + include(variant) + + # optional + include(optional) + + if (FF_GPU_BACKEND STREQUAL "cuda") + list(APPEND FF_CC_FLAGS + -DFF_USE_CUDA) + list(APPEND FF_NVCC_FLAGS + -DFF_USE_CUDA) + elseif (FF_GPU_BACKEND STREQUAL "hip_cuda") + list(APPEND FF_CC_FLAGS + -DFF_USE_HIP_CUDA) + list(APPEND FF_HIPCC_FLAGS + -DFF_USE_HIP_CUDA) + elseif (FF_GPU_BACKEND STREQUAL "hip_rocm") + list(APPEND FF_CC_FLAGS + -DFF_USE_HIP_ROCM) + list(APPEND FF_HIPCC_FLAGS + -DFF_USE_HIP_ROCM) + else() + endif() -message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}") -message(STATUS "LEGION_MAX_RETURN_SIZE: ${LEGION_MAX_RETURN_SIZE}") + # Start build FlexFlow + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND FF_CC_FLAGS + -DFF_DEBUG) + list(APPEND FF_NVCC_FLAGS + -DFF_DEBUG) + endif() -list(APPEND FF_CC_FLAGS - -DMAX_TENSOR_DIM=${FF_MAX_DIM} - -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}) + 
message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}") + message(STATUS "LEGION_MAX_RETURN_SIZE: ${LEGION_MAX_RETURN_SIZE}") -if(FF_USE_AVX2) list(APPEND FF_CC_FLAGS - -DFF_USE_AVX2 - -mavx2) -endif() - -list(APPEND FF_NVCC_FLAGS - -Wno-deprecated-gpu-targets - -DMAX_TENSOR_DIM=${FF_MAX_DIM} - -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}) - -list(APPEND FF_LD_FLAGS - -lrt - -ldl - -rdynamic - -lstdc++fs) - -# Set FF FLAGS -add_compile_options(${FF_CC_FLAGS}) -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${FF_NVCC_FLAGS} -UNDEBUG) -link_libraries(${FF_LD_FLAGS}) - -list(APPEND FLEXFLOW_INCLUDE_DIRS - ${FLEXFLOW_ROOT}/include - ${FLEXFLOW_ROOT}) - -file(GLOB_RECURSE FLEXFLOW_HDR - LIST_DIRECTORIES False - ${FLEXFLOW_ROOT}/include/*.h) - list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) - -file(GLOB_RECURSE FLEXFLOW_SRC - LIST_DIRECTORIES False - ${FLEXFLOW_ROOT}/src/*.cc) -list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") -list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) - -set(FLEXFLOW_CPP_DRV_SRC - ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) - -add_library(substitution_loader SHARED - ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) -target_include_directories(substitution_loader PRIVATE ${FLEXFLOW_INCLUDE_DIRS}) -target_link_libraries(substitution_loader nlohmann_json::nlohmann_json) + -DMAX_TENSOR_DIM=${FF_MAX_DIM} + -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}) + if(FF_USE_AVX2) + list(APPEND FF_CC_FLAGS + -DFF_USE_AVX2 + -mavx2) + endif() -#message("FLEXFLOW_INCLUDE_DIRS: ${FLEXFLOW_INCLUDE_DIRS}") + list(APPEND FF_NVCC_FLAGS + -Wno-deprecated-gpu-targets + -DMAX_TENSOR_DIM=${FF_MAX_DIM} + -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}) + + list(APPEND FF_LD_FLAGS + -lrt + -ldl + -rdynamic + -lstdc++fs) + + # Set FF FLAGS + add_compile_options(${FF_CC_FLAGS}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${FF_NVCC_FLAGS} -UNDEBUG) + link_libraries(${FF_LD_FLAGS}) + + list(APPEND FLEXFLOW_INCLUDE_DIRS + ${FLEXFLOW_ROOT}/include + ${FLEXFLOW_ROOT}) + + file(GLOB_RECURSE FLEXFLOW_HDR + LIST_DIRECTORIES False + ${FLEXFLOW_ROOT}/include/*.h) + + list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) -# compile flexflow lib -if (FF_GPU_BACKEND STREQUAL "cuda") - file(GLOB_RECURSE FLEXFLOW_GPU_SRC + file(GLOB_RECURSE FLEXFLOW_SRC LIST_DIRECTORIES False - ${FLEXFLOW_ROOT}/src/*.cu) + ${FLEXFLOW_ROOT}/src/*.cc) + + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") + list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) - add_compile_definitions(FF_USE_CUDA) + set(FLEXFLOW_CPP_DRV_SRC + ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) - if(BUILD_SHARED_LIBS) - cuda_add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE}) - else() - cuda_add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE}) - endif() -elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") - file(GLOB_RECURSE FLEXFLOW_GPU_SRC - LIST_DIRECTORIES False - ${FLEXFLOW_ROOT}/src/*.cpp) + add_library(substitution_loader SHARED + ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) + target_include_directories(substitution_loader PRIVATE ${FLEXFLOW_INCLUDE_DIRS}) + target_link_libraries(substitution_loader nlohmann_json::nlohmann_json) - if(BUILD_SHARED_LIBS) - add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) - else() - add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) - endif() - list(APPEND 
CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH}) + #message("FLEXFLOW_INCLUDE_DIRS: ${FLEXFLOW_INCLUDE_DIRS}") - find_package(hip REQUIRED) + # compile flexflow lib + if (FF_GPU_BACKEND STREQUAL "cuda") + file(GLOB_RECURSE FLEXFLOW_GPU_SRC + LIST_DIRECTORIES False + ${FLEXFLOW_ROOT}/src/*.cu) - if (FF_GPU_BACKEND STREQUAL "hip_cuda") - # The targets defined by the hip cmake config only target amd devices. - # For targeting nvidia devices, we'll make our own interface target, - # hip_device_nvidia, that includes the rocm and hip headers. - add_library(hip_device_nvidia INTERFACE) + add_compile_definitions(FF_USE_CUDA) - if (NOT FF_CUDA_ARCH STREQUAL "") - target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH}) + if(BUILD_SHARED_LIBS) + cuda_add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE}) + else() + cuda_add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE}) endif() + elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm") + file(GLOB_RECURSE FLEXFLOW_GPU_SRC + LIST_DIRECTORIES False + ${FLEXFLOW_ROOT}/src/*.cpp) - target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include) - target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include) - - add_compile_definitions(FF_USE_HIP_CUDA) - - # Linking cuda: - # We do not explicitly link cuda. hipcc when targeting nvidia will - # use nvcc under the hood. nvcc when used for linking will handle - # linking cuda dependencies - target_link_libraries(flexflow hip_device_nvidia) - elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") - find_package(hipblas REQUIRED) - find_package(miopen REQUIRED) - if(FF_USE_NCCL) - find_package(rccl REQUIRED) + if(BUILD_SHARED_LIBS) + add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) + else() + add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) endif() - # find_package(rocrand REQUIRED) - find_library(HIP_RAND_LIBRARY hiprand REQUIRED) - add_compile_definitions(FF_USE_HIP_ROCM) - - if (FF_HIP_ARCH STREQUAL "") - message(FATAL_ERROR "FF_HIP_ARCH is undefined") - endif() - set_property(TARGET flexflow PROPERTY HIP_ARCHITECTURES "${HIP_ARCH_LIST}") - - message(STATUS "FF_GPU_BACKEND: ${FF_GPU_BACKEND}") - message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}") - message(STATUS "HIP_ARCH_LIST: ${HIP_ARCH_LIST}") - get_property(CHECK_HIP_ARCHS TARGET flexflow PROPERTY HIP_ARCHITECTURES) - message(STATUS "CHECK_HIP_ARCHS: ${CHECK_HIP_ARCHS}") - message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") - - # The hip cmake config module defines three targets, - # hip::amdhip64, hip::host, and hip::device. - # - # hip::host and hip::device are interface targets. hip::amdhip64 is an - # imported target for libamdhip. - # - # You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64 - # and hip::device links to hip::host. Link to hip::host to just use hip without - # compiling any GPU code. Link to hip::device to compile the GPU device code. - # - # Docs (outdated): - # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html - target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY}) - if(FF_USE_NCCL) + list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH}) + + find_package(hip REQUIRED) + + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + # The targets defined by the hip cmake config only target amd devices. 
+ # For targeting nvidia devices, we'll make our own interface target, + # hip_device_nvidia, that includes the rocm and hip headers. + add_library(hip_device_nvidia INTERFACE) + + if (NOT FF_CUDA_ARCH STREQUAL "") + target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH}) + endif() + + target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include) + target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include) + + add_compile_definitions(FF_USE_HIP_CUDA) + + # Linking cuda: + # We do not explicitly link cuda. hipcc when targeting nvidia will + # use nvcc under the hood. nvcc when used for linking will handle + # linking cuda dependencies + target_link_libraries(flexflow hip_device_nvidia) + elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + find_package(hipblas REQUIRED) + find_package(miopen REQUIRED) + if(FF_USE_NCCL) + find_package(rccl REQUIRED) + endif() + # find_package(rocrand REQUIRED) + find_library(HIP_RAND_LIBRARY hiprand REQUIRED) + + add_compile_definitions(FF_USE_HIP_ROCM) + + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is undefined") + endif() + set_property(TARGET flexflow PROPERTY HIP_ARCHITECTURES "${HIP_ARCH_LIST}") + + message(STATUS "FF_GPU_BACKEND: ${FF_GPU_BACKEND}") + message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}") + message(STATUS "HIP_ARCH_LIST: ${HIP_ARCH_LIST}") + get_property(CHECK_HIP_ARCHS TARGET flexflow PROPERTY HIP_ARCHITECTURES) + message(STATUS "CHECK_HIP_ARCHS: ${CHECK_HIP_ARCHS}") + message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") + + # The hip cmake config module defines three targets, + # hip::amdhip64, hip::host, and hip::device. + # + # hip::host and hip::device are interface targets. hip::amdhip64 is an + # imported target for libamdhip. + # + # You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64 + # and hip::device links to hip::host. Link to hip::host to just use hip without + # compiling any GPU code. Link to hip::device to compile the GPU device code. + # + # Docs (outdated): + # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html + target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY}) + if(FF_USE_NCCL) target_link_libraries(flexflow rccl) + endif() endif() + else() + message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}") endif() -else() - message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}") -endif() -if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")) - add_dependencies(flexflow ${NCCL_NAME}) -endif() + if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")) + add_dependencies(flexflow ${NCCL_NAME}) + endif() -target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) -# LEGION_URL is defined if we found a precompiled Legion library to download -if(LEGION_URL) - # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime. - # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files. - target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) - add_dependencies(flexflow ${LEGION_NAME}) -else() - # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the - # Legion and Realm runtimes. 
The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need - # to link FlexFlow to ${LEGION_LIBRARY} - target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) -endif() + target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) + # LEGION_URL is defined if we found a precompiled Legion library to download + if(LEGION_URL) + # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime. + # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files. + target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) + add_dependencies(flexflow ${LEGION_NAME}) + else() + # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the + # Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need + # to link FlexFlow to ${LEGION_LIBRARY} + target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) + endif() -#library api version, bump from time to time -set(SOVERSION 1) - -set_target_properties(flexflow PROPERTIES POSITION_INDEPENDENT_CODE ON) -set_target_properties(flexflow PROPERTIES OUTPUT_NAME "flexflow${INSTALL_SUFFIX}") -set_target_properties(flexflow PROPERTIES SOVERSION ${SOVERSION}) -if (CMAKE_SYSTEM_NAME STREQUAL "Linux") - set_target_properties(flexflow PROPERTIES BUILD_RPATH "\$ORIGIN") - set_target_properties(flexflow PROPERTIES INSTALL_RPATH "\$ORIGIN") -elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin") - set_target_properties(flexflow PROPERTIES BUILD_RPATH "@loader_path") - set_target_properties(flexflow PROPERTIES INSTALL_RPATH "@loader_path") -endif() + #library api version, bump from time to time + set(SOVERSION 1) + + set_target_properties(flexflow PROPERTIES POSITION_INDEPENDENT_CODE ON) + set_target_properties(flexflow PROPERTIES OUTPUT_NAME "flexflow${INSTALL_SUFFIX}") + set_target_properties(flexflow PROPERTIES SOVERSION ${SOVERSION}) + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + set_target_properties(flexflow PROPERTIES BUILD_RPATH "\$ORIGIN") + set_target_properties(flexflow PROPERTIES INSTALL_RPATH "\$ORIGIN") + elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set_target_properties(flexflow PROPERTIES BUILD_RPATH "@loader_path") + set_target_properties(flexflow PROPERTIES INSTALL_RPATH "@loader_path") + endif() -# python related -if (FF_USE_PYTHON) - # create flexflow_cffi_header.py - add_custom_command(TARGET flexflow - PRE_BUILD - COMMAND ${FLEXFLOW_ROOT}/python/flexflow_cffi_build.py --ffhome-dir ${FLEXFLOW_ROOT} --output-dir ${FLEXFLOW_ROOT}/python/flexflow/core - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - COMMENT "Creating flexflow_cffi_header.py..." - ) - if (NOT FF_BUILD_FROM_PYPI) - # generate the Legion Python bindings library. 
When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library - add_custom_command(TARGET flexflow - POST_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python - ) - # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. + # python related + if (FF_USE_PYTHON) + # create flexflow_cffi_header.py add_custom_command(TARGET flexflow PRE_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + COMMAND ${FLEXFLOW_ROOT}/python/flexflow_cffi_build.py --ffhome-dir ${FLEXFLOW_ROOT} --output-dir ${FLEXFLOW_ROOT}/python/flexflow/core WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - COMMENT "Creating flexflow_python interpreter..." + COMMENT "Creating flexflow_cffi_header.py..." ) - install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin") + if (NOT FF_BUILD_FROM_PYPI) + # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library + add_custom_command(TARGET flexflow + POST_BUILD + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python + ) + # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. + add_custom_command(TARGET flexflow + PRE_BUILD + COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Creating flexflow_python interpreter..." 
+ ) + install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin") + endif() endif() endif() @@ -531,13 +539,13 @@ if(FF_BUILD_UNIT_TESTS) add_subdirectory(tests/unit) endif() -if(FF_BUILD_SUBSTITUTION_TOOL) - add_subdirectory(tools/protobuf_to_json) -endif() + if(FF_BUILD_SUBSTITUTION_TOOL) + add_subdirectory(tools/protobuf_to_json) + endif() -if(FF_BUILD_VISUALIZATION_TOOL) - add_subdirectory(tools/substitutions_to_dot) -endif() + if(FF_BUILD_VISUALIZATION_TOOL) + add_subdirectory(tools/substitutions_to_dot) + endif() if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) if (FF_GPU_BACKEND STREQUAL "hip_rocm") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index f4111d8ea6..d7f52543a1 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -54,7 +54,7 @@ if(CUDA_FOUND) set(FF_CUDA_ARCH ${DETECTED_CUDA_ARCH}) # Set FF_CUDA_ARCH to the list of all GPU architectures compatible with FlexFlow elseif("${FF_CUDA_ARCH}" STREQUAL "all") - set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86) + set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86,90) endif() # create CUDA_GENCODE list based on FF_CUDA_ARCH diff --git a/config/config.inc b/config/config.inc index eb1ad21fc0..7f1f0ffcf4 100644 --- a/config/config.inc +++ b/config/config.inc @@ -67,6 +67,15 @@ if [ -n "$CUDNN_DIR" ]; then SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}" fi +# build legion only +if [ "$BUILD_LEGION_ONLY" = "ON" ]; then + SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=ON" +elif [ "$BUILD_LEGION_ONLY" = "OFF" ]; then + SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=OFF" +else + SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=OFF" +fi + # enable Python if [ "$FF_USE_PYTHON" = "ON" ]; then SET_PYTHON="-DFF_USE_PYTHON=ON" @@ -218,7 +227,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 3686237538..5f15090a02 100755 --- a/config/config.linux +++ b/config/config.linux @@ -77,6 +77,9 @@ FF_USE_AVX2=${FF_USE_AVX2:-OFF} # set MAX_DIM FF_MAX_DIM=${FF_MAX_DIM:-5} +# set BUILD_LEGION_ONLY +BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY:-OFF} + # set LEGION_MAX_RETURN_SIZE LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144} @@ -97,7 +100,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} 
CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/docker/build.sh b/docker/build.sh index e72c23fcd8..6603d919f5 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -12,6 +12,7 @@ image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} +python_version=${python_version:-latest} # Check docker image name if [[ "$image" != @(flexflow-environment|flexflow) ]]; then @@ -96,7 +97,13 @@ fi cores_available=$(nproc --all) n_build_cores=$(( cores_available -1 )) -docker build --build-arg "ff_environment_base_image=${ff_environment_base_image}" --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "hip_version=${hip_version}" -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow-environment/Dockerfile . +# check python_version +if [[ "$python_version" != @(3.8|3.9|3.10|3.11|latest) ]]; then + echo "python_version not supported!" + exit 0 +fi + +docker build --build-arg "ff_environment_base_image=${ff_environment_base_image}" --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "hip_version=${hip_version}" --build-arg "python_version=${python_version}" -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow-environment/Dockerfile . 
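# For example, one might build the environment image for CUDA 11.8 and Python
# 3.10 with (version values are illustrative; build.sh reads them from the
# environment):
#   FF_GPU_BACKEND=cuda cuda_version=11.8 python_version=3.10 ./docker/build.sh flexflow-environment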
# If the user only wants to build the environment image, we are done if [[ "$image" == "flexflow-environment" ]]; then diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index a12f31c738..0e9a3cda82 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -16,14 +16,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binut apt-get upgrade -y libstdc++6 # Install Python3 with Miniconda -RUN wget -c -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - mv Miniconda3-latest-Linux-x86_64.sh ~/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/Miniconda3-latest-Linux-x86_64.sh && \ - bash ~/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ - rm ~/Miniconda3-latest-Linux-x86_64.sh && \ - /opt/conda/bin/conda upgrade --all && \ - /opt/conda/bin/conda install conda-build conda-verify && \ - /opt/conda/bin/conda clean -ya +ARG python_version "latest" +RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ + if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "latest" ]; then \ + echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11 or latest (default)}"; \ + exit 1; \ + fi; \ + if [ "${python_version}" = "3.8" ]; then \ + MINICONDA_SCRIPT_NAME=Miniconda3-py38_23.5.2-0-Linux-x86_64.sh; \ + elif [ "${python_version}" = "3.9" ]; then \ + MINICONDA_SCRIPT_NAME=Miniconda3-py39_23.5.2-0-Linux-x86_64.sh; \ + elif [ "${python_version}" = "3.10" ]; then \ + MINICONDA_SCRIPT_NAME=Miniconda3-py310_23.5.2-0-Linux-x86_64.sh; \ + elif [ "${python_version}" = "3.11" ]; then \ + MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ + fi; \ + wget -c -q https://repo.continuum.io/miniconda/${MINICONDA_SCRIPT_NAME} && \ + mv ./${MINICONDA_SCRIPT_NAME} ~/${MINICONDA_SCRIPT_NAME} && \ + chmod +x ~/${MINICONDA_SCRIPT_NAME} && \ + bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \ + rm ~/${MINICONDA_SCRIPT_NAME} && \ + /opt/conda/bin/conda upgrade --all && \ + /opt/conda/bin/conda install conda-build conda-verify && \ + /opt/conda/bin/conda clean -ya # Optionally install HIP dependencies # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile index a7d540bc71..60f9d4d653 100644 --- a/docker/flexflow/Dockerfile +++ b/docker/flexflow/Dockerfile @@ -15,6 +15,15 @@ COPY . . 
ARG BUILD_CONFIGS ARG N_BUILD_CORES +# Create install directory if needed +RUN for pair in $BUILD_CONFIGS; do \ + key=${pair%%=*}; \ + value=${pair#*=}; \ + if [ "$key" = "INSTALL_DIR" ] && [ -n "$value" ]; then \ + mkdir -p "$value"; \ + fi; \ + done + # Build and install C++ and Python versions of FlexFlow RUN mkdir -p build && cd build && \ eval "$BUILD_CONFIGS" ../config/config.linux && \ From 300989077a52094fc1f5762eb4fcab421a427e03 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 23 Oct 2023 01:24:13 -0400 Subject: [PATCH 255/344] Fix CUDA cmake (#1205) --- cmake/cuda.cmake | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index d7f52543a1..a1a66c7cc8 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -53,8 +53,12 @@ if(CUDA_FOUND) message( STATUS "CUDA Detected CUDA_ARCH : ${DETECTED_CUDA_ARCH}" ) set(FF_CUDA_ARCH ${DETECTED_CUDA_ARCH}) # Set FF_CUDA_ARCH to the list of all GPU architectures compatible with FlexFlow - elseif("${FF_CUDA_ARCH}" STREQUAL "all") - set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86,90) + elseif("${FF_CUDA_ARCH}" STREQUAL "all") + if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8") + set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86,90) + else() + set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86) + endif() endif() # create CUDA_GENCODE list based on FF_CUDA_ARCH From 452fa9c21878c9c337ddfeab69e31f096cdf1b61 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 23 Oct 2023 01:35:35 -0400 Subject: [PATCH 256/344] Fix Legion prebuild workflow (#1207) --- .github/workflows/prebuild-legion.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml index 00e7e78a77..1cf0ea2dd8 100644 --- a/.github/workflows/prebuild-legion.yml +++ b/.github/workflows/prebuild-legion.yml @@ -24,7 +24,7 @@ jobs: matrix: gpu_backend: ["cuda", "hip_rocm"] gpu_backend_version: ["11.8", "5.6"] - python_version: "3.11" + python_version: ["3.11"] exclude: - gpu_backend: "cuda" gpu_backend_version: "5.6" From d1da022b0e46715d926e4eb9edb669ded4126995 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 23 Oct 2023 20:56:51 -0400 Subject: [PATCH 257/344] Fix Legion prebuild workflow (2) (#1208) * fix * fix * fix * fix --- .github/workflows/helpers/prebuild_legion.sh | 2 +- .github/workflows/prebuild-legion.yml | 6 +- CMakeLists.txt | 260 +++++++++---------- config/config.linux | 2 +- 4 files changed, 135 insertions(+), 135 deletions(-) diff --git a/.github/workflows/helpers/prebuild_legion.sh b/.github/workflows/helpers/prebuild_legion.sh index ccaa58383e..9f5cbe147a 100755 --- a/.github/workflows/helpers/prebuild_legion.sh +++ b/.github/workflows/helpers/prebuild_legion.sh @@ -13,7 +13,7 @@ else echo "Pre-building Legion with GPU backend: ${gpu_backend}" fi -if [[ "${gpu_backend}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then +if [[ "${gpu_backend}" == "cuda" || "${gpu_backend}" == "hip_cuda" ]]; then # Check that CUDA version is supported. Versions above 12.0 not supported because we don't publish docker images for it yet. 
if [[ "$gpu_backend_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0) ]]; then echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0}" diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml index 1cf0ea2dd8..267daaee6b 100644 --- a/.github/workflows/prebuild-legion.yml +++ b/.github/workflows/prebuild-legion.yml @@ -42,12 +42,12 @@ jobs: - name: Build Legion env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} + gpu_backend: ${{ matrix.gpu_backend }} + gpu_backend_version: ${{ matrix.gpu_backend_version }} + python_version: ${{ matrix.python_version }} run: .github/workflows/helpers/prebuild_legion.sh - name: Archive compiled Legion library (CUDA) - env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} uses: actions/upload-artifact@v3 with: name: legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 648b46b49e..f9ce66a0f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -504,156 +504,156 @@ if(NOT BUILD_LEGION_ONLY) install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin") endif() endif() -endif() - -if (INFERENCE_TESTS) - target_link_libraries(flexflow "${TORCH_LIBRARIES}") - set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) -endif() - -# build binary -option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON) -option(FF_BUILD_RESNET "build resnet example" OFF) -option(FF_BUILD_RESNEXT "build resnext example" OFF) -option(FF_BUILD_ALEXNET "build alexnet example" OFF) -option(FF_BUILD_DLRM "build DLRM example" OFF) -option(FF_BUILD_XDL "build XDL example" OFF) -option(FF_BUILD_INCEPTION "build inception example" OFF) -option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) -option(FF_BUILD_TRANSFORMER "build transformer example" OFF) -option(FF_BUILD_MOE "build mixture of experts example" OFF) -option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) -option(FF_BUILD_SPLIT_TEST "build split test example" OFF) -option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) -option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) -option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) -option(FF_BUILD_ALL_EXAMPLES "build all examples. 
Overrides others" OFF) -option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) -option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) -option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) - -if(FF_BUILD_UNIT_TESTS) - set(BUILD_GMOCK OFF) - add_subdirectory(deps/googletest) - enable_testing() - add_subdirectory(tests/unit) -endif() - - if(FF_BUILD_SUBSTITUTION_TOOL) - add_subdirectory(tools/protobuf_to_json) + + if (INFERENCE_TESTS) + target_link_libraries(flexflow "${TORCH_LIBRARIES}") + set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) endif() - if(FF_BUILD_VISUALIZATION_TOOL) - add_subdirectory(tools/substitutions_to_dot) + # build binary + option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON) + option(FF_BUILD_RESNET "build resnet example" OFF) + option(FF_BUILD_RESNEXT "build resnext example" OFF) + option(FF_BUILD_ALEXNET "build alexnet example" OFF) + option(FF_BUILD_DLRM "build DLRM example" OFF) + option(FF_BUILD_XDL "build XDL example" OFF) + option(FF_BUILD_INCEPTION "build inception example" OFF) + option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) + option(FF_BUILD_TRANSFORMER "build transformer example" OFF) + option(FF_BUILD_MOE "build mixture of experts example" OFF) + option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) + option(FF_BUILD_SPLIT_TEST "build split test example" OFF) + option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) + option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) + option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) + option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF) + option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) + option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) + option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) + + if(FF_BUILD_UNIT_TESTS) + set(BUILD_GMOCK OFF) + add_subdirectory(deps/googletest) + enable_testing() + add_subdirectory(tests/unit) endif() -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) - if (FF_GPU_BACKEND STREQUAL "hip_rocm") - SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") - endif() - # Ensure Rust is installed - execute_process(COMMAND rustc --version - RESULT_VARIABLE RUST_COMMAND_RESULT - OUTPUT_VARIABLE RUSTC_OUTPUT - ERROR_QUIET) - if(NOT RUST_COMMAND_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + if(FF_BUILD_SUBSTITUTION_TOOL) + add_subdirectory(tools/protobuf_to_json) + endif() + + if(FF_BUILD_VISUALIZATION_TOOL) + add_subdirectory(tools/substitutions_to_dot) + endif() + + if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) + if (FF_GPU_BACKEND STREQUAL "hip_rocm") + SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") + endif() + # Ensure Rust is installed + execute_process(COMMAND rustc --version + RESULT_VARIABLE RUST_COMMAND_RESULT + OUTPUT_VARIABLE RUSTC_OUTPUT + ERROR_QUIET) + if(NOT RUST_COMMAND_RESULT EQUAL 0) + message(FATAL_ERROR "Rust is not installed on the system. 
Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + endif() + # Ensure Cargo is installed + execute_process(COMMAND cargo --version + RESULT_VARIABLE CARGO_RESULT + OUTPUT_QUIET ERROR_QUIET) + if(NOT CARGO_RESULT EQUAL 0) + message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + endif() + add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) + target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) + target_link_libraries(flexflow tokenizers_cpp) endif() - # Ensure Cargo is installed - execute_process(COMMAND cargo --version - RESULT_VARIABLE CARGO_RESULT - OUTPUT_QUIET ERROR_QUIET) - if(NOT CARGO_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/ResNet) endif() - add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) - target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) - target_link_libraries(flexflow tokenizers_cpp) -endif() -if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/ResNet) -endif() -if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/resnext50) -endif() + if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/resnext50) + endif() -if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/AlexNet) -endif() + if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/AlexNet) + endif() -if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/MLP_Unify) -endif() + if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/MLP_Unify) + endif() -if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/split_test) -endif() + if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/split_test) + endif() -if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/split_test_2) -endif() + if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/split_test_2) + endif() -if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/InceptionV3) -endif() + if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/InceptionV3) + endif() -#TODO: Once functional add to BUILD_ALL_EXAMPLES -if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/candle_uno) -endif() + #TODO: Once functional add to BUILD_ALL_EXAMPLES + if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/candle_uno) + endif() -if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/DLRM) + if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/DLRM) - #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) - #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) + #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) + #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - #add_executable(generate_dlrm_strategy 
src/runtime/dlrm_strategy.cc) - #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) -endif() + #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc) + #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) + endif() -if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/XDL) -endif() + if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/XDL) + endif() -if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/Transformer) -endif() + if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/Transformer) + endif() -if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/mixture_of_experts) -endif() + if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/mixture_of_experts) + endif() -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(inference/spec_infer) - add_subdirectory(inference/incr_decoding) -endif() + if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(inference/spec_infer) + add_subdirectory(inference/incr_decoding) + endif() -# installation -set(INCLUDE_DEST "include") -set(LIB_DEST "lib") -install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) -install(TARGETS flexflow DESTINATION ${LIB_DEST}) -# install python -if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) - if (NOT FF_BUILD_FROM_PYPI) - install( - DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ - DESTINATION ${PY_DEST}/flexflow - FILES_MATCHING - PATTERN "*.py") - else() - # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually. - install( - PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py - DESTINATION ${PY_DEST}/flexflow/core - ) - # Use setup.py script to re-install the Python bindings library with the right library paths. - # Need to put the instructions in a subfolder because of issue below: - # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake - add_subdirectory(cmake/pip_install) + # installation + set(INCLUDE_DEST "include") + set(LIB_DEST "lib") + install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) + install(TARGETS flexflow DESTINATION ${LIB_DEST}) + # install python + if (FF_USE_PYTHON) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT FF_BUILD_FROM_PYPI) + install( + DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ + DESTINATION ${PY_DEST}/flexflow + FILES_MATCHING + PATTERN "*.py") + else() + # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually. + install( + PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py + DESTINATION ${PY_DEST}/flexflow/core + ) + # Use setup.py script to re-install the Python bindings library with the right library paths. 
+ # Need to put the instructions in a subfolder because of issue below: + # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake + add_subdirectory(cmake/pip_install) + endif() endif() -endif() +endif() # if(NOT BUILD_LEGION_ONLY) diff --git a/config/config.linux b/config/config.linux index 5f15090a02..224509d616 100755 --- a/config/config.linux +++ b/config/config.linux @@ -100,7 +100,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then From 1105f4e157afc67ccce7df0e9aef601d7490e61b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 23 Oct 2023 22:02:14 -0400 Subject: [PATCH 258/344] Fix Legion prebuild workflow (3) (#1210) --- config/config.linux | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.linux b/config/config.linux index 224509d616..37b9bd16fd 100755 --- a/config/config.linux +++ b/config/config.linux @@ -10,7 +10,7 @@ #LD_FLAGS=${LD_FLAGS+=""} #set install dir -#INSTALL_DIR= +INSTALL_DIR=${INSTALL_DIR:-} # set build type BUILD_TYPE=${BUILD_TYPE:-Release} From bd305f77b0c247edb86a70d0c2d1f1b4868bd4e3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 4 Nov 2023 22:29:16 -0400 Subject: [PATCH 259/344] [CI/Docs/Examples] - Replace llama with llama2 model (#1219) * replace llama with llama2 * shellcheck * rename test * fix * rename test * fix * use text completion prompt, turn off hf sampling by default * fix output name * formatting * avoid python 3.12 for now * fix * fixes for falcon * fix --- .github/README.md | 24 ++--- .github/workflows/gpu-ci-skip.yml | 6 +- .github/workflows/gpu-ci.yml | 15 ++-- .github/workflows/multinode-test.yml | 6 +- INSTALL.md | 2 +- SERVE.md | 24 ++--- conda/environment.yml | 2 +- conda/flexflow.yml | 2 +- inference/MODEL_WEIGHTS.md | 2 +- inference/python/incr_decoding.py | 4 +- inference/python/spec_infer.py | 6 +- inference/utils/compress_llama_weights.py | 2 +- 
python/flexflow/serve/serve.py | 4 +- tests/inference/cpp_inference_tests.sh | 44 +++++----- tests/inference/huggingface_inference.py | 24 +++-- tests/inference/python_inference_tests.sh | 30 +++---- .../python_test_configs/generate_configs.py | 46 ++++++---- tests/inference_tests.sh | 10 ++- tests/multinode_helpers/mpi_wrapper1.sh | 2 +- tests/python_interface_test.sh | 6 +- .../{multi_gpu_tests.sh => training_tests.sh} | 88 +++++++++---------- 21 files changed, 193 insertions(+), 156 deletions(-) rename tests/{multi_gpu_tests.sh => training_tests.sh} (61%) diff --git a/.github/README.md b/.github/README.md index a8846260c8..5db26c5aa8 100644 --- a/.github/README.md +++ b/.github/README.md @@ -72,7 +72,7 @@ ff.init( Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). ```python # Specify the LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Specify a list of SSMs (just one in this case) ssms=[] @@ -116,7 +116,7 @@ ff.init( ) # Create the FlexFlow LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Create the sampling configs generation_config = ff.GenerationConfig( @@ -152,8 +152,8 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui * `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model ID from HuggingFace (e.g. "decapoda-research/llama-7b-hf") -* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf") +* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m-base"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. * `-cache-folder`: the folder * `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. * `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a json format file for prompts. In addition, users can also use the following API for registering requests: @@ -162,7 +162,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. 
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` @@ -193,13 +193,13 @@ Below is a list of models that we have explicitly tested and for which a SSM may | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | | OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index 6a18e56bd1..f4cb950931 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -15,7 +15,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - 
"tests/python_interface_test.sh" workflow_dispatch: @@ -44,8 +44,8 @@ jobs: steps: - run: 'echo "No gpu-ci required"' - gpu-ci-flexflow: - name: Single Machine, Multiple GPUs Tests + training-tests: + name: Training Tests runs-on: ubuntu-20.04 # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} needs: inference-tests diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 4a43a3dee7..3901d6b5f7 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -15,7 +15,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - "tests/python_interface_test.sh" push: branches: @@ -34,7 +34,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - "tests/python_interface_test.sh" workflow_dispatch: @@ -141,7 +141,8 @@ jobs: run: shell: bash -l {0} # required to use an activated conda environment env: - CONDA: "3" + CONDA: "3" + HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} needs: gpu-ci-concierge container: image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest @@ -185,7 +186,7 @@ jobs: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib # GPT tokenizer test - ./tests/gpt_tokenizer_test.sh + # ./tests/gpt_tokenizer_test.sh # Inference tests source ./build/set_python_envs.sh @@ -209,8 +210,8 @@ jobs: if: always() run: sudo rm -rf ~/.cache - gpu-ci-flexflow: - name: Single Machine, Multiple GPUs Tests + training-tests: + name: Training Tests runs-on: [self-hosted, gpu] # skip this time-consuming test for PRs to the inference branch # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} @@ -266,5 +267,5 @@ jobs: # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests - ./tests/multi_gpu_tests.sh 4 + ./tests/training_tests.sh 4 diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index ca2b47df27..226f953b38 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -78,7 +78,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 multinode-gpu-test-ucx: name: Multinode GPU Test with UCX @@ -129,7 +129,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 multinode-gpu-test-native-ucx: name: Multinode GPU Test with native UCX @@ -177,7 +177,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 notify-slack: name: Notify Slack in case of failure diff --git a/INSTALL.md b/INSTALL.md index a197df24ed..1734319540 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -97,7 +97,7 @@ source ./build/set_python_envs.sh cd "$FF_HOME" ./python/flexflow_python examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize -ll:zsize ``` -A script to run all the Python examples is available at `tests/multi_gpu_tests.sh` +A script to run all the Python examples is available at `tests/training_tests.sh` ### Run FlexFlow C++ examples diff --git a/SERVE.md b/SERVE.md index 60d0b566f0..1a00be2589 100644 --- 
a/SERVE.md +++ b/SERVE.md @@ -32,7 +32,7 @@ ff.init( Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). ```python # Specify the LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Specify a list of SSMs (just one in this case) ssms=[] @@ -78,7 +78,7 @@ ff.init( ) # Create the FlexFlow LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Create the sampling configs generation_config = ff.GenerationConfig( @@ -116,8 +116,8 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui * `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model ID from HuggingFace (e.g. "decapoda-research/llama-7b-hf") -* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf") +* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m-base"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. * `-cache-folder`: the folder * `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. * `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a json format file for prompts. In addition, users can also use the following API for registering requests: @@ -126,7 +126,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. 
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` @@ -157,13 +157,13 @@ Below is a list of models that we have explicitly tested and for which a SSM may | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | -| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | | OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | diff --git a/conda/environment.yml b/conda/environment.yml index 9ae0dc9c7a..48cd8ddb33 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6 + - python>=3.6,<3.12 - cffi>=1.11.0 - Pillow - pybind11 diff --git a/conda/flexflow.yml b/conda/flexflow.yml index c9226269f2..67ef6b3419 
100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6 + - python>=3.6,<3.12 - cffi>=1.11.0 - Pillow - pybind11 diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md index e46e6b45d1..d78fb37be9 100644 --- a/inference/MODEL_WEIGHTS.md +++ b/inference/MODEL_WEIGHTS.md @@ -2,7 +2,7 @@ To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we fir ```python from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") for name, params in model.named_parameters(): for name, params in model.named_parameters(): diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 3621ee83a3..4a146ab503 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -43,7 +43,7 @@ def get_configs(): # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -108,7 +108,7 @@ def main(): prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: - result = llm.generate("Here are some travel tips for Tokyo:\n") + result = llm.generate("Three tips for staying healthy are: ") if __name__ == "__main__": diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 3d0f1a1c0e..201b8d4a63 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -43,7 +43,7 @@ def get_configs(): # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -60,7 +60,7 @@ def get_configs(): } llm_configs = { # required llm arguments - "llm_model": "decapoda-research/llama-7b-hf", + "llm_model": "meta-llama/Llama-2-7b-hf", # optional llm parameters "cache_path": "", "refresh_cache": False, @@ -154,7 +154,7 @@ def main(): prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: - result = llm.generate("Here are some travel tips for Tokyo:\n") + result = llm.generate("Three tips for staying healthy are: ") if __name__ == "__main__": diff --git a/inference/utils/compress_llama_weights.py b/inference/utils/compress_llama_weights.py index c92ae6aca9..daaee9c9d5 100644 --- a/inference/utils/compress_llama_weights.py +++ b/inference/utils/compress_llama_weights.py @@ -91,7 +91,7 @@ def decompress(packed_data, config): if __name__ == "__main__": # torch.set_default_tensor_type(torch.HalfTensor) # torch.set_default_tensor_type(torch.cuda.HalfTensor) - model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") + model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") config = CompressionConfig( num_bits=8, group_size=32, group_dim=0, symmetric=False) for name, params in model.named_parameters(): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 549677d77a..55601f957e 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -81,7 +81,7 @@ def __init__( ): """Create the LLM object - :param model_name: The name of the HuggingFace model to use. E.g. 'decapoda-research/llama-7b-hf' + :param model_name: The name of the HuggingFace model to use. 
E.g. 'meta-llama/Llama-2-7b-hf' :type model_name: str :param data_type: The data type to use for the tensors (e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF :type data_type: DataType, optional @@ -439,7 +439,7 @@ def __init__( ): """Create the SSM object - :param model_name: The name of the HuggingFace model to use. E.g. 'decapoda-research/llama-7b-hf' + :param model_name: The name of the HuggingFace model to use. E.g. 'meta-llama/Llama-2-7b-hf' :type model_name: str :param data_type: The data type to use for the tensors (e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF :type data_type: DataType, optional diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 42a6db09d8..211d7fe1bf 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,9 +10,9 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 @@ -22,9 +22,9 @@ cd "${BASH_SOURCE[0]%/*}" # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 
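# [Editorial sketch, not part of the patch hunk above] The tensor-parallelism block in
# cpp_inference_tests.sh is gated by the TENSOR_PARALLELISM_TESTS environment variable
# (default OFF, as set in tests/inference_tests.sh). A hypothetical way to exercise it
# locally, assuming a 4-GPU machine and a prior build in ./build:
#
#   TENSOR_PARALLELISM_TESTS=ON ./tests/inference/cpp_inference_tests.sh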
# LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -42,9 +42,9 @@ fi ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 @@ -57,9 +57,9 @@ fi ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 
-ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) # ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 @@ -76,9 +76,9 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -179,22 +179,22 @@ function compare_decoding_steps_spec_infer_incr_decoding { ############ Alignment between speculative inference and incremental decoding ################# # Full precision -diff <(tail -n +3 "../../inference/output/incr_decoding_llama_7B.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") +diff <(tail -n +3 
"../../inference/output/incr_decoding_llama_2_7B.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") <(tail -n +3 "../../inference/output/spec_inference_opt.txt") # Half precision -check_partial_token_match "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +check_partial_token_match "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" check_partial_token_match "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" # Speed test: speculative inference should be at very least 1.5x faster than incremental decoding # Full precision -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B.txt" "../../inference/output/spec_inference_llama.txt" +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B.txt" "../../inference/output/spec_inference_llama.txt" #compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B.txt" "../../inference/output/spec_inference_llama.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B.txt" "../../inference/output/spec_inference_llama.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" # Half precision -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" #compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" ############ Alignment between tensor model parallelism and pipeline parallelism only ################# @@ -205,8 +205,8 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then check_partial_token_match "../../inference/output/spec_inference_opt_half_tp.txt" "../../inference/output/spec_inference_opt_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_160M.txt") check_partial_token_match "../../inference/output/incr_decoding_llama_160M_half_tp.txt" "../../inference/output/incr_decoding_llama_160M_half.txt" - diff <(tail -n +3 "../../inference/output/incr_decoding_llama_7B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_7B.txt") - check_partial_token_match "../../inference/output/incr_decoding_llama_7B_half_tp.txt" "../../inference/output/incr_decoding_llama_7B_half.txt" + diff <(tail -n +3 
"../../inference/output/incr_decoding_llama_2_7B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B.txt") + check_partial_token_match "../../inference/output/incr_decoding_llama_2_7B_half_tp.txt" "../../inference/output/incr_decoding_llama_2_7B_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_125M.txt") check_partial_token_match "../../inference/output/incr_decoding_opt_125M_half_tp.txt" "../../inference/output/incr_decoding_opt_125M_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") @@ -222,10 +222,10 @@ python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B.txt" # LLAMA (big model, half precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B_half.txt" --gpu # OPT (small model, full precision) python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 @@ -245,8 +245,8 @@ python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-pr diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_160M.txt") diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_7B.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff <(tail -n +2 "../../inference/output/huggingface_llama_2_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_2_7B.txt") +diff <(tail -n +2 "../../inference/output/huggingface_llama_2_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_2_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_125M.txt") diff <(tail -n +2 
"../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index fee215f4c4..5b533bf3c0 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -2,7 +2,14 @@ import json import os import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + def main(): # Change working dir to folder storing this script @@ -19,6 +26,7 @@ def main(): parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) + parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument("--gpu", action="store_true", help="Run on GPU") args = parser.parse_args() # Check if max-length is greater than 0 @@ -54,13 +62,19 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) else: tokenizer = AutoTokenizer.from_pretrained(args.model_name) + generation_config = GenerationConfig.from_pretrained(args.model_name) + generation_config.do_sample = args.do_sample # Generate output with open(args.output_file, "w") as f: for i, prompt in enumerate(prompt_list): - batch = tokenizer( - prompt, return_tensors="pt", add_special_tokens=True - ).to(device) - generated = model.generate(batch["input_ids"], max_length=args.max_length) + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to( + device + ) + generated = model.generate( + batch["input_ids"], + max_length=args.max_length, + generation_config=generation_config, + ) out = tokenizer.decode(generated[0]) # Write output to file out_str = out if i == (len(prompt_list) - 1) else out + "\n" diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 64c61ba0dc..b0f8daa14e 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -108,25 +108,25 @@ function compare_decoding_steps_spec_infer_incr_decoding { ############ Alignment between speculative inference and incremental decoding ################# # Full precision -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt") # Half precision -check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" # 
Speed test: speculative inference should be at very least 1.5x faster than incremental decoding # Full precision -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt" # Half precision -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" ############ Alignment between tensor model parallelism and pipeline parallelism only ################# ## Specinfer # LLAMA -diff <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" # OPT diff <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/spec_infer-python-opt-6.7b-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" @@ -138,10 +138,10 @@ check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-bas diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" # Big LLAMA -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" -#diff <(tail -n +3 
"../../inference/output/incr_dec-python-llama-7b-hf-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -#check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" +#diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +#check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" # Small OPT diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/incr_dec-python-opt-125m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" @@ -163,10 +163,10 @@ python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" # LLAMA (big model, half precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu # OPT (small model, full precision) python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 @@ -185,11 +185,11 @@ python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-pr diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 
"../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") #diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-falcon-7b-half_prec-1_tp_4_pp.txt") +diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 8efe8999c4..c77e19053d 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -6,7 +6,7 @@ # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -47,12 +47,16 @@ ff_init_configs.update(llm_configs) # Test parameters to fill in -llama_models = ["decapoda-research/llama-7b-hf", "JackFram/llama-160m-base"] +llama_models = ["meta-llama/Llama-2-7b-hf", "JackFram/llama-160m-base"] opt_models = ["facebook/opt-6.7b", "facebook/opt-125m"] -falcon_models = ["tiiuae/falcon-7b",] -mpt_models = ["mosaicml/mpt-7b", ] +falcon_models = [ + "tiiuae/falcon-7b", +] +mpt_models = [ + "mosaicml/mpt-7b", +] # starcoder_models = ["bigcode/starcoderbase-7b",] -parallelism_settings = [(1,4), (2,2), (4,1)] +parallelism_settings = [(1, 4), (2, 2), (4, 1)] # The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) prompt_file = "../../inference/prompt/test.json" @@ -69,7 +73,6 @@ for model_name in all_models: for full_precision in (True, False): for parallelism_degrees in parallelism_settings: - tp, pp = parallelism_degrees # Tensor parallelism not supported by small Falcon model atm @@ -79,14 +82,21 @@ if tp > 2 and ("7b" in model_name or "6.7b" in model_name): continue - if full_precision and ("falcon" in model_name or "starcoder" in model_name): + # Run Falcon only in full precision, Starcoder only in half precision + if (not full_precision and "falcon" in model_name) or 
(full_precision and "starcoder" in model_name): continue - + _, after_slash = model_name.rsplit("/", maxsplit=1) - filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" + filename = ( + "incr_dec-" + + "python-" + + after_slash.lower() + + ("-full_prec-" if full_precision else "-half_prec-") + + f"{tp}_tp_{pp}_pp" + ) test_configs_file = "./" + filename + ".json" - output_file = os.path.join(output_folder, filename+".txt") - + output_file = os.path.join(output_folder, filename + ".txt") + ff_init_configs["tensor_parallelism_degree"] = tp ff_init_configs["pipeline_parallelism_degree"] = pp ff_init_configs["llm_model"] = model_name @@ -110,17 +120,23 @@ continue _, after_slash = big_model.rsplit("/", maxsplit=1) - filename = "spec_infer-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" + filename = ( + "spec_infer-" + + "python-" + + after_slash.lower() + + ("-full_prec-" if full_precision else "-half_prec-") + + f"{tp}_tp_{pp}_pp" + ) test_configs_file = "./" + filename + ".json" - output_file = os.path.join(output_folder, filename+".txt") - + output_file = os.path.join(output_folder, filename + ".txt") + ff_init_configs["tensor_parallelism_degree"] = tp ff_init_configs["pipeline_parallelism_degree"] = pp ff_init_configs["llm_model"] = big_model ff_init_configs["full_precision"] = full_precision ff_init_configs["output_file"] = output_file ff_init_configs["prompt"] = prompt_file - + ssm_configs["ssms"][0]["ssm_model"] = small_model ssm_configs["ssms"][0]["full_precision"] = full_precision ff_init_configs.update(ssm_configs) diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index c757dd5ee6..d450c2d6d5 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -16,6 +16,12 @@ CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-OFF} # Enable model parallelism tests in C++, if desired TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} +# Token to access private huggingface models (e.g. LLAMA-2) +HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} +if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then + huggingface-cli login --token "$HUGGINGFACE_TOKEN" +fi + # Clean up before test (just in case) cleanup @@ -24,7 +30,7 @@ pip3 install protobuf==3.20.3 # Create test prompt file mkdir -p ../inference/prompt -echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json +echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json # Create output folder mkdir -p ../inference/output @@ -38,7 +44,7 @@ if [[ "$PYTHON_INFERENCE_TESTS" == "ON" ]]; then fi if [[ "$CPP_INFERENCE_TESTS" == "ON" ]]; then # Manually download the weights in both half and full precision - python3 ../inference/utils/download_hf_model.py "decapoda-research/llama-7b-hf" "JackFram/llama-160m-base" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" + python3 ../inference/utils/download_hf_model.py "meta-llama/Llama-2-7b-hf" "JackFram/llama-160m-base" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" echo "Running C++ inference tests..." 
./inference/cpp_inference_tests.sh fi diff --git a/tests/multinode_helpers/mpi_wrapper1.sh b/tests/multinode_helpers/mpi_wrapper1.sh index 87d17d11a3..076fd2d66c 100755 --- a/tests/multinode_helpers/mpi_wrapper1.sh +++ b/tests/multinode_helpers/mpi_wrapper1.sh @@ -8,5 +8,5 @@ if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exi # We need to wrap the instruction below in its own script because MPI throws an error if we try # to run "mpirun" more than once in the same script. Hence, we cannot simply call "mpirun" in the -# multi_gpu_tests.sh script +# training_tests.sh script mpirun -np "$NUM_NODES" "$FF_HOME"/tests/multinode_helpers/mpi_wrapper2.sh "$@" diff --git a/tests/python_interface_test.sh b/tests/python_interface_test.sh index 4f83918a49..5ce4d9803b 100755 --- a/tests/python_interface_test.sh +++ b/tests/python_interface_test.sh @@ -14,13 +14,13 @@ check_python_interface() { # Generate configs JSON files test_params=$(jq -n --arg num_gpus "$GPUS" --arg memory_per_gpu "$FSIZE" --arg zero_copy_memory_per_node "$ZSIZE" --arg batch_size "$BATCHSIZE" --arg only_data_parallel "$ONLY_DATA_PARALLEL" '{"num_gpus":$num_gpus,"memory_per_gpu":$memory_per_gpu,"zero_copy_memory_per_node":$zero_copy_memory_per_node,"batch_size":$batch_size,"only_data_parallel":$only_data_parallel}') - mkdir -p /tmp/flexflow/multi_gpu_tests - echo "$test_params" > /tmp/flexflow/multi_gpu_tests/test_params.json + mkdir -p /tmp/flexflow/training_tests + echo "$test_params" > /tmp/flexflow/training_tests/test_params.json if [[ "$interpreter" == "python" ]]; then EXE="python" echo "Running a single-GPU Python test to check the Python interface (native python interpreter)" - $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json + $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json elif [[ "$interpreter" == "flexflow_python" ]]; then if [[ "$installation_status" == "before-installation" ]]; then EXE="$BUILD_FOLDER"/flexflow_python diff --git a/tests/multi_gpu_tests.sh b/tests/training_tests.sh similarity index 61% rename from tests/multi_gpu_tests.sh rename to tests/training_tests.sh index 3a6f6467df..2d1f00883b 100755 --- a/tests/multi_gpu_tests.sh +++ b/tests/training_tests.sh @@ -33,57 +33,57 @@ test_params_5_epochs=$(echo "$test_params" | jq '. + {"epochs": 5}') test_params_40_epochs=$(echo "$test_params" | jq '. 
+ {"epochs": 40}') test_params_5_epochs_no_batch_size=$(echo "$test_params_5_epochs" | jq 'del(.batch_size)') test_params_40_epochs_no_batch_size=$(echo "$test_params_40_epochs" | jq 'del(.batch_size)') -mkdir -p /tmp/flexflow/multi_gpu_tests -echo "$test_params" > /tmp/flexflow/multi_gpu_tests/test_params.json -echo "$test_params_5_epochs" > /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json -echo "$test_params_5_epochs_no_batch_size" > /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json -echo "$test_params_40_epochs_no_batch_size" > /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json +mkdir -p /tmp/flexflow/training_tests +echo "$test_params" > /tmp/flexflow/training_tests/test_params.json +echo "$test_params_5_epochs" > /tmp/flexflow/training_tests/test_params_5_epochs.json +echo "$test_params_5_epochs_no_batch_size" > /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json +echo "$test_params_40_epochs_no_batch_size" > /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json #Sequential model tests -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -#$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json +#$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -config-file /tmp/flexflow/training_tests/test_params.json #Keras other -$EXE "$FF_HOME"/examples/python/keras/callback.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/unary.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/reshape.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -config-file 
/tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/gather.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/regularizer.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/callback.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/unary.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/reshape.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/gather.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/regularizer.py -config-file /tmp/flexflow/training_tests/test_params.json #Functional API -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -config-file 
/tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json #Python -$EXE "$FF_HOME"/examples/python/native/print_layers.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json -$EXE "$FF_HOME"/examples/python/native/split.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/native/alexnet.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json -$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json -$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json -$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json +$EXE "$FF_HOME"/examples/python/native/print_layers.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json +$EXE "$FF_HOME"/examples/python/native/split.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/native/alexnet.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json +$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json +$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json +$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json +$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json +$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json #Possible crash -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json -$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/training_tests/test_params.json +$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file 
/tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json From b0fe5220770777c9297d028de5e466f5dd68b2d9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 5 Nov 2023 21:08:09 -0500 Subject: [PATCH 260/344] Fix inference tests in CI (#1225) * updated diffs in tests * manually add BOS token in LLAMA * shellcheck * fix --- .github/README.md | 16 +++---- CMakeLists.txt | 13 ++++++ SERVE.md | 16 +++---- config/config.inc | 2 +- include/flexflow/ffconst.h | 9 ++-- inference/incr_decoding/incr_decoding.cc | 11 +---- inference/python/spec_infer.py | 2 +- inference/spec_infer/spec_infer.cc | 24 ++-------- python/flexflow/type.py | 9 ++-- src/runtime/request_manager.cc | 26 ++++++++++- tests/inference/cpp_inference_tests.sh | 45 ++++++++++--------- tests/inference/python_inference_tests.sh | 30 ++++++------- .../python_test_configs/generate_configs.py | 4 +- tests/inference_tests.sh | 2 +- 14 files changed, 112 insertions(+), 97 deletions(-) diff --git a/.github/README.md b/.github/README.md index 5db26c5aa8..528df18faf 100644 --- a/.github/README.md +++ b/.github/README.md @@ -153,7 +153,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. * `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf") -* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m-base"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. * `-cache-folder`: the folder * `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. * `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a json format file for prompts. 
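Putting these flags together, the C++ test scripts later in this patch series launch the speculative-inference example roughly as follows. This is a sketch only: the GPU count, memory sizes, and file paths are illustrative and depend on the machine and working directory.

```sh
# Illustrative spec_infer launch mirroring the flags described above
# (values taken from the test scripts in this series; adjust for your setup).
./build/inference/spec_infer/spec_infer \
  -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 \
  --fusion --use-full-precision \
  -llm-model meta-llama/Llama-2-7b-hf \
  -ssm-model JackFram/llama-160m \
  -prompt ./inference/prompt/test.json \
  -output-file ./inference/output/spec_inference_llama.txt \
  -pipeline-parallelism-degree 4
```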
In addition, users can also use the following API for registering requests: @@ -193,13 +193,13 @@ Below is a list of models that we have explicitly tested and for which a SSM may | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | -| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | diff --git a/CMakeLists.txt b/CMakeLists.txt index f9ce66a0f1..3732d5ff6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,19 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) STRING "Choose the type of build." FORCE) endif() +if(INSTALL_DIR) + message(STATUS "INSTALL_DIR: ${INSTALL_DIR}") + set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE) +else() + # Install DIR not set. 
Use default, unless a conda environment is active + if (DEFINED ENV{CONDA_PREFIX} AND NOT FF_BUILD_FROM_PYPI) + set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path + set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE) + message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") + endif() +endif() + # do not disable assertions even if in release mode set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") diff --git a/SERVE.md b/SERVE.md index 1a00be2589..f6e34750cd 100644 --- a/SERVE.md +++ b/SERVE.md @@ -117,7 +117,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. * `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf") -* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m-base"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. +* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. * `-cache-folder`: the folder * `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. * `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a json format file for prompts. 
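Because FlexFlow Serve keeps a full replica of the LLM parameters in zero-copy memory, `-ll:zsize` must be at least as large as the weights. A back-of-the-envelope sizing sketch (illustrative only; the 7B parameter count is an assumption, not a documented requirement):

```sh
# Rough lower bound on -ll:zsize (in MB) for a 7B-parameter model (assumed size).
params=7000000000                          # assumed parameter count
half_mb=$(( params * 2 / 1024 / 1024 ))    # ~13351 MB in half precision
full_mb=$(( params * 4 / 1024 / 1024 ))    # ~26702 MB in full precision
echo "-ll:zsize should be >= ${half_mb} MB (half) or >= ${full_mb} MB (full precision)"
```

These figures are consistent with the `-ll:zsize 30000` used by the C++ tests and with the increase of `zero_copy_memory_per_node` from 30000 to 40000 in the Python test configs earlier in this series.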
In addition, users can also use the following API for registering requests: @@ -157,13 +157,13 @@ Below is a list of models that we have explicitly tested and for which a SSM may | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | -| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | -| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m-base) | +| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | | OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) | diff --git a/config/config.inc b/config/config.inc index 7f1f0ffcf4..5a7bde5ce9 100644 --- a/config/config.inc +++ b/config/config.inc @@ -24,7 +24,7 @@ fi #set installation dir if [ -n "$INSTALL_DIR" ]; then - SET_INSTALL_DIR="-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}" + SET_INSTALL_DIR="-DINSTALL_DIR=${INSTALL_DIR}" fi if [ "$INFERENCE_TESTS" = "ON" ]; then diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 124b46862a..512645e624 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -186,11 +186,10 @@ enum OperatorType { enum ModelType { UNKNOWN = 3001, LLAMA = 3002, - LLAMA2 = 3003, - OPT = 3004, - FALCON = 3005, - STARCODER = 3006, - MPT = 3007 + OPT = 3003, + FALCON = 3004, + 
STARCODER = 3005, + MPT = 3006 }; enum PMParameter { diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 463bc10151..c3f9052305 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -177,14 +177,7 @@ void FlexFlow::top_level_task(Task const *task, auto architectures = model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - model_type = ModelType::LLAMA2; - } else { - model_type = ModelType::LLAMA; - } + model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { model_type = ModelType::OPT; @@ -220,7 +213,7 @@ void FlexFlow::top_level_task(Task const *task, rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); - if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { + if (model_type == ModelType::LLAMA) { LLAMA::create_llama_model(model, config_filepath, weights_filepath, diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 201b8d4a63..c9fb5cc7bb 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -68,7 +68,7 @@ def get_configs(): "ssms": [ { # required ssm parameter - "ssm_model": "JackFram/llama-160m-base", + "ssm_model": "JackFram/llama-160m", # optional ssm parameters "cache_path": "", "refresh_cache": False, diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 98b5ec4633..8b0eb926d9 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -166,14 +166,7 @@ void get_model_meta(FilePaths &file_paths, auto architectures = llm_model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = llm_model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - model_metadata.llm_model_type = ModelType::LLAMA2; - } else { - model_metadata.llm_model_type = ModelType::LLAMA; - } + model_metadata.llm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { model_metadata.llm_model_type = ModelType::OPT; @@ -223,14 +216,7 @@ void get_model_meta(FilePaths &file_paths, auto architectures = ssm_model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = ssm_model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - ssm_model_type = ModelType::LLAMA2; - } else { - ssm_model_type = ModelType::LLAMA; - } + ssm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { ssm_model_type = ModelType::OPT; @@ -318,8 +304,7 @@ void FlexFlow::top_level_task(Task const *task, // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); - if (model_metadata.llm_model_type == ModelType::LLAMA || - model_metadata.llm_model_type == ModelType::LLAMA2) { + if (model_metadata.llm_model_type == ModelType::LLAMA) { LLAMA::create_llama_model(tree_model, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, 
@@ -363,8 +348,7 @@ void FlexFlow::top_level_task(Task const *task, for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { FFModel &beam_model = ssm_models[ssm_id]; - if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA || - model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA2) { + if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) { LLAMA::create_llama_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 9caecdde54..994a85f57e 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -75,11 +75,10 @@ class InferenceMode(Enum): class ModelType(Enum): UNKNOWN = 3001 LLAMA = 3002 - LLAMA2 = 3003 - OPT = 3004 - FALCON = 3005 - STARCODER = 3006 - MPT = 3007 + OPT = 3003 + FALCON = 3004 + STARCODER = 3005 + MPT = 3006 class OpType(Enum): diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 4d232b6d44..e052c8716a 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -115,7 +115,7 @@ void RequestManager::register_tokenizer(ModelType type, this->eos_token_id = eos_token_id; std::string tokenizer_folder = (!path.empty() && path.back() != '/') ? path + '/' : path; - if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { + if (model_type == ModelType::LLAMA) { bool path_to_file = !path.empty() && (path.size() >= strlen("tokenizer.model")) && path.find("tokenizer.model") == @@ -416,6 +416,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } { // update generation result and trigger future @@ -625,6 +631,12 @@ BeamSearchBatchConfig request.guid, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } { // update generation result and trigger future GenerationResult &gr = request_generation_results[request.guid]; @@ -736,6 +748,12 @@ BeamSearchBatchConfig } } std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } log_req_mgr.print("Output: %s", output.c_str()); } } else if (request.status == Request::PENDING) { @@ -769,6 +787,12 @@ BeamSearchBatchConfig // Token Info std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically removes + // the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } log_req_mgr.print("Output: %s", output.c_str()); } else { assert(false); diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 211d7fe1bf..8beea55999 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,9 +10,9 @@ cd "${BASH_SOURCE[0]%/*}" 
############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 @@ -22,9 +22,9 @@ cd "${BASH_SOURCE[0]%/*}" # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 
-tensor-parallelism-degree 2 @@ -37,9 +37,12 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 + +../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 + # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 @@ -69,11 +72,11 @@ fi # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model 
JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -216,10 +219,10 @@ fi ######################### Alignment tests with HuggingFace #################################### # LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu # LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B.txt" @@ -243,14 +246,14 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_160M.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_llama_2_7B.txt") <(tail -n +5 
"../../inference/output/incr_decoding_llama_2_7B.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_2_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_2_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_decoding_llama_160M.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_2_7B.txt" <(tail -n +4 "../../inference/output/incr_decoding_llama_2_7B.txt") +diff <( < ../../inference/output/huggingface_llama_2_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_llama_2_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_125M.txt") -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt") -# diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt") -diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_falcon_7B.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_125M.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_6B.txt") +# diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_6B_half.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_decoding_falcon_7B.txt") diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index b0f8daa14e..3544f58e26 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -133,10 +133,10 @@ check_partial_token_match "../../inference/output/spec_infer-python-opt-6.7b-hal ## Incremental decoding # Small LLAMA -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-4_tp_1_pp.txt" 
"../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" # Big LLAMA diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" @@ -157,10 +157,10 @@ check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_ ######################### Alignment tests with HuggingFace #################################### # LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu # LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" @@ -183,13 +183,13 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p # Falcon (full precision) python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr 
-s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") -#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index c77e19053d..ebaadade32 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -35,7 +35,7 @@ "ssms": [ { # required ssm parameter - "ssm_model": "JackFram/llama-160m-base", + "ssm_model": "JackFram/llama-160m", # optional ssm parameters "cache_path": "", "refresh_cache": False, @@ -47,7 +47,7 @@ ff_init_configs.update(llm_configs) # Test parameters to fill in -llama_models = ["meta-llama/Llama-2-7b-hf", "JackFram/llama-160m-base"] +llama_models = ["meta-llama/Llama-2-7b-hf", "JackFram/llama-160m"] opt_models = ["facebook/opt-6.7b", "facebook/opt-125m"] falcon_models = [ "tiiuae/falcon-7b", diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index d450c2d6d5..895b74c798 100755 --- a/tests/inference_tests.sh +++ 
b/tests/inference_tests.sh @@ -44,7 +44,7 @@ if [[ "$PYTHON_INFERENCE_TESTS" == "ON" ]]; then fi if [[ "$CPP_INFERENCE_TESTS" == "ON" ]]; then # Manually download the weights in both half and full precision - python3 ../inference/utils/download_hf_model.py "meta-llama/Llama-2-7b-hf" "JackFram/llama-160m-base" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" + python3 ../inference/utils/download_hf_model.py "meta-llama/Llama-2-7b-hf" "JackFram/llama-160m" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" echo "Running C++ inference tests..." ./inference/cpp_inference_tests.sh fi From c6ad6e2056c5739a138d80fc3af1a06249e7ddba Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 9 Nov 2023 17:46:23 -0500 Subject: [PATCH 261/344] Update the default cublas behavior when CUDA_VERSION is not specified (#1209) * Update the default cublas behavior when CUDA_VERSION is not specified * fix hip error * use CUBLAS_COMPUTE_FAST_16F for full-precision gemm --- src/ops/inc_multihead_self_attention.cpp | 14 +++++----- src/ops/inc_multihead_self_attention.cu | 26 +++++++++++++------ src/ops/kernels/linear_kernels.cpp | 18 +++++++------ src/ops/kernels/linear_kernels.cu | 26 +++++++++++++------ src/ops/spec_inc_multihead_self_attention.cpp | 7 ++--- src/ops/spec_inc_multihead_self_attention.cu | 13 +++++++--- src/ops/tree_inc_multihead_self_attention.cpp | 7 ++--- src/ops/tree_inc_multihead_self_attention.cu | 13 +++++++--- 8 files changed, 80 insertions(+), 44 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 37cc986f5e..d60386f927 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -257,10 +257,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to HIPBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = HIPBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) @@ -509,10 +510,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 3b24a5a324..cff5550c85 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,11 +238,16 @@ void 
compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads @@ -508,11 +513,16 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 231ca0f3d7..072eb5e96b 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -241,11 +241,12 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = output_type; #else - hipblasDatatype_t compute_type = input_type; + // TODO: currently use the output_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = output_type; #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, @@ -337,11 +338,12 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = output_type; #else - hipblasDatatype_t compute_type = HIPBLAS_R_32F; + // TODO: currently use output_type + // 
cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = output_type; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 8a93357dcf..9373c2fb2f 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -311,11 +311,16 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; #else - cudaDataType_t compute_type = input_type; + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -401,11 +406,16 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; #else - cudaDataType_t compute_type = CUDA_R_32F; + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 1d81ae0c11..b1687d12a2 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -200,10 +200,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index ac74eb1c8f..52e083889e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ 
b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,11 +215,16 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 1d9ebf67e0..26291fb3b4 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -157,10 +157,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index edf7a2d075..0aa50f605c 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,11 +158,16 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; From 3bcf3d4975db86c071e3118c6d027851925613c7 Mon Sep 17 00:00:00 2001 From: zwang86 <46699021+zwang86@users.noreply.github.com> Date: Fri, 10 Nov 2023 08:30:45 -0500 Subject: [PATCH 262/344] Reorder tokens in batch using based on token 
type (#1214) * Reorder tokens in init and verify batch. * Format code * fix --------- Co-authored-by: Zhihao Jia --- src/runtime/request_manager.cc | 194 ++++++++++++++++++++++----------- 1 file changed, 133 insertions(+), 61 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e052c8716a..f1164d3c49 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -362,39 +362,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, BatchConfig new_bc; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { // add new requests to the next batch - if (!pending_request_queue.empty() && - new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); - // all_requests[new_request.guid] = new_request; - new_bc.requestsInfo[i].first_token_depth_in_request = 0; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = new_request.guid; - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens - - BatchConfig::max_requests_per_batch() + (i + 1), - (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; - new_bc.request_completed[i] = false; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < new_request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_request.tokens[depth]; - new_bc.num_tokens++; - } - if (new_bc.num_tokens == get_max_tokens_per_batch()) { - break; - } - } + continue; } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; @@ -504,6 +472,44 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + // Step 3: add new requests to the next batch + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + if (new_bc.request_completed[i]) { + if (!pending_request_queue.empty() && + new_bc.num_tokens < get_max_tokens_per_batch()) { + Request new_request = pending_request_queue.front(); + pending_request_queue.pop(); + // all_requests[new_request.guid] = new_request; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].request_guid = new_request.guid; + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + (int)new_request.tokens.size()); + new_bc.requestsInfo[i].max_sequence_length = + new_request.max_sequence_length; + new_bc.request_completed[i] = false; + // add profile_info for the new request + ProfileInfo profile_info; + profile_info.decoding_steps = 1; + profile_info.start_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[new_request.guid] = profile_info; + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = 
new_bc.requestsInfo[i].first_token_depth_in_request + j; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + assert(depth < new_request.tokens.size()); + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_request.tokens[depth]; + new_bc.num_tokens++; + } + if (new_bc.num_tokens == get_max_tokens_per_batch()) { + break; + } + } + } + } + return new_bc; } @@ -946,8 +952,9 @@ BeamSearchBatchConfig new_bc.model_id = old_bc.model_id; // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; + // Add incremental tokens to the batch for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { + if (old_bc.request_completed[i] || !old_bc.request_running[i]) { continue; } // Comment out this assertion since num_tokens_in_batch can be @@ -1003,6 +1010,7 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = old_bc.beamRequestsInfo[i].current_depth + 1; @@ -1010,11 +1018,7 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); } else { - // if the request is pending, we need to update the beam search - // metadata based on the initial length - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth; - new_bc.request_running[i] = false; + assert(false && "Request should not be pending in beam search phase"); } // do the slot exchange to minimize the cache exchange in kernel. @@ -1026,7 +1030,8 @@ BeamSearchBatchConfig if (request.status == Request::RUNNING) { new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { - new_bc.requestsInfo[i].num_tokens_in_batch = 0; + assert(false && "Request should be done"); + // new_bc.requestsInfo[i].num_tokens_in_batch = 0; } if (verbose) { @@ -1035,6 +1040,84 @@ BeamSearchBatchConfig << ", num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; } + } + + if (verbose) { + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size + << std::endl; + } + + // register more tokens due to the beam width + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; + for (int k = 0; k < new_bc.sub_requests[i]; k++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + + // get value from requestinfo + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.beamRequestsInfo[i].tokens[k]; + + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; + new_bc.num_tokens++; + } + } + } + } + + // Add prompt tokens to the batch + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + if (old_bc.request_completed[i] || old_bc.request_running[i]) { + continue; + } + // Comment out this assertion since num_tokens_in_batch can be + // zero when beam search has reached required sequence length + // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; + + // 
assert(processed_tokens < request.tokens.size()); + log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; + + { + log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " + << new_bc.num_tokens; + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + + // update the beam search metadata + // how many sub request in current request + // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH + // entries? + new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + + // update the parentid, accumalated_probs, depth, and token_ids + new_bc.beamRequestsInfo[i].beam_size = + old_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].max_depth = + old_bc.beamRequestsInfo[i].max_depth; + + if (request.status == Request::PENDING) { + // if the request is pending, we need to update the beam search + // metadata based on the initial length + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth; + new_bc.request_running[i] = false; + } else { + assert(false && "Request should be pending"); + } + + if (new_bc.requestsInfo[i].first_token_depth_in_request >= + request.tokens.size()) { + // request is done + new_bc.requestsInfo[i].num_tokens_in_batch = 0; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -1043,17 +1126,16 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; - if (verbose) { - std::cout << "[ Beam Spec] " << request.guid << std::endl; - std::cout << "Prompt phase: " << request.tokens.size() - << ", num_tokens_in_batch:" - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << "Update ssm cache size: " << request.ssm_cache_size - << std::endl; - } } if (verbose) { + std::cout << "[ Beam Spec] " << request.guid << std::endl; + std::cout << "Prompt phase: " << request.tokens.size() + << ", num_tokens_in_batch:" + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "Update ssm cache size: " << request.ssm_cache_size + << std::endl; + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size << std::endl; std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size @@ -1068,19 +1150,8 @@ BeamSearchBatchConfig new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; // get value from requestinfo - if (request.status == Request::RUNNING) { - // std::cout << "[running ]Num of token in batch: " - // << new_bc.requestsInfo[i].num_tokens_in_batch - // << std::endl; - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_bc.beamRequestsInfo[i].tokens[k]; - } else { - // std::cout << "[pending ]Num of token in batch: " - // << new_bc.requestsInfo[i].num_tokens_in_batch - // << std::endl; - new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[request.tokens.size() - 1]; - } + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[request.tokens.size() - 1]; new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; @@ -1088,6 +1159,7 @@ BeamSearchBatchConfig } } } + if (verbose) { std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" << std::endl; From 
b15d06082babe07649e3dc20f0c516029054f9fc Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:33:39 -0500 Subject: [PATCH 263/344] Optimize attention kernel (#1228) * change layout * main change * fix * change spec&tree kernel * fix tp * fix * fix multi requests * replicate key&value * ci * cleanup&hip * more fix. * ci * new kernel * draft * fix * align inc * fix * . * multi batch * fix * fix * fix different thread per key case * fix * . * . * . * fix. * fix. * . * . * .. * opt * fix half * fix. * . * hip * clean --------- Co-authored-by: Zhihao Jia --- include/flexflow/batch_config.h | 3 + .../ops/inc_multihead_self_attention.h | 10 +- .../inc_multihead_self_attention_kernels.h | 16 + .../inc_multihead_self_attention_utils.cuh | 524 ++++++++++++++++++ .../ops/spec_inc_multihead_self_attention.h | 1 - src/ops/inc_multihead_self_attention.cu | 523 +++++++++++++---- src/ops/spec_inc_multihead_self_attention.cu | 89 +-- src/ops/tree_inc_multihead_self_attention.cu | 376 ++++++++++++- src/runtime/request_manager.cc | 9 + 9 files changed, 1380 insertions(+), 171 deletions(-) create mode 100644 include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index d625985552..e2903c4d11 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -59,6 +59,9 @@ class BatchConfig { // Set by update int num_tokens; + // number of tokens in prompt phase, start offset of tokens in inc_decoding + // phase. num_tokens - num_prompt_tokens = num_generation_tokens; + int num_generation_tokens; struct PerRequestInfo { int first_token_depth_in_request; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 5ff0942fff..43dc527bc8 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -29,7 +29,7 @@ class IncMultiHeadSelfAttention : public Op { IncMultiHeadSelfAttention(FFModel &model, LayerID const &layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -50,8 +50,8 @@ class IncMultiHeadSelfAttention : public Op { int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -73,7 +73,7 @@ class IncMultiHeadSelfAttention : public Op { char const *name); IncMultiHeadSelfAttention(FFModel &model, IncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights); IncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, @@ -192,9 +192,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { void *attn_heads; char *quantized_weight_ptr; BatchConfig::PerTokenInfo *token_infos; + BatchConfig::PerRequestInfo *request_infos; DataType quantization_type; bool offload; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + // cudaStream_t task_local_stream; cudnnTensorDescriptor_t qk_tensor; cuFloatComplex *complex_input; #elif defined(FF_USE_HIP_ROCM) diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 763f654e28..9bf2f581e2 100644 --- 
a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -14,6 +14,22 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { +template +void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + ffStream_t stream); + +template +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + ffStream_t stream); + template __global__ void apply_position_bias_qkprd(DT *input_ptr, int num_tokens, diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh new file mode 100644 index 0000000000..c128c1a126 --- /dev/null +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -0,0 +1,524 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H +#define _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H + +#include "flexflow/inference.h" + +namespace FlexFlow { + +////////////////basic datatype////////////////////// +struct half4 { + half x; + half y; + half z; + half w; +}; + +struct half8 { + half x; + half y; + half z; + half w; + half a; + half b; + half c; + half d; +}; +struct float8 { + float x; + float y; + float z; + float w; + float a; + float b; + float c; + float d; +}; + +////////////////data type/////////////// +template +struct VEC_K {}; +template <> +struct VEC_K { + using Type = float; +}; +template <> +struct VEC_K { + using Type = float2; +}; +template <> +struct VEC_K { + using Type = float4; +}; +template <> +struct VEC_K { + using Type = half; +}; +template <> +struct VEC_K { + using Type = half2; +}; +template <> +struct VEC_K { + using Type = half4; +}; + +// data type for QK production +template +struct Vec_fp32_ {}; + +template <> +struct Vec_fp32_ { + using Type = float; +}; +template <> +struct Vec_fp32_ { + using Type = float2; +}; +template <> +struct Vec_fp32_ { + using Type = float4; +}; +template <> +struct Vec_fp32_ { + using Type = float; +}; +template <> +struct Vec_fp32_ { + using Type = float2; +}; +template <> +struct Vec_fp32_ { + using Type = float4; +}; +template <> +struct Vec_fp32_ { + using Type = float8; +}; + +template +struct VEC_V {}; +template <> +struct VEC_V { + using Type = float4; +}; +template <> +struct VEC_V { + using Type = half8; +}; + +////////////////data structures half/////////////// + +////////////////////////////////////floating point +/// operations/////////////////////////////////////////// + +template +inline __device__ Acc mul(A a, B b) { + return Acc{}; // for compile +} +template <> +inline __device__ float mul(float a, float b) { + return a * b; +} + +template <> +inline __device__ float2 mul(float2 a, float2 b) { + float2 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +inline __device__ float2 mul(float a, float2 b) { + float2 c; + c.x = a * b.x; + c.y = a * b.y; + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +inline __device__ float4 mul(float4 a, float4 b) { + float4 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; + c.w = a.w * b.w; + return c; +} + +// template <> +// inline __device__ 
float4 mul(half4 a, half4 b) { +// float4 c; +// c.x = a.x * b.x; +// c.y = a.y * b.y; +// c.z = a.z * b.z; +// c.w = a.w * b.w; +// return c; +// } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float fma(float a, float b, float c) { + return a * b + c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(float2 a, float2 b, float2 c) { + float2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(float a, float2 b, float2 c) { + float2 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 fma(float4 a, float4 b, float4 c) { + float4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 fma(float a, float4 b, float4 c) { + float4 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +inline __device__ float8 fma(float a, float8 f1, float8 f2) { + float8 res; + res.x = fma(a, f1.x, f2.x); + res.y = fma(a, f1.y, f2.y); + res.z = fma(a, f1.z, f2.z); + res.w = fma(a, f1.w, f2.w); + res.a = fma(a, f1.a, f2.a); + res.b = fma(a, f1.b, f2.b); + res.c = fma(a, f1.c, f2.c); + res.d = fma(a, f1.d, f2.d); + return res; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float add(float a, float b) { + return a + b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 add(float2 a, float2 b) { + float2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 add(float4 a, float4 b) { + float4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +inline __device__ float8 add(float8 f1, float8 f2) { + float8 res; + res.x = add(f1.x, f2.x); + res.y = add(f1.y, f2.y); + res.z = add(f1.z, f2.z); + res.w = add(f1.w, f2.w); + res.a = add(f1.a, f2.a); + res.b = add(f1.b, f2.b); + res.c = add(f1.c, f2.c); + res.d = add(f1.d, f2.d); + return res; +} + +inline __device__ float sum(float v) { + return v; +} + +template +inline __device__ __host__ T div_up(T m, T n) { + return (m + n - 1) / n; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(float2 v) { + return v.x + v.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(float4 v) { + return v.x + v.y + v.z + v.w; +} + +inline __device__ float cast_to_float(float u) { + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(float2 u) { + return u; +} + 
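For intuition, the multiply/add overloads above are what let the new generation kernel keep its Q·K accumulation in float32 regardless of the storage type: each thread multiplies its vector of K elements against the matching Q vector with `fma`, then collapses the vector lanes with `sum`. Below is a minimal host-side sketch of that accumulation pattern; the names `f4`, `fma4`, `sum4`, the toy data, and the `main()` harness are invented for illustration and are not part of the patch.

```cpp
// Host-side sketch of the lane-wise multiply-accumulate + lane-sum pattern
// used by qk_dot_ (illustrative only; f4 stands in for float4 / K_vec).
#include <cstdio>
#include <vector>

struct f4 { float x, y, z, w; };

static f4 fma4(f4 a, f4 b, f4 c) {            // counterpart of fma(K_vec, K_vec, K_vec)
  return {a.x * b.x + c.x, a.y * b.y + c.y, a.z * b.z + c.z, a.w * b.w + c.w};
}
static float sum4(f4 v) { return v.x + v.y + v.z + v.w; }   // counterpart of sum(K_vec)

int main() {
  // One thread's share of a head: 8 elements handled as two 4-wide vectors,
  // mirroring the K_VECS_PER_THREAD loop in the kernel.
  std::vector<f4> q = {{1, 2, 3, 4}, {5, 6, 7, 8}};
  std::vector<f4> k = {{1, 1, 1, 1}, {2, 2, 2, 2}};
  f4 acc = {0, 0, 0, 0};
  for (size_t i = 0; i < q.size(); ++i) {
    acc = fma4(q[i], k[i], acc);              // lane-wise multiply-accumulate
  }
  printf("partial q.k = %f\n", sum4(acc));    // 10 + 52 = 62
  return 0;
}
```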
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 cast_to_float(float4 u) { + return u; +} + +inline __device__ float cast_to_float(half u) { + return __half2float(u); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(half2 u) { + float2 tmp; + tmp.x = __half2float(u.x); + tmp.y = __half2float(u.y); + return tmp; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 cast_to_float(half4 u) { + float4 tmp; + tmp.x = __half2float(u.x); + tmp.y = __half2float(u.y); + tmp.z = __half2float(u.z); + tmp.w = __half2float(u.w); + return tmp; +} +inline __device__ float8 cast_to_float(half8 u) { + float8 tmp; + tmp.x = __half2float(u.x); + tmp.y = __half2float(u.y); + tmp.z = __half2float(u.z); + tmp.w = __half2float(u.w); + tmp.a = __half2float(u.a); + tmp.b = __half2float(u.b); + tmp.c = __half2float(u.c); + tmp.d = __half2float(u.d); + return tmp; +} + +inline __device__ void convert_from_float(float4 &dst, float4 src) { + dst = src; +} +inline __device__ void convert_from_float(float &dst, float src) { + dst = src; +} +inline __device__ void convert_from_float(float2 &dst, float2 src) { + dst = src; +} +inline __device__ void convert_from_float(float8 &dst, float8 src) { + dst = src; +} + +inline __device__ void convert_from_float(half4 &dst, float4 src) { + dst.x = __float2half(src.x); + dst.y = __float2half(src.y); + dst.z = __float2half(src.z); + dst.w = __float2half(src.w); +} + +inline __device__ void convert_from_float(half8 &dst, float8 src) { + dst.x = __float2half(src.x); + dst.y = __float2half(src.y); + dst.z = __float2half(src.z); + dst.w = __float2half(src.w); + dst.a = __float2half(src.a); + dst.b = __float2half(src.b); + dst.c = __float2half(src.c); + dst.d = __float2half(src.d); +} +inline __device__ void convert_from_float(half2 &dst, float2 src) { + dst.x = __float2half(src.x); + dst.y = __float2half(src.y); +} +inline __device__ void convert_from_float(half &dst, float src) { + dst = __float2half(src); +} + +//////////////////////////////////////utils/////////////////////////////////////////////// + +template +inline __device__ void zero(T &dst) { + constexpr int WORDS = sizeof(T) / 4; + union { + T raw; + uint32_t words[WORDS]; + } tmp; +#pragma unroll + for (int ii = 0; ii < WORDS; ++ii) { + tmp.words[ii] = 0u; + } + dst = tmp.raw; +} + +template +inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { + // use float32 to get better accuracy + using Vec_sum = typename Vec_fp32_::Type; + // Compute the parallel products for Q*K^T (treat vector lanes separately). + Vec_sum qk_vec = + mul(cast_to_float(q[0]), cast_to_float(k[0])); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = FlexFlow::fma(cast_to_float(q[ii]), cast_to_float(k[ii]), qk_vec); + } + + // Finalize the reduction across lanes. + float qk = sum(qk_vec); +#pragma unroll + for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + } + return qk; +} +template +struct Qk_dot { + template + static inline __device__ float dot(K_vec const (&q)[N], K_vec const (&k)[N]) { + return qk_dot_(q, k); + } +}; + +template +inline __device__ float block_sum(float *red_smem, float sum) { + + // Decompose the thread index into warp / lane. 
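The block_sum helper that begins here combines one float across an entire thread block in two stages: an XOR-shuffle butterfly within each warp, followed by a second butterfly over the per-warp partials staged in shared memory. A self-contained CUDA sketch of that two-stage shape is below; it assumes a single block whose size is a multiple of the warp size, and the kernel name and launch harness are made up for illustration rather than taken from the patch.

```cpp
// Stand-alone sketch of a warp-shuffle + shared-memory block reduction
// (illustrative only; compile with nvcc).
#include <cstdio>

__global__ void block_sum_demo(float const *in, float *out) {
  constexpr int WARP = 32;
  __shared__ float warp_sums[32];                  // enough for up to 1024 threads
  int tid = threadIdx.x;
  float v = in[tid];

  // Stage 1: butterfly reduction inside each (full) warp.
  for (int mask = WARP / 2; mask >= 1; mask /= 2) {
    v += __shfl_xor_sync(0xffffffffu, v, mask);
  }
  if (tid % WARP == 0) {
    warp_sums[tid / WARP] = v;                     // one partial per warp
  }
  __syncthreads();

  // Stage 2: the first warp folds the per-warp partials together.
  if (tid < WARP) {
    int num_warps = blockDim.x / WARP;
    v = (tid < num_warps) ? warp_sums[tid] : 0.0f;
    for (int mask = WARP / 2; mask >= 1; mask /= 2) {
      v += __shfl_xor_sync(0xffffffffu, v, mask);
    }
    if (tid == 0) {
      *out = v;
    }
  }
}

int main() {
  int const n = 256;                               // one block, 8 full warps
  float h_in[n], *d_in, *d_out, h_out = 0.0f;
  for (int i = 0; i < n; i++) { h_in[i] = 1.0f; }
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
  block_sum_demo<<<1, n>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f (expected %d)\n", h_out, n);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```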
+ int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + +// Compute the sum per warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + // Warp leaders store the data to shared memory. + if (lane == 0) { + red_smem[warp] = sum; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The warps compute the final sums. + if (lane < WARPS_PER_BLOCK) { + sum = red_smem[lane]; + } + +// Parallel reduction inside the warp. +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + // Broadcast to other threads. + return __shfl_sync(uint32_t(-1), sum, 0); +} + +template +inline size_t smem_size_in_bytes(int hidden_size_per_head, + int max_sequence_length, + int threads_per_value, + int threads_per_block) { + // The amount of shared memory needed to store the Q*K^T values in float. + + size_t qk_sz = div_up(max_sequence_length + 1, 4) * 16; + size_t logits_sz = qk_sz; + + // The total size needed during softmax. + size_t softmax_sz = qk_sz + logits_sz; + size_t q_size = hidden_size_per_head * sizeof(DT); + + // The number of partial rows to reduce in the final reduction. + int rows_per_red = threads_per_block / threads_per_value; + // The amount of storage needed to finalize the outputs. + size_t red_sz = rows_per_red * hidden_size_per_head * sizeof(float) / 2; + // The max. + return max(softmax_sz, red_sz) + q_size; +} + +template +inline void smem_size_in_bytes_tree(int hidden_size_per_head, + int max_sequence_length, + int threads_per_value, + int threads_per_block, + TreeVerifyBatchConfig const *bc, + int shared_mem[]) { + + int max_query_length = 0; + int max_total_length = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + max_query_length = + max(max_query_length, bc->requestsInfo[i].num_tokens_in_batch); + max_total_length = max(max_total_length, + bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch); + } + + // todo fix this + int max_qk_length = max_query_length * max_total_length; + + // The amount of shared memory needed to store the Q*K^T values in float. + size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; + + size_t logits_sz = qk_sz; + + // The total size needed during softmax. + size_t softmax_sz = qk_sz + logits_sz; + + size_t q_size = hidden_size_per_head * sizeof(DT); + + // The number of partial rows to reduce in the final reduction. + int rows_per_red = threads_per_block / threads_per_value; + // The amount of storage needed to finalize the outputs. + // use 4 + size_t red_sz = rows_per_red * hidden_size_per_head * sizeof(float) / 2; + // The max. 
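Both sizing helpers above follow the same recipe: reserve room for the Q*K^T scores and their softmax logits (padded to 16-byte multiples), reserve room for the final output reduction, take the larger of the two, and add one head's worth of Q. A small host-side sketch of that arithmetic for one assumed configuration (head size 128, float32, a 1024-token window, a 128-thread block; all variable names invented for the example) is:

```cpp
// Evaluate the dynamic shared-memory budget the way smem_size_in_bytes does
// for one example configuration (illustrative only).
#include <algorithm>
#include <cstdio>

static size_t div_up(size_t m, size_t n) { return (m + n - 1) / n; }

int main() {
  size_t const head_dim = 128;                       // hidden_size_per_head (assumed)
  size_t const max_seq_len = 1024;                   // max sequence length (assumed)
  size_t const dt_size = sizeof(float);              // full precision
  size_t const threads_per_block = 128;
  size_t const threads_per_value = head_dim * dt_size / 16;   // as in threads_per_value_t

  size_t qk_sz = div_up(max_seq_len + 1, 4) * 16;    // Q*K^T scores, 16B-padded
  size_t softmax_sz = qk_sz + qk_sz;                 // scores + logits
  size_t q_sz = head_dim * dt_size;                  // one head of Q
  size_t rows_per_red = threads_per_block / threads_per_value;
  size_t red_sz = rows_per_red * head_dim * sizeof(float) / 2;

  printf("qk=%zu softmax=%zu red=%zu total=%zu bytes\n",
         qk_sz, softmax_sz, red_sz, std::max(softmax_sz, red_sz) + q_sz);
  return 0;
}
```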
+ shared_mem[0] = qk_sz; + shared_mem[1] = softmax_sz + red_sz + q_size; +} + +template +struct threads_per_value_t { + static int const value = Dh * sizeof(T) / 16; +}; + +} // namespace FlexFlow +#endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H \ No newline at end of file diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 363776cdb0..56bb2bd80d 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -140,7 +140,6 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: Realm::RegionInstance beam_search_reserve_inst; - BatchConfig::PerRequestInfo *request_infos; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; }; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cff5550c85..20f7d64936 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -19,6 +19,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -27,9 +28,277 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + namespace Kernels { namespace IncMultiHeadAttention { +// gridDim = num_heads +// blockDim = num_tokens/num_request * head_size +// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| +// one thread process one head_size +template +__global__ void compute_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + bool is_beam, + int max_beam_width) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // eg. if head_size = 128, thread_per_key = 4, with float32 precision + // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 + // K_ELTS_PER_THREAD = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 / 1 = 32 + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); + // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const beam_request_idx = + is_beam ? request_idx / max_beam_width : request_idx; + int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; + + int const first_step = 0; + + int const tlength = + request_infos[beam_request_idx].first_token_depth_in_request + + request_infos[beam_request_idx].num_tokens_in_batch; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + beam_request_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + // DT const *q_ptr = + // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + + // q tensor in this thread + // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total + // K_VECS_PER_THREAD elements + // QK_vec_k: 32->1, 64->2, 128->4... head_size + // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + // first iter = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 + // K_PER_ITER how many keys in this loop + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + + (beam_request_idx * max_beam_width + beam_sub_request_idx) * + max_seq_length * hidden_size + + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + // get k, perform qk proj + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast(k_cache_batch + + ti_circ * hidden_size + + head_idx * per_head_size + jj); + } + // Compute dot product. + // This includes a reduction across the threads in the same thread group. 
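The reduction referred to in the comment above is qk_dot_ from the new utils header: the THREADS_PER_KEY lanes that share one key each own a disjoint slice of the head dimension, and their partial products are merged with an XOR-shuffle butterfly so every lane in the group ends up holding the full Q·K score. A self-contained CUDA sketch of a 4-lane group reduction of that shape follows; the kernel name, the 8-element toy head, and the launch harness are assumptions made for the example.

```cpp
// Stand-alone sketch of a THREADS_PER_KEY-style group dot product
// (illustrative only; compile with nvcc).
#include <cstdio>

__global__ void group_dot_demo(float const *q, float const *k, float *out) {
  constexpr int THREADS_PER_KEY = 4;        // lanes cooperating on one key
  constexpr int ELTS_PER_THREAD = 2;        // each lane owns 2 of the 8 elements
  int lane = threadIdx.x;                   // 0..3 in this toy launch

  // Each lane multiplies its private slice of the head dimension.
  float partial = 0.0f;
  for (int i = 0; i < ELTS_PER_THREAD; i++) {
    int idx = lane * ELTS_PER_THREAD + i;
    partial += q[idx] * k[idx];
  }

  // Butterfly over the group: after log2(4) = 2 steps every lane holds the
  // complete dot product, which is the shape of qk_dot_.
  for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) {
    partial += __shfl_xor_sync(0xfu, partial, mask);   // only lanes 0-3 participate
  }
  if (lane == 0) {
    *out = partial;
  }
}

int main() {
  float h_q[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float h_k[8] = {1, 1, 1, 1, 1, 1, 1, 1};
  float *d_q, *d_k, *d_out, h_out = 0.0f;
  cudaMalloc(&d_q, sizeof(h_q));
  cudaMalloc(&d_k, sizeof(h_k));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_q, h_q, sizeof(h_q), cudaMemcpyHostToDevice);
  cudaMemcpy(d_k, h_k, sizeof(h_k), cudaMemcpyHostToDevice);
  group_dot_demo<<<1, 4>>>(d_q, d_k, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("q.k = %f (expected 36)\n", h_out);
  cudaFree(d_q);
  cudaFree(d_k);
  cudaFree(d_out);
  return 0;
}
```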
+ } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // // todo add positional embedding to the qk production + // // Store the product to shared memory. There's one qk value per + // timestep. + // // Update the max. + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); + } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + + (beam_request_idx * max_beam_width + beam_sub_request_idx) * + max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. 
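The loop that follows is a halving tree reduction over the V thread groups: in each round the upper half of the still-active groups publishes its partial output vectors to shared memory and the lower half adds them in, so after log2(V_PER_ITER) rounds group 0 holds the complete weighted value sum for the head. A simplified CUDA sketch of the same halving pattern over per-group scalars is below; the kernel name, the group count of 8, and the launch harness are assumptions made for the example.

```cpp
// Stand-alone sketch of the halving reduction used to combine per-group
// partial outputs through shared memory (illustrative only).
#include <cstdio>

__global__ void halving_reduce_demo(float const *partials, float *out, int groups) {
  extern __shared__ float scratch[];        // one slot per group in the upper half
  int g = threadIdx.x;                      // this thread stands in for one V-group
  float acc = partials[g];

  for (int active = groups; active >= 2; active /= 2) {
    int midpoint = active / 2;
    if (g >= midpoint && g < active) {
      scratch[g - midpoint] = acc;          // upper half publishes its partials
    }
    __syncthreads();
    if (g < midpoint) {
      acc += scratch[g];                    // lower half folds them in
    }
    __syncthreads();
  }
  if (g == 0) {
    *out = acc;                             // group 0 now holds the full sum
  }
}

int main() {
  int const groups = 8;
  float h_in[groups], *d_in, *d_out, h_out = 0.0f;
  for (int i = 0; i < groups; i++) { h_in[i] = float(i + 1); }   // 1..8
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  halving_reduce_demo<<<1, groups, (groups / 2) * sizeof(float)>>>(d_in, d_out, groups);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f (expected 36)\n", h_out);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```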
+ if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + beam_request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } +} + // only used by MPT model. https://arxiv.org/abs/2108.12409 template __global__ void apply_position_bias_qkprd(DT *input_ptr, @@ -350,6 +619,117 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + cudaStream_t stream) { + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; +#endif + // Project to output, save result directly on output tensor + DT alpha = 1.0f, beta = 0.0f; + // int num_tokens = bc->num_active_tokens(); + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = num_tokens; + int lda = k, ldb = k, ldc = m_; + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + DT const *B = static_cast
<DT *>(m->attn_heads); + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + if (*m->final_bias && shard_id == 0) { + int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; + apply_proj_bias_w<<>>( + output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); + } +} + +#define LAUNCH_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
<DT>(m->qProjSize, \ + BatchConfig::max_sequence_length(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_attention_kernel_generation_kernel \ + <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>( \ + static_cast<DT *>
(m->devQKVProjArray), \ + static_cast<DT *>
(m->keyCache), \ + static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + false, \ + 0) + +template +void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, @@ -419,18 +799,26 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { - // here because we need position info in inference 1 if (m->offload && m->biasSize > 0) { cudaMemcpyAsync( m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
(m->bias_ptr); } + + // todo Xinhao copy how many requests if requests are not continous? cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(m->request_infos, + &(bc->requestsInfo), + bc->max_requests_per_batch() * + sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -440,14 +828,24 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, static_cast
<DT *>(m->devQKVProjArray), bias_ptr, stream); - - // phase 2: Update key/val cache update_kv_cache_kernel<DT>
(m, bc, stream); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + if (bc->num_generation_tokens > 0) { + // phase 3: Compute attention score for generation tokens + compute_attention_kernel_generation
<DT>( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + } + + if (bc->num_tokens > bc->num_generation_tokens) { + // phase 4: Compute attention score for prompt tokens; + compute_attention_kernel_prompt( + m, bc, shard_id, bias_ptr, weight_ptr, stream); + } + + // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); + compute_o_prod_bias( + m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } } // namespace IncMultiHeadAttention @@ -501,13 +899,12 @@ __global__ void fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *bias_ptr, + DT const *weight_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); @@ -675,8 +1072,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests + + // store the result attn heads, also skip the genration tokens C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_q_heads * m->vProjSize; + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -702,52 +1102,6 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); tokens_previous_requests += num_new_tokens; } - - // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = bc->num_active_tokens(); - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
<DT *>(m->attn_heads); - DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - - apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); - } - assert(tokens_previous_requests == num_tokens); } @@ -811,6 +1165,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( } else { assert(false && "Unspported data type"); } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -819,38 +1174,6 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); - - // if (input.data_type == DT_HALF) { - // print_tensor(input.get_half_ptr(), - // 32, - // "[IncMultiHeadSelfAttention:forward:input]"); - // print_tensor(weight.get_half_ptr(), - // 32, - // "[IncMultiHeadSelfAttention:forward:weight]"); - // print_tensor(output.get_half_ptr(), - // 32, - // "[IncMultiHeadSelfAttention:forward:output]"); - // print_tensor( - // bias.get_half_ptr(), 32, - // "[IncMultiHeadSelfAttention:forward:bias]"); - // } else { - // print_tensor(input.get_float_ptr(), - // 32, - // "[IncMultiHeadSelfAttention:forward:input]"); - // print_tensor(weight.get_float_ptr(), - // 32, - // "[IncMultiHeadSelfAttention:forward:weight]"); - // print_tensor(output.get_float_ptr(), - // 32, - // "[IncMultiHeadSelfAttention:forward:output]"); - // print_tensor( - // bias.get_float_ptr(), 32, - // "[IncMultiHeadSelfAttention:forward:bias]"); - // } - - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); } } @@ -1013,6 +1336,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( default: assert(false && "Unkown inference mode"); } + size_t requestinfo_size = BatchConfig::max_requests_per_batch(); size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; @@ -1025,8 +1349,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 2 * qk_prod_size + attn_heads_size) * size_of_dt + tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + - complex_size * sizeof(cuFloatComplex); // more components will - // be added here later + complex_size * sizeof(cuFloatComplex) + + requestinfo_size * + sizeof(BatchConfig::PerRequestInfo); // more components will + // be added here later if (offload) { // assert that we have enough reserved work space left size_t totalSharedSize = @@ -1086,6 +1412,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(cuFloatComplex); + request_infos = + gpu_mem_allocator.allocate_reserved( + requestinfo_size); } else { token_infos = gpu_mem_allocator.allocate_instance( @@ -1098,6 +1427,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); + request_infos = + 
gpu_mem_allocator.allocate_instance( + requestinfo_size); } // allocate more size for quantization data @@ -1131,5 +1463,4 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( GenericTensorAccessorR const weight, DataType data_type, cudaStream_t stream); - }; // namespace FlexFlow diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 52e083889e..6dad1c6de9 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -17,6 +17,7 @@ #endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" @@ -203,13 +204,13 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { +void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); @@ -228,7 +229,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); - // int tokens_previous_requests = 0; + int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; @@ -399,8 +400,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast
(m->attn_heads) + - bc->requestsInfo[i].first_token_offset_in_batch * m->num_q_heads * - m->vProjSize; + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -425,54 +426,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // tokens_previous_requests += num_new_tokens; + tokens_previous_requests += num_new_tokens; tokens_prev_requests_squares += num_new_tokens * total_tokens; } } - // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = bc->num_active_tokens(); - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
(m->attn_heads); - DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); - } - // assert(tokens_previous_requests == num_tokens); } @@ -520,11 +478,23 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, stream); // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); - + if (bc->num_generation_tokens > 0) { + compute_attention_kernel_generation<DT>
( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + } // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + if (bc->num_tokens > bc->num_generation_tokens) { + compute_attention_kernel_prompt( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + } + + // compute output production and bias together for all tokens + int num_tokens = + bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + + compute_o_prod_bias( + m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } } // namespace SpecIncMultiHeadAttention @@ -643,7 +613,6 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( size_t beam_requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); size_t total_size = - requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + beam_tokeninfo_size * sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + beam_requestinfo_size * @@ -660,10 +629,6 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( beam_tokeninfo_size); // offset += beam_tokeninfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - request_infos = - gpu_mem_allocator.allocate_instance( - requestinfo_size); - // offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); beam_request_infos = gpu_mem_allocator .allocate_instance( diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 0aa50f605c..bc7d1017b7 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -17,6 +17,7 @@ #endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" @@ -26,11 +27,251 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace TreeIncMultiHeadAttention { +template +__global__ void compute_attention_kernel_fused_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int const max_token_per_batch, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + int num_heads, + int num_requests, + int qk_smem_sz) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const first_step = 0; + + int const tlength = request_infos[request_idx].first_token_depth_in_request + + request_infos[request_idx].num_tokens_in_batch; + int const qlength = request_infos[request_idx].num_tokens_in_batch; + + int first_token_idx = 0; + for (int r = 0; r < request_idx; r++) { + first_token_idx += request_infos[request_idx].num_tokens_in_batch; + } + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_ + qk_smem_sz); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + request_idx * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < qlength; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); + } + + int pos = ti * qlength + qi; + if (((pos / qlength) % tlength) > (pos % qlength + tlength - qlength)) { + qk = -FLT_MAX; + } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + + qk_smem[pos] = mask ? 0.f : qk; + } + } + __syncthreads(); + + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. 
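+    // At this point the shuffle loop above has reduced qk_max within each warp,
+    // and lane 0 of every warp has stored its warp-level maximum into red_smem;
+    // after the barrier below, each warp re-reads those per-warp maxima,
+    // reduces them again, and broadcasts the block-wide qk_max from lane 0.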
+ __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; + + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti * qlength + qi] - qk_max); + exp_sum += logit; + qk_smem[ti * qlength + qi] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti * qlength + qi] *= inv_sum; + } + + __syncthreads(); + } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + request_idx * max_seq_length * hidden_size + vi; + + for (int qi = 0; qi < qlength; qi++) { + zero(out); + __syncthreads(); + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + float logit = qk_smem[ti * qlength + qi]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. 
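+    // After the pairwise reduction above, the value group with vo == 0 holds
+    // the complete softmax-weighted sum of values for this query position and
+    // head; it is converted from the float accumulator back to DT and stored
+    // at the token's offset in the output buffer.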
+ if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} + template __global__ void commit_tokens_kernel( DT const *devQKVProjArray, @@ -128,6 +369,37 @@ __global__ void update_tree_branch_kv_cache( } } +template +__global__ void update_tree_branch_kv_cache_fused( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_new_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { + + int token_idx = i / hidden_size; + int offset = i % hidden_size; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; + } +} + template __global__ void tree_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, @@ -200,6 +472,9 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, num_new_tokens++; } + std::cout << "num_new_tokens: " << num_new_tokens << "\n"; + assert(false); + int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { @@ -438,6 +713,79 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, assert(processed_tokens_in_batch == bc->num_active_tokens()); } +#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_size_in_bytes_tree
(m->qProjSize, \ BatchConfig::max_sequence_length(), \ THDS_PER_VALUE, \ THDS_PER_BLOCK, \ bc, \ smem_sz); \ compute_attention_kernel_fused_kernel<DT, THDS_PER_BLOCK, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE> \ <<<grid, THDS_PER_BLOCK, smem_sz[1], stream>>>( \ static_cast<DT *>
(m->devQKVProjArray), \ static_cast<DT *>
(m->keyCache), \ static_cast<DT *>
(m->valueCache), \ output_ptr, \ scale, \ BatchConfig::max_sequence_length(), \ BatchConfig::max_tokens_per_batch(), \ m->qProjSize, \ m->hidden_size, \ m->request_infos, \ m->num_q_heads, \ bc->num_active_requests(), \ smem_sz[0]) + +template <typename DT> +void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + + // update the kv cache + // update K-V cache + int num_new_tokens = bc->num_active_tokens(); + int parallelism = m->hidden_size * num_new_tokens; + update_tree_branch_kv_cache_fused<<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( + static_cast<DT *>
(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_new_tokens, + BatchConfig::max_sequence_length(), + m->hidden_size); + + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + + // 0->qk production size, 1->total shared size + int smem_sz[2]; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t<DT, 128>::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "an unsupported head size"); + } +} + template <typename DT> void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, @@ -463,6 +811,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, bias_ptr = static_cast<DT *>
(m->bias_ptr); } } + // copy committed tokens info to GPU for the commit_tokens kernel // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing @@ -491,6 +840,12 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, sizeof(TreeVerifyBatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(m->request_infos, + &(bc->requestsInfo), + bc->max_requests_per_batch() * + sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -504,11 +859,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( // m, bc, stream); + // use the new kernel + compute_attention_kernel_fused
( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + + int processed_tokens_in_batch = bc->num_active_tokens(); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + compute_o_prod_bias(m, + bc, + shard_id, + output_ptr, + weight_ptr, + bias_ptr, + processed_tokens_in_batch, + stream); } } // namespace TreeIncMultiHeadAttention @@ -583,10 +947,6 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); - printf("TreeIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); } } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index f1164d3c49..7c37f3391e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -357,6 +357,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // log_req_mgr.print("Output: %s", output.c_str()); } } + int num_generation_tokens = 0; // Step 2: prepare the next batch for existing requests BatchConfig new_bc; @@ -450,6 +451,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, request.tokens.size()) { // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; + num_generation_tokens++; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -471,6 +473,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } } + new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { @@ -563,6 +566,8 @@ BeamSearchBatchConfig new_bc.model_id = model_id; int result_index = 0; + int num_generation_tokens = 0; + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { continue; @@ -889,6 +894,7 @@ BeamSearchBatchConfig } } } + new_bc.num_generation_tokens = num_generation_tokens; if (verbose) { std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" @@ -951,6 +957,7 @@ BeamSearchBatchConfig BeamSearchBatchConfig new_bc; new_bc.model_id = old_bc.model_id; // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; + int num_generation_tokens = 0; // Add incremental tokens to the batch for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { @@ -1155,11 +1162,13 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; + num_generation_tokens++; } } } } + new_bc.num_generation_tokens = num_generation_tokens; if (verbose) { std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" << std::endl; From 672cdad3b1e7f4416bcc694c88d07c7145bb9295 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 17 Nov 2023 05:57:14 -0800 Subject: [PATCH 264/344] fix ucx against inference branch (#1230) * fix ucx * use the old way to set c++17 --- CMakeLists.txt | 108 +++++--------------------------------------- MULTI-NODE.md | 57 +++++++++++++++++++++-- cmake/cuda.cmake | 2 + cmake/legion.cmake | 4 ++ cmake/nccl.cmake | 3 +- config/config.inc | 9 ++-- config/config.linux | 8 ++-- 7 files changed, 81 insertions(+), 110 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3732d5ff6f..b201cf99dc 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,10 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake) set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR}) set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC -UNDEBUG") +# set std 17 +#set(CMAKE_CXX_STANDARD 17) +#set(CMAKE_CUDA_STANDARD 17) + option(INFERENCE_TESTS "Run inference tests" OFF) set(LIBTORCH_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../libtorch" CACHE STRING "LibTorch Path") if (INFERENCE_TESTS) @@ -69,106 +73,15 @@ option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if availab option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON) option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF) -# option for using Python -set(FF_GASNET_CONDUITS aries udp mpi ibv ucx) +# option for using network +set(FF_GASNET_CONDUITS aries udp mpi ibv) set(FF_GASNET_CONDUIT "mpi" CACHE STRING "Select GASNet conduit ${FF_GASNET_CONDUITS}") set_property(CACHE FF_GASNET_CONDUIT PROPERTY STRINGS ${FF_GASNET_CONDUITS}) set(FF_LEGION_NETWORKS "" CACHE STRING "Network backend(s) to use") -if ((FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") OR FF_LEGION_NETWORKS STREQUAL "ucx") - if("${FF_UCX_URL}" STREQUAL "") - set(UCX_URL "https://github.com/openucx/ucx/releases/download/v1.14.0-rc1/ucx-1.14.0.tar.gz") - else() - set(UCX_URL "${FF_UCX_URL}") - endif() - - set(UCX_DIR ${CMAKE_CURRENT_BINARY_DIR}/ucx) - get_filename_component(UCX_COMPRESSED_FILE_NAME "${UCX_URL}" NAME) - # message(STATUS "UCX_URL: ${UCX_URL}") - # message(STATUS "UCX_COMPRESSED_FILE_NAME: ${UCX_COMPRESSED_FILE_NAME}") - set(UCX_COMPRESSED_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${UCX_COMPRESSED_FILE_NAME}") - set(UCX_BUILD_NEEDED OFF) - set(UCX_CONFIG_FILE ${UCX_DIR}/config.txt) - set(UCX_BUILD_OUTPUT ${UCX_DIR}/build.log) - - if(EXISTS ${UCX_CONFIG_FILE}) - file(READ ${UCX_CONFIG_FILE} PREV_UCX_CONFIG) - # message(STATUS "PREV_UCX_CONFIG: ${PREV_UCX_CONFIG}") - if("${UCX_URL}" STREQUAL "${PREV_UCX_CONFIG}") - # configs match - no build needed - set(UCX_BUILD_NEEDED OFF) - else() - message(STATUS "UCX configuration has changed - rebuilding...") - set(UCX_BUILD_NEEDED ON) - endif() - else() - message(STATUS "Configuring and building UCX...") - set(UCX_BUILD_NEEDED ON) - endif() - - if(UCX_BUILD_NEEDED) - if(NOT EXISTS "${UCX_COMPRESSED_FILE_PATH}") - message(STATUS "Downloading openucx/ucx from: ${UCX_URL}") - file( - DOWNLOAD - "${UCX_URL}" "${UCX_COMPRESSED_FILE_PATH}" - SHOW_PROGRESS - STATUS status - LOG log - ) - - list(GET status 0 status_code) - list(GET status 1 status_string) - - if(status_code EQUAL 0) - message(STATUS "Downloading... 
done") - else() - message(FATAL_ERROR "error: downloading '${UCX_URL}' failed - status_code: ${status_code} - status_string: ${status_string} - log: - --- LOG BEGIN --- - ${log} - --- LOG END ---" - ) - endif() - else() - message(STATUS "${UCX_COMPRESSED_FILE_NAME} already exists") - endif() - - execute_process(COMMAND mkdir -p ${UCX_DIR}) - execute_process(COMMAND tar xzf ${UCX_COMPRESSED_FILE_PATH} -C ${UCX_DIR} --strip-components 1) - message(STATUS "Building UCX...") - execute_process( - COMMAND sh -c "cd ${UCX_DIR} && ${UCX_DIR}/contrib/configure-release --prefix=${UCX_DIR}/install --enable-mt && make -j8 && make install" - RESULT_VARIABLE UCX_BUILD_STATUS - OUTPUT_FILE ${UCX_BUILD_OUTPUT} - ERROR_FILE ${UCX_BUILD_OUTPUT} - ) - - if(UCX_BUILD_STATUS) - message(FATAL_ERROR "UCX build result = ${UCX_BUILD_STATUS} - see ${UCX_BUILD_OUTPUT} for more details") - endif() - - # Currently, we use default build configurations for UCX and therefore only save URL as configuration settings - file(WRITE ${UCX_CONFIG_FILE} "${UCX_URL}") - endif() - - if (FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") - set(ENV{UCX_HOME} "${UCX_DIR}/install") - install(DIRECTORY ${UCX_DIR}/install/bin/ DESTINATION bin) - install(DIRECTORY ${UCX_DIR}/install/include/ DESTINATION include) - install(DIRECTORY ${UCX_DIR}/install/lib/ DESTINATION lib) - install(DIRECTORY ${UCX_DIR}/install/share/ DESTINATION share) - endif() - - if (FF_LEGION_NETWORKS STREQUAL "ucx") - set(ucx_DIR ${UCX_DIR}/cmake) - set(ENV{Legion_NETWORKS} "ucx") - message(STATUS "Legion_NETWORKS: $ENV{Legion_NETWORKS}") - endif() -else() - message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}") +message(STATUS "FF_LEGION_NETWORKS: ${FF_LEGION_NETWORKS}") +if (FF_LEGION_NETWORKS STREQUAL "gasnet") + message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}") endif() set(FF_GPU_BACKENDS cuda hip_cuda hip_rocm intel) @@ -213,6 +126,7 @@ list(APPEND CC_FLAGS list(APPEND NVCC_FLAGS -std=c++17) + add_compile_options(${CC_FLAGS}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS}) link_libraries(${LD_FLAGS}) @@ -524,7 +438,7 @@ if(NOT BUILD_LEGION_ONLY) endif() # build binary - option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON) + option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" OFF) option(FF_BUILD_RESNET "build resnet example" OFF) option(FF_BUILD_RESNEXT "build resnext example" OFF) option(FF_BUILD_ALEXNET "build alexnet example" OFF) diff --git a/MULTI-NODE.md b/MULTI-NODE.md index 9cf95976ac..37e8f6182c 100644 --- a/MULTI-NODE.md +++ b/MULTI-NODE.md @@ -17,15 +17,33 @@ Source: Custom (use the security group ID) You can also use your own GPU cluster, as long as all machines are interconnected with a low-latency network. -## 2. Configure and build FlexFlow +## 2. Configure and build UCX -Follow steps 1 to 5 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. +Find the latest source code release for UCX at https://github.com/openucx/ucx/releases. As of writing this documentation, the latest UCX was 1.15.0 at https://github.com/openucx/ucx/releases/download/v1.15.0/ucx-1.15.0.tar.gz. 
Extract it and switch to the directory with UCX source code, and run: + +``` +CUDA_PATH=/usr/local/cuda +PREFIX=$PWD/install +./contrib/configure-release-mt --prefix="$PREFIX" --without-go --enable-mt --with-cuda="$CUDA_PATH" +make -j install +echo "$PREFIX" +``` + +Replace `{{ CUDA_PATH }}` with the path of your CUDA installation. If you don't know the path, try `which nvcc`. Take note of the path of UCX installation, echoed as part of the last command. + +## 3. Configure and build FlexFlow + +Follow steps 1 to 5 in [INSTALL.md](INSTALL.md#1-download-the-source-code) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. Or you can use NFS to mount home directory of each instance so that only a single build is necessary. You can skip step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI, which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance. -For step 4 (Configuring the FlexFlow build), make sure to specify a network using the `FF_LEGION_NETWORKS` parameter. We recommend using `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT=ucx`. Other configurations are optional. +For step 4 (Configuring the FlexFlow build), here are the parameters that need to be configured: +* Set `FF_LEGION_NETWORKS=ucx` +* Set `UCX_DIR` to the UCX installation path mentioned in [Configure and build UCX](#2-configure-and-build-ucx) -## 3. Configure MPI +Other configuration options are optional. + +## 4. Configure MPI MPI is an easy way to launch FlexFlow across all instances simultaneously and set up communication between them. @@ -64,8 +82,37 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su 5. Test MPI by running `mpirun -N 1 --hostfile ~/hostfile hostname`. It should display the hostname of all your nodes. If you encounter any errors like `WARNING: Open MPI accepted a TCP connection from what appears to be another Open MPI process but cannot find a corresponding process entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command (refer to [this Stack Overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)). -## 4. Test FlexFlow +## 5. Test FlexFlow +<<<<<<< HEAD Follow step 6 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to set environment variables. A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html). +======= +Follow step 6 in [INSTALL.md](INSTALL.md#6-test-flexflow) to set environment variables. 
+ +Save the following script as `mnist_mlp_run.sh` and make sure to change `FLEXFLOW_DIR` and `UCX_DIR` to appropriate paths: + +```bash +#!/bin/bash +eval "$(conda shell.bash hook)" +conda activate flexflow +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + +# Path to your FlexFlow build +FLEXFLOW_DIR=/home/ubuntu/FlexFlow/build + +# Path to your UCX installation +UCX_DIR=/home/ubuntu/ucx-1.15.0/install + +export REALM_UCP_BOOTSTRAP_PLUGIN=$FLEXFLOW_DIR/deps/legion/lib/realm_ucp_bootstrap_mpi.so +export LD_LIBRARY_PATH=$FLEXFLOW_DIR/deps/legion/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$FLEXFLOW_DIR:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$UCX_DIR/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/opt/conda/envs/flexflow/lib:$LD_LIBRARY_PATH + +mpiexec -x REALM_UCP_BOOTSTRAP_PLUGIN -x PATH -x LD_LIBRARY_PATH --hostfile ~/hostfile --mca btl_tcp_if_include ens5 -np 2 "$FLEXFLOW_DIR"/flexflow_python "$FLEXFLOW_DIR"/../examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000 +``` + +Run the script to test FlexFlow on mnist mlp training. You can adjust the script to run any other program. +>>>>>>> c031ab1f... fix ucx diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index a1a66c7cc8..68e4ca07b1 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -70,6 +70,7 @@ if(CUDA_FOUND) endforeach() string(REGEX REPLACE "([0-9]+)" "-gencode arch=compute_\\1,code=sm_\\1" CUDA_GENCODE "${CUDA_GENCODE}") + set(CMAKE_CUDA_COMPILER "${CUDA_NVCC_EXECUTABLE}") #output message( STATUS "CUDA_VERSION: ${CUDA_VERSION}") message( STATUS "CUDA root path : ${CUDA_TOOLKIT_ROOT_DIR}" ) @@ -80,6 +81,7 @@ if(CUDA_FOUND) message( STATUS "CURAND libraries : ${CUDA_curand_LIBRARY}" ) message( STATUS "CUDA Arch : ${FF_CUDA_ARCH}" ) message( STATUS "CUDA_GENCODE: ${CUDA_GENCODE}") + message( STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") list(APPEND FLEXFLOW_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) diff --git a/cmake/legion.cmake b/cmake/legion.cmake index b83cbc52f2..2afb507d3b 100644 --- a/cmake/legion.cmake +++ b/cmake/legion.cmake @@ -132,6 +132,10 @@ else() set(Legion_EMBED_GASNet_VERSION "GASNet-2022.3.0" CACHE STRING "GASNet version") set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit") set(GASNet_CONDUIT ${FF_GASNET_CONDUIT}) + elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx") + set(ucx_ROOT ${UCX_PATH}/lib/cmake) + message(STATUS "Find ucx: ${UCX_PATH}") + set(Legion_NETWORKS "ucx" CACHE STRING "Enable UCX") endif() message(STATUS "GASNET ROOT: $ENV{GASNet_ROOT_DIR}") set(Legion_MAX_DIM ${FF_MAX_DIM} CACHE STRING "Maximum number of dimensions") diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index 04a23dcb8a..c140a44ec8 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -109,8 +109,9 @@ else() message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) add_library(nccl SHARED IMPORTED) + + # Build NCCL from source else() - # Build NCCL from source message(STATUS "Building NCCL from source") list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) diff --git a/config/config.inc b/config/config.inc index 5a7bde5ce9..e5c9c69acf 100644 --- a/config/config.inc +++ b/config/config.inc @@ -108,12 +108,13 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi" elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp" - elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then - SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx" - 
SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL" fi elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx" + # set ucx dir + if [ -n "$UCX_DIR" ]; then + SET_UCX="-DUCX_PATH=${UCX_DIR}" + fi fi # build C++ examples @@ -227,7 +228,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 37b9bd16fd..db71abcacd 100755 --- a/config/config.linux +++ b/config/config.linux @@ -42,7 +42,8 @@ CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} # set CUDA dir in case cmake cannot autodetect a path CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} -#set NCCL dir +# if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib, +# otherwise, we will build nccl from source NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"} # enable Python @@ -54,8 +55,8 @@ FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-} # select GASNET conduit FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} -# set UCX URL -FF_UCX_URL=${FF_UCX_URL:-""} +# set UCX dir if Legion networks is set to ucx +UCX_DIR=${UCX_DIR:-""} # build C++ examples FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} @@ -67,6 +68,7 @@ FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} # use precompiled NCCL and Legion libraries, where available FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL:-OFF} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-OFF} + # use the flag below to use both the NCCL and Legion pre-built libraries. # when the flag below is set to ON, the two flags above are ignored. FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES:-OFF} From 457b5f2e8b2bf751d30fb074ebe4e8b3b937be85 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 28 Nov 2023 10:33:58 -0500 Subject: [PATCH 265/344] post ucx fixes --- MULTI-NODE.md | 32 ++------------------------------ config/config.linux | 2 +- scripts/mnist_mlp_run.sh | 15 ++++++++++++++- 3 files changed, 17 insertions(+), 32 deletions(-) diff --git a/MULTI-NODE.md b/MULTI-NODE.md index 37e8f6182c..28f2eab8ed 100644 --- a/MULTI-NODE.md +++ b/MULTI-NODE.md @@ -84,35 +84,7 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su ## 5. Test FlexFlow -<<<<<<< HEAD -Follow step 6 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to set environment variables. 
+Follow step 6 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to set the environment variables. -A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html). -======= -Follow step 6 in [INSTALL.md](INSTALL.md#6-test-flexflow) to set environment variables. +A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. Run the script to test FlexFlow on mnist mlp training. You can adjust the script to run any other program. Make sure to change the `FLEXFLOW_DIR` and `UCX_DIR` variables in it to appropriate paths. -Save the following script as `mnist_mlp_run.sh` and make sure to change `FLEXFLOW_DIR` and `UCX_DIR` to appropriate paths: - -```bash -#!/bin/bash -eval "$(conda shell.bash hook)" -conda activate flexflow -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - -# Path to your FlexFlow build -FLEXFLOW_DIR=/home/ubuntu/FlexFlow/build - -# Path to your UCX installation -UCX_DIR=/home/ubuntu/ucx-1.15.0/install - -export REALM_UCP_BOOTSTRAP_PLUGIN=$FLEXFLOW_DIR/deps/legion/lib/realm_ucp_bootstrap_mpi.so -export LD_LIBRARY_PATH=$FLEXFLOW_DIR/deps/legion/lib:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=$FLEXFLOW_DIR:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=$UCX_DIR/lib:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/opt/conda/envs/flexflow/lib:$LD_LIBRARY_PATH - -mpiexec -x REALM_UCP_BOOTSTRAP_PLUGIN -x PATH -x LD_LIBRARY_PATH --hostfile ~/hostfile --mca btl_tcp_if_include ens5 -np 2 "$FLEXFLOW_DIR"/flexflow_python "$FLEXFLOW_DIR"/../examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000 -``` - -Run the script to test FlexFlow on mnist mlp training. You can adjust the script to run any other program. ->>>>>>> c031ab1f... 
fix ucx diff --git a/config/config.linux b/config/config.linux index db71abcacd..30edfa7dfe 100755 --- a/config/config.linux +++ b/config/config.linux @@ -102,7 +102,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/scripts/mnist_mlp_run.sh b/scripts/mnist_mlp_run.sh index 8842790e6a..b070195d88 100755 --- a/scripts/mnist_mlp_run.sh +++ b/scripts/mnist_mlp_run.sh @@ -2,4 +2,17 @@ eval "$(conda shell.bash hook)" conda activate flexflow export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib -~/FlexFlow/python/flexflow_python ~/FlexFlow/examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000 + +# Path to your FlexFlow build +FLEXFLOW_DIR=/home/ubuntu/FlexFlow/build + +# Path to your UCX installation +UCX_DIR=/home/ubuntu/ucx-1.15.0/install + +export REALM_UCP_BOOTSTRAP_PLUGIN=$FLEXFLOW_DIR/deps/legion/lib/realm_ucp_bootstrap_mpi.so +export LD_LIBRARY_PATH=$FLEXFLOW_DIR/deps/legion/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$FLEXFLOW_DIR:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$UCX_DIR/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/opt/conda/envs/flexflow/lib:$LD_LIBRARY_PATH + +mpiexec -x REALM_UCP_BOOTSTRAP_PLUGIN -x PATH -x LD_LIBRARY_PATH --hostfile ~/hostfile --mca btl_tcp_if_include ens5 -np 2 "$FLEXFLOW_DIR"/flexflow_python "$FLEXFLOW_DIR"/../examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000 From 5501cf86a5bf00a8cf4589559c3c45732b4a8d26 Mon Sep 17 00:00:00 2001 From: Soumya Chatterjee Date: Thu, 30 Nov 2023 18:00:23 -0800 Subject: [PATCH 266/344] Fix tensor shapes for elementwise binary operations with broadcasting (#1234) * Fix shapes in keras * remove extra lines * Add python<3.12 requirement to fix CI errors * Add python<=3.11 requirement to fix CI errors * Tweak around requirement to fix CI errors * Restore flexflow.yml * Restore pytorch-gpu.yml --------- Co-authored-by: Zhihao 
Jia --- python/flexflow/keras/layers/merge.py | 33 ++++++++++++++++++++++++--- src/ops/element_binary.cc | 15 +++++++++++- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/python/flexflow/keras/layers/merge.py b/python/flexflow/keras/layers/merge.py index fa967422d8..be2fe0c8c9 100644 --- a/python/flexflow/keras/layers/merge.py +++ b/python/flexflow/keras/layers/merge.py @@ -101,7 +101,16 @@ def __init__(self, **kwargs): def _calculate_inout_shape(self, input_tensors): assert len(input_tensors) == 2, "check input_tensors" self.input_shape = input_tensors[0].batch_shape - self.output_shape = input_tensors[0].batch_shape + self.output_shape = list(input_tensors[0].batch_shape) + for i, d in enumerate(input_tensors[1].batch_shape): + if self.output_shape[i] != d: + if self.output_shape[i] == 1 or d == 1: + self.output_shape[i] *= d + else: + raise AssertionError( + f"Tensor with shape {input_tensors[0].batch_shape} and " + f"{input_tensors[1].batch_shape} cannot be added") + self.output_shape = tuple(self.output_shape) fflogger.debug("add output %s" %( str(self.output_shape))) def subtract(input_tensors): @@ -114,7 +123,16 @@ def __init__(self, **kwargs): def _calculate_inout_shape(self, input_tensors): assert len(input_tensors) == 2, "check input_tensors" self.input_shape = input_tensors[0].batch_shape - self.output_shape = input_tensors[0].batch_shape + self.output_shape = list(input_tensors[0].batch_shape) + for i, d in enumerate(input_tensors[1].batch_shape): + if self.output_shape[i] != d: + if self.output_shape[i] == 1 or d == 1: + self.output_shape[i] *= d + else: + raise AssertionError( + f"Tensor with shape {input_tensors[0].batch_shape} and " + f"{input_tensors[1].batch_shape} cannot be subtracted") + self.output_shape = tuple(self.output_shape) fflogger.debug("subtract output %s" %( str(self.output_shape))) def multiply(input_tensors): @@ -127,7 +145,16 @@ def __init__(self, **kwargs): def _calculate_inout_shape(self, input_tensors): assert len(input_tensors) == 2, "check input_tensors" self.input_shape = input_tensors[0].batch_shape - self.output_shape = input_tensors[0].batch_shape + self.output_shape = list(input_tensors[0].batch_shape) + for i, d in enumerate(input_tensors[1].batch_shape): + if self.output_shape[i] != d: + if self.output_shape[i] == 1 or d == 1: + self.output_shape[i] *= d + else: + raise AssertionError( + f"Tensor with shape {input_tensors[0].batch_shape} and " + f"{input_tensors[1].batch_shape} cannot be multiplied") + self.output_shape = tuple(self.output_shape) fflogger.debug("multiply output %s" %( str(self.output_shape))) class Maximum(_Merge): diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index aa31477815..42c6487581 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -89,8 +89,21 @@ Tensor FFModel::binary(OperatorType op, } // Assert type match after broadcast assert(ele->inputs[0]->data_type == ele->inputs[1]->data_type); + + int numdim = in1->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + if (in1->dims[i] == 1) { + dims[i] = in2->dims[i]; + } else if (in2->dims[i] == 1) { + dims[i] = in1->dims[i]; + } else { + dims[i] = in1->dims[i]; + } + } + ele->outputs[0] = create_tensor_legion_ordering( - in1->num_dims, in1->dims, ele->data_type, ele, 0, true /*create_grad*/); + in1->num_dims, dims, ele->data_type, ele, 0, true /*create_grad*/); ele->add_int_property("inplace_a", inplace_a); layers.push_back(ele); return ele->outputs[0]; From 
477afcb83abec5f3e1043269798d9314f0e8f18e Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Sat, 2 Dec 2023 19:27:49 -0500 Subject: [PATCH 267/344] Fix attention (#1238) * fix * . --- src/ops/inc_multihead_self_attention.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 20f7d64936..ce30b5dfda 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -713,7 +713,7 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, DT *output_ptr, cudaStream_t stream) { - dim3 grid(m->num_q_heads, bc->num_active_requests()); + dim3 grid(m->num_q_heads, bc->num_generation_tokens); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; size_t smem_sz; @@ -936,6 +936,9 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; } assert(tokens_previous_requests == bc->requestsInfo[i].first_token_offset_in_batch); From 08f60b15031b8c1eb5bb331005f9cb3755386b72 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 11 Dec 2023 22:34:36 -0500 Subject: [PATCH 268/344] Fix HIP build for AMD (#1243) * fix * update hip docker * undo legion update until pr is merged on gitlab --- CMakeLists.txt | 11 +++++++---- cmake/hip.cmake | 4 ++-- config/config.inc | 7 +++++-- docker/flexflow-environment/Dockerfile | 5 +---- inference/incr_decoding/CMakeLists.txt | 1 + inference/spec_infer/CMakeLists.txt | 1 + 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b201cf99dc..90cab126e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,7 @@ endif() set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake) set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR}) set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC -UNDEBUG") +set(CMAKE_HIP_FLAGS "-std=c++17 ${CMAKE_HIP_FLAGS} -fPIC -UNDEBUG") # set std 17 #set(CMAKE_CXX_STANDARD 17) @@ -51,6 +52,7 @@ endif() # do not disable assertions even if in release mode set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") +set(CMAKE_HIP_FLAGS_RELEASE "${CMAKE_HIP_FLAGS_RELEASE} -UNDEBUG") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") set(LIBEXT ".so") @@ -157,6 +159,7 @@ endif() # HIP if (FF_GPU_BACKEND STREQUAL "hip_rocm" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + enable_language(HIP) include(hip) endif() @@ -299,7 +302,10 @@ if(NOT BUILD_LEGION_ONLY) LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cpp) - if(BUILD_SHARED_LIBS) + set_source_files_properties(${FLEXFLOW_GPU_SRC} PROPERTIES LANGUAGE HIP) + set_source_files_properties(${FLEXFLOW_SRC} PROPERTIES LANGUAGE HIP) + + if(BUILD_SHARED_LIBS) add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) else() add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) @@ -474,9 +480,6 @@ if(NOT BUILD_LEGION_ONLY) endif() if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) - if (FF_GPU_BACKEND STREQUAL "hip_rocm") - SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") - endif() # Ensure Rust is installed execute_process(COMMAND 
rustc --version RESULT_VARIABLE RUST_COMMAND_RESULT diff --git a/cmake/hip.cmake b/cmake/hip.cmake index abcc82b03a..25f2e05e19 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -2,11 +2,11 @@ if (NOT FF_HIP_ARCH STREQUAL "") if (FF_HIP_ARCH STREQUAL "all") set(FF_HIP_ARCH "gfx900,gfx902,gfx904,gfx906,gfx908,gfx909,gfx90a,gfx90c,gfx940,gfx1010,gfx1011,gfx1012,gfx1013,gfx1030,gfx1031,gfx1032,gfx1033,gfx1034,gfx1035,gfx1036,gfx1100,gfx1101,gfx1102,gfx1103") endif() - string(REPLACE "," " " HIP_ARCH_LIST "${FF_HIP_ARCH}") + string(REPLACE "," "," HIP_ARCH_LIST "${FF_HIP_ARCH}") endif() message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}") if(FF_GPU_BACKEND STREQUAL "hip_rocm") - set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE) + #set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE) set(GPU_TARGETS "${FF_HIP_ARCH}" CACHE STRING "The GPU TARGETs") endif() diff --git a/config/config.inc b/config/config.inc index e5c9c69acf..1121c114c4 100644 --- a/config/config.inc +++ b/config/config.inc @@ -190,6 +190,8 @@ if [ -n "$ROCM_PATH" ]; then SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}" fi +ADD_ROCM_TO_PATH="" + # set GPU backend if [ -n "$FF_GPU_BACKEND" ]; then SET_FF_GPU_BACKEND="-DFF_GPU_BACKEND=${FF_GPU_BACKEND}" @@ -222,7 +224,8 @@ if [ -n "$FF_GPU_BACKEND" ]; then chmod +x "$(pwd)/nvidia_hipcc" SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc" else - SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc" + ADD_ROCM_TO_PATH="PATH=${PATH}:${ROCM_PATH}/bin" + #SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc" fi fi fi @@ -232,7 +235,7 @@ CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} -CMAKE_COMMAND="${SET_CC_FLAGS} ${SET_NVCC_FLAGS} ${SET_LD_FLAGS} ${SET_CUDA_LIB_PATH} cmake ${CMAKE_FLAGS} $* ${SRC_LOCATION}" +CMAKE_COMMAND="${SET_CC_FLAGS} ${SET_NVCC_FLAGS} ${SET_LD_FLAGS} ${SET_CUDA_LIB_PATH} ${ADD_ROCM_TO_PATH} cmake ${CMAKE_FLAGS} $* ${SRC_LOCATION}" echo $CMAKE_COMMAND eval $CMAKE_COMMAND } diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 0e9a3cda82..edbf9a7e52 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -74,11 +74,8 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] rm ./${AMD_GPU_SCRIPT_NAME}; \ amdgpu-install -y --usecase=hip,rocm --no-dkms; \ apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs; \ - # Install protobuf v3.20.x manually + # Install protobuf dependencies apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev autoconf automake libtool make; \ - git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git; cd protobuf/ ; git submodule update --init --recursive; \ - ./autogen.sh; ./configure; cores_available=$(nproc --all); n_build_cores=$(( cores_available -1 )); \ - if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi; make -j $n_build_cores; make install; ldconfig; cd .. ; \ else \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Skipping installing HIP dependencies"; \ fi diff --git a/inference/incr_decoding/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt index e415835a79..53b7cf0c2f 100644 --- a/inference/incr_decoding/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -20,6 +20,7 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) endif() elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) hip_add_executable(${project_target} ${CPU_SRC}) if (FF_HIP_ARCH STREQUAL "") message(FATAL_ERROR "FF_HIP_ARCH is empty!") diff --git a/inference/spec_infer/CMakeLists.txt b/inference/spec_infer/CMakeLists.txt index 26d5bd1894..c877a3530b 100644 --- a/inference/spec_infer/CMakeLists.txt +++ b/inference/spec_infer/CMakeLists.txt @@ -19,6 +19,7 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) endif() elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) hip_add_executable(${project_target} ${CPU_SRC}) if (FF_HIP_ARCH STREQUAL "") message(FATAL_ERROR "FF_HIP_ARCH is empty!") From 3cf49a6d89b9ce60efde018fc99565390ee37eb7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 12 Dec 2023 03:55:52 -0500 Subject: [PATCH 269/344] [Documentation] - Annotate attention kernel with shapes of tensors (#1244) * add attention shape annotations * linting * fix --- src/ops/inc_multihead_self_attention.cu | 443 ++++++++++--------- src/ops/spec_inc_multihead_self_attention.cu | 2 +- 2 files changed, 247 insertions(+), 198 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ce30b5dfda..7da9aa389c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -504,7 +504,6 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) @@ -518,43 +517,52 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, compute_type = CUBLAS_COMPUTE_32F_FAST_16F; } #endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_q_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_q_heads - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - int lda = k, ldb = k, ldc = m_; - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // apply rotary emmmbedding for q - // and k step1 change the k, v to complex tensor + + // Step 1: Compute QKV projections + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_q = m->qProjSize * m->num_q_heads; + int m_k = m->kProjSize * 
m->num_q_heads; + int m_v = m->vProjSize * m->num_q_heads; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_tokens(); + int k = m->qSize; + int m_ = m_q * QKV_WEIGHT_NUM; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: QKV weights + // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] + // matrix B: input + // matrix B's layout: [qSize (hidden_dim), num_new_tokens] + // matrix C: devQKVProjArray + // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - // apply bias for q, k, v + + // Step 2: apply bias for QKV, or scale the query if (*m->qkv_bias) { apply_proj_bias_qkv<<scaling_factor, m->hidden_size); } + + // Step 3: apply rotary embedding if needed if (*m->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; @@ -638,38 +648,47 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - // int num_tokens = bc->num_active_tokens(); - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = num_tokens; - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
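  // Worked shapes for the fused QKV projection in Step 1 above (the numbers
  // are purely illustrative assumptions, not values from this patch): with a
  // hypothetical qSize (hidden_dim) of 4096, qProjSize = kProjSize =
  // vProjSize = 128, num_q_heads = 32 and QKV_WEIGHT_NUM = 3, the single
  // cublasGemmEx call computes
  //   C (m_ x n) = A^T (m_ x k) * B (k x n)
  // with m_ = 128 * 32 * 3 = 12288 fused Q/K/V output rows, k = 4096 input
  // features, and n = num_active_tokens() columns, so one GEMM fills
  // devQKVProjArray with the Q, K and V projections of every head for every
  // new token.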
(m->attn_heads); - DT *C = static_cast
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = num_tokens; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: attn heads + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->attn_heads); + // matrix C: output + // matrix C's layout: [oProjSize, num_new_tokens] + DT *C = static_cast
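  // Shape sketch for the output projection below (same illustrative numbers
  // as above: qSize = 4096, qProjSize = kProjSize = vProjSize = 128,
  // num_q_heads = 32): the packed weight tensor stores the Q/K/V projections
  // first, so matrix A starts at element offset
  //   qSize * num_q_heads * (qProjSize + kProjSize + vProjSize)
  //   = 4096 * 32 * (128 + 128 + 128) = 50,331,648,
  // and the GEMM computes, column by column,
  //   output[oProjSize] = W_o^T * attn_heads[vProjSize * num_q_heads]
  // with m_ = oProjSize, k = vProjSize * num_q_heads = 4096,
  // n = num_new_tokens, and column-major leading dimensions
  // lda = ldb = k, ldc = m_.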
(output_ptr); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Add final output bias if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -945,54 +964,69 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - // a flag of using this scaling alpha - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize * m->num_q_heads * + QKV_WEIGHT_NUM; + // matrix B: key cache + // matrix B's layout: [kProjSize * num_heads, total_tokens] + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
(m->qk_prods); + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
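  // How the strided batched GEMM above walks over heads (illustrative
  // assumptions: num_q_heads = 32, num_new_tokens = 8, total_tokens = 40):
  // cuBLAS runs 32 independent GEMMs; for head h it reads
  //   A_h = A + h * strideA (q_block_size) and B_h = B + h * strideB
  //   (kt_block_size)
  // and writes C_h = C + h * strideC, with strideC = 8 * 40 = 320, so each
  // head's 8 x 40 block of qk_prods holds alpha * Q_h^T K_h, where alpha is
  // 1/sqrt(kProjSize) when qk_prod_scaling is set. Stacking the per-head
  // blocks gives the [num_new_tokens, total_tokens, num_heads] layout noted
  // in the comments above.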
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests DT *C = static_cast
(m->qk_prods); - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; apply_position_bias_qkprd<< 0) { @@ -1022,87 +1056,102 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, static_cast
(-INFINITY)); } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - - // store the result attn heads, also skip the genration tokens - C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
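  // How the attention scores map onto the NCHW descriptor used for the
  // softmax above (same illustrative sizes as before: 32 heads, 8 new
  // tokens, 40 total tokens):
  //   N = num_q_heads    = 32  (one "image" per head)
  //   C = total_tokens   = 40  (one "channel" per key position)
  //   H = 1
  //   W = num_new_tokens = 8   (one "pixel column" per query position)
  // CUDNN_SOFTMAX_MODE_CHANNEL normalizes across C, so for every (head,
  // query position) pair the 40 scores over the key positions sum to 1,
  // which is exactly the row-wise softmax of QK.T/sqrt(d_k).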
(m->valueCache) + i * vt_req_block_size; + // matrix B: qk_prods_softmax + // matrix B's layout: [num_new_tokens, total_tokens, num_heads] + // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous + // requests (all heads) + DT *B = static_cast
(m->qk_prods_softmax); + // matrix C: attn heads + // matrix C's layout: [vProjSize, num_heads, num_new_tokens] + // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous + // requests + // store the result attn heads, also skip the generation tokens + DT *C = static_cast
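  // Worked shapes for the Step 5 batched GEMM below (same illustrative
  // sizes: 32 heads, 8 new tokens, 40 total tokens, vProjSize = 128): per
  // head, cuBLAS computes
  //   C_h (128 x 8) = A_h (128 x 40) * B_h^T (40 x 8),
  // i.e. that head's value cache times the transposed softmax scores.
  // Because ldc = vProjSize * num_q_heads, the 32 per-head results are
  // interleaved along the rows, producing the [vProjSize, num_heads,
  // num_new_tokens] attn_heads layout that the output projection expects.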
(m->attn_heads) + + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } tokens_previous_requests += num_new_tokens; } assert(tokens_previous_requests == num_tokens); @@ -1255,7 +1304,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(kSize == vSize); qProjSize = _qProjSize; kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul + assert(qProjSize == kProjSize); // required for attention QK.T matmul vProjSize = _vProjSize; oProjSize = _oProjSize; size_t size_of_dt = data_type_size(attn->data_type); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 6dad1c6de9..562dee4d93 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -492,7 +492,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - + compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } From 7e7f955f7a4a1f5de9f78d7e964f8e4d0baabb72 Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Sun, 24 Dec 2023 10:14:12 -0500 Subject: [PATCH 270/344] Fix link issue (#1247) --- src/ops/inc_multihead_self_attention.cu | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7da9aa389c..695f4b13b9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1515,4 +1515,24 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( GenericTensorAccessorR const weight, DataType data_type, cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + cudaStream_t stream); }; // namespace FlexFlow From ed5a2e07fdc9285612f167c150f8d138e51895f7 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 25 Dec 2023 12:17:48 -0500 Subject: [PATCH 271/344] init --- include/flexflow/batch_config.h | 12 + include/flexflow/config.h | 9 + include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 45 + include/flexflow/operator_params.h | 2 + .../specinfer_inc_multihead_self_attention.h | 150 +++ ...nfer_inc_multihead_self_attention_params.h | 33 + include/flexflow/request_manager.h | 14 +- inference/file_loader.cc | 3 +- inference/models/llama.cc | 5 +- inference/spec_infer/spec_infer.cc | 3 + src/ops/inc_multihead_self_attention.cpp | 19 + src/ops/inc_multihead_self_attention.cu | 61 +- .../specinfer_inc_multihead_self_attention.cc | 883 +++++++++++++++++ 
.../specinfer_inc_multihead_self_attention.cu | 890 ++++++++++++++++++ src/ops/tree_inc_multihead_self_attention.cu | 24 +- src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 71 +- src/runtime/inference_manager.cc | 13 +- src/runtime/model.cc | 149 ++- src/runtime/model.cpp | 48 + src/runtime/model.cu | 28 +- src/runtime/request_manager.cc | 250 +++-- src/runtime/request_manager.cpp | 16 + src/runtime/request_manager.cu | 50 + 25 files changed, 2589 insertions(+), 192 deletions(-) create mode 100644 include/flexflow/ops/specinfer_inc_multihead_self_attention.h create mode 100644 include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h create mode 100644 src/ops/specinfer_inc_multihead_self_attention.cc create mode 100644 src/ops/specinfer_inc_multihead_self_attention.cu diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index e2903c4d11..c33c3558cc 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -129,6 +129,9 @@ class BeamSearchBatchConfig : public BatchConfig { inline static int const MAX_BEAM_WIDTH = 1; inline static int const MAX_BEAM_DEPTH = 8; + // maximum tree branches for a request + inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 9; + int model_id; struct BeamSearchPerRequestInfo { @@ -139,14 +142,23 @@ class BeamSearchBatchConfig : public BatchConfig { BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; int parent_id[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int sub_request_num; }; struct BeamSearchPerTokenInfo { int sub_request_index; }; + struct SpecInferTopology { + int real_token_pos[MAX_SPECULATIVE_TREE_BRANCHES][MAX_NUM_TOKENS]; + int allocated_tokens; + }; + + BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; + SpecInferTopology topology_mask[MAX_NUM_REQUESTS]; + // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? 
int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index c2af6d707c..321d14961b 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -16,6 +16,7 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ #include "ffconst.h" +#include "flexflow/batch_config.h" #include "legion.h" #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -75,6 +76,14 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; + void *batch_config_metadata; + + // request info + token info + topolopgy mask info + size_t batch_config_metadata_size = + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::topology_mask) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e624..ef0003b08e 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -171,6 +171,7 @@ enum OperatorType { OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // Parallel Ops OP_REPARTITION, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d8402ba622..3602cb108b 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -172,6 +172,8 @@ enum TaskIDs { SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, @@ -324,6 +326,7 @@ class Linear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; +class SpecInferIncMultiHeadSelfAttention; class Pool2D; class Reduce; class Reshape; @@ -743,6 +746,25 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); + +Tensor specinfer_inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); Tensor inc_multiquery_self_attention(const Tensor input, int embed_dim, int num_q_heads, @@ -799,6 +821,26 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); + + Tensor specinfer_inc_multiquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); // ======================================== // Inference APIs // 
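  // Layout sketch for the new batch_config_metadata buffer added to FFHandler
  // in config.h (the section order is inferred from the sizeof() sum there
  // and from the pointer arithmetic in inc_multihead_self_attention.cu; the
  // concrete byte sizes depend on the MAX_* limits and are not spelled out
  // here):
  //   [ tokensInfo | requestsInfo | topology_mask | beamTokenInfo | beamRequestsInfo ]
  // A minimal sketch of how a kernel meta object can locate its sections
  // (the local variable names below are hypothetical):
  //   char *base = static_cast<char *>(handler.batch_config_metadata);
  //   auto *tokens   = reinterpret_cast<BatchConfig::PerTokenInfo *>(base);
  //   auto *requests = reinterpret_cast<BatchConfig::PerRequestInfo *>(
  //       base + sizeof(BatchConfig::tokensInfo));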
======================================== @@ -1200,6 +1242,9 @@ class FFModel { std::unordered_map< std::pair, TreeIncMultiHeadSelfAttention *>, + std::unordered_map< + std::pair, + SpecInferIncMultiHeadSelfAttention *>, std::unordered_map, Reduce *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5b187839ef..cee2ae95a4 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -37,6 +37,7 @@ #include "flexflow/ops/topk_params.h" #include "flexflow/ops/transpose_params.h" #include "flexflow/ops/tree_inc_multihead_self_attention_params.h" +#include "flexflow/ops/specinfer_inc_multihead_self_attention_params.h" #include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" @@ -72,6 +73,7 @@ using OperatorParameters = mp::variant +#include + +namespace FlexFlow { + +class SpecInferIncMultiHeadSelfAttentionMeta; + +class SpecInferIncMultiHeadSelfAttention : public Op { +public: + using Params = SpecInferIncMultiHeadSelfAttentionParams; + using Input = ParallelTensor; + + SpecInferIncMultiHeadSelfAttention(FFModel &model, + LayerID const &layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_q_heads, + int _num_kv_heads, + int _kdim, + int _vdim, + float _dropout, + bool _qkv_bias, + bool _final_bias, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, + bool _position_bias, + bool allocate_weights, + char const *name); + SpecInferIncMultiHeadSelfAttention(FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_q_heads, + int _num_kv_heads, + int _kdim, + int _vdim, + float _dropout, + bool _qkv_bias, + bool _final_bias, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, + bool _position_bias, + bool allocate_weights, + char const *name); + SpecInferIncMultiHeadSelfAttention(FFModel &model, + SpecInferIncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights); + SpecInferIncMultiHeadSelfAttention(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + bool get_int_parameter(PMParameter, int *) const override; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + bool measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const override; + + static void + 
inference_kernel_wrapper(SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias); + Params get_params() const; + +public: + int num_q_heads, num_kv_heads, tensor_parallelism_degree; + float dropout, scaling_factor; + bool qkv_bias; + bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, + qk_prod_scaling, position_bias; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int qoSeqLength, kvSeqLength; +}; + +class SpecInferIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { +public: + SpecInferIncMultiHeadSelfAttentionMeta(FFHandler handler, + SpecInferIncMultiHeadSelfAttention const *attn, + GenericTensorAccessorR const &weight, + MemoryAllocator &gpu_mem_allocator, + int num_samples, + int _num_q_heads, + int _num_kv_heads); + ~SpecInferIncMultiHeadSelfAttentionMeta(void); + +public: + Realm::RegionInstance beam_search_reserve_inst; + BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; + BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_H diff --git a/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h b/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h new file mode 100644 index 0000000000..b57b06a7f7 --- /dev/null +++ b/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H +#define _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct SpecInferIncMultiHeadSelfAttentionParams { + LayerID layer_guid; + int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; + float dropout, scaling_factor; + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling, position_bias; + + bool is_valid(ParallelTensorShape const &) const; +}; + +bool operator==(SpecInferIncMultiHeadSelfAttentionParams const &, + SpecInferIncMultiHeadSelfAttentionParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t + operator()(FlexFlow::SpecInferIncMultiHeadSelfAttentionParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index baf6844801..e67888d2d6 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -38,7 +38,8 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, - ParallelTensor const input); + ParallelTensor const input, + FFHandler *handlers); void load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset); @@ -72,9 +73,10 @@ struct Request { struct BeamTree { struct treeLayer { BeamSearchBatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float 
probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + int nodes_num_this_layer = 0; }; treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; }; @@ -100,6 +102,7 @@ class RequestManager { void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); + void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, @@ -148,6 +151,7 @@ class RequestManager { void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); void update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamSearchBatchConfig const &old_bc, BeamTree &tree, int request_index); @@ -210,6 +214,7 @@ class RequestManager { int max_requests_per_batch; int max_tokens_per_batch; int max_sequence_length; + std::vector spec_infer_tree_width; // private fields std::unique_ptr tokenizer_; bool verbose; @@ -243,7 +248,8 @@ class RequestManager { private: struct ProfileInfo { - int decoding_steps; + int llm_decoding_steps; + int ssm_decoding_steps; double start_time, finish_time; }; std::unordered_map profiling_requests; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 7c6870d439..3f70ddf488 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -726,7 +726,8 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION) { if (weight_filename.find("self_attention") != std::string::npos) { load_attention_weights_multi_query( data, weight_filename, weights_folder, hidden_dim, num_heads); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index b8fe70526d..f62df1b1d7 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -90,7 +90,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + mha = ff.specinfer_inc_multihead_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, @@ -246,7 +246,8 @@ void LLAMA::create_llama_model(FFModel &ff, if (mode == BEAM_SEARCH_MODE) { Tensor softmax = ff.softmax(dense, -1); // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); - output = ff.argmax(softmax, /*beam_Search*/ true); + // output = ff.argmax(softmax, /*beam_Search*/ true); + output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); } else { // Tensor softmax = ff.softmax(dense, -1); if (generation_config.do_sample) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8b0eb926d9..e2594ba87f 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -302,6 +302,9 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); + //first decoding step: 3 results + rm->push_spec_infer_tree_width(1); + // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); if (model_metadata.llm_model_type == ModelType::LLAMA) { diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp 
index d60386f927..a59740f4a3 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1098,4 +1098,23 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + cudaStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + cudaStream_t stream); + }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 695f4b13b9..4c184acb3c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -826,17 +826,17 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } // todo Xinhao copy how many requests if requests are not continous? - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); + // cudaMemcpyAsync(m->token_infos, + // &(bc->tokensInfo), + // bc->num_active_tokens() * + // sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, + // stream); + // cudaMemcpyAsync(m->request_infos, + // &(bc->requestsInfo), + // bc->max_requests_per_batch() * + // sizeof(BatchConfig::PerRequestInfo), + // cudaMemcpyHostToDevice, + // stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, @@ -1375,14 +1375,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( break; } case BEAM_SEARCH_MODE: { + // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; break; } default: @@ -1400,10 +1401,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + - complex_size * sizeof(cuFloatComplex) + - requestinfo_size * - sizeof(BatchConfig::PerRequestInfo); // more components will + complex_size * sizeof(cuFloatComplex); // more components will // be added here later if (offload) { // assert that we have enough reserved work space left @@ -1447,10 +1445,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); + token_infos = + static_cast(handler.batch_config_metadata); + request_infos = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo)); + if (offload) { - token_infos = - gpu_mem_allocator.allocate_reserved( - 
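  // Rough sizing of the beam-search K/V caches allocated above, with purely
  // illustrative limits (only MAX_SPECULATIVE_TREE_BRANCHES = 9 comes from
  // this patch): for num_q_heads = 32, kProjSize = 128,
  // max_requests_per_batch() = 8 and max_sequence_length() = 256,
  //   key_cache_size = 32 * 128 * 8 * 256 * 9 = 75,497,472 elements,
  // roughly 144 MB at half precision, and the value cache is sized the same
  // way with vProjSize in place of kProjSize. Using
  // MAX_SPECULATIVE_TREE_BRANCHES instead of MAX_BEAM_WIDTH leaves room for
  // one cache slot per branch of the speculative token tree rather than one
  // per beam.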
tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_reserved( + // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); @@ -1464,13 +1467,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(cuFloatComplex); - request_infos = - gpu_mem_allocator.allocate_reserved( - requestinfo_size); + // request_infos = + // gpu_mem_allocator.allocate_reserved( + // requestinfo_size); } else { - token_infos = - gpu_mem_allocator.allocate_instance( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_instance( + // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -1479,9 +1482,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); - request_infos = - gpu_mem_allocator.allocate_instance( - requestinfo_size); + // request_infos = + // gpu_mem_allocator.allocate_instance( + // requestinfo_size); } // allocate more size for quantization data diff --git a/src/ops/specinfer_inc_multihead_self_attention.cc b/src/ops/specinfer_inc_multihead_self_attention.cc new file mode 100644 index 0000000000..42074f39e4 --- /dev/null +++ b/src/ops/specinfer_inc_multihead_self_attention.cc @@ -0,0 +1,883 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +bool SpecInferIncMultiHeadSelfAttentionParams::is_valid( + ParallelTensorShape const &input) const { + bool is_valid = input.is_valid(); + return is_valid; +} + +Tensor FFModel::specinfer_inc_multihead_self_attention( + Tensor const input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { + return specinfer_inc_multiquery_self_attention(input, + embed_dim, + num_heads, + num_heads, + kdim, + vdim, + dropout, + qkv_bias, + final_bias, + add_zero_attn, + data_type, + kernel_initializer, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + name); +} + +Tensor FFModel::specinfer_inc_multiquery_self_attention( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; + int weight_num = (qkv_bias || final_bias) ? 2 : 1; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); + li = new Layer(this, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + input); + } + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = embed_dim; + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, data_type, li, 0, true /*create_grad*/); + } + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); + int weight_size = qParas * num_q_heads + kParas * num_q_heads + + vParas * num_q_heads + oParas * num_q_heads; + { + int dims[1] = {weight_size}; + li->weights[0] = create_weight_legion_ordering(1, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + if (qkv_bias || final_bias) { + // q, k, v, o + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + + (final_bias ? oProjSize : 0)}; + li->weights[1] = create_weight_legion_ordering(1, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->data_type = data_type; + li->add_int_property("embed_dim", embed_dim); + li->add_int_property("num_q_heads", num_q_heads); + li->add_int_property("num_kv_heads", num_kv_heads); + li->add_int_property("kdim", kdim); + li->add_int_property("vdim", vdim); + li->add_int_property("qkv_bias", qkv_bias); + li->add_int_property("final_bias", final_bias); + li->add_int_property("add_zero_attn", add_zero_attn); + li->add_float_property("dropout", dropout); + li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("scaling_query", scaling_query); + li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_int_property("position_bias", position_bias); + layers.push_back(li); + return li->outputs[0]; +} + +Op *SpecInferIncMultiHeadSelfAttention::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + + std::cout << "spec create operator: " << layer->name << "\n"; + long long value; + layer->get_int_property("embed_dim", value); + int embed_dim = value; + layer->get_int_property("num_q_heads", value); + int num_q_heads = value; + layer->get_int_property("num_kv_heads", value); + int num_kv_heads = value; + layer->get_int_property("kdim", value); + int kdim = value; + layer->get_int_property("vdim", value); + int vdim = value; + float dropout; + layer->get_float_property("dropout", dropout); + layer->get_int_property("qkv_bias", value); + bool qkv_bias = (bool)value; + layer->get_int_property("final_bias", value); + bool final_bias = (bool)value; + layer->get_int_property("add_zero_attn", value); + bool add_zero_attn = (bool)value; + layer->get_int_property("apply_rotary_embedding", value); + bool apply_rotary_embedding = (bool)value; + layer->get_int_property("scaling_query", value); + bool scaling_query = (bool)value; + float scaling_factor; + layer->get_float_property("scaling_factor", scaling_factor); + layer->get_int_property("qk_prod_scaling", value); + bool qk_prod_scaling = (bool)value; + layer->get_int_property("position_bias", value); + bool position_bias = (bool)value; + + return new SpecInferIncMultiHeadSelfAttention(model, + layer->layer_guid, + inputs[0], + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + qkv_bias, + final_bias, + add_zero_attn, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + false /*allocate_weights*/, + layer->name); +} + +SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( + FFModel &model, + LayerID const &_layer_guid, + ParallelTensor const _input, + int _embed_dim, + int _num_q_heads, + int _num_kv_heads, + int _kdim, + int _vdim, + float _dropout, + bool _qkv_bias, + bool _final_bias, + bool _add_zero_attn, + bool _apply_rotary_embedding, + 
bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, + bool _position_bias, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, + _input->data_type, + name, + 1 /*inputs*/, + (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 1 /*outputs*/, + _input), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { + // overwrite layer_guid + layer_guid = _layer_guid; + + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + ParallelDim dims[2]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_q_heads * (qParas + oParas) + + this->num_q_heads * (kParas + vParas); + dims[1].is_replica_dim = false; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); + weights[0] = model.create_parallel_weight<2>(dims, + this->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + if (qkv_bias || final_bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, this->data_type, this); + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* // Check correctness */ + /* assert(check_output_input_weight_parallel_dims()); */ +} + +SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( + FFModel &model, + ParallelTensor const _input, + ParallelTensor const _weight, + int _embed_dim, + int _num_q_heads, + int _num_kv_heads, + int _kdim, + int _vdim, + float _dropout, + bool _qkv_bias, + bool _final_bias, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, + bool _position_bias, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, + _input->data_type, + name, + 1 /*inputs*/, + (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 1 /*outputs*/, + _input, + _weight), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) +// bias_initializer(_bias_initializer) +{ + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + ParallelDim dims[2]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_q_heads * (qParas + oParas) + + this->num_q_heads * (kParas + vParas); + dims[1].is_replica_dim = false; + // dims[2].size = qParas + kParas + vParas + oParas; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); + weights[0] = model.create_parallel_weight<2>(dims, + this->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + if (qkv_bias || final_bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, this->data_type, this); + + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ + /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ + // Check correctness + /* assert(check_output_input_weight_parallel_dims()); */ +} + +SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( + FFModel &model, + SpecInferIncMultiHeadSelfAttention const &other, + ParallelTensor const input, + bool allocate_weights) + : SpecInferIncMultiHeadSelfAttention(model, + other.layer_guid, + input, + other.oProjSize, + other.num_q_heads, + other.num_kv_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.qkv_bias, + other.final_bias, + other.add_zero_attn, + other.apply_rotary_embedding, + other.scaling_query, + other.scaling_factor, + other.qk_prod_scaling, + other.position_bias, + allocate_weights, + other.name) {} + +SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( + FFModel &model, + SpecInferIncMultiHeadSelfAttentionParams const ¶ms, + ParallelTensor const &input, + bool allocate_weights, + char const *name) + : SpecInferIncMultiHeadSelfAttention(model, + params.layer_guid, + input, + params.embed_dim, + params.num_q_heads, + params.num_kv_heads, + params.kdim, + params.vdim, + params.dropout, + params.qkv_bias, + params.final_bias, + params.add_zero_attn, + params.apply_rotary_embedding, + params.scaling_query, + params.scaling_factor, + params.qk_prod_scaling, + params.position_bias, + allocate_weights, + name) {} + +void SpecInferIncMultiHeadSelfAttention::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher( + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(SpecInferIncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void SpecInferIncMultiHeadSelfAttention::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher( + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(SpecInferIncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): input + regions[1](I): weight + regions[2](O): output +*/ +OpMeta *SpecInferIncMultiHeadSelfAttention::init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + SpecInferIncMultiHeadSelfAttention const *attn = + (SpecInferIncMultiHeadSelfAttention *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + + int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; + assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + int num_q_heads = attn->num_q_heads; + int num_kv_heads = attn->num_kv_heads; + 
assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + // We don't do offloading for SSMs (small speculative models) + SpecInferIncMultiHeadSelfAttentionMeta *m = + new SpecInferIncMultiHeadSelfAttentionMeta(handle, + attn, + weight, + gpu_mem_allocator, + num_samples, + num_q_heads, + num_kv_heads); + // assert that we didn't over allocate memory + assert(gpu_mem_allocator.instance_allocated_size == + gpu_mem_allocator.instance_total_size); + m->profiling = attn->profiling; + m->inference_debugging = attn->inference_debugging; + std::strcpy(m->op_name, attn->name); + m->layer_guid = attn->layer_guid; + assert(weight.domain.get_volume() * data_type_size(weight.data_type) == + m->weightSize); + return m; +} + +void SpecInferIncMultiHeadSelfAttention::forward(FFModel const &ff) { + // SpecInferIncMultiHeadSelfAttention doesn't support forward + assert(false); +} + +FutureMap SpecInferIncMultiHeadSelfAttention::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher(SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + + if (qkv_bias || final_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(idx++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void SpecInferIncMultiHeadSelfAttention::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + if (bc.num_tokens == 0) { + return; + } + + SpecInferIncMultiHeadSelfAttentionMeta *m = + *((SpecInferIncMultiHeadSelfAttentionMeta **)task->local_args); + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 4); + } + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 2); + assert(output_domain.get_dim() == 4); + + assert(task->index_point.get_dim() == 1); + SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, &bc, task->index_point.point_data[0], input, weight, output, biases); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(weight); + if (*m->qkv_bias || *m->final_bias) { + weights_accessors.push_back(biases); + } + SpecInferIncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, &bc, {input}, weights_accessors, {output}); + } +} + +void SpecInferIncMultiHeadSelfAttention::backward(FFModel const &ff) { + // SpecInferIncMultiHeadSelfAttention does not support backward + assert(false); +} + +bool SpecInferIncMultiHeadSelfAttention::get_int_parameter(PMParameter para, + int *value) const { + switch (para) { + case PM_NUM_HEADS: + *value = num_q_heads; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +Op *SpecInferIncMultiHeadSelfAttention::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + SpecInferIncMultiHeadSelfAttentionParams params = get_params(); + return new SpecInferIncMultiHeadSelfAttention( + ff, params, inputs[0], true, this->name); +} + +bool SpecInferIncMultiHeadSelfAttention::measure_operator_cost( + Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(SpecInferIncMultiHeadSelfAttentionParams const &lhs, + SpecInferIncMultiHeadSelfAttentionParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && + lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && + lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && + lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && + lhs.add_zero_attn == rhs.add_zero_attn && + lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.scaling_query == rhs.scaling_query && + lhs.scaling_factor == rhs.scaling_factor && + lhs.qk_prod_scaling == rhs.qk_prod_scaling && + lhs.position_bias == rhs.position_bias; +} + +SpecInferIncMultiHeadSelfAttentionParams + SpecInferIncMultiHeadSelfAttention::get_params() const { + 
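+  // Collect the operator's configuration into a params struct; operator== and
+  // std::hash below use these fields to identify equivalent attention nodes.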
SpecInferIncMultiHeadSelfAttentionParams params; + params.layer_guid = this->layer_guid; + params.embed_dim = this->oProjSize; + params.num_q_heads = this->num_q_heads; + params.num_kv_heads = this->num_kv_heads; + params.kdim = this->kProjSize; + params.vdim = this->vProjSize; + params.dropout = this->dropout; + params.qkv_bias = this->qkv_bias; + params.final_bias = this->final_bias; + params.add_zero_attn = this->add_zero_attn; + params.apply_rotary_embedding = this->apply_rotary_embedding; + params.scaling_query = this->scaling_query; + params.scaling_factor = this->scaling_factor; + params.qk_prod_scaling = this->qk_prod_scaling; + params.position_bias = this->position_bias; + + return params; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::SpecInferIncMultiHeadSelfAttentionParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.embed_dim); + hash_combine(key, params.num_q_heads); + hash_combine(key, params.num_kv_heads); + hash_combine(key, params.kdim); + hash_combine(key, params.vdim); + hash_combine(key, params.dropout); + hash_combine(key, params.qkv_bias); + hash_combine(key, params.final_bias); + hash_combine(key, params.add_zero_attn); + hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.scaling_query); + hash_combine(key, params.scaling_factor); + hash_combine(key, params.qk_prod_scaling); + hash_combine(key, params.position_bias); + return key; +} +}; // namespace std diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu new file mode 100644 index 0000000000..0bdf07a9d7 --- /dev/null +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -0,0 +1,890 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +#define WARP_SIZE 32 + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; +using namespace Kernels::IncMultiHeadAttention; + +namespace Kernels { +namespace SpecInferIncMultiHeadAttention { + +template +__global__ void compute_specinfer_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, + BeamSearchBatchConfig::SpecInferTopology *topology_mask, + int max_tree_branches) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + BeamSearchBatchConfig::SpecInferTopology topology = + topology_mask[request_idx]; + + int const first_step = 0; + + int const tlength = request_infos[request_idx].first_token_depth_in_request + + request_infos[request_idx].num_tokens_in_batch; + // int const qlength = request_infos[request_idx].num_tokens_in_batch; + int const tree_branch_num = beam_request_infos[request_idx].sub_request_num; + + // will decode qlength tokens in this thread block + // int const qlength = tree_branch_num; + + int first_token_idx = 0; + for (int r = 0; r < request_idx; r++) { + first_token_idx += request_infos[request_idx].num_tokens_in_batch; + } + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + + request_idx * max_seq_length * hidden_size * max_tree_branches + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int sub_req_idx = 0; sub_req_idx < tree_branch_num; sub_req_idx += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * sub_req_idx) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + // find the real position of the cache; + // depth: 0, 1, 2, 3, 4, 4, 5, 5 ,5, 5, + int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + k[ii] = *reinterpret_cast( + k_cache_batch + real_cache_idx * hidden_size + + head_idx * per_head_size + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); + } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 
0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + + request_idx * max_seq_length * hidden_size * max_tree_branches + vi; + // DT const *v_cache_batch = + // value_cache + + // (beam_request_idx * max_beam_width + beam_sub_request_idx) * + // max_seq_length * hidden_size + + // vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + V_vec v = *reinterpret_cast( + v_cache_batch + real_cache_idx * hidden_size + + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. 
+ if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} + +template +__global__ void specinfer_store_kv_cache( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo *tokenInfos, + BatchConfig::PerRequestInfo *requestInfo, + BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int max_seq_len, + int max_tree_branches, + bool is_root, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { + int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + + // above no need to be changed + // int const req_id = id_map[token_idx].request_index; + // int const tok_id = id_map[token_idx].token_position; + // int const sub_req_id = id_map[token_idx].sub_request_index; + // int const parent_id = id_map[token_idx].parent_id; + // int const beam_depth = id_map[token_idx].beam_depth; + // int const beam_width = id_map[token_idx].beam_width; + + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; + // int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; + // int const beam_depth = beamRequestInfos[req_id].current_depth; + // int const beam_width = beamRequestInfos[req_id].beam_size; + int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; + + kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + (allocated_tokens + sub_req_id) * hidden_size + offset] = kVal; + vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + (allocated_tokens + sub_req_id) * hidden_size + offset] = vVal; + } +} + +template +void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_tokens(); + int curr_depth = bc->beamRequestsInfo[0].current_depth; + // printf("curr depth: %d\n", curr_depth); + // assert(curr_depth < 3); + if (num_tokens > 0) { + int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; + specinfer_store_kv_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), +        static_cast
<DT *>(m->keyCache), +        static_cast
<DT *>(m->valueCache), +        m->token_infos, +        m->request_infos, +        m->beam_token_infos, +        m->beam_request_infos, +        m->beam_topology_mask, +        m->qProjSize, +        m->kProjSize, +        m->vProjSize, +        num_tokens, +        BatchConfig::max_sequence_length(), +        BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, +        /*root*/ curr_depth == 0, +        m->hidden_size); +  } +} + +#define LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( \ +    DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ +  smem_sz = smem_size_in_bytes
<DT>(m->qProjSize, \ +                         BatchConfig::max_sequence_length(), \ +                         THREADS_PER_VALUE, \ +                         THDS_PER_BLOCK); \ +  compute_specinfer_attention_kernel_generation_kernel \ +      <<>>( \ +          static_cast
<DT *>(m->devQKVProjArray), \ +          static_cast
<DT *>(m->keyCache), \ +          static_cast
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->beam_request_infos, \ + m->beam_topology_mask, \ + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES) + +template +void compute_specinfer_attention_kernel_generation( + SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // one block == one head per request + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + +template +__global__ void spec_fill_entries_above_diagonal(DT *matrix, + size_t new_tokens, + size_t total_tokens_in_request, + size_t num_q_heads, + DT value) { + CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { + // size_t head_idx = i / (new_tokens * total_tokens_in_request); + size_t src_idx = (i / new_tokens) % total_tokens_in_request; + size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; + // Casual Mask + if (src_idx > dst_idx) { + matrix[i] = value; + } + } +} + +template +void compute_attention_kernel_prompt( + SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + cudaStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } +#endif + // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int tokens_prev_requests_squares = 0; + // int qkv_block_size = + // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + int q_block_size = m->qProjSize; + + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + + // all requests in prompt 
phase should have exactly one sub-request +    assert(bc->sub_requests[i] == 1); +    // int num_new_tokens = bc->num_processing_tokens[i]; +    // int total_tokens = bc->token_last_available_idx[i] + 1; + +    int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; +    int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + +                       bc->requestsInfo[i].num_tokens_in_batch; + +    if (num_new_tokens <= 0) { +      continue; +    } + +    // Compute (QK^T/sqrt(d_k)) +    int m_ = num_new_tokens; +    int n = total_tokens; +    int k = m->qProjSize; +    int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, +        ldc = m_; +    int strideA = q_block_size; +    int strideB = kt_block_size; +    int strideC = num_new_tokens * total_tokens; + +    // apply the qk_prod scaling factor through alpha when enabled +    DT alpha = 1.0f, beta = 0.0f; +    if (*m->qk_prod_scaling) { +      alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); +    } +    // To get A, skip over Q entries from previous requests (same head) +    DT const *A = static_cast
<DT *>(m->devQKVProjArray) + +                  bc->requestsInfo[i].first_token_offset_in_batch * +                      m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; +    // To get B, skip over K entries from previous requests (all heads + +    // padding) +    DT const *B = static_cast
<DT *>(m->keyCache) + +                  (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * kt_req_block_size; + +    // if (i == 0 && sub_req_id == 0 && +    //     bc->beam_slots.at(0).current_depth == 1) { +    //   int offset = (float *)B - m->keyCache; +    //   printf("key cache offset %d\n", kt_req_block_size); +    // } +    // To get C, skip over QK^T products from previous requests +    DT *C = static_cast
(m->qk_prods) + + m->num_q_heads * tokens_prev_requests_squares; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // add alibi position bias to qk production + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + spec_fill_entries_above_diagonal<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_q_heads * tokens_prev_requests_squares; + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = m->vProjSize; + n = num_new_tokens; + k = total_tokens; + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + + // padding) + A = static_cast
<DT *>(m->valueCache) + +        (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * vt_req_block_size; +    // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous +    // requests (all heads) +    B = C_softmax; +    // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous +    // requests +    C = static_cast
(m->attn_heads) + + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; + } + + // assert(tokens_previous_requests == num_tokens); +} + +template +void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
<DT *>(m->devQKVProjArray), +                     bias_ptr, +                     stream); +  // phase 2: Update key/val cache +  update_kv_cache_kernel
<DT>(m, bc, stream); +  if (bc->num_generation_tokens > 0) { +    compute_specinfer_attention_kernel_generation
<DT>( +        m, bc, static_cast
(m->attn_heads), stream); + } + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + if (bc->num_tokens > bc->num_generation_tokens) { + compute_attention_kernel_prompt( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + } + + // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); + + compute_o_prod_bias( + m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); +} + +} // namespace SpecInferIncMultiHeadAttention +} // namespace Kernels + +/*static*/ +void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( + SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (use_bias) { + assert(input.data_type == bias.data_type); + } + + if (input.data_type == DT_HALF) { + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::SpecInferIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); + } else if (input.data_type == DT_FLOAT) { + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::SpecInferIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("SpecInferIncMultiHeadSelfAttention forward time = %.2fms\n", + elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( + FFHandler handler, + SpecInferIncMultiHeadSelfAttention const *attn, + GenericTensorAccessorR const &weight, + MemoryAllocator &gpu_mem_allocator, + int num_samples, + int _num_q_heads, + int _num_kv_heads) + : IncMultiHeadSelfAttentionMeta(handler, + BEAM_SEARCH_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->apply_rotary_embedding, + attn->qkv_bias, + attn->scaling_query, + attn->qk_prod_scaling, + attn->position_bias, + attn->final_bias, + attn->scaling_factor, + weight, + gpu_mem_allocator, + num_samples, + attn->num_q_heads, + attn->num_kv_heads, + _num_q_heads, + _num_kv_heads, + DT_NONE, + false) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(cudnnSetStream(handler.dnn, stream)); + + // allocate memory for the seqArray and reserve space + { + // int max_tokens_per_batch = 
BatchConfig::max_tokens_per_batch(); + // size_t beam_tokeninfo_size = + // max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + // size_t requestinfo_size = + // BeamSearchBatchConfig::max_requests_per_batch(); size_t + // beam_requestinfo_size = + // BeamSearchBatchConfig::max_requests_per_batch(); + // size_t total_size = + // beam_tokeninfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + + // beam_requestinfo_size * + // sizeof(BeamSearchBatchConfig:: + // BeamSearchPerRequestInfo); // more components will + // // be added here later + + // We always directly allocate memory for small speculative models + // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + // total_size); + beam_topology_mask = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + + beam_token_infos = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::topology_mask)); + + beam_request_infos = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::topology_mask) + + sizeof(BeamSearchBatchConfig::beamTokenInfo)); + // beam_token_infos = + // gpu_mem_allocator + // .allocate_instance( + // beam_tokeninfo_size); + // offset += beam_tokeninfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); + // beam_request_infos = + // gpu_mem_allocator + // .allocate_instance( + // beam_requestinfo_size); + // offset += beam_requestinfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); + // assert(offset == total_size); + // assert(gpu_mem_allocator.instance_total_size == + // gpu_mem_allocator.instance_allocated_size); + } + + cudaStreamSynchronize(stream); +} + +SpecInferIncMultiHeadSelfAttentionMeta::~SpecInferIncMultiHeadSelfAttentionMeta( + void) { + if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { + beam_search_reserve_inst.destroy(); + } +} + +}; // namespace FlexFlow diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index bc7d1017b7..1da56e383a 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -834,18 +834,18 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
(m->bias_ptr); } - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); + // cudaMemcpyAsync(m->token_infos, + // &(bc->tokensInfo), + // bc->num_active_tokens() * + // sizeof(TreeVerifyBatchConfig::PerTokenInfo), + // cudaMemcpyHostToDevice, + // stream); + // cudaMemcpyAsync(m->request_infos, + // &(bc->requestsInfo), + // bc->max_requests_per_batch() * + // sizeof(BatchConfig::PerRequestInfo), + // cudaMemcpyHostToDevice, + // stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..904bfbcaff 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -154,6 +154,8 @@ std::string get_operator_type_name(OperatorType type) { return "SpecIncMultiHeadSelfAttention"; case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: return "TreeIncMultiHeadSelfAttention"; + case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: + return "SpecInferPgraoIncMultiHeadSelfAttention"; case OP_INPUT: return "Input"; case OP_WEIGHT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 6d33dd9f27..46f7cc0f29 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -51,6 +51,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" @@ -69,7 +70,7 @@ using FlexFlow::MachineView; LegionRuntime::Logger::Category log_graph("graph"); LegionRuntime::Logger::Category log_simplify("graph_simplify"); -const Node Node::INVALID_NODE = Node(); +Node const Node::INVALID_NODE = Node(); Node::Node(void) : guid(0), ptr(NULL) {} @@ -2384,6 +2385,28 @@ GraphOptimalViewSerialized sez.serialize(attn->tensor_parallelism_degree); break; } + case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { + SpecInferIncMultiHeadSelfAttention *attn = + (SpecInferIncMultiHeadSelfAttention *)op; + sez.serialize(attn->layer_guid.id); + sez.serialize(attn->layer_guid.transformer_layer_id); + sez.serialize(attn->layer_guid.model_id); + sez.serialize(attn->oProjSize); + sez.serialize(attn->num_q_heads); + sez.serialize(attn->qProjSize); + sez.serialize(attn->vProjSize); + sez.serialize(attn->dropout); + sez.serialize(attn->qkv_bias); + sez.serialize(attn->final_bias); + sez.serialize(attn->add_zero_attn); + sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->scaling_query); + sez.serialize(attn->scaling_factor); + sez.serialize(attn->qk_prod_scaling); + sez.serialize(attn->position_bias); + sez.serialize(attn->num_kv_heads); + break; + } case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); @@ -2914,6 +2937,52 @@ void FFModel::deserialize_graph_optimal_view( params); break; } + case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { + assert(num_inputs == 1); + int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; + float dropout, scaling_factor; + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling, position_bias; + size_t id, transformer_layer_id, deserialized_model_id; + 
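+      // The deserialization order below must mirror the serialization order
+      // used for OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION above.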
dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + dez.deserialize(embed_dim); + dez.deserialize(num_q_heads); + dez.deserialize(k_dim); + dez.deserialize(v_dim); + dez.deserialize(dropout); + dez.deserialize(qkv_bias); + dez.deserialize(final_bias); + dez.deserialize(add_zero_attn); + dez.deserialize(apply_rotary_embedding); + dez.deserialize(scaling_query); + dez.deserialize(scaling_factor); + dez.deserialize(qk_prod_scaling); + dez.deserialize(position_bias); + dez.deserialize(num_kv_heads); + + SpecInferIncMultiHeadSelfAttentionParams params; + params.embed_dim = embed_dim; + params.num_q_heads = num_q_heads; + params.kdim = k_dim; + params.vdim = v_dim; + params.dropout = dropout; + params.qkv_bias = qkv_bias; + params.final_bias = final_bias; + params.add_zero_attn = add_zero_attn; + params.layer_guid = layer_guid; + params.apply_rotary_embedding = apply_rotary_embedding; + params.scaling_query = scaling_query; + params.scaling_factor = scaling_factor; + params.qk_prod_scaling = qk_prod_scaling; + params.position_bias = position_bias; + params.num_kv_heads = num_kv_heads; + node = get_or_create_node(inputs[0], + params); + break; + } case OP_TOPK: { node = TopK::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index eb045e8159..fb978adfff 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -318,7 +318,7 @@ FutureMap InferenceManager::inference(FFModel *model, found_input_operator = true; assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_input_tokens_from_batch_config(bc, pt); + load_input_tokens_from_batch_config(bc, pt, model->handlers); } } @@ -348,11 +348,20 @@ FutureMap InferenceManager::inference(FFModel *model, }; void InferenceManager::load_input_tokens_from_batch_config( - BatchConfigFuture const &bc, ParallelTensor const input) { + BatchConfigFuture const &bc, ParallelTensor const input, FFHandler *handlers) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; + Rect<1> task_rect(Point<1>(0), + Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); + IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + MachineView view = input->machine_view; + for (PointInRectIterator<1> it(task_rect); it(); it++) { + FFHandler handle = handlers[view.get_device_id(*it)]; + argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); + } + IndexLauncher launcher(RM_LOAD_TOKENS_TASK_ID, input->parallel_is, TaskArgument(nullptr, 0), diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 92f0cff472..8bda9016c3 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -59,6 +59,7 @@ #include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" @@ -93,10 +94,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + 
ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -116,10 +117,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling), @@ -1024,9 +1025,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1059,9 +1060,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1658,7 +1659,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1691,7 +1692,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1741,7 +1742,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1822,7 +1823,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1853,7 +1854,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1873,7 +1874,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -2087,7 +2088,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -2124,7 +2125,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int 
num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2147,7 +2148,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2180,7 +2181,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -2204,7 +2205,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -2241,7 +2242,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2289,7 +2290,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2671,7 +2672,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -3038,6 +3039,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { + Op *op = SpecInferIncMultiHeadSelfAttention::create_operator_from_layer( + *this, layer, inputs); + operators.push_back(op); + return op; + } case OP_BATCHMATMUL: { Op *op = BatchMatmul::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3227,7 +3234,7 @@ Op *FFModel::create_operator_from_layer( } void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; + std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; @@ -3973,38 +3980,38 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; - const static bool inference_debugging = false; + static int const batchSize = 64; + static bool const profiling = false; + static bool const inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB - const static int numNodes = 1; - const static int workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = + static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB + static int const 
numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static size_t offloadReserveSpaceSize = + static bool const searchOverlapBackwardUpdate = false; + static size_t const offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB - const static bool cpuOffload = false; - const static bool onlyDataParallel = true; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const cpuOffload = false; + static bool const onlyDataParallel = true; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -6209,6 +6216,44 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } + { + TaskVariantRegistrar registrar( + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + "SpecInferIncMultiHeadSelfAttention Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + OpMeta *, + SpecInferIncMultiHeadSelfAttention::init_task>( + registrar, "SpecInferIncMultiHeadSelfAttention Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + OpMeta *, + SpecInferIncMultiHeadSelfAttention::init_task>(registrar); + } + } + { + TaskVariantRegistrar registrar( + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + "SpecInferIncMultiHeadSelfAttention Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + SpecInferIncMultiHeadSelfAttention::inference_task>( + registrar, "SpecInferIncMultiHeadSelfAttention Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + SpecInferIncMultiHeadSelfAttention::inference_task>(registrar); + } + } // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 6c482426eb..b51ab83091 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp 
@@ -131,6 +131,54 @@ FFHandler .wait(); handle.workSpace = workspaceInst.pointer_untyped(0, sizeof(char)); } + if (handle.offload_reserve_space_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.offload_reserve_space = + workspaceInst.pointer_untyped(0, sizeof(char)); + }else { + handle.offload_reserve_space = nullptr; + } + if (handle.batch_config_metadata_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.batch_config_metadata = + workspaceInst.pointer_untyped(0, sizeof(char)); + }else { + handle.batch_config_metadata = nullptr; + } // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 17401a0f14..523b3c76f3 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -148,9 +148,35 @@ FFHandler .wait(); handle.offload_reserve_space = workspaceInst.pointer_untyped(0, sizeof(char)); - } else { + }else { handle.offload_reserve_space = nullptr; } + if (handle.batch_config_metadata_size > 0) { + printf("allocate instance for metadata %d\n", handle.batch_config_metadata_size); + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.batch_config_metadata = + workspaceInst.pointer_untyped(0, sizeof(char)); + }else { + handle.batch_config_metadata = nullptr; + } + // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7c37f3391e..e1b591c320 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -106,6 +106,11 @@ int RequestManager::get_max_sequence_length() { return max_sequence_length; } +void RequestManager::push_spec_infer_tree_width(int tree_width) { + assert(tree_width <= BeamSearchBatchConfig::MAX_BEAM_WIDTH); + spec_infer_tree_width.emplace_back(tree_width); +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int 
eos_token_id, @@ -176,7 +181,7 @@ size_t RequestManager::get_num_ssms() { RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, int max_sequence_length) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); // Add a new request Request request; @@ -232,7 +237,7 @@ RequestManager::RequestGuid RequestManager::RequestGuid RequestManager::register_new_request(std::string const &prompt, int max_sequence_length) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; @@ -290,7 +295,7 @@ RequestManager::RequestGuid } bool RequestManager::is_request_completed(RequestGuid const &guid) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); assert(all_requests.find(guid) != all_requests.end()); Request const &request = all_requests[guid]; // return request.tokens.size() >= request.max_sequence_length; @@ -299,7 +304,7 @@ bool RequestManager::is_request_completed(RequestGuid const &guid) { GenerationResult RequestManager::get_generation_result(RequestGuid const &guid) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); assert(request_generation_results.find(guid) != request_generation_results.end()); return request_generation_results[guid]; @@ -337,7 +342,7 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -406,13 +411,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); @@ -420,8 +426,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + outputFile << "num decoding steps: " + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -469,7 +475,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } // Update profiling profiling_requests[new_bc.requestsInfo[i].request_guid] - .decoding_steps++; + .llm_decoding_steps++; } } } @@ -494,7 +500,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const 
&old_bc, new_bc.request_completed[i] = false; // add profile_info for the new request ProfileInfo profile_info; - profile_info.decoding_steps = 1; + profile_info.llm_decoding_steps = 1; profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -553,7 +559,7 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, InferenceResult const &result, int model_id) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); if (verbose) { std::cout << "\n############### prepare_next_batch_init ###############\n"; } @@ -664,16 +670,18 @@ BeamSearchBatchConfig // Log profiling info ProfileInfo profile_info = profiling_requests[request.guid]; profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.ssm_decoding_steps = 0; total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { @@ -682,8 +690,8 @@ BeamSearchBatchConfig outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + outputFile << "num decoding steps: " + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -726,8 +734,14 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; + + profiling_requests[request.guid].ssm_decoding_steps = 0; + + int ssm_decoding_steps = 0; new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? 
spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { @@ -735,6 +749,8 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].probs[j] = 1; } + new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.sub_requests[i] = 1; // Token Info @@ -746,6 +762,8 @@ BeamSearchBatchConfig new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + new_bc.topology_mask[i].real_token_pos[0][token.second] = + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request; // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; @@ -786,14 +804,20 @@ BeamSearchBatchConfig // TODO: Beam Request Info, missing from VerifyTreeBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = 0; for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } + new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.sub_requests[i] = 1; // Token Info @@ -829,12 +853,17 @@ BeamSearchBatchConfig // add profile_info for the new request ProfileInfo profile_info; - profile_info.decoding_steps = 0; + profile_info.llm_decoding_steps = 0; + profile_info.ssm_decoding_steps = 0; profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; // init the beam search metadata per request + int ssm_decoding_steps = profile_info.ssm_decoding_steps; + new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? 
spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].max_depth = std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, @@ -846,6 +875,7 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + new_bc.beamRequestsInfo[i].sub_request_num = 1; new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -855,6 +885,7 @@ BeamSearchBatchConfig assert(depth < new_request.tokens.size()); new_bc.tokensInfo[new_bc.num_tokens].token_id = new_request.tokens[depth]; + new_bc.topology_mask[i].real_token_pos[0][depth] = depth; // beam search meta data, indicate which sub request this token // belongs to, init to 0; @@ -937,7 +968,7 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); if (verbose) { std::cout << "\n############### prepare_next_batch_beam ###############\n"; } @@ -1005,25 +1036,38 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - + profiling_requests[request.guid].ssm_decoding_steps += 1; // update the beam search metadata // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH // entries? - new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - // update the parentid, accumalated_probs, depth, and token_ids + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; + new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + new_bc.sub_requests[i] = + old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].sub_request_num = + old_bc.beamRequestsInfo[i].sub_request_num * + new_bc.beamRequestsInfo[i].beam_size; + + assert(new_bc.beamRequestsInfo[i].sub_request_num <= + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.request_running[i] = true; // do the slot exchange to minimize the cache exchange in kernel. - update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + update_beam_metadata( + new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); } else { assert(false && "Request should not be pending in beam search phase"); } @@ -1059,7 +1103,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { + for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1103,13 +1147,24 @@ BeamSearchBatchConfig // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH // entries? 
- new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; - // update the parentid, accumalated_probs, depth, and token_ids new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + new_bc.sub_requests[i] = + old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].sub_request_num = + old_bc.beamRequestsInfo[i].sub_request_num * + new_bc.beamRequestsInfo[i].beam_size; + assert(new_bc.beamRequestsInfo[i].sub_request_num <= + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + + // update the parentid, accumalated_probs, depth, and token_ids if (request.status == Request::PENDING) { // if the request is pending, we need to update the beam search @@ -1152,7 +1207,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { + for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1209,7 +1264,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task( TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); std::cout << "\n############### prepare_next_batch_verify ###############\n"; @@ -1238,7 +1293,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( Request &request = all_requests[guid]; // Profiling - profiling_requests[request.guid].decoding_steps += 1; + profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { new_bc.request_running[i] = true; @@ -1478,16 +1533,19 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; + + // int leaf_node_num = old_bc.sub_requests[index]; + int leaf_node_num = old_bc.beamRequestsInfo[i].sub_request_num; int depth = old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results - int beam_width = old_bc.beamRequestsInfo[index].beam_size; + // int beam_width = old_bc.beamRequestsInfo[index].beam_size; // Count tokens sent to model in this request to find the final token's // index result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - beam_width; + leaf_node_num; if (verbose) { std::cout << "i = " << i << ", result index = " << result_index @@ -1514,7 +1572,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } } - for (int beam_id = 0; beam_id < beam_width; beam_id++) { + for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .tokens[beam_id] = result.token_ids[result_index]; @@ -1546,6 +1604,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // for updating the beam search metadata in requests in incremental phase void 
RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamSearchBatchConfig const &old_bc, BeamTree &tree, int request_index) { @@ -1556,6 +1615,9 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1; int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; + // int leaf_node_num = old_bc.sub_requests[request_index]; + int leaf_node_num = old_bc.beamRequestsInfo[request_index].sub_request_num; + if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { // TODO: check if this is correct // for (int j = 0; j < beam_size; j++) { @@ -1568,49 +1630,61 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // Do nothing // assert(false); } else { - std::set parents; - std::set childs; - // cache stealing - for (int j = 0; j < beam_size; j++) { - int parent_id = tree.treeLayers[depth].parent_ids[j]; - if (childs.find(parent_id) == childs.end()) { - // copy beam slot - new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[parent_id] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[parent_id] = - tree.treeLayers[depth].tokens[j]; - parents.emplace(j); - childs.emplace(parent_id); - } - } - if (parents.size() < beam_size) { - for (int j = 0; j < beam_size; j++) { - if (parents.find(j) == parents.end()) { - // this slot has not been assigned - // find the smallest not assigned child and put in - if (verbose) { - std::cout << "request_index" << request_index - << ", miss slot: " << j << "\n"; - } - for (int k = 0; k < beam_size; k++) { - if (childs.find(k) == childs.end()) { - // parent -> j to child k; - new_bc.beamRequestsInfo[request_index].parent_id[k] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[k] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[k] = - tree.treeLayers[depth].tokens[j]; - parents.emplace(j); - childs.emplace(k); - break; - } - } - } - } + for (int j = 0; j < leaf_node_num; j++) { + new_bc.beamRequestsInfo[request_index].parent_id[j] = + tree.treeLayers[depth].parent_ids[j]; + new_bc.beamRequestsInfo[request_index].probs[j] = + tree.treeLayers[depth].probs[j]; + new_bc.beamRequestsInfo[request_index].tokens[j] = + tree.treeLayers[depth].tokens[j]; + + // new_bc.topology_mask[request_index].real_token_pos[j] = } + assert(false); + + // std::set parents; + // std::set childs; + // // cache stealing + // for (int j = 0; j < beam_size; j++) { + // int parent_id = tree.treeLayers[depth].parent_ids[j]; + // if (childs.find(parent_id) == childs.end()) { + // // copy beam slot + // new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = + // tree.treeLayers[depth].parent_ids[j]; + // new_bc.beamRequestsInfo[request_index].probs[parent_id] = + // tree.treeLayers[depth].probs[j]; + // new_bc.beamRequestsInfo[request_index].tokens[parent_id] = + // tree.treeLayers[depth].tokens[j]; + // parents.emplace(j); + // childs.emplace(parent_id); + // } + // } + // if (parents.size() < beam_size) { + // for (int j = 0; j < beam_size; j++) { + // if (parents.find(j) == parents.end()) { + // // this slot has not been assigned + // // find the smallest not assigned child and put in + // if (verbose) { + // std::cout << "request_index" << request_index + // << ", miss slot: " << j << "\n"; + // } + // for (int k = 0; k < beam_size; k++) { + // if 
(childs.find(k) == childs.end()) { + // // parent -> j to child k; + // new_bc.beamRequestsInfo[request_index].parent_id[k] = + // tree.treeLayers[depth].parent_ids[j]; + // new_bc.beamRequestsInfo[request_index].probs[k] = + // tree.treeLayers[depth].probs[j]; + // new_bc.beamRequestsInfo[request_index].tokens[k] = + // tree.treeLayers[depth].tokens[j]; + // parents.emplace(j); + // childs.emplace(k); + // break; + // } + // } + // } + // } + // } } if (verbose) { std::cout << "-----------after parent id exchange-----------" << std::endl; diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index 1e756606f8..9635b3bc1e 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -56,6 +56,22 @@ void RequestManager::load_tokens_task( sizeof(TokenId) * batch_config->num_tokens, hipMemcpyHostToDevice, stream)); + + // copy meta data to workSpace + FFHandler handle = *((FFHandler const *)task->local_args); + cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + batch_config->num_active_tokens() * + sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo), + &(batch_config->requestsInfo), + batch_config->max_requests_per_batch() * + sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream); } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index cd3e03fff6..f4500d152d 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -30,6 +30,7 @@ void RequestManager::load_tokens_task( // BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; // Extreme long prompts are not supported, only load up to @@ -55,6 +56,55 @@ void RequestManager::load_tokens_task( sizeof(TokenId) * batch_config->num_tokens, cudaMemcpyHostToDevice, stream)); + + // copy meta data to workSpace + FFHandler handle = *((FFHandler const *)task->local_args); + cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + batch_config->num_active_tokens() * + sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo), + &(batch_config->requestsInfo), + batch_config->max_requests_per_batch() * + sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream); + + + // load speculative metadata + if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + BeamSearchBatchConfig const *beam_batch_config = + static_cast(batch_config); + + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo), + &(beam_batch_config->topology_mask), + sizeof(BeamSearchBatchConfig::topology_mask), + cudaMemcpyHostToDevice, + stream); + + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::topology_mask), + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + 
sizeof(BeamSearchBatchConfig::topology_mask) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream); + } } void RequestManager::load_positions_task( From d3a57cb22b080741d9677d82701f035ccd33f8da Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 26 Dec 2023 03:09:33 -0500 Subject: [PATCH 272/344] fix speculative --- include/flexflow/batch_config.h | 4 +- inference/models/llama.cc | 1 + inference/spec_infer/spec_infer.cc | 4 +- src/ops/beam_topk.cc | 11 ++- src/ops/beam_topk.cu | 61 ++++++------ .../specinfer_inc_multihead_self_attention.cu | 91 +++++++++++------- src/runtime/inference_manager.cc | 1 + src/runtime/request_manager.cc | 93 +++++++++++++++---- src/runtime/request_manager.cu | 10 +- 9 files changed, 185 insertions(+), 91 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index c33c3558cc..dd947bbd85 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -126,11 +126,11 @@ class BeamSearchBatchConfig : public BatchConfig { size_t beam_width; size_t target_iterations; - inline static int const MAX_BEAM_WIDTH = 1; + inline static int const MAX_BEAM_WIDTH = 3; inline static int const MAX_BEAM_DEPTH = 8; // maximum tree branches for a request - inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 9; + inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3; int model_id; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index f62df1b1d7..4f76e9e0fa 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -248,6 +248,7 @@ void LLAMA::create_llama_model(FFModel &ff, // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); // output = ff.argmax(softmax, /*beam_Search*/ true); output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + // output = ff.top_k(softmax, ) } else { // Tensor softmax = ff.softmax(dense, -1); if (generation_config.do_sample) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index e2594ba87f..2ccdfd388d 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -303,7 +303,7 @@ void FlexFlow::top_level_task(Task const *task, rm->register_output_filepath(file_paths.output_file_path); //first decoding step: 3 results - rm->push_spec_infer_tree_width(1); + rm->push_spec_infer_tree_width(3); // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); @@ -404,7 +404,7 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(text); // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + tree_model.generate(prompts, 15 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 2883428254..3f636c2c98 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -366,14 +366,18 @@ BeamInferenceResult GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime); + DT_INT32, regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - + + printf("----------1-----------\n"); int *index_ptr = index.get_int32_ptr(); + 
printf("----------2-----------\n"); float *value_ptr = value.get_float_ptr(); + printf("----------3-----------\n"); int *parent_ptr = parent.get_int32_ptr(); + printf("----------4-----------\n"); // embedding size: eg. 4096 int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; @@ -398,6 +402,9 @@ BeamInferenceResult download_tensor( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); + print_tensor(index_ptr, 32, "indexxxxxxx"); + printf("max beam width %d\n", m->max_beam_width); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 72ab7862a6..515bba4bc0 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -379,9 +379,9 @@ template __global__ void mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { using T_ACC = T; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rstd[i]); } } @@ -556,8 +556,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int beam_size = bc->beamRequestsInfo[i].beam_size; // initial request - log_beam_topk.debug() << "sub_requests: " << i << ", " << sub_requests[i] - << "\n"; + std::cout << "sub_requests: " << i << ", " << sub_requests[i] << "\n"; assert(sub_requests[i] > 0); // process sub requests for (int j = 0; j < sub_requests[i]; j++) { @@ -565,12 +564,12 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, // beam_slots[i].parent_id[j]; acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = bc->beamRequestsInfo[i].probs[j]; - log_beam_topk.debug() - << "probbbb req: " << i - << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j] - << ", sub request id " << j << ", parent id " - << bc->beamRequestsInfo[i].parent_id[j] << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n"; + std::cout << "probbbb req: " << i << ", sub req probability : " + << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << j + << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] + << ", data inddd" + << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j + << "\n"; } // process tokens @@ -584,6 +583,8 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); max_beam_width = std::max(max_beam_width, beam_size); + + std::cout << "max beam width: " << max_beam_width << "\n"; req_index += 1; block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; } @@ -613,26 +614,34 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, assert(num_shards >= (size_t)max_heap_size); num_shards = max_heap_size; - checkCUDA(cudaMemcpy(m->parent_ids, - parent_ids, - sizeof(int) * max_total_requests, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->acc_probs, - acc_probs, - sizeof(DT) * max_total_requests, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->block_start_index, - beam_block_start_index.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->request_id, + checkCUDA(cudaMemcpyAsync(m->parent_ids, + parent_ids, + sizeof(int) * max_total_requests, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->acc_probs, + acc_probs, + sizeof(DT) * max_total_requests, + cudaMemcpyHostToDevice, + stream)); + // trick, set acc_probs to 0; + 
checkCUDA( + cudaMemsetAsync(m->acc_probs, 1.0, batch_size * sizeof(DT), stream)); + checkCUDA(cudaMemcpyAsync(m->block_start_index, + beam_block_start_index.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->request_id, request_id.data(), sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->tokens_per_request, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->tokens_per_request, tokens_per_request.data(), sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); + cudaMemcpyHostToDevice, + stream)); // int depth = // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; beam_topk_forward_kernel<<>>( diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 0bdf07a9d7..9d6f70d5ba 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -133,6 +133,13 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( q_ptr + (hidden_size * QKV_WEIGHT_NUM * sub_req_idx) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); } + + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][0]); + printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][1]); + printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][2]); + printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][10]); + } __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; @@ -317,26 +324,38 @@ __global__ void specinfer_store_kv_cache( DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; - // above no need to be changed - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - // int const sub_req_id = id_map[token_idx].sub_request_index; - // int const parent_id = id_map[token_idx].parent_id; - // int const beam_depth = id_map[token_idx].beam_depth; - // int const beam_width = id_map[token_idx].beam_width; - int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const first_token_in_req = requestInfo[req_id].first_token_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - // int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; - // int const beam_depth = beamRequestInfos[req_id].current_depth; - // int const beam_width = beamRequestInfos[req_id].beam_size; int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; + int const beam_size = beamRequestInfos[req_id].sub_request_num; + + int real_idx = tok_id - first_token_in_req + allocated_tokens; + + if (i == 0) { + printf("ffasdasds%d, %d, %d, %d, %d, %d\n", + beamTokenInfos[0].sub_request_index, + allocated_tokens, + sub_req_id, + tok_id, + first_token_in_req, + real_idx); + } + // }else if(i == hidden_size * 2){ + // printf("ffasdasdskkkk%d, %d, %d\n", allocated_tokens, tok_id, + // sub_req_id); + // } + + + kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (allocated_tokens + sub_req_id) * hidden_size + offset] = kVal; + (real_idx) * hidden_size + + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (allocated_tokens + sub_req_id) * hidden_size + offset] = vVal; + 
(real_idx) * hidden_size + + offset] = vVal; } } @@ -350,6 +369,9 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; + printf("tokenInfo %d, %d\n", + bc->beamTokenInfo[0].sub_request_index, + num_tokens); specinfer_store_kv_cache<<max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - continue; - } + } + // else if (tokens_previous_requests < bc->num_generation_tokens) { + // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // continue; + // } // all requests in prompt phase should only have one sub requests; assert(bc->sub_requests[i] == 1); @@ -523,6 +546,9 @@ void compute_attention_kernel_prompt( m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) + + print_tensor((float*)A, 32, "A"); + std::cout << "meta: " << num_new_tokens << ", " << total_tokens << "\n"; DT const *B = static_cast
<DT const *>(m->keyCache) + (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * kt_req_block_size; @@ -557,6 +583,7 @@ m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + print_tensor((float*)C, 32, "C"); // add alibi position bias to qk production // add alibi position bias to qk production if (*m->position_bias) { @@ -641,6 +668,8 @@ B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests + + print_tensor((float*)C_softmax, 32, "C_softmax"); C = static_cast<DT *>
(m->attn_heads) + (tokens_previous_requests + bc->num_generation_tokens) * m->num_q_heads * m->vProjSize; @@ -695,6 +724,8 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, stream); // phase 2: Update key/val cache update_kv_cache_kernel
<DT>(m, bc, stream); + std::cout << "specinfer kernel token num: " << bc->num_generation_tokens + << ", " << bc->num_tokens << "\n"; if (bc->num_generation_tokens > 0) { compute_specinfer_attention_kernel_generation<DT>
( m, bc, static_cast<DT *>
(m->attn_heads), stream); @@ -705,6 +736,8 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } + // compute_attention_kernel_prompt( + // m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); @@ -783,6 +816,12 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } + + // if(bc->num_tokens == 1){ + // print_tensor(input.get_float_ptr(), 32, "specinc input"); + // print_tensor(output.get_float_ptr(), 32, "specinc output"); + // assert(false); + // } } SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( @@ -825,24 +864,6 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - // int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - // size_t beam_tokeninfo_size = - // max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - // size_t requestinfo_size = - // BeamSearchBatchConfig::max_requests_per_batch(); size_t - // beam_requestinfo_size = - // BeamSearchBatchConfig::max_requests_per_batch(); - // size_t total_size = - // beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + - // beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig:: - // BeamSearchPerRequestInfo); // more components will - // // be added here later - - // We always directly allocate memory for small speculative models - // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - // total_size); beam_topology_mask = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index fb978adfff..52fd64c606 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -257,6 +257,7 @@ void InferenceManager::init_operators_inference(FFModel *model) { ((ParallelOp *)op) ->create_input_partition_inference(*model, inputs, outputs); } + printf("init op %s\n", op->name); op->init_inference(*model, inputs, outputs); } } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e1b591c320..845a580c13 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -714,7 +714,8 @@ BeamSearchBatchConfig dfs_tree_inputs.erase(request.guid); } else { // Request not finished, pass verified_tokens to next iteration - + + std::cout << "parse to next iteration: " << "\n"; new_bc.request_completed[i] = false; new_bc.request_running[i] = true; @@ -752,6 +753,12 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].sub_request_num = 1; new_bc.sub_requests[i] = 1; + new_bc.topology_mask[i].allocated_tokens = request.tokens.size(); + + //assign new kv cache position + for(int j = 0; j < request.tokens.size(); j++){ + new_bc.topology_mask[i].real_token_pos[0][j] = j; + } // Token Info for (int j = 0; j < verified_tokens.size(); j++) { @@ -768,6 +775,8 @@ BeamSearchBatchConfig // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; + std::cout << "num_gen ++ " << "\n"; + num_generation_tokens++; // Add verified token to request's token list request.tokens.push_back(token.first); @@ -776,6 +785,8 @@ 
BeamSearchBatchConfig break; } } + + std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token @@ -817,6 +828,7 @@ BeamSearchBatchConfig } new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.topology_mask[i].allocated_tokens = 0; new_bc.sub_requests[i] = 1; @@ -875,7 +887,11 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + new_bc.beamRequestsInfo[i].sub_request_num = 1; + printf("sub request num == 1, %d \n", + new_bc.beamRequestsInfo[i].beam_size); + new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -892,6 +908,7 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; } + new_bc.topology_mask[i].allocated_tokens = 0; // if (new_bc.requestsInfo[i].num_tokens_in_batch < // new_request.initial_len) { @@ -927,6 +944,8 @@ BeamSearchBatchConfig } new_bc.num_generation_tokens = num_generation_tokens; + std::cout << "prepare next batch init gen tokens: " << new_bc.num_generation_tokens << "\n"; + if (verbose) { std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" << std::endl; @@ -969,10 +988,10 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { std::lock_guard const lock(request_queue_mutex); - if (verbose) { + if (true) { std::cout << "\n############### prepare_next_batch_beam ###############\n"; } - if (verbose) { + if (true) { std::cout << "print all results" << "\n"; for (int i = 0; i < 40; i++) { @@ -980,6 +999,8 @@ BeamSearchBatchConfig } std::cout << "Current Beam Depth: " << old_bc.beamRequestsInfo[0].current_depth << "\n"; + std::cout << "Current sub request num: " + << old_bc.beamRequestsInfo[0].sub_request_num << "\n"; } // Step 1: Store result to the beam tree struct store_beam_metadata(old_bc, result); @@ -1049,6 +1070,7 @@ BeamSearchBatchConfig spec_infer_tree_width.size() > ssm_decoding_steps ? spec_infer_tree_width[ssm_decoding_steps] : 1; + new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; @@ -1154,13 +1176,16 @@ BeamSearchBatchConfig spec_infer_tree_width.size() > ssm_decoding_steps ? 
spec_infer_tree_width[ssm_decoding_steps] : 1; + printf("beam size: %d, %d\n", + new_bc.beamRequestsInfo[i].beam_size, + ssm_decoding_steps); new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; new_bc.sub_requests[i] = old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].sub_request_num = - old_bc.beamRequestsInfo[i].sub_request_num * - new_bc.beamRequestsInfo[i].beam_size; + old_bc.beamRequestsInfo[i].sub_request_num; + assert(new_bc.beamRequestsInfo[i].sub_request_num <= BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); @@ -1230,6 +1255,16 @@ BeamSearchBatchConfig old_bc.print(); new_bc.print(); } + + if (true) { + std::cout << "print all resultsBBB" + << "\n"; + for (int i = 0; i < 40; i++) { + std::cout << result.token_ids[i] << ", "; + } + std::cout << "Current Beam DepthBBB: " + << old_bc.beamRequestsInfo[0].current_depth << "\n"; + } return new_bc; } @@ -1296,6 +1331,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { + std::cout << "prepare next batch running: pending\n" + << "\n"; new_bc.request_running[i] = true; std::cout << "[Verify] Request " << request.guid << " is running" << std::endl; @@ -1401,6 +1438,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else if (request.status == Request::PENDING) { + std::cout << "prepare next batch verify: pending\n" + << "\n"; new_bc.request_running[i] = false; if (verbose) { std::cout << "[Verify] Request " << request.guid @@ -1450,6 +1489,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << std::endl; if (request.llm_cache_size < request.initial_len) { + std::cout << "Initialization (prompt) phase: " + << new_bc.requestsInfo[i].num_tokens_in_batch << ", " + << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1457,7 +1499,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens[request.llm_cache_size + j]; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; - + std::cout << "load prompt tokens: " << j << ": " << new_bc.tokensInfo[new_bc.num_tokens].token_id << "\n"; new_bc.num_tokens++; } @@ -1483,6 +1525,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else { // launch the request into running phase after loading all prompt if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { + std::cout << "Initialization running phase: " + << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; new_bc.request_running[i] = true; @@ -1521,7 +1565,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request; int result_index = 0; - if (verbose) { + if (true) { std::cout << "Store total of " << old_bc.num_tokens << " tokens in the current batch.\n"; } @@ -1535,7 +1579,8 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, int beam_size = old_bc.beamRequestsInfo[index].beam_size; // int leaf_node_num = old_bc.sub_requests[index]; - int leaf_node_num = old_bc.beamRequestsInfo[i].sub_request_num; + int leaf_node_num = + old_bc.beamRequestsInfo[index].sub_request_num * beam_size; int depth = 
old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results @@ -1545,18 +1590,26 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // index result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - leaf_node_num; + beam_size; - if (verbose) { + // result_index += old_bc.topology_mask[index].allocated_tokens; + + if (true) { std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] << "\n"; + << ", value: " << result.token_ids[result_index] + << ", leaf node num: " << leaf_node_num << ", depth" << depth + << ", beam size: " << beam_size << "\n"; } Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; + if (old_bc.requestsInfo[index].num_tokens_in_batch == 0) { + continue; + } + if (depth == 1) { // store the last input into the tree; - if (verbose) { + if (true) { std::cout << "try to store the input" << "\n"; } @@ -1566,7 +1619,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; - if (verbose) { + if (true) { std::cout << "Store the previous last token to the tree root: " << request.tokens.back() << "\n"; } @@ -1583,7 +1636,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, .treeLayers[depth] .parent_ids[beam_id] = result.parent_id[result_index]; - if (verbose) { + if (true) { std::cout << "tree value: " << depth << "token: " << request.beam_trees.at(old_bc.model_id) .treeLayers[depth] @@ -1592,7 +1645,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } result_index += 1; } - // update the guid and start_depth for current request if (i < old_bc.num_tokens) { guid = old_bc.requestsInfo[index].request_guid; @@ -1600,6 +1652,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } } } + + if (old_bc.num_tokens != 10) { + assert(false); + } } // for updating the beam search metadata in requests in incremental phase @@ -1638,7 +1694,6 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, new_bc.beamRequestsInfo[request_index].tokens[j] = tree.treeLayers[depth].tokens[j]; - // new_bc.topology_mask[request_index].real_token_pos[j] = } assert(false); @@ -1784,7 +1839,7 @@ std::vector> // depth) pairs for (auto const &pair : inputSerializedTree) { oss << " " << pair.second << ":" << pair.first; - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + log_req_mgr.print("(%d, %d)", pair.first, pair.second); } log_req_mgr.print("Input tree:%s", oss.str().c_str()); } @@ -1793,7 +1848,7 @@ std::vector> // outputSerializedTree is an array of (token id, depth + 1) pairs std::ostringstream oss; for (auto const &pair : outputSerializedTree) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + log_req_mgr.print("(%d, %d)", pair.first, pair.second); oss << " " << pair.second << ":" << pair.first; } log_req_mgr.print("Output tree:%s", oss.str().c_str()); @@ -1847,7 +1902,7 @@ std::vector> // log_req_mgr.print("========Verified============"); std::ostringstream oss; for (auto const &pair : verifiedTree) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + log_req_mgr.print("(%d, %d)", pair.first, pair.second); oss << " " << pair.second << ":" << pair.first; } log_req_mgr.print("Verified:%s", oss.str().c_str()); diff --git 
a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index f4500d152d..b76c5c326e 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -91,17 +91,17 @@ void RequestManager::load_tokens_task( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::topology_mask), - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::topology_mask) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), cudaMemcpyHostToDevice, stream); } From 617a29fdda4e79d0d9c7bbcc1455ed447c42988f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 26 Dec 2023 13:43:49 -0500 Subject: [PATCH 273/344] fix speculative --- .../specinfer_inc_multihead_self_attention.cu | 42 ++++--- src/runtime/request_manager.cc | 107 +++++++++++++----- 2 files changed, 109 insertions(+), 40 deletions(-) diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 9d6f70d5ba..63cd90f44f 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -134,11 +134,20 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( ii * THREADS_PER_KEY * K_VEC_SIZE); } - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][0]); - printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][1]); - printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][2]); - printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][10]); + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 0) { + printf("cacheposssssA %d, %d\n", tree_branch_num, topology.real_token_pos[0][0]); + printf("cacheposssssB %d, %d\n", tree_branch_num, topology.real_token_pos[0][1]); + printf("cacheposssssC %d, %d\n", tree_branch_num, topology.real_token_pos[0][2]); + printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][11]); + printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][12]); + printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][13]); + }else if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 1) { + printf("cacheposssssE %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][0]); + printf("cacheposssssF %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][1]); + printf("cacheposssssG %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][2]); + printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][11]); + printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][12]); + printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][13]); } __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { @@ -289,7 +298,7 @@ __global__ void 
compute_specinfer_attention_kernel_generation_kernel( // Output the final values. if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { convert_from_float( - *reinterpret_cast(output_ptr + request_idx * hidden_size + + *reinterpret_cast(output_ptr + (request_idx + sub_req_idx) * hidden_size + head_idx * per_head_size + vi), out); } @@ -332,7 +341,7 @@ __global__ void specinfer_store_kv_cache( int const beam_size = beamRequestInfos[req_id].sub_request_num; - int real_idx = tok_id - first_token_in_req + allocated_tokens; + int real_idx = tok_id - first_token_in_req + allocated_tokens + sub_req_id; if (i == 0) { printf("ffasdasds%d, %d, %d, %d, %d, %d\n", @@ -343,10 +352,15 @@ __global__ void specinfer_store_kv_cache( first_token_in_req, real_idx); } - // }else if(i == hidden_size * 2){ - // printf("ffasdasdskkkk%d, %d, %d\n", allocated_tokens, tok_id, - // sub_req_id); - // } + else if(i == hidden_size * 2){ + printf("hshddhdhdsdaww%d, %d, %d, %d, %d, %d\n", + beamTokenInfos[0].sub_request_index, + allocated_tokens, + sub_req_id, + tok_id, + first_token_in_req, + real_idx); + } @@ -547,7 +561,7 @@ void compute_attention_kernel_prompt( // To get B, skip over K entries from previous requests (all heads + // padding) - print_tensor((float*)A, 32, "A"); + // print_tensor((float*)A, 32, "A"); std::cout << "meta: " << num_new_tokens << ", " << total_tokens << "\n"; DT const *B = static_cast
<DT const *>(m->keyCache) + (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * kt_req_block_size; @@ -583,7 +597,7 @@ m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - print_tensor((float*)C, 32, "C"); + // print_tensor((float*)C, 32, "C"); // add alibi position bias to qk production // add alibi position bias to qk production if (*m->position_bias) { @@ -669,7 +683,7 @@ // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - print_tensor((float*)C_softmax, 32, "C_softmax"); + // print_tensor((float*)C_softmax, 32, "C_softmax"); C = static_cast<DT *>
(m->attn_heads) + (tokens_previous_requests + bc->num_generation_tokens) * m->num_q_heads * m->vProjSize; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 845a580c13..775280e2cf 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -714,8 +714,9 @@ BeamSearchBatchConfig dfs_tree_inputs.erase(request.guid); } else { // Request not finished, pass verified_tokens to next iteration - - std::cout << "parse to next iteration: " << "\n"; + + std::cout << "parse to next iteration: " + << "\n"; new_bc.request_completed[i] = false; new_bc.request_running[i] = true; @@ -755,8 +756,8 @@ BeamSearchBatchConfig new_bc.sub_requests[i] = 1; new_bc.topology_mask[i].allocated_tokens = request.tokens.size(); - //assign new kv cache position - for(int j = 0; j < request.tokens.size(); j++){ + // assign new kv cache position + for (int j = 0; j < request.tokens.size(); j++) { new_bc.topology_mask[i].real_token_pos[0][j] = j; } @@ -775,7 +776,8 @@ BeamSearchBatchConfig // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; - std::cout << "num_gen ++ " << "\n"; + std::cout << "num_gen ++ " + << "\n"; num_generation_tokens++; // Add verified token to request's token list @@ -785,7 +787,6 @@ BeamSearchBatchConfig break; } } - std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically @@ -944,7 +945,8 @@ BeamSearchBatchConfig } new_bc.num_generation_tokens = num_generation_tokens; - std::cout << "prepare next batch init gen tokens: " << new_bc.num_generation_tokens << "\n"; + std::cout << "prepare next batch init gen tokens: " + << new_bc.num_generation_tokens << "\n"; if (verbose) { std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" @@ -1078,7 +1080,14 @@ BeamSearchBatchConfig old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].sub_request_num = old_bc.beamRequestsInfo[i].sub_request_num * - new_bc.beamRequestsInfo[i].beam_size; + old_bc.beamRequestsInfo[i].beam_size; + + std::cout << "oldbc : " << old_bc.beamRequestsInfo[i].sub_request_num + << ", " << old_bc.beamRequestsInfo[i].beam_size << "\n"; + + // if (old_bc.beamRequestsInfo[i].current_depth == 3) { + // assert(false); + // } assert(new_bc.beamRequestsInfo[i].sub_request_num <= BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); @@ -1090,6 +1099,10 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. 
update_beam_metadata( new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); + + new_bc.topology_mask[i].allocated_tokens = + old_bc.topology_mask[i].allocated_tokens + + old_bc.beamRequestsInfo[i].sub_request_num; } else { assert(false && "Request should not be pending in beam search phase"); } @@ -1101,6 +1114,7 @@ BeamSearchBatchConfig request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { + // todo check it new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { assert(false && "Request should be done"); @@ -1122,7 +1136,31 @@ BeamSearchBatchConfig << std::endl; } + // for (int j = 0; j < request.tokens.size(); j++) { + // new_bc.topology_mask[i].real_token_pos[0][j] = j; + // } + // register more tokens due to the beam width + std::cout << "register more tokens: " + << new_bc.beamRequestsInfo[i].sub_request_num << ", " + << new_bc.requestsInfo[i].num_tokens_in_batch << ", " + << new_bc.topology_mask[i].allocated_tokens << "\n"; + + // copy meta data and replicate + int replicate_num = new_bc.beamRequestsInfo[i].sub_request_num / + old_bc.beamRequestsInfo[i].sub_request_num; + + for (int j = 0; j < old_bc.beamRequestsInfo[i].sub_request_num; j++) { + int old_idx = j; + for (int k = 0; k < replicate_num; k++) { + int new_idx = j * replicate_num + k; + std::cout << "copy from " << old_idx << "to: " << new_idx << "\n"; + memcpy(new_bc.topology_mask[i].real_token_pos[new_idx], + old_bc.topology_mask[i].real_token_pos[old_idx], + sizeof(int) * BatchConfig::MAX_NUM_TOKENS); + } + } + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { @@ -1135,6 +1173,15 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; + + // width first + new_bc.topology_mask[i].real_token_pos[k][depth] = + new_bc.topology_mask[i].allocated_tokens + num_generation_tokens; + + std::cout << "topology: sub request: " << k << ", " + << ", " << depth << ", " + << new_bc.topology_mask[i].real_token_pos[k][depth] << "\n"; + num_generation_tokens++; } } } @@ -1331,6 +1378,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { + std::cout << "prepare next batch running: pending\n" << "\n"; new_bc.request_running[i] = true; @@ -1415,11 +1463,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_depth_in_request = request.tokens.size() - 1; - + + std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() << "\n"; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); - if (verbose) { + if (true) { std::cout << "[" << j << "] Token: " << token.first << ", Depth:" << token.second << std::endl; } @@ -1436,6 +1485,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } } + assert(false); } else if (request.status == Request::PENDING) { std::cout << "prepare next batch verify: pending\n" @@ -1499,7 +1549,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens[request.llm_cache_size + j]; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; - std::cout << "load prompt tokens: " << j << ": " << new_bc.tokensInfo[new_bc.num_tokens].token_id << "\n"; 
+ std::cout << "load prompt tokens: " << j << ": " + << new_bc.tokensInfo[new_bc.num_tokens].token_id << "\n"; new_bc.num_tokens++; } @@ -1625,7 +1676,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } } + std::cout << "leaffffff: " << leaf_node_num << "\n"; + for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { + request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .tokens[beam_id] = result.token_ids[result_index]; @@ -1635,14 +1689,19 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .parent_ids[beam_id] = result.parent_id[result_index]; - - if (true) { - std::cout << "tree value: " << depth << "token: " - << request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .tokens[beam_id] - << "result tokens: " << result.token_ids[result_index]; - } + std::cout << "??????? beam id: " << beam_id << ", token: " + << request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .tokens[beam_id] + << "\n"; + + // if (true) { + // std::cout << "tree value: " << depth << "token: " + // << request.beam_trees.at(old_bc.model_id) + // .treeLayers[depth] + // .tokens[beam_id] + // << "result tokens: " << result.token_ids[result_index]; + // } result_index += 1; } // update the guid and start_depth for current request @@ -1652,10 +1711,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } } } - - if (old_bc.num_tokens != 10) { - assert(false); - } } // for updating the beam search metadata in requests in incremental phase @@ -1672,7 +1727,7 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; // int leaf_node_num = old_bc.sub_requests[request_index]; - int leaf_node_num = old_bc.beamRequestsInfo[request_index].sub_request_num; + int leaf_node_num = new_bc.beamRequestsInfo[request_index].sub_request_num; if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { // TODO: check if this is correct @@ -1693,9 +1748,9 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, tree.treeLayers[depth].probs[j]; new_bc.beamRequestsInfo[request_index].tokens[j] = tree.treeLayers[depth].tokens[j]; - + std::cout << "token: " << j << ": " + << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; } - assert(false); // std::set parents; // std::set childs; From b5f9d5d2d5eea50951a466d339bdc47910e69e07 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 28 Dec 2023 01:57:39 -0500 Subject: [PATCH 274/344] bitmap+tree verify --- include/flexflow/batch_config.h | 20 +- include/flexflow/config.h | 3 +- .../inc_multihead_self_attention_utils.cuh | 2 +- .../specinfer_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + include/flexflow/request_manager.h | 10 + src/ops/argmax.cc | 2 + src/ops/inc_multihead_self_attention.cu | 8 +- src/ops/kernels/embedding_kernels.cu | 1 + .../specinfer_inc_multihead_self_attention.cu | 202 ++++++++---- src/ops/tree_inc_multihead_self_attention.cu | 197 ++++++++---- src/runtime/request_manager.cc | 291 ++++++++++++++---- src/runtime/request_manager.cu | 12 + 13 files changed, 562 insertions(+), 188 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index dd947bbd85..db5d4a8e48 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -56,6 +56,7 @@ class BatchConfig { // across workers static int const 
MAX_NUM_REQUESTS = 64; static int const MAX_NUM_TOKENS = 1024; + static int const MAX_SPEC_TREE_TOKEN_NUM = 64; // Set by update int num_tokens; @@ -75,6 +76,24 @@ class BatchConfig { int request_index; TokenId token_id; }; + + struct BitMask { + unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0}; + + // how many tokens before the tree, every sub requests need this part of + // cache + int non_tree_cache_size; + + // current tree size + int tree_size; + + int this_layer_size; + + // input length-> prompt/root + int prompt_size; + }; + + BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; @@ -154,7 +173,6 @@ class BeamSearchBatchConfig : public BatchConfig { int allocated_tokens; }; - BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; SpecInferTopology topology_mask[MAX_NUM_REQUESTS]; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 321d14961b..fe261dfb48 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -83,7 +83,8 @@ struct FFHandler { sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::topology_mask) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo); + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + + sizeof(BatchConfig::causalMask); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index c128c1a126..0c065b6b0e 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -456,7 +456,7 @@ inline size_t smem_size_in_bytes(int hidden_size_per_head, int threads_per_block) { // The amount of shared memory needed to store the Q*K^T values in float. - size_t qk_sz = div_up(max_sequence_length + 1, 4) * 16; + size_t qk_sz = div_up(1000 + 1, 4) * 16; size_t logits_sz = qk_sz; // The total size needed during softmax. 
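
The BatchConfig::BitMask added in this commit packs the causal structure of the speculative token tree into 64-bit words: mask[i] is the visibility word for the i-th tree token on the key side, and bit j of that word is set when tree token j (the query) is allowed to attend to token i. Tokens verified in earlier steps occupy the first non_tree_cache_size slots of the KV cache and are visible to every query. A minimal host-side sketch of the check the attention kernels perform (the helper name and struct here are illustrative, not part of the patch):

#include <cstdint>

struct BitMaskSketch {
  uint64_t mask[64] = {0};     // mask[key_tree_pos]: bit q set => visible to query q
  int non_tree_cache_size = 0; // committed tokens preceding the tree in the cache
  int tree_size = 0;           // number of tokens currently in the tree
};

// Same predicate as the kernels' mask test, written for the host.
inline bool is_visible(BitMaskSketch const &bm,
                       int key_cache_pos,    // position in the KV cache
                       int query_tree_pos) { // query position within the tree
  if (key_cache_pos < bm.non_tree_cache_size) {
    return true; // prompt / previously committed tokens are always visible
  }
  int key_tree_pos = key_cache_pos - bm.non_tree_cache_size;
  return (bm.mask[key_tree_pos] >> query_tree_pos) & 1ULL;
}
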
diff --git a/include/flexflow/ops/specinfer_inc_multihead_self_attention.h b/include/flexflow/ops/specinfer_inc_multihead_self_attention.h index 6e5dc73b5c..eb1b2882c3 100644 --- a/include/flexflow/ops/specinfer_inc_multihead_self_attention.h +++ b/include/flexflow/ops/specinfer_inc_multihead_self_attention.h @@ -143,6 +143,7 @@ class SpecInferIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionM BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask; + BatchConfig::BitMask *causalMask; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 6e2da19ce9..d160da4a72 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -147,6 +147,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int num_active_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; + BatchConfig::BitMask *causalMask; }; }; // namespace FlexFlow diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e67888d2d6..dc1939c74b 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -110,6 +110,16 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void initBitMask(BatchConfig::BitMask &bitmask, int initLength); + void appendBitMask(BatchConfig::BitMask &bitmask, + int newNodes, + int preBeamSize, + int old_sub_num, + BeamTree const tree, + int currentDepth); + void updateBitMask(BatchConfig::BitMask &bitmask, + int initLength, + int non_tree_size); FFModel *get_model(int model_id); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index f336c843e8..0344c707fc 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -398,6 +398,8 @@ InferenceResult ArgMax::save_inference_tensors_to_file( m, shard_id, bc, {}, {}, {input, indices}); } + + print_tensor(indices.get_int32_ptr(), 32, "tree attn output"); download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 4c184acb3c..a05dbbf919 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1364,8 +1364,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { + case INC_DECODING_MODE: { key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); @@ -1374,7 +1373,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case BEAM_SEARCH_MODE: + case TREE_VERIFY_MODE: { // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * @@ -1402,7 +1402,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 2 * qk_prod_size + attn_heads_size) * size_of_dt + complex_size * sizeof(cuFloatComplex); // more components will - // be added here later + // be added here later 
if (offload) { // assert that we have enough reserved work space left size_t totalSharedSize = diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 22d8161ff1..91f5d60e85 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,6 +118,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } + print_tensor(input.get_int32_ptr(), 32, "embeddinginput"); } /*static*/ diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 63cd90f44f..e8ac1d980c 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -51,6 +51,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, BeamSearchBatchConfig::SpecInferTopology *topology_mask, + BatchConfig::BitMask *causalMask, int max_tree_branches) { // q, k @@ -75,11 +76,18 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( BeamSearchBatchConfig::SpecInferTopology topology = topology_mask[request_idx]; + BatchConfig::BitMask bitmask = causalMask[request_idx]; int const first_step = 0; int const tlength = request_infos[request_idx].first_token_depth_in_request + request_infos[request_idx].num_tokens_in_batch; + + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); + } + + int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; // int const qlength = request_infos[request_idx].num_tokens_in_batch; int const tree_branch_num = beam_request_infos[request_idx].sub_request_num; @@ -88,7 +96,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[request_idx].num_tokens_in_batch; + // first_token_idx += request_infos[request_idx].num_tokens_in_batch; + first_token_idx += bitmask.this_layer_size; } // shared memory objects @@ -124,7 +133,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( request_idx * max_seq_length * hidden_size * max_tree_branches + ki; int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; for (int sub_req_idx = 0; sub_req_idx < tree_branch_num; sub_req_idx += 1) { #pragma unroll @@ -134,21 +143,25 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( ii * THREADS_PER_KEY * K_VEC_SIZE); } - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 0) { - printf("cacheposssssA %d, %d\n", tree_branch_num, topology.real_token_pos[0][0]); - printf("cacheposssssB %d, %d\n", tree_branch_num, topology.real_token_pos[0][1]); - printf("cacheposssssC %d, %d\n", tree_branch_num, topology.real_token_pos[0][2]); - printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][11]); - printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][12]); - printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][13]); - }else if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 1) { - printf("cacheposssssE %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][0]); - 
printf("cacheposssssF %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][1]); - printf("cacheposssssG %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][2]); - printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][11]); - printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][12]); - printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][13]); - } + int const query_token = bitmask.tree_size - tree_branch_num + sub_req_idx; + + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 0) { + // printf("fuckmasksss %d, %d, %d, %d, %d\n", + // bitmask.prompt_size, + // bitmask.non_tree_cache_size, + // tree_branch_num, + // bitmask.tree_size, + // tlength); + // printf("cacheposssssB %d, %d\n", tree_branch_num, + // topology.real_token_pos[0][1]); + // printf("cacheposssssC %d, %d\n", tree_branch_num, + // topology.real_token_pos[0][2]); + // printf("cacheposssssD %d, %d\n", tree_branch_num, + // topology.real_token_pos[0][11]); printf("cacheposssssD %d, %d\n", + // tree_branch_num, topology.real_token_pos[0][12]); + // printf("cacheposssssD %d, %d\n", tree_branch_num, + // topology.real_token_pos[0][13]); + } __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; @@ -156,22 +169,33 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { + if (ti < totalCacheSize) { // find the real position of the cache; // depth: 0, 1, 2, 3, 4, 4, 5, 5 ,5, 5, - int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + // int const real_cache_idx = + // topology.real_token_pos[sub_req_idx][ti]; k[ii] = *reinterpret_cast( - k_cache_batch + real_cache_idx * hidden_size + - head_idx * per_head_size + jj); + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); } } float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { // todo add alobi here - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); + // bool const mask = ti_circ >= totalCacheSize; + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + + if (blockIdx.y == 0 && blockIdx.x == 0 && mask && sub_req_idx == 0) { + // printf("specinfer mask: ti:%d, %d, %d, %d, %lld\n", + // ti, + // totalCacheSize, + // ti - bitmask.non_tree_cache_size, + // query_token, + // bitmask.mask[ti - bitmask.non_tree_cache_size]); + // assert(false); } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; @@ -208,10 +232,14 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti - first_step] - qk_max); + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; - qk_smem[ti - first_step] = logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; } // Compute the sum. 
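
The hunk above makes masked-out cache positions contribute nothing to the softmax instead of asserting. A host-side reference of what the qk_max / exp_sum / normalization sequence computes for a single query row (illustrative only; the kernel performs the same reductions with warp shuffles and shared memory):

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

// probs[i] stays 0 for masked positions; the 1e-6 term mirrors the kernel's
// __fdividef(1.f, exp_sum + 1.e-6) guard against an all-masked row.
std::vector<float> masked_softmax(std::vector<float> const &qk,
                                  std::vector<bool> const &masked) {
  float qk_max = -FLT_MAX;
  for (size_t i = 0; i < qk.size(); ++i) {
    if (!masked[i]) {
      qk_max = std::max(qk_max, qk[i]);
    }
  }
  float exp_sum = 0.f;
  std::vector<float> probs(qk.size(), 0.f);
  for (size_t i = 0; i < qk.size(); ++i) {
    if (!masked[i]) {
      probs[i] = std::exp(qk[i] - qk_max);
      exp_sum += probs[i];
    }
  }
  float inv_sum = 1.f / (exp_sum + 1.e-6f);
  for (float &p : probs) {
    p *= inv_sum;
  }
  return probs;
}
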
@@ -219,7 +247,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { qk_smem[ti - first_step] *= inv_sum; } @@ -254,14 +283,17 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // vi; if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { // Load the values from the cache. int const ti_circ = ti % max_seq_length; - int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; V_vec v = *reinterpret_cast( - v_cache_batch + real_cache_idx * hidden_size + - head_idx * per_head_size); - float logit = qk_smem[ti - first_step]; + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; out = FlexFlow::fma(logit, cast_to_float(v), out); } } @@ -298,7 +330,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // Output the final values. if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { convert_from_float( - *reinterpret_cast(output_ptr + (request_idx + sub_req_idx) * hidden_size + + *reinterpret_cast(output_ptr + + (request_idx + sub_req_idx) * hidden_size + head_idx * per_head_size + vi), out); } @@ -315,6 +348,7 @@ __global__ void specinfer_store_kv_cache( BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask, + BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, int vProjSize, @@ -335,41 +369,57 @@ __global__ void specinfer_store_kv_cache( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const first_token_in_req = requestInfo[req_id].first_token_depth_in_request; + int const first_token_in_req = + requestInfo[req_id].first_token_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; + int const total_token = requestInfo[req_id].num_tokens_in_batch; + + BatchConfig::BitMask bitmask = causalMask[req_id]; + + int const sub_request_num = beamRequestInfos[req_id].sub_request_num; - int const beam_size = beamRequestInfos[req_id].sub_request_num; + int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; + + // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - + // tree_branch_num + sub_req_id + tok_id; + // bitmask.tree_size - tree_branch_num + sub_req_id; + + // if prompt token -> token id + // if tree token: + int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - + bitmask.this_layer_size + token_idx; int real_idx = tok_id - first_token_in_req + allocated_tokens + sub_req_id; - if (i == 0) { - printf("ffasdasds%d, %d, %d, %d, %d, %d\n", - beamTokenInfos[0].sub_request_index, - allocated_tokens, - sub_req_id, - tok_id, - first_token_in_req, - real_idx); - } - else if(i == hidden_size * 2){ - printf("hshddhdhdsdaww%d, %d, %d, %d, %d, %d\n", - 
beamTokenInfos[0].sub_request_index, - allocated_tokens, - sub_req_id, - tok_id, - first_token_in_req, - real_idx); - } - - + // if (i == 0) { + // printf("ffasdasds%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", + // beamTokenInfos[0].sub_request_index, + // allocated_tokens, + // sub_req_id, + // tok_id, + // first_token_in_req, + // real_idx, + // cache_idx, + // bitmask.non_tree_cache_size, + // bitmask.tree_size, + // sub_request_num, + // token_idx ); + // } else if (i == hidden_size * 2) { + // printf("hshddhdhdsdaww%d, %d, %d, %d, %d, %d, %d\n", + // beamTokenInfos[0].sub_request_index, + // allocated_tokens, + // sub_req_id, + // tok_id, + // first_token_in_req, + // real_idx, + // cache_idx); + // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (real_idx) * hidden_size + - offset] = kVal; + (cache_idx)*hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (real_idx) * hidden_size + - offset] = vVal; + (cache_idx)*hidden_size + offset] = vVal; } } @@ -398,6 +448,7 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->beam_token_infos, m->beam_request_infos, m->beam_topology_mask, + m->causalMask, m->qProjSize, m->kProjSize, m->vProjSize, @@ -433,6 +484,7 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->request_infos, \ m->beam_request_infos, \ m->beam_topology_mask, \ + m->causalMask, \ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES) template @@ -520,7 +572,7 @@ void compute_attention_kernel_prompt( for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; - } + } // else if (tokens_previous_requests < bc->num_generation_tokens) { // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; // continue; @@ -728,6 +780,16 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, cudaStream_t stream) { // phase 1: Implement kernel to compute KQV for input tokens + + cudaMemcpyAsync(m->causalMask, + &(bc->causalMask), + bc->num_active_requests() * sizeof(BatchConfig::BitMask), + cudaMemcpyHostToDevice, + stream); + std::cout << "kernel bit mask: " << bc->causalMask[0].prompt_size << ", " + << bc->causalMask[0].non_tree_cache_size << ", " + << bc->causalMask[0].mask[0] << ", " << sizeof(BatchConfig::BitMask) + << "\n"; compute_qkv_kernel(m, bc, shard_id, @@ -830,6 +892,7 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } + // print_tensor(output.get_float_ptr(), 32, "specinc output"); // if(bc->num_tokens == 1){ // print_tensor(input.get_float_ptr(), 32, "specinc input"); @@ -878,6 +941,11 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { + size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; + size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); + gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + total_size); + beam_topology_mask = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + @@ -895,6 +963,16 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::topology_mask) + sizeof(BeamSearchBatchConfig::beamTokenInfo)); + // causalMask = + // static_cast( + // 
handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + // sizeof(BatchConfig::requestsInfo) + + // sizeof(BeamSearchBatchConfig::topology_mask) + + // sizeof(BeamSearchBatchConfig::beamTokenInfo)) + + // sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + causalMask = gpu_mem_allocator.allocate_instance( + causal_mask_size); // beam_token_infos = // gpu_mem_allocator // .allocate_instance( diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 1da56e383a..a3e3adcc30 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -53,6 +53,8 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::PerRequestInfo *request_infos, int num_heads, int num_requests, + int max_tree_branches, + BatchConfig::BitMask *causalMask, int qk_smem_sz) { // q, k @@ -81,6 +83,17 @@ __global__ void compute_attention_kernel_fused_kernel( request_infos[request_idx].num_tokens_in_batch; int const qlength = request_infos[request_idx].num_tokens_in_batch; + BatchConfig::BitMask bitmask = causalMask[request_idx]; + + // bitmask.mask[1] = 3; + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("tree attn fused kernel %d, %d, %d, %lld\n", + tlength, + qlength, + bitmask.non_tree_cache_size, + bitmask.mask[1]); + } + int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { first_token_idx += request_infos[request_idx].num_tokens_in_batch; @@ -115,7 +128,8 @@ __global__ void compute_attention_kernel_fused_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + request_idx * max_seq_length * hidden_size + ki; + key_cache + + request_idx * max_tree_branches * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -127,10 +141,12 @@ __global__ void compute_attention_kernel_fused_kernel( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); } + __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; int const ti_circ = ti % max_seq_length; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; if (ti < tlength) { @@ -142,22 +158,35 @@ __global__ void compute_attention_kernel_fused_kernel( float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + + if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { + printf("tree attn mask for first token %d, %lld, %d, %d\n", + ti, + bitmask.mask[ti - bitmask.non_tree_cache_size], + bitmask.non_tree_cache_size, + qi); } - int pos = ti * qlength + qi; - if (((pos / qlength) % tlength) > (pos % qlength + tlength - qlength)) { - qk = -FLT_MAX; - } qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[pos] = mask ? 0.f : qk; + if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { + printf("tree attn mask for second token %d, %lld, %d, %d, %.10f\n", + ti, + bitmask.mask[ti - bitmask.non_tree_cache_size], + bitmask.non_tree_cache_size, + qi, + qk); + } + qk_smem[ti - first_step] = mask ? 
0.0f : qk; } } + __syncthreads(); +#pragma unroll for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); } @@ -176,66 +205,97 @@ __global__ void compute_attention_kernel_fused_kernel( // The warps finalize the reduction. qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; - +#pragma unroll for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); } // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && tidx == 0) { + printf("tree attn first token qk_max %f\n", + qk_max); + } - float exp_sum = 0.f; + float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti * qlength + qi] - qk_max); + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; - qk_smem[ti * qlength + qi] = logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; } // Compute the sum. exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + printf("expsum %.10f\n", exp_sum); + } + // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti * qlength + qi] *= inv_sum; + qk_smem[ti - first_step] *= inv_sum; } __syncthreads(); - } + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + printf("softmax %.10f\n", qk_smem[0]); + } - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; - Out_sum out; - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + request_idx * max_seq_length * hidden_size + vi; + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - for (int qi = 0; qi < qlength; qi++) { + Out_sum out; zero(out); - __syncthreads(); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + + request_idx * max_seq_length * hidden_size * max_tree_branches + vi; + // DT const *v_cache_batch = + // value_cache + + // (beam_request_idx * max_beam_width + beam_sub_request_idx) * + // max_seq_length * hidden_size + + // vi; + if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { // Load the values from the cache. 
int const ti_circ = ti % max_seq_length; - + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; V_vec v = *reinterpret_cast( v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - float logit = qk_smem[ti * qlength + qi]; + + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; out = FlexFlow::fma(logit, cast_to_float(v), out); + } } - // Make sure we can start writing to shared memory. + // // Make sure we can start writing to shared memory. __syncthreads(); + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + printf("valueX %.10f\n", out.x); + } + // Run the final reduction amongst the different groups computing different // partial outputs. if (Dh == Dh_MAX || vi < Dh) { @@ -268,6 +328,11 @@ __global__ void compute_attention_kernel_fused_kernel( output_ptr + (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi), out); + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + out.x, out.y, out.z, out.w, vi, (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi); + } } } } @@ -380,7 +445,9 @@ __global__ void update_tree_branch_kv_cache_fused( int vProjSize, int num_new_tokens, int max_seq_len, - int hidden_size) { + int hidden_size, + int max_tree_branches, + int first_token_depth) { CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { int token_idx = i / hidden_size; @@ -393,10 +460,10 @@ __global__ void update_tree_branch_kv_cache_fused( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + (token_idx + first_token_depth) * hidden_size + offset] = kVal; + vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + (token_idx + first_token_depth) * hidden_size + offset] = vVal; } } @@ -473,7 +540,6 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, } std::cout << "num_new_tokens: " << num_new_tokens << "\n"; - assert(false); int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); @@ -728,22 +794,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, THDS_PER_KEY, \ THDS_PER_VALUE> \ <<>>( \ - static_cast
<DT *>(m->devQKVProjArray), \
-          static_cast<DT *>(m->keyCache), \
-          static_cast<DT *>(m->valueCache), \
-          output_ptr, \
-          scale, \
-          BatchConfig::max_sequence_length(), \
-          BatchConfig::max_tokens_per_batch(), \
-          m->qProjSize, \
-          m->hidden_size, \
-          m->request_infos, \
-          m->num_q_heads, \
-          bc->num_active_requests(), \
+          static_cast<DT *>(m->devQKVProjArray), static_cast<DT *>(m->keyCache), static_cast<DT *>
(m->valueCache), output_ptr, scale, BatchConfig::max_sequence_length(), BatchConfig::max_tokens_per_batch(), m->qProjSize, m->hidden_size, m->request_infos, m->num_q_heads, bc->num_active_requests(), BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, m->causalMask, \ smem_sz[0]) template -void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, DT *output_ptr, cudaStream_t stream) { @@ -752,6 +807,12 @@ void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, // update K-V cache int num_new_tokens = bc->num_active_tokens(); int parallelism = m->hidden_size * num_new_tokens; + printf("update KV cache %d, idx: %d\n", + num_new_tokens, + bc->requestsInfo[0].first_token_depth_in_request); + for (int i = 0; i < num_new_tokens; i++) { + printf("abs depth:%d\n", bc->tokensInfo[i].abs_depth_in_request); + } update_tree_branch_kv_cache_fused<<vProjSize, num_new_tokens, BatchConfig::max_sequence_length(), - m->hidden_size); + m->hidden_size, + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, + bc->requestsInfo[0].first_token_depth_in_request); dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; @@ -816,12 +879,19 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache + std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << "\n"; + cudaMemcpyAsync(m->committed_token_infos, &(bc->committed_tokens), bc->num_tokens_to_commit * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(m->causalMask, + &(bc->causalMask), + bc->num_active_requests() * sizeof(BatchConfig::BitMask), + cudaMemcpyHostToDevice, + stream); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -948,6 +1018,20 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventDestroy(t_start); cudaEventDestroy(t_end); } + + // print_tensor(output.get_float_ptr(), 32, "tree attn kernel"); + + // save_tensor( + // input.get_float_ptr(), + // 768 * bc->num_active_tokens(), + // "/home/xinhaoc/FlexFlow/inference/output/Newtreeinput.txt"); + // save_tensor( + // output.get_float_ptr(), + // 768 * bc->num_active_tokens(), + // "/home/xinhaoc/FlexFlow/inference/output/Newtreeoutput.txt"); + // std::cout << "new tokens: " << bc->num_active_tokens() << "\n"; + + // assert(bc->num_tokens_to_commit == 0); } TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( @@ -993,8 +1077,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( { int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); size_t committed_tokeninfo_size = max_tokens_per_batch; + size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; + size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo) + + causal_mask_size * sizeof(BatchConfig::BitMask); if (offload) { // assert that we have enough reserved work space left assert(gpu_mem_allocator.reserved_total_size - @@ -1004,6 +1091,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( gpu_mem_allocator .allocate_reserved( committed_tokeninfo_size); + causalMask = gpu_mem_allocator.allocate_instance( + causal_mask_size); } else { gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, total_size); @@ -1011,6 +1100,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( gpu_mem_allocator .allocate_instance( committed_tokeninfo_size); + causalMask = gpu_mem_allocator.allocate_instance( + causal_mask_size); } } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 775280e2cf..8a7cea1cc3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -16,6 +16,7 @@ #include "flexflow/request_manager.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" +#include #include #include #include @@ -735,6 +736,11 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length - new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); + // std::cout << "max depth: " << new_max_depth << ", " + // << new_bc.requestsInfo[i].first_token_depth_in_request << + // ", " + // << verified_tokens.size() << "\n"; + // assert(false); new_bc.beamRequestsInfo[i].current_depth = 1; profiling_requests[request.guid].ssm_decoding_steps = 0; @@ -761,6 +767,10 @@ BeamSearchBatchConfig new_bc.topology_mask[i].real_token_pos[0][j] = j; } + updateBitMask(new_bc.causalMask[i], + verified_tokens.size(), + request.tokens.size()); + // Token Info for (int j = 0; j < verified_tokens.size(); j++) { auto token = verified_tokens.at(j); @@ -910,6 +920,11 @@ BeamSearchBatchConfig new_bc.num_tokens++; } new_bc.topology_mask[i].allocated_tokens = 0; + new_bc.causalMask[i].non_tree_cache_size = 0; + new_bc.causalMask[i].tree_size = + new_bc.requestsInfo[i].num_tokens_in_batch; + initBitMask(new_bc.causalMask[i], + new_bc.requestsInfo[i].num_tokens_in_batch); // if (new_bc.requestsInfo[i].num_tokens_in_batch < // new_request.initial_len) { @@ -1161,6 +1176,27 @@ BeamSearchBatchConfig } } + memcpy(&new_bc.causalMask[i], + 
&old_bc.causalMask[i], + sizeof(BatchConfig::BitMask)); + // sub_request_num -> nodes of input next iteration + // beam_size replicate num + + std::cout << "print beam tree: " + << old_bc.beamRequestsInfo[i].current_depth << "\n"; + BeamTree tree = request.beam_trees[old_bc.model_id]; + for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { + std::cout << "layer: " << k << "\n"; + std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer + << "\n"; + } + appendBitMask(new_bc.causalMask[i], + new_bc.beamRequestsInfo[i].sub_request_num, + old_bc.beamRequestsInfo[i].beam_size, + old_bc.beamRequestsInfo[i].sub_request_num, + tree, + old_bc.beamRequestsInfo[i].current_depth); + // assert(false); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { @@ -1248,6 +1284,10 @@ BeamSearchBatchConfig assert(false && "Request should be pending"); } + memcpy(&new_bc.causalMask[i], + &old_bc.causalMask[i], + sizeof(BatchConfig::BitMask)); + if (new_bc.requestsInfo[i].first_token_depth_in_request >= request.tokens.size()) { // request is done @@ -1260,6 +1300,13 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; + BeamTree tree = request.beam_trees[old_bc.model_id]; + appendBitMask(new_bc.causalMask[i], + new_bc.beamRequestsInfo[i].sub_request_num, + old_bc.beamRequestsInfo[i].beam_size, + old_bc.beamRequestsInfo[i].sub_request_num, + tree, + old_bc.beamRequestsInfo[i].current_depth); } if (verbose) { @@ -1378,7 +1425,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { - + std::cout << "prepare next batch running: pending\n" << "\n"; new_bc.request_running[i] = true; @@ -1398,7 +1445,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector> dfs_tree_inputs = merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - if (verbose) { + if (true) { std::cout << "Request Tokens Size: " << request.tokens.size() << std::endl; for (int k = 0; k < request.tokens.size(); k++) { @@ -1414,6 +1461,13 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + + // copy bitmask to verify batchconfig + memcpy(&(new_bc.causalMask[i]), + &(old_batches.at(0).causalMask[i]), + sizeof(BatchConfig::BitMask)); + // std::cout << "bitmask: " << new_bc.causalMask[i].mask[0] << "\n"; + // assert(false); // TODO: Check this new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; @@ -1429,7 +1483,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; - if (verbose) { + if (true) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << committed_token.first @@ -1441,7 +1495,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } } - if (verbose) { + if (true) { std::cout << "new_bc.num_tokens_to_commit: " << new_bc.num_tokens_to_commit << std::endl; } @@ -1463,8 +1517,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( 
new_bc.requestsInfo[i].first_token_depth_in_request = request.tokens.size() - 1; - - std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() << "\n"; + + std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() + << "\n"; + // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); @@ -1485,7 +1541,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } } - assert(false); } else if (request.status == Request::PENDING) { std::cout << "prepare next batch verify: pending\n" @@ -1518,6 +1573,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } + memcpy(&(new_bc.causalMask[i]), + &(old_batches.at(0).causalMask[i]), + sizeof(BatchConfig::BitMask)); + // std::cout << "bitmask: " << new_bc.causalMask[i].mask[0] << "\n"; + // assert(false); + // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = request.llm_cache_size; @@ -1643,8 +1704,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_size; - // result_index += old_bc.topology_mask[index].allocated_tokens; - if (true) { std::cout << "i = " << i << ", result index = " << result_index << ", value: " << result.token_ids[result_index] @@ -1669,6 +1728,9 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.tokens.back(); request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; + request.beam_trees.at(old_bc.model_id) + .treeLayers[0] + .nodes_num_this_layer = 1; if (true) { std::cout << "Store the previous last token to the tree root: " @@ -1677,7 +1739,9 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } std::cout << "leaffffff: " << leaf_node_num << "\n"; - + request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .nodes_num_this_layer = leaf_node_num; for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { request.beam_trees.at(old_bc.model_id) @@ -1751,50 +1815,6 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, std::cout << "token: " << j << ": " << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; } - - // std::set parents; - // std::set childs; - // // cache stealing - // for (int j = 0; j < beam_size; j++) { - // int parent_id = tree.treeLayers[depth].parent_ids[j]; - // if (childs.find(parent_id) == childs.end()) { - // // copy beam slot - // new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = - // tree.treeLayers[depth].parent_ids[j]; - // new_bc.beamRequestsInfo[request_index].probs[parent_id] = - // tree.treeLayers[depth].probs[j]; - // new_bc.beamRequestsInfo[request_index].tokens[parent_id] = - // tree.treeLayers[depth].tokens[j]; - // parents.emplace(j); - // childs.emplace(parent_id); - // } - // } - // if (parents.size() < beam_size) { - // for (int j = 0; j < beam_size; j++) { - // if (parents.find(j) == parents.end()) { - // // this slot has not been assigned - // // find the smallest not assigned child and put in - // if (verbose) { - // std::cout << "request_index" << request_index - // << ", miss slot: " << j << "\n"; - // } - // for (int k = 0; k < beam_size; k++) { - // if (childs.find(k) == childs.end()) { - // // parent -> j to child k; - // new_bc.beamRequestsInfo[request_index].parent_id[k] = - // 
tree.treeLayers[depth].parent_ids[j]; - // new_bc.beamRequestsInfo[request_index].probs[k] = - // tree.treeLayers[depth].probs[j]; - // new_bc.beamRequestsInfo[request_index].tokens[k] = - // tree.treeLayers[depth].tokens[j]; - // parents.emplace(j); - // childs.emplace(k); - // break; - // } - // } - // } - // } - // } } if (verbose) { std::cout << "-----------after parent id exchange-----------" << std::endl; @@ -1809,6 +1829,128 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, } } +// bit mask related function + +// prompt phase, init task +void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, + int initLength) { + assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + + bitmask.prompt_size = initLength; + bitmask.this_layer_size = initLength; + bitmask.tree_size = initLength; + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = i; j < bitmask.prompt_size; j++) { + bitmask.mask[i] |= (1 << j); + } + } + std::cout << "see bit mask" << bitmask.prompt_size << "\n"; + std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; + std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; + std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; +} + +// prepare next init +void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, + int initLength, + int non_tree_size) { + // assert(initLength == 1); + // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + bitmask.non_tree_cache_size = non_tree_size; + bitmask.tree_size = initLength; + bitmask.this_layer_size = initLength; + std::cout << "non_tree_size: " << non_tree_size << "\n"; + bitmask.prompt_size = initLength; + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = i; j < bitmask.prompt_size; j++) { + bitmask.mask[i] |= (1 << j); + } + } + + std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; + std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) + << "\n"; + std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[1]) + << "\n"; + std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[2]) + << "\n"; +} + +// prepare next beam, append layers to the tree +void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, + int newNodes, + int preBeamSize, + int old_sub_num, + BeamTree const tree, + int currentDepth) { + int pre_tree_size = bitmask.tree_size; + bitmask.tree_size += newNodes; + bitmask.this_layer_size = newNodes; + assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + // preBeamSize: replicate num + + // add relationship with input/prompt + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = pre_tree_size; j < bitmask.tree_size; j++) { + bitmask.mask[i] |= (1 << j); + std::cout << "see bit mask append: " << i << ", to" << j + << std::bitset<64>(bitmask.mask[i]) << "\n"; + } + } + + std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " + << pre_tree_size << ", " << bitmask.prompt_size << ", " + << preBeamSize << "\n"; + + // int num_groups = newNodes / preBeamSize; + // int group_size = newNodes / num_groups; + // add relations to branch + // requests in same groups share same relations, except the last token. 
+ + // set middle layers + // skip the root prompt/tokens + int token_idx = bitmask.prompt_size; + int new_nodes_start_idx = pre_tree_size; + std::cout << "new nodes start " << new_nodes_start_idx << "\n"; + for (int i = 1; i < currentDepth; i++) { + new_nodes_start_idx = pre_tree_size; + int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer; + std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer + << "group size: " << newNodes / nodes_this_layer << "\n"; + for (int j = 0; j < nodes_this_layer; j++) { + int group_size = newNodes / nodes_this_layer; + for (int k = 0; k < group_size; k++) { + bitmask.mask[token_idx] |= (1 << new_nodes_start_idx); + new_nodes_start_idx += 1; + } + token_idx += 1; + } + } + + std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " + << new_nodes_start_idx << ", " << newNodes + << "current depth: " << currentDepth << "\n"; + std::cout << "new nodes end " << new_nodes_start_idx << "\n"; + + std::cout << "tree size: " << bitmask.tree_size << "\n"; + assert(token_idx == pre_tree_size); + assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); + + // assert(currentDepth <= 2); + // set last layer, all tokens are only relevant to it self; + for (int i = token_idx; i < bitmask.tree_size; i++) { + bitmask.mask[i] |= (1 << i); + std::cout << "set rel: " << i << "to: " << i << "\n"; + } +} + bool PreOrder( BeamTree const &tree, int max_depth, @@ -1979,7 +2121,7 @@ std::vector> RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, int first_token_depth_in_request) { - if (verbose) { + if (true) { std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; std::cout << "[Traverse Beam Tree] max_depth: " @@ -1988,6 +2130,8 @@ std::vector> << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; std::cout << "[Traverse Beam Tree] beam_width: " << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; + std::cout << "[Traverse Beam Tree] start index: " + << first_token_depth_in_request << "\n"; } auto guid = old_bc.requestsInfo[request_index].request_guid; @@ -1995,27 +2139,39 @@ std::vector> // std::cout << "request.beam_trees.size(): " << request.beam_trees.size() // << std::endl; BeamTree tree = request.beam_trees.at(old_bc.model_id); - // std::cout << "\n\n"; + std::cout << "print beam tree: " + << "\n"; + std::vector> serializedTree; + for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) { + std::cout << "tree layer: " << i + << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer + << "\n"; + // push tokens into tree + for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) { + std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; + serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i)); + } + } // token, index // todo make this one global for different stages - std::vector> serializedTree; - PreOrder(tree, - old_bc.beamRequestsInfo[request_index].max_depth, - 0, - old_bc.beamRequestsInfo[request_index].beam_size, - 0, - serializedTree, - verbose); + + // PreOrder(tree, + // old_bc.beamRequestsInfo[request_index].max_depth, + // 0, + // old_bc.beamRequestsInfo[request_index].beam_size, + // 0, + // serializedTree, + // verbose); // print it - if (verbose) { + if (true) { std::cout << "Print serialized tree: size:" << request_index << serializedTree.size() << "\n"; } for (int k = 0; k < serializedTree.size(); k++) { serializedTree.at(k).second += first_token_depth_in_request; - if 
(verbose) { + if (true) { std::cout << "token id: " << serializedTree.at(k).first << ", depth: " << serializedTree.at(k).second << "\n"; } @@ -2041,6 +2197,9 @@ std::vector> input_trees, int root_depth, RequestGuid guid) { + assert(input_trees.size() == 1 && "currently using one ssm"); + return input_trees.at(0); + std::vector> merged_tree; std::unordered_map> childrens; diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index b76c5c326e..4d7e2c8806 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -104,6 +104,18 @@ void RequestManager::load_tokens_task( sizeof(BeamSearchBatchConfig::beamRequestsInfo), cudaMemcpyHostToDevice, stream); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // sizeof(BatchConfig::tokensInfo) + + // sizeof(BatchConfig::requestsInfo) + + // sizeof(BeamSearchBatchConfig::topology_mask) + + // sizeof(BeamSearchBatchConfig::beamTokenInfo) + + // sizeof(BeamSearchBatchConfig::beamRequestsInfo), + // &(beam_batch_config->causalMask), + // sizeof(BatchConfig::causalMask), + // cudaMemcpyHostToDevice, + // stream); + // std::cout << "copy calsual mask info: " << beam_batch_config->causalMask[0].prompt_size << "\n"; } } From 945268f1a56e804b62b731c136bf8358c47b765f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 28 Dec 2023 11:19:16 -0500 Subject: [PATCH 275/344] fix. --- inference/spec_infer/spec_infer.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cu | 78 ++++++++++---------- src/runtime/request_manager.cc | 11 ++- 3 files changed, 50 insertions(+), 41 deletions(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 2ccdfd388d..e4fa71a1d5 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -404,7 +404,7 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(text); // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 15 /*max_sequence_length*/); + tree_model.generate(prompts, 23 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index a3e3adcc30..3d5ccf9431 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -162,24 +162,24 @@ __global__ void compute_attention_kernel_fused_kernel( (ti >= bitmask.non_tree_cache_size && (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { - printf("tree attn mask for first token %d, %lld, %d, %d\n", - ti, - bitmask.mask[ti - bitmask.non_tree_cache_size], - bitmask.non_tree_cache_size, - qi); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { + // printf("tree attn mask for first token %d, %lld, %d, %d\n", + // ti, + // bitmask.mask[ti - bitmask.non_tree_cache_size], + // bitmask.non_tree_cache_size, + // qi); + // } qk_max = mask ? 
qk_max : fmaxf(qk_max, qk); - if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { - printf("tree attn mask for second token %d, %lld, %d, %d, %.10f\n", - ti, - bitmask.mask[ti - bitmask.non_tree_cache_size], - bitmask.non_tree_cache_size, - qi, - qk); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { + // printf("tree attn mask for second token %d, %lld, %d, %d, %.10f\n", + // ti, + // bitmask.mask[ti - bitmask.non_tree_cache_size], + // bitmask.non_tree_cache_size, + // qi, + // qk); + // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } } @@ -213,10 +213,10 @@ __global__ void compute_attention_kernel_fused_kernel( // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && tidx == 0) { - printf("tree attn first token qk_max %f\n", - qk_max); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", + // qk_max); + // } float exp_sum = 0.f; @@ -232,9 +232,9 @@ __global__ void compute_attention_kernel_fused_kernel( // Compute the sum. exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - printf("expsum %.10f\n", exp_sum); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("expsum %.10f\n", exp_sum); + // } // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); @@ -243,9 +243,9 @@ __global__ void compute_attention_kernel_fused_kernel( } __syncthreads(); - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - printf("softmax %.10f\n", qk_smem[0]); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("softmax %.10f\n", qk_smem[0]); + // } // value projection constexpr int V_VEC_SIZE = 16 / sizeof(DT); @@ -292,9 +292,9 @@ __global__ void compute_attention_kernel_fused_kernel( // // Make sure we can start writing to shared memory. __syncthreads(); - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - printf("valueX %.10f\n", out.x); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("valueX %.10f\n", out.x); + // } // Run the final reduction amongst the different groups computing different // partial outputs. 
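
For reference while reading the mask checks above: the masks these kernels consume are built by initBitMask/appendBitMask from the previous commit. A standalone sketch of that construction for a 3-token prompt followed by one speculative layer of width 2 (sizes and variable names chosen for the example, not taken from the patch):

#include <bitset>
#include <cstdint>
#include <iostream>

int main() {
  int const prompt_size = 3;
  int const layer_width = 2; // one speculative layer with two candidates
  uint64_t mask[8] = {0};

  // Prompt: ordinary causal visibility (key i is visible to every query j >= i).
  for (int i = 0; i < prompt_size; i++) {
    for (int j = i; j < prompt_size; j++) {
      mask[i] |= (1ULL << j);
    }
  }
  // New layer: every prompt token is an ancestor of the new tokens, and each
  // new token is visible only to itself within its layer.
  int tree_size = prompt_size + layer_width;
  for (int i = 0; i < prompt_size; i++) {
    for (int j = prompt_size; j < tree_size; j++) {
      mask[i] |= (1ULL << j);
    }
  }
  for (int i = prompt_size; i < tree_size; i++) {
    mask[i] |= (1ULL << i);
  }
  for (int i = 0; i < tree_size; i++) {
    std::cout << "key " << i << ": " << std::bitset<8>(mask[i]) << "\n";
  }
  // Prints 00011111, 00011110, 00011100, 00001000, 00010000: the two
  // candidates see the whole prompt plus themselves, but not each other.
  return 0;
}
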
@@ -328,11 +328,11 @@ __global__ void compute_attention_kernel_fused_kernel( output_ptr + (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi), out); - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", - out.x, out.y, out.z, out.w, vi, (first_token_idx + qi) * hidden_size + - head_idx * per_head_size + vi); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, out.y, out.z, out.w, vi, (first_token_idx + qi) * hidden_size + + // head_idx * per_head_size + vi); + // } } } } @@ -807,12 +807,12 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, // update K-V cache int num_new_tokens = bc->num_active_tokens(); int parallelism = m->hidden_size * num_new_tokens; - printf("update KV cache %d, idx: %d\n", - num_new_tokens, - bc->requestsInfo[0].first_token_depth_in_request); - for (int i = 0; i < num_new_tokens; i++) { - printf("abs depth:%d\n", bc->tokensInfo[i].abs_depth_in_request); - } + // printf("update KV cache %d, idx: %d\n", + // num_new_tokens, + // bc->requestsInfo[0].first_token_depth_in_request); + // for (int i = 0; i < num_new_tokens; i++) { + // printf("abs depth:%d\n", bc->tokensInfo[i].abs_depth_in_request); + // } update_tree_branch_kv_cache_fused<<> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); @@ -1426,7 +1429,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (request.status == Request::RUNNING) { - std::cout << "prepare next batch running: pending\n" + std::cout << "prepare next batch running:\n" << "\n"; new_bc.request_running[i] = true; std::cout << "[Verify] Request " << request.guid << " is running" @@ -1663,6 +1666,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } + std::cout << "check dfs tree input size: " << dfs_tree_inputs[1000000].size() + << "\n"; + return new_bc; } @@ -2198,6 +2204,7 @@ std::vector> int root_depth, RequestGuid guid) { assert(input_trees.size() == 1 && "currently using one ssm"); + dfs_tree_inputs[guid] = input_trees.at(0); return input_trees.at(0); std::vector> merged_tree; @@ -2249,6 +2256,8 @@ std::vector> } dfs_tree_inputs[guid] = merged_tree; + // std::cout << "assign dfr tree: " << guid << ", " << merged_tree.size() << ", " + // << dfs_tree_inputs[guid].size() << "\n"; return merged_tree; } From ce95127aecaf553679539310574b48417609efa2 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 29 Dec 2023 03:41:26 -0500 Subject: [PATCH 276/344] fix --- inference/spec_infer/spec_infer.cc | 4 +- src/ops/kernels/embedding_kernels.cu | 2 +- .../specinfer_inc_multihead_self_attention.cu | 76 ++++--- src/ops/tree_inc_multihead_self_attention.cu | 114 ++++++---- src/runtime/request_manager.cc | 198 +++++++++++------- 5 files changed, 246 insertions(+), 148 deletions(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index e4fa71a1d5..9af3e12e5a 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -402,9 +402,9 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); + // tree_model.generate(text, 128 /*max_sequence_length*/); } 
- tree_model.generate(prompts, 23 /*max_sequence_length*/); + tree_model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 91f5d60e85..0cde42de56 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,7 +118,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } - print_tensor(input.get_int32_ptr(), 32, "embeddinginput"); + // print_tensor(input.get_int32_ptr(), 32, "embeddinginput"); } /*static*/ diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index e8ac1d980c..f2ea63d904 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -83,9 +83,9 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int const tlength = request_infos[request_idx].first_token_depth_in_request + request_infos[request_idx].num_tokens_in_batch; - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); + // } int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; // int const qlength = request_infos[request_idx].num_tokens_in_batch; @@ -181,6 +181,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( } float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // if (blockIdx.y == 0 && blockIdx.x == 0) { + // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, sub_req_idx); + // } + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { // todo add alobi here // bool const mask = ti_circ >= totalCacheSize; @@ -188,15 +192,15 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << query_token)))); - if (blockIdx.y == 0 && blockIdx.x == 0 && mask && sub_req_idx == 0) { - // printf("specinfer mask: ti:%d, %d, %d, %d, %lld\n", - // ti, - // totalCacheSize, - // ti - bitmask.non_tree_cache_size, - // query_token, - // bitmask.mask[ti - bitmask.non_tree_cache_size]); - // assert(false); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && sub_req_idx == 0) { + // printf("specinfer mask: ti:%d, %d, %d, %d, %lld\n", + // ti, + // totalCacheSize, + // bitmask.non_tree_cache_size, + // query_token, + // bitmask.mask[ti - bitmask.non_tree_cache_size]); + // // assert(false); + // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; } @@ -231,6 +235,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn first token qk_max %.10f\n", qk_max); + // } + float exp_sum = 0.f; for (int ti = first_step + tidx; ti < totalCacheSize; ti += THREADS_PER_BLOCK) { @@ -245,6 +253,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // Compute the sum. 
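The masked scores in qk_smem[] are normalized with the usual two-pass, numerically stable softmax: a max-reduction, then exponentiation shifted by that max, then a sum-reduction and a divide, which is exactly the sequence the surrounding kernel performs with warp shuffles and shared memory. A minimal single-threaded C++ sketch of the same computation (here masked entries are skipped outright rather than zeroed; names are illustrative):

#include <algorithm>
#include <cmath>
#include <vector>

// Numerically stable softmax over a vector of attention scores, mirroring the
// qk_max / exp_sum passes in the kernel above.
std::vector<float> stable_softmax(std::vector<float> const &scores,
                                  std::vector<bool> const &masked) {
  float qk_max = -INFINITY;
  for (size_t i = 0; i < scores.size(); i++) {
    if (!masked[i]) {
      qk_max = std::max(qk_max, scores[i]); // first pass: max-reduction
    }
  }
  float exp_sum = 0.f;
  std::vector<float> probs(scores.size(), 0.f);
  for (size_t i = 0; i < scores.size(); i++) {
    if (!masked[i]) {
      probs[i] = std::exp(scores[i] - qk_max); // shift by max for stability
      exp_sum += probs[i];
    }
  }
  float const inv_sum = 1.f / (exp_sum + 1.e-6f); // same epsilon as the kernel
  for (float &p : probs) {
    p *= inv_sum;
  }
  return probs;
}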
exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn exp_sum %.10f\n", exp_sum); + // } + // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); for (int ti = first_step + tidx; ti < totalCacheSize; @@ -301,6 +313,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // // Make sure we can start writing to shared memory. __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("valueX %.10f\n", out.x); + // } + // Run the final reduction amongst the different groups computing different // partial outputs. if (Dh == Dh_MAX || vi < Dh) { @@ -357,8 +373,8 @@ __global__ void specinfer_store_kv_cache( int max_tree_branches, bool is_root, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / (hidden_size); int offset = i % hidden_size; size_t val_idx = @@ -416,6 +432,16 @@ __global__ void specinfer_store_kv_cache( // cache_idx); // } + // if (i % hidden_size == 0) { + // printf("update cache: %d, %d, %d, %d, %d, %d\n", + // cache_idx, + // num_tokens, + // bitmask.non_tree_cache_size, + // bitmask.tree_size, + // bitmask.this_layer_size, + // token_idx); + // } + kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + @@ -433,9 +459,9 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - printf("tokenInfo %d, %d\n", - bc->beamTokenInfo[0].sub_request_index, - num_tokens); + // printf("tokenInfo %d, %d\n", + // bc->beamTokenInfo[0].sub_request_index, + // num_tokens); specinfer_store_kv_cache<<num_active_requests() * sizeof(BatchConfig::BitMask), cudaMemcpyHostToDevice, stream); - std::cout << "kernel bit mask: " << bc->causalMask[0].prompt_size << ", " - << bc->causalMask[0].non_tree_cache_size << ", " - << bc->causalMask[0].mask[0] << ", " << sizeof(BatchConfig::BitMask) - << "\n"; + // std::cout << "kernel bit mask: " << bc->causalMask[0].prompt_size << ", " + // << bc->causalMask[0].non_tree_cache_size << ", " + // << bc->causalMask[0].mask[0] << ", " << + // sizeof(BatchConfig::BitMask) + // << "\n"; compute_qkv_kernel(m, bc, shard_id, @@ -800,8 +827,8 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, stream); // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); - std::cout << "specinfer kernel token num: " << bc->num_generation_tokens - << ", " << bc->num_tokens << "\n"; + // std::cout << "specinfer kernel token num: " << bc->num_generation_tokens + // << ", " << bc->num_tokens << "\n"; if (bc->num_generation_tokens > 0) { compute_specinfer_attention_kernel_generation
<DT>( m, bc, static_cast<DT *>
(m->attn_heads), stream); @@ -809,6 +836,7 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 if (bc->num_tokens > bc->num_generation_tokens) { + // printf("spec inc prompt decoding\n"); compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } @@ -892,7 +920,7 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } - // print_tensor(output.get_float_ptr(), 32, "specinc output"); + // print_tensor(output.get_float_ptr(), 32, "specinc output"); // if(bc->num_tokens == 1){ // print_tensor(input.get_float_ptr(), 32, "specinc input"); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 3d5ccf9431..180a165451 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -86,13 +86,13 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::BitMask bitmask = causalMask[request_idx]; // bitmask.mask[1] = 3; - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("tree attn fused kernel %d, %d, %d, %lld\n", - tlength, - qlength, - bitmask.non_tree_cache_size, - bitmask.mask[1]); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("tree attn fused kernel %d, %d, %d, %lld\n", + // tlength, + // qlength, + // bitmask.non_tree_cache_size, + // bitmask.mask[3]); + // } int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { @@ -161,7 +161,7 @@ __global__ void compute_attention_kernel_fused_kernel( bool const mask = (ti >= bitmask.non_tree_cache_size && (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { // printf("tree attn mask for first token %d, %lld, %d, %d\n", // ti, @@ -169,16 +169,22 @@ __global__ void compute_attention_kernel_fused_kernel( // bitmask.non_tree_cache_size, // qi); // } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 3 && mask) { + // printf("tree attn mask for third token %d, %lld, %d, %d\n", + // ti, + // bitmask.mask[ti - bitmask.non_tree_cache_size], + // bitmask.non_tree_cache_size, + // qi); + // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { - // printf("tree attn mask for second token %d, %lld, %d, %d, %.10f\n", + // printf("tree attn qkqkqkqk %d %.10f, %.10f, %.10f\n", // ti, - // bitmask.mask[ti - bitmask.non_tree_cache_size], - // bitmask.non_tree_cache_size, - // qi, - // qk); + // qk, + // q_vecs[ki_o][0].x, + // k[0].x); // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } @@ -212,12 +218,10 @@ __global__ void compute_attention_kernel_fused_kernel( // Broadcast to all the threads in the warp. 
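Visibility inside the speculation tree is encoded in BatchConfig::BitMask: the first non_tree_cache_size cached tokens (the already-committed prefix) are visible to every query, and a cached tree token t is visible to query q only when bit q of mask[t] is set, which is the predicate the kernels above test. A small host-side sketch of that rule together with the lower-triangular initialization used for a fresh prompt; it assumes trees of at most 64 tokens, which is what the 64-bit mask implies, and kMaxTreeTokens / TreeBitMask are illustrative stand-ins for the real constants and struct:

#include <cstdint>
#include <cstdio>

constexpr int kMaxTreeTokens = 64; // one uint64_t bit per tree token

struct TreeBitMask {
  uint64_t mask[kMaxTreeTokens] = {0}; // mask[t]: which queries may see token t
  int non_tree_cache_size = 0;         // committed prefix, visible to everyone
  int tree_size = 0;                   // tokens currently in the tree
};

// Lower-triangular (causal) initialization for a fresh prompt of `len` tokens:
// tree token i is visible to queries i, i+1, ..., len-1.
void init_prompt_mask(TreeBitMask &bm, int len) {
  bm.non_tree_cache_size = 0;
  bm.tree_size = len;
  for (int i = 0; i < len; i++) {
    for (int j = i; j < len; j++) {
      bm.mask[i] |= (uint64_t(1) << j);
    }
  }
}

// The predicate used by the attention kernels: is cached position ti hidden
// from query token qi?
bool is_masked(TreeBitMask const &bm, int ti, int qi) {
  if (ti < bm.non_tree_cache_size) {
    return false; // committed prefix is always attendable
  }
  return (bm.mask[ti - bm.non_tree_cache_size] & (uint64_t(1) << qi)) == 0;
}

int main() {
  TreeBitMask bm;
  init_prompt_mask(bm, 4);
  // Query 1 may see tree tokens 0 and 1, but not 2 or 3.
  for (int ti = 0; ti < 4; ti++) {
    std::printf("q1 sees t%d: %s\n", ti, is_masked(bm, ti, 1) ? "no" : "yes");
  }
  return 0;
}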
qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && tidx == 0) { - // printf("tree attn first token qk_max %f\n", - // qk_max); - // } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", qk_max); + // } float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { @@ -244,7 +248,7 @@ __global__ void compute_attention_kernel_fused_kernel( __syncthreads(); // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - // printf("softmax %.10f\n", qk_smem[0]); + // printf("softmax %.10f\n", qk_smem[1]); // } // value projection @@ -280,12 +284,13 @@ __global__ void compute_attention_kernel_fused_kernel( V_vec v = *reinterpret_cast( v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - float logit = mask ? 0.0f : qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - + if (ti < tlength) { + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } } } @@ -328,11 +333,16 @@ __global__ void compute_attention_kernel_fused_kernel( output_ptr + (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi), out); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", - // out.x, out.y, out.z, out.w, vi, (first_token_idx + qi) * hidden_size + - // head_idx * per_head_size + vi); - // } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, + // out.y, + // out.z, + // out.w, + // vi, + // (first_token_idx + qi) * hidden_size + head_idx * per_head_size + + // vi); + // } } } } @@ -349,11 +359,12 @@ __global__ void commit_tokens_kernel( int num_tokens_to_commit, int num_active_tokens_in_last_batch, int max_seq_len, - int hidden_size) { + int hidden_size, + int max_tree_branches) { - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - int token_pos = i / (hidden_size * KV_WEIGHT_NUM); + int token_pos = i / (hidden_size); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); @@ -367,10 +378,23 @@ __global__ void commit_tokens_kernel( int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + // if(i == 0){ + // printf("commit token: %d %d %f\n", token_idx_in_last_batch, tok_id, + // kVal); + // } + // if(i == hidden_size){ + // printf("commit token 1: %d %d %f\n", token_idx_in_last_batch, tok_id, + // kVal); + // } + // if(i == 2 * hidden_size){ + // printf("commit token 2: %d %d %f\n", token_idx_in_last_batch, tok_id, + // kVal); + // } + + kCache_ptr[req_id * max_tree_branches * (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = kVal; + vCache_ptr[req_id * 
max_tree_branches * (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = vVal; } } @@ -395,7 +419,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length(), - m->hidden_size); + m->hidden_size, + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); } } @@ -413,9 +438,9 @@ __global__ void update_tree_branch_kv_cache( int total_tokens_in_batch, int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int token_idx = i / (hidden_size); int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch @@ -460,6 +485,11 @@ __global__ void update_tree_branch_kv_cache_fused( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + // if(i % hidden_size == 0){ + // printf("update token id: %d, %d\n", token_idx, token_idx + + // first_token_depth); + // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + (token_idx + first_token_depth) * hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + @@ -879,7 +909,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << "\n"; + // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << + // "\n"; cudaMemcpyAsync(m->committed_token_infos, &(bc->committed_tokens), @@ -925,6 +956,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); + // print_tensor((float *)m->devQKVProjArray, 32, "qkvtenor"); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e7b08f653d..d5c2b7392d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -609,6 +609,8 @@ BeamSearchBatchConfig committed_tokens[guid].emplace_back(abs_depth, result_index); } else if (abs_depth >= root_abs_depth) { tree_outputs.emplace_back(token_id, abs_depth + 1); + std::cout << "committred tokens push: " << abs_depth + << " ,result index: " << result_index << "\n"; committed_tokens[guid].emplace_back(abs_depth, result_index); if (verbose) { @@ -789,9 +791,9 @@ BeamSearchBatchConfig // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; - std::cout << "num_gen ++ " - << "\n"; - num_generation_tokens++; + // std::cout << "num_gen ++ " + // << "\n"; + // num_generation_tokens++; // Add verified token to request's token list request.tokens.push_back(token.first); @@ -923,9 +925,7 @@ BeamSearchBatchConfig new_bc.num_tokens++; } new_bc.topology_mask[i].allocated_tokens = 0; - new_bc.causalMask[i].non_tree_cache_size = 0; - new_bc.causalMask[i].tree_size = - new_bc.requestsInfo[i].num_tokens_in_batch; + initBitMask(new_bc.causalMask[i], new_bc.requestsInfo[i].num_tokens_in_batch); @@ -1185,14 +1185,14 @@ BeamSearchBatchConfig // sub_request_num -> nodes of input next iteration // beam_size replicate num - std::cout << "print beam tree: " - << old_bc.beamRequestsInfo[i].current_depth << "\n"; + // std::cout << "print beam tree: " + // << old_bc.beamRequestsInfo[i].current_depth << "\n"; BeamTree tree = request.beam_trees[old_bc.model_id]; - for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { - std::cout << "layer: " << k << "\n"; - std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer - << "\n"; - } + // for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { + // std::cout << "layer: " << k << "\n"; + // std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer + // << "\n"; + // } appendBitMask(new_bc.causalMask[i], new_bc.beamRequestsInfo[i].sub_request_num, old_bc.beamRequestsInfo[i].beam_size, @@ -1217,9 +1217,10 @@ BeamSearchBatchConfig new_bc.topology_mask[i].real_token_pos[k][depth] = new_bc.topology_mask[i].allocated_tokens + num_generation_tokens; - std::cout << "topology: sub request: " << k << ", " - << ", " << depth << ", " - << new_bc.topology_mask[i].real_token_pos[k][depth] << "\n"; + // std::cout << "topology: sub request: " << k << ", " + // << ", " << depth << ", " + // << new_bc.topology_mask[i].real_token_pos[k][depth] << + // "\n"; num_generation_tokens++; } } @@ -1354,13 +1355,13 @@ BeamSearchBatchConfig } if (true) { - std::cout << "print all resultsBBB" - << "\n"; - for (int i = 0; i < 40; i++) { - std::cout << result.token_ids[i] << ", "; - } - std::cout << "Current Beam DepthBBB: " - << old_bc.beamRequestsInfo[0].current_depth << "\n"; + // std::cout << "print all resultsBBB" + // << "\n"; + // for (int i = 0; i < 40; i++) { + // std::cout << result.token_ids[i] << ", "; + // } + // std::cout << "Current Beam DepthBBB: " + // << old_bc.beamRequestsInfo[0].current_depth << "\n"; } return new_bc; } @@ -1449,11 +1450,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); if 
(true) { - std::cout << "Request Tokens Size: " << request.tokens.size() - << std::endl; - for (int k = 0; k < request.tokens.size(); k++) { - std::cout << k << ": " << request.tokens[k] << std::endl; - } + // std::cout << "Request Tokens Size: " << request.tokens.size() + // << std::endl; + // for (int k = 0; k < request.tokens.size(); k++) { + // std::cout << k << ": " << request.tokens[k] << std::endl; + // } } // Normal Request Info @@ -1475,27 +1476,42 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; + std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " + << new_bc.causalMask[i].tree_size << ", " + << new_bc.causalMask[i].non_tree_cache_size << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) + << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[1]) + << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[2]) + << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[3]) + << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[4]) + << "\n"; + // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { - for (int j = 0; j < dfs_tree_inputs.size(); j++) { - if (j < committed_tokens.at(guid).size()) { - auto committed_token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (true) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } - new_bc.num_tokens_to_commit++; - request.llm_cache_size++; + for (int j = 0; j < committed_tokens.at(guid).size(); j++) { + // if (j < committed_tokens.at(guid).size()) { + + auto committed_token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + if (true) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second + << std::endl; } + new_bc.num_tokens_to_commit++; + request.llm_cache_size++; + // } } } if (true) { @@ -1759,11 +1775,11 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .parent_ids[beam_id] = result.parent_id[result_index]; - std::cout << "??????? beam id: " << beam_id << ", token: " - << request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .tokens[beam_id] - << "\n"; + // std::cout << "??????? beam id: " << beam_id << ", token: " + // << request.beam_trees.at(old_bc.model_id) + // .treeLayers[depth] + // .tokens[beam_id] + // << "\n"; // if (true) { // std::cout << "tree value: " << depth << "token: " @@ -1844,19 +1860,20 @@ void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, "do not support tree size > 64"); // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 + bitmask.non_tree_cache_size = 0; + bitmask.tree_size = initLength; bitmask.prompt_size = initLength; bitmask.this_layer_size = initLength; - bitmask.tree_size = initLength; for (int i = 0; i < bitmask.prompt_size; i++) { for (int j = i; j < bitmask.prompt_size; j++) { bitmask.mask[i] |= (1 << j); } } - std::cout << "see bit mask" << bitmask.prompt_size << "\n"; - std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; - std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; - std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; + // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; } // prepare next init @@ -1868,11 +1885,16 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, // 0000000..1000 assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && "do not support tree size > 64"); - bitmask.non_tree_cache_size = non_tree_size; - bitmask.tree_size = initLength; + assert(initLength >= 1 && "verified token num should >= 1"); + + std::cout << "non tree size: " << non_tree_size << ", " + << bitmask.non_tree_cache_size << "\n"; + + bitmask.non_tree_cache_size = non_tree_size + initLength - 1; + bitmask.tree_size = 1; bitmask.this_layer_size = initLength; std::cout << "non_tree_size: " << non_tree_size << "\n"; - bitmask.prompt_size = initLength; + bitmask.prompt_size = 1; for (int i = 0; i < bitmask.prompt_size; i++) { for (int j = i; j < bitmask.prompt_size; j++) { bitmask.mask[i] |= (1 << j); @@ -1906,14 +1928,14 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, for (int i = 0; i < bitmask.prompt_size; i++) { for (int j = pre_tree_size; j < bitmask.tree_size; j++) { bitmask.mask[i] |= (1 << j); - std::cout << "see bit mask append: " << i << ", to" << j - << std::bitset<64>(bitmask.mask[i]) << "\n"; + // std::cout << "see bit mask append: " << i << ", to" << j + // << std::bitset<64>(bitmask.mask[i]) << "\n"; } } - std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " - << pre_tree_size << ", " << bitmask.prompt_size << ", " - << preBeamSize << "\n"; + // std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " + // << pre_tree_size << ", " << bitmask.prompt_size << ", " + // << preBeamSize << "\n"; // int num_groups = newNodes / preBeamSize; // int group_size = newNodes / num_groups; @@ -1924,12 +1946,12 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, // skip the root prompt/tokens int token_idx = bitmask.prompt_size; int new_nodes_start_idx = pre_tree_size; - std::cout << "new nodes start " << new_nodes_start_idx << "\n"; + // std::cout << "new nodes start " << new_nodes_start_idx << "\n"; for (int i = 1; i < currentDepth; i++) { new_nodes_start_idx = pre_tree_size; int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer; - std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer - << "group size: " << newNodes / nodes_this_layer << "\n"; + // std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer + // << "group size: " << newNodes / nodes_this_layer << "\n"; for (int j = 0; j < nodes_this_layer; j++) { int group_size = newNodes / nodes_this_layer; for (int k = 0; k < group_size; k++) { @@ -1940,12 +1962,12 
@@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, } } - std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " - << new_nodes_start_idx << ", " << newNodes - << "current depth: " << currentDepth << "\n"; - std::cout << "new nodes end " << new_nodes_start_idx << "\n"; + // std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " + // << new_nodes_start_idx << ", " << newNodes + // << "current depth: " << currentDepth << "\n"; + // std::cout << "new nodes end " << new_nodes_start_idx << "\n"; - std::cout << "tree size: " << bitmask.tree_size << "\n"; + // std::cout << "tree size: " << bitmask.tree_size << "\n"; assert(token_idx == pre_tree_size); assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); @@ -1953,8 +1975,23 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, // set last layer, all tokens are only relevant to it self; for (int i = token_idx; i < bitmask.tree_size; i++) { bitmask.mask[i] |= (1 << i); - std::cout << "set rel: " << i << "to: " << i << "\n"; + // std::cout << "set rel: " << i << "to: " << i << "\n"; } + + // if(bitmask.non_tree_cache_size == 19 && bitmask.tree_size > 2){ + // assert(false); + // } + + std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; + std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; + std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) + << "\n"; + std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[1]) + << "\n"; + std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[2]) + << "\n"; + std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[3]) + << "\n"; } bool PreOrder( @@ -2146,16 +2183,16 @@ std::vector> // << std::endl; BeamTree tree = request.beam_trees.at(old_bc.model_id); - std::cout << "print beam tree: " - << "\n"; + // std::cout << "print beam tree: " + // << "\n"; std::vector> serializedTree; for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) { - std::cout << "tree layer: " << i - << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer - << "\n"; + // std::cout << "tree layer: " << i + // << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer + // << "\n"; // push tokens into tree for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) { - std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; + // std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i)); } } @@ -2256,7 +2293,8 @@ std::vector> } dfs_tree_inputs[guid] = merged_tree; - // std::cout << "assign dfr tree: " << guid << ", " << merged_tree.size() << ", " + // std::cout << "assign dfr tree: " << guid << ", " << merged_tree.size() << + // ", " // << dfs_tree_inputs[guid].size() << "\n"; return merged_tree; From 3ed25d681127d742770776b8d07d9771e0e19f79 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 29 Dec 2023 16:10:16 -0500 Subject: [PATCH 277/344] multi batch --- src/ops/beam_topk.cc | 3 +- src/ops/beam_topk.cu | 3 +- .../specinfer_inc_multihead_self_attention.cu | 66 +++++++------------ .../tree attn kernel, 0----> -0.029753357172 | 1 + src/ops/tree_inc_multihead_self_attention.cu | 45 +++++++++---- src/runtime/request_manager.cc | 37 ++++++++--- 6 files changed, 89 insertions(+), 66 deletions(-) create mode 100644 src/ops/tree attn kernel, 0----> -0.029753357172 diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 3f636c2c98..20d019eec3 100644 
--- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -402,8 +402,7 @@ BeamInferenceResult download_tensor( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); - print_tensor(index_ptr, 32, "indexxxxxxx"); - printf("max beam width %d\n", m->max_beam_width); + // print_tensor(index_ptr, 32, "indexxxxxxx"); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 515bba4bc0..d647fe9ed7 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -626,7 +626,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, stream)); // trick, set acc_probs to 0; checkCUDA( - cudaMemsetAsync(m->acc_probs, 1.0, batch_size * sizeof(DT), stream)); + cudaMemsetAsync(m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); checkCUDA(cudaMemcpyAsync(m->block_start_index, beam_block_start_index.data(), sizeof(int) * beam_num_blocks, @@ -644,6 +644,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, stream)); // int depth = // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; + beam_num_blocks = bc->num_active_tokens(); beam_topk_forward_kernel<<>>( input_ptr, shared_memory_size, diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index f2ea63d904..3fdd1ab554 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -100,6 +100,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( first_token_idx += bitmask.this_layer_size; } + // if (tidx == 0 && head_idx == 0) { + // printf("spec req: %d, %d\n", request_idx, first_token_idx); + // } + // shared memory objects extern __shared__ char smem_[]; @@ -135,17 +139,16 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int ti_end = div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; - for (int sub_req_idx = 0; sub_req_idx < tree_branch_num; sub_req_idx += 1) { + for (int qi = 0; qi < tree_branch_num; qi += 1) { #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + (hidden_size * QKV_WEIGHT_NUM * sub_req_idx) + ki + + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); } - int const query_token = bitmask.tree_size - tree_branch_num + sub_req_idx; - - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 0) { + int const query_token = bitmask.tree_size - tree_branch_num + qi; + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 0) { // printf("fuckmasksss %d, %d, %d, %d, %d\n", // bitmask.prompt_size, // bitmask.non_tree_cache_size, @@ -345,11 +348,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // Output the final values. 
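With several requests packed into one batch, each request's queries occupy a contiguous slice of the flattened token dimension, so the kernels first compute where that slice starts (first_token_idx) by summing the token counts of the preceding requests, and then address the packed output tensor relative to that offset. A small sketch of the indexing; the helper names are illustrative, and tokens_per_request stands in for requestsInfo[r].num_tokens_in_batch (or bitmask.this_layer_size in the speculative kernel):

#include <cstddef>
#include <vector>

// Flat position of the first query token belonging to request `request_idx`.
int first_token_index(std::vector<int> const &tokens_per_request,
                      int request_idx) {
  int first_token_idx = 0;
  for (int r = 0; r < request_idx; r++) {
    first_token_idx += tokens_per_request[r];
  }
  return first_token_idx;
}

// Flat element offset of query qi of that request inside the packed
// [num_tokens, hidden_size] output tensor.
size_t output_offset(std::vector<int> const &tokens_per_request,
                     int request_idx,
                     int qi,
                     int hidden_size) {
  return size_t(first_token_index(tokens_per_request, request_idx) + qi) *
         hidden_size;
}

For a batch holding requests with 3, 5 and 2 tokens, for example, request 2's queries start at flat position 8, and its output rows are written starting there.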
if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float( - *reinterpret_cast(output_ptr + - (request_idx + sub_req_idx) * hidden_size + - head_idx * per_head_size + vi), - out); + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); } } } @@ -391,6 +393,9 @@ __global__ void specinfer_store_kv_cache( int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; int const total_token = requestInfo[req_id].num_tokens_in_batch; + int const request_token_offset = + requestInfo[req_id].first_token_offset_in_batch; + BatchConfig::BitMask bitmask = causalMask[req_id]; int const sub_request_num = beamRequestInfos[req_id].sub_request_num; @@ -404,42 +409,18 @@ __global__ void specinfer_store_kv_cache( // if prompt token -> token id // if tree token: int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - - bitmask.this_layer_size + token_idx; + bitmask.this_layer_size + token_idx - + request_token_offset; int real_idx = tok_id - first_token_in_req + allocated_tokens + sub_req_id; - // if (i == 0) { - // printf("ffasdasds%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", - // beamTokenInfos[0].sub_request_index, - // allocated_tokens, - // sub_req_id, - // tok_id, - // first_token_in_req, + // if (i % hidden_size == 0) { + // printf("ffasdasds request %d, real idx %d, cache idx %d token id %d, kval %.10f\n", + // req_id, // real_idx, // cache_idx, - // bitmask.non_tree_cache_size, - // bitmask.tree_size, - // sub_request_num, - // token_idx ); - // } else if (i == hidden_size * 2) { - // printf("hshddhdhdsdaww%d, %d, %d, %d, %d, %d, %d\n", - // beamTokenInfos[0].sub_request_index, - // allocated_tokens, - // sub_req_id, // tok_id, - // first_token_in_req, - // real_idx, - // cache_idx); - // } - - // if (i % hidden_size == 0) { - // printf("update cache: %d, %d, %d, %d, %d, %d\n", - // cache_idx, - // num_tokens, - // bitmask.non_tree_cache_size, - // bitmask.tree_size, - // bitmask.this_layer_size, - // token_idx); + // kVal); // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + @@ -846,6 +827,8 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); + // std::cout << "specinfer num tokens: " << num_tokens; + compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } @@ -920,7 +903,8 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } - // print_tensor(output.get_float_ptr(), 32, "specinc output"); + // save_tensor(output.get_float_ptr(), 768 * 3, "/home/xinhaoc/FlexFlow/inference/output/fk1.txt"); + // save_tensor(output.get_float_ptr() + 768 * 3, 768 * 3, "/home/xinhaoc/FlexFlow/inference/output/fk2.txt"); // if(bc->num_tokens == 1){ // print_tensor(input.get_float_ptr(), 32, "specinc input"); diff --git a/src/ops/tree attn kernel, 0----> -0.029753357172 b/src/ops/tree attn kernel, 0----> -0.029753357172 new file mode 100644 index 0000000000..e4f14ee757 --- /dev/null +++ b/src/ops/tree attn kernel, 0----> -0.029753357172 @@ -0,0 +1 @@ +tree attn kernel, 0----> -0.02975335717201232910 0.01930358447134494781 0.03780741989612579346 0.11878532171249389648 -0.03523746877908706665 0.02421043440699577332 0.03719477355480194092 -0.00304851122200489044 0.02062662504613399506 
0.06683708727359771729 -0.00642335414886474609 -0.00504039414227008820 0.02955199964344501495 0.00648811273276805878 0.00558663159608840942 0.02003456838428974152 -0.04041406139731407166 0.00736814411357045174 -0.04575226455926895142 0.03949077427387237549 0.05742383748292922974 0.04866250604391098022 0.04687267541885375977 -0.00701304525136947632 -0.03712264448404312134 -0.02175992354750633240 -0.03979443758726119995 0.03961737453937530518 -0.07450901716947555542 0.02090370282530784607 -0.03487894684076309204 0.01653470844030380249 \ No newline at end of file diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 180a165451..11169fa36d 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -99,6 +99,10 @@ __global__ void compute_attention_kernel_fused_kernel( first_token_idx += request_infos[request_idx].num_tokens_in_batch; } + // if(tidx == 0 && head_idx == 0){ + // printf("tree req: %d, %d\n", request_idx, first_token_idx); + // } + // shared memory objects extern __shared__ char smem_[]; @@ -140,6 +144,12 @@ __global__ void compute_attention_kernel_fused_kernel( q_vecs[ki_o][ii] = *reinterpret_cast( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + + // if (head_idx == 0 && qi == 1 && tidx == 0) { + // printf("laod q %d, %d %.10f\n", + // request_idx, + // qi,q_vecs[ki_o][ii].x); + // } } __syncthreads(); @@ -162,11 +172,12 @@ __global__ void compute_attention_kernel_fused_kernel( (ti >= bitmask.non_tree_cache_size && (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { - // printf("tree attn mask for first token %d, %lld, %d, %d\n", + // if (head_idx == 0 && qi == 9 && mask) { + // printf("tree attn mask for first token %d, %lld, %d, %d, %d\n", // ti, // bitmask.mask[ti - bitmask.non_tree_cache_size], // bitmask.non_tree_cache_size, + // request_idx, // qi); // } // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 3 && mask) { @@ -179,11 +190,15 @@ __global__ void compute_attention_kernel_fused_kernel( qk_max = mask ? qk_max : fmaxf(qk_max, qk); - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { - // printf("tree attn qkqkqkqk %d %.10f, %.10f, %.10f\n", + // if (head_idx == 0 && qi == 1 && !mask && tidx == 0) { + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n", + // request_idx, // ti, // qk, // q_vecs[ki_o][0].x, + // q_vecs[ki_o][1].x, + // q_vecs[ki_o][2].x, + // q_vecs[ki_o][3].x, // k[0].x); // } qk_smem[ti - first_step] = mask ? 0.0f : qk; @@ -219,7 +234,7 @@ __global__ void compute_attention_kernel_fused_kernel( // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && tidx == 0) { + // if (head_idx == 0 && qi == 9 && tidx == 0) { // printf("tree attn first token qk_max %f\n", qk_max); // } @@ -236,7 +251,7 @@ __global__ void compute_attention_kernel_fused_kernel( // Compute the sum. 
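Throughout these kernels the key/value cache is addressed as one block per request, sized max_tree_branches * max_seq_len rows of hidden_size values, so committing verified tokens, writing a tree branch, and attending all use the same flat-index formula. A compact sketch of that addressing, with the sizes treated as assumptions (the real values come from BatchConfig and BeamSearchBatchConfig, e.g. MAX_SPECULATIVE_TREE_BRANCHES):

#include <cstddef>

struct KvCacheLayout {
  int max_tree_branches;
  int max_seq_len;
  int hidden_size;

  // Flat element index of (request req_id, cache row pos, channel offset),
  // matching
  //   (req_id * max_tree_branches) * (hidden_size * max_seq_len)
  //       + pos * hidden_size + offset
  // in the kernels above.
  size_t index(int req_id, int pos, int offset) const {
    return size_t(req_id) * max_tree_branches * max_seq_len * hidden_size +
           size_t(pos) * hidden_size + offset;
  }
};

In the fused tree update below, pos is token_idx + first_token_depth - request_token_offset, i.e. the token's position within its own request rather than within the packed batch.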
exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // if (head_idx == 0 && tidx == 0 && qi == 9) { // printf("expsum %.10f\n", exp_sum); // } @@ -247,7 +262,7 @@ __global__ void compute_attention_kernel_fused_kernel( } __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // if (head_idx == 0 && tidx == 0 && qi == 9) { // printf("softmax %.10f\n", qk_smem[1]); // } @@ -465,6 +480,7 @@ __global__ void update_tree_branch_kv_cache_fused( DT *kCache_ptr, DT *vCache_ptr, TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo *request_infos, int qProjSize, int kProjSize, int vProjSize, @@ -486,14 +502,15 @@ __global__ void update_tree_branch_kv_cache_fused( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; + // if(i % hidden_size == 0){ - // printf("update token id: %d, %d\n", token_idx, token_idx + - // first_token_depth); + // printf("update token request id: %d, %d, %d value%.10f\n", req_id, token_idx, request_token_offset, kVal); // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (token_idx + first_token_depth) * hidden_size + offset] = kVal; + (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (token_idx + first_token_depth) * hidden_size + offset] = vVal; + (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = vVal; } } @@ -851,6 +868,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, static_cast
<DT *>(m->keyCache), static_cast<DT *>
(m->valueCache), m->token_infos, + m->request_infos, m->qProjSize, m->kProjSize, m->vProjSize, @@ -956,7 +974,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); - // print_tensor((float *)m->devQKVProjArray, 32, "qkvtenor"); + // print_tensor((float *)m->devQKVProjArray + 768 * 8 * 3 + 768, 32, "qkvtenor1"); + // print_tensor((float *)m->devQKVProjArray + 768 * 18 * 3 + 768, 32, "qkvtenor2"); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( @@ -1000,6 +1019,8 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } + std::cout << "tree input tokens: " <num_active_tokens() << "\n"; + // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); if (use_bias) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d5c2b7392d..ab062a4610 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -634,6 +634,7 @@ BeamSearchBatchConfig if (request.status == Request::RUNNING) { std::cout << "verify running: " << dfs_tree_inputs.at(guid).size() << ", " << tree_outputs.size() << "\n"; + std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); @@ -812,6 +813,7 @@ BeamSearchBatchConfig } log_req_mgr.print("Output: %s", output.c_str()); } + } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; @@ -1185,8 +1187,8 @@ BeamSearchBatchConfig // sub_request_num -> nodes of input next iteration // beam_size replicate num - // std::cout << "print beam tree: " - // << old_bc.beamRequestsInfo[i].current_depth << "\n"; + std::cout << "print beam tree: " + << old_bc.beamRequestsInfo[i].current_depth << "\n"; BeamTree tree = request.beam_trees[old_bc.model_id]; // for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { // std::cout << "layer: " << k << "\n"; @@ -1224,6 +1226,12 @@ BeamSearchBatchConfig num_generation_tokens++; } } + // if(new_bc.beamRequestsInfo[i].current_depth >= 3 && i > 0){ + // assert(false); + // } + + + } } @@ -1709,6 +1717,8 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != guid) { + std::cout << "i is: " << i << "old guid" << guid << " new guid" << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid <<"\n"; + int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; @@ -1722,16 +1732,21 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // Count tokens sent to model in this request to find the final token's // index + + std::cout << "previous result index: "<< result_index; + result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_size; - - if (true) { - std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] - << ", leaf node num: " << leaf_node_num << ", depth" << depth - << ", beam size: " << beam_size << "\n"; - } + + std::cout << "after result index: "<< result_index; + + // if (true) { + // std::cout << "i = " << i << ", result index = " << result_index + // << ", value: " << result.token_ids[result_index] + // << ", leaf node num: " << leaf_node_num << ", depth" << depth + // << ", beam size: " << beam_size << "\n"; + // } Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; @@ -1792,7 +1807,9 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } // update the guid and 
start_depth for current request if (i < old_bc.num_tokens) { - guid = old_bc.requestsInfo[index].request_guid; + int new_req_idx = old_bc.tokensInfo[i].request_index; + guid = old_bc.requestsInfo[new_req_idx].request_guid; + std::cout << "update guid: " << guid << ", request idx: " << index<< "\n"; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } From 5c3ad3592f7b71dc705466fa24cb7c7c1e179deb Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 29 Dec 2023 17:37:28 -0500 Subject: [PATCH 278/344] copy metadata once --- include/flexflow/batch_config.h | 6 -- include/flexflow/config.h | 4 +- .../specinfer_inc_multihead_self_attention.h | 1 - src/ops/inc_multihead_self_attention.cu | 13 --- .../specinfer_inc_multihead_self_attention.cu | 94 ++++--------------- src/ops/tree_inc_multihead_self_attention.cu | 65 ++++++------- src/runtime/request_manager.cc | 46 +-------- src/runtime/request_manager.cu | 74 ++++++++------- 8 files changed, 89 insertions(+), 214 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index db5d4a8e48..c3a75e59a4 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -168,14 +168,8 @@ class BeamSearchBatchConfig : public BatchConfig { int sub_request_index; }; - struct SpecInferTopology { - int real_token_pos[MAX_SPECULATIVE_TREE_BRANCHES][MAX_NUM_TOKENS]; - int allocated_tokens; - }; - BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; - SpecInferTopology topology_mask[MAX_NUM_REQUESTS]; // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index fe261dfb48..1526b9291f 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -81,10 +81,10 @@ struct FFHandler { // request info + token info + topolopgy mask info size_t batch_config_metadata_size = sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask); + sizeof(BatchConfig::causalMask) + + sizeof(TreeVerifyBatchConfig::committed_tokens); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/ops/specinfer_inc_multihead_self_attention.h b/include/flexflow/ops/specinfer_inc_multihead_self_attention.h index eb1b2882c3..b6fed1ae25 100644 --- a/include/flexflow/ops/specinfer_inc_multihead_self_attention.h +++ b/include/flexflow/ops/specinfer_inc_multihead_self_attention.h @@ -142,7 +142,6 @@ class SpecInferIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionM Realm::RegionInstance beam_search_reserve_inst; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; - BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask; BatchConfig::BitMask *causalMask; }; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a05dbbf919..a084f216e9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -825,19 +825,6 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, bias_ptr = static_cast
(m->bias_ptr); } - // todo Xinhao copy how many requests if requests are not continous? - // cudaMemcpyAsync(m->token_infos, - // &(bc->tokensInfo), - // bc->num_active_tokens() * - // sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, - // stream); - // cudaMemcpyAsync(m->request_infos, - // &(bc->requestsInfo), - // bc->max_requests_per_batch() * - // sizeof(BatchConfig::PerRequestInfo), - // cudaMemcpyHostToDevice, - // stream); - // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 3fdd1ab554..4d4afd28e4 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -50,7 +50,6 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int hidden_size, BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BeamSearchBatchConfig::SpecInferTopology *topology_mask, BatchConfig::BitMask *causalMask, int max_tree_branches) { @@ -74,8 +73,6 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // request idx int const request_idx = blockIdx.y; - BeamSearchBatchConfig::SpecInferTopology topology = - topology_mask[request_idx]; BatchConfig::BitMask bitmask = causalMask[request_idx]; int const first_step = 0; @@ -148,23 +145,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( } int const query_token = bitmask.tree_size - tree_branch_num + qi; - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 0) { - // printf("fuckmasksss %d, %d, %d, %d, %d\n", - // bitmask.prompt_size, - // bitmask.non_tree_cache_size, - // tree_branch_num, - // bitmask.tree_size, - // tlength); - // printf("cacheposssssB %d, %d\n", tree_branch_num, - // topology.real_token_pos[0][1]); - // printf("cacheposssssC %d, %d\n", tree_branch_num, - // topology.real_token_pos[0][2]); - // printf("cacheposssssD %d, %d\n", tree_branch_num, - // topology.real_token_pos[0][11]); printf("cacheposssssD %d, %d\n", - // tree_branch_num, topology.real_token_pos[0][12]); - // printf("cacheposssssD %d, %d\n", tree_branch_num, - // topology.real_token_pos[0][13]); - } + __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; @@ -173,10 +154,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; if (ti < totalCacheSize) { - // find the real position of the cache; - // depth: 0, 1, 2, 3, 4, 4, 5, 5 ,5, 5, - // int const real_cache_idx = - // topology.real_token_pos[sub_req_idx][ti]; + k[ii] = *reinterpret_cast( k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + jj); @@ -291,17 +269,12 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( DT const *v_cache_batch = value_cache + request_idx * max_seq_length * hidden_size * max_tree_branches + vi; - // DT const *v_cache_batch = - // value_cache + - // (beam_request_idx * max_beam_width + beam_sub_request_idx) * - // max_seq_length * hidden_size + - // vi; + if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { // Load the values from the cache. 
int const ti_circ = ti % max_seq_length; - // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; V_vec v = *reinterpret_cast( v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); @@ -365,7 +338,6 @@ __global__ void specinfer_store_kv_cache( BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, - BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask, BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, @@ -390,7 +362,6 @@ __global__ void specinfer_store_kv_cache( int const first_token_in_req = requestInfo[req_id].first_token_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; int const total_token = requestInfo[req_id].num_tokens_in_batch; int const request_token_offset = @@ -412,17 +383,6 @@ __global__ void specinfer_store_kv_cache( bitmask.this_layer_size + token_idx - request_token_offset; - int real_idx = tok_id - first_token_in_req + allocated_tokens + sub_req_id; - - // if (i % hidden_size == 0) { - // printf("ffasdasds request %d, real idx %d, cache idx %d token id %d, kval %.10f\n", - // req_id, - // real_idx, - // cache_idx, - // tok_id, - // kVal); - // } - kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + @@ -454,7 +414,6 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->request_infos, m->beam_token_infos, m->beam_request_infos, - m->beam_topology_mask, m->causalMask, m->qProjSize, m->kProjSize, @@ -490,7 +449,6 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->hidden_size, \ m->request_infos, \ m->beam_request_infos, \ - m->beam_topology_mask, \ m->causalMask, \ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES) @@ -788,16 +746,6 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { // phase 1: Implement kernel to compute KQV for input tokens - cudaMemcpyAsync(m->causalMask, - &(bc->causalMask), - bc->num_active_requests() * sizeof(BatchConfig::BitMask), - cudaMemcpyHostToDevice, - stream); - // std::cout << "kernel bit mask: " << bc->causalMask[0].prompt_size << ", " - // << bc->causalMask[0].non_tree_cache_size << ", " - // << bc->causalMask[0].mask[0] << ", " << - // sizeof(BatchConfig::BitMask) - // << "\n"; compute_qkv_kernel(m, bc, shard_id, @@ -953,38 +901,30 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; - size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); + // size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; + // size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); + // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + // total_size); - beam_topology_mask = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); beam_token_infos = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask)); + 
sizeof(BatchConfig::requestsInfo)); beam_request_infos = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask) + + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo)); - // causalMask = - // static_cast( - // handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - // sizeof(BatchConfig::requestsInfo) + - // sizeof(BeamSearchBatchConfig::topology_mask) + - // sizeof(BeamSearchBatchConfig::beamTokenInfo)) + - // sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - causalMask = gpu_mem_allocator.allocate_instance( - causal_mask_size); + causalMask = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + + // causalMask = gpu_mem_allocator.allocate_instance( + // causal_mask_size); // beam_token_infos = // gpu_mem_allocator // .allocate_instance( diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 11169fa36d..ebbfac23ea 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -191,8 +191,8 @@ __global__ void compute_attention_kernel_fused_kernel( qk_max = mask ? qk_max : fmaxf(qk_max, qk); // if (head_idx == 0 && qi == 1 && !mask && tidx == 0) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n", - // request_idx, + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, + // %.10f\n", request_idx, // ti, // qk, // q_vecs[ki_o][0].x, @@ -355,7 +355,8 @@ __global__ void compute_attention_kernel_fused_kernel( // out.z, // out.w, // vi, - // (first_token_idx + qi) * hidden_size + head_idx * per_head_size + + // (first_token_idx + qi) * hidden_size + head_idx * + // per_head_size + // vi); // } } @@ -502,15 +503,21 @@ __global__ void update_tree_branch_kv_cache_fused( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; + int const request_token_offset = + request_infos[req_id].first_token_offset_in_batch; // if(i % hidden_size == 0){ - // printf("update token request id: %d, %d, %d value%.10f\n", req_id, token_idx, request_token_offset, kVal); + // printf("update token request id: %d, %d, %d value%.10f\n", req_id, + // token_idx, request_token_offset, kVal); // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = kVal; + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = vVal; + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = vVal; } } @@ -974,8 +981,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); - // print_tensor((float *)m->devQKVProjArray + 768 * 8 * 3 + 768, 32, "qkvtenor1"); - // print_tensor((float *)m->devQKVProjArray + 768 * 18 * 3 + 768, 32, "qkvtenor2"); + // print_tensor((float *)m->devQKVProjArray + 768 * 8 * 3 + 768, 32, + // "qkvtenor1"); print_tensor((float *)m->devQKVProjArray + 768 * 18 * + // 3 + 768, 32, "qkvtenor2"); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( @@ -1019,7 +1027,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - std::cout << "tree input tokens: " <num_active_tokens() << "\n"; + std::cout << "tree input tokens: " << bc->num_active_tokens() << "\n"; // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); @@ -1128,34 +1136,15 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; - - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo) + - causal_mask_size * sizeof(BatchConfig::BitMask); - if (offload) { - // assert that we have enough reserved work space left - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); - causalMask = gpu_mem_allocator.allocate_instance( - causal_mask_size); - } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); - causalMask = gpu_mem_allocator.allocate_instance( - causal_mask_size); - } + + causalMask = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + committed_token_infos = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::causalMask)); } cudaStreamSynchronize(stream); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index ab062a4610..670db1ab0e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -766,12 +766,6 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].sub_request_num = 1; new_bc.sub_requests[i] = 1; - new_bc.topology_mask[i].allocated_tokens = request.tokens.size(); - - // assign new kv cache position - for (int j = 0; j < request.tokens.size(); j++) { - new_bc.topology_mask[i].real_token_pos[0][j] = j; - } updateBitMask(new_bc.causalMask[i], verified_tokens.size(), @@ -786,8 +780,6 @@ BeamSearchBatchConfig new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; - new_bc.topology_mask[i].real_token_pos[0][token.second] = - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request; // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; @@ -846,7 +838,6 @@ BeamSearchBatchConfig } new_bc.beamRequestsInfo[i].sub_request_num = 1; - new_bc.topology_mask[i].allocated_tokens = 0; new_bc.sub_requests[i] = 1; @@ -919,14 +910,12 @@ BeamSearchBatchConfig assert(depth < new_request.tokens.size()); 
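As the hunks above show, the speculative-decoding metadata (beam token/request info, the causal bit mask, and the committed tokens) now lives in the single pre-allocated handler.batch_config_metadata buffer, and every consumer recomputes its offset as a running sum of sizeof() of the sections copied before it. The following is a minimal sketch of that packing pattern, using only the standard CUDA runtime; the helper name and argument list are illustrative and not taken from this patch:

    #include <cassert>
    #include <cstddef>
    #include <cuda_runtime.h>

    // Pack several host-side metadata sections back to back into one device
    // buffer. Consumers must add up the same sizes, in the same order, to
    // locate their section.
    void pack_metadata_sections(void *dev_base,
                                size_t dev_capacity,
                                void const *const host_sections[],
                                size_t const section_sizes[],
                                int num_sections,
                                cudaStream_t stream) {
      size_t total_copy_size = 0;
      for (int i = 0; i < num_sections; i++) {
        cudaMemcpyAsync(static_cast<char *>(dev_base) + total_copy_size,
                        host_sections[i],
                        section_sizes[i],
                        cudaMemcpyHostToDevice,
                        stream);
        total_copy_size += section_sizes[i];
      }
      // The reserved buffer must be large enough for everything queued above.
      assert(total_copy_size <= dev_capacity);
    }

Because the attention metas derive their device pointers from the same running sums, adding or dropping a section (as topology_mask is dropped in these patches) shifts every later offset, so the copy order and the pointer arithmetic have to change together.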
new_bc.tokensInfo[new_bc.num_tokens].token_id = new_request.tokens[depth]; - new_bc.topology_mask[i].real_token_pos[0][depth] = depth; // beam search meta data, indicate which sub request this token // belongs to, init to 0; new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; } - new_bc.topology_mask[i].allocated_tokens = 0; initBitMask(new_bc.causalMask[i], new_bc.requestsInfo[i].num_tokens_in_batch); @@ -1120,9 +1109,6 @@ BeamSearchBatchConfig update_beam_metadata( new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); - new_bc.topology_mask[i].allocated_tokens = - old_bc.topology_mask[i].allocated_tokens + - old_bc.beamRequestsInfo[i].sub_request_num; } else { assert(false && "Request should not be pending in beam search phase"); } @@ -1156,31 +1142,9 @@ BeamSearchBatchConfig << std::endl; } - // for (int j = 0; j < request.tokens.size(); j++) { - // new_bc.topology_mask[i].real_token_pos[0][j] = j; - // } - // register more tokens due to the beam width - std::cout << "register more tokens: " - << new_bc.beamRequestsInfo[i].sub_request_num << ", " - << new_bc.requestsInfo[i].num_tokens_in_batch << ", " - << new_bc.topology_mask[i].allocated_tokens << "\n"; - - // copy meta data and replicate - int replicate_num = new_bc.beamRequestsInfo[i].sub_request_num / - old_bc.beamRequestsInfo[i].sub_request_num; - - for (int j = 0; j < old_bc.beamRequestsInfo[i].sub_request_num; j++) { - int old_idx = j; - for (int k = 0; k < replicate_num; k++) { - int new_idx = j * replicate_num + k; - std::cout << "copy from " << old_idx << "to: " << new_idx << "\n"; - memcpy(new_bc.topology_mask[i].real_token_pos[new_idx], - old_bc.topology_mask[i].real_token_pos[old_idx], - sizeof(int) * BatchConfig::MAX_NUM_TOKENS); - } - } + //copy metadata memcpy(&new_bc.causalMask[i], &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); @@ -1215,14 +1179,6 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; - // width first - new_bc.topology_mask[i].real_token_pos[k][depth] = - new_bc.topology_mask[i].allocated_tokens + num_generation_tokens; - - // std::cout << "topology: sub request: " << k << ", " - // << ", " << depth << ", " - // << new_bc.topology_mask[i].real_token_pos[k][depth] << - // "\n"; num_generation_tokens++; } } diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 4d7e2c8806..e8824feda5 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -59,64 +59,74 @@ void RequestManager::load_tokens_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); + size_t total_copy_size = 0; cudaMemcpyAsync(handle.batch_config_metadata, &(batch_config->tokensInfo), - batch_config->num_active_tokens() * - sizeof(BatchConfig::PerTokenInfo), + sizeof(BatchConfig::tokensInfo), cudaMemcpyHostToDevice, stream); + total_copy_size += sizeof(BatchConfig::tokensInfo); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo), + total_copy_size, &(batch_config->requestsInfo), - batch_config->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), + sizeof(BatchConfig::requestsInfo), cudaMemcpyHostToDevice, stream); + total_copy_size += sizeof(BatchConfig::requestsInfo); - // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - 
sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo), - &(beam_batch_config->topology_mask), - sizeof(BeamSearchBatchConfig::topology_mask), - cudaMemcpyHostToDevice, - stream); - - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask), + total_copy_size, &(beam_batch_config->beamTokenInfo), sizeof(BeamSearchBatchConfig::beamTokenInfo), cudaMemcpyHostToDevice, stream); + + total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask) + - sizeof(BeamSearchBatchConfig::beamTokenInfo), + total_copy_size, &(beam_batch_config->beamRequestsInfo), sizeof(BeamSearchBatchConfig::beamRequestsInfo), cudaMemcpyHostToDevice, stream); + total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // sizeof(BatchConfig::tokensInfo) + - // sizeof(BatchConfig::requestsInfo) + - // sizeof(BeamSearchBatchConfig::topology_mask) + - // sizeof(BeamSearchBatchConfig::beamTokenInfo) + - // sizeof(BeamSearchBatchConfig::beamRequestsInfo), - // &(beam_batch_config->causalMask), - // sizeof(BatchConfig::causalMask), - // cudaMemcpyHostToDevice, - // stream); - // std::cout << "copy calsual mask info: " << beam_batch_config->causalMask[0].prompt_size << "\n"; + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream); + + total_copy_size += sizeof(BatchConfig::causalMask); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + TreeVerifyBatchConfig const *tree_batch_config = + static_cast(batch_config); + + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream); + total_copy_size += sizeof(BatchConfig::causalMask); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream); + total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); } + + // add a size check + assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( From fae148da9a4b495d26642c1929ebe9f25cdf3b1d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 05:11:38 -0500 Subject: [PATCH 279/344] fix some corner cases --- include/flexflow/model.h | 1 + .../inc_multihead_self_attention_utils.cuh | 4 +- include/flexflow/request_manager.h | 7 + inference/spec_infer/spec_infer.cc | 6 +- src/ops/argmax.cc | 2 +- src/ops/beam_topk.cc | 1 + src/ops/inc_multihead_self_attention.cu | 8 +- src/ops/kernels/embedding_kernels.cu | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 18 +-- .../specinfer_inc_multihead_self_attention.cu | 75 +++++----- src/ops/tree_inc_multihead_self_attention.cu | 94 ++++++------ src/runtime/cuda_helper.cu | 2 +- src/runtime/inference_manager.cc | 61 +++++++- src/runtime/model.cc | 17 +++ src/runtime/request_manager.cc | 141 ++++++++++++++---- src/runtime/request_manager.cu | 87 +++++++++++ 16 files changed, 389 insertions(+), 137 deletions(-) diff --git 
a/include/flexflow/model.h b/include/flexflow/model.h index 3602cb108b..9cdbec64a9 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -242,6 +242,7 @@ enum TaskIDs { // InferenceManager & RequestManager RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, + RM_LOAD_BATCH_CONFIG_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index 0c065b6b0e..1b21a80dc9 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -456,7 +456,7 @@ inline size_t smem_size_in_bytes(int hidden_size_per_head, int threads_per_block) { // The amount of shared memory needed to store the Q*K^T values in float. - size_t qk_sz = div_up(1000 + 1, 4) * 16; + size_t qk_sz = div_up(2000 + 1, 4) * 16; size_t logits_sz = qk_sz; // The total size needed during softmax. @@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, } // todo fix this - int max_qk_length = max_query_length * max_total_length; + int max_qk_length = max_query_length * max_total_length + 1000; // The amount of shared memory needed to store the Q*K^T values in float. size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index dc1939c74b..8cb45e55b4 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -43,6 +43,8 @@ class InferenceManager { void load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset); + void load_inference_metadata_batch_config(BatchConfigFuture const &bc, + FFHandler *handlers); public: FFConfig ff_config; @@ -195,6 +197,11 @@ class RequestManager { Legion::Context ctx, Legion::Runtime *runtime); + static void + load_batch_config_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static BatchConfig prepare_next_batch_task( Legion::Task const *task, std::vector const ®ions, diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9af3e12e5a..258b2d78eb 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -266,9 +266,9 @@ void FlexFlow::top_level_task(Task const *task, ModelMeta model_metadata; bool use_full_precision = false; bool verbose = false; - int max_requests_per_batch = 16; - int max_tokens_per_batch = 256; - int max_sequence_length = 1024; + int max_requests_per_batch = 10; + int max_tokens_per_batch = 199; + int max_sequence_length = 200; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 0344c707fc..d195a5af75 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -399,7 +399,7 @@ InferenceResult m, shard_id, bc, {}, {}, {input, indices}); } - print_tensor(indices.get_int32_ptr(), 32, "tree attn output"); + // print_tensor(indices.get_int32_ptr(), 199, "tree attn output"); download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 20d019eec3..5dfaae41ee 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -404,6 +404,7 @@ BeamInferenceResult // print_tensor(index_ptr, 32, "indexxxxxxx"); + if 
(m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a084f216e9..2f16dd71c2 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1365,12 +1365,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); break; } default: diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 0cde42de56..3085fdb6ba 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,7 +118,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } - // print_tensor(input.get_int32_ptr(), 32, "embeddinginput"); + print_tensor(input.get_int32_ptr(), 200, "embeddinginput"); } /*static*/ diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 562dee4d93..29e3d9a48d 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -111,15 +111,15 @@ __global__ void spec_store_kv_cache( // naive cache stealing if (sub_req_id != parent_id) { - if (offset == 0 && tok_id == 0) { - printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " - "%d, tok_id %d\n", - beam_depth, - req_id, - sub_req_id, - parent_id, - tok_id); - } + // if (offset == 0 && tok_id == 0) { + // printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " + // "%d, tok_id %d\n", + // beam_depth, + // req_id, + // sub_req_id, + // parent_id, + // tok_id); + // } for (int depth = 0; depth < beam_depth; depth++) { int steal_token_idx = tok_id - beam_depth + depth; diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 4d4afd28e4..e84ec3095c 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -50,8 +50,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int hidden_size, BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask, - int max_tree_branches) { + BatchConfig::BitMask *causalMask) { // q, k using Q_vec = typename VEC_K::Type; @@ -83,8 +82,14 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { // printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); // } + int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("specinfer attn fused kernel %d, %d\n", + // totalCacheSize,request_infos[request_idx].num_tokens_in_batch); + // } // int const qlength = 
request_infos[request_idx].num_tokens_in_batch; int const tree_branch_num = beam_request_infos[request_idx].sub_request_num; @@ -94,7 +99,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { // first_token_idx += request_infos[request_idx].num_tokens_in_batch; - first_token_idx += bitmask.this_layer_size; + first_token_idx += causalMask[r].this_layer_size; } // if (tidx == 0 && head_idx == 0) { @@ -130,8 +135,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + - request_idx * max_seq_length * hidden_size * max_tree_branches + ki; + key_cache + request_idx * max_seq_length * hidden_size + ki; int ti_end = div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -267,9 +271,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + - request_idx * max_seq_length * hidden_size * max_tree_branches + vi; - + value_cache + request_idx * max_seq_length * hidden_size + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { @@ -344,7 +346,6 @@ __global__ void specinfer_store_kv_cache( int vProjSize, int num_tokens, int max_seq_len, - int max_tree_branches, bool is_root, int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { @@ -383,10 +384,10 @@ __global__ void specinfer_store_kv_cache( bitmask.this_layer_size + token_idx - request_token_offset; - kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (cache_idx)*hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (cache_idx)*hidden_size + offset] = vVal; + kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = vVal; } } @@ -419,8 +420,8 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens, - BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, /*root*/ curr_depth == 0, m->hidden_size); } @@ -429,7 +430,8 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, #define LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_sz = smem_size_in_bytes
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ THREADS_PER_VALUE, \ THDS_PER_BLOCK); \ compute_specinfer_attention_kernel_generation_kernel(m->valueCache), \ output_ptr, \ scale, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ m->qProjSize, \ m->hidden_size, \ m->request_infos, \ m->beam_request_infos, \ - m->causalMask, \ - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES) + m->causalMask) template void compute_specinfer_attention_kernel_generation( @@ -527,11 +529,13 @@ void compute_attention_kernel_prompt( int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int kt_req_block_size = kt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_req_block_size = vt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -580,8 +584,7 @@ void compute_attention_kernel_prompt( // print_tensor((float*)A, 32, "A"); std::cout << "meta: " << num_new_tokens << ", " << total_tokens << "\n"; - DT const *B = static_cast
<DT *>(m->keyCache) + - (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * kt_req_block_size; + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; // if (i == 0 && sub_req_id == 0 && // bc->beam_slots.at(0).current_depth == 1) { @@ -692,8 +695,7 @@ void compute_attention_kernel_prompt( strideC = m->vProjSize; // To get A, skip over V^T entries from previous requests (all heads + // padding) - A = static_cast
<DT *>(m->valueCache) + - (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * vt_req_block_size; + A = static_cast
(m->valueCache) + i * vt_req_block_size; // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous // requests (all heads) B = C_softmax; @@ -851,8 +853,10 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } - // save_tensor(output.get_float_ptr(), 768 * 3, "/home/xinhaoc/FlexFlow/inference/output/fk1.txt"); - // save_tensor(output.get_float_ptr() + 768 * 3, 768 * 3, "/home/xinhaoc/FlexFlow/inference/output/fk2.txt"); + // save_tensor(output.get_float_ptr(), 768 * 3, + // "/home/xinhaoc/FlexFlow/inference/output/fk1.txt"); + // save_tensor(output.get_float_ptr() + 768 * 3, 768 * 3, + // "/home/xinhaoc/FlexFlow/inference/output/fk2.txt"); // if(bc->num_tokens == 1){ // print_tensor(input.get_float_ptr(), 32, "specinc input"); @@ -906,7 +910,6 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, // total_size); - beam_token_infos = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + @@ -915,13 +918,13 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( beam_request_infos = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) - + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + causalMask = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); // causalMask = gpu_mem_allocator.allocate_instance( // causal_mask_size); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index ebbfac23ea..8641e63e38 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -53,7 +53,6 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::PerRequestInfo *request_infos, int num_heads, int num_requests, - int max_tree_branches, BatchConfig::BitMask *causalMask, int qk_smem_sz) { @@ -86,8 +85,9 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::BitMask bitmask = causalMask[request_idx]; // bitmask.mask[1] = 3; - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("tree attn fused kernel %d, %d, %d, %lld\n", + // if (head_idx == 0 && tidx == 0) { + // printf("tree attn fused kernel req id %d %d, %d, %d, %lld\n", + // request_idx, // tlength, // qlength, // bitmask.non_tree_cache_size, @@ -96,12 +96,12 @@ __global__ void compute_attention_kernel_fused_kernel( int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[request_idx].num_tokens_in_batch; + first_token_idx += request_infos[r].num_tokens_in_batch; } - // if(tidx == 0 && head_idx == 0){ - // printf("tree req: %d, %d\n", request_idx, first_token_idx); - // } + if(tidx == 0 && head_idx == 0){ + printf("tree req: %d, %d\n", request_idx, first_token_idx); + } // shared memory objects extern __shared__ char smem_[]; @@ -132,8 +132,7 @@ __global__ void 
compute_attention_kernel_fused_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + - request_idx * max_tree_branches * max_seq_length * hidden_size + ki; + key_cache + request_idx * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -190,17 +189,14 @@ __global__ void compute_attention_kernel_fused_kernel( qk_max = mask ? qk_max : fmaxf(qk_max, qk); - // if (head_idx == 0 && qi == 1 && !mask && tidx == 0) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, - // %.10f\n", request_idx, - // ti, - // qk, - // q_vecs[ki_o][0].x, - // q_vecs[ki_o][1].x, - // q_vecs[ki_o][2].x, - // q_vecs[ki_o][3].x, - // k[0].x); - // } + if (head_idx == 0 && qi == 0 && !mask) { + printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n ", + request_idx, + ti, + qk, + q_vecs[ki_o][0].x, + k[0].x); + } qk_smem[ti - first_step] = mask ? 0.0f : qk; } } @@ -283,8 +279,7 @@ __global__ void compute_attention_kernel_fused_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + - request_idx * max_seq_length * hidden_size * max_tree_branches + vi; + value_cache + request_idx * max_seq_length * hidden_size + vi; // DT const *v_cache_batch = // value_cache + // (beam_request_idx * max_beam_width + beam_sub_request_idx) * @@ -375,8 +370,7 @@ __global__ void commit_tokens_kernel( int num_tokens_to_commit, int num_active_tokens_in_last_batch, int max_seq_len, - int hidden_size, - int max_tree_branches) { + int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { @@ -407,10 +401,10 @@ __global__ void commit_tokens_kernel( // kVal); // } - kCache_ptr[req_id * max_tree_branches * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[req_id * max_tree_branches * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; } } @@ -434,9 +428,9 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length(), - m->hidden_size, - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, + m->hidden_size); } } @@ -488,7 +482,6 @@ __global__ void update_tree_branch_kv_cache_fused( int num_new_tokens, int max_seq_len, int hidden_size, - int max_tree_branches, int first_token_depth) { CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { @@ -510,11 +503,11 @@ __global__ void update_tree_branch_kv_cache_fused( // printf("update token request id: %d, %d, %d value%.10f\n", req_id, // token_idx, request_token_offset, kVal); // } - kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + kCache_ptr[req_id * (hidden_size * max_seq_len) + (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + vCache_ptr[req_id * (hidden_size * max_seq_len) + (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = vVal; @@ -569,10 +562,12 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = 
m->kProjSize; int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; int vt_block_size = m->vProjSize; int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -836,7 +831,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ THDS_PER_VALUE, \ THDS_PER_BLOCK, \ bc, \ @@ -848,7 +844,20 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, THDS_PER_KEY, \ THDS_PER_VALUE> \ <<>>( \ - static_cast
<DT *>(m->devQKVProjArray), static_cast
<DT *>(m->keyCache), static_cast
<DT *>(m->valueCache), output_ptr, scale, BatchConfig::max_sequence_length(), BatchConfig::max_tokens_per_batch(), m->qProjSize, m->hidden_size, m->request_infos, m->num_q_heads, bc->num_active_requests(), BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, m->causalMask, \ + static_cast
<DT *>(m->devQKVProjArray), \ + static_cast
<DT *>(m->keyCache), \ + static_cast
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + BatchConfig::max_tokens_per_batch(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->num_q_heads, \ + bc->num_active_requests(), \ + m->causalMask, \ smem_sz[0]) template @@ -880,9 +889,8 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_new_tokens, - BatchConfig::max_sequence_length(), + BatchConfig::max_sequence_length() + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, m->hidden_size, - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, bc->requestsInfo[0].first_token_depth_in_request); dim3 grid(m->num_q_heads, bc->num_active_requests()); @@ -981,9 +989,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); - // print_tensor((float *)m->devQKVProjArray + 768 * 8 * 3 + 768, 32, - // "qkvtenor1"); print_tensor((float *)m->devQKVProjArray + 768 * 18 * - // 3 + 768, 32, "qkvtenor2"); + + // print_tensor((float *)m->devQKVProjArray, 32, "qkvtenor1"); + // print_tensor((float *)m->devQKVProjArray + 768 * (25 * 7) * 3, 32, "qkvtenor2"); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index fa6bf55fe5..398ed7f3cd 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -226,7 +226,7 @@ __host__ void print_tensor(T const *ptr, printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { printf(" %.20lf", (float)host_ptr[idx]); - if (idx >= 100) { + if (idx >= 200) { break; } } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 52fd64c606..e7f7c5f52d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -320,6 +320,7 @@ FutureMap InferenceManager::inference(FFModel *model, assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; load_input_tokens_from_batch_config(bc, pt, model->handlers); + load_inference_metadata_batch_config(bc, model->handlers); } } @@ -349,18 +350,32 @@ FutureMap InferenceManager::inference(FFModel *model, }; void InferenceManager::load_input_tokens_from_batch_config( - BatchConfigFuture const &bc, ParallelTensor const input, FFHandler *handlers) { + BatchConfigFuture const &bc, + ParallelTensor const input, + FFHandler *handlers) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); - MachineView view = input->machine_view; - for (PointInRectIterator<1> it(task_rect); it(); it++) { - FFHandler handle = handlers[view.get_device_id(*it)]; - argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); + Domain domain = runtime->get_index_space_domain(ctx, input->parallel_is); + + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + MachineView view = input->machine_view; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + argmap.set_point(*it, \ + TaskArgument(&handlers[view.get_device_id(*it)], \ + sizeof(FFHandler))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); } IndexLauncher launcher(RM_LOAD_TOKENS_TASK_ID, @@ -378,6 +393,36 @@ void InferenceManager::load_input_tokens_from_batch_config( runtime->execute_index_space(ctx, launcher); } +void InferenceManager::load_inference_metadata_batch_config( + BatchConfigFuture const &bc, + FFHandler *handlers) { + Context ctx = ff_config.lg_ctx; + Runtime *runtime = ff_config.lg_hlr; + ArgumentMap argmap; + + Rect<1> task_rect(Point<1>(0), + Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); + IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + + // int rank = 0; + int idx = 0; + for (PointInRectIterator<1> it(task_rect); it(); it++) { + FFHandler handler = handlers[idx++]; + argmap.set_point(*it, TaskArgument(&handler, sizeof(FFHandler))); + } + + IndexLauncher launcher(RM_LOAD_BATCH_CONFIG_TASK_ID, + task_is, + TaskArgument(nullptr, 0), 
+ argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + FFConfig::DataParallelism_GPU); + launcher.add_future(bc); + runtime->execute_index_space(ctx, launcher); +} + void InferenceManager::load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 8bda9016c3..cf72f2d40b 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4344,6 +4344,23 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + // RequestManager load metadata + { + TaskVariantRegistrar registrar(RM_LOAD_BATCH_CONFIG_TASK_ID, + "RequestManager Load meta data"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RequestManager Load metadata Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // RequestManager prepare_next_batch { TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 670db1ab0e..5c3262eb27 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -609,8 +609,8 @@ BeamSearchBatchConfig committed_tokens[guid].emplace_back(abs_depth, result_index); } else if (abs_depth >= root_abs_depth) { tree_outputs.emplace_back(token_id, abs_depth + 1); - std::cout << "committred tokens push: " << abs_depth - << " ,result index: " << result_index << "\n"; + // std::cout << "committred tokens push: " << abs_depth + // << " ,result index: " << result_index << "\n"; committed_tokens[guid].emplace_back(abs_depth, result_index); if (verbose) { @@ -621,12 +621,12 @@ BeamSearchBatchConfig tree_outputs.back().second, token_id); } - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - abs_depth, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - token_id); + // std::cout << "Index within old batch: " << result_index << std::endl; + // printf(" Input: [%d] %d ---> [%d] %d \n", + // abs_depth, + // old_bc.tokensInfo[result_index].token_id, + // tree_outputs.back().second, + // token_id); } result_index++; } @@ -634,13 +634,12 @@ BeamSearchBatchConfig if (request.status == Request::RUNNING) { std::cout << "verify running: " << dfs_tree_inputs.at(guid).size() << ", " << tree_outputs.size() << "\n"; - + std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); - // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -805,7 +804,12 @@ BeamSearchBatchConfig } log_req_mgr.print("Output: %s", output.c_str()); } - + + if (request.tokens.size() > 19 && i >= 7) { + std::cout << request.tokens.size() << "\n"; + assert(false); + } + } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; @@ -1099,7 +1103,8 @@ BeamSearchBatchConfig // } assert(new_bc.beamRequestsInfo[i].sub_request_num <= - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && + "exceed maximum nodes per layer"); if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = @@ -1144,7 +1149,7 @@ 
BeamSearchBatchConfig // register more tokens due to the beam width - //copy metadata + // copy metadata memcpy(&new_bc.causalMask[i], &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); @@ -1185,9 +1190,6 @@ BeamSearchBatchConfig // if(new_bc.beamRequestsInfo[i].current_depth >= 3 && i > 0){ // assert(false); // } - - - } } @@ -1238,7 +1240,8 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].sub_request_num; assert(new_bc.beamRequestsInfo[i].sub_request_num <= - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && + "exceed maximum nodes per layer"); // update the parentid, accumalated_probs, depth, and token_ids @@ -1504,6 +1507,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() << "\n"; + bool cutLayer = false; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); @@ -1520,11 +1524,27 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == get_max_tokens_per_batch() - 1) { + if (new_bc.num_tokens == get_max_tokens_per_batch() && + (j != dfs_tree_inputs.size() - 1)) { + cutLayer = true; break; } } + // delete the last incomplete layer + if (cutLayer) { + int total_tokens = new_bc.num_tokens; + for (int j = total_tokens - 1; j >= 1; j--) { + new_bc.num_tokens--; + new_bc.requestsInfo[i].num_tokens_in_batch--; + std::cout << "cut: " << j << "\n"; + if (new_bc.tokensInfo[j].abs_depth_in_request != + new_bc.tokensInfo[j - 1].abs_depth_in_request) { + break; + } + } + } + } else if (request.status == Request::PENDING) { std::cout << "prepare next batch verify: pending\n" << "\n"; @@ -1646,6 +1666,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } + std::cout << "how many tokens in verify? 
" << new_bc.num_tokens << "\n"; + std::cout << "check dfs tree input size: " << dfs_tree_inputs[1000000].size() << "\n"; @@ -1673,7 +1695,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != guid) { - std::cout << "i is: " << i << "old guid" << guid << " new guid" << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid <<"\n"; + std::cout << "i is: " << i << "old guid" << guid << " new guid" + << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] + .request_guid + << "\n"; int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; @@ -1689,18 +1714,19 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // Count tokens sent to model in this request to find the final token's // index - std::cout << "previous result index: "<< result_index; + std::cout << "previous result index: " << result_index; result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_size; - - std::cout << "after result index: "<< result_index; + + std::cout << "after result index: " << result_index; // if (true) { // std::cout << "i = " << i << ", result index = " << result_index // << ", value: " << result.token_ids[result_index] - // << ", leaf node num: " << leaf_node_num << ", depth" << depth + // << ", leaf node num: " << leaf_node_num << ", depth" << + // depth // << ", beam size: " << beam_size << "\n"; // } @@ -1765,7 +1791,8 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, if (i < old_bc.num_tokens) { int new_req_idx = old_bc.tokensInfo[i].request_index; guid = old_bc.requestsInfo[new_req_idx].request_guid; - std::cout << "update guid: " << guid << ", request idx: " << index<< "\n"; + std::cout << "update guid: " << guid << ", request idx: " << index + << "\n"; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } @@ -2082,12 +2109,42 @@ std::vector> // In this case the inputSeriedTree ends with padding 0s assert(inputSerializedTree.size() >= outputSerializedTree.size()); + int *treeLayers = new int[inputSerializedTree.size()]; + int node_num = 1; + int layer_num = 0; + for (int token_id = 0; token_id < inputSerializedTree.size(); token_id++) { + if (token_id == (inputSerializedTree.size() - 1) || + inputSerializedTree.at(token_id + 1).second != + inputSerializedTree.at(token_id).second) { + treeLayers[layer_num] = node_num; + layer_num += 1; + node_num = 1; + } else { + node_num++; + } + } + + // to avoid branch switch when same tokens in input tree. + + bool findFirst = false; + layer_num = -1; + int first_layer_slot = 0; + int first_layer_slot_total = 0; + int processed_whole_layer_tokens = 0; + for (int i = 0; i < outputSerializedTree.size(); i++) { auto input = inputSerializedTree.at(i); auto output = outputSerializedTree.at(i); + if (i == 0 || inputSerializedTree.at(i - 1).second != + inputSerializedTree.at(i).second) { + layer_num += 1; + processed_whole_layer_tokens += i == 0 ? 
0 : treeLayers[layer_num - 1]; + } + if (i == 0) { verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); // > if (input.first == verifiedTree.back().first && input.second == verifiedTree.back().second) { - verifiedTree.push_back(output); - new_committed_tokens.push_back(std::make_pair( - input.second, - committed_tokens.at(guid).at(i).second)); // + if (findFirst) { + // must in this branch. + int layer_slot = i - processed_whole_layer_tokens; + int layer_slot_total = treeLayers[layer_num]; + if ((first_layer_slot == layer_slot)) { + verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( + input.second, committed_tokens.at(guid).at(i).second)); + // at this point, you'll not go other branches + std::cout << "verify tree push back: " << output.first + << ", tree size is: " << verifiedTree.size() + << ", ??: " << input.first << ", " << input.second << "\n"; + + } else { + printf("not correct slot\n"); + } + } else { + verifiedTree.push_back(output); + first_layer_slot = i - processed_whole_layer_tokens; + first_layer_slot_total = treeLayers[layer_num]; + findFirst = true; + new_committed_tokens.push_back(std::make_pair( + input.second, + committed_tokens.at(guid).at(i).second)); // + // at this point, you'll not go other branches + std::cout << "verify tree push back: " << output.first + << ", tree size is: " << verifiedTree.size() + << ", ??: " << input.first << ", " << input.second << "\n"; + } + assert(committed_tokens.at(guid).at(i).first == input.second); } } diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index e8824feda5..bb6b6030aa 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -57,6 +57,92 @@ void RequestManager::load_tokens_task( cudaMemcpyHostToDevice, stream)); + // // copy meta data to workSpace + // FFHandler handle = *((FFHandler const *)task->local_args); + // size_t total_copy_size = 0; + // cudaMemcpyAsync(handle.batch_config_metadata, + // &(batch_config->tokensInfo), + // sizeof(BatchConfig::tokensInfo), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(BatchConfig::tokensInfo); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(batch_config->requestsInfo), + // sizeof(BatchConfig::requestsInfo), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(BatchConfig::requestsInfo); + + // // load speculative metadata + // if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + // BeamSearchBatchConfig const *beam_batch_config = + // static_cast(batch_config); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(beam_batch_config->beamTokenInfo), + // sizeof(BeamSearchBatchConfig::beamTokenInfo), + // cudaMemcpyHostToDevice, + // stream); + + // total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(beam_batch_config->beamRequestsInfo), + // sizeof(BeamSearchBatchConfig::beamRequestsInfo), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(beam_batch_config->causalMask), + // sizeof(BatchConfig::causalMask), + // cudaMemcpyHostToDevice, + // stream); + + // total_copy_size += sizeof(BatchConfig::causalMask); + // } 
else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + // TreeVerifyBatchConfig const *tree_batch_config = + // static_cast(batch_config); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(tree_batch_config->causalMask), + // sizeof(BatchConfig::causalMask), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(BatchConfig::causalMask); + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(tree_batch_config->committed_tokens), + // sizeof(TreeVerifyBatchConfig::committed_tokens), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + // } + + // // add a size check + // std::cout << "handle.batch_config_metadata_size: " << handle.batch_config_metadata_size << ", "<< total_copy_size << "\n"; + // assert(total_copy_size <= handle.batch_config_metadata_size); +} + +void RequestManager::load_batch_config_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 0); + assert(task->regions.size() == 0); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); size_t total_copy_size = 0; @@ -126,6 +212,7 @@ void RequestManager::load_tokens_task( } // add a size check + std::cout << "hahaha handle.batch_config_metadata_size: " << handle.batch_config_metadata_size << ", "<< total_copy_size << "\n"; assert(total_copy_size <= handle.batch_config_metadata_size); } From 6c442593976ebc7efa6a50087a486ee613616a74 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 30 Dec 2023 13:06:37 -0500 Subject: [PATCH 280/344] Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op --- include/flexflow/config.h | 1 + src/ops/embedding.cc | 18 ++++++------------ src/runtime/model.cc | 31 ++++++++++++++++++++----------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index c2af6d707c..01f318c6d5 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -132,6 +132,7 @@ class FFConfig { size_t workSpaceSize; Legion::Context lg_ctx; Legion::Runtime *lg_hlr; + Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; bool inference_debugging; diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 007e799fe0..76236e65ff 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -155,11 +155,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } else { int num_dims = input->num_dims; @@ -170,11 +167,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = 
this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); @@ -189,13 +183,13 @@ int Embedding::weight_size(ParallelDim weight_dims[MAX_TENSOR_DIM]) { weight_dims[Weight::VOCAB_SIZE].size = this->num_entries; weight_dims[Weight::VOCAB_SIZE].degree = 1; weight_dims[Weight::VOCAB_SIZE].parallel_idx = -1; - for (int i = 2; i < input->num_dims; i++) { + for (int i = 2; i < input->num_dims + 1; i++) { weight_dims[i].size = input->dims[i - 1].degree; weight_dims[i].degree = weight_dims[i].size; weight_dims[i].parallel_idx = input->dims[i - 1].parallel_idx; weight_dims[i].is_replica_dim = true; } - return input->num_dims; + return input->num_dims + 1; } void Embedding::register_output_mappings() { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 92f0cff472..975045cd3b 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1499,10 +1499,8 @@ FFRuntime::FFRuntime(FFConfig &config) { Context ctx = config.lg_ctx; ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(config.workersPerNode * config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); - + Domain domain = runtime->get_index_space_domain(ctx, config.all_gpu_task_is); + Rect<1> task_rect = domain; // int rank = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { FFInitInfo info; @@ -1518,7 +1516,7 @@ FFRuntime::FFRuntime(FFConfig &config) { // Init CUDA library on each worker IndexLauncher initLauncher(FF_INIT_TASK_ID, - task_is, + config.all_gpu_task_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, @@ -2993,6 +2991,12 @@ Op *FFModel::create_operator_from_layer( dims[num_dims].degree = 1; dims[num_dims].parallel_idx = -1; dims[num_dims].is_replica_dim = true; + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1) { + dims[num_dims].size *= config.tensor_parallelism_degree; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } // create_parallel_tensor adds an NoOp into operators ParallelTensor pt = create_parallel_tensor_legion_ordering(num_dims + 1, @@ -3002,6 +3006,7 @@ Op *FFModel::create_operator_from_layer( 0, true /*gradients*/, tensor->tensor_guid); + assert(pt->get_shape().is_valid()); // assert that this tensor hasn't been mapped before assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; @@ -3260,12 +3265,12 @@ void FFModel::create_operators_from_layers() { if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && l->op_type == OP_EMBEDDING) { assert(op->numOutputs == 1); - Replicate *repl = new Replicate(*this, - op->outputs[0], - op->outputs[0]->num_dims - 1, - config.tensor_parallelism_degree); - operators.push_back(repl); - op = repl; + // Replicate *repl = new Replicate(*this, + // op->outputs[0], + // op->outputs[0]->num_dims - 1, + // config.tensor_parallelism_degree); + // operators.push_back(repl); + // op = repl; } else if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && (l->op_type == 
OP_INC_MULTIHEAD_SELF_ATTENTION || @@ -4076,6 +4081,10 @@ FFConfig::FFConfig() { Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; lg_ctx = Runtime::get_context(); + Rect<1> task_rect(Point<1>(0), Point<1>(workersPerNode * numNodes - 1)); + // Create an index space for tasks running on all GPUs + all_gpu_task_is = runtime->create_index_space(lg_ctx, task_rect); + // field_space = runtime->create_field_space(lg_ctx); } From ac112037a8e88193d3377684ae2821d253551c2d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 15:09:19 -0500 Subject: [PATCH 281/344] more fix. --- include/flexflow/batch_config.h | 3 + src/ops/inc_multihead_self_attention.cu | 13 ++-- src/ops/kernels/embedding_kernels.cu | 2 +- .../specinfer_inc_multihead_self_attention.cu | 58 ++++++++--------- src/ops/tree_inc_multihead_self_attention.cu | 42 ++++++------ src/runtime/request_manager.cc | 65 ++++++++++--------- 6 files changed, 98 insertions(+), 85 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index c3a75e59a4..8065e0f038 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -69,6 +69,9 @@ class BatchConfig { int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; + + //request id in batch config: + int batch_config_request_id; RequestGuid request_guid; }; struct PerTokenInfo { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 2f16dd71c2..3b3879e8e5 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -82,6 +82,9 @@ __global__ void compute_attention_kernel_generation_kernel( // request idx int const request_idx = blockIdx.y; + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + int const beam_request_idx = is_beam ? request_idx / max_beam_width : request_idx; int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; @@ -89,8 +92,8 @@ __global__ void compute_attention_kernel_generation_kernel( int const first_step = 0; int const tlength = - request_infos[beam_request_idx].first_token_depth_in_request + - request_infos[beam_request_idx].num_tokens_in_batch; + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; // shared memory objects extern __shared__ char smem_[]; @@ -103,7 +106,7 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + beam_request_idx * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = @@ -139,7 +142,7 @@ __global__ void compute_attention_kernel_generation_kernel( DT const *k_cache_batch = key_cache + - (beam_request_idx * max_beam_width + beam_sub_request_idx) * + (batch_config_request_id * max_beam_width + beam_sub_request_idx) * max_seq_length * hidden_size + ki; @@ -245,7 +248,7 @@ __global__ void compute_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. 
DT const *v_cache_batch = value_cache + - (beam_request_idx * max_beam_width + beam_sub_request_idx) * + (batch_config_request_id * max_beam_width + beam_sub_request_idx) * max_seq_length * hidden_size + vi; diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 3085fdb6ba..6947be432e 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,7 +118,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } - print_tensor(input.get_int32_ptr(), 200, "embeddinginput"); + // print_tensor(input.get_int32_ptr(), 200, "embeddinginput"); } /*static*/ diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index e84ec3095c..8340519ff3 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -69,36 +69,43 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int const tidx = threadIdx.x; // head id int const head_idx = blockIdx.x; - // request idx + // nth request idx int const request_idx = blockIdx.y; - BatchConfig::BitMask bitmask = causalMask[request_idx]; + // request id in batch config + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + // request_idx = re + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; int const first_step = 0; - int const tlength = request_infos[request_idx].first_token_depth_in_request + - request_infos[request_idx].num_tokens_in_batch; + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); - // } - + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("specinfer attn fused kernel!!!\n"); + } int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("specinfer attn fused kernel %d, %d\n", - // totalCacheSize,request_infos[request_idx].num_tokens_in_batch); - // } + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("specinfer attn fused kernel %d, %d\n", + totalCacheSize, + request_infos[batch_config_request_id].num_tokens_in_batch); + } // int const qlength = request_infos[request_idx].num_tokens_in_batch; - int const tree_branch_num = beam_request_infos[request_idx].sub_request_num; + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; // will decode qlength tokens in this thread block // int const qlength = tree_branch_num; int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { - // first_token_idx += request_infos[request_idx].num_tokens_in_batch; first_token_idx += causalMask[r].this_layer_size; } @@ -135,7 +142,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + request_idx * max_seq_length * hidden_size + ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -166,10 +173,6 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( } float qk = scale * 
Qk_dot::dot(q_vecs[ki_o], k); - // if (blockIdx.y == 0 && blockIdx.x == 0) { - // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, sub_req_idx); - // } - if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { // todo add alobi here // bool const mask = ti_circ >= totalCacheSize; @@ -177,14 +180,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << query_token)))); - // if (blockIdx.y == 0 && blockIdx.x == 0 && sub_req_idx == 0) { - // printf("specinfer mask: ti:%d, %d, %d, %d, %lld\n", - // ti, - // totalCacheSize, - // bitmask.non_tree_cache_size, - // query_token, - // bitmask.mask[ti - bitmask.non_tree_cache_size]); - // // assert(false); + // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { + // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; @@ -271,7 +268,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + request_idx * max_seq_length * hidden_size + vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { @@ -461,6 +459,7 @@ void compute_specinfer_attention_kernel_generation( DT *output_ptr, cudaStream_t stream) { // one block == one head per request + printf("??? at here: %d\n", bc->num_active_requests()); dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; @@ -761,13 +760,14 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // std::cout << "specinfer kernel token num: " << bc->num_generation_tokens // << ", " << bc->num_tokens << "\n"; if (bc->num_generation_tokens > 0) { + printf("spec inc generation decoding\n"); compute_specinfer_attention_kernel_generation
( m, bc, static_cast
(m->attn_heads), stream); } // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 if (bc->num_tokens > bc->num_generation_tokens) { - // printf("spec inc prompt decoding\n"); + printf("spec inc prompt decoding\n"); compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 8641e63e38..a4329f52db 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -76,13 +76,16 @@ __global__ void compute_attention_kernel_fused_kernel( // request idx int const request_idx = blockIdx.y; + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + int const first_step = 0; - int const tlength = request_infos[request_idx].first_token_depth_in_request + - request_infos[request_idx].num_tokens_in_batch; - int const qlength = request_infos[request_idx].num_tokens_in_batch; + int const tlength = request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = request_infos[batch_config_request_id].num_tokens_in_batch; - BatchConfig::BitMask bitmask = causalMask[request_idx]; + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; // bitmask.mask[1] = 3; // if (head_idx == 0 && tidx == 0) { @@ -132,7 +135,7 @@ __global__ void compute_attention_kernel_fused_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + request_idx * max_seq_length * hidden_size + ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -189,14 +192,14 @@ __global__ void compute_attention_kernel_fused_kernel( qk_max = mask ? qk_max : fmaxf(qk_max, qk); - if (head_idx == 0 && qi == 0 && !mask) { - printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n ", - request_idx, - ti, - qk, - q_vecs[ki_o][0].x, - k[0].x); - } + // if (head_idx == 0 && qi == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n ", + // request_idx, + // ti, + // qk, + // q_vecs[ki_o][0].x, + // k[0].x); + // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } } @@ -279,7 +282,7 @@ __global__ void compute_attention_kernel_fused_kernel( // The base pointer for the value in the cache buffer. 
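The visibility rule encoded by the causal BitMask in these fused tree/spec kernels is: cache positions below non_tree_cache_size belong to the committed prefix and are always attended, while positions inside the speculative tree are attended only if the query token's bit is set for that tree position (the kernel's `mask` boolean is the negation of this, i.e. "skip this position"). A sketch under that reading, with BitMaskSketch standing in for the real BatchConfig::BitMask:

    #include <cstdint>

    // Sketch, not the library's exact type: visibility test mirrored from the
    // bitmask check in the fused tree/spec attention kernels above.
    struct BitMaskSketch {
      int non_tree_cache_size;   // length of the committed prefix in the KV cache
      int tree_size;             // number of speculative tree tokens
      uint64_t mask[64];         // mask[j]: which query tokens may see tree position j
    };

    inline bool is_visible(BitMaskSketch const &bm, int ti, int query_token) {
      if (ti < bm.non_tree_cache_size) {
        return true;                               // prefix token: always visible
      }
      int const tree_pos = ti - bm.non_tree_cache_size;
      return (bm.mask[tree_pos] >> query_token) & 1ULL;
    }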
DT const *v_cache_batch = - value_cache + request_idx * max_seq_length * hidden_size + vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; // DT const *v_cache_batch = // value_cache + // (beam_request_idx * max_beam_width + beam_sub_request_idx) * @@ -481,8 +484,7 @@ __global__ void update_tree_branch_kv_cache_fused( int vProjSize, int num_new_tokens, int max_seq_len, - int hidden_size, - int first_token_depth) { + int hidden_size) { CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { int token_idx = i / hidden_size; @@ -498,10 +500,11 @@ __global__ void update_tree_branch_kv_cache_fused( int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; + int const first_token_depth = request_infos[req_id].first_token_depth_in_request; // if(i % hidden_size == 0){ - // printf("update token request id: %d, %d, %d value%.10f\n", req_id, - // token_idx, request_token_offset, kVal); + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", req_id, + // token_idx, request_token_offset,(token_idx + first_token_depth - request_token_offset), kVal); // } kCache_ptr[req_id * (hidden_size * max_seq_len) + (token_idx + first_token_depth - request_token_offset) * @@ -890,8 +893,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, BatchConfig::max_sequence_length() + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, - m->hidden_size, - bc->requestsInfo[0].first_token_depth_in_request); + m->hidden_size); dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5c3262eb27..e30a7ee478 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -364,6 +364,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } int num_generation_tokens = 0; + int num_active_req = -1; // Step 2: prepare the next batch for existing requests BatchConfig new_bc; @@ -454,6 +455,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase @@ -490,6 +493,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -499,6 +503,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; profile_info.llm_decoding_steps = 1; @@ -574,6 +580,7 @@ BeamSearchBatchConfig int result_index = 0; int num_generation_tokens = 0; + int num_active_req = -1; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { @@ -596,10 +603,11 @@ BeamSearchBatchConfig } else { 
committed_tokens[guid].clear(); } + // iterate through all the tokens that belong to request i int root_abs_depth = request.tokens.size() - 1; - + while (result_index < old_bc.num_tokens && old_bc.tokensInfo[result_index].request_index == i) { int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; @@ -639,7 +647,7 @@ BeamSearchBatchConfig traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); + verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -723,8 +731,10 @@ BeamSearchBatchConfig std::cout << "parse to next iteration: " << "\n"; + new_bc.request_completed[i] = false; new_bc.request_running[i] = true; + num_active_req++; // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = @@ -735,6 +745,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = @@ -805,14 +816,15 @@ BeamSearchBatchConfig log_req_mgr.print("Output: %s", output.c_str()); } - if (request.tokens.size() > 19 && i >= 7) { - std::cout << request.tokens.size() << "\n"; - assert(false); - } + // if (request.tokens.size() > 19 && i >= 7) { + // std::cout << request.tokens.size() << "\n"; + // assert(false); + // } } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; + num_active_req++; std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " << "initial_len: " << request.initial_len << std::endl; @@ -826,6 +838,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; @@ -867,6 +880,7 @@ BeamSearchBatchConfig Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; + num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -875,6 +889,7 @@ BeamSearchBatchConfig (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; @@ -967,6 +982,8 @@ BeamSearchBatchConfig old_bc.print(); new_bc.print(); } + std::cout << "prepare next batch init active tokens: " + << new_bc.num_tokens << "\n"; return new_bc; } @@ -1027,10 +1044,12 @@ BeamSearchBatchConfig int num_generation_tokens = 0; // Add incremental tokens to the batch + int num_active_req = -1; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i] || !old_bc.request_running[i]) { continue; } + num_active_req ++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); 
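The bookkeeping being added throughout prepare_next_batch_* follows one pattern: `i` walks the fixed request slots, `num_active_req` densely counts the live ones, and the slot index is recorded at the dense position so the kernels can map blockIdx.y back to the right slot. A minimal host-side sketch of that pattern, with hypothetical types:

    #include <vector>

    struct SlotInfoSketch {
      int batch_config_request_id = -1;
    };

    // Sketch of the mapping construction; the real code also copies the other
    // per-request fields into requestsInfo[i] as shown in the hunks above.
    void build_request_mapping(std::vector<bool> const &request_completed,
                               std::vector<SlotInfoSketch> &requests_info) {
      int num_active_req = -1;
      for (int i = 0; i < (int)request_completed.size(); i++) {
        if (request_completed[i]) {
          continue;                                // skip finished slots
        }
        num_active_req++;
        requests_info[num_active_req].batch_config_request_id = i;
      }
    }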
@@ -1040,29 +1059,6 @@ BeamSearchBatchConfig // assert(processed_tokens < request.tokens.size()); log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - // if (processed_tokens > - // old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() && - // request.status == Request::RUNNING - // // || ir.results[t] == 0 TODO: replace this with - // ) { - // // log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - // // old_bc.requestsInfo[i].request_guid, - // // old_bc.beamRequestsInfo[i].max_depth); - // // // new_bc.request_completed[i] = true; - // // new_bc.request_completed[i] = false; - // // new_bc.requestsInfo[i].first_token_depth_in_request = - // processed_tokens; - // // new_bc.requestsInfo[i].request_guid = - // // old_bc.requestsInfo[i].request_guid; - // // new_bc.requestsInfo[i].max_sequence_length = - // // old_bc.requestsInfo[i].max_sequence_length; - // // new_bc.beamRequestsInfo[i].current_depth = - // // old_bc.beamRequestsInfo[i].current_depth; - // // new_bc.request_running[i] = false; - // std::cout << "beam search end:" << request.status << i << ", " - // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; - // } - // else { log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; @@ -1073,6 +1069,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; profiling_requests[request.guid].ssm_decoding_steps += 1; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH @@ -1164,6 +1161,7 @@ BeamSearchBatchConfig // std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer // << "\n"; // } + std::cout << "append bit mask: "<< i << "\n"; appendBitMask(new_bc.causalMask[i], new_bc.beamRequestsInfo[i].sub_request_num, old_bc.beamRequestsInfo[i].beam_size, @@ -1198,6 +1196,7 @@ BeamSearchBatchConfig if (old_bc.request_completed[i] || old_bc.request_running[i]) { continue; } + num_active_req++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); @@ -1217,6 +1216,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata // how many sub request in current request @@ -1330,6 +1330,8 @@ BeamSearchBatchConfig // std::cout << "Current Beam DepthBBB: " // << old_bc.beamRequestsInfo[0].current_depth << "\n"; } + std::cout << "prepare next batch beam total tokens: " << new_bc.num_tokens + << "gneration tokens: " << new_bc.num_generation_tokens << "\n"; return new_bc; } @@ -1384,11 +1386,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( max_prompt_load_size -= 1; } } - + int num_active_req = -1; for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; } + num_active_req++; size_t guid = old_batches.at(0).requestsInfo[i].request_guid; Request &request = all_requests[guid]; @@ -1432,6 +1435,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = 
old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // copy bitmask to verify batchconfig memcpy(&(new_bc.causalMask[i]), @@ -1590,6 +1594,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; From 7eaffbc480b05d674bbf465c903b2277f6240e0b Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 17:24:08 -0500 Subject: [PATCH 282/344] clean up --- include/flexflow/batch_config.h | 2 +- include/flexflow/ffconst.h | 1 - include/flexflow/model.h | 45 - include/flexflow/operator_params.h | 2 - .../ops/spec_inc_multihead_self_attention.h | 1 + .../specinfer_inc_multihead_self_attention.h | 150 --- ...nfer_inc_multihead_self_attention_params.h | 33 - include/flexflow/request_manager.h | 2 + inference/file_loader.cc | 3 +- inference/models/llama.cc | 2 +- src/ops/argmax.cc | 1 - src/ops/beam_topk.cc | 7 +- src/ops/beam_topk.cu | 39 +- src/ops/inc_multihead_self_attention.cu | 3 +- src/ops/kernels/embedding_kernels.cu | 1 - src/ops/spec_inc_multihead_self_attention.cc | 12 +- src/ops/spec_inc_multihead_self_attention.cu | 1011 +++++++++++------ .../specinfer_inc_multihead_self_attention.cc | 883 -------------- .../specinfer_inc_multihead_self_attention.cu | 958 ---------------- .../tree attn kernel, 0----> -0.029753357172 | 1 - src/ops/tree_inc_multihead_self_attention.cu | 122 +- src/runtime/ffconst_utils.cc | 2 - src/runtime/graph.cc | 71 +- src/runtime/inference_manager.cc | 8 +- src/runtime/model.cc | 149 +-- src/runtime/model.cpp | 4 +- src/runtime/model.cu | 5 +- src/runtime/request_manager.cc | 288 ++--- src/runtime/request_manager.cu | 1 - 29 files changed, 835 insertions(+), 2972 deletions(-) delete mode 100644 include/flexflow/ops/specinfer_inc_multihead_self_attention.h delete mode 100644 include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h delete mode 100644 src/ops/specinfer_inc_multihead_self_attention.cc delete mode 100644 src/ops/specinfer_inc_multihead_self_attention.cu delete mode 100644 src/ops/tree attn kernel, 0----> -0.029753357172 diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 8065e0f038..13904aaa46 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -70,7 +70,7 @@ class BatchConfig { int num_tokens_in_batch; int max_sequence_length; - //request id in batch config: + // request id in batch config: int batch_config_request_id; RequestGuid request_guid; }; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index ef0003b08e..512645e624 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -171,7 +171,6 @@ enum OperatorType { OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // Parallel Ops OP_REPARTITION, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 9cdbec64a9..16df99ab1a 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -172,8 +172,6 @@ enum TaskIDs { SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - 
SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, @@ -327,7 +325,6 @@ class Linear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; -class SpecInferIncMultiHeadSelfAttention; class Pool2D; class Reduce; class Reshape; @@ -747,25 +744,6 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - -Tensor specinfer_inc_multihead_self_attention( - const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); Tensor inc_multiquery_self_attention(const Tensor input, int embed_dim, int num_q_heads, @@ -822,26 +800,6 @@ Tensor specinfer_inc_multihead_self_attention( bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - - Tensor specinfer_inc_multiquery_self_attention( - const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); // ======================================== // Inference APIs // ======================================== @@ -1243,9 +1201,6 @@ Tensor specinfer_inc_multihead_self_attention( std::unordered_map< std::pair, TreeIncMultiHeadSelfAttention *>, - std::unordered_map< - std::pair, - SpecInferIncMultiHeadSelfAttention *>, std::unordered_map, Reduce *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index cee2ae95a4..5b187839ef 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -37,7 +37,6 @@ #include "flexflow/ops/topk_params.h" #include "flexflow/ops/transpose_params.h" #include "flexflow/ops/tree_inc_multihead_self_attention_params.h" -#include "flexflow/ops/specinfer_inc_multihead_self_attention_params.h" #include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" @@ -73,7 +72,6 @@ using OperatorParameters = mp::variant -#include - -namespace FlexFlow { - -class SpecInferIncMultiHeadSelfAttentionMeta; - -class SpecInferIncMultiHeadSelfAttention : public Op { -public: - using Params = SpecInferIncMultiHeadSelfAttentionParams; - using Input = ParallelTensor; - - SpecInferIncMultiHeadSelfAttention(FFModel &model, - LayerID const &layer_guid, - const ParallelTensor _input, - int _embed_dim, - int _num_q_heads, - int _num_kv_heads, - int _kdim, - int _vdim, - float _dropout, - bool _qkv_bias, - bool _final_bias, - bool _add_zero_attn, - bool _apply_rotary_embedding, - bool _scaling_query, - float _scaling_factor, - bool _qk_prod_scaling, - bool _position_bias, - bool allocate_weights, - char const *name); - SpecInferIncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor 
_weight, - int _embed_dim, - int _num_q_heads, - int _num_kv_heads, - int _kdim, - int _vdim, - float _dropout, - bool _qkv_bias, - bool _final_bias, - bool _add_zero_attn, - bool _apply_rotary_embedding, - bool _scaling_query, - float _scaling_factor, - bool _qk_prod_scaling, - bool _position_bias, - bool allocate_weights, - char const *name); - SpecInferIncMultiHeadSelfAttention(FFModel &model, - SpecInferIncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); - SpecInferIncMultiHeadSelfAttention(FFModel &model, - Params const ¶ms, - Input const &inputs, - bool allocate_weights = false, - char const *name = nullptr); - static Op * - create_operator_from_layer(FFModel &model, - Layer const *layer, - std::vector const &inputs); - void init(FFModel const &) override; - void init_inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; - void forward(FFModel const &) override; - void backward(FFModel const &) override; - Legion::FutureMap inference(FFModel const &, - BatchConfigFuture const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; - void print_layer(FFModel const &model) override { - assert(0); - } - bool get_int_parameter(PMParameter, int *) const override; - - static OpMeta *init_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - Op *materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const override; - bool measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const override; - - static void - inference_kernel_wrapper(SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); - Params get_params() const; - -public: - int num_q_heads, num_kv_heads, tensor_parallelism_degree; - float dropout, scaling_factor; - bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int qoSeqLength, kvSeqLength; -}; - -class SpecInferIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { -public: - SpecInferIncMultiHeadSelfAttentionMeta(FFHandler handler, - SpecInferIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, - MemoryAllocator &gpu_mem_allocator, - int num_samples, - int _num_q_heads, - int _num_kv_heads); - ~SpecInferIncMultiHeadSelfAttentionMeta(void); - -public: - Realm::RegionInstance beam_search_reserve_inst; - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; - BatchConfig::BitMask *causalMask; -}; - -}; // namespace FlexFlow - -#endif // _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_H diff --git a/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h b/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h deleted file mode 100644 index b57b06a7f7..0000000000 --- a/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef 
_FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H -#define _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H - -#include "flexflow/ffconst.h" -#include "flexflow/fftype.h" -#include "flexflow/parallel_tensor.h" - -namespace FlexFlow { - -struct SpecInferIncMultiHeadSelfAttentionParams { - LayerID layer_guid; - int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; - float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; - - bool is_valid(ParallelTensorShape const &) const; -}; - -bool operator==(SpecInferIncMultiHeadSelfAttentionParams const &, - SpecInferIncMultiHeadSelfAttentionParams const &); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash { - size_t - operator()(FlexFlow::SpecInferIncMultiHeadSelfAttentionParams const &) const; -}; -} // namespace std - -#endif // _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 8cb45e55b4..1c4b0b2a2f 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -231,6 +231,8 @@ class RequestManager { int max_requests_per_batch; int max_tokens_per_batch; int max_sequence_length; + + // tree width in each speculative step, if not specified 1 std::vector spec_infer_tree_width; // private fields std::unique_ptr tokenizer_; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 3f70ddf488..7c6870d439 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -726,8 +726,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION) { + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { if (weight_filename.find("self_attention") != std::string::npos) { load_attention_weights_multi_query( data, weight_filename, weights_folder, hidden_dim, num_heads); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 4f76e9e0fa..10001ee916 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -90,7 +90,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.specinfer_inc_multihead_self_attention( + mha = ff.spec_inc_multihead_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index d195a5af75..c3bb3d493e 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -399,7 +399,6 @@ InferenceResult m, shard_id, bc, {}, {}, {input, indices}); } - // print_tensor(indices.get_int32_ptr(), 199, "tree attn output"); download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 5dfaae41ee..87d357b535 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -370,14 +370,10 @@ BeamInferenceResult Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - - printf("----------1-----------\n"); + int *index_ptr = index.get_int32_ptr(); - printf("----------2-----------\n"); float *value_ptr = value.get_float_ptr(); - printf("----------3-----------\n"); int *parent_ptr = parent.get_int32_ptr(); - printf("----------4-----------\n"); // embedding size: eg. 
4096 int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; @@ -404,7 +400,6 @@ BeamInferenceResult // print_tensor(index_ptr, 32, "indexxxxxxx"); - if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index d647fe9ed7..a958786be3 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -379,9 +379,9 @@ template __global__ void mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { using T_ACC = T; - int64_t const i = blockIdx.x; + const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - int64_t const index = i * N + j; + const int64_t index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rstd[i]); } } @@ -556,7 +556,6 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int beam_size = bc->beamRequestsInfo[i].beam_size; // initial request - std::cout << "sub_requests: " << i << ", " << sub_requests[i] << "\n"; assert(sub_requests[i] > 0); // process sub requests for (int j = 0; j < sub_requests[i]; j++) { @@ -564,12 +563,13 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, // beam_slots[i].parent_id[j]; acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = bc->beamRequestsInfo[i].probs[j]; - std::cout << "probbbb req: " << i << ", sub req probability : " - << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << j - << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] - << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j - << "\n"; + // std::cout << "probbbb req: " << i << ", sub req probability : " + // << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << + // j + // << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] + // << ", data inddd" + // << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j + // << "\n"; } // process tokens @@ -584,7 +584,6 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); max_beam_width = std::max(max_beam_width, beam_size); - std::cout << "max beam width: " << max_beam_width << "\n"; req_index += 1; block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; } @@ -625,23 +624,23 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, cudaMemcpyHostToDevice, stream)); // trick, set acc_probs to 0; - checkCUDA( - cudaMemsetAsync(m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); + checkCUDA(cudaMemsetAsync( + m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); checkCUDA(cudaMemcpyAsync(m->block_start_index, beam_block_start_index.data(), sizeof(int) * beam_num_blocks, cudaMemcpyHostToDevice, stream)); checkCUDA(cudaMemcpyAsync(m->request_id, - request_id.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); + request_id.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); checkCUDA(cudaMemcpyAsync(m->tokens_per_request, - tokens_per_request.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); + tokens_per_request.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); // int depth = // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; beam_num_blocks = bc->num_active_tokens(); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 3b3879e8e5..cca0b230c3 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ 
b/src/ops/inc_multihead_self_attention.cu @@ -106,7 +106,8 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + + batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 6947be432e..22d8161ff1 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,7 +118,6 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } - // print_tensor(input.get_int32_ptr(), 200, "embeddinginput"); } /*static*/ diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index eb6fd721e6..5d234df822 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -53,7 +53,7 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( } Tensor - FFModel::spec_inc_multihead_self_attention(const Tensor input, + FFModel::spec_inc_multihead_self_attention(Tensor const input, int embed_dim, int num_heads, int kdim, @@ -91,7 +91,7 @@ Tensor } Tensor - FFModel::spec_inc_multiquery_self_attention(const Tensor input, + FFModel::spec_inc_multiquery_self_attention(Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -257,7 +257,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -358,8 +358,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -465,7 +465,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, SpecIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : SpecIncMultiHeadSelfAttention(model, other.layer_guid, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 29e3d9a48d..b3a87fe244 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -23,16 +23,295 @@ namespace FlexFlow { +#define WARP_SIZE 32 + // declare Legion names using Legion::coord_t; using Legion::Memory; using namespace Kernels::IncMultiHeadAttention; namespace Kernels { -namespace SpecIncMultiHeadAttention { +namespace SpecIncMultiHeadSelfAttention { + +template +__global__ void compute_spec_inc_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo 
*beam_request_infos, + BatchConfig::BitMask *causalMask) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // nth request idx + int const request_idx = blockIdx.y; + + // request id in batch config + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + // request_idx = re + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + + int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn fused kernel %d, %d\n", + // totalCacheSize, + // request_infos[batch_config_request_id].num_tokens_in_batch); + // } + // int const qlength = request_infos[request_idx].num_tokens_in_batch; + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; + + // will decode qlength tokens in this thread block + // int const qlength = tree_branch_num; + + int first_token_idx = 0; + for (int r = 0; r < request_idx; r++) { + first_token_idx += causalMask[r].this_layer_size; + } + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. 
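To make the key-side tiling constants above concrete, here is a worked example assuming DT = float, a 16-byte K_vec (i.e. float4), Dh = 128, THREADS_PER_KEY = 4 and a 128-thread block (matching the head-size-128 instantiation further below); the concrete vector width is an assumption, not something stated in the patch:

    // Worked example of the tiling arithmetic; values in the comments follow
    // directly from the definitions above under the stated assumptions.
    constexpr int kDh              = 128;
    constexpr int kThreadsPerKey   = 4;
    constexpr int kThreadsPerBlock = 128;
    constexpr int kKVecSize        = 16 / sizeof(float);                // 4 elements per vector load
    constexpr int kKEltsPerThread  = kDh / kThreadsPerKey;              // 32 elements of the head dim per thread
    constexpr int kKVecsPerThread  = kKEltsPerThread / kKVecSize;       // 8 vector loads per key
    constexpr int kKPerIter        = kThreadsPerBlock / kThreadsPerKey; // 32 cache positions per block iteration
    constexpr int kKPerWarp        = 32 / kThreadsPerKey;               // 8 cache positions per warp
    static_assert(kKVecsPerThread == 8 && kKPerIter == 32, "sanity check of the example");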
+ constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < tree_branch_num; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + + int const query_token = bitmask.tree_size - tree_branch_num + qi; + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < totalCacheSize) { + + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + // bool const mask = ti_circ >= totalCacheSize; + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { + // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); + // } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn first token qk_max %.10f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. 
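The qk_max reduction just above follows a standard two-level pattern: a butterfly shuffle inside each warp, then one partial per warp combined through shared memory. A self-contained sketch of that pattern (the kernel's first loop stops at THREADS_PER_KEY instead of 1; this version reduces all the way down for clarity and assumes num_warps is a power of two):

    #include <cfloat>

    __device__ float block_max_sketch(float val, float *red_smem, int num_warps) {
      int const lane = threadIdx.x % 32;
      int const warp = threadIdx.x / 32;
      for (int mask = 16; mask >= 1; mask /= 2) {          // intra-warp butterfly reduction
        val = fmaxf(val, __shfl_xor_sync(0xffffffffu, val, mask));
      }
      if (lane == 0) {
        red_smem[warp] = val;                              // one partial max per warp
      }
      __syncthreads();
      val = (lane < num_warps) ? red_smem[lane] : -FLT_MAX; // every warp re-reads the partials
      for (int mask = num_warps / 2; mask >= 1; mask /= 2) {
        val = fmaxf(val, __shfl_xor_sync(0xffffffffu, val, mask));
      }
      return __shfl_sync(0xffffffffu, val, 0);             // every thread now holds the block max
    }

The same shape is reused for the exp_sum reduction via block_sum a few lines later.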
+ int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. 
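For reference, the final store a few lines below writes each head's result at (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi. A small sketch of that offset computation, assuming the output buffer is laid out [token][head][per_head_dim] and that hidden_size equals num_q_heads * per_head_size (first_token_idx is the running sum of this_layer_size over the preceding requests, so each request's speculative tokens occupy a contiguous range of rows):

    #include <cstddef>

    // Sketch only; mirrors the output indexing used by the generation kernel.
    inline size_t output_offset_sketch(int first_token_idx,
                                       int qi,            // tree token index within the request
                                       int head_idx,
                                       int per_head_size,
                                       int hidden_size,   // assumed num_q_heads * per_head_size
                                       int vi) {          // element within this head
      return static_cast<size_t>(first_token_idx + qi) * hidden_size +
             static_cast<size_t>(head_idx) * per_head_size + vi;
    }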
+ if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} template -__global__ void spec_store_kv_cache( +__global__ void spec_inc_store_kv_cache( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, @@ -40,16 +319,16 @@ __global__ void spec_store_kv_cache( BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, int vProjSize, int num_tokens, int max_seq_len, - int max_beam_width, bool is_root, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / (hidden_size); int offset = i % hidden_size; size_t val_idx = @@ -58,100 +337,36 @@ __global__ void spec_store_kv_cache( DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; - // above no need to be changed - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - // int const sub_req_id = id_map[token_idx].sub_request_index; - // int const parent_id = id_map[token_idx].parent_id; - // int const beam_depth = id_map[token_idx].beam_depth; - // int const beam_width = id_map[token_idx].beam_width; - int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const first_token_in_req = + requestInfo[req_id].first_token_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; - int const beam_depth = beamRequestInfos[req_id].current_depth; - int const beam_width = beamRequestInfos[req_id].beam_size; - - kCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - - // replica in the root iteration - if (beam_depth == 1) { - for (int i = 1; i < beam_width; i++) { - kCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - } - } + int const total_token = requestInfo[req_id].num_tokens_in_batch; - // if (head_idx == 0 && beam_depth == 0 && token_idx == 8 && k_cache) { - // // printf("token idx %d\n", token_idx); - // printf("data idx: %d, tok_id %d, new_token_cache_idx %d, parent_id %d, - // " - // "sub_req_id %d, num_tokens %d, kProjSize %d, num_kv_heads %d, - // val " - // "%f, beam_width %d\n", - // data_idx, - // tok_id, - // new_token_cache_idx, - // parent_id, - // sub_req_id, - // num_tokens, - // kProjSize, - // num_kv_heads, - // val, - // beam_width); - // } + int const request_token_offset = + requestInfo[req_id].first_token_offset_in_batch; - // naive cache stealing - if (sub_req_id != parent_id) { - // if (offset == 0 && tok_id == 0) { - // printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " - // "%d, tok_id %d\n", - // beam_depth, - // req_id, - // sub_req_id, - // parent_id, - // tok_id); - // } - - for (int depth = 0; depth < beam_depth; depth++) { - 
int steal_token_idx = tok_id - beam_depth + depth; - int steal_from_idx = (req_id * max_beam_width + parent_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - kCache_ptr[steal_to_idx] = kCache_ptr[steal_from_idx]; - vCache_ptr[steal_to_idx] = vCache_ptr[steal_from_idx]; - - // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ - // printf("cache stealing kernel!, steal_token_idx %d\n", - // steal_token_idx); - // } - } - } + BatchConfig::BitMask bitmask = causalMask[req_id]; - // parallel cache stealing not yet implemented - // logic shld be - // launch spec_store_kv_cache with parallelism * current depth - // from the i here, get depth index - // if depth index not the current one, check if we need to steal - // steal if needed - - // cache stealing theory - // identify which sub request does this token come from - // for initial token, 0 - // for other, may 0,0,1/ 0,1,2/ 1,1,1 to get which cache to be reuse and - // which to be delete copy beam_size bunch of blocks when sub_req_id == - // parent_id : like 0 -> 0, 1->1, 2->2, do nothing, just append the new k/v + int const sub_request_num = beamRequestInfos[req_id].sub_request_num; + + int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; + + // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - + // tree_branch_num + sub_req_id + tok_id; + // bitmask.tree_size - tree_branch_num + sub_req_id; + + // if prompt token -> token id + // if tree token: + int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - + bitmask.this_layer_size + token_idx - + request_token_offset; + + kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = vVal; } } @@ -161,28 +376,79 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; - // printf("curr depth: %d\n", curr_depth); - // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - spec_store_kv_cache<<>>(static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->token_infos, - m->request_infos, - m->beam_token_infos, - m->beam_request_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_BEAM_WIDTH, - /*root*/ curr_depth == 0, - m->hidden_size); + spec_inc_store_kv_cache<<>>( + static_cast
(m->devQKVProjArray), + static_cast
(m->keyCache), + static_cast
(m->valueCache), + m->token_infos, + m->request_infos, + m->beam_token_infos, + m->beam_request_infos, + m->causalMask, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, + /*root*/ curr_depth == 0, + m->hidden_size); + } +} + +#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_spec_inc_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
(m->devQKVProjArray), \ + static_cast
(m->keyCache), \ + static_cast
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->beam_request_infos, \ + m->causalMask) + +template +void compute_spec_inc_attention_kernel_generation( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // one block == one head per request + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); } } @@ -204,13 +470,14 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { +void compute_attention_kernel_prompt( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); @@ -236,199 +503,208 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int kt_req_block_size = kt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_req_block_size = vt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } - for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; + // else if (tokens_previous_requests < bc->num_generation_tokens) { + // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // continue; + // } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; + // all requests in prompt phase should only have one sub requests; + assert(bc->sub_requests[i] == 1); + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; - if (num_new_tokens <= 0) { - continue; - } + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = 
bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast<DT *>
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast<DT *>
(m->keyCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; - - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast<DT *>
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - spec_fill_entries_above_diagonal<<>>( - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
<DT *>(m->valueCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast<DT *>
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; + if (num_new_tokens <= 0) { + continue; + } + + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast<DT *>
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // To get B, skip over K entries from previous requests (all heads + + // padding) + + // print_tensor((float*)A, 32, "A"); + DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; + + // if (i == 0 && sub_req_id == 0 && + // bc->beam_slots.at(0).current_depth == 1) { + // int offset = (float *)B - m->keyCache; + // printf("key cache offset %d\n", kt_req_block_size); + // } + // To get C, skip over QK^T products from previous requests + DT *C = static_cast<DT *>
(m->qk_prods) + + m->num_q_heads * tokens_prev_requests_squares; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // print_tensor((float*)C, 32, "C"); + // add alibi position bias to qk production + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + spec_fill_entries_above_diagonal<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_q_heads * tokens_prev_requests_squares; + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = m->vProjSize; + n = num_new_tokens; + k = total_tokens; + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + + // padding) + A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + + // print_tensor((float*)C_softmax, 32, "C_softmax"); + C = static_cast<DT *>
(m->attn_heads) + + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; } // assert(tokens_previous_requests == num_tokens); @@ -443,31 +719,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { - // here because we need postion info in infernece 1 - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->beam_token_infos, - &(bc->beamTokenInfo), - bc->num_active_tokens() * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->beam_request_infos, - &(bc->beamRequestsInfo), - bc->max_requests_per_batch() * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), - cudaMemcpyHostToDevice, - stream); // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, bc, shard_id, @@ -479,7 +732,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 2: Update key/val cache update_kv_cache_kernel
<DT>(m, bc, stream); if (bc->num_generation_tokens > 0) { - compute_attention_kernel_generation<DT>
( + compute_spec_inc_attention_kernel_generation<DT>
( m, bc, static_cast<DT *>
(m->attn_heads), stream); } // phase 3: Compute attention score @@ -488,16 +741,14 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } - // compute output production and bias together for all tokens - int num_tokens = - bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + int num_tokens = bc->num_active_tokens(); compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } -} // namespace SpecIncMultiHeadAttention +} // namespace SpecIncMultiHeadSelfAttention } // namespace Kernels /*static*/ @@ -529,25 +780,27 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); } else if (input.data_type == DT_FLOAT) { float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -559,7 +812,8 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); - printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); + printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", + elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); @@ -606,44 +860,51 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t beam_tokeninfo_size = - max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); - size_t beam_requestinfo_size = - BeamSearchBatchConfig::max_requests_per_batch(); - size_t total_size = - beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + - beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: - BeamSearchPerRequestInfo); // more components will - // be added here later - - // We always directly allocate memory for small speculative models - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); + // size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; + // size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); + // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + // total_size); + beam_token_infos = - gpu_mem_allocator - .allocate_instance( - beam_tokeninfo_size); + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + 
sizeof(BatchConfig::requestsInfo)); + + beam_request_infos = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo)); + causalMask = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + + // causalMask = gpu_mem_allocator.allocate_instance( + // causal_mask_size); + // beam_token_infos = + // gpu_mem_allocator + // .allocate_instance( + // beam_tokeninfo_size); // offset += beam_tokeninfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - beam_request_infos = - gpu_mem_allocator - .allocate_instance( - beam_requestinfo_size); + // beam_request_infos = + // gpu_mem_allocator + // .allocate_instance( + // beam_requestinfo_size); // offset += beam_requestinfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); // assert(offset == total_size); - assert(gpu_mem_allocator.instance_total_size == - gpu_mem_allocator.instance_allocated_size); + // assert(gpu_mem_allocator.instance_total_size == + // gpu_mem_allocator.instance_allocated_size); } cudaStreamSynchronize(stream); } -SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { +SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta( + void) { if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { beam_search_reserve_inst.destroy(); } diff --git a/src/ops/specinfer_inc_multihead_self_attention.cc b/src/ops/specinfer_inc_multihead_self_attention.cc deleted file mode 100644 index 42074f39e4..0000000000 --- a/src/ops/specinfer_inc_multihead_self_attention.cc +++ /dev/null @@ -1,883 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" -#include "flexflow/ffconst_utils.h" -#include "flexflow/model.h" -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif -#include "flexflow/utils/hash_utils.h" -#include "legion/legion_utilities.h" - -namespace FlexFlow { - -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::Future; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -bool SpecInferIncMultiHeadSelfAttentionParams::is_valid( - ParallelTensorShape const &input) const { - bool is_valid = input.is_valid(); - return is_valid; -} - -Tensor FFModel::specinfer_inc_multihead_self_attention( - Tensor const input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - return specinfer_inc_multiquery_self_attention(input, - embed_dim, - num_heads, - num_heads, - kdim, - vdim, - dropout, - qkv_bias, - final_bias, - add_zero_attn, - data_type, - kernel_initializer, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); -} - -Tensor FFModel::specinfer_inc_multiquery_self_attention( - Tensor const input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - if (data_type == DT_NONE) { - data_type = input->data_type; - } - Layer *li = nullptr; - int weight_num = (qkv_bias || final_bias) ? 2 : 1; - if (data_type != input->data_type) { - Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); - li = new Layer(this, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, - data_type, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - casted_input); - } else { - li = new Layer(this, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, - data_type, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - input); - } - { - int numdims = input->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = input->dims[i]; - } - dims[0] = embed_dim; - li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, data_type, li, 0, true /*create_grad*/); - } - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - int weight_size = qParas * num_q_heads + kParas * num_q_heads + - vParas * num_q_heads + oParas * num_q_heads; - { - int dims[1] = {weight_size}; - li->weights[0] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (qkv_bias || final_bias) { - // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; - li->weights[1] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - li->data_type = data_type; - li->add_int_property("embed_dim", embed_dim); - li->add_int_property("num_q_heads", num_q_heads); - li->add_int_property("num_kv_heads", num_kv_heads); - li->add_int_property("kdim", kdim); - li->add_int_property("vdim", vdim); - li->add_int_property("qkv_bias", qkv_bias); - li->add_int_property("final_bias", final_bias); - li->add_int_property("add_zero_attn", add_zero_attn); - li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); - li->add_int_property("scaling_query", scaling_query); - li->add_float_property("scaling_factor", scaling_factor); - li->add_int_property("qk_prod_scaling", qk_prod_scaling); - li->add_int_property("position_bias", position_bias); - layers.push_back(li); - return li->outputs[0]; -} - -Op *SpecInferIncMultiHeadSelfAttention::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - - std::cout << "spec create operator: " << layer->name << "\n"; - long long value; - layer->get_int_property("embed_dim", value); - int embed_dim = value; - layer->get_int_property("num_q_heads", value); - int num_q_heads = value; - layer->get_int_property("num_kv_heads", value); - int num_kv_heads = value; - layer->get_int_property("kdim", value); - int kdim = value; - layer->get_int_property("vdim", value); - int vdim = value; - float dropout; - layer->get_float_property("dropout", dropout); - layer->get_int_property("qkv_bias", value); - bool qkv_bias = (bool)value; - layer->get_int_property("final_bias", value); - bool final_bias = (bool)value; - layer->get_int_property("add_zero_attn", value); - bool add_zero_attn = (bool)value; - layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; - layer->get_int_property("scaling_query", value); - bool scaling_query = (bool)value; - float scaling_factor; - layer->get_float_property("scaling_factor", scaling_factor); - layer->get_int_property("qk_prod_scaling", value); - bool qk_prod_scaling = (bool)value; - layer->get_int_property("position_bias", value); - bool position_bias = (bool)value; - - return new SpecInferIncMultiHeadSelfAttention(model, - layer->layer_guid, - inputs[0], - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - qkv_bias, - final_bias, - add_zero_attn, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - false /*allocate_weights*/, - layer->name); -} - -SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( - FFModel &model, - LayerID const &_layer_guid, - ParallelTensor const _input, - int _embed_dim, - int _num_q_heads, - int _num_kv_heads, - int _kdim, - int _vdim, - float _dropout, - bool _qkv_bias, - bool _final_bias, - bool _add_zero_attn, - bool _apply_rotary_embedding, - 
bool _scaling_query, - float _scaling_factor, - bool _qk_prod_scaling, - bool _position_bias, - bool allocate_weights, - char const *name) - // Initializer* _bias_initializer) - : Op(model, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, - _input->data_type, - name, - 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, - 1 /*outputs*/, - _input), - num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), - add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { - // overwrite layer_guid - layer_guid = _layer_guid; - - numOutputs = 1; - int numdim = _input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i] = _input->dims[i]; - } - dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } - - outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* // Check correctness */ - /* assert(check_output_input_weight_parallel_dims()); */ -} - -SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( - FFModel &model, - ParallelTensor const _input, - ParallelTensor const _weight, - int _embed_dim, - int _num_q_heads, - int _num_kv_heads, - int _kdim, - int _vdim, - float _dropout, - bool _qkv_bias, - bool _final_bias, - bool _add_zero_attn, - bool _apply_rotary_embedding, - bool _scaling_query, - float _scaling_factor, - bool _qk_prod_scaling, - bool _position_bias, - bool allocate_weights, - char const *name) - // Initializer* _bias_initializer) - : Op(model, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, - _input->data_type, - name, - 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, - 1 /*outputs*/, - _input, - _weight), - num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), - add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) -// bias_initializer(_bias_initializer) -{ - numOutputs = 1; - int numdim = _input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i] = _input->dims[i]; - } - dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = qParas + kParas + vParas + oParas; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } - - outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, this->data_type, this); - - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ - // Check correctness - /* assert(check_output_input_weight_parallel_dims()); */ -} - -SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( - FFModel &model, - SpecInferIncMultiHeadSelfAttention const &other, - ParallelTensor const input, - bool allocate_weights) - : SpecInferIncMultiHeadSelfAttention(model, - other.layer_guid, - input, - other.oProjSize, - other.num_q_heads, - other.num_kv_heads, - other.qProjSize, - other.vProjSize, - other.dropout, - other.qkv_bias, - other.final_bias, - other.add_zero_attn, - other.apply_rotary_embedding, - other.scaling_query, - other.scaling_factor, - other.qk_prod_scaling, - other.position_bias, - allocate_weights, - other.name) {} - -SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( - FFModel &model, - SpecInferIncMultiHeadSelfAttentionParams const ¶ms, - ParallelTensor const &input, - bool allocate_weights, - char const *name) - : SpecInferIncMultiHeadSelfAttention(model, - params.layer_guid, - input, - params.embed_dim, - params.num_q_heads, - params.num_kv_heads, - params.kdim, - params.vdim, - params.dropout, - params.qkv_bias, - params.final_bias, - params.add_zero_attn, - params.apply_rotary_embedding, - params.scaling_query, - params.scaling_factor, - params.qk_prod_scaling, - params.position_bias, - allocate_weights, - name) {} - -void SpecInferIncMultiHeadSelfAttention::init_inference( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = batch_outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher( - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(SpecInferIncMultiHeadSelfAttention)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); -} - -void SpecInferIncMultiHeadSelfAttention::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher( - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(SpecInferIncMultiHeadSelfAttention)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -/* - regions[0](I): input - regions[1](I): weight - regions[2](O): output -*/ -OpMeta *SpecInferIncMultiHeadSelfAttention::init_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - SpecInferIncMultiHeadSelfAttention const *attn = - (SpecInferIncMultiHeadSelfAttention *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - - GenericTensorAccessorR input = - helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, - regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorW output = - helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], - FID_DATA, - ctx, - runtime); - - int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; - assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_q_heads = attn->num_q_heads; - int num_kv_heads = attn->num_kv_heads; - 
assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - MemoryAllocator gpu_mem_allocator(gpu_mem); - // We don't do offloading for SSMs (small speculative models) - SpecInferIncMultiHeadSelfAttentionMeta *m = - new SpecInferIncMultiHeadSelfAttentionMeta(handle, - attn, - weight, - gpu_mem_allocator, - num_samples, - num_q_heads, - num_kv_heads); - // assert that we didn't over allocate memory - assert(gpu_mem_allocator.instance_allocated_size == - gpu_mem_allocator.instance_total_size); - m->profiling = attn->profiling; - m->inference_debugging = attn->inference_debugging; - std::strcpy(m->op_name, attn->name); - m->layer_guid = attn->layer_guid; - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); - return m; -} - -void SpecInferIncMultiHeadSelfAttention::forward(FFModel const &ff) { - // SpecInferIncMultiHeadSelfAttention doesn't support forward - assert(false); -} - -FutureMap SpecInferIncMultiHeadSelfAttention::inference( - FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - int idx = 0; - IndexLauncher launcher(SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(idx++, FID_DATA); - - if (qkv_bias || final_bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(idx++, FID_DATA); - } - return runtime->execute_index_space(ctx, launcher); -} - -/* - regions[0](I): input - regions[3](I): weight - regions[4](O): output -*/ -void SpecInferIncMultiHeadSelfAttention::inference_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(task->regions.size() == regions.size()); - - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - if (bc.num_tokens == 0) { - return; - } - - SpecInferIncMultiHeadSelfAttentionMeta *m = - *((SpecInferIncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 - : regions.size() == 3)); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } - Domain input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - - assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); - assert(output_domain.get_dim() == 4); - - assert(task->index_point.get_dim() == 1); - SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); - if (m->inference_debugging) { - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - std::vector weights_accessors; - weights_accessors.push_back(weight); - if (*m->qkv_bias || *m->final_bias) { - weights_accessors.push_back(biases); - } - SpecInferIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); - } -} - -void SpecInferIncMultiHeadSelfAttention::backward(FFModel const &ff) { - // SpecInferIncMultiHeadSelfAttention does not support backward - assert(false); -} - -bool SpecInferIncMultiHeadSelfAttention::get_int_parameter(PMParameter para, - int *value) const { - switch (para) { - case PM_NUM_HEADS: - *value = num_q_heads; - return true; - default: - return Op::get_int_parameter(para, value); - } -} - -Op *SpecInferIncMultiHeadSelfAttention::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - SpecInferIncMultiHeadSelfAttentionParams params = get_params(); - return new SpecInferIncMultiHeadSelfAttention( - ff, params, inputs[0], true, this->name); -} - -bool SpecInferIncMultiHeadSelfAttention::measure_operator_cost( - Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - return false; -} - -bool operator==(SpecInferIncMultiHeadSelfAttentionParams const &lhs, - SpecInferIncMultiHeadSelfAttentionParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && - lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && - lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && - lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && - lhs.scaling_query == rhs.scaling_query && - lhs.scaling_factor == rhs.scaling_factor && - lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; -} - -SpecInferIncMultiHeadSelfAttentionParams - SpecInferIncMultiHeadSelfAttention::get_params() const { - 
SpecInferIncMultiHeadSelfAttentionParams params; - params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; - params.num_q_heads = this->num_q_heads; - params.num_kv_heads = this->num_kv_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; - params.dropout = this->dropout; - params.qkv_bias = this->qkv_bias; - params.final_bias = this->final_bias; - params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; - params.scaling_query = this->scaling_query; - params.scaling_factor = this->scaling_factor; - params.qk_prod_scaling = this->qk_prod_scaling; - params.position_bias = this->position_bias; - - return params; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::SpecInferIncMultiHeadSelfAttentionParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.layer_guid.id); - hash_combine(key, params.embed_dim); - hash_combine(key, params.num_q_heads); - hash_combine(key, params.num_kv_heads); - hash_combine(key, params.kdim); - hash_combine(key, params.vdim); - hash_combine(key, params.dropout); - hash_combine(key, params.qkv_bias); - hash_combine(key, params.final_bias); - hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); - hash_combine(key, params.scaling_query); - hash_combine(key, params.scaling_factor); - hash_combine(key, params.qk_prod_scaling); - hash_combine(key, params.position_bias); - return key; -} -}; // namespace std diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu deleted file mode 100644 index 8340519ff3..0000000000 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ /dev/null @@ -1,958 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "cuComplex.h" -#endif -#include "flexflow/ffconst_utils.h" -#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" -#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" -#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" -#include "flexflow/utils/cuda_helper.h" - -namespace FlexFlow { - -#define WARP_SIZE 32 - -// declare Legion names -using Legion::coord_t; -using Legion::Memory; -using namespace Kernels::IncMultiHeadAttention; - -namespace Kernels { -namespace SpecInferIncMultiHeadAttention { - -template -__global__ void compute_specinfer_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int const max_seq_length, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // nth request idx - int const request_idx = blockIdx.y; - - // request id in batch config - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - // request_idx = re - - BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; - - int const first_step = 0; - - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("specinfer attn fused kernel!!!\n"); - } - - int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; - - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("specinfer attn fused kernel %d, %d\n", - totalCacheSize, - request_infos[batch_config_request_id].num_tokens_in_batch); - } - // int const qlength = request_infos[request_idx].num_tokens_in_batch; - int const tree_branch_num = - beam_request_infos[batch_config_request_id].sub_request_num; - - // will decode qlength tokens in this thread block - // int const qlength = tree_branch_num; - - int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += causalMask[r].this_layer_size; - } - - // if (tidx == 0 && head_idx == 0) { - // printf("spec req: %d, %d\n", request_idx, first_token_idx); - // } - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; - - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // The number of keys per warp. 
- constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; - - int ti_end = - div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; - - for (int qi = 0; qi < tree_branch_num; qi += 1) { -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + - ii * THREADS_PER_KEY * K_VEC_SIZE); - } - - int const query_token = bitmask.tree_size - tree_branch_num + qi; - - __syncthreads(); - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; - - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < totalCacheSize) { - - k[ii] = *reinterpret_cast( - k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + - jj); - } - } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - - if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - // bool const mask = ti_circ >= totalCacheSize; - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { - // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); - // } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; - } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("spec inc attn first token qk_max %.10f\n", qk_max); - // } - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < totalCacheSize; - ti += THREADS_PER_BLOCK) { - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = mask ? 0.0f : logit; - } - - // Compute the sum. 
- exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("spec inc attn exp_sum %.10f\n", exp_sum); - // } - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < totalCacheSize; - ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("softmax %.10f\n", qk_smem[0]); - // } - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + - vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - float logit = mask ? 0.0f : qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } - - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("valueX %.10f\n", out.x); - // } - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); - } - } - - // Output the final values. 
- if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float(*reinterpret_cast( - output_ptr + (first_token_idx + qi) * hidden_size + - head_idx * per_head_size + vi), - out); - } - } -} - -template -__global__ void specinfer_store_kv_cache( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo *tokenInfos, - BatchConfig::PerRequestInfo *requestInfo, - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, - BatchConfig::BitMask *causalMask, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens, - int max_seq_len, - bool is_root, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / (hidden_size); - int offset = i % hidden_size; - - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const first_token_in_req = - requestInfo[req_id].first_token_depth_in_request; - int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const total_token = requestInfo[req_id].num_tokens_in_batch; - - int const request_token_offset = - requestInfo[req_id].first_token_offset_in_batch; - - BatchConfig::BitMask bitmask = causalMask[req_id]; - - int const sub_request_num = beamRequestInfos[req_id].sub_request_num; - - int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; - - // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - - // tree_branch_num + sub_req_id + tok_id; - // bitmask.tree_size - tree_branch_num + sub_req_id; - - // if prompt token -> token id - // if tree token: - int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - - bitmask.this_layer_size + token_idx - - request_token_offset; - - kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + - offset] = vVal; - } -} - -template -void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - int curr_depth = bc->beamRequestsInfo[0].current_depth; - // printf("curr depth: %d\n", curr_depth); - // assert(curr_depth < 3); - if (num_tokens > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - // printf("tokenInfo %d, %d\n", - // bc->beamTokenInfo[0].sub_request_index, - // num_tokens); - specinfer_store_kv_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray), - static_cast<DT *>
(m->keyCache), - static_cast<DT *>
(m->valueCache), - m->token_infos, - m->request_infos, - m->beam_token_infos, - m->beam_request_infos, - m->causalMask, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, - /*root*/ curr_depth == 0, - m->hidden_size); - } -} - -#define LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
(m->qProjSize, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ - THREADS_PER_VALUE, \ - THDS_PER_BLOCK); \ - compute_specinfer_attention_kernel_generation_kernel \ - <<>>( \ - static_cast
<DT *>(m->devQKVProjArray), \ - static_cast<DT *>
(m->keyCache), \ - static_cast<DT *>
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos, \ - m->beam_request_infos, \ - m->causalMask) - -template -void compute_specinfer_attention_kernel_generation( - SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - // one block == one head per request - printf("??? at here: %d\n", bc->num_active_requests()); - dim3 grid(m->num_q_heads, bc->num_active_requests()); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); - } else { - assert(false && "a unsupported head size"); - } -} - -template -__global__ void spec_fill_entries_above_diagonal(DT *matrix, - size_t new_tokens, - size_t total_tokens_in_request, - size_t num_q_heads, - DT value) { - CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { - // size_t head_idx = i / (new_tokens * total_tokens_in_request); - size_t src_idx = (i / new_tokens) % total_tokens_in_request; - size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; - // Casual Mask - if (src_idx > dst_idx) { - matrix[i] = value; - } - } -} - -template -void compute_attention_kernel_prompt( - SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int tokens_prev_requests_squares = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int q_block_size = m->qProjSize; - - int kt_block_size = m->kProjSize; - int kt_req_block_size = kt_block_size * m->num_q_heads * - (BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); - int vt_block_size = m->vProjSize; - int vt_req_block_size = vt_block_size * m->num_q_heads * - (BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // else if (tokens_previous_requests < 
bc->num_generation_tokens) { - // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - // continue; - // } - - // all requests in prompt phase should only have one sub requests; - assert(bc->sub_requests[i] == 1); - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; - - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - - if (num_new_tokens <= 0) { - continue; - } - - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast<DT *>
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); - std::cout << "meta: " << num_new_tokens << ", " << total_tokens << "\n"; - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // print_tensor((float*)C, 32, "C"); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - spec_fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - - // print_tensor((float*)C_softmax, 32, "C_softmax"); - C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; - } - - // assert(tokens_previous_requests == num_tokens); -} - -template -void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - // phase 1: Implement kernel to compute KQV for input tokens - - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); - // phase 2: Update key/val cache - update_kv_cache_kernel<DT>
(m, bc, stream); - // std::cout << "specinfer kernel token num: " << bc->num_generation_tokens - // << ", " << bc->num_tokens << "\n"; - if (bc->num_generation_tokens > 0) { - printf("spec inc generation decoding\n"); - compute_specinfer_attention_kernel_generation
<DT>( - m, bc, static_cast<DT *>
(m->attn_heads), stream); - } - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - if (bc->num_tokens > bc->num_generation_tokens) { - printf("spec inc prompt decoding\n"); - compute_attention_kernel_prompt( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); - } - // compute_attention_kernel_prompt( - // m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); - - // compute output production and bias together for all tokens - int num_tokens = bc->num_active_tokens(); - - // std::cout << "specinfer num tokens: " << num_tokens; - - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); -} - -} // namespace SpecInferIncMultiHeadAttention -} // namespace Kernels - -/*static*/ -void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( - SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; - - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } - - assert(input.data_type == weight.data_type); - assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } - - if (input.data_type == DT_HALF) { - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecInferIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); - } else if (input.data_type == DT_FLOAT) { - float const *bias_ptr = - use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecInferIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); - } else { - assert(false && "Unspported data type"); - } - - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("SpecInferIncMultiHeadSelfAttention forward time = %.2fms\n", - elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); - } - // save_tensor(output.get_float_ptr(), 768 * 3, - // "/home/xinhaoc/FlexFlow/inference/output/fk1.txt"); - // save_tensor(output.get_float_ptr() + 768 * 3, 768 * 3, - // "/home/xinhaoc/FlexFlow/inference/output/fk2.txt"); - - // if(bc->num_tokens == 1){ - // print_tensor(input.get_float_ptr(), 32, "specinc input"); - // print_tensor(output.get_float_ptr(), 32, "specinc output"); - // assert(false); - // } -} - -SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( - FFHandler handler, - SpecInferIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, - MemoryAllocator &gpu_mem_allocator, - int num_samples, - int _num_q_heads, - int _num_kv_heads) - : IncMultiHeadSelfAttentionMeta(handler, - BEAM_SEARCH_MODE, - attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, - attn->scaling_query, - attn->qk_prod_scaling, - attn->position_bias, - attn->final_bias, - attn->scaling_factor, - weight, - gpu_mem_allocator, - num_samples, - attn->num_q_heads, - attn->num_kv_heads, - _num_q_heads, - _num_kv_heads, - DT_NONE, - false) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkCUDNN(cudnnSetStream(handler.dnn, stream)); - - // allocate memory for the seqArray and reserve space - { - // size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; - // size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); - // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - // total_size); - - beam_token_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - - beam_request_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - // causalMask = gpu_mem_allocator.allocate_instance( - // causal_mask_size); - // beam_token_infos = - // gpu_mem_allocator - // .allocate_instance( - // beam_tokeninfo_size); - // offset += beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - // beam_request_infos = - // gpu_mem_allocator - // .allocate_instance( - // beam_requestinfo_size); - // offset += beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - // assert(offset == total_size); - // assert(gpu_mem_allocator.instance_total_size == - 
// gpu_mem_allocator.instance_allocated_size); - } - - cudaStreamSynchronize(stream); -} - -SpecInferIncMultiHeadSelfAttentionMeta::~SpecInferIncMultiHeadSelfAttentionMeta( - void) { - if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { - beam_search_reserve_inst.destroy(); - } -} - -}; // namespace FlexFlow diff --git a/src/ops/tree attn kernel, 0----> -0.029753357172 b/src/ops/tree attn kernel, 0----> -0.029753357172 deleted file mode 100644 index e4f14ee757..0000000000 --- a/src/ops/tree attn kernel, 0----> -0.029753357172 +++ /dev/null @@ -1 +0,0 @@ -tree attn kernel, 0----> -0.02975335717201232910 0.01930358447134494781 0.03780741989612579346 0.11878532171249389648 -0.03523746877908706665 0.02421043440699577332 0.03719477355480194092 -0.00304851122200489044 0.02062662504613399506 0.06683708727359771729 -0.00642335414886474609 -0.00504039414227008820 0.02955199964344501495 0.00648811273276805878 0.00558663159608840942 0.02003456838428974152 -0.04041406139731407166 0.00736814411357045174 -0.04575226455926895142 0.03949077427387237549 0.05742383748292922974 0.04866250604391098022 0.04687267541885375977 -0.00701304525136947632 -0.03712264448404312134 -0.02175992354750633240 -0.03979443758726119995 0.03961737453937530518 -0.07450901716947555542 0.02090370282530784607 -0.03487894684076309204 0.01653470844030380249 \ No newline at end of file diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index a4329f52db..5c6527baf9 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -81,30 +81,22 @@ __global__ void compute_attention_kernel_fused_kernel( int const first_step = 0; - int const tlength = request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - int const qlength = request_infos[batch_config_request_id].num_tokens_in_batch; + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = + request_infos[batch_config_request_id].num_tokens_in_batch; BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; - // bitmask.mask[1] = 3; - // if (head_idx == 0 && tidx == 0) { - // printf("tree attn fused kernel req id %d %d, %d, %d, %lld\n", - // request_idx, - // tlength, - // qlength, - // bitmask.non_tree_cache_size, - // bitmask.mask[3]); - // } - int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { first_token_idx += request_infos[r].num_tokens_in_batch; } - if(tidx == 0 && head_idx == 0){ - printf("tree req: %d, %d\n", request_idx, first_token_idx); - } + // if(tidx == 0 && head_idx == 0){ + // printf("tree req: %d, %d\n", request_idx, first_token_idx); + // } // shared memory objects extern __shared__ char smem_[]; @@ -174,26 +166,11 @@ __global__ void compute_attention_kernel_fused_kernel( (ti >= bitmask.non_tree_cache_size && (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - // if (head_idx == 0 && qi == 9 && mask) { - // printf("tree attn mask for first token %d, %lld, %d, %d, %d\n", - // ti, - // bitmask.mask[ti - bitmask.non_tree_cache_size], - // bitmask.non_tree_cache_size, - // request_idx, - // qi); - // } - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 3 && mask) { - // printf("tree attn mask for third token %d, %lld, %d, %d\n", - // ti, - // bitmask.mask[ti - bitmask.non_tree_cache_size], - // bitmask.non_tree_cache_size, - 
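For reference, the SpecInferIncMultiHeadSelfAttentionMeta constructor above does not allocate separate buffers for the beam metadata: beam_token_infos, beam_request_infos and causalMask are typed views obtained by offsetting into the single handler.batch_config_metadata region by the cumulative sizeof() of the sections that precede them. A standalone C++ sketch of that sub-allocation pattern, with toy structs and section sizes standing in for the real BatchConfig layout:

    #include <cstddef>
    #include <cstdio>

    struct ToyTokenInfo     { int request_index; int abs_depth_in_request; };
    struct ToyRequestInfo   { int first_token_depth_in_request; int num_tokens_in_batch; };
    struct ToyBeamTokenInfo { int sub_request_index; };
    struct ToyBitMask       { unsigned long long mask[4]; int non_tree_cache_size; };

    int main() {
      // One flat metadata buffer, as handler.batch_config_metadata would be.
      alignas(alignof(std::max_align_t)) static char metadata[1 << 12];

      // Each typed view starts where the previous section ends.
      char *base = metadata;
      auto *token_infos = reinterpret_cast<ToyTokenInfo *>(base);
      base += sizeof(ToyTokenInfo) * 8;     // analogue of sizeof(BatchConfig::tokensInfo)
      auto *request_infos = reinterpret_cast<ToyRequestInfo *>(base);
      base += sizeof(ToyRequestInfo) * 4;   // analogue of sizeof(BatchConfig::requestsInfo)
      auto *beam_token_infos = reinterpret_cast<ToyBeamTokenInfo *>(base);
      base += sizeof(ToyBeamTokenInfo) * 8; // analogue of sizeof(...::beamTokenInfo)
      auto *causal_mask = reinterpret_cast<ToyBitMask *>(base);

      printf("tokensInfo +%td, requestsInfo +%td, beamTokenInfo +%td, causalMask +%td\n",
             (char *)token_infos - metadata, (char *)request_infos - metadata,
             (char *)beam_token_infos - metadata, (char *)causal_mask - metadata);
      return 0;
    }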
// qi); - // } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); // if (head_idx == 0 && qi == 0 && !mask) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n ", + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n + // ", // request_idx, // ti, // qk, @@ -250,10 +227,6 @@ __global__ void compute_attention_kernel_fused_kernel( // Compute the sum. exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - // if (head_idx == 0 && tidx == 0 && qi == 9) { - // printf("expsum %.10f\n", exp_sum); - // } - // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { @@ -261,9 +234,6 @@ __global__ void compute_attention_kernel_fused_kernel( } __syncthreads(); - // if (head_idx == 0 && tidx == 0 && qi == 9) { - // printf("softmax %.10f\n", qk_smem[1]); - // } // value projection constexpr int V_VEC_SIZE = 16 / sizeof(DT); @@ -282,12 +252,8 @@ __global__ void compute_attention_kernel_fused_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; - // DT const *v_cache_batch = - // value_cache + - // (beam_request_idx * max_beam_width + beam_sub_request_idx) * - // max_seq_length * hidden_size + - // vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { @@ -310,10 +276,6 @@ __global__ void compute_attention_kernel_fused_kernel( // // Make sure we can start writing to shared memory. __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - // printf("valueX %.10f\n", out.x); - // } - // Run the final reduction amongst the different groups computing different // partial outputs. 
if (Dh == Dh_MAX || vi < Dh) { @@ -391,19 +353,6 @@ __global__ void commit_tokens_kernel( int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; - // if(i == 0){ - // printf("commit token: %d %d %f\n", token_idx_in_last_batch, tok_id, - // kVal); - // } - // if(i == hidden_size){ - // printf("commit token 1: %d %d %f\n", token_idx_in_last_batch, tok_id, - // kVal); - // } - // if(i == 2 * hidden_size){ - // printf("commit token 2: %d %d %f\n", token_idx_in_last_batch, tok_id, - // kVal); - // } - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + offset] = kVal; vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + @@ -500,11 +449,13 @@ __global__ void update_tree_branch_kv_cache_fused( int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; - int const first_token_depth = request_infos[req_id].first_token_depth_in_request; + int const first_token_depth = + request_infos[req_id].first_token_depth_in_request; // if(i % hidden_size == 0){ - // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", req_id, - // token_idx, request_token_offset,(token_idx + first_token_depth - request_token_offset), kVal); + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", + // req_id, token_idx, request_token_offset,(token_idx + first_token_depth + // - request_token_offset), kVal); // } kCache_ptr[req_id * (hidden_size * max_seq_len) + (token_idx + first_token_depth - request_token_offset) * @@ -591,8 +542,6 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, num_new_tokens++; } - std::cout << "num_new_tokens: " << num_new_tokens << "\n"; - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { @@ -873,12 +822,6 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, // update K-V cache int num_new_tokens = bc->num_active_tokens(); int parallelism = m->hidden_size * num_new_tokens; - // printf("update KV cache %d, idx: %d\n", - // num_new_tokens, - // bc->requestsInfo[0].first_token_depth_in_request); - // for (int i = 0; i < num_new_tokens; i++) { - // printf("abs depth:%d\n", bc->tokensInfo[i].abs_depth_in_request); - // } update_tree_branch_kv_cache_fused<<bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
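For reference, commit_tokens_kernel and update_tree_branch_kv_cache_fused above both address the key/value cache as a single flat array laid out as [request][token position][hidden dim], i.e. req_id * (hidden_size * max_seq_len) + position * hidden_size + offset; in the fused tree kernel the position is the token's absolute depth within its request, (token_idx + first_token_depth - request_token_offset), not its index in the batch. A short host-side helper showing the same index arithmetic (kv_cache_index is an illustrative name, not a FlexFlow function):

    #include <cstddef>
    #include <cstdio>

    // Flat offset of (req_id, position, channel) in a cache of shape
    // [num_requests][max_seq_len][hidden_size].
    size_t kv_cache_index(int req_id, int position, int channel,
                          int max_seq_len, int hidden_size) {
      return static_cast<size_t>(req_id) * hidden_size * max_seq_len +
             static_cast<size_t>(position) * hidden_size + channel;
    }

    int main() {
      int const max_seq_len = 8, hidden_size = 4;
      // Request 1, token depth 2, channel 3: 1*4*8 + 2*4 + 3 = 43.
      printf("offset = %zu\n", kv_cache_index(1, 2, 3, max_seq_len, hidden_size));
      return 0;
    }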
(m->bias_ptr); } - // cudaMemcpyAsync(m->token_infos, - // &(bc->tokensInfo), - // bc->num_active_tokens() * - // sizeof(TreeVerifyBatchConfig::PerTokenInfo), - // cudaMemcpyHostToDevice, - // stream); - // cudaMemcpyAsync(m->request_infos, - // &(bc->requestsInfo), - // bc->max_requests_per_batch() * - // sizeof(BatchConfig::PerRequestInfo), - // cudaMemcpyHostToDevice, - // stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -992,9 +923,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, bias_ptr, stream); - // print_tensor((float *)m->devQKVProjArray, 32, "qkvtenor1"); - // print_tensor((float *)m->devQKVProjArray + 768 * (25 * 7) * 3, 32, "qkvtenor2"); - // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( // m, bc, stream); @@ -1037,8 +965,6 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - std::cout << "tree input tokens: " << bc->num_active_tokens() << "\n"; - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); if (use_bias) { @@ -1089,20 +1015,6 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventDestroy(t_start); cudaEventDestroy(t_end); } - - // print_tensor(output.get_float_ptr(), 32, "tree attn kernel"); - - // save_tensor( - // input.get_float_ptr(), - // 768 * bc->num_active_tokens(), - // "/home/xinhaoc/FlexFlow/inference/output/Newtreeinput.txt"); - // save_tensor( - // output.get_float_ptr(), - // 768 * bc->num_active_tokens(), - // "/home/xinhaoc/FlexFlow/inference/output/Newtreeoutput.txt"); - // std::cout << "new tokens: " << bc->num_active_tokens() << "\n"; - - // assert(bc->num_tokens_to_commit == 0); } TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 904bfbcaff..c7b6e1257a 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -154,8 +154,6 @@ std::string get_operator_type_name(OperatorType type) { return "SpecIncMultiHeadSelfAttention"; case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: return "TreeIncMultiHeadSelfAttention"; - case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: - return "SpecInferPgraoIncMultiHeadSelfAttention"; case OP_INPUT: return "Input"; case OP_WEIGHT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 46f7cc0f29..6d33dd9f27 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -51,7 +51,6 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" -#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" @@ -70,7 +69,7 @@ using FlexFlow::MachineView; LegionRuntime::Logger::Category log_graph("graph"); LegionRuntime::Logger::Category log_simplify("graph_simplify"); -Node const Node::INVALID_NODE = Node(); +const Node Node::INVALID_NODE = Node(); Node::Node(void) : guid(0), ptr(NULL) {} @@ -2385,28 +2384,6 @@ GraphOptimalViewSerialized sez.serialize(attn->tensor_parallelism_degree); break; } - case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { - SpecInferIncMultiHeadSelfAttention *attn = - (SpecInferIncMultiHeadSelfAttention *)op; - sez.serialize(attn->layer_guid.id); - sez.serialize(attn->layer_guid.transformer_layer_id); - sez.serialize(attn->layer_guid.model_id); - 
sez.serialize(attn->oProjSize); - sez.serialize(attn->num_q_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); - sez.serialize(attn->dropout); - sez.serialize(attn->qkv_bias); - sez.serialize(attn->final_bias); - sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); - sez.serialize(attn->scaling_query); - sez.serialize(attn->scaling_factor); - sez.serialize(attn->qk_prod_scaling); - sez.serialize(attn->position_bias); - sez.serialize(attn->num_kv_heads); - break; - } case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); @@ -2937,52 +2914,6 @@ void FFModel::deserialize_graph_optimal_view( params); break; } - case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { - assert(num_inputs == 1); - int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; - float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; - size_t id, transformer_layer_id, deserialized_model_id; - dez.deserialize(id); - dez.deserialize(transformer_layer_id); - dez.deserialize(deserialized_model_id); - LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - dez.deserialize(embed_dim); - dez.deserialize(num_q_heads); - dez.deserialize(k_dim); - dez.deserialize(v_dim); - dez.deserialize(dropout); - dez.deserialize(qkv_bias); - dez.deserialize(final_bias); - dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); - dez.deserialize(scaling_query); - dez.deserialize(scaling_factor); - dez.deserialize(qk_prod_scaling); - dez.deserialize(position_bias); - dez.deserialize(num_kv_heads); - - SpecInferIncMultiHeadSelfAttentionParams params; - params.embed_dim = embed_dim; - params.num_q_heads = num_q_heads; - params.kdim = k_dim; - params.vdim = v_dim; - params.dropout = dropout; - params.qkv_bias = qkv_bias; - params.final_bias = final_bias; - params.add_zero_attn = add_zero_attn; - params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; - params.scaling_query = scaling_query; - params.scaling_factor = scaling_factor; - params.qk_prod_scaling = qk_prod_scaling; - params.position_bias = position_bias; - params.num_kv_heads = num_kv_heads; - node = get_or_create_node(inputs[0], - params); - break; - } case OP_TOPK: { node = TopK::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index e7f7c5f52d..52a1efc2ab 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -257,7 +257,6 @@ void InferenceManager::init_operators_inference(FFModel *model) { ((ParallelOp *)op) ->create_input_partition_inference(*model, inputs, outputs); } - printf("init op %s\n", op->name); op->init_inference(*model, inputs, outputs); } } @@ -394,14 +393,13 @@ void InferenceManager::load_input_tokens_from_batch_config( } void InferenceManager::load_inference_metadata_batch_config( - BatchConfigFuture const &bc, - FFHandler *handlers) { + BatchConfigFuture const &bc, FFHandler *handlers) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); + Rect<1> task_rect( + Point<1>(0), Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); // int rank = 0; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 
cf72f2d40b..c3ee73d78c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -59,7 +59,6 @@ #include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" -#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" @@ -94,10 +93,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - ParallelTensor const input1, - ParallelTensor const input2, - ParallelTensor const input3, - ParallelTensor const input4) + const ParallelTensor input1, + const ParallelTensor input2, + const ParallelTensor input3, + const ParallelTensor input4) : Op(model, otype, dtype, @@ -117,10 +116,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - ParallelTensor const _input1, - ParallelTensor const _input2, - ParallelTensor const _input3, - ParallelTensor const _input4) + const ParallelTensor _input1, + const ParallelTensor _input2, + const ParallelTensor _input3, + const ParallelTensor _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling), @@ -1025,9 +1024,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(ParallelTensor const output, +int Op::get_output_to_input_dim_mapping(const ParallelTensor output, int output_dim, - ParallelTensor const input) { + const ParallelTensor input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1060,9 +1059,9 @@ int Op::get_output_to_input_dim_mapping(ParallelTensor const output, return -1; } -int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, +int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, int output_dim, - ParallelTensor const weight) { + const ParallelTensor weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1659,7 +1658,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - ParallelDim const dims[], + const ParallelDim dims[], DataType data_type, Op const *op, int idx, @@ -1692,7 +1691,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - ParallelDim const dims[], + const ParallelDim dims[], DataType data_type, Op const *op, int idx, @@ -1742,7 +1741,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], +ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1823,7 +1822,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], +ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1854,7 +1853,7 @@ ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - ParallelDim const dims[], + const ParallelDim dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1874,7 +1873,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, 
ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - ParallelDim const dims[], + const ParallelDim dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -2088,7 +2087,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - Tensor const tensor, ParallelTensor ¶llel_tensor) const { + const Tensor tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -2125,7 +2124,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - ParallelDim const dims[], + const ParallelDim dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2148,7 +2147,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - ParallelDim const dims[], + const ParallelDim dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2181,7 +2180,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - ParallelDim const dims[], + const ParallelDim dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -2205,7 +2204,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - ParallelDim const dims[], + const ParallelDim dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -2242,7 +2241,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(ParallelTensor const tensor, +void FFModel::create_disjoint_partition(const ParallelTensor tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2290,7 +2289,7 @@ void FFModel::create_disjoint_partition(ParallelTensor const tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - ParallelTensor const tensor, + const ParallelTensor tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2672,7 +2671,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { +IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -3039,12 +3038,6 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } - case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { - Op *op = SpecInferIncMultiHeadSelfAttention::create_operator_from_layer( - *this, layer, inputs); - operators.push_back(op); - return op; - } case OP_BATCHMATMUL: { Op *op = BatchMatmul::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3234,7 +3227,7 @@ Op *FFModel::create_operator_from_layer( } void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; + std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; @@ -3980,38 +3973,38 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - static int const epochs = 1; + const static int epochs = 1; // const static int iterations = 1; - static int 
const batchSize = 64; - static bool const profiling = false; - static bool const inference_debugging = false; + const static int batchSize = 64; + const static bool profiling = false; + const static bool inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB - static int const numNodes = 1; - static int const workersPerNode = 0; - static int const cpusPerNode = 0; - static size_t const searchBudget = -1; - static size_t const simulatorWorkSpaceSize = + const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB + const static int numNodes = 1; + const static int workersPerNode = 0; + const static int cpusPerNode = 0; + const static size_t searchBudget = -1; + const static size_t simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; - static bool const searchOverlapBackwardUpdate = false; - static size_t const offloadReserveSpaceSize = + const static bool searchOverlapBackwardUpdate = false; + const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB - static bool const cpuOffload = false; - static bool const onlyDataParallel = true; - static bool const enableSampleParallel = true; - static bool const enableParameterParallel = false; - static bool const enableAttributeParallel = false; - static bool const enableInplaceOptimizations = false; - static bool const allowTensorOpMathConversion = false; - static int const machine_model_version = 0; - static int const simulator_segment_size = 16777216; // 16 MB - static int const simulator_max_num_segments = 1; - static int const base_optimize_threshold = 10; - static bool const enable_control_replication = true; + const static bool cpuOffload = false; + const static bool onlyDataParallel = true; + const static bool enableSampleParallel = true; + const static bool enableParameterParallel = false; + const static bool enableAttributeParallel = false; + const static bool enableInplaceOptimizations = false; + const static bool allowTensorOpMathConversion = false; + const static int machine_model_version = 0; + const static int simulator_segment_size = 16777216; // 16 MB + const static int simulator_max_num_segments = 1; + const static int base_optimize_threshold = 10; + const static bool enable_control_replication = true; // The default python data loader type is 2 to enable control replication - static int const python_data_loader_type = 2; + const static int python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -6233,44 +6226,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } - { - TaskVariantRegistrar registrar( - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - "SpecInferIncMultiHeadSelfAttention Init"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - OpMeta *, - SpecInferIncMultiHeadSelfAttention::init_task>( - registrar, "SpecInferIncMultiHeadSelfAttention Init Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant< - OpMeta *, - SpecInferIncMultiHeadSelfAttention::init_task>(registrar); - } - } - { - TaskVariantRegistrar registrar( - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - "SpecInferIncMultiHeadSelfAttention Inference"); - 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - SpecInferIncMultiHeadSelfAttention::inference_task>( - registrar, "SpecInferIncMultiHeadSelfAttention Inference Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant< - SpecInferIncMultiHeadSelfAttention::inference_task>(registrar); - } - } // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index b51ab83091..5499a280a8 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -152,7 +152,7 @@ FFHandler .wait(); handle.offload_reserve_space = workspaceInst.pointer_untyped(0, sizeof(char)); - }else { + } else { handle.offload_reserve_space = nullptr; } if (handle.batch_config_metadata_size > 0) { @@ -176,7 +176,7 @@ FFHandler .wait(); handle.batch_config_metadata = workspaceInst.pointer_untyped(0, sizeof(char)); - }else { + } else { handle.batch_config_metadata = nullptr; } // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize)); diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 523b3c76f3..c885b29db2 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -148,11 +148,10 @@ FFHandler .wait(); handle.offload_reserve_space = workspaceInst.pointer_untyped(0, sizeof(char)); - }else { + } else { handle.offload_reserve_space = nullptr; } if (handle.batch_config_metadata_size > 0) { - printf("allocate instance for metadata %d\n", handle.batch_config_metadata_size); // allocate memory for offload reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -173,7 +172,7 @@ FFHandler .wait(); handle.batch_config_metadata = workspaceInst.pointer_untyped(0, sizeof(char)); - }else { + } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e30a7ee478..89d4ddaed4 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -182,7 +182,7 @@ size_t RequestManager::get_num_ssms() { RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, int max_sequence_length) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; @@ -238,7 +238,7 @@ RequestManager::RequestGuid RequestManager::RequestGuid RequestManager::register_new_request(std::string const &prompt, int max_sequence_length) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; @@ -296,7 +296,7 @@ RequestManager::RequestGuid } bool RequestManager::is_request_completed(RequestGuid const &guid) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); assert(all_requests.find(guid) != all_requests.end()); Request const &request = all_requests[guid]; // return request.tokens.size() >= request.max_sequence_length; @@ -305,7 +305,7 @@ bool RequestManager::is_request_completed(RequestGuid const &guid) { GenerationResult RequestManager::get_generation_result(RequestGuid const &guid) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); assert(request_generation_results.find(guid) != request_generation_results.end()); return 
request_generation_results[guid]; @@ -343,7 +343,7 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -456,7 +456,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase @@ -504,7 +504,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_request.max_sequence_length; new_bc.request_completed[i] = false; num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; profile_info.llm_decoding_steps = 1; @@ -566,7 +566,7 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, InferenceResult const &result, int model_id) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); if (verbose) { std::cout << "\n############### prepare_next_batch_init ###############\n"; } @@ -603,11 +603,10 @@ BeamSearchBatchConfig } else { committed_tokens[guid].clear(); } - // iterate through all the tokens that belong to request i int root_abs_depth = request.tokens.size() - 1; - + while (result_index < old_bc.num_tokens && old_bc.tokensInfo[result_index].request_index == i) { int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; @@ -640,14 +639,12 @@ BeamSearchBatchConfig } if (request.status == Request::RUNNING) { - std::cout << "verify running: " << dfs_tree_inputs.at(guid).size() << ", " - << tree_outputs.size() << "\n"; std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); + verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -729,9 +726,6 @@ BeamSearchBatchConfig } else { // Request not finished, pass verified_tokens to next iteration - std::cout << "parse to next iteration: " - << "\n"; - new_bc.request_completed[i] = false; new_bc.request_running[i] = true; num_active_req++; @@ -745,18 +739,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); - // std::cout << "max depth: " << new_max_depth << ", " - // << new_bc.requestsInfo[i].first_token_depth_in_request << - // ", " - // << verified_tokens.size() << "\n"; - // assert(false); 
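For reference, each RequestManager entry point above (register_new_request, is_request_completed, get_generation_result, prepare_next_batch, ...) takes a std::lock_guard on request_queue_mutex before touching shared request state, so the mutex is released automatically on every return path. A minimal standalone sketch of the same guard pattern; ToyRequestManager and its members are illustrative, not the FlexFlow class:

    #include <cstdio>
    #include <mutex>
    #include <unordered_map>
    #include <vector>

    class ToyRequestManager {
    public:
      using RequestGuid = unsigned long long;

      RequestGuid register_new_request(std::vector<int> const &prompt) {
        const std::lock_guard<std::mutex> lock(request_queue_mutex);
        RequestGuid guid = next_available_guid++;
        all_requests[guid] = prompt;
        return guid; // lock released here, on every return path
      }

      bool has_request(RequestGuid guid) {
        const std::lock_guard<std::mutex> lock(request_queue_mutex);
        return all_requests.find(guid) != all_requests.end();
      }

    private:
      std::mutex request_queue_mutex;
      RequestGuid next_available_guid = 1000000;
      std::unordered_map<RequestGuid, std::vector<int>> all_requests;
    };

    int main() {
      ToyRequestManager rm;
      auto guid = rm.register_new_request({101, 42, 7});
      printf("request %llu registered: %d\n", guid, rm.has_request(guid) ? 1 : 0);
      return 0;
    }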
new_bc.beamRequestsInfo[i].current_depth = 1; profiling_requests[request.guid].ssm_decoding_steps = 0; @@ -794,9 +783,6 @@ BeamSearchBatchConfig // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; - // std::cout << "num_gen ++ " - // << "\n"; - // num_generation_tokens++; // Add verified token to request's token list request.tokens.push_back(token.first); @@ -816,11 +802,6 @@ BeamSearchBatchConfig log_req_mgr.print("Output: %s", output.c_str()); } - // if (request.tokens.size() > 19 && i >= 7) { - // std::cout << request.tokens.size() << "\n"; - // assert(false); - // } - } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; @@ -838,7 +819,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; @@ -889,7 +870,7 @@ BeamSearchBatchConfig (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; @@ -973,17 +954,12 @@ BeamSearchBatchConfig } new_bc.num_generation_tokens = num_generation_tokens; - std::cout << "prepare next batch init gen tokens: " - << new_bc.num_generation_tokens << "\n"; - if (verbose) { std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" << std::endl; old_bc.print(); new_bc.print(); } - std::cout << "prepare next batch init active tokens: " - << new_bc.num_tokens << "\n"; return new_bc; } @@ -1019,11 +995,11 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { - std::lock_guard const lock(request_queue_mutex); - if (true) { + const std::lock_guard lock(request_queue_mutex); + if (verbose) { std::cout << "\n############### prepare_next_batch_beam ###############\n"; } - if (true) { + if (verbose) { std::cout << "print all results" << "\n"; for (int i = 0; i < 40; i++) { @@ -1049,7 +1025,7 @@ BeamSearchBatchConfig if (old_bc.request_completed[i] || !old_bc.request_running[i]) { continue; } - num_active_req ++; + num_active_req++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); @@ -1092,13 +1068,6 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].sub_request_num * old_bc.beamRequestsInfo[i].beam_size; - std::cout << "oldbc : " << old_bc.beamRequestsInfo[i].sub_request_num - << ", " << old_bc.beamRequestsInfo[i].beam_size << "\n"; - - // if (old_bc.beamRequestsInfo[i].current_depth == 3) { - // assert(false); - // } - assert(new_bc.beamRequestsInfo[i].sub_request_num <= BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && "exceed maximum nodes per layer"); @@ -1122,7 +1091,7 @@ BeamSearchBatchConfig request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { - // todo check it + // todo this is replaced by this_layer_size, but should 
check it new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { assert(false && "Request should be done"); @@ -1150,18 +1119,7 @@ BeamSearchBatchConfig memcpy(&new_bc.causalMask[i], &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); - // sub_request_num -> nodes of input next iteration - // beam_size replicate num - - std::cout << "print beam tree: " - << old_bc.beamRequestsInfo[i].current_depth << "\n"; BeamTree tree = request.beam_trees[old_bc.model_id]; - // for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { - // std::cout << "layer: " << k << "\n"; - // std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer - // << "\n"; - // } - std::cout << "append bit mask: "<< i << "\n"; appendBitMask(new_bc.causalMask[i], new_bc.beamRequestsInfo[i].sub_request_num, old_bc.beamRequestsInfo[i].beam_size, @@ -1185,9 +1143,6 @@ BeamSearchBatchConfig num_generation_tokens++; } } - // if(new_bc.beamRequestsInfo[i].current_depth >= 3 && i > 0){ - // assert(false); - // } } } @@ -1320,18 +1275,6 @@ BeamSearchBatchConfig old_bc.print(); new_bc.print(); } - - if (true) { - // std::cout << "print all resultsBBB" - // << "\n"; - // for (int i = 0; i < 40; i++) { - // std::cout << result.token_ids[i] << ", "; - // } - // std::cout << "Current Beam DepthBBB: " - // << old_bc.beamRequestsInfo[0].current_depth << "\n"; - } - std::cout << "prepare next batch beam total tokens: " << new_bc.num_tokens - << "gneration tokens: " << new_bc.num_generation_tokens << "\n"; return new_bc; } @@ -1366,7 +1309,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task( TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); std::cout << "\n############### prepare_next_batch_verify ###############\n"; @@ -1399,12 +1342,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { - - std::cout << "prepare next batch running:\n" - << "\n"; new_bc.request_running[i] = true; - std::cout << "[Verify] Request " << request.guid << " is running" - << std::endl; // Get the dfs tree std::vector>> @@ -1419,12 +1357,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector> dfs_tree_inputs = merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - if (true) { - // std::cout << "Request Tokens Size: " << request.tokens.size() - // << std::endl; - // for (int k = 0; k < request.tokens.size(); k++) { - // std::cout << k << ": " << request.tokens[k] << std::endl; - // } + if (verbose) { + std::cout << "Request Tokens Size: " << request.tokens.size() + << std::endl; + for (int k = 0; k < request.tokens.size(); k++) { + std::cout << k << ": " << request.tokens[k] << std::endl; + } } // Normal Request Info @@ -1435,31 +1373,21 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // copy bitmask to verify batchconfig memcpy(&(new_bc.causalMask[i]), &(old_batches.at(0).causalMask[i]), sizeof(BatchConfig::BitMask)); - // std::cout << "bitmask: " << new_bc.causalMask[i].mask[0] << "\n"; - // assert(false); // TODO: Check this 
new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; - std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " - << new_bc.causalMask[i].tree_size << ", " - << new_bc.causalMask[i].non_tree_cache_size << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) - << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[1]) - << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[2]) - << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[3]) - << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[4]) - << "\n"; + // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " + // << new_bc.causalMask[i].tree_size << ", " + // << new_bc.causalMask[i].non_tree_cache_size << "\n"; + // std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) + // << "\n"; // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { @@ -1473,7 +1401,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; - if (true) { + if (verbose) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << committed_token.first @@ -1485,7 +1413,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // } } } - if (true) { + if (verbose) { std::cout << "new_bc.num_tokens_to_commit: " << new_bc.num_tokens_to_commit << std::endl; } @@ -1508,14 +1436,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_depth_in_request = request.tokens.size() - 1; - std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() - << "\n"; - bool cutLayer = false; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); - if (true) { + if (verbose) { std::cout << "[" << j << "] Token: " << token.first << ", Depth:" << token.second << std::endl; } @@ -1541,7 +1466,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( for (int j = total_tokens - 1; j >= 1; j--) { new_bc.num_tokens--; new_bc.requestsInfo[i].num_tokens_in_batch--; - std::cout << "cut: " << j << "\n"; + // std::cout << "cut: " << j << "\n"; if (new_bc.tokensInfo[j].abs_depth_in_request != new_bc.tokensInfo[j - 1].abs_depth_in_request) { break; @@ -1550,8 +1475,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else if (request.status == Request::PENDING) { - std::cout << "prepare next batch verify: pending\n" - << "\n"; new_bc.request_running[i] = false; if (verbose) { std::cout << "[Verify] Request " << request.guid @@ -1583,8 +1506,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( memcpy(&(new_bc.causalMask[i]), &(old_batches.at(0).causalMask[i]), sizeof(BatchConfig::BitMask)); - // std::cout << "bitmask: " << new_bc.causalMask[i].mask[0] << "\n"; - // assert(false); // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = @@ -1594,7 +1515,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; @@ -1608,9 
+1529,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << std::endl; if (request.llm_cache_size < request.initial_len) { - std::cout << "Initialization (prompt) phase: " - << new_bc.requestsInfo[i].num_tokens_in_batch << ", " - << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; + // std::cout << "Initialization (prompt) phase: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << ", " + // << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1618,8 +1539,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens[request.llm_cache_size + j]; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; - std::cout << "load prompt tokens: " << j << ": " - << new_bc.tokensInfo[new_bc.num_tokens].token_id << "\n"; new_bc.num_tokens++; } @@ -1645,8 +1564,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else { // launch the request into running phase after loading all prompt if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { - std::cout << "Initialization running phase: " - << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; + // std::cout << "Initialization running phase: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; new_bc.request_running[i] = true; @@ -1671,11 +1590,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - std::cout << "how many tokens in verify? " << new_bc.num_tokens << "\n"; - - std::cout << "check dfs tree input size: " << dfs_tree_inputs[1000000].size() - << "\n"; - return new_bc; } @@ -1690,7 +1604,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request; int result_index = 0; - if (true) { + if (verbose) { std::cout << "Store total of " << old_bc.num_tokens << " tokens in the current batch.\n"; } @@ -1700,10 +1614,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != guid) { - std::cout << "i is: " << i << "old guid" << guid << " new guid" - << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] - .request_guid - << "\n"; + // std::cout << "i is: " << i << "old guid" << guid << " new guid" + // << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] + // .request_guid + // << "\n"; int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; @@ -1718,22 +1632,16 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // Count tokens sent to model in this request to find the final token's // index - - std::cout << "previous result index: " << result_index; - result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_size; - std::cout << "after result index: " << result_index; - - // if (true) { - // std::cout << "i = " << i << ", result index = " << result_index - // << ", value: " << result.token_ids[result_index] - // << ", leaf node num: " << leaf_node_num << ", depth" << - // depth - // << ", beam size: " << beam_size << "\n"; - // } + if (verbose) { + std::cout << "i = " << i << ", result index = " << result_index + << ", value: " << result.token_ids[result_index] + << ", leaf node num: " << leaf_node_num << ", depth" << 
depth + << ", beam size: " << beam_size << "\n"; + } Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; @@ -1743,7 +1651,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, if (depth == 1) { // store the last input into the tree; - if (true) { + if (verbose) { std::cout << "try to store the input" << "\n"; } @@ -1756,13 +1664,11 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, .treeLayers[0] .nodes_num_this_layer = 1; - if (true) { + if (verbose) { std::cout << "Store the previous last token to the tree root: " << request.tokens.back() << "\n"; } } - - std::cout << "leaffffff: " << leaf_node_num << "\n"; request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .nodes_num_this_layer = leaf_node_num; @@ -1777,27 +1683,20 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .parent_ids[beam_id] = result.parent_id[result_index]; - // std::cout << "??????? beam id: " << beam_id << ", token: " - // << request.beam_trees.at(old_bc.model_id) - // .treeLayers[depth] - // .tokens[beam_id] - // << "\n"; - - // if (true) { - // std::cout << "tree value: " << depth << "token: " - // << request.beam_trees.at(old_bc.model_id) - // .treeLayers[depth] - // .tokens[beam_id] - // << "result tokens: " << result.token_ids[result_index]; - // } + + if (verbose) { + std::cout << "tree value: " << depth << "token: " + << request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .tokens[beam_id] + << "result tokens: " << result.token_ids[result_index]; + } result_index += 1; } // update the guid and start_depth for current request if (i < old_bc.num_tokens) { int new_req_idx = old_bc.tokensInfo[i].request_index; guid = old_bc.requestsInfo[new_req_idx].request_guid; - std::cout << "update guid: " << guid << ", request idx: " << index - << "\n"; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } @@ -1839,8 +1738,8 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, tree.treeLayers[depth].probs[j]; new_bc.beamRequestsInfo[request_index].tokens[j] = tree.treeLayers[depth].tokens[j]; - std::cout << "token: " << j << ": " - << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; + // std::cout << "token: " << j << ": " + // << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; } } if (verbose) { @@ -1892,13 +1791,13 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, "do not support tree size > 64"); assert(initLength >= 1 && "verified token num should >= 1"); - std::cout << "non tree size: " << non_tree_size << ", " - << bitmask.non_tree_cache_size << "\n"; + // std::cout << "non tree size: " << non_tree_size << ", " + // << bitmask.non_tree_cache_size << "\n"; bitmask.non_tree_cache_size = non_tree_size + initLength - 1; bitmask.tree_size = 1; bitmask.this_layer_size = initLength; - std::cout << "non_tree_size: " << non_tree_size << "\n"; + // std::cout << "non_tree_size: " << non_tree_size << "\n"; bitmask.prompt_size = 1; for (int i = 0; i < bitmask.prompt_size; i++) { for (int j = i; j < bitmask.prompt_size; j++) { @@ -1906,13 +1805,9 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, } } - std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; - std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) - << "\n"; - std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[1]) - << "\n"; - std::cout << "see bit mask update" << 
std::bitset<64>(bitmask.mask[2]) - << "\n"; + // std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) + // << "\n"; } // prepare next beam, append layers to the tree @@ -1987,16 +1882,10 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, // assert(false); // } - std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; - std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; - std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) - << "\n"; - std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[1]) - << "\n"; - std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[2]) - << "\n"; - std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[3]) - << "\n"; + // std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; + // std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) + // << "\n"; } bool PreOrder( @@ -2084,7 +1973,7 @@ std::vector> // depth) pairs for (auto const &pair : inputSerializedTree) { oss << " " << pair.second << ":" << pair.first; - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); } log_req_mgr.print("Input tree:%s", oss.str().c_str()); } @@ -2093,7 +1982,7 @@ std::vector> // outputSerializedTree is an array of (token id, depth + 1) pairs std::ostringstream oss; for (auto const &pair : outputSerializedTree) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); oss << " " << pair.second << ":" << pair.first; } log_req_mgr.print("Output tree:%s", oss.str().c_str()); @@ -2130,6 +2019,7 @@ std::vector> } // to avoid branch switch when same tokens in input tree. 
+ // todo, only checked for N->1->1->1 cases bool findFirst = false; layer_num = -1; @@ -2173,9 +2063,10 @@ std::vector> new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); // at this point, you'll not go other branches - std::cout << "verify tree push back: " << output.first - << ", tree size is: " << verifiedTree.size() - << ", ??: " << input.first << ", " << input.second << "\n"; + // std::cout << "verify tree push back: " << output.first + // << ", tree size is: " << verifiedTree.size() + // << ", ??: " << input.first << ", " << input.second << + // "\n"; } else { printf("not correct slot\n"); @@ -2190,9 +2081,9 @@ std::vector> committed_tokens.at(guid).at(i).second)); // // at this point, you'll not go other branches - std::cout << "verify tree push back: " << output.first - << ", tree size is: " << verifiedTree.size() - << ", ??: " << input.first << ", " << input.second << "\n"; + // std::cout << "verify tree push back: " << output.first + // << ", tree size is: " << verifiedTree.size() + // << ", ??: " << input.first << ", " << input.second << "\n"; } assert(committed_tokens.at(guid).at(i).first == input.second); @@ -2203,7 +2094,7 @@ std::vector> // log_req_mgr.print("========Verified============"); std::ostringstream oss; for (auto const &pair : verifiedTree) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); oss << " " << pair.second << ":" << pair.first; } log_req_mgr.print("Verified:%s", oss.str().c_str()); @@ -2225,7 +2116,7 @@ std::vector> RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, int first_token_depth_in_request) { - if (true) { + if (verbose) { std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; std::cout << "[Traverse Beam Tree] max_depth: " @@ -2269,13 +2160,13 @@ std::vector> // verbose); // print it - if (true) { + if (verbose) { std::cout << "Print serialized tree: size:" << request_index << serializedTree.size() << "\n"; } for (int k = 0; k < serializedTree.size(); k++) { serializedTree.at(k).second += first_token_depth_in_request; - if (true) { + if (verbose) { std::cout << "token id: " << serializedTree.at(k).first << ", depth: " << serializedTree.at(k).second << "\n"; } @@ -2354,9 +2245,6 @@ std::vector> } dfs_tree_inputs[guid] = merged_tree; - // std::cout << "assign dfr tree: " << guid << ", " << merged_tree.size() << - // ", " - // << dfs_tree_inputs[guid].size() << "\n"; return merged_tree; } diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index bb6b6030aa..bb20fb263f 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -212,7 +212,6 @@ void RequestManager::load_batch_config_task( } // add a size check - std::cout << "hahaha handle.batch_config_metadata_size: " << handle.batch_config_metadata_size << ", "<< total_copy_size << "\n"; assert(total_copy_size <= handle.batch_config_metadata_size); } From b621f2a9f62f24a8112df7af3850dc3bdb494dc7 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 17:25:28 -0500 Subject: [PATCH 283/344] . 
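Not part of this commit: the hunks below raise spec_infer's default serving limits (16 requests per batch, 256 tokens per batch, 1024-token sequences) and register a speculative tree width of 3 for the first decoding step. A minimal sketch of how a client would apply the same settings through the RequestManager API declared later in this series (include/flexflow/request_manager.h); the singleton accessor RequestManager::get_request_manager() and the set_max_requests_per_batch() setter are assumptions here, the remaining calls appear verbatim in these patches:

    // hypothetical configuration sketch -- mirrors the new defaults introduced below
    RequestManager *rm = RequestManager::get_request_manager(); // assumed accessor
    rm->set_max_requests_per_batch(16);   // assumed setter, matches new default
    rm->set_max_tokens_per_batch(256);
    rm->set_max_sequence_length(1024);
    // one entry per speculative decoding step; the first step explores 3 branches
    rm->push_spec_infer_tree_width(3);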
--- inference/spec_infer/spec_infer.cc | 10 +++++----- src/runtime/cuda_helper.cu | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 258b2d78eb..b369a13c1d 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -266,9 +266,9 @@ void FlexFlow::top_level_task(Task const *task, ModelMeta model_metadata; bool use_full_precision = false; bool verbose = false; - int max_requests_per_batch = 10; - int max_tokens_per_batch = 199; - int max_sequence_length = 200; + int max_requests_per_batch = 16; + int max_tokens_per_batch = 256; + int max_sequence_length = 1024; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -302,7 +302,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); - //first decoding step: 3 results + // first decoding step: 3 results rm->push_spec_infer_tree_width(3); // Create LLM model @@ -402,7 +402,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); + // tree_model.generate(text, 128 /*max_sequence_length*/); } tree_model.generate(prompts, 128 /*max_sequence_length*/); } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 398ed7f3cd..fa6bf55fe5 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -226,7 +226,7 @@ __host__ void print_tensor(T const *ptr, printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { printf(" %.20lf", (float)host_ptr[idx]); - if (idx >= 200) { + if (idx >= 100) { break; } } From 8a0b007bfe20b50302ad201c01c7ac1dfb30a25a Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 18:49:19 -0500 Subject: [PATCH 284/344] load batchconfig --- src/ops/inc_multihead_self_attention.cpp | 4 ++-- src/runtime/inference_manager.cc | 9 ++++----- src/runtime/model.cpp | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index a59740f4a3..00cc4d8868 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1106,7 +1106,7 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( float const *weight_ptr, float const *bias_ptr, int num_tokens, - cudaStream_t stream); + hipStream_t stream); template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -1115,6 +1115,6 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( half const *weight_ptr, half const *bias_ptr, int num_tokens, - cudaStream_t stream); + hipStream_t stream); }; // namespace FlexFlow diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 52a1efc2ab..8af0ed8978 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -398,11 +398,10 @@ void InferenceManager::load_inference_metadata_batch_config( Runtime *runtime = ff_config.lg_hlr; ArgumentMap argmap; - Rect<1> task_rect( - Point<1>(0), Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + Domain domain = + runtime->get_index_space_domain(ctx, 
ff_config.all_gpu_task_is); + Rect<1> task_rect = domain; - // int rank = 0; int idx = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { FFHandler handler = handlers[idx++]; @@ -410,7 +409,7 @@ void InferenceManager::load_inference_metadata_batch_config( } IndexLauncher launcher(RM_LOAD_BATCH_CONFIG_TASK_ID, - task_is, + ff_config.all_gpu_task_is, TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 5499a280a8..ad2b781567 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -152,7 +152,7 @@ FFHandler .wait(); handle.offload_reserve_space = workspaceInst.pointer_untyped(0, sizeof(char)); - } else { + } else { handle.offload_reserve_space = nullptr; } if (handle.batch_config_metadata_size > 0) { From 17a718f95523ed3892d0324ed493ef6043607b13 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 19:18:22 -0500 Subject: [PATCH 285/344] clean --- .../inc_multihead_self_attention_utils.cuh | 4 +- src/ops/argmax.cc | 1 - src/ops/beam_topk.cc | 2 - src/ops/inc_multihead_self_attention.cu | 7 +- src/ops/spec_inc_multihead_self_attention.cu | 111 ++++++------------ src/ops/tree_inc_multihead_self_attention.cu | 13 +- 6 files changed, 49 insertions(+), 89 deletions(-) diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index 1b21a80dc9..c128c1a126 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -456,7 +456,7 @@ inline size_t smem_size_in_bytes(int hidden_size_per_head, int threads_per_block) { // The amount of shared memory needed to store the Q*K^T values in float. - size_t qk_sz = div_up(2000 + 1, 4) * 16; + size_t qk_sz = div_up(max_sequence_length + 1, 4) * 16; size_t logits_sz = qk_sz; // The total size needed during softmax. @@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, } // todo fix this - int max_qk_length = max_query_length * max_total_length + 1000; + int max_qk_length = max_query_length * max_total_length; // The amount of shared memory needed to store the Q*K^T values in float. 
size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index c3bb3d493e..dc7e4ea3b3 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -352,7 +352,6 @@ BeamInferenceResult GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - BeamInferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 87d357b535..18d0ec1587 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -398,8 +398,6 @@ BeamInferenceResult download_tensor( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); - // print_tensor(index_ptr, 32, "indexxxxxxx"); - if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cca0b230c3..da70e23f87 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1381,7 +1381,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(false && "Unkown inference mode"); } size_t requestinfo_size = BatchConfig::max_requests_per_batch(); - size_t tokeninfo_size = max_tokens_per_batch; + // size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; @@ -1438,8 +1438,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( token_infos = static_cast(handler.batch_config_metadata); - request_infos = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo)); + request_infos = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo)); if (offload) { // token_infos = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b3a87fe244..88dd3f92e4 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -82,29 +82,20 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( int const first_step = 0; - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; + // int const tlength = + // request_infos[batch_config_request_id].first_token_depth_in_request + + // request_infos[batch_config_request_id].num_tokens_in_batch; int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("spec inc attn fused kernel %d, %d\n", - // totalCacheSize, - // request_infos[batch_config_request_id].num_tokens_in_batch); - // } - // int const qlength = request_infos[request_idx].num_tokens_in_batch; - int const tree_branch_num = - beam_request_infos[batch_config_request_id].sub_request_num; - - // will decode qlength tokens in this thread block - // int const qlength = tree_branch_num; - int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { first_token_idx += causalMask[r].this_layer_size; } + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; + // shared memory objects extern __shared__ char smem_[]; @@ -338,20 +329,14 @@ 
__global__ void spec_inc_store_kv_cache( DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const first_token_in_req = - requestInfo[req_id].first_token_depth_in_request; - int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const total_token = requestInfo[req_id].num_tokens_in_batch; + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; int const request_token_offset = requestInfo[req_id].first_token_offset_in_batch; BatchConfig::BitMask bitmask = causalMask[req_id]; - int const sub_request_num = beamRequestInfos[req_id].sub_request_num; - - int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; + // int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - // tree_branch_num + sub_req_id + tok_id; @@ -379,9 +364,9 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; spec_inc_store_kv_cache<<>>( + min(CUDA_NUM_THREADS, parallelism), + 0, + stream>>>( static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->keyCache), static_cast<DT *>
(m->valueCache), @@ -401,19 +386,19 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } } -#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ +#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_sz = smem_size_in_bytes
(m->qProjSize, \ BatchConfig::max_sequence_length() + \ BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ THREADS_PER_VALUE, \ THDS_PER_BLOCK); \ - compute_spec_inc_attention_kernel_generation_kernel \ + compute_spec_inc_attention_kernel_generation_kernel \ <<>>( \ static_cast
<DT *>(m->devQKVProjArray), \ static_cast<DT *>
(m->keyCache), \ @@ -470,14 +455,13 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel_prompt( - SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { +void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); @@ -812,8 +796,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); - printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", - elapsed); + printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); @@ -860,51 +843,29 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - // size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; - // size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); - // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - // total_size); - beam_token_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); beam_request_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + + causalMask = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - // causalMask = gpu_mem_allocator.allocate_instance( - // causal_mask_size); - // beam_token_infos = - // gpu_mem_allocator - // .allocate_instance( - // beam_tokeninfo_size); - // offset += beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - // beam_request_infos = - // gpu_mem_allocator - // .allocate_instance( - // beam_requestinfo_size); - // offset += beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - // assert(offset == total_size); - // assert(gpu_mem_allocator.instance_total_size == - // gpu_mem_allocator.instance_allocated_size); } cudaStreamSynchronize(stream); } -SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta( - void) { +SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { beam_search_reserve_inst.destroy(); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu 
b/src/ops/tree_inc_multihead_self_attention.cu index 5c6527baf9..b4af80976f 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -445,7 +445,7 @@ __global__ void update_tree_branch_kv_cache_fused( DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; @@ -1059,12 +1059,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - causalMask = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); + causalMask = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); committed_token_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BatchConfig::causalMask)); } From c8a107b1b75e5c90a9c7329ab2618b940a4b260f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 19:19:45 -0500 Subject: [PATCH 286/344] hip --- src/ops/inc_multihead_self_attention.cpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 00cc4d8868..d60386f927 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1098,23 +1098,4 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int num_tokens, - hipStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - half *output_ptr, - half const *weight_ptr, - half const *bias_ptr, - int num_tokens, - hipStream_t stream); - }; // namespace FlexFlow From 42e1b5d92cf3e93e3f56d3d18d3fb68803b6caaf Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 20:42:49 -0500 Subject: [PATCH 287/344] hip --- src/runtime/request_manager.cpp | 95 +++++++++++++++++--- src/runtime/request_manager.cu | 154 +++++++++----------------------- 2 files changed, 123 insertions(+), 126 deletions(-) diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index 9635b3bc1e..fadbf80d6d 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -56,22 +56,91 @@ void RequestManager::load_tokens_task( sizeof(TokenId) * batch_config->num_tokens, hipMemcpyHostToDevice, stream)); +} + +void RequestManager::load_batch_config_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 0); + assert(task->regions.size() == 0); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = 
BatchConfig::from_future(task->futures[0]); // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - cudaMemcpyAsync(handle.batch_config_metadata, - &(batch_config->tokensInfo), - batch_config->num_active_tokens() * - sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo), - &(batch_config->requestsInfo), - batch_config->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); + size_t total_copy_size = 0; + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + sizeof(BatchConfig::tokensInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::tokensInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->requestsInfo), + sizeof(BatchConfig::requestsInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::requestsInfo); + + // load speculative metadata + if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + BeamSearchBatchConfig const *beam_batch_config = + static_cast(batch_config); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + hipMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + hipMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::causalMask); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + TreeVerifyBatchConfig const *tree_batch_config = + static_cast(batch_config); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + } + + // add a size check + assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index bb20fb263f..51c52c3026 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -56,78 +56,6 @@ void RequestManager::load_tokens_task( sizeof(TokenId) * batch_config->num_tokens, cudaMemcpyHostToDevice, stream)); - - // // copy meta data to workSpace - // FFHandler handle = *((FFHandler const *)task->local_args); - // size_t total_copy_size = 0; - // cudaMemcpyAsync(handle.batch_config_metadata, - // &(batch_config->tokensInfo), - // sizeof(BatchConfig::tokensInfo), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += 
sizeof(BatchConfig::tokensInfo); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(batch_config->requestsInfo), - // sizeof(BatchConfig::requestsInfo), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(BatchConfig::requestsInfo); - - // // load speculative metadata - // if (batch_config->get_mode() == BEAM_SEARCH_MODE) { - // BeamSearchBatchConfig const *beam_batch_config = - // static_cast(batch_config); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(beam_batch_config->beamTokenInfo), - // sizeof(BeamSearchBatchConfig::beamTokenInfo), - // cudaMemcpyHostToDevice, - // stream); - - // total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(beam_batch_config->beamRequestsInfo), - // sizeof(BeamSearchBatchConfig::beamRequestsInfo), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(beam_batch_config->causalMask), - // sizeof(BatchConfig::causalMask), - // cudaMemcpyHostToDevice, - // stream); - - // total_copy_size += sizeof(BatchConfig::causalMask); - // } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { - // TreeVerifyBatchConfig const *tree_batch_config = - // static_cast(batch_config); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(tree_batch_config->causalMask), - // sizeof(BatchConfig::causalMask), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(BatchConfig::causalMask); - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(tree_batch_config->committed_tokens), - // sizeof(TreeVerifyBatchConfig::committed_tokens), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - // } - - // // add a size check - // std::cout << "handle.batch_config_metadata_size: " << handle.batch_config_metadata_size << ", "<< total_copy_size << "\n"; - // assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_batch_config_task( @@ -146,19 +74,19 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); size_t total_copy_size = 0; - cudaMemcpyAsync(handle.batch_config_metadata, - &(batch_config->tokensInfo), - sizeof(BatchConfig::tokensInfo), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + sizeof(BatchConfig::tokensInfo), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BatchConfig::tokensInfo); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(batch_config->requestsInfo), - sizeof(BatchConfig::requestsInfo), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->requestsInfo), + sizeof(BatchConfig::requestsInfo), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata @@ -166,48 +94,48 @@ void RequestManager::load_batch_config_task( BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - 
cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BatchConfig::causalMask); } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BatchConfig::causalMask); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); } From 4957b7c7d4c73a6fca94ea40f140319b50b49e9a Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Sat, 30 Dec 2023 23:24:37 -0500 Subject: [PATCH 288/344] Specinfer - new kernel (#1252) * init * fix speculative * fix speculative * bitmap+tree verify * fix. * fix * multi batch * copy metadata once * fix some corner cases * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * more fix. * clean up * . 
* load batchconfig * clean * hip * hip --------- Co-authored-by: Zhihao Jia --- include/flexflow/batch_config.h | 29 +- include/flexflow/config.h | 11 + include/flexflow/model.h | 1 + .../ops/spec_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + include/flexflow/request_manager.h | 33 +- inference/models/llama.cc | 4 +- inference/spec_infer/spec_infer.cc | 3 + src/ops/argmax.cc | 2 +- src/ops/beam_topk.cc | 2 +- src/ops/beam_topk.cu | 65 +- src/ops/embedding.cc | 18 +- src/ops/inc_multihead_self_attention.cu | 81 +- src/ops/spec_inc_multihead_self_attention.cc | 12 +- src/ops/spec_inc_multihead_self_attention.cu | 964 +++++++++++------- src/ops/tree_inc_multihead_self_attention.cu | 232 +++-- src/runtime/inference_manager.cc | 56 +- src/runtime/model.cc | 48 +- src/runtime/model.cpp | 48 + src/runtime/model.cu | 25 + src/runtime/request_manager.cc | 639 +++++++++--- src/runtime/request_manager.cpp | 85 ++ src/runtime/request_manager.cu | 86 ++ 23 files changed, 1727 insertions(+), 719 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index e2903c4d11..13904aaa46 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -56,6 +56,7 @@ class BatchConfig { // across workers static int const MAX_NUM_REQUESTS = 64; static int const MAX_NUM_TOKENS = 1024; + static int const MAX_SPEC_TREE_TOKEN_NUM = 64; // Set by update int num_tokens; @@ -68,6 +69,9 @@ class BatchConfig { int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; + + // request id in batch config: + int batch_config_request_id; RequestGuid request_guid; }; struct PerTokenInfo { @@ -75,6 +79,24 @@ class BatchConfig { int request_index; TokenId token_id; }; + + struct BitMask { + unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0}; + + // how many tokens before the tree, every sub requests need this part of + // cache + int non_tree_cache_size; + + // current tree size + int tree_size; + + int this_layer_size; + + // input length-> prompt/root + int prompt_size; + }; + + BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; @@ -126,9 +148,12 @@ class BeamSearchBatchConfig : public BatchConfig { size_t beam_width; size_t target_iterations; - inline static int const MAX_BEAM_WIDTH = 1; + inline static int const MAX_BEAM_WIDTH = 3; inline static int const MAX_BEAM_DEPTH = 8; + // maximum tree branches for a request + inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3; + int model_id; struct BeamSearchPerRequestInfo { @@ -139,6 +164,7 @@ class BeamSearchBatchConfig : public BatchConfig { BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; int parent_id[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int sub_request_num; }; struct BeamSearchPerTokenInfo { @@ -147,6 +173,7 @@ class BeamSearchBatchConfig : public BatchConfig { BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; + // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? 
int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index c2af6d707c..e1480264cc 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -16,6 +16,7 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ #include "ffconst.h" +#include "flexflow/batch_config.h" #include "legion.h" #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -75,6 +76,15 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; + void *batch_config_metadata; + + // request info + token info + topolopgy mask info + size_t batch_config_metadata_size = + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + + sizeof(BatchConfig::causalMask) + + sizeof(TreeVerifyBatchConfig::committed_tokens); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; @@ -132,6 +142,7 @@ class FFConfig { size_t workSpaceSize; Legion::Context lg_ctx; Legion::Runtime *lg_hlr; + Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; bool inference_debugging; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d8402ba622..16df99ab1a 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -240,6 +240,7 @@ enum TaskIDs { // InferenceManager & RequestManager RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, + RM_LOAD_BATCH_CONFIG_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 56bb2bd80d..a306f7985a 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -142,6 +142,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { Realm::RegionInstance beam_search_reserve_inst; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; + BatchConfig::BitMask *causalMask; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 6e2da19ce9..d160da4a72 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -147,6 +147,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int num_active_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; + BatchConfig::BitMask *causalMask; }; }; // namespace FlexFlow diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index baf6844801..1c4b0b2a2f 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -38,10 +38,13 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, - ParallelTensor const input); + ParallelTensor const input, + FFHandler *handlers); void load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset); + void load_inference_metadata_batch_config(BatchConfigFuture 
const &bc, + FFHandler *handlers); public: FFConfig ff_config; @@ -72,9 +75,10 @@ struct Request { struct BeamTree { struct treeLayer { BeamSearchBatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + int nodes_num_this_layer = 0; }; treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; }; @@ -100,6 +104,7 @@ class RequestManager { void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); + void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, @@ -107,6 +112,16 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void initBitMask(BatchConfig::BitMask &bitmask, int initLength); + void appendBitMask(BatchConfig::BitMask &bitmask, + int newNodes, + int preBeamSize, + int old_sub_num, + BeamTree const tree, + int currentDepth); + void updateBitMask(BatchConfig::BitMask &bitmask, + int initLength, + int non_tree_size); FFModel *get_model(int model_id); @@ -148,6 +163,7 @@ class RequestManager { void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); void update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamSearchBatchConfig const &old_bc, BeamTree &tree, int request_index); @@ -181,6 +197,11 @@ class RequestManager { Legion::Context ctx, Legion::Runtime *runtime); + static void + load_batch_config_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static BatchConfig prepare_next_batch_task( Legion::Task const *task, std::vector const ®ions, @@ -210,6 +231,9 @@ class RequestManager { int max_requests_per_batch; int max_tokens_per_batch; int max_sequence_length; + + // tree width in each speculative step, if not specified 1 + std::vector spec_infer_tree_width; // private fields std::unique_ptr tokenizer_; bool verbose; @@ -243,7 +267,8 @@ class RequestManager { private: struct ProfileInfo { - int decoding_steps; + int llm_decoding_steps; + int ssm_decoding_steps; double start_time, finish_time; }; std::unordered_map profiling_requests; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index b8fe70526d..10001ee916 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -246,7 +246,9 @@ void LLAMA::create_llama_model(FFModel &ff, if (mode == BEAM_SEARCH_MODE) { Tensor softmax = ff.softmax(dense, -1); // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); - output = ff.argmax(softmax, /*beam_Search*/ true); + // output = ff.argmax(softmax, /*beam_Search*/ true); + output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + // output = ff.top_k(softmax, ) } else { // Tensor softmax = ff.softmax(dense, -1); if (generation_config.do_sample) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8b0eb926d9..b369a13c1d 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -302,6 +302,9 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); + // first 
decoding step: 3 results + rm->push_spec_infer_tree_width(3); + // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); if (model_metadata.llm_model_type == ModelType::LLAMA) { diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index f336c843e8..dc7e4ea3b3 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -352,7 +352,6 @@ BeamInferenceResult GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - BeamInferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); @@ -398,6 +397,7 @@ InferenceResult ArgMax::save_inference_tensors_to_file( m, shard_id, bc, {}, {}, {input, indices}); } + download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 2883428254..18d0ec1587 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -366,7 +366,7 @@ BeamInferenceResult GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime); + DT_INT32, regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 72ab7862a6..a958786be3 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -556,8 +556,6 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int beam_size = bc->beamRequestsInfo[i].beam_size; // initial request - log_beam_topk.debug() << "sub_requests: " << i << ", " << sub_requests[i] - << "\n"; assert(sub_requests[i] > 0); // process sub requests for (int j = 0; j < sub_requests[i]; j++) { @@ -565,12 +563,13 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, // beam_slots[i].parent_id[j]; acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = bc->beamRequestsInfo[i].probs[j]; - log_beam_topk.debug() - << "probbbb req: " << i - << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j] - << ", sub request id " << j << ", parent id " - << bc->beamRequestsInfo[i].parent_id[j] << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n"; + // std::cout << "probbbb req: " << i << ", sub req probability : " + // << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << + // j + // << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] + // << ", data inddd" + // << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j + // << "\n"; } // process tokens @@ -584,6 +583,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); max_beam_width = std::max(max_beam_width, beam_size); + req_index += 1; block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; } @@ -613,28 +613,37 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, assert(num_shards >= (size_t)max_heap_size); num_shards = max_heap_size; - checkCUDA(cudaMemcpy(m->parent_ids, - parent_ids, - sizeof(int) * max_total_requests, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->acc_probs, - acc_probs, - sizeof(DT) * max_total_requests, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->block_start_index, - beam_block_start_index.data(), - sizeof(int) * 
beam_num_blocks, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->request_id, - request_id.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->tokens_per_request, - tokens_per_request.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); + checkCUDA(cudaMemcpyAsync(m->parent_ids, + parent_ids, + sizeof(int) * max_total_requests, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->acc_probs, + acc_probs, + sizeof(DT) * max_total_requests, + cudaMemcpyHostToDevice, + stream)); + // trick, set acc_probs to 0; + checkCUDA(cudaMemsetAsync( + m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); + checkCUDA(cudaMemcpyAsync(m->block_start_index, + beam_block_start_index.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->request_id, + request_id.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->tokens_per_request, + tokens_per_request.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); // int depth = // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; + beam_num_blocks = bc->num_active_tokens(); beam_topk_forward_kernel<<>>( input_ptr, shared_memory_size, diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 007e799fe0..76236e65ff 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -155,11 +155,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } else { int num_dims = input->num_dims; @@ -170,11 +167,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); @@ -189,13 +183,13 @@ int Embedding::weight_size(ParallelDim weight_dims[MAX_TENSOR_DIM]) { weight_dims[Weight::VOCAB_SIZE].size = this->num_entries; weight_dims[Weight::VOCAB_SIZE].degree = 1; weight_dims[Weight::VOCAB_SIZE].parallel_idx = -1; - for (int i = 2; i < input->num_dims; i++) { + for (int i = 2; i < input->num_dims + 1; i++) { weight_dims[i].size = input->dims[i - 1].degree; weight_dims[i].degree = weight_dims[i].size; weight_dims[i].parallel_idx = input->dims[i - 1].parallel_idx; weight_dims[i].is_replica_dim = true; } - return input->num_dims; + return input->num_dims + 1; } void Embedding::register_output_mappings() { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 695f4b13b9..da70e23f87 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ 
b/src/ops/inc_multihead_self_attention.cu @@ -82,6 +82,9 @@ __global__ void compute_attention_kernel_generation_kernel( // request idx int const request_idx = blockIdx.y; + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + int const beam_request_idx = is_beam ? request_idx / max_beam_width : request_idx; int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; @@ -89,8 +92,8 @@ __global__ void compute_attention_kernel_generation_kernel( int const first_step = 0; int const tlength = - request_infos[beam_request_idx].first_token_depth_in_request + - request_infos[beam_request_idx].num_tokens_in_batch; + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; // shared memory objects extern __shared__ char smem_[]; @@ -103,7 +106,8 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + beam_request_idx * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + + batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = @@ -139,7 +143,7 @@ __global__ void compute_attention_kernel_generation_kernel( DT const *k_cache_batch = key_cache + - (beam_request_idx * max_beam_width + beam_sub_request_idx) * + (batch_config_request_id * max_beam_width + beam_sub_request_idx) * max_seq_length * hidden_size + ki; @@ -245,7 +249,7 @@ __global__ void compute_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = value_cache + - (beam_request_idx * max_beam_width + beam_sub_request_idx) * + (batch_config_request_id * max_beam_width + beam_sub_request_idx) * max_seq_length * hidden_size + vi; @@ -825,19 +829,6 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, bias_ptr = static_cast
(m->bias_ptr); } - // todo Xinhao copy how many requests if requests are not continous? - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); - // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -1364,8 +1355,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { + case INC_DECODING_MODE: { key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); @@ -1374,22 +1364,24 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case BEAM_SEARCH_MODE: + case TREE_VERIFY_MODE: { + // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); break; } default: assert(false && "Unkown inference mode"); } size_t requestinfo_size = BatchConfig::max_requests_per_batch(); - size_t tokeninfo_size = max_tokens_per_batch; + // size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; @@ -1400,11 +1392,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + - complex_size * sizeof(cuFloatComplex) + - requestinfo_size * - sizeof(BatchConfig::PerRequestInfo); // more components will - // be added here later + complex_size * sizeof(cuFloatComplex); // more components will + // be added here later if (offload) { // assert that we have enough reserved work space left size_t totalSharedSize = @@ -1447,10 +1436,16 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); + token_infos = + static_cast(handler.batch_config_metadata); + request_infos = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo)); + if (offload) { - token_infos = - gpu_mem_allocator.allocate_reserved( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_reserved( + // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); @@ -1464,13 +1459,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(cuFloatComplex); - request_infos = - 
gpu_mem_allocator.allocate_reserved( - requestinfo_size); + // request_infos = + // gpu_mem_allocator.allocate_reserved( + // requestinfo_size); } else { - token_infos = - gpu_mem_allocator.allocate_instance( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_instance( + // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -1479,9 +1474,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); - request_infos = - gpu_mem_allocator.allocate_instance( - requestinfo_size); + // request_infos = + // gpu_mem_allocator.allocate_instance( + // requestinfo_size); } // allocate more size for quantization data diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index eb6fd721e6..5d234df822 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -53,7 +53,7 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( } Tensor - FFModel::spec_inc_multihead_self_attention(const Tensor input, + FFModel::spec_inc_multihead_self_attention(Tensor const input, int embed_dim, int num_heads, int kdim, @@ -91,7 +91,7 @@ Tensor } Tensor - FFModel::spec_inc_multiquery_self_attention(const Tensor input, + FFModel::spec_inc_multiquery_self_attention(Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -257,7 +257,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -358,8 +358,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -465,7 +465,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, SpecIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : SpecIncMultiHeadSelfAttention(model, other.layer_guid, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 562dee4d93..88dd3f92e4 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -23,16 +23,286 @@ namespace FlexFlow { +#define WARP_SIZE 32 + // declare Legion names using Legion::coord_t; using Legion::Memory; using namespace Kernels::IncMultiHeadAttention; namespace Kernels { -namespace SpecIncMultiHeadAttention { +namespace SpecIncMultiHeadSelfAttention { + +template +__global__ void compute_spec_inc_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, + BatchConfig::BitMask *causalMask) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; 
+ using V_vec = typename VEC_V<DT>
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // nth request idx + int const request_idx = blockIdx.y; + + // request id in batch config + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + // request_idx = re + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int const first_step = 0; + + // int const tlength = + // request_infos[batch_config_request_id].first_token_depth_in_request + + // request_infos[batch_config_request_id].num_tokens_in_batch; + + int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + + int first_token_idx = 0; + for (int r = 0; r < request_idx; r++) { + first_token_idx += causalMask[r].this_layer_size; + } + + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. 
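+  // With the launch configurations used further below (THREADS_PER_KEY = 4,
+  // WARP_SIZE = 32), each warp therefore covers 32 / 4 = 8 cached keys per
+  // iteration of the key loop.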
+ constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < tree_branch_num; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + + int const query_token = bitmask.tree_size - tree_branch_num + qi; + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < totalCacheSize) { + + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + // bool const mask = ti_circ >= totalCacheSize; + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { + // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); + // } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn first token qk_max %.10f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. 
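+  // vo selects which cached timestep this thread starts from in the value
+  // loop; vi is its element offset within the head dimension, loading
+  // V_VEC_SIZE contiguous values per access.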
+ int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. 
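+  // Only the vo == 0 group holds the fully reduced result after the reduction
+  // above; convert the fp32 accumulator back to DT and write this head's
+  // slice for query token qi.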
+ if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} template -__global__ void spec_store_kv_cache( +__global__ void spec_inc_store_kv_cache( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, @@ -40,16 +310,16 @@ __global__ void spec_store_kv_cache( BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, int vProjSize, int num_tokens, int max_seq_len, - int max_beam_width, bool is_root, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / (hidden_size); int offset = i % hidden_size; size_t val_idx = @@ -58,100 +328,30 @@ __global__ void spec_store_kv_cache( DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; - // above no need to be changed - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - // int const sub_req_id = id_map[token_idx].sub_request_index; - // int const parent_id = id_map[token_idx].parent_id; - // int const beam_depth = id_map[token_idx].beam_depth; - // int const beam_width = id_map[token_idx].beam_width; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; - int const beam_depth = beamRequestInfos[req_id].current_depth; - int const beam_width = beamRequestInfos[req_id].beam_size; - - kCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - - // replica in the root iteration - if (beam_depth == 1) { - for (int i = 1; i < beam_width; i++) { - kCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - } - } + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - // if (head_idx == 0 && beam_depth == 0 && token_idx == 8 && k_cache) { - // // printf("token idx %d\n", token_idx); - // printf("data idx: %d, tok_id %d, new_token_cache_idx %d, parent_id %d, - // " - // "sub_req_id %d, num_tokens %d, kProjSize %d, num_kv_heads %d, - // val " - // "%f, beam_width %d\n", - // data_idx, - // tok_id, - // new_token_cache_idx, - // parent_id, - // sub_req_id, - // num_tokens, - // kProjSize, - // num_kv_heads, - // val, - // beam_width); - // } + int const request_token_offset = + requestInfo[req_id].first_token_offset_in_batch; - // naive cache stealing - if (sub_req_id != parent_id) { - if (offset == 0 && tok_id == 0) { - printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " - "%d, tok_id %d\n", - beam_depth, - req_id, - sub_req_id, - parent_id, - tok_id); - } + BatchConfig::BitMask bitmask = causalMask[req_id]; - for (int depth = 0; depth < beam_depth; depth++) { - int steal_token_idx = tok_id - beam_depth + depth; - 
int steal_from_idx = (req_id * max_beam_width + parent_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - kCache_ptr[steal_to_idx] = kCache_ptr[steal_from_idx]; - vCache_ptr[steal_to_idx] = vCache_ptr[steal_from_idx]; - - // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ - // printf("cache stealing kernel!, steal_token_idx %d\n", - // steal_token_idx); - // } - } - } + // int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; + + // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - + // tree_branch_num + sub_req_id + tok_id; + // bitmask.tree_size - tree_branch_num + sub_req_id; + + // if prompt token -> token id + // if tree token: + int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - + bitmask.this_layer_size + token_idx - + request_token_offset; - // parallel cache stealing not yet implemented - // logic shld be - // launch spec_store_kv_cache with parallelism * current depth - // from the i here, get depth index - // if depth index not the current one, check if we need to steal - // steal if needed - - // cache stealing theory - // identify which sub request does this token come from - // for initial token, 0 - // for other, may 0,0,1/ 0,1,2/ 1,1,1 to get which cache to be reuse and - // which to be delete copy beam_size bunch of blocks when sub_req_id == - // parent_id : like 0 -> 0, 1->1, 2->2, do nothing, just append the new k/v + kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = vVal; } } @@ -161,28 +361,79 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; - // printf("curr depth: %d\n", curr_depth); - // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - spec_store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), - static_cast
<DT *>(m->keyCache), - static_cast
(m->valueCache), - m->token_infos, - m->request_infos, - m->beam_token_infos, - m->beam_request_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_BEAM_WIDTH, - /*root*/ curr_depth == 0, - m->hidden_size); + spec_inc_store_kv_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast
<DT *>(m->keyCache), + static_cast
(m->valueCache), + m->token_infos, + m->request_infos, + m->beam_token_infos, + m->beam_request_infos, + m->causalMask, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, + /*root*/ curr_depth == 0, + m->hidden_size); + } +} + +#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_spec_inc_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray), \ + static_cast
<DT *>(m->keyCache), \ + static_cast
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->beam_request_infos, \ + m->causalMask) + +template +void compute_spec_inc_attention_kernel_generation( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // one block == one head per request + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); } } @@ -236,199 +487,208 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int kt_req_block_size = kt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_req_block_size = vt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } - for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; + // else if (tokens_previous_requests < bc->num_generation_tokens) { + // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // continue; + // } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; + // all requests in prompt phase should only have one sub requests; + assert(bc->sub_requests[i] == 1); + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; - if (num_new_tokens <= 0) { - continue; - } + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; - - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - spec_fill_entries_above_diagonal<<>>( - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; + if (num_new_tokens <= 0) { + continue; + } + + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // To get B, skip over K entries from previous requests (all heads + + // padding) + + // print_tensor((float*)A, 32, "A"); + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + + // if (i == 0 && sub_req_id == 0 && + // bc->beam_slots.at(0).current_depth == 1) { + // int offset = (float *)B - m->keyCache; + // printf("key cache offset %d\n", kt_req_block_size); + // } + // To get C, skip over QK^T products from previous requests + DT *C = static_cast
(m->qk_prods) + + m->num_q_heads * tokens_prev_requests_squares; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // print_tensor((float*)C, 32, "C"); + // add alibi position bias to qk production + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + spec_fill_entries_above_diagonal<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_q_heads * tokens_prev_requests_squares; + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = m->vProjSize; + n = num_new_tokens; + k = total_tokens; + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + + // padding) + A = static_cast
(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + + // print_tensor((float*)C_softmax, 32, "C_softmax"); + C = static_cast
(m->attn_heads) + + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; } // assert(tokens_previous_requests == num_tokens); @@ -443,31 +703,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { - // here because we need postion info in infernece 1 - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->beam_token_infos, - &(bc->beamTokenInfo), - bc->num_active_tokens() * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->beam_request_infos, - &(bc->beamRequestsInfo), - bc->max_requests_per_batch() * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), - cudaMemcpyHostToDevice, - stream); // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, bc, shard_id, @@ -479,7 +716,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 2: Update key/val cache update_kv_cache_kernel
<DT>(m, bc, stream); if (bc->num_generation_tokens > 0) { - compute_attention_kernel_generation<DT>(
+ compute_spec_inc_attention_kernel_generation<DT>(
m, bc, static_cast
(m->attn_heads), stream); } // phase 3: Compute attention score @@ -488,16 +725,14 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } - // compute output production and bias together for all tokens - int num_tokens = - bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + int num_tokens = bc->num_active_tokens(); compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } -} // namespace SpecIncMultiHeadAttention +} // namespace SpecIncMultiHeadSelfAttention } // namespace Kernels /*static*/ @@ -529,25 +764,27 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); } else if (input.data_type == DT_FLOAT) { float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -606,38 +843,23 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t beam_tokeninfo_size = - max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); - size_t beam_requestinfo_size = - BeamSearchBatchConfig::max_requests_per_batch(); - size_t total_size = - beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + - beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: - BeamSearchPerRequestInfo); // more components will - // be added here later - - // We always directly allocate memory for small speculative models - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); beam_token_infos = - gpu_mem_allocator - .allocate_instance( - beam_tokeninfo_size); - // offset += beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + beam_request_infos = - gpu_mem_allocator - .allocate_instance( - beam_requestinfo_size); - // offset += beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - // assert(offset == total_size); - assert(gpu_mem_allocator.instance_total_size == - gpu_mem_allocator.instance_allocated_size); + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo)); + causalMask = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + 
sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index bc7d1017b7..b4af80976f 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -53,6 +53,7 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::PerRequestInfo *request_infos, int num_heads, int num_requests, + BatchConfig::BitMask *causalMask, int qk_smem_sz) { // q, k @@ -75,17 +76,28 @@ __global__ void compute_attention_kernel_fused_kernel( // request idx int const request_idx = blockIdx.y; + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + int const first_step = 0; - int const tlength = request_infos[request_idx].first_token_depth_in_request + - request_infos[request_idx].num_tokens_in_batch; - int const qlength = request_infos[request_idx].num_tokens_in_batch; + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = + request_infos[batch_config_request_id].num_tokens_in_batch; + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[request_idx].num_tokens_in_batch; + first_token_idx += request_infos[r].num_tokens_in_batch; } + // if(tidx == 0 && head_idx == 0){ + // printf("tree req: %d, %d\n", request_idx, first_token_idx); + // } + // shared memory objects extern __shared__ char smem_[]; @@ -115,7 +127,7 @@ __global__ void compute_attention_kernel_fused_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + request_idx * max_seq_length * hidden_size + ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -126,11 +138,19 @@ __global__ void compute_attention_kernel_fused_kernel( q_vecs[ki_o][ii] = *reinterpret_cast( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + + // if (head_idx == 0 && qi == 1 && tidx == 0) { + // printf("laod q %d, %d %.10f\n", + // request_idx, + // qi,q_vecs[ki_o][ii].x); + // } } + __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; int const ti_circ = ti % max_seq_length; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; if (ti < tlength) { @@ -142,22 +162,28 @@ __global__ void compute_attention_kernel_fused_kernel( float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); - } + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - int pos = ti * qlength + qi; - if (((pos / qlength) % tlength) > (pos % qlength + tlength - qlength)) { - qk = -FLT_MAX; - } qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[pos] = mask ? 
0.f : qk; + // if (head_idx == 0 && qi == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n + // ", + // request_idx, + // ti, + // qk, + // q_vecs[ki_o][0].x, + // k[0].x); + // } + qk_smem[ti - first_step] = mask ? 0.0f : qk; } } + __syncthreads(); +#pragma unroll for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); } @@ -176,7 +202,7 @@ __global__ void compute_attention_kernel_fused_kernel( // The warps finalize the reduction. qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; - +#pragma unroll for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); } @@ -184,12 +210,18 @@ __global__ void compute_attention_kernel_fused_kernel( // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - float exp_sum = 0.f; + // if (head_idx == 0 && qi == 9 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", qk_max); + // } + float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti * qlength + qi] - qk_max); + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; - qk_smem[ti * qlength + qi] = logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; } // Compute the sum. @@ -197,43 +229,51 @@ __global__ void compute_attention_kernel_fused_kernel( // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti * qlength + qi] *= inv_sum; + qk_smem[ti - first_step] *= inv_sum; } __syncthreads(); - } - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; - Out_sum out; - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + request_idx * max_seq_length * hidden_size + vi; + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - for (int qi = 0; qi < qlength; qi++) { + Out_sum out; zero(out); - __syncthreads(); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { // Load the values from the cache. 
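+      // Timesteps in the speculative-tree region (ti >= bitmask.non_tree_cache_size)
+      // contribute only if the causal bit mask lets query token qi attend to
+      // them; masked positions keep a zero weight below.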
int const ti_circ = ti % max_seq_length; - + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; V_vec v = *reinterpret_cast( v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - float logit = qk_smem[ti * qlength + qi]; - out = FlexFlow::fma(logit, cast_to_float(v), out); + + if (ti < tlength) { + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } } } - // Make sure we can start writing to shared memory. + // // Make sure we can start writing to shared memory. __syncthreads(); // Run the final reduction amongst the different groups computing different @@ -268,6 +308,17 @@ __global__ void compute_attention_kernel_fused_kernel( output_ptr + (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi), out); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, + // out.y, + // out.z, + // out.w, + // vi, + // (first_token_idx + qi) * hidden_size + head_idx * + // per_head_size + + // vi); + // } } } } @@ -286,9 +337,9 @@ __global__ void commit_tokens_kernel( int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - int token_pos = i / (hidden_size * KV_WEIGHT_NUM); + int token_pos = i / (hidden_size); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); @@ -329,7 +380,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length(), + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, m->hidden_size); } } @@ -348,9 +400,9 @@ __global__ void update_tree_branch_kv_cache( int total_tokens_in_batch, int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int token_idx = i / (hidden_size); int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch @@ -375,6 +427,7 @@ __global__ void update_tree_branch_kv_cache_fused( DT *kCache_ptr, DT *vCache_ptr, TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo *request_infos, int qProjSize, int kProjSize, int vProjSize, @@ -392,10 +445,25 @@ __global__ void update_tree_branch_kv_cache_fused( DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + int const request_token_offset = + request_infos[req_id].first_token_offset_in_batch; + int const first_token_depth = + request_infos[req_id].first_token_depth_in_request; + + // if(i % hidden_size == 0){ + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", + // req_id, token_idx, request_token_offset,(token_idx + first_token_depth + // - request_token_offset), kVal); + // } + kCache_ptr[req_id 
* (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + vCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + offset] = vVal; } } @@ -448,10 +516,12 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; int vt_block_size = m->vProjSize; int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -472,9 +542,6 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, num_new_tokens++; } - std::cout << "num_new_tokens: " << num_new_tokens << "\n"; - assert(false); - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { @@ -716,7 +783,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ THDS_PER_VALUE, \ THDS_PER_BLOCK, \ bc, \ @@ -733,17 +801,19 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, static_cast
(m->valueCache), \ output_ptr, \ scale, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ BatchConfig::max_tokens_per_batch(), \ m->qProjSize, \ m->hidden_size, \ m->request_infos, \ m->num_q_heads, \ bc->num_active_requests(), \ + m->causalMask, \ smem_sz[0]) template -void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, DT *output_ptr, cudaStream_t stream) { @@ -760,11 +830,12 @@ void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, static_cast
<DT *>(m->keyCache), static_cast
(m->valueCache), m->token_infos, + m->request_infos, m->qProjSize, m->kProjSize, m->vProjSize, num_new_tokens, - BatchConfig::max_sequence_length(), + BatchConfig::max_sequence_length() + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, m->hidden_size); dim3 grid(m->num_q_heads, bc->num_active_requests()); @@ -816,12 +887,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache + // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << + // "\n"; + cudaMemcpyAsync(m->committed_token_infos, &(bc->committed_tokens), bc->num_tokens_to_commit * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(m->causalMask, + &(bc->causalMask), + bc->num_active_requests() * sizeof(BatchConfig::BitMask), + cudaMemcpyHostToDevice, + stream); commit_tokens
<DT>(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -834,18 +913,6 @@ m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
(m->bias_ptr); } - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -991,27 +1058,16 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); - if (offload) { - // assert that we have enough reserved work space left - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); - } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); - } + + causalMask = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + committed_token_infos = + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::causalMask)); } cudaStreamSynchronize(stream); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index eb045e8159..8af0ed8978 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -318,7 +318,8 @@ FutureMap InferenceManager::inference(FFModel *model, found_input_operator = true; assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_input_tokens_from_batch_config(bc, pt); + load_input_tokens_from_batch_config(bc, pt, model->handlers); + load_inference_metadata_batch_config(bc, model->handlers); } } @@ -348,11 +349,34 @@ FutureMap InferenceManager::inference(FFModel *model, }; void InferenceManager::load_input_tokens_from_batch_config( - BatchConfigFuture const &bc, ParallelTensor const input) { + BatchConfigFuture const &bc, + ParallelTensor const input, + FFHandler *handlers) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; + Domain domain = runtime->get_index_space_domain(ctx, input->parallel_is); + + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + MachineView view = input->machine_view; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + argmap.set_point(*it, \ + TaskArgument(&handlers[view.get_device_id(*it)], \ + sizeof(FFHandler))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } + IndexLauncher launcher(RM_LOAD_TOKENS_TASK_ID, input->parallel_is, TaskArgument(nullptr, 0), @@ -368,6 +392,34 @@ void InferenceManager::load_input_tokens_from_batch_config( runtime->execute_index_space(ctx, launcher); } +void InferenceManager::load_inference_metadata_batch_config( + BatchConfigFuture const &bc, FFHandler *handlers) { + Context ctx = ff_config.lg_ctx; 
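+  // One point task per GPU: each point in ff_config.all_gpu_task_is is given
+  // its own FFHandler below, so the RM_LOAD_BATCH_CONFIG_TASK_ID launch can
+  // stage the batch-config metadata on that device.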
+ Runtime *runtime = ff_config.lg_hlr; + ArgumentMap argmap; + + Domain domain = + runtime->get_index_space_domain(ctx, ff_config.all_gpu_task_is); + Rect<1> task_rect = domain; + + int idx = 0; + for (PointInRectIterator<1> it(task_rect); it(); it++) { + FFHandler handler = handlers[idx++]; + argmap.set_point(*it, TaskArgument(&handler, sizeof(FFHandler))); + } + + IndexLauncher launcher(RM_LOAD_BATCH_CONFIG_TASK_ID, + ff_config.all_gpu_task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + FFConfig::DataParallelism_GPU); + launcher.add_future(bc); + runtime->execute_index_space(ctx, launcher); +} + void InferenceManager::load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 92f0cff472..37605c44a4 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1499,10 +1499,8 @@ FFRuntime::FFRuntime(FFConfig &config) { Context ctx = config.lg_ctx; ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(config.workersPerNode * config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); - + Domain domain = runtime->get_index_space_domain(ctx, config.all_gpu_task_is); + Rect<1> task_rect = domain; // int rank = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { FFInitInfo info; @@ -1518,7 +1516,7 @@ FFRuntime::FFRuntime(FFConfig &config) { // Init CUDA library on each worker IndexLauncher initLauncher(FF_INIT_TASK_ID, - task_is, + config.all_gpu_task_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, @@ -2993,6 +2991,12 @@ Op *FFModel::create_operator_from_layer( dims[num_dims].degree = 1; dims[num_dims].parallel_idx = -1; dims[num_dims].is_replica_dim = true; + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1) { + dims[num_dims].size *= config.tensor_parallelism_degree; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } // create_parallel_tensor adds an NoOp into operators ParallelTensor pt = create_parallel_tensor_legion_ordering(num_dims + 1, @@ -3002,6 +3006,7 @@ Op *FFModel::create_operator_from_layer( 0, true /*gradients*/, tensor->tensor_guid); + assert(pt->get_shape().is_valid()); // assert that this tensor hasn't been mapped before assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; @@ -3260,12 +3265,12 @@ void FFModel::create_operators_from_layers() { if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && l->op_type == OP_EMBEDDING) { assert(op->numOutputs == 1); - Replicate *repl = new Replicate(*this, - op->outputs[0], - op->outputs[0]->num_dims - 1, - config.tensor_parallelism_degree); - operators.push_back(repl); - op = repl; + // Replicate *repl = new Replicate(*this, + // op->outputs[0], + // op->outputs[0]->num_dims - 1, + // config.tensor_parallelism_degree); + // operators.push_back(repl); + // op = repl; } else if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || @@ -4076,6 +4081,10 @@ FFConfig::FFConfig() { Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; lg_ctx = Runtime::get_context(); + Rect<1> task_rect(Point<1>(0), Point<1>(workersPerNode * numNodes - 1)); + // Create an index space for tasks running on all GPUs + all_gpu_task_is = runtime->create_index_space(lg_ctx, task_rect); + // field_space = 
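One hunk above widens the trailing replica dimension when the model is compiled for inference with tensor parallelism. A small sketch of that adjustment; ParallelDim here is a stand-in exposing only the fields the patch touches:

struct ParallelDim {
  int size = 1;
  int degree = 1;
  int parallel_idx = -1;
  bool is_replica_dim = false;
};

// Mirrors the inference-mode branch above: the replica dimension is scaled
// by the tensor-parallelism degree and bound to parallel index 0 so the
// resulting parallel tensor shape stays valid.
void widen_replica_dim(ParallelDim &replica_dim,
                       int tensor_parallelism_degree,
                       bool inference_mode) {
  replica_dim.is_replica_dim = true;
  if (inference_mode && tensor_parallelism_degree > 1) {
    replica_dim.size *= tensor_parallelism_degree;
    replica_dim.degree *= tensor_parallelism_degree;
    replica_dim.parallel_idx = 0;
  }
}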
runtime->create_field_space(lg_ctx); } @@ -4337,6 +4346,23 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + // RequestManager load metadata + { + TaskVariantRegistrar registrar(RM_LOAD_BATCH_CONFIG_TASK_ID, + "RequestManager Load meta data"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RequestManager Load metadata Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // RequestManager prepare_next_batch { TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 6c482426eb..ad2b781567 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -131,6 +131,54 @@ FFHandler .wait(); handle.workSpace = workspaceInst.pointer_untyped(0, sizeof(char)); } + if (handle.offload_reserve_space_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.offload_reserve_space = + workspaceInst.pointer_untyped(0, sizeof(char)); + } else { + handle.offload_reserve_space = nullptr; + } + if (handle.batch_config_metadata_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.batch_config_metadata = + workspaceInst.pointer_untyped(0, sizeof(char)); + } else { + handle.batch_config_metadata = nullptr; + } // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 17401a0f14..c885b29db2 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -151,6 +151,31 @@ FFHandler } else { handle.offload_reserve_space = nullptr; } + if (handle.batch_config_metadata_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.batch_config_metadata = + workspaceInst.pointer_untyped(0, sizeof(char)); + } else 
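Both the offload reserve space and the batch-config metadata buffer above follow the same allocate-or-leave-null pattern. A simplified analogue using a plain hipMalloc call; the patch itself allocates through a Realm region instance placed in GPU framebuffer memory, so this only illustrates the control flow:

#include <hip/hip_runtime.h>
#include <cassert>
#include <cstddef>

// Allocate device memory only when the configured size is non-zero,
// otherwise leave the handle null so downstream code skips the copies.
void *alloc_metadata_buffer(size_t batch_config_metadata_size) {
  void *buffer = nullptr;
  if (batch_config_metadata_size > 0) {
    hipError_t err = hipMalloc(&buffer, batch_config_metadata_size);
    assert(err == hipSuccess);
  }
  return buffer;
}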
{ + handle.batch_config_metadata = nullptr; + } + // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7c37f3391e..89d4ddaed4 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -16,6 +16,7 @@ #include "flexflow/request_manager.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" +#include #include #include #include @@ -106,6 +107,11 @@ int RequestManager::get_max_sequence_length() { return max_sequence_length; } +void RequestManager::push_spec_infer_tree_width(int tree_width) { + assert(tree_width <= BeamSearchBatchConfig::MAX_BEAM_WIDTH); + spec_infer_tree_width.emplace_back(tree_width); +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -358,6 +364,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } int num_generation_tokens = 0; + int num_active_req = -1; // Step 2: prepare the next batch for existing requests BatchConfig new_bc; @@ -406,13 +413,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); @@ -420,8 +428,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + outputFile << "num decoding steps: " + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -447,6 +455,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase @@ -469,7 +479,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } // Update profiling profiling_requests[new_bc.requestsInfo[i].request_guid] - .decoding_steps++; + .llm_decoding_steps++; } } } @@ -483,6 +493,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ 
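push_spec_infer_tree_width registers a per-step speculative tree width (checked against MAX_BEAM_WIDTH), and the later hunks in this file pick a request's beam width from that schedule, falling back to 1 once the schedule is exhausted. A compact sketch of the lookup; kMaxBeamWidth is a stand-in for BeamSearchBatchConfig::MAX_BEAM_WIDTH:

#include <cassert>
#include <cstddef>
#include <vector>

constexpr int kMaxBeamWidth = 3;  // stand-in constant

std::vector<int> spec_infer_tree_width;

void push_spec_infer_tree_width(int tree_width) {
  assert(tree_width <= kMaxBeamWidth);
  spec_infer_tree_width.push_back(tree_width);
}

// Step i of small-model decoding uses the i-th configured width; past the
// end of the schedule the tree stops branching (width 1).
int beam_width_for_step(int ssm_decoding_steps) {
  return static_cast<size_t>(ssm_decoding_steps) < spec_infer_tree_width.size()
             ? spec_infer_tree_width[ssm_decoding_steps]
             : 1;
}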
-492,9 +503,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; - profile_info.decoding_steps = 1; + profile_info.llm_decoding_steps = 1; profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -567,6 +580,7 @@ BeamSearchBatchConfig int result_index = 0; int num_generation_tokens = 0; + int num_active_req = -1; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { @@ -602,6 +616,8 @@ BeamSearchBatchConfig committed_tokens[guid].emplace_back(abs_depth, result_index); } else if (abs_depth >= root_abs_depth) { tree_outputs.emplace_back(token_id, abs_depth + 1); + // std::cout << "committred tokens push: " << abs_depth + // << " ,result index: " << result_index << "\n"; committed_tokens[guid].emplace_back(abs_depth, result_index); if (verbose) { @@ -612,22 +628,23 @@ BeamSearchBatchConfig tree_outputs.back().second, token_id); } - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - abs_depth, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - token_id); + // std::cout << "Index within old batch: " << result_index << std::endl; + // printf(" Input: [%d] %d ---> [%d] %d \n", + // abs_depth, + // old_bc.tokensInfo[result_index].token_id, + // tree_outputs.back().second, + // token_id); } result_index++; } if (request.status == Request::RUNNING) { + std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); - // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -664,16 +681,18 @@ BeamSearchBatchConfig // Log profiling info ProfileInfo profile_info = profiling_requests[request.guid]; profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.ssm_decoding_steps = 0; total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { @@ -682,8 +701,8 @@ BeamSearchBatchConfig outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + outputFile << "num decoding steps: " + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << 
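Throughout these hunks a running num_active_req counter records, for the k-th still-active request, which batch slot it occupies via batch_config_request_id. A minimal sketch of that compaction; kMaxRequestsPerBatch and the RequestsInfo stand-in are placeholders for the real structures:

#include <array>

constexpr int kMaxRequestsPerBatch = 8;  // stand-in for max_requests_per_batch()

struct RequestsInfo {
  int batch_config_request_id = -1;
};

// For every slot i still holding an active request, the next free entry of
// requestsInfo remembers i, mirroring the num_active_req updates above.
int index_active_requests(
    std::array<bool, kMaxRequestsPerBatch> const &request_completed,
    std::array<RequestsInfo, kMaxRequestsPerBatch> &requests_info) {
  int num_active_req = -1;
  for (int i = 0; i < kMaxRequestsPerBatch; i++) {
    if (request_completed[i]) {
      continue;
    }
    num_active_req++;
    requests_info[num_active_req].batch_config_request_id = i;
  }
  return num_active_req + 1;  // number of active requests found
}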
request.tokens[i]; @@ -709,6 +728,7 @@ BeamSearchBatchConfig new_bc.request_completed[i] = false; new_bc.request_running[i] = true; + num_active_req++; // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = @@ -719,6 +739,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = @@ -726,8 +747,14 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; + + profiling_requests[request.guid].ssm_decoding_steps = 0; + + int ssm_decoding_steps = 0; new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { @@ -735,8 +762,14 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].probs[j] = 1; } + new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.sub_requests[i] = 1; + updateBitMask(new_bc.causalMask[i], + verified_tokens.size(), + request.tokens.size()); + // Token Info for (int j = 0; j < verified_tokens.size(); j++) { auto token = verified_tokens.at(j); @@ -758,6 +791,7 @@ BeamSearchBatchConfig break; } } + std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token @@ -767,9 +801,11 @@ BeamSearchBatchConfig } log_req_mgr.print("Output: %s", output.c_str()); } + } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; + num_active_req++; std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " << "initial_len: " << request.initial_len << std::endl; @@ -783,17 +819,24 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? 
spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = 0; for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } + new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.sub_requests[i] = 1; // Token Info @@ -818,6 +861,7 @@ BeamSearchBatchConfig Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; + num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -826,15 +870,21 @@ BeamSearchBatchConfig (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; - profile_info.decoding_steps = 0; + profile_info.llm_decoding_steps = 0; + profile_info.ssm_decoding_steps = 0; profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; // init the beam search metadata per request + int ssm_decoding_steps = profile_info.ssm_decoding_steps; + new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].max_depth = std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, @@ -846,6 +896,11 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + + new_bc.beamRequestsInfo[i].sub_request_num = 1; + printf("sub request num == 1, %d \n", + new_bc.beamRequestsInfo[i].beam_size); + new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -862,6 +917,9 @@ BeamSearchBatchConfig new_bc.num_tokens++; } + initBitMask(new_bc.causalMask[i], + new_bc.requestsInfo[i].num_tokens_in_batch); + // if (new_bc.requestsInfo[i].num_tokens_in_batch < // new_request.initial_len) { // all_requests[new_request.guid].status = Request::PENDING; @@ -949,6 +1007,8 @@ BeamSearchBatchConfig } std::cout << "Current Beam Depth: " << old_bc.beamRequestsInfo[0].current_depth << "\n"; + std::cout << "Current sub request num: " + << old_bc.beamRequestsInfo[0].sub_request_num << "\n"; } // Step 1: Store result to the beam tree struct store_beam_metadata(old_bc, result); @@ -960,10 +1020,12 @@ BeamSearchBatchConfig int num_generation_tokens = 0; // Add incremental tokens to the batch + int num_active_req = -1; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i] || !old_bc.request_running[i]) { continue; } + num_active_req++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); @@ -973,29 +1035,6 @@ BeamSearchBatchConfig // assert(processed_tokens < request.tokens.size()); log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - // if (processed_tokens > - // old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() && - // request.status == Request::RUNNING - // // || ir.results[t] == 0 TODO: replace this with - // ) { - // // log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - // // 
old_bc.requestsInfo[i].request_guid, - // // old_bc.beamRequestsInfo[i].max_depth); - // // // new_bc.request_completed[i] = true; - // // new_bc.request_completed[i] = false; - // // new_bc.requestsInfo[i].first_token_depth_in_request = - // processed_tokens; - // // new_bc.requestsInfo[i].request_guid = - // // old_bc.requestsInfo[i].request_guid; - // // new_bc.requestsInfo[i].max_sequence_length = - // // old_bc.requestsInfo[i].max_sequence_length; - // // new_bc.beamRequestsInfo[i].current_depth = - // // old_bc.beamRequestsInfo[i].current_depth; - // // new_bc.request_running[i] = false; - // std::cout << "beam search end:" << request.status << i << ", " - // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; - // } - // else { log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; @@ -1005,25 +1044,42 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - + profiling_requests[request.guid].ssm_decoding_steps += 1; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH // entries? - new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - // update the parentid, accumalated_probs, depth, and token_ids + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; + new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; + new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + new_bc.sub_requests[i] = + old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].sub_request_num = + old_bc.beamRequestsInfo[i].sub_request_num * + old_bc.beamRequestsInfo[i].beam_size; + + assert(new_bc.beamRequestsInfo[i].sub_request_num <= + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && + "exceed maximum nodes per layer"); + if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.request_running[i] = true; // do the slot exchange to minimize the cache exchange in kernel. 
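Each speculative decoding step multiplies the number of active sub-requests: the new tree layer holds the previous layer's node count times the previous beam width, and the result is capped by MAX_SPECULATIVE_TREE_BRANCHES. A tiny sketch of that bookkeeping; kMaxSpeculativeTreeBranches is a stand-in for the real constant:

#include <cassert>

constexpr int kMaxSpeculativeTreeBranches = 64;  // stand-in constant

struct BeamLayerCounts {
  int beam_size = 1;        // branching factor chosen for this step
  int sub_request_num = 1;  // nodes in the current tree layer
};

// One beam step: every node of the previous layer expands into
// prev.beam_size children, matching the sub_request_num update above.
BeamLayerCounts advance_layer(BeamLayerCounts const &prev, int next_beam_size) {
  BeamLayerCounts next;
  next.beam_size = next_beam_size;
  next.sub_request_num = prev.sub_request_num * prev.beam_size;
  assert(next.sub_request_num <= kMaxSpeculativeTreeBranches &&
         "exceed maximum nodes per layer");
  return next;
}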
- update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + update_beam_metadata( + new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); + } else { assert(false && "Request should not be pending in beam search phase"); } @@ -1035,6 +1091,7 @@ BeamSearchBatchConfig request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { + // todo this is replaced by this_layer_size, but should check it new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { assert(false && "Request should be done"); @@ -1057,9 +1114,22 @@ BeamSearchBatchConfig } // register more tokens due to the beam width + + // copy metadata + memcpy(&new_bc.causalMask[i], + &old_bc.causalMask[i], + sizeof(BatchConfig::BitMask)); + BeamTree tree = request.beam_trees[old_bc.model_id]; + appendBitMask(new_bc.causalMask[i], + new_bc.beamRequestsInfo[i].sub_request_num, + old_bc.beamRequestsInfo[i].beam_size, + old_bc.beamRequestsInfo[i].sub_request_num, + tree, + old_bc.beamRequestsInfo[i].current_depth); + // assert(false); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { + for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1069,6 +1139,8 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; + + num_generation_tokens++; } } } @@ -1079,6 +1151,7 @@ BeamSearchBatchConfig if (old_bc.request_completed[i] || old_bc.request_running[i]) { continue; } + num_active_req++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); @@ -1098,18 +1171,34 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH // entries? - new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; - // update the parentid, accumalated_probs, depth, and token_ids new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + spec_infer_tree_width.size() > ssm_decoding_steps + ? 
spec_infer_tree_width[ssm_decoding_steps] + : 1; + printf("beam size: %d, %d\n", + new_bc.beamRequestsInfo[i].beam_size, + ssm_decoding_steps); new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + new_bc.sub_requests[i] = + old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].sub_request_num = + old_bc.beamRequestsInfo[i].sub_request_num; + + assert(new_bc.beamRequestsInfo[i].sub_request_num <= + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && + "exceed maximum nodes per layer"); + + // update the parentid, accumalated_probs, depth, and token_ids if (request.status == Request::PENDING) { // if the request is pending, we need to update the beam search @@ -1121,6 +1210,10 @@ BeamSearchBatchConfig assert(false && "Request should be pending"); } + memcpy(&new_bc.causalMask[i], + &old_bc.causalMask[i], + sizeof(BatchConfig::BitMask)); + if (new_bc.requestsInfo[i].first_token_depth_in_request >= request.tokens.size()) { // request is done @@ -1133,6 +1226,13 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; + BeamTree tree = request.beam_trees[old_bc.model_id]; + appendBitMask(new_bc.causalMask[i], + new_bc.beamRequestsInfo[i].sub_request_num, + old_bc.beamRequestsInfo[i].beam_size, + old_bc.beamRequestsInfo[i].sub_request_num, + tree, + old_bc.beamRequestsInfo[i].current_depth); } if (verbose) { @@ -1152,7 +1252,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { + for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1229,21 +1329,20 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( max_prompt_load_size -= 1; } } - + int num_active_req = -1; for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; } + num_active_req++; size_t guid = old_batches.at(0).requestsInfo[i].request_guid; Request &request = all_requests[guid]; // Profiling - profiling_requests[request.guid].decoding_steps += 1; + profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { new_bc.request_running[i] = true; - std::cout << "[Verify] Request " << request.guid << " is running" - << std::endl; // Get the dfs tree std::vector>> @@ -1274,31 +1373,44 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + + // copy bitmask to verify batchconfig + memcpy(&(new_bc.causalMask[i]), + &(old_batches.at(0).causalMask[i]), + sizeof(BatchConfig::BitMask)); // TODO: Check this new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; + // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " + // << new_bc.causalMask[i].tree_size << ", " + // << new_bc.causalMask[i].non_tree_cache_size << "\n"; + // std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) + // << "\n"; + // 
Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { - for (int j = 0; j < dfs_tree_inputs.size(); j++) { - if (j < committed_tokens.at(guid).size()) { - auto committed_token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } - new_bc.num_tokens_to_commit++; - request.llm_cache_size++; + for (int j = 0; j < committed_tokens.at(guid).size(); j++) { + // if (j < committed_tokens.at(guid).size()) { + + auto committed_token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second + << std::endl; } + new_bc.num_tokens_to_commit++; + request.llm_cache_size++; + // } } } if (verbose) { @@ -1324,6 +1436,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_depth_in_request = request.tokens.size() - 1; + bool cutLayer = false; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); @@ -1340,11 +1453,27 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == get_max_tokens_per_batch() - 1) { + if (new_bc.num_tokens == get_max_tokens_per_batch() && + (j != dfs_tree_inputs.size() - 1)) { + cutLayer = true; break; } } + // delete the last incomplete layer + if (cutLayer) { + int total_tokens = new_bc.num_tokens; + for (int j = total_tokens - 1; j >= 1; j--) { + new_bc.num_tokens--; + new_bc.requestsInfo[i].num_tokens_in_batch--; + // std::cout << "cut: " << j << "\n"; + if (new_bc.tokensInfo[j].abs_depth_in_request != + new_bc.tokensInfo[j - 1].abs_depth_in_request) { + break; + } + } + } + } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; if (verbose) { @@ -1374,6 +1503,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } + memcpy(&(new_bc.causalMask[i]), + &(old_batches.at(0).causalMask[i]), + sizeof(BatchConfig::BitMask)); + // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = request.llm_cache_size; @@ -1382,6 +1515,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; @@ -1395,6 +1529,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << std::endl; if (request.llm_cache_size < request.initial_len) { + // std::cout << "Initialization (prompt) phase: " + // << 
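When the verify batch fills up in the middle of a tree layer, the loop above sets cutLayer and then removes trailing tokens until it crosses a depth boundary, so only complete layers are submitted for verification. The same trimming step over a plain token vector, with TokenInfo as a stand-in:

#include <vector>

struct TokenInfo {
  int abs_depth_in_request = 0;
};

// Pop trailing tokens; stop right after removing the first token of the
// partially loaded layer (the point where the depth changes).
void drop_incomplete_last_layer(std::vector<TokenInfo> &tokens) {
  for (int j = static_cast<int>(tokens.size()) - 1; j >= 1; j--) {
    bool boundary =
        tokens[j].abs_depth_in_request != tokens[j - 1].abs_depth_in_request;
    tokens.pop_back();
    if (boundary) {
      break;
    }
  }
}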
new_bc.requestsInfo[i].num_tokens_in_batch << ", " + // << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1402,7 +1539,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens[request.llm_cache_size + j]; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; - new_bc.num_tokens++; } @@ -1428,6 +1564,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else { // launch the request into running phase after loading all prompt if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { + // std::cout << "Initialization running phase: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; new_bc.request_running[i] = true; @@ -1476,26 +1614,41 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != guid) { + // std::cout << "i is: " << i << "old guid" << guid << " new guid" + // << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] + // .request_guid + // << "\n"; + int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; + + // int leaf_node_num = old_bc.sub_requests[index]; + int leaf_node_num = + old_bc.beamRequestsInfo[index].sub_request_num * beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results - int beam_width = old_bc.beamRequestsInfo[index].beam_size; + // int beam_width = old_bc.beamRequestsInfo[index].beam_size; // Count tokens sent to model in this request to find the final token's // index result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - beam_width; + beam_size; if (verbose) { std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] << "\n"; + << ", value: " << result.token_ids[result_index] + << ", leaf node num: " << leaf_node_num << ", depth" << depth + << ", beam size: " << beam_size << "\n"; } Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; + if (old_bc.requestsInfo[index].num_tokens_in_batch == 0) { + continue; + } + if (depth == 1) { // store the last input into the tree; if (verbose) { @@ -1507,14 +1660,20 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.tokens.back(); request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; + request.beam_trees.at(old_bc.model_id) + .treeLayers[0] + .nodes_num_this_layer = 1; if (verbose) { std::cout << "Store the previous last token to the tree root: " << request.tokens.back() << "\n"; } } + request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .nodes_num_this_layer = leaf_node_num; + for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { - for (int beam_id = 0; beam_id < beam_width; beam_id++) { request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .tokens[beam_id] = result.token_ids[result_index]; @@ -1534,10 +1693,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } result_index += 1; } - // update the guid and start_depth for current request if (i < old_bc.num_tokens) { - guid = old_bc.requestsInfo[index].request_guid; + int new_req_idx = 
old_bc.tokensInfo[i].request_index; + guid = old_bc.requestsInfo[new_req_idx].request_guid; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } @@ -1546,6 +1705,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // for updating the beam search metadata in requests in incremental phase void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamSearchBatchConfig const &old_bc, BeamTree &tree, int request_index) { @@ -1556,6 +1716,9 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1; int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; + // int leaf_node_num = old_bc.sub_requests[request_index]; + int leaf_node_num = new_bc.beamRequestsInfo[request_index].sub_request_num; + if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { // TODO: check if this is correct // for (int j = 0; j < beam_size; j++) { @@ -1568,48 +1731,15 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // Do nothing // assert(false); } else { - std::set parents; - std::set childs; - // cache stealing - for (int j = 0; j < beam_size; j++) { - int parent_id = tree.treeLayers[depth].parent_ids[j]; - if (childs.find(parent_id) == childs.end()) { - // copy beam slot - new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[parent_id] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[parent_id] = - tree.treeLayers[depth].tokens[j]; - parents.emplace(j); - childs.emplace(parent_id); - } - } - if (parents.size() < beam_size) { - for (int j = 0; j < beam_size; j++) { - if (parents.find(j) == parents.end()) { - // this slot has not been assigned - // find the smallest not assigned child and put in - if (verbose) { - std::cout << "request_index" << request_index - << ", miss slot: " << j << "\n"; - } - for (int k = 0; k < beam_size; k++) { - if (childs.find(k) == childs.end()) { - // parent -> j to child k; - new_bc.beamRequestsInfo[request_index].parent_id[k] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[k] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[k] = - tree.treeLayers[depth].tokens[j]; - parents.emplace(j); - childs.emplace(k); - break; - } - } - } - } + for (int j = 0; j < leaf_node_num; j++) { + new_bc.beamRequestsInfo[request_index].parent_id[j] = + tree.treeLayers[depth].parent_ids[j]; + new_bc.beamRequestsInfo[request_index].probs[j] = + tree.treeLayers[depth].probs[j]; + new_bc.beamRequestsInfo[request_index].tokens[j] = + tree.treeLayers[depth].tokens[j]; + // std::cout << "token: " << j << ": " + // << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; } } if (verbose) { @@ -1625,6 +1755,139 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, } } +// bit mask related function + +// prompt phase, init task +void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, + int initLength) { + assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + // eg. 
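The rewritten update_beam_metadata drops the earlier cache-stealing slot exchange and simply copies the current layer's parent ids, probabilities, and tokens slot-for-slot for every node of that layer. In sketch form, with stand-in structs and a made-up capacity constant:

constexpr int kMaxSlots = 64;  // stand-in capacity for the per-request arrays

struct BeamLayer {
  int parent_ids[kMaxSlots];
  float probs[kMaxSlots];
  int tokens[kMaxSlots];
};

struct BeamRequestSlots {
  int parent_id[kMaxSlots];
  float probs[kMaxSlots];
  int tokens[kMaxSlots];
};

// Slot j of the tree layer goes straight into slot j of the request's beam
// metadata; no parent/child slot reshuffling is attempted anymore.
void copy_layer_into_request(BeamRequestSlots &slots, BeamLayer const &layer,
                             int leaf_node_num) {
  for (int j = 0; j < leaf_node_num; j++) {
    slots.parent_id[j] = layer.parent_ids[j];
    slots.probs[j] = layer.probs[j];
    slots.tokens[j] = layer.tokens[j];
  }
}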
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + bitmask.non_tree_cache_size = 0; + bitmask.tree_size = initLength; + + bitmask.prompt_size = initLength; + bitmask.this_layer_size = initLength; + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = i; j < bitmask.prompt_size; j++) { + bitmask.mask[i] |= (1 << j); + } + } + // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; +} + +// prepare next init +void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, + int initLength, + int non_tree_size) { + // assert(initLength == 1); + // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + assert(initLength >= 1 && "verified token num should >= 1"); + + // std::cout << "non tree size: " << non_tree_size << ", " + // << bitmask.non_tree_cache_size << "\n"; + + bitmask.non_tree_cache_size = non_tree_size + initLength - 1; + bitmask.tree_size = 1; + bitmask.this_layer_size = initLength; + // std::cout << "non_tree_size: " << non_tree_size << "\n"; + bitmask.prompt_size = 1; + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = i; j < bitmask.prompt_size; j++) { + bitmask.mask[i] |= (1 << j); + } + } + + // std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) + // << "\n"; +} + +// prepare next beam, append layers to the tree +void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, + int newNodes, + int preBeamSize, + int old_sub_num, + BeamTree const tree, + int currentDepth) { + int pre_tree_size = bitmask.tree_size; + bitmask.tree_size += newNodes; + bitmask.this_layer_size = newNodes; + assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + // preBeamSize: replicate num + + // add relationship with input/prompt + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = pre_tree_size; j < bitmask.tree_size; j++) { + bitmask.mask[i] |= (1 << j); + // std::cout << "see bit mask append: " << i << ", to" << j + // << std::bitset<64>(bitmask.mask[i]) << "\n"; + } + } + + // std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " + // << pre_tree_size << ", " << bitmask.prompt_size << ", " + // << preBeamSize << "\n"; + + // int num_groups = newNodes / preBeamSize; + // int group_size = newNodes / num_groups; + // add relations to branch + // requests in same groups share same relations, except the last token. 
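initBitMask gives each prompt token visibility of itself and every later prompt token inside the speculative tree, producing the bit pattern spelled out in the comment above. A standalone version over 64-bit rows; the BitMask struct and kMaxSpecTreeTokens are stand-ins for BatchConfig::BitMask and MAX_SPEC_TREE_TOKEN_NUM:

#include <cassert>
#include <cstdint>

constexpr int kMaxSpecTreeTokens = 64;  // stand-in constant

struct BitMask {
  uint64_t mask[kMaxSpecTreeTokens] = {0};
  int tree_size = 0;
  int prompt_size = 0;
  int this_layer_size = 0;
  int non_tree_cache_size = 0;
};

// Row i gets bits i..prompt_size-1 set; for 4 tokens that is
// row0 = ...1111, row1 = ...1110, row2 = ...1100, row3 = ...1000.
void init_bit_mask(BitMask &bitmask, int init_length) {
  assert(init_length <= kMaxSpecTreeTokens && "do not support tree size > 64");
  bitmask.non_tree_cache_size = 0;
  bitmask.tree_size = init_length;
  bitmask.prompt_size = init_length;
  bitmask.this_layer_size = init_length;
  for (int i = 0; i < bitmask.prompt_size; i++) {
    for (int j = i; j < bitmask.prompt_size; j++) {
      // 64-bit shift so every slot up to 63 stays addressable
      bitmask.mask[i] |= (uint64_t(1) << j);
    }
  }
}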
+ + // set middle layers + // skip the root prompt/tokens + int token_idx = bitmask.prompt_size; + int new_nodes_start_idx = pre_tree_size; + // std::cout << "new nodes start " << new_nodes_start_idx << "\n"; + for (int i = 1; i < currentDepth; i++) { + new_nodes_start_idx = pre_tree_size; + int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer; + // std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer + // << "group size: " << newNodes / nodes_this_layer << "\n"; + for (int j = 0; j < nodes_this_layer; j++) { + int group_size = newNodes / nodes_this_layer; + for (int k = 0; k < group_size; k++) { + bitmask.mask[token_idx] |= (1 << new_nodes_start_idx); + new_nodes_start_idx += 1; + } + token_idx += 1; + } + } + + // std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " + // << new_nodes_start_idx << ", " << newNodes + // << "current depth: " << currentDepth << "\n"; + // std::cout << "new nodes end " << new_nodes_start_idx << "\n"; + + // std::cout << "tree size: " << bitmask.tree_size << "\n"; + assert(token_idx == pre_tree_size); + assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); + + // assert(currentDepth <= 2); + // set last layer, all tokens are only relevant to it self; + for (int i = token_idx; i < bitmask.tree_size; i++) { + bitmask.mask[i] |= (1 << i); + // std::cout << "set rel: " << i << "to: " << i << "\n"; + } + + // if(bitmask.non_tree_cache_size == 19 && bitmask.tree_size > 2){ + // assert(false); + // } + + // std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; + // std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) + // << "\n"; +} + bool PreOrder( BeamTree const &tree, int max_depth, @@ -1740,12 +2003,43 @@ std::vector> // In this case the inputSeriedTree ends with padding 0s assert(inputSerializedTree.size() >= outputSerializedTree.size()); + int *treeLayers = new int[inputSerializedTree.size()]; + int node_num = 1; + int layer_num = 0; + for (int token_id = 0; token_id < inputSerializedTree.size(); token_id++) { + if (token_id == (inputSerializedTree.size() - 1) || + inputSerializedTree.at(token_id + 1).second != + inputSerializedTree.at(token_id).second) { + treeLayers[layer_num] = node_num; + layer_num += 1; + node_num = 1; + } else { + node_num++; + } + } + + // to avoid branch switch when same tokens in input tree. + // todo, only checked for N->1->1->1 cases + + bool findFirst = false; + layer_num = -1; + int first_layer_slot = 0; + int first_layer_slot_total = 0; + int processed_whole_layer_tokens = 0; + for (int i = 0; i < outputSerializedTree.size(); i++) { auto input = inputSerializedTree.at(i); auto output = outputSerializedTree.at(i); + if (i == 0 || inputSerializedTree.at(i - 1).second != + inputSerializedTree.at(i).second) { + layer_num += 1; + processed_whole_layer_tokens += i == 0 ? 0 : treeLayers[layer_num - 1]; + } + if (i == 0) { verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); // > if (input.first == verifiedTree.back().first && input.second == verifiedTree.back().second) { - verifiedTree.push_back(output); - new_committed_tokens.push_back(std::make_pair( - input.second, - committed_tokens.at(guid).at(i).second)); // + if (findFirst) { + // must in this branch. 
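traverse_verify_tree now derives how many nodes each tree layer contributes from the serialized (token, depth) pairs before matching the input and output trees. The per-layer counting on its own, returning a vector rather than the raw treeLayers array:

#include <utility>
#include <vector>

using TokenId = int;

// A layer ends where the depth (the .second field) of the next entry
// changes; node_num counts the entries seen since the last boundary.
std::vector<int> count_nodes_per_layer(
    std::vector<std::pair<TokenId, int>> const &serialized_tree) {
  std::vector<int> layers;
  int node_num = 1;
  for (size_t i = 0; i < serialized_tree.size(); i++) {
    if (i + 1 == serialized_tree.size() ||
        serialized_tree[i + 1].second != serialized_tree[i].second) {
      layers.push_back(node_num);
      node_num = 1;
    } else {
      node_num++;
    }
  }
  return layers;
}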
+ int layer_slot = i - processed_whole_layer_tokens; + int layer_slot_total = treeLayers[layer_num]; + if ((first_layer_slot == layer_slot)) { + verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( + input.second, committed_tokens.at(guid).at(i).second)); + // at this point, you'll not go other branches + // std::cout << "verify tree push back: " << output.first + // << ", tree size is: " << verifiedTree.size() + // << ", ??: " << input.first << ", " << input.second << + // "\n"; + + } else { + printf("not correct slot\n"); + } + } else { + verifiedTree.push_back(output); + first_layer_slot = i - processed_whole_layer_tokens; + first_layer_slot_total = treeLayers[layer_num]; + findFirst = true; + new_committed_tokens.push_back(std::make_pair( + input.second, + committed_tokens.at(guid).at(i).second)); // + // at this point, you'll not go other branches + // std::cout << "verify tree push back: " << output.first + // << ", tree size is: " << verifiedTree.size() + // << ", ??: " << input.first << ", " << input.second << "\n"; + } + assert(committed_tokens.at(guid).at(i).first == input.second); } } @@ -1804,6 +2125,8 @@ std::vector> << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; std::cout << "[Traverse Beam Tree] beam_width: " << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; + std::cout << "[Traverse Beam Tree] start index: " + << first_token_depth_in_request << "\n"; } auto guid = old_bc.requestsInfo[request_index].request_guid; @@ -1811,18 +2134,30 @@ std::vector> // std::cout << "request.beam_trees.size(): " << request.beam_trees.size() // << std::endl; BeamTree tree = request.beam_trees.at(old_bc.model_id); - // std::cout << "\n\n"; + // std::cout << "print beam tree: " + // << "\n"; + std::vector> serializedTree; + for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) { + // std::cout << "tree layer: " << i + // << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer + // << "\n"; + // push tokens into tree + for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) { + // std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; + serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i)); + } + } // token, index // todo make this one global for different stages - std::vector> serializedTree; - PreOrder(tree, - old_bc.beamRequestsInfo[request_index].max_depth, - 0, - old_bc.beamRequestsInfo[request_index].beam_size, - 0, - serializedTree, - verbose); + + // PreOrder(tree, + // old_bc.beamRequestsInfo[request_index].max_depth, + // 0, + // old_bc.beamRequestsInfo[request_index].beam_size, + // 0, + // serializedTree, + // verbose); // print it if (verbose) { @@ -1857,6 +2192,10 @@ std::vector> input_trees, int root_depth, RequestGuid guid) { + assert(input_trees.size() == 1 && "currently using one ssm"); + dfs_tree_inputs[guid] = input_trees.at(0); + return input_trees.at(0); + std::vector> merged_tree; std::unordered_map> childrens; diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index 1e756606f8..fadbf80d6d 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -58,6 +58,91 @@ void RequestManager::load_tokens_task( stream)); } +void RequestManager::load_batch_config_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 0); + assert(task->regions.size() == 0); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // BatchConfig const 
batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + + // copy meta data to workSpace + FFHandler handle = *((FFHandler const *)task->local_args); + size_t total_copy_size = 0; + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + sizeof(BatchConfig::tokensInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::tokensInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->requestsInfo), + sizeof(BatchConfig::requestsInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::requestsInfo); + + // load speculative metadata + if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + BeamSearchBatchConfig const *beam_batch_config = + static_cast(batch_config); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + hipMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + hipMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::causalMask); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + TreeVerifyBatchConfig const *tree_batch_config = + static_cast(batch_config); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + } + + // add a size check + assert(total_copy_size <= handle.batch_config_metadata_size); +} + void RequestManager::load_positions_task( Task const *task, std::vector const ®ions, diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index cd3e03fff6..51c52c3026 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -30,6 +30,7 @@ void RequestManager::load_tokens_task( // BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; // Extreme long prompts are not supported, only load up to @@ -57,6 +58,91 @@ void RequestManager::load_tokens_task( stream)); } +void RequestManager::load_batch_config_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 0); + assert(task->regions.size() == 0); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = 
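load_batch_config_task packs each metadata field back-to-back into the preallocated device buffer, accumulating total_copy_size and asserting at the end that everything fits in batch_config_metadata_size. The same packing logic as a host-only sketch, with memcpy standing in for the asynchronous device copies:

#include <cassert>
#include <cstddef>
#include <cstring>

struct PackedBuffer {
  void *base = nullptr;
  size_t capacity = 0;
  size_t used = 0;
};

// Append one field at the current offset, mirroring the
// "copy, then total_copy_size += sizeof(field)" pattern above.
void append_field(PackedBuffer &buf, void const *field, size_t field_size) {
  assert(buf.used + field_size <= buf.capacity);
  std::memcpy(static_cast<char *>(buf.base) + buf.used, field, field_size);
  buf.used += field_size;
}

Because the field order differs by mode (beam search adds beamTokenInfo and beamRequestsInfo before causalMask, while tree verification appends committed_tokens after it), consumers of this buffer must use offsets that match the order written here.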
BatchConfig::from_future(task->futures[0]); + + // copy meta data to workSpace + FFHandler handle = *((FFHandler const *)task->local_args); + size_t total_copy_size = 0; + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + sizeof(BatchConfig::tokensInfo), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::tokensInfo); + + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->requestsInfo), + sizeof(BatchConfig::requestsInfo), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::requestsInfo); + + // load speculative metadata + if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + BeamSearchBatchConfig const *beam_batch_config = + static_cast(batch_config); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::causalMask); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + TreeVerifyBatchConfig const *tree_batch_config = + static_cast(batch_config); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + } + + // add a size check + assert(total_copy_size <= handle.batch_config_metadata_size); +} + void RequestManager::load_positions_task( Task const *task, std::vector const ®ions, From 3047c82aab223b7ff2f6b49cc5489bd89d5b07af Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 31 Dec 2023 19:17:30 -0500 Subject: [PATCH 289/344] Reducing memory requirements by reusing logical regions (#1254) * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * Reuse regions for inference to reduce memory requirement * bug fix when reused regions are assigned to different pipeline stages --- include/flexflow/model.h | 11 +- include/flexflow/ops/fused.h | 11 +- src/mapper/mapper.cc | 3 +- src/ops/fused.cc | 56 ++++++- src/ops/fused.cu | 31 ++-- src/ops/inc_multihead_self_attention.cu | 14 ++ src/runtime/inference_manager.cc | 190 +++++++++--------------- src/runtime/model.cc | 116 ++++++++------- 8 files changed, 239 insertions(+), 193 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 16df99ab1a..cda1f91c89 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1034,8 
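In the follow-up patch, FusedOp::use_same_regions compares two parallel tensors either directly by region or, when a parallel-tensor mapping is supplied, through the regions of their mapped instances (and then expects every mapped pair to agree). A simplified sketch with stand-in Region and tensor types keyed by pointer:

#include <cassert>
#include <unordered_map>
#include <vector>

struct Region { int id = -1; };
struct TensorStub { Region region; };

using Mapping =
    std::unordered_map<TensorStub const *, std::vector<TensorStub const *>>;

// Without a mapping, compare regions directly; with one, compare the first
// mapped instances and check that the remaining instances agree too.
bool use_same_regions(TensorStub const *a, TensorStub const *b,
                      Mapping const *mapping = nullptr) {
  if (mapping == nullptr) {
    return a->region.id == b->region.id;
  }
  std::vector<TensorStub const *> const &va = mapping->at(a);
  std::vector<TensorStub const *> const &vb = mapping->at(b);
  assert(va.size() == vb.size());
  bool same = va[0]->region.id == vb[0]->region.id;
  if (same) {
    for (size_t i = 1; i < va.size(); i++) {
      assert(va[i]->region.id == vb[i]->region.id);
    }
  }
  return same;
}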
+1034,15 @@ class FFModel { void get_metrics(); void backward(int seq_length = -1); void update(); - bool apply_fusion(std::vector const &operators, - std::vector &new_operators); + bool apply_fusion( + std::vector const &operators, + std::vector &new_operators, + std::unordered_map> + *parallel_tensor_mapping = nullptr); + bool check_operators_integrity( + std::vector const &old_operators, + std::unordered_map> + *pt_mapping = nullptr); Op *get_final_operator() const; void compile(LossType loss_type, std::vector const &metrics, diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index 87c2201c28..a8326e9ab4 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -23,7 +23,16 @@ class FusedOp : public Op { SOURCE_OUTPUT, }; FusedOp(FFModel &model, Op *op); - bool add_operator(FFModel &model, Op *op); + static bool use_same_regions( + ParallelTensor const source_tensor, + ParallelTensor const target_tensor, + std::unordered_map> + *pt_mapping = nullptr); + bool add_operator( + FFModel &model, + Op *op, + std::unordered_map> + *parallel_tensor_mapping = nullptr); ParallelTensor init_inout(FFModel &model, const ParallelTensor input) { assert(0); return ParallelTensor(); diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index a86a6167a6..a2fb1d89be 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -934,13 +934,14 @@ void FFMapper::map_inline(const MapperContext ctx, &footprint)) { log_ff_mapper.error( "FlexFlow Mapper failed allocation of size %zd bytes" - " for region requirement of inline ammping in task %s (UID %lld)" + " for region requirement of inline mapping in task %s (UID %lld)" " in memory " IDFMT "for processor " IDFMT ".", footprint, inline_op.parent_task->get_task_name(), inline_op.parent_task->get_unique_id(), target_memory.id, inline_op.parent_task->current_proc.id); + printf("target_memory.kind() = %d\n", target_memory.kind()); assert(false); } else { output.chosen_instances.push_back(result); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 1d5db2f461..9ad5c4dc9c 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -115,7 +115,42 @@ FusedOp::FusedOp(FFModel &model, Op *op) } } -bool FusedOp::add_operator(FFModel &model, Op *op) { +bool FusedOp::use_same_regions( + ParallelTensor const source_tensor, + ParallelTensor const target_tensor, + std::unordered_map> + *pt_mapping) { + if (pt_mapping == nullptr) { + return (source_tensor->region == target_tensor->region); + } else { + assert(pt_mapping->find(source_tensor) != pt_mapping->end()); + assert(pt_mapping->find(target_tensor) != pt_mapping->end()); + std::vector const &source_mapped_tensor_vector = + (*pt_mapping)[source_tensor]; + std::vector const &target_mapped_tensor_vector = + (*pt_mapping)[target_tensor]; + assert(source_mapped_tensor_vector.size() == + target_mapped_tensor_vector.size()); + bool same_region = source_mapped_tensor_vector[0]->region == + target_mapped_tensor_vector[0]->region + ? 
true + : false; + // Same that the two vectors use the exact same regions + if (same_region) { + for (size_t i = 0; i < source_mapped_tensor_vector.size(); i++) { + assert(source_mapped_tensor_vector[i]->region == + target_mapped_tensor_vector[i]->region); + } + } + return same_region; + } +} + +bool FusedOp::add_operator( + FFModel &model, + Op *op, + std::unordered_map> + *pt_mapping) { // Context ctx = model.config.lg_ctx; // Runtime* runtime = model.config.lg_hlr; // Currently assume fusion optimization is performed @@ -164,7 +199,7 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { for (int i = 0; i < op->numInputs; i++) { bool found = false; for (int j = 0; j < numInputs; j++) { - if (inputs[j]->region == op->inputs[i]->region) { + if (use_same_regions(inputs[j], op->inputs[i], pt_mapping)) { // This input is one of my inputs assert(!found); assert(inputs[j]->region != LogicalRegion::NO_REGION); @@ -175,7 +210,7 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { } } for (int j = 0; j < numOutputs; j++) { - if ((outputs[j]->region == op->inputs[i]->region) && (!found)) { + if (use_same_regions(outputs[j], op->inputs[i], pt_mapping) && (!found)) { // This input is one of my outputs assert(!found); assert(outputs[j]->region != LogicalRegion::NO_REGION); @@ -201,6 +236,11 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { for (int i = 0; i < op->numWeights; i++) { bool found = false; for (int j = 0; j < numWeights; j++) { + // pt_mapping does not apply to weights + if (pt_mapping != nullptr) { + assert(pt_mapping->find(weights[j]) == pt_mapping->end()); + assert(pt_mapping->find(op->weights[i]) == pt_mapping->end()); + } if (weights[j]->region == op->weights[i]->region) { assert(!found); assert(weights[j]->region != LogicalRegion::NO_REGION); @@ -226,7 +266,7 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { for (int i = 0; i < op->numOutputs; i++) { bool found = false; for (int j = 0; j < numOutputs; j++) { - if (outputs[j]->region == op->outputs[i]->region) { + if (use_same_regions(outputs[j], op->outputs[i], pt_mapping)) { assert(!found); found = true; op_output_source[output_offset + i] = SOURCE_OUTPUT; @@ -347,22 +387,26 @@ void FusedOp::init_inference(FFModel const &ff, Domain domain = runtime->get_index_space_domain(ctx, parallel_is); int ioff = 0, ooff = 0; for (int op = 0; op < numOperators; op++) { - // prepare batch_inputs, batch_outputs for operators[i] + // prepare batch_inputs, batch_outputs for operators[op] std::vector my_batch_inputs; std::vector my_batch_outputs; for (int i = 0; i < op_num_inputs[op]; i++) { int my_off = op_input_idx[i + ioff]; if (op_input_source[i + ioff] == SOURCE_INPUT) { + assert(my_off < batch_inputs.size()); my_batch_inputs.push_back(batch_inputs[my_off]); } else if (op_input_source[i + ioff] == SOURCE_OUTPUT) { + assert(my_off < batch_outputs.size()); my_batch_inputs.push_back(batch_outputs[my_off]); } else { assert(false); } } for (int i = 0; i < op_num_outputs[op]; i++) { + int my_off = op_output_idx[i + ooff]; assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - my_batch_outputs.push_back(batch_outputs[i + ooff]); + assert(my_off < batch_outputs.size()); + my_batch_outputs.push_back(batch_outputs[my_off]); } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index b157453035..c6ba0b04c5 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -173,10 +173,11 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[i] = 
weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[i + ooff]; + // my_od[i] = output_domain[my_off]; + // my_op[i] = output_ptr[my_off]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -619,9 +620,11 @@ __host__ void int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; + assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; + assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { assert(false); @@ -631,13 +634,16 @@ __host__ void assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + assert(my_off < fused->numOutputs); // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -1108,7 +1114,8 @@ __host__ void weight_accessor[fused->op_weight_idx[i + woff]]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); + int my_off = fused->op_output_idx[i + ooff]; + output_accessors_to_save.push_back(output_accessor[my_off]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -1310,13 +1317,13 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; - // my_grad_od[i] = output_grad_domain[fused->op_output_idx[i + ooff]]; - // my_grad_op[i] = output_grad_ptr[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; + int my_off = fused->op_output_idx[i + ooff]; + // my_od[i] = output_domain[my_off]; + // my_op[i] = output_ptr[my_off]; + my_output_accessor[i] = output_accessor[my_off]; + // my_grad_od[i] = output_grad_domain[my_off]; + // my_grad_op[i] = output_grad_ptr[my_off]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index da70e23f87..db64868cb9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu 
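The fused-op changes above stop assuming that operator op's outputs sit contiguously at offset ooff in the fused op's output list and instead resolve each one through op_output_idx, which is necessary once region reuse deduplicates that list. A minimal sketch of the lookup pattern, with purely illustrative names rather than the actual FlexFlow types:

#include <cassert>
#include <vector>

struct Accessor {}; // stand-in for a tensor accessor type

// op_output_idx[ooff + i] maps the i-th output of the current sub-operator to
// its slot in the fused op's (possibly deduplicated) output list.
std::vector<Accessor>
    gather_op_outputs(std::vector<Accessor> const &fused_outputs,
                      std::vector<int> const &op_output_idx,
                      int ooff,
                      int num_outputs) {
  std::vector<Accessor> my_outputs;
  for (int i = 0; i < num_outputs; i++) {
    int my_off = op_output_idx[ooff + i];
    // bounds check, mirroring the asserts added in the patch
    assert(my_off >= 0 && my_off < (int)fused_outputs.size());
    my_outputs.push_back(fused_outputs[my_off]);
  }
  return my_outputs;
}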
@@ -1530,4 +1530,18 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( half const *bias_ptr, int num_tokens, cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 8af0ed8978..cc76da58bb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -151,7 +151,9 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { std::vector list; bool found_parallel_tensor = false; - if (model->cpu_offload) { + // Always enable memory reuse + // if (model->cpu_offload) { + if (true) { for (auto const &pre_pt : tensor_buffer) { bool used_by_future_operator = false; bool used_by_current_operator = false; @@ -159,6 +161,12 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Continue if shape mismatches continue; } + // Skip if pre_pt and pt_base are in different pipeline stages + // we compare their pipeline stages using the machine views + // of the first data pipeline + if (pre_pt.second[0]->machine_view != machine_views[0]) { + continue; + } // Check that pt cannot be used as an input to the current operator for (int j = 0; j < op->numInputs; j++) { if (parallel_tensor_list_overlaps(tensor_buffer[op->inputs[j]], @@ -221,6 +229,67 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } // std::cout << std::endl; } + + // Perform fusion optimizations + if (model->config.perform_fusion) { + fprintf(stderr, "Applying fusion optimizations during compilation...\n"); + fprintf( + stderr, "%zu operators before fusion...\n", model->operators.size()); + std::vector new_operators; + std::vector old_operators = model->operators; + while ( + model->apply_fusion(model->operators, new_operators, &tensor_buffer)) { + for (size_t i = 0; i < new_operators.size(); i++) { + for (int idx = 0; idx < new_operators[i]->numInputs; idx++) { + for (size_t j = i + 1; j < new_operators.size(); j++) { + if (new_operators[i]->inputs[idx]->owner_op == new_operators[j]) { + assert(false); + } + } + } + } + model->operators = new_operators; + } + assert(model->check_operators_integrity(old_operators, &tensor_buffer)); + fprintf(stderr, "%zu operators after fusion...\n", model->operators.size()); + } + + // print optimized graph + for (size_t i = 0; i < model->operators.size(); i++) { + Op *op = model->operators[i]; + if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { + continue; + } + printf("operator[%zu]: type(%s) guid(%lu)\n", + i, + get_operator_type_name(model->operators[i]->op_type).c_str(), + model->operators[i]->op_guid); + for (int j = 0; j < op->numInputs; j++) { + assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; + printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + for (int j = 0; j < op->numOutputs; j++) { + LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; + printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", + j, + 
handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + for (int j = 0; j < op->numWeights; j++) { + LogicalRegion handle = op->weights[j]->region; + printf("\tweights[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + } } void InferenceManager::init_operators_inference(FFModel *model) { @@ -577,124 +646,7 @@ void FFModel::compile_inference() { assert(op->outputs[i]->parallel_tensor_guid != 0); } } - // Perform fusion optimizations - if (config.perform_fusion) { - fprintf(stderr, "Applying fusion optimizations during compilation...\n"); - fprintf(stderr, "%zu operators before fusion...\n", operators.size()); - std::vector new_operators; - std::vector old_operators = operators; - while (apply_fusion(operators, new_operators)) { - for (size_t i = 0; i < new_operators.size(); i++) { - for (int idx = 0; idx < new_operators[i]->numInputs; idx++) { - for (size_t j = i + 1; j < new_operators.size(); j++) { - if (new_operators[i]->inputs[idx]->owner_op == new_operators[j]) { - assert(false); - } - } - } - } - operators = new_operators; - } - // Check integrity - for (size_t l = 0; l < operators.size(); l++) { - if (operators[l]->op_type == OP_FUSED) { - FusedOp *fused = (FusedOp *)operators[l]; - int ioff = 0, woff = 0, ooff = 0; - for (int op = 0; op < fused->numOperators; op++) { - Op *old_op = fused->operators[op]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == FusedOp::SOURCE_INPUT) { - assert(fused->inputs[my_off]->region == - old_op->inputs[i]->region); - } else if (fused->op_input_source[i + ioff] == - FusedOp::SOURCE_OUTPUT) { - assert(fused->outputs[my_off]->region == - old_op->inputs[i]->region); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - int my_off = fused->op_weight_idx[i + woff]; - assert(fused->op_weight_source[i + woff] == FusedOp::SOURCE_WEIGHT); - assert(fused->weights[my_off]->region == - old_op->weights[i]->region); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(fused->outputs[my_off]->region == - old_op->outputs[i]->region); - } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; - } - } else { - bool found = false; - for (size_t i = 0; i < old_operators.size(); i++) { - if (old_operators[i] == operators[l]) { - assert(!found); - found = true; - } - } - assert(found); - } - } - fprintf(stderr, "%zu operators after fusion...\n", operators.size()); - for (size_t i = 0; i < operators.size(); i++) { - Op *op = operators[i]; - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(operators[i]->op_type).c_str(), - operators[i]->op_guid); - for (int j = 0; j < op->numInputs; j++) { - LogicalRegion handle = op->inputs[j]->region; - printf("\tinputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numOutputs; j++) { - LogicalRegion handle = op->outputs[j]->region; - printf("\toutputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numWeights; j++) { - LogicalRegion handle = 
op->weights[j]->region; - printf("\tweights[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - } - } - for (size_t i = 0; i < operators.size(); i++) { - Op *op = operators[i]; - printf("operator[%zu]: type(%d)\n", i, operators[i]->op_type); - for (int j = 0; j < op->numInputs; j++) { - LogicalRegion handle = op->inputs[j]->region; - printf("\tinputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numOutputs; j++) { - LogicalRegion handle = op->outputs[j]->region; - printf("\toutputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - } + #ifdef FF_USE_NCCL for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 37605c44a4..3bfe429ddd 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2859,8 +2859,11 @@ void FFModel::compile(Optimizer *_optimizer, compile(loss_type, metrics, comp_mode); } -bool FFModel::apply_fusion(std::vector const &operators, - std::vector &new_operators) { +bool FFModel::apply_fusion( + std::vector const &operators, + std::vector &new_operators, + std::unordered_map> + *parallel_tensor_mapping) { // Context ctx = config.lg_ctx; // Runtime* runtime = config.lg_hlr; for (size_t l = 1; l < operators.size() - 1; l++) { @@ -2925,7 +2928,8 @@ bool FFModel::apply_fusion(std::vector const &operators, fused_op = new FusedOp(*this, operators[i]); allocate_new_fused_op = true; } - if (fused_op->add_operator(*this, operators[l])) { + if (fused_op->add_operator( + *this, operators[l], parallel_tensor_mapping)) { // Construct new operators new_operators.clear(); for (size_t j = 0; j < i; j++) { @@ -2943,7 +2947,9 @@ bool FFModel::apply_fusion(std::vector const &operators, (op->inputs[idx]->owner_op == operators[i])) { int found = -1; for (int k = 0; k < fused_op->numOutputs; k++) { - if (fused_op->outputs[k]->region == op->inputs[idx]->region) { + if (fused_op->use_same_regions(fused_op->outputs[k], + op->inputs[idx], + parallel_tensor_mapping)) { assert(found == -1); found = k; } @@ -2959,7 +2965,6 @@ bool FFModel::apply_fusion(std::vector const &operators, assert(new_operators.size() + 1 == operators.size()); return true; } else { - // TODO: delete fused_op to avoid memory leakage if (allocate_new_fused_op) { delete fused_op; } @@ -3490,53 +3495,7 @@ void FFModel::compile(LossType loss_type, } operators = new_operators; } - // Check integrity - for (size_t l = 0; l < operators.size(); l++) { - if (operators[l]->op_type == OP_FUSED) { - FusedOp *fused = (FusedOp *)operators[l]; - int ioff = 0, woff = 0, ooff = 0; - for (int op = 0; op < fused->numOperators; op++) { - Op *old_op = fused->operators[op]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == FusedOp::SOURCE_INPUT) { - assert(fused->inputs[my_off]->region == - old_op->inputs[i]->region); - } else if (fused->op_input_source[i + ioff] == - FusedOp::SOURCE_OUTPUT) { - assert(fused->outputs[my_off]->region == - old_op->inputs[i]->region); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - int my_off = fused->op_weight_idx[i + woff]; - assert(fused->op_weight_source[i + woff] == 
FusedOp::SOURCE_WEIGHT); - assert(fused->weights[my_off]->region == - old_op->weights[i]->region); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(fused->outputs[my_off]->region == - old_op->outputs[i]->region); - } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; - } - } else { - bool found = false; - for (size_t i = 0; i < old_operators.size(); i++) { - if (old_operators[i] == operators[l]) { - assert(!found); - found = true; - } - } - assert(found); - } - } + assert(check_operators_integrity(old_operators)); fprintf(stderr, "%zu operators after fusion...\n", operators.size()); for (size_t i = 0; i < operators.size(); i++) { Op *op = operators[i]; @@ -3678,6 +3637,59 @@ void FFModel::compile(LossType loss_type, #endif } +bool FFModel::check_operators_integrity( + std::vector const &old_operators, + std::unordered_map> + *pt_mapping) { + // Check integrity + for (size_t l = 0; l < operators.size(); l++) { + if (operators[l]->op_type == OP_FUSED) { + FusedOp *fused = (FusedOp *)operators[l]; + int ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + Op *old_op = fused->operators[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == FusedOp::SOURCE_INPUT) { + assert(FusedOp::use_same_regions( + fused->inputs[my_off], old_op->inputs[i], pt_mapping)); + } else if (fused->op_input_source[i + ioff] == + FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->inputs[i], pt_mapping)); + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + int my_off = fused->op_weight_idx[i + woff]; + assert(fused->op_weight_source[i + woff] == FusedOp::SOURCE_WEIGHT); + assert(fused->weights[my_off]->region == old_op->weights[i]->region); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + } else { + bool found = false; + for (size_t i = 0; i < old_operators.size(); i++) { + if (old_operators[i] == operators[l]) { + assert(!found); + found = true; + } + } + assert(found); + } + } + return true; +} + struct PropagationEdgeInfo { Op *dstOp; size_t size; From 1901f65bc2045860d4c26c26c2a158b270cb300a Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 31 Dec 2023 23:25:21 -0500 Subject: [PATCH 290/344] embedding return when no token --- src/ops/embedding.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 76236e65ff..3be3eac618 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -478,6 +478,7 @@ FutureMap Embedding::inference(FFModel const &ff, 0 /*mapper_id*/, machine_view_hash); // regions[0]: input + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection*/, READ_ONLY, @@ -516,6 +517,10 @@ void Embedding::forward_task(Task const *task, assert(task->regions.size() == 3); // Assert that weight and output must have the same data type // otherwise, a cast operator 
should be inserted + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } assert(m->weight_type[0] == m->output_type[0]); assert(m->input_type[0] == DT_INT32 || m->input_type[0] == DT_INT64); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( From 130ad92f8369d6ba39dd470dafd160b844e49e99 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 1 Jan 2024 01:39:41 -0500 Subject: [PATCH 291/344] use arg topk instead of beam topk --- include/flexflow/flexflow_c.h | 1 + include/flexflow/model.h | 2 + include/flexflow/ops/arg_topk.h | 16 ++- include/flexflow/ops/arg_topk_params.h | 1 + inference/models/llama.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 5 +- src/c/flexflow_c.cc | 4 +- src/ops/arg_topk.cc | 185 +++++++++++++++++++------ src/ops/arg_topk.cu | 91 +++++++++--- src/runtime/model.cc | 18 +++ 10 files changed, 258 insertions(+), 67 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 01a2818a2b..305c8da513 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -571,6 +571,7 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, bool sorted, + bool speculative_decoding, char const *name); flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 16df99ab1a..01244a371b 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -146,6 +146,7 @@ enum TaskIDs { TOPK_BWD_TASK_ID, ARG_TOPK_INIT_TASK_ID, ARG_TOPK_INF_TASK_ID, + ARG_TOPK_INF_SPECULATIVE_TASK_ID, SAMPLING_INIT_TASK_ID, SAMPLING_INF_TASK_ID, ARGMAX_INIT_TASK_ID, @@ -674,6 +675,7 @@ class FFModel { // Tensor *outputs, int k, bool sorted, + bool speculative_decoding, char const *name = NULL); Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL); Tensor sampling(const Tensor input, float top_p, char const *name = NULL); diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h index 8b2d2aa11c..3822a5e41e 100644 --- a/include/flexflow/ops/arg_topk.h +++ b/include/flexflow/ops/arg_topk.h @@ -12,6 +12,8 @@ class ArgTopKMeta : public OpMeta { public: ArgTopKMeta(FFHandler handle, Op const *op); bool sorted; + int k; + bool speculative_decoding; }; class ArgTopK : public Op { @@ -23,6 +25,7 @@ class ArgTopK : public Op { const ParallelTensor input, int k, bool sorted, + bool speculative_decoding, char const *name); ArgTopK(FFModel &model, LayerID const &layer_guid, @@ -61,6 +64,11 @@ class ArgTopK : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static BeamInferenceResult inference_speculative_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); void serialize(Legion::Serializer &s) const override; static PCG::Node deserialize(FFModel &ff, Legion::Deserializer &d, @@ -75,22 +83,26 @@ class ArgTopK : public Op { template static void forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, ffStream_t stream); static void forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &prob, GenericTensorAccessorW const &indices, - int batch_size); + int batch_size, + BeamSearchBatchConfig const *bc); 
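The widened forward_kernel_wrapper signature above threads a probability output and the beam-search batch config through to the kernel, so the speculative-decoding path can return both the selected token ids and their probabilities. As a rough CPU-side reference for that contract only (not the CUDA kernel, and with illustrative names rather than the FlexFlow API), top-k with probabilities can be sketched as:

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

struct TopKResult {
  std::vector<int> token_ids; // analogous to the indices output
  std::vector<float> probs;   // analogous to the new probability output
};

TopKResult top_k_with_probs(std::vector<float> const &scores, int k) {
  assert(k >= 0 && k <= (int)scores.size());
  std::vector<int> order(scores.size());
  std::iota(order.begin(), order.end(), 0);
  // keep the k highest-scoring indices at the front
  std::partial_sort(order.begin(), order.begin() + k, order.end(),
                    [&](int a, int b) { return scores[a] > scores[b]; });
  TopKResult result;
  for (int i = 0; i < k; i++) {
    result.token_ids.push_back(order[i]);
    result.probs.push_back(scores[order[i]]);
  }
  return result;
}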
Params get_params() const; public: int k; bool sorted; + bool speculative_decoding; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index 9d2a21034f..bd9c38e2a9 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -11,6 +11,7 @@ struct ArgTopKParams { LayerID layer_guid; int k; bool sorted; + bool speculative_decoding; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(ArgTopKParams const &, ArgTopKParams const &); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 10001ee916..e9c84efe90 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -247,7 +247,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor softmax = ff.softmax(dense, -1); // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); // output = ff.argmax(softmax, /*beam_Search*/ true); - output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + output = ff.arg_top_k(softmax, llama_config.max_beam_width, false, true); // output = ff.top_k(softmax, ) } else { // Tensor softmax = ff.softmax(dense, -1); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index de3f7e6929..a3c221474d 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3349,7 +3349,7 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM ) - def arg_top_k(self, input, k, sorted, name=None): + def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): """Defines the Arg TopK layer. :param input: the input Tensor. @@ -3361,6 +3361,9 @@ def arg_top_k(self, input, k, sorted, name=None): :param sorted: Whether the entries should be sorted :type sorted: bool + :param speculative_decoding: Whether you need to perform beam search + :type speculative_decoding: bool + :param name: the name of the layer. Default is None. :type name: string diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 80202f6f99..579fc5e2d1 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1489,10 +1489,12 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, bool sorted, + bool speculative_decoding, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->arg_top_k(input, k, sorted, name); + Tensor tensor = + handle->arg_top_k(input, k, sorted, speculative_decoding, name); return FFCObjectWrapper::wrap(tensor); } diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index a06b89de07..2727a1d249 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -51,6 +51,7 @@ using PCG::Node; Tensor FFModel::arg_top_k(const Tensor input, int k, bool sorted, + bool speculative_decoding, char const *name) { Layer *li = new Layer(this, OP_ARG_TOPK, @@ -58,7 +59,7 @@ Tensor FFModel::arg_top_k(const Tensor input, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/, + speculative_decoding ? 
2 : 1 /*outputs*/, input); { int numdims = input->num_dims; @@ -71,9 +72,14 @@ Tensor FFModel::arg_top_k(const Tensor input, // numdims, dims, input->data_type, li, 0, true /*create_grad*/); li->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + if (speculative_decoding) { + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); + } } li->add_int_property("k", k); li->add_int_property("sorted", sorted); + li->add_int_property("speculative_decoding", speculative_decoding); layers.push_back(li); // outputs[0] = li->outputs[0]; // outputs[1] = li->outputs[1]; @@ -89,14 +95,23 @@ Op *ArgTopK::create_operator_from_layer( int k = value; layer->get_int_property("sorted", value); bool sorted = (bool)value; - return new ArgTopK( - model, layer->layer_guid, inputs[0], k, sorted, layer->name); + layer->get_int_property("speculative_decoding", value); + bool speculative_decoding = (bool)value; + + return new ArgTopK(model, + layer->layer_guid, + inputs[0], + k, + sorted, + speculative_decoding, + layer->name); } ArgTopKParams ArgTopK::get_params() const { ArgTopKParams params; params.k = this->k; params.sorted = this->sorted; + params.speculative_decoding = this->speculative_decoding; return params; } @@ -106,7 +121,8 @@ bool ArgTopKParams::is_valid(ParallelTensorShape const &) const { } bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) { - return lhs.k == rhs.k && lhs.sorted == rhs.sorted; + return lhs.k == rhs.k && lhs.sorted == rhs.sorted && + lhs.speculative_decoding == rhs.speculative_decoding; } ArgTopK::ArgTopK(FFModel &model, @@ -114,6 +130,7 @@ ArgTopK::ArgTopK(FFModel &model, const ParallelTensor _input, int _k, bool _sorted, + bool _speculative_decoding, char const *name) : Op(model, OP_ARG_TOPK, @@ -121,9 +138,9 @@ ArgTopK::ArgTopK(FFModel &model, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/, + _speculative_decoding ? 
2 : 1 /*outputs*/, _input), - k(_k), sorted(_sorted) { + k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) { // overwrite layer_guid layer_guid = _layer_guid; int numdim = inputs[0]->num_dims; @@ -131,26 +148,42 @@ ArgTopK::ArgTopK(FFModel &model, for (int i = 0; i < numdim; i++) { dims[i] = inputs[0]->dims[i]; } + dims[0].size = k; assert(inputs[0]->dims[0].degree == 1); assert(inputs[0]->dims[0].parallel_idx == -1); - // outputs[0] = model.create_parallel_tensor_legion_ordering( - // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, DT_INT32, this, 0 /*owner_idx*/); + if (_speculative_decoding) { + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); + } } ArgTopK::ArgTopK(FFModel &model, LayerID const &layer_guid, ArgTopK const &other, const ParallelTensor input) - : ArgTopK(model, layer_guid, input, other.k, other.sorted, other.name) {} + : ArgTopK(model, + layer_guid, + input, + other.k, + other.sorted, + other.speculative_decoding, + other.name) {} ArgTopK::ArgTopK(FFModel &model, ArgTopKParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) - : ArgTopK(model, params.layer_guid, input, params.k, params.sorted, name) {} + : ArgTopK(model, + params.layer_guid, + input, + params.k, + params.sorted, + params.speculative_decoding, + name) {} void ArgTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -243,8 +276,10 @@ OpMeta *ArgTopK::init_task(Task const *task, m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; + m->k = topk->k; std::strcpy(m->op_name, topk->name); m->layer_guid = topk->layer_guid; + m->speculative_decoding = topk->speculative_decoding; return m; } @@ -267,34 +302,64 @@ FutureMap ArgTopK::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - // 0 /*projection id*/, - // WRITE_ONLY, - // EXCLUSIVE, - // batch_outputs[1]->region)); - // launcher.add_field(2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + if (speculative_decoding) { + IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, 
FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + + } else { + IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + } } InferenceResult @@ -317,9 +382,11 @@ InferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW probs; int batch_size = bc->num_active_tokens(); - ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); + ArgTopK::forward_kernel_wrapper( + m, input, probs, indices, batch_size, nullptr); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -334,6 +401,39 @@ InferenceResult return ir; } +BeamInferenceResult ArgTopK::inference_speculative_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 3); + assert(task->regions.size() == 3); + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + if (bc.num_active_tokens() == 0) { + // Directly return for empty batch config + BeamInferenceResult ir; + return ir; + } + ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int batch_size = bc.num_active_tokens(); + ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); + + BeamInferenceResult ir; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); + download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + return ir; +} + void ArgTopK::backward(FFModel const &ff) { // ArgTopK does not support backward assert(false); @@ -345,6 +445,7 @@ void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->k); sez.serialize(this->sorted); + sez.serialize(this->speculative_decoding); } Node ArgTopK::deserialize(FFModel &ff, @@ -359,12 +460,15 @@ Node ArgTopK::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); int k; bool sorted; + bool speculative_decoding; dez.deserialize(k); dez.deserialize(sorted); + dez.deserialize(speculative_decoding); ArgTopKParams params; params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; + params.speculative_decoding = speculative_decoding; return ff.get_or_create_node(inputs[0], params); } @@ 
-390,6 +494,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.k); hash_combine(key, params.sorted); + hash_combine(key, params.speculative_decoding); return key; } }; // namespace std diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 575e0183b4..0b8bb8b563 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -262,8 +262,9 @@ __device__ void mergeShards(int num_shards, int k, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - // T *top_k_values, - int *top_k_indices) { + float *top_k_values, + int *top_k_indices, + bool speculative_decoding) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. // If k > num_shards, we can initialize a min-heap with the top element from @@ -313,7 +314,11 @@ __device__ void mergeShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - // top_k_values[rank] = max_element.value; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + } + int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; int next_shard_index = shard_index + num_shards; @@ -337,8 +342,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, int length, int k, bool sorted, - // T *__restrict__ output, - int *__restrict__ indices) { + float *__restrict__ output, + int *__restrict__ indices, + bool speculative_decoding) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; T const *batch_input = input + batch_index * length; @@ -350,15 +356,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, __syncthreads(); if (thread_index == 0) { int const offset = batch_index * k; - // auto batch_output = output + offset; + auto batch_output = output + offset; auto batch_indices = indices + offset; Entry *top_k_heap = shared_entries + thread_count * k; mergeShards(thread_count, k, shared_entries, top_k_heap, - // batch_output, - batch_indices); + batch_output, + batch_indices, + speculative_decoding); } } @@ -366,12 +373,13 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, template void ArgTopK::forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, cudaStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -390,24 +398,58 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - assert(num_shards >= (size_t)k); - num_shards = k; - arg_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - k, - sorted, - // output_ptr, - indices_ptr); + + // all requests are in the same beam stages + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + + // check + int beam_size = -1; + for (int i = 1; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (beam_size == -1) { + beam_size = bc->beamRequestsInfo[i].beam_size; + } else { + assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } + } + + assert(num_shards >= (size_t)beam_size); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + beam_size, + sorted, + output_ptr, + indices_ptr, + m->speculative_decoding); + } else { + + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + indices_ptr, + false); + } } /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, // float *output_ptr, + GenericTensorAccessorW const &probs, GenericTensorAccessorW const &indices, - int batch_size) { + int batch_size, + BeamSearchBatchConfig const *bc) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -439,6 +481,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; int k = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; /*TODO: This prints to 5*/ + // batch_size = input.domain.get_volume() / length; // assert(indices.domain.get_volume() / k == batch_size); cudaEvent_t t_start, t_end; @@ -451,22 +494,26 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, if (input.data_type == DT_HALF) { ArgTopK::forward_kernel(m, input.get_half_ptr(), - // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), - // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? 
bc : nullptr, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 37605c44a4..f72d320bc8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5917,6 +5917,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + "ArgTopK Speculative Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgTopK Speculative Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // BeamTopk task { TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init"); From 4259d2dfa5c42488dad76d511517e45c0ad438c7 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 1 Jan 2024 10:08:38 -0500 Subject: [PATCH 292/344] embedding --- include/flexflow/ops/embedding.h | 4 ++ src/ops/embedding.cc | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ae93ef4d1d..0f1b1335d4 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -80,6 +80,10 @@ class Embedding : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 3be3eac618..40d5b600be 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -517,6 +517,70 @@ void Embedding::forward_task(Task const *task, assert(task->regions.size() == 3); // Assert that weight and output must have the same data type // otherwise, a cast operator should be inserted + assert(m->weight_type[0] == m->output_type[0]); + assert(m->input_type[0] == DT_INT32 || m->input_type[0] == DT_INT64); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR kernel = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(input.domain.get_dim() + 1 == output.domain.get_dim()); + for (size_t i = 0; i < input.domain.get_dim(); i++) { + assert(input.domain.hi()[i] == output.domain.hi()[i + 1]); + assert(input.domain.lo()[i] == output.domain.lo()[i + 1]); + } + assert(kernel.domain.hi()[0] - kernel.domain.lo()[0] == + output.domain.hi()[0] - output.domain.lo()[0]); + } else { + // assert(kernel_domain.get_dim() == 2); + assert(input.domain.get_dim() == output.domain.get_dim()); + for (size_t i = 1; i < input.domain.get_dim(); i++) { + assert(input.domain.hi()[i] == output.domain.hi()[i]); + assert(input.domain.lo()[i] == output.domain.lo()[i]); + } + assert(kernel.domain.hi()[0] - kernel.domain.lo()[0] == + output.domain.hi()[0] - output.domain.lo()[0]); + } + + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; 
+ out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + effective_batch_size = output.domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == input.domain.get_volume()); + } else { + in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + effective_batch_size = output.domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == input.domain.get_volume()); + } + forward_kernel_wrapper( + m, input, output, kernel, in_dim, out_dim, effective_batch_size); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Embedding::save_inference_tensors_to_file( + m, shard_id, nullptr, {input}, {kernel}, {output}); + } +} + +/* + regions[0](I): input + regions[1](O): output + regions[2](I): kernel +*/ +void Embedding::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + EmbeddingMeta *m = *((EmbeddingMeta **)task->local_args); + assert(regions.size() == 3); + assert(task->regions.size() == 3); + // Assert that weight and output must have the same data type + // otherwise, a cast operator should be inserted BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_tokens() == 0) { return; From fae7fba1994aaf3c04da250a04bec3beb217236e Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 1 Jan 2024 10:13:30 -0500 Subject: [PATCH 293/344] fmt --- include/flexflow/ops/embedding.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index 0f1b1335d4..ed89fcf37a 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -83,7 +83,7 @@ class Embedding : public Op { static void inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, - Legion::Runtime *runtime); + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, From 8d1d5842253a0b6c894bec14550dd1e88eb9c4fd Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 1 Jan 2024 12:05:12 -0500 Subject: [PATCH 294/344] hip --- src/ops/arg_topk.cpp | 90 ++++++++++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 24 deletions(-) diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index 6db8abb8c4..f431d3d4bf 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -263,8 +263,9 @@ __device__ void mergeShards(int num_shards, int k, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - // T *top_k_values, - int *top_k_indices) { + float *top_k_values, + int *top_k_indices, + bool speculative_decoding) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. 
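+    // Illustrative note, not part of the original patch: with AGGR_MODE_NONE
+    // every token supplies a single index, hence in_dim = 1; out_dim is read
+    // from the innermost output dimension below, and effective_batch_size
+    // recovers the token count as output volume / out_dim, which the assert
+    // then cross-checks against the input volume.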
// If k > num_shards, we can initialize a min-heap with the top element from @@ -314,7 +315,10 @@ __device__ void mergeShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - // top_k_values[rank] = max_element.value; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + } int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; int next_shard_index = shard_index + num_shards; @@ -338,8 +342,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, int length, int k, bool sorted, - // T *__restrict__ output, - int *__restrict__ indices) { + float *__restrict__ output, + int *__restrict__ indices, + bool speculative_decoding) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; T const *batch_input = input + batch_index * length; @@ -351,15 +356,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, __syncthreads(); if (thread_index == 0) { int const offset = batch_index * k; - // auto batch_output = output + offset; + auto batch_output = output + offset; auto batch_indices = indices + offset; Entry *top_k_heap = shared_entries + thread_count * k; mergeShards(thread_count, k, shared_entries, top_k_heap, - // batch_output, - batch_indices); + batch_output, + batch_indices, + speculative_decoding); } } @@ -367,12 +373,13 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, template void ArgTopK::forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, hipStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -391,28 +398,57 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - assert(num_shards >= (size_t)k); - num_shards = k; - hipLaunchKernelGGL(arg_topk_forward_kernel, - num_blocks, - num_shards, - 0, - stream, - input_ptr, - shared_memory_size, - length, - k, - sorted, - // output_ptr, - indices_ptr); + // all requests are in the same beam stages + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + + // check + int beam_size = -1; + for (int i = 1; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (beam_size == -1) { + beam_size = bc->beamRequestsInfo[i].beam_size; + } else { + assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } + } + + assert(num_shards >= (size_t)beam_size); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + beam_size, + sorted, + output_ptr, + indices_ptr, + m->speculative_decoding); + } else { + + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + indices_ptr, + false); + } } /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &probs, // float *output_ptr, GenericTensorAccessorW const &indices, - int batch_size) { + int batch_size, + BeamSearchBatchConfig const *bc) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // Domain in1_domain = runtime->get_index_space_domain( @@ -457,21 +493,27 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, ArgTopK::forward_kernel(m, input.get_half_ptr(), // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else { assert(false && "Unsupported data type"); From 25097e084772ed9693bef408315385a11340671b Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Mon, 1 Jan 2024 15:12:07 -0500 Subject: [PATCH 295/344] SpecInfer: optimize performance (#1255) * init * fix speculative * fix speculative * bitmap+tree verify * fix. * fix * multi batch * copy metadata once * fix some corner cases * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * more fix. * clean up * . 
* load batchconfig * clean * hip * hip * embedding return when no token * use arg topk instead of beam topk * embedding * fmt * hip --------- Co-authored-by: Zhihao Jia --- include/flexflow/flexflow_c.h | 1 + include/flexflow/model.h | 2 + include/flexflow/ops/arg_topk.h | 16 ++- include/flexflow/ops/arg_topk_params.h | 1 + include/flexflow/ops/embedding.h | 4 + inference/models/llama.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 5 +- src/c/flexflow_c.cc | 4 +- src/ops/arg_topk.cc | 185 +++++++++++++++++++------ src/ops/arg_topk.cpp | 90 ++++++++---- src/ops/arg_topk.cu | 91 +++++++++--- src/ops/embedding.cc | 69 +++++++++ src/runtime/model.cc | 18 +++ 13 files changed, 397 insertions(+), 91 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 01a2818a2b..305c8da513 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -571,6 +571,7 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, bool sorted, + bool speculative_decoding, char const *name); flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cda1f91c89..cf7bb3dd2d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -146,6 +146,7 @@ enum TaskIDs { TOPK_BWD_TASK_ID, ARG_TOPK_INIT_TASK_ID, ARG_TOPK_INF_TASK_ID, + ARG_TOPK_INF_SPECULATIVE_TASK_ID, SAMPLING_INIT_TASK_ID, SAMPLING_INF_TASK_ID, ARGMAX_INIT_TASK_ID, @@ -674,6 +675,7 @@ class FFModel { // Tensor *outputs, int k, bool sorted, + bool speculative_decoding, char const *name = NULL); Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL); Tensor sampling(const Tensor input, float top_p, char const *name = NULL); diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h index 8b2d2aa11c..3822a5e41e 100644 --- a/include/flexflow/ops/arg_topk.h +++ b/include/flexflow/ops/arg_topk.h @@ -12,6 +12,8 @@ class ArgTopKMeta : public OpMeta { public: ArgTopKMeta(FFHandler handle, Op const *op); bool sorted; + int k; + bool speculative_decoding; }; class ArgTopK : public Op { @@ -23,6 +25,7 @@ class ArgTopK : public Op { const ParallelTensor input, int k, bool sorted, + bool speculative_decoding, char const *name); ArgTopK(FFModel &model, LayerID const &layer_guid, @@ -61,6 +64,11 @@ class ArgTopK : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static BeamInferenceResult inference_speculative_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); void serialize(Legion::Serializer &s) const override; static PCG::Node deserialize(FFModel &ff, Legion::Deserializer &d, @@ -75,22 +83,26 @@ class ArgTopK : public Op { template static void forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, ffStream_t stream); static void forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &prob, GenericTensorAccessorW const &indices, - int batch_size); + int batch_size, + BeamSearchBatchConfig const *bc); Params get_params() const; public: int k; bool sorted; + bool speculative_decoding; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index 
9d2a21034f..bd9c38e2a9 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -11,6 +11,7 @@ struct ArgTopKParams { LayerID layer_guid; int k; bool sorted; + bool speculative_decoding; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(ArgTopKParams const &, ArgTopKParams const &); diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ae93ef4d1d..ed89fcf37a 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -80,6 +80,10 @@ class Embedding : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 10001ee916..e9c84efe90 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -247,7 +247,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor softmax = ff.softmax(dense, -1); // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); // output = ff.argmax(softmax, /*beam_Search*/ true); - output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + output = ff.arg_top_k(softmax, llama_config.max_beam_width, false, true); // output = ff.top_k(softmax, ) } else { // Tensor softmax = ff.softmax(dense, -1); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index de3f7e6929..a3c221474d 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3349,7 +3349,7 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM ) - def arg_top_k(self, input, k, sorted, name=None): + def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): """Defines the Arg TopK layer. :param input: the input Tensor. @@ -3361,6 +3361,9 @@ def arg_top_k(self, input, k, sorted, name=None): :param sorted: Whether the entries should be sorted :type sorted: bool + :param speculative_decoding: Whether you need to perform beam search + :type speculative_decoding: bool + :param name: the name of the layer. Default is None. :type name: string diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 80202f6f99..579fc5e2d1 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1489,10 +1489,12 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, bool sorted, + bool speculative_decoding, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->arg_top_k(input, k, sorted, name); + Tensor tensor = + handle->arg_top_k(input, k, sorted, speculative_decoding, name); return FFCObjectWrapper::wrap(tensor); } diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index a06b89de07..2727a1d249 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -51,6 +51,7 @@ using PCG::Node; Tensor FFModel::arg_top_k(const Tensor input, int k, bool sorted, + bool speculative_decoding, char const *name) { Layer *li = new Layer(this, OP_ARG_TOPK, @@ -58,7 +59,7 @@ Tensor FFModel::arg_top_k(const Tensor input, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/, + speculative_decoding ? 
2 : 1 /*outputs*/, input); { int numdims = input->num_dims; @@ -71,9 +72,14 @@ Tensor FFModel::arg_top_k(const Tensor input, // numdims, dims, input->data_type, li, 0, true /*create_grad*/); li->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + if (speculative_decoding) { + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); + } } li->add_int_property("k", k); li->add_int_property("sorted", sorted); + li->add_int_property("speculative_decoding", speculative_decoding); layers.push_back(li); // outputs[0] = li->outputs[0]; // outputs[1] = li->outputs[1]; @@ -89,14 +95,23 @@ Op *ArgTopK::create_operator_from_layer( int k = value; layer->get_int_property("sorted", value); bool sorted = (bool)value; - return new ArgTopK( - model, layer->layer_guid, inputs[0], k, sorted, layer->name); + layer->get_int_property("speculative_decoding", value); + bool speculative_decoding = (bool)value; + + return new ArgTopK(model, + layer->layer_guid, + inputs[0], + k, + sorted, + speculative_decoding, + layer->name); } ArgTopKParams ArgTopK::get_params() const { ArgTopKParams params; params.k = this->k; params.sorted = this->sorted; + params.speculative_decoding = this->speculative_decoding; return params; } @@ -106,7 +121,8 @@ bool ArgTopKParams::is_valid(ParallelTensorShape const &) const { } bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) { - return lhs.k == rhs.k && lhs.sorted == rhs.sorted; + return lhs.k == rhs.k && lhs.sorted == rhs.sorted && + lhs.speculative_decoding == rhs.speculative_decoding; } ArgTopK::ArgTopK(FFModel &model, @@ -114,6 +130,7 @@ ArgTopK::ArgTopK(FFModel &model, const ParallelTensor _input, int _k, bool _sorted, + bool _speculative_decoding, char const *name) : Op(model, OP_ARG_TOPK, @@ -121,9 +138,9 @@ ArgTopK::ArgTopK(FFModel &model, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/, + _speculative_decoding ? 
2 : 1 /*outputs*/, _input), - k(_k), sorted(_sorted) { + k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) { // overwrite layer_guid layer_guid = _layer_guid; int numdim = inputs[0]->num_dims; @@ -131,26 +148,42 @@ ArgTopK::ArgTopK(FFModel &model, for (int i = 0; i < numdim; i++) { dims[i] = inputs[0]->dims[i]; } + dims[0].size = k; assert(inputs[0]->dims[0].degree == 1); assert(inputs[0]->dims[0].parallel_idx == -1); - // outputs[0] = model.create_parallel_tensor_legion_ordering( - // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, DT_INT32, this, 0 /*owner_idx*/); + if (_speculative_decoding) { + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); + } } ArgTopK::ArgTopK(FFModel &model, LayerID const &layer_guid, ArgTopK const &other, const ParallelTensor input) - : ArgTopK(model, layer_guid, input, other.k, other.sorted, other.name) {} + : ArgTopK(model, + layer_guid, + input, + other.k, + other.sorted, + other.speculative_decoding, + other.name) {} ArgTopK::ArgTopK(FFModel &model, ArgTopKParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) - : ArgTopK(model, params.layer_guid, input, params.k, params.sorted, name) {} + : ArgTopK(model, + params.layer_guid, + input, + params.k, + params.sorted, + params.speculative_decoding, + name) {} void ArgTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -243,8 +276,10 @@ OpMeta *ArgTopK::init_task(Task const *task, m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; + m->k = topk->k; std::strcpy(m->op_name, topk->name); m->layer_guid = topk->layer_guid; + m->speculative_decoding = topk->speculative_decoding; return m; } @@ -267,34 +302,64 @@ FutureMap ArgTopK::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - // 0 /*projection id*/, - // WRITE_ONLY, - // EXCLUSIVE, - // batch_outputs[1]->region)); - // launcher.add_field(2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + if (speculative_decoding) { + IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, 
FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + + } else { + IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + } } InferenceResult @@ -317,9 +382,11 @@ InferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW probs; int batch_size = bc->num_active_tokens(); - ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); + ArgTopK::forward_kernel_wrapper( + m, input, probs, indices, batch_size, nullptr); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -334,6 +401,39 @@ InferenceResult return ir; } +BeamInferenceResult ArgTopK::inference_speculative_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 3); + assert(task->regions.size() == 3); + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + if (bc.num_active_tokens() == 0) { + // Directly return for empty batch config + BeamInferenceResult ir; + return ir; + } + ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int batch_size = bc.num_active_tokens(); + ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); + + BeamInferenceResult ir; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); + download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + return ir; +} + void ArgTopK::backward(FFModel const &ff) { // ArgTopK does not support backward assert(false); @@ -345,6 +445,7 @@ void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->k); sez.serialize(this->sorted); + sez.serialize(this->speculative_decoding); } Node ArgTopK::deserialize(FFModel &ff, @@ -359,12 +460,15 @@ Node ArgTopK::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); int k; bool sorted; + bool speculative_decoding; dez.deserialize(k); dez.deserialize(sorted); + dez.deserialize(speculative_decoding); ArgTopKParams params; params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; + params.speculative_decoding = speculative_decoding; return ff.get_or_create_node(inputs[0], params); } @@ 
-390,6 +494,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.k); hash_combine(key, params.sorted); + hash_combine(key, params.speculative_decoding); return key; } }; // namespace std diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index 6db8abb8c4..f431d3d4bf 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -263,8 +263,9 @@ __device__ void mergeShards(int num_shards, int k, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - // T *top_k_values, - int *top_k_indices) { + float *top_k_values, + int *top_k_indices, + bool speculative_decoding) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. // If k > num_shards, we can initialize a min-heap with the top element from @@ -314,7 +315,10 @@ __device__ void mergeShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - // top_k_values[rank] = max_element.value; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + } int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; int next_shard_index = shard_index + num_shards; @@ -338,8 +342,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, int length, int k, bool sorted, - // T *__restrict__ output, - int *__restrict__ indices) { + float *__restrict__ output, + int *__restrict__ indices, + bool speculative_decoding) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; T const *batch_input = input + batch_index * length; @@ -351,15 +356,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, __syncthreads(); if (thread_index == 0) { int const offset = batch_index * k; - // auto batch_output = output + offset; + auto batch_output = output + offset; auto batch_indices = indices + offset; Entry *top_k_heap = shared_entries + thread_count * k; mergeShards(thread_count, k, shared_entries, top_k_heap, - // batch_output, - batch_indices); + batch_output, + batch_indices, + speculative_decoding); } } @@ -367,12 +373,13 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, template void ArgTopK::forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, hipStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -391,28 +398,57 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - assert(num_shards >= (size_t)k); - num_shards = k; - hipLaunchKernelGGL(arg_topk_forward_kernel, - num_blocks, - num_shards, - 0, - stream, - input_ptr, - shared_memory_size, - length, - k, - sorted, - // output_ptr, - indices_ptr); + // all requests are in the same beam stages + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + + // check + int beam_size = -1; + for (int i = 1; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (beam_size == -1) { + beam_size = bc->beamRequestsInfo[i].beam_size; + } else { + assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } + } + + assert(num_shards >= (size_t)beam_size); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + beam_size, + sorted, + output_ptr, + indices_ptr, + m->speculative_decoding); + } else { + + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + indices_ptr, + false); + } } /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &probs, // float *output_ptr, GenericTensorAccessorW const &indices, - int batch_size) { + int batch_size, + BeamSearchBatchConfig const *bc) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // Domain in1_domain = runtime->get_index_space_domain( @@ -457,21 +493,27 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, ArgTopK::forward_kernel(m, input.get_half_ptr(), // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 575e0183b4..0b8bb8b563 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -262,8 +262,9 @@ __device__ void mergeShards(int num_shards, int k, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - // T *top_k_values, - int *top_k_indices) { + float *top_k_values, + int *top_k_indices, + bool speculative_decoding) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. 
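With speculative_decoding enabled, mergeShards now writes the selected probabilities into top_k_values next to the indices, so the beam stage receives (token id, probability) pairs per batch row. A host-side reference for a single row follows; topk_with_probs is an illustrative name and this is a sketch of the expected output layout, not the GPU kernel.

#include <algorithm>
#include <numeric>
#include <vector>

// CPU reference: descending top-k indices and their probabilities for one row.
void topk_with_probs(std::vector<float> const &probs, int k,
                     std::vector<int> &indices, std::vector<float> &values) {
  indices.resize(probs.size());
  std::iota(indices.begin(), indices.end(), 0);
  // Order candidate ids by descending probability and keep the first k.
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](int a, int b) { return probs[a] > probs[b]; });
  indices.resize(k);
  values.resize(k);
  for (int r = 0; r < k; r++) {
    values[r] = probs[indices[r]]; // same layout as the kernel's top_k_values
  }
}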
// If k > num_shards, we can initialize a min-heap with the top element from @@ -313,7 +314,11 @@ __device__ void mergeShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - // top_k_values[rank] = max_element.value; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + } + int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; int next_shard_index = shard_index + num_shards; @@ -337,8 +342,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, int length, int k, bool sorted, - // T *__restrict__ output, - int *__restrict__ indices) { + float *__restrict__ output, + int *__restrict__ indices, + bool speculative_decoding) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; T const *batch_input = input + batch_index * length; @@ -350,15 +356,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, __syncthreads(); if (thread_index == 0) { int const offset = batch_index * k; - // auto batch_output = output + offset; + auto batch_output = output + offset; auto batch_indices = indices + offset; Entry *top_k_heap = shared_entries + thread_count * k; mergeShards(thread_count, k, shared_entries, top_k_heap, - // batch_output, - batch_indices); + batch_output, + batch_indices, + speculative_decoding); } } @@ -366,12 +373,13 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, template void ArgTopK::forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, cudaStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -390,24 +398,58 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - assert(num_shards >= (size_t)k); - num_shards = k; - arg_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - k, - sorted, - // output_ptr, - indices_ptr); + + // all requests are in the same beam stages + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + + // check + int beam_size = -1; + for (int i = 1; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (beam_size == -1) { + beam_size = bc->beamRequestsInfo[i].beam_size; + } else { + assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } + } + + assert(num_shards >= (size_t)beam_size); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + beam_size, + sorted, + output_ptr, + indices_ptr, + m->speculative_decoding); + } else { + + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + indices_ptr, + false); + } } /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, // float *output_ptr, + GenericTensorAccessorW const &probs, GenericTensorAccessorW const &indices, - int batch_size) { + int batch_size, + BeamSearchBatchConfig const *bc) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -439,6 +481,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; int k = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; /*TODO: This prints to 5*/ + // batch_size = input.domain.get_volume() / length; // assert(indices.domain.get_volume() / k == batch_size); cudaEvent_t t_start, t_end; @@ -451,22 +494,26 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, if (input.data_type == DT_HALF) { ArgTopK::forward_kernel(m, input.get_half_ptr(), - // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), - // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? 
bc : nullptr, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 76236e65ff..40d5b600be 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -478,6 +478,7 @@ FutureMap Embedding::inference(FFModel const &ff, 0 /*mapper_id*/, machine_view_hash); // regions[0]: input + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection*/, READ_ONLY, @@ -566,6 +567,74 @@ void Embedding::forward_task(Task const *task, } } +/* + regions[0](I): input + regions[1](O): output + regions[2](I): kernel +*/ +void Embedding::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + EmbeddingMeta *m = *((EmbeddingMeta **)task->local_args); + assert(regions.size() == 3); + assert(task->regions.size() == 3); + // Assert that weight and output must have the same data type + // otherwise, a cast operator should be inserted + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + assert(m->weight_type[0] == m->output_type[0]); + assert(m->input_type[0] == DT_INT32 || m->input_type[0] == DT_INT64); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR kernel = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(input.domain.get_dim() + 1 == output.domain.get_dim()); + for (size_t i = 0; i < input.domain.get_dim(); i++) { + assert(input.domain.hi()[i] == output.domain.hi()[i + 1]); + assert(input.domain.lo()[i] == output.domain.lo()[i + 1]); + } + assert(kernel.domain.hi()[0] - kernel.domain.lo()[0] == + output.domain.hi()[0] - output.domain.lo()[0]); + } else { + // assert(kernel_domain.get_dim() == 2); + assert(input.domain.get_dim() == output.domain.get_dim()); + for (size_t i = 1; i < input.domain.get_dim(); i++) { + assert(input.domain.hi()[i] == output.domain.hi()[i]); + assert(input.domain.lo()[i] == output.domain.lo()[i]); + } + assert(kernel.domain.hi()[0] - kernel.domain.lo()[0] == + output.domain.hi()[0] - output.domain.lo()[0]); + } + + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + effective_batch_size = output.domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == input.domain.get_volume()); + } else { + in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + effective_batch_size = output.domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == input.domain.get_volume()); + } + forward_kernel_wrapper( + m, input, output, kernel, in_dim, out_dim, effective_batch_size); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Embedding::save_inference_tensors_to_file( + m, shard_id, nullptr, {input}, {kernel}, {output}); + } +} + void Embedding::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 
3bfe429ddd..32b524f643 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5929,6 +5929,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + "ArgTopK Speculative Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgTopK Speculative Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // BeamTopk task { TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init"); From d7e8d728b67557bebbf9f76de9b806575b8a4cc2 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 2 Jan 2024 13:54:29 -0500 Subject: [PATCH 296/344] fix corner case --- include/flexflow/batch_config.h | 14 ++- include/flexflow/config.h | 3 +- include/flexflow/model.h | 1 + .../inc_multihead_self_attention_utils.cuh | 2 +- .../ops/spec_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + include/flexflow/request_manager.h | 2 + inference/models/falcon.cc | 5 +- inference/models/llama.cc | 5 +- inference/models/mpt.cc | 5 +- inference/models/opt.cc | 5 +- inference/models/starcoder.cc | 5 +- src/ops/arg_topk.cu | 11 ++- src/ops/inc_multihead_self_attention.cu | 4 +- src/ops/spec_inc_multihead_self_attention.cu | 60 +++++++----- src/ops/tree_inc_multihead_self_attention.cu | 62 +++++++------ src/runtime/batch_config.cc | 6 ++ src/runtime/beam_search_batch_config.cc | 4 + src/runtime/model.cc | 14 +++ src/runtime/request_manager.cc | 93 +++++++++++-------- src/runtime/request_manager.cu | 28 +++++- 21 files changed, 225 insertions(+), 106 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 13904aaa46..ef17ef43ed 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -45,6 +45,7 @@ class BatchConfig { int num_active_tokens() const; static int max_requests_per_batch(); static int max_tokens_per_batch(); + static int max_verify_tokens_per_batch(); static int max_sequence_length(); friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; @@ -72,6 +73,7 @@ class BatchConfig { // request id in batch config: int batch_config_request_id; + bool prompt_phase = false; RequestGuid request_guid; }; struct PerTokenInfo { @@ -85,15 +87,15 @@ class BatchConfig { // how many tokens before the tree, every sub requests need this part of // cache - int non_tree_cache_size; + int non_tree_cache_size = 0; // current tree size - int tree_size; + int tree_size = 0; - int this_layer_size; + int this_layer_size = 0; // input length-> prompt/root - int prompt_size; + int prompt_size = 0; }; BitMask causalMask[MAX_NUM_REQUESTS]; @@ -145,9 +147,13 @@ class BeamSearchBatchConfig : public BatchConfig { bool done() const; int max_beam_depth_all_requests() const; int current_depth_all_requests() const; + int get_speculative_request_num() const; size_t beam_width; size_t target_iterations; + + // how many requests is in speculative phase + int speculative_request_num = 0; inline static int const MAX_BEAM_WIDTH = 3; inline static int const MAX_BEAM_DEPTH = 8; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index e1480264cc..17a3f59e29 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -84,7 +84,8 @@ struct FFHandler { 
sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens); + sizeof(TreeVerifyBatchConfig::committed_tokens) + + sizeof(BatchConfig::request_completed); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cf7bb3dd2d..6f805e21bd 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -73,6 +73,7 @@ enum TaskIDs { DROPOUT_BWD_TASK_ID, EMBED_INIT_TASK_ID, EMBED_FWD_TASK_ID, + EMBED_INF_TASK_ID, EMBED_BWD_TASK_ID, GATHER_INIT_TASK_ID, GATHER_FWD_TASK_ID, diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index c128c1a126..d1e0e050b2 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, } // todo fix this - int max_qk_length = max_query_length * max_total_length; + int max_qk_length = max_query_length; // The amount of shared memory needed to store the Q*K^T values in float. size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index a306f7985a..a0d01092bf 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -142,6 +142,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { Realm::RegionInstance beam_search_reserve_inst; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; + bool *request_completed; BatchConfig::BitMask *causalMask; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index d160da4a72..02df0c0137 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -147,6 +147,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int num_active_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; + bool *request_completed; BatchConfig::BitMask *causalMask; }; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 1c4b0b2a2f..33714c106e 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -103,6 +103,7 @@ class RequestManager { int get_max_requests_per_batch(); void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); + int get_max_verify_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); @@ -113,6 +114,7 @@ class RequestManager { std::string const &path); void register_output_filepath(std::string const &); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); + void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void appendBitMask(BatchConfig::BitMask &bitmask, int newNodes, int preBeamSize, diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 
bfcec847b9..999ca37037 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,7 +39,10 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e9c84efe90..e54d6d8811 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,7 +41,10 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index b074d332ed..3df67b264c 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,10 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9b29ae5410..0279f83239 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,10 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index ba7b2cb43a..e683376e47 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,7 +48,10 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 0b8bb8b563..3302178728 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -405,13 +405,20 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, // check int beam_size = -1; - for (int i = 1; i < bc->max_requests_per_batch(); i++) { + + // allow last request different with others + int num_activate_requests = bc->num_active_requests(); + int last_request_idx = + bc->requestsInfo[num_activate_requests - 1].batch_config_request_id; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } else if (beam_size == -1) { beam_size = bc->beamRequestsInfo[i].beam_size; - } else { + + } else if (i != last_request_idx) { assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } else if (i == last_request_idx) { } } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index db64868cb9..7c8601d3c8 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1349,7 +1349,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 88dd3f92e4..b31e5d0994 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -50,7 +50,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( int hidden_size, BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask) { + BatchConfig::BitMask *causalMask, + bool *request_completed) { // q, k using Q_vec = typename VEC_K::Type; @@ -86,11 +87,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( // request_infos[batch_config_request_id].first_token_depth_in_request + // request_infos[batch_config_request_id].num_tokens_in_batch; - int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + int const totalCacheSize = + bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1; int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += causalMask[r].this_layer_size; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += request_completed[r] ? 
0 : causalMask[r].this_layer_size; } int const tree_branch_num = @@ -138,7 +140,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( ii * THREADS_PER_KEY * K_VEC_SIZE); } - int const query_token = bitmask.tree_size - tree_branch_num + qi; + int const query_token = + bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi; __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { @@ -163,8 +166,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << query_token)))); - // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { - // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); + // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) { + // printf("spec inc attn qkqkqk request id %d, %.10f, %d\n", + // batch_config_request_id, + // ti, + // qk, + // qi); // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; @@ -336,17 +343,12 @@ __global__ void spec_inc_store_kv_cache( BatchConfig::BitMask bitmask = causalMask[req_id]; - // int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; - - // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - - // tree_branch_num + sub_req_id + tok_id; - // bitmask.tree_size - tree_branch_num + sub_req_id; - // if prompt token -> token id // if tree token: - int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - - bitmask.this_layer_size + token_idx - - request_token_offset; + + int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size + + bitmask.tree_size - 1 - bitmask.this_layer_size + + token_idx - request_token_offset; kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + offset] = kVal; @@ -411,7 +413,8 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->hidden_size, \ m->request_infos, \ m->beam_request_infos, \ - m->causalMask) + m->causalMask, \ + m->request_completed) template void compute_spec_inc_attention_kernel_generation( @@ -420,7 +423,8 @@ void compute_spec_inc_attention_kernel_generation( DT *output_ptr, cudaStream_t stream) { // one block == one head per request - dim3 grid(m->num_q_heads, bc->num_active_requests()); + // how many generation requests + dim3 grid(m->num_q_heads, bc->get_speculative_request_num()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; size_t smem_sz; @@ -499,11 +503,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; } - // else if (tokens_previous_requests < bc->num_generation_tokens) { - // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - // continue; - // } // all requests in prompt phase should only have one sub requests; assert(bc->sub_requests[i] == 1); @@ -659,10 +662,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - // print_tensor((float*)C_softmax, 32, "C_softmax"); + int token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; + (token_offset)*m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -860,6 +863,13 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + + request_completed = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + + sizeof(BatchConfig::causalMask)); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index b4af80976f..fc86e1498e 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -54,6 +54,7 @@ __global__ void compute_attention_kernel_fused_kernel( int num_heads, int num_requests, BatchConfig::BitMask *causalMask, + bool *request_completed, int qk_smem_sz) { // q, k @@ -90,13 +91,14 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[r].num_tokens_in_batch; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += + request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; } - // if(tidx == 0 && head_idx == 0){ - // printf("tree req: %d, %d\n", request_idx, first_token_idx); - // } + bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; + int q_start = + request_infos[batch_config_request_id].first_token_depth_in_request; // shared memory objects extern __shared__ char smem_[]; @@ -139,7 +141,7 @@ __global__ void compute_attention_kernel_fused_kernel( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); - // if (head_idx == 0 && qi == 1 && tidx == 0) { + // if (head_idx == 0 && request_idx == 1 && tidx == 0) { // printf("laod q %d, %d %.10f\n", // request_idx, // qi,q_vecs[ki_o][ii].x); @@ -163,19 +165,23 @@ __global__ void compute_attention_kernel_fused_kernel( if (ti < tlength && tidx % THREADS_PER_KEY == 0) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase ? (qi + q_start < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); qk_max = mask ? qk_max : fmaxf(qk_max, qk); - // if (head_idx == 0 && qi == 0 && !mask) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n - // ", + // if (head_idx == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, + // %.10f, %d\n", // request_idx, + // qi, // ti, // qk, // q_vecs[ki_o][0].x, - // k[0].x); + // k[0].x, + // bitmask.non_tree_cache_size); // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } @@ -217,8 +223,10 @@ __global__ void compute_attention_kernel_fused_kernel( float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase ? 
(q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; qk_smem[ti - first_step] = mask ? 0.0f : logit; @@ -265,8 +273,11 @@ __global__ void compute_attention_kernel_fused_kernel( if (ti < tlength) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase + ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); float logit = mask ? 0.0f : qk_smem[ti - first_step]; out = FlexFlow::fma(logit, cast_to_float(v), out); } @@ -810,6 +821,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, \ bc->num_active_requests(), \ m->causalMask, \ + m->request_completed, \ smem_sz[0]) template @@ -841,7 +853,6 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - // 0->qk production size, 1->total shared size int smem_sz[2]; if (per_head_size == 64) { @@ -890,17 +901,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << // "\n"; - cudaMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->causalMask, - &(bc->causalMask), - bc->num_active_requests() * sizeof(BatchConfig::BitMask), - cudaMemcpyHostToDevice, - stream); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -1068,6 +1068,12 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BatchConfig::causalMask)); + + request_completed = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::causalMask) + + sizeof(TreeVerifyBatchConfig::committed_tokens)); } cudaStreamSynchronize(stream); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index d2fbc0883f..c432208eca 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -84,6 +84,12 @@ int BatchConfig::max_tokens_per_batch() { return RequestManager::get_request_manager()->get_max_tokens_per_batch(); } +/*static*/ +int BatchConfig::max_verify_tokens_per_batch() { + return RequestManager::get_request_manager() + ->get_max_verify_tokens_per_batch(); +} + /*static*/ int BatchConfig::max_sequence_length() { return RequestManager::get_request_manager()->get_max_sequence_length(); diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 74843e9460..ff7bf1a819 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -85,6 +85,10 @@ int BeamSearchBatchConfig::max_beam_depth_all_requests() const { return max_depth_all_requests; } +int BeamSearchBatchConfig::get_speculative_request_num() const { + return speculative_request_num; +} + int BeamSearchBatchConfig::current_depth_all_requests() const { int current_depth = 0; for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 32b524f643..76bed36bda 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4805,6 +4805,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(EMBED_INF_TASK_ID, "Embedding Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Embedding Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(EMBED_BWD_TASK_ID, "Embedding Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 89d4ddaed4..88754f5a82 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -97,6 +97,12 @@ int RequestManager::get_max_tokens_per_batch() { return max_tokens_per_batch; } +int RequestManager::get_max_verify_tokens_per_batch() { + assert(max_tokens_per_batch > 0); + return max_tokens_per_batch + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM * max_requests_per_batch; +} + void RequestManager::set_max_sequence_length(int max_seq_length) { assert(max_sequence_length == -1 || max_sequence_length == max_seq_length); max_sequence_length = max_seq_length; @@ -1126,7 +1132,6 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].sub_request_num, tree, old_bc.beamRequestsInfo[i].current_depth); - // assert(false); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = 
new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { @@ -1146,6 +1151,9 @@ BeamSearchBatchConfig } } + // how many requests is in speculative phase + new_bc.speculative_request_num = num_active_req + 1; + // Add prompt tokens to the batch for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i] || old_bc.request_running[i]) { @@ -1184,13 +1192,14 @@ BeamSearchBatchConfig spec_infer_tree_width.size() > ssm_decoding_steps ? spec_infer_tree_width[ssm_decoding_steps] : 1; - printf("beam size: %d, %d\n", - new_bc.beamRequestsInfo[i].beam_size, - ssm_decoding_steps); + // printf("beam size: %d, %d\n", + // new_bc.beamRequestsInfo[i].beam_size, + // ssm_decoding_steps); new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; - new_bc.sub_requests[i] = - old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + // new_bc.sub_requests[i] = + // old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.sub_requests[i] = 1; new_bc.beamRequestsInfo[i].sub_request_num = old_bc.beamRequestsInfo[i].sub_request_num; @@ -1218,6 +1227,9 @@ BeamSearchBatchConfig request.tokens.size()) { // request is done new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.causalMask[i].this_layer_size = 0; + new_bc.beamRequestsInfo[i].sub_request_num = 0; + new_bc.beamRequestsInfo[i].beam_size = 1; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -1227,12 +1239,8 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; BeamTree tree = request.beam_trees[old_bc.model_id]; - appendBitMask(new_bc.causalMask[i], - new_bc.beamRequestsInfo[i].sub_request_num, - old_bc.beamRequestsInfo[i].beam_size, - old_bc.beamRequestsInfo[i].sub_request_num, - tree, - old_bc.beamRequestsInfo[i].current_depth); + appendPendingRequest(new_bc.causalMask[i], + new_bc.requestsInfo[i].num_tokens_in_batch); } if (verbose) { @@ -1258,11 +1266,11 @@ BeamSearchBatchConfig // get value from requestinfo new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[request.tokens.size() - 1]; + request.tokens[request.tokens.size() - + new_bc.requestsInfo[i].num_tokens_in_batch + j]; new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; - num_generation_tokens++; } } } @@ -1319,7 +1327,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; - int max_prompt_load_size = get_max_tokens_per_batch(); + int max_prompt_load_size = get_max_verify_tokens_per_batch(); for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; @@ -1427,7 +1435,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens > get_max_tokens_per_batch()) { + if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1453,7 +1461,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == get_max_tokens_per_batch() && + if (new_bc.num_tokens == get_max_verify_tokens_per_batch() && (j != dfs_tree_inputs.size() - 1)) { cutLayer = true; break; @@ -1542,7 +1550,7 
@@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - if (new_bc.num_tokens > get_max_tokens_per_batch()) { + if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1555,15 +1563,17 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.status = Request::RUNNING; new_bc.request_running[i] = true; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << + // std::endl; + new_bc.requestsInfo[i].prompt_phase = true; dfs_tree_inputs[guid] = std::vector>{std::make_pair( request.tokens.back(), request.tokens.size() - 1)}; } } else { // launch the request into running phase after loading all prompt - if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { + if (get_max_verify_tokens_per_batch() - new_bc.num_tokens > 0) { // std::cout << "Initialization running phase: " // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; @@ -1576,9 +1586,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch2: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << + // std::endl; + new_bc.requestsInfo[i].prompt_phase = true; dfs_tree_inputs[guid] = std::vector>{std::make_pair( request.tokens.back(), request.tokens.size() - 1)}; @@ -1760,20 +1772,14 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // prompt phase, init task void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, int initLength) { - assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && - "do not support tree size > 64"); + assert(initLength > 0); // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 bitmask.non_tree_cache_size = 0; - bitmask.tree_size = initLength; + bitmask.tree_size = 1; bitmask.prompt_size = initLength; bitmask.this_layer_size = initLength; - for (int i = 0; i < bitmask.prompt_size; i++) { - for (int j = i; j < bitmask.prompt_size; j++) { - bitmask.mask[i] |= (1 << j); - } - } // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; @@ -1810,6 +1816,25 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, // << "\n"; } +// prompt phase, init task +void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, + int initLength) { + assert(initLength > 0); + std::cout << "append pending bit mask: " << initLength << "\n"; + // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + bitmask.non_tree_cache_size = 0; + bitmask.tree_size = 1; + bitmask.prompt_size += initLength; + bitmask.this_layer_size = initLength; + + // for (int i = 0; i < bitmask.prompt_size; i++) { + // for (int j = i; j < bitmask.prompt_size; j++) { + // bitmask.mask[i] |= (1 << j); + // } + // } +} + // prepare next beam, append layers to the tree void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, int newNodes, @@ -1862,12 +1887,6 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, } } - // std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " - // << new_nodes_start_idx << ", " << newNodes - // << "current depth: " << currentDepth << "\n"; - // std::cout << "new nodes end " << new_nodes_start_idx << "\n"; - - // std::cout << "tree size: " << bitmask.tree_size << "\n"; assert(token_idx == pre_tree_size); assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 51c52c3026..8380d6be73 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -35,10 +35,17 @@ void RequestManager::load_tokens_task( // Extreme long prompts are not supported, only load up to // BatchConfig::max_tokens_per_batch() as prompt - if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch()) { + if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() && + batch_config->get_mode() == INC_DECODING_MODE) { printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + } else if (batch_config->num_tokens > + BatchConfig::max_verify_tokens_per_batch()) { + printf("Warning: Speculative decoding. too many tokens in prompt, only " + "load up to %d tokens\n", + BatchConfig::max_verify_tokens_per_batch()); + printf("Got: %d tokens\n", batch_config->num_tokens); } for (int i = 0; i < batch_config->num_tokens; i++) { @@ -117,8 +124,16 @@ void RequestManager::load_batch_config_task( sizeof(BatchConfig::causalMask), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::request_completed); } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); @@ -137,6 +152,15 @@ void RequestManager::load_batch_config_task( cudaMemcpyHostToDevice, stream)); total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::request_completed); } // add a size check From a45826e9daa0364b49f353c1c85cf2a9800bc1d9 Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Tue, 2 Jan 2024 15:28:52 -0500 Subject: [PATCH 297/344] SpecInfer fix corner case (#1258) * init * fix speculative * fix speculative * bitmap+tree verify * fix. 
* fix * multi batch * copy metadata once * fix some corner cases * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * more fix. * clean up * . * load batchconfig * clean * hip * hip * embedding return when no token * use arg topk instead of beam topk * embedding * fmt * hip * fix corner case --------- Co-authored-by: Zhihao Jia --- include/flexflow/batch_config.h | 14 ++- include/flexflow/config.h | 3 +- include/flexflow/model.h | 1 + .../inc_multihead_self_attention_utils.cuh | 2 +- .../ops/spec_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + include/flexflow/request_manager.h | 2 + inference/models/falcon.cc | 5 +- inference/models/llama.cc | 5 +- inference/models/mpt.cc | 5 +- inference/models/opt.cc | 5 +- inference/models/starcoder.cc | 5 +- src/ops/arg_topk.cu | 11 ++- src/ops/inc_multihead_self_attention.cu | 4 +- src/ops/spec_inc_multihead_self_attention.cu | 60 +++++++----- src/ops/tree_inc_multihead_self_attention.cu | 62 +++++++------ src/runtime/batch_config.cc | 6 ++ src/runtime/beam_search_batch_config.cc | 4 + src/runtime/model.cc | 14 +++ src/runtime/request_manager.cc | 93 +++++++++++-------- src/runtime/request_manager.cu | 28 +++++- 21 files changed, 224 insertions(+), 107 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 13904aaa46..ef17ef43ed 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -45,6 +45,7 @@ class BatchConfig { int num_active_tokens() const; static int max_requests_per_batch(); static int max_tokens_per_batch(); + static int max_verify_tokens_per_batch(); static int max_sequence_length(); friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; @@ -72,6 +73,7 @@ class BatchConfig { // request id in batch config: int batch_config_request_id; + bool prompt_phase = false; RequestGuid request_guid; }; struct PerTokenInfo { @@ -85,15 +87,15 @@ class BatchConfig { // how many tokens before the tree, every sub requests need this part of // cache - int non_tree_cache_size; + int non_tree_cache_size = 0; // current tree size - int tree_size; + int tree_size = 0; - int this_layer_size; + int this_layer_size = 0; // input length-> prompt/root - int prompt_size; + int prompt_size = 0; }; BitMask causalMask[MAX_NUM_REQUESTS]; @@ -145,9 +147,13 @@ class BeamSearchBatchConfig : public BatchConfig { bool done() const; int max_beam_depth_all_requests() const; int current_depth_all_requests() const; + int get_speculative_request_num() const; size_t beam_width; size_t target_iterations; + + // how many requests is in speculative phase + int speculative_request_num = 0; inline static int const MAX_BEAM_WIDTH = 3; inline static int const MAX_BEAM_DEPTH = 8; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index e1480264cc..17a3f59e29 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -84,7 +84,8 @@ struct FFHandler { sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens); + sizeof(TreeVerifyBatchConfig::committed_tokens) + + sizeof(BatchConfig::request_completed); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cf7bb3dd2d..6f805e21bd 100644 --- 
a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -73,6 +73,7 @@ enum TaskIDs { DROPOUT_BWD_TASK_ID, EMBED_INIT_TASK_ID, EMBED_FWD_TASK_ID, + EMBED_INF_TASK_ID, EMBED_BWD_TASK_ID, GATHER_INIT_TASK_ID, GATHER_FWD_TASK_ID, diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index c128c1a126..d1e0e050b2 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, } // todo fix this - int max_qk_length = max_query_length * max_total_length; + int max_qk_length = max_query_length; // The amount of shared memory needed to store the Q*K^T values in float. size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index a306f7985a..a0d01092bf 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -142,6 +142,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { Realm::RegionInstance beam_search_reserve_inst; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; + bool *request_completed; BatchConfig::BitMask *causalMask; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index d160da4a72..02df0c0137 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -147,6 +147,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int num_active_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; + bool *request_completed; BatchConfig::BitMask *causalMask; }; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 1c4b0b2a2f..33714c106e 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -103,6 +103,7 @@ class RequestManager { int get_max_requests_per_batch(); void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); + int get_max_verify_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); @@ -113,6 +114,7 @@ class RequestManager { std::string const &path); void register_output_filepath(std::string const &); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); + void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void appendBitMask(BatchConfig::BitMask &bitmask, int newNodes, int preBeamSize, diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index bfcec847b9..999ca37037 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,7 +39,10 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e9c84efe90..e54d6d8811 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,7 +41,10 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index b074d332ed..3df67b264c 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,10 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9b29ae5410..0279f83239 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,10 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index ba7b2cb43a..e683376e47 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,7 +48,10 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 0b8bb8b563..5b7978812c 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -404,17 +404,22 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, assert(bc->num_active_requests() >= 0); // check + // allow last request different with others int beam_size = -1; - for (int i = 1; i < bc->max_requests_per_batch(); i++) { + int num_activate_requests = bc->num_active_requests(); + int last_request_idx = + bc->requestsInfo[num_activate_requests - 1].batch_config_request_id; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } else if (beam_size == -1) { beam_size = bc->beamRequestsInfo[i].beam_size; - } else { + + } else if (i != last_request_idx) { assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } else if (i == last_request_idx) { } } - assert(num_shards >= (size_t)beam_size); num_shards = k; arg_topk_forward_kernel<<>>( diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index db64868cb9..7c8601d3c8 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1349,7 +1349,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 88dd3f92e4..b31e5d0994 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -50,7 +50,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( int hidden_size, BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask) { + BatchConfig::BitMask *causalMask, + bool *request_completed) { // q, k using Q_vec = typename VEC_K::Type; @@ -86,11 +87,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( // request_infos[batch_config_request_id].first_token_depth_in_request + // request_infos[batch_config_request_id].num_tokens_in_batch; - int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + int const totalCacheSize = + bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1; int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += causalMask[r].this_layer_size; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += request_completed[r] ? 
0 : causalMask[r].this_layer_size; } int const tree_branch_num = @@ -138,7 +140,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( ii * THREADS_PER_KEY * K_VEC_SIZE); } - int const query_token = bitmask.tree_size - tree_branch_num + qi; + int const query_token = + bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi; __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { @@ -163,8 +166,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << query_token)))); - // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { - // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); + // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) { + // printf("spec inc attn qkqkqk request id %d, %.10f, %d\n", + // batch_config_request_id, + // ti, + // qk, + // qi); // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; @@ -336,17 +343,12 @@ __global__ void spec_inc_store_kv_cache( BatchConfig::BitMask bitmask = causalMask[req_id]; - // int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; - - // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - - // tree_branch_num + sub_req_id + tok_id; - // bitmask.tree_size - tree_branch_num + sub_req_id; - // if prompt token -> token id // if tree token: - int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - - bitmask.this_layer_size + token_idx - - request_token_offset; + + int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size + + bitmask.tree_size - 1 - bitmask.this_layer_size + + token_idx - request_token_offset; kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + offset] = kVal; @@ -411,7 +413,8 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->hidden_size, \ m->request_infos, \ m->beam_request_infos, \ - m->causalMask) + m->causalMask, \ + m->request_completed) template void compute_spec_inc_attention_kernel_generation( @@ -420,7 +423,8 @@ void compute_spec_inc_attention_kernel_generation( DT *output_ptr, cudaStream_t stream) { // one block == one head per request - dim3 grid(m->num_q_heads, bc->num_active_requests()); + // how many generation requests + dim3 grid(m->num_q_heads, bc->get_speculative_request_num()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; size_t smem_sz; @@ -499,11 +503,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; } - // else if (tokens_previous_requests < bc->num_generation_tokens) { - // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - // continue; - // } // all requests in prompt phase should only have one sub requests; assert(bc->sub_requests[i] == 1); @@ -659,10 +662,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - // print_tensor((float*)C_softmax, 32, "C_softmax"); + int token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; + (token_offset)*m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -860,6 +863,13 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + + request_completed = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + + sizeof(BatchConfig::causalMask)); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index b4af80976f..fc86e1498e 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -54,6 +54,7 @@ __global__ void compute_attention_kernel_fused_kernel( int num_heads, int num_requests, BatchConfig::BitMask *causalMask, + bool *request_completed, int qk_smem_sz) { // q, k @@ -90,13 +91,14 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[r].num_tokens_in_batch; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += + request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; } - // if(tidx == 0 && head_idx == 0){ - // printf("tree req: %d, %d\n", request_idx, first_token_idx); - // } + bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; + int q_start = + request_infos[batch_config_request_id].first_token_depth_in_request; // shared memory objects extern __shared__ char smem_[]; @@ -139,7 +141,7 @@ __global__ void compute_attention_kernel_fused_kernel( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); - // if (head_idx == 0 && qi == 1 && tidx == 0) { + // if (head_idx == 0 && request_idx == 1 && tidx == 0) { // printf("laod q %d, %d %.10f\n", // request_idx, // qi,q_vecs[ki_o][ii].x); @@ -163,19 +165,23 @@ __global__ void compute_attention_kernel_fused_kernel( if (ti < tlength && tidx % THREADS_PER_KEY == 0) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase ? (qi + q_start < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); qk_max = mask ? qk_max : fmaxf(qk_max, qk); - // if (head_idx == 0 && qi == 0 && !mask) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n - // ", + // if (head_idx == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, + // %.10f, %d\n", // request_idx, + // qi, // ti, // qk, // q_vecs[ki_o][0].x, - // k[0].x); + // k[0].x, + // bitmask.non_tree_cache_size); // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } @@ -217,8 +223,10 @@ __global__ void compute_attention_kernel_fused_kernel( float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase ? 
(q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; qk_smem[ti - first_step] = mask ? 0.0f : logit; @@ -265,8 +273,11 @@ __global__ void compute_attention_kernel_fused_kernel( if (ti < tlength) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase + ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); float logit = mask ? 0.0f : qk_smem[ti - first_step]; out = FlexFlow::fma(logit, cast_to_float(v), out); } @@ -810,6 +821,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, \ bc->num_active_requests(), \ m->causalMask, \ + m->request_completed, \ smem_sz[0]) template @@ -841,7 +853,6 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - // 0->qk production size, 1->total shared size int smem_sz[2]; if (per_head_size == 64) { @@ -890,17 +901,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << // "\n"; - cudaMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->causalMask, - &(bc->causalMask), - bc->num_active_requests() * sizeof(BatchConfig::BitMask), - cudaMemcpyHostToDevice, - stream); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -1068,6 +1068,12 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BatchConfig::causalMask)); + + request_completed = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::causalMask) + + sizeof(TreeVerifyBatchConfig::committed_tokens)); } cudaStreamSynchronize(stream); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index d2fbc0883f..c432208eca 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -84,6 +84,12 @@ int BatchConfig::max_tokens_per_batch() { return RequestManager::get_request_manager()->get_max_tokens_per_batch(); } +/*static*/ +int BatchConfig::max_verify_tokens_per_batch() { + return RequestManager::get_request_manager() + ->get_max_verify_tokens_per_batch(); +} + /*static*/ int BatchConfig::max_sequence_length() { return RequestManager::get_request_manager()->get_max_sequence_length(); diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 74843e9460..ff7bf1a819 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -85,6 +85,10 @@ int BeamSearchBatchConfig::max_beam_depth_all_requests() const { return max_depth_all_requests; } +int BeamSearchBatchConfig::get_speculative_request_num() const { + return speculative_request_num; +} + int BeamSearchBatchConfig::current_depth_all_requests() const { int current_depth = 0; for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 32b524f643..76bed36bda 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4805,6 +4805,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(EMBED_INF_TASK_ID, "Embedding Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Embedding Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(EMBED_BWD_TASK_ID, "Embedding Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 89d4ddaed4..88754f5a82 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -97,6 +97,12 @@ int RequestManager::get_max_tokens_per_batch() { return max_tokens_per_batch; } +int RequestManager::get_max_verify_tokens_per_batch() { + assert(max_tokens_per_batch > 0); + return max_tokens_per_batch + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM * max_requests_per_batch; +} + void RequestManager::set_max_sequence_length(int max_seq_length) { assert(max_sequence_length == -1 || max_sequence_length == max_seq_length); max_sequence_length = max_seq_length; @@ -1126,7 +1132,6 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].sub_request_num, tree, old_bc.beamRequestsInfo[i].current_depth); - // assert(false); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = 
new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { @@ -1146,6 +1151,9 @@ BeamSearchBatchConfig } } + // how many requests is in speculative phase + new_bc.speculative_request_num = num_active_req + 1; + // Add prompt tokens to the batch for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i] || old_bc.request_running[i]) { @@ -1184,13 +1192,14 @@ BeamSearchBatchConfig spec_infer_tree_width.size() > ssm_decoding_steps ? spec_infer_tree_width[ssm_decoding_steps] : 1; - printf("beam size: %d, %d\n", - new_bc.beamRequestsInfo[i].beam_size, - ssm_decoding_steps); + // printf("beam size: %d, %d\n", + // new_bc.beamRequestsInfo[i].beam_size, + // ssm_decoding_steps); new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; - new_bc.sub_requests[i] = - old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + // new_bc.sub_requests[i] = + // old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.sub_requests[i] = 1; new_bc.beamRequestsInfo[i].sub_request_num = old_bc.beamRequestsInfo[i].sub_request_num; @@ -1218,6 +1227,9 @@ BeamSearchBatchConfig request.tokens.size()) { // request is done new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.causalMask[i].this_layer_size = 0; + new_bc.beamRequestsInfo[i].sub_request_num = 0; + new_bc.beamRequestsInfo[i].beam_size = 1; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -1227,12 +1239,8 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; BeamTree tree = request.beam_trees[old_bc.model_id]; - appendBitMask(new_bc.causalMask[i], - new_bc.beamRequestsInfo[i].sub_request_num, - old_bc.beamRequestsInfo[i].beam_size, - old_bc.beamRequestsInfo[i].sub_request_num, - tree, - old_bc.beamRequestsInfo[i].current_depth); + appendPendingRequest(new_bc.causalMask[i], + new_bc.requestsInfo[i].num_tokens_in_batch); } if (verbose) { @@ -1258,11 +1266,11 @@ BeamSearchBatchConfig // get value from requestinfo new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[request.tokens.size() - 1]; + request.tokens[request.tokens.size() - + new_bc.requestsInfo[i].num_tokens_in_batch + j]; new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; - num_generation_tokens++; } } } @@ -1319,7 +1327,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; - int max_prompt_load_size = get_max_tokens_per_batch(); + int max_prompt_load_size = get_max_verify_tokens_per_batch(); for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; @@ -1427,7 +1435,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens > get_max_tokens_per_batch()) { + if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1453,7 +1461,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == get_max_tokens_per_batch() && + if (new_bc.num_tokens == get_max_verify_tokens_per_batch() && (j != dfs_tree_inputs.size() - 1)) { cutLayer = true; break; @@ -1542,7 +1550,7 
@@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - if (new_bc.num_tokens > get_max_tokens_per_batch()) { + if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1555,15 +1563,17 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.status = Request::RUNNING; new_bc.request_running[i] = true; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << + // std::endl; + new_bc.requestsInfo[i].prompt_phase = true; dfs_tree_inputs[guid] = std::vector>{std::make_pair( request.tokens.back(), request.tokens.size() - 1)}; } } else { // launch the request into running phase after loading all prompt - if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { + if (get_max_verify_tokens_per_batch() - new_bc.num_tokens > 0) { // std::cout << "Initialization running phase: " // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; @@ -1576,9 +1586,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch2: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << + // std::endl; + new_bc.requestsInfo[i].prompt_phase = true; dfs_tree_inputs[guid] = std::vector>{std::make_pair( request.tokens.back(), request.tokens.size() - 1)}; @@ -1760,20 +1772,14 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // prompt phase, init task void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, int initLength) { - assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && - "do not support tree size > 64"); + assert(initLength > 0); // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 bitmask.non_tree_cache_size = 0; - bitmask.tree_size = initLength; + bitmask.tree_size = 1; bitmask.prompt_size = initLength; bitmask.this_layer_size = initLength; - for (int i = 0; i < bitmask.prompt_size; i++) { - for (int j = i; j < bitmask.prompt_size; j++) { - bitmask.mask[i] |= (1 << j); - } - } // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; @@ -1810,6 +1816,25 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, // << "\n"; } +// prompt phase, init task +void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, + int initLength) { + assert(initLength > 0); + std::cout << "append pending bit mask: " << initLength << "\n"; + // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + bitmask.non_tree_cache_size = 0; + bitmask.tree_size = 1; + bitmask.prompt_size += initLength; + bitmask.this_layer_size = initLength; + + // for (int i = 0; i < bitmask.prompt_size; i++) { + // for (int j = i; j < bitmask.prompt_size; j++) { + // bitmask.mask[i] |= (1 << j); + // } + // } +} + // prepare next beam, append layers to the tree void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, int newNodes, @@ -1862,12 +1887,6 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, } } - // std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " - // << new_nodes_start_idx << ", " << newNodes - // << "current depth: " << currentDepth << "\n"; - // std::cout << "new nodes end " << new_nodes_start_idx << "\n"; - - // std::cout << "tree size: " << bitmask.tree_size << "\n"; assert(token_idx == pre_tree_size); assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 51c52c3026..8380d6be73 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -35,10 +35,17 @@ void RequestManager::load_tokens_task( // Extreme long prompts are not supported, only load up to // BatchConfig::max_tokens_per_batch() as prompt - if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch()) { + if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() && + batch_config->get_mode() == INC_DECODING_MODE) { printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + } else if (batch_config->num_tokens > + BatchConfig::max_verify_tokens_per_batch()) { + printf("Warning: Speculative decoding. 
too many tokens in prompt, only " + "load up to %d tokens\n", + BatchConfig::max_verify_tokens_per_batch()); + printf("Got: %d tokens\n", batch_config->num_tokens); } for (int i = 0; i < batch_config->num_tokens; i++) { @@ -117,8 +124,16 @@ void RequestManager::load_batch_config_task( sizeof(BatchConfig::causalMask), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::request_completed); } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); @@ -137,6 +152,15 @@ void RequestManager::load_batch_config_task( cudaMemcpyHostToDevice, stream)); total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::request_completed); } // add a size check From 8490e50d5744b6731df9fdc4147b2a6ebd4f2d71 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 2 Jan 2024 16:20:24 -0500 Subject: [PATCH 298/344] fix --- src/runtime/request_manager.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 88754f5a82..a285932b7f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1188,10 +1188,7 @@ BeamSearchBatchConfig int ssm_decoding_steps = profiling_requests[request.guid].ssm_decoding_steps; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; + new_bc.beamRequestsInfo[i].beam_size = 1; // printf("beam size: %d, %d\n", // new_bc.beamRequestsInfo[i].beam_size, // ssm_decoding_steps); @@ -1820,7 +1817,7 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength) { assert(initLength > 0); - std::cout << "append pending bit mask: " << initLength << "\n"; + // std::cout << "append pending bit mask: " << initLength << "\n"; // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 bitmask.non_tree_cache_size = 0; From c12f0c6ddaea6629214278167b047ffa3158b491 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 3 Jan 2024 00:28:15 -0500 Subject: [PATCH 299/344] fix request id issue --- src/ops/inc_multihead_self_attention.cu | 42 +++++--------------- src/ops/spec_inc_multihead_self_attention.cu | 8 ++-- src/runtime/request_manager.cc | 6 +++ 3 files changed, 20 insertions(+), 36 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7c8601d3c8..42933cee27 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -52,9 +52,7 @@ __global__ void compute_attention_kernel_generation_kernel( int max_seq_length, int per_head_size, int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - bool is_beam, - int max_beam_width) { + BatchConfig::PerRequestInfo *request_infos) { // q, k using Q_vec = typename VEC_K::Type; @@ -85,10 +83,6 @@ __global__ void compute_attention_kernel_generation_kernel( int const batch_config_request_id = request_infos[request_idx].batch_config_request_id; - int const beam_request_idx = - is_beam ? request_idx / max_beam_width : request_idx; - int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; - int const first_step = 0; int const tlength = @@ -106,8 +100,7 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + - batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = @@ -142,10 +135,7 @@ __global__ void compute_attention_kernel_generation_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + - (batch_config_request_id * max_beam_width + beam_sub_request_idx) * - max_seq_length * hidden_size + - ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -248,10 +238,7 @@ __global__ void compute_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + - (batch_config_request_id * max_beam_width + beam_sub_request_idx) * - max_seq_length * hidden_size + - vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { @@ -297,7 +284,7 @@ __global__ void compute_attention_kernel_generation_kernel( // Output the final values. 
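// ---------------------------------------------------------------------------
// Illustrative aside, not part of the patch above: a minimal host-side sketch
// of the output indexing this generation kernel uses, assuming the per-token
// output is laid out as [num_requests, num_heads, head_dim] and hidden_size
// equals num_heads * head_dim. Names below are hypothetical, not FlexFlow API.
#include <cassert>
#include <cstddef>

// Flat offset of element `vi` of head `head_idx` for the single generation
// token of request `request_idx`.
inline size_t attn_output_offset(size_t request_idx, size_t head_idx, size_t vi,
                                 size_t num_heads, size_t head_dim) {
  size_t const hidden_size = num_heads * head_dim; // per-token output width
  assert(head_idx < num_heads && vi < head_dim);
  return request_idx * hidden_size + head_idx * head_dim + vi;
}

// Example: request 2, head 3, element 5 with 32 heads of 128 elements lands at
// 2*4096 + 3*128 + 5 = 8581.
int main() { return attn_output_offset(2, 3, 5, 32, 128) == 8581 ? 0 : 1; }
// ---------------------------------------------------------------------------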
if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { convert_from_float( - *reinterpret_cast(output_ptr + beam_request_idx * hidden_size + + *reinterpret_cast(output_ptr + request_idx * hidden_size + head_idx * per_head_size + vi), out); } @@ -727,9 +714,7 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, BatchConfig::max_sequence_length(), \ m->qProjSize, \ m->hidden_size, \ - m->request_infos, \ - false, \ - 0) + m->request_infos) template void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, @@ -944,14 +929,9 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { continue; } - assert(tokens_previous_requests == - bc->requestsInfo[i].first_token_offset_in_batch); int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; @@ -978,8 +958,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] // To get query projection, skip over Q entries from previous requests DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; // matrix B: key cache // matrix B's layout: [kProjSize * num_heads, total_tokens] // To get B, skip over K entries from previous requests (all heads + @@ -1117,7 +1097,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // requests // store the result attn heads, also skip the genration tokens DT *C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * + (bc->requestsInfo[i].first_token_offset_in_batch) * m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, @@ -1145,7 +1125,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, } tokens_previous_requests += num_new_tokens; } - assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } /*static*/ diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b31e5d0994..a63417de51 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -501,10 +501,8 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) || + (bc->requestsInfo[i].num_tokens_in_batch == 0)) { continue; } @@ -694,7 +692,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_prev_requests_squares += num_new_tokens * total_tokens; } - // assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } template diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a285932b7f..c867d2a979 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -468,12 +468,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; num_generation_tokens++; + new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].prompt_phase = true; } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; @@ -509,6 +511,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request @@ -755,6 +758,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; profiling_requests[request.guid].ssm_decoding_steps = 0; + new_bc.requestsInfo[i].prompt_phase = true; int ssm_decoding_steps = 0; new_bc.beamRequestsInfo[i].beam_size = @@ -902,6 +906,7 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].prompt_phase = true; new_bc.beamRequestsInfo[i].sub_request_num = 1; printf("sub request num == 1, %d \n", @@ -1220,6 +1225,7 @@ BeamSearchBatchConfig &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); + new_bc.requestsInfo[i].prompt_phase = true; if (new_bc.requestsInfo[i].first_token_depth_in_request >= request.tokens.size()) { // request is done From 284ad772692e8b5f0c012de7d2493d95f3380428 
Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Wed, 3 Jan 2024 03:08:04 -0500 Subject: [PATCH 300/344] Fix Request Id order issue (#1260) * init * fix speculative * fix speculative * bitmap+tree verify * fix. * fix * multi batch * copy metadata once * fix some corner cases * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * more fix. * clean up * . * load batchconfig * clean * hip * hip * embedding return when no token * use arg topk instead of beam topk * embedding * fmt * hip * fix corner case * fix * fix request id issue --------- Co-authored-by: Zhihao Jia --- src/ops/inc_multihead_self_attention.cu | 42 +++++--------------- src/ops/spec_inc_multihead_self_attention.cu | 5 ++- src/runtime/request_manager.cc | 13 +++--- 3 files changed, 22 insertions(+), 38 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7c8601d3c8..42933cee27 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -52,9 +52,7 @@ __global__ void compute_attention_kernel_generation_kernel( int max_seq_length, int per_head_size, int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - bool is_beam, - int max_beam_width) { + BatchConfig::PerRequestInfo *request_infos) { // q, k using Q_vec = typename VEC_K::Type; @@ -85,10 +83,6 @@ __global__ void compute_attention_kernel_generation_kernel( int const batch_config_request_id = request_infos[request_idx].batch_config_request_id; - int const beam_request_idx = - is_beam ? request_idx / max_beam_width : request_idx; - int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; - int const first_step = 0; int const tlength = @@ -106,8 +100,7 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + - batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = @@ -142,10 +135,7 @@ __global__ void compute_attention_kernel_generation_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + - (batch_config_request_id * max_beam_width + beam_sub_request_idx) * - max_seq_length * hidden_size + - ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -248,10 +238,7 @@ __global__ void compute_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + - (batch_config_request_id * max_beam_width + beam_sub_request_idx) * - max_seq_length * hidden_size + - vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { @@ -297,7 +284,7 @@ __global__ void compute_attention_kernel_generation_kernel( // Output the final values. 
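// ---------------------------------------------------------------------------
// Illustrative aside, not part of the patch above: after this change the key/
// value cache addressed by the kernel can be pictured as one contiguous
// [max_seq_length, hidden_size] slab per batch slot, selected directly by
// batch_config_request_id with no per-beam offset. A minimal host-side sketch
// with hypothetical names (not the FlexFlow API):
#include <cstddef>

// Offset of channel `ci` at timestep `t` for batch slot `slot` in a cache laid
// out as [num_slots, max_seq_len, hidden_size].
inline size_t kv_cache_offset(size_t slot, size_t t, size_t ci,
                              size_t max_seq_len, size_t hidden_size) {
  return slot * max_seq_len * hidden_size + t * hidden_size + ci;
}

// Example: slot 1, timestep 0, channel 7 with max_seq_len 256 and
// hidden_size 4096 lands at 1*256*4096 + 0*4096 + 7 = 1048583.
int main() { return kv_cache_offset(1, 0, 7, 256, 4096) == 1048583 ? 0 : 1; }
// ---------------------------------------------------------------------------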
if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { convert_from_float( - *reinterpret_cast(output_ptr + beam_request_idx * hidden_size + + *reinterpret_cast(output_ptr + request_idx * hidden_size + head_idx * per_head_size + vi), out); } @@ -727,9 +714,7 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, BatchConfig::max_sequence_length(), \ m->qProjSize, \ m->hidden_size, \ - m->request_infos, \ - false, \ - 0) + m->request_infos) template void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, @@ -944,14 +929,9 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { continue; } - assert(tokens_previous_requests == - bc->requestsInfo[i].first_token_offset_in_batch); int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; @@ -978,8 +958,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] // To get query projection, skip over Q entries from previous requests DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; // matrix B: key cache // matrix B's layout: [kProjSize * num_heads, total_tokens] // To get B, skip over K entries from previous requests (all heads + @@ -1117,7 +1097,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // requests // store the result attn heads, also skip the genration tokens DT *C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * + (bc->requestsInfo[i].first_token_offset_in_batch) * m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, @@ -1145,7 +1125,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, } tokens_previous_requests += num_new_tokens; } - assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } /*static*/ diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b31e5d0994..2d80ed2221 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -501,7 +501,8 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) || + (bc->requestsInfo[i].num_tokens_in_batch == 0)) { continue; } else if (tokens_previous_requests < bc->num_generation_tokens) { tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; @@ -694,7 +695,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_prev_requests_squares += num_new_tokens * total_tokens; } - // assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } template diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 88754f5a82..c867d2a979 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -468,12 +468,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; num_generation_tokens++; + new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].prompt_phase = true; } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; @@ -509,6 +511,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request @@ -755,6 +758,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; profiling_requests[request.guid].ssm_decoding_steps = 0; + new_bc.requestsInfo[i].prompt_phase = true; int ssm_decoding_steps = 0; new_bc.beamRequestsInfo[i].beam_size = @@ -902,6 +906,7 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].prompt_phase = true; new_bc.beamRequestsInfo[i].sub_request_num = 1; printf("sub request num == 1, %d \n", @@ -1188,10 +1193,7 @@ BeamSearchBatchConfig int ssm_decoding_steps = profiling_requests[request.guid].ssm_decoding_steps; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? 
spec_infer_tree_width[ssm_decoding_steps] - : 1; + new_bc.beamRequestsInfo[i].beam_size = 1; // printf("beam size: %d, %d\n", // new_bc.beamRequestsInfo[i].beam_size, // ssm_decoding_steps); @@ -1223,6 +1225,7 @@ BeamSearchBatchConfig &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); + new_bc.requestsInfo[i].prompt_phase = true; if (new_bc.requestsInfo[i].first_token_depth_in_request >= request.tokens.size()) { // request is done @@ -1820,7 +1823,7 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength) { assert(initLength > 0); - std::cout << "append pending bit mask: " << initLength << "\n"; + // std::cout << "append pending bit mask: " << initLength << "\n"; // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 bitmask.non_tree_cache_size = 0; From e17fb8d923b38221d3ab8ba52677505c2c4a9f93 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 3 Jan 2024 23:32:45 -0500 Subject: [PATCH 301/344] change MAX_SPECULATIVE_TREE_BRANCHES --- include/flexflow/batch_config.h | 23 ++++++++++++++--------- include/flexflow/request_manager.h | 2 +- src/runtime/request_manager.cc | 11 ++++++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ef17ef43ed..3dcae464cc 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -167,9 +167,10 @@ class BeamSearchBatchConfig : public BatchConfig { int current_depth = -1; int max_depth = MAX_BEAM_DEPTH; - BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - int parent_id[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + BatchConfig::TokenId + tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + int parent_id[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; int sub_request_num; }; @@ -178,10 +179,11 @@ class BeamSearchBatchConfig : public BatchConfig { }; BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; - BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; + BeamSearchPerTokenInfo + beamTokenInfo[MAX_NUM_TOKENS + + MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS]; - // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? 
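// ---------------------------------------------------------------------------
// Illustrative aside, not part of the patch above: sizing beamTokenInfo as
// MAX_NUM_TOKENS + MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS mirrors the
// verify-phase budget, i.e. the regular per-batch token budget plus room for
// one speculative tree per request slot. A minimal sketch with assumed example
// values (the constants here are illustrative, not FlexFlow's actual config):
#include <cstdio>

int main() {
  int const max_tokens_per_batch = 128;   // assumed base token budget
  int const max_requests_per_batch = 8;   // assumed number of batch slots
  int const max_spec_tree_token_num = 64; // assumed per-request tree capacity
  int const max_verify_tokens_per_batch =
      max_tokens_per_batch + max_spec_tree_token_num * max_requests_per_batch;
  std::printf("verify-phase budget: %d tokens\n", max_verify_tokens_per_batch);
  return 0; // 128 + 64*8 = 640 tokens with these example values
}
// ---------------------------------------------------------------------------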
- int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; + int sub_requests[MAX_SPECULATIVE_TREE_BRANCHES]; private: size_t current_iteration; @@ -190,9 +192,12 @@ class BeamSearchBatchConfig : public BatchConfig { struct BeamInferenceResult { static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; BatchConfig::TokenId - token_ids[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float probs[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - int parent_id[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + token_ids[MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + float probs[MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + int parent_id[MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; }; }; // namespace FlexFlow diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 33714c106e..f74b6c5b9f 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -76,7 +76,7 @@ struct BeamTree { struct treeLayer { BeamSearchBatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int parent_ids[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; int nodes_num_this_layer = 0; }; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index c867d2a979..91a5d3be86 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -767,7 +767,9 @@ BeamSearchBatchConfig : 1; new_bc.beamRequestsInfo[i].max_depth = std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + for (int j = 0; + j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } @@ -840,7 +842,8 @@ BeamSearchBatchConfig ? 
spec_infer_tree_width[ssm_decoding_steps] : 1; new_bc.beamRequestsInfo[i].max_depth = 0; - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + for (int j = 0; j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } @@ -900,7 +903,9 @@ BeamSearchBatchConfig std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, get_max_tokens_per_batch() - new_bc.requestsInfo[i].num_tokens_in_batch - 1); - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + for (int j = 0; + j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } From 429ddb59073f3155acd7f255c97f2153f99d130b Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 4 Jan 2024 00:06:48 -0500 Subject: [PATCH 302/344] =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/flexflow/batch_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 3dcae464cc..5c126293cf 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -183,7 +183,7 @@ class BeamSearchBatchConfig : public BatchConfig { beamTokenInfo[MAX_NUM_TOKENS + MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS]; - int sub_requests[MAX_SPECULATIVE_TREE_BRANCHES]; + int sub_requests[MAX_NUM_REQUESTS]; private: size_t current_iteration; From 4f61b9f348094f87cc4d32625a65ffb64156d325 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 Jan 2024 19:31:20 +0000 Subject: [PATCH 303/344] fix --- src/runtime/request_manager.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 91a5d3be86..56a2c122d3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -43,7 +43,8 @@ std::string LoadBytesFromFile(std::string const &path) { } RequestManager::RequestManager() - : verbose(false), next_available_guid(1000000), num_processed_requests(0) { + : verbose(false), next_available_guid(1000000), num_processed_requests(0), + total_request_run_time(0.0f) { // The following config parameters are set // during ffmodel.compile() // Initialize them to -1 to make sure no one From 29735f2432efd8290bf4ebb301fa96cbb5530eff Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 Jan 2024 22:33:22 +0000 Subject: [PATCH 304/344] fixes to run chatgpt.json prompt dataset in python --- .dockerignore | 2 ++ .gitignore | 3 ++- python/flexflow/core/flexflow_cffi.py | 2 +- src/c/flexflow_c.cc | 6 +++++- src/runtime/model.cu | 1 - tests/inference/python_inference_tests.sh | 3 ++- 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.dockerignore b/.dockerignore index a7470203e3..b9f228c009 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,3 +17,5 @@ python/flexflow/core/legion_cffi_header.py /inference/tokenizer/* /inference/prompt/* /inference/output/* + +/tests/inference/python_test_configs/*.json diff --git a/.gitignore b/.gitignore index 8fcc105f01..7f6a3c4137 100644 --- a/.gitignore +++ b/.gitignore @@ -186,4 +186,5 @@ gpt_tokenizer # pip version python/flexflow/version.txt -inference_tensors \ No newline at end of file +inference_tensors +tests/inference/python_test_configs/*.json diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 
a3c221474d..00133dacb4 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -56,7 +56,7 @@ def get_c_name(name): if name is None: return ffi.NULL else: - return ffi.new("char[]", name.encode("ascii")) + return ffi.new("char[]", name.encode("utf-8")) def get_datatype_size(datatype): diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 579fc5e2d1..82a37a9736 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1596,7 +1596,11 @@ flexflow_generation_result_t GenerationResult result = handle->generate(prompts, max_seq_length); DEBUG_PRINT( "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length); - assert(result.output_tokens.size() <= max_seq_length); + // If the prompt exceeds max seq len, check that we return the prompt with no + // additional token. Otherwise, check that the output does not exceed the max + // sequence length. + assert(result.output_tokens.size() <= max_seq_length || + result.output_tokens.size() == result.input_tokens.size()); output_length_and_tokens[0] = result.output_tokens.size(); std::copy(result.output_tokens.begin(), result.output_tokens.end(), diff --git a/src/runtime/model.cu b/src/runtime/model.cu index c885b29db2..23b7f0efbe 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -175,7 +175,6 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } - // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 3544f58e26..10c0821835 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -6,11 +6,12 @@ set -e cd "${BASH_SOURCE[0]%/*}" # Generate test configs +rm -rf python_test_configs/*.json python python_test_configs/generate_configs.py # Run all tests # Loop through .json files in the ./python_test_configs dir -for file in ./python_test_configs/*.json; do +for file in ./python_test_configs/*"llama"*.json; do # Check filename prefix if [[ $file == *"incr_dec"* ]]; then script="../../inference/python/incr_decoding.py" From ba4af39404bb92af10926222ceb6d9e88a147fb9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 06:56:36 +0000 Subject: [PATCH 305/344] fix --- tests/inference/python_inference_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 10c0821835..a1ee281914 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -11,7 +11,7 @@ python python_test_configs/generate_configs.py # Run all tests # Loop through .json files in the ./python_test_configs dir -for file in ./python_test_configs/*"llama"*.json; do +for file in ./python_test_configs/*.json; do # Check filename prefix if [[ $file == *"incr_dec"* ]]; then script="../../inference/python/incr_decoding.py" From 9c85a4f5900e45e2e7dfbc98f57bf43237b4dbc9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 10 Jan 2024 13:54:11 -0500 Subject: [PATCH 306/344] Fuse bias and relu in OPT (#1265) --- include/flexflow/model.h | 3 ++- inference/models/opt.cc | 5 ++--- python/flexflow/serve/models/opt.py | 5 ++--- src/ops/kernels/linear_kernels.cu | 22 ++++++++++++++++++++++ src/runtime/model.cc | 27 ++++++++++++++++++++++++--- 5 files changed, 52 insertions(+), 10 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 
6f805e21bd..75b1dbcbe9 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1090,7 +1090,7 @@ class FFModel { std::unordered_map>> get_bwd_edge_map() const; - // Internal funcitons + // Internal functions Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc); Legion::IndexSpace get_or_create_task_is(MachineView const &view); Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain); @@ -1098,6 +1098,7 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, std::vector const &inputs); diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 0279f83239..e260f8fa36 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -196,7 +196,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor fc1 = ff.dense(final_norm, opt_config.ffn_dim, - AC_MODE_NONE, + AC_MODE_RELU, true, DT_NONE, nullptr, @@ -205,8 +205,7 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc1").c_str()); - Tensor activation = ff.relu(fc1, false); - fc2 = ff.dense(activation, + fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, true, diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index dfd1cde7d4..dd36fa6592 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -216,13 +216,12 @@ def build_model(self, max_tokens_per_batch): fc1 = ffmodel.dense( ff_norm, self.opt_config.ffn_dim, - ActiMode.AC_MODE_NONE, + ActiMode.AC_MODE_RELU, True, name=f"layers_{i}_fc1", ) - activation = ffmodel.relu(fc1, False) fc2 = ffmodel.dense( - activation, + fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 9373c2fb2f..c30c9f71c1 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -252,6 +252,18 @@ Parameter* Linear::get_parameter(int index) */ namespace Internal { +template +__global__ void AddBiasWithReLU(DT *output_ptr, + DT const *bias_ptr, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, out_dim * batch_size) { + int bias_idx = i % out_dim; + DT value = output_ptr[i] + bias_ptr[bias_idx]; + output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f; + } +} + template void forward_kernel(LinearMeta const *m, void const *input_ptr, @@ -343,6 +355,16 @@ void forward_kernel(LinearMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // use_bias = True if (bias_ptr != NULL) { + // fuse bias and relu + if (m->activation == AC_MODE_RELU) { + int parallelism = out_dim * batch_size; + AddBiasWithReLU<<>>( + static_cast
(output_ptr), + static_cast
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 76bed36bda..4270515224 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3236,6 +3236,27 @@ Op *FFModel::create_operator_from_layer( } } +bool FFModel::is_mlp_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + // standard opt relu + if (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) { + return true; + } + // mlp layer with relu embedded in first dense layer + if (l->op_type == OP_LINEAR && layer_idx >= 1 && + layers[layer_idx - 1]->op_type == OP_LINEAR) { + long long value; + layers[layer_idx - 1]->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (activation == AC_MODE_RELU) { + return true; + } + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { @@ -3280,9 +3301,9 @@ void FFModel::create_operators_from_layers() { config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_RELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer (l->op_type == OP_LINEAR && layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_GELU && layers[layer_idx - 2]->op_type == OP_LINEAR) || From 197e308ffb872aee9a326eff1b6c6c0bccb075a7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 12 Jan 2024 12:13:34 -0500 Subject: [PATCH 307/344] fix spec decoding --- deps/legion | 2 +- inference/models/falcon.cc | 2 +- inference/models/llama.cc | 2 +- inference/models/mpt.cc | 2 +- inference/models/opt.cc | 2 +- inference/models/starcoder.cc | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deps/legion b/deps/legion index 626b55689c..d065278678 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit d0652786784249e933dd62f675591da99a5e960d diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 999ca37037..cf6e90a7de 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,7 +39,7 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e54d6d8811..3deba47953 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,7 +41,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? 
BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 3df67b264c..484a09f62e 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,7 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index e260f8fa36..9f75dcea4c 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index e683376e47..ef5388b6ca 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,7 +48,7 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; From ed4dbd808eb20ddd99e6349c41a66ec782c3cefb Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 12 Jan 2024 12:20:52 -0500 Subject: [PATCH 308/344] Revert "fix spec decoding" This reverts commit 197e308ffb872aee9a326eff1b6c6c0bccb075a7. --- deps/legion | 2 +- inference/models/falcon.cc | 2 +- inference/models/llama.cc | 2 +- inference/models/mpt.cc | 2 +- inference/models/opt.cc | 2 +- inference/models/starcoder.cc | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deps/legion b/deps/legion index d065278678..626b55689c 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit d0652786784249e933dd62f675591da99a5e960d +Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index cf6e90a7de..999ca37037 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,7 +39,7 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 3deba47953..e54d6d8811 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,7 +41,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? 
BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 484a09f62e..3df67b264c 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,7 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9f75dcea4c..e260f8fa36 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index ef5388b6ca..e683376e47 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,7 +48,7 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; From 12fdbac30286eee17d4372ccd58230303dd422d6 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 14 Jan 2024 00:28:25 -0500 Subject: [PATCH 309/344] Add a background server for RequestManager (#1223) * add a background server for RequestManager * . * make incr_decoding work * make spec_infer work * format * update python inference * fix python issues * bug fix * add a Legion future to capture the termination of the background server * Add thread safety for background server. * Simplify backend server design. * resolve conflict. * Add server task timeout. * register callbacks to terminate background worker at exit or termination * [Python] enable decoding multiple requests * update README.md and default configuration * [Python] no need to use the llm context environment to start/stop the background server * require at least four cpu cores * [Python] add back explict start_server()/stop_server(). 
* fix * fix python chatgpt.json --------- Co-authored-by: Gabriele Oliaro Co-authored-by: zwang86 <46699021+zwang86@users.noreply.github.com> Co-authored-by: Zeyu Wang Co-authored-by: xinhaoc --- .github/README.md | 24 +- CMakeLists.txt | 4 +- include/flexflow/flexflow_c.h | 31 +- include/flexflow/model.h | 5 +- include/flexflow/request_manager.h | 76 ++-- .../flexflow/utils}/file_loader.h | 11 +- inference/incr_decoding/CMakeLists.txt | 1 - inference/incr_decoding/incr_decoding.cc | 8 +- inference/models/falcon.cc | 24 +- inference/models/falcon.h | 2 +- inference/models/llama.cc | 32 +- inference/models/llama.h | 2 +- inference/models/mpt.cc | 23 +- inference/models/mpt.h | 2 +- inference/models/opt.cc | 23 +- inference/models/opt.h | 2 +- inference/models/starcoder.cc | 29 +- inference/models/starcoder.h | 2 +- inference/python/incr_decoding.py | 7 +- inference/python/spec_infer.py | 18 +- inference/spec_infer/CMakeLists.txt | 1 - inference/spec_infer/spec_infer.cc | 6 + python/flexflow/core/flexflow_cffi.py | 48 ++- python/flexflow/serve/models/falcon.py | 4 +- python/flexflow/serve/models/llama.py | 5 +- python/flexflow/serve/models/mpt.py | 5 +- python/flexflow/serve/models/opt.py | 4 +- python/flexflow/serve/models/starcoder.py | 5 +- python/flexflow/serve/serve.py | 88 +++-- src/c/flexflow_c.cc | 94 +++-- src/mapper/mapper.cc | 3 +- src/ops/linear.cc | 12 +- {inference => src/runtime}/file_loader.cc | 15 +- src/runtime/inference_manager.cc | 48 ++- src/runtime/model.cc | 18 + src/runtime/request_manager.cc | 330 ++++++++++++------ 36 files changed, 681 insertions(+), 331 deletions(-) rename {inference => include/flexflow/utils}/file_loader.h (84%) rename {inference => src/runtime}/file_loader.cc (98%) diff --git a/.github/README.md b/.github/README.md index 528df18faf..0972135504 100644 --- a/.github/README.md +++ b/.github/README.md @@ -79,7 +79,12 @@ ssms=[] ssm = ff.SSM("JackFram/llama-68m") ssms.append(ssm) ``` -Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. +Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. You can also use the following arguments to specify serving configuration when compiling LLMs and SSMs: + +* max\_requests\_per\_batch: the maximum number of requests to serve in a batch (default: 16) +* max\_seq\_length: the maximum number of tokens in a request (default: 256) +* max\_tokens\_per\_batch: the maximum number of tokens to process in a batch (default: 128) + ```python # Create the sampling configs generation_config = ff.GenerationConfig( @@ -91,11 +96,16 @@ for ssm in ssms: ssm.compile(generation_config) # Compile the LLM for inference and load the weights into memory -llm.compile(generation_config, ssms=ssms) +llm.compile(generation_config, + max_requests_per_batch = 16, + max_seq_length = 256, + max_tokens_per_batch = 128, + ssms=ssms) ``` Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult`, which include the output tokens and text. 
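
As an editorial aside (a minimal sketch, not part of this patch): because `generate` now returns one `GenerationResult` per prompt, the results list can be consumed roughly as below. The `output_text` attribute name is an assumption for illustration; this diff only shows the result objects being constructed from the decoded text.

```python
prompts = ["Here are some travel tips for Tokyo:\n",
           "Three tips for staying healthy are: "]
with llm:
    results = llm.generate(prompts)           # one GenerationResult per prompt
for prompt, result in zip(prompts, results):
    print(prompt, "->", result.output_text)   # attribute name assumed
```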
```python -result = llm.generate("Here are some travel tips for Tokyo:\n") +with llm: + result = llm.generate("Here are some travel tips for Tokyo:\n") ``` ### Incremental decoding @@ -124,10 +134,14 @@ generation_config = ff.GenerationConfig( ) # Compile the LLM for inference and load the weights into memory -llm.compile(generation_config) +llm.compile(generation_config, + max_requests_per_batch = 16, + max_seq_length = 256, + max_tokens_per_batch = 128) # Generation begins! -result = llm.generate("Here are some travel tips for Tokyo:\n") +with llm: + result = llm.generate("Here are some travel tips for Tokyo:\n") ``` diff --git a/CMakeLists.txt b/CMakeLists.txt index 90cab126e6..acbe7e385f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -264,14 +264,14 @@ if(NOT BUILD_LEGION_ONLY) LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/include/*.h) - list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) + #list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) file(GLOB_RECURSE FLEXFLOW_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cc) list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") - list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) + #list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 305c8da513..cab3d14ea7 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -611,13 +611,13 @@ flexflow_perf_metrics_t void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); -flexflow_generation_result_t - flexflow_model_generate(flexflow_model_t handle_, - char const *input_text, - int max_num_chars, - char *output_text, - int max_seq_length, - int *output_length_and_tokens); +void flexflow_model_generate(flexflow_model_t handle_, + int num_requests, + char const **input_text, + int max_num_chars, + char **output_text, + int max_seq_length, + int **output_length_and_tokens); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); @@ -988,6 +988,12 @@ void flexflow_request_manager_register_output_filepath( int flexflow_request_manager_register_ssm_model( flexflow_request_manager_t handle_, flexflow_model_t model_handle_); +void flexflow_request_manager_start_background_server( + flexflow_request_manager_t handle_, flexflow_model_t model_handle_); + +void flexflow_request_manager_terminate_background_server( + flexflow_request_manager_t handle_); + // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- @@ -1004,6 +1010,11 @@ void flexflow_inference_manager_compile_model_and_allocate_buffer( void flexflow_inference_manager_init_operators_inference( flexflow_inference_manager_t handle_, flexflow_model_t model_handle); +void flexflow_inference_manager_register_model_weights_loader( + flexflow_inference_manager_t handle_, + flexflow_model_t model_handle, + flexflow_file_data_loader_t loader_handle); + // ----------------------------------------------------------------------- // FileDataLoader // ----------------------------------------------------------------------- @@ -1014,13 +1025,13 @@ flexflow_file_data_loader_t int num_kv_heads, int hidden_dim, int qkv_inner_dim, - int tensor_parallelism_degree); + int tensor_parallelism_degree, + bool use_full_precision); void 
flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, - flexflow_model_t model_handle_, - bool use_full_precision); + flexflow_model_t model_handle_); #ifdef __cplusplus } diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 75b1dbcbe9..dd6dc76b4d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -247,6 +247,7 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, + RM_BACKGROUND_SERVING_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, @@ -806,8 +807,8 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::vector &prompts, - int max_seq_length); + std::vector generate(std::vector &prompts, + int max_seq_length); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f74b6c5b9f..50a51705cd 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -18,6 +18,8 @@ #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" +#include "flexflow/utils/file_loader.h" +#include #include #include @@ -30,25 +32,29 @@ using tokenizers::Tokenizer; class InferenceManager { public: - InferenceManager(FFConfig const &config); + InferenceManager(); static InferenceManager *get_inference_manager(); void compile_model_and_allocate_buffer(FFModel *model); void init_operators_inference(FFModel *model); Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); - void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, + void load_input_tokens_from_batch_config(FFModel *model, + BatchConfigFuture const &bc, ParallelTensor const input, FFHandler *handlers); - void load_positions(BatchConfigFuture const &bc, + void load_positions(FFModel *model, + BatchConfigFuture const &bc, ParallelTensor position_input, int offset); - void load_inference_metadata_batch_config(BatchConfigFuture const &bc, + void register_model_weights_loader(FFModel *, FileDataLoader *); + void load_inference_metadata_batch_config(FFModel *model, + BatchConfigFuture const &bc, FFHandler *handlers); public: - FFConfig ff_config; std::unordered_map> tensor_buffer; + std::unordered_map model_weights_loaders; int num_devices; }; @@ -91,9 +97,15 @@ struct BeamTree { class RequestManager { public: + enum Status { + INITIALIZED = 1001, + SERVING = 1002, + TERMINATED = 1003, + }; using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; + static const RequestGuid INVALID_GUID = 0; RequestManager(); static RequestManager *get_request_manager(); size_t get_num_processed_requests(); @@ -125,30 +137,38 @@ class RequestManager { int initLength, int non_tree_size); - FFModel *get_model(int model_id); + FFModel *get_ssm_model(int model_id); - GenerationResult generate_incr_decoding(FFModel *model, - std::vector &prompts, - int max_seq_length); - GenerationResult generate_spec_infer(FFModel *model, - std::vector &prompts, - int max_seq_length); + void serve_incr_decoding(FFModel *model); + void serve_spec_infer(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid 
register_new_request(std::string const &prompt, int max_sequence_length); RequestGuid register_new_request(std::vector const &prompt, int max_sequence_length); + // Methods to start and terminate request manager's background task + void start_background_server(FFModel *model); + bool is_background_server_terminated(); + void terminate_background_server(); + static void terminate_background_server_at_exit(); + // Methods to check and mark request completion bool is_request_completed(RequestGuid const &guid); + void trigger_request_completion_future(RequestGuid const &guid); + // Methods for preparing next batches BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, - InferenceResultFuture const &result); + InferenceResultFuture const &result, + Legion::Context ctx, + Legion::Runtime *runtime); BeamSearchBatchConfig prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); BeamSearchBatchConfigFuture prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result); + BeamInferenceResultFuture const &result, + Legion::Context ctx, + Legion::Runtime *runtime); BeamSearchBatchConfig prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, InferenceResult const &result, @@ -156,11 +176,15 @@ class RequestManager { BeamSearchBatchConfigFuture prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc, InferenceResultFuture const &result, - int model_id); + int model_id, + Legion::Context ctx, + Legion::Runtime *runtime); TreeVerifyBatchConfig prepare_next_batch_verify( std::vector const &old_batches); TreeVerifyBatchConfigFuture prepare_next_batch_verify( - std::vector const &old_batches); + std::vector const &old_batches, + Legion::Context ctx, + Legion::Runtime *runtime); void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); @@ -187,7 +211,11 @@ class RequestManager { &inputSerializedTree, std::vector> const &outputSerializedTree); - + static void background_serving_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void load_tokens_task(Legion::Task const *task, std::vector const ®ions, @@ -233,9 +261,11 @@ class RequestManager { int max_requests_per_batch; int max_tokens_per_batch; int max_sequence_length; + Status request_manager_status; // tree width in each speculative step, if not specified 1 std::vector spec_infer_tree_width; + // private fields std::unique_ptr tokenizer_; bool verbose; @@ -247,12 +277,9 @@ class RequestManager { std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; + std::unordered_map *> request_to_promise; + std::mutex request_to_promise_mutex; RequestGuid next_available_guid; - // Legion futures for inc_decoding and spec_infer - BatchConfigFuture last_bcf; - InferenceResultFuture last_irf; - TreeVerifyBatchConfigFuture last_tree_bcf; - InferenceResultFuture last_tree_irf; // TODO: Move this two vector to request struct std::unordered_map models; + std::vector ssm_models; // Performance profiling size_t num_processed_requests; + // Background server handler + Legion::Future background_server_handler; + private: struct ProfileInfo { int llm_decoding_steps; diff --git a/inference/file_loader.h b/include/flexflow/utils/file_loader.h similarity index 84% rename from inference/file_loader.h rename to 
include/flexflow/utils/file_loader.h index 6f01a79b80..646eb18da2 100644 --- a/inference/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -30,18 +30,16 @@ class FileDataLoader { int _num_kv_heads, size_t _hidden_dim, size_t _qkv_inner_dim, - int _tensor_parallelism_degree); + int _tensor_parallelism_degree, + bool _use_full_precision); BatchConfig::TokenId *generate_requests(int num, int length); template void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); - void load_quantization_weight(FFModel *ff, - Layer *l, - int weight_idx, - bool use_full_precision); - void load_weights(FFModel *ff, bool use_full_precision); + void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); + void load_weights(FFModel *ff); void load_positions(FFModel *ff, Tensor pt, @@ -54,4 +52,5 @@ class FileDataLoader { size_t hidden_dim, qkv_inner_dim; std::string prompts_filepath; std::string weights_folder; + bool use_full_precision; }; diff --git a/inference/incr_decoding/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt index 53b7cf0c2f..3e1a1521d7 100644 --- a/inference/incr_decoding/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -7,7 +7,6 @@ set(project_target incr_decoding) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} incr_decoding.cc - ../file_loader.cc ../models/llama.cc ../models/opt.cc ../models/falcon.cc diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c3f9052305..f88af3bc43 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -24,6 +24,7 @@ #include +using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; @@ -250,6 +251,8 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + rm->start_background_server(&model); + int total_num_requests = 0; { using json = nlohmann::json; @@ -266,10 +269,13 @@ void FlexFlow::top_level_task(Task const *task, total_num_requests++; prompts.push_back(text); } - GenerationResult result = + std::vector result = model.generate(prompts, 128 /*max_sequence_length*/); } + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + // Execution fence { Future future = runtime->issue_execution_fence(ctx); diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 999ca37037..e00f4e9cfd 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,10 +39,11 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -239,6 +240,20 @@ void FALCON::create_falcon_model(FFModel &ff, output = ff.argmax(lm_head, /*beam_Search*/ false); } + FileDataLoader *fileloader = + new FileDataLoader("", + weight_file_path, + falcon_config.n_head, + falcon_config.n_head_kv, + falcon_config.hidden_size, + falcon_config.hidden_size / falcon_config.n_head, + ff.config.tensor_parallelism_degree, + use_full_precision); + + InferenceManager *im = InferenceManager::get_inference_manager(); + im->register_model_weights_loader(&ff, fileloader); + +#ifdef DEADCODE // Compile the model std::cout << "------start compile ----------" << std::endl; InferenceManager *im = InferenceManager::get_inference_manager(); @@ -256,6 +271,7 @@ void FALCON::create_falcon_model(FFModel &ff, // init operators im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 01226a30dc..fce2dade3f 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e54d6d8811..14b8c31fa1 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,10 +41,11 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -264,23 +265,28 @@ void LLAMA::create_llama_model(FFModel &ff, } } + FileDataLoader *fileloader = new FileDataLoader( + "", + weight_file_path, + llama_config.num_attention_heads, + llama_config.num_attention_heads, + llama_config.hidden_size, + llama_config.hidden_size / llama_config.num_attention_heads, + ff.config.tensor_parallelism_degree, + use_full_precision); + InferenceManager *im = InferenceManager::get_inference_manager(); + im->register_model_weights_loader(&ff, fileloader); +#ifdef DEADCODE // Compile the model std::cout << "------start compile ----------" << std::endl; im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - llama_config.num_attention_heads, - llama_config.num_attention_heads, - llama_config.hidden_size, - llama_config.hidden_size / - llama_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff); std::cout << "------load weight finished----------" << std::endl; // init operators im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.h b/inference/models/llama.h index 8ecfcd7155..ba1f0236f9 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 3df67b264c..7e8fc8358f 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,10 +40,11 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -246,7 +247,20 @@ void MPT::create_mpt_model(FFModel &ff, } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } + FileDataLoader *fileloader = + new FileDataLoader("", + weight_file_path, + mpt_config.n_heads, + mpt_config.n_heads, + mpt_config.hidden_size, + mpt_config.hidden_size / mpt_config.n_heads, + ff.config.tensor_parallelism_degree, + use_full_precision); + InferenceManager *im = InferenceManager::get_inference_manager(); + im->register_model_weights_loader(&ff, fileloader); + +#ifdef DEADCODE //------------------- compile the model -------------------------------- InferenceManager *im = InferenceManager::get_inference_manager(); im->compile_model_and_allocate_buffer(&ff); @@ -259,6 +273,7 @@ void MPT::create_mpt_model(FFModel &ff, ff.config.tensor_parallelism_degree); fileloader.load_weights(&ff, use_full_precision); im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 1969cd9c89..08597e1d75 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/models/opt.cc b/inference/models/opt.cc index e260f8fa36..3ff4c96fdf 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,10 +42,11 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -254,6 +255,19 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(lm_head, /*beam_Search*/ false); } + FileDataLoader *fileloader = new FileDataLoader( + "", + weight_file_path, + opt_config.num_attention_heads, + opt_config.num_attention_heads, + opt_config.hidden_size, + opt_config.hidden_size / opt_config.num_attention_heads, + ff.config.tensor_parallelism_degree, + use_full_precision); + InferenceManager *im = InferenceManager::get_inference_manager(); + im->register_model_weights_loader(&ff, fileloader); + +#ifdef DEADCODE //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; InferenceManager *im = InferenceManager::get_inference_manager(); @@ -269,6 +283,7 @@ void OPT::create_opt_model(FFModel &ff, fileloader.load_weights(&ff, use_full_precision); std::cout << "------finished loading weights----------" << std::endl; im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.h b/inference/models/opt.h index 1ffe096bca..7c736a26d1 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index e683376e47..2327c86119 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,10 +48,11 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -221,22 +222,26 @@ void STARCODER::create_starcoder_model( } InferenceManager *im = InferenceManager::get_inference_manager(); + FileDataLoader *fileloader = new FileDataLoader( + "", + weight_file_path, + startcoder_config.num_attention_heads, + 1, + startcoder_config.hidden_size, + startcoder_config.hidden_size / startcoder_config.num_attention_heads, + ff.config.tensor_parallelism_degree, + use_full_precision); + im->register_model_weights_loader(&ff, fileloader); +#ifdef DEADCODE // Compile the model std::cout << "------start compile ----------" << std::endl; im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - startcoder_config.num_attention_heads, - 1, - startcoder_config.hidden_size, - startcoder_config.hidden_size / - startcoder_config.num_attention_heads, - ff.config.tensor_parallelism_degree); fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index bc113e4d52..0e9577d569 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 4a146ab503..6706cf3c29 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -102,13 +102,16 @@ def main(): max_seq_length=256, max_tokens_per_batch=64, ) - - # Generation begins! + + llm.start_server() + if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: result = llm.generate("Three tips for staying healthy are: ") + + llm.stop_server() if __name__ == "__main__": diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index c9fb5cc7bb..8b9a116dc5 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -73,17 +73,9 @@ def get_configs(): "cache_path": "", "refresh_cache": False, "full_precision": False, - }, - { - # required ssm parameter - "ssm_model": "facebook/opt-125m", - # optional ssm parameters - "cache_path": "", - "refresh_cache": False, - "full_precision": False, - }, + } ], - "prompt": "../prompt/test.json", + "prompt": "", "output_file": "", } # Merge dictionaries @@ -148,14 +140,16 @@ def main(): max_tokens_per_batch=64, ssms=ssms, ) + + llm.start_server() - # Generation begins! 
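    # Editor's sketch (not part of the patch): with the background RequestManager
    # introduced in this commit, client scripts follow an explicit server
    # lifecycle, roughly:
    #
    #     llm.start_server()               # spawn the background serving task
    #     results = llm.generate(prompts)  # enqueue requests, wait for results
    #     llm.stop_server()                # terminate the background server
    #
    # compile() additionally registers rm.stop_server via atexit as a safety
    # net, but the explicit stop_server() call added below is the intended use.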
if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: result = llm.generate("Three tips for staying healthy are: ") - + + llm.stop_server() if __name__ == "__main__": print("flexflow inference example (speculative inference)") diff --git a/inference/spec_infer/CMakeLists.txt b/inference/spec_infer/CMakeLists.txt index c877a3530b..1b25de8623 100644 --- a/inference/spec_infer/CMakeLists.txt +++ b/inference/spec_infer/CMakeLists.txt @@ -7,7 +7,6 @@ set(project_target spec_infer) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} spec_infer.cc - ../file_loader.cc ../models/llama.cc ../models/opt.cc ../models/falcon.cc diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index b369a13c1d..7578721dd0 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -22,6 +22,7 @@ #include #include +using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; @@ -385,6 +386,8 @@ void FlexFlow::top_level_task(Task const *task, rm->register_ssm_model(&beam_model); } + rm->start_background_server(&tree_model); + // Register requests from prompt file int total_num_requests = 0; { @@ -407,6 +410,9 @@ void FlexFlow::top_level_task(Task const *task, tree_model.generate(prompts, 128 /*max_sequence_length*/); } + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + // Execution fence { Future future = runtime->issue_execution_fence(ctx); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 00133dacb4..d6f84833be 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3812,26 +3812,28 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate(self, prompt, max_sequence_length): - c_input_text = get_c_name(prompt) - max_num_chars = 36000 - c_output_text = ffi.new("char[]", max_num_chars) - c_output_length_and_tokens = ffi.new("int[]", max_sequence_length + 100) + def generate(self, prompt_list, max_sequence_length): + assert isinstance(prompt_list, list) + c_input_texts = [get_c_name(prompt) for prompt in prompt_list] + max_num_chars = 5 * (max_sequence_length + 100) + c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] + c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] ffc().flexflow_model_generate( self.handle, - c_input_text, + len(prompt_list), + c_input_texts, max_num_chars, - c_output_text, + c_output_texts, max_sequence_length, c_output_length_and_tokens, ) - output_length = c_output_length_and_tokens[0] - output_tokens = [] - for i in range(output_length): - output_tokens.append(c_output_length_and_tokens[i + 1]) + #output_length = c_output_length_and_tokens[0] + #output_tokens = [] + #for i in range(output_length): + # output_tokens.append(c_output_length_and_tokens[i + 1]) from flexflow.serve import GenerationResult - return GenerationResult(ffi.string(c_output_text), output_tokens) + return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) @@ -4202,6 +4204,14 @@ def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( self.handle, max_length) + def start_server(self, model): + return 
ffc().flexflow_request_manager_start_background_server( + self.handle, model.handle + ) + + def stop_server(self): + return ffc().flexflow_request_manager_terminate_background_server( + self.handle) # ----------------------------------------------------------------------- # InferenceManager # ----------------------------------------------------------------------- @@ -4224,6 +4234,10 @@ def init_operators_inference(self, model): self.handle, model.handle ) + def register_model_weights_loader(self, model, fileloader): + ffc().flexflow_inference_manager_register_model_weights_loader( + self.handle, model.handle, fileloader.handle + ) # ----------------------------------------------------------------------- # FileDataLoader @@ -4241,6 +4255,7 @@ def __init__( hidden_dim, qkv_inner_dim, tensor_parallelism_degree, + use_full_precision ): c_weight_file_path = get_c_name(weight_file_path) self.handle = ffc().flexflow_file_data_loader_create( @@ -4250,13 +4265,14 @@ def __init__( hidden_dim, qkv_inner_dim, tensor_parallelism_degree, + use_full_precision ) self._handle = ffi.gc(self.handle, ffc().flexflow_file_data_loader_destroy) - def load_weights(self, model, data_type): + def load_weights(self, model): # Check data type and create use_full_precision boolean - assert data_type == DataType.DT_FLOAT or data_type == DataType.DT_HALF - use_full_precision = data_type == DataType.DT_FLOAT + #assert data_type == DataType.DT_FLOAT or data_type == DataType.DT_HALF + #use_full_precision = data_type == DataType.DT_FLOAT ffc().flexflow_file_data_loader_load_weights( - self.handle, model.handle, use_full_precision + self.handle, model.handle ) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 2b114f09b3..e9cd789bcc 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 64 self.bias = hf_config.bias self.hidden_size = hf_config.hidden_size self.layer_norm_epsilon = hf_config.layer_norm_epsilon @@ -70,6 +71,7 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.falcon_config.max_spec_tree_token_num # Sanity checks if self.falcon_config.hidden_size % self.falcon_config.n_head != 0: @@ -84,7 +86,7 @@ def __init__( f"Number of q attention heads ({self.falcon_config.n_head}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 7ba0e78a37..900ab48bcd 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 64 self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.hidden_size = hf_config.hidden_size @@ -62,6 +63,8 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 
2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.llama_config.max_spec_tree_token_num + # Sanity checks if self.llama_config.hidden_size % self.llama_config.num_attention_heads != 0: @@ -81,7 +84,7 @@ def __init__( f"Number of attention heads ({self.llama_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 79a5bb940f..c0f995bf22 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 64 self.hidden_size = hf_config.d_model self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers @@ -57,6 +58,8 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.mpt_config.max_spec_tree_token_num + # Sanity checks if self.mpt_config.hidden_size % self.mpt_config.n_heads != 0: @@ -72,7 +75,7 @@ def __init__( raise ValueError( f"Number of attention heads ({self.mpt_config.n_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index dd36fa6592..dc3f841a5a 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 64 self.do_layer_norm_before = hf_config.do_layer_norm_before self.dropout = hf_config.dropout self.enable_bias = hf_config.enable_bias @@ -63,6 +64,7 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.opt_config.max_spec_tree_token_num # Sanity checks if self.opt_config.hidden_size % self.opt_config.num_attention_heads != 0: @@ -82,7 +84,7 @@ def __init__( f"Number of attention heads ({self.opt_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index f4f28a70e1..4a6f191abd 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 
64 self.dropout_p = hf_config.attn_pdrop self.hidden_size = hf_config.n_embd self.layer_norm_epsilon = hf_config.layer_norm_epsilon @@ -61,6 +62,8 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.starcoder_config.max_spec_tree_token_num + # Sanity checks if ( @@ -84,7 +87,7 @@ def __init__( f"Number of attention heads ({self.starcoder_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 55601f957e..d1a935e5fc 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -118,6 +118,11 @@ def __init__( self.refresh_cache = refresh_cache self.output_file = output_file + def __del__(self): + # Stop the background server before deleting the object + if type(self) == LLM: + self.rm.stop_server() + def __get_ff_model_type(self): architectures = getattr(self.hf_config, "architectures", []) ff_arch = None @@ -283,25 +288,6 @@ def download_hf_tokenizer_if_needed(self): else: print(f"Loading '{self.model_name}' tokenizer from the cache...") - def __load_hf_weights(self): - print("Loading hf weights...") - - self.download_hf_weights_if_needed() - - # Create file data loader, load weights into tensors - model_configs = self.config_class(self.hf_config) - - self.fileloader = FileDataLoader( - self.weights_path, - model_configs.num_attention_heads, - model_configs.num_key_value_heads, - model_configs.hidden_size, - model_configs.hidden_size // model_configs.num_attention_heads, - self.ffconfig.tensor_parallelism_degree, - ) - - self.fileloader.load_weights(self.model.ffmodel, self.data_type) - def compile( self, generation_config: GenerationConfig = GenerationConfig(), @@ -379,12 +365,27 @@ def compile( max_tokens_per_batch ) - # Create inference manager + # Download the weights from huggingface (if needed) + self.download_hf_weights_if_needed() + + # Create file data loader, load weights into tensors + model_configs = self.config_class(self.hf_config) + + self.fileloader = FileDataLoader( + self.weights_path, + model_configs.num_attention_heads, + model_configs.num_key_value_heads, + model_configs.hidden_size, + model_configs.hidden_size // model_configs.num_attention_heads, + self.ffconfig.tensor_parallelism_degree, + self.data_type == DataType.DT_FLOAT + ) + + # Register weights file loader self.im = InferenceManager() - self.im.compile_model_and_allocate_buffer(self.model.ffmodel) + self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) - # Download the weights and tokenizer from huggingface (if needed) and load them - self.__load_hf_weights() + # Download the tokenizer from huggingface (if needed) and load them self.download_hf_tokenizer_if_needed() # Create tokenizer (this must be done after we have downloaded the tokenizer @@ -399,11 +400,14 @@ def compile( ) self.rm.register_output_filepath(self.output_file) - self.im.init_operators_inference(self.model.ffmodel) - for ssm in self.ssms: self.rm.register_ssm_model(ssm.model.ffmodel) + # start background server + if (mode == InferenceMode.TREE_VERIFY_MODE) or (mode == 
InferenceMode.INC_DECODING_MODE): + import atexit + atexit.register(self.rm.stop_server) + def generate(self, prompts: Union[str, List[str]], max_length: int = 128): """Generate tokens based on the input prompt(s) @@ -415,16 +419,32 @@ def generate(self, prompts: Union[str, List[str]], max_length: int = 128): if type(prompts) == str: if len(prompts) == 0: return None - return self.model.ffmodel.generate(prompts, max_length) + return self.model.ffmodel.generate([prompts], max_length) elif type(prompts) == list: if len(prompts) == 0: return [] - return [ - self.model.ffmodel.generate(prompt, max_length) for prompt in prompts - ] + return self.model.ffmodel.generate(prompts, max_length) else: assert False, "Please pass a non-empty string or list of strings" - + + def start_server(self): + self.rm.start_server(self.model.ffmodel) + print("Background server started.") + + def stop_server(self): + self.rm.stop_server() + print("Background server stopped.") + + def __enter__(self): + # Start the server when entering the context + #self.rm.start_server(self.model.ffmodel) + return self + + def __exit__(self, exc_type, exc_value, traceback): + # Stop the server when exiting the context + #self.rm.stop_server() + if exc_type: + print(f"Exception occurred: {exc_value}") class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -461,9 +481,9 @@ def __init__( def compile( self, generation_config: GenerationConfig = GenerationConfig(), - max_requests_per_batch: int = 1, + max_requests_per_batch: int = 16, max_seq_length: int = 256, - max_tokens_per_batch: int = 64, + max_tokens_per_batch: int = 128, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, model_specific_pipeline_parallelism_degree: int = 1, @@ -475,11 +495,11 @@ def compile( :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional - :param max_requests_per_batch: The maximum batch size to allow, defaults to 1 + :param max_requests_per_batch: The maximum batch size to allow, defaults to 16 :type max_requests_per_batch: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 :type max_seq_length: int, optional - :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 + :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 :type max_tokens_per_batch: int, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 :type model_specific_data_parallelism_degree: int, optional diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 82a37a9736..9ad58695ad 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -17,7 +17,7 @@ #include "flexflow/dataloader.h" #include "flexflow/mapper.h" #include "flexflow/request_manager.h" -#include "inference/file_loader.h" +#include "flexflow/utils/file_loader.h" using namespace Legion; using namespace FlexFlow; @@ -1582,32 +1582,41 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } -flexflow_generation_result_t - flexflow_model_generate(flexflow_model_t handle_, - char const *input_text, - int 
max_num_chars, - char *output_text, - int max_seq_length, - int *output_length_and_tokens) { +void flexflow_model_generate(flexflow_model_t handle_, + int num_requests, + char const **input_texts, + int max_num_chars, + char **output_texts, + int max_seq_length, + int **output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); std::vector prompts; - std::string const text_str(input_text); - prompts.push_back(input_text); - GenerationResult result = handle->generate(prompts, max_seq_length); - DEBUG_PRINT( - "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length); + for (int i = 0; i < num_requests; i++) { + std::string const text_str(input_texts[i]); + prompts.push_back(text_str); + DEBUG_PRINT("[Model] generate[%d] %p %s %i", + i, + handle, + text_str.c_str(), + max_seq_length); + } + std::vector results = + handle->generate(prompts, max_seq_length); // If the prompt exceeds max seq len, check that we return the prompt with no // additional token. Otherwise, check that the output does not exceed the max // sequence length. - assert(result.output_tokens.size() <= max_seq_length || - result.output_tokens.size() == result.input_tokens.size()); - output_length_and_tokens[0] = result.output_tokens.size(); - std::copy(result.output_tokens.begin(), - result.output_tokens.end(), - output_length_and_tokens + 1); - std::memcpy( - output_text, result.output_text.c_str(), result.output_text.length()); - return FFCObjectWrapper::wrap(&result); + for (int i = 0; i < num_requests; i++) { + assert(results[i].output_tokens.size() <= max_seq_length || + results[i].output_tokens.size() == results[i].input_tokens.size()); + output_length_and_tokens[i][0] = results[i].output_tokens.size(); + std::copy(results[i].output_tokens.begin(), + results[i].output_tokens.end(), + output_length_and_tokens[i] + 1); + std::memcpy(output_texts[i], + results[i].output_text.c_str(), + results[i].output_text.length()); + } + // return FFCObjectWrapper::wrap(&results[0]); } void flexflow_model_set_position_offset(flexflow_model_t handle_, @@ -2616,6 +2625,22 @@ int flexflow_request_manager_register_ssm_model( return handle->register_ssm_model(model_handle); } +void flexflow_request_manager_start_background_server( + flexflow_request_manager_t handle_, flexflow_model_t model_handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + DEBUG_PRINT( + "[RequestManager] start background server %p %p", handle, model_handle); + handle->start_background_server(model_handle); +} + +void flexflow_request_manager_terminate_background_server( + flexflow_request_manager_t handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[RequestManager] terminate background server %p", handle); + handle->terminate_background_server(); +} + // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- @@ -2644,6 +2669,20 @@ void flexflow_inference_manager_init_operators_inference( handle->init_operators_inference(model); } +void flexflow_inference_manager_register_model_weights_loader( + flexflow_inference_manager_t handle_, + flexflow_model_t model_handle, + flexflow_file_data_loader_t loader_handle) { + InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model = FFCObjectWrapper::unwrap(model_handle); + FileDataLoader *loader = FFCObjectWrapper::unwrap(loader_handle); + 
DEBUG_PRINT("[InferenceManager] register_model_weights_loader %p %p %p", + handle, + model, + loader); + handle->register_model_weights_loader(model, loader); +} + // ----------------------------------------------------------------------- // FileDataLoader // ----------------------------------------------------------------------- @@ -2654,7 +2693,8 @@ flexflow_file_data_loader_t int num_kv_heads, int hidden_dim, int qkv_inner_dim, - int tensor_parallelism_degree) { + int tensor_parallelism_degree, + bool use_full_precision) { assert(weight_file_path != nullptr && "Cannot convert nullptr char * to std::string"); std::string const weight_file_path_str(weight_file_path); @@ -2664,7 +2704,8 @@ flexflow_file_data_loader_t num_kv_heads, hidden_dim, qkv_inner_dim, - tensor_parallelism_degree); + tensor_parallelism_degree, + use_full_precision); DEBUG_PRINT("[FileDataLoader] new %p", handle); return FFCObjectWrapper::wrap(handle); } @@ -2676,9 +2717,8 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_) { } void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, - flexflow_model_t model_handle_, - bool use_full_precision) { + flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - handle->load_weights(model, use_full_precision); + handle->load_weights(model); } diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index a2fb1d89be..bc26a79d3e 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -286,7 +286,8 @@ void FFMapper::select_task_options(const MapperContext ctx, if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || - (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID)) { + (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID) || + (task.task_id == RM_BACKGROUND_SERVING_TASK_ID)) { output.initial_proc = all_cpus[0]; return; } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 63b26bfe7d..6ca6038778 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,12 +467,12 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - TensorAccessorW acc_kernel(regions[2], - task->regions[2], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); + // TensorAccessorW acc_kernel(regions[2], + // task->regions[2], + // FID_DATA, + // ctx, + // runtime, + // false /*readOutput*/); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/inference/file_loader.cc b/src/runtime/file_loader.cc similarity index 98% rename from inference/file_loader.cc rename to src/runtime/file_loader.cc index 7c6870d439..56558b3185 100644 --- a/inference/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "file_loader.h" +#include "flexflow/utils/file_loader.h" #include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" @@ -28,11 +28,13 @@ FileDataLoader::FileDataLoader(std::string _prompts_filepath, int _num_kv_heads, size_t _hidden_dim, size_t _qkv_inner_dim, - int _tensor_parallelism_degree) + int _tensor_parallelism_degree, + bool _use_full_precision) : prompts_filepath(_prompts_filepath), weights_folder(_weights_folder), num_heads(_num_heads), num_kv_heads(_num_kv_heads), hidden_dim(_hidden_dim), qkv_inner_dim(_qkv_inner_dim), - tensor_parallelism_degree(_tensor_parallelism_degree){}; + tensor_parallelism_degree(_tensor_parallelism_degree), + use_full_precision(_use_full_precision){}; BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { @@ -650,8 +652,7 @@ void load_from_quantized_file(char *ptr, void FileDataLoader::load_quantization_weight(FFModel *ff, Layer *l, - int weight_idx, - bool use_full_precision) { + int weight_idx) { Tensor weight = l->weights[weight_idx]; size_t volume = 1; std::vector dims_vec; @@ -789,7 +790,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, delete data; } -void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { +void FileDataLoader::load_weights(FFModel *ff) { for (Layer *l : ff->layers) { if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { continue; @@ -809,7 +810,7 @@ void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { case DT_INT4: case DT_INT8: // load weights in quantization - load_quantization_weight(ff, l, i, use_full_precision); + load_quantization_weight(ff, l, i); break; default: assert(false && "Unsupported data type"); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index cc76da58bb..6588cbceeb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,8 +28,8 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager(FFConfig const &_config) - : ff_config(_config) { +InferenceManager::InferenceManager() { +#ifdef DEADCODE num_devices = ff_config.workersPerNode * ff_config.numNodes; // Check parallelization degrees assert(ff_config.data_parallelism_degree <= num_devices && @@ -53,6 +53,7 @@ InferenceManager::InferenceManager(FFConfig const &_config) num_devices && "Product of data, tensor, and pipeline parallelism degrees does not " "match the number of available devices"); +#endif } InferenceManager *inference_manager_singleton = nullptr; @@ -60,8 +61,8 @@ InferenceManager *inference_manager_singleton = nullptr; /*static*/ InferenceManager *InferenceManager::get_inference_manager() { if (inference_manager_singleton == nullptr) { - FFConfig ffconfig; - inference_manager_singleton = new InferenceManager(ffconfig); + // FFConfig ffconfig; + inference_manager_singleton = new InferenceManager(); } return inference_manager_singleton; } @@ -382,13 +383,13 @@ FutureMap InferenceManager::inference(FFModel *model, // input. 
assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_positions(bc, pt, model->position_offset); + load_positions(model, bc, pt, model->position_offset); } else { found_input_operator = true; assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_input_tokens_from_batch_config(bc, pt, model->handlers); - load_inference_metadata_batch_config(bc, model->handlers); + load_input_tokens_from_batch_config(model, bc, pt, model->handlers); + load_inference_metadata_batch_config(model, bc, model->handlers); } } @@ -418,11 +419,12 @@ FutureMap InferenceManager::inference(FFModel *model, }; void InferenceManager::load_input_tokens_from_batch_config( + FFModel *model, BatchConfigFuture const &bc, ParallelTensor const input, FFHandler *handlers) { - Context ctx = ff_config.lg_ctx; - Runtime *runtime = ff_config.lg_hlr; + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; Domain domain = runtime->get_index_space_domain(ctx, input->parallel_is); @@ -462,13 +464,13 @@ void InferenceManager::load_input_tokens_from_batch_config( } void InferenceManager::load_inference_metadata_batch_config( - BatchConfigFuture const &bc, FFHandler *handlers) { - Context ctx = ff_config.lg_ctx; - Runtime *runtime = ff_config.lg_hlr; + FFModel *model, BatchConfigFuture const &bc, FFHandler *handlers) { + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; ArgumentMap argmap; Domain domain = - runtime->get_index_space_domain(ctx, ff_config.all_gpu_task_is); + runtime->get_index_space_domain(ctx, model->config.all_gpu_task_is); Rect<1> task_rect = domain; int idx = 0; @@ -478,7 +480,7 @@ void InferenceManager::load_inference_metadata_batch_config( } IndexLauncher launcher(RM_LOAD_BATCH_CONFIG_TASK_ID, - ff_config.all_gpu_task_is, + model->config.all_gpu_task_is, TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, @@ -489,11 +491,12 @@ void InferenceManager::load_inference_metadata_batch_config( runtime->execute_index_space(ctx, launcher); } -void InferenceManager::load_positions(BatchConfigFuture const &bc, +void InferenceManager::load_positions(FFModel *model, + BatchConfigFuture const &bc, ParallelTensor position_input, int offset) { - Context ctx = ff_config.lg_ctx; - Runtime *runtime = ff_config.lg_hlr; + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; size_t machine_view_hash = position_input->machine_view.hash(); ArgumentMap argmap; IndexLauncher launcher(RM_LOAD_POSITION_TASK_ID, @@ -514,6 +517,11 @@ void InferenceManager::load_positions(BatchConfigFuture const &bc, runtime->execute_index_space(ctx, launcher); } +void InferenceManager::register_model_weights_loader(FFModel *model, + FileDataLoader *loader) { + model_weights_loaders[model] = loader; +} + void FFModel::set_transformer_layer_id(int id) { // We assume that users call this function with // monotonically increasing ids @@ -529,6 +537,12 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + // Request at least four CPU processors for inference runs + assert( + config.cpusPerNode >= 4 && + "FlexFlow Serve requires at least four CPU cores per node, please add " + "`-ll:cpu 4` in the command line if you are using the C++ interface or " + "set `num_cpus` in `ff.init` if you are using the Python interface"); Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; 
config.computationMode = COMP_MODE_INFERENCE; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4270515224..c07c33efca 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4480,6 +4480,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, RequestManager::prepare_next_batch_verify_task>(registrar); } } + // RequestManager background serving task + { + TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID, + "RequestManager Background Serving Task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + // registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + RequestManager::background_serving_task>( + registrar, "RequestManager Background Serving Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 56a2c122d3..46e17d4fdc 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -18,6 +18,7 @@ // #include "flexflow/tokenizers.h" #include #include +#include #include #include #include @@ -43,7 +44,8 @@ std::string LoadBytesFromFile(std::string const &path) { } RequestManager::RequestManager() - : verbose(false), next_available_guid(1000000), num_processed_requests(0), + : request_manager_status(INITIALIZED), verbose(false), + next_available_guid(1000000), num_processed_requests(0), total_request_run_time(0.0f) { // The following config parameters are set // during ffmodel.compile() @@ -53,26 +55,6 @@ RequestManager::RequestManager() max_requests_per_batch = -1; max_tokens_per_batch = -1; max_sequence_length = -1; - { - // Initialize futures for spec infer - TreeVerifyBatchConfig tree_bc; - InferenceResult tree_ir; - TreeVerifyBatchConfigFuture tree_bcf = - Future::from_value(tree_bc); - InferenceResultFuture tree_irf = - Future::from_value(tree_ir); - last_tree_bcf = tree_bcf; - last_tree_irf = tree_irf; - } - { - // Initialize futures for incr decoding - BatchConfig bc; - InferenceResult ir; - BatchConfigFuture bcf = Future::from_value(bc); - InferenceResultFuture irf = Future::from_value(ir); - last_bcf = bcf; - last_irf = irf; - } } void RequestManager::set_max_requests_per_batch(int max_num_requests) { @@ -171,19 +153,19 @@ void RequestManager::register_output_filepath( } int RequestManager::register_ssm_model(FFModel *model) { - int model_id = models.size(); - models.push_back(model); - std::cout << "Register new model with id: " << model_id << std::endl; + int model_id = ssm_models.size(); + ssm_models.push_back(model); + std::cout << "Register new ssm model with id: " << model_id << std::endl; return model_id; } -FFModel *RequestManager::get_model(int model_id) { - assert(model_id < models.size()); - return models[model_id]; +FFModel *RequestManager::get_ssm_model(int model_id) { + assert(model_id < ssm_models.size()); + return ssm_models[model_id]; } size_t RequestManager::get_num_ssms() { - return models.size(); + return ssm_models.size(); } RequestManager::RequestGuid @@ -203,7 +185,7 @@ RequestManager::RequestGuid << prompt.size() << ".\n"; printf("tokens size: %zu\n", request.tokens.size()); - return 0; + return INVALID_GUID; } else { request.initial_len = prompt.size(); request.tokens = prompt; @@ -214,7 +196,7 @@ RequestManager::RequestGuid "decoding." 
<< std::endl; } else { - std::cout << "Num of models: " << get_num_ssms() << std::endl; + std::cout << "Num of SSMs: " << get_num_ssms() << std::endl; for (int i = 0; i < get_num_ssms(); i++) { BeamTree beam_tree = BeamTree{}; request.beam_trees.push_back(beam_tree); @@ -223,6 +205,10 @@ RequestManager::RequestGuid pending_request_queue.push(request); all_requests[request.guid] = request; + { + const std::lock_guard lock(request_to_promise_mutex); + request_to_promise[request.guid] = new std::promise(); + } if (verbose) { std::cout << "new req: " << request.tokens.size() << std::endl; @@ -261,7 +247,7 @@ RequestManager::RequestGuid << tokens.size() << ".\n"; printf("tokens size: %zu\n", tokens.size()); - return 0; + return INVALID_GUID; } for (int i = 0; i < tokens.size(); i++) { std::cout << "[" << i << "]" << tokens.at(i) << "\n"; @@ -274,7 +260,7 @@ RequestManager::RequestGuid "decoding." << std::endl; } else { - std::cout << "Num of models: " << get_num_ssms() << std::endl; + std::cout << "Num of SSMs: " << get_num_ssms() << std::endl; for (int i = 0; i < get_num_ssms(); i++) { BeamTree beam_tree = BeamTree{}; request.beam_trees.push_back(beam_tree); @@ -283,6 +269,11 @@ RequestManager::RequestGuid pending_request_queue.push(request); all_requests[request.guid] = request; + { + const std::lock_guard lock(request_to_promise_mutex); + request_to_promise[request.guid] = new std::promise(); + } + { std::string output = "New request tokens:"; output = "[" + std::to_string(request.guid) + "]" + output; @@ -312,10 +303,22 @@ bool RequestManager::is_request_completed(RequestGuid const &guid) { GenerationResult RequestManager::get_generation_result(RequestGuid const &guid) { - const std::lock_guard lock(request_queue_mutex); - assert(request_generation_results.find(guid) != - request_generation_results.end()); - return request_generation_results[guid]; + // First get the future of the request + std::future future; + { + const std::lock_guard lock(request_to_promise_mutex); + assert(request_to_promise.find(guid) != request_to_promise.end()); + future = request_to_promise[guid]->get_future(); + } + // Wait until the result is completed + future.get(); + // Get the generation result + { + const std::lock_guard lock(request_queue_mutex); + assert(request_generation_results.find(guid) != + request_generation_results.end()); + return request_generation_results[guid]; + } } size_t RequestManager::get_num_processed_requests() { @@ -324,10 +327,9 @@ size_t RequestManager::get_num_processed_requests() { BatchConfigFuture RequestManager::prepare_next_batch(BatchConfigFuture const &old_bc, - InferenceResultFuture const &result) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); - + InferenceResultFuture const &result, + Context ctx, + Runtime *runtime) { RequestManager *rm = this; TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_TASK_ID, TaskArgument(&rm, sizeof(RequestManager *))); @@ -394,10 +396,6 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, request_completed = true; } if (request_completed) { - request.status = Request::COMPLETED; - log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", - old_bc.requestsInfo[i].request_guid, - request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token @@ -405,14 +403,18 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, request.tokens.at(0) == bos_token_id) { 
output = " " + output; } - { - // update generation result and trigger future + // update generation result GenerationResult &gr = request_generation_results[request.guid]; assert(gr.guid == request.guid); gr.output_tokens = request.tokens; gr.output_text = output; } + request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); + log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", + old_bc.requestsInfo[i].request_guid, + request.tokens.size()); log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; @@ -545,9 +547,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init( TreeVerifyBatchConfigFuture const &old_bc, InferenceResultFuture const &result, - int model_id) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); + int model_id, + Context ctx, + Runtime *runtime) { RequestManager *rm = this; TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, @@ -581,8 +583,6 @@ BeamSearchBatchConfig std::cout << "\n############### prepare_next_batch_init ###############\n"; } - std::cout << "\n############### prepare_next_batch_init ###############\n"; - // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; @@ -664,7 +664,6 @@ BeamSearchBatchConfig request.tokens.push_back(token_pair.first); } } - request.status = Request::COMPLETED; log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", request.guid, request.tokens.size()); @@ -676,12 +675,14 @@ BeamSearchBatchConfig output = " " + output; } { - // update generation result and trigger future + // update generation result GenerationResult &gr = request_generation_results[request.guid]; assert(gr.guid == request.guid); gr.output_tokens = request.tokens; gr.output_text = output; } + request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); log_req_mgr.print("Final output: %s", output.c_str()); new_bc.request_completed[i] = true; @@ -983,9 +984,9 @@ BeamSearchBatchConfig /***** Beam Search Phase *****/ BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); + BeamInferenceResultFuture const &result, + Context ctx, + Runtime *runtime) { RequestManager *rm = this; TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, @@ -1298,9 +1299,9 @@ BeamSearchBatchConfig /***** Verify Phase *****/ TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( - std::vector const &old_batches) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); + std::vector const &old_batches, + Context ctx, + Runtime *runtime) { RequestManager *rm = this; TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, @@ -1328,7 +1329,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { const std::lock_guard lock(request_queue_mutex); - std::cout << "\n############### prepare_next_batch_verify ###############\n"; + if (verbose) { + std::cout + << "\n############### prepare_next_batch_verify ###############\n"; + } assert(old_batches.size() > 0); @@ -2277,39 +2281,107 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::vector &prompts, - int max_seq_length) { 
+std::vector + FFModel::generate(std::vector &prompts, int max_seq_length) { + RequestManager *rm = RequestManager::get_request_manager(); + std::vector guids; + for (int i = 0; i < prompts.size(); i++) { + RequestManager::RequestGuid guid = + rm->register_new_request(prompts.at(i), max_seq_length); + if (guid != RequestManager::INVALID_GUID) { + guids.push_back(guid); + } + } + std::vector results; + for (int i = 0; i < guids.size(); i++) { + results.push_back(rm->get_generation_result(guids[i])); + } + return results; +} + +void RequestManager::start_background_server(FFModel *model) { + assert(request_manager_status == INITIALIZED); + request_manager_status = SERVING; + // Start background task + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + TaskLauncher launcher(RM_BACKGROUND_SERVING_TASK_ID, + TaskArgument(&model, sizeof(FFModel *))); + background_server_handler = runtime->execute_task(ctx, launcher); + // Register callbacks for normal exit + { + int ret = std::atexit(RequestManager::terminate_background_server_at_exit); + assert(ret == 0); // make sure the callback is successfully registered + } + // Register callbacks for termination + { + std::set_terminate([]() { + RequestManager::terminate_background_server_at_exit(); + std::abort(); + }); + } +} + +void RequestManager::background_serving_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { RequestManager *rm = RequestManager::get_request_manager(); + FFModel *llm = *(FFModel **)task->args; + { + // Update FFModel's lg_hlr and lg_ctx to the current + // task's runtime and ctx, since all future legion tasks are + // launched in this task + llm->config.lg_hlr = runtime; + llm->config.lg_ctx = ctx; + // Update the lg_hlr and lg_ctx of all SSMs' FFConfig + // since all future legion tasks are launched in this task + for (size_t i = 0; i < rm->get_num_ssms(); i++) { + FFModel *ssm = rm->get_ssm_model(i); + ssm->config.lg_hlr = runtime; + ssm->config.lg_ctx = ctx; + } + } if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return rm->generate_incr_decoding(this, prompts, max_seq_length); + rm->serve_incr_decoding(llm); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer(this, prompts, max_seq_length); + rm->serve_spec_infer(llm); } } /*static*/ -GenerationResult RequestManager::generate_incr_decoding( - FFModel *llm, std::vector &prompts, int max_seq_length) { +void RequestManager::serve_incr_decoding(FFModel *llm) { + Context ctx = llm->config.lg_ctx; + Runtime *runtime = llm->config.lg_hlr; + // Compile the llm InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid; - for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length); - } - - if (guid == 0) { - std::cout - << "=========== Discard request exceed prompt maximum... 
===========" - << std::endl; - return GenerationResult(); + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights(llm); + // init operators + im->init_operators_inference(llm); + // Legion futures for inc_decoding and spec_infer + BatchConfigFuture last_bcf; + InferenceResultFuture last_irf; + { + // Initialize futures for incr decoding + BatchConfig bc; + InferenceResult ir; + last_bcf = Future::from_value(bc); + last_irf = Future::from_value(ir); } - int tokens_to_generate = max_seq_length - all_requests[guid].tokens.size(); std::queue> batch_pipeline; { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } - while (!is_request_completed(guid)) { + + while (!is_background_server_terminated()) { + if (batch_pipeline.size() >= 4) { // Block here to avoid launching too many batches auto const &batch = batch_pipeline.front(); @@ -2324,15 +2396,10 @@ GenerationResult RequestManager::generate_incr_decoding( break; } } - if (is_request_completed(guid)) { - break; - } - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); runtime->begin_trace(ctx, 12346 /*trace_id*/); auto const &next_batch = batch_pipeline.back(); BatchConfigFuture bcf = - prepare_next_batch(next_batch.first, next_batch.second); + prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); @@ -2341,30 +2408,51 @@ GenerationResult RequestManager::generate_incr_decoding( last_irf = irf; runtime->end_trace(ctx, 12346 /*trace_id*/); } - GenerationResult gr = get_generation_result(guid); - // assert(gr.output_tokens.size() >= max_seq_length); - return gr; } /*static*/ -GenerationResult RequestManager::generate_spec_infer( - FFModel *llm, std::vector &prompts, int max_seq_length) { +void RequestManager::serve_spec_infer(FFModel *llm) { + Context ctx = llm->config.lg_ctx; + Runtime *runtime = llm->config.lg_hlr; InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid; - for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length); + { + // Compile the llm + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights(llm); + // init operators + im->init_operators_inference(llm); } - if (guid == 0) { - std::cout - << "=========== Discard request exceed prompt maximum... 
===========" - << std::endl; - return GenerationResult(); + for (size_t i = 0; i < get_num_ssms(); i++) { + // Compile the i-th ssm + FFModel *ssm = get_ssm_model(i); + im->compile_model_and_allocate_buffer(ssm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[ssm]->load_weights(ssm); + // init operators + im->init_operators_inference(ssm); } std::queue> batch_pipeline; + // Legion futures for inc_decoding and spec_infer + TreeVerifyBatchConfigFuture last_tree_bcf; + InferenceResultFuture last_tree_irf; + { + // Initialize futures for spec infer + TreeVerifyBatchConfig tree_bc; + InferenceResult tree_ir; + last_tree_bcf = Future::from_value(tree_bc); + last_tree_irf = Future::from_value(tree_ir); + } batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); - while (!is_request_completed(guid)) { + + while (!is_background_server_terminated()) { + if (batch_pipeline.size() >= 4) { // Block here to avoid launching too many batches auto const &batch = batch_pipeline.front(); @@ -2380,17 +2468,12 @@ GenerationResult RequestManager::generate_spec_infer( } } auto const &next_batch = batch_pipeline.back(); - BeamSearchBatchConfigFuture beam_bcf = - prepare_next_batch_init(next_batch.first, next_batch.second, 0); + BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init( + next_batch.first, next_batch.second, 0, ctx, runtime); std::vector beam_bcf_vec(get_num_ssms()); for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) { beam_bcf_vec[ssm_id] = beam_bcf; } - // if (is_request_completed(guid)) { - // break; - // } - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); runtime->begin_trace(ctx, 12345 /*trace_id*/); for (size_t i = 0; i < get_num_ssms(); i++) { @@ -2398,16 +2481,17 @@ GenerationResult RequestManager::generate_spec_infer( depth++) { beam_bcf = beam_bcf_vec[i]; - FutureMap fm = im->inference(get_model(i), 0, beam_bcf_vec[i]); + FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]); assert(fm.get_future_map_domain().get_volume() == 1); BeamInferenceResultFuture beam_irf = fm.get_future(0); - beam_bcf_vec[i] = prepare_next_batch_beam(beam_bcf_vec[i], beam_irf); + beam_bcf_vec[i] = + prepare_next_batch_beam(beam_bcf_vec[i], beam_irf, ctx, runtime); } } // Token Tree Verification { TreeVerifyBatchConfigFuture tree_bcf = - prepare_next_batch_verify(beam_bcf_vec); + prepare_next_batch_verify(beam_bcf_vec, ctx, runtime); FutureMap fm = im->inference(llm, 0, tree_bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture tree_irf = fm.get_future(0); @@ -2417,10 +2501,34 @@ GenerationResult RequestManager::generate_spec_infer( } runtime->end_trace(ctx, 12345 /*trace_id*/); } +} + +void RequestManager::trigger_request_completion_future( + RequestGuid const &guid) { + const std::lock_guard lock(request_to_promise_mutex); + assert(request_to_promise.find(guid) != request_to_promise.end()); + // Set the completion promise in case other threads are waiting + request_to_promise[guid]->set_value(); +} + +/*static*/ +void RequestManager::terminate_background_server_at_exit() { + RequestManager *rm = RequestManager::get_request_manager(); + rm->terminate_background_server(); +} + +void RequestManager::terminate_background_server() { + if (request_manager_status == SERVING) { + request_manager_status = TERMINATED; + // Wait for the background server to terminate + Runtime *runtime = Runtime::get_runtime(); + Context ctx = 
Runtime::get_context(); + background_server_handler.get_void_result(); + } +} - GenerationResult gr = get_generation_result(guid); - // assert(gr.output_tokens.size() >= max_seq_length); - return gr; +bool RequestManager::is_background_server_terminated() { + return request_manager_status == TERMINATED; } RequestManager *request_manager_singleton = nullptr; From 18cd4850229e1fe29778d6383ee3f7175668a093 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 14 Jan 2024 07:12:32 -0800 Subject: [PATCH 310/344] Update README.md --- .github/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/README.md b/.github/README.md index 0972135504..4a2a881c8d 100644 --- a/.github/README.md +++ b/.github/README.md @@ -102,10 +102,11 @@ llm.compile(generation_config, max_tokens_per_batch = 128, ssms=ssms) ``` -Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult`, which include the output tokens and text. +Next, we call `llm.start_server()` to start an LLM server running on a separate background thread, which allows users to perform computations in parallel with LLM serving. Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult`, which include the output tokens and text. After all serving requests are processed, you can either call `llm.stop_server()` to terminate the background thread or directly exit the Python program, which will automatically terminate the background server thread. ```python -with llm: - result = llm.generate("Here are some travel tips for Tokyo:\n") +llm.start_server() +result = llm.generate("Here are some travel tips for Tokyo:\n") +llm.stop_server() # This invocation is optional ``` ### Incremental decoding @@ -140,8 +141,9 @@ llm.compile(generation_config, max_tokens_per_batch = 128) # Generation begins! 
-with llm: - result = llm.generate("Here are some travel tips for Tokyo:\n") +llm.start_server() +result = llm.generate("Here are some travel tips for Tokyo:\n") +llm.stop_server() # This invocation is optional ``` From 75edadcbaf65fc4cea83eea91de73719ed5a4959 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 19 Jan 2024 23:21:31 -0500 Subject: [PATCH 311/344] Better debugging/logging tools for alignment checks (#1275) * only stop server if rm is initialized * fix * better logging * pass layer names to ops * add debugging functionality to hf script * fix * fixes * fix * fix --------- Co-authored-by: Ubuntu --- examples/python/keras/seq_cifar10_cnn.py | 2 +- include/flexflow/operator.h | 125 +++++++++++++++++- .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/aggregate_params.h | 1 + include/flexflow/ops/aggregate_spec_params.h | 1 + include/flexflow/ops/arg_topk_params.h | 1 + include/flexflow/ops/argmax_params.h | 1 + include/flexflow/ops/attention_params.h | 1 + include/flexflow/ops/batch_matmul_params.h | 1 + include/flexflow/ops/beam_topk_params.h | 1 + include/flexflow/ops/cast_params.h | 1 + include/flexflow/ops/concat_params.h | 2 +- include/flexflow/ops/conv_2d_params.h | 1 + include/flexflow/ops/dropout_params.h | 1 + include/flexflow/ops/element_binary_params.h | 1 + include/flexflow/ops/element_unary_params.h | 1 + include/flexflow/ops/embedding_params.h | 1 + include/flexflow/ops/experts_params.h | 1 + include/flexflow/ops/flat_params.h | 1 + include/flexflow/ops/gather_params.h | 1 + include/flexflow/ops/groupby_params.h | 1 + .../ops/inc_multihead_self_attention_params.h | 1 + include/flexflow/ops/layer_norm_params.h | 1 + include/flexflow/ops/linear_params.h | 1 + include/flexflow/ops/pool_2d_params.h | 1 + include/flexflow/ops/reduce_params.h | 1 + include/flexflow/ops/reshape_params.h | 1 + .../flexflow/ops/residual_layer_norm_params.h | 1 + .../flexflow/ops/residual_rms_norm_params.h | 1 + include/flexflow/ops/rms_norm_params.h | 1 + include/flexflow/ops/sampling_params.h | 1 + .../flexflow/ops/sigmoid_silu_multi_params.h | 1 + include/flexflow/ops/softmax.h | 6 + include/flexflow/ops/softmax_params.h | 2 + ...spec_inc_multihead_self_attention_params.h | 2 +- include/flexflow/ops/split_params.h | 1 + include/flexflow/ops/topk_params.h | 1 + include/flexflow/ops/transpose_params.h | 1 + ...tree_inc_multihead_self_attention_params.h | 1 + .../flexflow/parallel_ops/allreduce_params.h | 1 + .../flexflow/parallel_ops/combine_params.h | 1 + .../parallel_ops/fused_parallel_op_params.h | 1 + .../flexflow/parallel_ops/partition_params.h | 1 + .../flexflow/parallel_ops/reduction_params.h | 1 + .../flexflow/parallel_ops/replicate_params.h | 1 + inference/utils/download_hf_model.py | 4 +- python/flexflow/serve/serve.py | 31 +++-- src/ops/add_bias_residual_layer_norm.cc | 12 +- src/ops/aggregate.cc | 13 +- src/ops/aggregate_spec.cc | 3 + src/ops/arg_topk.cc | 12 +- src/ops/argmax.cc | 12 +- src/ops/attention.cc | 5 +- src/ops/batch_matmul.cc | 9 +- src/ops/beam_topk.cc | 10 +- src/ops/cast.cc | 8 +- src/ops/concat.cc | 2 +- src/ops/conv_2d.cc | 9 +- src/ops/dropout.cc | 9 +- src/ops/element_binary.cc | 9 +- src/ops/element_unary.cc | 11 +- src/ops/embedding.cc | 2 +- src/ops/experts.cc | 9 +- src/ops/flat.cc | 7 + src/ops/fused.cu | 7 +- src/ops/gather.cc | 9 +- src/ops/group_by.cc | 20 ++- src/ops/inc_multihead_self_attention.cc | 5 +- src/ops/layer_norm.cc | 12 +- src/ops/linear.cc | 12 +- src/ops/pool_2d.cc | 9 +- src/ops/reduce.cc | 18 ++- 
src/ops/reshape.cc | 12 +- src/ops/residual_layer_norm.cc | 12 +- src/ops/residual_rms_norm.cc | 12 +- src/ops/rms_norm.cc | 12 +- src/ops/sampling.cc | 12 +- src/ops/sigmoid_silu_multi.cc | 12 +- src/ops/softmax.cc | 48 ++++++- src/ops/spec_inc_multihead_self_attention.cc | 5 +- src/ops/split.cc | 5 +- src/ops/topk.cc | 12 +- src/ops/transpose.cc | 11 +- src/ops/tree_inc_multihead_self_attention.cc | 5 +- src/parallel_ops/allreduce.cc | 5 +- src/parallel_ops/combine.cc | 5 +- src/parallel_ops/fused_parallel_op.cc | 3 + src/parallel_ops/partition.cc | 5 +- src/parallel_ops/reduction.cc | 5 +- src/parallel_ops/replicate.cc | 5 +- src/runtime/cuda_helper.cu | 24 +++- src/runtime/graph.cc | 93 +++++++++++-- src/runtime/hip_helper.cpp | 24 +++- src/runtime/operator.cc | 101 -------------- src/runtime/substitution.cc | 9 +- tests/inference/huggingface_inference.py | 52 +++++++- 96 files changed, 746 insertions(+), 190 deletions(-) diff --git a/examples/python/keras/seq_cifar10_cnn.py b/examples/python/keras/seq_cifar10_cnn.py index 281a09ed70..66ea8530e0 100644 --- a/examples/python/keras/seq_cifar10_cnn.py +++ b/examples/python/keras/seq_cifar10_cnn.py @@ -56,7 +56,7 @@ def top_level_task(): if __name__ == "__main__": - print("Sequantial model, cifar10 cnn") + print("Sequential model, cifar10 cnn") configs = ff.get_configs() ff.init_flexflow_runtime(configs) top_level_task() diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index fd21436681..73c2c3e092 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -9,6 +9,14 @@ #include "flexflow/utils/dot/record_formatter.h" #include +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + namespace FlexFlow { extern LegionRuntime::Logger::Category log_measure; @@ -227,13 +235,126 @@ class Op { assert(false); }; virtual void print_layer(FFModel const &model) = 0; + template + static std::string get_op_name_without_uid(OpMetaType *m) { + std::string op_name_without_uid = std::string(m->op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); + return op_name_without_uid; + } + template static void save_inference_tensors_to_file( - OpMeta *m, + OpMetaType *m, int shard_id, BatchConfig const *bc, std::vector input_tensors, std::vector weight_tensors, - std::vector output_tensors); + std::vector output_tensors, + bool before_kernel = false) { + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + // output base filepath, shared by all tensors from the same operator + std::string op_name_without_uid = get_op_name_without_uid(m); + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + op_name_without_uid + "_shard-id_" + std::to_string(shard_id); + if (before_kernel) { + base_filepath += "_pre"; + } + // save batch config, if passed + if (bc != nullptr) 
{ + bc->save_to_file(base_filepath + "_batch-config"); + } + // save all inputs + for (int i = 0; i < input_tensors.size(); i++) { + std::string filename = base_filepath + "_input_" + std::to_string(i); + if (input_tensors[i].data_type == DT_FLOAT) { + save_tensor(input_tensors[i].get_float_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_HALF) { + save_tensor(input_tensors[i].get_half_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT32) { + save_tensor(input_tensors[i].get_int32_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT64) { + save_tensor(input_tensors[i].get_int64_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // only dump the weights once + if (m->decoding_step == 0) { + for (int i = 0; i < weight_tensors.size(); i++) { + std::string filename = base_filepath + "_weight_" + std::to_string(i); + if (weight_tensors[i].data_type == DT_FLOAT) { + save_tensor(weight_tensors[i].get_float_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_HALF) { + save_tensor(weight_tensors[i].get_half_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT32) { + save_tensor(weight_tensors[i].get_int32_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT64) { + save_tensor(weight_tensors[i].get_int64_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + } + // save all outputs + for (int i = 0; i < output_tensors.size(); i++) { + std::string filename = base_filepath + "_output_" + std::to_string(i); + if (output_tensors[i].data_type == DT_FLOAT) { + save_tensor(output_tensors[i].get_float_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_HALF) { + save_tensor(output_tensors[i].get_half_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT32) { + save_tensor(output_tensors[i].get_int32_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT64) { + save_tensor(output_tensors[i].get_int64_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // increase count of decoding steps + if (!before_kernel) { + m->decoding_step++; + } + } virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const = 0; diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 6f49983467..87fe2fb562 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/aggregate_params.h b/include/flexflow/ops/aggregate_params.h index f746881d89..deaa04b3e7 100644 --- 
a/include/flexflow/ops/aggregate_params.h +++ b/include/flexflow/ops/aggregate_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct AggregateParams { int n; float lambda_bal; + char name[MAX_OPNAME]; bool is_valid(std::vector const &) const; }; bool operator==(AggregateParams const &, AggregateParams const &); diff --git a/include/flexflow/ops/aggregate_spec_params.h b/include/flexflow/ops/aggregate_spec_params.h index eb662f4c07..69e8574cba 100644 --- a/include/flexflow/ops/aggregate_spec_params.h +++ b/include/flexflow/ops/aggregate_spec_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct AggregateSpecParams { int n; float lambda_bal; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(AggregateSpecParams const &, AggregateSpecParams const &); diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index bd9c38e2a9..b2876c011f 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -12,6 +12,7 @@ struct ArgTopKParams { int k; bool sorted; bool speculative_decoding; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(ArgTopKParams const &, ArgTopKParams const &); diff --git a/include/flexflow/ops/argmax_params.h b/include/flexflow/ops/argmax_params.h index a8f629619f..9ddb8e1fe3 100644 --- a/include/flexflow/ops/argmax_params.h +++ b/include/flexflow/ops/argmax_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct ArgMaxParams { bool beam_search; bool is_valid(ParallelTensorShape const &) const; + char name[MAX_OPNAME]; }; bool operator==(ArgMaxParams const &, ArgMaxParams const &); diff --git a/include/flexflow/ops/attention_params.h b/include/flexflow/ops/attention_params.h index b72923a65c..89906407d3 100644 --- a/include/flexflow/ops/attention_params.h +++ b/include/flexflow/ops/attention_params.h @@ -11,6 +11,7 @@ struct MultiHeadAttentionParams { int embed_dim, num_heads, kdim, vdim; float dropout; bool bias, add_bias_kv, add_zero_attn; + char name[MAX_OPNAME]; bool is_valid(std::tuple const &) const; }; diff --git a/include/flexflow/ops/beam_topk_params.h b/include/flexflow/ops/beam_topk_params.h index 430f16e249..3e09848c9a 100644 --- a/include/flexflow/ops/beam_topk_params.h +++ b/include/flexflow/ops/beam_topk_params.h @@ -11,6 +11,7 @@ struct BeamTopKParams { LayerID layer_guid; bool sorted; int max_beam_width; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(BeamTopKParams const &, BeamTopKParams const &); diff --git a/include/flexflow/ops/cast_params.h b/include/flexflow/ops/cast_params.h index efef3de890..38a69e8a69 100644 --- a/include/flexflow/ops/cast_params.h +++ b/include/flexflow/ops/cast_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct CastParams { DataType dtype; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(CastParams const &, CastParams const &); diff --git a/include/flexflow/ops/concat_params.h b/include/flexflow/ops/concat_params.h index 2987b25424..b1a7e74c55 100644 --- a/include/flexflow/ops/concat_params.h +++ b/include/flexflow/ops/concat_params.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ConcatParams { int axis; - + char name[MAX_OPNAME]; bool is_valid(std::vector const &) const; }; diff --git a/include/flexflow/ops/conv_2d_params.h b/include/flexflow/ops/conv_2d_params.h index 9aac91e315..562d5adef9 100644 --- a/include/flexflow/ops/conv_2d_params.h +++ b/include/flexflow/ops/conv_2d_params.h @@ 
-13,6 +13,7 @@ struct Conv2DParams { padding_w, groups; ActiMode activation; bool use_bias; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &input) const; void solve_dims(ParallelTensorShape const &input, diff --git a/include/flexflow/ops/dropout_params.h b/include/flexflow/ops/dropout_params.h index 61aee12f9f..eb1a4d98cf 100644 --- a/include/flexflow/ops/dropout_params.h +++ b/include/flexflow/ops/dropout_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct DropoutParams { float rate; unsigned long long seed; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(DropoutParams const &, DropoutParams const &); diff --git a/include/flexflow/ops/element_binary_params.h b/include/flexflow/ops/element_binary_params.h index 9489b793a7..bfbb758b6e 100644 --- a/include/flexflow/ops/element_binary_params.h +++ b/include/flexflow/ops/element_binary_params.h @@ -11,6 +11,7 @@ struct ElementBinaryParams { LayerID layer_guid; OperatorType type; bool inplace_a; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git a/include/flexflow/ops/element_unary_params.h b/include/flexflow/ops/element_unary_params.h index 1aac85c43e..16cb015e3c 100644 --- a/include/flexflow/ops/element_unary_params.h +++ b/include/flexflow/ops/element_unary_params.h @@ -12,6 +12,7 @@ struct ElementUnaryParams { bool inplace; float scalar = 0.0; LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/embedding_params.h b/include/flexflow/ops/embedding_params.h index 71e5cc8b20..d813132048 100644 --- a/include/flexflow/ops/embedding_params.h +++ b/include/flexflow/ops/embedding_params.h @@ -12,6 +12,7 @@ struct EmbeddingParams { LayerID layer_guid; AggrMode aggr; DataType data_type; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/experts_params.h b/include/flexflow/ops/experts_params.h index 7adced3c8c..90cce47526 100644 --- a/include/flexflow/ops/experts_params.h +++ b/include/flexflow/ops/experts_params.h @@ -17,6 +17,7 @@ struct ExpertsParams { int experts_internal_dim_size; bool use_bias; ActiMode activation; + char name[MAX_OPNAME]; bool is_valid(std::vector const &) const; }; diff --git a/include/flexflow/ops/flat_params.h b/include/flexflow/ops/flat_params.h index 5f821b0416..fc006849e5 100644 --- a/include/flexflow/ops/flat_params.h +++ b/include/flexflow/ops/flat_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct FlatParams { + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; void solve_dims(ParallelTensorShape const &input, ParallelDim output_dims[MAX_TENSOR_DIM], diff --git a/include/flexflow/ops/gather_params.h b/include/flexflow/ops/gather_params.h index 51f1184a72..de27cdfc7c 100644 --- a/include/flexflow/ops/gather_params.h +++ b/include/flexflow/ops/gather_params.h @@ -10,6 +10,7 @@ namespace FlexFlow { struct GatherParams { int legion_dim; LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; }; diff --git a/include/flexflow/ops/groupby_params.h b/include/flexflow/ops/groupby_params.h index 24a74f5412..4f6245863a 100644 --- a/include/flexflow/ops/groupby_params.h +++ b/include/flexflow/ops/groupby_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct Group_byParams { int n; float alpha; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h 
b/include/flexflow/ops/inc_multihead_self_attention_params.h index 7ae39f1cfe..58681069e2 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -16,6 +16,7 @@ struct IncMultiHeadSelfAttentionParams { scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/layer_norm_params.h b/include/flexflow/ops/layer_norm_params.h index c9aa40048d..3effce6204 100644 --- a/include/flexflow/ops/layer_norm_params.h +++ b/include/flexflow/ops/layer_norm_params.h @@ -12,6 +12,7 @@ struct LayerNormParams { bool elementwise_affine; float eps; bool use_bias; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/linear_params.h b/include/flexflow/ops/linear_params.h index 563304e89f..9a62ebd857 100644 --- a/include/flexflow/ops/linear_params.h +++ b/include/flexflow/ops/linear_params.h @@ -20,6 +20,7 @@ class LinearParams { float kernel_reg_lambda; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &input_shape) const; void solve_dims(const ParallelTensor input, diff --git a/include/flexflow/ops/pool_2d_params.h b/include/flexflow/ops/pool_2d_params.h index 7d4f1f1c12..54af7f9db6 100644 --- a/include/flexflow/ops/pool_2d_params.h +++ b/include/flexflow/ops/pool_2d_params.h @@ -10,6 +10,7 @@ struct Pool2DParams { int kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w; PoolType pool_type; ActiMode activation; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &input) const; void solve_dims(ParallelTensorShape const &input, diff --git a/include/flexflow/ops/reduce_params.h b/include/flexflow/ops/reduce_params.h index b79ba9157a..478649584f 100644 --- a/include/flexflow/ops/reduce_params.h +++ b/include/flexflow/ops/reduce_params.h @@ -10,6 +10,7 @@ struct ReduceParams { std::vector axes; bool keepdims; LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/reshape_params.h b/include/flexflow/ops/reshape_params.h index ffd88948ea..15753c8e17 100644 --- a/include/flexflow/ops/reshape_params.h +++ b/include/flexflow/ops/reshape_params.h @@ -10,6 +10,7 @@ namespace FlexFlow { struct ReshapeParams { std::vector shape; LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 24da4a2c08..949ae0c799 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + char name[MAX_OPNAME]; bool is_valid(std::tuple const &) const; diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index 64751a30b0..a4e4de59ab 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; }; diff --git a/include/flexflow/ops/rms_norm_params.h b/include/flexflow/ops/rms_norm_params.h index 81295322f0..2e4ceecf48 100644 --- 
a/include/flexflow/ops/rms_norm_params.h +++ b/include/flexflow/ops/rms_norm_params.h @@ -11,6 +11,7 @@ struct RMSNormParams { LayerID layer_guid; float eps; int dim; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/sampling_params.h b/include/flexflow/ops/sampling_params.h index 1449ddbf54..ddc98a3d6c 100644 --- a/include/flexflow/ops/sampling_params.h +++ b/include/flexflow/ops/sampling_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SamplingParams { float top_p; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SamplingParams const &, SamplingParams const &); diff --git a/include/flexflow/ops/sigmoid_silu_multi_params.h b/include/flexflow/ops/sigmoid_silu_multi_params.h index c8182505b3..eb152db5c1 100644 --- a/include/flexflow/ops/sigmoid_silu_multi_params.h +++ b/include/flexflow/ops/sigmoid_silu_multi_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SigmoidSiluMultiParams { LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 6fd1a434d4..61094f7361 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -14,6 +14,7 @@ class Softmax : public Op { using Params = SoftmaxParams; using Input = ParallelTensor; Softmax(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor logit, int dim, char const *name); @@ -60,6 +61,11 @@ class Softmax : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); Params get_params() const; private: diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index d805d9966d..63dc87641f 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -6,7 +6,9 @@ namespace FlexFlow { struct SoftmaxParams { + LayerID layer_guid; int dim; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 2f7a706bf1..1461224ba9 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -13,7 +13,7 @@ struct SpecIncMultiHeadSelfAttentionParams { float dropout, scaling_factor; bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; - + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/split_params.h b/include/flexflow/ops/split_params.h index f0f3b2e956..e21a1ab4a1 100644 --- a/include/flexflow/ops/split_params.h +++ b/include/flexflow/ops/split_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SplitParams { std::vector splits; int legion_axis; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/topk_params.h b/include/flexflow/ops/topk_params.h index 8b9a0f1bd5..01c6ae9da7 100644 --- a/include/flexflow/ops/topk_params.h +++ b/include/flexflow/ops/topk_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct 
TopKParams { int k; bool sorted; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(TopKParams const &, TopKParams const &); diff --git a/include/flexflow/ops/transpose_params.h b/include/flexflow/ops/transpose_params.h index 42737ee3e9..2e3e34007a 100644 --- a/include/flexflow/ops/transpose_params.h +++ b/include/flexflow/ops/transpose_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct TransposeParams { std::vector perm; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index 14fcde74ba..d1a51b8b8f 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -16,6 +16,7 @@ struct TreeIncMultiHeadSelfAttentionParams { scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/parallel_ops/allreduce_params.h b/include/flexflow/parallel_ops/allreduce_params.h index c04676ffeb..a0daac8f9a 100644 --- a/include/flexflow/parallel_ops/allreduce_params.h +++ b/include/flexflow/parallel_ops/allreduce_params.h @@ -5,6 +5,7 @@ namespace FlexFlow { struct AllReduceParams { int allreduce_legion_dim; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(AllReduceParams const &, AllReduceParams const &); diff --git a/include/flexflow/parallel_ops/combine_params.h b/include/flexflow/parallel_ops/combine_params.h index 74ef01e08f..8ca05f7f50 100644 --- a/include/flexflow/parallel_ops/combine_params.h +++ b/include/flexflow/parallel_ops/combine_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct CombineParams { int combine_legion_dim; int combine_degree; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(CombineParams const &, CombineParams const &); diff --git a/include/flexflow/parallel_ops/fused_parallel_op_params.h b/include/flexflow/parallel_ops/fused_parallel_op_params.h index cba3844a4c..8c56b30998 100644 --- a/include/flexflow/parallel_ops/fused_parallel_op_params.h +++ b/include/flexflow/parallel_ops/fused_parallel_op_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct FusedParallelOpParams { std::vector parallel_ops; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(FusedParallelOpParams const &, FusedParallelOpParams const &); diff --git a/include/flexflow/parallel_ops/partition_params.h b/include/flexflow/parallel_ops/partition_params.h index 921ab43eaf..33ccf6b02c 100644 --- a/include/flexflow/parallel_ops/partition_params.h +++ b/include/flexflow/parallel_ops/partition_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct RepartitionParams { int repartition_legion_dim; int repartition_degree; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(RepartitionParams const &, RepartitionParams const &); diff --git a/include/flexflow/parallel_ops/reduction_params.h b/include/flexflow/parallel_ops/reduction_params.h index fab7da2626..60b6c4f6aa 100644 --- a/include/flexflow/parallel_ops/reduction_params.h +++ b/include/flexflow/parallel_ops/reduction_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct ReductionParams { int reduction_legion_dim; int reduction_degree; + char name[MAX_OPNAME]; bool 
is_valid(ParallelTensorShape const &) const; }; bool operator==(ReductionParams const &, ReductionParams const &); diff --git a/include/flexflow/parallel_ops/replicate_params.h b/include/flexflow/parallel_ops/replicate_params.h index 06edbc1ddc..da1f94217c 100644 --- a/include/flexflow/parallel_ops/replicate_params.h +++ b/include/flexflow/parallel_ops/replicate_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct ReplicateParams { int replicate_legion_dim; int replicate_degree; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(ReplicateParams const &, ReplicateParams const &); diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 03fc8e1633..94a8c23e68 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -36,9 +36,9 @@ def parse_args(): def main(args): if args.full_precision_only: - data_types = ff.DataType.DT_FLOAT + data_types = (ff.DataType.DT_FLOAT,) elif args.half_precision_only: - data_types = ff.DataType.DT_HALF + data_types = (ff.DataType.DT_HALF,) else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index d1a935e5fc..5c3cac9303 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -117,10 +117,11 @@ def __init__( self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file + self.rm = None def __del__(self): # Stop the background server before deleting the object - if type(self) == LLM: + if type(self) == LLM and self.rm is not None: self.rm.stop_server() def __get_ff_model_type(self): @@ -320,9 +321,9 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - #self.max_requests_per_batch = max_requests_per_batch - #self.max_seq_length = max_seq_length - #self.max_tokens_per_batch = max_tokens_per_batch + # self.max_requests_per_batch = max_requests_per_batch + # self.max_seq_length = max_seq_length + # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -362,7 +363,7 @@ def compile( self.ffconfig, self.hf_config, self.data_type, - max_tokens_per_batch + max_tokens_per_batch, ) # Download the weights from huggingface (if needed) @@ -378,7 +379,7 @@ def compile( model_configs.hidden_size, model_configs.hidden_size // model_configs.num_attention_heads, self.ffconfig.tensor_parallelism_degree, - self.data_type == DataType.DT_FLOAT + self.data_type == DataType.DT_FLOAT, ) # Register weights file loader @@ -404,8 +405,11 @@ def compile( self.rm.register_ssm_model(ssm.model.ffmodel) # start background server - if (mode == InferenceMode.TREE_VERIFY_MODE) or (mode == InferenceMode.INC_DECODING_MODE): + if (mode == InferenceMode.TREE_VERIFY_MODE) or ( + mode == InferenceMode.INC_DECODING_MODE + ): import atexit + atexit.register(self.rm.stop_server) def generate(self, prompts: Union[str, List[str]], max_length: int = 128): @@ -426,26 +430,27 @@ def generate(self, prompts: Union[str, List[str]], max_length: int = 128): return self.model.ffmodel.generate(prompts, max_length) else: assert False, "Please pass a non-empty string or list of strings" - + def start_server(self): self.rm.start_server(self.model.ffmodel) print("Background server started.") - + def stop_server(self): self.rm.stop_server() - 
print("Background server stoped.") - + print("Background server stopped.") + def __enter__(self): # Start the server when entering the context - #self.rm.start_server(self.model.ffmodel) + # self.rm.start_server(self.model.ffmodel) return self def __exit__(self, exc_type, exc_value, traceback): # Stop the server when exiting the context - #self.rm.stop_server() + # self.rm.stop_server() if exc_type: print(f"Exception occurred: {exc_value}") + class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 42fbb3016a..e670380901 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -58,6 +58,9 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -213,7 +216,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( FFModel &model, @@ -755,6 +758,8 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -783,6 +788,10 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); AddBiasResidualLayerNormParams params; params.layer_guid = layer_guid; @@ -790,6 +799,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 67810d3f5b..5f05458e34 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -85,6 +85,9 @@ AggregateParams Aggregate::get_params() const { AggregateParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -164,7 +167,8 @@ Aggregate::Aggregate(FFModel &model, AggregateParams const ¶ms, std::vector const &inputs, char const *name) - : Aggregate(model, inputs.data(), params.n, params.lambda_bal, name) {} + : Aggregate( + model, inputs.data(), params.n, params.lambda_bal, params.name) {} using PCG::Node; Node Aggregate::deserialize(FFModel &ff, @@ -175,10 +179,15 @@ Node Aggregate::deserialize(FFModel &ff, float lambda_bal; dez.deserialize(n); dez.deserialize(lambda_bal); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); assert(num_inputs == n + 4); AggregateParams params; params.n = n; params.lambda_bal = lambda_bal; + strcpy(params.name, name); return ff.get_or_create_node(inputs, params); } @@ -567,6 +576,8 @@ void Aggregate::backward_task(Task const *task, void Aggregate::serialize(Legion::Serializer &sez) const { sez.serialize(this->n); sez.serialize(this->lambda_bal); + 
sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } bool Aggregate::measure_operator_cost(Simulator *sim, diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 19b2edc14a..1edd430881 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -84,6 +84,9 @@ AggregateSpecParams AggregateSpec::get_params() const { AggregateSpecParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 2727a1d249..780a77450e 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -112,6 +112,9 @@ ArgTopKParams ArgTopK::get_params() const { params.k = this->k; params.sorted = this->sorted; params.speculative_decoding = this->speculative_decoding; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -183,7 +186,7 @@ ArgTopK::ArgTopK(FFModel &model, params.k, params.sorted, params.speculative_decoding, - name) {} + params.name) {} void ArgTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -446,6 +449,8 @@ void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->k); sez.serialize(this->sorted); sez.serialize(this->speculative_decoding); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node ArgTopK::deserialize(FFModel &ff, @@ -464,11 +469,16 @@ Node ArgTopK::deserialize(FFModel &ff, dez.deserialize(k); dez.deserialize(sorted); dez.deserialize(speculative_decoding); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ArgTopKParams params; params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; params.speculative_decoding = speculative_decoding; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index dc7e4ea3b3..a52ce1886b 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -91,6 +91,9 @@ Op *ArgMax::create_operator_from_layer( ArgMaxParams ArgMax::get_params() const { ArgMaxParams params; params.beam_search = this->beam_search; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -140,7 +143,7 @@ ArgMax::ArgMax(FFModel &model, ArgMaxParams const ¶ms, const ParallelTensor input, char const *name) - : ArgMax(model, input, params.beam_search, name) {} + : ArgMax(model, input, params.beam_search, params.name) {} void ArgMax::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -410,6 +413,8 @@ void ArgMax::backward(FFModel const &ff) { void ArgMax::serialize(Legion::Serializer &sez) const { sez.serialize(this->beam_search); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node ArgMax::deserialize(FFModel &ff, @@ -419,8 +424,13 @@ Node ArgMax::deserialize(FFModel &ff, assert(num_inputs == 1); bool beam_search; dez.deserialize(beam_search); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ArgMaxParams params; params.beam_search = beam_search; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 1f71be07a8..97afc94341 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -363,7 +363,7 @@ MultiHeadAttention::MultiHeadAttention( params.add_bias_kv, 
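Editor's note: the .cc hunks in this part of the patch share one serialization recipe: serialize() writes strlen(name) followed by the raw bytes, and deserialize() reads the length, copies the bytes into a zero-initialized char name[MAX_OPNAME] buffer, and strcpy's the result into the reconstructed params. A self-contained sketch of that round trip, using a plain byte buffer as a stand-in for Legion's Serializer and Deserializer (names and the MAX_OPNAME value are illustrative):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

constexpr int MAX_OPNAME = 128; // assumed value

struct ByteWriter { // stand-in for Legion::Serializer
  std::vector<uint8_t> buf;
  void write(void const *p, size_t len) {
    uint8_t const *b = static_cast<uint8_t const *>(p);
    buf.insert(buf.end(), b, b + len);
  }
};

struct ByteReader { // stand-in for Legion::Deserializer
  uint8_t const *p;
  void read(void *dst, size_t len) { std::memcpy(dst, p, len); p += len; }
};

// Mirrors the serialize() hunks: length prefix, then the raw characters.
void serialize_name(ByteWriter &w, char const *name) {
  size_t len = std::strlen(name);
  w.write(&len, sizeof(len));
  w.write(name, len);
}

// Mirrors the deserialize() hunks: zero-filled fixed buffer, then copy out.
void deserialize_name(ByteReader &r, char (&out)[MAX_OPNAME]) {
  size_t len = 0;
  r.read(&len, sizeof(len));
  assert(len < MAX_OPNAME); // the patch relies on names already fitting MAX_OPNAME
  char name[MAX_OPNAME] = {0};
  r.read(name, len);
  std::strcpy(out, name);
}

int main() {
  ByteWriter w;
  serialize_name(w, "layer_norm_3");
  ByteReader r{w.buf.data()};
  char restored[MAX_OPNAME];
  deserialize_name(r, restored);
  std::printf("%s\n", restored);
  return 0;
}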
params.add_zero_attn, allocate_weights, - name) {} + params.name) {} void MultiHeadAttention::init_inference( FFModel const &ff, @@ -1013,6 +1013,9 @@ MultiHeadAttentionParams MultiHeadAttention::get_params() const { params.bias = this->bias; params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index f4b06877e5..e13169f6c1 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -138,7 +138,7 @@ BatchMatmul::BatchMatmul( inputs.second, params.a_seq_length_dim, params.b_seq_length_dim, - name) {} + params.name) {} // return A*B BatchMatmul::BatchMatmul(FFModel &model, @@ -190,6 +190,8 @@ void BatchMatmul::serialize(Legion::Serializer &sez) const { BatchMatmulParams params = get_params(); sez.serialize(params.a_seq_length_dim); sez.serialize(params.b_seq_length_dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -202,10 +204,15 @@ Node BatchMatmul::deserialize(FFModel &ff, int a_seq_length_dim, b_seq_length_dim; dez.deserialize(a_seq_length_dim); dez.deserialize(b_seq_length_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); BatchMatmulParams params; params.a_seq_length_dim = a_seq_length_dim; params.b_seq_length_dim = b_seq_length_dim; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 18d0ec1587..d2054cacb0 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -170,7 +170,7 @@ BeamTopK::BeamTopK(FFModel &model, params.layer_guid, params.max_beam_width, params.sorted, - name) {} + params.name) {} void BeamTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -418,6 +418,8 @@ void BeamTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->sorted); sez.serialize(this->max_beam_width); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node BeamTopK::deserialize(FFModel &ff, @@ -434,10 +436,16 @@ Node BeamTopK::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(sorted); dez.deserialize(max_beam_width); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + BeamTopKParams params; params.layer_guid = layer_guid; params.sorted = sorted; params.max_beam_width = max_beam_width; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/cast.cc b/src/ops/cast.cc index 2a845cb303..e514236a31 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -112,7 +112,7 @@ Cast::Cast(FFModel &model, CastParams const ¶ms, ParallelTensor const &input, char const *name) - : Cast(model, input, params.dtype, name) {} + : Cast(model, input, params.dtype, params.name) {} void Cast::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -409,6 +409,8 @@ bool Cast::measure_operator_cost(Simulator *sim, void Cast::serialize(Legion::Serializer &sez) const { sez.serialize(this->outputs[0]->data_type); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -420,6 +422,10 @@ Node Cast::deserialize(FFModel &ff, assert(num_inputs == 1); DataType 
dtype; dez.deserialize(dtype); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); return ff.get_or_create_node(inputs[0], {dtype}); } diff --git a/src/ops/concat.cc b/src/ops/concat.cc index 80935e387b..d4d8e525fc 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -147,7 +147,7 @@ Concat::Concat(FFModel &model, ConcatParams const ¶ms, std::vector const &inputs, char const *name) - : Concat(model, inputs.size(), inputs.data(), params.axis, name) {} + : Concat(model, inputs.size(), inputs.data(), params.axis, params.name) {} void Concat::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 7d8fd32570..94850a178d 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -389,7 +389,7 @@ Conv2D::Conv2D(FFModel &model, params.groups, params.use_bias, allocate_weights, - name) {} + params.name) {} bool Conv2DParams::is_valid(ParallelTensorShape const &input) const { ParallelTensorShape output_shape, kernel_shape, bias_shape; @@ -1026,6 +1026,8 @@ void Conv2D::serialize(Legion::Serializer &sez) const { sez.serialize(this->groups); sez.serialize(this->use_bias); sez.serialize(this->activation); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -1055,6 +1057,10 @@ Node Conv2D::deserialize(FFModel &ff, dez.deserialize(groups); dez.deserialize(use_bias); dez.deserialize(activation); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); Conv2DParams params; params.layer_guid = layer_guid; @@ -1068,6 +1074,7 @@ Node Conv2D::deserialize(FFModel &ff, params.groups = groups; params.use_bias = use_bias; params.activation = activation; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 9b11c9d912..58cb82d53d 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -118,7 +118,7 @@ Dropout::Dropout(FFModel &model, DropoutParams const ¶ms, const ParallelTensor input, char const *name) - : Dropout(model, input, params.rate, params.seed, name) {} + : Dropout(model, input, params.rate, params.seed, params.name) {} void Dropout::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -272,6 +272,8 @@ void Dropout::backward_task(Task const *task, void Dropout::serialize(Legion::Serializer &sez) const { sez.serialize(this->rate); sez.serialize(this->seed); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node Dropout::deserialize(FFModel &ff, @@ -283,9 +285,14 @@ Node Dropout::deserialize(FFModel &ff, float rate; dez.deserialize(rate); dez.deserialize(seed); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); DropoutParams params; params.rate = rate; params.seed = seed; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 42c6487581..4352f459b9 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -252,7 +252,7 @@ ElementBinary::ElementBinary( inputs.first, inputs.second, params.inplace_a, - name) {} + params.name) {} void ElementBinary::map_output_tensors(FFModel &ff) { if (has_inplace_output()) { @@ -1128,6 +1128,8 @@ void ElementBinary::serialize(Legion::Serializer &sez) const { 
sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); sez.serialize(this->inplace_a); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -1146,11 +1148,16 @@ Node ElementBinary::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(op_type); dez.deserialize(inplace_a); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ElementBinaryParams params; params.layer_guid = layer_guid; params.type = op_type; params.inplace_a = inplace_a; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 9fb2e6dc1f..0e1d115557 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -212,7 +212,7 @@ ElementUnary::ElementUnary(FFModel &model, params.op_type, input, params.inplace, - name, + params.name, params.scalar) {} void ElementUnary::map_output_tensors(FFModel &ff) { @@ -557,7 +557,7 @@ void ElementUnary::forward_task_with_type( assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - std::vector output_accessors; + std::vector output_accessors; if (m->inplace) { GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -723,6 +723,8 @@ void ElementUnary::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } bool ElementUnary::measure_operator_cost(Simulator *sim, @@ -837,6 +839,10 @@ Node ElementUnary::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); ElementUnaryParams params; @@ -844,6 +850,7 @@ Node ElementUnary::deserialize(FFModel &ff, params.inplace = inplace; params.scalar = scalar; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 40d5b600be..e630563b63 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -243,7 +243,7 @@ Embedding::Embedding(FFModel &model, params.aggr, allocate_weights, params.data_type, - name) {} + params.name) {} Embedding::Embedding(FFModel &model, Embedding const &other, diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 6a7d622e51..8c66f9c7bc 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -260,7 +260,7 @@ Experts::Experts(FFModel &model, params.use_bias, params.activation, allocate_weights, - name) {} + params.name) {} Experts::Experts(FFModel &model, LayerID const &_layer_guid, @@ -407,6 +407,8 @@ void Experts::serialize(Legion::Serializer &sez) const { sez.serialize(params.experts_internal_dim_size); sez.serialize(params.use_bias); sez.serialize(params.activation); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -432,6 +434,10 @@ Node Experts::deserialize(FFModel &ff, dez.deserialize(experts_internal_dim_size); dez.deserialize(use_bias); 
dez.deserialize(activation); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); assert(num_inputs == 3); @@ -445,6 +451,7 @@ Node Experts::deserialize(FFModel &ff, params.experts_internal_dim_size = experts_internal_dim_size; params.use_bias = use_bias; params.activation = activation; + strcpy(params.name, name); return ff.get_or_create_node(inputs, params); } diff --git a/src/ops/flat.cc b/src/ops/flat.cc index 669c457709..80aedbbb31 100644 --- a/src/ops/flat.cc +++ b/src/ops/flat.cc @@ -16,6 +16,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/model.h" #include "flexflow/ops/kernels/flat_kernels.h" +#include "legion/legion_utilities.h" namespace FlexFlow { @@ -317,6 +318,8 @@ Domain Flat::get_input_tensor_shape(ParallelConfig const &pc, } void Flat::serialize(Legion::Serializer &sez) const { + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); return; } @@ -391,6 +394,10 @@ Node Flat::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 1); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); return ff.get_or_create_node(inputs[0], {}); } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c6ba0b04c5..483028599e 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -537,7 +537,7 @@ __host__ void Context ctx, Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); @@ -1097,7 +1097,7 @@ __host__ void if (metas->meta[op]->inference_debugging) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; + std::vector output_accessors_to_save; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { @@ -1114,8 +1114,7 @@ __host__ void weight_accessor[fused->op_weight_idx[i + woff]]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { - int my_off = fused->op_output_idx[i + ooff]; - output_accessors_to_save.push_back(output_accessor[my_off]); + output_accessors_to_save.push_back(output_accessor[i + ooff]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/gather.cc b/src/ops/gather.cc index d7c1dee44c..85580ed803 100644 --- a/src/ops/gather.cc +++ b/src/ops/gather.cc @@ -125,7 +125,7 @@ Gather::Gather(FFModel &model, inputs.first, inputs.second, params.legion_dim, - name) {} + params.name) {} Gather::Gather(FFModel &model, LayerID const &_layer_guid, @@ -168,6 +168,8 @@ void Gather::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -183,11 +185,16 @@ Node Gather::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, 
deserialized_model_id); GatherParams params; params.legion_dim = legion_dim; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 50871983f5..f2f402737c 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -99,6 +99,9 @@ Group_byParams Group_by::get_params() const { Group_byParams params; params.n = this->n; params.alpha = this->alpha; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -161,8 +164,12 @@ Group_by::Group_by(FFModel &model, Group_byParams const ¶ms, std::pair const &inputs, char const *name) - : Group_by( - model, inputs.first, inputs.second, params.n, params.alpha, name) {} + : Group_by(model, + inputs.first, + inputs.second, + params.n, + params.alpha, + params.name) {} void Group_by::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -396,7 +403,7 @@ void Group_by::forward_task(Task const *task, // Create a vector of n outputs, where n is the number of experts. // Each entry in the "outputs" vector points to the Legion tensor that will // contain the tockens dispatched to the corresponding expert - std::vector output_accessors; + std::vector output_accessors; float *outputs[n]; for (int i = 0; i < n; i++) { GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( @@ -522,6 +529,8 @@ void Group_by::backward_task(Task const *task, void Group_by::serialize(Legion::Serializer &sez) const { sez.serialize(this->n); sez.serialize(this->alpha); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node Group_by::deserialize(FFModel &ff, @@ -533,9 +542,14 @@ Node Group_by::deserialize(FFModel &ff, float alpha; dez.deserialize(n); dez.deserialize(alpha); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); Group_byParams params; params.n = n; params.alpha = alpha; + strcpy(params.name, name); return ff.get_or_create_node(std::make_pair(inputs[0], inputs[1]), params); } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8a3e9c96b1..7aa3503770 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -565,7 +565,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.quantization_type, params.offload, params.tensor_parallelism_degree, - name) {} + params.name) {} void IncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -929,6 +929,9 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index bc1358e49c..2218ffe392 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -56,6 +56,9 @@ LayerNormParams LayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -198,7 +201,7 @@ LayerNorm::LayerNorm(FFModel &model, params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} LayerNorm::LayerNorm(FFModel &model, LayerID const &_layer_guid, @@ -883,6 +886,8 @@ void 
LayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -911,6 +916,10 @@ Node LayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerNormParams params; params.layer_guid = layer_guid; @@ -918,6 +927,7 @@ Node LayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 6ca6038778..03c9e48af8 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -190,7 +190,7 @@ Linear::Linear(FFModel &model, params.quantization_type, params.offload, allocate_weights, - name) {} + params.name) {} Linear::Linear(FFModel &model, LayerID const &_layer_guid, @@ -1258,6 +1258,8 @@ void Linear::serialize(Legion::Serializer &sez) const { sez.serialize(this->data_type); sez.serialize(this->quantization_type); sez.serialize(this->offload); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } /* static */ @@ -1288,6 +1290,10 @@ Node Linear::deserialize(FFModel &ff, dez.deserialize(data_type); dez.deserialize(quantization_type); dez.deserialize(offload); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LinearParams params; params.activation = activation; @@ -1299,6 +1305,7 @@ Node Linear::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.quantization_type = quantization_type; params.offload = offload; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } @@ -1313,6 +1320,9 @@ LinearParams Linear::get_params() const { params.kernel_reg_lambda = this->kernel_reg_lambda; params.quantization_type = this->quantization_type; params.offload = this->offload; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index e358448ddf..4621ab5909 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -269,7 +269,7 @@ Pool2D::Pool2D(FFModel &model, params.padding_w, params.pool_type, params.activation, - name) {} + params.name) {} void Pool2D::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -521,6 +521,8 @@ void Pool2D::serialize(Legion::Serializer &sez) const { sez.serialize(this->padding_w); sez.serialize(this->pool_type); sez.serialize(this->activation); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } bool Pool2D::measure_operator_cost(Simulator *sim, @@ -657,6 +659,10 @@ Node Pool2D::deserialize(FFModel &ff, dez.deserialize(padding_w); dez.deserialize(pool_type); dez.deserialize(activation); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); Pool2DParams params; params.kernel_h = kernel_h; @@ -667,6 +673,7 @@ Node Pool2D::deserialize(FFModel &ff, params.padding_w = padding_w; params.pool_type = pool_type; params.activation = activation; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/reduce.cc b/src/ops/reduce.cc index 
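Editor's note: a second recurring change in these .cc files is in the params-based constructors. Where they previously forwarded the incoming name argument, they now forward params.name, so an operator rebuilt from its params (for example after graph deserialization) keeps the name it was originally created with. A toy sketch of that delegation, independent of FlexFlow's actual classes:

#include <cstdio>
#include <cstring>

constexpr int MAX_OPNAME = 128; // assumed value, as in the sketches above

struct ToyParams {
  float rate;
  char name[MAX_OPNAME];
};

class ToyOp {
public:
  // Primary constructor: the one that actually records the name.
  ToyOp(float rate, char const *name) : rate_(rate) {
    std::strncpy(name_, name ? name : "", MAX_OPNAME - 1);
    name_[MAX_OPNAME - 1] = '\0';
  }
  // Params-based constructor: forwards params.name instead of a separately
  // supplied name, mirroring the "- name) {}" to "+ params.name) {}" hunks.
  explicit ToyOp(ToyParams const &params) : ToyOp(params.rate, params.name) {}

  ToyParams get_params() const {
    ToyParams p{rate_, {0}};
    std::strcpy(p.name, name_);
    return p;
  }

private:
  float rate_;
  char name_[MAX_OPNAME];
};

int main() {
  ToyOp original(0.5f, "dropout_2");
  ToyOp rebuilt(original.get_params()); // the params round trip keeps the name
  std::printf("%s\n", rebuilt.get_params().name);
  return 0;
}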
7a443e6ad0..454a35caf4 100644 --- a/src/ops/reduce.cc +++ b/src/ops/reduce.cc @@ -41,6 +41,9 @@ ReduceParams Reduce::get_params() const { } params.keepdims = keepdims; params.layer_guid = this->layer_guid; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -110,9 +113,12 @@ Reduce::Reduce(FFModel &model, ReduceParams const ¶ms, const ParallelTensor input, char const *name) - : Reduce( - model, params.layer_guid, input, params.axes, params.keepdims, name) { -} + : Reduce(model, + params.layer_guid, + input, + params.axes, + params.keepdims, + params.name) {} Reduce::Reduce(FFModel &model, LayerID const &_layer_guid, @@ -378,6 +384,8 @@ void Reduce::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -400,6 +408,10 @@ Node Reduce::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); return ff.get_or_create_node(inputs[0], {axes, keepdims, layer_guid}); diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 45da190680..49f99e2cb5 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -140,7 +140,7 @@ Reshape::Reshape(FFModel &model, ReshapeParams const ¶ms, const ParallelTensor input, char const *name) - : Reshape(model, params.layer_guid, input, params.shape, name) {} + : Reshape(model, params.layer_guid, input, params.shape, params.name) {} void Reshape::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -296,6 +296,9 @@ ReshapeParams Reshape::get_params() const { ReshapeParams params; params.shape = shape_vec; params.layer_guid = this->layer_guid; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -414,6 +417,8 @@ void Reshape::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -435,11 +440,16 @@ Node Reshape::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); ReshapeParams params; params.shape = shape; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 7de40fb389..ed9252c309 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -63,6 +63,9 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -228,7 +231,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} 
ResidualLayerNorm::ResidualLayerNorm(FFModel &model, LayerID const &_layer_guid, @@ -779,6 +782,8 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -808,6 +813,10 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); if (use_two_residuals) { assert(num_inputs == 3); } else { @@ -821,6 +830,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = use_bias; params.use_two_residuals = use_two_residuals; + strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( {inputs[0], inputs[1], inputs[2]}, params); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index b447a2a3b5..f4f5bb72d0 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -55,6 +55,9 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -141,7 +144,7 @@ ResidualRMSNorm::ResidualRMSNorm( params.eps, params.dim, allocate_weights, - name) {} + params.name) {} ResidualRMSNorm::ResidualRMSNorm( FFModel &model, @@ -459,6 +462,8 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -477,10 +482,15 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ResidualRMSNormParams params; params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 2a34f83be2..bf07ee6bb0 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -53,6 +53,9 @@ RMSNormParams RMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -129,7 +132,7 @@ RMSNorm::RMSNorm(FFModel &model, params.eps, params.dim, allocate_weights, - name) {} + params.name) {} RMSNorm::RMSNorm(FFModel &model, RMSNorm const &other, @@ -437,6 +440,8 @@ void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -456,10 +461,15 @@ Node RMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); RMSNormParams params; params.layer_guid = layer_guid; params.eps = 
eps; params.dim = dim; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 463b15aadb..9fc2316f9a 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -88,6 +88,9 @@ Op *Sampling::create_operator_from_layer( SamplingParams Sampling::get_params() const { SamplingParams params; params.top_p = this->top_p; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -136,7 +139,7 @@ Sampling::Sampling(FFModel &model, SamplingParams const ¶ms, const ParallelTensor input, char const *name) - : Sampling(model, input, params.top_p, name) {} + : Sampling(model, input, params.top_p, params.name) {} void Sampling::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -325,6 +328,8 @@ void Sampling::backward(FFModel const &ff) { void Sampling::serialize(Legion::Serializer &sez) const { sez.serialize(this->top_p); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node Sampling::deserialize(FFModel &ff, @@ -334,8 +339,13 @@ Node Sampling::deserialize(FFModel &ff, assert(num_inputs == 1); float top_p; dez.deserialize(top_p); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); SamplingParams params; params.top_p = top_p; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3b2ed7cef4..3ddd6b8d6e 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -52,6 +52,9 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -110,7 +113,7 @@ SigmoidSiluMulti::SigmoidSiluMulti( std::pair const &inputs, char const *name) : SigmoidSiluMulti( - model, params.layer_guid, inputs.first, inputs.second, name) {} + model, params.layer_guid, inputs.first, inputs.second, params.name) {} SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, LayerID const &_layer_guid, @@ -366,6 +369,8 @@ void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -379,10 +384,15 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); SigmoidSiluMultiParams params; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index ba0a1288d6..03618423be 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -17,6 +17,7 @@ #include "flexflow/model.h" #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" namespace FlexFlow { // declare Legion names @@ -39,7 +40,42 @@ using namespace FlexFlow::Kernels::Softmax; /* Params */ bool operator==(SoftmaxParams const &lhs, 
SoftmaxParams const &rhs) { - return lhs.dim == rhs.dim; + return lhs.layer_guid == rhs.layer_guid && lhs.dim == rhs.dim; +} + +void Softmax::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); + sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); +} + +using PCG::Node; +/*static*/ +Node Softmax::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 1); + size_t id, transformer_layer_id, deserialized_model_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + int dim; + dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + + SoftmaxParams params; + params.layer_guid = layer_guid; + params.dim = dim; + strcpy(params.name, name); + return ff.get_or_create_node(inputs[0], params); } bool SoftmaxParams::is_valid(ParallelTensorShape const &input) const { @@ -48,7 +84,11 @@ bool SoftmaxParams::is_valid(ParallelTensorShape const &input) const { SoftmaxParams Softmax::get_params() const { SoftmaxParams params; + params.layer_guid = this->layer_guid; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -87,12 +127,14 @@ Op *Softmax::create_operator_from_layer( layer->get_int_property("softmax_dim", value); int dim = (int)value; return new Softmax(model, + layer->layer_guid, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, layer->name); } Softmax::Softmax(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor _input, int _dim, char const *name) @@ -107,6 +149,7 @@ Softmax::Softmax(FFModel &model, dim(_dim) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); + layer_guid = _layer_guid; ParallelDim dims[MAX_TENSOR_DIM]; int numdim = _input->num_dims; for (int i = 0; i < numdim; i++) { @@ -119,7 +162,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, name) {} + : Softmax(model, params.layer_guid, input, params.dim, params.name) {} void Softmax::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -518,6 +561,7 @@ namespace std { size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; + hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); return key; } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 5d234df822..9c6ed0e0b6 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -511,7 +511,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.qk_prod_scaling, params.position_bias, allocate_weights, - name) {} + params.name) {} void SpecIncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -853,6 +853,9 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/split.cc b/src/ops/split.cc 
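Editor's note: the softmax.cc change goes beyond the name plumbing. SoftmaxParams now carries the layer_guid, which participates in operator== and in the std::hash specialization, evidently so that node caching via get_or_create_node no longer conflates two distinct softmax layers that share the same dim. A small sketch of the equality-plus-hash pattern, with a simplified GUID type standing in for FlexFlow's LayerID:

#include <cstddef>
#include <cstdio>
#include <functional>

// Simplified stand-ins; FlexFlow's LayerID and hash_combine live elsewhere.
struct FakeLayerID { size_t id; };
bool operator==(FakeLayerID const &a, FakeLayerID const &b) { return a.id == b.id; }

struct FakeSoftmaxParams {
  FakeLayerID layer_guid;
  int dim;
};

bool operator==(FakeSoftmaxParams const &lhs, FakeSoftmaxParams const &rhs) {
  // Both the layer identity and the softmax dimension must match.
  return lhs.layer_guid == rhs.layer_guid && lhs.dim == rhs.dim;
}

// Boost-style hash_combine, matching the idiom used in the hash specialization.
inline void hash_combine(size_t &seed, size_t v) {
  seed ^= std::hash<size_t>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

size_t hash_params(FakeSoftmaxParams const &p) {
  size_t key = 0;
  hash_combine(key, p.layer_guid.id);
  hash_combine(key, static_cast<size_t>(p.dim));
  return key;
}

int main() {
  FakeSoftmaxParams a{{7}, 0}, b{{8}, 0};
  std::printf("equal=%d, hashes %zu vs %zu\n", a == b, hash_params(a), hash_params(b));
  return 0;
}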
index 9298850a99..7c6b631b20 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -50,6 +50,9 @@ SplitParams Split::get_params() const { SplitParams params; params.splits = this->splits; params.legion_axis = this->legion_axis; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -137,7 +140,7 @@ Split::Split(FFModel &model, SplitParams const ¶ms, const ParallelTensor input, char const *name) - : Split(model, input, params.splits, params.legion_axis, name) {} + : Split(model, input, params.splits, params.legion_axis, params.name) {} void Split::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); diff --git a/src/ops/topk.cc b/src/ops/topk.cc index b38ff85f90..7d30a8aff3 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -87,6 +87,9 @@ TopKParams TopK::get_params() const { TopKParams params; params.k = this->k; params.sorted = this->sorted; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -134,7 +137,7 @@ TopK::TopK(FFModel &model, TopKParams const ¶ms, const ParallelTensor input, char const *name) - : TopK(model, input, params.k, params.sorted, name) {} + : TopK(model, input, params.k, params.sorted, params.name) {} void TopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -426,6 +429,8 @@ void TopK::backward_task(Task const *task, void TopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->k); sez.serialize(this->sorted); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node TopK::deserialize(FFModel &ff, @@ -437,9 +442,14 @@ Node TopK::deserialize(FFModel &ff, bool sorted; dez.deserialize(k); dez.deserialize(sorted); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); TopKParams params; params.k = k; params.sorted = sorted; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 500b7867af..7a179c4f7d 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -51,6 +51,9 @@ TransposeParams Transpose::get_params() const { for (int i = 0; i < outputs[0]->num_dims; i++) { params.perm.push_back(this->perm[i]); } + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -98,7 +101,7 @@ Transpose::Transpose(FFModel &model, TransposeParams const ¶ms, const ParallelTensor input, char const *name) - : Transpose(model, input, params.perm, name) {} + : Transpose(model, input, params.perm, params.name) {} Transpose::Transpose(FFModel &model, const ParallelTensor input, @@ -383,6 +386,8 @@ void Transpose::serialize(Legion::Serializer &sez) const { for (size_t i = 0; i < params.perm.size(); i++) { sez.serialize(params.perm[i]); } + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -399,6 +404,10 @@ Node Transpose::deserialize(FFModel &ff, dez.deserialize(dim_idx); perm.push_back(dim_idx); } + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); return ff.get_or_create_node(inputs[0], {perm}); } diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d5a8a1063d..d0efb01d54 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -562,7 +562,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( 
params.quantization_type, params.offload, params.tensor_parallelism_degree, - name) {} + params.name) {} void TreeIncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -927,6 +927,9 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 027d15c929..5d38e28903 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -55,6 +55,9 @@ bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { AllReduceParams AllReduce::get_params() const { AllReduceParams params; params.allreduce_legion_dim = this->allreduce_dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -79,7 +82,7 @@ AllReduce::AllReduce(FFModel &model, AllReduceParams const ¶ms, ParallelTensor const input, char const *name) - : AllReduce(model, input, params.allreduce_legion_dim, name) {} + : AllReduce(model, input, params.allreduce_legion_dim, params.name) {} void AllReduce::create_input_partition(FFModel &ff) { // Do nothing diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 7c266c5392..acc5c414c7 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -58,6 +58,9 @@ CombineParams Combine::get_params() const { CombineParams params; params.combine_legion_dim = this->combine_dim; params.combine_degree = this->combine_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -69,7 +72,7 @@ Combine::Combine(FFModel &model, input, params.combine_legion_dim, params.combine_degree, - name) {} + params.name) {} Combine::Combine(FFModel &model, const ParallelTensor _input, diff --git a/src/parallel_ops/fused_parallel_op.cc b/src/parallel_ops/fused_parallel_op.cc index c0a97bdda1..1a76cbfc40 100644 --- a/src/parallel_ops/fused_parallel_op.cc +++ b/src/parallel_ops/fused_parallel_op.cc @@ -59,6 +59,9 @@ FusedParallelOpParams FusedParallelOp::get_params() const { std::vector ops(std::begin(this->parallel_ops), std::end(this->parallel_ops)); params.parallel_ops = ops; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 353b3ce398..e6ab09d088 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -60,6 +60,9 @@ RepartitionParams Repartition::get_params() const { RepartitionParams params; params.repartition_legion_dim = this->repartition_dim; params.repartition_degree = this->repartition_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -92,7 +95,7 @@ Repartition::Repartition(FFModel &model, input, params.repartition_legion_dim, params.repartition_degree, - name) {} + params.name) {} OpMeta *Repartition::init_task(Task const *task, std::vector const ®ions, diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5dca591328..5ca2b1301c 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -56,6 +56,9 @@ ReductionParams Reduction::get_params() const { ReductionParams params; params.reduction_legion_dim = this->reduction_dim; params.reduction_degree = this->reduction_degree; + if (this->name != nullptr) { + strcpy(params.name, 
this->name); + } return params; } @@ -89,7 +92,7 @@ Reduction::Reduction(FFModel &model, input, params.reduction_legion_dim, params.reduction_degree, - name) {} + params.name) {} void Reduction::create_input_partition(FFModel &ff) { assert(outputs[0]->part != LogicalPartition::NO_PART); diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 20face74e8..ba7bb6677f 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -55,6 +55,9 @@ ReplicateParams Replicate::get_params() const { ReplicateParams params; params.replicate_legion_dim = this->replicate_dim; params.replicate_degree = this->replicate_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -88,7 +91,7 @@ Replicate::Replicate(FFModel &model, input, params.replicate_legion_dim, params.replicate_degree, - name) {} + params.name) {} void Replicate::create_input_partition(FFModel &ff) { assert(outputs[0]->part != LogicalPartition::NO_PART); diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index fa6bf55fe5..57bc5a0458 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -287,7 +287,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", host_ptr[i]); + } } fclose(tensor_file); @@ -313,7 +317,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", (float)host_ptr[i]); + } } fclose(tensor_file); @@ -340,7 +348,11 @@ __host__ void save_tensor(int32_t const *ptr, tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%d, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%d, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%d", host_ptr[i]); + } } fclose(tensor_file); @@ -367,7 +379,11 @@ __host__ void save_tensor(int64_t const *ptr, tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%ld, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%ld, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%ld", host_ptr[i]); + } } fclose(tensor_file); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 6d33dd9f27..f8e8240ccf 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2276,6 +2276,8 @@ GraphOptimalViewSerialized case OP_CONCAT: { Concat *concat = (Concat *)op; sez.serialize(concat->legion_axis); + sez.serialize(strlen(concat->name)); + sez.serialize(concat->name, strlen(concat->name)); break; } case OP_SPLIT: { @@ -2285,6 +2287,8 @@ GraphOptimalViewSerialized for (int i = 0; i < split->numOutputs; i++) { sez.serialize(split->outputs[i]->dims[split->legion_axis].size); } + sez.serialize(strlen(split->name)); + sez.serialize(split->name, strlen(split->name)); break; } case OP_EMBEDDING: { @@ -2296,6 +2300,8 @@ GraphOptimalViewSerialized sez.serialize(embed->out_channels); sez.serialize(embed->aggr); sez.serialize(embed->data_type); + sez.serialize(strlen(embed->name)); + 
sez.serialize(embed->name, strlen(embed->name)); break; } case OP_MULTIHEAD_ATTENTION: { @@ -2311,6 +2317,8 @@ GraphOptimalViewSerialized sez.serialize(attn->bias); sez.serialize(attn->add_bias_kv); sez.serialize(attn->add_zero_attn); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2335,6 +2343,8 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2357,6 +2367,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); sez.serialize(attn->num_kv_heads); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2382,40 +2394,47 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); - break; - } - case OP_SOFTMAX: { - Softmax *softmax = (Softmax *)op; - sez.serialize(softmax->dim); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_REPARTITION: { Repartition *repart = (Repartition *)op; sez.serialize(repart->repartition_dim); sez.serialize(repart->repartition_degree); + sez.serialize(strlen(repart->name)); + sez.serialize(repart->name, strlen(repart->name)); break; } case OP_REPLICATE: { Replicate *replicate = (Replicate *)op; sez.serialize(replicate->replicate_dim); sez.serialize(replicate->replicate_degree); + sez.serialize(strlen(replicate->name)); + sez.serialize(replicate->name, strlen(replicate->name)); break; } case OP_REDUCTION: { Reduction *reduction = (Reduction *)op; sez.serialize(reduction->reduction_dim); sez.serialize(reduction->reduction_degree); + sez.serialize(strlen(reduction->name)); + sez.serialize(reduction->name, strlen(reduction->name)); break; } case OP_COMBINE: { Combine *combine = (Combine *)op; sez.serialize(combine->combine_dim); sez.serialize(combine->combine_degree); + sez.serialize(strlen(combine->name)); + sez.serialize(combine->name, strlen(combine->name)); break; } case OP_ALLREDUCE: { AllReduce *allreduce = (AllReduce *)op; sez.serialize(allreduce->allreduce_dim); + sez.serialize(strlen(allreduce->name)); + sez.serialize(allreduce->name, strlen(allreduce->name)); break; } case OP_FUSED_PARALLEL: { @@ -2424,6 +2443,8 @@ GraphOptimalViewSerialized for (int i = 0; i < fused->num_parallel_ops; i++) { sez.serialize(fused->parallel_ops[i]); } + sez.serialize(strlen(fused->name)); + sez.serialize(fused->name, strlen(fused->name)); break; } default: { @@ -2621,6 +2642,10 @@ void FFModel::deserialize_graph_optimal_view( case OP_CONCAT: { int legion_axis; dez.deserialize(legion_axis); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node( {std::begin(inputs), std::begin(inputs) + num_inputs}, {legion_axis}); @@ -2637,6 +2662,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(dim_size); splits.push_back(dim_size); } + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {splits, legion_axis}); break; } @@ -2654,6 +2683,10 @@ void FFModel::deserialize_graph_optimal_view( 
dez.deserialize(out_channels); dez.deserialize(aggr); dez.deserialize(data_type); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); EmbeddingParams params; params.aggr = aggr; @@ -2661,6 +2694,7 @@ void FFModel::deserialize_graph_optimal_view( params.out_channels = out_channels; params.layer_guid = layer_guid; params.data_type = data_type; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } @@ -2746,6 +2780,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(bias); dez.deserialize(add_bias_kv); dez.deserialize(add_zero_attn); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); MultiHeadAttentionParams params; params.embed_dim = embed_dim; @@ -2757,6 +2795,7 @@ void FFModel::deserialize_graph_optimal_view( params.add_bias_kv = add_bias_kv; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; + strcpy(params.name, name); node = get_or_create_node( {inputs[0], inputs[1], inputs[2]}, params); break; @@ -2791,6 +2830,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2811,6 +2854,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } @@ -2839,6 +2883,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); dez.deserialize(num_kv_heads); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); SpecIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2856,6 +2904,7 @@ void FFModel::deserialize_graph_optimal_view( params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; params.num_kv_heads = num_kv_heads; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; @@ -2890,6 +2939,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2910,6 +2963,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; @@ -2967,10 +3021,7 @@ void FFModel::deserialize_graph_optimal_view( break; } case OP_SOFTMAX: { - assert(num_inputs == 1); - int softmax_dim; - dez.deserialize(softmax_dim); - node = get_or_create_node(inputs[0], {softmax_dim}); + node = Softmax::deserialize(*this, dez, inputs, num_inputs); break; } case OP_TRANSPOSE: { @@ -2990,6 +3041,10 @@ void FFModel::deserialize_graph_optimal_view( int combine_dim, combine_degree; dez.deserialize(combine_dim); dez.deserialize(combine_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + 
dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {combine_dim, combine_degree}); break; @@ -2999,6 +3054,10 @@ void FFModel::deserialize_graph_optimal_view( int repartition_dim, repartition_degree; dez.deserialize(repartition_dim); dez.deserialize(repartition_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node( inputs[0], {repartition_dim, repartition_degree}); break; @@ -3008,6 +3067,10 @@ void FFModel::deserialize_graph_optimal_view( int replicate_dim, replicate_degree; dez.deserialize(replicate_dim); dez.deserialize(replicate_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {replicate_dim, replicate_degree}); break; @@ -3017,6 +3080,10 @@ void FFModel::deserialize_graph_optimal_view( int reduction_dim, reduction_degree; dez.deserialize(reduction_dim); dez.deserialize(reduction_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {reduction_dim, reduction_degree}); break; @@ -3025,6 +3092,10 @@ void FFModel::deserialize_graph_optimal_view( assert(num_inputs == 1); int allreduce_dim; dez.deserialize(allreduce_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {allreduce_dim}); break; } @@ -3038,6 +3109,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(info); parallel_ops.push_back(info); } + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {parallel_ops}); break; } diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index fb94135c8f..613df1cbcf 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -266,7 +266,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", host_ptr[i]); + } } fclose(tensor_file); @@ -292,7 +296,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", (float)host_ptr[i]); + } } fclose(tensor_file); @@ -319,7 +327,11 @@ __host__ void save_tensor(int32_t const *ptr, tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%d, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%d, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%d", host_ptr[i]); + } } fclose(tensor_file); @@ -346,7 +358,11 @@ __host__ void save_tensor(int64_t const *ptr, tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%ld, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%ld, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%ld", host_ptr[i]); + } } fclose(tensor_file); diff --git a/src/runtime/operator.cc 
b/src/runtime/operator.cc index 0b3813f41c..36ac02a3a3 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -25,105 +25,4 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } -/*static*/ -void Op::save_inference_tensors_to_file( - OpMeta *m, - int shard_id, - BatchConfig const *bc, - std::vector input_tensors, - std::vector weight_tensors, - std::vector output_tensors) { - - // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; - struct stat st = {0}; - if (stat(folder_path, &st) == -1) { - // Directory does not exist, create it - mkdir(folder_path, 0700); - } - // output base filepath, shared by all tensors from the same operator - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - m->op_name + "_shard-id_" + std::to_string(shard_id); - // save batch config, if passed - if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); - } - // save all inputs - for (int i = 0; i < input_tensors.size(); i++) { - std::string filename = base_filepath + "_input_" + std::to_string(i); - if (input_tensors[i].data_type == DT_FLOAT) { - save_tensor(input_tensors[i].get_float_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_HALF) { - save_tensor(input_tensors[i].get_half_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_INT32) { - save_tensor(input_tensors[i].get_int32_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_INT64) { - save_tensor(input_tensors[i].get_int64_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - // only dump the weights once - if (m->decoding_step == 0) { - for (int i = 0; i < weight_tensors.size(); i++) { - std::string filename = base_filepath + "_weight_" + std::to_string(i); - if (weight_tensors[i].data_type == DT_FLOAT) { - save_tensor(weight_tensors[i].get_float_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_HALF) { - save_tensor(weight_tensors[i].get_half_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_INT32) { - save_tensor(weight_tensors[i].get_int32_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_INT64) { - save_tensor(weight_tensors[i].get_int64_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - } - // save all outputs - for (int i = 0; i < output_tensors.size(); i++) { - std::string filename = base_filepath + "_output_" + std::to_string(i); - if (output_tensors[i].data_type == DT_FLOAT) { - save_tensor(output_tensors[i].get_float_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_HALF) { - save_tensor(output_tensors[i].get_half_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_INT32) { - save_tensor(output_tensors[i].get_int32_ptr(), - 
output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_INT64) { - save_tensor(output_tensors[i].get_int64_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - // increase count of decoding steps - m->decoding_step++; -} - }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index e8b986582f..c0804d6e19 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -958,8 +958,12 @@ bool GraphXfer::create_new_operator(OpX const *opx, Node &op) { } case OP_SOFTMAX: { int softmax_dim; + assert(opx->matchOpX != NULL); + assert(opx->matchOpX->mapOp.ptr != NULL); + Softmax *softmax = (Softmax *)opx->matchOpX->mapOp.ptr; assert(opx->get_pm_constraint(PM_SOFTMAX_DIM, softmax_dim)); - op = model->get_or_create_node(inputs[0], {softmax_dim}); + SoftmaxParams params = softmax->get_params(); + op = model->get_or_create_node(inputs[0], params); break; } case OP_REPARTITION: { @@ -3749,7 +3753,8 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax(*this, inputs[0], softmax->dim, NULL); + new_op = new Softmax( + *this, softmax->layer_guid, inputs[0], softmax->dim, NULL); break; } case OP_COMBINE: { diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 5b533bf3c0..6857b5cbc1 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -1,6 +1,7 @@ import argparse import json import os +import shutil import torch from transformers import ( AutoModelForCausalLM, @@ -9,7 +10,30 @@ LlamaTokenizer, GenerationConfig, ) - +######################### debugging helper functions ######################### +def pre_forward_hook(module, input): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("model.", "") + print( + f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" + ) + print("Pre-Input: ", input[0].shape) + torch.save( + input, f"./hf_tensors/decoding_step_{module.decoding_step}_{name}.input" + ) +def post_forward_hook(module, input, output): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("model.", "") + print( + f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" + ) + print("Post-Input/Output: ", input[0].shape, output[0].shape) + torch.save( + output, f"./hf_tensors/decoding_step_{module.decoding_step}_{name}.output" + ) + print("===") + module.decoding_step += 1 +############################################################################## def main(): # Change working dir to folder storing this script @@ -28,6 +52,11 @@ def main(): ) parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument("--gpu", action="store_true", help="Run on GPU") + parser.add_argument( + "--inference-debugging", + action="store_true", + help="Print debugging info and save hidden states/weights to file", + ) args = parser.parse_args() # Check if max-length is greater than 0 if args.max_length <= 0: @@ -64,6 +93,27 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(args.model_name) generation_config = GenerationConfig.from_pretrained(args.model_name) generation_config.do_sample = args.do_sample + 
################# debugging ################# + if args.inference_debugging: + # Print model and configs + print(hf_config) + print(model) + # Save weights to file + shutil.rmtree("./hf_tensors") + # Check that the output folder exists + os.makedirs("./hf_tensors", exist_ok=True) + # Save weights + for name, params in model.named_parameters(): + torch.save(params, f"./hf_tensors/{name}") + # params.detach().cpu().numpy().tofile(f"./hf_tensors/{name}") + # Register hooks to save per-op hidden states + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.decoding_step = 0 + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_pre_hook(pre_forward_hook) + layer.register_forward_hook(post_forward_hook) + ############################################### # Generate output with open(args.output_file, "w") as f: for i, prompt in enumerate(prompt_list): From 57d1883b5cef266371dd616f812abca44b37099d Mon Sep 17 00:00:00 2001 From: FelixBrakel Date: Sat, 20 Jan 2024 06:07:28 +0100 Subject: [PATCH 312/344] Fix incorrect innode being checked (#1273) Co-authored-by: Gabriele Oliaro --- python/flexflow/torch/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 65b1669e99..df4042748f 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -955,7 +955,7 @@ def is_left_scalar_op(node): if len(innodes) != 2: return False return type(innodes[0]) is float or \ - type(innodes[1]) is int + type(innodes[0]) is int @staticmethod def is_elemwise_op(node): From 317cffd82f2dc6559f3243217e617b110c90be05 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 26 Jan 2024 11:36:24 -0500 Subject: [PATCH 313/344] Bug fixes and update Legion version (#1259) * bug fixes and update Legion version * fix * bug fix * update legion * fix arithmetic error due to num_devices uninitialized * update legion version * update ci * fix * debugging ci * Revert "debugging ci" This reverts commit 0b3148ef6adfcb64935e6b1e83a88494910a7b22. 
--------- Co-authored-by: Gabriele Oliaro --- .github/workflows/gpu-ci.yml | 12 +++--- CMakeLists.txt | 8 ++-- cmake/pip_install/CMakeLists.txt | 4 +- deps/legion | 2 +- include/flexflow/mapper.h | 9 ++--- include/flexflow/model.h | 2 + include/flexflow/operator.h | 5 +++ include/flexflow/request_manager.h | 1 - src/mapper/mapper.cc | 47 ++++++++++------------- src/ops/linear.cc | 8 +--- src/runtime/inference_manager.cc | 30 +-------------- src/runtime/model.cc | 61 ++++++++++++++++++++++++++++++ 12 files changed, 109 insertions(+), 80 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 3901d6b5f7..48dcda157e 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -222,7 +222,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -243,7 +243,7 @@ jobs: - name: Build and Install FlexFlow run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON @@ -252,18 +252,18 @@ jobs: - name: Check FlexFlow Python interface (pip) run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests diff --git a/CMakeLists.txt b/CMakeLists.txt index acbe7e385f..43ce4f7044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,6 +413,7 @@ if(NOT BUILD_LEGION_ONLY) # python related if (FF_USE_PYTHON) + find_package(Python COMPONENTS Interpreter Development) # create flexflow_cffi_header.py add_custom_command(TARGET flexflow PRE_BUILD @@ -424,13 +425,13 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. 
add_custom_command(TARGET flexflow PRE_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Creating flexflow_python interpreter..." ) @@ -567,7 +568,8 @@ if(NOT BUILD_LEGION_ONLY) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + find_package(Python COMPONENTS Interpreter Development) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT FF_BUILD_FROM_PYPI) install( DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 7ce38c4abc..105133a310 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -1,10 +1,10 @@ # Use setup.py script to re-install the Python bindings library with the right library paths if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") endif() endif() diff --git a/deps/legion b/deps/legion index 626b55689c..24e8c45234 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h index 71be1892aa..e8337818ec 100644 --- a/include/flexflow/mapper.h +++ b/include/flexflow/mapper.h @@ -83,11 +83,10 @@ class FFMapper : public NullMapper { Task 
const &task, MapTaskInput const &input, MapTaskOutput &output); - virtual void map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output); + virtual void replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output); virtual void select_task_variant(const MapperContext ctx, Task const &task, SelectVariantInput const &input, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index dd6dc76b4d..95be9ab581 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -202,6 +202,7 @@ enum TaskIDs { // NCCL tasks NCCL_GETUNIQUEID_TASK_ID, NCCL_INIT_COMMS_TASK_ID, + NCCL_FINISH_COMMS_TASK_ID, // Search STRATEGY_SEARCH_TASK_ID, // Graph @@ -397,6 +398,7 @@ std::vector class FFModel { public: FFModel(FFConfig &config, bool cpu_offload = false); + ~FFModel(); static constexpr float PROPAGATION_CHANCE = 0.25; static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 73c2c3e092..1b19bdb82f 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -406,6 +406,11 @@ class Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + finish_nccl_comms_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); #endif protected: void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 50a51705cd..4763eb1ef3 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -55,7 +55,6 @@ class InferenceManager { public: std::unordered_map> tensor_buffer; std::unordered_map model_weights_loaders; - int num_devices; }; struct Request { diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index bc26a79d3e..d46bfc2877 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -661,44 +661,37 @@ void FFMapper::map_task(const MapperContext ctx, } // for idx } -void FFMapper::map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output) { +void FFMapper::replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); const Processor::Kind target_kind = task.target_proc.kind(); - VariantID chosen_variant; + VariantID vid; { std::vector variant_ids; - runtime->find_valid_variants( - ctx, task.task_id, variant_ids, task.target_proc.kind()); + runtime->find_valid_variants(ctx, task.task_id, variant_ids, target_kind); // Currently assume there is exactly one variant assert(variant_ids.size() == 1); - chosen_variant = variant_ids[0]; + output.chosen_variant = variant_ids[0]; } - std::vector const &all_procs = all_procs_by_kind(target_kind); - // Place on replicate on each node by default - output.task_mappings.resize(total_nodes, default_output); - // Assume default_output does not include any target_procs - assert(default_output.target_procs.size() == 0); - for (std::vector::const_iterator it = all_procs.begin(); - it != all_procs.end(); + output.target_processors.resize(total_nodes); + std::vector handled(total_nodes, false); 
+ size_t count = 0; + Machine::ProcessorQuery procs(machine); + procs.only_kind(target_kind); + for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); it++) { - AddressSpace space = it->address_space(); - assert(space < output.task_mappings.size()); - // Add *it as a target_proc if we haven't found one - if (output.task_mappings[space].target_procs.size() == 0) { - output.task_mappings[space].target_procs.push_back(*it); + const AddressSpace space = it->address_space(); + if (handled[space]) { + continue; } + output.target_processors[space] = *it; + handled[space] = true; + count++; } - output.control_replication_map.resize(total_nodes); - for (int idx = 0; idx < total_nodes; idx++) { - output.task_mappings[idx].chosen_variant = chosen_variant; - output.control_replication_map[idx] = - output.task_mappings[idx].target_procs[0]; - } + assert(count == total_nodes); } void FFMapper::select_task_variant(const MapperContext ctx, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 03c9e48af8..0c7a0f78fe 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,12 +467,8 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - // TensorAccessorW acc_kernel(regions[2], - // task->regions[2], - // FID_DATA, - // ctx, - // runtime, - // false /*readOutput*/); + TensorAccessorR acc_kernel( + regions[2], task->regions[2], FID_DATA, ctx, runtime); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 6588cbceeb..2a94df8b4d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,33 +28,7 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager() { -#ifdef DEADCODE - num_devices = ff_config.workersPerNode * ff_config.numNodes; - // Check parallelization degrees - assert(ff_config.data_parallelism_degree <= num_devices && - "Data parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.data_parallelism_degree == 0 && - "Number of available devices is not divisible by data parallelism " - "degree"); - assert(ff_config.tensor_parallelism_degree <= num_devices && - "Tensor parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.tensor_parallelism_degree == 0 && - "Number of available devices is not divisible by tensor parallelism " - "degree"); - assert(ff_config.pipeline_parallelism_degree <= num_devices && - "Pipeline parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.pipeline_parallelism_degree == 0 && - "Number of available devices is not divisible by pipeline parallelism " - "degree"); - assert(ff_config.data_parallelism_degree * - ff_config.tensor_parallelism_degree * - ff_config.pipeline_parallelism_degree == - num_devices && - "Product of data, tensor, and pipeline parallelism degrees does not " - "match the number of available devices"); -#endif -} +InferenceManager::InferenceManager() {} InferenceManager *inference_manager_singleton = nullptr; @@ -296,8 +270,6 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { void InferenceManager::init_operators_inference(FFModel *model) { for (int batch_index = 0; batch_index < model->config.data_parallelism_degree; batch_index++) { - int expert_device_index = 0; - int 
device_index = batch_index % num_devices; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c07c33efca..f9763627c8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -606,6 +606,15 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, // ncclComm, allRanks, myRank, ncclId); return ncclComm; } + +void Op::finish_nccl_comms_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ncclComm_t comm = *((ncclComm_t *)task->local_args); + checkNCCL(ncclCommFinalize(comm)); + checkNCCL(ncclCommDestroy(comm)); +} #endif /** @@ -1578,6 +1587,43 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +FFModel::~FFModel() { + // Destroy nccl communication groups +#ifdef FF_USE_NCCL + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } +#endif +} + void FFModel::clear_graph_search_cache() { this->graph_search->clear_cache(); this->search->clear_cache(); @@ -6853,6 +6899,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(NCCL_FINISH_COMMS_TASK_ID, + "NCCL Finish Communicators"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "NCCL Finish Communicators Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } #endif // Search { From d73bba1212be19dd8b07e0e8f591b6db2fe4189d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 11:41:51 -0500 Subject: [PATCH 314/344] Revert "Bug fixes and update Legion version" (#1286) --- .github/workflows/gpu-ci.yml | 12 +++--- CMakeLists.txt | 8 ++-- cmake/pip_install/CMakeLists.txt | 4 +- deps/legion | 2 +- include/flexflow/mapper.h | 9 +++-- include/flexflow/model.h | 2 - include/flexflow/operator.h | 5 --- include/flexflow/request_manager.h | 1 + src/mapper/mapper.cc | 47 +++++++++++++---------- src/ops/linear.cc | 8 +++- src/runtime/inference_manager.cc | 30 ++++++++++++++- src/runtime/model.cc | 61 ------------------------------ 12 files changed, 80 insertions(+), 109 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 48dcda157e..3901d6b5f7 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -222,7 +222,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: 
ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -243,7 +243,7 @@ jobs: - name: Build and Install FlexFlow run: | - export PATH=$CONDA_PREFIX/bin:$PATH + export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON @@ -252,18 +252,18 @@ jobs: - name: Check FlexFlow Python interface (pip) run: | - export PATH=$CONDA_PREFIX/bin:$PATH + export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests run: | - export PATH=$CONDA_PREFIX/bin:$PATH + export PATH=/opt/conda/bin:$PATH export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..acbe7e385f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,7 +413,6 @@ if(NOT BUILD_LEGION_ONLY) # python related if (FF_USE_PYTHON) - find_package(Python COMPONENTS Interpreter Development) # create flexflow_cffi_header.py add_custom_command(TARGET flexflow PRE_BUILD @@ -425,13 +424,13 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. add_custom_command(TARGET flexflow PRE_BUILD - COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Creating flexflow_python interpreter..." 
) @@ -568,8 +567,7 @@ if(NOT BUILD_LEGION_ONLY) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - find_package(Python COMPONENTS Interpreter Development) - execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT FF_BUILD_FROM_PYPI) install( DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 105133a310..7ce38c4abc 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -1,10 +1,10 @@ # Use setup.py script to re-install the Python bindings library with the right library paths if (FF_USE_PYTHON) - execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") endif() endif() diff --git a/deps/legion b/deps/legion index 24e8c45234..626b55689c 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 +Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h index e8337818ec..71be1892aa 100644 --- a/include/flexflow/mapper.h +++ b/include/flexflow/mapper.h @@ -83,10 +83,11 @@ class FFMapper : public NullMapper { Task const &task, MapTaskInput const &input, MapTaskOutput &output); - virtual void replicate_task(const MapperContext ctx, - Task const &task, - ReplicateTaskInput const &input, - ReplicateTaskOutput &output); + virtual void map_replicate_task(const MapperContext ctx, + Task const &task, + MapTaskInput const &input, + MapTaskOutput const &default_output, + 
MapReplicateTaskOutput &output); virtual void select_task_variant(const MapperContext ctx, Task const &task, SelectVariantInput const &input, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 95be9ab581..dd6dc76b4d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -202,7 +202,6 @@ enum TaskIDs { // NCCL tasks NCCL_GETUNIQUEID_TASK_ID, NCCL_INIT_COMMS_TASK_ID, - NCCL_FINISH_COMMS_TASK_ID, // Search STRATEGY_SEARCH_TASK_ID, // Graph @@ -398,7 +397,6 @@ std::vector class FFModel { public: FFModel(FFConfig &config, bool cpu_offload = false); - ~FFModel(); static constexpr float PROPAGATION_CHANCE = 0.25; static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b19bdb82f..73c2c3e092 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -406,11 +406,6 @@ class Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void - finish_nccl_comms_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); #endif protected: void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 4763eb1ef3..50a51705cd 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -55,6 +55,7 @@ class InferenceManager { public: std::unordered_map> tensor_buffer; std::unordered_map model_weights_loaders; + int num_devices; }; struct Request { diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d46bfc2877..bc26a79d3e 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -661,37 +661,44 @@ void FFMapper::map_task(const MapperContext ctx, } // for idx } -void FFMapper::replicate_task(const MapperContext ctx, - Task const &task, - ReplicateTaskInput const &input, - ReplicateTaskOutput &output) { +void FFMapper::map_replicate_task(const MapperContext ctx, + Task const &task, + MapTaskInput const &input, + MapTaskOutput const &default_output, + MapReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); const Processor::Kind target_kind = task.target_proc.kind(); - VariantID vid; + VariantID chosen_variant; { std::vector variant_ids; - runtime->find_valid_variants(ctx, task.task_id, variant_ids, target_kind); + runtime->find_valid_variants( + ctx, task.task_id, variant_ids, task.target_proc.kind()); // Currently assume there is exactly one variant assert(variant_ids.size() == 1); - output.chosen_variant = variant_ids[0]; + chosen_variant = variant_ids[0]; } - output.target_processors.resize(total_nodes); - std::vector handled(total_nodes, false); - size_t count = 0; - Machine::ProcessorQuery procs(machine); - procs.only_kind(target_kind); - for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); + std::vector const &all_procs = all_procs_by_kind(target_kind); + // Place on replicate on each node by default + output.task_mappings.resize(total_nodes, default_output); + // Assume default_output does not include any target_procs + assert(default_output.target_procs.size() == 0); + for (std::vector::const_iterator it = all_procs.begin(); + it != all_procs.end(); it++) { - const AddressSpace space = it->address_space(); - if (handled[space]) { - continue; + AddressSpace space = it->address_space(); + assert(space < 
output.task_mappings.size()); + // Add *it as a target_proc if we haven't found one + if (output.task_mappings[space].target_procs.size() == 0) { + output.task_mappings[space].target_procs.push_back(*it); } - output.target_processors[space] = *it; - handled[space] = true; - count++; } - assert(count == total_nodes); + output.control_replication_map.resize(total_nodes); + for (int idx = 0; idx < total_nodes; idx++) { + output.task_mappings[idx].chosen_variant = chosen_variant; + output.control_replication_map[idx] = + output.task_mappings[idx].target_procs[0]; + } } void FFMapper::select_task_variant(const MapperContext ctx, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 0c7a0f78fe..03c9e48af8 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,8 +467,12 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - TensorAccessorR acc_kernel( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + // TensorAccessorW acc_kernel(regions[2], + // task->regions[2], + // FID_DATA, + // ctx, + // runtime, + // false /*readOutput*/); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 2a94df8b4d..6588cbceeb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,7 +28,33 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager() {} +InferenceManager::InferenceManager() { +#ifdef DEADCODE + num_devices = ff_config.workersPerNode * ff_config.numNodes; + // Check parallelization degrees + assert(ff_config.data_parallelism_degree <= num_devices && + "Data parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.data_parallelism_degree == 0 && + "Number of available devices is not divisible by data parallelism " + "degree"); + assert(ff_config.tensor_parallelism_degree <= num_devices && + "Tensor parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.tensor_parallelism_degree == 0 && + "Number of available devices is not divisible by tensor parallelism " + "degree"); + assert(ff_config.pipeline_parallelism_degree <= num_devices && + "Pipeline parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.pipeline_parallelism_degree == 0 && + "Number of available devices is not divisible by pipeline parallelism " + "degree"); + assert(ff_config.data_parallelism_degree * + ff_config.tensor_parallelism_degree * + ff_config.pipeline_parallelism_degree == + num_devices && + "Product of data, tensor, and pipeline parallelism degrees does not " + "match the number of available devices"); +#endif +} InferenceManager *inference_manager_singleton = nullptr; @@ -270,6 +296,8 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { void InferenceManager::init_operators_inference(FFModel *model) { for (int batch_index = 0; batch_index < model->config.data_parallelism_degree; batch_index++) { + int expert_device_index = 0; + int device_index = batch_index % num_devices; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f9763627c8..c07c33efca 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -606,15 +606,6 @@ ncclComm_t 
Op::init_nccl_comms_task(Task const *task, // ncclComm, allRanks, myRank, ncclId); return ncclComm; } - -void Op::finish_nccl_comms_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - ncclComm_t comm = *((ncclComm_t *)task->local_args); - checkNCCL(ncclCommFinalize(comm)); - checkNCCL(ncclCommDestroy(comm)); -} #endif /** @@ -1587,43 +1578,6 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } -FFModel::~FFModel() { - // Destroy nccl communication groups -#ifdef FF_USE_NCCL - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - for (auto const &comm : view_hash_to_nccl_comms) { - // Find the machine view that has the hash - MachineView view; - for (size_t l = 0; l < operators.size(); l++) { - view = operators[l]->outputs[0]->machine_view; - if (view.hash() == comm.first) { - break; - } - } - assert(view.hash() == comm.first && "Cannot find the machine view"); - IndexSpace task_is = get_or_create_task_is(view); - Domain domain = runtime->get_index_space_domain(ctx, task_is); - ArgumentMap argmap; - int idx = 0; - for (Domain::DomainPointIterator it(domain); it; it++, idx++) { - argmap.set_point(*it, - TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); - } - IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, - task_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - comm.first); - FutureMap fm = runtime->execute_index_space(ctx, index_launcher); - fm.wait_all_results(); - } -#endif -} - void FFModel::clear_graph_search_cache() { this->graph_search->clear_cache(); this->search->clear_cache(); @@ -6899,21 +6853,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - { - TaskVariantRegistrar registrar(NCCL_FINISH_COMMS_TASK_ID, - "NCCL Finish Communicators"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "NCCL Finish Communicators Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant(registrar); - } - } #endif // Search { From abf9fb8889504a7bb526401dc9f027e2d4640334 Mon Sep 17 00:00:00 2001 From: April Yang <114364211+april-yyt@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:39:42 -0800 Subject: [PATCH 315/344] Chatbot with Gradio, FastApi Endpoint, Langchain Integration (#1246) * add a background server for RequestManager * . * make incr_decoding work * make spec_infer work * format * update python inference * fix python issues * bug fix * add a Legion future to capture the termination of the background server * gradio finished * chatbot gradio version 2 * chainlit1 * chainlit2 * fastapi done * fastapi incr_decoding * langchain example & wrapper class * langchain example & wrapper class1 * added documentation * entrypoint * del apikey * delete extra files * rag search fixed some bugs * fixed rag search issues * updates before rebase * minor changes * reorganize files * Add thread safety for background server. * Simplify backend server design. * resolve conflict. * specinfer usecases with issues labeled * specinfer usecases with issues labeled 2 * fixed issues with prompt template * fix issues with rag specinfer * Add server task timeout. 
* register callbacks to terminate background worker at exit or termination * [Python] enable decoding multiple requests * update README.md and default configuration * fix issues with gradio and prompt template * fix issues with rag * adjusted fastapi entrypoint * update documentation * resole conflicts * issues fix * adjustments on usecases and api entrypoints * remove redundent changes * testing CI * Enable backtrace * restore newlines * version * add back misdeleted line * legion verion --------- Co-authored-by: Zhihao Jia Co-authored-by: Gabriele Oliaro Co-authored-by: zwang86 <46699021+zwang86@users.noreply.github.com> Co-authored-by: Zeyu Wang Co-authored-by: xinhaoc --- SERVE.md | 3 - docs/source/chatbot.rst | 64 +++++ docs/source/imgs/gradio_api.png | Bin 0 -> 256263 bytes docs/source/imgs/gradio_interface.png | Bin 0 -> 331678 bytes docs/source/index.rst | 2 + docs/source/prompt_template.rst | 55 ++++ docs/source/rag.rst | 90 ++++++ docs/source/serve_api.rst | 7 + docs/source/serve_fastapi.rst | 106 +++++++ docs/source/serve_gradioapi.rst | 30 ++ docs/source/serve_usecases.rst | 8 + inference/.gitignore | 1 + inference/python/entrypoint/fastapi_incr.py | 162 +++++++++++ .../python/entrypoint/fastapi_specinfer.py | 202 +++++++++++++ inference/python/incr_decoding.py | 6 +- inference/python/spec_infer.py | 6 +- inference/python/usecases/gradio_incr.py | 162 +++++++++++ inference/python/usecases/gradio_specinfer.py | 205 ++++++++++++++ .../python/usecases/prompt_template_incr.py | 187 ++++++++++++ .../usecases/prompt_template_specinfer.py | 236 ++++++++++++++++ inference/python/usecases/rag_incr.py | 220 +++++++++++++++ inference/python/usecases/rag_specinfer.py | 266 ++++++++++++++++++ tests/training_tests.sh | 4 + 23 files changed, 2013 insertions(+), 9 deletions(-) create mode 100644 docs/source/chatbot.rst create mode 100644 docs/source/imgs/gradio_api.png create mode 100644 docs/source/imgs/gradio_interface.png create mode 100644 docs/source/prompt_template.rst create mode 100644 docs/source/rag.rst create mode 100644 docs/source/serve_api.rst create mode 100644 docs/source/serve_fastapi.rst create mode 100644 docs/source/serve_gradioapi.rst create mode 100644 docs/source/serve_usecases.rst create mode 100644 inference/python/entrypoint/fastapi_incr.py create mode 100644 inference/python/entrypoint/fastapi_specinfer.py create mode 100644 inference/python/usecases/gradio_incr.py create mode 100644 inference/python/usecases/gradio_specinfer.py create mode 100644 inference/python/usecases/prompt_template_incr.py create mode 100644 inference/python/usecases/prompt_template_specinfer.py create mode 100644 inference/python/usecases/rag_incr.py create mode 100644 inference/python/usecases/rag_specinfer.py diff --git a/SERVE.md b/SERVE.md index f6e34750cd..e64756e8f4 100644 --- a/SERVE.md +++ b/SERVE.md @@ -187,9 +187,6 @@ We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruct FlexFlow Serve is still under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. * AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs. -* Chatbot prompt templates and Multi-round conversations -* Support for FastAPI server -* Integration with LangChain for document question answering ## Acknowledgements This project is initiated by members from CMU, Stanford, and UCSD. 
We will be continuing developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as:
diff --git a/docs/source/chatbot.rst b/docs/source/chatbot.rst
new file mode 100644
index 0000000000..fc6f616fae
--- /dev/null
+++ b/docs/source/chatbot.rst
@@ -0,0 +1,64 @@
+:tocdepth: 1
+********
+Chatbot
+********
+
+The chatbot use case involves setting up a conversational AI model using FlexFlow Serve, capable of engaging in interactive dialogues with users.
+
+Requirements
+============
+
+- FlexFlow Serve setup with required configurations.
+- Gradio or any interactive interface tool.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+   Initialize FlexFlow Serve with desired configurations and specific LLM model.
+
+2. Gradio Interface Setup
+   Define a function for response generation based on user inputs. Setup Gradio Chat Interface for interaction.
+
+   .. code-block:: python
+
+      def generate_response(user_input):
+          result = llm.generate(user_input)
+          return result.output_text.decode('utf-8')
+
+
+3. Running the Interface
+   Launch the Gradio interface and interact with the model by entering text inputs.
+
+   .. image:: /imgs/gradio_interface.png
+      :alt: Gradio Chatbot Interface
+      :align: center
+
+4. Shutdown
+   Stop the FlexFlow server after interaction.
+
+Example
+=======
+
+Complete code example can be found here:
+
+1. `Chatbot Example with incremental decoding `__
+
+2. `Chatbot Example with speculative inference `__
+
+
+Example Implementation:
+
+   .. code-block:: python
+
+      import gradio as gr
+      import flexflow.serve as ff
+
+      ff.init(num_gpus=2, memory_per_gpu=14000, ...)
+
+      def generate_response(user_input):
+          result = llm.generate(user_input)
+          return result.output_text.decode('utf-8')
+
+      iface = gr.ChatInterface(fn=generate_response)
+      iface.launch()
\ No newline at end of file
diff --git a/docs/source/imgs/gradio_api.png b/docs/source/imgs/gradio_api.png
new file mode 100644
index 0000000000000000000000000000000000000000..7bf1b99a5e6448ddebb6f6c68932da548f12446e
GIT binary patch
literal 256263
[base85-encoded binary data for the new image docs/source/imgs/gradio_api.png omitted]
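[Editor's note, not part of the patch] The chatbot documentation above calls llm.generate(...) without showing where llm comes from or how the background server described in the commit message is started. The sketch below fills in that flow under explicit assumptions: only ff.init, llm.generate, result.output_text, and the Gradio ChatInterface pattern come from the documentation above; the ff.LLM constructor, GenerationConfig, the compile/start_server/stop_server calls, the model name, and the extra ff.init argument are assumptions that may differ from the actual FlexFlow Serve API. The (message, history) signature follows Gradio's ChatInterface contract.

   .. code-block:: python

      # Minimal end-to-end sketch of the Gradio chatbot use case; assumptions marked.
      import gradio as gr
      import flexflow.serve as ff

      # Runtime initialization as in the docs above; the zero-copy memory argument
      # is an assumption standing in for the elided "..." arguments.
      ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=30000)

      # Creating and starting the LLM background server: assumed calls, inferred from
      # the commit message ("add a background server for RequestManager").
      llm = ff.LLM("meta-llama/Llama-2-7b-hf")   # model name is illustrative
      llm.compile(ff.GenerationConfig())         # assumed call
      llm.start_server()                         # assumed call

      def generate_response(message, history):
          # llm.generate() and result.output_text are the calls shown in the docs above.
          result = llm.generate(message)
          return result.output_text.decode("utf-8")

      # Gradio chat interface, as in the documentation above.
      gr.ChatInterface(fn=generate_response).launch()

      llm.stop_server()                          # assumed call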
z1{6e3?6rSlJ<7qpEs!>bds~Q5gsVTrBEwLVvvpeELjNgs;j@;0 z;IkC^Evug#kCJGR7n=s6=OVqCX~gWA<6y*mC`k4XUr3@Q=kKX2m~AI?_lN6NyE_tW z_(V0C0OypJf4MZd1wBW2?&|y*ao5YnJPR{=nGQPpyZIr5^sS35P;}gMV&ItfpF7L{ z@FC$sFuB+1KM!w`Jw86(Nas-zkP780QBD?eD(9$QkPMHo=;fccVTKqu1@nHK>@`4o z!F``#;m{gdz;f#J3qi2clY%T^)U}2Tbj(RX_}O*gJ;V2xhsZ?7*3-&8{7InTBR!pZ zY&nLU{rzV&X!MUO`=37)Yi7TvB+?Vz5we;sPNY%R4V}u*BfJ)JjNI7XmUUIk5Ynid zGq=#Lv3zSP%sxw4y3|yw=Qzd3ZEFMT5!!!dykd8&?KZ zKFnI6`e&I;#N}W~E$NxTGZUG+;MBAN%|CB7@IwBPn6*s~E4gaQiFf(;IRk;qvt{nE zrjYqk3cJCWvV6~tO;5n>%1QdjeP(9n6UKRMwk(MegI3d(7d?F{;?YUC@OH2_uEl3q zgPnlYbW5IMrl2x9Rg_1HEbU&QkpG;H;OO_My(-s&vkiEolj%5@NimUM=Dk0izW=Pz zee91es~vXDa0cXlbC&nZec$6jHL3gd*<$uKYG>*A8jZ@RdSJvQuTzG5qF-GXqJQbl=*&>)i*rA~E zTBR&8eb7UMlLryFYK9N8`?&1AJ)>HIV$xcXTJgS!bL4;Sy!+F~muIRagt&9H>~RX5 z?UG4chQLIePq7JuY02J{HqzPmu?YwW0eobiu&|)l2pZ+%ww;@l-)()n&QgUANt^Ll z=!v|3pTw!J!-g8i$8^!oeieu_Zksd2#L6lQLLYf)*Eqb=Z=)%v`6uHkjQ_oX64EiE z=-0FE@k$%(#Nr8>2B1pi$4+A|3Pv|rR|CV%u&3hQH?1P)y;6yuM^I-2!_;~`C3D3a zHS{$KB7cxJOi~zuc>vLq)v34Ta@rn@Gi05NzDKh6oe}bnYfu^~QN~5RGgBF#?z41x zPD|HvW5Bb09mDR_P8}gpgh^%7@GafB-->@LTGu`nC_7DhWVn=}L`VQkkDxt&o-v@m z?Oed1YKWc6XU|3hIWV{Py`Fi?-5&JTY!7PTkCgd?llpyfQ^^a@g0cFm5n{-GnAd5d zhk0=T65gtdzCa;THP-h_ayuSbm$Mb5yM~aZ77;9YA!s~K&LG7bV)HIq3Ud@?#`#aR z9>@QRip3_BFWwMw4wtZ{upG~o5Gsc#J>s=|GEsnLn8?3=pT?DJE%oOY!MOe2%3tWM zU^cvCtcR8>?R?)V0ypytVkASWl_#DwU%ZjCh7a?}F`;NIsqi3Z(s=KTQ_ z=KYu)v==_(?lQE6?`(y@wwS=brqVReeD`l3B*TtR`JP?ZDy>J1`3K(yoXklce<3yC zyCc|XM>k`{uT__PZ}iz-#0BXiGpe0nzXBKzR7H~ge_%nUtes`&Kr zV<6^zBbvf%^?^L&+o@d|1(|%BD@|@=N|RqmM=|Gfh@z6Jh;8^FmvPNk4OHtE`oFKj ze-1@93~#cp6QgIoTtC*vG#qzY-%a0Yxh>G+du`(m=_c=OZv36kWWFOA?7rxi(!k}O zln<@Kaq^YYQrTK`>IFV7bdwR+#07M2jW;-|S&dd0A3D3%8sM=?WyjriCX32bNb2nj zJ{hXvb~1yd&sJHk2FUMa*!{x`0dMtGAzPV}NSy)`MGvgSpHI*K{1vdjC)9;txCkQZ z@Bi%Q_q%g12IH?!{k;iVK!o`Hye@ETNB+%P{r~^5 z?M$Ss{=ZoO|L}|dzavZ{@&DQ*Y?Szv`ibSV9zj~|7ZxSLTmW{CXTOr_8yYI>vFG|X zUZbK4b0C>382|~r--8KFe_Y>BK_Cp@9U~SJ59+3=4Yz@&q({78rftJ>r}k`fjX^m& zSOdZT!g-Z8VJc8?#m(|Wn8_Te10{J@8Es=i`huhR`nXo}pdll5|Gcdsts2X3&$O#% zuItqSY|X;{zO-<{o`f>y9}e{Yx+2wT%$F};vT$Xv-lgJyezU2NBNH`*cX4-Lde`#y z#LcySc!5&UY=mA|H3SIU86iedwZd|*jGerogp`~Km*#?>f98J=4Eg&P`AKLHZlv8i zO5^AF@+*pJUM+Djjonu<)3Si4aL-B8^~R;#A-h7GP`i1;^Sc_^4jz_ong^qE0>-0% z6Je$}vU7-et)~~FC9>5hekWF2D3qp);V=0NurfK3<(Nc|OLG_zR;9x#<|P#TQc=3% zoxM2jzwof7rG8r#x;2(oGIW+7oMz%d57i37arYEg{ghQk%}_AcT>s0H?SX|xr}yWr z>+b(1%05-f_Bcs0zY56z&PW`2_B;2pGQsx}`=_!&VtWv)!5_V z_U#b2sp3uGfurSJI^%VgciGrdSbZTMx^>Q?KfU@O0FcJkgvDJ<3y3DDOZQHEX6~sEjwE^Uj_BMnXgP|SUWcU zH^&nVb&Uu3&HnV4)uyYR!Q_qN(XajrC_mQzPEktRuJ*=@HQ23YDvZn8rZqO#fYBL` zftugSSRSZMBuRW&AerNTDIH|D+Jgl=jEDupObtgWc9vbP=C@l@xK~)cKGc_d>@%; zgq)vu%}}~qJjb_J^0D^!%dc^HS&4n=7&Y+15gno^Yf422Wtp?@B)4j@0wzx>#KvLx zR%eY6>zh3jqyF;cf;fw5gk(Nr)N)!AJC(jg#$KSll8CHd>YFU0rn?3C2DF00*-hkK&C z$8Ph+!YFZa32xsn=kJY3e*0#Hd+XbUdVsi;u>_Mp9h6=j(+Di@mPQdRzi>ZY+{Kk^ z7^R4J2=0=8ZJ2YkdKGAyu1okcO{o#M=~%q|8;!q2!Ly5l`eoCl77xybcLovRLcmSt zM0D@jP#V&#WJK!0PkkQ$7470<>hM+!3qjqbHq)rjkI4tvpLa?~_JG6|HcxUGaZICC znCDDrx{2g|HU{~IxcfQ@EcCxIiaKf3jE9J=}h!Ul@P2Nd;_Y&j6rLNmlj|-tpqX&SWqYKlw(SO?fj!}sk&tYOh-?WF zJ_R#YiMM?}%KureT+<*WOZ$Y4Efy$__!Uz;Din-G{y~A}^{PaF0Q29Bps*R!cCIEC z7;isIHCnK&|7*_c{?eT?dx3;cz6P1|zcl0-UJ07OMsBZUh_)~~i=wx*Kufn1?q@X! zP-Imuk2boz@cwY)DYc1RxuVrlo-g~lK8OH6mEY>|%f^=gH^|Gz$r=4i!zilLpCCJ! 
z_?rWL=@0I|g^NjP|CScL-MQMdjjgTJ>w30(hw+o=1_9k3u8f2YB0eeDxUbq_VIW_r ze6pr2Z58RfBt7CWXp{|K^M@n+F!YXBGBur8>EOkTcFqwx^*ld61h523pw@m085yy|f91gL< zWB=>o6tqYqH5An0-WX&Q`@*p=7$5x>^y8I#kiVsG zQLRkWv9xUX6Hd+~tBL$nnmMOSpxta!AcFeUszgvA`CrXX0h2{;u~Lnd?b)+PWwmTX z_N5*2om}v3{7Y{)903;pdf3gK&H0?Lb1hlU1shjK7+>X0B&w;J_4Zey1HW6GOuJKD zB8P_5hVOFghv3gI?Ic67ljFti1FX9TK)soy2-s&TrV5VT+Q~?62aYzVy7wM1K1WdN zZy3T~h$&m8*6m|2m*VICpnb0Wr_O3xx6O#KVwU_v3GY@OqoFI>ezsJ;RduG|B`PwI^lU%D; zq}na+m60^U)9KNj{_lp0Ke7g6X)r6a|J9w>Vw%6hjL(2AwKmI1Z1)2b2K1%^^^98vVThkC2UC! z32^!6EE&CC{P>JDcHOAtu>I9yg@>hybWPt%VuO?P_+kk?L*%Ba(_Tqa+3n>T2AYul z1=7DkTQ^AERA{|+o{9ndMATUA%5_mJwmgk7J%iYSRkeArtII}jEE!Rdaj{cl9XU~H zy8-2_4*``cZWbenU0$i>qZ}N9U3` zYQ#)>c`qm7Yz$&^;P$PaMC)Qq-`vn~8E;_!ULWi&0$poop7l0kdPEEDSwJo(nTbXa za@%@3o*77xXO;m*Q?(6ieiH+u>J*N3Q{_`*!C7%KS zE+|tDN2J+K8N4AGJ}fv!!}j*c^PbXN`%NlHHcEX8T`l`ItcKFeeNNA+<4$q_F)YcI z40yBZ=W08wzw};x@&_&Vr;CbXS{>V?m%$w2r>(__W&A+$EqStbGV-Mrd0yF9t$#)P zgvVm2JkgH4u~095@M`M>@tSmh&RHmgqi(VFvHn!QiNV9GTxVE~vV$!yHMi<7m2|Sw zYSkD1Z9cBCY2fznlYzLI4k35wj--lSG@W$S8z%6VX^uP)xU65!TlSYVT^NZjCcT*R z7U1n4sb`&WkI1MJ8Y@O;#F0xr+X)4WhuMaei`|S{bMJ)A=THq)x{i5R^cvp?J{Mg3 zCVRt-5IuFT7{>gQ)0f~Q1WM9=OObonGa~h2Z*##>sRjJh%W}Zom6RzYy?ITRq_C;M9ID_N53OY}_s1ykE$Apq`$Ot0vjF z;OQtc0dA>mExK^G6GO>c-(lFQ3PRgFS8%v4a06LrJU%$2`#nuI$=J|`YFWQvT41)Y z`HshRTQ46%_|PH6Ceu%!eD1R`oxFj8fdf0SIR9&y&taueD<`M3r6CizU9j8W>dA@v zjo-Qo_&g5Pe_FsSwkgv7i`Pf=&%a zKC2Jk5He*hqt0}_-|)c-`F;1EyfZ8Q=W7yi@_hCs05#=ve3`eM`MfWDOMCI`i$!L# zF6=YQQngS;(rrH5xz}h!pKiW{jBk{55{E8}u1$Stg2tsfAVv=kd=?)9EwF9(Y8n!A zoixl*SDfou_fm0*+HxI1`r7-%P%)+F15@{&5{cLN=01_)TARTL#Wwst8!cGj0ah|K zQR}#GmwJ2AH%s(@MTQ!0Kpp-`ngI9x89X7p1q*e)&j=J7hEdPej|`8OK|NjP`=bRN z=A$$fvh>~VT>0)!32YWy+v86|_X3dDyIBgN^>el5OE{1}=u2mBJD95YW)BE($B*rr zcoK{e-xqNZj{;73wkx|7C2S(vKc2vLy(a3Jx?*EcIm0^OhpjK zP=9ei6uHkE^Q)16m9miq-IyiPT7I=UlEf64;%6js7vnpwW+B8~@;J~QV{Ot;6Gqsl zv>W-8)mmE%41Im6paX78owqdR4Q@PMAO!bP zcD0q8X=)t}%sK>~ZSGH~#BB~F4}@lUvq%}b*REwu#W7k-txB+j*BhehOUJ&oE@s%S zO1v|ix2X*lagRy9$aSEXEA*^u>@LgXm@gvWb=5#Ck0DDUU5CgiaelsF*A*PCl6Gp` z=&a7Qg-&x{BV&&*RxC^n(S^$>|D4vU=KPV-)J z9?;+S-bXBe0xjRo)e>?x$}D-@FxL08V`{JfQeEQM`@(ga`ehl{1rT&)hSC6}54zsn z1`L?1p}TBJ$^+1bjzu*gwVLLu5-$2qE?GT3Xh7Yv^BDev#LLb>!bPoBHR-xrFvec* z*P(JM&~N{Q$_36M+T@ma203^(lt$SPbfcQCf!q6__CEAN^HgU|yfpzaTaM;M~t&z?FCsA-)3g&IxNY_4DuQ z<~<1k^!X(=2NX9F>SJCqIjN4FVpq`?CFlUB z5$GqHwqV^F@Z8q3 z@)|y1ztKu87MoISZXr?$YC!H+nfyXEnOwMu&+KK42M`bv*!W9FUrJ{_cZ`d?03|5F zY+5(Bs_Ih)ycUOP4!Ab9&HDpm#L$Wq9Q^I4Q=`uA$aCof?#$i#<(n_ywuiBf?emoy z@S%X!^W~cz@$VBF)D>`ps*~t>c`u$QX~&~Gm&O}0#|shQ{{2~}1>?N7kH-qgoklt0 zU89g=NO7@TzWm3AqR}fjrD5vFBDSy|DD}>wz|N!niFLF`FUAk&N7aK!pAoDWY-AUq z(9d726+I-gjE7O55VH5`!wy_QRZ7uxcBg-Xz=19%l^ zYyPV-tR3wdR3zj*a)YIjTiWU0f2AB|n1-e;xui(5eG1TcA&QyT&E96^2jW>EhASUU zQYGKeT1$(xGOslG0oaz)s08}jDHR2qddt#3|5Qh?!Y*0){5oY=a~kfGzkZrox9rM3 zJLxUU_ae&*6$y2lNd6rgM;bs|UhQvlWWM{}^Iktd99Udl@xwSdh74gf9lP_Z z;=QUT8AVf>M|D>p%S;Ps>hAJyr|gaDyUiJ#j*!`U4rWu?5uf)^F8SQ7Ykc?^a^X|? z2T|Tdi~hY}MK_%zeeX<<8vIB6p_!uS%A4!XK$fjAbhBOCJLj4`n^YEb&vmJs3#5f~ z+2Jg>N_!+~;a4PBA(^xKCvKN)WDVsbP7%T5EZo#i|G)qfTW&Ea(Ca`n_hr35Yx+7l zcSDY1s4U$Xv09LW-~3})TDPR@7iJ$T>?pmt%&3oNGFZUdN>bpLtthmk9`Ad|^~L1b zUdX%D_GXU=bAC%8Ys$a#qvjoqRn1;nnx;i8gm{4L-TJ%}fi77h!#gL4mRc~DirA-^ z(}hW7p8M4-I!gG%)%(|xy{>h_kiexC8UJBWMssd|kY{ZJkj zDlhvn1zNrwri5v<=&6|+hO?#>!?^DaC|`0%ger%T=T6g+tB3g^a24`nkWt_D<)<0H z`2J@ypXoGXT@cP6^G#d?<55J)8q{XtYKBU^AB9g^6gGjItCH$$bqa#5fAqQtENX9A zlJ?vYR8$r-&bGY&QVlt;?v~*?{FA$r?0(R^e!YPOAw}nWmHspoT6<<^wW{NNcp>FB zyB1%QfevwOSA32DX*b4=dhQgz7tEHXs$X=DVw>#M`|=&W2tZ8YCk4tGv`jC~TE(F? 
zqyauh!KBKf5cLbCFKOk?oh*4>$oLQo2(%GCOl8IwZ>jY0D)f+*r4fW?*6dsRsSqa_ zV2eD5vP1Mh{v7Y{_^${jlpF__kaR3JsEeo-r!?L*2{WI9pb_7$Yy~rTn@?xj;{kBn zQM4ZDEGpJkqxJj5_|6kL9p1!A0JU0*W10L43AK_BSF|m$T+`);9RZu;5*R`S3?8{iOCFG&nrFl2;__T;~1Asz$g}!wL&2GJ#l!_Ml8Ka`V za}TKRzB}6T6PNA=B9eQaQCcYzyI-$K7r$=#FdZ7Na>EK)a(3&L+7J9_-*)l(`8)q# zdQR*5a`OC!Le9OYSH2^yhfN}y1@_Nq{z`1Oa5*<@XNa~i-=*VK1yqi$GU zI_KX`Oz3rO-Z#l|E>#M~x>7KBKlLR#A)AqY0@LsMb1cf(`VhAaIVM%mhx@{{LjFotn@f`*2 zy=&ics;(2O{7!l;%RDI9=ZfyFxnt0IbJf8YG;(h3TZ31NVFuW$zLkg|TxJcq4khQH z#-rr_iJUyU*k)n-Wjpa!-A*9*g@=cST|7!5JvFqQ*cnxhr0o;L4n@MI8h$DyC0uR) zxL@t7dty;S(b-s66^s>1?bwn~VA++C{HJ2(?Ec$>?a3dX&H(|wJxV4utKj?u0{T8r zPST+16hvp#6u~^KIrBCS-ik`i*7XT6B*a3e^NYTFkc~lw*kRGs^G@=(b;)u$tEf#v z!J5614#{*jD4zW++4_|dK&zldXGdG`#K0iuoyIi2E({+o!a*WjbC=)aab+Fat z*}p7erC}5ubon^3i|M*koi%SO?xio}a%bAbSZduX-J`wM%D77dvh*Q&UHr85v-m8s zV^JU?Q!Op}&EbZc)aC&~tVU|WI^ZImN4ZXop@GTDBMWAbNC1rf5YkFVk?}C0-u7gE zEIu*U5>_hSo4C4~Y~x4Kj3ShlV;m;duxK#F7C2jXY{tIL)}mf)ju`3OK?N!T-1?mw ztE2I-0uM&7ovQ-678VYac=oB&@l5>4%$T&ZGK1*I-9XNWNF9f{S<1eG8=Cx3nN$i9 z@5|kJ*M989Z@y2&uevA9!voxgEKn*ZxJNi!P7d!5GggR{fA>qk$wWiN9Lw=dZTBwW z-?(O>cxI;GPFV~UR?VcrmV$oed+*t z^}P+HxlaZ%MbhLXYmasMM|s-NAI#OrHBZdjHOgLU7x)O1=oSRZ7o6489VU1_0MiADlyYW!_ zwjPx<+_a(g`0YL_E|Gg1zw(vYlYl{mJfB*RI{mW-+wWz8qA%->cv{+7&KC8pUv*@d zdtyT}K`^tEmAO%MW52Lv=k>drbyV+eqP@@Vy7l`G)?yF2xgT8i1dIjv(h9s%7Fl1T zXR2CR_Vpmd|5{A{U5keu$a=HgO1j=c<6GsF#3xfLd{IMgr17kBJ?g*e99I2?^L&QR zc5`qn<7Bg@T~xxBZbnIuS8`O$U9KED%O3}06=XSuU7DdDALy;J-F4o^2)&bVM9P}? zQ)3f{gQWoxO?ALlrL-5EwUco@Pd5&i;F6IO4=CD^RgANrI3Xd}n4*qnCZ^1+) z4m!HDm|MdTb?DDh%Y(Ot%3M9MHPZk_l%Pvu0-0)A6!~7!+0(V0Ngf8Z5M_abDa>jw zR8MrCJFJ=o-uDgd^f{=8!1!mc5iR#;t#5q$MNS?8U1&F4W0I`4DdA$u0&)xMF-KxN zsdttJ6y>tvPAzwvjvvpCa7B*#05RjpWWm#;U9IN%3Q8c0xUHQ?{K|C9N_7n&Hlq%n zUo`21eF{X0Dt_`GHT3JW!}b(jFV-sYk*8iYXK&vXph;XEO3teD)gB+|1>w>pP8DMP z4klT`E&F2NSzzd*??|nY83GiVyO?%<(?80Z=TzXbUlfvNT!fPfk8&rZBx*U`OAGs{ z2uU4@XRjICC9Yd?1iK4ZK{s0qYKUh3B+XAI`1P@UEdLMDN9mg4bW$AVUB9RQJ!{q# zQd08GZ^oe7DYylFP5Ptohmd5)OG*P$xsB%X7R`3tc0)Hb7GhpaSqS!-{`9lSNSxI~ z1d32mzwp;G@6LzW(rp?3_p%r>%{OB|96bRodfE_&8l+lr5dNyZZ@a4vf4YkhEPm6# zYdhC%_ektRZo1p9%j%-*<GLXDTI4Gu;p zwn$UzOk}viaIH!g6nmF++y{Y@eGhrR#AMh&FOh|_G|L3%u62kz<%?HuElu5KA!fZ` ze<$cLVTSo{PY~+JwCCx>SX;vIrIfMvW`0#%M;||sk6-ki6x6m6}x*= z*(hKuHiSoz^cIjri&?j}!CV4{K zPPtRAc3RG!thiJKwuW22t4E=*xFIjw50FgXUA~9TT?a>BCH-(UERbsVY=f>gC~R&SvrvDJhq}EVVo8CxdGi#fa^=*LF<VVLa8C}#q&}l|a9i+-Fxcwyt)TJgxVyQ39&C7qf5jLh z##h3s*wvG0F$*BE^wHLOc-qk@jLJFD_Xf()++Jg9(Gr-*Mm<^(!1pbwNZ%c{`|08v z>Hv0cIVKLHCnJfiM!qgI9szLog2PPD+quIAbI%vPgqj7~LFe_|mTw9lzDE_k`O31x zceE3N@;xrv-v3N~#G1s2YgSnWKt=6zNxe>GaIMIMQ|Z%StX0-|1uvniXkDDXa$#G0 zxnv4 zWUa}E7BaGhpDAe?JN>KLxZ2RyqE-o6_8#0(YxnA$mZfH+zmte1nf*b)%9g?Y5R8>7 z;l{Q*X}w3F)UZ{ipE=*=>DBe{P+dZk2X(brZ$RVhFt#bs?+_nK==s^Dute@43CdYJ zfw|M>!F*zN<4aAcCo9r6!Z|+>yq)_g0LbF>^HXpw7G3BrN6_4dcyI|GZ?|C5LjSD-LvsiIHsKb=9c3@m7ug8;1a^=8_kx$Y?L9 zei^A{*M!1o=Gv7FXKaqe)ANo<8nSgG0|?Fc6LoU{5!W^qA4B(qHS77P-T1dvh>lZ~ z(Hq;vLMO>AhrLEuUybYDItK|^h0)v5byfETSD~f-oftG*x}nJF{F6p_6vn;Q9D8iYBaQ(0 zvhnDl8isUgPPjVus@R#_TGWvDu663A{TyX&{B_9Pz(udK#Uak*ZKqt^{~VOu$|VR< z^8vdPI<+Js_I%iSL@xCFV871VzC}Nz?Lx=YcU-=~VP!52_T?K_ODfs>y}Hb_(=VJE z!n2p_^EFl+9$RjPX`J?zDFT|_`JEWaky95{;TWA^OI06g#UcI3jPHzkJC+1kodn6T;ScZDEh(PgVd~K;@kz{!dx1rl5{{o=2WH<$d6T2a-=Qq4LvDV%-%+8jW zdJ9r+%OoO~;sYDSXvax5bNjt9%N4- zTQ!^oogh_m0Sz<9ZRP;#MK!&S;IyxvSkjIPQU!vUdvQRO4E2mz!NJC`Zp-R?wMUf( zxS3m5gMw(m7ot?P$iI}V18ka46>@Vs^9!F)4cA$nGy2qPun-&Eer8bQ*yLnJhZV`SpTl3YXsh{p*anPol7O@6kv%tF1App%Q2>;wGXFTpbowQCzU#J_o#-T- z!X{kBb_*ZK;(x?a(3;H;3!b@OZZqJ%c{P!5MjNLtNCFzy1HZJIxpe56(hwxcye|TA_{YKO2sKyYac91m<+ 
zmY++KQmnSLA1ozMbWd)(9rbHq=nd}TK@qx&k!a@)?pP6l^LCxCJtxG#5#%7(lj+^2_?1^1 zeYTTPHT}o{c2F;lPWX-IKUWeG$r~!2EJfwAADxFT<8t*wvzZx*E+qIa-pv!@Cdg#L zZm!n_0&W}BsZePK)qW4g5)x=-neP7xEZiX=C07AlLXykN#HRLqLfnO+9hW;I$5~YX z5j_6gquNM|ynea$|6qF4R5t{OM0k`w4Z~+v2B`FB_(R}{1-+cQD?P!{93;+fUW{3M znJVm6M!r&(vHJTrfFi*x5-c~aPEPTBRTdb9)H(6E!R|~{*9NU?zC|;hKVf3&=vUVS zY$>Pe6P48t%#-A=9B#cLJ`ur%D=}TfqHj1wAZa>g8tl|~Dv0bX2Aa1F^ZCnfk1$yY zaI2;miT=ABvlk_yv)U1-g(=EWeLy?~$G^OUk!bQ7#P7pfgqUW;FLo(69`Ypk7GECW z2j4Dw5Idnl`vhQ9dhEtU-Wbv_cbE>Ny7a^rVdAbG;u~4GaEs(x`s@+0P%nS;h)fPW z!R%KkA|ajA6N004hxh}Z6x%lxn_;6#N%%WNEI%0MTxM*_z9`iz5zk6BVJ>1jg z?LJvpW&MlGdhy(4|1B;z_OS(v;!;UQpdT6El<2?PuYVJ>(f!R-FCrOgtr|fS$))BO z;%W!5al@(PpqjebCP@RTs|H@XgOmeYk==$}BbFV@ob}ry4xCmyrK{X-{9R|!&kA^xQA+h5_lzc81IM{Y5v*X5X+B%+)eCu{hM#0c|PVnBYMY zmw-Ro#{-h4JxseJmG}Lf!Ybj|+nw4d>X+kf#6lcW9=dPe;(GN06UO_!ENg#mlhr*r zmMcC;*Ul;gWP0ZZh#2&V_J*8~fT@cJ+cCHY#@A)(4nQK2mVV|@JadCO!3TH zUBG$~#n^Oo?Z|guM!DD>bpQgz;aHpfho^mg=rbJ~bhfyz4D4OJ(oF~U;-O3M**XKyGu@UsTiWup`5LQ@hjRA0s{txrG7C4>mk0UyA%bC-F zM<4Y9+ClQYgM+ z%kuCxHRs$gdokbT%mR^Mtt04AjX^wsYrQ7mejIgY=zzpsoq@T`0|a(3mYG}?-G6r= z?;rZRuS$r}t#23c!B?1@*4=28)vTe16*g`#63&%77dC~qTO7{t~(HkR{y zJ4i?5n%e=bU%tHMqI6|OHq%`u^Wjx-*h&6VJl6|CU?D$8J||haY4jjD0@P!C`{Seu zZ`9p~^6cY2t?v0x-xQRcbMOJ(^lHQlzS~MwL({r5#%Ty~Q;FwU9`9EDi15*uvi7%C z!m~V<@{W&u#lZL3H3hRX;G-7eDLmWY?h}#om5i#N=4o?2?U7keaA`fAKj?X{_XG0x zU>27ye1{yT8>umI13GOYqad5nm&)`ouxsq{3J{>x)qwKp3pz}hZ4Xe{x3U6iXxpT& z_qieQKnOY+dEh4gBrr5LlQV`+YpfIFz;Gm|tj5&1xz2V2;5X_Wk3KwXu1ugSdAHgD z%%-%8c6(9nFc zRX2v4;oJ{^icbU-C4ovXAZZQY+P9MIm-Kh~^QefvI`w)#54hxbpDPR66sGHkUMsNx z8tw`_aMUt?$_-tJ!qv%TIgSlzlC6aW#TKCC-~aF(haDzeUX6NWqo+Xj&z~Slja; z9tevBLPpoq8#-OKvD*X3kGo7AdEmUhc>FVYIaMm>2nB$Jf=u^>bVsxXx7|X}n;YUq zr$Fc3nrP7sF?Y{xpO`)>d*26oV6n*a-d2IWRO=dI&fSHG@Ewrfhwr4zd_<9ylC6PeWZ;&pz@S3gLJmsw~~MRz+L zWwYGNE$>N%b9pa+>z&z}{D0Vc�!_t_|=}1Ox;u6s0Q)2m&g-gD47|^Gnw9a5(2a``-87 z*S_|(M@=LqZsN@qm>k)J4)lShvYDHEd&7hn2^}g{kD_SKW9ryGMDsV5AHtozQGiyy zyepTD-O=kQ_a*2jGFrxh&oYhvL^5HF{G`n8w)$bDnDTPV&ts)7d^L}x0RmCEK6(`} zH_NtnJ(d*ZoWW(66C#b)f%H8w>3Yf_MRs%iDYOXsA)n~=LOhmV|HS8bbtUgkjH@uiIt={)@L6@wg0HZe5MTD=zeNCXoMHr*j=7W`@X(H~LeU@o1H# zCiZ7rx3RjUMx#1#bwfMMJ_lR-U_Kvd-V)E%uNJAVn?AS~s}Z(-;j&Q0j*5m*(P2cnzbUeSiU{{oDxJ4i%j~$_>22I*YCUU9Y}I)_D$* z)XF7*!Xz^LHZff7PSvDd6WQ7t1(}+RXVO~nl6~GaTy|xT(;R2VGXYA6s9&o}oIBz3 z);XmmI5in8xdg56VxHkmdE}mW?3M5NA-h{aV$d(1;e~tX+|;P(vRW%G{{DP9ao)za z*G$-hW4o~F+?ytGYsBMU{VY}Q8qY0h0>mr850-l$wk)+bkYX4sUE;^R zvZC7z6YGXRx8e@Kugc`&iy}J_uSb5$dk0 zsOPcg5Bo$%T#+u{9mcd5>2tDv@y5x3E`XPE{1{NlgyTNnv%5fw3{$fM05{_wA_G!Z zKkMj`Qy$e4-F47oOjlWqijJV(!3vDP$?9`_+RvyZyjuF{cqg3b$+fopi5i zt&H(iqe<=xJB^*f0LAJu{fshg_FK^1Q8UGkuC8Jp5JcdSc~1iK7|%mY)F-Q^K5Eff zR&(*Uc@Y(%i_{OW(Zi1qVR+n=H}Jd(n-3*Ond9tiN?h-Fgv`a#x33Bh`6uY2#e8-l z5P%hNEpfycAU=!Dj4TiANZ6OFZqihTK7L@DZoz=TqU1rL&u;eBcnTb@y+itC%K<6= zvJBxXwG6)7W2uu50IL>nj!7vQ+knKg-_zM2?;#BFp|r%6oFuD(S8%@D01;yt9JtTQ!YhXa3LX6O|Dy$v`9yK`U@k1HXfcL) z?ZgdTQHCj6UmV}~nt_jJxYA&kzox^i*Gg-;ywT##>T^qcJrIA#3Onikuu0ZUs7XSo zAR}JXs%Q|PuD23jw=qM!H0~meWqD3Xdyi5mHKX1GqbzcVCQi;xtG%bV^ z;l8?OPGW(&huBUHj~QYfHDJ;Nx>+0ObxIqh1i`(Gkqgy5kT=7(g-Y?@duV1gJ$O*#R`0eaOR4 z421eMP+(w9+#C=K7s+JCu%Gg5R+y2UF9GgB;FSawNyfNBjw>#XjTiqKWs0>6=3(mN zG?qD@7Z|N}NDL0JcHU_o_@%t=F~@_()(Qe#7fr?2j% zwNOLQ5p0DCchwS=HcH(}I-eiyDLGwtJfIP;*je=-lG=w3-_dtG2ZT5F{^X<-HMaK*4$=rBA5Jh?Tdl zW0_e{#$#3ZH0#JIi46bu@I-|Bx1yU?;^%q8y|(+ki;hlk%aMdmy`>C=?2Cz=5bR6$ znfe2lu3Z4i=%4q7*W<3pWjyTjT#ihr>-Nwf23Mp;(oyYB)#Xn>2ZV~|_6-yh z_=fX@PW}UUT5_ zvZqC&iDItOSUv~}ip|&|o%B!~yTAN&t4dW-umT`jr72?VtrCgA!T6g6hh-~IIF1hM z_OpZDMUWcSaGY>FJ07Ca%WIqRnWQZmpc=o>vEVpU5BOLh|98%n5L`v3Hu_RedEHD= 
zwPMyWn|VtIG4z_z`=RxA`6BsIKob7u75k&-$Y$>Yf&8aOM*C~5G#vnxyOiJww1)ir zCICq&xjZvJ7Z&geRP9TJ`bSm9ybztHs`}+) zygvbnng&nWA(3L{&4HvW&_}K`!O!dt2+{>@=0`s(=Ng|pf?%Zt=MuH~j6wX>o3^9n zD3*Pf>r*VozXaEnML+zuN}*5uOjB-v92jMHqd~O2V7%GGdZ;>2_tv;s-pRz{drz)Z z(Go;uKK1uKBPwvhQg(kCoa}xF=Us2_F>O-~XMHB!D?Fb6^dMHJXZg6^jupzd8DFpAofTIpaEf{d=;5&bxIwRHtoJZpkIJ!_9C^Q6?*U@BKnktl?2j5oClQ*hJ&BJXPg@ss z-46-Zc{96U(eNL6`)M(JOpYQyo7x;EOy$J@cy+%+e-hfn3;rbSDF)DiDRQ=v9y4zx zlBpKz#(znSyxD0p=YH#H?h%f*oVh|;;yAsTQrn!#F(`O9x1-si=kl6(Tsg;Q%Y?Vk zO54KGbm7-IpW36%uM+@zVR&M9R4kqU3-G|}UAGJ8RoMs|UHMNk=#o6oQ%^yHY>W5k z+!torK0HoGe`H)Q>!NaArPLuNTL734DhUv2m(M~_A5Ubd?&}m5#*vNmbk!K31q*gt z{4w-NEY%Y@VBj&2A=f0N8owv@Tz*oTWaSjBMvO>vQ52vbxN!I=ppQQ*VWR?J4_CGs zV69nydr^p(nTBd4i~OpCMRtuS?>^L?rg;t7$MlA4q3u7y>se@B=XpKA z^y~ZpW{^O7Nr_2}&Ff3~A}XPBYW~rAUdaj|Fn%Y`Uv;b?;D>c&WaV!tkZahB5rT); z%S4tpbb1)9smn;aF;7VYn8J;rK@+>XOpe%%4;OoPFJuPgWFwyejgfA3MMph=MES7O znqpGkMIT%X&iytiWXvmIa-VmnXruCkM1uSy$$xsLw$;fr7oT4Iudvkry=T|A3aFd( z#RLKHIfYxZuae3y+kkR~MmvWr#g!&9%={xiM~<4^DF8j01Dcl3086)|S8Wqmt1^7)YB4L@@ao9mp{ zcjPe@&}9Ei_GoH^4mNaHrVq64$JvI|0;4z5G91~^4GibC?r&X&taw#9wyy+b)elIQN=&jNM;*VudeJ`ozi#%<4i~L-6~WiHoHEdf&bEdUvr^q z;IlDZzs}F(7`1BL6lGWDshmvymrP#yjRx-!s`c*nQN~W7yu}jT=m~J~yRON9sA8!_ z-cMXdu7|E)bl9K!VG-5}bi5MfG$6-biuM7>h4X>TQP8j6JT2N6;1tKCsunR2g#P-2 zHTlyx{Zfho;kHHe;cDcEl#M|rJ28NWYqrE(PT5W+xodf7cHI0b-XDFtT)o%#VCDnczvu%WzG!hXk3C^8-x9@N9 zHQBE+xL!Dy4+AsbP-H4_GBYfMkHeV&-m(-T86tsn1BAHeW!_AaS#%XGWbJq(XF`MH z+fYn%#X4WAlIOLhR;+_XlP(F3TN}*C_E>)e1mXrZ{#wd+I4^Rk>cu{0oYQ$6V0#0? z9Z@58m6c-YRlFcTn}+>CIqD?o63e#j8%CZX&du&=(o{t(<>~7|uisH*_SW|gUHENN zP%l5{`0NNQ(p|ex~pDor;*eI}ZWzo9!2`^x^h8 zKM9-J8;lT{#&hY^#WAMNazKr)V2M%6s|4cR96YS?H%*l>=k-(v;_7eZsGYJZ4lDyN#MThui(?R_$7%#T}e@ zkmIwtwe%2-L1j_dDkftFj@)pcao~WA4OKh9V7~&wCf@^;>AOY>O;2I~DZDt_(tqt` z3{cQO6vz?bC>|^>fqqEXyvlW&Yb|o3-pHCl5e~G{kHtz!^8B=JX-J(Pcc}jAx>}V` zVl&=~PeuZg1JP#q`dDex^k=az@3BI*qmRYh^!E%eS)9?Z?kmnVO-tsA9(+npL1EVM z`A(5-wDeup9> zS=X0_Y7aCO7(Onb+ywob-@~gvCV}X`KB*BCrv0k#Y&3=}flf~8`ZQ(-?HUgd#n1^} zfqbv>IC6~^w_^YdtpIHR_?4rdV%L7}3IE+%|04aZj;Mx)23x!YxJ(|y59F2Y@1vqO z!-y^Qs|V8oWf25wB><>V1AR#EAzqHzO^!ZRPIo=MiaZ$(c|zo**(xzRe|)|LitoGC9g3@t zSaGy0&>MG8IECt-Knm3xwO2Tu*F>7p_C~y=hg(x$&Zp5_tWdKJFn@D?2hOwu5B5I~ zm6j6MmRo!e za2pURNGrYO(9cnI^1dAAY7#fT+@^R9Z*OsABbwJrPS$Pa-*v7gC)n;&kv5mK7q*{j zZz{Ij(mY5iT`iuwrKLr<=EnfdaswcI(b83`Fe6*pA^Mvn_kUv~>#uJ`&~J@3mwv^k zB_-8MH`KKUOJ25$Sr*~_#u4`ZJyFeLSj+5ygsO~_>xH-P*?%0myG%~)Wjsl5soXmE zaKh?Q4btMC^lKq6%>3Jt;l7(w3FiL9pHW;OWo6|zjql0a37x^hNz4GH$Rw{FsG-n4 zpS%7W>ePQ9HdFO{%sg^9^EvwpA}t6K0|EXi^4R*^9aVYN|H7yE4+4ws(|&8|T#kEx z>Cs;OehsJ_>jxT!!AGe+YUukMIKqdF&OAWwqkT@n47C4VeD~y!PnU8&>p3algVRjgLWOVC+KxWd zOC6y5-t6je!M8cKB-&l(o&AS_-h=mWi4i&$52vrvd$P9yBde{Bj75%^8|~@aDtZ*_ zsR8mUv-#=tEbk@Azh1xHT&=V*M58f3ubh0Bp)Z1D7xG=u>T`FjNH1pJnxq6CC_+xk zZEq2l3aIj*6Jz@OYT^Is!v?&4s$@fD5KBk>vvz;Xeg2&4i`90buI`H$^krW?L@sdO z-Xy2W27-FS50%5g?34#&o>42vu6=!vfN^uZ7l+WiijD3HUO1bdKah|sAmPQB;Ijk>329newEX!}DVDB_}zpW`{-sLkG2 z*KUvHyvp{|cqFawc)98p;h5@HX^g(}9g2i^hc{M`lREdL4X?j?tlB>;FbsZ+KwcEFxd@Z@=w9&aTg zPs%GsBs+qJUR?yvmfrZ`&$5z(dgmfz_V631268|tC%43Id{|l|mrGKFS9d-h9$r)nuyP&bNmaz(Cmw$Jg2heKLPKM%gr4 zfcO)0f|BVak8HMQ%QYzf_A*!hY10SFoP%T6{PQZ`?nM9-db)t4=Qv=*F70i6!p&XN zw8H<3?p`tCL|dEm^dXD_gwpK#V>C4%s^jtA_S!f4`2YM6wdWVfydWdeLp7y-evg25 z?k|88_elTf9+_wSRQyY)bIjVW09Wxt{iHV$E(;8*vvBD%4l{0e%jf;>QtB`dv~@6| zCq_tQ;5kj_ZWbJvHwMpeA1q2cl~jCnSt7qY8FI~{`@3=1as|_$ynyU0vkT9*WJ|p2 zUFpUz8yH0w53QOGpwIna?6xIIe^RdfK?fN9@gzuwBN+Ew*?OyK`&8>6cDHJ8`3~g{ zW5-WD0Xt$Pvl_8o-`_Om{qN~F$l{v{j01OjK_&X>tG6m2d1kiXx!Pf$$R%UD(jl`? 
zH&$MCXU>?bYzzIIfN1~f2UIbWjBur?b~qt3Kui(%(my*ZZM@EiE&t<9tmEI_y}F49 zt>)QkS0xQjpVy9UcL4M(0l!&zIwmDFDkU5D*D-r-gtF2jw)eN+PepKOrUtoQSULaW zW9CQ1+AVIbi#1BmFKc8gT?yx|ea9wr`SWEZHs8xicS1_Qy|a80SWNBcpf}jFmHJnl zfW?@A_fGHOFR=}HY;5}UWVy?G6|7g#PH2p$Bij$NoNtxSNrgLl#~(P&zW**{vp6f6 zZ%THVP4?|y=gwca_;){O5TAEbV93<^=|^hGQm&hyUuWE+S=n1=H~ysDe$o`s`>X6H zNeHTRS>*e=dM_(d7&k2&IJmnuB;ti#r*~QG{zNUqlo-3z1Zw$wbWW7|cYo(Uk8n@S zcfomw6;MzT&@Z*7-3YnCJl7V|5$`j4fsStPZZ?}}IEx+h()AiO*R?@05wnKii7&+0 z==VIC6@UK@|GxLP7+GZAq_Y-HNthL@>ZgdN5A%5gHS$DcKdk+T>|6?nNb_T3tCF+< zH8rX9XVE2t#;Mn?owYQ;ZGHV|mnSN1T$lT9iU>8td6cHG0M%IxU7_)%i| z3ktpJ?3BS!7hP?ClB+Mi5|bi$d&30nTRZq1%7t`-7e`hpHGQvL`kdC=(su1=wW`#`IF8jV{xIGMtzu~oyX5`t|6VorT zjNs3>auKmiRBQo@!#St{2Q=~piRTT%pFU_1cX;|O%l;o`CH9ltSWv7|%B9pRB#4I& zQ#BrhNwCRr#RGx_#O;~-COoz$KU+{V9CH%6Dckie*TkO|_qd!zD#eYj|MkuPF3fL! zP%(JrZvXiB6KiM~^%P8G@mKr~yh|Ho$u{XhOG=7Zhh1e+GO#M1+cx^4Wsmu|&n&TU zm1R9edv&;_|KhgYG3y^p{<5x;uV8jPFet)>eY*GKjREPvuynOgR$81(UK1+nU+#Xv zSniK&voLWTe7kT_Q#;-y`K!<$?|IMH>+r1?ZIIhhx@d)h9}%(mCttt!fBL<9s=jFI zOEj+5_W%pK_eWEbO}cun!B_c9$sZR{h?`ARBZYt1^vc<$|Cg`wg$tzI&k-N~WN3q# zZ>MEL>U6`N{AuHJU%AYb`S>a0pEia@%6UH`p(7J1x<9RM=3`mMq^o!A{?=pupUg-$ zg#02PQ4_SM`}fZEe=q%KA^&$R{^ooBvygu<-hV9e?}C^A+SET=`w#B)A0K(9hyLfV z`R`Wx+aLZpwa?yP|Aaqhy668N#E~Pt2-8ygNpbx$+X&BjV9 zzvAX&YzQoEEZ|4!m|_iGvxU0DHXmI4gGB$fz2GxEQQ{RK-tKJ=%3EloVbZN~&~aol znF|sAXp>hb`4WSB%C->yz;rHLJ}i(RO_)WrfZflE8@gnZvi$%>8J}0XD+AT-4{b*_ zE7tcVf)Ae?Ne z%o}zc-S#88LOlgmw5?icy>Vob;3@1r;u-cV{`?uSw68TyX3TW`Hv?Nb>=eYNRh056 zeCmTibLR6DfjtWDrq+uqX)q_;TKuV3TUFKO@VjTPaqawWV$`*l+hpS1+q92!CmzRcheKKsg*Dz6&; zx?N@4+8(t%G-HnF2DL>?zan#j)h*HRnau1)1Uc0u&z0z9q{G)cCO1V39$$}LyPbma z;jMC8jj6-~^58$fD7omWbnZB>&QDI3_K&}nbjM4Ggi2xFdu(S}#Gu^`qkiuw0O6i& zY0>Z+>?C3O01Kgh`HWvwyicebeK79XUX&o3Te*( z78SaiK;A`3s1OU4UL5!TaaNw^t3YghOLTK4;MxyDy&D6h4-xN2l*TXwuf6wk$x;ACU?Y z{50-lz5Ig$G7p$@bd`xqbL5QlGgCIn+>dBQ&!KibRCISUpMSHEn3PWGg*K(J2WIQK zsH5yw!cuP{YkW22XwS{5?UDWeSi|h|#GxysWn<1e6-#+AI%Y(x5E3tS`q`hwKN=tl zhHd0pBJ=FFNG^~D(g>fCHF96h9Gk9R8!a}SZ!b3q5qillZH~S_<+@{MGE#7*rOKI6 z<(6o{Y@pQKatc)#rHC&7%e7H~D)0uKRyvf1zwSU&65p)lL?EJW!)=7|gAa?`n8(V) zXuWfFCq%q<$0NkEDY~+A_Jde`2oi;W4({XL1fJ&kyIOO(WSFv@3OxxAD6!Cp$(?BNn9Sdj%HiK$wh`Yvs%P-{w;Zjvtr4vPmLnf!d1HOl5wb zYXp*rg1)-ov{-8ND0FC>kmxc9Iu8}Z^IHPytfe+%58NxOf>+(wNWOd zmq3i(x%Qpm* z(8@`?D_T8hp|Q2C*t}%~jN+kL%>|Eo1{_0jYxiR+ap*)8VOAx-*)&Y%3U*pn=pX$lim-O@xo72g2eD@`su{5EcZM$^%8if)D&D(>Iqgve+*zLM+O zgTe2H9xiTCSbswij%<3{lp$X9<52q`JJaR9*M-O1U2S&dhODvnRZdQz$q-<2NXt1= zW)^j+y~OT36n-)4Fcdp@4rKo{*;i^`oGrm~an!k*F&8&OT}RL9gRNKwn0N2i5fWAN z>ab6dU9PW$Jv8hU$uGva>^scX6YU3mz(_GCiu+=xHj*l4i-NXi{2Ik_?ut6LfcQdB zu;bN^t6Kps{2vnu+aibC*t(tEgZ?nJPH!5*PkQWZE~N#R`Rwq!Jm1@4lzjGk)iWcgUiJ+@4gjce#8nXwL>5ooQ&!O zPe4smcfNce?kvqQ5OG5f>RNqmaLX~pt(Se00d2$jtHJqRIAtE`PU0{%={*alp&{hg zm*X6N8-xGrpv`pl&8DQfjZLV5&U#A&fW$0L)IVSWS`-4SwCE*es7lMN?DLfCUR86kd&Z z#m2J}b81@5PJ*XC6Pzo27hBMS9r(#>YJofiFV(`l!jQo%Lj4K2S==UnNVmAV&Q)Dv zy+|iDKaalTXOT0?ddW1O* z$mD$jj^#XE)JW@mht@W`9gV^(%b#{*;3Ix06G|qzNLIy~urFbHt2PReRyZw7{`L4c zBZDp$b};jP#ASnMc9AjPuB5oeqn;qlP^~aomUXPynkmq8zTpA8j6^EYKPb zwpqAmS?kD9abf$a!{MTUA-lx7ZIEeh$gLs58fQkca~H49eg&d)cU*cpiM=zlesedH zdQpK@ueikX$@qdF(NZF=dZL{)K-<+P`8mV-ZB>d$KOVX>b|soV6AkRjkE?x!@{)wL zxoDAyx?!UVn&uEr9XNVU%7+ z67%Yu>>)TZ19xzH|{AoBWjWsUIk=F@2YYcuDTUuxJ5UIh_FE_gxxl;rTo;{m5H6T?~Ln>kNyT0{7<-{adKXpnThMt5)79s z((FDCy}2U{h3y|Y9o?`nN0a;eQ&OddmJT+&1V06NQTK+BZ@#e4Bs{3hFA^Dfbu!|B zl3JzAPZU>0>TS3j@|UkBq-f+ZGI0&k53j7wwXwh+A|#h>einM+g9y&dg;w*vRZX~z zU&}Y;jf1pZUi*azkKNyKgU%ew9ImW5;fd+#1I>_xlU`G^fVb*P@bxkYcH-M4REEI( z1M}4po8HM)H=2x{jMXm&noAoWk)ZD0J!3Fz+k3Z=bM}zlI54j=>x%G{N7MD1BQzz+ 
zrED=#4eY3v81W-K?l|Z$;`XqSMQ+^ITdy8O$$7|27Mi;k5Pi-g$)qh$VE0gDiBKKv z6jnCa+rjX3Fnq*f1%JGF3Py}x7`V-WU{5pL1sNNQGjjMA`@uA~beKz`sI`r?H(3WD7 ztIPu0KqJ5wv}6+S@+fSQa+_w2k3~sR$`}JCG6#RKwqP>ehz9}y{(YH(89+*wf@%V{ zK{~sc4`D|{_50uHae%J!!t%lKjrbBZYh_rNlQ zQaR`0_`-*B^;Q|t6-B3(uR0cE>5U|-`LT(7$-q>v7D!fcp#yxV%iSL2(IDn@@S2o< zoKE=EE4z?BPb7{RE^GK=?!5W{*>2^K1 zga~TJV63blUg5!Y>D?ADc+?v=w3SR)5MO0UUrB|U)}7QV4le}mYbSwfYLM`I~ z9?cTwgU@CkOkvgYzPUD%V$NMR96nQtUQuJ)&SqZDGN zU>oa&qvo;0Hjy8EQR`i}C+IhPUp<;Oew&mdMe-AYi7Cnz*!A)Kqhcd*}QT24{$;MqzQ(iq~owzV*+>~IqyNDp~w zETyHxmHM39YfryWqdR{KQ(I;nYVf zxm0CTP7WY|A(Gc|`vpLxA;0+5_VAE-_Up}jb2Qx@(Qq+2VtP$uyC)w9#`Vb;49LP#bif+u?B zS?sQaIvne)GKtc$c9~Np1ha?XUsf(gR|!_N7=b6^vJ6J1>&0LP@Pn2Wcu!o4;Z2Ag z!QTGhSkUdsiu3XeD9pg23#twJ#l~EG5?b9>+>0;Y-Sir_5)Z?+)l<_3 z#c1^<4UEGpV2F%|Mfb>vYq8x!_JG0aLcLfrop3;yGF=u?{uD?qg24Zl{GubXg5K`I3w|KaW?U zcCy%uEjI9nV)Fu=Tv{V8Vf+gLBK&LIBHN8T{L;)hc`@tPPX-eCd&9~!MDq&Wa*0W~ zJz*WD+wUJlQ|*6%3XD6Ht4{3q!YwWiNEljul{?L{qK};Y?_6QB{25t`5$f{4`-=b% zn&p(iX{TGoD7@aNt&7W28F02O8l`{eMoEiz+;^D!o)66O zkD1Cg)9Lzi(cYG8B88a^zVcMk_xUQ9Yr5T9!w77{}WmvBCO==OT(1us`nD`rCAfOY4NX9FY!6+#E^baUS!2*Y?S>E`!-K`Ie!Uz_=B>; zNA80m&Kp(9qz1zczRp^gYj(H8502^GyC-HVz={enV@a+nh+2oOjhd3WsO_` zTzl8}==Jr!HTc@(*GcXlY}-eBD-U?5T1j{E1~D)xEmfTiGn!P}b%phX<0os5J9Fkl z;8NokFDy(C{hmKzt(3F~z*mzKUR`@zE$ihyHR&?OyGx?$iZZ>(lX*qXw!54#nPNEN z*Vy3mXz@unboWYxcrCrT9s}^?@r_?4yRhn_(rSd3FoAMVo`7c?b6JpqeHGt;y%Png z&)&z1RgCve#$Dvf;e<3Ac4ME25m`Vyf7+Q^Q-yYhUs*Y4#rhYCj(V6=t3R z;gIHVg*RMA-^?7R^L38z?oV3uBDxe!H*zLW8Vs-RoJjFoS^=X4k9QVV87@($&JAqf zlLJ_LPH=}o9ss;SA)a<}Mr5eXr4iZSi`hp%F8J#&(ekH{)sQ&%_gHE>?9mFWKpOtXg%6M~^_Tc!^oU zJs8^>+%7rL82TcDs%~pTr-iTjjp9SmnP#n7{wM;k3r3%kiiYK(e=^h+O}(mmLvsYA z3qS4*KR`rbgPulm&}aWVgT7}io5TUcX0>r!If*T&Xh}EE5t13)CKteT10}lHF1mz; zcdXgm6Af3mahjN$^d+5(HYN8DlSyFauh}%g{P>S@6@2JtI03{3+EXMW#pxb(gO zD##|U`X&%f+(W-0R`G83oeP!X_$9ux+i1rZv9~XQVd<5|bxfcQ>?%DcT#zyXOOWY? 
z0MmR(T}+nEPd3rbPrn*3kb3Q7SOqM8SqGO^H~=oFGeq>h(`@Q;`B(-!(M2j6kovt} zABhqZ?wEZ2id)1F!)SpxRg1$_dyK;_12t-lW4dnDd6nXf_|Wd+psgbtkna7;Opac; zfMD9Q$*Xm;g-;4Qb+jy5OuaH~@P$q8hmE+$<_fH9&4h?3v|7pF&74L~5F|3Vjx3=o zOe0op>vI57%?OXLU+sl&YNqEUpKbx$l9{8ec6sa@dyB!${z42%EP%v?Dg)$e zbx-d2?CAk^=}GJHGSxWuPU_iAcIq$yr-4?byx}=9@S2tb@!%XlPX%JnGVfa&#SOmm z@c-ajpFL@*rFWGKJ^B;pS{d_x4T{h(abchMh;PoS@C8MxpI9w2rTOcVQKjaGgCGl| z2*s{2x^M+b-K#3(9?uf)bGT%0&B=T`my;j$pHHVI-{8BNS9ztMn7I*>(!<_%qj4*crOccou3VPTeU@wB_p@yu7FO!|#leg5JeJS4TE78EkjT z1CPz5OZ&+IFsVHJxLI`72LP5infWPtl-VR;N7!BJnhW40h-&GSYw$Gq?ivCIYOH4s zFoLI5Ip@-^L0+|)cTOp{APa`Ni`~)Jcy`esC(;67`#J;L7rIqEFiXk_Ew7ISft(hG zwg9xNb5Q7o?y19<3y8I^6RL9`EQ~f_lHT=So~+hMw+Ha|pwr}03#oBa6?}hx{dGhm zkN+LWQ&i=+vcHMbi5{@6|ANAXZ#b=qgz?b?t@^%0hS9JUoKatVQ7+p9Y!XU;0@_C2 zbiH)AyQUI&%P^_!4FJS&6R`8fGSdvCt9A_z@eXPNW)JLY7A_-APJ48>(bX;hX9zZ~ zec2s{)`3g&oC24panS`*=?E(>Q4Ra~kd=W9MIa>-ejJkp0!}7ixuz5$vd7O5rJP;2mee2w-<6r-Pf(lPQXD=GC8Yy>vd> z7#GJ^k*dOh0T{S|OYbS1B)S~VF5nhqz(wD9gl-i zTrX)QX>pckJVjb)T+(mZprG?bN8n>K|_={3Fm>}Kd>^&jIV6l||R1nR3V8z;8 z;6uZKg89d@d|>=)`a$FH=&vcj*|u??6wib@BcI9-!5ni*QNFF>54pVGlr7hhZ{)lqn+wO+e5jfk`6FA9CtC{Ow=$DR+FC?stqYmYc*RyTt zaT994N1fUFO3o*Kny%mO;!j-FWz@`%EcjBF{A;PQs$o`0 z#K2OymZA;Xk+)oM2>@ZbqtqwpcErbaRuDaPwOk|$al0a&(BizBqB0!u9!K3x7t@@;>0jO3G5Sq4fAM4k!KMBh=ctnpqPXl zkK)c2I;A>C$q=5$9h_{W_H=(<6hWEh6WvlJXxIZYTY6?|TzWgzDYq7+;8*Y3YS^ad|KK>Ms7Bar5^5G2tIWfyu2Z;h5<9`4 z;DwfnHoE&|3%iPEI6dpr>i|0iU>82OZo(mgXZFhg^d-NoGx{!PYnZx3mgCW^c!*`; zSdtq+SI=sxLWDREaHXECR2bEjfK&N5RH6iF8zaL|tS2w^Doe!SC%EF?uz_CDVq-|Q z?g(3bV;O(#@{bFoXp%6*Ay5y=aN4>P^);cYV)Dn6pzHf%J1u=Zz=*xs_Zx{!r-hqB z)shAfWMgw!zmo^ zi@@*M#h?;Ioiytr?K;2QEi}ZyyfKGqx)$rD=u)2COaMgp3MEMHIZ87IxSinK7D)TkBxPw2Q>0S z9m{}A9mH9S@h5yrJX&q`QFWsiQ1J0L)zY2m5kYkZsA`hp^=)z^qSI6Q_;MLzr&9Vb zSc~{8ZP1>MNs6W7cXtt`96J=4=% z$$eYMs07_}5fG8%xkb^V?+#Hfg(@IOW)VdO$gAhyXktcAi}OI)nUbZhK<=%-9ms3u z2^rO;>jnC$R@&Tp!Xq9zqUiRfTdMNUkJNa?%KA7$GzxwlB?bx3e)K=%ZI6gg-*&f( z1-4svj2v_W83EC%Ah8_!zIv0lPSJ!XQp)K7S$D;u_Q7p&H}nh1)G;8|mqQ$HuB=ULl0Lj5IwwFa*=93bg-=?BN^JQp2^Kl&G|*GtTDf%cfW>s4v$FNBU@EvH>jpwGX-w^@)n9lQx54dXatiNj6p!hD- zmzTVb)yXS@3=;QGIKU`V0x?xN{68xZ7Ic+KsSO0~p#@+_Ft3#M=z8OqOIBaPZ1Rp& zMq$)lEC=w;6#?_+yy|u4(;(TjXN~>$$4ure-)ta)hRTwX!7Px>2sdD)e z#)yTc-fW>S8C-w!b6yW2M*TlnNE|fE&|>;D@X4~t3>W8uZO=st#CI%=rQ@D19q)so zm)RBT^zxq8Rv7^*%y`!=!V`v>q+~NhYR74i7-)d^^p@_vztRK_1n{9i8Z(ixQX9r7 zkh{|wP)=sJ)O5RP*Fb7BNAWS53*WW}3o-&vW_i?|maD870A+3Dl6W~O6_bXf`ecaU zGd#80(ermOPPcI+uR~FXg40WT>xdrValsuC<8mX0bF<|Z36Y8RshB$Fh5u|*o8gmK zHQcZstoXXr(K^E13}}N!Q_l@*soCz15}XfFv+u8|-(^WuFga`zgNK~bZhrEVN8rB= z!0iH9++Fb^&+|gW&#Gje61ZetW+pvSC{5ZXd{D8rHLpu-`yVd;ie&_*GX?`yVYlG+ zTp1gcjj39b#9Hh}A>?%?B9h%O!6)(z+b1g)2_XS2(0fx}HiNe!QzKYHkH?Y=_4(FL zz3ci|d6z%+RA8(_rCo%?H~K?!>&<$>+NR_Hrt66)>#pU244r3*!cykHVrcnmkChF# zGrHJ&z;G^=O0}JLRALYDZf%o?18!?04_Ng_c!>S0L^Y|n@4_jA2vw~ByI*@ zRE8zefxyCP`W$ZI9dAx*1?{bvz3h4YD4rwqj1RqW;& z*K2bD-9G;OJ-pW}e*u4}=WxA3cCO;G!kfP1&T^|33Hu z7NP|MlY8?6r}1^LIhy7S`qstEmMo|5J112^an>I>ecZm=z?G_vm>*|6`@jAx$@Ahx zD#0-9G~bzj8HgoiBTug+s}A}@f8k%gi#ORs`y{8{Ri6=o{`*#CVG!B4=Ikp*9}N%k0>PfI*TEPw|AsE#V2`KX|o!TE2%nOKmltE~dZM zMz{VC_P#PK%C-Af7!?EoB~{V@L`u5D1d&p@5kx?`dxlUcX^;-3yE_J?OG-Kh=^k>1 zVVIfou=jq?|6K3c8~6EsK5+SPdGS2=z3#PsEAAD01zJ1~D~EABUfS|#Gr1<~_&IqB zy@D)15D>fxESl8__@P%X;~$!w5HMkKjz&L|leAlxvXs#Gv*Z)5MX`8Z0}85jL&B7Y zB*xzY;lKxj#_e-Mg&qTU9?J!K@;ljQ(1^@crHwvZE`qC3m8J=`dlbKKlB zH#FRDBfECH4-Y zD|YwVPkGxknF0NwdU)Ot;QF7UPv$W$i>-U`<<+$N4bs1@TP`WVx07U95;vf)U{12PXmtmvLKY{FuDj@5~#bBbup0NTaTbO8pX4CdW+nsx-0 ztncQ_%|)%{G7zU5}vJb#Gi*%`JYOEsbpLN)R@&+h8eT4})z7%rGEoitOx78epFmIuct)E*Tc6!2+LZ_x 
zzyk88ww|L*^{>uf-L)JqA8QHsj-GtdKFC$6tx*~yL?@$0`x6ZXTc%$uOWhPwpNDKn zi-~lF&beU!!zN4R^a1A=GU(MEbwRGCN(L2w%6#ApGG_?j&v>H!iNx*pyZ!_%lB%7E z-wN+?>#?)DZSXS154hWo?s6Oww z2eYiEHI0))_K_lo%eaoMNk*Yf4{Jn=PfDYD(3}^#eJPN!V@=3k$UheMoL3R z)x)@E-P=5jpLZVH87B(Jm6MTP)is`78!S~hlK~c`guw!yS-Rf|mn^O0>Hp`8{Xb10pQL6W6PaeyvPKJx znd%*A2O35s7QQSeCCzw4JG@Jxk}V%p{rsv`Re<&9-<$y9PpUO3vwqDchw&SjmEAb; zz1F(=M1@-5685ZVi-boZC4jZ8a|0OD-!^s_y;EY88k4TE1KnHf=m1D2G(Bw%+>BvC z4(q{Odd5Gy-td?Ve|Q(X*747#+@CIxw4M*3>pJrQdI7yoULjy+P3c=goAb@{l|lhk z+f$vpX`3+csQPb%A3%Y$Wj>0~d zhz_^MrPOI_+<|-$cONSOO>rNf*Hvet&DAiTbeMJng-F6V%WTG7%=-$P4gi6RE+&*V zrwNRQssUk=iawyC0BLgCC!0LI&hFM5d0jSLC`0*qz4P`rwfrxrsHS9M($6`i z%g?!I^~?f-5!44q8lZV2$#1P!`0s~5zjY%C$PzW__8aAU7e$>d8Mmja z2U0CHxr7aue3%zE{#lXx43tR=9D2~&&6l4NA8qw(8*Xj`qqnbc^Osl}sN5D0f9~wM z)dJz34z9oT&!{5Ze}#~gV}-r&a&x=qQ-6y4>eKF6RZ{}Tg{%0VN?kD^^i0#5Co!J`*y;=?0 z?)K(&{SCf!wMh#ud6P}*1LNc$jHaY+4%(Y>;`Pt6iV}!{a^h3OjMzUCCJjaS=duz8 z;3hsU_EWj6?Z7=arAjuK5-N-tw_8m{a_rt9$x|_WahZT?`mk|r=$$pgLf)(7#TA}j zZb3P?v%|tdNTTo}dUfufxu##{?0jK=Vv4%w@q3I$2rC1m zWJ2;I!R6vwaH(Z_)kPqOIL(9W;cl$1ZCXjv?)M;`_wJjI?IOl~LBaB?(&rq~K-{0g zk|qa^MsGFB0BIqp`1B77E-hax1*2kDimCuZ~jOd5*#x%z$TeJFCFpy@;`Em zw=@M1ALL+h|4S7}Yxw{fv}>`nMYvNE7ZML3OqoGw$l%D=ZeP&ek? zzDU4JNA!<}h=<>+dB&10iJrPPb3)6&i;H0;A1XAVK9nL|K7i9pa(P zm$Ao_N=m`A&a-S^44NFku(m*o#nBlr$YNq=XLPO#ZR1{3RcCS-r{46 zBpif>Y9OY_Yo@yCdZ#C*Yi+L4qY3LrCKZ4?U9AA~lTf>sOuQo^^9 zS3P9H7$m-#^nT%S+h1}uUP`p<<*Z&VLREhv?H?~5gN)t~IN6R9ve8f9CK8K%YCAu^ z85&k82HCenhHPG=X&NfDJ?Rs5sy}*_9+06669l`g+$`Pi z@uD@`dvO$S-KK5FV6NJ-6Lke9CaEZbYV&M8yKL9ccXLXz>ljFP%D2om{e^$9IuHhA z+QgDLKMtegxi|g@+f4&7)&X32KgWieRPWoS7}m+bvSH{G$|1#L?$X|xu~M7UxNbar zjoalbkbTt&Wbn`e7gFriVlZ>a0uq`?rWb9s3+(|;Tc8P(43D*XjZD#ZcazmU$tF%$ z%ilL>H*lY%+AB}RJ%IFkz)>1N!wJabLDp_7jmwP0oCmzI46R~9+;#afdvr{p)PXe+xu+A)9&?#s zW7{%gVH_wT10VX-DJ0wxeh5A#DE&?(VN|;;iJ5BH@yfI(dhTiGKfAGAx5qf%wpu+^ zrtkHHN($Y6({4R8c6?gz7SPoHjZxFxCjIotAjzuc>%d)X6tz=TS zfL!^26ES;%vvjH*y45icEMUya0K*^oIvY@9ITITPK(RHKFfM5Vg7XS;VoNI-V$q@M zC9&eLwlz%tQNHlmN$P$IHZ5BtDBhtQZo8~YfODe_W&cDR6hb>8Yj6VF+hH}>0LoAJ zZ8aM=jB(Rq9sPZmC7T3nUt4GRnjcLGs@wHBG%osSr6$SPrtz?$&v76UUZ>@rG}5fZ z12+D1c9LEkFu!yK`cCOdFU)Rr)^1!6+2)0mibMN0ssJSjkMEP3$+$i_- zkm}ENwQK}(u|Vb_5{t&xY~)!iM#wORot5^T9(QK9X0;NFVMh7eMnKNi$0z%n$u3j3vF^*6sBL~;plBa= zSE{|0qSl;xOxvUXc!fvitiE1N{M~TIi_$SsTAJREA;a_1U%46+>v!ser21PLY7Z7% zplGL#QXsj-pN#Q3r1rFUPnqkU?cpn#rEJdoTS^08isG|?oXE2TlpQTb_Rwn)Q+mLi z@j^9F#M+m_$0}sxGLR+y!y9s`XzE7QcmbJgXLT;5*8F=w%)3>fxS$+J)C! 
zeX_KPqhYNMF}SMrb)qVAUK*$lc)0y&uX(0J0`Bzr<@0x~bjv?teM#>+s4!P_Ez_$H zDCj81m8IDh$=Kf|O%gkeTBms}Yh+ub5IiGG>@=NuooJ;W$Il;;^|IzTWb24{l|zr# zE={%doT|CzZ$M8%j=ULMsD)fJ<<*HThr~9QCZwONLCv_{#wUDi5?T(sT3fvQGPcg@ zbtmeV3>}5}2j9A0J(p8H!5LeOk&~!~sOkE!Do`NtBBg0g$zUaxOA6DSi!vu9a)OFu zgixVJk84X5TYReR4Hi?e`7pq3J`&5@{Dkj3WfJDP9CQ0)>%nH7)i5wD+G}~uzfwb?m?~Q15d}#So~?S^qrIglrPvfvs24DZ zy+d8cIsyg;TF{iXJWr&V{z>1s5Dz32G$KUOYM@_3+JP!4=r8mT95O;SBmNFm^Imne zFOzgzh#8^d_CAZtsU$U-_B$?@Oe&uA?)FUS_~C>{X0BaNuY|ES&{F>U>!Nh2?U!Hu z9Q>VvDfwkvd zy7YSftK%dxu<4cdLG2FTqKpMtbLyTVIM?(lF6Wmy3vb9vB3}^m4Z(l5vM*9=rLmzK z*&EI8lBzyX%bHz#)f!}wE_FQLY6tYdc&9x|LVER9gSC93pu|X)@>yFN*OP94;`SRN z`~7D%DEGgNJXg60fH{VyYBqOH7JnTq>b!OB?x)5`!2_V2u#QDh^W~0V(9u(o-N=FIXxG4QaPWBeoYqeic*7sMf4!K|1ze5~Vdz$Ml{0&HA1++4S2P znSjB8Vq-^4IUrq8F&c}eQc@C}SCQNn zR)WnLG@|NqgM8EDt?$kSM*6z*qDS<$lR-ugiJkFXtXr#kod86_7&Qy7g@F^nF=%C4S zLQ7_(fV&usqE28p1S}|{S*Nj3$c(MiS@&ODG|;*`%&Hp3e9#8eEP9=gc5M~wxFX^^ zzuCZ)L{+opgUaSr|mr;0U)h>m{#dLPxM`;Mbt7 z`?SW_h~(*RW}iaMm5)~B-3B2xS!%(5xwcb#wA*3(?8RtW+3^Rb6j# zOC|tKP+xT`Zn`~trx&bVJT#TOYkC{B4r!U=LxJchXReU8T_SmfB~lUgDZCSIXMUx$ z+_6A4Z1|Uq^$WGVf=!iIuY%p!@!J(>sdQR=kRL~h!0SB4>P#383v)&*E%Ezm?d8jI zctMHOR3Yz^zJDC05aeihx%VCD&c3K{n{wy@@iQi2CSZmy{x$Cl7nv^L0n@7f_OY4v zVuRD1jTl8Ipz?xJG;KCo z&FHcTTE9_Z+>C9dVR?5UkP+TvOX5!zhsi-aI_hJEBd^^z1cO4nqQAGcx~@sj?bJ_+ zP9-L@LYIJY1Q9n1wG`2Ah=mO~G(SB=lkfw*)9OZ`l?D@ircAIoP!gt@oRpMNX3UzQ zBREjJ)yraFFcJ$#8*ry>a9dbbIUv4?wcw#*QT><)q7xZcFWw`h*sXolvn>?523psc zq`nl|u`XxJq&GqSOY4orMO1_{k18QwNhMhUSrvUt)b}2I9=z7Wa9qGB;^Mx6<|nFa zbm5z6SO1Yle^G9p1fP(!2Bi^a-mLH<&~aB<^4YUzqMj5=7YTagg4vAg&C3!b9*P@L zg?#?JYwBTan^uV;x^(ekfN=#-$rpRTyFH1yW6uBR)GO;j4F%mqT%2@jsz`f%6%E~* zr44h89CKCWRZ>sAgG+^Bkk80Pxfc6rkOr8&(g{c|{wgr&E3#;$kuA2WVpW~`v(R;p zOMb~zTYd?eMVzJhCLhm~Ov4w%_VV&t?ZeI%Jn9=0%%9oahUoW8li5mc&AoaP<40Vk zMtY5|Hm3TF?4^o)L17h{qJ6?$?x4`xC zB9k7+Pit6xL5y_VnbFaOr}5V#$VLXk-OLe_H|Uh}F>C%O+Q6`w9#}9{$LHsONhm5H zDER70D_qrn=^;$8nbsGyUwEAo6h}aI|g4&0TR;aQK|QN z?c*nYP_+2e!J)UUax5_nv^g0l4r;wQzI=UMgIY(7?-=Ey!`H_ zP+ie}Cbsdmu(H-353`Np0wmZfSL|4mo=RI=+uU$I6lin~R}vBqr{tBL>^*5~^6!Ly z?fayhzbam77Vaz)Hzg|+tj(XX$yeCKq*v|&Y?ukMY!yVz@Ben9+$Q9Ba5C||D9nMe z)a6RNm?CR>|5w7X%Pn|W-k`sg_5YkM!0641m)^afo2H^D5H@_Wnb#C?db&dcFX6^c z+9TE9$XN@~qv$A;L3VC})fD^pd?-}xI(LF6&}coz;D=U$FO*@Ue;h|IbrDIKu+G> zF6a}qP@>9|e9rBBAu5t>rK(qrhb-$}6W<4aWrJs*HR&!urDj2^Y2O#~Ea=#0iT<{oH22B)dQ_&To78-R)26xz(3Z{w zu_e-idFzt-JY>V(3vKbX&x5f`2?*v z7dLecQ{IorQZr^r~&EZpX@ar107UM{>{dJnA z9mQJ)Z~Vk(m`5g;q2KrL%)XfoV+Zw#WZ^;(iQkT6EU zs3p}Rxc4|SFy{J#(j7Y{{SQ{>&VccmAv|N#?C}@vK7^&_D&TNE=SdOaZ)*5WI4r^S zgQusr2^jBFv8!JVos8MKM>^5Ou2B>%g^go>rZC*x^e)dYlDU<6lG}Px4(NT#J}se1 zUD1Z;G@N_Yk{Pe@dh*HqG@M1YP+MTx8?$*y<xQ znvr)_9#Ue7A{rv1IwH$q7296YNU7NltzWJjpCddWTZ3sRN^So%)PcR!cDs}RC9%y% zMxhp0m$d~I^wKs+ol+X*@0N}tDVuH*Y(cQTJ6=Zf#S5nu!v^k=ZdsBO+D|P|5FDN+ zBK5df)dGvQzIJ98axf$1`|I%W0zrAt&Yuk^tKr=SRDm22?`A|PMvoiNhVO3(Sx?=g zfW+OCVf4xv=N1bDY46ODe3NNx1}f9U{M$~~K7)&1+d=9|HEW65M$~Myfgm53LN1hS@t1vH@PZEAHrR!8kH!**~?I7gQ9HAC7 zsoAjfqRK!x9_FM6C>!am`+OG*z_jj>NQe47p~W4EI*qQiI=An-iV_xnQM^pv#`?u zr($029?=8ZM2N)4V@3~JInr<6KdK@3kqA6LBayYK{o>bG>#*+d>Q5g@1$akj8hBg{ zOW>m#2C>udEjmk9oo@35?=J@s0PYQIUN#l7Z3?mmCJ{8A#`yKu#wKil&{p~C*IYES ziXX5^nC@^KCLzh%+5Zj9?K^)d&lTu3soh^d$wNqyXs0{JB*(0@qqYf}t>#`Ovq zFPnk5_%r{x&Ms(gcjCc=xZE4qh=Zi3{yA-UULo2nwQ0e7gX`1_i7cdR{G{xH)WK4e zQ!I8&Z7rA2$40J8vUFX{NfMk;JSQTbju55DRx+`tV}&~0j3H=E@QHa)V*^L*e{6xZ zGh1M;PSJ0$Kud*hyXDb-J(~wE#!b#^I85pihGH=F;97h+Ne=*hqa6p};( zsWM7kG|?WCTZ2!QcJGo#ZH(si0CV0=POB-FbPiHk9^JDLc&{3(9&2#Nz!O#JdC*_o 
zUH1{O%5OK{(};~Iw>XIbfeln1TcdK{n>B1%nuU8IlLfnrG6j2|Q`CTRc~e|QYvMw23^OAq1`E6(4j#)U-DUOQY}|OCb>bbG-A%ZufdBpbQM<{? zMkI^*?Mbat%stXJ4$bogxT{eb+}T*}u&heg75bVBeeWe**RpLAf|0z-d_kI9B?0zj zm+{2Z&CSZ%(c}Z}+MB6rxdrK|4_CdLLReWCS~Np-9#4gj1L`(|P5(dJnQ>YzEYm5}&gEk6 zHTE*tU}c^4Fm~o*Xt@4?H9YeiNdilBbBBFY`Dt8p+te8kEg_a+qcgn zo!dR-YYTI#bQ7UApBm(Vf2x}YQTM{5+I)zeb(o!ctPYC~qn?R%yY$Mfj;PTT#WLHa z*?LLT(w?Wwn4o9Jq0+>3A*=2gIl=37TaFlMMc^T%wRDC!TsYiA?8no;q3{3DJm3oD zhZ-R&0waUs&&8uTuk=Di_X%xgfE1jroFQXg?5(Ss!e>&4D`pP_p>P88ME^6X@c(}D z`OYN0Ae~^)?t39_h0|UXKLQ}{)3T*?U`_kci8SZ3EK#BKqW}*TyR^Y3OiigT2k<5= zUre%a=L(Hc1WsdBjh9NC;4PMWYm;zFTlosC34KKu`pSTODww;>#=T|2KL5 zXY`}bWA;DU;eLgUlyw0UagwW?3Y&UC4u?V0V?mp_*Wosq()#sN@CFCw0qlVtoTkfT z{@eBcIk&%ga=!Wp6+|ijHx-2Q-rQx=sVXSYZ+s4*B;%xi^0X4rC+PL-B3&)5@^>Mg z$Wha2_{SSJ|7T@?^OQD7{3nwTVWz`rIRfy}TRKXJoW^R)sR}Lig5zwaxyL?DD{?Up zhV~So<)kML1G0X z0WZ5U&>|k_qWsx%rsh@C>08Tp1g8AwzZaMECnwEIA*+^v>n-U*2}C4jgMMC*s~hBY zpjQ3@bn+gmh|^jbz)0BW$7!5DVDEm#8AzZqISIYZ=k8?R4S$^$`hsp^_TDwK5p!aI zu&WYRpl%1$HjKCoZsh{aSH)#dfN6$}>sK7x+*sZiX1$t1pxUvA5iIi=|>f)XYz4{^In%5r+Q~KBCgUu;m&Q zuU6nUy9NICzghD)jxUq9^lj?z@e%=4WgvSu$8zxkDt^alh7w|u7rZKfO<_aqZz`SQ}od=qK zGJ!Of2mrt=$fFDL2h<|;e}f$VwX3vi{{fOc+Hjq7mBF7I>M}Tkv_tN{1n}GLk_)0c6Mm zy8G2wu@1WhPVa8M2Ij7XdZE0NToe8D(Bd4?lXf4vCW#;_ZmxRuTrM$?n6&tY93RyqISw(W{2=)Tn)L5wb-+)iYr65W+qlK#WjHiJME6~W-d8+0rx0`B^ zqS_&bLoi6s!bgb7sgncWd?+C@b{?Gbar_uH>@dcNe((`!{*C!bpM=vh9z z$$q{?@rRiEoYLGkf6xJcn-$NkS6AV}W1(IgjdECZ9l5!*IvN2WAvW}aW|8N!$#g{1 zHx){=k3MSe^sg53O8<+)o!GlraUzY{r)nU&C~DF&BLeN-P{S=vf7~IogBMt*1cBH- z@S7VJiyl4g)h*DeR>z`DFR5J5*j>2xTc9qvOKA4F+Nd39^dxsJty#Z4OK^Dqc5CWc)j!%ZkD^gNLLQ_F|P;!{0P1!4?8^e7781V``%_59LARRJ7OU8$&Ep!sEm-d6OtC*|E7%A}k-%=NCvcerc~E@ko+ z#a*loZfkUKw^5}FpvWJeS7v)v7R{kkr7CEt$>J|pf8*Tj}UXn3zHS*Q5;%j_xtl#|BVu)e_=STmoI0>=Y`pVfxigktFLrt zE<4SL2?;%G$CD}4?w`$kf23XZuK&GOjEoKW>ay0ezk!PX({A&jmrt{tECW0fkuzgI znnP=tSd@lS+mWEY$Japwpzt2U(ptA%ol{jJMAiT>TdXcg9A>b}^I zkpAANac01!SZTOCldmYjX7Dw#K(kaH0F~I4QHS5spYo}XQG3vf<(?Rm)nu11?Y!F%>M=du-yInQ z3RR!I?4Q8#^>vx+-kpO_GfQ-DsD8)(SYmv&Jk{$M?xwtEV8QLD1N`vR49cl&+3O>{ zo0jU4hhazMPu6Rypn6lrqAB#_I}Ye+uhLB9eyKwF3L-2dRk%okZN>oJ3FY<>2ggrH zHTlnS8C`y$t%_rRk7Zve+~g^IUgElE1ecl?8tDagnpkY3Hd*qfu`f=PZKPEu6!3cQ zy+pXI2pD3XgznEJ5z}BBw`oPt<8n&BW3&4;_?dDskqH783A_jszrgRjP*QC=P5=XOZC_2NukW99Pj}A4PXGy$SHM$fakcyv+ogE<% zD+%tg*T`PQg-4bgkbXufRLw7}7AwO&xJ{VZf^>Hzv%HDlow?{s2P^$iqxl-stv7W5 zt{kDY=xN)#Ny|4XEl8TIQ5?IW9Z7%Jt~sLA;2}8y9p[{nDX{3WX?9_}sg8x^3D z(hDS=_-nZ!3_^YzUi}keyf}q*zIkOjYtGGC z6Hh}iYFmJjq9V*7(99i1S|KM~8(0fjE%Ci~_vjwuc*-);g#z?F@M=}69t)yc<~8SH zWBSQd56EJyAVS1p<*8Hms3h?DN#N7}?(^4(RdZiI%Tvo=nf-dX6eI0?y}Zz8w{7L} z3`V++@!^0Xz%6Uootj@73dN9pyY2iBo#WWtL>NrrzU(LrzN#byhkd`0{q-4_>IC*mc0~2#r^=8aK(qW{%xB4vXRg& z;u?R@W+?+!yC78}(g`x{FirR_A}%K!zyWsgrfF``hp7!vdY%UPEk;QO=rVFZ?~YHZ z@-?Y%^;Q&D4lPvc9NB>G#=(7YJhSq8CGFc^@}2&l$P-SaTBxmdC`F>U33l+yXsmQUlP&3IbVYx%iZo-zJ_nGefh~ zsLX8W4jg(cDKZP(0=)X%hS&L_4UfQ2?Khx9l-aCaJS81S&YaZd)5eNhw2?T2nuYX} zkD%=#%2u|fltizHD0X4q%Y{A?3@q)Ks%Y)yj?vKiV@=DHd-fnCZ8`($haZ<~Mk#nwVHSBn68KXYw{#F1EzLi9l*mTI+b{hEgu4! 
zT*j@_BoI#xcc5X;y+Tw6@V}z3Rk<#aY-!X zfHHnuA=kFh9&GaK`=ynvpZ8rt+I*f00f*ba59Y%(`b&!P`M1d`C2@{DtLo@6#_ySBxJYZ)P?gYZ*Qc#kkvu}+v z5Nq5h@i6XPXm_duY$W%i_b9I)bW$(xI19^I+#|hbA0a#+7cM++SQ2!;2`hb@VsPGZ zw$3T@ZFeNVcH#<;j0`2|YQnMyvA?KirR;P))N7!BBSZ>!9-Yd*nzmke^wX}Md*xaa z7U&F-uujy|^@)V`Es&6&RfkVSP@u)~1(55Ye$#pFzKGI=u> zg^e(ES5@bCylunc$>d~j=rpXzKlaxig@M$JnD{eq0}-YZ(JeRZ@ub2v+s^8{8QNKi zOyTdh?3a7ouP_1)dJhU5#Psl}2*_jpHQp*;ISvF7Fu10A)X%HrT&Vh4`{ejBHctbLjQH?>r1QeF{9=q zS-mGDwB(3#kk1z3R;dT)q|N}#N&i-{gjae8N?YX2 zD5@E>KkViusc;=@OFM3pj)vqYT<6G5HBoAW4d}uN$dY3f4mn(b*smo zDxWoI^$M?aWYc3@ zPbpK%^K+SXj?#7CLp_S6(RY95-Z;oT@s7D1|Cs`L7t+A%{;iDBtZpqCL2=Gwhd1-bC4XXsE$;AeCg_WXt`i zF0xuv3J_fk7Mo*7Mc`-}dE@tT*)(91E zlY0ab6Re|0VM+5d2IzmwMa3Q+TWK-gy_cO-**eqFLNFHl2;=^-C>uyy>(*5f@%^IZ zGbh17Q<(L&P;Hu4{#l3t9~xhTt`P8l_WKs~mc2n6?r;Smb0rE{eR^@#lFUIW#8Q$PZo;b;Ne{6p-Hy}Pz?RxQ zd!TxMmE@sG4<(!(@Mrq?W(5Eh)k4o{XLwAD$^0Q%*~so$3S-hbfNP(LEW9c8o=P?* zvLuzB@U09}bs%VQj+RBOkCLFOvc4(GQ|QgN+0y~e#MGgudFqAab?__&#V_to$Rk9i zj@^8&OucNB%B@R_G+*TIQuL`{GJ&ZnoR^%|c(N}dV`1!o3V+^&a+FtIe^w0dT2!LJ z(R7_Q)qOuY*)dh#bf({Tnu^L(bB_61jHF1;x}&C3;Tp&?^_-f58z{&L0D0fA8Wsg-M&Ejl37jl1|KZ-{C(oYO{$02%}8E#k`oAe*qI7F z+=l8qGKrO@Hr6d+tipDAX8^AT!2yLP${L0}(m`yjDPt8Z5=QI+qJ{)CU(jA|sM?AW)7|q8e>k;a>s;>2{nZnl3StM83wi%1= z<`mLrJqwE1rGjri`PAfRmgKF-XTPIm>gq9~KsPA8GV9_Tomao}fC7>Lj9tC@Wz=G< zt z3MlaCwJBe5y>b_wcrOh|B)3yn@_);&~^G_#jA$XBi}LU zC+pFT{&bGsKZVFlcFK_{uM5mx#p95Vni7CQ-TWi4CYljbin7d9-^8zi?Vg zE@5LZhgMym`21nMClV93AA6I1>=X<3^=2UoKqd&}X&lv^HX26sC1Ctj3twvig|4n5 z`z3yFTRTGbTie+7Q5vuw-;cq@Ped%VOP!!z3Nem+DUzxuqsQ>%^uT=V=FV*ZSojEn zT~+XKJjj6$p@I6?BlMN)S7sa0Z}NPf#>#I4rbom*yXF?dQHnLxVY>aoUgrn?l=VrC z6B0VQDGgp7nFficP6r(lQW{;AKu@-Vm@Y;DiF{V-mu@ zddYwgGx?m8_*`Rl!^w%VjRZC|HU-WF%-oHu_Sk7a4WFL&F4W(%1Y8Db@+rHY*Zq55 zHl`vFJZg^uK9T$l*a@T~tyr8B2osE$+zG+M6sN8;5-{?jwl$VyWMB%7_9FEA209ESj7Y^UW50 zJ-RiT-)+}Px4!KGZ?&_RZp37aW^y*L-I$|8gf`%QbaFc*@h%ZQ`dHoW9ij65H-5o{|{ zS;`1u0n$D9Nn<#)zE5nMHc0}ZnBmIJvFqH8m1KCXJVFpnM}ajDXDZA-bv0J)C*&I2~V_v(iHMWjC`4H^Ld*}fH=Yd zkZu(Zco!Af?(6ZAcO^Y9&(mh-wInbBu6CTBCd68_z)oSUF?LQt$ zt)3dQnP57z`qm$+ITv!kuX;V}2t~`MxYorMgdW$@LF}z52&y_wi@|UPkHd($0h50D zvD7D0amcMz1MuAuaQt=1`d2HYM>|&Xt5(Oj-t5#{j&Xz}-;rq_YpU0BqtFvpo9P@= zFeC>GnVk>Ea@1tByRmH6)Kx2G`@0q)dt2;Wn5$F0c5W2>4GI%7rG*t(+@E40wVa|% z$piAu@bcnstRX^>`QIzge1kA1?+>bU zB}W9-*yxp&f}wuw-gYf(y=}Ixe!;#AS+2SV^OT zu4`!bojMW9T|1?p(k*J^PFL}lc1k4&B(C2h^*ri#(Z4V)A-UekHk@w* z8Oj!(&0^HmFo3aj;+eaA-YT%Tu$bG)Fr)r}`Q)=&zWU3}e4DGJTn|ljw@&vpDw|YA zooiTxsiF4$B-K4Oe5Sp|vpWaA!#jI9*#!at!|;!$nR8(9N-W??Z@LP>oWo8|-a2)Wto`==|RatO}hh7~v5uNGID}r{-Uif0~XL67Vs#5hzAbXJh*6q=J zlu5uAxL)@F4zvN+FNaseEUIszpZ?)-p>jn-?&<&I9J6oWQHF! z{j;V!o~E0X@$;_l?oKHUnnaE;8nLtIx~nx=KG3Tl*+QRe6rZ$%I&rf#-`s5QuCA8? 
zVYFE!c~UV!QDV`22a&(MjFe_F zTu}PMXVA7|n)!}r>%qY@Pfy3gUg}-Dja*?LNc3;+nO2;Gxk+J+8r*W=GzS;v)DPyh z^bW$uTUKn4Y7Kb}rb*+XYZe+iTR-Q-KhU5(5;`6{he|1C#j_Dr_%_EY&<1zJSdOfN zb|5+RNFgzoJpAef@9lgtqENbn2q6LFqm|o>E2c^0swSh7WR4zF;y}Zw?H$g_1@A@% zdgvs|s)2y?Nph8y*|0YYnb`<*TR>#9%u41v@n+^rXf2j!d_G4+xAYrat{pEwvX(gz z6l(HDmhKpFfXtMSAO4}&t}Xz$En>z?9SAF9vE<5}HXjBhCt?YCR`X&8T$7&+aOR=5MUr4cIjbc^M58 z8_68%4Z@t(5gAuKJSvJlX-^Br_Vf$gQ=(gQ?DX1imZ&B$QJINa1U28PU=Uj={P2O4 zWNc$(*BWiQENA|6X{IZ?zA{x(uc&3gDC??XS=A45^E$J=;WT+yeZX=>gXNsL*SE&g zY*xi=0C;Flhsz1L9# z2YID#LUugls#CX*jE32SEib5`*L7uH!HN;0{A`(T7R#T)45~5{g$Eb*^C={CwkFoO z!bCP~*QYmgqg&s`3mZn}@9@CCswzYgEJRgSK(XIk4g(dQWY-0PUIh|<{qgRJUwKGk z3Zs_DV|J>~4A{K*;`if&<|mb($^I96?-|royS0x?7Zp%>Y;=_(AfWUf>{3Eh6r`ho z^xkVgL`0+r(n|mZM0&3wARxVl9!Nk+=%FP6LXzL+ocGLo=0Co3o{#?z=fj?HMh5oY z>)z{L>sr@!-TM@{Syq}y%VPPf)?ZIBS^@nzaq_pUw+zwjl&sS|gTF}EF=t{sG9*U> z7@H2?ynjdtn>=@($q}*0sI!619t>C+xER0&JOh~mU!97|9{uz?HB~+n7FON%(jU~I z;@8jX>KL&_odgQh)zUvPb35f|c59xpfKhEUhAz$PvQW{8AAFf`a~k^^({hTQk#CmK z)bF&}i_i>8>a@#1Yi!atK3B>lcu{%Rw7^r&kL=;^a7e5M(dz zDXi)V4t}I3S6*uok8gC_!!=zpMyo#eP$S}{buLVNq?>Y?smH%5G_>4~wM6?I4meiX zc51?TPQQKwO%1b_a9T5s+xW(IqiIZn5Hlkh64n)@hmj03SDn(EXX9;@y72+*e}Jd2 zt69@Q`5O;x`S=#E6`=dvGY zHfjlOY^4b2Z6x%MXMUx8#>k4#BZN*m#f*yUsIdD^o5D>t9Qoc#8Q_J7$($y{?K+2`9$E$F#&`z8~U+zJ~C=Qxs?; zv#E$1aNo<@lE7iC@4^4Riqb2NU6sp4N=!uiQHgwq}JP?nIl zTdY0^rcP53;&*KPz}m|*^S3A6qG#aa!^C}IDc3Vir+88uTA3biJFt1KVrmEMe5wxoQdD~{#3z{zlxu>Xped`mu4w%{ zRJsYniJ}{4F|4Z2P@kPSzqd2EX!3~G>9Z6=7ytHPN(ed@K;?MS%=Hw}gJW$`uU~*l zBz1VLbB15enpg~K4kXnkg7C}3mZA$kdn!fpIkzw(SeV=UX>VgrrNYWO!kAf#!T30C zR*;cCk6v^NG+hmFe=al{^RvFQlq>_LXJ3;{!|D13-svK^JWIo4f zAZe&s3V7`eAYzpiz4|0iezmNU8B#MW&-@~I)?d}r;z_lUrm|hW6$$`2ujUqR--|vN zca381^nz2xOAP(r2BkaR2{|9e#eMqW)NQG%qm>58TeDGVu zss$HKZB5u2HyiWph818N-W<~_7^1$g(|J&F<5F(t@HSOS=i##d;1uQ1FIZZoAxA~! z<*GE(Jrh6SiKY%CV&{i7)vE1qG;+Dxyu6JDP+Qrm6=0Fkz76G^S_5sQF8F(`!Up8K zn=flScSnfnDCwc!Gv`Wtl3fmY6tF)<%U^gLql%k$4Qm8)3eas5;n~chc`{CO42RRV zJ;)nZ=`3+me!dMt+9@|3di^Ah7gH<7M^3Vo7;_L<9WHz5PA-c~X=jToO-K4^TsBkM zbf((`>D!A960c!b`ugG0Nv|ztHC4dLxr`p=so4>6iSv+pN8&RqNVYZBP~%LlsJf z$)((n`>{xH6A#oP+EB*;Bl^J2c39lXMmWa*FMbzk5Vib?q=}%3DDa3+OVU#Z&|YX? 
zfya^uTVElI)l`+3Tkb~#JDa&XKZ9z!Pj83QZS?&m^i*ACbKLzW8%V=YD*ow@T%8+x zzQ3qGzXPe?6vNE_NJ>~R;JfO=XZz#r#BzA>=EQBN7}sR+Q+UcAZr5UPWl;|;trHr} zTD`%(7x-dgA)FrKzGly3gwK09dWy!%*!{)XOfSY=-73l8^m4Pql8S|2hPYi3HdLVF z@1Um5BKzXmAd-fcn+j*$^bD=MyAfnlG@dCv6Vt%M*On$4uE3q)(K9={`P^)=H~5+IA$WBauh%A`0Ig!l2U6 z_H3LTP*N^@IyfQRU8(&6Hq&U9rFHd-&qovO+<&^5Fd9CmeOTVpH7ckEqy7ZRbi!n*&5w+JGA+Hj;vd)+>T!39 zF1uB!KY2WM2;VBqKjl*DwO=%MJ!kFuOq>UvkhiOVWnE`X-o5UkChE3KLdk-E4Sy@t zsy@;J);_6w^LYbZ(qj2P(5tz$aKHz6a-lzF*b7t_`{X2`(1?6uW-5P zNe4e>53jU|KT-7k^ACX7sqyDUdF9eADAV|$yMH^4+%4`vuw0A-PZ8$g3CI{ieSUti zI<&QK0V839=lLeZ9PMx8Mdr>zR>5??k6D)FHE7HP5Vxc3 z9tZGr?B{qy-|}i`xVmo3`bCm6PpP#@ElN= z)g1}sv_h2)!sp^)4#6p6(e~JXt&d_+^}0@b;zgnC&=LQy;+Xz++ZJTIvjqh zl@rr85KcqGU9xeoM1;HP!Dyf-4Vb|!RGLNqG{w_%aQ=}~n9Bj@UI zNFL*$Fs(fh{|smrV$a7WQ{_vkyXeE^=L`nizp7j4NRR2HiayA~1>w~eMc{&jH;P}@ z8e6wvT~DzW_Ko>YoFnCVc&d)56)5afw z524H9Q2*Sh`=LMp)H(u+d4v~o;nN5W_*23|ft%a9rRlk~xk{O7Jg)@)RPh%pJ5jBg zk7L?0Czl`&PbrXk1<}I3g)4OiSz}S00EI%X5l`ThU_?e7Pcf&_2XoF$R%avjMGt4u2%ZQS>s^Snw9AFRUeEi{Igy0QU`b3W zJ${dpe>BH;_X6yuDDzhkuFfB>LHDi>C~wCs`;SchhGlw?COtS`j%QP%J_+#w7#xg|cn)o4yuJSRfQB)PjNEtp;MWhm2wI-i$)pa1@} z3CUZuYNts*|2U6vI9J_O)<OLc|0_98VBTE^#*;gwF9b1xRAR z@+YIQPH?%`W?Ku?OQujEW1c`k@j)J?HA2*Q*GgW-^2VS`YMMuK;$<_x+6zoeN2-vi znX?5ZnPl^DmTOM%sSYi3lP)EX_sN9JfJcaHj=ir6h+DXzORtf)mG}1DBsb$E;BF6_ zMkO9AC+#7n>lXXPv$o;d{UCIY5^Ai&^cQQJiRrgz0*F}IaJt(tts-C{JibV13qF97 z9hSPA+Un*P3R+EYLZq=t)Y@@(Fx-F(T1f9ZS97zzA6K;dmmLb29_~#|2aT>PKsy)p zj3o;6N`~&4AoHB?!bT2aK|DUysrDD=H6aY=WI!LVztjS1 z#X8E=zUjkquSE$KZ?Ny4@9mM~w%~I+o$LeewWe9XCu%$RX{~!a+k?*tu-59twSk^) zYPV;>$cL(svFTXCWPbDKer%c_%O=)25KvX`PF@ynKx(5c7x$!4@I;F>VCp%mo2l*@ z&_#f`4PD4a&P+V~9Tac&r@%kSWYB7SbZ-%;ZStFVx9J<eW5)!!h~d5)C*sd1Aq0mgRp5+n%C+V8>w$dV>aKlK!kHn(4+ckVE}+H_441S z_i!Bx)$>_9j3#&iZ~0EQ=G63Mt{PE7)5GLuMC9#&-I@C5wN`zE0M()!fX53L^NuSld!GHs18b*KA2qR@UBWs+~kT`Axr7Gu&e30v& z2Erc&U}3k<)UIrI6dFwMU)L3g?CkWCD-?^mdv8s~=Insj{IjgLrf&uG0m}o-_{URI#SJT-2ZyQ5 zS+{2*0WEgt2vEcm+{Y#Ce6pr}3QSkBOMxI+V)5{=h?5pIHa163%~j8M2)SgM@v|-y z>jMW+Z}QX!H-I6nMP8%7BM7@*HMtlCu+DUCeAe@^T00%L{CuwHFIoL=ZoUMIJMMuS z*wU*PU>~{_yBy0s8aVI4xB;?S+-uMux<0kw^S3_Rusi_zbqc7B?349NI0V>i0x*-P4Ws@thIG-sM0J5(l> zj!arfIFS@N2={ysvN$C^!6q6?5?HRE{y%4?KtuzBcME$nA59$4SXC-zjkr2cRME|89?=OdSLR{o`w)F6XCj?umm zW+!iAK5zdcZfJcWru6DgQ^l#@wiiNMS@BZ!@6T`wv#Cs}xN^;`XkQ&XpoFn!NJBic z-}K`Bryznu&&CXf6<_p>kc3C=7WQ5O-2Fyj**#_OS%tkM9aqKs-IU|(icW2cjRM9Qaumbfb3vr0?~-LBk$0 z^wRUDOLKI)wf_yppdz?KGm5S45wXIn?gjeFPWcz%4ZSKfU{;fcDo-yo<>lVJ{ZyFU zUcylX;?6Ov7KjsAj0*X(}RWc=?_AdTzdvbKE7R)1LC8tN4Z1^pI0@ z{a0}usixjJg||mp%CQ_uN=jtW7eAKM|H;mp514y>&%&?B*MoX>TA01Vij?NAI3@3H z0yl9ijtAJ4eSs|(Z|?U@ra}u$qc{bsZ<4$6U_zRGIztK$kGNgFH;#~OaFy%I4{SS1 zYD~GW>`ZB*a}yMuMiz{QBs-phL(X%g4a(xNE3Uwd>#ZZOi3EY43~GtNmsSw{XNr~5 z3F>2fvy~3F8t?^&#*#>2}6gqw(rti%;CpZhgX&pF5}l z3|p?nLCj(5bw&B677&-YBX1Jy?yc*v3aV`eoO!G(HR8CK(D^r!cl_WFE8~8PlsJam&}KF+ntv0a&Wt0*cC;~ zP{FPTGGN*f5oyI2U0)%D$IHMkrDaz7)2cg_eN2-QQImJ!Q@qWmc!zK8{#(IFNDNBJ zgmd|(Atid+6-&3|moCSH-FGtND5 zNtAN04QjJt^o#0%XB?oX^J$P)pDT)`ig-;J>~X_^ZTX4Yy4tI{@q#Jje595TqK}nym%9E{Ww5^2A8n&y*|$yILq4w?Ru;|=W~$)^*tl2BG_u#2lBzEz zR-I;;^;&47;XE_ShSEtfz9Fi6G>k%_KN!tRDwEr|bE~sDzQM6UT^zoDWJV+a$i$_tHj;4d#F-+afES{5 zWdVjxm4L*f>p`ON2E)slIT`(XqP7#f{feRQQr?X2D7mP`J(3Q;j{=zH@E?FT+V!S7 zTz-2*qBrLBcECDOscMvBtepn^l%F@uT*Gc1qKN^9a&ac|E=Ec+s^NXNHiShF(+~D45d*>cQ!h7|6?o{I37g3XU=zciUCC1bPqZ zFDUQWZ14#)s9BpO7lyCaz=?IBf<5vCC0FTuugoho%%y4%+|VP&Grp~kg2q6_6lC(n z&k0I~KQS;u_iKu{NxTGXjjJBim0QARc>VXnkLMPSmnKaB_pz&w3e2s0L|O#g@7VuS zlluP=KgVze*R};O#GhE74O~$4X+{CFsK5s&J_o{K(nyW*ldP+hD~P^?WWn!E?a!Og zd4!B|uML&PZu&#HF64l^|5kxT?DG&*FZ0@%)73HnY^9E 
z)MhfAF9chT7aH1tfT$W{MW#+{K0OLcsrs-p9;pebXg3JxG$Max&ns zecaK@DPU#TGD|IS@!QtNeT`fi z<1f(Bvd+QS7u#{-FPpp!emt>+)1Ox`HDT^}C`4PgDZ}kHat=MNNwleo7DfiTu=OzN zBrIo$N53bmJnV_+m(l@qC6yv<;(no3$@O+mB%0tORG9;Ojege5CKu5ct%vcqqNMTx zJ!gUrr`%4{`|2Oz8TYd=3LyUT!sb9q1rcQcmJqI;BT zl2qAWa=N?WZ0jOOfCvF!lo zXl1nfM+MPjc&}3DtwKR1q#TDSFfbM2Rx&aV+dd!rFjc4*DGc9h^1qpIrg;PbDFsHp_}eE_|K|3TY2^m8=oV zYBkV$%Em@nF*OdJmSkKWEORw;rB2T~43Bg36rqh|P}$ulFX6n(5MV*hd+g^O8#kKX z+0sp6isVcC+u$0B1ku0vIoG(Q1b#}v4;_>Wi3QNTPw(qiKStz?3Y3LC$K#^G{q@RO zP}h>JeMf{frs(H$22_(Y>eU@UFO~vKy&RK$v99Gtqxq-ZPtn!X<&4je^T%YeX1}w) zhWxA#kg##dnD$wVymPBvWvO@#xm|>sN!t2rJ}ON<6Ed+HS3T))_{z%)Tby-%%z)9T zCm36a-&Eqex-vgYx7H&nK|EvW-X2{5`4mmw-<=iq@oQ{f)QccV*9r2S@q@#d_etR= zt6>dkN={A8!$+L-t+5O|4ucaaa!PB!{NUN$8g~qYneL5QSRi%9orj=~vbeW8jRmKe zE-+O823T=hHb=!bYw%&zmPhGngOCDF>#nV|>ixViyg#?qtmc4vys@a2!gL2LP?*~Z zAyf_vV6#wkna$-aPVjUPkMIY3VRIa&dPZv7P7}Qq*Boz23(Z@H#%6reXbaA| z{;*4-FO~6vM0W90cH`ovCY_P1PMCQ%jF9X$zm1Ta*`A;I;wGtHz`bh$YjiZ5lC*m$ zD$$11p5QrJ+UndnVCF_nWQ|L1T$2 zZ!9|tBp5*v)8x&1ZBh4$vb_3+mw)Q+)cCNpOHOA0AR?{7!#o|#8b1&-Q(0c3EcYVz zX`DcVym$*ZbSxAr6IWnR_4rT@y}0Dmckg{dCxrAPl`9}E7z5eKKQ(SeM8Q> z6ScFWmR>%BezgD5PlX(_?v#6BkN|Lb1^e+iMo^D}*;m(5hU}Mp_ckT2W4P_OezRXc zB1J;@1}D0$rE=anzyfcZ9P^bXKW)MQC8(4S6xAx8FK;Fwi2&O3+kF6HA295)&spXq zf{lGv{1AXTc2n|{Cl!)Lc?o!%=in2i5R77;FXXjt&>c=E_(H9JITx^=-@`%d61jNwnnb3-VbuIWY)jkdAsC^Co-27WtRg}a+#4RF>TN5+Zil5s8th- zOt|Ti321ZPYS1g@lC}CbfEmN+Y^oC?5nyf0Up@C2k!X75C}A0^t_u#CB4XT9zs%U= zy(_lUq8);>DRXFibXBrEbnE9f9QQbNBB0(x6wu?IdSXEuu2LKe1U)7Ep#ss2nEXDBjO9jQh ziJB#3k)2ar4bJzCk!!bM<2x=dz!oO}kkbviQ`8a!Uuqq>xCRHnI-rp`Y^d}MKfAQ zgEp-vt#IvTkzyY%{N*G^`HTs`feN4_baKjb3VY4SYq$mXL~#-W=?71PX_qe zxUA6TfLX&|=>+T8Iz(v1#Y=65b18qcOz2EL)(M_eN|2w0ux~Qv0#r|?=87NEW*yF= zOcMe52~!xxBOPMJ1)68X@wp7Bppnb$VHB`bumaWh!$#(#y~z*Cf{#KY5*|nB9+o=1 zUJL;oAMC<9AZUp((>%U0*<+zrd@rxdMhll%fgZ5{1OfDekFbl?6_NL_qic&xihz1` zWc`wT%iHC=kLksQ7oCH7n|Rh#M!!Q0Vcx5g`shBl8Q#x;ie?S(XSe@&>85eN0b0-n;zE|-6SAreXLRyj7#GUr+*5VV$vKs z&#F>p&aPQSw;h_gQ<7a76`6!HP{9BFFS#?a?imKbB!=%6)Klq(#zU{GUN1zTtszmN<@O=$95LG zyKM5(ZZzO@uP>K7l6?U%swZTlZLKuy=e!Uuv5_j?@C4v2TKKB}4!W)ZlsHd2^=9h6 z=4q22D5e~&ea$r<^mz@iuXO=9_10Mh#9}<4JxMc5jc;VXtXvumD1%IjO|}4a$hXlK zato_Q+IRURj(tTn9ZXLD*%^hRLo4{KC~NIydvF@~#jN0Yiw-elI1NF@oW={Dx&S6E zy8n2)Wv$p`dV99?9>-(iBiw0TSwr7wqQ{0f42myuWFS|WuPb_p{#a}T>0G$4@a7FK z&{JIpoG%_;H4XS?XQfbK6E|q=I8`Z}gs2BR2%eQ&r~@Vq*54sSo&byHOu4(HiB6^9 z7CK)|VETYvWq_tBzbLjwUDd<7o~QK@1}v0JY=We0!D`<0Yn;2pR!h zE@!&Q2aTEDw!8=g>`9CV)fv%tp0ZuCD8w@n=y?Ux!!j6FVoG=Lk99|Gqiimh1zt;$ z-`X>u19Vxbay|E=@iEaL(v%YTA7%o8?F7ZM2Q)z3!9k({T@WwuI*x5yf#XCQzc5xH zGroMa#IMT0DLR5t14BM_8}{>qK6u}{I>3vSONPV%sBk5$Bnm=b%w{M#eRdmgd~=>& z!@?&b;*!MYM8-zJ61FR!Elm~eLBT4-31dQ$j*_0l)|yj=1t55;Q%sCI+o^f%mHI4o z{O%vEeS)b8LF!z$Zm-mPtogFV6q_QhMl76=3rhER9mKebXC^zpRdbzlMl6KWc)pwF zKhNbd(c(JH+~Iu>NLCU4@|$i-X;$Y{7($@D6!QLzS5ad`W!=#=32lULA7ZC{oZX^( zWCit0W4(mwJUer>$AG0JZNL^_b9HLz&Mb?h`L9n73UvIeFGt-Iflu=7@n=cR8fE9; zxel~0qN6`_KZ`WzCP-Rp^psonSPsSJlsnJ8KG;V}EZ`VUQ}11y20R}Y^3+B}8oEvV zHaCrb4glKO(Sq>>?BdqcpF<9~K%U=XXR;kH76Uwl;s85|kE${-Jo}dF9r;0+Oa$-p zfHo)~#RPDtpz&6E>zA&uf!JW}z2@`w?(0m@q3`rs>G(yNz@0!s;nu!BUP8aTmgr#! 
r@0-DyY8nfA4`z`_iG_J^Zc|ePP?vZv~F{z<^4sgedf{|BC~2<=-*sp;ER@lbKitdku%8$e zjCm#}@lV_O@4w5@O3gF7%~aRSTM3)cOW2qetLQ>&@@u+IHNEP+Tmao=5!#HpQ~6BS zO@jg7@H^KS&D~Qd&&MQbn^CO@?j{STpkBdh$wx}Fn= zZ+dl)7YVCk5rQ7HR{Ac#zm+d>{U9N+1<6Goyud!lll9-70oX5e zCyB{w`Iux@YviO)vylDx@PHQQ9s%5Rs$L|ERIjMp+Osmr_-DuuTJ#1NCx0V!NT;fw zsoh<&XqF2v6U^5c0UFji3Ki;DHM~Em-yuyYNnpr+Ih6uh`oTXrsK5O#wU6xNX-)X- zRPMW2jii&MXuG;`Njjw(*6j$HE=q{c%+XAiQeja#0vs_ifBW!1yc}*M8}(Q+ne`Z* zxcS5zr!-dWEF&AJ={xmeEBMV-dkLTIitpe=uX+)FF2?AN-&htq^l8$N^Hq|Bv!i_& zSs|;I^3YamI)Bn@Vs4f9F1zoxZ#Hw4VB|S}sZfmrO_Y}B`afba*fB3f@E&wO7V7w}1n@Gbr z{2A7ulydOS{wbnpr0{9kdFHpzwOW;hd+aOCueWofRNmL&PBQ3Bh?j}7K&G>AvW4_W7C*nr(^H@Yx2SnI1LOK`RVl-#V(f{G;{_-cv*}#hH^|Y94{Z0)2zefElY!%J9P9$}A?4Wr* T^D+Ds`1e>=`LCjfCISBoaCY0a literal 0 HcmV?d00001 diff --git a/docs/source/imgs/gradio_interface.png b/docs/source/imgs/gradio_interface.png new file mode 100644 index 0000000000000000000000000000000000000000..9584d76fb3532a3090516e3aedd60208185435c2 GIT binary patch literal 331678 zcmeFZXIN9+);5ZW^eR<40)o<{*HBfYqeyRtDpEu5h?EEjsEDBS2LX{Ty+#QE0i{bX z5`@r;lu*N2k!QbqUweOHf9LNxFV_{aNYq@wRrM=UZeJP^zVT4a_%@w9r*B)&U!&zB40Nx3Djph7 zzkgDC)P?Rre{ViLL46DBNtdrB$7AM`y?RgaBZ+c&xqdn|!!3@CH@bX27YTkUvn*b` z0TrMTokRYGSM_CkW^GDmkz(y#lJ-j0$?1 z+Rb%^={j$^D65mxM*8QFI|_ItlRMtnw5r#;k57hVzoZusO&-%a*Miq=gM29MY~fSaj&~c@J{yD*!r(Jye6D9^IrLw ztoi*JqZcA<&%qbd%o1c2RHPhDYRv;^x$R9TkXB#^ZNav2>M-ifUl{7#*Q`$dpISL2 z7c@=E9h9kT7}@z`F5B2XQ;Rwfy{D-tu0`Dy%~VS59h?0kgP`I<8`;6D7q{|yuWhvv z*b7!EYJIc0aj?x=p-?I~e9_5D%%Jd84%%z9Y`ct*)hMJk;HG333>V3k#zS$(+4!ZaE?;;cwC$Ql_Uc6w%lQIjz_|%{y{TFH1!i zHORRLY2h(Bw}K{#(H$XM{Ql&T-{~EHPDgC-uj~11KH^ijI?3sM*-el78~y2Pdx^!! z!<;TNJB$h0B;n)0pDJ}651?Ij>}~|<9)3*F7^2T#*#4(eyOm1Q9@Eoo0{_S3Req*gstg9qK>bB@{@R| zL#IPe_Q{KMNqhhH&%rU((ru51qPQY?yiXf($lcv*R+uT}9W$8MHU{tWM%(AlpI>fR zw2|QL1$^l^-Sq28xS+)L5iecoTXJk19jmZB^-puvi;7OspXp5SCcvR zfeoL@`iQ2(dx@XEpuVB1MouH%#tgY-K|_d4jO4s`7ynhX9p~6H>X%msUk2R0@(I6C zS&CcotKzSlo}~3ps<>-kLhn-jW$W+`u=za|kYUwpuixhhzShE2_Sx^}u0L5QMc6a+>7zk{zjGBflkDk#+0pl~=49 zq3=yDLm%9|!BxV&b~S_3lGo>@;)f3(Qw!@pMt*!>L~OY5@paMa1KCnf!}+&ej-RfL zy<1gF;NAXe(5cmF-l_Ae`G6%U%&1z=nvmt=6B>r^SeJVG!%N!Y&d6l z>0_ng$j5a`?uJLiwV~9|6miLjpfmew2NL4RJb~INy|PccycG?abO&SKMEuFd0SC@k6mcZuFTIR~xK6HL8`;n7AWmi-H#Mq;V{wulD zU^#d&6ar1AoKRX#5PiJ*F#VyzcmFs@5d;=fsF51;)h?scDapzJG^jjiF=&|Ip%e7& zkpsp7ztYfw?_1`e&yseaVu(<4)U>0U3%i@JU=7yX>`-8avVjCmDR$KPvrb%e#tzLv+%ve24cIEu);P0rHu}vVx%l%pPqfc(B^%^ z^P^ATVlj$!USXvE%fUQx5zP|!n?xCDWLg|ltEJm0rIEW@{m%1r`A<&2oQ;W$OP%vp ze55~`f0z)gy;Ezvp0FN_rdY34c*Ne%PNwh<)DjpN5DQvS2x=;B5^MU{B*!H8T&CTo zJ;42`-wU+hM(#96h(ZWy$llTM{K~I1RO4W6(CamR%!iU(_9N4M$g$Ls&*9*S__6eX zA7Lty2mZ&4?3c_4f8*C)QnVcx_j=oyDiUGwv`q#2bh)LyTv7ye+;sczUP-&3^jlF<#^?59)FMr;b=Ky~D|qj&wZTCv z=CGSuTET9s)jrMYDDT!)M>ZGVj!IOLQnAH%Yz-$@z4N%e=ebC_iLPXUQc{L#@>qvG zWW>XI6F;EH9~7*FA;nY*!iwolr^Z6Z1S*3(@jbh|0xt=wMEx{$^IvbDLPcrQIv&EWe7ln7cvW27 zQL6V-x@4Hq$ZGkR{DS_%ukQyTrQMVCUNb#f86y%ic9_+J#4wVhyqa6hw=4!KKKp%X zkJ?xL(pq!dEpAxo(5OqG*<|6qk=35PsNWXCS{>7{HxkZksx|&5Pca0Q z^YTWIO8Bj|{OxzT4h?q<^NOZRGs_QE>w^y3C0|PpNM<tEFkw zk=WkUBu72$0B=rtN@XR&5c&wJx$^R`a5scPmLenGDSi#gO(`vXAjKw!47NMEx1F2Z z;hiW1(v?*_-Txeeq&?xg%@?F+pA|T%kaMuU)L&3m+g4kZ5u@ORaakN2M5Us1)6w!_ zJ5PI~;EY!oS%d?uKDzHd0_`+*Y(U2qCYD8lEZv%rA$f%%5018qE*R@slC}8m$sHZ$ zv4~2KyI-4W`XO4AyUnm+gkC$ICUfs;~W}SE5v8HQubS zZ~W7`BqaB!!79d@qRHyGee?TX?Yeu~(cB?NlYl?^AY_RMP9?7>rkHv3`fzCs)oFd# zdN{QDbbLqX_e68})9?#4>fc_Tl0GcYF*U_&nx@2)q`+gI%(~akzZd4IMi*oxi+@TY zu=mV!n;@GK1M5Jy!qO=5_S~W-L_U;>GCYyFfu}Kz=i|llI3h^$pyj^9tvaevx^2ly zPTFG@gH_qEuSY2H7ljQGS-q$D*DeO%rWNWb1M-PjdlL-@9UZ(Iz%?-*!3Ac#OTg6y 
z;H7wh<$td4T;RvM_~&zcJUplq9>L%D=mPK9pBKOj`^?$<#aGdIM8IE{ftPqP6bap2f6#V?)?2Y@E7RnBX4gHIS~;* zKR;nVabb5)M-fq3Sy_>rVj^No4A8*jrtJseI=jUvk_Wn-)?8(jR?`Z)O6v2KYA}V}SzB|uYbumYa<~9KxSuDN&3(Bj8 z%F34r2=e$pa9V!Mmbg9;L{C6)o8bNN?LZ32*E^XOU15%MEg|UN(^HMV4ns!temnR% zPWYjOL-(8CE{Q(e{BY@u>Jz*R_yok%oXY>_i>W)=(CoU2CTA+m|MLxhKO6o)S-+SI zXET59%1$F8j%^$xL)V?tA@CJ7o%6->?^txee~NV5YXu3PJ9q*D#Q6EsBWQU0B)cxH z*&|Ww-059VsgS}Up}$8O?skonf89Z3a_%xNS^l{b5EI4!is}E~VhVSor$aA}3GA$n z$+z{ggMy;jJ5N;;e)`ZyyXv7?w(4Ag__(9vr1J8b2}Wfq;3e-pUhS7?W^>2 z1)3yzmh|b|RxaSr6EFy-axC}jE{u4{^XtvuJD`Y`yA zb!5GlL)B?~RUe!8+ia7&ld3|kPUlXNU!X6!6)8+Z$KuxDB47RT+?8QqSiAw(_dCq4 zb1Zw;*I46D2D2*QK7TF&;lPlh54M-PH&%Jf<$bWLn5aMKs7%~h@LyKY^(vqUAF@C~ zf_D|=R|X0rY}9o9h8<8XBbA~tAe@yg@nSJQ0_=Xrht;;IPclj6}L z|4@00fKHJyrwF8#q)(3V#yBsd2jTgT3Pyi!s`qMbIy8-g*d1{U0MCx5$?V;zIcI>t zSJ@x}8+YVrQJuJOh8X{Pf2Hwo1wSdWpj*Pkr%%jzr2 z*DHYg!p{vP-a_F4yOr>!`bzmJHJ`5RIzLxfU#0PeE_5yWJXumNn&EGwzEgOizAbuF z{kR@svrX&Q;Ja|{L*5$UJ#u=G0`ger6~gGgT=f)}yX>*&>Xj}Ju+XXE&X4&TW}|Ay zWh0O+5prx*EZ5Fct!!t~Z*OJu20^LW{v=kWGodG{uwQzXmNWY1<}86DLy?)FAmJ}p z*v<{;%Cg}H+bC%L?k>0|T^2lC{1)8){0i9NcLBHHQ>0WNS@qgzZrwGKNGSOgWot7#>bop5dkG9 z;Z9*srW9af8Ezw)QzMT-eJiALVg#n_mcrHd3cww2hP;JwsUx7k$er?#qdl$Dwv`Uu z435yv&{PGd`)Sh&*>&-C6Jq>UfkR?ZR&84>#1;2`@Y!aW6MOrHEAWR(2hhRu{PxgG z7#xbH?8orTpq-NG=FOoWKk~>c{I-ZT5?#eFt3X)p)f(zIP5u^$yl|1|8VKrGFr$#F z7!o)TQ5VlXU{752NtuhEJ&gdd@WajCwSW(osK!;Ztzn_yzHF|gynLMu=IIdBT1}k` zHC-)<7qKf1og>%Q^DBdc?kwNlPK72}*U}Vqdy9t){%M-b%ceQD(Uf)*7q-jRB5j-f z0BFm*9>+-|a0MWaclVu%5L$`M$>h=?LG3I>q2Qao#X0#Ci_Hz%6QmNvv2%I5H{2!| ze?G>gNyTMgrumrA@ES>X>EHG*b2!IV4yqj@)7}p?O?4=8mIo|5g>yR z+;;?%GuL-VtVnjfmd%%kqPB@+$>>VVAH`?;7rx|%m(Oyys&j-=Uwh6m(?Ioel|b$Z zfTAxp5!B*RE#j(}&p*GYsm`>VykhmRctsRCiW~uqPR8kgwkBA=iLzV4-3&K(Z=rGM zNv*fz6Oc92x4<7TD-vR(qv8;~`rPEP-Tu#$icX$kTJx2fj1*<)Wk3TyjLbUV)Bqy} zCW%KLiz8)WOz3_n__z)t6ey_ziSsm4H0>rtkhOYtmoX8NvocMyZr@3jth9%L^bfAD za`m+Uu4ix8&j;tyxn5dcK8)s4i;i|ONV*q26ChCU-%X(;#P62tH#VTM)m2o?@c3v) zqn$g9mo23!o*fRjh9W=^mIs#UxWdpQ+wkdT=hkO5T2ZooE^i z@O&QQt3lG(#t{9c1+$gNOJoc@6jw=Nlk~{K*ExZ-iqd^d9A{dEz|XxSlvdiZ)9Te( zkX<*(n=3SxD?^BY1T00N2KB;d7AXPIt#7^oapOXz`huMNJ(7XsObQ=ah!A8Pmip<^ z#Chx7K-Y>J<4HKXCcDH8^kd$=b0bsSOS6ERfsU`Nxa294t!APD!Tfbwq=_yw2!mjd zw|352%xrWcmz0zQAMcZZp&4kGwQix|19><|csRmP=B~cQg?0y=Yuimq9J_5Sz^^Mu zUsmtCLJ4Z_8RsB_$t*mxXpYkCn4eU2od%37VaT zDNdvR=U(LVyq|T(ZFF4jrla2@t(}UUUT+ zF}wWhWOr*0Hy5cF-OkMMPZz9pA24Mn|*K4V_|0<1T6;z?P+f~VU7>tCe3Sdnm0tJ)qS%n zCP$mwPqZ3{V^bdVZQ&jmEj1m=T{{tJL>XH--B!|nBu&^?0-0_K_?4$vT|K)jAVOMH z*7CTeLUrkfu{BlWnvZg-l&e|JouKG)7WXQZx@cXsL%9-4qzZ@Pv5e)Zs)L zbmn+Db;n9BQ2)5^#4i#Ik0y;J6H*VyWu%;ao`8l=-q)YhRoSAPaLs^4-Oz%JE~8!r z#8l=1`e=P^Tfp7g8-|`dQkFUXYFjzMAKYHPKUt|qf}TP@+#l=jV65p-4FZ(8ZZ#rw zCt@cQ+xb2>8rl-FLV%z!OQgr8n@^OxiHT#q)Kwtz1c)(@p#$d+r4Pmh>V=*nXW`H& z*vW29Uv5WhOw&x@QAYW^k-VWF{C4%HZ2!G<+1Wvsu=;XG1pOPTtdnoivtuYLViBI^ zq@s)z>FE)a^q5TBHt2AX7Ci8)Gs^1?LulwPy;eWJX~n6@$!wXPD^2~;ERptaOyv}g zH({;nbL}aaEyzGiOUwPXz?DctqbqER`#*Cxvm7%i<~sB~m?*9LAI?xk0RWRMmc{T> z5>A1Z`(p*Fqd!Th0{N8)-a-nxPUY@zs)f<*jk>1n7}@e4{U{47=-00q5Vu#^`u4VR zpx#dpw6wp85OKL|llo3*d*M47?D&Vq?C`{Tmz3oGb1w>x-AXk$X}1XDUm(M4@O(JX)zK`bcT7juvj9 zcfvP8hx9@?BOI516d8kqwwe>zjzag_-;lxARik(XQ*-@EF0kq1V!*uRu!`M9fLNoW zLzO+;(sJ$3-NyvEQ*fs+znuPMy%BbkHE_Q_&IfKvY;JAMaZT^-TtFV)WqQUwXEG~kkG0iV)M;~`=t~c5af)Hq{`_<)vzwUKF8`oQ;Eqm8ROWXPy zWU7Q%fffBRGRdT?aPWNK`w}CbaO2K#UqOxU%aE~XCcLb`L)qC_CF?Y|!*Sylx*nOe zNEg4c-ci~EE$b8Cv5>gNd-_dvo(VX!a5*6K$%J`5wNZggGk3klY{bAA@dp8cHd4F2 zDI8zSM^Z}}JqIs^Wrc7l?4Y6W4|(d*Yc*3J`@E!k5KV)AXg80Rp!Fs$!NygwB?yEV z)xjPn)^s?&0v;3z7IwjWFJoBW%1TsV*gKugS*n-_Mwx@*dSx6fzvp$-V4)#{t`Pqf 
ze~TIm|Ni;ecv1KKqC7N{+8WU*rgt;Z?&}iPe`gc~D4$}+a%NZ3+*#UNPF295y@b$3 zj6CT`?3Y~~=^1v+?kFud{)aI|<7$;904*MxDJh=11+8aMs)HVWe&X?KmMALxB2i?S zON~=Q7?AGFt_Q6Pq#wmgPm?oR1qXnec}ansM&Fwfmw{)C19*A6mY3fRh^~b>1#!^S z9&2OxeP=X(f|~FtsxypE_%LJ5m!hm$F~58|T@^yi0xh3jIa1PoibRnWj5)sGnwdtVs22nfk7W_0=f^{<&KCs;m z(^cF`0mprZ0Uc)hjZC4*M0a}LFd5AR&KZJ-KD=DYr&3~OZ27%gQaJ_w`t!>y2bD0Z zZ_we+l_lz_sEiVQAmPFPXn-^5vzaODr^DF)_&ZS^4C3_Z_hUl|!6}cqlG(#wdbV0X zR?{s4bHT=tLGx#zha*QrD3->IJ7{|q$f|v#>E(p>oYz_@Oq!0{7s7&1dXv}PyiW={ zI0V0$H_bZ2E+ZA-V*`MH+vC9u-LdYXbl*HUY4#<9vQ~vP2w8Dk37BI}%8tmqgspuM zKW3}Md%pY9E6$~Nj#!2`ftzK58R-~J=x_nh3-~2h0kC6w32V(|&BN~w%=rysLHE*@ z4JXG{wmiFqXfd806iZXp5A-ccYd!Y%6O}x}7AIX7NYbc) z7%hRd##64@{tk@g=;SLEh=LPjWBz+fjp6>I$^FeAWiHI2e&aDqk_FmtnH6i7q{&7^ zgwn&Nmu9!qg0D>lZ-j)AK~Hv4fdLCAf5PcBsUMjF175$*&p?MN$(`fBk}?}Zg@di~8!e1R)~?BYKziwI4a^XsrtLU;;MTZK zmmqVTy>qzHv~4wdJRvVp*NaS+$?{A$mKD}hIPe~I&f>@ew5#i<2resr0_4g-m^M>^ z2oND&%IzJ;cbA2FX(qun3h0aWouNFBo`Y?*GAk$TIrdjv!G$#NK})pKroPOt^4u;% zXU~aO1PD>QmhBp6_^0mYnq{;o!j{?miPiv=HEX_3+8ehY3u}`zp7K|&7MvL8|#*%^?#u7-M}m(v&=9I!Xx;yV(r53N^VM$ z#`RC-elFyXq%o5ts_7B6_1(ZEl;k|JHX}mNnj%-!M%~?;yRU9DV)+v!X?H=6Pt#kZ zY^XH=>3lEUUDqNS`yu8k(UbIHlJWj5Xo%ZpPaE~!g$PPksf_g zh4iu3B18xkPFuB>){%Orr2S^9==onhQymH7Q4d65t<%Pf$i?tYJNW3-Glqz2fX5ufy&O9s#P3*@C;(@tfQI_awYG_~Zo zRX#kiT5j&Y`(uP<)xDD*OW?>c;z}<2t^oUx2j-XQ{Q9zGR&*xEy{J(-IiA-E)5EkD zZm>(aJjwy!twF1$->Kvu_)wBq@^WYY^TyiyqU>pkuGZa@(;c9Rtj)3BF zcCyppMG>w{K4Ff~{d>)PS|;qv0_+=SvL63_vV{+*Cs+Y(gK+tAYJ@zW9JdPHiW)uY{(J5(I z_NH=B=(nuTa@9?bpHIAlob*{E*27L$!mA3rmcnbNeT`)nQuyAdRR%o=dk#C-RfSq4 zUfA^Wn~-kW9}tLJo%YYAXuX8W=wEDZk3g0WLHw5o=J|-&sQ0ET6PmW#;>3l+;zLhUUZ2&_K>o;{!YHV*Fixw?HBYc$HPMLD9Sw&;I#7Q_kld= zKwD<*wKWuTaztN)Xsz3tgZC!LDHc~8KAwFBr_w|x1Bu<-wX7-r^q}GEIr;h5N|H`? zRmGGx6HdK)*0&PmH%LU9_V3H^khp5CbWC>;9WRzsqxedzu-JAsU0hFdqp+%He~Y%Y zOjky-jjO6#sDHnetmnqJLJJGW4$D2w^_6*~j@APPIuV+z2%Iv|lx2d8)AO@+Z>(Vdhp!*%X_<@C~jUuk?pOYo|jC`VSHE zDa@h@6dzKo0_{@YMNaD}9%NCY?v^^lfW27Pzj$fonZZ-JR?xOvOSzV9hO*6x;Yc4V{f+j@dzEfJo3He~V;-*lwG)16(k-IY_3SA+_UO3q%53oq8K!Q(ojH z1kkl4zYkeB5W0d#D{N8tVjhtUv_`7A>k2c9Yq0L4x~${xmS_;83LS=`e^Bmt0=}Yb_qb!^?cr|H0*C z-_pFAjckU7*~tf6X(SxQfx|XX@VW8#owlZ%Y3n;YADBssrtWN0B#n6fY=+1^^u-``$*hUBBow! z#RkiH(z950o}BukfaO2vra#l(TR7POK%it;fu;!juR!(xzb8=34-bG^mBXDSTBM`V zIm1$+FsG+5dAvI(|iMky?Gwznx@d*Ju#SS1|TQ65k-trr0K zIV;9x%Z!|k2ktV{R0Xduo zoo!1j$jVf${*3L{CnaE4?5?_geC~(6<>bVCv%n5fiA*`4$EgYdo*&A}M5d-(M=l`_ ziq61P-xn=N-J(haAw514IgtS*?7y{yj|{S+zaWm1arsQZDHw7_f2~Ne)X~}4(F1ZN z7KO*;&jHr!RdjnrrKO)0V2Kldcc7vLDV+tjbD%kD(EOPGdhdbERa zqDBH64-0eiq~k)0ki#JcW#oqqw-6xXrEC0R~(v{KtTEB9agH*3t;u)S}6+ zPnxnaa}l;}C06W`36kPkYP|4I_@WXCV8}6T5SwH7dyS-l&;9@$9m{vyx(>L-%0F%~ zn)3cKu)Iv7Hn-%ydHZSsJ1LYvKBb9b>!~@wh2elMKUi`0I8WBPTYe~7F_7V>)H>-` z=ZFHIF?!Wc)mm&?Iq;;eF)?MPr>uP*m-NGh zd8ozTMukHx#nw=*L~-zYVVhrwa*7bRdCCKKH@_G`HUz4`o#IEBbm z!bV#j&NfShsG)i5BW3Q)%pH7KHDG4w2zcJOeDzP_iG}*nzWfM{p%hEsv75l=z|JzT zhg5I@a(vqkXBG~WdxnIMz7!#BudXIT-OL1bPyIKV(XD_@nVlkCgD z)m1~vu!pj?bMA2_c+r@j6F|lR5nHleoRg8*#ae2)|Li31@yihe9j& zPCP01o#%JUWPiaDD9=-cHA z1fgn@Gtoxe*U`~22|busJH?y~7U^Uua zO|$?DBHu$HypRDv9htM>T4M1LD8>;eF1I4p z`@0Ns>8Su1cz{$vR|97_UzETHW>4kg6EbiV4~mnwH349BTH2WfGYBJQ^VJjCGxm5g zWxb5V$iF;iXn8t_TLXxz@_?u5;y~Q6=q)#zG*K~#&L_qz3Jui8}daN8y^n4$zr_(&X!x~L2T07MD}J+Io)R2 z7<1wu=TsH5-Q`THlyOk8ZEvE%YEIvjVGF<=x^bTkD@qd(8LO`xMiub#kI6F5Yl=Yb z6;MxgQEgROCggCZ$`PX8P^*as?A)H@@V@9=Pop4Hu;(P+|GnJNS@6ro`Z! 
z@>}nl;Xrha@iQ+3nfTPkJH5nv@h&JseCZv%ZfYH(m{jB%kii{qMZx{DPO;qA?Y zJCs+FwjIPmDcM*9_g1e=hoGl;#NUl_aoh$#IK!FZ#NCTkT6He2xU)Fb==YoAgK`ly!&@bAygj0M3jRQXK|yuUfyX|lZw z?s{Dq2O&JSP5N*EI7E++zq=WFynoYob(9iq0?a-6Z0?U73b8ez$%aN6iSHzwG!NHa zIAp5gEJDL3B|OyXIc#Ef`iHm5V3@#EQQJGvKsg{&kULXNiOQ-0O#$qlbo5(s$Ec-2 z`C+O6agN*PwaIXKaRz3M-Gsan@2ly*Qc`QSSS;kl3~aESWmGv=@>)WUq798LB?a#u z16+M-@U=^1IM?TiaxLRxf37+cK)&BqY;`Vce&a0%@EpK@f;cETqc8fN4*>N)S>!A? zZ@D8KQVVgG{ZKCo8*XOpV0oVa1r?ga0EPg3+b2@0l=b1VqP{^TjhF$6N_SX`ths^}87__LT%0N6* zrQNOQ)eFx#8P+#N5V*Tc9}Va?gP;kumH%uMrKn~Jz<5RdJ#T&rHaQ8w+DXrxmKH_r zOoi%L?g!^UjKxxXz^`Dd<7TJ)W_0AZbY1)aIvqoLrZNxinK|Vpw|)-4I0(>4eS_zo zTSz8A_6g#dnR7rB?rxbiXd$$$s;6nBkmAN z&`_bA-s8Q$nN#>#OQiXAg4skS>r8Ef1jT_(7K`)s&X7~&+M`ySZbm6R42dtG49|6}>LQi_F$u$@L(}OZ! zr(n9t`J5dj_bo$STKkz(xHaWw2S$={Wh)M zTQQ+;Z^aA#>$tF|9tbHOp9TQH?;wcg53|sB_j->WYe%BA5`_EidGmTr|8?QI6_8Or z+or1>0pjcYnHoLdN4WbJSF|&Ofx<;_YorGK<1uZV@CINjK7B3+tEu*8g0NnD(v5o* zKz8WA1HY=%&(GoDyksNNUEsLI?_Y)gy)kU;2LP=9_IGLcP~lpVEXlt%2Y7$*Tl+U8 z*korgU#CpJ8)s!HcFf6_2m!bZ6c0yM*mhhS%t$D+sGvs!koIrBh+b2aCjq>*vn1XM ztly2HPj9hA5e{UpY#$Dk45|Y7Yv6@UG-9m+XvgO)dF=pZi z4A^K3RLtvR!;=5W?udY~3j#QD!g@xxsYMMe7nQMNi3cFcvo3s3GtPPQ9oJN`Qn*33 zv;0GVoJ~&VTP8@p3Gmp-0yRjWIOrc4jH=Uj$&o*GzkrrtwdKDqt}<3ju-Xz565_v< zmvZCZhiK+pLXVVIXiv;yVi3IhS@68^y}J$^G)lY~yx-?dFu`c$pYv9|R@>*FaZ}YP z!)KUei^vbCO4iV~$a!_{Y?*29B7uB_{f)17@OnLpxo|A75uj^J&WxNi&u7F^dP>}F zGLlr%A=@#lUWps4eI@Bdc={$Kg<+>0Ou$`@$fSi3#&Up~!r zEuSI{A7&U`9LypYY5pAu9GKz!1V;*PaEr1 zI-|BDYh<-Pm?+~&bGK71aI*k|Vr73p>sAM=k$+Xe>9))MSOrX|Si~tu{+)2)+qwVn zRBOP#i<{m_^aukGrC4g7iOJZP$NuAg0XRQJit*D+WgMsFwjO4gNKeTJ?X7xz{-O@R zPB!zLn(qW=cOFhvkC0ca0c7NRzTm}KbvOxM)|42Z9|L}g;AV41Ek$F9UnABblt29P z)T8GhC0c&Hr7XE@JhVXR@EHeG2evuZN^!jPN(sv1)>2tMO;46JV+Y@i%Q}6(`KVXn zNLA(-1rz>%Glf6HwwDqDc@}yg5RQ_g+llJzR4~liTZ$Ufr6&B7BQ#l>YlDUf_`n_x zq*=hhqIjyZ7O87Zn*##i)gz9YiHHoJ@v>Gk=Tf7(Q1XeIv4E_bw(Z|qr7q#K8UCzU z5upHh1JYV)F!Fu*_Bs*6aHCm{eDu4g$kO3h5Y@jsl82pu`_Nh#d8Lj8OFA`tfDgIl z6y-B=U?Ma3-df_r@$!moYq&<4jTVZ*B(&cC)wBmRGTF#Nac$RR-AR#IS}{Ff{r2pP z9;RS&-NE&!^%_eQPy-_iS^#Qzp|F#amv<6jJ5-L5^KaOGWcRcXVI-Ok_8PIXHN&59 zue4{9@K8R~PgK~m^WVxdmY=^k?KP3ba%}NqWCSWyJC$pos3j>a#7~)1$d35#Ix*E> z^{ytL^55r?oeS)*$@ipT0Zp1q{8Qw$7*cw-=Ji*70BH$6c|X`gMy0UR+uERtq8$x5 zDrvQ9+!*5g`grxa*t$jByxjtrv;R=%k~(#?-R1{YPn11NPY1T$4KhXX-kK`T5;m z-Hj5y;ql#GsA z;wtIT+GSGwR_QA;`P-!AOnpj|-n&szM2o-xXSo_Qu-Xd{bc_X9F}nSpLyG>JrwA7Y zYKhimM2!9^#``&Oq;2fT6l-sXaj*T4&h8{BCkKb0^q7O#PBGtTS;+NHlDHGddYd;r z#}0K|n^qROlQO%S2OmF0=2tbKFry1LiD(z#jOS&x)5?GCKVfFzKetY^KLLXHq+`Bu zrhQ&h*ZHlm_R9yHXEE`8oA-yFnCLt%KRq6S!2#wv&3l=j9*gM}Uw%W^A3iSP`C?D3 zr?hRczfmbtbEschfm*x65-j%I%IV;yeOJ}yVQOvboZhiDV&%>X#jv%=bF;u({=3!E za99$TBcER8CzIzT2j8|7k9gFyVTXsQcQ)40ju}iw=Jwt>J%%GwmoeT5pQzmhy7l_C z-l)EM;1G<=@y4~R18)=AfG|i5JijBmvA+4QAf9-)`D7zvpqjb594G;euGk1;Vn0F; zF1;B2F8U$gZ545{Wmkt}5~N^}lem(9)V%q-$UQQ}DzxMsJgwNk+?-YlvvUaES)^BW z)p7OTJ79#Lg0_QGK|#Jr9a06Y1KcT^%}B3rn$Z$r$R)?t?y{!o#p{0pLsjaMNpx?WNNr_f#<-v)QJM!p+XzVU5s@64q!wSe#XWtM+&_ zBg(Jqr^-cmbmzvW`4sDGLqj{193c{1Y`1^H6Wgna)tx>Be7S8Lnda8q=&`O>TZB*WA zslxUuSyVqgv%I0#QzXJ^!qsVf)N5S*KQbNt3_u~|3Ti&^0$ir_SWknRep7#e(iKW2 zyHh*2SrvQGjVSvqrB|mrA7i3TaTYq;8C6xkDm(K+~bwYYdbMlHzm#SMx{-Zv+ zsuDwu2|S%4TS+D*TCY_fJv6r#S;b7rGDgFWcfYg-8ksmn&9^2}P0CY*+L#zOXltsD zS|1-)Z6dYG-mHHNY~DY>?Cu-P*3G1|BX8!w4(4gVYl$T-UjvYv-ykj1<{Ndj zud*LiMyQcL)Jnm!GZA5j-J4&a9Lei98mAe0E%sXNr851Gds_>Xs;$6!>A8nHzu1f` z+g|K_RLV%(u_z^xY<6aNi>|9WiDyybT5T9fBr8Z*Lyy4roxq^LQ%qtC{cvTgUTBm1 z{=!Jh>r&lx8QImQj_@37Sj#M%q*${BZZ9>V+)_MD!#8( zjtGM%N~EZo1BK+=w}Cy9Ru=abw$PIa@xjfyRgI66OfyGR=o#^o`|9;I>=Bbkm_3iW 
zpMMFYk^6L4FUf4|^sBSd**r8{H??^Z+SboU7g|%)w08TWYt0g>Ri(4nC738r%OPx` zo-xn5?%9#_n16)F=oh8C6B_9pjXPsn6P@rx1@_R80aFTHwu`eD(G#HHSud89X18Jm z=i$}Ow4~k}7Pb2Xd9aQixecVp2#AW#_`3PDS1lV{Sl-2Q)c824xX!Y@J|NJy09y0LDV{;bvVL6U;>oh&{cqO+B;}(k& z*!^BggSwaqTPd>CRQoR8+sHqe{H?pGJHedq&WtG%^$bljv5q-feG|G=xYL64YRf`! zzY1%7cX(gx}P594LQj(wjxtf z+!-}-@vg`8h7ngl&1+WCx!my7UrN6jMXZmy7~!)TPaZI{(vQTOixR%;;P5gT+Z2)y1@P%?2b3+ThbY}u z06k{ZYq35)Ut{Sve~F^y5w%(u;2w%Be%!7>JlRiRVWjY7EZm1t^goJo^%YokyAj!W zg8_Pdt2sURa3kp_1onQ!PRA&3)sZ~F_Vdx*%fYYBZOQ}Z5h#h&-g=AXIw31GhKPf* zARQ8#cO%~Iba=suK+#O*qG@Y2o~t7=o{U;m{n;n__L=be-ZYYlR3 zaqo1imlR|kH(@`?0kiOgfA@1RB+EkVB6~j@$**3~1#OL%_n0};Ex#6MYan9tb;-@e zZ=0TU%s4EPh!pWlXs(v$2KHC^a*nTSh#a6|r3>aqyFr+=%R$nB6A6-Ej~wg{t>DfPfG>8_QpP?4@IN!Cm&tjP{P zwEc7$+LLe^-e@7x$fP}KkZ1DsmJ~~Whpl{l)=0_Gdk%R33nNzB|0`cDQ9+Dbc86l{(3@=U)Z89R#kS;xlBf>O?J6*{`s$&Z=T!%c1Lrl_?WTJ zf9)}a2_t)}{_{TxJ-l~8W$UkNYt^^^pvN_zAKCsMV&8UrI>X3?DO1>ilbo9q2E9U& zwW5=t?V-6>^;e+`>B!($i3*3e8rY$DJ*|0lIWpd{D+m>c#p!k)Dl+d5c_PBwX<13J zBge>AuhOOdq!bqI)dQ0-*MZh2sG%5?;E^#&t@mIug?;8Cgcm#5PLc?@{vYgVfQk^MB&ZaTUPF&c2^|T&MuL>k z0)!+$2>JHwxo6J3b}@>&aR= zjQAbxPLR<8#opjkIk<&;z3e0mZ+HBu8U~}NrRuq{h5F%_DqQd5rQ9b6!`Yp=CjB$X zqTjUG%O=Jt4a5FswK|(_&Y1Cqa<4a4B4MG)5o&TiI&(IooIPz^x*APtCLA{@739V3 z&hudzE*vJ7fwm*PO8Qg!$b6*o4Ns0Y`NHGDyKnO3+gODILuWE`l#N1&aI}keKZUO? z585d@xz&}pP7HBN4jjuhjE_9Y_Mds`=5u>-%F&j9e#D%ubbNf;-5nXJgXCTleZyFO z4No-ZJE_9OEfl&CS(!FUMSHBDINGWys4Y>yd8PNgdTSb5iVcP}!a44;t zv?y0{q5`;tsG-nP9gbGA?G~)eGwv<*S$)JV(p&ui0 z1uzuBV5u@sURB@e)MU&yupV6>ZT1vkoJ)tD&YkB+DP=enK45O$*eyNVf zVxLsbGFtUeL3J|THR3z+Ur3SIn>47eM){ZKBHQ*OGJ%Twd>uy2B)Ehz4o~u!Zf8izPd#ehmSjcfFJL9R?Lb6M_6R$UO&U>Mgyjb@dTg@F5 zaHqJe-fqoDoT~{fmB%?-O4QH4iTOD{dII!r?SJlV3FRE)or0|7#iTiHY@@0vB5upS z3+I_!=ZMZP)e)HHG~-tWvlA%~uNqbBqqtP8_&sZe>tw$*#6D1apC70#=zo5B3dI}p z!rj6nL;Gi)GNM7tYm0NT1YYbZtRQ}x*>h-=?G<{!7|0{`Kzu7jJsGbX(ey8I57I{v+&Rd z>!b-qF1(_`sRcTI;&+s*RKh_(v{$M&L__V_Mpi@l3 zu@LtZ*qR0m(>HH468U}4!4w$BR@>9BrDU%}K73%?l-FR{>39PE?Frle+Ip`VPhA6)|TNxD4B>JfYTTut^fN zsB59sIC9*cJvi)W>+-s-ikLO&YhxPP?V62|;8PoBvgrNNZ;Yv?R>%32x6aI=>NS=a^3`V_xdDYq5dYtKK^nCgMiki z**`&X<$1_RyS(DyA?ebYIFdu~gM{*u5+8HH@w3~_Yheggj|Cdrcdo%^NyksjutFx5 z$o}@;vwo)Gs#|dh%wl_Op2^~_w1keytU8;JaKt_J)ny}l?%J6oLg^&|F!o>@$1J@- zXi}#CE@Q2yKFSzbPu^jlav}s-OU*EzVp;U3+O`cIi&TH>LX zQ=J1ht3DgDoqfHDvw`52DHiu?-|y8Y?L=jSLRPv}<~S!-FOK0i)5tZ5nV-G{u z59=woB5yTIYCih!VdUH)YR8zAalVVS+js5w)TC~R^ZN5H&#k{F{!;2$k6s4hqQ@yG5 z;@^a7kQ!Wu20s3w2A2I=d2*dnA$>>;CD<7|{4SR~k6|nSxPa3#4S*d|v9ssS)B$#g zZV;xmQYU=7&#+$$O7-Z4j8v)*SkbAO9YHa4v1cl_t_^&@$=jumujEr}-&W?}79Tmu z&&p!QiV-~BuswOawlgiSrN%By(tXEgf=G6PWr|!Q7A{XK4UlgF`o4iAmL#@(%Gvoa zkIC-uVXpK~Ec#C33SDt`ybeMMZ~Y~WMeqHx&iAtBt&0dql>|G(%*8mn`KP-uRvAm- zhujgR2H11X%IMLV?{*)2O=-1nC`|Ogy5(~I71arvB5|K|#rKbU7fBAAvsH15UJ-FR4o3sSBd(*}3S0-u>$XBTU51%dFJu4!pNX#j7f! z&TM|ae_tsMOD(oxSYg|YI~4q!&eZrUEEZHF`4k40Jc4y&E}waFko0vNry;?<-E7q` zzvBL3Y+O2PL9kZs&rw#*^_`?P2iqp+9=Q2yZYId-iV6k}HbmUaDak2O!(-S4;mL(h z#av+$MQ>-0ezX^G_oXQ{`xn>P3z7EHQ(%JT8uY5x*xHr-#@?AfpLL&okdQ$x_n$%z z7^dj$%*ZKzFtTDx$>`maF}T=~Z0D44K+b||g)GB+cuH70l*Cfmo&aeOawy$c2Y8QIqZ9VEx$nOy3NS>mHN2~)n zXE}f|j1Gmn-|wn>L?RqHx;=Y`;Gi3t2qaFLum|0BQ)(+qXRm&Hpksrcc0v?kgrvkG z)jrtjPdSOtz7d!ZtD=>`2!56Y$_vy|UTdxiYd7-#9TN+dDU`}QbYNakv8DIO8=E)R zj}>$Qk=J^N;)PnAJ5tn$qo}halw4earCaM&o0xV91*PMy1FX>S91bLMIdA)bL{{M) z13}!{d~lnyV7HZBi|1TF-+tbJX>#`J&I+RN4 zU~#u^WVvhUyVK?uQ+%%b%hbYSKg!WAkYyJF@#m68DbroXR_7$;Z~NNpoX&J99qEN3 zad;W?S{ch2i$iJ7*bw^)ED6jGS9(seN@hQ?y!ETDMI-G=gL~F_T*|4#B~3qf$rxy! 
z2Ut$2LJB|X3kv1g{5+`bTO(-oQi--_xei*mm^L_ENCOy*Au6tAne!qv+ntumYXC~S zUP;txjk~k9iPMB7%fYXdlq*ipcXD`2E4U@vowXzw(`wX0Jt$%^*pZo@R3djI!ee3% zX;7V1qJ}PA_9K-GD0p7sQ}%Q-Roit{q2G;_8+1m_9)!iUT^Bf5Ral1R!B80&gVH-G zoo@?wxL&1s7WMBH4^Hju+UN^>yy{Q`?%b&#xhvWnb3o&ZxLpF=G%9$!n5>lGFKDc0Ab;H?57DjMr~xsrx^!C|A3 z|9SA6St6+|T&7&v5>Y){8&7y(PnjYx43*m)1>@9O^)1V83`l2Wp%QVP(qj)a-N}@G z6FB0Spl^M9pE!R&Y~*5Y$-`SP4Q+DU{R*DhETK$uexz=`o5-nMFWY!*E9MxUv{s!b+>6?j9dRxd@QGiZRK)pSVm5{ zyrtKWRBh=l@<8@CD8DK1H>0^@PN=GA#bgg{o!U$tNz-Phf`M&tfTi~F;HT^S zFt%UKRZpECL`$_ZUT9;kj6YW|cbY6zvN2a>a3gc=dPxFs>*O82-a`*CU=BU?Q7>U7 zLn3&pD=HA^m;(L(`;%l-0Oo0zm!mU{KvCv(Fv7Man&4>blJ ze%pKb=N6P(Z~qXk6-qk+^)2ca*MVb)-w{RC%`R@7e5AP+^(hZOQr!>`UsP1oFc_zj zQaKmKKDM%r*J__j8Ex*1Y;CZERvO{A_dp~Bn=}|g5bs!Ljm{J8dnrjATt&3-|6bZZ z(KMH?bj#_faP90ub6@OU(CAk_jiQn}m?cXc`BJ!Bv6xs&K*u`rPg5Cb$rg#>Q94w6zzHQhNb z$S5(B^FMo5)KMD{Pu2C+Sj1dy0%EB^z}29{G9^jLxP%-R{^gJ32G3wlRN`<@+ul(1 zNZ10UrzFR7d9BcJ(WL0JeGx668oQ3p;vvmeGWBdGgo<0aV3<)5Wn4vk>FH_Xgb&Q@ zA$xABg*UI@?9i|Q_mQM~Jm#zCNH`nd1QbV&ci)6fWJYPV;$_afnx?ITlvCR57_P07 z4}B%1pUrwB8gjHDVX2P*)Moiw%CQ4A=Xyl~*V32VVFRsK(-L`weJ3%8gMV_SzdhUy zE^WI{&*nlimM-~yUSmm0A)-B&bY`IsT=pV|d|35}6I?JzKICLf?D(I0SSzIch{hNE zbnRD#AZbv3E}Ss;j+lT4oTxeX0)|r3xIYu~TFO0g475?+&Cm@xiTe1_^>?gr*t=SAm%X%BopMIi4G%_q#J~gz-sJ>ql;(h*T8Sb zw!dGnetQUU1Kc|fa|(v6@x4*U?@3DI*3d;Su9-Hl|B9p9Q)*!U0c9at<(3X(i=ZP! zd4b#XnWZ%5dR$fBgtR?=R^7xJu12w2*W`#7sI0 z^aVA8gk2`MH#sdl)FrrUALQPWz%Gy=L|{ADtv)C!W%-l~rmJZ#MCmUF^-l)Y26V5; zeuUdznesKTv5Ac=iE5JP&aAOjqT)zYUnM+J@uOI z&C`5TmAW&ay6-vdO1VNowE5fD9z-?OUDvg5>`YG?G0iDkty@}@$SJn(LVpzxp;t^M z7&YRX+4TW^m+Vr+T>S}SHvEpCwmu6P#6m?^V^td*EQX;NkXcY zIo30CH0qpO;H`p)Mv*8#Tc>`noGnch$Acl9LWaA27JoZie2(-wx&vUAWj2h@I(G;O z@l5dY9(DXPYh`h5k4&1Y@OWM!)(IZaJ?emU`QQ}7)dFlY|_5rLP)k` zA-c_@NWHQlV%Z$Uwk~wvl5ebd7gJ@IttiY;5H`+33csCJ+~kjyNR(shS8BT7&6zCe z2I(CwbFY|#9sC(thH*`P0QD?|!gh2iIl&qoSrvAv&w1PJs6D9dWy~f^NeLB|pI}_I&sE(TD&Fb5YT?4yF$8v8*aZouh51V4|qD=9u({87xM z1?^#ah+YixR@4zMX3k8sOX(cp0e8v{kyX1x>+8w)gtCgpr^pFs^_&*_9-94OqV~)w zn)A@2E`s8EeMfCL6aC)AYHK7O?du<3-CTEiyd8aNLN;Q>&Gt2cF4+!K_5V5!YZhUS zIrWZo$4(t_yS}ZqXk4Auo<5OWPGP|uQAX4PHC&ZPzh9_x6Xp_SW%3fd%`jsahn$JB z;GF&VxchKl?hL%wz3pC_qiejiB|s9B+|M2H9mc^ZZ<-^v67P9vQeIb;QN70)vUU#5 ziRO-du9zVLuLhihTS#*fT&s*7I|yY{!$r>9YP;TAsG1(|@McW+U_C4B(1Gf+$ocD1 z=$zr_Im5Gy2X4YmU>-asLEHdFCositifZdw0|bSyLLjBi5>o?8W!IKa-hFARaL<*; z%DZ{(xDB$_TDRQD^Rr-Q-{9uNX#fJTOUMKS(k>N`MI(r0j#RA+aanp)I}Mun|=K;$ytNhurwR!BSU1{XeXM+6%p|BzEqU%*rJQw-6Dv2C}* zx|?G=LMD=0D?^F-ebUL^>}%3;OWtU{b+N|BJ`45Di{Y)TtI_nEjzJ_o2j(Dvu-TWJ z=<)e(ieIs#mEF?m2%Ajm7`WbC1+5%j>%}cSQ~pub==N@W#lpRUKKh7~HoJ~rPK;@(`bGTyuNcxKa|V9-FmJk@kxrMv#}XPdFM+u75v zsmk!;)vx0z=#mxY)RA*=k(_;BwGA*b+-xc_#lMX>k>l=+4`P2STwyA<+Vi z4~GZ3^S|cQ5`uB)qfw(Q{Z=Jhr}{ff$8^Q-89wD}f;0fZ=1Gs}E1GB$LRKgyu6FGC z;ga?-WPQJWBKX+ep#>3sWw(aN#i6k>GR(gDh0I+Cb+k=Xs+Bb z!eWhB+RvjfU&}!IhZm-HCH;rbML2dQ`4;F7qx4ki^NXw~%;`KiPmir<_LLPWdh(yS z<5o7;*#EZX*0%FwML ~0CfTQ03#(sX*^MFktib56(*B{sg=}m4 z*`n?%$cFU~ftH61i+t=UEU$|I-g9n+E|4P@`rC+{&s@dQfW}~AcF9zkoc-dfGgiLB zUds<#zP!Y#&PRC;UbTCLb<`i}o<^c?TLV3^Y2emaO*Ra?ythzK0Xchs?RS#x`!Ez8 zx#i+Fcf1wjF2gI(|JWWG&|`U>e{nh2=}K3l;}hriybep3TTnX>+0M1THB*T^A$&t= zx&mf6i0~b264juOEDb%b>N^f__fy*?a)!qO1u{`V4-7l3)Aw0oUIssB46(6>e23l= zv*CSJSKAtug4U#G=et}}CyU^fz|qS%83>NJsvq~@vs%dzAhh6CwBlleGgk7)%Z*`a z?8zPJuEckomOi!$Ym}7QEnBR_aE?)zUSJ%Zu-bC8Gwe_8B9OK1R9366_ z&WdklRJ3vry5_`Di+T?5m@w|Qv>jydRqJW1INJZ0W~l;2SYR%d$LC05TS|sSFhP2u zt&HJ)jD3uXHZ)vU#fj0kMu+7r9J!$SeD7zcvHdnPii}l1+#4|ioqVA zBk#d5@2mu_4X=G~-g72;hYpR8dLK#e5NYu*LY8)VHwsi4eK^SDn!Bz{E))?m8unaU zeri9FLoo@0o88wRXk|=`b*wEfm^tNT zsT;Jw8U88P?rkgFTcJ4hh%T00Rjk?^K58Pv91XH!?Q 
zQ}4{=8^IpS(#|+!*&;QzaYuk-)Y6Kxdi%_%61vXTMm^4^*D7SO|RSw5;mI>XZTZQ!zgGsy`A=rLrfRD>ZuiKd9qu#Gw z82zCwKBuzq>U;u)XMiy`R4({xd|CoNy({cka@5f$83V1jb=F1Y8K!P%eU7U4t7*=a zsUc~NAP{FdA)t<4#Mfipc#7p1ZcCm^YeK2LKOMp6@db{k&LYRKD-UH16nSK@Bhhsv{9nGH19&z$+O@ZZG8%Oef$YNB?U9DDwO!{cL=*YG8i zc(yAH!#eviZWgcd0&5KHbkz=M}}sue%DHK>l~7!Fze0CyZo3`E$6k3c_qO_X|3{== zMSm%6%2?xxaoxz3XbX#}e~8|MriPW%cHjEm4IWe4Y?ZP5~cwF1(XT|Ll+R+&;K z@xMp0ui}NQ9qgp6JZP);MapNYkkLE#aV>YF`?L;=}s9j~cSU}owfA6KXu|#iY zwci@s-r%yCbMi(UhgY-Zjh!E4u2Fp>&`+@}dZg#VU7UX!)^aS(83n?V7)^JJ%Iw?O z&ESTX6|gT#ZQ(}Pxi|}pLwGgq=lp7B??YWVCI$%m)z0RyXwKpxOIAEhA2Tvv-Om0R z+v*3<-;)d4*nZ?2(>aVe&$=k2fx7e7_aM!4VI|P|1D8>de`~Fo+u0+C`dj!oP1ixA zB{Xg+%yChxGa%yVM{ZWDVh3o_%e|M=)hpgV3`)QTaXH1~n*DiaLPXf~Dn#;4WAnG; z*8BcokB%T?LYgA1Te2XP=+ctJS2SbqXlY}8p5`)P@WIqUtqOlO->z>fyl_1tYj`26 zRll+&P2HT|u*&$G2$!~ykIub?`{enB^PfQjvqgb!?aI0*+*rd74d1OMf%KW?49f_m zMxhy)q4nu5Z1+r{S9!%GA%j3uT)+1a+vJnU6(+JiVcR#@EIJ{Ag4-@HFI#1F9jmOk zE)ncsm2t`^hY>^#oT`ipVEVE;Mb!=YLjplDLBd2)T)#O;Q5vM6nJgiOoEZl9^t@-9 zQ_!@zbi2TEzC?~Lu4)BqAMT_#JsP?0Qf1*FE*3eGjl-z@VbEthdi2#6F7c3LX_4a+ zg=x+*TpcCF=4@-EEw0*d>-f)eqYqcXeUxV5wG{g!taZ)=VQpf8#vhawT4ECeNrbPX z-=XHiBqUAv!{pgXJ~?Q!D1wdstbEUvjO_f|(HakMtNpJFL8W>k-aD*^k3`+lCo*^J zV)#*{-94@T%g^^w2aQ{^2E?Vk_gU4D$|iQc>b9_(jjs>caS z9F}=EATDxpJkp9W=EwB`bG8`3YgGOa#jbS_?D=zFF3I8S8KBH2lpv?Kpqp31w~OFQWgTNHFM zvdRp_o8!)SIrz`5pmaPAX`dcC7(2OX6-^^1Dhwwev%J z3IcwQ4_6*9Zbbp$)ho*5%SKde;-XijSNExw#Q4y+w!?h!dK%MT3U%m+)3?sc-tmvh zM2Fl{W0`;^@RwF=F5|F5s=hO`Yt1>$KVpa5J6?pP&(f-hM$m;-#;E>X4SfYifg)+u z5S}Pv-m>si49kKKXA3&cM^oNn9i55YWy=iOs0hn|{{<&ekcjbNlV!+nwTFYAzwNF9 zv2GC^-Z@QT&W=w@ONl5{^>{arYhZt#zAw}VG&Jftk%Q(?-mxH19+d2v5xDwww{^M* z(_-qOS%lG0fkcSg@V)YF0%c{Zy~774??0SMDoH`d3k3>A5?^ob@?Z%EEW48zL@{%n zip&D4o)qI+RH;8MpNTbEN_KUAdM+9HqQ%kjoJo##O4d{PRy(4a8^+Leb}jOhws(Dx z-_M8~_W`)>U~xOC|H6-TWy#+*5L8Q!)ENJ|SSjK=$!*vSH)~rQh!46h9A;!-8THy3 z0#q$2=)5>;rAmWH~FsdsfFZi&1CCIt-FyB)Xe zuGz8~S2i=a_~G~V=z#J>h91Caxc{l#3-*%%j)drsGF=go5FQM+&zfIWO#S%u!6mX2 z7d{{ztJu3bot@aKG}aq{?L7G9`C{!UNg9aaQmS@&?mW+77uX?>N(QjQu~A5ZeycQR zpp|v|u067-fkc?E+mG(EGXcw|@rYr7z^A-{6 z=Ly&~6ZbDsL2gkCH76A_JyJls2GDh7h=ztmhE^ZGaAjWhs%HjSN{l!6A56$VU? 
zZLjUxq-PG~pRHK2iYV-7sEIxOegt z*!^rnqeP}M4)Im7BTc>PCi!#Rq^@P@#Xsmn$0GG14>vZYXHi9O+(Y8HYqS(}lHvO1~=f^^1zb_3Fl_gtVpDTD!bmpO+6qt(X?CQ8HJr@?;`i^FD)5(*}^_B5`T$9Be3x)G3W61I#!M966jUUT!9#;fH*Xy)u=PnL>k?)XY=j= z#dtKy=wOG@4dL;)X;)p8<$6Zr&Bnj${HD#bJRpU2Aajc%G{ZmOWc?$Kq*PQF+hcR) zOwo?{QqLys14gBAy1a0DhH+@R%e0cZXzp=n3hhGWLaQIJCHLux=T1xqGNaG7=_3*I zFQk2{WhpaYZZK{;*Cd=#1pK*nhjyC; z5IO=tIZ~j|tc*IxBmlH%)b0C*dmd(sLDMk+r`IemfE=ZvgbNyt92o;A;UXgUCvDj&Spi;c zgp;>x-;aoIz!2o4oj3RXG@J4-ETda5z$h^9uF>DmL1-z1_q6u&V6WcU^3&z=uk6=D zWgsT6BT}ilKYk(}AQ$WYF7WeG{`yPj(IaBnTkK_>A-o)HUnd{~+TXXqijp9UBy=U* z{=h%Pb5hJDx_;;f!Cu*G^ZMYWzAi3mZ(|$9EGXdK--))VU?F5sD zazSe~a-dM9fW>aG@ZTPqgO!_~g568}A=@Sk0P3K}FI(r~C68zo|V6rhgCI3!0gskOUTmeDRa0)7& z)$yOR3!LnkRD3{Y!FS+V%hV^5M^w zzL21k43d*YrInn2o5TEXGTXYd6L8k`7@C~{JWtTYX&4}a5l&@fz64l$(I<7|2;=mP z6zqrr3}_haZbRmQ9|q8`62Z`O%qlXD0~n0_8bPIZF4(#fF7!Nhuc**B<8Cys}$R%Lc<-x9( zX9qfM2qJ0-)}EesrwNj|m6RI*&A7U77Hl`P?Pc`5eg9*l{yOD9K0MR`3s0$gDfQ#0 zivT#yj#NnKEdVe@H$kcN|Mt_T$3-Lk!fa*r-L38@Yyjx_0SL?Fq2y;QwBn%_8BCfl zz<9FnmZb9ZaN22YV9H(s1*6Jsh3w2+x6*^}^@TI}O|ch9JH!7?neH!p191JpOnQ_t z-UaGd?HT$4fPZ43lS!b5V6x*|LjLuC{-k3RV5?O0jUe2jfi8X%v%4{J_rE5|0W^L& zQ0()bb+p}JxhYV1>o)}f?YNjY3MzZvH+Qc+*a5lftJxI* zee^jR{~Le$FZI5E39Mmv>+SU`&?>&2!~$58G=Oe8eY4`zBky0MW=dKC6WnS<1iHDf z8CwVF2+{!SUS%P>=4o^XKrG{3Hr?Rcd+!JQkeV%!n(tflQGmCGy8Mjq#l1Fk^hq%B zYk<$+2o}T^1jH)(EC^(HPZvJ`B0_fqpp?!6yS9I7T(B}aaRa5{H~F#i&``DYWVe!2 zZ*Jk2-4e+Fvnvfuho)eM8x1x*DIYV|EoD>}faA0PXYfL3;Nf3K3{4Q%kvCJj{Pg(i$L~>8ORbm<^5$>is%Wm@NG9z7Wvi!fpEq zSq`ry$;flz6OgagN5>8OOH}`x9L8TqTT=eV*Smgv-7x|XRZ9ieLLM8KM3_;Uo=k9E z`0VC{C;uHE=6}BZt*5}{qQsEV(LefI`7dhY$N-!fGR{Uzrz7tI%mw?pXK#c8&GEk^ zng)pATfsW?40cg0A3^GN(fpt4W~S@|WbZX4r*&QCMVy=AFD>OK5`cV{QJ*eug@zrm zU6++-sDS%68JOxSIai2dxdQ-J5ddJ7@=O5N2H0G=cYvvfb>kLiARW4kFr68DodcpL z;Ll@Q!Tm77@RllQ^r{V#(!27nu;uR?^feE>P$bQ{4X`c%p58(f*MN6#u8lq5z)A;z zekmR`6}9cv%URg`-T8k>o!=g*^loNNcEw?jqW(P=~n z;A$D|er78?>M~RogAM!SOP^VOV=FzHAGZ0QU;6vL0^ocI=R|)9y6=D&Dt1318@llP zs21Vd=g09JFrjiN-fDRh&E6R~khCQG%jevB3Qjs|C!!@9vXAltm+jzvz}6Vv7o_rW z^PNuW8*9aA;N&o1b5^D+zHG9&4SNch`}-h#GV9+tbKdLs1JA(YA%J(Zj(nSeSekr6l4tJ|^YPSGQvc=4CzS$u zmNt;AA;dYEIfaRU|9PnYQlB3mVxNFc+HK`P_?Cs+H>4Hd1Ox=qr=Yb-n?K(F%b$mE z2Eo`9ZwwSe@@8pn5V-ttehX0G@J47dFT6Wx!{GfPIJlR7unJsq+!zG^6F8jjelQih zlNZ|%#t`E0z5av!pq>BKUjq6!ALIZ}LdgdRA+O{}EN?)Nb^qDUUplt` zY$s$p{-d40^7l#qIqd(joxf5J$ZP$_75&E*{r~IPzeXBH39A_aI4#0Jjv6cnpxi=P z`?erb(Xu_>#`))#&3}4gQ>V?Yl{90A)w3Ec7`AboqN@ui8_ucZP<3*`XWu1!&;n7` zwwxc)OUNOeeXI1TE*09GkVarW#=0+d$h^1T;g{#~{OB$S78hh@eREc{MLh6m9iYt#ejO;=ny4sK zDX)Pp6b~Ya3;sHrkaTwUMY~ea!-hhZ-5o=lT_$97y3ft(c~<5}dRF>h5-m%G{aFWK zk#ztEOufHgo8#(RpG0FQK7;agmM)wNclk!#g!iH}>US@kPz2lxa>tlYdseFEPx-z2 zRqhA`x0C2#Jao}w-)8Anjd7 zD*Ud!Q9Y;o=FB-=j6zHFX+S&tt9`tc&imwr8Izmu`F2kJgzcQdmh>mjee0f0dMakg z;oVoZ1nYq;t@OtW1{FLzYGo<&1Do6zI$?i)*~g`Q#p(5*B;HHE4!I3rM=p%2o{pSX zg5PU?(r%s>Gk&(U$IogJikZ2<)!4>BaMJ=vZx^hDGGi^i)I2>vs96|=*^*Hvr+&V{ z{`HqmsjndSDF?9Amdks>Q18HK&JkfHH#h$I)(%P8 zwa4I>-Y5Br z1JH=|5x{MbGB<>H<$zpUcDG;a%?m+E2k~wY*@xda5|SdU{@@P3dcZ|;38*zUeIbdb z_&wt#KNZY`^2(Zkw{||ML^}#?U;q4Qx(`J?#T=vH?7+{gi?<(dK72`Hqsz$TX@HN- zOnK4ttnqT7kf|~r;tTrTaKLMF`t(99H@HbTZLQ@L&<_O5q9xN$a~?+nFsb?zkN=j;_v6m`sCeR09WZfcAD;wx799; zow-w=>-2#XiR(Qb9)d612bJc>Q=yx&c z=U|aMJowYwv_iRH;5Du-1fLl%>LCEWov?o64c;9P$gr5)SKgy6vO71q%?*~?lWKOQ zkT0sVY*Y9a3@(&92YD2)fCiZ-fRw{JF3NYc(4O4It|X;Lzf0U;waO{ypB*Y++bxm0 z4sw~}Qh3J%8w|ZX%np1J3as$YIHr=Z> z;BH7g9C68XP!?ogr7gGBS}B2+xYST?5wnji3!`-@ibOu6&dd$&=jTxe_==h*yy=jcOHN%;0!Ac@8cq^B4a5K}i{VzN>{#v|%uS6Baez zn%L3hn%-fF12^T}n(n+DVYY@;o9fHSVEX-}$9~=jvtMu1SFz123ALc`+7j|s7zL8h zJ!076ixe#b%XvVhddontS{I+&(UGPLlffGiM073wMPb+ 
zOAfG6l45`#$<?ty|5obr1$gS4;%voSUSL^>$_ITo{SJt%>Hm`gS5=Ukb`{cw=eQT0tIDrIhVm+ zr<%S}PDGASpzR~l1(4Ib^oRtBK|OFaSv*=a&4kU+x&C;=9hAw14uR(59d;ejtvaLl zX@ioagVJKs-+{VMErd@QZD0dT&9}a@9mxlz zEu5NaGLG(|4iIL_tyvH{H^qZeDeOD zke`AqNZe_gHN)xLBZ@#OpzkD6POKGi0N=aF4jd=Txeiyn>)ycYZH+aD*q@T+yDcT4 z>IeDeU=<*fmah2mAmKB2OhMFw8LqixadQI+_-fzkJOj9J_8lXDfd^2hJy6mKAkC$d zz*A0(%>f=q{&NrB7)q%zX3qBYE_#IVZz#{HZBVh<7}*ndA{0;|UjfFzYD#x%6UZd$ z02Sfm;TBdsPNQfX0p<3i5q`5p?}|5i#hn$$PWzXR0WPO%$e{qI8(jkR>qtKb%aas~ z%=?ZqyVpl}y(pu~AUf?|G`r+1zQM`e>#Lolk5RYp>gu@yMbQ4E5k0URH*;RFJaw?V zLp)%VPlNOD895N&NmZ^9!Y}D=_`DDMP3^~S8nQQS$@dEry+=K$pR?LI0CgQ6{atMC zj^;wH%`rn&);=vg{Qp9MBejb|NCC{fc$X5yiVmE7=4gL*ng~WMd37}n27$|hBn?Pw zP!1D)X&9&)F5#fFfuj9!KIZ&)7{%Oime8+A6g&q1xR@R!Atrqh?3hW>|1B?q?ER)Lb$RRYA2dyVisk)&c7)??N_It-_7*m^la*lOyPlGoHL9*tON z#_7vH7Bf<yJo;dCHHL37Rn8aNwPg|?<$w+5MY6{&0 zge)|moBMetA{O63vY!C5zp|n11vU>MmaaHv7kL+VS&(Txa7Wwe6vn7(G-B#0-e~F& zR|;EFu72ilH18^3IP*8~^u0Wc$@a65?FI%;&wsAZ?vRgwoBFSAH&At8uPMvQG1a=# zAH*sF$6DeQv76@Zw-X;Fm7G4v;{*N!owVNEefCh?Hioj6bkN>#(TSTSvMO|@G;W;- zX3-LuN8?0Lr?YMsBDo%G;pc0sw?PM#$}uT`Nq1N<63cK`-Z9 z^W;Te4XLstTt*dhul4oy|MgL;^l6l_nOBjVUF!@KFDNt2%M=dq@Y1Bj=C*1s)W2wg zLbg<1G&AREZFuoUd-av;A>U_j$S4AHt!s;~YL8Syp>@Gc-wKogH4Ri9Iwv~;1I5x{ zLE9aD-)8auRg#tdG@T5bV+-WOmd4|4rv@F#{nrdEJF&oCY^`mQEO43v-KYL^a+(gV z)xd1!UquI+WjSG?)q4ycWa*C*a%qjSUsx_+=+z%&Oyy?h&2 z6FlvKiJI;LH-`WG&w$MQ|GVArd;t+0#v8+(W0oeE?Cwk1y>}JDK6Q)qol#qz9JL=@ z-WBlxHfJlo5Ab8VuFl8Cr)dlaf*}PT7cC-f6BBtXkfFkip)7=oqT_;sR%Dte0(tWb zQ7$pkRr6_M7U4({Trsn1jIqbUlwX&(yLtWZMbd{SAif(bILHM-)%{D3lbciE*6uc| z-wsh^tjHjQ>`)k(dxb`<*GjvMN#{xIaVl#F5%0GKT-7kF`@k&n3RA?_*QDx1PZiHD zLC;T$9iMm1#g}rx+iz@M1q&j>YAO#%_`ghr03<;z7r;|h>EvI zIxwrIW%Gj_O<4LTqKJ;lY`)0~>&v0o%N@b;1sZ>^XWzOCS`jRIc%k?X%=68G!YFnOp|9>i>R91WD2W z!FghQCN*_>SB(LQ_fF+Vx4B1zVGiEhXWrouzldMwE;1TSIn&(By938NMD!_U$V39b zlA<}4t#lKM%h+l@!!M03yv9xr1gv!oqye0Y>;`35e%Lnw#B40cff$yhtIU;%sx9Uq zeHsJ;*2$qppoyiaM;#2(7mk<3#}F0GH4PFBE!4sW28vOaJ1#RT-8BBD8J`HC7tW=! 
zYFlk^)ISo>(gyw{I3o>eD9c&;q!oEhkx1~bHyJj4YqWa?OF@%!wW@}W*E`H!;JqZX zSWYP^m22}#5Dk$gt{#BjpM52MA?AIQYH~$OZ{8@OPGs41rSgL9?lnews9~G1nvNnP zJR@6ITLkd{RnhJkwq=CuD~qg3+t~`=RcEW z5IqbH1Q`F+kFxXj-|P30Ke2ExLbyd@v&+}VA($b;w(R~B#mDEtjL_BT>;mtCDBJrJ zP>&x7A0a=hzzndN5M<%u6iH5Jp&i+xVv)1IKUw)Qp^<1Ew9@4{w+#i@&QIbUU|aT7 zEYD~(`Sbt~rq>n-#!%JE+u5^-0!lO7IxrCA@`F}}`LlB6LgHpQ|1bLux<10qzJ10G zzKJ!vcV{J2Uv=e%GUXA>Xvz24h`TMclxV$kE4`gLaN?pjG>_w0fr#eG*^_^Rel15) z%=$UCH*I=1$Kf+dYO7x>#{?!DHQiTE#=iVQ;BzPz1^-@LDHkVw0w?Vf{kM*v`v#B> zAN=n=%(QuPyVhEp4?DHfV|m5{8LP)8`^o1V@fRWnHFfEZ%d8Xh%F~dlH7C~SgSHak)VVoKi)|T=TDC=$};K26ZR${(8R||*=VT^e+ ztw(AbiovE)$P{k%&JnHB%WVT$*i{42jVe#5*>#{J8h}jz1FbQdl zFmg?Utj`$6)J0%&&+ioSYjaNwWG-C=vo;vx9H~bnT6dt2UwlDZxl*Ec-MKNwta!zI z^Rcn7e(u|9W!ETa>Umz~h%BENY55RcL0`V5)nQ66#t$nz$IvX)n_OW z?!OYlE(%S`25=UPd+|=Mx~+U>3!6W@NUL?LY}P>k2Yq9T$k3`5dFaj6 zSAfnW6rg|ko(7$f1zA=DG`}@2r{!VBtB`YG%EI5iepQ>zx|o4qUe?Cype#L&{o=Rn z3Ins8wM*+@>{KEnN1xO59BDX~}- z04S7?3GfW=j_g7w`D*slNpjA!>VcR4Zd#8RV8G$_0&HRbr4w(rz$}n=nmkv<@oD>x zh4iSxU|Kubtk0nxdhnt0iZ&}8PByE!|5OU&tX+!^Wym*UQKX$xH9BtiTcT?~?>>C$ zG&0a}y|}hVWc?cg42#X)SYLbcUhhf|K=%Oi;HGEyjOnSae~{$q{UCT|98)`8B6+;m z5%ZXEjJ@hJHsu{7L7}vk&*>6-87kr-yDDc$9)!yRE?;L%_Z_JTs6??zQ;T-{E~1h2 zbn(r-Q)3Z|b*Zye6&C(rXgEXT<4TbGphwGr9#g;Cwl9dpqY~P}>v@9?6wc_wKICHg%{KgKGvG&3299?a3 z{xJcK2#})9w7>70*e9t*>pW{uWO3^Pj5?iywt<4*r&!CjQCCFjg{MALjVtMGEyY!l zi#@+Iq!k^W>UQXmr7Zce8PQ95QO-B1*6-4mO}vNF*2;&_jyZ$$Jp(}}69d-TQeyAN{NcC{PJ?f77F6KGu@jXbC7 zOnsx@sp{&a8d&$Nkhm!{rp?0dk10wXMa1LwYL`4NdocOl%2%zbL^{!@ZP6}U!tb`Wb(w<*UK9|^|sZLnp+ zH%wjn+Q}cAWIn76@$AT?x9FeoTYR-NtXtejEM74eauD`g+SOOlteEVIbfgrrgAh_n zfi>GM_WCzp4^>!E6vgG1M5UpF6avQ_iT9~*{{V1(+M>cnaT>}ZV@EXf)>o$-bJF-U z-xeLI1n3$z1N3)tYBX(picnRda}YwDjm@rj7koig?EkU%-Cs?nTiX^C6p<<@y>|#U zdKD1qBGOBwi*!N@B@7}U9YlIlq*v)ZQF=#uC-e@1KnN}HJ~L4(A(V2p9wXtCznX>SvCyqbc`owdo zbg@&ds2`J?zKlkYkyh)NaHP&`wl-7MjN55Rr6{-b9d~cdLnBJHZKDOhgr{ROg*!9c zC>A%-t&YV(AQ$!N4YK1V*6;UeIouQY8Sj4dY`OSpcX?S*qFevuDY#67BR_s^nI1_r zP+yzIR&0mb`uX+jYv6{@a7H;`hlbRZ zrhMkhkEdMD1EkcYBr%0(umA2Z_In1@>ve~@Z-o%v`&Z8k-OH05_d{iTm{&TxLd5Jx zP{Z!4jbtL5`mQdXD(V-w<8wq-2@IFKjN_5b(baoE(atc7LaZL5tM(LD-HXZ<3uH78 zR!)W(LXF!=)$D546}s9#HW_<)KgG@EW;fn6!=>1+ZHY`U)YxFkLf3l$0_D?lz)!rjop0 zJ^1kS_)A+d8!5$DO_-q0_V<1I^`E}tgSPbr{0ITNR zW5@h9{8yE8Fv|bd1j-$l;gf{xYqMfFSp%G3)%otVAnSOrvzoQ=au;M}Ic{rVpBQGI z%r0?emVot@@&-?c9)_bbM*|D!fw9sjt5WBm)A?0<0;`I*{0!*OBfylIp9?8YTb^ z*4bdxxkjRdL0Lk16d#nv!Ye3+? 
zu!OuJ^+Ru6PtMI7`VaU4rDquQ!)MUA>%cON!&JuS3{0fNACv7Xp3g^kDLH_>d{jba zpC7+^Alh2j^+kC00s}A9nQ@jcsjE6aqLnS$Nx?knj^`<^6QqM|bskT>`fab>wq~;{ zt;NAowrr=a9kx8{N$PmEb@n*vgWNWeAevko35LTaE3C6jDnMPHSMNR$7fy9)(#R6g z+3zm^o8YMSJ%q%?vTJXPc=yK;0yd@5*)wsYj!6zU0Hz+`XWZaNyaxaAAxT=yC?y7l7=R1 zt$p=v!NOzUhp}`@h$#gt#QsBkr!ei^uY6K3>vHI-kX1{G$hGUfrwWH+9?Iuc2PdY2 zQZ9EQ8IB5F6^1Dz|d6+*opO3=BgqfbMW=KEe`N` z3{mn$UjI%4!tRjzg1`0q)-jpg+k!|RN#D)b|G-(m-6ti$kKTBN(SB+8+rRi1-IlBk z5j^K9Bmn#t>$eu2mzuA|Z3vwbyhMI{55%L{df)TQ07@NE^dh<~rl`CBC8i z`^2|fDtsv2mX*DLEjZXbI70VgdLLxj?0|ZGw-#+zP{IekI2GHm7j!T?$n^n`ywG|l z6wPIoP3Y?eg&Q1Kp48C!^e>ZJ1rWFdSrOmoMNF?%U=$=@Q;P~WbprFJ+;K}*Zn-vU z!(BME^PO*A%}?W(9hBh)z8M%3^C=))`;Xk#WImf&xABVi&-9-12seo^dokX%yqG=< zSL`^Quq~e2FB8ex&#F8*y2SQ@hvB=WjmoYy)Dm%K#>1D;rCe4k(C|1Y-q=m+WInjn zb!F40avhi(HR<9Cnn>^$UCAgm9G!U+`o!+!pi8SL?i-N=c35^)Ji~$9pFrQf#SvAj03$bxW(#|< zBUl#qkt{Fdj8C;JnLgo6Tx=A3DGj?+UMd zWXEVhQ;-)cbq70@W4tAvlA8gz=tu_xAm!h{MTBf$!svHHM?&jTiB@OuHnQiai8Wci zQFqKmnlR=7Lg^6_GNt=G)kwXzLqAok@Q(UlN(H0l8*9UPtJEPx{-tY_4i~Q zbZnE{&IYU2D>Orl{r4uIRE9}vBHI_-LOk(6HkqU9nPU`#Aq%G;i|q!cK6mY3%pYzJ zP1-aTIBUhAyGzbMK1|7kt^i@GI$FKjyKpDun-02~_{6v%9=j%s6DtI}U8JWEtwMKCkKCl=k@8LqBg6{gn1k}H zv{OE#{-EmVh9@en=TL*jrv2@Fa6!)Nj{|I@J`TWY0V?;G4V)^zn9$Hc%47#^`10sE zE}u4E^x3kj;CRDCv*z-3uK#4x@Cbk+?q)dPpZ?U34f>FgulCF95uGceGhUIVebcFK z6Fd^T*dT5puGj+E9R2pf9lw#m-JubQ&!(KnHKsVwh{OkAxR2F(#pC{7sp4bLA&gEY z5wcoly?l-D!PS>uah^LMDdRiaxLqWw4;km#Q&eler+`J{qbM?{%nmGj;`NyhfFnQi z1RVcPIFq*x-vHs$Ko6gV&YX8XhS<8+2Ncy59a_d)#J;(jno0L^A=?W`H3#6oUF#eRKaP8cwG(o!dn)N{NlLy*9GApMr zfH3WrxVM-UT;=kav)4*0#wb8X~6AT?UlRWHN9{)zH?6q?~g3d=; z$x5Qikxa{2=)3`PnuX3uckhi%k#t!d%-~%-#y&p%^viX?oJH0Qfb=9ej0w&Ye<8N0 zuueEdNjz8aem!H~u%>y~;CZJ!4fL?6nLSy^r-Ml{)TH4X z))mg%A**`~|L+0N%9j(VFiC$hrj${z6qt3tps25_k;2xox znTYzQG#gJ23piX(y40p}7}5{?(boFFLM-R;bpoVm!cf<2(?bT+p3npk!{iC1H}Gz+ zR5!R^6t{HH3bNJo6my?#sA!v{S!Ec(==bBa=+|1UF7#kDvISL?KzO@FS_50z7mLM4 zHeDJz;afh=zUq2xzprXNs?rylw(0HDSdt2C%Ap?~h!F+nd#$w5wyH~>#1D!W9}DiC zL(=Fme2R>4>0TJhi=&9 zdF$q97a+RirthRkG`@@wh%!E^mJPN0N`ltvo zTR)^weeQTNx00L zalXp1JT&V0gYjDJV>OAdic?+gADM>w0j7eY5k%#ZSB&*R_`E#dMXj>O61UUY=Ffg%wKlMsVQ_ z>o$=*Fq)ZBKFSy`de)1I;s)LgnRVujC!{3b83{rLGhd)Sz9_A?LCv_1%t$$mE3PCb zFDb*5r{ANXX7`L^-!r|a|!%k1vw2TQo#~t-n5`R z#J#3>B5$qT>)SAT-(;UUrHGU?(j@pi0ZEJV^AEzGS%LDEkhXGWp3^yb|IBJFi<;A+ z;(nX_<&as*iTFi-5MR^LSH&U6`-(j%#)gi%t#R#Q_DDR7^{j5^LqY(#LakzQS~wdfhbom2svfyE!!)?(XR8r{!-JDWNUF zZMUXCk;7uDdNBga^wj+XvK0l^T6N_NHtwBJUd=1QTgg)$S{DE{nDmB5WIrtwr>ORB zvRa4~-@*R&T)2?VT%p06P}as9-E3wHM%D#F>sD#ivVS~!=mnj zj5+>M)TEc92B-ch>KBYRePedT^6)}`G@cSs98HPu)R-+tz$I*p#EkqmW>24Y9z1{y z^KqHz4tBB%sO%5bO<04%&BQ76(|)mbM;LnOmA+vA)C^DOj!_e6`>K&bgDRY5Ga9dQ z)vWcBmg0B-ry&^0{WSK`l!WytqAtbVi4^xMINh;n@0u!(y?U1ShMM?*NQuImPbZ@* zbhhauC&yoEj%Pv*Xhh*h3*BD~wrsL}m+hw9JA1t1sr{ZQ2@VPTEr32bOc4OWI>n=R86^$*K=sSt_sK##d8;R9gMrmm4 zCL|nnxEuq$R9P~ONsmm&zkCiCp%JpH+${Cf{w%PA74YFu;0jf;IVlY4guV^)vODv@ zHu`qlEr8q3RB0OPRcplYCUNAXd4_Z;-Q~!*53?Rxr_iu+6b3HeIaq__@ams^ z^1s3IKO4%W)yxt^aUQnGr?P1jMv*XSsAP;65#4H$PP3h=X&T)bD#zFO64Sgv-Ba@Z z$zBX+=Fs_=ph)O0qb`44!-t|QXrGAHM3T+M12Ok)(jz%Ri~b0mS6xI5-BP%N^28qO z5dw{Tsh{Y1Q)8AG+p0>j+9tM%VxS1p+o(QtaDUPtMJX9KZzB@2^fcps3$r&K&s8V4 zy?bmUUmXZk&x+%+F*jB(B$TO1RPS-uuOyeV3aBL@XOtI*YFZ^fNa-ELYLy)YWDPdz z_`1Fg(%5vFmMi7H1NXXsN!<84ad>4vBhCSdLR`qOMcSshuN(xw`syvR3X}U{g8Pws z>)JIfsv%+siH5kx8!79d{5*)**=bWhNbr7xwVuRF7zF8gYLxf7GA%|e@&lV%bG9Zl zh%F@gJWpgO(>;5Pa3xBK(+~T7CxJ0fd8owf zXrEjRL~8!st^L)<4YEjQ2;XNRYvq?@4N;`1uVnJO?+q%h?g`$2!*KNEnTmxw!Ouh` z#7#1X3t`{twCXSovT5Klm751BKDx&EEWl-=3;FxZF zOy=whcaONnN}y$A2I=?}{m5)(np1CFZoXVyVrM#LN-|_~B#&vaeR}otYCQ@b|E#qb z5-}wIWn0**Et7)z7WRq5h@PqdK{w 
zAw|;Gd=&vjqWAT0=bvqSrjx|=Kn78LOntSS)UE8%A60(yPk*q>Dfh64T(&?rdg>jy z^a!_5$7cng@Y!fJ&I#Rea!|fw5R_^~&1ga_EdLcH&i zxyU*z>tItlu%6K!YNcY19Q>XZM*DCa%5lWz@J{dQc!O&YL@knbNABUQ>uGjum4j!C zUpw2Vjz(j z4t1#uUUglVMfYcZE-GUW_{Yle^X+Qn)QT>Hw?{~WopnnVU{mBOUi`jJ@R%%pbK_I= zzVkVBb#m3e=V`KeUy_dCi*7-q)U!_?<)BjHU=Gmx*&EiXs*|`=WYoJ#>iJJBKxFD` zmX%X6ee{ubzq_wRhSUMJ?gZB#QK|xI#NPbS$ERwjS-0W+`u%Mzp8-0|5H`WaO*m2M z6*=4}QV>`l^MmEuGmFi@<+}s#Sv7O|Kbf$!d1m%`zB~DhpXJ<&?h7E6I@2xY^NoRQ z-4ofa(6n<_h!%b^RAHgeU4f4Kyad?YAYga2FSMhIl6*8O>>mrJh@(_HuoBQsRnHpiIZ!&Z_s^*h@rgKeR`s(w9h%$$3g29Jwo zv~9eT+y~3WfOs_uZRGvUQ$fO@TYn9)AXY>GJRrBQTNrb9aC8ng zix%Qu&jfwd8k?VcS3)+1 zAS{&g5bZPht$Nvi|9DMOcpg=+oFk}q{8fUO??hk$Gh(j%ps+Ec*)N{|Cn~jD=WTn?dbi^K#_l)?Wsxt>NV)Wqcu0{nX+KWExENB>@d78x^vT{3h9}$lXp5{>W24G5 zl~q6s%5piUe|c`UH^S!Xu=N{n)lOO1+I;-!AkUCZ<|g!!+JB^Z|2`u>sYlmXL;Mkv z!KPFUWUZ_rGm|8cLiut7^R+0kR6`me8;8>^7d3N|U?tJ@BwLT^j8>3h>SS@YxoC#d zSk+T57%Sp<#lT?m*8x)=q6?s~RN=>m242wyQVi_$7~BGTh(=qAr`62a1{K(w%hleV zer*W)bLerFM`_dDuwNMs4KEYJJ|a`{LFd2wy2(&XmM>)*?^w9 zmdDsrFbf7f<_OH&rb6fgQyJ@h*GCQUG22O2R7^~uW@DTdu0M9i+@Co7+usKdG z8^MD!cGq_^ti%y?JGM2_E9(7VR@@13A37!1RkU92p|@U%L8YAJ*_xTn^=F{&cR$_r zx;~KKbEr|XRPpJky$K%Mg%)LMhv!$Z7mEr8o4e985qOedltzn?wmw+3jbFDksybuZ zlwy_cr9eJrvV=t6T**-vU045jyCOJLJ1G15el7x4gBeXD3A+V+d8slM%k!VyqWEOo~Skc z_OE$ae5L3nolbPDO=hF~EBV-5R6L$WiYDp~GA0D?iW&lUm~7YOx^<+89vMRl|GKIk zMZvCDX50F1{a0;u48PLS51Zs0FpJn9ywDg*Iy*8-%apyI)cV8R?gz}l$4xF#u%BvhILKs&C(~BjR7wCax$U0>erb|MAwRZGy&@e zj4e_%aW299=JQ1kaqdWi1lR?RSZ(08k=Dz!G$A83@p-(}>Ce`Wm*J;I`-ton5wZo* zL76uXuey?1W}8q_uY7fVU?t%`X;yyYL;cALR&U!s9&sojSDH7FpBCW$VZyqR@737r z`*BNz$!g0qft+oq^;Cbcl)7N_GaE@oUzB14{w#DZ)hE+u>UH_<_{ZOfZZTTAUVR|8 zTReql?ocvyv!h8~Ii(wWcL&6mEaVokSHC{w5EXt4&T{rv2uRRR4tL&-(Q(XQFdNG` zt=oijYeI6w6s1nL2)Uqk7OT3Ea(Po5@Js=%W0r$5+b_jqHsj1$s@kg$$o?B9{^377 zfK+OnhRQWguJqZn4{{HKO{)c4Xs^rc9{O!Za4E)0e0==&Hq-0(t$Sc!oT_B?6J;ts zV48Ze$e1R%0QdTbGuQN%4Jbjn!9m7B!!|$ev(;HzJt|6@&%W20Xc_+)Z{hx=j{<}1 z=a`#gT{AD6HED(vmq>$8V2UKIoZ4y)5M13;i{b0o%(-mFt4=`2B#xPbQg~qhMC$z5 zuLyUGjGsJFz)dd1QUra)FW_vKPs80L+tB)(Wxj@OoazYFh({v}>Vv$RXFSCg=}4b# zFGG^_-Wy&@kYB=EKJiQCW@%<<{Dn2H4(z=q;jS2(t>?4307V!eG#Voc#2t&$mNh5( zS-pluQ|U79XVRvvvi-V$aS+qb`yP9-bTFi@taz=wDj>utU0%#*s4mNSG|?(EGeE?U zWG$B?mjQTyN)UBS0Up{yOt`&vhEq4I7rVr6uI zY7RX~Am{T*j7^;R#G2p!QEwg+FM{c4`(k%d{aW>vNHwV8EEJ2ArD8r)-;j8TH)LDE zO?mRYq=<-r{Or9$mM@2v`fGh#ZvC>i#!A$Nvh+XjEED`&LOCUumk{IB{U<(BxU=$u z4M==rshm`#$9P@Qjxw1h=FW#Hp(2&N$_R*wpeqwyz&|>ZA3#1=8XBMz2K>DJFZKX{ zw20JQWMZ&KB4@ag=zJ}ZL4UrAD}R@!U8oNVReXtXc0)}X&y8t-&#=m&`;4EyNHjVP zjearLE0*^`WPUWI;N_S-PJ#4&3Q+IwdNyrw^|@H-N-WV}oe@@=NQtl3iehO4u=Fxp z(_iGo4@6&C>e5g$CX%%>{%jgf)WJ6JW2gnOPfVsteU#V7atuNaD~;72-)*Xr`4*Q3 zO|))|IMwgpm}N`wc#ZXY696(hvOSCeLWG#y!!Adk+DF$OPhY}BaP+G_S><~D-sfmw z2SIj9`&GfmVKfrF-B6qyX}9|Jm^MATb$-skq5ut%W;1zPxT9JSQ%LlE{qA4Jqc>nN zI>q`0>0SRm$2aiu^hi^ZZ=qCud(&^fpwWaTRkcc(o6#GPS17+n&$=)6)`_PuJtaLW z&|%^uL>o%JQ@}>^IV4i`!A9L}r1vJB9>N3?LTn4lxW5jbfoq?eCU%T2cji!a(7DeP zq?avfYzW%FAk%DgTyy`TSfo6zYIGtIM*a_`JQo>pC3sK)NgGqVk!R@-9ZnSjuTI`>t2M^ZU;c)gWeMVs`-_4YpF3C&3U=GECl31lXcTnv zSG`%6Xy?i@yIz_IF<*K+rc+#(9V8G#bnPC@)fLj|Nj)r)upf}OwspHFaxvM^c`qPM zPiB8F)|JK%QVWJ*Dy^tpAH7QzROKb6e)P;zV()VozY9TGy;1T?RaGUG9ZyjFagS-6BckS3Q$JeBl?TMUq?VR%t`(VooF(seBis#x>G3 z9tY#Wk9^u_YfGJPB%Z8!tV&8{++Qg!y?WIypUn1Io)IEXROQyU;q8WB)-|*mhwH4@MfxlFE56t#>K_F5^d|Ic5W~y^Z(xTx=`c!?2fL7h)6rh?fbcvL z5lkQBXC(?AX5Wib5$C48_HI(j`8{C6c4uI?fo+(XCmjdPQ3Rhx;rGwnTF?5b2pC9w z9U6|S#1ZC+KiaKcKkTG1;;Fp^Hma1-1n7GhPi$Ianaf1C1_Kj7WodAK2J!52oiP+N z?8-JcFDJ=u(7*WJb+5a7Io`v$^s-OY%PW5Do&J#cp?6D=-O$ncp=(0UGut12l&7uQ9z72grnYaM-E$~CcOf!G 
zJZn~Y3Nvdha$6fWi!AjuqY<>fr>oW>$6s+Wc7l1=i9GjAnt}1d&e@(g59T>DksHhXodHy7 zs0N0!7bvOSKuAx=e9#p^0$V-A^utslRW!iO#4x~)_IADW;G};g$^VU>E}bdabAZAS z_yG_Y-hU7nVWrV5H@o?WHbq1NM{e@yW#snwItN0)Fh<#wrbp-?h2P}%7~r>AuU+ZU zL(NpG-KKEFS-WonOH!F7qVd~4bM<`dPnKxyv_lUkK05Pr>MI2=)htHD_>|Z?^1k}P zdUhO7fl=<;54v@2_c~BDRhlc#(5!}UL^1u-XZ$FQhxdX4-(UVu@7{9%v5z$zj{0r5 za7ykA(-Oz7>ha*dx*m)^(&OAb>_sOc4L<2@fEsUmzBgofxTDRx?abD;75ZZ?F9e) zbv!P=j@jS8&aG?axCtw8$2U!@j-CniP6q^TMVv^kzZ?E(o!Ev-YVVN%Qju-sPT{_jbilSl9`kA%#4Qf}th-EplONr6qN-QCJVGuY zC1JJlEdr_6n8MgtYE7WcdZ7A@yBBeRbI3#H#x_*Br;E1k3dp<}{>U`gF-bAU7O9fP zgpKt4wMo`b`VSeyZv-FYEiEEiZfnvp$`~u_Sk#+RE zD{~YLz?b(C4HF}*;687>;sp0;luxVN5q@RwFm>M8rd=^HAGE*oprY?>qr=KgL8TH& z_f8$KFZl^Qdyf=BLxb&CL+^nbTsO`E-v0*cUcz3?tG@mJv5fUEXe)Z@!5*^w32n(j ze!9Hd&koO2T)c@M8&Zpv!mVE{b$pBX#KQM@h%wOjWZeq7U*vB769VceCj^ruuxF9@ zoqhZfK$XG~Z|qsBd9W?!Hq@T&f0M<8RCqO|Jadp}tQ$z(IWEqzC1qY%o3hQ3v8LHH ztD#uLj{f>=Wn&5!3)zOv0Tvv4p{qZ5QTZg)_jnmu1zl$Y@yAa<(&tg^{jgS#+lRgv zI@0+wx*BSd$L2PgEgH^qbIItCN|8A%@`AkodII4;8N~St_F4wfNg_ zj;SweQ0b9>QgUy?{xQA(o2VJVm*#V{AI<5u_ebp`?4Ol|t_~ZURoyFh(i!}T)JnXLiMF;P*T>IMz{L_<;vCBg}_x`^_{a;k$ z{|@zk5svg9hVg%v`cF2-_@^oUe`BdYFrWpdpj*7RCo74g-iS39miM`TUU2A61GZfMa|`+B?E=c)2<%+ zJ2bFq**tNHcpUu1X7FX;-CE5EadZc7H3EAsmF1nKuh+LPr;T^mSSO<EzO36Vh)s}(hK)GEn~ zcMZ|2#;mJ@@=qaqO_sZo4jgFr%{m{$_F% z)l#F7dnc&^tqIHCAH;dE(tCnw2$pUzzqP23e5EHD3@a z>cl5tt?anZq2YZnpwN9TL36`0$MqQ4K5>b5%GEJzJ~>7@n9VS+-$~~9%)T~xii_iZ zj)`Otpgnl?xl`_2ar`r&IuCYcnp-a9h1c|PKlDumAWzU2&rx$iIDDN`hHQV>b!6(sI9|eV>R8E z2>>H%4Of1b7a$VZx6VXQ`t(~H04aUEeyJy%pZ?U7TP?=MRWJGBYu7kd1FNe_JK9nK zlmR976D&$ninOO&T0)#nlshs8=cuLgH8b2_h0biU5-jLNs*)vC9ek>uh3O@l!D%MiAb zaxs*1Ng!=a8DHH)Ci+>pMjM4lR6(F~^k)V}j&!)0kf;TB=x( zc?!BiI8T2$@TK|>{z;kwWQyC*y|%7<=IG2)E^?j{{NGXvydZ=Ikcmyv{$+K%Px|Iz zF~gp3&v38mEuZ4gS*02A`k-p2L@ZgUfo$&EeC*7y6Q6VYadOv=*z(w^PKmDh>9PE= zN!@WEw=24};75sv-?bYlWru5KdM|a)K0&;NpQAUcLr(aVC_v2pl9(EZfxm(Nt->3$ z5ub)VSU_g?0;D@K`OioN!V8FdyD=XuAfz6rb9Y5fa=&enm#>QRUB@SO2R(X@ca6m3 z-Hr>FkE#<#YuUYmEZNoBKBrgFoZH^Rk6I)syCUWGzI*L_T2ZB$_=P(>ML7>x>$fE7xba)qv!>d+gj109V2djoU@7rm+<0@H?|FasRxWos;b0UzN>&SphJF*xfrmKV<+A z=Bn8rl6+rcwv>a%u4CdE`{@B7z{kyeL-h;#OA-4*vFr6q*wZQpeJ)?zruCuC5=X{S z1puz-ji$P}a#DEvd%&!_ov<2Z7*1OQL3lQ18m)IL*cA9Cq|cd19_0}#hR>tPPMk1b zq3fx-l)iYgzTGpNvKiwH9LAj?R}3jC-#Hb0=GAy@T-v4xNl!^<8O;WCOHY~3n|-JB zC5$zMAyruaiyje^2CX63<0;Rz#IT~Wm8QpXjL_eS0-!PLPga{nBjrl&E%REZZDr@f z2`f<%p8Y|Vc5&8U2LUPX_!#?BCFJ=LTVxz`%rl;|O#L9eIA=9F@#B`)T$**6q80n^ zCaMYwk>6V-gJg~~Z?R}rgr|F2ew~J`O7uyDEjj!lu0eOr>)XliW|&V88G4XER!!Fx zDOP!p{?Ir*JapfXOYiiV9K|Xf9;BGn_p^gVoi?LJ3dEI{BaPyBMPhv#*nP{kNs^^l z^2yeI0za1IXg^*3lZ2SD(`|sK_j?~L9wz)ELUpiA-%%zqg{Eo=G84U`u6%ny2hg#~w$6ZmM%P>Y2&WF{>fvekryaB{e-@PrUQ{Lp z^=JwbRG z8*~xpx+&DTql3L&;k-zh;LPu;`cPzbtDM*OJx5o8mvMCkM5ucV>7Bf-;K#c&mXrL% ze$-g32cr2RRopx5wgRA^j3l=Q|FKxZ*+|i*IN9{=^d9)@eO=HE35+ zJ-qQy?_mvvF3gw_50#N4JjEIZM||HH2Y7yS)Fyt}49x7CihKdMW;#FTqr(h*Xh#8d zehjvJ_>2zhbewgK;RThkV_MNR9KBqrR_{AT*O+B?8mA@+N%z*?p%oq$xl5z?iL=i> z^|mK|#f_t)FW|Ge=1B9goiTHOc!XD9R*^>HsLFlNme~3P;(<+=Tkt~76r{Y+iJ@I@ z6;!u&jY~8C!<&XZhi%1$Oy* zb~L1Zc%MfMVtzo>8ba|&8C3fM1~64#;uUtM{Kp%c4rd`z&TydL?2%8Ge_w6p#|7aP zbu}jPk8e~0C98gE-Pl%Lpb@l-Et+yOp9*_~=5YuwD1BZE?RJxacRkX(RuZidr`YKA z(gYYn0lMt-tle$eU#>BWjE<$g(l zoq#P+v$!g~EfC^lvX-7A*o-%GH9df{YU3xL>XeA$?G%rU;asP0hNif&nw z%p)L6_>8sCXC|S#L&9_(?v5BUD&Czj}5SErdK zyKprnuXoX#s|cWV+!;`xtsCa63m43cH)x)`vru;#sl1BwQr?VHr6po^rJpR1VV7cO zQIN`1v!FGVmN5Ud{R)@O{XW19on$Kzy+BVq^e$e2uC#N6)>nB4eUDl$@@@|as{3^z^ zURSn#WHXszHiy-QZZdUnQ~;6Ay|D7gy%2}PZ06HxtbX^)>~S3#Hi;8tRPib{^E+Ey z+drr~1KIp6cHBd!CGQ-#C=Np_g3ah_bkrm;6!rVDw+b~9CvEtp=Vjx}$=cqB?_ 
zcyyw~PA~^s-=cQ*`w>wJ**!1xIz~{DWC}E{tJ@M;s+*Q+cDE;&{qd2X)r?N{X<8(| zm$xO9$~5Y`HMV#$IJJ;+MLuzA!1Xbqd&)yK2hH7kKFLX_a(!v{6Pc}<8#HyJAuf2T zioxUebzv)>-N@#*~sFkT|IPE1mkV%k@k3B z~m94&lv|Qg2nw z@R&L29rezSnI0_&eNw%VJLY{8(t6dMI6fdWF4wl~1JL2kalf7C!6TQYFY3Jr4s*{&k98rZ54X;2c)fR)`t2a@ z_ULPT;)a=g&5zlk5DSq0Gugz>ERB0D&C8XMdrPUZO8`>@YiUM>E{aICn z6%1m;XOMO;ogRL%vErMQE$>(Ek+JQqQ_a*0HCSFb`AHqjNc^3LjQYwYoGEhMibaUx z+eMCU*B3OuVCvwx^Njjw;bv{ka_mHNq*`P^%3+s*l+~vn@y2@G-Np4l)8_W{r@oXY zyWTbzZdT~?f-D&q;08kKsXw#jW>CTQZW;Q z%1wTnqST>mRwH+Z+*bkl^v>$vFaCGlA1IH z9S+g^=tVNs#|75SVYA5o(#ns|Bkq%pt9|w1epS{GzE9`l^@1w=?QRnVVJ2Hi5$%yt z&*?991aRAEzXJ8Jjh)KKxT)5@>Z-})KI@5iBmXx?*mgx4|Bh&+d?V0HY031I?)x?7 z5|kL@h>g(Q7fu~?&)wj$UEWROq}K>}P4JeR+FDikZoOH&B`7E3#hm(^_fKXU6#Tly zPGRS(!^v0c_D;o{i`g}8yo$q`IX@&x8)3}11S9!Uqzu0F-eWbHT&g+2)toZLNDBln zYz^OdL&=eyF!W|WhD?{|@+ODq0nr}KZWLhgS9!vRAG?o|M2v<$X6T;WSKMcqZPt(qg~u zkiGPC%vF4o&8$Ru+Wnk=70zdnc)UR)#mHgr*KvzRQ=^$cPOXiYnC&R!Xrab$n_6FS zUsHY_=_4HQdT4)O3h%%vJgFj-flmWEH!^iu4MJ08LJ3CpyRh~a1xog zQ^h;BNF9?G){Bav8k5!-Wj(#8D+zD}Pkz?$r@o1p$w5l%Rg4?qH%Uy;8Dx3!myr9h z{tj=r{|0Z0Wd?V)Z6w?oGddD}d2EN^fF6?@ma7ddw5&{ax}VJF=H>@$Wo^{?*h{P| zHc&3Wt)8kx<`Qw->;vm=?c6jzNV9SO<9&eRR5~U<wKMOWeNG#zpke38nPF=h1#uV58q8-2yZxFYe5~&0S#qJnQDI;fRLj_^3 zcaaQ)WDW*|Fyjmm=Ug4bRX>|wXj?05RcJ7BUu*J90AB2K;@(@Cu->pu0G;eq+vz8k zG5Q~HBGu#c8W$s^Bm*j%pCST68-Cq2k-1H%X-xK^*7{Hd0ZyOJ8sd!DbkCMT@RGeqMvByAOuvcdS0MCvR08W=7v?DHz~PMP4`?T3u$ucIb#+L^6`ppp!+t(rnw}2`ta*Y8GY3u$zTKBmi z*vXETC@n1JmzF>f zP`f&igPJlqw(RDza{2-m8)#yT)KLXU5^4O1m?kFs$$ubG`Z0&)bXkwT#3HBvOzXDN zy)jYKyT3EW_MPU_tL51I{u~~36pZFtt@4Zv_@)mlGP4T$U+qfwL&WqD6N3-4vm4IQ z0-%lwJk_F8A+$lI7a*wWeT*f$7J5^%8!5dG+S?9Q69GgPUfw z*Pdt-Apo7kSDz-h_T#Y>lph%=J;?pT}~a zYOztyKh&ve$G%z6WbP~pe0F8h`5iaLfnpOLZtGNp7k=2crD>M8$*LmbG}~+j@1evf z8Vj|F^ugWvFfyxs|GY`#&`SQ`PFA7!npb5M9mP2sF~kHFujNx6%(|qOcwJ_AM=eJq zLY@Xy|Eg7=P(Hsf=)YJ1LPzha?20TpJAMJah4m*n?QVSAntQzE{d3m~scnfNaZ+a{ zkun5{NOO$PG}5XxE>rUo)8nj(<(*T*%Ej0v%HWt%m7Cj+>}9~hwa?>GvmyOkc(AKD zL5tTJ19vE`{Khqu$Z9&DmK4e9i{K}}!~s!^<5IMp>1qYNukF_~+Jm3)GLJI`+aAtC zY=f0jQLW&77f$PM`KlOiYbMQk&WYy>PBrnDjTqkVhH&Duo&ABrELAGLhQrG2a7mKY z;>$Em@SbZw2G^^bu|DlUT9`E4ezq<-UoscHE86GGUaHA43slP+5QC?CT+eNGlhT z(`)J^Z->!zC+`C4ya&d@C3hspzA%P;Ws0g|mJ!mBDMjB0q+OOjaB;sE+3PqZ>4rTc z<5frqk}HLw_9;8Flo=Sli8f}qt$~8x02S0H}lIGeI3Z)zjh)d6((>6aKIPq z2)90L5_&9KlW0B;Z55vW`Drx#>L4Rto1DA&br^}4LGVY;iFjI%pC|} zx*?TW{J%Egiq+cRxr**y&J~Cj#X6vGS|sSH4G=sQY<0b8%;yW=4il;;f4X<^OO5c0a>y{U$4D%M5_1uvAh# z)Zn{byu8na>Aemm6^gz4WJ!SAe537*PXo+AplVaBbWR?RWEU9=yXIF?KS(4g@cZBs zos_p((YA)u-ZQ7>b+H5Xz1*HsyBS5O#Sbct{Jt5iG?_L`97toe@p zozOE;ICbP!kgUB&)G=vMt;M+Y?0=YG9CNF_y_=scXZYqb-Y#Hj?h z{IU2blfLBE0{JhGa$eS5|KS`QOuYH2A`x*^!?rMC273VOV-}e1 zuC+Y#qrsu>+rha7!i{H8|fOlBuL-1?gGF(ghJ>Y4AY0az4A z5{eW%UD5Mlcof8VN}(1#{7>7EozxR;*p}MIHaEyT<(2J-FV5>wTPH1Z+rB498F(PP zv-<7?Z)?!t(LKo#%@k|ty^(xz2)9IU&u5$Uq3|w#Z&r{*Hk+VrQtFJ9aC5>nA?w&K>?LLbQ;nk^HQ=NSkuyzF5Wx9O2d){g2BxecIoz!BRKpk`T)l zTHi4*^uu9UJONLE41W!L$-EDck>G8DHP@u4oK?nQ9^QrskJIUOf#h~x#PBj!t6o~yo-6Fti$b(y4k zaW}b=FzSVR{li^X^aVQ7kLT^pvsuG>u4nP#`+6ttKwqx4HGHZH7`G>_ez>V!M4ac1 znLf{W-PFzEpyt~jbFe6pDxp~>k47N4DXaB_**SMNR-30wQwDNnRovkx2PK7@Lot+1Km+wh`*L}KAqHw>6mPaLe zOEV^`v|2|ariXzt{JfO}Fhai|6QoZweABHUYhVJ>dayf}`U18N7(z(7Js_b{$t9Y= z9b*}zas}Idw!0rW`*PC_*qkA&r-Vrqq}_5Xc!L>nogT?nF?!TY>Tj>+pN8NK0X9mK z$~`}eD@*j11*{0cq%0pe$JlN5Acu4srM9?+&k@}@tGEq0^m35vHT>KI=Mn^_kPrHo zzyfKQ%INlTj?nl7gjt|*ndgwBQ4hVttV-3Z%?hoEwfuC4%Ukor`|!>SQA(dkcBOZS zxn%^cWcQxvYnN57v+fon&u1BrK9JOPO$U6W(U^OBZ;L#*RUEPnB3+`*8SYZ#S{ex| z_f+4(%MGpO(7Bmerk^HBeAPMu+KKYU52iC7#UHK`f%94Ali`blP>a0{O@826B9m5V 
zIh0Mi7NBYAuf3^C?e+qJ>=??MamaZHw#0WLaNyVa#34nmWVVH6f2}tA(75IZ6y6kTYW;0Xd8KW>qP|M?l*IA%{gF!-h4x9 zx=p*CHlqsV!TnkO$g zI``Wb^oBXl9MJ`jS#=Edk_an9?_=It+{T4$1qtQ;eJh6lAI+>JY5b%Zi~b4}TVWR$!n-+b@(`22E!Z?L#3 z`fdgtjf6)rX?e#k>Dx8)U9H?W5GgR_*{9w+gzMXeEv~v8Kmx^FDt^G_tulg|9n__jH}z9?(*F}|eB>6HJ@|My&X8r;+O?ta z0MESgw|VXJzQv;ZV|R01FX9t)odf_LvVo?uC1*=tCLpq5aZ}NjBc%gF1Md)*G4Ir| z!UzwLwCfHJ;%Xd6iUTx5;VOvKGbrw+mbT@;&=+?eQl=4!4?Qq2X5pJ5v6%-|rR93q zpYkG1Q>@38f+@!j^E?op`E3MEF_EOOy63yG4?%O4h7-AJD(8i@aTt?j)Qm}OnX({b z9*mP9AP8siHi^$SA-g94J2>hRniH!GRi@P)C2GmX z>+iaf2NQku{9x2$s?-)SU%EN<_S4dR@b&U16Y1E3J&4~WoHHyfj7GhwxtN@i8)(_% zjbi!8{%t(H{g1|zWfs2qv$6`$imcPNodLSP&XA#JMYG4OhnDEL6KKXPO6}jpq#A{z zxr0!ej*3nc@Qhj2ptKlws=ib3mU+oyXKXC7jIfN|BgX)7Bntelgc3z4CvU+>!BIO) zXO3O!&W@t2?@&XA_z;4jtSL(-8lAHhj2x0eGk<#nE>PN1cmDI8nYU!9@2h3|4sTj# zq)(rw!JiAZmrZ+H-g@2^ zf>~U7V2d4IczQe7p^j$_fskg4tB?EVfeXryos7`9HTmm!u%8_ zOv1T)8qfsnND}!vxi5mdYWj$_><5ZcxDrfLl}Fj}FeVw|lTkk#Eef9V;Uw9LQRe3T ztln}sccb>6YkbF<+CeUx3gYNBY=?{IYwoJ%!<e7dlr&^B0@G2G7jpuJ9pFQaYGyX&17L8BdPObu5 z*0aqza~dRxN)o!lfms`o?T}gme$$C;2e_>_+kwW8zh0XwjrT8o@NFj@!g|s~<=DE^ ziaNQI*C^x=^4&tWT!{s`f9pv0zFkL1Tchy0b&5IATi{OU5x5HEyBagynxo57kaZw3 zPckf}3I5YG#bVUDy2DD7w6y8!?f&q7k*UW0C9Zizd65jhLEKS9PFIMh`s|mV2Za`h z{!I)q^_tnLfUeTO&2n4X`48bpDu>p9{;t)u`?VY!9(Hr!>VC3%?iDU-l4?;$mz0(N zb;~lq3o@+CfImm&-JcB72gy$UH20FNGoqkcAcH6$PBw+;g10GZksC8EPd}VZ${j>M zq$0#=>2!|os8Nm?3MiRjxDX^mF*hrR)l{N75}doF2}zBk5)WDP&>5{g`f=pMkC~6nuh>{KBC*^sk1uPnU#<#+c;5>n0-M-U~0p=lbZVN|S z2&~6Bc5(`-m$F<-fiFkD9l4WlR?0?P*5C-xXK+GKAtr#-(71{1`e(6E546mX2#b&{ zHrvr5cOucwQ1f*IH?g>8a(vmVep2UcQvjzQ<} zx;xixD1|)uNx8sm$yp=qyDTW5btNLxY1b_?nj1N(zuA0t0tvA;qFl`tOw~S5>Pwrw z*q+Y~%hQSF3>pSn75z^Nt$q!V$3{vh{_#Y{&~PJR?$svaa=Es`=7B5|+9We7O5?U& zEGTsQj7y5a@!MfdMdwsLhF?))kM4S)G$zG7I){y-&cO~$!pV4%zRfL#2GHyPa;5ZH zmZ4_3H1?0bo9K(zv6XC+XfIh?Hq|!&S#X&t+*)}~y0S4P0c=VsPyvM%+}zZU3joKl zb$cMdiMa-SpO%7oT({#y+|sT~*h>T~-2u5$wH#(fQL?Gk053#rvVFF>V7OccN#~A2 z*5alvm!|zufCm@nfyzc!y-8e)WD+XV(nasraJA#CkYa+no?c3vO_=N!H@>U8bj}t8 zCfNwDBKb$5yV|4bRp%u3qk(I;rb2on4hrOF)wmBMZaT5uCrdlGS+Q^Mb3OzQ!&1{Z zOFDEgoQH){K*Kn_a@bVd-tH$s1s6#H?t+tdPWaK|^yve1(wf;~*Rc;=R>>KHGOoi2 z?@87fZ#)C-IPn-$eh)}dA$s3jTQp=T67Vw2i2b%#eY2py-~;5@xr>=Ie-sUz+{M7E z5uVNmhI|f&-%3j-B=;K{28)OJkEpl0&I*keYB?dSxXPE=s6R(+)xf*6^iqk*iRXG{ zsP$A(?b(%VKr#lQ-?^0fvM=oAQO0}#HrqZ`;1w7>=9`pl(;)afQvG?w%#0q`#==g& zrXi6BJt@|m{zI|u@TU6-Ca^CvWSzP9q|oPeX{9AK`WO9|U<{CgFuUjE+Rps*5X4Go z0B>iUP2IBF;)Py3)9Ub-T+9 z3~vAK8VRJ-L^w~TEKN`lu4-w9we?JCLcDDgx75ji>G0WF4hoV8yy?-1o`-2?ZpoKY zv*9N{=T4;;TDBj}Hl?f7-N+E@S0a#=rUr4j>~(=S0RTAL19((}2O0)3Vaav#wmZ8` zp!~sT6=0$k#@SLid79$&>AzX*Y9RB0jb#Q|vSLeG84p?3fneb3>lO=U4pn29tN-u z<#;>3oHh@;b1$WVz?0mkFd*v5_{{eD%`=arhhy+1Y>y)kh>fFE` zT{NubcjWgOWrPj;1U&_#z^NFK|P7i&wPV*O7lIal8fLhO_-emyo ziNH1fMyEq68GY?ceHydLXY#o^yF9vqtM$F6Zf5)K*X06H^|x^wOe1)P1*Sk}rB;pD zc@Hni(WH#S2ax-MN7vlmNB)y_DgJxbO$;$jyxRJe^#DK=mD!=pL0?@~P+8sg#C%8y z^<-fZ*KyQo++JC#)CTh296mjpa6c)LUfwl|LCp$ayLi1F>y2tm*%0`b2Q|B(D#YHwcBo7!_A*@s?^$fxIh zM*X`nz5Y>fDv9cAw%CJ1bN_l|4Fmu`xe)uO_^vXW#IpcM5n;7o?Du=U_brw;m4M9u z32zmr_K}ResL937;x-h-cC>>C)Zu|ZuVg4|0AGS%&F$LXVfLv9N`8|`4KEVgn86%Q zs1nE2(GGVjm8>q8s$%s4VP_>d-X0jr@C>xZj|4{nwG1 zKfy>(edU^ZhKCmd+d@kP^KH zuI>PA%B6jti)df~wj<@W#xou*&o;tNhSOdBQ|p;yLd8U>$6wz_ZqwNBx?LTRm%nT4 zpOe*AEYRuvrh|TdlX+ruxX}d`Dnwcc(|F5~XZ&7@tRz9|0$8}YWsMn)r32V_1^#zK z&s*`7JZ%%LO_%p*c8*T~Jb@&@!WBo^68SEv4=GL*>)B*j+=$QJ=Yw_V{sy>rA2%Hy zM)**zQ`qr7XBsJQ_cYQZQQk1P-^RrNq%ggR{`>|4X(p=Ge^+G*PtNE5lI|1g^`ErX zQ=CH3yny-r(?|`73y(C9T5CB#xmzi%yHoD~+86!dlobQK|LhWhU7FhURg8j~OKYa( z%X7f+vYzS+yObl@c)^|uoR=JHER1;hM*4emtcTjOI&n2 
zIsKoDH27(eifA(Us#m=Cnfe$|sAd;Q6A|hdz0774%XjgWJ!XGonC!j7S*FzRVitY; zmO2bCC-mwau(LUJ4UlU(ib<9a#$`0sAyL_G-b@4fB~l~lN5mKk_%s|vP1)Zkei4ne z)dBKT<(5LZT3;uQZu<0s9Lrn`$9FYyW2fv}w*x15#oNCM(Yu^9a_q?kKOe~zX#dRo zvvt?==kMB_A9dk zK4y1cu#EKGXVTyp{21|6*LRG$c@p9H^uJVs(&JkiOGWCVztO#-&`*wFBhV?hd(sd5 z1tKi`AK#?tiwb~B?fCf>8uK2{WlSefL?b%fVl=nf%_M+BVm&NCNVA+w9YS>m=@B@1{YxN~pCI*A1oCH_rVJ$_ zbGB-Nt^FpnM|NpJl+w+{O*2A2Y5}m&1Rky>uFH7INJ6;TAC7&O+wW3MPEo;BV9ZNX zw6=3s^n=gK$eUxonT8YqrNr7@>UErkp#6+!b>1h9d-A%#Rk_{b?n*WchWJdINpn6j z(L=(04cS}IcOgrC)Zs8yyBQh;Wjmn3vHtTtziLBsN51Vh4Pvf@Q~`|3iy*74%E>A?F!G^{ma^-Fhr8H=oto|0XFK;o06dms|Vk$ zjoL*O1{TO@Qv;ZRfbC1ZqxS4eM|pELU)bUd+Mo)3n{29p%#Seq-F2G_d*OJmX#6TJ zgo^j`C3)`u)D%nity-;IH)N;DD~mtTze)j#_W(4wH_MrSX5@WJ+tQ%)`Z|J;LzY&p zd0E~UHVl|$9PS{6x2vz#{)i6O@|C*jU}JLDJ@o^-TxEbwXdCa+WT75NtHqo@tJQeK z?BTjSK)F;TEcT32shAThk39IZp#a;+HCFI*l>+9(LiqUZw*YTbtItjU$trSNjYn2G zk-Yx)?xEw+-i6#M*-$`+%I)~J^(q(>SL)A*_zwFg^9JAf-KCcaAm(`tA3+mH&J4(i zMmB};0Z(cb+F4uISOY(VXE6d+W^7r1PxH0g>`U>6{93@FcWfQZZMHVNa^_2NygX2d zY{bwgy)$1bOk2TAvNdXKU^rm`34Cq1Vh(B*4>5=)_R^IxFn3$1v7=-*DI@PgOVUqQ zq++VChBPv)MOtSBWSnZNAj4S!?L%(LfYtG7@vQ`XvvLyHuo zb?m>={r}t^kISiuUXP!hMGDtFsqs^0JK)z2JPusLhXWg|IZ|P` zr(ow$afz$1Jz^gd%E5eR9*W@R08EI^_jz$9t+PEWPN~dNgs1^pIYlmj^_UH4epARH zs)EgQk6OIh=c8&rXiG60a$>9(a{24IYNc`W%S?@ihAuQ8+k$t~b1>&E0) z=pspGZP!xmb;w0c2lmAcT6sqYkxSb20ctO8B9%`7PpnjhGR}0S1%3_M(NGspA`mTZ zKP>ZpW{YY|LU=Hgg)Whaf4t_3_1Jv;38wQJ$5TM7`UQVUIH#D?Ro0?9%Z9en)&Nj$ zNA`~{kxJMonD{^%Pv}ab1$a=P9g`GLdnYi$(VZFg67bFh`BBSTE#sP1@ck15AZY;Y9%fm%9C9Yk zL-BNQ!F+>rP1W(6t(MJ>qx0pTI*886wiHFS0cj%+Q-R!@?e~DjOjXM%>qRy*r^+6z zs^4vv!+KU`uon7+K`!##o>5X}J>m*npoC~a90`h9&YJcWuW{ZK!&EB8loB7;Uu|Ja zzK!gaeu++OXH8aQf=npUm*}0AU;InnA1n(P(a~aA??NTET;LM^kcx1p6b= zEV1)cl#ADoWDJVI__Anpy_kyL;1l5PnUDLRgGt zDdf*6qSC7lE?(pCJYUZ3YyRG$4^aEd&zf**nq5F|Bn%6tb&2|Lb|*ej_GozCM^6@u zpD~mXl8|@{)a&kP0J|sUFYssLzrQb79KXf%lUn`@$d%u|qlCA(+jmNQoO;E5nNpQn z9k?!$g&zXeyi=*PyEnI-DU9K95_wfhrSO(cjaKdR2-zwJx4hJEM?-X{$h~8&Q&Yf7 zf3>vY{npH5?YyMcn=HE{d_jFi47VxKxt-C-tK$-Ih0Mno^2<$K0iGlCYhEY0&twW4 zes~+ys(X#t1{LlP%wEX*CVDC^y(1bqD#*hWljD>! 
z6;UNLCT8 zppr_Mm%p_{`yrP%fcGPi4U}PMg2(*G62w( z)^?FS21|iU_{hU%wjkkGx~=NJacIt$vkdwhiDFuvueEisS;-BQhklBgJ=#&w8|5v>BtPR zN3RTY+9dFfv2VSfaYVeO2(E4OOAXtwQ(b%k>Uc$T>X&Qkz-ovD_}uiQ5_M>o3$3XT)+F~zVuMUv85 zO=cJQ29u+WlHf{xh+p*CpL)H$A`b$0FKLE>vwqf)8C?%>Xa)N{z@IvVGR0omY7Us% zU#8xuq?OlMROYdgs-}3oK_ZVduArkJZ@}i~KJ4x(h zaQu<{8YLg?ByqB>!jGf0vCU@tKSB!W;6$Loc@5$_9j;rB3Lr_eyL|)a;QPEK-7wM& zGlwl#fE-{=k2E0g`s>v0(*lQLRE$p77XlT>jW@?eogdM>~>HA2-?1<5N!4DPgI@j;*^?pm#R&i)8HhLx8>o955 z#yag!%}}16IR$RPD}`>+{|mkko&Ex3pxawqDc?ufwg7*4GgS4hNCh zA{HC!cj7tK1xj~nKRY|r-Z5Ex!~Ky*;MA7S)Xd}Vm*2q3S37k3Cim%pBIEY7ON@B4 zxTV@_^(J-O>?9=lq1uSJT&Ilp$*_6BFsBBq@{sVcq*Vdpxu*(`2@-=~Z=>F@R_d|G zNg%NNy9(WFdEO%35*MTqWQJKj>)lt*i1ATRtk9i zWD_fuSP`@vZ+bXC z85ExUHa{!+V-P1q*jTX<08XAR6Bz+~RM+=)Fx&J&sZgycM>R;Qd*R8^KibK^vhA@z zBfmZF$PReN5201f`mM2 z%%{QIZxR3t%%f6-zh$`nC!F`t!LtjG&b>yqs_(lpWug1i(PYndVw{;sQ&RF^pG)0zI5T#gDC} z^j{kcOMNO(HqZA<+{$Q_7U569@1DAvhV<40fIBE5>L&@Sf=6w;iYz?Ya zK=$;Vp~t;Zhosts-yLRIrc>}m9Ica2W0Va=>76s+rRt@#3%6EF0_NR)-15Sh%TPI8cp7j>?F!?7S|%|8owvi;YDk7bB4;a8O(^Q>r!mRtG~$OXFkX zTW34xox{vJ-ZMdM3wyNEdtaFv*=WV`hQcI(G?)f_a zIn`47wKMqe;QU2ZTkdayE77-t{WL)75Ti*xPDM~COFF}1$zaI9zRj!X03D?L#+^8V zmNjSXhcpm^#x0@Urch=dzWZ7*>udV73r$s&B%*Gbr+enFx8G?TfJQQ3OStfh%k6pY zL5ef7{21YTNqMa91YOU}eJXu>9o6=67hqN;;|?n;7MF$8sB5p$Vlb(M{!tmPOEMEpu4JZS1jO?|CzEUz zcvZ{c{8s8r6yQ`>-}CN3euJJ@U4HU#3R1?K@{%$tJdyY#{_OYNQ{x8LTE#EnYvT7; zqDgKWIEYH2Fb&Mp+Pkr**4q(=47MoS>+?}-Pu!`^o5XoF82DcVI8a@R-G)}s-dH*^ zCPa^Ikp^AIJWxAMmNZ^0W#Q#)*}4{p!p4f%XvuD3ffIPz(-PL8=PZJyaZQ;&svK0I zST%Brm)n^zt_lFDl!gnrx!9(JHksO@`t}K)H*>-G>5ZDe!_oA(2a3ei{Wp-8TgQo!=$m3VNx4J z6lbR0$`UxPrjo;CcC6n;H>qq;;K=e$>ZcSKsq*n;5ac!8KH2iS04F%m;h?8 z@#Hv~DxgMv*&v1O*SuKkAa3XlGW{c&X!Ir}66l$Dsr+h_z5O?A-UDG1RqH%R`GO=p zQ8L$@2-pP1qQ$@89P(6^PN02Ln9uB|Q*}^s0!-loq|2yw=2~;ks&IQJYle4-BR`xCC*xsr;VF7p1ngwjH}T`;wtzZIHNLg zvm-Yh`I6fC`C%$~Zl6IAo&!9rf=;7EO-flrlg@Ygy`E6{l44Fcm{xU(JQ^Te*;^k4 zG-}Djb~Q5yo&lp6r1}obZb^?BEsMU>ndWEbe4zk6L4LtyZ@a+1^q|b3E*Idd%M!W& z03S&4VLcw2E}c@{Yv~h;d~PwGcGoI)+rHdB20UY^p6toC{%Id0{I~*~Bf^qvck({e zA=7{!51*+w8LCO*MSf4f8<2kSJ}iNfYeaNye$a-#bQ^QFDS*O8_=rKDi^~7&>>w6) zz(y7AG-3AHhEaOa3CChq#xPc5sQ;8>vxuW^NW=SLl^m?d$|LS~>jKXfdYNHBbfiaUHNEUVA-!5@TWRzIrar zYSc3ev53Iu)~8fBciHy2BUS2E4>(x$eji!&6b(Je|2-Pa(W6;81bwsGhs-SoK?SqF zj{rRY?x$+`bQXyrJs*9siiXH{&XKY5bU1IvH1yp3{iypyq4Xt=WODF+4J;T-bjrwXo{oqEAyEC+t+5%UppC+so7yNIx_gPcUBNt@j@ zdwqZw5f2h^?R_xx`jz5zmi}^IA+y9D#tA6&{BwgSzuM5;b~! zAe@+(&_!gJ?DOXY2hLEBhx5WqSY8q*Ro=eksO3sZ0#*9xE$6<}nF-TdZ?#qv~ENMu}nCc)dIt6@hk=LKE;3b0)}Y zwl&1_;W{B}wCe~f*l57%U??a&BlNaA!&hZAKiiJqmax|&9=E{Z*$1!^uwPQ?yxJ&( zjtV@A@HjV2b261hlt;(_7obydY#z7n{RVyJ#$MM zJ~}WSsxG-nKen1Bal~KfBMeEyVa1OA5mPON-TSe+Amx|qFHXwy&+G3_?>2=UDe&Jm zpWS)5cxh{w2!j%;Kkee!=9|oePdkEzjXDduehi<2**>Z9CDww?KeDCbZD?&f2+tij z>FF38zktz9A(n^!NV?8VCz*+$5G1YrsWMotoVhAAAKcEMN%_7HjbKh8iCqB3hGedk z;?zh7?$xq5ZZxdDo}y#C*Hpe%(fbg72wsAbWe@Adzv%TfZxicBm0}auUK!F7z~@7$ zniL+<^*+nhaYuO>F6Nx8n6eMTo(*e)sw2T#sV(6@Ox}4?D&fMR+%7@&7#qto^EgPl zf^N3I2}kr@A;**_kL$Tesd&6D(%^+VUsjJ7LSdU#(p%c6a`qZ3T&}egA0VxkHBXps zxWUbBtmt}nL*0BpWOOX&=Dr8taQV<~sg8W)F~$0P3xW=Z0%~sL+F+K&0#(qfUUNH! 
zxq9jxLs4$S!q%2MQEKxQ6ggu9Xhnt*>jgbTx z7sKuhFMs&mmrJ~KpGiEr+iY#_+fn(m(lk>ICVKVum!YlZJ7%I)cgr1mi+KJMNM&=h z6!phcubu1)04Ud+IKFTT3H5#ODG&F`@Qxa&#bDn)M4_b00DGy&q0np;K6o1CY=!gQ ze!W=vvl1&P4V6yqWI(o2c!3@4XQ@K%a;-?6tGKuMSGM~?2iay(5TzfBDD zpuQFbDjpNvw<(JRF(=vS0yQXcSdeotxQ0i8Q*(Dk&ht1Yc06~!yVYxy*5#&Pra0dH z%hgJNc3dUYCiC;e`t}I`+^2|+RO$?vyA~G*pYFGKcI)Ew*rTq%RfQ1 zd9a4I_a{}0Fa8i$mS~bg!kKlb*CllC(zZ&?GL6MrKi^{}(U75=HAe=8rxh3^IuhI+ zm4`ZH7-{9XpmE=giQ2HK;tvy=uh)a2`zvL-6)CY56Ydj1RCAYS4~QgMlvo;rxwCaY z4t~uMZx1=472~O$7fWA!Lmx^J{rR;6H3org<0sGhl zh0#RN`4(yO-1P!0i@z`*lUy=R5r-f1HWJ}WqVLomRx`!4kL3fzuSOu8Uh=Ef^*`hc z6zjQ8RmYr9_Cd{p8zt`26xN&D=kWlI zOjv zco>Oov@*qLIRbDrj6X$%%}lj~M|Q4R3qsc0)eh2Xxib5T)=5X!S_Re|-jNHHTx+va zUa?wt;D~{OnQw-Wln0>Ov*)~-dz4JY9pcoXyp0L!u$ZC&U6 z%G$NLA@%(0zrE*s9neoQzP#pZp$G8eF>{;Bcqqxl<@Tm%znS#+RIi7%Kf<3w2xf#p z^!{(AjWWK5?Wpa|Z#Eq?hS)$a;-S537PUM!clhK6qlHwN`T>=9Eti6k30q7fg`vnE z!^|tu#WDU3U?nuaLm|J@k=k2$biUuunjXGCpK~m^apsKiJSnkLQ%xs4q~m>xt_G$5 zUv#~^k<37P^Q&)cc831TTYU(N-#oIB1Th{Nb~6qp>2pj{pZD?^ifwM1>6*@w=9LFo z#F^YWFDY3S7}bUEhir+lsHZ%WdL=`|CWbipnHtw=eWo#ap8k-U{Kctx%OA~kU+xX* z>5(^(Y4k7n@Q5*cH||SD#X1Px{V>UI@J}BhDxplxyr_db#XZ(cMcK@`VYIU-p*QcD zSC89k3BEYX#N-WqmVHxTvK0LcITH8?DUm7$!uW>JJNJ|~l60MN3>jLO5B6=#c-UO1 zu>;8BNsLff(l-$svXW68hc)So6muGzY-Gn<9G3~6{`&p1%*(J@4IW&Dvz(8t(5)BN z`%o+ezGY+=yUxu@vt7aw0e*6!7f+u^)aNMO?Pt{3O`g=)Ku&Um^CDLiLo)PAyjJ&< zNzDB|Kgx<@Wo9N)7f8Bd`}EH}L&jEm;eRvi@q4+c*m;@BDNTBgfE{NbAqp>za6d)D zs9!j0HcezW&ke2S7Z&1VTJj7-lvfD_ZS3yUhWesx+rn1-(``ZVc^3Z+|MKiJRRI7y zjRId{e7U&i^Ehb8vjN*Cn&<&N*4}jG9wx4cn1Anz>C>Z)`p-uzglxzk`E;ZyxN62; zU{o8@5h8MsCW!<>bATEdUjI(JcSHHWb8Qmvfi8#Z zU{c#B9I0OJPY*al5y`M+vV>5BtG~N(wr0eRKHd8v$8qS)5ci~`L1q2V&j=^5Ho$1q z{puiNkxu+~-95+bi+kq3YUqm{tS5jSOoQFj;3I%Zt@y&qmM!#`T2((1#)t&j4rmOeEbsoF@xVn0oRvt-i`rpU$ zBeur$BpdESp)Alhc$!Pb`1VBoX>k#KBAlU|jX-Svy%&-cim|P3 zAYGV#qqes`I#I5-wl~CKZNugY@$Ni8AFGNUbAdeT{EUKaTEuZ1?>zD4e zHc(+)COx}{F0a2)iT{FG@O}@{%YRG3jQnGkXoOaz^%y3v9a??L9eRjl1 zY%}a1f%A2henGBiimY0!sQwX+@IB(Ya_>JLy4CAA+wN;bC+8s84}pN^|1Min`lkTR z{?7o-fFFpZCBvbM$FUaiwKaCX4xWB`-{bs&GSB}T*uNDrsKKX(IO_w0$uc1_Owpnk z+?p3KZ^3!zI;iQao=b(r@4LHGrYELjUr0<|f0xuG{N}sulVl;Kp}cE3zDE^T;D8z_ z=mLjnS<6MW`RKjIOc{)NSAP-mh-`uK^zV_r_eq)_jUFK?i(h;2D%Q)JcO3QZOINq( zek@Mxe(Le?L%eQ6xlhjKke>#g;-RVS|8-e>PnT8vAD8t{M*3fWBFH^Gv9+K7{fYgP z9sZY}=lC7$|HT3TvKRZ)H%CN>R15})u=pE;Kw}brQ{n&m#pH@(@YL9df&wVrj3g-C z*ul^xA(Fqz`hWRi;8#MzSQ(1%?ymknxzPW6asShU{J&4`|9^@8-(~qftd##($>smX z-2ZSV|A%ZD8PMwPY&Fnl0gpZk=KaW@O#M}U z2;OS8kdlzInbE-OUapwM9w-R+fCHyoQefcAmRLpuwwWse0%UAjpJ zWrO`ffEw;3L`doGBco>ekBgGm~*ytilnlm*$C8HyRO>%;tVER zMw!(8=VnOn-t(>=b*AKLRO*Il`md%&6~M7Y@aM>+(x^}htYsfTF9jmC&l&$8w%)=o z%C_qoRzW}-1wk5%knToWV5GZ42I=mW5=26}ySp2tWd?>Gx|^Y4K>9touKW4!=X-ze zUobQ09LGM_UVE*z?fwP6mt;v!Gci;OQAH1M^KruO5t<9R95iGaiuZxHqExsCWTv2i z^;xT+=dV<+v1jtDEbXN~0hSj&QxNh`yF>i|YgH*XDT9m?FoKLRdu?p~h#Sm{9moHK znr5_JxE!4QU}=v%QQPIaKC?1-5@X;d3yF0xQwOrB?_)Tisxef)H}vs=mEH4HCJ!48 zmQmU$XuPMt)0$3vb*+!NPrAvAdggR6BHz3)O~3IH;N{eSM9egYaDlg#LJ$AH)0puP z?2nV@`lN1!Xcn)=x4EsG?3dUYtZNRzVnv3{Lx#{1^5|ilq)qczdpZ8 z-Jx{pW9JXkjKMigi#E^qipT4y5oe`J7u)0{+_3!;TTRPRwEx1mbDsWzUK*v=ZXno95zgbF+Xtj*2D5k*u?89MpwGG)~z08wUDi=b=}_`ludaChj z9tAZ?D7thO@5K#ph+TsZJL(oga}@)P-!{5cjpte;KA^rnHtCjz&e?V#=B1U8gu6A6 zf~{!aN7CO4G9R3ZXPM z_%3FD()weTsQNPaB^R2&_3eNETEZ~bbDox?1Ch`x(29~_nCsgI^n3<%tN-B({{3C2 zM=z6dqfpOgmI=ee#jr4>vUnfKf6u5a9|_hV7NEhBp>uiO7UTA}bk%+2;Y^mda)c!- z^(0#*9)^QBHfp^7ris}L3kIbL-~b2Gi>B_f&a@hcRG%HlQcPu0$g>-Z9^$FBJBXmi z5zkZ%T+N+jkVH}dkNEus2S}LUQ?_u7!bN}Wy2!(~8~*-ZY`>lrDP~V897_?KJ2OuE zXQv>|*Z}yb13Z5*dmC)FwXd5K$MSCp!zc*dw^nMxwxYkNpqVwF!00Tb2rR3O+Bw*V 
zVuw#wTrqnuj|g23I3?&kPl6_Nbkw1go_j{LA!7FJf^l6k(%efUT}=Y{oY?+i1Yt22 zC4`f%zy(O0A7;Ijpa!0u@B-ENuls*mh&TW1QdAx$nne6!&y$@7D*(;7nbwyipT$ah z$7QusH9FxESxI_(FdN@qrMT&MK2!ZGL`D1E`y+k!+s3VQPuaBD$&E7=g_nOd0A3NQ zQ5p~U-8w)82O~?aOWv+?v91HpdpAbJapN_ntfK%C7Bo`ij?l6>o+as>G=+O0g&DUP z+nA%&PO*G4Zq=A}NuQR(i3}}7LET6993rcAKl>hdJ#FXzi7~ER^0}5>;OENidDXif zfOr4ql$aUF5x<&l3egpm!R);kJ|`(sttusVJX3;$qP*AZa4MiG^)h3ioV<$Ny(-jssaSK>BJEY%+!%6tcv;ds^CGR ziD^Ku<;tpQJ9s+n$HB*8qY#SM09FXyxb5U%P&)zf%^Ld&o?Ibx_FL13TDF7UmYX?` z$OqIq|bX>v_)m7v_YOH2_?{7=lD~f@YN}~>mrjS!6 z==$I6-ikjVRJXLM+BNh69FnC!l7p0`wjgZ(REqsx&heU`r>+we!?nA*!1D6kwzHmM zwF(F(;t<%F1ifUphi^p)uG$rS^}dGDxiotnZ)mW)E%)@wux|w~ zNWpxV(|Mtxx?@=xKWw%0FgHs1?~5LWxH`!7jR$hhBdYWUsKm@%B^jnuOzNe&Zypq^ z05)3SVUC^4lT9^Bn0o*LgDD-n@lM;s^UhOxPlt&E9s@fDB`yJB)|b0)@3NrkLd>x& zY7B7=vn0Ky2|&MpXgj&9VWRcEpV1S>sPg$i+_`mPMkunNzq-+K*q#8;ta8AuVRCw?#@_ASt%()(t|p zyOZem2M>jk+vsMk0U7Jv= z_AH|{sYa_S?(7~ajp;&b?C)7x)NC$`Pf zI8}>uMMpxSEi>fH4}Ep5iMIOg&U=n=dybitcObQR=|qsYXl1x7GAiKy9Wc9&>s7!1 zReUj&eBMeZaH~q2ET>&thX{iCKf=56tHjo`M|fdNh*27}9B>u1zqGR`6?wykG^mrgLE${6o!Rf&k%SZ7BW8Xl&#iAH)v2V*Z}UZWx^3+*O8TPB zeiaVW;o8H+)ly}wb0*eKtGr_f6ZQ#M0HP1_U!sqO#$XK#7UOeqhW}kZzne4(z!*XE z@LS3j0^>VMNjj+CuWU;sM+M?eha3s zU!?Gv-$a>?ip!EU4^di;-s!GK(;mb!HLU6?>%OfmcSe%9P_N|_>)6y`8zY(9z4V3E z0rbauGQ!60yxZeQ@UZ=q+uT^lU_M}n+U}yoH%bs0E`S)r4v-J04Hj7^mzr)lXj+=d zxam@H*`kEUt8-vZM1l*j_%SUf%-=q&L&ALR5wxcx=f|Mdo}XDIcyiqN1UFyYhg|rC zUI0k&(7%>p<=r2IoY%DH>6kU0@0KW^ppLoT(3`gYdVZ-1fXVjjy*F(hi?6pbTR zJGBF*{jOi0*_|hgbli8{8i*Fp^QeQ!^ST{*UZi`Q_N)b@uHjQMq&wHwP_6sBzfpU+ zYf1tey|nRMW8!NtavUx^v)kp%jOJOb5_K`lb61|Y*aG?tVBI}anbU!(lxamq_}|v~ z-=dL_k$x$U+jVj%Qj3p3D_wr7E{0p9E-Iib0+Xe|c`jw_Vi8@a~lYO9PMba6e!3O|kPbiLHI4i5+DaXtGnaH?$1ndIz2S^osD; zhg069UDHPpo>&nhe%JR3Ub8Q^>^gikFH5w`(4qyBA{)?#=I06u7Lv?-?g?1t`!l>4 zi1i%eyl##Qd?}9pRwT4|XeC;D%Xbj=L@&14G@!v!GpXut3sXQT!qNv+f1?-;1fiEg zxLLdj4r2FG)Jg$Vl<&n$MOZ>?7fL>CK}E{m5icwH+Q4rZv@T}*YL~b74gd~&`7#OD z9b+jKTHeTSx16Z2kuGG`UVC_^;boHU#>un@tVWY7U~b6Q#iSVISr}2$)}006o)n@` zQyjKk_}A9#w(GAZr5^^gM=v{EGwJ1iMto4-wP0YQ(`nFJ3OT1}(F(W~Rt^zrDA0W3 znBmzxhcZWvpMr}_t)Fr_{$dl1oa2F}$H=qT;{EoziGEx^Vh!Jj_Q7y;(-#d;A3g8- z{c!`LR{Vn3&tko@Q@OzfM}Rk7G6#zV*pP@oV;Z7J{{|E=|)k(baz3m`3T+-?5tQDC4Ibgn<1|8wkwWHs$% z47B4gP9cA#m_ceYIa;hyW`)OT=Tj+&uUWqD79;dsRD`nkFrkkZs?v&8*r8F621-O) zYV3UeAV#h+QT}N zkb13)okFbjle_*Ib)}t7?s8Y^DOV*06-^P;_7;zME94ZHJmOlE+(7fzfug*l_v+jw z?xB+(4JHWdyoRT=xWOk?=Ob~b(%7SZH+_7P{x)#6ahAbD2!zzi*8S+7?Wm!7*;djGUkyoH{C~^uDecwUY~8P(3w+_ z#cKXz8W;=p0_kcd#V>;Drg)sfX%t>#qxn%$tMO|Q)9UT9w8a)Kj4gu&RZ#UTZm!WK zl-%z3odwTY{jsD*F6Ub$t|P{!kjKIYarC7vC;Usv?=z`ovrcE%Gs5YF0kdX2^`zrl zke_vL3d>TG991q2(?7X?p&)GON-b>Y z9P92(QuL|){tlg9FDw5xVt#ht!}DXv)sW?nldLW+r1Bzth22$^x`w`mtNu>^x0}Q& zt%kcGdHgnA(Wbx)qMw)Sc?-einAktCG_+8(7Ux?T-gR&{Z9~E-dMiP!9)VC-XgtsQ zyi_?!X$Ntlky>l4z_`91m5va3ItnAXAh**;S&y?L2-W7Tdk-o3_|c8y1buWpo(TV# z%EZK*0i+fUb9kSR`6`FQYG^+~>G~?;m4ImT(#+JwMWslOT77NPu7f{GWS=_8#qR7> zuKB4ty{IEofXeIB>Csco_V`hIy9ryp3iRbhpqYXQ%ID{schF{#Ca@>Ub+Bwm)8asw z)b#H!I_~3QeyAymn9UorLLOOu#06gl+818e)}Rd-^~EHNLl<}vj=S1!{pE*FKWC8( z*`;chWv>;>#q7&BP|7G3rk*Zx?c<1Y#y`e(JBkiR$-99~ zHN74b7Y~suz};upCw5l&-sEF^p>2upzZgNUQ{}Sk_iHQtRrHCE_;$!LUI_mNSw%(d ztf~6akNU0hNA)5!E92R)L{mbU z|6dZp(p1>few+5T1YLM5U=hfJb-px}Q_PSjKlCNHCK4KgQ1uJo_qP;|y>R)_`4}Jx znKwM2pU;ED*w8*0`bm?7c%Y-vpy5PezEH?#wa4t0saI3%bE)mQlu^%k3c(Z!;fKs@ z>3vBUNW^A$n2!40PSa{~^^vW^?KT48oUD7T)>a;EnJ&{T{6boY6LysXQ}pFlLL4{M z+nza%WXJ-YCIg-$Fwv3={CA?&vuC+b9K=MuGitnk&Z92we&~HJPu3Z_R6J##$X-22 z`1}_2dx4aQMM}2Hlo-lDs$SUAV4oOC7RgrD8ly@#*~&6&_Y>oYgmIDNLh{a)H&lH> zyz5o9Gn5tOZd!(zl1(}`9XfpqiMcL94HactvFy<2Y2Hepg08{gTKcsy1|K({4owcJ 
zzcaK|?Ig6apRpRu?&t!;eBqKP=-7%uhE&Vu>XD6S4QWXmfsww}Lsz&Lr^LjtjaQAL zksxdQ8h)#9zP?MaYwQ23S-`Z0mMRn=T2RHkn5u^jL+lBW=}lFV;{)N z7zQ|%R0nCF6Ppr65~9)kML<$w6nI+ZeA+$0DIn5~uiCeCumr^o57BGtE0!h_4nmk{O_HTIYxHIZs zy1m$F`uIc&Ok28M&Ek!HubH!~I=a`&_;$L{U7=-o@O9SKdQ#DR@$?tyhb)j4vqp~= znrmYn&%?+eR?xnon_u*vch+{cb_#v)VjI(-H%$&to7*Ndt0Y5~=8q)LX_08G2GY2D zvw;?(k;5AY&}cI&WP9wYd(zk@{S9CnAb$PfY9!f{47&LseAqfItn6)msVKTZ4WFS2 z&fm~~1+IxFNF?cTmGQX)5U;G^D^?-gx7h}1VZ_;7e&9tgQfvNmrP7pG`J)PqNB)%$ zFmHn{%=DF_0>DJfXI7C41HeJCz)o0n$h;wBgfyD0Q^>&kPMXHDk3@eratQW1>4GcL z=MgXC`^i0w!eJf#Yn&Un!%;AVDd$e>JtD!9OJLhCrF*{>hapzUm^sKyf-A*9 zSOd`Z&?mP_-|j?N<%Ai%qhE~vCS+WUlDkBEtGyU?PjeEe!%=s%fcXvtzLpstNO zvKzPKI{!j!BW1wYgkh!pN`Mlahto<q$ zu5KIIfxw0}6835P#D}B~L&R}^#grn(CQeDqzW(SU-8^T9hkFv!^Zw2rG-!5p7^<~e zHiwaXC|zSYU{!4QRaeb#JS=?a%mGu&mP1UO$Z3#u?VU2|vBy&TwQB^!q!ncMYh{|5 zi^VqNyNzUzKGwmAvd4^>gRq*Y4|5$4Cz~3JbA74?9@v*kaI|5P>FM=N13f|CKF%T1 zY_o57r9yz1@n*%t>)*XDJ}ktqhaPX2Fk0cm5@=z|72wFwsPQIc+RyjGdL~)moltA> zp?4PHV&WbOwNm+O_@-e^nk)8ZeEhFSlN3mQ$0^4P*~rby8!``rpO!I##MS?diqXz7 z%LC2)r(sQNm4IGm!-B4T^SIGY?JFRdkuq;E$uEGx6=R>YPWpWw0?DvkdEF_eFZ#|+ ztf+~WjGl=Kee!X>{oHl=1CkT6x1dEsTsq|U-#2y{V5|9hB zMz@3_;7oD$kB8+^KvlW+&#H1HxW=|W+3I70xu(>L=iBe*Y(t_A*up6j%!dVsXZAH_H?mxAIfO&dFD4zmvx4V zWgCj{lC<`HiUxPq=R5Am`nK*xfzY>Q@|z`d2&Xk$FsaYi-NO6F{{$uxMwdxi>sq~J1=3p1(!JS}!4`1)-h4*1;FGx}{mrB92Tb%d z6^ZU%R)9@0KZ&j2@LC(wIyJelaANxIIP|YPxOZt-5&t{o-_KRQQ{;Z}2>hvHg`qh2 zKrNmoW$k{jZtJfv_;sYuzcREQ_MK+1&jk?6driC=KlMdPd9Bie%$Vy04$Fs`Ej{-9 z`ETjF3cB;=;+jDwZ_Lok_aup;X`?`G-pVT+*^YV)R#8CmL^JM58)zbRWxydZb|P|7 zTPLR4vm-xjUP&lTqo`n6%Ap@}!F8g;L@2arw2Q>W84u0J>)3Ya`(Qn>Z8>i5Sv2v( zME*ibK1aK!Ab-wCUgZ)krQHosrb|)}5MhHx-eU=^WZ*p7gK)ic=`5-Cf&*>y0F^YN zY5#uuj1&Z5#_hPOjV0r`?7XPsQqGHT(@|S-tV%Q5bMV@``iCDmZv?)jo%#utCPG$;Y4i zv+j{|Zv{60D?W^VcvZmQyWm4Ap5Tl5ON}61tI|+}=whx|nLKwwDvhI1Pq7z1%Wzb< zOjXE3QZ8;O?g zYjXQe$o6KS=_)cIp97Ni)657F9L6F$FR9ky(U?#Vq!ES;R51WSRHL5 zbYU>ci$yQzc@j)GKx!^ufRJ$)A(jL^_u5^HryJ1I4S@!7kp@+~sXn6kg7=M=yE?3U zA$n=fSwlf5!HZj4+OPwn?rBme_3|=OEtaQ4Vx z*{}6*m$|5nGkwOIS;B1ZiJr!5Y-J{onN>zL+oLfH zg`!3fv+GCFNtBJE_6$jB&3c|@;>=jsrD!gEfmf@7`}l`mpxH4%MyPvwDImJ z)|VOzxXo&34o1hseiYWeZKPXT+9cs-wx^YKtq|0|vO^Kga<{U3rUf;bWlY|Nw z_KyX7x|hs2?98l+?#=`sSLKV@z-z*R7?md!@#|HNa`tE6pO|Z{#DBDM2cs+Pg%6XJ zHPyhc^|*Pg=hb#f12kIi8FCoFjf{4;-=t z(w*=F$V!tnGC*H)$ zPlbzS7`-ammmpg8&Iar6H8)1%Yb>z||IpN~-!)a`pPJg&`ohRy($u3Sz;HLit2>JM zSeL$=+G;_YL}Q%Ni)0ix8DFR ziA5?DVXp{c|HzS$T)dZ>WyiMCiHfPFrcnz`@Nh8HNZsK#nC1S402 zwst}bl9lWO>UB)Eh)x{C2o-7klXBR2OP=?&-RWkl52(y@e0FVEl=Ea;uHR~W#${{d z-pPgl>E5QgZGOn9!q3kJJdpidhSn{?!+pNdJZ=XhD+rtnwM_7UcCzsJ>>u9f?musI zv>pxlJhkb=sA%?s^(kST?T&d|ie^>~3}21#xxTvlXF2 zPAsSIjRN{#$^<54ng?nh5oRt{9@iH%?)P#^=7wDbaa6NbjTqju?J^)c1 zL#`L~j@Wc}&f7G+hZ=V}{70}+Xw+KReCSEH>>Wavn1lhjeX+iV96xCbC>-aYzAwLwlDujeck-;@)FLkTC1ocR}YY!L1jT;Q*j$D#;ZKR7S-; zk)y5w0`m$e(yHC{X0330n91$*ah~BTB{MmEF=q0i=ury?Bm(& zJI7DqvRA5E06IWO7kaRA&&5`Wuck}ZL!n}rHv95Q1?7jN_wIgu{OGsYvI&GM$jdH5 z%w0vyEA1fPnwEip+QCyjo!}|hcxQ)vRIs-Dq8yfmH~;w;zuf{%qikGcQ~rnrLq~@G z2)qKoQ7+OscZ+a~TD=(@1q*A{{A=pLRtqV#u)3UFF+*VN`Qqd6H_q~Lha?K92I9tYWeOCrzBrXWn(N675U3 z)6f{wY0*7pcLz}XQ|~(ED=ZzCOjwKf(%F;um}lVY=XL%*4#0fK6Tm**)ilw#@crR- zpT`$p8vv1~-cSr6=FYf zrzFsN7aavjeyk!C~Ypw=V{ zUYR`x8+>VTph7{zj%t@6v3X2NmT$9g9-fTLG+fMf1Xe;qtwIT153bd?I zb38f*l$e)ede0Z4@O{**f&Yk`AEO%!Ap^7C)(={6wbVA|!z^XphmTU;=UY<2i`m?W z5$GJ0=I!ry38KxO-@oexLKSn znjXu7u8Pkli`^+K|LSce()6}|N`K9C*@pneH9vIuyNz4C*FtvmzX8@dbv4(gIlqyb z1_nlaT3)^c_3%FF*>h&5>e5A8c`XRn<`GV0DWO3kaU}vo^0!}>uu7Vy-SR-{({#xd z#7sALf!eh6@KhE>45XbibVd9!aESchvu_bBY>X=@o)q*O;?FZ=OBDeN5!k!6Ao98G zG8_ijK0`fft|={Jg6EVXQ2M=`veYV 
z2Ip_zJH;R^WMI+8{KlMzLm{?Yjj0|CfD;}#b;2d@Pd%`)^c3Bi$Dyf>7$vrsRoCNA znJz)>GFIr_MpDLeqk@LCdASDY#)9nUg>o{|(g8%PuV4_M%&F6#1^9%VC+;y+h@PJo zb51HczNO$H>MWzU&Qb7Y8B|i>bD}Pe4u=F=ygcEHzg}+_=pnAd&o}{gd)xjrrc3>J zWOy45b0h~OGsin^g}=F}fWh8%nJFIvo1X=vJ&d`6*IFS>9fo^|FZ~l67E60g8m7_) z_K?z4BLlx6*yVf((30RqXhH@7885o6E;ddW?>7#pLR(bBc1>fU{G=c56NZlMJQa3n z&W_h!h3he8IR!P0yEYVvWe@fMJ|Rto!CRH7Y(Zawm3-PC;YBH*^1=MqN<{!xXu4({CLq|-dV&{Iy$H9y4+Pw|3bI;$aG zW8;O0Mo^M91gBD&iMNzWh$2_9Nztm|wR(zY=Go!7`dR$ z$vg!OjeK$1;)R&gnD&+n>qh&;r%B{xc^&X+c93O8iUu1;v z$%Yv`Z^1ItI*oH@LEHD5FfFZ!R7E~yx^jh@n9!gtPRIYWir$P00gjh+QshWS=&~@7 zPw158h+@B-t+AfeIq+pxRwydks%OvD^76WyqjAx6>H?ySTq9@bH}E0=Kz4~~XxO8H zE*bC5Y12v9EsJA9k;eUZVdTzR>J8s-C6(iOarZf}X&vBl<05MFeU!U7B3{K6dre1O zN=TsG;nr5Ct{;*EKj^$+nowTdjKJ0nd12R;Gf4^FUZlh2ei72wlAygGL6kn^FeXPOs#`X-%X9PX%fFq1l+g; ztvtv~D--WIvq){d*`t@`O4lf@S0m=h4&mORnMzxpDvTCppfOfCk{jKZ6@GWKDNkgC z1q=ORu62t;$weD?NCyTHZq=cJo*67R0#Zox7sy-&VEj+BMD|+u{G*R?!{+u`(nklQ z;I3eV26=1zIJzjO@n(fnw)q=>q)MpO{*{W`d+kRWv@I@yx}b#13s|e{;@hH zKudbsnVCF!V7Aq5t5ZyTv-7gN>4-dTNLt#$MMf03RG6Eg{SKm06UW>4jk9UY#@oQvbOI>dnbwl2GA6b)Db(B2GMZDE-Wso z%Y#DcX=IgaG#b7uFP}bwNA^Gn7JDNUk;4~;P`N|NwQ7?DatB)Msxt}cD0N7ZPgCL) z@+u{&roF&V_*#NMStzD@SZkX}BCb#=QP3|<`XrA@X{Sat=^^igwn_7*1?c1;4>3Ln zkq{4LluJ8!L)X)lga7R6G%?v=!iUzV>Ol)DJ3M*A73a;Y;OepcBb;M_nU;x|Z>!Ie z3$(z~nHDJO37+rcNLgh!Gnl09@@ztOJAjQ(P9krW>(&v&@Pdz9Ug17@+%n0Jd2n3q zTo}tudfk342V2Yt&TTZ(Vixact#0e%B^;aJtqV}Lr{?TPFAI)Hb=m2maOI)X2hFLa&Il8 z^+hqpHX5xVqa}5WiKk(gInbUDGGWy1W9_?PZua@}w0B|d!BIpQ;xtw7B9p}ah<%=T zd5V%gnPZW}M;>iWk>%{7*Rm0R0!I`#oOnZ(G_c(`ZRTSQ*=b4H{d+ZY?=On&Z+P%C zJ_j0;ZeM>F>!*M&+Kbn~Ppz%XoBMFbCTV>xHeCZ=7~vcbB1?uceTiCh4I+iJbAeYy zxjEG70O5`997CC$8}odo9L$YP4`wT{-4>cAx|M&Ee$Jq|f3@?Q{;;|r$Mj3QIGw9{ z3vf=zMdRX(74nx`tE^e_K(U2uH)J%&}F7qWI&CVjKL(U3YHroNo5mg^0yb zJ-`W$GR}e~IS02^7Ne`2YaQ!_e5W07NA0>MyUDov(OGR)TsY!m`?3FWKyQHxa0icJ zb+=8Not00f;$Q2IbQvKXXM~;J3?9#t!Xxyd?YTHVMUa$duna8e0L>X^n}VXC-Xya% zF;N-!Z}rGOLQhDWsdNq8PjB2r4GaREhoEE%8h!M8vhSi$GmAg+LD?<9B&RG=5?qqr zAxZh{!!VErqtO|f36wO|5>rz`VAtf>^ULg@zxOBtDY2|Zs8;eh3*;WbaMz+8n$dn| zbS9Pw`QS9M$0Y&51iCr1YV<>X?^7fSn$4lpZ(Wf@x_bETz2u{h$9>CDDTyX;U?N)_ z;31u1f!}L#0bDH+T3MrkYRW@dhQ$vRc7C9vlc@D2W@vr=3zP_K9CV=H42dSf?PV<^ z+Y>+}?nY)PwHYmD>>%LdB7^rL!3jASIeXx3_vxqY30JThcwcv(dRAh9lEQ{mu++Gw z_{MFAHzw_MPIfXp{O|N^mvaSY^EO(_yyIKcfVywAfmS9rZ#~c`%~j6ftXdLp<-e2{ z2^O^N@0_#nJ>DG!3aGx;zUxu!*&OOpm|vTM=d-0@@56|Cjti9I>AK=r`A%zRCf-w? zun_D_2;Mx%cYUl1@lu(Ur;m4ui_jIAE2C|g@iMLw4Sc;fzkIzxBU{R_KKNj34&ln! 
zVN;LAaj#D_RE*`|CfeI4`(+QhG%`8jAhwOTlOw=1^z%(A2JNM&$}(bsckJ=91g=$8 z$4-u3!5BXBeNybsY3k2ydgQmV2uv30%C>l|b8a*q?|oU{?&0Oi?(WZ=Y&)#0ptbGG zELwq2i~}9upoxcOk4*gG!m#X}pzS~m^qUup-@)8AE-KACIA~OK%Et=ixy?7Fk;SPN z_j=!dN%OER1`Anwm#pRzh>DJ0Er-@O3QV;)0Ar`0gf!v`gV0Ca$NdxA*zV1IET8!N zqR(~_q`Qq()qO{Yu`8b^elC5ev+fCr?}=&J*IswY{&uP0<_XMB+-C4>$i}i;-?SSV zu4wiqXX0aj=AvRyL?o;4uRASE!fW{d?e@N71neMR22foqyL8$rl>MvbFsya(B5yZR zFPxUuz*m<9?*gbjuxk0 z**KPbJ)nUj4~Q(~Ccma?_$s2Ny#s=)(C>I);~nHQ8bM38(GVFFE1~Cp+c6X-jN3IY zGWsMLi5n)Tg*?)f63aoz7dTEe<%f|ekhEeWiI>G!dyZ*;s}d0FHWM%tCzX<$+FF|s zK|MdMlWeT6vAE>7csNvadGKaSCS5;GAnh_y0QfDMhog;~;Ke3(W3zl3LBH7qIp_}D z*X>e}+-yVlS=wGY+4kyBY10Y7aGBrUH{Z#pr>($@xbW<=`fR-yusGh^L5+%AFn@Kk z+ec{xxrWrh>Tcv-W64PtW+x2MyS zAKPeB8g);fEKiaIB{#J_SpKWC6 zDl+*TfLUBX*q7NbmZh}l(TrXLic6p>0vKi&iXwI4&^K8)Wh-P1L}AKB&uh z`jP3;R>g@uYDdVXz0_T&u;ZASYTa9+D+D}&D z_Qd<`Xvww$lf2eS%izq74B!qDPI4yqL5Z z)Luiw%De-n-FYls+HLmAL@R~#ick->4eh__>^t4M9P8o;onS6s^i9|2BoE(giZx*m zPVsj1RooG6KO6>$HDHGzvZl-r<{JL{$o6bF?~9WY*X@!}I|A4f>ECW5ao{R@tF$bv ze(L;T!lXvG8lq9R@Zn&OwRron4g7-TLSk85A>Lbw=SRiV#NKp!({Cw!o%w8y#(HjX z3LWO`WT+?NrPZc|y)T*Bo3_NU9`-=h)q_z;7!tSLPoHA)v2Nw*R<)#=HuC{B?(N5! z*o+2b`wkKoMSi^V?Zhm2*`rcPyr^kzX0eQe#Sc2*^nVz8&$uM}_7AviWokpSoRyW9 zrI~xuQY*{RkaAXL?iC6yAhWVCaWOYO_gxoU#EDhCsx*l6)YS-dYtLR8eAUYS33fBixEfcb zo!55^AyYkt>!(HgBl(nu_hIJ1qmms z(^*~4Zg(?4-;8k#v5ug>;#*y9nz3{JF=6i0`l)`pI}ko@=Or~PTB8|JkZ?t3kp`

rKSwRgzCwlt0E!GnJ#%;p~w%}c{|G%*k7+hXcUgw%%%6C! z(?&p#tf3};>qxzy(@B(Qd%Dq=3UJ*e1y9`5&K~>N%;OxH+!!G2)a0rB{p0MheW*#t zcWJ(3$|up0SvUA~JkiBcMkPTzP_x|&81_IQ#5#7Q%a*K~U)k`7}O)81i4w(KPa$T6c!wOG4D1=V{^i}6)Pu-_seJ_d5lSS4G}7{o>QQIlCF;m(=zwk_e@9#MV~ zy+qVZSm`4cD_#>eLzm>mwGP2K9|7tD?-2Ty!$}(gF5uko?od~%!CR8 zfIz&Tn_PNR-5O}zqe@AE{-}P3M?O`RT(2FML**pGZfNmy(?BWB#!$uWs5pNPZy#g4 zvUZBqEhNd_F(AYn#l4~mxz*tQhb!;Schd*vb>liPwlNQ3IFrrU)c&pKfCU^|FG0E$ z*GE@%XnyAS(&#DFyQ1*lOlvJe@F4~-3mgxtR@%pCG>w8*E79m)1nsBa&$CKvSIw#E zU|q&K(<-xKDvt?kxyujam<0qz;0g0Me1X&FvCNiQ<|vh0H4}-DF#g$jHw4noQAhem zMem3b?PIQ?_y0}qOZ$Gz5-d5{UG7`X`Q;%rFsJKLb;n!cyLU)}Zgc#`Yorhvhy&Mm85njjg`JEVgP zw``S#G#>)GH@`E5!$!jgJSPnvP3e}8^LfP95mv?PzZ4>Go@lEe;T~e=Y*0Xj6U!n>nwMvC3Tuj;wi{|>6xe%e&s^3h>x z4xOtKCirlTVcs%}-g~qwc_aPpk>$%v%(ZEi7UiPTCZ81jUA0V6ord;SG>u5ywXv=q;dhM}~9&Np6(X-%EO5Bk1WpKd;vXAeWZDnb9@n9|dG?zE>Cg`+!hl zOT-j=RDp9`qCVnjI1DGOT2r{|3>8xta@BnVY)^#hl(>DwPc;uQ+fM@6i#^7t zzlfoK8Rw+LjPr(&SI)BIka@z!KGQ&c51RX!R)*MrV$ggZVCVH>fyf22FIniN{ozKC zwK$s5QR=?>g7#fo+9KK4q!hoxyt^sTK=u^uTi@-cH!gxQ0T*Zi-2Uuahef5f}wNsjXm@E z52|}cavs{w3T-k)d3^`-h#%(2o|Yj}hXCD4-p@V*96|3-Dh&f&Gr|NtYvrq@A$2!< zlZy7sGQ@0Qm+D1K7s$)6c*Or1iXRnBrEIm0Ywq4d&Z6yyJuc|lE$8rM@ zdZE%NNVv&^*)7C~umD(CSOvt(sYe-@*##THz+bHz>S}E)z7ixP9)yid&Zd+JH-I26$miNYqXNpW#?v?Ao58cr`_JQ2A__1*VCu zjhxb)7W3PXb%BBHVvPe5UrBGeDYZ%9&jkL}pf4tA%lN=QM6PmrvYL|uK85@+@fK8N zD$y9Bcv?&Q3bLJw$z+6#)u|FKigNyQolp4LZ662{e`vS~pzhiLinvbd`2^1@$5XEY zRm^Oq@&fZi#&ZU89{^z?ALF63`IBGr*Yx=KVdn}Jz^4Gl5ABOiy$sBG+3e^5q&dwU zhuv}K8gj}BuXG)Qq3x7vd|l0iwx~eloxGf^Dlq(`wW(XK#xqtoHv^+MgmN=o=+j`b z#SM@I(Jt>!Eq{df@qM^EBR4-cp>%2j!}lY8rncVoAI0Kw!u_K>8}xoatW}Of-3Ae(aebbxE8l#`dfX3 ze$>5C^@}_sa~nVb4&O!IJs1Tfqu3Bd&o!5uEjAVIpflv3vpTNN8G-90LbcKdo!3pc z@lM~s^+e7xA6}RS5`*>WhvEHr#qtTu%6Vq=iKYh<$kHuDerqk|{cUHbkWHs~~SZtNwk>{s9IW)v3Cl z0{k{D#v{`pzDKdl$D1wXRcpd7;}f0EJh&m~o|$`>*y9bsZ$*D+PCT`z5hUay^73$R61V^RGd6)Bau-Bo+H=A~R%%Q751k z6d=@vL`-rB-i!&3Z};3^Bvql3I#br?UXbko?8Woz&{G3)$Sf3X?U0Nc63)+TU|$2h zC8R?Yx(&d#^jVUh19JHN${TFBWk}0>b!xRCXrR_K(HnW?9T+>t`KX=n*=eBlEcK)z z)rFj7QDzL9%n1M}Sp8)KiC+=v^0J1Y?Kt4PdHEnftQ;>NH&Duk9;$aVW?{qFS4El6j#JeF?*r*C2){eSPDB zN9xXo6jOIlR1v?DXw`inJ@3Hb>8Rcr{z}|BQvXkiAf1;mEdI#Du86no>-%i-pC(e5 zTFh$`AOYopdLO=oqS*%T`ePLp!M;=`7Ia$AhdTL&y?*s7n*7`;E-Eo`wYBXV?)p=! zVpFoK9zhPsV<`&1U+w}@44k$F)Ego{oXQj3Ddo8$kEiX|N?v{^fwxz(Q>8?&+APyj zCfY1@D$a!uvoy58vdG_FP8_c>6iwG%938UaE}o~6C*8H9!Nr z(w1BaJ&|8IjTf}edx`BRiFo;xbMa%RLZC13W6e=6u+Q-6M=0M6H3pTTP`rpm{u_$_GH(u zN**fLrQtH6B5(rV6wN{EsHX)?$u&nL?tmADiZnQDt}T3zGKeXq$hey3BppnccgBBo z@>F4PE>qNe`h!GSOIX}KzP78sbn51? 
zaVYvEa#NIwxXd{a+0#;_J({=rpJzut{mf+U!@_36k@GA(oZ z8&Jn3e8}>tlQ;g5DCQl&CH*?lBGu!h6jYc|xYM?9nj5&Te;yyDq;~6zl5WQl1%Cq= zB??hfvlvD-oMCd*rRA+f^Hj|&{OsU0VR#wTIO^$woeWo}8sYIiimJb$Oq|}VByP0Z z+hVbRT(*J?+u&?IV)kvH+$^%oQ`Dt{3^NOJ5zS4H|&SB1eCz z9uJL-ZbifE0ADuCQ3gZ)F*6ZaQ|{(h!26`4J{j(|KdgQD^h#`Uv0fK%%sVQ}GHUtJ ztga-_WMcr-S!kAT^(UV$DR>LM`)ML#xgaa|8ZO3Hg&2K#ibv)2mMxmYsB(ryAIy?P z>%SALwxRkIac@D9CU!MS%qe@LAoXh+JU>E`sL~EVt9l&kv!GXwzR+sVSQ;HB$p|)x zD~V{=yhMHNSl-Na#;n~Bfp%KkSvR=Q8(AIimXhQptCZa6UcSYLI=fzjQ|bMcrSsHk zZ1}A64D#txv^0vND-^MWWBVfaCw2;cPk*^L()xf@QW0^oI;t(*} zvK_?{lF<5HEqu|RU7~A7v(ZYy6F_c1 zHXOSFgq(KHyf?CljD(KwXI}~9jN5LK579~VdVRk23fXD%1;B`~tx%Bv7Y^Ea*1n%q zBk{7>z|jDZ_eL#rWS73TQEz*i|Me#v4$27cmy#V^Gej2E%mC7D+NNaePJb5#GD$OB zM#xdsXhTQZ>ZegitG}n!O*Nn7PB{a5ZzPI3VY2Rj zobZJpyqef^ws8~wLk*vWD5=rgTtYe-N0AZEo2KWZve8)<+<1_qlDNBzo7egV8vzR$ zM~hiS0?yai&m>NZ4DVrkENG<*5K)!~9EA{Tb=4S(S?y00ZKgt| zU1^3PcopvsOyIDRs~3U)Ew!eoudSoWjk6JJ=yqk9V^&_7E!lJ4C4O@MRQDQc{SD6w z{XokFm#Y0u-op2mFkPU2`RxOeWi>;920@>ads7Pk-K46e^+B~`EweJBJ!TY`w=SW} zHBthA7C3MGDHEX`#K}eTKA;~VT1Lkl^~J4ra9?_h68KFDur%kHB5Lp|!P$TsIY@z} zs`AMJ^n1XAAEFA2VK;$qg%_uoXUs!4l8Upa^OqfxTRs&n5D!P~@oq0J$8$a%{e1t@&6E>1RfoG0<0{ne>+Vb6 zw?R+&m0R}t-F26?HuU;9ZlSMJPJZLI?fS+`b*3>7iq525>XqDj^}LDZo;#EhK#}pOa*grt_5Q*s;hx@a~dfu6bre-0!4%uHG7QL+3+%Z zRu6SMzD_u+Kt>O}nVcrMhD7Ji%fL)M6Wn>`Ep;y*ZGHZ?pGLC{17Y6Cy6J2OG9f3t zBCMxq8FpN8KP0aUWkJ{I4F0A7dHvvCf~r8df?$8Y|CDdQYI4Ek-cnZBp%``(+`1`r z~Eo1#?U}_7!A^doEs)4Ju`4CT-75%A}ffTUPB=adxkAI!~E%$9qeRJh{ zD~3e%fJT)jO8krZylgRnR!u09H?IIuzWa&iM$-#iH`c?VRzH`MGLgD2vUKUGVhzM^ zdBppOxdwugX+^DdV8l&MJ&RwP)xu?FQu%rGX%542@*vw!yhaTy7WHD$bax}jm)cnY zzGE>f#FAZ&|JoI=^tQA|j(D)aUNwL7)s5UaImy7sjqQZW;`(R~& zS%_VDbAx$C^IfdknDLE)!-!uc^hV|#;MwTkE5ee0-2HqIGiYhQif`oQgb64KFmh$)F9ws}BaZf4f zkFcfB@&y|AV1jx$cAf>4Io+^baKJm809_k4o^>Z>m=J~(h(ZXX_T}I&AvkJ#wyj!~+kKMK_g?B=m_DZ5FejIt zRlr{iN$|LjZT51UD)u)Q@3bxz>B27P!T%l&T55q9Csroa7gx2>Nwr~tWIWIN0O+$jB$=ng^Wti;B7 zN3`LYw^_bH`~~JM7bVMrTa5>}W&zl(1}z28?EO3v-46oxiXMF?L(e+8G~?Ki=hTlc zA#eSje3WY1C{gs4f0Cr{&Q48hmyWZ896lP^g%;M*KX)NF^eGToTG6kUOv@A*bk+Tx z$nO@Bt3OSt6IB7J{iWaJh_GQf;>Xc9Sn(J7s+irfC5|N;T)b*DQQ8^Ew#BLKOHh#csD+kbXmE!j5r$wntEm<{M7PUG zDR#r63tT5MYZKfYlzOGs>00c&7wM~;b)6nL&rQB9T7OS-OQx%e@e!h0OR1OltM_&- zDMy-;5(f2i5=^tEf=}pUfSLc%?dkYjo%$}!;=AlgKK zu3x)q$O~y;dgMc7HW^~68qg}-H6CwH@KTitug69mXqX7Y`epIz8MVhmbPEx$VPGqh zN`^@gig@fkEiST3^POw%iH2HwHV(C;B_z?^vvaLZ}9YU7z?|2||K<~#!zA>a2has?zr$ z=e=XXzYcOv&+(3YW%xQ7$=O?G?kammYLXMO+}crA_CJi8BzbTt$-M;m%&}OBG!HS! zz4{s3P(MK*-8b6mE}W+KAjQ`y3nY*k%QSBKPKshXXclpHc)r{ZUkA{IAo0Wa`Nk)T zZ;K;m4fO^QLf+Gj1yL|WHGFBQww!YN2T;NEQOSKF3tznC6^RqM`R9L3BEJ=DbOI*^ z$fp?Etc;g3^bE!0SlD?of3AAi+D=h|yUi(3RxIJXJqAzf*}mu^+4m~)_RZW2o>I_! 
zsMkVhDXQJbZ?I46)ye$#SzPYT)E{@;o|{bsysqM%ZaKTP`iHrzgVL!l4*n;!-dKeY zlG=_zuWAj>9jEkK>2Sz$x$1M`;-ufjSC~jbQB*HHRo|y;DXpS#7Ww3qNAdJp2|TpK z?{pojvZ1s1duYNvm?pgIl%HT`apjv~!)L{ZA~KIw$GQv!2h{<&)G)(V<{Zzev|T^I zzOJ;-t8-9RkDXS}tWo zgb!(&z4+QdG~^fKcNp2y6xJ5Wip^ZLnG$ z5bXt{5}5ISi@+Xj^*`5$==v!NwLO;`9%dO*oYX^78Ed`Ds=Uj6A9Bv>7b+7>4KJsv zTlw0I4!iWkQI2N~_C_oCNxYrZGEr^V9H3Tgc*^@POU)wk0DkR=dpCI@Kpbi>dWWvZ zMT~pla}~P`5Mq*@o;uaX4}2Gn39P)Q36{muTm)WtZ0%}0KD(VnjaWU$84e=oD>%=g zXb`AnNEwz3eD&#DNN%Lz6|J`!ceYLOZtqiVH4b6%u6HR5G~YFIIVLoCxOc~{khlUX ztKc89bB+>7?|Yj+3bxy>O{~k>Hz-FmKLl3B^&~ZX3bamiEBD5yUd{y;_BcsQ120^C z6tpkp(HjNgM?N#*tAxeMS#fFoN8@R0P%6DPghwf7RVy*Z@ZA<}=?{E-5^!I*dFg?^ zWVvQ0R7Vfxn#J^JaD~@Cs5TuV)Y9|(xF%uX{bDdzh%eSL4TB`~@rJEWV_e`=te*u= zGX%3-=ymnvvLCtjA;~FF8{tqh^0xGbYAHGfirw4-Ry~>NCyRT)O&w{m>+7(Ef2b-L zbq!maKt~_;-G@U{p;SIO=vSVfpbB?^L1^LD8>yk+fN9XeiB#oQ$nmRURz z8l1>4R|GY6wzWn5{BbvB@XF9xqie3`)c2rbyNB+|MqTmweZQZhALJ3S(_b>pw1H_x zUTSJ8c-l1<76oB}R+_cq(~&E#+@8Mqr#E(FtEqWhI7GVmQwFFE8cUa%u z6D%ukC5&-;^yF1QJ!JEm93#Eepp{PKn2RW9@Pc2KMO0obCu)qGP+Mn()nGoTg6(a{ z*|#GPfK-J3VvQ5>+fX(V_gU(7mcc&i z;j!R=S7!sy%VeF|fJrfQYeHPEhvJ43ARM>mqr0DN9<`4b%ZnC{jB#3m#6vl|IycDAwNUV5LIhLbp>YMCAQ2mFh#@etL<_&*Fu4N3zF!Dkv?cTSZ}d=$ zI<6TbcPf5tC#%qe$>$4uH;U6_a#}bH;;uVF_b)w+p+5 ztnGN%sxT4xW`80{r@+*8R$)zju5EU7F1JyjI(StjkNe>h4&0M zdfTs%&{ymN4GG)xcP9`ymP6=Gb36m!5Gp0m-0PPir1gU}C5poMS5@WP#bE@6<0tQ% zIkj!rQp%WqpX5PE&zbo7xt1BFcFjOpy0ahep_FGbm`X~Ars{2{uxNGXPL_x%E@_Rg z1KbHbDP(@_jHGxJXo4JcdNP6E)wUnIyMC%=%!@juh`-l;(umUoGvtYrS-)+?SH-i0 z0BKi6{JAcYv2cRwtFrcv^>QY3QUXowCi`>x&=pXsL-b-MEBN|IIuXR}M-D3xk+Hk5 z06-h0Mmp;aDA*ZFU~BQfHa0}j)X($@g;Czx3NlD7#!iHy5e73>Gu^oac_=}!iI-gf z^iM(oqqIRV%nJ8N7XF_8Lmt>zJ|>qh-d7od%xbFP_8k53G8ahc{5id;SO(oeWvwTDH% z!*p69GrCsuB_+D0Uq3oOusDWQydA1KzxH8jrdXEc9${8BHZR3%Xbi;GjK&oj&vMhF zvKz>Wwdw8ce5eljO`qO3*F>hw@rRYQ#-~EflucnrmT#CdqwqmB6I;YOBTN*HOk#&n zd*&=qxEqX0?ylM5qr9HM+O>xW{i7QW9r(LXiTtL*&po2huRgM3(0~TM!-SYR%*uY2 za4^pfX6v0^vH|yrxRHL>MCtCpV6hvGxb3N?kx#eun+rCoLWLvpxGuvlIh)h>yBgun zW3>DIL_^|kC2<{Lcyr_ozDtiMc1KnkBWa?*LL_5laD*p|?009%@CE9Kd9SoRVy5~$ zB8nEclGt&x-YzSUxr(p8MLC$gk?Tt65{umTyTIo%>a5tGVR$v)9V+ER0Q;NH?=--O zqufU`O!@D-J_^uJYuPx_JJ+jgBZ1FNWrSpEfJ&5@@NFqANLGA?GZck1PGh|6(L~M9 zu2bH^6ZBTC_m)3%JJ(2{Or#1t9o>h(#ij6enM(K1Lo$(Z02&P3XXV^kz4il`?`YY- z6uHVnme-2C@S0gFq_|Se|1Xq)y zyOk8p54#eqB~$dDO_c?G+GzSofjFV`>ECd#)Gw!js6Zqm1m-L@$#O0uMFHq{3VUJ# zb_RpRbp@X&^8if<5JQ@HMLv*;rB-@(n4xok_`UJs+FiUI2Kl9y zgToK4)k!w|SSic<38)mNV%8mya^1`bovz9!=}kSz1s-u|9k8ZwDsIepA*!VskDvNW zDMB=gZ1s)4ZRUG|$&aIw&Dif`_%qT<$Uw+Mk07@t1{+xd_iqe30ZGL)#%xcx9$$Fa zBEf$5a@EO;d6v*$*nuC$#fvc;_X5h4c;h;8++jz1UVTL*{+>W5G5_O@1lpNvfB5BR zJrva!>{(ZEmI8FKGJ&Z7bGicY`jgP;(M@D)TlWC4ByKcnOyDNoHSD27;T^|mGHQi& z@Inq*v*`9e0+^jKXByMw=j{wHop87c;xOmke#Qn(VR>?8> z8pB=aX6dAomI|u3($1mC{P-)J%abRsrEhQlgv#_G zGpy}`+`tnAvwH3O@4C;i5UfVlVzLTyB^6O&ygW+yOBx_Vv+fQ`(3Xc_Sx$|v{@9?9 zUu_5`EO$xyhHNapix|#s;_6ITrwc^vYCSItQOsO@?*YMwzTik>(#Df~r=a?H|MtPy^$vJ0L7Cd784kesy!z|51ec9t2p~@L;anu>%}|sCib}>BpU>7 zMep|6MfzA9Gnrw!$W5DH8%h$b%-PCiG1Q}nlQh#62^_DY+iQ#b_*d^3Dh<_qGF3Q2 zFO!tdu}V*ea%4nSkcAn)WVr~=bh%gb`Euk>cZao3*wVACDwc)_vqjaMl71?JH%CEs zM~kcpU%63{n<6 zk(0{}&+dd9snoAsZWovv1Rq#TR>&XY*pOmD;*ADDFhP9471~(U_sCmYaOw54FwT?7 z!toAl`Y@7cYr{Zg|@Kxqs;ui&CZg^<0#3naE+Mr%FIT|8A%>ylE z(;*mfeFFp1MW29Hx}FsOK~i)R0nG%2X30dUF_@<962CZl3>E#5v+cTmSI%Z2w-TS{t6) zJo@RVPXeQRZ}6HOJRM%Rk4^<- zvQla(OXC5EzXj5#RK{}b1T=OqaFJ_mnU zd0;7+aXojIkY7s=pzh<_O7SRtFmdA`zx;dYS8ip@S=D!~e#ds*dfCw1lQ0)EdX6iH z!s-|Ad~hu7L-}$GWOvXli>~uG5|m%7K>#5YQkF1uvn4Pze>|s7*;pYh$L26X#{kvG z@_E|yVvqV7P>nRqvUO!{=I-0w+i)oO4Sa6NnXqJ&zER?vJG52^YcXECf~5nWTe 
z*ru7T2?i*p8AWke2Dy|=a)nVU!9JSB|0MMsIp2t|9214g^=L4QHEb33ryJTHRXUpa zio5+@@si7D&(X+M!)JB(25NPavSeH)5 zH&m(SM6^^>eKWubzg1X3D`VJM43U?)*q6~tMv*r;k5#KRpCpeeKh;DTw-44&b&a&x zi%*VPPw=|n@(eq(Au^u}3OrwU3(Rhd^zUO1(<6OgueJf(>>4 zlCja)fxE65T!R^DoRHa^m)n_DYSmDeW7K^!cj%VO%LE#@Vew6ZpvETpu*Miq zR~s-hMUKIH*6M*?#4++2fNrZPcuEZ|Xz-cVkG0ulmub?!#tf*|&+p1OcD3i9eTo}9 zOSH}l`iin0>bY`6dQkA@LsAMA@l}nPZB~~CyGy$ROeWgQ6pTNg)-9;*9Nggf#$_cq zSdWr5*>c5tzQb?eO5) zhctCX^xomL##F-ZFp?a0Z8z>7ca=U3@k-BU)-@O}e??N@_x1Ua^YaBUf*0qcU11PT zp);YGftXl&L2os+AR9#bj3pMaVqfkqvwR?9Cw?V3rg-PX=-1MAX=NWRWb16Yrkkjl z-p!sBvh!8eTp)BV&%ledcB{hfT4zS*4T-Y`3hNfyE>$Ffmt|~gx_<6`0+RIsODJNE z6M9Kkw=zd1u2tN*tM&p!wwo@uz17IMVdC+#YMrla9OM%~IcoL+b`@49QQ34rsS5ob zDplU1iZsjUM@2E!3TvM+*F~S>UI)mL~z3Mz3^SRdsC5Ju)UWHa~>l@O>{&eq#C*s}*rM#+fZSqb?UVaV!64im6X`+vkA`*u*n4RyLfA5n1Z}Im((>StY-j0#$ zCre$erJotO+sJ$nG|_p1;W5nG*ZB*yi)#4ebo+pE4%^@hy&f|Ve&WGtN!fZDwOLNq$ z(rr;yF5RlIjPU_HvGZSLaPU}}oXYu-qkA4`^h}@vOj?AieNmc{FbG@nI;Y8KTm$On zGnaTQ>vV-t1)koA>S}tz*}GJw(b#4DY(iC``ElcyR9l4^C1a)%2{} z{O=XDf6Hv)4|bIp00rR3=?*4kHqa!^{eYj<%+yv6B(?M)9J z*ej*Q8E0w(M&RqXf?}+{L=8&0M$*`?DFCzUVgY5%<1@PdoRsb^$S7LO*a zsU|J~pt6IjfQ!fE-`aQb*3dD;Hh>Rdd}nplW_{ zP%?PJ{73g5IG2sO@sGk3M=-^6-5U5&_uC97L!p&b)RFv?<#&rqW)(7KmrvFWl{3Fj zS4bt_mK?oQJJn!t)62P)q@+Ucy3lvK z$4`kH9fyM`{k1LOYnD^YUC?kiwdz2pAg9`RI&o7*x!H}`MK5BTPc^|m;~P#_4!gta zl(PES=2G3uD{ppZRn!Nxzw$}SWSbzpJJ>ImT6AaU)!K&)?+HbBRVz&zO+kXycW;&l zu5{68PoRl&(gNR))TP!`zKI1w&@wZ-|8@wsqctg4!1+so2zC3ZF@{OMe*6FYZ*aHW z+LJAuKKIm$&4@;R)mJm93zL%M&`cHew46o_H$-KoFBktl+~hWUe`o4S2PF{-S7__- zcD=v-t{n6%WZTk8Rf0jZROPt-sK&2QX);8i`|jH*t5;8k?e2z4fVW;(uR9xdkaf;^ z=;&cMK`N*AUhTlWp#eNJi`LN6H>7+3;hjQxHosEd5+xDQ4Q!yLmOs<`wA^Zd{WM75% zG;i7g@j50w+8QgozkWy^5P@+!z+=#aGhC0bHQZP4*Y>mOTCexvT)8amJX z;$diVPQHKsM~~lL5;%co8PfLdAF$cK>a5F!|B=KY-m=&TOC9(No%x*F1wE)Y_Fn3n zcehDl_(*mNezvLSs}9h=A9F+eQ_fSb9~_^v(ti>6ayQFi48i^NWZX}aca$b$2NFD| z>{jZw|8>z(EpDNz?Qyzg`yB2mY&A|YLqBwQ(}~!#XJ*`{e~5flyT-SH_sPtZeJJOn zuXbqN`m}n}MWs4}`g_Uw6PY9Trenm;$SQYTFgGR`=Mb|o!(ibmCi*k)Gdj!QyRYml zik6^q*XhJR3Lt>MC9l7JRsrzJ<@8YEDMeM%$rxLB%XsQZ;oF}&e)o8&Us7Ak0|5Z;p z2FP`ueHtR7_AOV3E7Y9pxJgsJs_!Ps(ds9p5?ih82DeTtK}*laR~Y^TcF7ek9SuTM zQ_}gw@SE1}TN_+Gwz{7`8J_T|yp7>r{^+TV?{17x_F1x3_RZb;(z$KD zy01o3!h@Bn6#KfRX3|50_G}TnsPVoTeW+|})JG4&iwkvdx#D$N;q5e!Km+07Oa}?0 zkqgP;O^a~?uSk%?{>zU)2TWdCD;Aa$*;XM#JL}Z{uSeN+ zV#B~qK+h`kk<|v=vf_|kVFq>G<;;!{`C68VB^^I>>4oE|kPFt=GB0-Ld$gA2WL>`egj-Xr@hmP0h^!YlC{@M&|1_6`NBYV@a|u=; z5>~bVde|f3A@jl1-w$w8z#6!Jax0Q~~A3x*9!#_tgti+Z@^XC+y5>WfZZu^YKw~zntE`aY^nyrF$ zO+HF)XC$VE*Jaq$of*oCKO9Sb)*0hIzrba8SZoLHQ+I1e$CbYD|L#_0L;dKoTgf7` z1U}zBT^E$dgd_t4{$VDF$dH&F8?faZ=e)f3G+HTZCTd zICRJL88=@u-KrFqGo*jO#pZC7;_RLK^1mcPnjGeGdbcU`C0X4f*M{`hUX-gotfSx% zeo?v)i+WvKUNLe|av{;&?))UO-@bgqd9~yC9Eeop;hO`-J}F@9jMkvbYOq?kel4bV zYf?*y+Gj&~i{-45$@NDQkFxVsrB5_!Lks3G+z`v$+I$44Qt({!>DSTWnz3ii)2|-z z^cwG;^BC0dYk;c%Jb|QjTn|5Uo2<~4!Pofqspl!{sn7-!`OcjKC?p1TRowAmU3I_I zIexT0YF_q}Lp?)azwQ45xs58jN|6{=$ zTyS`s{^c$3wEkaLxHZ$$2OcYXe4H+P)WE;w75EK@$t+IvS9b@w)1b(-z-~x1W0J^3q#3p{Y>#^twXDkZ59zTC`mAaC$SZooWNLOa}jj z^<99t$XT~t9+}hi?;F=T!BQ@6NQYS8z#7EBpGe!@3h4G0^(6qS`PMo7_gLx3AFPA` zGIGaVIEU({Oq6?`n5Z!V>qG8oMoSQB-M9CLUOOanMakje1LZ0ot*t2s>h72%}Vb@2N9~=M+3os!NZCeoF{? 
zVrv&qUaGgFWh%sE7=iSMfiub;gJbk82w+?C}$z_6jy*~yI(Lza{hc0csxb1R8$QP1}7 z@f%E3c2jd299NhJTlyrgj~(KKc0bo2)61wix39~uzlrv$^HO=mKr`cnI?M5P>BG9; zB85w@&p&bZF<86cTH|DsymI?dlE>at&nd?Rak?=SpFmKN?N-5#F6SO2*WQ!7J zbMUMl;xWos^A?8Q<+rBVnP)dScecHx?>=y+x>#`^`QV6urf$ei&#y6uMMnh6P0uPKX1L;x#_$9@ey0p(+}mn`YwHmms^jzCVF^<9@4?s1`U5-nzra`mX2&)!5Q6G zGR5Lc{W8Dk5c?eUQXMVdlOJ}TMwtf5>>X;npD%59B;nAVU-~b)5SO2o3oqv|l;y1$ zMp`FGGLt3dBd*5&`};m6{M`PlEmUvCwIHX&42E86uBcA**>M+r3{pNUvsZeHzp*A` z=HmD7a9*g-B&Diqs)yiqa@@C!@R{;W9^4@;^9MXj@%*qOk@6mZwTLp570HEJNiituR7ma zNgC+AGI~wg2n}ByHk?cQzFv|sb81~;Y@j{uKD1%m99HU^XUgt7VOtwLMn>J`{f7;a zGanIbAXIUyS-g7wk7kZ!s}un-;YPxm9M)f2$*3C`({3Q~ISk?FV7sFTYK55Xv5Nk` zK>G8>k5-Lo((`uUeL!ZQL=z>m;=VUdbb0joWd&0YDOB;QTOtbpAYH_A>2I$vkm2C`P)ibx>UVI zBg)!7S}rx+(X>fqcX|!qRUc*OWz^o3lsnjwW(ezzcVFzK|CmZOlralNLYU<1hW74kuCFPSN7L&_Cnbyg!8o-UUTo8;Q9@iPqIf?KdacA z#56(cxt;w5L>t~svlth~@y(}HO?%db>cxbEuClPWP8A=#xL97#3lm+&@5l#P?v4j+ z;rKfl(6Z3nmAv#IpW?dJ8Tv$=s()5R!bD&ohbNjm@y{;Szf>kI;|&8DKy_>LmX!zc zL(94&TToIfru4_1ZSRwJSGv75%5gPmGHqCKeLGf2p7{K`VX6s`9HhYsFUS8nHdF)! zM~=K;Cph`-W4o4Qj`TvIqx-(~9a6|Y|NfTshCjJsk2dle_wJGM5<`dRceXWbVcoP( zQhF9|8dtm0S77zvyR_H5dej=1&}Z`hu=kx&O?KV378Fqg1r_Nk(whR(iv>ZDjx^~V zX^|2L3B`i+UPQWrl+Z#)2mt|U(nLasARQ7Q6q7(`_lfUU&becp_nyCZjPH+1#)v%6 z-gD12*IaAwExjDis3O_KRKT<|Anhm{THcvdZ$ZuIX5}(q#B8BiZ^O6;7;#;>2zbGWRXj3t>-r8*hE?ZKt3PEJyI zL+P5O)#R~IN~8SWxfHL?t#SR8JY+agl81^t5bJj?=fVBMXpvO#MwEHD>Miro)DEp8 zDmtT;yFUlS%@^0ZX&@5`K+WjkyAJEgv*&!iFeGtRNnrh%Uw74h70EsWdb{oVYAAvs zX6G#`%>7IAbIevC??8fzAN>hs)fA}&$oRIf^M?*a8&WV=)Uy}W*@pWrZw0!~GL2J>5xSV%cfn#eXDOVwUm{k1yL)OjpNCVk6{lG`~gTExmYh(U+uYbjV0HtpV#LHn5eJ* z%8t2H()ZBJCUgVIqo`a+b3qAza%o1teCevc>aTV8lIF*Zn%8Y=f>4%K2@3Q z(Xd?~2N(oCMA&Dt=YtpG)X7>^$kN{3&)(n%Q&`F^{fLer=>ni1c7 zGP<3w41B8on&Bn^+*~g6vvDC$bjOHM>E$u%=-qex8d2$joiNkiqhmkz)xFCT*&_E? z+JSxmpGKE3V^zETXa*Km!M-zW)ehfFSsHb3_~=Gb_nx+XS$nas9p6q9{pcsZ<;m|+ zx&(jWSFt@b9TqP{kMezil=^-{b65qf`>nFZCF0PHleg&aHJ$SQp{lVT(bmMVx{%bG zg1)U)>mSLKUy(g_Z}uG5kow6UJa6UAQ$X{0fwwHP3?fv+vetJ(pTe%Q6~l61|p!+e-vi}nd%6wG&7-Tf`R4#o%E zwX);~GXf7Pe3s$5W*!a_27gzu5Rv?gHNU;MQU~-e2O3PV_f2)*rN&~fs-h}$*TVt|X6%)Fq{mE?*OvKXAZIq$vsZk+PL4}Kd>X6UZ ztLiPKL>5brTd3$unS1e6Lyy#2&1JJl0vAmaH6qv0i-ihHVV({$R(fl3htHk~-udN5 z8b2XgR`nRkxVc9uEdcN!BR@8UU9q$4ZcNGk!WyU)s-`YnZt{W*^H!bqueF3dB^vVk{eYdne=Lu9NHb406J7WTGvF%@wD^`;=*%*Iky=l&i zco9f!Ste^=J&H2d^xv4OfBCWn=6pr;oLBE~Elaz|QpJ2{I%_uRVCi7btM3uL!irR% zv54&d)HC>(hx76-W4Va({?OISEbYE~f_COOt_A~1ktK}+>o+P}dbTlb-hGA9k9e%> zWWQ_=v8b5IOu>cS9P6TQ$qVOr$WldsZ(r(3eWQ@yOOX8_eZBleMt!|H)sk&GV5uKA zguj6FK$dk(ytaC(#8qE@y}4!xs0>^o31Qr+|BLO9$Qa$`^hismTTW2f;9=L$y5pPo zB#QBFO{u)s0H=_*^H68boSQJam%Ii@jA}nLe2n|BjknAxaAikof0cyU*<1RS8(Zn^ zVUP>)`0N)t>3Rhj=1)aP)K(mnC$DfFb@OG1t33W6c4ht5S|s9rYNQ>AvSPs2f+4{O zBay01W?bCwPC*tWw?}r36_0@9n#y#C43I!+41x;IXU$v`1a4Wids)-au|4+Db!zSu zGDsGAc1nPK>CH))0{5IV!Pts{dE9a8~a+iOf(+>V1|MDU#6Gfxjl zc)!4%mAyx%hg~mhiganig>kj4?IMTi?oE01SZp!tsF0$^oUXY^F?;WT#Bkh z5HLYzfqdF!J!0;(q-&>n^P9;x0Z}9NfQabr2@_mjsrHV3V3Y#Rnh=y)FRU|d2d@75 z-RXUMb4-k^E(CHjEK%8icTEli5#Yb+yNB~2 zWgvs8Y(9Ijd2pANqc?BHtJ^w+qAGR15+^csst%}juAC_0&?yn(=a7j}c(%RpgsF*h zgicJfpv3&C7o8d_JD5%G*WQa-@*Dmir-Hvom!dBioPUsRPJPb zvC5Xu7rMrc9No1yKZSfa9~}aBF@n~`d#KMi|1AcTtyHN>UI62<1vIMS$eaqA)WIF! zf_*T?qZhp|vZ>6~sx`H=nc+(vPF|22yv@N$qj7Uq&br>y>$ws8y&i$Z#0g36@a!6? 
zLeMYF*`JD*&Y(-~4If>X2fqC7L{d48^|vK`n)JhN(7y6kt@RA8lt!$Hc6jXzuCWTF z>FGIUjn~V5j3WB3<_t)@F>bg~bKxmaxL)A%DSj(RDAE0vob^|d0r~cS zB^hRjx8ZM3roYd6#h-qzw@othC4S$VacJ}XR{y6;t#7fCu#pRN(qYrsl0!TCj31j| z+?P^=%aJ=Q1*6*o1HW_YSM3abc1?c;9Q_zX-!|PJM_>0_*T;b##kLr<;qR)iCkfU@ zFJ^AW{5Y?=#z|>@HM%}v^tO4ySO&Sjw%CV|hq`%9&KvCkCrtZ&uE%3w=W4(JJK(}5 zParAm031mT%7V2;_q{$%eW0q7Eemc{OdzWTun}VV>stTT_T8p38*OvvDTbB09j~*r zBao9ME;F2j^KBvvlX;s>61c$ey#smC8;R;t(=Nfv5d-7;Qr~%SC@S*ijK&%F*2qt^jNI`!8RE}oWK6V zb?=&#hOJVCY3pT`B8j?Y$-+d}m#={a=2?lH_Mz~^=;}yUi}SCjY09gH08f8whr2k( zJ8=`m$gHJbZRIsjpZz7rj|6u~BaIzTuLCDx_Z^a^Co>?w2m3OOAUe5UPL1&B`vSVW zo&$)b^&2vyQk~g{Ya#^REa{;P1z4>Ir)I#>!5Nqs(hAxnS0?v&%yQ$@e*7z(-B_O; z7*S(%oTq!RAhiK8V>I=>_6zq$#uxNXf3xR=TJ0NWYq#fAQ-eN2DZ9sl}s?Cxty zGN6?uJx&Q#efmDHmo27&0{-PalR3{jHJ{q6~BhzNWf#%D4 zrEX1d|yJ@zg5}h&ZE@chaqNSC=+`f3O6e((Y_NmKf{vEd=JeQ<{ zHvGMaYl(fS;*M#Z9Iz4@k2z(p?Qh+V4_B*P(6Gw!LJ8e}vWC^^Q&3CEi`w{}eC8J+ z|CC5y9jJ&qFfpK7C0AEvgtyP#jmR~UkitN0y@1PWq*O(k{eAk=`2O9*^z^dy{z78k zX?pn|Q!qTLkI^y9`1|_fMOA#aNYPN9-9Nm9UkThFN%oq#B>R*#qM`La1o6wkOJkz= zZXn7Jz8?p+N6&JprZF}CdcV}MtN_wk9MV>rHuD*`j~+Vo_gXbcd+(_Bh#j3+R9eko zp256r-h96rK9$PG-cSkP5%WZ}T#*JFAP`5$c3FMk~2iPkVA zDGQb+hi@zfj7pObkr(VRZOLt(PNnOI^|hos+LOl(UM|%WndK~a*AClh{V!W1Zc@23 z!S9pZXozBLIZ!z4BYJeCxf$z%6F?DY30wW>e%wdJenG+(Tg*oJT+3d15u&umab-Xh zoYMxCIYqtp5siFD9)-tBR1)5mOunG9VneD(6=vW0@-+*7q!3q{DKkC5^{lwW?g~=H zMDCHxRKILcM%_od7Rj&uF7^*ieku$kZ;Z$Uq2Cd65FWmn!Qrm5x+g*dY=(vkR?2i| z&sAWJI5p4Ye%-dPLa7@a*zOj8VOp5gMfK;u%2@KD= z-bljD9oj)0aBkI3yzA^IvA0gI?LFUl@+mMUPjbG8AN=bw@HCSG1hEKU=FFRp(ttzW z8OoY*N`uo+z&4 zl$1B*1b>1|M(c?VmM`<;MJLFRfa{2qCop}{L0zoIx{o{w+KIFH@x?-ZJx4L+^zjUc z0zL;`R}&zcYnts~O=D`_*4Q2MM4PKPrNVjf{%&R zlb3CILj+wTorj)HJ>V@H**u7u1)P3h?B4f@#_hSP@Fj%-<~+J38=$3XbO?;)Vc0(f zd+vgrXpgKzxs`v0+m`9^?o`Z6-CR)0vnGga=!#YVPYjZr98TFt*CalM1T27CS9kqF z){fA-^2@mS#vJtIgg%&E`sGI%@0Ga2=ELt7Nbz#nKuI-ye**<_UVR)OPs-}unyxbS zFOms=On_PE#CMBxsJj7!W(d~Fe#{jgr9v2gD_wa3Jby<*vhKO=S_b)gyGUV>n|;}E zEPMW>m(eG!4!D%lz^9Bjc@sl5EB+@X~h8oMh=7@`zkE`mT`|jTeXLPS981l^@n9(ez01 z)a4n`UD8FOG{JcPnCBPDvm{+HC-F+HvkLv&ayMu+BlaReKReK9%Z5CiN!+XE;fTs`31nP$GacJFCs#*o{wdBrFDIN`s;;x9 zwH_1nvzkWn!J6SDxv==N$6@e9xE^&&xubJotgrjVK{D30Gqqd(Azs;Wq-G*coUGCY z4JA}~J#DaBM29|IdqWN_?jwW>qq-dYpIwMKDg9K zfqw;Ox%l(dz&-NMj3;3p6xKcr$yLN#FKt^$1WA^U>SOKh`8*hZ6+bZ%zRFgBY5iT8 z#f%L9EcN3yBQ}Cz>Dxzi`UFmXrX+0kRi|JKNqI_0r>WEP$Ic7 zeQjGklZ_HxMsP%2`geG~RzIj{@j4FO>G!AypuS3z!}aF6M;tgbt0BG})j*zvuFs?z z@*$`*3KBLR8|>J25IVtmU>gL+$fU*nC=j(ypoClBYH^=ozQEyP%V9IvG`OmGt~>6` z*n@rTTk>CQE`r4T23t&>#hZLtu$3jks85Zwq)jGBaa-d0H_PMB$QUjl(G{tL=i-Ao)6LJL>3tWs_Okl0W$IsBfJU8K9K&T&~~5b>0Hz?W8lq)P3|5TbT! 
zJ{5YRJLh8@4{bhtWnEE*U@pYb5wwJC8%Cv(X~}VIk0?7%l4oKOTN_8n27z{e%zuo&L;HrjaFlBXXZYQg|m-!02xr6TK~ECFak- z)|AS2lp`ZwzcAw(e`%-O7lBr_TdUl*`<|}3#=<>8OqLA`rpT$9KNzp*2Ypuojr5QByOS2Q^BA>*<^6f2gT2kVW(fB+ zgEt$g3B%^ITT6_H{EWrX=#gZ<9Z9yhi5iM5{wQAvLH<+$BpBmk<4YWF``%WTa&2$+w+&XwA)FIH*V$-VW zSsaRh$P?x0$!ltD6PZY4Kj?#4Y$$Xr^mu=`V_YQ4-g$a{tOv(K*qY;TeEb)G zNFlPxn_&Q?`$jtl6ZCjHDI-ddHDn@ue$Hv{dIl^;P0^lc6aw@?UtUk2cVBKLLnRd@M8@-TJ|th&EAf(RZ<* zHkx_8mS4ku8)&Z-QS>!~fXQEM65a{H+J0^BCo?aS zw*^h=bP4EC#^^CvE zg2^>Y_j)O>LRN7PN)$O4L5$rMsUg{yVP1)oKyR;h4(}Il%ATrct3Wa5(&gBuBtuID zj$f(~CrdIPRZqLri(r|ZGZuLW?#BlpJ%-$9tn9|<|<`?wCb6mv&w>w3^e@F9KEDM!ZIHX6Y@*D55+^F(Q z4@{|BWunGJovkS_dFtUL&au6$CL6Uz#&`#`F8tcuKkWpN|u_Q-&Mq>8Ke!1>EkHK87oaF84!U&h@qXg{BOH$z{v z%2%IrPzS5&)?vG-$vohTo*d~;e#NKd;34}S0hr(rEM!SFcFCZ~-8e7BS97Hz1W4iJG2?28$wkihmMPJhW3 zhTqF+9iO9BCFf5s#=Gy-9aw~j$>!0WEx+*HD@I$qD23exZ$eF|0pIQK{YW+Q`WiMT z>sbKmy`kTDIYx&`PSOBt#|Aqo%$ueuSTvA5uvTqxNM*(06Ro$*LKqGH#L;SHx_eh@ ze!L>Vgfc2ob0Lq$#P7*wFWL5sJ?tBj;edS|qJ0u)mAqkWQ5B_Q_YRdcqU_L?=Nx~* zo!HdGFQ7K3Xx)_$w$T?wgce_4wa(XEaVEHbE(Tqtlgr)sZZj(6Q}=W=V7zEM^T}D4 zZBWLhT*W5l%XARDPw?53#!RNQ>6!GhfyyP13cX6#&csg3)$rRac=<8*xkam1(`{Ec1{#~GmYASem7mQuScwG_&ZA85ZWO7FTS-nh8(rU+$-bM&ahlNEJWY6 z;M{L8z;>a=7sdkhnyPjW>Ua9w>ZV8J!Nj#db&yyJAe4NYTJbe&is`-8aw)~U`XG9i zcp-qvl(H%YyY1d*e0_8=d+c$Zh3T)EHd@wL3FD{gg_|5AQLcv;6=k(qFD*;SZPHRzJ7)Muo7q!(3^n)WL!k8f8MSfC)E%$*&DFP%weM+1jn8ox#7O4=?BGH<@fPQx16k*HFumWZl zZu#|oZG3%Zm46l4*!(BkphQcQ=1s;k8f+X2*?Cb+>;-gUEgS|tUu{iMoL@8G3-ktS zZiO(*CB>~pJ&5Jf?j~?)xjcO+>3iJcx(7bY*}0RwKk=g`WwEkP8#Kyrkkqn){w#zz zpU1>j_%bg?U2a6@B6tV5-*93Q@Hr6PiUk6FoR$TF@>`Iv90CLnDhdRn^h7b`K-4*a zU7b|O4(>F8W~uuaqatBwHK8@X2}R41VYSbPpU`TOl%&q;>I4;_-#rc?a|bqfRX{A6 ziGqViRaJ-u8`wXZY2^S^tZTNv71GA*yKf((fNhiZ(IMS#O;oe+rSeeupwl_UbT>;r zHP$>=oWa!TJYkj&rjeQ+@Z3G=QM-Lt{00mf*MWr7Y0R&$c{~T>M__yr#0Rb&U%&y9Q7@=?Kex=! 
zL!$bjP;}wy_^UKhD`~9>obZ-p_KC&&*rAjhSNdE4x*jkTLF z-~@@ma^+CGn`3q~IXpwk&}QJ7C8AVVduF~v2>fYLIdp6tZr{-4&3y8DXPIt-`6HS( zJqmV84Mbn-oC3TWucR7gdzsMk3+e%Z3rVF~{rswxx^Xlj0q9)O?aYa~jzd>I(vo=K-Q zLlutx`m8Z*}AHVaB>frXAQVrmr9!Jh-9s(=wZP;3vHhG^}s-{(Tfd6dR`VO zH309>u5AoM^Ml=+kI3Y%($7yvEFi201SaLBhKQd>WAr0 z-M$Ar!3PD^Yi_KnlJImXOE6gP3|l!#r4u*MOp}Z8KSWn3VY$*9mngXCAS^53V|BBP zx_Hn%xL=!utkD3p!^HNH;tLiMyO8SoVKj0YxJGnR-eZX-D5&|E7pmb|w znKQ#y7=vBYUOIBaeWj zUkUco*7On!vZRgFKZGcNI67M6EqcHvSLK_y7PpkkD4}$H)Y6i(J~>HNCI(O!(`Imb zZNO7B>lY=;=V>`m_CgjU%YF!_%n?G-Fr_+Qi|AX#JySe7y0T>mWeeG=taD=Ck^3=e^SFG^#DjKfb)L9C6NT$Lh6qE^{F{1bl9_E zoQX~mh-dEPaMmM;}-Z zyA=;5eqgPvTs>;Et&^=yxT?@8Aeb!C2!pF~xhh^OtGR~8GKPr?i z1b2R@x!~QFN2Davwxb5g;xehw)G(Bu*PB-KpDzqmJMWa2W0Za)}bBzWAtW-I0TS& zyh(5?MtA-(WxN1W#`>S~2w*YOpeFcfdI}oty-T#PPm{Z%Z0T$AML1F(i9U_d5mlt* zPX}5jCoMvIFM^Yj#@RYC(R#dtszv0jZ%cmsLy+{6ibpj8YlXAB6dDw zfTf1rEiagK24*l?JCg_pI#F9S1FM?JenR`;!8IBl#n>uy#eI!QmDYmND z?G?1+9(uID4HNr)K6a`wWy^}*lGKAZhC~v6wVdv|YGgK;5wP98q!h6E*5`);=f<)r zUI94sG6QydO#;SngU{!IjPF4tX0qw-i)GJz250SR9V4RhnBg{TgyCXOK5l_C#D3al z@#df>1S9sDkmvJEYUAOFekox4FN&(e;?bjBPw7@2ZD&W^`Hik3bCvslEP&f@GIggs z0WzW^xfDjOz!?M(R7;fk8%eUKwH^S1O0N6EfuYoJ@VolGDIptsAioPhTU2*BG-|<9 z493n7XdX$HO4XdE!v{d5bH;X9q7%n`l4OlARfPCmoyk#EQ>15A_uiCN0HX!mH-8DZ z5$yc?VS{$2#eFeg)0Mjw51WrkC_c3ydv-{b;ZNRO5;5D>trXB;;lug8 z5`8I-aq2Nb!y(|876NC>06NwC@Lz&8^N(OHvj}1j87|E)^V??}c*+l+qlaZ}L(TMg z<=KM?snCiJM?0 zbmtCBM4fTnArf#cZ84@*@7=Fhdab1`-9rSF4ve~eIm%BD^s28BD`pR+*4*p6tqm;= z;Y}X{Fk@IyPkcirKjg#Gacy8S$Pm2;TX7Rn2m}tTYQ6&T#@b(_WD`w+*tXLYU@7Ej z8B&NiRGpoTlBK$GJJCr8UJ|}z|UXsqdk$5={^*EnVEDq6gaz9>EILnJznjM zDNQ{W6}6c}&E{v57zh0{vX{bmoc;2oKj%kHAMJh#dQFHmlLT6Id??z4houF| zHh}XT@Fs437*|kGyQeYbs>i9@?aj>npwSwU1XaC#Wyh{?R&HnO^Rh&E~G%K zmuW*o9OLgau5mVfm`Neb=+kGn_}%^(nQJ;}z38)z=ba?4H3z11$2L1H>wt<}3QSSu zO-+~H)D2pHo1Xk^A^nX*s$v8l+wfy+7u_R7apVII(V^57s6o{;OFDpwp-YyPenW|( zF?zfh3TF{lD(u&g^(+(%JWaL@9|g&BvSjMOh0OUyuMG$rmXCNpBQ70MuQgD>=g@3oI;1-vRP6wE&#(pTIeSh4njC*=`#RJ7tTf zg;wH8b}{gMquNOmIaIspxM%r_Jhx+#*MK}oTe4{VuGKNCq`R*UKjG4OSUwpAa%uC% zo?rw{u}Dg`tNjQt7*L60ACamNqB^%+wOp<$0Ljr>vPOIgcI5YYkV_Www@Z(0t;9%+ zJDgoBWgCf&yh1eC_mKtMojzcKCM7&9-9b zX@qZ#Eyxo7_6cPT8$LRmy?o!CyZv-$MapPe0|XM~_yE;QkuUBRfZf?woMHylY~4DR z0VM_oocKivpynkQF@W|N&|Du97pKUYNnLEa98sQt5G`{Jn)l`Zv>^kogo2V}!(%c> zYb~(KaH*O}4|%+HUcV&LKLDLRCBmj!Vi z<0@N2eZ>~Qah(&mvK^4hbVzLh^S+X(BHTfLBJlx-Kc(NGx<3Jh)LR}MZsK4X7v`vNT(^i}_D7TBffQ?V`UoWB6`w8IiZF1hqjzmo>N{EfsO6>ZvPP1MXhx@dRhITnCYzA z<1P9NEI!=RL*J3ILKiHatDv)O8HfXUhP;GpS2}xbW#(Hjttc=SH>{0nxkyM*_=7%K znqX8&!u6*Wj@a@h&k?s;X$$d>LH&;FEb6qYY-m0y|9JCQFW4T zz>;ER-z=N2?!oKV)O1z=hr?*ZBxXaP2fs1f;r6=I!OykBzd5g^s-K)pZCcF@&r$Kd zlg4@5hOyI&|@TMksIqIV9@U-2>Uj`JJ{t{p9S?q2yd)hDRpoGd0 z6om<_fw_`P-hyt?Qm+9{!nnhufW{{4SQoS7qe8*#Gy4g-^WchXTYEPMJ1y7#ItSrw z%?#_93fsi`?q4{I@#zFOP4o+4WN9VYs0_pQKJr|;^G2dl`duf8&X!G@z!p&dXwu`1 zFJE>!u5&14AJBg5aGqb!_85tAUrhkZ_(<~@xNd@UegP4h9oU3HFmSp4-k|1?Id zqeDpv?s`(7hjUVx3oTOuFon3gD*)HYsAi3&uNbSmRaM6*nN} z%ZDcb*?~BI)^LDh&(bAT3~g|Mfvu?12g{T0Rn|;iN|-Enh_+8lQ#NEo7!#2Qk~y$g zWU?YIyqRupO*!kL=D1i>s)jbzAqq4%@6;|bpY$i3aJyC;#;B^#5R~L!UNuy`VquAYaU)efv!( zdV{+oNu882kY*p8RseC3q|p6Qu&bVBh-COPK5!)!@}r_PR@rTR%$rVId!XTF)w)xH zG$EeG=fUJ?n}NfAWfP6_y9Q`8b2YEVc|p_>lXi1~bbQNN+eW~6vJnTW5R+1Wc-N=+ z+f38(T)>!mD8k97X{>pW*~(ui*FIL4)1qQoFm|B-?ewRZ`tQ$|Y(Zt{*SwU7XpvH) z#QDjgI$tD9()oaXdE+0`l5~Oq%4y*x+@_UcN9*DA|gY?qfJ}&U5XUP*$_|gub!@|Js zMKVTN4+P6NAOXT^RA4_*7h9@35@J~y9DnE&3w`@jhYk^Rz;)sxI4{=AwFqGEx3bv- z(2?Z6J+I|-QZpXq^R;w5wyq!@a6DPkU}-N7ek|NHdDa{n!1Z4INp59H?^GfbPTvKH zXxP{ixve(()?{o7j+Rj2_*7l&0w4I-dE`fVl|9t&a>af_wF@yGlum++rL>lmSe{G+ 
z;n=lPvU04ZC@JmnLs|J)<;?hKP41h9l)TH4=ZTGm_PT{(y%2Qp7CCAZI9~Xj`Hzd7 zRv80tNE@!dV*#OF%?bB@2%XnG8D;7qC^a~Hr+2nQk7@D*Oimpoh|d@RiouF{Whmd8 z<9L$Uw3hi9gj)Tq)NjcdUROrQklS}$GoswFGw8$cNYK|9OTWuaqJnYH?I25%=L=ed zjbYRk^Vie@-Y1gDZzg@xbw%eXYkCnk)2Qev$Pe}eOfJARKURW)PVuSfjqV*c>=SUK zq~8Ig-@kuP@rGuc`d;+vg5bG`^MxPJ6@GkEe53H3zzv~@H)XeOg^k`ip92wUI3Lv` zMB8cBJFGB7k_qr_CXv|>k009w2#AS^i-{Kr6kF^E$sm?Zo4a<|XTFFD1PA}6O7%aV>$O#nX`kX8~zJxS9%~GcYAE=MIi|n zfX~)Bkh9eIFMKvYJDHTA5o|-Lx#$D5r)X&%_cu;t?w89= z@fq&)q~9$1MoXvm7#MMv8xGwOZ1;D^*UY!u^I}hS!a3qaTb`X6^XHHJz(phT>b>Q6 zHviM(7jjz-9IwywU*(HeKFgM>VA_zWD>Pj6OZ$vp;n~vXO%GZJ8hW$y(ek(D)tTy< zgF!tFosqBFKA3SD)U_+sE9N6Oby0cD5ig3F8#bPXIaHo0W8T#>c-207?Ai2!ggWwn z(chEso;^#DciFt>6Z(m&JI5*=Ec@(8}j}YZ_MZ?NA-&pk%orPW|G!= zm7B8?$)7oJv%L!jY*z_wwC}=;^TJmRo<||Ax0IKC%L`t=G@mlXQB! z_kOnH-R8B=SDHR^@k)zBuBx6?`tzC^dYSUO+P{@yb5Pkn*b_Zo<}9L{-*o4l86%4P zjCI&~AC?&k!d~0;>w7fzf701I))~2h574CPh zm{|UV52SrCqc@FTwPUM~5|yw1O&+Kdy2Bdh@ayU0bZx<(NxvUbua&}Ll65L!(QeUb8h&AWU+4*+st#Abenkdy>Yi58x zgjFD(*Z$CmErhT%$KIAm%xot&?hhaYYBAfPoEGstag8XrcFAzN^ilU`s>$|5VMy?$ z^_Cm{TQO!)?TO`2eJkPdf*wObiMby*3D2We8Z0j+i0#~&!GCz>nW^maAxYbUCxEA- zcL$gRc2Vp=_5q7eSwJssS$<#slAS&|#^&N+t@*Vh@A3ykCYyWa$6mfsJW-XN2@{*g ztI&J#4S7AXfod1?{A3inYh(m-2l*aBW~u6Vo;MY-_G>A$@?<2k`Q84)VfA_6IdIk~ z@p-hjv2nuXMy_eU1J~o;9W_YsP)}|oZP#|^t*rQ~cU2iYPs}=2Nwvn_e|Y^|iRR}O z`P!n>wFi9p*tK^STO+K&^doVn&z+<BGKhX7t0?B=~s znAX=9-6B$dnp8QwL}t*9%QEAHyO-^EwU2Sy2#apE@_Vw*7Bg2jiPZazSNHc!2&>X0 z5pVSCU+=Rm-FT$LuPhK@3?N_{!*x&|#rAgP8kf9GB=EOS?A^xeA`Lq%f6c_@-`SQ1 zz$u~hN{zQtZu-IWvyF=*Jr33hUC){oo{!GAdT{%vK7!pjXW7WbWZTM=_0i&tgq+Jo zG;7#mYWSu`j^GmYfK4#dLA>LwrG0jn$JN?)p=NmY=KfA(e$VXzdW)Oh^0TAp3|F7r zFRK%vagcK^M4gGd?_y7rciV@CWUQ8VB zX5wlvZ=x_yN_!YDgp2;{Y`E_N6;4wmAj>$7`5-L+I=d2&>0bBM#F9(bOtYBbzm;+x z_!#%qksD$ON4c{V-Y>|5Lp*22>H0pm{mw(HRG7!D-B4_HG1Ss@ogArr>HK`ukj}QL zOfIZS|0@?N+e z%o`K$a>KnV!i;f;KplH$PT&Dsd<12BZa7E-GQhbeA zTOAM2sA^h$y;d6TjOX2v^TlwDA8xsU<%zCfr_(@m#x>jL?8O~&I_=5RWP0A;ph|z9 zw08Tw4ZD7vyM-juaKu?FP~6!q$w^KvLeL zT7){f+{))!nKAhWJs#Y2axSF+OPc^VgEnrLv;&}I_V&K`mziNLxh&y4%xs4(hN0RD znd@c-V%K1?nVYshsC#m;!Ju0C?+Qto?>PP9fHl%z&UggLVT&sU0EZ$M6HT&m>kb?_ znt|n+5MxH47y#^)>@ARQUL$ZeeMXXf4}M*JXtONu`DXo0_rR)L zgyuES*7pIq>uNGMPOnNhoIhG;veNv6NM~^5duzcU%lU$$mw-%}ye5HAC?)*rmYm*t zrG0G{cWJpDJTM+}iGz=eI<7YBlF%H0vz&LLt0&v83n{x-R$?rneHzT<@ZuPa-8Tlj0V1BUT?BF7b0vk*E{Nk&Nnj7FQ9uiu5<&fu!2PAo?sq{k$lP6q)D%r$^693uWZgk|K1uq! 
znf@tx4;vl`5jo^$JAJ*&<%`@QLsmWi*Azu8bG88{(dDNFW2Q&qv6i)Fzn;o^Drj`r zdYg{2x}CLeoUAw9i=%$K`ArTOU7V(i%BwY7g(k=c6XV{|WmVLhP0fYW_5%4df27?y zUzEm^spkxe!2C$0x5_I;V{oyXx2K1j=cy5w7!&|Fy8c1enCNoH=X}YCy<|HRu6NIV zI+@RZW;C<~x+3q859Pi?_A=x8Q=%^dss+G3pqQ#MtpHhOxxtjR@DLE0^A&!926x*x zKL=J_Cl1mmVK35};@eX(y47J<}=`Wy~qhVzF+iW`Vts;Qct_Dn%5# zN+rQ%N^v7+z0P&7URn>HTXXj!{x=sei_F*n7vAw(V>r0YSt#j03{L(CIu8IWVQqKY#fB%_g%K1+T%|* zYrJTAYqtQ1A|RG!BC$axiYClPpSeh9#D{T)yTizfPsFQn75AhNUodT_gH;WAP^_-b zx|Qi4{LO`x4!x9qJw#K2PhuDwpN?Y0oX)FsA!n8dOB1a9q z(fZ zk-BCE4;5r*&?+8?AnsLJT}V0o#iRd2*n7vb{k?I+-%eGER;?PLl(sfS5z=MVs;b)5 zZ0(8_yH%rRjM}R;YuBhPikcCd*oxRIL`VqkPx}4+?(sbL^Lqa5t9X%f&UL-7_c%vI z=9A#15a8e)aLk8T0botIF(&+aMjCneSiH{h@Azj%-M`s>hhJBTj=Z^bj`v-DCB6bm ztyDjN_)yq8ujw-MllP%&ym!>9*SRu|+Gw4)0zts`v>eLqeQ+TTDUg8c*i-xMMXK=e z_2k{V+Wut+t|r;mg^`KiayrrM#}%be)E;ajz#>sWbS7y^LLI73kD-N0$~6V0-`mtf zZ|hXKg=+Gr2Dxl`rzBWDb?-{XcJNCvS)R815r@2)Xkhy3Bct;=09Gky2q*5`>F>6gwq z{}ZtS(~GLdQ=$KW)<`AT*HB>f+=E5Im2zVL|4TGxurHG{@ww*e?lXBz3nvL1mG%Z0 z#vF{{67PpLu$_Bq&tM|ikD^+VyZCQvBhuG>?(1gyomBG**$(}@(1XnU7}R(mEZ1LF zXj*@uAsy)13X<1Pb_M+lybq9dq6Gz!=D4IbAB#_lO&to`8qs|LLS*2?nI1~sGc|`K z3z(a`b1)wpxv|0QGfZ{VfL|V;8jnN#>16pkFW^JnugoJofiF)C7lKabRIvC7B`j!S zF#=GzVsXzvrqB7wAF({Q$e8K&iC4!^FGkgsa%@ql0e}tZsrE$4x3Rx#pQv@w2ybLP`;1Z7(`RLUhMt7NdLVMjN}Pej0q2)Y@c)Q>h}`Dtk~WQWuyQK=o==m#q_TbbH&$(qSmB$77w(e1Lug?!0ZcA3F)hkZGuk$0zm?{;CIpz~H(V2MUK{`;>)fiA(S zbJ`Ub(-cO3P8hndsl`MNOI*jFa{3s~{xPJF+pv&YRQK1bZhBM`dGt}m{hSxtUv3At zRKJ7kWGcB_Cp^BKJA38VQv&Y_85*(cM(UnTv$t2;SrNl>s|wy>w#%1+`@?+NJ#$;K zg2*Vy0!Jz|51UUenJZ2^yaegv1OPP)O^o*^Z2K)>jizqF)kUkK#7wZ=x9d+H!xpa9 zaZGN&_af&*|ND6Zq>9}Y-&4IT697(+Ew13K8#9ghi#T>tN?nfF{h~Lr=VnimEn3S8 zepXCGn%kC`**j*}9y!ZU2XMZkudCk`9S^)7-qd-NuAEI}bdij7AdC$lpFbfAMq%|@{?%a6 z)VryG`k%H2@zq;Xk+!zrMrS0u0>Tae*% zz1zUB736&+Hvg;UN#rKu$z6*FfZwsgsqRIX>&mXSM@BJCSTDno17FtT^S}pd_zTjZ z76$*B6pQTr0$X`SX`U?5B~h+%>4gBcyJx*VuO{H&`E-#X}OJ`xjL5;Fj zL&+w=5Fa~pXwz!(>D|%3h3yyoPit&1O&ilD(-F(oI)VAU%0W-_c|EJQOgkFMU`_Zh^mhvWg3Fl72Zyc^18toU{%5na42sUZ4A_ z*{RGw;**#5{*>2)TG6{9AW}@*!-pBcrrLH^0Hxzwh+YO>XoE}JBV2|i?vj54smGTs zUj7{xEvU=-g#1Lo1#<%KeGROB>AR?oNrAH#9C8oUGqbBHwB2Vd<&z%o=XZalAJgu) zC>75BJlegpeX3h+$Y+x;)z%byhYuQn6;vXeoY{#z_tl1qUtjb89lVJh%8W=k?U;v& z)|EbKZW=B_VuuW)(}rT6ALkjvggQw~G|#JPne`c}OC^ebq*QNFI}u`{*g81&XebO; z5h*I)abPM!QW7%x|Kg(Y#3$vawW zuU6VlE*`o*GBOJjwe_7(3@gD}NS)KA|LSXi9-U~|QV*g&c}9p0n|^ZZ&Aq}>)b?`4 zi0r?k)UzR`K&HNEyR7u#C(XPIwwy^tSJdwkD`K$x&9ipm2kX~O$k4@wG{XP!=9`4M z*xYc3q{N=={1XuN@sGOLJi|ZdfTN}!eU+O`u5QV7V6#};6|T3%9ihzAI$gIN32q%y zTQfe631@$ZRz&M102A8X|6d<)Ce~$(9YLunHFtM$MpJSW@kdnyb8+4d>==94XxTFiAHOt1RbghUC0AH!s%`}=<$F6>_4IO(+Y!(RP1HychQlK#WpXdl~< zD~4{YyuvL|0l5O~l*jcc+{E)*h=SJw9cYhQRz3V4^b51H`lu)YFr{+ou@^JI_?g+I zr+wtBCw~Wr1TVxWP4BVk+^V!c#CFjStzz7xv8U_zM*wwz6+7BR&oX z*?v=IRSBVp;Y_^kW9V}0cghvo=M4Wg4Eui!#C>e;K9WuRiEi6>`?&u4^auog1hcdn zVpX_xOCy?$OTM0K?H7A_+O-6E_{l$@cAY}<=iP~h@t;2UeiYIg-E-Hlo(U>i&03g| zN1?-&&=tZ4r>1>J=4Uo`xuvem6{~r%Ka=4lRFn2107F9Q-R#9Dh5Ho67>e=JQ&Jj(H=cF#2n7uI%Qcr8-7DIp~ElI!<< z0FiFyxC|CNnll%^L+E&ucGj{7pVu++cecmX$WMlIZd z&xSyO!n4ugF~If3xGqD$&Kr3hxD>Jb zLEKy3i7uLXG1w`V`!+_T{vaa)0&LBiK7y$iRjjj%*z^1w_n>08(a#J!0TwUA7b^LkSlJUl@w9i0)ba$70%&Xtgxt6=W)@c%GA;i~(D?naG3;+oSb< z>m5o6TZjmqvQ0zh>vLyUT_VbpLY#tbO$aVDiT|db1aOJi8)0wM5l@9RU51{LBNxM~ zu)@^7r~+lbOC=PlD6@OQR{{t7h-|)L-)^RIYrjce$J~Un;-9Xne#zq_#cpsw)Uq=6 zphMK}73499H>rtU+#hZTEGI5)Ah91H6)k6*II()FG%{2AKpbYu7Jk7E4zx6AjpAuv>SEeh3m`FVryu5W{7| zYh*b&KAgB{W6cSUdlyCoSQq*JVKhToG8Kk z60kQ*6k*Z-l?I+N|E+*s)euUqQ@XxXi0QJh9?#P@ZZQwQV=bk*+$(VwE16!Ux+uFW zLUL6xX;Hf?q@j-7-_KFg)aV-dU2DWF$EO_^C#mVv@OF=(Gj8=B1*YYV=LtGBkVu*r 
z6(?L*7X{#|+$Uh%H%E$d$2$kTW1Gv-l$M69HQto8{yRHkT}g|_a!uIK5Ur&ziB3ru8HDM!x zw(x|mfq)=)6YJvSq17d`+oB-=9TJh57>Zl5G+MD{64sd7zm@BRBc6Q%O0z=`c|kQ$nU7K^`p$_WJ*#BfN+i7Y&rkfRuIbpxpupKQh}4=qnKK5bVtJ0WBOn+@nI z!UW79Ji&|SXWd`q-BW7@@Qx4JeE1?`!q2Z7_Dl6WcGnl4Gsf$WB=oIN+xl*B_xCn1q~j+YO_aX`#46h?1V`L5cDbKSr#VEH zjo@vnE|0~cEX*9kQsTPK<cjS6w-fj@*6;M^+)y)Kbylu zsh3Okg{)vO<*^b&ZmNV;AN%9h_lQXlJst%KRRxpiW|=IZoO}M^W^R#y>>kpm@Av9x1)SvwN_0- zZM_4VblULsLe=aGQMp_f@pQO))Y{~PinRZcR`Sq(+cCUHAGl@U;45M`z$n7qN0K2< zO(b@sB+LpQw&XX$hgjzZ4?fZWX1P^d=TPp4rA7nXy^>m2c_9oi)b4+38Fkff&Tfte zX3RA-Fz?M`ooqG8wO&pl0%-ZeT~Ef@>NI4S1_#!n!<~PmmwEcN7aQAFlIBAo|N9tY z^d;s*<;qqW^$N{33 z)RD*A{CiN&cJrt+oj>k~im=IEuF=8j7B>A)aCp^^(24pHkycyld86aJWtA+COg0EC zh%O^dY5N~%`2D}yUXeHE(ZS7rfw>eKDuJ*1RFo0Dz{wF_kjwx@Jz;(y(PqG0QL=V@ znMu%;t4PxH9#x_duiaQNM_1$K>Y#;~*K!WqsOQ9L!Q$w}+x&r>)xT}4b)VNzG*M~a zEB?-T>uX4%fhE)TP~TJlOZJzX{d{@0**Qx?QQQbM=i{D_G7PqdiEHg!aCG~Ca=e4Fgy`( z=8z^Mf1>3QY)3I0m|nU6kLh)?EdFx=MHI7%q7y1Cx4uA@mmB$ds=*JHS26*Xlro@R z3*H4Tb+tmw8iOBbT7Jxum)Fc3h_&DD)48x*YOy#wo>Js-JJs!jr`OvGqP=!NDy)-W#9ZWhyZ#H4s!!n)_x(?PjO}?+&vkjMQbwq9 z1CJzN7qJ|63ir&P@_wnIm+H9+OFl@hf`I0`Jtv=1(?oelKw#25lR0dKo{~Swz8W?4 z2NCKHH&es!P^4d#ZdvchMZK&{Si%Xqa}VnO_y`D=g3Fx~bP2mxcRLtYViwCIn0zXj zcYsF_^iOB7pFH&UcUS&|^Fpb4>s^-Dg?Va>x%s8+?w;xaO(y84>{{}a|AHk|a zT~`MZ*y`EesGGcc0fsPXJ8wO^{AX9NdKJ9R*Bkel?GZdP=DKQ zcKG$6k*;tjfrT*fu4`K(4)ydldeiq$cn#y&0!7SNTB)0r2)~{CHaaYoQ2AMDDCRie zm2hA5)OQf!rPc>Zh>M&n8Un*Uy~9VFb^0ZV@M?{Xidqh~Ou1A6_6+e#V;y zh@(!$-Ax01o$l9?dJ@qNUE@6L(j43&&6g!BQkLefL4w-|F9nA`2>;a)r7IlqR#2ZE zwB`P^esYGmK=$(P&{0^m8LnP(MnK;GwAgP<-Kc2_1~qSWbWC;=!W~tA8(ShfSp9IUs@j84JVxKVc^9kWoE(YN1DHj;~nN4xBp#oUm6J1;o z-{+d;>l}b9sV#4;$66|^ASk%JAaVht8;k!WY-jl`+^Eh?hP94NZfyfY2o@fGJDBg2 zI^%|SAm|p^XVypf#@-5;J}73%U*qW9epFIn!o*YKG|t}-8t0KkJ+S#P0DBQwMq>NFH@A6 z{}yLW=>q5`kKfCL`ff(>lcdZDGqTML5cI?fmcd)!sOP(D;nV(sPSi(Q3A=%&jOC92 z0kLtPkq1TV`6s2_D^0eCL0_W!+IQ}vJTot`H4YgHM>82@+4Jqs`dOIikZ`Fb4*A!0 zj1VSPYojpYO_mQ*T6e2gCpyUl^>u_qrT5_ckdnY(P5*wD;ti5MEL{=+h@foyCW3^iug z`l-KqaAsLsxwKrbROfd8^IzfYa2(j=AIbi|p-&ZhbKInXNB>3n2LPLl{R?x?qYlp; z49B`>(+|6)eQ(b$$v_Oh!c!CGgZ5U-Esj2~+343>CbMfK3K-^junK>n&+qHO1mWuY zB+!mWCe_fOS!wW{{HRPiUu>0c)&3VmIIN}1JX!p~ZDfK#tmp7z{hH4#A`xc0?US7C zZmA}fkMCPp%d9dW}F-FLjyVVJ?`O~ZzkjYc~DSXH2WHF3A>%<^=g zzVDqrZv|swVcky#%*EYtHScz>{3~RlLGIuNacj zq-PcI;Y+6@e6#CW{guF9^>h_iw&BT(`Q${M;+20&pGK(`W^vaRUe$InIYK5d^B8)4 zLDj2#$1mqK&LVazy7mw4+|hXTSCHD7&GrWnN3_L{0#lV6+K1n*zWYztnKQ~jjnz_9 zmLe*@Y$}R4v{|c-fmQxeaUu&}&$)hNsL$8n`e*K{LGQSj%ue zSle2!!ABH*@w|kdV>=>kcx6x$|FC{4 zpZjyUTYQGb#$SHj=bm;ZmYaQoD`J(Ri%cuuav&RizQv4^zP*;V*Nat-Sl~Z{%Jb)6O(cdKwFxVX-l^_nwK*3;f})2 zYlUtVTSuK6Ww)h})sg&otzq8RZRP6&p$Uwns{3kdGk2D0;=+%@m0jl3v*DxV@G;Sc zowtg`tou{LYdstLxQ~juW{$qM&CJz4)nJ#?Q>{5x8{tog20W%W6y?Mz6*TCO;`H;$ z&vbifP1>&rOR;cUEpwf~yVy*5&1Tdj%Vk{X5N6akw>TT#L15~i6A@k)hx=7kojDHF zF!LJ&%p&_KRIcK)!;Gz2@M!Oo`Y4qYKAYri`g^0w4ujP(vcHrMv6UGka?oU!tUY3^ zK^{;mvJ5Y9qlTplzQO<-h{9!z@AW*km+~TjOHjSIjkb)l^p}L8M^`X^X&kb~F)S_7 z?3>3b3VE=JmJMs>QaS`HyNx}`qTF5bk;}Q=70LF9JRKM&$oa_RSP!F%4em7)S)1)a z>?*DRa){SG#q@I@{3HIcExarG!n+ljXWtmiD+|)f4en>pGVzF{U4q?tHa$HOw}t<> z5!eDn)ZV(%D)rebK)5MP#W!WcG{y9Fuv4r&u>Y^^4N+fWxBw|~oU&ulB-{zEeaejP z`>0xQRAoQ$G8N!Um7J4w6b`>l!I-OGyYf~9Y$<n2fh1an3y@)Ruh93IL zS}8lF=rus=W{eOb5`xEvpn*#y^8`YAUDJ*iaJ?`O-{Vr0cIT(I84!5lai@8vz@5|!Dr{5czi;%>lOT! 
zZ5PH`Qj&Zv9U^RmMvi{dG^_6eh3`y#D1`Cw`T84;iv{Yo?;jg;}z)IlOPvYc% zi~prgv%&>}b@h*pDy>3{{I*Iebo}wKi3$bxVYKHiB>7!$kdwigx|&+gKBiV6N%l@S zV{hDbU+O5&;;7J#fLR-B-rf~Hp`IM-HE(^!)ds`m!UX*uktI6$<+W!=7~pcQw{u(* zxL@f9#_ii#y_gE-a)XXU@QG6MX|mCM`fKf(t{5WkN~ovx9(Vmko6MY6De@r}OcWcI zZ$Wd#sqh*D2JJF?h*Gc>VBi5_=q>qIZFVqE0oOxsfBWlckOhnv?_D6MoFdIaRjyw= zS8XiqP72NH_y;hXAWObU>Hl~!^XMR8|46!R>Cq4NH#y!&sW8CD5OZ$)#&7G$a)F`7 zPEC|I^n=H<2luGnDO^rkGUfh-Ld(A%bEtc{Vzpt5Lhcu4wIusng@nrbS}yHNpH-$N zvM028gJPQweRaey_O$F}XV|?~UYTeb!e?*Cf0@_s5}qf1;gd7*TM{fgWJ{GLW#9r5 ze})$+;-^voxWL&SI8D7!XT^??80_19B~c#RdlV`gi2VcLDUqjAERP=C7!q#;I4EWO z0({eHByS}CV&g9awv`fGA-2X62)ClcC^a#1mtGEFkuT+txOJa1hO5YrpFu{g(e^EsVbg`2(Ru*bKdJaPs>Yo)%A4 zPiY^e&N|k`plcr=q$(6QNM+r=nYHOncr5*X^|OHfgWHKtb3#R~;+{qzR=EdC0PVe+ zT>&JPrG1yvY-LYVrAp&sE>XFn9F!B*kEZ)VIL6e|!Yj7Iw!|bQG=oEyyISiOp)aM1 zY2{^jR}?#AH)0r-Se#gMPRnRaU4}b5u5ADWx_PFY-9nN4?BRAz4<2o@WJMZcH%&B! zZ8*Vf+5;AyF57#{iG8f@xdbFwc!}kPAs|nRpb`30Uy~?*eVO|0^DBVb17#y>}1YRIr(=a-&`*o)I>Sl)ch%I z+a4*X8s4EK^OwgvK)C3%-)s|?t+igdZ)%jYNLPekkDGQYuOZ16Kwux;xOW6VK7)!v zosqJR(J0+n&W*9sSOqG#C$@8PiQ`FPgNT_8ZUiq$u{wrR);>;*e;;#A`*m zs}^nv5&St+GGN1Md3_5sp(QaLfLl{3F^ASFyZed=f$wEe(19z7Y^@{i>px zi~0egw>&ytWruxiespBf)vK%3=;BNGL+WXu;Qe6+JcAW4XWqNJ+!-oY#qmh2$d~^@ zCX3uKqFtY@qitx5G?!>@WPup`$3gn2H7rg_NwybTc}nJgnf173Gp?c@F>$sw3%*>hux(1@itX!4X=#fP1SmiN^rPUXOVY@ zy5RGA=7y7;OGW-`N}6vQ6zVAM2mfcD+TLE5M{@Hl#^bv1BEfH~<*b%$;s5vO&K4nO z)DQI&z2+U})7yKX$ZRC+Hz>&<4WTZusE`?(%Vca}l*SxVoC{R%*9IBUs<&iT^PMYmsLmF&tBOtm6SQ?kf;8#w(QXb4@Q z?!F**75XK?7>)olL7hnrC(k*}vW4`Qgwq%%bADJLD z*pP~Uy)tH2n>mPD==Ys>C$47tK9Pc>CiM}S?loEwHF*OosU!I&N?)$n@#&1evQ!_> z_4{EQ!l!FO_Q;Q2hI{%E54vh&zZRW|KsxY%N+AWPz3{PJ4?N3)iVP!AJe%5N8dLs_ zo*2s|7}z>yKm2i5|CTzzGQDG!R;Z70O(?f7AD~4E*IA$}u{Gj;?>=EsfY;`+YY|2M zU%wTA$_ZwFQF)1jjp%Qz>*Tzq;lATSuv-MoC!~ws?MGanAE*JRFM{!=Al~duYu<{dhXp^ zvAttF(7BTK(D^0Jw%-f)Id3H_@^;wBh7yF3T48o#5NnsM-*sHme`ZQdKO=TRPf70_ zn)V|6hX+Or@C+D*{10k{BHS56fN zJ({jflPP)TvqBnV%Xm{^{j5tF1M(dwvXxlesjb4F3$(9V+#SqE=STbr-{dy-DY zaOv5aOp}Fs|LjbK^NfM?x1Ad%z7w|_xO@~>bnK+;zjT*Z?qS<#kp;smmQ!PqQ2b9_ zB&O>0yVF7dn;h(sl#j<0AI|uvZ+NkBjpEL&4B_L0Ep&r>f6tghBJzpv^SSQDPE3p( zg{3`^Mi=)@F0Qi=HDWX12~QA;ujB)b$|fWt%iE5v))v~KI~U*=J=?;dk2JFDR2CCw z_qF%F@X2dzCAq)yOHyAuJnaMZK+cdWP?kl>uoJJ+8tZ3rHOrt|j`g!H=TKiS60Y9> zw!}uq(?qdV&5;YvoU%Zl_6r00v^f}gdyE1e#pF{MeW0;PMyxpI{{Hq?;Mw0m1jPHc z)L-I!bG`K6F0J!heUY2>Z-aIpu9LxWv{lyP+6h}g#B|jQ3G#y}08|qrZB&}W)kVNA zee}R)8 zzh!MtdcZmYIURNy>onOpL~KG4XhJz{ljaVy!HlH0kyNKC=@Voy0SS3Ar&5}vl6TBt zoRu1nJfT^(N6d*9SiTJ2jPA}`Ok_X2+?LyAmjP;gvI+|(1`%RSZmNjeVB5&3B$aau za=Cl}4CE+{gj0^1O5)cWlO?b12Uxzpcv75V4)s?;Lj8|hptOJ=_9 zooEg)&#~r4nv*B46!}Hh^IvxGt+!vU58&LkzeRkbmj*! zV7_%Ng0flLf%))AgtcARtGnpa)TOEf0ds&gF|k-L5W}p)t{vs1zvbiu)7XXYYd|C@!U8-(Z z?*|`M*;Y3}n`pU%$G-}O<{jqWCQ1NaDp~XA%4d@}v6Fch;^TAC|K^zagQ#X= zj3oBQX6^cO->u0U0mFv^rcF2wZJPk{RREfvs%pnivvVTZObH>R{<$3ZSkrn}r>ilG9EvXw@haW#cnJaOj$7r7K|SkARP z74y;n$r>N?hJepTi(dB}&mFt#7o?fPh1coL{fN4A8*-5%KnonRu=gYIda3xngyZSp zjwL}~l0%4o`Dim)S~G)y%^$vUjwoFqpTL&kNp?~#uB{6{D2<~bo5x(PK)~drIpxr) z1sxR%2cK`pXZ>p_UT-q9x6K}Jq>op8$@J~HAI3a~-kBWWjO^5qp=$gEE3=7Y&YS+uGFOmR|*}RdC6+x1V|Lpa) z`S@GzDMJ+8ugr=3_j6<2F#~pHSYy=_(;)gDHbkvQlg#fa=b1{^_2d@*MrMg?m_!8ih8iSsD zPhFeKVQo+J#)-LBz)YBUGu;=hJ;&lU9B&>o_NB>i)yy@iB4Bf($tewwc7(=4v{Dt< z*d0yWUC$iT6Hl5A+V?qtM_2CDYHdn)QNn3wwMe!oyXR5d+CxF?5n<n_JU%a(>$rD_@4EDz(rAQAWIlgO=gfjz48BC7Xr;}U zMtB7T)Sx8htyu1!dHI~|{n9a|lYjWMG_+bDu#}j*E~C^e`_;&(`uK*Lgo!OH*-W3? 
z_9*>wWOhuwhZ zJfUQci)r7ujhnme=*sFnY44+)UeFNQOF-}TV!fk-V^|XNFuE-LA-7Ug`WJmwYDvo%QtpZB=pxlYk1(RXS+hCzla2a$E4}DWEyz1oTvSnM3|019{F;vv zrfYDq!fnyE?x92;>nPRI^lzVJHc-6pF|GS6wp1yHXj;Ts`Yd;z#kG^Ix2H9vz8zn^ z(b$bar{)#ifStRN_k?ltvZ$#zRbOJjBr_S$0a zd@@<5s^9G!<-;>SAwW6mJR??C%zZ` z9uI~It2}Ol?Ze#w3P#M*a+IgdhjWlyOc`L^=B}n|jU&$*$zQ$}nWpHJILhA19Cpv_ zRotCK>B6XDmqn#?oeYHL&e&_#y@OU7dXqJQ=ZfPRJUC*~k3j$to$dswo>YsKDmLLo z+Zzi?8I_MYvgPY&W`McQHY!aMoA~nf(`M^8*<}Xpf!|10m!ob4>E6b8YSkDn8P?g+ zVTI=QE~apaF|B~b2#g+<3Yb<`{k*;@UDJ9-oIZ;;^*=nu@zp=y-I%}<^EFRGeKk3) zC+=E{#VYI>Urj3G<4rXy&Is&fTmIMWd6aLmfkVMP2o6{0Kl);huFO?mI{61)|5=C_ z;|<4q);ZUJ!(gY0n1CF{^J?rB(W;RR;yJG@Jl7P{BVC=Qo^X}DANd_uHKH7H>CWdS z1Ln29#r$&sp(N!zvJo0{PG`51?J*|>oxG&Cx*w1o8fCeBo*$EpPrB$sjI7scET2`} zT_ps9mpTOZ*nkT*ekNLfow8bXofo}x=G@t|Cu}PF2BaXDvonr^k2G$6n7FF}S-XUI z$7wnu8#Z?Xkv;zURV&)~R+^Ob|1vpH`nb~XBv4x&m=|AI#EUH=nW4lcGUkhIH0+smv!`XTec<~brcOVIW1 ze$o|{@iU^Wn+=IcvmZZGe7TS6|)&m8anw6@47k7Pw7-E>X4;+R%egLiIAH#%ib`|kLr ztYje;VHw~x6*&559skn!papb!{BhkO>)eYe?Vgn^BmWtlisNq$md6MDORbGoS#a-| z;~oEjQk$0M2oIU)W>Oc0QO7l37)Jnr4yw>j5MK zv}=&!pdX2()YORJtQ~o|`R1ki&aTPGsr_`m?Wl#pauv@#{Kit9W^>&@WZfd6G(7|} z*Ic0;l>xlG?yBYdZg%>>a#{`3+YpJ$`gNerkI~nxH;^pT-=3H(fqy&2_0I}sYd<~O zP#yMv=z1HVH}HeQO0?SvGl$bHq0a>-@1COl&uBe6z2(2w`}wBI=9(T{5dN~`+2W6y zkk}FMt43%?sf0aUmrgFu8dvd_*!e_UX+B}*6{yhKymO%ZKF&^Pb~H|_b82NOJKD=O z*!fa6V7+Et&IHY|pdT00a5WVMZuCAtPR5Tul7Eifx_%pIf5B-1(P+zuA+4Fd@6&w|Df357$NhF|1f!pwtJr zs2ME(XI=(oPy_YAV&+2l#8vx@OvDY$QD=9`HNz%Pmd(nCB|CRbs$n0C_3FEXgf08? zm+EA64eRW`+0XoVQaWAxoN+yyeg-&|1Y#M|;gI8J_(yA##i;#8d-&!c2pi8p@3jQa z;MEmjlAVqlzE}-yk_H<3jizJ38+j-Rl1Q?xw^e@FToBn z8B7BB42E~9HS6{Ec}wITl>0=W6hgbJ@l4Wo685C zrF}`?XSo_*wss*(Hjks=&n$U5X-~awhCDH z=VHAklQ{)2R|pN;7ae}xadd3Dtea6L*5mQAupz%UXg1g5CMc0zwz_-&cTy7$b~>$+ z%wgh)M$)i>Zx}x;^Pk~glAb#i4Dvc731=qKl3J8pKOHw_fP`9IS!FlNM{;zi{?>-CsD{oc%E4nU+P7 ztO}QDBf3h_Ul!zJA;Rvk1Mz6hyvrOB(~xR^kSA{}fCr-3n!Nma8oZ{z*<0Nm1%HFxEAZ{A7Ti{t?A9J?RS{0!G z3tUD&r-Nkd4$tt6bG%jdJTqYm0WBj9CmyaF|8%h`<+7q(l996l$@y;n;jTEIEz6U% zKk;(9SM|*&yEYwljBOJ>;qdGZ&)msnnGb?s@B2b}Cycyg=AGenDjB4}&Ym&B)cx|O zRQoZHI``#zDKI;yyQUbzE`17v2Ct)3_pLFHiD}r$=pJkH5tWV0*jzt}RQCOK#a(la z6jPfNm4_k3rInYyYS%lz`-zJBNoN!`Kk;1_&$zv@uiCY`n&h*_Z<^SAl-l2wfMV@3 z-)%(nFRzv_ni#xh9+GWC2`dkWc*1clPl>`jMCXoOl;cBg@d8(|jJ=a|@lUPb=A$@u zVE=OkvIDqTi7-!DCohKhDY!}j8&PoWhEL12Uij;*6?N}FBgeo7vc8s9=ovl#`LaRa zxnchL6>+x#cu|yMg462*!Gz~A+x7EGUvhV!#T$*8zkaPbgVhG2pMoph`saaAoi67& z1N=JhoO`p8RG2RC63SSkz`bV+xWO;AM6-;#sho8Y1j=mL$Z$+3?k$99Q0H?J#-nxS zf3V3yjO(A~I)=lM=XY#`*F2L?KkUqwRo%}^+iCEZ^HO2ih6`ChUo@GhsEfQJuW=)@zRXHHQ4 zUiJFKYr4ig*oHb9IowPPmD!p4F5`kJ_1=z~I-AqQu+naR>74cY)D(WJ`SKPmZSzL1 zUQL&0<84!KeEeQSzMM+6BTDhJ*sHd_ZE<=y;<$P=Ve|lv1bDs=v zO~*SU1hoyijTksM&jI%@v9^b8r%>O$r7xS=4?>(Oii2Cecx78&ji8sxkG3_GoW@sk=n8^WiqC?V5_$FnSVdengjc>W^wyyXqh*8Gdn=fRJq z?0+Yhfs|$Qo&QW`LWTl|w3fjYwo2Fl$Ryw#vlfl=uc&_V3~+>sIEVa5Hw;I9(ai6> zhBmnu0_PZ${SdM=U8kjA6x2@~Cx2*^4ypffHWp7U!&ROfZO^B}H!FRr9cJ#@)oe!NR$Mj2{azky%ihm- z^{ zN?v?A6=(d9cY{wndr?wdC@B+u+0tq2jpLSd+9}T|^fusHYgN*&!+|1CQ%3ol=y@5V zwx+6kJp9p>TQXlmPiBufzBCgEb6z;jy;~2qP-U4DKgBv^CxxNR_XbI@&g8u4qc(Y{ z@BWnR34~^M*83Fu>jHr54I^f7PXAtgQrqE`F$r8zGtsyQ-)2p964nS?dth{P0sLwy z*q_?G((C=lf8BT11atO?`Shg?-Z%$4+o;+NatN6~4j~`K0XOI2M(efgXM<|Sr?tSE zy!rW7@A?#o2&ZQK(phW9*1gU;m$yq)rU+dGDf)EM*{ifyyxkC*#@fVb_gW}kcU2ay zyPj7ro##qVJXNCU;X6AqVQCV{rMHN9XK$TNn6{C2f3#y%w^JmD#M*}zU#%Lz9>+mQ zf(>-aG@s6noNBsdPdj%eyWOizP( znMH-y=v8mWty?EScIzTu{>0GcxE=37+{*&%UFgdUysDI<1BqIMfb?F_xvW6 z>0Y7zSm{!Q^PPg4qqw#8ZY%V>BI^fGSQkF#VD8#NlV@>1GO}Sw%dM3)d}23qzpwm| zlyV$zxgt=RVnA_(Hi>@VA|_~H$Y`nFJ%4MFS`Ns8I**_0sQpN*eno#KA(`v`NHo2` 
[GIT binary patch data omitted]
+2. `Prompt Template Example with speculative inference `__ + + +Example Implementation: + + .. code-block:: python + + import flexflow.serve as ff + from langchain.prompts import PromptTemplate + + ff_llm = FlexFlowLLM(...) + ff_llm.compile_and_start(...) + + template = "Question: {question}\nAnswer:" + prompt = PromptTemplate(template=template, input_variables=["question"]) + + response = ff_llm.generate("Who was the US president in 1997?") diff --git a/docs/source/rag.rst b/docs/source/rag.rst new file mode 100644 index 0000000000..4b869c2352 --- /dev/null +++ b/docs/source/rag.rst @@ -0,0 +1,90 @@ +:tocdepth: 1 +******** +RAG Q&A +******** + +Retrieval Augmented Generation (RAG) combines language models with external knowledge. This use case integrates RAG with FlexFlow Serve for Q&A with documents. + +Requirements +============ + +- FlexFlow Serve setup. +- Retriever setup for RAG. + +Implementation +============== + +1. FlexFlow Initialization + Initialize and configure FlexFlow Serve. + +2. 
Data Retrieval Setup + Set up a retriever for sourcing information relevant to user queries. + +3. RAG Integration + Integrate the retriever with FlexFlow Serve. + +4. Response Generation + Use the LLM with RAG to generate responses based on the model's knowledge and retrieved information. + +5. Shutdown + The FlexFlow server automatically shuts down after generating the response. + +Example +======= + +A complete code example for a web-document Q&A using FlexFlow can be found here: + +1. `RAG Q&A Example with incremental decoding `__ + +2. `RAG Q&A Example with speculative inference `__ + + +Example Implementation: + + .. code-block:: python + + # imports + + # compile and start server + ff_llm = FlexFlowLLM(...) + gen_config = ff.GenerationConfig(...) + ff_llm.compile_and_start(...) + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + + # Load web page content + loader = WebBaseLoader("https://example.com/data") + data = loader.load() + + # Split text + text_splitter = RecursiveCharacterTextSplitter(...) + all_splits = text_splitter.split_documents(data) + + # Initialize embeddings + embeddings = OpenAIEmbeddings(...) + + # Create VectorStore + vectorstore = Chroma.from_documents(all_splits, embeddings) + + # Use VectorStore as a retriever + retriever = vectorstore.as_retriever() + + # Apply similarity search + question = "Example Question" + docs = vectorstore.similarity_search(question) + max_chars_per_doc = 100 + docs_text = ''.join([docs[i].page_content[:max_chars_per_doc] for i in range(len(docs))]) + + # Using a Prompt Template + prompt_rag = PromptTemplate.from_template( + "Summarize the main themes in these retrieved docs: {docs_text}" + ) + + # Build Chain + llm_chain_rag = LLMChain(llm=ff_llm_wrapper, prompt=prompt_rag) + + # Run + rag_result = llm_chain_rag(docs_text) + + # Stop the server + ff_llm.stop_server() \ No newline at end of file diff --git a/docs/source/serve_api.rst b/docs/source/serve_api.rst new file mode 100644 index 0000000000..6a607cbf0c --- /dev/null +++ b/docs/source/serve_api.rst @@ -0,0 +1,7 @@ +************************** +FlexFlow Serve Python API +************************** + +.. toctree:: + serve_fastapi + serve_gradioapi \ No newline at end of file diff --git a/docs/source/serve_fastapi.rst b/docs/source/serve_fastapi.rst new file mode 100644 index 0000000000..0aa6634670 --- /dev/null +++ b/docs/source/serve_fastapi.rst @@ -0,0 +1,106 @@ +:tocdepth: 1 +*********************** +FlexFlow Serve FastAPI +*********************** + +Introduction +============ + +The Python API for FlexFlow Serve enables users to initialize, manage, and interact with large language models (LLMs) via FastAPI or Gradio. + +Requirements +------------ + +- FlexFlow Serve setup with necessary configurations. +- FastAPI and Uvicorn for running the API server. + +API Configuration +================= + +Users can configure the API using FastAPI to handle requests and manage the model. + +1. FastAPI Application Initialization + Initialize the FastAPI application to create API endpoints. + +2. Request Model Definition + Define the model for API requests using Pydantic. + +3. Global Variable for LLM Model + Declare a global variable to store the LLM model. + +Example +------- + +..
code-block:: python + + from fastapi import FastAPI + from pydantic import BaseModel + import flexflow.serve as ff + + app = FastAPI() + + class PromptRequest(BaseModel): + prompt: str + + llm = None + +Endpoint Creation +================= + +Create API endpoints for LLM interactions to handle generation requests. + +1. Initialize Model on Startup + Use the FastAPI event handler to initialize and compile the LLM model when the API server starts. + +2. Generate Response Endpoint + Create a POST endpoint to generate responses based on the user's prompt. + +Example +------- + +.. code-block:: python + + @app.on_event("startup") + async def startup_event(): + global llm + # Initialize and compile the LLM model + llm.compile( + generation_config, + # ... other params as needed + ) + llm.start_server() + + @app.post("/generate/") + async def generate(prompt_request: PromptRequest): + # ... exception handling + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + # ... split prompt and response text for returning results + return {"prompt": prompt_request.prompt, "response": full_output} + +Running and Testing +=================== + +Instructions for running and testing the FastAPI server. + +1. Run the FastAPI Server + Use Uvicorn to run the FastAPI server with the specified host and port. + +2. Testing the API + Make requests to the API endpoints and verify the responses. + +Example +------- + +.. code-block:: bash + + # Running within the inference/python folder: + uvicorn entrypoint.fastapi_incr:app --reload --port 3000 + +Full API Entrypoint Code +========================= + +Complete code examples for the FastAPI entrypoints can be found here: + +1. `FastAPI Example with incremental decoding `__ + +2. `FastAPI Example with speculative inference `__ diff --git a/docs/source/serve_gradioapi.rst b/docs/source/serve_gradioapi.rst new file mode 100644 index 0000000000..ed19e05347 --- /dev/null +++ b/docs/source/serve_gradioapi.rst @@ -0,0 +1,30 @@ +:tocdepth: 1 +************************* +FlexFlow Serve Gradio API +************************* + +Introduction +============ + +Users can also set up the API endpoints with a Gradio Chatbot Interface. + +Requirements +------------ + +- FlexFlow Serve setup with necessary configurations. +- A running Gradio chatbot interface. + +Example +======== + +In a running Gradio chatbot interface, hit the "Use via API" button on the bottom left. + + .. image:: /imgs/gradio_interface.png + :alt: Gradio Chatbot Interface + :align: center + +Users can easily access an API endpoint for sending prompts to the model. + + .. image:: /imgs/gradio_api.png + :alt: Gradio API + :align: center \ No newline at end of file diff --git a/docs/source/serve_usecases.rst b/docs/source/serve_usecases.rst new file mode 100644 index 0000000000..4aa3fd2807 --- /dev/null +++ b/docs/source/serve_usecases.rst @@ -0,0 +1,8 @@ +******************* +Serving Usecases +******************* + +..
toctree:: + chatbot + prompt_template + rag \ No newline at end of file diff --git a/inference/.gitignore b/inference/.gitignore index 8ab99cb1eb..1da34a668b 100644 --- a/inference/.gitignore +++ b/inference/.gitignore @@ -3,3 +3,4 @@ weights tokenizers prompt output +.env \ No newline at end of file diff --git a/inference/python/entrypoint/fastapi_incr.py b/inference/python/entrypoint/fastapi_incr.py new file mode 100644 index 0000000000..34f61739fb --- /dev/null +++ b/inference/python/entrypoint/fastapi_incr.py @@ -0,0 +1,162 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Running Instructions: +- To run this FastAPI application, make sure you have FastAPI and Uvicorn installed. +- Save this script as 'fastapi_incr.py'. +- Run the application using the command: `uvicorn fastapi_incr:app --reload --port PORT_NUMBER` +- The server will start on `http://localhost:PORT_NUMBER`. Use this base URL to make API requests. +- Go to `http://localhost:PORT_NUMBER/docs` for API documentation. +""" + + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +import flexflow.serve as ff +import uvicorn +import json, os, argparse +from types import SimpleNamespace + +# Initialize FastAPI application +app = FastAPI() + +# Define the request model +class PromptRequest(BaseModel): + prompt: str + +# Global variable to store the LLM model +llm = None + + +def get_configs(): + + # Fetch configuration file path from environment variable + config_file = os.getenv("CONFIG_FILE", "") + + # Load configs from JSON file (if specified) + if config_file: + if not os.path.isfile(config_file): + raise FileNotFoundError(f"Config file {config_file} not found.") + try: + with open(config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# Initialize model on startup +@app.on_event("startup") +async def startup_event(): + global llm + + # Initialize your LLM model configuration here + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + ff.init(configs_dict) + + ff_data_type = ff.DataType.DT_FLOAT if 
configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + llm.start_server() + +# API endpoint to generate response +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + # Call the model to generate a response + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + + # Separate the prompt and response + split_output = full_output.split('\n', 1) + if len(split_output) > 1: + response_text = split_output[1] + else: + response_text = "" + + # Return the prompt and the response in JSON format + return { + "prompt": prompt_request.prompt, + "response": response_text + } + +# Shutdown event to stop the model server +@app.on_event("shutdown") +async def shutdown_event(): + global llm + if llm is not None: + llm.stop_server() + +# Main function to run Uvicorn server +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + +# Running within the entrypoint folder: +# uvicorn fastapi_incr:app --reload --port + +# Running within the python folder: +# uvicorn entrypoint.fastapi_incr:app --reload --port 3000 diff --git a/inference/python/entrypoint/fastapi_specinfer.py b/inference/python/entrypoint/fastapi_specinfer.py new file mode 100644 index 0000000000..416aee6dc5 --- /dev/null +++ b/inference/python/entrypoint/fastapi_specinfer.py @@ -0,0 +1,202 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Running Instructions: +- To run this FastAPI application, make sure you have FastAPI and Uvicorn installed. +- Save this script as 'fastapi_specinfer.py'. +- Run the application using the command: `uvicorn fastapi_specinfer:app --reload --port PORT_NUMBER` +- The server will start on `http://localhost:PORT_NUMBER`. Use this base URL to make API requests. +- Go to `http://localhost:PORT_NUMBER/docs` for API documentation. 
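+- As a quick check of a running server, the snippet below sends one prompt to the
+  /generate/ endpoint defined in this module. It is an illustrative sketch only: it
+  assumes the optional `requests` package is installed and that the server was
+  started on port 8000.
+
+      import requests
+
+      # POST a prompt and print the generated text from the JSON response
+      resp = requests.post(
+          "http://localhost:8000/generate/",
+          json={"prompt": "What is speculative inference?"},
+      )
+      print(resp.json()["response"])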
+""" + + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +import flexflow.serve as ff +import uvicorn +import json, os, argparse +from types import SimpleNamespace + +# Initialize FastAPI application +app = FastAPI() + +# Define the request model +class PromptRequest(BaseModel): + prompt: str + +# Global variable to store the LLM model +llm = None + +def get_configs(): + # Fetch configuration file path from environment variable + config_file = os.getenv("CONFIG_FILE", "") + + # Load configs from JSON file (if specified) + if config_file: + if not os.path.isfile(config_file): + raise FileNotFoundError(f"Config file {config_file} not found.") + try: + with open(config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "meta-llama/Llama-2-7b-hf", + # optional llm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + } + ], + # "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + +# Initialize model on startup +@app.on_event("startup") +async def startup_event(): + global llm + + # Initialize your LLM model configuration here + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + # Create the SSMs + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + + # Create the sampling configs + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + + # Compile the SSMs for inference and load the weights into memory + for ssm in ssms: + ssm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + # Compile the LLM for inference and load the weights into memory + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ssms=ssms, + ) + + llm.start_server() + +# API endpoint to generate response +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + if llm is None: + raise 
HTTPException(status_code=503, detail="LLM model is not initialized.") + + # Call the model to generate a response + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + + # Separate the prompt and response + split_output = full_output.split('\n', 1) + if len(split_output) > 1: + response_text = split_output[1] + else: + response_text = "" + + # Return the prompt and the response in JSON format + return { + "prompt": prompt_request.prompt, + "response": response_text + } + +# Shutdown event to stop the model server +@app.on_event("shutdown") +async def shutdown_event(): + global llm + if llm is not None: + llm.stop_server() + +# Main function to run Uvicorn server +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + +# Running within the entrypoint folder: +# uvicorn fastapi_specinfer:app --reload --port + +# Running within the python folder: +# uvicorn entrypoint.fastapi_specinfer:app --reload --port 3000 diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 6706cf3c29..f7707816c8 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -41,7 +41,7 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 4, + "num_gpus": 2, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters @@ -49,7 +49,7 @@ def get_configs(): "legion_utility_processors": 4, "data_parallelism_degree": 1, "tensor_parallelism_degree": 1, - "pipeline_parallelism_degree": 4, + "pipeline_parallelism_degree": 2, "offload": False, "offload_reserve_space_size": 1024**2, "use_4bit_quantization": False, @@ -64,7 +64,7 @@ def get_configs(): # optional parameters "cache_path": "", "refresh_cache": False, - "full_precision": True, + "full_precision": False, "prompt": "", "output_file": "", } diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 8b9a116dc5..fcb1b8f891 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -41,14 +41,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 4, + "num_gpus": 2, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 2, + "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, "offload_reserve_space_size": 1024**2, @@ -75,7 +75,7 @@ def get_configs(): "full_precision": False, } ], - "prompt": "", + # "prompt": "", "output_file": "", } # Merge dictionaries diff --git a/inference/python/usecases/gradio_incr.py b/inference/python/usecases/gradio_incr.py new file mode 100644 index 0000000000..2735b665bb --- /dev/null +++ b/inference/python/usecases/gradio_incr.py @@ -0,0 +1,162 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functionality: +1. 
Configuration Handling: + - Parses command-line arguments to get a configuration file path. + - Loads configuration settings from a JSON file if provided, or uses default settings. + +2. FlexFlow Model Initialization: + - Initializes FlexFlow with the provided or default configurations. + - Sets up the LLM with the specified model and configurations. + - Compiles the model with generation settings and starts the FlexFlow server. + +3. Gradio Interface Setup: + - Defines a function to generate responses based on user input using FlexFlow. + - Sets up a Gradio Chat Interface to interact with the model in a conversational format. + +4. Main Execution: + - Calls the main function to initialize configurations, set up the FlexFlow LLM, and launch the Gradio interface. + - Stops the FlexFlow server after the Gradio interface is closed. + +Usage: +1. Run the script with an optional configuration file argument for custom settings. +2. Interact with the FlexFlow model through the Gradio web interface. +3. Enter text inputs to receive generated responses from the model. +4. The script will stop the FlexFlow server automatically upon closing the Gradio interface. +""" + +import gradio as gr +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# def generate_response(user_input): +# result = llm.generate(user_input) +# return result.output_text.decode('utf-8') + +def generate_response(message, history): + user_input = message + results = llm.generate(user_input) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + +def main(): + + global llm + + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + ff.init(configs_dict) + + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + 
generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + # # interface version 1 + # iface = gr.Interface( + # fn=generate_response, + # inputs="text", + # outputs="text" + # ) + + # interface version 2 + iface = gr.ChatInterface(fn=generate_response) + llm.start_server() + iface.launch() + llm.stop_server() + +if __name__ == "__main__": + print("flexflow inference example with gradio interface") + main() \ No newline at end of file diff --git a/inference/python/usecases/gradio_specinfer.py b/inference/python/usecases/gradio_specinfer.py new file mode 100644 index 0000000000..08cde3f00b --- /dev/null +++ b/inference/python/usecases/gradio_specinfer.py @@ -0,0 +1,205 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functionality: +1. Configuration Handling: + - Parses command-line arguments to get a configuration file path. + - Loads configuration settings from a JSON file if provided, or uses default settings. + +2. FlexFlow Model Initialization: + - Initializes FlexFlow with the provided or default configurations. + - Sets up the LLM with the specified model and configurations. + - Compiles the model with generation settings and starts the FlexFlow server. + +3. Gradio Interface Setup: + - Defines a function to generate responses based on user input using FlexFlow. + - Sets up a Gradio Chat Interface to interact with the model in a conversational format. + +4. Main Execution: + - Calls the main function to initialize configurations, set up the FlexFlow LLM, and launch the Gradio interface. + - Stops the FlexFlow server after the Gradio interface is closed. + +Usage: +1. Run the script with an optional configuration file argument for custom settings. +2. Interact with the FlexFlow model through the Gradio web interface. +3. Enter text inputs to receive generated responses from the model. +4. The script will stop the FlexFlow server automatically upon closing the Gradio interface. +""" + +""" +TODO: fix current issue: model init is stuck at "prepare next batch init" and "prepare next batch verify" +""" + +import gradio as gr +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "meta-llama/Llama-2-7b-hf", + # optional llm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + } + ], + # "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# def generate_response(user_input): +# result = llm.generate(user_input) +# return result.output_text.decode('utf-8') + +def generate_response(message, history): + user_input = message + results = llm.generate(user_input) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + +def main(): + + global llm + + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + # Create the SSMs + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + + # Create the sampling configs + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + + # Compile the SSMs for inference and load the weights into memory + for ssm in ssms: + ssm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=256, + ) + + # Compile the LLM for inference and load the weights into memory + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=256, + ssms=ssms, + ) + + # # interface version 1 + # iface = gr.Interface( + # fn=generate_response, + # inputs="text", + # outputs="text" + # ) + + # interface version 2 + iface = gr.ChatInterface(fn=generate_response) + llm.start_server() + iface.launch() + llm.stop_server() + +if __name__ == "__main__": + print("flexflow inference example with gradio interface") + main() \ No newline at end of file diff --git a/inference/python/usecases/prompt_template_incr.py b/inference/python/usecases/prompt_template_incr.py new file mode 100644 index 0000000000..8bffe9ddad --- /dev/null +++ b/inference/python/usecases/prompt_template_incr.py @@ -0,0 +1,187 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script implements the usecase of prompt template upon FlexFlow. + +Functionality: +1. FlexFlowLLM Class: + - Initializes and configures FlexFlow. + - Loads configurations from a file or uses default settings. 
+ - Compiles and starts the language model server for text generation. + - Stops the server when operations are complete. + +2. FF_LLM_wrapper Class: + - Serves as a wrapper for FlexFlow. + - Implements the necessary interface to interact with the LangChain library. + +3. Main: + - Initializes FlexFlow. + - Compiles and starts the server with specific generation configurations. + - Sets up a prompt template for generating responses to questions. + - Use LLMChain to run the model and generate response. + - Stops the FlexFlow server after generating the response. +""" + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from langchain.llms.base import LLM +from typing import Any, List, Mapping, Optional +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate + +class FlexFlowLLM: + def __init__(self, config_file=""): + self.configs = self.get_configs(config_file) + ff.init(self.configs) + self.llm = self.create_llm() + + def get_configs(self, config_file): + # Load configurations from a file or use default settings + if config_file and os.path.isfile(config_file): + with open(config_file) as f: + return json.load(f) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + def create_llm(self): + configs = SimpleNamespace(**self.configs) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + return llm + + def compile_and_start(self, generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch): + self.llm.compile(generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch) + self.llm.start_server() + + def generate(self, prompt): + results = self.llm.generate(prompt) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + def stop_server(self): + self.llm.stop_server() + + def __enter__(self): + return self.llm.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.llm.__exit__(exc_type, exc_value, traceback) + +class FF_LLM_wrapper(LLM): + flexflow_llm: FlexFlowLLM + + @property + def _llm_type(self) -> str: + return "custom" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> str: + if stop is not None: + raise ValueError("stop kwargs are not permitted.") + response = self.flexflow_llm.generate(prompt) + return response + + +if __name__ == "__main__": + # initialization + ff_llm = FlexFlowLLM() + + # compile and start 
server + gen_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1) + ff_llm.compile_and_start( + gen_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64 + ) + + # the wrapper class serves as the 'Model' in LCEL + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + # USE CASE 1: Prompt Template + template = """Question: {question} + Answer: Let's think step by step.""" + + # Build prompt template and langchain + prompt = PromptTemplate(template=template, input_variables=["question"]) + llm_chain = LLMChain(prompt=prompt, llm=ff_llm_wrapper) + + question = "Who was the US president in the year the first Pokemon game was released?" + print(llm_chain.run(question)) + + # stop the server + ff_llm.stop_server() + diff --git a/inference/python/usecases/prompt_template_specinfer.py b/inference/python/usecases/prompt_template_specinfer.py new file mode 100644 index 0000000000..dfc92e9ac2 --- /dev/null +++ b/inference/python/usecases/prompt_template_specinfer.py @@ -0,0 +1,236 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script implements the usecase of prompt template upon FlexFlow. + +Functionality: +1. FlexFlowLLM Class: + - Initializes and configures FlexFlow. + - Loads configurations from a file or uses default settings. + - Compiles and starts the language model server for text generation. + - Stops the server when operations are complete. + +2. FF_LLM_wrapper Class: + - Serves as a wrapper for FlexFlow. + - Implements the necessary interface to interact with the LangChain library. + +3. Main: + - Initializes FlexFlow. + - Compiles and starts the server with specific generation configurations. + - Sets up a prompt template for generating responses to questions. + - Use LLMChain to run the model and generate response. + - Stops the FlexFlow server after generating the response. 
+""" + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from langchain.llms.base import LLM +from typing import Any, List, Mapping, Optional +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate + + +class FlexFlowLLM: + def __init__(self, config_file=""): + self.configs = self.get_configs(config_file) + ff.init(self.configs) + self.llm = self.create_llm() + self.ssms = self.create_ssms() + + def get_configs(self, config_file): + # Load configurations from a file or use default settings + if config_file and os.path.isfile(config_file): + with open(config_file) as f: + return json.load(f) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "meta-llama/Llama-2-7b-hf", + # optional llm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + } + ], + # "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + def create_llm(self): + configs = SimpleNamespace(**self.configs) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + return llm + + def create_ssms(self): + # Create the SSMs + configs = SimpleNamespace(**self.configs) + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + return ssms + + def compile_and_start(self, generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch): + + # Compile the SSMs for inference and load the weights into memory + for ssm in self.ssms: + ssm.compile( + generation_config, + max_requests_per_batch, + max_seq_length, + max_tokens_per_batch, + ) + + # Compile the LLM for inference and load the weights into memory + self.llm.compile( + generation_config, + max_requests_per_batch, + max_seq_length, + max_tokens_per_batch, + ssms = self.ssms + ) + self.llm.start_server() + + def generate(self, prompt): + results = self.llm.generate(prompt) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + def stop_server(self): + self.llm.stop_server() + + def __enter__(self): + return self.llm.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.llm.__exit__(exc_type, exc_value, 
traceback) + +class FF_LLM_wrapper(LLM): + flexflow_llm: FlexFlowLLM + + @property + def _llm_type(self) -> str: + return "custom" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> str: + if stop is not None: + raise ValueError("stop kwargs are not permitted.") + response = self.flexflow_llm.generate(prompt) + return response + + +if __name__ == "__main__": + # initialization + ff_llm = FlexFlowLLM() + + # compile and start server + gen_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1) + ff_llm.compile_and_start( + gen_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64 + ) + + # the wrapper class serves as the 'Model' in LCEL + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + # USE CASE 1: Prompt Template + template = """Question: {question} + Answer: Let's think step by step.""" + + # Build prompt template and langchain + prompt = PromptTemplate(template=template, input_variables=["question"]) + llm_chain = LLMChain(prompt=prompt, llm=ff_llm_wrapper) + + question = "Who was the US president in the year the first Pokemon game was released?" + print(llm_chain.run(question)) + + # stop the server + ff_llm.stop_server() + + diff --git a/inference/python/usecases/rag_incr.py b/inference/python/usecases/rag_incr.py new file mode 100644 index 0000000000..15e7f3d092 --- /dev/null +++ b/inference/python/usecases/rag_incr.py @@ -0,0 +1,220 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script implements the usecase of rag-search upon FlexFlow. + +Functionality: +1. FlexFlowLLM Class: + - Initializes and configures FlexFlow. + - Loads configurations from a file or uses default settings. + - Compiles and starts the language model server for text generation. + - Stops the server when operations are complete. + +2. FF_LLM_wrapper Class: + - Serves as a wrapper for FlexFlow. + - Implements the necessary interface to interact with the LangChain library. + +3. Main: + - Initializes FlexFlow. + - Compiles and starts the server with specific generation configurations. + - Taking in specific source information with RAG(Retrieval Augmented Generation) technique for Q&A towards specific realm/knowledgebase. + - Use LLMChain to run the model and generate response. 
+ - Stops the FlexFlow server after generating the response. +""" + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from langchain.llms.base import LLM +from typing import Any, List, Mapping, Optional +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate +from langchain.document_loaders import WebBaseLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores import Chroma +from langchain.vectorstores import FAISS + +class FlexFlowLLM: + def __init__(self, config_file=""): + self.configs = self.get_configs(config_file) + ff.init(self.configs) + self.llm = self.create_llm() + + def get_configs(self, config_file): + # Load configurations from a file or use default settings + if config_file and os.path.isfile(config_file): + with open(config_file) as f: + return json.load(f) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + def create_llm(self): + configs = SimpleNamespace(**self.configs) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + return llm + + def compile_and_start(self, generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch): + self.llm.compile(generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch) + self.llm.start_server() + + def generate(self, prompt): + results = self.llm.generate(prompt) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + def stop_server(self): + self.llm.stop_server() + + def __enter__(self): + return self.llm.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.llm.__exit__(exc_type, exc_value, traceback) + + +class FF_LLM_wrapper(LLM): + flexflow_llm: FlexFlowLLM + + @property + def _llm_type(self) -> str: + return "custom" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> str: + if stop is not None: + raise ValueError("stop kwargs are not permitted.") + response = self.flexflow_llm.generate(prompt) + return response + + +if __name__ == "__main__": + # initialization + ff_llm = FlexFlowLLM() + + # compile and start server + gen_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1) + ff_llm.compile_and_start( + gen_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64 + ) + + # the wrapper class serves 
as the 'Model' in LCEL + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + # USE CASE 2: Rag Search + + # Load web page content + loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/") + data = loader.load() + + # Split text + text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) + all_splits = text_splitter.split_documents(data) + + # Initialize embeddings + embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY')) # fill in openai api key + + # Create VectorStore + vectorstore = Chroma.from_documents(all_splits, embeddings) + + # Use VectorStore as a retriever + retriever = vectorstore.as_retriever() + + # Test if similarity search is working + question = "What are the approaches to Task Decomposition?" + docs = vectorstore.similarity_search(question) + max_chars_per_doc = 100 + # docs_text_list = [docs[i].page_content for i in range(len(docs))] + docs_text_list = [docs[i].page_content[:max_chars_per_doc] for i in range(len(docs))] + docs_text = ''.join(docs_text_list) + + # Using a Prompt Template + prompt_rag = PromptTemplate.from_template( + "Summarize the main themes in these retrieved docs: {docs_text}" + ) + + # Chain + llm_chain_rag = LLMChain(llm=ff_llm_wrapper, prompt=prompt_rag) + + # Run + rag_result = llm_chain_rag(docs_text) + + # Stop the server + ff_llm.stop_server() + diff --git a/inference/python/usecases/rag_specinfer.py b/inference/python/usecases/rag_specinfer.py new file mode 100644 index 0000000000..512b973955 --- /dev/null +++ b/inference/python/usecases/rag_specinfer.py @@ -0,0 +1,266 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script implements the usecase of rag-search upon FlexFlow. + +Functionality: +1. FlexFlowLLM Class: + - Initializes and configures FlexFlow. + - Loads configurations from a file or uses default settings. + - Compiles and starts the language model server for text generation. + - Stops the server when operations are complete. + +2. FF_LLM_wrapper Class: + - Serves as a wrapper for FlexFlow. + - Implements the necessary interface to interact with the LangChain library. + +3. Main: + - Initializes FlexFlow. + - Compiles and starts the server with specific generation configurations. 
+ - Taking in specific source information with RAG(Retrieval Augmented Generation) technique for Q&A towards specific realm/knowledgebase. + - Use LLMChain to run the model and generate response. + - Stops the FlexFlow server after generating the response. +""" + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from langchain.llms.base import LLM +from typing import Any, List, Mapping, Optional +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate +from langchain.document_loaders import WebBaseLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores import Chroma +from langchain.vectorstores import FAISS + +class FlexFlowLLM: + def __init__(self, config_file=""): + self.configs = self.get_configs(config_file) + ff.init(self.configs) + self.llm = self.create_llm() + self.ssms = self.create_ssms() + + def get_configs(self, config_file): + # Load configurations from a file or use default settings + if config_file and os.path.isfile(config_file): + with open(config_file) as f: + return json.load(f) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "meta-llama/Llama-2-7b-hf", + # optional llm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + } + ], + # "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + def create_llm(self): + configs = SimpleNamespace(**self.configs) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + return llm + + def create_ssms(self): + # Create the SSMs + configs = SimpleNamespace(**self.configs) + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + return ssms + + def compile_and_start(self, generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch): + + # Compile the SSMs for inference and load the weights into memory + for ssm in self.ssms: + ssm.compile( + generation_config, + max_requests_per_batch, + max_seq_length, + max_tokens_per_batch, + ) + + # Compile the LLM for inference and load the weights into memory + self.llm.compile( + generation_config, + max_requests_per_batch, + max_seq_length, + 
max_tokens_per_batch, + ssms = self.ssms + ) + # start server + self.llm.start_server() + + def generate(self, prompt): + results = self.llm.generate(prompt) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + def stop_server(self): + self.llm.stop_server() + + def __enter__(self): + return self.llm.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.llm.__exit__(exc_type, exc_value, traceback) + +class FF_LLM_wrapper(LLM): + flexflow_llm: FlexFlowLLM + + @property + def _llm_type(self) -> str: + return "custom" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> str: + if stop is not None: + raise ValueError("stop kwargs are not permitted.") + response = self.flexflow_llm.generate(prompt) + return response + + +if __name__ == "__main__": + # initialization + ff_llm = FlexFlowLLM() + + # compile and start server + gen_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1) + ff_llm.compile_and_start( + gen_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=200 + ) + + # the wrapper class serves as the 'Model' in LCEL + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + # USE CASE 2: Rag Search + + # Load web page content + loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/") + data = loader.load() + + # Split text + text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) + all_splits = text_splitter.split_documents(data) + + # Initialize embeddings + embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY')) # fill in openai api key + + # Create VectorStore + vectorstore = Chroma.from_documents(all_splits, embeddings) + + # Use VectorStore as a retriever + retriever = vectorstore.as_retriever() + + # Test if similarity search is working + question = "What are the approaches to Task Decomposition?" 
+ docs = vectorstore.similarity_search(question) + max_chars_per_doc = 50 + # docs_text_list = [docs[i].page_content for i in range(len(docs))] + docs_text_list = [docs[i].page_content[:max_chars_per_doc] for i in range(len(docs))] + docs_text = ''.join(docs_text_list) + + # Using a Prompt Template + prompt_rag = PromptTemplate.from_template( + "Summarize the main themes in these retrieved docs: {docs_text}" + ) + + # Chain + llm_chain_rag = LLMChain(llm=ff_llm_wrapper, prompt=prompt_rag) + + # Run + rag_result = llm_chain_rag(docs_text) + + # stop the server + ff_llm.stop_server() diff --git a/tests/training_tests.sh b/tests/training_tests.sh index 2d1f00883b..a6cab7d117 100755 --- a/tests/training_tests.sh +++ b/tests/training_tests.sh @@ -2,6 +2,9 @@ set -x set -e +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + # Default to single-node, single GPU GPUS=${1:-1} # number of GPUS per node NUM_NODES=${2:-1} # number of nodes @@ -87,3 +90,4 @@ $EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /t $EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/training_tests/test_params.json $EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/training_tests/test_params.json $EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json + From d21ed66a5baf2bfdeb06fd74e080abbd6eec9ce7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 23:01:16 -0500 Subject: [PATCH 316/344] Bug fixes and update Legion version (#1287) * bug fixes and update Legion version * fix * bug fix * update legion * fix arithmetic error due to num_devices uninitialized * update legion version * update ci * fix * debugging ci * Revert "debugging ci" This reverts commit 0b3148ef6adfcb64935e6b1e83a88494910a7b22. 
* update mapper interface * add ncclFinalize * Only delete nccl communications for training jobs --------- Co-authored-by: Zhihao Jia --- .github/workflows/gpu-ci.yml | 12 +++--- CMakeLists.txt | 8 ++-- cmake/pip_install/CMakeLists.txt | 4 +- deps/legion | 2 +- include/flexflow/mapper.h | 9 ++--- include/flexflow/model.h | 2 + include/flexflow/operator.h | 5 +++ include/flexflow/request_manager.h | 1 - src/mapper/mapper.cc | 47 ++++++++++------------ src/ops/linear.cc | 8 +--- src/runtime/inference_manager.cc | 30 +------------- src/runtime/model.cc | 63 ++++++++++++++++++++++++++++++ 12 files changed, 111 insertions(+), 80 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 3901d6b5f7..48dcda157e 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -222,7 +222,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -243,7 +243,7 @@ jobs: - name: Build and Install FlexFlow run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON @@ -252,18 +252,18 @@ jobs: - name: Check FlexFlow Python interface (pip) run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests diff --git a/CMakeLists.txt b/CMakeLists.txt index acbe7e385f..43ce4f7044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,6 +413,7 @@ if(NOT BUILD_LEGION_ONLY) # python related if (FF_USE_PYTHON) + find_package(Python COMPONENTS Interpreter Development) # create flexflow_cffi_header.py add_custom_command(TARGET flexflow PRE_BUILD @@ -424,13 +425,13 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. 
add_custom_command(TARGET flexflow PRE_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Creating flexflow_python interpreter..." ) @@ -567,7 +568,8 @@ if(NOT BUILD_LEGION_ONLY) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + find_package(Python COMPONENTS Interpreter Development) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT FF_BUILD_FROM_PYPI) install( DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 7ce38c4abc..105133a310 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -1,10 +1,10 @@ # Use setup.py script to re-install the Python bindings library with the right library paths if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") endif() endif() diff --git a/deps/legion b/deps/legion index 626b55689c..24e8c45234 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h index 71be1892aa..e8337818ec 100644 --- a/include/flexflow/mapper.h +++ b/include/flexflow/mapper.h @@ -83,11 +83,10 @@ class FFMapper : public NullMapper { Task 
const &task, MapTaskInput const &input, MapTaskOutput &output); - virtual void map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output); + virtual void replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output); virtual void select_task_variant(const MapperContext ctx, Task const &task, SelectVariantInput const &input, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index dd6dc76b4d..95be9ab581 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -202,6 +202,7 @@ enum TaskIDs { // NCCL tasks NCCL_GETUNIQUEID_TASK_ID, NCCL_INIT_COMMS_TASK_ID, + NCCL_FINISH_COMMS_TASK_ID, // Search STRATEGY_SEARCH_TASK_ID, // Graph @@ -397,6 +398,7 @@ std::vector class FFModel { public: FFModel(FFConfig &config, bool cpu_offload = false); + ~FFModel(); static constexpr float PROPAGATION_CHANCE = 0.25; static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 73c2c3e092..1b19bdb82f 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -406,6 +406,11 @@ class Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + finish_nccl_comms_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); #endif protected: void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 50a51705cd..4763eb1ef3 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -55,7 +55,6 @@ class InferenceManager { public: std::unordered_map> tensor_buffer; std::unordered_map model_weights_loaders; - int num_devices; }; struct Request { diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index bc26a79d3e..d46bfc2877 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -661,44 +661,37 @@ void FFMapper::map_task(const MapperContext ctx, } // for idx } -void FFMapper::map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output) { +void FFMapper::replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); const Processor::Kind target_kind = task.target_proc.kind(); - VariantID chosen_variant; + VariantID vid; { std::vector variant_ids; - runtime->find_valid_variants( - ctx, task.task_id, variant_ids, task.target_proc.kind()); + runtime->find_valid_variants(ctx, task.task_id, variant_ids, target_kind); // Currently assume there is exactly one variant assert(variant_ids.size() == 1); - chosen_variant = variant_ids[0]; + output.chosen_variant = variant_ids[0]; } - std::vector const &all_procs = all_procs_by_kind(target_kind); - // Place on replicate on each node by default - output.task_mappings.resize(total_nodes, default_output); - // Assume default_output does not include any target_procs - assert(default_output.target_procs.size() == 0); - for (std::vector::const_iterator it = all_procs.begin(); - it != all_procs.end(); + output.target_processors.resize(total_nodes); + std::vector handled(total_nodes, false); 
+ size_t count = 0; + Machine::ProcessorQuery procs(machine); + procs.only_kind(target_kind); + for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); it++) { - AddressSpace space = it->address_space(); - assert(space < output.task_mappings.size()); - // Add *it as a target_proc if we haven't found one - if (output.task_mappings[space].target_procs.size() == 0) { - output.task_mappings[space].target_procs.push_back(*it); + const AddressSpace space = it->address_space(); + if (handled[space]) { + continue; } + output.target_processors[space] = *it; + handled[space] = true; + count++; } - output.control_replication_map.resize(total_nodes); - for (int idx = 0; idx < total_nodes; idx++) { - output.task_mappings[idx].chosen_variant = chosen_variant; - output.control_replication_map[idx] = - output.task_mappings[idx].target_procs[0]; - } + assert(count == total_nodes); } void FFMapper::select_task_variant(const MapperContext ctx, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 03c9e48af8..0c7a0f78fe 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,12 +467,8 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - // TensorAccessorW acc_kernel(regions[2], - // task->regions[2], - // FID_DATA, - // ctx, - // runtime, - // false /*readOutput*/); + TensorAccessorR acc_kernel( + regions[2], task->regions[2], FID_DATA, ctx, runtime); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 6588cbceeb..2a94df8b4d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,33 +28,7 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager() { -#ifdef DEADCODE - num_devices = ff_config.workersPerNode * ff_config.numNodes; - // Check parallelization degrees - assert(ff_config.data_parallelism_degree <= num_devices && - "Data parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.data_parallelism_degree == 0 && - "Number of available devices is not divisible by data parallelism " - "degree"); - assert(ff_config.tensor_parallelism_degree <= num_devices && - "Tensor parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.tensor_parallelism_degree == 0 && - "Number of available devices is not divisible by tensor parallelism " - "degree"); - assert(ff_config.pipeline_parallelism_degree <= num_devices && - "Pipeline parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.pipeline_parallelism_degree == 0 && - "Number of available devices is not divisible by pipeline parallelism " - "degree"); - assert(ff_config.data_parallelism_degree * - ff_config.tensor_parallelism_degree * - ff_config.pipeline_parallelism_degree == - num_devices && - "Product of data, tensor, and pipeline parallelism degrees does not " - "match the number of available devices"); -#endif -} +InferenceManager::InferenceManager() {} InferenceManager *inference_manager_singleton = nullptr; @@ -296,8 +270,6 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { void InferenceManager::init_operators_inference(FFModel *model) { for (int batch_index = 0; batch_index < model->config.data_parallelism_degree; batch_index++) { - int expert_device_index = 0; - int 
device_index = batch_index % num_devices; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c07c33efca..440ae19047 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -606,6 +606,15 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, // ncclComm, allRanks, myRank, ncclId); return ncclComm; } + +void Op::finish_nccl_comms_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ncclComm_t comm = *((ncclComm_t *)task->local_args); + checkNCCL(ncclCommFinalize(comm)); + checkNCCL(ncclCommDestroy(comm)); +} #endif /** @@ -1578,6 +1587,45 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +FFModel::~FFModel() { + // Destroy nccl communication groups +#ifdef FF_USE_NCCL + if (config.computationMode == COMP_MODE_TRAINING) { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } + } +#endif +} + void FFModel::clear_graph_search_cache() { this->graph_search->clear_cache(); this->search->clear_cache(); @@ -6853,6 +6901,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(NCCL_FINISH_COMMS_TASK_ID, + "NCCL Finish Communicators"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "NCCL Finish Communicators Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } #endif // Search { From be28d718c06c199866126a8bf4f1e35dfc4509a1 Mon Sep 17 00:00:00 2001 From: April Yang <114364211+april-yyt@users.noreply.github.com> Date: Sun, 4 Feb 2024 19:58:39 -0800 Subject: [PATCH 317/344] Docs Modification for Python Usecases (#1291) * modify README * fix link issues * update legion version --------- Co-authored-by: Zhihao Jia --- SERVE.md | 69 +++++++++++++++++++++++++++++++++ docs/source/chatbot.rst | 4 +- docs/source/prompt_template.rst | 4 +- docs/source/rag.rst | 4 +- docs/source/serve_fastapi.rst | 4 +- 5 files changed, 77 insertions(+), 8 deletions(-) diff --git a/SERVE.md b/SERVE.md index e64756e8f4..e9bab3d702 100644 --- a/SERVE.md +++ b/SERVE.md @@ -182,6 +182,75 @@ FlexFlow Serve supports int4 and int8 quantization. 
The compressed tensors are s ### Prompt Datasets We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). + + + +## Python Interface Features and Interaction Methods + +FlexFlow Serve provides a comprehensive Python interface for serving with low latency and high performance. This interface facilitates the deployment and interaction with the serving platform for a variety of applications, from chatbots and prompt templates to retrieval augmented generation and API services. + +### Chatbot with Gradio + +The Python interface allows setting up a chatbot application using Gradio, enabling interactive dialogues with users through a user-friendly web interface. + +#### Implementation Steps +1. **FlexFlow Initialization:** Configure and initialize FlexFlow Serve with the desired settings and the specific LLM. +```python +import gradio as gr +import flexflow.serve as ff + +ff.init(num_gpus=2, memory_per_gpu=14000, ...) +``` +2. **Gradio Interface Setup:** Implement a function to generate responses from user inputs and set up the Gradio Chat Interface for interaction. +```python +def generate_response(user_input): + result = llm.generate(user_input) + return result.output_text.decode('utf-8') +``` +3. **Running the Interface:** Launch the Gradio interface to interact with the LLM through a web-based chat interface. +```python +iface = gr.ChatInterface(fn=generate_response) +iface.launch() +``` +4. **Shutdown:** Properly stop the FlexFlow server after interaction is complete. + + + +### Langchain Usecases +FlexFlow Serve supports langchain usecases including dynamic prompt template handling and RAG usecases, enabling the customization of model responses based on structured input templates and Retrieval Augmented Generation. + +#### Implementation Steps +1. **FlexFlow Initialization**: Start by initializing FlexFlow Serve with the appropriate configurations. +2. **LLM Setup**: Compile and load the LLM for text generation. +3. **Prompt Template/RAG Setup**: Configure prompt templates to guide the model's responses. +4. **Response Generation**: Use the LLM with the prompt template to generate responses. + + +### Python FastAPI Entrypoint +Flexflow Serve also supports deploying and managing LLMs with FastAPI, offering a RESTful API interface for generating responses from models. + +```python +@app.on_event("startup") +async def startup_event(): + global llm + # Initialize and compile the LLM model + llm.compile( + generation_config, + # ... other params as needed + ) + llm.start_server() + +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + # ... exception handling + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + # ... split prompt and response text for returning results + return {"prompt": prompt_request.prompt, "response": full_output} +``` + + + + ## TODOs FlexFlow Serve is still under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. 
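The Langchain Usecases steps above are listed without a snippet, so a minimal end-to-end sketch follows. It reuses only the `flexflow.serve` calls already shown in the Gradio and FastAPI examples (`ff.init`, `llm.compile`, `llm.start_server`, `llm.generate`); the `ff.LLM` and `ff.GenerationConfig` constructors, the `stop_server()` call, the model name, the resource figures, and the template string are illustrative assumptions rather than prescribed values.

```python
import flexflow.serve as ff

# Step 1: initialize FlexFlow Serve (GPU count and memory sizes are placeholders)
ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=10000)

# Step 2: compile and load the LLM (model name and sampling settings are placeholders)
llm = ff.LLM("meta-llama/Llama-2-7b-hf")
generation_config = ff.GenerationConfig(
    do_sample=False, temperature=0.9, topp=0.8, topk=1
)
llm.compile(generation_config)
llm.start_server()

# Step 3: a plain Python prompt template; a LangChain PromptTemplate can play the same role
template = "Answer the question in one short sentence.\n\nQuestion: {question}\nAnswer:"

# Step 4: fill in the template and generate a response
prompt = template.format(question="Which GPU backends does FlexFlow Serve support?")
result = llm.generate(prompt)
print(result.output_text.decode("utf-8"))

# Shut down the background server when done
llm.stop_server()
```

A LangChain `PromptTemplate`, or the retriever in the RAG flow described above, would typically produce the `prompt` string that is handed to `llm.generate` in the same way.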
diff --git a/docs/source/chatbot.rst b/docs/source/chatbot.rst index fc6f616fae..c41307e231 100644 --- a/docs/source/chatbot.rst +++ b/docs/source/chatbot.rst @@ -42,9 +42,9 @@ Example Complete code example can be found here: -1. `Chatbot Example with incremental decoding `__ +1. `Chatbot Example with incremental decoding `__ -2. `Chatbot Example with speculative inference `__ +2. `Chatbot Example with speculative inference `__ Example Implementation: diff --git a/docs/source/prompt_template.rst b/docs/source/prompt_template.rst index 4e0f1beab5..7f987b0f18 100644 --- a/docs/source/prompt_template.rst +++ b/docs/source/prompt_template.rst @@ -34,9 +34,9 @@ Example Complete code example can be found here: -1. `Prompt Template Example with incremental decoding `__ +1. `Prompt Template Example with incremental decoding `__ -2. `Prompt Template Example with speculative inference `__ +2. `Prompt Template Example with speculative inference `__ Example Implementation: diff --git a/docs/source/rag.rst b/docs/source/rag.rst index 4b869c2352..640b2fe131 100644 --- a/docs/source/rag.rst +++ b/docs/source/rag.rst @@ -34,9 +34,9 @@ Example A complete code example for a web-document Q&A using FlexFlow can be found here: -1. `Rag Q&A Example with incremental decoding `__ +1. `Rag Q&A Example with incremental decoding `__ -2. `Rag Q&A Example with speculative inference `__ +2. `Rag Q&A Example with speculative inference `__ Example Implementation: diff --git a/docs/source/serve_fastapi.rst b/docs/source/serve_fastapi.rst index 0aa6634670..62a28e5937 100644 --- a/docs/source/serve_fastapi.rst +++ b/docs/source/serve_fastapi.rst @@ -101,6 +101,6 @@ Full API Entrypoint Code A complete code example for a web-document Q&A using FlexFlow can be found here: -1. `FastAPI Example with incremental decoding `__ +1. `FastAPI Example with incremental decoding `__ -2. `FastAPI Example with speculative inference `__ +2. 
`FastAPI Example with speculative inference `__ From e24eb03235a185d8ce1c92d5519a27c8add072c8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 22 Feb 2024 10:40:33 -0500 Subject: [PATCH 318/344] Add support for docker machines with cuda 12.1 and cuda 12.2 (#1308) --- docker/build.sh | 12 ++++++------ docker/pull.sh | 6 +++--- docker/run.sh | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docker/build.sh b/docker/build.sh index 6603d919f5..8ecacbc6d4 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -50,20 +50,20 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the # Check that CUDA version is supported, and modify cuda version to include default subsubversion if [[ "$cuda_version" == @(11.1|11.3|11.7|12.0|12.1) ]]; then cuda_version_input=${cuda_version}.1 - elif [[ "$cuda_version" == @(11.2|11.5|11.6) ]]; then + elif [[ "$cuda_version" == @(11.2|11.5|11.6|12.2) ]]; then cuda_version_input=${cuda_version}.2 elif [[ "$cuda_version" == @(11.4) ]]; then cuda_version_input=${cuda_version}.3 - elif [[ "$cuda_version" == @(11.8|12.2) ]]; then + elif [[ "$cuda_version" == @(11.8) ]]; then cuda_version_input=${cuda_version}.0 else echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi - # Use CUDA 12.0 for all versions greater or equal to 12.0 for now - if [[ "$cuda_version" == @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.0 - cuda_version_input=${cuda_version}.1 + # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) + if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + cuda_version=12.2 + cuda_version_input=${cuda_version}.2 fi echo "Building $image docker image with CUDA $cuda_version" ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04" diff --git a/docker/pull.sh b/docker/pull.sh index e5b6f26f3c..27bf245c12 100755 --- a/docker/pull.sh +++ b/docker/pull.sh @@ -49,9 +49,9 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi - # Use CUDA 12.0 for all versions greater or equal to 12.0 for now - if [[ "$cuda_version" == @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.0 + # Use CUDA 12.2 for all versions greater or equal to 12.2 for now + if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + cuda_version=12.2 fi # Set cuda version suffix to docker image name echo "Downloading $image docker image with CUDA $cuda_version" diff --git a/docker/run.sh b/docker/run.sh index 76ec1e1ceb..666c8e1121 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -62,9 +62,9 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi - # Use CUDA 12.0 for all versions greater or equal to 12.0 for now - if [[ "$cuda_version" == @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.0 + # Use CUDA 12.2 for all versions greater or equal to 12.2 for now + if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + cuda_version=12.2 fi # Set cuda version suffix to docker image name echo "Running $image docker image with CUDA $cuda_version" From 
0d75c1042bf87e45684bcb3679cfc9f39a87e589 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 2 Mar 2024 23:14:34 -0500 Subject: [PATCH 319/344] Fix NCCL tear down issue, update docker pre-build cuda version list (#1318) --- .github/workflows/docker-build-skip.yml | 2 +- .github/workflows/docker-build.yml | 12 ++++++------ README.md | 2 +- docker/README.md | 6 +++--- docker/pull.sh | 4 ++-- src/runtime/model.cc | 2 ++ 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/docker-build-skip.yml b/.github/workflows/docker-build-skip.yml index 02b703467c..e5d7de858f 100644 --- a/.github/workflows/docker-build-skip.yml +++ b/.github/workflows/docker-build-skip.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"] + cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"] fail-fast: false steps: - run: 'echo "No docker-build required"' diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 05c94c7e84..54805cc325 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -103,27 +103,27 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"] + cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"] fail-fast: false env: FF_GPU_BACKEND: "cuda" cuda_version: ${{ matrix.cuda_version }} steps: - name: Checkout Git Repository - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} uses: actions/checkout@v3 with: submodules: recursive - name: Free additional space on runner - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} run: .github/workflows/helpers/free_space_on_runner.sh - name: Build Docker container - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} env: deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} - build_needed: ${{ matrix.cuda_version == '11.8' }} + build_needed: ${{ matrix.cuda_version == '12.0' }} run: | # On push to inference, build for all compatible architectures, so that we can publish # a pre-built general-purpose image. 
On all other cases, only build for one architecture @@ -137,7 +137,7 @@ jobs: fi - name: Check availability of flexflow modules in Python - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - name: Publish Docker environment image (on push to inference) diff --git a/README.md b/README.md index 318d2e38da..95790a90e5 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ If you run into any issue during the install, or if you would like to use the C+ docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest ``` -To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.4`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md). +To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, `cuda-12.0`, `cuda-12.1`, `cuda-12.2`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`. More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md). ### Build from source diff --git a/docker/README.md b/docker/README.md index b7ec7c3631..010aadf762 100644 --- a/docker/README.md +++ b/docker/README.md @@ -7,7 +7,7 @@ You can build and run the FlexFlow Docker images on any machine, but if you want ## Downloading a pre-built package The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow): -* `flexflow`: the pre-built version of FlexFlow. We currently publish four version targeting AMD GPUs (ROCm versions: 5.3, 5.4, 5.5 and 5.6 ), and several versions for CUDA GPUs (CUDA versions: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0). The CUDA images are named `flexflow--`, e.g. [flexflow-hip_rocm-5.6](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm-5.6) or [flexflow-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-12.0) or +* `flexflow`: the pre-built version of FlexFlow. We currently publish four versions targeting AMD GPUs (ROCm versions: 5.3, 5.4, 5.5 and 5.6 ), and several versions for CUDA GPUs (CUDA versions: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, and 12.2).
The CUDA images are named `flexflow--`, e.g. [flexflow-hip_rocm-5.6](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm-5.6) or [flexflow-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-12.0) or * `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. You may find them useful if you want to build FlexFlow yourself. We also publish four version of `flexflow-environment` for AMD GPUs and, for NVIDIA GPUs, one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 12.0 is tagged [flexflow-environment-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-12.0). The easiest way to download any of the Docker containers above is to call: @@ -19,7 +19,7 @@ The easiest way to download any of the Docker containers above is to call: where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`). By default, the script will assume a NVIDIA backend and attempt to detect the CUDA version on your machine, to download the relevant container. If your machine has AMD GPUs, or no GPUs, or if you want to specify the CUDA/ROCM version to download, set the environment variables below: * `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be downloaded. -* `cuda_version` (supported options: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored +* `cuda_version` (supported options: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1 and 12.2) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored * `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored. @@ -44,7 +44,7 @@ If you only want to build the `flexflow-environment` image (the base layers of t After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND`, `cuda_version` and `hip_version` optional environment variables to run the docker image with the desired GPU backend and CUDA/HIP version: * `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be run. -* `cuda_version` (supported options: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored +* `cuda_version` (supported options: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored * `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored. 
Leaving these variables unset will assume a GPU backend, and instruct the script to autodetect the CUDA version installed on the current machine and run the Docker container with it if available. diff --git a/docker/pull.sh b/docker/pull.sh index 27bf245c12..f641e1a591 100755 --- a/docker/pull.sh +++ b/docker/pull.sh @@ -45,8 +45,8 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the fi fi # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" + if [[ "$cuda_version" != @(11.1|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then + echo "cuda_version is not available for download, please choose among {11.1|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi # Use CUDA 12.2 for all versions greater or equal to 12.2 for now diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 440ae19047..40f758282c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -612,7 +612,9 @@ void Op::finish_nccl_comms_task(Task const *task, Context ctx, Runtime *runtime) { ncclComm_t comm = *((ncclComm_t *)task->local_args); +#if (NCCL_MAJOR == 2) && (NCCL_MINOR >= 14) checkNCCL(ncclCommFinalize(comm)); +#endif checkNCCL(ncclCommDestroy(comm)); } #endif From ea31426f76fd4bd4709fd774becb3b303916e2be Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Mar 2024 21:40:05 +0000 Subject: [PATCH 320/344] add expansion config param in specinfer --- inference/spec_infer/spec_infer.cc | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 7578721dd0..5d584de1ff 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -62,7 +62,8 @@ void parse_input_args(char **argv, bool &verbose, int &max_requests_per_batch, int &max_tokens_per_batch, - int &max_sequence_length) { + int &max_sequence_length, + int &expansion_degree) { for (int i = 1; i < argc; i++) { // llm model name if (!strcmp(argv[i], "-llm-model")) { @@ -117,6 +118,10 @@ void parse_input_args(char **argv, max_sequence_length = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--expansion-degree")) { + expansion_degree = std::stoi(argv[++i]); + continue; + } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -270,6 +275,7 @@ void FlexFlow::top_level_task(Task const *task, int max_requests_per_batch = 16; int max_tokens_per_batch = 256; int max_sequence_length = 1024; + int expansion_degree = 3; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -282,7 +288,8 @@ void FlexFlow::top_level_task(Task const *task, verbose, max_requests_per_batch, max_tokens_per_batch, - max_sequence_length); + max_sequence_length, + expansion_degree); get_model_meta(file_paths, model_metadata, use_full_precision); @@ -304,7 +311,9 @@ void FlexFlow::top_level_task(Task const *task, rm->register_output_filepath(file_paths.output_file_path); // first decoding step: 3 results - rm->push_spec_infer_tree_width(3); + if (expansion_degree != -1) { + rm->push_spec_infer_tree_width(expansion_degree); + } // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); From e03dec04c2949fc79bfcc7f49b99b624ca100026 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 11 Mar 2024 23:28:39 +0000 Subject: [PATCH 321/344] parametrize 
max_spec_tree_token_num --- include/flexflow/batch_config.h | 1 + include/flexflow/flexflow_c.h | 3 +++ include/flexflow/request_manager.h | 3 +++ inference/spec_infer/spec_infer.cc | 2 ++ python/flexflow/core/flexflow_cffi.py | 4 +++ python/flexflow/serve/models/base.py | 6 ++--- python/flexflow/serve/models/falcon.py | 26 +++++++++++-------- python/flexflow/serve/models/llama.py | 27 ++++++++++++-------- python/flexflow/serve/models/mpt.py | 27 ++++++++++++-------- python/flexflow/serve/models/opt.py | 26 +++++++++++-------- python/flexflow/serve/models/starcoder.py | 27 ++++++++++++-------- python/flexflow/serve/serve.py | 17 ++++++++---- src/c/flexflow_c.cc | 8 ++++++ src/ops/inc_multihead_self_attention.cu | 10 ++++++-- src/ops/spec_inc_multihead_self_attention.cu | 16 ++++++++---- src/ops/tree_inc_multihead_self_attention.cu | 13 +++++----- src/runtime/batch_config.cc | 4 +++ src/runtime/request_manager.cc | 15 ++++++++++- 18 files changed, 160 insertions(+), 75 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 5c126293cf..009d1c250a 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -46,6 +46,7 @@ class BatchConfig { static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_verify_tokens_per_batch(); + static int max_spec_tree_token_num(); static int max_sequence_length(); friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index cab3d14ea7..0b74b7fce4 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -972,6 +972,9 @@ void flexflow_request_manager_set_max_requests_per_batch( void flexflow_request_manager_set_max_tokens_per_batch( flexflow_request_manager_t handle_, int max_num_tokens); +void flexflow_request_manager_set_max_spec_tree_token_num( + flexflow_request_manager_t handle_, int max_num_tokens); + void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 4763eb1ef3..a38a3b2671 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -114,6 +114,8 @@ class RequestManager { int get_max_requests_per_batch(); void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); + void set_max_spec_tree_token_num(int max_num_tokens); + int get_max_spec_tree_token_num(); int get_max_verify_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); @@ -259,6 +261,7 @@ class RequestManager { // configuration parameters int max_requests_per_batch; int max_tokens_per_batch; + int max_spec_tree_token_num; int max_sequence_length; Status request_manager_status; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 5d584de1ff..36d54eee64 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -275,6 +275,7 @@ void FlexFlow::top_level_task(Task const *task, int max_requests_per_batch = 16; int max_tokens_per_batch = 256; int max_sequence_length = 1024; + int max_spec_tree_token_num = 20; int expansion_degree = 3; InputArgs const &command_args = HighLevelRuntime::get_input_args(); @@ -303,6 +304,7 @@ void FlexFlow::top_level_task(Task const *task, RequestManager *rm = RequestManager::get_request_manager(); 
rm->set_max_requests_per_batch(max_requests_per_batch); rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_spec_tree_token_num(max_spec_tree_token_num); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer(model_metadata.llm_model_type, model_metadata.bos_token_id, diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index d6f84833be..14cf4eebf7 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -4200,6 +4200,10 @@ def set_max_tokens_per_batch(self, max_tokens): return ffc().flexflow_request_manager_set_max_tokens_per_batch( self.handle, max_tokens) + def set_max_spec_tree_token_num(self, max_tokens): + return ffc().flexflow_request_manager_set_max_spec_tree_token_num( + self.handle, max_tokens) + def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( self.handle, max_length) diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index 025008ec78..e7f3914037 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -21,9 +21,9 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, - #max_tokens_per_batch=64, + # max_batch_size=1, + # max_seq_length=256, + # max_tokens_per_batch=64, weights_filepath="", tokenizer_filepath="", ): diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index e9cd789bcc..7a55da26ef 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -19,11 +19,11 @@ class FalconConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 - self.max_spec_tree_token_num = 64 + self.max_spec_tree_token_num = 20 self.bias = hf_config.bias self.hidden_size = hf_config.hidden_size self.layer_norm_epsilon = hf_config.layer_norm_epsilon @@ -54,8 +54,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -63,15 +63,17 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.falcon_config = FalconConfig(hf_config) - #self.falcon_config.max_seq_length = max_seq_length - #self.falcon_config.max_num_tokens = max_tokens_per_batch + # self.falcon_config.max_seq_length = max_seq_length + # self.falcon_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = max_tokens_per_batch + self.falcon_config.max_spec_tree_token_num + max_verify_tokens_per_batch = ( + max_tokens_per_batch + self.falcon_config.max_spec_tree_token_num + ) # Sanity checks if self.falcon_config.hidden_size % self.falcon_config.n_head != 0: @@ -86,7 +88,11 @@ def __init__( f"Number of q attention heads ({self.falcon_config.n_head}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) + self.build_model( + 
max_tokens_per_batch + if self.mode == InferenceMode.INC_DECODING_MODE + else max_verify_tokens_per_batch + ) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 900ab48bcd..6b33030f62 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,11 +19,11 @@ class LLAMAConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 - self.max_spec_tree_token_num = 64 + self.max_spec_tree_token_num = 20 self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.hidden_size = hf_config.hidden_size @@ -46,8 +46,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -55,16 +55,17 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.llama_config = LLAMAConfig(hf_config) - #self.llama_config.max_seq_length = max_seq_length - #self.llama_config.max_num_tokens = max_tokens_per_batch + # self.llama_config.max_seq_length = max_seq_length + # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = max_tokens_per_batch + self.llama_config.max_spec_tree_token_num - + max_verify_tokens_per_batch = ( + max_tokens_per_batch + self.llama_config.max_spec_tree_token_num + ) # Sanity checks if self.llama_config.hidden_size % self.llama_config.num_attention_heads != 0: @@ -84,7 +85,11 @@ def __init__( f"Number of attention heads ({self.llama_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) + self.build_model( + max_tokens_per_batch + if self.mode == InferenceMode.INC_DECODING_MODE + else max_verify_tokens_per_batch + ) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index c0f995bf22..92867fd498 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,11 +19,11 @@ class MPTConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 - self.max_spec_tree_token_num = 64 + self.max_spec_tree_token_num = 20 self.hidden_size = hf_config.d_model self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers @@ -41,8 +41,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -50,16 +50,17 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = 
data_type self.mpt_config = MPTConfig(hf_config) - #self.mpt_config.max_seq_length = max_seq_length - #self.mpt_config.max_num_tokens = max_tokens_per_batch + # self.mpt_config.max_seq_length = max_seq_length + # self.mpt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = max_tokens_per_batch + self.mpt_config.max_spec_tree_token_num - + max_verify_tokens_per_batch = ( + max_tokens_per_batch + self.mpt_config.max_spec_tree_token_num + ) # Sanity checks if self.mpt_config.hidden_size % self.mpt_config.n_heads != 0: @@ -75,7 +76,11 @@ def __init__( raise ValueError( f"Number of attention heads ({self.mpt_config.n_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) + self.build_model( + max_tokens_per_batch + if self.mode == InferenceMode.INC_DECODING_MODE + else max_verify_tokens_per_batch + ) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index dc3f841a5a..b715f5f35e 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -19,11 +19,11 @@ class OPTConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 - self.max_spec_tree_token_num = 64 + self.max_spec_tree_token_num = 20 self.do_layer_norm_before = hf_config.do_layer_norm_before self.dropout = hf_config.dropout self.enable_bias = hf_config.enable_bias @@ -47,8 +47,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -56,15 +56,17 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.opt_config = OPTConfig(hf_config) - #self.opt_config.max_seq_length = max_seq_length - #self.opt_config.max_num_tokens = max_tokens_per_batch + # self.opt_config.max_seq_length = max_seq_length + # self.opt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = max_tokens_per_batch + self.opt_config.max_spec_tree_token_num + max_verify_tokens_per_batch = ( + max_tokens_per_batch + self.opt_config.max_spec_tree_token_num + ) # Sanity checks if self.opt_config.hidden_size % self.opt_config.num_attention_heads != 0: @@ -84,7 +86,11 @@ def __init__( f"Number of attention heads ({self.opt_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) + self.build_model( + max_tokens_per_batch + if self.mode == InferenceMode.INC_DECODING_MODE + else max_verify_tokens_per_batch + ) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/starcoder.py 
b/python/flexflow/serve/models/starcoder.py index 4a6f191abd..37edaa4c40 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,11 +19,11 @@ class STARCODERConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 - self.max_spec_tree_token_num = 64 + self.max_spec_tree_token_num = 20 self.dropout_p = hf_config.attn_pdrop self.hidden_size = hf_config.n_embd self.layer_norm_epsilon = hf_config.layer_norm_epsilon @@ -45,8 +45,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -54,16 +54,17 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.starcoder_config = STARCODERConfig(hf_config) - #self.starcoder_config.max_seq_length = max_seq_length - #self.starcoder_config.max_num_tokens = max_tokens_per_batch + # self.starcoder_config.max_seq_length = max_seq_length + # self.starcoder_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = max_tokens_per_batch + self.starcoder_config.max_spec_tree_token_num - + max_verify_tokens_per_batch = ( + max_tokens_per_batch + self.starcoder_config.max_spec_tree_token_num + ) # Sanity checks if ( @@ -87,7 +88,11 @@ def __init__( f"Number of attention heads ({self.starcoder_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) + self.build_model( + max_tokens_per_batch + if self.mode == InferenceMode.INC_DECODING_MODE + else max_verify_tokens_per_batch + ) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 5c3cac9303..14555bfc12 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -186,9 +186,11 @@ def download_hf_weights_if_needed(self): os.path.expanduser(self.cache_path), "weights", self.model_name.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) if self.refresh_cache: print( @@ -302,8 +304,6 @@ def compile( ): """Compile the LLM for inference and load the weights into memory - :param mode: The LLM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE - :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional :param max_requests_per_batch: The maximum batch size to allow, defaults to 1 @@ -372,6 +372,13 @@ def compile( # Create file data loader, load weights into tensors model_configs = 
self.config_class(self.hf_config) + self.rm.set_max_spec_tree_token_num( + self.model_configs.max_spec_tree_token_num + if "max_spec_tree_token_num" + in self.model_configs.max_spec_tree_token_num.__dict__ + else 20 + ) + self.fileloader = FileDataLoader( self.weights_path, model_configs.num_attention_heads, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 9ad58695ad..5714c8fe3d 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2582,6 +2582,14 @@ void flexflow_request_manager_set_max_tokens_per_batch( DEBUG_PRINT("[RequestManager] set max_tokens_per_batch %d", max_num_tokens); } +void flexflow_request_manager_set_max_spec_tree_token_num( + flexflow_request_manager_t handle_, int max_num_tokens) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_spec_tree_token_num(max_num_tokens); + DEBUG_PRINT("[RequestManager] set max_spec_tree_token_num %d", + max_num_tokens); +} + void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 42933cee27..a0d31bb6ef 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1125,6 +1125,12 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, } tokens_previous_requests += num_new_tokens; } + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); + } assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } @@ -1352,11 +1358,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * (BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + BatchConfig::max_spec_tree_token_num()); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * (BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + BatchConfig::max_spec_tree_token_num()); break; } default: diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 2d80ed2221..a00ea9c95f 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -382,7 +382,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, + BatchConfig::max_spec_tree_token_num(), /*root*/ curr_depth == 0, m->hidden_size); } @@ -392,7 +392,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_sz = smem_size_in_bytes
(m->qProjSize, \ BatchConfig::max_sequence_length() + \ - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + BatchConfig::max_spec_tree_token_num(), \ THREADS_PER_VALUE, \ THDS_PER_BLOCK); \ compute_spec_inc_attention_kernel_generation_kernelqProjSize, \ m->hidden_size, \ m->request_infos, \ @@ -493,11 +493,11 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int kt_block_size = m->kProjSize; int kt_req_block_size = kt_block_size * m->num_q_heads * (BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + BatchConfig::max_spec_tree_token_num()); int vt_block_size = m->vProjSize; int vt_req_block_size = vt_block_size * m->num_q_heads * (BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + BatchConfig::max_spec_tree_token_num()); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -695,6 +695,12 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_prev_requests_squares += num_new_tokens * total_tokens; } + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); + } assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index fc86e1498e..50c056c816 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -392,7 +392,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, + BatchConfig::max_spec_tree_token_num(), m->hidden_size); } } @@ -528,11 +528,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int kt_block_size = m->kProjSize; int kt_req_block_size = kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; + BatchConfig::max_spec_tree_token_num(); int vt_block_size = m->vProjSize; int vt_req_block_size = vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; + BatchConfig::max_spec_tree_token_num(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -795,7 +795,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ BatchConfig::max_sequence_length() + \ - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + BatchConfig::max_spec_tree_token_num(), \ THDS_PER_VALUE, \ THDS_PER_BLOCK, \ bc, \ @@ -813,7 +813,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, output_ptr, \ scale, \ BatchConfig::max_sequence_length() + \ - BatchConfig::BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + BatchConfig::BatchConfig::max_spec_tree_token_num(), \ BatchConfig::max_tokens_per_batch(), \ m->qProjSize, \ m->hidden_size, \ @@ -847,7 +847,8 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_new_tokens, - BatchConfig::max_sequence_length() + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), m->hidden_size); dim3 grid(m->num_q_heads, bc->num_active_requests()); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index c432208eca..bd96dbb141 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -95,6 +95,10 @@ int BatchConfig::max_sequence_length() { return RequestManager::get_request_manager()->get_max_sequence_length(); } +int BatchConfig::max_spec_tree_token_num() { + return RequestManager::get_request_manager()->get_max_spec_tree_token_num(); +} + std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "@@@@@@@@@@@@@@ Batch Config (mode " << bc.get_mode() << ") @@@@@@@@@@@@@@" << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 46e17d4fdc..f2d6cc0d4c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -54,6 +54,7 @@ RequestManager::RequestManager() // ffmodel.compile() max_requests_per_batch = -1; max_tokens_per_batch = -1; + max_spec_tree_token_num = -1; max_sequence_length = -1; } @@ -75,15 +76,27 @@ void RequestManager::set_max_tokens_per_batch(int max_num_tokens) { assert(max_tokens_per_batch <= BatchConfig::MAX_NUM_TOKENS); } +void RequestManager::set_max_spec_tree_token_num(int max_num_tokens) { + assert(max_spec_tree_token_num == -1 || + max_spec_tree_token_num == max_num_tokens); + max_spec_tree_token_num = max_num_tokens; + assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); +} + int RequestManager::get_max_tokens_per_batch() { assert(max_tokens_per_batch > 0); return max_tokens_per_batch; } +int RequestManager::get_max_spec_tree_token_num() { + assert(max_spec_tree_token_num > 0); + return max_spec_tree_token_num; +} + int RequestManager::get_max_verify_tokens_per_batch() { assert(max_tokens_per_batch > 0); return max_tokens_per_batch + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM * max_requests_per_batch; + max_spec_tree_token_num * max_requests_per_batch; } void RequestManager::set_max_sequence_length(int max_seq_length) { From c85668066afabf804fa21bcc16595cf08233ec22 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 13 Mar 2024 19:46:25 +0000 Subject: [PATCH 322/344] fix --- inference/spec_infer/spec_infer.cc | 2 +- src/runtime/request_manager.cc | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 36d54eee64..3ff32ef3ea 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -275,7 +275,7 @@ void FlexFlow::top_level_task(Task const *task, int max_requests_per_batch = 16; int max_tokens_per_batch = 256; int max_sequence_length = 1024; - int 
max_spec_tree_token_num = 20; + int max_spec_tree_token_num = 23; int expansion_degree = 3; InputArgs const &command_args = HighLevelRuntime::get_input_args(); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index f2d6cc0d4c..16513e918a 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1577,9 +1577,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; + printf("Exceeding (%i) the space available (%i) in the TreeVerify " + "batch\n", + new_bc.num_tokens, + get_max_verify_tokens_per_batch()); + assert(false); } if (new_bc.requestsInfo[i].num_tokens_in_batch + From 8d82c91a8417f88c95389206752beb9741f93259 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 14 Mar 2024 00:52:03 +0000 Subject: [PATCH 323/344] fix --- src/mapper/mapper.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d46bfc2877..d7aac4e37c 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -501,7 +501,9 @@ void FFMapper::map_task(const MapperContext ctx, output.task_priority = 0; output.postmap_task = false; if (task.target_proc.address_space() != node_id) { - assert(false); + if (enable_control_replication) { + assert(false); + } output.target_procs.push_back(task.target_proc); } else if (task.target_proc.kind() == Processor::TOC_PROC) { output.target_procs.push_back(task.target_proc); From 0479a64c66fc9002f7d674184da2e3a82e96a393 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 14 Mar 2024 03:56:41 +0000 Subject: [PATCH 324/344] fix --- inference/spec_infer/spec_infer.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 3ff32ef3ea..b6c1e408cd 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -314,6 +314,8 @@ void FlexFlow::top_level_task(Task const *task, // first decoding step: 3 results if (expansion_degree != -1) { + rm->push_spec_infer_tree_width(1); + rm->push_spec_infer_tree_width(1); rm->push_spec_infer_tree_width(expansion_degree); } From 5bd71236c76ac497466602550b1bc9de884fd1b3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 30 Mar 2024 14:09:26 -0400 Subject: [PATCH 325/344] run CI per commit only on inference branch --- .github/workflows/gpu-ci.yml | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 48dcda157e..7bdb6805a8 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -1,25 +1,8 @@ name: "gpu-ci" on: - pull_request: - paths: - - "cmake/**" - - "config/**" - - "deps/**" - - "python/**" - - "setup.py" - - "include/**" - - "inference/**" - - "src/**" - - "tests/inference/**" - - "conda/flexflow.yml" - - ".github/workflows/gpu-ci.yml" - - "tests/cpp_gpu_tests.sh" - - "tests/inference_tests.sh" - - "tests/training_tests.sh" - - "tests/python_interface_test.sh" push: branches: - - "master" + - "inference" paths: - "cmake/**" - "config/**" @@ -194,7 +177,7 @@ jobs: - name: Save inference output as an artifact if: always() - run: | + run: | cd inference tar -zcvf output.tar.gz ./output From e0a6e4fee228ca31a74e69dd84d73e01762214a1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 30 Mar 2024 14:29:47 -0400 Subject: [PATCH 
326/344] fix --- python/flexflow/serve/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 14555bfc12..cbc4122897 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -375,7 +375,7 @@ def compile( self.rm.set_max_spec_tree_token_num( self.model_configs.max_spec_tree_token_num if "max_spec_tree_token_num" - in self.model_configs.max_spec_tree_token_num.__dict__ + in self.model_configs.__dict__ else 20 ) From 1210256080072935fecd71dbf7cbfb31d9f99efa Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+aetiurf@users.noreply.github.com> Date: Sat, 6 Apr 2024 22:02:15 +0800 Subject: [PATCH 327/344] fix: 'model_configs' AttributeError (#1358) --- python/flexflow/serve/serve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index cbc4122897..ac622b3337 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -373,9 +373,9 @@ def compile( model_configs = self.config_class(self.hf_config) self.rm.set_max_spec_tree_token_num( - self.model_configs.max_spec_tree_token_num + model_configs.max_spec_tree_token_num if "max_spec_tree_token_num" - in self.model_configs.__dict__ + in model_configs.__dict__ else 20 ) From b4a639c8990f2d031ee4938f3e7dc8140e4eb324 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 7 Apr 2024 23:26:53 -0400 Subject: [PATCH 328/344] Changes to support Perlmutter environment (#1360) * . * remove deadcode * add benchmarking mode, initializing weights randomly * better logging when running out of memory * update --------- Co-authored-by: Gabriele Oliaro --- cmake/cuda.cmake | 15 ++- config/config.inc | 12 +- config/config.linux | 14 ++- include/flexflow/config.h | 2 +- inference/incr_decoding/incr_decoding.cc | 4 +- inference/models/falcon.cc | 20 ---- inference/models/llama.cc | 10 -- inference/models/mpt.cc | 15 --- inference/models/opt.cc | 18 --- inference/models/starcoder.cc | 10 -- inference/python/incr_decoding.py | 3 +- inference/python/spec_infer.py | 3 +- inference/spec_infer/spec_infer.cc | 4 +- inference/utils/download_hf_model.py | 4 +- python/flexflow/core/__init__.py | 1 + python/flexflow/serve/__init__.py | 8 ++ src/mapper/mapper.cc | 46 ++++++-- src/runtime/file_loader.cc | 109 ++++++++++-------- src/runtime/model.cc | 8 +- .../python_test_configs/generate_configs.py | 3 +- 20 files changed, 159 insertions(+), 150 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 68e4ca07b1..45ecc1798b 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -13,8 +13,19 @@ if(CUDA_FOUND) # set cuda runtime and driver lib # override cublas and curand because the FindCUDA module may not find the correct libs set(CUDADRV_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda${LIBEXT}) - set(CUDA_CUBLAS_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas${LIBEXT}) - set(CUDA_curand_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand${LIBEXT}) + if(CUBLAS_PATH) + set(CUBLAS_ROOT ${CUBLAS_PATH}) + else() + set(CUBLAS_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + endif() + set(CUDA_CUBLAS_LIBRARIES ${CUBLAS_ROOT}/lib64/libcublas${LIBEXT}) + if(CURAND_PATH) + set(CURAND_ROOT ${CURAND_PATH}) + else() + set(CURAND_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + endif() + set(CUDA_curand_LIBRARY ${CURAND_ROOT}/lib64/libcurand${LIBEXT}) + list(APPEND FLEXFLOW_EXT_LIBRARIES ${CUDADRV_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} diff --git a/config/config.inc b/config/config.inc 
index 1121c114c4..7d7b2db9cf 100644 --- a/config/config.inc +++ b/config/config.inc @@ -62,6 +62,16 @@ if [ -n "$CUDA_DIR" ]; then SET_CUDA_LIB_PATH="CUDA_PATH=${CUDA_PATH}" fi +# set cublas dir +if [ -n "$CUBLAS_DIR" ]; then + SET_CUBLAS="-DCUBLAS_PATH=${CUBLAS_DIR}" +fi + +# set curand dir +if [ -n "$CURAND_DIR" ]; then + SET_CURAND="-DCURAND_PATH=${CURAND_DIR}" +fi + # set cudnn dir if [ -n "$CUDNN_DIR" ]; then SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}" @@ -231,7 +241,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUBLAS} ${SET_CURAND} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 30edfa7dfe..acffc210f5 100755 --- a/config/config.linux +++ b/config/config.linux @@ -36,12 +36,18 @@ FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # or all available architectures. 
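The config.inc hunk above forwards `-DCUBLAS_PATH` / `-DCURAND_PATH` to CMake only when `CUBLAS_DIR` / `CURAND_DIR` are set, and config.linux (next) defaults both to the CUDA toolkit directory. A rough Python restatement of that shell logic, purely for illustration (the helper name is made up; the variable names come from the scripts in this patch):

```python
import os

def cuda_lib_cmake_flags(cuda_dir: str = "/usr/local/cuda") -> list:
    # CUBLAS_DIR / CURAND_DIR fall back to the CUDA toolkit dir, then each one
    # is turned into the matching -D<NAME>_PATH flag consumed by cmake/cuda.cmake.
    flags = []
    for env_var, cmake_var in (("CUBLAS_DIR", "CUBLAS_PATH"),
                               ("CURAND_DIR", "CURAND_PATH")):
        value = os.environ.get(env_var, cuda_dir)
        if value:
            flags.append(f"-D{cmake_var}={value}")
    return flags

print(" ".join(cuda_lib_cmake_flags()))
```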
TODO: support autodetect FF_HIP_ARCH=${FF_HIP_ARCH:-"all"} -# set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} - # set CUDA dir in case cmake cannot autodetect a path CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} +# set CUBLAS dir in case it is not stored in the CUDA DIR +CUBLAS_DIR=${CUBLAS_DIR:-"/usr/local/cuda"} + +# set CURAND dir in case it is not stored in the CUDA DIR +CURAND_DIR=${CURAND_DIR:-"/usr/local/cuda"} + +# set CUDNN dir in case cmake cannot autodetect a path +CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} + # if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib, # otherwise, we will build nccl from source NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"} @@ -102,7 +108,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 17a3f59e29..2c11ae1131 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -145,7 +145,7 @@ class FFConfig { Legion::Runtime *lg_hlr; Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; - bool syntheticInput, profiling, perform_fusion; + bool benchmarking, profiling, perform_fusion; bool inference_debugging; size_t simulator_work_space_size; size_t search_budget; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index f88af3bc43..aae7256ffe 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -107,7 +107,9 @@ void parse_input_args(char **argv, } } if (paths.cache_folder_path.empty()) { - paths.cache_folder_path = "~/.cache/flexflow"; + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); } // Expand ~ to the home directory if needed wordexp_t p; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index e00f4e9cfd..a529411ddb 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -252,26 +252,6 @@ void FALCON::create_falcon_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - falcon_config.n_head, - falcon_config.n_head_kv, - falcon_config.hidden_size, - falcon_config.hidden_size / falcon_config.n_head, - ff.config.tensor_parallelism_degree); - std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 14b8c31fa1..517f534438 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -277,16 +277,6 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 7e8fc8358f..70e2b5e9c5 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -259,21 +259,6 @@ void MPT::create_mpt_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - mpt_config.n_heads, - mpt_config.n_heads, - mpt_config.hidden_size, - mpt_config.hidden_size / mpt_config.n_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 3ff4c96fdf..5677d5658e 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -266,24 +266,6 @@ void OPT::create_opt_model(FFModel &ff, use_full_precision); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - opt_config.num_attention_heads, - opt_config.num_attention_heads, - opt_config.hidden_size, - 
opt_config.hidden_size / - opt_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------finished loading weights----------" << std::endl; - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 2327c86119..8b0dc1098c 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -232,16 +232,6 @@ void STARCODER::create_starcoder_model( ff.config.tensor_parallelism_degree, use_full_precision); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index f7707816c8..05599ea6b9 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -55,6 +55,7 @@ def get_configs(): "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -62,7 +63,7 @@ def get_configs(): # required parameters "llm_model": "tiiuae/falcon-7b", # optional parameters - "cache_path": "", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, "prompt": "", diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index fcb1b8f891..a6dfa8042e 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -55,6 +55,7 @@ def get_configs(): "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -62,7 +63,7 @@ def get_configs(): # required llm arguments "llm_model": "meta-llama/Llama-2-7b-hf", # optional llm parameters - "cache_path": "", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, "ssms": [ diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index b6c1e408cd..f7edfd7696 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -124,7 +124,9 @@ void parse_input_args(char **argv, } } if (paths.cache_folder_path.empty()) { - paths.cache_folder_path = "~/.cache/flexflow"; + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); } // Expand ~ to the home directory if needed wordexp_t p; diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 94a8c23e68..7b4f4d6fb0 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -1,6 +1,6 @@ #!/usr/bin/env python import flexflow.serve as ff -import argparse +import argparse, os def parse_args(): @@ -12,7 +12,7 @@ def parse_args(): "--cache-folder", type=str, help="Folder to use to store the model(s) assets in FlexFlow format", - default="", + default=os.environ.get("FF_CACHE_PATH", ""), ) parser.add_argument( "--refresh-cache", diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index d7b1a595d2..2820cf485a 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -41,6 +41,7 @@ "num_cpus": "-ll:cpu", "legion_utility_processors": "-ll:util", "profiling": "--profiling", + "benchmarking": "--benchmarking", "inference_debugging": "--inference-debugging", "fusion": "--fusion", "disable_control_replication": "--disable-control-replication", diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index cf467280bd..5af077273d 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -45,6 +45,7 @@ def init( use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, profiling: Optional[bool] = None, + benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, fusion: Optional[bool] = None, ): @@ -72,6 +73,7 @@ def init( - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - profiling: whether to enable the FlexFlow profiling mode, defaults to False + - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True @@ -106,6 +108,8 @@ def init( :type use_8bit_quantization: Optional[bool], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional + :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False + :type benchmarking: Optional[bool], optional :param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False :type inference_debugging: Optional[bool], optional :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True @@ -132,6 +136,7 @@ def init( use_4bit_quantization is not None, use_8bit_quantization is not None, profiling is not None, + benchmarking is not None, inference_debugging is not None, fusion is not None, ] @@ -157,6 +162,7 @@ def init( "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, "profiling": profiling, + "benchmarking": benchmarking, "inference_debugging": inference_debugging, "fusion": fusion, } @@ -201,6 +207,8 @@ def init( configs_dict["use_8bit_quantization"] = False if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False + if configs_dict.get("benchmarking", None) is None: + configs_dict["benchmarking"] = 
False if configs_dict.get("inference_debugging", None) is None: configs_dict["inference_debugging"] = False if configs_dict.get("fusion", None) is None: diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7aac4e37c..c293aecb19 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -487,6 +487,25 @@ void FFMapper::premap_task(const MapperContext ctx, assert(false); } +std::string humanReadableSize(size_t size, bool mb = false) { + assert(size >= 0); + char const *units[] = {"B", "KiB", "MiB", "GiB", "TiB"}; + int i = 0; + double finalSize = size; + if (mb) { + finalSize /= 1024 * 1024; + i = 2; + } else { + while (finalSize >= 1024 && i < 4) { + finalSize /= 1024; + i++; + } + } + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%.2lf %s", finalSize, units[i]); + return std::string(buffer); +} + void FFMapper::map_task(const MapperContext ctx, Task const &task, MapTaskInput const &input, @@ -637,16 +656,19 @@ void FFMapper::map_task(const MapperContext ctx, } // Report failed to creation log_ff_mapper.error( - "FlexFlow failed allocation of size %zd bytes for " - "region requirement %d of task %s (UID %lld) in memory " IDFMT - " with kind %d for processor " IDFMT ".", - footprint, + "Out of memory! FlexFlow failed to reserve block of size %s" + " for region requirement %d of task %s (UID %lld) in %s memory (id: " + "%llx)" + " for processor id: %llx." + " Total pre-allocated memory capacity of this kind: %s.", + humanReadableSize(footprint).c_str(), idx, task.get_task_name(), task.get_unique_id(), + Legion::Mapping::Utilities::to_string(target_mem.kind()), target_mem.id, - target_mem.kind(), - task.target_proc.id); + task.target_proc.id, + humanReadableSize(target_mem.capacity(), true).c_str()); assert(false); } else { output.chosen_instances[idx].push_back(result); @@ -929,15 +951,17 @@ void FFMapper::map_inline(const MapperContext ctx, created, &footprint)) { log_ff_mapper.error( - "FlexFlow Mapper failed allocation of size %zd bytes" + "Out of memory! FlexFlow failed to reserve block of size %s" " for region requirement of inline mapping in task %s (UID %lld)" - " in memory " IDFMT "for processor " IDFMT ".", - footprint, + " in %s memory (id: %llx) for processor id: %llx." 
+ " Total pre-allocated memory capacity of this kind: %s.", + humanReadableSize(footprint).c_str(), inline_op.parent_task->get_task_name(), inline_op.parent_task->get_unique_id(), + Legion::Mapping::Utilities::to_string(target_memory.kind()), target_memory.id, - inline_op.parent_task->current_proc.id); - printf("target_memory.kind() = %d\n", target_memory.kind()); + inline_op.parent_task->current_proc.id, + humanReadableSize(target_memory.capacity(), true).c_str()); assert(false); } else { output.chosen_instances.push_back(result); diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 56558b3185..43ce9d7005 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -725,60 +725,69 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, std::string weight_filename = removeGuidOperatorName(std::string(l->name)); - if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); + if (ff->config.benchmarking) { + std::cout << "Initializing weight " << weight_filename + << " with random data (benchmarking mode)" << std::endl; + // If benchmarking, we don't need to load the weights + // We can just fill the weight tensor with random data + } else { + if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { + if (weight_filename.find("self_attention") != std::string::npos) { + load_attention_weights_multi_query( + data, weight_filename, weights_folder, hidden_dim, num_heads); + } else if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); + } else { + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); + } + } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); + assert(false); } - + } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { + assert(weight_idx >= 0 || weight_idx <= 2); + weight_filename += (weight_idx == 0) + ? "_attn_bias" + : ((weight_idx == 1) ? 
"_weight" : "_bias"); + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = + join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } else { - assert(false); - } - } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { - assert(weight_idx >= 0 || weight_idx <= 2); - weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); - std::cout << "Loading weight file " << weight_filename << std::endl; - std::string weight_filepath = join_path({weights_folder, weight_filename}); - load_from_file(data, volume, weight_filepath); - } else { - // default op - assert(weight_idx == 0 || weight_idx == 1); - // handle exception - if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + // default op + assert(weight_idx == 0 || weight_idx == 1); + // handle exception + if (weight_filename != "embed_tokens_weight_lm_head") { + weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + } + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = + join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } - std::cout << "Loading weight file " << weight_filename << std::endl; - std::string weight_filepath = join_path({weights_folder, weight_filename}); - load_from_file(data, volume, weight_filepath); } // Copy the weight data from the buffer to the weight's ParallelTensor diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 40f758282c..1fa281777a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4065,6 +4065,7 @@ struct DefaultConfig { // const static int iterations = 1; const static int batchSize = 64; const static bool profiling = false; + const static bool benchmarking = false; const static bool inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; @@ -4100,6 +4101,7 @@ FFConfig::FFConfig() { // iterations = DefaultConfig::iterations; batchSize = DefaultConfig::batchSize; profiling = DefaultConfig::profiling; + benchmarking = DefaultConfig::benchmarking; inference_debugging = DefaultConfig::inference_debugging; learningRate = DefaultConfig::learningRate; weightDecay = DefaultConfig::weightDecay; @@ -4137,7 +4139,7 @@ FFConfig::FFConfig() { export_strategy_computation_graph_file = ""; dataset_path = ""; substitution_json_path = tl::nullopt; - syntheticInput = false; + benchmarking = false; perform_fusion = false; base_optimize_threshold = DefaultConfig::base_optimize_threshold; perform_memory_search = false; @@ -4290,6 +4292,10 @@ void FFConfig::parse_args(char **argv, int argc) { profiling = true; continue; } + if (!strcmp(argv[i], "--benchmarking")) { + benchmarking = true; + continue; + } if (!strcmp(argv[i], "--inference-debugging")) { inference_debugging = true; continue; diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index ebaadade32..41703cf431 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -18,6 +18,7 @@ "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -25,7 +26,7 @@ # required parameters "llm_model": "tiiuae/falcon-7b", # optional parameters - "cache_path": "", + 
"cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": True, "prompt": "", From 7da197e71e31a1840d9404a63d5a9fdd20d4d41e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 23 Apr 2024 20:26:33 -0400 Subject: [PATCH 329/344] update workflow to build rocm docker images --- .github/workflows/docker-build.yml | 58 +++++++++++++----------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 54805cc325..d16179434b 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -20,26 +20,22 @@ concurrency: cancel-in-progress: true jobs: - oracle-runner-start: - name: Start an Oracle instance to build the ROCM Docker images + rocm-builder-start: + name: Start an AWS instance to build the ROCM Docker images runs-on: ubuntu-latest if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: - OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} - OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} - OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} - OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} - OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} - OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }} + ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }} steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 - - name: Install Oracle Cloud Infrastructure library - run: pip install oci - - - name: Start Oracle Machine - run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID + - name: Start EC2 instance + run: aws ec2 start-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID docker-build-rocm: name: Build and Install FlexFlow in a Docker Container (ROCm backend) @@ -66,8 +62,8 @@ jobs: docker-build-and-publish-rocm: name: Build and Deploy FlexFlow Docker Containers (ROCm backend) - needs: oracle-runner-start - runs-on: [self-hosted, cpu_only] + needs: rocm-builder-start + runs-on: [self-hosted, rocm_builder] if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} strategy: matrix: @@ -148,27 +144,23 @@ jobs: ./docker/publish.sh flexflow-environment ./docker/publish.sh flexflow - oracle-runner-stop: + rocm-builder-stop: needs: docker-build-and-publish-rocm if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} runs-on: ubuntu-latest - name: Stop the Oracle instance we used to build the ROCM Docker images + name: Stop the AWS instance we used to build the ROCM Docker images env: - OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} - OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} - OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} - OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} - OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} - OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }} + ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }} steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - - - name: Install Oracle Cloud 
Infrastructure library - run: pip install oci - - - name: Stop Oracle Machine - run: python3 .github/workflows/helpers/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + + - name: Start EC2 instance + run: aws ec2 stop-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID notify-slack: name: Notify Slack in case of failure From 002fdf017c7dd665b703da37494093161c3d55c7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 23 Apr 2024 22:35:42 -0400 Subject: [PATCH 330/344] downgrade to python 3.11 for now --- docker/flexflow-environment/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index edbf9a7e52..6ca337f58d 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -17,7 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binut # Install Python3 with Miniconda ARG python_version "latest" -RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ +#RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ +RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "latest" ]; then \ echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11 or latest (default)}"; \ exit 1; \ From d54e4b6a747f3940a19989a56095a71540e4c0d8 Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com> Date: Wed, 1 May 2024 01:51:57 +0800 Subject: [PATCH 331/344] doc: fix c++ serving example (#1372) Co-authored-by: Gabriele Oliaro --- .github/README.md | 2 +- SERVE.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/README.md b/.github/README.md index 4a2a881c8d..c4f6baada6 100644 --- a/.github/README.md +++ b/.github/README.md @@ -178,7 +178,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. ```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` diff --git a/SERVE.md b/SERVE.md index e9bab3d702..9472d50a62 100644 --- a/SERVE.md +++ b/SERVE.md @@ -126,7 +126,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. 
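The same deployment can also be driven from the Python serve API; below is a hedged sketch of how the flags in the C++ command shown next map onto `ff.init()` keyword arguments. The `num_cpus` -> `-ll:cpu` mapping is registered in python/flexflow/core/__init__.py earlier in this series; the remaining keyword names follow inference/python/spec_infer.py and should be treated as assumptions (memory sizes in MB):

```python
import flexflow.serve as ff

# -ll:gpu -> num_gpus, -ll:cpu -> num_cpus, -ll:fsize -> memory_per_gpu,
# -ll:zsize -> zero_copy_memory_per_node, --fusion -> fusion
ff.init(num_gpus=4, num_cpus=4, memory_per_gpu=14000,
        zero_copy_memory_per_node=30000, tensor_parallelism_degree=4,
        fusion=True)

llm = ff.LLM("meta-llama/Llama-2-7b-hf")   # target model
ssm = ff.SSM("JackFram/llama-68m")         # draft model for speculation
# See inference/python/spec_infer.py for the compile()/generate() calls that
# wire the SSM into speculative decoding; they are omitted here.
```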
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` From b90771a376fddbddf09af3f23e4ecae57911438e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 30 May 2024 14:24:42 -0700 Subject: [PATCH 332/344] Update README.md --- .github/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/README.md b/.github/README.md index c4f6baada6..5aba2295d5 100644 --- a/.github/README.md +++ b/.github/README.md @@ -4,12 +4,6 @@ --- -## News🔥: - -* [09/02/2023] Adding AMD GPU support, released Docker images for ROCM 5.3->5.6 -* [08/16/2023] Adding Starcoder model support -* [08/14/2023] Released Docker images for different CUDA versions - ## What is FlexFlow Serve The high computational and memory requirements of generative large language From 385c118447a8b1451de3641c8ecf437245b9248b Mon Sep 17 00:00:00 2001 From: FelixBrakel Date: Thu, 30 May 2024 23:39:10 +0200 Subject: [PATCH 333/344] Add examples for every layer in the python layer API (#1297) * Fix incorrect innode being checked * Add example for every layer on the FFModel python class --------- Co-authored-by: Gabriele Oliaro Co-authored-by: Zhihao Jia --- docs/source/python/layers.rst | 2 +- examples/python/native/ops/add.py | 45 ++++++++ .../ops/add_bias_residual_layer_norm.py | 78 +++++++++++++ examples/python/native/ops/arg_top_k.py | 61 ++++++++++ examples/python/native/ops/argmax.py | 55 +++++++++ examples/python/native/ops/batch_matmul.py | 0 examples/python/native/ops/batch_norm.py | 36 ++++++ examples/python/native/ops/beam_top_k.py | 58 ++++++++++ examples/python/native/ops/concat.py | 43 +++++++ examples/python/native/ops/conv2d.py | 45 ++++++++ examples/python/native/ops/cos.py | 44 +++++++ examples/python/native/ops/dense.py | 38 +++++++ examples/python/native/ops/divide.py | 48 ++++++++ examples/python/native/ops/dropout.py | 49 ++++++++ examples/python/native/ops/elu.py | 47 ++++++++ examples/python/native/ops/embedding.py | 39 +++++++ examples/python/native/ops/exp.py | 0 examples/python/native/ops/flat.py | 0 examples/python/native/ops/gather.py | 60 ++++++++++ examples/python/native/ops/gelu.py | 51 +++++++++ examples/python/native/ops/identity.py | 49 ++++++++ .../ops/inc_multihead_self_attention.py | 103 +++++++++++++++++ .../inc_multihead_self_attention_verify.py | 103 +++++++++++++++++ .../ops/inc_multiquery_self_attention.py | 107 ++++++++++++++++++ .../inc_multiquery_self_attention_verify.py | 107 ++++++++++++++++++ examples/python/native/ops/layer_norm.py | 48 ++++++++ examples/python/native/ops/max.py | 54 +++++++++ examples/python/native/ops/mean.py | 48 ++++++++ examples/python/native/ops/min.py | 54 +++++++++ .../python/native/ops/multihead_attention.py | 0 examples/python/native/ops/multiply.py | 45 ++++++++ examples/python/native/ops/pool2d.py | 36 ++++++ examples/python/native/ops/pow.py | 46 ++++++++ examples/python/native/ops/reduce_sum.py | 48 ++++++++ examples/python/native/ops/relu.py | 46 ++++++++ examples/python/native/ops/reshape.py | 41 +++++++ .../python/native/ops/residual_layer_norm.py | 93 +++++++++++++++ .../python/native/ops/residual_rms_norm.py | 80 +++++++++++++ examples/python/native/ops/reverse.py 
| 37 ++++++ examples/python/native/ops/rms_norm.py | 64 +++++++++++ examples/python/native/ops/rsqrt.py | 44 +++++++ examples/python/native/ops/sampling.py | 55 +++++++++ examples/python/native/ops/scalar_add.py | 53 +++++++++ examples/python/native/ops/scalar_multiply.py | 53 +++++++++ examples/python/native/ops/scalar_sub.py | 53 +++++++++ .../python/native/ops/scalar_true_divide.py | 53 +++++++++ examples/python/native/ops/sigmoid.py | 46 ++++++++ .../python/native/ops/sigmoid_silu_multi.py | 58 ++++++++++ examples/python/native/ops/sin.py | 44 +++++++ examples/python/native/ops/softmax.py | 46 ++++++++ .../ops/spec_inc_multihead_self_attention.py | 103 +++++++++++++++++ .../ops/spec_inc_multiquery_self_attention.py | 107 ++++++++++++++++++ examples/python/native/ops/split.py | 47 ++++++++ examples/python/native/ops/subtract.py | 45 ++++++++ examples/python/native/ops/tanh.py | 46 ++++++++ examples/python/native/ops/transpose.py | 38 +++++++ 56 files changed, 2898 insertions(+), 1 deletion(-) create mode 100644 examples/python/native/ops/add.py create mode 100644 examples/python/native/ops/add_bias_residual_layer_norm.py create mode 100644 examples/python/native/ops/arg_top_k.py create mode 100644 examples/python/native/ops/argmax.py create mode 100644 examples/python/native/ops/batch_matmul.py create mode 100644 examples/python/native/ops/batch_norm.py create mode 100644 examples/python/native/ops/beam_top_k.py create mode 100644 examples/python/native/ops/concat.py create mode 100644 examples/python/native/ops/conv2d.py create mode 100644 examples/python/native/ops/cos.py create mode 100644 examples/python/native/ops/dense.py create mode 100644 examples/python/native/ops/divide.py create mode 100644 examples/python/native/ops/dropout.py create mode 100644 examples/python/native/ops/elu.py create mode 100644 examples/python/native/ops/embedding.py create mode 100644 examples/python/native/ops/exp.py create mode 100644 examples/python/native/ops/flat.py create mode 100644 examples/python/native/ops/gather.py create mode 100644 examples/python/native/ops/gelu.py create mode 100644 examples/python/native/ops/identity.py create mode 100644 examples/python/native/ops/inc_multihead_self_attention.py create mode 100644 examples/python/native/ops/inc_multihead_self_attention_verify.py create mode 100644 examples/python/native/ops/inc_multiquery_self_attention.py create mode 100644 examples/python/native/ops/inc_multiquery_self_attention_verify.py create mode 100644 examples/python/native/ops/layer_norm.py create mode 100644 examples/python/native/ops/max.py create mode 100644 examples/python/native/ops/mean.py create mode 100644 examples/python/native/ops/min.py create mode 100644 examples/python/native/ops/multihead_attention.py create mode 100644 examples/python/native/ops/multiply.py create mode 100644 examples/python/native/ops/pool2d.py create mode 100644 examples/python/native/ops/pow.py create mode 100644 examples/python/native/ops/reduce_sum.py create mode 100644 examples/python/native/ops/relu.py create mode 100644 examples/python/native/ops/reshape.py create mode 100644 examples/python/native/ops/residual_layer_norm.py create mode 100644 examples/python/native/ops/residual_rms_norm.py create mode 100644 examples/python/native/ops/reverse.py create mode 100644 examples/python/native/ops/rms_norm.py create mode 100644 examples/python/native/ops/rsqrt.py create mode 100644 examples/python/native/ops/sampling.py create mode 100644 examples/python/native/ops/scalar_add.py create mode 
100644 examples/python/native/ops/scalar_multiply.py create mode 100644 examples/python/native/ops/scalar_sub.py create mode 100644 examples/python/native/ops/scalar_true_divide.py create mode 100644 examples/python/native/ops/sigmoid.py create mode 100644 examples/python/native/ops/sigmoid_silu_multi.py create mode 100644 examples/python/native/ops/sin.py create mode 100644 examples/python/native/ops/softmax.py create mode 100644 examples/python/native/ops/spec_inc_multihead_self_attention.py create mode 100644 examples/python/native/ops/spec_inc_multiquery_self_attention.py create mode 100644 examples/python/native/ops/split.py create mode 100644 examples/python/native/ops/subtract.py create mode 100644 examples/python/native/ops/tanh.py create mode 100644 examples/python/native/ops/transpose.py diff --git a/docs/source/python/layers.rst b/docs/source/python/layers.rst index 91f12094e6..1be91a8b17 100644 --- a/docs/source/python/layers.rst +++ b/docs/source/python/layers.rst @@ -3,7 +3,7 @@ Layers API ********** Layers are the basic building blocks of neural networks in FlexFlow. The inputs of a layer consists of a tensor or a list of tensors and some state variables, -and the outputs of a layer is a tensor or a list of tensors. +and the outputs of a layer is a tensor or a list of tensors. See https://github.com/flexflow/FlexFlow/examples/python/native/ops for an example for every layer .. automodule:: flexflow.core.flexflow_cffi :noindex: diff --git a/examples/python/native/ops/add.py b/examples/python/native/ops/add.py new file mode 100644 index 0000000000..50b9d16fd0 --- /dev/null +++ b/examples/python/native/ops/add.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'add' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
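Every example added in this patch follows the same scaffold, which add.py below shows first: create the input tensor(s), attach a single operator, compile with an SGD optimizer, push one batch through `forward()`, and read the result back with `inline_map()`/`get_array()`. As a reading aid, that shared skeleton in isolation (the `run_single_op` helper is illustrative only, not part of the PR):

```python
import numpy as np
from flexflow.core import *

def run_single_op(ffconfig, apply_op, input_arr: np.ndarray):
    # Shared skeleton of the op examples in this patch: build a model, attach
    # one operator, compile, feed a single batch, run forward, read the output.
    ffmodel = FFModel(ffconfig)
    input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
    out = apply_op(ffmodel, input_tensor)
    ffmodel.optimizer = SGDOptimizer(ffmodel, 0.001)
    ffmodel.compile(
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[MetricsType.METRICS_ACCURACY,
                 MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
    dataloader = ffmodel.create_data_loader(input_tensor, input_arr)
    ffmodel.init_layers()
    dataloader.reset()
    dataloader.next_batch(ffmodel)
    ffmodel.forward()
    out.inline_map(ffmodel, ffconfig)
    return out.get_array(ffmodel, ffconfig)

if __name__ == "__main__":
    init_flexflow_runtime()
    ffconfig = FFConfig()
    x = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
    print(run_single_op(ffconfig, lambda m, t: m.relu(t), x))
```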
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_add(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.add(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_add(ffconfig, input1, input2) diff --git a/examples/python/native/ops/add_bias_residual_layer_norm.py b/examples/python/native/ops/add_bias_residual_layer_norm.py new file mode 100644 index 0000000000..6e8dffbc9e --- /dev/null +++ b/examples/python/native/ops/add_bias_residual_layer_norm.py @@ -0,0 +1,78 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_add_bias_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + residual_tensor = ffmodel.create_tensor(residual_arr.shape, DataType.DT_FLOAT) + + output_tensor, layer_norm_output = ffmodel.add_bias_residual_layer_norm( + input_tensor, + residual_tensor, + axes=axes, + elementwise_affine=elementwise_affine, + eps=eps, + use_bias=use_bias, + name="add_bias_residual_layer_norm_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_residual = ffmodel.create_data_loader(residual_tensor, residual_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_residual.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_residual.next_batch(ffmodel) + + ffmodel.forward() + + output_tensor.inline_map(ffmodel, ffconfig) + layer_norm_output.inline_map(ffmodel, ffconfig) + output_result = output_tensor.get_array(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return output_result, layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + axes_to_normalize = [1, 2] # Example 
axes to normalize + + output_result, layer_norm_result = test_add_bias_residual_layer_norm( + ffconfig, + input_data, + residual_data, + axes=axes_to_normalize, + elementwise_affine=True, + eps=1e-5, + use_bias=True + ) + + print("Input Array:") + print(input_data) + print("\nResidual Array:") + print(residual_data) + print(f"\nOutput Array after applying add_bias_residual_layer_norm along axes {axes_to_normalize}:") + print(output_result) + print("\nLayer Norm Result:") + print(layer_norm_result) diff --git a/examples/python/native/ops/arg_top_k.py b/examples/python/native/ops/arg_top_k.py new file mode 100644 index 0000000000..79edc5dfad --- /dev/null +++ b/examples/python/native/ops/arg_top_k.py @@ -0,0 +1,61 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_arg_top_k(ffconfig, input_arr: np.ndarray, k: int, sorted: bool, speculative_decoding: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + arg_top_k_output = ffmodel.arg_top_k( + input_tensor, + k, + sorted, + speculative_decoding, + name="arg_top_k_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_MEAN_SQUARED_ERROR, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + arg_top_k_output.inline_map(ffmodel, ffconfig) + output_result = arg_top_k_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + k_value = 5 + sorted_value = True + speculative_decoding_value = False # Example value for speculative_decoding + + output_result = test_arg_top_k( + ffconfig, + input_data, + k=k_value, + sorted=sorted_value, + speculative_decoding=speculative_decoding_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying arg_top_k:") + print(output_result) diff --git a/examples/python/native/ops/argmax.py b/examples/python/native/ops/argmax.py new file mode 100644 index 0000000000..dda0e6b0bc --- /dev/null +++ b/examples/python/native/ops/argmax.py @@ -0,0 +1,55 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_argmax(ffconfig, input_arr: np.ndarray, beam_search: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + argmax_output = ffmodel.argmax( + input_tensor, + beam_search, + name="argmax_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + argmax_output.inline_map(ffmodel, ffconfig) + output_result = argmax_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + 
beam_search_value = True # Set to True or False based on your requirement + + output_result = test_argmax( + ffconfig, + input_data, + beam_search=beam_search_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying argmax:") + print(output_result) diff --git a/examples/python/native/ops/batch_matmul.py b/examples/python/native/ops/batch_matmul.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/batch_norm.py b/examples/python/native/ops/batch_norm.py new file mode 100644 index 0000000000..b243e79d37 --- /dev/null +++ b/examples/python/native/ops/batch_norm.py @@ -0,0 +1,36 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def ff(ffconfig, input_arr: np.ndarray): + ffmodel = FFModel(ffconfig) + # TODO: convert input to ff tensor + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.batch_norm( + input_tensor + ) + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = ff(ffconfig, input) diff --git a/examples/python/native/ops/beam_top_k.py b/examples/python/native/ops/beam_top_k.py new file mode 100644 index 0000000000..cb2fdfb3d2 --- /dev/null +++ b/examples/python/native/ops/beam_top_k.py @@ -0,0 +1,58 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_beam_top_k(ffconfig, input_arr: np.ndarray, max_beam_size: int, sorted: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + beam_top_k_output = ffmodel.beam_top_k( + input_tensor, + max_beam_size, + sorted, + name="beam_top_k_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + beam_top_k_output.inline_map(ffmodel, ffconfig) + output_result = beam_top_k_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + max_beam_size_value = 3 + sorted_value = True + + output_result = test_beam_top_k( + ffconfig, + input_data, + max_beam_size=max_beam_size_value, + sorted=sorted_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying beam_top_k:") + print(output_result) diff --git a/examples/python/native/ops/concat.py b/examples/python/native/ops/concat.py new file mode 100644 index 0000000000..0088d7b848 --- /dev/null +++ b/examples/python/native/ops/concat.py @@ -0,0 
+1,43 @@ +# The basis for this test of the 'concatenate' operation is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_concatenate(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.concat([input_tensor1, input_tensor2], axis=1) + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_concatenate(ffconfig, input1, input2) diff --git a/examples/python/native/ops/conv2d.py b/examples/python/native/ops/conv2d.py new file mode 100644 index 0000000000..02b3646aaa --- /dev/null +++ b/examples/python/native/ops/conv2d.py @@ -0,0 +1,45 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def ff(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.conv2d( + input_tensor, + 32, + 3, + 3, + 1, + 1, + 1, + 1, + use_bias=False + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = ff(ffconfig, input) diff --git a/examples/python/native/ops/cos.py b/examples/python/native/ops/cos.py new file mode 100644 index 0000000000..26f6307685 --- /dev/null +++ b/examples/python/native/ops/cos.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_cos(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + cos_output = ffmodel.cos(input_tensor, name="cos_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, 
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + cos_output.inline_map(ffmodel, ffconfig) + cos_result = cos_output.get_array(ffmodel, ffconfig) + + return cos_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + cos_result = test_cos(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying cos function:") + print(cos_result) diff --git a/examples/python/native/ops/dense.py b/examples/python/native/ops/dense.py new file mode 100644 index 0000000000..ec0a3dc65b --- /dev/null +++ b/examples/python/native/ops/dense.py @@ -0,0 +1,38 @@ +# The basis for this test of the 'dense' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_dense(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.dense(input_tensor, 64, activation=ActiMode.AC_MODE_RELU) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + _ = test_dense(ffconfig, input) diff --git a/examples/python/native/ops/divide.py b/examples/python/native/ops/divide.py new file mode 100644 index 0000000000..419bf714ab --- /dev/null +++ b/examples/python/native/ops/divide.py @@ -0,0 +1,48 @@ +# The basis for this test of the 'divide' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_divide(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.divide(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + # Avoid division by zero in input2 + input2 = np.where(input2 == 0, 1e-6, input2) + + _ = test_divide(ffconfig, input1, input2) diff --git a/examples/python/native/ops/dropout.py b/examples/python/native/ops/dropout.py new file mode 100644 index 0000000000..3aa44a5a5b --- /dev/null +++ b/examples/python/native/ops/dropout.py @@ -0,0 +1,49 @@ +# The basis for this test of the 'Dropout' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_dropout(ffconfig, input_arr: np.ndarray, dropout_rate: float = 0.5) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Dropout layer + out = ffmodel.dropout(input_tensor, dropout_rate, 0) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + # You can adjust the dropout rate as needed + dropout_rate_param = 0.5 + + result = test_dropout(ffconfig, input_data, dropout_rate_param) + + print("Input Data:") + print(input_data) + + print("\nResult after Dropout layer:") + print(result) diff --git a/examples/python/native/ops/elu.py b/examples/python/native/ops/elu.py new file mode 100644 index 0000000000..7a6ef1f621 --- /dev/null +++ b/examples/python/native/ops/elu.py @@ -0,0 +1,47 @@ +# The basis for this test of the 'ELU' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
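For reference, the ELU activation exercised by the next example computes `x` for positive inputs and `alpha * (exp(x) - 1)` for negative ones (alpha is 1 in the standard formulation; the FlexFlow layer's own defaults are defined by its API and not restated here). A tiny NumPy sketch of that reference behavior, handy for eyeballing the FlexFlow output:

```python
import numpy as np

def elu_reference(x: np.ndarray, alpha: float = 1.0) -> np.ndarray:
    # Identity for positive inputs, alpha * (exp(x) - 1) for negative ones.
    return np.where(x > 0, x, alpha * (np.exp(x) - 1.0))

print(elu_reference(np.array([-2.0, -0.5, 0.0, 0.5, 2.0], dtype=np.float32)))
```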
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_elu(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply ELU activation + out = ffmodel.elu(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_elu(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after ELU activation:") + print(result) diff --git a/examples/python/native/ops/embedding.py b/examples/python/native/ops/embedding.py new file mode 100644 index 0000000000..34bced3798 --- /dev/null +++ b/examples/python/native/ops/embedding.py @@ -0,0 +1,39 @@ +# The basis for this test of the 'embedding' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_embedding(ffconfig, input_arr: np.ndarray, vocab_size: int, embedding_dim: int) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_INT32) + + out = ffmodel.embedding(input_tensor, vocab_size, embedding_dim, AggrMode.AGGR_MODE_SUM) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + vocab_size = 1000 + embedding_dim = 50 + input = np.random.randint(low=0, high=vocab_size, size=(ffconfig.batch_size, 10), dtype=np.int32) + _ = test_embedding(ffconfig, input, vocab_size, embedding_dim) diff --git a/examples/python/native/ops/exp.py b/examples/python/native/ops/exp.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/flat.py b/examples/python/native/ops/flat.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/gather.py b/examples/python/native/ops/gather.py new file mode 100644 index 0000000000..e13b6e4c75 --- /dev/null +++ b/examples/python/native/ops/gather.py @@ -0,0 +1,60 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_gather(ffconfig, input_arr: np.ndarray, index_arr: np.ndarray, dim: int, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + index_tensor = ffmodel.create_tensor(index_arr.shape, DataType.DT_INT32) + + 
gather_output = ffmodel.gather( + input_tensor, + index_tensor, + dim, + name="gather_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_index = ffmodel.create_data_loader(index_tensor, index_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_index.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_index.next_batch(ffmodel) + + ffmodel.forward() + + gather_output.inline_map(ffmodel, ffconfig) + output_result = gather_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + index_data = np.random.randint(0, 5, size=(ffconfig.batch_size,)).astype(np.int32) + dim_to_gather = 2 # Example dimension to gather along + + output_result = test_gather(ffconfig, input_data, index_data, dim=dim_to_gather) + + print("Input Array:") + print(input_data) + print("\nIndex Array:") + print(index_data) + print(f"\nOutput Array after applying gather along dimension {dim_to_gather}:") + print(output_result) diff --git a/examples/python/native/ops/gelu.py b/examples/python/native/ops/gelu.py new file mode 100644 index 0000000000..84fabd36e1 --- /dev/null +++ b/examples/python/native/ops/gelu.py @@ -0,0 +1,51 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_gelu(ffconfig, input_arr: np.ndarray, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + gelu_output = ffmodel.gelu( + input_tensor, + inplace=inplace, + name="gelu_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + gelu_output.inline_map(ffmodel, ffconfig) + output_result = gelu_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + inplace_flag = True # Example inplace flag + + output_result = test_gelu(ffconfig, input_data, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying gelu activation function (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/identity.py b/examples/python/native/ops/identity.py new file mode 100644 index 0000000000..fbf63e717c --- /dev/null +++ b/examples/python/native/ops/identity.py @@ -0,0 +1,49 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_identity(ffconfig, input_arr: np.ndarray, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + identity_output = ffmodel.identity( + input_tensor, + name="identity_layer" + ) + + 
ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + identity_output.inline_map(ffmodel, ffconfig) + output_result = identity_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + output_result = test_identity(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying identity function:") + print(output_result) diff --git a/examples/python/native/ops/inc_multihead_self_attention.py b/examples/python/native/ops/inc_multihead_self_attention.py new file mode 100644 index 0000000000..dce7bd565d --- /dev/null +++ b/examples/python/native/ops/inc_multihead_self_attention.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multihead_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multihead_self_attention_output = ffmodel.inc_multihead_self_attention( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multihead_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = inc_multihead_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result = test_inc_multihead_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + 
add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multihead_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/inc_multihead_self_attention_verify.py b/examples/python/native/ops/inc_multihead_self_attention_verify.py new file mode 100644 index 0000000000..f6dc8e3933 --- /dev/null +++ b/examples/python/native/ops/inc_multihead_self_attention_verify.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multihead_self_attention_verify( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multihead_self_attention_verify_output = ffmodel.inc_multihead_self_attention_verify( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multihead_self_attention_verify_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multihead_self_attention_verify_output.inline_map(ffmodel, ffconfig) + output_result = inc_multihead_self_attention_verify_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result = test_inc_multihead_self_attention_verify( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multihead_self_attention_verify:") + print(output_result) diff --git a/examples/python/native/ops/inc_multiquery_self_attention.py 
b/examples/python/native/ops/inc_multiquery_self_attention.py new file mode 100644 index 0000000000..33390ab1f6 --- /dev/null +++ b/examples/python/native/ops/inc_multiquery_self_attention.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multiquery_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multiquery_self_attention_output = ffmodel.inc_multiquery_self_attention( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multiquery_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_inc_multiquery_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multiquery_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/inc_multiquery_self_attention_verify.py b/examples/python/native/ops/inc_multiquery_self_attention_verify.py new file mode 100644 index 0000000000..69a76f68bf --- /dev/null +++ b/examples/python/native/ops/inc_multiquery_self_attention_verify.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multiquery_self_attention_verify( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + 
num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multiquery_self_attention_verify_output = ffmodel.inc_multiquery_self_attention_verify( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multiquery_self_attention_verify_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multiquery_self_attention_verify_output.inline_map(ffmodel, ffconfig) + output_result = inc_multiquery_self_attention_verify_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_inc_multiquery_self_attention_verify( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multiquery_self_attention_verify:") + print(output_result) diff --git a/examples/python/native/ops/layer_norm.py b/examples/python/native/ops/layer_norm.py new file mode 100644 index 0000000000..b3cca93d6e --- /dev/null +++ b/examples/python/native/ops/layer_norm.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_layer_norm(ffconfig, input_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + layer_norm_output = ffmodel.layer_norm(input_tensor, axes=axes, elementwise_affine=elementwise_affine, eps=eps, use_bias=use_bias, name="layer_norm_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + 
loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + layer_norm_output.inline_map(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + axes_to_normalize = [1, 2] # Example axes to normalize + + layer_norm_result = test_layer_norm(ffconfig, input_data, axes=axes_to_normalize, elementwise_affine=True, eps=1e-5, use_bias=True) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying layer_norm function along axes {axes_to_normalize}:") + print(layer_norm_result) diff --git a/examples/python/native/ops/max.py b/examples/python/native/ops/max.py new file mode 100644 index 0000000000..bf9c629406 --- /dev/null +++ b/examples/python/native/ops/max.py @@ -0,0 +1,54 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_max(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + max_output = ffmodel.max(input_tensor1, input_tensor2, name="max_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + max_output.inline_map(ffmodel, ffconfig) + max_result = max_output.get_array(ffmodel, ffconfig) + + return max_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + max_result = test_max(ffconfig, input_data1, input_data2) + + print("Input Array 1:") + print(input_data1) + print("\nInput Array 2:") + print(input_data2) + print("\nOutput Array after applying max function:") + print(max_result) diff --git a/examples/python/native/ops/mean.py b/examples/python/native/ops/mean.py new file mode 100644 index 0000000000..df8c3f642e --- /dev/null +++ b/examples/python/native/ops/mean.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_mean(ffconfig, input_arr: np.ndarray, dims: List[int], keepdims: bool = False) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + mean_output = ffmodel.mean(input_tensor, dims=dims, keepdims=keepdims, name="mean_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + 
ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + mean_output.inline_map(ffmodel, ffconfig) + mean_result = mean_output.get_array(ffmodel, ffconfig) + + return mean_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + dims_to_mean = [1, 2] # Example dimensions to take the mean over + + mean_result = test_mean(ffconfig, input_data, dims=dims_to_mean, keepdims=False) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying mean function along dimensions {dims_to_mean}:") + print(mean_result) diff --git a/examples/python/native/ops/min.py b/examples/python/native/ops/min.py new file mode 100644 index 0000000000..df81f4f2d2 --- /dev/null +++ b/examples/python/native/ops/min.py @@ -0,0 +1,54 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_min(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + min_output = ffmodel.min(input_tensor1, input_tensor2, name="min_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + min_output.inline_map(ffmodel, ffconfig) + min_result = min_output.get_array(ffmodel, ffconfig) + + return min_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + min_result = test_min(ffconfig, input_data1, input_data2) + + print("Input Array 1:") + print(input_data1) + print("\nInput Array 2:") + print(input_data2) + print("\nOutput Array after applying min function:") + print(min_result) diff --git a/examples/python/native/ops/multihead_attention.py b/examples/python/native/ops/multihead_attention.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/multiply.py b/examples/python/native/ops/multiply.py new file mode 100644 index 0000000000..fb4f489150 --- /dev/null +++ b/examples/python/native/ops/multiply.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'multiply' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
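# (Editorial sketch, not part of the original patch.) ffmodel.multiply is expected to
# perform elementwise multiplication of its two inputs, so for the same-shaped float32
# arrays used in test_multiply below a NumPy reference is simply:
#
#   expected = input_arr1 * input_arr2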
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_multiply(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.multiply(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_multiply(ffconfig, input1, input2) diff --git a/examples/python/native/ops/pool2d.py b/examples/python/native/ops/pool2d.py new file mode 100644 index 0000000000..b4dc8b219e --- /dev/null +++ b/examples/python/native/ops/pool2d.py @@ -0,0 +1,36 @@ +# AI generated from conv2d example +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_pool2d(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.pool2d(input_tensor, 3, 3, 1, 1, 0, 0, PoolType.POOL_MAX) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_pool2d(ffconfig, input) \ No newline at end of file diff --git a/examples/python/native/ops/pow.py b/examples/python/native/ops/pow.py new file mode 100644 index 0000000000..cf5bbebd80 --- /dev/null +++ b/examples/python/native/ops/pow.py @@ -0,0 +1,46 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_pow(ffconfig, input_arr: np.ndarray, exponent: float) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + pow_output = ffmodel.pow(input_tensor, exponent, name="pow_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = 
ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + pow_output.inline_map(ffmodel, ffconfig) + pow_result = pow_output.get_array(ffmodel, ffconfig) + + return pow_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + exponent_value = 2.0 # Example exponent value + + pow_result = test_pow(ffconfig, input_data, exponent=exponent_value) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying pow function with exponent {exponent_value}:") + print(pow_result) diff --git a/examples/python/native/ops/reduce_sum.py b/examples/python/native/ops/reduce_sum.py new file mode 100644 index 0000000000..7e7b41b799 --- /dev/null +++ b/examples/python/native/ops/reduce_sum.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_reduce_sum(ffconfig, input_arr: np.ndarray, axes: List[int], keepdims: bool = False) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + reduce_sum_output = ffmodel.reduce_sum(input_tensor, axes=axes, keepdims=keepdims, name="reduce_sum_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + reduce_sum_output.inline_map(ffmodel, ffconfig) + reduce_sum_result = reduce_sum_output.get_array(ffmodel, ffconfig) + + return reduce_sum_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + axes_to_reduce = [1, 2] # Example axes to reduce + + reduce_sum_result = test_reduce_sum(ffconfig, input_data, axes=axes_to_reduce, keepdims=False) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying reduce_sum along axes {axes_to_reduce}:") + print(reduce_sum_result) diff --git a/examples/python/native/ops/relu.py b/examples/python/native/ops/relu.py new file mode 100644 index 0000000000..d855b27164 --- /dev/null +++ b/examples/python/native/ops/relu.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'ReLU' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
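# (Editorial sketch, not part of the original patch.) ReLU clamps negative values to zero
# elementwise, so the array returned by test_relu below should match the NumPy reference:
#
#   expected = np.maximum(input_arr, 0.0)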
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_relu(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply ReLU activation + out = ffmodel.relu(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_relu(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after ReLU activation:") + print(result) diff --git a/examples/python/native/ops/reshape.py b/examples/python/native/ops/reshape.py new file mode 100644 index 0000000000..348d6bd935 --- /dev/null +++ b/examples/python/native/ops/reshape.py @@ -0,0 +1,41 @@ +# The basis for this test of the 'reshape' operation is generated by ChatGPT using the manually created conv2d.py as a template. + +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_reshape(ffconfig, input_arr: np.ndarray, target_shape: List[int]) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.reshape(input_tensor, target_shape) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + target_shape = [ffconfig.batch_size, 500] + + _ = test_reshape(ffconfig, input, target_shape) diff --git a/examples/python/native/ops/residual_layer_norm.py b/examples/python/native/ops/residual_layer_norm.py new file mode 100644 index 0000000000..e12f2e53d9 --- /dev/null +++ b/examples/python/native/ops/residual_layer_norm.py @@ -0,0 +1,93 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual1_arr: np.ndarray, residual2_arr: np.ndarray, use_two_residuals: bool, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + residual1_tensor = ffmodel.create_tensor(residual1_arr.shape, DataType.DT_FLOAT) + residual2_tensor = ffmodel.create_tensor(residual2_arr.shape, DataType.DT_FLOAT) + + output_tensor, 
layer_norm_output = ffmodel.residual_layer_norm( + input_tensor, + residual1_tensor, + residual2_tensor if use_two_residuals else None, + use_two_residuals, + axes=axes, + elementwise_affine=elementwise_affine, + eps=eps, + use_bias=use_bias, + name="residual_layer_norm_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_residual1 = ffmodel.create_data_loader(residual1_tensor, residual1_arr) + dataloader_residual2 = ffmodel.create_data_loader(residual2_tensor, residual2_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_residual1.reset() + if use_two_residuals: + dataloader_residual2.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_residual1.next_batch(ffmodel) + if use_two_residuals: + dataloader_residual2.next_batch(ffmodel) + + ffmodel.forward() + + output_tensor.inline_map(ffmodel, ffconfig) + layer_norm_output.inline_map(ffmodel, ffconfig) + output_result = output_tensor.get_array(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return output_result, layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual2_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + use_two_residuals_flag = True # Example flag + + axes_to_normalize = [1, 2] # Example axes to normalize + + output_result, layer_norm_result = test_residual_layer_norm( + ffconfig, + input_data, + residual1_data, + residual2_data, + use_two_residuals_flag, + axes=axes_to_normalize, + elementwise_affine=True, + eps=1e-5, + use_bias=True + ) + + print("Input Array:") + print(input_data) + print("\nResidual1 Array:") + print(residual1_data) + if use_two_residuals_flag: + print("\nResidual2 Array:") + print(residual2_data) + print(f"\nOutput Array after applying residual_layer_norm along axes {axes_to_normalize} with use_two_residuals={use_two_residuals_flag}:") + print(output_result) + print("\nLayer Norm Result:") + print(layer_norm_result) diff --git a/examples/python/native/ops/residual_rms_norm.py b/examples/python/native/ops/residual_rms_norm.py new file mode 100644 index 0000000000..9027dffada --- /dev/null +++ b/examples/python/native/ops/residual_rms_norm.py @@ -0,0 +1,80 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_residual_rms_norm( + ffconfig, + input1_arr: np.ndarray, + input2_arr: np.ndarray, + eps: float, + dim: int, + name=None, +): + ffmodel = FFModel(ffconfig) + + input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT) + input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT) + + residual_rms_norm_output1, residual_rms_norm_output2 = ffmodel.residual_rms_norm( + input1_tensor, + input2_tensor, + eps, + dim, + name="residual_rms_norm_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = 
ffmodel.create_data_loader(input1_tensor, input1_arr) + dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + residual_rms_norm_output1.inline_map(ffmodel, ffconfig) + output_result1 = residual_rms_norm_output1.get_array(ffmodel, ffconfig) + + residual_rms_norm_output2.inline_map(ffmodel, ffconfig) + output_result2 = residual_rms_norm_output2.get_array(ffmodel, ffconfig) + + return output_result1, output_result2 + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + input2_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + eps_value = 1e-6 + dim_value = 1 # Example value for dim + + output_result1, output_result2 = test_residual_rms_norm( + ffconfig, + input1_data, + input2_data, + eps=eps_value, + dim=dim_value, + ) + + print("Input Array 1:") + print(input1_data) + print("\nInput Array 2:") + print(input2_data) + print("\nOutput Array 1 after applying residual_rms_norm:") + print(output_result1) + print("\nOutput Array 2 after applying residual_rms_norm:") + print(output_result2) diff --git a/examples/python/native/ops/reverse.py b/examples/python/native/ops/reverse.py new file mode 100644 index 0000000000..25394d4b9a --- /dev/null +++ b/examples/python/native/ops/reverse.py @@ -0,0 +1,37 @@ +# The basis for this test of the 'reverse' operation is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_reverse(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.reverse(input_tensor, axis=2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_reverse(ffconfig, input) diff --git a/examples/python/native/ops/rms_norm.py b/examples/python/native/ops/rms_norm.py new file mode 100644 index 0000000000..3983d7f891 --- /dev/null +++ b/examples/python/native/ops/rms_norm.py @@ -0,0 +1,64 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_rms_norm( + ffconfig, + input_arr: np.ndarray, + eps: float, + dim: int, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + rms_norm_output = ffmodel.rms_norm( + input_tensor, + eps, + dim, + name="rms_norm_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR, 
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + rms_norm_output.inline_map(ffmodel, ffconfig) + output_result = rms_norm_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + eps_value = 1e-6 + dim_value = 1 # Example value for dim + + output_result = test_rms_norm( + ffconfig, + input_data, + eps=eps_value, + dim=dim_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying rms_norm:") + print(output_result) diff --git a/examples/python/native/ops/rsqrt.py b/examples/python/native/ops/rsqrt.py new file mode 100644 index 0000000000..3d9ab65449 --- /dev/null +++ b/examples/python/native/ops/rsqrt.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_rsqrt(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + rsqrt_output = ffmodel.rsqrt(input_tensor, name="rsqrt_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + rsqrt_output.inline_map(ffmodel, ffconfig) + rsqrt_result = rsqrt_output.get_array(ffmodel, ffconfig) + + return rsqrt_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + rsqrt_result = test_rsqrt(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying rsqrt function:") + print(rsqrt_result) diff --git a/examples/python/native/ops/sampling.py b/examples/python/native/ops/sampling.py new file mode 100644 index 0000000000..2219f09eff --- /dev/null +++ b/examples/python/native/ops/sampling.py @@ -0,0 +1,55 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sampling(ffconfig, input_arr: np.ndarray, top_p: float, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + sampling_output = ffmodel.sampling( + input_tensor, + top_p, + name="sampling_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_MEAN_SQUARED_ERROR, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + sampling_output.inline_map(ffmodel, ffconfig) + output_result = sampling_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + top_p_value = 0.8 + + 
output_result = test_sampling( + ffconfig, + input_data, + top_p=top_p_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying sampling:") + print(output_result) diff --git a/examples/python/native/ops/scalar_add.py b/examples/python/native/ops/scalar_add.py new file mode 100644 index 0000000000..48a316ea8a --- /dev/null +++ b/examples/python/native/ops/scalar_add.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_add(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_add_output = ffmodel.scalar_add( + input_tensor, + scalar, + inplace=inplace, + name="scalar_add_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_add_output.inline_map(ffmodel, ffconfig) + output_result = scalar_add_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_add(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar addition with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_multiply.py b/examples/python/native/ops/scalar_multiply.py new file mode 100644 index 0000000000..ebae5cce01 --- /dev/null +++ b/examples/python/native/ops/scalar_multiply.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_multiply(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_multiply_output = ffmodel.scalar_multiply( + input_tensor, + scalar, + inplace=inplace, + name="scalar_multiply_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_multiply_output.inline_map(ffmodel, ffconfig) + output_result = scalar_multiply_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_multiply(ffconfig, input_data, 
scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar multiplication with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_sub.py b/examples/python/native/ops/scalar_sub.py new file mode 100644 index 0000000000..2dc467b573 --- /dev/null +++ b/examples/python/native/ops/scalar_sub.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_sub(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_sub_output = ffmodel.scalar_sub( + input_tensor, + scalar, + inplace=inplace, + name="scalar_sub_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_sub_output.inline_map(ffmodel, ffconfig) + output_result = scalar_sub_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_sub(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar subtraction with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_true_divide.py b/examples/python/native/ops/scalar_true_divide.py new file mode 100644 index 0000000000..f1b64df506 --- /dev/null +++ b/examples/python/native/ops/scalar_true_divide.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_true_divide(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_true_divide_output = ffmodel.scalar_true_divide( + input_tensor, + scalar, + inplace=inplace, + name="scalar_true_divide_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_true_divide_output.inline_map(ffmodel, ffconfig) + output_result = scalar_true_divide_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + 
output_result = test_scalar_true_divide(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar true division with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/sigmoid.py b/examples/python/native/ops/sigmoid.py new file mode 100644 index 0000000000..0fbe21df45 --- /dev/null +++ b/examples/python/native/ops/sigmoid.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'Sigmoid' activation function is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_sigmoid(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Sigmoid activation + out = ffmodel.sigmoid(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_sigmoid(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after Sigmoid activation:") + print(result) diff --git a/examples/python/native/ops/sigmoid_silu_multi.py b/examples/python/native/ops/sigmoid_silu_multi.py new file mode 100644 index 0000000000..cecc3e102e --- /dev/null +++ b/examples/python/native/ops/sigmoid_silu_multi.py @@ -0,0 +1,58 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sigmoid_silu_multi(ffconfig, input1_arr: np.ndarray, input2_arr: np.ndarray, name=None): + ffmodel = FFModel(ffconfig) + + input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT) + input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT) + + sigmoid_silu_multi_output = ffmodel.sigmoid_silu_multi( + input1_tensor, + input2_tensor, + name="sigmoid_silu_multi_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input1_tensor, input1_arr) + dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + sigmoid_silu_multi_output.inline_map(ffmodel, ffconfig) + output_result = sigmoid_silu_multi_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2_data = np.random.randn(ffconfig.batch_size, 5, 10, 
10).astype(np.float32) + + output_result = test_sigmoid_silu_multi(ffconfig, input1_data, input2_data) + + print("Input1 Array:") + print(input1_data) + print("\nInput2 Array:") + print(input2_data) + print("\nOutput Array after applying sigmoid_silu_multi:") + print(output_result) diff --git a/examples/python/native/ops/sin.py b/examples/python/native/ops/sin.py new file mode 100644 index 0000000000..4b60a4e1d4 --- /dev/null +++ b/examples/python/native/ops/sin.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sin(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + sin_output = ffmodel.sin(input_tensor, name="sin_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + sin_output.inline_map(ffmodel, ffconfig) + sin_result = sin_output.get_array(ffmodel, ffconfig) + + return sin_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + sin_result = test_sin(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying sin function:") + print(sin_result) diff --git a/examples/python/native/ops/softmax.py b/examples/python/native/ops/softmax.py new file mode 100644 index 0000000000..b5481bcc80 --- /dev/null +++ b/examples/python/native/ops/softmax.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'Softmax' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_softmax(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Softmax activation + out = ffmodel.softmax(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10).astype(np.float32) + + result = test_softmax(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after Softmax activation:") + print(result) diff --git a/examples/python/native/ops/spec_inc_multihead_self_attention.py b/examples/python/native/ops/spec_inc_multihead_self_attention.py new file mode 100644 index 0000000000..bd1aaa189b --- /dev/null +++ b/examples/python/native/ops/spec_inc_multihead_self_attention.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_spec_inc_multihead_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + spec_inc_multihead_self_attention_output = ffmodel.spec_inc_multihead_self_attention( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="spec_inc_multihead_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + spec_inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = spec_inc_multihead_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result 
= test_spec_inc_multihead_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying spec_inc_multihead_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/spec_inc_multiquery_self_attention.py b/examples/python/native/ops/spec_inc_multiquery_self_attention.py new file mode 100644 index 0000000000..0b731c99e0 --- /dev/null +++ b/examples/python/native/ops/spec_inc_multiquery_self_attention.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_spec_inc_multiquery_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + spec_inc_multiquery_self_attention_output = ffmodel.spec_inc_multiquery_self_attention( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="spec_inc_multiquery_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + spec_inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = spec_inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_spec_inc_multiquery_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for 
kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying spec_inc_multiquery_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/split.py b/examples/python/native/ops/split.py new file mode 100644 index 0000000000..d03a52a769 --- /dev/null +++ b/examples/python/native/ops/split.py @@ -0,0 +1,47 @@ +# The basis for this test of the 'split' operation is generated by ChatGPT using the manually created conv2d.py as a template. + +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_split(ffconfig, input_arr: np.ndarray) -> List[flexflow.core.Tensor]: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out1, out2 = ffmodel.split(input_tensor, 2, axis=1) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out1.inline_map(ffmodel, ffconfig) + out2.inline_map(ffmodel, ffconfig) + + return [out1.get_array(ffmodel, ffconfig), out2.get_array(ffmodel, ffconfig)] + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 10, 10, 10).astype(np.float32) + output_list = test_split(ffconfig, input) + + print("Output Tensor 1:") + print(output_list[0]) + + print("\nOutput Tensor 2:") + print(output_list[1]) diff --git a/examples/python/native/ops/subtract.py b/examples/python/native/ops/subtract.py new file mode 100644 index 0000000000..5f829cbae1 --- /dev/null +++ b/examples/python/native/ops/subtract.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'subtract' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_subtract(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.subtract(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_subtract(ffconfig, input1, input2) diff --git a/examples/python/native/ops/tanh.py b/examples/python/native/ops/tanh.py new file mode 100644 index 0000000000..ba4ba7d6ff --- /dev/null +++ b/examples/python/native/ops/tanh.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'tanh' activation function is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_tanh(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply tanh activation + out = ffmodel.tanh(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_tanh(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after tanh activation:") + print(result) diff --git a/examples/python/native/ops/transpose.py b/examples/python/native/ops/transpose.py new file mode 100644 index 0000000000..6f514d660c --- /dev/null +++ b/examples/python/native/ops/transpose.py @@ -0,0 +1,38 @@ +# The basis for this test of the 'transpose' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_transpose(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.transpose(input_tensor, [ffconfig.batch_size, 10, 5, 10]) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_transpose(ffconfig, input) From a83effedd6e0185a7e8225f445c0aaba840c1aca Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 20 Jun 2024 04:08:29 +0000 Subject: [PATCH 334/344] add code to keep runners registered --- .github/workflows/docker-build.yml | 41 ++++++++++++++++++++---------- .github/workflows/gpu-ci.yml | 24 +++++++++++++++++ 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index d16179434b..eeaab0e0af 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -9,9 +9,9 @@ on: branches: - "inference" - "master" - # schedule: - # # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated - # - cron: "0 8 * * 0" + schedule: + # At 00:00 on day-of-month 1, 14, and 28. 
+ - cron: "0 0 1,14,28 * *" workflow_dispatch: # Cancel outdated workflows if they are still running @@ -58,13 +58,28 @@ jobs: - name: Check availability of flexflow modules in Python run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - + + keep-runner-registered: + name: Keep runner alive + if: ${{ github.event_name == 'schedule' }} + runs-on: [self-hosted, rocm_builder] + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + needs: rocm-builder-start + steps: + - name: Keep alive + run: | + echo "Keep self-hosted runner registered with Github" + sleep 10m docker-build-and-publish-rocm: name: Build and Deploy FlexFlow Docker Containers (ROCm backend) needs: rocm-builder-start runs-on: [self-hosted, rocm_builder] - if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} strategy: matrix: hip_version: ["5.3", "5.4", "5.5", "5.6"] @@ -106,19 +121,19 @@ jobs: cuda_version: ${{ matrix.cuda_version }} steps: - name: Checkout Git Repository - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} uses: actions/checkout@v3 with: submodules: recursive - name: Free additional space on runner - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} run: .github/workflows/helpers/free_space_on_runner.sh - name: Build Docker container - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} build_needed: ${{ matrix.cuda_version == '12.0' }} run: | # On push to inference, build for all compatible architectures, so that we can publish @@ -133,11 +148,11 @@ jobs: fi - name: Check availability of flexflow modules in Python - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} run: docker run --entrypoint /bin/bash 
flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - name: Publish Docker environment image (on push to inference) - if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} run: | @@ -145,7 +160,7 @@ jobs: ./docker/publish.sh flexflow rocm-builder-stop: - needs: docker-build-and-publish-rocm + needs: [docker-build-and-publish-rocm, keep-runner-registered] if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} runs-on: ubuntu-latest name: Stop the AWS instance we used to build the ROCM Docker images @@ -166,7 +181,7 @@ jobs: name: Notify Slack in case of failure runs-on: ubuntu-20.04 needs: [docker-build-cuda, docker-build-and-publish-rocm] - if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }} + if: ${{ failure() && github.event_name == 'workflow_dispatch' && github.repository_owner == 'flexflow' }} steps: - name: Send Slack message env: diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 7bdb6805a8..c7d0cd72cb 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -1,5 +1,7 @@ name: "gpu-ci" on: + schedule: + - cron: "0 0 1,14,28 * *" # At 00:00 on day-of-month 1, 14, and 28. 
push: branches: - "inference" @@ -43,8 +45,28 @@ jobs: pip3 install pygithub python3 .github/workflows/helpers/gpu_ci_helper.py + keep-runner-registered: + name: Keep runner alive + if: ${{ github.event_name == 'schedule' }} + runs-on: [self-hosted, gpu] + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + needs: gpu-ci-concierge + container: + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + options: --gpus all --shm-size=8192m + steps: + - name: Keep alive + run: | + echo "Keep self-hosted runner registered with Github" + sleep 10m + python-interface-check: name: Check Python Interface + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] defaults: run: @@ -119,6 +141,7 @@ jobs: inference-tests: name: Inference Tests + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] defaults: run: @@ -195,6 +218,7 @@ jobs: training-tests: name: Training Tests + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] # skip this time-consuming test for PRs to the inference branch # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} From 4f82aaed6317cef0a2587848a3b6d57f1d709381 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 10 Jul 2024 23:15:28 -0400 Subject: [PATCH 335/344] fix docker --- docker/flexflow-environment/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 6ca337f58d..cef619ad68 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -37,6 +37,7 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ chmod +x ~/${MINICONDA_SCRIPT_NAME} && \ bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \ rm ~/${MINICONDA_SCRIPT_NAME} && \ + /opt/conda/bin/conda config --set solver classic && \ /opt/conda/bin/conda upgrade --all && \ /opt/conda/bin/conda install conda-build conda-verify && \ /opt/conda/bin/conda clean -ya From 25fb40772f587892510bfe0ca296ae54768ff35c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 11 Jul 2024 15:16:40 -0400 Subject: [PATCH 336/344] [Tokenizer] update tokenizers-cpp repo --- deps/tokenizers-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/tokenizers-cpp b/deps/tokenizers-cpp index 4f42c9fa74..c0fab1e14a 160000 --- a/deps/tokenizers-cpp +++ b/deps/tokenizers-cpp @@ -1 +1 @@ -Subproject commit 4f42c9fa74946d70af86671a3804b6f2433e5dac +Subproject commit c0fab1e14a9421c1501acee5b7703e5dafa60479 From 6a1a1886909fc864aadfb10823077f94fe03b72e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 3 Aug 2024 08:31:37 -0700 Subject: [PATCH 337/344] minor bug fix (#1456) --- .../ops/kernels/inc_multihead_self_attention_kernels.h | 3 ++- src/ops/attention.cu | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 9bf2f581e2..26dcf12425 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -56,7 +56,8 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int num_heads, int num_kv_heads, bool scaling_query, - float scaling_factor); + float scaling_factor, + int hidden_size); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) template diff --git a/src/ops/attention.cu 
b/src/ops/attention.cu index 9b8b90da70..18fc810aed 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -206,7 +206,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, checkCUDNN(cudnnCreateSeqDataDescriptor(&oDesc)); // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); - cudnnAttnQueryMap_t attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; + unsigned attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; // Assume no beam search for now int maxBeamSize = 1; // printf("batchSize(%d) qSize(%d) kSize(%d) vSize(%d) qProjSize(%d) From 9784b5c6516bafe272fc6555daaa9b867a5eacfa Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 12 Aug 2024 11:02:49 -0700 Subject: [PATCH 338/344] update legion version (#1307) * update legion version * legion version update * update legion version --- CMakeLists.txt | 2 +- deps/legion | 2 +- examples/cpp/AlexNet/alexnet.cc | 2 +- examples/cpp/DLRM/dlrm.cc | 2 +- examples/cpp/InceptionV3/inception.cc | 2 +- examples/cpp/ResNet/resnet.cc | 2 +- examples/cpp/Transformer/transformer.cc | 2 +- examples/cpp/XDL/xdl.cc | 2 +- examples/cpp/candle_uno/candle_uno.cc | 2 +- examples/cpp/mixture_of_experts/moe.cc | 2 +- examples/cpp/resnext50/resnext.cc | 2 +- examples/cpp/split_test/split_test.cc | 2 +- examples/cpp/split_test_2/split_test_2.cc | 2 +- include/flexflow/graph.h | 2 +- include/flexflow/operator.h | 4 +++- include/flexflow/utils/recursive_logger.h | 4 ++-- inference/incr_decoding/incr_decoding.cc | 2 +- inference/spec_infer/spec_infer.cc | 2 +- src/mapper/mapper.cc | 7 ++++++- src/ops/beam_topk.cpp | 2 +- src/ops/beam_topk.cu | 2 +- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cc | 2 +- src/runtime/batch_config.cc | 2 +- src/runtime/beam_search_batch_config.cc | 2 +- src/runtime/graph.cc | 4 ++-- src/runtime/inference_manager.cc | 4 ++-- src/runtime/model.cc | 6 ++++-- src/runtime/optimizer_kernel.cpp | 4 ++-- src/runtime/optimizer_kernel.cu | 2 +- src/runtime/request_manager.cc | 2 +- src/runtime/simulator.cc | 8 ++++---- src/runtime/substitution.cc | 4 ++-- src/runtime/tree_verify_batch_config.cc | 2 +- tests/ops/batch_matmul_test.cc | 2 +- tests/ops/concat_test.cc | 2 +- tests/ops/flat_test.cc | 2 +- tests/ops/linear_test.cc | 2 +- tests/ops/reshape_test.cc | 2 +- tests/ops/tanh_test.cc | 2 +- tests/ops/transpose_test.cc | 2 +- 41 files changed, 59 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..7079fdadb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -425,7 +425,7 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. 
When building from pip, we install the FF_HOME/python/flexflow_python script instead. diff --git a/deps/legion b/deps/legion index 24e8c45234..02eb1010ca 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 +Subproject commit 02eb1010ca9eb449d345a0db97eab17efb0e5af0 diff --git a/examples/cpp/AlexNet/alexnet.cc b/examples/cpp/AlexNet/alexnet.cc index 128496eab1..3507882329 100644 --- a/examples/cpp/AlexNet/alexnet.cc +++ b/examples/cpp/AlexNet/alexnet.cc @@ -26,7 +26,7 @@ using FlexFlow::ParallelTensor; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("AlexNet"); +Legion::Logger log_app("AlexNet"); void parse_input_args(char **argv, int argc, AlexNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/DLRM/dlrm.cc b/examples/cpp/DLRM/dlrm.cc index 7dc49215b3..d7dc167557 100644 --- a/examples/cpp/DLRM/dlrm.cc +++ b/examples/cpp/DLRM/dlrm.cc @@ -19,7 +19,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("DLRM"); +Legion::Logger log_app("DLRM"); void parse_input_args(char **argv, int argc, DLRMConfig &apConfig); diff --git a/examples/cpp/InceptionV3/inception.cc b/examples/cpp/InceptionV3/inception.cc index b2070cc52d..6d0fa7ee53 100644 --- a/examples/cpp/InceptionV3/inception.cc +++ b/examples/cpp/InceptionV3/inception.cc @@ -21,7 +21,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("Inceptionv3"); +Legion::Logger log_app("Inceptionv3"); Tensor InceptionA(FFModel &ff, Tensor input, int pool_features) { Tensor t1 = input; diff --git a/examples/cpp/ResNet/resnet.cc b/examples/cpp/ResNet/resnet.cc index 455eb743ae..49ce934a6a 100644 --- a/examples/cpp/ResNet/resnet.cc +++ b/examples/cpp/ResNet/resnet.cc @@ -24,7 +24,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("ResNet"); +Legion::Logger log_app("ResNet"); void parse_input_args(char **argv, int argc, ResNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/Transformer/transformer.cc b/examples/cpp/Transformer/transformer.cc index d61a63cd03..b04093b0a9 100644 --- a/examples/cpp/Transformer/transformer.cc +++ b/examples/cpp/Transformer/transformer.cc @@ -17,7 +17,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("Transformer"); +Legion::Logger log_app("Transformer"); Tensor create_emb(FFModel *model, Tensor const &input, diff --git a/examples/cpp/XDL/xdl.cc b/examples/cpp/XDL/xdl.cc index 2e6c3cec98..a2272f36e5 100644 --- a/examples/cpp/XDL/xdl.cc +++ b/examples/cpp/XDL/xdl.cc @@ -18,7 +18,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("XDL"); +Legion::Logger log_app("XDL"); void parse_input_args(char **argv, int argc, XDLConfig &apConfig); diff --git a/examples/cpp/candle_uno/candle_uno.cc b/examples/cpp/candle_uno/candle_uno.cc index 779b8e9c14..e9f4bf876a 100644 --- a/examples/cpp/candle_uno/candle_uno.cc +++ b/examples/cpp/candle_uno/candle_uno.cc @@ -21,7 +21,7 @@ using namespace Legion; using namespace std; -LegionRuntime::Logger::Category log_app("Candle_Uno"); +Legion::Logger log_app("Candle_Uno"); void parse_input_args(char **argv, int argc, CandleConfig &apConfig); diff --git a/examples/cpp/mixture_of_experts/moe.cc b/examples/cpp/mixture_of_experts/moe.cc index a707310885..a25f94abd9 100644 --- a/examples/cpp/mixture_of_experts/moe.cc +++ b/examples/cpp/mixture_of_experts/moe.cc @@ -20,7 +20,7 
@@ using namespace Legion; -LegionRuntime::Logger::Category log_app("MoE"); +Legion::Logger log_app("MoE"); void parse_input_args(char **argv, int argc, MoeConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/resnext50/resnext.cc b/examples/cpp/resnext50/resnext.cc index 3c28ca27b8..9b71b37cce 100644 --- a/examples/cpp/resnext50/resnext.cc +++ b/examples/cpp/resnext50/resnext.cc @@ -7,7 +7,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("resnext"); +Legion::Logger log_app("resnext"); Tensor resnext_block(FFModel &ff, Tensor input, diff --git a/examples/cpp/split_test/split_test.cc b/examples/cpp/split_test/split_test.cc index 97b98c3214..ac9d516a59 100644 --- a/examples/cpp/split_test/split_test.cc +++ b/examples/cpp/split_test/split_test.cc @@ -3,7 +3,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("split_test"); +Legion::Logger log_app("split_test"); void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, diff --git a/examples/cpp/split_test_2/split_test_2.cc b/examples/cpp/split_test_2/split_test_2.cc index 69385d14cb..fef078adbc 100644 --- a/examples/cpp/split_test_2/split_test_2.cc +++ b/examples/cpp/split_test_2/split_test_2.cc @@ -9,7 +9,7 @@ using FlexFlow::PCG::Graph; using FlexFlow::PCG::GraphSearchHelper; using FlexFlow::PCG::Node; -LegionRuntime::Logger::Category log_app("split_test_2"); +Legion::Logger log_app("split_test_2"); void top_level_task(Task const *task, std::vector const ®ions, diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 2e0cf1ca4b..9dc6572593 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -24,7 +24,7 @@ #include "legion/legion_utilities.h" #include -extern LegionRuntime::Logger::Category log_dp; +extern Legion::Logger log_dp; namespace FlexFlow::PCG { diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b19bdb82f..311699d926 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -19,7 +19,7 @@ namespace FlexFlow { -extern LegionRuntime::Logger::Category log_measure; +extern Legion::Logger log_measure; class OpMeta; class Simulator; @@ -233,6 +233,8 @@ class Op { std::vector const &, MachineView const *mv = nullptr) { assert(false); + Legion::FutureMap empty_map; + return empty_map; }; virtual void print_layer(FFModel const &model) = 0; template diff --git a/include/flexflow/utils/recursive_logger.h b/include/flexflow/utils/recursive_logger.h index 2c43b42309..d073f58f3e 100644 --- a/include/flexflow/utils/recursive_logger.h +++ b/include/flexflow/utils/recursive_logger.h @@ -26,7 +26,7 @@ class DepthTag { class RecursiveLogger { public: - /* RecursiveLogger(LegionRuntime::Logger::Category const &); */ + /* RecursiveLogger(Legion::Logger const &); */ RecursiveLogger(std::string const &category_name); Realm::LoggerMessage info(); @@ -42,7 +42,7 @@ class RecursiveLogger { void print_prefix(Realm::LoggerMessage &) const; - LegionRuntime::Logger::Category logger; + Legion::Logger logger; }; }; // namespace FlexFlow diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index aae7256ffe..ec3dda3158 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -28,7 +28,7 @@ using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger 
log_app("llama"); struct FilePaths { std::string cache_folder_path; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index f7edfd7696..60233ac8d1 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -26,7 +26,7 @@ using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger log_app("llama"); struct FilePaths { std::string cache_folder_path; diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index c293aecb19..4413d516ac 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -20,7 +20,7 @@ namespace FlexFlow { using namespace Legion; using namespace Mapping; -LegionRuntime::Logger::Category log_ff_mapper("Mapper"); +Legion::Logger log_ff_mapper("Mapper"); FFShardingFunctor::FFShardingFunctor(int _gpus_per_node, int _cpus_per_node, @@ -296,6 +296,7 @@ void FFMapper::select_task_options(const MapperContext ctx, // control replicate top level task if (enable_control_replication) { output.replicate = true; + output.map_locally = false; } return; } @@ -560,6 +561,10 @@ void FFMapper::map_task(const MapperContext ctx, assert(output.target_procs[i].address_space() == node_id); } } + if (input.shard_processor.exists()) { + output.target_procs = std::vector{input.shard_processor}; + } + // Find instances that still need to be mapped std::vector> missing_fields(task.regions.size()); runtime->filter_instances(ctx, diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 18534455a0..8545bea7cb 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -25,7 +25,7 @@ using Legion::coord_t; enum class HeapType { kMinHeap, kMaxHeap }; enum class PreferIndices { kLower, kHigher }; -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); +Legion::Logger log_beam_topk("BeamTopK"); template struct Entry { diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index a958786be3..c24bdf7c74 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -25,7 +25,7 @@ using Legion::coord_t; enum class HeapType { kMinHeap, kMaxHeap }; enum class PreferIndices { kLower, kHigher }; -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); +Legion::Logger log_beam_topk("BeamTopK"); template struct Entry { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 7aa3503770..8688585788 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_inc_mha("IncrementalMHA"); +Legion::Logger log_inc_mha("IncrementalMHA"); bool IncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d0efb01d54..9b8c88420d 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_tree_verify("TreeVerifyIncMHA"); +Legion::Logger log_tree_verify("TreeVerifyIncMHA"); bool TreeIncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index bd96dbb141..7989b0799e 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc 
@@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_bc("BatchConfig"); +Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index ff7bf1a819..0509c23afe 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -24,7 +24,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_beam_bc("BeamSearchBatchConfig"); +Legion::Logger log_beam_bc("BeamSearchBatchConfig"); BeamSearchBatchConfig::BeamSearchBatchConfig() : BatchConfig() { this->beam_width = DEFAULT_BEAM_WIDTH; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index f8e8240ccf..cf75235ae7 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -66,8 +66,8 @@ namespace FlexFlow::PCG { using namespace Legion; using FlexFlow::MachineView; -LegionRuntime::Logger::Category log_graph("graph"); -LegionRuntime::Logger::Category log_simplify("graph_simplify"); +Legion::Logger log_graph("graph"); +Legion::Logger log_simplify("graph_simplify"); const Node Node::INVALID_NODE = Node(); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 2a94df8b4d..3d299aeedd 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -25,8 +25,8 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); -LegionRuntime::Logger::Category log_offload("Offloading"); +Legion::Logger log_inf_mgr("InferenceManager"); +Legion::Logger log_offload("Offloading"); InferenceManager::InferenceManager() {} diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 1fa281777a..5cad628743 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -82,8 +82,8 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_model("Model"); -LegionRuntime::Logger::Category log_measure("measure"); +Legion::Logger log_model("Model"); +Legion::Logger log_measure("measure"); Op::Op(FFModel &model, OperatorType otype, @@ -6748,6 +6748,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(SGD_UPD_NCCL_TASK_ID, "SGD NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "SGD NCCL Update Task"); @@ -6898,6 +6899,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Init Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Init Communicators Task"); diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index e71adc87a8..59efaf5256 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, @@ -247,4 +247,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, } #endif -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index 5f654fbb5b..df37e3b135 100644 --- a/src/runtime/optimizer_kernel.cu +++ 
b/src/runtime/optimizer_kernel.cu @@ -20,7 +20,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 16513e918a..d21285eef2 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -29,7 +29,7 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; -LegionRuntime::Logger::Category log_req_mgr("RequestManager"); +Legion::Logger log_req_mgr("RequestManager"); std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc index d943376416..b71af0d47e 100644 --- a/src/runtime/simulator.cc +++ b/src/runtime/simulator.cc @@ -31,10 +31,10 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_sim("sim"); -LegionRuntime::Logger::Category log_ps_sim("ps_sim"); -LegionRuntime::Logger::Category log_xfer_sim("xfer_sim"); -LegionRuntime::Logger::Category log_xfer_est("xfer_est"); +Legion::Logger log_sim("sim"); +Legion::Logger log_ps_sim("ps_sim"); +Legion::Logger log_xfer_sim("xfer_sim"); +Legion::Logger log_xfer_est("xfer_est"); // template class std::map; // for debugging in gdb // template class std::map; // for debugging in gdb diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index c0804d6e19..b86964049d 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -54,8 +54,8 @@ namespace FlexFlow::PCG { using namespace Legion; -LegionRuntime::Logger::Category log_xfers("xfers"); -LegionRuntime::Logger::Category log_xfer_matches("xfer_matches"); +Legion::Logger log_xfers("xfers"); +Legion::Logger log_xfer_matches("xfer_matches"); const TensorX TensorX::NO_TX = TensorX(); diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 841c735f59..49d42bb6dd 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_tree_bc("TreeVerifyBatchConfig"); +Legion::Logger log_tree_bc("TreeVerifyBatchConfig"); TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {} diff --git a/tests/ops/batch_matmul_test.cc b/tests/ops/batch_matmul_test.cc index 7931f44129..f61048febf 100644 --- a/tests/ops/batch_matmul_test.cc +++ b/tests/ops/batch_matmul_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("bmm_test"); +Legion::Logger log_app("bmm_test"); struct BMMTestMeta { int m, k, n, d; diff --git a/tests/ops/concat_test.cc b/tests/ops/concat_test.cc index c67b718e0e..b0489d1adb 100644 --- a/tests/ops/concat_test.cc +++ b/tests/ops/concat_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("concat_test"); +Legion::Logger log_app("concat_test"); struct ConcatTestMeta { int batch_size, i_dim, num_channels, projected_num_channels, diff --git a/tests/ops/flat_test.cc b/tests/ops/flat_test.cc index 428893a0dc..61de83b6b0 100644 --- a/tests/ops/flat_test.cc +++ b/tests/ops/flat_test.cc @@ -7,7 +7,7 @@ #include using namespace Legion; -LegionRuntime::Logger::Category log_app("Flat_test"); +Legion::Logger log_app("Flat_test"); struct FlatTestMeta { int i_dim, o_dim; diff --git a/tests/ops/linear_test.cc 
b/tests/ops/linear_test.cc index 5b65de3a56..7c84ad1078 100644 --- a/tests/ops/linear_test.cc +++ b/tests/ops/linear_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("linear_test"); +Legion::Logger log_app("linear_test"); struct LinearTestMeta { int batch_size, i_dim, num_channels, dense_projection_o_dim, diff --git a/tests/ops/reshape_test.cc b/tests/ops/reshape_test.cc index e8f4586b23..a8aa046a64 100644 --- a/tests/ops/reshape_test.cc +++ b/tests/ops/reshape_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Reshape_test"); +Legion::Logger log_app("Reshape_test"); struct ReshapeTestMeta { int i_dim, o_dim; diff --git a/tests/ops/tanh_test.cc b/tests/ops/tanh_test.cc index 1c24d96aaf..1e86934f86 100644 --- a/tests/ops/tanh_test.cc +++ b/tests/ops/tanh_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Tanh_test"); +Legion::Logger log_app("Tanh_test"); struct TanhTestMeta { int i_dim, o_dim; diff --git a/tests/ops/transpose_test.cc b/tests/ops/transpose_test.cc index 10481aa14f..045f28479c 100644 --- a/tests/ops/transpose_test.cc +++ b/tests/ops/transpose_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("transpose_test"); +Legion::Logger log_app("transpose_test"); struct TransposeTestMeta { int m, k, d; From f747438f0927ec528d481cfd6b9c7f15465677c9 Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:49:54 -0400 Subject: [PATCH 339/344] Managed mem support (#1466) * feat: fix missed compile definition * feat: add func `get_proc_mem` to process memory allocation * chore: minor * chore: try to use get_proc_mem * fix: proc_mem allocation * feat: switch to use get_proc_mem * feat: update Realm::Logger definition * fix: now all memory are allocated by get_proc_mem * chore: minor * fix: no memory allocation bugs * chore: merge file * chore: don't use ManagedMemory for now --- CMakeLists.txt | 1 + include/flexflow/model.h | 1 + include/flexflow/ops/batch_norm.h | 1 + include/flexflow/utils/memory_allocator.h | 2 ++ src/mapper/mapper.cc | 7 ++----- src/ops/add_bias_residual_layer_norm.cc | 5 +---- src/ops/argmax.cc | 5 +---- src/ops/attention.cc | 5 +---- src/ops/batch_norm.cpp | 5 +---- src/ops/batch_norm.cu | 5 +---- src/ops/beam_topk.cc | 5 +---- src/ops/dropout.cc | 5 +---- src/ops/inc_multihead_self_attention.cc | 5 +---- src/ops/layer_norm.cc | 5 +---- src/ops/linear.cc | 5 +---- src/ops/residual_layer_norm.cc | 5 +---- src/ops/residual_rms_norm.cc | 5 +---- src/ops/rms_norm.cc | 5 +---- src/ops/sampling.cc | 5 +---- src/ops/sigmoid_silu_multi.cc | 5 +---- src/ops/spec_inc_multihead_self_attention.cc | 5 +---- src/ops/tree_inc_multihead_self_attention.cc | 5 +---- src/runtime/graph.cc | 5 +---- src/runtime/memory_allocator.cc | 12 ++++++++++++ src/runtime/model.cc | 4 ++-- src/runtime/model.cpp | 15 +++------------ src/runtime/model.cu | 15 +++------------ 27 files changed, 45 insertions(+), 103 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7079fdadb8..d7a6391e06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -496,6 +496,7 @@ if(NOT BUILD_LEGION_ONLY) if(NOT CARGO_RESULT EQUAL 0) message(FATAL_ERROR "Rust is installed, but cargo is not. 
Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") endif() + set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON) add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) target_link_libraries(flexflow tokenizers_cpp) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 95be9ab581..ea64f65a95 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -22,6 +22,7 @@ #include "flexflow/node.h" #include "flexflow/operator_params.h" #include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/memory_allocator.h" #include "flexflow/utils/tuple.h" #include "initializer.h" #include "layer.h" diff --git a/include/flexflow/ops/batch_norm.h b/include/flexflow/ops/batch_norm.h index c923dc1097..01cc0e16ec 100644 --- a/include/flexflow/ops/batch_norm.h +++ b/include/flexflow/ops/batch_norm.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_BATCH_NORM_H #include "flexflow/model.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h index 8e50a4c3b3..7091b159b2 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -62,6 +62,8 @@ class MemoryAllocator { size_t instance_total_size, instance_allocated_size; }; +Legion::Memory get_proc_mem(Legion::Machine machine, Legion::Processor proc); + }; // namespace FlexFlow #endif // _FLEXFLOW_RUNTIME_H_ diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index 4413d516ac..d7b9a5e99d 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -14,6 +14,7 @@ */ #include "flexflow/mapper.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { @@ -81,11 +82,7 @@ FFMapper::FFMapper(MapperRuntime *rt, if (it->address_space() == node_id) { local_gpus.push_back(*it); } - Machine::MemoryQuery fb_query(machine); - fb_query.only_kind(Memory::GPU_FB_MEM); - fb_query.best_affinity_to(*it); - assert(fb_query.count() == 1); - proc_fbmems[*it] = *(fb_query.begin()); + proc_fbmems[*it] = get_proc_mem(machine, *it); Machine::MemoryQuery zc_query(machine); zc_query.only_kind(Memory::Z_COPY_MEM); zc_query.has_affinity_to(*it); diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index e670380901..a17e156f18 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -493,10 +493,7 @@ OpMeta *AddBiasResidualLayerNorm::init_task( Runtime *runtime) { AddBiasResidualLayerNorm *ln = (AddBiasResidualLayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); AddBiasResidualLayerNormMeta *meta = new AddBiasResidualLayerNormMeta(handle, ln, gpu_mem_allocator); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index a52ce1886b..1892ac2353 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -233,10 +233,7 @@ OpMeta *ArgMax::init_task(Task const *task, ctx, task->regions[1].region.get_index_space()); int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; int batch_size = acc_input.domain.get_volume() / length; - Memory gpu_mem = 
Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ArgMaxMeta *m = new ArgMaxMeta(handle, diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 97afc94341..203662d3ec 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -514,10 +514,7 @@ OpMeta * acc_output.rect.hi[1] - acc_output.rect.lo[1] + 1); assert(attn->oProjSize == acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MultiHeadAttentionMeta *m = new MultiHeadAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 106e5ebad2..7dee6fdaaf 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -61,10 +61,7 @@ __host__ OpMeta * int output_c = acc_output.rect.hi[2] - acc_output.rect.lo[2] + 1; int output_n = acc_output.rect.hi[3] - acc_output.rect.lo[3] + 1; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); BatchNormMeta *m = new BatchNormMeta( handle, bm, gpu_mem, output_n, output_c, output_h, output_w); return m; diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index b77e9d489f..929ebf81f8 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -58,10 +58,7 @@ __host__ OpMeta * int output_c = acc_output.rect.hi[2] - acc_output.rect.lo[2] + 1; int output_n = acc_output.rect.hi[3] - acc_output.rect.lo[3] + 1; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); BatchNormMeta *m = new BatchNormMeta( handle, bm, gpu_mem, output_n, output_c, output_h, output_w); return m; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index d2054cacb0..5f4547ace5 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -271,10 +271,7 @@ OpMeta *BeamTopK::init_task(Task const *task, Runtime *runtime) { BeamTopK *topk = (BeamTopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator); m->profiling = topk->profiling; diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 58cb82d53d..190d6fd496 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -164,10 +164,7 @@ OpMeta *Dropout::init_task(Task const *task, ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); 
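// NOTE (editorial, hedged sketch): get_proc_mem itself is defined in
// src/runtime/memory_allocator.cc (12 added lines in this commit's diffstat), which this
// excerpt does not show. Based on the Machine::MemoryQuery chain it replaces at every
// call site in the "Managed mem support" commit, and the commit note "don't use
// ManagedMemory for now", the helper presumably centralizes the same framebuffer lookup,
// roughly:
Legion::Memory get_proc_mem(Legion::Machine machine, Legion::Processor proc) {
  // Select the GPU framebuffer memory with the best affinity to `proc` -- the same
  // .only_kind(GPU_FB_MEM).best_affinity_to(proc).first() query that the individual
  // operators performed inline before this commit.
  Legion::Machine::MemoryQuery proc_mem(machine);
  proc_mem.only_kind(Legion::Memory::GPU_FB_MEM);
  proc_mem.best_affinity_to(proc);
  assert(proc_mem.count() > 0);
  return proc_mem.first();
}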
assert(input_domain == output_domain); DropoutMeta *m = new DropoutMeta(handle, dropout, gpu_mem, output_domain); std::strcpy(m->op_name, dropout->name); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8688585788..aa60d0f19c 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -698,10 +698,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { // cpu-offload enabled diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 2218ffe392..b19f400eb2 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -380,10 +380,7 @@ OpMeta *LayerNorm::init_task(Task const *task, Runtime *runtime) { LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); LayerNormMeta *meta = new LayerNormMeta(handle, ln, gpu_mem_allocator); std::strcpy(meta->op_name, ln->name); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 0c7a0f78fe..44b56d623e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -480,10 +480,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, // in_dim, // out_dim, // batch_size); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (linear->offload) { // cpu-offload enabled diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index ed9252c309..8dd670eea3 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -489,10 +489,7 @@ OpMeta *ResidualLayerNorm::init_task(Task const *task, Runtime *runtime) { ResidualLayerNorm *ln = (ResidualLayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ResidualLayerNormMeta *meta = new ResidualLayerNormMeta(handle, ln, gpu_mem_allocator); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index f4f5bb72d0..b3ee7179d0 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -347,10 +347,7 @@ OpMeta *ResidualRMSNorm::init_task(Task const *task, Runtime *runtime) { ResidualRMSNorm *rn = (ResidualRMSNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ResidualRMSNormMeta *meta = new 
ResidualRMSNormMeta(handle, rn, gpu_mem_allocator); diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index bf07ee6bb0..79dce65c57 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -294,10 +294,7 @@ OpMeta *RMSNorm::init_task(Task const *task, Runtime *runtime) { RMSNorm *rn = (RMSNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); RMSNormMeta *meta = new RMSNormMeta(handle, rn, gpu_mem_allocator); std::strcpy(meta->op_name, rn->name); diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 9fc2316f9a..b38c68843b 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -226,10 +226,7 @@ OpMeta *Sampling::init_task(Task const *task, int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; int batch_size = acc_input.domain.get_volume() / length; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); SamplingMeta *m = new SamplingMeta( handle, s, batch_size, length * batch_size, acc_input, gpu_mem_allocator); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3ddd6b8d6e..3d1c8d9094 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -237,10 +237,7 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task, Runtime *runtime) { SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); SigmoidSiluMultiMeta *meta = new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 9c6ed0e0b6..68d3a4c205 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -640,10 +640,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( int num_kv_heads = attn->num_kv_heads; assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); // We don't do offloading for SSMs (small speculative models) SpecIncMultiHeadSelfAttentionMeta *m = diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 9b8c88420d..df722a3d51 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -697,10 +697,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); 
+ Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { // cpu-offload enabled diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index cf75235ae7..b023aced6e 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1914,10 +1914,7 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MachineModel *machine; if (model->config.machine_model_version == 0) { machine = diff --git a/src/runtime/memory_allocator.cc b/src/runtime/memory_allocator.cc index 06a7c468a4..cb4e867165 100644 --- a/src/runtime/memory_allocator.cc +++ b/src/runtime/memory_allocator.cc @@ -19,7 +19,9 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; +using Legion::Machine; using Legion::Memory; +using Legion::Processor; using Realm::RegionInstance; MemoryAllocator::MemoryAllocator(Memory _memory) @@ -51,4 +53,14 @@ void MemoryAllocator::register_reserved_work_space(void *base, size_t size) { reserved_allocated_size = 0; } +// Now it's for allocating FB memory, in the future we can +// add more types of memory allocation if needed +Memory get_proc_mem(Machine machine, Processor proc) { + Machine::MemoryQuery proc_mem = Machine::MemoryQuery(machine) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(proc); + assert(proc_mem.count() > 0); + return proc_mem.first(); +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5cad628743..f1e222e6e3 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4273,8 +4273,8 @@ void FFConfig::parse_args(char **argv, int argc) { workersPerNode = atoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-ll:fsize")) { - device_mem = atoi(argv[++i]); + if ((!strcmp(argv[i], "-ll:fsize")) || (!strcmp(argv[i], "-ll:msize"))) { + device_mem += atoi(argv[++i]); continue; } if (!strcmp(argv[i], "--nodes")) { diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index ad2b781567..62f6b89b7f 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -112,10 +112,7 @@ FFHandler // handle.workSpace = memFBImpl->get_direct_ptr(offset, 0); { // allocate memory for workspace - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.workSpaceSize - 1)); @@ -133,10 +130,7 @@ FFHandler } if (handle.offload_reserve_space_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); @@ -157,10 +151,7 @@ FFHandler } if (handle.batch_config_metadata_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory 
gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 23b7f0efbe..fd39ed0db0 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -108,10 +108,7 @@ FFHandler // handle.workSpace = memFBImpl->get_direct_ptr(offset, 0); { // allocate memory for workspace - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.workSpaceSize - 1)); @@ -129,10 +126,7 @@ FFHandler } if (handle.offload_reserve_space_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); @@ -153,10 +147,7 @@ FFHandler } if (handle.batch_config_metadata_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); From 6d710acd79f968f65397874f62b8ebef20590620 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 20 Aug 2024 14:06:52 -0600 Subject: [PATCH 340/344] pip flexflow_python typo (#1461) Co-authored-by: Zhihao Jia --- python/flexflow/flexflow_python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/flexflow_python b/python/flexflow/flexflow_python index cf247b9ede..8a9b65a404 100644 --- a/python/flexflow/flexflow_python +++ b/python/flexflow/flexflow_python @@ -6,7 +6,7 @@ python_packages=$(python -c "from distutils import sysconfig; print(sysconfig.ge pylib_path="$(python "$python_packages"/flexflow/findpylib.py)" pylib_dir="$(dirname "$pylib_path")" export PATH="${python_packages}/flexflow/bin:${PATH}" -export LD_LIBRARY_PATH="${python_packages}/flexflow/lib:${pylib_dir}:${PATH}" +export LD_LIBRARY_PATH="${python_packages}/flexflow/lib:${pylib_dir}:${LD_LIBRARY_PATH}" legion_python_args=("$@" "-ll:py" "1") legion_python "${legion_python_args[@]}" From 3b59f0577cc6fc3a109921f72ceadef3458cf635 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 29 Aug 2024 00:04:28 +0200 Subject: [PATCH 341/344] update legion version --- deps/legion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/legion b/deps/legion index 02eb1010ca..0d32b35542 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 02eb1010ca9eb449d345a0db97eab17efb0e5af0 +Subproject commit 0d32b35542bc0e9aba5950e485b8fc3413ae664b From 28aff70cc98d065390eb58b7fd15dcd24f3fb786 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 31 Aug 2024 06:00:57 -0700 Subject: [PATCH 342/344] Fix nccl-induced segfault (#1481) --- include/flexflow/model.h | 1 + src/runtime/model.cc | 68 ++++++++++++++++++---------------- src/runtime/request_manager.cc 
| 3 ++ 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index ea64f65a95..6dda67bbfe 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1079,6 +1079,7 @@ class FFModel { bool use_propagation) const; #ifdef FF_USE_NCCL ncclComm_t *find_nccl_comms(MachineView const &view) const; + void finish_nccl_comms(); #endif #ifdef FF_USE_PROPAGATE void propagate(std::map const ¤t, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f1e222e6e3..4c67de1aa9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1589,41 +1589,47 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +#ifdef FF_USE_NCCL +void FFModel::finish_nccl_comms() { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } +} +#endif + FFModel::~FFModel() { // Destroy nccl communication groups #ifdef FF_USE_NCCL if (config.computationMode == COMP_MODE_TRAINING) { - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - for (auto const &comm : view_hash_to_nccl_comms) { - // Find the machine view that has the hash - MachineView view; - for (size_t l = 0; l < operators.size(); l++) { - view = operators[l]->outputs[0]->machine_view; - if (view.hash() == comm.first) { - break; - } - } - assert(view.hash() == comm.first && "Cannot find the machine view"); - IndexSpace task_is = get_or_create_task_is(view); - Domain domain = runtime->get_index_space_domain(ctx, task_is); - ArgumentMap argmap; - int idx = 0; - for (Domain::DomainPointIterator it(domain); it; it++, idx++) { - argmap.set_point(*it, - TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); - } - IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, - task_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - comm.first); - FutureMap fm = runtime->execute_index_space(ctx, index_launcher); - fm.wait_all_results(); - } + finish_nccl_comms(); } #endif } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d21285eef2..bada87ab19 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2365,6 +2365,9 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } +#ifdef FF_USE_NCCL + llm->finish_nccl_comms(); +#endif } /*static*/ From 49523d62691039a9a8c29891acc5d48641048cc4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 2 Sep 2024 03:05:25 -0700 Subject: [PATCH 343/344] Fix python install issue caused by new Legion version (#1482) * 
fix * . * . * fix * cleanup * fix * cleanup --- CMakeLists.txt | 20 ++++++++++++++------ cmake/pip_install/CMakeLists.txt | 20 ++++++++++++++++++-- pyproject.toml | 3 ++- requirements.txt | 1 + 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7a6391e06..c82a53644e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,13 +37,24 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) STRING "Choose the type of build." FORCE) endif() +# option for using Python +option(FF_USE_PYTHON "Enable Python" ON) +if (FF_USE_PYTHON) + find_package(Python3 COMPONENTS Interpreter Development) +endif() + if(INSTALL_DIR) message(STATUS "INSTALL_DIR: ${INSTALL_DIR}") set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE) else() - # Install DIR not set. Use default, unless a conda environment is active - if (DEFINED ENV{CONDA_PREFIX} AND NOT FF_BUILD_FROM_PYPI) - set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + # Install DIR not set. Use default, unless a conda environment is in use + if ((DEFINED ENV{CONDA_PREFIX} OR (Python3_EXECUTABLE AND Python3_EXECUTABLE MATCHES "conda")) AND NOT FF_BUILD_FROM_PYPI) + if (DEFINED ENV{CONDA_PREFIX}) + set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + else() + get_filename_component(CONDA_PREFIX "${Python3_EXECUTABLE}" DIRECTORY) + get_filename_component(CONDA_PREFIX "${CONDA_PREFIX}" DIRECTORY) + endif() # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE) message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") @@ -64,9 +75,6 @@ option(FF_BUILD_FROM_PYPI "Build from pypi" OFF) # build shared or static flexflow lib option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" ON) -# option for using Python -option(FF_USE_PYTHON "Enable Python" ON) - # option for building legion only option(BUILD_LEGION_ONLY "Build Legion only" OFF) diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 105133a310..217d7e14f0 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -2,9 +2,25 @@ if (FF_USE_PYTHON) execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) - install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") + cmake_path(SET CMAKE_SOURCE_DIR_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion) + cmake_path(SET CMAKE_BUILD_DIR_ NORMALIZE ${Legion_BINARY_DIR}/runtime) + cmake_path(SET CMAKE_INSTALL_PREFIX_ NORMALIZE ${PY_DEST}/../../..) 
+ cmake_path(SET WORKING_DIRECTORY_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/) # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + # CMAKE_SOURCE_DIR_=/usr/FlexFlow/deps/legion + # CMAKE_BUILD_DIR_: /usr/FlexFlow/build//deps/legion/runtime + # CMAKE_INSTALL_PREFIX_: /opt/conda/ or /usr/local + # WORKING_DIRECTORY_: /usr/FlexFlow/deps/legion/bindings/python/ + # PY_DEST: /python3.11/site-packages + message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") + message(STATUS "Legion_BINARY_DIR: ${Legion_BINARY_DIR}") + message(STATUS "CMAKE_SOURCE_DIR_: ${CMAKE_SOURCE_DIR_}") + message(STATUS "CMAKE_BUILD_DIR_: ${CMAKE_BUILD_DIR_}") + message(STATUS "CMAKE_INSTALL_PREFIX_: ${CMAKE_INSTALL_PREFIX_}") + message(STATUS "WORKING_DIRECTORY_: ${WORKING_DIRECTORY_}") + message(STATUS "PY_DEST: ${PY_DEST}") + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${CMAKE_INSTALL_PREFIX_} \")") + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E env CMAKE_SOURCE_DIR=${CMAKE_SOURCE_DIR_} CMAKE_BUILD_DIR=${CMAKE_BUILD_DIR_} CMAKE_INSTALL_PREFIX=${PY_DEST}/flexflow ${Python3_EXECUTABLE} setup.py install --prefix ${CMAKE_INSTALL_PREFIX_} ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${WORKING_DIRECTORY_} COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)") endif() endif() diff --git a/pyproject.toml b/pyproject.toml index 4b8214f3fe..373c53beb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ requires = [ "setuptools_scm[toml]>=6.0", "cmake-build-extension", "ninja", - "requests" + "requests", + "pip", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 1037661337..ad65622367 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ onnx transformers>=4.31.0 sentencepiece einops +pip From a0f1ed783e3ef48ac374563cf3f4fc2388f34b4c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 4 Sep 2024 14:15:06 -0400 Subject: [PATCH 344/344] PEFT support (inference/finetuning) (#1153) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * . * . * Update the default cublas behavior when CUDA_VERSION is not specified * fix bugs in IncMHA peft_bwd kernel * uncomment softmaxbackward * add layernorm to align test * add peft test scripts * fix import * fix * add code to convert peft models * add script to download peft for c++, fix bug * fix * add script to fine-tune models * implement loading lora configs/weights from file * remove peft_bwd assertion failure in embedding * fix download script * add peft dependencies in dockerfile * fix softmax backward * fix bc print indentation * Temporarily Revert "Update the default cublas behavior when CUDA_VERSION is not specified" This reverts commit 4ee710a76ee4f47b4574c57519e2b0fb96efaa6a. 
* Fix cublas default (#1220) * Fix Legion prebuild workflow (2) (#1208) * fix * fix * fix * fix * Fix Legion prebuild workflow (3) (#1210) * fix hip error * use CUBLAS_COMPUTE_FAST_16F for full-precision gemm --------- Co-authored-by: Zhihao Jia * fix bugs, work on align opt-lora * update scripts * add code to output peft tensors in hf * update, fixes * linting * fix printing of tensors for numpy * update save_inference_tensors_to_file * linting * update * fix issue with save_inference_tensors_to_file * fix layer names for save_inference_tensors_to_file * fix peft * fix bwd bugs * linting * fixes * fix * fix * fix * add bc fields for peft training * linting * fix * remove ptr check * fix * implement save_operators for bwd * fix bug * implement save tensors for bwd * . * bug fix * fix * align linear * fix * bwd kernel updates * undo use of CUBLAS_COMPUTE_32F_FAST_16F for now * only send dataset entry once * update peft test scripts * loss * . * update generate/request api to take both inference and fine-tuning prompts * linting * alignment fixes in lora & linear layer * alignment fix * diagonal * fix * alignment fix ssm * sigmoid-silu-multi now fully aligned * rms norm kernel updates * fix * in-place residual rms * bug fix and linting * align backward of o_proj, attn_heads, qk_prods_softmax, and v_proj with huggingface * cleanup * finished all alignment fixes in attention backward kernel * fix * Update inc_multihead_self_attention.cu * Update inc_multihead_self_attention.cu * use grad to store peft in/output (#1241) * use grad to store peft in/output * format * . * format * enable peft request * several hacks for performance measurement; some of the changes should be reverted * Update sigmoid_silu_multi.cu * RoPE backward * PEFT bug fixes and alignment (#1269) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * linting * Fuse bias + relu in OPT (#1271) * fuse bias and relu in opt * fix * fix * fix * fix * Peft alignment & debugging tools (#1288) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * fix * fix * fix * update * simplify tensor names * fix * fixes and updates * fixes * fix * cleanup * . * restore softmax * cleanup * update alignment scripts * newline * fix legion aliasing error * fix warnings * fix * fix pipeline parallelism * fix tp issue in combine op * fix lora weight loading with tensor parallelism * fixes, implement Combine::peft_bwd_task * fix * replicate peft bwd * fixes * fix * fix combine and fwd-bwd pass dependencies * fix replicate bwd * fix * let user control amount of peft memory * only run peft_bwd if peft is enabled * fix rms norm inference region reqs * fix in-place fusion (part 1) * fix inplace fusion (part 2) * fix * disable automatic inplace rms norm for now * fix inf fusion inplace * fix rest input grads for peft without inplace residuals * fix * fix * fix residual rms * fix * fix * enable inf debugging in fusion bwd * hack to silence warning in fused bwd * fix * fix * fix build * fix * fix * add draft peft test * Peft python interface (#1306) * update script * less model renaming * fix * fix * fix * backup * . * update * . 
* fixes * fix * fix build * fix * fix * fix issues for downloading peft model * solved issues for download peft model * added printouts for debugging * fix * fix seg fault * add test, separate peft script in cpp * fix * fixes * fix * update peft python interface * update * update * update * updates * fix * fixes * fix * fixes --------- Co-authored-by: april-yyt * fix * update * fix * fix to support prompts larger than max tokens per batch * fixes to support benchmarking of finetuning throughput * many upgrades and updates related to finetuning * add ttft statistics * add warmup phase * add benchmarking code * Add scripts for evaluation with Microsoft Azure trace (#1363) * Add scripts for evaluation * Add absolute request rate value * Fix script for target arrival rate * Fix cpp req rate benchmark * update to use new dataset * Fix infinite loop * update * add data --------- Co-authored-by: Remi Delacourt Co-authored-by: Gabriele Oliaro * fix * fix * add peft tests to ci * shellcheck * fix * fix python requirements * fix * fix * update ci test * update alignment doc * fix cross entropy loss bug * update alignment test * update test * add llama peft alignment test to ci * Fix values for unused params in incr_decoding * Add PEFTModelID NO_ID singleton instead of None * Fix PEFTModelID::NO_ID reference * reduce logging * fix * fix * Add peft demo * Add readme for demo * fix alignment issue * Peft optimizer (#1290) * add optimizer config, only allocate weights for training * sgd 1 * sgd 2 * update * fix * linting * . * . * fix * fix allreduce bug * update * update * add optimizer hook in hf * update * update script * . * fix * fwd * bwd * start grads * fix gradient misalignment! * update * Add support for llama3 * various fixes --------- Co-authored-by: Remi Delacourt * Optimizers python interface (#1441) * python interface for optimizer * update lora linear config to support python interface * update python interface * finished lora python interface * fix * fix * update * update * more fixes * fix * initialize lora weights where needed * Add notebook * Update demo to use dataset * Fix' * Save weights after end of finetuning (#1446) * support accumulation of gradients without update * add code to save peft weights * fix * save configs * cleanup * Fully use notebook for demo * Parameterize generation and finetuning configs * Comment out inference for now * fix bug in lora inference only mode * fix * Add finetuning or inference only flags * fix * fix * fix * PEFT model upload (#1450) * upload test * fix * Make demo_class.py executable * fix * add base_model_name_or_path * fix * fix * support llama-3 tokenizer * print output tokens when not benchmarking * Use Llama3 in demo_class * Use Llama3 in demo * fix data loading for llama-3 * Add download models to demo * return/print loss at each finetuning step * fix * Adjust demo parameters * Fix for finetuning * pass finetuning losses to python interface * Update demo * Fix upload * Refactor demo * rename demo_class to demo * fix * remove epoch from loss print * Finish demo * fix test * rocm fixes * more rocm fixes * fix rocm build * docker fix * fix inference test * fix workflow * fix makefile * fix peft test * fix all-reduce issue with lora for TP scenario * fix bwd lm head * fixes * more fixes * update * fix alignment up to input ln * finished aligning all backward (tp>1) * align all peft * fix * fix broken link * formatting * fix * update * Revert "update" This reverts commit 90b2c876ca3ea9c29e59aa7ae9904f254298660d. 
* update * fix hip build * fix gpu ci * fix gpu ci * update default gpu ci version to 12.0 * update ci to 12.0 * fix * fix * update * fix * fix * update * fix * add cleanup * downgrade to cuda=11.8 --------- Co-authored-by: Gabriele Oliaro Co-authored-by: xinhaoc Co-authored-by: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Co-authored-by: april-yyt Co-authored-by: Remi <54138269+Flechman@users.noreply.github.com> Co-authored-by: Remi Delacourt Co-authored-by: Rémi Delacourt --- .github/workflows/build.yml | 12 +- .github/workflows/gpu-ci.yml | 10 + .github/workflows/helpers/install_cudnn.sh | 23 +- .github/workflows/helpers/install_nccl.sh | 8 +- .github/workflows/multinode-test.yml | 6 +- .github/workflows/pip-install.yml | 4 +- .github/workflows/prebuild-legion.yml | 4 +- .gitignore | 5 + CMakeLists.txt | 1 + conda/flexflow.yml | 7 + config/config.inc | 2 +- docker/build.sh | 9 +- docker/flexflow-environment/Dockerfile | 2 + docker/run.sh | 2 +- include/flexflow/batch_config.h | 42 +- include/flexflow/config.h | 41 +- include/flexflow/ffconst.h | 77 +- include/flexflow/fftype.h | 25 + include/flexflow/flexflow_c.h | 136 +- include/flexflow/inference.h | 1 + include/flexflow/layer.h | 2 +- include/flexflow/model.h | 61 +- include/flexflow/op_meta.h | 6 +- include/flexflow/operator.h | 95 +- include/flexflow/operator_params.h | 4 + .../ops/add_bias_residual_layer_norm.h | 63 +- .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/aggregate.h | 4 +- include/flexflow/ops/aggregate_spec.h | 4 +- include/flexflow/ops/argmax.h | 11 +- include/flexflow/ops/cache.h | 4 +- include/flexflow/ops/element_unary.h | 4 +- include/flexflow/ops/embedding.h | 5 + include/flexflow/ops/experts.h | 17 +- include/flexflow/ops/fused.h | 9 + include/flexflow/ops/groupby.h | 4 +- .../ops/inc_multihead_self_attention.h | 23 +- .../ops/kernels/batch_matmul_kernels.h | 4 +- include/flexflow/ops/kernels/cast_kernels.h | 4 +- include/flexflow/ops/kernels/concat_kernels.h | 4 +- .../flexflow/ops/kernels/conv_2d_kernels.h | 4 +- include/flexflow/ops/kernels/flat_kernels.h | 4 +- .../inc_multihead_self_attention_utils.cuh | 27 +- include/flexflow/ops/kernels/linear_kernels.h | 31 + .../ops/kernels/lora_linear_kernels.h | 77 + .../flexflow/ops/kernels/pool_2d_kernels.h | 4 +- .../flexflow/ops/kernels/reshape_kernels.h | 6 +- .../ops/kernels/residual_rms_norm_kernels.h | 30 +- .../flexflow/ops/kernels/rms_norm_kernels.h | 23 +- .../flexflow/ops/kernels/softmax_kernels.h | 46 +- .../flexflow/ops/kernels/transpose_kernels.h | 4 +- include/flexflow/ops/layer_norm.h | 50 +- include/flexflow/ops/linear.h | 9 + include/flexflow/ops/lora_linear.h | 99 + include/flexflow/ops/lora_linear_params.h | 150 + include/flexflow/ops/residual_layer_norm.h | 41 +- .../flexflow/ops/residual_layer_norm_params.h | 1 + include/flexflow/ops/residual_rms_norm.h | 16 + .../flexflow/ops/residual_rms_norm_params.h | 1 + include/flexflow/ops/rms_norm.h | 13 + include/flexflow/ops/sigmoid_silu_multi.h | 33 +- include/flexflow/ops/softmax.h | 9 + include/flexflow/ops/topk.h | 4 +- include/flexflow/ops/transpose.h | 2 + .../ops/tree_inc_multihead_self_attention.h | 2 +- include/flexflow/parallel_ops/allreduce.h | 19 +- include/flexflow/parallel_ops/combine.h | 13 + .../parallel_ops/kernels/allreduce_kernels.h | 14 +- .../parallel_ops/kernels/combine_kernels.h | 4 +- .../kernels/parallel_identity_kernels.h | 41 + .../parallel_ops/kernels/partition_kernels.h | 4 +- .../flexflow/parallel_ops/parallel_identity.h | 83 + 
.../parallel_ops/parallel_identity_params.h | 22 + include/flexflow/parallel_ops/parallel_op.h | 2 +- include/flexflow/parallel_ops/replicate.h | 9 + include/flexflow/request_manager.h | 44 +- include/flexflow/simulator.h | 56 +- include/flexflow/utils/cuda_helper.h | 13 +- include/flexflow/utils/hip_helper.h | 33 +- include/flexflow/utils/memory_allocator.h | 5 + .../flexflow/utils/peft_weight_allocator.h | 92 + inference/MODEL_WEIGHTS.md | 28 - inference/README.md | 42 + inference/incr_decoding/incr_decoding.cc | 11 +- inference/models/falcon.cc | 16 +- inference/models/llama.cc | 112 +- inference/models/llama.h | 11 +- inference/models/mpt.cc | 23 +- inference/models/opt.cc | 27 +- inference/models/starcoder.cc | 19 +- inference/peft/CMakeLists.txt | 139 + inference/peft/Makefile | 37 + inference/peft/peft.cc | 387 ++ inference/peft/peft_bwd_benchmark.cc | 391 ++ inference/peft/peft_fwd_benchmark.cc | 363 ++ inference/peft/req_rate_benchmark.cc | 518 ++ inference/python/ff_peft.py | 189 + inference/python/incr_decoding.py | 5 +- inference/python/peft_demo/INSTRUCTIONS.md | 25 + inference/python/peft_demo/demo.ipynb | 1907 +++++++ inference/python/peft_demo/demo.py | 240 + inference/python/spec_infer.py | 7 +- inference/spec_infer/spec_infer.cc | 11 +- inference/utils/download_peft_model.py | 68 + inference/utils/upload_peft_model.py | 142 + python/flexflow/core/__init__.py | 5 +- python/flexflow/core/flexflow_cffi.py | 5024 +++++++++-------- python/flexflow/serve/__init__.py | 43 +- python/flexflow/serve/models/base.py | 3 + python/flexflow/serve/models/falcon.py | 41 +- python/flexflow/serve/models/llama.py | 48 +- python/flexflow/serve/models/mpt.py | 46 +- python/flexflow/serve/models/opt.py | 51 +- python/flexflow/serve/models/starcoder.py | 47 +- python/flexflow/serve/serve.py | 446 +- python/flexflow/type.py | 11 + rdelacou/generate_trace.py | 121 + requirements.txt | 8 + src/c/flexflow_c.cc | 382 +- src/loss_functions/loss_functions.cpp | 8 +- src/loss_functions/loss_functions.cu | 8 +- src/ops/add_bias_residual_layer_norm.cc | 607 +- src/ops/add_bias_residual_layer_norm.cpp | 748 ++- src/ops/add_bias_residual_layer_norm.cu | 609 +- src/ops/aggregate.cc | 6 +- src/ops/aggregate.cpp | 9 +- src/ops/aggregate.cu | 7 +- src/ops/aggregate_spec.cc | 6 +- src/ops/aggregate_spec.cpp | 7 +- src/ops/aggregate_spec.cu | 7 +- src/ops/arg_topk.cc | 11 +- src/ops/argmax.cc | 42 +- src/ops/argmax.cpp | 81 +- src/ops/argmax.cu | 86 +- src/ops/attention.cc | 2 +- src/ops/attention.cpp | 2 +- src/ops/attention.cu | 2 +- src/ops/batch_matmul.cc | 4 +- src/ops/batch_norm.cpp | 2 +- src/ops/batch_norm.cu | 2 +- src/ops/beam_topk.cc | 10 +- src/ops/beam_topk.cpp | 2 +- src/ops/beam_topk.cu | 2 +- src/ops/cache.cc | 2 +- src/ops/cache.cpp | 2 +- src/ops/cache.cu | 2 +- src/ops/cast.cc | 2 +- src/ops/concat.cc | 4 +- src/ops/conv_2d.cc | 17 +- src/ops/element_binary.cc | 10 +- src/ops/element_unary.cc | 4 +- src/ops/element_unary.cpp | 3 +- src/ops/element_unary.cu | 3 +- src/ops/embedding.cc | 18 +- src/ops/experts.cc | 17 +- src/ops/experts.cpp | 30 +- src/ops/experts.cu | 65 +- src/ops/flat.cc | 3 +- src/ops/fused.cc | 234 +- src/ops/fused.cpp | 1257 +++-- src/ops/fused.cu | 1410 +++-- src/ops/group_by.cc | 6 +- src/ops/group_by.cpp | 6 +- src/ops/group_by.cu | 6 +- src/ops/inc_multihead_self_attention.cc | 139 +- src/ops/inc_multihead_self_attention.cpp | 1782 ++++-- src/ops/inc_multihead_self_attention.cu | 756 ++- src/ops/kernels/batch_matmul.cpp | 4 +- src/ops/kernels/batch_matmul.cu | 4 +- 
src/ops/kernels/cast_kernels.cpp | 3 +- src/ops/kernels/cast_kernels.cu | 3 +- src/ops/kernels/concat_kernels.cpp | 4 + src/ops/kernels/concat_kernels.cu | 4 + src/ops/kernels/conv_2d_kernels.cpp | 10 +- src/ops/kernels/conv_2d_kernels.cu | 10 +- src/ops/kernels/dropout_kernels.cpp | 2 +- src/ops/kernels/dropout_kernels.cu | 2 +- src/ops/kernels/flat_kernels.cpp | 4 + src/ops/kernels/flat_kernels.cu | 4 + src/ops/kernels/linear_kernels.cpp | 423 +- src/ops/kernels/linear_kernels.cu | 268 +- src/ops/kernels/lora_linear_kernels.cpp | 576 ++ src/ops/kernels/lora_linear_kernels.cu | 579 ++ src/ops/kernels/pool_2d_kernels.cpp | 4 +- src/ops/kernels/pool_2d_kernels.cu | 4 +- src/ops/kernels/reshape_kernels.cpp | 4 +- src/ops/kernels/reshape_kernels.cu | 4 +- src/ops/kernels/residual_rms_norm_kernels.cpp | 438 +- src/ops/kernels/residual_rms_norm_kernels.cu | 454 +- src/ops/kernels/rms_norm_kernels.cpp | 396 +- src/ops/kernels/rms_norm_kernels.cu | 444 +- src/ops/kernels/softmax.cpp | 284 +- src/ops/kernels/softmax.cu | 275 +- src/ops/kernels/transpose_kernels.cpp | 4 + src/ops/kernels/transpose_kernels.cu | 4 + src/ops/layer_norm.cc | 181 +- src/ops/layer_norm.cpp | 479 +- src/ops/layer_norm.cu | 352 +- src/ops/linear.cc | 154 +- src/ops/lora_linear.cc | 1316 +++++ src/ops/lora_linear_params.cc | 221 + src/ops/mean.cc | 3 +- src/ops/noop.cc | 7 +- src/ops/pool_2d.cc | 4 +- src/ops/reduce.cc | 2 +- src/ops/reduce.cpp | 2 +- src/ops/reduce.cu | 2 +- src/ops/reshape.cc | 4 +- src/ops/residual_layer_norm.cc | 521 +- src/ops/residual_layer_norm.cpp | 695 ++- src/ops/residual_layer_norm.cu | 690 ++- src/ops/residual_rms_norm.cc | 512 +- src/ops/rms_norm.cc | 168 +- src/ops/sampling.cc | 6 +- src/ops/sigmoid_silu_multi.cc | 187 +- src/ops/sigmoid_silu_multi.cpp | 297 +- src/ops/sigmoid_silu_multi.cu | 264 +- src/ops/softmax.cc | 261 +- src/ops/spec_inc_multihead_self_attention.cc | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 17 +- src/ops/spec_inc_multihead_self_attention.cu | 66 +- src/ops/split.cc | 2 +- src/ops/topk.cc | 6 +- src/ops/topk.cpp | 3 +- src/ops/topk.cu | 3 +- src/ops/transpose.cc | 6 +- src/ops/tree_inc_multihead_self_attention.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cpp | 654 ++- src/ops/tree_inc_multihead_self_attention.cu | 58 +- src/parallel_ops/allreduce.cc | 287 +- src/parallel_ops/combine.cc | 151 +- src/parallel_ops/fused_parallel_op.cc | 2 +- .../kernels/allreduce_kernels.cpp | 52 +- src/parallel_ops/kernels/allreduce_kernels.cu | 48 +- src/parallel_ops/kernels/combine_kernels.cpp | 4 +- src/parallel_ops/kernels/combine_kernels.cu | 4 +- .../kernels/parallel_identity_kernels.cpp | 97 + .../kernels/parallel_identity_kernels.cu | 96 + .../kernels/partition_kernels.cpp | 4 +- src/parallel_ops/kernels/partition_kernels.cu | 4 +- .../kernels/reduction_kernels.cpp | 2 +- src/parallel_ops/kernels/reduction_kernels.cu | 2 +- .../kernels/replicate_kernels.cpp | 2 +- src/parallel_ops/kernels/replicate_kernels.cu | 2 +- src/parallel_ops/parallel_identity.cc | 474 ++ src/parallel_ops/partition.cc | 10 +- src/parallel_ops/reduction.cc | 17 +- src/parallel_ops/replicate.cc | 91 +- src/runtime/batch_config.cc | 65 +- src/runtime/beam_search_batch_config.cc | 4 + src/runtime/cuda_helper.cu | 200 +- src/runtime/ffconst_utils.cc | 5 + src/runtime/fftype.cc | 25 + src/runtime/file_loader.cc | 92 +- src/runtime/graph.cc | 88 +- src/runtime/hip_helper.cpp | 274 +- src/runtime/inference_manager.cc | 204 +- src/runtime/model.cc | 840 ++- src/runtime/model.cpp | 4 +- 
src/runtime/model.cu | 45 +- src/runtime/operator.cc | 36 +- src/runtime/operator_params.cc | 3 + src/runtime/request_manager.cc | 768 ++- src/runtime/request_manager.cpp | 45 +- src/runtime/request_manager.cu | 123 +- src/runtime/simulator.cpp | 22 +- src/runtime/simulator.cu | 26 +- src/runtime/substitution.cc | 36 +- src/runtime/tree_verify_batch_config.cc | 4 + tests/.gitignore | 1 - tests/align/test_all_operators.sh | 2 +- tests/cpp_gpu_tests.sh | 4 +- tests/inference/cpp_inference_tests.sh | 64 +- tests/inference/huggingface_inference.py | 14 +- tests/inference/python_inference_tests.sh | 35 +- .../python_test_configs/generate_configs.py | 5 +- tests/peft/alignment/align_test_utils.py | 510 ++ .../alignment/llama_alignment_tests.ipynb | 2651 +++++++++ .../peft/alignment/opt_alignment_tests.ipynb | 450 ++ tests/peft/hf_finetune.py | 129 + tests/peft/hf_serve.py | 140 + tests/peft/hf_train.py | 161 + tests/peft/hf_utils.py | 352 ++ tests/peft/peft_alignment_test.py | 730 +++ tests/peft_test.sh | 66 + 285 files changed, 35212 insertions(+), 6650 deletions(-) create mode 100644 include/flexflow/ops/kernels/lora_linear_kernels.h create mode 100644 include/flexflow/ops/lora_linear.h create mode 100644 include/flexflow/ops/lora_linear_params.h create mode 100644 include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h create mode 100644 include/flexflow/parallel_ops/parallel_identity.h create mode 100644 include/flexflow/parallel_ops/parallel_identity_params.h create mode 100644 include/flexflow/utils/peft_weight_allocator.h delete mode 100644 inference/MODEL_WEIGHTS.md create mode 100644 inference/README.md create mode 100644 inference/peft/CMakeLists.txt create mode 100644 inference/peft/Makefile create mode 100644 inference/peft/peft.cc create mode 100644 inference/peft/peft_bwd_benchmark.cc create mode 100644 inference/peft/peft_fwd_benchmark.cc create mode 100644 inference/peft/req_rate_benchmark.cc create mode 100644 inference/python/ff_peft.py create mode 100644 inference/python/peft_demo/INSTRUCTIONS.md create mode 100644 inference/python/peft_demo/demo.ipynb create mode 100644 inference/python/peft_demo/demo.py create mode 100644 inference/utils/download_peft_model.py create mode 100644 inference/utils/upload_peft_model.py create mode 100644 rdelacou/generate_trace.py create mode 100644 src/ops/kernels/lora_linear_kernels.cpp create mode 100644 src/ops/kernels/lora_linear_kernels.cu create mode 100644 src/ops/lora_linear.cc create mode 100644 src/ops/lora_linear_params.cc create mode 100644 src/parallel_ops/kernels/parallel_identity_kernels.cpp create mode 100644 src/parallel_ops/kernels/parallel_identity_kernels.cu create mode 100644 src/parallel_ops/parallel_identity.cc delete mode 100644 tests/.gitignore create mode 100644 tests/peft/alignment/align_test_utils.py create mode 100644 tests/peft/alignment/llama_alignment_tests.ipynb create mode 100644 tests/peft/alignment/opt_alignment_tests.ipynb create mode 100644 tests/peft/hf_finetune.py create mode 100644 tests/peft/hf_serve.py create mode 100644 tests/peft/hf_train.py create mode 100644 tests/peft/hf_utils.py create mode 100644 tests/peft/peft_alignment_test.py create mode 100755 tests/peft_test.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d05856f1a9..ef5961bc87 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -52,13 +52,14 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: 
Jimver/cuda-toolkit@v0.2.16 if: ${{ matrix.gpu_backend == 'cuda' }} id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" + log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt' - name: Install system dependencies run: .github/workflows/helpers/install_dependencies.sh @@ -156,11 +157,12 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" use-github-cache: "false" + log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt' - name: Install system dependencies run: .github/workflows/helpers/install_dependencies.sh @@ -169,7 +171,7 @@ jobs: uses: conda-incubator/setup-miniconda@v2 with: activate-environment: flexflow - environment-file: conda/environment.yml + environment-file: conda/flexflow.yml auto-activate-base: false - name: Build FlexFlow diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index c7d0cd72cb..00ca2df603 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -181,6 +181,16 @@ jobs: ../config/config.linux make -j + - name: Run PEFT tests + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export CUDNN_DIR=/usr/local/cuda + export CUDA_DIR=/usr/local/cuda + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + + source ./build/set_python_envs.sh + ./tests/peft_test.sh + - name: Run inference tests env: CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh index 7c11a4a420..73b8e88418 100755 --- a/.github/workflows/helpers/install_cudnn.sh +++ b/.github/workflows/helpers/install_cudnn.sh @@ -5,8 +5,11 @@ set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" +ubuntu_version=$(lsb_release -rs) +ubuntu_version=${ubuntu_version//./} + # Install CUDNN -cuda_version=${1:-11.8.0} +cuda_version=${1:-12.1.1} cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') echo "Installing CUDNN for CUDA version: ${cuda_version} ..." 
CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz @@ -44,8 +47,11 @@ elif [[ "$cuda_version" == "11.7" ]]; then elif [[ "$cuda_version" == "11.8" ]]; then CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "12.0" ]]; then - echo "CUDNN support for CUDA version 12.0 not yet added" +elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then + CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb + CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb +else + echo "CUDNN support for CUDA version above 12.5 not yet added" exit 1 fi wget -c -q $CUDNN_LINK @@ -55,6 +61,17 @@ if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME" +elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then + wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt update -y + rm -f cuda-keyring_1.1-1_all.deb + sudo dpkg -i $CUDNN_TARBALL_NAME + sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/ + sudo apt update -y + sudo apt install -y libcudnn8 + sudo apt install -y libcudnn8-dev + sudo apt install -y libcudnn8-samples else sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local fi diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh index ca88668d84..ae6793ea2a 100755 --- a/.github/workflows/helpers/install_nccl.sh +++ b/.github/workflows/helpers/install_nccl.sh @@ -8,13 +8,13 @@ cd "${BASH_SOURCE[0]%/*}" # Add NCCL key ring ubuntu_version=$(lsb_release -rs) ubuntu_version=${ubuntu_version//./} -wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb" -sudo dpkg -i cuda-keyring_1.0-1_all.deb +wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" +sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt update -y -rm -f cuda-keyring_1.0-1_all.deb +rm -f cuda-keyring_1.1-1_all.deb # Install NCCL -cuda_version=${1:-11.8.0} +cuda_version=${1:-12.1.1} cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') echo "Installing NCCL for CUDA version: ${cuda_version} ..." 
diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index 226f953b38..2fc527bf08 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -38,7 +38,7 @@ jobs: # 10h timeout, instead of default of 360min (6h) timeout-minutes: 600 container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -87,7 +87,7 @@ jobs: runs-on: self-hosted needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m # 10h timeout, instead of default of 360min (6h) timeout-minutes: 600 @@ -138,7 +138,7 @@ jobs: runs-on: self-hosted needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index 3562134987..d5acbfc2e1 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -44,10 +44,10 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml index 267daaee6b..633fb00eb8 100644 --- a/.github/workflows/prebuild-legion.yml +++ b/.github/workflows/prebuild-legion.yml @@ -23,13 +23,13 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] - gpu_backend_version: ["11.8", "5.6"] + gpu_backend_version: ["12.0", "5.6"] python_version: ["3.11"] exclude: - gpu_backend: "cuda" gpu_backend_version: "5.6" - gpu_backend: "hip_rocm" - gpu_backend_version: "11.8" + gpu_backend_version: "12.0" fail-fast: false steps: - name: Checkout Git Repository diff --git a/.gitignore b/.gitignore index 7f6a3c4137..cc34c1a7b6 100644 --- a/.gitignore +++ b/.gitignore @@ -187,4 +187,9 @@ gpt_tokenizer python/flexflow/version.txt inference_tensors +hf_peft_tensors +lora_training_logs + +Untitled-1.ipynb +Untitled-2.ipynb tests/inference/python_test_configs/*.json diff --git a/CMakeLists.txt b/CMakeLists.txt index c82a53644e..f06969ae04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -567,6 +567,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/peft) endif() diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 67ef6b3419..091ba929e4 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -25,3 +25,10 @@ dependencies: - sentencepiece - einops - requests + - scipy + - bitsandbytes + - datasets + - accelerate + - loralib + - triton + - peft diff --git a/config/config.inc b/config/config.inc index 7d7b2db9cf..6431eaf136 100644 --- a/config/config.inc +++ b/config/config.inc @@ -197,7 +197,7 @@ fi # set ROCM path if [ -n "$ROCM_PATH" ]; then - SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}" + SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH} 
-DHIP_ROOT_DIR=${ROCM_PATH}" fi ADD_ROCM_TO_PATH="" diff --git a/docker/build.sh b/docker/build.sh index 8ecacbc6d4..b68860712f 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -56,15 +56,14 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version_input=${cuda_version}.3 elif [[ "$cuda_version" == @(11.8) ]]; then cuda_version_input=${cuda_version}.0 + elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) + cuda_version=12.2 + cuda_version_input=${cuda_version}.2 else echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) - if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.2 - cuda_version_input=${cuda_version}.2 - fi echo "Building $image docker image with CUDA $cuda_version" ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04" gpu_backend_version="-${cuda_version}" diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index cef619ad68..3434916d6b 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -94,6 +94,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1 RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook +# PEFT-related +RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/docker/run.sh b/docker/run.sh index 666c8e1121..cf105a10c8 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -58,7 +58,7 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the fi fi # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 009d1c250a..873fed0bdb 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "legion.h" #include #include @@ -36,6 +37,18 @@ using BeamSearchBatchConfigFuture = Legion::Future; using TreeVerifyBatchConfigFuture = Legion::Future; using BeamInferenceResultFuture = Legion::Future; +struct OptimizerTasks { + bool compute_gradients = true; + bool reset_gradients_to_zero = false; + bool update_weights = false; + bool save_updated_weights = false; +}; + +void set_optimizer_tasks(OptimizerTasks &tasks, + int max_training_steps, + int completed_training_steps, + int gradient_accumulation_steps); + class BatchConfig { public: using RequestGuid = size_t; @@ -43,6 +56,8 @@ class BatchConfig { BatchConfig(); int num_active_requests() const; int num_active_tokens() const; + int num_active_infr_tokens() const; + int num_active_peft_tokens() const; 
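
A note on the OptimizerTasks struct and the set_optimizer_tasks declaration added just above: they let each PEFT finetuning request in a batch carry per-step optimizer work (compute gradients, zero gradients, update weights, save updated weights). The real implementation lives in src/runtime/batch_config.cc and is not shown in this excerpt; the sketch below is only an illustration of how such a helper could map step counters onto those flags under gradient accumulation. The function name set_optimizer_tasks_sketch and its logic are assumptions, not the library's actual behavior; only the field names and the parameter list are taken from the declaration above.

#include <cassert>

struct OptimizerTasks {
  bool compute_gradients = true;
  bool reset_gradients_to_zero = false;
  bool update_weights = false;
  bool save_updated_weights = false;
};

// Illustrative sketch only: gradients are zeroed at the start of each
// accumulation window, weights are updated at the end of it, and the final
// training step also saves the updated weights.
void set_optimizer_tasks_sketch(OptimizerTasks &tasks,
                                int max_training_steps,
                                int completed_training_steps,
                                int gradient_accumulation_steps) {
  assert(gradient_accumulation_steps > 0);
  assert(completed_training_steps >= 0 &&
         completed_training_steps < max_training_steps);
  tasks.compute_gradients = true;
  tasks.reset_gradients_to_zero =
      (completed_training_steps % gradient_accumulation_steps) == 0;
  tasks.update_weights =
      ((completed_training_steps + 1) % gradient_accumulation_steps) == 0;
  tasks.save_updated_weights =
      (completed_training_steps == max_training_steps - 1);
}
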
static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_verify_tokens_per_batch(); @@ -56,26 +71,43 @@ class BatchConfig { // Maximum possible values for different parameters // These maximum values are used for copying BatchConfig // across workers - static int const MAX_NUM_REQUESTS = 64; + static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; // Set by update - int num_tokens; + + int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0; // number of tokens in prompt phase, start offset of tokens in inc_decoding // phase. num_tokens - num_prompt_tokens = num_generation_tokens; - int num_generation_tokens; + int num_generation_tokens = 0; struct PerRequestInfo { + PerRequestInfo() { + first_token_depth_in_request = 0; + first_token_offset_in_batch = 0; + num_tokens_in_batch = 0; + max_sequence_length = 0; + request_guid = 0; + prompt_phase = false; + batch_config_request_id = -1; + peft_model_id = PEFTModelID::NO_ID; + peft_bwd = false; + optimizer_tasks = {true, false, false, false}; + } int first_token_depth_in_request; int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; // request id in batch config: - int batch_config_request_id; + int batch_config_request_id = -1; bool prompt_phase = false; RequestGuid request_guid; + // PEFT fields + PEFTModelID peft_model_id; + bool peft_bwd; + OptimizerTasks optimizer_tasks; }; struct PerTokenInfo { int abs_depth_in_request; @@ -102,6 +134,7 @@ class BatchConfig { BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; + PerTokenInfo labelsInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; bool request_running[MAX_NUM_REQUESTS]; @@ -129,6 +162,7 @@ class TreeVerifyBatchConfig : public BatchConfig { struct InferenceResult { static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; + float finetuning_loss; }; class BeamSearchBatchConfig : public BatchConfig { diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2c11ae1131..dd9d657117 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -65,6 +65,25 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS; #endif class FFConfig; +class MemoryAllocator; +class PEFTWeightAllocator; + +struct CombinedBatchConfigMetaStruct { + BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS]; + BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS]; + BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS]; + bool request_completed[BatchConfig::MAX_NUM_REQUESTS]; + + BeamSearchBatchConfig::BeamSearchPerTokenInfo + beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS + + BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM * + BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + BeamSearchBatchConfig::BeamSearchPerRequestInfo + beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + + TreeVerifyBatchConfig::CommittedTokensInfo + committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS]; +}; struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -76,18 +95,18 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; - void *batch_config_metadata; + CombinedBatchConfigMetaStruct *batch_config_metadata; // request info + token info + topolopgy mask info - size_t batch_config_metadata_size = - sizeof(BatchConfig::tokensInfo) + 
sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens) + - sizeof(BatchConfig::request_completed); + size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct); void *offload_reserve_space; size_t offload_reserve_space_size; + // PEFT related fields + MemoryAllocator *peft_activation_allocator; + size_t peft_activation_reserve_space_size; + PEFTWeightAllocator *peft_weight_allocator; + size_t peft_weight_reserve_space_size; + // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; #ifdef FF_USE_NCCL @@ -98,6 +117,8 @@ struct FFHandler { struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -155,6 +176,10 @@ class FFConfig { bool cpu_offload; size_t offload_reserve_space_size; DataType quantization_type; + // PEFT related fields + bool enable_peft; + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e624..24b722c36f 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -46,6 +46,12 @@ enum LossType { LOSS_IDENTITY = 54, }; +enum OptimizerType { + OPTIMIZER_TYPE_NONE = 60, + OPTIMIZER_TYPE_SGD = 61, + OPTIMIZER_TYPE_ADAM = 62, +}; + enum CompMode { COMP_MODE_TRAINING = 70, COMP_MODE_INFERENCE = 71, @@ -72,6 +78,11 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; +enum RequestType { + REQ_INFERENCE = 4001, + REQ_FINETUNING = 4002, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { @@ -172,6 +183,8 @@ enum OperatorType { OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, + // PEFT Ops + OP_LORA, // Parallel Ops OP_REPARTITION, OP_COMBINE, @@ -179,6 +192,7 @@ enum OperatorType { OP_REDUCTION, OP_PIPELINE, OP_ALLREDUCE, + OP_PARALLEL_IDENTITY, OP_FUSED_PARALLEL, OP_INVALID, }; @@ -193,36 +207,37 @@ enum ModelType { }; enum PMParameter { - PM_OP_TYPE, // AnyOp - PM_NUM_INPUTS, // AnyOp - PM_NUM_OUTPUTS, // AnyOp - PM_GROUP, // Conv2D - PM_KERNEL_H, // Conv2D, Pool2D - PM_KERNEL_W, // Conv2D, Pool2D - PM_STRIDE_H, // Conv2D, Pool2D - PM_STRIDE_W, // Conv2D, Pool2D - PM_PADDING_H, // Conv2D, Pool2D - PM_PADDING_W, // Conv2D, Pool2D - PM_ACTI, // Conv2D, Pool2D - PM_NUMDIM, // Concat, Transpose - PM_AXIS, // Concat, Split - PM_PERM, // Transpose - PM_OUTSHUFFLE, // Transpose - PM_MERGE_GCONV_COUNT, // MergeGConv - PM_AXES, // Squeeze, Unsqueeze, Reduce* - PM_KEEP_DIMS, // Reduce* - PM_EPSILON, // BatchNorm - PM_REPARTITION_DIM, // Repartition - PM_REPARTITION_DEGREE, // Repartition - PM_REPLICATE_DIM, // Replicate - PM_REPLICATE_DEGREE, // Replicate - PM_COMBINE_DIM, // Combine - PM_COMBINE_DEGREE, // Combine - PM_REDUCTION_DIM, // Reduction - PM_REDUCTION_DEGREE, // Reduction - PM_ALLREDUCE_DIM, // AllReduce - PM_SOFTMAX_DIM, // Softmax - PM_NUM_HEADS, // MultiHeadAttention + PM_OP_TYPE, // AnyOp + PM_NUM_INPUTS, // AnyOp + PM_NUM_OUTPUTS, // AnyOp + PM_GROUP, // Conv2D + PM_KERNEL_H, // Conv2D, Pool2D + PM_KERNEL_W, // Conv2D, Pool2D + 
PM_STRIDE_H, // Conv2D, Pool2D + PM_STRIDE_W, // Conv2D, Pool2D + PM_PADDING_H, // Conv2D, Pool2D + PM_PADDING_W, // Conv2D, Pool2D + PM_ACTI, // Conv2D, Pool2D + PM_NUMDIM, // Concat, Transpose + PM_AXIS, // Concat, Split + PM_PERM, // Transpose + PM_OUTSHUFFLE, // Transpose + PM_MERGE_GCONV_COUNT, // MergeGConv + PM_AXES, // Squeeze, Unsqueeze, Reduce* + PM_KEEP_DIMS, // Reduce* + PM_EPSILON, // BatchNorm + PM_REPARTITION_DIM, // Repartition + PM_REPARTITION_DEGREE, // Repartition + PM_REPLICATE_DIM, // Replicate + PM_REPLICATE_DEGREE, // Replicate + PM_COMBINE_DIM, // Combine + PM_COMBINE_DEGREE, // Combine + PM_REDUCTION_DIM, // Reduction + PM_REDUCTION_DEGREE, // Reduction + PM_ALLREDUCE_DIM, // AllReduce + PM_PARALLEL_IDENTITY_DIM, // AllReduce + PM_SOFTMAX_DIM, // Softmax + PM_NUM_HEADS, // MultiHeadAttention PM_INVALID, PM_PARALLEL_DIM, PM_PARALLEL_DEGREE, @@ -268,5 +283,7 @@ enum { TENSOR_GUID_LAST_VALID = 3999999, PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000, NODE_GUID_FIRST_VALID = 5000000, + PEFT_MODEL_ID_FIRST_VALID = 6000000, + PEFT_MODEL_ID_LAST_VALID = 6999999 }; #endif // _FLEXFLOW_CONST_H_ diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 1cd90fda26..3e482b8d67 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -3,6 +3,8 @@ #include "flexflow/ffconst.h" #include +#include +#include namespace FlexFlow { @@ -18,6 +20,29 @@ class LayerID { size_t id, transformer_layer_id, model_id; }; +class PEFTModelID { +public: + static const PEFTModelID NO_ID; + PEFTModelID(); + PEFTModelID(size_t id); + bool is_valid_id() const; + friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend std::ostream &operator<<(std::ostream &os, + PEFTModelID const &peft_model_id); + +public: + size_t id; +}; + }; // namespace FlexFlow +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::PEFTModelID const &n) const { + return n.id; + } +}; +} // namespace std + #endif // _FF_TYPE_H diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 0b74b7fce4..52b4b3d362 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -55,6 +55,11 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_optimizer_config_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_sgd_optimizer_config_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_adam_optimizer_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); // ----------------------------------------------------------------------- // FFConfig @@ -270,6 +275,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( @@ -281,6 +287,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t @@ -565,6 +572,7 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name); flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, @@ -590,6 +598,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); +flexflow_peft_model_id_t 
flexflow_model_add_lora_layer( + flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); @@ -613,11 +624,16 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); void flexflow_model_generate(flexflow_model_t handle_, int num_requests, - char const **input_text, - int max_num_chars, - char **output_text, - int max_seq_length, - int **output_length_and_tokens); + enum RequestType *request_types, + char const **input_texts, + char **output_texts, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, + int **output_length_and_tokens, + int *num_finetuning_losses, + float *finetuning_losses); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); @@ -978,6 +994,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num( void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -1036,6 +1055,113 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_); +// // ----------------------------------------------------------------------- +// // LoraSGDOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_sgd_optimizer_config_t +// flexflow_lora_sgd_optimizer_config_create( +// double lr, double momentum, bool nesterov, bool weight_decay); + +// void flexflow_lora_sgd_optimizer_config_destroy( +// flexflow_lora_sgd_optimizer_config_t handle_); + +// // ----------------------------------------------------------------------- +// // LoraAdamOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_adam_optimizer_config_t +// flexflow_lora_adam_optimizer_config_create(double alpha, +// double beta1, +// double beta2, +// double weight_decay, +// double epsilon); + +// void flexflow_lora_adam_optimizer_config_destroy( +// flexflow_lora_adam_optimizer_config_t handle_); + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_, + bool trainable, + bool init_lora_weights, + char const *base_model_name_or_path, + char const *precision, + int rank, + float lora_alpha, + float lora_dropout, + int num_target_modules, + char const **target_modules_, + enum OptimizerType optimizer_type, + float sgd_learning_rate, + float sgd_momentum, + bool sgd_nesterov, + float sgd_weight_decay, + float adam_alpha, + float adam_beta1, + float adam_beta2, + float adam_weight_decay, + float adam_epsilon); + +void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_cache_folder( + flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_peft_model_id( + flexflow_lora_linear_config_t handle_); + +int 
flexflow_lora_linear_config_get_rank(flexflow_lora_linear_config_t handle_); + +float flexflow_lora_linear_config_get_lora_alpha( + flexflow_lora_linear_config_t handle_); + +float flexflow_lora_linear_config_get_lora_dropout( + flexflow_lora_linear_config_t handle_); + +bool flexflow_lora_linear_config_get_trainable( + flexflow_lora_linear_config_t handle_); + +bool flexflow_lora_linear_config_get_init_lora_weights( + flexflow_lora_linear_config_t handle_); + +char const **flexflow_lora_linear_config_get_target_modules( + flexflow_lora_linear_config_t handle_, int *num_target_modules); + +char const *flexflow_lora_linear_config_get_base_model_name_or_path( + flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_precision( + flexflow_lora_linear_config_t handle_); + +void flexflow_lora_linear_config_set_lora_alpha( + flexflow_lora_linear_config_t handle_, float value); + +void flexflow_lora_linear_config_set_lora_dropout( + flexflow_lora_linear_config_t handle_, float value); + +void flexflow_lora_linear_config_set_trainable( + flexflow_lora_linear_config_t handle_, bool value); + +void flexflow_lora_linear_config_set_init_lora_weights( + flexflow_lora_linear_config_t handle_, bool value); + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create(); + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); + +flexflow_peft_model_id_t flexflow_peft_model_id_no_id(); + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index f24a797ffd..ba4101c173 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -40,6 +40,7 @@ struct GenerationResult { std::string output_text; std::vector input_tokens; std::vector output_tokens; + std::vector finetuning_losses; }; #include diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 69a57e4e1c..c3dbcac422 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -49,7 +49,7 @@ class Layer { Tensor outputs[MAX_NUM_OUTPUTS]; Tensor inputs[MAX_NUM_INPUTS]; Tensor weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + // bool trainable_inputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; bool inference_debugging; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 6dda67bbfe..4ad735ef7d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -108,19 +108,31 @@ enum TaskIDs { LAYERNORM_FWD_TASK_ID, LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, + LAYERNORM_PEFT_BWD_TASK_ID, RESIDUAL_LAYERNORM_INIT_TASK_ID, RESIDUAL_LAYERNORM_INF_TASK_ID, + RESIDUAL_LAYERNORM_BWD_TASK_ID, + RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, SIGMOID_SILU_MULTI_INF_TASK_ID, + SIGMOID_SILU_MULTI_BWD_TASK_ID, + SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, + LINEAR_PEFT_BWD_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, + LORA_LINEAR_INIT_TASK_ID, + LORA_LINEAR_REG_TASK_ID, + LORA_LINEAR_INF_TASK_ID, + 
LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, FLAT_FWD_TASK_ID, FLAT_BWD_TASK_ID, @@ -128,6 +140,7 @@ enum TaskIDs { SOFTMAX_FWD_TASK_ID, SOFTMAX_BWD_TASK_ID, SOFTMAX_INF_TASK_ID, + SOFTMAX_PEFT_BWD_TASK_ID, CONCAT_INIT_TASK_ID, CONCAT_FWD_TASK_ID, CONCAT_BWD_TASK_ID, @@ -163,20 +176,26 @@ enum TaskIDs { RMSNORM_INIT_TASK_ID, RMSNORM_FWD_TASK_ID, RMSNORM_INF_TASK_ID, + RMSNORM_BWD_TASK_ID, + RMSNORM_PEFT_BWD_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, + RESIDUAL_RMSNORM_BWD_TASK_ID, + RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, + FUSEDOP_PEFT_BWD_TASK_ID, FUSEDOP_FWD_TASK_ID, FUSEDOP_BWD_TASK_ID, FUSEDOP_INF_TASK_ID, @@ -224,10 +243,13 @@ enum TaskIDs { REPARTITION_BWD_TASK_ID, COMBINE_INIT_TASK_ID, COMBINE_FWD_TASK_ID, + COMBINE_INF_TASK_ID, COMBINE_BWD_TASK_ID, + COMBINE_PEFT_BWD_TASK_ID, REPLICATE_INIT_TASK_ID, REPLICATE_FWD_TASK_ID, REPLICATE_BWD_TASK_ID, + REPLICATE_PEFT_BWD_TASK_ID, REDUCTION_INIT_TASK_ID, REDUCTION_FWD_TASK_ID, REDUCTION_BWD_TASK_ID, @@ -235,9 +257,15 @@ enum TaskIDs { PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, ALLREDUCE_INIT_TASK_ID, - ALLREDUCE_INF_TASK_ID, ALLREDUCE_FWD_TASK_ID, ALLREDUCE_BWD_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_PEFT_BWD_TASK_ID, + PARALLEL_IDENTITY_INIT_TASK_ID, + PARALLEL_IDENTITY_FWD_TASK_ID, + PARALLEL_IDENTITY_BWD_TASK_ID, + PARALLEL_IDENTITY_INF_TASK_ID, + PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, @@ -327,6 +355,7 @@ class ResidualLayerNorm; class AddBiasResidualLayerNorm; class SigmoidSiluMulti; class Linear; +class LoraLinear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; @@ -349,9 +378,12 @@ class Repartition; class Reduction; class Replicate; class AllReduce; +class ParallelIdentity; class FusedParallelOp; class ParallelOpInfo; +struct Request; + // TODO: Move to an appropriate place /* This is used to create a type that recursively replaces value type @@ -561,6 +593,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer @@ -571,6 +604,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer @@ -599,6 +633,7 @@ class FFModel { Tensor *outputs, float eps, int dim, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a beam search top k layer @@ -808,10 +843,13 @@ class FFModel { bool position_bias = false, char const *name = NULL); // ======================================== + // PEFT Layers + // ======================================== + PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + // ======================================== // Inference APIs // ======================================== - std::vector generate(std::vector 
&prompts, - int max_seq_length); + std::vector generate(std::vector const &requests); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -1103,6 +1141,9 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool need_to_add_combine(int layer_idx) const; + bool need_to_add_allreduce(int layer_idx) const; + bool need_to_add_parallel_identity(int layer_idx) const; bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, @@ -1117,7 +1158,7 @@ class FFModel { void clear_graph_search_cache(); public: - size_t op_global_guid, layer_global_guid; + size_t op_global_guid, layer_global_guid, peft_model_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; // positional embedding start offset @@ -1137,6 +1178,12 @@ class FFModel { std::vector layers; std::vector operators; std::vector parameters; + // PEFT related + std::unordered_map base_layer_to_peft_layer; + std::unordered_map> peft_layer_to_peft_id; + std::unordered_map peft_configs; + // std::vector peft_operators; + FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; // Cached operators: key: operator hash, value: operator pointer @@ -1195,6 +1242,10 @@ class FFModel { SigmoidSiluMulti *>, std::unordered_map, Linear *>, + std::unordered_map< + std::pair, + LoraLinearParams>, + LoraLinear *>, std::unordered_map, Pool2D *>, std::unordered_map, std::unordered_map, AllReduce *>, + std::unordered_map, + ParallelIdentity *>, std::unordered_map, FusedParallelOp *>> cached_ops; diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index 60785a1e29..d31c12b16c 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -9,7 +9,7 @@ class Op; class OpMeta { public: - OpMeta(FFHandler _handle); + // OpMeta(FFHandler _handle); OpMeta(FFHandler _handle, Op const *op); public: @@ -17,9 +17,11 @@ class OpMeta { bool profiling; // Measure the run time of the task bool inference_debugging; int decoding_step; + int bwd_step; char op_name[MAX_OPNAME]; LayerID layer_guid; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; DataType input_type[MAX_NUM_INPUTS]; DataType weight_type[MAX_NUM_WEIGHTS]; DataType output_type[MAX_NUM_OUTPUTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 311699d926..1a5af67b36 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -7,7 +7,9 @@ #include "flexflow/machine_view.h" #include "flexflow/parallel_tensor.h" #include "flexflow/utils/dot/record_formatter.h" +#include #include +namespace fs = std::filesystem; #include #include @@ -29,6 +31,11 @@ enum class MappingRecordType { INPUT_OUTPUT, INPUT_WEIGHT }; enum class MappingOperation { PARTITION, REPLICATE }; +fs::path get_dst_folder(std::string const &subdir, + int step_idx = 0, + int shard_idx = 0, + bool before_kernel = false); + /** @brief A class to keep track of a dimension relation between two tensors * used by an operator. 
* @@ -236,11 +243,18 @@ class Op { Legion::FutureMap empty_map; return empty_map; }; + virtual Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) { + assert(false); + } virtual void print_layer(FFModel const &model) = 0; template static std::string get_op_name_without_uid(OpMetaType *m) { std::string op_name_without_uid = std::string(m->op_name); - size_t last_underscore = op_name_without_uid.length() - 1; + size_t last_underscore = op_name_without_uid.length(); for (int i = op_name_without_uid.length() - 1; i > 0; i--) { if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { break; @@ -248,7 +262,9 @@ class Op { last_underscore = i; } } - op_name_without_uid.erase(last_underscore); + if (last_underscore < op_name_without_uid.length()) { + op_name_without_uid.erase(last_underscore); + } return op_name_without_uid; } template @@ -259,31 +275,42 @@ class Op { std::vector input_tensors, std::vector weight_tensors, std::vector output_tensors, + bool fwd_pass = true, bool before_kernel = false) { - // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; - struct stat st = {0}; - if (stat(folder_path, &st) == -1) { - // Directory does not exist, create it - mkdir(folder_path, 0700); - } - // output base filepath, shared by all tensors from the same operator + // get operator name and print it std::string op_name_without_uid = get_op_name_without_uid(m); - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - op_name_without_uid + "_shard-id_" + std::to_string(shard_id); - if (before_kernel) { - base_filepath += "_pre"; + std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid + << std::endl; + // build the path to save the tensor + fs::path dst_filepath; + if (fwd_pass) { + dst_filepath = + get_dst_folder("fwd", m->decoding_step, shard_id, before_kernel); + } else { + dst_filepath = + get_dst_folder("bwd", m->bwd_step, shard_id, before_kernel); + } + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(dst_filepath.string() + ".batch_config"); } + // save all inputs for (int i = 0; i < input_tensors.size(); i++) { - std::string filename = base_filepath + "_input_" + std::to_string(i); + std::string filename = dst_filepath.string() + ".input_"; + if (fwd_pass) { + filename += std::to_string(i); + } else { + filename += "gradient_" + std::to_string(i); + } if (input_tensors[i].data_type == DT_FLOAT) { save_tensor(input_tensors[i].get_float_ptr(), input_tensors[i].domain.get_volume(), @@ -304,10 +331,17 @@ class Op { assert(false && "Tensor data type not supported"); } } - // only dump the weights once - if (m->decoding_step == 0) { + + // only dump the weights in the forward pass, at the first step + // note that we do not save the weight gradients, since we only support + // finetuning LoRA weights, which are not FF tensors. 
+ if (fwd_pass && m->decoding_step == 0) { + fs::path dst_filepath_weights = + get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) / + layername; for (int i = 0; i < weight_tensors.size(); i++) { - std::string filename = base_filepath + "_weight_" + std::to_string(i); + std::string filename = + dst_filepath_weights.string() + ".weight_" + std::to_string(i); if (weight_tensors[i].data_type == DT_FLOAT) { save_tensor(weight_tensors[i].get_float_ptr(), weight_tensors[i].domain.get_volume(), @@ -329,9 +363,15 @@ class Op { } } } + // save all outputs for (int i = 0; i < output_tensors.size(); i++) { - std::string filename = base_filepath + "_output_" + std::to_string(i); + std::string filename = dst_filepath.string() + ".output_"; + if (fwd_pass) { + filename += std::to_string(i); + } else { + filename += "gradient_" + std::to_string(i); + } if (output_tensors[i].data_type == DT_FLOAT) { save_tensor(output_tensors[i].get_float_ptr(), output_tensors[i].domain.get_volume(), @@ -354,7 +394,11 @@ class Op { } // increase count of decoding steps if (!before_kernel) { - m->decoding_step++; + if (fwd_pass) { + m->decoding_step++; + } else { + m->bwd_step++; + } } } virtual bool measure_operator_cost(Simulator *sim, @@ -448,7 +492,8 @@ class Op { ParallelTensor outputs[MAX_NUM_OUTPUTS]; ParallelTensor inputs[MAX_NUM_INPUTS]; ParallelParameter weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; OpMeta *meta[MAX_NUM_WORKERS]; std::map inference_meta; int numInputs, numWeights, numOutputs; diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5b187839ef..673f78ad46 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -23,6 +23,7 @@ #include "flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" +#include "flexflow/ops/lora_linear_params.h" #include "flexflow/ops/pool_2d_params.h" #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" @@ -40,6 +41,7 @@ #include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" +#include "flexflow/parallel_ops/parallel_identity_params.h" #include "flexflow/parallel_ops/partition_params.h" #include "flexflow/parallel_ops/reduction_params.h" #include "flexflow/parallel_ops/replicate_params.h" @@ -67,6 +69,7 @@ using OperatorParameters = mp::variant; tl::optional get_op_parameters(Op const *op); diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index bb470376c3..9510ac0f28 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -24,8 +24,10 @@ class AddBiasResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -38,6 +40,11 @@ class AddBiasResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -61,6 +68,14 @@ class AddBiasResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -76,21 +91,55 @@ class AddBiasResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + template + static void backward_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + ffStream_t stream); + static void + backward_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + template + static void peft_bwd_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + static void + peft_bwd_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -105,8 +154,12 @@ class AddBiasResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 87fe2fb562..840f521b01 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git 
a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 3ba4f414d1..283e9a4290 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_MAX_BATCH_SIZE 64 #define AGGREGATE_MAX_N 128 +class Aggregate; + class AggregateMeta : public OpMeta { public: - AggregateMeta(FFHandler handle, int n); + AggregateMeta(FFHandler handle, Aggregate const *aggr); ~AggregateMeta(void); float **dev_exp_preds; float **dev_exp_grads; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 4302dd0733..a9f651b620 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_SPEC_MAX_BATCH_SIZE 32 #define AGGREGATE_SPEC_MAX_N 12 +class AggregateSpec; + class AggregateSpecMeta : public OpMeta { public: - AggregateSpecMeta(FFHandler handle, int n); + AggregateSpecMeta(FFHandler handle, AggregateSpec const *agg); ~AggregateSpecMeta(void); float **dev_region_ptrs; }; diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h index 298059e3ed..eca9943d20 100644 --- a/include/flexflow/ops/argmax.h +++ b/include/flexflow/ops/argmax.h @@ -17,6 +17,7 @@ class ArgMaxMeta : public OpMeta { size_t temp_storage_bytes = 0; int *d_offsets; void *d_out; + float *d_loss; Realm::RegionInstance reserveInst; ArgMaxMeta(FFHandler handler, Op const *op, @@ -89,18 +90,22 @@ class ArgMax : public Op { CostMetrics &cost_metrics) const override; template static void forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent_ptr, int length, int batch_size, + float *loss, ffStream_t stream); static void forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size); + int batch_size, + float *loss); Params get_params() const; public: diff --git a/include/flexflow/ops/cache.h b/include/flexflow/ops/cache.h index 1fbb1fa059..4f0b94ee5c 100644 --- a/include/flexflow/ops/cache.h +++ b/include/flexflow/ops/cache.h @@ -5,9 +5,11 @@ namespace FlexFlow { +class Cache; + class CacheMeta : public OpMeta { public: - CacheMeta(FFHandler handle); + CacheMeta(FFHandler handle, Cache const *c); float cache_score; }; diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index ddef59549c..043b5d19a7 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -12,9 +12,11 @@ namespace FlexFlow { +class ElementUnary; + class ElementUnaryMeta : public OpMeta { public: - ElementUnaryMeta(FFHandler handle); + ElementUnaryMeta(FFHandler handle, ElementUnary const *unary); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnActivationDescriptor_t actiDesc; diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ed89fcf37a..c90e1773e0 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -60,6 +60,11 @@ class Embedding : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; // void update(const FFModel&); void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index d68957d890..1ed4678a5b 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -6,20 +6,11 @@ namespace FlexFlow { +class Experts; + class ExpertsMeta : public OpMeta { public: - ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation); + ExpertsMeta(FFHandler handler, Experts const *e); ~ExpertsMeta(void); // Thrust helper arrays @@ -138,7 +129,7 @@ class Experts : public Op { float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim); diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index a8326e9ab4..02ab1db7b5 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -49,6 +49,11 @@ class FusedOp : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -60,6 +65,10 @@ class FusedOp : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index ec6cdfb9ab..73025216cd 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Group_by; + class GroupByMeta : public OpMeta { public: - GroupByMeta(FFHandler handle, int n, float _alpha); + GroupByMeta(FFHandler handle, Group_by const *gb); ~GroupByMeta(void); float alpha; float **dev_region_ptrs; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 43dc527bc8..f77df7c456 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -96,6 +96,11 @@ class IncMultiHeadSelfAttention : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -109,17 +114,27 @@ class IncMultiHeadSelfAttention : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; - - static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int 
shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, GenericTensorAccessorR const &bias); + static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias); Params get_params() const; public: @@ -204,6 +219,10 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // typedef hipFloatComplex attFloatComplex; hipFloatComplex *complex_input; #endif + // PEFT specific fields + void *softmax_activation_buffer; + void *query_activation_buffer; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/batch_matmul_kernels.h b/include/flexflow/ops/kernels/batch_matmul_kernels.h index 4de774ee06..c3923c4d4b 100644 --- a/include/flexflow/ops/kernels/batch_matmul_kernels.h +++ b/include/flexflow/ops/kernels/batch_matmul_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class BatchMatmul; + class BatchMatmulMeta : public OpMeta { public: - BatchMatmulMeta(FFHandler handler); + BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm); int a_seq_length_dim, b_seq_length_dim; }; diff --git a/include/flexflow/ops/kernels/cast_kernels.h b/include/flexflow/ops/kernels/cast_kernels.h index 3001d913ca..d601601ea2 100644 --- a/include/flexflow/ops/kernels/cast_kernels.h +++ b/include/flexflow/ops/kernels/cast_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Cast; + class CastMeta : public OpMeta { public: - CastMeta(FFHandler handle); + CastMeta(FFHandler handle, Cast const *cast); DataType input_data_type, output_data_type; }; diff --git a/include/flexflow/ops/kernels/concat_kernels.h b/include/flexflow/ops/kernels/concat_kernels.h index 4da6aaf5e2..4562ae871a 100644 --- a/include/flexflow/ops/kernels/concat_kernels.h +++ b/include/flexflow/ops/kernels/concat_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Concat; + class ConcatMeta : public OpMeta { public: - ConcatMeta(FFHandler handle) : OpMeta(handle){}; + ConcatMeta(FFHandler handle, Concat const *cc); int legion_axis; }; diff --git a/include/flexflow/ops/kernels/conv_2d_kernels.h b/include/flexflow/ops/kernels/conv_2d_kernels.h index 7b2a0fe135..f83e4687d7 100644 --- a/include/flexflow/ops/kernels/conv_2d_kernels.h +++ b/include/flexflow/ops/kernels/conv_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Conv2D; + class Conv2DMeta : public OpMeta { public: - Conv2DMeta(FFHandler handler); + Conv2DMeta(FFHandler handler, Conv2D const *conv); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, biasTensor, outputTensor; cudnnFilterDescriptor_t filterDesc; diff --git a/include/flexflow/ops/kernels/flat_kernels.h b/include/flexflow/ops/kernels/flat_kernels.h index caf817512d..6aa5a13b42 100644 --- a/include/flexflow/ops/kernels/flat_kernels.h +++ b/include/flexflow/ops/kernels/flat_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Flat; + class FlatMeta : public OpMeta { public: - FlatMeta(FFHandler handle) : OpMeta(handle){}; + FlatMeta(FFHandler handle, Flat const *flat); }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index d1e0e050b2..3d122d4bc5 100644 --- 
a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -385,6 +385,25 @@ inline __device__ void zero(T &dst) { dst = tmp.raw; } +template +__device__ __forceinline__ T WARP_SHFL(unsigned mask, T var, int srcLane, int width=warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width=warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + + template inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { // use float32 to get better accuracy @@ -401,7 +420,7 @@ inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { float qk = sum(qk_vec); #pragma unroll for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { - qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + qk += WARP_SHFL_XOR(uint32_t(-1), qk, mask); } return qk; } @@ -423,7 +442,7 @@ inline __device__ float block_sum(float *red_smem, float sum) { // Compute the sum per warp. #pragma unroll for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += WARP_SHFL_XOR(uint32_t(-1), sum, mask); } // Warp leaders store the data to shared memory. @@ -442,11 +461,11 @@ inline __device__ float block_sum(float *red_smem, float sum) { // Parallel reduction inside the warp. #pragma unroll for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += WARP_SHFL_XOR(uint32_t(-1), sum, mask); } // Broadcast to other threads. 
- return __shfl_sync(uint32_t(-1), sum, 0); + return WARP_SHFL(uint32_t(-1), sum, 0); } template diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index a5fdc7c602..90e50a0c9a 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -35,6 +35,9 @@ class LinearMeta : public OpMeta { float kernel_reg_lambda; bool use_bias, add_bias_only_once; Realm::RegionInstance reserveInst; + // PEFT related fields + void *output_activation_buffer; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -48,6 +51,23 @@ void forward_kernel_wrapper(LinearMeta const *m, int in_dim, int out_dim, int batch_size); +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *filter_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size); +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens); void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -73,6 +93,16 @@ void forward_kernel(LinearMeta const *m, int batch_size, ffStream_t stream); template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream); +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -85,6 +115,7 @@ void backward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream); + template __global__ void build_one_ptr(DT *one_ptr, int batch_size); } // namespace Internal diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h new file mode 100644 index 0000000000..5360b5f8ea --- /dev/null +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -0,0 +1,77 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H + +#include "flexflow/accessor.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/lora_linear.h" + +namespace FlexFlow { + +struct LoraLinearWeight { + // weights + void *w0_ptr, *w1_ptr; + // gradients + void *w0_grad_ptr, *w1_grad_ptr; + // v values for SGD optimizer (when using momentum) + void *w0_v_values_ptr, *w1_v_values_ptr; + int in_dim, out_dim, rank, num_shards; +}; + +struct LoraLinearModelState { + LoraLinearWeight weights; + LoraOptimizerConfig const *optimizer_config; + float lora_alpha; + std::string cache_folder; + // Huggingface model ID (for download and/or upload) + std::string peft_model_id; +}; + +class LoraLinearMeta : public OpMeta { +public: + LoraLinearMeta(FFHandler handle, LoraLinear const *li); + ~LoraLinearMeta(void); + // PEFT related fields + void *low_rank_activation; + void *input_activation; + std::unordered_map model_state; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; +}; + +namespace Kernels { +namespace LoraLinear { +void init_kernel_wrapper(LoraLinearMeta *m, int seed); +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +namespace Internal { +template +void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream); +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream); +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow +#endif // _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H diff --git a/include/flexflow/ops/kernels/pool_2d_kernels.h b/include/flexflow/ops/kernels/pool_2d_kernels.h index 7f73a8295d..c5a954763e 100644 --- a/include/flexflow/ops/kernels/pool_2d_kernels.h +++ b/include/flexflow/ops/kernels/pool_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Pool2D; + class Pool2DMeta : public OpMeta { public: - Pool2DMeta(FFHandler handle); + Pool2DMeta(FFHandler handle, Pool2D const *pool); ffTensorDescriptor_t inputTensor, outputTensor; ffActivationDescriptor_t actiDesc; ffPoolingDescriptor_t poolDesc; diff --git a/include/flexflow/ops/kernels/reshape_kernels.h b/include/flexflow/ops/kernels/reshape_kernels.h index e6c8c4d569..5b6fa5be19 100644 --- a/include/flexflow/ops/kernels/reshape_kernels.h +++ b/include/flexflow/ops/kernels/reshape_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Reshape; + class ReshapeMeta : public OpMeta { public: - ReshapeMeta(FFHandler handler); + ReshapeMeta(FFHandler handler, Reshape const *reshape); DataType data_type; }; @@ -44,4 +46,4 @@ void backward_kernel(T *input_grad_ptr, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 0eef4ca72b..fd4e0ecf1d 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RESIDUAL_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -31,13 +32,14 @@ class ResidualRMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - + bool inplace_residual; int in_dim; int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -48,6 +50,28 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output); +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + 
GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 35c5aa69fa..475b6d94ed 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -31,13 +32,13 @@ class RMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - int in_dim; int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -46,6 +47,22 @@ void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output); +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight); } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 8cfaf3c586..0b7f1090f6 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -23,20 +23,30 @@ class SoftmaxMeta : public OpMeta { bool profiling; bool inference_debugging; int dim; - DataType input_type, output_type; }; namespace Kernels { namespace Softmax { -template + void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr); -template + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements); + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad); + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); namespace Internal { template @@ -46,10 +56,28 @@ void 
forward_kernel(SoftmaxMeta const *m, ffStream_t stream); template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, ffStream_t stream); + +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + ffStream_t stream); + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + ffStream_t stream); + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/include/flexflow/ops/kernels/transpose_kernels.h b/include/flexflow/ops/kernels/transpose_kernels.h index 7ff6163b30..a2c8ff0483 100644 --- a/include/flexflow/ops/kernels/transpose_kernels.h +++ b/include/flexflow/ops/kernels/transpose_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Transpose; + class TransposeMeta : public OpMeta { public: - TransposeMeta(FFHandler handler) : OpMeta(handler){}; + TransposeMeta(FFHandler handler, Transpose const *transpose); int num_dim; int perm[MAX_TENSOR_DIM]; }; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 9e48d81190..b5e9538ea6 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -37,6 +37,11 @@ class LayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -67,6 +72,10 @@ class LayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -81,11 +90,6 @@ class LayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void forward_kernel_wrapper(LayerNormMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - GenericTensorAccessorR const &gamma, - GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -96,13 +100,34 @@ class LayerNorm : public Op { T *beta_grad_ptr, ffStream_t stream); template + static void peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + + static void forward_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); static void backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + static void inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW 
&output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); + static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; @@ -124,6 +149,9 @@ class LayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index a32df80537..ed2fad580f 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -52,6 +52,11 @@ class Linear : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * @@ -66,6 +71,10 @@ class Linear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h new file mode 100644 index 0000000000..9e83c3f90e --- /dev/null +++ b/include/flexflow/ops/lora_linear.h @@ -0,0 +1,99 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_FIRST_H +#define _FLEXFLOW_LORA_LINEAR_FIRST_H + +#include "flexflow/inference.h" +#include "flexflow/node.h" +#include "flexflow/operator.h" +#include "flexflow/ops/lora_linear_params.h" +#include "flexflow/utils/memory_allocator.h" + +namespace FlexFlow { + +class FFModel; +class Layer; + +class LoraLinear : public Op { +public: + using Params = LoraLinearParams; + using Input = std::pair; + + LoraLinear( + FFModel &model, + LayerID const &layer_guid, + OperatorType type, + ParallelTensor const input, + ParallelTensor const output, + std::unordered_map const &_peft_configs, + char const *name = nullptr); + LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, + ParallelTensor const output); + LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name = nullptr); + + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override; + void map_output_tensors(FFModel &model) override; + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + 
Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + // size_t get_params_hash() const override; + LoraLinearParams get_params() const; + + std::unordered_map peft_configs; +}; + +}; // namespace FlexFlow + +#endif // _FLEXLOW_LORA_LINEAR_FIRST_H diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h new file mode 100644 index 0000000000..70539271f2 --- /dev/null +++ b/include/flexflow/ops/lora_linear_params.h @@ -0,0 +1,150 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_PARAMS_H +#define _FLEXFLOW_LORA_LINEAR_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/inference.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_tensor.h" +#include +#include +#include +#include + +namespace FlexFlow { + +class LoraOptimizerConfig { +public: + LoraOptimizerConfig(); + virtual ~LoraOptimizerConfig() {} +}; + +class LoraSGDOptimizerConfig : public LoraOptimizerConfig { +public: + LoraSGDOptimizerConfig(); + LoraSGDOptimizerConfig(double lr_, + double momentum_ = 0.0f, + bool nesterov_ = false, + bool weight_decay_ = 0.0f); + friend std::ostream &operator<<(std::ostream &os, + LoraSGDOptimizerConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE( + LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay) + +public: + double lr = 0.001f; + double momentum = 0.0f; + bool nesterov = false; + double weight_decay = 0.0f; +}; + +class LoraAdamOptimizerConfig : public LoraOptimizerConfig { +public: + LoraAdamOptimizerConfig(); + LoraAdamOptimizerConfig(double alpha_, + double beta1_ = 0.9f, + double beta2_ = 0.999f, + double weight_decay_ = 0.0f, + double epsilon_ = 1e-8); + friend std::ostream &operator<<(std::ostream &os, + LoraAdamOptimizerConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE( + LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon) + +public: + // Adam + double alpha = 0.001f; + double beta1 = 0.9f; + double beta2 = 0.999f; + double weight_decay = 0.0f; + double epsilon = 1e-8; +}; + +// Serialization helpers +template +void serialize_to_json_file(T const &obj, fs::path const &filepath); + +// Function to deserialize JSON from file and create object +template +std::unique_ptr deserialize_from_json_file(fs::path const &filepath); + +class LoraLinearConfig { +public: + static const LoraLinearConfig EmptyConfig; + LoraLinearConfig(std::string const &cache_folder_, + std::string const &peft_model_id_, + bool trainable_ = false, + LoraOptimizerConfig *optimizer_config_ = nullptr, + bool init_lora_weights_ = false, + std::string const &base_model_name_or_path_ = "", + std::string const 
&precision_ = "fp16", + int rank_ = 8, + float lora_alpha_ = 8.0f, + float lora_dropout_ = 0.0f, + std::vector const &target_modules_ = {}); + // constructor used to support std::unordered_map + LoraLinearConfig(); + friend bool operator==(LoraLinearConfig const &lhs, + LoraLinearConfig const &rhs); + friend std::ostream &operator<<(std::ostream &os, + LoraLinearConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig, + cache_folder, + peft_model_id, + rank, + lora_alpha, + lora_dropout, + target_modules, + trainable, + init_lora_weights, + base_model_name_or_path, + precision) + + std::string cache_folder; + // Huggingface model ID (for download and/or upload) + std::string peft_model_id; + // Lora parameters + int rank; + float lora_alpha; + float lora_dropout; + std::vector target_modules; + // Training parameters + // whether the weights are trainable (fine-tuning scenario) or not + // (inference-only). If set to true, allocate space for the gradients + bool trainable = false; + LoraOptimizerConfig *optimizer_config; + // whether to initialize weights randomly (instead of attempting to load them + // from file) + bool init_lora_weights; + // parameters only used to upload model after finetuning + std::string base_model_name_or_path; + std::string precision; +}; + +class LoraLinearParams { +public: + LayerID layer_guid; + OperatorType type; + std::unordered_map peft_configs; + char name[MAX_OPNAME]; + + bool is_valid(std::pair const + &input_shape) const; + friend bool operator==(LoraLinearParams const &lhs, + LoraLinearParams const &rhs); +}; + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::LoraLinearParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 0e9be82125..33a8e8be51 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -26,8 +26,10 @@ class ResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool inplace_residual, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -40,6 +42,11 @@ class ResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -65,6 +72,14 @@ class ResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -78,7 +93,8 @@ class ResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(ResidualLayerNormMeta const *m, + static void inference_kernel_wrapper(ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, 
GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -86,11 +102,30 @@ class ResidualLayerNorm : public Op { GenericTensorAccessorW &output, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + static void + backward_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + + static void + peft_bwd_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -105,8 +140,12 @@ class ResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 949ae0c799..166d4b2b4e 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid(std::tuple const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -74,6 +81,14 @@ class ResidualRMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -82,6 +97,7 @@ class ResidualRMSNorm : public Op { float eps; int effective_batch_size; int dim, data_dim; + bool inplace_residual; }; } // namespace FlexFlow #endif // _FLEXFLOW_RESIDUAL_RMS_NORM_H diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index a4e4de59ab..8b8f666dc1 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 
1dc940ebd3..384404d8a0 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -34,6 +34,11 @@ class RMSNorm : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, @@ -73,6 +78,14 @@ class RMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 604438260a..ac60ff15dd 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -1,5 +1,6 @@ #pragma once +#include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/utils/memory_allocator.h" @@ -27,6 +28,11 @@ class SigmoidSiluMulti : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -55,6 +61,14 @@ class SigmoidSiluMulti : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -65,10 +79,24 @@ class SigmoidSiluMulti : public Op { T const *input2_ptr, T *output_ptr, ffStream_t stream); - static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, + static void inference_kernel_wrapper(SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output); + static void + backward_kernel_wrapper(SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); + static void + peft_bwd_kernel_wrapper(SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); }; class SigmoidSiluMultiMeta : public OpMeta { @@ -80,6 +108,9 @@ class SigmoidSiluMultiMeta : public OpMeta { public: Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow 
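Several of the OpMeta subclasses touched above (RMSNormMeta, ResidualRMSNormMeta, LayerNormMeta, ResidualLayerNormMeta, SigmoidSiluMultiMeta) gain `input_activation` and `allocated_peft_buffer_size` fields, and their inference kernel wrappers now take a mutable meta plus a `BatchConfig`. A plausible reading is that the inference pass caches the activations that the new `peft_bwd` pass will later need. The sketch below illustrates that caching pattern under that assumption only: the standalone struct, the `cache_peft_input` helper, and the `num_peft_tokens`/`in_dim` parameters are illustrative stand-ins, not FlexFlow APIs, and raw CUDA calls are used in place of FlexFlow's own memory allocator.

```cpp
// Minimal sketch: cache an op's input activations for a PEFT (finetuning)
// request during the inference pass so a later peft_bwd pass can reuse them.
// The struct mirrors the two fields this patch adds to several OpMeta classes.
#include <cuda_runtime.h>
#include <cstddef>

struct PeftActivationCache {
  void *input_activation = nullptr;      // mirrors the new OpMeta field
  size_t allocated_peft_buffer_size = 0; // mirrors the new OpMeta field
};

void cache_peft_input(PeftActivationCache &m,
                      float const *input_ptr,
                      int num_peft_tokens, // hypothetical: tokens of the PEFT
                      int in_dim,          // request taken from the BatchConfig
                      cudaStream_t stream) {
  size_t needed = sizeof(float) * size_t(num_peft_tokens) * size_t(in_dim);
  if (needed > m.allocated_peft_buffer_size) {
    // Grow the cache; a real implementation would presumably go through
    // FlexFlow's MemoryAllocator rather than calling cudaMalloc directly.
    if (m.input_activation != nullptr) {
      cudaFree(m.input_activation);
    }
    cudaMalloc(&m.input_activation, needed);
    m.allocated_peft_buffer_size = needed;
  }
  // Copy the activations aside so the forward output buffers can be reused.
  cudaMemcpyAsync(m.input_activation, input_ptr, needed,
                  cudaMemcpyDeviceToDevice, stream);
}
```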
diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 61094f7361..82aff53766 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -33,6 +33,11 @@ class Softmax : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; void print_layer(FFModel const &model) override { @@ -58,6 +63,10 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 47144bf6d7..4b67692032 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class TopK; + class TopKMeta : public OpMeta { public: - TopKMeta(FFHandler handle); + TopKMeta(FFHandler handle, TopK const *topk); bool sorted; }; diff --git a/include/flexflow/ops/transpose.h b/include/flexflow/ops/transpose.h index 3e6fb575c0..bca0b83460 100644 --- a/include/flexflow/ops/transpose.h +++ b/include/flexflow/ops/transpose.h @@ -6,6 +6,8 @@ namespace FlexFlow { +class TransposeMeta; + class Transpose : public Op { public: using Params = TransposeParams; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 02df0c0137..168ad5f618 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -144,7 +144,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { ~TreeIncMultiHeadSelfAttentionMeta(void); public: - int num_active_tokens; + int num_active_infr_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; bool *request_completed; diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 045f9b36a0..7e0e4362e2 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -34,12 +34,17 @@ class AllReduce : public ParallelOp { std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; @@ -47,10 +52,6 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); static void 
forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -59,6 +60,14 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/parallel_ops/combine.h b/include/flexflow/parallel_ops/combine.h index 2e4fdb86a9..1db776f59d 100644 --- a/include/flexflow/parallel_ops/combine.h +++ b/include/flexflow/parallel_ops/combine.h @@ -40,6 +40,11 @@ class Combine : public ParallelOp { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( @@ -52,10 +57,18 @@ class Combine : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); template static void forward_task_with_type(Legion::Task const *task, diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h index bdf7aae501..a4ccbee8a5 100644 --- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -17,11 +17,6 @@ class AllReduceMeta : public OpMeta { namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - void forward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); @@ -30,6 +25,15 @@ void backward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); } // namespace AllReduce } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/kernels/combine_kernels.h b/include/flexflow/parallel_ops/kernels/combine_kernels.h index 456013cd81..4b2227b178 100644 --- a/include/flexflow/parallel_ops/kernels/combine_kernels.h +++ b/include/flexflow/parallel_ops/kernels/combine_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Combine; + class CombineMeta : public OpMeta { public: - CombineMeta(FFHandler handle); + CombineMeta(FFHandler handle, Combine const 
*comb); DataType data_type; }; diff --git a/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h b/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h new file mode 100644 index 0000000000..fd6778a37f --- /dev/null +++ b/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H + +#include "flexflow/batch_config.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/parallel_identity.h" + +namespace FlexFlow { + +class ParallelIdentityMeta : public OpMeta { +public: + ParallelIdentityMeta(FFHandler handle, ParallelIdentity const *reduct); +}; + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H diff --git a/include/flexflow/parallel_ops/kernels/partition_kernels.h b/include/flexflow/parallel_ops/kernels/partition_kernels.h index 81b190603a..1e77090d11 100644 --- a/include/flexflow/parallel_ops/kernels/partition_kernels.h +++ b/include/flexflow/parallel_ops/kernels/partition_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Repartition; + class RepartitionMeta : public OpMeta { public: - RepartitionMeta(FFHandler handle); + RepartitionMeta(FFHandler handle, Repartition const *repart); DataType data_type; }; diff --git a/include/flexflow/parallel_ops/parallel_identity.h b/include/flexflow/parallel_ops/parallel_identity.h new file mode 100644 index 0000000000..b3ca789f08 --- /dev/null +++ b/include/flexflow/parallel_ops/parallel_identity.h @@ -0,0 +1,83 @@ +#ifndef _FLEXFLOW_PARALLEL_IDENTITY_H +#define _FLEXFLOW_PARALLEL_IDENTITY_H + +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_ops/parallel_identity_params.h" +#include "parallel_op.h" + +namespace FlexFlow { + +class ParallelIdentity : public ParallelOp { +public: + using Params = ParallelIdentityParams; + using Input = ParallelTensor; + + ParallelIdentity(FFModel &model, + const ParallelTensor input, + int parallel_identity_legion_dim, + char const *name = NULL); + ParallelIdentity(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) override; + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + 
Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + bool get_int_parameter(PMParameter, int *) const override; + bool append_parallel_op_info( + std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + + Params get_params() const; + +public: + int parallel_identity_dim; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_PARALLEL_IDENTITY_H diff --git a/include/flexflow/parallel_ops/parallel_identity_params.h b/include/flexflow/parallel_ops/parallel_identity_params.h new file mode 100644 index 0000000000..6eeed662ec --- /dev/null +++ b/include/flexflow/parallel_ops/parallel_identity_params.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H +#define _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H + +namespace FlexFlow { + +struct ParallelIdentityParams { + int parallel_identity_legion_dim; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(ParallelIdentityParams const &, ParallelIdentityParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::ParallelIdentityParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H diff --git a/include/flexflow/parallel_ops/parallel_op.h b/include/flexflow/parallel_ops/parallel_op.h index 0bf573996c..39324c2a51 100644 --- a/include/flexflow/parallel_ops/parallel_op.h +++ b/include/flexflow/parallel_ops/parallel_op.h @@ -41,7 +41,7 @@ class ParallelOp : public Op { public: Legion::LogicalPartition input_lp, output_grad_lp; std::unordered_map - inference_input_lps; + inference_input_lps, inference_output_grad_lps; }; }; // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 65d69d8564..c27616634f 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -54,10 +54,19 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_kernel_wrapper(ReplicateMeta const *m, 
GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index a38a3b2671..f0fab957ee 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -39,6 +39,7 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); + void peft_bwd(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(FFModel *model, BatchConfigFuture const &bc, ParallelTensor const input, @@ -65,15 +66,34 @@ struct Request { FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; - int max_sequence_length; + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + int max_sequence_length = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; Status status = PENDING; std::vector tokens; - + std::string prompt; std::vector beam_trees; + // PEFT field + RequestType req_type = REQ_INFERENCE; + size_t processed_finetuning_tokens = 0; + int completed_training_steps = 0; + int dataset_entry_processed_tokens = 0; + int max_training_steps = 1; + // how many gradient accumulation steps to do before updating the weights. if + // left as -1, it will be set to the number of entries in the dataset + int gradient_accumulation_steps = -1; + int benchmarking_tokens = -1; + std::vector finetuning_tokens_per_batch; + bool warmup = false; + std::string dataset_filepath; + std::vector, + std::vector>> + dataset; + std::vector finetuning_losses; + friend std::ostream &operator<<(std::ostream &os, Request const &req); }; // store the result of beam search @@ -120,6 +140,8 @@ class RequestManager { void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); + void set_enable_peft_finetuning(bool enable_peft_finetuning_); + static void set_inference_finished(bool finished = true); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, @@ -143,10 +165,9 @@ class RequestManager { void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); + RequestGuid register_new_request(Request const &request_); + RequestGuid register_new_peft_request(Request const &request_); + // Methods to start and terminate request manager's background task void start_background_server(FFModel *model); bool is_background_server_terminated(); @@ -156,6 +177,8 @@ class RequestManager { bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); // Methods for preparing next batches + bool check_inf_req_completion(BatchConfig const &old_bc, int i); + void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -265,6 +288,10 @@ class RequestManager { int max_sequence_length; Status request_manager_status; + // peft benchmarking + bool enable_peft_finetuning = false; + static bool inference_finished; + // tree width in each 
speculative step, if not specified 1 std::vector spec_infer_tree_width; @@ -275,7 +302,8 @@ class RequestManager { int bos_token_id; int eos_token_id; std::string output_filepath; - std::queue pending_request_queue; + std::queue pending_infr_request_queue; + std::queue pending_peft_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; @@ -304,6 +332,8 @@ class RequestManager { int llm_decoding_steps; int ssm_decoding_steps; double start_time, finish_time; + double registration_time, first_token_time; + bool first_token_time_set = false; }; std::unordered_map profiling_requests; double total_request_run_time; diff --git a/include/flexflow/simulator.h b/include/flexflow/simulator.h index e410f66325..6cda96aa8b 100644 --- a/include/flexflow/simulator.h +++ b/include/flexflow/simulator.h @@ -33,21 +33,21 @@ namespace FlexFlow { #define MOD(a, b) ((a) % (b)) < 0 ? ((a) % (b)) + (b) : ((a) % (b)) -class Conv2DMeta; -class LinearMeta; -class Pool2DMeta; -class ElementUnaryMeta; -class ElementBinaryMeta; -class LayerNormMeta; -// class EmbeddingMeta; -// class SoftmaxMeta; -class BatchMatmulMeta; -// class BatchNormMeta; -class ConcatMeta; -// class DropoutMeta; -class TransposeMeta; -class Op; -class FFModel; +// class Conv2DMeta; +// class LinearMeta; +// class Pool2DMeta; +// class ElementUnaryMeta; +// class ElementBinaryMeta; +// class LayerNormMeta; +// class EmbeddingMeta; +// class SoftmaxMeta; +// class BatchMatmulMeta; +// class BatchNormMeta; +// class ConcatMeta; +// class DropoutMeta; +// class TransposeMeta; +// class Op; +// class FFModel; /** * @brief Costs of an operator. @@ -751,19 +751,19 @@ class Simulator { strict_hash_to_operator_cost; public: - Conv2DMeta *conv2d_meta; - LinearMeta *linear_meta; - Pool2DMeta *pool2d_meta; - ElementUnaryMeta *ele_unary_meta; - LayerNormMeta *layernorm_meta; - // ElementBinaryMeta *ele_binary_meta; - // EmbeddingMeta *embedding_meta; - // SoftmaxMeta *softmax_meta; - BatchMatmulMeta *batch_matmul_meta; - // BatchNormMeta *batch_norm_meta; - ConcatMeta *concat_meta; - // DropoutMeta *dropout_meta; - TransposeMeta *transpose_meta; + // Conv2DMeta *conv2d_meta; + // LinearMeta *linear_meta; + // Pool2DMeta *pool2d_meta; + // ElementUnaryMeta *ele_unary_meta; + // LayerNormMeta *layernorm_meta; + // ElementBinaryMeta *ele_binary_meta; + // EmbeddingMeta *embedding_meta; + // SoftmaxMeta *softmax_meta; + // BatchMatmulMeta *batch_matmul_meta; + // BatchNormMeta *batch_norm_meta; + // ConcatMeta *concat_meta; + // DropoutMeta *dropout_meta; + // TransposeMeta *transpose_meta; int segment_size; int max_num_segments; // simulation could be slow if the number of segments // are too large diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e1..486a65eb3d 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -75,8 +75,8 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? 
BLOCK_SIZE_LIMIT : ret; } -__global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); +template +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); @@ -156,10 +156,13 @@ template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); + +template +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain, @@ -179,3 +182,5 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 5d3c831d4f..805cc46b4c 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -75,8 +75,8 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? BLOCK_SIZE_LIMIT : ret; } -__global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); +template +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); @@ -86,6 +86,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_discrete(DT *dst, + const DT *src, + Legion::coord_t size, + size_t *index); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); @@ -135,16 +141,28 @@ __host__ void updateGAS(float *para_ptr, float learning_rate); template -void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id = 0); +template +void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); + +template +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, @@ -153,7 +171,8 @@ miopenStatus_t miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, - Legion::Domain domain); + Legion::Domain domain, + DataType data_type = DT_FLOAT); hipblasDatatype_t ff_to_cuda_datatype(DataType type); @@ -164,3 +183,5 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); void handle_unimplemented_hip_kernel(OperatorType op_type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/include/flexflow/utils/memory_allocator.h 
b/include/flexflow/utils/memory_allocator.h index 7091b159b2..fad7630770 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -54,6 +54,11 @@ class MemoryAllocator { return static_cast
(ptr); } + inline void free_all() { + reserved_allocated_size = 0; + instance_allocated_size = 0; + } + public: Legion::Memory memory; void *reserved_ptr; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h new file mode 100644 index 0000000000..dae46a8af1 --- /dev/null +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -0,0 +1,92 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ +#define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ + +#include "flexflow/config.h" +#include + +namespace FlexFlow { + +class PEFTWeightAllocator { +public: + PEFTWeightAllocator(void *_base_ptr, size_t _total_size) + : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), + local_offset(_total_size) {} + + inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + void *ptr = static_cast(base_ptr) + sync_offset; + off_t model_sync_weights_offset = sync_offset; + size_t model_sync_weights_size = datalen; + if (sync_weights.find(peft_model_id) != sync_weights.end()) { + // Assert that sync weights for each PEFT model is consecutive + std::pair offset_and_size = sync_weights[peft_model_id]; + assert(sync_offset == offset_and_size.first + offset_and_size.second); + model_sync_weights_offset = offset_and_size.first; + model_sync_weights_size = offset_and_size.second + datalen; + } + sync_offset += datalen; + assert(sync_offset < local_offset); + sync_weights[peft_model_id] = + std::make_pair(model_sync_weights_offset, model_sync_weights_size); + return ptr; + } + + std::pair + get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { + const std::lock_guard lock(peft_weight_allocator_mutex); + assert(sync_weights.find(peft_model_id) != sync_weights.end()); + std::pair offset_and_size = sync_weights[peft_model_id]; + return std::make_pair(static_cast(base_ptr) + offset_and_size.first, + offset_and_size.second); + } + + inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + local_offset -= datalen; + assert(sync_offset < local_offset); + void *ptr = static_cast(base_ptr) + local_offset; + return ptr; + } + + template + inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast
( + allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + + template + inline DT *allocate_local_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast
( + allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + +public: + void *base_ptr; + size_t total_size; + off_t sync_offset, local_offset; + std::unordered_map> sync_weights; + std::mutex peft_weight_allocator_mutex; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md deleted file mode 100644 index d78fb37be9..0000000000 --- a/inference/MODEL_WEIGHTS.md +++ /dev/null @@ -1,28 +0,0 @@ -To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. - -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - -for name, params in model.named_parameters(): - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) - params.detach().cpu().numpy().tofile('weights/llama_7B_weights/' + name) -``` - diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000000..14c94e22ac --- /dev/null +++ b/inference/README.md @@ -0,0 +1,42 @@ +# Inference Examples +This folder contains the code to run inference examples in FlexFlow + +To create a sample prompt, call (from the `build` folder): + +```bash +mkdir -p ../inference/prompt +echo '["San Francisco is a "]' > ../inference/prompt/test.json +``` + +To download a model for use in C++, call: +```bash +huggingface-cli login # if needed +python ../inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf --half-precision-only +``` + +To run the incremental decoding example in C++, call: + +```bash +./inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run the speculative inference example in C++, call: + +```bash +./inference/spec_infer/spec_infer -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run a PEFT model example in C++, call: + +```bash +./inference/peft/peft \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ../inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora \ + -enable-peft \ + --use-full-precision \ + --inference-debugging +``` \ No newline at end of file diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index ec3dda3158..c9ffff5c07 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -264,15 +264,18 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, 
/*ignore_comments */ true); - std::vector prompts; + + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; - prompts.push_back(text); } - std::vector result = - model.generate(prompts, 128 /*max_sequence_length*/); + std::vector result = model.generate(requests); } // terminate the request manager by stopping the background thread diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index a529411ddb..195d6ba7e3 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -76,7 +76,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -89,8 +89,9 @@ void FALCON::create_falcon_model(FFModel &ff, true, falcon_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; @@ -116,7 +117,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -141,7 +142,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -166,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -187,7 +188,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -203,7 +204,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -216,6 +217,7 @@ void FALCON::create_falcon_model(FFModel &ff, true, falcon_config.layer_norm_epsilon, true, + false, DT_NONE, "ln_f"); Tensor ln_f = res_ln_outputs[1]; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 517f534438..cf26194597 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -58,7 +58,7 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; @@ -75,7 +75,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -84,8 +84,9 @@ void LLAMA::create_llama_model(FFModel &ff, token_att_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -94,10 +95,11 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + mha = ff.spec_inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -111,16 +113,17 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( + mha = ff.inc_multiquery_self_attention_verify( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -134,16 +137,17 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( + mha = ff.inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -157,7 +161,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -175,54 +179,56 @@ void LLAMA::create_llama_model(FFModel &ff, token_ff_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." 
+ std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); + w2 = ff.dense( + multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("down_proj"), std::string("layers." + + // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -231,6 +237,7 @@ void LLAMA::create_llama_model(FFModel &ff, final_rms_norm_output, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, "norm"); @@ -244,7 +251,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -261,7 +268,8 @@ void LLAMA::create_llama_model(FFModel &ff, output = ff.sampling(softmax, generation_config.topp); } else { // output = ff.arg_top_k(dense, /*k=*/1, false); - output = ff.argmax(dense, /*beam_Search*/ false); + Tensor softmax = ff.softmax(dense, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } } @@ -269,7 +277,7 @@ void LLAMA::create_llama_model(FFModel &ff, "", weight_file_path, llama_config.num_attention_heads, - llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree, diff --git a/inference/models/llama.h b/inference/models/llama.h index ba1f0236f9..edb78f1300 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -36,6 +36,11 @@ class LLAMA { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; num_attention_heads = model_config["num_attention_heads"]; + if (model_config.find("num_key_value_heads") != model_config.end()) { + num_key_value_heads = model_config["num_key_value_heads"]; + } else { + num_key_value_heads = num_attention_heads; + } hidden_size = model_config["hidden_size"]; rms_norm_eps = model_config["rms_norm_eps"]; intermediate_size = model_config["intermediate_size"]; @@ -61,6 +66,8 @@ 
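
The llama.h hunk above makes grouped-query attention optional by falling back to num_attention_heads whenever the HuggingFace config has no num_key_value_heads entry. A minimal, self-contained sketch of that fallback plus the per-head sizing that the multiquery attention calls above rely on (the struct and function names here are illustrative, not FlexFlow API):

// Sketch only -- mirrors the config fallback in the hunk above.
#include <nlohmann/json.hpp>
#include <cassert>

struct AttentionShape {
  int num_q_heads;  // query heads
  int num_kv_heads; // key/value heads; equals num_q_heads when GQA is absent
  int head_dim;     // hidden_size / num_q_heads, passed as both qk and v dim above
};

AttentionShape read_attention_shape(nlohmann::json const &model_config) {
  AttentionShape s;
  s.num_q_heads = model_config["num_attention_heads"];
  // Older LLaMA checkpoints predate GQA, so treat every head as a KV head.
  if (model_config.find("num_key_value_heads") != model_config.end()) {
    s.num_kv_heads = model_config["num_key_value_heads"];
  } else {
    s.num_kv_heads = s.num_q_heads;
  }
  int hidden_size = model_config["hidden_size"];
  assert(hidden_size % s.num_q_heads == 0);
  s.head_dim = hidden_size / s.num_q_heads;
  return s;
}
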
class LLAMA { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tnum_attention_heads: " << num_attention_heads << std::endl; + std::cout << "\tnum_key_value_heads: " << num_key_value_heads + << std::endl; std::cout << "\thidden_size: " << hidden_size << std::endl; std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; @@ -73,8 +80,8 @@ class LLAMA { // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; - int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, - intermediate_size; + int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads, + hidden_size, intermediate_size; float rms_norm_eps; }; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 70e2b5e9c5..e4a7e0056d 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -58,7 +58,7 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -74,7 +74,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -86,8 +86,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } @@ -113,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -137,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -161,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -181,8 +182,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -198,7 +200,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -211,7 +213,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -224,8 +226,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 5677d5658e..b3f2ef4e17 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -94,8 +94,9 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." + std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; @@ -121,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -145,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -169,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -186,9 +187,10 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -205,7 +207,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -216,7 +218,10 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." + std::to_string(i) + ".fc2").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("fc2"), std::string("layers." 
+ + // std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -229,6 +234,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, "final_layer_norm"); Tensor all_final_norm = res_ln_outputs[1]; @@ -243,7 +249,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -252,7 +258,8 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); - output = ff.argmax(lm_head, /*beam_Search*/ false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } FileDataLoader *fileloader = new FileDataLoader( diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 8b0dc1098c..cd8bf3a9a7 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -66,7 +66,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,7 +76,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -96,8 +96,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; @@ -124,7 +125,7 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -144,8 +145,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -161,7 +163,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -176,7 +178,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -188,8 +190,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt new file mode 100644 index 0000000000..e0bad79cab --- /dev/null +++ b/inference/peft/CMakeLists.txt @@ -0,0 +1,139 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_Peft) + +# Normal PEFT +set(project_target1 peft) +set(CPU_SRC1 + ${FLEXFLOW_CPP_DRV_SRC} + peft.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target1} DESTINATION ${BIN_DEST}) + +# FWD benchmark +set(project_target2 peft_fwd_benchmark) +set(CPU_SRC2 + ${FLEXFLOW_CPP_DRV_SRC} + peft_fwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target2} ${CPU_SRC2}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC2} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target2} ${CPU_SRC2}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target2} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target2} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target2} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target2} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target2} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target2} DESTINATION ${BIN_DEST}) + +# BWD benchmark +set(project_target3 peft_bwd_benchmark) +set(CPU_SRC3 + ${FLEXFLOW_CPP_DRV_SRC} + 
peft_bwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) + +# Online peft +set(project_target4 req_rate_benchmark) +set(CPU_SRC4 + ${FLEXFLOW_CPP_DRV_SRC} + req_rate_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC4} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target4} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target4} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target4} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target4} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target4} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target4} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/Makefile b/inference/peft/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/peft/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc new file mode 100644 index 0000000000..c55f2c0bfd --- /dev/null +++ b/inference/peft/peft.cc @@ -0,0 +1,387 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string dataset_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // dataset for finetuning + if (!strcmp(argv[i], "-finetuning-dataset")) { + paths.dataset_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], 
"--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 1; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + bool enable_peft_finetuning = true; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + LoraOptimizerConfig *optim_config = nullptr; + if (enable_peft_finetuning) { + // float sgd_learning_rate = 2e-1; + float sgd_learning_rate = 1.0f; + optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate); + } + LoraLinearConfig peft_config_finetuning = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, + peft_model_name, + true /*trainable*/, + optim_config, + false /*init_lora_weights*/, + llm_model_name, + use_full_precision ? 
"fp32" : "fp16"); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + if (enable_peft_finetuning) { + peft_model_id_finetuning = model.add_lora_layer(peft_config_finetuning); + } + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + if (!file_paths.prompt_file_path.empty()) { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + int total_num_requests = 0; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + total_num_requests++; + } + } + + // Add fine-tuning request + if (enable_peft_finetuning) { + assert(!file_paths.dataset_file_path.empty() && + "Dataset file path is required for fine-tuning."); + printf("Finetuning request with dataset %s\n", + file_paths.dataset_file_path.c_str()); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.peft_model_id = (peft_model_id_finetuning != nullptr) + ? 
*peft_model_id_finetuning + : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.dataset_file_path; + fine_tuning_req.max_training_steps = 2; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc new file mode 100644 index 0000000000..86d6d8cbbf --- /dev/null +++ b/inference/peft/peft_bwd_benchmark.cc @@ -0,0 +1,391 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], 
"--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + 
inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector lengths; + int index = 0; + for (auto &entry : prompt_json) { + if (index == max_requests_to_run) { + break; + } + int prompt_length = entry.get(); + assert(prompt_length > 0 && "Prompt length must be greater than 0."); + assert(prompt_length <= 1024 && + "Prompt length must be less than or equal to 1024."); + lengths.push_back(prompt_length); + index++; + } + printf("Total number of finetuning requests: %ld", lengths.size()); + + // Add fine-tuning requests + for (int i = 0; i < lengths.size(); i++) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = lengths[i]; + fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------finetuning finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc new file mode 100644 index 0000000000..9ff042c157 --- /dev/null +++ b/inference/peft/peft_fwd_benchmark.cc @@ -0,0 +1,363 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
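
The warmup block above mixes two request kinds in one vector: synthetic inference requests sized by benchmarking_tokens, plus a single finetuning request so the backward pass is also exercised. A compact sketch of that construction, restricted to the Request fields that already appear in this patch (the helper name is illustrative, and it assumes the FlexFlow headers included at the top of the file):

// Illustrative helper only; it restates the warmup construction above.
std::vector<Request> make_warmup_requests(PEFTModelID const *peft_model_id,
                                          int num_inference_reqs) {
  std::vector<Request> requests;
  for (int i = 0; i < num_inference_reqs; i++) {
    Request req;
    req.benchmarking_tokens = 128; // synthetic prompt length, no real text
    req.max_sequence_length = 256;
    req.warmup = true;
    req.peft_model_id =
        (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
    requests.push_back(req);
  }
  // One finetuning request so the PEFT backward path is warmed up too.
  Request fine_tuning_req;
  fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
  fine_tuning_req.benchmarking_tokens = 1024;
  fine_tuning_req.max_sequence_length = 1024;
  fine_tuning_req.warmup = true;
  fine_tuning_req.peft_model_id =
      (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
  fine_tuning_req.max_training_steps = 1;
  requests.push_back(fine_tuning_req);
  return requests;
}

In the benchmark itself the equivalent code is inlined; the helper form is only meant to make the two request kinds easier to see side by side.
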
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector> prompts; + int index = 0; + for (auto &entry : prompt_json) { + if (index >= max_requests_to_run) { + break; + } + int prompt_length = entry["human"]; + int sequence_length = entry["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + index++; + } + printf("Total number of prompts: %ld", prompts.size()); + for (auto &prompt : prompts) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc new file mode 100644 index 0000000000..43008e74fe --- /dev/null +++ b/inference/peft/req_rate_benchmark.cc @@ -0,0 +1,518 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "inference/models/falcon.h" +#include "inference/models/llama.h" +#include "inference/models/mpt.h" +#include "inference/models/opt.h" +#include "inference/models/starcoder.h" +#include +#include +#include +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +class ConcurrentQueue { +public: + std::queue inf_queue; + std::queue peft_queue; + std::mutex request_queue_mutex; + bool producer_finished = false; +}; + +ConcurrentQueue *common_guids_singleton = nullptr; +int nb_millisecs = 1000; // Default bucket timeframe is 1 second + +ConcurrentQueue *get_common_guids_queue() { + if (common_guids_singleton == nullptr) { + common_guids_singleton = new ConcurrentQueue(); + } + return common_guids_singleton; +} + +void consume() { + RequestManager *rm = RequestManager::get_request_manager(); + ConcurrentQueue *guids = get_common_guids_queue(); + bool producer_is_finished = false; + bool queue_is_empty = false; + // int i=0; + while (!producer_is_finished || !queue_is_empty) { + RequestManager::RequestGuid guid = RequestManager::INVALID_GUID; + { + const std::lock_guard lock(guids->request_queue_mutex); + queue_is_empty = guids->inf_queue.empty(); + producer_is_finished = guids->producer_finished; + if (!queue_is_empty) { + guid = guids->inf_queue.front(); + guids->inf_queue.pop(); + } + } + if (guid != RequestManager::INVALID_GUID) { + GenerationResult result = rm->get_generation_result(guid); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(nb_millisecs)); + } + // i++; + // cout << "Iteration " << i; + } + rm->set_inference_finished(); + + while (guids->peft_queue.size() > 0) { + GenerationResult result = + rm->get_generation_result(guids->peft_queue.front()); + guids->peft_queue.pop(); + } +} + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + 
std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_buckets_to_run, + int &bucket_timeframe) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-buckets-to-run")) { + max_buckets_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--bucket-timeframe")) { + bucket_timeframe = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_buckets_to_run = 1000000000; + bool enable_peft_finetuning = false; + int bucket_timespan = 1; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_buckets_to_run, + bucket_timespan); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
+                         -1
+                         : (int)model_config.at("eos_token_id");
+
+  assert(model_type != ModelType::UNKNOWN &&
+         "Invalid LLM model type passed (or no type was passed).");
+
+  // load PEFT config
+  LoraLinearConfig peft_config =
+      peft_model_name.empty()
+          ? LoraLinearConfig::EmptyConfig
+          : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name);
+
+  GenerationConfig generationConfig(do_sample, temperature, topp);
+  RequestManager *rm = RequestManager::get_request_manager();
+  rm->set_max_requests_per_batch(
+      max_requests_per_batch +
+      (int)enable_peft_finetuning); // add one slot for finetuning if needed
+  rm->set_max_tokens_per_batch(max_tokens_per_batch);
+  rm->set_max_sequence_length(max_sequence_length);
+  rm->register_tokenizer(
+      model_type, bos_token_id, eos_token_id, tokenizer_filepath);
+  rm->register_output_filepath(file_paths.output_file_path);
+  rm->set_enable_peft_finetuning(enable_peft_finetuning);
+
+  FFModel model(ffconfig, ffconfig.cpu_offload);
+  if (model_type == ModelType::LLAMA) {
+    LLAMA::create_llama_model(model,
+                              config_filepath,
+                              weights_filepath,
+                              INC_DECODING_MODE,
+                              generationConfig,
+                              use_full_precision);
+  } else if (model_type == ModelType::OPT) {
+    OPT::create_opt_model(model,
+                          config_filepath,
+                          weights_filepath,
+                          INC_DECODING_MODE,
+                          use_full_precision);
+  } else if (model_type == ModelType::FALCON) {
+    FALCON::create_falcon_model(model,
+                                config_filepath,
+                                weights_filepath,
+                                INC_DECODING_MODE,
+                                use_full_precision);
+  } else if (model_type == ModelType::STARCODER) {
+    STARCODER::create_starcoder_model(model,
+                                      config_filepath,
+                                      weights_filepath,
+                                      INC_DECODING_MODE,
+                                      generationConfig,
+                                      use_full_precision);
+  } else if (model_type == ModelType::MPT) {
+    MPT::create_mpt_model(model,
+                          config_filepath,
+                          weights_filepath,
+                          INC_DECODING_MODE,
+                          generationConfig,
+                          use_full_precision);
+  } else {
+    assert(false && "unknown model type");
+  }
+
+  // Add PEFT layer
+  PEFTModelID *peft_model_id = nullptr;
+  if (!peft_model_name.empty()) {
+    peft_model_id = model.add_lora_layer(peft_config);
+  }
+
+  rm->start_background_server(&model);
+
+  // Warmup stage
+  {
+    std::vector<Request> requests;
+    for (int i = 0; i < 100; i++) {
+      Request inference_req;
+      inference_req.benchmarking_tokens = 128;
+      inference_req.max_sequence_length = 256;
+      inference_req.warmup = true;
+      inference_req.peft_model_id =
+          (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
+      requests.push_back(inference_req);
+    }
+
+    Request fine_tuning_req;
+    fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
+    fine_tuning_req.benchmarking_tokens = 1024;
+    fine_tuning_req.max_sequence_length = 1024;
+    fine_tuning_req.warmup = true;
+    fine_tuning_req.peft_model_id =
+        (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
+    fine_tuning_req.max_training_steps = 1;
+    requests.push_back(fine_tuning_req);
+    std::vector<GenerationResult> result = model.generate(requests);
+  }
+
+  rm->set_inference_finished(false); // reset inference finished flag
+  std::cout << "----------warmup finished--------------" << std::endl;
+
+  // Now run online workload!
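+  // The online replay below uses a simple producer/consumer scheme: the
+  // consumer thread (see consume() above) pops request guids from the shared
+  // ConcurrentQueue and blocks on their generation results, while this thread
+  // acts as the producer. It replays the trace bucket by bucket, sleeping
+  // until each non-empty bucket's arrival offset (its bucket index times
+  // nb_millisecs) before registering that bucket's requests with the
+  // RequestManager and handing their guids to the consumer.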
+
+  nb_millisecs = nb_millisecs * bucket_timespan;
+  int total_num_requests = 0;
+  int num_arrival_buckets = 0;
+  ConcurrentQueue *guids = get_common_guids_queue();
+  std::thread consumer{consume};
+  {
+
+    // Load all requests in advance
+    using json = nlohmann::json;
+    std::ifstream file_handle(file_paths.prompt_file_path);
+    assert(file_handle.good() && "Prompt file does not exist.");
+    json prompt_json = json::parse(file_handle,
+                                   /*parser_callback_t */ nullptr,
+                                   /*allow_exceptions */ true,
+                                   /*ignore_comments */ true);
+
+    auto const &lists = prompt_json.get<std::vector<std::vector<json>>>();
+    std::vector<size_t> bucket_arrival_times_s;
+    std::vector<std::vector<std::pair<int, int>>> buckets;
+
+    size_t index = 0;
+    for (auto const &list : lists) {
+      if (!list.empty()) {
+        bucket_arrival_times_s.push_back(index);
+        std::vector<std::pair<int, int>> prompts;
+        for (auto const &dict : list) {
+          int prompt_length = dict["human"];
+          int sequence_length = dict["gpt"];
+          assert(prompt_length + sequence_length <= max_sequence_length &&
+                 "Prompt + sequence length exceeds max sequence length");
+          prompts.push_back(std::make_pair(prompt_length, sequence_length));
+        }
+        buckets.push_back(prompts);
+      }
+      index++;
+    }
+    assert(bucket_arrival_times_s.size() == buckets.size() &&
+           "Bucket arrival times and buckets are not the same size");
+    // for (int i=0; i<10; i++) {
+    //   printf("bucket_arrival_times_s[%i]: %i\n", i,
+    //   bucket_arrival_times_s[i]); printf("bucket[%i]: %i\n", i,
+    //   buckets[i].size()); for (const auto& prompt : buckets[i]) {
+    //     printf("\tprompt: %i, %i\n", prompt.first, prompt.second);
+    //   }
+    // }
+
+    // Add fine-tuning request
+    Request fine_tuning_req;
+    fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
+    fine_tuning_req.benchmarking_tokens = 1024;
+    fine_tuning_req.max_sequence_length = 1024;
+    fine_tuning_req.peft_model_id =
+        (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
+    fine_tuning_req.max_training_steps = 1000000000;
+    RequestManager::RequestGuid ft_guid =
+        rm->register_new_peft_request(fine_tuning_req);
+    if (ft_guid != RequestManager::INVALID_GUID) {
+      const std::lock_guard<std::mutex> lock(guids->request_queue_mutex);
+      guids->peft_queue.push(ft_guid);
+    }
+
+    // Replay the trace of inference requests
+    auto start_time = std::chrono::steady_clock::now();
+    for (int i = 0; i < bucket_arrival_times_s.size(); i++) {
+      if (bucket_arrival_times_s[i] >= max_buckets_to_run) {
+        break;
+      }
+      // sleep until bucket arrives
+      auto bucket_arrival_time =
+          start_time +
+          std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs);
+      std::this_thread::sleep_until(bucket_arrival_time);
+
+      // create inference requests for the bucket
+      std::vector<Request> requests;
+      for (auto const &prompt : buckets[i]) {
+        // printf("Prompt length: %d, sequence length: %d\n", prompt_length,
+        // sequence_length);
+        Request inference_req;
+        inference_req.benchmarking_tokens = prompt.first;
+        inference_req.max_sequence_length = prompt.second + prompt.first;
+        inference_req.peft_model_id =
+            (peft_model_id != nullptr) ?
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + { + const std::lock_guard lock(guids->request_queue_mutex); + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid = + rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + guids->inf_queue.push(guid); + } + } + } + } + + { // Notify the consumer that no more requests are incoming + const std::lock_guard lock(guids->request_queue_mutex); + guids->producer_finished = true; + } + } + + // Wait for consumer to finish + consumer.join(); + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py new file mode 100644 index 0000000000..a7d38a66b6 --- /dev/null +++ b/inference/python/ff_peft.py @@ -0,0 +1,189 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 10000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": True, + "fusion": False, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "inference_peft_model_id": "goliaro/llama-160m-lora", + "finetuning_peft_model_id": "goliaro/llama-160m-lora", + # "base_model": "meta-llama/Meta-Llama-3-8B", + # "inference_peft_model_id": "goliaro/llama-3-8b-lora", + # "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": True, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "../prompt/peft_dataset.json", + ), + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(model_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + # Add inference and/or finetuning lora + lora_inference_config = None + lora_finetuning_config = None + if len(configs.prompt) > 0: + lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + base_model_name_or_path=configs.base_model, + ) + llm.add_peft(lora_inference_config) + if len(configs.finetuning_dataset) > 0: + # lora_finetuning_config = ff.LoraLinearConfig( + # llm.cache_path, + # configs.finetuning_peft_model_id, + # target_modules=["down_proj"], + # rank=16, + # lora_alpha=16, + # trainable=True, + # init_lora_weights=True, + # optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + # ) + lora_finetuning_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + trainable=True, + base_model_name_or_path=configs.base_model, + optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + optimizer_kwargs={ + "learning_rate": 0.001, + "momentum": 0.0, + "weight_decay": 0.0, + "nesterov": False, + }, + ) + llm.add_peft(lora_finetuning_config) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + enable_peft_finetuning = len(configs.finetuning_dataset) > 0 + llm.compile( + generation_config, + enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch=1 if not enable_peft_finetuning else 2, + max_seq_length=256, + max_tokens_per_batch=128, + ) + + llm.start_server() + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(lora_inference_config), + ) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), + dataset_filepath=configs.finetuning_dataset, + max_training_steps=2, + ) + requests.append(finetuning_request) + + results = llm.generate(requests) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow PEFT example") + main() diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 05599ea6b9..f888982f2c 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md new file mode 100644 index 0000000000..9b2a7a53b2 --- /dev/null +++ 
b/inference/python/peft_demo/INSTRUCTIONS.md @@ -0,0 +1,25 @@ +## Peft Demo +* `git clone -b peft --recursive https://github.com/flexflow/FlexFlow.git` +* `cd FlexFlow/` + +* If you wish to run the demo by installing FlexFlow + * `conda env create -f conda/flexflow.yml` + * `conda activate flexflow` + +* If you wish to run the demo using a Docker container + * `export FF_CUDA_ARCH=all && export cuda_version=12.0 && ./docker/build.sh flexflow && ./docker/run.sh flexflow` + +* Then, install the Llama2 model (the `meta-llama/Llama-2-7b-hf` model is gated, so make sure to add your HF access token) + + * `export HUGGINGFACE_TOKEN="[Your token]"` + * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + +* Run the demo + ``` + mkdir inference/output + cd inference/python/peft_demo/ + python3 demo.py -config-file demo_config.json + ``` + + diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb new file mode 100644 index 0000000000..dfb5193a1d --- /dev/null +++ b/inference/python/peft_demo/demo.ipynb @@ -0,0 +1,1907 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FlexFlow Co-Serving Demo\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import json, random, subprocess, os\n", + "from datasets import load_dataset\n", + "from types import SimpleNamespace\n", + "from huggingface_hub import HfFolder\n", + "import flexflow.serve as ff\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def create_datasets(finetune_dataset_size=2, inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'):\n", + " \"\"\"Creates the inference and finetuning datasets according to the data from https://huggingface.co/datasets/databricks/databricks-dolly-15k.\n", + " Only the 'open_qa' and 'closed_qa' prompts without context are kept.\n", + " The datasets are saved into the files given as arguments.\n", + "\n", + " Keyword arguments:\n", + " dataset_size -- the number of prompts to consider\n", + " inference_file_path -- the file in which to save the inference data\n", + " finetuning_file_path -- the file in which to save the finetuning data\n", + " \"\"\"\n", + " dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")\n", + " inference_data = []\n", + " finetuning_data = []\n", + " for row in dataset:\n", + " if len(finetuning_data) == finetune_dataset_size:\n", + " break\n", + " if (\"open_qa\" in row['category'] or \"closed_qa\" in row['category']) and len(row['context']) == 0:\n", + " inference_data.append(row['instruction'])\n", + " finetuning_data.append(row['instruction'] + \" \" + row['response'])\n", + " with open(inference_file_path, 'w') as file:\n", + " json.dump(inference_data[:1], file)\n", + " with open(finetuning_file_path, 'w') as file:\n", + " json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': '))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration fields" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "configs_dict = {\n", + " \"num_gpus\": 1,\n", + " \"memory_per_gpu\": 21000,\n", + " \"zero_copy_memory_per_node\": 40000,\n", + " \"num_cpus\": 4,\n", + " \"legion_utility_processors\": 4,\n", + " \"data_parallelism_degree\": 1,\n", + " \"tensor_parallelism_degree\": 1,\n", + " \"pipeline_parallelism_degree\": 1,\n", + " \"offload\": False,\n", + " \"offload_reserve_space_size\": 8 * 1024, # 8GB\n", + " \"use_4bit_quantization\": False,\n", + " \"use_8bit_quantization\": False,\n", + " \"enable_peft\": True,\n", + " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", + " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", + " \"profiling\": False,\n", + " \"inference_debugging\": False,\n", + " \"fusion\": False,\n", + " \"max_requests_per_batch\": 1,\n", + " \"max_sequence_length\": 128,\n", + " \"max_tokens_per_batch\": 128,\n", + " \"max_training_steps\": 100,\n", + " \"seed\": 42,\n", + "}\n", + "model_configs = {\n", + " \"base_model\": \"meta-llama/Meta-Llama-3-8B\",\n", + " \"inference_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"finetuning_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"cache_path\": os.environ.get(\"FF_CACHE_PATH\", \"\"),\n", + " \"refresh_cache\": False,\n", + " \"full_precision\": False,\n", + " # relative paths\n", + " \"inference_dataset\": \"inference_dataset.json\",\n", + " \"finetuning_dataset\": \"/usr/FlexFlow/inference/prompt/peft_dataset.json\",\n", + " \"output_file\": \"peft_demo.txt\",\n", + "}\n", + "generation_configs = {\n", + " \"do_sample\": False,\n", + " \"temperature\": 0.9,\n", + " \"topp\": 0.8,\n", + " \"topk\": 1,\n", + "}\n", + "finetuning_configs = {\n", + " \"learning_rate\": 0.001,\n", + " \"momentum\": 0.0,\n", + " \"weight_decay\": 0.0,\n", + " \"nesterov\": False,\n", + "}\n", + "# Merge dictionaries\n", + "configs_dict.update(model_configs)\n", + "configs_dict.update(generation_configs)\n", + "configs_dict.update(finetuning_configs)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(configs_dict[\"seed\"])\n", + "\n", + "configs = SimpleNamespace(**configs_dict)\n", + "\n", + "create_datasets(inference_file_path=configs_dict[\"inference_dataset\"], \n", + " finetuning_file_path=configs_dict[\"finetuning_dataset\"])\n", + "\n", + "# Clear output file\n", + "with open(configs.output_file, 'w') as file:\n", + " file.write('')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download base and peft inference models" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n" + ] + }, + { + "data": { + "text/plain": [ + "CompletedProcess(args=['python', '../../utils/download_peft_model.py', 'goliaro/llama-3-8b-lora', '--base_model_name', 'meta-llama/Meta-Llama-3-8B'], returncode=0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]\n", + "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize FlexFlow runtime and LLM object" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 - 7f4d49d21280] 0.672934 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.672995 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673107 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673118 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673124 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "workSpaceSize (128 MB)\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n", + "Adding layer layers.0.mlp.down_proj.lora\n", + "Adding layer layers.1.mlp.down_proj.lora\n", + "Adding layer layers.2.mlp.down_proj.lora\n", + "Adding layer layers.3.mlp.down_proj.lora\n", + "Adding layer layers.4.mlp.down_proj.lora\n", + "Adding layer layers.5.mlp.down_proj.lora\n", + "Adding layer layers.6.mlp.down_proj.lora\n", + "Adding layer layers.7.mlp.down_proj.lora\n", + "Adding layer layers.8.mlp.down_proj.lora\n", + "Adding layer layers.9.mlp.down_proj.lora\n", + "Adding layer layers.10.mlp.down_proj.lora\n", + "Adding layer layers.11.mlp.down_proj.lora\n", + "Adding layer layers.12.mlp.down_proj.lora\n", + "Adding layer layers.13.mlp.down_proj.lora\n", + "Adding layer layers.14.mlp.down_proj.lora\n", + "Adding layer layers.15.mlp.down_proj.lora\n", + "Adding layer layers.16.mlp.down_proj.lora\n", + "Adding layer layers.17.mlp.down_proj.lora\n", + "Adding layer layers.18.mlp.down_proj.lora\n", + "Adding layer layers.19.mlp.down_proj.lora\n", + "Adding layer layers.20.mlp.down_proj.lora\n", + "Adding layer layers.21.mlp.down_proj.lora\n", + "Adding layer layers.22.mlp.down_proj.lora\n", + "Adding layer layers.23.mlp.down_proj.lora\n", + "Adding layer layers.24.mlp.down_proj.lora\n", + "Adding layer layers.25.mlp.down_proj.lora\n", + "Adding layer layers.26.mlp.down_proj.lora\n", + "Adding layer layers.27.mlp.down_proj.lora\n", + "Adding layer layers.28.mlp.down_proj.lora\n", + "Adding layer layers.29.mlp.down_proj.lora\n", + "Adding layer layers.30.mlp.down_proj.lora\n", + "Adding layer layers.31.mlp.down_proj.lora\n" + ] + } + ], + "source": [ + "# Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs\n", + "ff.init(configs_dict)\n", + "\n", + "# Create the FlexFlow LLM\n", + "ff_data_type = (\n", + " ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF\n", + ")\n", + "llm = ff.LLM(\n", + " configs.base_model,\n", + " data_type=ff_data_type,\n", + " cache_path=configs.cache_path,\n", + " refresh_cache=configs.refresh_cache,\n", + " output_file=configs.output_file,\n", + ")\n", + "# Add inference and/or finetuning lora\n", + "lora_inference_config = None\n", + "lora_finetuning_config = None\n", + "if len(configs.inference_dataset) > 0:\n", + " lora_inference_config = ff.LoraLinearConfig(\n", + " llm.cache_path, \n", + " configs.inference_peft_model_id,\n", + " base_model_name_or_path=configs.base_model\n", + " )\n", + " llm.add_peft(lora_inference_config)\n", + "if len(configs.finetuning_dataset) > 0:\n", + " lora_finetuning_config = ff.LoraLinearConfig(\n", + " llm.cache_path,\n", + " configs.finetuning_peft_model_id,\n", + " trainable=True,\n", + " init_lora_weights=False,\n", + " rank=16,\n", + " lora_alpha=16.0,\n", + " # target_modules = [\"down_proj\"],\n", + " base_model_name_or_path=configs.base_model,\n", + " optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,\n", + " optimizer_kwargs={\n", + " \"learning_rate\": configs.learning_rate,\n", + " \"momentum\": configs.momentum,\n", + " \"weight_decay\": configs.weight_decay,\n", + " \"nesterov\": configs.nesterov,\n", + " },\n", + " )\n", + " llm.add_peft(lora_finetuning_config)\n", + "\n", + "# Compile the LLM for inference and load the weights into memory\n", + "generation_config = ff.GenerationConfig(\n", + " do_sample=configs.do_sample,\n", + " temperature=configs.temperature,\n", + " topp=configs.topp,\n", + " topk=configs.topk\n", + ")\n", + "enable_peft_finetuning = len(configs.finetuning_dataset) > 0\n", + "llm.compile(\n", + " generation_config,\n", + " enable_peft_finetuning=enable_peft_finetuning,\n", + " max_requests_per_batch=configs.max_requests_per_batch+int(enable_peft_finetuning),\n", + " max_seq_length=configs.max_sequence_length,\n", + " max_tokens_per_batch=configs.max_tokens_per_batch,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start the LLM Co-serving system" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Background server started.\n", + "2024-07-22 06:45:43 - ###PEFT DEBUGGING### Starting background serving task.\n", + "2024-07-22 06:45:43 - ###PEFT DEBUGGING### Updated models' configuration.\n", + "###PEFT DEBUGGING### LLM Model object exists.\n", + "###PEFT DEBUGGING### Model object exists.\n", + "###PEFT DEBUGGING### Model object still exists.\n", + "###PEFT DEBUGGING### Entering compile_inference.\n", + "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node.\n" + ] + } + ], + "source": [ + "llm.start_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate inference" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "###PEFT DEBUGGING### Launching graph optimization task.\n", + "[]\n", + "num_nodes = 1 num_gpus_per_node = 1\n", + "[0]10445\n", + "[1]649\n", + "[2]6730\n", + "[3]2053\n", + "[4]18167\n", + "[5]369\n", + "[6]1317\n", + "[7]2085\n", + "[8]3090\n", + "[9]30\n", + "No small 
speculative model registered, using incremental decoding.\n", + "[0 - 7f4d49d21280] 1.600215 {3}{RequestManager}: [1000000]New request tokens: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30\n", + "optimal_views.size = 262\n", + "views.size() = 262\n", + "###PEFT DEBUGGING### Operators reconstructed from optimized graph.\n", + "###PEFT DEBUGGING### Starting inplace optimizations.\n", + "###PEFT DEBUGGING### Mapping output tensors.\n", + "ndim(1) dims[1 0 0 0]\n", + "###PEFT DEBUGGING### Setting up NCCL communications.\n", + "###PEFT DEBUGGING### compile_inference completed successfully.\n", + "Loading weight file embed_tokens.weight\n", + "Loading weight file layers.0.input_layernorm.weight\n", + "Loading weight file layers.0.self_attn.q_proj.weight\n", + "Loading weight file layers.0.self_attn.k_proj.weight\n", + "Loading weight file layers.0.self_attn.v_proj.weight\n", + "Loading weight file layers.0.self_attn.o_proj.weight\n", + "Loading weight file layers.0.post_attention_layernorm.weight\n", + "Loading weight file layers.0.mlp.gate_proj.weight\n", + "Loading weight file layers.0.mlp.up_proj.weight\n", + "Loading weight file layers.0.mlp.down_proj.weight\n", + "Loading weight file layers.1.input_layernorm.weight\n", + "Loading weight file layers.1.self_attn.q_proj.weight\n", + "Loading weight file layers.1.self_attn.k_proj.weight\n", + "Loading weight file layers.1.self_attn.v_proj.weight\n", + "Loading weight file layers.1.self_attn.o_proj.weight\n", + "Loading weight file layers.1.post_attention_layernorm.weight\n", + "Loading weight file layers.1.mlp.gate_proj.weight\n", + "Loading weight file layers.1.mlp.up_proj.weight\n", + "Loading weight file layers.1.mlp.down_proj.weight\n", + "Loading weight file layers.2.input_layernorm.weight\n", + "Loading weight file layers.2.self_attn.q_proj.weight\n", + "Loading weight file layers.2.self_attn.k_proj.weight\n", + "Loading weight file layers.2.self_attn.v_proj.weight\n", + "Loading weight file layers.2.self_attn.o_proj.weight\n", + "Loading weight file layers.2.post_attention_layernorm.weight\n", + "Loading weight file layers.2.mlp.gate_proj.weight\n", + "Loading weight file layers.2.mlp.up_proj.weight\n", + "Loading weight file layers.2.mlp.down_proj.weight\n", + "Loading weight file layers.3.input_layernorm.weight\n", + "Loading weight file layers.3.self_attn.q_proj.weight\n", + "Loading weight file layers.3.self_attn.k_proj.weight\n", + "Loading weight file layers.3.self_attn.v_proj.weight\n", + "Loading weight file layers.3.self_attn.o_proj.weight\n", + "Loading weight file layers.3.post_attention_layernorm.weight\n", + "Loading weight file layers.3.mlp.gate_proj.weight\n", + "Loading weight file layers.3.mlp.up_proj.weight\n", + "Loading weight file layers.3.mlp.down_proj.weight\n", + "Loading weight file layers.4.input_layernorm.weight\n", + "Loading weight file layers.4.self_attn.q_proj.weight\n", + "Loading weight file layers.4.self_attn.k_proj.weight\n", + "Loading weight file layers.4.self_attn.v_proj.weight\n", + "Loading weight file layers.4.self_attn.o_proj.weight\n", + "Loading weight file layers.4.post_attention_layernorm.weight\n", + "Loading weight file layers.4.mlp.gate_proj.weight\n", + "Loading weight file layers.4.mlp.up_proj.weight\n", + "Loading weight file layers.4.mlp.down_proj.weight\n", + "Loading weight file layers.5.input_layernorm.weight\n", + "Loading weight file layers.5.self_attn.q_proj.weight\n", + "Loading weight file layers.5.self_attn.k_proj.weight\n", + "Loading weight file 
layers.5.self_attn.v_proj.weight\n", + "Loading weight file layers.5.self_attn.o_proj.weight\n", + "Loading weight file layers.5.post_attention_layernorm.weight\n", + "Loading weight file layers.5.mlp.gate_proj.weight\n", + "Loading weight file layers.5.mlp.up_proj.weight\n", + "Loading weight file layers.5.mlp.down_proj.weight\n", + "Loading weight file layers.6.input_layernorm.weight\n", + "Loading weight file layers.6.self_attn.q_proj.weight\n", + "Loading weight file layers.6.self_attn.k_proj.weight\n", + "Loading weight file layers.6.self_attn.v_proj.weight\n", + "Loading weight file layers.6.self_attn.o_proj.weight\n", + "Loading weight file layers.6.post_attention_layernorm.weight\n", + "Loading weight file layers.6.mlp.gate_proj.weight\n", + "Loading weight file layers.6.mlp.up_proj.weight\n", + "Loading weight file layers.6.mlp.down_proj.weight\n", + "Loading weight file layers.7.input_layernorm.weight\n", + "Loading weight file layers.7.self_attn.q_proj.weight\n", + "Loading weight file layers.7.self_attn.k_proj.weight\n", + "Loading weight file layers.7.self_attn.v_proj.weight\n", + "Loading weight file layers.7.self_attn.o_proj.weight\n", + "Loading weight file layers.7.post_attention_layernorm.weight\n", + "Loading weight file layers.7.mlp.gate_proj.weight\n", + "Loading weight file layers.7.mlp.up_proj.weight\n", + "Loading weight file layers.7.mlp.down_proj.weight\n", + "Loading weight file layers.8.input_layernorm.weight\n", + "Loading weight file layers.8.self_attn.q_proj.weight\n", + "Loading weight file layers.8.self_attn.k_proj.weight\n", + "Loading weight file layers.8.self_attn.v_proj.weight\n", + "Loading weight file layers.8.self_attn.o_proj.weight\n", + "Loading weight file layers.8.post_attention_layernorm.weight\n", + "Loading weight file layers.8.mlp.gate_proj.weight\n", + "Loading weight file layers.8.mlp.up_proj.weight\n", + "Loading weight file layers.8.mlp.down_proj.weight\n", + "Loading weight file layers.9.input_layernorm.weight\n", + "Loading weight file layers.9.self_attn.q_proj.weight\n", + "Loading weight file layers.9.self_attn.k_proj.weight\n", + "Loading weight file layers.9.self_attn.v_proj.weight\n", + "Loading weight file layers.9.self_attn.o_proj.weight\n", + "Loading weight file layers.9.post_attention_layernorm.weight\n", + "Loading weight file layers.9.mlp.gate_proj.weight\n", + "Loading weight file layers.9.mlp.up_proj.weight\n", + "Loading weight file layers.9.mlp.down_proj.weight\n", + "Loading weight file layers.10.input_layernorm.weight\n", + "Loading weight file layers.10.self_attn.q_proj.weight\n", + "Loading weight file layers.10.self_attn.k_proj.weight\n", + "Loading weight file layers.10.self_attn.v_proj.weight\n", + "Loading weight file layers.10.self_attn.o_proj.weight\n", + "Loading weight file layers.10.post_attention_layernorm.weight\n", + "Loading weight file layers.10.mlp.gate_proj.weight\n", + "Loading weight file layers.10.mlp.up_proj.weight\n", + "Loading weight file layers.10.mlp.down_proj.weight\n", + "Loading weight file layers.11.input_layernorm.weight\n", + "Loading weight file layers.11.self_attn.q_proj.weight\n", + "Loading weight file layers.11.self_attn.k_proj.weight\n", + "Loading weight file layers.11.self_attn.v_proj.weight\n", + "Loading weight file layers.11.self_attn.o_proj.weight\n", + "Loading weight file layers.11.post_attention_layernorm.weight\n", + "Loading weight file layers.11.mlp.gate_proj.weight\n", + "Loading weight file layers.11.mlp.up_proj.weight\n", + "Loading weight file 
layers.11.mlp.down_proj.weight\n", + "Loading weight file layers.12.input_layernorm.weight\n", + "Loading weight file layers.12.self_attn.q_proj.weight\n", + "Loading weight file layers.12.self_attn.k_proj.weight\n", + "Loading weight file layers.12.self_attn.v_proj.weight\n", + "Loading weight file layers.12.self_attn.o_proj.weight\n", + "Loading weight file layers.12.post_attention_layernorm.weight\n", + "Loading weight file layers.12.mlp.gate_proj.weight\n", + "Loading weight file layers.12.mlp.up_proj.weight\n", + "Loading weight file layers.12.mlp.down_proj.weight\n", + "Loading weight file layers.13.input_layernorm.weight\n", + "Loading weight file layers.13.self_attn.q_proj.weight\n", + "Loading weight file layers.13.self_attn.k_proj.weight\n", + "Loading weight file layers.13.self_attn.v_proj.weight\n", + "Loading weight file layers.13.self_attn.o_proj.weight\n", + "Loading weight file layers.13.post_attention_layernorm.weight\n", + "Loading weight file layers.13.mlp.gate_proj.weight\n", + "Loading weight file layers.13.mlp.up_proj.weight\n", + "Loading weight file layers.13.mlp.down_proj.weight\n", + "Loading weight file layers.14.input_layernorm.weight\n", + "Loading weight file layers.14.self_attn.q_proj.weight\n", + "Loading weight file layers.14.self_attn.k_proj.weight\n", + "Loading weight file layers.14.self_attn.v_proj.weight\n", + "Loading weight file layers.14.self_attn.o_proj.weight\n", + "Loading weight file layers.14.post_attention_layernorm.weight\n", + "Loading weight file layers.14.mlp.gate_proj.weight\n", + "Loading weight file layers.14.mlp.up_proj.weight\n", + "Loading weight file layers.14.mlp.down_proj.weight\n", + "Loading weight file layers.15.input_layernorm.weight\n", + "Loading weight file layers.15.self_attn.q_proj.weight\n", + "Loading weight file layers.15.self_attn.k_proj.weight\n", + "Loading weight file layers.15.self_attn.v_proj.weight\n", + "Loading weight file layers.15.self_attn.o_proj.weight\n", + "Loading weight file layers.15.post_attention_layernorm.weight\n", + "Loading weight file layers.15.mlp.gate_proj.weight\n", + "Loading weight file layers.15.mlp.up_proj.weight\n", + "Loading weight file layers.15.mlp.down_proj.weight\n", + "Loading weight file layers.16.input_layernorm.weight\n", + "Loading weight file layers.16.self_attn.q_proj.weight\n", + "Loading weight file layers.16.self_attn.k_proj.weight\n", + "Loading weight file layers.16.self_attn.v_proj.weight\n", + "Loading weight file layers.16.self_attn.o_proj.weight\n", + "Loading weight file layers.16.post_attention_layernorm.weight\n", + "Loading weight file layers.16.mlp.gate_proj.weight\n", + "Loading weight file layers.16.mlp.up_proj.weight\n", + "Loading weight file layers.16.mlp.down_proj.weight\n", + "Loading weight file layers.17.input_layernorm.weight\n", + "Loading weight file layers.17.self_attn.q_proj.weight\n", + "Loading weight file layers.17.self_attn.k_proj.weight\n", + "Loading weight file layers.17.self_attn.v_proj.weight\n", + "Loading weight file layers.17.self_attn.o_proj.weight\n", + "Loading weight file layers.17.post_attention_layernorm.weight\n", + "Loading weight file layers.17.mlp.gate_proj.weight\n", + "Loading weight file layers.17.mlp.up_proj.weight\n", + "Loading weight file layers.17.mlp.down_proj.weight\n", + "Loading weight file layers.18.input_layernorm.weight\n", + "Loading weight file layers.18.self_attn.q_proj.weight\n", + "Loading weight file layers.18.self_attn.k_proj.weight\n", + "Loading weight file layers.18.self_attn.v_proj.weight\n", + 
"Loading weight file layers.18.self_attn.o_proj.weight\n", + "Loading weight file layers.18.post_attention_layernorm.weight\n", + "Loading weight file layers.18.mlp.gate_proj.weight\n", + "Loading weight file layers.18.mlp.up_proj.weight\n", + "Loading weight file layers.18.mlp.down_proj.weight\n", + "Loading weight file layers.19.input_layernorm.weight\n", + "Loading weight file layers.19.self_attn.q_proj.weight\n", + "Loading weight file layers.19.self_attn.k_proj.weight\n", + "Loading weight file layers.19.self_attn.v_proj.weight\n", + "Loading weight file layers.19.self_attn.o_proj.weight\n", + "Loading weight file layers.19.post_attention_layernorm.weight\n", + "Loading weight file layers.19.mlp.gate_proj.weight\n", + "Loading weight file layers.19.mlp.up_proj.weight\n", + "Loading weight file layers.19.mlp.down_proj.weight\n", + "Loading weight file layers.20.input_layernorm.weight\n", + "Loading weight file layers.20.self_attn.q_proj.weight\n", + "Loading weight file layers.20.self_attn.k_proj.weight\n", + "Loading weight file layers.20.self_attn.v_proj.weight\n", + "Loading weight file layers.20.self_attn.o_proj.weight\n", + "Loading weight file layers.20.post_attention_layernorm.weight\n", + "Loading weight file layers.20.mlp.gate_proj.weight\n", + "Loading weight file layers.20.mlp.up_proj.weight\n", + "Loading weight file layers.20.mlp.down_proj.weight\n", + "Loading weight file layers.21.input_layernorm.weight\n", + "Loading weight file layers.21.self_attn.q_proj.weight\n", + "Loading weight file layers.21.self_attn.k_proj.weight\n", + "Loading weight file layers.21.self_attn.v_proj.weight\n", + "Loading weight file layers.21.self_attn.o_proj.weight\n", + "Loading weight file layers.21.post_attention_layernorm.weight\n", + "Loading weight file layers.21.mlp.gate_proj.weight\n", + "Loading weight file layers.21.mlp.up_proj.weight\n", + "Loading weight file layers.21.mlp.down_proj.weight\n", + "Loading weight file layers.22.input_layernorm.weight\n", + "Loading weight file layers.22.self_attn.q_proj.weight\n", + "Loading weight file layers.22.self_attn.k_proj.weight\n", + "Loading weight file layers.22.self_attn.v_proj.weight\n", + "Loading weight file layers.22.self_attn.o_proj.weight\n", + "Loading weight file layers.22.post_attention_layernorm.weight\n", + "Loading weight file layers.22.mlp.gate_proj.weight\n", + "Loading weight file layers.22.mlp.up_proj.weight\n", + "Loading weight file layers.22.mlp.down_proj.weight\n", + "Loading weight file layers.23.input_layernorm.weight\n", + "Loading weight file layers.23.self_attn.q_proj.weight\n", + "Loading weight file layers.23.self_attn.k_proj.weight\n", + "Loading weight file layers.23.self_attn.v_proj.weight\n", + "Loading weight file layers.23.self_attn.o_proj.weight\n", + "Loading weight file layers.23.post_attention_layernorm.weight\n", + "Loading weight file layers.23.mlp.gate_proj.weight\n", + "Loading weight file layers.23.mlp.up_proj.weight\n", + "Loading weight file layers.23.mlp.down_proj.weight\n", + "Loading weight file layers.24.input_layernorm.weight\n", + "Loading weight file layers.24.self_attn.q_proj.weight\n", + "Loading weight file layers.24.self_attn.k_proj.weight\n", + "Loading weight file layers.24.self_attn.v_proj.weight\n", + "Loading weight file layers.24.self_attn.o_proj.weight\n", + "Loading weight file layers.24.post_attention_layernorm.weight\n", + "Loading weight file layers.24.mlp.gate_proj.weight\n", + "Loading weight file layers.24.mlp.up_proj.weight\n", + "Loading weight file 
layers.24.mlp.down_proj.weight\n", + "Loading weight file layers.25.input_layernorm.weight\n", + "Loading weight file layers.25.self_attn.q_proj.weight\n", + "Loading weight file layers.25.self_attn.k_proj.weight\n", + "Loading weight file layers.25.self_attn.v_proj.weight\n", + "Loading weight file layers.25.self_attn.o_proj.weight\n", + "Loading weight file layers.25.post_attention_layernorm.weight\n", + "Loading weight file layers.25.mlp.gate_proj.weight\n", + "Loading weight file layers.25.mlp.up_proj.weight\n", + "Loading weight file layers.25.mlp.down_proj.weight\n", + "Loading weight file layers.26.input_layernorm.weight\n", + "Loading weight file layers.26.self_attn.q_proj.weight\n", + "Loading weight file layers.26.self_attn.k_proj.weight\n", + "Loading weight file layers.26.self_attn.v_proj.weight\n", + "Loading weight file layers.26.self_attn.o_proj.weight\n", + "Loading weight file layers.26.post_attention_layernorm.weight\n", + "Loading weight file layers.26.mlp.gate_proj.weight\n", + "Loading weight file layers.26.mlp.up_proj.weight\n", + "Loading weight file layers.26.mlp.down_proj.weight\n", + "Loading weight file layers.27.input_layernorm.weight\n", + "Loading weight file layers.27.self_attn.q_proj.weight\n", + "Loading weight file layers.27.self_attn.k_proj.weight\n", + "Loading weight file layers.27.self_attn.v_proj.weight\n", + "Loading weight file layers.27.self_attn.o_proj.weight\n", + "Loading weight file layers.27.post_attention_layernorm.weight\n", + "Loading weight file layers.27.mlp.gate_proj.weight\n", + "Loading weight file layers.27.mlp.up_proj.weight\n", + "Loading weight file layers.27.mlp.down_proj.weight\n", + "Loading weight file layers.28.input_layernorm.weight\n", + "Loading weight file layers.28.self_attn.q_proj.weight\n", + "Loading weight file layers.28.self_attn.k_proj.weight\n", + "Loading weight file layers.28.self_attn.v_proj.weight\n", + "Loading weight file layers.28.self_attn.o_proj.weight\n", + "Loading weight file layers.28.post_attention_layernorm.weight\n", + "Loading weight file layers.28.mlp.gate_proj.weight\n", + "Loading weight file layers.28.mlp.up_proj.weight\n", + "Loading weight file layers.28.mlp.down_proj.weight\n", + "Loading weight file layers.29.input_layernorm.weight\n", + "Loading weight file layers.29.self_attn.q_proj.weight\n", + "Loading weight file layers.29.self_attn.k_proj.weight\n", + "Loading weight file layers.29.self_attn.v_proj.weight\n", + "Loading weight file layers.29.self_attn.o_proj.weight\n", + "Loading weight file layers.29.post_attention_layernorm.weight\n", + "Loading weight file layers.29.mlp.gate_proj.weight\n", + "Loading weight file layers.29.mlp.up_proj.weight\n", + "Loading weight file layers.29.mlp.down_proj.weight\n", + "Loading weight file layers.30.input_layernorm.weight\n", + "Loading weight file layers.30.self_attn.q_proj.weight\n", + "Loading weight file layers.30.self_attn.k_proj.weight\n", + "Loading weight file layers.30.self_attn.v_proj.weight\n", + "Loading weight file layers.30.self_attn.o_proj.weight\n", + "Loading weight file layers.30.post_attention_layernorm.weight\n", + "Loading weight file layers.30.mlp.gate_proj.weight\n", + "Loading weight file layers.30.mlp.up_proj.weight\n", + "Loading weight file layers.30.mlp.down_proj.weight\n", + "Loading weight file layers.31.input_layernorm.weight\n", + "Loading weight file layers.31.self_attn.q_proj.weight\n", + "Loading weight file layers.31.self_attn.k_proj.weight\n", + "Loading weight file layers.31.self_attn.v_proj.weight\n", + 
"Loading weight file layers.31.self_attn.o_proj.weight\n", + "Loading weight file layers.31.post_attention_layernorm.weight\n", + "Loading weight file layers.31.mlp.gate_proj.weight\n", + "Loading weight file layers.31.mlp.up_proj.weight\n", + "Loading weight file layers.31.mlp.down_proj.weight\n", + "Loading weight file norm.weight\n", + "Loading weight file lm_head.weight\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, 
shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, 
num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, 
num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight 
layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "[0 - 7f4ce019c740] 24.015346 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 24.062661 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0190740] 24.128376 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 24.199797 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 24.255941 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0178740] 24.306545 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 24.357210 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0190740] 24.407958 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0178740] 24.459366 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0178740] 24.510618 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 24.560416 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 24.611335 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 24.663808 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0178740] 24.710965 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 24.756020 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 24.805719 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0178740] 24.858560 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 24.910607 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 24.958879 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 25.002851 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 
25.050780 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 25.104554 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0184740] 25.159509 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 25.211003 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 25.261411 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 25.312357 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 25.362253 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 25.412284 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0184740] 25.461502 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0184740] 25.513610 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 25.564433 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0184740] 25.613662 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 25.663786 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 25.712708 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 25.762206 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0184740] 25.812755 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 25.863367 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 25.913378 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0184740] 25.965063 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0178740] 26.015739 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.065768 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0178740] 26.115556 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 26.166644 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 26.218528 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 26.269681 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 26.320250 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 26.371698 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 26.422587 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 26.474391 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 26.524817 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0190740] 26.575224 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0178740] 26.627207 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0190740] 26.679366 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0178740] 26.729921 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.779766 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 26.832104 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0184740] 26.884087 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.935580 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 26.992909 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 27.043722 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 27.093960 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 27.144937 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0190740] 27.196991 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 27.248143 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0190740] 27.299549 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0190740] 27.351395 
{3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 27.402975 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0190740] 27.453662 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0178740] 27.504152 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 27.554072 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 27.605613 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 27.656807 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0190740] 27.707595 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 27.757815 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 27.809557 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 27.862148 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 27.914188 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0178740] 27.965942 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 28.017837 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0184740] 28.069997 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0184740] 28.122560 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0190740] 28.172513 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0190740] 28.224002 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 28.276536 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 28.327091 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 28.377124 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0190740] 28.427226 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0190740] 28.477499 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 28.528489 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 28.580135 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 28.631761 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 28.683392 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 28.734001 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 28.783914 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0190740] 28.835832 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 28.885271 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0190740] 28.936179 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0190740] 28.987163 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 29.038264 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0184740] 29.084248 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 29.129864 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 29.175946 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 29.226707 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0184740] 29.277372 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 29.329588 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 29.380856 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0190740] 29.431483 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 29.483399 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 29.536268 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0190740] 29.588317 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 29.638727 
{3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0190740] 29.689708 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0190740] 29.740987 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 29.791166 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0190740] 29.841776 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 29.893514 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 29.945509 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 29.945878 {3}{RequestManager}: [Done] guid(1000000) final_length(128)\n", + "[0 - 7f4ce0178740] 29.945889 {3}{RequestManager}: Final output: <|begin_of_text|>Why can camels survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without\n", + "[0 - 7f4ce0178740] 29.945900 {3}{RequestManager}: [Profile] guid(1000000) llm_decoding_steps(117) start(23696232.0) finish(29945893.0) latency(6249661.0) ttft(22415078.0)\n" + ] + } + ], + "source": [ + "prompts = [s for s in json.load(open(configs.inference_dataset))]\n", + "inference_requests = [\n", + " ff.Request(\n", + " ff.RequestType.REQ_INFERENCE,\n", + " prompt=prompt,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_inference_config),\n", + " )\n", + " for prompt in prompts\n", + "]\n", + "inf_req_res_1 = llm.generate(inference_requests)\n", + "with open(\"before_finetuning.txt\", \"w\") as file:\n", + " file.write(str(inf_req_res_1[0].output_text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perform Finetuning on dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n", + "No small speculative model registered, using incremental decoding.\n", + "[0 - 7f4d49d21280] 29.957050 {3}{RequestManager}: [0] input: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30 8215 2053 1005 279 8834 304 872 305 12055 311 2567 1124 10409 449 4907 323 88000 369 1317 18852 315 892 13\n", + "[0 - 7f4d49d21280] 29.957061 {3}{RequestManager}: [0] output:\n", + "Loss: 2.6536\n", + "Loss: 2.5942\n", + "Loss: 2.5360\n", + "Loss: 2.5083\n", + "Loss: 2.4783\n", + "Loss: 2.4570\n", + "Loss: 2.4420\n", + "Loss: 2.4194\n", + "Loss: 2.4050\n", + "Loss: 2.3949\n", + "Loss: 2.3841\n", + "Loss: 2.3764\n", + "Loss: 2.3676\n", + "Loss: 2.3535\n", + "Loss: 2.3396\n", + "Loss: 2.3299\n", + "Loss: 2.3287\n", + "Loss: 2.3215\n", + "Loss: 2.3058\n", + "Loss: 2.2978\n", + "Loss: 2.2885\n", + "Loss: 2.2852\n", + "Loss: 2.2660\n", + "Loss: 2.2619\n", + "Loss: 2.2594\n", + "Loss: 2.2479\n", + "Loss: 2.2379\n", + "Loss: 2.2243\n", + "Loss: 2.2245\n", + "Loss: 2.2057\n", + "Loss: 2.2035\n", + "Loss: 2.1891\n", + "Loss: 2.1817\n", + "Loss: 2.1703\n", + "Loss: 2.1592\n", + "Loss: 2.1548\n", + "Loss: 2.1383\n", + "Loss: 2.1321\n", + "Loss: 2.1179\n", + "Loss: 2.1138\n", + "Loss: 2.1062\n", + "Loss: 2.0934\n", + "Loss: 
2.0856\n", + "Loss: 2.0758\n", + "Loss: 2.0656\n", + "Loss: 2.0532\n", + "Loss: 2.0497\n", + "Loss: 2.0410\n", + "Loss: 2.0258\n", + "Loss: 2.0161\n", + "Loss: 2.0047\n", + "Loss: 1.9940\n", + "Loss: 1.9820\n", + "Loss: 1.9737\n", + "Loss: 1.9614\n", + "Loss: 1.9486\n", + "Loss: 1.9378\n", + "Loss: 1.9281\n", + "Loss: 1.9174\n", + "Loss: 1.9047\n", + "Loss: 1.8922\n", + "Loss: 1.8798\n", + "Loss: 1.8674\n", + "Loss: 1.8574\n", + "Loss: 1.8485\n", + "Loss: 1.8301\n", + "Loss: 1.8213\n", + "Loss: 1.8091\n", + "Loss: 1.8007\n", + "Loss: 1.7850\n", + "Loss: 1.7784\n", + "Loss: 1.7606\n", + "Loss: 1.7496\n", + "Loss: 1.7320\n", + "Loss: 1.7216\n", + "Loss: 1.7067\n", + "Loss: 1.6954\n", + "Loss: 1.6781\n", + "Loss: 1.6667\n", + "Loss: 1.6551\n", + "Loss: 1.6425\n", + "Loss: 1.6272\n", + "Loss: 1.6096\n", + "Loss: 1.6030\n", + "Loss: 1.5824\n", + "Loss: 1.5724\n", + "Loss: 1.5558\n", + "Loss: 1.5399\n", + "Loss: 1.5266\n", + "Loss: 1.5109\n", + "Loss: 1.4952\n", + "Loss: 1.4829\n", + "Loss: 1.4648\n", + "Loss: 1.4496\n", + "Loss: 1.4360\n", + "Loss: 1.4154\n", + "Loss: 1.4010\n", + "Loss: 1.3958\n", + "Loss: 1.3719\n", + "Loss: 1.3562\n", + "[0 - 7f4ce0190740] 38.933268 {3}{RequestManager}: [Finetuning] guid(1000001) completed_training_steps(100) processed_finetuning_tokens(3400) latency(38933176.0)\n" + ] + } + ], + "source": [ + "finetuning_request = ff.Request(\n", + " ff.RequestType.REQ_FINETUNING,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),\n", + " dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset),\n", + " max_training_steps=configs.max_training_steps,\n", + ")\n", + "ft_res = llm.generate([finetuning_request])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAA9hAAAPYQGoP6dpAABm/UlEQVR4nO3de1yUdfr/8fcICKKioqgIJKaVHe1gBw94KA9ZmYpKiqVW+3VLLcnd2tq21O1gWdtWW1m2pZ3QjDTL7UQlHlK3rNztaG1KKmIeERVFGu7fH/dvBoaZYQ4MzAzzej4ePMa5577v+TB+UC8/13V9LIZhGAIAAAAAuNUk2AMAAAAAgFBH4AQAAAAAHhA4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBACNUEFBgSwWiwoKCoI9lIi3aNEiWSwWbdq0KdhD8cpPP/2kIUOGqFWrVrJYLHrrrbeCPSS/FBYWymKx6NFHHw32UAA0EgROABq1cPhH6znnnKOTTjpJhmG4PadPnz7q0KGDfvvttwYcWfiYPXu2LBaLOnTooLKyMqfX09PTddVVVwVhZOFn0qRJ+vrrr/XAAw/olVdeUc+ePV2eZwtM3H099NBDDTxyAKhf0cEeAABEugkTJujOO+/U2rVr1a9fP6fXCwsLtWHDBk2fPl3R0fyxXZs9e/Zo/vz5+sMf/hDsoYSlY8eOacOGDbr77rs1ffp0r64ZP368rrjiCqfj5513XqCHBwBBxd/AABBk2dnZuuuuu5Sbm+sycFq8eLEMw9CECROCMLrwcu655+qRRx7R1KlT1axZs2APp0EdPXpUzZs3r9M99u7dK0lq3bq119ecf/75uvbaa+v0vgAQDkjVAwBJX331lYYNG6aEhAS1aNFCl112mTZu3OhwTkVFhebMmaNTTjlFcXFxatu2rfr27av8/Hz7Obt379b111+v1NRUxcbGKjk5WSNGjFBhYaHb905LS1O/fv2Ul5eniooKp9dzc3PVtWtXXXzxxfrll180depUnXbaaWrWrJnatm2rsWPH1np/m/T0dE2ePNnp+IABAzRgwACHY+Xl5Zo1a5a6deum2NhYpaWl6Y477lB5eXmt7zF9+nS1aNHCZbrc+PHj1bFjR1mtVknSpk2bNHToULVr107NmjVTly5ddMMNN3j8Pmpz77336tdff9X8+fNrPc9dDZgt/WzRokX2Y5MnT1aLFi20fft2XXXVVWrRooVSUlL09NNPS5K+/vprXXrppWrevLk6d+6s3Nxcl+9ZVlam3//+92rbtq0SEhI0ceJEHTx40Om89957TxkZGWrevLlatmypK6+8Ut9++63DObYx/fzzz7riiivUsmVLj4G1pzk+e/Zsde7cWZJ0++23y2KxKD09vdZ7esuWKvnhhx/q3HPPVVxcnM444wwtW7bM6dytW7dq7NixSkxMVHx8vC655BL961//cjrv+PHjmj17tk499VTFxcUpOTlZmZmZ+vnnn53OXbBggbp27arY2FhdeOGF+vzzzx1e9+fnFkDkYcUJQMT79ttvlZGRoYSEBN1xxx2KiYnRc889pwEDBmj16tW6+OKLJZn/sJw7d65+97vf6aKLLlJpaak2bdqkL7/8UoMHD5YkjR49Wt9++61uueUWpaena8+ePcrPz9f27dtr/UfohAkTNGXKFH3wwQcOtThff/21vvnmG917772SpM8//1zr16/XuHHjlJqaqsLCQs2fP18DBgzQd999p/j4+Dp/HpWVlbr66qu1bt06TZkyRaeffrq+/vpr/f3vf9ePP/5Ya7OAa665Rk8//bT+9a9/aezYsfbjZWVleueddzR58mRFRUVpz549GjJkiJKSknTnnXeqdevWKiwsdPkPaV9kZGTo0ksv1bx583TzzTcHbNXJarVq2LBh6tevn+bNm6fXXntN06dPV/PmzXX33XdrwoQJyszM1LPPPquJEyeqV69e6tKli8M9pk+frtatW2v27NnasmWL5s+fr19++cUexEnSK6+8okmTJmno0KF6+OGHVVZWpvnz56tv37766quvHObQb7/9pqFDh6pv37569NFHa/2992aOZ2ZmqnXr1rrtttvs6XctWrTw+NmUlZVp3759Tsdbt27tkFr6008/6ZprrtFNN92kSZMmaeHChRo7dqzef/99+8/Pr7/+qt69e6usrEy33nqr2rZtq5deeklXX3218vLyNGrUKPvvx1VXXaWPP/5Y48aN04wZM3T48GHl5+frm2++UdeuXe3vm5ubq8OHD+v3v/+9LBaL5s2bp8zMTG3dulUxMTGS/P+5BRBhDABoxBYuXGhIMj7//HO354wcOdJo2rSp8fPPP9uP7dq1y2jZsqXRr18/+7EePXoYV155pdv7HDx40JBkPPLIIz6P88CBA0ZsbKwxfvx4h+N33nmnIcnYsmWLYRiGUVZW5nTthg0bDEnGyy+/bD+2atUqQ5KxatUq+7HOnTsbkyZNcrq+f//+Rv/+/e3PX3nlFaNJkybG2rVrHc579tlnDUnGp59+6vb7qKysNFJSUozRo0c7HF+6dKkhyVizZo1hGIaxfPlyj78vvpg1a5Yhydi7d6+xevVqQ5Lx2GOP2V/v3Lmzw++dq8/HMAxj27ZthiRj4cKF9mOTJk0yJBkPPvig/djBgweNZs2aGRaLxViyZIn9+A8//GBIMmbNmmU/ZpuDF1xwgXHixAn78Xnz5hmSjBUrVhiGYRiHDx82Wrdubfzf//2fw5h2795ttGrVyuG4bUx33nmnV5+Pt3Pc9v17M4dt57r72rBhg/3czp07G5KMN998037s0KFDRnJysnHeeefZj+Xk5BiSHObe4cOHjS5duhjp6emG1Wo1DMMwXnzxRaffY5vKykqH8bVt29Y4cOCA/fUVK1YYkox33nnHMIy6/dwCiCyk6gGIaFarVR9++KFGjhypk08+2X48OTlZ2dnZWrdunUpLSyWZ/4P+7bff6qeffnJ5r2bNmqlp06YqKChwmYJVmzZt2uiKK67Q22+/raNHj0qSDMPQkiVL1LNnT5166qn297CpqKjQ/v371a1bN7Vu3VpffvmlT+/pzhtvvKHTTz9d3bt31759++xfl156qSRp1apVbq+1WCwaO3as3n33XR05csR+/PXXX1dKSor69u0rqaqGZuXKlS7TE+uiX79+GjhwoObNm6djx44F7L6/+93v7L9u3bq1TjvtNDVv3lxZWVn246eddppat26trVu3Ol0/ZcoU+wqHJN18882Kjo7Wu+++K0nKz89XSUmJxo8f7/C5R0VF6eKLL3b5ud98880ex+3LHPfHlClTlJ+f7/R1xhlnOJzXqVMn+4qRJHu64ldffaXdu3dLkt59911ddNFF9nkiSS1atNCUKVNUWFio7777TpL05ptvql27drrlllucxmNbvbO55ppr1KZNG/vzjIwMSbL/HtXl5xZAZCFwAhDR9u7dq7KyMp122mlOr51++umqrKzUjh07JEl//etfVVJSol
NPPVVnn322br/9dv33v/+1nx8bG6uHH35Y7733njp06GBP67L9o9CTCRMm6OjRo1qxYoUkaf369SosLHSoXTl27JjuvfdepaWlKTY2Vu3atVNSUpJKSkp06NChunwUdj/99JO+/fZbJSUlOXzZgrc9e/bUev0111yjY8eO6e2335YkHTlyRO+++67Gjh1r/0dt//79NXr0aM2ZM0ft2rXTiBEjtHDhQo81VN6aPXu2du/erWeffTYg94uLi1NSUpLDsVatWik1NdXpH+qtWrVy+Q/wU045xeF5ixYtlJycbK+jsQXkl156qdNn/+GHHzp97tHR0UpNTfU4dl/muD9OOeUUDRo0yOkrISHB4bxu3bo5fVa2OWX7DH755Re347S9Lkk///yzTjvtNK+6TJ500kkOz21BlO33qK4/twAiB4ETAHipX79++vnnn/Xiiy/qrLPO0j//+U+df/75+uc//2k/JycnRz/++KPmzp2ruLg43XPPPTr99NP11Vdfebz/VVddpVatWtmbC+Tm5ioqKkrjxo2zn3PLLbfogQceUFZWlpYuXaoPP/xQ+fn5atu2rSorK2u9f81/tNrYmjXYVFZW6uyzz3a5ipCfn6+pU6fW+j6XXHKJ0tPTtXTpUknSO++8o2PHjumaa65xGEteXp69zXpRUZFuuOEGXXDBBQ4rVf7q16+fBgwY4HbVydvPwiYqKsqn40Yte3K5Y/v9e+WVV1x+7raA2iY2NlZNmvDXuCfe/B7V5ecWQOSgOQSAiJaUlKT4+Hht2bLF6bUffvhBTZo0UVpamv1YYmKirr/+el1//fU6cuSI+vXrp9mzZzukcXXt2lV/+MMf9Ic//EE//fSTzj33XP3tb3/Tq6++WutYYmNjNWbMGL388sv69ddf9cYbb+jSSy9Vx44d7efk5eVp0qRJ+tvf/mY/dvz4cZWUlHj8Xtu0aePyvF9++cUhhatr1676z3/+o8suu8xtgOFJVlaWnnjiCZWWlur1119Xenq6LrnkEqfzLrnkEl1yySV64IEHlJubqwkTJmjJkiUOn6e/Zs+erQEDBui5555zes226lDz87CtaNSHn376SQMHDrQ/P3LkiIqLi+17INkaGrRv316DBg0K2Pv6Osfry//+9z8ZhuEwp3788UdJsjdg6Ny5s9tx2l6XzM/q3//+tyoqKhzSH+vC359bAJGD/6oCENGioqI0ZMgQrVixwqH18K+//qrc3Fz17dvXnnK0f/9+h2tbtGihbt262dPLysrKdPz4cYdzunbtqpYtW3qdgjZhwgRVVFTo97//vfbu3evUYjoqKsppNeMf//iH25WSmmPZuHGjTpw4YT+2cuVKpzStrKwsFRUV6fnnn3e6x7Fjx+w1WLW55pprVF5erpdeeknvv/++Qx2QZKZJ1fw+zj33XEly+Kx+/vlnl+2lvdG/f38NGDBADz/8sNPvS+fOnRUVFaU1a9Y4HH/mmWf8ei9vLFiwwKGea/78+frtt980bNgwSdLQoUOVkJCgBx980GXdl22PJV/5Msfr065du7R8+XL789LSUr388ss699xz7f85cMUVV+izzz7Thg0b7OcdPXpUCxYsUHp6ur1uavTo0dq3b5+eeuopp/fxdbUvED+3ACIDK04AIsKLL76o999/3+n4jBkzdP/99ys/P199+/bV1KlTFR0dreeee07l5eWaN2+e/dwzzjhDAwYM0AUXXKDExERt2rRJeXl5mj59uiTzf88vu+wyZWVl6YwzzlB0dLSWL1+uX3/91SHdrjb9+/dXamqqVqxYoWbNmikzM9Ph9auuukqvvPKKWrVqpTPOOEMbNmzQRx99pLZt23q89+9+9zvl5eXp8ssvV1ZWln7++We9+uqrDq2bJem6667T0qVLddNNN2nVqlXq06ePrFarfvjhBy1dulQffPCBevbsWet7nX/++erWrZvuvvtulZeXO6TpSdJLL72kZ555RqNGjVLXrl11+PBhPf/880pISLCvwEjSZZddJkl+76cza9Ysh1Uem1atWmns2LH6xz/+IYvFoq5du2rlypUe67fq4sSJE/b5sWXLFj3zzDPq27evrr76aklms4T58+fruuuu0/nnn69x48YpKSlJ27dv17/+9S/16dPHZaDgDW/nuD++/PJLl6syXbt2Va9evezPTz31VN144436/PPP1aFDB7344ov69ddftXDhQvs5d955pxYvXqxhw4bp1ltvVWJiol566SVt27ZNb775pj01ceLEiXr55Zc1c+ZMffbZZ8rIyNDRo0f10UcfaerUqRoxYoTX4w/Ezy2ACBHEjn4AUO9sraDdfe3YscMwDMP48ssvjaFDhxotWrQw4uPjjYEDBxrr1693uNf9999vXHTRRUbr1q2NZs2aGd27dzceeOABe4vpffv2GdOmTTO6d+9uNG/e3GjVqpVx8cUXG0uXLvVpzLfffrshycjKynJ67eDBg8b1119vtGvXzmjRooUxdOhQ44cffnBqNe6u3fbf/vY3IyUlxYiNjTX69OljbNq0yakduWEYxokTJ4yHH37YOPPMM43Y2FijTZs2xgUXXGDMmTPHOHTokFffx913321IMrp16+b02pdffmmMHz/eOOmkk4zY2Fijffv2xlVXXWVs2rTJ4bzOnTsbnTt39vhe1duR19S/f39DklMr+b179xqjR4824uPjjTZt2hi///3vjW+++cZlO/LmzZu7vO+ZZ57pdLxm63PbHFy9erUxZcoUo02bNkaLFi2MCRMmGPv373e6ftWqVcbQoUONVq1aGXFxcUbXrl2NyZMnO3w27sZUG2/meCDbkVefj7bP5IMPPjDOOeccIzY21ujevbvxxhtvON33559/NsaMGWO0bt3aiIuLMy666CJj5cqVTueVlZUZd999t9GlSxcjJibG6NixozFmzBh7y/XavhdVaxkfqJ9bAI2fxTD8qGAFAADwUnp6us466yytXLky2EMBAL9R4wQAAAAAHhA4AQAAAIAHBE4AAAAA4AE1TgAAAADgAStOAAAAAOABgRMAAAAAeBBxG+BWVlZq165datmypSwWS7CHAwAAACBIDMPQ4cOH1alTJ/sm2+5EXOC0a9cupaWlBXsYAAAAAELEjh07lJqaWus5ERc4tWzZUpL54SQkJAR5NFJFRYU+/PBDDRkyRDExMcEeDsIE8wb+YN7AX8wd+IN5A3809LwpLS1VWlqaPUaoTcQFTrb0vISEhJAJnOLj45WQkMAfKvAa8wb+YN7AX8wd+IN5A38Ea954U8JDcwgAAAAA8IDACQAAAAA8IHACAAAAAA8InAAAAADAAwInAAAAAPCAwAkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMAJAAAAADwgcAIAAAAADwicAAAAAMADAicAAAAA8IDAKYisVmn1aovWrEnR6tUWWa3BHhEAAAAAVwicgmTZMik9XRo8OFqPPdZTgwdHKz3dPA4AAAAgtBA4BcGyZdKYMdLOnY7Hi4rM4wRPAAAAQGghcGpgVqs0Y4ZkGM6v2Y7l5Ii0P
QAAACCEEDg1sLVrnVeaqjMMaccO8zwAAAAAoYHAqYEVFwf2PAAAAAD1j8CpgSUnB/Y8AAAAAPWPwKmBZWRIqamSxeL6dYtFSkszzwMAAAAQGgicGlhUlPTEE+avawZPtuePP26eBwAAACA0EDgFQWamlJcnpaQ4Hm/Z0jyemRmccQEAAABwjcApSDIzpcJCKT//N11xxVZJUuvW0siRwRwVAAAAAFcInIIoKkrq39/QpEnfKiHB0Pbt0po1wR4VAAAAgJoInEJAbGylxowxd7996aUgDwYAAACAEwKnEHHddZWSzBqno0eDPBgAAAAADgicQkTv3oZOPlk6ckRavjzYowEAAABQHYFTiLBYpIkTzV+TrgcAAACEFgKnEGILnD7+WNqxI7hjAQAAAFCFwCmEdOki9esnGYb02mvBHg0AAAAAm6AGTnPnztWFF16oli1bqn379ho5cqS2bNni8bqSkhJNmzZNycnJio2N1amnnqp33323AUZc/6qn6xlGcMcCAAAAwBTUwGn16tWaNm2aNm7cqPz8fFVUVGjIkCE6WktbuRMnTmjw4MEqLCxUXl6etmzZoueff14pKSkNOPL6M3as1KyZ9MMP0rPPSosXSwUFktUa7JEBAAAAkSs6mG/+/vvvOzxftGiR2rdvry+++EL9+vVzec2LL76oAwcOaP369YqJiZEkpaen1/dQG0xCgtSzp7R2rTR1atXx1FTpiSekzMzgjQ0AAACIVEENnGo6dOiQJCkxMdHtOW+//bZ69eqladOmacWKFUpKSlJ2drb+9Kc/KSoqyun88vJylZeX25+XlpZKkioqKlRRURHg78B3tjHYHpcvt2jt2ihJFofziooMjRkjLVli1ahR5PBFuprzBvAG8wb+Yu7AH8wb+KOh540v72MxjNCopKmsrNTVV1+tkpISrVu3zu153bt3V2FhoSZMmKCpU6fqf//7n6ZOnapbb71Vs2bNcjp/9uzZmjNnjtPx3NxcxcfHB/R7qCurVZoyZYj2749TzcDJZKhdu2N67rl8uYgRAQAAAPigrKxM2dnZOnTokBISEmo9N2QCp5tvvlnvvfee1q1bp9TUVLfnnXrqqTp+/Li2bdtmX2F67LHH9Mgjj6i4uNjpfFcrTmlpadq3b5/HD6chVFRUKD8/X4MHD9b69U01eLDnRcD8/N/Uv39I/LYhSKrPG1vKKuAJ8wb+Yu7AH8wb+KOh501paanatWvnVeAUEql606dP18qVK7VmzZpagyZJSk5OVkxMjENa3umnn67du3frxIkTatq0qcP5sbGxio2NdbpPTExMSP0Qx8TEaO9e73479u6NVggNHUEUavMY4YF5A38xd+AP5g380VDzxpf3CGpXPcMwNH36dC1fvlyffPKJunTp4vGaPn366H//+58qKyvtx3788UclJyc7BU3hJjk5sOcBAAAACIygBk7Tpk3Tq6++qtzcXLVs2VK7d+/W7t27dezYMfs5EydO1F133WV/fvPNN+vAgQOaMWOGfvzxR/3rX//Sgw8+qGnTpgXjWwiojAyze57FVXmTzONpaeZ5AAAAABpOUAOn+fPn69ChQxowYICSk5PtX6+//rr9nO3btzvULqWlpemDDz7Q559/rnPOOUe33nqrZsyYoTvvvDMY30JARUWZLccl98HT44+LxhAAAABAAwtqjZM3fSkKCgqcjvXq1UsbN26shxEFX2amlJcnzZgh7dxZdbxFC+mll9jHCQAAAAiGoK44wbXMTKmwUFq1SrItpEVHS1dcEdRhAQAAABGLwClERUVJAwZI999v1j2VlEhvvx3sUQEAAACRicApxEVFSRMnmr9etCioQwEAAAAiFoFTGJg82Xz84AOpqCioQwEAAAAiEoFTGDjlFKlvX6myUnrllWCPBgAAAIg8BE5h4vrrzceFCyUvmhECAAAACCACpzAxdqwUHy/9+KPUSDuxAwAAACGLwClMtGwpjRlj/nrhwuCOBQAAAIg0BE5hxJaut2SJVFYW3LEAAAAAkSQ62AOA9/r1k7p0kbZtkx54QDrrLCk5WcrIMNuWAwAAAKgfBE5hpEkT6aKLzMDpwQerjqemSk88IWVmBm9sAAAAQGNGql4YWbZMWrrU+XhRkVn/tGxZw48JAAAAiAQETmHCapVmzHDditx2LCfHPA8AAABAYBE4hYm1a6WdO92/bhjSjh3meQAAAAACi8ApTBQXB/Y8AAAAAN4jcAoTycmBPQ8AAACA9wicwkRGhtk9z2Jxf05amnkeAAAAgMAicAoTUVFmy3HJffA0Ywb7OQEAAAD1gcApjGRmSnl5UkqK4/G4OPPxqaekPXukggJp8WLzkS57AAAAQN2xAW6YycyURowwu+cVF5s1TWeeKV1yibR1q3TSSVJ5edX5bI4LAAAA1B0rTmEoKkoaMEAaP958TEqSpk0zX6seNElsjgsAAAAEAoFTI2C1Sn//u+vX2BwXAAAAqDsCp0aAzXEBAACA+kXg1AiwOS4AAABQvwicGgE2xwUAAADqF4FTI8DmuAAAAED9InBqBLzZHPeee9gcFwAAAPAXgVMj4W5z3JgY8/HVV6UTJ9gcFwAAAPAHG+A2Iq42x+3YUbrwQmnNGnO/p9LSqvPZHBcAAADwDitOjUzNzXG7d5duuMF8rXrQJLE5LgAAAOAtAqdGzmp1HxixOS4AAADgHQKnRo7NcQEAAIC6I3Bq5NgcFwAAAKg7AqdGjs1xAQAAgLqjq14jZ9sct6ioqqapptRUqXdvs0W5rRtfRgb7PgEAAAA2BE6NnG1z3DFjzM1xXQVPMTHSySebwZUNrcoBAACAKqTqRQB3m+MmJUnR0dK2bY5Bk0SrcgAAAKA6AqcIkZkpFRZKq1ZJubnm486dUps2rs+nVTkAAABQhVS9CGLbHNemoEDau9f9+dVblVe/DgAAAIg0BE4RzNsW5EVFNI4AAABAZCNwimDetiC/7TbHlSkaRwAAACDSUOMUwWytyi2W2s+rmc5H4wgAAABEGgKnCGZrVS55Dp6qo3EEAAAAIg2BU4SrrVV5bao3jgAAAAAaO2qcoMxMacQIMwiyNYAoKpKuvdbztd42mAAAAADCGYETJLluVe4NbxtMAAAAAOGMVD245E3jCFtrcgAAAKCxI3CCS940jvjtN7POqaBAWrzYfKRZBAAAABojUvXglq1xxIwZ0s6dVcc7dZIqK6Xdu6Vu3RyDJfZ4AgAAQGPEihNqlZkpFRZKq1ZJubnm4/bt0pw55us1V5jY4wkAAACNEStO8Khm4wirVbrvPtfnGoaZ2peTY3bqi4pqiBECAAAA9YsVJ/hs7VrH1L2aqu/xZLVSAwUAAIDwx4oTfObt3k0rVkjXXecYZFEDBQAAgHDEihN85u3eTY8/7rwyRQ0UAAAAwlFQA6e5c+fqwgsvVMuWLdW+fXuNHDlSW7Zs8fr6JUuWyGKxaOTIkfU3SDjxZo8ndwzDfMzJkU6cII0PAAAA4SGogdPq1as1bdo0bdy4Ufn5+aqoqNCQIUN09OhRj9cWFhbqj3/8ozLYgbXB1bbH
kzfBlK0GKjVVGjhQys42H9PTWYkCAABAaApq4PT+++9r8uTJOvPMM9WjRw8tWrRI27dv1xdffFHrdVarVRMmTNCcOXN08sknN9BoUZ1tj6eUFMfjqanmapI39u51fE4aHwAAAEJVSDWHOHTokCQpMTGx1vP++te/qn379rrxxhu1du3aWs8tLy9XeXm5/XlpaakkqaKiQhUVFXUccd3ZxhAKY/HV8OHSFVdI69ZZVFxs1j717Wto3TqLHn/c96lltjI3NGOGdMUVv9HKvBbhPG8QPMwb+Iu5A38wb+CPhp43vryPxTBsVSfBVVlZqauvvlolJSVat26d2/PWrVuncePGafPmzWrXrp0mT56skpISvfXWWy7Pnz17tubYdmutJjc3V/Hx8YEaPqqxWqUpU4Zo//44SX4UQkmaM2edmjSRDh6MU5s2x3XGGfsJpAAAABBQZWVlys7O1qFDh5SQkFDruSETON1888167733tG7dOqWmpro85/DhwzrnnHP0zDPPaNiwYZLkMXByteKUlpamffv2efxwGkJFRYXy8/M1ePBgxcTEBHs4AbN8uUXjxpmRjmFUD54MeRNMJSYaOnCg6ryUFEOPPWbVqFEhMV2DrrHOG9Qv5g38xdyBP5g38EdDz5vS0lK1a9fOq8ApJFL1pk+frpUrV2rNmjVugyZJ+vnnn1VYWKjhw4fbj1VWVkqSoqOjtWXLFnXt2tXhmtjYWMXGxjrdKyYmJqR+iENtPHWVlSVFR0szZji2JE9KsjjVNrlSPWiSpF27LBo3Llp5edKIEebmurb0wIwMRexqVGObN2gYzBv4i7kDfzBv4I+Gmje+vEdQAyfDMHTLLbdo+fLlKigoUJcuXWo9v3v37vr6668djv3lL3/R4cOH9cQTTygtLa0+hwsfZWY6Bzm9e0tdu5qNIHxZ6zTrn6QpU5yDMTbVBQAAQH0LauA0bdo05ebmasWKFWrZsqV2794tSWrVqpWaNWsmSZo4caJSUlI0d+5cxcXF6ayzznK4R+vWrSXJ6ThCQ1SUNGCA47EnnjC751ksvgdP+/c7H7d148vLI3gCAABA/QhqO/L58+fr0KFDGjBggJKTk+1fr7/+uv2c7du3q7i4OIijRKC5a2XuoZmiW2yqCwAAgPoW9FQ9TwoKCmp9fdGiRYEZDBqUqzQ+q1UaNMi/+1XfVLd6DRVpfAAAAAiEkGgOgchUM43PajUDHV/rn6pzt6kuaXwAAACoi6Cm6gHVRUWZq0OSWf8UCNXT+EjbAwAAgL8InBBS3NU/paZKbdv6F1DZ0vjWrg3MGAEAABB5SNVDyHFV/5SRIa1Y4V83PpuiIrNhBPs/AQAAwFcETghJrtqY21ajnDfVda5tcuW222gcAQAAAP+QqoewkpkpFRZKq1ZJubnm486dZhDkKY3PXeOIZcvqbbgAAABoJAicEHZsq1Hjx5uPTZv611SCxhEAAADwFoETGgV3TSWSkmq/ztY4oqCAjXMBAADgHjVOaDRcNZUoKpKuvdbztVlZ0oEDVc+pfwIAAEB1BE5oVGo2lSgo8O666kGTxMa5AAAAcESqHhq1jAzvGkfUVL3+6cQJ0vgAAAAiHStOaNSiosyUO3/2f7LVP6Wmum9jbrU67zfF3lAAAACNDytOaPTcNY5ITPTuendtzO+4Q0pPlwYOlLKzzcf0dNqbAwAANEasOCEiuGocYbVKgwb5fi/bqtUjjzi/VrM2ihUpAACAxoHACRGjZuMIq9VMuysq8i2FrzaGYaYE5uRIlZXSbbeZG/Ta0K0PAAAgPJGqh4hlq3+SfG8eURtbbdTYsY5Bk1S1IkU6HwAAQHghcEJE83fjXH9V79ZHdz4AAIDwQeCEiJeZKRUWSqtWSbm55uPOnf61MfeGbUVq7drA3xsAAAD1gxonQM71T5L/bcy9VVRk7gtF4wgAAIDQx4oT4Ia7NL60NOn2282AquaKlC8rVLfdRitzAACAcEHgBNTCVRrftm3SvHmug6rUVGnpUu/S/NztD0XwBAAAEHpI1QM8cJXGJ7neG8qWbhcV5XuaX/VW5lddJa1fTxofAABAqCBwAuqgtqAqL0+aMcOxJXlSkvNKU3W2xhGpqY7nsf8TAABAcJGqB9QTV2l+f/+7d9eSxgcAABBaWHEC6lHNFamCAv/uUzONb80ai9asSVHz5hYNHEgaHwAAQH0jcAIaUEaGmXZXVOR7i3PHNL5oST312GOOaXxWq+uaKwAAANQNqXpAA4qKMoMcyf/Ndd2l8d1xh9nSnBbnAAAAgUfgBDQwd/tDJSX5dz/DML8eecSxEYVEbRQAAECgEDgBQeCqccTOnd7t/+QLWzpgTo6ZxgcAAAD/UOMEBImrVuZPPOH7/k+e2Gqj1q513TodAAAAnrHiBISQQKfxVVdcXPd7AAAARCpWnIAQk5kpjRjh2B2vd2+pa1f/uvHZtG9vtkOn4x4AAIDvCJyAEBToNL64OGnSJDPwsqnexhwAAAC1I1UPCBPu0vjS0qTbbzcDKneNJY4fdwyaJMeOe1aruRq1eLH5WL2RRG2vAQAARApWnIAwYkvjW7XqN7333mYNG3auBg6MVlSUdMkl0owZji3JU1OlkhLpyBHnexmGGWhNmeL6Ott+U+5eY6UKAABEEgInIMxERUn9+xs6erRI/fv3sNcpuaqNslqlQYPc38swpP37nY8XFUmjR7u+xrZSlZdH8AQAACIHgRPQiNSsjVq82L/71FZDZVupyskxAzUaTAAAgEhAjRPQiCUn1899q+8NRQ0UAACIBAROQCOWkWHWJLlrGlFXK1ZI6enSwIFSdrb5mJ5uNpwAAABoTAicgEYsKqqqyUN9BE+PP+7YOEJy7NYHAADQWBA4AY2cuzbmqalS27aBD6hs9VE5OdKJE6TxAQCAxoHmEEAEcNVxLyPDTLVztalu9ee1veaOrQYqNVXau7fqOK3MAQBAuGLFCYgQto5748ebj1FRta9Gvfmm+eXqtZwc796zetAkkcYHAADCFytOQIRztxplazPu6rW1a836Jl/RyhwAAIQrAicATvs/eXrN1q2vqMhz2l5N1VuZu3tPAACAUEPgBMBntm59ruqjvFVUZDaMcLXKZbW6XwEDAAAIBgInAH6x1UfNmOHYkjwpybm2yZXbbnPdOEJyvidNJQAAQLAROAHwm6v6qN69pa5dPafxuWocMXq063NtTSXy8gieAABAcNBVD0Cd1OzW17Spf5vu1hZkVd8bir2gAABAMBA4AQg4d23Ok5L8v2f1phIAAAANjVQ9APXCVRpfUZF07bV1u29xcWDGBwAA4AsCJwD1pmYr84KCut+zfXu68QEAgIZH4ASgwdRl/ydJio6WrrvOcdWJbnwAAKAhUOMEoMHY9n+SnBtHVH/urqnEb785p+rZuvGNHu0YNNleGzNGWrasbuMGAAAIauA0d+5cXXjhhWrZsqXat2+vkSNHasuWLbVe8/zzzysjI0Nt2rRRmzZtNGjQIH322WcNNGIAdeWucURqqvTmm+aXq9dat3Z9P7rxAQCAhhDUwGn16tW
aNm2aNm7cqPz8fFVUVGjIkCE6evSo22sKCgo0fvx4rVq1Shs2bFBaWpqGDBmioqKiBhw5gLrIzJQKC6VVq6TcXPNx2zbzuKvXFi2SSkr8ey+68QEAgEAIao3T+++/7/B80aJFat++vb744gv169fP5TWvvfaaw/N//vOfevPNN/Xxxx9r4sSJ9TZWAIFVs3FEba8tXlz39ysqct9UAgAAwJOQag5x6NAhSVJiYqLX15SVlamiosLtNeXl5SovL7c/Ly0tlSRVVFSooqKiDqMNDNsYQmEsCB+RNm+Skiyq6x9XOTmG9u2rKp5KSTH02GNWjRrlR5eKMBVp8waBw9yBP5g38EdDzxtf3sdiGP70tgq8yspKXX311SopKdG6deu8vm7q1Kn64IMP9O233youLs7p9dmzZ2vOnDlOx3NzcxUfH1+nMQNoGFarNGXKEO3fHyfJVecI2x9jbrpKuHzdPPanP32uiy4q1nfftdXBg3Fq0+a4zjhjP6tRAABEgLKyMmVnZ+vQoUNKSEio9dyQCZxuvvlmvffee1q3bp1SU1O9uuahhx7SvHnzVFBQoHPOOcflOa5WnNLS0rRv3z6PH05DqKioUH5+vgYPHqyYmJhgDwdhIhLnzfLlFo0bZ0YzhlEVAFkshr0JhMXi+JqngMpiMZSYKMXFSUVFjX81KhLnDQKDuQN/MG/gj4aeN6WlpWrXrp1XgVNIpOpNnz5dK1eu1Jo1a7wOmh599FE99NBD+uijj9wGTZIUGxur2NhYp+MxMTEh9UMcauNBeIikeZOVZe7j5LxXk0WPP27+uuZrSUkW7d3r/p6GYdH+/c7Hd+2yaNy4aOXlNc49oCJp3iCwmDvwB/MG/mioeePLewQ1cDIMQ7fccouWL1+ugoICdenSxavr5s2bpwceeEAffPCBevbsWc+jBBAqMjOlESPMDnmumjzUfK2oSLr2Wt/fxzDM1aucHPOepO0BAICgBk7Tpk1Tbm6uVqxYoZYtW2r37t2SpFatWqlZs2aSpIkTJyolJUVz586VJD388MO69957lZubq/T0dPs1LVq0UIsWLYLzjQBoML504yso8P99bG3MCwrM+9KNDwCAyBbUfZzmz5+vQ4cOacCAAUpOTrZ/vf766/Zztm/fruLiYodrTpw4oTFjxjhc8+ijjwbjWwAQwjIyzM1zLe56RnghK0saOFDKzjYf09OlZcsCNkQAABAmgp6q50lBjf8yLiwsrJ/BAGh0oqKkJ56QxoyxNY7w/R4HDjg+Lyoy79dY658AAIBrQV1xAoD6lplpBjkpKY7HU1Oltm19X42yBV85OWabdAAAEBkInAA0epmZUmGhtGqVlJtrPhYWSgsWmK/7Ezzt2GE2orBazTqoxYvNR4IpAAAap5BoRw4A9c1VUwnbalTNNuaJic4peq6sWCFdd13N9uhmemBmphlEuesACAAAwguBE4CI5qrFudUqDRrk+Vrb/lHV2Wqg/vhHcxXKXVAFAADCC4ETgIhXczXKajWDnKIi3xtK2M5/5BHn12gsAQBA+KLGCQBqsHXjk5zrn+rS2pzGEgAAhC8CJwBwobZufDk5/t+3emMJAAAQPgicAMANV934tm0za6Lqqtq+3gAAIAxQ4wQAtXDVjS8jw/8aKJv27c325XTcAwAgPBA4AYCPbDVQY8aYNU++Bk9xcdKkSWbgZUPHPQAAQhupegDgB3c1UGlp0u23mwGVu0YSx487Bk1SVce9ZcvYVBcAgFDEihMA+MnVHlC2lLtLLnHeWDc1VSopkY4ccb6XYZiB1pQprq9jNQoAgOAicAKAOnBVAyX5t7GuYUj79zsfZ/8nAACCj8AJAOpJzaBq8WL/7mNbjcrJMYMxmkgAANDwCJwAoIEkJ/t/rW3/p4ICM3CiGx8AAA2LwAkAGkgg2phnZUkHDlQ9r17/ZLW6rrcCAAB1R1c9AGggtjbmkvuOe55UD5qkqvqnO+6Q0tOlgQOl7GzzMT3d7NIHAADqjsAJABqQuzbmqalS27a+B1SGYX498ohjJz7JscU5AACoGwInAGhgmZlSYaG0apWUm2s+FhZKCxaYr/u7GlWTLR0wJ0c6cUJavdqiNWtStHq1hb2hAADwETVOABAErtqY21ajau7jlJjonKLnLVtTidRUae/eaEk99dhj7A0FAICvWHECgBDiajVq6dK633fvXsfnpPEBAOCbOq04HT9+XHFxcYEaCwBAzqtRVmvdu/HVxN5QAAD4xucVp8rKSt13331KSUlRixYttHXrVknSPffcoxdeeCHgAwSASBeIbnyuVN8bqqDA3KC3oEDUPwEA4ILPgdP999+vRYsWad68eWratKn9+FlnnaV//vOfAR0cAMDkrhtfWpp0++1mQOVvUJWVRRtzAAA88Tlwevnll7VgwQJNmDBBUdVyO3r06KEffvghoIMDAFRxVf+0bZs0b57roCopybv7utsbiuAJAIAqPtc4FRUVqVu3bk7HKysrVVFREZBBAQBcc9WNTzKDqhEjpLVrpeJiKTlZ6t1b6trV99qomvVPkuN9MzKoiQIARB6fA6czzjhDa9euVefOnR2O5+Xl6bzzzgvYwAAAvnEVVD3xhLl6ZLH4Hjzt2CE98ID0/POO7dFpZQ4AiEQ+B0733nuvJk2apKKiIlVWVmrZsmXasmWLXn75Za1cubI+xggA8FNd94aaNcv5mC2VLy+P4AkAEDl8rnEaMWKE3nnnHX300Udq3ry57r33Xn3//fd65513NHjw4PoYIwCgDmy1Ufn5v2nmzE3Kz/+tTntD2VaucnKkEyfoyAcAiAx+7eOUkZGh/Pz8QI8FAFBPoqKk/v0NHT1apP79e6hJk7rtDWVL5UtNddxclzQ+AEBj5fOKEwAg/NW2N5Qvbc2rB00SHfkAAI2Xz4FTkyZNFBUV5fYLABAe3O0NlZoqzZnj3z1J4wMANFY+p+otX77c4XlFRYW++uorvfTSS5rj79+0AICgcNXGPCPDfO355/1L5SONDwDQGPkcOI2wbepRzZgxY3TmmWfq9ddf14033hiQgQEAGoa7vaH8bWVu4y6Nj258AIBwFLAap0suuUQff/xxoG4HAAgyd6l8SUn+3a96Gh9pewCAcONXV72ajh07pieffFIpNf92BQCENVepfL17S1271i2Nr6DAXOmqnh5ImSwAIJT5HDi1adNGlmotlwzD0OHDhxUfH69XX301oIMDAASfq1S+uqbxZWU5bsBbvf7JanWuuSKoAgAEm8+B09///neHwKlJkyZKSkrSxRdfrDZt2gR0cACA0GRL45sxQ9q5s+p4UpJzbZMr1YMmqar+6Y9/NLvwVb8nTSUAAKHA58Bp8uTJ9TAMAEC4CWQan+3cRx5xfo2mEgCAUOBV4PTf//7X6xuec845fg8GABBe6iONrybDMO+VkyNddZW0fj1pfACAhudV4HTuuefKYrHI8PA3oMVikZVWSQAQ0dyl8SUmOqfoeYu9oQAAweZV4LRt27b6HgcAoBFxlcZntUqDBtXtvuwNBQAIFq8Cp86dO9f3OAAAjUzNND
6r1Vwh8qeNuTvV0/hGjCBtDwBQf/zex+m7777T9u3bdeLECYfjV199dZ0HBQBofKKiAl//JFWl8a1d61xvBQBAoPgcOG3dulWjRo3S119/7VD3ZGtRTo0TAMAdd/VPaWnSuHHSo4+az/0JqoqKzI11aRwBAKgPTXy9YMaMGerSpYv27Nmj+Ph4ffvtt1qzZo169uypgoKCehgiAKAxycyUCgulVauk3Fzzcds2ad48M6hKSXE8PynJu/vedps0cKCUnW0+pqdLy5YFevQAgEjl84rThg0b9Mknn6hdu3Zq0qSJmjRpor59+2ru3Lm69dZb9dVXX9XHOAEAjYirNuZS3faGonEEAKA++bziZLVa1bJlS0lSu3bttGvXLklmA4ktW7YEdnQAgIhjC6rGjzcfmzY1a6MkszbKW7YgKydHOnHCTONbvNh8JKscAOArn1eczjrrLP3nP/9Rly5ddPHFF2vevHlq2rSpFixYoJNPPrk+xggAiHDuaqOSkpxXmqpj/ycAQKD4HDj95S9/0dGjRyVJf/3rX3XVVVcpIyNDbdu21euvvx7wAQIAILlO4ysqkq691vO1pPEBAOrK68CpZ8+e+t3vfqfs7GwlJCRIkrp166YffvhBBw4cUJs2beyd9QAAqA81a6P87UnE/k8AAF95XePUo0cP3XHHHUpOTtbEiRMdOuglJiYSNAEAGlxGhpl2589fQbY0voIC6p8AAJ55HTi98MIL2r17t55++mlt375dl112mbp166YHH3xQRUVF9TlGAABcsm2qK/kXPElSVhZtzAEAnvnUVS8+Pl6TJ09WQUGBfvzxR40bN07PPfec0tPTdeWVV2qZj3/TzJ07VxdeeKFatmyp9u3ba+TIkV515nvjjTfUvXt3xcXF6eyzz9a7777r0/sCABoPW+MIf/d/OnDA8bmt/ongCQBQnc/tyG26du2q+++/X4WFhVq8eLE2btyosWPH+nSP1atXa9q0adq4caPy8/NVUVGhIUOG2JtPuLJ+/XqNHz9eN954o7766iuNHDlSI0eO1DfffOPvtwIACHOuNtXdudO/NL7qbcytVvOLVD4AgM9d9aorKCjQwoUL9eabbyo6Olr/93//59P177//vsPzRYsWqX379vriiy/Ur18/l9c88cQTuvzyy3X77bdLku677z7l5+frqaee0rPPPuvfNwIACHuuNtV94glz9chiqX3z3Jps9U8PPCA9/7xjC3RamQNAZPI5cNq5c6cWLVqkRYsWaevWrcrIyNAzzzyjsWPHqlmzZnUazKFDhySZzSbc2bBhg2bOnOlwbOjQoXrrrbdcnl9eXq7y8nL789LSUklSRUWFKioq6jTeQLCNIRTGgvDBvIE/InHeDB8uLVli0cyZUSoqqlp6Skw0dOCA56WoWbNs0VbVuUVFhsaMkZYssWrUKB+isTAWiXMHdce8gT8aet748j4Ww/Du/+CWLl2qF198UR9//LHat2+vSZMm6YYbblC3bt38Hmh1lZWVuvrqq1VSUqJ169a5Pa9p06Z66aWXNH78ePuxZ555RnPmzNGvv/7qdP7s2bM1Z84cp+O5ubmKj48PyNgBAKHNapW++66tDh6MU5s2x1VZKc2a1deLKw1VD5qqH2/X7pieey6fVuYAEMbKysqUnZ2tQ4cO2bdccsfrFadrr71WV155pZYvX64rrrhCTZr4XR7l0rRp0/TNN9/UGjT546677nJYoSotLVVaWpqGDBni8cNpCBUVFcrPz9fgwYMVExMT7OEgTDBv4I9InzfDh1f92mqVFiwwtGuXZBiuAyMzYHK3KmXRvn3xat78SkVFVW3I27ev0SgDqUifO/AP8wb+aOh5Y8tG84bXgdPOnTvVvn17vwbkyfTp07Vy5UqtWbNGqamptZ7bsWNHp5WlX3/9VR07dnR5fmxsrGJjY52Ox8TEhNQPcaiNB+GBeQN/MG+kmBjpySdd1z+Zz73rKJGdHe3Qla+x1z8xd+AP5g380VDzxpf38HrZqD6CJsMwNH36dC1fvlyffPKJunTp4vGaXr166eOPP3Y4lp+fr169egV8fACAxstdG/PUVMlFhrdLtDIHgMhRp656dTVt2jTl5uZqxYoVatmypXbv3i1JatWqlb3RxMSJE5WSkqK5c+dKkmbMmKH+/fvrb3/7m6688kotWbJEmzZt0oIFC4L2fQAAwlNmpjRihLR2bVW6XUaG+drzz5uBkK/d+CwWs5X5VVdJ69c73rcxpvEBQKQIauA0f/58SdKAGv1jFy5cqMmTJ0uStm/f7lBP1bt3b+Xm5uovf/mL/vznP+uUU07RW2+9pbPOOquhhg0AaERctTGX6t7KPDVV2ru36nhjT+MDgMYuqIGTNw39CgoKnI6NHTvW5812AQDwhS2Vb8YMx32cEhOdU/RcqR40SVVpfHl5BE8AEI58bo23Y8cO7az2N8hnn32mnJwcUuUAAI1OZqZUWCitWiXl5pqPS5f6dy/b/xXm5Jhd/QAA4cXnFafs7GxNmTJF1113nXbv3q3BgwfrzDPP1Guvvabdu3fr3nvvrY9xAgAQFDVT+axWM+3O1/onqSqNr6BADm3MqX8CgNDn84rTN998o4suukiSuSnuWWedpfXr1+u1117TokWLAj0+AABCSlSUWaskmfVP/sjKkgYOlLKzzcf0dDrxAUCo8zlwqqiosO+L9NFHH+nqq6+WJHXv3l3FxcWBHR0AACHIXSvzpCTvrqeNOQCEH58DpzPPPFPPPvus1q5dq/z8fF1++eWSpF27dqlt27YBHyAAAKHIVf3Tzp1mGp+vK1HUPwFA6PO5xunhhx/WqFGj9Mgjj2jSpEnq0aOHJOntt9+2p/ABABAJXLUyr2sbc+qfACA0+Rw4DRgwQPv27VNpaanatGljPz5lyhTFx8cHdHAAAISburYxz8pyPI/9nwAgNPicqnfs2DGVl5fbg6ZffvlFjz/+uLZs2aL27dsHfIAAAISburQxp/4JAEKTzytOI0aMUGZmpm666SaVlJTo4osvVkxMjPbt26fHHntMN998c32MEwCAsBKoNuaGYab95eRIV10lrV9PGh8ABIPPK05ffvmlMjIyJEl5eXnq0KGDfvnlF7388st68sknAz5AAAAag7q0MbfVP6Wm0sYcAILF58CprKxMLVu2lCR9+OGHyszMVJMmTXTJJZfol19+CfgAAQBoLNy1MU9M9O76vXsdn5PGBwANx+fAqVu3bnrrrbe0Y8cOffDBBxoyZIgkac+ePUpISAj4AAEAaEzqUv9UE23MAaDh+FzjdO+99yo7O1u33XabLr30UvXq1UuSufp03nnnBXyAAAA0NoGqf5JoYw4ADcXnwGnMmDHq27eviouL7Xs4SdJll12mUaNGBXRwAABEAlv9kz/7P9nQxhwA6pfPqXqS1LFjR5133nnatWuXdv7/TSouuugide/ePaCDAwAgUrirf0pK8u562pgDQP3yOXCqrKzUX//6V7Vq1UqdO3dW586d1bp1a913332qrKysjzECABARXNU/7dxprh7504lPMuufT
pwwU/kWLzYfqYcCAN/5nKp3991364UXXtBDDz2kPn36SJLWrVun2bNn6/jx43rggQcCPkgAACJFzfonyf80vuptzKt35CONDwB85/OK00svvaR//vOfuvnmm3XOOefonHPO0dSpU/X8889r0aJF9TBEAAAiG23MASD4fA6cDhw44LKWqXv37jpQM8EaAAAEBG3MASC4fE7V69Gjh5566ik9+eSTDsefeuophy57AAAgsOqjjfnatWbr8rVraWUOALXxOXCaN2+errzySn300Uf2PZw2bNigHTt26N133w34AAEAgGuBaGO+YoV03XVmEwobaqAAwJnPqXr9+/fXjz/+qFGjRqmkpEQlJSXKzMzUli1blJGRUR9jBAAAbtS1jfnjjzsGTRI1UADgis8rTpLUqVMnp+55O3fu1JQpU7RgwYKADAwAAHgnM1MaMcIx3a53b6lrV//T+CwWswZqxIh6GTIAhB2/NsB1Zf/+/XrhhRcCdTsAAOADW/3T+PHmY9OmZrqd5LwHlDd7QtlqoAoKpNWrLVqzJkWrV1toJgEgYgUscAIAAKHFXRpfaqq5muSNrCxp8OBoPfZYTw0eHK30dFL4AEQmAicAABoxV23Mt23zPgWv5k4j1D8BiFR+1TgBAIDwUbONuWS2HPenlXnN+ifalgOIFF4HTpkeepKWlJTUdSwAAKCB1KWVefX6p6go9n8CEBm8DpxatWrl8fWJEyfWeUAAAKBh2GqgZsxwbEmemOicoudKVpbjeez/BKAx8zpwWrhwYX2OAwAABIGrVuZWqzRokOdr3dU/5eURPAFofKhxAgAgwtWsgbJaqX8CgJoInAAAgAPqnwDAGe3IAQCAE3d7QCUmend9VpY0cKCUnW0+sv8TgHBH4AQAAFyy7QGVn/+bZs7cpPz837R0qXfXsv8TgMaGVD0AAOBWVJTUv7+ho0eL1L9/DzVpUvf6p6uuktavJ40PQHghcAIAAF4LRP1Taqq0d2/VcdqYAwgHpOoBAACf1LX+qXrQJJHGByA8sOIEAAB8Vpf9n2oijQ9AOCBwAgAAfgnU/k8SaXwAQh+pegAAICBs9U+SuYLkD9L4AIQqAicAABAw7uqfkpL8u59t5Sonx1zRAoBgIXACAAABZdv/adUqKTfXfNy500y782clypbGt3ZtwIcKAF6jxgkAAARczfonyf825jbFxQEZGgD4hRUnAADQIOqaxte+vVRQIC1ebD6SugegIbHiBAAAGoyrNua9e0tdu9beja9FC2nyZDPlz4aOewAaEoETAABoUP6k8R05Yn5VZ+u4l5dH8ASg/pGqBwAAgs5dGl9KihQX5/qa6h33TpwgjQ9A/WLFCQAAhARXaXxWqzRokPtr2DgXQEMhcAIAACGjZhrf4sXeXedu41zS+AAECql6AAAgZCUn+3cdG+cCCDRWnAAAQMjKyDDT7mrruOeOLY2voMBcybKl/2VkmM8BwBcETgAAIGRFRdV949ysLOnAgarn1D8B8AepegAAIKTVdePc6kGTVFX/tGxZYMYHIDIQOAEAgJCXmSkVFkqrVkm5uebjzp3m6pHF4tu9atY/Wa20MgfgWVADpzVr1mj48OHq1KmTLBaL3nrrLY/XvPbaa+rRo4fi4+OVnJysG264Qfv376//wQIAgKCyddwbP958bNrUTLmT/AueduyQHnhASk+XBg6UsrPNx/R0VqMAOAtq4HT06FH16NFDTz/9tFfnf/rpp5o4caJuvPFGffvtt3rjjTf02Wef6f/+7//qeaQAACAUuUvjS0z07vpZs8yVq+pI5QPgSlCbQwwbNkzDhg3z+vwNGzYoPT1dt956qySpS5cu+v3vf6+HH364voYIAABCnD8b59bGMMwVrJwc87504AMghVlXvV69eunPf/6z3n33XQ0bNkx79uxRXl6errjiCrfXlJeXq7y83P68tLRUklRRUaGKiop6H7MntjGEwlgQPpg38AfzBv4Kl7nTp0/Vr61WKSUlWrt2SYbhKo/PkOQ+v8+Wyrdq1W/q39+PVn4Im3mD0NLQ88aX97EYhj+NPQPPYrFo+fLlGjlyZK3nvfHGG7rhhht0/Phx/fbbbxo+fLjefPNNxcTEuDx/9uzZmjNnjtPx3NxcxcfHB2LoAAAgBG3YkKyHH77w/z+rHiQZLo65lpOzSW3bHtfBg3Fq0+a4zjhjPytQQCNSVlam7OxsHTp0SAkJCbWeG1aB03fffadBgwbptttu09ChQ1VcXKzbb79dF154oV544QWX17hacUpLS9O+ffs8fjgNoaKiQvn5+Ro8eLDb4A+oiXkDfzBv4K9wnjvLl1s0c2aUioqqgqTUVEM33FCpv/7VcwTUrp2hffuqrk1JMfTYY1aNGhUS/3wKaeE8bxA8DT1vSktL1a5dO68Cp7BK1Zs7d6769Omj22+/XZJ0zjnnqHnz5srIyND999+v5ORkp2tiY2MVGxvrdDwmJiakfohDbTwID8wb+IN5A3+F49zJypJGj3asf8rIsEiK0osvmo0gavsv5OpBkyTt2mXRuHHRystzrqvKyKAeypVwnDcIvoaaN768R1gFTmVlZYqOdhxy1P//EypEFs4AAECIsbUxr+mJJ8zueRZL7cFTdbbGEVOmSDNmOHbkS00175mZGZBhAwgxQW1HfuTIEW3evFmbN2+WJG3btk2bN2/W9u3bJUl33XWXJk6caD9/+PDhWrZsmebPn6+tW7fq008/1a233qqLLrpInTp1Csa3AAAAwpS7VuZJSbVfZxjS/v20MQciTVBXnDZt2qSBAwfan8+cOVOSNGnSJC1atEjFxcX2IEqSJk+erMOHD+upp57SH/7wB7Vu3VqXXnop7cgBAIBfXLUyLyqSrr3W93vRxhxo3IIaOA0YMKDWFLtFixY5Hbvlllt0yy231OOoAABAJKmZyldQ4P+9bG3M1651nR4IIHwFNVUPAAAg1GRkmPVKFs/dyt0qLg7ceACEBgInAACAaqKizCYPkv/BU/v25srV4sXmo9UaqNEBCBYCJwAAgBrcNY5ITZXatq09oGrWTJo0SRo4UMrONh/T02kaAYQ7AicAAAAXMjOlwkJp1SopN9d8LCyUFiwwX3cXPB07ZjaYqI6Oe0D4C6t9nAAAABqSqz2gbKtRrvZxKimRjhxxvk/1jntXXSWtX8/GuUC4IXACAADwkas25larNGiQ+2tsHfdSU6W9e6uOs3EuEB4InAAAAPxQczVq8WLvrqseNElVaXx5eQRPQCijxgkAACAAkpP9u862pWVODt33gFBG4AQAABAAddn/qfrGuQBCE4ETAABAAARi/yc2zgVCF4ETAABAgLjb/ykpybvr2TgXCF00hwAAAAggVx33eveWunY1G0HYappqat5cmjzZucU5HfeA0EDgBAAAEGCu9n964gmze57F4jp4OnrU/KqOjntA6CBVDwAAoAG4S+NLTZXi411fU73j3okTpPEBwcSKEwAAQANh41wgfBE4AQAANCA2zgXCE6l6AAAAQcTGuUB4IHACAAAIIjbOBcIDgRMAAEAQBWLj
3KIiGkcA9Y3ACQAAIMjqunHubbdJAwdK2dnmY3q6tGxZwIcJRDQCJwAAgBCQmSkVFkqrVkm5uebjzp3epfG5axxB8AQEDoETAABAiLB13Bs/3nxs2tS/ND4aRwCBR+AEAAAQwvxN47M1jigooP4JCAT2cQIAAAhxrjbOLSqSrr3W87VZWdKBA1XP2TgX8A+BEwAAQBiouXFuQYF311UPmiQ2zgX8RaoeAABAGPJ3/yfqnwD/EDgBAACEobrs/8TGuYDvCJwAAADClLvGEYmJ3l1fXBz4MQGNFTVOAAAAYcxV4wirVRo0yPO1tnOrX5uRYa5mAXBE4AQAABDmajaOsFrN+qeioqqaJleee0667jpzo10buu4BrpGqBwAA0MjUVv9U/fmSJY5Bk1TVdW/ZsvodIxBuCJwAAAAaIXf1T6mp0tKlUps2rq+r3nXvxAk2zwVsSNUDAABopFzVP2VkmM8PHnR/na3rXmqqtHdv1XHS+BDJCJwAAAAasZr1T5L33fSqB00Sm+cispGqBwAAEGGSk/27js1zEclYcQIAAIgwGRnedd1zxZbGV1BgrmbRxhyRgsAJAAAgwti67o0ZY3bZ8zV4kqSsLOnAgarn1D+hsSNVDwAAIAK567qXlOTd9dWDJok25mj8CJwAAAAiVGamVFgorVol5eaajzt3mqtHNfd/8oT6JzR2pOoBAABEMFdd9/xN46te/2QYFq1Zk6LmzS0aOJD6J4Q/VpwAAADgwF0aX2Kid9dnZUmDB0frscd6avDgaKWnk8KH8EfgBAAAACeu0viWLvXuWuqf0BiRqgcAAACXaqbxWa3+tTE3DDPtLydHGjGCtD2EJ1acAAAA4BVbG3PJv+YRO3ZIa9cGflxAQyBwAgAAgNfqWv9UVGQ2j1i82HykAx/CBal6AAAA8Elmpplyt3atVFwsJSebAdCgQZ6vve02ae/equdsnItwQeAEAAAAn/lb/1Q9aJKqGkfk5RE8IbSRqgcAAIA687f+iY1zES4InAAAABAQ7uqfkpJqv47GEQgHBE4AAAAIGNv+T/n5v2nmzE3Kz/9Nf/+7d9cWF9fr0IA6ocYJAAAAARUVJfXvb+jo0SL1799Dn37q3XXt25ud9mwNJzIy2PMJoYPACQAAAPUqI8Nz44joaOm66xxXnei4h1BCqh4AAADqlTeNI377zTlVz9Zxb9kys3EE+z8hmAicAAAAUO/cNY5ITZVat3Z9jW11asoUKT1dGjhQys42H9PTzYAKaChBDZzWrFmj4cOHq1OnTrJYLHrrrbc8XlNeXq67775bnTt3VmxsrNLT0/Xiiy/W/2ABAABQJ7bGEatWSbm55uOiRVJJiftrDEPav1/audPxePXVKKAhBLXG6ejRo+rRo4duuOEGZXqZvJqVlaVff/1VL7zwgrp166bi4mJVVlbW80gBAAAQCDU3zl282L/7GIaZ9peTI40YQRMJ1L+gBk7Dhg3TsGHDvD7//fff1+rVq7V161YlJiZKktLT0+tpdAAAAKhvycn+X1t9/6fqwRhQH8Kqq97bb7+tnj17at68eXrllVfUvHlzXX311brvvvvUrFkzl9eUl5ervLzc/ry0tFSSVFFRoYqKigYZd21sYwiFsSB8MG/gD+YN/MXcgT+8nTeXXCKlpERr1y7JMNx0jvBgx47fVFHhpl0fwkpD/3njy/uEVeC0detWrVu3TnFxcVq+fLn27dunqVOnav/+/Vq4cKHLa+bOnas5c+Y4Hf/www8VHx9f30P2Wn5+frCHgDDEvIE/mDfwF3MH/vBm3lx7bbIefvhCSYak6sFTzeeubdu2UQ8/LB08GKc2bY7rjDP2k7oX5hrqz5uysjKvz7UYhrtu+g3LYrFo+fLlGjlypNtzhgwZorVr12r37t1q1aqVJGnZsmUaM2aMjh496nLVydWKU1pamvbt26eEhISAfx++qqioUH5+vgYPHqyYmJhgDwdhgnkDfzBv4C/mDvzh67xZvtyimTOjVFRUFSilpBg6dkw6eND9alTTpobatZN27XK87rHHrBo1KiT+mQsfNPSfN6WlpWrXrp0OHTrkMTYIqxWn5ORkpaSk2IMmSTr99NNlGIZ27typU045xema2NhYxcbGOh2PiYkJqT/8Q208CA/MG/iDeQN/MXfgD2/nTVaWNHq0Wa9UXGzWPmVkWLRihdk9z2JxvXnuiRMW7drleGzXLovGjYtWXh6b54arhvrzxpf3CKt9nPr06aNdu3bpyJEj9mM//vijmjRpotTU1CCODAAAAHVl67g3frz5GBVV+/5P7hYIbAFWTg4b5SJwgho4HTlyRJs3b9bmzZslSdu2bdPmzZu1fft2SdJdd92liRMn2s/Pzs5W27Ztdf311+u7777TmjVrdPvtt+uGG25w2xwCAAAA4c3d/k//v+eXS9U77lmtUkGB2fq8oIBgCv4Jaqrepk2bNHDgQPvzmTNnSpImTZqkRYsWqbi42B5ESVKLFi2Un5+vW265RT179lTbtm2VlZWl+++/v8HHDgAAgIbj7/5PK1ZI113nuIFuaqr0xBOk8cE3QQ2cBgwYoNp6UyxatMjpWPfu3enqAwAAEOG83f/p8cedjxUVmXVT1EDBF2FV4wQAAABIUkaGuXJk8WPrJ2qg4A8CJwAAAISdqCgz3U5yDp68Caaq10AB3iBwAgAAQFiqreNeTo539ygqonEEvBNW+zgBAAAA1WVmSiNG1Nz/yXzuqr6ppttuk/burXpO4wi4Q+AEAACAsFaz455UVQNVVOR641yb6kGTROMIuEeqHgAAABqd2mqgakPjCLhD4AQAAIBGyV0NVFJS7dfROAKukKoHAACARstVDVRRkXTttZ6vtTWOqF47FRVV70NGiCJwAgAAQKNWswaqoMC762gcgepI1QMAAEBE8XbzXHeNI5Ytq7+xIXQROAEAACCi0DgC/iBwAgAAQMShcQR8RY0TAAAAIhKNI+ALAicAAABELBpHwFuk6gEAAAD/H40j4A6BEwAAAPD/0TgC7hA4AQAAANXQOAKuUOMEAAAA1EDjCNRE4AQAAAC4QOMIVEeqHgAAAOAFGkdENgInAAAAwAs0johsBE4AAACAl2gcEbmocQIAAAB8QOOIyETgBAAAAPiIxhGRh1Q9AAAAoI5oHNH4ETgBAAAAdUTjiMaPwAkAAAAIgLo2jigoML8WLzYfCaRCCzVOAAAAQIDUpXFEVpZ04EDVc+qfQguBEwAAABBA/jaOqB40SVX1T3l5BE+hgFQ9AAAAoB552ziiJuqfQguBEwAAAFCP/G0cIbFxbighcAIAAADqmbvGEYmJ3l1fXBz4McE31DgBAAAADcBV4wirVRo0yPO17dubtVK26zIyzJUsNBwCJwAAAKCB1GwcYbWa9U9FRVU1Ta7QcS/4SNUDAAAAgqS2+qfqz9113Fu2rH7HhyoETgAAAEAQuat/SkmR2rZ1fQ0d9xoeqXoAAABAkPlT/2TruFdQYK5cUf9UvwicAAAAgBBQs/5p8WLvrqP+qWGQqgcAAACEoORk786j/qlhEDgBAAA
AISgjw1w98mfTXIn6p0AjcAIAAABCUG0d9zyx1T+tXRv4cUUqAicAAAAgRLnruJeY6N31xcXmqlNBgVkzVVDAKpS/aA4BAAAAhDB/Ou7Z/PSTlJ4u7dxZdYzmEf4hcAIAAABCXM2Oe1arGQAVFVXVNLkya5bzMVvziLw8gidfkKoHAAAAhJna6p881UPRPMI/BE4AAABAGHJX/5SaKs2ZU/u1NI/wHYETAAAAEKYyM6XCQmnVKik313zctk065RTvri8urtfhNSrUOAEAAABhrGb9k+T95rnengcCJwAAAKDRsW2eW1vziE6dpN69zRbltm59GRlmIAZnBE4AAABAI2NrHjFmjNkswlXwdPSo1LmztHt31TFalbtHjRMAAADQCLlrHpGcLCUkSIcOOQZNUlWr8mXLGm6c4YLACQAAAGikXDWPKCyUmjd3fT6tyt0jVQ8AAABoxGo2j7DVNLlja1VeUGBeS/2TicAJAAAAiCDetiDPypIOHKh6Hun1T0FN1VuzZo2GDx+uTp06yWKx6K233vL62k8//VTR0dE699xz6218AAAAQGPjbQvy6kGT5Fj/ZLWaK1KLF5uPkZDWF9TA6ejRo+rRo4eefvppn64rKSnRxIkTddlll9XTyAAAAIDGydaq3GLx7Tpb/dOUKVJ6ujRwoJSdbT6mpzf+hhJBTdUbNmyYhg0b5vN1N910k7KzsxUVFeXTKhUAAAAQ6bxpVe6OYUj79zsft61G5eU13lS+sKtxWrhwobZu3apXX31V999/v8fzy8vLVV5ebn9eWloqSaqoqFBFRUW9jdNbtjGEwlgQPpg38AfzBv5i7sAfzJvQNny4tGSJRTNnRqmoqGrpKTHR0IEDPi5FyQyoLBZDM2ZIV1zxm99NJBp63vjyPmEVOP3000+68847tXbtWkVHezf0uXPnas6cOU7HP/zwQ8XHxwd6iH7Lz88P9hAQhpg38AfzBv5i7sAfzJvQFRsrPfmk9N13bXXwYJzatDmuykpp1qy+ft3PMCzauVN69NF/6+yzXSxL+aCh5k1ZWZnX54ZN4GS1WpWdna05c+bo1FNP9fq6u+66SzNnzrQ/Ly0tVVpamoYMGaKEhIT6GKpPKioqlJ+fr8GDBysmJibYw0GYYN7AH8wb+Iu5A38wb8LH8OFVv7ZapQULDO3aZQZC/ujc+RJdcYUP+X/VNPS8sWWjeSNsAqfDhw9r06ZN+uqrrzR9+nRJUmVlpQzDUHR0tD788ENdeumlTtfFxsYqNjbW6XhMTExI/RCH2ngQHpg38AfzBv5i7sAfzJvwEhNjrkL5U/9kk5YWrbr+ljfUvPHlPcImcEpISNDXX3/tcOyZZ57RJ598ory8PHXp0iVIIwMAAAAaj8xMs8nDjBnSzp1Vx1NTpWPHzDbl7gKq1FSza19jFNTA6ciRI/rf//5nf75t2zZt3rxZiYmJOumkk3TXXXepqKhIL7/8spo0aaKzzjrL4fr27dsrLi7O6TgAAAAA/2VmSiNGSGvXmhvmJiebAdGKFbWvRp10kpnuV/M6f5tFhJKgBk6bNm3SwIED7c9ttUiTJk3SokWLVFxcrO3btwdreAAAAEDEioqSBgxwPOZuNSopyVyJWr9eSkyUjh6tei011Wx/Hu5tyoO6Ae6AAQNkGIbT16JFiyRJixYtUkFBgdvrZ8+erc2bNzfIWAEAAACYAVBhobRqlZSbaz4WF0s5Oebr1YMmqWqPp3DfIDdsapwAAAAAhIaaq1FWq/T6667PNfd4MgOrESPCN20vqCtOAAAAAMLf2rWOqXs1GYa0Y4d5XrhixQkAAABAnRQXe3deUZFUUBCejSMInAAAAADUSXKyd+fl5Ej79lU9D6fGEaTqAQAAAKiTjAwzCLJYaj+vetAkhVfjCAInAAAAAHUSFWWuHEmeg6fqbHtB5eSYDSZCGYETAAAAgDqz7fGUkuJ4PCmp9uvCpXEENU4AAAAAAiIz02w5vnZtVQOIoiLp2ms9X+ttg4lgIXACAAAAEDA193gqKPDuOm8bTAQLqXoAAAAA6o2nxhEWi5SWZp4XygicAAAAANSb2hpH2J4//njo7+dE4AQAAACgXrlrHJGaah4Ph32cqHECAAAAUO9cNY7IyAj9lSYbAicAAAAADaJm44hwQqoeAAAAAHhA4AQAAAAAHhA4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBAAAAAAeEDgBAAAAgAcETgAAAADgAYETAAAAAHgQHewBNDTDMCRJpaWlQR6JqaKiQmVlZSotLVVMTEywh4MwwbyBP5g38BdzB/5g3sAfDT1vbDGBLUaoTcQFTocPH5YkpaWlBXkkAAAAAELB4cOH1apVq1rPsRjehFeNSGVlpXbt2qWWLVvKYrEEezgqLS1VWlqaduzYoYSEhGAPB2GCeQN/MG/gL+YO/MG8gT8aet4YhqHDhw+rU6dOatKk9iqmiFtxatKkiVJTU4M9DCcJCQn8oQKfMW/gD+YN/MXcgT+YN/BHQ84bTytNNjSHAAAAAAAPCJwAAAAAwAMCpyCLjY3VrFmzFBsbG+yhIIwwb+AP5g38xdyBP5g38Ecoz5uIaw4BAAAAAL5ixQkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMApiJ5++mmlp6crLi5OF198sT777LNgDwkhZO7cubrwwgvVsmVLtW/fXiNHjtSWLVsczjl+/LimTZumtm3bqkWLFho9erR+/fXXII0Yoeihhx6SxWJRTk6O/RjzBu4UFRXp2muvVdu2bdWsWTOdffbZ2rRpk/11wzB07733Kjk5Wc2aNdOgQYP0008/BXHECDar1ap77rlHXbp0UbNmzdS1a1fdd999qt57jHkDSVqzZo2GDx+uTp06yWKx6K233nJ43Zt5cuDAAU2YMEEJCQlq3bq1brzxRh05cqTBvgcCpyB5/fXXNXPmTM2aNUtffvmlevTooaFDh2rPnj3BHhpCxOrVqzVt2jRt3LhR+fn5qqio0JAhQ3T06FH7ObfddpveeecdvfHGG1q9erV27dqlzMzMII4aoeTzzz/Xc889p3POOcfhOPMGrhw8eFB9+vRRTEyM3nvvPX333Xf629/+pjZt2tjPmTdvnp588kk9++yz+ve//63mzZtr6NChOn78eBBHjmB6+OGHNX/+fD311FP6/vvv9fDDD2vevHn6xz/+YT+HeQNJOnr0qHr06KGnn37a5evezJMJEybo22+/VX5+vlauXKk1a9ZoypQpDfUtSAaC4qKLLjKmTZtmf261Wo1OnToZc+fODeKoEMr27NljSDJWr15tGIZhlJSUGDExMcYbb7xhP+f77783JBkbNmwI1jARIg4fPmyccsopRn5+vtG/f39jxowZhmEwb+Den/70J6Nv375uX6+srDQ6duxoPPLII/ZjJSUlRmxsrLF48eKGGCJC0JVXXmnccMMNDscyMzONCRMmGIbBvIFrkozly5fbn3szT7777jtDkvH555/bz3nvvf
cMi8ViFBUVNci4WXEKghMnTuiLL77QoEGD7MeaNGmiQYMGacOGDUEcGULZoUOHJEmJiYmSpC+++EIVFRUO86h79+466aSTmEfQtGnTdOWVVzrMD4l5A/fefvtt9ezZU2PHjlX79u113nnn6fnnn7e/vm3bNu3evdth7rRq1UoXX3wxcyeC9e7dWx9//LF+/PFHSdJ//vMfrVu3TsOGDZPEvIF3vJknGzZsUOvWrdWzZ0/7OYMGDVKTJk3073//u0HGGd0g7wIH+/btk9VqVYcOHRyOd+jQQT/88EOQRoVQVllZqZycHPXp00dnnXWWJGn37t1q2rSpWrdu7XBuhw4dtHv37iCMEqFiyZIl+vLLL/X55587vca8gTtbt27V/PnzNXPmTP35z3/W559/rltvvVVNmzbVpEmT7PPD1d9dzJ3Ideedd6q0tFTdu3dXVFSUrFarHnjgAU2YMEGSmDfwijfzZPfu3Wrfvr3D69HR0UpMTGywuUTgBISBadOm6ZtvvtG6deuCPRSEuB07dmjGjBnKz89XXFxcsIeDMFJZWamePXvqwQcflCSdd955+uabb/Tss89q0qRJQR4dQtXSpUv12muvKTc3V2eeeaY2b96snJwcderUiXmDRodUvSBo166doqKinLpY/frrr+rYsWOQRoVQNX36dK1cuVKrVq1Samqq/XjHjh114sQJlZSUOJzPPIpsX3zxhfbs2aPzzz9f0dHRio6O1urVq/Xkk08qOjpaHTp0YN7ApeTkZJ1xxhkOx04//XRt375dkuzzg7+7UN3tt9+uO++8U+PGjdPZZ5+t6667Trfddpvmzp0riXkD73gzTzp27OjURO23337TgQMHGmwuETgFQdOmTXXBBRfo448/th+rrKzUxx9/rF69egVxZAglhmFo+vTpWr58uT755BN16dLF4fULLrhAMTExDvNoy5Yt2r59O/Mogl122WX6+uuvtXnzZvtXz549NWHCBPuvmTdwpU+fPk5bHvz444/q3LmzJKlLly7q2LGjw9wpLS3Vv//9b+ZOBCsrK1OTJo7/nIyKilJlZaUk5g2848086dWrl0pKSvTFF1/Yz/nkk09UWVmpiy++uGEG2iAtKOBkyZIlRmxsrLFo0SLju+++M6ZMmWK0bt3a2L17d7CHhhBx8803G61atTIKCgqM4uJi+1dZWZn9nJtuusk46aSTjE8++cTYtGmT0atXL6NXr15BHDVCUfWueobBvIFrn332mREdHW088MADxk8//WS89tprRnx8vPHqq6/az3nooYeM1q1bGytWrDD++9//GiNGjDC6dOliHDt2LIgjRzBNmjTJSElJMVauXGls27bNWLZsmdGuXTvjjjvusJ/DvIFhmN1ev/rqK+Orr74yJBmPPfaY8dVXXxm//PKLYRjezZPLL7/cOO+884x///vfxrp164xTTjnFGD9+fIN9DwROQfSPf/zDOOmkk4ymTZsaF110kbFx48ZgDwkhRJLLr4ULF9rPOXbsmDF16lSjTZs2Rnx8vDFq1CijuLg4eINGSKoZODFv4M4777xjnHXWWUZsbKzRvXt3Y8GCBQ6vV1ZWGvfcc4/RoUMHIzY21rjsssuMLVu2BGm0CAWlpaXGjBkzjJNOOsmIi4szTj75ZOPuu+82ysvL7ecwb2AYhrFq1SqX/66ZNGmSYRjezZP9+/cb48ePN1q0aGEkJCQY119/vXH48OEG+x4shlFta2cAAAAAgBNqnAAAAADAAwInAAAAAPCAwAkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMAJAAAAADwgcAIAAAAADwicAACohcVi0VtvvRXsYQAAgozACQAQsiZPniyLxeL0dfnllwd7aACACBMd7AEAAFCbyy+/XAsXLnQ4FhsbG6TRAAAiFStOAICQFhsbq44dOzp8tWnTRpKZRjd//nwNGzZMzZo108knn6y8vDyH67/++mtdeumlatasmdq2baspU6boyJEjDue8+OKLOvPMMxUbG6vk5GRNnz7d4fV9+/Zp1KhRio+P1ymnnKK3337b/trBgwc1YcIEJSUlqVmzZjrllFOcAj0AQPgjcAIAhLV77rlHo0eP1n/+8x9NmDBB48aN0/fffy9JOnr0qIYOHao2bdro888/1xtvvKGPPvrIITCaP3++pk2bpilTpujrr7/W22+/rW7dujm8x5w5c5SVlaX//ve/uuKKKzRhwgQdOHDA/v7fffed3nvvPX3//feaP3++2rVr13AfAACgQVgMwzCCPQgAAFyZPHmyXn31VcXFxTkc//Of/6w///nPslgsuummmzR//nz7a5dcconOP/98PfPMM3r++ef1pz/9STt27FDz5s0lSe+++66GDx+uXbt2qUOHDkpJSdH111+v+++/3+UYLBaL/vKXv+i+++6TZAZjLVq00HvvvafLL79cV199tdq1a6cXX3yxnj4FAEAooMYJABDSBg4c6BAYSVJiYqL917169XJ4rVevXtq8ebMk6fvvv1ePHj3sQZMk9enTR5WVldqyZYssFot27dqlyy67rNYxnHPOOfZfN2/eXAkJCdqzZ48k6eabb9bo0aP15ZdfasiQIRo5cqR69+7t1/cKAAhdBE4AgJDWvHlzp9S5QGnWrJlX58XExDg8t1gsqqyslCQNGzZMv/zyi959913l5+frsssu07Rp0/Too48GfLwAgOChxgkAENY2btzo9Pz000+XJJ1++un6z3/+o6NHj9pf//TTT9WkSROddtppatmypdLT0/Xxxx/XaQxJSUmaNGmSXn31VT3++ONasGBBne4HAAg9rDgBAEJaeXm5du/e7XAsOjra3oDhjTfeUM+ePdW3b1+99tpr+uyzz/TCCy9IkiZMmKBZs2Zp0qRJmj17tvbu3atbbrlF1113nTp06CBJmj17tm666Sa1b99ew4YN0+HDh/Xpp5/qlltu8Wp89957ry644AKdeeaZKi8v18qVK+2BGwCg8SBwAgCEtPfff1/JyckOx0477TT98MMPksyOd0uWLNHUqVOVnJysxYsX64wzzpAkxcfH64MPPtCMGTN04YUXKj4+XqNHj9Zjjz1mv9ekSZN0/Phx/f3vf9cf//hHtWvXTmPGjPF6fE2bNtVdd92lwsJCNWvWTBkZGVqyZEkAvnMAQCihqx4AIGxZLBYtX75cI0eODPZQAACNHDVOAAAAAOABgRMAAAAAeECNEwAgbJFtDgBoKKw4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBAAAAAAe/D9KcbfSZkpy3gAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "epochs = list(range(configs_dict[\"max_training_steps\"]))\n", + "loss_values = ft_res[0].finetuning_losses\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b')\n", + "\n", + "# Set plot labels and title\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss Value')\n", + "plt.title('Loss Value vs. Number of Epochs')\n", + "\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save finetuned model to HuggingFace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(['python', '../../utils/upload_peft_model.py'] + f\"--peft-model-id {configs.finetuning_peft_model_id} --upload-peft-model-id {configs.finetuning_peft_model_id}-dolly\".split())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Stop LLM Co-serving system" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-22 06:46:20 - ###PEFT DEBUGGING### Background serving task completed.\n", + "Background server stopped.\n" + ] + } + ], + "source": [ + "llm.stop_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference all over again with the finetuned model" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "[0 - 7ff1caf83280] 0.270628 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270673 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270699 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270744 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270753 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "workSpaceSize (128 MB)\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "Adding layer layers.0.mlp.down_proj.lora\n", + "Adding layer layers.1.mlp.down_proj.lora\n", + "Adding layer layers.2.mlp.down_proj.lora\n", + "Adding layer layers.3.mlp.down_proj.lora\n", + "Adding layer layers.4.mlp.down_proj.lora\n", + "Adding layer layers.5.mlp.down_proj.lora\n", + "Adding layer layers.6.mlp.down_proj.lora\n", + "Adding layer layers.7.mlp.down_proj.lora\n", + "Adding layer layers.8.mlp.down_proj.lora\n", + "Adding layer layers.9.mlp.down_proj.lora\n", + "Adding layer layers.10.mlp.down_proj.lora\n", + "Adding layer layers.11.mlp.down_proj.lora\n", + "Adding layer layers.12.mlp.down_proj.lora\n", + "Adding layer layers.13.mlp.down_proj.lora\n", + "Adding layer layers.14.mlp.down_proj.lora\n", + "Adding layer layers.15.mlp.down_proj.lora\n", + "Adding layer layers.16.mlp.down_proj.lora\n", + "Adding layer layers.17.mlp.down_proj.lora\n", + "Adding layer layers.18.mlp.down_proj.lora\n", + "Adding layer layers.19.mlp.down_proj.lora\n", + "Adding layer layers.20.mlp.down_proj.lora\n", + "Adding layer layers.21.mlp.down_proj.lora\n", + "Adding layer layers.22.mlp.down_proj.lora\n", + "Adding layer layers.23.mlp.down_proj.lora\n", + "Adding layer layers.24.mlp.down_proj.lora\n", + "Adding layer layers.25.mlp.down_proj.lora\n", + "Adding layer layers.26.mlp.down_proj.lora\n", + "Adding layer layers.27.mlp.down_proj.lora\n", + "Adding layer layers.28.mlp.down_proj.lora\n", 
+ "Adding layer layers.29.mlp.down_proj.lora\n", + "Adding layer layers.30.mlp.down_proj.lora\n", + "Adding layer layers.31.mlp.down_proj.lora\n", + "Background server started.\n", + "[]\n", + "2024-07-22 06:42:43 - ###PEFT DEBUGGING### Starting background serving task.\n", + "2024-07-22 06:42:43 - ###PEFT DEBUGGING### Updated models' configuration.\n", + "###PEFT DEBUGGING### LLM Model object exists.\n", + "###PEFT DEBUGGING### Model object exists.\n", + "###PEFT DEBUGGING### Model object still exists.\n", + "###PEFT DEBUGGING### Entering compile_inference.\n", + "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node.\n", + "###PEFT DEBUGGING### Launching graph optimization task.\n", + "num_nodes = 1 num_gpus_per_node = 1\n", + "[0]10445\n", + "[1]649\n", + "[2]6730\n", + "[3]2053\n", + "[4]18167\n", + "[5]369\n", + "[6]1317\n", + "[7]2085\n", + "[8]3090\n", + "[9]30\n", + "No small speculative model registered, using incremental decoding.\n", + "[0 - 7ff1caf83280] 1.100415 {3}{RequestManager}: [1000000]New request tokens: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30\n", + "optimal_views.size = 262\n", + "views.size() = 262\n", + "###PEFT DEBUGGING### Operators reconstructed from optimized graph.\n", + "###PEFT DEBUGGING### Starting inplace optimizations.\n", + "###PEFT DEBUGGING### Mapping output tensors.\n", + "ndim(1) dims[1 0 0 0]\n", + "###PEFT DEBUGGING### Setting up NCCL communications.\n", + "###PEFT DEBUGGING### compile_inference completed successfully.\n", + "Loading weight file embed_tokens.weight\n", + "Loading weight file layers.0.input_layernorm.weight\n", + "Loading weight file layers.0.self_attn.q_proj.weight\n", + "Loading weight file layers.0.self_attn.k_proj.weight\n", + "Loading weight file layers.0.self_attn.v_proj.weight\n", + "Loading weight file layers.0.self_attn.o_proj.weight\n", + "Loading weight file layers.0.post_attention_layernorm.weight\n", + "Loading weight file layers.0.mlp.gate_proj.weight\n", + "Loading weight file layers.0.mlp.up_proj.weight\n", + "Loading weight file layers.0.mlp.down_proj.weight\n", + "Loading weight file layers.1.input_layernorm.weight\n", + "Loading weight file layers.1.self_attn.q_proj.weight\n", + "Loading weight file layers.1.self_attn.k_proj.weight\n", + "Loading weight file layers.1.self_attn.v_proj.weight\n", + "Loading weight file layers.1.self_attn.o_proj.weight\n", + "Loading weight file layers.1.post_attention_layernorm.weight\n", + "Loading weight file layers.1.mlp.gate_proj.weight\n", + "Loading weight file layers.1.mlp.up_proj.weight\n", + "Loading weight file layers.1.mlp.down_proj.weight\n", + "Loading weight file layers.2.input_layernorm.weight\n", + "Loading weight file layers.2.self_attn.q_proj.weight\n", + "Loading weight file layers.2.self_attn.k_proj.weight\n", + "Loading weight file layers.2.self_attn.v_proj.weight\n", + "Loading weight file layers.2.self_attn.o_proj.weight\n", + "Loading weight file layers.2.post_attention_layernorm.weight\n", + "Loading weight file layers.2.mlp.gate_proj.weight\n", + "Loading weight file layers.2.mlp.up_proj.weight\n", + "Loading weight file layers.2.mlp.down_proj.weight\n", + "Loading weight file layers.3.input_layernorm.weight\n", + "Loading weight file layers.3.self_attn.q_proj.weight\n", + "Loading weight file layers.3.self_attn.k_proj.weight\n", + "Loading weight file layers.3.self_attn.v_proj.weight\n", + "Loading weight file layers.3.self_attn.o_proj.weight\n", + "Loading weight file layers.3.post_attention_layernorm.weight\n", + 
"Loading weight file layers.3.mlp.gate_proj.weight\n", + "Loading weight file layers.3.mlp.up_proj.weight\n", + "Loading weight file layers.3.mlp.down_proj.weight\n", + "Loading weight file layers.4.input_layernorm.weight\n", + "Loading weight file layers.4.self_attn.q_proj.weight\n", + "Loading weight file layers.4.self_attn.k_proj.weight\n", + "Loading weight file layers.4.self_attn.v_proj.weight\n", + "Loading weight file layers.4.self_attn.o_proj.weight\n", + "Loading weight file layers.4.post_attention_layernorm.weight\n", + "Loading weight file layers.4.mlp.gate_proj.weight\n", + "Loading weight file layers.4.mlp.up_proj.weight\n", + "Loading weight file layers.4.mlp.down_proj.weight\n", + "Loading weight file layers.5.input_layernorm.weight\n", + "Loading weight file layers.5.self_attn.q_proj.weight\n", + "Loading weight file layers.5.self_attn.k_proj.weight\n", + "Loading weight file layers.5.self_attn.v_proj.weight\n", + "Loading weight file layers.5.self_attn.o_proj.weight\n", + "Loading weight file layers.5.post_attention_layernorm.weight\n", + "Loading weight file layers.5.mlp.gate_proj.weight\n", + "Loading weight file layers.5.mlp.up_proj.weight\n", + "Loading weight file layers.5.mlp.down_proj.weight\n", + "Loading weight file layers.6.input_layernorm.weight\n", + "Loading weight file layers.6.self_attn.q_proj.weight\n", + "Loading weight file layers.6.self_attn.k_proj.weight\n", + "Loading weight file layers.6.self_attn.v_proj.weight\n", + "Loading weight file layers.6.self_attn.o_proj.weight\n", + "Loading weight file layers.6.post_attention_layernorm.weight\n", + "Loading weight file layers.6.mlp.gate_proj.weight\n", + "Loading weight file layers.6.mlp.up_proj.weight\n", + "Loading weight file layers.6.mlp.down_proj.weight\n", + "Loading weight file layers.7.input_layernorm.weight\n", + "Loading weight file layers.7.self_attn.q_proj.weight\n", + "Loading weight file layers.7.self_attn.k_proj.weight\n", + "Loading weight file layers.7.self_attn.v_proj.weight\n", + "Loading weight file layers.7.self_attn.o_proj.weight\n", + "Loading weight file layers.7.post_attention_layernorm.weight\n", + "Loading weight file layers.7.mlp.gate_proj.weight\n", + "Loading weight file layers.7.mlp.up_proj.weight\n", + "Loading weight file layers.7.mlp.down_proj.weight\n", + "Loading weight file layers.8.input_layernorm.weight\n", + "Loading weight file layers.8.self_attn.q_proj.weight\n", + "Loading weight file layers.8.self_attn.k_proj.weight\n", + "Loading weight file layers.8.self_attn.v_proj.weight\n", + "Loading weight file layers.8.self_attn.o_proj.weight\n", + "Loading weight file layers.8.post_attention_layernorm.weight\n", + "Loading weight file layers.8.mlp.gate_proj.weight\n", + "Loading weight file layers.8.mlp.up_proj.weight\n", + "Loading weight file layers.8.mlp.down_proj.weight\n", + "Loading weight file layers.9.input_layernorm.weight\n", + "Loading weight file layers.9.self_attn.q_proj.weight\n", + "Loading weight file layers.9.self_attn.k_proj.weight\n", + "Loading weight file layers.9.self_attn.v_proj.weight\n", + "Loading weight file layers.9.self_attn.o_proj.weight\n", + "Loading weight file layers.9.post_attention_layernorm.weight\n", + "Loading weight file layers.9.mlp.gate_proj.weight\n", + "Loading weight file layers.9.mlp.up_proj.weight\n", + "Loading weight file layers.9.mlp.down_proj.weight\n", + "Loading weight file layers.10.input_layernorm.weight\n", + "Loading weight file layers.10.self_attn.q_proj.weight\n", + "Loading weight file 
layers.10.self_attn.k_proj.weight\n", + "Loading weight file layers.10.self_attn.v_proj.weight\n", + "Loading weight file layers.10.self_attn.o_proj.weight\n", + "Loading weight file layers.10.post_attention_layernorm.weight\n", + "Loading weight file layers.10.mlp.gate_proj.weight\n", + "Loading weight file layers.10.mlp.up_proj.weight\n", + "Loading weight file layers.10.mlp.down_proj.weight\n", + "Loading weight file layers.11.input_layernorm.weight\n", + "Loading weight file layers.11.self_attn.q_proj.weight\n", + "Loading weight file layers.11.self_attn.k_proj.weight\n", + "Loading weight file layers.11.self_attn.v_proj.weight\n", + "Loading weight file layers.11.self_attn.o_proj.weight\n", + "Loading weight file layers.11.post_attention_layernorm.weight\n", + "Loading weight file layers.11.mlp.gate_proj.weight\n", + "Loading weight file layers.11.mlp.up_proj.weight\n", + "Loading weight file layers.11.mlp.down_proj.weight\n", + "Loading weight file layers.12.input_layernorm.weight\n", + "Loading weight file layers.12.self_attn.q_proj.weight\n", + "Loading weight file layers.12.self_attn.k_proj.weight\n", + "Loading weight file layers.12.self_attn.v_proj.weight\n", + "Loading weight file layers.12.self_attn.o_proj.weight\n", + "Loading weight file layers.12.post_attention_layernorm.weight\n", + "Loading weight file layers.12.mlp.gate_proj.weight\n", + "Loading weight file layers.12.mlp.up_proj.weight\n", + "Loading weight file layers.12.mlp.down_proj.weight\n", + "Loading weight file layers.13.input_layernorm.weight\n", + "Loading weight file layers.13.self_attn.q_proj.weight\n", + "Loading weight file layers.13.self_attn.k_proj.weight\n", + "Loading weight file layers.13.self_attn.v_proj.weight\n", + "Loading weight file layers.13.self_attn.o_proj.weight\n", + "Loading weight file layers.13.post_attention_layernorm.weight\n", + "Loading weight file layers.13.mlp.gate_proj.weight\n", + "Loading weight file layers.13.mlp.up_proj.weight\n", + "Loading weight file layers.13.mlp.down_proj.weight\n", + "Loading weight file layers.14.input_layernorm.weight\n", + "Loading weight file layers.14.self_attn.q_proj.weight\n", + "Loading weight file layers.14.self_attn.k_proj.weight\n", + "Loading weight file layers.14.self_attn.v_proj.weight\n", + "Loading weight file layers.14.self_attn.o_proj.weight\n", + "Loading weight file layers.14.post_attention_layernorm.weight\n", + "Loading weight file layers.14.mlp.gate_proj.weight\n", + "Loading weight file layers.14.mlp.up_proj.weight\n", + "Loading weight file layers.14.mlp.down_proj.weight\n", + "Loading weight file layers.15.input_layernorm.weight\n", + "Loading weight file layers.15.self_attn.q_proj.weight\n", + "Loading weight file layers.15.self_attn.k_proj.weight\n", + "Loading weight file layers.15.self_attn.v_proj.weight\n", + "Loading weight file layers.15.self_attn.o_proj.weight\n", + "Loading weight file layers.15.post_attention_layernorm.weight\n", + "Loading weight file layers.15.mlp.gate_proj.weight\n", + "Loading weight file layers.15.mlp.up_proj.weight\n", + "Loading weight file layers.15.mlp.down_proj.weight\n", + "Loading weight file layers.16.input_layernorm.weight\n", + "Loading weight file layers.16.self_attn.q_proj.weight\n", + "Loading weight file layers.16.self_attn.k_proj.weight\n", + "Loading weight file layers.16.self_attn.v_proj.weight\n", + "Loading weight file layers.16.self_attn.o_proj.weight\n", + "Loading weight file layers.16.post_attention_layernorm.weight\n", + "Loading weight file 
layers.16.mlp.gate_proj.weight\n", + "Loading weight file layers.16.mlp.up_proj.weight\n", + "Loading weight file layers.16.mlp.down_proj.weight\n", + "Loading weight file layers.17.input_layernorm.weight\n", + "Loading weight file layers.17.self_attn.q_proj.weight\n", + "Loading weight file layers.17.self_attn.k_proj.weight\n", + "Loading weight file layers.17.self_attn.v_proj.weight\n", + "Loading weight file layers.17.self_attn.o_proj.weight\n", + "Loading weight file layers.17.post_attention_layernorm.weight\n", + "Loading weight file layers.17.mlp.gate_proj.weight\n", + "Loading weight file layers.17.mlp.up_proj.weight\n", + "Loading weight file layers.17.mlp.down_proj.weight\n", + "Loading weight file layers.18.input_layernorm.weight\n", + "Loading weight file layers.18.self_attn.q_proj.weight\n", + "Loading weight file layers.18.self_attn.k_proj.weight\n", + "Loading weight file layers.18.self_attn.v_proj.weight\n", + "Loading weight file layers.18.self_attn.o_proj.weight\n", + "Loading weight file layers.18.post_attention_layernorm.weight\n", + "Loading weight file layers.18.mlp.gate_proj.weight\n", + "Loading weight file layers.18.mlp.up_proj.weight\n", + "Loading weight file layers.18.mlp.down_proj.weight\n", + "Loading weight file layers.19.input_layernorm.weight\n", + "Loading weight file layers.19.self_attn.q_proj.weight\n", + "Loading weight file layers.19.self_attn.k_proj.weight\n", + "Loading weight file layers.19.self_attn.v_proj.weight\n", + "Loading weight file layers.19.self_attn.o_proj.weight\n", + "Loading weight file layers.19.post_attention_layernorm.weight\n", + "Loading weight file layers.19.mlp.gate_proj.weight\n", + "Loading weight file layers.19.mlp.up_proj.weight\n", + "Loading weight file layers.19.mlp.down_proj.weight\n", + "Loading weight file layers.20.input_layernorm.weight\n", + "Loading weight file layers.20.self_attn.q_proj.weight\n", + "Loading weight file layers.20.self_attn.k_proj.weight\n", + "Loading weight file layers.20.self_attn.v_proj.weight\n", + "Loading weight file layers.20.self_attn.o_proj.weight\n", + "Loading weight file layers.20.post_attention_layernorm.weight\n", + "Loading weight file layers.20.mlp.gate_proj.weight\n", + "Loading weight file layers.20.mlp.up_proj.weight\n", + "Loading weight file layers.20.mlp.down_proj.weight\n", + "Loading weight file layers.21.input_layernorm.weight\n", + "Loading weight file layers.21.self_attn.q_proj.weight\n", + "Loading weight file layers.21.self_attn.k_proj.weight\n", + "Loading weight file layers.21.self_attn.v_proj.weight\n", + "Loading weight file layers.21.self_attn.o_proj.weight\n", + "Loading weight file layers.21.post_attention_layernorm.weight\n", + "Loading weight file layers.21.mlp.gate_proj.weight\n", + "Loading weight file layers.21.mlp.up_proj.weight\n", + "Loading weight file layers.21.mlp.down_proj.weight\n", + "Loading weight file layers.22.input_layernorm.weight\n", + "Loading weight file layers.22.self_attn.q_proj.weight\n", + "Loading weight file layers.22.self_attn.k_proj.weight\n", + "Loading weight file layers.22.self_attn.v_proj.weight\n", + "Loading weight file layers.22.self_attn.o_proj.weight\n", + "Loading weight file layers.22.post_attention_layernorm.weight\n", + "Loading weight file layers.22.mlp.gate_proj.weight\n", + "Loading weight file layers.22.mlp.up_proj.weight\n", + "Loading weight file layers.22.mlp.down_proj.weight\n", + "Loading weight file layers.23.input_layernorm.weight\n", + "Loading weight file layers.23.self_attn.q_proj.weight\n", + "Loading 
weight file layers.23.self_attn.k_proj.weight\n", + "Loading weight file layers.23.self_attn.v_proj.weight\n", + "Loading weight file layers.23.self_attn.o_proj.weight\n", + "Loading weight file layers.23.post_attention_layernorm.weight\n", + "Loading weight file layers.23.mlp.gate_proj.weight\n", + "Loading weight file layers.23.mlp.up_proj.weight\n", + "Loading weight file layers.23.mlp.down_proj.weight\n", + "Loading weight file layers.24.input_layernorm.weight\n", + "Loading weight file layers.24.self_attn.q_proj.weight\n", + "Loading weight file layers.24.self_attn.k_proj.weight\n", + "Loading weight file layers.24.self_attn.v_proj.weight\n", + "Loading weight file layers.24.self_attn.o_proj.weight\n", + "Loading weight file layers.24.post_attention_layernorm.weight\n", + "Loading weight file layers.24.mlp.gate_proj.weight\n", + "Loading weight file layers.24.mlp.up_proj.weight\n", + "Loading weight file layers.24.mlp.down_proj.weight\n", + "Loading weight file layers.25.input_layernorm.weight\n", + "Loading weight file layers.25.self_attn.q_proj.weight\n", + "Loading weight file layers.25.self_attn.k_proj.weight\n", + "Loading weight file layers.25.self_attn.v_proj.weight\n", + "Loading weight file layers.25.self_attn.o_proj.weight\n", + "Loading weight file layers.25.post_attention_layernorm.weight\n", + "Loading weight file layers.25.mlp.gate_proj.weight\n", + "Loading weight file layers.25.mlp.up_proj.weight\n", + "Loading weight file layers.25.mlp.down_proj.weight\n", + "Loading weight file layers.26.input_layernorm.weight\n", + "Loading weight file layers.26.self_attn.q_proj.weight\n", + "Loading weight file layers.26.self_attn.k_proj.weight\n", + "Loading weight file layers.26.self_attn.v_proj.weight\n", + "Loading weight file layers.26.self_attn.o_proj.weight\n", + "Loading weight file layers.26.post_attention_layernorm.weight\n", + "Loading weight file layers.26.mlp.gate_proj.weight\n", + "Loading weight file layers.26.mlp.up_proj.weight\n", + "Loading weight file layers.26.mlp.down_proj.weight\n", + "Loading weight file layers.27.input_layernorm.weight\n", + "Loading weight file layers.27.self_attn.q_proj.weight\n", + "Loading weight file layers.27.self_attn.k_proj.weight\n", + "Loading weight file layers.27.self_attn.v_proj.weight\n", + "Loading weight file layers.27.self_attn.o_proj.weight\n", + "Loading weight file layers.27.post_attention_layernorm.weight\n", + "Loading weight file layers.27.mlp.gate_proj.weight\n", + "Loading weight file layers.27.mlp.up_proj.weight\n", + "Loading weight file layers.27.mlp.down_proj.weight\n", + "Loading weight file layers.28.input_layernorm.weight\n", + "Loading weight file layers.28.self_attn.q_proj.weight\n", + "Loading weight file layers.28.self_attn.k_proj.weight\n", + "Loading weight file layers.28.self_attn.v_proj.weight\n", + "Loading weight file layers.28.self_attn.o_proj.weight\n", + "Loading weight file layers.28.post_attention_layernorm.weight\n", + "Loading weight file layers.28.mlp.gate_proj.weight\n", + "Loading weight file layers.28.mlp.up_proj.weight\n", + "Loading weight file layers.28.mlp.down_proj.weight\n", + "Loading weight file layers.29.input_layernorm.weight\n", + "Loading weight file layers.29.self_attn.q_proj.weight\n", + "Loading weight file layers.29.self_attn.k_proj.weight\n", + "Loading weight file layers.29.self_attn.v_proj.weight\n", + "Loading weight file layers.29.self_attn.o_proj.weight\n", + "Loading weight file layers.29.post_attention_layernorm.weight\n", + "Loading weight file 
layers.29.mlp.gate_proj.weight\n", + "Loading weight file layers.29.mlp.up_proj.weight\n", + "Loading weight file layers.29.mlp.down_proj.weight\n", + "Loading weight file layers.30.input_layernorm.weight\n", + "Loading weight file layers.30.self_attn.q_proj.weight\n", + "Loading weight file layers.30.self_attn.k_proj.weight\n", + "Loading weight file layers.30.self_attn.v_proj.weight\n", + "Loading weight file layers.30.self_attn.o_proj.weight\n", + "Loading weight file layers.30.post_attention_layernorm.weight\n", + "Loading weight file layers.30.mlp.gate_proj.weight\n", + "Loading weight file layers.30.mlp.up_proj.weight\n", + "Loading weight file layers.30.mlp.down_proj.weight\n", + "Loading weight file layers.31.input_layernorm.weight\n", + "Loading weight file layers.31.self_attn.q_proj.weight\n", + "Loading weight file layers.31.self_attn.k_proj.weight\n", + "Loading weight file layers.31.self_attn.v_proj.weight\n", + "Loading weight file layers.31.self_attn.o_proj.weight\n", + "Loading weight file layers.31.post_attention_layernorm.weight\n", + "Loading weight file layers.31.mlp.gate_proj.weight\n", + "Loading weight file layers.31.mlp.up_proj.weight\n", + "Loading weight file layers.31.mlp.down_proj.weight\n", + "Loading weight file norm.weight\n", + "Loading weight file lm_head.weight\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight 
layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 
0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "[0 - 7ff1680b6740] 16.224181 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7ff1680b6740] 16.321885 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7ff168092740] 16.407712 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7ff1680b6740] 16.492788 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7ff168092740] 16.563500 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7ff168092740] 16.624616 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7ff168092740] 16.675778 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 16.725625 {3}{RequestManager}: Output token is: 13272\n", + "[0 - 7ff168092740] 16.776205 {3}{RequestManager}: Output token is: 315\n", + "[0 - 7ff168092740] 16.827883 {3}{RequestManager}: Output token is: 41389\n", + "[0 - 7ff168092740] 16.878348 {3}{RequestManager}: Output token is: 2715\n", + "[0 - 7ff168092740] 16.929025 {3}{RequestManager}: Output token is: 288\n", + "[0 - 7ff168092740] 16.979287 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 17.029879 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 17.078696 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 17.127942 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 17.177796 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 17.227023 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 17.277136 
{3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 17.328143 {3}{RequestManager}: Output token is: 64614\n", + "[0 - 7ff1680b6740] 17.378508 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 17.430618 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 17.482129 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 17.533479 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 17.584503 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 17.634591 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 17.685727 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 17.736768 {3}{RequestManager}: Output token is: 14535\n", + "[0 - 7ff168092740] 17.785909 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 17.836515 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 17.886526 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 17.936502 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 17.986222 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 18.037888 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.088468 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 18.138261 {3}{RequestManager}: Output token is: 25212\n", + "[0 - 7ff168092740] 18.187102 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 18.237270 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 18.289979 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 18.340895 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 18.391145 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 18.441155 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.499716 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 18.552423 {3}{RequestManager}: Output token is: 97814\n", + "[0 - 7ff168092740] 18.603261 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 18.654986 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 18.706227 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 18.756543 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 18.807690 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 18.857508 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.907649 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 18.958208 {3}{RequestManager}: Output token is: 41759\n", + "[0 - 7ff168092740] 19.009971 {3}{RequestManager}: Output token is: 388\n", + "[0 - 7ff168092740] 19.060626 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 19.112370 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 19.161425 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 19.206435 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 19.254004 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 19.306102 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 19.356853 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 19.408861 {3}{RequestManager}: Output token is: 89435\n", + "[0 - 7ff1680b6740] 19.460391 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 19.511207 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 19.565692 
{3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 19.617057 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 19.669739 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 19.722325 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 19.773583 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 19.824646 {3}{RequestManager}: Output token is: 68550\n", + "[0 - 7ff1680b6740] 19.876650 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 19.926939 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 19.977325 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.028247 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 20.078419 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.128614 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.179748 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 20.230542 {3}{RequestManager}: Output token is: 18311\n", + "[0 - 7ff1680b6740] 20.281634 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 20.330089 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 20.375491 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.422220 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 20.475078 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.526058 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.577651 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 20.628505 {3}{RequestManager}: Output token is: 7013\n", + "[0 - 7ff168092740] 20.681354 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 20.734160 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 20.786299 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.837268 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 20.888265 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.939708 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.990707 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.041260 {3}{RequestManager}: Output token is: 18742\n", + "[0 - 7ff1680b6740] 21.091386 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.145432 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 21.197149 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 21.249242 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 21.301514 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 21.352632 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 21.404018 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.455101 {3}{RequestManager}: Output token is: 56994\n", + "[0 - 7ff1680b6740] 21.506371 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.559369 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 21.611370 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 21.663655 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 21.715270 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 21.766481 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 21.818563 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.872108 
{3}{RequestManager}: Output token is: 29505\n", + "[0 - 7ff168092740] 21.922670 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.973973 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 22.024297 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 22.076266 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 22.127594 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 22.179008 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 22.230414 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 22.281805 {3}{RequestManager}: Output token is: 993\n", + "[0 - 7ff1680b6740] 22.282235 {3}{RequestManager}: [Done] guid(1000000) final_length(128)\n", + "[0 - 7ff1680b6740] 22.282243 {3}{RequestManager}: Final output: <|begin_of_text|>Why can camels survive for long without water? What is the reason behind the long neck of giraffes? Why do some animals have long tails? Why do some animals have long legs? Why do some animals have long ears? Why do some animals have long noses? Why do some animals have long whiskers? Why do some animals have long tongues? Why do some animals have long claws? Why do some animals have long teeth? Why do some animals have long hair? Why do some animals have long fur? Why do some animals have long feathers? Why do some animals have long scales? Why do some animals have long sp\n", + "[0 - 7ff1680b6740] 22.282250 {3}{RequestManager}: [Profile] guid(1000000) llm_decoding_steps(117) start(15892528.0) finish(22282245.0) latency(6389717.0) ttft(15123707.0)\n", + "2024-07-22 06:43:05 - ###PEFT DEBUGGING### Background serving task completed.\n", + "Background server stopped.\n" + ] + } + ], + "source": [ + "import json, random, subprocess, os\n", + "from datasets import load_dataset\n", + "from types import SimpleNamespace\n", + "from huggingface_hub import HfFolder\n", + "import flexflow.serve as ff\n", + "import matplotlib.pyplot as plt\n", + "\n", + "configs_dict = {\n", + " \"num_gpus\": 1,\n", + " \"memory_per_gpu\": 21000,\n", + " \"zero_copy_memory_per_node\": 40000,\n", + " \"num_cpus\": 4,\n", + " \"legion_utility_processors\": 4,\n", + " \"data_parallelism_degree\": 1,\n", + " \"tensor_parallelism_degree\": 1,\n", + " \"pipeline_parallelism_degree\": 1,\n", + " \"offload\": False,\n", + " \"offload_reserve_space_size\": 8 * 1024, # 8GB\n", + " \"use_4bit_quantization\": False,\n", + " \"use_8bit_quantization\": False,\n", + " \"enable_peft\": True,\n", + " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", + " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", + " \"profiling\": False,\n", + " \"inference_debugging\": False,\n", + " \"fusion\": False,\n", + " \"max_requests_per_batch\": 1,\n", + " \"max_sequence_length\": 128,\n", + " \"max_tokens_per_batch\": 128,\n", + " \"max_training_steps\": 100,\n", + " \"seed\": 42,\n", + "}\n", + "model_configs = {\n", + " \"base_model\": \"meta-llama/Meta-Llama-3-8B\",\n", + " \"inference_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"finetuning_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"cache_path\": os.environ.get(\"FF_CACHE_PATH\", \"\"),\n", + " \"refresh_cache\": False,\n", + " \"full_precision\": False,\n", + " # relative paths\n", + " \"inference_dataset\": \"inference_dataset.json\",\n", + " \"finetuning_dataset\": \"/usr/FlexFlow/inference/prompt/peft_dataset.json\",\n", + " \"output_file\": \"peft_demo.txt\",\n", + "}\n", + "generation_configs = {\n", + " \"do_sample\": 
False,\n", + " \"temperature\": 0.9,\n", + " \"topp\": 0.8,\n", + " \"topk\": 1,\n", + "}\n", + "finetuning_configs = {\n", + " \"learning_rate\": 0.001,\n", + " \"momentum\": 0.0,\n", + " \"weight_decay\": 0.0,\n", + " \"nesterov\": False,\n", + "}\n", + "# Merge dictionaries\n", + "configs_dict.update(model_configs)\n", + "configs_dict.update(generation_configs)\n", + "configs_dict.update(finetuning_configs)\n", + "\n", + "configs = SimpleNamespace(**configs_dict)\n", + "\n", + "\n", + "args = [configs.finetuning_peft_model_id+\"-dolly\", '--base_model_name', configs.base_model]\n", + "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)\n", + "\n", + "# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs\n", + "ff.init(configs_dict)\n", + "\n", + "# Create the FlexFlow LLM\n", + "ff_data_type = (\n", + " ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF\n", + ")\n", + "llm = ff.LLM(\n", + " configs.base_model,\n", + " data_type=ff_data_type,\n", + " cache_path=configs.cache_path,\n", + " refresh_cache=configs.refresh_cache,\n", + " output_file=configs.output_file,\n", + ")\n", + "\n", + "lora_inference_config2 = ff.LoraLinearConfig(\n", + " llm.cache_path, \n", + " configs.finetuning_peft_model_id+\"-dolly\",\n", + " base_model_name_or_path=configs.base_model\n", + ")\n", + "llm.add_peft(lora_inference_config2)\n", + "\n", + "\n", + "# Compile the LLM for inference and load the weights into memory\n", + "generation_config = ff.GenerationConfig(\n", + " do_sample=configs.do_sample,\n", + " temperature=configs.temperature,\n", + " topp=configs.topp,\n", + " topk=configs.topk\n", + ")\n", + "llm.compile(\n", + " generation_config,\n", + " max_requests_per_batch=configs.max_requests_per_batch,\n", + " max_seq_length=configs.max_sequence_length,\n", + " max_tokens_per_batch=configs.max_tokens_per_batch,\n", + ")\n", + "\n", + "llm.start_server()\n", + "\n", + "prompts = [s for s in json.load(open(configs.inference_dataset))]\n", + "inference_requests = [\n", + " ff.Request(\n", + " ff.RequestType.REQ_INFERENCE,\n", + " prompt=prompt,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_inference_config2),\n", + " )\n", + " for prompt in prompts\n", + "]\n", + "inf_req_res_2 = llm.generate(inference_requests)\n", + "\n", + "llm.stop_server()\n", + "\n", + "with open(\"after_finetuning.txt\", \"w\") as file:\n", + " file.write(str(inf_req_res_2[0].output_text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py new file mode 100644 index 0000000000..9e01b4645b --- /dev/null +++ b/inference/python/peft_demo/demo.py @@ -0,0 +1,240 @@ +import json, random, subprocess +from datasets import load_dataset +from types import SimpleNamespace +from huggingface_hub import HfFolder +import os +import flexflow.serve as ff +import matplotlib.pyplot as plt + + +def create_datasets(finetune_dataset_size=2, 
inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'): + """Creates the inference and finetuning datasets according to the data from https://huggingface.co/datasets/databricks/databricks-dolly-15k. + Only the 'open_qa' and 'closed_qa' prompts without context are kept. + The datasets are saved into the files given as arguments. + + Keyword arguments: + dataset_size -- the number of prompts to consider + inference_file_path -- the file in which to save the inference data + finetuning_file_path -- the file in which to save the finetuning data + """ + dataset = load_dataset("databricks/databricks-dolly-15k", split="train") + inference_data = [] + finetuning_data = [] + for row in dataset: + if len(finetuning_data) == finetune_dataset_size: + break + if ("open_qa" in row['category'] or "closed_qa" in row['category']) and len(row['context']) == 0: + inference_data.append(row['instruction']) + finetuning_data.append(row['instruction'] + " " + row['response']) + with open(inference_file_path, 'w') as file: + json.dump(inference_data[:1], file) + with open(finetuning_file_path, 'w') as file: + json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': ')) + + +configs_dict = { + "num_gpus": 1, + "memory_per_gpu": 21000, + "zero_copy_memory_per_node": 40000, + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": False, + "fusion": False, + "max_requests_per_batch": 1, + "max_sequence_length": 128, + "max_tokens_per_batch": 128, + "max_training_steps": 100, + "seed": 42, +} +model_configs = { + "base_model": "meta-llama/Meta-Llama-3-8B", + "inference_peft_model_id": "goliaro/llama-3-8b-lora", + "finetuning_peft_model_id": "goliaro/llama-3-8b-lora", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + # relative paths + "inference_dataset": "inference_dataset.json", + "finetuning_dataset": "/usr/FlexFlow/inference/prompt/peft_dataset.json", + "output_file": "peft_demo.txt", +} +generation_configs = { + "do_sample": False, + "temperature": 0.9, + "topp": 0.8, + "topk": 1, +} +finetuning_configs = { + "learning_rate": 0.001, + "momentum": 0.0, + "weight_decay": 0.0, + "nesterov": False, +} +# Merge dictionaries +configs_dict.update(model_configs) +configs_dict.update(generation_configs) +configs_dict.update(finetuning_configs) + + +random.seed(configs_dict["seed"]) + +create_datasets(inference_file_path=configs_dict["inference_dataset"], + finetuning_file_path=configs_dict["finetuning_dataset"]) + +configs = SimpleNamespace(**configs_dict) + +# Clear output file +with open(configs.output_file, 'w') as file: + file.write('') + +# Download base and peft inference models +args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model] +# hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +subprocess.run(['python', '../../utils/download_peft_model.py'] + args) + + +# Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs +ff.init(configs_dict) + +# Create the FlexFlow LLM +ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF +) +llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, +) +# Add inference and/or finetuning lora +lora_inference_config = None +lora_finetuning_config = None +if len(configs.inference_dataset) > 0: + lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + base_model_name_or_path=configs.base_model + ) + llm.add_peft(lora_inference_config) +if len(configs.finetuning_dataset) > 0: + lora_finetuning_config = ff.LoraLinearConfig( + llm.cache_path, + configs.finetuning_peft_model_id, + trainable=True, + init_lora_weights=False, + rank=16, + lora_alpha=16.0, + # target_modules = ["down_proj"], + base_model_name_or_path=configs.base_model, + optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + optimizer_kwargs={ + "learning_rate": configs.learning_rate, + "momentum": configs.momentum, + "weight_decay": configs.weight_decay, + "nesterov": configs.nesterov, + }, + ) + llm.add_peft(lora_finetuning_config) + +# Compile the LLM for inference and load the weights into memory +generation_config = ff.GenerationConfig( + do_sample=configs.do_sample, + temperature=configs.temperature, + topp=configs.topp, + topk=configs.topk +) +enable_peft_finetuning = len(configs.finetuning_dataset) > 0 +llm.compile( + generation_config, + enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch=configs.max_requests_per_batch+int(enable_peft_finetuning), + max_seq_length=configs.max_sequence_length, + max_tokens_per_batch=configs.max_tokens_per_batch, +) + + +llm.start_server() + + +# prompts = [s for s in json.load(open(configs.inference_dataset))] +# inference_requests = [ +# ff.Request( +# ff.RequestType.REQ_INFERENCE, +# prompt=prompt, +# max_sequence_length=configs.max_sequence_length, +# peft_model_id=llm.get_ff_peft_id(lora_inference_config), +# ) +# for prompt in prompts +# ] +# inf_req_res_1 = llm.generate(inference_requests) + + +finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), + dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset), + max_training_steps=configs.max_training_steps, +) +ft_res = llm.generate([finetuning_request]) +for res in ft_res: + print(res.finetuning_losses) + +# exit(0) +# hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +subprocess.run(['python', '../../utils/upload_peft_model.py'] + f"--peft-model-id {configs.finetuning_peft_model_id} --upload-peft-model-id {configs.finetuning_peft_model_id}-dolly".split()) + + + +lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.finetuning_peft_model_id, + base_model_name_or_path=configs.base_model +) +llm.add_peft(lora_inference_config) + +args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model] +#hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +# subprocess.run(['python', '../../utils/download_peft_model.py'] + args) + + +prompts = [s for s in json.load(open(configs.inference_dataset))] 
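+# Build one inference request per prompt, routing each through the LoRA adapter
+# registered above (lora_inference_config), so these generations come from the
+# finetuned weights rather than the base model alone.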
+inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(lora_inference_config), + ) + for prompt in prompts +] +inf_req_res_2 = llm.generate(inference_requests) + + +llm.stop_server() + + +print("==Inference result before finetuning: ", inf_req_res_1[0].output_text) +print("==Inference result after finetuning: ", inf_req_res_2[0].output_text) + + +epochs = list(range(configs_dict["max_training_steps"])) +loss_values = ft_res[0].finetuning_losses + +plt.figure(figsize=(10, 6)) +plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b') \ No newline at end of file diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index a6dfa8042e..39529abda3 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -76,7 +79,7 @@ def get_configs(): "full_precision": False, } ], - # "prompt": "", + "prompt": "", "output_file": "", } # Merge dictionaries diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 60233ac8d1..9689080825 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -414,15 +414,18 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); - std::vector prompts; + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; - prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + tree_model.generate(requests); } // terminate the request manager by stopping the background thread diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py new file mode 100644 index 0000000000..38dd577574 --- /dev/null +++ b/inference/utils/download_peft_model.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +import flexflow.serve as ff +import argparse, os + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base_model_name", type=str, help="Name of the model to download" + ) + parser.add_argument( + "peft_model_ids", + type=str, + nargs="+", + help="Name of the PEFT model(s) to download", + ) + parser.add_argument( + "--cache-folder", + type=str, + help="Folder to use to store the model(s) assets in FlexFlow format", + default=os.environ.get("FF_CACHE_PATH", ""), + ) + parser.add_argument( + "--refresh-cache", + action="store_true", + help="Use this flag to force the refresh of the model(s) weights/tokenizer cache", + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--full-precision-only", + action="store_true", + help="Only download the full precision version of the weights", + ) + 
group.add_argument( + "--half-precision-only", + action="store_true", + help="Only download the half precision version of the weights", + ) + args = parser.parse_args() + return args + + +def main(args): + if args.full_precision_only: + data_types = (ff.DataType.DT_FLOAT,) + elif args.half_precision_only: + data_types = (ff.DataType.DT_HALF,) + else: + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + + for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + for peft_model_id in args.peft_model_ids: + lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id) + llm.add_peft(lora_config) + llm.download_hf_weights_if_needed() + llm.download_hf_config() + llm.download_hf_tokenizer_if_needed() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/inference/utils/upload_peft_model.py b/inference/utils/upload_peft_model.py new file mode 100644 index 0000000000..7098d72f98 --- /dev/null +++ b/inference/utils/upload_peft_model.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +import argparse, os +from huggingface_hub import HfApi, HfFolder +from transformers import AutoModelForCausalLM +from peft import LoraConfig, PeftModel +import torch +import numpy as np +import flexflow.serve as ff +from peft import LoraConfig, get_peft_model + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Download a PEFT model with FlexFlow, process it, and upload it to the Hugging Face Hub." + ) + parser.add_argument( + "--peft-model-id", + type=str, + required=True, + help="(Local) Hugging Face model ID of the PEFT model to upload.", + ) + parser.add_argument( + "--upload-peft-model-id", + type=str, + required=True, + help="(Remote) Hugging Face model ID of the PEFT model to upload.", + ) + parser.add_argument( + "--cache-folder", + type=str, + default=os.environ.get( + "FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow") + ), + help="Path to the FlexFlow cache folder", + ) + parser.add_argument( + "--private", + action="store_true", + help="Whether to upload the processed PEFT model as a private model on Hugging Face Hub.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + # Ensure Hugging Face CLI is logged in + if not HfFolder.get_token(): + raise RuntimeError( + "Hugging Face token not found. Please login using `huggingface-cli login`." 
+ ) + + lora_config_filepath = os.path.join( + args.cache_folder, + "finetuned_models", + args.peft_model_id, + "config", + "ff_config.json", + ) + peft_config = ff.LoraLinearConfig.from_jsonfile(lora_config_filepath) + print(peft_config) + hf_peft_config = peft_config.to_hf_config() + print(hf_peft_config) + if peft_config.precision != "fp32" and peft_config.precision != "fp16": + raise ValueError(f"Unsupported precision: {peft_config.precision}") + model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if peft_config.precision == "fp32" else torch.float16, + device_map="auto", + ) + model = get_peft_model(model, hf_peft_config) + in_dim = model.config.intermediate_size + out_dim = model.config.hidden_size + + weight_folder = os.path.join( + args.cache_folder, "finetuned_models", args.peft_model_id, "weights", "shard_0" + ) + num_shards = 1 + while os.path.exists(weight_folder.replace("shard_0", f"shard_{num_shards}")): + num_shards += 1 + if not in_dim % num_shards == 0: + raise ValueError( + f"Number of shards ({num_shards}) must divide the input dimension ({in_dim})" + ) + lora_weight_files = os.listdir(weight_folder) + for lora_file in sorted(lora_weight_files): + lora_filename = ".weight".join(lora_file.split(".weight")[:-1]) + hf_parameter_name = f"base_model.model.model.{lora_filename}.default.weight" + if hf_parameter_name not in model.state_dict().keys(): + raise KeyError(f"Parameter {lora_file} not found in HF model.") + + ff_dtype = np.float32 if peft_config.precision == "fp32" else np.float16 + weight_path = os.path.join(weight_folder, lora_file) + # LoRA_A: [in_dim, rank] + # LoRA_B: [rank, out_dim] + if "lora_A" in lora_file: + weight_data = [] + for shard_id in range(num_shards): + weight_path_shard = weight_path.replace("shard_0", f"shard_{shard_id}") + weight_data_shard = np.fromfile(weight_path_shard, dtype=ff_dtype) + print("===in_dim:", in_dim) + print("===out_dim:", out_dim) + print("===rank:", peft_config.rank) + print("===num_shards:", num_shards) + weight_data_shard = weight_data_shard.reshape( + (in_dim // num_shards, peft_config.rank), order="F" + ) + weight_data.append(weight_data_shard) + weight_data = np.concatenate(weight_data, axis=0).T + elif "lora_B" in lora_file: + weight_data = np.fromfile(weight_path, dtype=ff_dtype) + weight_data = weight_data.reshape((peft_config.rank, out_dim), order="F").T + weight_tensor = torch.from_numpy(weight_data) + + param = model.state_dict()[hf_parameter_name] + + actual_numel = weight_tensor.numel() + expected_numel = param.numel() + if actual_numel != expected_numel: + raise ValueError( + f"Parameter {lora_file} has unexpected parameter count: {actual_numel} (actual) != {expected_numel} (expected)" + ) + + if weight_tensor.shape != param.shape: + raise ValueError( + f"Parameter {lora_file} has unexpected shape: {weight_tensor.shape} (actual) != {param.shape} (expected)" + ) + if weight_tensor.dtype != param.dtype: + raise ValueError( + f"Parameter {lora_file} has unexpected dtype: {weight_tensor.dtype} (actual) != {param.dtype} (expected)" + ) + + with torch.no_grad(): + param.copy_(weight_tensor) + + model.push_to_hub(f"{args.upload_peft_model_id}", use_auth_token=True, private=args.private) + + print("Upload process completed.") + + +if __name__ == "__main__": + main() diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2820cf485a..b8ed15eaea 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ 
-88,7 +88,10 @@ "offload": "-offload", "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", - "use_8bit_quantization": "--8bit-quantization" + "use_8bit_quantization": "--8bit-quantization", + "enable_peft": "-enable-peft", + "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", + "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 14cf4eebf7..7692ccb88f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -28,6 +28,8 @@ CompMode, MetricsType, InferenceMode, + RequestType, + OptimizerType, ModelType, OpType, ParameterSyncType, @@ -36,6 +38,9 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library +from typing import Union, List +from peft import LoraConfig +import json def ffc(): @@ -1243,1009 +1248,935 @@ def get_weights(self, ffmodel): # ----------------------------------------------------------------------- -# FFModel +# SGDOptimizer # ----------------------------------------------------------------------- -class FFModel(object): - """ """ +class SGDOptimizer(object): + __slots__ = ["handle", "_handle"] - __slots__ = [ - "handle", - "_handle", - "_layers", - "_nb_layers", - "_ffconfig", - "_tracing_id", - "initializers", - "attr_tensors", - ] + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) - def __init__(self, ffconfig): - """Constructor of FFModel. + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} +# ----------------------------------------------------------------------- +# AdamOptimizer +# ----------------------------------------------------------------------- - def get_layers(self): - return self._layers - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op( - op_type, op_handle, idx=layer_id, name=name +class AdamOptimizer(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon ) - self._nb_layers += 1 + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - :param x: a shape tuple/list (integers), including the batch size. 
- :type x: list of int - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType +# ----------------------------------------------------------------------- +# Initializer +# ----------------------------------------------------------------------- +class Initializer(object): + __slots__ = ["handle", "p_handle"] - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. - :type create_grad: bool + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" - :returns: Tensor -- the output tensor. - """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create( - self.handle, num_dims, c_dims, c_data_type, create_grad - ) - return Tensor(handle) - def map_tensor(self, tensor, parallel_op=None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) +# ----------------------------------------------------------------------- +# GlorotUniform +# ----------------------------------------------------------------------- - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create( - self.handle, num_dims, c_dims, value, c_data_type - ) - return Tensor(handle) - def exp(self, x, name=None): - """Exponential activation function. +class GlorotUniformInitializer(Initializer): + __slots__ = ["glorot_handle", "_glorot_handle"] - :param x: the input Tensor. - :type x: Tensor + def __init__(self, seed): + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) +# ----------------------------------------------------------------------- +# ZeroInitializer +# ----------------------------------------------------------------------- - def sin(self, x, name=None): - """Elementwise sine function. - :param x: the input Tensor. - :type x: Tensor +class ZeroInitializer(Initializer): + __slots__ = ["zero_handle", "_zero_handle"] - :param name: the name of the layer. Default is None. - :type name: string + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - def cos(self, x, name=None): - """Elementwise cosine function. +# ----------------------------------------------------------------------- +# UniformInitializer +# ----------------------------------------------------------------------- - :param x: the input Tensor. - :type x: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class UniformInitializer(Initializer): + __slots__ = ["uniform_handle", "_uniform_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# NormInitializer +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class NormInitializer(Initializer): + __slots__ = ["norm_handle", "_norm_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy ) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) - - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. + super(NormInitializer, self).__init__(self.norm_handle) - :param x: the first input Tensor. - :type x: Tensor - :param y: the second input Tensor. - :type y: Tensor +# ----------------------------------------------------------------------- +# PerfMetrics +# ----------------------------------------------------------------------- - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) +class PerfMetrics(object): + __slots__ = ["handle", "_handle"] - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) - :param x: the first input Tensor. - :type x: Tensor + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - :param y: the second input Tensor. 
- :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +# ----------------------------------------------------------------------- +# NetConfig +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor +class NetConfig(object): + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) +# ----------------------------------------------------------------------- +# DLRMConfig +# ----------------------------------------------------------------------- - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param x: the first input Tensor. - :type x: Tensor +class DLRMConfig(object): + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - :param y: the second input Tensor. - :type y: Tensor + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - :param name: the name of the layer. Default is None. - :type name: string + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max( - self.handle, x.handle, y.handle, inplace_a, c_name + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle ) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - :param x: the first input Tensor. - :type x: Tensor + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - :param y: the second input Tensor. 
- :type y: Tensor + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) +# ----------------------------------------------------------------------- +# Single DataLoader +# ----------------------------------------------------------------------- - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. - :param input: the input Tensor. - :type input: Tensor +class SingleDataLoader(object): + __slots__ = ["handle", "_handle"] - :param axes: the axes along which reduction is applied - :type axes: List[int] + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) - :param name: the name of the layer. Default is None. - :type name: string + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type + ) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum( - self.handle, input.handle, c_axes, len(axes), keepdims, c_name + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type ) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - :param input: the input Tensor. - :type input: Tensor + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - :param name: the name of the layer. Default is None. - :type name: string + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - :returns: Tensor -- the output tensor. + :returns: None -- no returns. 
""" - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. + def reset(self): + """Reset the current position of the dataloder to 0. - :param input: the input Tensor. - :type input: Tensor + :returns: None -- no returns. + """ + ffc().flexflow_single_dataloader_reset(self.handle) - :param exponent: exponent to raise each element in the input tensor. - :type exponent: float - :param name: the name of the layer. Default is None. - :type name: string +class RegionNdarray(object): + __slots__ = ["__array_interface__"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_pow( - self.handle, input.handle, exponent, c_name - ) - self.add_layer(OpType.POW, name) - return Tensor(handle, owner_op_type=OpType.POW) + def __init__(self, shape, data_type, base_ptr, strides, read_only): + # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html + if data_type == DataType.DT_HALF: + field_type = " 0: + raise ValueError( + "Target modules can only be specified when init_lora_weights=True" + ) + else: + if init_lora_weights: + raise ValueError( + "LORA weights initialization from scratch not supported in inference model" + ) + if len(target_modules) > 0: + raise ValueError( + "Target modules can only be specified when trainable=True" + ) + + # Check rank, lora_alpha, lora_dropout values + if rank is not None or lora_alpha is not None or lora_dropout is not None: + if not trainable or not init_lora_weights: + raise ValueError( + "rank, lora_alpha, and lora_dropout can only be set when trainable=True and init_lora_weights=True" + ) + rank = rank if rank is not None else 8 + lora_alpha = lora_alpha if lora_alpha is not None else 8.0 + lora_dropout = lora_dropout if lora_dropout is not None else 0.0 + + # If passed, check if the values of rank, lora_alpha, and lora_dropout are valid + if rank < 1 or type(rank) != int: + raise ValueError("Rank must be >= 1 and an integer") + if lora_alpha <= 0: + raise ValueError("Lora_alpha must be > 0") + if lora_dropout < 0 or lora_dropout > 1: + raise ValueError("Lora_dropout must be in the interval [0, 1]") + + self.ff_initialized = False + self._cache_folder = cache_folder + self._peft_model_id = peft_model_id + self._trainable = trainable + self._init_lora_weights = init_lora_weights + self._base_model_name_or_path = base_model_name_or_path + self._precision = precision + self._rank = rank + self._lora_alpha = lora_alpha + self._lora_dropout = lora_dropout + self._target_modules = target_modules + self.optimizer_type = optimizer_type + self.optimizer_kwargs = optimizer_kwargs + + def ff_compile(self): + c_cache_folder = get_c_name(os.path.expanduser(self.cache_folder)) + peft_model_id = get_c_name(self.peft_model_id) + base_model_name_or_path = get_c_name(self.base_model_name_or_path) + precision = get_c_name(self.precision) + c_target_modules = [ + get_c_name(target_module) for target_module in self.target_modules + ] + c_optimizer_type = enum_to_int(OptimizerType, self.optimizer_type) + # SGD optional optimizer args + sgd_learning_rate = self.optimizer_kwargs.get("learning_rate", 0.001) + sgd_momentum = self.optimizer_kwargs.get("momentum", 0.0) + sgd_nesterov = 
self.optimizer_kwargs.get("nesterov", False) + sgd_weight_decay = self.optimizer_kwargs.get("weight_decay", 0.0) + # Adam optional optimizer args + adam_alpha = self.optimizer_kwargs.get("alpha", 0.001) + adam_beta1 = self.optimizer_kwargs.get("beta1", 0.9) + adam_beta2 = self.optimizer_kwargs.get("beta2", 0.999) + adam_weight_decay = self.optimizer_kwargs.get("weight_decay", 0.0) + adam_epsilon = self.optimizer_kwargs.get("epsilon", 1e-8) + self.handle = ffc().flexflow_lora_linear_config_create( + c_cache_folder, + peft_model_id, + self.trainable, + self.init_lora_weights, + base_model_name_or_path, + precision, + self.rank, + self.lora_alpha, + self.lora_dropout, + len(self.target_modules), + c_target_modules, + c_optimizer_type, + sgd_learning_rate, + sgd_momentum, + sgd_nesterov, + sgd_weight_decay, + adam_alpha, + adam_beta1, + adam_beta2, + adam_weight_decay, + adam_epsilon, + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_lora_linear_config_destroy) + self.ff_initialized = True + + @classmethod + def from_jsonfile(self, jsonfile: str): + with open(jsonfile, "r") as file: + config = json.load(file) + config_dict = dict(config) + config_dict["optimizer_type"] = OptimizerType.OPTIMIZER_TYPE_SGD + return LoraLinearConfig(**config_dict) + + def to_hf_config(self) -> LoraConfig: + return LoraConfig( + base_model_name_or_path=self.base_model_name_or_path, + r=self.rank, + target_modules=self.target_modules, + lora_alpha=self.lora_alpha, + lora_dropout=self.lora_dropout, + ) - :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. - :type padding_h: int + @property + def cache_folder(self): + if self.ff_initialized: + c_cache_folder = ffc().flexflow_lora_linear_config_get_cache_folder( + self.handle + ) + return ffi.string(c_cache_folder).decode("utf-8") + else: + return self._cache_folder - :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. - :type padding_w: int + @property + def peft_model_id(self): + if self.ff_initialized: + c_peft_model_id = ffc().flexflow_lora_linear_config_get_peft_model_id( + self.handle + ) + return ffi.string(c_peft_model_id).decode("utf-8") + else: + return self._peft_model_id - :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. - :type activation: PoolType + @property + def rank(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_rank(self.handle) + else: + return self._rank - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode + @property + def lora_alpha(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_lora_alpha(self.handle) + else: + return self._lora_alpha - :param name: the name of the layer. Default is None. - :type name: string + @property + def lora_dropout(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_lora_dropout(self.handle) + else: + return self._lora_dropout - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - c_pool_type = enum_to_int(PoolType, pool_type) - c_activation = enum_to_int(ActiMode, activation) - handle = ffc().flexflow_model_add_pool2d( - self.handle, - input.handle, - kernel_h, - kernel_w, - stride_h, - stride_w, - padding_h, - padding_w, - c_pool_type, - c_activation, - c_name, - ) - self.add_layer(OpType.POOL2D, name) - return Tensor(handle, owner_op_type=OpType.POOL2D) + @property + def trainable(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_trainable(self.handle) + else: + return self._trainable - def batch_norm(self, input, relu=True, name=None): - """Layer that normalizes its inputs. + @property + def init_lora_weights(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_init_lora_weights(self.handle) + else: + return self._init_lora_weights - Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. + @property + def base_model_name_or_path(self): + if self.ff_initialized: + c_base_model_name_or_path = ( + ffc().flexflow_lora_linear_config_get_base_model_name_or_path( + self.handle + ) + ) + return ffi.string(c_base_model_name_or_path).decode("utf-8") + else: + return self._base_model_name_or_path - :param input: the list of input Tensors. - :type input: Tensor + @property + def precision(self): + if self.ff_initialized: + c_precision = ffc().flexflow_lora_linear_config_get_precision(self.handle) + return ffi.string(c_precision).decode("utf-8") + else: + return self._precision - :param relu: whether a ReLU function is applied. Default is True. - :type relu: bool + @property + def target_modules(self): + if self.ff_initialized: + num_target_modules = ffi.new("int *") + c_target_modules = ffc().flexflow_lora_linear_config_get_target_modules( + self.handle, num_target_modules + ) + target_modules = [] + for i in range(num_target_modules[0]): + target_modules.append(ffi.string(c_target_modules[i]).decode("utf-8")) + return target_modules + else: + return self._target_modules - :param name: the name of the layer. Default is None. - :type name: string + @cache_folder.setter + def cache_folder(self, value: str): + self._cache_folder = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_cache_folder(self.handle, value) - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_batch_norm( - self.handle, input.handle, relu, c_name - ) - self.add_layer(OpType.BATCH_NORM, name) - return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + @peft_model_id.setter + def peft_model_id(self, value: str): + self._peft_model_id = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_peft_model_id(self.handle, value) - def layer_norm( - self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None - ): - """Add a LayerNorm layer + @rank.setter + def rank(self, value: int): + self._rank = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_rank(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: Union[int, List[int]] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: _type_, optional - :return: The LayerNorm output tensor - :rtype: Tensor - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_layer_norm( - self.handle, - input.handle, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.LAYER_NORM, name) - return Tensor(handle, owner_op_type=OpType.LAYER_NORM) + @lora_alpha.setter + def lora_alpha(self, value: float): + self._lora_alpha = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_lora_alpha(self.handle, value) - def residual_layer_norm( - self, - input, - residual1, - residual2, - use_two_residuals, - axes, - elementwise_affine=True, - eps=1e-5, - use_bias=True, - name=None, - ): - """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in - better efficiency compared to using separate element-wise add and LayerNorm operators. 
+ @lora_dropout.setter + def lora_dropout(self, value: float): + self._lora_dropout = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_lora_dropout(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param residual1: The residual tensor to add to the input before computing the LayerNorm - :type residual1: Tensor - :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm - :type residual2: Tensor - :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise - :type use_two_residuals: bool - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: List[int] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: str, optional - :return: A tensor with the sum of the input and residual(s), and the LayerNorm output - :rtype: (Tensor, Tensor) - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - residual2_handle = ( - residual1.handle - ) # This is intentional. Data will be ignored, and we cannot pass None - if use_two_residuals: - assert residual2 is not None - residual2_handle = residual2.handle - handles_array = ffc().flexflow_model_add_residual_layer_norm( - self.handle, - input.handle, - residual1.handle, - residual2_handle, - use_two_residuals, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.RESIDUAL_LAYERNORM, name) - return Tensor( - handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM - ), Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM) + @trainable.setter + def trainable(self, value: bool): + self._trainable = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_trainable(self.handle, value) - def add_bias_residual_layer_norm( - self, - input, - residual, - axes, - elementwise_affine=True, - eps=1e-5, - use_bias=True, - name=None, - ): - """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, - resulting in better efficiency compared to using separate attention bias addition + - element-wise residual addition + LayerNorm operators. 
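# A sketch of the two ways a LoraLinearConfig is typically constructed, given
# the checks in __init__ above. The cache path and adapter names below are
# placeholders, and the exact rule set may differ between builds of this patch
# series (the demo script earlier in this patch passes rank with
# init_lora_weights=False, for example).
#
#   # 1) Serve an existing Hugging Face adapter (inference only): nothing is
#   #    trained, no target_modules are given, and the LoRA weights are loaded
#   #    from the hub or the local cache.
#   serve_cfg = LoraLinearConfig("~/.cache/flexflow", "some-user/llama-lora-adapter")
#
#   # 2) Finetune a fresh adapter: per the validation above, rank, lora_alpha,
#   #    lora_dropout and target_modules are only accepted together with
#   #    trainable=True and init_lora_weights=True.
#   train_cfg = LoraLinearConfig(
#       "~/.cache/flexflow",
#       "my-new-adapter",
#       trainable=True,
#       init_lora_weights=True,
#       rank=16,
#       lora_alpha=16.0,
#       target_modules=["down_proj"],
#       optimizer_type=OptimizerType.OPTIMIZER_TYPE_SGD,
#       optimizer_kwargs={"learning_rate": 0.001},
#   )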
+ @init_lora_weights.setter + def init_lora_weights(self, value: bool): + self._init_lora_weights = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_init_lora_weights(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param residual: The residual tensor - :type residual: Tensor - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: Union[int, List[int]] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: _type_, optional - :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output - :rtype: (Tensor, Tensor) - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( - self.handle, - input.handle, - residual.handle, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) - return Tensor( - handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM - ), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) - def sigmoid_silu_multi(self, input1, input2, name=None): - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid_silu_multi( - self.handle, input1.handle, input2.handle, c_name - ) - self.add_layer(OpType.SIGMOID_SILU_MULTI, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) +# ----------------------------------------------------------------------- +# PEFTModelID +# ----------------------------------------------------------------------- - def batch_matmul( - self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None - ): - """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - :param A: the first input Tensor. - :type A: Tensor +class PEFTModelID(object): + __slots__ = ["handle", "_handle"] - :param B: the second input Tensor. - :type B: Tensor + __no_id_h = None - :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension - :type a_seq_length_dim: int + def __init__(self, id=None): + if id is None: + self.handle = ffc().flexflow_peft_model_id_create() + else: + self.handle = ffc().flexflow_peft_model_id_create_id(id) + self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) - :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension - :type b_seq_length_dim: int + @staticmethod + def no_id_handle(): + if PEFTModelID.__no_id_h is None: + PEFTModelID.__no_id_h = ffc().flexflow_peft_model_id_no_id() + return PEFTModelID.__no_id_h - :param name: the name of the layer. Default is None. 
- :type name: string - :param name: Whether to add use bias in layer normalization - :type name: bool +# ----------------------------------------------------------------------- +# Request +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - if a_seq_length_dim is None: - a_seq_length_dim = -1 - if b_seq_length_dim is None: - b_seq_length_dim = -1 - handle = ffc().flexflow_model_add_batch_matmul( - self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim - ) - self.add_layer(OpType.BATCH_MATMUL, name) - return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - def dense( +class Request: + """A class to record the metadata of an inference or finetuning request.""" + + def __init__( self, - input, - out_dim, - activation=ActiMode.AC_MODE_NONE, - use_bias=True, - datatype=DataType.DT_NONE, - shared_op=None, - kernel_initializer=None, - bias_initializer=None, - kernel_regularizer=None, - name=None, + req_type: RequestType, + prompt: str = None, + max_sequence_length: int = 128, + peft_model_id: PEFTModelID = None, + dataset_filepath: str = None, + max_training_steps: int = 1, ): - """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where - :attr:`activation` is the element-wise activation function passed as the activation argument, - :attr:`kernel` is a weights matrix created by the layer, and - :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + self.req_type = req_type + self.prompt = prompt + self.max_sequence_length = max_sequence_length + self.peft_model_id = peft_model_id + self.dataset_filepath = dataset_filepath + self.max_training_steps = max_training_steps - The size of input tensor is :math:`(N, C_{in})` and the size of output tensor - is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` - - :param input: the input Tensor. - :type input: Tensor - :param out\_dim: dimensionality of the output space. - :type out\_dim: int +# ----------------------------------------------------------------------- +# FFModel +# ----------------------------------------------------------------------- - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode - :param use_bias: whether the layer uses a bias vector. Default is True. - :type use_bias: bool +class FFModel(object): + """ """ - :param shared_op: the layer whose parameters are shared with. Default is None. - :type shared_op: Op + __slots__ = [ + "handle", + "_handle", + "_layers", + "_nb_layers", + "_ffconfig", + "_tracing_id", + "initializers", + "attr_tensors", + ] - :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + def __init__(self, ffconfig): + """Constructor of FFModel. - :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. - :type bias_initializer: Initializer + :param ffconfig: configurations of FlexFlow and the created model. + :type ffconfig: FFConfig - :param kernel_regularizer: Regularizer for the kernel weights matrix - :type bias_initializer: Regularizer + :returns: FFModel -- the model. 
+ """ + self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) + self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) + self._layers = dict() + self._nb_layers = 0 + self._ffconfig = ffconfig + global ff_tracing_id + self._tracing_id = ff_tracing_id + ff_tracing_id += 1 + self.initializers = {} + self.attr_tensors = {} - :param name: the name of the layer. Default is None. - :type name: string + def get_layers(self): + return self._layers - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - shared_op_handle = self.__get_op_handle(shared_op) - c_activation = enum_to_int(ActiMode, activation) - c_datatype = enum_to_int(DataType, datatype) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - bias_init_handle = self.__get_initializer_handle(bias_initializer) - if kernel_regularizer: - c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) - kernel_reg_lambda = kernel_regularizer._lambda - else: - c_kernel_reg_type = enum_to_int( - RegularizerMode, RegularizerMode.REG_MODE_NONE - ) - kernel_reg_lambda = 0.0 - handle = ffc().flexflow_model_add_dense( - self.handle, - input.handle, - out_dim, - c_activation, - use_bias, - c_datatype, - shared_op_handle, - kernel_init_handle, - bias_init_handle, - c_kernel_reg_type, - kernel_reg_lambda, - c_name, + def add_layer(self, op_type, name): + layer_id = self._nb_layers + op_handle = ffc().flexflow_model_get_last_layer(self.handle) + self._layers[self._nb_layers] = convert_op_handle_to_op( + op_type, op_handle, idx=layer_id, name=name ) - self.add_layer(OpType.LINEAR, name) - return Tensor(handle, owner_op_type=OpType.LINEAR) - - def concat(self, tensors, axis, name=None): - """Layer that concatenates a list of inputs. + self._nb_layers += 1 - It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. + def create_tensor(self, dims, data_type, create_grad=True): + """Instantiate a FlexFlow tensor. - :param input: the list of input Tensors. - :type input: List of Tensors + :param x: a shape tuple/list (integers), including the batch size. + :type x: list of int - :param axis: the dimension along which to concatenate. - :type axis: int + :param data_type: the datatype of the created tensor. Options are + DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. + :type data_type: DataType - :param name: the name of the layer. Default is None. - :type name: string + :param create_grad: weather the tensor creates a gradients vector. + If you don't specify anything, a gradients vector is used. + :type create_grad: bool :returns: Tensor -- the output tensor. 
""" - assert type(tensors) is list, "tensors should be a list" - tensor_handle_list = [] - n = len(tensors) - assert n <= 256, "Please increase MAX_NUM_INPUTS" - for tensor in tensors: - tensor_handle_list.append(tensor.handle) - c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_concat( - self.handle, n, c_tensor_handle_list, axis, c_name + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_tensor_create( + self.handle, num_dims, c_dims, c_data_type, create_grad ) - self.add_layer(OpType.CONCAT, name) - return Tensor(handle, owner_op_type=OpType.CONCAT) + return Tensor(handle) - def split(self, input, sizes, axis, name=None): - """Layer that splits a :attr:`input` tensor into a list of tensors. + def map_tensor(self, tensor, parallel_op=None): + op_handle = self.__get_op_handle(parallel_op) + ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) - :param input: the input Tensor. - :type input: Tensor + def create_constant(self, dims, value, data_type): + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_constant_create( + self.handle, num_dims, c_dims, value, c_data_type + ) + return Tensor(handle) - :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. - :type sizes: int or list of int + def exp(self, x, name=None): + """Exponential activation function. - :param axis: the dimension along which to split. - :type axis: int + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string - :returns: list of Tensors -- the output tensors. + :returns: Tensor -- the output tensor. """ - if type(sizes) is list: - split = sizes - else: - assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" - split = [input.dims[axis] // sizes for i in range(sizes)] - n = len(split) - assert n <= 256, "Please increase MAX_NUM_OUTPUTS" - c_split = ffi.new("int[]", split) - c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") c_name = get_c_name(name) - ffc().flexflow_model_add_split( - self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name - ) - output_tensor_list = [] - for i in range(n): - tensor_p_handle = ffi.new("flexflow_tensor_t*") - tensor_p_handle.impl = c_outputs_handle_list[i].impl - output_tensor_list.append( - Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) - ) - self.add_layer(OpType.SPLIT, name) - del c_outputs_handle_list - return output_tensor_list + handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) + self.add_layer(OpType.EXP, name) + return Tensor(handle, owner_op_type=OpType.EXP) - def flat(self, input, name=None): - """Flattens the input. Does not affect the batch size. + def sin(self, x, name=None): + """Elementwise sine function. - :param input: the input Tensor. - :type input: Tensor + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2253,15 +2184,15 @@ def flat(self, input, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) - self.add_layer(OpType.FLAT, name) - return Tensor(handle, owner_op_type=OpType.FLAT) + handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) + self.add_layer(OpType.SIN, name) + return Tensor(handle, owner_op_type=OpType.SIN) - def softmax(self, input, axis=-1, name=None): - """Softmax activation function. + def cos(self, x, name=None): + """Elementwise cosine function. - :param input: the input Tensor. - :type input: Tensor + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2269,23 +2200,18 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_softmax( - self.handle, input.handle, axis, c_name - ) - self.add_layer(OpType.SOFTMAX, name) - return Tensor(handle, owner_op_type=OpType.SOFTMAX) - - def reshape(self, input, shape, name=None): - """Layer that reshapes inputs into the given shape. + handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) + self.add_layer(OpType.COS, name) + return Tensor(handle, owner_op_type=OpType.COS) - Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, - except with a new shape given by :attr:`shape`. + def add(self, x, y, inplace_a=False, name=None): + """Layer that adds two input Tensors, :attr:`output = x + y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param shape: A list defining the shape of the output tensor. - :type shape: list of int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2293,45 +2219,41 @@ def reshape(self, input, shape, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - c_shape = ffi.new("int[]", shape) - handle = ffc().flexflow_model_add_reshape( - self.handle, input.handle, len(shape), c_shape, c_name + handle = ffc().flexflow_model_add_add( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.RESHAPE, name) - return Tensor(handle, owner_op_type=OpType.RESHAPE) - - def gather(self, input, index, dim, name=None): - """Layer that gathers values along the dim axis. + self.add_layer(OpType.ADD, name) + return Tensor(handle, owner_op_type=OpType.ADD) - :param input: the input tensor - :type input: Tensor + def subtract(self, x, y, inplace_a=False, name=None): + """Layer that subtracts two input Tensors, :attr:`output = x * y`. - :param index: the index tensor, which specifies the indices of elements to gather - :type index: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param dim: the axis along which to index - :type dim: int + :param y: the second input Tensor. + :type y: Tensor - :param name: the name of the layer. Default is None + :param name: the name of the layer. Default is None. :type name: string - :returns: Tensor -- the output tensor + :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gather( - self.handle, input.handle, index.handle, dim, c_name + handle = ffc().flexflow_model_add_subtract( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.GATHER, name) - return Tensor(handle, owner_op_type=OpType.GATHER) + self.add_layer(OpType.SUBTRACT, name) + return Tensor(handle, owner_op_type=OpType.SUBTRACT) - def transpose(self, input, perm, name=None): - """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm + def multiply(self, x, y, inplace_a=False, name=None): + """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param perm: A permutation of the dimensions of a. - :type perm: List of int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2339,23 +2261,20 @@ def transpose(self, input, perm, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - c_perm = ffi.new("int[]", perm) - handle = ffc().flexflow_model_add_transpose( - self.handle, input.handle, len(perm), c_perm, c_name + handle = ffc().flexflow_model_add_multiply( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.TRANSPOSE, name) - return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - - def reverse(self, input, axis, name=None): - """Layer that reverses specific dimensions of a tensor. + self.add_layer(OpType.MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.MULTIPLY) - Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + def divide(self, x, y, inplace_a=False, name=None): + """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param axis: the dimension to reverse. - :type axis: int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2363,20 +2282,20 @@ def reverse(self, input, axis, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_reverse( - self.handle, input.handle, axis, c_name + handle = ffc().flexflow_model_add_divide( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.REVERSE, name) - return Tensor(handle, owner_op_type=OpType.REVERSE) + self.add_layer(OpType.DIVIDE, name) + return Tensor(handle, owner_op_type=OpType.DIVIDE) - def scalar_multiply(self, input, scalar, inplace=True, name=None): - """Scalar multiplication of a tensor by an scalar. + def max(self, x, y, inplace_a=False, name=None): + """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param input: the scalar - :type scalar: float + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2384,20 +2303,20 @@ def scalar_multiply(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_multiply( - self.handle, input.handle, scalar, inplace, c_name + handle = ffc().flexflow_model_add_max( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.SCALAR_MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) + self.add_layer(OpType.MAX, name) + return Tensor(handle, owner_op_type=OpType.MAX) - def scalar_add(self, input, scalar, inplace=True, name=None): - """Scalar addition of a scalar to each entry of a tensor. + def min(self, x, y, inplace_a=False, name=None): + """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param input: the scalar - :type scalar: float + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2405,20 +2324,20 @@ def scalar_add(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_add( - self.handle, input.handle, scalar, inplace, c_name + handle = ffc().flexflow_model_add_min( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.SCALAR_ADD, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) + self.add_layer(OpType.MIN, name) + return Tensor(handle, owner_op_type=OpType.MIN) - def scalar_sub(self, input, scalar, inplace=True, name=None): - """Scalar subtraction of a scalar to each entry of a tensor. + def reduce_sum(self, input, axes, keepdims=False, name=None): + """Layer that computes the sum of the input Tensor along given axes. :param input: the input Tensor. :type input: Tensor - :param input: the scalar - :type scalar: float + :param axes: the axes along which reduction is applied + :type axes: List[int] :param name: the name of the layer. Default is None. :type name: string @@ -2426,215 +2345,234 @@ def scalar_sub(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_sub( - self.handle, input.handle, scalar, inplace, c_name + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_reduce_sum( + self.handle, input.handle, c_axes, len(axes), keepdims, c_name ) - self.add_layer(OpType.SCALAR_SUB, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) + self.add_layer(OpType.REDUCE_SUM, name) + return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def scalar_true_divide(self, input, scalar, inplace=True, name=None): - """Scalar regular division of a tensor by an scalar. + def rsqrt(self, input, name=None): + """Layer that computes the element-wise reciprocal square-root. :param input: the input Tensor. :type input: Tensor - :param input: the scalar - :type scalar: float - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_truediv( - self.handle, input.handle, scalar, inplace, c_name - ) - self.add_layer(OpType.SCALAR_TRUEDIV, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) + handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) + self.add_layer(OpType.RSQRT, name) + return Tensor(handle, owner_op_type=OpType.RSQRT) - def gelu(self, input, inplace=True, name=None): - """Gaussian Error Linear Unit activation function. + def pow(self, input, exponent, name=None): + """Layer that computes the element-wise power. :param input: the input Tensor. :type input: Tensor + :param exponent: exponent to raise each element in the input tensor. + :type exponent: float + :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) - self.add_layer(OpType.GELU, name) - return Tensor(handle, owner_op_type=OpType.GELU) + handle = ffc().flexflow_model_add_pow( + self.handle, input.handle, exponent, c_name + ) + self.add_layer(OpType.POW, name) + return Tensor(handle, owner_op_type=OpType.POW) - def relu(self, input, inplace=True, name=None): - """Rectified Linear Unit activation function. + def mean(self, input, dims, keepdims=False, name=None): + """Layer that computes the mean of the input tensor across the given + dimensions. :param input: the input Tensor. :type input: Tensor + :param dims: dimensions to take the mean over. + :type dims: list + + :param keepdims: keeps the dimensions in :attr:`dims` as size 1 if True and + collapses the dimension if False. Default is False. + :type keepdims: bool + :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ + dims = list(dims) + c_dims = ffi.new("int[]", dims) c_name = get_c_name(name) - handle = ffc().flexflow_model_add_relu( - self.handle, input.handle, inplace, c_name + handle = ffc().flexflow_model_add_mean( + self.handle, input.handle, c_dims, len(dims), keepdims, c_name ) - self.add_layer(OpType.RELU, name) - return Tensor(handle, owner_op_type=OpType.RELU) + self.add_layer(OpType.MEAN, name) + return Tensor(handle, owner_op_type=OpType.MEAN) - def identity(self, input, name=None): - """Identity function. + def conv2d( + self, + input, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + activation=ActiMode.AC_MODE_NONE, + groups=1, + use_bias=True, + shared_op=None, + kernel_initializer=None, + bias_initializer=None, + name=None, + ): + """This layer creates a 2D convolution kernel that is convolved with the layer :attr:`input` + to produce a tensor of :attr:`output`. - :param input: the input Tensor. - :type input: Tensor + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - :param name: the name of the layer. Default is None. - :type name: string + .. math:: + C_{out} = out\_channels - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) - self.add_layer(OpType.IDENTITY, name) - return Tensor(handle, owner_op_type=OpType.IDENTITY) + .. math:: + K_{H} = kernel\_h - def sigmoid(self, input, name=None): - """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. + .. 
math:: + K_{W} = kernel\_w - :param input: the input Tensor. - :type input: Tensor + .. math:: + S_{H} = stride\_h - :param name: the name of the layer. Default is None. - :type name: string + .. math:: + S_{W} = stride\_w - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) - self.add_layer(OpType.SIGMOID, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID) + .. math:: + P_{H} = padding\_h - def tanh(self, input, name=None): - """Hyperbolic tangent activation function. + .. math:: + P_{S} = padding\_s + + .. math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 + + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 :param input: the input Tensor. :type input: Tensor - :param name: the name of the layer. Default is None. - :type name: string + :param out\_channels: the dimensionality of the output space (i.e. the number of output filters in the convolution). + :type out\_channels: int - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) - self.add_layer(OpType.TANH, name) - return Tensor(handle, owner_op_type=OpType.TANH) + :param kernel_h: the height of the 2D convolution window: :math:`K_{H}`. + :type kernel_h: int - def elu(self, input, inplace=True, name=None): - """Exponential Linear Unit. activation function. + :param kernel_w: the width of the 2D convolution window: :math:`K_{W}`. + :type kernel_w: int - :param input: the input Tensor. - :type input: Tensor + :param stride_h: the stride of the convolution along the height: :math:`S_{H}`. + :type stride_h: int - :param name: the name of the layer. Default is None. - :type name: string + :param stride_w: the stride of the convolution along the width: :math:`S_{W}`. + :type stride_w: int - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_elu( - self.handle, input.handle, inplace, c_name - ) - self.add_layer(OpType.ELU, name) - return Tensor(handle, owner_op_type=OpType.ELU) + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int - def dropout(self, input, rate, seed, name=None): - """The Dropout layer randomly sets input units to 0 with - a frequency of :attr:`rate` at each step during training time, - which helps prevent overfitting. - Inputs not set to 0 are scaled up by 1/(1 - rate) such that the - sum over all inputs is unchanged. + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int - :param input: the input Tensor. - :type input: Tensor + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode - :param rate: Fraction of the input units to drop. - :type rate: float(0-1) + :param groups: the number of groups in this convolution + :type groups: int - :param seed: random seed. - :type seed: int + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool + + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op + + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. 
+ :type bias_initializer: Initializer :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + bias_init_handle = self.__get_initializer_handle(bias_initializer) c_name = get_c_name(name) - handle = ffc().flexflow_model_add_dropout( - self.handle, input.handle, rate, seed, c_name + handle = ffc().flexflow_model_add_conv2d( + self.handle, + input.handle, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_activation, + groups, + use_bias, + shared_op_handle, + kernel_init_handle, + bias_init_handle, + c_name, ) - self.add_layer(OpType.DROPOUT, name) - return Tensor(handle, owner_op_type=OpType.DROPOUT) + self.add_layer(OpType.CONV2D, name) + return Tensor(handle, owner_op_type=OpType.CONV2D) - def multihead_attention( + def embedding( self, - query, - key, - value, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, + input, + num_embeddings, + embedding_dim, + aggr, + dtype=DataType.DT_FLOAT, + shared_op=None, kernel_initializer=None, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, - and returns the dot-product attention between them:. - - :param query: the query Tensor. - :type query: Tensor - - :param key: the key Tensor. - :type key: Tensor - - :param value: the value Tensor. - :type value: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int + """Layer that turns positive integers into dense vectors of fixed size - :param kdim: total number of features in key. Default is 0 - :type kdim: int + :param input: the input Tensor. + :type input: Tensor - :param vdim: total number of features in value. Default is 0 - :type vdim: int + :param num_embeddings: size of the vocabulary, i.e. maximum integer index + 1 + :type num_embeddings: int - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param embedding_dim: dimension of the dense embedding. + :type embedding_dim: int - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param aggr: aggregation mode. Options are AGGR_MODE_NONE, AGGR_MODE_SUM and AGGR_MODE_AVG. + :type aggr: AggrMode - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param dtype: the tensor data type. Options are DT_BOOLEAN, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT4, DT_INT8, DT_NONE + :type dtype: DataType - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer :param name: the name of the layer. Default is None. 
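Illustrative sketch for the conv2d and embedding layers documented above (not taken from the patch; shapes, vocabulary size, and tensor names are assumptions). conv2d expects an NCHW input, and embedding expects an integer index tensor.

    image = ffmodel.create_tensor([ffconfig.batch_size, 3, 32, 32], DataType.DT_FLOAT)
    conv = ffmodel.conv2d(image, 16, 3, 3, 1, 1, 1, 1, ActiMode.AC_MODE_RELU)  # 16 3x3 filters, stride 1, pad 1

    tokens = ffmodel.create_tensor([ffconfig.batch_size, 16], DataType.DT_INT32)
    embedded = ffmodel.embedding(tokens, 32000, 64, AggrMode.AGGR_MODE_NONE)   # 32000-word vocab, 64-dim vectors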
@@ -2643,97 +2581,105 @@ def multihead_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc().flexflow_model_add_multihead_attention( + shared_op_handle = self.__get_op_handle(shared_op) + c_aggr = enum_to_int(AggrMode, aggr) + c_dtype = enum_to_int(DataType, dtype) + if kernel_initializer is None: + kernel_initializer = GlorotUniformInitializer(42) + assert ( + (type(kernel_initializer) is GlorotUniformInitializer) + or (type(kernel_initializer) is ZeroInitializer) + or (type(kernel_initializer) is UniformInitializer) + or (type(kernel_initializer) is NormInitializer) + ), f"Unknown initializer type: {kernel_initializer}" + handle = ffc().flexflow_model_add_embedding( self.handle, - query.handle, - key.handle, - value.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - kernel_init_handle, + input.handle, + num_embeddings, + embedding_dim, + c_aggr, + c_dtype, + shared_op_handle, + kernel_initializer.handle, c_name, ) - self.add_layer(OpType.MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) + # NOTE: We must keep a reference to the initializer or else it will be + # immediately destructed + self.initializers[name] = kernel_initializer + self.add_layer(OpType.EMBEDDING, name) + return Tensor(handle, owner_op_type=OpType.EMBEDDING) - def inc_multihead_self_attention( + def pool2d( self, input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + pool_type=PoolType.POOL_MAX, + activation=ActiMode.AC_MODE_NONE, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. + """Pooling operation for 2D spatial data. - :param input: the input Tensor. - :type input: Tensor + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - :param embed_dim: total dimension of the model - :type embed_dim: int + .. math:: + C_{out} = out\_channels - :param num_heads: Number of attention heads. - :type num_heads: int + .. math:: + K_{H} = kernel\_h - :param kdim: total number of features in key. Default is 0 - :type kdim: int + .. math:: + K_{W} = kernel\_w - :param vdim: total number of features in value. Default is 0 - :type vdim: int + .. math:: + S_{H} = stride\_h - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + .. math:: + S_{W} = stride\_w - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + .. math:: + P_{H} = padding\_h - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + .. math:: + P_{S} = padding\_s - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + .. 
math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: the input Tensor. + :type input: Tensor - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param kernel_h: the height of the 2D pooling window: :math:`K_{H}`. + :type kernel_h: int - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param kernel_w: the width of the 2D pooling window: :math:`K_{W}`. + :type kernel_w: int - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :param stride_h: the stride of the pooling along the height: :math:`S_{H}`. + :type stride_h: int - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param stride_w: the stride of the pooling along the width: :math:`S_{W}`. + :type stride_w: int - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int + + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int + + :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. + :type activation: PoolType + + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode :param name: the name of the layer. Default is None. :type name: string @@ -2741,102 +2687,34 @@ def inc_multihead_self_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention( + c_pool_type = enum_to_int(PoolType, pool_type) + c_activation = enum_to_int(ActiMode, activation) + handle = ffc().flexflow_model_add_pool2d( self.handle, input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_pool_type, + c_activation, c_name, ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - - def spec_inc_multihead_self_attention( - self, - input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. 
- This operator only supports computing the attention in inference (beam search) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + self.add_layer(OpType.POOL2D, name) + return Tensor(handle, owner_op_type=OpType.POOL2D) - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + def batch_norm(self, input, relu=True, name=None): + """Layer that normalizes its inputs. - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param input: the list of input Tensors. + :type input: Tensor - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param relu: whether a ReLU function is applied. Default is True. + :type relu: bool :param name: the name of the layer. Default is None. :type name: string @@ -2844,209 +2722,255 @@ def spec_inc_multihead_self_attention( :returns: Tensor -- the output tensor. 
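A short sketch for the pool2d and batch_norm layers documented above (illustrative only; `conv` is the hypothetical NCHW tensor from the earlier conv2d sketch).

    pooled = ffmodel.pool2d(conv, 2, 2, 2, 2, 0, 0)   # 2x2 max pooling with stride 2, no padding
    bn = ffmodel.batch_norm(pooled, relu=True)        # batch normalization followed by ReLU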
""" c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + handle = ffc().flexflow_model_add_batch_norm( + self.handle, input.handle, relu, c_name + ) + self.add_layer(OpType.BATCH_NORM, name) + return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + + def layer_norm( + self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None + ): + """Add a LayerNorm layer + + :param input: The input tensor + :type input: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: The LayerNorm output tensor + :rtype: Tensor + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_layer_norm( self.handle, input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, c_name, ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.LAYER_NORM, name) + return Tensor(handle, owner_op_type=OpType.LAYER_NORM) - def inc_multihead_self_attention_verify( + def residual_layer_norm( self, input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + residual1, + residual2, + use_two_residuals, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + inplace_residual=False, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. + """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in + better efficiency compared to using separate element-wise add and LayerNorm operators. - :param input: the input Tensor. 
+ :param input: The input tensor :type input: Tensor + :param residual1: The residual tensor to add to the input before computing the LayerNorm + :type residual1: Tensor + :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm + :type residual2: Tensor + :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise + :type use_two_residuals: bool + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: List[int] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param inplace_residual: Whether to perform the residual computation inplace in the input tensor, defaults to False + :type inplace_residual: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: str, optional + :return: A tensor with the sum of the input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + residual2_handle = ( + residual1.handle + ) # This is intentional. Data will be ignored, and we cannot pass None + if use_two_residuals: + assert residual2 is not None + residual2_handle = residual2.handle + handles_array = ffc().flexflow_model_add_residual_layer_norm( + self.handle, + input.handle, + residual1.handle, + residual2_handle, + use_two_residuals, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + inplace_residual, + c_name, + ) + self.add_layer(OpType.RESIDUAL_LAYERNORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM), + Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM), + ) - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + def add_bias_residual_layer_norm( + self, + input, + residual, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + inplace_residual=False, + name=None, + ): + """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, + resulting in better efficiency compared to using separate attention bias addition + + element-wise residual addition + LayerNorm operators. - :param kernel_initializer: Initializer for dense layer kernels. 
If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: The input tensor + :type input: Tensor + :param residual: The residual tensor + :type residual: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param inplace_residual: Whether to perform the residual computation inplace in the input tensor, defaults to False + :type inplace_residual: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( + self.handle, + input.handle, + residual.handle, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + inplace_residual, + c_name, + ) + self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), + Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), + ) - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + def sigmoid_silu_multi(self, input1, input2, name=None): + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid_silu_multi( + self.handle, input1.handle, input2.handle, c_name + ) + self.add_layer(OpType.SIGMOID_SILU_MULTI, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + def batch_matmul( + self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None + ): + """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :param A: the first input Tensor. + :type A: Tensor - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param B: the second input Tensor. + :type B: Tensor - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension + :type a_seq_length_dim: int + + :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension + :type b_seq_length_dim: int :param name: the name of the layer. Default is None. :type name: string + :param name: Whether to add use bias in layer normalization + :type name: bool + :returns: Tensor -- the output tensor. 
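A hedged sketch of the fused residual_layer_norm call documented above; it returns both the residual sum and the normalized tensor. `hidden` is the hypothetical 512-wide tensor from the previous sketch.

    residual = ffmodel.create_tensor([ffconfig.batch_size, 512], DataType.DT_FLOAT)
    summed, normed = ffmodel.residual_layer_norm(
        hidden, residual, None, False,  # a single residual: residual2 unused, use_two_residuals=False
        [0], elementwise_affine=True, eps=1e-5)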
""" - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + if a_seq_length_dim is None: + a_seq_length_dim = -1 + if b_seq_length_dim is None: + b_seq_length_dim = -1 + handle = ffc().flexflow_model_add_batch_matmul( + self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.BATCH_MATMUL, name) + return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - def inc_multiquery_self_attention( + def dense( self, input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, + out_dim, + activation=ActiMode.AC_MODE_NONE, + use_bias=True, + datatype=DataType.DT_NONE, + shared_op=None, kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + bias_initializer=None, + kernel_regularizer=None, name=None, ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. + """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where + :attr:`activation` is the element-wise activation function passed as the activation argument, + :attr:`kernel` is a weights matrix created by the layer, and + :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + + The size of input tensor is :math:`(N, C_{in})` and the size of output tensor + is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` :param input: the input Tensor. :type input: Tensor - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int - - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param out\_dim: dimensionality of the output space. + :type out\_dim: int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool - :param data_type: the data type of the tensors. 
Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. + :type bias_initializer: Initializer - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param kernel_regularizer: Regularizer for the kernel weights matrix + :type bias_initializer: Regularizer :param name: the name of the layer. Default is None. :type name: string @@ -3054,107 +2978,128 @@ def inc_multiquery_self_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + c_datatype = enum_to_int(DataType, datatype) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, + bias_init_handle = self.__get_initializer_handle(bias_initializer) + if kernel_regularizer: + c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) + kernel_reg_lambda = kernel_regularizer._lambda + else: + c_kernel_reg_type = enum_to_int( + RegularizerMode, RegularizerMode.REG_MODE_NONE + ) + kernel_reg_lambda = 0.0 + handle = ffc().flexflow_model_add_dense( + self.handle, + input.handle, + out_dim, + c_activation, + use_bias, + c_datatype, + shared_op_handle, kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + bias_init_handle, + c_kernel_reg_type, + kernel_reg_lambda, c_name, ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + self.add_layer(OpType.LINEAR, name) + return Tensor(handle, owner_op_type=OpType.LINEAR) - def spec_inc_multiquery_self_attention( - self, - input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. 
- This operator only supports computing the attention in inference (beam search) mode. + def concat(self, tensors, axis, name=None): + """Layer that concatenates a list of inputs. - :param input: the input Tensor. - :type input: Tensor + It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. - :param embed_dim: total dimension of the model - :type embed_dim: int + :param input: the list of input Tensors. + :type input: List of Tensors - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int + :param axis: the dimension along which to concatenate. + :type axis: int - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int + :param name: the name of the layer. Default is None. + :type name: string - :param kdim: total number of features in key. Default is 0 - :type kdim: int + :returns: Tensor -- the output tensor. + """ + assert type(tensors) is list, "tensors should be a list" + tensor_handle_list = [] + n = len(tensors) + assert n <= 256, "Please increase MAX_NUM_INPUTS" + for tensor in tensors: + tensor_handle_list.append(tensor.handle) + c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_concat( + self.handle, n, c_tensor_handle_list, axis, c_name + ) + self.add_layer(OpType.CONCAT, name) + return Tensor(handle, owner_op_type=OpType.CONCAT) - :param vdim: total number of features in value. Default is 0 - :type vdim: int + def split(self, input, sizes, axis, name=None): + """Layer that splits a :attr:`input` tensor into a list of tensors. - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param input: the input Tensor. + :type input: Tensor - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. + :type sizes: int or list of int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param axis: the dimension along which to split. + :type axis: int - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param name: the name of the layer. Default is None. + :type name: string - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + :returns: list of Tensors -- the output tensors. 
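Illustrative sketch for the dense, concat, and split layers documented above (widths are assumptions, not from the patch). concat joins tensors along one axis, and split with an integer count divides that axis evenly.

    fc1 = ffmodel.dense(hidden, 1024, ActiMode.AC_MODE_RELU)
    fc2 = ffmodel.dense(hidden, 1024, ActiMode.AC_MODE_RELU)
    joined = ffmodel.concat([fc1, fc2], 1)   # feature axis grows to 2048
    halves = ffmodel.split(joined, 2, 1)     # back to two 1024-wide tensors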
+ """ + if type(sizes) is list: + split = sizes + else: + assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" + split = [input.dims[axis] // sizes for i in range(sizes)] + n = len(split) + assert n <= 256, "Please increase MAX_NUM_OUTPUTS" + c_split = ffi.new("int[]", split) + c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") + c_name = get_c_name(name) + ffc().flexflow_model_add_split( + self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name + ) + output_tensor_list = [] + for i in range(n): + tensor_p_handle = ffi.new("flexflow_tensor_t*") + tensor_p_handle.impl = c_outputs_handle_list[i].impl + output_tensor_list.append( + Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) + ) + self.add_layer(OpType.SPLIT, name) + del c_outputs_handle_list + return output_tensor_list - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + def flat(self, input, name=None): + """Flattens the input. Does not affect the batch size. - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param input: the input Tensor. + :type input: Tensor - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param name: the name of the layer. Default is None. + :type name: string - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) + self.add_layer(OpType.FLAT, name) + return Tensor(handle, owner_op_type=OpType.FLAT) - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + def softmax(self, input, axis=-1, name=None): + """Softmax activation function. - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param input: the input Tensor. + :type input: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -3162,107 +3107,93 @@ def spec_inc_multiquery_self_attention( :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + handle = ffc().flexflow_model_add_softmax( + self.handle, input.handle, axis, c_name ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.SOFTMAX, name) + return Tensor(handle, owner_op_type=OpType.SOFTMAX) - def inc_multiquery_self_attention_verify( - self, - input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. + def reshape(self, input, shape, name=None): + """Layer that reshapes inputs into the given shape. + + Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, + except with a new shape given by :attr:`shape`. :param input: the input Tensor. :type input: Tensor - :param embed_dim: total dimension of the model - :type embed_dim: int + :param shape: A list defining the shape of the output tensor. + :type shape: list of int - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int + :param name: the name of the layer. Default is None. + :type name: string - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_shape = ffi.new("int[]", shape) + handle = ffc().flexflow_model_add_reshape( + self.handle, input.handle, len(shape), c_shape, c_name + ) + self.add_layer(OpType.RESHAPE, name) + return Tensor(handle, owner_op_type=OpType.RESHAPE) - :param kdim: total number of features in key. Default is 0 - :type kdim: int + def gather(self, input, index, dim, name=None): + """Layer that gathers values along the dim axis. - :param vdim: total number of features in value. Default is 0 - :type vdim: int + :param input: the input tensor + :type input: Tensor - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param index: the index tensor, which specifies the indices of elements to gather + :type index: Tensor - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param dim: the axis along which to index + :type dim: int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param name: the name of the layer. Default is None + :type name: string - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. 
- :type add_zero_attn: bool + :returns: Tensor -- the output tensor + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_gather( + self.handle, input.handle, index.handle, dim, c_name + ) + self.add_layer(OpType.GATHER, name) + return Tensor(handle, owner_op_type=OpType.GATHER) - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + def transpose(self, input, perm, name=None): + """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: the input Tensor. + :type input: Tensor - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param perm: A permutation of the dimensions of a. + :type perm: List of int - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param name: the name of the layer. Default is None. + :type name: string - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_perm = ffi.new("int[]", perm) + handle = ffc().flexflow_model_add_transpose( + self.handle, input.handle, len(perm), c_perm, c_name + ) + self.add_layer(OpType.TRANSPOSE, name) + return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + def reverse(self, input, axis, name=None): + """Layer that reverses specific dimensions of a tensor. - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + + :param input: the input Tensor. + :type input: Tensor + + :param axis: the dimension to reverse. + :type axis: int :param name: the name of the layer. Default is None. :type name: string @@ -3270,43 +3201,20 @@ def inc_multiquery_self_attention_verify( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + handle = ffc().flexflow_model_add_reverse( + self.handle, input.handle, axis, c_name ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.REVERSE, name) + return Tensor(handle, owner_op_type=OpType.REVERSE) - def rms_norm(self, input, eps, dim, name=None): - """Defines the RMS Norm layer. + def scalar_multiply(self, input, scalar, inplace=True, name=None): + """Scalar multiplication of a tensor by an scalar. :param input: the input Tensor. 
:type input: Tensor - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3314,26 +3222,20 @@ def rms_norm(self, input, eps, dim, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rms_norm( - self.handle, input.handle, eps, dim, c_name + handle = ffc().flexflow_model_add_scalar_multiply( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.RMS_NORM, name) - return Tensor(handle, owner_op_type=OpType.RMS_NORM) - - def residual_rms_norm(self, input1, input2, eps, dim, name=None): - """Defines the Residual RMS Norm layer. + self.add_layer(OpType.SCALAR_MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) - :param input: the input 1 Tensor. - :type input: Tensor + def scalar_add(self, input, scalar, inplace=True, name=None): + """Scalar addition of a scalar to each entry of a tensor. - :param input: the input 2 Tensor. + :param input: the input Tensor. :type input: Tensor - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3341,28 +3243,20 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handles_array = ffc().flexflow_model_add_residual_rms_norm( - self.handle, input1.handle, input2.handle, eps, dim, c_name - ) - self.add_layer(OpType.RESIDUAL_RMS_NORM, name) - return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor( - handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM + handle = ffc().flexflow_model_add_scalar_add( + self.handle, input.handle, scalar, inplace, c_name ) + self.add_layer(OpType.SCALAR_ADD, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) - def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): - """Defines the Arg TopK layer. + def scalar_sub(self, input, scalar, inplace=True, name=None): + """Scalar subtraction of a scalar to each entry of a tensor. :param input: the input Tensor. :type input: Tensor - :param k: the top k indices to select - :type k: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool - - :param speculative_decoding: Whether you need to perform beam search - :type speculative_decoding: bool + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3370,23 +3264,20 @@ def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_arg_top_k( - self.handle, input.handle, k, sorted, c_name + handle = ffc().flexflow_model_add_scalar_sub( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.ARG_TOPK, name) - return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + self.add_layer(OpType.SCALAR_SUB, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) - def beam_top_k(self, input, max_beam_size, sorted, name=None): - """Defines the Beam TopK layer. 
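A sketch of the scalar element-wise operators documented above (illustrative; inplace=False is chosen so the inputs are left untouched).

    scaled = ffmodel.scalar_multiply(hidden, 0.5, inplace=False)  # multiply every entry by 0.5
    shifted = ffmodel.scalar_add(scaled, 1.0, inplace=False)      # add 1.0 to every entry
    recentered = ffmodel.scalar_sub(shifted, 1.0, inplace=False)  # subtract 1.0 again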
+ def scalar_true_divide(self, input, scalar, inplace=True, name=None): + """Scalar regular division of a tensor by an scalar. :param input: the input Tensor. :type input: Tensor - :param max_beam_size: the top max_beam_size indices to select - :type max_beam_size: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3394,889 +3285,1498 @@ def beam_top_k(self, input, max_beam_size, sorted, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_beam_top_k( - self.handle, input.handle, max_beam_size, sorted, c_name + handle = ffc().flexflow_model_add_scalar_truediv( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.BEAM_TOPK, name) - return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + self.add_layer(OpType.SCALAR_TRUEDIV, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) - def sampling(self, input, top_p, name=None): - """Defines the Sampling layer. + def gelu(self, input, inplace=True, name=None): + """Gaussian Error Linear Unit activation function. :param input: the input Tensor. :type input: Tensor - :param top_p: The top_p parameter of the sampling - :type top_p: float - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sampling( - self.handle, input.handle, top_p, c_name - ) - self.add_layer(OpType.SAMPLING, name) - return Tensor(handle, owner_op_type=OpType.SAMPLING) + handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) + self.add_layer(OpType.GELU, name) + return Tensor(handle, owner_op_type=OpType.GELU) - def argmax(self, input, beam_search, name=None): - """Defines the Sampling layer. + def relu(self, input, inplace=True, name=None): + """Rectified Linear Unit activation function. :param input: the input Tensor. :type input: Tensor - :param beam_search: Whether you need to perform beam search - :type beam_search: bool - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_argmax( - self.handle, input.handle, beam_search, c_name + handle = ffc().flexflow_model_add_relu( + self.handle, input.handle, inplace, c_name ) - self.add_layer(OpType.ARGMAX, name) - return Tensor(handle, owner_op_type=OpType.ARGMAX) + self.add_layer(OpType.RELU, name) + return Tensor(handle, owner_op_type=OpType.RELU) - def reset_metrics(self): - """Reset performance metrics. + def identity(self, input, name=None): + """Identity function. - :returns: None -- no returns. - """ - ffc().flexflow_model_reset_metrics(self.handle) + :param input: the input Tensor. + :type input: Tensor - def init_layers(self): - """Initialize layers. + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - ffc().flexflow_model_init_layers(self.handle) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) + self.add_layer(OpType.IDENTITY, name) + return Tensor(handle, owner_op_type=OpType.IDENTITY) - def prefetch(self): - ffc().flexflow_model_prefetch(self.handle) + def sigmoid(self, input, name=None): + """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. - def forward(self, seq_length=None): - """Forward propagation of all layers. + :param input: the input Tensor. + :type input: Tensor - :returns: None -- no returns. + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_forward(self.handle, seq_length) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) + self.add_layer(OpType.SIGMOID, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID) - # TODO: seperate compute_metrics from backward - def backward(self, seq_length=None): - """Backward propagation of all layers. + def tanh(self, input, name=None): + """Hyperbolic tangent activation function. - :returns: None -- no returns. - """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_backward(self.handle, seq_length) + :param input: the input Tensor. + :type input: Tensor - def compute_metrics(self): - """Compute performance metrics. + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. """ - ffc().flexflow_model_compute_metrics(self.handle) - - def update(self): - """Update weights and biases of all layers. - - :returns: None -- no returns. - """ - ffc().flexflow_model_update(self.handle) - - def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): - """Configure the model for trainting. FlexFlow uses lazy initialization, - so the actual creating of all operations (including creating and partitioning - of weight, bias and output tensors) happen during compile. - - :param optimizer: optimizer instance. - :type optimizer: Optimizer + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) + self.add_layer(OpType.TANH, name) + return Tensor(handle, owner_op_type=OpType.TANH) - :param loss_type: Enum of LossType. - Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, - LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. - :type loss_type: LossType + def elu(self, input, inplace=True, name=None): + """Exponential Linear Unit. activation function. - :param metrics: List of metrics to be evaluated by the model during training and testing. - Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, - METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, - METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR - :type metrics: MetricsType + :param input: the input Tensor. + :type input: Tensor - :param comp_mode: Enum of CompMode. - Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE - :type comp_mode: CompMode + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - self.optimizer = optimizer - - c_loss_type = enum_to_int(LossType, loss_type) - metrics_int = [] - for metric in metrics: - metrics_int.append(enum_to_int(MetricsType, metric)) - c_metrics = ffi.new("int[]", metrics_int) - if comp_mode == None: - comp_mode = CompMode.TRAINING - c_comp_mode = enum_to_int(CompMode, comp_mode) - ffc().flexflow_model_compile( - self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_elu( + self.handle, input.handle, inplace, c_name ) - for ff_tensor, np_tensor in self.attr_tensors.items(): - ff_tensor.set_tensor(self, np_tensor) - print("Compiled ffmodel!") - - def fit(self, x=None, y=None, batch_size=None, epochs=1): - """Trains the model for a fixed number of epochs (iterations on a dataset). - - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader - - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader - - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int - - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int - - :returns: None -- no returns. - """ - if isinstance(x, list) == False: - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - self._tracing_id += 1 # get a new tracing id - for epoch in range(0, epochs): - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - for iter in range(0, int(iterations)): - self._ffconfig.begin_trace(self._tracing_id) - for d in dataloaders: - d.next_batch(self) - self.forward() - self.zero_gradients() - self.backward() - self.update() - self._ffconfig.end_trace(self._tracing_id) + self.add_layer(OpType.ELU, name) + return Tensor(handle, owner_op_type=OpType.ELU) - def eval(self, x=None, y=None, batch_size=None): - """Returns the loss value & metrics values for the model in test mode. + def dropout(self, input, rate, seed, name=None): + """The Dropout layer randomly sets input units to 0 with + a frequency of :attr:`rate` at each step during training time, + which helps prevent overfitting. + Inputs not set to 0 are scaled up by 1/(1 - rate) such that the + sum over all inputs is unchanged. - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader + :param input: the input Tensor. + :type input: Tensor - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader + :param rate: Fraction of the input units to drop. + :type rate: float(0-1) - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int + :param seed: random seed. + :type seed: int - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - if isinstance(x, list) == False: - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - self._tracing_id += 1 # get a new tracing id - for iter in range(0, int(iterations)): - for d in dataloaders: - d.next_batch(self) - self._ffconfig.begin_trace(self._tracing_id) - self.forward() - self.compute_metrics() - self._ffconfig.end_trace(self._tracing_id) - - def zero_gradients(self): - """Empty the gradients of all layers. + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_dropout( + self.handle, input.handle, rate, seed, c_name + ) + self.add_layer(OpType.DROPOUT, name) + return Tensor(handle, owner_op_type=OpType.DROPOUT) - :returns: None -- no returns. - """ - ffc().flexflow_model_zero_gradients(self.handle) + def multihead_attention( + self, + query, + key, + value, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kernel_initializer=None, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, + and returns the dot-product attention between them:. - def set_optimizer(self, optimizer): - if isinstance(optimizer, SGDOptimizer) == True: - ffc().flexflow_model_set_sgd_optimizer(self.handle, optimizer.handle) - elif isinstance(optimizer, AdamOptimizer) == True: - ffc().flexflow_model_set_adam_optimizer(self.handle, optimizer.handle) - elif optimizer == None: - pass - else: - assert 0, "[Model]: unknown optimizer" + :param query: the query Tensor. + :type query: Tensor - optimizer = property(fset=set_optimizer) + :param key: the key Tensor. + :type key: Tensor - def print_layers(self, id=-1): - ffc().flexflow_model_print_layers(self.handle, id) + :param value: the value Tensor. + :type value: Tensor - def get_layer_by_id(self, layer_id): - return self._layers[layer_id] + :param embed_dim: total dimension of the model + :type embed_dim: int - def get_last_layer(self): - return self._layers[self._nb_layers - 1] + :param num_heads: Number of attention heads. + :type num_heads: int - def get_layer_by_name(self, layer_name): - for layer_id in self._layers: - layer = self._layers[layer_id] - if layer.name == layer_name: - return layer - assert 0, f"Cannot find the layer with name {layer_name}" - return None + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def get_tensor_by_id(self, id): - handle = ffc().flexflow_model_get_parameter_by_id(self.handle, id) - return Parameter(handle) + :param vdim: total number of features in value. Default is 0 + :type vdim: int - @property - def label_tensor(self): - handle = ffc().flexflow_model_get_label_tensor(self.handle) - return Tensor(handle, deallocate=False) + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) - def get_perf_metrics(self): - handle = ffc().flexflow_model_get_perf_metrics(self.handle) - return PerfMetrics(handle) + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool - def set_transformer_layer_id(self, id): - ffc().flexflow_model_set_transformer_layer_id(self.handle, id) + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. 
+ :type add_bias_kv: bool - def create_data_loader(self, batch_tensor, full_array): - """Create a SingleDataloader instance. + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - :param batch_tensor: a batch-sized tensor. Usually it is a input tensor of the model. - :type batch_tensor: Tensor + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer - :param full_array: the entire data. - :type full_array: Numpy Array + :param name: the name of the layer. Default is None. + :type name: string - :returns: SingleDataloader -- returns a dataloader instance. + :returns: Tensor -- the output tensor. """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + handle = ffc().flexflow_model_add_multihead_attention( + self.handle, + query.handle, + key.handle, + value.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + kernel_init_handle, + c_name, + ) + self.add_layer(OpType.MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) - if self._ffconfig.enable_control_replication: - assert ( - self._ffconfig.python_data_loader_type != 1 - ), "To enable control replication, please set --python-data-loader-type 2" - return self.__create_data_loader_ptr(batch_tensor, full_array) - else: - if self._ffconfig.python_data_loader_type == 1: - return self.__create_data_loader_attach(batch_tensor, full_array) - else: - return self.__create_data_loader_ptr(batch_tensor, full_array) - - def __create_data_loader_attach(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - num_dim = len(full_array_shape) - if full_array.dtype == "float16": - datatype = DataType.DT_HALF - elif full_array.dtype == "float32": - datatype = DataType.DT_FLOAT - elif full_array.dtype == "int32": - datatype = DataType.DT_INT32 - elif full_array.dtype == "int64": - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" + def inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. 
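# Sketch (assumption, mirroring how the model files later in this patch pick an
# attention operator): incremental decoding, speculative (beam search), and
# tree-verify inference each use a dedicated variant of the same layer. The
# import path is assumed; all remaining arguments keep their defaults.
from flexflow.core import InferenceMode  # assumed re-export of the enum

def _self_attention_for_mode(ffmodel, mode, x, embed_dim=4096, num_heads=32):
    if mode == InferenceMode.BEAM_SEARCH_MODE:
        attn_fn = ffmodel.spec_inc_multihead_self_attention
    elif mode == InferenceMode.TREE_VERIFY_MODE:
        attn_fn = ffmodel.inc_multihead_self_attention_verify
    else:  # InferenceMode.INC_DECODING_MODE
        attn_fn = ffmodel.inc_multihead_self_attention
    return attn_fn(x, embed_dim, num_heads, name="self_attn")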
+ :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. 
If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multihead_self_attention_verify( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. 
Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. 
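# Sketch (assumption): the multi-query variant documented above decouples the
# number of query heads from the number of key/value heads, e.g. 32 query heads
# sharing 8 KV heads (grouped-query attention). Placeholder names; the argument
# order follows the signature above, and the import path is assumed.
from flexflow.core import DataType  # assumed re-export of the enum

def _gqa_layer(ffmodel, hidden_states, layer_idx):
    return ffmodel.inc_multiquery_self_attention(
        hidden_states,
        4096,                  # embed_dim
        32,                    # num_q_heads
        8,                     # num_kv_heads
        0, 0,                  # kdim, vdim (keep defaults)
        0.0,                   # dropout
        False, False, False,   # bias, add_bias_kv, add_zero_attn
        DataType.DT_NONE,      # inherit the input data type
        None,                  # kernel_initializer
        True,                  # apply_rotary_embedding
        name=f"layers.{layer_idx}.self_attn",
    )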
+ :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. - if num_dim == 2: - full_tensor = self.create_tensor( - [num_samples, full_array_shape[1]], datatype - ) - self.map_tensor(full_tensor) - elif num_dim == 4: - full_tensor = self.create_tensor( - [ - num_samples, - full_array_shape[1], - full_array_shape[2], - full_array_shape[3], - ], - datatype, - ) - self.map_tensor(full_tensor) - else: - assert 0, "unsupported dims" + :param input: the input Tensor. + :type input: Tensor - full_tensor.attach_numpy_array(self._ffconfig, full_array) - dataloader = SingleDataLoader( - self, batch_tensor, full_tensor, num_samples, datatype - ) - full_tensor.detach_numpy_array(self._ffconfig) + :param embed_dim: total dimension of the model + :type embed_dim: int - return dataloader + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int - def __create_data_loader_ptr(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - if full_array.dtype == "float16": - datatype = DataType.DT_HALF - elif full_array.dtype == "float32": - datatype = DataType.DT_FLOAT - elif full_array.dtype == "int32": - datatype = DataType.DT_INT32 - elif full_array.dtype == "int64": - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" - np_raw_ptr = full_array.__array_interface__["data"] - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - print( - "numpy array: %s, %s, %s" - % (str(np_raw_ptr), str(raw_ptr), hex(np_raw_ptr[0])) - ) - dataloader = SingleDataLoader( - self, batch_tensor, raw_ptr, num_samples, datatype - ) + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int - return dataloader + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def __get_initializer_handle(self, initializer): - if initializer == None: - null_initializer = Initializer(None) - return null_initializer.handle - else: - return initializer.handle + :param vdim: total number of features in value. 
Default is 0 + :type vdim: int - def __get_op_handle(self, shared_op): - if shared_op == None: - op_handle = ffi.new("flexflow_op_t *") - op_handle.impl = ffi.NULL - op = Op(op_handle[0]) - else: - op = shared_op - return op.handle + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) - def get_output_tensor(self, ffmodel, data_type): - shape = self.dims - if data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__["data"] - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_float( - self.handle, ffmodel.handle, raw_ptr, False - ) - elif np_array.dtype == np.int32: - raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int( - self.handle, ffmodel.handle, raw_ptr, False - ) - elif np_array.dtype == np.int64: - raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int64( - self.handle, ffmodel.handle, raw_ptr, False - ) - fflogger.debug( - "get weights raw_ptr: %s, %s, %s, %s" - % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) - ) - assert ret_val == True - return np_array + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool - def generate(self, prompt_list, max_sequence_length): - assert isinstance(prompt_list, list) - c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - max_num_chars = 5 * (max_sequence_length + 100) - c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] - c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] - ffc().flexflow_model_generate( - self.handle, - len(prompt_list), - c_input_texts, - max_num_chars, - c_output_texts, - max_sequence_length, - c_output_length_and_tokens, - ) - #output_length = c_output_length_and_tokens[0] - #output_tokens = [] - #for i in range(output_length): - # output_tokens.append(c_output_length_and_tokens[i + 1]) - from flexflow.serve import GenerationResult + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool - return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - def set_position_offset(self, offset): - ffc().flexflow_model_set_position_offset(self.handle, offset) + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer -# ----------------------------------------------------------------------- -# SGDOptimizer -# ----------------------------------------------------------------------- + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. 
+ :type apply_rotary_embedding: bool + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool -class SGDOptimizer(object): - __slots__ = ["handle", "_handle"] + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float - def __init__( - self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 - ): - self.handle = ffc().flexflow_sgd_optimizer_create( - ffmodel.handle, lr, momentum, nesterov, weight_decay + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, ) - self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) - def set_learning_rate(self, learning_rate): - ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) + def inc_multiquery_self_attention_verify( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + :param embed_dim: total dimension of the model + :type embed_dim: int -# ----------------------------------------------------------------------- -# AdamOptimizer -# ----------------------------------------------------------------------- + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int -class AdamOptimizer(object): - __slots__ = ["handle", "_handle"] + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def __init__( - self, - ffmodel, - alpha=0.001, - beta1=0.9, - beta2=0.999, - weight_decay=0.0, - epsilon=1e-8, - ): - self.handle = ffc().flexflow_adam_optimizer_create( - ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon - ) - self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. 
Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - def set_learning_rate(self, learning_rate): - ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer -# ----------------------------------------------------------------------- -# Initializer -# ----------------------------------------------------------------------- -class Initializer(object): - __slots__ = ["handle", "p_handle"] + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool - def __init__(self, handle, p_handle=0): - self.p_handle = ffi.new("flexflow_initializer_t *") - if handle == None: - self.p_handle.impl = ffi.NULL - else: - self.p_handle.impl = handle.impl - self.handle = self.p_handle[0] - assert ffi.typeof(self.handle) == ffi.typeof( - "flexflow_initializer_t" - ), "Initializer handle is wrong" + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float -# ----------------------------------------------------------------------- -# GlorotUniform -# ----------------------------------------------------------------------- + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool -class GlorotUniformInitializer(Initializer): - __slots__ = ["glorot_handle", "_glorot_handle"] + :param name: the name of the layer. Default is None. + :type name: string - def __init__(self, seed): - self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) - self._glorot_handle = ffi.gc( - self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, ) - super(GlorotUniformInitializer, self).__init__(self.glorot_handle) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + def rms_norm(self, input, eps, dim, name=None): + """Defines the RMS Norm layer. 
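# Reference computation (assumption: standard RMS Norm as used by LLaMA-style
# models; the docstring above does not spell out the formula). The input is
# scaled by the reciprocal root-mean-square over the normalized dimension and
# then by a learned weight.
import numpy as np

def rms_norm_ref(x, weight, eps=1e-6):
    rms = np.sqrt(np.mean(np.square(x), axis=-1, keepdims=True) + eps)
    return (x / rms) * weight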
-# ----------------------------------------------------------------------- -# ZeroInitializer -# ----------------------------------------------------------------------- + :param input: the input Tensor. + :type input: Tensor + + :param eps: a value added to the denominator for numerical stability + :type eps: float + :param dim: The dimension with respect to which to take the norm + :type dim: int -class ZeroInitializer(Initializer): - __slots__ = ["zero_handle", "_zero_handle"] + :param name: the name of the layer. Default is None. + :type name: string - def __init__(self): - self.zero_handle = ffc().flexflow_zero_initializer_create() - self._zero_handle = ffi.gc( - self.zero_handle, ffc().flexflow_zero_initializer_destroy + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_rms_norm( + self.handle, input.handle, eps, dim, c_name ) - super(ZeroInitializer, self).__init__(self.zero_handle) + self.add_layer(OpType.RMS_NORM, name) + return Tensor(handle, owner_op_type=OpType.RMS_NORM) + def residual_rms_norm( + self, input1, input2, eps, dim, inplace_residual=False, name=None + ): + """Defines the Residual RMS Norm layer. -# ----------------------------------------------------------------------- -# UniformInitializer -# ----------------------------------------------------------------------- + :param input: the input 1 Tensor. + :type input: Tensor + :param input: the input 2 Tensor. + :type input: Tensor -class UniformInitializer(Initializer): - __slots__ = ["uniform_handle", "_uniform_handle"] + :param eps: a value added to the denominator for numerical stability + :type eps: float - def __init__(self, seed, minv, maxv): - self.uniform_handle = ffc().flexflow_uniform_initializer_create( - seed, minv, maxv + :param dim: The dimension with respect to which to take the norm + :type dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :param inplace_residual: whether to compute the residual inplace using the input tensor. Default is False. + :type inplace_residual: bool + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handles_array = ffc().flexflow_model_add_residual_rms_norm( + self.handle, + input1.handle, + input2.handle, + eps, + dim, + inplace_residual, + c_name, ) - self._uniform_handle = ffi.gc( - self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + self.add_layer(OpType.RESIDUAL_RMS_NORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), + Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM), ) - super(UniformInitializer, self).__init__(self.uniform_handle) + def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): + """Defines the Arg TopK layer. -# ----------------------------------------------------------------------- -# NormInitializer -# ----------------------------------------------------------------------- + :param input: the input Tensor. 
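# Sketch (assumption, mirroring the llama.py changes later in this patch):
# residual_rms_norm returns two tensors, the residual sum and its normalized
# view, so a decoder block can keep threading the sum as the running residual.
def _post_attention_norm(ffmodel, residual, attn_out, layer_idx, hidden_size=4096):
    residual, ff_norm = ffmodel.residual_rms_norm(
        residual,
        attn_out,
        1e-6,          # eps
        hidden_size,   # dim
        name=f"layers.{layer_idx}.post_attention_layernorm",
    )
    return residual, ff_norm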
+ :type input: Tensor + :param k: the top k indices to select + :type k: int -class NormInitializer(Initializer): - __slots__ = ["norm_handle", "_norm_handle"] + :param sorted: Whether the entries should be sorted + :type sorted: bool - def __init__(self, seed, mean, stddev): - self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) - self._norm_handle = ffi.gc( - self.norm_handle, ffc().flexflow_norm_initializer_destroy - ) - super(NormInitializer, self).__init__(self.norm_handle) + :param speculative_decoding: Whether you need to perform beam search + :type speculative_decoding: bool + :param name: the name of the layer. Default is None. + :type name: string -# ----------------------------------------------------------------------- -# PerfMetrics -# ----------------------------------------------------------------------- + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_arg_top_k( + self.handle, input.handle, k, sorted, c_name + ) + self.add_layer(OpType.ARG_TOPK, name) + return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + def beam_top_k(self, input, max_beam_size, sorted, name=None): + """Defines the Beam TopK layer. -class PerfMetrics(object): - __slots__ = ["handle", "_handle"] + :param input: the input Tensor. + :type input: Tensor - def __init__(self, handle): - self.handle = handle - self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) + :param max_beam_size: the top max_beam_size indices to select + :type max_beam_size: int - def get_accuracy(self): - return ffc().flexflow_per_metrics_get_accuracy(self.handle) + :param sorted: Whether the entries should be sorted + :type sorted: bool + :param name: the name of the layer. Default is None. + :type name: string -# ----------------------------------------------------------------------- -# NetConfig -# ----------------------------------------------------------------------- + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_beam_top_k( + self.handle, input.handle, max_beam_size, sorted, c_name + ) + self.add_layer(OpType.BEAM_TOPK, name) + return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + def sampling(self, input, top_p, name=None): + """Defines the Sampling layer. -class NetConfig(object): - def __init__(self): - self.handle = ffc().flexflow_net_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) - cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cpath) + :param input: the input Tensor. + :type input: Tensor + :param top_p: The top_p parameter of the sampling + :type top_p: float -# ----------------------------------------------------------------------- -# DLRMConfig -# ----------------------------------------------------------------------- + :param name: the name of the layer. Default is None. + :type name: string + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sampling( + self.handle, input.handle, top_p, c_name + ) + self.add_layer(OpType.SAMPLING, name) + return Tensor(handle, owner_op_type=OpType.SAMPLING) -class DLRMConfig(object): - def __init__(self): - self.handle = ffc().flexflow_dlrm_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) + def argmax(self, input, beam_search, name=None): + """Defines the Sampling layer. 
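# Sketch (assumption, following the decoding-head pattern used by the model
# files later in this patch): apply softmax to the logits, then either sample
# with top-p or take the argmax.
def _decode_head(ffmodel, logits, do_sample, topp=0.9):
    probs = ffmodel.softmax(logits, -1)
    if do_sample:
        return ffmodel.sampling(probs, topp)
    return ffmodel.argmax(probs, False)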
- cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cstr) + :param input: the input Tensor. + :type input: Tensor - cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) - self.arch_interaction_op = ffi.string(cstr) + :param beam_search: Whether you need to perform beam search + :type beam_search: bool - self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( - self.handle - ) - self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) - self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) - self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( - self.handle - ) - self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) + :param name: the name of the layer. Default is None. + :type name: string - mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) - self.mlp_bot = [] - for i in range(0, mlp_bot_c[0]): - self.mlp_bot.append(mlp_bot_c[i + 1]) + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_argmax( + self.handle, input.handle, beam_search, c_name + ) + self.add_layer(OpType.ARGMAX, name) + return Tensor(handle, owner_op_type=OpType.ARGMAX) - mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) - self.mlp_top = [] - for i in range(0, mlp_top_c[0]): - self.mlp_top.append(mlp_top_c[i + 1]) + def add_lora_layer(self, peft_config): + return ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) - embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) - self.embedding_size = [] - for i in range(0, embedding_size_c[0]): - self.embedding_size.append(embedding_size_c[i + 1]) + def reset_metrics(self): + """Reset performance metrics. + :returns: None -- no returns. + """ + ffc().flexflow_model_reset_metrics(self.handle) -# ----------------------------------------------------------------------- -# Single DataLoader -# ----------------------------------------------------------------------- + def init_layers(self): + """Initialize layers. + :returns: None -- no returns. + """ + ffc().flexflow_model_init_layers(self.handle) -class SingleDataLoader(object): - __slots__ = ["handle", "_handle"] + def prefetch(self): + ffc().flexflow_model_prefetch(self.handle) - def __init__(self, ffmodel, input, full_input, num_samples, data_type): - assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" - assert type(input) is Tensor, "SingleDataLoader input is wrong" - if type(full_input) is Tensor: - self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) - else: - self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) - self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) + def forward(self, seq_length=None): + """Forward propagation of all layers. - def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): - assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create( - ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type - ) + :returns: None -- no returns. 
+ """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_forward(self.handle, seq_length) - def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): - # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create2( - ffmodel.handle, input.handle, full_input, num_samples, c_data_type - ) + # TODO: seperate compute_metrics from backward + def backward(self, seq_length=None): + """Backward propagation of all layers. - @property - def num_samples(self): - return ffc().flexflow_single_dataloader_get_num_samples(self.handle) + :returns: None -- no returns. + """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_backward(self.handle, seq_length) - @num_samples.setter - def num_samples(self, samples): - ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) + def compute_metrics(self): + """Compute performance metrics. - def next_batch(self, ffmodel): - """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. + :returns: None -- no returns. + """ + ffc().flexflow_model_compute_metrics(self.handle) + + def update(self): + """Update weights and biases of all layers. :returns: None -- no returns. """ - ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) + ffc().flexflow_model_update(self.handle) - def reset(self): - """Reset the current position of the dataloder to 0. + def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): + """Configure the model for trainting. FlexFlow uses lazy initialization, + so the actual creating of all operations (including creating and partitioning + of weight, bias and output tensors) happen during compile. + + :param optimizer: optimizer instance. + :type optimizer: Optimizer + + :param loss_type: Enum of LossType. + Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. + :type loss_type: LossType + + :param metrics: List of metrics to be evaluated by the model during training and testing. + Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, + METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, + METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR + :type metrics: MetricsType + + :param comp_mode: Enum of CompMode. + Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE + :type comp_mode: CompMode :returns: None -- no returns. """ - ffc().flexflow_single_dataloader_reset(self.handle) + self.optimizer = optimizer + c_loss_type = enum_to_int(LossType, loss_type) + metrics_int = [] + for metric in metrics: + metrics_int.append(enum_to_int(MetricsType, metric)) + c_metrics = ffi.new("int[]", metrics_int) + if comp_mode == None: + comp_mode = CompMode.TRAINING + c_comp_mode = enum_to_int(CompMode, comp_mode) + ffc().flexflow_model_compile( + self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + ) + for ff_tensor, np_tensor in self.attr_tensors.items(): + ff_tensor.set_tensor(self, np_tensor) + print("Compiled ffmodel!") -class RegionNdarray(object): - __slots__ = ["__array_interface__"] + def fit(self, x=None, y=None, batch_size=None, epochs=1): + """Trains the model for a fixed number of epochs (iterations on a dataset). 
- def __init__(self, shape, data_type, base_ptr, strides, read_only): - # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html - if data_type == DataType.DT_HALF: - field_type = " 0: + finetuning_losses = [ + c_finetuning_losses[i] for i in range(num_finetuning_losses[0]) + ] + results = [] + for c_output_text in c_output_texts: + results.append( + GenerationResult( + text=( + ffi.string(c_output_text) if c_output_text != ffi.NULL else None + ), + tokens=[], + finetuning_losses=finetuning_losses, + ) + ) + return results + + def set_position_offset(self, offset): + ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 5af077273d..fd29080a6a 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -15,7 +15,16 @@ from typing import Optional from ..type import * from flexflow.core import * -from .serve import LLM, SSM, GenerationConfig, GenerationResult +from .serve import ( + LLM, + SSM, + GenerationConfig, + GenerationResult, + LoraLinearConfig, + PEFTModelID, + Request, + RequestType, +) def __check_positive_int(configs_dict: dict, key: str): @@ -44,6 +53,9 @@ def init( offload_reserve_space_size: Optional[int] = None, use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, + enable_peft: Optional[bool] = None, + peft_activation_reserve_space_size: Optional[int] = None, + peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, @@ -69,9 +81,12 @@ def init( - tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1 - pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1 - offload: whether to enable offloading of the weights to CPU, defaults to False - - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False + - enable_peft: whether to enable the use of PEFT, defaults to False + - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -100,12 +115,18 @@ def init( :type pipeline_parallelism_degree: Optional[int], optional :param offload: whether to enable offloading of the weights to CPU, defaults to False :type offload: Optional[bool], optional - :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB :type offload_reserve_space_size: Optional[int], optional :param use_4bit_quantization: whether to use 4-bit quantization, defaults to False :type 
use_4bit_quantization: Optional[bool], optional :param use_8bit_quantization: whether to use 8-bit quantization, defaults to False :type use_8bit_quantization: Optional[bool], optional + :param enable_peft: whether to enable the use of PEFT, defaults to False + :type enable_peft: Optional[bool], optional + :param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + :type peft_activation_reserve_space_size: Optional[int], optional + :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB + :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False @@ -135,6 +156,9 @@ def init( offload_reserve_space_size is not None, use_4bit_quantization is not None, use_8bit_quantization is not None, + enable_peft is not None, + peft_activation_reserve_space_size is not None, + peft_weight_reserve_space_size is not None, profiling is not None, benchmarking is not None, inference_debugging is not None, @@ -161,6 +185,9 @@ def init( "offload_reserve_space_size": offload_reserve_space_size, "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, + "enable_peft": enable_peft, + "peft_activation_reserve_space_size": peft_activation_reserve_space_size, + "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "benchmarking": benchmarking, "inference_debugging": inference_debugging, @@ -182,6 +209,8 @@ def init( "tensor_parallelism_degree", "pipeline_parallelism_degree", "offload_reserve_space_size", + "peft_activation_reserve_space_size", + "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -200,11 +229,17 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024**2 + configs_dict["offload_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: configs_dict["use_8bit_quantization"] = False + if configs_dict.get("enable_peft", None) is None: + configs_dict["enable_peft"] = False + if configs_dict.get("peft_activation_reserve_space_size", None) is None: + configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 + if configs_dict.get("peft_weight_reserve_space_size", None) is None: + configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("benchmarking", None) is None: diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index e7f3914037..17bb894250 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -32,5 +32,8 @@ def __init__( def build_model(self): assert False, "Not implemented yet" + def convert_hf_weight_name(name): + assert False, "Not implemented yet" + def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 7a55da26ef..0e8fbcbd7d 100644 --- 
a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -124,7 +124,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) else: token, att_norm = ffmodel.residual_layer_norm( @@ -135,7 +135,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -153,7 +153,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -170,7 +170,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -187,7 +187,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) else: assert False @@ -197,7 +197,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h", + name=f"layers.{i}.mlp.dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -205,7 +205,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h", + name=f"layers.{i}.mlp.dense_4h_to_h", ) _, ln_f = ffmodel.residual_layer_norm( @@ -239,10 +239,18 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return (name.replace("transformer.h.", "layers.") + .replace("transformer.", "") + .replace("self_attention.dense", "self_attention.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -251,17 +259,12 @@ def convert_hf_model(model, dst_folder): else model.config.num_attention_heads ) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("transformer_h_", "layers_") - .replace("transformer_", "") - .replace("self_attention_dense", "attention_wo") - ) + name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") + if "self_attention.query_key_value" in name: + name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") + name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") + name_v = 
name.replace("self_attention.query_key_value", "self_attention.v_proj") q, k, v = torch.split( params, [ @@ -278,5 +281,5 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) # LM head weight model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 6b33030f62..96f0258572 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -62,7 +62,7 @@ def __init__( # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath - self.maxint = 2**31 - 1 + self.maxint = 2 ** 31 - 1 max_verify_tokens_per_batch = ( max_tokens_per_batch + self.llama_config.max_spec_tree_token_num ) @@ -106,7 +106,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="tok_embeddings", + name="embed_tokens", ) for i in range(self.llama_config.num_hidden_layers): @@ -117,7 +117,7 @@ def build_model(self, max_tokens_per_batch): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) else: token, attn_norm = ffmodel.residual_rms_norm( @@ -125,7 +125,7 @@ def build_model(self, max_tokens_per_batch): w2, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -145,7 +145,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -164,7 +164,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -193,21 +193,21 @@ def build_model(self, max_tokens_per_batch): mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm", + name=f"layers.{i}.post_attention_layernorm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1", + name=f"layers.{i}.mlp.gate_proj", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3", + name=f"layers.{i}.mlp.up_proj", ) multi = ffmodel.sigmoid_silu_multi(w1, w3) w2 = ffmodel.dense( @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2", + name=f"layers.{i}.mlp.down_proj", ) _, token = ffmodel.residual_rms_norm( @@ -230,7 +230,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output", + 
name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -246,28 +246,16 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(dense, 1, False) - output = ffmodel.argmax(dense, False) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return name.replace("model.", "") + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) + name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 92867fd498..b350ae106d 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -97,7 +97,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) axes = [ @@ -114,7 +114,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) else: hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -126,7 +126,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -148,7 +148,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -169,7 +169,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = ffmodel.inc_multihead_self_attention( @@ -190,7 +190,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) else: assert False @@ -204,7 +204,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_2", + name=f"layers.{i}.norm_2", ) # mlp layernorm_output = ffmodel.dense( @@ -212,7 +212,7 @@ def build_model(self, max_tokens_per_batch): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj", + name=f"layers.{i}.ffn.up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -220,7 +220,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj", + name=f"layers.{i}.ffn.down_proj", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -232,7 +232,7 @@ 
def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"transformer_norm_f", + name=f"norm_f", ) lm_head = ffmodel.dense( all_final_norm, @@ -249,18 +249,27 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace("transformer.blocks.", "layers.") + .replace("transformer.", "") + .replace("attn.out_proj", "attn.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.blocks.", "layers.").replace(".", "_") + name = FlexFlowMPT.convert_hf_weight_name(name) if "Wqkv" in name: - name_q = name.replace("attn_Wqkv", "attention_wq") - name_k = name.replace("attn_Wqkv", "attention_wk") - name_v = name.replace("attn_Wqkv", "attention_wv") + name_q = name.replace("attn.Wqkv", "attn.q_proj") + name_k = name.replace("attn.Wqkv", "attn.k_proj") + name_v = name.replace("attn.Wqkv", "attn.v_proj") q, k, v = torch.split( params, [ @@ -273,13 +282,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "out_proj" in name: - name = name.replace("attn_out_proj", "attention_wo") - params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) shutil.copy( - os.path.join(dst_folder, "transformer_wte_weight"), - os.path.join(dst_folder, "lm_head_weight"), + os.path.join(dst_folder, "wte.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index b715f5f35e..02668abf59 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -139,7 +139,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm", + name=f"layers.{i}.self_attn_layer_norm", ) else: hidden_states = ffmodel.add(token, positional_embedding) @@ -163,7 +163,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -203,7 +203,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): axes, 
self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_add_bias_residual_layer_norm", + name=f"layers.{i}.add_bias_residual_layer_norm", ) if not self.opt_config.do_layer_norm_before: @@ -226,14 +226,14 @@ def build_model(self, max_tokens_per_batch): self.opt_config.ffn_dim, ActiMode.AC_MODE_RELU, True, - name=f"layers_{i}_fc1", + name=f"layers.{i}.fc1", ) fc2 = ffmodel.dense( fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2", + name=f"layers.{i}.fc2", ) if not self.opt_config.do_layer_norm_before: @@ -245,7 +245,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm", + name=f"layers.{i}.final_layer_norm", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -263,7 +263,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="embed_tokens_weight_lm_head", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -279,30 +279,29 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace("decoder.", "") + .replace("model.", "") + .replace("self_attn.out_proj", "self_attn.o_proj") + .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias") + .replace( + ".final_layer_norm", ".add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") - .replace( - "_final_layer_norm", "_add_bias_residual_layer_norm" - ) # important to use the leading "_" to avoid matching the last LayerNorm - ) + name = FlexFlowOPT.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + os.path.join(dst_folder, "embed_tokens.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 37edaa4c40..2d4471201f 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -111,7 +111,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -121,7 +121,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wpe", + name="wpe", ) axes = [ @@ -139,7 +139,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1", + name=f"layers.{i}.ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE @@ -159,7 +159,7 @@ def build_model(self, 
max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer False, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.attn.c_attn", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -171,7 +171,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2", + name=f"layers.{i}.ln_2", ) # mlp @@ -181,7 +181,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc", + name=f"layers.{i}.mlp.c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -189,7 +189,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj", + name=f"layers.{i}.mlp.c_proj", ) _, ln_f = ffmodel.residual_layer_norm( @@ -200,7 +200,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f", + name=f"ln_f", ) lm_head = ffmodel.dense( ln_f, @@ -217,18 +217,19 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace(".", "_") - if "c_attn_weight" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + name = name.replace("transformer.h", "layers").replace("transformer.", "") + if "attn.c_attn.weight" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -241,10 +242,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_attn_bias" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + elif "attn.c_attn.bias" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -257,14 +258,14 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_proj_bias" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.bias" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) - elif "c_proj_weight" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.weight" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") 
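For illustration, the converters in this patch all follow the same pattern: the HuggingFace dotted module names (layers.{i}.self_attn.q_proj, attn.c_attn, ...) are kept as the on-disk file names, and any fused QKV tensor is split into separate q_proj/k_proj/v_proj files. Below is a minimal self-contained sketch of that pattern; the output folder, tensor sizes, and the use of Falcon's renaming scheme are made up for the example and are not taken from a real checkpoint:

import os
import torch

# Toy stand-ins: a hypothetical output folder and small tensor sizes instead of a real HF model.
dst_folder = "/tmp/ff_weight_naming_demo"
os.makedirs(dst_folder, exist_ok=True)
hidden_size, n_head, head_dim = 64, 4, 16

# A fused multi-query attention weight laid out as n_head query heads + 1 key head + 1 value head.
fused_qkv = torch.randn((n_head + 2) * head_dim, hidden_size)

hf_name = "transformer.h.0.self_attention.query_key_value.weight"
# Same renaming scheme as FlexFlowFalcon.convert_hf_weight_name in this patch: dotted names are
# kept, the "transformer." prefix is dropped, and "transformer.h." becomes "layers.".
ff_name = (
    hf_name.replace("transformer.h.", "layers.")
    .replace("transformer.", "")
    .replace("self_attention.dense", "self_attention.o_proj")
)

# Split the fused tensor and write one binary file per projection, mirroring convert_hf_model.
q, k, v = torch.split(fused_qkv, [n_head * head_dim, head_dim, head_dim], dim=0)
for proj, tensor in (("q_proj", q), ("k_proj", k), ("v_proj", v)):
    out_name = ff_name.replace("self_attention.query_key_value", f"self_attention.{proj}")
    tensor.detach().cpu().numpy().tofile(os.path.join(dst_folder, out_name))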
params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index ac622b3337..132c50995b 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -28,44 +28,38 @@ ) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi -import sys, torch, shutil, hashlib +import torch, shutil, hashlib, json, gc from typing import Union, List -class GenerationConfig: - """A class to store the sampling configs.""" - - def __init__( - self, - do_sample: bool = False, - temperature: float = 0.9, - topp: float = 0.8, - topk: int = 1, - ): - """Initialize the sampling configs - - :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False - :type do_sample: bool, optional - :param temperature: The temperature setting, defaults to 0.9 - :type temperature: float, optional - :param topp: The top probabilities (top-p) setting, defaults to 0.8 - :type topp: float, optional - :param topk: The top-k setting, defaults to 1 - :type topk: int, optional - """ - self.do_sample = do_sample - self.temperature = temperature - self.topp = topp - self.topk = topk - - -class GenerationResult: - """A class to store the output of a generation request.""" +class _SupportedModels: + def __init__(self,): + self.supported_models = { + "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), + "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "GPTBigCodeForCausalLM": ( + ModelType.STARCODER, + FlexFlowSTARCODER, + STARCODERConfig, + ), + "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), + } - def __init__(self, text: str = None, tokens: list = None): - self.output_text = text - self.output_tokens = tokens + def get_ff_model_type(self, hf_config): + architectures = getattr(hf_config, "architectures", []) + ff_arch = None + if next(iter(architectures), None) is not None: + ff_arch = self.supported_models.get(architectures[0]) + if ff_arch is None: + raise ValueError( + f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + ) + return ff_arch class LLM: @@ -92,68 +86,117 @@ def __init__( :param output_file: Path to the output file. 
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - self.supported_models = { - "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), - "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "GPTBigCodeForCausalLM": ( - ModelType.STARCODER, - FlexFlowSTARCODER, - STARCODERConfig, - ), - "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), - } + self.supported_models = _SupportedModels() self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path ( self.model_type, self.model_class, self.config_class, - ) = self.__get_ff_model_type() + ) = self.supported_models.get_ff_model_type(self.hf_config) self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file self.rm = None + self.pefts = {} def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() - def __get_ff_model_type(self): - architectures = getattr(self.hf_config, "architectures", []) - ff_arch = None - if next(iter(architectures), None) is not None: - ff_arch = self.supported_models.get(architectures[0]) - if ff_arch is None: - print( - f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + def add_peft(self, lora_config: LoraLinearConfig): + """Add a PEFT adapter to the LLM""" + if lora_config is None: + raise ValueError("lora_config cannot be None") + if len(lora_config.peft_model_id or "") == 0: + raise ValueError("PEFT model id cannot be empty") + # Inference (trainable=False): LoRA model should already exist in huggingface. 
Any changes of parameters from original model are ignored + # Training (trainable=True): Either an existing model (init_lora_weights=False) or a new one (init_lora_weights=True) + + if lora_config.trainable == False or not lora_config.init_lora_weights: + peft_config = PeftConfig.from_pretrained(lora_config.peft_model_id) + else: + peft_config = LoraConfig( + peft_type="LORA", + base_model_name_or_path=self.model_name, + r=lora_config.rank, + target_modules=lora_config.target_modules, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + init_lora_weights=lora_config.init_lora_weights, ) - sys.exit(1) - return ff_arch + if peft_config.peft_type != "LORA": + raise RuntimeError( + f"PEFT type {peft_config.peft_type} not yet supported in FlexFlow" + ) + if "base_model_name_or_path" not in peft_config.to_dict(): + raise ValueError( + f"PEFT model {lora_config.peft_model_id} does not have an associated base model" + ) + if peft_config.base_model_name_or_path != self.model_name: + raise RuntimeError( + f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" + ) + + self.pefts[lora_config] = { + "peft_config": peft_config, + "peft_type": peft_config.peft_type, + } + + def get_ff_peft_id(self, lora_config: LoraLinearConfig) -> PEFTModelID: + if lora_config is None: + raise ValueError("lora_config cannot be None") + if len(lora_config.peft_model_id or "") == 0: + raise ValueError("PEFT model id cannot be empty") + if lora_config not in self.pefts: + raise ValueError( + f"PEFT {lora_config} not registered with LLM {self.model_name}" + ) + if "ff_peft_model_id" not in self.pefts[lora_config]: + raise RuntimeError( + f"Attempting to run PEFT {lora_config} before compiling LLM {self.model_name}" + ) + + return self.pefts[lora_config]["ff_peft_model_id"] def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( + config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", self.model_name.lower() ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.model_name} configs to file {self.config_path}...") - self.hf_config.to_json_file(self.config_path) + config_path = os.path.join(config_dir, "config.json") + os.makedirs(config_dir, exist_ok=True) + print(f"Creating directory {config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {config_path}...") + self.hf_config.to_json_file(config_path) + + # Save PEFT configs if the LLM has any registered PEFTs + for ff_peft_config, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_model_id = ff_peft_config.peft_model_id + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", peft_model_id.lower() + ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) - def __get_revision_hashes(self, model_name: str, weights: bool): + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None - ff_revision_file = ( - os.path.join(self.weights_path, "rev_sha.txt") - if weights - else os.path.join(self.tokenizer_path, "rev_sha.txt") - ) + ff_revision_file = os.path.join(folder, "rev_sha.txt") + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -173,65 +216,109 @@ def __get_revision_hashes(self, model_name: str, weights: bool): def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. + + If any PEFT adapter is registered, perform the same operation for PEFT. """ - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - elif self.data_type == DataType.DT_FLOAT: - torch.set_default_tensor_type(torch.FloatTensor) - else: - assert False, "Data type not yet supported -- cannot download weights!" - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.model_name.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." 
+ def get_weights_path(model_name): + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=True - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model + def refresh_cache_if_needed(model_name): + weights_path = get_weights_path(model_name) + if self.refresh_cache: print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." ) - else: - # Remote model + if os.path.exists(weights_path): + shutil.rmtree(weights_path) + os.makedirs(weights_path, exist_ok=True) + + def get_hf_llm(model_name): + return AutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), + ) + + def download_llm_weights(): + refresh_cache_if_needed(self.model_name) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, self.weights_path + ) + if ff_revision != latest_revision: print( - f"'{self.model_name}' local model weights were updated! Converting new weights now..." + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True - ) - # Print log message to notify user download of model has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading HF weights. 
Converting them now...") - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - else: - print(f"Loading '{self.model_name}' model weights from the cache...") + hf_model = get_hf_llm(self.model_name) + # Convert the model to FlexFlow format + self.model_class.convert_hf_model(hf_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + def convert_peft_model(hf_peft_model, peft_type, weights_path): + for name, params in hf_peft_model.named_parameters(): + if peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_peft_weights(): + for ff_peft_config, peft_dict in self.pefts.items(): + if not ff_peft_config.init_lora_weights: + peft_config = peft_dict["peft_config"] + peft_type = peft_dict["peft_type"] + peft_model_id = ff_peft_config.peft_model_id + + weights_path = get_weights_path(peft_model_id) + refresh_cache_if_needed(peft_model_id) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + peft_model_id, weights_path + ) + + if ff_revision != latest_revision: + print( + f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) + hf_model = get_hf_llm(peft_model_id) + hf_peft_model = PeftModel.from_pretrained( + hf_model, peft_model_id, config=peft_config + ) + # Convert the model to FlexFlow format + convert_peft_model(hf_peft_model, peft_type, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {peft_model_id}") + # Deallocate hf model + del hf_peft_model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + self.weights_path = get_weights_path(self.model_name) + download_llm_weights() + download_peft_weights() def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -241,13 +328,11 @@ def download_hf_tokenizer_if_needed(self): # Use local cache, or download new version self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), - "tokenizers", - self.model_name.lower(), + os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower() ) if self.refresh_cache: print( - f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." + f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." 
) if os.path.exists(self.tokenizer_path): shutil.rmtree(self.tokenizer_path) @@ -257,46 +342,29 @@ def download_hf_tokenizer_if_needed(self): # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=False + self.model_name, self.tokenizer_path ) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." + ) # Download tokenizer from HuggingFace, or load it from the local folder - if self.model_type == ModelType.LLAMA: - hf_tokenizer = LlamaTokenizer.from_pretrained( - self.model_name, use_fast=True - ) - else: - hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") + hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) # Save tokenizer hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - else: - print(f"Loading '{self.model_name}' tokenizer from the cache...") - def compile( self, generation_config: GenerationConfig = GenerationConfig(), max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = None, model_specific_tensor_parallelism_degree: int = None, model_specific_pipeline_parallelism_degree: int = None, @@ -312,6 +380,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None @@ -321,9 +391,6 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - # self.max_requests_per_batch = max_requests_per_batch - # self.max_seq_length = max_seq_length - # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -355,6 +422,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate 
the relevant model self.model = self.model_class( @@ -366,16 +434,27 @@ def compile( max_tokens_per_batch, ) + # Download the config from huggingface + self.download_hf_config() + + # Download the tokenizer from huggingface (if needed) and load them + self.download_hf_tokenizer_if_needed() + # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() + # Add PEFT layer if registered + for ff_peft_config, peft_dict in self.pefts.items(): + ff_peft_config.ff_compile() + ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + # Create file data loader, load weights into tensors model_configs = self.config_class(self.hf_config) self.rm.set_max_spec_tree_token_num( model_configs.max_spec_tree_token_num - if "max_spec_tree_token_num" - in model_configs.__dict__ + if "max_spec_tree_token_num" in model_configs.__dict__ else 20 ) @@ -393,9 +472,6 @@ def compile( self.im = InferenceManager() self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) - # Download the tokenizer from huggingface (if needed) and load them - self.download_hf_tokenizer_if_needed() - # Create tokenizer (this must be done after we have downloaded the tokenizer bos_token_id = ( -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id @@ -419,22 +495,36 @@ def compile( atexit.register(self.rm.stop_server) - def generate(self, prompts: Union[str, List[str]], max_length: int = 128): + def generate( + self, + requests_or_prompts: Union[str, List[str], Request, List[Request]], + max_length: int = 128, + ): """Generate tokens based on the input prompt(s) - :param prompts: The generation prompt(s) in the form of a string, or list of strings - :type prompts: Union[str, List[str]] + :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests + :type requests_or_prompts: Union[str, List[str], Request, List[Request]] :return: the generation results :rtype: GenerationResult """ - if type(prompts) == str: - if len(prompts) == 0: + if type(requests_or_prompts) == str: + if len(requests_or_prompts) == 0: return None - return self.model.ffmodel.generate([prompts], max_length) - elif type(prompts) == list: - if len(prompts) == 0: + return self.model.ffmodel.generate_inf_only( + [requests_or_prompts], max_length + ) + elif type(requests_or_prompts) == Request: + return self.model.ffmodel.generate(requests_or_prompts) + elif type(requests_or_prompts) == list: + if len(requests_or_prompts) == 0: return [] - return self.model.ffmodel.generate(prompts, max_length) + if type(requests_or_prompts[0]) == str: + return self.model.ffmodel.generate_inf_only( + requests_or_prompts, max_length + ) + else: + print(requests_or_prompts) + return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" @@ -446,17 +536,6 @@ def stop_server(self): self.rm.stop_server() print("Background server stopped.") - def __enter__(self): - # Start the server when entering the context - # self.rm.start_server(self.model.ffmodel) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # Stop the server when exiting the context - # self.rm.stop_server() - if exc_type: - print(f"Exception occurred: {exc_value}") - class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -482,13 +561,7 @@ def __init__( :param output_file: Path to the output file. 
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - super().__init__( - model_name, - data_type, - cache_path, - refresh_cache, - output_file, - ) + super().__init__(model_name, data_type, cache_path, refresh_cache, output_file) def compile( self, @@ -496,15 +569,13 @@ def compile( max_requests_per_batch: int = 16, max_seq_length: int = 256, max_tokens_per_batch: int = 128, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory - - :param mode: The SSM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE - :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional :param max_requests_per_batch: The maximum batch size to allow, defaults to 16 @@ -513,6 +584,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 @@ -527,6 +600,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + enable_peft_finetuning, model_specific_data_parallelism_degree, model_specific_tensor_parallelism_degree, model_specific_pipeline_parallelism_degree, diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 994a85f57e..0f4726837c 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -46,6 +46,12 @@ class LossType(Enum): LOSS_IDENTITY = 54 +class OptimizerType(Enum): + OPTIMIZER_TYPE_NONE = 60 + OPTIMIZER_TYPE_SGD = 61 + OPTIMIZER_TYPE_ADAM = 62 + + class CompMode(Enum): TRAINING = 70 INFERENCE = 71 @@ -153,6 +159,11 @@ class OpType(Enum): RESIDUAL_LAYERNORM = 2306 +class RequestType(Enum): + REQ_INFERENCE = 4001 + REQ_FINETUNING = 4002 + + def enum_to_int(enum, enum_item): for item in enum: if enum_item == item: diff --git a/rdelacou/generate_trace.py b/rdelacou/generate_trace.py new file mode 100644 index 0000000000..986dab37df --- /dev/null +++ b/rdelacou/generate_trace.py @@ -0,0 +1,121 @@ +import pandas as pd +from math import ceil +from random import shuffle, uniform +import json, pickle, requests, os, argparse + +class TraceBuilder(object): + + # trace_type: either "conv" or "code" + def __init__(self, import_times=True, import_prompts=True): + self.req_times = None + self.imported_req_times = False + self.prompt_data = None + self.imported_prompt_data = False + if import_times: + self.import_trace_timestamps() + if import_prompts: + self.import_prompt_data() + + def 
import_trace_timestamps(self, trace_type="conv"): + if not self.imported_req_times: + # Import Microsoft LLM 1 hour trace + df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"]) + req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds + req_times = req_times - req_times.min() + self.req_times = req_times.tolist() + self.imported_req_times = True + + def import_prompt_data(self, shuffle_=True): + if not self.imported_prompt_data: + sharegpt_filename = "sharegpt_opt_text_completion_length.pkl" + sharegpt_filepath = f"./{sharegpt_filename}" + if os.path.exists(sharegpt_filepath): + os.remove("sharegpt_opt_text_completion_length.pkl") + sharegpt_url = f"https://github.com/sosp-ae-39/sosp-ae-astra/raw/main/datasets/{sharegpt_filename}" + response = requests.get(sharegpt_url) + with open(sharegpt_filename, "wb") as file: + file.write(response.content) + with open(sharegpt_filepath, 'rb') as f: + data2 = pickle.load(f) + os.remove("sharegpt_opt_text_completion_length.pkl") + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + + for pair in data2: + assert(len(pair) == 2) + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + num_pairs = len(prompt_lengths) + assert(num_pairs == len(generation_lengths)) + print("Number of conversation pairs: ", num_pairs) + + print(f"Prompt lengths: min={min(prompt_lengths)}, max={max(prompt_lengths)}, avg={sum(prompt_lengths)/len(prompt_lengths)}") + print(f"Generation lengths: min={min(generation_lengths)}, max={max(generation_lengths)}, avg={sum(generation_lengths)/len(generation_lengths)}") + total_lengths = [prompt_lengths[i] + generation_lengths[i] for i in range(len(prompt_lengths))] + print(f"Total lengths: min={min(total_lengths)}, max={max(total_lengths)}, avg={sum(total_lengths)/len(total_lengths)}") + + self.prompt_data = [{"human": prompt_lengths[i], "gpt": generation_lengths[i]} for i in range(num_pairs)] + + if shuffle_: + shuffle(self.prompt_data) + self.imported_prompt_data = True + + # Delta is in seconds + # Rate is in req per second + def generate_trace(self, target_arrival_rate=10, debug_verbose=False): + self.import_trace_timestamps() + self.import_prompt_data() + + microsec = 1000000 + avg_arrival_rate = len(self.req_times) / (self.req_times[-1]/float(microsec)) # Request per second. 
Computed that way to enforce working with numbers of reasonable orders of magnitude + if debug_verbose: + print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate) + scale_factor = float(target_arrival_rate) / avg_arrival_rate + if debug_verbose: + print("Scale factor to obtain target arrival rate: ", scale_factor) + + # Buckets are 1 second timeframes + nb_buckets = ceil(self.req_times[-1] / microsec) + buckets = [] + j = 0 + k = 0 + for i in range(nb_buckets): + bucket_size = 0 + while(j < len(self.req_times) and self.req_times[j] >= i*microsec and self.req_times[j] < (i+1)*microsec): + bucket_size += 1 + j += 1 + bucket_size = bucket_size*scale_factor + prob = bucket_size - int(bucket_size) + bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob) + + # If used all of the prompt data, loop back at the beggining and reuse some prompts + if k+bucket_size > len(self.prompt_data): + bucket = self.prompt_data[k:] + self.prompt_data[:(k+bucket_size)%len(self.prompt_data)] + else: + bucket = self.prompt_data[k:k+bucket_size] + k = (k+bucket_size) % len(self.prompt_data) + buckets.append(bucket) + + if debug_verbose: + print("Avg arrival rate obtained (req/s): ", sum([len(b) for b in buckets])/len(buckets)) + return buckets + +def generate_and_save_trace(arrival_rate, output_file): + builder = TraceBuilder() + trace = builder.generate_trace(target_arrival_rate=arrival_rate, debug_verbose=True) + with open(output_file, 'w+') as f: + json.dump(trace, f, indent=2) + +if __name__ == '__main__': + # Set up the argument parser + parser = argparse.ArgumentParser(description='Generate and save a trace.') + parser.add_argument('--arrival-rate', type=float, default=10.0, help='The target arrival rate for the trace.') + parser.add_argument('--output-file', type=str, default='sharegpt.json', help='The path to the output file to save the trace.') + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function with the user-provided arrival rate + generate_and_save_trace(args.arrival_rate, args.output_file) diff --git a/requirements.txt b/requirements.txt index ad65622367..64f1808934 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,11 @@ transformers>=4.31.0 sentencepiece einops pip +# peft-related +scipy +bitsandbytes +datasets +accelerate +loralib +triton +peft diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 5714c8fe3d..e39cb29037 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -67,6 +67,13 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_lora_optimizer_config_t, LoraOptimizerConfig + // *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_sgd_optimizer_config_t, + // LoraSGDOptimizerConfig *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_lora_adam_optimizer_config_t, + // LoraAdamOptimizerConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); }; Logger ffc_log("flexflow_c"); @@ -649,6 +656,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -672,6 +680,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual, 
input->data_type, name); assert(tensor_outputs[0] != nullptr); @@ -679,7 +688,7 @@ flexflow_tensor_t * DEBUG_PRINT("[ResidualLayerNorm] input %p, residual1 %p, residual2 " "%p, output0: %p, " "output1: %p, use_two_residuals: %d, elementwise_affine %d, eps " - "%f, use_bias: %d, name %s", + "%f, use_bias: %d, inplace_residual: %d, name %s", input, residual1, residual2, @@ -689,6 +698,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -706,6 +716,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -722,13 +733,14 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, input->data_type, name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); DEBUG_PRINT("[AddBiasResidualLayerNorm] input %p, residual %p, output0: %p, " "output1: %p, elementwise_affine %d, eps " - "%f, use_bias %d, name %s", + "%f, use_bias %d, inplace_residual: %d, name %s", input, residual, tensor_outputs[0], @@ -736,6 +748,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -1469,13 +1482,20 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input1 = FFCObjectWrapper::unwrap(input1_); Tensor input2 = FFCObjectWrapper::unwrap(input2_); Tensor tensor_outputs[2]; - handle->residual_rms_norm( - input1, input2, tensor_outputs, eps, dim, input1->data_type, name); + handle->residual_rms_norm(input1, + input2, + tensor_outputs, + eps, + dim, + inplace_residual, + input1->data_type, + name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); flexflow_tensor_t *tensor_outputs_wrapped = @@ -1529,6 +1549,21 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, + const flexflow_lora_linear_config_t peft_config_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); + PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + + DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); + return FFCObjectWrapper::wrap(peft_model_id); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1584,39 +1619,83 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { void flexflow_model_generate(flexflow_model_t handle_, int num_requests, + enum RequestType *request_types, char const **input_texts, - int max_num_chars, char **output_texts, - int max_seq_length, - int **output_length_and_tokens) { + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int 
*training_steps, + int **output_length_and_tokens, + int *num_finetuning_losses, + float *finetuning_losses) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - std::vector prompts; + std::vector requests; + for (int i = 0; i < num_requests; i++) { - std::string const text_str(input_texts[i]); - prompts.push_back(text_str); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", - i, - handle, - text_str.c_str(), - max_seq_length); + if (request_types[i] == RequestType::REQ_INFERENCE) { + std::string const text_str(input_texts[i]); + Request inference_req; + inference_req.prompt = text_str; + inference_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + inference_req.peft_model_id = *peft_model_id; + } + requests.push_back(inference_req); + DEBUG_PRINT("[Model] generate[%d] %p %s %i", + i, + handle, + text_str.c_str(), + max_seq_lengths[i]); + } else if (request_types[i] == RequestType::REQ_FINETUNING) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + fine_tuning_req.peft_model_id = *peft_model_id; + } + std::string const dataset_fp(dataset_filepaths[i]); + fine_tuning_req.dataset_filepath = dataset_fp; + fine_tuning_req.max_training_steps = training_steps[i]; + requests.push_back(fine_tuning_req); + DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i", + i, + handle, + dataset_fp.c_str(), + max_seq_lengths[i], + training_steps[i]); + } else { + assert(false && "Unknown request type"); + } } - std::vector results = - handle->generate(prompts, max_seq_length); - // If the prompt exceeds max seq len, check that we return the prompt with no - // additional token. Otherwise, check that the output does not exceed the max - // sequence length. + + std::vector results = handle->generate(requests); + for (int i = 0; i < num_requests; i++) { - assert(results[i].output_tokens.size() <= max_seq_length || - results[i].output_tokens.size() == results[i].input_tokens.size()); - output_length_and_tokens[i][0] = results[i].output_tokens.size(); - std::copy(results[i].output_tokens.begin(), - results[i].output_tokens.end(), - output_length_and_tokens[i] + 1); - std::memcpy(output_texts[i], - results[i].output_text.c_str(), - results[i].output_text.length()); + if (request_types[i] == RequestType::REQ_INFERENCE) { + // If the prompt exceeds max seq len, check that we return the prompt with + // no additional token. Otherwise, check that the output does not exceed + // the max sequence length. 
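At the Python layer, the finetuning branch handled here is reached by passing a Request object (rather than a plain prompt string) to LLM.generate(). A rough sketch follows; the Request field names and the ff.Request / ff.RequestType import locations are assumed to mirror the C++ Request fields and the RequestType enum in this diff rather than taken from Python code shown here, and the dataset path is a placeholder:

import flexflow.serve as ff

# Assumes an LLM compiled with enable_peft_finetuning=True and a trainable LoRA adapter
# registered via add_peft() before compile(), i.e. the llm/lora objects from the earlier sketch.
ft_request = ff.Request(
    req_type=ff.RequestType.REQ_FINETUNING,    # routed to the REQ_FINETUNING branch above
    peft_model_id=llm.get_ff_peft_id(lora),    # PEFTModelID resolved after compile()
    dataset_filepath="/path/to/dataset.json",  # placeholder path
    max_training_steps=2,
    max_sequence_length=256,
)
result = llm.generate(ft_request)  # training losses come back via the finetuning_losses buffer above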
+ assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + results[i].output_tokens.size() == results[i].input_tokens.size()); + output_length_and_tokens[i][0] = results[i].output_tokens.size(); + std::copy(results[i].output_tokens.begin(), + results[i].output_tokens.end(), + output_length_and_tokens[i] + 1); + std::memcpy(output_texts[i], + results[i].output_text.c_str(), + results[i].output_text.length()); + } else if (request_types[i] == RequestType::REQ_FINETUNING) { + assert(results[i].finetuning_losses.size() > 0); + *num_finetuning_losses = results[i].finetuning_losses.size(); + // *finetuning_losses = results[i].finetuning_losses.data(); + std::memcpy(finetuning_losses, + results[i].finetuning_losses.data(), + results[i].finetuning_losses.size() * sizeof(float)); + } } - // return FFCObjectWrapper::wrap(&results[0]); } void flexflow_model_set_position_offset(flexflow_model_t handle_, @@ -2597,6 +2676,14 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_enable_peft_finetuning(enable_peft_finetuning_); + DEBUG_PRINT("[RequestManager] set_enable_peft_finetuning %d", + enable_peft_finetuning_); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -2730,3 +2817,238 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, FFModel *model = FFCObjectWrapper::unwrap(model_handle_); handle->load_weights(model); } + +// // ----------------------------------------------------------------------- +// // LoraSGDOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_sgd_optimizer_config_t +// flexflow_lora_sgd_optimizer_config_create( +// double lr, double momentum, bool nesterov, bool weight_decay) { +// LoraSGDOptimizerConfig *handle = +// new LoraSGDOptimizerConfig(lr, momentum, nesterov, weight_decay); +// DEBUG_PRINT("[LoraSGDOptimizerConfig] new %p", handle); +// return FFCObjectWrapper::wrap(handle); +// } + +// void flexflow_lora_sgd_optimizer_config_destroy( +// flexflow_lora_sgd_optimizer_config_t handle_) { +// LoraSGDOptimizerConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[LoraSGDOptimizerConfig] delete %p", handle); +// delete handle; +// } + +// // ----------------------------------------------------------------------- +// // LoraAdamOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_adam_optimizer_config_t +// flexflow_lora_adam_optimizer_config_create(double alpha, +// double beta1, +// double beta2, +// double weight_decay, +// double epsilon) { +// LoraAdamOptimizerConfig *handle = +// new LoraAdamOptimizerConfig(alpha, beta1, beta2, weight_decay, +// epsilon); +// DEBUG_PRINT("[LoraAdamOptimizerConfig] new %p", handle); +// return FFCObjectWrapper::wrap(handle); +// } + +// void flexflow_lora_adam_optimizer_config_destroy( +// flexflow_lora_adam_optimizer_config_t handle_) { +// LoraAdamOptimizerConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[LoraAdamOptimizerConfig] delete %p", handle); +// delete handle; +// } + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// 
----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_, + bool trainable, + bool init_lora_weights, + char const *base_model_name_or_path_, + char const *precision_, + int rank, + float lora_alpha, + float lora_dropout, + int num_target_modules, + char const **target_modules_, + enum OptimizerType optimizer_type, + float sgd_learning_rate, + float sgd_momentum, + bool sgd_nesterov, + float sgd_weight_decay, + float adam_alpha, + float adam_beta1, + float adam_beta2, + float adam_weight_decay, + float adam_epsilon) { + assert(cache_folder_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(peft_model_id_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(base_model_name_or_path_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(precision_ != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const cache_folder(cache_folder_); + std::string const peft_model_id(peft_model_id_); + LoraOptimizerConfig *optim_config = nullptr; + if (optimizer_type == OptimizerType::OPTIMIZER_TYPE_SGD) { + optim_config = new LoraSGDOptimizerConfig( + sgd_learning_rate, sgd_momentum, sgd_nesterov, sgd_weight_decay); + } else if (optimizer_type == OptimizerType::OPTIMIZER_TYPE_ADAM) { + optim_config = new LoraAdamOptimizerConfig( + adam_alpha, adam_beta1, adam_beta2, adam_weight_decay, adam_epsilon); + } + std::vector target_modules; + for (int i = 0; i < num_target_modules; i++) { + std::string const target_module(target_modules_[i]); + target_modules.push_back(target_module); + } + std::string const base_model_name_or_path(base_model_name_or_path_); + std::string const precision(precision_); + LoraLinearConfig *handle = new LoraLinearConfig(cache_folder, + peft_model_id, + trainable, + optim_config, + init_lora_weights, + base_model_name_or_path, + precision, + rank, + lora_alpha, + lora_dropout, + target_modules); + DEBUG_PRINT("[LoraLinearConfig] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_lora_linear_config_destroy( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *peft_config = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[LoraLinearConfig] delete %p", peft_config); + delete peft_config; +} + +char const *flexflow_lora_linear_config_get_cache_folder( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->cache_folder.c_str(); +} + +char const *flexflow_lora_linear_config_get_peft_model_id( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->peft_model_id.c_str(); +} + +int flexflow_lora_linear_config_get_rank( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->rank; +} + +float flexflow_lora_linear_config_get_lora_alpha( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->lora_alpha; +} + +float flexflow_lora_linear_config_get_lora_dropout( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->lora_dropout; +} + +bool flexflow_lora_linear_config_get_trainable( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return 
handle->trainable; +} + +bool flexflow_lora_linear_config_get_init_lora_weights( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->init_lora_weights; +} + +char const **flexflow_lora_linear_config_get_target_modules( + flexflow_lora_linear_config_t handle_, int *num_target_modules) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + *num_target_modules = handle->target_modules.size(); + static std::vector target_modules_; + target_modules_.clear(); + for (auto const &target_module : handle->target_modules) { + target_modules_.push_back(target_module.c_str()); + } + return target_modules_.data(); +} + +char const *flexflow_lora_linear_config_get_base_model_name_or_path( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->base_model_name_or_path.c_str(); +} + +char const *flexflow_lora_linear_config_get_precision( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->precision.c_str(); +} + +void flexflow_lora_linear_config_set_lora_alpha( + flexflow_lora_linear_config_t handle_, float value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->lora_alpha = value; +} + +void flexflow_lora_linear_config_set_lora_dropout( + flexflow_lora_linear_config_t handle_, float value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->lora_dropout = value; +} + +void flexflow_lora_linear_config_set_trainable( + flexflow_lora_linear_config_t handle_, bool value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->trainable = value; +} + +void flexflow_lora_linear_config_set_init_lora_weights( + flexflow_lora_linear_config_t handle_, bool value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->init_lora_weights = value; +} + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create() { + PEFTModelID *handle = new PEFTModelID(); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { + PEFTModelID *handle = new PEFTModelID(id); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); + delete peft_model_id; +} diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index a87aaade84..99c13f5a67 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -86,7 +86,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( num_classes, k); // Scale logit gradients by op->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -116,7 +116,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( 
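   // The intent of these loss-function hunks is presumably to make the scale
   // argument types explicit: the CUDA path below switches the literal 0 to
   // 0.0f, and the HIP path likely adds an explicit float template argument to
   // the launch so the kernel instantiates consistently. A sketch of the
   // resulting HIP launch (argument names taken from the surrounding code):
   //
   //   hipLaunchKernelGGL(scale_kernel<float>,
   //                      GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS,
   //                      0, stream,
   //                      logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor);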
label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -146,7 +146,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -173,7 +173,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(loss_grad_volume), CUDA_NUM_THREADS, 0, diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index f78311980c..636ef9c4c3 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -81,7 +81,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, label_ptr, num_samples, num_classes, k); // Scale logit gradients by op->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor * k); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -100,7 +100,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( @@ -119,7 +119,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, @@ -135,7 +135,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - loss_grad_ptr, loss_grad_volume, 0, scale_factor); + loss_grad_ptr, loss_grad_volume, 0.0f, scale_factor); } }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index a17e156f18..7a1da2e974 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -43,7 +43,8 @@ bool operator==(AddBiasResidualLayerNormParams const &lhs, AddBiasResidualLayerNormParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && - lhs.use_bias == rhs.use_bias; + lhs.use_bias == rhs.use_bias && + lhs.inplace_residual == rhs.inplace_residual; } bool AddBiasResidualLayerNormParams::is_valid( @@ -58,7 +59,8 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -71,6 +73,7 @@ void 
FFModel::add_bias_residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -171,6 +174,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, ln->add_int_property("use_bias", use_bias); ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -189,6 +193,8 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( layer->get_int_vector_property("axes", axes); float eps; layer->get_float_property("eps", eps); + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new AddBiasResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -197,6 +203,7 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -215,6 +222,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -227,6 +235,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -239,7 +248,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( _input, _residual), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias) { + use_bias(_use_bias), inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -348,48 +357,57 @@ void AddBiasResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } // attn output - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + // added: attn_output + attn final bias + residual + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -397,7 +415,7 @@ void AddBiasResidualLayerNorm::init_inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -420,48 +438,56 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - // attn output - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + // input: attn output + // added: attn_output + attn final bias + residual + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -469,7 +495,7 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -478,13 +504,11 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { } /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I/O): attn output AND added output (attn output + final attn bias + + residual) regions[1](I): residual regions[2](O): layer norm output + regions[3](I): final attn bias + regions[4](I): gamma + regions[5](I): beta */ OpMeta *AddBiasResidualLayerNorm::init_task( Task const *task, @@ -517,10 +541,6 @@ void AddBiasResidualLayerNorm::forward(FFModel const &ff) { assert(false); } -void AddBiasResidualLayerNorm::backward(FFModel const &ff) { - assert(false); -} - FutureMap AddBiasResidualLayerNorm::inference( FFModel const &ff, BatchConfigFuture const &bc, @@ -546,69 +566,94 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // attn output - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + // input + // added_output: input + attn bias + residual + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + weights[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - // layer norm output + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } + // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); - // attn final bias - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { + // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); - + launcher.add_field(fid++, FID_DATA); if (use_bias) { + // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } return runtime->execute_index_space(ctx, launcher); } +void AddBiasResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I): input / added output + regions[1](I): attn bias + regions[2](I): residual + regions[3](O): output + regions[4](I): gamma + regions[5](I): beta */ void AddBiasResidualLayerNorm::inference_task( Task const *task, @@ -626,30 +671,72 @@ void AddBiasResidualLayerNorm::inference_task( *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == - 5 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + 4 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); + + int rid = 0, tid = 0, did = 0; + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(m->input_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR attn_bias = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual = + helperGetGenericTensorAccessorRO(m->input_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(m->output_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); GenericTensorAccessorR gamma, beta; Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain attn_bias_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); Domain residual_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - Domain attn_bias_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain gamma_domain, beta_domain; assert(in_domain.get_volume() == out_domain.get_volume()); @@ -673,23 +760,23 @@ void AddBiasResidualLayerNorm::inference_task( if (m->elementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[5], - task->regions[5], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); if (m->use_bias) { beta = 
helperGetGenericTensorAccessorRO(m->weight_type[2], - regions[6], - task->regions[6], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); beta_domain = runtime->get_index_space_domain( - ctx, task->regions[6].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); assert(gamma_domain == beta_domain); } @@ -707,16 +794,7 @@ void AddBiasResidualLayerNorm::inference_task( } AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - (int)attn_bias_dim, - (int)residual_domain.get_volume(), - input, - added_output, - output, - residual, - attn_bias, - gamma, - beta); + m, bc, input, attn_bias, residual, added_output, output, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -729,13 +807,299 @@ void AddBiasResidualLayerNorm::inference_task( weights_accessors.push_back(beta); } } + AddBiasResidualLayerNorm::save_inference_tensors_to_file( + m, shard_id, bc, {residual}, weights_accessors, {added_output, output}); + } +} + +void AddBiasResidualLayerNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 5 + 
(m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW attn_bias_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + AddBiasResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual_grad, + attn_bias_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 3 + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } AddBiasResidualLayerNorm::save_inference_tensors_to_file( m, shard_id, bc, - {input, residual}, + {input_grad, residual_grad}, weights_accessors, - {added_output, output}); + {output_grad}, + false /*fwd_pass*/); } } @@ -755,6 +1119,7 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -771,6 +1136,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; float eps; + bool inplace_residual; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -785,6 +1151,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -796,6 +1163,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); @@ -816,6 +1184,7 @@ size_t hash::operator()( } hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); + 
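  // inplace_residual now participates in both AddBiasResidualLayerNormParams
  // operator== and this hash, so FFModel::get_or_create_node keeps in-place
  // and out-of-place variants as distinct cached nodes. A minimal sketch,
  // assuming p is some valid params value:
  //
  //   AddBiasResidualLayerNormParams a = p, b = p;
  //   a.inplace_residual = true;
  //   b.inplace_residual = false;
  //   assert(!(a == b));  // distinct params, hence distinct graph nodes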
hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 1add43ecd9..681f55c998 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -23,12 +23,13 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -45,6 +46,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -75,7 +77,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -84,9 +86,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -94,53 +94,36 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { } template -__global__ void LayerNormFusedForwardKernel(int attn_bias_dim, - int residual_volume, - int64_t effective_num_elements, - int64_t effective_batch_size, +__global__ void LayerNormFusedForwardKernel(int64_t N, + int64_t attn_bias_dim, float eps, T const *input_ptr, T const *attn_bias_ptr, T const *residual_ptr, - T *added_output_ptr, - T *output_ptr, - T const *gamma_ptr, - T const *beta_ptr, + T *X, T *mean, - T *rstd) { - // Add attention bias and residual - CUDA_KERNEL_LOOP(i, residual_volume) { - int bias_idx = i % attn_bias_dim; - added_output_ptr[i] = - input_ptr[i] + attn_bias_ptr[bias_idx] + residual_ptr[i]; - } - - __syncthreads(); - - // LayerNorm + T *rstd, + T const *gamma, + T const *beta, + T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - if (i >= effective_batch_size) { - return; - } float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < effective_num_elements; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { - const int64_t index = i * effective_num_elements + j; - sum1 += static_cast(added_output_ptr[index]); - sum2 += static_cast(added_output_ptr[index]) * - static_cast(added_output_ptr[index]); - } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const int64_t bias_idx = index % attn_bias_dim; 
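+    // One thread block processes one row of N elements. This first loop fuses
+    // the attention-bias + residual add into X while accumulating the row's
+    // sum and sum of squares; BlockReduceSum then reduces both across the
+    // block, thread 0 derives
+    //   mean = sum1 / N,  var = max(sum2 / N - mean^2, 0),  rstd = rsqrt(var + eps),
+    // and the second loop writes Y = (X - mean) * rstd * gamma + beta.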
+ X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(effective_num_elements); + float const scale = float(1) / static_cast(N); sum1 *= scale; sum2 = max(sum2 * scale - sum1 * sum1, float(0)); mean[i] = static_cast(sum1); @@ -150,17 +133,15 @@ __global__ void LayerNormFusedForwardKernel(int attn_bias_dim, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < effective_num_elements; - j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * effective_num_elements + j; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; const T_ACC gamma_v = - gamma_ptr == nullptr ? T_ACC(1) : static_cast(gamma_ptr[j]); + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); const T_ACC beta_v = - beta_ptr == nullptr ? T_ACC(0) : static_cast(beta_ptr[j]); - output_ptr[index] = (static_cast(added_output_ptr[index]) - - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; + beta == nullptr ? T_ACC(0) : static_cast(beta[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; } } @@ -178,57 +159,108 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, hipStream_t stream) { - - std::pair kernel1_parallelism = std::make_pair( - GET_BLOCKS(residual_volume), std::min(residual_volume, CUDA_NUM_THREADS)); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel3_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = std::max({kernel1_parallelism.first, - kernel2_parallelism.first, - kernel3_parallelism.first}); - int num_threads = std::max({kernel1_parallelism.second, - kernel2_parallelism.second, - kernel3_parallelism.second}); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormFusedForwardKernel), - num_blocks, - num_threads, + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), 0, stream, - attn_bias_dim, - residual_volume, m->effective_num_elements, - m->effective_batch_size, + attn_bias_dim, m->eps, input_ptr, attn_bias_ptr, residual_ptr, added_output_ptr, - output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); + output_ptr); } /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that 
requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -239,8 +271,8 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_float_ptr(), added_output.get_float_ptr(), output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, stream); } else if (m->input_type[0] == DT_HALF) { AddBiasResidualLayerNorm::inference_kernel( @@ -252,12 +284,566 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_half_ptr(), added_output.get_half_ptr(), output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); } else { assert(false && "unsupport datatype in layernorm"); } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] forward time (CF) = %.9fms\n", elapsed); + // if (m->input_type[0] == DT_FLOAT) { + // print_tensor(input.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor(gamma.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_float_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } else { + // print_tensor( + // input.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor( + // gamma.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } + // print_tensor(in_ptr, 32, "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual_i = dX_residual + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + 
GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } } }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514e..bcca1ba2c6 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -22,12 +22,13 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -44,6 +45,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -74,7 +76,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -83,9 +85,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -110,20 +110,17 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const int64_t bias_idx = index % attn_bias_dim; X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -135,7 +132,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -161,42 +158,33 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - attn_bias_dim, - m->eps, - input_ptr, - attn_bias_ptr, - residual_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + attn_bias_dim, + m->eps, + input_ptr, + attn_bias_ptr, + residual_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); } /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { cudaStream_t stream; @@ -208,6 +196,69 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { 
+ checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -297,4 +348,478 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual_i = dX_residual + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + 
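Reference note on the two GammaBetaBackward kernels above: both evaluate the same column-wise reductions (rows i index tokens, columns j index features, mu_i = mean[i], r_i = rstd[i]):

    d\gamma_j = \sum_{i=0}^{M-1} dY_{ij} \, (X_{ij} - \mu_i) \, r_i, \qquad
    d\beta_j = \sum_{i=0}^{M-1} dY_{ij}

The simple kernel assigns one thread per column and loops over all M rows, which the wrapper uses for small batches (the M < 512 branch); the tiled kernel stages partial sums in kColwiseReduceTileSize x (kColwiseReduceTileSize + 1) shared-memory tiles (the +1 padding avoids shared-memory bank conflicts) and finishes each column with warp reductions.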
cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 5f05458e34..c83b738a0e 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -85,7 +85,7 @@ AggregateParams Aggregate::get_params() const { AggregateParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -242,7 +242,7 @@ OpMeta *Aggregate::init_task(Task const *task, Runtime *runtime) { Aggregate *agg = (Aggregate *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateMeta *m = new AggregateMeta(handle, agg->n); + AggregateMeta *m = new AggregateMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -603,7 +603,7 @@ bool Aggregate::measure_operator_cost(Simulator *sim, return false; } - AggregateMeta *m = new AggregateMeta(sim->handler, n); + AggregateMeta *m = new AggregateMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate.cpp b/src/ops/aggregate.cpp index d5ebdb0c22..5a508cfac4 100644 --- a/src/ops/aggregate.cpp +++ b/src/ops/aggregate.cpp @@ -281,13 +281,14 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, out_dim); } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(hipMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(hipMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(hipFree(&dev_exp_preds)); checkCUDA(hipFree(&dev_exp_grads)); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/aggregate.cu b/src/ops/aggregate.cu index 38e141b252..9704302092 100644 --- a/src/ops/aggregate.cu +++ b/src/ops/aggregate.cu @@ -307,9 +307,10 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, } } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(cudaMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(cudaMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(cudaFree(&dev_exp_preds)); diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 1edd430881..6ea3ff3747 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -84,7 +84,7 @@ AggregateSpecParams AggregateSpec::get_params() const { AggregateSpecParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { 
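Note on the recurring get_params() change above and in later hunks: the old `this->name != nullptr` test presumably never failed because the operator name is a fixed-size character array, so it is replaced with a length check before copying into the equally fixed-size params.name. A minimal standalone sketch of the guarded copy; the MAX_OPNAME value and Params struct here are illustrative, not the project's definitions:

#include <cstdio>
#include <cstring>

constexpr int MAX_OPNAME = 128; // illustrative size only

struct Params {
  char name[MAX_OPNAME];
};

// Copy src into params.name only when it fits (terminator included),
// mirroring the strlen(this->name) < MAX_OPNAME guard in the diff.
void copy_op_name(Params &params, char const *src) {
  if (std::strlen(src) < MAX_OPNAME) {
    std::strcpy(params.name, src);
  }
}

int main() {
  Params p{};
  copy_op_name(p, "aggregate_0");
  std::printf("%s\n", p.name);
  return 0;
}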
strcpy(params.name, this->name); } return params; @@ -210,7 +210,7 @@ OpMeta *AggregateSpec::init_task(Task const *task, Runtime *runtime) { AggregateSpec *agg = (AggregateSpec *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg->n); + AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -543,7 +543,7 @@ bool AggregateSpec::measure_operator_cost(Simulator *sim, return false; } - AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, n); + AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate_spec.cpp b/src/ops/aggregate_spec.cpp index 314e20a59c..a676fa81c3 100644 --- a/src/ops/aggregate_spec.cpp +++ b/src/ops/aggregate_spec.cpp @@ -290,9 +290,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/aggregate_spec.cu b/src/ops/aggregate_spec.cu index 8d50d45d21..ac5a372efc 100644 --- a/src/ops/aggregate_spec.cu +++ b/src/ops/aggregate_spec.cu @@ -287,9 +287,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 780a77450e..534bac2419 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -112,7 +112,7 @@ ArgTopKParams ArgTopK::get_params() const { params.k = this->k; params.sorted = this->sorted; params.speculative_decoding = this->speculative_decoding; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -387,7 +387,7 @@ InferenceResult DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW probs; - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); ArgTopK::forward_kernel_wrapper( m, input, probs, indices, batch_size, nullptr); @@ -399,7 +399,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } @@ -431,9 +431,10 @@ BeamInferenceResult ArgTopK::inference_speculative_task( ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); - download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + copy_tensor_dev_to_host( + probs.get_float_ptr(), ir.probs, batch_size * m->k); return ir; } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 
1892ac2353..4123e50e7e 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -91,7 +91,7 @@ Op *ArgMax::create_operator_from_layer( ArgMaxParams ArgMax::get_params() const { ArgMaxParams params; params.beam_search = this->beam_search; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -314,7 +314,7 @@ FutureMap ArgMax::inference(FFModel const &ff, launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -348,15 +348,18 @@ BeamInferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); - ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + float loss = 0.0f; + ArgMax::forward_kernel_wrapper( + m, bc, input, indices, parent, batch_size, &loss); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); - download_tensor(m->probs, ir.probs, batch_size); - download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); + copy_tensor_dev_to_host(m->probs, ir.probs, batch_size); + copy_tensor_dev_to_host( + parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -383,23 +386,36 @@ InferenceResult return ir; } - GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW parent; - int batch_size = bc->num_active_tokens(); - ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + int batch_size = bc->num_active_infr_tokens(); + float loss = 0.0f; + + ArgMax::forward_kernel_wrapper( + m, bc, input, indices, parent, batch_size, &loss); + InferenceResult ir; + ir.finetuning_loss = loss; + + if (bc->num_active_peft_tokens() > 0) { + printf("Loss: %.4f\n", loss); + } + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; ArgMax::save_inference_tensors_to_file( - m, shard_id, bc, {}, {}, {input, indices}); + m, shard_id, bc, {input}, {}, {indices}); + } else { + m->decoding_step++; } - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); + return ir; } @@ -453,4 +469,4 @@ size_t hash::operator()( hash_combine(key, params.beam_search); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 8a1cf0b3b0..60d44cdf2b 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -334,6 +334,21 @@ __device__ void mergeShards(int num_shards, } } +template +__global__ void compute_sparse_categorical_crossentropy_loss( + DT const *logits, + BatchConfig::TokenId const *labels, + float *loss, + int num_tokens, + int num_classes) { + float const 
LOG_MIN_VALUE = 0.00000001f; + CUDA_KERNEL_LOOP(b, num_tokens) { + float my_logit = + max((float)logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(loss, -log(my_logit)); + } +} + template __global__ void argmax_forward_kernel(T const *__restrict__ input, size_t shared_memory_size, @@ -381,14 +396,16 @@ __global__ void copy_result(hipcub::KeyValuePair *d_out, /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent, int const length, int const batch_size, + float *loss, hipStream_t stream) { - checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); if (m->beam_search) { @@ -425,28 +442,77 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, k, prob_ptr, indices_ptr); + + // compute cross-entropy loss if there is a finetuning request + assert(loss != nullptr); + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int num_finetuning_requests = 0, num_bwd_tokens = 0; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_bwd) { + assert(num_finetuning_requests == 0 && num_bwd_tokens == 0); + num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + num_finetuning_requests += 1; + } else { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_finetuning_requests <= 1); + if (num_bwd_tokens > 0) { + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + // copy loss to d_loss + checkCUDA(hipMemsetAsync(m->d_loss, 0, sizeof(float), stream)); + compute_sparse_categorical_crossentropy_loss<<>>( + input_ptr, + static_cast(m->handle.workSpace), + m->d_loss, + num_bwd_tokens, + length); + // copy value from d_loss to loss + checkCUDA(hipMemcpyAsync( + loss, m->d_loss, sizeof(float), hipMemcpyDeviceToHost, stream)); + *loss = *loss / (float)num_bwd_tokens; + } } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size) { + int batch_size, + float *loss) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, + bc, input.get_half_ptr(), indices.get_int32_ptr(), m->probs, @@ -454,10 +520,12 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else if (input.data_type == DT_FLOAT) { ArgMax::forward_kernel(m, + bc, input.get_float_ptr(), indices.get_int32_ptr(), m->probs, @@ -465,6 +533,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 
05c84719c1..8a2e2da2d0 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -44,19 +44,35 @@ __global__ void copy_result(cub::KeyValuePair *d_out, } } +template +__global__ void compute_sparse_categorical_crossentropy_loss( + DT const *logits, + BatchConfig::TokenId const *labels, + float *loss, + int num_tokens, + int num_classes) { + float const LOG_MIN_VALUE = 0.00000001f; + CUDA_KERNEL_LOOP(b, num_tokens) { + float my_logit = + max((float)logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(loss, -log(my_logit)); + } +} + /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent, int const length, int const batch_size, + float *loss, cudaStream_t stream) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; + if (m->beam_search) { // set all parents id zero in arg top1 case. checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); @@ -73,7 +89,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, m->d_offsets + 1, stream)); - // copy dout to incides + // copy dout to indices int parallelism = batch_size; copy_result<<beam_search); // print_tensor(indices_ptr, 32, "argmax op"); + + // compute cross-entropy loss if there is a finetuning request + assert(loss != nullptr); + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int num_finetuning_requests = 0, num_bwd_tokens = 0; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_bwd) { + assert(num_finetuning_requests == 0 && num_bwd_tokens == 0); + num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + num_finetuning_requests += 1; + } else { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_finetuning_requests <= 1); + if (num_bwd_tokens > 0) { + checkCUDA(cudaMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + cudaMemcpyHostToDevice, + stream)); + // copy loss to d_loss + checkCUDA(cudaMemsetAsync(m->d_loss, 0, sizeof(float), stream)); + compute_sparse_categorical_crossentropy_loss<<>>( + input_ptr, + static_cast(m->handle.workSpace), + m->d_loss, + num_bwd_tokens, + length); + // copy value from d_loss to loss + checkCUDA(cudaMemcpyAsync( + loss, m->d_loss, sizeof(float), cudaMemcpyDeviceToHost, stream)); + *loss = *loss / (float)num_bwd_tokens; + } } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size) { + int batch_size, + float *loss) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -104,6 +170,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, + bc, input.get_half_ptr(), indices.get_int32_ptr(), m->probs, @@ -111,10 +178,12 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else if (input.data_type == DT_FLOAT) { 
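Reference note on the finetuning-loss path added above (in both the HIP and CUDA variants): for the single request with peft_bwd set, the labels are that request's own token ids shifted left by one (label j is the id of token j + 1), T = num_tokens_in_batch - 1 tokens are scored, and the kernel clamps the selected value to 1e-8 before taking the log. The value returned through *loss is therefore

    \text{loss} \;=\; -\frac{1}{T} \sum_{j=0}^{T-1} \log\big(\max(x_j[\,y_j\,],\, 10^{-8})\big)

where x_j is the logits row the kernel pairs with shifted label y_j. The kernel takes the log of the raw value at the label index without applying a softmax of its own, so this only corresponds to a cross-entropy if the incoming scores are already normalized probabilities.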
ArgMax::forward_kernel(m, + bc, input.get_float_ptr(), indices.get_int32_ptr(), m->probs, @@ -122,6 +191,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else { assert(false && "Unsupported data type"); @@ -202,6 +272,10 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); + + // allocate space for loss on device + gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(float)); + d_loss = gpu_mem_allocator.allocate_instance(1); } ArgMaxMeta::~ArgMaxMeta(void) { diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 203662d3ec..aef4f0a16a 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -1010,7 +1010,7 @@ MultiHeadAttentionParams MultiHeadAttention::get_params() const { params.bias = this->bias; params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/attention.cpp b/src/ops/attention.cpp index ee7f87a7fb..10655a4a1a 100644 --- a/src/ops/attention.cpp +++ b/src/ops/attention.cpp @@ -156,7 +156,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/attention.cu b/src/ops/attention.cu index 18fc810aed..4c460cdbbf 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -194,7 +194,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index e13169f6c1..e5f0611fb0 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -279,7 +279,7 @@ OpMeta *BatchMatmul::init_task(Task const *task, Runtime *runtime) { BatchMatmul const *bmm = (BatchMatmul *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - BatchMatmulMeta *m = new BatchMatmulMeta(handle); + BatchMatmulMeta *m = new BatchMatmulMeta(handle, bmm); m->profiling = bmm->profiling; m->inference_debugging = bmm->inference_debugging; m->a_seq_length_dim = bmm->a_seq_length_dim; @@ -616,7 +616,7 @@ bool BatchMatmul::measure_operator_cost(Simulator *sim, batch *= sub_input0.dims[i].size; } - BatchMatmulMeta *meta = sim->batch_matmul_meta; + BatchMatmulMeta *meta = new BatchMatmulMeta(sim->handler, this); // allocate tensors in simulator sim->free_all(); diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 7dee6fdaaf..5856f1dddf 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -284,7 +284,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index 929ebf81f8..01e993067a 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -270,7 
+270,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 5f4547ace5..36cc7fd8fa 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -375,7 +375,7 @@ BeamInferenceResult // embedding size: eg. 4096 int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; // total token nums - size_t batch_size = bc.num_active_tokens(); + size_t batch_size = bc.num_active_infr_tokens(); // need meta for: how many sub requests in a main request BeamTopK::forward_kernel_wrapper(m, @@ -390,9 +390,11 @@ BeamInferenceResult BeamInferenceResult ir; - download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - download_tensor( + copy_tensor_dev_to_host( + index_ptr, ir.token_ids, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( + value_ptr, ir.probs, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); if (m->inference_debugging) { diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 8545bea7cb..5d80707ea7 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -681,7 +681,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index c24bdf7c74..bf4c23cad0 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -723,7 +723,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/cache.cc b/src/ops/cache.cc index 691e45b559..33b862ae85 100644 --- a/src/ops/cache.cc +++ b/src/ops/cache.cc @@ -165,7 +165,7 @@ OpMeta *Cache::init_task(Task const *task, Runtime *runtime) { Cache *c = (Cache *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - CacheMeta *m = new CacheMeta(handle); + CacheMeta *m = new CacheMeta(handle, c); m->cache_score = 0.0f; m->profiling = c->profiling; m->inference_debugging = c->inference_debugging; diff --git a/src/ops/cache.cpp b/src/ops/cache.cpp index 95c5995f9e..a9512c2c59 100644 --- a/src/ops/cache.cpp +++ b/src/ops/cache.cpp @@ -75,7 +75,7 @@ float Cache::cache_update(Task const *task, return cache_score; } -CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cache.cu b/src/ops/cache.cu index a113e57a1c..2f95e59669 100644 --- a/src/ops/cache.cu +++ b/src/ops/cache.cu @@ -74,7 +74,7 @@ float Cache::cache_update(Task const *task, return cache_score; } 
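Pattern note on the Meta-constructor churn in the hunks above and below: instead of threading individual scalars (n, num_heads, ...) into each per-operator Meta, the constructors now take the operator itself and forward it to OpMeta so the base class also sees the op. A minimal sketch of the shape of this change, using hypothetical Foo/FooMeta types rather than anything from this patch:

#include <cstdio>

struct FFHandler {};
struct Op { /* stand-in for the real operator base */ };
struct Foo : Op { int n = 4; }; // hypothetical operator with one field

struct OpMeta {
  explicit OpMeta(FFHandler) {}    // old form: handler only
  OpMeta(FFHandler, Op const *) {} // new form: handler plus owning op
};

// Before: every field the meta needs is passed by hand.
struct FooMetaOld : OpMeta {
  FooMetaOld(FFHandler h, int n) : OpMeta(h), n(n) {}
  int n;
};

// After: hand over the operator once and read fields off it.
struct FooMetaNew : OpMeta {
  FooMetaNew(FFHandler h, Foo const *op) : OpMeta(h, op), n(op->n) {}
  int n;
};

int main() {
  FFHandler h;
  Foo f;
  FooMetaNew m(h, &f);
  std::printf("n = %d\n", m.n);
  return 0;
}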
-CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cast.cc b/src/ops/cast.cc index e514236a31..4a52bf874e 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -190,7 +190,7 @@ OpMeta *Cast::init_task(Task const *task, Runtime *runtime) { Cast *cast = (Cast *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - CastMeta *m = new CastMeta(handler); + CastMeta *m = new CastMeta(handler, cast); m->input_data_type = cast->inputs[0]->data_type; m->output_data_type = cast->outputs[0]->data_type; std::strcpy(m->op_name, cast->name); diff --git a/src/ops/concat.cc b/src/ops/concat.cc index d4d8e525fc..0a82779b6d 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -197,7 +197,7 @@ OpMeta *Concat::init_task(Task const *task, Runtime *runtime) { Concat *cc = (Concat *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - ConcatMeta *m = new ConcatMeta(handler); + ConcatMeta *m = new ConcatMeta(handler, cc); // Note that our internal axis index ordering is opposite to other frameworks init_meta(m, cc->legion_axis); m->profiling = cc->profiling; @@ -365,7 +365,7 @@ bool Concat::measure_operator_cost(Simulator *sim, } } - ConcatMeta *m = sim->concat_meta; + ConcatMeta *m = new ConcatMeta(sim->handler, this); init_meta(m, this->legion_axis); sim->free_all(); diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 94850a178d..2428c9b99a 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -588,12 +588,13 @@ OpMeta *Conv2D::init_task(Task const *task, // regions[4], task->regions[4], FID_DATA, ctx, runtime, // false/*readOutput*/); - Conv2DMeta *m = new Conv2DMeta(handle); + Conv2DMeta *m = new Conv2DMeta(handle, conv); m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; m->inference_debugging = conv->inference_debugging; - m->trainableInputs[0] = conv->trainableInputs[0]; + m->trainable_inputs[0] = conv->trainable_inputs[0]; + m->reset_input_grads[0] = conv->trainable_inputs[0]; std::strcpy(m->op_name, conv->name); m->layer_guid = conv->layer_guid; @@ -753,7 +754,7 @@ void Conv2D::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[1](I/O): input_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -803,7 +804,7 @@ void Conv2D::backward(FFModel const &ff) { /* region(I): input - region(I/O): input_grad (if trainableInputs[0]) + region(I/O): input_grad (if trainable_inputs[0]) region(I): output region(I/O): output_grad region(I): filter @@ -816,17 +817,17 @@ void Conv2D::backward_task(Task const *task, Runtime *runtime) { // Conv2D* conv = (Conv2D*) task->args; Conv2DMeta const *m = *((Conv2DMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; float *acc_input_grad_ptr = NULL; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { TensorAccessorW acc_input_grad( regions[rid], 
task->regions[rid], @@ -1119,7 +1120,7 @@ bool Conv2D::measure_operator_cost(Simulator *sim, int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Conv2DMeta *m = sim->conv2d_meta; + Conv2DMeta *m = new Conv2DMeta(sim->handler, this); m->relu = activation == AC_MODE_RELU; // require input_c is divisible by groups diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 4352f459b9..cf8696182b 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -429,7 +429,7 @@ OpMeta *ElementBinary::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); ElementBinaryMeta *m = new ElementBinaryMeta(handle, eb); for (int i = 0; i < eb->numInputs; i++) { - m->trainableInputs[i] = eb->trainableInputs[i]; + m->trainable_inputs[i] = eb->trainable_inputs[i]; } m->op_type = eb->op_type; m->profiling = eb->profiling; @@ -892,7 +892,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[2](I/O): input0_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -910,7 +910,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[1]->region)); launcher.add_field(rid++, FID_DATA); // regions[4](I/O): input1_grad - if (trainableInputs[1]) { + if (trainable_inputs[1]) { launcher.add_region_requirement( RegionRequirement(inputs[1]->part_grad, 0 /*projection id*/, @@ -980,7 +980,7 @@ void ElementBinary::backward_task(Task const *task, in0_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain in0_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); assert(in0_domain == in0_grad_domain); @@ -998,7 +998,7 @@ void ElementBinary::backward_task(Task const *task, in1_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[1]) { + if (m->trainable_inputs[1]) { Domain in1_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); // assert(out_grad_domain == in1_domain); diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 0e1d115557..09cf13c717 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -354,7 +354,7 @@ OpMeta *ElementUnary::init_task(Task const *task, Runtime *runtime) { ElementUnary *eu = (ElementUnary *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ElementUnaryMeta *m = new ElementUnaryMeta(handle); + ElementUnaryMeta *m = new ElementUnaryMeta(handle, eu); m->op_type = eu->op_type; m->data_type = eu->outputs[0]->data_type; // Input and output should have the same data type @@ -737,7 +737,7 @@ bool ElementUnary::measure_operator_cost(Simulator *sim, if (!inputs[0]->get_sub_tensor(mv, sub_input)) { return false; } - ElementUnaryMeta *m = sim->ele_unary_meta; + ElementUnaryMeta *m = new ElementUnaryMeta(sim->handler, this); m->op_type = op_type; if (use_cudnn(m->op_type)) { Domain input_domain, output_domain; diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..435abdfe11 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -282,7 +282,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } 
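Note on the trainableInputs -> trainable_inputs renames above: the backward launcher and the backward task must agree on which gradient regions exist, because input_grad is only added when trainable_inputs[0] is set and bias_grad only when use_bias is set, hence the `5 + trainable + use_bias` region-count asserts. A small standalone sketch of that bookkeeping; the region names and helper are illustrative, not the real task arguments:

#include <cassert>
#include <string>
#include <vector>

// Build the backward region list the way the launcher does conceptually:
// five regions are always present, input_grad only if the input is
// trainable, bias_grad only if the operator uses a bias.
std::vector<std::string> backward_regions(bool trainable_input, bool use_bias) {
  std::vector<std::string> r = {
      "input", "output", "output_grad", "filter", "filter_grad"};
  if (trainable_input) {
    r.insert(r.begin() + 1, "input_grad"); // right after "input"
  }
  if (use_bias) {
    r.push_back("bias_grad");
  }
  return r;
}

int main() {
  for (bool t : {false, true}) {
    for (bool b : {false, true}) {
      auto r = backward_regions(t, b);
      // Mirrors the assert in the diff: regions.size() == 5 + trainable + bias.
      assert(r.size() == 5u + (t ? 1 : 0) + (b ? 1 : 0));
    }
  }
  return 0;
}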
-ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..15e6852388 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -291,7 +291,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } -ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index e630563b63..95b538bdb6 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -469,7 +469,7 @@ FutureMap Embedding::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(EMBED_FWD_TASK_ID, + IndexLauncher launcher(EMBED_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -559,12 +559,6 @@ void Embedding::forward_task(Task const *task, } forward_kernel_wrapper( m, input, output, kernel, in_dim, out_dim, effective_batch_size); - if (m->inference_debugging) { - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - Embedding::save_inference_tensors_to_file( - m, shard_id, nullptr, {input}, {kernel}, {output}); - } } /* @@ -672,6 +666,16 @@ void Embedding::backward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +Legion::FutureMap + Embedding::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // nothing to do (backward function only updates weights) + return FutureMap(); +} + void Embedding::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 8c66f9c7bc..3acc68ed9b 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -589,18 +589,7 @@ OpMeta *Experts::init_task(Task const *task, Runtime *runtime) { Experts const *exp = (Experts *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta(handle, - exp->num_experts, - exp->experts_start_idx, - exp->data_dim, - exp->out_dim, - exp->experts_num_layers, - exp->experts_internal_dim_size, - exp->effective_batch_size, - exp->num_chosen_experts, - exp->alpha, - exp->use_bias, - exp->activation); + ExpertsMeta *m = new ExpertsMeta(handle, exp); m->profiling = exp->profiling; m->inference_debugging = exp->inference_debugging; std::strcpy(m->op_name, exp->name); @@ -682,7 +671,7 @@ FutureMap Experts::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ - // int num_active_tokens = bc->num_active_tokens(); + // int num_active_infr_tokens = bc->num_active_infr_tokens(); IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -1075,7 +1064,7 @@ void Experts::inference_task(Task const *task, 
output_ptr, weights_ptr, bias_ptr, - bc->num_active_tokens(), + bc->num_active_infr_tokens(), chosen_experts, batch_size, out_dim); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index c06f02a647..502be878a9 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -27,7 +27,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -35,25 +35,15 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, handle_unimplemented_hip_kernel(OP_EXPERTS); } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) {} +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) {} + ExpertsMeta::~ExpertsMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu index ce15cdff55..f6f555d1ad 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -515,7 +515,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -529,8 +529,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } - assert(num_active_tokens > 0); - assert(num_active_tokens <= m->effective_batch_size); + assert(num_active_infr_tokens > 0); + assert(num_active_infr_tokens <= m->effective_batch_size); assert(m->effective_batch_size == batch_size); int num_experts_per_block = m->num_experts; @@ -540,7 +540,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int data_dim = m->data_dim; int num_chosen_experts = m->num_chosen_experts; // int num_tokens = m->effective_batch_size; - int num_tokens = num_active_tokens; + int num_tokens = num_active_infr_tokens; int expert_capacity = m->expert_capacity; assert(chosen_experts == num_chosen_experts); @@ -579,14 +579,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, #ifdef INFERENCE_TESTS // Checking // 1. 
check that m->sorted_indices contains indices sorted - int *indices_cpu = download_tensor(indices, num_indices); + int *indices_cpu = copy_tensor_dev_to_host(indices, num_indices); // assert(indices_cpu != nullptr); std::vector indices_vec(indices_cpu, indices_cpu + num_indices); std::vector indices_vec_sorted(indices_vec.size()); std::copy(indices_vec.begin(), indices_vec.end(), indices_vec_sorted.begin()); std::stable_sort(indices_vec_sorted.begin(), indices_vec_sorted.end()); - int *thrust_sorted_indices_cpu = download_tensor( + int *thrust_sorted_indices_cpu = copy_tensor_dev_to_host( m->sorted_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_sorted_indices_cpu != nullptr); std::vector thrust_sorted_indices_vec( @@ -613,7 +613,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(indices_vec_sorted[i] == thrust_sorted_indices_vec[i]); } // 2. check that indices[m->original_indices[i]] = i - int *thrust_original_indices_cpu = download_tensor( + int *thrust_original_indices_cpu = copy_tensor_dev_to_host( m->original_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_original_indices_cpu != nullptr); std::vector thrust_original_indices_vec( @@ -668,8 +668,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } assert(non_zero_experts_count == non_zero_experts_check.size()); // 7. check exp_local_label_to_index - int *non_zero_expert_labels_cpu = - download_tensor(m->non_zero_expert_labels, non_zero_experts_count); + int *non_zero_expert_labels_cpu = copy_tensor_dev_to_host( + m->non_zero_expert_labels, non_zero_experts_count); // assert(non_zero_expert_labels_cpu != nullptr); std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, non_zero_expert_labels_cpu + @@ -684,8 +684,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, non_zero_experts_check_vec.end())); assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); - int *exp_local_label_to_index = - download_tensor(m->exp_local_label_to_index, non_zero_experts_count); + int *exp_local_label_to_index = copy_tensor_dev_to_host( + m->exp_local_label_to_index, non_zero_experts_count); // assert(exp_local_label_to_index != nullptr); std::vector exp_local_label_to_index_vec(exp_local_label_to_index, exp_local_label_to_index + @@ -699,8 +699,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } // 8. 
Check expert_start_indexes - int *expert_start_indices_thrust = - download_tensor(m->expert_start_indexes, non_zero_experts_count + 1); + int *expert_start_indices_thrust = copy_tensor_dev_to_host( + m->expert_start_indexes, non_zero_experts_count + 1); // assert(expert_start_indices_thrust != nullptr); std::vector expert_start_indices_thrust_vec( expert_start_indices_thrust, @@ -746,9 +746,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *num_assignments_per_expert_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(num_assignments_per_expert_thrust != nullptr); - assert(download_tensor(m->num_assignments_per_expert, - num_assignments_per_expert_thrust, - non_zero_experts_count)); + assert(copy_tensor_dev_to_host(m->num_assignments_per_expert, + num_assignments_per_expert_thrust, + non_zero_experts_count)); assert(num_assignments_per_expert_thrust != nullptr); std::vector num_assignments_per_expert_thrust_vec( num_assignments_per_expert_thrust, @@ -759,9 +759,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *destination_start_indices_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(destination_start_indices_thrust != nullptr); - assert(download_tensor(m->destination_start_indices, - destination_start_indices_thrust, - non_zero_experts_count)); + assert(copy_tensor_dev_to_host(m->destination_start_indices, + destination_start_indices_thrust, + non_zero_experts_count)); assert(destination_start_indices_thrust != nullptr); std::vector destination_start_indices_thrust_vec( destination_start_indices_thrust, @@ -1233,25 +1233,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) { +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) { expert_capacity = ceil(alpha * num_chosen_experts / num_experts * effective_batch_size); diff --git a/src/ops/flat.cc b/src/ops/flat.cc index 80aedbbb31..e9f637294a 100644 --- a/src/ops/flat.cc +++ b/src/ops/flat.cc @@ -187,7 +187,8 @@ OpMeta *Flat::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - FlatMeta *m = new FlatMeta(handler); + Flat *flat = (Flat *)task->args; + FlatMeta *m = new FlatMeta(handler, flat); return m; } diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 9ad5c4dc9c..121139beb1 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/fused.h" +#include 
"flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -87,12 +88,32 @@ FusedOp::FusedOp(FFModel &model, Op *op) // weights[i]->owner_idx = i; weight_data_types[i] = op->weights[i]->data_type; } - numOutputs = op->numOutputs; - for (int i = 0; i < numOutputs; i++) { - outputs[i] = op->outputs[i]; - outputs[i]->owner_op = this; - outputs[i]->owner_idx = i; - output_data_types[i] = op->outputs[i]->data_type; + numOutputs = 0; + for (int i = 0; i < op->numOutputs; i++) { + bool found = false; + // Handle in-place outputs + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This output is one of the inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[i] = SOURCE_INPUT; + op_input_idx[i] = j; + found = true; + break; + } + } + if (found) { + // do nothing + } else { + outputs[numOutputs] = op->outputs[i]; + output_data_types[numOutputs] = op->outputs[i]->data_type; + op_output_source[i] = SOURCE_OUTPUT; + op_output_idx[i] = numOutputs; + outputs[numOutputs]->owner_op = this; + outputs[numOutputs]->owner_idx = numOutputs; + numOutputs++; + } } numOperators = 1; op_num_inputs[0] = op->numInputs; @@ -109,10 +130,53 @@ FusedOp::FusedOp(FFModel &model, Op *op) op_weight_source[i] = SOURCE_WEIGHT; op_weight_idx[i] = i; } - for (int i = 0; i < numOutputs; i++) { - op_output_source[i] = SOURCE_OUTPUT; - op_output_idx[i] = i; - } + // for (int i = 0; i < numOutputs; i++) { + // op_output_source[i] = SOURCE_OUTPUT; + // op_output_idx[i] = i; + // } +#if 0 + int input_offset = 0, weight_offset = 0, output_offset = 0; + printf("\nNew fused op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif } bool FusedOp::use_same_regions( @@ -165,7 +229,8 @@ bool FusedOp::add_operator( // op->name, op_config)); // Cannot fuse parallel operators (except allreduce) since they have different // paralel_is in forward and backward - assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE); + assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE || + op->op_type == OP_PARALLEL_IDENTITY); // Currently don't consider nested fusion assert(op->op_type != OP_FUSED); MachineView my_view = outputs[0]->machine_view; @@ -271,6 +336,18 @@ bool FusedOp::add_operator( found = true; op_output_source[output_offset + i] = SOURCE_OUTPUT; op_output_idx[output_offset + i] = j; + 
break; + } + } + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This input is one of my inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[output_offset + i] = SOURCE_INPUT; + op_output_idx[output_offset + i] = j; + found = true; + break; } } if (found) { @@ -311,6 +388,50 @@ bool FusedOp::add_operator( "Reach to the #outputs limit during fusion.\n" "Consider increase MAX_NUM_OUTPUTS to allow more fusions.\n"); } + +#if 0 + printf("\nAdd op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif + return true; } @@ -404,9 +525,13 @@ void FusedOp::init_inference(FFModel const &ff, } for (int i = 0; i < op_num_outputs[op]; i++) { int my_off = op_output_idx[i + ooff]; - assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - assert(my_off < batch_outputs.size()); - my_batch_outputs.push_back(batch_outputs[my_off]); + if (op_output_source[i + ooff] == SOURCE_OUTPUT) { + my_batch_outputs.push_back(batch_outputs[my_off]); + } else if (op_output_source[i + ooff] == SOURCE_INPUT) { + my_batch_outputs.push_back(batch_inputs[my_off]); + } else { + assert(false); + } } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; @@ -526,10 +651,6 @@ FutureMap FusedOp::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig - // so we transfer the maximum of them - // size_t batch_config_size = - // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -571,6 +692,83 @@ FutureMap FusedOp::inference(FFModel const &ff, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } + offset += numOutputs; + // add softmax output grad + if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { + // printf("operator %i is last SOFTMAX! 
adding grad for output %i\n", + // numOperators - 1, + // numOutputs - 1); + assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[numOutputs - 1]->region_grad)); + launcher.add_field(offset, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +FutureMap FusedOp::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // Set iter_config + iter_config = ff.iter_config; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig + // so we transfer the maximum of them + // size_t batch_config_size = + // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + IndexLauncher launcher(FUSEDOP_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int offset = 0; + for (int i = 0; i < numInputs; i++) { + assert(inputs[i]->part != LogicalPartition::NO_PART); + assert(inputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[i]->region_grad)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numInputs; + for (int i = 0; i < numWeights; i++) { + assert(weights[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numWeights; + for (int i = 0; i < numOutputs; i++) { + assert(outputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part_grad, + 0 /*projection id*/, + i == numOutputs - 1 ? 
READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region_grad)); + launcher.add_field(offset + i, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d9..9f826cd611 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -15,6 +15,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -30,6 +31,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -42,6 +44,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -78,17 +81,27 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; @@ -124,6 +137,7 @@ __host__ void FusedOp::forward_task(Task const *task, ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -138,11 +152,6 @@ __host__ void FusedOp::forward_task(Task const *task, } } - hipStream_t stream; - if (start < fused->numOperators) { - checkCUDA(get_legion_stream(&stream)); - } - int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; @@ -163,8 +172,9 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -179,21 
+189,6 @@ __host__ void FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -209,16 +204,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -229,25 +214,48 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { @@ 
-375,87 +383,127 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; } + 
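// Weight-layout convention assumed by the checks above (a reading of this
// hunk, not something introduced here): for the attention operators,
// my_weight_accessor[0] is the fused projection weight, and
// my_weight_accessor[1] is present only when *m->qkv_bias or *m->final_bias
// is set, so op_num_weights[op] is expected to be 1 + (qkv_bias || final_bias).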
IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -477,23 +525,127 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + bc, + 
my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && "Operator ResidualRMSNorm does not support " - "the forward() task"); + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); break; } case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::inference_kernel_wrapper( + m, + bc, + (op == fused->numOperators - 1), + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { @@ -503,6 +655,33 @@ __host__ void FusedOp::forward_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < 
fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -517,18 +696,525 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { return; } + + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + 
assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // TODO: implement this + assert(false); + // ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + // int num_inputs = fused->op_num_inputs[op]; + // Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * 
batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + 
assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + bc, + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], + my_weight_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_grad_accessor[0], + my_weight_accessor[0], + my_output_grad_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + // TODO: implement me + assert(false); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], 
gamma); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorW residual2; + if (m->use_two_residuals) { + residual2 = my_input_grad_accessor[2]; + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + } + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + } + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector 
input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } + } +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == @@ -582,11 +1268,6 @@ __host__ void } } - hipStream_t stream; - if (start < fused->numOperators) { - checkCUDA(get_legion_stream(&stream)); - } - int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; @@ -595,8 +1276,10 @@ __host__ void for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { assert(false); @@ -604,11 +1287,14 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[i + ooff]; + assert(my_off < fused->numOutputs); + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -623,6 +1309,21 @@ __host__ void m->legion_axis); break; } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -638,6 +1339,16 @@ __host__ void my_weight_accessor[1].get_float_ptr()); break; } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta 
*)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -648,27 +1359,25 @@ __host__ void assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; + float const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + bias_ptr = my_weight_accessor[1].get_float_ptr(); } } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); break; } case OP_BATCHMATMUL: { @@ -796,124 +1505,78 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { + case OP_ELU: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_RMS_NORM: { + case OP_POOL2D: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + case OP_FLAT: { + 
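// Flat is a pure reshape-to-1D copy: the case only checks that the input and
// output volumes match and then hands the element count to the kernel wrapper.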
assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } case OP_LAYERNORM: { @@ -935,119 +1598,23 @@ __host__ void break; } case OP_RESIDUAL_LAYERNORM: { - assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias - } - } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias - } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } - } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); break; } case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + assert(false && "Operator SigmoidSiluMulti does not support " + "the 
forward() task"); break; } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); break; } default: { @@ -1176,9 +1743,6 @@ __host__ void FusedOp::backward_task(Task const *task, } } - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int ioff = 0, woff = 0, ooff = 0; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; @@ -1202,6 +1766,7 @@ __host__ void FusedOp::backward_task(Task const *task, if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { my_input_accessor[i] = input_accessor[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; + assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { my_input_accessor[i] = output_accessor[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; @@ -1220,9 +1785,9 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; + int my_off = fused->op_output_idx[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 483028599e..cab28181da 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -14,6 +14,7 @@ */ #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -30,6 +31,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -42,6 +44,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -77,27 +80,32 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + 
assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); - // Domain input_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS]; + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], @@ -109,8 +117,6 @@ __host__ void FusedOp::forward_task(Task const *task, int roff = fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -122,8 +128,6 @@ __host__ void FusedOp::forward_task(Task const *task, roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = helperGetGenericTensorAccessorWO(fused->output_data_types[i], regions[i + roff], @@ -132,6 +136,7 @@ __host__ void FusedOp::forward_task(Task const *task, ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -148,36 +153,39 @@ __host__ void FusedOp::forward_task(Task const *task, int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; my_input_accessor[i] = input_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = input_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; my_input_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < 
fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - // my_od[i] = output_domain[my_off]; - // my_op[i] = output_ptr[my_off]; my_output_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_output_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -192,21 +200,6 @@ __host__ void FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -222,16 +215,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -242,25 +225,48 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == 
my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { @@ -388,88 +394,127 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - 
Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -491,39 +536,694 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && 
"Operator ResidualRMSNorm does not support " - "the forward() task"); - break; - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); - } - } - ioff += fused->op_num_inputs[op]; + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::inference_kernel_wrapper( + m, + bc, + (op == fused->numOperators - 1), + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + 
assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } + ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { + return; + } + + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] 
= + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = input_grad_accessor[%i]\n", i, my_off); +#endif + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_output_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // TODO: implement this + assert(false); + // ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + // int num_inputs = fused->op_num_inputs[op]; + // 
Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + 
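// m, n and k are recovered from the Legion domains (dimension 0 is the innermost
// axis); the asserts that follow only cross-check that A, B and the output agree on
// the contraction and batch sizes, since the batched-matmul PEFT backward kernel is
// still left as a TODO in this patch.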
assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + bc, + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], + my_weight_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + 
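// The expected weight count depends on whether the attention layer carries QKV
// and/or final-projection biases (m->qkv_bias / m->final_bias); when either flag is
// set, the second weight accessor is bound and forwarded to the PEFT backward kernel
// together with the BatchConfig and the shard index.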
assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_grad_accessor[0], + my_weight_accessor[0], + my_output_grad_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + // TODO: implement me + assert(false); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], gamma); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorW residual2; + if (m->use_two_residuals) { + residual2 = my_input_grad_accessor[2]; + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + } + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + } + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + SoftmaxMeta *m = 
(SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } + } } /* @@ -531,35 +1231,22 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; - // BatchConfig const *bc = (BatchConfig *)task->args; - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - // Return if no active tokens - if (bc->num_tokens == 0) { - return; - } - assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == fused->numInputs + fused->numWeights + fused->numOutputs); - // Domain input_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS]; GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], 
@@ -571,8 +1258,6 @@ __host__ void int roff = fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -584,8 +1269,6 @@ __host__ void roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = helperGetGenericTensorAccessorWO(fused->output_data_types[i], regions[i + roff], @@ -610,20 +1293,15 @@ __host__ void int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { @@ -632,8 +1310,6 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } @@ -641,8 +1317,6 @@ __host__ void int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); assert(my_off < fused->numOutputs); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { @@ -658,6 +1332,21 @@ __host__ void m->legion_axis); break; } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -673,6 +1362,16 @@ __host__ void my_weight_accessor[1].get_float_ptr()); break; } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] 
== 1); assert(fused->op_num_outputs[op] == 1); @@ -683,27 +1382,25 @@ __host__ void assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; + float const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + bias_ptr = my_weight_accessor[1].get_float_ptr(); } } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); break; } case OP_BATCHMATMUL: { @@ -831,126 +1528,78 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } - case OP_RMS_NORM: { + case OP_FLAT: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - 
my_output_accessor[0], - my_output_accessor[1]); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // TreeVerifyBatchConfig const *tree_bc = - // (TreeVerifyBatchConfig *)task->args; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } case OP_LAYERNORM: { @@ -972,119 +1621,23 @@ __host__ void break; } case OP_RESIDUAL_LAYERNORM: { - assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias - } - } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias - } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } - } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); break; } case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + assert(false && "Operator SigmoidSiluMulti does not support " + "the 
forward() task"); break; } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); break; } default: { @@ -1094,37 +1647,6 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { - std::vector input_accessors_to_save; - std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); - } - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save); - } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -1156,9 +1678,6 @@ __host__ void FusedOp::backward_task(Task const *task, int sum = fused->numInputs + fused->numWeights + fused->numOutputs; assert(sum * 2 == (int)regions.size()); } - // Domain input_domain[MAX_NUM_INPUTS], input_grad_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS], weight_grad_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS], output_grad_domain[MAX_NUM_OUTPUTS]; GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; @@ -1168,8 +1687,6 @@ __host__ void FusedOp::backward_task(Task const *task, int roff = 0; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], @@ -1181,8 +1698,6 @@ __host__ void FusedOp::backward_task(Task const *task, roff += fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -1194,8 +1709,6 @@ __host__ void FusedOp::backward_task(Task const *task, roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = 
helperGetGenericTensorAccessorRO(fused->output_data_types[i], regions[i + roff], @@ -1206,8 +1719,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numOutputs; for (int i = 0; i < fused->numInputs; i++) { - // input_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); input_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i + roff], @@ -1219,8 +1730,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numInputs; for (int i = 0; i < fused->numWeights; i++) { - // weight_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->weight_data_types[i], regions[i + roff], @@ -1233,8 +1742,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numWeights; for (int i = 0; i < fused->numOutputs; i++) { - // output_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], @@ -1260,9 +1767,6 @@ __host__ void FusedOp::backward_task(Task const *task, } int ioff = 0, woff = 0, ooff = 0; - // Domain my_id[MAX_NUM_INPUTS], my_grad_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS], my_grad_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS], my_grad_od[MAX_NUM_OUTPUTS]; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; @@ -1283,19 +1787,11 @@ __host__ void FusedOp::backward_task(Task const *task, for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; - // my_ip[i] = input_ptr[my_off]; my_input_accessor[i] = input_accessor[my_off]; - // my_grad_id[i] = input_grad_domain[my_off]; - // my_grad_ip[i] = input_grad_ptr[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; - // my_ip[i] = output_ptr[my_off]; my_input_accessor[i] = output_accessor[my_off]; - // my_grad_id[i] = output_grad_domain[my_off]; - // my_grad_ip[i] = output_grad_ptr[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else { @@ -1304,11 +1800,7 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - // my_grad_wd[i] = weight_grad_domain[fused->op_weight_idx[i + woff]]; - // my_grad_wp[i] = weight_grad_ptr[fused->op_weight_idx[i + woff]]; my_weight_grad_accessor[i] = weight_grad_accessor[fused->op_weight_idx[i + woff]]; assert(my_weight_grad_accessor[i].domain.get_volume() == @@ -1317,11 +1809,7 @@ __host__ void FusedOp::backward_task(Task const *task, for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == 
SOURCE_OUTPUT); int my_off = fused->op_output_idx[i + ooff]; - // my_od[i] = output_domain[my_off]; - // my_op[i] = output_ptr[my_off]; my_output_accessor[i] = output_accessor[my_off]; - // my_grad_od[i] = output_grad_domain[my_off]; - // my_grad_op[i] = output_grad_ptr[my_off]; my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index f2f402737c..03b9a5199b 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -99,7 +99,7 @@ Group_byParams Group_by::get_params() const { Group_byParams params; params.n = this->n; params.alpha = this->alpha; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -271,7 +271,7 @@ OpMeta *Group_by::init_task(Task const *task, Runtime *runtime) { Group_by *gb = (Group_by *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - GroupByMeta *m = new GroupByMeta(handle, gb->n, gb->alpha); + GroupByMeta *m = new GroupByMeta(handle, gb); m->profiling = gb->profiling; m->inference_debugging = gb->inference_debugging; std::strcpy(m->op_name, gb->name); @@ -579,7 +579,7 @@ bool Group_by::measure_operator_cost(Simulator *sim, } } - GroupByMeta *m = new GroupByMeta(sim->handler, n, alpha); + GroupByMeta *m = new GroupByMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index 761c35f182..9ca6f77898 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -188,9 +188,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, data_dim); } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(hipMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index 0ed09e20b3..43bcb900df 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -198,9 +198,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, } } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(cudaMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index aa60d0f19c..8219cf9e1f 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -363,7 +363,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims, quantization_type == DT_NONE ? this->data_type : quantization_type, nullptr /*owner_op*/, - true /*create_grad*/, + model.config.computationMode == COMP_MODE_INFERENCE + ? 
false + : true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { @@ -871,6 +873,139 @@ void IncMultiHeadSelfAttention::inference_task( } } +FutureMap IncMultiHeadSelfAttention::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(idx++, FID_DATA); + if (qkv_bias || final_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttention::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + IncMultiHeadSelfAttentionMeta *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 4); + } + + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_grad_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 2); + assert(output_grad_domain.get_dim() == 4); + + assert(task->index_point.get_dim() == 1); + + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + input_grad, + weight, + output_grad, + biases); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + IncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } +} + void IncMultiHeadSelfAttention::backward(FFModel const &ff) { // IncMultiHeadSelfAttention does not support backward assert(false); @@ -926,7 +1061,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d60386f927..826fea4347 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -12,13 +12,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/hip_helper.h" -#include +#include "hip/hip_complex.h" #include namespace FlexFlow { @@ -27,9 +27,288 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + namespace Kernels { namespace IncMultiHeadAttention { +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +// gridDim = num_heads +// blockDim = num_tokens/num_request * head_size +// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| +// one thread process one head_size +template +__global__ void compute_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // eg. if head_size = 128, thread_per_key = 4, with float32 precision + // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 + // K_ELTS_PER_THREAD = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 / 1 = 32 + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); + // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + // DT const *q_ptr = + // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + + // q tensor in this thread + // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total + // K_VECS_PER_THREAD elements + // QK_vec_k: 32->1, 64->2, 128->4... head_size + // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + // first iter = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 + // K_PER_ITER how many keys in this loop + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + // get k, perform qk proj + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast(k_cache_batch + + ti_circ * hidden_size + + head_idx * per_head_size + jj); + } + // Compute dot product. + // This includes a reduction across the threads in the same thread group. 
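// --- Editorial sketch, not part of the patch ---
// The group-level reduction referred to above happens inside
// Qk_dot<DT, THREADS_PER_KEY>::dot(), whose body is not shown in this hunk.
// Assuming the usual shuffle-based implementation, and simplifying the vector
// types (K_vec) to plain DT slices, one key group of THREADS_PER_KEY lanes
// combines its partial products roughly as follows (group_qk_dot and its
// signature are illustrative names, not APIs from this patch):
template <typename DT, int THREADS_PER_KEY, int K_VECS_PER_THREAD>
__device__ __forceinline__ float
    group_qk_dot(DT const (&q)[K_VECS_PER_THREAD],
                 DT const (&k)[K_VECS_PER_THREAD]) {
  float qk = 0.f;
#pragma unroll
  for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
    qk += (float)q[ii] * (float)k[ii]; // per-lane partial product
  }
  // butterfly reduction over the lanes of this key group, using the
  // WARP_SHFL_XOR helper defined earlier in this file
#pragma unroll
  for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) {
    qk += WARP_SHFL_XOR(uint32_t(-1), qk, mask);
  }
  return qk; // every lane of the group now holds the full q . k value
}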
+ } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // // todo add positional embedding to the qk production + // // Store the product to shared memory. There's one qk value per + // timestep. + // // Update the max. + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); + } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. 
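// --- Editorial sketch, not part of the patch ---
// The loop below folds the V_PER_ITER partial outputs in half each pass:
// the upper half of the active groups spills its accumulators to shared
// memory, the lower half adds them in, until group vo == 0 holds the sum.
// The same halving pattern on the host (illustrative helper only; assumes
// num_groups is a power of two, as V_PER_ITER is here):
static inline float reduce_partials_by_halving(float *partials, int num_groups) {
  for (int active = num_groups; active >= 2; active /= 2) {
    int midpoint = active / 2;
    for (int g = midpoint; g < active; ++g) {
      partials[g - midpoint] += partials[g]; // lower half absorbs upper half
    }
  }
  return partials[0]; // corresponds to what the kernel emits for vo == 0
}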
+ if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } +} + // only used by MPT model. https://arxiv.org/abs/2108.12409 template __global__ void apply_position_bias_qkprd(DT *input_ptr, @@ -86,8 +365,10 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, // int qkv_index = i / (num_tokens * qProjSize) % 3; int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * 3; + size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; + int qkv_index = in_token_idx / hidden_size; + int proj_size = qkv_index == 0 ? qProjSize : kProjSize; int head_idx = @@ -109,6 +390,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, } } } + template __global__ void scaling_query_kernel(DT *input_ptr, int qProjSize, @@ -158,6 +440,10 @@ __global__ void int token_idx = (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + int pos_i = real_i % (proj_size / 2); float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); hipFloatComplex complex_pos = {cos(freq), sin(freq)}; @@ -189,7 +475,7 @@ __global__ void int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); int real_part_index = idx + head_idx * proj_size + - token_idx * hidden_size * 3 + + token_idx * hidden_size * QKV_WEIGHT_NUM + hidden_size * (q_tensor ? 0 : 1); int complex_part_index = real_part_index + (proj_size / 2); @@ -217,28 +503,59 @@ __global__ void } template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, int num_tokens, - int max_seq_len, int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int complex_part_index = (q_tensor ? 
0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = hipCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; } } @@ -254,56 +571,68 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_q_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_q_heads - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - int lda = k, ldb = k, ldc = m_; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - hipblas_data_type, - lda, - input_ptr, - hipblas_data_type, - ldb, - &beta, - output_ptr, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // apply rotary emmmbedding for q and k - // step1 change the k, v to complex tensor + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + // Step 1: Compute QKV projections + { + DT alpha = 1.0f, beta = 
0.0f; + // after transpositions + int m_q = m->qProjSize * m->num_q_heads; + int m_k = m->kProjSize * m->num_q_heads; + int m_v = m->vProjSize * m->num_q_heads; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_infr_tokens(); + int k = m->qSize; + int m_ = m_q * QKV_WEIGHT_NUM; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: QKV weights + // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] + // matrix B: input + // matrix B's layout: [qSize (hidden_dim), num_new_tokens] + // matrix C: devQKVProjArray + // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - // apply bias for q, k, v + + // Step 2: apply bias for QKV, or scale the query if (*m->qkv_bias) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -321,7 +650,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->scaling_factor, m->hidden_size); } else if (m->scaling_query) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -333,10 +662,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->scaling_factor, m->hidden_size); } + + // Step 3: apply rotary embedding if needed if (*m->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -352,14 +683,42 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; + } +} + template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = m->hidden_size * num_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -374,6 +733,129 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + hipStream_t stream) { + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + hipblasDatatype_t compute_type = HIPBLAS_R_16F; +#else + hipblasDatatype_t compute_type = cublas_data_type; +#endif + // Project to output, save result directly on output tensor + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = num_tokens; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: attn heads + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
<DT *>(m->attn_heads); + // matrix C: output + // matrix C's layout: [oProjSize, num_new_tokens] + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Add final output bias + if (*m->final_bias && shard_id == 0) { + int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + bias_ptr, + num_tokens, + qkv_weight_size, + m->oProjSize); + } +} + +#define LAUNCH_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray), \ static_cast<DT *>
(m->keyCache), \ static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos) + +template +void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + dim3 grid(m->num_q_heads, bc->num_generation_tokens); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, @@ -393,27 +875,29 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, if (m->quantization_type == DT_INT4) { int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - decompress_int4_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int4_attention_weights), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); } else { assert(m->quantization_type == DT_INT8); int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - decompress_int8_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int8_attention_weights), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); } } else { if (data_type == DT_FLOAT) { @@ -435,7 +919,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, } template -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -443,19 +927,13 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, hipStream_t stream) { - // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { checkCUDA(hipMemcpyAsync( m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
(m->bias_ptr); } - checkCUDA(hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -465,14 +943,520 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); - - // phase 2: Update key/val cache update_kv_cache_kernel<DT>
(m, bc, stream); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + if (bc->num_generation_tokens > 0) { + // phase 3: Compute attention score for generation tokens + compute_attention_kernel_generation
<DT>( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + } + + if (bc->num_tokens > bc->num_generation_tokens) { + // phase 4: Compute attention score for prompt tokens; + compute_attention_kernel_prompt( + m, bc, shard_id, bias_ptr, weight_ptr, stream); + } + + // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); + compute_o_prod_bias( + m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); +} + +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + hipStream_t stream) { + assert(!m->offload); + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: compute gradients before final projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = m_; + int ldb = k_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [oProjSize, num_new_tokens] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // 
matrix C: attn_heads gradients + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] + DT *C = static_cast<DT *>
(m->handle.workSpace); + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); + } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->handle.workSpace); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjSize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
<DT *>(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
<DT *>(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast<DT *>
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
<DT *>(m->devQKVProjArray); + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_bwd), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast<DT *>
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = n_; + int ldc = m_; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } + } + } } } // namespace IncMultiHeadAttention @@ -481,42 +1465,47 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, using namespace Kernels::IncMultiHeadAttention; template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; } } template -void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + DT const *bias_ptr, + DT const *weight_ptr, + hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = 
CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -530,64 +1519,102 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - // a flag of using this scaling alpha - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // matrix B: key cache + // matrix B's layout: [kProjSize * num_heads, total_tokens] + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
(m->qk_prods); + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests DT *C = static_cast
(m->qk_prods); - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd), GET_BLOCKS(parallelism), min((size_t)CUDA_NUM_THREADS, parallelism), 0, @@ -599,13 +1626,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m->global_num_q_heads, shard_id); } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. assert(num_new_tokens <= total_tokens); size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; if (entries_above_diagonal > 0) { size_t parallelism = m->num_q_heads * entries_above_diagonal; - hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), GET_BLOCKS(parallelism), min((size_t)CUDA_NUM_THREADS, parallelism), 0, @@ -617,137 +1645,129 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, static_cast
(-INFINITY)); } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; - k = total_tokens; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + - // padding) - B = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_q_heads * m->vProjSize; - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the HIPDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#hipdnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // MIOPEN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + hipMemcpyDeviceToDevice, + stream)); + } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
(m->valueCache) + i * vt_req_block_size; + // matrix B: qk_prods_softmax + // matrix B's layout: [num_new_tokens, total_tokens, num_heads] + // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous + // requests (all heads) + DT *B = static_cast
(m->qk_prods_softmax); + // matrix C: attn heads + // matrix C's layout: [vProjSize, num_heads, num_new_tokens] + // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous + // requests + // store the result attn heads, also skip the genration tokens + DT *C = static_cast
(m->attn_heads) + + (bc->requestsInfo[i].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } tokens_previous_requests += num_new_tokens; } - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); } - - assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -813,10 +1833,71 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); - printf("IncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); + printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); + } +} + +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(!m->offload); + float const *bias_ptr = + use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); } } @@ -895,7 +1976,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(kSize == vSize); qProjSize = _qProjSize; kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul + assert(qProjSize == kProjSize); // required for attention QK.T matmul vProjSize = _vProjSize; oProjSize = _oProjSize; size_t size_of_dt = data_type_size(attn->data_type); @@ -949,14 +2030,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { + case INC_DECODING_MODE: { key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); @@ -965,21 +2047,24 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case BEAM_SEARCH_MODE: + case TREE_VERIFY_MODE: { + // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); break; } default: assert(false && "Unkown inference mode"); } - size_t tokeninfo_size = max_tokens_per_batch; + size_t requestinfo_size = BatchConfig::max_requests_per_batch(); + // size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; @@ -990,7 +2075,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + complex_size * sizeof(hipFloatComplex); // more components will // be added here later if (offload) { @@ -1035,10 +2119,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + 
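// Sizing sketch for the key/value caches allocated just above; the numbers
// below are illustrative assumptions, not values taken from this change.
//   constexpr size_t num_q_heads = 32, kProjSize = 128;
//   constexpr size_t max_requests = 8, max_seq_len = 1024, spec_tree_tokens = 64;
//   constexpr size_t key_cache_elems =
//       num_q_heads * kProjSize * max_requests * (max_seq_len + spec_tree_tokens);
//   // = 35,651,584 elements, roughly 68 MiB at half precision; the value
//   // cache is sized the same way with vProjSize. Folding in
//   // max_spec_tree_token_num() keeps the cache linear in the speculative
//   // tree size instead of multiplying it by MAX_BEAM_WIDTH as before.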
request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); + if (offload) { - token_infos = - gpu_mem_allocator.allocate_reserved( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_reserved( + // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); @@ -1052,10 +2141,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(hipFloatComplex); + // request_infos = + // gpu_mem_allocator.allocate_reserved( + // requestinfo_size); } else { - token_infos = - gpu_mem_allocator.allocate_instance( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_instance( + // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -1064,6 +2156,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); + // request_infos = + // gpu_mem_allocator.allocate_instance( + // requestinfo_size); } // allocate more size for quantization data @@ -1077,6 +2172,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; checkCUDA(hipStreamSynchronize(stream)); } @@ -1098,4 +2195,37 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + hipStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + hipStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + hipStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + hipStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a0d31bb6ef..b278611b60 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -12,9 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" -#endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/decompress_kernels.h" @@ -483,6 +481,63 @@ __global__ void } } +template +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = (q_tensor ? 0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -497,17 +552,18 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // Step 1: 
Compute QKV projections { @@ -517,7 +573,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_k = m->kProjSize * m->num_q_heads; int m_v = m->vProjSize * m->num_q_heads; assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); + int n = bc->num_active_infr_tokens(); int k = m->qSize; int m_ = m_q * QKV_WEIGHT_NUM; // before transpositions @@ -604,7 +660,7 @@ template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = m->hidden_size * num_tokens; store_kv_cache<< -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -843,6 +899,504 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + assert(!m->offload); + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + cudaDataType_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: compute gradients before final 
projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = m_; + int ldb = k_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [oProjSize, num_new_tokens] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // matrix C: attn_heads gradients + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] + DT *C = static_cast
(m->handle.workSpace); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); + } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->handle.workSpace); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
<DT *>(m->devQKVProjArray); + apply_rotary_embedding_bwd<<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast<DT *>
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = n_; + int ldc = m_; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } + } + } +} + } // namespace IncMultiHeadAttention } // namespace Kernels @@ -877,24 +1431,25 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, } template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; } } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *bias_ptr, @@ -905,17 +1460,18 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -929,12 +1485,35 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize 
== m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + store_query_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); + } // Step 1: compute query-key product QK.T/sqrt(d_k) { // Scale by sqrt(d_k) as per the original attention paper @@ -1066,6 +1645,25 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, m->qk_tensor, C_softmax)); } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + cudaMemcpyDeviceToDevice, + stream)); + } // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { @@ -1090,7 +1688,6 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous // requests (all heads) DT *B = static_cast
(m->qk_prods_softmax); - ; // matrix C: attn heads // matrix C's layout: [vProjSize, num_heads, num_new_tokens] // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous @@ -1136,7 +1733,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -1206,6 +1803,70 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( } } +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(!m->offload); + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); + } +} + IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, @@ -1424,11 +2085,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); - token_infos = - static_cast(handler.batch_config_metadata); - request_infos = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo)); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); if (offload) { // token_infos = @@ -1478,6 +2138,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; cudaStreamSynchronize(stream); } diff --git a/src/ops/kernels/batch_matmul.cpp b/src/ops/kernels/batch_matmul.cpp index 7145af2108..8eeede65c7 100644 --- a/src/ops/kernels/batch_matmul.cpp +++ b/src/ops/kernels/batch_matmul.cpp @@ -13,13 +13,15 @@ * limitations under the License. 
*/ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/batch_matmul.cu b/src/ops/kernels/batch_matmul.cu index ac280db1a4..97f13fa5a8 100644 --- a/src/ops/kernels/batch_matmul.cu +++ b/src/ops/kernels/batch_matmul.cu @@ -13,12 +13,14 @@ * limitations under the License. */ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/cast_kernels.cpp b/src/ops/kernels/cast_kernels.cpp index 16b9b4cec0..1e561959f1 100644 --- a/src/ops/kernels/cast_kernels.cpp +++ b/src/ops/kernels/cast_kernels.cpp @@ -14,12 +14,13 @@ */ #include "flexflow/ops/kernels/cast_kernels.h" +#include "flexflow/ops/cast.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/cast_kernels.cu b/src/ops/kernels/cast_kernels.cu index a96f37dbbd..fdce63b9f1 100644 --- a/src/ops/kernels/cast_kernels.cu +++ b/src/ops/kernels/cast_kernels.cu @@ -13,12 +13,13 @@ * limitations under the License. */ +#include "flexflow/ops/cast.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/concat_kernels.cpp b/src/ops/kernels/concat_kernels.cpp index bf5d46b9cc..6c05e0143c 100644 --- a/src/ops/kernels/concat_kernels.cpp +++ b/src/ops/kernels/concat_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/concat_kernels.h" +#include "flexflow/ops/concat.h" #include "flexflow/utils/hip_helper.h" #include @@ -23,6 +24,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/concat_kernels.cu b/src/ops/kernels/concat_kernels.cu index f625560625..2569c36b21 100644 --- a/src/ops/kernels/concat_kernels.cu +++ b/src/ops/kernels/concat_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. 
*/ +#include "flexflow/ops/concat.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index 7d2fa20c49..85a94ad6be 100644 --- a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/conv_2d_kernels.h" +#include "flexflow/ops/conv_2d.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); @@ -326,7 +328,7 @@ void backward_kernel(Conv2DMeta const *m, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(miopenConvolutionBackwardWeights(m->handle.dnn, &alpha, @@ -341,7 +343,7 @@ void backward_kernel(Conv2DMeta const *m, kernel_grad_ptr, m->handle.workSpace, m->handle.workSpaceSize)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardBias(m->handle.dnn, @@ -352,7 +354,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardData(m->handle.dnn, diff --git a/src/ops/kernels/conv_2d_kernels.cu b/src/ops/kernels/conv_2d_kernels.cu index 6c0fd85496..661acdf732 100644 --- a/src/ops/kernels/conv_2d_kernels.cu +++ b/src/ops/kernels/conv_2d_kernels.cu @@ -1,9 +1,11 @@ +#include "flexflow/ops/conv_2d.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -309,7 +311,7 @@ void backward_kernel(Conv2DMeta const *m, reluBackward<<>>( output_grad_ptr, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(cudnnConvolutionBackwardFilter(m->handle.dnn, &alpha, @@ -324,7 +326,7 @@ void backward_kernel(Conv2DMeta const *m, &alpha, m->filterDesc, kernel_grad_ptr)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardBias(m->handle.dnn, @@ -335,7 +337,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardData(m->handle.dnn, diff --git 
a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index 14225f0bce..c8b1887fd4 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -28,7 +28,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index e142bba83b..d65b951f51 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -27,7 +27,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/flat_kernels.cpp b/src/ops/kernels/flat_kernels.cpp index be48854fc0..6815ce7492 100644 --- a/src/ops/kernels/flat_kernels.cpp +++ b/src/ops/kernels/flat_kernels.cpp @@ -14,11 +14,15 @@ */ #include "flexflow/ops/kernels/flat_kernels.h" +#include "flexflow/ops/flat.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/flat_kernels.cu b/src/ops/kernels/flat_kernels.cu index 3836c02c94..fc0c0270c1 100644 --- a/src/ops/kernels/flat_kernels.cu +++ b/src/ops/kernels/flat_kernels.cu @@ -13,11 +13,15 @@ * limitations under the License. 
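Note on the recurring change in the kernel files above and below: each operator's Meta constructor now receives the operator itself and forwards it to OpMeta(handler, op) instead of OpMeta(handler), so per-operator settings can be initialized once in the base class rather than copied by hand in every derived Meta. A minimal sketch of the shape of this refactor, with illustrative placeholder names only (not FlexFlow's actual definitions):

    class Op;            // operator node carrying per-op settings
    struct FFHandler {}; // opaque per-device handle

    struct OpMeta {
      // The base class reads what it needs from `op` once (e.g. profiling
      // flags, input/output data types), instead of each derived Meta
      // copying those fields by hand.
      OpMeta(FFHandler h, Op const *op) : handle(h) { /* copy op settings here */ }
      FFHandler handle;
    };

    struct FlatMeta : public OpMeta {
      // Derived Metas just forward the operator pointer, as the hunks here do.
      FlatMeta(FFHandler h, Op const *flat) : OpMeta(h, flat) {}
    };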
*/ +#include "flexflow/ops/flat.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 072eb5e96b..a36d6719c9 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -14,6 +14,8 @@ */ #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -24,24 +26,53 @@ LinearMeta::LinearMeta(FFHandler handler, Linear const *li, MemoryAllocator gpu_mem_allocator, int weightSize) - : OpMeta(handler, li) { + : OpMeta(handler, li), weight_ptr(nullptr) { + DataType data_type = li->data_type; + // allocate weight and bias in the reserve space for cpu offloading + if (li->offload) { + weight_ptr = gpu_mem_allocator.allocate_reserved_untyped( + weightSize * data_type_size(data_type)); + if (li->quantization_type != DT_NONE) { + quantized_weightSize = get_quantization_to_byte_size( + data_type, li->quantization_type, weightSize); + quantized_weight_ptr = + gpu_mem_allocator.allocate_reserved(quantized_weightSize); + } + } // Allocate an all-one's vector - float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size); - for (int i = 0; i < batch_size; i++) { - dram_one_ptr[i] = 1.0f; + gpu_mem_allocator.create_legion_instance( + reserveInst, data_type_size(data_type) * batch_size); + one_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * batch_size); + int parallelism = batch_size; + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (data_type == DT_FLOAT) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((float *)one_ptr, batch_size); + } else if (data_type == DT_HALF) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((half *)one_ptr, batch_size); } - float *fb_one_ptr; - checkCUDA(hipMalloc(&fb_one_ptr, sizeof(float) * batch_size)); - checkCUDA(hipMemcpy(fb_one_ptr, - dram_one_ptr, - sizeof(float) * batch_size, - hipMemcpyHostToDevice)); - one_ptr = (void *)fb_one_ptr; + // Allocate descriptors checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; +} + +LinearMeta::~LinearMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } } -LinearMeta::~LinearMeta(void) {} namespace Kernels { namespace Linear { @@ -96,7 +127,61 @@ void forward_kernel_wrapper(LinearMeta const *m, int batch_size) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + 
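The LinearMeta constructor change above replaces the host-built float ones vector with a device buffer filled by a small kernel in the layer's own data type (build_one_ptr, defined later in this hunk), presumably so the vector also works for half-precision layers. The vector exists because the bias gradient is the sum of the output gradient over the batch, which backward_kernel computes as a GEMM against one_ptr. A simplified, float-only sketch of the idea (illustrative, not the patch's kernel):

    // Fill a length-n vector with ones on the device. Multiplying
    // output_grad [out_dim x n] by this vector in a GEMM reduces over the
    // batch dimension, giving bias_grad[out_dim].
    __global__ void fill_ones(float *ones, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        ones[i] = 1.0f;
      }
    }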
checkCUDA(hipEventDestroy(t_end)); + printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -126,6 +211,67 @@ void forward_kernel_wrapper(LinearMeta const *m, stream); } + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -134,12 +280,60 @@ void forward_kernel_wrapper(LinearMeta const *m, checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); - // print_tensor(acc_input.ptr, acc_input.rect.volume(), - // "[Linear:forward:input]"); print_tensor(acc_kernel.ptr, - // acc_kernel.rect.volume(), "[Linear:forward:kernel]"); - // print_tensor(acc_bias.ptr, acc_bias.rect.volume(), - // "[Linear:forward:bias]"); print_tensor(acc_output.ptr, - // acc_output.rect.volume(), "[Linear:forward:output]"); + } +} + +void 
peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); } } @@ -223,8 +417,20 @@ Parameter* Linear::get_parameter(int index) } } */ - namespace Internal { + +template +__global__ void AddBiasWithReLU(DT *output_ptr, + DT const *bias_ptr, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, out_dim * batch_size) { + int bias_idx = i % out_dim; + DT value = output_ptr[i] + bias_ptr[bias_idx]; + output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f; + } +} + template void forward_kernel(LinearMeta const *m, void const *input_ptr, @@ -234,20 +440,57 @@ void forward_kernel(LinearMeta const *m, int in_dim, int out_dim, int batch_size, - hipStream_t stream) { + ffStream_t stream) { + // additional processing for uploading weights + if (m->offload) { + // Note that we update weight_ptr when uploading weight + if (m->quantization_type != DT_NONE) { + checkCUDA(hipMemcpyAsync(m->quantized_weight_ptr, + weight_ptr, + m->quantized_weightSize, + hipMemcpyHostToDevice, + stream)); + if (m->quantization_type == DT_INT4) { + int parallelism = in_dim * out_dim / 2; + decompress_int4_general_weights
+ <<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = in_dim * out_dim; + decompress_int8_general_weights
+ <<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>(m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } + + } else { + checkCUDA(hipMemcpyAsync(m->weight_ptr, + weight_ptr, + in_dim * out_dim * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + } + } checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); - hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t weight_type = m->offload + ? ff_to_cuda_datatype(m->weight_ptr_type) + : ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use the output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + assert(input_type == weight_type && weight_type == output_type); hipblasDatatype_t compute_type = output_type; -#endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -255,7 +498,7 @@ void forward_kernel(LinearMeta const *m, batch_size, in_dim, &alpha, - weight_ptr, + m->offload ? m->weight_ptr : weight_ptr, weight_type, in_dim, input_ptr, @@ -269,6 +512,16 @@ void forward_kernel(LinearMeta const *m, HIPBLAS_GEMM_DEFAULT)); // use_bias = True if (bias_ptr != NULL) { + // fuse bias and relu + if (m->activation == AC_MODE_RELU) { + int parallelism = out_dim * batch_size; + AddBiasWithReLU<<>>( + static_cast
<DT *>(output_ptr), + static_cast<DT const *>
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -306,7 +559,7 @@ void forward_kernel(LinearMeta const *m, GET_BLOCKS(elements), CUDA_NUM_THREADS, 0, - 0, + stream, elements, B, C, @@ -318,6 +571,74 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update input_grad_ptr and output_grad_ptr offset + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = + static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + output_grad_ptr = + static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; + hipblasDatatype_t compute_type = output_type; + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + if (input_grad_ptr != NULL) { + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, @@ -335,16 +656,11 @@ void backward_kernel(LinearMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f; + float sgeam_alpha = 1.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = output_type; -#endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( @@ -356,7 +672,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_N, @@ -377,7 +693,27 @@ void backward_kernel(LinearMeta const *m, in_dim, compute_type, HIPBLAS_GEMM_DEFAULT)); - // Compute bias gradiant + if (m->kernel_reg_type == REG_MODE_NONE) { + // do nothing + } else if (m->kernel_reg_type == REG_MODE_L2) { + checkCUDA(hipblasSgeam(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + out_dim, + &sgeam_alpha, + (float *)kernel_grad_ptr, + in_dim, + &(m->kernel_reg_lambda), + (float *)kernel_ptr, + in_dim, + (float *)kernel_grad_ptr, + in_dim)); + } else { + assert(false && "Only L2 regularization is supported"); + } + + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -388,7 +724,7 @@ void backward_kernel(LinearMeta const *m, out_dim, batch_size, &alpha, - m->one_ptr, + static_cast
(m->one_ptr), HIPBLAS_R_32F, 1, output_grad_ptr, @@ -401,7 +737,7 @@ void backward_kernel(LinearMeta const *m, compute_type, HIPBLAS_GEMM_DEFAULT)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, @@ -426,7 +762,14 @@ void backward_kernel(LinearMeta const *m, } } +template +__global__ void build_one_ptr(DT *one_ptr, int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size) { + one_ptr[i] = static_cast
(1.0f); + } +} + } // namespace Internal } // namespace Linear } // namespace Kernels -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index c30c9f71c1..d4f930db6c 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -63,6 +63,8 @@ LinearMeta::LinearMeta(FFHandler handler, // Allocate descriptors checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; } LinearMeta::~LinearMeta(void) { @@ -170,6 +172,172 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + 
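inference_kernel_wrapper above mirrors forward_kernel_wrapper but adds PEFT bookkeeping: for the at-most-one request with peft_bwd set, the post-activation output is copied into output_activation_buffer, which is grown lazily to max_sequence_length * out_dim and reused across batches. Those cached values are what the later relu/sigmoid backward consumes in peft_bwd_kernel. A simplified sketch of the ReLU case (illustrative only, not the relu_backward_kernel this patch calls):

    // Mask the incoming gradient with the saved forward output:
    // d/dx relu(x) = 1 where the forward output was positive, else 0.
    __global__ void relu_backward_sketch(float *output_grad,
                                         float const *saved_output,
                                         int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        output_grad[i] = saved_output[i] > 0.0f ? output_grad[i] : 0.0f;
      }
    }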
checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] inference time = %.2lfms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} + void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -323,17 +491,7 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -398,7 +556,7 @@ void forward_kernel(LinearMeta const *m, size_t elements = (size_t)out_dim * (size_t)batch_size; constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - gelu_forward_kernel<<>>( + gelu_forward_kernel<<>>( elements, B, C, (float *)output_ptr); } else if (m->activation == AC_MODE_NONE) { // Do nothing @@ -407,6 +565,74 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update 
input_grad_ptr and output_grad_ptr offset + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = + static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + output_grad_ptr = + static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; + cudaDataType_t compute_type = output_type; + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + if (input_grad_ptr != NULL) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, @@ -428,17 +654,7 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( @@ -450,7 +666,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -491,7 +707,7 @@ void backward_kernel(LinearMeta const *m, assert(false && "Only L2 regularization is supported"); } - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -515,7 +731,7 @@ void backward_kernel(LinearMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp new file mode 100644 index 0000000000..c3c2cce3cf --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -0,0 +1,576 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
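The new lora_linear_kernels files (this HIP version and the CUDA twin further below) expose three entry points: init_kernel_wrapper seeds the LoRA factors, inference_kernel_wrapper adds the low-rank update onto the already-computed base linear output, and peft_bwd_kernel_wrapper produces gradients and optionally applies the optimizer in place. As a reading aid for the two GemmEx calls in inference_kernel, with w0 = A of shape [in_dim, rank], w1 = B of shape [rank, out_dim] and s = lora_alpha / rank, the per-request computation is (a summary of the code, not an additional code path):

    z  = A^T x       // [rank, num_peft_tokens]; kept in low_rank_activation when
                     // peft_bwd is set, otherwise staged in handle.workSpace
    y += s * B^T z   // accumulated in place onto the base linear output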
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/hip_helper.h" +#include +#include +#include + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void init_kernel_wrapper(LoraLinearMeta *m, int seed) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + Internal::init_kernel(m, seed, stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::init_kernel(m, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + 
Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void init_kernel(LoraLinearMeta *m, int seed, hipStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + for (auto &model_state : m->model_state) { + LoraLinearWeight weight = model_state.second.weights; + int w0_num_elements = weight.rank * weight.in_dim; + int w1_num_elements = weight.rank * weight.out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(weight.in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
<DT> lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(hipMemcpyAsync(static_cast<DT *>
(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(weight.rank); + std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector<DT> lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(hipMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + } +} + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->input_type[1]); + hipblasDatatype_t lr_actv_type = output_type; + assert(input_type == output_type); + hipblasDatatype_t weight_type = output_type; + hipblasDatatype_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipDataType compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + void *intermediate_result_ptr = nullptr; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } + // copy input activation + checkCUDA(hipMemcpyAsync(m->input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; + } + // buffer = weight_first * input + // [rank, 
num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + intermediate_result_ptr, + lr_actv_type, + rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } +} + +template +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + DT const *WGrad, + DT *V, + DT *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + DT gt = WGrad[i] + (DT)weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * (DT)momentum + gt; + if (nesterov) { + gt = gt + (DT)momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= (DT)lr * gt; + } +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + hipblasDatatype_t weight_type = output_type; + hipblasDatatype_t lr_actv_type = output_type; + hipblasDatatype_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipDataType compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + + // Compute LORA_B weight's gradient + if 
(bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &scaling_constant, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.w1_grad_ptr, + weight_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { + LoraOptimizerConfig const *optimizer_config = + m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + assert(optimizer_config != nullptr); + assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // Get optimizer config + if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + (LoraSGDOptimizerConfig const *)optimizer_config; + // LoRA_A weight is split in tensor parallelism, so no need to apply + // all-reduce + sgd_update<<>>(w0_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w0_grad_ptr), + static_cast<DT *>
(weight.w0_v_values_ptr), + static_cast<DT *>
(weight.w0_ptr)); + // LoRA_B weight is replicated with tensor parallelism, so we need to sync + // and sum first + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); + checkCUDA(ncclAllReduce(static_cast<DT *>
(weight.w1_grad_ptr), + static_cast<DT *>
(weight.w1_grad_ptr), + w1_num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); + sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast<DT const *>
(weight.w1_grad_ptr), + static_cast<DT *>
(weight.w1_v_values_ptr), + static_cast<DT *>
(weight.w1_ptr)); + } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optimizer type not implemented yet"); + } else { + assert(false && "Unsupported optimizer type"); + } + } + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu new file mode 100644 index 0000000000..5f130782aa --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -0,0 +1,579 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/cuda_helper.h" +#include +#include + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void init_kernel_wrapper(LoraLinearMeta *m, int seed) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + Internal::init_kernel(m, seed, stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::init_kernel(m, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + 
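For peft_bwd_kernel in both files, reading its GemmEx calls with the same notation (z = A^T x cached at inference time, dy = output_grad, s = lora_alpha / rank) gives, in order (again a summary, not extra code):

    dB  = s * z * dy^T   // [rank, out_dim]; beta is 0 or 1 per optimizer_tasks.reset_gradients_to_zero
    dz  = s * B * dy     // [rank, tokens]; overwrites low_rank_activation in place
    dA  = x * dz^T       // [in_dim, rank]
    dx += A * dz         // [in_dim, tokens]; beta = 1 unless reset_input_grads[0]

When optimizer_tasks.update_weights is set, sgd_update then applies plain SGD with momentum (g = dW + weight_decay * W; V = momentum * V + g; W -= lr * (nesterov ? g + momentum * V : V)). The LoRA_B gradient is first summed across tensor-parallel shards with ncclAllReduce because B is replicated, while LoRA_A is sharded and needs no reduction.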
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void init_kernel(LoraLinearMeta *m, int seed, cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + for (auto &model_state : m->model_state) { + LoraLinearWeight weight = model_state.second.weights; + int w0_num_elements = weight.rank * weight.in_dim; + int w1_num_elements = weight.rank * weight.out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(weight.in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
<DT> lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(weight.rank); + std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector<DT> lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + } +} + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); + cudaDataType_t lr_actv_type = output_type; + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + void *intermediate_result_ptr = nullptr; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } + // copy input activation + checkCUDA(cudaMemcpyAsync(m->input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; + } + // buffer = weight_first * input + // [rank, 
num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + intermediate_result_ptr, + lr_actv_type, + rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + +template +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + DT const *WGrad, + DT *V, + DT *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + DT gt = WGrad[i] + (DT)weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * (DT)momentum + gt; + if (nesterov) { + gt = gt + (DT)momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= (DT)lr * gt; + } +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t lr_actv_type = output_type; + cudaDataType_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + + // Compute LORA_B weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) 
{ + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &scaling_constant, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.w1_grad_ptr, + weight_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { + LoraOptimizerConfig const *optimizer_config = + m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + assert(optimizer_config != nullptr); + assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // Get optimizer config + if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + (LoraSGDOptimizerConfig const *)optimizer_config; + // LoRA_A weight is split in tensor parallelism, so no need to apply + // all-reduce + sgd_update<<>>(w0_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w0_grad_ptr), + static_cast<DT *>
(weight.w0_v_values_ptr), + static_cast<DT *>
(weight.w0_ptr)); + // LoRA_B weight is replicated with tensor parallelism, so we need to sync + // and sum first +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); + checkCUDA(ncclAllReduce(static_cast<DT const *>
(weight.w1_grad_ptr), + static_cast<DT *>
(weight.w1_grad_ptr), + w1_num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif + sgd_update<DT><<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast<DT const *>
(weight.w1_grad_ptr), + static_cast<DT *>
(weight.w1_v_values_ptr), + static_cast<DT *>
(weight.w1_ptr)); + } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optimizer type not implemented yet"); + } else { + assert(false && "Unsupported optimizer type"); + } + } + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/pool_2d_kernels.cpp b/src/ops/kernels/pool_2d_kernels.cpp index 8af85612ca..b3f20a35dd 100644 --- a/src/ops/kernels/pool_2d_kernels.cpp +++ b/src/ops/kernels/pool_2d_kernels.cpp @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/hip_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/pool_2d_kernels.cu b/src/ops/kernels/pool_2d_kernels.cu index b418d20cd3..c236f049ba 100644 --- a/src/ops/kernels/pool_2d_kernels.cu +++ b/src/ops/kernels/pool_2d_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/reshape_kernels.cpp b/src/ops/kernels/reshape_kernels.cpp index b17d95bfea..47f407fd82 100644 --- a/src/ops/kernels/reshape_kernels.cpp +++ b/src/ops/kernels/reshape_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/reshape_kernels.cu b/src/ops/kernels/reshape_kernels.cu index 9786f63815..0a2b01ae52 100644 --- a/src/ops/kernels/reshape_kernels.cu +++ b/src/ops/kernels/reshape_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 6906556452..016364edfd 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -22,18 +22,16 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; + #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; + inplace_residual 
= rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; @@ -47,12 +45,14 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } } + namespace Kernels { namespace ResidualRMSNorm { @@ -78,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -109,18 +107,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -128,11 +121,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -144,19 +138,10 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *residual_output_ptr, T *output_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualRMSNormFusedForwardKernel), - num_blocks, - num_threads, + m->batch_size, + std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream, m->in_dim, @@ -178,7 +163,57 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, 
stream)); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -211,6 +246,67 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, assert(false && "Unsupported data type"); } + // save input activation if needed for PEFT. This must be done after the + // forward kernel since that's where we add the residual + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * 
num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -222,6 +318,288 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ float ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] = dX1_residual[index] + static_cast(dX_val); + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX1[index]); + } else { + dX2[index] += static_cast(dX1[index]); + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
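+// Sketch of the math these backward kernels implement, for one row x of
+// length N with rrms = 1/sqrt(mean(x^2) + eps) saved by the forward pass and
+// y_j = gamma_j * x_j * rrms (derivation only; c1 below is the saved rrms and
+// c2 is the per-row scalar produced by ComputeInternalGradientsCUDAKernel):
+//   c2          = -(rrms^3 / N) * sum_k( dy_k * gamma_k * x_k )
+//   dL/dx_j     = c1 * gamma_j * dy_j + c2 * x_j
+//   dL/dgamma_j = sum over rows i of ( dy_{i,j} * x_{i,j} * rrms_i )
+// The residual path then adds the upstream residual gradient into dX1 and
+// mirrors dX1 into dX2, as RMSNormBackwardCUDAKernel does above.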
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + hipStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + N, + nullptr, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, + T const *weight_ptr, + hipStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>( + N, + output_grad_0_ptr, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_0_ptr, + input_grad_1_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + } +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); 
+ assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); + + if (output_grad_1.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad_1.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449b..0d44f0260a 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -24,17 +24,14 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; + inplace_residual = rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; @@ -48,6 +45,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -80,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -89,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -111,18 +107,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -130,11 +121,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -147,26 +139,17 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualRMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input1_ptr, - input2_ptr, - residual_output_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input1_ptr, + input2_ptr, + residual_output_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(ResidualRMSNormMeta const *m, @@ -219,6 +202,401 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported 
data type"); + } + + // save input activation if needed for PEFT. This must be done after the + // forward kernel since that's where we add the residual + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ float ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + 
static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] = dX1_residual[index] + static_cast(dX_val); + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX1[index]); + } else { + dX2[index] += static_cast(dX1[index]); + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + N, + nullptr, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, + T const *weight_ptr, + cudaStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>( + N, + output_grad_0_ptr, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_0_ptr, + input_grad_1_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + } +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) 
{ + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); + assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); + + if (output_grad_1.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad_1.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 24ab7051e6..4158628005 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -23,16 +23,12 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -47,12 +43,14 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } } + namespace Kernels { namespace RMSNorm { @@ -78,7 +76,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,9 +85,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -107,16 +103,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -124,10 +115,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -138,19 +128,10 @@ void forward_kernel(RMSNormMeta const *m, T const *weight_ptr, T *output_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormFusedForwardKernel), - num_blocks, - num_threads, + m->batch_size, + std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream, m->in_dim, @@ -204,6 +185,363 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if 
(activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + + if (input.data_type == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX, + bool reset_input_grad) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
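+// A minimal host-side reference of the same backward math, useful for
+// sanity-checking the kernels above and below on small M x N inputs. This is
+// an illustrative sketch only (the helper name and float-only signature are
+// not part of FlexFlow); it assumes gradients are reset rather than
+// accumulated and needs <cmath> if used standalone:
+//
+//   void rms_norm_backward_ref(int M, int N, float eps, float const *X,
+//                              float const *dY, float const *gamma,
+//                              float *dX, float *dgamma) {
+//     for (int j = 0; j < N; j++) { dgamma[j] = 0.0f; }
+//     for (int i = 0; i < M; i++) {
+//       float sumsq = 0.0f, ds = 0.0f;
+//       for (int j = 0; j < N; j++) { sumsq += X[i * N + j] * X[i * N + j]; }
+//       float rrms = 1.0f / sqrtf(sumsq / N + eps); // the rrms saved in m->rms_ptr
+//       for (int j = 0; j < N; j++) { ds += dY[i * N + j] * X[i * N + j] * gamma[j]; }
+//       float c2 = -ds * rrms * rrms * rrms / N;    // the c2 stored in m->norm_ptr
+//       for (int j = 0; j < N; j++) {
+//         dX[i * N + j] = rrms * gamma[j] * dY[i * N + j] + c2 * X[i * N + j];
+//         dgamma[j] += dY[i * N + j] * X[i * N + j] * rrms;
+//       }
+//     }
+//   }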
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + hipStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +template +void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + hipStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + 
stream, + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + } +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f98..dd6ada864d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -24,16 +24,12 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -48,6 +44,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -96,66 +93,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? 
shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - -#ifdef DEADCODE -template -__global__ void - RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { - __shared__ float v_shared[C10_WARP_SIZE]; - long long const i = blockIdx.x; - float sum = 0.0f; - for (long long j = threadIdx.x; j < N; j += blockDim.x) { - long long const index = i * N + j; - sum += (static_cast(X[index]) * static_cast(X[index])); - } - sum = BlockReduceSum(sum, - v_shared); // use BlockReduceSum() to sum X_ij^2 - - if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); - } -} - -template -__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void elewise_apply_weights(int64_t batch_size, - int64_t in_dim, - T const *norm, - T const *weights, - T *output) { - CUDA_KERNEL_LOOP(i, batch_size * in_dim) { - output[i] = norm[i] * weights[i % in_dim]; - } -} -#endif - template __global__ void RMSNormFusedForwardKernel(int64_t N, float eps, @@ -167,16 +104,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -184,10 +116,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -199,24 +130,15 @@ void forward_kernel(RMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - RMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(RMSNormMeta const *m, @@ -261,6 +183,346 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + 
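RMSNormFusedForwardKernel above folds the old row-wise RMS reduction and the separate normalization/scale kernels into a single launch, with one block per batch row and std::min(CUDA_NUM_THREADS, in_dim) threads. A plain CPU sketch of the same computation (editorial reference, float-only, illustrative names):

#include <cmath>
#include <cstddef>
#include <vector>

// Editorial CPU reference (not part of the patch) for the fused forward
// kernel: rrms holds the reciprocal RMS per row (m->rms_ptr), Y the
// normalized values (m->norm_ptr), output the weight-scaled result.
void rms_norm_forward_reference(std::size_t M, std::size_t N, float eps,
                                std::vector<float> const &X,
                                std::vector<float> const &weights,
                                std::vector<float> &rrms,
                                std::vector<float> &Y,
                                std::vector<float> &output) {
  for (std::size_t i = 0; i < M; ++i) {
    float sum = 0.0f;
    for (std::size_t j = 0; j < N; ++j) {
      sum += X[i * N + j] * X[i * N + j];
    }
    rrms[i] = 1.0f / std::sqrt(sum / static_cast<float>(N) + eps);
    for (std::size_t j = 0; j < N; ++j) {
      Y[i * N + j] = X[i * N + j] * rrms[i];
      output[i * N + j] = Y[i * N + j] * weights[j];
    }
  }
}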
cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + + if (input.data_type == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void 
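In the PEFT branch of inference_kernel_wrapper above, the persistent activation buffer is sized from max_sequence_length while the copy covers only the tokens present in this batch, starting at the request's first_token_offset_in_batch. A host-side sketch of that arithmetic (the struct and function names are illustrative and not part of the patch):

#include <cstddef>

// Editorial sketch of the sizing arithmetic used before caching a PEFT
// request's input activation; names below do not exist in FlexFlow.
struct PeftCachePlan {
  std::size_t capacity_bytes;  // keyed to max_sequence_length
  std::size_t copy_bytes;      // only the tokens present in this batch
  std::size_t src_elem_offset; // first_token_offset_in_batch * in_dim
  bool needs_realloc;          // grow the persistent buffer first?
};

PeftCachePlan plan_peft_activation_cache(std::size_t dtype_size,
                                         int max_peft_tokens,
                                         int num_peft_tokens,
                                         int first_token_offset,
                                         int in_dim,
                                         std::size_t allocated_bytes) {
  PeftCachePlan p;
  p.capacity_bytes =
      dtype_size * static_cast<std::size_t>(max_peft_tokens) * in_dim;
  p.copy_bytes =
      dtype_size * static_cast<std::size_t>(num_peft_tokens) * in_dim;
  p.src_elem_offset = static_cast<std::size_t>(first_token_offset) * in_dim;
  p.needs_realloc = p.capacity_bytes > allocated_bytes;
  return p;
}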
RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX, + bool reset_input_grad) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +template +void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + cudaStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) 
{ + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + RMSNormBackwardCUDAKernel + <<>>( + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + } +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 89c9f14a01..fa31c5adff 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -25,13 +25,13 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; @@ -41,20 +41,26 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), 
output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -70,11 +76,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } -template void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements) { + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -84,8 +88,22 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + assert(input_grad.domain == output_grad.domain); + if (m->output_type[0] == DT_FLOAT) { + Internal::backward_kernel(m, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::backward_kernel(m, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -101,21 +119,112 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(hipMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + } + } else if (m->output_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(hipMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + hipMemcpyDeviceToDevice, + stream)); + } + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); 
print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + num_classes, + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + num_classes, + stream); + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} namespace Internal { template @@ -138,7 +247,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, hipStream_t stream) { @@ -149,6 +259,116 @@ void backward_kernel(DT *input_grad_ptr, stream)); } +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); +} + +template +__global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT *input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } + } +} + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + hipStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < 
bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(hipMemsetAsync(input_grad_ptr + + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sparse_categorical_crossentropy_loss_peft_backward
<DT>),
+        GET_BLOCKS(num_bwd_tokens * num_classes),
+        CUDA_NUM_THREADS,
+        0,
+        stream,
+        input_grad_ptr + tokens_previous_requests * num_classes,
+        output_grad_ptr + tokens_previous_requests * num_classes,
+        static_cast<BatchConfig::TokenId *>(m->handle.workSpace),
+        num_bwd_tokens,
+        num_classes);
+    // scale
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel<DT>
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); + + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); +} + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..16f1219bf6 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -24,7 +24,7 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); @@ -40,10 +40,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -52,7 +51,15 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -68,11 +75,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } -template void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements) { + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -82,8 +87,22 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + assert(input_grad.domain == output_grad.domain); + if (m->output_type[0] == DT_FLOAT) { + Internal::backward_kernel(m, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::backward_kernel(m, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -99,21 +118,113 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - 
half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } + } else if (m->output_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); + } + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + num_classes, + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + num_classes, + stream); + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + namespace Internal { template void forward_kernel(SoftmaxMeta const *m, @@ -135,7 +246,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, cudaStream_t stream) { @@ -146,6 +258,115 @@ void 
backward_kernel(DT *input_grad_ptr, stream)); } +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + cudaStream_t stream) { + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr)); +} + +template +__global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT *input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } + } +} + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + cudaStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(cudaMemsetAsync( + input_grad_ptr + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(cudaMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + cudaMemcpyHostToDevice, + stream)); + sparse_categorical_crossentropy_loss_peft_backward<<< + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream>>>( + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); + // scale + scale_kernel<<>>(input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); + + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); +} + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/src/ops/kernels/transpose_kernels.cpp b/src/ops/kernels/transpose_kernels.cpp index 49a7d827f5..199e1cd0c1 100644 --- a/src/ops/kernels/transpose_kernels.cpp +++ b/src/ops/kernels/transpose_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/hip_helper.h" #include @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; 
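Internal::peft_bwd_kernel above (in both the HIP and CUDA files) computes the softmax-plus-cross-entropy gradient for a PEFT request: the labels are the request's token ids shifted left by one position, the last token receives a zero gradient, and the result is scaled by 1 / (num_tokens_in_batch - 1). A CPU reference sketch of that per-request math (editorial only; probs stands for the softmax output that inference_kernel_wrapper copies into the output_grad region when the softmax is the last operator):

#include <algorithm>
#include <vector>

// Editorial CPU reference (not part of the patch) for one PEFT request.
// probs holds the saved softmax output; token_ids are the request's token
// ids in batch order; input_grad is num_tokens x num_classes.
void sparse_ce_peft_backward_reference(int num_tokens, int num_classes,
                                       std::vector<float> const &probs,
                                       std::vector<int> const &token_ids,
                                       std::vector<float> &input_grad) {
  int num_bwd_tokens = num_tokens - 1;
  float scale = 1.0f / static_cast<float>(num_bwd_tokens);
  for (int t = 0; t < num_bwd_tokens; ++t) {
    int label = token_ids[t + 1]; // token t is trained to predict token t + 1
    for (int c = 0; c < num_classes; ++c) {
      // d(loss)/d(logit) of softmax + cross-entropy: prob - one_hot(label)
      float g = probs[t * num_classes + c] - (c == label ? 1.0f : 0.0f);
      input_grad[t * num_classes + c] = g * scale;
    }
  }
  // The last token has no next-token label, so its gradient row is zeroed.
  std::fill_n(input_grad.begin() + num_bwd_tokens * num_classes,
              num_classes, 0.0f);
}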
+TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/kernels/transpose_kernels.cu b/src/ops/kernels/transpose_kernels.cu index b401ff0ba1..18a6e405af 100644 --- a/src/ops/kernels/transpose_kernels.cu +++ b/src/ops/kernels/transpose_kernels.cu @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -21,6 +22,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; +TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index b19f400eb2..3161987d60 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" @@ -56,7 +57,7 @@ LayerNormParams LayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -561,7 +562,7 @@ void LayerNorm::inference_task(Task const *task, assert(regions.size() == 2); } - LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + LayerNorm::inference_kernel_wrapper(m, bc, in, out, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -645,6 +646,104 @@ void LayerNorm::forward_task(Task const *task, LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } +Legion::FutureMap + LayerNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // regions[0](I): output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + if (elementwise_affine) { + // regions[2](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): gamma +*/ +void LayerNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); + assert(task->regions.size() == regions.size()); + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + + Domain out_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain in_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 3)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + Domain gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(gamma_domain.get_volume() == m->effective_num_elements); + } else { + assert(regions.size() == 2); + } + LayerNorm::peft_bwd_kernel_wrapper(m, output_grad, input_grad, gamma); +} + void LayerNorm::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -722,55 +821,60 @@ void LayerNorm::backward_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); - float const *in_ptr = NULL, *out_grad_ptr = NULL, *gamma_ptr = NULL; - float *in_grad_ptr = NULL, *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = 
helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - in_grad_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + gamma_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain gamma_grad_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); - gamma_grad_ptr = helperGetTensorPointerRW( - regions[4], task->regions[4], FID_DATA, ctx, runtime); if (m->use_bias) { Domain beta_grad_domain = runtime->get_index_space_domain( ctx, task->regions[5].region.get_index_space()); - beta_grad_ptr = helperGetTensorPointerRW( - regions[5], task->regions[5], FID_DATA, ctx, runtime); + beta_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); assert(gamma_domain == beta_grad_domain); } - assert(gamma_domain == gamma_grad_domain); - assert(gamma_domain.get_volume() == m->effective_num_elements); } else { assert(regions.size() == 3); } - - LayerNorm::backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + LayerNorm::backward_kernel_wrapper( + m, output_grad, input, input_grad, gamma, gamma_grad, beta_grad); } bool LayerNorm::measure_operator_cost(Simulator *sim, @@ -785,7 +889,8 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } Domain input_domain = sub_input.get_domain(); Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = sim->layernorm_meta; + MemoryAllocator gpu_mem_allocator(sim->memory); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this, gpu_mem_allocator); sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); @@ -821,16 +926,24 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, if (sim->computationMode == COMP_MODE_TRAINING) { float *in_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW in_grad_acc( + inputs[0]->data_type, input_domain, in_grad_ptr); assert(in_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *out_grad_ptr = NULL; out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorR out_grad_acc( + outputs[0]->data_type, output_domain, out_grad_ptr); 
assert(out_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorW gamma_grad_acc( + outputs[0]->data_type, output_domain, gamma_grad_ptr); + GenericTensorAccessorW beta_grad_acc( + outputs[0]->data_type, output_domain, beta_grad_ptr); out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && @@ -842,13 +955,13 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } backward = [=] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + backward_kernel_wrapper(m, + out_grad_acc, + input1_acc, + in_grad_acc, + gamma_acc, + gamma_grad_acc, + beta_grad_acc); }; } diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..27d314e21e 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -27,21 +28,37 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; - use_bias = ln->use_bias; + profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; - checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + ds_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + db_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + scale_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } -LayerNormMeta::~LayerNormMeta(void) {} +LayerNormMeta::~LayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} template __device__ __forceinline__ T WARP_SHFL_DOWN(T value, @@ -74,7 +91,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < blockDim.x / C10_WARP_SIZE) ? shared[lid] : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -82,8 +99,14 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { +__global__ void LayerNormFusedForwardKernel(int64_t N, + float eps, + T const *X, + T *mean, + T *rstd, + T const *gamma, + T const *beta, + T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; @@ -103,18 +126,10 @@ __global__ void RowwiseMomentsCUDAKernel( mean[i] = static_cast(sum1); rstd[i] = static_cast(rsqrt(sum2 + eps)); } -} -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { + __syncthreads(); + using T_ACC = T; - const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = @@ -135,28 +150,19 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *gamma_ptr, T const *beta_ptr, hipStream_t stream) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), - m->effective_batch_size, - kCUDABlockReduceNumThreads, - 0, - stream, - m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), - m->effective_batch_size, - kCUDANumThreads, - 0, - stream, - m->effective_num_elements, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + + LayerNormFusedForwardKernel + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -167,24 +173,154 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } if (m->input_type[0] == DT_FLOAT) { - LayerNorm::forward_kernel(m, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() - : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); } else if (m->input_type[0] == DT_HALF) { - LayerNorm::forward_kernel(m, - input.get_half_ptr(), - output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
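LayerNormFusedForwardKernel above merges RowwiseMomentsCUDAKernel and LayerNormForwardCUDAKernel into one launch: each block computes its row's mean and rstd with a block reduction, synchronizes, and then normalizes and applies gamma and beta in the same pass. A CPU sketch of the same computation (editorial reference, float-only; gamma and beta may be null when elementwise_affine or use_bias is off):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Editorial CPU reference (not part of the patch); gamma/beta are optional,
// mirroring the nullptr handling in the kernel.
void layer_norm_forward_reference(std::size_t M, std::size_t N, float eps,
                                  float const *X,
                                  float const *gamma,
                                  float const *beta,
                                  float *mean,
                                  float *rstd,
                                  float *Y) {
  for (std::size_t i = 0; i < M; ++i) {
    float sum1 = 0.0f, sum2 = 0.0f;
    for (std::size_t j = 0; j < N; ++j) {
      sum1 += X[i * N + j];
      sum2 += X[i * N + j] * X[i * N + j];
    }
    sum1 /= static_cast<float>(N); // row mean
    float var = std::max(sum2 / static_cast<float>(N) - sum1 * sum1, 0.0f);
    mean[i] = sum1;
    rstd[i] = 1.0f / std::sqrt(var + eps);
    for (std::size_t j = 0; j < N; ++j) {
      float g = gamma ? gamma[j] : 1.0f;
      float b = beta ? beta[j] : 0.0f;
      Y[i * N + j] = (X[i * N + j] - mean[i]) * rstd[i] * g + b;
    }
  }
}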
beta.get_half_ptr() : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } } template @@ -224,7 +360,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -235,27 +371,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -452,116 +567,148 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, hipStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), - M, - kCUDABlockReduceNumThreads, - 0, - stream, - N, - output_grad_ptr, - input_ptr, - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), - B, - kCUDANumThreads, - 0, - stream, - M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); - + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); - hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), - blocks, - num_threads, - nshared, - stream, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - input_grad_ptr, - N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), - B, - kCUDANumThreads, - 0, - stream, - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } else { const int64_t B = (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; - hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), - B, - dim3(kThreadX, kThreadY), - 0, - stream, - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } /*static*/ template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +void LayerNorm::peft_bwd_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + stream); + } else { + assert(m->output_type[0] == DT_HALF); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + stream); + } } -template void - LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - float const *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *gamma_ptr, - float *gamma_grad_ptr, - float *beta_grad_ptr); +/*static*/ +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + 
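Both backward_kernel and peft_bwd_kernel above end in the same grad-input math: per-row reductions of dY*gamma and dY*gamma*X, followed by a fused scale/bias applied elementwise, with peft_bwd simply substituting the cached input_activation for the live input. A plain CPU reference of that gradient, derived from the standard layer-norm formula and meant only as a shape and formula check, not as the kernel itself:

#include <cmath>
#include <vector>

// x, dy, dx are M x N (row-major); gamma has length N (empty means gamma=1).
void layer_norm_grad_input_ref(std::vector<float> const &x,
                               std::vector<float> const &dy,
                               std::vector<float> const &gamma,
                               std::vector<float> &dx,
                               int M, int N, float eps) {
  for (int i = 0; i < M; i++) {
    // Recompute mean/rstd for row i (the kernels reuse the saved values).
    double mean = 0.0, var = 0.0;
    for (int j = 0; j < N; j++) mean += x[i * N + j];
    mean /= N;
    for (int j = 0; j < N; j++) {
      double d = x[i * N + j] - mean;
      var += d * d;
    }
    double rstd = 1.0 / std::sqrt(var / N + eps);
    // g = dY * gamma; reduce mean(g) and mean(g * xhat) over the row.
    double sum_g = 0.0, sum_gx = 0.0;
    for (int j = 0; j < N; j++) {
      double xhat = (x[i * N + j] - mean) * rstd;
      double g = dy[i * N + j] * (gamma.empty() ? 1.0f : gamma[j]);
      sum_g += g;
      sum_gx += g * xhat;
    }
    for (int j = 0; j < N; j++) {
      double xhat = (x[i * N + j] - mean) * rstd;
      double g = dy[i * N + j] * (gamma.empty() ? 1.0f : gamma[j]);
      dx[i * N + j] = (float)(rstd * (g - sum_g / N - xhat * sum_gx / N));
    }
  }
}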
LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } +} -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..0801d11617 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -27,7 +27,7 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -50,6 +50,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } LayerNormMeta::~LayerNormMeta(void) { @@ -96,73 +97,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - -#ifdef DEADCODE -template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { - __shared__ float m_shared[C10_WARP_SIZE]; - __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; - float sum1 = 0.0f; - float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); - } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); - if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(N); - sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, float(0)); - mean[i] = static_cast(sum1); - rstd[i] = static_cast(rsqrt(sum2 + eps)); - } -} - -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - const T_ACC beta_v = - beta == nullptr ? 
T_ACC(0) : static_cast(beta[j]); - Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; - } -} -#endif - template __global__ void LayerNormFusedForwardKernel(int64_t N, float eps, @@ -177,18 +111,13 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -200,7 +129,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); @@ -221,25 +150,18 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -290,6 +212,116 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, } } +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == 
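The fused forward kernel computes per-row moments as E[x^2] - E[x]^2 (clamped at zero) and is now launched with one block per row and min(CUDA_NUM_THREADS, N) threads striding over the row. A CPU reference of the same math, handy for sanity-checking the kernel's mean/rstd and affine output:

#include <algorithm>
#include <cmath>
#include <vector>

// Row-major M x N input; gamma/beta of length N or empty (treated as 1 / 0).
void layer_norm_forward_ref(std::vector<float> const &x,
                            std::vector<float> const &gamma,
                            std::vector<float> const &beta,
                            std::vector<float> &y,
                            std::vector<float> &mean,   // length M
                            std::vector<float> &rstd,   // length M
                            int M, int N, float eps) {
  for (int i = 0; i < M; i++) {
    float sum1 = 0.f, sum2 = 0.f;
    for (int j = 0; j < N; j++) {
      float v = x[i * N + j];
      sum1 += v;
      sum2 += v * v;
    }
    float scale = 1.f / N;
    float m = sum1 * scale;
    float var = std::max(sum2 * scale - m * m, 0.f);
    mean[i] = m;
    rstd[i] = 1.f / std::sqrt(var + eps);
    for (int j = 0; j < N; j++) {
      float g = gamma.empty() ? 1.f : gamma[j];
      float b = beta.empty() ? 0.f : beta[j];
      y[i * N + j] = (x[i * N + j] - m) * rstd[i] * g + b;
    }
  }
}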
PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { @@ -327,7 +359,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -338,27 +370,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -620,44 +631,83 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, /*static*/ template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +void LayerNorm::peft_bwd_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + stream); + } else { + assert(m->output_type[0] == DT_HALF); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + stream); } - // }else if(m->output_type[0] == DT_HALF){ - // LayerNorm::backward_kernel(m, - // output_grad_ptr, - // input_ptr, - // input_grad_ptr, - // gamma_ptr, - // gamma_grad_ptr, - // beta_grad_ptr, - // stream); - // } } -template void - LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - float const *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *gamma_ptr, - float *gamma_grad_ptr, - float *beta_grad_ptr); +/*static*/ +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } +} -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 44b56d623e..20ad762b62 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -498,7 +498,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, 
m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; m->inference_debugging = linear->inference_debugging; - m->trainableInputs[0] = linear->trainableInputs[0]; + m->trainable_inputs[0] = linear->trainable_inputs[0]; m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; @@ -632,8 +632,11 @@ void Linear::inference_task(Task const *task, m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + assert((weight.domain.hi()[0] - weight.domain.lo()[0] + 1) == in_dim); + assert((weight.domain.hi()[1] - weight.domain.lo()[1] + 1) == out_dim); + assert(weight.domain.get_volume() == in_dim * out_dim); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorR bias; if (m->use_bias && !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { @@ -645,14 +648,15 @@ void Linear::inference_task(Task const *task, runtime); assert(bias.domain.get_volume() == static_cast(out_dim)); } - forward_kernel_wrapper(m, - input.ptr, - output.ptr, - weight.ptr, - bias.ptr, - in_dim, - out_dim, - batch_size); + inference_kernel_wrapper(m, + bc, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -664,6 +668,119 @@ void Linear::inference_task(Task const *task, } Linear::save_inference_tensors_to_file( m, shard_id, bc, {input}, weights_accessors, {output}); + printf("\tin=[%i,%i].T @ w=[%i,%i] -> out=[%i,%i]\n", + in_dim, + bc->num_tokens, + in_dim, + out_dim, + out_dim, + bc->num_tokens); + } +} + +FutureMap Linear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? 
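The new asserts and the debug printf above pin down the shape convention on the inference path: input [in_dim, num_tokens], weight [in_dim, out_dim], output [out_dim, num_tokens]. A naive CPU reading of that bookkeeping, assuming dimension 0 is the contiguous one (the real computation goes through inference_kernel_wrapper, not this loop):

#include <cstddef>
#include <vector>

// input  [in_dim x num_tokens], weight [in_dim x out_dim],
// output [out_dim x num_tokens], dimension 0 assumed contiguous.
void linear_forward_ref(std::vector<float> const &input,
                        std::vector<float> const &weight,
                        std::vector<float> const &bias,  // out_dim or empty
                        std::vector<float> &output,
                        int in_dim, int out_dim, int num_tokens) {
  for (int t = 0; t < num_tokens; t++) {
    for (int o = 0; o < out_dim; o++) {
      float acc = bias.empty() ? 0.f : bias[o];
      for (int i = 0; i < in_dim; i++) {
        acc += input[(size_t)t * in_dim + i] * weight[(size_t)o * in_dim + i];
      }
      output[(size_t)t * out_dim + o] = acc;
    }
  }
}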
MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Linear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LinearMeta *m = *((LinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 3); + assert(task->regions.size() == 3); + if (m->quantization_type == DT_NONE) { + assert(m->input_type[0] == m->weight_type[0]); + } + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false, true); + printf("\tw=[%i,%i] @ out_grad=[%i,%i] -> in_grad[%i,%i]\n", + in_dim, + out_dim, + out_dim, + num_peft_tokens, + in_dim, + num_peft_tokens); + } + peft_bwd_kernel_wrapper(m, + input_grad.ptr, + output_grad.ptr, + weight.ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); } } @@ -782,7 +899,7 @@ void Linear::backward(FFModel const &ff) { launcher.add_field(rid++, FID_DATA); // regions[1](I/O): replica_grad assert(replica == NULL); - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -878,17 +995,17 @@ void Linear::backward_task_with_dim(Task const *task, Runtime *runtime) { // Linear* linear = (Linear*) task->args; LinearMeta const *m = *((LinearMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); DT *input_grad = nullptr; size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); if (domain.get_dim() == NDIM + 1) { @@ -1119,7 +1236,10 @@ bool Linear::measure_operator_cost(Simulator *sim, int input_n = sub_input.get_volume() / input_c; int output_c = sub_output.dims[0].size; int output_n = sub_output.get_volume() / 
output_c; - LinearMeta *m = sim->linear_meta; + + MemoryAllocator gpu_mem_allocator(sim->memory); + LinearMeta *m = new LinearMeta( + sim->handler, output_n, this, gpu_mem_allocator, input_c * output_c); m->activation = activation; m->kernel_reg_type = kernel_reg_type; m->kernel_reg_lambda = kernel_reg_lambda; @@ -1164,7 +1284,7 @@ bool Linear::measure_operator_cost(Simulator *sim, }; if (sim->computationMode == COMP_MODE_TRAINING) { void *input_grad_ptr = NULL; - if (trainableInputs[0]) { + if (trainable_inputs[0]) { input_grad_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); } else { @@ -1313,7 +1433,7 @@ LinearParams Linear::get_params() const { params.kernel_reg_lambda = this->kernel_reg_lambda; params.quantization_type = this->quantization_type; params.offload = this->offload; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc new file mode 100644 index 0000000000..fde6bc2b28 --- /dev/null +++ b/src/ops/lora_linear.cc @@ -0,0 +1,1316 @@ +#include "flexflow/ops/lora_linear.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/layer.h" +#include "flexflow/model.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/peft_weight_allocator.h" +#include "legion/legion_utilities.h" +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::LoraLinear; + +bool check_lora_layer_match(Layer *potential_target, + std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && + potential_target->name != nullptr && strlen(potential_target->name) > 0) { + std::string s(potential_target->name); + if (s.find(target_module_name) != std::string::npos && + s.find("lora") == std::string::npos) { + return true; + } + } + return false; +} + +PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + std::cout << peft_config << std::endl; + assert(false); + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + peft_configs[*peft_model_id] = peft_config; + + for (std::string target_module_name : peft_config.target_modules) { + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) { + continue; + } + + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + Layer *peft_layer 
= base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name + ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; + } + } + name_.erase(last_underscore); + + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + // fix LoRA layer's transformer layer ID and model ID + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); + } + it = layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } + } + } + + // save finetuned lora model configs to file + if (peft_config.trainable) { + std::string finetuned_model_folder = join_path({ + peft_config.cache_folder, + "finetuned_models", + peft_config.peft_model_id, + }); + fs::remove_all(finetuned_model_folder); + std::string finetuned_model_config_folder = join_path({ + finetuned_model_folder, + "config", + }); + fs::create_directories(finetuned_model_config_folder); + std::string lora_linear_config_filepath = join_path({ + finetuned_model_config_folder, + "ff_config.json", + }); + serialize_to_json_file(peft_config, lora_linear_config_filepath); + std::string optimizer_config_filepath = join_path({ + finetuned_model_config_folder, + "ff_optimizer_config.json", + }); + if (typeid(*peft_config.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + peft_config.optimizer_config); + serialize_to_json_file(*sgd_config, optimizer_config_filepath); + } else if (typeid(*peft_config.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + LoraAdamOptimizerConfig const *adam_config = + static_cast( + peft_config.optimizer_config); + serialize_to_json_file(*adam_config, optimizer_config_filepath); + } else { + assert(false && "Optimizer not supported"); + } + } + + return peft_model_id; +} + +Op *LoraLinear::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + std::unordered_map _peft_configs; + std::vector const &peft_ids = + model.peft_layer_to_peft_id[(Layer *)layer]; + for (int i = 0; i < peft_ids.size(); i++) { + _peft_configs.emplace( + std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); + } + return new LoraLinear(model, + layer->layer_guid, + layer->op_type, + inputs[0], + inputs[1], + _peft_configs, + layer->name); +} + +LoraLinear::LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, 
+ ParallelTensor const output) + : LoraLinear(model, + other.layer_guid, + other.op_type, + input, + output, + other.peft_configs, + other.name) {} + +LoraLinear::LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name) + : LoraLinear(model, + params.layer_guid, + params.type, + inputs.first, + inputs.second, + params.peft_configs, + params.name) {} + +LoraLinear::LoraLinear( + FFModel &model, + LayerID const &_layer_guid, + OperatorType _op_type, + ParallelTensor const _input, + ParallelTensor const _output, + std::unordered_map const &_peft_configs, + char const *name) + : Op(model, + _op_type, + _output->data_type, + name, + 2 /*inputs*/, + 0 /*weights*/, + false, + 1 /*outputs*/, + _input, + _output) { + assert(_input->data_type == _output->data_type); + // overwrite layer_guid + layer_guid = _layer_guid; + data_type = _output->data_type; + + ParallelTensorShape input_shape = this->inputs[0]->get_shape(); + LoraLinearParams params = this->get_params(); + + // Create output tensor + { + int numdim = inputs[1]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[1]->dims[i]; + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, inputs[1]->data_type, this); + } + for (auto const &kv : _peft_configs) { + peft_configs.insert(kv); + } + // assert(check_output_input_weight_parallel_dims(allocate_weights)); +} + +void LoraLinear::init(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +void LoraLinear::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(LoraLinear)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, output_tensor); +} + +template +void load_peft_from_file(DT *ptr, + size_t num_rows, + size_t num_columns, + int num_shards, + int shard_id, + std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + + // HuggingFace dims (serialized in row-major order) + // lora_A: [rank, intermediate_dim] + // lora_B: [hidden_dim, rank] + // FlexFlow dims (serialized in column-major order) + // lora_A: [intermediate_dim, rank] + // lora_B: [rank, out_dim] + // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B + assert(num_rows % num_shards == 0); + size_t chunk_size = num_rows / num_shards; + size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0; + + // Allocate memory for the weight shard + std::vector
host_array(chunk_size * num_columns); + // Read the chunk + size_t total_size_read = 0; + for (int i = 0; i < num_columns; ++i) { + in.seekg((i * num_rows + offset) * sizeof(DT)); + in.read(reinterpret_cast(host_array.data() + i * chunk_size), + chunk_size * sizeof(DT)); + total_size_read += in.gcount(); + } + // Check weight shard size + size_t expected_data_size = chunk_size * num_columns * sizeof(DT); + if (total_size_read != expected_data_size) { + printf("load weight data error: expected %lu bytes, got: %lu bytes, data " + "size: %lu\n", + expected_data_size, + total_size_read, + sizeof(DT)); + assert(false); + } + assert(host_array.size() == chunk_size * num_columns); + // Copy weight to device memory + copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); + in.close(); +} + +/* + regions[0](O): output + regions[1](I): kernel + regions[2](I): bias +*/ +OpMeta *LoraLinear::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinear const *lora = (LoraLinear *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(lora->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorRW(lora->outputs[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int batch_size = output.domain.get_volume() / out_dim; + assert(input.domain.get_volume() == in_dim * batch_size); + assert(output.domain.get_volume() == out_dim * batch_size); + + LoraLinearMeta *m = new LoraLinearMeta(handle, lora); + m->trainable_inputs[0] = lora->trainable_inputs[0]; + std::strcpy(m->op_name, lora->name); + m->layer_guid = lora->layer_guid; + + int num_shards = lora->inputs[0]->dims[0].degree; + int shard_id = task->index_point.point_data[0]; + int num_dims = lora->inputs[0]->num_dims; + assert(in_dim == lora->inputs[0]->dims[0].size / num_shards); + assert(out_dim == + lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); + + DataType dt = m->input_type[0]; + assert(dt == m->input_type[1]); + assert(dt == m->output_type[0]); + assert(dt == lora->inputs[0]->data_type); + assert(dt == lora->inputs[1]->data_type); + assert(dt == lora->outputs[0]->data_type); + + // get layer name + assert(lora->name != nullptr && + "Layer name is not set, cannot determine weights location"); + std::string lora_layername = std::string(lora->name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + for (auto const &kv : lora->peft_configs) { + PEFTModelID const &model_id = kv.first; + LoraLinearConfig const &lora_config = kv.second; + + int rank = lora_config.rank; + + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. 
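Per the layout comments above, lora_A is serialized column-major as [in_dim * num_shards, rank] and sharded along its rows, while lora_B ([rank, out_dim]) is replicated across shards, so load_peft_from_file seeks to one contiguous run per column. A small sketch that prints exactly which element ranges a given shard reads; the sizes in the trailing usage comment are made up:

#include <cassert>
#include <cstddef>
#include <cstdio>

// Column-major [num_rows x num_cols] weight sharded along its rows.
void print_shard_reads(size_t num_rows, size_t num_cols,
                       int num_shards, int shard_id) {
  assert(num_rows % num_shards == 0);
  size_t chunk = num_rows / num_shards;
  size_t offset = (num_shards > 1) ? (size_t)shard_id * chunk : 0;
  for (size_t col = 0; col < num_cols; ++col) {
    std::printf("col %zu: elements [%zu, %zu)\n",
                col, col * num_rows + offset,
                col * num_rows + offset + chunk);
  }
}

// e.g. lora_A on 4 shards: print_shard_reads(4 * 1024, 16, 4, /*shard_id=*/1);
//      lora_B loaded whole: print_shard_reads(16, 4096, 1, 0);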
+ int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = rank; + int lora_B_num_rows = rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + + LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; + weight.num_shards = num_shards; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + + if (!lora_config.init_lora_weights) { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? "full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else { + assert(false && "Data type not supported"); + } + } else { + // initialize weights + int seed = 0; + init_kernel_wrapper(m, seed); + } + + // allocate space for gradients if the LoRA layer is trainable + if (lora_config.trainable) { + // Ensure we have an optimizer + assert(lora_config.optimizer_config != nullptr && "Optimizer not set"); + assert(typeid(*lora_config.optimizer_config) != + typeid(LoraOptimizerConfig) && + "Optimizer config is not a subclass of LoraOptimizerConfig"); + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, 
w1_num_elements * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + // allocate space for v_values if needed by optimizer + if (typeid(*lora_config.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + lora_config.optimizer_config); + if (sgd_config->momentum > 0.0f) { + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + weight.w0_v_values_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_v_values_ptr = allocator->allocate_sync_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } else { + weight.w0_v_values_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_v_values_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + } + } else if (typeid(*lora_config.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optim not yet implemented"); + } else { + assert(false && "Optimizer not supported"); + } + } + assert(m->model_state.find(model_id) == m->model_state.end()); + m->model_state[model_id].weights = weight; + m->model_state[model_id].optimizer_config = lora_config.optimizer_config; + m->model_state[model_id].lora_alpha = lora_config.lora_alpha; + m->model_state[model_id].cache_folder = lora_config.cache_folder; + m->model_state[model_id].peft_model_id = lora_config.peft_model_id; + } + return m; +} + +void LoraLinear::forward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +FutureMap + LoraLinear::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void LoraLinear::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + inference_kernel_wrapper(m, bc, input, output); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + // get layer name + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // print layer name + std::cout << "INF " << lora_layername_substr << std::endl; + + // build output filepath + fs::path dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ lora_layername_substr; + dst_filepath /= layername; + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(dst_filepath.string() + ".batch_config"); + } + + std::string filename = dst_filepath.string() + ".input_0"; + if (input.data_type == DT_FLOAT) { + save_tensor( + input.get_float_ptr(), input.domain.get_volume(), filename.c_str()); + } else if (input.data_type == DT_HALF) { + save_tensor( + input.get_half_ptr(), input.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + int rank, num_tokens; + for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_state[peft_model_id].weights; + rank = weight.rank; + num_tokens = input.domain.get_volume() / weight.in_dim; + fs::path dst_filepath_weights = + get_dst_folder("weights", m->decoding_step, shard_id) / layername; + std::string filenameA = + dst_filepath_weights.string() + ".weight_A.original"; + std::string filenameB = + dst_filepath_weights.string() + ".weight_B.original"; + if (m->input_type[0] == DT_FLOAT) { + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else if (m->input_type[0] == DT_HALF) { + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + + filename = dst_filepath.string() + ".output_0"; + if (output.data_type == DT_FLOAT) { + save_tensor( + output.get_float_ptr(), output.domain.get_volume(), filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor( + output.get_half_ptr(), output.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + if (bc->num_active_peft_tokens() > 0) { + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)m->low_rank_activation, + rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)m->low_rank_activation, + rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } + m->decoding_step++; + } +} + +FutureMap LoraLinear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + MachineView const *view = mv ? 
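model_state carries the per-model A/B weights, their rank, and lora_alpha, and the rank x num_tokens intermediate saved above as low_rank_activation. A CPU sketch of the low-rank update applied on top of the base linear output, assuming the conventional alpha/rank scaling (the actual kernel lives in lora_linear_kernels and is not part of this hunk):

#include <cstddef>
#include <vector>

// x [in_dim x num_tokens], A [in_dim x rank], B [rank x out_dim],
// out [out_dim x num_tokens]; all column-major, dimension 0 contiguous.
void lora_forward_ref(std::vector<float> const &x,
                      std::vector<float> const &A,
                      std::vector<float> const &B,
                      std::vector<float> &out,
                      int in_dim, int out_dim, int rank,
                      int num_tokens, float lora_alpha) {
  float scale = lora_alpha / rank;
  std::vector<float> low_rank((size_t)rank * num_tokens, 0.f);
  for (int t = 0; t < num_tokens; t++) {
    for (int r = 0; r < rank; r++) {
      float acc = 0.f;
      for (int i = 0; i < in_dim; i++) {
        acc += A[(size_t)r * in_dim + i] * x[(size_t)t * in_dim + i];
      }
      low_rank[(size_t)t * rank + r] = acc;  // the saved low-rank activation
    }
    for (int o = 0; o < out_dim; o++) {
      float acc = 0.f;
      for (int r = 0; r < rank; r++) {
        acc += B[(size_t)o * rank + r] * low_rank[(size_t)t * rank + r];
      }
      out[(size_t)t * out_dim + o] += scale * acc;  // added to base output
    }
  }
}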
mv : &output_tensor->machine_view; + set_argumentmap_for_inference(ff, argmap, output_tensor); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void lora_inference_debugging(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorW input_grad, + GenericTensorAccessorR output_grad, + int shard_id) { + // get layer name + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // print layer name + std::cout << "BWD " << lora_layername_substr << std::endl; + + // build output filepath + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ lora_layername_substr; + dst_filepath /= layername; + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(dst_filepath.string() + ".batch_config"); + } + + // weights, weights gradients + fs::path dst_filepath_weights = + get_dst_folder("weights", m->bwd_step, shard_id) / layername; + assert(m->model_state.size() >= 1 && "Model state empty!"); + for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_state[peft_model_id].weights; + std::string filename_weight_A = + dst_filepath_weights.string() + ".weight_A.finetuned"; + std::string filename_weight_B = + dst_filepath_weights.string() + ".weight_B.finetuned"; + std::string filename_grad_A = + dst_filepath_weights.string() + ".weight_A.gradient"; + std::string filename_grad_B = + dst_filepath_weights.string() + ".weight_B.gradient"; + if (m->input_type[0] == DT_FLOAT) { + // weight A + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filename_weight_A.c_str()); + // weight grad A + save_tensor((float *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filename_grad_A.c_str()); + // weight B + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filename_weight_B.c_str()); + // weight grad B + save_tensor((float *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filename_grad_B.c_str()); + } else if (m->input_type[0] == DT_HALF) { + // weight A + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filename_weight_A.c_str()); + // weight grad A + save_tensor((half *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filename_grad_A.c_str()); + // weight B + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filename_weight_B.c_str()); + // weight grad B + save_tensor((half *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filename_grad_B.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + + std::string filename = dst_filepath.string() + ".input_gradient_0"; + if (input_grad.data_type == DT_FLOAT) { + save_tensor(input_grad.get_float_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else if (input_grad.data_type == DT_HALF) { + save_tensor(input_grad.get_half_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + + filename = dst_filepath.string() + ".output_gradient_0"; + if (output_grad.data_type == DT_FLOAT) { + save_tensor(output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else if (output_grad.data_type == DT_HALF) { + save_tensor(output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + m->bwd_step++; +} + +template +void save_peft_to_file(DT const *weight_ptr, + size_t size, + std::string filepath) { + std::ofstream out(filepath, std::ios::binary); + // Check if the file was opened successfully + if (!out || !out.is_open() || !out.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(out && out.is_open() && out.good() && + "can't write to lora weight file path"); + std::vector
host_array(size); + copy_tensor_dev_to_host(weight_ptr, host_array.data(), size); + + size_t target_data_size = sizeof(DT) * size; + out.write((char *)host_array.data(), target_data_size); + + size_t out_written_size = out.tellp(); + if (out_written_size != target_data_size) { + printf("save weight data error: %lu, %lu, %lu\n", + out_written_size, + target_data_size, + sizeof(DT)); + assert(false); + } + out.close(); +} + +void save_peft_weights_if_needed(LoraLinearMeta *m, + BatchConfig const *bc, + int in_dim, + int out_dim, + int shard_id) { + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + if (bc->requestsInfo[i].optimizer_tasks.save_updated_weights) { + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + std::string weight_export_folder = join_path({ + m->model_state[bc->requestsInfo[i].peft_model_id].cache_folder, + "finetuned_models", + m->model_state[bc->requestsInfo[i].peft_model_id].peft_model_id, + "weights", + "shard_" + std::to_string(shard_id), + }); + fs::create_directories(weight_export_folder); + + int rank = m->model_state[bc->requestsInfo[i].peft_model_id].weights.rank; + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + std::string w0_filepath = join_path( + {weight_export_folder, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weight_export_folder, lora_layername_substr + "_B.weight"}); + if (m->input_type[0] == DT_FLOAT) { + save_peft_to_file( + (float *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w0_ptr, + w0_num_elements, + w0_filepath); + if (shard_id == 0) { + save_peft_to_file( + (float *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w1_ptr, + w1_num_elements, + w1_filepath); + } + } else if (m->input_type[0] == DT_HALF) { + save_peft_to_file( + (half *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w0_ptr, + w0_num_elements, + w0_filepath); + if (shard_id == 0) { + save_peft_to_file( + (half *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w1_ptr, + w1_num_elements, + w1_filepath); + } + } else { + assert(false && "Data type not supported"); + } + } + } +} + +void LoraLinear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + GenericTensorAccessorW input_grad = 
helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + + save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); + + if (m->inference_debugging) { + lora_inference_debugging(m, bc, input_grad, output_grad, shard_id); + } +} + +void LoraLinear::backward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal backward"); +} + +void LoraLinear::print_layer(FFModel const &ff) {} + +void LoraLinear::map_output_tensors(FFModel &ff) { + assert(numOutputs == 1); + assert(numInputs == 2); + assert(outputs[0]->get_volume() == inputs[1]->get_volume()); + outputs[0]->parallel_is = inputs[1]->parallel_is; + outputs[0]->region = inputs[1]->region; + outputs[0]->part = inputs[1]->part; + outputs[0]->region_grad = inputs[1]->region_grad; + outputs[0]->part_grad = inputs[1]->part_grad; +} + +bool LoraLinear::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && + lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (auto const &kv : lhs.peft_configs) { + auto it = rhs.peft_configs.find(kv.first); + if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { + return false; + } + } + return true; + } + return false; +} + +fs::path create_unique_temp_directory() { + std::srand(static_cast(std::time(nullptr))); + + fs::path temp_dir = fs::temp_directory_path(); + fs::path unique_path; + + do { + std::string unique_name = "flexflow_tmp_" + std::to_string(std::rand()); + unique_path = temp_dir / unique_name; + } while (fs::exists(unique_path)); + + fs::create_directory(unique_path); + return unique_path; +} + +void serialize_string(Legion::Serializer &sez, + std::string string_to_serialize) { + sez.serialize(string_to_serialize.length()); + sez.serialize(string_to_serialize.c_str(), string_to_serialize.length()); +} + +std::string deserialize_string(Legion::Deserializer &dez) { + size_t string_size; + char buffer[4096] = {0}; + dez.deserialize(string_size); + dez.deserialize(buffer, string_size); + return std::string(buffer); +} + +void LoraLinear::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); + sez.serialize(this->op_type); + sez.serialize(this->peft_configs.size()); + for (auto const &kv : this->peft_configs) { + // Serialize PEFTModelID + sez.serialize(kv.first.id); + + // Serialize LoraLinearConfig and OptimizerConfig to tmp folder + // 1. Create tmp dir and serialize it + fs::path unique_temp_dir = create_unique_temp_directory(); + serialize_string(sez, unique_temp_dir.string()); + // 2. 
Dump LoraLinearConfig to json file in tmp dir + std::string lora_config_filename = std::string("lora_linear_config_") + + std::to_string(kv.first.id) + + std::string(".json"); + fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; + serialize_to_json_file(kv.second, lora_config_json_filepath); + // 3. Dump optimizer to json file in tmp dir, and serialize optimizer type + std::string optimizer_filename = std::string("optimizer_config_") + + std::to_string(kv.first.id) + + std::string(".json"); + fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; + assert((kv.second.trainable) == (kv.second.optimizer_config != nullptr)); + if (kv.second.trainable) { + if (typeid(*kv.second.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + sez.serialize(OPTIMIZER_TYPE_SGD); + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + kv.second.optimizer_config); + serialize_to_json_file(*sgd_config, optim_config_filepath); + } else if (typeid(*kv.second.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + sez.serialize(OPTIMIZER_TYPE_ADAM); + LoraAdamOptimizerConfig const *adam_config = + static_cast( + kv.second.optimizer_config); + serialize_to_json_file(*adam_config, optim_config_filepath); + } else { + assert(false && "Optimizer type not yet supported"); + } + } + } + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); +} + +/* static */ +using PCG::Node; +Node LoraLinear::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + size_t id, transformer_layer_id, deserialized_model_id; + OperatorType op_type; + size_t num_pefts; + size_t name_len; + char name[MAX_OPNAME] = {0}; + + LoraLinearParams params; + + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + dez.deserialize(op_type); + dez.deserialize(num_pefts); + for (int i = 0; i < num_pefts; i++) { + // Deserialize PEFTModelID + size_t pid; + dez.deserialize(pid); + PEFTModelID peft_model_id(pid); + // Deserialize tmp folder containing LoraLinearConfig and optimizer config + fs::path unique_temp_dir = fs::path(deserialize_string(dez)); + // 1. Deserialize LoraLinearConfig + std::string lora_config_filename = std::string("lora_linear_config_") + + std::to_string(pid) + + std::string(".json"); + fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; + std::unique_ptr lora_linear_config = + deserialize_from_json_file(lora_config_json_filepath); + // 2. 
Deserialize optimizer if needed + if (lora_linear_config->trainable) { + std::string optimizer_filename = std::string("optimizer_config_") + + std::to_string(pid) + + std::string(".json"); + fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; + OptimizerType type_; + dez.deserialize(type_); + if (type_ == OPTIMIZER_TYPE_SGD) { + std::unique_ptr sgd_optimizer_config = + deserialize_from_json_file( + optim_config_filepath); + lora_linear_config->optimizer_config = + dynamic_cast(sgd_optimizer_config.release()); + } else if (type_ == OPTIMIZER_TYPE_ADAM) { + std::unique_ptr adam_optimizer_config = + deserialize_from_json_file( + optim_config_filepath); + lora_linear_config->optimizer_config = + dynamic_cast( + adam_optimizer_config.release()); + } else { + printf("Optimizer type: %d\n", type_); + assert(false && "Optimizer type not yet supported"); + } + } + try { + fs::remove_all(unique_temp_dir); + } catch (fs::filesystem_error const &e) { + std::cerr << "Error removing tmp directory: " << e.what() << std::endl; + } + params.peft_configs.emplace( + std::make_pair(peft_model_id, *lora_linear_config)); + } + dez.deserialize(name_len); + dez.deserialize(name, name_len); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + + params.layer_guid = layer_guid; + params.type = op_type; + strcpy(params.name, name); + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + +Op *LoraLinear::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + LoraLinearParams params = get_params(); + return new LoraLinear(ff, params, {inputs[0], inputs[1]}, this->name); +} + +LoraLinearParams LoraLinear::get_params() const { + LoraLinearParams params; + params.layer_guid = this->layer_guid; + params.type = this->op_type; + if (strlen(this->name) < MAX_OPNAME) { + strcpy(params.name, this->name); + } + params.peft_configs = this->peft_configs; + return params; +} + +bool LoraLinearParams::is_valid( + std::pair const &input_shape) + const { + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::LoraLinearParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); + for (auto const &kv : params.peft_configs) { + hash_combine(key, kv.first.id); + hash_combine(key, kv.second.rank); + hash_combine(key, kv.second.trainable); + hash_combine(key, kv.second.cache_folder); + hash_combine(key, kv.second.peft_model_id); + hash_combine(key, kv.second.lora_alpha); + hash_combine(key, kv.second.lora_dropout); + hash_combine(key, kv.second.target_modules); + hash_combine(key, kv.second.init_lora_weights); + } + return key; +} +}; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc new file mode 100644 index 0000000000..6e0c60e057 --- /dev/null +++ b/src/ops/lora_linear_params.cc @@ -0,0 +1,221 @@ +#include "flexflow/ops/lora_linear_params.h" +#include +#include +#include +using json = nlohmann::json; + +namespace FlexFlow { + +// ---------------- Optimizer configs ---------------- +// --------------------------------------------------- + +// empty optimizer +LoraOptimizerConfig::LoraOptimizerConfig() {} + +// SGD optimizer +LoraSGDOptimizerConfig::LoraSGDOptimizerConfig() + : lr(0.001f), momentum(0.0f), nesterov(false), weight_decay(0.0f) {} + +LoraSGDOptimizerConfig::LoraSGDOptimizerConfig(double lr_, + double momentum_, + bool nesterov_, 
+                                               bool weight_decay_)
+    : lr(lr_), momentum(momentum_), nesterov(nesterov_),
+      weight_decay(weight_decay_) {}
+
+std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc) {
+  os << "SGD Optimizer (lr=" << llc.lr << ",momentum=" << llc.momentum
+     << ",nesterov=" << llc.nesterov << ",weight_decay=" << llc.weight_decay
+     << ")";
+  return os;
+}
+
+// Adam optimizer
+LoraAdamOptimizerConfig::LoraAdamOptimizerConfig()
+    : alpha(0.001f), beta1(0.9f), beta2(0.999f), weight_decay(0.0f),
+      epsilon(1e-8) {}
+
+LoraAdamOptimizerConfig::LoraAdamOptimizerConfig(double alpha_,
+                                                 double beta1_,
+                                                 double beta2_,
+                                                 double weight_decay_,
+                                                 double epsilon_)
+    : alpha(alpha_), beta1(beta1_), beta2(beta2_), weight_decay(weight_decay_),
+      epsilon(epsilon_) {}
+
+std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) {
+  os << "Adam Optimizer (alpha=" << llc.alpha << ",beta1=" << llc.beta1
+     << ",beta2=" << llc.beta2 << ",weight_decay=" << llc.weight_decay
+     << ",epsilon=" << llc.epsilon << ")";
+  return os;
+}
+
+// Serialization helpers
+template <typename T>
+void serialize_to_json_file(T const &obj, fs::path const &filepath) {
+  json j = obj;
+  std::ofstream file(filepath);
+  file << j.dump(4);
+}
+
+template <typename T>
+std::unique_ptr<T> deserialize_from_json_file(fs::path const &filepath) {
+  std::ifstream file(filepath);
+  json j;
+  file >> j;
+  return std::make_unique<T>(j.get<T>());
+}
+
+template void
+    serialize_to_json_file<LoraLinearConfig>(LoraLinearConfig const &obj,
+                                             fs::path const &filepath);
+template void serialize_to_json_file<LoraSGDOptimizerConfig>(
+    LoraSGDOptimizerConfig const &obj, fs::path const &filepath);
+template void serialize_to_json_file<LoraAdamOptimizerConfig>(
+    LoraAdamOptimizerConfig const &obj, fs::path const &filepath);
+template std::unique_ptr<LoraLinearConfig>
+    deserialize_from_json_file<LoraLinearConfig>(fs::path const &filepath);
+template std::unique_ptr<LoraSGDOptimizerConfig>
+    deserialize_from_json_file<LoraSGDOptimizerConfig>(
+        fs::path const &filepath);
+template std::unique_ptr<LoraAdamOptimizerConfig>
+    deserialize_from_json_file<LoraAdamOptimizerConfig>(
+        fs::path const &filepath);
+
+// ------------------ LoRA configs -------------------
+// ---------------------------------------------------
+const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", "");
+
+LoraLinearConfig::LoraLinearConfig(
+    std::string const &cache_folder_,
+    std::string const &peft_model_id_,
+    bool trainable_,
+    LoraOptimizerConfig *optimizer_config_,
+    bool init_lora_weights_,
+    std::string const &base_model_name_or_path_,
+    std::string const &precision_,
+    int rank_,
+    float lora_alpha_,
+    float lora_dropout_,
+    std::vector<std::string> const &target_modules_)
+    : cache_folder(cache_folder_), peft_model_id(peft_model_id_), rank(rank_),
+      lora_alpha(lora_alpha_), lora_dropout(lora_dropout_),
+      trainable(trainable_), optimizer_config(optimizer_config_),
+      init_lora_weights(init_lora_weights_),
+      base_model_name_or_path(base_model_name_or_path_), precision(precision_),
+      target_modules(target_modules_) {
+
+  if (peft_model_id.empty()) {
+    return;
+  }
+  assert(!cache_folder.empty() &&
+         "cache_folder must be provided when using PEFT");
+  if (trainable) {
+    assert(optimizer_config != nullptr &&
+           "optimizer_config must be provided when using PEFT");
+    assert(
+        !base_model_name_or_path.empty() &&
+        "base_model_name_or_path must be provided when training a PEFT model");
+    assert(!precision.empty() &&
+           "precision must be provided when training a PEFT model");
+  } else {
+    assert(init_lora_weights == false &&
+           "init_lora_weights must be false when LORA not trainable");
+    assert(optimizer_config == nullptr &&
+           "optimizer_config must be nullptr when not trainable");
+  }
+  // if we are not initializing LORA from scratch, load the configs from
+  // existing repository
+  if (!init_lora_weights) {
+    std::string peft_inference_config_file_path =
+        join_path({cache_folder, "configs", peft_model_id, "config.json"});
+    std::ifstream config_file(peft_inference_config_file_path);
+    if (config_file.is_open()) {
+      try {
+        json model_config;
+        config_file >> model_config;
+        rank = model_config["r"];
+        lora_alpha = float(model_config["lora_alpha"]);
+        lora_dropout = model_config["lora_dropout"];
+        for (auto &s : model_config["target_modules"]) {
+          target_modules.push_back(s);
+        }
+        // do not load the base_model_name_or_path from the HF config because
+        // we may be applying LoRA to another model
+      } catch (json::exception const &e) {
+        std::cerr << "Error parsing PEFT config from JSON file: " << e.what()
+                  << std::endl;
+        assert(false);
+      }
+    } else {
+      std::cerr << "Error opening JSON file "
+                << peft_inference_config_file_path << std::endl;
+      assert(false);
+    }
+  }
+  assert(rank > 0 && "rank must be greater than 0");
+  assert(lora_alpha > 0.0f && "lora_alpha must be greater than 0.0");
+  assert(lora_dropout >= 0.0f && lora_dropout <= 1.0f &&
+         "lora_dropout must be in [0.0, 1.0]");
+  assert(target_modules.size() > 0 && "target_modules must not be left empty");
+}
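A minimal usage sketch, not part of this patch: the constructor above enforces its invariants with asserts, so a trainable config must carry an optimizer, a base model name, and a precision, and target_modules may never be empty. The cache path, model identifiers, and hyperparameter values below are hypothetical placeholders; only the parameter order and field names come from the signature above.

// Hypothetical example: building a trainable LoRA config backed by SGD.
// All literal values and paths are illustrative, not taken from the patch.
LoraOptimizerConfig *sgd_config =
    new LoraSGDOptimizerConfig(0.001, 0.0, false, 0.0f);
LoraLinearConfig peft_config(
    "/tmp/flexflow_cache",        // cache_folder: required when peft_model_id is set
    "my-org/llama-lora-adapter",  // peft_model_id (hypothetical)
    true,                         // trainable
    sgd_config,                   // optimizer_config: required when trainable
    true,                         // init_lora_weights: initialize A/B from scratch
    "meta-llama/Llama-2-7b-hf",   // base_model_name_or_path: required when trainable
    "fp16",                       // precision: required when trainable
    8,                            // rank (> 0)
    8.0f,                         // lora_alpha (> 0)
    0.0f,                         // lora_dropout (in [0, 1])
    {"down_proj"});               // target_modules: must not be empty

When init_lora_weights is false (which the constructor requires for non-trainable configs), rank, lora_alpha, lora_dropout, and target_modules are instead loaded from configs/<peft_model_id>/config.json under cache_folder, as in the block above.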
+
+// constructor used to support unordered_map
+LoraLinearConfig::LoraLinearConfig() : LoraLinearConfig("", "") {}
+
+bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) {
+  if (lhs.cache_folder == rhs.cache_folder &&
+      lhs.peft_model_id == rhs.peft_model_id && lhs.rank == rhs.rank &&
+      lhs.lora_alpha == rhs.lora_alpha &&
+      lhs.lora_dropout == rhs.lora_dropout &&
+      lhs.target_modules.size() == rhs.target_modules.size() &&
+      lhs.trainable == rhs.trainable &&
+      lhs.init_lora_weights == rhs.init_lora_weights &&
+      lhs.optimizer_config == rhs.optimizer_config &&
+      lhs.base_model_name_or_path == rhs.base_model_name_or_path &&
+      lhs.precision == rhs.precision) {
+    for (int i = 0; i < lhs.target_modules.size(); i++) {
+      if (lhs.target_modules[i] != rhs.target_modules[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
+std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) {
+  os << "LoraLinearConfig: ";
+  os << "cache_folder: " << llc.cache_folder << ", ";
+  os << "peft_model_id: " << llc.peft_model_id << ", ";
+  os << "rank: " << llc.rank << ", ";
+  os << "lora_alpha: " << llc.lora_alpha << ", ";
+  os << "lora_dropout: " << llc.lora_dropout << ", ";
+  os << "target_modules: [";
+  for (int i = 0; i < llc.target_modules.size(); i++) {
+    os << llc.target_modules[i];
+    if (i < llc.target_modules.size() - 1) {
+      os << ", ";
+    }
+  }
+  os << "], ";
+  os << "trainable: " << llc.trainable << ", ";
+  if (llc.optimizer_config != nullptr) {
+    os << "optimizer_config: ";
+    if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) {
+      os << *static_cast<LoraSGDOptimizerConfig const *>(llc.optimizer_config);
+    } else if (typeid(*llc.optimizer_config) ==
+               typeid(LoraAdamOptimizerConfig)) {
+      os << *static_cast<LoraAdamOptimizerConfig const *>(
+          llc.optimizer_config);
+    } else {
+      os << "Unknown optimizer config type";
+    }
+    os << std::endl;
+  }
+  os << "init_lora_weights: " << llc.init_lora_weights << std::endl;
+  os << "base_model_name_or_path: " << llc.base_model_name_or_path
+     << std::endl;
+  os << "precision: " << llc.precision << std::endl;
+  return os;
+}
+
+}; // namespace FlexFlow
diff --git a/src/ops/mean.cc b/src/ops/mean.cc
index b2ec94fdf8..0d41276735 100644
--- a/src/ops/mean.cc
+++ b/src/ops/mean.cc
@@ 
-87,8 +87,7 @@ OpMeta *Mean::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handler); - return m; + return nullptr; } void Mean::forward(FFModel const &ff) {} diff --git a/src/ops/noop.cc b/src/ops/noop.cc index da2d4922e3..45bd76d59d 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -90,8 +90,9 @@ OpMeta *NoOp::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + NoOp *no_op = (NoOp *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handle); + OpMeta *m = new OpMeta(handle, no_op); return m; } @@ -167,7 +168,7 @@ void NoOp::init_inference(FFModel const &ff, set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -244,7 +245,7 @@ void NoOp::init(FFModel const &ff) { set_argumentmap_for_init(ff, argmap); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index 4621ab5909..c8b194afa9 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -315,7 +315,7 @@ OpMeta *Pool2D::init_task(Task const *task, assert(task->regions.size() == 2); Pool2D const *pool = (Pool2D *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Pool2DMeta *m = new Pool2DMeta(handle); + Pool2DMeta *m = new Pool2DMeta(handle, pool); m->profiling = pool->profiling; m->inference_debugging = pool->inference_debugging; std::strcpy(m->op_name, pool->name); @@ -545,7 +545,7 @@ bool Pool2D::measure_operator_cost(Simulator *sim, int output_n = sub_output.dims[3].size; int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Pool2DMeta *m = sim->pool2d_meta; + Pool2DMeta *m = new Pool2DMeta(sim->handler, this); init_kernel(m, input_w, diff --git a/src/ops/reduce.cc b/src/ops/reduce.cc index 454a35caf4..1c0566e9ca 100644 --- a/src/ops/reduce.cc +++ b/src/ops/reduce.cc @@ -41,7 +41,7 @@ ReduceParams Reduce::get_params() const { } params.keepdims = keepdims; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/reduce.cpp b/src/ops/reduce.cpp index c062955ed6..fe122b13eb 100644 --- a/src/ops/reduce.cpp +++ b/src/ops/reduce.cpp @@ -25,7 +25,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reduce.cu b/src/ops/reduce.cu index 65efd90e9b..1352787a12 100644 --- a/src/ops/reduce.cu +++ b/src/ops/reduce.cu @@ -24,7 +24,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reshape.cc 
b/src/ops/reshape.cc index 49f99e2cb5..4e7fd2eb96 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -180,7 +180,7 @@ OpMeta *Reshape::init_task(Task const *task, Runtime *runtime) { Reshape const *reshape = (Reshape *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); + ReshapeMeta *m = new ReshapeMeta(handle, reshape); std::strcpy(m->op_name, reshape->name); m->layer_guid = reshape->layer_guid; m->data_type = reshape->outputs[0]->data_type; @@ -296,7 +296,7 @@ ReshapeParams Reshape::get_params() const { ReshapeParams params; params.shape = shape_vec; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 8dd670eea3..2a30d12d6d 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -44,7 +44,8 @@ bool operator==(ResidualLayerNormParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && lhs.use_bias == rhs.use_bias && - lhs.use_two_residuals == rhs.use_two_residuals; + lhs.use_two_residuals == rhs.use_two_residuals && + lhs.inplace_residual == rhs.inplace_residual; } bool ResidualLayerNormParams::is_valid( @@ -63,7 +64,8 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -78,6 +80,7 @@ void FFModel::residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -117,7 +120,6 @@ void FFModel::residual_layer_norm(const Tensor input, } int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; - Layer *ln = nullptr; Tensor casted_input = (data_type != input->data_type) ? cast(input, data_type, "type cast for residual_layer_norm") @@ -133,20 +135,20 @@ void FFModel::residual_layer_norm(const Tensor input, ? 
cast(residual2, data_type, "type cast for residual2_layer_norm") : residual2; } - ln = new Layer(this, - OP_RESIDUAL_LAYERNORM, - data_type, - name, - 2 + use_two_residuals /*inputs*/, - num_weights, - 2 /*outputs*/, - casted_input, - casted_residual1, - casted_residual2); + Layer *ln = new Layer(this, + OP_RESIDUAL_LAYERNORM, + data_type, + name, + 2 + use_two_residuals /*inputs*/, + num_weights, + 2 /*outputs*/, + casted_input, + casted_residual1, + casted_residual2); ln->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/); ln->outputs[1] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 1, true /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -179,6 +181,7 @@ void FFModel::residual_layer_norm(const Tensor input, ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); ln->add_int_property("use_two_residuals", use_two_residuals); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -199,6 +202,9 @@ Op *ResidualLayerNorm::create_operator_from_layer( layer->get_float_property("eps", eps); layer->get_int_property("use_two_residuals", value); bool use_two_residuals = (bool)value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; + return new ResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -209,6 +215,7 @@ Op *ResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -230,6 +237,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -243,6 +251,7 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -256,7 +265,8 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, _residual1, _use_two_residuals ? 
_residual2 : nullptr), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias), use_two_residuals(_use_two_residuals) { + use_bias(_use_bias), use_two_residuals(_use_two_residuals), + inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -326,6 +336,22 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, } } +void ResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + void ResidualLayerNorm::init_inference( FFModel const &ff, std::vector const &batch_inputs, @@ -347,13 +373,19 @@ void ResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -371,13 +403,15 @@ void ResidualLayerNorm::init_inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -422,13 +456,17 @@ void ResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); int field_id = 0; // input - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(inputs[1]->part, @@ -439,20 +477,21 @@ void ResidualLayerNorm::init(FFModel const &ff) { launcher.add_field(field_id++, FID_DATA); // residual2 if (use_two_residuals) { - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + inputs[2]->region)); + launcher.add_field(field_id++, FID_DATA); + } + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, @@ -516,7 +555,323 @@ void ResidualLayerNorm::forward(FFModel const &ff) { } void ResidualLayerNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::backward_task( + Task const *task, 
+ std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + ResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual1_grad, + residual2_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap ResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[2]->part_grad, + 0 /*projection id*/, + reset_input_grads[2] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 3 + m->use_two_residuals + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + GenericTensorAccessorW residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + ResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector input_accessors; + input_accessors.push_back(input_grad); + input_accessors.push_back(residual1_grad); + if (m->use_two_residuals) { + input_accessors.push_back(residual2_grad); + } + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } + ResidualLayerNorm::save_inference_tensors_to_file(m, + shard_id, + bc, + input_accessors, + weights_accessors, + {output_grad}, + false); + } } Op *ResidualLayerNorm::materialize(FFModel &ff, @@ -554,13 +909,19 @@ FutureMap ResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input 
- launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -578,13 +939,15 @@ FutureMap ResidualLayerNorm::inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -620,14 +983,13 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); if (bc->num_tokens == 0) { return; } - ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + m->use_two_residuals + + 3 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); int region_idx = 0, task_region_idx = 0; @@ -655,13 +1017,23 @@ void ResidualLayerNorm::inference_task( ctx, runtime); } - GenericTensorAccessorW added_output = - helperGetGenericTensorAccessorWO(m->output_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = + helperGetGenericTensorAccessorWO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(m->output_type[1], regions[region_idx++], @@ -699,8 +1071,14 @@ void ResidualLayerNorm::inference_task( assert(in_domain.get_volume() == residual2_domain.get_volume()); assert(residual2_domain == in_domain); } - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[task_region_idx++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[task_region_idx++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -734,13 +1112,13 @@ void ResidualLayerNorm::inference_task( m->effective_num_elements * m->effective_batch_size); ResidualLayerNorm::inference_kernel_wrapper( - m, input, residual1, residual2, added_output, output, gamma, beta); + m, bc, input, residual1, residual2, added_output, output, 
gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - input_accessors.push_back(input); + // input_accessors.push_back(input); input_accessors.push_back(residual1); if (m->use_two_residuals) { input_accessors.push_back(residual2); @@ -779,6 +1157,7 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -794,6 +1173,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; bool use_two_residuals; + bool inplace_residual; float eps; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -810,6 +1190,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -827,6 +1208,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = use_bias; params.use_two_residuals = use_two_residuals; + params.inplace_residual = inplace_residual; strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( @@ -853,6 +1235,7 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); hash_combine(key, params.use_two_residuals); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b0..582e0752ef 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -23,11 +23,12 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; @@ -36,6 +37,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); @@ -45,6 +47,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -75,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -84,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } 
__syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -110,8 +111,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -120,12 +120,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -137,7 +135,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); @@ -161,19 +159,9 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualLayerNormKernel), - num_blocks, - num_threads, + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), 0, stream, m->effective_num_elements, @@ -188,10 +176,41 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, beta_ptr, output_ptr); } +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } +} /*static*/ void 
ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -208,12 +227,13 @@ void ResidualLayerNorm::inference_kernel_wrapper( checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, input.get_float_ptr(), residual1.get_float_ptr(), - residual2.get_float_ptr(), + m->use_two_residuals ? residual2.get_float_ptr() : nullptr, added_output.get_float_ptr(), output.get_float_ptr(), m->elementwise_affine ? gamma.get_float_ptr() : nullptr, @@ -224,7 +244,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( m, input.get_half_ptr(), residual1.get_half_ptr(), - residual2.get_half_ptr(), + m->use_two_residuals ? residual2.get_half_ptr() : nullptr, added_output.get_half_ptr(), output.get_half_ptr(), m->elementwise_affine ? gamma.get_half_ptr() : nullptr, @@ -234,6 +254,76 @@ void ResidualLayerNorm::inference_kernel_wrapper( assert(false && "unsupport datatype in layernorm"); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -245,4 +335,551 @@ 
void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } + if (dX_residual2 != nullptr) { + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + 
beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = 
(num_threads / warp_size) * sizeof(T); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6ed..8cdf87a92c 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -22,11 +22,12 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; @@ -35,6 +36,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); @@ -44,6 +46,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -74,7 +77,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int 
max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -83,9 +86,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -109,8 +110,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -119,12 +119,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -136,7 +134,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -160,33 +158,57 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualLayerNormKernel - <<>>(m->effective_num_elements, - m->eps, - input_ptr, - residual1_ptr, - residual2_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + input_ptr, + residual1_ptr, + residual2_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } } /*static*/ void ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -203,6 +225,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, @@ -229,6 +252,76 @@ void ResidualLayerNorm::inference_kernel_wrapper( assert(false && "unsupport datatype in layernorm"); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = 
bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -240,4 +333,529 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? 
T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } + if (dX_residual2 != nullptr) { + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + 
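// One thread per column: each thread walks all M rows serially, so no + // shared-memory tiling is needed on this small-batch path. +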
GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index b3ee7179d0..744902f908 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -42,7 +42,8 @@ using namespace FlexFlow::Kernels::ResidualRMSNorm; bool operator==(ResidualRMSNormParams const &lhs, ResidualRMSNormParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps; + return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps && + lhs.dim == rhs.dim && lhs.inplace_residual == rhs.inplace_residual; } bool ResidualRMSNormParams::is_valid( @@ -55,7 +56,8 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -66,6 +68,7 @@ void FFModel::residual_rms_norm(const Tensor input1, Tensor *outputs, float eps, int dim, + bool inplace_residual, DataType data_type, char const *name) { if (data_type == DT_NONE) { @@ -90,9 +93,9 @@ void FFModel::residual_rms_norm(const Tensor input1, casted_input2); rm->outputs[0] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 0, true /*create_grad*/); rm->outputs[1] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 1, true /*create_grad*/); // weights int weight_dims[1] = {dim}; @@ -100,12 +103,13 @@ void FFModel::residual_rms_norm(const Tensor input1, weight_dims, data_type, rm, - true /*create_grad*/, + false /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); rm->add_float_property("eps", eps); rm->add_int_property("dim", dim); + rm->add_int_property("inplace_residual", inplace_residual); layers.push_back(rm); outputs[0] = rm->outputs[0]; outputs[1] = rm->outputs[1]; @@ -120,6 +124,8 @@ Op *ResidualRMSNorm::create_operator_from_layer( long long value; layer->get_int_property("dim", value); int dim = value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new ResidualRMSNorm(model, layer->layer_guid, @@ -127,6 +133,7 @@ Op *ResidualRMSNorm::create_operator_from_layer( inputs[1], eps, dim, + inplace_residual, false, layer->name); } @@ -143,6 +150,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, params.eps, params.dim, + params.inplace_residual, allocate_weights, params.name) {} @@ -157,6 +165,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, other.eps, other.dim, + other.inplace_residual, allocate_weights, other.name) {} ResidualRMSNorm::ResidualRMSNorm(FFModel &model, @@ -165,6 +174,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, const ParallelTensor _input2, float _eps, int dim, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -177,6 +187,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, _input1, _input2) { eps = _eps; + 
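// When inplace_residual is set, outputs[0] (the residual sum) aliases inputs[0]'s + // region instead of getting its own mapping; see map_output_tensors() below. +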
inplace_residual = _inplace_residual; inputs[0] = _input1; inputs[1] = _input2; layer_guid = _layer_guid; @@ -234,6 +245,22 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, } } +void ResidualRMSNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + void ResidualRMSNorm::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -249,36 +276,44 @@ void ResidualRMSNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -306,36 +341,45 @@ void ResidualRMSNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -383,73 +427,131 @@ FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } /* - regions[0](I): input1 + regions[0](I/O): input1 / residual output regions[1](I): input2 - regions[2](O): residual output - regions[3](O): output - regions[4](I/O): weight + regions[2](O): output + regions[3](I): weight */ void ResidualRMSNorm::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 5); - assert(regions.size() == 5); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; } ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + assert(task->regions.size() == 5 - m->inplace_residual); + assert(regions.size() == 5 - m->inplace_residual); GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + + GenericTensorAccessorW residual_output, output; + GenericTensorAccessorR weight; + if (m->inplace_residual) { + // residual_output is mapped to the same region as the input + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + } else { + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[3], + task->regions[3], + 
FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); + } + + inference_kernel_wrapper( + m, bc, input1, input2, weight, residual_output, output); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, shard_id, bc, {input1, input2}, {weight}, {residual_output, output}); + if (m->inplace_residual) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input2}, {weight}, {residual_output, output}); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input1, input2}, + {weight}, + {residual_output, output}); + } } } @@ -459,6 +561,7 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -479,6 +582,8 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + int inplace_residual; + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -487,13 +592,285 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } void ResidualRMSNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): RMS output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): residual output / RMS input + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): residual input grad 0 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I/O): residual input grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(3, FID_DATA); + // regions[4](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + // regions[5](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(5, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I): 
Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void ResidualRMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 6); + assert(regions.size() == 6); + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_output_rms_input = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input0_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input1_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, + output_grad, + residual_output_rms_input, + residual_input0_grad, + residual_input1_grad, + weight, + weight_grad); } + +Legion::FutureMap + ResidualRMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int fid = 0; + // residual input grad 0 + launcher.add_region_requirement(RegionRequirement( + batch_inputs[0]->part_grad, + 0 /*projection id*/, + inplace_residual && !reset_input_grads[0] ? READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // residual input grad 1 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual && !reset_input_grads[0]) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + } + // RMS output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(fid++, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I/O): Residual input 0 grad + regions[2](I/O): Residual input 1 grad + regions[3](I): weight +*/ +void ResidualRMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + int expected_regions = + (m->inplace_residual || m->reset_input_grads[0]) ? 4 : 5; + assert(task->regions.size() == expected_regions); + assert(regions.size() == expected_regions); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + int rid = 0, t_rid = 0; + GenericTensorAccessorW input_grad_0 = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad_1 = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR output_grad_0; + if (!m->reset_input_grads[0]) { + if (m->inplace_residual) { + // mapped to input 0 + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + } + } + GenericTensorAccessorR output_grad_1 = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + + peft_bwd_kernel_wrapper( + m, bc, output_grad_0, output_grad_1, input_grad_0, input_grad_1, weight); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + if (!m->reset_input_grads[0]) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_0, output_grad_1}, + false); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_1}, + false); + } + } +} + Op *ResidualRMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { @@ -516,6 +893,7 @@ size_t hash::operator()( hash_combine(key, params.eps); hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff 
--git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 79dce65c57..8dadd7dcc3 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -53,7 +53,7 @@ RMSNormParams RMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -422,7 +422,7 @@ void RMSNorm::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, weight, output); + inference_kernel_wrapper(m, bc, input, weight, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -431,6 +431,166 @@ void RMSNorm::inference_task(Task const *task, } } +void RMSNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): input + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + // regions[4](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(4, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I): input + regions[2](I/O): input_grad + regions[3](I): weight + regions[4](I/O): weight_grad +*/ +void RMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 5); + assert(regions.size() == 5); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], 
regions[4], task->regions[4], FID_DATA, ctx, runtime); + backward_kernel_wrapper( + m, output_grad, input, input_grad, weight, weight_grad); +} + +Legion::FutureMap + RMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // regions[0](I): output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + // regions[2](I): weight + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): weight +*/ +void RMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 3); + assert(regions.size() == 3); + RMSNormMeta *m = *((RMSNormMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + RMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } +} + void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); @@ -474,11 +634,9 @@ Op *RMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { RMSNormParams params = get_params(); - return new RMSNorm(ff, params, inputs[0], true, this->name); + return new RMSNorm(ff, params, inputs[0], true, params.name); } -void RMSNorm::backward(FFModel const &ff) {} - bool RMSNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index b38c68843b..0358a2cd31 100644 --- a/src/ops/sampling.cc +++ 
b/src/ops/sampling.cc @@ -88,7 +88,7 @@ Op *Sampling::create_operator_from_layer( SamplingParams Sampling::get_params() const { SamplingParams params; params.top_p = this->top_p; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -302,7 +302,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); Sampling::forward_kernel_wrapper(m, input, indices, batch_size); if (m->inference_debugging) { @@ -313,7 +313,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3d1c8d9094..e7c2fea19c 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -52,7 +52,7 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -254,7 +254,188 @@ void SigmoidSiluMulti::forward(FFModel const &ff) { } void SigmoidSiluMulti::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(SIGMOID_SILU_MULTI_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // output grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(2, FID_DATA); + // input 1 grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(3, FID_DATA); + // input 2 grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(4, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[1](I): input 1 + regions[2](I): input 2 + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 5); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], 
FID_DATA, ctx, runtime); + GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::backward_kernel_wrapper( + m, output_grad, input1, input2, input1_grad, input2_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file( + m, + shard_id, + nullptr, + {output_grad, input1, input2}, + {}, + {input1_grad, input2_grad}); + } +} + +FutureMap + SigmoidSiluMulti::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // output grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + // input 2 grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 3); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::peft_bwd_kernel_wrapper( + m, bc, output_grad, input1_grad, input2_grad); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file(m, + shard_id, + nullptr, + {input1_grad, input2_grad}, + {}, + {output_grad}, + false); + } } FutureMap SigmoidSiluMulti::inference( @@ -347,7 +528,7 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == input2_domain); assert(input1_domain == output_domain); - SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + SigmoidSiluMulti::inference_kernel_wrapper(m, bc, input1, input2, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 7b7f30a288..ceaa1a7788 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -23,7 +23,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } @@ -34,36 +34,56 @@ SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { } } -__device__ __forceinline__ float sigmoid_float(float x) { - return 1.0 / (1.0 + expf(-x)); -} - -__device__ __forceinline__ half sigmoid_half(half x) { - return (half)1.0 / ((half)1.0 + hexp(-x)); -} - -__global__ void SigmoidSiluMultiKernelFloat(int num_elements, - float const *input1_ptr, - float const *input2_ptr, - float *output_ptr) { +template +__global__ void SigmoidSiluMultiKernel(int num_elements, + T const *input1_ptr, + T const *input2_ptr, + T *output_ptr) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = - input1_ptr[i] * sigmoid_float(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + output_ptr[i] = input1_ptr[i] * T(sigmoid_val) * input2_ptr[i]; } } -__global__ void SigmoidSiluMultiKernelHalf(int num_elements, - half const *input1_ptr, - half const *input2_ptr, - half *output_ptr) { +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T 
*input1_grad_ptr, + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = input1_ptr[i] * sigmoid_half(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + } + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); + input1_grad_ptr[i] += T(x1_grad_val); } } /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -81,8 +101,84 @@ void SigmoidSiluMulti::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + 
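// ------------------------------------------------------------------------
// Editor's aside (illustration only, not part of the patch): the PEFT
// activation buffer written here is laid out as [input1 | input2]. Its
// capacity is reserved for 2 * max_peft_tokens * in_dim elements, while each
// step writes num_peft_tokens * in_dim elements per half, with the input2
// half placed immediately after input1 at byte offset input_tensor_size.
// A minimal host-side sketch of that arithmetic; the names mirror the
// surrounding code and are assumptions, not a FlexFlow API:
#include <cstddef>
// Byte offset of the input2 half inside the shared activation buffer.
inline size_t input2_byte_offset(size_t elem_size, size_t num_peft_tokens,
                                 size_t in_dim) {
  return elem_size * num_peft_tokens * in_dim; // == input_tensor_size above
}
// Total bytes reserved up front so the buffer need not grow mid-request.
inline size_t peft_buffer_capacity(size_t elem_size, size_t max_peft_tokens,
                                   size_t in_dim) {
  return 2 * elem_size * max_peft_tokens * in_dim;
}
// ------------------------------------------------------------------------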
checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelFloat), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -92,7 +188,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input2.get_float_ptr(), output.get_float_ptr()); } else if (m->input_type[0] == DT_HALF) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelHalf), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -116,4 +212,159 @@ void SigmoidSiluMulti::inference_kernel_wrapper( } } +/*static*/ +void SigmoidSiluMulti::backward_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1.domain.get_volume() == num_elements); + assert(input2.domain.get_volume() == num_elements); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); + + hipEvent_t 
t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + num_elements, + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + num_elements, + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5a..929d557a17 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -22,7 +22,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } @@ -45,9 +45,44 @@ __global__ void SigmoidSiluMultiKernel(int num_elements, } } +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T *input1_grad_ptr, + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { + CUDA_KERNEL_LOOP(i, num_elements) { + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * 
T(sigmoid_val); + } + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); + input1_grad_ptr[i] += T(x1_grad_val); + } +} + /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -64,6 +99,83 @@ void SigmoidSiluMulti::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { SigmoidSiluMultiKernel<<profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + 
input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; + + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>( + num_elements, + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>( + num_elements, + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 03618423be..a02d88b98b 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -86,7 +86,7 @@ SoftmaxParams Softmax::get_params() const { SoftmaxParams params; params.layer_guid = this->layer_guid; params.dim = this->dim; - if (this->name != nullptr) { + if (strlen(this->name) < 
MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -270,52 +270,12 @@ OpMeta *Softmax::init_task(Task const *task, domain = input_domain; } SoftmaxMeta *m = new SoftmaxMeta(handle, softmax, domain); - m->input_type = softmax->inputs[0]->data_type; - m->output_type = softmax->outputs[0]->data_type; // checkCUDNN(cudnnCreateTensorDescriptor(&m->outputTensor)); std::strcpy(m->op_name, softmax->name); m->layer_guid = softmax->layer_guid; return m; } -FutureMap Softmax::inference(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv - << std::endl; */ - IndexLauncher launcher(SOFTMAX_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); -} - void Softmax::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -354,17 +314,11 @@ void Softmax::forward_task(Task const *task, ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); - } + forward_kernel_wrapper(m, input, output); } void Softmax::backward(FFModel const &ff) { @@ -402,52 +356,69 @@ void Softmax::backward_task(Task const *task, Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - switch (in_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: \ - if (m->output_type == DT_HALF) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_FLOAT) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else { \ - assert(false && "Unsupported data type"); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], 
FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, input_grad, output_grad); } -/* - regions[0](I/O): input_grad - regions[1](I): output_grad -*/ -// Note that the backward task of softmax is actually a no op (i.e., input_grad -// = output_grad) since the upstream cross_entropy_loss function computes -// performs softmax_cross_entropy_loss to avoid intermediate zeros -template -void Softmax::backward_task_with_dim(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Softmax* softmax = (Softmax*) task->args; - SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorW acc_input_grad(regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - TensorAccessorR acc_output_grad( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - // make sure the image indices match! - assert(acc_input_grad.rect == acc_output_grad.rect); - - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); +FutureMap Softmax::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // if this is the last operator, we add the region below in order to copy the + // output to the grad tensor + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + int last_op = ff.operators.size() - 1; + assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_ARG_TOPK || + ff.operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + if (ff.operators[last_op] == this) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); } void Softmax::inference_task(Task const *task, @@ -455,8 +426,8 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3 || regions.size() == 2); + bool is_last_op = 
(regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -465,16 +436,19 @@ void Softmax::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad; + if (is_last_op) { + output_grad = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); } + inference_kernel_wrapper(m, bc, is_last_op, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -483,6 +457,73 @@ void Softmax::inference_task(Task const *task, } } +FutureMap Softmax::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Softmax::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + assert(regions.size() == 2); + assert(task->regions.size() == 2); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Softmax::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } +} + bool Softmax::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_SOFTMAX_DIM: @@ -508,29 +549,35 @@ bool Softmax::measure_operator_cost(Simulator *sim, sim->free_all(); float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorR input_acc(DT_FLOAT, sub_input.get_domain(), input_ptr); assert(input_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorW output_acc( + DT_FLOAT, sub_output.get_domain(), output_ptr); assert(output_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_ptr, output_ptr); }; + forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW input_grad_acc( + DT_FLOAT, sub_input.get_domain(), input_grad_ptr); assert(input_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorW output_grad_acc( + DT_FLOAT, sub_output.get_domain(), output_grad_ptr); assert(output_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, input_grad_acc, output_grad_acc); }; } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 68d3a4c205..52da51fb26 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -850,7 +850,7 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; 
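// ------------------------------------------------------------------------
// Editor's aside (illustration only, not part of the patch): throughout
// these get_params() hunks the old null check on this->name is replaced by
// strlen(this->name) < MAX_OPNAME before the strcpy into the fixed-size
// params.name buffer. That length check is what actually prevents an
// overflow: the name field is an array member, so it is never null. A
// minimal sketch of the guarded copy; MAX_OPNAME's value here is an
// assumption, only its relation to the buffer size matters:
#include <cstring>
constexpr int MAX_OPNAME = 128; // assumed capacity of params.name
struct OpParams {
  char name[MAX_OPNAME];
};
void copy_op_name(OpParams &params, char const *name) {
  // strlen(name) < MAX_OPNAME guarantees strlen(name) + 1 bytes fit,
  // so the strcpy (terminator included) stays inside params.name.
  if (std::strlen(name) < MAX_OPNAME) {
    std::strcpy(params.name, name);
  }
}
// ------------------------------------------------------------------------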
params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b1687d12a2..aebd5e8892 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -141,7 +141,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); @@ -200,15 +200,16 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index a00ea9c95f..4688a8233c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -361,7 +361,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; @@ -471,17 +471,18 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half 
precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -541,20 +542,9 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *A = static_cast
(m->devQKVProjArray) + bc->requestsInfo[i].first_token_offset_in_batch * m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + DT *C = static_cast
(m->qk_prods); - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -854,29 +844,15 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { beam_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - + static_cast( + handler.batch_config_metadata->beamTokenInfo); beam_request_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask)); + static_cast( + handler.batch_config_metadata->beamRequestsInfo); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/ops/split.cc b/src/ops/split.cc index 7c6b631b20..92cfbd49e9 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -50,7 +50,7 @@ SplitParams Split::get_params() const { SplitParams params; params.splits = this->splits; params.legion_axis = this->legion_axis; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 7d30a8aff3..0e88befa68 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -87,7 +87,7 @@ TopKParams TopK::get_params() const { TopKParams params; params.k = this->k; params.sorted = this->sorted; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -226,7 +226,7 @@ OpMeta *TopK::init_task(Task const *task, Runtime *runtime) { TopK *topk = (TopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - TopKMeta *m = new TopKMeta(handle); + TopKMeta *m = new TopKMeta(handle, topk); m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; @@ -474,7 +474,7 @@ bool TopK::measure_operator_cost(Simulator *sim, return false; } - TopKMeta *m = new TopKMeta(sim->handler); + TopKMeta *m = new TopKMeta(sim->handler, this); m->sorted = sorted; // allocate diff --git a/src/ops/topk.cpp b/src/ops/topk.cpp index b6e898b654..303c6e85e9 100644 --- a/src/ops/topk.cpp +++ b/src/ops/topk.cpp @@ -513,6 +513,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, // TODO: missing profiling here } -TopKMeta::TopKMeta(FFHandler handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/topk.cu b/src/ops/topk.cu index cc87ee8a42..cfb2bf6448 100644 --- a/src/ops/topk.cu +++ b/src/ops/topk.cu @@ -509,6 +509,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, } } -TopKMeta::TopKMeta(FFHandler 
handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 7a179c4f7d..bffde477de 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -51,7 +51,7 @@ TransposeParams Transpose::get_params() const { for (int i = 0; i < outputs[0]->num_dims; i++) { params.perm.push_back(this->perm[i]); } - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -193,7 +193,7 @@ OpMeta *Transpose::init_task(Task const *task, Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - TransposeMeta *m = new TransposeMeta(handle); + TransposeMeta *m = new TransposeMeta(handle, transpose); transpose->init_meta(m, in_domain, out_domain); m->profiling = transpose->profiling; m->inference_debugging = transpose->inference_debugging; @@ -320,7 +320,7 @@ bool Transpose::measure_operator_cost(Simulator *sim, return false; } - TransposeMeta *m = sim->transpose_meta; + TransposeMeta *m = new TransposeMeta(sim->handler, this); this->init_meta(m, sub_input.get_domain(), sub_output.get_domain()); sim->free_all(); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index df722a3d51..132a48be40 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -924,7 +924,7 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 26291fb3b4..890d32bc87 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -16,6 +16,8 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/hip_helper.h" #include #include @@ -26,11 +28,333 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace TreeIncMultiHeadAttention { +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +template +__global__ void compute_attention_kernel_fused_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int const max_token_per_batch, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + int num_heads, + int num_requests, + 
BatchConfig::BitMask *causalMask, + bool *request_completed, + int qk_smem_sz) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = + request_infos[batch_config_request_id].num_tokens_in_batch; + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int first_token_idx = 0; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += + request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; + } + + bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; + int q_start = + request_infos[batch_config_request_id].first_token_depth_in_request; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_ + qk_smem_sz); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < qlength; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + + // if (head_idx == 0 && request_idx == 1 && tidx == 0) { + // printf("laod q %d, %d %.10f\n", + // request_idx, + // qi,q_vecs[ki_o][ii].x); + // } + } + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + bool const mask = + prompt_phase ? (qi + q_start < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + + qk_max = mask ? 
qk_max : fmaxf(qk_max, qk); + + // if (head_idx == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, + // %.10f, %d\n", + // request_idx, + // qi, + // ti, + // qk, + // q_vecs[ki_o][0].x, + // k[0].x, + // bitmask.non_tree_cache_size); + // } + qk_smem[ti - first_step] = mask ? 0.0f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + // if (head_idx == 0 && qi == 9 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + bool const mask = + prompt_phase ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + if (ti < tlength) { + bool const mask = + prompt_phase + ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. 
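For reference, the qk_max / exp_sum reduction in the fused tree-verification kernel above follows the standard two-level softmax-reduction pattern: a butterfly reduction inside each warp (via the WARP_SHFL_XOR / WARP_SHFL macros, which map to shuffle intrinsics on both CUDA and HIP), a cross-warp reduction through the small red_smem buffer, and a final broadcast to every thread. A minimal compilable sketch of the max half of that pattern, using the raw CUDA intrinsics and illustrative names rather than the macros above:

#include <cfloat>

// Sketch only: block-wide max reduction as used for softmax normalization.
// Assumes blockDim.x is a multiple of the 32-thread warp size and that
// red_smem holds one float per warp.
__device__ float block_max(float val, float *red_smem) {
  int lane = threadIdx.x % 32;
  int warp = threadIdx.x / 32;
  // 1) Intra-warp butterfly reduction via shuffles.
  for (int mask = 16; mask >= 1; mask /= 2) {
    val = fmaxf(val, __shfl_xor_sync(0xffffffffu, val, mask));
  }
  // 2) Each warp leader publishes its partial max to shared memory.
  if (lane == 0) {
    red_smem[warp] = val;
  }
  __syncthreads();
  // 3) Every warp re-reduces the per-warp maxima (-FLT_MAX pads the unused
  //    lanes, the identity for max), so each warp recovers the block max.
  int num_warps = blockDim.x / 32;
  val = (lane < num_warps) ? red_smem[lane] : -FLT_MAX;
  for (int mask = 16; mask >= 1; mask /= 2) {
    val = fmaxf(val, __shfl_xor_sync(0xffffffffu, val, mask));
  }
  // 4) Broadcast lane 0's value to the whole warp.
  return __shfl_sync(0xffffffffu, val, 0);
}

The exp_sum reduction that follows uses the same structure with fmaxf replaced by addition (block_sum over the second half of red_smem), after which each qk_smem entry is rescaled by 1 / (exp_sum + 1e-6).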
+ if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, + // out.y, + // out.z, + // out.w, + // vi, + // (first_token_idx + qi) * hidden_size + head_idx * + // per_head_size + + // vi); + // } + } + } +} + template __global__ void commit_tokens_kernel( DT const *devQKVProjArray, @@ -45,15 +369,15 @@ __global__ void commit_tokens_kernel( int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - int token_pos = i / (hidden_size * KV_WEIGHT_NUM); + int token_pos = i / (hidden_size); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - size_t val_idx = - token_idx_in_last_batch * 3 * hidden_size + hidden_size + offset; + size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size + + hidden_size + offset; DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; @@ -89,8 +413,9 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length(), + m->num_active_infr_tokens, // number of active tokens in previous batch + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), m->hidden_size); } } @@ -109,12 +434,15 @@ __global__ void update_tree_branch_kv_cache( int total_tokens_in_batch, int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { + + int token_idx = i / (hidden_size); int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch - size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; @@ -127,6 +455,53 @@ __global__ void update_tree_branch_kv_cache( } } +template +__global__ void update_tree_branch_kv_cache_fused( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo *request_infos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_new_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_new_tokens * 
hidden_size) { + + int token_idx = i / hidden_size; + int offset = i % hidden_size; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + + int const req_id = tokenInfos[token_idx].request_index; + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + int const request_token_offset = + request_infos[req_id].first_token_offset_in_batch; + int const first_token_depth = + request_infos[req_id].first_token_depth_in_request; + + // if(i % hidden_size == 0){ + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", + // req_id, token_idx, request_token_offset,(token_idx + first_token_depth + // - request_token_offset), kVal); + // } + kCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = vVal; + } +} + template __global__ void tree_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, @@ -157,13 +532,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = @@ -171,16 +547,20 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(); int vt_block_size = m->vProjSize; int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } + assert(processed_tokens_in_batch == + bc->requestsInfo[i].first_token_offset_in_batch); int last_token_idx_of_the_request = processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; while (processed_tokens_in_batch <= last_token_idx_of_the_request) { @@ -213,7 +593,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch 
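// Note: both KV-cache update kernels above (update_tree_branch_kv_cache and
// the fused variant) assume the same flat cache layout, i.e. one contiguous
// [max_seq_len x hidden_size] slab per request:
//   cache_idx = req_id * (max_seq_len * hidden_size)
//             + token_position_in_request * hidden_size
//             + offset_within_hidden_size
// The source values are read out of devQKVProjArray, where each token stores
// its Q, K and V projections back-to-back; hence the QKV_WEIGHT_NUM stride
// per token and the extra "+ hidden_size" offset to reach K (and one more
// hidden_size to reach V).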
BatchConfig::max_sequence_length(), m->hidden_size); } @@ -335,24 +715,23 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; + m_ = m->vProjSize; + n = num_new_tokens; k = total_tokens_in_request; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens_in_request; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens_in_request; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + // padding) - B = static_cast
(m->valueCache) + i * vt_req_block_size; + A = static_cast
(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast
(m->attn_heads) + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, HIPBLAS_OP_N, HIPBLAS_OP_T, @@ -376,45 +755,44 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, HIPBLAS_GEMM_DEFAULT)); - - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + - processed_tokens_in_batch * m->oProjSize; - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); processed_tokens_in_batch += num_new_tokens; } // Before moving to the next request // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } + // Project to output, save result directly on output tensor + DT alpha = 1.0f, beta = 0.0f; + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = processed_tokens_in_batch; + int lda = k, ldb = k, ldc = m_; + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + DT const *B = static_cast
(m->attn_heads); + DT *C = static_cast
(output_ptr); + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + B, + hipblas_data_type, + ldb, + &beta, + C, + hipblas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -432,7 +810,85 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); +} + +#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_size_in_bytes_tree
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::max_spec_tree_token_num(), \ + THDS_PER_VALUE, \ + THDS_PER_BLOCK, \ + bc, \ + smem_sz); \ + compute_attention_kernel_fused_kernel \ + <<>>( \ + static_cast
(m->devQKVProjArray), \ + static_cast
(m->keyCache), \ + static_cast
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::BatchConfig::max_spec_tree_token_num(), \ + BatchConfig::max_tokens_per_batch(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->num_q_heads, \ + bc->num_active_requests(), \ + m->causalMask, \ + m->request_completed, \ + smem_sz[0]) + +template +void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + + // update the kv cache + // update K-V cache + int num_new_tokens = bc->num_active_tokens(); + int parallelism = m->hidden_size * num_new_tokens; + update_tree_branch_kv_cache_fused<<>>( + static_cast
(m->devQKVProjArray), + static_cast
(m->keyCache), + static_cast
(m->valueCache), + m->token_infos, + m->request_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_new_tokens, + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), + m->hidden_size); + + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + // 0->qk production size, 1->total shared size + int smem_sz[2]; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } } template @@ -461,21 +917,17 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - checkCUDA( - hipMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - hipMemcpyHostToDevice, - stream)); + // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << + // "\n"; + commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -483,12 +935,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
(m->bias_ptr); } - checkCUDA(hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -502,11 +948,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( // m, bc, stream); + // use the new kernel + compute_attention_kernel_fused
( + m, bc, static_cast
(m->attn_heads), stream); + + int processed_tokens_in_batch = bc->num_active_tokens(); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + compute_o_prod_bias(m, + bc, + shard_id, + output_ptr, + weight_ptr, + bias_ptr, + processed_tokens_in_batch, + stream); } } // namespace TreeIncMultiHeadAttention @@ -622,34 +1077,21 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); - if (offload) { - // assert that we have enough reserved work space left - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); - } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); - } + + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + committed_token_infos = + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } checkCUDA(hipStreamSynchronize(stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 50c056c816..86c53d7ea1 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -12,9 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" -#endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" @@ -390,7 +388,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch + m->num_active_infr_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num(), m->hidden_size); @@ -509,17 +507,18 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = @@ -571,7 +570,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), m->hidden_size); } @@ -773,6 +772,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -788,7 +788,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ @@ -896,7 +896,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << @@ -904,9 +904,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -1052,7 +1052,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); @@ -1060,21 +1060,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); committed_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens)); + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 5d38e28903..52c4ec2e28 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::AllReduce; /* Params */ bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { - return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim; + return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim && + std::strcmp(lhs.name, rhs.name) == 0; } bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { @@ -55,7 +56,7 @@ bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { AllReduceParams AllReduce::get_params() const { AllReduceParams params; params.allreduce_legion_dim = this->allreduce_dim; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -110,6 +111,7 @@ OpMeta *AllReduce::init_task(Task const *task, meta->input_type[0] = ar->inputs[0]->data_type; meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, ar->name); return meta; } @@ -146,6 +148,102 @@ void AllReduce::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + 
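// Note: the calls below follow the usual Legion launch recipe used throughout
// this file. Each tensor is attached to the IndexLauncher as a
// RegionRequirement whose privilege declares how the point tasks access it:
//   requirement 0: inputs[0]->region  with READ_ONLY  (data to be reduced),
//   requirement 1: outputs[0]->region with WRITE_ONLY (reduced result).
// add_field(idx, FID_DATA) binds the single data field, and
// execute_index_space() launches one forward task per point in
// outputs[0]->parallel_is.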
launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + void AllReduce::init_inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -224,64 +322,103 @@ FutureMap AllReduce::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } -void AllReduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = outputs[0]->parallel_is; - assert(numOutputs == 1); - assert(numInputs == 1); - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, - outputs[0]->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, 
- EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +/*static*/ +void AllReduce::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + AllReduce::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); + } } -void AllReduce::backward(FFModel const &ff) { +FutureMap AllReduce::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, - inputs[0]->parallel_is, - TaskArgument(NULL, 0), + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - inputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = 
helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + AllReduce::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } } bool AllReduce::measure_operator_cost(Simulator *sim, @@ -318,62 +455,6 @@ bool AllReduce::append_parallel_op_info( return true; } -/*static*/ -void AllReduce::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - inference_kernel_wrapper(m, bc, input, output); -} - -/*static*/ -void AllReduce::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - forward_kernel_wrapper(m, input, output); -} - -void AllReduce::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input_grad.data_type == output_grad.data_type); - backward_kernel_wrapper(m, input_grad, output_grad); -} - }; // namespace FlexFlow namespace std { diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index acc5c414c7..ce9c032350 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Combine; /* Params */ bool operator==(CombineParams const &lhs, CombineParams const &rhs) { return lhs.combine_legion_dim == rhs.combine_legion_dim && - lhs.combine_degree == rhs.combine_degree; + lhs.combine_degree == rhs.combine_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool CombineParams::is_valid(ParallelTensorShape const &input) const { @@ -58,7 +59,7 @@ CombineParams Combine::get_params() const { CombineParams params; params.combine_legion_dim = this->combine_dim; params.combine_degree = this->combine_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ 
-102,10 +103,11 @@ OpMeta *Combine::init_task(Task const *task, Runtime *runtime) { Combine *cmb = (Combine *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - CombineMeta *m = new CombineMeta(handle); + CombineMeta *m = new CombineMeta(handle, cmb); m->input_type[0] = cmb->inputs[0]->data_type; m->output_type[0] = cmb->outputs[0]->data_type; assert(m->input_type[0] == m->output_type[0]); + std::strcpy(m->op_name, cmb->name); return m; } @@ -202,12 +204,23 @@ void Combine::create_input_partition_inference( assert(ff.config.computationMode == COMP_MODE_INFERENCE); assert(batch_outputs[0]->part != LogicalPartition::NO_PART); assert(batch_inputs[0]->part != LogicalPartition::NO_PART); - // input_lp is a disjoint partition + // partition batch_inputs[0]->region into inference_input_lps[batch_inputs[0]] + // according to the partitioning of batch_outputs[0] (i.e. make the + // partitioned dimension whole again by combining the partitions) ff.create_disjoint_partition(batch_outputs[0]->num_dims, batch_outputs[0]->dims, batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // partition batch_outputs[0]->region_grad into + // inference_output_grad_lps[batch_outputs[0]] according to the partitioning + // of batch_inputs[0] (i.e. restore the partition in the dimension that was + // combined in the forward pass) + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap Combine::inference(FFModel const &ff, @@ -226,7 +239,7 @@ FutureMap Combine::inference(FFModel const &ff, size_t machine_view_hash = mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(COMBINE_FWD_TASK_ID, + IndexLauncher launcher(COMBINE_INF_TASK_ID, batch_outputs[0]->parallel_is, TaskArgument(nullptr, 0), argmap, @@ -234,6 +247,7 @@ FutureMap Combine::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(inference_input_lps[batch_inputs[0]], 0 /*projection id*/, @@ -278,6 +292,52 @@ void Combine::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Combine::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_inputs[0]->machine_view; + + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(COMBINE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(&data_type, sizeof(DataType)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Combine::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -357,6 +417,37 @@ tl::optional Combine::as_dot() const { return rf; } +/*static*/ +void Combine::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + DataType data_type = m->input_type[0]; + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } + if (data_type == DT_HALF) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_FLOAT) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + forward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Combine forward"); + } +} + /*static*/ void Combine::forward_task(Task const *task, std::vector const ®ions, @@ -400,6 +491,56 @@ void Combine::forward_task_with_type(Task const *task, forward_kernel
(input_ptr, output_ptr, output_domain.get_volume()); } +void Combine::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + // TODO: figure out why m->output_type[0] or m->input_type[0] are not working + DataType data_type = *((DataType *)task->args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + data_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + assert(input_grad.data_type == data_type); + assert(output_grad.domain == input_grad.domain); + CombineMeta const *m = *((CombineMeta **)task->local_args); + int shard_id = task->index_point.point_data[0]; + if (shard_id == 0 && m->inference_debugging) { + // m is null when shard_id > 0 for some reason + std::cout << "BWD " << m->op_name << std::endl; + } + if (data_type == DT_HALF) { + backward_kernel(output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_FLOAT) { + backward_kernel(output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_DOUBLE) { + backward_kernel(output_grad.get_double_ptr(), + input_grad.get_double_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT32) { + backward_kernel(output_grad.get_int32_ptr(), + input_grad.get_int32_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT64) { + backward_kernel(output_grad.get_int64_ptr(), + input_grad.get_int64_ptr(), + output_grad.domain.get_volume()); + } else { + assert(false && "Unsupported data type in Combine backward"); + } +} + void Combine::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/parallel_ops/fused_parallel_op.cc b/src/parallel_ops/fused_parallel_op.cc index 1a76cbfc40..dec7b20fb2 100644 --- a/src/parallel_ops/fused_parallel_op.cc +++ b/src/parallel_ops/fused_parallel_op.cc @@ -59,7 +59,7 @@ FusedParallelOpParams FusedParallelOp::get_params() const { std::vector ops(std::begin(this->parallel_ops), std::end(this->parallel_ops)); params.parallel_ops = ops; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d7e20e395..7067035465 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -20,26 +20,23 @@ namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size 
= input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -49,19 +46,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, @@ -71,10 +76,29 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } } // namespace AllReduce diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 2c000137a1..3041f9adf9 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -13,32 +13,30 @@ * limitations under the License. 
*/ +#include "flexflow/ffconst_utils.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -48,18 +46,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, @@ -69,10 +76,23 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(output_grad.data_type); + checkCUDA(cudaMemcpyAsync(input_grad.ptr, + output_grad.ptr, + hidden_dim_size * num_elements * data_size, + cudaMemcpyDeviceToDevice, + stream)); } } // namespace AllReduce diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index d6e9568223..2a29be1ad4 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/combine_kernels.h" +#include "flexflow/parallel_ops/combine.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { 
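Stepping back to the allreduce kernel wrappers above: after this change the inference path sizes the NCCL all-reduce by the number of tokens actually active in the batch (num_active_tokens() * hidden_dim_size) rather than by the full tensor volume, so padded rows are not reduced. A minimal sketch of that pattern, with placeholder names rather than the FlexFlow accessor types:

#include <cuda_runtime.h>
#include <nccl.h>

// Sketch only: sum-reduce just the active rows of an activation tensor across
// data-parallel ranks; comm and stream are assumed to be set up elsewhere.
void allreduce_active_tokens(float const *input,
                             float *output,
                             int num_active_tokens,
                             int hidden_dim,
                             ncclComm_t comm,
                             cudaStream_t stream) {
  size_t count = (size_t)num_active_tokens * (size_t)hidden_dim;
  // Every rank ends up with the element-wise sum of all ranks' inputs.
  ncclAllReduce(input, output, count, ncclFloat, ncclSum, comm, stream);
}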
-CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index 1ab79a7944..5809e2d4f3 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -13,12 +13,14 @@ * limitations under the License. */ +#include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/kernels/combine_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cpp b/src/parallel_ops/kernels/parallel_identity_kernels.cpp new file mode 100644 index 0000000000..8378231fb2 --- /dev/null +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cpp @@ -0,0 +1,97 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +ParallelIdentityMeta::ParallelIdentityMeta(FFHandler handle, + ParallelIdentity const *reduct) + : OpMeta(handle, reduct) {} + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t data_size = data_type_size(input.data_type); + // copy input to output + checkCUDA(hipMemcpyAsync(output.ptr, + input.ptr, + input.domain.get_volume() * data_size, + hipMemcpyDeviceToDevice, + stream)); +} + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(input.data_type); + checkCUDA(hipMemcpyAsync(output.ptr, + input.ptr, + hidden_dim_size * num_elements * data_size, + hipMemcpyDeviceToDevice, + stream)); +} + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); +#endif +} + +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cu b/src/parallel_ops/kernels/parallel_identity_kernels.cu new file mode 100644 index 0000000000..6800f3ab16 --- /dev/null +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cu @@ -0,0 +1,96 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +ParallelIdentityMeta::ParallelIdentityMeta(FFHandler handle, + ParallelIdentity const *reduct) + : OpMeta(handle, reduct) {} + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t data_size = data_type_size(input.data_type); + // copy input to output + checkCUDA(cudaMemcpyAsync(output.ptr, + input.ptr, + input.domain.get_volume() * data_size, + cudaMemcpyDeviceToDevice, + stream)); +} + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(input.data_type); + checkCUDA(cudaMemcpyAsync(output.ptr, + input.ptr, + hidden_dim_size * num_elements * data_size, + cudaMemcpyDeviceToDevice, + stream)); +} + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); +#endif +} + +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/partition_kernels.cpp b/src/parallel_ops/kernels/partition_kernels.cpp index cfd76c0f18..bd1c96d4c7 100644 --- a/src/parallel_ops/kernels/partition_kernels.cpp +++ b/src/parallel_ops/kernels/partition_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/partition_kernels.cu b/src/parallel_ops/kernels/partition_kernels.cu index 08008f1035..3a39b39fe4 100644 --- a/src/parallel_ops/kernels/partition_kernels.cu +++ b/src/parallel_ops/kernels/partition_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 2a3fe5cca1..1f3e8e0962 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 34ae8007da..df7630976b 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..f49e0d4eb0 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace 
Replicate { diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..0b5c434aa6 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace Replicate { diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc new file mode 100644 index 0000000000..883910ae09 --- /dev/null +++ b/src/parallel_ops/parallel_identity.cc @@ -0,0 +1,474 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/parallel_identity.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::ParallelIdentity; + +/* Params */ +bool operator==(ParallelIdentityParams const &lhs, + ParallelIdentityParams const &rhs) { + return lhs.parallel_identity_legion_dim == rhs.parallel_identity_legion_dim && + std::strcmp(lhs.name, rhs.name) == 0; +} + +bool ParallelIdentityParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +ParallelIdentityParams ParallelIdentity::get_params() const { + ParallelIdentityParams params; + params.parallel_identity_legion_dim = this->parallel_identity_dim; + if (strlen(this->name) < MAX_OPNAME) { + strcpy(params.name, this->name); + } + return params; +} + +ParallelIdentity::ParallelIdentity(FFModel &model, + const ParallelTensor _input, + int _parallel_identity_legion_dim, + char const *name) + : ParallelOp(model, OP_PARALLEL_IDENTITY, name, _input), + parallel_identity_dim(_parallel_identity_legion_dim) { + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + assert(dims[parallel_identity_dim].degree > 1); + // ParallelTensorBase::update_parallel_ids(numdim, dims); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this); +} + +ParallelIdentity::ParallelIdentity(FFModel &model, + ParallelIdentityParams const ¶ms, + ParallelTensor const input, + char const *name) + : ParallelIdentity( 
+ model, input, params.parallel_identity_legion_dim, params.name) {} + +void ParallelIdentity::create_input_partition(FFModel &ff) { + // Do nothing + return; +} + +void ParallelIdentity::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + // Do nothing + return; +} + +OpMeta *ParallelIdentity::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ParallelIdentity *ar = (ParallelIdentity *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + ParallelIdentityMeta *meta = new ParallelIdentityMeta(handle, ar); + meta->input_type[0] = ar->inputs[0]->data_type; + meta->output_type[0] = ar->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, ar->name); + return meta; +} + +void ParallelIdentity::init(FFModel const &ff) { + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(PARALLEL_IDENTITY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ParallelIdentity)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void ParallelIdentity::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(PARALLEL_IDENTITY_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta const *m = *((ParallelIdentityMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == 
output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void ParallelIdentity::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(PARALLEL_IDENTITY_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void ParallelIdentity::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + ParallelIdentityMeta const *m = *((ParallelIdentityMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + +void ParallelIdentity::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + parallel_is = batch_outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ParallelIdentity)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +FutureMap ParallelIdentity::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_INF_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta *m = *((ParallelIdentityMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ParallelIdentity::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); + } +} + +FutureMap + ParallelIdentity::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta *m = *((ParallelIdentityMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ParallelIdentity::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } +} + +bool ParallelIdentity::measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const { + cost_metrics = CostMetrics(); + cost_metrics.forward_time = 0.0f; + cost_metrics.backward_time = 0.0f; + + cost_metrics.sync_time = 0; + cost_metrics.inputs_memory = 0; + cost_metrics.outputs_memory = 0; + cost_metrics.weights_memory = 0; + return true; +} + +bool ParallelIdentity::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_PARALLEL_IDENTITY_DIM: + *value = parallel_identity_dim; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool ParallelIdentity::append_parallel_op_info( + std::vector ¶llel_ops) const { + ParallelOpInfo ret; + ret.op_type = op_type; + ret.parallel_dim = parallel_identity_dim; + ret.parallel_degree = -1; // ParallelIdentity does not affect parallel degree + parallel_ops.push_back(ret); + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::ParallelIdentityParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.parallel_identity_legion_dim); + return key; +} + +} // namespace std diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index e6ab09d088..fddf739599 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Repartition; /* Params */ bool operator==(RepartitionParams const &lhs, RepartitionParams const &rhs) { return lhs.repartition_legion_dim == rhs.repartition_legion_dim && - lhs.repartition_degree == rhs.repartition_degree; + lhs.repartition_degree == 
rhs.repartition_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool RepartitionParams::is_valid(ParallelTensorShape const &input) const { @@ -60,7 +61,7 @@ RepartitionParams Repartition::get_params() const { RepartitionParams params; params.repartition_legion_dim = this->repartition_dim; params.repartition_degree = this->repartition_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -200,6 +201,11 @@ void Repartition::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5ca2b1301c..7306e04334 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::Reduction; /* Params */ bool operator==(ReductionParams const &lhs, ReductionParams const &rhs) { return lhs.reduction_legion_dim == rhs.reduction_legion_dim && - lhs.reduction_degree == rhs.reduction_degree; + lhs.reduction_degree == rhs.reduction_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool ReductionParams::is_valid(ParallelTensorShape const &input) const { @@ -56,7 +57,7 @@ ReductionParams Reduction::get_params() const { ReductionParams params; params.reduction_legion_dim = this->reduction_dim; params.reduction_degree = this->reduction_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -125,6 +126,13 @@ void Reduction::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is an aliased partitioning along the replica dim + ff.create_aliased_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + reduction_dim, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Reduction::init_task(Task const *task, @@ -137,6 +145,7 @@ OpMeta *Reduction::init_task(Task const *task, meta->input_type[0] = reduct->inputs[0]->data_type; meta->output_type[0] = reduct->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, reduct->name); return meta; } @@ -372,6 +381,10 @@ void Reduction::forward_task(Task const *task, GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } + assert(input.data_type == output.data_type); if (input.data_type == DT_HALF) { forward_kernel(input.get_half_ptr(), diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index ba7bb6677f..38215fc903 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Replicate; /* Params */ bool operator==(ReplicateParams const &lhs, ReplicateParams const &rhs) { return lhs.replicate_legion_dim == rhs.replicate_legion_dim && - lhs.replicate_degree == rhs.replicate_degree; + lhs.replicate_degree == rhs.replicate_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool ReplicateParams::is_valid(ParallelTensorShape 
const &input) const { @@ -55,7 +56,7 @@ ReplicateParams Replicate::get_params() const { ReplicateParams params; params.replicate_legion_dim = this->replicate_dim; params.replicate_degree = this->replicate_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -125,6 +126,12 @@ void Replicate::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is a disjoint partition + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Replicate::init_task(Task const *task, @@ -137,6 +144,7 @@ OpMeta *Replicate::init_task(Task const *task, meta->input_type[0] = repl->inputs[0]->data_type; meta->output_type[0] = repl->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, repl->name); return meta; } @@ -276,6 +284,51 @@ void Replicate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Replicate::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_inputs[0]->machine_view; + + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(REPLICATE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Replicate::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -350,6 +403,9 @@ void Replicate::forward_task(Task const *task, assert(task->regions.size() == 2); ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -381,6 +437,37 @@ void Replicate::forward_task(Task const *task, } } +void Replicate::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // Currently only support the outter most dimension + for (int i = 0; i < output_grad_domain.get_dim() - 1; i++) { + assert(output_grad_domain.lo()[i] == input_grad_domain.lo()[i]); + assert(output_grad_domain.hi()[i] == input_grad_domain.hi()[i]); + } + size_t num_elements = input_grad_domain.get_volume(); + size_t num_replicas = output_grad_domain.get_volume() / num_elements; + float const *output_grad_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *input_grad_ptr = helperGetTensorPointerRW( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + if (m->inference_debugging) { + std::cout << "BWD " << m->op_name << std::endl; + } + + backward_kernel( + output_grad_ptr, input_grad_ptr, num_elements, num_replicas); +} + void Replicate::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 7989b0799e..4c339750c7 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -25,7 +25,35 @@ Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { +void set_optimizer_tasks(OptimizerTasks &tasks, + int max_training_steps, + int completed_training_steps, + int gradient_accumulation_steps) { + assert(max_training_steps > 0); + assert(completed_training_steps >= 0); + assert(gradient_accumulation_steps > 0); + assert(completed_training_steps < max_training_steps); + // Compute gradients should always be true + tasks.compute_gradients = true; + + // Reset gradients to zero in the first iteration and after weight updates + tasks.reset_gradients_to_zero = + (completed_training_steps == 0) || + 
(completed_training_steps % gradient_accumulation_steps == 0); + + // Update weights every gradient_accumulation_steps + tasks.update_weights = + ((completed_training_steps + 1) % gradient_accumulation_steps == 0); + + // Save updated weights only in the very last training step + tasks.save_updated_weights = + (completed_training_steps == max_training_steps - 1); + if (tasks.save_updated_weights) { + assert(tasks.update_weights); + } +} + +BatchConfig::BatchConfig() : num_tokens(0), num_peft_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].first_token_offset_in_batch = 0; @@ -74,6 +102,14 @@ int BatchConfig::num_active_tokens() const { return num_tokens; } +int BatchConfig::num_active_infr_tokens() const { + return num_tokens; +} + +int BatchConfig::num_active_peft_tokens() const { + return num_peft_tokens; +} + /*static*/ int BatchConfig::max_requests_per_batch() { return RequestManager::get_request_manager()->get_max_requests_per_batch(); @@ -107,8 +143,13 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; os << "Max sequence length: " << bc.max_sequence_length() << std::endl; // Current values - os << "Number of tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of active tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of inference tokens: " << bc.num_active_infr_tokens() + << std::endl; + os << "Number of peft tokens: " << bc.num_active_peft_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Number of generation tokens: " << bc.num_generation_tokens + << std::endl; // Per-request info os << "Per-request info:\n"; @@ -121,9 +162,27 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " BatchConfig Req ID: " + << bc.requestsInfo[i].batch_config_request_id << std::endl; + os << " Prompt phase: " << bc.requestsInfo[i].prompt_phase + << std::endl; + os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " optimizer_tasks: {" + << "compute_gradients: " << std::boolalpha + << bc.requestsInfo[i].optimizer_tasks.compute_gradients + << ", reset_gradients_to_zero: " + << bc.requestsInfo[i].optimizer_tasks.reset_gradients_to_zero + << ", update_weights: " + << bc.requestsInfo[i].optimizer_tasks.update_weights + << ", save_updated_weights: " + << bc.requestsInfo[i].optimizer_tasks.save_updated_weights << "}" + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; } diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 0509c23afe..b10f8e82ab 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -137,6 +137,10 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " Number of tokens in batch: " << 
bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 57bc5a0458..386a0c940b 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -36,7 +36,8 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } @@ -271,18 +272,10 @@ __host__ void print_beam_tensor(T const *ptr, template <> __host__ void save_tensor(float const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - float *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(float) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(float) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + float *host_ptr = (float *)calloc(num_elements, sizeof(float)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(float) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -293,26 +286,17 @@ __host__ void fprintf(tensor_file, "%.9f", host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(half const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - half *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(half) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(half) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + half *host_ptr = (half *)calloc(num_elements, sizeof(half)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(half) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -323,27 +307,18 @@ __host__ void fprintf(tensor_file, "%.9f", (float)host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(int32_t const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int32_t *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int32_t) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(int32_t) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + int32_t *host_ptr = (int32_t *)calloc(num_elements, sizeof(int32_t)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(int32_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -354,27 +329,18 @@ __host__ void save_tensor(int32_t const *ptr, fprintf(tensor_file, "%d", host_ptr[i]); 
} } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(int64_t const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int64_t *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int64_t) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(int64_t) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + int64_t *host_ptr = (int64_t *)calloc(num_elements, sizeof(int64_t)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(int64_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -385,13 +351,12 @@ __host__ void save_tensor(int64_t const *ptr, fprintf(tensor_file, "%ld", host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -404,14 +369,25 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(cudaMemcpyAsync( dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); - return true; } + +template +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(cudaMemcpyAsync( + dst, src, sizeof(T) * num_elements, cudaMemcpyHostToDevice, stream)); +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; @@ -609,6 +585,48 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { return CUDNN_DATA_FLOAT; } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + cudaPointerAttributes attributes; + cudaError_t cudaStatus = + cudaPointerGetAttributes(&attributes, maybe_devicePtr); + + if (cudaStatus == cudaSuccess) { + // Check attributes and perform actions accordingly + if (attributes.type == cudaMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.type == cudaMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.type == cudaMemoryTypeUnregistered) { + printf("Pointer is unregistered.\n"); + } else if (attributes.type == cudaMemoryTypeManaged) { + printf("Pointer is managed.\n"); + } else { + printf("Pointer is not allocated in recognized memory type.\n"); + } + } else { + fprintf(stderr, + "cudaPointerGetAttributes failed: %s\n", + cudaGetErrorString(cudaStatus)); + } +} + +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? 
"yes" : "no"), + (aligned16 ? "yes" : "no")); +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void @@ -620,6 +638,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void @@ -716,26 +741,43 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, - size_t num_elements); -template __host__ half *download_tensor(half const *ptr, - size_t num_elements); -template __host__ double *download_tensor(double const *ptr, - size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool - download_tensor(half const *ptr, half *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, - double *dst, - size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, - int32_t *dst, - size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, - int64_t *dst, - size_t num_elements); +template __host__ float *copy_tensor_dev_to_host(float const *ptr, + size_t num_elements); +template __host__ half *copy_tensor_dev_to_host(half const *ptr, + size_t num_elements); +template __host__ double *copy_tensor_dev_to_host(double const *ptr, + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, + double *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..5a7d98b4dc 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -188,6 +188,9 @@ std::string get_operator_type_name(OperatorType type) { return "Sampling"; case OP_ARGMAX: return 
"ArgMax"; + // PEFT Ops + case OP_LORA: + return "Lora Layer"; // Parallel Ops case OP_REPARTITION: return "Repartition"; @@ -199,6 +202,8 @@ std::string get_operator_type_name(OperatorType type) { return "Reduction"; case OP_ALLREDUCE: return "AllReduce"; + case OP_PARALLEL_IDENTITY: + return "ParallelIdentity"; case OP_PIPELINE: return "Pipeline"; case OP_FUSED_PARALLEL: diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 819e6527e5..8213726e8a 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -30,4 +30,29 @@ bool operator==(LayerID const &lhs, LayerID const &rhs) { return lhs.id == rhs.id; } +const PEFTModelID PEFTModelID::NO_ID = PEFTModelID(); + +PEFTModelID::PEFTModelID() : id(0) {} + +PEFTModelID::PEFTModelID(size_t _id) : id(_id) { + assert(is_valid_id()); +} + +bool PEFTModelID::is_valid_id() const { + return (id >= PEFT_MODEL_ID_FIRST_VALID && id <= PEFT_MODEL_ID_LAST_VALID); +} + +bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return lhs.id == rhs.id; +} + +std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { + if (peft_model_id == PEFTModelID::NO_ID) { + os << "NO_ID"; + } else { + os << peft_model_id.id; + } + return os; +} + }; // namespace FlexFlow diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 43ce9d7005..c373e0da9b 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -136,12 +136,12 @@ void load_attention_bias_v2(DT *ptr, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; + std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file = layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -217,12 +217,10 @@ void load_attention_weights_v2(DT *ptr, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -407,12 +405,10 @@ void load_attention_weights_quantized(char *ptr, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -690,7 +686,7 @@ void 
FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, @@ -734,44 +730,34 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + ? ".attn_bias" + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); @@ -781,7 +767,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? 
".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = @@ -809,6 +795,10 @@ void FileDataLoader::load_weights(FFModel *ff) { if (weight == NULL) { continue; } + // TODO: currently skip Lora layers + if (l->op_type == OP_LORA) { + continue; + } switch (weight->data_type) { case DT_HALF: load_single_weight_tensor(ff, l, i); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index b023aced6e..1a38782e81 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -36,6 +36,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -54,6 +55,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -1992,6 +1994,7 @@ std::pair, std::unordered_map> mv.device_type = MachineView::GPU; mv.ndims = 1; int total_parallel_degree = 1; + assert(op->numOutputs > 0); for (int i = 0; i < op->outputs[0]->num_dims; i++) { total_parallel_degree *= op->outputs[0]->dims[i].degree; } @@ -2434,6 +2437,13 @@ GraphOptimalViewSerialized sez.serialize(allreduce->name, strlen(allreduce->name)); break; } + case OP_PARALLEL_IDENTITY: { + ParallelIdentity *parallel_identity = (ParallelIdentity *)op; + sez.serialize(parallel_identity->parallel_identity_dim); + sez.serialize(strlen(parallel_identity->name)); + sez.serialize(parallel_identity->name, strlen(parallel_identity->name)); + break; + } case OP_FUSED_PARALLEL: { FusedParallelOp *fused = (FusedParallelOp *)op; sez.serialize(fused->num_parallel_ops); @@ -2475,6 +2485,7 @@ namespace FlexFlow { using PCG::Edge; using PCG::Graph; using PCG::GraphCostResult; +using PCG::log_graph; using PCG::Node; void FFModel::register_all_machine_views( @@ -2759,6 +2770,10 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_LORA: { + node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_MULTIHEAD_ATTENTION: { assert(num_inputs == 3); int embed_dim, num_heads, k_dim, v_dim; @@ -3042,8 +3057,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], - {combine_dim, combine_degree}); + CombineParams params; + params.combine_legion_dim = combine_dim; + params.combine_degree = combine_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REPARTITION: { @@ -3055,8 +3073,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node( - inputs[0], {repartition_dim, repartition_degree}); + RepartitionParams params; + params.repartition_legion_dim = repartition_dim; + params.repartition_degree = repartition_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REPLICATE: { @@ -3068,8 +3089,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, 
name_len); - node = get_or_create_node(inputs[0], - {replicate_dim, replicate_degree}); + ReplicateParams params; + params.replicate_legion_dim = replicate_dim; + params.replicate_degree = replicate_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REDUCTION: { @@ -3081,8 +3105,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], - {reduction_dim, reduction_degree}); + ReductionParams params; + params.reduction_legion_dim = reduction_dim; + params.reduction_degree = reduction_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_ALLREDUCE: { @@ -3093,24 +3120,43 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], {allreduce_dim}); + AllReduceParams params; + params.allreduce_legion_dim = allreduce_dim; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(num_inputs == 1); + int parallel_identity_dim; + dez.deserialize(parallel_identity_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + ParallelIdentityParams params; + params.parallel_identity_legion_dim = parallel_identity_dim; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); - std::vector parallel_ops; + FusedParallelOpParams params; int num_parallel_ops; dez.deserialize(num_parallel_ops); for (int i = 0; i < num_parallel_ops; i++) { ParallelOpInfo info; dez.deserialize(info); - parallel_ops.push_back(info); + params.parallel_ops.push_back(info); } size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], {parallel_ops}); + strcpy(params.name, name); + + node = get_or_create_node(inputs[0], params); break; } default: { @@ -3149,20 +3195,20 @@ void FFModel::deserialize_graph_optimal_view( optimal_views[guid_to_nodes[guid]] = view; } assert(dez.get_remaining_bytes() == 0); - printf("Deserialized Views...\n"); + log_graph.debug("Deserialized Views...\n"); for (auto const &it : optimal_views) { - printf("node[%zu]: type(%s) view(%d %d %d) ", - it.first.guid, - it.first.to_string().c_str(), - it.second.ndims, - it.second.dim[0], - it.second.start_device_id); + log_graph.debug("node[%zu]: type(%s) view(%d %d %d) ", + it.first.guid, + it.first.to_string().c_str(), + it.second.ndims, + it.second.dim[0], + it.second.start_device_id); auto const &list = graph->inEdges.at(it.first); for (auto const &it2 : list) { Edge e = it2; - printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); + log_graph.debug(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); } - printf("\n"); + log_graph.debug("\n"); } } diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 613df1cbcf..057be8f443 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -29,7 +29,8 @@ hipError_t get_legion_stream(hipStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } 
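// A minimal usage sketch for the now-templated scale_kernel above (an
// illustration, not part of this patch). It assumes the GET_BLOCKS /
// CUDA_NUM_THREADS launch helpers and the checkCUDA / get_legion_stream
// utilities declared in flexflow/utils/hip_helper.h:
//
//   hipStream_t stream;
//   checkCUDA(get_legion_stream(&stream));
//   // Map n half-precision values in-place from [0, 1) onto [a, b).
//   hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel<half>),
//                      GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream,
//                      ptr, n, (half)a, (half)b);
//
// The explicit template instantiations for half/float/double added later in
// this file are what keep such call sites linking while the kernel definition
// remains in this translation unit.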
@@ -55,6 +56,14 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void + copy_kernel_discrete(DT *dst, const DT *src, coord_t size, size_t *index) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[index[i]]; + } +} + template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { CUDA_KERNEL_LOOP(i, n) { @@ -224,22 +233,24 @@ __host__ void updateGAS(float *para_ptr, } template -__host__ void - print_tensor(T const *ptr, size_t num_elements, char const *prefix) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); T *host_ptr; - checkCUDA(hipHostMalloc((void **)&host_ptr, + checkCUDA(hipHostMalloc(&host_ptr, sizeof(T) * num_elements, hipHostMallocPortable | hipHostMallocMapped)); - checkCUDA(hipMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost)); - // checkCUDA(hipDeviceSynchronize()); + checkCUDA(hipMemcpyAsync( + host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); + checkCUDA(hipDeviceSynchronize()); int idx = 0; - printf("%s", prefix); + printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); - if (idx >= 16) { + printf(" %.20lf", (float)host_ptr[idx]); + if (idx >= 100) { break; } } @@ -247,6 +258,40 @@ __host__ void checkCUDA(hipHostFree(host_ptr)); } +template +__host__ void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + T *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(T) * channel * skip, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(T) * channel * skip, + hipMemcpyDeviceToHost, + stream)); + // checkCUDA(hipDeviceSynchronize()); + int idx = 0; + printf("%s", prefix); + + for (int i = 0; i < channel; i += 1) { + for (idx = 0; idx < num_elements; idx++) { + printf(" %.20lf", (float)host_ptr[idx + i * skip]); + if (idx >= 100) { + break; + } + } + printf("\n-----***********------\n"); + } + + checkCUDA(hipHostFree(host_ptr)); +} + template <> __host__ void save_tensor(float const *ptr, size_t num_elements, char const *file_name) { @@ -370,9 +415,7 @@ __host__ void save_tensor(int64_t const *ptr, } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -381,21 +424,27 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { hipHostMallocPortable | hipHostMallocMapped)); checkCUDA(hipMemcpyAsync( host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); return host_ptr; } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(hipMemcpyAsync( dst, ptr, sizeof(T) * num_elements, 
hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); - return true; +} + +template +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(hipMemcpyAsync( + dst, src, sizeof(T) * num_elements, hipMemcpyHostToDevice, stream)); } miopenStatus_t cudnnSetTensorDescriptorFromDomain( @@ -450,22 +499,23 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain( return miopenStatusBadParm; } -miopenStatus_t - cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, - Domain domain) { +miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + miopenTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); switch (domain.get_dim()) { case 1: { Rect<1> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; - return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + return miopenSet4dTensorDescriptor( + tensor, cudnn_data_type, dims[0], 1, 1, 1); } case 2: { Rect<2> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; dims[1] = rect.hi[1] - rect.lo[1] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[1], dims[0], 1, 1); } case 3: { Rect<3> rect = domain; @@ -473,7 +523,7 @@ miopenStatus_t dims[1] = rect.hi[1] - rect.lo[1] + 1; dims[2] = rect.hi[2] - rect.lo[2] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[2] * dims[1], dims[0], 1, 1); } case 4: { Rect<4> rect = domain; @@ -482,7 +532,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[3] * dims[2] * dims[1], dims[0], 1, 1); } case 5: { Rect<5> rect = domain; @@ -493,7 +543,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + tensor, cudnn_data_type, dims[3], dims[2], dims[1], dims[0]); } default: assert(false && "Unsupported dim number"); @@ -553,6 +603,49 @@ void handle_unimplemented_hip_kernel(OperatorType op_type) { throw std::runtime_error("Unimplemented hip kernel for Operator: " + FlexFlow::get_operator_type_name(op_type)); } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + hipPointerAttribute_t attributes; + hipError_t hipStatus = hipPointerGetAttributes(&attributes, maybe_devicePtr); + + if (hipStatus == hipSuccess) { + // Check attributes and perform actions accordingly + if (attributes.memoryType == hipMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.memoryType == hipMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.memoryType == hipMemoryTypeArray) { + printf("Pointer points to array memory, physically located on device.\n"); + } else if (attributes.memoryType == hipMemoryTypeManaged) { + printf("Pointer points to managed memory, automatically managed by the " + "unified memory system.\n"); + } else if (attributes.memoryType == hipMemoryTypeUnified) { + printf("Pointer points to unified memory (not supported currently) \n"); + } else { + printf("Pointer is not allocated in a 
recognized memory type.\n"); + } + } else { + fprintf(stderr, + "hipPointerGetAttributes failed: %s\n", + hipGetErrorString(hipStatus)); + } +} + +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? "yes" : "no"), + (aligned16 ? "yes" : "no")); +} template __global__ void assign_kernel(half *ptr, coord_t size, half value); @@ -565,6 +658,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void @@ -587,6 +687,15 @@ template __global__ void template __global__ void copy_kernel(int64_t *dst, int64_t const *src, coord_t size); +template __global__ void copy_kernel_discrete(float *dst, + float const *src, + coord_t size, + size_t *index); +template __global__ void copy_kernel_discrete(int64_t *dst, + int64_t const *src, + coord_t size, + size_t *index); + template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, size_t size, @@ -604,16 +713,42 @@ template __global__ void apply_add_with_scale(int64_t *data_ptr, size_t size, int64_t scale); -template __host__ void - print_tensor(float const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(double const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int32_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int64_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(half const *ptr, size_t rect, char const *prefix); +template __host__ void print_tensor(float const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(double const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int32_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int64_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(half const *ptr, + size_t rect, + char const *prefix, + int shard_id); + +template __host__ void print_beam_tensor(float const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int32_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int64_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template __host__ void save_tensor(float const *ptr, size_t rect, char const *file_name); @@ -626,24 +761,43 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, - size_t num_elements); -template __host__ 
half *download_tensor(half const *ptr, - size_t num_elements); -template __host__ double *download_tensor(double const *ptr, - size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, - double *dst, - size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, - int32_t *dst, - size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, - int64_t *dst, - size_t num_elements); +template __host__ float *copy_tensor_dev_to_host(float const *ptr, + size_t num_elements); +template __host__ half *copy_tensor_dev_to_host(half const *ptr, + size_t num_elements); +template __host__ double *copy_tensor_dev_to_host(double const *ptr, + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, + double *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 3d299aeedd..1b65dfd869 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,10 +54,31 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, } void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + + // Check if the model object exists + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; + } + // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); model->config.batchSize = BatchConfig::max_tokens_per_batch(); + + // Check if the model object exists after importing config + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist after " + "setting config and batch size." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object still exists." 
<< std::endl; + } + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -117,7 +138,28 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); - + // no need to map inplace tensor + // A tensor is inplace if it shares the same region as another tensor + { + bool inplace = false; + for (int j = 0; j < op->numInputs; j++) { + if (op->inputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->inputs[j]]; + inplace = true; + } + } + for (int j = 0; j < i; j++) { + if (op->outputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->outputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->outputs[j]]; + inplace = true; + } + } + if (inplace) { + continue; + } + } if (op->op_type == OP_REPLICATE) { assert(op->numInputs == 1 && op->numOutputs == 1); } @@ -175,7 +217,7 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } } if (!found_parallel_tensor) { - log_offload.print( + log_offload.debug( "Cannot find a previous tensor for operator(%d) output_idx(%d)", op_idx, i); @@ -191,6 +233,13 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { pt_base->region.get_field_space()); pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); + + pt->region_grad = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part_grad = runtime->get_logical_partition( + ctx, pt->region_grad, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; // std::cout << "output mv: " << pt->machine_view << std::endl; Domain part_domain = @@ -205,6 +254,30 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // std::cout << std::endl; } + // Check whether we need to reset input grads + // We use a parallel tensor's region as the key + std::set reset_inputs; + for (int l = model->operators.size() - 1; l >= 0; l--) { + Op *op = model->operators[l]; + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i]->region != LogicalRegion::NO_REGION); + if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { + // We should not reset input grads since other operators have already + // saved gradients into the region + op->reset_input_grads[i] = false; + } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || + op->op_type == OP_RESIDUAL_RMS_NORM || + op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { + if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { + op->reset_input_grads[0] = false; + } + reset_inputs.insert(op->inputs[i]->region); + } else { + reset_inputs.insert(op->inputs[i]->region); + } + } + } + // Perform fusion optimizations if (model->config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); @@ -235,34 +308,35 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { continue; } - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(model->operators[i]->op_type).c_str(), - model->operators[i]->op_guid); + log_inf_mgr.debug( + "operator[%zu]: type(%s) guid(%lu)\n", + i, + 
get_operator_type_name(model->operators[i]->op_type).c_str(), + model->operators[i]->op_guid); for (int j = 0; j < op->numInputs; j++) { assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; - printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tinputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; - printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\toutputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numWeights; j++) { LogicalRegion handle = op->weights[j]->region; - printf("\tweights[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tweights[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } } } @@ -290,9 +364,9 @@ void InferenceManager::init_operators_inference(FFModel *model) { assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); assert(tensor_buffer[op->outputs[i]].size() > batch_index); outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; - if (i > 0) { - assert(outputs[0]->machine_view == outputs[i]->machine_view); - } + // if (i > 0) { + // assert(outputs[0]->machine_view == outputs[i]->machine_view); + // } assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } if (op->is_parallel_op()) { @@ -332,11 +406,12 @@ FutureMap InferenceManager::inference(FFModel *model, FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfigFuture const &bc) { - // log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", + // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) + // num_active_requests(%d)", // bc.get_mode(), - // bc.num_active_tokens(), + // bc.num_active_infr_tokens(), // bc.num_active_requests()); - // assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); + // assert(bc.num_active_infr_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) int batch_index = index % model->config.data_parallelism_degree; @@ -390,6 +465,53 @@ FutureMap InferenceManager::inference(FFModel *model, return fm; }; +void InferenceManager::peft_bwd(FFModel *model, + int index, + BatchConfigFuture const &bc) { + int batch_index = index % model->config.data_parallelism_degree; + FutureMap fm; + bool found_input_operator = false; + int last_op = model->operators.size() - 1; + // Assert that the last operator must be argmax or sampling + assert(model->operators[last_op]->op_type == OP_ARGMAX || + model->operators[last_op]->op_type == OP_ARG_TOPK || + model->operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + for (int o = last_op; o >= 0; o--) { + Op *op = model->operators[o]; + if (op->op_type == 
OP_WEIGHT) { + continue; + } + if (op->op_type == OP_INPUT) { + continue; + } + std::vector inputs(op->numInputs); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > batch_index); + inputs[i] = tensor_buffer[op->inputs[i]][batch_index]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + if (op->op_type == OP_INPUT && + tensor_buffer[op->outputs[i]].size() == 0) { + continue; + } + assert(tensor_buffer[op->outputs[i]].size() > batch_index); + outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + op->peft_bwd(*model, bc, inputs, outputs); + } +}; + void InferenceManager::load_input_tokens_from_batch_config( FFModel *model, BatchConfigFuture const &bc, @@ -509,17 +631,26 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && "FlexFlow Serve requires at least four CPU cores per node, please add " "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); + + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four " + "CPU cores per node." + << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); + // Launch the graph optimize task + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." + << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -535,7 +666,7 @@ void FFModel::compile_inference() { deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -570,6 +701,14 @@ void FFModel::compile_inference() { } } } + + std::cout + << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." + << std::endl; + // Perform inplace optimizations + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." + << std::endl; + loss_op = nullptr; metrics_op = nullptr; // Perform inplace optimizations @@ -609,6 +748,8 @@ void FFModel::compile_inference() { } } + // Output tensor mapping + std::cout << "###PEFT DEBUGGING### Mapping output tensors." << std::endl; for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; @@ -634,11 +775,14 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." 
+ << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) if (operators[l]->op_type == OP_ALLREDUCE || - operators[l]->op_type == OP_FUSED) { + operators[l]->op_type == OP_PARALLEL_IDENTITY || + operators[l]->op_type == OP_LORA || operators[l]->op_type == OP_FUSED) { MachineView view = operators[l]->outputs[0]->machine_view; if (view_hash_to_nccl_comms.find(view.hash()) == view_hash_to_nccl_comms.end()) { @@ -670,6 +814,8 @@ void FFModel::compile_inference() { } } #endif + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." + << std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4c67de1aa9..f46630db3c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -47,6 +47,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -66,6 +67,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -77,6 +79,7 @@ #include #include #include +#include namespace FlexFlow { @@ -135,19 +138,21 @@ Op::Op(FFModel &model, std::string pcname; if (_name == NULL) { pcname = get_operator_type_name(op_type); + pcname = pcname + "_" + std::to_string(op_guid); } else { pcname = std::string(_name); } - pcname = pcname + "_" + std::to_string(op_guid); assert(pcname.length() < MAX_OPNAME); + // std::cout << "Creating operator: " << pcname << std::endl; std::strcpy(name, pcname.c_str()); + // std::cout << "copied name into name var: " << this->name << std::endl; for (int i = 0; i < numInputs; i++) { assert(tensors[i] != NULL); inputs[i] = tensors[i]; } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = nullptr; @@ -191,8 +196,8 @@ Op::Op(FFModel &model, } } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = NULL; @@ -1245,7 +1250,8 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, int idx = 0; \ for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ - if (op_type == OP_ALLREDUCE) { \ + if (op_type == OP_ALLREDUCE || op_type == OP_LORA || \ + op_type == OP_PARALLEL_IDENTITY) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1475,10 +1481,12 @@ bool Op::get_weight_parameter(TNParameter tnp, return true; } +#ifdef DEADCODE OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false), inference_debugging(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { - trainableInputs[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_INPUTS; i++) { input_type[i] = DT_NONE; @@ -1490,9 +1498,17 @@ OpMeta::OpMeta(FFHandler _handle) 
output_type[i] = DT_NONE; } decoding_step = 0; + bwd_step = 0; } +#endif -OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { +OpMeta::OpMeta(FFHandler _handle, Op const *op) + : handle(_handle), profiling(op->profiling), + inference_debugging(op->inference_debugging) { + for (int i = 0; i < op->numInputs; i++) { + trainable_inputs[i] = op->trainable_inputs[i]; + reset_input_grads[i] = op->reset_input_grads[i]; + } for (int i = 0; i < op->numInputs; i++) { input_type[i] = op->inputs[i]->data_type; } @@ -1503,6 +1519,7 @@ OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { output_type[i] = op->outputs[i]->data_type; } decoding_step = 0; + bwd_step = 0; } FFRuntime::FFRuntime(FFConfig &config) { @@ -1520,6 +1537,10 @@ FFRuntime::FFRuntime(FFConfig &config) { info.workSpaceSize = config.workSpaceSize; info.offload_reserve_space_size = config.cpu_offload ? config.offload_reserve_space_size : 0; + info.peft_activation_reserve_space_size = + config.enable_peft ? config.peft_activation_reserve_space_size : 0; + info.peft_weight_reserve_space_size = + config.enable_peft ? config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -1546,9 +1567,32 @@ FFRuntime *ffruntime_singleton = nullptr; int FFModel::model_counter = 0; +void make_debug_dirs() { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = + ff_cache_path ? std::string(ff_cache_path) + "/debug/flexflow" + : std::string("~/.cache/flexflow/debug/flexflow"); + wordexp_t p; + wordexp(debug_dir_.c_str(), &p, 0); + debug_dir_ = p.we_wordv[0]; + wordfree(&p); + fs::path debug_dir = debug_dir_; + if (fs::exists(debug_dir)) { + fs::remove_all(debug_dir); + } + fs::create_directories(debug_dir); + assert(fs::is_directory(debug_dir)); + std::vector debug_subdirs = {"fwd", "bwd", "optim", "weights"}; + for (auto const &subdir : debug_subdirs) { + fs::path subdir_path = debug_dir / subdir; + fs::create_directory(subdir_path); + } +} + FFModel::FFModel(FFConfig &_config, bool cpu_offload) : op_global_guid(OP_GUID_FIRST_VALID), layer_global_guid(LAYER_GUID_FIRST_VALID), + peft_model_global_guid(PEFT_MODEL_ID_FIRST_VALID), tensor_global_guid(TENSOR_GUID_FIRST_VALID), parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), node_global_guid(NODE_GUID_FIRST_VALID), current_transformer_layer_id(0), @@ -1586,6 +1630,9 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) for (int idx = 0; idx < config.workersPerNode * config.numNodes; idx++) { handlers[idx] = ffruntime_singleton->handlers[idx]; } + if (config.inference_debugging) { + make_debug_dirs(); + } model_id = model_counter++; } @@ -2932,7 +2979,8 @@ bool FFModel::apply_fusion( // don't fuse parallel op except allReduce since they have different // parallel_is in forward/backward if (operators[l]->is_parallel_op() && - operators[l]->op_type != OP_ALLREDUCE) { + operators[l]->op_type != OP_ALLREDUCE && + operators[l]->op_type != OP_PARALLEL_IDENTITY) { continue; } size_t start = 0; @@ -2978,7 +3026,8 @@ bool FFModel::apply_fusion( // don't fuse parallel op except allReduce since they have different // parallel_is in forward/backward if (operators[i]->is_parallel_op() && - operators[i]->op_type != OP_ALLREDUCE) { + operators[i]->op_type != OP_ALLREDUCE && + operators[i]->op_type != OP_PARALLEL_IDENTITY) { continue; } fused_op = new FusedOp(*this, 
operators[i]); @@ -3010,8 +3059,19 @@ bool FFModel::apply_fusion( found = k; } } - assert(found >= 0); - op->inputs[idx] = fused_op->outputs[found]; + if (found >= 0) { + op->inputs[idx] = fused_op->outputs[found]; + } else { + for (int k = 0; k < fused_op->numInputs; k++) { + if (fused_op->inputs[k]->region == + op->inputs[idx]->region) { + assert(found == -1); + found = k; + } + } + assert(found >= 0); + op->inputs[idx] = fused_op->inputs[found]; + } } } // Insert op @@ -3287,6 +3347,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + // PEFT layers + case OP_LORA: { + Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } default: assert(false); } @@ -3313,9 +3379,123 @@ bool FFModel::is_mlp_block(int layer_idx) const { return false; } +bool FFModel::need_to_add_combine(int layer_idx) const { + if (config.computationMode != COMP_MODE_INFERENCE || + config.tensor_parallelism_degree == 1 || layers.size() <= 2) { + return false; + } + auto const &l = layers[layer_idx]; + // softmax followed by argmax/arg_topk: add combine before softmax + if (layer_idx == layers.size() - 2) { + auto const &l_next = layers[layer_idx + 1]; + if (l->op_type == OP_SOFTMAX && + (l_next->op_type == OP_ARG_TOPK || l_next->op_type == OP_ARGMAX)) { + return true; + } else { + return false; + } + } + // argmax/arg_topk not preceded by softmax: add combine before + // argmax/arg_topk + if (layer_idx == layers.size() - 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX)) { + auto const &l_prev = layers[layer_idx - 1]; + if (l_prev->op_type == OP_SOFTMAX) { + return false; + } + return true; + } + return false; +} + +bool FFModel::need_to_add_allreduce(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR) || + // LLAMA with element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 3 && + layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && + layers[layer_idx - 2]->op_type == OP_LINEAR && + layers[layer_idx - 3]->op_type == OP_LINEAR))) { + return true; + } + return false; +} + +#ifdef DEADCODE +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer right after a residual + // rms norm / layer norm, and before a softmax, followed by + // argmax/argtopk/sampling + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 3 following + // the norm + layer_idx >= 2 && layer_idx < 
layers.size() - 3 && + // norm is followed by linear layer (lm head) + layers[layer_idx + 1]->op_type == OP_LINEAR && + // lm head is followed by softmax + layers[layer_idx + 2]->op_type == OP_SOFTMAX && + // softmax is followed by argmax/argtopk/sampling + (layers[layer_idx + 3]->op_type == OP_ARG_TOPK || + layers[layer_idx + 3]->op_type == OP_SAMPLING || + layers[layer_idx + 3]->op_type == OP_ARGMAX || + layers[layer_idx + 3]->op_type == OP_SCALAR_TRUE_DIV))) { + return true; + } + return false; +} +#endif +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer right after a residual + // rms norm / layer norm, and before a softmax, followed by + // argmax/argtopk/sampling + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RMS_NORM || l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_LAYERNORM || l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 1 following + // the norm + layer_idx >= 2 && layer_idx < layers.size() - 1 && + // norm is followed by linear layer or attention + (layers[layer_idx + 1]->op_type == OP_LINEAR || + layers[layer_idx + 1]->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + layers[layer_idx + 1]->op_type == + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + layers[layer_idx + 1]->op_type == + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION))) { + return true; + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; - // for (auto const &l : layers) { + std::map + op_before_allreduce_tensors_to_parallel_tensors; + std::map transformer_layer_allreduce_count; + std::map transformer_layer_parallel_identity_count; for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; std::vector inputs; @@ -3323,14 +3503,19 @@ void FFModel::create_operators_from_layers() { // create new input tensors assert(tensors_to_parallel_tensors.find(l->inputs[i]) != tensors_to_parallel_tensors.end()); - inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); + if (l->op_type == OP_LORA && + op_before_allreduce_tensors_to_parallel_tensors.find(l->inputs[i]) != + op_before_allreduce_tensors_to_parallel_tensors.end()) { + inputs.push_back( + op_before_allreduce_tensors_to_parallel_tensors[l->inputs[i]]); + } else { + inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); + } } Op *op = nullptr; - // add a combine before arg_topk - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || - l->op_type == OP_ARGMAX)) { + // add a combine before last arg_max / arg_topk or before second-to-last + // softmax + if (need_to_add_combine(layer_idx)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -3353,37 +3538,97 @@ void FFModel::create_operators_from_layers() { // config.tensor_parallelism_degree); // operators.push_back(repl); // op = repl; - } else if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - 
layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } + } else if (need_to_add_allreduce(layer_idx)) { assert(op->numOutputs == 1); - AllReduce *allreduce = - new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + size_t transformer_layer_id = op->layer_guid.transformer_layer_id; + if (transformer_layer_allreduce_count.find(transformer_layer_id) == + transformer_layer_allreduce_count.end()) { + transformer_layer_allreduce_count[transformer_layer_id] = 0; + } + std::string allreduce_name = std::string( + "layers." + std::to_string(transformer_layer_id) + ".allreduce." + + std::to_string( + transformer_layer_allreduce_count[transformer_layer_id])); + transformer_layer_allreduce_count[transformer_layer_id]++; + AllReduce *allreduce = new AllReduce(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + allreduce_name.c_str()); operators.push_back(allreduce); + op_before_allreduce_tensors_to_parallel_tensors[l->outputs[0]] = + op->outputs[0]; op = allreduce; + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } + } else if (need_to_add_parallel_identity(layer_idx)) { + assert(op->numOutputs == 1 || op->numOutputs == 2); + size_t transformer_layer_id = op->layer_guid.transformer_layer_id; + if (transformer_layer_parallel_identity_count.find( + transformer_layer_id) == + transformer_layer_parallel_identity_count.end()) { + transformer_layer_parallel_identity_count[transformer_layer_id] = 0; + } + std::string parallel_identity_name = std::string( + "layers." + std::to_string(transformer_layer_id) + + ".parallel_identity." 
+ + std::to_string( + transformer_layer_parallel_identity_count[transformer_layer_id])); + transformer_layer_parallel_identity_count[transformer_layer_id]++; + ParallelIdentity *parallel_identity = nullptr; + if (op->numOutputs == 1) { + parallel_identity = + new ParallelIdentity(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + parallel_identity_name.c_str()); + } else if (op->numOutputs == 2) { + parallel_identity = + new ParallelIdentity(*this, + op->outputs[1], + op->outputs[1]->num_dims - 1, + parallel_identity_name.c_str()); + // output 0 is taken from the residual rms norm + assert(tensors_to_parallel_tensors.find(l->outputs[0]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[0]] = op->outputs[0]; + } else { + assert(false && + "Op needing ParallelIdentity has unexpected number of outputs"); + } + operators.push_back(parallel_identity); + assert(op->numOutputs == l->numOutputs); + // last output is taken from the parallel identity + assert(tensors_to_parallel_tensors.find(l->outputs[op->numOutputs - 1]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[l->numOutputs - 1]] = + parallel_identity->outputs[0]; + op = parallel_identity; + } else { + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } } - assert(op->numOutputs == l->numOutputs); - for (int i = 0; i < op->numOutputs; i++) { - tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + // if the operator has op_type==OP_LORA, and the second-to-last operator in + // the operators vector has op_type==OP_ALLREDUCE, move the operator before + // the ALLREDUCE + if (op->op_type == OP_LORA && operators.size() > 1 && + operators[operators.size() - 2]->op_type == OP_ALLREDUCE) { + Op *tmp = operators[operators.size() - 2]; + operators[operators.size() - 2] = operators[operators.size() - 1]; + operators[operators.size() - 1] = tmp; } } } @@ -3424,7 +3669,7 @@ void FFModel::compile(LossType loss_type, deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -3549,7 +3794,7 @@ void FFModel::compile(LossType loss_type, for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->owner_op != nullptr); if (op->inputs[i]->owner_op->op_type == OP_INPUT) { - op->trainableInputs[i] = false; + op->trainable_inputs[i] = false; } } } @@ -3745,9 +3990,18 @@ bool FFModel::check_operators_integrity( } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(FusedOp::use_same_regions( - fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } else { + assert(FusedOp::use_same_regions( 
+ fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; @@ -4086,6 +4340,12 @@ struct DefaultConfig { const static bool searchOverlapBackwardUpdate = false; const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB + // PEFT related fields + const static bool enablePeft = false; + const static size_t peftActivationReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB + const static size_t peftWeightReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB const static bool cpuOffload = false; const static bool onlyDataParallel = true; const static bool enableSampleParallel = true; @@ -4122,6 +4382,11 @@ FFConfig::FFConfig() { computationMode = COMP_MODE_TRAINING; cpu_offload = DefaultConfig::cpuOffload; offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; + // PEFT related fields + enable_peft = DefaultConfig::enablePeft; + peft_activation_reserve_space_size = + DefaultConfig::peftActivationReserveSpaceSize; + peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; @@ -4248,6 +4513,18 @@ void FFConfig::parse_args(char **argv, int argc) { quantization_type = DT_INT8; continue; } + if ((!strcmp(argv[i], "-enable-peft"))) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-activation-reserve-space-size")) { + peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } + if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { + peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; @@ -5383,6 +5660,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_BWD_TASK_ID, + "residual_layernorm_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_backward_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "residual_layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // AddBiasResidualLayerNorm task { TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, @@ -5419,6 +5728,40 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + "AddBiasResidualLayerNorm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::backward_task>( + registrar, "AddBiasResidualLayerNorm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + 
} + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "AddBiasResidualLayerNorm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::peft_bwd_task>( + registrar, "AddBiasResidualLayerNorm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // SigmoidSiluMulti task { TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_INIT_TASK_ID, @@ -5452,6 +5795,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_BWD_TASK_ID, + "SigmoidSiluMulti Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + "SigmoidSiluMulti PEFT Bwd"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti PEFT Bwd Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // rms norm task { TaskVariantRegistrar registrar(RMSNORM_INIT_TASK_ID, "rmsnorm_init_task"); @@ -5495,7 +5870,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } - // rms norm task + { + TaskVariantRegistrar registrar(RMSNORM_BWD_TASK_ID, "RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RMSNORM_PEFT_BWD_TASK_ID, + "RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // residual rms norm task { TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_INIT_TASK_ID, "Residual RMS Norm Init"); @@ -5519,7 +5923,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "RMS Norm Inference Task"); + registrar, "Residual RMS Norm Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; @@ -5528,6 +5932,51 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_BWD_TASK_ID, + "Residual RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm Backward 
Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + "Residual RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LAYERNORM_PEFT_BWD_TASK_ID, + "layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5571,6 +6020,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, + "Linear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Linear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LINEAR_FWD_TASK_ID, "Linear Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5699,6 +6163,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(SOFTMAX_PEFT_BWD_TASK_ID, + "Softmax PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Softmax PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // compute Loss { TaskVariantRegistrar registrar(LOSS_BWD_TASK_ID, "Loss Backward"); @@ -6303,6 +6783,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar( + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + "IncMultiHeadSelfAttention PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + IncMultiHeadSelfAttention::peft_bwd_task>( + registrar, "IncMultiHeadSelfAttention PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( @@ -6380,6 +6878,54 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } + // PEFT tasks + // LoraLinear tasks + { + TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); + 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, + "LoraLinear Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_PEFT_BWD_TASK_ID, + "LoraLinear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); @@ -6411,31 +6957,47 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Forward Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); + TaskVariantRegistrar registrar(FUSEDOP_PEFT_BWD_TASK_ID, + "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Inference Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); + } + } + + { + TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "FusedOp Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); } } { @@ -6529,6 +7091,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_INF_TASK_ID, "Combine Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { 
TaskVariantRegistrar registrar(COMBINE_BWD_TASK_ID, "Combine Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -6543,6 +7119,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_PEFT_BWD_TASK_ID, + "Combine PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Replicate { TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); @@ -6586,6 +7177,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, + "Replicate PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Reduction { TaskVariantRegistrar registrar(REDUCTION_INIT_TASK_ID, "Reduction Init"); @@ -6644,6 +7250,34 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, "AllReduce Inference"); @@ -6660,33 +7294,101 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + TaskVariantRegistrar registrar(ALLREDUCE_PEFT_BWD_TASK_ID, + "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Forward Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } + // ParallelIdentity { - TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_INIT_TASK_ID, + "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, 
"AllReduce Backward Task"); + Runtime::preregister_task_variant( + registrar, "ParallelIdentity init Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar); } } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_FWD_TASK_ID, + "ParallelIdentity Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_BWD_TASK_ID, + "ParallelIdentity Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_INF_TASK_ID, + "ParallelIdentity Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, + "ParallelIdentity PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 62f6b89b7f..9f3e2fbb10 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -165,8 +165,8 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index fd39ed0db0..5dab73e1a4 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -14,6 +14,8 @@ */ #include "flexflow/model.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/memory_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { // declare Legion names @@ -161,12 +163,51 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } + if (info->peft_activation_reserve_space_size > 0) { + // allocate memory for peft activation reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + 
.best_affinity_to(task->target_proc) + .first(); + Realm::RegionInstance workspaceInst; + handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); + handle.peft_activation_allocator->create_legion_instance( + workspaceInst, info->peft_activation_reserve_space_size); + } else { + handle.peft_activation_allocator = nullptr; + } + + if (info->peft_weight_reserve_space_size > 0) { + // allocate memory for peft weight reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.peft_weight_allocator = + new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); + } else { + handle.peft_weight_allocator = nullptr; + } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 36ac02a3a3..dcac52397a 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -2,14 +2,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/simulator.h" #include - -#include -#include -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif +#include namespace FlexFlow { @@ -25,4 +18,31 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } +fs::path get_dst_folder(std::string const &subdir, + int step_idx, + int shard_idx, + bool before_kernel) { + std::vector debug_subdirs = {"fwd", "bwd", "optim", "weights"}; + assert(std::find(debug_subdirs.begin(), debug_subdirs.end(), subdir) != + debug_subdirs.end()); + std::string step_substr = "step_" + std::to_string(step_idx); + if (before_kernel) { + step_substr += "_pre"; + } + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = + ff_cache_path ? 
std::string(ff_cache_path) + "/debug/flexflow" + : std::string("~/.cache/flexflow/debug/flexflow"); + wordexp_t p; + wordexp(debug_dir_.c_str(), &p, 0); + debug_dir_ = p.we_wordv[0]; + wordfree(&p); + fs::path debug_dir = debug_dir_; + assert(fs::is_directory(debug_dir)); + fs::path dst_folder = + debug_dir / subdir / step_substr / ("shard_" + std::to_string(shard_idx)); + fs::create_directories(dst_folder); + return dst_folder; +} + }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 6b2d223f54..e9feb86eb5 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -42,6 +42,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -119,6 +120,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Combine *)op)->get_params(); case OP_ALLREDUCE: return ((AllReduce *)op)->get_params(); + case OP_PARALLEL_IDENTITY: + return ((ParallelIdentity *)op)->get_params(); case OP_FUSED_PARALLEL: return ((FusedParallelOp *)op)->get_params(); case OP_TRANSPOSE: diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bada87ab19..31a32dd3c8 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,6 +14,8 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ops/fused.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include @@ -21,6 +23,7 @@ #include #include #include +#include #include #include @@ -28,12 +31,16 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using json = nlohmann::json; Legion::Logger log_req_mgr("RequestManager"); std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); - assert(!fs.fail() && "no such file"); + if (fs.fail()) { + std::cerr << "Failed to open file: " << path << std::endl; + assert(false); + } std::string data; fs.seekg(0, std::ios::end); size_t size = static_cast(fs.tellg()); @@ -43,6 +50,52 @@ std::string LoadBytesFromFile(std::string const &path) { return data; } +std::ostream &operator<<(std::ostream &os, Request const &req) { + os << "Request {\n"; + os << " guid: " << req.guid << "\n"; + os << " peft_model_id: " << req.peft_model_id << "\n"; + os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " initial_len: " << req.initial_len << "\n"; + os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; + os << " llm_cache_size: " << req.llm_cache_size << "\n"; + os << " status: " << static_cast(req.status) << "\n"; + os << " tokens: ["; + for (auto const &token : req.tokens) { + os << token << " "; + } + os << "]\n"; + os << " prompt: " << req.prompt << "\n"; + // os << " beam_trees: ["; + // for (const auto& tree : req.beam_trees) { + // // Assuming BeamTree has its own << operator defined + // os << tree << " "; + // } + // os << "]\n"; + os << " req_type: " << static_cast(req.req_type) << "\n"; + os << " completed_training_steps: " << req.completed_training_steps << "\n"; + os << " gradient_accumulation_steps: " << req.gradient_accumulation_steps + << "\n"; + os << " max_training_steps: " << req.max_training_steps << "\n"; + os << " 
dataset_filepath: " << req.dataset_filepath << "\n"; + os << " dataset: ["; + for (auto const &pair : req.dataset) { + os << "["; + for (auto const &token : pair.first) { + os << token << " "; + } + os << "], ["; + for (auto const &token : pair.second) { + os << token << " "; + } + os << "] "; + } + os << "]\n"; + os << "}\n"; + return os; +} + +bool RequestManager::inference_finished = false; + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -114,6 +167,14 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) { spec_infer_tree_width.emplace_back(tree_width); } +void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) { + enable_peft_finetuning = enable_peft_finetuning_; +} + +void RequestManager::set_inference_finished(bool finished) { + inference_finished = finished; +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -121,33 +182,45 @@ void RequestManager::register_tokenizer(ModelType type, this->model_type = type; this->bos_token_id = bos_token_id; this->eos_token_id = eos_token_id; - std::string tokenizer_folder = - (!path.empty() && path.back() != '/') ? path + '/' : path; + std::filesystem::path tokenizer_folder(path); + if (model_type == ModelType::LLAMA) { - bool path_to_file = !path.empty() && - (path.size() >= strlen("tokenizer.model")) && - path.find("tokenizer.model") == - (path.size() - strlen("tokenizer.model")); - std::string tokenizer_filepath = - path_to_file ? path : tokenizer_folder + "tokenizer.model"; - this->tokenizer_ = - Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath)); + std::filesystem::path tokenizer_model_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_model_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + } else { + tokenizer_model_path = tokenizer_folder; + } + if (std::filesystem::exists(tokenizer_model_path)) { + // load from tokenizer.model + this->tokenizer_ = Tokenizer::FromBlobSentencePiece( + LoadBytesFromFile(tokenizer_model_path.string())); + } else { + // load from tokenizer.json + std::filesystem::path tokenizer_json_path = + tokenizer_folder / "tokenizer.json"; + if (!std::filesystem::exists(tokenizer_json_path)) { + std::cerr << "Failed to open file: " << tokenizer_json_path + << std::endl; + assert(false); + } + this->tokenizer_ = Tokenizer::FromBlobJSON( + LoadBytesFromFile(tokenizer_json_path.string())); + } } else if (model_type == ModelType::OPT) { - std::string vocab_file = tokenizer_folder + "vocab.json"; - std::string merges_file = tokenizer_folder + "merges.txt"; - std::string added_tokens_file = - tokenizer_folder + "special_tokens_map.json"; - std::filesystem::path path1(vocab_file); - std::filesystem::path path2(merges_file); - std::filesystem::path path3(added_tokens_file); - assert(std::filesystem::exists(path1) && + std::filesystem::path vocab_file = tokenizer_folder / "vocab.json"; + std::filesystem::path merges_file = tokenizer_folder / "merges.txt"; + std::filesystem::path added_tokens_file = + tokenizer_folder / "special_tokens_map.json"; + assert(std::filesystem::exists(vocab_file) && "Vocab file vocab.json does not exist at the specified path"); - assert(std::filesystem::exists(path2) && + assert(std::filesystem::exists(merges_file) && "Merge file merges.txt does not exist at the specified path"); // opt_tokenizer = new OptTokenizer(vocab_file, merges_file); - std::string 
vocab = LoadBytesFromFile(path1.string()); - std::string merges = LoadBytesFromFile(path2.string()); - std::string added_tokens = LoadBytesFromFile(path3.string()); + std::string vocab = LoadBytesFromFile(vocab_file.string()); + std::string merges = LoadBytesFromFile(merges_file.string()); + std::string added_tokens = LoadBytesFromFile(added_tokens_file.string()); this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); @@ -182,28 +255,40 @@ size_t RequestManager::get_num_ssms() { } RequestManager::RequestGuid - RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length) { + RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); - // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - - if (prompt.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << prompt.size() << ".\n"; - - printf("tokens size: %zu\n", request.tokens.size()); - return INVALID_GUID; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; + request.warmup = request_.warmup; + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + request.tokens.push_back(bos_token_id); + } + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens < get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + request.tokens.insert(request.tokens.end(), + request_.benchmarking_tokens, + 15); // insert random number } else { - request.initial_len = prompt.size(); - request.tokens = prompt; + std::vector tokens = this->tokenizer_->Encode(request_.prompt); + if (tokens.size() >= get_max_sequence_length()) { + std::cout << "Warning: too many tokens in prompt, only load up to " + << get_max_sequence_length() << " tokens, but got " + << tokens.size() << ".\n"; + return INVALID_GUID; + } + for (int i = 0; i < tokens.size(); i++) { + std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + } + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); } + request.initial_len = request.tokens.size(); + if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " "decoding." 
@@ -216,58 +301,111 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_infr_request_queue.push(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } - if (verbose) { - std::cout << "new req: " << request.tokens.size() << std::endl; + { + std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; + output = output + " " + std::to_string(request.tokens[i]); } + log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = ""; - gr.input_tokens = prompt; - gr.output_text = ""; - gr.output_tokens = prompt; + gr.input_text = request_.prompt; + gr.input_tokens = request.tokens; + gr.output_text = request_.prompt; + gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } RequestManager::RequestGuid - RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length) { + RequestManager::register_new_peft_request(Request const &request_) { + assert(enable_peft_finetuning && "PEFT finetuning is not enabled"); const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - request.tokens.push_back(bos_token_id); + request.initial_len = 0; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; + request.req_type = RequestType::REQ_FINETUNING; + request.completed_training_steps = 0; + request.gradient_accumulation_steps = request_.gradient_accumulation_steps; + request.max_training_steps = request_.max_training_steps; + request.dataset_filepath = request_.dataset_filepath; + request.warmup = request_.warmup; + + // Load dataset + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens <= get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + std::vector input_tokens; + std::vector output_tokens; + bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON); + if (bos_added) { + input_tokens.push_back(bos_token_id); + } + input_tokens.insert(input_tokens.end(), + request_.benchmarking_tokens - (int)bos_added, + 15); // insert random number + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } else { + using json = nlohmann::json; + std::ifstream file_handle(request.dataset_filepath); + assert(file_handle.good() && "Dataset file does not exist."); + json dataset_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + for (auto &prompt : dataset_json) { + std::string text = prompt.get(); + std::string output_text(""); + std::vector input_tokens; + input_tokens = this->tokenizer_->Encode(text); + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + input_tokens.insert(input_tokens.begin(), bos_token_id); + } + std::vector output_tokens = + this->tokenizer_->Encode(output_text); + if 
(input_tokens.size() + output_tokens.size() > + get_max_sequence_length()) { + std::cout << "Warning: too many tokens in sample, only load up to " + << get_max_sequence_length() << " tokens, but got " + << input_tokens.size() + output_tokens.size() << ".\n"; + return INVALID_GUID; + } else { + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } + } } - std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; - printf("tokens size: %zu\n", tokens.size()); - return INVALID_GUID; + if (request.gradient_accumulation_steps == -1) { + request.gradient_accumulation_steps = request.dataset.size(); } - for (int i = 0; i < tokens.size(); i++) { - std::cout << "[" << i << "]" << tokens.at(i) << "\n"; - } - request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); - request.initial_len = request.tokens.size(); + assert(request.gradient_accumulation_steps > 0 && + "Invalid gradient accumulation steps"); + assert(request.gradient_accumulation_steps <= request.max_training_steps && + "Gradient accumulation steps should be less than or equal to max " + "training steps"); + // Currently don't support speculative inference for PEFT + assert(get_num_ssms() == 0); if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " "decoding." @@ -280,29 +418,38 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_peft_request_queue.push(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } - { - std::string output = "New request tokens:"; - output = "[" + std::to_string(request.guid) + "]" + output; - for (int i = 0; i < request.tokens.size(); i++) { - output = output + " " + std::to_string(request.tokens[i]); + for (size_t r = 0; r < request.dataset.size(); r++) { + std::string input = "[" + std::to_string(r) + "] input:"; + std::string output = "[" + std::to_string(r) + "] output:"; + for (size_t i = 0; i < request.dataset[r].first.size(); i++) { + input = input + " " + std::to_string(request.dataset[r].first[i]); } + for (size_t i = 0; i < request.dataset[r].second.size(); i++) { + output = output + " " + std::to_string(request.dataset[r].second[i]); + } + log_req_mgr.print("%s", input.c_str()); log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = prompt; - gr.input_tokens = request.tokens; - gr.output_text = prompt; - gr.output_tokens = request.tokens; + // gr.input_text = prompt; + // gr.input_tokens = request.tokens; + // gr.output_text = prompt; + // gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } @@ -363,51 +510,117 @@ BatchConfig RequestManager::prepare_next_batch_task( return rm->prepare_next_batch(*bc, result); } +bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, + int i) { + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + bool request_completed = false; + // printf("model_type = %d\n", this->model_type); + if (request.tokens.size() >= 
old_bc.requestsInfo[i].max_sequence_length) { + request_completed = true; + } else if (request.tokens.back() == eos_token_id) { + // Encounter EOS token id + request_completed = true; + } + return request_completed; +} + +void RequestManager::check_batch(BatchConfig const &old_bc, + BatchConfig const &new_bc) { + int num_incomplete_prompts = 0; + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + if (new_bc.request_completed[i]) { + continue; + } + // ensure there is no request with zero tokens + assert(new_bc.requestsInfo[i].num_tokens_in_batch > 0); + // ensure there is no more than one incomplete prompt + if (new_bc.requestsInfo[i].prompt_phase && + new_bc.requestsInfo[i].num_tokens_in_batch + + new_bc.requestsInfo[i].first_token_depth_in_request < + all_requests[new_bc.requestsInfo[i].request_guid].tokens.size()) { + num_incomplete_prompts++; + } + } + if (num_incomplete_prompts > 1) { + std::cout << "Error: more than one incomplete prompt in the batch\n"; + pid_t pid = getpid(); + std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; + std::ofstream filen(filenamen); + if (filen.is_open()) { + filen << new_bc << std::endl; + filen.close(); + std::cout << "String written to file: " << filenamen << std::endl; + } else { + std::cout << "Unable to open file: " << filenamen << std::endl; + } + std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; + std::ofstream fileo(filenameo); + if (fileo.is_open()) { + fileo << old_bc << std::endl; + fileo.close(); + std::cout << "String written to file: " << filenameo << std::endl; + } else { + std::cout << "Unable to open file: " << filenameo << std::endl; + } + assert(false); + } +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_tokens; i++) { + for (int i = 0; i < old_bc.num_active_tokens(); i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; + if (request.req_type == RequestType::REQ_FINETUNING) { + continue; + } if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; } else { + // This is a decoding token assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == request.tokens.size()); - // This is a decoding token + if (!profiling_requests[guid].first_token_time_set) { + profiling_requests[guid].first_token_time = + Realm::Clock::current_time_in_microseconds(); + profiling_requests[guid].first_token_time_set = true; + } log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); // std::string output = this->tokenizer_->Decode(request.tokens); // log_req_mgr.print("Output: %s", output.c_str()); } } + int num_generation_tokens = 0; int num_active_req = -1; - // Step 2: prepare the next batch for existing requests + // when finetuning is enabled, the last entry in the batch cannot be used for + // inference + int inference_batch_size = + BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + + // Step 2: prepare the next batch for existing inference requests BatchConfig new_bc; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { // add new requests to the next batch + for (int i = 0; i < inference_batch_size; i++) { + if 
(old_bc.request_completed[i]) { + // no need to carry over tokens to new batch for this request continue; } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } + bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically @@ -435,32 +648,40 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print("[%s] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", + request.warmup ? "Warmup" : "Profile", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time, + profile_info.first_token_time - + profile_info.registration_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; + outputFile << "[" << (request.warmup ? 
"Warmup" : "Profile") + << "] guid(" << request.guid << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ") ttft(" << std::fixed << std::setprecision(3) + << (profile_info.first_token_time - + profile_info.registration_time) + << ")\n"; + if (request.benchmarking_tokens <= 0) { + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } } + outputFile << std::endl; + outputFile << output; } - outputFile << std::endl; - outputFile << output; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -468,13 +689,15 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(false); } } - } else { new_bc.request_completed[i] = false; new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; num_active_req++; @@ -487,8 +710,25 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. 
+ for (int ii = i + 1; ii < inference_batch_size; ii++) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = check_inf_req_completion(old_bc, ii); + if (!req_completed) { + space_for_incr_dec_requests++; + } + } new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens - + space_for_incr_dec_requests, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; @@ -509,13 +749,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } new_bc.num_generation_tokens = num_generation_tokens; - // Step 3: add new requests to the next batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + // Step 3: add new inference requests to the next batch if there is space + for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + assert(new_request.req_type == RequestType::REQ_INFERENCE); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -526,15 +767,16 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + // add start time to profile_info for the new request + profiling_requests[new_request.guid].llm_decoding_steps = 1; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -551,6 +793,170 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + if (enable_peft_finetuning && + !old_bc.request_completed[inference_batch_size]) { + assert(old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch > 0); + Request &request = + all_requests[old_bc.requestsInfo[inference_batch_size].request_guid]; + assert(request.req_type == RequestType::REQ_FINETUNING && + "Found misplaced inference request"); + + request.finetuning_losses.push_back(result.finetuning_loss); + + request.dataset_entry_processed_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.processed_finetuning_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.finetuning_tokens_per_batch.push_back( + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch); + int 
dataset_entry = + request.completed_training_steps % request.dataset.size(); + if (old_bc.requestsInfo[inference_batch_size].first_token_depth_in_request + + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch == + request.dataset[dataset_entry].first.size()) { + // completed the current dataset entry + assert(request.dataset_entry_processed_tokens == + request.dataset[dataset_entry].first.size()); + request.completed_training_steps += 1; + request.dataset_entry_processed_tokens = 0; + } + + assert(request.completed_training_steps <= request.max_training_steps); + if (request.completed_training_steps == request.max_training_steps || + inference_finished) { + // check if the fine tuning request has completed + request.status = Request::COMPLETED; + + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.finetuning_losses = request.finetuning_losses; + trigger_request_completion_future(request.guid); + num_processed_requests++; + + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[%s] guid(%zu) completed_training_steps(%d) " + "processed_finetuning_tokens(%lu) latency(%.1lf)", + request.warmup ? "Warmup" : "Finetuning", + request.guid, + request.completed_training_steps, + request.processed_finetuning_tokens, + profile_info.finish_time - profile_info.start_time); + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + std::string tokens_str = "["; + for (size_t i = 0; i < request.finetuning_tokens_per_batch.size(); + i++) { + tokens_str += + std::to_string(request.finetuning_tokens_per_batch[i]); + if (i != request.finetuning_tokens_per_batch.size() - 1) { + tokens_str += ", "; + } + } + tokens_str += "]"; + outputFile << "[" << (request.warmup ? 
"Warmup" : "Finetuning") + << "] guid(" << request.guid + << ") completed_training_steps(" + << request.completed_training_steps + << ") processed_finetuning_tokens(" + << request.processed_finetuning_tokens << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ") tokens_per_batch(" << tokens_str << ")\n"; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + } + } + + // Step 4: add PEFT bwd requests, if there is additional space + while (pending_peft_request_queue.size() > 0) { + Request &request = pending_peft_request_queue.front(); + // assert(request.req_type = RequestType::REQ_FINETUNING); + Request &all_req_handle = all_requests[request.guid]; + // assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + if (all_req_handle.status == Request::COMPLETED) { + pending_peft_request_queue.pop(); + } else { + break; + } + } + + if (pending_peft_request_queue.size() > 0 && !inference_finished) { + Request &request = pending_peft_request_queue.front(); + assert(request.req_type = RequestType::REQ_FINETUNING); + assert(request.dataset.size() > 0); + // update status and training steps + Request &all_req_handle = all_requests[request.guid]; + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + + request.completed_training_steps = all_req_handle.completed_training_steps; + request.processed_finetuning_tokens = + all_req_handle.processed_finetuning_tokens; + request.status = all_req_handle.status; + int dataset_entry = + request.completed_training_steps % request.dataset.size(); + request.dataset_entry_processed_tokens = + all_req_handle.dataset_entry_processed_tokens; + request.gradient_accumulation_steps = + all_req_handle.gradient_accumulation_steps; + + assert(request.status != Request::COMPLETED); + assert(request.max_training_steps > 0 && + request.completed_training_steps < request.max_training_steps); + assert(request.dataset_entry_processed_tokens <= + request.dataset[dataset_entry].first.size()); + + int num_peft_tokens = + min((int)request.dataset[dataset_entry].first.size() - + request.dataset_entry_processed_tokens, + get_max_tokens_per_batch() - new_bc.num_active_infr_tokens()); + int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); + assert(num_peft_label_tokens == 0); + + if (num_peft_tokens > 0) { + assert(new_bc.request_completed[inference_batch_size]); + // request info + new_bc.request_completed[inference_batch_size] = false; + new_bc.requestsInfo[inference_batch_size].first_token_depth_in_request = + request.dataset_entry_processed_tokens; + new_bc.requestsInfo[inference_batch_size].first_token_offset_in_batch = + new_bc.num_active_infr_tokens(); + new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = + num_peft_tokens; + new_bc.requestsInfo[inference_batch_size].max_sequence_length = + request.max_sequence_length; + new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + new_bc.requestsInfo[inference_batch_size].peft_model_id = + request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + set_optimizer_tasks( + new_bc.requestsInfo[inference_batch_size].optimizer_tasks, + request.max_training_steps, + request.completed_training_steps, + request.gradient_accumulation_steps); + // tokens info + for (size_t i = request.dataset_entry_processed_tokens; + i < request.dataset_entry_processed_tokens + num_peft_tokens; + i++) { + 
new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.dataset[dataset_entry].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = + inference_batch_size; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; + new_bc.num_tokens++; + new_bc.num_peft_tokens++; + } + } + } return new_bc; } @@ -722,11 +1128,17 @@ BeamSearchBatchConfig if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + // outputFile << "end-to-end latency: " << std::fixed + // << std::setprecision(3) << total_request_run_time + // << std::endl; + // outputFile << "num decoding steps: " + // << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -736,7 +1148,6 @@ BeamSearchBatchConfig } outputFile << std::endl; outputFile << output; - outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -884,10 +1295,10 @@ BeamSearchBatchConfig // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -901,13 +1312,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 0; - profile_info.ssm_decoding_steps = 0; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + profiling_requests[new_request.guid].llm_decoding_steps = 0; + profiling_requests[new_request.guid].ssm_decoding_steps = 0; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); // init the beam search metadata per request - int ssm_decoding_steps = profile_info.ssm_decoding_steps; + int ssm_decoding_steps = + profiling_requests[new_request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = spec_infer_tree_width.size() > ssm_decoding_steps @@ -1552,7 +1963,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min(max_prompt_load_size, (int)request.initial_len - @@ -2105,7 +2515,7 @@ std::vector> // must in this branch. 
int layer_slot = i - processed_whole_layer_tokens; int layer_slot_total = treeLayers[layer_num]; - if ((first_layer_slot == layer_slot)) { + if (first_layer_slot == layer_slot) { verifiedTree.push_back(output); new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); @@ -2297,19 +2707,34 @@ std::vector> } std::vector - FFModel::generate(std::vector &prompts, int max_seq_length) { + FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); - std::vector guids; - for (int i = 0; i < prompts.size(); i++) { - RequestManager::RequestGuid guid = - rm->register_new_request(prompts.at(i), max_seq_length); - if (guid != RequestManager::INVALID_GUID) { - guids.push_back(guid); + // reset inference_finished flag + rm->set_inference_finished(false); + std::vector inf_guids, peft_guids; + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid; + if (requests.at(i).req_type == RequestType::REQ_INFERENCE) { + guid = rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + inf_guids.push_back(guid); + } + } else { + guid = rm->register_new_peft_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + peft_guids.push_back(guid); + } } } std::vector results; - for (int i = 0; i < guids.size(); i++) { - results.push_back(rm->get_generation_result(guids[i])); + for (int i = 0; i < inf_guids.size(); i++) { + results.push_back(rm->get_generation_result(inf_guids[i])); + } + if (inf_guids.size() > 0) { + rm->set_inference_finished(); + } + for (int i = 0; i < peft_guids.size(); i++) { + results.push_back(rm->get_generation_result(peft_guids[i])); } return results; } @@ -2342,6 +2767,18 @@ void RequestManager::background_serving_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + + auto print_timestamped_message = [](std::string const &message) { + auto now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " + << message << std::endl; + }; + + // Print at the start of the task + print_timestamped_message( + "###PEFT DEBUGGING### Starting background serving task."); + RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; { @@ -2358,6 +2795,11 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } + + // Checkpoint print + print_timestamped_message( + "###PEFT DEBUGGING### Updated models' configuration."); + if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding rm->serve_incr_decoding(llm); @@ -2365,13 +2807,48 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } + #ifdef FF_USE_NCCL llm->finish_nccl_comms(); #endif + + // Print at the end of the task + print_timestamped_message( + "###PEFT DEBUGGING### Background serving task completed."); +} + +std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { + for (size_t i = 0; i < model->layers.size(); i++) { + if (model->layers[i]->layer_guid == guid) { + std::string layer_name(model->layers[i]->name); + return layer_name; + } + } + assert(false); + return "invalid_layer_name"; +} + +bool is_peft_operator_type(OperatorType type) { + switch (type) { + case OP_LORA: + return true; + default: + return false; + } } /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { + + // Check if the model object exists 
+ if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." << std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; // Compile the llm @@ -2419,6 +2896,9 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); + if (llm->config.enable_peft) { + im->peft_bwd(llm, 0, bcf); + } assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index fadbf80d6d..8e5f302466 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -73,74 +73,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, &(beam_batch_config->beamTokenInfo), sizeof(BeamSearchBatchConfig::beamTokenInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, &(beam_batch_config->beamRequestsInfo), sizeof(BeamSearchBatchConfig::beamRequestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(beam_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(tree_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += 
sizeof(BatchConfig::causalMask); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->committed_tokens, &(tree_batch_config->committed_tokens), sizeof(TreeVerifyBatchConfig::committed_tokens), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - } - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 8380d6be73..343f1dd6e6 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -40,8 +40,21 @@ void RequestManager::load_tokens_task( printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + + // pid_t pid = getpid(); + // std::string filename = "bc_" + std::to_string(pid) + ".txt"; + // std::ofstream file(filename); + // if (file.is_open()) { + // file << *batch_config << std::endl; + // file.close(); + // std::cout << "String written to file: " << filename << std::endl; + // } else { + // std::cout << "Unable to open file: " << filename << std::endl; + // } + } else if (batch_config->num_tokens > - BatchConfig::max_verify_tokens_per_batch()) { + BatchConfig::max_verify_tokens_per_batch() && + batch_config->get_mode() != INC_DECODING_MODE) { printf("Warning: Speculative decoding. too many tokens in prompt, only " "load up to %d tokens\n", BatchConfig::max_verify_tokens_per_batch()); @@ -80,91 +93,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, 
- stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->committed_tokens, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); } - - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( diff --git a/src/runtime/simulator.cpp b/src/runtime/simulator.cpp index 0daf151d2c..56931e0dc7 100644 --- a/src/runtime/simulator.cpp +++ b/src/runtime/simulator.cpp @@ -82,17 +82,17 @@ Simulator::Simulator(FFModel const *model, checkCUDA(hipEventCreate(&start_event)); checkCUDA(hipEventCreate(&end_event)); - conv2d_meta = new Conv2DMeta(handler); - // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); - // ele_binary_meta = new ElementBinaryMeta(handler); - // embedding_meta = new EmbeddingMeta(handler); - // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); - // 
dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); + // linear_meta = new LinearMeta(handler, 4096); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); + // ele_binary_meta = new ElementBinaryMeta(handler); + // embedding_meta = new EmbeddingMeta(handler); + // softmax_meta = new SoftmaxMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); + // dropout_meta = new DropoutMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; diff --git a/src/runtime/simulator.cu b/src/runtime/simulator.cu index b44ce1690a..056781f73d 100644 --- a/src/runtime/simulator.cu +++ b/src/runtime/simulator.cu @@ -81,17 +81,17 @@ Simulator::Simulator(FFModel const *model, cudaEventCreate(&start_event); cudaEventCreate(&end_event); - conv2d_meta = new Conv2DMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); // ele_binary_meta = new ElementBinaryMeta(handler); // embedding_meta = new EmbeddingMeta(handler); // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); // dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; @@ -103,13 +103,13 @@ Simulator::~Simulator(void) { simulatorInst.destroy(); cudaEventDestroy(start_event); cudaEventDestroy(end_event); - delete conv2d_meta; - delete pool2d_meta; - delete ele_unary_meta; - delete batch_matmul_meta; - delete concat_meta; - delete transpose_meta; - delete task_manager; + // delete conv2d_meta; + // delete pool2d_meta; + // delete ele_unary_meta; + // delete batch_matmul_meta; + // delete concat_meta; + // delete transpose_meta; + // delete task_manager; } __host__ void diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index b86964049d..9b6510fe5e 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -43,6 +43,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -3754,14 +3755,17 @@ bool FFModel::convert_graph_to_operators( assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; new_op = new Softmax( - *this, softmax->layer_guid, inputs[0], softmax->dim, NULL); + *this, softmax->layer_guid, inputs[0], softmax->dim, softmax->name); break; } case OP_COMBINE: { assert(inList.size() == 1); Combine *combine = (Combine *)node.ptr; - new_op = new Combine( - *this, inputs[0], combine->combine_dim, combine->combine_degree); + new_op = new Combine(*this, + inputs[0], + 
combine->combine_dim, + combine->combine_degree, + combine->name); break; } case OP_REPARTITION: { @@ -3770,7 +3774,8 @@ bool FFModel::convert_graph_to_operators( new_op = new Repartition(*this, inputs[0], repart->repartition_dim, - repart->repartition_degree); + repart->repartition_degree, + repart->name); break; } case OP_REPLICATE: { @@ -3779,7 +3784,8 @@ bool FFModel::convert_graph_to_operators( new_op = new Replicate(*this, inputs[0], replicate->replicate_dim, - replicate->replicate_degree); + replicate->replicate_degree, + replicate->name); break; } case OP_REDUCTION: { @@ -3788,13 +3794,24 @@ bool FFModel::convert_graph_to_operators( new_op = new Reduction(*this, inputs[0], reduction->reduction_dim, - reduction->reduction_degree); + reduction->reduction_degree, + reduction->name); break; } case OP_ALLREDUCE: { assert(inList.size() == 1); AllReduce *allreduce = (AllReduce *)node.ptr; - new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim); + new_op = new AllReduce( + *this, inputs[0], allreduce->allreduce_dim, allreduce->name); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(inList.size() == 1); + ParallelIdentity *parallel_identity = (ParallelIdentity *)node.ptr; + new_op = new ParallelIdentity(*this, + inputs[0], + parallel_identity->parallel_identity_dim, + parallel_identity->name); break; } case OP_FUSED_PARALLEL: { @@ -3819,8 +3836,9 @@ bool FFModel::convert_graph_to_operators( abr_ln->elementwise_affine, abr_ln->use_bias, abr_ln->eps, + abr_ln->inplace_residual, true, - NULL); + abr_ln->name); break; } case OP_SIGMOID_SILU_MULTI: { @@ -3828,7 +3846,7 @@ bool FFModel::convert_graph_to_operators( SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr; SigmoidSiluMultiParams params = ssm->get_params(); new_op = new SigmoidSiluMulti( - *this, ssm->layer_guid, inputs[0], inputs[1], NULL); + *this, ssm->layer_guid, inputs[0], inputs[1], ssm->name); break; } default: { diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 49d42bb6dd..a71b1070b2 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -54,6 +54,10 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index f3732d54f4..0000000000 --- a/tests/.gitignore +++ /dev/null @@ -1 +0,0 @@ -inference/python_test_configs/*.json diff --git a/tests/align/test_all_operators.sh b/tests/align/test_all_operators.sh index 3fb361f25c..73b0cb30dc 100755 --- a/tests/align/test_all_operators.sh +++ b/tests/align/test_all_operators.sh @@ -11,7 +11,7 @@ function generate_torch_tensor(){ python tests/align/align_create_tensor_torch.py -o "$1" } -ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear gather) +ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape 
scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear layernorm gather) #create flexflow tensors conda activate flexflow diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 1e8dd4298f..c7206eac93 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -23,8 +23,8 @@ remove_mnist() { download_mnist() { if [[ ! -f train-images-idx3-ubyte || ! -f train-labels-idx1-ubyte ]]; then remove_mnist - wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz - wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz + wget https://mnist-backup.s3.us-east-2.amazonaws.com/train-images-idx3-ubyte.gz + wget https://mnist-backup.s3.us-east-2.amazonaws.com/train-labels-idx1-ubyte.gz gzip -d train-images-idx3-ubyte.gz gzip -d train-labels-idx1-ubyte.gz fi diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 8beea55999..a9dd8809ba 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,26 +10,26 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 # OPT (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 
-ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -37,63 +37,63 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 # OPT (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 
+../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 # OPT (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 # OPT (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + 
../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 6857b5cbc1..5e563c9974 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -77,20 +77,18 @@ def main(): # Set default tensor type depending on argument indicating the float type to use if not args.use_full_precision: - torch.set_default_tensor_type(torch.HalfTensor) - + torch.set_default_dtype(torch.float16) + else: + torch.set_default_dtype(torch.float32) + # Run huggingface model cuda_availble = torch.cuda.is_available() device = "cuda" if args.gpu and cuda_availble else "cpu" # Get Model - model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device) + model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=True).to(device) # Get Tokenizer hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True) - hf_arch = getattr(hf_config, "architectures")[0] - if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": - tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) - else: - tokenizer = AutoTokenizer.from_pretrained(args.model_name) + tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) generation_config = GenerationConfig.from_pretrained(args.model_name) generation_config.do_sample = args.do_sample ################# debugging ################# diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index a1ee281914..a83464754f 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -84,12 +84,13 @@ function compare_decoding_steps_spec_infer_incr_decoding { local specInf_file="$2" # Read the number of decoding steps from the second line of the files - second_line=$(sed -n '2p' "$incrDec_file") - read -r line <<< "$second_line" - incrDec=${line#*: } - second_line=$(sed -n '2p' "$specInf_file") - read -r line <<< 
"$second_line" - specInf=${line#*: } + first_line=$(sed -n '1p' "$incrDec_file") + incr_dec_steps="${first_line##*llm_decoding_steps(}" + incr_dec_steps="${incr_dec_steps%%)*}" + + first_line=$(sed -n '1p' "$specInf_file") + spec_inf_steps="${first_line##*llm_decoding_steps(}" + spec_inf_steps="${spec_inf_steps%%)*}" if ! command -v bc &> /dev/null; then echo "bc is not installed. Installing..." @@ -97,8 +98,8 @@ function compare_decoding_steps_spec_infer_incr_decoding { fi # Perform the comparison - threshold=$(bc <<< "$specInf * 1.5") - if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + threshold=$(bc <<< "$spec_inf_steps * 1.5") + if (( $(echo "$incr_dec_steps >= $threshold" | bc -l) )); then #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file." : else @@ -184,13 +185,13 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p # Falcon (full precision) python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") -#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") -diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail 
-n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 41703cf431..0a745c7984 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -14,9 +14,12 @@ "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8 GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py new file mode 100644 index 0000000000..93727bdc89 --- /dev/null +++ b/tests/peft/alignment/align_test_utils.py @@ -0,0 +1,510 @@ +import os, re, torch +import numpy as np +from typing import List +from enum import Enum +from dataclasses import dataclass + +abs_dirname = os.path.dirname(os.path.abspath(__file__)) +cache_folder = os.path.expanduser(os.getenv("FF_CACHE_PATH", "~/.cache/flexflow")) +hf_path = os.path.join(cache_folder, "debug/huggingface") +ff_path = os.path.join(cache_folder, "debug/flexflow") + + +def print_unique_files_list(dirname): + files_list = os.listdir(dirname) + for f in sorted(files_list): + match = re.search(r"layers.\d+", f) + if match: + if "layers." in match[0]: + layer_num = int(match[0].split(".")[1]) + if layer_num > 0: + files_list.remove(f) + elif "layers_" in match[0]: + layer_num = int(match[0].split("_")[1]) + if layer_num > 0 and layer_num != 100: + files_list.remove(f) + return sorted(files_list) + + +def compare_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, tolerance=1e-2): + """Check whether a HuggingFace tensor and a FlexFlow tensor are equal + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor_filepath (str): The file path of the FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the checks. Defaults to 1e-2. 
+ + Raises: + FileNotFoundError: If the HuggingFace tensor file is not found + FileNotFoundError: If the FlexFlow tensor file is not found + """ + if not os.path.exists(hf_tensor_filepath): + raise FileNotFoundError(f"HF tensor file: {hf_tensor_filepath} not found") + if not os.path.exists(ff_tensor_filepath): + raise FileNotFoundError(f"FF tensor file {ff_tensor_filepath} not found") + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print(f"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + # print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def compare_tensors_difference( + hf_tensor_filepath: str, + ff_tensor1_filepath: str, + ff_tensor2_filepath: str, + tolerance: float = 1e-2, +): + """Check whether a HuggingFace tensor is equal to the difference between two FlexFlow tensors + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor1_filepath (str): The file path of the first FlexFlow tensor + ff_tensor2_filepath (str): The file path of the second FlexFlow tensor + tolerance (float, optional): The floating-point error tolerance for the equality check. Defaults to 1e-2.
+ """ + assert os.path.exists(hf_tensor_filepath) + assert os.path.exists(ff_tensor1_filepath) + assert os.path.exists(ff_tensor2_filepath) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor1 = ff_tensor1[:len_hf_tensor] + ff_tensor2 = ff_tensor2[:len_hf_tensor] + ff_tensor = ff_tensor1 - ff_tensor2 + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print( + f"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}" + ) + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + # print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def compare_hf_tensors(tensor1_fp: str, tensor2_fp: str): + """Checks whether two HuggingFace tensors are equal + + Args: + tensor1_fp (str): The file path of the first tensor + tensor2_fp (str): The file path of the second tensor + """ + if not os.path.exists(tensor1_fp): + raise FileNotFoundError(f"HF tensor file: {tensor1_fp} not found") + if not os.path.exists(tensor2_fp): + raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found") + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = torch.load(tensor2_fp) + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert len(hf_tensor1) == 1 + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert len(hf_tensor2) == 1 + hf_tensor2 = hf_tensor2[0] + assert torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + if not ( + np.allclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + ): + print(f"mismatch between {tensor1_fp} and {tensor2_fp}") + print(hf_tensor1) + print(hf_tensor2) + print( + np.isclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + ) + mismatches = np.where( + ~np.isclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + )[0] + print(mismatches) + assert False + print("Ok!") + + +def check_hf_sum_tensors(tensor_sum_fp: str, tensor1_fp: str, tensor2_fp: str): + """Checks whether a HuggingFace tensor is equal to the sum of two other HuggingFace tensors + + Args: + tensor_sum_fp (str): The file path of the sum tensor + tensor1_fp (str): The file path of the first tensor + tensor2_fp (str): The file path of the second tensor + """ + if not os.path.exists(tensor_sum_fp): + raise FileNotFoundError(f"HF tensor file: {tensor_sum_fp} not found") + if not os.path.exists(tensor1_fp): + raise FileNotFoundError(f"HF tensor file {tensor1_fp} not found") + if not os.path.exists(tensor2_fp): + raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found") + hf_tensor_sum = torch.load(tensor_sum_fp) + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = 
torch.load(tensor2_fp) + if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list: + assert len(hf_tensor_sum) == 1 + hf_tensor_sum = hf_tensor_sum[0] + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert len(hf_tensor1) == 1 + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert len(hf_tensor2) == 1 + hf_tensor2 = hf_tensor2[0] + assert torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape + assert torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + hf_tensor_sum = torch.nan_to_num(hf_tensor_sum) + sum_check_tensor = hf_tensor1 + hf_tensor2 + if not ( + np.allclose( + sum_check_tensor.detach().cpu().numpy(), + hf_tensor_sum.detach().cpu().numpy(), + ) + ): + print(f"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}") + print(tensor_sum_fp) + print(sum_check_tensor) + print(hf_tensor1) + print(hf_tensor2) + print( + np.isclose( + sum_check_tensor.detach().cpu().numpy(), + hf_tensor_sum.detach().cpu().numpy(), + ) + ) + mismatches = np.where( + ~np.isclose( + sum_check_tensor.detach().cpu().numpy(), + hf_tensor_sum.detach().cpu().numpy(), + ) + )[0] + print(mismatches) + assert False + print("Ok!") + + +def check_hf_zero_tensor(hf_tensor_fp: str): + """Check whether a HuggingFace tensor is a zero tensor + + Args: + hf_tensor_fp (str): The file path of the HuggingFace tensor + """ + if not os.path.exists(hf_tensor_fp): + raise FileNotFoundError(f"HF tensor file: {hf_tensor_fp} not found") + hf_tensor1 = torch.load(hf_tensor_fp) + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert len(hf_tensor1) == 1 + hf_tensor1 = hf_tensor1[0] + assert torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0 + + +def print_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, txt: str = ""): + """Print the contents of a HuggingFace tensor and a FlexFlow tensor + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor_filepath (str): The file path of the FlexFlow tensor + txt (str, optional): Additional text to prepend to the tensors. Defaults to "". + """ + assert os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + print(f"{txt} - HF tensor:") + print(hf_tensor) + print(f"{txt} - FF tensor: ") + print(ff_tensor) + + +def compare_flexflow_tensors( + ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5, max_len: int = -1 +): + """Check whether two FlexFlow tensors are equal + + Args: + ff_tensor1_fp (str): The file path of the first FlexFlow tensor + ff_tensor2_fp (str): The file path of the second FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5. + max_len (int, optional): Maximum number of elements to check (if > 0). Defaults to -1.
+ + Raises: + FileNotFoundError: If the first FlexFlow tensor file is not found + FileNotFoundError: If the second FlexFlow tensor file is not found + """ + if not os.path.exists(ff_tensor1_fp): + raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found") + if not os.path.exists(ff_tensor2_fp): + raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found") + assert os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp) + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",") + + if ff_tensor1.shape != ff_tensor2.shape: + print(ff_tensor1.shape, ff_tensor2.shape) + assert ff_tensor1.shape == ff_tensor2.shape + + if max_len > -1: + ff_tensor1 = ff_tensor1[:max_len] + ff_tensor2 = ff_tensor2[:max_len] + + mismatches = [] + if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance): + print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0] + print(mismatches) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len(ff_tensor1) + print("Ok!") + + +def compare_flexflow_tensors_shortest( + ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5 +): + """Compare two FlexFlow tensors up to the maximum length of the shortest tensor + + Args: + ff_tensor1_fp (str): The file path of the first FlexFlow tensor + ff_tensor2_fp (str): The file path of the second FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5. + + Raises: + FileNotFoundError: If the first FlexFlow tensor file is not found + FileNotFoundError: If the second FlexFlow tensor file is not found + """ + if not os.path.exists(ff_tensor1_fp): + raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found") + if not os.path.exists(ff_tensor2_fp): + raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found") + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",") + minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0]) + ff_tensor1 = ff_tensor1[:minlen] + ff_tensor2 = ff_tensor2[:minlen] + mismatches = [] + if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance): + print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0] + print(mismatches) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len(ff_tensor1) + print("Ok!") + + +def check_flexflow_tensors_sum( + ff_tensor_sum_fp: str, ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance=1e-5 +): + """Check whether a FlexFlow tensor is equal to the sum of two other FlexFlow tensors + + Args: + ff_tensor_sum_fp (str): The file path of the FlexFlow sum tensor + ff_tensor1_fp (str): The file path of the first FlexFlow tensor + ff_tensor2_fp (str): The file path of the second FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+ + Raises: + FileNotFoundError: If the first FlexFlow tensor file is not found + FileNotFoundError: If the second FlexFlow tensor file is not found + """ + if not os.path.exists(ff_tensor1_fp): + raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found") + if not os.path.exists(ff_tensor2_fp): + raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found") + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",") + ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=",") + + ff_sum = ff_tensor1 + ff_tensor2 + assert ff_tensor1.shape == ff_tensor2.shape + + mismatches = [] + if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance): + print( + f"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}" + ) + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(f"Sum Tensor: {ff_tensor_sum}\nActual sum:{ff_sum}") + print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0] + print(mismatches) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len(ff_tensor1) + print("Ok!") + + +def load_ff_tensor(filename: str, shape: List[int]): + """Load a FlexFlow tensor from a file as a numpy array + + Args: + filename (str): The file path of the FF tensor + shape (List[int]): The shape of the FF tensor + + Returns: + np.ndarray: The FF tensor as a numpy array + """ + if ff_path not in filename: + filename = os.path.join(ff_path, filename) + ff_tensor = np.loadtxt(filename, delimiter=",").reshape(shape, order="F") + return ff_tensor + + +def load_hf_tensor(filename: str): + """Load a HuggingFace tensor from a file as a numpy array + + Args: + filename (str): The file path of the HF tensor + + Returns: + np.ndarray: The HF tensor as a numpy array + """ + if hf_path not in filename: + filename = os.path.join(hf_path, filename) + hf_tensor = torch.load(filename) + hf_tensor = hf_tensor.detach().cpu().numpy() + return hf_tensor + + +def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2): + """Check whether a HuggingFace tensor and a FlexFlow tensor, both loaded into memory as numpy arrays, are equal + + Args: + hf_tensor (np.ndarray): The HuggingFace tensor (in numpy array form) + ff_tensor (np.ndarray): The FlexFlow tensor (in numpy array form) + tolerance (float, optional): The floating-point error tolerance for the check. Defaults to 1e-2.
+ """ + assert hf_tensor.shape == ff_tensor.shape + mismatches = [] + if not np.allclose(hf_tensor, ff_tensor, atol=tolerance): + print(f"mismatch between hf_tensor and ff_tensor") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(mismatches) + len_hf_tensor = hf_tensor.flatten().shape[0] + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def are_np_arrays_identical(*np_arrays): + if len(np_arrays) < 2: + return True + + first = np_arrays[0] + + # Check shapes and dtypes + if not all( + t.shape == first.shape and t.dtype == first.dtype for t in np_arrays[1:] + ): + return False + + # Stack all tensors along a new axis + stacked = np.stack(np_arrays) + + # Check if all elements along the new axis are equal + return np.all(stacked == stacked[0]) + + +class TPType(Enum): + REPLICATE = 0 + PARTITION = 1 + TO_REDUCE = 2 + + +@dataclass +class TensorComparisonIdxs: + hf_tensor_type: str + ff_tensor_type: str + hf_tensor_idx: int + ff_tensor_idx: int + + +def replace_value(lst, old_value, new_value): + occurrences = lst.count(old_value) + if occurrences == 0: + raise ValueError(f"Value {old_value} not found in the list.") + elif occurrences > 1: + raise ValueError(f"Multiple instances of {old_value} found in the list.") + else: + index = lst.index(old_value) + lst[index] = new_value + return lst + + +def truncate_dimension(tensor, old_dim, new_dim): + # Check if old_dim appears exactly once in the tensor's shape + shape = tensor.shape + dim_occurrences = shape.count(old_dim) + + if dim_occurrences == 0: + raise ValueError(f"Dimension {old_dim} not found in the tensor shape.") + elif dim_occurrences > 1: + raise ValueError( + f"Multiple instances of dimension {old_dim} found in the tensor shape." + ) + + # Check if new_dim is less than or equal to old_dim + if new_dim > old_dim: + raise ValueError( + f"New dimension ({new_dim}) must be less than or equal to old dimension ({old_dim})." 
+ ) + + # Find the index of the dimension to truncate + dim_index = shape.index(old_dim) + + # Create a slice object for truncation + slices = [slice(None)] * len(shape) + slices[dim_index] = slice(0, new_dim) + + # Truncate the tensor + truncated_tensor = tensor[tuple(slices)] + + return truncated_tensor diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb new file mode 100644 index 0000000000..86a4ef76c4 --- /dev/null +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -0,0 +1,2651 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/FlexFlow/tests/peft/hf_peft_tensors /usr/FlexFlow/build/inference_tensors\n" + ] + } + ], + "source": [ + "print(hf_path, ff_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check weights (semi-automatically)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "def convert_hf_filename_to_ff_filename(f, num_layers=12):\n", + " if f.endswith(\".lm_head.weight\"):\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", + " elif f == \"norm.weight\":\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", + " else:\n", + " f_version = \"fwd_step_0_\"\n", + " if f.startswith(\"layers.\"):\n", + " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", + " f_version += f\"layers_{layernum}_\"\n", + " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", + " weight_index=\"0\"\n", + " if \"lora_A\" in f_version:\n", + " weight_index=\"A\"\n", + " elif \"lora_B\" in f_version:\n", + " weight_index=\"B\"\n", + " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", + " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " return f_version\n", + "\n", + "files_list = os.listdir(hf_path)\n", + "num_layers=12\n", + "for f in sorted(files_list):\n", + " if f.endswith(\".weight\"):\n", + " if \"self_attn\" in f:\n", + " continue\n", + " f_version = convert_hf_filename_to_ff_filename(f, num_layers=num_layers)\n", + " # print(f, f_version)\n", + " hf_w_path = os.path.join(hf_path, f)\n", + " ff_w_path = os.path.join(ff_path, f_version)\n", + " assert(os.path.isfile(hf_w_path))\n", + " assert(os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", os.path.isfile(hf_w_path), os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", 
ff_w_path)\n", + "\n", + " # check equivalence\n", + " compare_tensors(hf_w_path, ff_w_path, tolerance=1e-5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load model for automatic check" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import PeftModel, PeftConfig\n", + "use_full_precision=True\n", + "peft_model_id=\"goliaro/llama-160m-lora\"\n", + "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", + "if peft_config.peft_type != \"LORA\":\n", + " raise ValueError(f\"PEFT type {peft_config.peft_type} not supported yet\")\n", + "\n", + "peft_config.init_lora_weights = (\n", + " False\n", + ") # prevent HF from re-inizialing the weights randomly\n", + "model_name = peft_config.base_model_name_or_path\n", + "# Load base model, and apply the PEFT layer\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " torch_dtype=torch.float32 if use_full_precision else torch.float16,\n", + " device_map=\"auto\",\n", + ")\n", + "model = PeftModel.from_pretrained(model, peft_model_id, config=peft_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embed_tokens True True\n", + "layers.0.self_attn.q_proj True True\n", + "layers.0.self_attn.k_proj True True\n", + "layers.0.self_attn.v_proj True True\n", + "layers.0.self_attn.o_proj True True\n", + "layers.0.self_attn.rotary_emb True True\n", + "layers.0.mlp.gate_proj True True\n", + "layers.0.mlp.up_proj True True\n", + "layers.0.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_A False False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.0.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_output_0\n", + "layers.0.input_layernorm True True\n", + "layers.0.post_attention_layernorm True True\n", + "layers.1.self_attn.q_proj True True\n", + "layers.1.self_attn.k_proj True True\n", + "layers.1.self_attn.v_proj True True\n", + "layers.1.self_attn.o_proj True True\n", + "layers.1.self_attn.rotary_emb True True\n", + "layers.1.mlp.gate_proj True True\n", + "layers.1.mlp.up_proj True True\n", + "layers.1.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.1.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_output_0\n", + "layers.1.input_layernorm True True\n", + "layers.1.post_attention_layernorm True True\n", + "layers.2.self_attn.q_proj True True\n", + "layers.2.self_attn.k_proj True True\n", + "layers.2.self_attn.v_proj True True\n", + "layers.2.self_attn.o_proj True True\n", + "layers.2.self_attn.rotary_emb True True\n", + "layers.2.mlp.gate_proj True True\n", + "layers.2.mlp.up_proj True True\n", + 
"layers.2.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.2.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_output_0\n", + "layers.2.input_layernorm True True\n", + "layers.2.post_attention_layernorm True True\n", + "layers.3.self_attn.q_proj True True\n", + "layers.3.self_attn.k_proj True True\n", + "layers.3.self_attn.v_proj True True\n", + "layers.3.self_attn.o_proj True True\n", + "layers.3.self_attn.rotary_emb True True\n", + "layers.3.mlp.gate_proj True True\n", + "layers.3.mlp.up_proj True True\n", + "layers.3.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.3.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_output_0\n", + "layers.3.input_layernorm True True\n", + "layers.3.post_attention_layernorm True True\n", + "layers.4.self_attn.q_proj True True\n", + "layers.4.self_attn.k_proj True True\n", + "layers.4.self_attn.v_proj True True\n", + "layers.4.self_attn.o_proj True True\n", + "layers.4.self_attn.rotary_emb True True\n", + "layers.4.mlp.gate_proj True True\n", + "layers.4.mlp.up_proj True True\n", + "layers.4.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.4.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_output_0\n", + "layers.4.input_layernorm True True\n", + "layers.4.post_attention_layernorm True True\n", + "layers.5.self_attn.q_proj True True\n", + "layers.5.self_attn.k_proj True True\n", + "layers.5.self_attn.v_proj 
True True\n", + "layers.5.self_attn.o_proj True True\n", + "layers.5.self_attn.rotary_emb True True\n", + "layers.5.mlp.gate_proj True True\n", + "layers.5.mlp.up_proj True True\n", + "layers.5.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.5.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_output_0\n", + "layers.5.input_layernorm True True\n", + "layers.5.post_attention_layernorm True True\n", + "layers.6.self_attn.q_proj True True\n", + "layers.6.self_attn.k_proj True True\n", + "layers.6.self_attn.v_proj True True\n", + "layers.6.self_attn.o_proj True True\n", + "layers.6.self_attn.rotary_emb True True\n", + "layers.6.mlp.gate_proj True True\n", + "layers.6.mlp.up_proj True True\n", + "layers.6.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_B.default True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.6.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_output_0\n", + "layers.6.input_layernorm True True\n", + "layers.6.post_attention_layernorm True True\n", + "layers.7.self_attn.q_proj True True\n", + "layers.7.self_attn.k_proj True True\n", + "layers.7.self_attn.v_proj True True\n", + "layers.7.self_attn.o_proj True True\n", + "layers.7.self_attn.rotary_emb True True\n", + "layers.7.mlp.gate_proj True True\n", + "layers.7.mlp.up_proj True True\n", + "layers.7.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.7.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_output_0\n", + "layers.7.input_layernorm True True\n", + "layers.7.post_attention_layernorm True True\n", + 
"layers.8.self_attn.q_proj True True\n", + "layers.8.self_attn.k_proj True True\n", + "layers.8.self_attn.v_proj True True\n", + "layers.8.self_attn.o_proj True True\n", + "layers.8.self_attn.rotary_emb True True\n", + "layers.8.mlp.gate_proj True True\n", + "layers.8.mlp.up_proj True True\n", + "layers.8.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.8.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_output_0\n", + "layers.8.input_layernorm True True\n", + "layers.8.post_attention_layernorm True True\n", + "layers.9.self_attn.q_proj True True\n", + "layers.9.self_attn.k_proj True True\n", + "layers.9.self_attn.v_proj True True\n", + "layers.9.self_attn.o_proj True True\n", + "layers.9.self_attn.rotary_emb True True\n", + "layers.9.mlp.gate_proj True True\n", + "layers.9.mlp.up_proj True True\n", + "layers.9.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.9.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_output_0\n", + "layers.9.input_layernorm True True\n", + "layers.9.post_attention_layernorm True True\n", + "layers.10.self_attn.q_proj True True\n", + "layers.10.self_attn.k_proj True True\n", + "layers.10.self_attn.v_proj True True\n", + "layers.10.self_attn.o_proj True True\n", + "layers.10.self_attn.rotary_emb True True\n", + "layers.10.mlp.gate_proj True True\n", + "layers.10.mlp.up_proj True True\n", + "layers.10.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.10.mlp.act_fn True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_output_0\n", + "layers.10.input_layernorm True True\n", + "layers.10.post_attention_layernorm True True\n", + "layers.11.self_attn.q_proj True True\n", + "layers.11.self_attn.k_proj True True\n", + "layers.11.self_attn.v_proj True True\n", + "layers.11.self_attn.o_proj True True\n", + "layers.11.self_attn.rotary_emb True True\n", + "layers.11.mlp.gate_proj True True\n", + "layers.11.mlp.up_proj True True\n", + "layers.11.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.11.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_output_0\n", + "layers.11.input_layernorm True True\n", + "layers.11.post_attention_layernorm True True\n", + "norm True True\n", + "lm_head True True\n" + ] + } + ], + "source": [ + "named_modules_ = [\n", + " name.replace(\"base_model.model.model.\", \"\").replace(\"base_model.model.model\", \"\").replace(\"base_model.model.\", \"\").replace(\"base_model.model\", \"\").replace(\"base_model.\", \"\").replace(\"base_model\", \"\")\n", + " for name, _ in model.named_modules()\n", + "]\n", + "\n", + "def remove_prefixes(named_modules):\n", + " i = 0\n", + " while i < len(named_modules) - 1:\n", + " if named_modules[i + 1].startswith(named_modules[i]):\n", + " named_modules.pop(i)\n", + " else:\n", + " i += 1\n", + " return named_modules\n", + "named_modules = remove_prefixes(named_modules_)\n", + "\n", + "def convert_hf_module_name_to_ff_filenames(n, num_layers=12):\n", + " if n == \"embed_tokens\":\n", + " ff_in_name = 
\"fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + " ff_out_name = \"fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + " elif n == \"lm_head\" or n == \"norm\":\n", + " ff_in_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_output_0\"\n", + " elif n.startswith(\"layers.\"):\n", + " layernum = n.split(\"layers.\")[1].split(\".\")[0]\n", + " ff_in_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_output_0\"\n", + " else:\n", + " assert False, f\"Module {n} not supported yet\"\n", + " return os.path.join(ff_path, ff_in_name), os.path.join(ff_path, ff_out_name)\n", + "\n", + "# Compute the hf path, check if the input and output are there\n", + "for n in named_modules:\n", + " in_name = f\"fwd_step_0_{n}.input_0\"\n", + " out_name = f\"fwd_step_0_{n}.output_0\"\n", + " if n == \"lm_head\":\n", + " in_name = f\"fwd_step_0_base_model.model.{n}.input_0\"\n", + " out_name = f\"fwd_step_0_base_model.model.{n}.output_0\"\n", + " hf_mod_in = os.path.join(hf_path, in_name)\n", + " hf_mod_out = os.path.join(hf_path, out_name)\n", + " check = os.path.exists(hf_mod_in) and os.path.exists(hf_mod_out)\n", + " \n", + " check2=True\n", + " if \"self_attn\" not in n:\n", + " ff_mod_in, ff_mod_out = convert_hf_module_name_to_ff_filenames(n, num_layers=num_layers)\n", + " check2 = os.path.exists(ff_mod_in) and os.path.exists(ff_mod_out)\n", + " print(n, check, check2)\n", + " if not check2:\n", + " print(\"\\t\", ff_mod_in, ff_mod_out)\n", + " # print(n, check)\n", + " # print(\"\\t\", )\n", + " \n", + "\n", + "# Compute the corresponding ff path, check if the input and output are there\n", + "\n", + "# for x in named_modules:\n", + "# print(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'down_proj'}\n" + ] + } + ], + "source": [ + "print(model.peft_config['default'].target_modules)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual check" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "hf_embed_input= \"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.input_0\"\n", + "ff_embed_input=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + "compare_tensors(hf_embed_input, ff_embed_input)\n", + "hf_embed_output=\"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.output_0\"\n", + "ff_embed_output=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + "compare_tensors(hf_embed_output, ff_embed_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.10.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.06630182 6.3429456\n", + " -0.21220279]\n", + "FF:[ 0. 0. 0. ... 0.06630275 6.34293985\n", + " -0.21219885]\n", + "[ True True True ... True True True]\n", + "[15889]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.14172177 9.79423\n", + " -6.2940273 ]\n", + "FF:[ 0. 0. 0. ... 0.14172006 9.79421902\n", + " -6.29402065]\n", + "[ True True True ... 
True True True]\n", + "[ 2878 3206 3367 3607 5183 5346 6257 6544 7466 7679 7805 8119\n", + " 8159 8911 9450 9897 13696 13938 14058 14599 15126 15839 16128 16195]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for i in range(tot_num_layers):\n", + " hf_input_ln_in = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0\"\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0\"\n", + " if i > 0:\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " compare_tensors(hf_input_ln_in, ff_input_ln_in, tolerance=1e-5)\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " if i > 0:\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out, tolerance=1e-5)\n", + " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out, tolerance=1e-5)\n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out, tolerance=1e-5)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out, tolerance=1e-5)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out, tolerance=1e-5)\n", + " # w2\n", + " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = 
f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out, tolerance=1e-5)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out, tolerance=1e-5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- LM head --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Final Norm --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "# ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "print(\"-- LM head --\")\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "print(\"-- Final Norm --\")\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + 
"compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_path}/norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_feed_forward_w2_shard_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_attention_shard_0_o_proj_in_grad\n", + "HF: [ 1.2223595e+06 -2.6348565e+06 -5.0760525e+05 ... 
6.8275871e+01\n", + " -5.8116108e+01 9.5347488e+01]\n", + "FF:[ 1.22235925e+06 -2.63485625e+06 -5.07605000e+05 ... 6.82758865e+01\n", + " -5.81161423e+01 9.53475494e+01]\n", + "[ True True True ... True True True]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.22235950e+06 9.93645859e+01 -2.82157593e+01 ... -3.94578514e+01\n", + " -1.98409653e+01 -1.33438044e+01]\n", + " [-2.63485650e+06 -1.13461929e+02 1.14223976e+02 ... 7.52578735e+01\n", + " 1.33362747e+02 6.78501587e+01]\n", + " [-5.07605250e+05 4.34111862e+01 8.10619354e+01 ... 4.70537224e+01\n", + " 4.02149696e+01 6.98045502e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295319e+02 9.98417091e+00 ... 4.90895653e+01\n", + " 9.71413574e+01 6.82758713e+01]\n", + " [-3.64456375e+06 -2.43692596e+02 -6.85474396e+00 ... -3.71503868e+01\n", + " -1.34136658e+01 -5.81161079e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64005566e+00 ... 2.11662292e+00\n", + " 3.37400856e+01 9.53474884e+01]]\n", + "FF:[[ 1.22235925e+06 9.93645630e+01 -2.82157211e+01 ... -3.94577713e+01\n", + " -1.98408775e+01 -1.33438234e+01]\n", + " [-2.63485625e+06 -1.13461960e+02 1.14224037e+02 ... 7.52577744e+01\n", + " 1.33362701e+02 6.78501205e+01]\n", + " [-5.07605000e+05 4.34111404e+01 8.10619278e+01 ... 4.70536804e+01\n", + " 4.02149124e+01 6.98045578e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295227e+02 9.98412323e+00 ... 4.90895386e+01\n", + " 9.71413727e+01 6.82758865e+01]\n", + " [-3.64456400e+06 -2.43692627e+02 -6.85472488e+00 ... -3.71504822e+01\n", + " -1.34137001e+01 -5.81161423e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64004517e+00 ... 2.11670875e+00\n", + " 3.37400322e+01 9.53475494e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.2223588e+06 -2.6348530e+06 -5.0760291e+05 ... 3.0279325e+06\n", + " -3.6445672e+06 3.3192180e+06]\n", + " [-4.2496326e+02 1.1576636e+03 9.8397858e+02 ... 1.6480791e+03\n", + " -5.9697235e+02 6.2627173e+02]\n", + " [-2.2012039e+01 6.6097900e+01 3.9933994e+01 ... 5.7103355e+01\n", + " -1.5968766e+01 3.6536639e+00]\n", + " ...\n", + " [-1.2302110e+00 5.3052688e+00 2.1982718e+00 ... 1.3990868e+00\n", + " -5.5132383e-01 4.8985812e-01]\n", + " [-1.0771493e+00 6.9571300e+00 2.7373023e+00 ... 4.9663010e+00\n", + " -9.9705428e-01 2.1829298e+00]\n", + " [-5.9534687e-01 3.0272012e+00 3.1143982e+00 ... 2.4072502e+00\n", + " -2.0490403e+00 3.3617332e+00]]\n", + "FF:[[ 1.22235850e+06 -2.63485275e+06 -5.07602656e+05 ... 3.02793250e+06\n", + " -3.64456750e+06 3.31921800e+06]\n", + " [-4.24962585e+02 1.15766296e+03 9.83978577e+02 ... 1.64807898e+03\n", + " -5.96972351e+02 6.26271790e+02]\n", + " [-2.20120354e+01 6.60979462e+01 3.99340210e+01 ... 5.71033745e+01\n", + " -1.59687757e+01 3.65366316e+00]\n", + " ...\n", + " [-1.23020661e+00 5.30526114e+00 2.19826817e+00 ... 1.39908671e+00\n", + " -5.51325083e-01 4.89858717e-01]\n", + " [-1.07714510e+00 6.95712519e+00 2.73729825e+00 ... 4.96630049e+00\n", + " -9.97055829e-01 2.18292713e+00]\n", + " [-5.95347941e-01 3.02720070e+00 3.11439991e+00 ... 2.40725493e+00\n", + " -2.04904509e+00 3.36174107e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... 
True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.4363425925925934% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", + " 2.38160934e+01 3.15938339e+01]\n", + " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... -3.86393509e+01\n", + " 2.14816055e+01 -6.58599396e+01]\n", + " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", + " -1.68612709e+01 6.02474251e+01]\n", + " ...\n", + " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 3.59794388e+01\n", + " 5.03902206e+01 4.19777756e+01]\n", + " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", + " 3.84094887e+01 -7.51948738e+00]\n", + " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", + " 1.77629051e+01 9.76780853e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52523600e+06 -1.27625293e+03 -4.39336700e+01 ... -3.34414597e+01\n", + " 2.38162422e+01 3.15938187e+01]\n", + " [-9.55138900e+06 6.71377319e+02 2.06871674e+02 ... -3.86393127e+01\n", + " 2.14817867e+01 -6.58600464e+01]\n", + " [ 1.14522660e+07 2.19898950e+03 -6.89660644e+00 ... 9.51594448e+00\n", + " -1.68611774e+01 6.02474518e+01]\n", + " ...\n", + " [ 2.10891850e+06 3.78648633e+03 1.02701196e+03 ... 3.59794846e+01\n", + " 5.03901253e+01 4.19777679e+01]\n", + " [ 2.11695400e+06 -2.36282440e+02 -1.08002762e+02 ... 9.36448860e+00\n", + " 3.84096107e+01 -7.51954842e+00]\n", + " [ 7.39155000e+06 1.11731921e+03 3.38370087e+02 ... 3.70398293e+01\n", + " 1.77627277e+01 9.76782227e+01]]\n", + "6.011284722222222% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750 900 1198 1249\n", + " 1287 1305 1414 1428 1490 1588 1600 1612 1625 1657 1676 1677\n", + " 1692 1694 1724 1730 1772 1822 1825 1838 1853 1910 2035 2043\n", + " 2053 2059 2073 2078 2123 2145 2214 2238 2241 2285 2292 2389\n", + " 2542 2582 2589 2599 2674 2688 2711 2840 2856 2961 2963 2980\n", + " 3064 3176 3192 3255 3262 3278 3338 3341 3412 3419 3492 3590\n", + " 3624 3646 3657 3807 3840 3842 3846 3883 3887 4005 4049 4071\n", + " 4076 4077 4079 4137 4142 4192 4193 4202 4218 4224 4273 4355\n", + " 4358 4381 4401 4435 4469 4499 4514 4546 4598 4619 4747 4846\n", + " 4872 4916 4952 4966 5016 5067 5107 5112 5116 5194 5225 5350\n", + " 5364 5403 5515 5537 5550 5578 5650 5653 5654 5736 5751 5837\n", + " 5870 5881 5972 5998 6006 6051 6061 6107 6129 6204 6236 6292\n", + " 6296 6327 6382 6393 6403 6420 6424 6436 6468 6542 6599 6675\n", + " 6681 6711 6723 6767 6823 6914 6983 7047 7064 7133 7167 7197\n", + " 7198 7209 7528 7537 7538 7686 7850 7855 7889 7910 7919 7927\n", + " 7937 7939 8089 8101 8157 8169 8175 8223 8292 8304 8306 8342\n", + " 8351 8414 8475 8500 8543 8558 8609 8656 8687 8704 8724 8726\n", + " 8777 8816 8826 8871 8904 8934 8983 9012 9033 9043 9068 9093\n", + " 9125 9133 9144 9151 9154 9217 9222 9320 9335 9367 9398 9421\n", + " 9434 9521 9547 9633 9702 9726 9763 9949 10018 10053 10062 10079\n", + " 10137 10149 10203 10261 10269 10292 10312 10332 10471 10478 10514 10596\n", + " 10645 10676 10678 10781 10795 10810 10833 10891 10904 10935 10957 10977\n", + " 10982 11028 11095 11172 11223 11251 11283 11303 11319 11374 11392 11437\n", + " 11486 11627 11678 11750 11759 11979 11996 12019 12126 12237 12262 12288\n", + " 12303 12309 12315 12387 12543 12569 12613 12648 12786 12852 12866 12879\n", + " 12947 12963 13037 13058 13261 13284 13312 13394 13399 13427 13526 13527\n", + " 13592 13695 13741 13752 13775 13803 13812 13866 13902 14049 14170 14241\n", + " 14354 14382 14426 14451 14455 14486 14502 14582 14820 14934 14961 14976\n", + " 15000 15003 15014 15077 15096 15108 15135 15148 15165 15219 15232 15290\n", + " 15339 15345 15819 15945 15994 16077 16135 16218 16231 16233 16239 16243\n", + " 16295 16311 16339 16356 16366 16417 16456 16498 16502 16503 16506 16547\n", + " 16585 16603 16611 16633 16661 16683 16704 16710 16723 16724 16745 16754\n", + " 16773 16787 16789 16818 16829 16833 16913 16933 17025 17033 17037 17055\n", + " 17084 17098 17109 17176 17225 17240 17292 17294 17339 17390 17427 17437\n", + " 17579 17626 17630 17654 17719 17902 17912 18023 18025 18124 18203 18339\n", + " 18344]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 4.7819588e+07 3.8833264e+07 4.7789860e+07 ... 1.0804405e+00\n", + " 2.7186510e-01 -2.9918199e+00]\n", + "FF:[ 4.78195960e+07 3.88332640e+07 4.77898600e+07 ... 1.08044124e+00\n", + " 2.71864563e-01 -2.99182224e+00]\n", + "[ True True True ... True True True]\n", + "[ 109 211 312 422 590 832 835 1016 1053 1076 1268 1353 1374 1693\n", + " 1701 1710 1722 1832 1954 1965 1997 2076 2124 2146 2378 2520 2605 2624\n", + " 2967 3007 3015]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_input_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 9.3464905e+01\n", + " 7.5613129e+01 7.6598846e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 9.34649200e+01\n", + " 7.56131058e+01 7.65989227e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_o_proj_in_grad\n", + "HF: [-9.4470595e+09 -7.3870331e+09 1.2659395e+10 ... -2.8149616e+01\n", + " 1.7019112e+02 -7.7236428e+00]\n", + "FF:[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... -2.81496239e+01\n", + " 1.70191177e+02 -7.72364044e+00]\n", + "[ True True True ... True True True]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 2.28078384e+01 3.18554016e+02 ... 1.17267204e+02\n", + " 2.06791725e+01 1.13138672e+02]\n", + " [-7.38703309e+09 -7.36898804e+00 7.93705673e+01 ... 2.04039650e+01\n", + " 3.18331490e+01 5.44241562e+01]\n", + " [ 1.26593946e+10 1.77534424e+02 -2.97175941e+01 ... 1.16716766e+01\n", + " 7.70214081e+01 2.81902496e+02]\n", + " ...\n", + " [ 4.51210445e+10 3.63867615e+02 -8.04915466e+01 ... -1.34332123e+02\n", + " -1.22151840e+02 -2.81496162e+01]\n", + " [-1.39591885e+10 1.59216873e+02 6.11343079e+01 ... 1.56675262e+02\n", + " 9.68551483e+01 1.70191116e+02]\n", + " [-1.29442345e+10 -2.39441833e+02 2.73647644e+02 ... -4.41197014e+01\n", + " -9.48526230e+01 -7.72364283e+00]]\n", + "FF:[[-9.44706150e+09 2.28079376e+01 3.18553864e+02 ... 1.17267227e+02\n", + " 2.06791859e+01 1.13138741e+02]\n", + " [-7.38703309e+09 -7.36921692e+00 7.93703690e+01 ... 2.04038925e+01\n", + " 3.18332825e+01 5.44241333e+01]\n", + " [ 1.26593966e+10 1.77534454e+02 -2.97174206e+01 ... 1.16717224e+01\n", + " 7.70213699e+01 2.81902618e+02]\n", + " ...\n", + " [ 4.51210527e+10 3.63867554e+02 -8.04915695e+01 ... -1.34332092e+02\n", + " -1.22151901e+02 -2.81496239e+01]\n", + " [-1.39591834e+10 1.59216995e+02 6.11343040e+01 ... 1.56675293e+02\n", + " 9.68551559e+01 1.70191177e+02]\n", + " [-1.29442304e+10 -2.39441772e+02 2.73647644e+02 ... -4.41196594e+01\n", + " -9.48526916e+01 -7.72364044e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 -7.38703309e+09 1.26593946e+10 ... 4.51210445e+10\n", + " -1.39591885e+10 -1.29442345e+10]\n", + " [ 1.14852783e+03 4.39543152e+02 1.07877356e+03 ... -2.42416113e+03\n", + " 2.64504834e+03 4.68633453e+02]\n", + " [ 5.72417107e+01 4.12602806e+01 -2.27319489e+01 ... -3.40788422e+01\n", + " 4.86237946e+01 1.25752163e+01]\n", + " ...\n", + " [ 6.76848269e+00 8.23165894e+00 2.10253639e+01 ... -3.19590777e-01\n", + " 3.68098617e-01 -1.95310101e-01]\n", + " [ 4.08574820e+00 5.33035660e+00 1.41003275e+01 ... -1.35607815e+00\n", + " 4.06074905e+00 -7.67630756e-01]\n", + " [ 2.03186665e+01 9.77407932e+00 5.06271019e+01 ... -6.80029154e-01\n", + " 4.11142111e+00 -1.86585218e-01]]\n", + "FF:[[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... 4.51210527e+10\n", + " -1.39591834e+10 -1.29442304e+10]\n", + " [ 1.14852808e+03 4.39542755e+02 1.07877344e+03 ... 
-2.42416138e+03\n", + " 2.64504932e+03 4.68633698e+02]\n", + " [ 5.72415771e+01 4.12602005e+01 -2.27318707e+01 ... -3.40787392e+01\n", + " 4.86236725e+01 1.25752039e+01]\n", + " ...\n", + " [ 6.76847696e+00 8.23167515e+00 2.10253181e+01 ... -3.19590837e-01\n", + " 3.68098557e-01 -1.95310280e-01]\n", + " [ 4.08574867e+00 5.33037567e+00 1.41003180e+01 ... -1.35607564e+00\n", + " 4.06074095e+00 -7.67629445e-01]\n", + " [ 2.03186874e+01 9.77407932e+00 5.06271439e+01 ... -6.80029511e-01\n", + " 4.11142349e+00 -1.86585203e-01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "6.640625% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-5.1505955e+10 -4.7166772e+03 -1.3288132e+02 ... -3.0123844e+00\n", + " -5.5234032e+01 6.0299168e+00]\n", + " [-3.5960029e+10 -5.3263096e+03 -1.9434322e+02 ... -5.6601189e+01\n", + " -1.0787462e+02 -6.0718418e+01]\n", + " [ 4.8131662e+10 1.1578307e+04 1.7744476e+02 ... -5.6970375e+01\n", + " -1.7497168e+01 -7.2297249e+00]\n", + " ...\n", + " [-9.0346426e+08 6.4752144e+03 3.2408417e+02 ... 6.1075470e+01\n", + " 8.5356834e+01 8.3221588e+01]\n", + " [-5.0754217e+09 -2.2929268e+03 -1.4913528e+02 ... 8.6639397e+01\n", + " 1.1156468e+02 1.0695674e+02]\n", + " [ 5.5844772e+09 3.0225920e+03 -6.3137859e+01 ... -6.5270996e+01\n", + " 8.2730171e+01 -1.0107367e+02]]\n", + "ff_attn_in: (768, 24)\n", + "[[-5.15059548e+10 -4.71667773e+03 -1.32881012e+02 ... -3.01225996e+00\n", + " -5.52339973e+01 6.02991867e+00]\n", + " [-3.59600292e+10 -5.32630957e+03 -1.94343079e+02 ... -5.66010437e+01\n", + " -1.07874649e+02 -6.07182846e+01]\n", + " [ 4.81316659e+10 1.15783076e+04 1.77444519e+02 ... -5.69703102e+01\n", + " -1.74972763e+01 -7.22990799e+00]\n", + " ...\n", + " [-9.03455232e+08 6.47521484e+03 3.24083832e+02 ... 6.10753632e+01\n", + " 8.53567886e+01 8.32217255e+01]\n", + " [-5.07543654e+09 -2.29292749e+03 -1.49135025e+02 ... 8.66392517e+01\n", + " 1.11564789e+02 1.06956917e+02]\n", + " [ 5.58446592e+09 3.02259229e+03 -6.31376152e+01 ... -6.52709351e+01\n", + " 8.27302551e+01 -1.01073837e+02]]\n", + "7.025824652777778% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415\n", + " 428 482 492 514 526 531 671 731 763 777 893 927\n", + " 984 1105 1184 1206 1418 1541 1548 1572 1577 1613 1619 1643\n", + " 1658 1661 1691 1701 1706 1726 1757 1784 1815 1833 1849 1856\n", + " 1880 1891 1921 1956 1969 2012 2021 2028 2030 2059 2065 2144\n", + " 2149 2183 2210 2238 2292 2342 2357 2384 2414 2495 2531 2565\n", + " 2597 2662 2713 2781 2821 2829 2877 2904 2921 2927 2962 2973\n", + " 3044 3066 3094 3100 3106 3159 3193 3251 3377 3389 3397 3427\n", + " 3436 3570 3594 3703 3729 3770 3772 3780 3811 3840 3842 3860\n", + " 3907 3920 3929 3946 3955 3969 4005 4009 4034 4048 4077 4089\n", + " 4104 4129 4134 4178 4202 4212 4219 4239 4245 4256 4273 4373\n", + " 4407 4463 4464 4465 4481 4511 4537 4541 4543 4549 4597 4599\n", + " 4633 4759 4760 4789 4846 4884 4901 4930 4954 4971 4993 5024\n", + " 5030 5041 5050 5116 5130 5163 5207 5224 5282 5313 5322 5349\n", + " 5363 5403 5410 5412 5454 5543 5581 5590 5654 5673 5784 5821\n", + " 5849 5880 5911 5917 5982 6000 6062 6165 6178 6193 6200 6272\n", + " 6322 6351 6366 6376 6380 6382 6393 6412 6420 6430 6433 6446\n", + " 6476 6482 6488 6490 6519 6527 6540 6556 6563 6567 6577 6600\n", + " 6619 6680 6709 6735 6768 6777 6780 6823 6825 6826 6830 6863\n", + " 6880 6912 6988 7006 7030 7071 7077 7102 7123 7244 7264 7367\n", + " 7389 7390 7434 7451 7452 7455 7505 7532 7539 7589 7598 7620\n", + " 7651 7653 7659 7709 7714 7740 7751 7759 7803 7808 7820 7917\n", + " 7923 7926 7949 7962 7966 7978 8002 8004 8040 8050 8052 8068\n", + " 8180 8223 8250 8253 8265 8341 8344 8375 8376 8386 8449 8468\n", + " 8501 8509 8522 8535 8585 8590 8593 8642 8657 8674 8687 8707\n", + " 8714 8726 8729 8737 8756 8769 8801 8846 8850 8865 8907 8998\n", + " 9018 9043 9059 9066 9083 9093 9098 9130 9131 9165 9189 9216\n", + " 9285 9337 9368 9526 9539 9563 9620 9659 9723 9793 9804 9817\n", + " 9820 9827 9908 9995 10053 10128 10135 10143 10205 10253 10274 10292\n", + " 10300 10311 10327 10356 10406 10441 10491 10494 10551 10562 10563 10634\n", + " 10649 10674 10710 10734 10821 10831 10833 10838 10845 10911 10966 10981\n", + " 10988 10990 10998 11008 11044 11049 11100 11127 11141 11197 11250 11269\n", + " 11285 11308 11361 11383 11437 11460 11494 11502 11511 11522 11546 11557\n", + " 11564 11588 11649 11658 11671 11674 11703 11729 11749 11759 11832 11892\n", + " 11979 11988 12000 12038 12063 12078 12107 12119 12165 12259 12269 12270\n", + " 12347 12369 12386 12415 12475 12518 12566 12569 12574 12652 12693 12792\n", + " 12833 12834 12852 12872 12900 12946 13117 13121 13124 13321 13345 13357\n", + " 13427 13431 13446 13473 13526 13635 13638 13662 13706 13733 13803 13807\n", + " 13852 13882 13912 13924 13962 13969 13986 14023 14036 14046 14085 14110\n", + " 14130 14141 14175 14183 14191 14220 14222 14223 14285 14310 14331 14336\n", + " 14354 14375 14425 14427 14451 14482 14493 14516 14560 14563 14581 14623\n", + " 14671 14677 14679 14680 14685 14688 14742 14799 14860 14868 14870 14872\n", + " 14900 14909 14916 14940 14964 14991 15003 15023 15027 15033 15038 15051\n", + " 15086 15100 15184 15214 15232 15290 15352 15363 15365 15407 15433 15451\n", + " 15522 15577 15707 15720 15725 15739 15830 15837 15875 15937 15965 15985\n", + " 16017 16054 16113 16136 16142 16169 16191 16232 16238 16250 16268 16282\n", + " 16285 16290 16295 16304 16327 16334 16353 16356 16363 16382 16403 16407\n", + " 16408 16409 16458 16459 16495 16497 16499 16500 16516 16532 16595 16603\n", + " 16611 16657 16678 16680 16695 16701 16704 16754 16768 
16807 16818 16856\n", + " 16870 16951 16971 16986 16989 16992 17048 17134 17181 17208 17217 17236\n", + " 17243 17319 17363 17398 17448 17471 17497 17557 17646 17654 17659 17692\n", + " 17754 17947 17957 17969 17975 18029 18128 18146 18196 18206 18207 18250\n", + " 18265 18313 18406]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 5.0590863e+10 3.7823513e+11 -5.0394451e+11 ... -5.5814421e-01\n", + " 2.2970559e-01 -1.2293311e+00]\n", + "FF:[ 5.05906831e+10 3.78235290e+11 -5.03944544e+11 ... -5.58144033e-01\n", + " 2.29705781e-01 -1.22933090e+00]\n", + "[ True True True ... True True True]\n", + "[ 189 254 317 418 515 546 577 634 636 675 712 808 1011 1030\n", + " 1080 1091 1132 1168 1254 1265 1285 1287 1354 1381 1427 1459 1506 1620\n", + " 1654 1752 1887 1897 1900 1937 1981 1985 1986 2003 2029 2152 2181 2295\n", + " 2395 2426 2445 2673 2687 2859 2947 2977 3037]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_input_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_output_0\n", + "HF: [-6.3320325e+13 -4.4365129e+13 6.3550937e+13 ... 7.2449814e+01\n", + " 8.6617142e+01 8.3981407e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 7.24498901e+01\n", + " 8.66170959e+01 8.39814606e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_o_proj_in_grad\n", + "HF: [ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... 2.5297220e+02\n", + " -8.1722275e+01 -7.0014725e+01]\n", + "FF:[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... 2.52972260e+02\n", + " -8.17222137e+01 -7.00146637e+01]\n", + "[ True True True ... True True True]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.28854608e+13 6.37500977e+02 2.96775421e+02 ... 8.35403061e+01\n", + " 1.72460327e+02 2.90482426e+01]\n", + " [-6.08358210e+13 -5.23222847e+01 -2.34542664e+02 ... -1.87500763e+01\n", + " -8.99429398e+01 8.64021378e+01]\n", + " [-7.97326117e+13 -4.24736328e+02 -1.82208099e+02 ... 3.21808720e+00\n", + " -5.87415466e+01 -2.08511108e+02]\n", + " ...\n", + " [-1.13411917e+14 -3.48418640e+02 1.52205795e+02 ... 1.51519928e+02\n", + " 2.45651031e+02 2.52972198e+02]\n", + " [-3.75985275e+12 2.39696625e+02 1.51989685e+02 ... -2.85605354e+01\n", + " -1.79121232e+00 -8.17222748e+01]\n", + " [ 1.11016038e+14 -1.96372967e+01 -1.27668396e+02 ... 3.35008011e+01\n", + " -7.46116943e+01 -7.00147247e+01]]\n", + "FF:[[ 7.28854608e+13 6.37500977e+02 2.96775513e+02 ... 8.35403976e+01\n", + " 1.72460068e+02 2.90483646e+01]\n", + " [-6.08357832e+13 -5.23225098e+01 -2.34542755e+02 ... -1.87501526e+01\n", + " -8.99431992e+01 8.64022217e+01]\n", + " [-7.97326201e+13 -4.24736572e+02 -1.82207733e+02 ... 3.21793270e+00\n", + " -5.87416573e+01 -2.08511139e+02]\n", + " ...\n", + " [-1.13411925e+14 -3.48418640e+02 1.52205902e+02 ... 1.51519714e+02\n", + " 2.45650864e+02 2.52972260e+02]\n", + " [-3.75988630e+12 2.39696686e+02 1.51989319e+02 ... -2.85606136e+01\n", + " -1.79138493e+00 -8.17222137e+01]\n", + " [ 1.11016046e+14 -1.96372318e+01 -1.27668480e+02 ... 3.35009079e+01\n", + " -7.46116791e+01 -7.00146637e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... 
True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... -1.1341192e+14\n", + " -3.7598527e+12 1.1101604e+14]\n", + " [ 3.3241980e+03 -6.3044128e+02 -3.0447307e+03 ... 3.0137921e+02\n", + " 3.8262988e+02 -4.2889914e+02]\n", + " [ 3.5639046e+01 -1.6155790e+01 -2.4461178e+01 ... 2.7450909e+02\n", + " 1.6181946e+02 -2.5407137e+02]\n", + " ...\n", + " [ 4.6487908e+00 -9.6633381e-01 -2.7078497e-01 ... 3.6374569e+01\n", + " -1.7563061e+00 -7.1206141e+00]\n", + " [ 1.8901447e+00 8.9006472e-01 -4.3125896e+00 ... 2.6014965e+01\n", + " -3.7720141e-01 -7.8855257e+00]\n", + " [ 1.9513500e+00 5.8041654e+00 -1.4006979e+01 ... 7.2743622e+01\n", + " -2.3499712e+01 -2.0133139e+01]]\n", + "FF:[[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... -1.13411925e+14\n", + " -3.75988630e+12 1.11016046e+14]\n", + " [ 3.32419922e+03 -6.30442505e+02 -3.04472998e+03 ... 3.01379364e+02\n", + " 3.82629669e+02 -4.28898712e+02]\n", + " [ 3.56390572e+01 -1.61558037e+01 -2.44611683e+01 ... 2.74509308e+02\n", + " 1.61819229e+02 -2.54071594e+02]\n", + " ...\n", + " [ 4.64879847e+00 -9.66338813e-01 -2.70792574e-01 ... 3.63745117e+01\n", + " -1.75632846e+00 -7.12060070e+00]\n", + " [ 1.89013767e+00 8.90062451e-01 -4.31257772e+00 ... 2.60149212e+01\n", + " -3.77217919e-01 -7.88551569e+00]\n", + " [ 1.95135939e+00 5.80417490e+00 -1.40069904e+01 ... 7.27435226e+01\n", + " -2.34996586e+01 -2.01330910e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.609953703703703% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-1.17282076e+14 -2.12461621e+03 8.80099030e+01 ... 4.34470520e+01\n", + " 7.55885468e+01 -2.88791332e+01]\n", + " [-2.07757936e+14 -3.81796265e+02 -2.33774780e+02 ... 8.11984329e+01\n", + " -4.41825638e+01 7.35064125e+00]\n", + " [ 4.11484165e+13 2.50572113e+02 1.91601822e+02 ... 1.00269365e+01\n", + " -3.41638985e+01 1.20433075e+02]\n", + " ...\n", + " [ 7.95562329e+13 1.55007373e+03 1.70351212e+02 ... -1.80320053e+01\n", + " 8.77533417e+01 2.14678173e+01]\n", + " [-1.86546485e+14 -5.18847070e+03 -3.34331085e+02 ... 2.51586838e+01\n", + " -4.06135368e+01 -6.27860641e+00]\n", + " [ 1.89751705e+14 -3.09853809e+03 -1.18278351e+01 ... -1.24640663e+02\n", + " 1.59719009e+01 -6.47173615e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-1.17282034e+14 -2.12461694e+03 8.80101547e+01 ... 4.34468918e+01\n", + " 7.55886002e+01 -2.88791542e+01]\n", + " [-2.07757920e+14 -3.81795776e+02 -2.33774765e+02 ... 8.11985397e+01\n", + " -4.41825829e+01 7.35066986e+00]\n", + " [ 4.11484543e+13 2.50570099e+02 1.91601196e+02 ... 1.00270777e+01\n", + " -3.41638451e+01 1.20433121e+02]\n", + " ...\n", + " [ 7.95562413e+13 1.55007288e+03 1.70350784e+02 ... -1.80321960e+01\n", + " 8.77533112e+01 2.14678249e+01]\n", + " [-1.86546469e+14 -5.18847070e+03 -3.34331268e+02 ... 
2.51588135e+01\n", + " -4.06132622e+01 -6.27861023e+00]\n", + " [ 1.89751521e+14 -3.09853711e+03 -1.18275299e+01 ... -1.24640862e+02\n", + " 1.59719791e+01 -6.47173767e+01]]\n", + "7.530381944444445% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181\n", + " 226 261 284 318 320 378 382 385 391 395 403 422\n", + " 434 495 515 523 524 549 579 610 644 710 764 772\n", + " 870 984 987 1045 1249 1330 1362 1489 1517 1550 1556 1588\n", + " 1595 1659 1672 1684 1689 1768 1792 1799 1808 1818 1842 1871\n", + " 1889 1899 1910 1915 1925 1936 1993 1997 2033 2041 2059 2062\n", + " 2066 2098 2111 2124 2129 2130 2146 2153 2159 2166 2197 2206\n", + " 2210 2212 2222 2234 2237 2320 2321 2357 2359 2362 2385 2428\n", + " 2518 2539 2553 2568 2598 2683 2689 2694 2711 2714 2733 2787\n", + " 2788 2795 2811 2815 2853 2881 2890 2917 2981 2997 3021 3037\n", + " 3089 3149 3163 3191 3196 3217 3225 3248 3277 3287 3292 3305\n", + " 3327 3361 3385 3402 3417 3425 3456 3479 3516 3521 3528 3555\n", + " 3587 3599 3608 3684 3702 3733 3770 3779 3819 3822 3823 3898\n", + " 3921 3942 3950 4012 4053 4077 4086 4091 4139 4185 4198 4225\n", + " 4241 4296 4347 4349 4368 4403 4407 4418 4453 4471 4472 4473\n", + " 4494 4537 4549 4555 4558 4598 4623 4648 4666 4698 4729 4782\n", + " 4848 4866 4886 4943 4959 5008 5010 5012 5057 5079 5177 5178\n", + " 5186 5211 5271 5281 5296 5313 5328 5356 5364 5409 5429 5440\n", + " 5453 5455 5457 5476 5529 5563 5591 5621 5625 5631 5654 5661\n", + " 5692 5705 5720 5740 5751 5758 5787 5799 5813 5835 5836 5867\n", + " 5872 5893 5953 5974 5980 5982 6000 6055 6082 6086 6102 6107\n", + " 6123 6159 6172 6193 6220 6230 6231 6263 6286 6297 6362 6396\n", + " 6401 6430 6436 6485 6497 6499 6502 6510 6537 6554 6555 6563\n", + " 6564 6579 6586 6598 6615 6625 6626 6649 6651 6661 6754 6764\n", + " 6776 6852 6863 6874 6883 6892 6913 6945 6969 7036 7057 7066\n", + " 7082 7138 7147 7150 7157 7197 7202 7231 7234 7235 7240 7270\n", + " 7278 7287 7322 7327 7345 7348 7361 7390 7402 7490 7539 7573\n", + " 7610 7714 7721 7758 7794 7812 7827 7829 7837 7839 7882 7894\n", + " 7943 7948 7952 7969 7975 7996 8024 8027 8037 8043 8055 8078\n", + " 8079 8088 8090 8095 8154 8258 8264 8283 8297 8313 8329 8336\n", + " 8359 8361 8376 8383 8416 8421 8428 8454 8475 8502 8521 8613\n", + " 8642 8653 8696 8756 8764 8777 8791 8837 8849 8859 8878 8955\n", + " 8991 8997 9006 9012 9040 9066 9093 9097 9098 9131 9158 9162\n", + " 9165 9214 9216 9280 9297 9301 9316 9355 9371 9412 9421 9475\n", + " 9510 9580 9620 9645 9696 9713 9732 9768 9802 9817 9819 9826\n", + " 9839 9846 9947 10004 10062 10065 10072 10103 10107 10108 10138 10167\n", + " 10173 10228 10262 10292 10326 10356 10360 10372 10421 10446 10466 10468\n", + " 10499 10505 10513 10517 10589 10606 
10612 10645 10664 10669 10726 10777\n", + " 10835 10838 10839 10848 10855 10877 10897 10941 10963 10971 10977 10997\n", + " 11030 11060 11065 11076 11088 11140 11167 11174 11231 11252 11257 11259\n", + " 11275 11297 11302 11319 11331 11333 11357 11358 11380 11382 11402 11423\n", + " 11446 11447 11500 11501 11522 11585 11623 11670 11728 11736 11759 11761\n", + " 11772 11785 11839 11894 11916 11924 11936 11962 11968 11969 11977 11984\n", + " 12008 12030 12054 12074 12123 12175 12182 12194 12237 12262 12282 12285\n", + " 12341 12348 12351 12370 12376 12386 12399 12449 12507 12513 12518 12522\n", + " 12549 12572 12643 12648 12663 12689 12696 12710 12769 12780 12788 12792\n", + " 12793 12852 12864 12879 12884 12985 13018 13041 13057 13176 13264 13272\n", + " 13274 13275 13292 13303 13333 13379 13427 13428 13442 13451 13454 13500\n", + " 13510 13533 13564 13588 13607 13640 13655 13686 13687 13688 13732 13747\n", + " 13786 13801 13803 13826 13841 13846 13850 13892 13909 13946 14036 14040\n", + " 14046 14060 14080 14152 14161 14183 14195 14210 14240 14278 14331 14354\n", + " 14370 14372 14386 14395 14409 14432 14434 14497 14506 14531 14559 14589\n", + " 14648 14663 14686 14698 14715 14743 14757 14799 14808 14810 14849 14893\n", + " 14902 14929 14937 14947 14953 14958 15005 15012 15018 15036 15066 15069\n", + " 15083 15152 15154 15196 15197 15212 15292 15309 15323 15340 15343 15375\n", + " 15389 15396 15408 15410 15454 15499 15532 15557 15605 15647 15677 15736\n", + " 15745 15756 15769 15809 15824 15876 15882 15900 15906 15941 16027 16030\n", + " 16040 16116 16190 16192 16205 16207 16239 16279 16285 16295 16348 16358\n", + " 16367 16384 16386 16394 16399 16455 16457 16458 16471 16495 16500 16502\n", + " 16520 16541 16542 16598 16623 16643 16651 16665 16673 16679 16713 16725\n", + " 16734 16736 16739 16751 16756 16768 16861 16870 16939 16976 17007 17028\n", + " 17040 17069 17087 17108 17125 17139 17151 17158 17174 17175 17178 17182\n", + " 17189 17221 17258 17341 17360 17370 17381 17395 17396 17415 17432 17450\n", + " 17463 17470 17472 17473 17496 17507 17536 17608 17626 17627 17649 17653\n", + " 17664 17771 17815 17822 17831 17864 17883 17931 17994 17999 18035 18174\n", + " 18209 18250 18274 18307 18327 18403 18423]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 6.5550952e+14 4.9376585e+14 3.8510841e+14 ... 1.6802770e+00\n", + " -1.1248941e+00 -1.1701980e+00]\n", + "FF:[ 6.55509317e+14 4.93765882e+14 3.85108377e+14 ... 1.68027747e+00\n", + " -1.12489426e+00 -1.17019880e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 6 79 111 149 155 168 187 195 220 223 252 261 329 343\n", + " 347 369 386 392 403 438 439 450 461 524 535 643 656 659\n", + " 661 668 722 727 732 742 754 801 816 820 835 837 849 850\n", + " 978 993 997 1012 1019 1034 1044 1071 1088 1094 1114 1135 1151 1170\n", + " 1190 1212 1273 1275 1277 1289 1290 1308 1311 1337 1364 1379 1394 1430\n", + " 1454 1460 1469 1474 1703 1725 1728 1732 1733 1741 1754 1757 1804 1806\n", + " 1856 1862 1932 1945 1996 2030 2044 2045 2065 2071 2075 2094 2149 2152\n", + " 2163 2180 2182 2215 2254 2357 2362 2370 2392 2398 2428 2484 2519 2521\n", + " 2524 2582 2618 2641 2645 2664 2674 2681 2691 2735 2747 2779 2872 2899\n", + " 2909 2935 2957 3000 3033]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_input_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 3.5121140e+01\n", + " -3.5587997e+00 9.5641022e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 3.51211472e+01\n", + " -3.55898285e+00 9.56410980e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_o_proj_in_grad\n", + "HF: [-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... -2.5844165e+02\n", + " 2.0677340e+01 -2.4573349e+01]\n", + "FF:[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... -2.58441467e+02\n", + " 2.06775093e+01 -2.45735531e+01]\n", + "[ True True True ... True True True]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -2.1968115e+02 8.5754425e+01 ... -6.9909119e+01\n", + " -2.6478451e+01 -7.4195160e+01]\n", + " [-3.5698813e+17 3.9582391e+02 5.5431940e+02 ... 1.9529277e+02\n", + " 1.2558211e+02 6.7965935e+01]\n", + " [ 3.4442975e+16 2.8310864e+02 -8.1522171e+01 ... -2.3606525e+01\n", + " -2.0410315e+01 -1.5228156e+02]\n", + " ...\n", + " [ 4.0923264e+16 -2.4507169e+02 -8.2614380e+02 ... -2.6583340e+02\n", + " -1.9878247e+02 -2.5844165e+02]\n", + " [ 6.9156258e+17 1.3969666e+02 -7.5639044e+02 ... -1.5231053e+02\n", + " -3.3650037e+02 2.0677340e+01]\n", + " [ 9.9511712e+16 -3.2348724e+01 3.0624988e+02 ... 1.0391423e+02\n", + " 6.0626881e+01 -2.4573349e+01]]\n", + "FF:[[-1.61869621e+17 -2.19681122e+02 8.57541504e+01 ... -6.99092026e+01\n", + " -2.64783611e+01 -7.41952515e+01]\n", + " [-3.56988336e+17 3.95823853e+02 5.54319275e+02 ... 1.95292725e+02\n", + " 1.25582062e+02 6.79659348e+01]\n", + " [ 3.44430865e+16 2.83108551e+02 -8.15224686e+01 ... -2.36064014e+01\n", + " -2.04101429e+01 -1.52281570e+02]\n", + " ...\n", + " [ 4.09233933e+16 -2.45071564e+02 -8.26143555e+02 ... -2.65833405e+02\n", + " -1.98782272e+02 -2.58441467e+02]\n", + " [ 6.91562577e+17 1.39696579e+02 -7.56390808e+02 ... -1.52310455e+02\n", + " -3.36500092e+02 2.06775093e+01]\n", + " [ 9.95114373e+16 -3.23486938e+01 3.06250122e+02 ... 1.03914482e+02\n", + " 6.06264191e+01 -2.45735531e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... 4.0923264e+16\n", + " 6.9156258e+17 9.9511712e+16]\n", + " [-5.3483575e+02 2.6249797e+03 -6.7268573e+02 ... -6.1204077e+03\n", + " -4.3047915e+03 -9.5139771e+01]\n", + " [-1.2200641e+01 1.0347147e+02 -2.6777636e+01 ... -1.4766699e+02\n", + " -9.8514114e+01 1.2616925e+01]\n", + " ...\n", + " [-3.2097631e+00 9.1431990e+00 -1.6333975e+00 ... -6.9996667e+00\n", + " -6.4008064e+00 1.9126304e+00]\n", + " [-3.0982289e+00 1.2355285e+01 -3.1715555e+00 ... -4.6754313e+00\n", + " -6.2553053e+00 1.0515085e+00]\n", + " [-2.9516125e+00 2.7038031e+00 -6.0580249e+00 ... -1.6555168e+01\n", + " 1.3245420e+00 -1.5741113e+00]]\n", + "FF:[[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... 
4.09233933e+16\n", + " 6.91562577e+17 9.95114373e+16]\n", + " [-5.34834961e+02 2.62497900e+03 -6.72686401e+02 ... -6.12040576e+03\n", + " -4.30479297e+03 -9.51402283e+01]\n", + " [-1.22006664e+01 1.03471611e+02 -2.67777309e+01 ... -1.47666946e+02\n", + " -9.85141525e+01 1.26169167e+01]\n", + " ...\n", + " [-3.20977211e+00 9.14321709e+00 -1.63339353e+00 ... -6.99966621e+00\n", + " -6.40081263e+00 1.91262615e+00]\n", + " [-3.09821057e+00 1.23552399e+01 -3.17152786e+00 ... -4.67541933e+00\n", + " -6.25528765e+00 1.05149710e+00]\n", + " [-2.95161533e+00 2.70380235e+00 -6.05802393e+00 ... -1.65551491e+01\n", + " 1.32455230e+00 -1.57412362e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "8.101851851851851% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.3778828e+16 1.0956941e+03 1.1773144e+02 ... -4.0466427e+01\n", + " -3.1198654e+01 -1.7603550e+01]\n", + " [-1.2087128e+18 6.9384756e+03 6.1327003e+01 ... 1.5329468e+01\n", + " 7.6757736e+00 -4.5589094e+00]\n", + " [-6.7892266e+17 5.4895034e+03 7.6927376e+01 ... 9.1396770e+00\n", + " 2.3195824e+01 -6.1995559e+00]\n", + " ...\n", + " [ 2.6452032e+17 9.9761787e+03 2.2349066e+02 ... 5.7504387e+01\n", + " -8.6791611e-01 4.6890911e+01]\n", + " [-6.7528534e+16 3.3856902e+03 2.5189743e+02 ... 2.2824722e+01\n", + " 8.7917282e+01 -2.1569672e+01]\n", + " [-2.1779064e+17 5.2511855e+03 6.6282043e+01 ... 9.9689598e+00\n", + " -5.5022659e+00 -3.2573143e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.37791458e+16 1.09569678e+03 1.17731285e+02 ... -4.04664154e+01\n", + " -3.11988506e+01 -1.76035423e+01]\n", + " [-1.20871251e+18 6.93847900e+03 6.13275528e+01 ... 1.53295393e+01\n", + " 7.67594433e+00 -4.55900288e+00]\n", + " [-6.78922523e+17 5.48950342e+03 7.69272308e+01 ... 9.13961220e+00\n", + " 2.31957569e+01 -6.19959354e+00]\n", + " ...\n", + " [ 2.64520284e+17 9.97617871e+03 2.23490509e+02 ... 5.75044785e+01\n", + " -8.67943764e-01 4.68908234e+01]\n", + " [-6.75287400e+16 3.38569165e+03 2.51897339e+02 ... 2.28247147e+01\n", + " 8.79171448e+01 -2.15696106e+01]\n", + " [-2.17790679e+17 5.25118652e+03 6.62821960e+01 ... 9.96885872e+00\n", + " -5.50213098e+00 -3.25731125e+01]]\n", + "9.809027777777777% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.7.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_7_layers_7_feed_forward_w2_shard_0_output_0\n", + "HF: [-7.5522525e+19 -1.3283726e+21 -7.2549753e+20 ... 4.9017162e+01\n", + " -9.7436657e+00 8.5870697e+01]\n", + "FF:[-7.55228501e+19 -1.32837218e+21 -7.25497390e+20 ... 4.90171394e+01\n", + " -9.74382782e+00 8.58707886e+01]\n", + "[ True True True ... True False True]\n", + "[ 19 64 75 ... 
18418 18428 18430]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 95\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- W2 --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 95\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m compare_tensors(hf_w2_weight, ff_w2_weight, tolerance\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-5\u001b[39m)\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- Lora --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:47\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 47\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "attention_tests=True\n", + "for i in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.go_0\"\n", + " 
hf_BWD_w2_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.go_0\"\n", + " hf_BWD_ffn_norm_out = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_path}/layers.{i}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_path}/layers.{i}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_path}/layers.{i}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_path}/layers.{i}.self_attn.o_proj.weight\"\n", + " \n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_output_0\"\n", + " ff_BWD_ssm_in1 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " ff_BWD_w1_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = 
f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_0\"\n", + " ff_BWD_attn_out = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w1_shard_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w3_shard_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_weight_0\"\n", + " \n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if i == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + "\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " if i == tot_num_layers-1:\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " \n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " print(\"-- W2 --\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " \n", + " print(\"-- Lora --\")\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + " \n", + " print(\"-- W2/W1/W3 --\")\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " \n", + " print(\"-- Attention 
--\")\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + " hidden_size = 768\n", + " qProjSize = 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " if attention_tests:\n", + " # compare attn weight tensors\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.qk_prods_softmax.output_0\"\n", + " ff_attn_qk_prods_softmax = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " # NEED TO VISUALLY INSPECT\n", + " compare_loaded_tensors(hf_attn_heads_grads, ff_attn_heads_grads)\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " compare_loaded_tensors(hf_vproj_grads, ff_vproj_grads)\n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.value_states.output_0\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # print(hf_value_states.shape)\n", + " ff_value_states = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " # print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.qk_prods_softmax.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " \n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", 
+ " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " # print(hf_qk_prods_softmax2[:2,:,0])\n", + " # print(ff_qk_prods_softmax2[:2,:,0])\n", + " assert(pct_mismatch <= 0.1)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.pre_softmax.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " compare_loaded_tensors(hf_qk_prods_softmax2, ff_qk_prods_softmax2)\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = hf_path + f\"/fwd_step_0_layers.11.self_attn.query_activation.output_0\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " # print(hf_query_activation[:,0,:])\n", + " # print()\n", + " # print(ff_query_activation[:,0,:])\n", + " # assert False\n", + " # compare_loaded_tensors(hf_query_activation, ff_query_activation)\n", + " check_rope = False\n", + " if check_rope:\n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", + " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], 
device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", + " hf_kproj_grads_before_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " ########################################## Qproj (with ROPE) 
##########################################\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", + " ff_qproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.1)\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.01614726 0.01363804 0.01768043 ... 
0.00724926 -0.00149747\n", + " -0.01781223]\n" + ] + } + ], + "source": [ + "a = np.fromfile(\"/usr0/home/goliaro/.cache/flexflow/weights/goliaro/llama-160m-lora-full/full-precision/layers_11_feed_forward_w2_lora_A_weight\", dtype=np.float32)\n", + "print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + "hf_value_states = f\"{hf_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + 
"print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_path}/layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_path}/layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/alignment/opt_alignment_tests.ipynb b/tests/peft/alignment/opt_alignment_tests.ipynb new file mode 100644 index 0000000000..ca679b1857 --- /dev/null +++ b/tests/peft/alignment/opt_alignment_tests.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- LM head ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- Final Norm ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "qProjSize = 64\n", + "num_heads = 12\n", + "num_tokens = 25\n", + "for i in range(tot_num_layers):\n", + " hf_base = os.path.join(hf_path, f\"fwd_step_0_decoder.layers.{i}.\")\n", + " ff_base = os.path.join(ff_path, f\"fwd_step_0_layers_{i}_layers_{i}_\")\n", + " \n", + " # LayerNorm\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.input_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.output_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_1\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "\n", + " # # Attention QKV proj\n", + " # print(\"---Attn---\")\n", + " # ff_tensor = ff_base + \"attention_shard_0_qkv_proj_output\"\n", + " # ff_tensor = load_ff_tensor(ff_tensor, [qProjSize, num_heads, 3, num_tokens])\n", + " # ff_q_proj = ff_tensor[:,:,0,:]\n", + " # ff_k_proj = ff_tensor[:,:,1,:]\n", + " # ff_v_proj = ff_tensor[:,:,2,:]\n", + " # hf_q_proj = hf_base + \"self_attn.q_proj.output_0\"\n", + " # hf_q_proj = load_hf_tensor(hf_q_proj).squeeze().T\n", + " # hf_q_proj = hf_q_proj.reshape(12,64,25)\n", + " # hf_q_proj = np.transpose(hf_q_proj, (1,0,2))\n", + " # hf_k_proj = hf_base + \"self_attn.k_proj.output_0\"\n", + " # hf_k_proj = load_hf_tensor(hf_k_proj).squeeze().T\n", + " # hf_k_proj = hf_k_proj.reshape(12,64,25)\n", + " # hf_k_proj = np.transpose(hf_k_proj, (1,0,2))\n", + " # hf_v_proj = hf_base + \"self_attn.v_proj.output_0\"\n", + " # hf_v_proj = load_hf_tensor(hf_v_proj).squeeze().T\n", + " # hf_v_proj = hf_v_proj.reshape(12,64,25)\n", + " # hf_v_proj = np.transpose(hf_v_proj, (1,0,2))\n", + " # compare_loaded_tensors(hf_q_proj/np.sqrt(qProjSize), ff_q_proj)\n", + " # compare_loaded_tensors(hf_k_proj, ff_k_proj)\n", + " # compare_loaded_tensors(hf_v_proj, ff_v_proj)\n", + "\n", + " # Compare attn bias, residuals\n", + " print(\"--- Attn bias + residual ---\")\n", + " ff_residual1 = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_input_1\"\n", + " ff_residual2 = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_flexflow_tensors(ff_residual1, ff_residual2)\n", + " hf_tensor = hf_base + 
\"self_attn_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_residual2)\n", + " ff_tensor = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_output_0\"\n", + " hf_tensor = hf_base + \"final_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " \n", + " print(\"--- MLP ---\")\n", + " hf_tensor = hf_base + \"fc1.input_0\"\n", + " ff_tensor = ff_base + \"fc1_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"fc2.input_0\"\n", + " ff_tensor = ff_base + \"fc2_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "# LM head\n", + "print(\"\\n--- LM head ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "# Final layer norm\n", + "print(\"\\n--- Final Norm ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "ff_tensor1 = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_input_activation\"\n", + "# compare_flexflow_tensors_shortest(ff_tensor, ff_tensor1)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_1\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_1\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_mean\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_2\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_rstd\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m compare_flexflow_tensors(ff_tensor, ff_tensor1)\n\u001b[1;32m 20\u001b[0m compare_tensors(hf_tensor, ff_tensor) \u001b[38;5;66;03m# fails\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Compare fwd input/output of layernorm\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_decoder.final_layer_norm.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "# Compare backward pass\n", + "hf_tensor = hf_path 
+ \"/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "hf_tensor = hf_path + \"/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "\n", + "hf_tensor1 = hf_path + \"/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "compare_hf_tensors(hf_tensor, hf_tensor1)\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor1, ff_tensor)\n", + "\n", + "hf_tensor = hf_path + \"/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_0\"\n", + "ff_tensor1 = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_1\"\n", + "compare_flexflow_tensors(ff_tensor, ff_tensor1)\n", + "compare_tensors(hf_tensor, ff_tensor) # fails" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\n", + "HF: [ 0.0193019 -1.0467215 0.21579844 ... 0.04534929 -0.25642633\n", + " 0.10879952]\n", + "FF:[ 0.01458706 -1.02212262 0.20589906 ... 0.04446212 -0.25625792\n", + " 0.108039 ]\n", + "[ True False True ... True True True]\n", + "[ 1 3 7 ... 19170 19174 19188]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m hf_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 15\u001b[0m ff_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_fc1_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_fc1_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# LORA input\u001b[39;00m\n\u001b[1;32m 20\u001b[0m hf_lora_A_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:32\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 
27\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.self_attn_layer_norm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_layer_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " \n", + " hf_ffn_norm_in = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.input_0\"\n", + " ff_ffn_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_0\"\n", + " # compare_tensors(hf_ffn_norm_in, ff_ffn_norm_in)\n", + " \n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_1\"\n", + " # compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " hf_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\"\n", + " ff_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\"\n", + " compare_tensors(hf_fc1_in, ff_fc1_in)\n", + "\n", + "\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = 
f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.final_layer_norm.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\n", + "HF: [-0.00542103 -1.781267 0.16552497 ... -0.77217525 -0.5760026\n", + " 0.04363118]\n", + "FF:[ 0.03817766 -1.5644939 0.22477378 ... -0.94569921 -0.43960798\n", + " -0.06447437]\n", + "[False False False ... False False False]\n", + "[ 0 1 2 ... 
19197 19198 19199]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m ff_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 21\u001b[0m ff_FWD_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 22\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_FWD_norm_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_FWD_norm_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n\u001b[1;32m 25\u001b[0m hf_BWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_decoder.final_layer_norm.gi_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:29\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, 
tolerance=1e-5)\n", + "\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "\n", + "# Compare fwd input/output of layernorm\n", + "hf_FWD_norm_in = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "hf_FWD_norm_out = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_FWD_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "ff_FWD_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_FWD_norm_in, ff_FWD_norm_in)\n", + "compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n", + "\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py new file mode 100644 index 0000000000..16b46cfa81 --- /dev/null +++ b/tests/peft/hf_finetune.py @@ -0,0 +1,129 @@ +import os, sys, shutil +import torch + +# Reproducibility +import random +import numpy as np + +torch.manual_seed(0) +random.seed(0) +np.random.seed(0) +# torch.use_deterministic_algorithms(True) + +# import bitsandbytes as bnb +import argparse +import transformers + +if transformers.__version__ < "4.31.0": + raise RuntimeError( + "Please update the transformers library version to 4.31.0 or above" + ) +from datasets import load_dataset + + +from hf_utils import * + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, default="goliaro/llama-160m-lora") + parser.add_argument( + "--lora-alpha", + type=int, + default=-1, + help="The scaling coefficient for LoRA. Leave it set to -1 to use the original value from the HF config", + ) + parser.add_argument( + "--lora-dropout", + type=float, + default=0.0, + help="The dropout rate for LoRA. 
Set it to -1 to use the original value from the HF config",
+    )
+    parser.add_argument("-lr", "--learning-rate", type=float, default=0.001)
+    parser.add_argument("-n", "--max-steps", type=int, default=2)
+    parser.add_argument(
+        "--optimizer", type=str, choices=["sgd", "adam", "adamw"], default="sgd"
+    )
+    parser.add_argument(
+        "--use-full-precision", action="store_true", help="Use full precision"
+    )
+    parser.add_argument("--output-dir", type=str, default="")
+    parser.add_argument("--publish-peft-with-id", type=str, default="")
+    parser.add_argument(
+        "--save-peft-tensors",
+        action="store_true",
+        help="Save PEFT hidden states and weights to file",
+    )
+    args = parser.parse_args()
+
+    # Change working dir to folder storing this script
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+
+    # Get PEFT config, model, tokenizer, and optimizer type
+    peft_config = build_peft_config(args, finetuning=True)
+    tokenizer = get_peft_tokenizer(args, peft_config)
+    model = build_peft_model(args, peft_config)
+    optim_type = get_optim_type(args)
+
+    # Print model with PEFT
+    print(model)
+    for name, params in model.named_parameters():
+        print(name)
+    print_trainable_parameters(model)
+
+    # Add hooks to save PEFT tensors, save any weights of interest before finetuning
+    if args.save_peft_tensors:
+        make_debug_dirs()
+        register_peft_hooks(model)
+        save_peft_weights(model, target_modules=["lora", "lm_head", "down_proj"])
+
+    # Load fine-tuning dataset
+    data = load_dataset("Abirate/english_quotes")
+    # TODO: remove the filtering down to a single row
+    key_to_filter = "quote"
+    desired_value = "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”"
+    data = filter_dataset_for_debugging(data, key_to_filter, desired_value)
+    data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+
+    # Training loop
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=data["train"],
+        args=transformers.TrainingArguments(
+            per_device_train_batch_size=1,
+            gradient_accumulation_steps=1,
+            max_grad_norm=None,  # Disable gradient clipping
+            warmup_steps=0,
+            max_steps=args.max_steps,
+            learning_rate=args.learning_rate,
+            fp16=True if not args.use_full_precision else False,
+            logging_steps=1,
+            output_dir=os.path.join(
+                args.output_dir if len(args.output_dir) > 0 else "./",
+                "lora_training_logs",
+            ),
+            optim=optim_type,
+            lr_scheduler_type=transformers.training_args.SchedulerType.CONSTANT,
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(
+            tokenizer, mlm=False
+        ),
+        callbacks=[HFTrainingCallBack] if args.save_peft_tensors else None,
+    )
+    # silence the warnings. Please re-enable for inference!
+ model.config.use_cache = False + + # for batch in trainer.get_train_dataloader(): + # print("First batch: ") + # print(batch) + # break + + trainer.train() + + save_finetuned_model(model, args) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py new file mode 100644 index 0000000000..7bfc560cc2 --- /dev/null +++ b/tests/peft/hf_serve.py @@ -0,0 +1,140 @@ +import argparse +import torch +import os, sys, shutil, json +from peft import PeftModel, PeftConfig +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + + +def peft_pre_forward_hook(module, input): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("base_model.model.model.", "") + print( + f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" + ) + print("Pre-Input: ", input[0].shape) + torch.save( + input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input" + ) + # print("===") + + +def peft_post_forward_hook(module, input, output): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("base_model.model.model.", "") + print( + f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" + ) + print("Post-Input/Output: ", input[0].shape, output[0].shape) + torch.save( + output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output" + ) + print("===") + module.decoding_step += 1 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, required=True) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--max-length", type=int, default=50) + parser.add_argument("--prompt-file", type=str, required=True) + parser.add_argument("--do-sample", action="store_true", help="Use sampling") + parser.add_argument( + "--save-peft-tensors", + action="store_true", + help="Save PEFT hidden states and weights to file", + ) + args = parser.parse_args() + + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return + + # Get peft model config + config = PeftConfig.from_pretrained(args.peft_model_id) + + # Load the base model + model = AutoModelForCausalLM.from_pretrained( + config.base_model_name_or_path, + return_dict=True, + # load_in_8bit=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + device_map="auto", + ) + # Load the Lora model + model = PeftModel.from_pretrained(model, args.peft_model_id) + print(model) + + # Get tokenizer + hf_config = AutoConfig.from_pretrained( + config.base_model_name_or_path, trust_remote_code=True + ) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + + # Generation config + generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) + generation_config.do_sample = args.do_sample + + # Register hooks to save tensors, if needed + if args.save_peft_tensors: + # Change 
working dir to folder storing this script
+        abspath = os.path.abspath(__file__)
+        dname = os.path.dirname(abspath)
+        os.chdir(dname)
+        # Create output dir (remove any stale copy first)
+        shutil.rmtree("./hf_peft_tensors", ignore_errors=True)
+        os.makedirs("./hf_peft_tensors", exist_ok=True)
+        # Save weights
+        for name, params in model.named_parameters():
+            if "lora" in name:
+                torch.save(params, f"./hf_peft_tensors/{name}")
+                # params.detach().cpu().numpy().tofile(f"{weights_path}/{name}")
+        # Save hidden states
+        for name, layer in dict(model.named_modules()).items():
+            if "lora_A.default" in name or "lora_B.default" in name:
+                layer.name = name
+                layer.decoding_step = 0
+                print(f"Adding hooks to layer {layer.name}")
+                layer.register_forward_pre_hook(peft_pre_forward_hook)
+                layer.register_forward_hook(peft_post_forward_hook)
+
+    # Run inference
+    # Read prompt-file into a list of strings
+    with open(args.prompt_file, "r") as f:
+        try:
+            prompt_list = json.load(f)
+        except json.JSONDecodeError:
+            print(f"Error: Unable to parse {args.prompt_file} as JSON.")
+            sys.exit(1)
+
+    for i, prompt in enumerate(prompt_list):
+        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
+        with torch.cuda.amp.autocast():
+            output_tokens = model.generate(
+                **batch, max_new_tokens=args.max_length, generation_config=generation_config
+            )
+        print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/peft/hf_train.py b/tests/peft/hf_train.py
new file mode 100644
index 0000000000..707fc9d0ae
--- /dev/null
+++ b/tests/peft/hf_train.py
@@ -0,0 +1,161 @@
+import os, sys
+
+# os.environ["CUDA_VISIBLE_DEVICES"]="0"
+import torch
+import torch.nn as nn
+
+# import bitsandbytes as bnb
+from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer
+import argparse
+from peft import LoraConfig, get_peft_model
+import transformers
+from datasets import load_dataset
+
+
+class CastOutputToFloat(nn.Sequential):
+    def forward(self, x):
+        return super().forward(x).to(torch.float32)
+
+
+def print_trainable_parameters(model):
+    """
+    Prints the number of trainable parameters in the model.
+ """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") + parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument( + "--lora-target-modules", + type=str, + default="down_proj", + help="Comma-separated list of layers from the base model to target", + ) + parser.add_argument("--lora-dropout", type=float, default=0.05) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--publish-peft-with-id", type=str, default="") + args = parser.parse_args() + model_name = args.model_name + use_full_precision = args.use_full_precision + lora_rank = args.lora_rank + lora_alpha = args.lora_alpha + lora_target_modules = args.lora_target_modules.split(",") + lora_dropout = args.lora_dropout + output_dir = args.output_dir + publish_peft_with_id = args.publish_peft_with_id + if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + raise ValueError( + "Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the trained model" + ) + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + # load_in_8bit=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + model_name, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + + for param in model.parameters(): + param.requires_grad = False # freeze the model - train adapters later + if param.ndim == 1: + # cast the small parameters (e.g. 
layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + + model.lm_head = CastOutputToFloat(model.lm_head) + + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + # target_modules=["q_proj", "v_proj"], + # target_modules=["down_proj"], + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + ) + print(model) + print(model.named_parameters()) + model = get_peft_model(model, config) + print_trainable_parameters(model) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = transformers.Trainer( + model=model, + train_dataset=data["train"], + args=transformers.TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=100, + max_steps=200, + learning_rate=2e-4, + fp16=True if not use_full_precision else False, + logging_steps=1, + output_dir=os.path.join( + output_dir if len(output_dir) > 0 else "./", "lora_training_logs" + ), + ), + data_collator=transformers.DataCollatorForLanguageModeling( + tokenizer, mlm=False + ), + ) + model.config.use_cache = ( + False + ) # silence the warnings. Please re-enable for inference! + trainer.train() + + if len(output_dir) > 0: + print(f"Done training! Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print( + f"Done training! Uploading the model to HF hub with id: {publish_peft_with_id}..." + ) + model.push_to_hub(publish_peft_with_id, use_auth_token=True) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py new file mode 100644 index 0000000000..9332c803b2 --- /dev/null +++ b/tests/peft/hf_utils.py @@ -0,0 +1,352 @@ +import torch +import torch.nn as nn +import transformers +from transformers import ( + TrainerCallback, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + LlamaTokenizer, +) +import os, shutil +from peft import PeftConfig, PeftModel +from datasets import load_dataset, DatasetDict + +debug_dir = None +debug_subdirs = ["fwd", "bwd", "optim", "weights"] +verbose = False + + +def make_debug_dirs(): + global debug_dir + global debug_subdirs + debug_dir = os.environ.get("FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow")) + debug_dir = os.path.join(debug_dir, "debug", "huggingface") + shutil.rmtree(debug_dir, ignore_errors=True) + os.makedirs(debug_dir, exist_ok=True) + assert debug_dir is not None + assert os.path.isdir(debug_dir) + for subdir in debug_subdirs: + subdir_path = os.path.join(debug_dir, subdir) + os.makedirs(subdir_path, exist_ok=False) + + +def get_dst_folder(subdir, step_idx=0): + global debug_dir, debug_subdirs + assert subdir in debug_subdirs + dst_folder = os.path.join(debug_dir, subdir, f"step_{step_idx}") + os.makedirs(dst_folder, exist_ok=True) + return dst_folder + + +def simplify_name(name): + return name.replace("base_model.model.model.", "").replace("base_model.model.", "") + + +def get_optim_type(args): + if args.optimizer == "sgd": + return transformers.training_args.OptimizerNames.SGD + elif args.optimizer == "adam": + return transformers.training_args.OptimizerNames.ADAM + elif args.optimizer == "adamw": + return transformers.training_args.OptimizerNames.ADAMW + else: + raise ValueError(f"Optimizer {args.optimizer} not supported") + + +class 
CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def peft_backward_hook(module, grad_input, grad_output): + assert type(grad_input) == tuple and type(grad_output) == tuple + if len(grad_input) == 0 or len(grad_output) == 0: + return + assert module.name is not None and module.bwd_step is not None + name = simplify_name(module.name) + if verbose: + print( + f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}" + ) + print("Backward GRAD Output:") + for i, out_grad in enumerate(grad_output): + if type(out_grad) == torch.Tensor: + dst_folder = get_dst_folder("bwd", module.bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_gradient_{i}") + if verbose: + print("\t", out_grad.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(out_grad, dst_filepath) + else: + if verbose: + print(out_grad) + if verbose: + print("Backward GRAD Input:") + for i, in_grad in enumerate(grad_input): + if type(in_grad) == torch.Tensor: + dst_folder = get_dst_folder("bwd", module.bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_gradient_{i}") + if verbose: + print("\t", in_grad.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(in_grad, dst_filepath) + else: + if verbose: + print(in_grad) + if verbose: + print("===") + module.bwd_step += 1 + + +def peft_forward_hook(module, input, output): + if len(input) == 0 or len(output) == 0: + return + assert module.name is not None and module.fwd_step is not None + name = simplify_name(module.name) + if verbose: + print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") + print("Input:") + if type(input) == torch.Tensor: + if verbose: + print(input.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_0") + torch.save(input, dst_filepath) + elif type(input) == tuple: + for i, inp in enumerate(input): + if type(inp) == torch.Tensor: + if verbose: + print(inp.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_{i}") + torch.save(inp, dst_filepath) + else: + if verbose: + print(inp) + else: + assert False + if verbose: + print("Output:") + if type(output) == torch.Tensor: + if verbose: + print(output.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_0") + torch.save(output, dst_filepath) + elif type(output) == tuple: + for i, out in enumerate(output): + if type(out) == torch.Tensor: + if verbose: + print(out.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_{i}") + torch.save(out, dst_filepath) + else: + if verbose: + print(out) + else: + assert False + if verbose: + print("===") + module.fwd_step += 1 + + +def peft_optimizer_hook(model_, callback_func_handle): + def post_hook(optimizer, args, kwargs): + if verbose: + print("Optimizer Hook activated") + bwd_step = callback_func_handle.step_count + for name_, module in model_.named_modules(): + name = 
simplify_name(name_) + for param_name, param in module.named_parameters(recurse=False): + if param.requires_grad: + if verbose: + print( + f"Step #{bwd_step}: Saving weight gradient for {name} ({param.grad.shape})" + ) + dst_folder = get_dst_folder("weights", bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.gradient") + torch.save(param.grad, dst_filepath) + + return post_hook + + +class HFTrainingCallBack(TrainerCallback): + def on_train_begin(self, args, state, control, **kwargs): + if verbose: + print("Starting finetuning") + model_ = kwargs.get("model", None) + optim = kwargs.get("optimizer", None) + assert model_ is not None + assert optim is not None + self.step_count = 0 + optim.optimizer.register_step_post_hook(peft_optimizer_hook(model_, self)) + + def save_lora_weights(self, model, pre_finetuning=False): + lora_weights_handles = [ + (simplify_name(name), params) + for name, params in model.named_parameters() + if "lora" in name + ] + for simplified_name, params in lora_weights_handles: + dst_folder = get_dst_folder("weights", self.step_count) + if pre_finetuning: + dst_filepath = os.path.join(dst_folder, f"{simplified_name}_original") + torch.save(params, dst_filepath) + if verbose: + print( + f"Step #{self.step_count}: Saving ORIGINAL weight {simplified_name} ({params.shape})" + ) + else: + dst_filepath = os.path.join(dst_folder, f"{simplified_name}_finetuned") + torch.save(params, dst_filepath) + if verbose: + print( + f"Step #{self.step_count}: Saving FINETUNED weight {simplified_name} ({params.shape})" + ) + if not pre_finetuning: + self.step_count += 1 + + def on_step_end( + self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs + ): + self.save_lora_weights(model, pre_finetuning=False) + + def on_step_begin( + self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs + ): + self.save_lora_weights(model, pre_finetuning=True) + + def on_train_end(self, args, state, control, **kwargs): + if verbose: + print(f"Finetuning ended after {self.step_count} steps") + + +def build_peft_config(args, finetuning=False): + peft_config = PeftConfig.from_pretrained(args.peft_model_id) + if peft_config.peft_type != "LORA": + raise ValueError(f"PEFT type {peft_config.peft_type} not supported yet") + if args.lora_alpha > 0.0: + peft_config.lora_alpha = args.lora_alpha + if peft_config.lora_dropout >= 0.0: + peft_config.lora_dropout = args.lora_dropout + # prevent HF from re-inizialing the weights randomly if finetuning + if finetuning: + peft_config.init_lora_weights = False + return peft_config + + +def prepare_model_for_lora_finetuning(model, save_peft_tensors=False): + # Freeze all layers except the LORA ones. Cast small layers to full precision for stability + for name, param in model.named_parameters(): + if "lora" not in name: + param.requires_grad = False # freeze the model - train adapters later + else: + param.requires_grad = True + if param.ndim == 1: + # cast the small parameters (e.g. 
layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + if not save_peft_tensors: + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + model.lm_head = CastOutputToFloat(model.lm_head) + return model + + +def build_peft_model(args, peft_config): + # Load base model, and apply the PEFT layer + model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + device_map="auto", + ) + model = PeftModel.from_pretrained(model, args.peft_model_id, config=peft_config) + model = prepare_model_for_lora_finetuning(model, args.save_peft_tensors) + return model + + +def get_peft_tokenizer(args, peft_config): + # Get Tokenizer + hf_config = AutoConfig.from_pretrained( + peft_config.base_model_name_or_path, trust_remote_code=True + ) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + return tokenizer + + +def register_peft_hooks(model): + # Save hidden states and gradients + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.fwd_step = 0 + layer.bwd_step = 0 + if verbose: + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_hook(peft_forward_hook) + layer.register_full_backward_hook(peft_backward_hook) + + +def save_peft_weights(model, target_modules=[]): + # Save any weights of interest + for name, params in model.named_parameters(): + simplified_name = simplify_name(name) + for target_module in target_modules: + if target_module in name: + dst_folder = get_dst_folder("weights") + dst_filepath = os.path.join(dst_folder, f"{simplified_name}") + torch.save(params, dst_filepath) + + +def filter_dataset_for_debugging(data, key_to_filter, desired_value): + filtered_dataset_dict = DatasetDict() + for split, dataset in data.items(): + filtered_dataset = dataset.filter( + lambda example: example[key_to_filter] == desired_value + ) + filtered_dataset_dict[split] = filtered_dataset + data = filtered_dataset_dict + return data + + +def save_finetuned_model(model, args): + if len(args.output_dir) > 0: + if verbose: + print(f"Saving the model to {args.output_dir}...") + model.save_pretrained(args.output_dir) + + if len(args.publish_peft_with_id) > 0: + if verbose: + print( + f"Uploading the model to HF hub with id: {args.publish_peft_with_id}..." 
+ ) + model.push_to_hub(args.publish_peft_with_id, use_auth_token=True) diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py new file mode 100644 index 0000000000..266bb64137 --- /dev/null +++ b/tests/peft/peft_alignment_test.py @@ -0,0 +1,730 @@ +import numpy as np +import os, torch, argparse +from alignment.align_test_utils import * +from transformers import AutoConfig +from peft import PeftConfig +from tqdm import tqdm + +class AlignmentTest: + def __init__(self, model_name, tp_degree=1): + raise NotImplementedError() + def check_weights_alignment(self): + raise NotImplementedError() + def check_fwd_pass(self): + raise NotImplementedError() + def check_bwd_pass(self): + raise NotImplementedError() + def check_step(self, step_idx, learning_rate=0.001): + raise NotImplementedError() + +class LllamaAlignmentTest(AlignmentTest): + def __init__(self, model_name, tp_degree=1): + self.model_name = model_name + self.peft_config = PeftConfig.from_pretrained(model_name) + self.hf_config = AutoConfig.from_pretrained(self.peft_config.base_model_name_or_path) + self.num_layers = self.hf_config.num_hidden_layers + self.hidden_size = self.hf_config.hidden_size + self.intermediate_size = self.hf_config.intermediate_size + self.num_attention_heads = self.hf_config.num_attention_heads + self.num_key_value_heads = self.num_attention_heads + self.projsize = self.hidden_size // self.num_attention_heads + self.tp_degree = tp_degree + self.lora_scaling_factor = self.peft_config.lora_alpha / self.peft_config.r + + self.num_tokens = None + self.ff_batch_size = None + + + def check_weights_alignment(self): + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "lm_head.weight": + f_version = f"layers.{self.num_layers-1}.lm_head.weight_0" + elif hf_filename == "norm.weight": + f_version = f"layers.{self.num_layers-1}.norm.weight_0" + else: + f_version = "" + if hf_filename.startswith("layers."): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version += f"layers.{layernum}." 
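+            # Illustrative (hypothetical) example of the renames applied below: the HF weight file
+            # "layers.11.mlp.down_proj.lora_A.default.weight" would map to the FlexFlow name
+            # "layers.11.layers.11.mlp.down_proj.lora.weight_A.original"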
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # compute weight index, then rename lora if needed if needed + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + if f_version.endswith(".weight"): + if weight_index == "0": + f_version += f"_{weight_index}" + else: + f_version += f"_{weight_index}.original" + elif f_version.endswith(".gradient"): + prefix = f_version.split(".gradient")[0] + f_version = prefix + f".weight_{weight_index}.gradient" + return f_version + def get_tp_partition_dim(ff_weight_name) -> int: + # MLP layers split the intermediate size dimension + # gate_proj, up_proj: [hidden_size, intermediate_size] + # down_proj: [intermediate_size, hidden_size] + if self.tp_degree == 1: + return -1 + if "lora.weight_B" in ff_weight_name: + return -1 + if "lm_head" in ff_weight_name or "norm" in ff_weight_name: + return 1 + if "gate_proj" in ff_weight_name or "up_proj" in ff_weight_name: + return 1 + elif "down_proj" in ff_weight_name: + return 0 + else: + return -1 + print("-- Weights alignment --") + hf_weights_folder = os.path.join(hf_path, "weights", "step_0") + ff_weights_folder = os.path.join(ff_path, "weights", "step_0", "shard_0") + files_list = os.listdir(hf_weights_folder) + for hf_weight_name in tqdm(sorted(files_list)): + if hf_weight_name.endswith(".weight"): + ff_weight_name = convert_hf_filename_to_ff(hf_weight_name) + # print(hf_weight_name, ff_weight_name) + hf_w_path = os.path.join(hf_weights_folder, hf_weight_name) + ff_w_path = os.path.join(ff_weights_folder, ff_weight_name) + if not os.path.isfile(hf_w_path): + print(f"File '{hf_w_path}' not found") + if not os.path.isfile(ff_w_path): + print(f"File '{ff_w_path}' not found") + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + + # 1. get shape of hf weight + hf_weight = torch.load(hf_w_path, map_location='cpu') + hf_weigth_shape = hf_weight.shape + ff_partition_dim = get_tp_partition_dim(ff_weight_name) + ff_weigth_shape = list(hf_weigth_shape)[::-1] + if ff_partition_dim >= 0: + ff_weigth_shape[ff_partition_dim] //= self.tp_degree + + # 2. handle flexflow shards in case of tensor parallelism + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weigth_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + if ff_partition_dim >= 0: + ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) + else: + assert(are_np_arrays_identical(ff_weights)) + ff_weight = ff_weights[0] + else: + ff_weight = ff_weights[0] + ff_weight = torch.from_numpy(ff_weight).to(hf_weight.dtype) + + # check equivalence + try: + torch.testing.assert_close(ff_weight, hf_weight.T) + except Exception as e: + print(f"Error comparing {ff_w_path} weight to {hf_w_path}:\n{e}\n") + raise e + + def check_fwd_pass(self, step_idx=0): + hf_fwd_folder = os.path.join(hf_path, "fwd", f"step_{step_idx}") + ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." 
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + f_version = f_version.replace(".q_proj", "").replace(".k_proj", "").replace(".v_proj", "").replace(".o_proj", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_fwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + if hf_tensor_name == "embed_tokens": + self.num_tokens = hf_tensor.shape[1] + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + ff_tensor_path = os.path.join(ff_fwd_folder, ff_tensor_filename) + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + if "layers.0.embed_tokens.input_0" in ff_tensor_path: + # get number of tokens + ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') + self.ff_batch_size = ff_tensor.shape[0] + + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-2): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=1.3e-6, atol=tolerance) + if not np.allclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + 
print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + print(ff_tensor.squeeze()) + raise e + + print(f"-- FWD pass {step_idx}--") + + # Embedding layer + hf_tensor_name = "embed_tokens" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding output") + + # Transformers blocks + for i in range(self.num_layers): + # Input laye norm + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + if i == 0: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + else: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} output") + + # Attention + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + # Post-attention layernorm + hf_tensor_name = f"layers.{i}.post_attention_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Post-attention layernorm {i} output") + + # W1 (gate_proj) + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 
{i} output") + + # W3 (up_proj) + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} output") + + # W2 (down_proj) + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_down_proj_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} input") + + hf_down_proj_in = hf_tensor.clone() + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_down_proj_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # LoRA_A + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_A.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} input") + torch.testing.assert_close(hf_down_proj_in, hf_tensor, rtol=1.3e-6, atol=1e-5) + + # LoRA intermediate + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="low_rank_activation", hf_tensor_idx=0, ff_tensor_idx=None) + hf_lora_A_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + hf_lora_B_in = get_hf_tensor(hf_tensor_name, input_comparison) + torch.testing.assert_close(hf_lora_A_out, hf_lora_B_in, rtol=1.3e-6, atol=1e-5) + ff_tensor_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora" + ff_lora_A_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_lora_A_out.shape, tp_type=TPType.TO_REDUCE) + compare(hf_lora_A_out, ff_lora_A_out, label=f"LoRA_A {i} output") + + # LoRA_B + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) * self.lora_scaling_factor + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_down_proj_out.shape, tp_type=TPType.TO_REDUCE) + compare(hf_down_proj_out, ff_tensor, label=f"W2_out + scaling*LoRA_B_out {i}") + compare(hf_tensor, ff_tensor, additional_ff_tensor=ff_down_proj_out, label=f"LoRA_B {i} output") + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = 
TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm output") + + # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="LM head input") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head output") + + def check_bwd_pass(self, step_idx=0): + if not self.num_tokens or not self.ff_batch_size: + raise ValueError("Number of tokens and batch size must be set before running backward pass check") + hf_bwd_folder = os.path.join(hf_path, "bwd", f"step_{step_idx}") + ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + # f_version = f_version.replace(".q_proj", "").replace(".k_proj", "").replace(".v_proj", "").replace(".o_proj", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. 
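+            # Illustrative (hypothetical) example: the HF module name
+            # "layers.11.mlp.down_proj.lora_B.default" becomes
+            # "layers.11.layers.11.mlp.down_proj.lora" after these renames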
+ f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_bwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE, pre=False, shard_axis=0): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + + ff_tensor_path = os.path.join(ff_bwd_folder, ff_tensor_filename) + if pre: + ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[shard_axis] //= self.tp_degree + + # exception: intermediate attention tensors + intermediate_attention_tensor = ( + "self_attn" in ff_tensor_name and + not ( + ff_tensor_name.endswith(".self_attn") and + ( + tensor_comparison_idx.ff_tensor_type == "output_gradient" or + tensor_comparison_idx.ff_tensor_type == "input_gradient" + ) + ) + ) + if not intermediate_attention_tensor: + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=shard_axis) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=shard_axis) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + if not intermediate_attention_tensor: + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-3): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=rtol, atol=tolerance) + if not np.allclose(hf_tensor.numpy(), ff_tensor.numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .06 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + print(ff_tensor.squeeze()) + raise e + + print(f"-- BWD pass {step_idx}--") + 
+ # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head gradient output") + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label="LM head gradient input") + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="Norm gradient output") + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm gradient input") + + # Transformers blocks + for i in range(self.num_layers-1, -1, -1): + # W2 (down_proj) output + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient output") + + # LoRA_B + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) * self.lora_scaling_factor + compare(hf_tensor, ff_tensor, label=f"LoRA_B {i} gradient output") + + # LoRA_A + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_A.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} gradient input") + + # W2 (down_proj) input + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", 
hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient input") + + # W2 input (HF) and SigmoidSiluMulti output (FF) + hf_w2_input = hf_tensor.clone() + ff_tensor_name = f"layers.{i}.SigmoidSiluMulti" + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_w2_input, ff_tensor, label=f"HF W2 {i} output and FF SSM output") + + # W1 (gate_proj) output + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 {i} gradient output") + # W1 (gate_proj) input + # HF W1 in = FF W1 in - HF W1 in (pre) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + ff_tensor_pre = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE, pre=True) + compare(hf_tensor, ff_tensor, additional_ff_tensor=ff_tensor_pre, label=f"W1 {i} gradient input") + + # W3 (up_proj) output + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") + # W3 (up_proj) input + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") + + # Attn O-proj + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = 
get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient input") + + # V-proj grads + # FF shape: [num_tokens, qProjSize*num_heads] + hf_tensor_name = f"layers.{i}.self_attn.v_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + mixed_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, mixed_comparison) + hf_tensor = hf_tensor.squeeze().T + ff_tensor = get_ff_tensor(ff_tensor_name, mixed_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, shard_axis=1) + compare(hf_tensor, ff_tensor, label=f"V-proj {i} gradient input") + + # K-proj grads + # FF shape: (num_tokens, qProjSize, num_heads) + hf_tensor_name = f"layers.{i}.self_attn.k_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + k_proj_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="devkproj", hf_tensor_idx=0, ff_tensor_idx=None) + hf_tensor = get_hf_tensor(hf_tensor_name, k_proj_comparison) + hf_tensor = hf_tensor.squeeze().view(self.num_tokens, self.num_attention_heads, self.projsize).transpose(1, 2).contiguous() + hf_tensor = hf_tensor.T + ff_tensor = get_ff_tensor(ff_tensor_name, k_proj_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, shard_axis=2) + compare(hf_tensor, ff_tensor, label=f"K-proj {i} gradient input") + + # Q-proj grads + # FF shape (devQKVPRojArray): (num_tokens, qProjSize, num_heads, 3) + # Q-proj out grad: devQKVPRojArray[:,:,:,0] + hf_tensor_name = f"layers.{i}.self_attn.q_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.devQKVPRojArray" + q_proj_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="", hf_tensor_idx=0, ff_tensor_idx=None) + hf_tensor = get_hf_tensor(hf_tensor_name, q_proj_comparison) + hf_tensor = hf_tensor.view(self.num_tokens, self.num_attention_heads, self.projsize).transpose(1, 2).contiguous().T + augmented_hf_tensor_shape = torch.Size([3]+list(hf_tensor.size())) + ff_tensor = get_ff_tensor(ff_tensor_name, q_proj_comparison, augmented_hf_tensor_shape, tp_type=TPType.PARTITION, shard_axis=2)[:,:,:,0] + compare(hf_tensor, ff_tensor, label=f"Q-proj {i} gradient input") + + # FF Attn input with HF layernorm out + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + input_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"Attn input {i} gradient input") + + if i > 0: + # FF attn input with FF layernorm out 1 + attn_input = ff_tensor.clone() + ff_tensor_name = f"layers.{i}.layers.{i}.input_layernorm" + _output_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=1) + input_layernorm_out1 = get_ff_tensor(ff_tensor_name, _output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) + + # Input layernorm + + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = 
TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + ff_in1_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=1) + input_layernorm0 = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + input_layernorm1 = get_ff_tensor(ff_tensor_name, ff_in1_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + torch.testing.assert_close(input_layernorm0, input_layernorm1, rtol=1.3e-6, atol=1e-5) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + # if i > 1: + # compare(hf_tensor, input_layernorm1, label=f"Input layernorm {i} gradient input") + + def check_step(self, step_idx=0, learning_rate=0.001): + hf_weight_folder = os.path.join(hf_path, "weights", f"step_{step_idx}") + ff_weight_folder = os.path.join(ff_path, "weights", f"step_{step_idx}", "shard_0") + def convert_hf_filename_to_ff(hf_filename): + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. + f_version = f_version.replace("lora_A", "lora.weight_A").replace("lora_B", "lora.weight_B") + return f_version + def get_hf_tensor(hf_tensor_name): + hf_tensor_path = os.path.join(hf_weight_folder, hf_tensor_name) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + return hf_tensor + def get_ff_tensor(ff_tensor_name, hf_shape, tp_type=TPType.REPLICATE, pre=False): + ff_tensor_path = os.path.join(ff_weight_folder, ff_tensor_name) + if pre: + ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + return ff_tensor + def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=rtol, atol=tolerance) + if not np.allclose(hf_tensor.numpy(), ff_tensor.numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + 
print(ff_tensor.squeeze()) + raise e + print(f"-- optimizer pass {step_idx}--") + + for i in range(self.num_layers-1, -1, -1): + # LoRA_B gradient + hf_gradient_name = f"layers.{i}.mlp.down_proj.lora_B.default.gradient" + hf_gradient = get_hf_tensor(hf_gradient_name) + hf_original_weight_name = f"layers.{i}.mlp.down_proj.lora_B.default.weight_original" + hf_original_weight = get_hf_tensor(hf_original_weight_name) + hf_finetuned_weight_name = f"layers.{i}.mlp.down_proj.lora_B.default.weight_finetuned" + hf_finetuned_weight = get_hf_tensor(hf_finetuned_weight_name) + torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE) + compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") + # ff_out_gradient_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora.output_gradient_0" + # ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + # ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") + # ff_out_gradient = load_ff_tensor(os.path.join(ff_bwd_folder, ff_out_gradient_name), [self.hidden_size, 128])[:,:self.num_tokens] + # ff_out_gradient = torch.from_numpy(ff_out_gradient) + # print("Output gradient shape: ", ff_out_gradient.shape) + # ff_low_rank_activation = f"layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + # ff_low_rank_activation = load_ff_tensor(os.path.join(ff_fwd_folder, ff_low_rank_activation), [16, 128])[:,:self.num_tokens] + # ff_low_rank_activation = torch.from_numpy(ff_low_rank_activation) + # print("Low rank activation shape: ", ff_low_rank_activation.shape) + # simulated_weight_grad = ff_low_rank_activation @ ff_out_gradient.T + # print("Simulated weight grad shape: ", simulated_weight_grad.shape) + # print(simulated_weight_grad) + # print(ff_gradient) + # compare(hf_gradient, simulated_weight_grad, label=f"LoRA_B {i} simulated gradient") + + + # LoRA_A gradient + hf_gradient_name = f"layers.{i}.mlp.down_proj.lora_A.default.gradient" + hf_gradient = get_hf_tensor(hf_gradient_name) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + hf_original_weight_name = f"layers.{i}.mlp.down_proj.lora_A.default.weight_original" + hf_original_weight = get_hf_tensor(hf_original_weight_name) + hf_finetuned_weight_name = f"layers.{i}.mlp.down_proj.lora_A.default.weight_finetuned" + hf_finetuned_weight = get_hf_tensor(hf_finetuned_weight_name) + torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.PARTITION) + compare(hf_gradient, ff_gradient, label=f"LoRA_A {i} gradient") + +parser = argparse.ArgumentParser(description='Argument Parser Example') +# Adding arguments +parser.add_argument('-m', '--model-name', type=str, default="goliaro/llama-160m-lora", help='Name of the model') +parser.add_argument('-n', '--num-steps', type=int, default=1, help='Number of finetuning steps') +parser.add_argument('-tp', '--tensor-parallelism-degree', type=int, default=1, help='The tensor parallelism degree used when running FlexFlow') +parser.add_argument('-lr', '--learning-rate', type=float, default=0.001, help='The learning rate used at finetuning time') + +# Parse the arguments from command line +args = 
parser.parse_args() + +if __name__ == "__main__": + llama_alignment = LllamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) + # llama_alignment.check_weights_alignment() + for i in range(args.num_steps): + llama_alignment.check_fwd_pass(i) + llama_alignment.check_bwd_pass(i) + llama_alignment.check_step(i, args.learning_rate) diff --git a/tests/peft_test.sh b/tests/peft_test.sh new file mode 100755 index 0000000000..5600d57edf --- /dev/null +++ b/tests/peft_test.sh @@ -0,0 +1,66 @@ +#! /usr/bin/env bash +# set -x +set -e + +cleanup() { + rm -rf ~/.cache/flexflow/debug +} + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}/.." + +# Token to access private huggingface models (e.g. LLAMA-2) +HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} +if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then + huggingface-cli login --token "$HUGGINGFACE_TOKEN" +fi + +# Clean up before test (just in case) +cleanup + +# Create test prompt file +mkdir -p ./inference/prompt +echo '["Two things are infinite: "]' > ./inference/prompt/peft.json +echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.”"]' > ./inference/prompt/peft_dataset.json + + +# Create output folder +mkdir -p ./inference/output + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + +# Download test model +python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m + +# Run PEFT in Huggingface to get ground truth tensors +python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision + +# Python test +echo "Python test" +python ./inference/python/ff_peft.py +# Check alignment +python ./tests/peft/peft_alignment_test.py -tp 2 + +# C++ test +echo "C++ test" +./build/inference/peft/peft \ + -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 2 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ./inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora \ + -enable-peft \ + --use-full-precision \ + --inference-debugging +# Check alignment +python ./tests/peft/peft_alignment_test.py -tp 2 + +# Print succeess message +echo "" +echo "PEFT tests passed!" +echo "" + +# Cleanup after the test +cleanup
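Note: peft_alignment_test.py above does `from alignment.align_test_utils import *`, and that module is not included in this part of the patch. The following is a hedged sketch of what those helpers could look like, reconstructed only from the call sites above; every name, path, and signature here is an assumption made for illustration, not the actual FlexFlow module.

import os
from dataclasses import dataclass
from enum import Enum

import numpy as np

# Assumed dump locations, mirroring make_debug_dirs() in hf_utils.py above.
cache_root = os.environ.get("FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow"))
hf_path = os.path.join(cache_root, "debug", "huggingface")
ff_path = os.path.join(cache_root, "debug", "flexflow")

class TPType(Enum):
    REPLICATE = 0   # every tensor-parallel shard holds the full tensor
    PARTITION = 1   # the tensor is split along one dimension across shards
    TO_REDUCE = 2   # shards hold partial sums that must be added together

@dataclass
class TensorComparisonIdxs:
    hf_tensor_type: str   # e.g. "input", "output", "input_gradient", "output_gradient"
    ff_tensor_type: str   # e.g. "input", "output", "low_rank_activation", "devkproj"
    hf_tensor_idx: int    # index appended to the HF dump filename
    ff_tensor_idx: int    # index appended to the FF dump filename (may be None)

def load_ff_tensor(path, shape):
    # FlexFlow debug tensors are assumed to be dumped as comma-separated floats.
    return np.loadtxt(path, delimiter=",").reshape(shape)

def are_np_arrays_identical(arrays):
    # Used for replicated tensors: all shards must hold (numerically) the same values.
    return all(np.allclose(arrays[0], a) for a in arrays[1:])

def replace_value(shape, old, new):
    # Swap the first occurrence of `old` (e.g. num_tokens) for `new` (e.g. batch size).
    shape = list(shape)
    if old in shape:
        shape[shape.index(old)] = new
    return shape

def truncate_dimension(tensor, old_size, new_size):
    # Cut a batch-padded dimension of length `old_size` down to `new_size` tokens.
    if old_size == new_size:
        return tensor
    for dim, size in enumerate(tensor.shape):
        if size == old_size:
            return tensor.narrow(dim, 0, new_size)
    return tensor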

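A note on check_step above: with plain SGD and no momentum or weight decay, one optimizer step gives w_finetuned = w_original - lr * grad, so the test recovers the gradient as (w_original - w_finetuned) / lr and compares it against the gradient FlexFlow saved. A minimal sketch of that identity (assuming fp32 tensors and the script's default learning rate of 0.001):

import torch

lr = 0.001
w_original = torch.randn(16, 768)     # stand-in for a LoRA_A weight
grad = torch.randn(16, 768)           # stand-in for its gradient
w_finetuned = w_original - lr * grad  # one plain-SGD step

recovered = (w_original - w_finetuned) / lr
# Cancellation in fp32 limits how exactly the gradient can be recovered, which is
# why the alignment test compares with a small but non-zero tolerance.
assert torch.allclose(recovered, grad, atol=1e-3)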
zEu}G@x)eao8kvMwA_)&l6W-}2 zT^~9&+btLfIH4@nwlskm58iGpZi0cfXQPKbcxTw~(G1n5kI8y2$xp_UpQa`)K1mA6 zjOP+bEo)33>Y-`}r%uBtjDXnp#3X6mWccfiD-fYXGNd|VlYtD}I9JLY3%zNI2qZX- zhHK%hlS_P(@>GQ zp38byZB39=0&Evt(7G{dhJbh;ERME5D^2G)^fl`${K_$3k>OhcRn)61RIwy8wkiZIft{FTUEQ(%Xqxx%j1?6)D?-Jlwa*~|lonsMYXXQw zV!m9(@>~5_-h=@33|(2c6Wxu4&Ux5=t>JB=X;0FXKY8%Z5mxUOJ92{uX~tCE*DUd6 zEVUbeK0$YL-=p)7+||#yd$0H21YKF@uF#CHOwy&i-TGF}pj!qUaCZ>(eGs%#Nf77= zl4u<%|Un6V}*kk0*G7nMlo_QRl=wEJEpc(a(`9vg%6AcW_OtXNQfIFt7%o?C$g zpTWXUvkl{9nrV2 z&@&5i{UmrEo~Ix9Ka7RPQsHSHG1&K~D49ZKSoldCH4V?>sgGu};a(s_86G_caOCVX zJzLmi4~J%JSRn`^fdOx3^H5oIp$qUl!YZu0hIi|?My&L?Xk4~rZI!SjGq_HhfdhBuG> zh=3?Pjpfyo(WllxY9HO{_CtBkf|r7!iFg1^G>*n0+YPGb9jjgeSSBWYR)x0%tIBq~ z@19c-b?<3(DjXL!Ab8?d1RQaV@^Mio^(_Qauvs92|R5~guO~^*;Xs+{CgMV%RRII3YV;BoNMMK<9UWjbq}bBvy(iD z_*H)&5Gv+g?cJiXXl)*0Ik>tZs+jgdae}@rs7ViJo68AH3cTK)XI3y-J;#D(vtD%L z8)sNh_DAOSAyg9#k;tl8!neEv#syoO1;p$;6im}x?{d*zjZUma#IK z(Kmbu3i@-;&q|=_c%E*oLX$^P6&3B<+fK)z=VtHerIuIqB8<8d^Vqp6*U)NPQ4jDu z5Acs&=gO<7i2IZ1mJaUA#hqKPS%1O%-^@doHh0#tn%kzBJ{>jfW=K}ls)c0dz0(9l z8CHc%f;%j<&@ruE{VzX!dq$_(`%m3G3POCRqFwI4+8I|IoDn%*do%f&^(VDHmPW=3b$*`gJJ?Ao77lb)BE5&yqNmJp3Xf#40mVsrcFQb z!Sl>($ybq^`n$M08@9QE;PkEO&W;3n62zXG7D$4}l8XCm5uMl4`WEbWMnOojP`!oz z_Nq5s(>%o_s6CyFp#oRZOJBY7YJOTg#JzJxf@;q~*Ih+Nl2^Ir==y=CFGRe>QEExf zbdf#CTbmQwBP*WHKi|oR+R>maG=Uv-I~Ylwb8-*3Dm*hNUA;ZsT5t(_qwR>`9QKB~ zpSaoHe76Af&T_$J?2W^=Z&eSxO*n@R#B$rxWhYqNwj>^Z6}T(CuQrh@#%@#Mcl$eD z2xmIPhkDN*lzy6JaiOY&N*_Lbcht3Y)O}|3WF*(-Z$N-x$SDBrz6tJaFjl8EwhzT} z^F&-#6f7JTI#ND*scV#6GwMf&__x0EZykNrGU`lwN1lP_Sqs|c-_NF@2llqL7(i-E zk>}~$GFmJ$)uE=n#E|cP{8o`mWJs?K(k%b}DeYQU>S}A%6}T6+F%JQROdB^3Y@DkU zIMs-bwclh*vkTSh(4W({sYAaQxp}sk<;?e_BnMsl8am@D9;ME$!*p1M&^ex4l5XaUC!^m*=QmeQo@2B-!GU^i98^HWCC~ZLb(xKj1G<8%asca1-TF>lE2O)|^QMofPF%6fV?g(#)9?KAwQ{vmZ)I zmp$5bEfV2t6eTvF_B^tweBu4(R^M9rFAaJe0N1Tcb85-9xVme4Z;Ah#>L~HuGj8Q4 zrw8hY? zMS;8dv}Vobn_17|0+yeTJ*p^@@s8vAJ`tX)R)!3+TiPUMmWDgWaZB#&iF=bOpRt#r z`8Yp8OBDKbP^*2IW$KJ(Sm<_JvA$&)%jDpEB9VWJU!|=K`D|jm{`0c6RXhjOpLKon zdsj~5?}iNfG`ov&&C;vpkusRtl#8~F7wvM3M|01VENT-n7x$v60h$@Zof`QZOx*cI zea+`?ADv(}f_FY1zWi-d+JUl=kqQ_+;TTTSE#R5_6Str?i!E>9-}xcDq06GTq^<3!y&Q+j7Cre^0nl7*F*@zI0j?7r*%M!r|68V;zU%P867! z4r%Z=D@cajMX<^~YYvnSPaL_Rdq1>zAqKNs$~_qIs@wzFYod0NyR#4Ee;08i&csaG zflB_=9=e>f`DtX$#8CD*(xJ^cyS+A3cLi6ltgdfrXtz47ePrQetw&4?6%%}I{F#9H zM;WfT;NJ6F27FH172aL-6OiTFFxj;sp}mT+^9iXAZDklxFMQZAFC{wuM1tB#BQ#4b zi^hF8q%hf6QsKfUC-*+F&7s=}i85uu-uvE;Rw=35) z0Vmnv?2jjI+K}Oe)b5TAa8xL95epuqXbCz!##}T_K3AEZwRd;^_|UF*IZw5>f6o5! zYy+;_YOk~&)q^<*dqN%Btyg0c%cqc~YjG=v6`nz_MTcLb94UZPWCcHa)0p-rqNfj+&O2DX32a~%w}p5DJkLyD%#D8));(N zu_?E%l30X$7EITk1S+)B`WRHAgjD>h|l5Oo!HNkPZL zPi&&s?P37*&Pk`;#O9bc%Q>0n3Z7eiap|?FcDbxNGtQq!-7D$(E?IY}PLi^nX!0V^ z>=!D^yCvFS!4py4geYnt6^h@4Vgjjk66at_=6gBQ18Y!wadXy1N}6hrx{=IQpqZf0 z%J_wfHsGXMh7)!RUPj?Q-?;ccsOYEi-PNC8J`DRGRP_G`D*EpK1r=>LadUH(ul?!X z&g+nqw=K3FalX{sb<6VP-JMlOc0K8R`7ruq!NIM5dtdc-KWRAm;AEBGzAwG6o-F>~U8cvlwuJS*2y07mo zBrGf{E+Q-`E+!)-fs>Jzmy=bJms3;PsG_V4*r$4%H4XLk{!u-3-M*%tYP&>gyQKa> zo^EkJ`$Ihi$WueFW1e1zeZ9QAkp^oM=w8t(q@_)}7B@xC?w^w^2GhZp0>-Eq4<#F?+brwJB*iKnh4Mv#&c z4<_66B)fh{-gYu2FN9j$L*4&7f0~mX_d4C%Kf^gG<2vbr8|l*SAN=X*Oz;2i0_r+{ zdiHKX!9NP9*Nbui1eN*dR%UV3XmNU0Nqj-c^P-ZXPbCSXPckQ-T)1AEL@P@H9MpHe zIjFx7)XX0U>Z6+4&ox)?F|x`TtS(0JB;&?IX3qQidtd7F3meu1ROYkV;>KG5f(i(z z^)I^q6+x|PEd&r$M*CxcKW+YvKYh{J@CSd|+12?s0d>0P#r*4M&AoL?z4d^7+TTC$ z2Y>p9eL6V!=0EeN|6`v{zM^|IWpAq}neHnMnjC1Uc=F+G4KOQcUs*ag+z8~WS}V)u z-?8#+uiICZf12!hfn#Z8ybe z9{sz0n#a3tpE~_%pMC`FQ^Ov1K2o#b5BpSk(3?-EfPN6LPg5cPVxOJ|>{Hwi`?Q1? 
zuurXPp^9tvsbUBLuuoeVRNbKr?G|kjf2}q9RBO#X9Y!cktlOvlatO7b_Nmry_Gu(< zH4OJo9PBT*ZlAJ$+NZB)IW%OSe>CBPw5;2w`Ol$%ecC2*C$_h))t=p%`NS!6qb^A2sH~X|2v{*0U+`Dd{?g8x65`0XSv_vjnoy2sGxhAIl zBQSaRr+pd<*r)l;8f*5cSrcHN8f?QcB{Yfpcg(>l8-sQL_Gv0$pDGQhYOdR-+GpPXRZl8XZNL{l}BUDwW>MD(ZeVR%wYQdg!`O`k#BVfH| zp9W}pNO$;ZHYfx3X`lI;eQKT-uNnmEcfh6ouutbL5x#5oY4vaR>0}dNpPJj?I;Ym` zQ$N5yeco&YyIEwOwr-!6>t3lW1nkqqISMlMr+xbI5Bqet{GH-D>c#}~b^A0wlAOP0 zpYC6L6ie<~vrnCV*r#uy|6rdk*Zjpk{lp=T1ng4`lrj{>Kr_+OW?*WW>>LNAHRXqW zS_}@E`qMt;{bir#b8Vw3^GNpq_Nn<_?9-f|_G$6DeTrMNPcI~LqyqM7Zqz^7r$s;P zQ;Q$=sUtOUmF6Mc4KwBIn7b|n*r%y3OcDPz`}70nchZnV$C`a=3wi#-J{2c;Mfd%( zPa`&1{Ar&uf7+*y*X+|>nga@)KkU=1$k>3V8vy%M8n91u>2?8vN5m$uuOzBdf7+*m z(CRh&RF2mduunTuYtf2p_UYK~_NnojecEwg-9GiE!S@38Y56bv)M>*4Yu!E-1?^;`bN(bg72tVg=YIKaczG_UT+4`j>rb^Ka}^V&z}#Q`X<@(^u%< z?9(Y3-hXeO8vkjZ1_SnK@Bf2+>bh>9`v28FeJ+xvtIUB7FxTV%WuLwV?9)$wu}?cp z!vB+fO8gJ@Y1lv8r&oU3r*;Q^+NVqnoZv6}^b=*xK2;=t_+_6){LMby@`ruu{lh-J zgYWv?K6U(8`}6`}pIZH}PtUL0r=t05_NnaufqkmFW}mvq|J6R7h-Ln=PcLNt%|4ag z|GRyfAgF^i5&LPMYW}iMV}IJGcf1Lg0Q=NW`KNvA1lXqufPI?E{$-yM|Flo}f3r_> zJOKN29=dLyn*Wo1`W&!NZJzyUpEmy8J{6s%!H)m1Pi+AE)b~&O^g8kXmwkF1uupTj zK4RcDv)n`L_Nm2R?b9p2*{3$MW>;ppoo2bc*ocYQpZ4h$z&;iDWuKb-i+$?+Pxh%P zxcE2wbUzCW*r(cneOd&~|L^uGVck9@{zv;X_21j4X*}Nq*6dRYz&-H&z8~2a)>EplHr<$`r z?Nb{%cw!Zc&;;z$qCf4^(ZAZK5iIb(+NTi&@c2iL3ATm~;ir9?`pZ5I0_;<7PR(ET z=>@{y?b8eE_NgwJ2Rw(=0_;;ZjB9YsK9yLvPq84FKn`!)x_!!`|7o8}|FBOpezQ-p zh~vNP)BXP&_Nfzn%|6wk|FloFK)>wMQNTVWuiK}LHT!fM`LFirg?0OslUMq8`&9F% zeR}S1_9>Y6FZStI4!}OOVbRX9w(0z^Pboj`(@_wY0$`uAxY|Hm5@bZ89%nBe(+dLi z{%)U=APVdDX>bjPmSZ;ghkZ)sRB%M?0G3&Tz`(NdKkZW%SaIDx)g+%*^zb=13s<44 zsr_c3s^Hh{(|5n@)7;*`!)7!M4_G$7D`;^5c z1cH5Ft=Xpn0IG!N82!UO-OC}wh6w=n=^!jcZ_PecAg$Y{pf&qcz;Vq!#s1Ac9Y+Xy z|7oAv5R|G3n{>#k)&FXrn*QBB&0n`qrT?%``QdB!DFNx+0ba9DIl#J9Zgw0Gm<%Pa z+oyZj=w!e?#s8yy%2=~c+kV-n3grJY`&0n1PtW~}eR}mb`?U9$eHy=RpE~}Fed_uT z_US13H~VzQ-0nBuQmI$oyCx5L&sS5V}O0?3?KVtpN{?2KE3dt?bE5B_Gv1iZVZd`QAL_m z^7aAtsRZxe>{AQCK3(SC57?&#aH1jfT;iI28v2KQD*ZS6l=c_-~(7>I}X|Hfy$rdw!;5rpVIznpBDUSpN`iyjnOSN=`agCRQ0IS1|eQW z{4-xFkFhA%1(0I`7NM`otaf9bKcRVq01d~&-edT_lDQxz@C-Jl9^d9og||zvvU6I; z9FLnDLc1$1i#$*fv)rnctWX*kTea21?zu^&Y+irUB|~Ou9OeiCB^b~K$w4yOa&H}RD=97g#s{E;Gyc09WR)H&Y-zo;J1@vgV6(_Y;6qb~y{Hf%MdFJ^I(C39XzixWCg+~_faiFU2&jE*v;zrE zWTTSrzuvn_;^hozd&5RQqPJbW);qLqAcKY`VtFbEy^oSQO18dD9BfR{bJAj^Q84*AB#+%Juf_b|m`(1>|Gi7G2XfaAzqHf_4YPuMuJLNIC zip6TuQ6VcF$-2O3;r%k(3{6EtQql6;_%d42>7hJI58lzLg|%?5>A_fj2^oDDwzP<$ zd300!fQ{5LCY}H-9_UfvKo;ANLpbaOD1_Q6A|cPgo3!w1whsbUS5 zXQET#>}nKA_VD$bk&a$P79XCb4JFJjztURh707J1xxi$p(dY9 zj(^(rP{PrHX+iIa@EmnFX71lVy4&dUCgf+=Hi?6YpG~qvj;S+a`Mw-~`k6)fq_fJ+ zuoF2x`|Q}mFVXE^0?>=6K1c+s&+mOJ9Fww0ygq*->2uuv#k#^pEvc{POuk-tyJ)_; z=ym*S8d@R~-I#q{I7fZyit(qcx1aJ>zgEd0XjlY-_fu%=P>KYvm+`kK4ra;QPo-^3 zP3^KM$%Xw`&Tq4dF*1mB{`89D^X>TM`s>3K$}(Pb+2HLm;GkANlu*BY_Ry-~u*GR^$3}cmGL}>sg%`o|hprfB{5ShF<1jAO`ht<_$3ffTWbvnD=c2Cn zfPH#@L);GYT{|L*s?CRTUQX>PK3sN|+tpF^b#Z{`i&*cjncwZx)Ask47;Udg;)K-g zmz|;;N~9C;H-FftDq&x-!wCmkr8oL*bO-Fyj;yT__PS!9CX?Hhzgn*rZnAH6{iR_6V_UX48 z;z4SPQD4^eTM(7Y-(=170Q*$CrLs-=s{?kA_EPSeeag}VZ`Ei0v`??zMf|c)o3~|0 zZe%(>@;MUGEH_zeFlQmcYtyduF4w80UL+_apkcYWuw7r$I{n6@baU;GYxXI>b^b0S zzt;TT^e8=v>&5(p*E^OQ=_(b1_@>t7o(^ku(czYn80lggr?yFhj#EmWX|fA{vYdnW7>m`-ar zZ!_gD_G!wReOjLi*r#e<FYK7 zv;weCr4-lf)4f0K(=Xv?PXYGnMVh}VXx%=wT(eK5u22B`bc`~xW}jaAVV?#A_Gynt zgjlo5ntdvw$Ojs4_4w01opk}ty3lxWfPE^qZl5Lr_Gtv=mwgJnCuic}WHE0Lh##;| zAss*M)0$NR&Ji3$#B%Y2*6dTd$C`aw2iT{VIKp@QuuqMC*rz6deR`h@4hpfy;(yww zNq~JS7&{O7VV^Pp`xL)spYn=k_5k)NJwmd-O~MxcmLNgmpe(G}r-aQ+6ABE8JIwVQ zuumbpf7qv+xj=-W4Imgphi3Apefs3D_9+YehkaTDh7V`yDSG^2pLzlI>3hIF6(E95 
z9WgxG9@q`P+o#!jHUx6gFZ+}muutc}Kkd_ULlgs4j3cf}Tv9hYR4C?LY>MHo<;)iAPt+#<-j}G1_pnwkSx!GIhoP%hQR7La1 zP0HJ2T~r%IM@2yU}o~I7UbBERYMjy;}pPMHIw=18}k2NVZAHmAW`EC6~Ygk)QK%QK$3elo8I(5kej? zaA~&=h~^y8+s(#pWzg;N5Ae=h>DGV*iFZG$C4B6;h+N!>HE<*oSX*{d zyj$t9l0^7GxpUqIb1+_ea`$oFUHxMv?g#JRkGSYU4b-_OymdN4Fv4?dwTIE&*)kA=pUdsz2pHcE#r1f^a?b>Kr%Tc*TnoQp40)qV5yHvo{pw zfk3Q97s$c^L4Fs|e78!BD{*$iOFa3M00Z>hS!xce3FDq>O#A+-=?Ewh8V?*YV+0DvdZIRlB{=mis z{iLx`?fVzvCjdDSTg5(Ph3; zI^Pon5~}(9NIc`{p%~Zx>41_aWm@r$^GBfEJ>8k0LtAR~`eLA*< zJboUqPb<||zs>;mY3OXnvMU>LSl0=bOln#12BeXLC%mo@K!Lc86ogssmA~bFh(1fs=ormoS#fhty`uo;%o|$+2LB3Fsn&xW_*aZgDf)75V%yc7sH9H4-&5}VH;>&s08vc}WDWX#D%eFQ%)7`&Aazr8vq(0rK?X8hJ`1 zzXC*4@=YWGAWv@`nKj}r1&!?uPp;5E9JfZEK2_P&l6Mb=eG2W-rNnY`0_16U5#_eJ ze&;%Q8d&ABPM$vOTO&{VY?1waTcr)l2UOO`(;~Sa~w;;5iLTr{LX#`F|nsO6mFN>E*fii*tV_s>#l68!Mb-^w(gx4R$3<-+A=HA{m~*4 zHMd#i!2*DHDor6rhT%Xey4DcK3Kc;E87+{wHjI#4pQ#FSFR*U&F1b4LM81?cIcNoi zK{#k2`CPK8TQitsnIG9OF$~+}K_0Rud-T-44IqQYIls*1u0n&!y77jtoiDh@Z3^xY zL9rr1p!{#Lwy*V@%JmrA6Xhl~a@Zl{*To7TDraj17b3Q{Zm0-EBKtB`3(T?5M*F8l z#L~fR167TPnl>Q&J+s;Hl+5qpA zZr8P=r3>&*Z$o@Uuss>3LA2UrOpFKMor;0qxB=cN`KNd4YvQx!o#IB0-ilVe??Qd_ zt=6>owf3V7`V!!sGSeU_Rsv63M`vY6A4q0CQ%$g<5h)x{qxyx(b@(+zMVFDS zC0}Znr=|t{67axRkCBdqaMebx(kC}{nQ$hKl!JQgIXB0&?r=1CsRr5nWFO$2mXbfo zl8vaWF2NTZwQJs~qP9%?5ARet_NRAh5}~$i1bC-1*Xrj~$nk65DH%>A^Y{>(7Op>7 z_fE|TKfTiyz&mXOaf+;Yr>j~{LV$PLL)Opel>6Q~YsbZvTXrvjyHyJ`8epMaT8_94 zOhn%q(v;B_E9jQdp^wChZOaNT0*TPb_wGeQ8jVCSnmoGCv$L}3u3&*)5JI=T(^rOb zyrZR{)1c)Q{l)g<5s;zOW(+k4Mw3N|M1dMXcCm(>q*tKQi@FAk`+Z~!LC~rb4tBu= z>+`a9HI;crOq%>yMT5(FyCxYNOAb4F8KP^#L8lAs0G(D-3yO{Mtt`LjxsB_iCd{L( zD3uH*0++hOjJY$h@wpS3>&qU4pdQGO1_(hfH?HmBB#UmZGQ}9!KQ;jI3&O}bpyD`3 zJr(J|J0MPCt)iP|ZEqv)(YEIc6)r?FD7Z^c5*G4uce%fO=;kN>u~AO+-IPYx>idbu zOQW1hFRS%aPIO*%iGAWaUhG>|8Gaj*6j{;ekMNF->dZ-W(S7WqSXQ~MCY&7I?4PjP zunvRC^sk9D_qa-dL5SGxbaKCo&s8JRy(bB-5t)7EEpH^sUHdGOR{5vhD|6gSH}gVE z%ddj>>03CyaUbX(avzcBRlzpDyAOGFKB;VCGrQh>?9IT9fPr`4_s!mr939>F@o^s- zwjZ!hPqYsJ_Nmt$9s#HQU$5=A6XlmD?O%46*DVuVn%=+qQlOWRzAVY^6Xl;?+|Omt zR`BK7gY|%xNb<)CaA$k)gi>X`lKX-^c=uhAN#(G={FwClt==H@r)N?g)AiE zZEo*vvCrG`n77q2?E`I-mEr(YGjwO!;*dA`s zeMd?@eCBgZ03>a65hPC?JC!VmwnqkUc;d@`hn9*B%RU@&=+K$l@u4V$_rB7z4e7~b z&(-rAp2Q^Y=Q0_KGxAAl9rf7fd0<0n@>pz2^%LqqZ0dkd#+c7usn~Q>sfYdO#gZqN z4*7)G`(E8QMospmbRG(a@g7I%W=p}YSNrDfI&{P`+1s)lA(a$>Sb#`MfavuAv4;WTPXi>{0wmuCNPP&9UJbya z17)NGWz_@aj015Ff%5wUHy#gE2n$q92~@fssQfTcVJvxB%Ls!L7ZNN?_52x_d48caeh0E6W7R@(HNmX0INAh?l^ryZ#Br~_DS!QI9DsM zrU&HE2S^y}ld)C!w%HiJOt9uS#7X$G_lHv#gSm}waaqt#g^@n{9zK;J^2KeD)A-@3 zb@CK?fqeSRCHP5zJSD^hD?$#HLW94YQc8)j3FF=z91f7D*1D&SV?G}sfXy1YRu8=j z${e3}B*IU+EM9Kp#OVf;Jix|n+{PcE(pk}{29zA<98>+Krsr75q=&gE@Pp_ zP-q5l3cf%+Xp!4%0P-|uw@CB>TBM&y^lnm&A8S!3m^(B$%6|8mhtP9{3g;dvEJ+J< zTUmh*IYuaf&iStf>64%tJ>0m(h&`YMyIA-EfIOWIsuhm(eajtI8tnFJ(UTVG`!oii zMxU<+y?7F0{Pt|+tGVjypB-6?^;YoH!Q9rmpEJn80oPCIdqiJy{e!krk)8t-+tTKD^1Psvubbgkyl0jzwFa0-Jo&lK!cc%H;NNb z>Dic|_T`9Gx{H0-@;;4Wt6BI?+^)hSqw)FD2lV4b4J^TBo5oqWR_UAkm$cXH(|gqh z8KU&65MO%@@5wHKD;7XjJrTd3VMu(ra1N;Wci(rX$nhmdmIicp+}i%|*mwQUQ@u|f zp-#Rx2(}p+N`0htwH2?Xy`>8LRxuabw#2`0p1=Rv)A5zNO9mJE1C3X@PVW!R4mgbs zhbxz2BY5=}X>R+L=U%8@9x;i*rr|kTl^(H}=895pwDw!cnFhD|n| zl`lhi+9}sEsB2{7u#)=Hq8!zPM-P{3@j!6+sNpq!8xKRVueb~+8$J08JMCk<)3~>B zG;cIKv+60I=^3xL@$$hdO6}Wf@08snonXH@XkynKwDH=>-ZKi>XC9PVAB!Ic>{AMl zzG9AlnVEeKV4o`HCFw8ygMAwBZoI6>bRc8m;#pgv*La+6^Oik2Vs<}NU*-PeUYnc1 z)|?0yp181xF(wj+92B`v${jWLzgV&+wCv3b#&d(vcBSUU zyyy)h3j)WAnl_%8lCEJY95ZWI#x)*(-Fr^G`sR=RZ`_Wtn4N7VN zTJ(%zp|w%8p-J>kru|@1wFzRYcDtN)xVcVMZ%3QJiOO2$IlFclYr2?yPL$ft)A#CC zyaf*?(5-?JG0qRG&AYq1bal`7q4%gkeCu 
zEODwsUVg>NRd=Rra^L+84tP%yJaM@YK9{U=Tyd9y-I7PEPY3Cd3BoZU@RqO`g zmV&aF06eG2DxDpgP6%n~!t&-6Zs4-<{qU8{MqxZnog zu`=8JsCoXIBn!1j8Q!Crd6J75dEGiCtB~S4@fWwFltl+^@VYWrJYpm8qJwdMmVsB_ z^y7>^^9c}!>ZGQL`Ep^YFLvIYzc78G)_{`%^E)IUKDl^PS5Dc9@w_hUvCBbhGdarV z3I+Y~M7??jW80Z=ErWdR8`@kz**~x=FPjvFr?;aV?2B*eJl!rydeT&S$d$DQU}#}0CYOthJ?q))lTHO@{e2zN897g zf?6nh`dfrbiK7up`6hcO(wIzKUpLdd zPcQw&M^gvf_L+*!w5W%_RYva)*1i8&E_wv}?WHnTut%=F!vmc7KfJHywU4UTBM z)m_a5*T9Vm&HDyB>eT1<+UyK7GdEs5OpHrKq4QJZp+x3J(8!vHat-#Zo zhV!>ZcpauX1TmHx8Q)`5gd%e4Zx+UA@4C0MPO3~2Is?=5vg=b5&aHCRj8Qt_8;`as zb1d%RJfdilr1K3JfF!V7w(XV0q78^C3(Y+`uI@`lx;A68T-ok+@65NEK`cZfeZvpO zW?vH#4*;2|RT~P2Eazg*@Ty;J=|Fk9$zJ3hc;FVsusx_+?x=~^@U@2=JmbAXERSKO zMtuQ07+J3N5mP7tX1rH8<8}G$u}{}W+QE@W<@oc7jay_>lgscJnODK}n*~iLKHte( z?Nebkp86!y5W}qN8{QSGl;xNZJv$sH3y!;XjRd!F$WND~#1NNclDtqjAw`8m za5-h_80M0xr$QLghM7&9im`E_{bqbmWnEz(oF1k-$S2N3qnm|gn`2!ZaNAd3!F1^C zPl-SlPy)Uv%TA=`hwZEK7=LWBf`4~uN0ltcD`MeErBpO+MK(zBo0-o;YV@P1X;dMl z|6MC4BxJDqdug~aRr*Q@m60ybnKLv|SK}8cdvFD0DThl93Y#*;BzZtc{W5Utkvf$? zOPH~uOsI9Ug49(X<^z7&*hU3&CjT8n{Us0 z-j+j({cxmCb#dFW8lavs3ayzs$XbzRiqXkz)7Uy}2ty(yeWSUFn4P%M6k|(1pq_H7 zHB)-0-$-S#i>Dc~KD9#RH&QH9z|$16b2aZVlj-Wt^_HzDy4DsR&)ByM9QzD#lFSoS}f_0xn-MR;^JpMHfG`$R$(Ld^0g~lh<1h;BJ~sTSsnOG?faaDDu|{hj3_Mxr}zy3O5q^ zZqy<7nwf&G&u%ss6ExwotHajRVRzI%qF72*f+s7j&h~&8ULtrlg9V7;Pl=}D2R$S<{8w=85c8_A5%(i zGX!skV?mr++S|^$)opc)5c4(@5X;}-EiSptp~d20{dS^UwX0dfVQxx^_Y^bcc zenE}IA{d^}sn%F?^&l|mPv!;urThq8RxK1~F4jm!DpL3t>wplmFtrvsRR=Zauvcbi z+Ay`r96H+OTkK4>xS5q$*S@&J*qjwEN#@kLZDzC#9EpjP3v1JD3g<`O{$&o)kOcFo zOJ<9;JgT({E|N+;WOX-&>|&j)AZ7D-c>T6I_l;bx~k89I$%^MkUcZif!dFb?R9c`Ze# zbjzFe)S&K;9O@+;4n0)mJ7St4<8{+yF7?_*D%o)1f3bJpUrqIUpYQ_-kc5zg-XRo` zE=7uT5_)ech$0ArG(|u}qy-S^gd#;yLJ<&=j?xJo5j6C!p$Le8G?kvob)9pcInRC1 z%$oJA`EBM;*lU0H+IxS>`xWWDj%GB;g~6z$a9V~`&N3RHqG}x_M<;{ru7z`a3X0U1 zf?1l>Jmar(V?mos#HcIt+;ZS22^B_h^ z(%37P`av1l0ZqgTY@kIb&@4+2b7@zf3`oqb^x3V_=pziH=#3u~@)U=5I>LzK2!dH0 zvK+-3zV40_ucl19e<(z4ITd^fiIlD@>e21w^Kz9dEuEPc78A{IpLwlG5<_>3p<&cs zcY#n82)-_sXZ8A-gL|}Gl;vW9LRP`6NKDzDAQMVPHR` zOS{C>;x&`vhonlK4lqpbk|N}~>P4JbQWK*l@}v`gG~7*RKIufWbkusUC_NFrGq9kM zOLjr#zPR2bfv&ql`SMy@5QWxZh&MVEZK#t1qEsz9nKwLyEFuIA3N>9K1!S~!sYTH^ z?9B~IaT-&8^f6spM+zs=_*|k%TxaeOBWL>pHb^CgL(iscum%(4Ub>?bygnS=hzpJ6 z42sHsUaX1if+je;oG6Mbra~8?k>%lY*&?5hyz-k^3k0JsxV+FLQX{-X3ZYVcL0*k| z*8L(;OrUZRFANI|4g1u=$0La5aTD*?8c2=kNi5^*yc}W$`sv=CBd+t=P{+0m%e73) zd;O(;smF0unP!7{`GEM3eX5)iQA4FtS1r6cn0LDgBw5q_4&k6+A+|*>}rZ z(%lxA+m|MKL=cNb@94SVL&K@tsn9G{np!raNsJILjApR5u!T2qv0}@rY%;>T5RwWr ziM-Gc(2(q;D?+Q_!h2qIg`}+8Ln1dWch4qn=#{#lNs*;Zt_i;6C>0{rsk%;-S+^QT z3K$~2&@1l3lQ`PFu0EB(W)Y!bQ?dNpSkvo%1>BgFg5{d?sJ7}pmiD|V*v*kytwGWe z!d<8r0VxdL{V^D$Iad7xj?pAV34lA_oeJdGVd#UyMaru*>x2!DB8Nf=f`g@M+W-iM zuVC0(YbXIqDpi{bhv5NgI+f?G@15&+(g(cZ3dD*Eck`ED7x98|5{#|&j5UZ@c4G38 zhgZFx4(d{rBn|-8i5S0~OCchOTUeOS?ldD%1QV&bF9(%G1CB92LUe@jTJ(hMPZK}s zwzY&EuLJFUkUa8;x=kLOmWhYFQr{hgn|t}Ue8caMJLzx$gx84DN2wg$4#hj2lj}FQ zy7xE|EE$6dtk<0bBd}%Ja0k~m-)=(~$;#ngN%JE)TG_fFe-X7Hsrsm@WQ}U%5@7K^+@s3IUT>|R?HG=qNjC3s zig)%gN~Q1J`k#8;rv5HErr{R)pWYft9pb zXTR=YpjP^WIx@6vRbjtW*v7L;Sm&caRHs*GJ?;09Jmt8^C_&3^l&$yOzT&y;w;9C| zQ?~?j4gDj|gCcKUL|5IIywa)hwEO)QnT|5hQFVmr;FCO3aCl#oprM96w)@HgxEpu1 z@tHnI!?DB43?wxL(13N)X%)|^yru8*5igR_K*=RZ1ghY*!Z(23zXHWi0&$T#n~MxG z3DPSQz~4qDK^QZ1OoZSvj7g_4#`^Ht`KE&7F5}A}jctN|302ZeI2u@|_1;x8ji8y_ zh-SggpX+9=13-!B+~Y3xerk;$@>9_|{6Zc4Kr`-#kz$3rseY!RfmF<;Xk1NS{)MJW z@{w*7S6fH{O!)m3`|j#y9m`0H3iqdde{G(+xh}KpTw?d^jgAi@212D+)E(^xS_ai9 zRdkeU<1w%?zPT~)aZ~1TQ#|>&v9_zB_PF`IYRhP|=0%MB*l`PmKixZSd-J%t?{VXC zbJJ3D+Z)vmgc^a|+@jRdsoK)O-9l(S?i32{(Nt@?sn&y0YcUS)A{?I 
zcd4q99?R#pwEk)CEo|wnJ?VQG-1h$D<81JUwBUa8mciO)UJ7|i{4bJQ0x2E1z_l6W$TK3+tRA~vPA3hpVn`M>MN0<GGhts{9D`Hje)* ztaMmv=q|X4$~S3$hq?6sm9S4)XFR7H&!G^#&tXl8L=~|tbBXpO@S z>~2SOoJ?dlL;v#_u4l3Ply<#o-mOHHdXN4$+LjsL)*h@)WVR%Ud~3*Xn@v_7Rq4r5 zke+-m(LS?Ty)QA{?k3T;Hj5P2iG}g&^!N~XkJhslK*NJ0{L4Po z_MiEI=mej;PGL_6vyoje1qzWFcIJufM(WO)C@|}s9G)S#<9eJS$NS72Ki8L;&8OQtkqOC zRM08}TgwiJ8n+Sk`o%ou*!nTec5$rV4Sjn+ zH<1)Ap36xEMR%r;8M_4z1LE$T@31jpuy~7(be`~5`WS}{s^ihQxTXyyGkE=a4-I;# z^XH}?UD7bgoj3HLkuN4N!RvJK0pe#I%dUk_vX(*aHr`Rz);G-JJ<&^SYW~zHSf1Hd=h>i(klU!wdj;Z{SZOS- z5A|~D-JiOBm9o>|-5#<(Hwt7?aRaTKd(zYX;2AT+mgu*exo@Hkfn+6EkkVk@P)nC< zT--yxVPDr5F5|wv!rj2~({I`9hN|bQp`sV3EdTuGmJA8F^&Rk($5#~cs@|HA56V3V zZIlV6V!_{kt_@~Q2V)^#F~=MSykE5SVSL+K*@3bxO-`OyEbT3u2J$tq?n^$BR;JKy zQW?A~8eAbx^Ui?~*L1l9NK)y2Gp6*0*L&Oe0%2xUtxEK_0x|{^@fGl646M-7$-hkC z_XMRR-zeHrUm--O#n|dftn^-osP(hyv*}#$rHdMe6h0VO5dWDjtKu3A=gKea{ekSf zvvf4h{ymV-ZEPW~$Lf}+sBE&`{!jHM1U){8ox#M#LLITvF0y# z;_ps>+(5~gm8XwNYzgFD16X8zu*s3&mo;|NvjRp?PrRzTpWu8QnlnANqS!cX`j>rr zyUz7*`!u@t`Q%^rsof|4cg!z$mRFR&ahk1-$>;ps7s<)JU!#x5@eT|YidKaquzdT@ zipJWKd1A)A2}^mG0l1v>=*}3kI#X8GDc!>&OF^IX0#oauOCrO-1lKxuA#j$Cs<;&c zZmcL|WDS$Z*b*ziqI5b1`r$i@2<6e|KT!|IM};GQw#H-PZNRTLR#4B&qfJ*7hsIMB z#uL9r}f{zOsr zn0}gpi|ebI>7Yx%Iw7V_u;E9$&jr7W1g^ckYVSGoxtC3+_J*kNjW=9wfwTrqLtgjp z9Sbe&PEb&iiu?MiA1{{{b0$9-62n>CNc>*R5iOOZ5gNqffm(oU;iT2#P>~lmy3F?a zft*|dV`#pr?C0akbFrstBGFBhbfP#s}c-c8)occPg)&SM(;5^*3F zzch$&9k%zLf$mFT*#z^N&NsW4{e%+puZ*WFYrXC$mt9l2spJXx^dn_Ye64&%F;n#O z`m|r(x8qf%3wMYQCcJo=K70h7Ya%{``v;-ky6Pnphrwn37_teN>pd-pfkhX~dUejm zJ6CoOe6)a>lZ#Z#BCE$X-MM07jHv-xkiXzl5C63g2HmCd^751$V1wbUz?NL)r zKr`g~42t(G{HcgI)*);k!(8W>%H_X-kZy(tP^qEZCtGYLlOp981#$6L-t_yf*4M7N zUM`;O+G6xLn12;=r|J7!zQ>PQzy8!vYMY3EBMQFOUqeKI~^}o)+$O~%1Zfh_rRGg_E(=+ zxVK-fbp${5m%eoRTkZ#>tP=hCKDv{gM3)$6Gh z^HUGx*;cq=!`E-am9)0QEA*MC8)T-dpJ!&17Hc%+w$@*&wyvKAM6IfCF6TGT{(L!k zAC%lh#nm!Tk2v@x`Q<0IW{Zby%folf4kY5~#Rp_k^43;^f(yP|`Xz_|jvQtpJxkZq zcZjQ?HhQ(({gil48~*lZ%UaV18m2l6lcE6>xO@AlNQC32C@GS~`ARDLnh%3-%OYaS zdy#p`ts;&+^z|Dbze|q!_cwxzbyC`!e_!qj`FWG>q>M3a`a^reR^ItjE?dpc*?XY} zLJntB=G(6dABFBOg`ItQd-mnC+TsNOO%l22`&nn@_E=oyQ5m1lMpGd0+sVQQTa5FV zBgY)&f5z;WEdp}geEQafDKzdkc^?ZITj0W>{`g~%xbVIrPlN$hVa0K#xWmwFfwdnH z>pGO&`N@9(Mc)7Xxvb2@_K6X3fx`v^jq#~Xu;@(nfhgc1FJwCKNJx*6KyelH?RLu1 z26%cv^&JkvkIF7|Q|Ss0B4W`b-htnZYlm1yAx{nP zZN~J)a!R=)@|w&SljW4x01$t(fZy6&sojbWJBp>E%2uWfx2zT2(iF$TVFXVmHwLlX?8{lDW7F|GFw~Ec>vN+IL-r!BMEJbv7XWO zZ~~sq3&Z0AP;%*@-Ni5|3Me}-@i|%`Vew3YHpsh4JZoC?xl*xQ+VnFhz8Yg>KZ<$8 zRhe9yR?A_CDkH%|KMK5H6fxi5w!N(zLf1QEt!inbXIH1^oj$}zrGME*pEFtiRT=!b znC?#iTZ4~u3A0}EMS~mry73qFE?+dv_AzWFYCLiO<02J~_J*!#2S|57Z2c6W7nQF2 zX&mHh-(KRoo2)QKU<=qQX>nxV(gvOiAk47*a|?S8PC7lsI@RUeg=yh5ckE04LH@l`{94vFuJ`iyY{{u@Ak zwS2`?CwMwR>lDjUp~Xz}E+TLwzT& zbo^+Ms;(ncJ9L>F+5qLo>+$J^s`2)je)j4QYFjQ}fAP9D;LY`VYlG4HluzR#n?5#| z#l)RuLiZOGAIrEL2_UJl3dD-63(T-|7B2#Og(G8dJ-A$(y}N_I_%DoB0dU2U zA)U^`5&2u|R_3_rsWyWg6^9kww|?yjv^TTKC;kdni`r*5>?dyDWOJq1r?(Xy7~J`M z%9UclJZ1D?v0hebDy29S(@BR(BnfJu?R+Z0YT(GI7OfEDMe$5G%lOVr(byL+X)d5)%Zq6nNLqnI97RrtMoLg0%6ye6HZDlSdayZ3C6-d;7Jql5aB|gbodH`u?#v z@D8M6((dTKl5eKM^@f{nLcGLymVyAyEi7aU`}B5~xV=7au(|2a1s26D|8d2rG~3wB ztXTHZ;P@vsUzQZ-GjY`S{q|h=sj0HJF|=`7kZW3YheG^o;R%(>O5ynlqwgQ5G{nwM zC4S{d`u02ayl3l-@g*-R$Y2)kfT}8P#B7@iW|FTt(&6;lSEEHKiPJ*z zrCn^uJ+o*1SO_jW^+_Se#fzH6Ufd*Y3a4GVes;R$WJ+SxtgSOgS-rAaiUY%URug4D zY$2sc1J5wfh;p*i;7p%xNDt%8isW=P&o-!$j?L;|3lLzB6XGLFH_l#h7wwqf-w#mg zM@24mu@DG1A3eJLWICsvGneyOR&3N^#R5z2=a@*L%cFMB*xpp%*-~Po2i_FWeBGd| zMit$d9odzgcRZb^?VFqE!k(4S?Q5+Z5CCL!E5g-}~tW$K;$tFip+SCAn~YoLXoRRP+>IOPc{80}c0 
zDP55vS7~HWk!h1b#Utf?$DFI~e4-1Yzm!>6^R@2>DN}c(QA%1*xZc!0FN(c}R@6>h zrGngISLVQ$&UaJ(cugS@(Om=3D%V#8XAS*I?bhH`6Kk-{!Ln;bT z-pQ<9+DKRmC-RMg0i>R)hTChi${qNG_gKBun0|fc-JeG_k7wVDJx)3gqHPU_ft=Xq zoS{dJ`Lwaxl^MYsXsL*O3W%}5qQ&K6LKAXr&N&K{sg7`y^{q9;o@4A zY6rU-%a|kkcGh#ZLf%1qi<=O?*)Js=H)KVni!4T4^XqYN4kZYcM3?K%V0Hp>{o6jB znoUvu+dj?CYLCbK04*FkbijEeCU>3K2(y)^3&!f7L8{es?KVtj;rjvF`QAdI7#_u@ zj@Kd1f7_?If7z!U|6!lrrr4*bcHY>(?NhZ_zg%UCeVQi4AMMLG!x#`su}@n%?3_O@ zTw@50aR{EDX+`jA?RwdW!kv#K2QWf_Qb%ADxe=*BT@9 ze7I@)!EULaQr;%lg~JQfx@+&U6FLS(c_07UhZM{YdgU{`41YV03xTpYT{zb7^Zc_k5uNVLo{^~FL zluJGCqL_X>OO8qiv>soDwtRD~HBLbS<7iZrse1WCddEwKgiEz6L+2KZl&QG( zxf0~5w7?8UgUZcw*PKr{(`z@T>&Wnm1l9Lj^NkpR9pB{l+Y3z>CN+}OE?G8Q$N^?) z1(o=XzM*m?f$bZAZ_a6d5urqTsOT+x%();6hP>F@n7yj-MZN80S(;|59f7BI$N%|h z%;5&0teTI7hwSmjpxnMTl1p-!4`jejcaLz`C}Bp2m<8=^N+0eiUnuxijDb{^fOCR@ z^{u$5#?fa|;%!r3q6I%0I43H8Y2b~O+3FXH9z$Vul%BEH8Yb&Og@uxgq(8W%nN<`m zr8xZ(y8Vo~HKLnk&txKzJs|4VxdcNEL)UEAmzSK=ovwY0mGEr)fD<&R>-g;A;stSw zbuReemKS>V8c+Mk<69zUB%cvd(;Jl^qxomKa7z6eDsJIrxI>(K@$;G@kyp&J=vO6? zpX6znDgl0pwfMKs#-0Qt4)XwrE zf|Gg&IH>*zWIw0d;y{k~eiwRVw22<@_q?sSV?bSmN}Df?sRmAWN;ae~lzF%8e)=fz zev8i~`qCe+*PkUE0GDWPAL1Cr<;$X)e%{(1`RLOH|Ll6N6WF*I+pPz$EmcYVwi73C zcG34UOWcW5^(}hM1m6Mi64vUTgTTTmp0;$g359yI-sGl>IbNFs8f4hMrPNOo4l(}g zDbQgUZ3%MJ^wQfpo%3|255}(`q;^#9GSMc(ET3kYmjhEQ6VNW2ECJeXVZ6g*SK1kr z25iqKBr`u5PVLn)M2wlqdHFT+XQGMku0k@jsykT)=B?dtrV zj&!5ieT0lP(di@e`qW4-^P}bbPwJDUDGXR}Cw*8TT8L#q69DZFD;!NN0CglN$^gTT zmTpo5*Ghl4{^ZL+PD|q~N_uXXKNgmB%5GSdzIW2eZEKrwsFJDi>+N*;k7J!)c zU?=}m2{5mWDA*B+;91%2LqL2J&x2z*({}s#vpO){kmZ;&e7+{gDc%$}$Zt*<{i=7T zQzmX$Pt>2Bc|O6hw7dTn>)2u4UP$f+m>g%MhA#AatxLpwMzi++JyhG7y^T%vOui zXMDYecwmwrRB|UuOSm7S#S|46(7hz0xC6!B`4FVeO|ehE7s)-BMW!A`ORmMR`T$n> z9F%VqhEr*Vbvr|CM*3K23=J?}e6bGwMOtzBv>`E_5;aAyG`-w)U}d{}dqogCS@3tdU2fdRy(8XM8G1&KqH^ar!ULJO zhQT*8_HwOlR{PqfOWlr~(*!q)6cBM1pxD$L(`mX+9_r*WuSHkLD4pCN4s?fryf{w0 zU!APzfsB_M;~zCCH8h{ed=hHPGh1cEbxn)(B8+=e#I=#uN8K5UO09bbt5oOz-r`puAMpob>Il<9J^l1Pjqel1 zBOuMJZKc-S51eKleG9At%6PFVZqq2RQ^@WKZ4A*dVmtFawHUS7RM{5;FoEem%4`tWA)?jgV9Uj74R99eniK^qItHX*{~1b` zR|vMc z@-xHOb9MQ{E!2!3Bk6=4>q7iazCR`A2R*#2fS$_7vg|*an4#I6T;8#W*k2liQJG{Gc*>@-zV5=j-0`@$zP97}vYcqMen}Ovu4F#J*7?u$oW^V2=l8w&66NOx zq$6K8CvHWa+086RTPpSAh6Cg|C%90)O+VFp^=+egccU%&jzdk=|X!ZnB4;o{tb}2m{-4tUitS@i08D1Ix08WfSFua(K#Rg9*)k12u9UMH({f1A4H}v_*%w9*nmT!g;e2F z5jOoCkJlpEGh$pCVi;{Xs8NZT3oV+>FX5MbQ_d_fnHz{2PFX%@T2XB6PF1PL7^isxw2NaKJF;q;Wi zy>tRCFNxaFQNM(wS)v6A^o3^oFGjMqM*@Q@Zm05KI_8E0y&fbzA#7n zY$A9^OYI$zMuR|IOQcPAgh&&q47cfeEMO5N_#d1~<~aB!SMe5(KF+a-%kM>FxM3Fw z-h+q!bfdPff>9jS0!Ml@iuw|PHXi_6#v!_@ASE*N^8naE5%{n>({Ulwx*^qKr)ZoO zZuiJAjs%}aLVIq}Zvm2`krD&$;B+E52T3b}1J-VnljiZzL>y291v&-5o_c@>eHdN@ zLG!rEsyh({RgeIi%#t8zBQtAGc6N(Xq|*8H?&|Un&x@>l5%2Ic2H|Ne`S5Z*w*A+v zmAJxT6-MKAM4ThIavPe1L`2}|kE&pXs0<$xmZb?%gR4x@GG|~ z!$`$f1XL5X7`E_;c`Zl}fL@wJeM_r?fCBB#fFrh{=Vust_MnG0<>@%O$3~$~7BUYF z84tDSx=^Ym0GPov!y^EUgok=KQl7;8oo$_y8F0E5UH1~qi3F7Ofvq9op5K9nGKh68 zx@J6F8&oT2TzleJyEay%m`2A_My4%sH0auX&PY{AEGu7mm%LD2J_N1sEPy`>2d#B; zlMUhCKi=K@@s83ll(_b;OQ6Cch#~ViLYw7<7m_w?hALqS+ykgTSVtUr6i4M$tUH3= z$sj6;v_N~-Lkrj=5bPHYQHg`Z4`kc`yunVxVBd=eadi2VKtzuU0uIvS10A$wTQjI( z(T00(J3FbU+us6@7{J2-3hxIhdmQQWan*+exS=Dpyiu+plCz7@yaa`vRyDWAo4rFp zaK5EQvnlCS^vMJDjz8*+Y-zuKw#*7 z=EDykw=>*F938)1nI)@*`9MiB_|$FaJ4gCI+hzGEh)Wgh1`f!J#75dT96qPBJ(i7& zudKy`bMWBb@vJ+L+Bkr2p%9@3PsNL(ZgGd45pWrX42i~hlQ{Zj8CZ!A!`t|{w;atZ z3vhoVXDtfSY|*+!ptacV8eVSZ$8^P(fpNQZH+=kh+_MQN@ULzCW+KAS1Zm7u%r^sm z?+ot9Z=IWM{i+(rxEKg1*RXbc0Th5Z08)OW(akg=DcWuXzVA#65+n&mqv8H|8U{mn 
zBA&(@iEt7JQzAmWT96-iq4Qej1|2JU@DvA@J|&lOX|NjvYG4P9s}RwHT)C^XR=!)!EJ%XB20lI&Dp#A{ex1=g*Qu8_znvJ8M$7Q956M8JVlybW&j5(wq z`bBcG*6u;ajPFOJ61%&8Afkz2D@ke-pN_hk=JuA_lklOL z!J#j~h-f6VWSfrZQg3ie>s9Txq>ay;3%ybDeODN3;2jzuf!ag+^A5-)k8M`=LBtl$ z^7-*+T3LyexZ#zuA&%BzjzmLFh7keYk^07Jk17b6p0T}<*4n2(+p!`VPb+N-aUcP$ ziL}yE&_~;#Yq+5}JdHVtCddmMhoei~ewEfS`00eDq#XRUh$fpjruss9#fVI2y$y=S z4|!~ZdeC&K_`Y-^tsTRdJq~!oQn~<7Yk;TqMiDA4#;;0E^z1?dwm~L7^{GDfcTL}4 zuI^>soQ$f5^bo-$LJB4FGXnn%VpncLOqus73-l$XGfqjX9uAX2 z-kH6euWmD`!bm{P+!)8Drg$5jvatBhR5HwJTqeFWo!w07bI_uVsG{%Dq8+S)c%Y20 zX+aEG5X&+&N|HCyq&Nq7M+sFbPV1@!@0lEc)r-w(0v7CLu% zpdMB9@4`XvislkJ<`OM(&Z_7QJiMrf89aQ>1D)niLWB?TUwPn*0(V)FV&HF^6XO`# zF~{n@ebCY_T{B<_H$sZ?%bVSrn07-o0{C$lw@V(Dcm3wwQUUm~l%Al)%Y zPrpPpDR4pfP^gzNL5tSKB4Kcw?wXbyNE&?7%}sR!iGRtFji(Ko%L@(G|6^emf#f-? zg0zPg1ewqg=2u6KECTxxt~_EU1e3)(l|V*SlP<*ISZo&{0%7-B0-^TwS)km!r2T?!N z&DE*zGZcF~yE1$sf(S0mo~cXSH~xvx%q7!rg`X$jn&0|1Rm#AUidf@k#?J_dva0SK z6f{EXuZ-VX=byl8$^0BNnOoqk-N2gmu?_qIB@-a8qrgH%t zS?tVac?-m$z+`4ES93Caf4LKozm~2UkuwG;6sFLi3XNwlVGhehomdY!Ss`mlK8!s> z^Ed6(^PgqHCP7TWzls={4X+u!+<*A<`3JGRgL=m9at3*O@0T$)=dDc@BmTTx9{9D) zS9{(++PzWFQe4+HLhuXb7CnbTSh_=g(Q@T*`VD43Gu<3TDJE@)e(=oY1eko}*`j;C z|E8guuXXf`0qgzPYn@NdxgHteve_L8{hW#Y%Y2RJBNh#XOM16mKfO>7qvgG}e`n;C zX`bGTYriPdSU1|E)mh~_EIb0H(FGUM1S1S~ooH4+UuiQ918ZcjCiADp^dx^S+`IfD zbDU6|YqW*PTCsL|OLk5^cq*Z-5*={60GO1}etsQ9qu;6)C~bIzi<$XF%oCPt-q%&5 z5j;G}zn6daB>Puw>!la zUvIvHy2TIicMCGf61?UXaOw{?n3iKp!2^=U3X?9irb=I<7SEVl}-}pV) zmsG=cBxMq&Xjm@tElZ-5_~<37FVCygUWyx&H8sqC;ibpeu%U#Mte~9eG&6@!x$pPZ zF;Ss47Q>Yf9-qnbPxF^P*a)EOF;9@MKFfPd?3Hk+EM`Zbj9B)E&@t)4ed2KSef<|= z9vj>GVF4=D46JrAU2P*pr(WeS7J(17-0@{N{B=b!(ae~YVd@!9(9Y8$oo$IR_yuWhFkF$dX=3=t9bM%ZmK{@ zUexX(#Z9f^u^GnXSf|77DM^(hW2DF5NY}H#gZ$4CcbDIUpGyG^S)PiqwvFyPp*3Pr{`qUrJKU}@>uzZ{83cb`?KOK+M z%c>r)bjZ1W;1l;2+qEfl0EEM-FF;9m;U=Guumza7o>MJ~4?styNFdd(<%Y^jJNMk{WM z2*A>yJ7`h!>BsE-+pYF#tGtS@k8rn?O9!T(r-w%oDXbtZ)5~8yr2# zdU6@V32wPiO%1jbg$xls^0e-!D?){c+m)g$AK===j92Oox`L-`g*v;k{r9w^E}r6r zs%HB2R5Cr+Wv@+GG)}IF^ria!2Q!ie=oDu>$sIU;fcG+Z58Y16Jy0nJ?>@?nyg>`9xKtL@vwNJ zVpG62UaGI_VfC!ZrdVQr~pBB4GT@O&U+z9+fLK8RO+{ zx}Mj@o36ZjH~#kC6VL14RIWBnj8{DD@pSlq_UYtQuTt;*qzC^e>{Igp4fg3djr(y| z=N8%f0(%wO?k7E*TjHmC)UT`Io0=&-w|rjz(ZJ<4-;DQj-!43TG51ic;UB zPY>GsUjCU|)uam|KGEsgmNN{}oR)HZiv`HMn4v{0})*{t8as zHi{yrD%uCz+uOTbzw!?`Rn_S}g-!K#zVYvJ>c7}jZ*T91-mVlj)#&cSp?fkz_vNqn z=KK2jh5C7!J-B5OU`CNsKL@DXc=+mHX6m(|B#N1OEyT|;B==Uxo8b`5X4pmh@Vo!O zQ?Er_{h#5fcVp`R##23fV{0gWYHDocKm1e*p6VX&@0}1!@l&yhY5%}eU6YOemQ&4w zQ{Hc(#GIIYVr~W54H9of~H#hh3OXtv+Uh^;gcV6DE$V(^X`Gw-$ z3JMB$3p}3|hDQ{6eJu)(F24V)xO=bImsAq|yp+&T`XcW2gN3rFv^U9RZ{GZkr#|{0 z%Bd7KwYIkQZ#gv@U-#l)cq&Cs&3~U#{J!#^erjXVfA&*b(_gne{|BM^x1Tyf!BhVw zr@raTt?b5k_H_TdpGv_~DROG}!1%x9)c%hjn+D%@54A1~5q=NVw0^4j+fRM>?|$mw zXwT^A=)e5bA7kD7W37i{O~Vr(cP6^}C)@trPo0__{htu3pB5Ir{3k;7%gWT^%I4b2 z%I}q#lhvvJF`>Hk?eOQ?`u6JS_Uh*D`tk0@&abV#-@7NjcYh!LIzIeOCX@g2gUNtX z2#-M(z9SOMEPlPeD!=O~QphORpt_(Zo{>HY}V}79i z&19uxR~)bLyYlJUJELVcKE8W9``%}1;Dzz~iuvZiz4iHz?<>EyhXd*POd6^dx?-3m z90nVzm-=u*#xG48YrcKVlK1>F*jT&z8IOtQGi|E-K3Z!1#$l+b{{JMQI==bugzB;= z{GWts%s&a$+y73e;#iTc%J?X@fVKQk4(b?cgm8wzBD2UYjv68TUkFua7==(3LYV(| zLUoZss491pis+^B{~}Zc7mARw_^MaPE4G%jaBBzmmF%7&1EDCJg6}Ii4G$=Us=$8` zs&I?noYZvZ{vuS*`5-BTYTMt0s&FEOP!;-%P!;-%P_6way8aJBb#FlOpM}kP&R{#3E^11yBe=Toi=v3WZSR&hjdv5UK(HAXHP3 zJS|MaqW?jtaw6)XV@dRiThxCOs)&ChRGR@)J=|8ke-o-=6hgIlJ35yK_7|Z#jRnae zhdTd1gsLxvP%UiPp(~VoY)Re7yUjS#tKKp|9<)J*n87kPH|TFwZC z`;uMC|3;|FQwY_mmzI>q27+5GeZhP+ zYb#=J*zkfn@#{U%mU%<-RCR+^p|W5i@3fxAC6P*_B$LfB$D=r*^}5l+-GcZ?2=}dH zAp{3c1F2u5@1c0e$EP|Un?{^7CEFZQi$>s>ES}v#jF`C;l8HJ=5Q(4;SokbJeHXKp 
z2hb2Wt=a}4$;#mhfd+gm;q=LVHh@ z!7P<9U#-vs4yN$IZ(#UETxe|;Z@t-h6bq>f&LwQm!2OGX>QulL5H$XT42S}m;MXBM zRTM(?VjO@=T+7xtKYUO@it1VTW%}}$a9~tK7nQh{z+Z&wlLV?KT2a8~x_=X@vD@K* zfedCkaO2MOu=HGg(-}wsS z5|L&ZQpA5-v}+Yq0B7|DK+1QgZ`<$NGx-LBkcn?108oA8a=WUw>G za86hGw&;)t*gv5X-xutY)jiti=;m9~r9o?{?y}#N=)%+~Jq18Lm>W%dX!89!jsq`< zTn+~AoC{}}z-cjoo=zCQ(%jyGs!DFBcnVE+stx(^@>%6~GL{5qG-^nSlFc0)RT7wKB)(}>6o*t)u0lLe+3LCXx9KbK>< zx+uNMHq?;KPFqon0!>>k zRJc@kG4MVcodw}Ki!z2+3}8dUKTVLmmBCjU*)``z?Kpl!tUjsm zr_#B~x@*)p^g#NIb^`6762Sl{EV=kZOdx6i*f#W_*Oh(*eS8>cGMX!3oMKVh=1X$= zRWBJ%lKm9j-vz%N$a*GscsY;njr-iM3Noz;GNA+E*&BB?YPcVpH-wA?MV0Pfi;A>5 z?Td7&`QfQ+FHO$yjnLWLdgy1oTwnSj>#YA)quy0a7aORNdRlhapl2BPJ+lcYa1~aq zYFp@$+N!dDb2h39aq8;Hy`S>5dc$ojw^g3(yP-?F&@BJ~#9{`LP|s>Az|Ibx7ih~z zeo9Bh)^qdA*I9p|i!OVWgxn728&x4{oTWQTtg-W%^}NYFi!p}OZzrg$IctyQx0X|_ zJxyk*4?Pw`o?pD%AOHJ#p}^@P^Z9Jx&iX}$&&pE3*`OQ8DCSlzCGC*ZJ)$zHx)J(J z$_d^z?j&7P_gOGW;2FTvo&k4bwnOT_v3KXuQ2+nm_+PUyV{BvJ8T-!IcQcG-WQjIY zB0Gf=k|bu#WZ#pJWUCNmiOAA0LXssV$ugECNrXt4Yu@kg=lA`cb6w|Le_ZGK^ZN63 zUVps)c%A2Yz0Tu#f8K9gC56(;sunTNr9WRiY?Vsir)8~^FlRTS-hA5De#Th9oQI>a z741#12aDhcP1k7)hnuCVs7JAP?w*reg_$Ex#`YS1o!JIY|&oH5S)Ox`ZcS*YXnO&VJ2`<;P!o zv3Q}JfBL214{%&c^p4YlnOb@0nX~0d7VrV!CA$hbIW7!TH*x6x{ClIdbRBr`U8kCZ zwCoG%&{L<6^SD2WJd)3Uz5B7SCgshcGSr=(rO8w5`PV3Z7QbppOta(8ODL(9<=_6^ zVxQH_S_!?hd-k3Y{ffhkJ1T!SW;uB#JpAA+s#eJJ}5)YAqu9frsK4o^SJHSNR;TVYxws=*A zK&dZ13yFtfC~WTQPr3iDlb2|csJQxnwy4BsUW1#fa4|kDb+10H4`aM zc2nwvQyUCZpAk}{NGwfe}{7) zMS`<;PX1vQs%`6^eTVvFh`h99P;(Ppxg@tsmG}6XedHOp#t4g1BvQWWBAy6dVXPxG zvi>1dw;<|=Euud|k^fDo;saRn6^_PbfirwB!NM(_{LC(gdmX7Gu2vv3dLUFqGqSev3ihl{! zg%B=Zhl6!QFq2T##u-)=x_e%Vdv6)^jr-D)>fpR8q7nP8;cm!+y3Ur=oNUpJDs9rX zdm{C0g8(EEi>VCD&*}n8qL3yd`J4?w*}`!@ zP{#F-puA`%LsU>s#SxglgW0p&y>g0|Mo_J%)%VWB+K=0OIu_7nR3zUT_S|1>h|bME zVAAPrJn*YX{+k-pPsLM9BI%A=*fK=_y^MeTRIZZzFLe-Erk`2|9jJqdpxD!1ga6~F zVv9{C%fUS5Ix5ILBC9MFSUrsl&jP7oi?4?kAFdbwL_=hqxsIu@2hu?ubnfFS$Wti~ z&P=8pWP#=X^;0#nuF8i7M$*}ovecDF9y-6{j`)OvuAERpajw&}RG+g;@;wS);U>ki z_41Z}v&48Zj=MNna!Y3glT{w&(^XsEm9lP>ic?j(Si)WiTz(WF*MZ7@54DuSN$B z(cnN75-8!8EMZDCAinh)4m!xB1Q-^rJ!C?q(Y2o@Q`(#@v;Z|p+2ExHZhkT}w}E|; z7t-Ir9v3eyD4XtkTxtE5y6`cdpGTN2x-t<3Ro{580P$gVdBN(sh_xeD-Tk0DDM75N z$e~{<8S9F*TFbS^0}N@4zu9rR=9ae^6?0Og`w35 zzTBYP)}W}0{C*xO0z6?Y;TTb=wGp`w)&dKlkYMaHx^b=EUU7YXnu~%X>vF2*jxU=R z(L2JatKYoc5J|Ypvn!M#-6s4-6qHrf5IE9&?;D_Nj5Ks>G_Y?B{@#4D?%5&sF)Ij^ z#r8PY`Pm-Og#TL;PUrp!0AElo6xF7tWv#Aj&HQEDV4&%XCt$Y6waE7TiV2eZx%Du# zwZM(5k_y-%fY&NW`S08VBV{Iaki9IBvk5FZsAUiRk5BtltgX?l?U~`_UZqyPW1uf9 z+~FlaeROe=J(9;0xUF1k@Vy;;x&5_jBiI>)aYOzDKnN5ts=^LtcmgLqTR8v_Yh4Sg zC(yCNO;_z)4C?%*-2Usa6uy;bJGF(~CAT;zm!o=F$5%Zbh$#J$o;#(%jp9Jx%+* zkJ5fCK%B2J^9W(mxz7x>wTCan4fiDlBr}QV-RFf)O%x}rn1>%jqS%yap9d` zB^SM(sqe?Ea$etOGhgNOL~@)OVW~dB+V076DRjWx6p>Ho>_RzqIdd+d5$?{=OF+*p zwVt-G7g^`Hz9I+rQ9SaM+|otJ9pvEGQ0~I1!LqK`m^6*nV6LD=a3TtNDXUxPip#Jw zIP@%Q)%q<{6dULY{Anl+)BeVC%2v^bHIxpE4dvp==C<)fW+U0SEI8K1S@Y zQwJXh5OpYIXB|8f%K;{_$1)Xa?C{z;JTr^?Aar;Ph1^H7xgNl6UUNSJn75E}H{M+c z5gFO1sX4w)D;i<{@wU+!e4oy#vdN~gNK+z?d_Zvqta8eu5oVk{1Mf7&rbiC}cE>vS zk##tM&YfAp+Fb`PpmRS&vX0kr^k9emL)oJ}5r>|f38=|K`lk~0$pkuV+>`Ya8hO5B zyg@=F3Or&zlHd1Vw-=zu2i><-N0eQF1Mah-*U zR_}O1*Xkzjq_d2pkz0%@&O;>oZrZ1Z2h(#BYnL0k5V9 z$FOH-Emk@ALbyH?XXDg4EwTM?-y%mi+3T=evCh-u$WMn^YqC~#(8*tC=YOWpJDUsROQwRRX3;}a*8m<^@xll7Pc{Io zu8zKgg*g&gmfy};7Z**F24cR=H`7u69OV(hE9y+O@hKik0S5-q9 zwy)jvqJep7DvRJH=XKaiH8v>}JNE`zdIMy+K@Qshd#{7brfxjx0-uv`iuqz4^K)17 zqXvWVOHxx<9kYq!*i_|~Fv#4*)@Gm&z*>$;-@k<2|tByUfOj&w;lPD8S&li$=+pBr7_ZfZoS-1 zn%zB~`6o;2PvOta+?PUmabHUB3zc&(Jed8{xq-O9j)4EzijEaF+vo6-`pYxi|Lo;< 
z^V5I)RQ4a6?sV{9rk}cjVEU=8n9X72KJC&1(Q6<5=da<*{qCdt?N7zgEzv$43wn(9 z<=A?%meFR^+CdEHFedKHLfk)os)DZK8sjj$yYKbOS7a{6@3?;n)%%Rkvwhp#Nb!FN z)e*V{<-ZA4*k*n?VRYU)ZAs30aiYGIT|cApi+2B8LiMuoFT#5zOKI^Jrf(+R zpS^PwpW*c2%!ic|(ixQYu)>X#LbiraGUVOzf|(%!tys>W@em8449d_0w~zlnq3U@4 zn9<@~_4vA1PqOmGMY?yHSFa`i(f=Y;bK7*+;-hq6uM4a*72MBk`3v3Sj^hfMY#Iy5 z`2~+)NdNp9^52AN;jyt!1uhfi#BQfdcCRy2f5Mkj?!Ub_HT4bqVYKPetogm>?S~Vs zCUJT1Swgq|i%{)2SEkX6yAz@@d@L=e_yyj4mJ#=E;JZW4p;EM8nMOV&)Phx&&$%4@ z)@WUsMT`Tj+&VKBisLfbTE|T-@+eud>s(K0?$fR2YUg!};3?<3ePK=HJt1bJSJ0P*lB z)eo-WAFjB1T$fir79`_^;h+Y=V9I%@E z{lOBU6_9!)D@0(zd+g-ZX_rjBt7ASgKncj7?-nW^2&}BkflC5Zfj9lT<7I;2i3Dv? z8D}bK;Ed<4o+zO2K%ZBlQi}uuXV28D80$OkSvbK~{8bZ$6XI}TrRUnLTTS@fIQ2%u z@(b$WEpv&Ee_4s7w!(sz`}bZG&fZc#i1MwtbxO1)C$;w$6eQPNnZr<7ul!hKQ>|nr zo_eMe?q8)GWv;kzq%) zOrAH*^oWj}4C;G)exh|s#4WYC6d=Ml0J_-a*mFQsfXc5uW)vh0w{R`U2~1=W1+bnR zT+RT5;|w}64FbiNlvx5vpX1@Wu2>=jj1pZaat9H8QqV+fC@heZbKfL@O@33g)9zs3#tSajK}%VjuRQ%<%q1|Zr@ zfB~tA1Rc2&;w97Ut9qZ~57AhN=t#%~!h4y%z$}ZD;5sr;LKJvwYok+Ee-Tzf?a~h5 zVspkOYPWkSJ6;>eTMW%Le6cXAltbh`YX#=0gtG{inBGyYzqSxx*4P}t#fwDA-d`=V ze%MzdIQ1)CF!U7oLw@5>*C>oNRK6>kfzQ6)G3xgoz_M3|I(k?JfuMn1Gb#o)@?n|R zTL-6P_4+N~x^&MewI1!eY=Fm;)m!RFs!E@n$qCjICC$elVBB@hXfD1fLZEU{2=UF+ zAKE^GC6z37exNhnlzh8&8S7??p-hD?@ndONYpr$8l$!dyf85ltWS-mcF0sCq{lB*! z=_`k^3p+yq)&YN`R3-Lox0)|2O7l)Bra-anGjTh|y>#w~ty^--l@@oQBP9QmkEK;B zPpU&3sizk~=byn@I7%$=g|f-~r)8}B0igf%V#d<5Bo?b{JQ*GpEG+9Jpt^|@NaBAu zB7XiT2;g{B;}UBQtn%E)lNrwwu0$XE+`Con%ws6p`$Vvm1XeoM`QTVj7*QbD8Xr2| zEvlE}wY8F2_u}M4uRg&&M4g-#m3>LWUP$*9&VZ!V@lKnWx=aevW%o)Y8_9zB0akQ% z=xH~86u8XLMq17W0zuw+$ZuyE$z#vszWOt1iAn^i)N`7qJ!j#ld-Yuo0LJ}&J6<4J zpn_TLzg~0+vOQzb<3LYaqUo{e>s?<<2t2TeWBfngz$2U9*oO$9pDh6wt_|Py2+tjc z(PgnQG`Gpg&gG^C`OC?lUy+cE!00=7mkxQUO(rQjm)I*RG zBxvW}Kkna38Z-t*7~5iOP%0>`u>PPlSEhqANF0O&5colnmk=zd^CTMp&OCivSQ|ij zwphYFAC zVIG9DDHAf+Ziic2Dwe0(xh)DmEr+AK9|~U>X;S@dlBS~;!WEDt+CKY0_c0l9{!!GV z%!|s0$1QL0c%EoP@}h{LP$zNYl)@D# z@|!^p3ItC$DI`ZELh%5HsT}EpFTT6#?FJD_ppU_i4tB5~hL_AP(oJ7-h#*1%`iEZt zOMft#KMMf)enhijYpummfW~0}h?N?3HCu!9hJXd;7xG?m5PaNF_kat%@sQPIm0 zu7>42Q{C9bOb~~5OLw%#U_NNkgXi)H(@m}XJxjZFwX@ITx8}y*msMUQ7A0)UXClLM zKvX)=Ut$5m=}Co%OYWx1Wvz>R>V<{9B&LSUfqbb%uwJGl$^4i11 z5PLYJPNeD6LFN+?^H(+w-)C;`amQh&V%C9+@+1&j7WLco9?`b}&~_#g$`V$N1MEsq z8@o;AaGI_roGq; zph%aZRM|^F25JDJga}o;rz=+&LdQy%$+?HxHpS!qAf)Bf08l?urewXmen*f>uH;jE z0+Ht0d-zuiW~+wd;^@5hIO%eAgEF5aWEW8W3oss<0EHI-u80Ptw&u01SELDMl@a(_uNs7 z0vkW>;nNEi?+THloihcV$7`nGP1A@#bc|v32_hJIQypFkf!~Lam6f8KF?;|rtl)VV zaBWVa&!ng4xK)2SHN?S-yJBF?oXZR=(MNz$6qF}r{wt$!L!jcT?F z9!y2Q?w>m9b=ToqIO`ioi8|Kd?c(5D4*LtCtN^cLaj;V=5_V>_h~*b(T%1Xz0v7;j zVP2u{UOE9SvRp4@r*##HTWYS(jcD)ghJt5v&ysc!(wD2CU_fsD0{(#HWLh{hbgO-c zMr4`KS6qDYa$oQ=Ty?z$>i;Q0g^s-oc~B44RMtt{HhK~`hfRkjfU7CBjZfZ5lW#tI z0faj5zdw4YHpaVo^vW1>|ChjAnVMiK1Ux-h2t_F@-gg_QtG7qXxd{uu_}59T%U_o! 
zD?aN4|B@6jv;Lt9P?%0?AQ6ryvH|#>sr!%kiEC;Ud**i!MLfthjRVO-CNz*|h<_SK zxVMB@sYKm~5wY~4Z)z0qh1SaOn*ln+1s-edBx7cg2c*AphF4l}MfJm+UR7`kub%kX z3Zh^Zv9g>s;d?Pl4v;LpZ4k^u1LS&b}$FEIP2yxKXJ03QzqPwjh057h;ix-L} zDl(JIaDcs%(xXER<|JOQ)8VQy=Geu7oNHk>iQxH@ii=XXLHYI5u8Fb`)grtz(@6EC z-nMwl0GCyP+`EIf)xtw>`lP+My)g)`)`p>4Q$ktcN&qB`xEbLj9s^iwNR+8aci_4uSN7cl=oExq`)UxwjPxVKlDOk9~l z>hkgZXGxdUU7vDRhM#;M63XHNcg?KUy;slTQdyu_K?-dXbHy9DVcpiF+fz#t5_A(= zo6>wjS>6?JxbEt@F1OTgzf28Hpz5W$Om%Yu`5~1QOZ?pwAi<{OSQFLF)mh9G{aEz# zWZE+GFv_$T!fp6QIzoNAdG2hjd^&5=i#wL?DS-Rvz3EA5_X+uQm6kW3&BWha;bgQ_ z=}p{rpOl!XsGpuT^H|*Go0IogVi8$z^jJ>!_!vF7R68S@gW%2f_%ZEa&&DfP=drFX zZ&1a*rhabo)vZtHTkG!UCfRaUnV-@0St+{|SI={Q?@91nJH-uJ9D-q@8PX* z(OWUrTPed^IX0D5!&_BD@LCj$Efy@a?5%m|{ojPDwhf3uQUd8{_~WrW`{mlE?*15MN_Utb4gz-eII@2S=Eb+qWkBxEYcQ)d9A+F zd-Lh9sq`Zmo(`ut93y?5xxN%-u__g$*(vDTM{>N`U35ImIVqSVE$DZ;*e@UT#Uk66 zRO0I`i%{3{BYFB-EquZ2r!4^fXUzS5_1UgPZ$m}d~UUNAV*BQ;tI9wrrDJbzEa+hd7HWPK4B z%YQ$kqvYYzJ@XYCt{j!Tj*F~c?;o~4r&dq)wxnVbK%K&H4ypg>Dq;Q z$)E;}phokcXHG#)OhWZy(DT@!mW-fQCZYNysI4=oedzxYs+yPiFqip{UjDy8s7n31 z*iXS%vIMn+X@2}iN6krss-eLQ9fNQ~^vxXjrsk#1VpIYe9j}CD31aTE1%eF;(BlNC z9y%Peo@ku}mL)<6q2ajaY*&@SWzmqJq;TVwe|S`l9hg^LI3}Ofx-Z@i6X6uchBXX7 zPK3yoKypgR3TUtznu32G&YUsgmm>(5l8+k1qY12Mm^>YVI3Bf6tB6=YVEpB z(2UF_vgl+&ar7&3b=&TbuH+DYyNy6}eqPA~V4?l6Thz#qtI6R^9u<`wg-Y;ifdv~x zN@OOQC$i`Sg1y!gZKEP}`mT7R!Pq+J8KLkzrEq7ZsQV4asNIzM3% z!y8fSkRWP21C3%km&FVqZC(^+BC6qc`k}JB@j2+58HvdbC1efTNF7u{bP2?auq%EQ zb{rY0gWSC4yrrXfg~_8Tj6i})!tbD9IxtvHUAW#3>|7G`IKbplVFV!R$w~-? z1tFvKh%B+ho0fN?%59?`1Vldy_`}D|Vqy^QhbAjwqrEWUhM}+=DvRt0nSc$q4`e&V zXiDdsf;X$E)mFTdSFiIsX<|+Yt9R}+*h>xw?Rx-R{D^2%zF`0>H&`@I{#4L((^?qXH)6D?I8@H1ZuRY%+tiv+6AvFf!Ve5aR zm9|MciO%Tw*82ysEr+WH@jnbU*LDtgGB@2XfrF4&43QDn81d#ikeImNN$1#Hgra1D zr0a(=B3kiZPBKCsteM#szCjI^E>QhflJS&fi@5H)nf0m4Gj@tQOas#<4t_{K8 ze#sB!YX`imIlrkeRAhYh*Xp-T#gP)rL|*j^&jRz~kEAb43;8LZc#s|1<C|`_pT-nJ`}}*q z&SD#PQ5I+QYoI?X)!F{Om)XHpWp_A)V<#aQyO`OOE?=Z{@Bx7ooEOcdlK?D zGQ{~| zmYsijQumfc;1RM!^saQ0bc&%&vV8THr;mLBIOU{R;jT=Y)_|dGy53|IUy8CI|4q#T zwe45w|3#>ZUaJTmr*?_latvd&y<#p{(PN4~b4@<)^o3UWJ7_#@@e#>Jb8FpV)I0qbM>tVzy1Wd8|~nwNc3~n=4xR zVZnt$MU)M3EDLQDO5nxVn9g%EeN>+ny9@a|Ir2yi+hf8PD2q<{cUNAhKJ5_vcGCJ# z*Y%#=L$!rW5kl5jW_@kv-e`67?z^da*$<*QQUQNns6U^MeuKg|dd_8;{WL9u%L?4PY^iPiVC%?Nl z^qshE-c6Z}3C|@e!#I6=3e6LF?o<^Pcp=1XTFMif9mS_V$Q-qv=N?-#mFPW<4!^B_ zo~R48<$EaX4b?EWDRX}kc>U05xzvDS&37d93;XecO7rK~$c2NXG5^Y9JJSegY=pXh zO_jksA-ge(pHr_Cy(q+TcWrgt;UH<{dD5Zj*63ALKdWuD9;cw5SoTpu&CK*c6;czI zD7uG~NY1p?^zBu?qxm(Fx5weF*Rkc}4D}kG&a=68IB$l1hO74FR3*cs(X6H@Fk`Fj zt1eXVmKM!uiB;kwY1pI}RABv3Fagb>A$Y?l1*^T){HVZ-Y(t=6ll&k{0rPZOV6@6{`j?(OI-$)o(7B z_+_|Y&a2mJ!P}5^?Ng%ZAdjtaLmB^FX9{X}Bta``4&l(Uh}6FJTJ>xfyN57McrrB6 zWSztv?+jH@Bpt!$h2@6i%gHlTlV3Yd;M!_!i|V}$j%iZ@?djjSZ;k8RBAf|6fPqS@q}jnS{N?9AXLxzg9r^S>;3 zUaWr68`zn?2K zlVy8veJ=g6N*QOK?>B{bFGdF0sj2oNOE%$JeX20+iX`Spt_T!iML^Sdf(}<tmb^iBx^IQ(XxS+SCuFEF{o~d-;luKL)pLn0Qpu23*{2LDDxp}om zm@F!xD{=G}xTd#5Bht2kJJ6x*ehU4jf2~sHVq_TaNa)4Lu{Wo$)mc7^DS2hw>vPPn zFr2TNvZy?L{;m5vuqawWBYV;!^;>f(|J+H<_(zjmr?$fRW-1*`DlFJDt0Um9K%m|q zd-mX&?2Kn6H;t~GnPX>cdhxZlBwR^eOm~cF;NNgWr~OFgX1(6k)ld)96?}92S#gWR z^1atu@(!OQ*vndR+P2E3rx8G3zEfizBt%9lU4h&qp5hrVB=YFxL?|+p~%VqNG`if-; zzq0L|gI__~#B$cNs?~sUj1||#J2Oh`d22Ie^W-F7D(K^nbV1LgQKWQh*tpMo`zbhh zzg1>ibfA-N#bSLJFYG0cmf7db-+?Y5ZcXSo|0b>FRJQ5;aJpg_NrIL?^p-k>w8do+ zxuuZ`{B`-cn<`9GRd!t_mZ_rX%I4t3HRK-}M)6nc<@kezg~m>gId)??P^xzX=QQj@ zjv>JXR5C%|gR)2a6dPPP^}g=@C8jU5MW z()Q<4txdfb4yiY#1nW88wk+^u_S@Q2Od(^f%62n}PNv%RNk?z^uE~m^kF3vFeml;P ztb>0pbg z5`*(8m|BR8h?-Ay8RU`#=rpx<(AbrG5^|{8PhHZU@dEIB5`&f^C&&ULlXeqil<%=a 
z5=B!uUfPNx--E-cL9~`3YS7uj7Kr#%yvd2=HIU^$D5@2!8k|-$0@ZE@@zRShX^eOQ zPfLRzWLcEwfJC<#9*pPr66yIQy?Rb4OVUoM8<|G3W2&eQ_l)6VsvKlFQ<9=?k5aCM z$Y8g5JIS!66kSD=LRoKLS!Zy; zLUu_I#6*`yge0I(F)pEsty;n>X=*LhTR)0SL$MrbU_>R^a1acOu@Yf+JfT3myJQIp zRwxY&TZO<(Nj5STs$~@2s4l%UD+4PF0|NPpnI)}&q?ZqtAkwspaOTSv{P`e>b#9G( z(Eg3L@0Trvk?cp-7x1fKgm}CneM+f^szG3?rZj1|h2AQ-(++nkmU7%uerOKMwoj5+ z|72?_*)9|QwH)aB3*B&A3 z#1}V+0%_f+nIbBMDlI;NE@3>ttX5*Ml3ZuyM;}lY)2+UKlm2*vreP}AX%sK9C9kcf zH0HOcR43r=U4;oek8?PW+b8*6#Ifm8Si`FBj4!9pSH`A5k6iH9LwSE!pF>lyWrpt9 zXv_ERWf-b|@cd0~aN^5}CDf<~qzi1cOTy@q^rF75=z_YHd~j3URlc_SB3%#76^$aY@*u)chP}65(RG)Ga2kr%pgnm)=P>SxiG`{& zqsuanFj1)8!aHHLzgYQ1 z)p^`%_s}1j8XbG?#kbe2t5jaK#snyDzN$hnjc>mT+DVeA@WNI2tF#bl8Q@L<6`{@# zvBf=KyR35Kd-@*qI2V#c99YGdrybB2z(&;=D!Agv$|X??Q#IUsc&ZfK%Di3Tkv=7i zm#18Z@WSM4VMfv^h0&c4sQ~8yh^)VWs z$g#v2J_7Gn;P@z32QqJY(Cqjfm?!sClVu|;Fu~;e7avzgs)~iV8%PvmlK!P)dnqQN z!#{-wjFdnS7b!L3O^jMREi`jO4{Ega7_%RAP)hmACB*F+ZyZ>GX_4lAAg;7WmMigl zrEj(E@sn^tt#8N*K`cYxSjDNx!ICb%F0P43ECpvM!&)@ObglOGXr`~#cyChmw{Sv+ zmO~oLLl1klCaXqxYwzC6X2;VsS^#r#Fsvfpm;!WGk*|nUyRqQD)LOll4t78<+R`DV zmJcZdJw{ZE>O4x!!yeO`5zYejNt>05J=#EjTEz_cho^eHYi6^M9Pvo5E9MY;qVl0$mD3ZEG z@_>Wzi8uBh_U6Yi+!=NW;)NgfI!gGnT^r_BP08vyU(E^|pr_rGR!jOz1hOvoT z{?YjMXMiG`i$!2R_4#YU=tS9Nx7SOW+?Co~RzJijRI5F0w0)q)&#=N5VP&3R7zNg7%mPWuEeKXr@9F(p)$6DsH1Ju8gbOz3FDgBIPRMLc*8 zS6Q16=2-=^$)8smKp#XdC`5w9h%)kLym1`ehh`*QrUh@KZnj)xT#3WL8y|I#7t ze_+&+;5!=265$oM&upqzP|;BeFrtFfppV&JyuOVrrrp41hiwm;n-;KTO_HX^ z^C(65ebzNXxg9;OAH;-F4R?Z_7sFFut0mQ7P^16@tNVxNRRTS6!hsxhQyR|qkk6Vi z%Q}QIfN;7?b?Us(qH;#hBhA2oi0Eb6#S2w2Jv6h>TKyOuSD%tfx+rg3ftkxE;}@mZZ) zvj%{$Y<=4`99zg7q^VydSE89uHN~MVmjXPiT8?oQo}6CAZ&*Qlac7 zeWom3OxIkmogK&hoUsqTI&4dcmyY20ECVCWwq!gtx_j5ksupTar-t0INa&@S+}Sk3 zJTy!MjD(EvaM1V-ia-UHZLmA{FHWf4(jZP}bcP}<1`^3k=f7)VprkF^W;jCV6H@9@ zPEsoTOHO~@TABI65nkBLQpWzQ`DkNLHQ&u6LfI372CY%c<2f&CpSDVVv3h?RL=)|o z$tOv9TGZ!on($x8@tZXKg*1q@HE3RMRQ%g0_P3#tR*(Dp%+{pIrwxOK==lC^I@i`r z{`=hI`ZL$-4XJI7;Y>8u868k+TVQKe>>bhw7Ea@GG=hNeQY7-{cy}MV}B5KFd}p?s(3J{+414C>G;u(H#_?i zt~bW+n!PVSUlht2eWz;nd93|&Tg><)v&mabSG8l7)-iR}Y`U#|I_|ATDj`|yr8S2(yjXKBm06OBAobapePB zU$Mqm@D>guSApBU3kXz@#eL&cb#cE9Fao}DI?i$128>Nx##R7h+cl1>#=^LUidY1s zE699{z@dqSaUl3`;iUp#^+FsHUSnsQUUfmXN zYpz5=nD=|bEFViUQ^2l(XO~!|$c5)SbX zaqDL8lS{K5S>U^YA8OFIBf<~{K_>}L#ws_*c(}w{*FE`!GLv9-GhEO0ifi&bJvv6CTJl-2#pa14NI4UjZo ze3CCpMCJ!tvGw#t`YISXJ~3h}athf`L--YC_cE5jF~~k~ffE!V2_~W1k8=52+bfe^ zvu35WNSKR%S$wODribOvwwkLrp`FOy)529j1{?7Xq5~={Ii%m3^1aplaQ4>Q;Us>( zA;X&{T6ZGOjJ_KZ$D-$+%9GKhB9^{@*G=QneY91tLsoeak3H0B!1&E+3o@sGpYFZ0 z57E*|#$#;0EPRzC3KAj0d^Czs6?}e7K-kT~{J7DT_>)E<9Re@(uQa_)i$R!P`sjI6 zqPfZ7vRPg)-^VOm0=xKtS9Rw*o&~T-Wgsh55nT1lS^4hkFJQ&Wn>{Y7n(MhK>}ep^ zDTOO%a!118t*At0UC%2XD(`O8yG1{Lb*G1MBU>V$P7i~}Pf%R0S0r2ai-g`&opi~c zPr0erOi$Y}$<{$uPB@F_ z*1a$%XP1PWDtSGj3J8dF^3It2aSluWge~8DDjjZV?LYUm@0=YFiJ+kp#&v#Fx{p_# zFDZ@9UL6Rk*$eWP-FVe_t>#b7{WE#5i?-f%_`|l}pL|}l{bQN`Nddd@bWPE7oflZH zW4aS>{2Vn1WOUf$iqrN+{09A3>3`G|Pcrb9>R$)2x`EL%`L}HN!`Q1r=fMMRj2XRgKRQYv3yF zGl|HPT#86Ed~)J*qM;91D{Fv+loL=cY>Gcwk&Y0p%+oruO+C?(M4B0p99_Fz&FDa4 z<{zIc6!c;x?2^S-Hq)ejk=TcB_KCY@^>HKGhnj`ifsSbhdIb!m~)I za6L-tYAp+G$S=aKa=0~x%?meSxirzjlOnva;z@zt%COSd5uuA*&$DE z42Q3b-6!La8gBu2wJ$2!%S1r_;PDP3dhXPk~`b1)m%Oj%DW$=Pv11{TcKUY%F9Fs=7~GnS|=`f>G#KotrvCK9RQyJTI{J z33nBs>L7OlIbI2@^Blc?%;B=auCJ!WHS*bfBuD$Av+y+&-LI zzoNx%P<0K)(3X5tE@z|QT`YbRiG@oRiD)(NhY^Fb8e*R(yjQc!%T{V)&ur93&Jf#6 zO@A7`yZ-s`*>mxq>6%JR(UY;g1beyn!!K@h@7zAL<IrNj@k{Cq z*dE@s{zZ{2p>~YJKEJy?_$ylqyXZM;8C!qtTG-w1BIf`Z!MP?IIl;?_I5e)a)Lz(vQ{Eq^GPwo!(hU<}Q~R0sIeMHO8)Z(J zOH)Z9o!pET8Jy3zB0-~ormAes}KKi^9rvZxKxGx1hlJOxo!<|;SX 
z&4`nt2|yEu5W?z8W*GDNuUpGC^cKbr>hZlwCmHX9xE=7`ZHLP+1%Hs7PR)drBiHou z(jClTSnE)h1kC4&-+kl_{mhAnwau+>m;c1H#d8W?^*=a$@sg{;^mqTAW^2E-wGtE3 zis17=M`xG6#bPsmJe~)BoBpv!H<^MY2EVjU__~5RJNreVzT%)O^QF@Q|D1dJLo)~r zOYi1QYHGqPURY_*@e!$6OTc({xyI#tkE;Y+liRg$jhia_SYu?H;+8UYBa|vNi%l>* z90`uww-CKlnUJ|V3Kicg7xW^W`@3?)I5N~`WcdSmUP!EWSX@y?B(_jGP3?oWT| z{5f}IuO+&&CCG3sQ_~78WL|Ua${m*#e9yd2&2Nq&Gf4Pw(&17jDsk*warpRDH=K~~ z=j2W$d+kE_O!2?@*0m5Sf0x-q>sXmWYU28@k;H}1r+U5)9Y0yrBwic38zxDXp*0@- zDM~)vx_$h{&{@p)PcNyz8y61PiRWhK6<>bjym7d+={ovyF?Q#3X6)t1=k~rlW&AxV z7U#J#ySL=@*u$=FMfz24>aQ~#{iX|EgIM7y5PQYirsXi)=IKX=%yAX;_Lc?PP?u;L zb(=urPAhqp@`j6H)1@;*ov5(DJAeec=?MD3zHm9GwWpVp?S*jusKz3UP-gdjql zJ0p;_(d#fx6#VA;v!pCom8^CKn(cO0b3VGZc!7=Nj5t`XkF3HS{xBMU4P((@PfS6i zBiZ}VjV;a{l@N|N^xV>udvN3vNf6YBEbN^r9LUMl39t)xNrjfO)lr3VsqZh$>1 zIS-bb02br%WSxif`)4*XJvLTzaDtjd!An_;P%S~Dhe9Ev0>cQwm(IdL*}{MO-|T%g z>f~Z$^vzB7B89Ur3*Npg(ulAw8Z|1>f)oQ4<43o2OFm%ch_~a!1 zn&-(D74BcFm-R+5j#r6%Oerx(iF8cgvh-a^>ih*+si`jI`@AaU@04|En@B6*`TkBh6pX{@B zo`X3?=Dg>e_q_hsk3+n@NT$b5wz)T*BSyx(SEg=V?ysWHWlOb(uC~h=w`50B*SLcMHfL$fRbhq!#6&uJIXR z-luw+Q|&Vi{<0lyF)es`L(q`{m-=kUvdB!`9xG{M8boPmynKj1Q={s6Vx!C0rr{aE zr`-493Aq_evWh0Bhi34GMs{)1WhJ4G8$~w8N>d!AQ7XC5A8N%FhP~(X>y1&&3{JBv=ct=b3!8PL)}=XM{iySaf;0d; zj4hfQ2=B)TXSAcJ6poK`>e(BGTdx#eVUdB~Fv08!e4$2%CufaZj@MCr%FX+Y3{+2^ z^BuF_G&bTgDOE%=PUczC5fWap@}co|r}1B0^y^$lz59*tazXB;9`Rx%Vo=X4$gB#AU*k% z$|;e9H}g%+XfJ7>Zm@{@lqzwGjfmCXcj77laF%(B2u?xU7+~rYdIHOV4L{pXL%H4& zI3U5D)pRLbsvV^Hxhc3~{0B0IY$$H6vdz!ec{2>lo;pin93C!H@>$BT4*z z27-)L7-;~}bGiKOoW3`S>41WSZC$AE7wMpI1cmx&Bnp}*^eP)n?>RZlmpIfOaWy*c zYMSP1G3Z+P-A>?|u$?R;&nifaW8sWCB4MBwL1Et$wFscVOmdtUV5%VvEzzVBRp+!{ z=jN^M^z)8@b*T>Fhe`*I-4ewE2n7~WR6f=rs5o=oJeM0AE~mM@>d$*Mr+Kw+?c0?l za*L`YEL2c&F44zE&WD4YkMM2wF;C#Q4uiT>h;=Xx0P}vRhrY|r{hXWoj4d*{Kt`HQ z)_Kc&D}v~Dk+eFrUbx5tR;8-Ud7UdqS-_;XN>=(S7!@b4*&g9XY_9*D-Iu$WdE+vMkMIw%@b(LfG+)FqsUW zlS46Sb&wjGdE5f>2_Yc5QTXsu{T*GGh^bQ5ey1vv9FH&4!l^W2=M2?M4&P3vt&f|+ zmcTHTZBM!3==;1e;lr9rJElk(9V#|RAQmZFZnYJP@;asKh=mX_%s~UX>p)~tM(l-p zo|55vFUq4M?c;@kaKV(d*neyf_;QomjYDf3UXGO1rn@{QXi{jE#bY2d{ zAMz5xkK=pnO$!Q6tcwO+Kt60SfW~8!uZ8mp`s!9hC4Fj2V&;3O;_&coI?-olM8}9X z<%hc1n&USJx4OO1)jWY30@61|Ck&`7{nJxE;GyS8+O0}g?Vkpj$mp~jZu}JYO>tp< zzER~<$6pVM3}>j7sOuh@*TKJ@OB3f!&wrfxHsIo&p9W!F{FY8Te*WrO15cD%KLFsf z&h;O7(cGgxLEbK$<{Fs+!*^eJRl^M-d2vjP+VEzvVd zlcSk00;uzUC6bRMJVO(zi!AIvPB(kyg; zQg-B09z24nmImJg@QjJVWT(IzbsvWFxmQFvb17(i9c1imxc+p-!+VvjqOgcMu0*Y} zBgP^`4W<)_9rR?TYPOJ~eU zVp;tc5rws42HFYbEty)25Zc<1L)1nz<0`7f&W2CvhV1&|EtHRr_`u>XM3 z7q=`NF*PzamGV`TQ&?c>Xu2RmwC&_{Y91x;MNr$5%#LF_ZTdQ?o!ItMjvbFflPYrB zo2&a|hC6P6Kbx`%l1b3(Q}BsqLLs2<+gS5SwieyIW^o{#3_vCdQlB7sC{t}*p8^iQ zlt2^r+pZh1Wnv4uLPe+)u9zI?P93TSi{dw_J)L)E)v+%;vhBjSkG^g@5Z8VvIZ#PR z+QJNc&K|gR4=dd_AycfzFjJ);KZ@&6p%QpEbDDSeRWkNe3Jjn(fjl)7(&-K~B`4K1 zTtso>OLMkW+g}qWc*c!_`zyy9fE3`42S{tIK_w*YQnvcaU$b#VYHnpI8$j+FEF11Y z-*w<)r^|CrZv=7?sa>iYod!)VqTE{-)#j!?EDV3`W141+lbjGvHU4Y-)L$Jwp?n{! zXC4dRvz>58MkK8=I8;&C#U*$u&8PbE2=$;SkCuYY{3u@~92T-5{#5lNu=4P$%3_b} zVxgW__CXRUWPHD6N!PKT72EOSAh2}8*~6gAXGxHXRl5}5uambXa;UNg7&2w$A=BU; zB!RFR?wAlY+=t~=;Z@CXx@P+7e#`19U68pWP!Iq>V}b7vHVt^zPEDGg51G9r{%V|i z`}5IN&u6Y@YLm`SPRjs5memdJxUF-4?Pzar$wVDVv)ev5p=NVo<^G{T*x^s3w4I+V zJ0n5>X~2?I^^wflUELGEq$FNX@(GeZUb}E6rSytvh5ml=kKJa`J>dO4+fVhtusRK9 z08_knjA9No5+R@!1X0Ruh~<*GSRJF3XA;fWqul*(gzE8ds54aMWyfPx&(Uhl6h}^R zD*o3VRP|MNj=J_uod>F~dtX>S(Yji}1ON!JbwL1%VN(OJX=%s5l*zQdLf7_8-MB5B zX|C6o&hN*&-i=lI&U2{rMmUat3fldpdrkEntu6wNN`wKZF^zFuf+a8taJtd4>4s3V z+=f9I3){T|gw|$5>3xeQdI?&kRyHOzV`td3%e+!TPkyOMc%tb1x*BNgA5>a?mNgBC zKi8sJzCPRWDCH?z_Keoap0}4w7n>nj-^Qwn-;L{3@2s)q!8DSDDG|-gJ$ccJod5y? 
z&X;vON&o)N{Diwo-Ov?6@gipwdlDoD7bYEgn13Z4sRwb2;FxH77M3psN{HnDHQQ-j z>IHEQLs*+R$4WenaeiL}@-r6OCeVk-sDIq_h(HkSR(&Cb&cOW_ZK%;Y@i)b>z z6vuAc;d1fBm2O~L@`)vg>!b6RW3Q%N4>&sO_=MbhHA9ilIf8#LgPY}{bClVOw3{_h z_pGnb%DK}M@wVJ!QPku`a`jkhOk8)TV7zC z!rXo38g3}^DFbJQ>HrQj1Z{l z-p*~5P)Xa%rMGSJ?Othx!c}%r-BBZ_b&?tm7ox9i^QOp{mF{WXxZbL6akFyHddI8c zc!E>nN7E&6JCBly_eDJ?%a0vW4?kWSt(ZT6<0)+~+E>jAyzJ=7)Xf0vYrYgvjDFl0 zXQXK0(Y^Fq(5(*dW_R}ZHPapcz9E6C2%U7b{LH4)<&wKE3Y(R?ELBV}wH-r}g7(R< zPDqU(N4FYudBaj^DBic%{FcM!sc!@a9vykr8-MBLg>kVc9l!cHWkAf^>W-zUf|cJ0 zXi`vF&+@yP1s~=I!cMy-VtmF=l--JBLh_Ew3A4bygx+B{l=V)LM|4nhiO>F2jMlCo z2iWWjd5x83?BEh3`Xp?+U74^0dZp-AaX4+QKX1$tt1U866fZjC5fZp0k{qtJ^kxzq zCz?$3Nm_XROrS;+zz|Gw%p=*DBpXJQSk0!f-Z{-# zKGxfyTVo6~m9xq^6UT4WZP2W_ZSP{DF7k6NQrJq?xn=LieO3Jqx!BVo_EG$4JvG~n zUH7-ZsXHxqlPu37cuX`nX&DTx@H*YRM<(Nh|5Sp^&eeM<8xz|hVb!|>RGus7R1xPX zR81@sl5I0w;Wlfc*U#`M(LxhWw~~0npGK(m7^g`+s?q)D-6A2cPD;qnB}u%By~#U9 zI`SbV7s>6#X++_?ayb-axs!)j_dg6rKli3SBZzT`#2QNktsTc*dna%`!-=hxJZUb4K>7IBUFmIwzS zhsLJiS-TGv+UAq|jdRR?VZRD(RG3}>(cy`E5Ty=Mk`4Xnqqm!;GIxh5HWMzXG>;ka zeBLtY^S2*&cqMXk4!%EKHR|n8Alz2+D>9(bSZ7cyZL(uf&Y4TxX#tEh zv2PH&UitZ^7<(4S+qq})izu_pHYuOK=j)4aT3(xQ71_NA)d@+7eRRxHq&6atd3DR; z8N~ePcFckque8m5$&F`%%aRulLFOmCy}*)J%?u#cV!cF0TF<*K7N``}nZXEAsJ?q` zb?d>ICnmjlI?GZwvHWF5!=jSw8m~`9NT;OzYCxCHT(Euy8+c`|W}^M$^|=q`yd?E^ zLVl9rTK!X~py4Tjt7H)2BWu)&?=IX`l!zm4Yst30Iq;{OrnZ&a(IqS4Z!a@{lbxVP z5piFkmw&Kc8ix&~qRsE{-H^Sa{$QZRJWV{^_q$me4a9>1Lj7j3i1+zOZxRJrMI+c7 zbu(u#*Yk;DS(KyJlB+_DOP;IAxv8}~cz1+>F^?nRZcfoHBf!!(=VBG&ua3-ooMqyX zY;t`DA5Y+#1|7GNKSks}yo4mNb2=|mO)WiHJAa8Y(b24|!-~Db`~xC>^!NIf-%#H} zTt%w)V~#W|Byo6&SBr?Qnr{N+zrl8n8&nXabK_`C7RE<-AXI{p*p-nGV@MuE#$RYH znWLm$WXazMGUhh)!)}-gNco_@0Ot@ss?JhH21jjFIX!9s!+-db6Flu`8j}9aI@XLV zB|1i6*M;k}hIfq7I*U3D3>73q$El1Vy|W&HcQw{cQ>teqP?as`^t^)^7q!Yzh%p7&UUFeDsw9k-yeP1XA)a)?OP_~rR2=Eeo{1w}3t*&CdJmzdsl_%t6 zQhY1*vnTix0{^tEeAmI<)wAm4IHcaFSRh5%@f>wJ?ws@s*^Xf?ml^T17uQh zZ<5cI&RlR~6}>nOhq3Jtz>FVk+?%?(TbRw(nVW7n)&!}gupv9>@qGoncS3d=j2ilr z^mj(`qd-k+iHz0;FihxYd5clWa5_9g=!k-*_}S2ntZzZ{I;)jk_mP_<`2S3(-qjN4 zWoP9qaU4@-?eVVC^HJiem~~aC8tF>hDtQ~c>~*W^e;}{ccF^TCm%XuNatOX}c5PD?1I%)oTiby`eY*L?}SxtC8w7 z|CK8<Nb74tI)2Jyw%y_gnSvKe8u_(^00H zD@pesF>}mI9CA&u61LJqKovaSDK{iAA*^O0&;?v@9U*^kZECHDL-8qz=TFgpf7AH$YU(2Bw!S{WW9 z8XAH@$hgXW;2FERQRL}(5jaskZs4g(S+vJ4XuJ>;lX5~COv zQ9*}~qLKM@NG%aeNr#W)e#@L203;2Iq*3f|FUGd}(99ug{D=H>= zoa8AT`oOj&;Ehu1+KD)_N9cBXDmCMH`y3~g;w7*S(Va?tWpEf@HpBXu$gEBWr2`=O zG&q3(Z?|EM!X;9PP&X1t4A0b#Q(N3&Cu3O)v4}W^u1bcK5;SwSI5%wI!;&C(JS+R3SJ1u2;kKh9);)`5+5elDs40N1h?%9}%*1xhAE3IVVo8o7u@j$;tM zI7m-7RAv+`@qiIZ@^2?u*Ua{5q%_<9HLPT{jrR9_WR-NLhpROjQj~`==kpaAX<$v+#&vdHXxMli%8G^xTR`T}tZF%A!$!G(^1(YH$XYD3v<M^R!ZMEZ@&NaySrVj9 zhc9*;J7gk9iBJL^{@nwN#VGljzZZ3S?;ZwEqVtn*um%$An}^VHbi86={ww}hWbbUN z0DjaLaz^IyA8wp+q~~uraSK}F<7>=#vQ#O6wYEe~-l)*f02zma4l2G+|0&l*Vy=Kf z+$dTv+w!I9;QD^j0S%dihSDe~@|)szeEMTL1W&WV+7er@L9+2cF$@C|hE9=R)Zz}I z>ZxEc66o$6>ZuQW21AsIfajAD%cppTuiqcN$hn4P)wPEw;o(U(s1@|}eHzl2Oyux+ zpNwO35h&K}My$Shu`7kRHJ7#qKrYs!?mj8sA1#fdA@}Hz!fq`!KQiYfj!qq0Gr>41ywm<@bVhm*BWac?wLxOLNQ_S>BDQ$h zpcM&wBvc<>`$YiNz%v}``VOx+2Z41o6!zgXCXOy;BZ9zPZ_ILxC`P(g9QgMjHp0ixNj4n>|Sz1cK)PATu}}6sMB>cg{hg=}E^1G7KKj@TfHgl0|_I z#}u2$HlA^8cuB8L2Cyb|hw$@5-O$YT0M>*Eh&m2zM`6g(EVgK7Q({6emi5Ya*2-{Z zD>`!sjWr4YVH9yPm+-7vglt%Fo^XZOb~D-IpmEq1jrayUw#5kw$9FSTq%;PMH9C+$ z>e%KBRh&wyDBg}3XRlhEb~LA0p$9Pf_*JYsn! 
z{}QLHVEd3#dnux`T(+~hT4El{YJ&bm_D0RnI#&^0bT5`bY{#6AKp(F0XF=C)d`((b zf;5(CPqrKUsjF6p^L=Gk8vbz{F~NZdZFvKV!*(+*x7`jc8?fXo$8_f?_6Xf*v~UC` z0~&$LJ;y$Em3wd&k5MIMSq6?^5DUFqUXl&u&J#T!_b^rBQ;fINKHcR$z43N8a-*a- z@}zFRj8pPSxq9XOwwW=JpLC0(L$_Z9913YY=qb+WogAyQ&c;L>$6Ws0pT@~Nd7LAO z&P;^qRcO_l;F?F26Z~|<#~v`NbG(itFkeTb8gR`{IH;2lm*NdkJF9^)(SdWQS7Po6N-ZHEY9E7fWg^vq`?GHTI zAIMuC4NivMg(3;`;Bp%4Zq=)oI7TOG@HnFll_W70&QE3InBjYNh0b0>F|jAW#YIOy zB>3l#4|H3NTSw~OiHA3!S=z~IuOpa)zA*LhF;j7{mXl+nh9hn`XbnIsP*>ZH9Bx8{ za{m|`U=XSalfBQGR%om}utO!0l+!chXGvTr5{{*U-to?N;zc3N4bxK@1~=&N6zO4e zBT8UwCx5=166l-~1-}l#BFS$eDavf#-z}#8%IlAVhZWK8_s}l*ApQEZz-Fvos_DVw zQ=0FmwG*X9gujycm9%;Se{*cgT zxK{Hl!nSa2_89w|N@2f74$8!8P>~&Fl2DoN-KVL`bxKZOsSaiG`LoVbjGlZ+d;ycl zZR8aaQ$B4tnZS(e_-aBQUl8JOBUqDs;I(j1C6tNHTu~3( zGd&*fDvnn^{Cw9Bmn4#xq^HzGf$GNW#(sjZ{V#*>X5da}mibCVAc2{u_*n?%bkWrC z)!@M(C#V~)xq)cp*I}|vwhE#1|0F`MkjM9yS@D2Lf4!sIHa2xFsK_>&xuT^;CJKH{ z3)@$>je|L?j9Q0qrYs8uDzd#cSdz{`(X4q)aJJha9D6k!%DyLV79u?TP$mH6$!zn0 z9L_Zo$_mG}N3sc${eCUv#1nZQ1sDg?v465T=NBdCp!>;BMUjhR9MyHUj3etP(vW zZxti@NMsGtfZraIUAxWxB~I~6Y48`<;F_xOFSSBr+^gKYhwSZmV<&p>8d`SmA*aOs z^~zhf=B<#^e@*BJFw1x{{j{OG(ct!I6h}nbkr~D!&4kI><7z;)qnXKgc#AIAEE}@D zThXbTbEEqoLNyT0?1m{lAi&+wKVJfn%dSj;XqLMnoSv`8Se+puSp6-d!>t*THDIbM zQ}oOM!cBQvrZ+Fhq+_i2OgGK=0VB6oSR#uk^hbK?em7}`2x(CnnN!HyRrL_*}iGK8X_^FV%I*TdOsQW zB7WwUEtlhwKZNQ#cKs_gJ9Q^zJCq|1zB=xl+@@ zm;H)Cs7i!e?_A2_Vfd&EZ5C)cyua%hJV`31hJMT{0p|3#=u{1>6Ro)`NH@xLQf zgN#v*TOHAfMf|@BRpmc~sy2gA&HNXkicKBvdG;?t)jL-~Y^09|Ul914P*rJ3HT|1V zjSOWFs>c|FY7~P|ExtCm5zVXdHI_;;`%S3E{zItFXG#7;sM<0JRi=}5X37o>LKUI( zhfwV%T~gLuNQ<4T`-f1~Z?WY-demn$CH+mPdbc)<$h!O{RF6OSp9od1Cd3~?HFCYD zJ|=)gq90?F?rCkSaXq&A-2g3omUb!i521RQL8yuZ{F_j<{VM1g56M)zzplw3RFC~3 zRB!%^Pz8iC2-QN~)jx!)z5O3TwT3~cs%%0VArJpWsPCJs@>{^;ZG)1i!vr*xqXSrQMxwB-o-SWj)_;r?XQ7gwwh@;e}`9zyv z^+Ls*88#gn78eJ5e-o-JRK|liZYua)Yx>Cb*P{9qATrO@g?#fGn+w&zAS>2bMJm0$ zv%e$b(I`SQv(@OjU*D5SCU=cSsYm{Fl={QgSyq-2xJ%936;6T;ot+Js{tC6>YX0@;U8XRKm?%xXe zQFY~DYpC{+8Bbu=!GU;&DXb>=up`wtmJ`Eus3?5IF&!t^g<&dB6Fy3<+6j4gxL$nj3_8`)Gt20PaRjl4{&ar0&Hi9emWg{wUk9=;S|`D%Lla{BK4z;z8=)k}88yy_$IHZ$|Zx zp=$O2HdO6yrMe~lVN|^zUHr|cW|V))@MJKmcOUz{dYt5&c_}ip&Ohr)P*(82G1dQ# zq?&j0-;C;mm(dJKwc-z;`s59TVW|GCr#>urRZvjypN8rqazZA#IG5b`s4#4{Fo7Yd zW)w$d6=(nLss1gg=9Z-}B-MiQw#D*C&ngobhU&}8wvLbQ=RQ7pQI)q<^)Rpc;k)X- z|CCgVYX@Fa-ZL=O{<@NHbuSo_>PKq#hsK=$E~&Os8e8icTl*QL>i4$d?Y6g*?KKQc z^}jKyvpw~zJ)hco8yfohI{L=`#i)+;_4oc;QvF{U)%CHC-tpG4@rhp(?E{m`o0C0* zQ=JSw_3PKk(dmuRnf|%ix!u`82B7-4o;o}K{XYTK?WL)??^8d&PcN?iUp-aCy1HU* zy5*tFmCowQjrs08i)YrKKK}SNTCl|J+%SK>hN1n&HAIKhW%Sl?chk3`j?*Sz|d1?tl{dQX~Pe-i4iEy z#fz@8MWRyevIZ0Yq)EVz`bST_#?Vu9f9t77f9t7>brH(w@&8p%eOWCEVd$woal*qK zm)MRr9TkhwdWQv}j^DCp=&3{<%Mz)tNB_}N<(ja+_0+OlUm0+#rOI)Jo@y=TzVI(S zmH1mv4U?5YD;_Ul=&8hvq6ADJ?mzU@2)3ht^wc&&wpo4~Z#&aqrD53e+f8h5o9r#X z80sj|SJx&EwY$LrbKj}gd`Y1N`Sx0%Hn#;rcRA%117+ojQpB@)re(>e<-661l15dS zj80Ij)(ZZ8(85=MT$%oQOq~Io_}wKazq>rktmLhXu3szd+tpJtD~b)L#QJnn*ZVdyHNa3+4(6wZ#1NT<|r{xHc@=ytrD(L(%e4qxR8;WFo1qdbxFG;5} za0Vb6xxhV)Ch80_Q$ZlfOO^=C4As=M3J+&t)qzyLWQ9f-e830`geo7XgazaBSPgBO z?_1mG0}QCnGhslD^DUtV-Xsyrpm43R3tH}_SiUnIdDwVn@HLy;0@0Q;Vpk=Z0>%s> zx`-JJurx_2t(#Y_jfCar!f@ebBl_*4b}!OU4ji{dZg?7WY7jttWkj%o>o8ZURHETI zBcYiIZyH4&5$#>N>fL7s)D`i%ue2yRQ1fW)h-+dIv!p zQ(aAm=9JFH7@#@GpzQ?uGRVw{?Weebm)WN?zrZ5J-9{jRO7+{i@v*z;!DEl@Yp$&& zIdX59#uXilId-nzK6tbF+0%@^>_7z85Ph*_$G`1bU4PU^u9%hgk|>;dTiPy3YCa_L zEQ3LfbdC{Gb&`DFU`3Ba);p=}+bSZzi)S6zHq*c99R(78B@&quwQp{EmRGpc`R0+9 z5^R%+%i4twQ zFmZsQ3atso_erW)z@->VR!}b|?=}-dqJ;s}>P+MnTK^N z-nU_s=j45WN6}^^;)O=t_s-)@ZDD3e+(uNvmi 
[binary patch data omitted]